# .Rhistory
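# Look up help for ldply (from the plyr package)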
?ldply
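# Load packages for web scraping (RCurl, XML) and data wrangling (data.table, tidyverse)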
library("RCurl")
library("XML")
library("data.table")
library("tidyverse")
# Get list of sub-pages in athlete directory (each contains list of individual athlete pages)
athlete_directory <- getURL("https://www.sports-reference.com/olympics/athletes/") %>%
htmlParse(asText=TRUE) %>%
xpathSApply('//td/a', xmlGetAttr, 'href') %>%
paste('https://www.sports-reference.com/', ., sep="")
# Check number of sub-pages
length(athlete_directory) # 453 pages
# Initialize vector to store links
individual_links <- c()
system.time( # ~3.5 minutes
for (i in seq_along(athlete_directory)) {
# parse athlete directory sub-page to get all links
new <- getURL(athlete_directory[i]) %>%
htmlParse(asText=TRUE) %>%
xpathSApply('//*[(@id = "page_content")]//a', xmlGetAttr, 'href') %>%
paste('http://www.sports-reference.com/', ., sep="")
# update vector of athlete pages
individual_links <- c(individual_links, new)
# track progress in console
print(i)
flush.console() # avoid output buffering
}
)
# Check number of individual links (athletes)
length(individual_links) # 135584
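# A previous scraping session (not shown in this history) presumably initialized
# 'infobox' and 'results_table' as empty lists before filling them; a minimal
# sketch of that setup, assuming one slot per athlete link:
infobox <- vector("list", length(individual_links))
results_table <- vector("list", length(individual_links))
# Restore the partially filled objects from that saved session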
load("C:/Users/Randi Griffin/Documents/GitHub/Olympic_history/scrapings.Rdata")
rm(new)
rm(i)
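# Inspect saved entries to find where the previous scraping pass apparently stopped (around index 16012)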
infobox[[16000]]
save(individual_links, infobox, results_table, file="C:/Users/Randi Griffin/Documents/GitHub/Olympic_history/scrapings.Rdata")
infobox[[16013]]
infobox[[16012]]
results_table[[16012]]
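# Resume scraping individual athlete pages (16013 through 50000); if a request fails, wait 60 seconds and retry once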
system.time(
for (i in 16013:50000) {
# get xml (wait a minute and try again if it times out and throws an error)
xml <- try(getURL(individual_links[i], .opts=curlOptions(followlocation=TRUE)), silent=TRUE)
if (inherits(xml, "try-error")) {
Sys.sleep(60)
xml <- getURL(individual_links[i], .opts=curlOptions(followlocation=TRUE))
}
xml <- htmlParse(xml, asText=TRUE)
# save 'infobox'
infobox[[i]] <- xpathSApply(xml, '//*[@id="info_box"]/p', xmlValue) %>%
strsplit('\n') %>% .[[1]]
# save 'results table'
results_table[[i]] <- readHTMLTable(xml) %>% .$results
# track progress in console
print(i)
flush.console()
}
)
save(individual_links, infobox, results_table, file="C:/Users/Randi Griffin/Documents/GitHub/Olympic_history/scrapings.Rdata")
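# Scrape the next batch of athlete pages (50001 through 80000) with the same procedure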
system.time(
for (i in 50001:80000) {
# get xml (wait a minute and try again if it times out and throws an error)
xml <- try(getURL(individual_links[i], .opts=curlOptions(followlocation=TRUE)), silent=TRUE)
if (inherits(xml, "try-error")) {
Sys.sleep(60)
xml <- getURL(individual_links[i], .opts=curlOptions(followlocation=TRUE))
}
xml <- htmlParse(xml, asText=TRUE)
# save 'infobox'
infobox[[i]] <- xpathSApply(xml, '//*[@id="info_box"]/p', xmlValue) %>%
strsplit('\n') %>% .[[1]]
# save 'results table'
results_table[[i]] <- readHTMLTable(xml) %>% .$results
# track progress in console
print(i)
flush.console()
}
)
save(individual_links, infobox, results_table, file="C:/Users/Randi Griffin/Documents/GitHub/Olympic_history/scrapings_1_80k.Rdata")