-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathDataImportFromWeb.R
51 lines (32 loc) · 1.29 KB
/
DataImportFromWeb.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# Read the data table published on the SOCR wiki page below. The data is
# available as a plain HTML table, so XML::readHTMLTable can parse it directly.

# Install XML only when it is not already available, rather than reinstalling
# the package on every run of the script.
if (!requireNamespace("XML", quietly = TRUE)) {
  install.packages("XML")
}
# library() stops with an error when the package cannot be loaded, whereas
# require() only returns FALSE and lets the script fail later.
library(XML)

sp.url <- "http://wiki.stat.ucla.edu/socr/index.php/SOCR_Data_Dinov_091609_SnP_HomePriceIndex"

# Take the first table on the page and treat its first row as the header.
# The argument is spelled `stringsAsFactors`; the earlier spelling
# `stringAsFactors` does not match that parameter name.
sp.extractedData <- readHTMLTable(
  sp.url,
  which = 1,
  header = TRUE,
  stringsAsFactors = FALSE
)

# Quick interactive inspection of the imported data frame.
head(sp.extractedData)
tail(sp.extractedData)
dim(sp.extractedData)
class(sp.extractedData)
names(sp.extractedData)

# Name column 23 explicitly.
# NOTE(review): presumably the scraped header for this column is unusable
# as-is -- confirm against the source page.
colnames(sp.extractedData)[23] <- "Composite-10"
# Strip leading and trailing whitespace from every element of `x`.
# Returns the result of gsub(): a character vector the same length as `x`.
trim <- function(x) {
  gsub(pattern = "^\\s+|\\s+$", replacement = "", x = x)
}
# Trim whitespace from the column names of the imported data frame.
# The names are read from sp.extractedData: the previous code read them from
# `sp.dataNew`, which is not defined anywhere in the visible script.
sp.oldName <- names(sp.extractedData)
sp.newName <- trim(sp.oldName)
# Replace the data frame's column names with the trimmed versions
colnames(sp.extractedData) <- sp.newName
names(sp.extractedData)
# Replace every hyphen in each element of `x` with an underscore.
# Returns the result of gsub(): a character vector the same length as `x`.
repHyp <- function(x) {
  gsub(pattern = "\\-", replacement = "_", x = x)
}
# Replace hyphens in the column names with underscores.
sp.oldName1 <- names(sp.extractedData)
sp.newName1 <- repHyp(sp.oldName1)
# Replace the data frame's column names with the hyphen-free versions
colnames(sp.extractedData) <- sp.newName1
# Show the final names of the data frame that was just renamed. The previous
# code inspected `sp.dataNew`, which is not defined anywhere in the visible
# script.
names(sp.extractedData)
# Persist the data frame in R's own format via save()
save(list = "sp.extractedData", file = "sp.fullDataset")
# Export the same data as comma-separated text, without a row-name column
write.table(
  x = sp.extractedData,
  file = "fullDataset.csv",
  sep = ",",
  row.names = FALSE
)
# Report whether any cell of the data frame is NA
any(is.na(sp.extractedData))