-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathamazon_reviews_scraper.R
236 lines (185 loc) · 8.7 KB
/
amazon_reviews_scraper.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
#Install pacman. It reduces typing in package management actions. The function names in pacman package follow the format p_xxx
install.packages("pacman")
library("pacman")
#p_load allows the user to load one or more packages as a substitute for the library function
pacman::p_load(XML, dplyr, stringr, rvest, audio)
# Creating a trim function to remove all white spaces
trim <- function (x) gsub("^\\s+|\\s+$", "", x)
#Initialising the product code of the product you want to examine
#View the product in Amazon, get the product code from its URL. It looks something like this: B073HCFZM7
prod_code = "B075QMZH2L"
#Amazon's URL is easy to build. Just concatenate the product code at the end
url <- paste0("https://www.amazon.com/dp/", prod_code)
#Read the HTML source of the URL and store it in doc
doc <- read_html(url)
#Obtain the product name. The product name is stored in the 'doc' html. '#productTitle' gives the product name
#Also trim the spaces and new lines
prod <- html_nodes(doc, "#productTitle") %>% html_text() %>% gsub("\n", "", .) %>% trim()
#Now we have got the Product name!
prod
#Source funtion is used to parse Amazon html pages for data
#The source has the amazon_scraper function in it
#It parses the data and gives the following values - prod, title, author, date, ver.purchase, format, stars, comments and helpful for each product
# source("https://raw.githubusercontent.com/rjsaito/Just-R-Things/master/Text%20Mining/amazonscraper.R")
#Parse Amazon html pages for data
amazon_scraper <- function(doc, reviewer = T, delay = 0){
if(!"pacman" %in% installed.packages()[,"Package"]) install.packages("pacman")
pacman::p_load_gh("trinker/sentimentr")
pacman::p_load(RCurl, XML, dplyr, stringr, rvest, audio)
sec = 0
if(delay < 0) warning("delay was less than 0: set to 0")
if(delay > 0) sec = max(0, delay + runif(1, -1, 1))
#Remove all white space
trim <- function (x) gsub("^\\s+|\\s+$", "", x)
title <- doc %>%
html_nodes("#cm_cr-review_list .a-color-base") %>%
html_text()
# author <- doc %>%
# html_nodes(".review-byline .author") %>%
# html_text()
author <- doc %>%
html_nodes("#cm_cr-review_list .a-profile-name") %>%
html_text()
date <- doc %>%
html_nodes("#cm_cr-review_list .review-date") %>%
html_text() %>%
gsub(".*on ", "", .)
ver.purchase <- doc%>%
html_nodes(".review-data.a-spacing-mini") %>%
html_text() %>%
grepl("Verified Purchase", .) %>%
as.numeric()
format <- doc %>%
html_nodes(".review-data.a-spacing-mini") %>%
html_text() %>%
gsub("Color: |\\|.*|Verified.*", "", .)
#if(length(format) == 0) format <- NA
stars <- doc %>%
html_nodes("#cm_cr-review_list .review-rating") %>%
html_text() %>%
str_extract("\\d") %>%
as.numeric()
comments <- doc %>%
html_nodes("#cm_cr-review_list .review-text") %>%
html_text()
#helpful <- doc %>%
# html_nodes(".cr-vote-buttons .a-color-secondary") %>%
# html_text() %>%
# str_extract("[:digit:]+|One") %>%
# gsub("One", "1", .) %>%
# as.numeric()
# Helpful votes number cannot be extracted from reviews with no helpful votes cast
helpful <- doc %>%
html_nodes("#cm_cr-review_list .cr-vote-text") %>%
html_text() %>%
str_extract("[:digit:]+|One") %>%
gsub("One", "1", .) %>%
as.numeric()
if(reviewer == T){
rver_url <- doc %>%
html_nodes(".review-byline .author") %>%
html_attr("href") %>%
gsub("/ref=cm_cr_othr_d_pdp\\?ie=UTF8", "", .) %>%
gsub("/gp/pdp/profile/", "", .) %>%
paste0("https://www.amazon.com/gp/cdp/member-reviews/",.)
#average rating of past 10 reviews
rver_avgrating_10 <- rver_url %>%
sapply(., function(x) {
read_html(x) %>%
html_nodes(".small span img") %>%
html_attr("title") %>%
gsub("out of.*|stars", "", .) %>%
as.numeric() %>%
mean(na.rm = T)
}) %>% as.numeric()
rver_prof <- rver_url %>%
sapply(., function(x)
read_html(x) %>%
html_nodes("div.small, td td td .tiny") %>%
html_text()
)
rver_numrev <- rver_prof %>%
lapply(., function(x)
gsub("\n Customer Reviews: |\n", "", x[1])
) %>% as.numeric()
rver_numhelpful <- rver_prof %>%
lapply(., function(x)
gsub(".*Helpful Votes:|\n", "", x[2]) %>%
trim()
) %>% as.numeric()
rver_rank <- rver_prof %>%
lapply(., function(x)
gsub(".*Top Reviewer Ranking:|Helpful Votes:.*|\n", "", x[2]) %>%
removePunctuation() %>%
trim()
) %>% as.numeric()
df <- data.frame(title, date, ver.purchase, format, stars, comments, helpful,
rver_url, rver_avgrating_10, rver_numrev, rver_numhelpful, rver_rank, stringsAsFactors = F)
} #else df <- data.frame(title, author, date, ver.purchase, format, stars, comments, helpful, stringsAsFactors = F)
# Removing 'author', 'helpful' from the dataframe. (Resolving the open issue: Error in data.frame(title, author, date, ver.purchase, format, stars, arguments imply differing number of rows)
# Amazon.com had changed the HTML code of the reviews page. Due to different HTML nodes, this script to extract author, and helpful votes did not work
# I added the right HTML tags in the code, but still removed them from the data frame
else df <- data.frame(title, date, ver.purchase, format, stars, comments, stringsAsFactors = F)
return(df)
}
#Give the number of pages of reviews you want to extract
pages <- 10
#Initialising the reviews_all variable as NULL
reviews_all <- NULL
#Extracting the first ten pages of reviews and storing it in 'reviews_all' list variable
for(page_num in 1:pages){
url <- paste0("http://www.amazon.com/product-reviews/",prod_code,"/?pageNumber=", page_num)
doc <- read_html(url)
reviews <- amazon_scraper(doc, reviewer = F, delay = 2)
reviews_all <- rbind(reviews_all, cbind(prod, reviews))
}
#Check the structure of 'reviews_all' list
str(reviews_all)
#Reviews:
reviews_all$comments
#Now we have obtained the first 10 pages of reviews
#We will do a sentimental analysis of these reviews/comments. Sentiment analysis is a Natural Language Processing method
#Load the trinker & sentimentr package using pacman / library function
pacman::p_load_gh("trinker/sentimentr")
#install.packages("sentimentr")
library("sentimentr")
#Applying the sentiment_by function to only the comments column of the reviews_all data using the 'with()' function
#sent_agg <- sentiment_by(reviews_all$comments). Both perform the same function
#sentiment_by() function returns the approximate sentiment of the reviews. It returns a datatable of element_id, sentence_id, word_count
#standard deviation and average sentiment for each review
sent_agg <- with(reviews_all, sentiment_by(comments))
#Print the first 6 rows of sent_agg
head(sent_agg)
#Order them based on the sentiment score
ordered_sent_agg <- sent_agg[order(-ave_sentiment)]
#Plot the histogram of the stars / ratings given by the customers
#(Rating is between 0 to 5)
#x <- hist(reviews_all$stars)
with(reviews_all, hist(stars))
#Plot the histogram of the average sentiment scores for the reviews
#y <- hist(reviews_all$ave_sentiment)
with(ordered_sent_agg, hist(ave_sentiment))
#Get the mean rating of the product
mean(reviews_all$stars)
#Get the mean sentiment score of the product
#Higher the sentiment score, more positive are the reviews
mean(ordered_sent_agg$ave_sentiment)
#Get the count of the number of reviews scraped
number_of_reviews <- nrow(reviews_all)
#Sort these reviews in descending order of their ave_sentiment scores
sortedReviewsFromBest <- slice(reviews_all, top_n(ordered_sent_agg, number_of_reviews, ave_sentiment)$element_id)
sortedReviewsFromBest
#Highlight these reviews by sentiment polarity (positive = green; negative = pink) and present them in an html file.
highlight(sentiment_by(sortedReviewsFromBest$comments))
#Create a subset of the top 3 reviews as 'best_reviews' based on the sentiment score
best_reviews <- slice(reviews_all, top_n(ordered_sent_agg, 3, ave_sentiment)$element_id)
best_reviews
#Highlight the top 3 reviews by sentiment polarity (positive = green; negative = pink) and present them in an html file.
highlight(sentiment_by(best_reviews$comments))
#with(best_reviews, sentiment_by(comments)) %>% highlight()
#Create a subset of the bottom 3 reviews as 'worst_reviews' based on their sentiment scores
worst_reviews <- slice(reviews_all, top_n(ordered_sent_agg, 3, -ave_sentiment)$element_id)
worst_reviews
#Highlight the bottom 3 reviews by sentiment polarity (positive = green; negative = pink) as an html file.
highlight(sentiment_by(worst_reviews$comments))
#with(worst_reviews, sentiment_by(comments)) %>% highlight()