# Author: Tom Kral
# About: This script is for analyzing a text/book in .txt format.
# Manual: The entire script can be sourced at once to perform the analyses. Because R requires functions to be defined before they are called,
# all functions are defined first and the operations that use them are called at the bottom of the script.
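# Usage (assumed): source this file from an R session, e.g. source("analyzeBooks.R"), and adjust the
# setwd() path under OPERATIONS so it points at the folder holding the book .txt files.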
################
# DEPENDENCIES #
################
#install.packages("RWeka") # install RWeka package
#install.packages("readr")
#install.packages("textdata")
#install.packages("dplyr")
#install.packages("ggplot2")
library(tidyverse) # data manipulation
library(tm) # text mining
library(tidytext) # text mining for word processing and sentiment analysis
library(reshape2) # reshapes a data frame
library(radarchart) # drawing the radar chart from a data frame
library(RWeka) # data mining tasks
library(knitr) # dynamic report generation
library(readr)
library(stringr)
library(dplyr)
library(ggplot2)
#############
# FUNCTIONS #
#############
cleanText <- function(text){ # cleans text
  text <- str_conv(text, "latin1") # change encoding
  text <- gsub("[\n\r]", " ", text) # replace the \n and \r left by read_file with spaces
  text <- gsub("[[:punct:]]", "", text) # delete all punctuation
  text <- gsub("[[:digit:]]", "", text) # remove all numbers
  text <- tolower(text) # make all characters lowercase
  text <- str_squish(text) # remove repeating whitespace
  text <- gsub("\\bmerry\\b", "", text) # remove the word "merry", which is also the name of a Hobbit
  return(text)
}
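# Example (illustrative only, not part of the pipeline):
# cleanText("It's the 3rd of May!\n") returns "its the rd of may"
# (line breaks, punctuation and digits stripped, lowercased, whitespace squished).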
removeStopwords <- function(text){ # removes all English stopwords from a text
  stopwords_df <- get_stopwords(source = "snowball")
  textList <- unlist(str_split(text, " "))
  text_nsw <- textList[!(textList %in% stopwords_df$word)]
  text_cleaned <- paste(text_nsw, collapse = " ")
  return(text_cleaned)
}
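# Example (illustrative only): removeStopwords("the road goes ever on and on") drops the
# snowball stopwords "the", "on" and "and", leaving "road goes ever".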
printDiversity <- function(text){ # print the lexical diversity of a text
  wordList <- unlist(str_split(text, " "))
  totalWords <- length(wordList)
  uniqueWords <- length(unique(wordList))
  lexDiv <- uniqueWords / totalWords
  cat("Total words: ", totalWords, "\nUnique words: ", uniqueWords, "\nLexical diversity: ", lexDiv, "\n") # cat() already prints, so no print() wrapper is needed
}
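# Example (illustrative only): printDiversity("the grey grey havens") reports 4 total words,
# 3 unique words and a lexical diversity of 0.75.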
performSentimentAnalysis <- function(trilogy){ # perform a sentiment analysis on each book and combine the results
  sentimentDataframe <- tibble() # data_frame() is deprecated; use tibble()
  for (row in 1:nrow(trilogy)) {
    print(trilogy$Title[row]) # print the title of the book currently being processed
    tokens <- tibble(text = trilogy[row, "Content"]) %>% unnest_tokens(word, text)
    sentiments <- tokens %>%
      inner_join(get_sentiments("nrc"), by = "word") %>%
      count(word, sentiment, sort = TRUE)
    sentimentDataframe <- bind_rows(sentimentDataframe, sentiments)
  }
  return(sentimentDataframe) # return after the loop so all three books are included, not just the first
}
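# Note: the first call to get_sentiments("nrc") may prompt to download the NRC lexicon via the
# textdata package. The lexicon maps words to eight emotions plus "positive" and "negative",
# and a single word can carry several sentiments, so counts are per word-sentiment pair.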
##############
# OPERATIONS #
##############
# Load books as strings
setwd("/home/tom/Projects/LotR_Books_vs_movies/BookData")
titles <- c("The Fellowship of the Ring", "The Two Towers", "The Return of the King")
fotr <- read_file("01 - The Fellowship Of The Ring.txt")
ttt <- read_file("02 - The Two Towers.txt")
rotk <- read_file("03 - The Return Of The King.txt")
# load test file instead
#test <- read_file("test.txt")
# Clean text
fotr_c <- cleanText(fotr)
ttt_c <- cleanText(ttt)
rotk_c <- cleanText(rotk)
# Remove stopwords
fotr_nsw <- removeStopwords(fotr_c)
ttt_nsw <- removeStopwords(ttt_c)
rotk_nsw <- removeStopwords(rotk_c)
# print lexical diversity
printDiversity(fotr_nsw)
printDiversity(ttt_nsw)
printDiversity(rotk_nsw)
# Merge all books into data frame
books <- c(fotr_nsw, ttt_nsw, rotk_nsw)
trilogy <- data.frame(titles, books, stringsAsFactors=FALSE)
names(trilogy) <- c("Title", "Content")
# Perform the sentiment analysis
sentimentResults <- performSentimentAnalysis(trilogy)
# Plot the results
sentimentPlot <- ggplot(data=sentimentResults, aes(x=reorder(sentiment, -n, sum), y=n)) +
  geom_bar(stat="identity", aes(fill=sentiment), show.legend=FALSE) +
  labs(x="Sentiment", y="Frequency") +
  theme_bw()
sentimentPlot + labs(title = "Emotions in The Lord of the Rings by J.R.R. Tolkien")
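# Optionally save the plot to disk (filename is illustrative):
#ggsave("sentimentResults_books.png", width = 8, height = 5)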
# Condense the data
results <- subset(sentimentResults, select = -word)
results <- aggregate(results$n, list(results$sentiment), FUN = sum)
colnames(results) <- c('emotion', 'n')
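# 'results' now holds one row per NRC sentiment with the summed word counts (columns: emotion, n).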
# Write results to file
results$origin <- rep("book", nrow(results)) # one row per NRC sentiment category
results <- results[ , c("origin", "emotion", "n")]
#write.csv(x = results, file="sentimentResults_books.csv", row.names = FALSE)