-
Notifications
You must be signed in to change notification settings - Fork 23
/
Copy pathR Text classfication using CSV files
68 lines (54 loc) · 1.94 KB
/
R Text classfication using CSV files
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
library(tm)
library(plyr)
library(class)
# Packages this script depends on.
libs <- c("tm", "plyr", "class")
# library() stops with an error when a package is missing; require() would
# only return FALSE and let the script fail later with an unrelated message.
invisible(lapply(libs, library, character.only = TRUE))
# Keep strings as character vectors when data frames are built
# (this is already the default from R 4.0.0 onwards).
options(stringsAsFactors = FALSE)
# Category files, one CSV per class. Each file is listed exactly once:
# listing a file twice loads the same documents twice, so identical rows can
# end up on both sides of the train/test split and inflate the accuracy.
# NOTE: the variable `cat` shares its name with base::cat(); calls such as
# cat("x") still resolve to the function.
cat <- c(
  "Oneliner.csv", "Sickness.csv", "Complaint.csv", "ATC.csv",
  "Availabilty.csv"
)
# Directory that holds the category CSV files.
pathname <- "C:/Users/s769346/Desktop/New folder"
# Normalise a tm corpus before a term-document matrix is built from it.
#
# Args:
#   corpus: a tm corpus, e.g. the result of Corpus().
#
# Returns:
#   The corpus after lower-casing, stop-word removal, punctuation removal,
#   stemming and whitespace collapsing (applied in that order).
cleanCorpus <- function(corpus) {
  # tolower() is a plain character function; wrapping it in
  # content_transformer() keeps every element a text document, which
  # TermDocumentMatrix() needs later on.
  corpus.tmp <- tm_map(corpus, content_transformer(tolower))
  # Remove stop words while apostrophes are still present: the English list
  # contains entries such as "don't" that no longer match once punctuation
  # has been stripped.
  corpus.tmp <- tm_map(corpus.tmp, removeWords, stopwords("english"))
  corpus.tmp <- tm_map(corpus.tmp, removePunctuation)
  corpus.tmp <- tm_map(corpus.tmp, stemDocument)
  # Collapse whitespace last, because the removals above leave runs of blanks.
  tm_map(corpus.tmp, stripWhitespace)
}
# Build the cleaned, sparsity-reduced term-document matrix for one category.
#
# Args:
#   cate:   file name of the category CSV; also used as the category label.
#   path:   directory that contains the file.
#   sparse: sparsity threshold passed to removeSparseTerms(). Defaults to
#           0.7, the value that used to be hard-coded here.
#
# Returns:
#   A list with elements `name` (the value of `cate`) and `tdm` (the
#   TermDocumentMatrix of the cleaned corpus).
generateTDM <- function(cate, path, sparse = 0.7) {
  s.path <- file.path(path, cate)
  if (!file.exists(s.path)) {
    stop("Category file not found: ", s.path, call. = FALSE)
  }
  # Read text as character regardless of the global stringsAsFactors option.
  csv <- read.csv(s.path, stringsAsFactors = FALSE)
  # NOTE(review): recent tm releases expect the data frame to carry `doc_id`
  # and `text` columns for DataframeSource() - confirm the CSV layout.
  s.cor <- Corpus(DataframeSource(csv))
  s.cor.cl <- cleanCorpus(s.cor)
  s.tdm <- TermDocumentMatrix(s.cor.cl)
  s.tdm <- removeSparseTerms(s.tdm, sparse)
  list(name = cate, tdm = s.tdm)
}
# One list(name, tdm) entry per category file, in the order given by `cat`.
tdm <- lapply(cat, function(cate) generateTDM(cate, path = pathname))
# Attach the category name to every document row
# Convert one list(name, tdm) entry into a labelled data frame.
#
# Args:
#   tdm: list with elements "tdm" (a matrix-like object that is transposed
#        here) and "name" (the label to attach).
#
# Returns:
#   A data frame holding the transposed matrix, with one extra final column
#   "targetCat" that repeats the label on every row.
bindCategoryTDM <- function(tdm) {
  # Transpose so that the columns of the input become the rows of the result.
  doc.term <- as.data.frame(
    t(data.matrix(tdm[["tdm"]])),
    stringsAsFactors = FALSE
  )
  label <- rep(tdm[["name"]], nrow(doc.term))
  # check.names = FALSE leaves the existing column names untouched.
  data.frame(doc.term, targetCat = label, check.names = FALSE)
}
# Label the rows of every category's matrix with its category name.
catTDM <- lapply(tdm, bindCategoryTDM)

# Stack: combine all categories into one data frame. Terms that do not occur
# in a category come back as NA from rbind.fill() and are set to a count of 0.
tdm.stack <- do.call(rbind.fill, catTDM)
tdm.stack[is.na(tdm.stack)] <- 0

# Hold-out: 70% of the rows (rounded up) for training, the rest for testing.
# The split is random; call set.seed() beforehand for a reproducible run.
n.docs <- nrow(tdm.stack)
train.idx <- sample(n.docs, ceiling(n.docs * 0.7))
text.idx <- setdiff(seq_len(n.docs), train.idx)
if (length(text.idx) == 0) {
  stop("Not enough documents to hold out a test set", call. = FALSE)
}

# Model: k-nearest-neighbour on the term counts.
tdm.cat <- tdm.stack[, "targetCat"]
# drop = FALSE keeps a data frame even when only one term column is left.
tdm.stack.nl <- tdm.stack[
  , !colnames(tdm.stack) %in% "targetCat",
  drop = FALSE
]
knn.pred <- knn(
  tdm.stack.nl[train.idx, , drop = FALSE],
  tdm.stack.nl[text.idx, , drop = FALSE],
  tdm.cat[train.idx]
)

# Accuracy: give predictions and actual labels the same ordered level set so
# the confusion matrix is square and its diagonal really holds the matches,
# even when a category is absent from the training or the test split.
cat.levels <- sort(unique(as.character(tdm.cat)))
conf.mat <- table(
  "predictions" = factor(as.character(knn.pred), levels = cat.levels),
  Actual = factor(as.character(tdm.cat[text.idx]), levels = cat.levels)
)
accuracy <- sum(diag(conf.mat)) / length(text.idx) * 100
accuracy
conf.mat
# ---------------------------------------------------------------------
# inspect(stem) - display the contents of the corpus
# corpus <- Corpus(DataframeSource(csvpath)) - read all the documents from the CSV data into a corpus
# findFreqTerms(tdm, 300) - list the terms that occur at least 300 times