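"""nlp.py

Builds a word vocabulary from the notification texts in data/pos_train and
data/neg_train, trains a relevance classifier (a 1D-CNN via make_model(), or a
Gaussian Naive Bayes baseline via make_model_NB()), and exposes the
predict()/relevant() helpers that label a notification string as relevant (1)
or irrelevant (0).
"""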
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Conv1D, Flatten, MaxPooling1D, Embedding
from os import listdir
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pickle5 as pickle
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.model_selection import train_test_split

lemmatizer = WordNetLemmatizer()
# load doc into memory
def load_doc(filename):
    # open the file as read only and read all text
    with open(filename, 'r') as file:
        text = file.read()
    return text

# turn a doc into clean tokens
def clean_doc(doc):
    # split into tokens by white space
    tokens = doc.split()
    # lemmatize each token
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    # remove tokens that are not purely alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if w not in stop_words]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 1]
    return tokens
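# Illustrative example (a sketch; exact output depends on the installed NLTK data):
#   clean_doc("the exam results are published today")
#   -> ['exam', 'result', 'published', 'today']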
# load doc and add to vocab
def add_doc_to_vocab(filename, vocab):
    # load doc
    doc = load_doc(filename)
    # clean doc
    tokens = clean_doc(doc)
    # update counts
    vocab.update(tokens)

# load all docs in a directory into the vocabulary
def process_docs(directory, vocab, is_train):
    # walk through all files in the folder
    for filename in listdir(directory):
        # create the full path of the file to open
        path = directory + '/' + filename
        # add doc to vocab
        add_doc_to_vocab(path, vocab)
# define vocab
vocab = Counter()
# add all docs to vocab
process_docs('data/neg_train', vocab, True)
process_docs('data/pos_train', vocab, True)
# print the size of the vocab
print(len(vocab))
# print the top words in the vocab
print(vocab.most_common(50))
# keep only tokens that occur at least min_occurrence times
min_occurrence = 3
tokens = [k for k, c in vocab.items() if c >= min_occurrence]
print(len(tokens))

# save list to file
def save_list(lines, filename):
    # convert lines to a single blob of text
    data = '\n'.join(lines)
    # write text to file
    with open(filename, 'w') as file:
        file.write(data)

# save tokens to a vocabulary file
save_list(tokens, 'vocab.txt')
# load the vocabulary
vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = set(vocab.split())

# turn a doc into a clean, vocabulary-filtered string
# (this overrides the earlier clean_doc once the vocabulary has been built)
def clean_doc(doc, vocab):
    # split into tokens by white space
    tokens = doc.split()
    # lemmatize each token
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    # filter out tokens not in vocab
    tokens = [w for w in tokens if w in vocab]
    tokens = ' '.join(tokens)
    return tokens
# load and clean all docs in a directory
# (overrides the earlier vocabulary-building process_docs once vocab.txt exists)
def process_docs(directory, vocab, is_train):
    documents = list()
    print(directory + " : ", len(listdir(directory)))
    # walk through all files in the folder
    for filename in listdir(directory):
        # create the full path of the file to open
        path = directory + '/' + filename
        # load the doc
        doc = load_doc(path)
        # clean doc
        tokens = clean_doc(doc, vocab)
        # add to list
        documents.append(tokens)
    return documents
def make_model():
    # load all training reviews
    positive_docs = process_docs('data/pos_train', vocab, True)
    negative_docs = process_docs('data/neg_train', vocab, True)
    train_docs = negative_docs + positive_docs
    # create the tokenizer
    tokenizer = Tokenizer()
    # fit the tokenizer on the documents
    tokenizer.fit_on_texts(train_docs)
    with open('tokenizer.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    # sequence encode
    encoded_docs = tokenizer.texts_to_sequences(train_docs)
    # pad sequences to the length of the longest document
    max_length = max([len(s.split()) for s in train_docs])
    print("\n\n max_length=" + str(max_length))
    X = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
    # define training labels (one label per document, negatives first)
    y = np.array([0] * len(negative_docs) + [1] * len(positive_docs))
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.30, random_state=42)
    '''
    # load all test reviews
    positive_docs = process_docs('data/pos_test', vocab, False)
    negative_docs = process_docs('data/neg_test', vocab, False)
    test_docs = negative_docs + positive_docs
    # sequence encode
    encoded_docs = tokenizer.texts_to_sequences(test_docs)
    # pad sequences
    Xtest = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
    # define test labels
    ytest = np.array([0 for _ in range(len(listdir("data/neg_test")))] + [1 for _ in range(len(listdir("data/pos_test")))])
    '''
    print("\n pad_sequences : ", Xtest)
    print("\n ytest : ", ytest)
    # define vocabulary size (largest integer value)
    vocab_size = len(tokenizer.word_index) + 1
    # define model
    model = Sequential()
    model.add(Embedding(vocab_size, 100, input_length=max_length))
    model.add(Conv1D(filters=64, kernel_size=8, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.summary()
    # compile network
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # fit network
    model.fit(Xtrain, ytrain, epochs=20, verbose=1)
    # evaluate
    loss, acc = model.evaluate(Xtest, ytest, verbose=0)
    print('Test Accuracy: %f' % (acc * 100))
    model.save("relevancy_model_v2.0.1.h5")
    print("Done!")
def make_model_NB():
    # load all training reviews
    positive_docs = process_docs('data/pos_train', vocab, True)
    negative_docs = process_docs('data/neg_train', vocab, True)
    train_docs = negative_docs + positive_docs
    # create the tokenizer
    tokenizer = Tokenizer()
    # fit the tokenizer on the documents
    tokenizer.fit_on_texts(train_docs)
    with open('tokenizer.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    # sequence encode
    encoded_docs = tokenizer.texts_to_sequences(train_docs)
    # pad sequences to the length of the longest document
    max_length = max([len(s.split()) for s in train_docs])
    print("\n\n max_length=" + str(max_length))
    Xtrain = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
    # define training labels (one label per document, negatives first)
    ytrain = np.array([0] * len(negative_docs) + [1] * len(positive_docs))
    # load all test reviews
    positive_docs = process_docs('data/pos_test', vocab, False)
    negative_docs = process_docs('data/neg_test', vocab, False)
    test_docs = negative_docs + positive_docs
    # sequence encode
    encoded_docs = tokenizer.texts_to_sequences(test_docs)
    # pad sequences
    Xtest = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
    # define test labels
    ytest = np.array([0 for _ in range(len(listdir("data/neg_test")))] + [1 for _ in range(len(listdir("data/pos_test")))])
    print("\n pad_sequences : ", Xtest)
    print("\n ytest : ", ytest)
    # fit a Gaussian Naive Bayes classifier on the padded sequences
    gnb = GaussianNB()
    gnb.fit(Xtrain, ytrain)
    # making predictions on the testing set
    y_pred = gnb.predict(Xtest)
    print("Gaussian Naive Bayes model accuracy (in %):", metrics.accuracy_score(ytest, y_pred) * 100)
def predict_process_docs(doc, vocab):
    documents = list()
    # clean doc
    tokens = clean_doc(doc, vocab)
    # add to list
    documents.append(tokens)
    return documents
def predict(doc):
    predict_docs = predict_process_docs(doc, vocab)
    with open('tokenizer.pickle', 'rb') as handle:
        tokenizer = pickle.load(handle)
    encoded_docs = tokenizer.texts_to_sequences(predict_docs)
    # maxlen must match the max_length used when the model was trained
    X = pad_sequences(encoded_docs, maxlen=94, padding='post')
    # load model
    model = load_model('relevancy_model.h5')
    # threshold the sigmoid output at 0.5 (predict_classes is no longer available in recent Keras)
    y = (model.predict(np.array(X)) > 0.5).astype(int)
    if y[0][0] == 1:
        # Relevant
        return 1
    else:
        # Irrelevant
        return 0
def relevant(notif):
    """ Checks for relevance of the notification content """
    result = predict(notif)
    print(result)
    if result == 1:
        return 1
    else:
        return 0
if __name__ == "__main__":
result=predict(" It is here by notified that the result of B.Tech S5 (S) Exam July 2019 is published. The detailed results are available in the KTU website: www.ktu.edu.in. Students can apply for answer script copy and revaluation by registering in the KTU web portal from 28.10.2019 Monday to 01.11.2019 Friday. The Fee for answer script copy is Rs.500/- per answer script and for revaluation Rs. 600/- per answer script. Students should submit their requests through student login and pay the fee at College office on or before 01.11.2019 Friday. Requests for late registration for revaluation and answer book copy will not be entertained. However in case of technical issues the request will be considered only if the matter is reported to University before the last date with proof.Result Notification - S5 (S)")
print(result)
if (result == 1) :
print("relevant")
else :
print("Irrelevant")