-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtester.py
192 lines (159 loc) · 6.79 KB
/
tester.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
import string
import time
from gensim.models import KeyedVectors
import NLPyPort as nlpyport
from proverb_selector.sel_utils.file_manager import read_write_obj_file
#from headline_gen.gen_utils.utils_gen import find_label, get_word_tfidf
#from gen_methods.movie_titles import check_movie_pt
from headline_gen.gen_utils.utils_gen import check_pos, trim_pos
def get_tfidf(headline, all_words, all_occurrences):
token_dets = []
hl_tokens = headline.lower().translate(str.maketrans('', '', string.punctuation)).split()
for token in hl_tokens:
if token in all_words:
token_dets.append((token, all_occurrences[all_words.index(token)]))
return sorted(token_dets, key=lambda tup: tup[1], reverse=False)
def test_data_creation(tokens, model, filename, all_labels):
tfidf_words = []
tfidf_occurrences = []
nlpyport.load_config()
with open(filename, 'r') as proverb_file:
p_reader = proverb_file.readlines()
# print(len(p_reader))
for row in p_reader:
line = row.split()
occurrences = int(line[0])
if line[1] in model.vocab:
for label in all_labels:
if label[0] == line[1] and line[1] not in tfidf_words:
tfidf_occurrences.append(occurrences)
tfidf_words.append(line[1])
break
print(len(tfidf_words), tfidf_words)
read_write_obj_file(0, tfidf_words, 'headline_gen/gen_inputs/tfidf_words.pk1')
print(len(tfidf_occurrences), tfidf_occurrences)
read_write_obj_file(0, tfidf_occurrences, 'headline_gen/gen_inputs/tfidf_occur.pk1')
def create_freq_dict(filename, model, all_labels, min_freq=100):
dicio = {}
with open(filename, 'r') as file:
p_reader = file.readlines()
for row in p_reader:
line = row.split()
occ = int(line[0])
if occ >= min_freq and line[1] in all_labels and line[1] in model.vocab:
if line[1] not in dicio:
dicio[line[1]] = occ
read_write_obj_file(0, dicio, 'models_db/dict_freq.pk1')
def labels_2_dict(all_labels):
dicio = {}
for l in all_labels:
if l[0] not in dicio:
dicio[l[0]] = []
dicio[l[0]].append(l)
read_write_obj_file(0, dicio, 'models_db/dict_labels.pk1')
def best_rated_movies_pt(file_ids_ratings, file_ids_titles, min_rating, min_tokens=3):
ids_ratings = {}
ids_titles = {}
with open(file_ids_ratings, 'r') as ratings:
next(ratings)
for line in ratings:
cols = line.split('\t')
ids_ratings[cols[0]] = float(cols[1])
with open(file_ids_titles, 'r') as titles:
next(titles)
for line in titles:
cols = line.split('\t')
if cols[3] == 'PT' and len(cols[2].split()) >= min_tokens:
ids_titles[cols[0]] = cols[2]
titles = []
for id in ids_ratings:
if id in ids_titles and ids_ratings[id] >= min_rating:
#print(ids_titles[id], ids_ratings[id])
titles.append(ids_titles[id].lower())
return titles
def check_movie_pt(movie_title, dict_forms_labels):
movie_tokens = movie_title.lower().translate(str.maketrans('', '', string.punctuation)).split()
verifier = 0
if len(movie_tokens) <= 3 or movie_title[0:4] == 'Epis':
return False
for token in movie_tokens:
if token in dict_forms_labels:
label = dict_forms_labels[token]
if check_pos(trim_pos(label[0][2])): #so' esta' a olhar para a primeira label
verifier += 1
else:
return False
if verifier < 2:
return False
for i in range(0, 10):
if str(i) in movie_title:
return False
return True
def load_proverbs(file_proverbs):
proverbs = []
with open(file_proverbs, 'r') as file:
for line in file:
proverbs.append(line.strip())
return proverbs
def create_expressions_file():
movie_titles = best_rated_movies_pt('headline_gen/gen_inputs/title.ratings.tsv', 'gen_inputs/movie_titles_pt.tsv', 7, 3)
print("Movie titles", len(movie_titles))
dict_forms_labels = read_write_obj_file(1, None, 'models_db/dict_labels.pk1')
filtered_titles = []
for t in movie_titles:
if check_movie_pt(t, dict_forms_labels):
# print(t)
filtered_titles.append(t)
print("Filtered titles", len(filtered_titles))
proverbs = load_proverbs('headline_gen/gen_inputs/proverbs.txt')
print("Proverbs", len(proverbs))
expressions = []
expressions.extend(proverbs)
expressions.extend(filtered_titles)
read_write_obj_file(0, expressions, 'models_db/prov_movies_v2.pk1')
if __name__ == '__main__':
#create_expressions_file()
'''
expressions = read_write_obj_file(1, None, '../models_db/prov_movies_v2.pk1')
for c, e in enumerate(expressions):
if c < 1500:
print(c, e.strip())
'''
'''
print("Carregar labels...")
all_labels = read_write_obj_file(1, None, '../models_db/list_labels_v3.pk1')
print("Guardar dicionário labels...")
labels_2_dict(all_labels)
all_labels = read_write_obj_file(1, None, '../models_db/dict_labels.pk1')
print(type(all_labels))
print("Carregar GloVe...")
model = KeyedVectors.load('../models_db/we_models/model_glove300.model')
print("Guardar dicionário frequência...")
create_freq_dict('gen_inputs/formas.cetempublico.utf8.txt', model, all_labels, min_freq=50)
'''
'''
all_labels = read_write_obj_file(1, None, '../models_db/dict_labels.pk1')
for k in all_labels:
print(k, all_labels[k])'''
"""hl_example = "Quando vai acabar a quarentena? Estou completamente farto disto."
all_labels = read_write_obj_file(1, None, 'gen_inputs/list_labels_v3.pk1')
tfidf_words = read_write_obj_file(1, None, 'gen_inputs/tfidf_words.pk1')
tfidf_occur = read_write_obj_file(1, None, 'gen_inputs/tfidf_occur.pk1')
selected = read_write_obj_file(1, None, 'gen_inputs/selected_rank.pk1')
all_headlines = [s[0] for s in selected]
all_headlines.insert(0, hl_example)
start = time.time()
test_data_creation(
hl_example.split(),
KeyedVectors.load('C:/Disciplinas/Tese/ThesisWork/proverb_selector/sel_inputs/we_models/model_glove300.model'),
'gen_inputs/formas_cetempublico_utf8.txt',
all_labels)
print("NEW ", get_tfidf(hl_example, tfidf_words, tfidf_occur))
print("OLD ", get_word_tfidf(0, all_headlines))
end = time.time()
print("time: ", end-start)
x = read_write_obj_file(1, None, 'gen_inputs/list_labels_V2.pk1')
y = read_write_obj_file(1, None, 'gen_inputs/list_labels_v3.pk1')
print(len(x), len(y))"""
#x = 'afeicão é manual'
#print(x, x.lower().translate(str.maketrans('', '', string.punctuation)).strip())