#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
This run produces an LSI model with Tfidf or log_entropy preprocessing.
"""
from datetime import datetime
from gensim import utils, similarities, matutils, models
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.logentropy_model import LogEntropyModel
from gensim.models.tfidfmodel import TfidfModel
from gensim.parsing import preprocessing
from os import path
import numpy as np
import os
import sys
import tools
def main(param_file=None):
    """Train an LSI model with TFIDF or log-entropy preprocessing.

    Loads a dictionary and bag-of-words corpus, builds the preprocessing
    model selected by ``p['pre_model']`` ('tfidf' or 'log_ent'), trains an
    LSI model on the transformed corpus, saves all artifacts to the output
    directory, and finally evaluates the model against the Lee human
    similarity data by correlating pairwise document similarities.

    :param param_file: optional path to the parameter file passed through
        to ``tools.setup``; ``None`` uses the default configuration.
    :raises ValueError: if ``p['pre_model']`` names an unknown model.
    """
    # setup: read parameters, resolve base/output paths, configure logging
    p, base_path, output_dir = tools.setup(param_file)
    working_corpus = path.join(base_path, p['corpus_path'], p['corpus_name'])
    human_data_file = path.join(base_path, p['human_data_file'])
    lee_corpus = path.join(base_path, p['lee_corpus'])
    logger = tools.get_logger('gensim', path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    # remember starting time for runtime evaluation
    start = datetime.now()

    logger.info('loading word mapping')
    dictionary = Dictionary.load(path.join(base_path,
                                           p['corpus_path'],
                                           p['dict_name']))
    # keep a copy of the dictionary next to the trained models
    dictionary.save(path.join(output_dir, p['dict_name']))
    logger.info(dictionary)

    logger.info('loading corpus')
    corpus_bow = MmCorpus(working_corpus)

    logger.info("create preprocessing model and save it to disk")
    if p['pre_model'] == 'tfidf':
        pre_model = TfidfModel(corpus_bow, id2word=dictionary, normalize=True)
    elif p['pre_model'] == 'log_ent':
        pre_model = LogEntropyModel(corpus_bow,
                                    id2word=dictionary, normalize=True)
    else:
        raise ValueError('model parameter %s not known' % p['pre_model'])
    pre_model.save(os.path.join(output_dir, p['pre_model_extension']))

    logger.info('initialize LSI model')
    lsi = models.LsiModel(pre_model[corpus_bow],
                          id2word=dictionary, num_topics=p['num_topics'])
    lsi.save(os.path.join(output_dir, p['lsi_extension']))
    logger.info('finished --> lsi model saved to: %s' %
                 os.path.join(output_dir, p['lsi_extension']))

    # check for correlation with lee human data
    logger.info('load small lee corpus and preprocess')
    with open(lee_corpus, 'r') as f:
        preproc_lee_texts = preprocessing.preprocess_documents(f.readlines())
    bow_lee_texts = [dictionary.doc2bow(text,
                                        allow_update=False,
                                        return_missing=False)
                    for text in preproc_lee_texts]

    logger.info('transforming small lee corpus (LSI)')
    corpus_lsi = lsi[pre_model[bow_lee_texts]]

    # compute pairwise cosine-similarity matrix of the transformed corpus
    sim_matrix = np.zeros((len(corpus_lsi), len(corpus_lsi)))
    for i, par1 in enumerate(corpus_lsi):
        for j, par2 in enumerate(corpus_lsi):
            sim_matrix[i, j] = matutils.cossim(par1, par2)
    # flatten the strict upper triangle (unique unordered pairs, no diagonal)
    sim_vector = sim_matrix[np.triu_indices(len(corpus_lsi), 1)]

    # read the human similarity data and flatten upper triangular
    human_sim_matrix = np.loadtxt(human_data_file)
    sim_m_size = np.shape(human_sim_matrix)[0]
    human_sim_vector = human_sim_matrix[np.triu_indices(sim_m_size, 1)]

    # compute correlations
    cor = np.corrcoef(sim_vector, human_sim_vector)
    logger.info("correlation with lee human data: %f" % cor[0, 1])

    # BUGFIX: elapsed time is now - start (was start - now, which produced
    # a negative timedelta and nonsense days/seconds in the log)
    dif = datetime.now() - start
    logger.info("finished after %d days and %d secs" % (dif.days, dif.seconds))
# Script entry point: run the full LSI pipeline with default parameters.
if __name__ == '__main__':
    main()