-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprocessRaw.py
77 lines (63 loc) · 2.38 KB
/
processRaw.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
__author__ = 'zengjichuan'
import nltk
import os
import os.path
import re
import csv
def load_data(dir):
data_folder = dir
corpus = {}
for root, dirs, files in os.walk(data_folder):
for file_name in files:
if re.match(r'\d+', file_name):
raw = open(os.path.join(root, file_name)).read()
corpus[file_name] = raw
return corpus
if __name__ == '__main__':
# files' content list
files_content = []
vocab_set = set()
file_output = []
total_tokens = 0
total_items = 0
# file to read
input_dir = r'F:\KuaiPan\sharedDataset\text\20newsUsedInSentencReglarizerPaper\test'
# file to write
basename = os.path.split(input_dir)[1]
docword_file = os.path.join('uci', 'docword.' + basename + '.txt')
vocab_file = os.path.join('uci', 'vocab.' + basename + '.txt')
f_doc_word_out = open(docword_file, 'w')
f_vocab_out = open(vocab_file, 'w')
# load stopwords
stop_words = [w for (w,) in csv.reader(open(os.path.join('uci', 'stopword.en.txt')))]
# load raw corpus
my_corpus = load_data(input_dir)
# generate words bag
for file_raw in my_corpus:
print 'processing ', file_raw, my_corpus[file_raw][:20]
tokens = [w.lower() for w in nltk.word_tokenize(my_corpus[file_raw]) if w.isalnum() and not w.isdigit() and w.lower() not in stop_words]
vocab_set = vocab_set | set(tokens)
file_word_bag = dict.fromkeys(set(tokens), 0)
for w in tokens:
file_word_bag[w] += 1
total_tokens += 1
total_items += len(file_word_bag)
files_content.append(file_word_bag)
vocab_map = dict(zip(vocab_set, range(len(vocab_set))))
# write vocabulary file
for vocab in vocab_set:
f_vocab_out.write(vocab+'\n')
# write doc_word file's intro
f_doc_word_out.write('%d\n' % len(files_content))
f_doc_word_out.write('%d\n' % len(vocab_set))
f_doc_word_out.write('%d\n' % total_items)
for doc_id in range(len(files_content)):
doc_word_bag = {}
for word, counts in files_content[doc_id].items():
doc_word_bag[vocab_map[word]] = counts
# sort and print
for key_id in sorted(doc_word_bag):
f_doc_word_out.write('%d %d %d\n' % (doc_id+1, key_id + 1, doc_word_bag[key_id]))
f_vocab_out.close()
f_doc_word_out.close()
print 'UCI Text Token Generated!'