build_vocab.py
import os
import re
from collections import Counter

import sentencepiece as spm

# Matches special characters to be replaced with spaces during preprocessing.
re_sc = re.compile(r'[@#$%\^&\*\(\)\-\=\[\]\{\}\.,/~\+\'"|_:><`┃]')


def train_spm(txt_path='train/train_data',
              spm_path='./spm',
              vocab_size=10000, input_sentence_size=11000000):
    """Train a BPE SentencePiece model on the given text file."""
    spm_dir = os.path.dirname(spm_path)
    os.makedirs(spm_dir, exist_ok=True)
    spm.SentencePieceTrainer.Train(
        f' --input={txt_path} --model_type=bpe'
        f' --model_prefix={spm_path} --vocab_size={vocab_size}'
        f' --input_sentence_size={input_sentence_size}'
    )


stopwords = ['!@#$%^&*()_+=,./']


def text_cleaning(x, stopwords):
    """Replace stopword strings with spaces and lowercase the text."""
    for sw in stopwords:
        x = x.replace(sw, ' ')
    x = x.lower()
    return x


def build_vocab(mode='train', dataset_path='.', vocab_size=10000):
    """Preprocess the training data, train SentencePiece, and build the word-piece vocabulary."""
    if mode != 'train':
        print('only [train] is supported')
        return
    data_review = os.path.join(dataset_path, 'train', 'train_data')

    print('writing preprocessed titles ...')
    reviews = []
    with open(data_review, 'rt') as f:
        for line in f:
            # Strip special characters and collapse repeated whitespace.
            reviews.append(' '.join(re_sc.sub(' ', line).strip().split()))
    write_titles(reviews, './reviews.txt')

    print('training spm ...')
    train_spm('./reviews.txt', vocab_size=vocab_size)

    print('build wp vocab ...')
    wp_vocab = build_wp_vocab(reviews)
    return reviews, wp_vocab


def build_wp_vocab(reviews, spm_model_path='./spm.model'):
    """Count word-piece frequencies over the reviews with the trained SentencePiece model."""
    sp = spm.SentencePieceProcessor()
    sp.Load(spm_model_path)

    wp_counter = Counter()
    max_wps_len = 0
    for review in reviews:
        words = review.split()
        wps = []
        for w in words:
            wp = sp.EncodeAsPieces(w)
            max_wps_len = max(len(wp), max_wps_len)
            wps += wp
        for wp in wps:
            wp_counter[wp] += 1

    # Reserve the first slot for PAD; store the longest per-word piece sequence alongside it.
    wp_vocab = [('PAD', max_wps_len)] + wp_counter.most_common()
    write_vocab(wp_vocab, './wp_vocab.txt')
    return wp_vocab


def write_vocab(vocab, vocab_fn):
    """Write (token, count) pairs as tab-separated lines."""
    with open(vocab_fn, 'w') as fp:
        for v, c in vocab:
            fp.write(f'{v}\t{c}\n')


def write_titles(titles, titles_path):
    """Write one preprocessed title per line."""
    with open(titles_path, 'w') as f_titles:
        for title in titles:
            f_titles.write(title + '\n')
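

# Usage sketch (not part of the original file): assuming the raw training text sits at
# ./train/train_data relative to the working directory, building the vocabulary end to
# end might look like this. It writes ./reviews.txt, ./spm.model/.vocab and ./wp_vocab.txt.
if __name__ == '__main__':
    reviews, wp_vocab = build_vocab(mode='train', dataset_path='.', vocab_size=10000)
    print(f'{len(reviews)} reviews, {len(wp_vocab)} word pieces (incl. PAD)')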