-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathread_ngram.py
68 lines (62 loc) · 1.77 KB
/
read_ngram.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
"""This code generates n-gram files."""
import re
import os
import subprocess
import argparse
import sys
# Command-line interface. Each help text describes how the option is actually
# used further down in this script.
parser = argparse.ArgumentParser()
parser.add_argument('--filename', type=str, default='input.txt',
                    help='Path of the input text file to read and score.')
parser.add_argument('--outfile', type=str, default='output.txt',
                    help='Path of the file the per-token output is '
                         'written to.')
parser.add_argument('--lm', type=str, default='LM',
                    help='Path of the language model file passed to '
                         'ngram via -lm.')
args = parser.parse_args()
# Flatten the input file into a single list of whitespace-separated tokens,
# with an end-of-sentence marker "</s>" appended after the tokens of every
# line (blank lines contribute just the marker).
with open(args.filename, 'r') as text_file:
    data = [token
            for raw_line in text_file.readlines()
            for token in (raw_line + " </s>").split()]
# Directory holding the SRILM binaries, and the `ngram` executable inside it.
srilm = 'srilm/bin/i686-m64'
ngram = os.path.join(srilm, 'ngram')
# This RE is used to parse output produced by SRILM.
# Group 1 is the text between "p( " and " |" (the scored token); group 2 is
# the text between "] " and " [" (the value reported for that token).
regex = re.compile(
    r'\sp\(\s(.*)\s\|.*\]\s(.*)\s\['
)
lm_file = args.lm
# Run SRILM's ngram on the input file. The arguments are passed as a list
# (no shell), so file names containing spaces or shell metacharacters reach
# the program unmodified instead of being re-parsed by a shell.
command = [
    ngram,
    "-unk",
    "-order", str(3),
    "-lm", lm_file,
    "-debug", "2",
    "-ppl", args.filename,
]
# stderr is merged into the captured output. universal_newlines=True makes
# the result text (not bytes) on Python 3 as well, so it can be split on '\n'.
# Raises subprocess.CalledProcessError if ngram exits with a non-zero status.
results = subprocess.check_output(command,
                                  stderr=subprocess.STDOUT,
                                  universal_newlines=True)
# Pair every probability line printed by SRILM with the corresponding token
# from the input and collect "token value" lines in `output`.
#
# The first input word is replaced by the start marker "<s>", which is
# written with a fixed value of 1.0. Guarded so an empty input file does not
# raise IndexError.
if data:
    data[0] = "<s>"
results = results.split('\n')
token_ptr = 0
output_lines = []
for result in results:
    match = regex.search(result)
    if not match:
        # Not a per-token line (sentence echo, summary, blank line, ...).
        continue
    if data[token_ptr] == '<s>':
        # Ignoring the first word: emit the "<s>" entry and skip the SRILM
        # line that belongs to the word it replaced. Without this skip every
        # following token would be paired with the previous word's line and
        # the last line would index past the end of `data`.
        output_lines.append("<s> 1.0\n")
        token_ptr += 1
        continue
    active_token = data[token_ptr]
    # Confirm active_token and matched token are same!
    # SRILM may report a token as '<unk>', which is accepted as a match.
    if active_token != match.group(1) and match.group(1) != '<unk>':
        print(result)
        print(match)
        print("Error! " + active_token + " " + match.group(1))
        # Mismatches within the first 50 tokens are only reported; beyond
        # that the run is aborted before any output file is written.
        if token_ptr > 50:
            sys.exit()
    output_lines.append(active_token + " " + match.group(2) + "\n")
    token_ptr += 1
# Joined once at the end instead of repeated string concatenation.
output = "".join(output_lines)
# Persist the collected "token value" lines, replacing any existing file.
with open(args.outfile, 'w') as out_file:
    out_file.write(output)