#!/usr/bin/python3
# coding: UTF-8
r"""
Tokenize a UTF-8 plain-text file (with #START lines) by calling the huntoken wrapper shell script, with customizations:
 - identify URLs and keep them as single tokens
 - fix a huntoken output bug: "<\n/s\n>\n" => "\n" (exploded </s> end-of-sentence tag)
 - insert missing spaces after punctuation between words (to correct the input text), e.g. foo.Bar => foo. Bar
 - remove the period from the end of a token if the next token is "...", e.g. "alma...." tokenized as "alma.\n...\n" => "alma\n...\n"
   (Note: this will break correct(?) uses of abbreviations followed by ellipses, e.g. "stb....")
Usage:
  mytokenize.py INFILE OUTFILE
"""

import re
import subprocess
import sys

from pytimeout import Timeout

HUNTDIR = '/home/mihaltz/hun-tools'

# URL-in-text regexp from https://gist.github.com/uogbuji/705383
GRUBER_URLINTEXT_PAT = re.compile(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))')
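# e.g. (illustrative):
#   GRUBER_URLINTEXT_PAT.search('Lásd: www.example.com oldalt').group(0)
#   == 'www.example.com'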

# matches inter-word punctuation with no space after it
MISSING_SPACE_PAT = re.compile(r'([aábcdeéfghiíjklmnoóöőpqrstuúüűvwxyzAÁBCDEÉFGHIÍJKLMNOÓÖŐPQRSTUÚÜŰVWXYZ])([\.,]+)([aábcdeéfghiíjklmnoóöőpqrstuúüűvwxyzAÁBCDEÉFGHIÍJKLMNOÓÖŐPQRSTUÚÜŰVWXYZ])')
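# e.g. MISSING_SPACE_PAT.sub(r'\1\2 \3', 'foo.Bar') == 'foo. Bar'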

# global temp vars: URLS collects the URLs saved during pre-processing,
# UCNT counts how many of them have been restored during post-processing
URLS = []
UCNT = 0


def call_huntoken(inp, outp):
    """Call the huntoken command-line tokenizer via a subprocess.
    inp is an untokenized string;
    outp is the stream to write the tokenized output to.
    """
    p = subprocess.Popen(HUNTDIR + '/010.huntoken', stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    (stdout, stderr) = p.communicate(bytes(tok_preproc(inp), encoding='UTF-8'))
    if p.returncode != 0:
        sys.stderr.write('Command returned with exit code {0}\n'.format(p.returncode))
    else:
        outp.write(tok_postproc(str(stdout, encoding='UTF-8')))
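
# Hypothetical usage (assumes the 010.huntoken wrapper script is present
# under HUNTDIR):
#   call_huntoken('Ez egy mondat.', sys.stdout)
# would write the tokenized sentence, one token per line, to standard output.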

def _repl_url1(m):
    """re.sub callback: save the matched URL and replace it with a placeholder."""
    global URLS
    URLS.append(m.group(0))
    return '###URL###'


def _repl_url2(m):
    """re.sub callback: restore the next saved URL, in order of appearance."""
    global URLS, UCNT
    UCNT += 1
    return URLS[UCNT - 1]


def tok_preproc(inp):
    """Pre-processing before tokenization. inp is an untokenized string (UTF-8).
    - identify URLs, save them, and replace them with '###URL###' (so the tokenizer cannot break them up)
    - insert missing spaces after punctuation between words (to correct the input text), e.g. foo.Bar => foo. Bar
    """
    global URLS
    URLS = []
    # identify URLs, save them, and replace them with '###URL###'
    try:
        with Timeout(5):
            outp = GRUBER_URLINTEXT_PAT.sub(_repl_url1, inp)
    except Timeout.Timeout:
        sys.stderr.write('Warning (mytokenize.py): URL regexp substitution timed out on line:\n{0}\n'.format(inp))
        outp = inp
    # insert missing spaces after punctuation between words
    outp = MISSING_SPACE_PAT.sub(r'\1\2 \3', outp)
    return outp
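
# Illustrative example (URL protection and spacing fix combined; the input
# text is made up):
#   tok_preproc('Nézd meg: www.example.com oldalt.Köszi!')
#   == 'Nézd meg: ###URL### oldalt. Köszi!'    # and URLS == ['www.example.com']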


def tok_postproc(inp):
    r"""Post-processing after tokenization. inp is a tokenized string ('\n' after each token, '\n\n' after each sentence).
    - replace the '###URL###' placeholders with the original URL strings, in order of appearance
    - fix a huntoken output bug: the exploded </s> end-of-sentence tag ("<\n/s\n>\n") is collapsed to "\n", which together with the newline of the preceding token yields the "\n\n" sentence boundary
    - remove the period from the end of a token if the next token is "...", e.g. "alma...." tokenized as "alma.\n...\n" => "alma\n...\n"
      (Note: this will break correct uses of abbreviations followed by ellipses, e.g. "stb....")
    """
    global UCNT
    UCNT = 0
    # the tokenizer explodes the '###URL###' placeholder into seven tokens
    outp = re.sub(r'#\n#\n#\nURL\n#\n#\n#', _repl_url2, inp)
    outp = re.sub(r'<\n/s\n>\n', r'\n', outp)
    outp = re.sub(r'(\w+)\.\n\.\.\.\n', r'\1\n...\n', outp)
    return outp
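
# Illustrative example, assuming huntoken exploded the placeholder into the
# seven tokens '#','#','#','URL','#','#','#' and URLS == ['www.example.com']:
#   tok_postproc('#\n#\n#\nURL\n#\n#\n#\noldalt\n<\n/s\n>\n')
#   == 'www.example.com\noldalt\n\n'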


if __name__ == '__main__':
    if len(sys.argv) != 3:
        sys.exit('Usage: mytokenize.py <input_filename> <output_filename>\n')
    inp = open(sys.argv[1], encoding='UTF-8')
    outp = open(sys.argv[2], 'w', encoding='UTF-8')
    for line in inp:
        line = line.rstrip()
        if line.startswith('#START_'):
            # pass document-boundary marker lines through as their own "sentence"
            outp.write(line + '\n\n')
        else:
            call_huntoken(line, outp)
    inp.close()
    outp.close()
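
# Example invocation (file names are illustrative):
#   ./mytokenize.py input.txt output.tok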