tokenization.py
from typing import List, Tuple

from pytorch_transformers.tokenization_bert import (
    _is_punctuation as is_punctuation,
    _is_whitespace as is_whitespace,
)


class Token(object):
    """Info about a single token."""

    def __init__(self,
                 text: str,
                 offset: int,
                 index: int,
                 tail: str = '',
                 tag: str = None):
        if not isinstance(text, str) or not text:
            raise TypeError('text should be a non-empty string.')
        if not isinstance(offset, int) or offset < 0:
            raise TypeError('offset should be an int >= 0.')
        if not isinstance(index, int) or index < 0:
            raise TypeError('index should be an int >= 0.')
        self.text = text
        self.offset = offset
        self.tail = tail
        self.tag = tag
        self._example = None
        self._index = index

    def __str__(self):
        return '{}{}'.format(self.text, self.tail)

    def __repr__(self):
        return 'Token(text=%r, offset=%r, index=%r, tail=%r, tag=%r)' % \
            (self.text, self.offset, self.index, self.tail, self.tag)

    def __len__(self):
        return len(self.text) + len(self.tail)

    def __add__(self, char):
        self.text += char
        return self

    @property
    def example(self):
        return self._example

    @property
    def index(self):
        return self._index

    @property
    def is_punct(self):
        return is_punctuation(self.text)

    def has_tail(self):
        return bool(self.tail)

    @property
    def nbor(self):
        """Returns the neighboring token, e.g.,
        self._example.doc_tokens[self.index + 1]."""
        if self.index is None:
            return None
        try:
            return self._example.doc_tokens[self.index + 1]
        except IndexError:
            return None


def reconstruct_text_from_tokens(tokens: List[Token],
                                 include_last_tail: bool = False,
                                 ) -> str:
    """Concatenates the text of a sequence of tokens."""
    def text_generator(tokens):
        for i, token in enumerate(tokens):
            yield token.text
            if i < len(tokens) - 1 or include_last_tail:
                yield token.tail

    return ''.join(piece for piece in text_generator(tokens))


class TokenizerWithAlignment:
    """Tokenizer that performs basic tokenization keeping string alignment."""

    def __init__(self):
        pass

    @staticmethod
    def _begin_new_token(doc_tokens, text, offset):
        token = Token(text=text, offset=offset, index=len(doc_tokens))
        doc_tokens.append(token)
        return token

    def tokenize(self, text: str) -> Tuple[List[Token], List[int]]:
        doc_tokens = []
        char_to_word_offset = []
        new_word = True
        curr_token = None

        for offset, c in enumerate(text):
            if is_whitespace(c):
                # Whitespace ends the current word and is kept in its tail.
                new_word = True
                if curr_token:
                    curr_token.tail += c
            else:
                if is_punctuation(c):
                    # Punctuation always becomes a single-character token.
                    curr_token = self._begin_new_token(doc_tokens, c, offset)
                    new_word = True
                else:
                    if new_word:
                        curr_token = self._begin_new_token(
                            doc_tokens, c, offset)
                    else:
                        curr_token += c
                    new_word = False
            # Note: whitespace that appears before any token would map to
            # offset -1, hence the max() below.
            # char_to_word_offset.append(len(doc_tokens) - 1)
            char_to_word_offset.append(max(0, len(doc_tokens) - 1))

        return doc_tokens, char_to_word_offset

    def __call__(self, text: str) -> Tuple[List[Token], List[int]]:
        return self.tokenize(text)
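

if __name__ == '__main__':
    # Minimal usage sketch, not part of the original module: exercises the
    # tokenizer and the round-trip reconstruction on a hypothetical sentence.
    tokenizer = TokenizerWithAlignment()
    text = 'Hello, world!  Offsets are preserved.'
    tokens, char_to_word_offset = tokenizer(text)

    # Every token records the character offset where it starts in `text`.
    for token in tokens:
        assert text[token.offset:token.offset + len(token.text)] == token.text

    # char_to_word_offset maps each character position to a token index.
    assert len(char_to_word_offset) == len(text)
    assert all(0 <= i < len(tokens) for i in char_to_word_offset)

    # Trailing whitespace is kept in each token's `tail`, so the input can be
    # rebuilt exactly (leading whitespace, if any, would be dropped).
    assert reconstruct_text_from_tokens(tokens, include_last_tail=True) == text
    print([str(t) for t in tokens])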