Use the new tokenizers #3868
Changes from 30 commits
Changes to the `Tokenizer` base class:

```diff
@@ -1,4 +1,4 @@
-from typing import List
+from typing import List, Optional
 import logging
 
 from allennlp.common import Registrable
@@ -45,3 +45,37 @@ def tokenize(self, text: str) -> List[Token]:
         tokens : `List[Token]`
         """
         raise NotImplementedError
+
+    def add_special_tokens(
+        self, tokens1: List[Token], tokens2: Optional[List[Token]] = None
+    ) -> List[Token]:
+        """
+        Adds special tokens to tokenized text. These are tokens like [CLS] or [SEP].
+
+        Not all tokenizers do this. The default is to just return the tokens unchanged.
+
+        # Parameters
+
+        tokens1 : `List[Token]`
+            The list of tokens to add special tokens to.
+        tokens2 : `Optional[List[Token]]`
+            An optional second list of tokens. This will be concatenated with `tokens1`. Special tokens will be
+            added as appropriate.
+
+        # Returns
+        tokens : `List[Token]`
+            The combined list of tokens, with special tokens added.
+        """
+        return tokens1 + (tokens2 or [])
+
+    def special_tokens_for_sequence(self) -> int:
+        """
+        Returns the number of special tokens added for a single sequence.
+        """
+        return 0
+
+    def special_tokens_for_pair(self) -> int:
+        """
+        Returns the number of special tokens added for a pair of sequences.
+        """
+        return 0
```

Review thread on `special_tokens_for_sequence`:

- Reviewer: This name makes me expect a return type of […].
- Author: I changed the name.
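For orientation, here is a minimal sketch of how a transformer-style subclass might override these hooks. Everything in it is a hypothetical illustration: the `BertStyleTokenizer` class, the whitespace `tokenize` placeholder, and the [CLS]/[SEP] placement are assumptions, not code from this PR (which, per the thread above, also later renames `special_tokens_for_sequence`).

```python
from typing import List, Optional

from allennlp.data.tokenizers import Token, Tokenizer


class BertStyleTokenizer(Tokenizer):
    """Hypothetical subclass illustrating one way the new hooks could be overridden."""

    def tokenize(self, text: str) -> List[Token]:
        # Placeholder so the sketch is self-contained; a real subclass would
        # delegate to an actual wordpiece tokenizer here.
        return [Token(t) for t in text.split()]

    def add_special_tokens(
        self, tokens1: List[Token], tokens2: Optional[List[Token]] = None
    ) -> List[Token]:
        # BERT's convention: [CLS] A [SEP] for one sequence,
        # [CLS] A [SEP] B [SEP] for a pair.
        cls, sep = Token("[CLS]"), Token("[SEP]")
        if tokens2 is None:
            return [cls] + tokens1 + [sep]
        return [cls] + tokens1 + [sep] + tokens2 + [sep]

    def special_tokens_for_sequence(self) -> int:
        return 2  # [CLS] plus one [SEP]

    def special_tokens_for_pair(self) -> int:
        return 3  # [CLS] plus two [SEP]s
```

One use of the counting hooks is that an indexer or dataset reader can subtract them from a model's maximum length before truncating the raw wordpieces, without having to re-run `add_special_tokens`.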
Changes to the `PretrainedTransformerIndexer` tests:

```diff
@@ -35,20 +35,6 @@ def test_as_array_produces_token_sequence_bert_cased(self):
         indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
         assert indexed["token_ids"] == expected_ids
 
-    def test_as_array_produces_token_sequence_bert_cased_sentence_pair(self):
-        tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
-        allennlp_tokenizer = PretrainedTransformerTokenizer("bert-base-cased")
-        indexer = PretrainedTransformerIndexer(model_name="bert-base-cased")
-        default_format = "[CLS] AllenNLP is great! [SEP] Really it is! [SEP]"
-        tokens = tokenizer.tokenize(default_format)
-        expected_ids = tokenizer.convert_tokens_to_ids(tokens)
-        allennlp_tokens = allennlp_tokenizer.tokenize_sentence_pair(
-            "AllenNLP is great!", "Really it is!"
-        )
-        vocab = Vocabulary()
-        indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
-        assert indexed["token_ids"] == expected_ids
-
     def test_as_array_produces_token_sequence_roberta(self):
         tokenizer = AutoTokenizer.from_pretrained("roberta-base")
         allennlp_tokenizer = PretrainedTransformerTokenizer("roberta-base")
@@ -63,20 +49,6 @@ def test_as_array_produces_token_sequence_roberta(self):
         indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
         assert indexed["token_ids"] == expected_ids
 
-    def test_as_array_produces_token_sequence_roberta_sentence_pair(self):
-        tokenizer = AutoTokenizer.from_pretrained("roberta-base")
-        allennlp_tokenizer = PretrainedTransformerTokenizer("roberta-base")
-        indexer = PretrainedTransformerIndexer(model_name="roberta-base")
-        default_format = "<s> AllenNLP is great! </s> </s> Really it is! </s>"
-        tokens = tokenizer.tokenize(default_format)
-        expected_ids = tokenizer.convert_tokens_to_ids(tokens)
-        allennlp_tokens = allennlp_tokenizer.tokenize_sentence_pair(
-            "AllenNLP is great!", "Really it is!"
-        )
-        vocab = Vocabulary()
-        indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
-        assert indexed["token_ids"] == expected_ids
-
     def test_transformers_vocab_sizes(self):
         def check_vocab_size(model_name: str):
             namespace = "tags"
```

Review thread on the removed sentence-pair tests:

- Reviewer: Why don't we need these tests anymore?
- Author: They are testing […].
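The `default_format` strings in the removed tests spell out the per-model special-token layouts that the new counting hooks are meant to summarize: BERT wraps a pair as `[CLS] A [SEP] B [SEP]` (3 special tokens), while RoBERTa uses `<s> A </s> </s> B </s>` (4). The check below is a hypothetical sketch of that correspondence, not a test from this PR; the helper name and the assumption that `tokenizer` exposes `special_tokens_for_pair()` are illustrative.

```python
# Special tokens in the pair formats asserted by the removed tests.
EXPECTED_PAIR_SPECIAL_TOKENS = {
    "bert-base-cased": 3,  # [CLS] A [SEP] B [SEP]
    "roberta-base": 4,     # <s> A </s> </s> B </s>
}


def check_pair_special_token_count(model_name: str, tokenizer) -> None:
    """Assumes `tokenizer` implements the special_tokens_for_pair() hook added above."""
    assert tokenizer.special_tokens_for_pair() == EXPECTED_PAIR_SPECIAL_TOKENS[model_name]
```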
Review thread on `sanitize_wordpiece()`:

- Reviewer: Changing this method might actually interact poorly with the demo. You'll be adding spaces in places where the demo isn't expecting them. Not sure how much of a difference this makes, though.
- Author: Huggingface have also changed their tack on this. At some point the argument was that 'Ġ' is just the encoding for a space, and the space is part of the token, so I made it this way. But in the latest huggingface, the space is no longer part of the token. I'm hoping for clarification from @mfuntowicz on the matter, but I guess if in doubt, I'll make this match the current behavior. I don't know if the demo is using `sanitize_wordpiece()`, though. It's not a commonly used call as far as I know.
- Reviewer: Oh, sorry, you're right. I thought `sanitize_wordpiece` was called by `sanitize` (which is used in predictors), but it's not.
- Author: I've changed this back.
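For concreteness on the 'Ġ' question: byte-level BPE tokenizers (GPT-2, RoBERTa) encode a leading space by prefixing the piece with 'Ġ', and the disagreement above is whether sanitizing such a piece should decode that marker back into a space or drop it. The helper below is a standalone illustration of the two behaviors under that assumption only; it is not the library's `sanitize_wordpiece` implementation.

```python
def strip_bpe_marker(piece: str, keep_leading_space: bool = False) -> str:
    """Illustrates the two behaviors discussed above for a piece like 'Ġgreat'."""
    if piece.startswith("Ġ"):
        rest = piece[1:]
        # Either treat 'Ġ' as the encoding of a space that belongs to the token,
        # or drop it on the view that the space is not part of the token.
        return " " + rest if keep_leading_space else rest
    if piece.startswith("##"):
        # BERT-style wordpieces mark continuations rather than leading spaces.
        return piece[2:]
    return piece


print(strip_bpe_marker("Ġgreat"))                           # -> "great"
print(strip_bpe_marker("Ġgreat", keep_leading_space=True))  # -> " great"
print(strip_bpe_marker("##ly"))                             # -> "ly"
```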