Skip to content

Commit

Permalink
warn about truncation only once (allenai#3052)
Browse files Browse the repository at this point in the history
  • Loading branch information
joelgrus authored and reiyw committed Nov 12, 2019
1 parent b8c794e commit 5e681a2
Showing 1 changed file with 10 additions and 2 deletions.
12 changes: 10 additions & 2 deletions allennlp/data/token_indexers/wordpiece_indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ def __init__(self,
self.use_starting_offsets = use_starting_offsets
self._do_lowercase = do_lowercase
self._truncate_long_sequences = truncate_long_sequences
self._warned_about_truncation = False

if never_lowercase is None:
# Use the defaults
Expand Down Expand Up @@ -123,6 +124,14 @@ def _add_encoding_to_vocabulary(self, vocabulary: Vocabulary) -> None:
vocabulary._token_to_index[self._namespace][word] = idx
vocabulary._index_to_token[self._namespace][idx] = word

def _warn_about_truncation(self, tokens: List[Token]) -> None:
    """
    Log a warning that the wordpiece sequence is being truncated, at most
    once per indexer instance.

    The first call emits the warning (including the text of every token in
    ``tokens``) and sets ``self._warned_about_truncation``; every later call
    on the same instance is a no-op so the logs are not flooded.

    Parameters
    ----------
    tokens : ``List[Token]``
        The tokens of the input that triggered truncation; their ``text``
        attributes are included in the log message.
    """
    if self._warned_about_truncation:
        return

    # Each fragment ends with a space: adjacent string literals are joined
    # verbatim, so without it the sentences run together in the log output.
    # The token texts are passed as a lazy %-argument rather than being
    # interpolated up front, so formatting only happens if the record is
    # actually emitted.
    logger.warning(
        "Too many wordpieces, truncating sequence. "
        "If you would like a sliding window, set `truncate_long_sequences` to False. "
        "The offending input was: %s. "
        "To avoid polluting your logs we will not warn about this again.",
        [token.text for token in tokens],
    )
    self._warned_about_truncation = True

@overrides
def tokens_to_indices(self,
tokens: List[Token],
Expand Down Expand Up @@ -207,8 +216,7 @@ def tokens_to_indices(self,
wordpiece_windows = [self._add_start_and_end(flat_wordpiece_ids)]
token_type_ids = self._extend(flat_token_type_ids)
elif self._truncate_long_sequences:
logger.warning("Too many wordpieces, truncating sequence. If you would like a sliding window, set"
"`truncate_long_sequences` to False %s", str([token.text for token in tokens]))
self._warn_about_truncation(tokens)
wordpiece_windows = [self._add_start_and_end(flat_wordpiece_ids[:pieces_accumulated])]
token_type_ids = self._extend(flat_token_type_ids[:pieces_accumulated])
else:
Expand Down

0 comments on commit 5e681a2

Please sign in to comment.