This repository has been archived by the owner on Dec 16, 2022. It is now read-only.

Commit

Clarify Vocabulary documentation, add -1 option for `min_pretrained_embeddings` (#5581)

* Clarify Vocabulary documentation, add -1 option for min_pretrained_embeddings

* revert imports changes

* Update CHANGELOG.md

Co-authored-by: Akshita Bhagia <akshita23bhagia@gmail.com>

epwalsh and AkshitaB authored Mar 4, 2022
1 parent 3fa5193 commit 99c9343
Showing 2 changed files with 28 additions and 10 deletions.
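For context, here is a minimal sketch of what the new option enables, assuming a hypothetical pretrained embedding file `glove.txt` and made-up token counts (neither is part of this commit):

```python
from allennlp.data import Vocabulary

# Made-up counts, as if collected from a small dataset.
counter = {"tokens": {"cat": 3, "dog": 1}}

vocab = Vocabulary(
    counter=counter,
    # Hypothetical path to a pretrained embedding file (one "word dim1 dim2 ..." per line).
    pretrained_files={"tokens": "glove.txt"},
    # New in this commit: -1 keeps every word listed in the file for this namespace,
    # instead of only the first N lines.
    min_pretrained_embeddings={"tokens": -1},
)
```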
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Updated dependencies, especially around doc creation.
- Running the test suite out-of-tree (e.g. after installation) is now possible by pointing the environment variable `ALLENNLP_SRC_DIR` to the sources.
- Silenced a warning that happens when you inappropriately clone a tensor.
- Adding more clarification to the `Vocabulary` documentation around `min_pretrained_embeddings` and `only_include_pretrained_words`.

### Added

@@ -20,6 +21,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Added a way to give JSON blobs as input to dataset readers in the `evaluate` command.
- Added the argument `sub_module` in `PretrainedTransformerMismatchedEmbedder`

### Changed

- You can automatically include all words from a pretrained file when building a vocabulary by setting the value in `min_pretrained_embeddings` to `-1`
for that particular namespace.


## [v2.9.0](/~https://github.com/allenai/allennlp/releases/tag/v2.9.0) - 2022-01-27

32 changes: 22 additions & 10 deletions allennlp/data/vocabulary.py
@@ -9,9 +9,10 @@
import os
import re
from collections import defaultdict
-from transformers import PreTrainedTokenizer
from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Union, TYPE_CHECKING

+from transformers import PreTrainedTokenizer
+
from allennlp.common import Registrable
from allennlp.common.file_utils import cached_path, FileLock
from allennlp.common.checks import ConfigurationError
@@ -193,17 +194,24 @@ class Vocabulary(Registrable):
in the Vocabulary.
min_pretrained_embeddings : `Dict[str, int]`, optional
-If provided, specifies for each namespace a minimum number of lines (typically the
+Specifies for each namespace a minimum number of lines (typically the
most common words) to keep from pretrained embedding files, even for words not
-appearing in the data.
+appearing in the data. By default the minimum number of lines to keep is 0.
+You can automatically include all lines for a namespace by setting the minimum number of lines
+to `-1`.
only_include_pretrained_words : `bool`, optional (default=`False`)
This defines the strategy for using any pretrained embedding files which may have been
-specified in `pretrained_files`. If False, an inclusive strategy is used: and words
-which are in the `counter` and in the pretrained file are added to the `Vocabulary`,
-regardless of whether their count exceeds `min_count` or not. If True, we use an
-exclusive strategy: words are only included in the Vocabulary if they are in the pretrained
-embedding file (their count must still be at least `min_count`).
+specified in `pretrained_files`.
+If `False`, we use an inclusive strategy and include both words in the `counter`
+that have a count of at least `min_count` and words from the pretrained file
+that are within the first `N` lines defined by `min_pretrained_embeddings`.
+If `True`, we use an exclusive strategy where words are only included in the `Vocabulary`
+if they are in the pretrained embedding file. Their count must also be at least `min_count`
+or they must be listed in the embedding file within the first `N` lines defined
+by `min_pretrained_embeddings`.
tokens_to_add : `Dict[str, List[str]]`, optional (default=`None`)
If given, this is a list of tokens to add to the vocabulary, keyed by the namespace to add
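As a side-by-side sketch of the two strategies described above (the namespace, counts, `min_count` values, and `glove.txt` path are illustrative assumptions, not part of this diff):

```python
from allennlp.data import Vocabulary

counter = {"tokens": {"common": 10, "rare": 1}}
pretrained = {"tokens": "glove.txt"}   # hypothetical embedding file
window = {"tokens": -1}                # -1: the whole file counts as "the first N lines"

# Inclusive strategy (default): words meeting `min_count` are kept, plus words
# from the pretrained file covered by `min_pretrained_embeddings`.
inclusive = Vocabulary(
    counter=counter,
    min_count={"tokens": 5},
    pretrained_files=pretrained,
    min_pretrained_embeddings=window,
    only_include_pretrained_words=False,
)

# Exclusive strategy: only words that appear in the pretrained file are kept, and
# they must also meet `min_count` or fall inside the `min_pretrained_embeddings` window.
exclusive = Vocabulary(
    counter=counter,
    min_count={"tokens": 5},
    pretrained_files=pretrained,
    min_pretrained_embeddings=window,
    only_include_pretrained_words=True,
)
```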
@@ -667,9 +675,13 @@ def _extend(
if namespace in pretrained_files:
pretrained_list = _read_pretrained_tokens(pretrained_files[namespace])
min_embeddings = min_pretrained_embeddings.get(namespace, 0)
-if min_embeddings > 0:
+if min_embeddings > 0 or min_embeddings == -1:
tokens_old = tokens_to_add.get(namespace, [])
-tokens_new = pretrained_list[:min_embeddings]
+tokens_new = (
+    pretrained_list
+    if min_embeddings == -1
+    else pretrained_list[:min_embeddings]
+)
tokens_to_add[namespace] = tokens_old + tokens_new
pretrained_set = set(pretrained_list)
token_counts = list(counter[namespace].items())
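A standalone illustration of the sentinel logic added in `_extend` above; the helper name `tokens_to_keep` and the word list are hypothetical and exist only to show how `-1` differs from a positive cutoff:

```python
def tokens_to_keep(pretrained_list, min_embeddings):
    # Mirrors the branch above: -1 keeps the whole list, a positive N keeps the
    # first N entries, and 0 (the default) keeps nothing.
    if min_embeddings > 0 or min_embeddings == -1:
        return pretrained_list if min_embeddings == -1 else pretrained_list[:min_embeddings]
    return []

words = ["the", "of", "and", "cat", "dog"]
assert tokens_to_keep(words, 2) == ["the", "of"]
assert tokens_to_keep(words, -1) == words
assert tokens_to_keep(words, 0) == []
```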
