Commit

Merge remote-tracking branch 'upstream/master' into streaming-glob
albertvillanova committed Oct 5, 2021
2 parents 2e58cf1 + 1696fe0 commit 9165e34
Showing 6 changed files with 33 additions and 14 deletions.
2 changes: 2 additions & 0 deletions .circleci/config.yml
@@ -47,6 +47,7 @@ jobs:
       - run: pip install virtualenv
       - run: python -m virtualenv venv --system-site-packages
       - run: "& venv/Scripts/activate.ps1"
+      - run: pip install 'huggingface_hub<0.0.18'
       - run: pip install .[tests]
       - run: pip install -r additional-tests-requirements.txt --no-deps
       - run: pip install pyarrow --upgrade
@@ -66,6 +67,7 @@ jobs:
       - run: pip install virtualenv
       - run: python -m virtualenv venv --system-site-packages
       - run: "& venv/Scripts/activate.ps1"
+      - run: pip install 'huggingface_hub<0.0.18'
       - run: pip install .[tests]
       - run: pip install -r additional-tests-requirements.txt --no-deps
       - run: pip install pyarrow==1.0.0
2 changes: 1 addition & 1 deletion datasets/competition_math/competition_math.py
@@ -82,7 +82,7 @@ def _split_generators(self, dl_manager):
 
     def _generate_examples(self, math_dir, split):
         """Yields examples as (key, example) tuples."""
-        filepaths = glob.glob(os.path.join(math_dir, split, "*/*"))
+        filepaths = glob.glob(os.path.join(math_dir, split, "*", "*"))
         for id_, filepath in enumerate(filepaths):
             with open(filepath, "rb") as fin:
                 example = json.load(fin)
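This glob change recurs throughout the commit: separator-embedded patterns such as `"*/*"` become separate `os.path.join` components. A minimal sketch of the difference, with hypothetical directory names; the motivation (letting the patched, URL-aware join/glob used for dataset streaming handle each component) is inferred from the `streaming-glob` branch this commit merges into, not stated in the diff:

```python
import glob
import os

math_dir, split = "MATH", "train"  # hypothetical values for illustration

# Before: the separator is hard-coded inside a single component.
old_pattern = os.path.join(math_dir, split, "*/*")
# After: os.path.join supplies the separator between components.
new_pattern = os.path.join(math_dir, split, "*", "*")

# On POSIX both expand to "MATH/train/*/*", so the same files match either way.
assert glob.glob(old_pattern) == glob.glob(new_pattern)
```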
4 changes: 2 additions & 2 deletions datasets/lj_speech/lj_speech.py
@@ -87,8 +87,8 @@ def _info(self):
 
     def _split_generators(self, dl_manager):
         root_path = dl_manager.download_and_extract(_DL_URL)
-        root_path = os.path.join(root_path, "LJSpeech-1.1/")
-        wav_path = os.path.join(root_path, "wavs/")
+        root_path = os.path.join(root_path, "LJSpeech-1.1")
+        wav_path = os.path.join(root_path, "wavs")
         csv_path = os.path.join(root_path, "metadata.csv")
 
         return [
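The lj_speech hunk is the same cleanup in another form: trailing separators are dropped from join components. A short illustration of why the trailing slash is redundant (POSIX behavior shown):

```python
import os

# A trailing slash survives the join and stays embedded in the result:
print(os.path.join("root", "wavs/"))           # root/wavs/
print(os.path.join("root", "wavs"))            # root/wavs

# Later joins behave identically either way, so the slash buys nothing:
print(os.path.join("root", "wavs/", "a.wav"))  # root/wavs/a.wav
print(os.path.join("root", "wavs", "a.wav"))   # root/wavs/a.wav
```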
16 changes: 8 additions & 8 deletions datasets/superb/superb.py
@@ -430,7 +430,7 @@ def _split_generators(self, dl_manager):
     def _generate_examples(self, archive_path, split=None):
         """Generate examples."""
         if self.config.name == "asr":
-            transcripts_glob = os.path.join(archive_path, "LibriSpeech", "*/*/*/*.txt")
+            transcripts_glob = os.path.join(archive_path, "LibriSpeech", "*", "*", "*", "*.txt")
             key = 0
             for transcript_path in sorted(glob.glob(transcripts_glob)):
                 transcript_dir_path = os.path.dirname(transcript_path)
@@ -462,8 +462,8 @@ def _generate_examples(self, archive_path, split=None):
                     label = "_unknown_"
                 yield key, {"file": audio_file, "label": label}
         elif self.config.name == "ic":
-            root_path = os.path.join(archive_path, "fluent_speech_commands_dataset/")
-            csv_path = os.path.join(root_path, f"data/{split}_data.csv")
+            root_path = os.path.join(archive_path, "fluent_speech_commands_dataset")
+            csv_path = os.path.join(root_path, "data", f"{split}_data.csv")
             with open(csv_path, encoding="utf-8") as csv_file:
                 csv_reader = csv.reader(csv_file, delimiter=",", skipinitialspace=True)
                 next(csv_reader)
@@ -478,7 +478,7 @@ def _generate_examples(self, archive_path, split=None):
                         "location": location,
                     }
         elif self.config.name == "si":
-            wav_path = os.path.join(archive_path, "wav/")
+            wav_path = os.path.join(archive_path, "wav")
             splits_path = os.path.join(archive_path, "veri_test_class.txt")
             with open(splits_path, "r", encoding="utf-8") as f:
                 for key, line in enumerate(f):
@@ -518,9 +518,9 @@ def _generate_examples(self, archive_path, split=None):
                 }
                 key += 1
         elif self.config.name == "er":
-            root_path = os.path.join(archive_path, f"Session{split}/")
-            wav_path = os.path.join(root_path, "sentences/wav/")
-            labels_path = os.path.join(root_path, "dialog/EmoEvaluation/*.txt")
+            root_path = os.path.join(archive_path, f"Session{split}")
+            wav_path = os.path.join(root_path, "sentences", "wav")
+            labels_path = os.path.join(root_path, "dialog", "EmoEvaluation", "*.txt")
             emotions = ["neu", "hap", "ang", "sad", "exc"]
             key = 0
             for labels_file in sorted(glob.glob(labels_path)):
@@ -653,7 +653,7 @@ def _get_speakers(rec, data, args):
 
 
 def _split_ks_files(archive_path, split):
-    audio_path = os.path.join(archive_path, "**/*.wav")
+    audio_path = os.path.join(archive_path, "**", "*.wav")
     audio_paths = glob.glob(audio_path)
     if split == "test":
         # use all available files for the test archive
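One detail worth noting about `_split_ks_files` above: Python's `glob` only treats `**` as "any number of directories" when `recursive=True` is passed; without it (as here, both before and after the change), `**` matches like a plain `*`. A sketch with a hypothetical archive path:

```python
import glob
import os

archive_path = "speech_commands"  # hypothetical

pattern = os.path.join(archive_path, "**", "*.wav")
one_level = glob.glob(pattern)                  # "**" acts like "*": exactly one level deep
any_depth = glob.glob(pattern, recursive=True)  # "**" matches zero or more directories
```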
14 changes: 11 additions & 3 deletions src/datasets/arrow_dataset.py
@@ -2347,7 +2347,7 @@ def init_buffer_and_writer():
         return self
 
     @transmit_format
-    @fingerprint_transform(inplace=False, ignore_kwargs=["load_from_cache_file", "cache_file_name"], version="2.0.0")
+    @fingerprint_transform(inplace=False, ignore_kwargs=["load_from_cache_file", "cache_file_name"], version="2.0.1")
     def filter(
         self,
         function: Optional[Callable] = None,
@@ -2413,7 +2413,9 @@ def filter(
             raise ValueError("Parameter `remove_columns` passed to .filter() is no longer supported.")
 
         indices = self.map(
-            function=partial(get_indices_from_mask_function, function, batched, with_indices, input_columns),
+            function=partial(
+                get_indices_from_mask_function, function, batched, with_indices, input_columns, self._indices
+            ),
             with_indices=True,
             features=Features({"indices": Value("uint64")}),
             batched=True,
@@ -3607,6 +3609,7 @@ def get_indices_from_mask_function(
     batched: bool,
     with_indices: bool,
     input_columns: Optional[Union[str, List[str]]],
+    indices_mapping: Optional[Table] = None,
     *args,
     **fn_kwargs,
 ):
@@ -3635,4 +3638,9 @@ def get_indices_from_mask_function(
             mask.append(
                 function(*input, indices[i], **fn_kwargs) if with_indices else function(*input, **fn_kwargs)
             )
-    return {"indices": [i for i, to_keep in zip(indices, mask) if to_keep]}
+    indices_array = [i for i, to_keep in zip(indices, mask) if to_keep]
+    if indices_mapping is not None:
+        indices_array = pa.array(indices_array, type=pa.uint64())
+        indices_array = indices_mapping.column(0).take(indices_array)
+        indices_array = indices_array.to_pylist()
+    return {"indices": indices_array}
9 changes: 9 additions & 0 deletions tests/test_arrow_dataset.py
@@ -1175,6 +1175,15 @@ def test_filter(self, in_memory):
         self.assertNotEqual(dset_filter_even_num._fingerprint, fingerprint)
         self.assertEqual(dset_filter_even_num.format["type"], "numpy")
 
+    def test_filter_with_indices_mapping(self, in_memory):
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            dset = Dataset.from_dict({"col": [0, 1, 2]})
+            with self._to(in_memory, tmp_dir, dset) as dset:
+                with dset.filter(lambda x: x["col"] > 0) as dset:
+                    self.assertListEqual(dset["col"], [1, 2])
+                    with dset.filter(lambda x: x["col"] < 2) as dset:
+                        self.assertListEqual(dset["col"], [1])
+
     def test_filter_fn_kwargs(self, in_memory):
         with tempfile.TemporaryDirectory() as tmp_dir:
             with Dataset.from_dict({"id": range(10)}) as dset:
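The new test pins down the user-visible behavior: two `filter` calls in a row must compose correctly. Stripped of the test harness, the same scenario reads as follows (a sketch using only what the assertions above establish):

```python
from datasets import Dataset

dset = Dataset.from_dict({"col": [0, 1, 2]})
dset = dset.filter(lambda x: x["col"] > 0)  # first mapping keeps rows 1 and 2
assert dset["col"] == [1, 2]
dset = dset.filter(lambda x: x["col"] < 2)  # must resolve through the first mapping
assert dset["col"] == [1]
```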

1 comment on commit 9165e34

@github-actions


PyArrow==3.0.0


Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
|---|---|
| read_batch_formatted_as_numpy after write_array2d | 0.011578 / 0.011353 (0.000225) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.004272 / 0.011008 (-0.006736) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.034853 / 0.038508 (-0.003655) |
| read_batch_unformated after write_array2d | 0.039971 / 0.023109 (0.016862) |
| read_batch_unformated after write_flattened_sequence | 0.357854 / 0.275898 (0.081956) |
| read_batch_unformated after write_nested_sequence | 0.402053 / 0.323480 (0.078573) |
| read_col_formatted_as_numpy after write_array2d | 0.008655 / 0.007986 (0.000670) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.003682 / 0.004328 (-0.000647) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.010521 / 0.004250 (0.006271) |
| read_col_unformated after write_array2d | 0.042605 / 0.037052 (0.005553) |
| read_col_unformated after write_flattened_sequence | 0.357159 / 0.258489 (0.098670) |
| read_col_unformated after write_nested_sequence | 0.406487 / 0.293841 (0.112646) |
| read_formatted_as_numpy after write_array2d | 0.034431 / 0.128546 (-0.094115) |
| read_formatted_as_numpy after write_flattened_sequence | 0.011715 / 0.075646 (-0.063931) |
| read_formatted_as_numpy after write_nested_sequence | 0.295531 / 0.419271 (-0.123741) |
| read_unformated after write_array2d | 0.055729 / 0.043533 (0.012196) |
| read_unformated after write_flattened_sequence | 0.359323 / 0.255139 (0.104184) |
| read_unformated after write_nested_sequence | 0.385596 / 0.283200 (0.102397) |
| write_array2d | 0.087508 / 0.141683 (-0.054174) |
| write_flattened_sequence | 1.914240 / 1.452155 (0.462085) |
| write_nested_sequence | 2.055963 / 1.492716 (0.563247) |

Benchmark: benchmark_getitem_100B.json

| metric | new / old (diff) |
|---|---|
| get_batch_of_1024_random_rows | 0.189841 / 0.018006 (0.171835) |
| get_batch_of_1024_rows | 0.483874 / 0.000490 (0.483384) |
| get_first_row | 0.005186 / 0.000200 (0.004986) |
| get_last_row | 0.000386 / 0.000054 (0.000332) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
|---|---|
| select | 0.039146 / 0.037411 (0.001735) |
| shard | 0.026729 / 0.014526 (0.012203) |
| shuffle | 0.031158 / 0.176557 (-0.145398) |
| sort | 0.137701 / 0.737135 (-0.599434) |
| train_test_split | 0.030589 / 0.296338 (-0.265749) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
|---|---|
| read 5000 | 0.526321 / 0.215209 (0.311112) |
| read 50000 | 5.286459 / 2.077655 (3.208804) |
| read_batch 50000 10 | 2.371531 / 1.504120 (0.867411) |
| read_batch 50000 100 | 1.933259 / 1.541195 (0.392065) |
| read_batch 50000 1000 | 2.087283 / 1.468490 (0.618793) |
| read_formatted numpy 5000 | 0.527069 / 4.584777 (-4.057708) |
| read_formatted pandas 5000 | 6.632254 / 3.745712 (2.886542) |
| read_formatted tensorflow 5000 | 1.506136 / 5.269862 (-3.763726) |
| read_formatted torch 5000 | 1.394321 / 4.565676 (-3.171356) |
| read_formatted_batch numpy 5000 10 | 0.062053 / 0.424275 (-0.362222) |
| read_formatted_batch numpy 5000 1000 | 0.005066 / 0.007607 (-0.002542) |
| shuffled read 5000 | 0.667826 / 0.226044 (0.441782) |
| shuffled read 50000 | 6.401471 / 2.268929 (4.132542) |
| shuffled read_batch 50000 10 | 2.785542 / 55.444624 (-52.659083) |
| shuffled read_batch 50000 100 | 2.218460 / 6.876477 (-4.658016) |
| shuffled read_batch 50000 1000 | 2.261105 / 2.142072 (0.119033) |
| shuffled read_formatted numpy 5000 | 0.691970 / 4.805227 (-4.113257) |
| shuffled read_formatted_batch numpy 5000 10 | 0.141727 / 6.500664 (-6.358937) |
| shuffled read_formatted_batch numpy 5000 1000 | 0.058805 / 0.075469 (-0.016664) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
|---|---|
| filter | 1.062367 / 1.841788 (-0.779421) |
| map fast-tokenizer batched | 14.066160 / 8.074308 (5.991852) |
| map identity | 35.465159 / 10.191392 (25.273767) |
| map identity batched | 0.867874 / 0.680424 (0.187450) |
| map no-op batched | 0.618877 / 0.534201 (0.084676) |
| map no-op batched numpy | 0.252723 / 0.579283 (-0.326560) |
| map no-op batched pandas | 0.628248 / 0.434364 (0.193884) |
| map no-op batched pytorch | 0.233694 / 0.540337 (-0.306644) |
| map no-op batched tensorflow | 0.240363 / 1.386936 (-1.146573) |
PyArrow==latest

Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
|---|---|
| read_batch_formatted_as_numpy after write_array2d | 0.010845 / 0.011353 (-0.000508) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.003746 / 0.011008 (-0.007262) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.033690 / 0.038508 (-0.004818) |
| read_batch_unformated after write_array2d | 0.038536 / 0.023109 (0.015427) |
| read_batch_unformated after write_flattened_sequence | 0.322909 / 0.275898 (0.047011) |
| read_batch_unformated after write_nested_sequence | 0.359335 / 0.323480 (0.035855) |
| read_col_formatted_as_numpy after write_array2d | 0.008886 / 0.007986 (0.000901) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.004253 / 0.004328 (-0.000075) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.009778 / 0.004250 (0.005528) |
| read_col_unformated after write_array2d | 0.040723 / 0.037052 (0.003670) |
| read_col_unformated after write_flattened_sequence | 0.324646 / 0.258489 (0.066157) |
| read_col_unformated after write_nested_sequence | 0.359111 / 0.293841 (0.065270) |
| read_formatted_as_numpy after write_array2d | 0.032839 / 0.128546 (-0.095707) |
| read_formatted_as_numpy after write_flattened_sequence | 0.011705 / 0.075646 (-0.063941) |
| read_formatted_as_numpy after write_nested_sequence | 0.283677 / 0.419271 (-0.135595) |
| read_unformated after write_array2d | 0.057549 / 0.043533 (0.014016) |
| read_unformated after write_flattened_sequence | 0.334261 / 0.255139 (0.079122) |
| read_unformated after write_nested_sequence | 0.359123 / 0.283200 (0.075923) |
| write_array2d | 0.088126 / 0.141683 (-0.053557) |
| write_flattened_sequence | 1.856537 / 1.452155 (0.404383) |
| write_nested_sequence | 1.985973 / 1.492716 (0.493256) |

Benchmark: benchmark_getitem_100B.json

| metric | new / old (diff) |
|---|---|
| get_batch_of_1024_random_rows | 0.352839 / 0.018006 (0.334833) |
| get_batch_of_1024_rows | 0.469885 / 0.000490 (0.469395) |
| get_first_row | 0.055074 / 0.000200 (0.054874) |
| get_last_row | 0.000575 / 0.000054 (0.000521) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
|---|---|
| select | 0.042365 / 0.037411 (0.004953) |
| shard | 0.023526 / 0.014526 (0.009000) |
| shuffle | 0.030876 / 0.176557 (-0.145681) |
| sort | 0.145093 / 0.737135 (-0.592042) |
| train_test_split | 0.031799 / 0.296338 (-0.264539) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
|---|---|
| read 5000 | 0.455603 / 0.215209 (0.240394) |
| read 50000 | 4.895204 / 2.077655 (2.817550) |
| read_batch 50000 10 | 2.220591 / 1.504120 (0.716471) |
| read_batch 50000 100 | 1.822209 / 1.541195 (0.281014) |
| read_batch 50000 1000 | 1.792963 / 1.468490 (0.324473) |
| read_formatted numpy 5000 | 0.540668 / 4.584777 (-4.044109) |
| read_formatted pandas 5000 | 6.636569 / 3.745712 (2.890857) |
| read_formatted tensorflow 5000 | 1.444671 / 5.269862 (-3.825191) |
| read_formatted torch 5000 | 1.397369 / 4.565676 (-3.168307) |
| read_formatted_batch numpy 5000 10 | 0.055666 / 0.424275 (-0.368609) |
| read_formatted_batch numpy 5000 1000 | 0.004790 / 0.007607 (-0.002817) |
| shuffled read 5000 | 0.585301 / 0.226044 (0.359256) |
| shuffled read 50000 | 5.976857 / 2.268929 (3.707928) |
| shuffled read_batch 50000 10 | 2.726912 / 55.444624 (-52.717713) |
| shuffled read_batch 50000 100 | 2.106685 / 6.876477 (-4.769792) |
| shuffled read_batch 50000 1000 | 2.019171 / 2.142072 (-0.122901) |
| shuffled read_formatted numpy 5000 | 0.665872 / 4.805227 (-4.139356) |
| shuffled read_formatted_batch numpy 5000 10 | 0.150677 / 6.500664 (-6.349987) |
| shuffled read_formatted_batch numpy 5000 1000 | 0.058352 / 0.075469 (-0.017117) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
|---|---|
| filter | 0.992487 / 1.841788 (-0.849300) |
| map fast-tokenizer batched | 13.859921 / 8.074308 (5.785613) |
| map identity | 32.360310 / 10.191392 (22.168918) |
| map identity batched | 0.814780 / 0.680424 (0.134356) |
| map no-op batched | 0.568217 / 0.534201 (0.034016) |
| map no-op batched numpy | 0.253702 / 0.579283 (-0.325581) |
| map no-op batched pandas | 0.665848 / 0.434364 (0.231484) |
| map no-op batched pytorch | 0.218897 / 0.540337 (-0.321441) |
| map no-op batched tensorflow | 0.237183 / 1.386936 (-1.149753) |

