From d255d287951be0fad5da7be17bcf83ce9006e3bd Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 4 Oct 2021 14:46:43 +0200 Subject: [PATCH 1/7] Fix Windows path in MATH dataset (#3014) --- datasets/competition_math/competition_math.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/competition_math/competition_math.py b/datasets/competition_math/competition_math.py index 16762c4424f..7400ef0441b 100644 --- a/datasets/competition_math/competition_math.py +++ b/datasets/competition_math/competition_math.py @@ -82,7 +82,7 @@ def _split_generators(self, dl_manager): def _generate_examples(self, math_dir, split): """Yields examples as (key, example) tuples.""" - filepaths = glob.glob(os.path.join(math_dir, split, "*/*")) + filepaths = glob.glob(os.path.join(math_dir, split, "*", "*")) for id_, filepath in enumerate(filepaths): with open(filepath, "rb") as fin: example = json.load(fin) From 3f87a9e47f14031c5d41bb628b02fd3ec8e79dee Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 4 Oct 2021 15:43:24 +0200 Subject: [PATCH 2/7] Fix Windows paths in SUPERB benchmark datasets (#3009) --- datasets/superb/superb.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/datasets/superb/superb.py b/datasets/superb/superb.py index 3dd3f5400ad..2af705ea116 100644 --- a/datasets/superb/superb.py +++ b/datasets/superb/superb.py @@ -430,7 +430,7 @@ def _split_generators(self, dl_manager): def _generate_examples(self, archive_path, split=None): """Generate examples.""" if self.config.name == "asr": - transcripts_glob = os.path.join(archive_path, "LibriSpeech", "*/*/*/*.txt") + transcripts_glob = os.path.join(archive_path, "LibriSpeech", "*", "*", "*", "*.txt") key = 0 for transcript_path in sorted(glob.glob(transcripts_glob)): transcript_dir_path = os.path.dirname(transcript_path) @@ -462,8 +462,8 @@ def _generate_examples(self, archive_path, split=None): label = "_unknown_" yield key, {"file": audio_file, "label": label} elif self.config.name == "ic": - root_path = os.path.join(archive_path, "fluent_speech_commands_dataset/") - csv_path = os.path.join(root_path, f"data/{split}_data.csv") + root_path = os.path.join(archive_path, "fluent_speech_commands_dataset") + csv_path = os.path.join(root_path, "data", f"{split}_data.csv") with open(csv_path, encoding="utf-8") as csv_file: csv_reader = csv.reader(csv_file, delimiter=",", skipinitialspace=True) next(csv_reader) @@ -478,7 +478,7 @@ def _generate_examples(self, archive_path, split=None): "location": location, } elif self.config.name == "si": - wav_path = os.path.join(archive_path, "wav/") + wav_path = os.path.join(archive_path, "wav") splits_path = os.path.join(archive_path, "veri_test_class.txt") with open(splits_path, "r", encoding="utf-8") as f: for key, line in enumerate(f): @@ -518,9 +518,9 @@ def _generate_examples(self, archive_path, split=None): } key += 1 elif self.config.name == "er": - root_path = os.path.join(archive_path, f"Session{split}/") - wav_path = os.path.join(root_path, "sentences/wav/") - labels_path = os.path.join(root_path, "dialog/EmoEvaluation/*.txt") + root_path = os.path.join(archive_path, f"Session{split}") + wav_path = os.path.join(root_path, "sentences", "wav") + labels_path = os.path.join(root_path, "dialog", "EmoEvaluation", "*.txt") emotions = ["neu", "hap", "ang", "sad", "exc"] key = 0 for labels_file in sorted(glob.glob(labels_path)): @@ -653,7 +653,7 @@ def _get_speakers(rec, data, args): def _split_ks_files(archive_path, split): - audio_path = os.path.join(archive_path, "**/*.wav") + audio_path = os.path.join(archive_path, "**", "*.wav") audio_paths = glob.glob(audio_path) if split == "test": # use all available files for the test archive From 592f1fd2141b88906cc7c75e0b742f6f5c48e605 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 4 Oct 2021 17:23:04 +0200 Subject: [PATCH 3/7] Fix Windows paths in LJ Speech dataset (#3016) --- datasets/lj_speech/lj_speech.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datasets/lj_speech/lj_speech.py b/datasets/lj_speech/lj_speech.py index d3171d1e52e..ccb536196b2 100644 --- a/datasets/lj_speech/lj_speech.py +++ b/datasets/lj_speech/lj_speech.py @@ -87,8 +87,8 @@ def _info(self): def _split_generators(self, dl_manager): root_path = dl_manager.download_and_extract(_DL_URL) - root_path = os.path.join(root_path, "LJSpeech-1.1/") - wav_path = os.path.join(root_path, "wavs/") + root_path = os.path.join(root_path, "LJSpeech-1.1") + wav_path = os.path.join(root_path, "wavs") csv_path = os.path.join(root_path, "metadata.csv") return [ From 390413fd8c09abf8607b949209cafa467c614ebc Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Tue, 5 Oct 2021 10:33:07 +0200 Subject: [PATCH 4/7] Fix filter leaking (#3019) * fix filter leaking * add test * use list instead of np.array for pyarrow 1 --- src/datasets/arrow_dataset.py | 14 +++++++++++--- tests/test_arrow_dataset.py | 9 +++++++++ 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 33970c6f087..0facf58ddb8 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -2347,7 +2347,7 @@ def init_buffer_and_writer(): return self @transmit_format - @fingerprint_transform(inplace=False, ignore_kwargs=["load_from_cache_file", "cache_file_name"], version="2.0.0") + @fingerprint_transform(inplace=False, ignore_kwargs=["load_from_cache_file", "cache_file_name"], version="2.0.1") def filter( self, function: Optional[Callable] = None, @@ -2413,7 +2413,9 @@ def filter( raise ValueError("Parameter `remove_columns` passed to .filter() is no longer supported.") indices = self.map( - function=partial(get_indices_from_mask_function, function, batched, with_indices, input_columns), + function=partial( + get_indices_from_mask_function, function, batched, with_indices, input_columns, self._indices + ), with_indices=True, features=Features({"indices": Value("uint64")}), batched=True, @@ -3607,6 +3609,7 @@ def get_indices_from_mask_function( batched: bool, with_indices: bool, input_columns: Optional[Union[str, List[str]]], + indices_mapping: Optional[Table] = None, *args, **fn_kwargs, ): @@ -3635,4 +3638,9 @@ def get_indices_from_mask_function( mask.append( function(*input, indices[i], **fn_kwargs) if with_indices else function(*input, **fn_kwargs) ) - return {"indices": [i for i, to_keep in zip(indices, mask) if to_keep]} + indices_array = [i for i, to_keep in zip(indices, mask) if to_keep] + if indices_mapping is not None: + indices_array = pa.array(indices_array, type=pa.uint64()) + indices_array = indices_mapping.column(0).take(indices_array) + indices_array = indices_array.to_pylist() + return {"indices": indices_array} diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py index b6f2f057e01..532d851681c 100644 --- a/tests/test_arrow_dataset.py +++ b/tests/test_arrow_dataset.py @@ -1175,6 +1175,15 @@ def test_filter(self, in_memory): self.assertNotEqual(dset_filter_even_num._fingerprint, fingerprint) self.assertEqual(dset_filter_even_num.format["type"], "numpy") + def test_filter_with_indices_mapping(self, in_memory): + with tempfile.TemporaryDirectory() as tmp_dir: + dset = Dataset.from_dict({"col": [0, 1, 2]}) + with self._to(in_memory, tmp_dir, dset) as dset: + with dset.filter(lambda x: x["col"] > 0) as dset: + self.assertListEqual(dset["col"], [1, 2]) + with dset.filter(lambda x: x["col"] < 2) as dset: + self.assertListEqual(dset["col"], [1]) + def test_filter_fn_kwargs(self, in_memory): with tempfile.TemporaryDirectory() as tmp_dir: with Dataset.from_dict({"id": range(10)}) as dset: From f59dd7fdd5e95921e9575597de239630335aa9df Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Tue, 5 Oct 2021 11:58:27 +0200 Subject: [PATCH 5/7] Fix Windows test suite (#3025) * Pin huggingface_hub version --- .circleci/config.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index 1593ca0adaf..7c62f05aa75 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -47,6 +47,7 @@ jobs: - run: pip install virtualenv - run: python -m virtualenv venv --system-site-packages - run: "& venv/Scripts/activate.ps1" + - run: pip install 'huggingface_hub<0.0.18' - run: pip install .[tests] - run: pip install -r additional-tests-requirements.txt --no-deps - run: pip install pyarrow --upgrade @@ -66,6 +67,7 @@ jobs: - run: pip install virtualenv - run: python -m virtualenv venv --system-site-packages - run: "& venv/Scripts/activate.ps1" + - run: pip install 'huggingface_hub<0.0.18' - run: pip install .[tests] - run: pip install -r additional-tests-requirements.txt --no-deps - run: pip install pyarrow==1.0.0 From 2aea217ef98975dc553bd4f5bfd44f77cbd5dc25 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Tue, 5 Oct 2021 12:48:01 +0200 Subject: [PATCH 6/7] properly install ruamel-yaml in the windows CI --- .circleci/config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 7c62f05aa75..ef36247feb6 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -47,7 +47,7 @@ jobs: - run: pip install virtualenv - run: python -m virtualenv venv --system-site-packages - run: "& venv/Scripts/activate.ps1" - - run: pip install 'huggingface_hub<0.0.18' + - run: pip install --ignore-installed ruamel-yaml - run: pip install .[tests] - run: pip install -r additional-tests-requirements.txt --no-deps - run: pip install pyarrow --upgrade @@ -67,7 +67,7 @@ jobs: - run: pip install virtualenv - run: python -m virtualenv venv --system-site-packages - run: "& venv/Scripts/activate.ps1" - - run: pip install 'huggingface_hub<0.0.18' + - run: pip install --ignore-installed ruamel-yaml - run: pip install .[tests] - run: pip install -r additional-tests-requirements.txt --no-deps - run: pip install pyarrow==1.0.0 From 1696fe0746320cb7c69f3dfc68b1564dee64f57c Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Tue, 5 Oct 2021 12:48:46 +0200 Subject: [PATCH 7/7] Revert "properly install ruamel-yaml in the windows CI" This reverts commit 2aea217ef98975dc553bd4f5bfd44f77cbd5dc25. --- .circleci/config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index ef36247feb6..7c62f05aa75 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -47,7 +47,7 @@ jobs: - run: pip install virtualenv - run: python -m virtualenv venv --system-site-packages - run: "& venv/Scripts/activate.ps1" - - run: pip install --ignore-installed ruamel-yaml + - run: pip install 'huggingface_hub<0.0.18' - run: pip install .[tests] - run: pip install -r additional-tests-requirements.txt --no-deps - run: pip install pyarrow --upgrade @@ -67,7 +67,7 @@ jobs: - run: pip install virtualenv - run: python -m virtualenv venv --system-site-packages - run: "& venv/Scripts/activate.ps1" - - run: pip install --ignore-installed ruamel-yaml + - run: pip install 'huggingface_hub<0.0.18' - run: pip install .[tests] - run: pip install -r additional-tests-requirements.txt --no-deps - run: pip install pyarrow==1.0.0