Commit

Merge remote-tracking branch 'upstream/master' into streaming-glob
albertvillanova committed Oct 5, 2021
2 parents 2e58cf1 + 1696fe0 commit 9165e34
Showing 6 changed files with 33 additions and 14 deletions.
2 changes: 2 additions & 0 deletions .circleci/config.yml
@@ -47,6 +47,7 @@ jobs:
       - run: pip install virtualenv
       - run: python -m virtualenv venv --system-site-packages
       - run: "& venv/Scripts/activate.ps1"
+      - run: pip install 'huggingface_hub<0.0.18'
       - run: pip install .[tests]
       - run: pip install -r additional-tests-requirements.txt --no-deps
       - run: pip install pyarrow --upgrade
@@ -66,6 +67,7 @@ jobs:
       - run: pip install virtualenv
       - run: python -m virtualenv venv --system-site-packages
       - run: "& venv/Scripts/activate.ps1"
+      - run: pip install 'huggingface_hub<0.0.18'
       - run: pip install .[tests]
       - run: pip install -r additional-tests-requirements.txt --no-deps
       - run: pip install pyarrow==1.0.0
2 changes: 1 addition & 1 deletion datasets/competition_math/competition_math.py
@@ -82,7 +82,7 @@ def _split_generators(self, dl_manager):
 
     def _generate_examples(self, math_dir, split):
         """Yields examples as (key, example) tuples."""
-        filepaths = glob.glob(os.path.join(math_dir, split, "*/*"))
+        filepaths = glob.glob(os.path.join(math_dir, split, "*", "*"))
         for id_, filepath in enumerate(filepaths):
             with open(filepath, "rb") as fin:
                 example = json.load(fin)
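This glob change recurs throughout the commit: separator-embedded patterns such as `"*/*"` become separate `os.path.join` components. A minimal sketch of the difference, with hypothetical directory names; the motivation (letting the patched, URL-aware join/glob used for dataset streaming handle each component) is inferred from the `streaming-glob` branch this commit merges into, not stated in the diff:

```python
import glob
import os

math_dir, split = "MATH", "train"  # hypothetical values for illustration

# Before: the separator is hard-coded inside a single component.
old_pattern = os.path.join(math_dir, split, "*/*")
# After: os.path.join supplies the separator between components.
new_pattern = os.path.join(math_dir, split, "*", "*")

# On POSIX both expand to "MATH/train/*/*", so the same files match either way.
assert glob.glob(old_pattern) == glob.glob(new_pattern)
```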
4 changes: 2 additions & 2 deletions datasets/lj_speech/lj_speech.py
@@ -87,8 +87,8 @@ def _info(self):
 
     def _split_generators(self, dl_manager):
         root_path = dl_manager.download_and_extract(_DL_URL)
-        root_path = os.path.join(root_path, "LJSpeech-1.1/")
-        wav_path = os.path.join(root_path, "wavs/")
+        root_path = os.path.join(root_path, "LJSpeech-1.1")
+        wav_path = os.path.join(root_path, "wavs")
         csv_path = os.path.join(root_path, "metadata.csv")
 
         return [
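The lj_speech hunk is the same cleanup in another form: trailing separators are dropped from join components. A short illustration of why the trailing slash is redundant (POSIX behavior shown):

```python
import os

# A trailing slash survives the join and stays embedded in the result:
print(os.path.join("root", "wavs/"))           # root/wavs/
print(os.path.join("root", "wavs"))            # root/wavs

# Later joins behave identically either way, so the slash buys nothing:
print(os.path.join("root", "wavs/", "a.wav"))  # root/wavs/a.wav
print(os.path.join("root", "wavs", "a.wav"))   # root/wavs/a.wav
```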
16 changes: 8 additions & 8 deletions datasets/superb/superb.py
@@ -430,7 +430,7 @@ def _split_generators(self, dl_manager):
     def _generate_examples(self, archive_path, split=None):
         """Generate examples."""
         if self.config.name == "asr":
-            transcripts_glob = os.path.join(archive_path, "LibriSpeech", "*/*/*/*.txt")
+            transcripts_glob = os.path.join(archive_path, "LibriSpeech", "*", "*", "*", "*.txt")
             key = 0
             for transcript_path in sorted(glob.glob(transcripts_glob)):
                 transcript_dir_path = os.path.dirname(transcript_path)
@@ -462,8 +462,8 @@ def _generate_examples(self, archive_path, split=None):
                     label = "_unknown_"
                 yield key, {"file": audio_file, "label": label}
         elif self.config.name == "ic":
-            root_path = os.path.join(archive_path, "fluent_speech_commands_dataset/")
-            csv_path = os.path.join(root_path, f"data/{split}_data.csv")
+            root_path = os.path.join(archive_path, "fluent_speech_commands_dataset")
+            csv_path = os.path.join(root_path, "data", f"{split}_data.csv")
             with open(csv_path, encoding="utf-8") as csv_file:
                 csv_reader = csv.reader(csv_file, delimiter=",", skipinitialspace=True)
                 next(csv_reader)
@@ -478,7 +478,7 @@ def _generate_examples(self, archive_path, split=None):
                         "location": location,
                     }
         elif self.config.name == "si":
-            wav_path = os.path.join(archive_path, "wav/")
+            wav_path = os.path.join(archive_path, "wav")
             splits_path = os.path.join(archive_path, "veri_test_class.txt")
             with open(splits_path, "r", encoding="utf-8") as f:
                 for key, line in enumerate(f):
@@ -518,9 +518,9 @@ def _generate_examples(self, archive_path, split=None):
                 }
                 key += 1
         elif self.config.name == "er":
-            root_path = os.path.join(archive_path, f"Session{split}/")
-            wav_path = os.path.join(root_path, "sentences/wav/")
-            labels_path = os.path.join(root_path, "dialog/EmoEvaluation/*.txt")
+            root_path = os.path.join(archive_path, f"Session{split}")
+            wav_path = os.path.join(root_path, "sentences", "wav")
+            labels_path = os.path.join(root_path, "dialog", "EmoEvaluation", "*.txt")
             emotions = ["neu", "hap", "ang", "sad", "exc"]
             key = 0
             for labels_file in sorted(glob.glob(labels_path)):
@@ -653,7 +653,7 @@ def _get_speakers(rec, data, args):
 
 
 def _split_ks_files(archive_path, split):
-    audio_path = os.path.join(archive_path, "**/*.wav")
+    audio_path = os.path.join(archive_path, "**", "*.wav")
     audio_paths = glob.glob(audio_path)
     if split == "test":
         # use all available files for the test archive
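One detail worth noting about `_split_ks_files` above: Python's `glob` only treats `**` as "any number of directories" when `recursive=True` is passed; without it (as here, both before and after the change), `**` matches like a plain `*`. A sketch with a hypothetical archive path:

```python
import glob
import os

archive_path = "speech_commands"  # hypothetical

pattern = os.path.join(archive_path, "**", "*.wav")
one_level = glob.glob(pattern)                  # "**" acts like "*": exactly one level deep
any_depth = glob.glob(pattern, recursive=True)  # "**" matches zero or more directories
```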
14 changes: 11 additions & 3 deletions src/datasets/arrow_dataset.py
@@ -2347,7 +2347,7 @@ def init_buffer_and_writer():
         return self
 
     @transmit_format
-    @fingerprint_transform(inplace=False, ignore_kwargs=["load_from_cache_file", "cache_file_name"], version="2.0.0")
+    @fingerprint_transform(inplace=False, ignore_kwargs=["load_from_cache_file", "cache_file_name"], version="2.0.1")
     def filter(
         self,
         function: Optional[Callable] = None,
@@ -2413,7 +2413,9 @@ def filter(
             raise ValueError("Parameter `remove_columns` passed to .filter() is no longer supported.")
 
         indices = self.map(
-            function=partial(get_indices_from_mask_function, function, batched, with_indices, input_columns),
+            function=partial(
+                get_indices_from_mask_function, function, batched, with_indices, input_columns, self._indices
+            ),
             with_indices=True,
             features=Features({"indices": Value("uint64")}),
             batched=True,
@@ -3607,6 +3609,7 @@ def get_indices_from_mask_function(
     batched: bool,
     with_indices: bool,
     input_columns: Optional[Union[str, List[str]]],
+    indices_mapping: Optional[Table] = None,
     *args,
     **fn_kwargs,
 ):
@@ -3635,4 +3638,9 @@ def get_indices_from_mask_function(
             mask.append(
                 function(*input, indices[i], **fn_kwargs) if with_indices else function(*input, **fn_kwargs)
             )
-    return {"indices": [i for i, to_keep in zip(indices, mask) if to_keep]}
+    indices_array = [i for i, to_keep in zip(indices, mask) if to_keep]
+    if indices_mapping is not None:
+        indices_array = pa.array(indices_array, type=pa.uint64())
+        indices_array = indices_mapping.column(0).take(indices_array)
+        indices_array = indices_array.to_pylist()
+    return {"indices": indices_array}
9 changes: 9 additions & 0 deletions tests/test_arrow_dataset.py
@@ -1175,6 +1175,15 @@ def test_filter(self, in_memory):
         self.assertNotEqual(dset_filter_even_num._fingerprint, fingerprint)
         self.assertEqual(dset_filter_even_num.format["type"], "numpy")
 
+    def test_filter_with_indices_mapping(self, in_memory):
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            dset = Dataset.from_dict({"col": [0, 1, 2]})
+            with self._to(in_memory, tmp_dir, dset) as dset:
+                with dset.filter(lambda x: x["col"] > 0) as dset:
+                    self.assertListEqual(dset["col"], [1, 2])
+                    with dset.filter(lambda x: x["col"] < 2) as dset:
+                        self.assertListEqual(dset["col"], [1])
+
     def test_filter_fn_kwargs(self, in_memory):
         with tempfile.TemporaryDirectory() as tmp_dir:
             with Dataset.from_dict({"id": range(10)}) as dset:
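The new test pins down the user-visible behavior: two `filter` calls in a row must compose correctly. Stripped of the test harness, the same scenario reads as follows (a sketch using only what the assertions above establish):

```python
from datasets import Dataset

dset = Dataset.from_dict({"col": [0, 1, 2]})
dset = dset.filter(lambda x: x["col"] > 0)  # first mapping keeps rows 1 and 2
assert dset["col"] == [1, 2]
dset = dset.filter(lambda x: x["col"] < 2)  # must resolve through the first mapping
assert dset["col"] == [1]
```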

1 comment on commit 9165e34

@github-actions


PyArrow==3.0.0


Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
|---|---|
| read_batch_formatted_as_numpy after write_array2d | 0.011578 / 0.011353 (0.000225) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.004272 / 0.011008 (-0.006736) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.034853 / 0.038508 (-0.003655) |
| read_batch_unformated after write_array2d | 0.039971 / 0.023109 (0.016862) |
| read_batch_unformated after write_flattened_sequence | 0.357854 / 0.275898 (0.081956) |
| read_batch_unformated after write_nested_sequence | 0.402053 / 0.323480 (0.078573) |
| read_col_formatted_as_numpy after write_array2d | 0.008655 / 0.007986 (0.000670) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.003682 / 0.004328 (-0.000647) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.010521 / 0.004250 (0.006271) |
| read_col_unformated after write_array2d | 0.042605 / 0.037052 (0.005553) |
| read_col_unformated after write_flattened_sequence | 0.357159 / 0.258489 (0.098670) |
| read_col_unformated after write_nested_sequence | 0.406487 / 0.293841 (0.112646) |
| read_formatted_as_numpy after write_array2d | 0.034431 / 0.128546 (-0.094115) |
| read_formatted_as_numpy after write_flattened_sequence | 0.011715 / 0.075646 (-0.063931) |
| read_formatted_as_numpy after write_nested_sequence | 0.295531 / 0.419271 (-0.123741) |
| read_unformated after write_array2d | 0.055729 / 0.043533 (0.012196) |
| read_unformated after write_flattened_sequence | 0.359323 / 0.255139 (0.104184) |
| read_unformated after write_nested_sequence | 0.385596 / 0.283200 (0.102397) |
| write_array2d | 0.087508 / 0.141683 (-0.054174) |
| write_flattened_sequence | 1.914240 / 1.452155 (0.462085) |
| write_nested_sequence | 2.055963 / 1.492716 (0.563247) |

Benchmark: benchmark_getitem_100B.json

| metric | new / old (diff) |
|---|---|
| get_batch_of_1024_random_rows | 0.189841 / 0.018006 (0.171835) |
| get_batch_of_1024_rows | 0.483874 / 0.000490 (0.483384) |
| get_first_row | 0.005186 / 0.000200 (0.004986) |
| get_last_row | 0.000386 / 0.000054 (0.000332) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
|---|---|
| select | 0.039146 / 0.037411 (0.001735) |
| shard | 0.026729 / 0.014526 (0.012203) |
| shuffle | 0.031158 / 0.176557 (-0.145398) |
| sort | 0.137701 / 0.737135 (-0.599434) |
| train_test_split | 0.030589 / 0.296338 (-0.265749) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
|---|---|
| read 5000 | 0.526321 / 0.215209 (0.311112) |
| read 50000 | 5.286459 / 2.077655 (3.208804) |
| read_batch 50000 10 | 2.371531 / 1.504120 (0.867411) |
| read_batch 50000 100 | 1.933259 / 1.541195 (0.392065) |
| read_batch 50000 1000 | 2.087283 / 1.468490 (0.618793) |
| read_formatted numpy 5000 | 0.527069 / 4.584777 (-4.057708) |
| read_formatted pandas 5000 | 6.632254 / 3.745712 (2.886542) |
| read_formatted tensorflow 5000 | 1.506136 / 5.269862 (-3.763726) |
| read_formatted torch 5000 | 1.394321 / 4.565676 (-3.171356) |
| read_formatted_batch numpy 5000 10 | 0.062053 / 0.424275 (-0.362222) |
| read_formatted_batch numpy 5000 1000 | 0.005066 / 0.007607 (-0.002542) |
| shuffled read 5000 | 0.667826 / 0.226044 (0.441782) |
| shuffled read 50000 | 6.401471 / 2.268929 (4.132542) |
| shuffled read_batch 50000 10 | 2.785542 / 55.444624 (-52.659083) |
| shuffled read_batch 50000 100 | 2.218460 / 6.876477 (-4.658016) |
| shuffled read_batch 50000 1000 | 2.261105 / 2.142072 (0.119033) |
| shuffled read_formatted numpy 5000 | 0.691970 / 4.805227 (-4.113257) |
| shuffled read_formatted_batch numpy 5000 10 | 0.141727 / 6.500664 (-6.358937) |
| shuffled read_formatted_batch numpy 5000 1000 | 0.058805 / 0.075469 (-0.016664) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
|---|---|
| filter | 1.062367 / 1.841788 (-0.779421) |
| map fast-tokenizer batched | 14.066160 / 8.074308 (5.991852) |
| map identity | 35.465159 / 10.191392 (25.273767) |
| map identity batched | 0.867874 / 0.680424 (0.187450) |
| map no-op batched | 0.618877 / 0.534201 (0.084676) |
| map no-op batched numpy | 0.252723 / 0.579283 (-0.326560) |
| map no-op batched pandas | 0.628248 / 0.434364 (0.193884) |
| map no-op batched pytorch | 0.233694 / 0.540337 (-0.306644) |
| map no-op batched tensorflow | 0.240363 / 1.386936 (-1.146573) |
PyArrow==latest

Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
|---|---|
| read_batch_formatted_as_numpy after write_array2d | 0.010845 / 0.011353 (-0.000508) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.003746 / 0.011008 (-0.007262) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.033690 / 0.038508 (-0.004818) |
| read_batch_unformated after write_array2d | 0.038536 / 0.023109 (0.015427) |
| read_batch_unformated after write_flattened_sequence | 0.322909 / 0.275898 (0.047011) |
| read_batch_unformated after write_nested_sequence | 0.359335 / 0.323480 (0.035855) |
| read_col_formatted_as_numpy after write_array2d | 0.008886 / 0.007986 (0.000901) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.004253 / 0.004328 (-0.000075) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.009778 / 0.004250 (0.005528) |
| read_col_unformated after write_array2d | 0.040723 / 0.037052 (0.003670) |
| read_col_unformated after write_flattened_sequence | 0.324646 / 0.258489 (0.066157) |
| read_col_unformated after write_nested_sequence | 0.359111 / 0.293841 (0.065270) |
| read_formatted_as_numpy after write_array2d | 0.032839 / 0.128546 (-0.095707) |
| read_formatted_as_numpy after write_flattened_sequence | 0.011705 / 0.075646 (-0.063941) |
| read_formatted_as_numpy after write_nested_sequence | 0.283677 / 0.419271 (-0.135595) |
| read_unformated after write_array2d | 0.057549 / 0.043533 (0.014016) |
| read_unformated after write_flattened_sequence | 0.334261 / 0.255139 (0.079122) |
| read_unformated after write_nested_sequence | 0.359123 / 0.283200 (0.075923) |
| write_array2d | 0.088126 / 0.141683 (-0.053557) |
| write_flattened_sequence | 1.856537 / 1.452155 (0.404383) |
| write_nested_sequence | 1.985973 / 1.492716 (0.493256) |

Benchmark: benchmark_getitem_100B.json

| metric | new / old (diff) |
|---|---|
| get_batch_of_1024_random_rows | 0.352839 / 0.018006 (0.334833) |
| get_batch_of_1024_rows | 0.469885 / 0.000490 (0.469395) |
| get_first_row | 0.055074 / 0.000200 (0.054874) |
| get_last_row | 0.000575 / 0.000054 (0.000521) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
|---|---|
| select | 0.042365 / 0.037411 (0.004953) |
| shard | 0.023526 / 0.014526 (0.009000) |
| shuffle | 0.030876 / 0.176557 (-0.145681) |
| sort | 0.145093 / 0.737135 (-0.592042) |
| train_test_split | 0.031799 / 0.296338 (-0.264539) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
|---|---|
| read 5000 | 0.455603 / 0.215209 (0.240394) |
| read 50000 | 4.895204 / 2.077655 (2.817550) |
| read_batch 50000 10 | 2.220591 / 1.504120 (0.716471) |
| read_batch 50000 100 | 1.822209 / 1.541195 (0.281014) |
| read_batch 50000 1000 | 1.792963 / 1.468490 (0.324473) |
| read_formatted numpy 5000 | 0.540668 / 4.584777 (-4.044109) |
| read_formatted pandas 5000 | 6.636569 / 3.745712 (2.890857) |
| read_formatted tensorflow 5000 | 1.444671 / 5.269862 (-3.825191) |
| read_formatted torch 5000 | 1.397369 / 4.565676 (-3.168307) |
| read_formatted_batch numpy 5000 10 | 0.055666 / 0.424275 (-0.368609) |
| read_formatted_batch numpy 5000 1000 | 0.004790 / 0.007607 (-0.002817) |
| shuffled read 5000 | 0.585301 / 0.226044 (0.359256) |
| shuffled read 50000 | 5.976857 / 2.268929 (3.707928) |
| shuffled read_batch 50000 10 | 2.726912 / 55.444624 (-52.717713) |
| shuffled read_batch 50000 100 | 2.106685 / 6.876477 (-4.769792) |
| shuffled read_batch 50000 1000 | 2.019171 / 2.142072 (-0.122901) |
| shuffled read_formatted numpy 5000 | 0.665872 / 4.805227 (-4.139356) |
| shuffled read_formatted_batch numpy 5000 10 | 0.150677 / 6.500664 (-6.349987) |
| shuffled read_formatted_batch numpy 5000 1000 | 0.058352 / 0.075469 (-0.017117) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
|---|---|
| filter | 0.992487 / 1.841788 (-0.849300) |
| map fast-tokenizer batched | 13.859921 / 8.074308 (5.785613) |
| map identity | 32.360310 / 10.191392 (22.168918) |
| map identity batched | 0.814780 / 0.680424 (0.134356) |
| map no-op batched | 0.568217 / 0.534201 (0.034016) |
| map no-op batched numpy | 0.253702 / 0.579283 (-0.325581) |
| map no-op batched pandas | 0.665848 / 0.434364 (0.231484) |
| map no-op batched pytorch | 0.218897 / 0.540337 (-0.321441) |
| map no-op batched tensorflow | 0.237183 / 1.386936 (-1.149753) |

