Skip to content

Commit

Permalink
[Datasets] Improve Covost 2 (#3281)
Browse files Browse the repository at this point in the history
* [Datasets] Improve Covost 2

* up

* Delete validated.tsv

* Delete covost_v2.en_de.tsv

* finish

Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>
  • Loading branch information
patrickvonplaten and lhoestq authored Nov 18, 2021
1 parent f135035 commit e598a00
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 15 deletions.
25 changes: 10 additions & 15 deletions datasets/covost2/covost2.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,10 +87,7 @@ class Covost2(datasets.GeneratorBasedBuilder):

@property
def manual_download_instructions(self):
return """\
You should download the Common Voice v4 dataset from https://commonvoice.mozilla.org/en/datasets.
and unpack it to a path `{COVOST_ROOT}/{SOURCE_LANG_ID}` and then pass the `{COVOST_ROOT}` path as `data_dir`
via `datasets.load_dataset('covost2', data_dir="path/to/covost_root")`
return f"""Please download the Common Voice Corpus 4 in {self.config.name.split('_')[0]} from https://commonvoice.mozilla.org/en/datasets and unpack it with `tar xvzf {self.config.name.split('_')[0]}.tar`. Make sure to pass the path to the directory in which you unpacked the downloaded file as `data_dir`: `datasets.load_dataset('covost2', data_dir="path/to/dir")`
"""

def _info(self):
Expand All @@ -111,29 +108,27 @@ def _info(self):

def _split_generators(self, dl_manager):
data_root = os.path.abspath(os.path.expanduser(dl_manager.manual_dir))

source_lang, target_lang = self.config.name.split("_")
source_path = os.path.join(data_root, source_lang)

if not os.path.exists(source_path):
if not os.path.exists(data_root):
raise FileNotFoundError(
"{} does not exist. Make sure you insert a manual dir via "
"`datasets.load_dataset('covost2', data_dir=...)` that includes files uncompressed files from the "
"COVOST2 archive. Manual download instructions: {}".format(
data_root, self.manual_download_instructions
)
f"You are trying to load the {self.config.name} speech translation dataset."
f"It is required that you manually download the input speech data {source_lang}."
"Manual download instructions: {}".format(data_root, self.manual_download_instructions)
)

covost_url = COVOST_URL_TEMPLATE.format(src_lang=source_lang, tgt_lang=target_lang)
extracted_path = dl_manager.download_and_extract(covost_url)

covost_tsv_path = os.path.join(extracted_path, f"covost_v2.{source_lang}_{target_lang}.tsv")
cv_tsv_path = os.path.join(source_path, "validated.tsv")
cv_tsv_path = os.path.join(data_root, "validated.tsv")

return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={
"source_path": source_path,
"source_path": data_root,
"covost_tsv_path": covost_tsv_path,
"cv_tsv_path": cv_tsv_path,
"split": "train",
Expand All @@ -142,7 +137,7 @@ def _split_generators(self, dl_manager):
datasets.SplitGenerator(
name=datasets.Split.VALIDATION,
gen_kwargs={
"source_path": source_path,
"source_path": data_root,
"covost_tsv_path": covost_tsv_path,
"cv_tsv_path": cv_tsv_path,
"split": "dev",
Expand All @@ -151,7 +146,7 @@ def _split_generators(self, dl_manager):
datasets.SplitGenerator(
name=datasets.Split.TEST,
gen_kwargs={
"source_path": source_path,
"source_path": data_root,
"covost_tsv_path": covost_tsv_path,
"cv_tsv_path": cv_tsv_path,
"split": "test",
Expand Down
Binary file modified datasets/covost2/dummy/en_de/1.0.0/dummy_data.zip
Binary file not shown.

1 comment on commit e598a00

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Show benchmarks

PyArrow==3.0.0

Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.065278 / 0.011353 (0.053925) 0.004038 / 0.011008 (-0.006970) 0.034833 / 0.038508 (-0.003676) 0.032319 / 0.023109 (0.009209) 0.265058 / 0.275898 (-0.010840) 0.300239 / 0.323480 (-0.023241) 0.080346 / 0.007986 (0.072360) 0.004314 / 0.004328 (-0.000015) 0.008419 / 0.004250 (0.004168) 0.044837 / 0.037052 (0.007785) 0.255386 / 0.258489 (-0.003103) 0.297715 / 0.293841 (0.003874) 0.076295 / 0.128546 (-0.052251) 0.008237 / 0.075646 (-0.067410) 0.224375 / 0.419271 (-0.194897) 0.042928 / 0.043533 (-0.000605) 0.260656 / 0.255139 (0.005517) 0.282527 / 0.283200 (-0.000673) 0.085270 / 0.141683 (-0.056413) 1.518156 / 1.452155 (0.066001) 1.562534 / 1.492716 (0.069817)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.322517 / 0.018006 (0.304511) 0.578197 / 0.000490 (0.577707) 0.013013 / 0.000200 (0.012813) 0.000146 / 0.000054 (0.000092)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.032388 / 0.037411 (-0.005023) 0.020669 / 0.014526 (0.006143) 0.029983 / 0.176557 (-0.146573) 0.180949 / 0.737135 (-0.556187) 0.030879 / 0.296338 (-0.265459)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.376171 / 0.215209 (0.160962) 3.770083 / 2.077655 (1.692428) 1.598015 / 1.504120 (0.093895) 1.412005 / 1.541195 (-0.129190) 1.531171 / 1.468490 (0.062681) 0.372427 / 4.584777 (-4.212350) 4.501772 / 3.745712 (0.756060) 2.097547 / 5.269862 (-3.172315) 0.894762 / 4.565676 (-3.670915) 0.044746 / 0.424275 (-0.379529) 0.009879 / 0.007607 (0.002272) 0.469836 / 0.226044 (0.243792) 4.694716 / 2.268929 (2.425787) 2.013142 / 55.444624 (-53.431482) 1.704753 / 6.876477 (-5.171724) 1.826298 / 2.142072 (-0.315774) 0.480897 / 4.805227 (-4.324330) 0.102850 / 6.500664 (-6.397814) 0.050738 / 0.075469 (-0.024731)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.343852 / 1.841788 (-0.497936) 12.227799 / 8.074308 (4.153491) 26.517040 / 10.191392 (16.325648) 0.684987 / 0.680424 (0.004564) 0.512427 / 0.534201 (-0.021773) 0.375917 / 0.579283 (-0.203366) 0.506140 / 0.434364 (0.071776) 0.260622 / 0.540337 (-0.279715) 0.271659 / 1.386936 (-1.115277)
PyArrow==latest
Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.074664 / 0.011353 (0.063311) 0.004223 / 0.011008 (-0.006785) 0.030222 / 0.038508 (-0.008287) 0.038490 / 0.023109 (0.015381) 0.298248 / 0.275898 (0.022350) 0.339407 / 0.323480 (0.015927) 0.096075 / 0.007986 (0.088089) 0.004577 / 0.004328 (0.000249) 0.007574 / 0.004250 (0.003324) 0.041745 / 0.037052 (0.004693) 0.291400 / 0.258489 (0.032911) 0.339974 / 0.293841 (0.046133) 0.085729 / 0.128546 (-0.042817) 0.009310 / 0.075646 (-0.066336) 0.252143 / 0.419271 (-0.167128) 0.045974 / 0.043533 (0.002441) 0.293310 / 0.255139 (0.038171) 0.318686 / 0.283200 (0.035487) 0.088607 / 0.141683 (-0.053076) 1.747519 / 1.452155 (0.295364) 1.793852 / 1.492716 (0.301136)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.349175 / 0.018006 (0.331169) 0.538095 / 0.000490 (0.537605) 0.000969 / 0.000200 (0.000770) 0.000078 / 0.000054 (0.000023)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.033880 / 0.037411 (-0.003531) 0.021966 / 0.014526 (0.007440) 0.030804 / 0.176557 (-0.145753) 0.199655 / 0.737135 (-0.537480) 0.032379 / 0.296338 (-0.263959)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.374326 / 0.215209 (0.159117) 3.760234 / 2.077655 (1.682580) 1.592283 / 1.504120 (0.088163) 1.403541 / 1.541195 (-0.137654) 1.502685 / 1.468490 (0.034195) 0.369169 / 4.584777 (-4.215608) 4.541471 / 3.745712 (0.795759) 3.717366 / 5.269862 (-1.552496) 0.789772 / 4.565676 (-3.775905) 0.043976 / 0.424275 (-0.380299) 0.010095 / 0.007607 (0.002488) 0.474290 / 0.226044 (0.248246) 4.727542 / 2.268929 (2.458613) 2.025576 / 55.444624 (-53.419048) 1.690975 / 6.876477 (-5.185501) 1.841542 / 2.142072 (-0.300530) 0.472103 / 4.805227 (-4.333124) 0.102109 / 6.500664 (-6.398555) 0.050581 / 0.075469 (-0.024888)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.395449 / 1.841788 (-0.446339) 12.907385 / 8.074308 (4.833077) 26.496491 / 10.191392 (16.305099) 0.748810 / 0.680424 (0.068386) 0.552919 / 0.534201 (0.018718) 0.373932 / 0.579283 (-0.205351) 0.489458 / 0.434364 (0.055094) 0.247025 / 0.540337 (-0.293313) 0.277778 / 1.386936 (-1.109158)

CML watermark

Please sign in to comment.