Remove prepare_module in run_beam
albertvillanova committed Oct 27, 2021
1 parent 24b9358 commit 8086798
Showing 1 changed file with 8 additions and 12 deletions.
20 changes: 8 additions & 12 deletions src/datasets/commands/run_beam.py
@@ -7,7 +7,7 @@
 from datasets import config
 from datasets.builder import DatasetBuilder
 from datasets.commands import BaseDatasetsCLICommand
-from datasets.load import import_main_class, prepare_module
+from datasets.load import dataset_module_factory, import_main_class
 from datasets.utils.download_manager import DownloadConfig, GenerateMode
@@ -86,12 +86,8 @@ def run(self):
             print("Both parameters `name` and `all_configs` can't be used at once.")
             exit(1)
         path, name = self._dataset, self._name
-        module_path, hash, base_path, namespace = prepare_module(
-            path,
-            return_associated_base_path=True,
-            return_namespace=True,
-        )
-        builder_cls = import_main_class(module_path)
+        dataset_module = dataset_module_factory(path)
+        builder_cls = import_main_class(dataset_module.module_path)
         builders: List[DatasetBuilder] = []
         if self._beam_pipeline_options:
             beam_options = beam.options.pipeline_options.PipelineOptions(
@@ -105,11 +101,11 @@ def run(self):
                     builder_cls(
                         name=builder_config.name,
                         data_dir=self._data_dir,
-                        hash=hash,
+                        hash=dataset_module.builder_kwargs.get("hash"),
                         beam_options=beam_options,
                         cache_dir=self._cache_dir,
-                        base_path=base_path,
-                        namespace=namespace,
+                        base_path=dataset_module.builder_kwargs.get("base_path"),
+                        namespace=dataset_module.builder_kwargs.get("namespace"),
                     )
                 )
         else:
@@ -119,8 +115,8 @@ def run(self):
                     data_dir=self._data_dir,
                     beam_options=beam_options,
                     cache_dir=self._cache_dir,
-                    base_path=base_path,
-                    namespace=namespace,
+                    base_path=dataset_module.builder_kwargs.get("base_path"),
+                    namespace=dataset_module.builder_kwargs.get("namespace"),
                 )
             )
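For context, here is a minimal sketch of how the new loading path fits together, based only on the calls visible in this diff. The dataset path and config name below are hypothetical placeholders, and the exact keys available in builder_kwargs depend on what the loader resolves for a given dataset.

```python
from datasets.load import dataset_module_factory, import_main_class

path = "wikipedia"  # hypothetical dataset script; any Beam-based dataset would do

# One factory call replaces the old prepare_module(...) tuple unpacking:
# the returned dataset module bundles the import path and the builder kwargs.
dataset_module = dataset_module_factory(path)

# The builder class is still resolved from the module path.
builder_cls = import_main_class(dataset_module.module_path)

# hash, base_path and namespace now travel inside builder_kwargs
# instead of being returned as separate values.
builder = builder_cls(
    name="20200501.en",  # hypothetical config name
    hash=dataset_module.builder_kwargs.get("hash"),
    base_path=dataset_module.builder_kwargs.get("base_path"),
    namespace=dataset_module.builder_kwargs.get("namespace"),
)
```

Compared with the removed prepare_module call, the factory returns a single object, so callers such as this command no longer need to keep positional tuple unpacking (and flags like return_namespace=True) in sync with the loader's return signature.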

1 comment on commit 8086798

@github-actions commented:

PyArrow==3.0.0

Benchmark: benchmark_array_xd.json

metric | new / old (diff)
read_batch_formatted_as_numpy after write_array2d | 0.009280 / 0.011353 (-0.002072)
read_batch_formatted_as_numpy after write_flattened_sequence | 0.003794 / 0.011008 (-0.007214)
read_batch_formatted_as_numpy after write_nested_sequence | 0.031701 / 0.038508 (-0.006807)
read_batch_unformated after write_array2d | 0.035378 / 0.023109 (0.012269)
read_batch_unformated after write_flattened_sequence | 0.340402 / 0.275898 (0.064504)
read_batch_unformated after write_nested_sequence | 0.444388 / 0.323480 (0.120908)
read_col_formatted_as_numpy after write_array2d | 0.007968 / 0.007986 (-0.000018)
read_col_formatted_as_numpy after write_flattened_sequence | 0.004730 / 0.004328 (0.000401)
read_col_formatted_as_numpy after write_nested_sequence | 0.009051 / 0.004250 (0.004800)
read_col_unformated after write_array2d | 0.038195 / 0.037052 (0.001143)
read_col_unformated after write_flattened_sequence | 0.342522 / 0.258489 (0.084033)
read_col_unformated after write_nested_sequence | 0.374268 / 0.293841 (0.080428)
read_formatted_as_numpy after write_array2d | 0.024269 / 0.128546 (-0.104277)
read_formatted_as_numpy after write_flattened_sequence | 0.008796 / 0.075646 (-0.066851)
read_formatted_as_numpy after write_nested_sequence | 0.257476 / 0.419271 (-0.161796)
read_unformated after write_array2d | 0.046771 / 0.043533 (0.003238)
read_unformated after write_flattened_sequence | 0.338736 / 0.255139 (0.083597)
read_unformated after write_nested_sequence | 0.365468 / 0.283200 (0.082269)
write_array2d | 0.086907 / 0.141683 (-0.054776)
write_flattened_sequence | 1.736006 / 1.452155 (0.283851)
write_nested_sequence | 1.805196 / 1.492716 (0.312480)

Benchmark: benchmark_getitem_100B.json

metric | new / old (diff)
get_batch_of_1024_random_rows | 0.210996 / 0.018006 (0.192990)
get_batch_of_1024_rows | 0.436386 / 0.000490 (0.435896)
get_first_row | 0.017610 / 0.000200 (0.017410)
get_last_row | 0.000259 / 0.000054 (0.000205)

Benchmark: benchmark_indices_mapping.json

metric | new / old (diff)
select | 0.036173 / 0.037411 (-0.001238)
shard | 0.022295 / 0.014526 (0.007769)
shuffle | 0.027417 / 0.176557 (-0.149140)
sort | 0.195052 / 0.737135 (-0.542083)
train_test_split | 0.029476 / 0.296338 (-0.266863)

Benchmark: benchmark_iterating.json

metric | new / old (diff)
read 5000 | 0.417751 / 0.215209 (0.202542)
read 50000 | 4.167381 / 2.077655 (2.089727)
read_batch 50000 10 | 1.812533 / 1.504120 (0.308413)
read_batch 50000 100 | 1.594696 / 1.541195 (0.053502)
read_batch 50000 1000 | 1.629070 / 1.468490 (0.160580)
read_formatted numpy 5000 | 0.410937 / 4.584777 (-4.173839)
read_formatted pandas 5000 | 4.700840 / 3.745712 (0.955128)
read_formatted tensorflow 5000 | 0.928386 / 5.269862 (-4.341476)
read_formatted torch 5000 | 0.852282 / 4.565676 (-3.713394)
read_formatted_batch numpy 5000 10 | 0.050524 / 0.424275 (-0.373751)
read_formatted_batch numpy 5000 1000 | 0.010663 / 0.007607 (0.003056)
shuffled read 5000 | 0.523580 / 0.226044 (0.297535)
shuffled read 50000 | 5.218963 / 2.268929 (2.950035)
shuffled read_batch 50000 10 | 2.245328 / 55.444624 (-53.199296)
shuffled read_batch 50000 100 | 1.877551 / 6.876477 (-4.998925)
shuffled read_batch 50000 1000 | 1.877613 / 2.142072 (-0.264460)
shuffled read_formatted numpy 5000 | 0.519114 / 4.805227 (-4.286113)
shuffled read_formatted_batch numpy 5000 10 | 0.113727 / 6.500664 (-6.386937)
shuffled read_formatted_batch numpy 5000 1000 | 0.056704 / 0.075469 (-0.018765)

Benchmark: benchmark_map_filter.json

metric | new / old (diff)
filter | 1.537055 / 1.841788 (-0.304732)
map fast-tokenizer batched | 12.374858 / 8.074308 (4.300550)
map identity | 27.060998 / 10.191392 (16.869606)
map identity batched | 0.788729 / 0.680424 (0.108305)
map no-op batched | 0.513660 / 0.534201 (-0.020541)
map no-op batched numpy | 0.366331 / 0.579283 (-0.212952)
map no-op batched pandas | 0.498560 / 0.434364 (0.064196)
map no-op batched pytorch | 0.253932 / 0.540337 (-0.286405)
map no-op batched tensorflow | 0.260805 / 1.386936 (-1.126131)
PyArrow==latest

Benchmark: benchmark_array_xd.json

metric | new / old (diff)
read_batch_formatted_as_numpy after write_array2d | 0.007384 / 0.011353 (-0.003969)
read_batch_formatted_as_numpy after write_flattened_sequence | 0.003774 / 0.011008 (-0.007234)
read_batch_formatted_as_numpy after write_nested_sequence | 0.029936 / 0.038508 (-0.008572)
read_batch_unformated after write_array2d | 0.033320 / 0.023109 (0.010211)
read_batch_unformated after write_flattened_sequence | 0.292335 / 0.275898 (0.016437)
read_batch_unformated after write_nested_sequence | 0.326780 / 0.323480 (0.003300)
read_col_formatted_as_numpy after write_array2d | 0.006158 / 0.007986 (-0.001828)
read_col_formatted_as_numpy after write_flattened_sequence | 0.004675 / 0.004328 (0.000347)
read_col_formatted_as_numpy after write_nested_sequence | 0.007267 / 0.004250 (0.003016)
read_col_unformated after write_array2d | 0.039872 / 0.037052 (0.002820)
read_col_unformated after write_flattened_sequence | 0.289162 / 0.258489 (0.030673)
read_col_unformated after write_nested_sequence | 0.330591 / 0.293841 (0.036750)
read_formatted_as_numpy after write_array2d | 0.023288 / 0.128546 (-0.105259)
read_formatted_as_numpy after write_flattened_sequence | 0.008749 / 0.075646 (-0.066897)
read_formatted_as_numpy after write_nested_sequence | 0.254259 / 0.419271 (-0.165013)
read_unformated after write_array2d | 0.045378 / 0.043533 (0.001846)
read_unformated after write_flattened_sequence | 0.292307 / 0.255139 (0.037168)
read_unformated after write_nested_sequence | 0.316891 / 0.283200 (0.033692)
write_array2d | 0.077998 / 0.141683 (-0.063685)
write_flattened_sequence | 1.639825 / 1.452155 (0.187670)
write_nested_sequence | 1.783220 / 1.492716 (0.290504)

Benchmark: benchmark_getitem_100B.json

metric | new / old (diff)
get_batch_of_1024_random_rows | 0.333446 / 0.018006 (0.315440)
get_batch_of_1024_rows | 0.443055 / 0.000490 (0.442565)
get_first_row | 0.075416 / 0.000200 (0.075216)
get_last_row | 0.000682 / 0.000054 (0.000628)

Benchmark: benchmark_indices_mapping.json

metric | new / old (diff)
select | 0.034322 / 0.037411 (-0.003090)
shard | 0.022042 / 0.014526 (0.007516)
shuffle | 0.026106 / 0.176557 (-0.150450)
sort | 0.198442 / 0.737135 (-0.538693)
train_test_split | 0.026707 / 0.296338 (-0.269632)

Benchmark: benchmark_iterating.json

metric | new / old (diff)
read 5000 | 0.421799 / 0.215209 (0.206590)
read 50000 | 4.220260 / 2.077655 (2.142605)
read_batch 50000 10 | 1.799140 / 1.504120 (0.295020)
read_batch 50000 100 | 1.579226 / 1.541195 (0.038031)
read_batch 50000 1000 | 1.605179 / 1.468490 (0.136689)
read_formatted numpy 5000 | 0.416461 / 4.584777 (-4.168316)
read_formatted pandas 5000 | 4.753848 / 3.745712 (1.008136)
read_formatted tensorflow 5000 | 0.973930 / 5.269862 (-4.295932)
read_formatted torch 5000 | 0.911044 / 4.565676 (-3.654632)
read_formatted_batch numpy 5000 10 | 0.051422 / 0.424275 (-0.372854)
read_formatted_batch numpy 5000 1000 | 0.010322 / 0.007607 (0.002714)
shuffled read 5000 | 0.523229 / 0.226044 (0.297185)
shuffled read 50000 | 5.217752 / 2.268929 (2.948824)
shuffled read_batch 50000 10 | 2.248774 / 55.444624 (-53.195851)
shuffled read_batch 50000 100 | 1.861856 / 6.876477 (-5.014621)
shuffled read_batch 50000 1000 | 1.879446 / 2.142072 (-0.262627)
shuffled read_formatted numpy 5000 | 0.525705 / 4.805227 (-4.279522)
shuffled read_formatted_batch numpy 5000 10 | 0.114011 / 6.500664 (-6.386653)
shuffled read_formatted_batch numpy 5000 1000 | 0.056803 / 0.075469 (-0.018666)

Benchmark: benchmark_map_filter.json

metric | new / old (diff)
filter | 1.598068 / 1.841788 (-0.243719)
map fast-tokenizer batched | 12.262777 / 8.074308 (4.188469)
map identity | 27.093735 / 10.191392 (16.902343)
map identity batched | 0.707237 / 0.680424 (0.026814)
map no-op batched | 0.513270 / 0.534201 (-0.020931)
map no-op batched numpy | 0.371739 / 0.579283 (-0.207544)
map no-op batched pandas | 0.505966 / 0.434364 (0.071602)
map no-op batched pytorch | 0.261224 / 0.540337 (-0.279114)
map no-op batched tensorflow | 0.283368 / 1.386936 (-1.103569)
