Remove prepare_module in test_dataset_common
albertvillanova committed Oct 27, 2021
1 parent 8086798 commit 48cac2a
Showing 1 changed file with 9 additions and 9 deletions.
18 changes: 9 additions & 9 deletions tests/test_dataset_common.py
@@ -25,13 +25,13 @@
 from absl.testing import parameterized
 
 import datasets
-from datasets import cached_path, import_main_class, load_dataset, prepare_module
 from datasets.builder import BuilderConfig, DatasetBuilder
 from datasets.features import ClassLabel, Features, Value
+from datasets.load import dataset_module_factory, import_main_class, load_dataset
 from datasets.packaged_modules import _PACKAGED_DATASETS_MODULES
 from datasets.search import _has_faiss
 from datasets.utils.download_manager import GenerateMode
-from datasets.utils.file_utils import DownloadConfig, is_remote_url
+from datasets.utils.file_utils import DownloadConfig, cached_path, is_remote_url
 from datasets.utils.logging import get_logger
 from datasets.utils.mock_download_manager import MockDownloadManager

@@ -100,11 +100,11 @@ def __init__(self, parent):
     def load_builder_class(self, dataset_name, is_local=False):
         # Download/copy dataset script
         if is_local is True:
-            module_path, _ = prepare_module(os.path.join("datasets", dataset_name))
+            dataset_module = dataset_module_factory(os.path.join("datasets", dataset_name))
         else:
-            module_path, _ = prepare_module(dataset_name, download_config=DownloadConfig(force_download=True))
+            dataset_module = dataset_module_factory(dataset_name, download_config=DownloadConfig(force_download=True))
         # Get dataset builder class
-        builder_cls = import_main_class(module_path)
+        builder_cls = import_main_class(dataset_module.module_path)
         return builder_cls
 
     def load_all_configs(self, dataset_name, is_local=False) -> List[Optional[BuilderConfig]]:
@@ -254,8 +254,8 @@ def test_load_dataset_all_configs(self, dataset_name):
     @slow
     def test_load_real_dataset(self, dataset_name):
         path = "./datasets/" + dataset_name
-        module_path, hash = prepare_module(path, download_config=DownloadConfig(local_files_only=True), dataset=True)
-        builder_cls = import_main_class(module_path, dataset=True)
+        dataset_module = dataset_module_factory(path, download_config=DownloadConfig(local_files_only=True))
+        builder_cls = import_main_class(dataset_module.module_path)
         name = builder_cls.BUILDER_CONFIGS[0].name if builder_cls.BUILDER_CONFIGS else None
         with tempfile.TemporaryDirectory() as temp_cache_dir:
             dataset = load_dataset(
@@ -268,8 +268,8 @@ def test_load_real_dataset(self, dataset_name):
     @slow
     def test_load_real_dataset_all_configs(self, dataset_name):
         path = "./datasets/" + dataset_name
-        module_path, hash = prepare_module(path, download_config=DownloadConfig(local_files_only=True), dataset=True)
-        builder_cls = import_main_class(module_path, dataset=True)
+        dataset_module = dataset_module_factory(path, download_config=DownloadConfig(local_files_only=True))
+        builder_cls = import_main_class(dataset_module.module_path)
         config_names = (
             [config.name for config in builder_cls.BUILDER_CONFIGS] if len(builder_cls.BUILDER_CONFIGS) > 0 else [None]
         )
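For context, the sketch below shows the loading pattern the updated tests now follow, using only calls that appear in this diff (dataset_module_factory, import_main_class, DownloadConfig, and the dataset_module.module_path attribute). The dataset name "squad" is a placeholder chosen for illustration, not something this commit touches.

```python
# Minimal sketch of the post-commit loading pattern, based on the calls in this diff.
# "squad" is a placeholder dataset name used only for illustration.
from datasets.load import dataset_module_factory, import_main_class
from datasets.utils.file_utils import DownloadConfig

# Resolve the dataset script into a dataset module object
# (previously: module_path, _ = prepare_module(...)).
dataset_module = dataset_module_factory(
    "squad", download_config=DownloadConfig(force_download=True)
)

# Import the DatasetBuilder subclass from the resolved module path.
builder_cls = import_main_class(dataset_module.module_path)

# List the available configurations, if the builder defines any.
print([config.name for config in builder_cls.BUILDER_CONFIGS])
```

In short, the (module_path, hash) tuple returned by prepare_module is replaced by a dataset module object, so callers read module_path as an attribute instead of unpacking a tuple, and the dataset=True flag to import_main_class is no longer needed.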

1 comment on commit 48cac2a

@github-actions (bot) commented:
Show benchmarks

PyArrow==3.0.0

Show updated benchmarks!

Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
|---|---|
| read_batch_formatted_as_numpy after write_array2d | 0.010535 / 0.011353 (-0.000818) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.004903 / 0.011008 (-0.006106) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.036162 / 0.038508 (-0.002346) |
| read_batch_unformated after write_array2d | 0.034954 / 0.023109 (0.011844) |
| read_batch_unformated after write_flattened_sequence | 0.328208 / 0.275898 (0.052310) |
| read_batch_unformated after write_nested_sequence | 0.404934 / 0.323480 (0.081455) |
| read_col_formatted_as_numpy after write_array2d | 0.009914 / 0.007986 (0.001929) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.005007 / 0.004328 (0.000678) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.009733 / 0.004250 (0.005482) |
| read_col_unformated after write_array2d | 0.036629 / 0.037052 (-0.000423) |
| read_col_unformated after write_flattened_sequence | 0.329208 / 0.258489 (0.070719) |
| read_col_unformated after write_nested_sequence | 0.375553 / 0.293841 (0.081713) |
| read_formatted_as_numpy after write_array2d | 0.036599 / 0.128546 (-0.091947) |
| read_formatted_as_numpy after write_flattened_sequence | 0.013634 / 0.075646 (-0.062013) |
| read_formatted_as_numpy after write_nested_sequence | 0.293619 / 0.419271 (-0.125652) |
| read_unformated after write_array2d | 0.056165 / 0.043533 (0.012633) |
| read_unformated after write_flattened_sequence | 0.341750 / 0.255139 (0.086611) |
| read_unformated after write_nested_sequence | 0.360844 / 0.283200 (0.077645) |
| write_array2d | 0.078320 / 0.141683 (-0.063363) |
| write_flattened_sequence | 1.902674 / 1.452155 (0.450519) |
| write_nested_sequence | 1.934120 / 1.492716 (0.441404) |

Benchmark: benchmark_getitem_100B.json

| metric | new / old (diff) |
|---|---|
| get_batch_of_1024_random_rows | 0.188742 / 0.018006 (0.170736) |
| get_batch_of_1024_rows | 0.479921 / 0.000490 (0.479431) |
| get_first_row | 0.004125 / 0.000200 (0.003925) |
| get_last_row | 0.000425 / 0.000054 (0.000371) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
|---|---|
| select | 0.041614 / 0.037411 (0.004203) |
| shard | 0.025356 / 0.014526 (0.010831) |
| shuffle | 0.030376 / 0.176557 (-0.146181) |
| sort | 0.222159 / 0.737135 (-0.514976) |
| train_test_split | 0.029366 / 0.296338 (-0.266972) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
|---|---|
| read 5000 | 0.627887 / 0.215209 (0.412678) |
| read 50000 | 6.026631 / 2.077655 (3.948977) |
| read_batch 50000 10 | 2.366748 / 1.504120 (0.862628) |
| read_batch 50000 100 | 2.040107 / 1.541195 (0.498913) |
| read_batch 50000 1000 | 2.015034 / 1.468490 (0.546544) |
| read_formatted numpy 5000 | 0.759706 / 4.584777 (-3.825071) |
| read_formatted pandas 5000 | 6.632488 / 3.745712 (2.886776) |
| read_formatted tensorflow 5000 | 1.631373 / 5.269862 (-3.638488) |
| read_formatted torch 5000 | 1.506791 / 4.565676 (-3.058885) |
| read_formatted_batch numpy 5000 10 | 0.082669 / 0.424275 (-0.341606) |
| read_formatted_batch numpy 5000 1000 | 0.012383 / 0.007607 (0.004776) |
| shuffled read 5000 | 0.836186 / 0.226044 (0.610142) |
| shuffled read 50000 | 8.061028 / 2.268929 (5.792099) |
| shuffled read_batch 50000 10 | 3.068178 / 55.444624 (-52.376446) |
| shuffled read_batch 50000 100 | 2.703123 / 6.876477 (-4.173353) |
| shuffled read_batch 50000 1000 | 2.597441 / 2.142072 (0.455369) |
| shuffled read_formatted numpy 5000 | 1.014113 / 4.805227 (-3.791114) |
| shuffled read_formatted_batch numpy 5000 10 | 0.200140 / 6.500664 (-6.300524) |
| shuffled read_formatted_batch numpy 5000 1000 | 0.077020 / 0.075469 (0.001551) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
|---|---|
| filter | 2.061856 / 1.841788 (0.220068) |
| map fast-tokenizer batched | 14.846545 / 8.074308 (6.772237) |
| map identity | 46.201062 / 10.191392 (36.009670) |
| map identity batched | 1.015710 / 0.680424 (0.335286) |
| map no-op batched | 0.679404 / 0.534201 (0.145203) |
| map no-op batched numpy | 0.534167 / 0.579283 (-0.045116) |
| map no-op batched pandas | 0.712447 / 0.434364 (0.278083) |
| map no-op batched pytorch | 0.333665 / 0.540337 (-0.206672) |
| map no-op batched tensorflow | 0.357579 / 1.386936 (-1.029357) |
PyArrow==latest
Show updated benchmarks!

Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
|---|---|
| read_batch_formatted_as_numpy after write_array2d | 0.010321 / 0.011353 (-0.001032) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.004640 / 0.011008 (-0.006368) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.035481 / 0.038508 (-0.003027) |
| read_batch_unformated after write_array2d | 0.037496 / 0.023109 (0.014386) |
| read_batch_unformated after write_flattened_sequence | 0.331303 / 0.275898 (0.055405) |
| read_batch_unformated after write_nested_sequence | 0.383002 / 0.323480 (0.059522) |
| read_col_formatted_as_numpy after write_array2d | 0.007383 / 0.007986 (-0.000603) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.005134 / 0.004328 (0.000806) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.007829 / 0.004250 (0.003578) |
| read_col_unformated after write_array2d | 0.045325 / 0.037052 (0.008273) |
| read_col_unformated after write_flattened_sequence | 0.353928 / 0.258489 (0.095439) |
| read_col_unformated after write_nested_sequence | 0.378868 / 0.293841 (0.085027) |
| read_formatted_as_numpy after write_array2d | 0.041941 / 0.128546 (-0.086605) |
| read_formatted_as_numpy after write_flattened_sequence | 0.012829 / 0.075646 (-0.062818) |
| read_formatted_as_numpy after write_nested_sequence | 0.288158 / 0.419271 (-0.131114) |
| read_unformated after write_array2d | 0.064716 / 0.043533 (0.021183) |
| read_unformated after write_flattened_sequence | 0.339067 / 0.255139 (0.083928) |
| read_unformated after write_nested_sequence | 0.399541 / 0.283200 (0.116341) |
| write_array2d | 0.090165 / 0.141683 (-0.051518) |
| write_flattened_sequence | 2.054102 / 1.452155 (0.601947) |
| write_nested_sequence | 2.181731 / 1.492716 (0.689015) |

Benchmark: benchmark_getitem_100B.json

| metric | new / old (diff) |
|---|---|
| get_batch_of_1024_random_rows | 0.337342 / 0.018006 (0.319336) |
| get_batch_of_1024_rows | 0.473339 / 0.000490 (0.472849) |
| get_first_row | 0.076613 / 0.000200 (0.076413) |
| get_last_row | 0.001185 / 0.000054 (0.001130) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
|---|---|
| select | 0.034934 / 0.037411 (-0.002477) |
| shard | 0.023584 / 0.014526 (0.009059) |
| shuffle | 0.025820 / 0.176557 (-0.150736) |
| sort | 0.226935 / 0.737135 (-0.510201) |
| train_test_split | 0.027643 / 0.296338 (-0.268695) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
|---|---|
| read 5000 | 0.645751 / 0.215209 (0.430542) |
| read 50000 | 6.502213 / 2.077655 (4.424559) |
| read_batch 50000 10 | 2.239069 / 1.504120 (0.734949) |
| read_batch 50000 100 | 1.803265 / 1.541195 (0.262070) |
| read_batch 50000 1000 | 1.841538 / 1.468490 (0.373048) |
| read_formatted numpy 5000 | 0.716211 / 4.584777 (-3.868566) |
| read_formatted pandas 5000 | 6.641771 / 3.745712 (2.896059) |
| read_formatted tensorflow 5000 | 1.646214 / 5.269862 (-3.623648) |
| read_formatted torch 5000 | 1.502915 / 4.565676 (-3.062761) |
| read_formatted_batch numpy 5000 10 | 0.084105 / 0.424275 (-0.340170) |
| read_formatted_batch numpy 5000 1000 | 0.012166 / 0.007607 (0.004559) |
| shuffled read 5000 | 0.819799 / 0.226044 (0.593755) |
| shuffled read 50000 | 8.051592 / 2.268929 (5.782663) |
| shuffled read_batch 50000 10 | 3.155700 / 55.444624 (-52.288925) |
| shuffled read_batch 50000 100 | 2.364324 / 6.876477 (-4.512153) |
| shuffled read_batch 50000 1000 | 2.308283 / 2.142072 (0.166211) |
| shuffled read_formatted numpy 5000 | 0.925934 / 4.805227 (-3.879293) |
| shuffled read_formatted_batch numpy 5000 10 | 0.192164 / 6.500664 (-6.308500) |
| shuffled read_formatted_batch numpy 5000 1000 | 0.083236 / 0.075469 (0.007767) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
|---|---|
| filter | 1.980673 / 1.841788 (0.138886) |
| map fast-tokenizer batched | 14.665317 / 8.074308 (6.591009) |
| map identity | 45.405642 / 10.191392 (35.214250) |
| map identity batched | 0.997856 / 0.680424 (0.317432) |
| map no-op batched | 0.635311 / 0.534201 (0.101110) |
| map no-op batched numpy | 0.485376 / 0.579283 (-0.093907) |
| map no-op batched pandas | 0.718387 / 0.434364 (0.284023) |
| map no-op batched pytorch | 0.342999 / 0.540337 (-0.197339) |
| map no-op batched tensorflow | 0.374875 / 1.386936 (-1.012061) |

