
Commit 36db39c
Upgrade black to version ~=22.0 (#3691)
* Upgrade black to version ~=22.0

* Last fixes
LysandreJik authored Feb 8, 2022
1 parent b06bf4e commit 36db39c
Showing 5 changed files with 6 additions and 7 deletions.

setup.py: 1 addition & 1 deletion
@@ -188,7 +188,7 @@
 ]
 )

-QUALITY_REQUIRE = ["black==21.4b0", "flake8>=3.8.3", "isort>=5.0.0", "pyyaml>=5.3.1"]
+QUALITY_REQUIRE = ["black~=22.0", "flake8>=3.8.3", "isort>=5.0.0", "pyyaml>=5.3.1"]


 EXTRAS_REQUIRE = {
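The dependency change above swaps a hard pin (`black==21.4b0`) for a PEP 440 compatible-release specifier: `black~=22.0` accepts any 22.x release but excludes the next major line. A quick illustration of how that specifier behaves, using the `packaging` library (illustration only, not part of this commit):

```python
# Sketch: how pip interprets the "~=22.0" compatible-release specifier.
from packaging.specifiers import SpecifierSet

compatible = SpecifierSet("~=22.0")  # equivalent to: >=22.0, ==22.*

print(compatible.contains("22.1.0"))   # True  - a 22.x release matches
print(compatible.contains("22.12.0"))  # True  - any later 22.x also matches
print(compatible.contains("23.1.0"))   # False - next major line is excluded
print(compatible.contains("21.4b0"))   # False - the previously pinned version
```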

src/datasets/arrow_reader.py: 1 addition & 1 deletion
@@ -43,7 +43,7 @@
 HF_GCP_BASE_URL = "https://storage.googleapis.com/huggingface-nlp/cache/datasets"

 _SUB_SPEC_RE = re.compile(
-    fr"""
+    rf"""
     ^
     (?P<split>{_split_re[1:-1]})
     (\[
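The `fr` to `rf` hunk above is purely cosmetic: Python accepts string prefix characters in any order, so `fr"..."` and `rf"..."` denote the same raw f-string, and black 22 simply normalizes the spelling, as this commit shows. A one-line sanity check:

```python
# Both prefixes produce the same raw f-string; only the spelling differs.
split = "train"
assert fr"^{split}\[" == rf"^{split}\[" == "^train\\["
```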

src/datasets/utils/py_utils.py: 1 addition & 2 deletions
@@ -71,7 +71,7 @@ def size_str(size_in_bytes):
     if not size_in_bytes:
         return "Unknown size"

-    _NAME_LIST = [("PiB", 2 ** 50), ("TiB", 2 ** 40), ("GiB", 2 ** 30), ("MiB", 2 ** 20), ("KiB", 2 ** 10)]
+    _NAME_LIST = [("PiB", 2**50), ("TiB", 2**40), ("GiB", 2**30), ("MiB", 2**20), ("KiB", 2**10)]

     size_in_bytes = float(size_in_bytes)
     for (name, size_bytes) in _NAME_LIST:
@@ -634,6 +634,5 @@ def _save_regex(pickler, obj):
         dill._dill.log.info("# Re")
         return

-
 except ImportError:
     pass
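The `2 ** 50` to `2**50` hunk reflects black 22's power-operator hugging: spaces around `**` are dropped when both operands are simple (names or literals). For context, here is a minimal, runnable sketch of a `size_str`-style helper in the new style; only the guard and `_NAME_LIST` lines come from the hunk above, while the loop body and fallback are assumptions for illustration:

```python
# Sketch of a size_str-style helper; the loop body and fallback below are
# assumed for illustration, not copied from src/datasets/utils/py_utils.py.
def size_str(size_in_bytes):
    if not size_in_bytes:
        return "Unknown size"

    # black 22 style: no spaces around ** when both operands are simple
    _NAME_LIST = [("PiB", 2**50), ("TiB", 2**40), ("GiB", 2**30), ("MiB", 2**20), ("KiB", 2**10)]

    size_in_bytes = float(size_in_bytes)
    for name, size_bytes in _NAME_LIST:
        if size_in_bytes >= size_bytes:
            return f"{size_in_bytes / size_bytes:.2f} {name}"
    return f"{int(size_in_bytes)} bytes"

print(size_str(600 * 2**20))  # 600.00 MiB
```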

tests/test_info_utils.py: 2 additions & 2 deletions
@@ -4,8 +4,8 @@
 from datasets.utils.info_utils import is_small_dataset


-@pytest.mark.parametrize("dataset_size", [None, 400 * 2 ** 20, 600 * 2 ** 20])
-@pytest.mark.parametrize("input_in_memory_max_size", ["default", 0, 100 * 2 ** 20, 900 * 2 ** 20])
+@pytest.mark.parametrize("dataset_size", [None, 400 * 2**20, 600 * 2**20])
+@pytest.mark.parametrize("input_in_memory_max_size", ["default", 0, 100 * 2**20, 900 * 2**20])
 def test_is_small_dataset(dataset_size, input_in_memory_max_size, monkeypatch):
     if input_in_memory_max_size != "default":
         monkeypatch.setattr(datasets.config, "IN_MEMORY_MAX_SIZE", input_in_memory_max_size)
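The parametrized sizes are byte counts written in MiB terms (for example, `400 * 2**20` is 400 MiB), exercising `is_small_dataset` around the patched `IN_MEMORY_MAX_SIZE` threshold. Note that `**` binds tighter than `*`, which is why the hugged `2**20` style reads naturally without parentheses:

```python
# Precedence check: ** binds tighter than *, so 400 * 2**20 is 400 * (2**20).
assert 400 * 2**20 == 400 * (2**20) == 419_430_400  # 400 MiB in bytes
```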

tests/test_load.py: 1 addition & 1 deletion
@@ -489,7 +489,7 @@ def test_load_dataset_local(dataset_loading_script_dir, data_dir, keep_in_memory
     with pytest.raises(FileNotFoundError) as exc_info:
         datasets.load_dataset(SAMPLE_DATASET_NAME_THAT_DOESNT_EXIST)
     m_combined_path = re.search(
-        fr"http\S*{re.escape(SAMPLE_DATASET_NAME_THAT_DOESNT_EXIST + '/' + SAMPLE_DATASET_NAME_THAT_DOESNT_EXIST + '.py')}\b",
+        rf"http\S*{re.escape(SAMPLE_DATASET_NAME_THAT_DOESNT_EXIST + '/' + SAMPLE_DATASET_NAME_THAT_DOESNT_EXIST + '.py')}\b",
         str(exc_info.value),
     )
     assert m_combined_path is not None and is_remote_url(m_combined_path.group())
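The pattern built here interpolates a literal path into a raw f-string; `re.escape` neutralizes regex metacharacters (such as the `.` in `.py`) before interpolation so the path matches literally. A standalone illustration with a hypothetical dataset name (the real test uses `SAMPLE_DATASET_NAME_THAT_DOESNT_EXIST` from its own fixtures):

```python
import re

# Hypothetical values for illustration only.
name = "_dummy_dataset"
error_text = "Couldn't find https://example.com/_dummy_dataset/_dummy_dataset.py"

# re.escape turns "." into "\." so it matches a literal dot, not any character.
pattern = rf"http\S*{re.escape(name + '/' + name + '.py')}\b"
match = re.search(pattern, error_text)
print(match.group())  # https://example.com/_dummy_dataset/_dummy_dataset.py
```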

1 comment on commit 36db39c

@github-actions


PyArrow==3.0.0


Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
|--------|------------------|
| read_batch_formatted_as_numpy after write_array2d | 0.009809 / 0.011353 (-0.001544) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.004168 / 0.011008 (-0.006840) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.031059 / 0.038508 (-0.007449) |
| read_batch_unformated after write_array2d | 0.035280 / 0.023109 (0.012170) |
| read_batch_unformated after write_flattened_sequence | 0.307772 / 0.275898 (0.031874) |
| read_batch_unformated after write_nested_sequence | 0.326118 / 0.323480 (0.002638) |
| read_col_formatted_as_numpy after write_array2d | 0.008261 / 0.007986 (0.000275) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.003726 / 0.004328 (-0.000603) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.009196 / 0.004250 (0.004946) |
| read_col_unformated after write_array2d | 0.047312 / 0.037052 (0.010260) |
| read_col_unformated after write_flattened_sequence | 0.293977 / 0.258489 (0.035488) |
| read_col_unformated after write_nested_sequence | 0.349803 / 0.293841 (0.055962) |
| read_formatted_as_numpy after write_array2d | 0.031463 / 0.128546 (-0.097084) |
| read_formatted_as_numpy after write_flattened_sequence | 0.009878 / 0.075646 (-0.065769) |
| read_formatted_as_numpy after write_nested_sequence | 0.251841 / 0.419271 (-0.167431) |
| read_unformated after write_array2d | 0.051212 / 0.043533 (0.007679) |
| read_unformated after write_flattened_sequence | 0.295738 / 0.255139 (0.040599) |
| read_unformated after write_nested_sequence | 0.317547 / 0.283200 (0.034348) |
| write_array2d | 0.111928 / 0.141683 (-0.029755) |
| write_flattened_sequence | 1.809222 / 1.452155 (0.357067) |
| write_nested_sequence | 1.864892 / 1.492716 (0.372176) |

Benchmark: benchmark_getitem_100B.json

| metric | new / old (diff) |
|--------|------------------|
| get_batch_of_1024_random_rows | 0.328674 / 0.018006 (0.310668) |
| get_batch_of_1024_rows | 0.534893 / 0.000490 (0.534404) |
| get_first_row | 0.011767 / 0.000200 (0.011568) |
| get_last_row | 0.000116 / 0.000054 (0.000061) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
|--------|------------------|
| select | 0.037015 / 0.037411 (-0.000396) |
| shard | 0.022377 / 0.014526 (0.007851) |
| shuffle | 0.028079 / 0.176557 (-0.148478) |
| sort | 0.075662 / 0.737135 (-0.661474) |
| train_test_split | 0.027922 / 0.296338 (-0.268416) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
|--------|------------------|
| read 5000 | 0.418271 / 0.215209 (0.203062) |
| read 50000 | 4.192479 / 2.077655 (2.114825) |
| read_batch 50000 10 | 1.806307 / 1.504120 (0.302187) |
| read_batch 50000 100 | 1.596273 / 1.541195 (0.055079) |
| read_batch 50000 1000 | 1.671808 / 1.468490 (0.203318) |
| read_formatted numpy 5000 | 0.444907 / 4.584777 (-4.139870) |
| read_formatted pandas 5000 | 4.559082 / 3.745712 (0.813370) |
| read_formatted tensorflow 5000 | 2.211994 / 5.269862 (-3.057868) |
| read_formatted torch 5000 | 0.913208 / 4.565676 (-3.652469) |
| read_formatted_batch numpy 5000 10 | 0.053584 / 0.424275 (-0.370691) |
| read_formatted_batch numpy 5000 1000 | 0.012062 / 0.007607 (0.004455) |
| shuffled read 5000 | 0.524590 / 0.226044 (0.298546) |
| shuffled read 50000 | 5.237106 / 2.268929 (2.968177) |
| shuffled read_batch 50000 10 | 2.293312 / 55.444624 (-53.151312) |
| shuffled read_batch 50000 100 | 1.913192 / 6.876477 (-4.963285) |
| shuffled read_batch 50000 1000 | 1.941258 / 2.142072 (-0.200814) |
| shuffled read_formatted numpy 5000 | 0.563643 / 4.805227 (-4.241584) |
| shuffled read_formatted_batch numpy 5000 10 | 0.124171 / 6.500664 (-6.376493) |
| shuffled read_formatted_batch numpy 5000 1000 | 0.062178 / 0.075469 (-0.013291) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
|--------|------------------|
| filter | 1.598191 / 1.841788 (-0.243597) |
| map fast-tokenizer batched | 14.260763 / 8.074308 (6.186455) |
| map identity | 26.344523 / 10.191392 (16.153131) |
| map identity batched | 0.831297 / 0.680424 (0.150873) |
| map no-op batched | 0.515443 / 0.534201 (-0.018758) |
| map no-op batched numpy | 0.485625 / 0.579283 (-0.093658) |
| map no-op batched pandas | 0.495293 / 0.434364 (0.060929) |
| map no-op batched pytorch | 0.313985 / 0.540337 (-0.226352) |
| map no-op batched tensorflow | 0.325482 / 1.386936 (-1.061454) |
PyArrow==latest

Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
|--------|------------------|
| read_batch_formatted_as_numpy after write_array2d | 0.008642 / 0.011353 (-0.002711) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.004280 / 0.011008 (-0.006728) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.029719 / 0.038508 (-0.008790) |
| read_batch_unformated after write_array2d | 0.034810 / 0.023109 (0.011701) |
| read_batch_unformated after write_flattened_sequence | 0.304690 / 0.275898 (0.028792) |
| read_batch_unformated after write_nested_sequence | 0.329387 / 0.323480 (0.005907) |
| read_col_formatted_as_numpy after write_array2d | 0.006628 / 0.007986 (-0.001357) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.005099 / 0.004328 (0.000770) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.007576 / 0.004250 (0.003326) |
| read_col_unformated after write_array2d | 0.042881 / 0.037052 (0.005828) |
| read_col_unformated after write_flattened_sequence | 0.289171 / 0.258489 (0.030682) |
| read_col_unformated after write_nested_sequence | 0.330235 / 0.293841 (0.036394) |
| read_formatted_as_numpy after write_array2d | 0.032054 / 0.128546 (-0.096492) |
| read_formatted_as_numpy after write_flattened_sequence | 0.009884 / 0.075646 (-0.065763) |
| read_formatted_as_numpy after write_nested_sequence | 0.252660 / 0.419271 (-0.166612) |
| read_unformated after write_array2d | 0.051757 / 0.043533 (0.008224) |
| read_unformated after write_flattened_sequence | 0.298719 / 0.255139 (0.043580) |
| read_unformated after write_nested_sequence | 0.321078 / 0.283200 (0.037879) |
| write_array2d | 0.098650 / 0.141683 (-0.043033) |
| write_flattened_sequence | 1.769415 / 1.452155 (0.317261) |
| write_nested_sequence | 1.825931 / 1.492716 (0.333215) |

Benchmark: benchmark_getitem_100B.json

| metric | new / old (diff) |
|--------|------------------|
| get_batch_of_1024_random_rows | 0.341950 / 0.018006 (0.323944) |
| get_batch_of_1024_rows | 0.533052 / 0.000490 (0.532562) |
| get_first_row | 0.005379 / 0.000200 (0.005179) |
| get_last_row | 0.000102 / 0.000054 (0.000048) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
|--------|------------------|
| select | 0.033822 / 0.037411 (-0.003589) |
| shard | 0.021540 / 0.014526 (0.007014) |
| shuffle | 0.029561 / 0.176557 (-0.146996) |
| sort | 0.075900 / 0.737135 (-0.661236) |
| train_test_split | 0.030074 / 0.296338 (-0.266264) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
|--------|------------------|
| read 5000 | 0.421963 / 0.215209 (0.206753) |
| read 50000 | 4.229102 / 2.077655 (2.151447) |
| read_batch 50000 10 | 1.865957 / 1.504120 (0.361837) |
| read_batch 50000 100 | 1.671244 / 1.541195 (0.130049) |
| read_batch 50000 1000 | 1.761986 / 1.468490 (0.293496) |
| read_formatted numpy 5000 | 0.441463 / 4.584777 (-4.143314) |
| read_formatted pandas 5000 | 4.647811 / 3.745712 (0.902099) |
| read_formatted tensorflow 5000 | 3.575895 / 5.269862 (-1.693967) |
| read_formatted torch 5000 | 0.956271 / 4.565676 (-3.609406) |
| read_formatted_batch numpy 5000 10 | 0.054328 / 0.424275 (-0.369947) |
| read_formatted_batch numpy 5000 1000 | 0.012844 / 0.007607 (0.005237) |
| shuffled read 5000 | 0.533313 / 0.226044 (0.307269) |
| shuffled read 50000 | 5.335204 / 2.268929 (3.066276) |
| shuffled read_batch 50000 10 | 2.318875 / 55.444624 (-53.125750) |
| shuffled read_batch 50000 100 | 1.961456 / 6.876477 (-4.915021) |
| shuffled read_batch 50000 1000 | 2.066257 / 2.142072 (-0.075815) |
| shuffled read_formatted numpy 5000 | 0.562175 / 4.805227 (-4.243052) |
| shuffled read_formatted_batch numpy 5000 10 | 0.123533 / 6.500664 (-6.377131) |
| shuffled read_formatted_batch numpy 5000 1000 | 0.062268 / 0.075469 (-0.013201) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
|--------|------------------|
| filter | 1.633117 / 1.841788 (-0.208671) |
| map fast-tokenizer batched | 14.092515 / 8.074308 (6.018207) |
| map identity | 27.107463 / 10.191392 (16.916071) |
| map identity batched | 0.922424 / 0.680424 (0.242000) |
| map no-op batched | 0.537160 / 0.534201 (0.002959) |
| map no-op batched numpy | 0.490152 / 0.579283 (-0.089131) |
| map no-op batched pandas | 0.499532 / 0.434364 (0.065168) |
| map no-op batched pytorch | 0.326862 / 0.540337 (-0.213476) |
| map no-op batched tensorflow | 0.348135 / 1.386936 (-1.038801) |

