Commit
huggingface#3111 Set features correctly when concatenating.
fr.branchaud-charron committed Oct 19, 2021
1 parent adc5cec commit cce0ae8
Showing 3 changed files with 24 additions and 4 deletions.
Makefile (2 changes: 1 addition & 1 deletion)

```diff
@@ -16,4 +16,4 @@ style:
 # Run tests for the library
 
 test:
-	python -m pytest -n auto --dist=loadfile -s -v ./tests/
+	python -m pytest -n 2 --dist=loadfile -s -v ./tests/
```
src/datasets/arrow_dataset.py (10 changes: 10 additions & 0 deletions)

```diff
@@ -3643,6 +3643,15 @@ def concatenate_datasets(
         format = {}
         logger.info("Some of the datasets have disparate format. Resetting the format of the concatenated dataset.")
 
+    # Find column types.
+    if axis == 1:
+        features_d = {}
+        for dset in dsets:
+            features_d.update(dset.features)
+        features = Features(features_d)
+    else:
+        features = dsets[0].features
+
     # Concatenate tables
     tables_to_concat = [dset._data for dset in dsets if len(dset._data) > 0]
     # There might be no table with data left hence return first empty table
@@ -3702,6 +3711,7 @@ def apply_offset_to_indices_table(table, offset):
         fingerprint=fingerprint,
     )
     concatenated_dataset.set_format(**format)
+    concatenated_dataset = concatenated_dataset.cast(features)
     return concatenated_dataset
```
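The new branch merges the per-dataset feature dictionaries for `axis=1` and then casts the concatenated dataset, so typed columns such as `ClassLabel` keep their types instead of degrading to the plain Arrow-inferred values. A minimal sketch of the fixed behavior (column names and data below are illustrative, not taken from the commit):

```python
from datasets import ClassLabel, Dataset, Features, Value, concatenate_datasets

left = Dataset.from_dict(
    {"label": [0, 1], "text": ["a", "b"]},
    features=Features({"label": ClassLabel(2, names=["NEG", "POS"]), "text": Value("string")}),
)
right = Dataset.from_dict(
    {"score": [1, 0]},
    features=Features({"score": ClassLabel(2, names=["NEG", "POS"])}),
)

# axis=1 adds the columns of `right` next to those of `left`; with this fix,
# the merged features keep ClassLabel instead of falling back to int64.
combined = concatenate_datasets([left, right], axis=1)
print(combined.features)
```

For the default `axis=0`, the new code simply reuses the features of the first dataset, since row-wise concatenation expects all inputs to share a schema.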
tests/test_arrow_dataset.py (16 changes: 13 additions & 3 deletions)

```diff
@@ -133,7 +133,6 @@ def _to(self, in_memory, tmp_dir, *datasets):
 
     def test_dummy_dataset(self, in_memory):
         with tempfile.TemporaryDirectory() as tmp_dir:
-
             with self._create_dummy_dataset(in_memory, tmp_dir) as dset:
                 self.assertDictEqual(dset.features, Features({"filename": Value("string")}))
                 self.assertEqual(dset[0]["filename"], "my_name-train_0")
@@ -273,7 +272,6 @@ def test_dummy_dataset_serialize(self, in_memory):
 
     def test_dummy_dataset_load_from_disk(self, in_memory):
         with tempfile.TemporaryDirectory() as tmp_dir:
-
             with self._create_dummy_dataset(in_memory, tmp_dir).select(range(10)) as dset:
                 dataset_path = os.path.join(tmp_dir, "my_dataset")
                 dset.save_to_disk(dataset_path)
@@ -1142,7 +1140,6 @@ def __call__(self, example):
 
         with tempfile.TemporaryDirectory() as tmp_dir:
             with self._create_dummy_dataset(in_memory, tmp_dir) as dset:
-
                 ex_cnt = ExampleCounter()
                 dset.map(ex_cnt)
                 self.assertEqual(ex_cnt.cnt, len(dset))
@@ -2242,6 +2239,19 @@ def test_concatenate_datasets_duplicate_columns(dataset):
     assert "duplicated" in str(excinfo.value)
 
 
+def test_concatenate_datasets_column_typing(dataset):
+    data = {"label": [0, 1, 1, 0], "col_1": ["a", "b", "c", "d"]}
+    data_2 = {"col2": [0, 1, 1, 0]}
+
+    features = Features({"label": ClassLabel(2, names=["POS", "NEG"]), "col_1": Value("string")})
+    features_2 = Features({"col2": ClassLabel(2, names=["POS", "NEG"])})
+    with Dataset.from_dict(data, features=features, info=DatasetInfo(features=features)) as dset:
+        with Dataset.from_dict(data_2, features=features_2, info=DatasetInfo(features=features_2)) as dset2:
+            concatenated = concatenate_datasets([dset, dset2], axis=1)
+            assert isinstance(concatenated.features["label"], ClassLabel)
+            assert isinstance(concatenated.features["col2"], ClassLabel)
+
+
 def test_interleave_datasets():
     d1 = Dataset.from_dict({"a": [0, 1, 2]})
     d2 = Dataset.from_dict({"a": [10, 11, 12, 13]})
```
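The test passes because the fix re-applies the collected schema with `Dataset.cast`, which rewrites the underlying Arrow table to match a given `Features` object. A small sketch of that step in isolation (values are illustrative):

```python
from datasets import ClassLabel, Dataset, Features, Value

# Arrow infers a plain int64 column here, losing any label semantics.
ds = Dataset.from_dict({"label": [0, 1, 1, 0]})
assert ds.features["label"] == Value("int64")

# cast() rebuilds the dataset with the requested feature types, which is
# how the commit restores ClassLabel typing on the concatenated dataset.
ds = ds.cast(Features({"label": ClassLabel(2, names=["NEG", "POS"])}))
assert isinstance(ds.features["label"], ClassLabel)
```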

1 comment on commit cce0ae8

@github-actions

PyArrow==3.0.0

Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
|--------|------------------|
| read_batch_formatted_as_numpy after write_array2d | 0.010908 / 0.011353 (-0.000445) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.004265 / 0.011008 (-0.006744) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.035045 / 0.038508 (-0.003463) |
| read_batch_unformated after write_array2d | 0.038308 / 0.023109 (0.015199) |
| read_batch_unformated after write_flattened_sequence | 0.328979 / 0.275898 (0.053081) |
| read_batch_unformated after write_nested_sequence | 0.469894 / 0.323480 (0.146414) |
| read_col_formatted_as_numpy after write_array2d | 0.009146 / 0.007986 (0.001160) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.003572 / 0.004328 (-0.000756) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.008859 / 0.004250 (0.004609) |
| read_col_unformated after write_array2d | 0.040966 / 0.037052 (0.003914) |
| read_col_unformated after write_flattened_sequence | 0.311223 / 0.258489 (0.052734) |
| read_col_unformated after write_nested_sequence | 0.339543 / 0.293841 (0.045702) |
| read_formatted_as_numpy after write_array2d | 0.037760 / 0.128546 (-0.090786) |
| read_formatted_as_numpy after write_flattened_sequence | 0.011527 / 0.075646 (-0.064119) |
| read_formatted_as_numpy after write_nested_sequence | 0.287070 / 0.419271 (-0.132202) |
| read_unformated after write_array2d | 0.052677 / 0.043533 (0.009144) |
| read_unformated after write_flattened_sequence | 0.327395 / 0.255139 (0.072256) |
| read_unformated after write_nested_sequence | 0.342779 / 0.283200 (0.059580) |
| write_array2d | 0.091965 / 0.141683 (-0.049718) |
| write_flattened_sequence | 1.812738 / 1.452155 (0.360583) |
| write_nested_sequence | 1.867025 / 1.492716 (0.374309) |

Benchmark: benchmark_getitem_100B.json

| metric | new / old (diff) |
|--------|------------------|
| get_batch_of_1024_random_rows | 0.216688 / 0.018006 (0.198682) |
| get_batch_of_1024_rows | 0.504083 / 0.000490 (0.503593) |
| get_first_row | 0.007394 / 0.000200 (0.007194) |
| get_last_row | 0.000500 / 0.000054 (0.000446) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
|--------|------------------|
| select | 0.036487 / 0.037411 (-0.000924) |
| shard | 0.028092 / 0.014526 (0.013566) |
| shuffle | 0.033853 / 0.176557 (-0.142704) |
| sort | 0.119999 / 0.737135 (-0.617136) |
| train_test_split | 0.027904 / 0.296338 (-0.268434) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
|--------|------------------|
| read 5000 | 0.562348 / 0.215209 (0.347139) |
| read 50000 | 5.474716 / 2.077655 (3.397061) |
| read_batch 50000 10 | 2.176777 / 1.504120 (0.672657) |
| read_batch 50000 100 | 1.963954 / 1.541195 (0.422759) |
| read_batch 50000 1000 | 1.884120 / 1.468490 (0.415630) |
| read_formatted numpy 5000 | 0.567480 / 4.584777 (-4.017297) |
| read_formatted pandas 5000 | 6.285693 / 3.745712 (2.539981) |
| read_formatted tensorflow 5000 | 1.379218 / 5.269862 (-3.890644) |
| read_formatted torch 5000 | 1.280679 / 4.565676 (-3.284998) |
| read_formatted_batch numpy 5000 10 | 0.063357 / 0.424275 (-0.360918) |
| read_formatted_batch numpy 5000 1000 | 0.004861 / 0.007607 (-0.002746) |
| shuffled read 5000 | 0.727943 / 0.226044 (0.501899) |
| shuffled read 50000 | 6.953971 / 2.268929 (4.685043) |
| shuffled read_batch 50000 10 | 2.689064 / 55.444624 (-52.755560) |
| shuffled read_batch 50000 100 | 2.083041 / 6.876477 (-4.793436) |
| shuffled read_batch 50000 1000 | 2.083046 / 2.142072 (-0.059026) |
| shuffled read_formatted numpy 5000 | 0.703382 / 4.805227 (-4.101845) |
| shuffled read_formatted_batch numpy 5000 10 | 0.142182 / 6.500664 (-6.358482) |
| shuffled read_formatted_batch numpy 5000 1000 | 0.056637 / 0.075469 (-0.018832) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
|--------|------------------|
| filter | 1.704821 / 1.841788 (-0.136967) |
| map fast-tokenizer batched | 13.544293 / 8.074308 (5.469985) |
| map identity | 40.083969 / 10.191392 (29.892577) |
| map identity batched | 0.948765 / 0.680424 (0.268341) |
| map no-op batched | 0.655844 / 0.534201 (0.121643) |
| map no-op batched numpy | 0.285819 / 0.579283 (-0.293464) |
| map no-op batched pandas | 0.682641 / 0.434364 (0.248277) |
| map no-op batched pytorch | 0.219506 / 0.540337 (-0.320832) |
| map no-op batched tensorflow | 0.234348 / 1.386936 (-1.152588) |
PyArrow==latest

Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
|--------|------------------|
| read_batch_formatted_as_numpy after write_array2d | 0.010176 / 0.011353 (-0.001177) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.003860 / 0.011008 (-0.007149) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.034930 / 0.038508 (-0.003578) |
| read_batch_unformated after write_array2d | 0.040108 / 0.023109 (0.016999) |
| read_batch_unformated after write_flattened_sequence | 0.350440 / 0.275898 (0.074542) |
| read_batch_unformated after write_nested_sequence | 0.380196 / 0.323480 (0.056716) |
| read_col_formatted_as_numpy after write_array2d | 0.009612 / 0.007986 (0.001626) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.006065 / 0.004328 (0.001737) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.009948 / 0.004250 (0.005697) |
| read_col_unformated after write_array2d | 0.038882 / 0.037052 (0.001829) |
| read_col_unformated after write_flattened_sequence | 0.354915 / 0.258489 (0.096426) |
| read_col_unformated after write_nested_sequence | 0.405594 / 0.293841 (0.111754) |
| read_formatted_as_numpy after write_array2d | 0.031976 / 0.128546 (-0.096570) |
| read_formatted_as_numpy after write_flattened_sequence | 0.012011 / 0.075646 (-0.063635) |
| read_formatted_as_numpy after write_nested_sequence | 0.283884 / 0.419271 (-0.135387) |
| read_unformated after write_array2d | 0.053623 / 0.043533 (0.010090) |
| read_unformated after write_flattened_sequence | 0.363131 / 0.255139 (0.107992) |
| read_unformated after write_nested_sequence | 0.378402 / 0.283200 (0.095202) |
| write_array2d | 0.088831 / 0.141683 (-0.052852) |
| write_flattened_sequence | 1.741348 / 1.452155 (0.289193) |
| write_nested_sequence | 1.879228 / 1.492716 (0.386512) |

Benchmark: benchmark_getitem_100B.json

| metric | new / old (diff) |
|--------|------------------|
| get_batch_of_1024_random_rows | 0.419246 / 0.018006 (0.401240) |
| get_batch_of_1024_rows | 0.549802 / 0.000490 (0.549313) |
| get_first_row | 0.085477 / 0.000200 (0.085277) |
| get_last_row | 0.000653 / 0.000054 (0.000598) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
|--------|------------------|
| select | 0.035362 / 0.037411 (-0.002049) |
| shard | 0.024287 / 0.014526 (0.009761) |
| shuffle | 0.026574 / 0.176557 (-0.149982) |
| sort | 0.136889 / 0.737135 (-0.600246) |
| train_test_split | 0.031266 / 0.296338 (-0.265073) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
|--------|------------------|
| read 5000 | 0.568231 / 0.215209 (0.353022) |
| read 50000 | 5.387718 / 2.077655 (3.310063) |
| read_batch 50000 10 | 2.074909 / 1.504120 (0.570789) |
| read_batch 50000 100 | 1.714720 / 1.541195 (0.173525) |
| read_batch 50000 1000 | 1.709058 / 1.468490 (0.240568) |
| read_formatted numpy 5000 | 0.556799 / 4.584777 (-4.027978) |
| read_formatted pandas 5000 | 6.359146 / 3.745712 (2.613434) |
| read_formatted tensorflow 5000 | 1.379808 / 5.269862 (-3.890054) |
| read_formatted torch 5000 | 1.252582 / 4.565676 (-3.313094) |
| read_formatted_batch numpy 5000 10 | 0.059632 / 0.424275 (-0.364643) |
| read_formatted_batch numpy 5000 1000 | 0.005818 / 0.007607 (-0.001789) |
| shuffled read 5000 | 0.660669 / 0.226044 (0.434625) |
| shuffled read 50000 | 6.663678 / 2.268929 (4.394749) |
| shuffled read_batch 50000 10 | 2.637656 / 55.444624 (-52.806968) |
| shuffled read_batch 50000 100 | 2.032999 / 6.876477 (-4.843478) |
| shuffled read_batch 50000 1000 | 2.028198 / 2.142072 (-0.113874) |
| shuffled read_formatted numpy 5000 | 0.771085 / 4.805227 (-4.034142) |
| shuffled read_formatted_batch numpy 5000 10 | 0.140166 / 6.500664 (-6.360498) |
| shuffled read_formatted_batch numpy 5000 1000 | 0.051680 / 0.075469 (-0.023789) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
|--------|------------------|
| filter | 1.700637 / 1.841788 (-0.141150) |
| map fast-tokenizer batched | 13.369226 / 8.074308 (5.294917) |
| map identity | 37.226900 / 10.191392 (27.035508) |
| map identity batched | 0.870011 / 0.680424 (0.189587) |
| map no-op batched | 0.544714 / 0.534201 (0.010513) |
| map no-op batched numpy | 0.254231 / 0.579283 (-0.325052) |
| map no-op batched pandas | 0.653218 / 0.434364 (0.218854) |
| map no-op batched pytorch | 0.247953 / 0.540337 (-0.292384) |
| map no-op batched tensorflow | 0.238084 / 1.386936 (-1.148852) |
