Commit
huggingface#3111 Set features correctly when concatenating.
fr.branchaud-charron committed Oct 19, 2021
1 parent adc5cec commit cce0ae8
Showing 3 changed files with 24 additions and 4 deletions.
Makefile (2 changes: 1 addition & 1 deletion)

```diff
@@ -16,4 +16,4 @@ style:
 # Run tests for the library
 
 test:
-	python -m pytest -n auto --dist=loadfile -s -v ./tests/
+	python -m pytest -n 2 --dist=loadfile -s -v ./tests/
```
src/datasets/arrow_dataset.py (10 changes: 10 additions & 0 deletions)

```diff
@@ -3643,6 +3643,15 @@ def concatenate_datasets(
         format = {}
         logger.info("Some of the datasets have disparate format. Resetting the format of the concatenated dataset.")
 
+    # Find column types.
+    if axis == 1:
+        features_d = {}
+        for dset in dsets:
+            features_d.update(dset.features)
+        features = Features(features_d)
+    else:
+        features = dsets[0].features
+
     # Concatenate tables
     tables_to_concat = [dset._data for dset in dsets if len(dset._data) > 0]
     # There might be no table with data left hence return first empty table
@@ -3702,6 +3711,7 @@ def apply_offset_to_indices_table(table, offset):
         fingerprint=fingerprint,
     )
     concatenated_dataset.set_format(**format)
+    concatenated_dataset = concatenated_dataset.cast(features)
     return concatenated_dataset
```
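The new branch merges the per-dataset feature dictionaries for `axis=1` and then casts the concatenated dataset, so typed columns such as `ClassLabel` keep their types instead of degrading to the plain Arrow-inferred values. A minimal sketch of the fixed behavior (column names and data below are illustrative, not taken from the commit):

```python
from datasets import ClassLabel, Dataset, Features, Value, concatenate_datasets

left = Dataset.from_dict(
    {"label": [0, 1], "text": ["a", "b"]},
    features=Features({"label": ClassLabel(2, names=["NEG", "POS"]), "text": Value("string")}),
)
right = Dataset.from_dict(
    {"score": [1, 0]},
    features=Features({"score": ClassLabel(2, names=["NEG", "POS"])}),
)

# axis=1 adds the columns of `right` next to those of `left`; with this fix,
# the merged features keep ClassLabel instead of falling back to int64.
combined = concatenate_datasets([left, right], axis=1)
print(combined.features)
```

For the default `axis=0`, the new code simply reuses the features of the first dataset, since row-wise concatenation expects all inputs to share a schema.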
tests/test_arrow_dataset.py (16 changes: 13 additions & 3 deletions)

```diff
@@ -133,7 +133,6 @@ def _to(self, in_memory, tmp_dir, *datasets):
 
     def test_dummy_dataset(self, in_memory):
         with tempfile.TemporaryDirectory() as tmp_dir:
-
             with self._create_dummy_dataset(in_memory, tmp_dir) as dset:
                 self.assertDictEqual(dset.features, Features({"filename": Value("string")}))
                 self.assertEqual(dset[0]["filename"], "my_name-train_0")
@@ -273,7 +272,6 @@ def test_dummy_dataset_serialize(self, in_memory):
 
     def test_dummy_dataset_load_from_disk(self, in_memory):
         with tempfile.TemporaryDirectory() as tmp_dir:
-
             with self._create_dummy_dataset(in_memory, tmp_dir).select(range(10)) as dset:
                 dataset_path = os.path.join(tmp_dir, "my_dataset")
                 dset.save_to_disk(dataset_path)
@@ -1142,7 +1140,6 @@ def __call__(self, example):
 
         with tempfile.TemporaryDirectory() as tmp_dir:
             with self._create_dummy_dataset(in_memory, tmp_dir) as dset:
-
                 ex_cnt = ExampleCounter()
                 dset.map(ex_cnt)
                 self.assertEqual(ex_cnt.cnt, len(dset))
@@ -2242,6 +2239,19 @@ def test_concatenate_datasets_duplicate_columns(dataset):
     assert "duplicated" in str(excinfo.value)
 
 
+def test_concatenate_datasets_column_typing(dataset):
+    data = {"label": [0, 1, 1, 0], "col_1": ["a", "b", "c", "d"]}
+    data_2 = {"col2": [0, 1, 1, 0]}
+
+    features = Features({"label": ClassLabel(2, names=["POS", "NEG"]), "col_1": Value("string")})
+    features_2 = Features({"col2": ClassLabel(2, names=["POS", "NEG"])})
+    with Dataset.from_dict(data, features=features, info=DatasetInfo(features=features)) as dset:
+        with Dataset.from_dict(data_2, features=features_2, info=DatasetInfo(features=features_2)) as dset2:
+            concatenated = concatenate_datasets([dset, dset2], axis=1)
+            assert isinstance(concatenated.features["label"], ClassLabel)
+            assert isinstance(concatenated.features["col2"], ClassLabel)
+
+
 def test_interleave_datasets():
     d1 = Dataset.from_dict({"a": [0, 1, 2]})
     d2 = Dataset.from_dict({"a": [10, 11, 12, 13]})
```
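The test passes because the fix re-applies the collected schema with `Dataset.cast`, which rewrites the underlying Arrow table to match a given `Features` object. A small sketch of that step in isolation (values are illustrative):

```python
from datasets import ClassLabel, Dataset, Features, Value

# Arrow infers a plain int64 column here, losing any label semantics.
ds = Dataset.from_dict({"label": [0, 1, 1, 0]})
assert ds.features["label"] == Value("int64")

# cast() rebuilds the dataset with the requested feature types, which is
# how the commit restores ClassLabel typing on the concatenated dataset.
ds = ds.cast(Features({"label": ClassLabel(2, names=["NEG", "POS"])}))
assert isinstance(ds.features["label"], ClassLabel)
```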

1 comment on commit cce0ae8

@github-actions

PyArrow==3.0.0

Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
|--------|------------------|
| read_batch_formatted_as_numpy after write_array2d | 0.010908 / 0.011353 (-0.000445) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.004265 / 0.011008 (-0.006744) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.035045 / 0.038508 (-0.003463) |
| read_batch_unformated after write_array2d | 0.038308 / 0.023109 (0.015199) |
| read_batch_unformated after write_flattened_sequence | 0.328979 / 0.275898 (0.053081) |
| read_batch_unformated after write_nested_sequence | 0.469894 / 0.323480 (0.146414) |
| read_col_formatted_as_numpy after write_array2d | 0.009146 / 0.007986 (0.001160) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.003572 / 0.004328 (-0.000756) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.008859 / 0.004250 (0.004609) |
| read_col_unformated after write_array2d | 0.040966 / 0.037052 (0.003914) |
| read_col_unformated after write_flattened_sequence | 0.311223 / 0.258489 (0.052734) |
| read_col_unformated after write_nested_sequence | 0.339543 / 0.293841 (0.045702) |
| read_formatted_as_numpy after write_array2d | 0.037760 / 0.128546 (-0.090786) |
| read_formatted_as_numpy after write_flattened_sequence | 0.011527 / 0.075646 (-0.064119) |
| read_formatted_as_numpy after write_nested_sequence | 0.287070 / 0.419271 (-0.132202) |
| read_unformated after write_array2d | 0.052677 / 0.043533 (0.009144) |
| read_unformated after write_flattened_sequence | 0.327395 / 0.255139 (0.072256) |
| read_unformated after write_nested_sequence | 0.342779 / 0.283200 (0.059580) |
| write_array2d | 0.091965 / 0.141683 (-0.049718) |
| write_flattened_sequence | 1.812738 / 1.452155 (0.360583) |
| write_nested_sequence | 1.867025 / 1.492716 (0.374309) |

Benchmark: benchmark_getitem_100B.json

| metric | new / old (diff) |
|--------|------------------|
| get_batch_of_1024_random_rows | 0.216688 / 0.018006 (0.198682) |
| get_batch_of_1024_rows | 0.504083 / 0.000490 (0.503593) |
| get_first_row | 0.007394 / 0.000200 (0.007194) |
| get_last_row | 0.000500 / 0.000054 (0.000446) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
|--------|------------------|
| select | 0.036487 / 0.037411 (-0.000924) |
| shard | 0.028092 / 0.014526 (0.013566) |
| shuffle | 0.033853 / 0.176557 (-0.142704) |
| sort | 0.119999 / 0.737135 (-0.617136) |
| train_test_split | 0.027904 / 0.296338 (-0.268434) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
|--------|------------------|
| read 5000 | 0.562348 / 0.215209 (0.347139) |
| read 50000 | 5.474716 / 2.077655 (3.397061) |
| read_batch 50000 10 | 2.176777 / 1.504120 (0.672657) |
| read_batch 50000 100 | 1.963954 / 1.541195 (0.422759) |
| read_batch 50000 1000 | 1.884120 / 1.468490 (0.415630) |
| read_formatted numpy 5000 | 0.567480 / 4.584777 (-4.017297) |
| read_formatted pandas 5000 | 6.285693 / 3.745712 (2.539981) |
| read_formatted tensorflow 5000 | 1.379218 / 5.269862 (-3.890644) |
| read_formatted torch 5000 | 1.280679 / 4.565676 (-3.284998) |
| read_formatted_batch numpy 5000 10 | 0.063357 / 0.424275 (-0.360918) |
| read_formatted_batch numpy 5000 1000 | 0.004861 / 0.007607 (-0.002746) |
| shuffled read 5000 | 0.727943 / 0.226044 (0.501899) |
| shuffled read 50000 | 6.953971 / 2.268929 (4.685043) |
| shuffled read_batch 50000 10 | 2.689064 / 55.444624 (-52.755560) |
| shuffled read_batch 50000 100 | 2.083041 / 6.876477 (-4.793436) |
| shuffled read_batch 50000 1000 | 2.083046 / 2.142072 (-0.059026) |
| shuffled read_formatted numpy 5000 | 0.703382 / 4.805227 (-4.101845) |
| shuffled read_formatted_batch numpy 5000 10 | 0.142182 / 6.500664 (-6.358482) |
| shuffled read_formatted_batch numpy 5000 1000 | 0.056637 / 0.075469 (-0.018832) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
|--------|------------------|
| filter | 1.704821 / 1.841788 (-0.136967) |
| map fast-tokenizer batched | 13.544293 / 8.074308 (5.469985) |
| map identity | 40.083969 / 10.191392 (29.892577) |
| map identity batched | 0.948765 / 0.680424 (0.268341) |
| map no-op batched | 0.655844 / 0.534201 (0.121643) |
| map no-op batched numpy | 0.285819 / 0.579283 (-0.293464) |
| map no-op batched pandas | 0.682641 / 0.434364 (0.248277) |
| map no-op batched pytorch | 0.219506 / 0.540337 (-0.320832) |
| map no-op batched tensorflow | 0.234348 / 1.386936 (-1.152588) |
PyArrow==latest

Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
|--------|------------------|
| read_batch_formatted_as_numpy after write_array2d | 0.010176 / 0.011353 (-0.001177) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.003860 / 0.011008 (-0.007149) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.034930 / 0.038508 (-0.003578) |
| read_batch_unformated after write_array2d | 0.040108 / 0.023109 (0.016999) |
| read_batch_unformated after write_flattened_sequence | 0.350440 / 0.275898 (0.074542) |
| read_batch_unformated after write_nested_sequence | 0.380196 / 0.323480 (0.056716) |
| read_col_formatted_as_numpy after write_array2d | 0.009612 / 0.007986 (0.001626) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.006065 / 0.004328 (0.001737) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.009948 / 0.004250 (0.005697) |
| read_col_unformated after write_array2d | 0.038882 / 0.037052 (0.001829) |
| read_col_unformated after write_flattened_sequence | 0.354915 / 0.258489 (0.096426) |
| read_col_unformated after write_nested_sequence | 0.405594 / 0.293841 (0.111754) |
| read_formatted_as_numpy after write_array2d | 0.031976 / 0.128546 (-0.096570) |
| read_formatted_as_numpy after write_flattened_sequence | 0.012011 / 0.075646 (-0.063635) |
| read_formatted_as_numpy after write_nested_sequence | 0.283884 / 0.419271 (-0.135387) |
| read_unformated after write_array2d | 0.053623 / 0.043533 (0.010090) |
| read_unformated after write_flattened_sequence | 0.363131 / 0.255139 (0.107992) |
| read_unformated after write_nested_sequence | 0.378402 / 0.283200 (0.095202) |
| write_array2d | 0.088831 / 0.141683 (-0.052852) |
| write_flattened_sequence | 1.741348 / 1.452155 (0.289193) |
| write_nested_sequence | 1.879228 / 1.492716 (0.386512) |

Benchmark: benchmark_getitem_100B.json

| metric | new / old (diff) |
|--------|------------------|
| get_batch_of_1024_random_rows | 0.419246 / 0.018006 (0.401240) |
| get_batch_of_1024_rows | 0.549802 / 0.000490 (0.549313) |
| get_first_row | 0.085477 / 0.000200 (0.085277) |
| get_last_row | 0.000653 / 0.000054 (0.000598) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
|--------|------------------|
| select | 0.035362 / 0.037411 (-0.002049) |
| shard | 0.024287 / 0.014526 (0.009761) |
| shuffle | 0.026574 / 0.176557 (-0.149982) |
| sort | 0.136889 / 0.737135 (-0.600246) |
| train_test_split | 0.031266 / 0.296338 (-0.265073) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
|--------|------------------|
| read 5000 | 0.568231 / 0.215209 (0.353022) |
| read 50000 | 5.387718 / 2.077655 (3.310063) |
| read_batch 50000 10 | 2.074909 / 1.504120 (0.570789) |
| read_batch 50000 100 | 1.714720 / 1.541195 (0.173525) |
| read_batch 50000 1000 | 1.709058 / 1.468490 (0.240568) |
| read_formatted numpy 5000 | 0.556799 / 4.584777 (-4.027978) |
| read_formatted pandas 5000 | 6.359146 / 3.745712 (2.613434) |
| read_formatted tensorflow 5000 | 1.379808 / 5.269862 (-3.890054) |
| read_formatted torch 5000 | 1.252582 / 4.565676 (-3.313094) |
| read_formatted_batch numpy 5000 10 | 0.059632 / 0.424275 (-0.364643) |
| read_formatted_batch numpy 5000 1000 | 0.005818 / 0.007607 (-0.001789) |
| shuffled read 5000 | 0.660669 / 0.226044 (0.434625) |
| shuffled read 50000 | 6.663678 / 2.268929 (4.394749) |
| shuffled read_batch 50000 10 | 2.637656 / 55.444624 (-52.806968) |
| shuffled read_batch 50000 100 | 2.032999 / 6.876477 (-4.843478) |
| shuffled read_batch 50000 1000 | 2.028198 / 2.142072 (-0.113874) |
| shuffled read_formatted numpy 5000 | 0.771085 / 4.805227 (-4.034142) |
| shuffled read_formatted_batch numpy 5000 10 | 0.140166 / 6.500664 (-6.360498) |
| shuffled read_formatted_batch numpy 5000 1000 | 0.051680 / 0.075469 (-0.023789) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
|--------|------------------|
| filter | 1.700637 / 1.841788 (-0.141150) |
| map fast-tokenizer batched | 13.369226 / 8.074308 (5.294917) |
| map identity | 37.226900 / 10.191392 (27.035508) |
| map identity batched | 0.870011 / 0.680424 (0.189587) |
| map no-op batched | 0.544714 / 0.534201 (0.010513) |
| map no-op batched numpy | 0.254231 / 0.579283 (-0.325052) |
| map no-op batched pandas | 0.653218 / 0.434364 (0.218854) |
| map no-op batched pytorch | 0.247953 / 0.540337 (-0.292384) |
| map no-op batched tensorflow | 0.238084 / 1.386936 (-1.148852) |
