From 92cad1513c85060c5da1819a906c49cd038ffd3e Mon Sep 17 00:00:00 2001 From: matt Date: Thu, 29 Jul 2021 19:03:05 +0100 Subject: [PATCH 01/45] Rebase onto master --- src/datasets/arrow_dataset.py | 45 +++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 5361c54664c..5052d830afd 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -159,6 +159,51 @@ def version(self): return self._info.version +class TensorflowDatasetMixIn: + def __init__(self): + pass + + def to_tf_dataset(self, tokenizer, cols_to_remove, batch_size, shuffle): + import tensorflow as tf + dataset_in = self.remove_columns(cols_to_remove) + tf_cols = [col for col in dataset_in.features] + label_index = tf_cols.index("label") + dtypes_out = [] + for col in tf_cols: + try: + col_feature = dataset_in.features[col] + if hasattr(col_feature, 'feature'): + col_feature = col_feature.feature + dtype_str = col_feature.dtype + dtypes_out.append(tf.as_dtype(dtype_str)) + except TypeError: + raise TypeError(f"Couldn't convert column {col}, dtype {dtype_str} to TF Tensor!") + + def indices_to_samples(indices): + batch = dataset_in.select(list(indices), keep_in_memory=True).to_dict() + batch = tokenizer.pad(batch) + output = [] + for col in tf_cols: + output.append(batch[col]) + return output + + def graph_indices_to_samples(indices): + return tf.py_function(indices_to_samples, [indices], Tout=dtypes_out) + + def reform_dict(*batch_list): + return ({col: batch_list[i] for i, col in enumerate(tf_cols)}, batch_list[label_index]) + + indices = tf.range(len(dataset_in)) + tf_dataset = tf.data.Dataset.from_tensor_slices(indices) + if shuffle: + tf_dataset = tf_dataset.shuffle(buffer_size=len(tf_dataset)) + tf_dataset = tf_dataset.batch(batch_size) + tf_dataset = tf_dataset.map(graph_indices_to_samples).map(reform_dict) + return tf_dataset + + + + class DatasetTransformationNotAllowedError(Exception): pass From 74b5badc2274f518ffc9ed469ed7addbc4a7e6be Mon Sep 17 00:00:00 2001 From: matt Date: Fri, 30 Jul 2021 16:03:42 +0100 Subject: [PATCH 02/45] Support multiple label_cols, replaced tokenizer with collate_fn, support padding to constant size for TPU training --- src/datasets/arrow_dataset.py | 45 ++++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 5052d830afd..033d0747878 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -163,27 +163,48 @@ class TensorflowDatasetMixIn: def __init__(self): pass - def to_tf_dataset(self, tokenizer, cols_to_remove, batch_size, shuffle): + def to_tf_dataset(self, columns, batch_size, shuffle, collate_fn=None, label_cols=None, pad_to=0): import tensorflow as tf - dataset_in = self.remove_columns(cols_to_remove) - tf_cols = [col for col in dataset_in.features] - label_index = tf_cols.index("label") + if len(set(columns)) < len(columns): + raise ValueError("List of columns contains duplicates!") + if len(set(label_cols)) < len(label_cols): + raise ValueError("List of label_cols contains duplicates!") + if pad_to > 0 and collate_fn is not None: + raise ValueError("pad_to cannot be used with a custom collate_fn - you should modify your collate_fn instead!") + if label_cols is not None: + cols_to_retain = list(set(columns + label_cols)) + else: + cols_to_retain = columns + + dataset_in = self.remove_columns([col for col in self.features if col not in 
cols_to_retain]) + feature_indices = dict() + label_indices = dict() dtypes_out = [] - for col in tf_cols: + for i, col in enumerate(cols_to_retain): try: col_feature = dataset_in.features[col] if hasattr(col_feature, 'feature'): col_feature = col_feature.feature dtype_str = col_feature.dtype dtypes_out.append(tf.as_dtype(dtype_str)) + # Note that these two are not mutually exclusive! + if col in columns: + feature_indices[col] = i + if col in label_cols: + label_indices[col] = i except TypeError: raise TypeError(f"Couldn't convert column {col}, dtype {dtype_str} to TF Tensor!") def indices_to_samples(indices): batch = dataset_in.select(list(indices), keep_in_memory=True).to_dict() - batch = tokenizer.pad(batch) + if collate_fn is not None: + batch = collate_fn(batch) + elif pad_to > 0: + batch = tf.ragged.constant(batch).to_tensor(shape=(batch_size, pad_to)) + else: + batch = tf.ragged.constant(batch).to_tensor() output = [] - for col in tf_cols: + for col in cols_to_retain: output.append(batch[col]) return output @@ -191,7 +212,15 @@ def graph_indices_to_samples(indices): return tf.py_function(indices_to_samples, [indices], Tout=dtypes_out) def reform_dict(*batch_list): - return ({col: batch_list[i] for i, col in enumerate(tf_cols)}, batch_list[label_index]) + features = {col: batch_list[idx] for col, idx in feature_indices.items()} + if label_cols is None: + return features + elif len(label_cols) == 1: + label_index = list(label_indices.values())[0] + return features, batch_list[label_index] + else: + labels = {col: batch_list[idx] for col, idx in label_indices.items()} + return features, labels indices = tf.range(len(dataset_in)) tf_dataset = tf.data.Dataset.from_tensor_slices(indices) From 97917bcc2140e528e0a09bbe5e71d98062b452a7 Mon Sep 17 00:00:00 2001 From: matt Date: Fri, 30 Jul 2021 17:07:50 +0100 Subject: [PATCH 03/45] Standardize int and float dtypes to keep TF happy --- src/datasets/arrow_dataset.py | 45 +++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 033d0747878..08c351bffec 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -181,31 +181,40 @@ def to_tf_dataset(self, columns, batch_size, shuffle, collate_fn=None, label_col label_indices = dict() dtypes_out = [] for i, col in enumerate(cols_to_retain): - try: - col_feature = dataset_in.features[col] - if hasattr(col_feature, 'feature'): - col_feature = col_feature.feature - dtype_str = col_feature.dtype - dtypes_out.append(tf.as_dtype(dtype_str)) - # Note that these two are not mutually exclusive! - if col in columns: - feature_indices[col] = i - if col in label_cols: - label_indices[col] = i - except TypeError: - raise TypeError(f"Couldn't convert column {col}, dtype {dtype_str} to TF Tensor!") + col_feature = dataset_in.features[col] + if hasattr(col_feature, 'feature'): + col_feature = col_feature.feature + dtype_str = col_feature.dtype + if dtype_str.startswith("int") or dtype_str.startswith("uint"): + dtypes_out.append(tf.int32) + elif dtype_str.startswith("float"): + dtypes_out.append(tf.float32) + else: + raise TypeError(f"Can't convert dtype {dtype_str} to TF Tensor!") + # Note that these two are not mutually exclusive! 
+ if col in columns: + feature_indices[col] = i + if col in label_cols: + label_indices[col] = i def indices_to_samples(indices): batch = dataset_in.select(list(indices), keep_in_memory=True).to_dict() if collate_fn is not None: batch = collate_fn(batch) - elif pad_to > 0: - batch = tf.ragged.constant(batch).to_tensor(shape=(batch_size, pad_to)) - else: - batch = tf.ragged.constant(batch).to_tensor() output = [] for col in cols_to_retain: - output.append(batch[col]) + if pad_to > 0: # We know collate_fn is False + tensor = tf.ragged.constant(batch[col]) + if isinstance(tensor, tf.RaggedTensor): + tensor = tensor.to_tensor(shape=(batch_size, pad_to)) + output.append(tensor) + elif collate_fn is None: + tensor = tf.ragged.constant(batch[col]) + if isinstance(tensor, tf.RaggedTensor): + tensor = tensor.to_tensor() + output.append(tensor) + else: # Already processed + output.append(batch[col]) return output def graph_indices_to_samples(indices): From 4eb79f5709f0801be4abc9aa5f2be97c43500d67 Mon Sep 17 00:00:00 2001 From: matt Date: Fri, 30 Jul 2021 18:33:22 +0100 Subject: [PATCH 04/45] Add a prefetch buffer for improved performance --- src/datasets/arrow_dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 08c351bffec..7118af7325a 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -237,6 +237,7 @@ def reform_dict(*batch_list): tf_dataset = tf_dataset.shuffle(buffer_size=len(tf_dataset)) tf_dataset = tf_dataset.batch(batch_size) tf_dataset = tf_dataset.map(graph_indices_to_samples).map(reform_dict) + tf_dataset = tf_dataset.prefetch(tf.data.AUTOTUNE) return tf_dataset From bed394a97ec6f985a021dc95a0e2306372287f8f Mon Sep 17 00:00:00 2001 From: matt Date: Wed, 4 Aug 2021 17:07:07 +0100 Subject: [PATCH 05/45] TF dataset is actually kinda performant now! 
--- src/datasets/arrow_dataset.py | 158 ++++++++++++++++++++-------------- 1 file changed, 92 insertions(+), 66 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 7118af7325a..ce4e7eafa28 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -163,84 +163,110 @@ class TensorflowDatasetMixIn: def __init__(self): pass - def to_tf_dataset(self, columns, batch_size, shuffle, collate_fn=None, label_cols=None, pad_to=0): + @staticmethod + def _get_output_signature(dataset, batch_size): import tensorflow as tf - if len(set(columns)) < len(columns): - raise ValueError("List of columns contains duplicates!") - if len(set(label_cols)) < len(label_cols): + signatures = dict() + for column, col_feature in dataset.features.items(): + if hasattr(col_feature, 'feature'): + dtype_str = col_feature.feature.dtype + else: + dtype_str = col_feature.dtype + if dtype_str.startswith("int") or dtype_str.startswith("uint"): + dtype = tf.int32 + elif dtype_str.startswith("float"): + dtype = tf.float32 + else: + raise ValueError(f"Could not convert datatype {dtype_str} in column {column}!") + + if hasattr(col_feature, 'shape'): + shape = [batch_size] + list(col_feature.shape) + elif hasattr(col_feature, 'length'): + shape = [batch_size, col_feature.length] + else: + shape = [batch_size] + shape = [dim if dim != -1 else None for dim in shape] + + signatures[column] = tf.TensorSpec(shape=shape, dtype=dtype) + return signatures + + def to_tf_dataset(self, columns, batch_size, shuffle, collate_fn=None, label_cols=None, pad_to=0, pad_value=0, prefetch=True): + import tensorflow as tf + if label_cols is None: + label_cols = [] + elif isinstance(label_cols, str): + label_cols = [label_cols] + elif len(set(label_cols)) < len(label_cols): raise ValueError("List of label_cols contains duplicates!") + if not columns: + raise ValueError("Need to specify at least one column!") + elif isinstance(columns, str): + columns = [columns] + elif len(set(columns)) < len(columns): + raise ValueError("List of columns contains duplicates!") if pad_to > 0 and collate_fn is not None: - raise ValueError("pad_to cannot be used with a custom collate_fn - you should modify your collate_fn instead!") + raise ValueError( + "pad_to cannot be used with a custom collate_fn - you should modify your collate_fn instead!") if label_cols is not None: cols_to_retain = list(set(columns + label_cols)) else: cols_to_retain = columns - - dataset_in = self.remove_columns([col for col in self.features if col not in cols_to_retain]) - feature_indices = dict() - label_indices = dict() - dtypes_out = [] - for i, col in enumerate(cols_to_retain): - col_feature = dataset_in.features[col] - if hasattr(col_feature, 'feature'): - col_feature = col_feature.feature - dtype_str = col_feature.dtype - if dtype_str.startswith("int") or dtype_str.startswith("uint"): - dtypes_out.append(tf.int32) - elif dtype_str.startswith("float"): - dtypes_out.append(tf.float32) + for col in cols_to_retain: + if col not in self.features: + raise ValueError(f"Couldn't find column {col} in dataset!") + dataset = self.remove_columns([col for col in self.features if col not in cols_to_retain]) + gen_signature = self._get_output_signature(dataset, batch_size) + num_batches = len(dataset) // batch_size # Because we drop the remainder + + def tf_generator(): + # Note that the 'tensorflow' return format uses ragged tensors, which are VERY unperformant + # right now (TF 2.5). 
This may or may not change in the future, but for now we stick to 'numpy'. + if shuffle: + epoch_dataset = dataset.shuffle(load_from_cache_file=False) else: - raise TypeError(f"Can't convert dtype {dtype_str} to TF Tensor!") - # Note that these two are not mutually exclusive! - if col in columns: - feature_indices[col] = i - if col in label_cols: - label_indices[col] = i - - def indices_to_samples(indices): - batch = dataset_in.select(list(indices), keep_in_memory=True).to_dict() - if collate_fn is not None: - batch = collate_fn(batch) - output = [] - for col in cols_to_retain: - if pad_to > 0: # We know collate_fn is False - tensor = tf.ragged.constant(batch[col]) - if isinstance(tensor, tf.RaggedTensor): - tensor = tensor.to_tensor(shape=(batch_size, pad_to)) - output.append(tensor) - elif collate_fn is None: - tensor = tf.ragged.constant(batch[col]) - if isinstance(tensor, tf.RaggedTensor): - tensor = tensor.to_tensor() - output.append(tensor) - else: # Already processed - output.append(batch[col]) - return output - - def graph_indices_to_samples(indices): - return tf.py_function(indices_to_samples, [indices], Tout=dtypes_out) - - def reform_dict(*batch_list): - features = {col: batch_list[idx] for col, idx in feature_indices.items()} - if label_cols is None: - return features - elif len(label_cols) == 1: - label_index = list(label_indices.values())[0] - return features, batch_list[label_index] + epoch_dataset = dataset + if collate_fn is None: + epoch_dataset.set_format('numpy') # Automatic padding else: - labels = {col: batch_list[idx] for col, idx in label_indices.items()} + epoch_dataset.set_format('python') # List of possibly variable lists + for i in range(0, len(epoch_dataset) - batch_size + 1, batch_size): + batch = epoch_dataset[i: i + batch_size] + if collate_fn is not None: + batch = collate_fn(batch) + batch = {key: np.array(val) for key, val in batch.items()} + yield batch + + tf_dataset = tf.data.Dataset.from_generator(tf_generator, output_signature=gen_signature) + + if pad_to > 0: + def padding_function(input_batch): + output_batch = dict() + for key, tensor in input_batch.items(): + if tf.rank(tensor) == 2: + padding = [[0, 0], [0, pad_to - tf.shape(tensor)[1]]] + output_batch[key] = tf.pad(tensor, padding, constant_values=pad_value) + else: + output_batch[key] = tensor + return output_batch + + tf_dataset = tf_dataset.map(padding_function) + + if label_cols: + def split_features_and_labels(input_batch): + features = {key: tensor for key, tensor in input_batch.items() if key in columns} + labels = {key: tensor for key, tensor in input_batch.items() if key in label_cols} + if len(features) == 1: + features = list(features.values())[0] + if len(labels) == 1: + labels = list(labels.values())[0] return features, labels - indices = tf.range(len(dataset_in)) - tf_dataset = tf.data.Dataset.from_tensor_slices(indices) - if shuffle: - tf_dataset = tf_dataset.shuffle(buffer_size=len(tf_dataset)) - tf_dataset = tf_dataset.batch(batch_size) - tf_dataset = tf_dataset.map(graph_indices_to_samples).map(reform_dict) - tf_dataset = tf_dataset.prefetch(tf.data.AUTOTUNE) - return tf_dataset - + tf_dataset = tf_dataset.map(split_features_and_labels) + tf_dataset = tf_dataset.apply(tf.data.experimental.assert_cardinality(num_batches)) + if prefetch: + tf_dataset = tf_dataset.prefetch(tf.data.AUTOTUNE) + return tf_dataset class DatasetTransformationNotAllowedError(Exception): From ea525a2e0b66e8dc83fe21f0a7a067067779871d Mon Sep 17 00:00:00 2001 From: matt Date: Wed, 4 Aug 2021 17:11:00 
+0100 Subject: [PATCH 06/45] TF dataset is actually kinda performant now! --- src/datasets/arrow_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index ce4e7eafa28..ff17af58529 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -190,7 +190,7 @@ def _get_output_signature(dataset, batch_size): signatures[column] = tf.TensorSpec(shape=shape, dtype=dtype) return signatures - def to_tf_dataset(self, columns, batch_size, shuffle, collate_fn=None, label_cols=None, pad_to=0, pad_value=0, prefetch=True): + def to_tf_dataset(self, columns, batch_size, shuffle, collate_fn=None, label_cols=None, pad_to=0, prefetch=True): import tensorflow as tf if label_cols is None: label_cols = [] @@ -244,7 +244,7 @@ def padding_function(input_batch): for key, tensor in input_batch.items(): if tf.rank(tensor) == 2: padding = [[0, 0], [0, pad_to - tf.shape(tensor)[1]]] - output_batch[key] = tf.pad(tensor, padding, constant_values=pad_value) + output_batch[key] = tf.pad(tensor, padding) else: output_batch[key] = tensor return output_batch From d3a8140fde51a675a9751f6cec5658e1ee4a95e8 Mon Sep 17 00:00:00 2001 From: matt Date: Wed, 4 Aug 2021 17:12:05 +0100 Subject: [PATCH 07/45] Style pass --- src/datasets/arrow_dataset.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index ff17af58529..204a6482a38 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -166,9 +166,10 @@ def __init__(self): @staticmethod def _get_output_signature(dataset, batch_size): import tensorflow as tf + signatures = dict() for column, col_feature in dataset.features.items(): - if hasattr(col_feature, 'feature'): + if hasattr(col_feature, "feature"): dtype_str = col_feature.feature.dtype else: dtype_str = col_feature.dtype @@ -179,9 +180,9 @@ def _get_output_signature(dataset, batch_size): else: raise ValueError(f"Could not convert datatype {dtype_str} in column {column}!") - if hasattr(col_feature, 'shape'): + if hasattr(col_feature, "shape"): shape = [batch_size] + list(col_feature.shape) - elif hasattr(col_feature, 'length'): + elif hasattr(col_feature, "length"): shape = [batch_size, col_feature.length] else: shape = [batch_size] @@ -192,6 +193,7 @@ def _get_output_signature(dataset, batch_size): def to_tf_dataset(self, columns, batch_size, shuffle, collate_fn=None, label_cols=None, pad_to=0, prefetch=True): import tensorflow as tf + if label_cols is None: label_cols = [] elif isinstance(label_cols, str): @@ -206,7 +208,8 @@ def to_tf_dataset(self, columns, batch_size, shuffle, collate_fn=None, label_col raise ValueError("List of columns contains duplicates!") if pad_to > 0 and collate_fn is not None: raise ValueError( - "pad_to cannot be used with a custom collate_fn - you should modify your collate_fn instead!") + "pad_to cannot be used with a custom collate_fn - you should modify your collate_fn instead!" 
+ ) if label_cols is not None: cols_to_retain = list(set(columns + label_cols)) else: @@ -226,11 +229,11 @@ def tf_generator(): else: epoch_dataset = dataset if collate_fn is None: - epoch_dataset.set_format('numpy') # Automatic padding + epoch_dataset.set_format("numpy") # Automatic padding else: - epoch_dataset.set_format('python') # List of possibly variable lists + epoch_dataset.set_format("python") # List of possibly variable lists for i in range(0, len(epoch_dataset) - batch_size + 1, batch_size): - batch = epoch_dataset[i: i + batch_size] + batch = epoch_dataset[i : i + batch_size] if collate_fn is not None: batch = collate_fn(batch) batch = {key: np.array(val) for key, val in batch.items()} @@ -239,6 +242,7 @@ def tf_generator(): tf_dataset = tf.data.Dataset.from_generator(tf_generator, output_signature=gen_signature) if pad_to > 0: + def padding_function(input_batch): output_batch = dict() for key, tensor in input_batch.items(): @@ -252,6 +256,7 @@ def padding_function(input_batch): tf_dataset = tf_dataset.map(padding_function) if label_cols: + def split_features_and_labels(input_batch): features = {key: tensor for key, tensor in input_batch.items() if key in columns} labels = {key: tensor for key, tensor in input_batch.items() if key in label_cols} From 3ce6dc44e9956b9d7017a9fde441cfc0e9f6f862 Mon Sep 17 00:00:00 2001 From: matt Date: Wed, 4 Aug 2021 17:25:37 +0100 Subject: [PATCH 08/45] Helpful error message if my code gets caught off-guard by unexpected feature types --- src/datasets/arrow_dataset.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 204a6482a38..b1a165d6018 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -44,7 +44,7 @@ from . import config, utils from .arrow_reader import ArrowReader from .arrow_writer import ArrowWriter, OptimizedTypedSequence -from .features import ClassLabel, Features, Value, cast_to_python_objects +from .features import ClassLabel, Features, Value, _ArrayXD, Sequence, cast_to_python_objects from .filesystems import extract_path_from_uri, is_remote_filesystem from .fingerprint import ( fingerprint_transform, @@ -180,12 +180,20 @@ def _get_output_signature(dataset, batch_size): else: raise ValueError(f"Could not convert datatype {dtype_str} in column {column}!") - if hasattr(col_feature, "shape"): + if isinstance(col_feature, Value): + shape = [batch_size] + elif isinstance(col_feature, _ArrayXD): shape = [batch_size] + list(col_feature.shape) - elif hasattr(col_feature, "length"): + elif isinstance(col_feature, Sequence): shape = [batch_size, col_feature.length] else: - shape = [batch_size] + raise ValueError(f"Couldn't parse feature {column} with type {type(col_feature)}! " + "This may indicate a column was included with an unusual datatype " + "that we were unable to process correctly. 
" + "If you're getting this error with one of our datasets, and you're " + "sure the column should be convertable to tf.Tensor, please " + "file an issue at github.com/huggingface/datasets and tag " + "@rocketknight1!") shape = [dim if dim != -1 else None for dim in shape] signatures[column] = tf.TensorSpec(shape=shape, dtype=dtype) From 67c06570a106eea78cff9706492bc9bec0f92cdb Mon Sep 17 00:00:00 2001 From: matt Date: Wed, 4 Aug 2021 17:27:43 +0100 Subject: [PATCH 09/45] Style pass --- src/datasets/arrow_dataset.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index b1a165d6018..dce82800791 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -44,7 +44,7 @@ from . import config, utils from .arrow_reader import ArrowReader from .arrow_writer import ArrowWriter, OptimizedTypedSequence -from .features import ClassLabel, Features, Value, _ArrayXD, Sequence, cast_to_python_objects +from .features import ClassLabel, Features, Sequence, Value, _ArrayXD, cast_to_python_objects from .filesystems import extract_path_from_uri, is_remote_filesystem from .fingerprint import ( fingerprint_transform, @@ -187,13 +187,15 @@ def _get_output_signature(dataset, batch_size): elif isinstance(col_feature, Sequence): shape = [batch_size, col_feature.length] else: - raise ValueError(f"Couldn't parse feature {column} with type {type(col_feature)}! " - "This may indicate a column was included with an unusual datatype " - "that we were unable to process correctly. " - "If you're getting this error with one of our datasets, and you're " - "sure the column should be convertable to tf.Tensor, please " - "file an issue at github.com/huggingface/datasets and tag " - "@rocketknight1!") + raise ValueError( + f"Couldn't parse feature {column} with type {type(col_feature)}! " + "This may indicate a column was included with an unusual datatype " + "that we were unable to process correctly. " + "If you're getting this error with one of our datasets, and you're " + "sure the column should be convertable to tf.Tensor, please " + "file an issue at github.com/huggingface/datasets and tag " + "@rocketknight1!" 
+ ) shape = [dim if dim != -1 else None for dim in shape] signatures[column] = tf.TensorSpec(shape=shape, dtype=dtype) From 2963f0a110c23356bb56d863a352d2d80324b0d5 Mon Sep 17 00:00:00 2001 From: matt Date: Thu, 5 Aug 2021 14:43:11 +0100 Subject: [PATCH 10/45] Added drop_remainder argument, removed pad_to --- src/datasets/arrow_dataset.py | 34 ++++++++++++---------------------- 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index dce82800791..739eef6f754 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -201,7 +201,9 @@ def _get_output_signature(dataset, batch_size): signatures[column] = tf.TensorSpec(shape=shape, dtype=dtype) return signatures - def to_tf_dataset(self, columns, batch_size, shuffle, collate_fn=None, label_cols=None, pad_to=0, prefetch=True): + def to_tf_dataset( + self, columns, batch_size, shuffle, drop_remainder=None, collate_fn=None, label_cols=None, prefetch=True + ): import tensorflow as tf if label_cols is None: @@ -216,10 +218,6 @@ def to_tf_dataset(self, columns, batch_size, shuffle, collate_fn=None, label_col columns = [columns] elif len(set(columns)) < len(columns): raise ValueError("List of columns contains duplicates!") - if pad_to > 0 and collate_fn is not None: - raise ValueError( - "pad_to cannot be used with a custom collate_fn - you should modify your collate_fn instead!" - ) if label_cols is not None: cols_to_retain = list(set(columns + label_cols)) else: @@ -227,9 +225,15 @@ def to_tf_dataset(self, columns, batch_size, shuffle, collate_fn=None, label_col for col in cols_to_retain: if col not in self.features: raise ValueError(f"Couldn't find column {col} in dataset!") + if drop_remainder is None: + # We assume that if you're shuffling it's the train set, so we drop the remainder unless told not to + drop_remainder = shuffle dataset = self.remove_columns([col for col in self.features if col not in cols_to_retain]) gen_signature = self._get_output_signature(dataset, batch_size) - num_batches = len(dataset) // batch_size # Because we drop the remainder + if drop_remainder: + num_batches = floor(len(dataset) / batch_size) # Division rounding down ( // still returns a float!) 
+ else: + num_batches = ceil(len(dataset) / batch_size) # Division rounding up def tf_generator(): # Note that the 'tensorflow' return format uses ragged tensors, which are VERY unperformant @@ -242,8 +246,8 @@ def tf_generator(): epoch_dataset.set_format("numpy") # Automatic padding else: epoch_dataset.set_format("python") # List of possibly variable lists - for i in range(0, len(epoch_dataset) - batch_size + 1, batch_size): - batch = epoch_dataset[i : i + batch_size] + for i in range(num_batches): + batch = epoch_dataset[i * batch_size : (i + 1) * batch_size] if collate_fn is not None: batch = collate_fn(batch) batch = {key: np.array(val) for key, val in batch.items()} @@ -251,20 +255,6 @@ def tf_generator(): tf_dataset = tf.data.Dataset.from_generator(tf_generator, output_signature=gen_signature) - if pad_to > 0: - - def padding_function(input_batch): - output_batch = dict() - for key, tensor in input_batch.items(): - if tf.rank(tensor) == 2: - padding = [[0, 0], [0, pad_to - tf.shape(tensor)[1]]] - output_batch[key] = tf.pad(tensor, padding) - else: - output_batch[key] = tensor - return output_batch - - tf_dataset = tf_dataset.map(padding_function) - if label_cols: def split_features_and_labels(input_batch): From 7f11d76396fb3d42f83dbc1c30fff182a47a3d36 Mon Sep 17 00:00:00 2001 From: matt Date: Thu, 5 Aug 2021 15:00:46 +0100 Subject: [PATCH 11/45] Correct shape signatures when we're not dropping the remainder --- src/datasets/arrow_dataset.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 739eef6f754..6bb1e39ce17 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -229,10 +229,11 @@ def to_tf_dataset( # We assume that if you're shuffling it's the train set, so we drop the remainder unless told not to drop_remainder = shuffle dataset = self.remove_columns([col for col in self.features if col not in cols_to_retain]) - gen_signature = self._get_output_signature(dataset, batch_size) if drop_remainder: + gen_signature = self._get_output_signature(dataset, batch_size=batch_size) num_batches = floor(len(dataset) / batch_size) # Division rounding down ( // still returns a float!) else: + gen_signature = self._get_output_signature(dataset, batch_size=None) # Because batches can be variable here num_batches = ceil(len(dataset) / batch_size) # Division rounding up def tf_generator(): @@ -268,6 +269,9 @@ def split_features_and_labels(input_batch): tf_dataset = tf_dataset.map(split_features_and_labels) + elif len(columns) == 1: + tf_dataset = tf_dataset.map(lambda x: list(x.values())[0]) + tf_dataset = tf_dataset.apply(tf.data.experimental.assert_cardinality(num_batches)) if prefetch: tf_dataset = tf_dataset.prefetch(tf.data.AUTOTUNE) From bbf61978b7090947af04445200186be3adae6991 Mon Sep 17 00:00:00 2001 From: matt Date: Thu, 5 Aug 2021 15:12:12 +0100 Subject: [PATCH 12/45] Style pass --- src/datasets/arrow_dataset.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 6bb1e39ce17..c2d603013a6 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -233,7 +233,9 @@ def to_tf_dataset( gen_signature = self._get_output_signature(dataset, batch_size=batch_size) num_batches = floor(len(dataset) / batch_size) # Division rounding down ( // still returns a float!) 
else: - gen_signature = self._get_output_signature(dataset, batch_size=None) # Because batches can be variable here + gen_signature = self._get_output_signature( + dataset, batch_size=None + ) # Because batches can be variable here num_batches = ceil(len(dataset) / batch_size) # Division rounding up def tf_generator(): From f902bdedba49977616da8e7853dfefbec15c5fef Mon Sep 17 00:00:00 2001 From: matt Date: Thu, 5 Aug 2021 17:50:15 +0100 Subject: [PATCH 13/45] Support ClassLabel columns too! --- src/datasets/arrow_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index c2d603013a6..294afc32f88 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -180,7 +180,7 @@ def _get_output_signature(dataset, batch_size): else: raise ValueError(f"Could not convert datatype {dtype_str} in column {column}!") - if isinstance(col_feature, Value): + if isinstance(col_feature, (Value, ClassLabel)): shape = [batch_size] elif isinstance(col_feature, _ArrayXD): shape = [batch_size] + list(col_feature.shape) From 990f150a7de6fac73855dd3de06e0cddd25d3e08 Mon Sep 17 00:00:00 2001 From: matt Date: Mon, 16 Aug 2021 16:42:39 +0100 Subject: [PATCH 14/45] Re-enable `tf.ragged` by avoiding `tf.ragged.constant` unless absolutely necessary --- src/datasets/arrow_dataset.py | 17 +++++++++++------ src/datasets/formatting/tf_formatter.py | 23 +++++++++++++++++++---- 2 files changed, 30 insertions(+), 10 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 294afc32f88..15a51e79fbd 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -202,10 +202,13 @@ def _get_output_signature(dataset, batch_size): return signatures def to_tf_dataset( - self, columns, batch_size, shuffle, drop_remainder=None, collate_fn=None, label_cols=None, prefetch=True + self, columns, batch_size, shuffle, drop_remainder=None, collate_fn=None, collate_fn_args=None, label_cols=None, prefetch=True ): import tensorflow as tf + if collate_fn_args is None: + collate_fn_args = dict() + if label_cols is None: label_cols = [] elif isinstance(label_cols, str): @@ -239,21 +242,23 @@ def to_tf_dataset( num_batches = ceil(len(dataset) / batch_size) # Division rounding up def tf_generator(): - # Note that the 'tensorflow' return format uses ragged tensors, which are VERY unperformant - # right now (TF 2.5). This may or may not change in the future, but for now we stick to 'numpy'. 
if shuffle: epoch_dataset = dataset.shuffle(load_from_cache_file=False) else: epoch_dataset = dataset if collate_fn is None: - epoch_dataset.set_format("numpy") # Automatic padding + epoch_dataset.set_format("tensorflow") # Will return ragged tensors else: epoch_dataset.set_format("python") # List of possibly variable lists for i in range(num_batches): batch = epoch_dataset[i * batch_size : (i + 1) * batch_size] if collate_fn is not None: - batch = collate_fn(batch) - batch = {key: np.array(val) for key, val in batch.items()} + batch = collate_fn(batch, **collate_fn_args) + # In case the collate_fn returns something strange + batch = {key: tf.convert_to_tensor(val) for key, val in batch.items()} + else: + batch = {key: tensor.to_tensor() if isinstance(tensor, tf.RaggedTensor) else tensor + for key, tensor in batch.items()} yield batch tf_dataset = tf.data.Dataset.from_generator(tf_generator, output_signature=gen_signature) diff --git a/src/datasets/formatting/tf_formatter.py b/src/datasets/formatting/tf_formatter.py index 4da428f382a..9a1529aa43c 100644 --- a/src/datasets/formatting/tf_formatter.py +++ b/src/datasets/formatting/tf_formatter.py @@ -35,13 +35,28 @@ def __init__(self, **tf_tensor_kwargs): def _tensorize(self, value): import tensorflow as tf - default_dtype = {} if np.issubdtype(value.dtype, np.integer): - default_dtype = {"dtype": tf.int64} + np_dtype = np.int64 + tf_dtype = tf.int64 + default_dtype = {"dtype": tf_dtype} elif np.issubdtype(value.dtype, np.floating): - default_dtype = {"dtype": tf.float32} + np_dtype = np.float32 + tf_dtype = tf.float32 + default_dtype = {"dtype": tf_dtype} + else: + np_dtype = None + tf_dtype = None + default_dtype = {} + + # Saving the most expensive methods for last + try: + return tf.convert_to_tensor(value, dtype=tf_dtype) + except ValueError: + try: + return tf.ragged.stack([np.array(subarr, dtype=np_dtype) for subarr in value]) + except ValueError: + return tf.ragged.constant(value, **{**default_dtype, **self.tf_tensor_kwargs}) - return tf.ragged.constant(value, **{**default_dtype, **self.tf_tensor_kwargs}) def _recursive_tensorize(self, data_struct: dict): # support for nested types like struct of list of struct From fa062065f38eeb4e24036f2003077c5eaa822d52 Mon Sep 17 00:00:00 2001 From: matt Date: Mon, 16 Aug 2021 16:45:00 +0100 Subject: [PATCH 15/45] Style pass --- src/datasets/arrow_dataset.py | 16 +++++++++++++--- src/datasets/formatting/tf_formatter.py | 1 - 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 15a51e79fbd..01116239274 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -202,7 +202,15 @@ def _get_output_signature(dataset, batch_size): return signatures def to_tf_dataset( - self, columns, batch_size, shuffle, drop_remainder=None, collate_fn=None, collate_fn_args=None, label_cols=None, prefetch=True + self, + columns, + batch_size, + shuffle, + drop_remainder=None, + collate_fn=None, + collate_fn_args=None, + label_cols=None, + prefetch=True, ): import tensorflow as tf @@ -257,8 +265,10 @@ def tf_generator(): # In case the collate_fn returns something strange batch = {key: tf.convert_to_tensor(val) for key, val in batch.items()} else: - batch = {key: tensor.to_tensor() if isinstance(tensor, tf.RaggedTensor) else tensor - for key, tensor in batch.items()} + batch = { + key: tensor.to_tensor() if isinstance(tensor, tf.RaggedTensor) else tensor + for key, tensor in batch.items() + } yield batch tf_dataset = 
tf.data.Dataset.from_generator(tf_generator, output_signature=gen_signature) diff --git a/src/datasets/formatting/tf_formatter.py b/src/datasets/formatting/tf_formatter.py index 9a1529aa43c..38cc1455f67 100644 --- a/src/datasets/formatting/tf_formatter.py +++ b/src/datasets/formatting/tf_formatter.py @@ -57,7 +57,6 @@ def _tensorize(self, value): except ValueError: return tf.ragged.constant(value, **{**default_dtype, **self.tf_tensor_kwargs}) - def _recursive_tensorize(self, data_struct: dict): # support for nested types like struct of list of struct if isinstance(data_struct, (list, np.ndarray)): From 29415cd58f64bebaf9a98d2b3ba42dfa986196f7 Mon Sep 17 00:00:00 2001 From: matt Date: Thu, 26 Aug 2021 13:22:30 +0100 Subject: [PATCH 16/45] Adding a comment to explain myself in tf_formatter.py --- src/datasets/formatting/tf_formatter.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/datasets/formatting/tf_formatter.py b/src/datasets/formatting/tf_formatter.py index 38cc1455f67..93a2b131d46 100644 --- a/src/datasets/formatting/tf_formatter.py +++ b/src/datasets/formatting/tf_formatter.py @@ -55,6 +55,7 @@ def _tensorize(self, value): try: return tf.ragged.stack([np.array(subarr, dtype=np_dtype) for subarr in value]) except ValueError: + # tf.ragged.constant is orders of magnitude slower than tf.ragged.stack return tf.ragged.constant(value, **{**default_dtype, **self.tf_tensor_kwargs}) def _recursive_tensorize(self, data_struct: dict): From ca93c34b85043f15367b62a022805fc4510d0e35 Mon Sep 17 00:00:00 2001 From: matt Date: Thu, 26 Aug 2021 17:09:22 +0100 Subject: [PATCH 17/45] Fixes for shuffling and the case where the collator adds new columns --- src/datasets/arrow_dataset.py | 54 ++++++++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 14 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 01116239274..9cb6bf31e5e 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -38,6 +38,7 @@ import pyarrow.compute as pc from multiprocess import Pool, RLock from tqdm.auto import tqdm +from random import randint from datasets.tasks.text_classification import TextClassification @@ -164,7 +165,7 @@ def __init__(self): pass @staticmethod - def _get_output_signature(dataset, batch_size): + def _get_output_signature(dataset, test_batch, batch_size): import tensorflow as tf signatures = dict() @@ -174,7 +175,7 @@ def _get_output_signature(dataset, batch_size): else: dtype_str = col_feature.dtype if dtype_str.startswith("int") or dtype_str.startswith("uint"): - dtype = tf.int32 + dtype = tf.int64 elif dtype_str.startswith("float"): dtype = tf.float32 else: @@ -199,6 +200,18 @@ def _get_output_signature(dataset, batch_size): shape = [dim if dim != -1 else None for dim in shape] signatures[column] = tf.TensorSpec(shape=shape, dtype=dtype) + + # Catching columns added by the collate_fn, such as MLM labels + for column, tensor in test_batch.items(): + if column in signatures: + continue + if column.startswith('label') and 'input_ids' in signatures: + shape = signatures['input_ids'].shape + else: + # If this doesn't look like LM labels that got added by the collate_fn, let's not say anything + # about the dimensions we're unsure of + shape = [batch_size] + [None for dim in tensor.shape.as_list()[1:]] + signatures[column] = tf.TensorSpec(shape=shape, dtype=tensor.dtype) return signatures def to_tf_dataset( @@ -241,17 +254,13 @@ def to_tf_dataset( drop_remainder = shuffle dataset = self.remove_columns([col for col in 
self.features if col not in cols_to_retain]) if drop_remainder: - gen_signature = self._get_output_signature(dataset, batch_size=batch_size) num_batches = floor(len(dataset) / batch_size) # Division rounding down ( // still returns a float!) else: - gen_signature = self._get_output_signature( - dataset, batch_size=None - ) # Because batches can be variable here num_batches = ceil(len(dataset) / batch_size) # Division rounding up def tf_generator(): if shuffle: - epoch_dataset = dataset.shuffle(load_from_cache_file=False) + epoch_dataset = dataset.shuffle(load_from_cache_file=False, seed=randint(0, 2**32 - 1)) else: epoch_dataset = dataset if collate_fn is None: @@ -261,15 +270,32 @@ def tf_generator(): for i in range(num_batches): batch = epoch_dataset[i * batch_size : (i + 1) * batch_size] if collate_fn is not None: + actual_size = len(list(batch.values())[0]) # Get the length of one of the arrays, assume all same + # Our collators expect a list of dicts, not a dict of lists/arrays, so we invert + batch = [{key: value[i] for key, value in batch.items()} for i in range(actual_size)] batch = collate_fn(batch, **collate_fn_args) - # In case the collate_fn returns something strange - batch = {key: tf.convert_to_tensor(val) for key, val in batch.items()} + for key in list(batch.keys()): + # In case the collate_fn returns something strange + tensor = tf.convert_to_tensor(batch[key]) + cast_dtype = tf.int64 if tensor.dtype.is_integer else tf.float32 + if tensor.dtype != cast_dtype: + tensor = tf.cast(tensor, cast_dtype) + batch[key] = tensor else: - batch = { - key: tensor.to_tensor() if isinstance(tensor, tf.RaggedTensor) else tensor - for key, tensor in batch.items() - } - yield batch + for key in list(batch.keys()): + tensor = batch[key] + if isinstance(tensor, tf.RaggedTensor): + tensor = tensor.to_tensor() + cast_dtype = tf.int64 if tensor.dtype.is_integer else tf.float32 + if tensor.dtype != cast_dtype: + tensor = tf.cast(tensor, cast_dtype) + batch[key] = tensor + yield dict(batch) + + test_batch = next(tf_generator()) + + gen_signature = self._get_output_signature(dataset, test_batch=test_batch, + batch_size=batch_size if drop_remainder else None) tf_dataset = tf.data.Dataset.from_generator(tf_generator, output_signature=gen_signature) From d78cd5079dbe1eb4ccc4e43ef516c2ac0e903551 Mon Sep 17 00:00:00 2001 From: matt Date: Thu, 26 Aug 2021 17:12:07 +0100 Subject: [PATCH 18/45] Style pass --- src/datasets/arrow_dataset.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 9cb6bf31e5e..fbd600ec5f7 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -29,6 +29,7 @@ from functools import partial, wraps from math import ceil, floor from pathlib import Path +from random import randint from typing import TYPE_CHECKING, Any, BinaryIO, Callable, Dict, Iterator, List, Optional, Tuple, Union import fsspec @@ -38,7 +39,6 @@ import pyarrow.compute as pc from multiprocess import Pool, RLock from tqdm.auto import tqdm -from random import randint from datasets.tasks.text_classification import TextClassification @@ -205,8 +205,8 @@ def _get_output_signature(dataset, test_batch, batch_size): for column, tensor in test_batch.items(): if column in signatures: continue - if column.startswith('label') and 'input_ids' in signatures: - shape = signatures['input_ids'].shape + if column.startswith("label") and "input_ids" in signatures: + shape = signatures["input_ids"].shape else: # If 
this doesn't look like LM labels that got added by the collate_fn, let's not say anything # about the dimensions we're unsure of @@ -260,7 +260,7 @@ def to_tf_dataset( def tf_generator(): if shuffle: - epoch_dataset = dataset.shuffle(load_from_cache_file=False, seed=randint(0, 2**32 - 1)) + epoch_dataset = dataset.shuffle(load_from_cache_file=False, seed=randint(0, 2 ** 32 - 1)) else: epoch_dataset = dataset if collate_fn is None: @@ -294,8 +294,9 @@ def tf_generator(): test_batch = next(tf_generator()) - gen_signature = self._get_output_signature(dataset, test_batch=test_batch, - batch_size=batch_size if drop_remainder else None) + gen_signature = self._get_output_signature( + dataset, test_batch=test_batch, batch_size=batch_size if drop_remainder else None + ) tf_dataset = tf.data.Dataset.from_generator(tf_generator, output_signature=gen_signature) From 0bf0050c50aebec5f8f24defe7afbe28b3586d36 Mon Sep 17 00:00:00 2001 From: matt Date: Thu, 26 Aug 2021 17:58:50 +0100 Subject: [PATCH 19/45] Ensuring we respect TF dtype args --- src/datasets/formatting/tf_formatter.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/src/datasets/formatting/tf_formatter.py b/src/datasets/formatting/tf_formatter.py index 93a2b131d46..d000035647f 100644 --- a/src/datasets/formatting/tf_formatter.py +++ b/src/datasets/formatting/tf_formatter.py @@ -35,17 +35,22 @@ def __init__(self, **tf_tensor_kwargs): def _tensorize(self, value): import tensorflow as tf - if np.issubdtype(value.dtype, np.integer): - np_dtype = np.int64 - tf_dtype = tf.int64 - default_dtype = {"dtype": tf_dtype} - elif np.issubdtype(value.dtype, np.floating): - np_dtype = np.float32 - tf_dtype = tf.float32 - default_dtype = {"dtype": tf_dtype} + if 'dtype' not in self.tf_tensor_kwargs: + if np.issubdtype(value.dtype, np.integer): + np_dtype = np.int64 + tf_dtype = tf.int64 + default_dtype = {"dtype": tf_dtype} + elif np.issubdtype(value.dtype, np.floating): + np_dtype = np.float32 + tf_dtype = tf.float32 + default_dtype = {"dtype": tf_dtype} + else: + np_dtype = None + tf_dtype = None + default_dtype = {} else: - np_dtype = None - tf_dtype = None + tf_dtype = self.tf_tensor_kwargs['dtype'] + np_dtype = tf_dtype.as_numpy_dtype default_dtype = {} # Saving the most expensive methods for last From 6c91fc7ef7f1b95c594565644521e3445f70f3b7 Mon Sep 17 00:00:00 2001 From: matt Date: Thu, 26 Aug 2021 17:59:05 +0100 Subject: [PATCH 20/45] Style pass --- src/datasets/formatting/tf_formatter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/datasets/formatting/tf_formatter.py b/src/datasets/formatting/tf_formatter.py index d000035647f..a54d69a928e 100644 --- a/src/datasets/formatting/tf_formatter.py +++ b/src/datasets/formatting/tf_formatter.py @@ -35,7 +35,7 @@ def __init__(self, **tf_tensor_kwargs): def _tensorize(self, value): import tensorflow as tf - if 'dtype' not in self.tf_tensor_kwargs: + if "dtype" not in self.tf_tensor_kwargs: if np.issubdtype(value.dtype, np.integer): np_dtype = np.int64 tf_dtype = tf.int64 @@ -49,7 +49,7 @@ def _tensorize(self, value): tf_dtype = None default_dtype = {} else: - tf_dtype = self.tf_tensor_kwargs['dtype'] + tf_dtype = self.tf_tensor_kwargs["dtype"] np_dtype = tf_dtype.as_numpy_dtype default_dtype = {} From 195486239824b9651973a1a685d237183787f3f0 Mon Sep 17 00:00:00 2001 From: matt Date: Tue, 31 Aug 2021 14:19:39 +0100 Subject: [PATCH 21/45] Updating tests --- tests/test_formatting.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 
deletions(-) diff --git a/tests/test_formatting.py b/tests/test_formatting.py index cca0cb127bc..3c50cbd98cf 100644 --- a/tests/test_formatting.py +++ b/tests/test_formatting.py @@ -164,20 +164,20 @@ def test_tf_formatter(self): pa_table = self._create_dummy_table() formatter = TFFormatter() row = formatter.format_row(pa_table) - tf.debugging.assert_equal(row["a"], tf.ragged.constant(_COL_A, dtype=tf.int64)[0]) - tf.debugging.assert_equal(row["b"], tf.ragged.constant(_COL_B, dtype=tf.string)[0]) - tf.debugging.assert_equal(row["c"], tf.ragged.constant(_COL_C, dtype=tf.float32)[0]) + tf.debugging.assert_equal(row["a"], tf.convert_to_tensor(_COL_A, dtype=tf.int64)[0]) + tf.debugging.assert_equal(row["b"], tf.convert_to_tensor(_COL_B, dtype=tf.string)[0]) + tf.debugging.assert_equal(row["c"], tf.convert_to_tensor(_COL_C, dtype=tf.float32)[0]) col = formatter.format_column(pa_table) tf.debugging.assert_equal(col, tf.ragged.constant(_COL_A, dtype=tf.int64)) batch = formatter.format_batch(pa_table) - tf.debugging.assert_equal(batch["a"], tf.ragged.constant(_COL_A, dtype=tf.int64)) - tf.debugging.assert_equal(batch["b"], tf.ragged.constant(_COL_B, dtype=tf.string)) - self.assertIsInstance(batch["c"], tf.RaggedTensor) + tf.debugging.assert_equal(batch["a"], tf.convert_to_tensor(_COL_A, dtype=tf.int64)) + tf.debugging.assert_equal(batch["b"], tf.convert_to_tensor(_COL_B, dtype=tf.string)) + self.assertIsInstance(batch["c"], tf.Tensor) self.assertEqual(batch["c"].dtype, tf.float32) tf.debugging.assert_equal( - batch["c"].bounding_shape(), tf.ragged.constant(_COL_C, dtype=tf.float32).bounding_shape() + batch["c"].shape.as_list(), tf.convert_to_tensor(_COL_C, dtype=tf.float32).shape.as_list() ) - tf.debugging.assert_equal(batch["c"].flat_values, tf.ragged.constant(_COL_C, dtype=tf.float32).flat_values) + tf.debugging.assert_equal(tf.convert_to_tensor(batch["c"]), tf.convert_to_tensor(_COL_C, dtype=tf.float32)) @require_tf def test_tf_formatter_np_array_kwargs(self): From 7f2a8f10db715de42137e92f2257c6a415d2b495 Mon Sep 17 00:00:00 2001 From: matt Date: Tue, 31 Aug 2021 14:31:37 +0100 Subject: [PATCH 22/45] Updating tests --- tests/test_arrow_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py index 5eda8c85bec..58eed9b638d 100644 --- a/tests/test_arrow_dataset.py +++ b/tests/test_arrow_dataset.py @@ -1816,8 +1816,8 @@ def test_format_vectors(self, in_memory): self.assertIsInstance(dset[0][col], (tf.Tensor, tf.RaggedTensor)) self.assertIsInstance(dset[:2][col], (tf.Tensor, tf.RaggedTensor)) self.assertIsInstance(dset[col], (tf.Tensor, tf.RaggedTensor)) - self.assertEqual(tuple(dset[:2]["vec"].shape), (2, None)) - self.assertEqual(tuple(dset["vec"][:2].shape), (2, None)) + self.assertEqual(tuple(dset[:2]["vec"].shape), (2, 3)) + self.assertEqual(tuple(dset["vec"][:2].shape), (2, 3)) dset.set_format("numpy") self.assertIsNotNone(dset[0]) From 6eef188b10acd7b692d55485d0aabb264773389e Mon Sep 17 00:00:00 2001 From: matt Date: Thu, 2 Sep 2021 14:54:56 +0100 Subject: [PATCH 23/45] Fixing things so they work in TF2.6 --- src/datasets/arrow_dataset.py | 146 ++++++++++++---------------------- 1 file changed, 51 insertions(+), 95 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index fbd600ec5f7..c7956931dc8 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -164,56 +164,6 @@ class TensorflowDatasetMixIn: def __init__(self): pass - @staticmethod - def 
_get_output_signature(dataset, test_batch, batch_size): - import tensorflow as tf - - signatures = dict() - for column, col_feature in dataset.features.items(): - if hasattr(col_feature, "feature"): - dtype_str = col_feature.feature.dtype - else: - dtype_str = col_feature.dtype - if dtype_str.startswith("int") or dtype_str.startswith("uint"): - dtype = tf.int64 - elif dtype_str.startswith("float"): - dtype = tf.float32 - else: - raise ValueError(f"Could not convert datatype {dtype_str} in column {column}!") - - if isinstance(col_feature, (Value, ClassLabel)): - shape = [batch_size] - elif isinstance(col_feature, _ArrayXD): - shape = [batch_size] + list(col_feature.shape) - elif isinstance(col_feature, Sequence): - shape = [batch_size, col_feature.length] - else: - raise ValueError( - f"Couldn't parse feature {column} with type {type(col_feature)}! " - "This may indicate a column was included with an unusual datatype " - "that we were unable to process correctly. " - "If you're getting this error with one of our datasets, and you're " - "sure the column should be convertable to tf.Tensor, please " - "file an issue at github.com/huggingface/datasets and tag " - "@rocketknight1!" - ) - shape = [dim if dim != -1 else None for dim in shape] - - signatures[column] = tf.TensorSpec(shape=shape, dtype=dtype) - - # Catching columns added by the collate_fn, such as MLM labels - for column, tensor in test_batch.items(): - if column in signatures: - continue - if column.startswith("label") and "input_ids" in signatures: - shape = signatures["input_ids"].shape - else: - # If this doesn't look like LM labels that got added by the collate_fn, let's not say anything - # about the dimensions we're unsure of - shape = [batch_size] + [None for dim in tensor.shape.as_list()[1:]] - signatures[column] = tf.TensorSpec(shape=shape, dtype=tensor.dtype) - return signatures - def to_tf_dataset( self, columns, @@ -253,52 +203,59 @@ def to_tf_dataset( # We assume that if you're shuffling it's the train set, so we drop the remainder unless told not to drop_remainder = shuffle dataset = self.remove_columns([col for col in self.features if col not in cols_to_retain]) - if drop_remainder: - num_batches = floor(len(dataset) / batch_size) # Division rounding down ( // still returns a float!) 
- else: - num_batches = ceil(len(dataset) / batch_size) # Division rounding up - - def tf_generator(): - if shuffle: - epoch_dataset = dataset.shuffle(load_from_cache_file=False, seed=randint(0, 2 ** 32 - 1)) - else: - epoch_dataset = dataset - if collate_fn is None: - epoch_dataset.set_format("tensorflow") # Will return ragged tensors + self.set_format("numpy") + + def numpy_pad(data): + # Get lengths of each row of data + lens = np.array([len(i) for i in data]) + + # Mask of valid places in each row + mask = np.arange(lens.max()) < lens[:, None] + + # Setup output array and put elements from data into masked positions + out = np.zeros(mask.shape, dtype=data.dtype) + out[mask] = np.concatenate(data) + return out + + def np_get_batch(indices): + batch = self[indices] + out_batch = [] + if collate_fn is not None: + actual_size = len(list(batch.values())[0]) # Get the length of one of the arrays, assume all same + # Our collators expect a list of dicts, not a dict of lists/arrays, so we invert + batch = [{key: value[i] for key, value in batch.items()} for i in range(actual_size)] + batch = collate_fn(batch, **collate_fn_args) + for key in cols_to_retain: + # In case the collate_fn returns something strange + array = np.array(batch[key]) + cast_dtype = np.int64 if np.issubdtype(array.dtype, np.integer) else np.float32 + array = array.astype(cast_dtype) + out_batch.append(array) else: - epoch_dataset.set_format("python") # List of possibly variable lists - for i in range(num_batches): - batch = epoch_dataset[i * batch_size : (i + 1) * batch_size] - if collate_fn is not None: - actual_size = len(list(batch.values())[0]) # Get the length of one of the arrays, assume all same - # Our collators expect a list of dicts, not a dict of lists/arrays, so we invert - batch = [{key: value[i] for key, value in batch.items()} for i in range(actual_size)] - batch = collate_fn(batch, **collate_fn_args) - for key in list(batch.keys()): - # In case the collate_fn returns something strange - tensor = tf.convert_to_tensor(batch[key]) - cast_dtype = tf.int64 if tensor.dtype.is_integer else tf.float32 - if tensor.dtype != cast_dtype: - tensor = tf.cast(tensor, cast_dtype) - batch[key] = tensor - else: - for key in list(batch.keys()): - tensor = batch[key] - if isinstance(tensor, tf.RaggedTensor): - tensor = tensor.to_tensor() - cast_dtype = tf.int64 if tensor.dtype.is_integer else tf.float32 - if tensor.dtype != cast_dtype: - tensor = tf.cast(tensor, cast_dtype) - batch[key] = tensor - yield dict(batch) - - test_batch = next(tf_generator()) - - gen_signature = self._get_output_signature( - dataset, test_batch=test_batch, batch_size=batch_size if drop_remainder else None - ) + for key in cols_to_retain: + array = batch[key] + if array.dtype == np.object: + array = numpy_pad(array) + cast_dtype = np.int64 if np.issubdtype(array.dtype, np.integer) else np.float32 + array = array.astype(cast_dtype) + out_batch.append(array) + return [tf.convert_to_tensor(arr) for arr in out_batch] + + test_batch = np_get_batch(np.arange(batch_size)) + + @tf.function(input_signature=[tf.TensorSpec(None, tf.int64)]) + def fetch_function(indices): + output = tf.numpy_function( + np_get_batch, inp=[indices], Tout=[tf.dtypes.as_dtype(arr.dtype) for arr in test_batch] + ) + return {key: output[i] for i, key in enumerate(cols_to_retain)} - tf_dataset = tf.data.Dataset.from_generator(tf_generator, output_signature=gen_signature) + tf_dataset = ( + tf.data.Dataset.from_tensor_slices(np.arange(len(dataset))) + .shuffle(len(dataset)) + 
.batch(batch_size, drop_remainder=drop_remainder) + .map(fetch_function) + ) if label_cols: @@ -316,7 +273,6 @@ def split_features_and_labels(input_batch): elif len(columns) == 1: tf_dataset = tf_dataset.map(lambda x: list(x.values())[0]) - tf_dataset = tf_dataset.apply(tf.data.experimental.assert_cardinality(num_batches)) if prefetch: tf_dataset = tf_dataset.prefetch(tf.data.AUTOTUNE) return tf_dataset From a63dfb949bff50d8e57ffab614032ebc508c5ba0 Mon Sep 17 00:00:00 2001 From: matt Date: Thu, 2 Sep 2021 14:56:43 +0100 Subject: [PATCH 24/45] Style pass --- src/datasets/arrow_dataset.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index c7956931dc8..3c7bb6da491 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -29,7 +29,6 @@ from functools import partial, wraps from math import ceil, floor from pathlib import Path -from random import randint from typing import TYPE_CHECKING, Any, BinaryIO, Callable, Dict, Iterator, List, Optional, Tuple, Union import fsspec @@ -45,7 +44,7 @@ from . import config, utils from .arrow_reader import ArrowReader from .arrow_writer import ArrowWriter, OptimizedTypedSequence -from .features import ClassLabel, Features, Sequence, Value, _ArrayXD, cast_to_python_objects +from .features import ClassLabel, Features, Value, cast_to_python_objects from .filesystems import extract_path_from_uri, is_remote_filesystem from .fingerprint import ( fingerprint_transform, From d7048a43a416c1e6c0034b61caaf4e05f1c57372 Mon Sep 17 00:00:00 2001 From: matt Date: Thu, 2 Sep 2021 17:28:20 +0100 Subject: [PATCH 25/45] Correctly set output shapes - fixes a whole lot of issues --- src/datasets/arrow_dataset.py | 62 ++++++++++++++++++++++++++++++++++- 1 file changed, 61 insertions(+), 1 deletion(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 3c7bb6da491..25c5d3db2a9 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -44,7 +44,7 @@ from . import config, utils from .arrow_reader import ArrowReader from .arrow_writer import ArrowWriter, OptimizedTypedSequence -from .features import ClassLabel, Features, Value, cast_to_python_objects +from .features import ClassLabel, Features, Value, cast_to_python_objects, _ArrayXD, Sequence from .filesystems import extract_path_from_uri, is_remote_filesystem from .fingerprint import ( fingerprint_transform, @@ -163,6 +163,56 @@ class TensorflowDatasetMixIn: def __init__(self): pass + @staticmethod + def _get_output_signature(dataset, test_batch, batch_size): + import tensorflow as tf + + signatures = dict() + for column, col_feature in dataset.features.items(): + if hasattr(col_feature, "feature"): + dtype_str = col_feature.feature.dtype + else: + dtype_str = col_feature.dtype + if dtype_str.startswith("int") or dtype_str.startswith("uint"): + dtype = tf.int64 + elif dtype_str.startswith("float"): + dtype = tf.float32 + else: + raise ValueError(f"Could not convert datatype {dtype_str} in column {column}!") + + if isinstance(col_feature, (Value, ClassLabel)): + shape = [batch_size] + elif isinstance(col_feature, _ArrayXD): + shape = [batch_size] + list(col_feature.shape) + elif isinstance(col_feature, Sequence): + shape = [batch_size, col_feature.length] + else: + raise ValueError( + f"Couldn't parse feature {column} with type {type(col_feature)}! " + "This may indicate a column was included with an unusual datatype " + "that we were unable to process correctly. 
" + "If you're getting this error with one of our datasets, and you're " + "sure the column should be convertable to tf.Tensor, please " + "file an issue at github.com/huggingface/datasets and tag " + "@rocketknight1!" + ) + shape = [dim if dim != -1 else None for dim in shape] + + signatures[column] = tf.TensorSpec(shape=shape, dtype=dtype) + + # Catching columns added by the collate_fn, such as MLM labels + for column, tensor in test_batch.items(): + if column in signatures: + continue + if column.startswith("label") and "input_ids" in signatures: + shape = signatures["input_ids"].shape + else: + # If this doesn't look like LM labels that got added by the collate_fn, let's not say anything + # about the dimensions we're unsure of + shape = [batch_size] + [None for dim in tensor.shape.as_list()[1:]] + signatures[column] = tf.TensorSpec(shape=shape, dtype=tensor.dtype) + return signatures + def to_tf_dataset( self, columns, @@ -242,6 +292,7 @@ def np_get_batch(indices): test_batch = np_get_batch(np.arange(batch_size)) + @tf.function(input_signature=[tf.TensorSpec(None, tf.int64)]) def fetch_function(indices): output = tf.numpy_function( @@ -249,11 +300,20 @@ def fetch_function(indices): ) return {key: output[i] for i, key in enumerate(cols_to_retain)} + test_batch_dict = {key: test_batch[i] for i, key in enumerate(cols_to_retain)} + output_signature = self._get_output_signature(dataset, test_batch_dict, + batch_size=batch_size if drop_remainder else None) + + def ensure_shapes(input_dict): + return {key: tf.ensure_shape(val, output_signature[key].shape) + for key, val in input_dict.items()} + tf_dataset = ( tf.data.Dataset.from_tensor_slices(np.arange(len(dataset))) .shuffle(len(dataset)) .batch(batch_size, drop_remainder=drop_remainder) .map(fetch_function) + .map(ensure_shapes) ) if label_cols: From 56ea08fac7465f83b66c81fe19701e024208660a Mon Sep 17 00:00:00 2001 From: matt Date: Tue, 7 Sep 2021 12:38:46 +0100 Subject: [PATCH 26/45] Fix an embarrassing regression bug --- src/datasets/arrow_dataset.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 25c5d3db2a9..c4ee8078b99 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -252,11 +252,14 @@ def to_tf_dataset( # We assume that if you're shuffling it's the train set, so we drop the remainder unless told not to drop_remainder = shuffle dataset = self.remove_columns([col for col in self.features if col not in cols_to_retain]) - self.set_format("numpy") + dataset.set_format("python") def numpy_pad(data): # Get lengths of each row of data lens = np.array([len(i) for i in data]) + if np.all(lens == lens[0]): + # All data has the same length, no padding required + return np.array(data) # Mask of valid places in each row mask = np.arange(lens.max()) < lens[:, None] @@ -267,7 +270,7 @@ def numpy_pad(data): return out def np_get_batch(indices): - batch = self[indices] + batch = dataset[indices] out_batch = [] if collate_fn is not None: actual_size = len(list(batch.values())[0]) # Get the length of one of the arrays, assume all same From 2ddf7c6906abd1e496ada3c65534eea096da3a9a Mon Sep 17 00:00:00 2001 From: matt Date: Tue, 7 Sep 2021 12:41:28 +0100 Subject: [PATCH 27/45] Style pass --- src/datasets/arrow_dataset.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index c4ee8078b99..052ab255800 100644 --- a/src/datasets/arrow_dataset.py 
+++ b/src/datasets/arrow_dataset.py @@ -44,7 +44,7 @@ from . import config, utils from .arrow_reader import ArrowReader from .arrow_writer import ArrowWriter, OptimizedTypedSequence -from .features import ClassLabel, Features, Value, cast_to_python_objects, _ArrayXD, Sequence +from .features import ClassLabel, Features, Sequence, Value, _ArrayXD, cast_to_python_objects from .filesystems import extract_path_from_uri, is_remote_filesystem from .fingerprint import ( fingerprint_transform, @@ -295,7 +295,6 @@ def np_get_batch(indices): test_batch = np_get_batch(np.arange(batch_size)) - @tf.function(input_signature=[tf.TensorSpec(None, tf.int64)]) def fetch_function(indices): output = tf.numpy_function( @@ -304,12 +303,12 @@ def fetch_function(indices): return {key: output[i] for i, key in enumerate(cols_to_retain)} test_batch_dict = {key: test_batch[i] for i, key in enumerate(cols_to_retain)} - output_signature = self._get_output_signature(dataset, test_batch_dict, - batch_size=batch_size if drop_remainder else None) + output_signature = self._get_output_signature( + dataset, test_batch_dict, batch_size=batch_size if drop_remainder else None + ) def ensure_shapes(input_dict): - return {key: tf.ensure_shape(val, output_signature[key].shape) - for key, val in input_dict.items()} + return {key: tf.ensure_shape(val, output_signature[key].shape) for key, val in input_dict.items()} tf_dataset = ( tf.data.Dataset.from_tensor_slices(np.arange(len(dataset))) From ddfda69273f882f5b7abf09d7797f75341da7cdb Mon Sep 17 00:00:00 2001 From: matt Date: Wed, 8 Sep 2021 13:06:57 +0100 Subject: [PATCH 28/45] Added `config.TF_AVAILABLE` checks and dict literals --- src/datasets/arrow_dataset.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 052ab255800..2f63327b177 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -165,9 +165,12 @@ def __init__(self): @staticmethod def _get_output_signature(dataset, test_batch, batch_size): - import tensorflow as tf + if config.TF_AVAILABLE: + import tensorflow as tf + else: + raise ImportError("Called a Tensorflow-specific function but could not import it!") - signatures = dict() + signatures = {} for column, col_feature in dataset.features.items(): if hasattr(col_feature, "feature"): dtype_str = col_feature.feature.dtype @@ -224,10 +227,13 @@ def to_tf_dataset( label_cols=None, prefetch=True, ): - import tensorflow as tf + if config.TF_AVAILABLE: + import tensorflow as tf + else: + raise ImportError("Called a Tensorflow-specific function but could not import it!") if collate_fn_args is None: - collate_fn_args = dict() + collate_fn_args = {} if label_cols is None: label_cols = [] @@ -265,7 +271,7 @@ def numpy_pad(data): mask = np.arange(lens.max()) < lens[:, None] # Setup output array and put elements from data into masked positions - out = np.zeros(mask.shape, dtype=data.dtype) + out = np.zeros(mask.shape, dtype=np.array(data[0]).dtype) out[mask] = np.concatenate(data) return out @@ -286,8 +292,7 @@ def np_get_batch(indices): else: for key in cols_to_retain: array = batch[key] - if array.dtype == np.object: - array = numpy_pad(array) + array = numpy_pad(array) cast_dtype = np.int64 if np.issubdtype(array.dtype, np.integer) else np.float32 array = array.astype(cast_dtype) out_batch.append(array) From c87d47ed6b8b1957a82055898c05fe90e4100504 Mon Sep 17 00:00:00 2001 From: matt Date: Thu, 9 Sep 2021 16:58:36 +0100 Subject: [PATCH 29/45] 
Handling for special cases around label/labels and very nested dtypes --- src/datasets/arrow_dataset.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 2f63327b177..690836b4010 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -172,10 +172,10 @@ def _get_output_signature(dataset, test_batch, batch_size): signatures = {} for column, col_feature in dataset.features.items(): - if hasattr(col_feature, "feature"): - dtype_str = col_feature.feature.dtype - else: - dtype_str = col_feature.dtype + dtype_feature = col_feature + while hasattr(dtype_feature, "feature"): # Descend this godforsaken nested rabbit hole as long as it takes + dtype_feature = dtype_feature.feature + dtype_str = dtype_feature.dtype if dtype_str.startswith("int") or dtype_str.startswith("uint"): dtype = tf.int64 elif dtype_str.startswith("float"): @@ -251,6 +251,9 @@ def to_tf_dataset( cols_to_retain = list(set(columns + label_cols)) else: cols_to_retain = columns + # Special casing when the dataset has 'label' and the model expects 'labels' and the collator fixes it up for us + if "labels" in cols_to_retain and "labels" not in self.features and "label" in self.features: + cols_to_retain[cols_to_retain.index("labels")] = "label" for col in cols_to_retain: if col not in self.features: raise ValueError(f"Couldn't find column {col} in dataset!") @@ -283,6 +286,10 @@ def np_get_batch(indices): # Our collators expect a list of dicts, not a dict of lists/arrays, so we invert batch = [{key: value[i] for key, value in batch.items()} for i in range(actual_size)] batch = collate_fn(batch, **collate_fn_args) + # Special casing when the dataset has 'label' and the model + # expects 'labels' and the collator fixes it up for us + if "label" in cols_to_retain and "label" not in batch and "labels" in batch: + cols_to_retain[cols_to_retain.index("label")] = "labels" for key in cols_to_retain: # In case the collate_fn returns something strange array = np.array(batch[key]) From e7d1ce8f7c3ed83dad06a11fc747938c4f79d15a Mon Sep 17 00:00:00 2001 From: matt Date: Fri, 10 Sep 2021 15:25:16 +0100 Subject: [PATCH 30/45] Fix for accidentally shuffling even when flag was False --- src/datasets/arrow_dataset.py | 56 ++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 24 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 690836b4010..32781f9b2ff 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -183,22 +183,26 @@ def _get_output_signature(dataset, test_batch, batch_size): else: raise ValueError(f"Could not convert datatype {dtype_str} in column {column}!") - if isinstance(col_feature, (Value, ClassLabel)): - shape = [batch_size] - elif isinstance(col_feature, _ArrayXD): - shape = [batch_size] + list(col_feature.shape) - elif isinstance(col_feature, Sequence): - shape = [batch_size, col_feature.length] - else: - raise ValueError( - f"Couldn't parse feature {column} with type {type(col_feature)}! " - "This may indicate a column was included with an unusual datatype " - "that we were unable to process correctly. " - "If you're getting this error with one of our datasets, and you're " - "sure the column should be convertable to tf.Tensor, please " - "file an issue at github.com/huggingface/datasets and tag " - "@rocketknight1!" 
+ shape = [] + shape_feature = col_feature + while not isinstance(shape_feature, (Value, ClassLabel)): + if isinstance(shape_feature, _ArrayXD): + shape.extend(list(shape_feature.shape)) + break + elif isinstance(shape_feature, Sequence): + shape.insert(0, shape_feature.length) + shape_feature = shape_feature.feature + else: + raise ValueError( + f"Couldn't parse feature {column} with type {type(col_feature)}! " + "This may indicate a column was included with an unusual datatype " + "that we were unable to process correctly. " + "If you're getting this error with one of our datasets, and you're " + "sure the column should be convertable to tf.Tensor, please " + "file an issue at github.com/huggingface/datasets and tag " + "@rocketknight1!" ) + shape = [batch_size] + shape shape = [dim if dim != -1 else None for dim in shape] signatures[column] = tf.TensorSpec(shape=shape, dtype=dtype) @@ -207,8 +211,13 @@ def _get_output_signature(dataset, test_batch, batch_size): for column, tensor in test_batch.items(): if column in signatures: continue - if column.startswith("label") and "input_ids" in signatures: - shape = signatures["input_ids"].shape + if column.startswith("label"): + if "input_ids" in signatures and test_batch[column].shape == test_batch['input_ids'].shape: + shape = signatures["input_ids"].shape + else: + # If this doesn't look like LM labels that got added by the collate_fn, let's not say anything + # about the dimensions we're unsure of + shape = [batch_size] + [None for dim in tensor.shape.as_list()[1:]] else: # If this doesn't look like LM labels that got added by the collate_fn, let's not say anything # about the dimensions we're unsure of @@ -322,13 +331,12 @@ def fetch_function(indices): def ensure_shapes(input_dict): return {key: tf.ensure_shape(val, output_signature[key].shape) for key, val in input_dict.items()} - tf_dataset = ( - tf.data.Dataset.from_tensor_slices(np.arange(len(dataset))) - .shuffle(len(dataset)) - .batch(batch_size, drop_remainder=drop_remainder) - .map(fetch_function) - .map(ensure_shapes) - ) + tf_dataset = tf.data.Dataset.from_tensor_slices(np.arange(len(dataset))) + + if shuffle: + tf_dataset = tf_dataset.shuffle(len(dataset)) + + tf_dataset = tf_dataset.batch(batch_size, drop_remainder=drop_remainder).map(fetch_function).map(ensure_shapes) if label_cols: From 48045fb00e806e2ff375a3fb54e1b2a7b03b6a2f Mon Sep 17 00:00:00 2001 From: matt Date: Tue, 14 Sep 2021 11:44:58 +0100 Subject: [PATCH 31/45] Adding dummy labels by default --- src/datasets/arrow_dataset.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 32781f9b2ff..3261697eb1e 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -235,6 +235,7 @@ def to_tf_dataset( collate_fn_args=None, label_cols=None, prefetch=True, + dummy_labels=True ): if config.TF_AVAILABLE: import tensorflow as tf @@ -354,6 +355,16 @@ def split_features_and_labels(input_batch): elif len(columns) == 1: tf_dataset = tf_dataset.map(lambda x: list(x.values())[0]) + if dummy_labels and not label_cols: + print("Warning: No label_cols specified - adding some dummy labels to ensure fit() works correctly. 
If you " + "only want to use this dataset with predict() or custom training loops, you can disable this " + "behaviour by setting dummy_labels to False.") + + def add_dummy_labels(input_batch): + return input_batch, tf.zeros(tf.shape(input_batch[columns[0]])[0]) + + tf_dataset = tf_dataset.map(add_dummy_labels) + if prefetch: tf_dataset = tf_dataset.prefetch(tf.data.AUTOTUNE) return tf_dataset From ec4f7d4b886a6dd76f713bef30bf486639503266 Mon Sep 17 00:00:00 2001 From: matt Date: Wed, 15 Sep 2021 12:46:26 +0100 Subject: [PATCH 32/45] Adding docstrings and type hints --- src/datasets/arrow_dataset.py | 45 +++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 3261697eb1e..e42823237e2 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -227,16 +227,41 @@ def _get_output_signature(dataset, test_batch, batch_size): def to_tf_dataset( self, - columns, - batch_size, - shuffle, - drop_remainder=None, - collate_fn=None, - collate_fn_args=None, - label_cols=None, - prefetch=True, - dummy_labels=True + columns: Union[str, List[str]], + batch_size: int, + shuffle: bool, + drop_remainder: bool = None, + collate_fn: Callable = None, + collate_fn_args: Dict[str, Any] = None, + label_cols: Union[str, List[str]] = None, + dummy_labels: bool = True, + prefetch: bool = True ): + """Create a tf.data.Dataset from the underlying Dataset. This tf.data.Dataset will load and collate batches from + the Dataset, and is suitable for passing to methods like model.fit() or model.predict(). + + Args: + columns (:obj:`List[str]` or :obj:`str`): Dataset column(s) to load in the tf.data.Dataset. In general, + only columns that the model can use as input should be included here. + batch_size (:obj:`int`): Size of batches to load from the dataset. + shuffle(:obj:`bool`): Shuffle the dataset order when loading. Recommended True for training, False for + validation/evaluation. + drop_remainder(:obj:`bool`, default ``None``): Drop the last incomplete batch when loading. If not provided, + defaults to the same setting as shuffle. + collate_fn(:obj:`Callable`): A function or callable object (such as a `DataCollator`) that will collate + lists of samples into a batch. + collate_fn_args (:obj:`Dict`, optional): An optional `dict` of keyword arguments to be passed to the + `collate_fn`. + label_cols (:obj:`List[str]` or :obj:`str`, default ``None``): Dataset column(s) to load as + labels. Note that many models compute loss internally rather than letting Keras do it, in which case it is + not necessary to actually pass the labels here, as long as they're in the input `columns`. + dummy_labels (:obj:`bool`, default ``True``): If no `label_cols` are set, output an array of "dummy" labels + with each batch. This setting ensures that Keras `fit()` or `train_on_batch()` does not get confused + by the missing labels. + prefetch (:obj:`bool`, default ``True``): Whether to run the dataloader in a separate thread and maintain + a small buffer of batches for training. Improves performance by allowing data to be loaded in the + background while the model is training. 
+ """ if config.TF_AVAILABLE: import tensorflow as tf else: @@ -271,7 +296,7 @@ def to_tf_dataset( # We assume that if you're shuffling it's the train set, so we drop the remainder unless told not to drop_remainder = shuffle dataset = self.remove_columns([col for col in self.features if col not in cols_to_retain]) - dataset.set_format("python") + dataset.set_format("numpy") def numpy_pad(data): # Get lengths of each row of data From 88e9f1e1d3d235ac366a8f51e81d8e81ef125eaf Mon Sep 17 00:00:00 2001 From: matt Date: Wed, 15 Sep 2021 12:47:00 +0100 Subject: [PATCH 33/45] Style pass --- src/datasets/arrow_dataset.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index e42823237e2..86323106ba0 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -201,7 +201,7 @@ def _get_output_signature(dataset, test_batch, batch_size): "sure the column should be convertable to tf.Tensor, please " "file an issue at github.com/huggingface/datasets and tag " "@rocketknight1!" - ) + ) shape = [batch_size] + shape shape = [dim if dim != -1 else None for dim in shape] @@ -212,7 +212,7 @@ def _get_output_signature(dataset, test_batch, batch_size): if column in signatures: continue if column.startswith("label"): - if "input_ids" in signatures and test_batch[column].shape == test_batch['input_ids'].shape: + if "input_ids" in signatures and test_batch[column].shape == test_batch["input_ids"].shape: shape = signatures["input_ids"].shape else: # If this doesn't look like LM labels that got added by the collate_fn, let's not say anything @@ -235,7 +235,7 @@ def to_tf_dataset( collate_fn_args: Dict[str, Any] = None, label_cols: Union[str, List[str]] = None, dummy_labels: bool = True, - prefetch: bool = True + prefetch: bool = True, ): """Create a tf.data.Dataset from the underlying Dataset. This tf.data.Dataset will load and collate batches from the Dataset, and is suitable for passing to methods like model.fit() or model.predict(). @@ -381,9 +381,11 @@ def split_features_and_labels(input_batch): tf_dataset = tf_dataset.map(lambda x: list(x.values())[0]) if dummy_labels and not label_cols: - print("Warning: No label_cols specified - adding some dummy labels to ensure fit() works correctly. If you " - "only want to use this dataset with predict() or custom training loops, you can disable this " - "behaviour by setting dummy_labels to False.") + print( + "Warning: No label_cols specified - adding some dummy labels to ensure fit() works correctly. If you " + "only want to use this dataset with predict() or custom training loops, you can disable this " + "behaviour by setting dummy_labels to False." 
+ ) def add_dummy_labels(input_batch): return input_batch, tf.zeros(tf.shape(input_batch[columns[0]])[0]) From a7b45747ddd5068c6da741a7b1321228efe7e1fd Mon Sep 17 00:00:00 2001 From: matt Date: Wed, 15 Sep 2021 13:31:19 +0100 Subject: [PATCH 34/45] Add tests, bugfix to handling scalar columns --- src/datasets/arrow_dataset.py | 7 ++++--- tests/test_arrow_dataset.py | 15 +++++++++++++++ 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 86323106ba0..10a70974578 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -299,11 +299,12 @@ def to_tf_dataset( dataset.set_format("numpy") def numpy_pad(data): + try: + return np.array(data) + except: + pass # Get lengths of each row of data lens = np.array([len(i) for i in data]) - if np.all(lens == lens[0]): - # All data has the same length, no padding required - return np.array(data) # Mask of valid places in each row mask = np.arange(lens.max()) < lens[:, None] diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py index 58eed9b638d..34b3a2f5c54 100644 --- a/tests/test_arrow_dataset.py +++ b/tests/test_arrow_dataset.py @@ -1997,6 +1997,21 @@ def test_with_transform(self, in_memory): self.assertNotEqual(dset.format, dset2.format) self.assertNotEqual(dset._fingerprint, dset2._fingerprint) + @require_tf + def test_tf_dataset_conversion(self, in_memory): + with tempfile.TemporaryDirectory() as tmp_dir: + with self._create_dummy_dataset(in_memory, tmp_dir, array_features=True) as dset: + tf_dataset = dset.to_tf_dataset(columns="col_3", batch_size=4, shuffle=False, dummy_labels=False) + batch = next(iter(tf_dataset)) + self.assertEqual(batch.shape.as_list(), [4, 4]) + self.assertEqual(batch.dtype.name, "int64") + with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset: + tf_dataset = dset.to_tf_dataset(columns="col_1", batch_size=4, shuffle=False, dummy_labels=False) + batch = next(iter(tf_dataset)) + self.assertEqual(batch.shape.as_list(), [4]) + self.assertEqual(batch.dtype.name, "int64") + + class MiscellaneousDatasetTest(TestCase): def test_from_pandas(self): From b35267dc0e6a6658464275c0298991e22720f165 Mon Sep 17 00:00:00 2001 From: matt Date: Wed, 15 Sep 2021 13:31:36 +0100 Subject: [PATCH 35/45] Style pass --- tests/test_arrow_dataset.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py index 34b3a2f5c54..1f07d20e64a 100644 --- a/tests/test_arrow_dataset.py +++ b/tests/test_arrow_dataset.py @@ -2012,7 +2012,6 @@ def test_tf_dataset_conversion(self, in_memory): self.assertEqual(batch.dtype.name, "int64") - class MiscellaneousDatasetTest(TestCase): def test_from_pandas(self): data = {"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]} From 6273d737c8591937f217c2a8eb9d8eac9ea451bf Mon Sep 17 00:00:00 2001 From: matt Date: Wed, 15 Sep 2021 13:39:16 +0100 Subject: [PATCH 36/45] Fix to `numpy_pad` --- src/datasets/arrow_dataset.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 10a70974578..4ef250413c2 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -300,8 +300,13 @@ def to_tf_dataset( def numpy_pad(data): try: - return np.array(data) - except: + # When this is finally fully removed, remove this line + # Alternatively, find a more elegant way to do this whole thing + np.warnings.filterwarnings("error", 
category=np.VisibleDeprecationWarning) + data = np.array(data) + assert data.dtype != np.object + return data + except (np.VisibleDeprecationWarning, AssertionError): pass # Get lengths of each row of data lens = np.array([len(i) for i in data]) From 4ff6d2e81e5e5825b44f56026385e54b1157fcc0 Mon Sep 17 00:00:00 2001 From: matt Date: Wed, 15 Sep 2021 13:41:04 +0100 Subject: [PATCH 37/45] Replace assertion with more robust syntax --- src/datasets/arrow_dataset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 4ef250413c2..7b1e1967572 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -304,7 +304,8 @@ def numpy_pad(data): # Alternatively, find a more elegant way to do this whole thing np.warnings.filterwarnings("error", category=np.VisibleDeprecationWarning) data = np.array(data) - assert data.dtype != np.object + if data.dtype == np.object: + raise AssertionError # Do it this way so that the assert doesn't get optimized out return data except (np.VisibleDeprecationWarning, AssertionError): pass From 589c575df59c9e20f71646002bb59eb8bc51308d Mon Sep 17 00:00:00 2001 From: matt Date: Wed, 15 Sep 2021 14:04:58 +0100 Subject: [PATCH 38/45] Add cleanup deletion of tf_dataset in tests --- tests/test_arrow_dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py index 1f07d20e64a..6c12a9ccd73 100644 --- a/tests/test_arrow_dataset.py +++ b/tests/test_arrow_dataset.py @@ -2010,6 +2010,7 @@ def test_tf_dataset_conversion(self, in_memory): batch = next(iter(tf_dataset)) self.assertEqual(batch.shape.as_list(), [4]) self.assertEqual(batch.dtype.name, "int64") + del tf_dataset # For correct cleanup class MiscellaneousDatasetTest(TestCase): From d70fe9482c07fd283baad3682e9013ba67da1189 Mon Sep 17 00:00:00 2001 From: matt Date: Wed, 15 Sep 2021 14:52:56 +0100 Subject: [PATCH 39/45] Rebasing onto Master --- src/datasets/arrow_dataset.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 7b1e1967572..76c92c5ffd4 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -295,8 +295,7 @@ def to_tf_dataset( if drop_remainder is None: # We assume that if you're shuffling it's the train set, so we drop the remainder unless told not to drop_remainder = shuffle - dataset = self.remove_columns([col for col in self.features if col not in cols_to_retain]) - dataset.set_format("numpy") + dataset.set_format("numpy", columns=cols_to_retain) def numpy_pad(data): try: From a1897407d58edad773ec4b9bf4232fec9d51e682 Mon Sep 17 00:00:00 2001 From: matt Date: Wed, 15 Sep 2021 14:58:17 +0100 Subject: [PATCH 40/45] Fixes for the new approach --- src/datasets/arrow_dataset.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 76c92c5ffd4..4cc1ad5ea32 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -164,7 +164,7 @@ def __init__(self): pass @staticmethod - def _get_output_signature(dataset, test_batch, batch_size): + def _get_output_signature(dataset, cols_to_retain, test_batch, batch_size): if config.TF_AVAILABLE: import tensorflow as tf else: @@ -172,6 +172,8 @@ def _get_output_signature(dataset, test_batch, batch_size): signatures = {} for column, col_feature in dataset.features.items(): + if column not in 
cols_to_retain: + continue dtype_feature = col_feature while hasattr(dtype_feature, "feature"): # Descend this godforsaken nested rabbit hole as long as it takes dtype_feature = dtype_feature.feature @@ -295,7 +297,7 @@ def to_tf_dataset( if drop_remainder is None: # We assume that if you're shuffling it's the train set, so we drop the remainder unless told not to drop_remainder = shuffle - dataset.set_format("numpy", columns=cols_to_retain) + self.set_format("numpy", columns=cols_to_retain) def numpy_pad(data): try: @@ -320,7 +322,7 @@ def numpy_pad(data): return out def np_get_batch(indices): - batch = dataset[indices] + batch = self[indices] out_batch = [] if collate_fn is not None: actual_size = len(list(batch.values())[0]) # Get the length of one of the arrays, assume all same @@ -357,16 +359,16 @@ def fetch_function(indices): test_batch_dict = {key: test_batch[i] for i, key in enumerate(cols_to_retain)} output_signature = self._get_output_signature( - dataset, test_batch_dict, batch_size=batch_size if drop_remainder else None + self, cols_to_retain, test_batch_dict, batch_size=batch_size if drop_remainder else None ) def ensure_shapes(input_dict): return {key: tf.ensure_shape(val, output_signature[key].shape) for key, val in input_dict.items()} - tf_dataset = tf.data.Dataset.from_tensor_slices(np.arange(len(dataset))) + tf_dataset = tf.data.Dataset.from_tensor_slices(np.arange(len(self))) if shuffle: - tf_dataset = tf_dataset.shuffle(len(dataset)) + tf_dataset = tf_dataset.shuffle(len(self)) tf_dataset = tf_dataset.batch(batch_size, drop_remainder=drop_remainder).map(fetch_function).map(ensure_shapes) From c8f251bfdef59c56ca6f98b59336c4d9539f068f Mon Sep 17 00:00:00 2001 From: matt Date: Wed, 15 Sep 2021 15:39:59 +0100 Subject: [PATCH 41/45] Force dtype to ensure Windows compatibility --- src/datasets/arrow_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 4cc1ad5ea32..1936fd55ecd 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -365,7 +365,7 @@ def fetch_function(indices): def ensure_shapes(input_dict): return {key: tf.ensure_shape(val, output_signature[key].shape) for key, val in input_dict.items()} - tf_dataset = tf.data.Dataset.from_tensor_slices(np.arange(len(self))) + tf_dataset = tf.data.Dataset.from_tensor_slices(np.arange(len(self), dtype=np.int64)) if shuffle: tf_dataset = tf_dataset.shuffle(len(self)) From f1f88888bd4882ad023e720fb23aafc40524edf2 Mon Sep 17 00:00:00 2001 From: matt Date: Wed, 15 Sep 2021 16:13:12 +0100 Subject: [PATCH 42/45] Fixing things because I am bad at merging --- src/datasets/arrow_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 1936fd55ecd..93ea6747996 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -484,7 +484,7 @@ class NonExistentDatasetError(Exception): pass -class Dataset(DatasetInfoMixin, IndexableMixin): +class Dataset(DatasetInfoMixin, IndexableMixin, TensorflowDatasetMixIn): """A Dataset backed by an Arrow table.""" def __init__( From ef9a7bb211841c37a8a22c5b44a2a81dffd65b06 Mon Sep 17 00:00:00 2001 From: matt Date: Thu, 16 Sep 2021 13:37:23 +0100 Subject: [PATCH 43/45] Fix issues with passing a mutable list to columns argument --- src/datasets/arrow_dataset.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 
93ea6747996..96e32131e43 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -1603,12 +1603,16 @@ def set_format( # Check filter column if isinstance(columns, str): columns = [columns] + if isinstance(columns, tuple): + columns = list(columns) if columns is not None and any(col not in self._data.column_names for col in columns): raise ValueError( "Columns {} not in the dataset. Current columns in the dataset: {}".format( list(filter(lambda col: col not in self._data.column_names, columns)), self._data.column_names ) ) + if columns is not None: + columns = columns.copy() # Ensures modifications made to the list after this call don't cause bugs self._format_type = type self._format_kwargs = format_kwargs From b8523e44a27c795d69a36ae9abd5dde027fc94d9 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Thu, 16 Sep 2021 15:06:02 +0200 Subject: [PATCH 44/45] Update src/datasets/arrow_dataset.py --- src/datasets/arrow_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 96e32131e43..7570b5f7082 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -244,7 +244,7 @@ def to_tf_dataset( Args: columns (:obj:`List[str]` or :obj:`str`): Dataset column(s) to load in the tf.data.Dataset. In general, - only columns that the model can use as input should be included here. + only columns that the model can use as input should be included here (numeric data only). batch_size (:obj:`int`): Size of batches to load from the dataset. shuffle(:obj:`bool`): Shuffle the dataset order when loading. Recommended True for training, False for validation/evaluation. From 397bcb72ca4789d25f0bce07e43a23580812b7d2 Mon Sep 17 00:00:00 2001 From: matt Date: Thu, 16 Sep 2021 14:36:12 +0100 Subject: [PATCH 45/45] Fix unused import --- src/datasets/arrow_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 79bf85eafb9..cefc848eb00 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -44,7 +44,7 @@ from . import config, utils from .arrow_reader import ArrowReader from .arrow_writer import ArrowWriter, OptimizedTypedSequence -from .features import ClassLabel, Features, Sequence, Value, _ArrayXD, cast_to_python_objects +from .features import ClassLabel, Features, Sequence, Value, _ArrayXD from .filesystems import extract_path_from_uri, is_remote_filesystem from .fingerprint import ( fingerprint_transform,
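
Usage sketch: a minimal, hypothetical example of how to_tf_dataset() could be called once the
series above is applied. The toy dataset, column names and sizes below are illustrative
assumptions rather than anything taken from the patches, and tensorflow must be installed.
With no collate_fn, variable-length columns are zero-padded per batch, and label_cols splits
each batch into a (features, labels) pair for Keras.

    from datasets import Dataset

    # Hypothetical toy data: numeric columns only (string columns are not supported).
    ds = Dataset.from_dict(
        {
            "input_ids": [[1, 2, 3], [4, 5], [6, 7, 8, 9], [10]],
            "label": [0, 1, 0, 1],
        }
    )

    tf_ds = ds.to_tf_dataset(
        columns=["input_ids"],
        label_cols=["label"],
        batch_size=2,
        shuffle=True,  # drop_remainder defaults to the shuffle setting
    )

    features, labels = next(iter(tf_ds))
    # features["input_ids"]: tf.int64 tensor, rows zero-padded to the longest sample in the batch
    # labels: the "label" column for the batch, cast to tf.int64
    # tf_ds can be passed straight to tf.keras Model.fit() / Model.predict()

For transformer models one would typically pass a collator instead of relying on the built-in
padding, e.g. collate_fn=tokenizer.pad (which accepts the list of per-sample dicts that
np_get_batch builds) so that padding and any extra keys the collator adds are handled there.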