From 92cad1513c85060c5da1819a906c49cd038ffd3e Mon Sep 17 00:00:00 2001 From: matt Date: Thu, 29 Jul 2021 19:03:05 +0100 Subject: [PATCH 01/45] Rebase onto master --- src/datasets/arrow_dataset.py | 45 +++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 5361c54664c..5052d830afd 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -159,6 +159,51 @@ def version(self): return self._info.version +class TensorflowDatasetMixIn: + def __init__(self): + pass + + def to_tf_dataset(self, tokenizer, cols_to_remove, batch_size, shuffle): + import tensorflow as tf + dataset_in = self.remove_columns(cols_to_remove) + tf_cols = [col for col in dataset_in.features] + label_index = tf_cols.index("label") + dtypes_out = [] + for col in tf_cols: + try: + col_feature = dataset_in.features[col] + if hasattr(col_feature, 'feature'): + col_feature = col_feature.feature + dtype_str = col_feature.dtype + dtypes_out.append(tf.as_dtype(dtype_str)) + except TypeError: + raise TypeError(f"Couldn't convert column {col}, dtype {dtype_str} to TF Tensor!") + + def indices_to_samples(indices): + batch = dataset_in.select(list(indices), keep_in_memory=True).to_dict() + batch = tokenizer.pad(batch) + output = [] + for col in tf_cols: + output.append(batch[col]) + return output + + def graph_indices_to_samples(indices): + return tf.py_function(indices_to_samples, [indices], Tout=dtypes_out) + + def reform_dict(*batch_list): + return ({col: batch_list[i] for i, col in enumerate(tf_cols)}, batch_list[label_index]) + + indices = tf.range(len(dataset_in)) + tf_dataset = tf.data.Dataset.from_tensor_slices(indices) + if shuffle: + tf_dataset = tf_dataset.shuffle(buffer_size=len(tf_dataset)) + tf_dataset = tf_dataset.batch(batch_size) + tf_dataset = tf_dataset.map(graph_indices_to_samples).map(reform_dict) + return tf_dataset + + + + class DatasetTransformationNotAllowedError(Exception): pass From 74b5badc2274f518ffc9ed469ed7addbc4a7e6be Mon Sep 17 00:00:00 2001 From: matt Date: Fri, 30 Jul 2021 16:03:42 +0100 Subject: [PATCH 02/45] Support multiple label_cols, replaced tokenizer with collate_fn, support padding to constant size for TPU training --- src/datasets/arrow_dataset.py | 45 ++++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 5052d830afd..033d0747878 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -163,27 +163,48 @@ class TensorflowDatasetMixIn: def __init__(self): pass - def to_tf_dataset(self, tokenizer, cols_to_remove, batch_size, shuffle): + def to_tf_dataset(self, columns, batch_size, shuffle, collate_fn=None, label_cols=None, pad_to=0): import tensorflow as tf - dataset_in = self.remove_columns(cols_to_remove) - tf_cols = [col for col in dataset_in.features] - label_index = tf_cols.index("label") + if len(set(columns)) < len(columns): + raise ValueError("List of columns contains duplicates!") + if len(set(label_cols)) < len(label_cols): + raise ValueError("List of label_cols contains duplicates!") + if pad_to > 0 and collate_fn is not None: + raise ValueError("pad_to cannot be used with a custom collate_fn - you should modify your collate_fn instead!") + if label_cols is not None: + cols_to_retain = list(set(columns + label_cols)) + else: + cols_to_retain = columns + + dataset_in = self.remove_columns([col for col in self.features if col not in 
cols_to_retain]) + feature_indices = dict() + label_indices = dict() dtypes_out = [] - for col in tf_cols: + for i, col in enumerate(cols_to_retain): try: col_feature = dataset_in.features[col] if hasattr(col_feature, 'feature'): col_feature = col_feature.feature dtype_str = col_feature.dtype dtypes_out.append(tf.as_dtype(dtype_str)) + # Note that these two are not mutually exclusive! + if col in columns: + feature_indices[col] = i + if col in label_cols: + label_indices[col] = i except TypeError: raise TypeError(f"Couldn't convert column {col}, dtype {dtype_str} to TF Tensor!") def indices_to_samples(indices): batch = dataset_in.select(list(indices), keep_in_memory=True).to_dict() - batch = tokenizer.pad(batch) + if collate_fn is not None: + batch = collate_fn(batch) + elif pad_to > 0: + batch = tf.ragged.constant(batch).to_tensor(shape=(batch_size, pad_to)) + else: + batch = tf.ragged.constant(batch).to_tensor() output = [] - for col in tf_cols: + for col in cols_to_retain: output.append(batch[col]) return output @@ -191,7 +212,15 @@ def graph_indices_to_samples(indices): return tf.py_function(indices_to_samples, [indices], Tout=dtypes_out) def reform_dict(*batch_list): - return ({col: batch_list[i] for i, col in enumerate(tf_cols)}, batch_list[label_index]) + features = {col: batch_list[idx] for col, idx in feature_indices.items()} + if label_cols is None: + return features + elif len(label_cols) == 1: + label_index = list(label_indices.values())[0] + return features, batch_list[label_index] + else: + labels = {col: batch_list[idx] for col, idx in label_indices.items()} + return features, labels indices = tf.range(len(dataset_in)) tf_dataset = tf.data.Dataset.from_tensor_slices(indices) From 97917bcc2140e528e0a09bbe5e71d98062b452a7 Mon Sep 17 00:00:00 2001 From: matt Date: Fri, 30 Jul 2021 17:07:50 +0100 Subject: [PATCH 03/45] Standardize int and float dtypes to keep TF happy --- src/datasets/arrow_dataset.py | 45 +++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 033d0747878..08c351bffec 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -181,31 +181,40 @@ def to_tf_dataset(self, columns, batch_size, shuffle, collate_fn=None, label_col label_indices = dict() dtypes_out = [] for i, col in enumerate(cols_to_retain): - try: - col_feature = dataset_in.features[col] - if hasattr(col_feature, 'feature'): - col_feature = col_feature.feature - dtype_str = col_feature.dtype - dtypes_out.append(tf.as_dtype(dtype_str)) - # Note that these two are not mutually exclusive! - if col in columns: - feature_indices[col] = i - if col in label_cols: - label_indices[col] = i - except TypeError: - raise TypeError(f"Couldn't convert column {col}, dtype {dtype_str} to TF Tensor!") + col_feature = dataset_in.features[col] + if hasattr(col_feature, 'feature'): + col_feature = col_feature.feature + dtype_str = col_feature.dtype + if dtype_str.startswith("int") or dtype_str.startswith("uint"): + dtypes_out.append(tf.int32) + elif dtype_str.startswith("float"): + dtypes_out.append(tf.float32) + else: + raise TypeError(f"Can't convert dtype {dtype_str} to TF Tensor!") + # Note that these two are not mutually exclusive! 
+ if col in columns: + feature_indices[col] = i + if col in label_cols: + label_indices[col] = i def indices_to_samples(indices): batch = dataset_in.select(list(indices), keep_in_memory=True).to_dict() if collate_fn is not None: batch = collate_fn(batch) - elif pad_to > 0: - batch = tf.ragged.constant(batch).to_tensor(shape=(batch_size, pad_to)) - else: - batch = tf.ragged.constant(batch).to_tensor() output = [] for col in cols_to_retain: - output.append(batch[col]) + if pad_to > 0: # We know collate_fn is False + tensor = tf.ragged.constant(batch[col]) + if isinstance(tensor, tf.RaggedTensor): + tensor = tensor.to_tensor(shape=(batch_size, pad_to)) + output.append(tensor) + elif collate_fn is None: + tensor = tf.ragged.constant(batch[col]) + if isinstance(tensor, tf.RaggedTensor): + tensor = tensor.to_tensor() + output.append(tensor) + else: # Already processed + output.append(batch[col]) return output def graph_indices_to_samples(indices): From 4eb79f5709f0801be4abc9aa5f2be97c43500d67 Mon Sep 17 00:00:00 2001 From: matt Date: Fri, 30 Jul 2021 18:33:22 +0100 Subject: [PATCH 04/45] Add a prefetch buffer for improved performance --- src/datasets/arrow_dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 08c351bffec..7118af7325a 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -237,6 +237,7 @@ def reform_dict(*batch_list): tf_dataset = tf_dataset.shuffle(buffer_size=len(tf_dataset)) tf_dataset = tf_dataset.batch(batch_size) tf_dataset = tf_dataset.map(graph_indices_to_samples).map(reform_dict) + tf_dataset = tf_dataset.prefetch(tf.data.AUTOTUNE) return tf_dataset From bed394a97ec6f985a021dc95a0e2306372287f8f Mon Sep 17 00:00:00 2001 From: matt Date: Wed, 4 Aug 2021 17:07:07 +0100 Subject: [PATCH 05/45] TF dataset is actually kinda performant now! 
--- src/datasets/arrow_dataset.py | 158 ++++++++++++++++++++-------------- 1 file changed, 92 insertions(+), 66 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 7118af7325a..ce4e7eafa28 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -163,84 +163,110 @@ class TensorflowDatasetMixIn: def __init__(self): pass - def to_tf_dataset(self, columns, batch_size, shuffle, collate_fn=None, label_cols=None, pad_to=0): + @staticmethod + def _get_output_signature(dataset, batch_size): import tensorflow as tf - if len(set(columns)) < len(columns): - raise ValueError("List of columns contains duplicates!") - if len(set(label_cols)) < len(label_cols): + signatures = dict() + for column, col_feature in dataset.features.items(): + if hasattr(col_feature, 'feature'): + dtype_str = col_feature.feature.dtype + else: + dtype_str = col_feature.dtype + if dtype_str.startswith("int") or dtype_str.startswith("uint"): + dtype = tf.int32 + elif dtype_str.startswith("float"): + dtype = tf.float32 + else: + raise ValueError(f"Could not convert datatype {dtype_str} in column {column}!") + + if hasattr(col_feature, 'shape'): + shape = [batch_size] + list(col_feature.shape) + elif hasattr(col_feature, 'length'): + shape = [batch_size, col_feature.length] + else: + shape = [batch_size] + shape = [dim if dim != -1 else None for dim in shape] + + signatures[column] = tf.TensorSpec(shape=shape, dtype=dtype) + return signatures + + def to_tf_dataset(self, columns, batch_size, shuffle, collate_fn=None, label_cols=None, pad_to=0, pad_value=0, prefetch=True): + import tensorflow as tf + if label_cols is None: + label_cols = [] + elif isinstance(label_cols, str): + label_cols = [label_cols] + elif len(set(label_cols)) < len(label_cols): raise ValueError("List of label_cols contains duplicates!") + if not columns: + raise ValueError("Need to specify at least one column!") + elif isinstance(columns, str): + columns = [columns] + elif len(set(columns)) < len(columns): + raise ValueError("List of columns contains duplicates!") if pad_to > 0 and collate_fn is not None: - raise ValueError("pad_to cannot be used with a custom collate_fn - you should modify your collate_fn instead!") + raise ValueError( + "pad_to cannot be used with a custom collate_fn - you should modify your collate_fn instead!") if label_cols is not None: cols_to_retain = list(set(columns + label_cols)) else: cols_to_retain = columns - - dataset_in = self.remove_columns([col for col in self.features if col not in cols_to_retain]) - feature_indices = dict() - label_indices = dict() - dtypes_out = [] - for i, col in enumerate(cols_to_retain): - col_feature = dataset_in.features[col] - if hasattr(col_feature, 'feature'): - col_feature = col_feature.feature - dtype_str = col_feature.dtype - if dtype_str.startswith("int") or dtype_str.startswith("uint"): - dtypes_out.append(tf.int32) - elif dtype_str.startswith("float"): - dtypes_out.append(tf.float32) + for col in cols_to_retain: + if col not in self.features: + raise ValueError(f"Couldn't find column {col} in dataset!") + dataset = self.remove_columns([col for col in self.features if col not in cols_to_retain]) + gen_signature = self._get_output_signature(dataset, batch_size) + num_batches = len(dataset) // batch_size # Because we drop the remainder + + def tf_generator(): + # Note that the 'tensorflow' return format uses ragged tensors, which are VERY unperformant + # right now (TF 2.5). 
This may or may not change in the future, but for now we stick to 'numpy'. + if shuffle: + epoch_dataset = dataset.shuffle(load_from_cache_file=False) else: - raise TypeError(f"Can't convert dtype {dtype_str} to TF Tensor!") - # Note that these two are not mutually exclusive! - if col in columns: - feature_indices[col] = i - if col in label_cols: - label_indices[col] = i - - def indices_to_samples(indices): - batch = dataset_in.select(list(indices), keep_in_memory=True).to_dict() - if collate_fn is not None: - batch = collate_fn(batch) - output = [] - for col in cols_to_retain: - if pad_to > 0: # We know collate_fn is False - tensor = tf.ragged.constant(batch[col]) - if isinstance(tensor, tf.RaggedTensor): - tensor = tensor.to_tensor(shape=(batch_size, pad_to)) - output.append(tensor) - elif collate_fn is None: - tensor = tf.ragged.constant(batch[col]) - if isinstance(tensor, tf.RaggedTensor): - tensor = tensor.to_tensor() - output.append(tensor) - else: # Already processed - output.append(batch[col]) - return output - - def graph_indices_to_samples(indices): - return tf.py_function(indices_to_samples, [indices], Tout=dtypes_out) - - def reform_dict(*batch_list): - features = {col: batch_list[idx] for col, idx in feature_indices.items()} - if label_cols is None: - return features - elif len(label_cols) == 1: - label_index = list(label_indices.values())[0] - return features, batch_list[label_index] + epoch_dataset = dataset + if collate_fn is None: + epoch_dataset.set_format('numpy') # Automatic padding else: - labels = {col: batch_list[idx] for col, idx in label_indices.items()} + epoch_dataset.set_format('python') # List of possibly variable lists + for i in range(0, len(epoch_dataset) - batch_size + 1, batch_size): + batch = epoch_dataset[i: i + batch_size] + if collate_fn is not None: + batch = collate_fn(batch) + batch = {key: np.array(val) for key, val in batch.items()} + yield batch + + tf_dataset = tf.data.Dataset.from_generator(tf_generator, output_signature=gen_signature) + + if pad_to > 0: + def padding_function(input_batch): + output_batch = dict() + for key, tensor in input_batch.items(): + if tf.rank(tensor) == 2: + padding = [[0, 0], [0, pad_to - tf.shape(tensor)[1]]] + output_batch[key] = tf.pad(tensor, padding, constant_values=pad_value) + else: + output_batch[key] = tensor + return output_batch + + tf_dataset = tf_dataset.map(padding_function) + + if label_cols: + def split_features_and_labels(input_batch): + features = {key: tensor for key, tensor in input_batch.items() if key in columns} + labels = {key: tensor for key, tensor in input_batch.items() if key in label_cols} + if len(features) == 1: + features = list(features.values())[0] + if len(labels) == 1: + labels = list(labels.values())[0] return features, labels - indices = tf.range(len(dataset_in)) - tf_dataset = tf.data.Dataset.from_tensor_slices(indices) - if shuffle: - tf_dataset = tf_dataset.shuffle(buffer_size=len(tf_dataset)) - tf_dataset = tf_dataset.batch(batch_size) - tf_dataset = tf_dataset.map(graph_indices_to_samples).map(reform_dict) - tf_dataset = tf_dataset.prefetch(tf.data.AUTOTUNE) - return tf_dataset - + tf_dataset = tf_dataset.map(split_features_and_labels) + tf_dataset = tf_dataset.apply(tf.data.experimental.assert_cardinality(num_batches)) + if prefetch: + tf_dataset = tf_dataset.prefetch(tf.data.AUTOTUNE) + return tf_dataset class DatasetTransformationNotAllowedError(Exception): From ea525a2e0b66e8dc83fe21f0a7a067067779871d Mon Sep 17 00:00:00 2001 From: matt Date: Wed, 4 Aug 2021 17:11:00 
+0100 Subject: [PATCH 06/45] TF dataset is actually kinda performant now! --- src/datasets/arrow_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index ce4e7eafa28..ff17af58529 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -190,7 +190,7 @@ def _get_output_signature(dataset, batch_size): signatures[column] = tf.TensorSpec(shape=shape, dtype=dtype) return signatures - def to_tf_dataset(self, columns, batch_size, shuffle, collate_fn=None, label_cols=None, pad_to=0, pad_value=0, prefetch=True): + def to_tf_dataset(self, columns, batch_size, shuffle, collate_fn=None, label_cols=None, pad_to=0, prefetch=True): import tensorflow as tf if label_cols is None: label_cols = [] @@ -244,7 +244,7 @@ def padding_function(input_batch): for key, tensor in input_batch.items(): if tf.rank(tensor) == 2: padding = [[0, 0], [0, pad_to - tf.shape(tensor)[1]]] - output_batch[key] = tf.pad(tensor, padding, constant_values=pad_value) + output_batch[key] = tf.pad(tensor, padding) else: output_batch[key] = tensor return output_batch From d3a8140fde51a675a9751f6cec5658e1ee4a95e8 Mon Sep 17 00:00:00 2001 From: matt Date: Wed, 4 Aug 2021 17:12:05 +0100 Subject: [PATCH 07/45] Style pass --- src/datasets/arrow_dataset.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index ff17af58529..204a6482a38 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -166,9 +166,10 @@ def __init__(self): @staticmethod def _get_output_signature(dataset, batch_size): import tensorflow as tf + signatures = dict() for column, col_feature in dataset.features.items(): - if hasattr(col_feature, 'feature'): + if hasattr(col_feature, "feature"): dtype_str = col_feature.feature.dtype else: dtype_str = col_feature.dtype @@ -179,9 +180,9 @@ def _get_output_signature(dataset, batch_size): else: raise ValueError(f"Could not convert datatype {dtype_str} in column {column}!") - if hasattr(col_feature, 'shape'): + if hasattr(col_feature, "shape"): shape = [batch_size] + list(col_feature.shape) - elif hasattr(col_feature, 'length'): + elif hasattr(col_feature, "length"): shape = [batch_size, col_feature.length] else: shape = [batch_size] @@ -192,6 +193,7 @@ def _get_output_signature(dataset, batch_size): def to_tf_dataset(self, columns, batch_size, shuffle, collate_fn=None, label_cols=None, pad_to=0, prefetch=True): import tensorflow as tf + if label_cols is None: label_cols = [] elif isinstance(label_cols, str): @@ -206,7 +208,8 @@ def to_tf_dataset(self, columns, batch_size, shuffle, collate_fn=None, label_col raise ValueError("List of columns contains duplicates!") if pad_to > 0 and collate_fn is not None: raise ValueError( - "pad_to cannot be used with a custom collate_fn - you should modify your collate_fn instead!") + "pad_to cannot be used with a custom collate_fn - you should modify your collate_fn instead!" 
+ ) if label_cols is not None: cols_to_retain = list(set(columns + label_cols)) else: @@ -226,11 +229,11 @@ def tf_generator(): else: epoch_dataset = dataset if collate_fn is None: - epoch_dataset.set_format('numpy') # Automatic padding + epoch_dataset.set_format("numpy") # Automatic padding else: - epoch_dataset.set_format('python') # List of possibly variable lists + epoch_dataset.set_format("python") # List of possibly variable lists for i in range(0, len(epoch_dataset) - batch_size + 1, batch_size): - batch = epoch_dataset[i: i + batch_size] + batch = epoch_dataset[i : i + batch_size] if collate_fn is not None: batch = collate_fn(batch) batch = {key: np.array(val) for key, val in batch.items()} @@ -239,6 +242,7 @@ def tf_generator(): tf_dataset = tf.data.Dataset.from_generator(tf_generator, output_signature=gen_signature) if pad_to > 0: + def padding_function(input_batch): output_batch = dict() for key, tensor in input_batch.items(): @@ -252,6 +256,7 @@ def padding_function(input_batch): tf_dataset = tf_dataset.map(padding_function) if label_cols: + def split_features_and_labels(input_batch): features = {key: tensor for key, tensor in input_batch.items() if key in columns} labels = {key: tensor for key, tensor in input_batch.items() if key in label_cols} From 3ce6dc44e9956b9d7017a9fde441cfc0e9f6f862 Mon Sep 17 00:00:00 2001 From: matt Date: Wed, 4 Aug 2021 17:25:37 +0100 Subject: [PATCH 08/45] Helpful error message if my code gets caught off-guard by unexpected feature types --- src/datasets/arrow_dataset.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 204a6482a38..b1a165d6018 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -44,7 +44,7 @@ from . import config, utils from .arrow_reader import ArrowReader from .arrow_writer import ArrowWriter, OptimizedTypedSequence -from .features import ClassLabel, Features, Value, cast_to_python_objects +from .features import ClassLabel, Features, Value, _ArrayXD, Sequence, cast_to_python_objects from .filesystems import extract_path_from_uri, is_remote_filesystem from .fingerprint import ( fingerprint_transform, @@ -180,12 +180,20 @@ def _get_output_signature(dataset, batch_size): else: raise ValueError(f"Could not convert datatype {dtype_str} in column {column}!") - if hasattr(col_feature, "shape"): + if isinstance(col_feature, Value): + shape = [batch_size] + elif isinstance(col_feature, _ArrayXD): shape = [batch_size] + list(col_feature.shape) - elif hasattr(col_feature, "length"): + elif isinstance(col_feature, Sequence): shape = [batch_size, col_feature.length] else: - shape = [batch_size] + raise ValueError(f"Couldn't parse feature {column} with type {type(col_feature)}! " + "This may indicate a column was included with an unusual datatype " + "that we were unable to process correctly. 
" + "If you're getting this error with one of our datasets, and you're " + "sure the column should be convertable to tf.Tensor, please " + "file an issue at github.com/huggingface/datasets and tag " + "@rocketknight1!") shape = [dim if dim != -1 else None for dim in shape] signatures[column] = tf.TensorSpec(shape=shape, dtype=dtype) From 67c06570a106eea78cff9706492bc9bec0f92cdb Mon Sep 17 00:00:00 2001 From: matt Date: Wed, 4 Aug 2021 17:27:43 +0100 Subject: [PATCH 09/45] Style pass --- src/datasets/arrow_dataset.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index b1a165d6018..dce82800791 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -44,7 +44,7 @@ from . import config, utils from .arrow_reader import ArrowReader from .arrow_writer import ArrowWriter, OptimizedTypedSequence -from .features import ClassLabel, Features, Value, _ArrayXD, Sequence, cast_to_python_objects +from .features import ClassLabel, Features, Sequence, Value, _ArrayXD, cast_to_python_objects from .filesystems import extract_path_from_uri, is_remote_filesystem from .fingerprint import ( fingerprint_transform, @@ -187,13 +187,15 @@ def _get_output_signature(dataset, batch_size): elif isinstance(col_feature, Sequence): shape = [batch_size, col_feature.length] else: - raise ValueError(f"Couldn't parse feature {column} with type {type(col_feature)}! " - "This may indicate a column was included with an unusual datatype " - "that we were unable to process correctly. " - "If you're getting this error with one of our datasets, and you're " - "sure the column should be convertable to tf.Tensor, please " - "file an issue at github.com/huggingface/datasets and tag " - "@rocketknight1!") + raise ValueError( + f"Couldn't parse feature {column} with type {type(col_feature)}! " + "This may indicate a column was included with an unusual datatype " + "that we were unable to process correctly. " + "If you're getting this error with one of our datasets, and you're " + "sure the column should be convertable to tf.Tensor, please " + "file an issue at github.com/huggingface/datasets and tag " + "@rocketknight1!" 
+ ) shape = [dim if dim != -1 else None for dim in shape] signatures[column] = tf.TensorSpec(shape=shape, dtype=dtype) From 2963f0a110c23356bb56d863a352d2d80324b0d5 Mon Sep 17 00:00:00 2001 From: matt Date: Thu, 5 Aug 2021 14:43:11 +0100 Subject: [PATCH 10/45] Added drop_remainder argument, removed pad_to --- src/datasets/arrow_dataset.py | 34 ++++++++++++---------------------- 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index dce82800791..739eef6f754 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -201,7 +201,9 @@ def _get_output_signature(dataset, batch_size): signatures[column] = tf.TensorSpec(shape=shape, dtype=dtype) return signatures - def to_tf_dataset(self, columns, batch_size, shuffle, collate_fn=None, label_cols=None, pad_to=0, prefetch=True): + def to_tf_dataset( + self, columns, batch_size, shuffle, drop_remainder=None, collate_fn=None, label_cols=None, prefetch=True + ): import tensorflow as tf if label_cols is None: @@ -216,10 +218,6 @@ def to_tf_dataset(self, columns, batch_size, shuffle, collate_fn=None, label_col columns = [columns] elif len(set(columns)) < len(columns): raise ValueError("List of columns contains duplicates!") - if pad_to > 0 and collate_fn is not None: - raise ValueError( - "pad_to cannot be used with a custom collate_fn - you should modify your collate_fn instead!" - ) if label_cols is not None: cols_to_retain = list(set(columns + label_cols)) else: @@ -227,9 +225,15 @@ def to_tf_dataset(self, columns, batch_size, shuffle, collate_fn=None, label_col for col in cols_to_retain: if col not in self.features: raise ValueError(f"Couldn't find column {col} in dataset!") + if drop_remainder is None: + # We assume that if you're shuffling it's the train set, so we drop the remainder unless told not to + drop_remainder = shuffle dataset = self.remove_columns([col for col in self.features if col not in cols_to_retain]) gen_signature = self._get_output_signature(dataset, batch_size) - num_batches = len(dataset) // batch_size # Because we drop the remainder + if drop_remainder: + num_batches = floor(len(dataset) / batch_size) # Division rounding down ( // still returns a float!) 
+ else: + num_batches = ceil(len(dataset) / batch_size) # Division rounding up def tf_generator(): # Note that the 'tensorflow' return format uses ragged tensors, which are VERY unperformant @@ -242,8 +246,8 @@ def tf_generator(): epoch_dataset.set_format("numpy") # Automatic padding else: epoch_dataset.set_format("python") # List of possibly variable lists - for i in range(0, len(epoch_dataset) - batch_size + 1, batch_size): - batch = epoch_dataset[i : i + batch_size] + for i in range(num_batches): + batch = epoch_dataset[i * batch_size : (i + 1) * batch_size] if collate_fn is not None: batch = collate_fn(batch) batch = {key: np.array(val) for key, val in batch.items()} @@ -251,20 +255,6 @@ def tf_generator(): tf_dataset = tf.data.Dataset.from_generator(tf_generator, output_signature=gen_signature) - if pad_to > 0: - - def padding_function(input_batch): - output_batch = dict() - for key, tensor in input_batch.items(): - if tf.rank(tensor) == 2: - padding = [[0, 0], [0, pad_to - tf.shape(tensor)[1]]] - output_batch[key] = tf.pad(tensor, padding) - else: - output_batch[key] = tensor - return output_batch - - tf_dataset = tf_dataset.map(padding_function) - if label_cols: def split_features_and_labels(input_batch): From 7f11d76396fb3d42f83dbc1c30fff182a47a3d36 Mon Sep 17 00:00:00 2001 From: matt Date: Thu, 5 Aug 2021 15:00:46 +0100 Subject: [PATCH 11/45] Correct shape signatures when we're not dropping the remainder --- src/datasets/arrow_dataset.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 739eef6f754..6bb1e39ce17 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -229,10 +229,11 @@ def to_tf_dataset( # We assume that if you're shuffling it's the train set, so we drop the remainder unless told not to drop_remainder = shuffle dataset = self.remove_columns([col for col in self.features if col not in cols_to_retain]) - gen_signature = self._get_output_signature(dataset, batch_size) if drop_remainder: + gen_signature = self._get_output_signature(dataset, batch_size=batch_size) num_batches = floor(len(dataset) / batch_size) # Division rounding down ( // still returns a float!) else: + gen_signature = self._get_output_signature(dataset, batch_size=None) # Because batches can be variable here num_batches = ceil(len(dataset) / batch_size) # Division rounding up def tf_generator(): @@ -268,6 +269,9 @@ def split_features_and_labels(input_batch): tf_dataset = tf_dataset.map(split_features_and_labels) + elif len(columns) == 1: + tf_dataset = tf_dataset.map(lambda x: list(x.values())[0]) + tf_dataset = tf_dataset.apply(tf.data.experimental.assert_cardinality(num_batches)) if prefetch: tf_dataset = tf_dataset.prefetch(tf.data.AUTOTUNE) From bbf61978b7090947af04445200186be3adae6991 Mon Sep 17 00:00:00 2001 From: matt Date: Thu, 5 Aug 2021 15:12:12 +0100 Subject: [PATCH 12/45] Style pass --- src/datasets/arrow_dataset.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 6bb1e39ce17..c2d603013a6 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -233,7 +233,9 @@ def to_tf_dataset( gen_signature = self._get_output_signature(dataset, batch_size=batch_size) num_batches = floor(len(dataset) / batch_size) # Division rounding down ( // still returns a float!) 
else: - gen_signature = self._get_output_signature(dataset, batch_size=None) # Because batches can be variable here + gen_signature = self._get_output_signature( + dataset, batch_size=None + ) # Because batches can be variable here num_batches = ceil(len(dataset) / batch_size) # Division rounding up def tf_generator(): From f902bdedba49977616da8e7853dfefbec15c5fef Mon Sep 17 00:00:00 2001 From: matt Date: Thu, 5 Aug 2021 17:50:15 +0100 Subject: [PATCH 13/45] Support ClassLabel columns too! --- src/datasets/arrow_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index c2d603013a6..294afc32f88 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -180,7 +180,7 @@ def _get_output_signature(dataset, batch_size): else: raise ValueError(f"Could not convert datatype {dtype_str} in column {column}!") - if isinstance(col_feature, Value): + if isinstance(col_feature, (Value, ClassLabel)): shape = [batch_size] elif isinstance(col_feature, _ArrayXD): shape = [batch_size] + list(col_feature.shape) From 990f150a7de6fac73855dd3de06e0cddd25d3e08 Mon Sep 17 00:00:00 2001 From: matt Date: Mon, 16 Aug 2021 16:42:39 +0100 Subject: [PATCH 14/45] Re-enable `tf.ragged` by avoiding `tf.ragged.constant` unless absolutely necessary --- src/datasets/arrow_dataset.py | 17 +++++++++++------ src/datasets/formatting/tf_formatter.py | 23 +++++++++++++++++++---- 2 files changed, 30 insertions(+), 10 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 294afc32f88..15a51e79fbd 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -202,10 +202,13 @@ def _get_output_signature(dataset, batch_size): return signatures def to_tf_dataset( - self, columns, batch_size, shuffle, drop_remainder=None, collate_fn=None, label_cols=None, prefetch=True + self, columns, batch_size, shuffle, drop_remainder=None, collate_fn=None, collate_fn_args=None, label_cols=None, prefetch=True ): import tensorflow as tf + if collate_fn_args is None: + collate_fn_args = dict() + if label_cols is None: label_cols = [] elif isinstance(label_cols, str): @@ -239,21 +242,23 @@ def to_tf_dataset( num_batches = ceil(len(dataset) / batch_size) # Division rounding up def tf_generator(): - # Note that the 'tensorflow' return format uses ragged tensors, which are VERY unperformant - # right now (TF 2.5). This may or may not change in the future, but for now we stick to 'numpy'. 
if shuffle: epoch_dataset = dataset.shuffle(load_from_cache_file=False) else: epoch_dataset = dataset if collate_fn is None: - epoch_dataset.set_format("numpy") # Automatic padding + epoch_dataset.set_format("tensorflow") # Will return ragged tensors else: epoch_dataset.set_format("python") # List of possibly variable lists for i in range(num_batches): batch = epoch_dataset[i * batch_size : (i + 1) * batch_size] if collate_fn is not None: - batch = collate_fn(batch) - batch = {key: np.array(val) for key, val in batch.items()} + batch = collate_fn(batch, **collate_fn_args) + # In case the collate_fn returns something strange + batch = {key: tf.convert_to_tensor(val) for key, val in batch.items()} + else: + batch = {key: tensor.to_tensor() if isinstance(tensor, tf.RaggedTensor) else tensor + for key, tensor in batch.items()} yield batch tf_dataset = tf.data.Dataset.from_generator(tf_generator, output_signature=gen_signature) diff --git a/src/datasets/formatting/tf_formatter.py b/src/datasets/formatting/tf_formatter.py index 4da428f382a..9a1529aa43c 100644 --- a/src/datasets/formatting/tf_formatter.py +++ b/src/datasets/formatting/tf_formatter.py @@ -35,13 +35,28 @@ def __init__(self, **tf_tensor_kwargs): def _tensorize(self, value): import tensorflow as tf - default_dtype = {} if np.issubdtype(value.dtype, np.integer): - default_dtype = {"dtype": tf.int64} + np_dtype = np.int64 + tf_dtype = tf.int64 + default_dtype = {"dtype": tf_dtype} elif np.issubdtype(value.dtype, np.floating): - default_dtype = {"dtype": tf.float32} + np_dtype = np.float32 + tf_dtype = tf.float32 + default_dtype = {"dtype": tf_dtype} + else: + np_dtype = None + tf_dtype = None + default_dtype = {} + + # Saving the most expensive methods for last + try: + return tf.convert_to_tensor(value, dtype=tf_dtype) + except ValueError: + try: + return tf.ragged.stack([np.array(subarr, dtype=np_dtype) for subarr in value]) + except ValueError: + return tf.ragged.constant(value, **{**default_dtype, **self.tf_tensor_kwargs}) - return tf.ragged.constant(value, **{**default_dtype, **self.tf_tensor_kwargs}) def _recursive_tensorize(self, data_struct: dict): # support for nested types like struct of list of struct From fa062065f38eeb4e24036f2003077c5eaa822d52 Mon Sep 17 00:00:00 2001 From: matt Date: Mon, 16 Aug 2021 16:45:00 +0100 Subject: [PATCH 15/45] Style pass --- src/datasets/arrow_dataset.py | 16 +++++++++++++--- src/datasets/formatting/tf_formatter.py | 1 - 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 15a51e79fbd..01116239274 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -202,7 +202,15 @@ def _get_output_signature(dataset, batch_size): return signatures def to_tf_dataset( - self, columns, batch_size, shuffle, drop_remainder=None, collate_fn=None, collate_fn_args=None, label_cols=None, prefetch=True + self, + columns, + batch_size, + shuffle, + drop_remainder=None, + collate_fn=None, + collate_fn_args=None, + label_cols=None, + prefetch=True, ): import tensorflow as tf @@ -257,8 +265,10 @@ def tf_generator(): # In case the collate_fn returns something strange batch = {key: tf.convert_to_tensor(val) for key, val in batch.items()} else: - batch = {key: tensor.to_tensor() if isinstance(tensor, tf.RaggedTensor) else tensor - for key, tensor in batch.items()} + batch = { + key: tensor.to_tensor() if isinstance(tensor, tf.RaggedTensor) else tensor + for key, tensor in batch.items() + } yield batch tf_dataset = 
tf.data.Dataset.from_generator(tf_generator, output_signature=gen_signature) diff --git a/src/datasets/formatting/tf_formatter.py b/src/datasets/formatting/tf_formatter.py index 9a1529aa43c..38cc1455f67 100644 --- a/src/datasets/formatting/tf_formatter.py +++ b/src/datasets/formatting/tf_formatter.py @@ -57,7 +57,6 @@ def _tensorize(self, value): except ValueError: return tf.ragged.constant(value, **{**default_dtype, **self.tf_tensor_kwargs}) - def _recursive_tensorize(self, data_struct: dict): # support for nested types like struct of list of struct if isinstance(data_struct, (list, np.ndarray)): From 29415cd58f64bebaf9a98d2b3ba42dfa986196f7 Mon Sep 17 00:00:00 2001 From: matt Date: Thu, 26 Aug 2021 13:22:30 +0100 Subject: [PATCH 16/45] Adding a comment to explain myself in tf_formatter.py --- src/datasets/formatting/tf_formatter.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/datasets/formatting/tf_formatter.py b/src/datasets/formatting/tf_formatter.py index 38cc1455f67..93a2b131d46 100644 --- a/src/datasets/formatting/tf_formatter.py +++ b/src/datasets/formatting/tf_formatter.py @@ -55,6 +55,7 @@ def _tensorize(self, value): try: return tf.ragged.stack([np.array(subarr, dtype=np_dtype) for subarr in value]) except ValueError: + # tf.ragged.constant is orders of magnitude slower than tf.ragged.stack return tf.ragged.constant(value, **{**default_dtype, **self.tf_tensor_kwargs}) def _recursive_tensorize(self, data_struct: dict): From ca93c34b85043f15367b62a022805fc4510d0e35 Mon Sep 17 00:00:00 2001 From: matt Date: Thu, 26 Aug 2021 17:09:22 +0100 Subject: [PATCH 17/45] Fixes for shuffling and the case where the collator adds new columns --- src/datasets/arrow_dataset.py | 54 ++++++++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 14 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 01116239274..9cb6bf31e5e 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -38,6 +38,7 @@ import pyarrow.compute as pc from multiprocess import Pool, RLock from tqdm.auto import tqdm +from random import randint from datasets.tasks.text_classification import TextClassification @@ -164,7 +165,7 @@ def __init__(self): pass @staticmethod - def _get_output_signature(dataset, batch_size): + def _get_output_signature(dataset, test_batch, batch_size): import tensorflow as tf signatures = dict() @@ -174,7 +175,7 @@ def _get_output_signature(dataset, batch_size): else: dtype_str = col_feature.dtype if dtype_str.startswith("int") or dtype_str.startswith("uint"): - dtype = tf.int32 + dtype = tf.int64 elif dtype_str.startswith("float"): dtype = tf.float32 else: @@ -199,6 +200,18 @@ def _get_output_signature(dataset, batch_size): shape = [dim if dim != -1 else None for dim in shape] signatures[column] = tf.TensorSpec(shape=shape, dtype=dtype) + + # Catching columns added by the collate_fn, such as MLM labels + for column, tensor in test_batch.items(): + if column in signatures: + continue + if column.startswith('label') and 'input_ids' in signatures: + shape = signatures['input_ids'].shape + else: + # If this doesn't look like LM labels that got added by the collate_fn, let's not say anything + # about the dimensions we're unsure of + shape = [batch_size] + [None for dim in tensor.shape.as_list()[1:]] + signatures[column] = tf.TensorSpec(shape=shape, dtype=tensor.dtype) return signatures def to_tf_dataset( @@ -241,17 +254,13 @@ def to_tf_dataset( drop_remainder = shuffle dataset = self.remove_columns([col for col in 
self.features if col not in cols_to_retain]) if drop_remainder: - gen_signature = self._get_output_signature(dataset, batch_size=batch_size) num_batches = floor(len(dataset) / batch_size) # Division rounding down ( // still returns a float!) else: - gen_signature = self._get_output_signature( - dataset, batch_size=None - ) # Because batches can be variable here num_batches = ceil(len(dataset) / batch_size) # Division rounding up def tf_generator(): if shuffle: - epoch_dataset = dataset.shuffle(load_from_cache_file=False) + epoch_dataset = dataset.shuffle(load_from_cache_file=False, seed=randint(0, 2**32 - 1)) else: epoch_dataset = dataset if collate_fn is None: @@ -261,15 +270,32 @@ def tf_generator(): for i in range(num_batches): batch = epoch_dataset[i * batch_size : (i + 1) * batch_size] if collate_fn is not None: + actual_size = len(list(batch.values())[0]) # Get the length of one of the arrays, assume all same + # Our collators expect a list of dicts, not a dict of lists/arrays, so we invert + batch = [{key: value[i] for key, value in batch.items()} for i in range(actual_size)] batch = collate_fn(batch, **collate_fn_args) - # In case the collate_fn returns something strange - batch = {key: tf.convert_to_tensor(val) for key, val in batch.items()} + for key in list(batch.keys()): + # In case the collate_fn returns something strange + tensor = tf.convert_to_tensor(batch[key]) + cast_dtype = tf.int64 if tensor.dtype.is_integer else tf.float32 + if tensor.dtype != cast_dtype: + tensor = tf.cast(tensor, cast_dtype) + batch[key] = tensor else: - batch = { - key: tensor.to_tensor() if isinstance(tensor, tf.RaggedTensor) else tensor - for key, tensor in batch.items() - } - yield batch + for key in list(batch.keys()): + tensor = batch[key] + if isinstance(tensor, tf.RaggedTensor): + tensor = tensor.to_tensor() + cast_dtype = tf.int64 if tensor.dtype.is_integer else tf.float32 + if tensor.dtype != cast_dtype: + tensor = tf.cast(tensor, cast_dtype) + batch[key] = tensor + yield dict(batch) + + test_batch = next(tf_generator()) + + gen_signature = self._get_output_signature(dataset, test_batch=test_batch, + batch_size=batch_size if drop_remainder else None) tf_dataset = tf.data.Dataset.from_generator(tf_generator, output_signature=gen_signature) From d78cd5079dbe1eb4ccc4e43ef516c2ac0e903551 Mon Sep 17 00:00:00 2001 From: matt Date: Thu, 26 Aug 2021 17:12:07 +0100 Subject: [PATCH 18/45] Style pass --- src/datasets/arrow_dataset.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 9cb6bf31e5e..fbd600ec5f7 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -29,6 +29,7 @@ from functools import partial, wraps from math import ceil, floor from pathlib import Path +from random import randint from typing import TYPE_CHECKING, Any, BinaryIO, Callable, Dict, Iterator, List, Optional, Tuple, Union import fsspec @@ -38,7 +39,6 @@ import pyarrow.compute as pc from multiprocess import Pool, RLock from tqdm.auto import tqdm -from random import randint from datasets.tasks.text_classification import TextClassification @@ -205,8 +205,8 @@ def _get_output_signature(dataset, test_batch, batch_size): for column, tensor in test_batch.items(): if column in signatures: continue - if column.startswith('label') and 'input_ids' in signatures: - shape = signatures['input_ids'].shape + if column.startswith("label") and "input_ids" in signatures: + shape = signatures["input_ids"].shape else: # If 
this doesn't look like LM labels that got added by the collate_fn, let's not say anything # about the dimensions we're unsure of @@ -260,7 +260,7 @@ def to_tf_dataset( def tf_generator(): if shuffle: - epoch_dataset = dataset.shuffle(load_from_cache_file=False, seed=randint(0, 2**32 - 1)) + epoch_dataset = dataset.shuffle(load_from_cache_file=False, seed=randint(0, 2 ** 32 - 1)) else: epoch_dataset = dataset if collate_fn is None: @@ -294,8 +294,9 @@ def tf_generator(): test_batch = next(tf_generator()) - gen_signature = self._get_output_signature(dataset, test_batch=test_batch, - batch_size=batch_size if drop_remainder else None) + gen_signature = self._get_output_signature( + dataset, test_batch=test_batch, batch_size=batch_size if drop_remainder else None + ) tf_dataset = tf.data.Dataset.from_generator(tf_generator, output_signature=gen_signature) From 0bf0050c50aebec5f8f24defe7afbe28b3586d36 Mon Sep 17 00:00:00 2001 From: matt Date: Thu, 26 Aug 2021 17:58:50 +0100 Subject: [PATCH 19/45] Ensuring we respect TF dtype args --- src/datasets/formatting/tf_formatter.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/src/datasets/formatting/tf_formatter.py b/src/datasets/formatting/tf_formatter.py index 93a2b131d46..d000035647f 100644 --- a/src/datasets/formatting/tf_formatter.py +++ b/src/datasets/formatting/tf_formatter.py @@ -35,17 +35,22 @@ def __init__(self, **tf_tensor_kwargs): def _tensorize(self, value): import tensorflow as tf - if np.issubdtype(value.dtype, np.integer): - np_dtype = np.int64 - tf_dtype = tf.int64 - default_dtype = {"dtype": tf_dtype} - elif np.issubdtype(value.dtype, np.floating): - np_dtype = np.float32 - tf_dtype = tf.float32 - default_dtype = {"dtype": tf_dtype} + if 'dtype' not in self.tf_tensor_kwargs: + if np.issubdtype(value.dtype, np.integer): + np_dtype = np.int64 + tf_dtype = tf.int64 + default_dtype = {"dtype": tf_dtype} + elif np.issubdtype(value.dtype, np.floating): + np_dtype = np.float32 + tf_dtype = tf.float32 + default_dtype = {"dtype": tf_dtype} + else: + np_dtype = None + tf_dtype = None + default_dtype = {} else: - np_dtype = None - tf_dtype = None + tf_dtype = self.tf_tensor_kwargs['dtype'] + np_dtype = tf_dtype.as_numpy_dtype default_dtype = {} # Saving the most expensive methods for last From 6c91fc7ef7f1b95c594565644521e3445f70f3b7 Mon Sep 17 00:00:00 2001 From: matt Date: Thu, 26 Aug 2021 17:59:05 +0100 Subject: [PATCH 20/45] Style pass --- src/datasets/formatting/tf_formatter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/datasets/formatting/tf_formatter.py b/src/datasets/formatting/tf_formatter.py index d000035647f..a54d69a928e 100644 --- a/src/datasets/formatting/tf_formatter.py +++ b/src/datasets/formatting/tf_formatter.py @@ -35,7 +35,7 @@ def __init__(self, **tf_tensor_kwargs): def _tensorize(self, value): import tensorflow as tf - if 'dtype' not in self.tf_tensor_kwargs: + if "dtype" not in self.tf_tensor_kwargs: if np.issubdtype(value.dtype, np.integer): np_dtype = np.int64 tf_dtype = tf.int64 @@ -49,7 +49,7 @@ def _tensorize(self, value): tf_dtype = None default_dtype = {} else: - tf_dtype = self.tf_tensor_kwargs['dtype'] + tf_dtype = self.tf_tensor_kwargs["dtype"] np_dtype = tf_dtype.as_numpy_dtype default_dtype = {} From 195486239824b9651973a1a685d237183787f3f0 Mon Sep 17 00:00:00 2001 From: matt Date: Tue, 31 Aug 2021 14:19:39 +0100 Subject: [PATCH 21/45] Updating tests --- tests/test_formatting.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 
deletions(-) diff --git a/tests/test_formatting.py b/tests/test_formatting.py index cca0cb127bc..3c50cbd98cf 100644 --- a/tests/test_formatting.py +++ b/tests/test_formatting.py @@ -164,20 +164,20 @@ def test_tf_formatter(self): pa_table = self._create_dummy_table() formatter = TFFormatter() row = formatter.format_row(pa_table) - tf.debugging.assert_equal(row["a"], tf.ragged.constant(_COL_A, dtype=tf.int64)[0]) - tf.debugging.assert_equal(row["b"], tf.ragged.constant(_COL_B, dtype=tf.string)[0]) - tf.debugging.assert_equal(row["c"], tf.ragged.constant(_COL_C, dtype=tf.float32)[0]) + tf.debugging.assert_equal(row["a"], tf.convert_to_tensor(_COL_A, dtype=tf.int64)[0]) + tf.debugging.assert_equal(row["b"], tf.convert_to_tensor(_COL_B, dtype=tf.string)[0]) + tf.debugging.assert_equal(row["c"], tf.convert_to_tensor(_COL_C, dtype=tf.float32)[0]) col = formatter.format_column(pa_table) tf.debugging.assert_equal(col, tf.ragged.constant(_COL_A, dtype=tf.int64)) batch = formatter.format_batch(pa_table) - tf.debugging.assert_equal(batch["a"], tf.ragged.constant(_COL_A, dtype=tf.int64)) - tf.debugging.assert_equal(batch["b"], tf.ragged.constant(_COL_B, dtype=tf.string)) - self.assertIsInstance(batch["c"], tf.RaggedTensor) + tf.debugging.assert_equal(batch["a"], tf.convert_to_tensor(_COL_A, dtype=tf.int64)) + tf.debugging.assert_equal(batch["b"], tf.convert_to_tensor(_COL_B, dtype=tf.string)) + self.assertIsInstance(batch["c"], tf.Tensor) self.assertEqual(batch["c"].dtype, tf.float32) tf.debugging.assert_equal( - batch["c"].bounding_shape(), tf.ragged.constant(_COL_C, dtype=tf.float32).bounding_shape() + batch["c"].shape.as_list(), tf.convert_to_tensor(_COL_C, dtype=tf.float32).shape.as_list() ) - tf.debugging.assert_equal(batch["c"].flat_values, tf.ragged.constant(_COL_C, dtype=tf.float32).flat_values) + tf.debugging.assert_equal(tf.convert_to_tensor(batch["c"]), tf.convert_to_tensor(_COL_C, dtype=tf.float32)) @require_tf def test_tf_formatter_np_array_kwargs(self): From 7f2a8f10db715de42137e92f2257c6a415d2b495 Mon Sep 17 00:00:00 2001 From: matt Date: Tue, 31 Aug 2021 14:31:37 +0100 Subject: [PATCH 22/45] Updating tests --- tests/test_arrow_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py index 5eda8c85bec..58eed9b638d 100644 --- a/tests/test_arrow_dataset.py +++ b/tests/test_arrow_dataset.py @@ -1816,8 +1816,8 @@ def test_format_vectors(self, in_memory): self.assertIsInstance(dset[0][col], (tf.Tensor, tf.RaggedTensor)) self.assertIsInstance(dset[:2][col], (tf.Tensor, tf.RaggedTensor)) self.assertIsInstance(dset[col], (tf.Tensor, tf.RaggedTensor)) - self.assertEqual(tuple(dset[:2]["vec"].shape), (2, None)) - self.assertEqual(tuple(dset["vec"][:2].shape), (2, None)) + self.assertEqual(tuple(dset[:2]["vec"].shape), (2, 3)) + self.assertEqual(tuple(dset["vec"][:2].shape), (2, 3)) dset.set_format("numpy") self.assertIsNotNone(dset[0]) From 6eef188b10acd7b692d55485d0aabb264773389e Mon Sep 17 00:00:00 2001 From: matt Date: Thu, 2 Sep 2021 14:54:56 +0100 Subject: [PATCH 23/45] Fixing things so they work in TF2.6 --- src/datasets/arrow_dataset.py | 146 ++++++++++++---------------------- 1 file changed, 51 insertions(+), 95 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index fbd600ec5f7..c7956931dc8 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -164,56 +164,6 @@ class TensorflowDatasetMixIn: def __init__(self): pass - @staticmethod - def 
_get_output_signature(dataset, test_batch, batch_size): - import tensorflow as tf - - signatures = dict() - for column, col_feature in dataset.features.items(): - if hasattr(col_feature, "feature"): - dtype_str = col_feature.feature.dtype - else: - dtype_str = col_feature.dtype - if dtype_str.startswith("int") or dtype_str.startswith("uint"): - dtype = tf.int64 - elif dtype_str.startswith("float"): - dtype = tf.float32 - else: - raise ValueError(f"Could not convert datatype {dtype_str} in column {column}!") - - if isinstance(col_feature, (Value, ClassLabel)): - shape = [batch_size] - elif isinstance(col_feature, _ArrayXD): - shape = [batch_size] + list(col_feature.shape) - elif isinstance(col_feature, Sequence): - shape = [batch_size, col_feature.length] - else: - raise ValueError( - f"Couldn't parse feature {column} with type {type(col_feature)}! " - "This may indicate a column was included with an unusual datatype " - "that we were unable to process correctly. " - "If you're getting this error with one of our datasets, and you're " - "sure the column should be convertable to tf.Tensor, please " - "file an issue at github.com/huggingface/datasets and tag " - "@rocketknight1!" - ) - shape = [dim if dim != -1 else None for dim in shape] - - signatures[column] = tf.TensorSpec(shape=shape, dtype=dtype) - - # Catching columns added by the collate_fn, such as MLM labels - for column, tensor in test_batch.items(): - if column in signatures: - continue - if column.startswith("label") and "input_ids" in signatures: - shape = signatures["input_ids"].shape - else: - # If this doesn't look like LM labels that got added by the collate_fn, let's not say anything - # about the dimensions we're unsure of - shape = [batch_size] + [None for dim in tensor.shape.as_list()[1:]] - signatures[column] = tf.TensorSpec(shape=shape, dtype=tensor.dtype) - return signatures - def to_tf_dataset( self, columns, @@ -253,52 +203,59 @@ def to_tf_dataset( # We assume that if you're shuffling it's the train set, so we drop the remainder unless told not to drop_remainder = shuffle dataset = self.remove_columns([col for col in self.features if col not in cols_to_retain]) - if drop_remainder: - num_batches = floor(len(dataset) / batch_size) # Division rounding down ( // still returns a float!) 
- else: - num_batches = ceil(len(dataset) / batch_size) # Division rounding up - - def tf_generator(): - if shuffle: - epoch_dataset = dataset.shuffle(load_from_cache_file=False, seed=randint(0, 2 ** 32 - 1)) - else: - epoch_dataset = dataset - if collate_fn is None: - epoch_dataset.set_format("tensorflow") # Will return ragged tensors + self.set_format("numpy") + + def numpy_pad(data): + # Get lengths of each row of data + lens = np.array([len(i) for i in data]) + + # Mask of valid places in each row + mask = np.arange(lens.max()) < lens[:, None] + + # Setup output array and put elements from data into masked positions + out = np.zeros(mask.shape, dtype=data.dtype) + out[mask] = np.concatenate(data) + return out + + def np_get_batch(indices): + batch = self[indices] + out_batch = [] + if collate_fn is not None: + actual_size = len(list(batch.values())[0]) # Get the length of one of the arrays, assume all same + # Our collators expect a list of dicts, not a dict of lists/arrays, so we invert + batch = [{key: value[i] for key, value in batch.items()} for i in range(actual_size)] + batch = collate_fn(batch, **collate_fn_args) + for key in cols_to_retain: + # In case the collate_fn returns something strange + array = np.array(batch[key]) + cast_dtype = np.int64 if np.issubdtype(array.dtype, np.integer) else np.float32 + array = array.astype(cast_dtype) + out_batch.append(array) else: - epoch_dataset.set_format("python") # List of possibly variable lists - for i in range(num_batches): - batch = epoch_dataset[i * batch_size : (i + 1) * batch_size] - if collate_fn is not None: - actual_size = len(list(batch.values())[0]) # Get the length of one of the arrays, assume all same - # Our collators expect a list of dicts, not a dict of lists/arrays, so we invert - batch = [{key: value[i] for key, value in batch.items()} for i in range(actual_size)] - batch = collate_fn(batch, **collate_fn_args) - for key in list(batch.keys()): - # In case the collate_fn returns something strange - tensor = tf.convert_to_tensor(batch[key]) - cast_dtype = tf.int64 if tensor.dtype.is_integer else tf.float32 - if tensor.dtype != cast_dtype: - tensor = tf.cast(tensor, cast_dtype) - batch[key] = tensor - else: - for key in list(batch.keys()): - tensor = batch[key] - if isinstance(tensor, tf.RaggedTensor): - tensor = tensor.to_tensor() - cast_dtype = tf.int64 if tensor.dtype.is_integer else tf.float32 - if tensor.dtype != cast_dtype: - tensor = tf.cast(tensor, cast_dtype) - batch[key] = tensor - yield dict(batch) - - test_batch = next(tf_generator()) - - gen_signature = self._get_output_signature( - dataset, test_batch=test_batch, batch_size=batch_size if drop_remainder else None - ) + for key in cols_to_retain: + array = batch[key] + if array.dtype == np.object: + array = numpy_pad(array) + cast_dtype = np.int64 if np.issubdtype(array.dtype, np.integer) else np.float32 + array = array.astype(cast_dtype) + out_batch.append(array) + return [tf.convert_to_tensor(arr) for arr in out_batch] + + test_batch = np_get_batch(np.arange(batch_size)) + + @tf.function(input_signature=[tf.TensorSpec(None, tf.int64)]) + def fetch_function(indices): + output = tf.numpy_function( + np_get_batch, inp=[indices], Tout=[tf.dtypes.as_dtype(arr.dtype) for arr in test_batch] + ) + return {key: output[i] for i, key in enumerate(cols_to_retain)} - tf_dataset = tf.data.Dataset.from_generator(tf_generator, output_signature=gen_signature) + tf_dataset = ( + tf.data.Dataset.from_tensor_slices(np.arange(len(dataset))) + .shuffle(len(dataset)) + 
.batch(batch_size, drop_remainder=drop_remainder) + .map(fetch_function) + ) if label_cols: @@ -316,7 +273,6 @@ def split_features_and_labels(input_batch): elif len(columns) == 1: tf_dataset = tf_dataset.map(lambda x: list(x.values())[0]) - tf_dataset = tf_dataset.apply(tf.data.experimental.assert_cardinality(num_batches)) if prefetch: tf_dataset = tf_dataset.prefetch(tf.data.AUTOTUNE) return tf_dataset From a63dfb949bff50d8e57ffab614032ebc508c5ba0 Mon Sep 17 00:00:00 2001 From: matt Date: Thu, 2 Sep 2021 14:56:43 +0100 Subject: [PATCH 24/45] Style pass --- src/datasets/arrow_dataset.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index c7956931dc8..3c7bb6da491 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -29,7 +29,6 @@ from functools import partial, wraps from math import ceil, floor from pathlib import Path -from random import randint from typing import TYPE_CHECKING, Any, BinaryIO, Callable, Dict, Iterator, List, Optional, Tuple, Union import fsspec @@ -45,7 +44,7 @@ from . import config, utils from .arrow_reader import ArrowReader from .arrow_writer import ArrowWriter, OptimizedTypedSequence -from .features import ClassLabel, Features, Sequence, Value, _ArrayXD, cast_to_python_objects +from .features import ClassLabel, Features, Value, cast_to_python_objects from .filesystems import extract_path_from_uri, is_remote_filesystem from .fingerprint import ( fingerprint_transform, From d7048a43a416c1e6c0034b61caaf4e05f1c57372 Mon Sep 17 00:00:00 2001 From: matt Date: Thu, 2 Sep 2021 17:28:20 +0100 Subject: [PATCH 25/45] Correctly set output shapes - fixes a whole lot of issues --- src/datasets/arrow_dataset.py | 62 ++++++++++++++++++++++++++++++++++- 1 file changed, 61 insertions(+), 1 deletion(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 3c7bb6da491..25c5d3db2a9 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -44,7 +44,7 @@ from . import config, utils from .arrow_reader import ArrowReader from .arrow_writer import ArrowWriter, OptimizedTypedSequence -from .features import ClassLabel, Features, Value, cast_to_python_objects +from .features import ClassLabel, Features, Value, cast_to_python_objects, _ArrayXD, Sequence from .filesystems import extract_path_from_uri, is_remote_filesystem from .fingerprint import ( fingerprint_transform, @@ -163,6 +163,56 @@ class TensorflowDatasetMixIn: def __init__(self): pass + @staticmethod + def _get_output_signature(dataset, test_batch, batch_size): + import tensorflow as tf + + signatures = dict() + for column, col_feature in dataset.features.items(): + if hasattr(col_feature, "feature"): + dtype_str = col_feature.feature.dtype + else: + dtype_str = col_feature.dtype + if dtype_str.startswith("int") or dtype_str.startswith("uint"): + dtype = tf.int64 + elif dtype_str.startswith("float"): + dtype = tf.float32 + else: + raise ValueError(f"Could not convert datatype {dtype_str} in column {column}!") + + if isinstance(col_feature, (Value, ClassLabel)): + shape = [batch_size] + elif isinstance(col_feature, _ArrayXD): + shape = [batch_size] + list(col_feature.shape) + elif isinstance(col_feature, Sequence): + shape = [batch_size, col_feature.length] + else: + raise ValueError( + f"Couldn't parse feature {column} with type {type(col_feature)}! " + "This may indicate a column was included with an unusual datatype " + "that we were unable to process correctly. 
" + "If you're getting this error with one of our datasets, and you're " + "sure the column should be convertable to tf.Tensor, please " + "file an issue at github.com/huggingface/datasets and tag " + "@rocketknight1!" + ) + shape = [dim if dim != -1 else None for dim in shape] + + signatures[column] = tf.TensorSpec(shape=shape, dtype=dtype) + + # Catching columns added by the collate_fn, such as MLM labels + for column, tensor in test_batch.items(): + if column in signatures: + continue + if column.startswith("label") and "input_ids" in signatures: + shape = signatures["input_ids"].shape + else: + # If this doesn't look like LM labels that got added by the collate_fn, let's not say anything + # about the dimensions we're unsure of + shape = [batch_size] + [None for dim in tensor.shape.as_list()[1:]] + signatures[column] = tf.TensorSpec(shape=shape, dtype=tensor.dtype) + return signatures + def to_tf_dataset( self, columns, @@ -242,6 +292,7 @@ def np_get_batch(indices): test_batch = np_get_batch(np.arange(batch_size)) + @tf.function(input_signature=[tf.TensorSpec(None, tf.int64)]) def fetch_function(indices): output = tf.numpy_function( @@ -249,11 +300,20 @@ def fetch_function(indices): ) return {key: output[i] for i, key in enumerate(cols_to_retain)} + test_batch_dict = {key: test_batch[i] for i, key in enumerate(cols_to_retain)} + output_signature = self._get_output_signature(dataset, test_batch_dict, + batch_size=batch_size if drop_remainder else None) + + def ensure_shapes(input_dict): + return {key: tf.ensure_shape(val, output_signature[key].shape) + for key, val in input_dict.items()} + tf_dataset = ( tf.data.Dataset.from_tensor_slices(np.arange(len(dataset))) .shuffle(len(dataset)) .batch(batch_size, drop_remainder=drop_remainder) .map(fetch_function) + .map(ensure_shapes) ) if label_cols: From 56ea08fac7465f83b66c81fe19701e024208660a Mon Sep 17 00:00:00 2001 From: matt Date: Tue, 7 Sep 2021 12:38:46 +0100 Subject: [PATCH 26/45] Fix an embarrassing regression bug --- src/datasets/arrow_dataset.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 25c5d3db2a9..c4ee8078b99 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -252,11 +252,14 @@ def to_tf_dataset( # We assume that if you're shuffling it's the train set, so we drop the remainder unless told not to drop_remainder = shuffle dataset = self.remove_columns([col for col in self.features if col not in cols_to_retain]) - self.set_format("numpy") + dataset.set_format("python") def numpy_pad(data): # Get lengths of each row of data lens = np.array([len(i) for i in data]) + if np.all(lens == lens[0]): + # All data has the same length, no padding required + return np.array(data) # Mask of valid places in each row mask = np.arange(lens.max()) < lens[:, None] @@ -267,7 +270,7 @@ def numpy_pad(data): return out def np_get_batch(indices): - batch = self[indices] + batch = dataset[indices] out_batch = [] if collate_fn is not None: actual_size = len(list(batch.values())[0]) # Get the length of one of the arrays, assume all same From 2ddf7c6906abd1e496ada3c65534eea096da3a9a Mon Sep 17 00:00:00 2001 From: matt Date: Tue, 7 Sep 2021 12:41:28 +0100 Subject: [PATCH 27/45] Style pass --- src/datasets/arrow_dataset.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index c4ee8078b99..052ab255800 100644 --- a/src/datasets/arrow_dataset.py 
+++ b/src/datasets/arrow_dataset.py @@ -44,7 +44,7 @@ from . import config, utils from .arrow_reader import ArrowReader from .arrow_writer import ArrowWriter, OptimizedTypedSequence -from .features import ClassLabel, Features, Value, cast_to_python_objects, _ArrayXD, Sequence +from .features import ClassLabel, Features, Sequence, Value, _ArrayXD, cast_to_python_objects from .filesystems import extract_path_from_uri, is_remote_filesystem from .fingerprint import ( fingerprint_transform, @@ -295,7 +295,6 @@ def np_get_batch(indices): test_batch = np_get_batch(np.arange(batch_size)) - @tf.function(input_signature=[tf.TensorSpec(None, tf.int64)]) def fetch_function(indices): output = tf.numpy_function( @@ -304,12 +303,12 @@ def fetch_function(indices): return {key: output[i] for i, key in enumerate(cols_to_retain)} test_batch_dict = {key: test_batch[i] for i, key in enumerate(cols_to_retain)} - output_signature = self._get_output_signature(dataset, test_batch_dict, - batch_size=batch_size if drop_remainder else None) + output_signature = self._get_output_signature( + dataset, test_batch_dict, batch_size=batch_size if drop_remainder else None + ) def ensure_shapes(input_dict): - return {key: tf.ensure_shape(val, output_signature[key].shape) - for key, val in input_dict.items()} + return {key: tf.ensure_shape(val, output_signature[key].shape) for key, val in input_dict.items()} tf_dataset = ( tf.data.Dataset.from_tensor_slices(np.arange(len(dataset))) From ddfda69273f882f5b7abf09d7797f75341da7cdb Mon Sep 17 00:00:00 2001 From: matt Date: Wed, 8 Sep 2021 13:06:57 +0100 Subject: [PATCH 28/45] Added `config.TF_AVAILABLE` checks and dict literals --- src/datasets/arrow_dataset.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 052ab255800..2f63327b177 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -165,9 +165,12 @@ def __init__(self): @staticmethod def _get_output_signature(dataset, test_batch, batch_size): - import tensorflow as tf + if config.TF_AVAILABLE: + import tensorflow as tf + else: + raise ImportError("Called a Tensorflow-specific function but could not import it!") - signatures = dict() + signatures = {} for column, col_feature in dataset.features.items(): if hasattr(col_feature, "feature"): dtype_str = col_feature.feature.dtype @@ -224,10 +227,13 @@ def to_tf_dataset( label_cols=None, prefetch=True, ): - import tensorflow as tf + if config.TF_AVAILABLE: + import tensorflow as tf + else: + raise ImportError("Called a Tensorflow-specific function but could not import it!") if collate_fn_args is None: - collate_fn_args = dict() + collate_fn_args = {} if label_cols is None: label_cols = [] @@ -265,7 +271,7 @@ def numpy_pad(data): mask = np.arange(lens.max()) < lens[:, None] # Setup output array and put elements from data into masked positions - out = np.zeros(mask.shape, dtype=data.dtype) + out = np.zeros(mask.shape, dtype=np.array(data[0]).dtype) out[mask] = np.concatenate(data) return out @@ -286,8 +292,7 @@ def np_get_batch(indices): else: for key in cols_to_retain: array = batch[key] - if array.dtype == np.object: - array = numpy_pad(array) + array = numpy_pad(array) cast_dtype = np.int64 if np.issubdtype(array.dtype, np.integer) else np.float32 array = array.astype(cast_dtype) out_batch.append(array) From c87d47ed6b8b1957a82055898c05fe90e4100504 Mon Sep 17 00:00:00 2001 From: matt Date: Thu, 9 Sep 2021 16:58:36 +0100 Subject: [PATCH 29/45] 
Handling for special cases around label/labels and very nested dtypes --- src/datasets/arrow_dataset.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 2f63327b177..690836b4010 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -172,10 +172,10 @@ def _get_output_signature(dataset, test_batch, batch_size): signatures = {} for column, col_feature in dataset.features.items(): - if hasattr(col_feature, "feature"): - dtype_str = col_feature.feature.dtype - else: - dtype_str = col_feature.dtype + dtype_feature = col_feature + while hasattr(dtype_feature, "feature"): # Descend this godforsaken nested rabbit hole as long as it takes + dtype_feature = dtype_feature.feature + dtype_str = dtype_feature.dtype if dtype_str.startswith("int") or dtype_str.startswith("uint"): dtype = tf.int64 elif dtype_str.startswith("float"): @@ -251,6 +251,9 @@ def to_tf_dataset( cols_to_retain = list(set(columns + label_cols)) else: cols_to_retain = columns + # Special casing when the dataset has 'label' and the model expects 'labels' and the collator fixes it up for us + if "labels" in cols_to_retain and "labels" not in self.features and "label" in self.features: + cols_to_retain[cols_to_retain.index("labels")] = "label" for col in cols_to_retain: if col not in self.features: raise ValueError(f"Couldn't find column {col} in dataset!") @@ -283,6 +286,10 @@ def np_get_batch(indices): # Our collators expect a list of dicts, not a dict of lists/arrays, so we invert batch = [{key: value[i] for key, value in batch.items()} for i in range(actual_size)] batch = collate_fn(batch, **collate_fn_args) + # Special casing when the dataset has 'label' and the model + # expects 'labels' and the collator fixes it up for us + if "label" in cols_to_retain and "label" not in batch and "labels" in batch: + cols_to_retain[cols_to_retain.index("label")] = "labels" for key in cols_to_retain: # In case the collate_fn returns something strange array = np.array(batch[key]) From e7d1ce8f7c3ed83dad06a11fc747938c4f79d15a Mon Sep 17 00:00:00 2001 From: matt Date: Fri, 10 Sep 2021 15:25:16 +0100 Subject: [PATCH 30/45] Fix for accidentally shuffling even when flag was False --- src/datasets/arrow_dataset.py | 56 ++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 24 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 690836b4010..32781f9b2ff 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -183,22 +183,26 @@ def _get_output_signature(dataset, test_batch, batch_size): else: raise ValueError(f"Could not convert datatype {dtype_str} in column {column}!") - if isinstance(col_feature, (Value, ClassLabel)): - shape = [batch_size] - elif isinstance(col_feature, _ArrayXD): - shape = [batch_size] + list(col_feature.shape) - elif isinstance(col_feature, Sequence): - shape = [batch_size, col_feature.length] - else: - raise ValueError( - f"Couldn't parse feature {column} with type {type(col_feature)}! " - "This may indicate a column was included with an unusual datatype " - "that we were unable to process correctly. " - "If you're getting this error with one of our datasets, and you're " - "sure the column should be convertable to tf.Tensor, please " - "file an issue at github.com/huggingface/datasets and tag " - "@rocketknight1!" 
+ shape = [] + shape_feature = col_feature + while not isinstance(shape_feature, (Value, ClassLabel)): + if isinstance(shape_feature, _ArrayXD): + shape.extend(list(shape_feature.shape)) + break + elif isinstance(shape_feature, Sequence): + shape.insert(0, shape_feature.length) + shape_feature = shape_feature.feature + else: + raise ValueError( + f"Couldn't parse feature {column} with type {type(col_feature)}! " + "This may indicate a column was included with an unusual datatype " + "that we were unable to process correctly. " + "If you're getting this error with one of our datasets, and you're " + "sure the column should be convertable to tf.Tensor, please " + "file an issue at github.com/huggingface/datasets and tag " + "@rocketknight1!" ) + shape = [batch_size] + shape shape = [dim if dim != -1 else None for dim in shape] signatures[column] = tf.TensorSpec(shape=shape, dtype=dtype) @@ -207,8 +211,13 @@ def _get_output_signature(dataset, test_batch, batch_size): for column, tensor in test_batch.items(): if column in signatures: continue - if column.startswith("label") and "input_ids" in signatures: - shape = signatures["input_ids"].shape + if column.startswith("label"): + if "input_ids" in signatures and test_batch[column].shape == test_batch['input_ids'].shape: + shape = signatures["input_ids"].shape + else: + # If this doesn't look like LM labels that got added by the collate_fn, let's not say anything + # about the dimensions we're unsure of + shape = [batch_size] + [None for dim in tensor.shape.as_list()[1:]] else: # If this doesn't look like LM labels that got added by the collate_fn, let's not say anything # about the dimensions we're unsure of @@ -322,13 +331,12 @@ def fetch_function(indices): def ensure_shapes(input_dict): return {key: tf.ensure_shape(val, output_signature[key].shape) for key, val in input_dict.items()} - tf_dataset = ( - tf.data.Dataset.from_tensor_slices(np.arange(len(dataset))) - .shuffle(len(dataset)) - .batch(batch_size, drop_remainder=drop_remainder) - .map(fetch_function) - .map(ensure_shapes) - ) + tf_dataset = tf.data.Dataset.from_tensor_slices(np.arange(len(dataset))) + + if shuffle: + tf_dataset = tf_dataset.shuffle(len(dataset)) + + tf_dataset = tf_dataset.batch(batch_size, drop_remainder=drop_remainder).map(fetch_function).map(ensure_shapes) if label_cols: From 48045fb00e806e2ff375a3fb54e1b2a7b03b6a2f Mon Sep 17 00:00:00 2001 From: matt Date: Tue, 14 Sep 2021 11:44:58 +0100 Subject: [PATCH 31/45] Adding dummy labels by default --- src/datasets/arrow_dataset.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 32781f9b2ff..3261697eb1e 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -235,6 +235,7 @@ def to_tf_dataset( collate_fn_args=None, label_cols=None, prefetch=True, + dummy_labels=True ): if config.TF_AVAILABLE: import tensorflow as tf @@ -354,6 +355,16 @@ def split_features_and_labels(input_batch): elif len(columns) == 1: tf_dataset = tf_dataset.map(lambda x: list(x.values())[0]) + if dummy_labels and not label_cols: + print("Warning: No label_cols specified - adding some dummy labels to ensure fit() works correctly. 
If you " + "only want to use this dataset with predict() or custom training loops, you can disable this " + "behaviour by setting dummy_labels to False.") + + def add_dummy_labels(input_batch): + return input_batch, tf.zeros(tf.shape(input_batch[columns[0]])[0]) + + tf_dataset = tf_dataset.map(add_dummy_labels) + if prefetch: tf_dataset = tf_dataset.prefetch(tf.data.AUTOTUNE) return tf_dataset From ec4f7d4b886a6dd76f713bef30bf486639503266 Mon Sep 17 00:00:00 2001 From: matt Date: Wed, 15 Sep 2021 12:46:26 +0100 Subject: [PATCH 32/45] Adding docstrings and type hints --- src/datasets/arrow_dataset.py | 45 +++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 3261697eb1e..e42823237e2 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -227,16 +227,41 @@ def _get_output_signature(dataset, test_batch, batch_size): def to_tf_dataset( self, - columns, - batch_size, - shuffle, - drop_remainder=None, - collate_fn=None, - collate_fn_args=None, - label_cols=None, - prefetch=True, - dummy_labels=True + columns: Union[str, List[str]], + batch_size: int, + shuffle: bool, + drop_remainder: bool = None, + collate_fn: Callable = None, + collate_fn_args: Dict[str, Any] = None, + label_cols: Union[str, List[str]] = None, + dummy_labels: bool = True, + prefetch: bool = True ): + """Create a tf.data.Dataset from the underlying Dataset. This tf.data.Dataset will load and collate batches from + the Dataset, and is suitable for passing to methods like model.fit() or model.predict(). + + Args: + columns (:obj:`List[str]` or :obj:`str`): Dataset column(s) to load in the tf.data.Dataset. In general, + only columns that the model can use as input should be included here. + batch_size (:obj:`int`): Size of batches to load from the dataset. + shuffle(:obj:`bool`): Shuffle the dataset order when loading. Recommended True for training, False for + validation/evaluation. + drop_remainder(:obj:`bool`, default ``None``): Drop the last incomplete batch when loading. If not provided, + defaults to the same setting as shuffle. + collate_fn(:obj:`Callable`): A function or callable object (such as a `DataCollator`) that will collate + lists of samples into a batch. + collate_fn_args (:obj:`Dict`, optional): An optional `dict` of keyword arguments to be passed to the + `collate_fn`. + label_cols (:obj:`List[str]` or :obj:`str`, default ``None``): Dataset column(s) to load as + labels. Note that many models compute loss internally rather than letting Keras do it, in which case it is + not necessary to actually pass the labels here, as long as they're in the input `columns`. + dummy_labels (:obj:`bool`, default ``True``): If no `label_cols` are set, output an array of "dummy" labels + with each batch. This setting ensures that Keras `fit()` or `train_on_batch()` does not get confused + by the missing labels. + prefetch (:obj:`bool`, default ``True``): Whether to run the dataloader in a separate thread and maintain + a small buffer of batches for training. Improves performance by allowing data to be loaded in the + background while the model is training. 
+ """ if config.TF_AVAILABLE: import tensorflow as tf else: @@ -271,7 +296,7 @@ def to_tf_dataset( # We assume that if you're shuffling it's the train set, so we drop the remainder unless told not to drop_remainder = shuffle dataset = self.remove_columns([col for col in self.features if col not in cols_to_retain]) - dataset.set_format("python") + dataset.set_format("numpy") def numpy_pad(data): # Get lengths of each row of data From 88e9f1e1d3d235ac366a8f51e81d8e81ef125eaf Mon Sep 17 00:00:00 2001 From: matt Date: Wed, 15 Sep 2021 12:47:00 +0100 Subject: [PATCH 33/45] Style pass --- src/datasets/arrow_dataset.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index e42823237e2..86323106ba0 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -201,7 +201,7 @@ def _get_output_signature(dataset, test_batch, batch_size): "sure the column should be convertable to tf.Tensor, please " "file an issue at github.com/huggingface/datasets and tag " "@rocketknight1!" - ) + ) shape = [batch_size] + shape shape = [dim if dim != -1 else None for dim in shape] @@ -212,7 +212,7 @@ def _get_output_signature(dataset, test_batch, batch_size): if column in signatures: continue if column.startswith("label"): - if "input_ids" in signatures and test_batch[column].shape == test_batch['input_ids'].shape: + if "input_ids" in signatures and test_batch[column].shape == test_batch["input_ids"].shape: shape = signatures["input_ids"].shape else: # If this doesn't look like LM labels that got added by the collate_fn, let's not say anything @@ -235,7 +235,7 @@ def to_tf_dataset( collate_fn_args: Dict[str, Any] = None, label_cols: Union[str, List[str]] = None, dummy_labels: bool = True, - prefetch: bool = True + prefetch: bool = True, ): """Create a tf.data.Dataset from the underlying Dataset. This tf.data.Dataset will load and collate batches from the Dataset, and is suitable for passing to methods like model.fit() or model.predict(). @@ -381,9 +381,11 @@ def split_features_and_labels(input_batch): tf_dataset = tf_dataset.map(lambda x: list(x.values())[0]) if dummy_labels and not label_cols: - print("Warning: No label_cols specified - adding some dummy labels to ensure fit() works correctly. If you " - "only want to use this dataset with predict() or custom training loops, you can disable this " - "behaviour by setting dummy_labels to False.") + print( + "Warning: No label_cols specified - adding some dummy labels to ensure fit() works correctly. If you " + "only want to use this dataset with predict() or custom training loops, you can disable this " + "behaviour by setting dummy_labels to False." 
+ ) def add_dummy_labels(input_batch): return input_batch, tf.zeros(tf.shape(input_batch[columns[0]])[0]) From a7b45747ddd5068c6da741a7b1321228efe7e1fd Mon Sep 17 00:00:00 2001 From: matt Date: Wed, 15 Sep 2021 13:31:19 +0100 Subject: [PATCH 34/45] Add tests, bugfix to handling scalar columns --- src/datasets/arrow_dataset.py | 7 ++++--- tests/test_arrow_dataset.py | 15 +++++++++++++++ 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 86323106ba0..10a70974578 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -299,11 +299,12 @@ def to_tf_dataset( dataset.set_format("numpy") def numpy_pad(data): + try: + return np.array(data) + except: + pass # Get lengths of each row of data lens = np.array([len(i) for i in data]) - if np.all(lens == lens[0]): - # All data has the same length, no padding required - return np.array(data) # Mask of valid places in each row mask = np.arange(lens.max()) < lens[:, None] diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py index 58eed9b638d..34b3a2f5c54 100644 --- a/tests/test_arrow_dataset.py +++ b/tests/test_arrow_dataset.py @@ -1997,6 +1997,21 @@ def test_with_transform(self, in_memory): self.assertNotEqual(dset.format, dset2.format) self.assertNotEqual(dset._fingerprint, dset2._fingerprint) + @require_tf + def test_tf_dataset_conversion(self, in_memory): + with tempfile.TemporaryDirectory() as tmp_dir: + with self._create_dummy_dataset(in_memory, tmp_dir, array_features=True) as dset: + tf_dataset = dset.to_tf_dataset(columns="col_3", batch_size=4, shuffle=False, dummy_labels=False) + batch = next(iter(tf_dataset)) + self.assertEqual(batch.shape.as_list(), [4, 4]) + self.assertEqual(batch.dtype.name, "int64") + with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset: + tf_dataset = dset.to_tf_dataset(columns="col_1", batch_size=4, shuffle=False, dummy_labels=False) + batch = next(iter(tf_dataset)) + self.assertEqual(batch.shape.as_list(), [4]) + self.assertEqual(batch.dtype.name, "int64") + + class MiscellaneousDatasetTest(TestCase): def test_from_pandas(self): From b35267dc0e6a6658464275c0298991e22720f165 Mon Sep 17 00:00:00 2001 From: matt Date: Wed, 15 Sep 2021 13:31:36 +0100 Subject: [PATCH 35/45] Style pass --- tests/test_arrow_dataset.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py index 34b3a2f5c54..1f07d20e64a 100644 --- a/tests/test_arrow_dataset.py +++ b/tests/test_arrow_dataset.py @@ -2012,7 +2012,6 @@ def test_tf_dataset_conversion(self, in_memory): self.assertEqual(batch.dtype.name, "int64") - class MiscellaneousDatasetTest(TestCase): def test_from_pandas(self): data = {"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]} From 6273d737c8591937f217c2a8eb9d8eac9ea451bf Mon Sep 17 00:00:00 2001 From: matt Date: Wed, 15 Sep 2021 13:39:16 +0100 Subject: [PATCH 36/45] Fix to `numpy_pad` --- src/datasets/arrow_dataset.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 10a70974578..4ef250413c2 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -300,8 +300,13 @@ def to_tf_dataset( def numpy_pad(data): try: - return np.array(data) - except: + # When this is finally fully removed, remove this line + # Alternatively, find a more elegant way to do this whole thing + np.warnings.filterwarnings("error", 
category=np.VisibleDeprecationWarning) + data = np.array(data) + assert data.dtype != np.object + return data + except (np.VisibleDeprecationWarning, AssertionError): pass # Get lengths of each row of data lens = np.array([len(i) for i in data]) From 4ff6d2e81e5e5825b44f56026385e54b1157fcc0 Mon Sep 17 00:00:00 2001 From: matt Date: Wed, 15 Sep 2021 13:41:04 +0100 Subject: [PATCH 37/45] Replace assertion with more robust syntax --- src/datasets/arrow_dataset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 4ef250413c2..7b1e1967572 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -304,7 +304,8 @@ def numpy_pad(data): # Alternatively, find a more elegant way to do this whole thing np.warnings.filterwarnings("error", category=np.VisibleDeprecationWarning) data = np.array(data) - assert data.dtype != np.object + if data.dtype == np.object: + raise AssertionError # Do it this way so that the assert doesn't get optimized out return data except (np.VisibleDeprecationWarning, AssertionError): pass From 589c575df59c9e20f71646002bb59eb8bc51308d Mon Sep 17 00:00:00 2001 From: matt Date: Wed, 15 Sep 2021 14:04:58 +0100 Subject: [PATCH 38/45] Add cleanup deletion of tf_dataset in tests --- tests/test_arrow_dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py index 1f07d20e64a..6c12a9ccd73 100644 --- a/tests/test_arrow_dataset.py +++ b/tests/test_arrow_dataset.py @@ -2010,6 +2010,7 @@ def test_tf_dataset_conversion(self, in_memory): batch = next(iter(tf_dataset)) self.assertEqual(batch.shape.as_list(), [4]) self.assertEqual(batch.dtype.name, "int64") + del tf_dataset # For correct cleanup class MiscellaneousDatasetTest(TestCase): From d70fe9482c07fd283baad3682e9013ba67da1189 Mon Sep 17 00:00:00 2001 From: matt Date: Wed, 15 Sep 2021 14:52:56 +0100 Subject: [PATCH 39/45] Rebasing onto Master --- src/datasets/arrow_dataset.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 7b1e1967572..76c92c5ffd4 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -295,8 +295,7 @@ def to_tf_dataset( if drop_remainder is None: # We assume that if you're shuffling it's the train set, so we drop the remainder unless told not to drop_remainder = shuffle - dataset = self.remove_columns([col for col in self.features if col not in cols_to_retain]) - dataset.set_format("numpy") + dataset.set_format("numpy", columns=cols_to_retain) def numpy_pad(data): try: From a1897407d58edad773ec4b9bf4232fec9d51e682 Mon Sep 17 00:00:00 2001 From: matt Date: Wed, 15 Sep 2021 14:58:17 +0100 Subject: [PATCH 40/45] Fixes for the new approach --- src/datasets/arrow_dataset.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 76c92c5ffd4..4cc1ad5ea32 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -164,7 +164,7 @@ def __init__(self): pass @staticmethod - def _get_output_signature(dataset, test_batch, batch_size): + def _get_output_signature(dataset, cols_to_retain, test_batch, batch_size): if config.TF_AVAILABLE: import tensorflow as tf else: @@ -172,6 +172,8 @@ def _get_output_signature(dataset, test_batch, batch_size): signatures = {} for column, col_feature in dataset.features.items(): + if column not in 
cols_to_retain: + continue dtype_feature = col_feature while hasattr(dtype_feature, "feature"): # Descend this godforsaken nested rabbit hole as long as it takes dtype_feature = dtype_feature.feature @@ -295,7 +297,7 @@ def to_tf_dataset( if drop_remainder is None: # We assume that if you're shuffling it's the train set, so we drop the remainder unless told not to drop_remainder = shuffle - dataset.set_format("numpy", columns=cols_to_retain) + self.set_format("numpy", columns=cols_to_retain) def numpy_pad(data): try: @@ -320,7 +322,7 @@ def numpy_pad(data): return out def np_get_batch(indices): - batch = dataset[indices] + batch = self[indices] out_batch = [] if collate_fn is not None: actual_size = len(list(batch.values())[0]) # Get the length of one of the arrays, assume all same @@ -357,16 +359,16 @@ def fetch_function(indices): test_batch_dict = {key: test_batch[i] for i, key in enumerate(cols_to_retain)} output_signature = self._get_output_signature( - dataset, test_batch_dict, batch_size=batch_size if drop_remainder else None + self, cols_to_retain, test_batch_dict, batch_size=batch_size if drop_remainder else None ) def ensure_shapes(input_dict): return {key: tf.ensure_shape(val, output_signature[key].shape) for key, val in input_dict.items()} - tf_dataset = tf.data.Dataset.from_tensor_slices(np.arange(len(dataset))) + tf_dataset = tf.data.Dataset.from_tensor_slices(np.arange(len(self))) if shuffle: - tf_dataset = tf_dataset.shuffle(len(dataset)) + tf_dataset = tf_dataset.shuffle(len(self)) tf_dataset = tf_dataset.batch(batch_size, drop_remainder=drop_remainder).map(fetch_function).map(ensure_shapes) From c8f251bfdef59c56ca6f98b59336c4d9539f068f Mon Sep 17 00:00:00 2001 From: matt Date: Wed, 15 Sep 2021 15:39:59 +0100 Subject: [PATCH 41/45] Force dtype to ensure Windows compatibility --- src/datasets/arrow_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 4cc1ad5ea32..1936fd55ecd 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -365,7 +365,7 @@ def fetch_function(indices): def ensure_shapes(input_dict): return {key: tf.ensure_shape(val, output_signature[key].shape) for key, val in input_dict.items()} - tf_dataset = tf.data.Dataset.from_tensor_slices(np.arange(len(self))) + tf_dataset = tf.data.Dataset.from_tensor_slices(np.arange(len(self), dtype=np.int64)) if shuffle: tf_dataset = tf_dataset.shuffle(len(self)) From f1f88888bd4882ad023e720fb23aafc40524edf2 Mon Sep 17 00:00:00 2001 From: matt Date: Wed, 15 Sep 2021 16:13:12 +0100 Subject: [PATCH 42/45] Fixing things because I am bad at merging --- src/datasets/arrow_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 1936fd55ecd..93ea6747996 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -484,7 +484,7 @@ class NonExistentDatasetError(Exception): pass -class Dataset(DatasetInfoMixin, IndexableMixin): +class Dataset(DatasetInfoMixin, IndexableMixin, TensorflowDatasetMixIn): """A Dataset backed by an Arrow table.""" def __init__( From ef9a7bb211841c37a8a22c5b44a2a81dffd65b06 Mon Sep 17 00:00:00 2001 From: matt Date: Thu, 16 Sep 2021 13:37:23 +0100 Subject: [PATCH 43/45] Fix issues with passing a mutable list to columns argument --- src/datasets/arrow_dataset.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 
93ea6747996..96e32131e43 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -1603,12 +1603,16 @@ def set_format( # Check filter column if isinstance(columns, str): columns = [columns] + if isinstance(columns, tuple): + columns = list(columns) if columns is not None and any(col not in self._data.column_names for col in columns): raise ValueError( "Columns {} not in the dataset. Current columns in the dataset: {}".format( list(filter(lambda col: col not in self._data.column_names, columns)), self._data.column_names ) ) + if columns is not None: + columns = columns.copy() # Ensures modifications made to the list after this call don't cause bugs self._format_type = type self._format_kwargs = format_kwargs From b8523e44a27c795d69a36ae9abd5dde027fc94d9 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Thu, 16 Sep 2021 15:06:02 +0200 Subject: [PATCH 44/45] Update src/datasets/arrow_dataset.py --- src/datasets/arrow_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 96e32131e43..7570b5f7082 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -244,7 +244,7 @@ def to_tf_dataset( Args: columns (:obj:`List[str]` or :obj:`str`): Dataset column(s) to load in the tf.data.Dataset. In general, - only columns that the model can use as input should be included here. + only columns that the model can use as input should be included here (numeric data only). batch_size (:obj:`int`): Size of batches to load from the dataset. shuffle(:obj:`bool`): Shuffle the dataset order when loading. Recommended True for training, False for validation/evaluation. From 397bcb72ca4789d25f0bce07e43a23580812b7d2 Mon Sep 17 00:00:00 2001 From: matt Date: Thu, 16 Sep 2021 14:36:12 +0100 Subject: [PATCH 45/45] Fix unused import --- src/datasets/arrow_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 79bf85eafb9..cefc848eb00 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -44,7 +44,7 @@ from . import config, utils from .arrow_reader import ArrowReader from .arrow_writer import ArrowWriter, OptimizedTypedSequence -from .features import ClassLabel, Features, Sequence, Value, _ArrayXD, cast_to_python_objects +from .features import ClassLabel, Features, Sequence, Value, _ArrayXD from .filesystems import extract_path_from_uri, is_remote_filesystem from .fingerprint import ( fingerprint_transform,
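
Usage sketch: a minimal, hypothetical example of how to_tf_dataset() could be called once the
series above is applied. The toy dataset, column names and sizes below are illustrative
assumptions rather than anything taken from the patches, and tensorflow must be installed.
With no collate_fn, variable-length columns are zero-padded per batch, and label_cols splits
each batch into a (features, labels) pair for Keras.

    from datasets import Dataset

    # Hypothetical toy data: numeric columns only (string columns are not supported).
    ds = Dataset.from_dict(
        {
            "input_ids": [[1, 2, 3], [4, 5], [6, 7, 8, 9], [10]],
            "label": [0, 1, 0, 1],
        }
    )

    tf_ds = ds.to_tf_dataset(
        columns=["input_ids"],
        label_cols=["label"],
        batch_size=2,
        shuffle=True,  # drop_remainder defaults to the shuffle setting
    )

    features, labels = next(iter(tf_ds))
    # features["input_ids"]: tf.int64 tensor, rows zero-padded to the longest sample in the batch
    # labels: the "label" column for the batch, cast to tf.int64
    # tf_ds can be passed straight to tf.keras Model.fit() / Model.predict()

For transformer models one would typically pass a collator instead of relying on the built-in
padding, e.g. collate_fn=tokenizer.pad (which accepts the list of per-sample dicts that
np_get_batch builds) so that padding and any extra keys the collator adds are handled there.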