Skip to content

Commit

Permalink
Support pyarrow large_list (#7019)
Browse files Browse the repository at this point in the history
* Test polars round trip

* Test Features.from_arrow_schema

* Add large attribute to Sequence

* Update get_nested_type to support pa.large_list

* Update generate_from_arrow_type to support pa.LargeListType

* Fix typo

* Rename test

* Add require_polars to test

* Test from_polars large_list

* Update test array_cast with large list

* Support large list in array_cast

* Test cast_array_to_feature for large list

* Support large list in cast_array_to_feature

* Fix support large list in cast_array_to_feature

* Test save_to_disk with a dataset from polars with large_list

* Test Features.reorder_fields_as with large Sequence

* Fix Features.reorder_fields_as by using all Sequence params

* Test save_to/load_from disk round trip with large_list dataset

* Test DatasetInfo.from_dict with large Sequence

* Test Features to/from dict round trip with large Sequence

* Fix features generate_from_dict by using all Sequence params

* Remove debug comments

* Test cast_array_to_feature with struct array

* Fix cast_array_to_feature for struct array

* Test cast_array_to_feature from/to the same Sequence feature dtype

* Fix cast_array_to_feature for the same Sequence feature dtype

* Add more tests for dataset with large Sequence

* Remove Sequence.large

* Remove Sequence.large from tests

* Add LargeList to tests

* Replace tests with Sequence.large with LargeList

* Replace Sequence.large with LargeList in test_dataset_info_from_dict

* Implement LargeList

* Test features to_yaml_list with LargeList

* Support LargeList in Features._to_yaml_list

* Test Features.from_dict with LargeList

* Support LargeList in Features.from_dict

* Test Features from_yaml_list with LargeList

* Support LargeList in Features._from_yaml_list

* Test get_nested_type with scalar/list features

* Support LargeList in get_nested_type

* Test generate_from_arrow_type with primitive/nested data types

* Support LargeList in generate_from_arrow_type

* Remove Sequence of dict from test cast_array_to_feature

* Support LargeList in cast_array_to_feature

* Test Features.encode_example

* Test encode_nested_example with list types

* Support LargeList in encode_nested_example

* Test check_non_null_non_empty_recursive with list types

* Support LargeList in check_non_null_non_empty_recursive

* Test require_decoding with list types

* Support LargeList in require_decoding

* Test decode_nested_example with list types

* Support LargeList in decode_nested_example

* Test generate_from_dict with list types

* Test Features.from_dict with list types

* Test _visit with list types

* Support LargeList in _visit

* Test require_storage_cast with list types

* Support LargeList in require_storage_cast

* Refactor test_require_storage_cast_with_list_types

* Test require_storage_embed with list types

* Support LargeList in require_storage_embed

* Fix test_features_reorder_fields_as

* Test Features.reorder_fields_as with list types

* Test Features.reorder_fields_as with dict within list types

* Support LargeList in Features.reorder_fields_as

* Test Features.flatten with list types

* Test embed_array_storage with list types

* Support LargeList in embed_array_storage

* Delete unused tf_utils.is_numeric_feature

* Add LargeList docstring

* Add LargeList to main classes docs

* Address requested changes
  • Loading branch information
albertvillanova committed Aug 14, 2024
1 parent 51591c0 commit 1654414
Show file tree
Hide file tree
Showing 9 changed files with 643 additions and 94 deletions.
6 changes: 4 additions & 2 deletions docs/source/package_reference/main_classes.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -211,11 +211,13 @@ Dictionary with split names as keys ('train', 'test' for example), and `Iterable

[[autodoc]] datasets.Features

[[autodoc]] datasets.Sequence
[[autodoc]] datasets.Value

[[autodoc]] datasets.ClassLabel

[[autodoc]] datasets.Value
[[autodoc]] datasets.LargeList

[[autodoc]] datasets.Sequence

[[autodoc]] datasets.Translation

Expand Down
3 changes: 2 additions & 1 deletion src/datasets/features/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,14 @@
"Array5D",
"ClassLabel",
"Features",
"LargeList",
"Sequence",
"Value",
"Image",
"Translation",
"TranslationVariableLanguages",
]
from .audio import Audio
from .features import Array2D, Array3D, Array4D, Array5D, ClassLabel, Features, Sequence, Value
from .features import Array2D, Array3D, Array4D, Array5D, ClassLabel, Features, LargeList, Sequence, Value
from .image import Image
from .translation import Translation, TranslationVariableLanguages
170 changes: 114 additions & 56 deletions src/datasets/features/features.py

Large diffs are not rendered by default.

46 changes: 37 additions & 9 deletions src/datasets/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -1884,7 +1884,7 @@ def array_cast(
return array
arrays = [_c(array.field(field.name), field.type) for field in pa_type]
return pa.StructArray.from_arrays(arrays, fields=list(pa_type), mask=array.is_null())
elif pa.types.is_list(array.type):
elif pa.types.is_list(array.type) or pa.types.is_large_list(array.type):
if pa.types.is_fixed_size_list(pa_type):
if _are_list_values_of_length(array, pa_type.list_size):
if array.null_count > 0:
Expand All @@ -1911,6 +1911,10 @@ def array_cast(
# Merge offsets with the null bitmap to avoid the "Null bitmap with offsets slice not supported" ArrowNotImplementedError
array_offsets = _combine_list_array_offsets_with_mask(array)
return pa.ListArray.from_arrays(array_offsets, _c(array.values, pa_type.value_type))
elif pa.types.is_large_list(pa_type):
# Merge offsets with the null bitmap to avoid the "Null bitmap with offsets slice not supported" ArrowNotImplementedError
array_offsets = _combine_list_array_offsets_with_mask(array)
return pa.LargeListArray.from_arrays(array_offsets, _c(array.values, pa_type.value_type))
elif pa.types.is_fixed_size_list(array.type):
if pa.types.is_fixed_size_list(pa_type):
if pa_type.list_size == array.type.list_size:
Expand All @@ -1923,6 +1927,11 @@ def array_cast(
elif pa.types.is_list(pa_type):
array_offsets = (np.arange(len(array) + 1) + array.offset) * array.type.list_size
return pa.ListArray.from_arrays(array_offsets, _c(array.values, pa_type.value_type), mask=array.is_null())
elif pa.types.is_large_list(pa_type):
array_offsets = (np.arange(len(array) + 1) + array.offset) * array.type.list_size
return pa.LargeListArray.from_arrays(
array_offsets, _c(array.values, pa_type.value_type), mask=array.is_null()
)
else:
if pa.types.is_string(pa_type):
if not allow_primitive_to_str and pa.types.is_primitive(array.type):
Expand Down Expand Up @@ -1972,7 +1981,7 @@ def cast_array_to_feature(
Returns:
array (`pyarrow.Array`): the casted array
"""
from .features.features import Sequence, get_nested_type
from .features.features import LargeList, Sequence, get_nested_type

_c = partial(
cast_array_to_feature,
Expand All @@ -1988,24 +1997,34 @@ def cast_array_to_feature(
elif pa.types.is_struct(array.type):
# feature must be a dict or Sequence(subfeatures_dict)
if isinstance(feature, Sequence) and isinstance(feature.feature, dict):
feature = {
name: Sequence(subfeature, length=feature.length) for name, subfeature in feature.feature.items()
}
sequence_kwargs = vars(feature).copy()
feature = sequence_kwargs.pop("feature")
feature = {name: Sequence(subfeature, **sequence_kwargs) for name, subfeature in feature.items()}
if isinstance(feature, dict) and {field.name for field in array.type} == set(feature):
if array.type.num_fields == 0:
return array
arrays = [_c(array.field(name), subfeature) for name, subfeature in feature.items()]
return pa.StructArray.from_arrays(arrays, names=list(feature), mask=array.is_null())
elif pa.types.is_list(array.type):
# feature must be either [subfeature] or Sequence(subfeature)
elif pa.types.is_list(array.type) or pa.types.is_large_list(array.type):
# feature must be either [subfeature] or LargeList(subfeature) or Sequence(subfeature)
if isinstance(feature, list):
casted_array_values = _c(array.values, feature[0])
if casted_array_values.type == array.values.type:
if pa.types.is_list(array.type) and casted_array_values.type == array.values.type:
# Both array and feature have equal list type and values (within the list) type
return array
else:
# Merge offsets with the null bitmap to avoid the "Null bitmap with offsets slice not supported" ArrowNotImplementedError
array_offsets = _combine_list_array_offsets_with_mask(array)
return pa.ListArray.from_arrays(array_offsets, casted_array_values)
elif isinstance(feature, LargeList):
casted_array_values = _c(array.values, feature.dtype)
if pa.types.is_large_list(array.type) and casted_array_values.type == array.values.type:
# Both array and feature have equal large_list type and values (within the list) type
return array
else:
# Merge offsets with the null bitmap to avoid the "Null bitmap with offsets slice not supported" ArrowNotImplementedError
array_offsets = _combine_list_array_offsets_with_mask(array)
return pa.LargeListArray.from_arrays(array_offsets, casted_array_values)
elif isinstance(feature, Sequence):
if feature.length > -1:
if _are_list_values_of_length(array, feature.length):
Expand Down Expand Up @@ -2042,7 +2061,8 @@ def cast_array_to_feature(
return pa.FixedSizeListArray.from_arrays(_c(array_values, feature.feature), feature.length)
else:
casted_array_values = _c(array.values, feature.feature)
if casted_array_values.type == array.values.type:
if pa.types.is_list(array.type) and casted_array_values.type == array.values.type:
# Both array and feature have equal list type and values (within the list) type
return array
else:
# Merge offsets with the null bitmap to avoid the "Null bitmap with offsets slice not supported" ArrowNotImplementedError
Expand All @@ -2053,6 +2073,9 @@ def cast_array_to_feature(
if isinstance(feature, list):
array_offsets = (np.arange(len(array) + 1) + array.offset) * array.type.list_size
return pa.ListArray.from_arrays(array_offsets, _c(array.values, feature[0]), mask=array.is_null())
elif isinstance(feature, LargeList):
array_offsets = (np.arange(len(array) + 1) + array.offset) * array.type.list_size
return pa.LargeListArray.from_arrays(array_offsets, _c(array.values, feature.dtype), mask=array.is_null())
elif isinstance(feature, Sequence):
if feature.length > -1:
if feature.length == array.type.list_size:
Expand Down Expand Up @@ -2128,6 +2151,11 @@ def embed_array_storage(array: pa.Array, feature: "FeatureType"):
return pa.ListArray.from_arrays(array_offsets, _e(array.values, feature[0]))
if isinstance(feature, Sequence) and feature.length == -1:
return pa.ListArray.from_arrays(array_offsets, _e(array.values, feature.feature))
elif pa.types.is_large_list(array.type):
# feature must be LargeList(subfeature)
# Merge offsets with the null bitmap to avoid the "Null bitmap with offsets slice not supported" ArrowNotImplementedError
array_offsets = _combine_list_array_offsets_with_mask(array)
return pa.LargeListArray.from_arrays(array_offsets, _e(array.values, feature.dtype))
elif pa.types.is_fixed_size_list(array.type):
# feature must be Sequence(subfeature)
if isinstance(feature, Sequence) and feature.length > -1:
Expand Down
18 changes: 0 additions & 18 deletions src/datasets/utils/tf_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,24 +67,6 @@ def is_numeric_pa_type(pa_type):
return pa.types.is_integer(pa_type) or pa.types.is_floating(pa_type) or pa.types.is_decimal(pa_type)


def is_numeric_feature(feature):
    """Tell whether a feature ultimately holds numeric data.

    List-like wrappers (`Sequence` instances and plain Python lists) are
    peeled off until a non-wrapper feature is reached. That inner feature
    is then classified:

    - `_ArrayXD`: decided by `is_numeric_pa_type` applied to the
      `storage_dtype` of the object returned by calling the feature.
    - `Value`: decided by `is_numeric_pa_type` applied to the object
      returned by calling the feature.
    - `ClassLabel`: always numeric.
    - anything else: not numeric.

    Args:
        feature: the feature (or list / `Sequence` wrapping one) to inspect.

    Returns:
        `True` if the innermost feature is considered numeric, else `False`.
    """
    # Imported lazily inside the function, as in the original implementation.
    from .. import ClassLabel, Sequence, Value
    from ..features.features import _ArrayXD

    # Unwrap nested containers: a Sequence exposes its inner feature via
    # `.feature`, a plain list via its first element.
    inner = feature
    while True:
        if isinstance(inner, Sequence):
            inner = inner.feature
        elif isinstance(inner, list):
            inner = inner[0]
        else:
            break

    if isinstance(inner, _ArrayXD):
        return is_numeric_pa_type(inner().storage_dtype)
    if isinstance(inner, Value):
        return is_numeric_pa_type(inner())
    # Only ClassLabel remains as a numeric candidate; everything else is not.
    if isinstance(inner, ClassLabel):
        return True
    return False


def np_get_batch(
indices, dataset, cols_to_retain, collate_fn, collate_fn_args, columns_to_np_types, return_dict=False
):
Expand Down
Loading

0 comments on commit 1654414

Please sign in to comment.