Fix exception chaining (#2911)
* Fix exception chaining

* Fix style

* Make head_hf_s3 raise Exception
albertvillanova authored Sep 16, 2021
1 parent 19291d4 commit 110e2e5
Showing 9 changed files with 56 additions and 47 deletions.
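The change is mostly mechanical: bare `raise X(...)` statements inside `except` blocks become `raise X(...) from exc` or `raise X(...) from None`. As a quick reference, here is a minimal standalone sketch (not taken from the repository) of what the two forms do to the reported traceback: `from exc` records the original exception as `__cause__`, while `from None` suppresses the implicit "During handling of the above exception, another exception occurred" context.

import traceback


def with_cause():
    try:
        int("not a number")
    except ValueError as exc:
        # The ValueError stays attached as __cause__ and is printed as the cause.
        raise TypeError("expected an integer-like string") from exc


def without_context():
    try:
        int("not a number")
    except ValueError:
        # The ValueError is dropped entirely; only the TypeError is reported.
        raise TypeError("expected an integer-like string") from None


for fn in (with_cause, without_context):
    try:
        fn()
    except TypeError:
        print(f"--- {fn.__name__} ---")
        traceback.print_exc()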
12 changes: 7 additions & 5 deletions src/datasets/arrow_dataset.py
@@ -915,7 +915,9 @@ def save_to_disk(self, dataset_path: str, fs=None):
try:
json.dumps(state["_format_kwargs"][k])
except TypeError as e:
raise TypeError(str(e) + f"\nThe format kwargs must be JSON serializable, but key '{k}' isn't.")
raise TypeError(
str(e) + f"\nThe format kwargs must be JSON serializable, but key '{k}' isn't."
) from None

# Get json serializable dataset info
dataset_info = asdict(self._info)
@@ -2154,7 +2156,7 @@ def _map_single(
# If set to False, no new arrow table will be created
update_data = None

class NumExamplesMismatch(Exception):
class NumExamplesMismatchError(Exception):
pass

def validate_function_output(processed_inputs, indices):
@@ -2208,7 +2210,7 @@ def apply_function_on_filtered_inputs(inputs, indices, check_same_num_examples=F
input_num_examples = len(inputs[next(iter(inputs.keys()))])
processed_inputs_num_examples = len(processed_inputs[next(iter(processed_inputs.keys()))])
if input_num_examples != processed_inputs_num_examples:
raise NumExamplesMismatch()
raise NumExamplesMismatchError()
if isinstance(inputs, dict) and isinstance(processed_inputs, Mapping):
inputs.update(processed_inputs)
return inputs
@@ -2301,10 +2303,10 @@ def init_buffer_and_writer():
check_same_num_examples=len(input_dataset.list_indexes()) > 0,
offset=offset,
)
except NumExamplesMismatch:
except NumExamplesMismatchError:
raise DatasetTransformationNotAllowedError(
"Using `.map` in batched mode on a dataset with attached indexes is allowed only if it doesn't create or remove existing examples. You can first run `.drop_index() to remove your index and then re-add it."
)
) from None
if update_data:
if i == 0:
buf_writer, writer, tmp_file = init_buffer_and_writer()
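The `NumExamplesMismatch` → `NumExamplesMismatchError` rename above goes together with the `from None` added where the sentinel is translated into the public `DatasetTransformationNotAllowedError`. A rough standalone reduction of that pattern (the names below are simplified stand-ins, not the actual `_map_single` code):

class NumExamplesMismatchError(Exception):
    """Internal sentinel: a batched function changed the number of examples."""


class DatasetTransformationNotAllowedError(ValueError):
    """Public error surfaced to the user."""


def apply_batched_fn(batch, fn, check_same_num_examples):
    processed = fn(batch)
    if check_same_num_examples and len(processed["col"]) != len(batch["col"]):
        raise NumExamplesMismatchError()
    return processed


def map_batch(batch, fn, check_same_num_examples=True):
    try:
        return apply_batched_fn(batch, fn, check_same_num_examples)
    except NumExamplesMismatchError:
        # `from None` hides the internal sentinel so users only see the
        # actionable message about indexed datasets.
        raise DatasetTransformationNotAllowedError(
            "Batched `.map` on a dataset with attached indexes must not add or remove examples."
        ) from None


try:
    map_batch({"col": [1, 2, 3]}, lambda b: {"col": b["col"][:2]})
except DatasetTransformationNotAllowedError as e:
    print(type(e).__name__, "-", e)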
12 changes: 6 additions & 6 deletions src/datasets/arrow_reader.py
@@ -64,13 +64,13 @@
_ADDITION_SEP_RE = re.compile(r"\s*\+\s*")


class DatasetNotOnHfGcs(ConnectionError):
class DatasetNotOnHfGcsError(ConnectionError):
"""When you can't get the dataset from the Hf google cloud storage"""

pass


class MissingFilesOnHfGcs(ConnectionError):
class MissingFilesOnHfGcsError(ConnectionError):
"""When some files are missing on the Hf oogle cloud storage"""

pass
@@ -263,8 +263,8 @@ def download_from_hf_gcs(self, download_config: DownloadConfig, relative_data_di
shutil.move(downloaded_dataset_info, os.path.join(self._path, "dataset_info.json"))
if self._info is not None:
self._info.update(self._info.from_directory(self._path))
except FileNotFoundError:
raise DatasetNotOnHfGcs()
except FileNotFoundError as err:
raise DatasetNotOnHfGcsError(err) from None
try:
for split in self._info.splits:
file_instructions = self.get_file_instructions(
@@ -278,8 +278,8 @@ def download_from_hf_gcs(self, download_config: DownloadConfig, relative_data_di
remote_prepared_filename.replace(os.sep, "/"), download_config=download_config
)
shutil.move(downloaded_prepared_filename, os.path.join(self._path, file_instruction["filename"]))
except FileNotFoundError:
raise MissingFilesOnHfGcs()
except FileNotFoundError as err:
raise MissingFilesOnHfGcsError(err) from None


class ArrowReader(BaseReader):
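One detail worth noting in `download_from_hf_gcs` above: the original `FileNotFoundError` is passed into the new exception (`DatasetNotOnHfGcsError(err)`), so its message survives in the new error's args even though `from None` suppresses the chained traceback. A small made-up sketch of that trade-off:

class DatasetNotOnHfGcsError(ConnectionError):
    """Raised when a dataset cannot be fetched from the HF GCS mirror."""


def download_dataset_info(path):
    try:
        with open(path) as f:  # stand-in for the real GCS download
            return f.read()
    except FileNotFoundError as err:
        # Keep the low-level message, drop the low-level traceback.
        raise DatasetNotOnHfGcsError(err) from None


try:
    download_dataset_info("/nonexistent/dataset_info.json")
except DatasetNotOnHfGcsError as e:
    print("caught:", e)           # the original message is still readable
    print("cause:", e.__cause__)  # None, because chaining was suppressed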
6 changes: 3 additions & 3 deletions src/datasets/arrow_writer.py
@@ -139,15 +139,15 @@ def __arrow_array__(self, type=None):
"There was an overflow with type {}. Try to reduce writer_batch_size to have batches smaller than 2GB.\n({})".format(
type_(self.data), e
)
)
) from None
else:
raise
elif "overflow" in str(e):
raise OverflowError(
"There was an overflow with type {}. Try to reduce writer_batch_size to have batches smaller than 2GB.\n({})".format(
type_(self.data), e
)
)
) from None
else:
raise

@@ -535,7 +535,7 @@ def finalize(self, metrics_query_result: dict):
parquet_to_arrow(sources, dest)
except socket.error as e: # broken pipe can happen if the connection is unstable, do local conversion instead
if e.errno != errno.EPIPE: # not a broken pipe
raise e
raise
logger.warning("Broken Pipe during stream conversion from parquet to arrow. Using local convert instead")
local_convert_dir = os.path.join(self._cache_dir, "beam_convert")
os.makedirs(local_convert_dir, exist_ok=True)
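The other change in `finalize` above is replacing `raise e` with a bare `raise`, the idiomatic way to re-raise the active exception unchanged when it is not the broken-pipe case being handled. A hedged sketch of that selective re-raise, using `OSError`/`errno` directly (in Python 3, `socket.error` is an alias of `OSError`); the function names are illustrative only:

import errno
import logging

logger = logging.getLogger(__name__)


def stream_convert(convert):
    """Try a streamed conversion; fall back to a local one only on broken pipes."""
    try:
        convert()
    except OSError as e:
        if e.errno != errno.EPIPE:
            raise  # not a broken pipe: propagate the original exception as-is
        logger.warning("Broken pipe during stream conversion, converting locally instead")
        return "local-convert"
    return "stream-convert"


def flaky_convert():
    raise OSError(errno.EPIPE, "Broken pipe")


print(stream_convert(flaky_convert))  # falls back and prints "local-convert"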
12 changes: 9 additions & 3 deletions src/datasets/builder.py
@@ -35,7 +35,13 @@

from . import config, utils
from .arrow_dataset import Dataset
from .arrow_reader import HF_GCP_BASE_URL, ArrowReader, DatasetNotOnHfGcs, MissingFilesOnHfGcs, ReadInstruction
from .arrow_reader import (
HF_GCP_BASE_URL,
ArrowReader,
DatasetNotOnHfGcsError,
MissingFilesOnHfGcsError,
ReadInstruction,
)
from .arrow_writer import ArrowWriter, BeamWriter
from .dataset_dict import DatasetDict, IterableDatasetDict
from .fingerprint import Hasher
@@ -628,7 +634,7 @@ def incomplete_dir(dirname):
try:
self._download_prepared_from_hf_gcs(dl_manager._download_config)
downloaded_from_gcs = True
except (DatasetNotOnHfGcs, MissingFilesOnHfGcs):
except (DatasetNotOnHfGcsError, MissingFilesOnHfGcsError):
logger.info("Dataset not on Hf google storage. Downloading and preparing it from source")
except ConnectionError:
logger.warning("HF google storage unreachable. Downloading and preparing it from source")
@@ -730,7 +736,7 @@ def _download_and_prepare(self, dl_manager, verify_infos, **prepare_split_kwargs
+ (self.manual_download_instructions or "")
+ "\nOriginal error:\n"
+ str(e)
)
) from None

dl_manager.manage_extracted_files()

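Since `DatasetNotOnHfGcsError` and `MissingFilesOnHfGcsError` subclass `ConnectionError`, the ordering of the `except` clauses in `download_and_prepare` above matters: the specific classes must come before the generic `ConnectionError` fallback or they would never be reached. A minimal sketch of that fallback chain, with a hypothetical downloader callable standing in for `_download_prepared_from_hf_gcs`:

import logging

logger = logging.getLogger(__name__)


class DatasetNotOnHfGcsError(ConnectionError):
    pass


class MissingFilesOnHfGcsError(ConnectionError):
    pass


def download_and_prepare(download_prepared_from_hf_gcs):
    """Prefer the prebuilt copy on GCS; otherwise prepare the dataset from source."""
    try:
        download_prepared_from_hf_gcs()
        return "downloaded-from-gcs"
    except (DatasetNotOnHfGcsError, MissingFilesOnHfGcsError):
        # The dataset simply isn't mirrored: fall back quietly.
        logger.info("Dataset not on HF google storage. Downloading and preparing it from source")
    except ConnectionError:
        # Anything else network-related deserves a louder warning.
        logger.warning("HF google storage unreachable. Downloading and preparing it from source")
    return "prepared-from-source"


def not_mirrored():
    raise DatasetNotOnHfGcsError("dataset not mirrored on GCS")


print(download_and_prepare(not_mirrored))  # prints "prepared-from-source"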
8 changes: 4 additions & 4 deletions src/datasets/formatting/formatting.py
@@ -271,10 +271,10 @@ def format_row(self, pa_table: pa.Table) -> dict:
formatted_batch = self.format_batch(pa_table)
try:
return _unnest(formatted_batch)
except Exception:
except Exception as exc:
raise TypeError(
f"Custom formatting function must return a dict to be able to pick a row, but got {formatted_batch}"
)
) from exc

def format_column(self, pa_table: pa.Table) -> ColumnFormat:
formatted_batch = self.format_batch(pa_table)
@@ -290,10 +290,10 @@ def format_column(self, pa_table: pa.Table) -> ColumnFormat:
)
try:
return formatted_batch[pa_table.column_names[0]]
except Exception:
except Exception as exc:
raise TypeError(
f"Custom formatting function must return a dict to be able to pick a row, but got {formatted_batch}"
)
) from exc

def format_batch(self, pa_table: pa.Table) -> dict:
batch = self.python_arrow_extractor().extract_batch(pa_table)
16 changes: 9 additions & 7 deletions src/datasets/load.py
@@ -498,11 +498,13 @@ def prepare_packaged_module(name):
return output
else:
# Try github (canonical datasets/metrics) and then HF Hub (community datasets)

combined_path_abs = relative_to_absolute_path(combined_path)
expected_dir_for_combined_path_abs = os.path.dirname(combined_path_abs)
try:
head_hf_s3(path, filename=name, dataset=dataset, max_retries=download_config.max_retries)
try:
head_hf_s3(path, filename=name, dataset=dataset, max_retries=download_config.max_retries)
except Exception:
pass
script_version = str(script_version) if script_version is not None else None
if path.count("/") == 0: # canonical datasets/metrics: github path
file_path = hf_github_url(path=path, name=name, dataset=dataset, version=script_version)
@@ -513,7 +515,7 @@ def prepare_packaged_module(name):
raise FileNotFoundError(
f"Couldn't find a directory or a {resource_type} named '{path}' using version {script_version}. "
f"It doesn't exist locally at {expected_dir_for_combined_path_abs} or remotely at {file_path}"
)
) from None
else:
github_file_path = file_path
file_path = hf_github_url(path=path, name=name, dataset=dataset, version="master")
@@ -527,7 +529,7 @@ def prepare_packaged_module(name):
raise FileNotFoundError(
f"Couldn't find a directory or a {resource_type} named '{path}'. "
f"It doesn't exist locally at {expected_dir_for_combined_path_abs} or remotely at {github_file_path}"
)
) from None
elif path.count("/") == 1: # users datasets/metrics: s3 path (hub for datasets and s3 for metrics)
file_path = hf_hub_url(path=path, name=name, version=script_version)
if not dataset:
@@ -544,11 +546,11 @@ def prepare_packaged_module(name):
dataset_info = hf_api.dataset_info(
repo_id=path, revision=script_version, token=download_config.use_auth_token
)
except Exception:
except Exception as exc:
raise FileNotFoundError(
f"Couldn't find a directory or a {resource_type} named '{path}'. "
f"It doesn't exist locally at {expected_dir_for_combined_path_abs} or remotely on {hf_api.endpoint}/datasets"
)
) from exc
resolved_data_files = _resolve_data_files_in_dataset_repository(
dataset_info,
data_files if data_files is not None else "*",
@@ -558,7 +560,7 @@ def prepare_packaged_module(name):
if not infered_module_name:
raise FileNotFoundError(
f"No data files found in dataset repository '{path}'. Local directory at {expected_dir_for_combined_path_abs} doesn't exist either."
)
) from None
output = prepare_packaged_module(infered_module_name)
if return_resolved_file_path:
output += (None,)
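The `head_hf_s3` call above is now wrapped in its own `try`/`except Exception: pass` because, after this commit, the helper raises on failure instead of returning the exception object (see `file_utils.py` below). A short sketch of why that signature is easier to use safely; `head_url` and `load_module` are hypothetical names, and only `requests.head` is a real API here:

import requests


def head_url(url, timeout=2.0):
    # Raises requests exceptions (ConnectionError, Timeout, ...) instead of
    # returning them, so a caller cannot silently carry an error around as a value.
    return requests.head(url, timeout=timeout)


def load_module(url="https://example.com/datasets/squad/squad.py"):
    try:
        head_url(url)  # best-effort availability check / cache warm-up
    except Exception:
        pass           # a failed HEAD must not block the real resolution below
    # ... continue resolving the script from GitHub or the Hub ...
    return "resolved"


print(load_module())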
22 changes: 12 additions & 10 deletions src/datasets/metric.py
@@ -248,13 +248,13 @@ def _create_cache_file(self, timeout=1) -> Tuple[str, FileLock]:
f"Error in _create_cache_file: another metric instance is already using the local cache file at {file_path}. "
f"Please specify an experiment_id (currently: {self.experiment_id}) to avoid collision "
f"between distributed metric instances."
)
) from None
if i == self.max_concurrent_cache_files - 1:
raise ValueError(
f"Cannot acquire lock, too many metric instance are operating concurrently on this file system."
f"You should set a larger value of max_concurrent_cache_files when creating the metric "
f"(current value is {self.max_concurrent_cache_files})."
)
) from None
# In other cases (allow to find new file name + not yet at max num of attempts) we can try to sample a new hashing name.
file_uuid = str(uuid.uuid4())
file_path = os.path.join(
@@ -292,7 +292,9 @@ def _get_all_cache_files(self) -> Tuple[List[str], List[FileLock]]:
try:
filelock.acquire(timeout=self.timeout)
except Timeout:
raise ValueError(f"Cannot acquire lock on cached file {file_path} for process {process_id}.")
raise ValueError(
f"Cannot acquire lock on cached file {file_path} for process {process_id}."
) from None
else:
filelocks.append(filelock)

@@ -310,7 +312,7 @@ def _check_all_processes_locks(self):
except Timeout:
raise ValueError(
f"Expected to find locked file {expected_lock_file_name} from process {self.process_id} but it doesn't exist."
)
) from None
else:
nofilelock.release()

@@ -322,15 +324,15 @@ def _check_rendez_vous(self):
except Timeout:
raise ValueError(
f"Expected to find locked file {expected_lock_file_name} from process {self.process_id} but it doesn't exist."
)
) from None
else:
nofilelock.release()
lock_file_name = os.path.join(self.data_dir, f"{self.experiment_id}-{self.num_process}-rdv.lock")
rendez_vous_lock = FileLock(lock_file_name)
try:
rendez_vous_lock.acquire(timeout=self.timeout)
except Timeout:
raise ValueError(f"Couldn't acquire lock on {lock_file_name} from process {self.process_id}.")
raise ValueError(f"Couldn't acquire lock on {lock_file_name} from process {self.process_id}.") from None
else:
rendez_vous_lock.release()

@@ -362,7 +364,7 @@ def _finalize(self):
raise ValueError(
"Error in finalize: another metric instance is already using the local cache file. "
"Please specify an experiment_id to avoid collision between distributed metric instances."
)
) from None

# Store file paths and locks and we will release/delete them after the computation.
self.file_paths = file_paths
@@ -439,7 +441,7 @@ def add_batch(self, *, predictions=None, references=None):
f"Expected format: {self.features},\n"
f"Input predictions: {predictions},\n"
f"Input references: {references}"
)
) from None

def add(self, *, prediction=None, reference=None):
"""Add one prediction and reference for the metric's stack.
@@ -460,7 +462,7 @@ def add(self, *, prediction=None, reference=None):
f"Expected format: {self.features},\n"
f"Input predictions: {prediction},\n"
f"Input references: {reference}"
)
) from None

def _init_writer(self, timeout=1):
if self.num_process > 1:
@@ -474,7 +476,7 @@ def _init_writer(self, timeout=1):
f"Error in _init_writer: another metric instance is already using the local cache file at {file_path}. "
f"Please specify an experiment_id (currently: {self.experiment_id}) to avoid collision "
f"between distributed metric instances."
)
) from None

if self.keep_in_memory:
self.buf_writer = pa.BufferOutputStream()
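All of the `Timeout` handlers above follow the same recipe: try to take a `FileLock` with a bounded wait, and turn a timeout into an actionable `ValueError` while `from None` drops the less useful `filelock` traceback. A compact sketch of that recipe, assuming the `filelock` package is installed (the helper name and paths are made up):

import os
import tempfile

from filelock import FileLock, Timeout


def acquire_cache_lock(file_path, experiment_id="default_experiment", timeout=1):
    filelock = FileLock(file_path + ".lock")
    try:
        filelock.acquire(timeout=timeout)
    except Timeout:
        raise ValueError(
            f"Another metric instance is already using the local cache file at {file_path}. "
            f"Please specify an experiment_id (currently: {experiment_id}) to avoid collisions."
        ) from None
    return filelock


cache_file = os.path.join(tempfile.gettempdir(), "metric-default_experiment.arrow")
lock = acquire_cache_lock(cache_file)
print("lock acquired:", lock.is_locked)
lock.release()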
2 changes: 1 addition & 1 deletion src/datasets/packaged_modules/json/json.py
@@ -141,7 +141,7 @@ def _generate_tables(self, files):
f"You should probably indicate the field of the JSON file containing your records. "
f"This JSON file contain the following fields: {str(list(dataset.keys()))}. "
f"Select the correct one and provide it as `field='XXX'` to the dataset loading method. "
)
) from None
# Uncomment for debugging (will print the Arrow table size and elements)
# logger.warning(f"pa_table: {pa_table} num rows: {pa_table.num_rows}")
# logger.warning('\n'.join(str(pa_table.slice(i, 1).to_pydict()) for i in range(pa_table.num_rows)))
13 changes: 5 additions & 8 deletions src/datasets/utils/file_utils.py
@@ -147,13 +147,10 @@ def hf_bucket_url(identifier: str, filename: str, use_cdn=False, dataset=True) -
def head_hf_s3(
identifier: str, filename: str, use_cdn=False, dataset=True, max_retries=0
) -> Union[requests.Response, Exception]:
try:
return http_head(
hf_bucket_url(identifier=identifier, filename=filename, use_cdn=use_cdn, dataset=dataset),
max_retries=max_retries,
)
except Exception as e:
return e
return http_head(
hf_bucket_url(identifier=identifier, filename=filename, use_cdn=use_cdn, dataset=dataset),
max_retries=max_retries,
)


def hf_github_url(path: str, name: str, dataset=True, version: Optional[str] = None) -> str:
@@ -421,7 +418,7 @@ def ftp_get(url, temp_file, timeout=10.0):
with closing(urllib.request.urlopen(url, timeout=timeout)) as r:
shutil.copyfileobj(r, temp_file)
except urllib.error.URLError as e:
raise ConnectionError(e)
raise ConnectionError(e) from None


def http_get(url, temp_file, proxies=None, resume_size=0, headers=None, cookies=None, timeout=100.0, max_retries=0):

2 comments on commit 110e2e5

@github-actions

PyArrow==3.0.0


Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.011538 / 0.011353 (0.000185) 0.004614 / 0.011008 (-0.006394) 0.038286 / 0.038508 (-0.000222) 0.041704 / 0.023109 (0.018595) 0.369306 / 0.275898 (0.093408) 0.419920 / 0.323480 (0.096440) 0.012397 / 0.007986 (0.004411) 0.005914 / 0.004328 (0.001585) 0.011052 / 0.004250 (0.006802) 0.059809 / 0.037052 (0.022757) 0.367247 / 0.258489 (0.108758) 0.419634 / 0.293841 (0.125793) 0.034453 / 0.128546 (-0.094093) 0.011859 / 0.075646 (-0.063788) 0.319379 / 0.419271 (-0.099892) 0.060427 / 0.043533 (0.016894) 0.373441 / 0.255139 (0.118302) 0.397437 / 0.283200 (0.114237) 0.140518 / 0.141683 (-0.001165) 2.058128 / 1.452155 (0.605973) 2.055511 / 1.492716 (0.562794)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.260915 / 0.018006 (0.242909) 0.576978 / 0.000490 (0.576488) 0.006896 / 0.000200 (0.006696) 0.000526 / 0.000054 (0.000472)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.044471 / 0.037411 (0.007060) 0.032133 / 0.014526 (0.017607) 0.031280 / 0.176557 (-0.145276) 0.149036 / 0.737135 (-0.588100) 0.037904 / 0.296338 (-0.258434)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.519615 / 0.215209 (0.304405) 5.125030 / 2.077655 (3.047375) 2.350607 / 1.504120 (0.846487) 2.047403 / 1.541195 (0.506208) 2.074662 / 1.468490 (0.606172) 0.509974 / 4.584777 (-4.074803) 6.823520 / 3.745712 (3.077808) 5.297573 / 5.269862 (0.027711) 1.376917 / 4.565676 (-3.188760) 0.058500 / 0.424275 (-0.365775) 0.006385 / 0.007607 (-0.001222) 0.686451 / 0.226044 (0.460407) 6.754794 / 2.268929 (4.485866) 3.064759 / 55.444624 (-52.379865) 2.504582 / 6.876477 (-4.371895) 2.487837 / 2.142072 (0.345764) 0.719768 / 4.805227 (-4.085459) 0.160110 / 6.500664 (-6.340554) 0.165295 / 0.075469 (0.089826)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.189009 / 1.841788 (-0.652778) 16.329618 / 8.074308 (8.255310) 35.224160 / 10.191392 (25.032768) 0.861395 / 0.680424 (0.180971) 0.633785 / 0.534201 (0.099584) 0.314176 / 0.579283 (-0.265107) 0.742021 / 0.434364 (0.307657) 0.248839 / 0.540337 (-0.291498) 0.255369 / 1.386936 (-1.131567)
PyArrow==latest

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.013383 / 0.011353 (0.002030) 0.005460 / 0.011008 (-0.005548) 0.038156 / 0.038508 (-0.000352) 0.042228 / 0.023109 (0.019119) 0.403599 / 0.275898 (0.127701) 0.396053 / 0.323480 (0.072573) 0.011801 / 0.007986 (0.003815) 0.004566 / 0.004328 (0.000237) 0.011318 / 0.004250 (0.007068) 0.045150 / 0.037052 (0.008097) 0.362394 / 0.258489 (0.103905) 0.406692 / 0.293841 (0.112851) 0.035026 / 0.128546 (-0.093520) 0.011832 / 0.075646 (-0.063814) 0.309068 / 0.419271 (-0.110203) 0.056869 / 0.043533 (0.013336) 0.374356 / 0.255139 (0.119217) 0.419303 / 0.283200 (0.136103) 0.092066 / 0.141683 (-0.049617) 2.113327 / 1.452155 (0.661172) 2.148836 / 1.492716 (0.656120)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.257851 / 0.018006 (0.239845) 0.577728 / 0.000490 (0.577238) 0.006904 / 0.000200 (0.006704) 0.000378 / 0.000054 (0.000323)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.044483 / 0.037411 (0.007072) 0.038355 / 0.014526 (0.023829) 0.032642 / 0.176557 (-0.143915) 0.148812 / 0.737135 (-0.588323) 0.035745 / 0.296338 (-0.260593)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.539744 / 0.215209 (0.324535) 5.167495 / 2.077655 (3.089841) 2.392665 / 1.504120 (0.888545) 2.110101 / 1.541195 (0.568907) 2.223917 / 1.468490 (0.755427) 0.537448 / 4.584777 (-4.047329) 6.885964 / 3.745712 (3.140252) 5.091698 / 5.269862 (-0.178163) 1.414699 / 4.565676 (-3.150977) 0.059756 / 0.424275 (-0.364519) 0.006225 / 0.007607 (-0.001382) 0.649750 / 0.226044 (0.423705) 6.801760 / 2.268929 (4.532832) 3.013116 / 55.444624 (-52.431508) 2.430926 / 6.876477 (-4.445551) 2.423413 / 2.142072 (0.281341) 0.721098 / 4.805227 (-4.084130) 0.162488 / 6.500664 (-6.338176) 0.150451 / 0.075469 (0.074981)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.178531 / 1.841788 (-0.663257) 17.826527 / 8.074308 (9.752219) 35.519246 / 10.191392 (25.327854) 1.006354 / 0.680424 (0.325930) 0.672549 / 0.534201 (0.138348) 0.284718 / 0.579283 (-0.294565) 0.715977 / 0.434364 (0.281613) 0.245078 / 0.540337 (-0.295260) 0.269747 / 1.386936 (-1.117189)


@github-actions

PyArrow==3.0.0


Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.010135 / 0.011353 (-0.001218) 0.003888 / 0.011008 (-0.007120) 0.035516 / 0.038508 (-0.002992) 0.039387 / 0.023109 (0.016278) 0.348213 / 0.275898 (0.072315) 0.378466 / 0.323480 (0.054986) 0.008927 / 0.007986 (0.000941) 0.005835 / 0.004328 (0.001506) 0.010571 / 0.004250 (0.006321) 0.045443 / 0.037052 (0.008391) 0.346433 / 0.258489 (0.087944) 0.391732 / 0.293841 (0.097891) 0.025086 / 0.128546 (-0.103461) 0.008797 / 0.075646 (-0.066850) 0.296755 / 0.419271 (-0.122516) 0.050531 / 0.043533 (0.006998) 0.370880 / 0.255139 (0.115741) 0.370398 / 0.283200 (0.087199) 0.123401 / 0.141683 (-0.018282) 1.959426 / 1.452155 (0.507271) 1.978125 / 1.492716 (0.485408)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.211012 / 0.018006 (0.193006) 0.469528 / 0.000490 (0.469038) 0.006639 / 0.000200 (0.006439) 0.000079 / 0.000054 (0.000025)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.042095 / 0.037411 (0.004683) 0.026590 / 0.014526 (0.012064) 0.028068 / 0.176557 (-0.148489) 0.143287 / 0.737135 (-0.593848) 0.029851 / 0.296338 (-0.266488)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.399410 / 0.215209 (0.184201) 4.037063 / 2.077655 (1.959408) 2.083167 / 1.504120 (0.579047) 1.955458 / 1.541195 (0.414263) 1.973776 / 1.468490 (0.505286) 0.353135 / 4.584777 (-4.231641) 5.059073 / 3.745712 (1.313361) 4.071474 / 5.269862 (-1.198387) 0.904280 / 4.565676 (-3.661396) 0.042859 / 0.424275 (-0.381416) 0.006004 / 0.007607 (-0.001603) 0.522225 / 0.226044 (0.296180) 5.116259 / 2.268929 (2.847330) 2.582387 / 55.444624 (-52.862238) 2.225406 / 6.876477 (-4.651070) 2.296172 / 2.142072 (0.154100) 0.451144 / 4.805227 (-4.354083) 0.111928 / 6.500664 (-6.388736) 0.124911 / 0.075469 (0.049442)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.050637 / 1.841788 (-0.791151) 14.430945 / 8.074308 (6.356637) 24.830766 / 10.191392 (14.639374) 0.858374 / 0.680424 (0.177951) 0.606081 / 0.534201 (0.071880) 0.257363 / 0.579283 (-0.321920) 0.577509 / 0.434364 (0.143145) 0.197526 / 0.540337 (-0.342812) 0.204573 / 1.386936 (-1.182363)
PyArrow==latest

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.010186 / 0.011353 (-0.001167) 0.003897 / 0.011008 (-0.007112) 0.036680 / 0.038508 (-0.001828) 0.040283 / 0.023109 (0.017174) 0.338637 / 0.275898 (0.062739) 0.367235 / 0.323480 (0.043755) 0.008632 / 0.007986 (0.000646) 0.003687 / 0.004328 (-0.000642) 0.010575 / 0.004250 (0.006324) 0.046380 / 0.037052 (0.009328) 0.329558 / 0.258489 (0.071069) 0.375260 / 0.293841 (0.081419) 0.025889 / 0.128546 (-0.102657) 0.008811 / 0.075646 (-0.066835) 0.304753 / 0.419271 (-0.114518) 0.051440 / 0.043533 (0.007907) 0.345351 / 0.255139 (0.090212) 0.363216 / 0.283200 (0.080016) 0.092157 / 0.141683 (-0.049526) 1.927957 / 1.452155 (0.475802) 1.989090 / 1.492716 (0.496374)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.402085 / 0.018006 (0.384079) 0.483926 / 0.000490 (0.483436) 0.050429 / 0.000200 (0.050229) 0.000474 / 0.000054 (0.000419)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.040861 / 0.037411 (0.003450) 0.026021 / 0.014526 (0.011495) 0.028252 / 0.176557 (-0.148304) 0.143775 / 0.737135 (-0.593361) 0.031013 / 0.296338 (-0.265325)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.408584 / 0.215209 (0.193375) 4.003674 / 2.077655 (1.926019) 2.084509 / 1.504120 (0.580389) 1.872823 / 1.541195 (0.331628) 1.922930 / 1.468490 (0.454440) 0.362743 / 4.584777 (-4.222034) 5.329184 / 3.745712 (1.583472) 2.271526 / 5.269862 (-2.998336) 0.915697 / 4.565676 (-3.649980) 0.043853 / 0.424275 (-0.380422) 0.006181 / 0.007607 (-0.001426) 0.534206 / 0.226044 (0.308162) 5.301082 / 2.268929 (3.032153) 2.631492 / 55.444624 (-52.813132) 2.229833 / 6.876477 (-4.646644) 2.239248 / 2.142072 (0.097175) 0.495860 / 4.805227 (-4.309368) 0.115960 / 6.500664 (-6.384704) 0.129191 / 0.075469 (0.053722)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.022342 / 1.841788 (-0.819446) 14.523643 / 8.074308 (6.449335) 25.041072 / 10.191392 (14.849680) 0.831327 / 0.680424 (0.150903) 0.591332 / 0.534201 (0.057131) 0.259992 / 0.579283 (-0.319291) 0.559421 / 0.434364 (0.125057) 0.201834 / 0.540337 (-0.338504) 0.211466 / 1.386936 (-1.175470)
