Fix exception chaining (#2911)
* Fix exception chaining

* Fix style

* Make head_hf_s3 raise Exception
albertvillanova authored Sep 16, 2021
1 parent 19291d4 commit 110e2e5
Showing 9 changed files with 56 additions and 47 deletions.
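The change is mostly mechanical: bare `raise X(...)` statements inside `except` blocks become `raise X(...) from exc` or `raise X(...) from None`. As a quick reference, here is a minimal standalone sketch (not taken from the repository) of what the two forms do to the reported traceback: `from exc` records the original exception as `__cause__`, while `from None` suppresses the implicit "During handling of the above exception, another exception occurred" context.

import traceback


def with_cause():
    try:
        int("not a number")
    except ValueError as exc:
        # The ValueError stays attached as __cause__ and is printed as the cause.
        raise TypeError("expected an integer-like string") from exc


def without_context():
    try:
        int("not a number")
    except ValueError:
        # The ValueError is dropped entirely; only the TypeError is reported.
        raise TypeError("expected an integer-like string") from None


for fn in (with_cause, without_context):
    try:
        fn()
    except TypeError:
        print(f"--- {fn.__name__} ---")
        traceback.print_exc()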
12 changes: 7 additions & 5 deletions src/datasets/arrow_dataset.py
@@ -915,7 +915,9 @@ def save_to_disk(self, dataset_path: str, fs=None):
try:
json.dumps(state["_format_kwargs"][k])
except TypeError as e:
raise TypeError(str(e) + f"\nThe format kwargs must be JSON serializable, but key '{k}' isn't.")
raise TypeError(
str(e) + f"\nThe format kwargs must be JSON serializable, but key '{k}' isn't."
) from None

# Get json serializable dataset info
dataset_info = asdict(self._info)
@@ -2154,7 +2156,7 @@ def _map_single(
# If set to False, no new arrow table will be created
update_data = None

class NumExamplesMismatch(Exception):
class NumExamplesMismatchError(Exception):
pass

def validate_function_output(processed_inputs, indices):
@@ -2208,7 +2210,7 @@ def apply_function_on_filtered_inputs(inputs, indices, check_same_num_examples=F
input_num_examples = len(inputs[next(iter(inputs.keys()))])
processed_inputs_num_examples = len(processed_inputs[next(iter(processed_inputs.keys()))])
if input_num_examples != processed_inputs_num_examples:
raise NumExamplesMismatch()
raise NumExamplesMismatchError()
if isinstance(inputs, dict) and isinstance(processed_inputs, Mapping):
inputs.update(processed_inputs)
return inputs
@@ -2301,10 +2303,10 @@ def init_buffer_and_writer():
check_same_num_examples=len(input_dataset.list_indexes()) > 0,
offset=offset,
)
except NumExamplesMismatch:
except NumExamplesMismatchError:
raise DatasetTransformationNotAllowedError(
"Using `.map` in batched mode on a dataset with attached indexes is allowed only if it doesn't create or remove existing examples. You can first run `.drop_index() to remove your index and then re-add it."
)
) from None
if update_data:
if i == 0:
buf_writer, writer, tmp_file = init_buffer_and_writer()
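The `NumExamplesMismatch` → `NumExamplesMismatchError` rename above goes together with the `from None` added where the sentinel is translated into the public `DatasetTransformationNotAllowedError`. A rough standalone reduction of that pattern (the names below are simplified stand-ins, not the actual `_map_single` code):

class NumExamplesMismatchError(Exception):
    """Internal sentinel: a batched function changed the number of examples."""


class DatasetTransformationNotAllowedError(ValueError):
    """Public error surfaced to the user."""


def apply_batched_fn(batch, fn, check_same_num_examples):
    processed = fn(batch)
    if check_same_num_examples and len(processed["col"]) != len(batch["col"]):
        raise NumExamplesMismatchError()
    return processed


def map_batch(batch, fn, check_same_num_examples=True):
    try:
        return apply_batched_fn(batch, fn, check_same_num_examples)
    except NumExamplesMismatchError:
        # `from None` hides the internal sentinel so users only see the
        # actionable message about indexed datasets.
        raise DatasetTransformationNotAllowedError(
            "Batched `.map` on a dataset with attached indexes must not add or remove examples."
        ) from None


try:
    map_batch({"col": [1, 2, 3]}, lambda b: {"col": b["col"][:2]})
except DatasetTransformationNotAllowedError as e:
    print(type(e).__name__, "-", e)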
12 changes: 6 additions & 6 deletions src/datasets/arrow_reader.py
@@ -64,13 +64,13 @@
_ADDITION_SEP_RE = re.compile(r"\s*\+\s*")


class DatasetNotOnHfGcs(ConnectionError):
class DatasetNotOnHfGcsError(ConnectionError):
"""When you can't get the dataset from the Hf google cloud storage"""

pass


class MissingFilesOnHfGcs(ConnectionError):
class MissingFilesOnHfGcsError(ConnectionError):
"""When some files are missing on the Hf oogle cloud storage"""

pass
@@ -263,8 +263,8 @@ def download_from_hf_gcs(self, download_config: DownloadConfig, relative_data_di
shutil.move(downloaded_dataset_info, os.path.join(self._path, "dataset_info.json"))
if self._info is not None:
self._info.update(self._info.from_directory(self._path))
except FileNotFoundError:
raise DatasetNotOnHfGcs()
except FileNotFoundError as err:
raise DatasetNotOnHfGcsError(err) from None
try:
for split in self._info.splits:
file_instructions = self.get_file_instructions(
@@ -278,8 +278,8 @@ def download_from_hf_gcs(self, download_config: DownloadConfig, relative_data_di
remote_prepared_filename.replace(os.sep, "/"), download_config=download_config
)
shutil.move(downloaded_prepared_filename, os.path.join(self._path, file_instruction["filename"]))
except FileNotFoundError:
raise MissingFilesOnHfGcs()
except FileNotFoundError as err:
raise MissingFilesOnHfGcsError(err) from None


class ArrowReader(BaseReader):
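One detail worth noting in `download_from_hf_gcs` above: the original `FileNotFoundError` is passed into the new exception (`DatasetNotOnHfGcsError(err)`), so its message survives in the new error's args even though `from None` suppresses the chained traceback. A small made-up sketch of that trade-off:

class DatasetNotOnHfGcsError(ConnectionError):
    """Raised when a dataset cannot be fetched from the HF GCS mirror."""


def download_dataset_info(path):
    try:
        with open(path) as f:  # stand-in for the real GCS download
            return f.read()
    except FileNotFoundError as err:
        # Keep the low-level message, drop the low-level traceback.
        raise DatasetNotOnHfGcsError(err) from None


try:
    download_dataset_info("/nonexistent/dataset_info.json")
except DatasetNotOnHfGcsError as e:
    print("caught:", e)           # the original message is still readable
    print("cause:", e.__cause__)  # None, because chaining was suppressed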
6 changes: 3 additions & 3 deletions src/datasets/arrow_writer.py
@@ -139,15 +139,15 @@ def __arrow_array__(self, type=None):
"There was an overflow with type {}. Try to reduce writer_batch_size to have batches smaller than 2GB.\n({})".format(
type_(self.data), e
)
)
) from None
else:
raise
elif "overflow" in str(e):
raise OverflowError(
"There was an overflow with type {}. Try to reduce writer_batch_size to have batches smaller than 2GB.\n({})".format(
type_(self.data), e
)
)
) from None
else:
raise

@@ -535,7 +535,7 @@ def finalize(self, metrics_query_result: dict):
parquet_to_arrow(sources, dest)
except socket.error as e: # broken pipe can happen if the connection is unstable, do local conversion instead
if e.errno != errno.EPIPE: # not a broken pipe
raise e
raise
logger.warning("Broken Pipe during stream conversion from parquet to arrow. Using local convert instead")
local_convert_dir = os.path.join(self._cache_dir, "beam_convert")
os.makedirs(local_convert_dir, exist_ok=True)
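The other change in `finalize` above is replacing `raise e` with a bare `raise`, the idiomatic way to re-raise the active exception unchanged when it is not the broken-pipe case being handled. A hedged sketch of that selective re-raise, using `OSError`/`errno` directly (in Python 3, `socket.error` is an alias of `OSError`); the function names are illustrative only:

import errno
import logging

logger = logging.getLogger(__name__)


def stream_convert(convert):
    """Try a streamed conversion; fall back to a local one only on broken pipes."""
    try:
        convert()
    except OSError as e:
        if e.errno != errno.EPIPE:
            raise  # not a broken pipe: propagate the original exception as-is
        logger.warning("Broken pipe during stream conversion, converting locally instead")
        return "local-convert"
    return "stream-convert"


def flaky_convert():
    raise OSError(errno.EPIPE, "Broken pipe")


print(stream_convert(flaky_convert))  # falls back and prints "local-convert"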
12 changes: 9 additions & 3 deletions src/datasets/builder.py
@@ -35,7 +35,13 @@

from . import config, utils
from .arrow_dataset import Dataset
from .arrow_reader import HF_GCP_BASE_URL, ArrowReader, DatasetNotOnHfGcs, MissingFilesOnHfGcs, ReadInstruction
from .arrow_reader import (
HF_GCP_BASE_URL,
ArrowReader,
DatasetNotOnHfGcsError,
MissingFilesOnHfGcsError,
ReadInstruction,
)
from .arrow_writer import ArrowWriter, BeamWriter
from .dataset_dict import DatasetDict, IterableDatasetDict
from .fingerprint import Hasher
@@ -628,7 +634,7 @@ def incomplete_dir(dirname):
try:
self._download_prepared_from_hf_gcs(dl_manager._download_config)
downloaded_from_gcs = True
except (DatasetNotOnHfGcs, MissingFilesOnHfGcs):
except (DatasetNotOnHfGcsError, MissingFilesOnHfGcsError):
logger.info("Dataset not on Hf google storage. Downloading and preparing it from source")
except ConnectionError:
logger.warning("HF google storage unreachable. Downloading and preparing it from source")
@@ -730,7 +736,7 @@ def _download_and_prepare(self, dl_manager, verify_infos, **prepare_split_kwargs
+ (self.manual_download_instructions or "")
+ "\nOriginal error:\n"
+ str(e)
)
) from None

dl_manager.manage_extracted_files()

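Since `DatasetNotOnHfGcsError` and `MissingFilesOnHfGcsError` subclass `ConnectionError`, the ordering of the `except` clauses in `download_and_prepare` above matters: the specific classes must come before the generic `ConnectionError` fallback or they would never be reached. A minimal sketch of that fallback chain, with a hypothetical downloader callable standing in for `_download_prepared_from_hf_gcs`:

import logging

logger = logging.getLogger(__name__)


class DatasetNotOnHfGcsError(ConnectionError):
    pass


class MissingFilesOnHfGcsError(ConnectionError):
    pass


def download_and_prepare(download_prepared_from_hf_gcs):
    """Prefer the prebuilt copy on GCS; otherwise prepare the dataset from source."""
    try:
        download_prepared_from_hf_gcs()
        return "downloaded-from-gcs"
    except (DatasetNotOnHfGcsError, MissingFilesOnHfGcsError):
        # The dataset simply isn't mirrored: fall back quietly.
        logger.info("Dataset not on HF google storage. Downloading and preparing it from source")
    except ConnectionError:
        # Anything else network-related deserves a louder warning.
        logger.warning("HF google storage unreachable. Downloading and preparing it from source")
    return "prepared-from-source"


def not_mirrored():
    raise DatasetNotOnHfGcsError("dataset not mirrored on GCS")


print(download_and_prepare(not_mirrored))  # prints "prepared-from-source"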
8 changes: 4 additions & 4 deletions src/datasets/formatting/formatting.py
@@ -271,10 +271,10 @@ def format_row(self, pa_table: pa.Table) -> dict:
formatted_batch = self.format_batch(pa_table)
try:
return _unnest(formatted_batch)
except Exception:
except Exception as exc:
raise TypeError(
f"Custom formatting function must return a dict to be able to pick a row, but got {formatted_batch}"
)
) from exc

def format_column(self, pa_table: pa.Table) -> ColumnFormat:
formatted_batch = self.format_batch(pa_table)
@@ -290,10 +290,10 @@ def format_column(self, pa_table: pa.Table) -> ColumnFormat:
)
try:
return formatted_batch[pa_table.column_names[0]]
except Exception:
except Exception as exc:
raise TypeError(
f"Custom formatting function must return a dict to be able to pick a row, but got {formatted_batch}"
)
) from exc

def format_batch(self, pa_table: pa.Table) -> dict:
batch = self.python_arrow_extractor().extract_batch(pa_table)
16 changes: 9 additions & 7 deletions src/datasets/load.py
@@ -498,11 +498,13 @@ def prepare_packaged_module(name):
return output
else:
# Try github (canonical datasets/metrics) and then HF Hub (community datasets)

combined_path_abs = relative_to_absolute_path(combined_path)
expected_dir_for_combined_path_abs = os.path.dirname(combined_path_abs)
try:
head_hf_s3(path, filename=name, dataset=dataset, max_retries=download_config.max_retries)
try:
head_hf_s3(path, filename=name, dataset=dataset, max_retries=download_config.max_retries)
except Exception:
pass
script_version = str(script_version) if script_version is not None else None
if path.count("/") == 0: # canonical datasets/metrics: github path
file_path = hf_github_url(path=path, name=name, dataset=dataset, version=script_version)
@@ -513,7 +515,7 @@ def prepare_packaged_module(name):
raise FileNotFoundError(
f"Couldn't find a directory or a {resource_type} named '{path}' using version {script_version}. "
f"It doesn't exist locally at {expected_dir_for_combined_path_abs} or remotely at {file_path}"
)
) from None
else:
github_file_path = file_path
file_path = hf_github_url(path=path, name=name, dataset=dataset, version="master")
@@ -527,7 +529,7 @@ def prepare_packaged_module(name):
raise FileNotFoundError(
f"Couldn't find a directory or a {resource_type} named '{path}'. "
f"It doesn't exist locally at {expected_dir_for_combined_path_abs} or remotely at {github_file_path}"
)
) from None
elif path.count("/") == 1: # users datasets/metrics: s3 path (hub for datasets and s3 for metrics)
file_path = hf_hub_url(path=path, name=name, version=script_version)
if not dataset:
@@ -544,11 +546,11 @@ def prepare_packaged_module(name):
dataset_info = hf_api.dataset_info(
repo_id=path, revision=script_version, token=download_config.use_auth_token
)
except Exception:
except Exception as exc:
raise FileNotFoundError(
f"Couldn't find a directory or a {resource_type} named '{path}'. "
f"It doesn't exist locally at {expected_dir_for_combined_path_abs} or remotely on {hf_api.endpoint}/datasets"
)
) from exc
resolved_data_files = _resolve_data_files_in_dataset_repository(
dataset_info,
data_files if data_files is not None else "*",
@@ -558,7 +560,7 @@ def prepare_packaged_module(name):
if not infered_module_name:
raise FileNotFoundError(
f"No data files found in dataset repository '{path}'. Local directory at {expected_dir_for_combined_path_abs} doesn't exist either."
)
) from None
output = prepare_packaged_module(infered_module_name)
if return_resolved_file_path:
output += (None,)
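The `head_hf_s3` call above is now wrapped in its own `try`/`except Exception: pass` because, after this commit, the helper raises on failure instead of returning the exception object (see `file_utils.py` below). A short sketch of why that signature is easier to use safely; `head_url` and `load_module` are hypothetical names, and only `requests.head` is a real API here:

import requests


def head_url(url, timeout=2.0):
    # Raises requests exceptions (ConnectionError, Timeout, ...) instead of
    # returning them, so a caller cannot silently carry an error around as a value.
    return requests.head(url, timeout=timeout)


def load_module(url="https://example.com/datasets/squad/squad.py"):
    try:
        head_url(url)  # best-effort availability check / cache warm-up
    except Exception:
        pass           # a failed HEAD must not block the real resolution below
    # ... continue resolving the script from GitHub or the Hub ...
    return "resolved"


print(load_module())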
22 changes: 12 additions & 10 deletions src/datasets/metric.py
@@ -248,13 +248,13 @@ def _create_cache_file(self, timeout=1) -> Tuple[str, FileLock]:
f"Error in _create_cache_file: another metric instance is already using the local cache file at {file_path}. "
f"Please specify an experiment_id (currently: {self.experiment_id}) to avoid collision "
f"between distributed metric instances."
)
) from None
if i == self.max_concurrent_cache_files - 1:
raise ValueError(
f"Cannot acquire lock, too many metric instance are operating concurrently on this file system."
f"You should set a larger value of max_concurrent_cache_files when creating the metric "
f"(current value is {self.max_concurrent_cache_files})."
)
) from None
# In other cases (allow to find new file name + not yet at max num of attempts) we can try to sample a new hashing name.
file_uuid = str(uuid.uuid4())
file_path = os.path.join(
@@ -292,7 +292,9 @@ def _get_all_cache_files(self) -> Tuple[List[str], List[FileLock]]:
try:
filelock.acquire(timeout=self.timeout)
except Timeout:
raise ValueError(f"Cannot acquire lock on cached file {file_path} for process {process_id}.")
raise ValueError(
f"Cannot acquire lock on cached file {file_path} for process {process_id}."
) from None
else:
filelocks.append(filelock)

@@ -310,7 +312,7 @@ def _check_all_processes_locks(self):
except Timeout:
raise ValueError(
f"Expected to find locked file {expected_lock_file_name} from process {self.process_id} but it doesn't exist."
)
) from None
else:
nofilelock.release()

@@ -322,15 +324,15 @@ def _check_rendez_vous(self):
except Timeout:
raise ValueError(
f"Expected to find locked file {expected_lock_file_name} from process {self.process_id} but it doesn't exist."
)
) from None
else:
nofilelock.release()
lock_file_name = os.path.join(self.data_dir, f"{self.experiment_id}-{self.num_process}-rdv.lock")
rendez_vous_lock = FileLock(lock_file_name)
try:
rendez_vous_lock.acquire(timeout=self.timeout)
except Timeout:
raise ValueError(f"Couldn't acquire lock on {lock_file_name} from process {self.process_id}.")
raise ValueError(f"Couldn't acquire lock on {lock_file_name} from process {self.process_id}.") from None
else:
rendez_vous_lock.release()

@@ -362,7 +364,7 @@ def _finalize(self):
raise ValueError(
"Error in finalize: another metric instance is already using the local cache file. "
"Please specify an experiment_id to avoid collision between distributed metric instances."
)
) from None

# Store file paths and locks and we will release/delete them after the computation.
self.file_paths = file_paths
@@ -439,7 +441,7 @@ def add_batch(self, *, predictions=None, references=None):
f"Expected format: {self.features},\n"
f"Input predictions: {predictions},\n"
f"Input references: {references}"
)
) from None

def add(self, *, prediction=None, reference=None):
"""Add one prediction and reference for the metric's stack.
@@ -460,7 +462,7 @@ def add(self, *, prediction=None, reference=None):
f"Expected format: {self.features},\n"
f"Input predictions: {prediction},\n"
f"Input references: {reference}"
)
) from None

def _init_writer(self, timeout=1):
if self.num_process > 1:
@@ -474,7 +476,7 @@ def _init_writer(self, timeout=1):
f"Error in _init_writer: another metric instance is already using the local cache file at {file_path}. "
f"Please specify an experiment_id (currently: {self.experiment_id}) to avoid collision "
f"between distributed metric instances."
)
) from None

if self.keep_in_memory:
self.buf_writer = pa.BufferOutputStream()
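All of the `Timeout` handlers above follow the same recipe: try to take a `FileLock` with a bounded wait, and turn a timeout into an actionable `ValueError` while `from None` drops the less useful `filelock` traceback. A compact sketch of that recipe, assuming the `filelock` package is installed (the helper name and paths are made up):

import os
import tempfile

from filelock import FileLock, Timeout


def acquire_cache_lock(file_path, experiment_id="default_experiment", timeout=1):
    filelock = FileLock(file_path + ".lock")
    try:
        filelock.acquire(timeout=timeout)
    except Timeout:
        raise ValueError(
            f"Another metric instance is already using the local cache file at {file_path}. "
            f"Please specify an experiment_id (currently: {experiment_id}) to avoid collisions."
        ) from None
    return filelock


cache_file = os.path.join(tempfile.gettempdir(), "metric-default_experiment.arrow")
lock = acquire_cache_lock(cache_file)
print("lock acquired:", lock.is_locked)
lock.release()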
2 changes: 1 addition & 1 deletion src/datasets/packaged_modules/json/json.py
@@ -141,7 +141,7 @@ def _generate_tables(self, files):
f"You should probably indicate the field of the JSON file containing your records. "
f"This JSON file contain the following fields: {str(list(dataset.keys()))}. "
f"Select the correct one and provide it as `field='XXX'` to the dataset loading method. "
)
) from None
# Uncomment for debugging (will print the Arrow table size and elements)
# logger.warning(f"pa_table: {pa_table} num rows: {pa_table.num_rows}")
# logger.warning('\n'.join(str(pa_table.slice(i, 1).to_pydict()) for i in range(pa_table.num_rows)))
13 changes: 5 additions & 8 deletions src/datasets/utils/file_utils.py
@@ -147,13 +147,10 @@ def hf_bucket_url(identifier: str, filename: str, use_cdn=False, dataset=True) -
def head_hf_s3(
identifier: str, filename: str, use_cdn=False, dataset=True, max_retries=0
) -> Union[requests.Response, Exception]:
try:
return http_head(
hf_bucket_url(identifier=identifier, filename=filename, use_cdn=use_cdn, dataset=dataset),
max_retries=max_retries,
)
except Exception as e:
return e
return http_head(
hf_bucket_url(identifier=identifier, filename=filename, use_cdn=use_cdn, dataset=dataset),
max_retries=max_retries,
)


def hf_github_url(path: str, name: str, dataset=True, version: Optional[str] = None) -> str:
@@ -421,7 +418,7 @@ def ftp_get(url, temp_file, timeout=10.0):
with closing(urllib.request.urlopen(url, timeout=timeout)) as r:
shutil.copyfileobj(r, temp_file)
except urllib.error.URLError as e:
raise ConnectionError(e)
raise ConnectionError(e) from None


def http_get(url, temp_file, proxies=None, resume_size=0, headers=None, cookies=None, timeout=100.0, max_retries=0):

2 comments on commit 110e2e5

@github-actions

PyArrow==3.0.0


Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.011538 / 0.011353 (0.000185) 0.004614 / 0.011008 (-0.006394) 0.038286 / 0.038508 (-0.000222) 0.041704 / 0.023109 (0.018595) 0.369306 / 0.275898 (0.093408) 0.419920 / 0.323480 (0.096440) 0.012397 / 0.007986 (0.004411) 0.005914 / 0.004328 (0.001585) 0.011052 / 0.004250 (0.006802) 0.059809 / 0.037052 (0.022757) 0.367247 / 0.258489 (0.108758) 0.419634 / 0.293841 (0.125793) 0.034453 / 0.128546 (-0.094093) 0.011859 / 0.075646 (-0.063788) 0.319379 / 0.419271 (-0.099892) 0.060427 / 0.043533 (0.016894) 0.373441 / 0.255139 (0.118302) 0.397437 / 0.283200 (0.114237) 0.140518 / 0.141683 (-0.001165) 2.058128 / 1.452155 (0.605973) 2.055511 / 1.492716 (0.562794)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.260915 / 0.018006 (0.242909) 0.576978 / 0.000490 (0.576488) 0.006896 / 0.000200 (0.006696) 0.000526 / 0.000054 (0.000472)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.044471 / 0.037411 (0.007060) 0.032133 / 0.014526 (0.017607) 0.031280 / 0.176557 (-0.145276) 0.149036 / 0.737135 (-0.588100) 0.037904 / 0.296338 (-0.258434)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.519615 / 0.215209 (0.304405) 5.125030 / 2.077655 (3.047375) 2.350607 / 1.504120 (0.846487) 2.047403 / 1.541195 (0.506208) 2.074662 / 1.468490 (0.606172) 0.509974 / 4.584777 (-4.074803) 6.823520 / 3.745712 (3.077808) 5.297573 / 5.269862 (0.027711) 1.376917 / 4.565676 (-3.188760) 0.058500 / 0.424275 (-0.365775) 0.006385 / 0.007607 (-0.001222) 0.686451 / 0.226044 (0.460407) 6.754794 / 2.268929 (4.485866) 3.064759 / 55.444624 (-52.379865) 2.504582 / 6.876477 (-4.371895) 2.487837 / 2.142072 (0.345764) 0.719768 / 4.805227 (-4.085459) 0.160110 / 6.500664 (-6.340554) 0.165295 / 0.075469 (0.089826)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.189009 / 1.841788 (-0.652778) 16.329618 / 8.074308 (8.255310) 35.224160 / 10.191392 (25.032768) 0.861395 / 0.680424 (0.180971) 0.633785 / 0.534201 (0.099584) 0.314176 / 0.579283 (-0.265107) 0.742021 / 0.434364 (0.307657) 0.248839 / 0.540337 (-0.291498) 0.255369 / 1.386936 (-1.131567)
PyArrow==latest

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.013383 / 0.011353 (0.002030) 0.005460 / 0.011008 (-0.005548) 0.038156 / 0.038508 (-0.000352) 0.042228 / 0.023109 (0.019119) 0.403599 / 0.275898 (0.127701) 0.396053 / 0.323480 (0.072573) 0.011801 / 0.007986 (0.003815) 0.004566 / 0.004328 (0.000237) 0.011318 / 0.004250 (0.007068) 0.045150 / 0.037052 (0.008097) 0.362394 / 0.258489 (0.103905) 0.406692 / 0.293841 (0.112851) 0.035026 / 0.128546 (-0.093520) 0.011832 / 0.075646 (-0.063814) 0.309068 / 0.419271 (-0.110203) 0.056869 / 0.043533 (0.013336) 0.374356 / 0.255139 (0.119217) 0.419303 / 0.283200 (0.136103) 0.092066 / 0.141683 (-0.049617) 2.113327 / 1.452155 (0.661172) 2.148836 / 1.492716 (0.656120)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.257851 / 0.018006 (0.239845) 0.577728 / 0.000490 (0.577238) 0.006904 / 0.000200 (0.006704) 0.000378 / 0.000054 (0.000323)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.044483 / 0.037411 (0.007072) 0.038355 / 0.014526 (0.023829) 0.032642 / 0.176557 (-0.143915) 0.148812 / 0.737135 (-0.588323) 0.035745 / 0.296338 (-0.260593)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.539744 / 0.215209 (0.324535) 5.167495 / 2.077655 (3.089841) 2.392665 / 1.504120 (0.888545) 2.110101 / 1.541195 (0.568907) 2.223917 / 1.468490 (0.755427) 0.537448 / 4.584777 (-4.047329) 6.885964 / 3.745712 (3.140252) 5.091698 / 5.269862 (-0.178163) 1.414699 / 4.565676 (-3.150977) 0.059756 / 0.424275 (-0.364519) 0.006225 / 0.007607 (-0.001382) 0.649750 / 0.226044 (0.423705) 6.801760 / 2.268929 (4.532832) 3.013116 / 55.444624 (-52.431508) 2.430926 / 6.876477 (-4.445551) 2.423413 / 2.142072 (0.281341) 0.721098 / 4.805227 (-4.084130) 0.162488 / 6.500664 (-6.338176) 0.150451 / 0.075469 (0.074981)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.178531 / 1.841788 (-0.663257) 17.826527 / 8.074308 (9.752219) 35.519246 / 10.191392 (25.327854) 1.006354 / 0.680424 (0.325930) 0.672549 / 0.534201 (0.138348) 0.284718 / 0.579283 (-0.294565) 0.715977 / 0.434364 (0.281613) 0.245078 / 0.540337 (-0.295260) 0.269747 / 1.386936 (-1.117189)


@github-actions

PyArrow==3.0.0


Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.010135 / 0.011353 (-0.001218) 0.003888 / 0.011008 (-0.007120) 0.035516 / 0.038508 (-0.002992) 0.039387 / 0.023109 (0.016278) 0.348213 / 0.275898 (0.072315) 0.378466 / 0.323480 (0.054986) 0.008927 / 0.007986 (0.000941) 0.005835 / 0.004328 (0.001506) 0.010571 / 0.004250 (0.006321) 0.045443 / 0.037052 (0.008391) 0.346433 / 0.258489 (0.087944) 0.391732 / 0.293841 (0.097891) 0.025086 / 0.128546 (-0.103461) 0.008797 / 0.075646 (-0.066850) 0.296755 / 0.419271 (-0.122516) 0.050531 / 0.043533 (0.006998) 0.370880 / 0.255139 (0.115741) 0.370398 / 0.283200 (0.087199) 0.123401 / 0.141683 (-0.018282) 1.959426 / 1.452155 (0.507271) 1.978125 / 1.492716 (0.485408)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.211012 / 0.018006 (0.193006) 0.469528 / 0.000490 (0.469038) 0.006639 / 0.000200 (0.006439) 0.000079 / 0.000054 (0.000025)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.042095 / 0.037411 (0.004683) 0.026590 / 0.014526 (0.012064) 0.028068 / 0.176557 (-0.148489) 0.143287 / 0.737135 (-0.593848) 0.029851 / 0.296338 (-0.266488)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.399410 / 0.215209 (0.184201) 4.037063 / 2.077655 (1.959408) 2.083167 / 1.504120 (0.579047) 1.955458 / 1.541195 (0.414263) 1.973776 / 1.468490 (0.505286) 0.353135 / 4.584777 (-4.231641) 5.059073 / 3.745712 (1.313361) 4.071474 / 5.269862 (-1.198387) 0.904280 / 4.565676 (-3.661396) 0.042859 / 0.424275 (-0.381416) 0.006004 / 0.007607 (-0.001603) 0.522225 / 0.226044 (0.296180) 5.116259 / 2.268929 (2.847330) 2.582387 / 55.444624 (-52.862238) 2.225406 / 6.876477 (-4.651070) 2.296172 / 2.142072 (0.154100) 0.451144 / 4.805227 (-4.354083) 0.111928 / 6.500664 (-6.388736) 0.124911 / 0.075469 (0.049442)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.050637 / 1.841788 (-0.791151) 14.430945 / 8.074308 (6.356637) 24.830766 / 10.191392 (14.639374) 0.858374 / 0.680424 (0.177951) 0.606081 / 0.534201 (0.071880) 0.257363 / 0.579283 (-0.321920) 0.577509 / 0.434364 (0.143145) 0.197526 / 0.540337 (-0.342812) 0.204573 / 1.386936 (-1.182363)
PyArrow==latest

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.010186 / 0.011353 (-0.001167) 0.003897 / 0.011008 (-0.007112) 0.036680 / 0.038508 (-0.001828) 0.040283 / 0.023109 (0.017174) 0.338637 / 0.275898 (0.062739) 0.367235 / 0.323480 (0.043755) 0.008632 / 0.007986 (0.000646) 0.003687 / 0.004328 (-0.000642) 0.010575 / 0.004250 (0.006324) 0.046380 / 0.037052 (0.009328) 0.329558 / 0.258489 (0.071069) 0.375260 / 0.293841 (0.081419) 0.025889 / 0.128546 (-0.102657) 0.008811 / 0.075646 (-0.066835) 0.304753 / 0.419271 (-0.114518) 0.051440 / 0.043533 (0.007907) 0.345351 / 0.255139 (0.090212) 0.363216 / 0.283200 (0.080016) 0.092157 / 0.141683 (-0.049526) 1.927957 / 1.452155 (0.475802) 1.989090 / 1.492716 (0.496374)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.402085 / 0.018006 (0.384079) 0.483926 / 0.000490 (0.483436) 0.050429 / 0.000200 (0.050229) 0.000474 / 0.000054 (0.000419)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.040861 / 0.037411 (0.003450) 0.026021 / 0.014526 (0.011495) 0.028252 / 0.176557 (-0.148304) 0.143775 / 0.737135 (-0.593361) 0.031013 / 0.296338 (-0.265325)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.408584 / 0.215209 (0.193375) 4.003674 / 2.077655 (1.926019) 2.084509 / 1.504120 (0.580389) 1.872823 / 1.541195 (0.331628) 1.922930 / 1.468490 (0.454440) 0.362743 / 4.584777 (-4.222034) 5.329184 / 3.745712 (1.583472) 2.271526 / 5.269862 (-2.998336) 0.915697 / 4.565676 (-3.649980) 0.043853 / 0.424275 (-0.380422) 0.006181 / 0.007607 (-0.001426) 0.534206 / 0.226044 (0.308162) 5.301082 / 2.268929 (3.032153) 2.631492 / 55.444624 (-52.813132) 2.229833 / 6.876477 (-4.646644) 2.239248 / 2.142072 (0.097175) 0.495860 / 4.805227 (-4.309368) 0.115960 / 6.500664 (-6.384704) 0.129191 / 0.075469 (0.053722)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.022342 / 1.841788 (-0.819446) 14.523643 / 8.074308 (6.449335) 25.041072 / 10.191392 (14.849680) 0.831327 / 0.680424 (0.150903) 0.591332 / 0.534201 (0.057131) 0.259992 / 0.579283 (-0.319291) 0.559421 / 0.434364 (0.125057) 0.201834 / 0.540337 (-0.338504) 0.211466 / 1.386936 (-1.175470)
