Skip to content

Commit

Permalink
better error message when downloading
Browse files Browse the repository at this point in the history
  • Loading branch information
lhoestq committed Nov 30, 2021
1 parent e6f1352 commit a74e15b
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 3 deletions.
3 changes: 3 additions & 0 deletions src/datasets/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -1060,6 +1060,9 @@ def dataset_module_factory(
download_config = DownloadConfig(**download_kwargs)
download_config.extract_compressed_file = True
download_config.force_extract = True
download_config.force_download = download_mode = (
GenerateMode(download_mode or GenerateMode.REUSE_DATASET_IF_EXISTS) == GenerateMode.FORCE_REDOWNLOAD
)

filename = list(filter(lambda x: x, path.replace(os.sep, "/").split("/")))[-1]
if not filename.endswith(".py"):
Expand Down
17 changes: 14 additions & 3 deletions src/datasets/utils/file_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -538,6 +538,7 @@ def get_from_cache(
response = None
cookies = None
etag = None
head_error = None

# Try a first time to find the file on the local file system without eTag (None)
# if we don't ask for 'force_download', then we spare a request
Expand Down Expand Up @@ -588,14 +589,19 @@ def get_from_cache(
):
connected = True
logger.info(f"Couldn't get ETag version for url {url}")
except (EnvironmentError, requests.exceptions.Timeout):
elif response.status_code == 401 and config.HF_ENDPOINT in url and use_auth_token is None:
raise ConnectionError(
f"Unauthorized for URL {url}. Please use the parameter ``use_auth_token=True`` after logging in with ``huggingface-cli login``"
)
except (EnvironmentError, requests.exceptions.Timeout) as e:
# not connected
head_error = e
pass

# connected == False = we don't have a connection, or url doesn't exist, or is otherwise inaccessible.
# try to get the last downloaded one
if not connected:
if os.path.exists(cache_path):
if os.path.exists(cache_path) and not force_download:
return cache_path
if local_files_only:
raise FileNotFoundError(
Expand All @@ -605,7 +611,12 @@ def get_from_cache(
elif response is not None and response.status_code == 404:
raise FileNotFoundError(f"Couldn't find file at {url}")
_raise_if_offline_mode_is_enabled(f"Tried to reach {url}")
raise ConnectionError(f"Couldn't reach {url}")
if head_error is not None:
raise ConnectionError(f"Couldn't reach {url} ({repr(head_error)})")
elif response is not None:
raise ConnectionError(f"Couldn't reach {url} (error {response.status_code})")
else:
raise ConnectionError(f"Couldn't reach {url}")

# Try a second time
filename = hash_url_to_filename(cached_url, etag)
Expand Down

1 comment on commit a74e15b

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Show benchmarks

PyArrow==3.0.0

Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.070537 / 0.011353 (0.059184) 0.004016 / 0.011008 (-0.006993) 0.031724 / 0.038508 (-0.006784) 0.035579 / 0.023109 (0.012469) 0.285859 / 0.275898 (0.009961) 0.331944 / 0.323480 (0.008464) 0.083699 / 0.007986 (0.075713) 0.005010 / 0.004328 (0.000681) 0.009307 / 0.004250 (0.005056) 0.042523 / 0.037052 (0.005471) 0.285299 / 0.258489 (0.026810) 0.327437 / 0.293841 (0.033596) 0.085458 / 0.128546 (-0.043088) 0.008991 / 0.075646 (-0.066655) 0.254881 / 0.419271 (-0.164390) 0.046239 / 0.043533 (0.002707) 0.291007 / 0.255139 (0.035868) 0.317854 / 0.283200 (0.034655) 0.084160 / 0.141683 (-0.057522) 1.743395 / 1.452155 (0.291241) 1.843641 / 1.492716 (0.350925)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.361437 / 0.018006 (0.343430) 0.550506 / 0.000490 (0.550017) 0.018641 / 0.000200 (0.018441) 0.000272 / 0.000054 (0.000217)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.037030 / 0.037411 (-0.000381) 0.022095 / 0.014526 (0.007569) 0.026219 / 0.176557 (-0.150337) 0.201597 / 0.737135 (-0.535539) 0.028284 / 0.296338 (-0.268054)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.449394 / 0.215209 (0.234185) 4.487727 / 2.077655 (2.410072) 2.072323 / 1.504120 (0.568203) 1.846186 / 1.541195 (0.304991) 1.913871 / 1.468490 (0.445381) 0.423163 / 4.584777 (-4.161614) 4.668130 / 3.745712 (0.922418) 2.364731 / 5.269862 (-2.905130) 0.870391 / 4.565676 (-3.695285) 0.050165 / 0.424275 (-0.374110) 0.011286 / 0.007607 (0.003679) 0.556797 / 0.226044 (0.330753) 5.526619 / 2.268929 (3.257690) 2.548610 / 55.444624 (-52.896014) 2.179211 / 6.876477 (-4.697266) 2.288500 / 2.142072 (0.146428) 0.542351 / 4.805227 (-4.262876) 0.115779 / 6.500664 (-6.384885) 0.057666 / 0.075469 (-0.017803)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.558363 / 1.841788 (-0.283425) 12.292840 / 8.074308 (4.218532) 26.847194 / 10.191392 (16.655802) 0.807745 / 0.680424 (0.127321) 0.528455 / 0.534201 (-0.005746) 0.368042 / 0.579283 (-0.211241) 0.497985 / 0.434364 (0.063621) 0.251437 / 0.540337 (-0.288900) 0.260915 / 1.386936 (-1.126021)
PyArrow==latest
Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.068904 / 0.011353 (0.057551) 0.004071 / 0.011008 (-0.006937) 0.029798 / 0.038508 (-0.008710) 0.034126 / 0.023109 (0.011016) 0.340715 / 0.275898 (0.064817) 0.369065 / 0.323480 (0.045585) 0.088590 / 0.007986 (0.080604) 0.004432 / 0.004328 (0.000103) 0.007440 / 0.004250 (0.003190) 0.045000 / 0.037052 (0.007947) 0.339851 / 0.258489 (0.081362) 0.383257 / 0.293841 (0.089416) 0.084252 / 0.128546 (-0.044295) 0.009088 / 0.075646 (-0.066559) 0.252889 / 0.419271 (-0.166383) 0.046016 / 0.043533 (0.002483) 0.328779 / 0.255139 (0.073640) 0.355265 / 0.283200 (0.072065) 0.083563 / 0.141683 (-0.058120) 1.680631 / 1.452155 (0.228477) 1.757177 / 1.492716 (0.264461)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.369807 / 0.018006 (0.351801) 0.548927 / 0.000490 (0.548437) 0.003144 / 0.000200 (0.002944) 0.000088 / 0.000054 (0.000033)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.034251 / 0.037411 (-0.003160) 0.021472 / 0.014526 (0.006946) 0.027304 / 0.176557 (-0.149253) 0.197710 / 0.737135 (-0.539426) 0.028597 / 0.296338 (-0.267741)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.436087 / 0.215209 (0.220877) 4.356086 / 2.077655 (2.278431) 1.943439 / 1.504120 (0.439319) 1.744667 / 1.541195 (0.203472) 1.833605 / 1.468490 (0.365114) 0.421464 / 4.584777 (-4.163313) 4.675316 / 3.745712 (0.929604) 2.118813 / 5.269862 (-3.151048) 0.877378 / 4.565676 (-3.688299) 0.050404 / 0.424275 (-0.373871) 0.011174 / 0.007607 (0.003567) 0.542700 / 0.226044 (0.316655) 5.447621 / 2.268929 (3.178692) 2.369851 / 55.444624 (-53.074773) 2.025482 / 6.876477 (-4.850995) 2.185834 / 2.142072 (0.043762) 0.536941 / 4.805227 (-4.268286) 0.115357 / 6.500664 (-6.385307) 0.055914 / 0.075469 (-0.019556)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.556461 / 1.841788 (-0.285327) 12.129746 / 8.074308 (4.055438) 26.887470 / 10.191392 (16.696078) 0.806739 / 0.680424 (0.126315) 0.533396 / 0.534201 (-0.000805) 0.371756 / 0.579283 (-0.207527) 0.511830 / 0.434364 (0.077466) 0.264107 / 0.540337 (-0.276230) 0.275812 / 1.386936 (-1.111124)

CML watermark

Please sign in to comment.