Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use HF_HUB_OFFLINE instead of HF_DATASETS_OFFLINE #6968

Merged
merged 2 commits into from
Jun 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/source/loading.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,7 @@ For more details, check out the [how to load tabular datasets from Pandas DataFr

Even if you don't have an internet connection, it is still possible to load a dataset. As long as you've downloaded a dataset from the Hub repository before, it should be cached. This means you can reload the dataset from the cache and use it offline.

If you know you won't have internet access, you can run 🤗 Datasets in full offline mode. This saves time because instead of waiting for the Dataset builder download to time out, 🤗 Datasets will look directly in the cache. Set the environment variable `HF_DATASETS_OFFLINE` to `1` to enable full offline mode.
If you know you won't have internet access, you can run 🤗 Datasets in full offline mode. This saves time because instead of waiting for the Dataset builder download to time out, 🤗 Datasets will look directly in the cache. Set the environment variable `HF_HUB_OFFLINE` to `1` to enable full offline mode.

## Slice splits

Expand Down
5 changes: 4 additions & 1 deletion src/datasets/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from pathlib import Path
from typing import Optional

from huggingface_hub import constants
from packaging import version


Expand Down Expand Up @@ -215,7 +216,9 @@
PARQUET_ROW_GROUP_SIZE_FOR_BINARY_DATASETS = 100

# Offline mode
HF_DATASETS_OFFLINE = os.environ.get("HF_DATASETS_OFFLINE", "AUTO").upper() in ENV_VARS_TRUE_VALUES
_offline = os.environ.get("HF_DATASETS_OFFLINE")
HF_HUB_OFFLINE = constants.HF_HUB_OFFLINE if _offline is None else _offline.upper() in ENV_VARS_TRUE_VALUES
HF_DATASETS_OFFLINE = HF_HUB_OFFLINE # kept for backward-compatibility

# Here, `True` will disable progress bars globally without possibility of enabling it
# programmatically. `False` will enable them without possibility of disabling them.
Expand Down
6 changes: 3 additions & 3 deletions src/datasets/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@ def files_to_hash(file_paths: List[str]) -> str:

def increase_load_count(name: str, resource_type: str):
"""Update the download count of a dataset or metric."""
if not config.HF_DATASETS_OFFLINE and config.HF_UPDATE_DOWNLOAD_COUNTS:
if not config.HF_HUB_OFFLINE and config.HF_UPDATE_DOWNLOAD_COUNTS:
try:
head_hf_s3(name, filename=name + ".py", dataset=(resource_type == "dataset"))
except Exception:
Expand Down Expand Up @@ -1595,7 +1595,7 @@ def _get_modification_time(module_hash):
f"(last modified on {time.ctime(_get_modification_time(hash))}) since it "
f"couldn't be found locally at {self.name}"
)
if not config.HF_DATASETS_OFFLINE:
if not config.HF_HUB_OFFLINE:
warning_msg += ", or remotely on the Hugging Face Hub."
logger.warning(warning_msg)
importable_file_path = _get_importable_file_path(
Expand Down Expand Up @@ -1632,7 +1632,7 @@ def _get_modification_time(module_hash):
"dataset_name": self.name.split("/")[-1],
}
warning_msg = f"Using the latest cached version of the dataset since {self.name} couldn't be found on the Hugging Face Hub"
if config.HF_DATASETS_OFFLINE:
if config.HF_HUB_OFFLINE:
warning_msg += " (offline mode is enabled)."
logger.warning(warning_msg)
return DatasetModule(
Expand Down
6 changes: 3 additions & 3 deletions src/datasets/utils/file_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,8 +299,8 @@ class OfflineModeIsEnabled(ConnectionError):


def _raise_if_offline_mode_is_enabled(msg: Optional[str] = None):
"""Raise an OfflineModeIsEnabled error (subclass of ConnectionError) if HF_DATASETS_OFFLINE is True."""
if config.HF_DATASETS_OFFLINE:
"""Raise an OfflineModeIsEnabled error (subclass of ConnectionError) if HF_HUB_OFFLINE is True."""
if config.HF_HUB_OFFLINE:
raise OfflineModeIsEnabled(
"Offline mode is enabled." if msg is None else "Offline mode is enabled. " + str(msg)
)
Expand All @@ -317,7 +317,7 @@ def _request_with_retry(
) -> requests.Response:
"""Wrapper around requests to retry in case it fails with a ConnectTimeout, with exponential backoff.

Note that if the environment variable HF_DATASETS_OFFLINE is set to 1, then a OfflineModeIsEnabled error is raised.
Note that if the environment variable HF_HUB_OFFLINE is set to 1, then a OfflineModeIsEnabled error is raised.

Args:
method (str): HTTP method, such as 'GET' or 'HEAD'.
Expand Down
8 changes: 4 additions & 4 deletions tests/test_file_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,13 +143,13 @@ def test_get_from_cache_fsspec(tmpfs_file):
assert output_file_content == FILE_CONTENT


@patch("datasets.config.HF_DATASETS_OFFLINE", True)
@patch("datasets.config.HF_HUB_OFFLINE", True)
def test_cached_path_offline():
with pytest.raises(OfflineModeIsEnabled):
cached_path("https://huggingface.co")


@patch("datasets.config.HF_DATASETS_OFFLINE", True)
@patch("datasets.config.HF_HUB_OFFLINE", True)
def test_http_offline(tmp_path_factory):
filename = tmp_path_factory.mktemp("data") / "file.html"
with pytest.raises(OfflineModeIsEnabled):
Expand All @@ -158,7 +158,7 @@ def test_http_offline(tmp_path_factory):
http_head("https://huggingface.co")


@patch("datasets.config.HF_DATASETS_OFFLINE", True)
@patch("datasets.config.HF_HUB_OFFLINE", True)
def test_ftp_offline(tmp_path_factory):
filename = tmp_path_factory.mktemp("data") / "file.html"
with pytest.raises(OfflineModeIsEnabled):
Expand All @@ -167,7 +167,7 @@ def test_ftp_offline(tmp_path_factory):
ftp_head("ftp://huggingface.co")


@patch("datasets.config.HF_DATASETS_OFFLINE", True)
@patch("datasets.config.HF_HUB_OFFLINE", True)
def test_fsspec_offline(tmp_path_factory):
filename = tmp_path_factory.mktemp("data") / "file.html"
with pytest.raises(OfflineModeIsEnabled):
Expand Down
2 changes: 1 addition & 1 deletion tests/test_load.py
Original file line number Diff line number Diff line change
Expand Up @@ -1076,7 +1076,7 @@ def test_load_dataset_from_hub(self):
with offline(offline_simulation_mode):
with self.assertRaises(ConnectionError) as context:
datasets.load_dataset("_dummy")
if offline_simulation_mode != OfflineSimulationMode.HF_DATASETS_OFFLINE_SET_TO_1:
if offline_simulation_mode != OfflineSimulationMode.HF_HUB_OFFLINE_SET_TO_1:
self.assertIn(
"Couldn't reach '_dummy' on the Hub",
str(context.exception),
Expand Down
2 changes: 1 addition & 1 deletion tests/test_offline_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,6 @@ def test_offline_with_connection_error():


def test_offline_with_datasets_offline_mode_enabled():
with offline(OfflineSimulationMode.HF_DATASETS_OFFLINE_SET_TO_1):
with offline(OfflineSimulationMode.HF_HUB_OFFLINE_SET_TO_1):
with pytest.raises(ConnectionError):
http_head("https://huggingface.co")
8 changes: 4 additions & 4 deletions tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,7 @@ class RequestWouldHangIndefinitelyError(Exception):
class OfflineSimulationMode(Enum):
CONNECTION_FAILS = 0
CONNECTION_TIMES_OUT = 1
HF_DATASETS_OFFLINE_SET_TO_1 = 2
HF_HUB_OFFLINE_SET_TO_1 = 2


@contextmanager
Expand All @@ -362,7 +362,7 @@ def offline(mode=OfflineSimulationMode.CONNECTION_FAILS, timeout=1e-16):
CONNECTION_TIMES_OUT: the connection hangs until it times out.
The default timeout value is low (1e-16) to speed up the tests.
Timeout errors are created by mocking requests.request
HF_DATASETS_OFFLINE_SET_TO_1: the HF_DATASETS_OFFLINE environment variable is set to 1.
HF_HUB_OFFLINE_SET_TO_1: the HF_HUB_OFFLINE environment variable is set to 1.
This makes the http/ftp calls of the library instantly fail and raise an OfflineModeEmabled error.
"""
online_request = requests.Session().request
Expand Down Expand Up @@ -395,8 +395,8 @@ def raise_connection_error(session, prepared_request, **kwargs):
# inspired from https://stackoverflow.com/a/904609
with patch("requests.Session.request", timeout_request):
yield
elif mode is OfflineSimulationMode.HF_DATASETS_OFFLINE_SET_TO_1:
with patch("datasets.config.HF_DATASETS_OFFLINE", True):
elif mode is OfflineSimulationMode.HF_HUB_OFFLINE_SET_TO_1:
with patch("datasets.config.HF_HUB_OFFLINE", True):
yield
else:
raise ValueError("Please use a value from the OfflineSimulationMode enum.")
Expand Down
Loading