Skip to content

Commit

Permalink
Merge branch 'main' into 1539-InferenceClient-text-classification
Browse files Browse the repository at this point in the history
  • Loading branch information
martinbrose committed Sep 5, 2023
2 parents 559872f + e328730 commit 2cca3fd
Show file tree
Hide file tree
Showing 7 changed files with 191 additions and 20 deletions.
18 changes: 9 additions & 9 deletions docs/source/en/guides/upload.md
Original file line number Diff line number Diff line change
Expand Up @@ -384,14 +384,14 @@ getting an upload/push to fail at the end of the process or encountering a degra
We gathered a list of tips and recommendations for structuring your repo.


| Characteristic | Recommended | Tips |
| ---------------- | ------------------ | ---------------------------------------- |
| Repo size | - | contact us for large repos (TBs of data) |
| Files per repo | <100k | merge data into fewer files |
| Entries per folder | <10k | use subdirectories in repo |
| File size | <5GB | split data into chunked files |
| Commit size | <100 files* | upload files in multiple commits |
| Commits per repo | - | upload multiple files per commit |
| Characteristic | Recommended | Tips |
| ---------------- | ------------------ | ------------------------------------------------------ |
| Repo size | - | contact us for large repos (TBs of data) |
| Files per repo | <100k | merge data into fewer files |
| Entries per folder | <10k | use subdirectories in repo |
| File size | <5GB | split data into chunked files |
| Commit size | <100 files* | upload files in multiple commits |
| Commits per repo | - | upload multiple files per commit and/or squash history |

_* Not relevant when using `git` CLI directly_

Expand Down Expand Up @@ -424,7 +424,7 @@ In all cases no single LFS file will be able to be >50GB. I.e. 50GB is the hard
our experience, the user experience on the Hub starts to degrade after a few thousand commits. We are constantly working to
improve the service, but one must always remember that a git repository is not meant to work as a database with a lot of
writes. If your repo's history gets very large, it is always possible to squash all the commits to get a
fresh start.
fresh start using [`super_squash_history`]. This is a non-revertible operation.
- **Number of operations per commit**: Once again, there is no hard limit here. When a commit is uploaded on the Hub, each
git operation (addition or delete) is checked by the server. When a hundred LFS files are committed at once,
each file is checked individually to ensure it's been correctly uploaded. When pushing data through HTTP with `huggingface_hub`,
Expand Down
2 changes: 2 additions & 0 deletions src/huggingface_hub/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@
"run_as_future",
"set_space_sleep_time",
"space_info",
"super_squash_history",
"unlike",
"update_repo_visibility",
"upload_file",
Expand Down Expand Up @@ -505,6 +506,7 @@ def __dir__():
run_as_future, # noqa: F401
set_space_sleep_time, # noqa: F401
space_info, # noqa: F401
super_squash_history, # noqa: F401
unlike, # noqa: F401
update_repo_visibility, # noqa: F401
upload_file, # noqa: F401
Expand Down
11 changes: 10 additions & 1 deletion src/huggingface_hub/_commit_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ class CommitScheduler:
If provided, only files matching at least one pattern are uploaded.
ignore_patterns (`List[str]` or `str`, *optional*):
If provided, files matching any of the patterns are not uploaded.
squash_history (`bool`, *optional*):
Whether to squash the history of the repo after each commit. Defaults to `False`. Squashing commits is
useful to avoid degraded performance on the repo when it grows too large.
hf_api (`HfApi`, *optional*):
The [`HfApi`] client to use to commit to the Hub. Can be set with custom settings (user agent, token,...).
Expand Down Expand Up @@ -90,6 +93,7 @@ def __init__(
token: Optional[str] = None,
allow_patterns: Optional[Union[List[str], str]] = None,
ignore_patterns: Optional[Union[List[str], str]] = None,
squash_history: bool = False,
hf_api: Optional["HfApi"] = None,
) -> None:
self.api = hf_api or HfApi(token=token)
Expand Down Expand Up @@ -124,6 +128,7 @@ def __init__(
raise ValueError(f"'every' must be a positive integer, not '{every}'.")
self.lock = Lock()
self.every = every
self.squash_history = squash_history

logger.info(f"Scheduled job to push '{self.folder_path}' to '{self.repo_id}' every {self.every} minutes.")
self._scheduler_thread = Thread(target=self._run_scheduler, daemon=True)
Expand Down Expand Up @@ -161,7 +166,11 @@ def _push_to_hub(self) -> Optional[CommitInfo]:

logger.info("(Background) scheduled commit triggered.")
try:
return self.push_to_hub()
value = self.push_to_hub()
if self.squash_history:
logger.info("(Background) squashing repo history.")
self.api.super_squash_history(repo_id=self.repo_id, repo_type=self.repo_type, branch=self.revision)
return value
except Exception as e:
logger.error(f"Error while pushing to Hub: {e}") # Depending on the setup, error might be silenced
raise
Expand Down
10 changes: 10 additions & 0 deletions src/huggingface_hub/_tensorboard_logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@ class HFSummaryWriter(SummaryWriter):
underlying `SummaryWriter` object.
commit_every (`int` or `float`, *optional*):
The frequency (in minutes) at which the logs will be pushed to the Hub. Defaults to 5 minutes.
squash_history (`bool`, *optional*):
Whether to squash the history of the repo after each commit. Defaults to `False`. Squashing commits is
useful to avoid degraded performance on the repo when it grows too large.
repo_type (`str`, *optional*):
The type of the repo to which the logs will be pushed. Defaults to "model".
repo_revision (`str`, *optional*):
Expand Down Expand Up @@ -114,6 +117,7 @@ def __init__(
*,
logdir: Optional[str] = None,
commit_every: Union[int, float] = 5,
squash_history: bool = False,
repo_type: Optional[str] = None,
repo_revision: Optional[str] = None,
repo_private: bool = False,
Expand Down Expand Up @@ -148,8 +152,14 @@ def __init__(
allow_patterns=repo_allow_patterns,
ignore_patterns=repo_ignore_patterns,
every=commit_every,
squash_history=squash_history,
)

# Exposing some high-level info at root level
self.repo_id = self.scheduler.repo_id
self.repo_type = self.scheduler.repo_type
self.repo_revision = self.scheduler.revision

def __exit__(self, exc_type, exc_val, exc_tb):
"""Push to hub in a non-blocking way when exiting the logger's context manager."""
super().__exit__(exc_type, exc_val, exc_tb)
Expand Down
107 changes: 97 additions & 10 deletions src/huggingface_hub/hf_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -2355,6 +2355,91 @@ def list_repo_commits(
)
]

@validate_hf_hub_args
def super_squash_history(
    self,
    repo_id: str,
    *,
    branch: Optional[str] = None,
    commit_message: Optional[str] = None,
    repo_type: Optional[str] = None,
    token: Optional[str] = None,
) -> None:
    """Squash commit history on a branch for a repo on the Hub.

    Squashing the repo history is useful when you know you'll make hundreds of commits and you don't want to
    clutter the history. Squashing commits can only be performed from the head of a branch.

    <Tip warning={true}>

    Once squashed, the commit history cannot be retrieved. This is a non-revertible operation.

    </Tip>

    <Tip warning={true}>

    Once the history of a branch has been squashed, it is not possible to merge it back into another branch since
    their history will have diverged.

    </Tip>

    Args:
        repo_id (`str`):
            A namespace (user or an organization) and a repo name separated by a `/`.
        branch (`str`, *optional*):
            The branch to squash. Defaults to the head of the `"main"` branch.
        commit_message (`str`, *optional*):
            The commit message to use for the squashed commit. Defaults to a generated
            `"Super-squash branch '<branch>' using huggingface_hub"` message.
        repo_type (`str`, *optional*):
            Set to `"dataset"` or `"space"` if listing commits from a dataset or a Space, `None` or `"model"` if
            listing from a model. Default is `None`.
        token (`str`, *optional*):
            A valid authentication token (see https://huggingface.co/settings/token). If the machine is logged in
            (through `huggingface-cli login` or [`~huggingface_hub.login`]), token can be automatically retrieved
            from the cache.

    Raises:
        [`~utils.RepositoryNotFoundError`]:
            If repository is not found (error 404): wrong repo_id/repo_type, private but not authenticated or repo
            does not exist.
        [`~utils.RevisionNotFoundError`]:
            If the branch to squash cannot be found.
        [`~utils.BadRequestError`]:
            If invalid reference for a branch. You cannot squash history on tags.

    Example:
    ```py
    >>> from huggingface_hub import HfApi
    >>> api = HfApi()

    # Create repo
    >>> repo_id = api.create_repo("test-squash").repo_id

    # Make a lot of commits.
    >>> api.upload_file(repo_id=repo_id, path_in_repo="file.txt", path_or_fileobj=b"content")
    >>> api.upload_file(repo_id=repo_id, path_in_repo="lfs.bin", path_or_fileobj=b"content")
    >>> api.upload_file(repo_id=repo_id, path_in_repo="file.txt", path_or_fileobj=b"another_content")

    # Squash history
    >>> api.super_squash_history(repo_id=repo_id)
    ```
    """
    # Default to squashing a model repo's "main" branch.
    if repo_type is None:
        repo_type = REPO_TYPE_MODEL
    if repo_type not in REPO_TYPES:
        raise ValueError("Invalid repo type")
    if branch is None:
        branch = DEFAULT_REVISION

    # Prepare request (write token required: squashing rewrites the branch history server-side)
    url = f"{self.endpoint}/api/{repo_type}s/{repo_id}/super-squash/{branch}"
    headers = self._build_hf_headers(token=token, is_write_action=True)
    commit_message = commit_message or f"Super-squash branch '{branch}' using huggingface_hub"

    # Super-squash
    response = get_session().post(url=url, headers=headers, json={"message": commit_message})
    hf_raise_for_status(response)

@validate_hf_hub_args
def create_repo(
self,
Expand Down Expand Up @@ -2485,7 +2570,8 @@ def delete_repo(
*,
token: Optional[str] = None,
repo_type: Optional[str] = None,
):
missing_ok: bool = False,
) -> None:
"""
Delete a repo from the HuggingFace Hub. CAUTION: this is irreversible.
Expand All @@ -2498,16 +2584,12 @@ def delete_repo(
repo_type (`str`, *optional*):
Set to `"dataset"` or `"space"` if uploading to a dataset or
space, `None` or `"model"` if uploading to a model.
missing_ok (`bool`, *optional*, defaults to `False`):
If `True`, do not raise an error if repo does not exist.
<Tip>
Raises the following errors:
Raises:
- [`~utils.RepositoryNotFoundError`]
If the repository to download from cannot be found. This may be because it doesn't exist,
or because it is set to `private` and you do not have access.
</Tip>
If the repository to delete from cannot be found and `missing_ok` is set to False (default).
"""
organization, name = repo_id.split("/") if "/" in repo_id else (None, repo_id)

Expand All @@ -2522,7 +2604,11 @@ def delete_repo(

headers = self._build_hf_headers(token=token, is_write_action=True)
r = get_session().delete(path, headers=headers, json=json)
hf_raise_for_status(r)
try:
hf_raise_for_status(r)
except RepositoryNotFoundError:
if not missing_ok:
raise

@validate_hf_hub_args
def update_repo_visibility(
Expand Down Expand Up @@ -5761,6 +5847,7 @@ def _parse_revision_from_pr_url(pr_url: str) -> str:
create_repo = api.create_repo
delete_repo = api.delete_repo
update_repo_visibility = api.update_repo_visibility
super_squash_history = api.super_squash_history
move_repo = api.move_repo
upload_file = api.upload_file
upload_folder = api.upload_folder
Expand Down
26 changes: 26 additions & 0 deletions tests/test_commit_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,32 @@ def _download(filename: str, revision: str) -> Path:
self.assertEqual(lfs_push2.read_text(), "binary content")
self.assertEqual(lfs_push3.read_text(), "binary content updated")

def test_sync_and_squash_history(self) -> None:
    """A scheduler created with `squash_history=True` leaves a single commit after pushing."""
    folder = self.cache_dir / "watched_folder"
    folder.mkdir(exist_ok=True, parents=True)
    target = folder / "file.txt"
    with target.open("a") as fh:
        fh.write("first line\n")

    self.scheduler = CommitScheduler(
        folder_path=folder,
        repo_id=self.repo_name,
        every=1 / 60,  # every 0.1s
        hf_api=self.api,
        squash_history=True,
    )

    # Wait long enough for at least one background push, then flush remaining work.
    time.sleep(0.5)
    self.scheduler.stop()
    self.scheduler.last_future.result()

    # The whole branch history must have been collapsed into one squash commit.
    history = self.api.list_repo_commits(repo_id=self.scheduler.repo_id)
    self.assertEqual(len(history), 1)
    self.assertEqual(history[0].title, "Super-squash branch 'main' using huggingface_hub")


@pytest.mark.usefixtures("fx_cache_dir")
class TestPartialFileIO(unittest.TestCase):
Expand Down
37 changes: 37 additions & 0 deletions tests/test_hf_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,10 @@ def test_delete_repo_error_message(self):
):
self._api.delete_repo("repo-that-does-not-exist")

@retry_endpoint
def test_delete_repo_missing_ok(self) -> None:
    """Deleting a non-existent repo with `missing_ok=True` must not raise."""
    self._api.delete_repo("repo-that-does-not-exist", missing_ok=True)

@retry_endpoint
def test_create_update_and_delete_repo(self):
REPO_NAME = repo_name("crud")
Expand Down Expand Up @@ -2416,6 +2420,39 @@ def test_list_likes_on_production(self) -> None:
self.assertGreater(len(likes.spaces), 0)


class TestSquashHistory(HfApiCommonTest):
    """Integration tests for `HfApi.super_squash_history`."""

    @use_tmp_repo()
    def test_super_squash_history(self, repo_url: RepoUrl) -> None:
        """Squashing `main` must not touch other branches; each branch squashes independently."""
        repo_id = repo_url.repo_id

        # Build some history on main: two updates of a regular file plus one LFS file.
        for path, content in (
            ("file.txt", b"content"),
            ("lfs.bin", b"content"),
            ("file.txt", b"another_content"),
        ):
            self._api.upload_file(repo_id=repo_id, path_in_repo=path, path_or_fileobj=content)

        # Branch off and add one extra commit on the new branch.
        self._api.create_branch(repo_id=repo_id, branch="v0.1", exist_ok=True)
        self._api.upload_file(repo_id=repo_id, path_in_repo="file.txt", path_or_fileobj=b"foo", revision="v0.1")

        # Squash main only, then fetch both histories.
        self._api.super_squash_history(repo_id=repo_id)
        main_history = self._api.list_repo_commits(repo_id=repo_id, revision="main")
        side_history = self._api.list_repo_commits(repo_id=repo_id, revision="v0.1")

        # Main is down to a single squash commit; the side branch keeps its full history.
        self.assertEqual(len(main_history), 1)
        self.assertEqual(main_history[0].title, "Super-squash branch 'main' using huggingface_hub")
        self.assertEqual(len(side_history), 5)
        self.assertEqual(side_history[-1].title, "initial commit")

        # The side branch can be squashed independently as well.
        self._api.super_squash_history(repo_id=repo_id, branch="v0.1")
        squashed = self._api.list_repo_commits(repo_id=repo_id, revision="v0.1")
        self.assertEqual(len(squashed), 1)
        self.assertEqual(squashed[0].title, "Super-squash branch 'v0.1' using huggingface_hub")


@pytest.mark.usefixtures("fx_production_space")
class TestSpaceAPIProduction(unittest.TestCase):
"""
Expand Down

0 comments on commit 2cca3fd

Please sign in to comment.