Skip to content

Commit

Permalink
Update fetching a bucket from MinIO
Browse files Browse the repository at this point in the history
Previously, each dataset had its own bucket:
  https://openml1.win.tue.nl/datasets61/dataset_61.pq

But we were advised to reduce the number of buckets and
favor hosting many objects in a hierarchical structure, so
we now use prefixes instead to divide up the
dataset objects into separate subdirectories:

  https://openml1.win.tue.nl/datasets/0000/0061/dataset_61.pq

This commit has bypassed pre-commit. Tests should be
updated too.
  • Loading branch information
PGijsbers committed Jan 8, 2024
1 parent 43c66aa commit fc9462b
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 6 deletions.
9 changes: 5 additions & 4 deletions openml/_api_calls.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,14 +185,15 @@ def _download_minio_bucket(
parsed_url = urllib.parse.urlparse(source)

# expect path format: /BUCKET/path/to/file.ext
bucket = parsed_url.path[1:]
_, bucket, *prefix, file = parsed_url.path.split("/")
prefix = "/".join(prefix)

client = minio.Minio(endpoint=parsed_url.netloc, secure=False)

for file_object in client.list_objects(bucket, recursive=True):
for file_object in client.list_objects(bucket, prefix=prefix, recursive=True):
_download_minio_file(
source=source + "/" + file_object.object_name,
destination=pathlib.Path(destination, file_object.object_name),
source=source.rsplit("/", 1)[0] + "/" + file_object.object_name.rsplit("/", 1)[1],
destination=pathlib.Path(destination, file_object.object_name.rsplit("/", 1)[1]),
exists_ok=True,
)

Expand Down
2 changes: 0 additions & 2 deletions openml/datasets/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1110,8 +1110,6 @@ def _get_dataset_parquet(
# For now, it would be the only way for the user to fetch the additional
# files in the bucket (no function exists on an OpenMLDataset to do this).
if download_all_files:
if url.endswith(".pq"):
url, _ = url.rsplit("/", maxsplit=1)
openml._api_calls._download_minio_bucket(source=cast(str, url), destination=cache_directory)

if not os.path.isfile(output_file_path):
Expand Down

0 comments on commit fc9462b

Please sign in to comment.