Update fetching a bucket from MinIO

Previously, each dataset had its own bucket (e.g. https://openml1.win.tue.nl/datasets61/dataset_61.pq ). However, we were advised to reduce the number of buckets and instead host many objects in a hierarchical structure, so we now use prefixes to divide the dataset objects into separate subdirectories (e.g. https://openml1.win.tue.nl/datasets/0000/0061/dataset_61.pq ). Note: this commit bypassed pre-commit, and the tests still need to be updated to match.
openml · Jan 8, 2024 · fc9462b · fc9462b
1 parent 43c66aa
commit fc9462b
Show file tree

Hide file tree

Showing 2 changed files with 5 additions and 6 deletions.
diff --git a/openml/_api_calls.py b/openml/_api_calls.py
@@ -185,14 +185,15 @@ def _download_minio_bucket(
     parsed_url = urllib.parse.urlparse(source)
 
     # expect path format: /BUCKET/path/to/file.ext
-    bucket = parsed_url.path[1:]
+    _, bucket, *prefix, file = parsed_url.path.split("/")
+    prefix = "/".join(prefix)
 
     client = minio.Minio(endpoint=parsed_url.netloc, secure=False)
 
-    for file_object in client.list_objects(bucket, recursive=True):
+    for file_object in client.list_objects(bucket, prefix=prefix, recursive=True):
         _download_minio_file(
-            source=source + "/" + file_object.object_name,
-            destination=pathlib.Path(destination, file_object.object_name),
+            source=source.rsplit("/", 1)[0] + "/" + file_object.object_name.rsplit("/", 1)[1],
+            destination=pathlib.Path(destination, file_object.object_name.rsplit("/", 1)[1]),
             exists_ok=True,
         )
 

diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py
@@ -1110,8 +1110,6 @@ def _get_dataset_parquet(
     # For now, it would be the only way for the user to fetch the additional
     # files in the bucket (no function exists on an OpenMLDataset to do this).
     if download_all_files:
-        if url.endswith(".pq"):
-            url, _ = url.rsplit("/", maxsplit=1)
         openml._api_calls._download_minio_bucket(source=cast(str, url), destination=cache_directory)
 
     if not os.path.isfile(output_file_path):