From f8e332104dd4827da559a39d06d7849b9ed3f77d Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Wed, 30 Oct 2024 14:36:02 +0100 Subject: [PATCH] Disallow video push_to_hub (#7265) * disallow video push_to_hub * docs * minor --- docs/source/_toctree.yml | 2 + docs/source/how_to.md | 2 +- docs/source/video_dataset.mdx | 172 ++++++++++++++++++++++++++++++++++ docs/source/video_load.mdx | 6 ++ src/datasets/arrow_dataset.py | 7 ++ 5 files changed, 188 insertions(+), 1 deletion(-) create mode 100644 docs/source/video_dataset.mdx diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 295974c3d20..2e3728ef83a 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -74,6 +74,8 @@ title: Object detection - local: video_load title: Load video data + - local: video_dataset + title: Create a video dataset title: "Vision" - sections: - local: nlp_load diff --git a/docs/source/how_to.md b/docs/source/how_to.md index 7e6cf8f719e..223a7c2c4c0 100644 --- a/docs/source/how_to.md +++ b/docs/source/how_to.md @@ -14,7 +14,7 @@ The guides are organized into six sections: - General usage: Functions for general dataset loading and processing. The functions shown in this section are applicable across all dataset modalities. - Audio: How to load, process, and share audio datasets. -- Vision: How to load, process, and share image datasets. +- Vision: How to load, process, and share image and video datasets. - Text: How to load, process, and share text datasets. - Tabular: How to load, process, and share tabular datasets. - Dataset repository: How to share and upload a dataset to the Hub. diff --git a/docs/source/video_dataset.mdx b/docs/source/video_dataset.mdx new file mode 100644 index 00000000000..79cefbd294b --- /dev/null +++ b/docs/source/video_dataset.mdx @@ -0,0 +1,172 @@ +# Create a video dataset + +This guide will show you how to create a video dataset with `VideoFolder` and some metadata. 
This is a no-code solution for quickly creating a video dataset with several thousand videos. + + + +You can control access to your dataset by requiring users to share their contact information first. Check out the [Gated datasets](https://huggingface.co/docs/hub/datasets-gated) guide for more information about how to enable this feature on the Hub. + + + +## VideoFolder + +The `VideoFolder` is a dataset builder designed to quickly load a video dataset with several thousand videos without requiring you to write any code. + + + +💡 Take a look at the [Split pattern hierarchy](repository_structure#split-pattern-hierarchy) to learn more about how `VideoFolder` creates dataset splits based on your dataset repository structure. + + + +`VideoFolder` automatically infers the class labels of your dataset based on the directory name. Store your dataset in a directory structure like: + +``` +folder/train/dog/golden_retriever.mp4 +folder/train/dog/german_shepherd.mp4 +folder/train/dog/chihuahua.mp4 + +folder/train/cat/maine_coon.mp4 +folder/train/cat/bengal.mp4 +folder/train/cat/birman.mp4 +``` + +Then users can load your dataset by specifying `videofolder` in [`load_dataset`] and the directory in `data_dir`: + +```py +>>> from datasets import load_dataset + +>>> dataset = load_dataset("videofolder", data_dir="/path/to/folder") +``` + +You can also use `videofolder` to load datasets involving multiple splits. To do so, your dataset directory should have the following structure: + +``` +folder/train/dog/golden_retriever.mp4 +folder/train/cat/maine_coon.mp4 +folder/test/dog/german_shepherd.mp4 +folder/test/cat/bengal.mp4 +``` + + + +If all video files are contained in a single directory or if they are not on the same level of directory structure, `label` column won't be added automatically. If you need it, set `drop_labels=False` explicitly. 
+ + + + +If there is additional information you'd like to include about your dataset, like text captions or bounding boxes, add it as a `metadata.csv` file in your folder. This lets you quickly create datasets for different computer vision tasks like text captioning or object detection. You can also use a JSONL file `metadata.jsonl`. + +``` +folder/train/metadata.csv +folder/train/0001.mp4 +folder/train/0002.mp4 +folder/train/0003.mp4 +``` + +You can also zip your videos: + +``` +folder/metadata.csv +folder/train.zip +folder/test.zip +folder/valid.zip +``` + +Your `metadata.csv` file must have a `file_name` column which links video files with their metadata: + +```csv +file_name,additional_feature +0001.mp4,This is a first value of a text feature you added to your videos +0002.mp4,This is a second value of a text feature you added to your videos +0003.mp4,This is a third value of a text feature you added to your videos +``` + +or using `metadata.jsonl`: + +```jsonl +{"file_name": "0001.mp4", "additional_feature": "This is a first value of a text feature you added to your videos"} +{"file_name": "0002.mp4", "additional_feature": "This is a second value of a text feature you added to your videos"} +{"file_name": "0003.mp4", "additional_feature": "This is a third value of a text feature you added to your videos"} +``` + + + +If metadata files are present, the inferred labels based on the directory name are dropped by default. To include those labels, set `drop_labels=False` in `load_dataset`. + + + +### Video captioning + +Video captioning datasets have text describing a video. 
An example `metadata.csv` may look like:
+
+```csv
+file_name,text
+0001.mp4,This is a golden retriever playing with a ball
+0002.mp4,A german shepherd
+0003.mp4,One chihuahua
+```
+
+Load the dataset with `VideoFolder`, and it will create a `text` column for the video captions:
+
+```py
+>>> dataset = load_dataset("videofolder", data_dir="/path/to/folder", split="train")
+>>> dataset[0]["text"]
+"This is a golden retriever playing with a ball"
+```
+
+### Upload dataset to the Hub
+
+Once you've created a dataset, you can share it to the Hub using `huggingface_hub` for example. Make sure you have the [huggingface_hub](https://huggingface.co/docs/huggingface_hub/index) library installed and you're logged in to your Hugging Face account (see the [Upload with Python tutorial](upload_dataset#upload-with-python) for more details).
+
+Upload your dataset with `huggingface_hub.HfApi.upload_folder`:
+
+```py
+from huggingface_hub import HfApi
+api = HfApi()
+
+api.upload_folder(
+    folder_path="/path/to/local/dataset",
+    repo_id="username/my-cool-dataset",
+    repo_type="dataset",
+)
+```
+
+## WebDataset
+
+The [WebDataset](/~https://github.com/webdataset/webdataset) format is based on TAR archives and is suitable for big video datasets.
+Indeed you can group your videos in TAR archives (e.g. 1GB of videos per TAR archive) and have thousands of TAR archives:
+
+```
+folder/train/00000.tar
+folder/train/00001.tar
+folder/train/00002.tar
+...
+```
+
+In the archives, each example is made of files sharing the same prefix:
+
+```
+e39871fd9fd74f55.mp4
+e39871fd9fd74f55.json
+f18b91585c4d3f3e.mp4
+f18b91585c4d3f3e.json
+ede6e66b2fb59aab.mp4
+ede6e66b2fb59aab.json
+ed600d57fcee4f94.mp4
+ed600d57fcee4f94.json
+...
+```
+
+You can put your videos' labels/captions/features using JSON or text files for example.
+
+For more details on the WebDataset format and the python library, please check the [WebDataset documentation](https://webdataset.github.io/webdataset). 
+
+Load your WebDataset and it will create one column per file suffix (here "mp4" and "json"):
+
+```python
+>>> from datasets import load_dataset
+
+>>> dataset = load_dataset("webdataset", data_dir="/path/to/folder", split="train")
+>>> dataset[0]["json"]
+{"bbox": [[302.0, 109.0, 73.0, 52.0]], "categories": [0]}
+```
diff --git a/docs/source/video_load.mdx b/docs/source/video_load.mdx
index 3744995b6ed..782869d2eed 100644
--- a/docs/source/video_load.mdx
+++ b/docs/source/video_load.mdx
@@ -112,6 +112,12 @@ To ignore the information in the metadata file, set `drop_labels=False` in [`loa
 >>> dataset = load_dataset("videofolder", data_dir="/path/to/folder", drop_labels=False)
 ```
 
+<Tip>
+
+For more information about creating your own `VideoFolder` dataset, take a look at the [Create a video dataset](./video_dataset) guide.
+
+</Tip>
+
 ## WebDataset
 
 The [WebDataset](/~https://github.com/webdataset/webdataset) format is based on a folder of TAR archives and is suitable for big video datasets.
diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index d284092eb1d..57f3024e53b 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -5400,6 +5400,13 @@ def push_to_hub(
         >>> french_dataset = load_dataset("<organization>/<dataset_id>", "fr")
         ```
         """
+        if "Video(" in str(self.features):
+            raise NotImplementedError(
+                "push_to_hub is not implemented for video datasets, instead you should upload the video files "
+                "using e.g. the huggingface_hub library and optionally upload a metadata.csv or metadata.jsonl "
+                "file containing other information like video captions, features or labels. More information "
+                "at https://huggingface.co/docs/datasets/main/en/video_load#videofolder"
+            )
         if config_name == "data":
             raise ValueError("`config_name` cannot be 'data'. Please, choose another name for configuration.")