From f8e332104dd4827da559a39d06d7849b9ed3f77d Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Wed, 30 Oct 2024 14:36:02 +0100 Subject: [PATCH] Disallow video push_to_hub (#7265) * disallow video push_to_hub * docs * minor --- docs/source/_toctree.yml | 2 + docs/source/how_to.md | 2 +- docs/source/video_dataset.mdx | 172 ++++++++++++++++++++++++++++++++++ docs/source/video_load.mdx | 6 ++ src/datasets/arrow_dataset.py | 7 ++ 5 files changed, 188 insertions(+), 1 deletion(-) create mode 100644 docs/source/video_dataset.mdx diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 295974c3d20..2e3728ef83a 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -74,6 +74,8 @@ title: Object detection - local: video_load title: Load video data + - local: video_dataset + title: Create a video dataset title: "Vision" - sections: - local: nlp_load diff --git a/docs/source/how_to.md b/docs/source/how_to.md index 7e6cf8f719e..223a7c2c4c0 100644 --- a/docs/source/how_to.md +++ b/docs/source/how_to.md @@ -14,7 +14,7 @@ The guides are organized into six sections: - General usage: Functions for general dataset loading and processing. The functions shown in this section are applicable across all dataset modalities. - Audio: How to load, process, and share audio datasets. -- Vision: How to load, process, and share image datasets. +- Vision: How to load, process, and share image and video datasets. - Text: How to load, process, and share text datasets. - Tabular: How to load, process, and share tabular datasets. - Dataset repository: How to share and upload a dataset to the Hub. diff --git a/docs/source/video_dataset.mdx b/docs/source/video_dataset.mdx new file mode 100644 index 00000000000..79cefbd294b --- /dev/null +++ b/docs/source/video_dataset.mdx @@ -0,0 +1,172 @@ +# Create a video dataset + +This guide will show you how to create a video dataset with `VideoFolder` and some metadata. 
This is a no-code solution for quickly creating a video dataset with several thousand videos. + + + +You can control access to your dataset by requiring users to share their contact information first. Check out the [Gated datasets](https://huggingface.co/docs/hub/datasets-gated) guide for more information about how to enable this feature on the Hub. + + + +## VideoFolder + +The `VideoFolder` is a dataset builder designed to quickly load a video dataset with several thousand videos without requiring you to write any code. + + + +💡 Take a look at the [Split pattern hierarchy](repository_structure#split-pattern-hierarchy) to learn more about how `VideoFolder` creates dataset splits based on your dataset repository structure. + + + +`VideoFolder` automatically infers the class labels of your dataset based on the directory name. Store your dataset in a directory structure like: + +``` +folder/train/dog/golden_retriever.mp4 +folder/train/dog/german_shepherd.mp4 +folder/train/dog/chihuahua.mp4 + +folder/train/cat/maine_coon.mp4 +folder/train/cat/bengal.mp4 +folder/train/cat/birman.mp4 +``` + +Then users can load your dataset by specifying `videofolder` in [`load_dataset`] and the directory in `data_dir`: + +```py +>>> from datasets import load_dataset + +>>> dataset = load_dataset("videofolder", data_dir="/path/to/folder") +``` + +You can also use `videofolder` to load datasets involving multiple splits. To do so, your dataset directory should have the following structure: + +``` +folder/train/dog/golden_retriever.mp4 +folder/train/cat/maine_coon.mp4 +folder/test/dog/german_shepherd.mp4 +folder/test/cat/bengal.mp4 +``` + + + +If all video files are contained in a single directory or if they are not on the same level of directory structure, `label` column won't be added automatically. If you need it, set `drop_labels=False` explicitly. 
+ + + + +If there is additional information you'd like to include about your dataset, like text captions or bounding boxes, add it as a `metadata.csv` file in your folder. This lets you quickly create datasets for different computer vision tasks like text captioning or object detection. You can also use a JSONL file `metadata.jsonl`. + +``` +folder/train/metadata.csv +folder/train/0001.mp4 +folder/train/0002.mp4 +folder/train/0003.mp4 +``` + +You can also zip your videos: + +``` +folder/metadata.csv +folder/train.zip +folder/test.zip +folder/valid.zip +``` + +Your `metadata.csv` file must have a `file_name` column which links video files with their metadata: + +```csv +file_name,additional_feature +0001.mp4,This is a first value of a text feature you added to your videos +0002.mp4,This is a second value of a text feature you added to your videos +0003.mp4,This is a third value of a text feature you added to your videos +``` + +or using `metadata.jsonl`: + +```jsonl +{"file_name": "0001.mp4", "additional_feature": "This is a first value of a text feature you added to your videos"} +{"file_name": "0002.mp4", "additional_feature": "This is a second value of a text feature you added to your videos"} +{"file_name": "0003.mp4", "additional_feature": "This is a third value of a text feature you added to your videos"} +``` + + + +If metadata files are present, the inferred labels based on the directory name are dropped by default. To include those labels, set `drop_labels=False` in `load_dataset`. + + + +### Video captioning + +Video captioning datasets have text describing a video. 
An example `metadata.csv` may look like:
+
+```csv
+file_name,text
+0001.mp4,This is a golden retriever playing with a ball
+0002.mp4,A german shepherd
+0003.mp4,One chihuahua
+```
+
+Load the dataset with `VideoFolder`, and it will create a `text` column for the video captions:
+
+```py
+>>> dataset = load_dataset("videofolder", data_dir="/path/to/folder", split="train")
+>>> dataset[0]["text"]
+"This is a golden retriever playing with a ball"
+```
+
+### Upload dataset to the Hub
+
+Once you've created a dataset, you can share it to the Hub using `huggingface_hub` for example. Make sure you have the [huggingface_hub](https://huggingface.co/docs/huggingface_hub/index) library installed and you're logged in to your Hugging Face account (see the [Upload with Python tutorial](upload_dataset#upload-with-python) for more details).
+
+Upload your dataset with `huggingface_hub.HfApi.upload_folder`:
+
+```py
+from huggingface_hub import HfApi
+api = HfApi()
+
+api.upload_folder(
+    folder_path="/path/to/local/dataset",
+    repo_id="username/my-cool-dataset",
+    repo_type="dataset",
+)
+```
+
+## WebDataset
+
+The [WebDataset](/~https://github.com/webdataset/webdataset) format is based on TAR archives and is suitable for big video datasets.
+Indeed you can group your videos in TAR archives (e.g. 1GB of videos per TAR archive) and have thousands of TAR archives:
+
+```
+folder/train/00000.tar
+folder/train/00001.tar
+folder/train/00002.tar
+...
+```
+
+In the archives, each example is made of files sharing the same prefix:
+
+```
+e39871fd9fd74f55.mp4
+e39871fd9fd74f55.json
+f18b91585c4d3f3e.mp4
+f18b91585c4d3f3e.json
+ede6e66b2fb59aab.mp4
+ede6e66b2fb59aab.json
+ed600d57fcee4f94.mp4
+ed600d57fcee4f94.json
+...
+```
+
+You can put your videos' labels/captions/features using JSON or text files for example.
+
+For more details on the WebDataset format and the python library, please check the [WebDataset documentation](https://webdataset.github.io/webdataset). 
+
+Load your WebDataset and it will create one column per file suffix (here "mp4" and "json"):
+
+```python
+>>> from datasets import load_dataset
+
+>>> dataset = load_dataset("webdataset", data_dir="/path/to/folder", split="train")
+>>> dataset[0]["json"]
+{"bbox": [[302.0, 109.0, 73.0, 52.0]], "categories": [0]}
+```
diff --git a/docs/source/video_load.mdx b/docs/source/video_load.mdx
index 3744995b6ed..782869d2eed 100644
--- a/docs/source/video_load.mdx
+++ b/docs/source/video_load.mdx
@@ -112,6 +112,12 @@ To ignore the information in the metadata file, set `drop_labels=False` in [`loa
 >>> dataset = load_dataset("videofolder", data_dir="/path/to/folder", drop_labels=False)
 ```
 
+<Tip>
+
+For more information about creating your own `VideoFolder` dataset, take a look at the [Create a video dataset](./video_dataset) guide.
+
+</Tip>
+
 ## WebDataset
 
 The [WebDataset](/~https://github.com/webdataset/webdataset) format is based on a folder of TAR archives and is suitable for big video datasets.
diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index d284092eb1d..57f3024e53b 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -5400,6 +5400,13 @@ def push_to_hub(
         >>> french_dataset = load_dataset("<organization>/<dataset_id>", "fr")
         ```
         """
+        if "Video(" in str(self.features):
+            raise NotImplementedError(
+                "push_to_hub is not implemented for video datasets, instead you should upload the video files "
+                "using e.g. the huggingface_hub library and optionally upload a metadata.csv or metadata.jsonl "
+                "file containing other information like video captions, features or labels. More information "
+                "at https://huggingface.co/docs/datasets/main/en/video_load#videofolder"
+            )
         if config_name == "data":
             raise ValueError("`config_name` cannot be 'data'. Please, choose another name for configuration.")