huggingface · lhoestq · Oct 21, 2020 · Oct 21, 2020
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -86,7 +86,7 @@
 
 	*For the dummy data*:
 	```bash
-	RUN_SLOW=1 pytest tests/test_dataset_common.py::LocalDatasetTest::test_load_dataset_all_configs_<your-dataset-name>
+	RUN_SLOW=0 pytest tests/test_dataset_common.py::LocalDatasetTest::test_load_dataset_all_configs_<your-dataset-name>
 	```
 
 6. If all tests pass, your dataset works correctly. Awesome! You can now follow steps 6, 7 and 8 of the section [*How to contribute to 🤗Datasets?*](#how-to-contribute-to-🤗Datasets). If you experience problems with the dummy data tests, you might want to take a look at the section *Help for dummy data tests* below.

diff --git a/datasets/compguesswhat/compguesswhat.py b/datasets/compguesswhat/compguesswhat.py
@@ -19,7 +19,7 @@ def __init__(self, data_url, splits, gameplay_scenario, **kwargs):
             **kwargs: keyword arguments forwarded to super.
         """
         super(CompguesswhatConfig, self).__init__(
-            version=datasets.Version("0.1.0", "First CompGuessWhat?! release"), **kwargs
+            version=datasets.Version("0.2.0", "Second CompGuessWhat?! release"), **kwargs
         )
         assert gameplay_scenario in (
             "original",
@@ -73,7 +73,7 @@ class Compguesswhat(datasets.GeneratorBasedBuilder):
         ),
     ]
 
-    VERSION = datasets.Version("0.1.0")
+    VERSION = datasets.Version("0.2.0")
 
     def _info(self):
         if self.config.gameplay_scenario == "original":
@@ -88,14 +88,27 @@ def _info(self):
                         "timestamp": datasets.Value("string"),
                         "status": datasets.Value("string"),
                         "image": {
+                            # this is the image ID in GuessWhat?! which corresponds to the MSCOCO id
                             "id": datasets.Value("int32"),
                             "file_name": datasets.Value("string"),
                             "flickr_url": datasets.Value("string"),
                             "coco_url": datasets.Value("string"),
                             "height": datasets.Value("int32"),
                             "width": datasets.Value("int32"),
-                            "vg_id": datasets.Value("int32"),
-                            "vg_url": datasets.Value("string"),
+                            # this field represents the corresponding image metadata that can be found in VisualGenome
+                            # in the file image_data.json
+                            # We copy it over to avoid any confusion or possibly wrong URLs
+                            # Please use the original image files to resolve photos
+                            "visual_genome": {
+                                "width": datasets.Value("int32"),
+                                "height": datasets.Value("int32"),
+                                "url": datasets.Value("string"),
+                                "coco_id": datasets.Value("int32"),
+                                # this is the actual VisualGenome image ID
+                                # because we can't reliably store it as an integer we save it as a string
+                                "flickr_id": datasets.Value("string"),
+                                "image_id": datasets.Value("string"),
+                            },
                         },
                         "qas": datasets.features.Sequence(
                             {
@@ -203,7 +216,12 @@ def _split_generators(self, dl_manager):
                 datasets.SplitGenerator(
                     name=split_name,
                     gen_kwargs={
-                        "filepath": os.path.join(dl_dir, full_split_name, self.VERSION.version_str, split_filename)
+                        "filepath": os.path.join(
+                            dl_dir,
+                            full_split_name,
+                            self.VERSION.version_str,
+                            split_filename,
+                        )
                     },
                 )
             )
@@ -224,6 +242,11 @@ def _extract_game_tuple(data):
                 del game["questioner_id"]
             ###
 
+            if "visual_genome" in game["image"]:
+                # We need to cast it to string so that we avoid issues with int size
+                game["image"]["visual_genome"]["image_id"] = str(game["image"]["visual_genome"]["image_id"])
+                game["image"]["visual_genome"]["flickr_id"] = str(game["image"]["visual_genome"]["flickr_id"])
+
             return game["id"], game
 
         """Yields examples."""

diff --git a/datasets/compguesswhat/create_dummy_data.py b/datasets/compguesswhat/create_dummy_data.py
@@ -12,7 +12,12 @@
     required=True,
     help="Data path containing the CompGuessWhat?! datasets (files with 'jsonl.gz' extension)",
 )
-parser.add_argument("--examples", type=int, default=5, help="Number of games to consider in the dummy dataset")
+parser.add_argument(
+    "--examples",
+    type=int,
+    default=5,
+    help="Number of games to consider in the dummy dataset",
+)
 original_data_files = {
     "train": "compguesswhat.train.jsonl.gz",
     "valid": "compguesswhat.valid.jsonl.gz",
@@ -74,7 +79,7 @@ def main(args):
     with open(dataset_info_path, encoding="utf-8") as in_file:
         dataset_info = json.load(in_file)
 
-    dataset_version = dataset_info["default"]["version"]["version_str"]
+    dataset_version = dataset_info["compguesswhat-original"]["version"]["version_str"]
 
     print(f"Creating dummy data for CompGuessWhat?! {dataset_version}")