Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New version of CompGuessWhat?! with refined annotations #748

Merged
merged 1 commit into from
Oct 21, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@

*For the dummy data*:
```bash
RUN_SLOW=1 pytest tests/test_dataset_common.py::LocalDatasetTest::test_load_dataset_all_configs_<your-dataset-name>
RUN_SLOW=0 pytest tests/test_dataset_common.py::LocalDatasetTest::test_load_dataset_all_configs_<your-dataset-name>
```

6. If all tests pass, your dataset works correctly. Awesome! You can now follow steps 6, 7 and 8 of the section [*How to contribute to 🤗Datasets?*](#how-to-contribute-to-🤗Datasets). If you experience problems with the dummy data tests, you might want to take a look at the section *Help for dummy data tests* below.
Expand Down
33 changes: 28 additions & 5 deletions datasets/compguesswhat/compguesswhat.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def __init__(self, data_url, splits, gameplay_scenario, **kwargs):
**kwargs: keyword arguments forwarded to super.
"""
super(CompguesswhatConfig, self).__init__(
version=datasets.Version("0.1.0", "First CompGuessWhat?! release"), **kwargs
version=datasets.Version("0.2.0", "Second CompGuessWhat?! release"), **kwargs
)
assert gameplay_scenario in (
"original",
Expand Down Expand Up @@ -73,7 +73,7 @@ class Compguesswhat(datasets.GeneratorBasedBuilder):
),
]

VERSION = datasets.Version("0.1.0")
VERSION = datasets.Version("0.2.0")

def _info(self):
if self.config.gameplay_scenario == "original":
Expand All @@ -88,14 +88,27 @@ def _info(self):
"timestamp": datasets.Value("string"),
"status": datasets.Value("string"),
"image": {
# this is the image ID in GuessWhat?! which corresponds to the MSCOCO id
"id": datasets.Value("int32"),
"file_name": datasets.Value("string"),
"flickr_url": datasets.Value("string"),
"coco_url": datasets.Value("string"),
"height": datasets.Value("int32"),
"width": datasets.Value("int32"),
"vg_id": datasets.Value("int32"),
"vg_url": datasets.Value("string"),
# this field represents the corresponding image metadata that can be found in VisualGenome
# in the file image_data.json
# We copy it over to avoid any confusion or a possibly wrong URL
# Please use the original image files to resolve photos
"visual_genome": {
"width": datasets.Value("int32"),
"height": datasets.Value("int32"),
"url": datasets.Value("string"),
"coco_id": datasets.Value("int32"),
# this is the actual VisualGenome image ID
# because we can't reliably store it as an integer, we save it as a string
"flickr_id": datasets.Value("string"),
"image_id": datasets.Value("string"),
},
},
"qas": datasets.features.Sequence(
{
Expand Down Expand Up @@ -203,7 +216,12 @@ def _split_generators(self, dl_manager):
datasets.SplitGenerator(
name=split_name,
gen_kwargs={
"filepath": os.path.join(dl_dir, full_split_name, self.VERSION.version_str, split_filename)
"filepath": os.path.join(
dl_dir,
full_split_name,
self.VERSION.version_str,
split_filename,
)
},
)
)
Expand All @@ -224,6 +242,11 @@ def _extract_game_tuple(data):
del game["questioner_id"]
###

if "visual_genome" in game["image"]:
# We need to cast it to string so that we avoid issues with int size
game["image"]["visual_genome"]["image_id"] = str(game["image"]["visual_genome"]["image_id"])
game["image"]["visual_genome"]["flickr_id"] = str(game["image"]["visual_genome"]["flickr_id"])

return game["id"], game

"""Yields examples."""
Expand Down
9 changes: 7 additions & 2 deletions datasets/compguesswhat/create_dummy_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,12 @@
required=True,
help="Data path containing the CompGuessWhat?! datasets (files with 'jsonl.gz' extension)",
)
parser.add_argument("--examples", type=int, default=5, help="Number of games to consider in the dummy dataset")
parser.add_argument(
"--examples",
type=int,
default=5,
help="Number of games to consider in the dummy dataset",
)
original_data_files = {
"train": "compguesswhat.train.jsonl.gz",
"valid": "compguesswhat.valid.jsonl.gz",
Expand Down Expand Up @@ -74,7 +79,7 @@ def main(args):
with open(dataset_info_path, encoding="utf-8") as in_file:
dataset_info = json.load(in_file)

dataset_version = dataset_info["default"]["version"]["version_str"]
dataset_version = dataset_info["compguesswhat-original"]["version"]["version_str"]

print(f"Creating dummy data for CompGuessWhat?! {dataset_version}")

Expand Down
Loading