add web_questions (huggingface#401)

* add web_questions * fix web questions dummy data Co-authored-by: Mariama Drame <mariama@debmower_ajd> Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
vegarab · Aug 18, 2020 · ea49484 · ea49484
1 parent 844805c
commit ea49484
Show file tree

Hide file tree

Showing 3 changed files with 100 additions and 0 deletions.
diff --git a/datasets/web_questions/dataset_infos.json b/datasets/web_questions/dataset_infos.json
@@ -0,0 +1 @@
+{"default": {"description": "This dataset consists of 6,642 question/answer pairs.\nThe questions are supposed to be answerable by Freebase, a large knowledge graph.\nThe questions are mostly centered around a single named entity.\nThe questions are popular ones asked on the web (at least in 2013).\n", "citation": "\n@inproceedings{berant-etal-2013-semantic,\n    title = \"Semantic Parsing on {F}reebase from Question-Answer Pairs\",\n    author = \"Berant, Jonathan  and\n      Chou, Andrew  and\n      Frostig, Roy  and\n      Liang, Percy\",\n    booktitle = \"Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing\",\n    month = oct,\n    year = \"2013\",\n    address = \"Seattle, Washington, USA\",\n    publisher = \"Association for Computational Linguistics\",\n    url = \"https://www.aclweb.org/anthology/D13-1160\",\n    pages = \"1533--1544\",\n}\n", "homepage": "https://worksheets.codalab.org/worksheets/0xba659fe363cb46e7a505c5b6a774dc8a", "license": "", "features": {"url": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answers": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "supervised_keys": null, "builder_name": "web_questions", "config_name": "default", "version": {"version_str": "1.0.0", "description": null, "nlp_version_to_prepare": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 533736, "num_examples": 3778, "dataset_name": "web_questions"}, "test": {"name": "test", "num_bytes": 289824, "num_examples": 2032, "dataset_name": "web_questions"}}, "download_checksums": {"https://worksheets.codalab.org/rest/bundles/0x4a763f8cde224c2da592b75f29e2f5c2/contents/blob/": {"num_bytes": 825320, "checksum": "fb1797e4554a1b1be642388367de1379f8c0d5afc609ac171492c67f7b70cb1e"}, 
"https://worksheets.codalab.org/rest/bundles/0xe7bac352fce7448c9ef238fb0a297ec2/contents/blob/": {"num_bytes": 447645, "checksum": "e3d4550e90660aaabe18458ba34b59f2624857273f375af7353273ce8b84ce6e"}}, "download_size": 1272965, "dataset_size": 823560, "size_in_bytes": 2096525}}
diff --git a/datasets/web_questions/dummy/1.0.0/dummy_data.zip b/datasets/web_questions/dummy/1.0.0/dummy_data.zip
diff --git a/datasets/web_questions/web_questions.py b/datasets/web_questions/web_questions.py
@@ -0,0 +1,99 @@
+# coding=utf-8
+# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace NLP Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""WebQuestions Benchmark for Question Answering."""
+
+from __future__ import absolute_import, division, print_function
+
+import json
+import re
+
+import nlp
+
+
+# BibTeX entry for the WebQuestions paper; passed as ``citation`` to
+# ``nlp.DatasetInfo`` in ``WebQuestions._info``.
+_CITATION = """
+@inproceedings{berant-etal-2013-semantic,
+    title = "Semantic Parsing on {F}reebase from Question-Answer Pairs",
+    author = "Berant, Jonathan  and
+      Chou, Andrew  and
+      Frostig, Roy  and
+      Liang, Percy",
+    booktitle = "Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing",
+    month = oct,
+    year = "2013",
+    address = "Seattle, Washington, USA",
+    publisher = "Association for Computational Linguistics",
+    url = "https://www.aclweb.org/anthology/D13-1160",
+    pages = "1533--1544",
+}
+"""
+# Download URL of each split file. The keys double as the split names:
+# ``WebQuestions._split_generators`` creates one split per entry.
+_SPLIT_DOWNLOAD_URL = {
+    "train": "https://worksheets.codalab.org/rest/bundles/0x4a763f8cde224c2da592b75f29e2f5c2/contents/blob/",
+    "test": "https://worksheets.codalab.org/rest/bundles/0xe7bac352fce7448c9ef238fb0a297ec2/contents/blob/",
+}
+
+# Human-readable dataset description; passed as ``description`` to
+# ``nlp.DatasetInfo`` in ``WebQuestions._info``.
+# NOTE(review): the text says 6,642 pairs, but the splits recorded in
+# dataset_infos.json add up to 3,778 + 2,032 = 5,810 examples -- confirm
+# which figure is correct before changing the string.
+_DESCRIPTION = """\
+This dataset consists of 6,642 question/answer pairs.
+The questions are supposed to be answerable by Freebase, a large knowledge graph.
+The questions are mostly centered around a single named entity.
+The questions are popular ones asked on the web (at least in 2013).
+"""
+
+
+class WebQuestions(nlp.GeneratorBasedBuilder):
+    """WebQuestions Benchmark for Question Answering.
+
+    Builds one split per entry of ``_SPLIT_DOWNLOAD_URL``. Every example has
+    three fields: ``url`` (string), ``question`` (string) and ``answers``
+    (sequence of strings).
+    """
+
+    VERSION = nlp.Version("1.0.0")
+
+    def _info(self):
+        """Returns the dataset metadata (description, features, homepage, citation)."""
+        return nlp.DatasetInfo(
+            description=_DESCRIPTION,
+            features=nlp.Features(
+                {
+                    "url": nlp.Value("string"),
+                    "question": nlp.Value("string"),
+                    "answers": nlp.features.Sequence(nlp.Value("string")),
+                }
+            ),
+            supervised_keys=None,
+            homepage="https://worksheets.codalab.org/worksheets/0xba659fe363cb46e7a505c5b6a774dc8a",
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager):
+        """Returns SplitGenerators, one per entry of ``_SPLIT_DOWNLOAD_URL``.
+
+        Args:
+            dl_manager: object whose ``download`` method is called with the
+                split-name -> URL mapping. The result is iterated with
+                ``.items()``, so it is expected to be a mapping from split
+                name to the downloaded file's path.
+        """
+        file_paths = dl_manager.download(_SPLIT_DOWNLOAD_URL)
+
+        return [
+            nlp.SplitGenerator(name=split, gen_kwargs={"file_path": file_path})
+            for split, file_path in file_paths.items()
+        ]
+
+    def _generate_examples(self, file_path):
+        """Parses split file and yields examples.
+
+        Args:
+            file_path: path of a JSON file containing a list of records, each
+                with the keys ``url``, ``utterance`` and ``targetValue``.
+
+        Yields:
+            ``(index, example)`` pairs where ``index`` is the record's position
+            in the file and ``example`` is a dict with the keys ``url``,
+            ``question`` and ``answers``.
+        """
+
+        def _target_to_answers(target):
+            # Strip the enclosing "(list " prefix and the final ")" so that only
+            # the "(description ...)" items remain.
+            target = re.sub(r"^\(list |\)$", "", target)
+            # Each item is either quoted ("...") or bare, so every match is a
+            # 2-tuple with exactly one non-empty group; joining the tuple
+            # selects whichever alternative matched.
+            return ["".join(ans) for ans in re.findall(r'\(description (?:"([^"]+?)"|([^)]+?))\)\w*', target)]
+
+        # JSON text is UTF-8. Without an explicit encoding, open() falls back to
+        # the platform locale encoding, which can fail on or corrupt non-ASCII
+        # questions and answers.
+        with open(file_path, encoding="utf-8") as f:
+            examples = json.load(f)
+
+        # The file is fully loaded above, so it is closed before yielding and no
+        # handle stays open while the generator is suspended.
+        for i, ex in enumerate(examples):
+            yield i, {
+                "url": ex["url"],
+                "question": ex["utterance"],
+                "answers": _target_to_answers(ex["targetValue"]),
+            }
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"default": {"description": "This dataset consists of 6,642 question/answer pairs.\nThe questions are supposed to be answerable by Freebase, a large knowledge graph.\nThe questions are mostly centered around a single named entity.\nThe questions are popular ones asked on the web (at least in 2013).\n", "citation": "\n@inproceedings{berant-etal-2013-semantic,\n title = \"Semantic Parsing on {F}reebase from Question-Answer Pairs\",\n author = \"Berant, Jonathan and\n Chou, Andrew and\n Frostig, Roy and\n Liang, Percy\",\n booktitle = \"Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing\",\n month = oct,\n year = \"2013\",\n address = \"Seattle, Washington, USA\",\n publisher = \"Association for Computational Linguistics\",\n url = \"https://www.aclweb.org/anthology/D13-1160\",\n pages = \"1533--1544\",\n}\n", "homepage": "https://worksheets.codalab.org/worksheets/0xba659fe363cb46e7a505c5b6a774dc8a", "license": "", "features": {"url": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answers": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "supervised_keys": null, "builder_name": "web_questions", "config_name": "default", "version": {"version_str": "1.0.0", "description": null, "nlp_version_to_prepare": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 533736, "num_examples": 3778, "dataset_name": "web_questions"}, "test": {"name": "test", "num_bytes": 289824, "num_examples": 2032, "dataset_name": "web_questions"}}, "download_checksums": {"https://worksheets.codalab.org/rest/bundles/0x4a763f8cde224c2da592b75f29e2f5c2/contents/blob/": {"num_bytes": 825320, "checksum": "fb1797e4554a1b1be642388367de1379f8c0d5afc609ac171492c67f7b70cb1e"}, "https://worksheets.codalab.org/rest/bundles/0xe7bac352fce7448c9ef238fb0a297ec2/contents/blob/": {"num_bytes": 447645, 
"checksum": "e3d4550e90660aaabe18458ba34b59f2624857273f375af7353273ce8b84ce6e"}}, "download_size": 1272965, "dataset_size": 823560, "size_in_bytes": 2096525}}