diff --git a/datasets/web_questions/dataset_infos.json b/datasets/web_questions/dataset_infos.json new file mode 100644 index 000000000000..94246e4d2670 --- /dev/null +++ b/datasets/web_questions/dataset_infos.json @@ -0,0 +1 @@ +{"default": {"description": "This dataset consists of 6,642 question/answer pairs.\nThe questions are supposed to be answerable by Freebase, a large knowledge graph.\nThe questions are mostly centered around a single named entity.\nThe questions are popular ones asked on the web (at least in 2013).\n", "citation": "\n@inproceedings{berant-etal-2013-semantic,\n title = \"Semantic Parsing on {F}reebase from Question-Answer Pairs\",\n author = \"Berant, Jonathan and\n Chou, Andrew and\n Frostig, Roy and\n Liang, Percy\",\n booktitle = \"Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing\",\n month = oct,\n year = \"2013\",\n address = \"Seattle, Washington, USA\",\n publisher = \"Association for Computational Linguistics\",\n url = \"https://www.aclweb.org/anthology/D13-1160\",\n pages = \"1533--1544\",\n}\n", "homepage": "https://worksheets.codalab.org/worksheets/0xba659fe363cb46e7a505c5b6a774dc8a", "license": "", "features": {"url": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answers": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "supervised_keys": null, "builder_name": "web_questions", "config_name": "default", "version": {"version_str": "1.0.0", "description": null, "nlp_version_to_prepare": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 533736, "num_examples": 3778, "dataset_name": "web_questions"}, "test": {"name": "test", "num_bytes": 289824, "num_examples": 2032, "dataset_name": "web_questions"}}, "download_checksums": {"https://worksheets.codalab.org/rest/bundles/0x4a763f8cde224c2da592b75f29e2f5c2/contents/blob/": 
{"num_bytes": 825320, "checksum": "fb1797e4554a1b1be642388367de1379f8c0d5afc609ac171492c67f7b70cb1e"}, "https://worksheets.codalab.org/rest/bundles/0xe7bac352fce7448c9ef238fb0a297ec2/contents/blob/": {"num_bytes": 447645, "checksum": "e3d4550e90660aaabe18458ba34b59f2624857273f375af7353273ce8b84ce6e"}}, "download_size": 1272965, "dataset_size": 823560, "size_in_bytes": 2096525}} \ No newline at end of file diff --git a/datasets/web_questions/dummy/1.0.0/dummy_data.zip b/datasets/web_questions/dummy/1.0.0/dummy_data.zip new file mode 100644 index 000000000000..fdb8cd0f781c Binary files /dev/null and b/datasets/web_questions/dummy/1.0.0/dummy_data.zip differ diff --git a/datasets/web_questions/web_questions.py b/datasets/web_questions/web_questions.py new file mode 100644 index 000000000000..fc29d71ab7ff --- /dev/null +++ b/datasets/web_questions/web_questions.py @@ -0,0 +1,99 @@ +# coding=utf-8 +# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace NLP Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# Lint as: python3
"""WebQuestions Benchmark for Question Answering."""

from __future__ import absolute_import, division, print_function

import json
import re

import nlp


_CITATION = """
@inproceedings{berant-etal-2013-semantic,
    title = "Semantic Parsing on {F}reebase from Question-Answer Pairs",
    author = "Berant, Jonathan and
      Chou, Andrew and
      Frostig, Roy and
      Liang, Percy",
    booktitle = "Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing",
    month = oct,
    year = "2013",
    address = "Seattle, Washington, USA",
    publisher = "Association for Computational Linguistics",
    url = "https://www.aclweb.org/anthology/D13-1160",
    pages = "1533--1544",
}
"""
_SPLIT_DOWNLOAD_URL = {
    "train": "https://worksheets.codalab.org/rest/bundles/0x4a763f8cde224c2da592b75f29e2f5c2/contents/blob/",
    "test": "https://worksheets.codalab.org/rest/bundles/0xe7bac352fce7448c9ef238fb0a297ec2/contents/blob/",
}

_DESCRIPTION = """\
This dataset consists of 6,642 question/answer pairs.
The questions are supposed to be answerable by Freebase, a large knowledge graph.
The questions are mostly centered around a single named entity.
The questions are popular ones asked on the web (at least in 2013).
"""

# Strips the outer ``(list `` prefix and the final ``)`` of a target value.
_LIST_WRAPPER_RE = re.compile(r"^\(list |\)$")

# Captures the payload of each ``(description ...)`` item, either quoted
# (group 1) or bare (group 2). The trailing ``\w*`` is kept from the original
# pattern; it does not affect which values are captured.
_DESCRIPTION_ITEM_RE = re.compile(r'\(description (?:"([^"]+?)"|([^)]+?))\)\w*')


def _target_to_answers(target):
    """Extract the answer strings from a raw ``targetValue`` expression.

    The expression is expected to look like
    ``(list (description "First answer") (description Second))``.

    Args:
        target: The raw ``targetValue`` string of one example.

    Returns:
        A list with one string per ``(description ...)`` item, in order of
        appearance. Quoted items are returned without their quotes. The list
        is empty when no item matches.
    """
    stripped = _LIST_WRAPPER_RE.sub("", target)
    # Exactly one of the two groups is non-empty for each match, so joining
    # them yields the captured answer regardless of which alternative hit.
    return ["".join(groups) for groups in _DESCRIPTION_ITEM_RE.findall(stripped)]


class WebQuestions(nlp.GeneratorBasedBuilder):
    """WebQuestions Benchmark for Question Answering."""

    VERSION = nlp.Version("1.0.0")

    def _info(self):
        """Return the dataset metadata: description, features, homepage and citation."""
        return nlp.DatasetInfo(
            description=_DESCRIPTION,
            features=nlp.Features(
                {
                    "url": nlp.Value("string"),
                    "question": nlp.Value("string"),
                    "answers": nlp.features.Sequence(nlp.Value("string")),
                }
            ),
            supervised_keys=None,
            homepage="https://worksheets.codalab.org/worksheets/0xba659fe363cb46e7a505c5b6a774dc8a",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download every split file and return one SplitGenerator per split.

        Args:
            dl_manager: Download manager; its ``download`` method is called
                with the split-name -> URL mapping and the result is iterated
                as split-name -> local-path pairs.

        Returns:
            A list of ``nlp.SplitGenerator`` objects, one per entry of
            ``_SPLIT_DOWNLOAD_URL``, each passing ``file_path`` on to
            ``_generate_examples``.
        """
        file_paths = dl_manager.download(_SPLIT_DOWNLOAD_URL)

        return [
            nlp.SplitGenerator(name=split, gen_kwargs={"file_path": file_path})
            for split, file_path in file_paths.items()
        ]

    def _generate_examples(self, file_path):
        """Parse one split file and yield its examples.

        Args:
            file_path: Path to a JSON file holding a list of objects with the
                keys ``url``, ``utterance`` and ``targetValue``.

        Yields:
            ``(index, example)`` pairs, where ``index`` is the position of the
            example in the file and ``example`` has the keys ``url``,
            ``question`` and ``answers``.
        """
        # Decode explicitly as UTF-8 so parsing does not depend on the
        # platform's default locale encoding.
        with open(file_path, encoding="utf-8") as f:
            examples = json.load(f)

        for i, ex in enumerate(examples):
            yield i, {
                "url": ex["url"],
                "question": ex["utterance"],
                "answers": _target_to_answers(ex["targetValue"]),
            }