add web_questions (huggingface#401)
* add web_questions

* fix web questions dummy data

Co-authored-by: Mariama Drame <mariama@debmower_ajd>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
3 people authored and vegarab committed Aug 18, 2020
1 parent 844805c commit ea49484
Showing 3 changed files with 100 additions and 0 deletions.
1 change: 1 addition & 0 deletions datasets/web_questions/dataset_infos.json
@@ -0,0 +1 @@
{"default": {"description": "This dataset consists of 6,642 question/answer pairs.\nThe questions are supposed to be answerable by Freebase, a large knowledge graph.\nThe questions are mostly centered around a single named entity.\nThe questions are popular ones asked on the web (at least in 2013).\n", "citation": "\n@inproceedings{berant-etal-2013-semantic,\n title = \"Semantic Parsing on {F}reebase from Question-Answer Pairs\",\n author = \"Berant, Jonathan and\n Chou, Andrew and\n Frostig, Roy and\n Liang, Percy\",\n booktitle = \"Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing\",\n month = oct,\n year = \"2013\",\n address = \"Seattle, Washington, USA\",\n publisher = \"Association for Computational Linguistics\",\n url = \"https://www.aclweb.org/anthology/D13-1160\",\n pages = \"1533--1544\",\n}\n", "homepage": "https://worksheets.codalab.org/worksheets/0xba659fe363cb46e7a505c5b6a774dc8a", "license": "", "features": {"url": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answers": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "supervised_keys": null, "builder_name": "web_questions", "config_name": "default", "version": {"version_str": "1.0.0", "description": null, "nlp_version_to_prepare": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 533736, "num_examples": 3778, "dataset_name": "web_questions"}, "test": {"name": "test", "num_bytes": 289824, "num_examples": 2032, "dataset_name": "web_questions"}}, "download_checksums": {"https://worksheets.codalab.org/rest/bundles/0x4a763f8cde224c2da592b75f29e2f5c2/contents/blob/": {"num_bytes": 825320, "checksum": "fb1797e4554a1b1be642388367de1379f8c0d5afc609ac171492c67f7b70cb1e"}, "https://worksheets.codalab.org/rest/bundles/0xe7bac352fce7448c9ef238fb0a297ec2/contents/blob/": {"num_bytes": 447645, "checksum": "e3d4550e90660aaabe18458ba34b59f2624857273f375af7353273ce8b84ce6e"}}, "download_size": 1272965, "dataset_size": 823560, "size_in_bytes": 2096525}}
Binary file added datasets/web_questions/dummy/1.0.0/dummy_data.zip
99 changes: 99 additions & 0 deletions datasets/web_questions/web_questions.py
@@ -0,0 +1,99 @@
# coding=utf-8
# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace NLP Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Lint as: python3
"""WebQuestions Benchmark for Question Answering."""

from __future__ import absolute_import, division, print_function

import json
import re

import nlp


_CITATION = """
@inproceedings{berant-etal-2013-semantic,
    title = "Semantic Parsing on {F}reebase from Question-Answer Pairs",
    author = "Berant, Jonathan and
      Chou, Andrew and
      Frostig, Roy and
      Liang, Percy",
    booktitle = "Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing",
    month = oct,
    year = "2013",
    address = "Seattle, Washington, USA",
    publisher = "Association for Computational Linguistics",
    url = "https://www.aclweb.org/anthology/D13-1160",
    pages = "1533--1544",
}
"""
_SPLIT_DOWNLOAD_URL = {
    "train": "https://worksheets.codalab.org/rest/bundles/0x4a763f8cde224c2da592b75f29e2f5c2/contents/blob/",
    "test": "https://worksheets.codalab.org/rest/bundles/0xe7bac352fce7448c9ef238fb0a297ec2/contents/blob/",
}

_DESCRIPTION = """\
This dataset consists of 6,642 question/answer pairs.
The questions are supposed to be answerable by Freebase, a large knowledge graph.
The questions are mostly centered around a single named entity.
The questions are popular ones asked on the web (at least in 2013).
"""


class WebQuestions(nlp.GeneratorBasedBuilder):
"""WebQuestions Benchmark for Question Answering."""

VERSION = nlp.Version("1.0.0")

def _info(self):
return nlp.DatasetInfo(
description=_DESCRIPTION,
features=nlp.Features(
{
"url": nlp.Value("string"),
"question": nlp.Value("string"),
"answers": nlp.features.Sequence(nlp.Value("string")),
}
),
supervised_keys=None,
homepage="https://worksheets.codalab.org/worksheets/0xba659fe363cb46e7a505c5b6a774dc8a",
citation=_CITATION,
)

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        # download() accepts the dict of split name -> URL and returns a dict
        # with the same keys mapping to local cached file paths.
        file_paths = dl_manager.download(_SPLIT_DOWNLOAD_URL)

        return [
            nlp.SplitGenerator(name=split, gen_kwargs={"file_path": file_path})
            for split, file_path in file_paths.items()
        ]

    def _generate_examples(self, file_path):
        """Parses split file and yields examples."""

        def _target_to_answers(target):
            # A targetValue string looks like, e.g.:
            #   (list (description "New York City") (description Brooklyn))
            # Strip the "(list ...)" wrapper, then extract each quoted or
            # unquoted description. re.findall returns a (quoted, unquoted)
            # pair per match, exactly one of which is non-empty, so "".join
            # collapses each pair into a single answer string.
            target = re.sub(r"^\(list |\)$", "", target)
            return ["".join(ans) for ans in re.findall(r'\(description (?:"([^"]+?)"|([^)]+?))\)\w*', target)]

        # Each split file is a JSON array of records with "url", "utterance",
        # and "targetValue" fields.
        with open(file_path) as f:
            examples = json.load(f)
            for i, ex in enumerate(examples):
                yield i, {
                    "url": ex["url"],
                    "question": ex["utterance"],
                    "answers": _target_to_answers(ex["targetValue"]),
                }

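For reference, a minimal usage sketch (not part of this commit), assuming the `nlp` package is installed and the script above is available under the dataset name "web_questions":

import nlp

# Downloads, caches, and prepares both splits defined in _SPLIT_DOWNLOAD_URL.
dataset = nlp.load_dataset("web_questions")

# Each example carries the features declared in _info():
# "url" (string), "question" (string), and "answers" (a list of strings).
print(dataset["train"][0]["question"])
print(dataset["train"][0]["answers"])

Per dataset_infos.json above, the train split should report 3,778 examples and the test split 2,032.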