Use full released xsum dataset #754

Merged · 8 commits · Oct 26, 2020
2 changes: 1 addition & 1 deletion datasets/xsum/dataset_infos.json
@@ -1 +1 @@
{"default": {"description": "\nExtreme Summarization (XSum) Dataset.\n\nThere are two features:\n - document: Input news article.\n - summary: One sentence summary of the article.\n\nThis data need to manaully downloaded and extracted as described in\n/~https://github.com/EdinburghNLP/XSum/blob/master/XSum-Dataset/README.md.\nThe folder 'xsum-extracts-from-downloads' need to be compressed as\n'xsum-extracts-from-downloads.tar.gz' and put in manually downloaded folder.\n", "citation": "\n@article{Narayan2018DontGM,\n title={Don't Give Me the Details, Just the Summary! Topic-Aware Convolutional Neural Networks for Extreme Summarization},\n author={Shashi Narayan and Shay B. Cohen and Mirella Lapata},\n journal={ArXiv},\n year={2018},\n volume={abs/1808.08745}\n}\n", "homepage": "/~https://github.com/EdinburghNLP/XSum/tree/master/XSum-Dataset", "license": "", "features": {"document": {"dtype": "string", "id": null, "_type": "Value"}, "summary": {"dtype": "string", "id": null, "_type": "Value"}}, "supervised_keys": {"input": "document", "output": "summary"}, "builder_name": "xsum", "config_name": "default", "version": {"version_str": "1.1.0", "description": null, "datasets_version_to_prepare": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 474092909, "num_examples": 204017, "dataset_name": "xsum"}, "validation": {"name": "validation", "num_bytes": 26011730, "num_examples": 11327, "dataset_name": "xsum"}, "test": {"name": "test", "num_bytes": 26470484, "num_examples": 11333, "dataset_name": "xsum"}}, "download_checksums": {"https://s3.amazonaws.com/datasets.huggingface.co/summarization/xsum.tar.gz": {"num_bytes": 204844092, "checksum": "3daaea63a068ad9d9c250ca39fcfe1e985e08696984dfbc3274f6a4082a29f88"}}, "download_size": 204844092, "dataset_size": 526575123, "size_in_bytes": 731419215}}
{"default": {"description": "\nExtreme Summarization (XSum) Dataset.\n\nThere are three features:\n - document: Input news article.\n - summary: One sentence summary of the article.\n - id: BBC ID of the article.\n\n", "citation": "\n@article{Narayan2018DontGM,\n title={Don't Give Me the Details, Just the Summary! Topic-Aware Convolutional Neural Networks for Extreme Summarization},\n author={Shashi Narayan and Shay B. Cohen and Mirella Lapata},\n journal={ArXiv},\n year={2018},\n volume={abs/1808.08745}\n}\n", "homepage": "/~https://github.com/EdinburghNLP/XSum/tree/master/XSum-Dataset", "license": "", "features": {"document": {"dtype": "string", "id": null, "_type": "Value"}, "summary": {"dtype": "string", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "document", "output": "summary"}, "builder_name": "xsum", "config_name": "default", "version": {"version_str": "1.2.0", "description": null, "major": 1, "minor": 2, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 479206608, "num_examples": 204045, "dataset_name": "xsum"}, "validation": {"name": "validation", "num_bytes": 26292901, "num_examples": 11332, "dataset_name": "xsum"}, "test": {"name": "test", "num_bytes": 26756165, "num_examples": 11334, "dataset_name": "xsum"}}, "download_checksums": {"http://bollin.inf.ed.ac.uk/public/direct/XSUM-EMNLP18-Summary-Data-Original.tar.gz": {"num_bytes": 254582292, "checksum": "10b48aa187fc9c904b30f76ca97e2da0de8d3a1238acc26acadef93e2001af90"}, "https://raw.githubusercontent.com/EdinburghNLP/XSum/master/XSum-Dataset/XSum-TRAINING-DEV-TEST-SPLIT-90-5-5.json": {"num_bytes": 2720574, "checksum": "9c0c5d8f048a90bd68b19a34e4c30577ed270d3247b2119fa06a04ef46292068"}}, "download_size": 257302866, "post_processing_size": null, "dataset_size": 532255674, "size_in_bytes": 789558540}}
Binary file added datasets/xsum/dummy/1.2.0/dummy_data.zip
Binary file not shown.
89 changes: 67 additions & 22 deletions datasets/xsum/xsum.py
@@ -18,6 +18,7 @@

 from __future__ import absolute_import, division, print_function

+import json
 import os

 import datasets
@@ -36,25 +37,45 @@
_DESCRIPTION = """
Extreme Summarization (XSum) Dataset.

There are two features:
There are three features:
- document: Input news article.
- summary: One sentence summary of the article.
- id: BBC ID of the article.

"""


_URL = "https://s3.amazonaws.com/datasets.huggingface.co/summarization/xsum.tar.gz"
# From /~https://github.com/EdinburghNLP/XSum/issues/12
_URL_DATA = "http://bollin.inf.ed.ac.uk/public/direct/XSUM-EMNLP18-Summary-Data-Original.tar.gz"
_URL_SPLITS = (
"https://raw.githubusercontent.com/EdinburghNLP/XSum/master/XSum-Dataset/XSum-TRAINING-DEV-TEST-SPLIT-90-5-5.json"
)

_DOCUMENT = "document"
_SUMMARY = "summary"
_ID = "id"

_REMOVE_LINES = set(
[
"Share this with\n",
"Email\n",
"Facebook\n",
"Messenger\n",
"Twitter\n",
"Pinterest\n",
"WhatsApp\n",
"Linkedin\n",
"LinkedIn\n",
"Copy this link\n",
"These are external links and will open in a new window\n",
]
)

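For intuition, here is what the `_REMOVE_LINES` filter does to an article; a small self-contained sketch with invented lines and a subset of the set above (the real filter runs per line in `_generate_examples` further down):

```python
# Illustrative sketch: invented article lines, subset of _REMOVE_LINES above.
remove_lines = {"Share this with\n", "Email\n", "Facebook\n"}

sample = ["Share this with\n", "Email\n", "\n", "A real first sentence.\n"]
# Keep a line only if it is not share-bar boilerplate and not blank.
text = "".join(line for line in sample if line not in remove_lines and line.strip())
print(repr(text))  # -> 'A real first sentence.\n'
```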

 class Xsum(datasets.GeneratorBasedBuilder):
     """Extreme Summarization (XSum) Dataset."""

-    # Version 1.1.0 removes web contents.
-    VERSION = datasets.Version("1.1.0")
-    SUPPORTED_VERSIONS = [datasets.Version("1.0.0", "Dataset without cleaning.")]
+    # Version 1.2.0 expands coverage, includes ids, and removes web contents.
+    VERSION = datasets.Version("1.2.0")

     def _info(self):
         return datasets.DatasetInfo(
@@ -63,6 +84,7 @@ def _info(self):
                 {
                     _DOCUMENT: datasets.Value("string"),
                     _SUMMARY: datasets.Value("string"),
+                    _ID: datasets.Value("string"),
                 }
             ),
             supervised_keys=(_DOCUMENT, _SUMMARY),
@@ -73,39 +95,62 @@ def _info(self):
     def _split_generators(self, dl_manager):
         """Returns SplitGenerators."""

-        dl_path = dl_manager.download_and_extract(_URL)
+        files_to_download = {"data": _URL_DATA, "splits": _URL_SPLITS}
+        downloaded_files = dl_manager.download_and_extract(files_to_download)

-        dl_path = os.path.join(dl_path, "xsum")
         return [
             datasets.SplitGenerator(
                 name=datasets.Split.TRAIN,
                 gen_kwargs={
-                    "source": os.path.join(dl_path, "train.source"),
-                    "target": os.path.join(dl_path, "train.target"),
+                    "split_path": downloaded_files["splits"],
+                    "split_name": "train",
+                    "data_dir": os.path.join(downloaded_files["data"], "bbc-summary-data"),
                 },
             ),
             datasets.SplitGenerator(
                 name=datasets.Split.VALIDATION,
                 gen_kwargs={
-                    "source": os.path.join(dl_path, "val.source"),
-                    "target": os.path.join(dl_path, "val.target"),
+                    "split_path": downloaded_files["splits"],
+                    "split_name": "validation",
+                    "data_dir": os.path.join(downloaded_files["data"], "bbc-summary-data"),
                 },
             ),
             datasets.SplitGenerator(
                 name=datasets.Split.TEST,
                 gen_kwargs={
-                    "source": os.path.join(dl_path, "test.source"),
-                    "target": os.path.join(dl_path, "test.target"),
+                    "split_path": downloaded_files["splits"],
+                    "split_name": "test",
+                    "data_dir": os.path.join(downloaded_files["data"], "bbc-summary-data"),
                 },
             ),
         ]
-    def _generate_examples(self, source, target):
+    def _generate_examples(self, split_path, split_name, data_dir):
         """Yields examples."""
-        with open(source, encoding="utf-8") as f1:
-            source = f1.readlines()
-        with open(target, encoding="utf-8") as f2:
-            target = f2.readlines()
-        assert len(source) == len(target)
-        for i in range(len(target)):
-            yield i, {_DOCUMENT: source[i], _SUMMARY: target[i]}
+
+        with open(split_path, "r", encoding="utf-8") as f:
+            split_ids = json.load(f)
+
+        for i in split_ids[split_name]:
+            with open(os.path.join(data_dir, i + ".summary"), "r", encoding="utf-8") as f:
+                text = "".join([line for line in f.readlines() if line not in _REMOVE_LINES and line.strip()])
+                # Each file has the following format:
+                # [SN]URL[SN]
+                # http://somelink
+                #
+                # [SN]TITLE[SN]
+                # some title
+                #
+                # [SN]FIRST-SENTENCE[SN]
+                # one sentence summary
+                #
+                # [SN]RESTBODY[SN]
+                # text line.
+                # another text line.
+                # "another text line."
+
+                # According to the following issue, FIRST-SENTENCE
+                # is the reference summary and TITLE is unused:
+                # /~https://github.com/EdinburghNLP/XSum/issues/22
+                segs = text.split("[SN]")
+                yield i, {_DOCUMENT: segs[8].strip(), _SUMMARY: segs[6].strip(), _ID: i}
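The index arithmetic in the last two lines is easiest to verify on a toy file: splitting on "[SN]" yields nine segments, with the section names at the odd indices and their contents at the even ones, so `segs[6]` is the first sentence (the summary) and `segs[8]` is the rest of the body (the document). A sketch with invented contents:

```python
# Illustrative sketch with invented contents, in the format documented above
# (boilerplate and blank lines already filtered out).
text = (
    "[SN]URL[SN]\nhttp://somelink\n"
    "[SN]TITLE[SN]\nsome title\n"
    "[SN]FIRST-SENTENCE[SN]\nOne-sentence summary of the article.\n"
    "[SN]RESTBODY[SN]\ntext line.\nanother text line.\n"
)

segs = text.split("[SN]")
# segs[1] == "URL", segs[3] == "TITLE", segs[5] == "FIRST-SENTENCE",
# segs[7] == "RESTBODY"; the even indices 2, 4, 6, 8 hold the contents.
print(segs[6].strip())  # -> "One-sentence summary of the article."
print(segs[8].strip())  # -> "text line.\nanother text line."
```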