Use full released xsum dataset #754

Merged · 8 commits · Oct 26, 2020
2 changes: 1 addition & 1 deletion datasets/xsum/dataset_infos.json
@@ -1 +1 @@
{"default": {"description": "\nExtreme Summarization (XSum) Dataset.\n\nThere are two features:\n - document: Input news article.\n - summary: One sentence summary of the article.\n\nThis data need to manaully downloaded and extracted as described in\n/~https://github.com/EdinburghNLP/XSum/blob/master/XSum-Dataset/README.md.\nThe folder 'xsum-extracts-from-downloads' need to be compressed as\n'xsum-extracts-from-downloads.tar.gz' and put in manually downloaded folder.\n", "citation": "\n@article{Narayan2018DontGM,\n title={Don't Give Me the Details, Just the Summary! Topic-Aware Convolutional Neural Networks for Extreme Summarization},\n author={Shashi Narayan and Shay B. Cohen and Mirella Lapata},\n journal={ArXiv},\n year={2018},\n volume={abs/1808.08745}\n}\n", "homepage": "/~https://github.com/EdinburghNLP/XSum/tree/master/XSum-Dataset", "license": "", "features": {"document": {"dtype": "string", "id": null, "_type": "Value"}, "summary": {"dtype": "string", "id": null, "_type": "Value"}}, "supervised_keys": {"input": "document", "output": "summary"}, "builder_name": "xsum", "config_name": "default", "version": {"version_str": "1.1.0", "description": null, "datasets_version_to_prepare": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 474092909, "num_examples": 204017, "dataset_name": "xsum"}, "validation": {"name": "validation", "num_bytes": 26011730, "num_examples": 11327, "dataset_name": "xsum"}, "test": {"name": "test", "num_bytes": 26470484, "num_examples": 11333, "dataset_name": "xsum"}}, "download_checksums": {"https://s3.amazonaws.com/datasets.huggingface.co/summarization/xsum.tar.gz": {"num_bytes": 204844092, "checksum": "3daaea63a068ad9d9c250ca39fcfe1e985e08696984dfbc3274f6a4082a29f88"}}, "download_size": 204844092, "dataset_size": 526575123, "size_in_bytes": 731419215}}
{"default": {"description": "\nExtreme Summarization (XSum) Dataset.\n\nThere are three features:\n - document: Input news article.\n - summary: One sentence summary of the article.\n - id: BBC ID of the article.\n\n", "citation": "\n@article{Narayan2018DontGM,\n title={Don't Give Me the Details, Just the Summary! Topic-Aware Convolutional Neural Networks for Extreme Summarization},\n author={Shashi Narayan and Shay B. Cohen and Mirella Lapata},\n journal={ArXiv},\n year={2018},\n volume={abs/1808.08745}\n}\n", "homepage": "/~https://github.com/EdinburghNLP/XSum/tree/master/XSum-Dataset", "license": "", "features": {"document": {"dtype": "string", "id": null, "_type": "Value"}, "summary": {"dtype": "string", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "document", "output": "summary"}, "builder_name": "xsum", "config_name": "default", "version": {"version_str": "1.2.0", "description": null, "major": 1, "minor": 2, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 479206608, "num_examples": 204045, "dataset_name": "xsum"}, "validation": {"name": "validation", "num_bytes": 26292901, "num_examples": 11332, "dataset_name": "xsum"}, "test": {"name": "test", "num_bytes": 26756165, "num_examples": 11334, "dataset_name": "xsum"}}, "download_checksums": {"http://bollin.inf.ed.ac.uk/public/direct/XSUM-EMNLP18-Summary-Data-Original.tar.gz": {"num_bytes": 254582292, "checksum": "10b48aa187fc9c904b30f76ca97e2da0de8d3a1238acc26acadef93e2001af90"}, "https://raw.githubusercontent.com/EdinburghNLP/XSum/master/XSum-Dataset/XSum-TRAINING-DEV-TEST-SPLIT-90-5-5.json": {"num_bytes": 2720574, "checksum": "9c0c5d8f048a90bd68b19a34e4c30577ed270d3247b2119fa06a04ef46292068"}}, "download_size": 257302866, "post_processing_size": null, "dataset_size": 532255674, "size_in_bytes": 789558540}}
Binary file added datasets/xsum/dummy/1.2.0/dummy_data.zip
Binary file not shown.
89 changes: 67 additions & 22 deletions datasets/xsum/xsum.py
@@ -18,6 +18,7 @@

 from __future__ import absolute_import, division, print_function

+import json
 import os

 import datasets
@@ -36,25 +37,45 @@
_DESCRIPTION = """
Extreme Summarization (XSum) Dataset.

There are two features:
There are three features:
- document: Input news article.
- summary: One sentence summary of the article.
- id: BBC ID of the article.

"""


_URL = "https://s3.amazonaws.com/datasets.huggingface.co/summarization/xsum.tar.gz"
# From /~https://github.com/EdinburghNLP/XSum/issues/12
_URL_DATA = "http://bollin.inf.ed.ac.uk/public/direct/XSUM-EMNLP18-Summary-Data-Original.tar.gz"
_URL_SPLITS = (
"https://raw.githubusercontent.com/EdinburghNLP/XSum/master/XSum-Dataset/XSum-TRAINING-DEV-TEST-SPLIT-90-5-5.json"
)

_DOCUMENT = "document"
_SUMMARY = "summary"
_ID = "id"

_REMOVE_LINES = set(
[
"Share this with\n",
"Email\n",
"Facebook\n",
"Messenger\n",
"Twitter\n",
"Pinterest\n",
"WhatsApp\n",
"Linkedin\n",
"LinkedIn\n",
"Copy this link\n",
"These are external links and will open in a new window\n",
]
)

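For intuition, here is what the `_REMOVE_LINES` filter does to an article; a small self-contained sketch with invented lines and a subset of the set above (the real filter runs per line in `_generate_examples` further down):

```python
# Illustrative sketch: invented article lines, subset of _REMOVE_LINES above.
remove_lines = {"Share this with\n", "Email\n", "Facebook\n"}

sample = ["Share this with\n", "Email\n", "\n", "A real first sentence.\n"]
# Keep a line only if it is not share-bar boilerplate and not blank.
text = "".join(line for line in sample if line not in remove_lines and line.strip())
print(repr(text))  # -> 'A real first sentence.\n'
```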

 class Xsum(datasets.GeneratorBasedBuilder):
     """Extreme Summarization (XSum) Dataset."""

-    # Version 1.1.0 removes web contents.
-    VERSION = datasets.Version("1.1.0")
-    SUPPORTED_VERSIONS = [datasets.Version("1.0.0", "Dataset without cleaning.")]
+    # Version 1.2.0 expands coverage, includes ids, and removes web contents.
+    VERSION = datasets.Version("1.2.0")

     def _info(self):
         return datasets.DatasetInfo(
@@ -63,6 +84,7 @@ def _info(self):
                 {
                     _DOCUMENT: datasets.Value("string"),
                     _SUMMARY: datasets.Value("string"),
+                    _ID: datasets.Value("string"),
                 }
             ),
             supervised_keys=(_DOCUMENT, _SUMMARY),
@@ -73,39 +95,62 @@ def _info(self):
     def _split_generators(self, dl_manager):
         """Returns SplitGenerators."""

-        dl_path = dl_manager.download_and_extract(_URL)
+        files_to_download = {"data": _URL_DATA, "splits": _URL_SPLITS}
+        downloaded_files = dl_manager.download_and_extract(files_to_download)

-        dl_path = os.path.join(dl_path, "xsum")
         return [
             datasets.SplitGenerator(
                 name=datasets.Split.TRAIN,
                 gen_kwargs={
-                    "source": os.path.join(dl_path, "train.source"),
-                    "target": os.path.join(dl_path, "train.target"),
+                    "split_path": downloaded_files["splits"],
+                    "split_name": "train",
+                    "data_dir": os.path.join(downloaded_files["data"], "bbc-summary-data"),
                 },
             ),
             datasets.SplitGenerator(
                 name=datasets.Split.VALIDATION,
                 gen_kwargs={
-                    "source": os.path.join(dl_path, "val.source"),
-                    "target": os.path.join(dl_path, "val.target"),
+                    "split_path": downloaded_files["splits"],
+                    "split_name": "validation",
+                    "data_dir": os.path.join(downloaded_files["data"], "bbc-summary-data"),
                 },
             ),
             datasets.SplitGenerator(
                 name=datasets.Split.TEST,
                 gen_kwargs={
-                    "source": os.path.join(dl_path, "test.source"),
-                    "target": os.path.join(dl_path, "test.target"),
+                    "split_path": downloaded_files["splits"],
+                    "split_name": "test",
+                    "data_dir": os.path.join(downloaded_files["data"], "bbc-summary-data"),
                 },
             ),
         ]
-    def _generate_examples(self, source, target):
+    def _generate_examples(self, split_path, split_name, data_dir):
         """Yields examples."""
-        with open(source, encoding="utf-8") as f1:
-            source = f1.readlines()
-        with open(target, encoding="utf-8") as f2:
-            target = f2.readlines()
-        assert len(source) == len(target)
-        for i in range(len(target)):
-            yield i, {_DOCUMENT: source[i], _SUMMARY: target[i]}
+
+        with open(split_path, "r", encoding="utf-8") as f:
+            split_ids = json.load(f)
+
+        for i in split_ids[split_name]:
+            with open(os.path.join(data_dir, i + ".summary"), "r", encoding="utf-8") as f:
+                text = "".join([line for line in f.readlines() if line not in _REMOVE_LINES and line.strip()])
+                # Each file has the following format:
+                # [SN]URL[SN]
+                # http://somelink
+                #
+                # [SN]TITLE[SN]
+                # some title
+                #
+                # [SN]FIRST-SENTENCE[SN]
+                # one sentence summary
+                #
+                # [SN]RESTBODY[SN]
+                # text line.
+                # another text line.
+                # "another text line."
+
+                # According to the following issue, FIRST-SENTENCE
+                # is the reference summary and TITLE is unused:
+                # /~https://github.com/EdinburghNLP/XSum/issues/22
+                segs = text.split("[SN]")
+                yield i, {_DOCUMENT: segs[8].strip(), _SUMMARY: segs[6].strip(), _ID: i}
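The index arithmetic in the last two lines is easiest to verify on a toy file: splitting on "[SN]" yields nine segments, with the section names at the odd indices and their contents at the even ones, so `segs[6]` is the first sentence (the summary) and `segs[8]` is the rest of the body (the document). A sketch with invented contents:

```python
# Illustrative sketch with invented contents, in the format documented above
# (boilerplate and blank lines already filtered out).
text = (
    "[SN]URL[SN]\nhttp://somelink\n"
    "[SN]TITLE[SN]\nsome title\n"
    "[SN]FIRST-SENTENCE[SN]\nOne-sentence summary of the article.\n"
    "[SN]RESTBODY[SN]\ntext line.\nanother text line.\n"
)

segs = text.split("[SN]")
# segs[1] == "URL", segs[3] == "TITLE", segs[5] == "FIRST-SENTENCE",
# segs[7] == "RESTBODY"; the even indices 2, 4, 6, 8 hold the contents.
print(segs[6].strip())  # -> "One-sentence summary of the article."
print(segs[8].strip())  # -> "text line.\nanother text line."
```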