Add The Pile Enron Emails subset (#3427)
* Add The Pile Enron Emails subset

* Update dataset card

* Fix style
albertvillanova authored Dec 14, 2021
1 parent 0d814bd commit 7601a7b
Showing 2 changed files with 23 additions and 1 deletion.
14 changes: 14 additions & 0 deletions datasets/the_pile/README.md
@@ -81,6 +81,15 @@ This dataset is in English (`EN`).
}
```

#### enron_emails
```
{
'text': 'Name\t\t\tNew Title\t\t\t\tEffective Date\t\t\tMid Year promotion Yes/No\n\nFloyd, Jodie\t\tSr Cust Svc Rep (no change)\t\t7/16/01\t\t\t\tNo\n\nBuehler, Craig\t\tSr Mkt/Sup Analyst (no change)\t\t7/16/01\t\t\t\tNo\n\nWagoner, Mike\t\tTeam Advisor - Gas Control\t\t7/1/01\t\t\t\tNo\n\nClapper, Karen\t\tSr Cust Svc Rep\t\t\t8/1/01\t\t\t\tYes\n\nGreaney, Chris\t\tSr Cust Svc Rep\t\t\t8/1/01\t\t\t\tYes\n\nWilkens, Jerry\t\tSr Cust Svc Rep\t\t\t8/1/01\t\t\t\tYes\n\nMinton, Kevin\t\tPipeline Controller\t\t\t8/1/01\t\t\t\tYes\n\nCox, Don\t\tPipeline Controller\t\t\t8/1/01\t\t\t\tYes\n\nHanagriff, Richard\tSr Accounting Control Spec\t\t8/1/01\t\t\t\tYes\n\n\nThanks,\nMS'
'meta': "{}",
}
```
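
For illustration only (not something this commit adds), here is a minimal sketch of loading the new configuration once the change is released. The config name `enron_emails` comes from the loader changes further down; the single `train` split is an assumption based on how the other subset configs are exposed.

```python
from datasets import load_dataset

# Assumption: like the other subset configs, `enron_emails` exposes a single
# "train" split; the full .jsonl.zst archive is downloaded on first use.
enron = load_dataset("the_pile", "enron_emails", split="train")

print(enron[0]["text"][:200])  # raw e-mail text
print(enron[0]["meta"])        # metadata stored as a string, e.g. "{}"
```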

#### europarl
```
{
@@ -154,6 +163,11 @@ This dataset is in English (`EN`).
- `meta` (dict): Metadata of the data instance with keys:
  - pile_set_name: Name of the subset.

#### enron_emails

- `text` (str): Text.
- `meta` (str): Metadata of the data instance, serialized as a string (see the decoding sketch below).
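
Unlike the `all` configuration above, where `meta` is a dict with a `pile_set_name` key, this subset stores `meta` as a plain string (see the `datasets.Features` entry added in `the_pile.py` below). A hedged sketch of decoding it, assuming the string holds JSON as in the example instance; the helper name `parse_meta` is illustrative, not part of the dataset:

```python
import json

def parse_meta(meta_str):
    # Assumption: `meta` is a JSON-encoded string for this config (often just "{}").
    # Fall back to an empty dict if it does not parse.
    try:
        return json.loads(meta_str)
    except json.JSONDecodeError:
        return {}

# e.g. parse_meta(enron[0]["meta"]) == {}, reusing `enron` from the sketch above.
```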

#### europarl

- `text` (str): Text.
10 changes: 9 additions & 1 deletion datasets/the_pile/the_pile.py
@@ -39,6 +39,7 @@

_LICENSES = {
"all": "Multiple: see each subset license",
"enron_emails": "Unknown",
"europarl": "Unknown",
"free_law": "Unknown",
"hacker_news": "Unknown",
@@ -55,6 +56,7 @@
"validation": ["https://the-eye.eu/public/AI/pile/val.jsonl.zst"],
"test": ["https://the-eye.eu/public/AI/pile/test.jsonl.zst"],
},
"enron_emails": "http://eaidata.bmk.sh/data/enron_emails.jsonl.zst",
"europarl": "https://the-eye.eu/public/AI/pile_preliminary_components/EuroParliamentProceedings_1996_2011.jsonl.zst",
"free_law": "https://the-eye.eu/public/AI/pile_preliminary_components/FreeLaw_Opinions.jsonl.zst",
"hacker_news": "https://the-eye.eu/public/AI/pile_preliminary_components/hn.tar.gz",
@@ -72,6 +74,12 @@
"meta": {"pile_set_name": datasets.Value("string")},
}
),
"enron_emails": datasets.Features(
{
"text": datasets.Value("string"),
"meta": datasets.Value("string"),
}
),
"europarl": datasets.Features(
{
"text": datasets.Value("string"),
@@ -213,7 +221,7 @@ def _generate_examples(self, files):
                        key += 1
        else:
            for subset in files:
-               if subset in {"europarl", "free_law", "nih_exporter", "pubmed", "ubuntu_irc"}:
+               if subset in {"enron_emails", "europarl", "free_law", "nih_exporter", "pubmed", "ubuntu_irc"}:
                    import zstandard as zstd

                    with zstd.open(open(files[subset], "rb"), "rt", encoding="utf-8") as f:
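
The new `enron_emails` entry points at a zstandard-compressed JSON-lines file, so this branch handles it exactly like the other `.jsonl.zst` subsets. As a rough, hedged sketch of what reading such a file looks like (the field names come from the dataset card above, not from the collapsed part of the loader):

```python
import json
import zstandard as zstd

def iter_jsonl_zst(path):
    """Yield one JSON record per line from a zstandard-compressed JSON-lines file."""
    with zstd.open(open(path, "rb"), "rt", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                yield json.loads(line)

# e.g. for record in iter_jsonl_zst("enron_emails.jsonl.zst"):
#          print(record["text"][:80], record.get("meta"))
```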

1 comment on commit 7601a7b

@github-actions


PyArrow==3.0.0


Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
|---|---|
| read_batch_formatted_as_numpy after write_array2d | 0.010132 / 0.011353 (-0.001221) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.004080 / 0.011008 (-0.006928) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.031448 / 0.038508 (-0.007060) |
| read_batch_unformated after write_array2d | 0.035051 / 0.023109 (0.011942) |
| read_batch_unformated after write_flattened_sequence | 0.302659 / 0.275898 (0.026761) |
| read_batch_unformated after write_nested_sequence | 0.324804 / 0.323480 (0.001324) |
| read_col_formatted_as_numpy after write_array2d | 0.008127 / 0.007986 (0.000141) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.003627 / 0.004328 (-0.000702) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.009072 / 0.004250 (0.004822) |
| read_col_unformated after write_array2d | 0.045580 / 0.037052 (0.008528) |
| read_col_unformated after write_flattened_sequence | 0.294628 / 0.258489 (0.036139) |
| read_col_unformated after write_nested_sequence | 0.329850 / 0.293841 (0.036009) |
| read_formatted_as_numpy after write_array2d | 0.030888 / 0.128546 (-0.097658) |
| read_formatted_as_numpy after write_flattened_sequence | 0.008970 / 0.075646 (-0.066676) |
| read_formatted_as_numpy after write_nested_sequence | 0.256502 / 0.419271 (-0.162769) |
| read_unformated after write_array2d | 0.050079 / 0.043533 (0.006546) |
| read_unformated after write_flattened_sequence | 0.288222 / 0.255139 (0.033083) |
| read_unformated after write_nested_sequence | 0.318775 / 0.283200 (0.035576) |
| write_array2d | 0.080574 / 0.141683 (-0.061109) |
| write_flattened_sequence | 1.721976 / 1.452155 (0.269821) |
| write_nested_sequence | 1.771749 / 1.492716 (0.279033) |

Benchmark: benchmark_getitem_100B.json

| metric | new / old (diff) |
|---|---|
| get_batch_of_1024_random_rows | 0.282270 / 0.018006 (0.264263) |
| get_batch_of_1024_rows | 0.539905 / 0.000490 (0.539415) |
| get_first_row | 0.004612 / 0.000200 (0.004412) |
| get_last_row | 0.000095 / 0.000054 (0.000041) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
|---|---|
| select | 0.037342 / 0.037411 (-0.000069) |
| shard | 0.021940 / 0.014526 (0.007414) |
| shuffle | 0.028239 / 0.176557 (-0.148317) |
| sort | 0.067933 / 0.737135 (-0.669202) |
| train_test_split | 0.027308 / 0.296338 (-0.269031) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
|---|---|
| read 5000 | 0.432062 / 0.215209 (0.216853) |
| read 50000 | 4.333356 / 2.077655 (2.255701) |
| read_batch 50000 10 | 1.919434 / 1.504120 (0.415314) |
| read_batch 50000 100 | 1.734686 / 1.541195 (0.193491) |
| read_batch 50000 1000 | 1.869445 / 1.468490 (0.400955) |
| read_formatted numpy 5000 | 0.444877 / 4.584777 (-4.139900) |
| read_formatted pandas 5000 | 4.753126 / 3.745712 (1.007414) |
| read_formatted tensorflow 5000 | 3.814145 / 5.269862 (-1.455717) |
| read_formatted torch 5000 | 0.914788 / 4.565676 (-3.650888) |
| read_formatted_batch numpy 5000 10 | 0.054244 / 0.424275 (-0.370031) |
| read_formatted_batch numpy 5000 1000 | 0.011962 / 0.007607 (0.004355) |
| shuffled read 5000 | 0.541128 / 0.226044 (0.315083) |
| shuffled read 50000 | 5.420171 / 2.268929 (3.151243) |
| shuffled read_batch 50000 10 | 2.508097 / 55.444624 (-52.936527) |
| shuffled read_batch 50000 100 | 2.033124 / 6.876477 (-4.843353) |
| shuffled read_batch 50000 1000 | 2.071986 / 2.142072 (-0.070086) |
| shuffled read_formatted numpy 5000 | 0.564798 / 4.805227 (-4.240429) |
| shuffled read_formatted_batch numpy 5000 10 | 0.122406 / 6.500664 (-6.378259) |
| shuffled read_formatted_batch numpy 5000 1000 | 0.062226 / 0.075469 (-0.013243) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
|---|---|
| filter | 1.534516 / 1.841788 (-0.307271) |
| map fast-tokenizer batched | 12.087780 / 8.074308 (4.013472) |
| map identity | 27.396021 / 10.191392 (17.204629) |
| map identity batched | 0.708084 / 0.680424 (0.027660) |
| map no-op batched | 0.523175 / 0.534201 (-0.011026) |
| map no-op batched numpy | 0.495004 / 0.579283 (-0.084279) |
| map no-op batched pandas | 0.505573 / 0.434364 (0.071209) |
| map no-op batched pytorch | 0.318175 / 0.540337 (-0.222162) |
| map no-op batched tensorflow | 0.330537 / 1.386936 (-1.056399) |

PyArrow==latest

Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
|---|---|
| read_batch_formatted_as_numpy after write_array2d | 0.008492 / 0.011353 (-0.002861) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.003903 / 0.011008 (-0.007105) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.029879 / 0.038508 (-0.008629) |
| read_batch_unformated after write_array2d | 0.034010 / 0.023109 (0.010901) |
| read_batch_unformated after write_flattened_sequence | 0.305290 / 0.275898 (0.029392) |
| read_batch_unformated after write_nested_sequence | 0.335098 / 0.323480 (0.011618) |
| read_col_formatted_as_numpy after write_array2d | 0.006475 / 0.007986 (-0.001511) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.003662 / 0.004328 (-0.000667) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.007429 / 0.004250 (0.003178) |
| read_col_unformated after write_array2d | 0.044730 / 0.037052 (0.007678) |
| read_col_unformated after write_flattened_sequence | 0.293422 / 0.258489 (0.034933) |
| read_col_unformated after write_nested_sequence | 0.337160 / 0.293841 (0.043319) |
| read_formatted_as_numpy after write_array2d | 0.031341 / 0.128546 (-0.097205) |
| read_formatted_as_numpy after write_flattened_sequence | 0.009123 / 0.075646 (-0.066523) |
| read_formatted_as_numpy after write_nested_sequence | 0.254529 / 0.419271 (-0.164742) |
| read_unformated after write_array2d | 0.056884 / 0.043533 (0.013351) |
| read_unformated after write_flattened_sequence | 0.296716 / 0.255139 (0.041578) |
| read_unformated after write_nested_sequence | 0.318623 / 0.283200 (0.035423) |
| write_array2d | 0.083493 / 0.141683 (-0.058190) |
| write_flattened_sequence | 1.801611 / 1.452155 (0.349457) |
| write_nested_sequence | 1.891950 / 1.492716 (0.399233) |

Benchmark: benchmark_getitem_100B.json

| metric | new / old (diff) |
|---|---|
| get_batch_of_1024_random_rows | 0.282908 / 0.018006 (0.264902) |
| get_batch_of_1024_rows | 0.538098 / 0.000490 (0.537609) |
| get_first_row | 0.002241 / 0.000200 (0.002041) |
| get_last_row | 0.000094 / 0.000054 (0.000039) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
|---|---|
| select | 0.033257 / 0.037411 (-0.004154) |
| shard | 0.020729 / 0.014526 (0.006204) |
| shuffle | 0.028427 / 0.176557 (-0.148130) |
| sort | 0.067127 / 0.737135 (-0.670009) |
| train_test_split | 0.028457 / 0.296338 (-0.267881) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
|---|---|
| read 5000 | 0.424201 / 0.215209 (0.208992) |
| read 50000 | 4.242375 / 2.077655 (2.164720) |
| read_batch 50000 10 | 1.864288 / 1.504120 (0.360168) |
| read_batch 50000 100 | 1.622088 / 1.541195 (0.080893) |
| read_batch 50000 1000 | 1.762948 / 1.468490 (0.294458) |
| read_formatted numpy 5000 | 0.445914 / 4.584777 (-4.138863) |
| read_formatted pandas 5000 | 4.704186 / 3.745712 (0.958474) |
| read_formatted tensorflow 5000 | 2.248455 / 5.269862 (-3.021406) |
| read_formatted torch 5000 | 0.930975 / 4.565676 (-3.634702) |
| read_formatted_batch numpy 5000 10 | 0.053629 / 0.424275 (-0.370646) |
| read_formatted_batch numpy 5000 1000 | 0.012088 / 0.007607 (0.004481) |
| shuffled read 5000 | 0.539071 / 0.226044 (0.313027) |
| shuffled read 50000 | 5.372337 / 2.268929 (3.103408) |
| shuffled read_batch 50000 10 | 2.313623 / 55.444624 (-53.131001) |
| shuffled read_batch 50000 100 | 1.925709 / 6.876477 (-4.950768) |
| shuffled read_batch 50000 1000 | 1.960417 / 2.142072 (-0.181655) |
| shuffled read_formatted numpy 5000 | 0.562631 / 4.805227 (-4.242597) |
| shuffled read_formatted_batch numpy 5000 10 | 0.122692 / 6.500664 (-6.377972) |
| shuffled read_formatted_batch numpy 5000 1000 | 0.061053 / 0.075469 (-0.014416) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
|---|---|
| filter | 1.554741 / 1.841788 (-0.287046) |
| map fast-tokenizer batched | 12.134389 / 8.074308 (4.060081) |
| map identity | 27.118978 / 10.191392 (16.927586) |
| map identity batched | 0.744653 / 0.680424 (0.064229) |
| map no-op batched | 0.542648 / 0.534201 (0.008447) |
| map no-op batched numpy | 0.494321 / 0.579283 (-0.084962) |
| map no-op batched pandas | 0.515968 / 0.434364 (0.081604) |
| map no-op batched pytorch | 0.328317 / 0.540337 (-0.212021) |
| map no-op batched tensorflow | 0.342089 / 1.386936 (-1.044847) |
