Add SLR83 to OpenSLR (#3125)

* Add SLR83 to OpenSLR * Fix en-GB and en-IE locale identifiers for openslr/SLR83 Co-authored-by: Jonathan Zimmerman <jonazi01@noa.nintendo.com>
huggingface · Oct 22, 2021 · aa7e0af · aa7e0af · github-actions · Oct 22, 2021
1 parent ac0d1d1
commit aa7e0af
Show file tree

Hide file tree

Showing 4 changed files with 64 additions and 1 deletion.
diff --git a/datasets/openslr/README.md b/datasets/openslr/README.md
@@ -60,6 +60,9 @@ languages:
   - kn
   SLR80:
   - my
+  SLR83:
+  - en-GB
+  - en-IE
   SLR86:
   - yo
 licenses:
@@ -494,6 +497,19 @@ https://github.com/google/language-resources#license for license information.
 
 Copyright 2018, 2019 Google, Inc.
 
+#### SLR83: Crowdsourced high-quality UK and Ireland English Dialect speech data set
+This data set contains transcribed high-quality audio of English sentences recorded by volunteers speaking different dialects of the language.
+The data set consists of wave files, and a CSV file (line_index.csv). The file line_index.csv contains a line id, an anonymized FileID and the transcription of audio in the file.
+
+The data set has been manually quality checked, but there might still be errors.
+
+The recordings from the Welsh English speakers were collected in collaboration with Cardiff University.
+
+The dataset is distributed under Creative Commons Attribution-ShareAlike 4.0 International Public License.
+See [LICENSE](https://www.openslr.org/resources/83/LICENSE) file and https://github.com/google/language-resources#license for license information.
+
+Copyright 2018, 2019 Google, Inc.
+
 #### SLR86: Crowdsourced high-quality  multi-speaker speech data set
 This data set contains transcribed high-quality audio of  sentences recorded by volunteers. The data set 
 consists of wave files, and a TSV file (line_index.tsv). The file line_index.tsv contains an anonymized FileID and 

diff --git a/datasets/openslr/dataset_infos.json b/datasets/openslr/dataset_infos.json
diff --git a/datasets/openslr/dummy/SLR83/0.0.0/dummy_data.zip b/datasets/openslr/dummy/SLR83/0.0.0/dummy_data.zip
diff --git a/datasets/openslr/openslr.py b/datasets/openslr/openslr.py
@@ -112,6 +112,20 @@
     ISBN = {979-10-95546-34-4},
 }
 
+SLR83
+@inproceedings{demirsahin-etal-2020-open,
+    title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},
+    author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},
+    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},
+    month = may,
+    year = {2020},
+    pages = {6532--6541},
+    address = {Marseille, France},
+    publisher = {European Language Resources Association (ELRA)},
+    url = {https://www.aclweb.org/anthology/2020.lrec-1.804},
+    ISBN = {979-10-95546-34-4},
+}
+
 SLR80
 @inproceedings{oo-etal-2020-burmese,
     title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application
@@ -479,6 +493,39 @@
         "IndexFiles": ["line_index.tsv"],
         "DataDirs": [""],
     },
+    "SLR83": {
+        "Language": "English",
+        "LongName": "Crowdsourced high-quality UK and Ireland English Dialect speech data set",
+        "Category": "Speech",
+        "Summary": "Data set which contains male and female recordings of English from various dialects of the UK and Ireland",
+        "Files": [
+            "irish_english_male.zip",
+            "midlands_english_female.zip",
+            "midlands_english_male.zip",
+            "northern_english_female.zip",
+            "northern_english_male.zip",
+            "scottish_english_female.zip",
+            "scottish_english_male.zip",
+            "southern_english_female.zip",
+            "southern_english_male.zip",
+            "welsh_english_female.zip",
+            "welsh_english_male.zip",
+        ],
+        "IndexFiles": [
+            "line_index.csv",
+            "line_index.csv",
+            "line_index.csv",
+            "line_index.csv",
+            "line_index.csv",
+            "line_index.csv",
+            "line_index.csv",
+            "line_index.csv",
+            "line_index.csv",
+            "line_index.csv",
+            "line_index.csv",
+        ],
+        "DataDirs": ["", "", "", "", "", "", "", "", "", "", ""],
+    },
     "SLR86": {
         "Language": "Yoruba",
         "LongName": "Crowdsourced high-quality Yoruba speech data set",