From 0281f9d881f3a55c89aeaa642f1ba23444b64083 Mon Sep 17 00:00:00 2001 From: "Gowtham.R" Date: Fri, 22 Jan 2021 15:43:45 +0530 Subject: [PATCH] PAWS-X: Fix csv Dictreader splitting data on quotes (#1763) * Fix csv Dictreader spliting data on quotes * remove -1 labels * update dataset_infos.json * update readme Co-authored-by: Quentin Lhoest --- datasets/paws-x/README.md | 17 +++++++++-------- datasets/paws-x/dataset_infos.json | 2 +- datasets/paws-x/paws-x.py | 4 +--- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/datasets/paws-x/README.md b/datasets/paws-x/README.md index be7968c58d2..a3276f87961 100644 --- a/datasets/paws-x/README.md +++ b/datasets/paws-x/README.md @@ -117,17 +117,18 @@ corresponding file in PAWS-Wiki. ### Data Splits -The numbers of examples for each of the six languages are shown below: +The numbers of examples for each of the seven languages are shown below: Language | Train | Dev | Test :------- | ------: | -----: | -----: -fr | 49,401 | 1,992 | 1,985 -es | 49,401 | 1,962 | 1,999 -de | 49,401 | 1,932 | 1,967 -zh | 49,401 | 1,984 | 1,975 -ja | 49,401 | 1,980 | 1,946 -ko | 49,401 | 1,965 | 1,972 -Total | 296,406 | 11,815 | 11,844 +en | 49,401 | 2,000 | 2,000 +fr | 49,401 | 2,000 | 2,000 +es | 49,401 | 2,000 | 2,000 +de | 49,401 | 2,000 | 2,000 +zh | 49,401 | 2,000 | 2,000 +ja | 49,401 | 2,000 | 2,000 +ko | 49,401 | 2,000 | 2,000 + > **Caveat**: please note that the dev and test sets of PAWS-X are both sourced > from the dev set of PAWS-Wiki. As a consequence, the same `sentence 1` may diff --git a/datasets/paws-x/dataset_infos.json b/datasets/paws-x/dataset_infos.json index 5f23005f8e1..47292db059d 100644 --- a/datasets/paws-x/dataset_infos.json +++ b/datasets/paws-x/dataset_infos.json @@ -1 +1 @@ -{"en": {"description": "PAWS-X, a multilingual version of PAWS (Paraphrase Adversaries from Word Scrambling) for six languages.\n\nThis dataset contains 23,659 human translated PAWS evaluation pairs and 296,406 machine\ntranslated training pairs in six typologically distinct languages: French, Spanish, German,\nChinese, Japanese, and Korean. English language is available by default. All translated\npairs are sourced from examples in PAWS-Wiki.\n\nFor further details, see the accompanying paper: PAWS-X: A Cross-lingual Adversarial Dataset\nfor Paraphrase Identification (https://arxiv.org/abs/1908.11828)\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.\n", "citation": "@InProceedings{pawsx2019emnlp,\n title = {{PAWS-X: A Cross-lingual Adversarial Dataset for Paraphrase Identification}},\n author = {Yang, Yinfei and Zhang, Yuan and Tar, Chris and Baldridge, Jason},\n booktitle = {Proc. of EMNLP},\n year = {2019}\n}\n", "homepage": "/~https://github.com/google-research-datasets/paws/tree/master/pawsx", "license": "The dataset may be freely used for any purpose, although acknowledgement of Google LLC (\"Google\") as the data source would be appreciated. The dataset is provided \"AS IS\" without any warranty, express or implied. Google disclaims all liability for any damages, direct or indirect, resulting from the use of the dataset.", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "sentence1": {"dtype": "string", "id": null, "_type": "Value"}, "sentence2": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["0", "1"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "pawsx", "config_name": "en", "version": {"version_str": "1.1.0", "description": "", "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 12209260, "num_examples": 49202, "dataset_name": "pawsx"}, "test": {"name": "test", "num_bytes": 494734, "num_examples": 2000, "dataset_name": "pawsx"}, "validation": {"name": "validation", "num_bytes": 492287, "num_examples": 2000, "dataset_name": "pawsx"}}, "download_checksums": {"https://storage.googleapis.com/paws/pawsx/x-final.tar.gz": {"num_bytes": 30282057, "checksum": "4146db499101d66e68ae4c8ed3cf9dadecd625f44b7d8cf3d4a0fe93afc4fd9f"}}, "download_size": 30282057, "post_processing_size": null, "dataset_size": 13196281, "size_in_bytes": 43478338}, "de": {"description": "PAWS-X, a multilingual version of PAWS (Paraphrase Adversaries from Word Scrambling) for six languages.\n\nThis dataset contains 23,659 human translated PAWS evaluation pairs and 296,406 machine\ntranslated training pairs in six typologically distinct languages: French, Spanish, German,\nChinese, Japanese, and Korean. English language is available by default. All translated\npairs are sourced from examples in PAWS-Wiki.\n\nFor further details, see the accompanying paper: PAWS-X: A Cross-lingual Adversarial Dataset\nfor Paraphrase Identification (https://arxiv.org/abs/1908.11828)\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.\n", "citation": "@InProceedings{pawsx2019emnlp,\n title = {{PAWS-X: A Cross-lingual Adversarial Dataset for Paraphrase Identification}},\n author = {Yang, Yinfei and Zhang, Yuan and Tar, Chris and Baldridge, Jason},\n booktitle = {Proc. of EMNLP},\n year = {2019}\n}\n", "homepage": "/~https://github.com/google-research-datasets/paws/tree/master/pawsx", "license": "The dataset may be freely used for any purpose, although acknowledgement of Google LLC (\"Google\") as the data source would be appreciated. The dataset is provided \"AS IS\" without any warranty, express or implied. Google disclaims all liability for any damages, direct or indirect, resulting from the use of the dataset.", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "sentence1": {"dtype": "string", "id": null, "_type": "Value"}, "sentence2": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["0", "1"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "pawsx", "config_name": "de", "version": {"version_str": "1.1.0", "description": "", "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 12799423, "num_examples": 49383, "dataset_name": "pawsx"}, "test": {"name": "test", "num_bytes": 524190, "num_examples": 2000, "dataset_name": "pawsx"}, "validation": {"name": "validation", "num_bytes": 514005, "num_examples": 2000, "dataset_name": "pawsx"}}, "download_checksums": {"https://storage.googleapis.com/paws/pawsx/x-final.tar.gz": {"num_bytes": 30282057, "checksum": "4146db499101d66e68ae4c8ed3cf9dadecd625f44b7d8cf3d4a0fe93afc4fd9f"}}, "download_size": 30282057, "post_processing_size": null, "dataset_size": 13837618, "size_in_bytes": 44119675}, "es": {"description": "PAWS-X, a multilingual version of PAWS (Paraphrase Adversaries from Word Scrambling) for six languages.\n\nThis dataset contains 23,659 human translated PAWS evaluation pairs and 296,406 machine\ntranslated training pairs in six typologically distinct languages: French, Spanish, German,\nChinese, Japanese, and Korean. English language is available by default. All translated\npairs are sourced from examples in PAWS-Wiki.\n\nFor further details, see the accompanying paper: PAWS-X: A Cross-lingual Adversarial Dataset\nfor Paraphrase Identification (https://arxiv.org/abs/1908.11828)\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.\n", "citation": "@InProceedings{pawsx2019emnlp,\n title = {{PAWS-X: A Cross-lingual Adversarial Dataset for Paraphrase Identification}},\n author = {Yang, Yinfei and Zhang, Yuan and Tar, Chris and Baldridge, Jason},\n booktitle = {Proc. of EMNLP},\n year = {2019}\n}\n", "homepage": "/~https://github.com/google-research-datasets/paws/tree/master/pawsx", "license": "The dataset may be freely used for any purpose, although acknowledgement of Google LLC (\"Google\") as the data source would be appreciated. The dataset is provided \"AS IS\" without any warranty, express or implied. Google disclaims all liability for any damages, direct or indirect, resulting from the use of the dataset.", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "sentence1": {"dtype": "string", "id": null, "_type": "Value"}, "sentence2": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["0", "1"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "pawsx", "config_name": "es", "version": {"version_str": "1.1.0", "description": "", "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 12807894, "num_examples": 49401, "dataset_name": "pawsx"}, "test": {"name": "test", "num_bytes": 519043, "num_examples": 2000, "dataset_name": "pawsx"}, "validation": {"name": "validation", "num_bytes": 513223, "num_examples": 1970, "dataset_name": "pawsx"}}, "download_checksums": {"https://storage.googleapis.com/paws/pawsx/x-final.tar.gz": {"num_bytes": 30282057, "checksum": "4146db499101d66e68ae4c8ed3cf9dadecd625f44b7d8cf3d4a0fe93afc4fd9f"}}, "download_size": 30282057, "post_processing_size": null, "dataset_size": 13840160, "size_in_bytes": 44122217}, "fr": {"description": "PAWS-X, a multilingual version of PAWS (Paraphrase Adversaries from Word Scrambling) for six languages.\n\nThis dataset contains 23,659 human translated PAWS evaluation pairs and 296,406 machine\ntranslated training pairs in six typologically distinct languages: French, Spanish, German,\nChinese, Japanese, and Korean. English language is available by default. All translated\npairs are sourced from examples in PAWS-Wiki.\n\nFor further details, see the accompanying paper: PAWS-X: A Cross-lingual Adversarial Dataset\nfor Paraphrase Identification (https://arxiv.org/abs/1908.11828)\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.\n", "citation": "@InProceedings{pawsx2019emnlp,\n title = {{PAWS-X: A Cross-lingual Adversarial Dataset for Paraphrase Identification}},\n author = {Yang, Yinfei and Zhang, Yuan and Tar, Chris and Baldridge, Jason},\n booktitle = {Proc. of EMNLP},\n year = {2019}\n}\n", "homepage": "/~https://github.com/google-research-datasets/paws/tree/master/pawsx", "license": "The dataset may be freely used for any purpose, although acknowledgement of Google LLC (\"Google\") as the data source would be appreciated. The dataset is provided \"AS IS\" without any warranty, express or implied. Google disclaims all liability for any damages, direct or indirect, resulting from the use of the dataset.", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "sentence1": {"dtype": "string", "id": null, "_type": "Value"}, "sentence2": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["0", "1"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "pawsx", "config_name": "fr", "version": {"version_str": "1.1.0", "description": "", "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 13294805, "num_examples": 49401, "dataset_name": "pawsx"}, "test": {"name": "test", "num_bytes": 535027, "num_examples": 2000, "dataset_name": "pawsx"}, "validation": {"name": "validation", "num_bytes": 532657, "num_examples": 1990, "dataset_name": "pawsx"}}, "download_checksums": {"https://storage.googleapis.com/paws/pawsx/x-final.tar.gz": {"num_bytes": 30282057, "checksum": "4146db499101d66e68ae4c8ed3cf9dadecd625f44b7d8cf3d4a0fe93afc4fd9f"}}, "download_size": 30282057, "post_processing_size": null, "dataset_size": 14362489, "size_in_bytes": 44644546}, "ja": {"description": "PAWS-X, a multilingual version of PAWS (Paraphrase Adversaries from Word Scrambling) for six languages.\n\nThis dataset contains 23,659 human translated PAWS evaluation pairs and 296,406 machine\ntranslated training pairs in six typologically distinct languages: French, Spanish, German,\nChinese, Japanese, and Korean. English language is available by default. All translated\npairs are sourced from examples in PAWS-Wiki.\n\nFor further details, see the accompanying paper: PAWS-X: A Cross-lingual Adversarial Dataset\nfor Paraphrase Identification (https://arxiv.org/abs/1908.11828)\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.\n", "citation": "@InProceedings{pawsx2019emnlp,\n title = {{PAWS-X: A Cross-lingual Adversarial Dataset for Paraphrase Identification}},\n author = {Yang, Yinfei and Zhang, Yuan and Tar, Chris and Baldridge, Jason},\n booktitle = {Proc. of EMNLP},\n year = {2019}\n}\n", "homepage": "/~https://github.com/google-research-datasets/paws/tree/master/pawsx", "license": "The dataset may be freely used for any purpose, although acknowledgement of Google LLC (\"Google\") as the data source would be appreciated. The dataset is provided \"AS IS\" without any warranty, express or implied. Google disclaims all liability for any damages, direct or indirect, resulting from the use of the dataset.", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "sentence1": {"dtype": "string", "id": null, "_type": "Value"}, "sentence2": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["0", "1"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "pawsx", "config_name": "ja", "version": {"version_str": "1.1.0", "description": "", "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 15041440, "num_examples": 49401, "dataset_name": "pawsx"}, "test": {"name": "test", "num_bytes": 668636, "num_examples": 2000, "dataset_name": "pawsx"}, "validation": {"name": "validation", "num_bytes": 661770, "num_examples": 2000, "dataset_name": "pawsx"}}, "download_checksums": {"https://storage.googleapis.com/paws/pawsx/x-final.tar.gz": {"num_bytes": 30282057, "checksum": "4146db499101d66e68ae4c8ed3cf9dadecd625f44b7d8cf3d4a0fe93afc4fd9f"}}, "download_size": 30282057, "post_processing_size": null, "dataset_size": 16371846, "size_in_bytes": 46653903}, "ko": {"description": "PAWS-X, a multilingual version of PAWS (Paraphrase Adversaries from Word Scrambling) for six languages.\n\nThis dataset contains 23,659 human translated PAWS evaluation pairs and 296,406 machine\ntranslated training pairs in six typologically distinct languages: French, Spanish, German,\nChinese, Japanese, and Korean. English language is available by default. All translated\npairs are sourced from examples in PAWS-Wiki.\n\nFor further details, see the accompanying paper: PAWS-X: A Cross-lingual Adversarial Dataset\nfor Paraphrase Identification (https://arxiv.org/abs/1908.11828)\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.\n", "citation": "@InProceedings{pawsx2019emnlp,\n title = {{PAWS-X: A Cross-lingual Adversarial Dataset for Paraphrase Identification}},\n author = {Yang, Yinfei and Zhang, Yuan and Tar, Chris and Baldridge, Jason},\n booktitle = {Proc. of EMNLP},\n year = {2019}\n}\n", "homepage": "/~https://github.com/google-research-datasets/paws/tree/master/pawsx", "license": "The dataset may be freely used for any purpose, although acknowledgement of Google LLC (\"Google\") as the data source would be appreciated. The dataset is provided \"AS IS\" without any warranty, express or implied. Google disclaims all liability for any damages, direct or indirect, resulting from the use of the dataset.", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "sentence1": {"dtype": "string", "id": null, "_type": "Value"}, "sentence2": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["0", "1"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "pawsx", "config_name": "ko", "version": {"version_str": "1.1.0", "description": "", "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 13929684, "num_examples": 49192, "dataset_name": "pawsx"}, "test": {"name": "test", "num_bytes": 562204, "num_examples": 2000, "dataset_name": "pawsx"}, "validation": {"name": "validation", "num_bytes": 554783, "num_examples": 2000, "dataset_name": "pawsx"}}, "download_checksums": {"https://storage.googleapis.com/paws/pawsx/x-final.tar.gz": {"num_bytes": 30282057, "checksum": "4146db499101d66e68ae4c8ed3cf9dadecd625f44b7d8cf3d4a0fe93afc4fd9f"}}, "download_size": 30282057, "post_processing_size": null, "dataset_size": 15046671, "size_in_bytes": 45328728}, "zh": {"description": "PAWS-X, a multilingual version of PAWS (Paraphrase Adversaries from Word Scrambling) for six languages.\n\nThis dataset contains 23,659 human translated PAWS evaluation pairs and 296,406 machine\ntranslated training pairs in six typologically distinct languages: French, Spanish, German,\nChinese, Japanese, and Korean. English language is available by default. All translated\npairs are sourced from examples in PAWS-Wiki.\n\nFor further details, see the accompanying paper: PAWS-X: A Cross-lingual Adversarial Dataset\nfor Paraphrase Identification (https://arxiv.org/abs/1908.11828)\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.\n", "citation": "@InProceedings{pawsx2019emnlp,\n title = {{PAWS-X: A Cross-lingual Adversarial Dataset for Paraphrase Identification}},\n author = {Yang, Yinfei and Zhang, Yuan and Tar, Chris and Baldridge, Jason},\n booktitle = {Proc. of EMNLP},\n year = {2019}\n}\n", "homepage": "/~https://github.com/google-research-datasets/paws/tree/master/pawsx", "license": "The dataset may be freely used for any purpose, although acknowledgement of Google LLC (\"Google\") as the data source would be appreciated. The dataset is provided \"AS IS\" without any warranty, express or implied. Google disclaims all liability for any damages, direct or indirect, resulting from the use of the dataset.", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "sentence1": {"dtype": "string", "id": null, "_type": "Value"}, "sentence2": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["0", "1"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "pawsx", "config_name": "zh", "version": {"version_str": "1.1.0", "description": "", "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 10815499, "num_examples": 49401, "dataset_name": "pawsx"}, "test": {"name": "test", "num_bytes": 474634, "num_examples": 2000, "dataset_name": "pawsx"}, "validation": {"name": "validation", "num_bytes": 473116, "num_examples": 2000, "dataset_name": "pawsx"}}, "download_checksums": {"https://storage.googleapis.com/paws/pawsx/x-final.tar.gz": {"num_bytes": 30282057, "checksum": "4146db499101d66e68ae4c8ed3cf9dadecd625f44b7d8cf3d4a0fe93afc4fd9f"}}, "download_size": 30282057, "post_processing_size": null, "dataset_size": 11763249, "size_in_bytes": 42045306}} \ No newline at end of file +{"en": {"description": "PAWS-X, a multilingual version of PAWS (Paraphrase Adversaries from Word Scrambling) for six languages.\n\nThis dataset contains 23,659 human translated PAWS evaluation pairs and 296,406 machine\ntranslated training pairs in six typologically distinct languages: French, Spanish, German,\nChinese, Japanese, and Korean. English language is available by default. All translated\npairs are sourced from examples in PAWS-Wiki.\n\nFor further details, see the accompanying paper: PAWS-X: A Cross-lingual Adversarial Dataset\nfor Paraphrase Identification (https://arxiv.org/abs/1908.11828)\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.\n", "citation": "@InProceedings{pawsx2019emnlp,\n title = {{PAWS-X: A Cross-lingual Adversarial Dataset for Paraphrase Identification}},\n author = {Yang, Yinfei and Zhang, Yuan and Tar, Chris and Baldridge, Jason},\n booktitle = {Proc. of EMNLP},\n year = {2019}\n}\n", "homepage": "/~https://github.com/google-research-datasets/paws/tree/master/pawsx", "license": "The dataset may be freely used for any purpose, although acknowledgement of Google LLC (\"Google\") as the data source would be appreciated. The dataset is provided \"AS IS\" without any warranty, express or implied. Google disclaims all liability for any damages, direct or indirect, resulting from the use of the dataset.", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "sentence1": {"dtype": "string", "id": null, "_type": "Value"}, "sentence2": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["0", "1"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "pawsx", "config_name": "en", "version": {"version_str": "1.1.0", "description": "", "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 12215953, "num_examples": 49401, "dataset_name": "pawsx"}, "test": {"name": "test", "num_bytes": 494734, "num_examples": 2000, "dataset_name": "pawsx"}, "validation": {"name": "validation", "num_bytes": 492287, "num_examples": 2000, "dataset_name": "pawsx"}}, "download_checksums": {"https://storage.googleapis.com/paws/pawsx/x-final.tar.gz": {"num_bytes": 30282057, "checksum": "4146db499101d66e68ae4c8ed3cf9dadecd625f44b7d8cf3d4a0fe93afc4fd9f"}}, "download_size": 30282057, "post_processing_size": null, "dataset_size": 13202974, "size_in_bytes": 43485031}, "de": {"description": "PAWS-X, a multilingual version of PAWS (Paraphrase Adversaries from Word Scrambling) for six languages.\n\nThis dataset contains 23,659 human translated PAWS evaluation pairs and 296,406 machine\ntranslated training pairs in six typologically distinct languages: French, Spanish, German,\nChinese, Japanese, and Korean. English language is available by default. All translated\npairs are sourced from examples in PAWS-Wiki.\n\nFor further details, see the accompanying paper: PAWS-X: A Cross-lingual Adversarial Dataset\nfor Paraphrase Identification (https://arxiv.org/abs/1908.11828)\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.\n", "citation": "@InProceedings{pawsx2019emnlp,\n title = {{PAWS-X: A Cross-lingual Adversarial Dataset for Paraphrase Identification}},\n author = {Yang, Yinfei and Zhang, Yuan and Tar, Chris and Baldridge, Jason},\n booktitle = {Proc. of EMNLP},\n year = {2019}\n}\n", "homepage": "/~https://github.com/google-research-datasets/paws/tree/master/pawsx", "license": "The dataset may be freely used for any purpose, although acknowledgement of Google LLC (\"Google\") as the data source would be appreciated. The dataset is provided \"AS IS\" without any warranty, express or implied. Google disclaims all liability for any damages, direct or indirect, resulting from the use of the dataset.", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "sentence1": {"dtype": "string", "id": null, "_type": "Value"}, "sentence2": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["0", "1"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "pawsx", "config_name": "de", "version": {"version_str": "1.1.0", "description": "", "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 12801824, "num_examples": 49401, "dataset_name": "pawsx"}, "test": {"name": "test", "num_bytes": 524214, "num_examples": 2000, "dataset_name": "pawsx"}, "validation": {"name": "validation", "num_bytes": 514009, "num_examples": 2000, "dataset_name": "pawsx"}}, "download_checksums": {"https://storage.googleapis.com/paws/pawsx/x-final.tar.gz": {"num_bytes": 30282057, "checksum": "4146db499101d66e68ae4c8ed3cf9dadecd625f44b7d8cf3d4a0fe93afc4fd9f"}}, "download_size": 30282057, "post_processing_size": null, "dataset_size": 13840047, "size_in_bytes": 44122104}, "es": {"description": "PAWS-X, a multilingual version of PAWS (Paraphrase Adversaries from Word Scrambling) for six languages.\n\nThis dataset contains 23,659 human translated PAWS evaluation pairs and 296,406 machine\ntranslated training pairs in six typologically distinct languages: French, Spanish, German,\nChinese, Japanese, and Korean. English language is available by default. All translated\npairs are sourced from examples in PAWS-Wiki.\n\nFor further details, see the accompanying paper: PAWS-X: A Cross-lingual Adversarial Dataset\nfor Paraphrase Identification (https://arxiv.org/abs/1908.11828)\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.\n", "citation": "@InProceedings{pawsx2019emnlp,\n title = {{PAWS-X: A Cross-lingual Adversarial Dataset for Paraphrase Identification}},\n author = {Yang, Yinfei and Zhang, Yuan and Tar, Chris and Baldridge, Jason},\n booktitle = {Proc. of EMNLP},\n year = {2019}\n}\n", "homepage": "/~https://github.com/google-research-datasets/paws/tree/master/pawsx", "license": "The dataset may be freely used for any purpose, although acknowledgement of Google LLC (\"Google\") as the data source would be appreciated. The dataset is provided \"AS IS\" without any warranty, express or implied. Google disclaims all liability for any damages, direct or indirect, resulting from the use of the dataset.", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "sentence1": {"dtype": "string", "id": null, "_type": "Value"}, "sentence2": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["0", "1"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "pawsx", "config_name": "es", "version": {"version_str": "1.1.0", "description": "", "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 12808486, "num_examples": 49401, "dataset_name": "pawsx"}, "test": {"name": "test", "num_bytes": 519111, "num_examples": 2000, "dataset_name": "pawsx"}, "validation": {"name": "validation", "num_bytes": 513888, "num_examples": 2000, "dataset_name": "pawsx"}}, "download_checksums": {"https://storage.googleapis.com/paws/pawsx/x-final.tar.gz": {"num_bytes": 30282057, "checksum": "4146db499101d66e68ae4c8ed3cf9dadecd625f44b7d8cf3d4a0fe93afc4fd9f"}}, "download_size": 30282057, "post_processing_size": null, "dataset_size": 13841485, "size_in_bytes": 44123542}, "fr": {"description": "PAWS-X, a multilingual version of PAWS (Paraphrase Adversaries from Word Scrambling) for six languages.\n\nThis dataset contains 23,659 human translated PAWS evaluation pairs and 296,406 machine\ntranslated training pairs in six typologically distinct languages: French, Spanish, German,\nChinese, Japanese, and Korean. English language is available by default. All translated\npairs are sourced from examples in PAWS-Wiki.\n\nFor further details, see the accompanying paper: PAWS-X: A Cross-lingual Adversarial Dataset\nfor Paraphrase Identification (https://arxiv.org/abs/1908.11828)\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.\n", "citation": "@InProceedings{pawsx2019emnlp,\n title = {{PAWS-X: A Cross-lingual Adversarial Dataset for Paraphrase Identification}},\n author = {Yang, Yinfei and Zhang, Yuan and Tar, Chris and Baldridge, Jason},\n booktitle = {Proc. of EMNLP},\n year = {2019}\n}\n", "homepage": "/~https://github.com/google-research-datasets/paws/tree/master/pawsx", "license": "The dataset may be freely used for any purpose, although acknowledgement of Google LLC (\"Google\") as the data source would be appreciated. The dataset is provided \"AS IS\" without any warranty, express or implied. Google disclaims all liability for any damages, direct or indirect, resulting from the use of the dataset.", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "sentence1": {"dtype": "string", "id": null, "_type": "Value"}, "sentence2": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["0", "1"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "pawsx", "config_name": "fr", "version": {"version_str": "1.1.0", "description": "", "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 13295597, "num_examples": 49401, "dataset_name": "pawsx"}, "test": {"name": "test", "num_bytes": 535101, "num_examples": 2000, "dataset_name": "pawsx"}, "validation": {"name": "validation", "num_bytes": 533031, "num_examples": 2000, "dataset_name": "pawsx"}}, "download_checksums": {"https://storage.googleapis.com/paws/pawsx/x-final.tar.gz": {"num_bytes": 30282057, "checksum": "4146db499101d66e68ae4c8ed3cf9dadecd625f44b7d8cf3d4a0fe93afc4fd9f"}}, "download_size": 30282057, "post_processing_size": null, "dataset_size": 14363729, "size_in_bytes": 44645786}, "ja": {"description": "PAWS-X, a multilingual version of PAWS (Paraphrase Adversaries from Word Scrambling) for six languages.\n\nThis dataset contains 23,659 human translated PAWS evaluation pairs and 296,406 machine\ntranslated training pairs in six typologically distinct languages: French, Spanish, German,\nChinese, Japanese, and Korean. English language is available by default. All translated\npairs are sourced from examples in PAWS-Wiki.\n\nFor further details, see the accompanying paper: PAWS-X: A Cross-lingual Adversarial Dataset\nfor Paraphrase Identification (https://arxiv.org/abs/1908.11828)\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.\n", "citation": "@InProceedings{pawsx2019emnlp,\n title = {{PAWS-X: A Cross-lingual Adversarial Dataset for Paraphrase Identification}},\n author = {Yang, Yinfei and Zhang, Yuan and Tar, Chris and Baldridge, Jason},\n booktitle = {Proc. of EMNLP},\n year = {2019}\n}\n", "homepage": "/~https://github.com/google-research-datasets/paws/tree/master/pawsx", "license": "The dataset may be freely used for any purpose, although acknowledgement of Google LLC (\"Google\") as the data source would be appreciated. The dataset is provided \"AS IS\" without any warranty, express or implied. Google disclaims all liability for any damages, direct or indirect, resulting from the use of the dataset.", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "sentence1": {"dtype": "string", "id": null, "_type": "Value"}, "sentence2": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["0", "1"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "pawsx", "config_name": "ja", "version": {"version_str": "1.1.0", "description": "", "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 15041632, "num_examples": 49401, "dataset_name": "pawsx"}, "test": {"name": "test", "num_bytes": 668636, "num_examples": 2000, "dataset_name": "pawsx"}, "validation": {"name": "validation", "num_bytes": 661778, "num_examples": 2000, "dataset_name": "pawsx"}}, "download_checksums": {"https://storage.googleapis.com/paws/pawsx/x-final.tar.gz": {"num_bytes": 30282057, "checksum": "4146db499101d66e68ae4c8ed3cf9dadecd625f44b7d8cf3d4a0fe93afc4fd9f"}}, "download_size": 30282057, "post_processing_size": null, "dataset_size": 16372046, "size_in_bytes": 46654103}, "ko": {"description": "PAWS-X, a multilingual version of PAWS (Paraphrase Adversaries from Word Scrambling) for six languages.\n\nThis dataset contains 23,659 human translated PAWS evaluation pairs and 296,406 machine\ntranslated training pairs in six typologically distinct languages: French, Spanish, German,\nChinese, Japanese, and Korean. English language is available by default. All translated\npairs are sourced from examples in PAWS-Wiki.\n\nFor further details, see the accompanying paper: PAWS-X: A Cross-lingual Adversarial Dataset\nfor Paraphrase Identification (https://arxiv.org/abs/1908.11828)\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.\n", "citation": "@InProceedings{pawsx2019emnlp,\n title = {{PAWS-X: A Cross-lingual Adversarial Dataset for Paraphrase Identification}},\n author = {Yang, Yinfei and Zhang, Yuan and Tar, Chris and Baldridge, Jason},\n booktitle = {Proc. of EMNLP},\n year = {2019}\n}\n", "homepage": "/~https://github.com/google-research-datasets/paws/tree/master/pawsx", "license": "The dataset may be freely used for any purpose, although acknowledgement of Google LLC (\"Google\") as the data source would be appreciated. The dataset is provided \"AS IS\" without any warranty, express or implied. Google disclaims all liability for any damages, direct or indirect, resulting from the use of the dataset.", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "sentence1": {"dtype": "string", "id": null, "_type": "Value"}, "sentence2": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["0", "1"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "pawsx", "config_name": "ko", "version": {"version_str": "1.1.0", "description": "", "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 13934221, "num_examples": 49401, "dataset_name": "pawsx"}, "test": {"name": "test", "num_bytes": 562300, "num_examples": 2000, "dataset_name": "pawsx"}, "validation": {"name": "validation", "num_bytes": 554875, "num_examples": 2000, "dataset_name": "pawsx"}}, "download_checksums": {"https://storage.googleapis.com/paws/pawsx/x-final.tar.gz": {"num_bytes": 30282057, "checksum": "4146db499101d66e68ae4c8ed3cf9dadecd625f44b7d8cf3d4a0fe93afc4fd9f"}}, "download_size": 30282057, "post_processing_size": null, "dataset_size": 15051396, "size_in_bytes": 45333453}, "zh": {"description": "PAWS-X, a multilingual version of PAWS (Paraphrase Adversaries from Word Scrambling) for six languages.\n\nThis dataset contains 23,659 human translated PAWS evaluation pairs and 296,406 machine\ntranslated training pairs in six typologically distinct languages: French, Spanish, German,\nChinese, Japanese, and Korean. English language is available by default. All translated\npairs are sourced from examples in PAWS-Wiki.\n\nFor further details, see the accompanying paper: PAWS-X: A Cross-lingual Adversarial Dataset\nfor Paraphrase Identification (https://arxiv.org/abs/1908.11828)\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.\n", "citation": "@InProceedings{pawsx2019emnlp,\n title = {{PAWS-X: A Cross-lingual Adversarial Dataset for Paraphrase Identification}},\n author = {Yang, Yinfei and Zhang, Yuan and Tar, Chris and Baldridge, Jason},\n booktitle = {Proc. of EMNLP},\n year = {2019}\n}\n", "homepage": "/~https://github.com/google-research-datasets/paws/tree/master/pawsx", "license": "The dataset may be freely used for any purpose, although acknowledgement of Google LLC (\"Google\") as the data source would be appreciated. The dataset is provided \"AS IS\" without any warranty, express or implied. Google disclaims all liability for any damages, direct or indirect, resulting from the use of the dataset.", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "sentence1": {"dtype": "string", "id": null, "_type": "Value"}, "sentence2": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["0", "1"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "pawsx", "config_name": "zh", "version": {"version_str": "1.1.0", "description": "", "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 10815499, "num_examples": 49401, "dataset_name": "pawsx"}, "test": {"name": "test", "num_bytes": 474644, "num_examples": 2000, "dataset_name": "pawsx"}, "validation": {"name": "validation", "num_bytes": 473118, "num_examples": 2000, "dataset_name": "pawsx"}}, "download_checksums": {"https://storage.googleapis.com/paws/pawsx/x-final.tar.gz": {"num_bytes": 30282057, "checksum": "4146db499101d66e68ae4c8ed3cf9dadecd625f44b7d8cf3d4a0fe93afc4fd9f"}}, "download_size": 30282057, "post_processing_size": null, "dataset_size": 11763261, "size_in_bytes": 42045318}} \ No newline at end of file diff --git a/datasets/paws-x/paws-x.py b/datasets/paws-x/paws-x.py index 667a616bd97..25172671ec8 100644 --- a/datasets/paws-x/paws-x.py +++ b/datasets/paws-x/paws-x.py @@ -160,10 +160,8 @@ def _generate_examples(self, filepath, split): """ Yields examples. """ with open(filepath, encoding="utf-8") as f: - data = csv.DictReader(f, delimiter="\t") + data = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE) for id_, row in enumerate(data): - if row["label"] not in ["0", "1"]: - row["label"] = -1 yield id_, { "id": row["id"], "sentence1": row["sentence1"],