From bc9afa52ee49f900b377b1b3ad5b291d402fcff4 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Wed, 12 Jan 2022 16:39:14 +0100 Subject: [PATCH 1/2] Add classification tasks --- datasets/muchocine/dataset_infos.json | 2 +- datasets/muchocine/muchocine.py | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/datasets/muchocine/dataset_infos.json b/datasets/muchocine/dataset_infos.json index bfb965be668..41ba375c766 100644 --- a/datasets/muchocine/dataset_infos.json +++ b/datasets/muchocine/dataset_infos.json @@ -1 +1 @@ -{"default": {"description": "The Muchocine reviews dataset contains 3,872 longform movie reviews in Spanish language,\neach with a shorter summary review, and a rating on a 1-5 scale.\n", "citation": "", "homepage": "http://www.lsi.us.es/~fermin/index.php/Datasets", "license": "CC-BY-2.1", "features": {"review_body": {"dtype": "string", "id": null, "_type": "Value"}, "review_summary": {"dtype": "string", "id": null, "_type": "Value"}, "star_rating": {"dtype": "int32", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "builder_name": "muchocine", "config_name": "default", "version": {"version_str": "1.1.1", "description": null, "major": 1, "minor": 1, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 11855607, "num_examples": 3872, "dataset_name": "muchocine"}}, "download_checksums": {"http://www.lsi.us.es/~fermin/corpusCine.zip": {"num_bytes": 55556703, "checksum": "2be1333c903613402effa85ca629a66541093702c92e079c438b6eb5d84260a5"}}, "download_size": 55556703, "post_processing_size": null, "dataset_size": 11855607, "size_in_bytes": 67412310}} \ No newline at end of file +{"default": {"description": "The Muchocine reviews dataset contains 3,872 longform movie reviews in Spanish language,\neach with a shorter summary review, and a rating on a 1-5 scale.\n", "citation": "", "homepage": "http://www.lsi.us.es/~fermin/index.php/Datasets", "license": "CC-BY-2.1", "features": {"review_body": 
{"dtype": "string", "id": null, "_type": "Value"}, "review_summary": {"dtype": "string", "id": null, "_type": "Value"}, "star_rating": {"num_classes": 5, "names": ["1", "2", "3", "4", "5"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "text-classification", "text_column": "review_body", "label_column": "star_rating"}, {"task": "text-classification", "text_column": "review_summary", "label_column": "star_rating"}], "builder_name": "muchocine", "config_name": "default", "version": {"version_str": "1.1.1", "description": null, "major": 1, "minor": 1, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 11871095, "num_examples": 3872, "dataset_name": "muchocine"}}, "download_checksums": {"http://www.lsi.us.es/~fermin/corpusCine.zip": {"num_bytes": 55556703, "checksum": "2be1333c903613402effa85ca629a66541093702c92e079c438b6eb5d84260a5"}}, "download_size": 55556703, "post_processing_size": null, "dataset_size": 11871095, "size_in_bytes": 67427798}} \ No newline at end of file diff --git a/datasets/muchocine/muchocine.py b/datasets/muchocine/muchocine.py index 35dc20cac9c..032ff2ab5b5 100644 --- a/datasets/muchocine/muchocine.py +++ b/datasets/muchocine/muchocine.py @@ -20,6 +20,7 @@ from xml.dom.minidom import parseString import datasets +from datasets.tasks import TextClassification # no BibTeX citation @@ -43,7 +44,7 @@ def _info(self): { "review_body": datasets.Value("string"), "review_summary": datasets.Value("string"), - "star_rating": datasets.Value("int32"), + "star_rating": datasets.ClassLabel(names=[str(i) for i in range(1, 6)]), } ) return datasets.DatasetInfo( @@ -53,6 +54,10 @@ def _info(self): homepage="http://www.lsi.us.es/~fermin/index.php/Datasets", license=_LICENSE, citation=_CITATION, + task_templates=[ + TextClassification(text_column="review_body", label_column="star_rating"), + TextClassification(text_column="review_summary", 
label_column="star_rating"), + ], ) def _split_generators(self, dl_manager): @@ -102,5 +107,5 @@ def _generate_examples(self, filepaths, split): yield id, { "review_body": btxt, "review_summary": rtxt, - "star_rating": int(doc.documentElement.attributes["rank"].value), + "star_rating": doc.documentElement.attributes["rank"].value, } From 6efd9d3bdd9f1a05fb84daf53061daf8c0b339c1 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Wed, 12 Jan 2022 17:01:41 +0100 Subject: [PATCH 2/2] Readme improvements --- datasets/muchocine/README.md | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/datasets/muchocine/README.md b/datasets/muchocine/README.md index e6f6903bab2..238821faf70 100644 --- a/datasets/muchocine/README.md +++ b/datasets/muchocine/README.md @@ -1,12 +1,12 @@ --- annotations_creators: -- no-annotation +- found language_creators: - found languages: - es licenses: -- cc-by-2.1 +- unknown multilinguality: - monolingual size_categories: @@ -18,6 +18,7 @@ task_categories: task_ids: - sentiment-classification paperswithcode_id: null +pretty_name: MuchoCine --- # Dataset Card for Muchocine @@ -57,29 +58,36 @@ each with a shorter summary review, and a rating on a 1-5 scale. ### Supported Tasks and Leaderboards -[More Information Needed] +- `text-classification`: This dataset can be used for Text Classification, more precisely Sentiment Classification where the task is to predict the `star_rating` for a `review_body` or a `review_summary`. ### Languages -Spanish +Spanish. ## Dataset Structure ### Data Instances -[More Information Needed] +An example from the train split: + +``` +{ + 'review_body': 'Zoom nos cuenta la historia de Jack Shepard, anteriormente conocido como el Capitán Zoom, Superhéroe que perdió sus poderes y que actualmente vive en el olvido. 
La llegada de una amenaza para la Tierra hará que la agencia del gobierno que se ocupa de estos temas acuda a él para que entrene a un grupo de jóvenes con poderes para combatir esta amenaza.Zoom es una comedia familiar, con todo lo que eso implica, es decir, guión flojo y previsible, bromas no salidas de tono, historia amorosa de por medio y un desenlace tópico. La gracia está en que los protagonistas son jóvenes con superpoderes, una producción cargada de efectos especiales y unos cuantos guiños frikis. La película además se pasa volando ya que dura poco mas de ochenta minutos y cabe destacar su prologo en forma de dibujos de comics explicando la historia de la cual partimos en la película.Tim Allen protagoniza la cinta al lado de un envejecido Chevy Chase, que hace de doctor encargado del proyecto, un papel bastante gracioso y ridículo, pero sin duda el mejor papel es el de Courteney Cox, en la piel de una científica amante de los comics y de lo más friki. Del grupito de los cuatro niños sin duda la mas graciosa es la niña pequeña con súper fuerza y la que provocara la mayor parte de los gags debido a su poder.Una comedia entretenida y poca cosa más para ver una tarde de domingo. ', + 'review_summary': 'Una comedia entretenida y poca cosa más para ver una tarde de domingo ', 'star_rating': 2 +} +``` ### Data Fields -- review_body - longform review -- review_summary - shorter-form review -- star_rating - an integer star rating (1-5) +- `review_body` - longform review +- `review_summary` - shorter-form review +- `star_rating` - an integer star rating (1-5) The original source also includes part-of-speech tagging for body and summary fields. ### Data Splits -One split (train) with 3,872 reviews +One split (train) with 3,872 reviews. ## Dataset Creation @@ -92,7 +100,7 @@ One split (train) with 3,872 reviews #### Initial Data Collection and Normalization Data was collected from www.muchocine.net and uploaded by Dr. Fermín L. 
Cruz Mata -of La Universidad de Sevilla +of La Universidad de Sevilla. #### Who are the source language producers? @@ -130,11 +138,11 @@ The text reviews and star ratings came directly from users, so no additional ann ### Dataset Curators -- Dr. Fermín L. Cruz Mata +Dr. Fermín L. Cruz Mata. ### Licensing Information -CC-BY-2.1 +[More Information Needed] ### Citation Information