diff --git a/datasets/muchocine/README.md b/datasets/muchocine/README.md index a039379f0e3..151ca61a69b 100644 --- a/datasets/muchocine/README.md +++ b/datasets/muchocine/README.md @@ -1,12 +1,12 @@ --- annotations_creators: -- no-annotation +- found language_creators: - found languages: - es licenses: -- cc-by-2.1 +- unknown multilinguality: - monolingual size_categories: @@ -58,29 +58,36 @@ each with a shorter summary review, and a rating on a 1-5 scale. ### Supported Tasks and Leaderboards -[More Information Needed] +- `text-classification`: This dataset can be used for Text Classification, more precisely Sentiment Classification where the task is to predict the `star_rating` for a `review_body` or a `review_summary`. ### Languages -Spanish +Spanish. ## Dataset Structure ### Data Instances -[More Information Needed] +An example from the train split: + +``` +{ + 'review_body': 'Zoom nos cuenta la historia de Jack Shepard, anteriormente conocido como el Capitán Zoom, Superhéroe que perdió sus poderes y que actualmente vive en el olvido. La llegada de una amenaza para la Tierra hará que la agencia del gobierno que se ocupa de estos temas acuda a él para que entrene a un grupo de jóvenes con poderes para combatir esta amenaza.Zoom es una comedia familiar, con todo lo que eso implica, es decir, guión flojo y previsible, bromas no salidas de tono, historia amorosa de por medio y un desenlace tópico. La gracia está en que los protagonistas son jóvenes con superpoderes, una producción cargada de efectos especiales y unos cuantos guiños frikis. 
La película además se pasa volando ya que dura poco mas de ochenta minutos y cabe destacar su prologo en forma de dibujos de comics explicando la historia de la cual partimos en la película.Tim Allen protagoniza la cinta al lado de un envejecido Chevy Chase, que hace de doctor encargado del proyecto, un papel bastante gracioso y ridículo, pero sin duda el mejor papel es el de Courteney Cox, en la piel de una científica amante de los comics y de lo más friki. Del grupito de los cuatro niños sin duda la mas graciosa es la niña pequeña con súper fuerza y la que provocara la mayor parte de los gags debido a su poder.Una comedia entretenida y poca cosa más para ver una tarde de domingo. ', + 'review_summary': 'Una comedia entretenida y poca cosa más para ver una tarde de domingo ', 'star_rating': 2 +} +``` ### Data Fields -- review_body - longform review -- review_summary - shorter-form review -- star_rating - an integer star rating (1-5) +- `review_body` - longform review +- `review_summary` - shorter-form review +- `star_rating` - an integer star rating (1-5) The original source also includes part-of-speech tagging for body and summary fields. ### Data Splits -One split (train) with 3,872 reviews +One split (train) with 3,872 reviews. ## Dataset Creation @@ -93,7 +100,7 @@ One split (train) with 3,872 reviews #### Initial Data Collection and Normalization Data was collected from www.muchocine.net and uploaded by Dr. Fermín L. Cruz Mata -of La Universidad de Sevilla +of La Universidad de Sevilla. #### Who are the source language producers? @@ -131,11 +138,11 @@ The text reviews and star ratings came directly from users, so no additional ann ### Dataset Curators -- Dr. Fermín L. Cruz Mata +Dr. Fermín L. Cruz Mata. 
### Licensing Information -CC-BY-2.1 +[More Information Needed] ### Citation Information diff --git a/datasets/muchocine/dataset_infos.json b/datasets/muchocine/dataset_infos.json index bfb965be668..41ba375c766 100644 --- a/datasets/muchocine/dataset_infos.json +++ b/datasets/muchocine/dataset_infos.json @@ -1 +1 @@ -{"default": {"description": "The Muchocine reviews dataset contains 3,872 longform movie reviews in Spanish language,\neach with a shorter summary review, and a rating on a 1-5 scale.\n", "citation": "", "homepage": "http://www.lsi.us.es/~fermin/index.php/Datasets", "license": "CC-BY-2.1", "features": {"review_body": {"dtype": "string", "id": null, "_type": "Value"}, "review_summary": {"dtype": "string", "id": null, "_type": "Value"}, "star_rating": {"dtype": "int32", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "builder_name": "muchocine", "config_name": "default", "version": {"version_str": "1.1.1", "description": null, "major": 1, "minor": 1, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 11855607, "num_examples": 3872, "dataset_name": "muchocine"}}, "download_checksums": {"http://www.lsi.us.es/~fermin/corpusCine.zip": {"num_bytes": 55556703, "checksum": "2be1333c903613402effa85ca629a66541093702c92e079c438b6eb5d84260a5"}}, "download_size": 55556703, "post_processing_size": null, "dataset_size": 11855607, "size_in_bytes": 67412310}} \ No newline at end of file +{"default": {"description": "The Muchocine reviews dataset contains 3,872 longform movie reviews in Spanish language,\neach with a shorter summary review, and a rating on a 1-5 scale.\n", "citation": "", "homepage": "http://www.lsi.us.es/~fermin/index.php/Datasets", "license": "CC-BY-2.1", "features": {"review_body": {"dtype": "string", "id": null, "_type": "Value"}, "review_summary": {"dtype": "string", "id": null, "_type": "Value"}, "star_rating": {"num_classes": 5, "names": ["1", "2", "3", "4", "5"], "names_file": null, "id": null, 
"_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "text-classification", "text_column": "review_body", "label_column": "star_rating"}, {"task": "text-classification", "text_column": "review_summary", "label_column": "star_rating"}], "builder_name": "muchocine", "config_name": "default", "version": {"version_str": "1.1.1", "description": null, "major": 1, "minor": 1, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 11871095, "num_examples": 3872, "dataset_name": "muchocine"}}, "download_checksums": {"http://www.lsi.us.es/~fermin/corpusCine.zip": {"num_bytes": 55556703, "checksum": "2be1333c903613402effa85ca629a66541093702c92e079c438b6eb5d84260a5"}}, "download_size": 55556703, "post_processing_size": null, "dataset_size": 11871095, "size_in_bytes": 67427798}} \ No newline at end of file diff --git a/datasets/muchocine/muchocine.py b/datasets/muchocine/muchocine.py index 35dc20cac9c..032ff2ab5b5 100644 --- a/datasets/muchocine/muchocine.py +++ b/datasets/muchocine/muchocine.py @@ -20,6 +20,7 @@ from xml.dom.minidom import parseString import datasets +from datasets.tasks import TextClassification # no BibTeX citation @@ -43,7 +44,7 @@ def _info(self): { "review_body": datasets.Value("string"), "review_summary": datasets.Value("string"), - "star_rating": datasets.Value("int32"), + "star_rating": datasets.ClassLabel(names=[str(i) for i in range(1, 6)]), } ) return datasets.DatasetInfo( @@ -53,6 +54,10 @@ def _info(self): homepage="http://www.lsi.us.es/~fermin/index.php/Datasets", license=_LICENSE, citation=_CITATION, + task_templates=[ + TextClassification(text_column="review_body", label_column="star_rating"), + TextClassification(text_column="review_summary", label_column="star_rating"), + ], ) def _split_generators(self, dl_manager): @@ -102,5 +107,5 @@ def _generate_examples(self, filepaths, split): yield id, { "review_body": btxt, "review_summary": rtxt, - "star_rating": 
int(doc.documentElement.attributes["rank"].value), + "star_rating": doc.documentElement.attributes["rank"].value, }