From 7811bd8f60a8a90e39db6fd322348f22c8181236 Mon Sep 17 00:00:00 2001 From: mozharovsky Date: Thu, 21 Jan 2021 18:10:21 +0300 Subject: [PATCH 1/4] refactor(data): specify script version via an argument --- formerbox/data/binarizer.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/formerbox/data/binarizer.py b/formerbox/data/binarizer.py index aa43d2ac..bf42def1 100644 --- a/formerbox/data/binarizer.py +++ b/formerbox/data/binarizer.py @@ -3,7 +3,7 @@ from abc import ABCMeta, abstractmethod from dataclasses import dataclass, field from io import TextIOWrapper -from typing import Any, Dict, List, Text, Tuple, Type, Union +from typing import Any, Dict, List, Optional, Text, Tuple, Type, Union import torch from datasets import Dataset, DatasetDict, load_dataset @@ -134,13 +134,17 @@ def encode(self, instance: Dict[Text, Any]) -> Dict[Text, Any]: raise NotImplementedError() def process_dataset( - self, filename: Text, script_path: Text, remove_columns: List[Text] + self, + filename: Text, + script_path: Text, + script_version: Optional[Text], + remove_columns: List[Text], ) -> Union[Dataset, DatasetDict]: dataset = load_dataset( path=script_path, data_files=[filename], split="train", - script_version="master", + script_version=script_version, ) dataset = dataset.map( From 6336536bc47f145cc84e9699def978e64285d861 Mon Sep 17 00:00:00 2001 From: mozharovsky Date: Thu, 21 Jan 2021 18:11:02 +0300 Subject: [PATCH 2/4] feat(data): add offline mode support by passing none to script version --- formerbox/data/binarizer_default.py | 1 + formerbox/data/binarizer_translation.py | 10 ++++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/formerbox/data/binarizer_default.py b/formerbox/data/binarizer_default.py index 06e42409..39c76d3b 100644 --- a/formerbox/data/binarizer_default.py +++ b/formerbox/data/binarizer_default.py @@ -97,6 +97,7 @@ def binarize( dataset = self.process_dataset( filename, script_path="text", + script_version=None, remove_columns=["text"], ) diff --git a/formerbox/data/binarizer_translation.py b/formerbox/data/binarizer_translation.py index 817701c9..0bce1111 100644 --- a/formerbox/data/binarizer_translation.py +++ b/formerbox/data/binarizer_translation.py @@ -84,14 +84,20 @@ def binarize( # process source dataset src_filename = f"{filename}.{self.params.src_lang}" src_dataset = self.process_dataset( - src_filename, script_path="text", remove_columns=["text"] + src_filename, + script_path="text", + script_version=None, + remove_columns=["text"], ) if self.params.tgt_lang is not None: # process target dataset if present tgt_filename = f"{filename}.{self.params.tgt_lang}" tgt_dataset = self.process_dataset( - tgt_filename, script_path="text", remove_columns=["text"] + tgt_filename, + script_path="text", + script_version=None, + remove_columns=["text"], ) logger.info("Processing source and target files") From 8cf13fe455360dfbee3e3402f982b0063fff3c95 Mon Sep 17 00:00:00 2001 From: mozharovsky Date: Thu, 21 Jan 2021 19:44:25 +0300 Subject: [PATCH 3/4] refactor(data): log error when packaged scripts are set incorrectly --- formerbox/data/binarizer.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/formerbox/data/binarizer.py b/formerbox/data/binarizer.py index bf42def1..756209df 100644 --- a/formerbox/data/binarizer.py +++ b/formerbox/data/binarizer.py @@ -140,6 +140,15 @@ def process_dataset( script_version: Optional[Text], remove_columns: List[Text], ) -> Union[Dataset, DatasetDict]: + # check if packaged scripts are set correctly + if script_path in ["text", "json", "csv"]: + if script_version is not None: + logger.error( + "Script %s is packaged into datasets library." + " Make sure you do not set `script_version` argument.", + script_path, + ) + dataset = load_dataset( path=script_path, data_files=[filename], From 90735b7c895e1c04a3579ec3e1be02792ed79da6 Mon Sep 17 00:00:00 2001 From: mozharovsky Date: Thu, 21 Jan 2021 19:49:52 +0300 Subject: [PATCH 4/4] style(data): use sorted list values --- formerbox/data/binarizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/formerbox/data/binarizer.py b/formerbox/data/binarizer.py index 756209df..f82edc57 100644 --- a/formerbox/data/binarizer.py +++ b/formerbox/data/binarizer.py @@ -141,7 +141,7 @@ def process_dataset( remove_columns: List[Text], ) -> Union[Dataset, DatasetDict]: # check if packaged scripts are set correctly - if script_path in ["text", "json", "csv"]: + if script_path in ["csv", "json", "text"]: if script_version is not None: logger.error( "Script %s is packaged into datasets library."