diff --git a/datasets/code_x_glue_tt_text_to_text/code_x_glue_tt_text_to_text.py b/datasets/code_x_glue_tt_text_to_text/code_x_glue_tt_text_to_text.py index 95bac2ddd98..c1b9eb7ec86 100644 --- a/datasets/code_x_glue_tt_text_to_text/code_x_glue_tt_text_to_text.py +++ b/datasets/code_x_glue_tt_text_to_text/code_x_glue_tt_text_to_text.py @@ -60,7 +60,6 @@ def generate_urls(self, split_name): yield self.KEYS[i], f"{split_name}/{lang_pair}.{split_name}.{lang}" def _generate_examples(self, split_name, file_paths): - print(file_paths) # Open each file (one for source language and the other for target language) files = {k: open(file_paths[k], encoding="utf-8") for k in file_paths} diff --git a/datasets/compguesswhat/create_dummy_data.py b/datasets/compguesswhat/create_dummy_data.py index e60669e12f4..eb6e9bd05c9 100644 --- a/datasets/compguesswhat/create_dummy_data.py +++ b/datasets/compguesswhat/create_dummy_data.py @@ -1,5 +1,6 @@ import gzip import json +import logging import os from argparse import ArgumentParser @@ -50,10 +51,10 @@ def create_dummy_data_for_split(data_path, dataset_name, dataset_version, data_f os.makedirs(dummy_data_path) for split_name, split_file in data_files.items(): - print(f"Generating dummy data for split {split_name} (num. examples = {args.examples})") + logging.info(f"Generating dummy data for split {split_name} (num. examples = {args.examples})") split_filepath = os.path.join(data_path, full_dataset_name, dataset_version, split_file) - print(f"Reading split file {split_filepath}") + logging.info(f"Reading split file {split_filepath}") with gzip.open(split_filepath) as in_file: dummy_filepath = os.path.join(dummy_data_path, split_file) with gzip.open(dummy_filepath, mode="w") as out_file: @@ -81,12 +82,12 @@ def main(args): dataset_version = dataset_info["compguesswhat-original"]["version"]["version_str"] - print(f"Creating dummy data for CompGuessWhat?! {dataset_version}") + logging.info(f"Creating dummy data for CompGuessWhat?! 
{dataset_version}") - print("Original dataset...") + logging.info("Original dataset...") create_dummy_data_for_split(args.data_path, "original", dataset_version, original_data_files) - print("Zero-shot dataset...") + logging.info("Zero-shot dataset...") create_dummy_data_for_split(args.data_path, "zero_shot", dataset_version, zs_data_files) diff --git a/datasets/oscar/generate_dummy.py b/datasets/oscar/generate_dummy.py index 20c60206d84..9a9afc1f940 100644 --- a/datasets/oscar/generate_dummy.py +++ b/datasets/oscar/generate_dummy.py @@ -3,7 +3,7 @@ import fsspec as fs import requests -from oscar import _BASE_CHECKSUM_FILE_NAME, Oscar +from oscar import _BASE_CHECKSUM_FILE_NAME, Oscar, logger N_EXAMPLES = 2 @@ -11,7 +11,7 @@ if __name__ == "__main__": for i, config in enumerate(Oscar.BUILDER_CONFIGS): - print(f"Loading config '{config.name}' ({i + 1}/{len(Oscar.BUILDER_CONFIGS)})") + logger.info(f"Loading config '{config.name}' ({i + 1}/{len(Oscar.BUILDER_CONFIGS)})") # Get data url checksum_filename = _BASE_CHECKSUM_FILE_NAME.format(language=config.language) @@ -42,6 +42,6 @@ root_dir = str(dummy_data_dir.parent) base_name = str(dummy_data_dir) base_dir = "dummy_data" - print(f"Compressing dummy data folder to '{base_name}.zip'") + logger.info(f"Compressing dummy data folder to '{base_name}.zip'") shutil.make_archive(base_name, "zip", root_dir, base_dir) shutil.rmtree(base_name) diff --git a/datasets/wiki_lingua/create_dummy.py b/datasets/wiki_lingua/create_dummy.py index 3e8dfef11c9..d48fc23ef65 100644 --- a/datasets/wiki_lingua/create_dummy.py +++ b/datasets/wiki_lingua/create_dummy.py @@ -1,4 +1,5 @@ import itertools +import logging import os import pickle import shutil @@ -46,7 +47,7 @@ def create(): base_path = "/Users/katnoria/dev/projects/workspaces/python/datasets" for key in _URLs.keys(): # data = load_dataset('./datasets/wiki_lingua', key) - print(f"Finding {key}.pkl") + logging.info(f"Finding {key}.pkl") filepath = [name for name in files if name.endswith(f"{key}.pkl")][0] with open(filepath, "rb") as f: data = pickle.load(f) @@ -55,13 +56,13 @@ def create(): fname = sanitize_url(_URLs[key]) dirname = pjoin(base_path, f"datasets/wiki_lingua/dummy/{key}/1.1.0/dummy_data") if not os.path.exists(dirname): - print(f"created folder {dirname}") + logging.info(f"created folder {dirname}") os.makedirs(dirname) fname = pjoin(dirname, fname) - print(f"creating for {key}:{fname}") + logging.info(f"creating for {key}:{fname}") with open(fname, "wb") as f: pickle.dump(data_subset, f) - print("SUCCESS") + logging.info("SUCCESS") def zip(): @@ -70,10 +71,10 @@ def zip(): for key in _URLs.keys(): # dirname = pjoin(base_path, f"datasets/wiki_lingua/dummy/{key}/1.1.0/dummy_data") dirname = pjoin(base_path, f"datasets/wiki_lingua/dummy/{key}/1.1.0") - print(f"Zipping {dirname}") + logging.info(f"Zipping {dirname}") shutil.make_archive(f"{dirname}/dummy_data", "zip", dirname, "dummy_data") shutil.rmtree(f"{dirname}/dummy_data") - print(f"Deleted folder {dirname}/dummy_data") + logging.info(f"Deleted folder {dirname}/dummy_data") # Utility script to create the dummy data and zip the contents diff --git a/datasets/wmt14/wmt_utils.py b/datasets/wmt14/wmt_utils.py index b0a059dc208..b700277367a 100644 --- a/datasets/wmt14/wmt_utils.py +++ b/datasets/wmt14/wmt_utils.py @@ -793,7 +793,6 @@ def _get_filenames(dataset): # +++++++++++++++++++++ logger.info("Generating examples from: %s", ss_name) - print("Generating examples from: %s", ss_name) dataset = DATASET_MAP[ss_name] extract_dirs = 
extraction_map[ss_name] files = _get_local_paths(dataset, extract_dirs) diff --git a/datasets/wmt15/wmt_utils.py b/datasets/wmt15/wmt_utils.py index 3be70123fc4..a83e097b651 100644 --- a/datasets/wmt15/wmt_utils.py +++ b/datasets/wmt15/wmt_utils.py @@ -793,7 +793,6 @@ def _get_filenames(dataset): # +++++++++++++++++++++ logger.info("Generating examples from: %s", ss_name) - print("Generating examples from: %s", ss_name) dataset = DATASET_MAP[ss_name] extract_dirs = extraction_map[ss_name] files = _get_local_paths(dataset, extract_dirs) diff --git a/datasets/wmt16/wmt_utils.py b/datasets/wmt16/wmt_utils.py index 6da0fe995d0..721b348dc66 100644 --- a/datasets/wmt16/wmt_utils.py +++ b/datasets/wmt16/wmt_utils.py @@ -793,7 +793,6 @@ def _get_filenames(dataset): # +++++++++++++++++++++ logger.info("Generating examples from: %s", ss_name) - print("Generating examples from: %s", ss_name) dataset = DATASET_MAP[ss_name] extract_dirs = extraction_map[ss_name] files = _get_local_paths(dataset, extract_dirs) diff --git a/datasets/wmt17/wmt_utils.py b/datasets/wmt17/wmt_utils.py index 7745a38c811..5386ab4ec04 100644 --- a/datasets/wmt17/wmt_utils.py +++ b/datasets/wmt17/wmt_utils.py @@ -793,7 +793,6 @@ def _get_filenames(dataset): # +++++++++++++++++++++ logger.info("Generating examples from: %s", ss_name) - print("Generating examples from: %s", ss_name) dataset = DATASET_MAP[ss_name] extract_dirs = extraction_map[ss_name] files = _get_local_paths(dataset, extract_dirs) diff --git a/datasets/wmt18/wmt_utils.py b/datasets/wmt18/wmt_utils.py index 7745a38c811..5386ab4ec04 100644 --- a/datasets/wmt18/wmt_utils.py +++ b/datasets/wmt18/wmt_utils.py @@ -793,7 +793,6 @@ def _get_filenames(dataset): # +++++++++++++++++++++ logger.info("Generating examples from: %s", ss_name) - print("Generating examples from: %s", ss_name) dataset = DATASET_MAP[ss_name] extract_dirs = extraction_map[ss_name] files = _get_local_paths(dataset, extract_dirs) diff --git a/datasets/wmt19/wmt_utils.py b/datasets/wmt19/wmt_utils.py index 7745a38c811..5386ab4ec04 100644 --- a/datasets/wmt19/wmt_utils.py +++ b/datasets/wmt19/wmt_utils.py @@ -793,7 +793,6 @@ def _get_filenames(dataset): # +++++++++++++++++++++ logger.info("Generating examples from: %s", ss_name) - print("Generating examples from: %s", ss_name) dataset = DATASET_MAP[ss_name] extract_dirs = extraction_map[ss_name] files = _get_local_paths(dataset, extract_dirs) diff --git a/datasets/wmt_t2t/wmt_utils.py b/datasets/wmt_t2t/wmt_utils.py index 3be70123fc4..a83e097b651 100644 --- a/datasets/wmt_t2t/wmt_utils.py +++ b/datasets/wmt_t2t/wmt_utils.py @@ -793,7 +793,6 @@ def _get_filenames(dataset): # +++++++++++++++++++++ logger.info("Generating examples from: %s", ss_name) - print("Generating examples from: %s", ss_name) dataset = DATASET_MAP[ss_name] extract_dirs = extraction_map[ss_name] files = _get_local_paths(dataset, extract_dirs) diff --git a/tests/test_dataset_scripts.py b/tests/test_dataset_scripts.py new file mode 100644 index 00000000000..30fe5b9967c --- /dev/null +++ b/tests/test_dataset_scripts.py @@ -0,0 +1,64 @@ +import re +from pathlib import Path +from unittest import TestCase + + +class TestDatasetScripts(TestCase): + def _no_encoding_on_file_open(self, filepath: str): + r"""Find all instances where a non-binary file is opened without UTF-8 encoding. + + This function uses regular expressions to find instances where Python's `open()` function is used to open + non-binary files. 
See below for an explanation of the regular expression:
+
+        (?!.*\b(?:encoding|rb|w|wb|w+|wb+|ab|ab+)\b): Lookahead and discard match if `encoding` or `rb` etc. are
+        arguments of `open()`.
+
+        (?<=\s): Lookbehind and match if `open()` is preceded by one whitespace.
+
+        (open)\((.*)\): Capture everything in parentheses of `open()`.
+        """
+
+        with open(filepath, "r", encoding="utf-8") as input_file:
+            regexp = re.compile(r"(?!.*\b(?:encoding|rb|w|wb|w+|wb+|ab|ab+)\b)(?<=\s)(open)\((.*)\)")
+            input_text = input_file.read()
+            match = regexp.search(input_text)
+
+        return match
+
+    def _no_print_statements(self, filepath: str):
+        r"""Find all instances where a Python script file contains a `print` statement.
+
+        #[^\r\n]*print\(: Match print statement inside a comment. We ignore this group.
+
+        \"[^\r\n]*print\(: Match print statement inside a string. We ignore this group.
+
+        \"\"\".*?print\(.*?\"\"\": Match print statement inside a triple-quoted string. Uses re.DOTALL to also match newlines with ".".
+        We ignore this group.
+
+        (print\(): Match print statement.
+        """
+
+        with open(filepath, "r", encoding="utf-8") as input_file:
+            regexp = re.compile(r"#[^\r\n]*print\(|\"[^\r\n]*print\(|\"\"\".*?print\(.*?\"\"\"|(print\()", re.DOTALL)
+            input_text = input_file.read()
+            # use `re.finditer` to handle the case where the ignored groups would be matched first by `re.search`
+            matches = regexp.finditer(input_text)
+
+        matches = [match for match in matches if match is not None and match.group(1) is not None]
+        return matches[0] if matches else None
+
+    def test_no_encoding_on_file_open(self):
+        dataset_paths = Path("./datasets")
+        dataset_files = list(dataset_paths.absolute().glob("**/*.py"))
+
+        for dataset in dataset_files:
+            if self._no_encoding_on_file_open(str(dataset)):
+                raise AssertionError(f"open(...) must use utf-8 encoding in {dataset}")
+
+    def test_no_print_statements(self):
+        dataset_paths = Path("./datasets")
+        dataset_files = list(dataset_paths.absolute().glob("**/*.py"))
+
+        for dataset in dataset_files:
+            if self._no_print_statements(str(dataset)):
+                raise AssertionError(f"print statement found in {dataset}. Use datasets.logger/logging instead.")
diff --git a/tests/test_file_encoding.py b/tests/test_file_encoding.py
deleted file mode 100644
index c57bfd39bf8..00000000000
--- a/tests/test_file_encoding.py
+++ /dev/null
@@ -1,34 +0,0 @@
-import re
-from pathlib import Path
-from unittest import TestCase
-
-
-class TestFileEncoding(TestCase):
-    def _no_encoding_on_file_open(self, filepath: str):
-        r"""Find all instances where a non-binary file is opened without UTF-8 encoding.
-
-        This function uses regular expressions to find instances where Python's `open()` function is used to open
-        non-binary files. See below for an explanation of the regular expression:
-
-        (?!.*\b(?:encoding|rb|w|wb|w+|wb+|ab|ab+)\b): Lookahead and discard match if `encoding` or `rb` etc are
-        arguments of `open()`.
-
-        (?<=\s): Lookbehind and match if `open()` predeceded by one whitespace.
-
-        (open)\((.*)\): Capture everything in parentheses of `open()`.
- """ - - with open(filepath, "r", encoding="utf-8") as input_file: - regexp = re.compile(r"(?!.*\b(?:encoding|rb|w|wb|w+|wb+|ab|ab+)\b)(?<=\s)(open)\((.*)\)") - input_text = input_file.read() - match = regexp.search(input_text) - - return match - - def test_no_encoding_on_file_open(self): - dataset_paths = Path("./datasets") - dataset_files = list(dataset_paths.absolute().glob("**/*.py")) - - for dataset in dataset_files: - if self._no_encoding_on_file_open(str(dataset)): - raise ValueError(f"open(...) must use utf-8 encoding in {dataset}")
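As a quick illustration of how the print-detection pattern in `_no_print_statements` separates genuine `print` calls from ones that only appear inside comments or strings, here is a minimal standalone sketch: the pattern is copied verbatim from tests/test_dataset_scripts.py above, while the sample snippets are invented for demonstration.

import re

# Same pattern as in tests/test_dataset_scripts.py: the first three alternatives match
# `print(` occurring inside a comment, a single-line string, or a triple-quoted string
# (ignored), while the final capture group matches a genuine print statement.
PRINT_RE = re.compile(r"#[^\r\n]*print\(|\"[^\r\n]*print\(|\"\"\".*?print\(.*?\"\"\"|(print\()", re.DOTALL)

samples = {
    'print("hello")': True,             # real print statement -> should be flagged
    '# print("debug only")': False,     # print inside a comment -> should be ignored
    'msg = "please print(me)"': False,  # print inside a string -> should be ignored
}

for snippet, should_flag in samples.items():
    hits = [m for m in PRINT_RE.finditer(snippet) if m.group(1) is not None]
    assert bool(hits) == should_flag, snippet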