Remove print statements in datasets #3546

Merged · 3 commits · Jan 7, 2022

@@ -60,7 +60,6 @@ def generate_urls(self, split_name):
yield self.KEYS[i], f"{split_name}/{lang_pair}.{split_name}.{lang}"

def _generate_examples(self, split_name, file_paths):
print(file_paths)
# Open each file (one for source language and the other for target language)
files = {k: open(file_paths[k], encoding="utf-8") for k in file_paths}

11 changes: 6 additions & 5 deletions datasets/compguesswhat/create_dummy_data.py
@@ -1,5 +1,6 @@
import gzip
import json
import logging
import os
from argparse import ArgumentParser

@@ -50,10 +51,10 @@ def create_dummy_data_for_split(data_path, dataset_name, dataset_version, data_f
os.makedirs(dummy_data_path)

for split_name, split_file in data_files.items():
print(f"Generating dummy data for split {split_name} (num. examples = {args.examples})")
logging.info(f"Generating dummy data for split {split_name} (num. examples = {args.examples})")

split_filepath = os.path.join(data_path, full_dataset_name, dataset_version, split_file)
print(f"Reading split file {split_filepath}")
logging.info(f"Reading split file {split_filepath}")
with gzip.open(split_filepath) as in_file:
dummy_filepath = os.path.join(dummy_data_path, split_file)
with gzip.open(dummy_filepath, mode="w") as out_file:
@@ -81,12 +82,12 @@ def main(args):

dataset_version = dataset_info["compguesswhat-original"]["version"]["version_str"]

print(f"Creating dummy data for CompGuessWhat?! {dataset_version}")
logging.info(f"Creating dummy data for CompGuessWhat?! {dataset_version}")

print("Original dataset...")
logging.info("Original dataset...")
create_dummy_data_for_split(args.data_path, "original", dataset_version, original_data_files)

print("Zero-shot dataset...")
logging.info("Zero-shot dataset...")
create_dummy_data_for_split(args.data_path, "zero_shot", dataset_version, zs_data_files)


6 changes: 3 additions & 3 deletions datasets/oscar/generate_dummy.py
@@ -3,15 +3,15 @@

import fsspec as fs
import requests
from oscar import _BASE_CHECKSUM_FILE_NAME, Oscar
from oscar import _BASE_CHECKSUM_FILE_NAME, Oscar, logger


N_EXAMPLES = 2

if __name__ == "__main__":

for i, config in enumerate(Oscar.BUILDER_CONFIGS):
print(f"Loading config '{config.name}' ({i + 1}/{len(Oscar.BUILDER_CONFIGS)})")
logger.info(f"Loading config '{config.name}' ({i + 1}/{len(Oscar.BUILDER_CONFIGS)})")

# Get data url
checksum_filename = _BASE_CHECKSUM_FILE_NAME.format(language=config.language)
@@ -42,6 +42,6 @@
root_dir = str(dummy_data_dir.parent)
base_name = str(dummy_data_dir)
base_dir = "dummy_data"
print(f"Compressing dummy data folder to '{base_name}.zip'")
logger.info(f"Compressing dummy data folder to '{base_name}.zip'")
shutil.make_archive(base_name, "zip", root_dir, base_dir)
shutil.rmtree(base_name)
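
The oscar helper differs slightly: it reuses the `logger` object defined in the oscar dataset script rather than the stdlib root logger. Assuming that logger is created through the `datasets` library's logging helpers (as dataset scripts typically are), a hypothetical way to see its INFO output when running generate_dummy.py would be:

```python
# Hypothetical sketch, not part of this PR: enable INFO-level output for the
# datasets library's loggers before invoking the dummy-data generation script.
import datasets

datasets.logging.set_verbosity_info()
# With INFO enabled, messages such as "Loading config ..." and
# "Compressing dummy data folder to ..." become visible.
```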
13 changes: 7 additions & 6 deletions datasets/wiki_lingua/create_dummy.py
@@ -1,4 +1,5 @@
import itertools
import logging
import os
import pickle
import shutil
@@ -46,7 +47,7 @@ def create():
base_path = "/Users/katnoria/dev/projects/workspaces/python/datasets"
for key in _URLs.keys():
# data = load_dataset('./datasets/wiki_lingua', key)
print(f"Finding {key}.pkl")
logging.info(f"Finding {key}.pkl")
filepath = [name for name in files if name.endswith(f"{key}.pkl")][0]
with open(filepath, "rb") as f:
data = pickle.load(f)
@@ -55,13 +56,13 @@
fname = sanitize_url(_URLs[key])
dirname = pjoin(base_path, f"datasets/wiki_lingua/dummy/{key}/1.1.0/dummy_data")
if not os.path.exists(dirname):
print(f"created folder {dirname}")
logging.info(f"created folder {dirname}")
os.makedirs(dirname)
fname = pjoin(dirname, fname)
print(f"creating for {key}:{fname}")
logging.info(f"creating for {key}:{fname}")
with open(fname, "wb") as f:
pickle.dump(data_subset, f)
print("SUCCESS")
logging.info("SUCCESS")


def zip():
@@ -70,10 +71,10 @@ def zip():
for key in _URLs.keys():
# dirname = pjoin(base_path, f"datasets/wiki_lingua/dummy/{key}/1.1.0/dummy_data")
dirname = pjoin(base_path, f"datasets/wiki_lingua/dummy/{key}/1.1.0")
print(f"Zipping {dirname}")
logging.info(f"Zipping {dirname}")
shutil.make_archive(f"{dirname}/dummy_data", "zip", dirname, "dummy_data")
shutil.rmtree(f"{dirname}/dummy_data")
print(f"Deleted folder {dirname}/dummy_data")
logging.info(f"Deleted folder {dirname}/dummy_data")


# Utility script to create the dummy data and zip the contents

1 change: 0 additions & 1 deletion datasets/wmt14/wmt_utils.py
@@ -793,7 +793,6 @@ def _get_filenames(dataset):
# +++++++++++++++++++++

logger.info("Generating examples from: %s", ss_name)
print("Generating examples from: %s", ss_name)
dataset = DATASET_MAP[ss_name]
extract_dirs = extraction_map[ss_name]
files = _get_local_paths(dataset, extract_dirs)

1 change: 0 additions & 1 deletion datasets/wmt15/wmt_utils.py
@@ -793,7 +793,6 @@ def _get_filenames(dataset):
# +++++++++++++++++++++

logger.info("Generating examples from: %s", ss_name)
print("Generating examples from: %s", ss_name)
dataset = DATASET_MAP[ss_name]
extract_dirs = extraction_map[ss_name]
files = _get_local_paths(dataset, extract_dirs)

1 change: 0 additions & 1 deletion datasets/wmt16/wmt_utils.py
@@ -793,7 +793,6 @@ def _get_filenames(dataset):
# +++++++++++++++++++++

logger.info("Generating examples from: %s", ss_name)
print("Generating examples from: %s", ss_name)
dataset = DATASET_MAP[ss_name]
extract_dirs = extraction_map[ss_name]
files = _get_local_paths(dataset, extract_dirs)

1 change: 0 additions & 1 deletion datasets/wmt17/wmt_utils.py
@@ -793,7 +793,6 @@ def _get_filenames(dataset):
# +++++++++++++++++++++

logger.info("Generating examples from: %s", ss_name)
print("Generating examples from: %s", ss_name)
dataset = DATASET_MAP[ss_name]
extract_dirs = extraction_map[ss_name]
files = _get_local_paths(dataset, extract_dirs)

1 change: 0 additions & 1 deletion datasets/wmt18/wmt_utils.py
@@ -793,7 +793,6 @@ def _get_filenames(dataset):
# +++++++++++++++++++++

logger.info("Generating examples from: %s", ss_name)
print("Generating examples from: %s", ss_name)
dataset = DATASET_MAP[ss_name]
extract_dirs = extraction_map[ss_name]
files = _get_local_paths(dataset, extract_dirs)

1 change: 0 additions & 1 deletion datasets/wmt19/wmt_utils.py
@@ -793,7 +793,6 @@ def _get_filenames(dataset):
# +++++++++++++++++++++

logger.info("Generating examples from: %s", ss_name)
print("Generating examples from: %s", ss_name)
dataset = DATASET_MAP[ss_name]
extract_dirs = extraction_map[ss_name]
files = _get_local_paths(dataset, extract_dirs)

1 change: 0 additions & 1 deletion datasets/wmt_t2t/wmt_utils.py
@@ -793,7 +793,6 @@ def _get_filenames(dataset):
# +++++++++++++++++++++

logger.info("Generating examples from: %s", ss_name)
print("Generating examples from: %s", ss_name)
dataset = DATASET_MAP[ss_name]
extract_dirs = extraction_map[ss_name]
files = _get_local_paths(dataset, extract_dirs)

64 changes: 64 additions & 0 deletions tests/test_dataset_scripts.py
@@ -0,0 +1,64 @@
import re
from pathlib import Path
from unittest import TestCase


class TestDatasetScripts(TestCase):
def _no_encoding_on_file_open(self, filepath: str):
r"""Find all instances where a non-binary file is opened without UTF-8 encoding.

This function uses regular expressions to find instances where Python's `open()` function is used to open
non-binary files. See below for an explanation of the regular expression:

(?!.*\b(?:encoding|rb|w|wb|w+|wb+|ab|ab+)\b): Lookahead and discard the match if `encoding`, `rb`, etc. appear
as arguments of `open()`.

(?<=\s): Lookbehind and match if `open()` predeceded by one whitespace.

(open)\((.*)\): Capture everything in parentheses of `open()`.
"""

with open(filepath, "r", encoding="utf-8") as input_file:
regexp = re.compile(r"(?!.*\b(?:encoding|rb|w|wb|w+|wb+|ab|ab+)\b)(?<=\s)(open)\((.*)\)")
input_text = input_file.read()
match = regexp.search(input_text)

return match

def _no_print_statements(self, filepath: str):
r"""Find all instances where a python sctipt file contains a `print` statement.

#[^\r\n]*print\(: Match print statement inside a comment. We ignore this group.

\"[^\r\n]*print\(: Match print statement inside a string. We ignore this group.

\"\"\".*?print\(.*?\"\"\"": Match print statement inside a triple-quoted string. Uses re.DOTALL to also match newlines with ".".
We ignore this group.

(print\(): Match print statement. This is the only group we keep.
"""

with open(filepath, "r", encoding="utf-8") as input_file:
regexp = re.compile(r"#[^\r\n]*print\(|\"[^\r\n]*print\(|\"\"\".*?print\(.*?\"\"\"|(print\()", re.DOTALL)
input_text = input_file.read()
# use `re.finditer` to handle the case where the ignored groups would be matched first by `re.search`
matches = regexp.finditer(input_text)

matches = [match for match in matches if match is not None and match.group(1) is not None]
return matches[0] if matches else None

def test_no_encoding_on_file_open(self):
dataset_paths = Path("./datasets")
dataset_files = list(dataset_paths.absolute().glob("**/*.py"))

for dataset in dataset_files:
if self._no_encoding_on_file_open(str(dataset)):
raise AssertionError(f"open(...) must use utf-8 encoding in {dataset}")

def test_no_print_statements(self):
dataset_paths = Path("./datasets")
dataset_files = list(dataset_paths.absolute().glob("**/*.py"))

for dataset in dataset_files:
if self._no_print_statements(str(dataset)):
raise AssertionError(f"print statement found in {dataset}. Use datasets.logger/logging instead.")
34 changes: 0 additions & 34 deletions tests/test_file_encoding.py

This file was deleted.