Merge branch 'master' into indices
thomwolf committed Aug 27, 2020
2 parents c751344 + 5c89ed1 commit 3a830c3
Showing 24 changed files with 519 additions and 213 deletions.
6 changes: 3 additions & 3 deletions .circleci/config.yml
@@ -37,9 +37,9 @@ jobs:
# we need a version of isort with /~https://github.com/timothycrosley/isort/pull/1000
- run: sudo pip install git+git://github.com/timothycrosley/isort.git@e63ae06ec7d70b06df9e528357650281a3d3ec22#egg=isort
- run: sudo pip install .[quality]
-  - run: black --check --line-length 119 --target-version py36 tests src
-  - run: isort --check-only --recursive tests src
-  - run: flake8 tests src
+  - run: black --check --line-length 119 --target-version py36 src tests benchmarks
+  - run: isort --check-only --recursive src tests benchmarks
+  - run: flake8 src tests benchmarks
build_doc:
working_directory: ~/nlp
docker:
3 changes: 3 additions & 0 deletions .dvc/.gitignore
@@ -0,0 +1,3 @@
/config.local
/tmp
/cache
Empty file added .dvc/config
30 changes: 30 additions & 0 deletions .dvc/plots/confusion.json
@@ -0,0 +1,30 @@
{
"$schema": "https://vega.github.io/schema/vega-lite/v4.json",
"data": {
"values": "<DVC_METRIC_DATA>"
},
"title": "<DVC_METRIC_TITLE>",
"mark": "rect",
"encoding": {
"x": {
"field": "<DVC_METRIC_X>",
"type": "nominal",
"sort": "ascending",
"title": "<DVC_METRIC_X_LABEL>"
},
"y": {
"field": "<DVC_METRIC_Y>",
"type": "nominal",
"sort": "ascending",
"title": "<DVC_METRIC_Y_LABEL>"
},
"color": {
"aggregate": "count",
"type": "quantitative"
},
"facet": {
"field": "rev",
"type": "nominal"
}
}
}
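The `<DVC_METRIC_*>` tokens in templates like this one are placeholders: `dvc plots` substitutes the data, field names, and axis labels into the spec before rendering it with Vega-Lite. A minimal sketch of the substitution idea — `fill_template` and the placeholder map are illustrative, not DVC's actual API:

```python
import json

# Illustrative sketch (not DVC's real API): replace each <DVC_METRIC_*>
# placeholder token in the template text, then parse the result as JSON.
def fill_template(template: str, values: dict) -> dict:
    for placeholder, value in values.items():
        template = template.replace(placeholder, value)
    return json.loads(template)

template = '{"title": "<DVC_METRIC_TITLE>", "mark": "rect"}'
spec = fill_template(template, {"<DVC_METRIC_TITLE>": "confusion matrix"})
```

In the real templates, `<DVC_METRIC_DATA>` is filled with the plot's data rows rather than a plain string, so the actual substitution is more involved than this sketch.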
29 changes: 29 additions & 0 deletions .dvc/plots/default.json
@@ -0,0 +1,29 @@
{
"$schema": "https://vega.github.io/schema/vega-lite/v4.json",
"data": {
"values": "<DVC_METRIC_DATA>"
},
"title": "<DVC_METRIC_TITLE>",
"mark": {
"type": "line"
},
"encoding": {
"x": {
"field": "<DVC_METRIC_X>",
"type": "quantitative",
"title": "<DVC_METRIC_X_LABEL>"
},
"y": {
"field": "<DVC_METRIC_Y>",
"type": "quantitative",
"title": "<DVC_METRIC_Y_LABEL>",
"scale": {
"zero": false
}
},
"color": {
"field": "rev",
"type": "nominal"
}
}
}
27 changes: 27 additions & 0 deletions .dvc/plots/scatter.json
@@ -0,0 +1,27 @@
{
"$schema": "https://vega.github.io/schema/vega-lite/v4.json",
"data": {
"values": "<DVC_METRIC_DATA>"
},
"title": "<DVC_METRIC_TITLE>",
"mark": "point",
"encoding": {
"x": {
"field": "<DVC_METRIC_X>",
"type": "quantitative",
"title": "<DVC_METRIC_X_LABEL>"
},
"y": {
"field": "<DVC_METRIC_Y>",
"type": "quantitative",
"title": "<DVC_METRIC_Y_LABEL>",
"scale": {
"zero": false
}
},
"color": {
"field": "rev",
"type": "nominal"
}
}
}
39 changes: 39 additions & 0 deletions .dvc/plots/smooth.json
@@ -0,0 +1,39 @@
{
"$schema": "https://vega.github.io/schema/vega-lite/v4.json",
"data": {
"values": "<DVC_METRIC_DATA>"
},
"title": "<DVC_METRIC_TITLE>",
"mark": {
"type": "line"
},
"encoding": {
"x": {
"field": "<DVC_METRIC_X>",
"type": "quantitative",
"title": "<DVC_METRIC_X_LABEL>"
},
"y": {
"field": "<DVC_METRIC_Y>",
"type": "quantitative",
"title": "<DVC_METRIC_Y_LABEL>",
"scale": {
"zero": false
}
},
"color": {
"field": "rev",
"type": "nominal"
}
},
"transform": [
{
"loess": "<DVC_METRIC_Y>",
"on": "<DVC_METRIC_X>",
"groupby": [
"rev"
],
"bandwidth": 0.3
}
]
}
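smooth.json differs from default.json only in the trailing `transform` block, which asks Vega-Lite to draw a loess-smoothed line per `rev`. Loess is a local regression over neighboring x/y points; as a rough conceptual stand-in (not the actual loess algorithm), a moving average over each point's neighborhood captures the idea:

```python
# Conceptual stand-in for loess smoothing (not the real algorithm):
# replace each point with the mean of its local window.
def moving_average(ys, window=3):
    half = window // 2
    out = []
    for i in range(len(ys)):
        neighborhood = ys[max(0, i - half): i + half + 1]
        out.append(sum(neighborhood) / len(neighborhood))
    return out

smoothed = moving_average([1.0, 5.0, 3.0, 7.0, 5.0])  # [3.0, 3.0, 5.0, 5.0, 6.0]
```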
3 changes: 3 additions & 0 deletions .dvcignore
@@ -0,0 +1,3 @@
# Add patterns of files dvc should ignore, which could improve
# the performance. Learn more at
# https://dvc.org/doc/user-guide/dvcignore
22 changes: 22 additions & 0 deletions .github/workflows/benchmarks.yaml
@@ -0,0 +1,22 @@
name: benchmarks
on: [push]
jobs:
run:
runs-on: [ubuntu-latest]
container: docker://dvcorg/cml-py3:latest
steps:
- uses: actions/checkout@v2
- name: cml_run
env:
repo_token: ${{ secrets.GITHUB_TOKEN }}
run: |
# Your ML workflow goes here
pip install -e .[tests]
dvc repro
git fetch --prune
dvc metrics diff --show-json master > report.json
python ./benchmarks/format.py report.json report.md
cml-send-comment report.md
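The last three steps diff the metrics against `master` and post the result: `dvc metrics diff --show-json` writes a JSON report, `benchmarks/format.py` (not shown in this diff) turns it into Markdown, and `cml-send-comment` posts it on the commit. A hedged sketch of what such a formatter might look like — the JSON shape and function name are assumptions, not the repository's actual script:

```python
# Hypothetical sketch in the spirit of benchmarks/format.py (the real script
# is not shown in this diff). Assumed input shape:
# {file path: {metric name: change info}}.
def format_report(report: dict) -> str:
    lines = ["| file | metric | value |", "| --- | --- | --- |"]
    for path, metrics in report.items():
        for name, value in metrics.items():
            lines.append(f"| {path} | {name} | {value} |")
    return "\n".join(lines)

report = {"benchmarks/results/benchmark_indices_mapping.json": {"select": {"old": None, "new": 0.033368}}}
markdown = format_report(report)
```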
6 changes: 6 additions & 0 deletions .gitignore
@@ -54,3 +54,9 @@ venv.bak/
# Sphinx documentation
docs/_build/
docs/source/_build/

# Benchmark results
/benchmarks/results/*
!/benchmarks/results/.gitkeep
report.json
report.md
10 changes: 5 additions & 5 deletions Makefile
@@ -3,12 +3,12 @@
# Check that source code meets quality standards

quality:
-	black --check --line-length 119 --target-version py36 src tests
-	isort --check-only --recursive src tests
-	flake8 src tests
+	black --check --line-length 119 --target-version py36 tests src benchmarks
+	isort --check-only --recursive tests src benchmarks datasets
+	flake8 tests src benchmarks

# Format source code automatically

style:
-	black --line-length 119 --target-version py36 src tests
-	isort --recursive src tests datasets
+	black --line-length 119 --target-version py36 tests src benchmarks
+	isort --recursive tests src datasets benchmarks
130 changes: 130 additions & 0 deletions benchmarks/benchmark_array_xd.py
@@ -0,0 +1,130 @@
import json
import os
import tempfile

import nlp
from nlp.arrow_writer import ArrowWriter
from nlp.features import Array2D
from utils import generate_examples, get_duration


SHAPE_TEST_1 = (30, 487)
SHAPE_TEST_2 = (36, 1024)
SPEED_TEST_SHAPE = (100, 100)
SPEED_TEST_N_EXAMPLES = 100

DEFAULT_FEATURES = nlp.Features(
{"text": Array2D(SHAPE_TEST_1, dtype="float32"), "image": Array2D(SHAPE_TEST_2, dtype="float32")}
)

RESULTS_BASEPATH, RESULTS_FILENAME = os.path.split(__file__)
RESULTS_FILE_PATH = os.path.join(RESULTS_BASEPATH, "results", RESULTS_FILENAME.replace(".py", ".json"))


@get_duration
def write(my_features, dummy_data, tmp_dir):
writer = ArrowWriter(features=my_features, path=os.path.join(tmp_dir, "beta.arrow"))
for key, record in dummy_data:
example = my_features.encode_example(record)
writer.write(example)
num_examples, num_bytes = writer.finalize()


@get_duration
def read_unformated(feats, tmp_dir):
dataset = nlp.Dataset.from_file(filename=os.path.join(tmp_dir, "beta.arrow"), info=nlp.DatasetInfo(features=feats))
for _ in dataset:
pass


@get_duration
def read_formatted_as_numpy(feats, tmp_dir):
dataset = nlp.Dataset.from_file(filename=os.path.join(tmp_dir, "beta.arrow"), info=nlp.DatasetInfo(features=feats))
dataset.set_format("numpy")
for _ in dataset:
pass


@get_duration
def read_batch_unformated(feats, tmp_dir):
batch_size = 10
dataset = nlp.Dataset.from_file(filename=os.path.join(tmp_dir, "beta.arrow"), info=nlp.DatasetInfo(features=feats))
for i in range(0, len(dataset), batch_size):
_ = dataset[i : i + batch_size]


@get_duration
def read_batch_formatted_as_numpy(feats, tmp_dir):
batch_size = 10
dataset = nlp.Dataset.from_file(filename=os.path.join(tmp_dir, "beta.arrow"), info=nlp.DatasetInfo(features=feats))
dataset.set_format("numpy")
for i in range(0, len(dataset), batch_size):
_ = dataset[i : i + batch_size]


@get_duration
def read_col_unformated(feats, tmp_dir):
dataset = nlp.Dataset.from_file(filename=os.path.join(tmp_dir, "beta.arrow"), info=nlp.DatasetInfo(features=feats))
for col in feats:
_ = dataset[col]


@get_duration
def read_col_formatted_as_numpy(feats, tmp_dir):
dataset = nlp.Dataset.from_file(filename=os.path.join(tmp_dir, "beta.arrow"), info=nlp.DatasetInfo(features=feats))
dataset.set_format("numpy")
for col in feats:
_ = dataset[col]


def benchmark_array_xd():
times = {}
read_functions = (
read_unformated,
read_formatted_as_numpy,
read_batch_unformated,
read_batch_formatted_as_numpy,
read_col_unformated,
read_col_formatted_as_numpy,
)
with tempfile.TemporaryDirectory() as tmp_dir:
feats = nlp.Features({"image": Array2D(SPEED_TEST_SHAPE, dtype="float32")})
data = generate_examples(features=feats, num_examples=SPEED_TEST_N_EXAMPLES)
times["write_array2d"] = write(feats, data, tmp_dir)
for read_func in read_functions:
times[read_func.__name__ + " after write_array2d"] = read_func(feats, tmp_dir)

with tempfile.TemporaryDirectory() as tmp_dir:
# don't use fixed length for fair comparison
# feats = nlp.Features(
# {"image": nlp.Sequence(nlp.Sequence(nlp.Value("float32"), SPEED_TEST_SHAPE[1]), SPEED_TEST_SHAPE[0])}
# )
feats = nlp.Features({"image": nlp.Sequence(nlp.Sequence(nlp.Value("float32")))})
data = generate_examples(
features=feats, num_examples=SPEED_TEST_N_EXAMPLES, seq_shapes={"image": SPEED_TEST_SHAPE}
)
times["write_nested_sequence"] = write(feats, data, tmp_dir)
for read_func in read_functions:
times[read_func.__name__ + " after write_nested_sequence"] = read_func(feats, tmp_dir)

with tempfile.TemporaryDirectory() as tmp_dir:
# don't use fixed length for fair comparison
# feats = nlp.Features(
# {"image": nlp.Sequence(nlp.Value("float32"), SPEED_TEST_SHAPE[0] * SPEED_TEST_SHAPE[1])}
# )
feats = nlp.Features({"image": nlp.Sequence(nlp.Value("float32"))})
data = generate_examples(
features=feats,
num_examples=SPEED_TEST_N_EXAMPLES,
seq_shapes={"image": [SPEED_TEST_SHAPE[0] * SPEED_TEST_SHAPE[1]]},
)
times["write_flattened_sequence"] = write(feats, data, tmp_dir)
for read_func in read_functions:
times[read_func.__name__ + " after write_flattened_sequence"] = read_func(feats, tmp_dir)

with open(RESULTS_FILE_PATH, "wb") as f:
f.write(json.dumps(times).encode("utf-8"))


if __name__ == "__main__": # useful to run the profiler
benchmark_array_xd()
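Both benchmark scripts rely on a `get_duration` decorator imported from `benchmarks/utils.py`, which is not among the hunks shown here. A plausible sketch of such a timing decorator — an assumption about the helper, not its actual implementation:

```python
import functools
import time

# Assumed shape of benchmarks/utils.get_duration (the real helper is not
# shown in this diff): run the wrapped function once and return the elapsed
# wall-clock time in seconds.
def get_duration(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start = time.perf_counter()
        func(*args, **kwargs)
        return time.perf_counter() - start
    return wrapper

@get_duration
def busy_loop():
    for _ in range(10_000):
        pass

elapsed = busy_loop()  # a small positive float
```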
57 changes: 57 additions & 0 deletions benchmarks/benchmark_indices_mapping.py
@@ -0,0 +1,57 @@
import json
import os
import tempfile

import nlp
from utils import generate_example_dataset, get_duration


SPEED_TEST_N_EXAMPLES = 500_000

RESULTS_BASEPATH, RESULTS_FILENAME = os.path.split(__file__)
RESULTS_FILE_PATH = os.path.join(RESULTS_BASEPATH, "results", RESULTS_FILENAME.replace(".py", ".json"))


@get_duration
def select(dataset: nlp.Dataset):
_ = dataset.select(range(0, len(dataset), 2))


@get_duration
def sort(dataset: nlp.Dataset):
_ = dataset.sort("numbers")


@get_duration
def shuffle(dataset: nlp.Dataset):
_ = dataset.shuffle()


@get_duration
def train_test_split(dataset: nlp.Dataset):
_ = dataset.train_test_split(0.1)


@get_duration
def shard(dataset: nlp.Dataset, num_shards=10):
for shard_id in range(num_shards):
_ = dataset.shard(num_shards, shard_id)


def benchmark_indices_mapping():
times = {}
functions = (select, sort, shuffle, train_test_split, shard)
with tempfile.TemporaryDirectory() as tmp_dir:
features = nlp.Features({"text": nlp.Value("string"), "numbers": nlp.Value("float32")})
dataset = generate_example_dataset(
os.path.join(tmp_dir, "dataset.arrow"), features, num_examples=SPEED_TEST_N_EXAMPLES
)
for func in functions:
times[func.__name__] = func(dataset)

with open(RESULTS_FILE_PATH, "wb") as f:
f.write(json.dumps(times).encode("utf-8"))


if __name__ == "__main__": # useful to run the profiler
benchmark_indices_mapping()
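Stripped of the `nlp` dependency, the pattern above is: build the data once, time each operation with the same helper, and collect the results keyed by function name (which is also how the keys in the report JSON are produced). A dependency-free sketch of that pattern — `timed`, `select`, and `shard` here are stand-ins, not the real `nlp.Dataset` methods:

```python
import time

def timed(func, *args):
    # Stand-in for utils.get_duration: seconds one call takes.
    start = time.perf_counter()
    func(*args)
    return time.perf_counter() - start

def select(data):
    # Mirrors dataset.select(range(0, len(dataset), 2)): keep every other row.
    return data[::2]

def shard(data, num_shards=10):
    # Mirrors looping over dataset.shard(num_shards, shard_id).
    return [data[i::num_shards] for i in range(num_shards)]

data = list(range(100_000))
times = {func.__name__: timed(func, data) for func in (select, shard)}
```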

1 comment on commit 3a830c3

@github-actions

Benchmark: benchmark_array_xd.json

metric                                                        new
read_batch_formatted_as_numpy after write_array2d             0.013251
read_batch_formatted_as_numpy after write_flattened_sequence  0.010377
read_batch_formatted_as_numpy after write_nested_sequence     0.038514
read_batch_unformated after write_array2d                     0.024065
read_batch_unformated after write_flattened_sequence          0.266198
read_batch_unformated after write_nested_sequence             0.279692
read_col_formatted_as_numpy after write_array2d               0.005651
read_col_formatted_as_numpy after write_flattened_sequence    0.003568
read_col_formatted_as_numpy after write_nested_sequence       0.007563
read_col_unformated after write_array2d                       0.030855
read_col_unformated after write_flattened_sequence            0.266860
read_col_unformated after write_nested_sequence               0.309750
read_formatted_as_numpy after write_array2d                   0.123376
read_formatted_as_numpy after write_flattened_sequence        0.091827
read_formatted_as_numpy after write_nested_sequence           0.368354
read_unformated after write_array2d                           0.029877
read_unformated after write_flattened_sequence                0.274276
read_unformated after write_nested_sequence                   0.275535
write_array2d                                                 0.089835
write_flattened_sequence                                      1.417460
write_nested_sequence                                         1.498393

Benchmark: benchmark_indices_mapping.json

metric            new
select            0.033368
shard             0.008602
shuffle           0.145680
sort              0.201675
train_test_split  0.280280
