Run checklist suites in AllenNLP (#5065)

* run checklist suites from command line * specify output file * separate task from checklist suite * qa task * adding describe, misc updates * fix docs, TE suite * update changelog * bug fix * adding default tests * qa defaults * typing, docs, minor updates * more updates * set add_default_tests to True * remove commented lines * capitalizing help strings * does this work * adding start_method to test * skipping test * oops, actually fix * temp fix to check memory issues * Skip more memory hungry tests * fix * fixing professions * Update setup.py Co-authored-by: Pete <petew@allenai.org> * Update CHANGELOG.md Co-authored-by: Pete <petew@allenai.org> * Update allennlp/sanity_checks/task_checklists/task_suite.py Co-authored-by: Pete <petew@allenai.org> * formatting functions Co-authored-by: Evan Pete Walsh <petew@allenai.org>
allenai · May 10, 2021 · 402bc78 · 402bc78
1 parent 7c06b49
commit 402bc78
Show file tree

Hide file tree

Showing 21 changed files with 2,345 additions and 3 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## Unreleased
+
+### Added
+
+- Added `TaskSuite` base class and command line functionality for running [`checklist`](https://github.com/marcotcr/checklist) test suites, along with implementations for `SentimentAnalysisSuite`, `QuestionAnsweringSuite`, and `TextualEntailmentSuite`. These can be found in the `allennlp.sanity_checks.task_checklists` module.
+
 
 ## [v2.4.0](https://github.com/allenai/allennlp/releases/tag/v2.4.0) - 2021-04-22
 
@@ -40,7 +46,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Fixed a bug with the `ShardedDatasetReader` when used with multi-process data loading (https://github.com/allenai/allennlp/issues/5132).
 
-
 ## [v2.3.0](https://github.com/allenai/allennlp/releases/tag/v2.3.0) - 2021-04-14
 
 ### Added
@@ -103,6 +108,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - `Model.get_parameters_for_histogram_tensorboard_logging` is deprecated in favor of
   `Model.get_parameters_for_histogram_logging`.
 
+
 ### Fixed
 
 - Makes sure tensors that are stored in `TensorCache` always live on CPUs

diff --git a/Makefile b/Makefile
@@ -86,7 +86,9 @@ install :
 	# See /~https://github.com/pypa/pip/issues/4537.
 	python setup.py install_egg_info
 	pip install --upgrade --upgrade-strategy eager -e . -r dev-requirements.txt
-
+	# Docs are not built on docker, and the runner is unable to find
+	# the nltk_data folder. Hence, we download the requirement.
+	python -c 'import nltk; nltk.download("sentiwordnet")'
 #
 # Documentation helpers.
 #

diff --git a/allennlp/commands/__init__.py b/allennlp/commands/__init__.py
@@ -18,6 +18,7 @@
 from allennlp.commands.count_instances import CountInstances
 from allennlp.common.plugins import import_plugins
 from allennlp.common.util import import_module_and_submodules
+from allennlp.commands.checklist import CheckList
 
 logger = logging.getLogger(__name__)
 

diff --git a/allennlp/commands/checklist.py b/allennlp/commands/checklist.py
@@ -0,0 +1,199 @@
+"""
+The `checklist` subcommand allows you to sanity check your
+model's predictions using a trained model and its
+[`Predictor`](../predictors/predictor.md#predictor) wrapper.
+"""
+
+from typing import Optional, Dict, Any, List
+import argparse
+import sys
+import json
+
+from overrides import overrides
+
+from allennlp.commands.subcommand import Subcommand
+from allennlp.common.checks import check_for_gpu, ConfigurationError
+from allennlp.models.archival import load_archive
+from allennlp.predictors.predictor import Predictor
+from allennlp.sanity_checks.task_checklists.task_suite import TaskSuite
+
+
+@Subcommand.register("checklist")
+class CheckList(Subcommand):
+    """
+    The `checklist` subcommand: runs a trained model through a checklist suite.
+    """
+
+    @overrides
+    def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.ArgumentParser:
+        """
+        Add the `checklist` subparser to `parser`, register all of its arguments,
+        set `_run_suite` as the function invoked for this subcommand, and return
+        the new subparser.
+        """
+        checklist_parser = parser.add_parser(
+            self.name,
+            description="""Run the specified model through a checklist suite.""",
+            help="Run a trained model through a checklist suite.",
+        )
+
+        # (name, keyword options) pairs, registered in this exact order so that the
+        # positional arguments and the generated `--help` output keep their layout.
+        argument_specs = [
+            (
+                "archive_file",
+                {"type": str, "help": "The archived model to make predictions with"},
+            ),
+            ("task", {"type": str, "help": "The name of the task suite"}),
+            ("--checklist-suite", {"type": str, "help": "The checklist suite path"}),
+            (
+                "--capabilities",
+                {
+                    "nargs": "+",
+                    "default": [],
+                    "help": (
+                        'An optional list of strings of capabilities. Eg. "[Vocabulary, Robustness]"'
+                    ),
+                },
+            ),
+            (
+                "--max-examples",
+                {
+                    "type": int,
+                    "default": None,
+                    "help": "Maximum number of examples to check per test.",
+                },
+            ),
+            (
+                "--task-suite-args",
+                {
+                    "type": str,
+                    "default": "",
+                    "help": (
+                        "An optional JSON structure used to provide additional parameters "
+                        "to the task suite"
+                    ),
+                },
+            ),
+            (
+                "--print-summary-args",
+                {
+                    "type": str,
+                    "default": "",
+                    "help": (
+                        "An optional JSON structure used to provide additional "
+                        "parameters for printing test summary"
+                    ),
+                },
+            ),
+            ("--output-file", {"type": str, "help": "Path to output file"}),
+            (
+                "--cuda-device",
+                {"type": int, "default": -1, "help": "ID of GPU to use (if any)"},
+            ),
+            (
+                "--predictor",
+                {"type": str, "help": "Optionally specify a specific predictor to use"},
+            ),
+            (
+                "--predictor-args",
+                {
+                    "type": str,
+                    "default": "",
+                    "help": (
+                        "An optional JSON structure used to provide additional parameters "
+                        "to the predictor"
+                    ),
+                },
+            ),
+        ]
+        for argument_name, options in argument_specs:
+            checklist_parser.add_argument(argument_name, **options)
+
+        checklist_parser.set_defaults(func=_run_suite)
+
+        return checklist_parser
+
+
+def _get_predictor(args: argparse.Namespace) -> Predictor:
+    """
+    Build the predictor described by the parsed command line arguments.
+
+    The archive at `args.archive_file` is loaded onto `args.cuda_device`, and
+    `args.predictor_args` (a JSON string, possibly empty) is decoded and passed
+    to `Predictor.from_archive` as `extra_args`.
+    """
+    check_for_gpu(args.cuda_device)
+    loaded_archive = load_archive(args.archive_file, cuda_device=args.cuda_device)
+
+    # An empty / whitespace-only string means "no extra arguments".
+    raw_extra_args = args.predictor_args.strip()
+    extra_predictor_args = json.loads(raw_extra_args) if raw_extra_args else {}
+
+    return Predictor.from_archive(
+        loaded_archive,
+        args.predictor,
+        extra_args=extra_predictor_args,
+    )
+
+
+def _get_task_suite(args: argparse.Namespace) -> TaskSuite:
+    """
+    Construct the task suite named by `args.task`.
+
+    Raises a `ConfigurationError` if `args.task` is not among
+    `TaskSuite.list_available()`. `args.checklist_suite` is forwarded as the
+    suite file, and `args.task_suite_args` (a JSON string, possibly empty) is
+    decoded and forwarded as `extra_args`.
+    """
+    known_tasks = TaskSuite.list_available()
+    if args.task not in known_tasks:
+        raise ConfigurationError(
+            f"'{args.task}' is not a recognized task suite. "
+            f"Available tasks are: {known_tasks}."
+        )
+
+    suite_path = args.checklist_suite
+
+    # An empty / whitespace-only string means "no extra arguments".
+    raw_suite_args = args.task_suite_args.strip()
+    extra_suite_args = json.loads(raw_suite_args) if raw_suite_args else {}
+
+    return TaskSuite.constructor(
+        name=args.task,
+        suite_file=suite_path,
+        extra_args=extra_suite_args,
+    )
+
+
+class _CheckListManager:
+    """
+    Runs a task suite against a predictor and prints the suite's summary.
+
+    # Parameters
+
+    task_suite : `TaskSuite`
+        The suite to run; its `run` and `summary` methods are called.
+    predictor : `Predictor`
+        The predictor handed to `task_suite.run`.
+    capabilities : `List[str]`, optional (default = `None`)
+        Passed to `task_suite.run` and, when non-empty, also to
+        `task_suite.summary` under the `capabilities` keyword.
+    max_examples : `int`, optional (default = `None`)
+        Passed to `task_suite.run`.
+    output_file : `str`, optional (default = `None`)
+        Path the summary is written to. It is opened for writing on
+        construction. If `None`, the summary goes to `sys.stdout`.
+    print_summary_args : `Dict[str, Any]`, optional (default = `None`)
+        Extra keyword arguments for `task_suite.summary`. The dictionary is
+        copied, so the caller's object is never modified.
+    """
+
+    def __init__(
+        self,
+        task_suite: TaskSuite,
+        predictor: Predictor,
+        capabilities: Optional[List[str]] = None,
+        max_examples: Optional[int] = None,
+        output_file: Optional[str] = None,
+        print_summary_args: Optional[Dict[str, Any]] = None,
+    ) -> None:
+        self._task_suite = task_suite
+        self._predictor = predictor
+        self._capabilities = capabilities
+        self._max_examples = max_examples
+
+        # Copy before adding "capabilities" so the caller's dict is left untouched.
+        self._print_summary_args = dict(print_summary_args or {})
+        if capabilities:
+            self._print_summary_args["capabilities"] = capabilities
+
+        # Opened last: nothing after this point in `__init__` can fail and leak the handle.
+        self._output_file = None if output_file is None else open(output_file, "w")
+
+    def run(self) -> None:
+        """
+        Run the suite and write its summary to the output file (or `sys.stdout`).
+
+        The output file, if any, is closed when this method returns or raises.
+        """
+        try:
+            self._task_suite.run(
+                self._predictor, capabilities=self._capabilities, max_examples=self._max_examples
+            )
+
+            # We pass in an IO object.
+            output_file = sys.stdout if self._output_file is None else self._output_file
+            self._task_suite.summary(file=output_file, **self._print_summary_args)
+        finally:
+            # Close even if the suite or the summary raised, so the handle never leaks.
+            # If `_output_file` was None, there would be nothing to close.
+            if self._output_file is not None:
+                self._output_file.close()
+
+
+def _run_suite(args: argparse.Namespace) -> None:
+    """
+    Entry point for the `checklist` subcommand.
+
+    Builds the task suite and the predictor from `args`, decodes
+    `args.print_summary_args` (a JSON string, possibly empty), and runs
+    everything through a `_CheckListManager`.
+    """
+    # The suite is built before the predictor, so an unknown task name fails first.
+    suite = _get_task_suite(args)
+    model_predictor = _get_predictor(args)
+
+    # An empty / whitespace-only string means "no extra arguments".
+    raw_summary_args = args.print_summary_args.strip()
+    summary_kwargs = json.loads(raw_summary_args) if raw_summary_args else {}
+
+    _CheckListManager(
+        suite,
+        model_predictor,
+        capabilities=args.capabilities,
+        max_examples=args.max_examples,
+        output_file=args.output_file,
+        print_summary_args=summary_kwargs,
+    ).run()
diff --git a/allennlp/common/testing/checklist_test.py b/allennlp/common/testing/checklist_test.py
@@ -0,0 +1,35 @@
+from typing import Optional
+from checklist.test_suite import TestSuite
+from checklist.test_types import MFT as MinimumFunctionalityTest
+from allennlp.sanity_checks.task_checklists.task_suite import TaskSuite
+
+
+@TaskSuite.register("fake-task-suite")
+class FakeTaskSuite(TaskSuite):
+    """
+    A stand-in task suite for tests. It stores two dummy arguments and adds one
+    minimum functionality test to the given (or a newly created) `TestSuite`.
+    """
+
+    def __init__(
+        self,
+        suite: Optional[TestSuite] = None,
+        fake_arg1: Optional[int] = None,
+        fake_arg2: Optional[int] = None,
+    ):
+        self._fake_arg1, self._fake_arg2 = fake_arg1, fake_arg2
+
+        # Fall back to a fresh suite when none (or a falsy one) is supplied.
+        checklist_suite = suite if suite else TestSuite()
+
+        # Adding a simple checklist test.
+        checklist_suite.add(
+            MinimumFunctionalityTest(
+                ["sentence 1", "sentence 2"],
+                labels=0,
+                name="fake test 1",
+                capability="fake capability",
+                description="Test's description",
+            )
+        )
+
+        super().__init__(checklist_suite)
diff --git a/allennlp/sanity_checks/task_checklists/__init__.py b/allennlp/sanity_checks/task_checklists/__init__.py
@@ -0,0 +1,10 @@
+from allennlp.sanity_checks.task_checklists.task_suite import TaskSuite
+from allennlp.sanity_checks.task_checklists.sentiment_analysis_suite import (
+    SentimentAnalysisSuite,
+)
+from allennlp.sanity_checks.task_checklists.question_answering_suite import (
+    QuestionAnsweringSuite,
+)
+from allennlp.sanity_checks.task_checklists.textual_entailment_suite import (
+    TextualEntailmentSuite,
+)