This repository has been archived by the owner on Dec 16, 2022. It is now read-only.
quoref metric and evaluator #3153
Merged

Changes from all commits (8 commits)
18c2f7b quoref metric and evaluator (pdasigi)
e1c560b added tests and sample data files (pdasigi)
f0d992d missing prediction (pdasigi)
5240991 take predictions in simple format too (pdasigi)
f3b841d added a test and fixed docs (pdasigi)
ca20d49 test running as a script (pdasigi)
50ab44f removed metric file for quore and added more comments (pdasigi)
445f226 removed old import (pdasigi)
allennlp/tests/fixtures/data/quoref/quoref_sample.json

@@ -0,0 +1,68 @@
{
  "data": [
    {
      "title": "2007\u20132008 Nazko earthquakes 1",
      "url": "https://en.wikipedia.org/wiki/2007%E2%80%932008_Nazko_earthquakes",
      "paragraphs": [
        {
          "context": "The earthquake swarm was noted on October 12, 2007 in the Prince George Citizen by citizen staff, three days after the earthquakes began. Scientists mentioned in the report were seismologist John Cassidy of Natural Resources Canada and volcanologist Catherine Hickson, who was part of the Geological Survey of Canada at the time. At the time of the report, scientists did not know the origin of the swarm. Seismologist John Cassidy stated, \"the depth is enough to rule out hydrothermal but it's up in the air as to whether the cause is tectonic shifts or volcanic activity. If it is volcanic there are certain characteristics that we would expect, there's a tremor-like character to it. And so we'll be looking for the types of events that we see beneath volcanoes and we'll be looking to see if they're getting closer to the surface or if they're migrating at all.\"Even if the Nazko swarm were a warning of a volcanic eruption, Hickson doubted it would turn out to be a highly explosive eruption like those that can occur in subduction-zone volcanoes. \"We're not talking about an injection of tonnes of ash many kilometers into the air like the 1980 Mount St. Helens eruption or the 1991 Mount Pinatubo eruption. We're talking about something very small, relatively localized that should have a fairly limited impact... but it'll be extremely exciting\", Hickson said. If an eruption were to occur, Hickson suggested that it would be characterized by a lava fountain that sends globs of lava 100 m (330 ft) into the air. This is similar to those that occur in Hawaii. Hickson said that a Nazko eruption could be a tourist attraction, but warned that noxious gases such as carbon dioxide and sulfur dioxide would be released during the event.",
          "context_id": "9821bc1796c741d48f9a50ad2e70a2f136f00338",
          "qas": [
            {
              "question": "What is the first name of the person who doubted it would turn out to be a highly explosive eruption like those that can occur in subduction-zone volcanoes?",
              "id": "ba3f052c7a557909526b59713430403dd134e01d",
              "answers": [
                {
                  "text": "Catherine",
                  "answer_start": 250
                }
              ]
            }
          ]
        }
      ]
    },
    {
      "title": "Peter Warlock",
      "url": "https://en.wikipedia.org/wiki/Peter_Warlock",
      "paragraphs": [
        {
          "context": "Philip Arnold Heseltine (30 October 1894 \u2013 17 December 1930), known by the pseudonym Peter Warlock, was a British composer and music critic. The Warlock name, which reflects Heseltine's interest in occult practices, was used for all his published musical works. He is best known as a composer of songs and other vocal music; he also achieved notoriety in his lifetime through his unconventional and often scandalous lifestyle.\nAs a schoolboy at Eton College, Heseltine met the British composer Frederick Delius, with whom he formed a close friendship. After a failed student career in Oxford and London, Heseltine turned to musical journalism, while developing interests in folk-song and Elizabethan music. His first serious compositions date from around 1915. Following a period of inactivity, a positive and lasting influence on his work arose from his meeting in 1916 with the Dutch composer Bernard van Dieren; he also gained creative impetus from a year spent in Ireland, studying Celtic culture and language. On his return to England in 1918, Heseltine began composing songs in a distinctive, original style, while building a reputation as a combative and controversial music critic. During 1920\u201321 he edited the music magazine The Sackbut. His most prolific period as a composer came in the 1920s, when he was based first in Wales and later at Eynsford in Kent.\nThrough his critical writings, published under his own name, Heseltine made a pioneering contribution to the scholarship of early music. In addition, he produced a full-length biography of Frederick Delius and wrote, edited, or otherwise assisted the production of several other books and pamphlets. Towards the end of his life, Heseltine became depressed by a loss of his creative inspiration. He died in his London flat of coal gas poisoning in 1930, probably by his own hand.",
          "context_id": "d7a264271662fb370d46eeb48d997bd976d7660e",
          "qas": [
            {
              "question": "What is the last name of the person who is best known as a composer of songs and other vocal music?",
              "id": "7db0ed1ab90b90ee27a71b63798e4528a8523df1",
              "answers": [
                {
                  "text": "Heseltine",
                  "answer_start": 459
                }
              ]
            },
            {
              "question": "What is the last name of the person who also achieved notoriety in his lifetime through his unconventional and often scandalous lifestyle?",
              "id": "335654892c66647dd8531140c9bcd28e3f7500ec",
              "answers": [
                {
                  "text": "Heseltine",
                  "answer_start": 459
                }
              ]
            },
            {
              "question": "What is the last name of the person whose first serious compositions date from around 1915?",
              "id": "2142d85e9eacd549bc6164583d14407383d15692",
              "answers": [
                {
                  "text": "Heseltine",
                  "answer_start": 459
                }
              ]
            }
          ]
        }
      ]
    }
  ]
}
allennlp/tests/fixtures/data/quoref/quoref_sample_predictions.json (15 additions, 0 deletions)
@@ -0,0 +1,15 @@
{
  "ba3f052c7a557909526b59713430403dd134e01d":
  [
    "Catherine"
  ],
  "335654892c66647dd8531140c9bcd28e3f7500ec":
  [
    "Heseltine"
  ],
  "2142d85e9eacd549bc6164583d14407383d15692":
  [
    "Delius",
    "Heseltine"
  ]
}
@@ -0,0 +1,32 @@
# pylint: disable=invalid-name
import os

from allennlp.common.testing import AllenNlpTestCase
from allennlp.tools import quoref_eval


class TestQuorefEval(AllenNlpTestCase):
    """
    The actual evaluation logic in Quoref's evaluation script is from DROP's script, and the
    only additional thing that Quoref's script does is handling the data properly. So this class only tests the
    data handling aspects. The tests we have for DROP are fairly comprehensive.
    """
    def test_quoref_eval_with_original_data_format(self):
        predictions_file = self.FIXTURES_ROOT / "data" / "quoref" / "quoref_sample_predictions.json"
        gold_file = self.FIXTURES_ROOT / "data" / "quoref" / "quoref_sample.json"
        metrics = quoref_eval.evaluate_prediction_file(predictions_file, gold_file)
        assert metrics == (0.5, 0.625)

    def test_quoref_eval_with_simple_format(self):
        predictions_file = self.FIXTURES_ROOT / "data" / "quoref" / "quoref_sample_predictions.json"
        gold_file = self.FIXTURES_ROOT / "data" / "quoref" / "quoref_sample_predictions.json"
        metrics = quoref_eval.evaluate_prediction_file(predictions_file, gold_file)
        assert metrics == (1.0, 1.0)

    def test_quoref_eval_script(self):
        predictions_file = self.FIXTURES_ROOT / "data" / "quoref" / "quoref_sample_predictions.json"
        gold_file = self.FIXTURES_ROOT / "data" / "quoref" / "quoref_sample.json"
        script_file = "allennlp/tools/quoref_eval.py"
        result = os.system(f'python {script_file} --gold_path {gold_file} --prediction_path {predictions_file}'
                           ' --output_path /tmp/output.json')
        assert result == 0
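For orientation, the `(0.5, 0.625)` assertion in the first test can be reconstructed by hand from the two fixture files: the gold file has four questions, the predictions file answers three of them. The per-question scores below are an assumption about how the DROP-style scoring behaves here (a missing prediction scores zero, and the two-span prediction for the last question gets partial F1 credit against the single gold span):

```python
# Assumed per-question scores behind the (0.5, 0.625) assertion.
instance_em = [
    1.0,  # "Catherine" is an exact match
    0.0,  # question 7db0ed... has no entry in the predictions file
    1.0,  # "Heseltine" is an exact match
    0.0,  # ["Delius", "Heseltine"] is not an exact match for "Heseltine"
]
instance_f1 = [
    1.0,
    0.0,
    1.0,
    0.5,  # assumed partial credit from DROP's multi-span F1
]

global_em = sum(instance_em) / len(instance_em)
global_f1 = sum(instance_f1) / len(instance_f1)
print((global_em, global_f1))  # (0.5, 0.625)
```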
allennlp/tools/quoref_eval.py

@@ -0,0 +1,113 @@
"""
This evaluation script relies heavily on the one for DROP (``allennlp/tools/drop_eval.py``). We need a separate
script for Quoref only because the data formats are slightly different.
"""

import json
from typing import Dict, Tuple, List, Any, Optional
import argparse
import numpy as np
from allennlp.tools import drop_eval


def _get_answers_from_data(annotations: Dict[str, Any]) -> Dict[str, List[str]]:
    """
    If the annotations file is in the same format as the original data files, this method can be used to extract a
    dict of query ids and answers.
    """
    answers_dict: Dict[str, List[str]] = {}
    for article_info in annotations["data"]:
        for paragraph_info in article_info["paragraphs"]:
            for qa_pair in paragraph_info["qas"]:
                query_id = qa_pair["id"]
                candidate_answers = [answer["text"] for answer in qa_pair["answers"]]
                answers_dict[query_id] = candidate_answers
    return answers_dict


def evaluate_json(annotations: Dict[str, Any], predicted_answers: Dict[str, Any]) -> Tuple[float, float]:
    """
    Takes gold annotations and predicted answers and evaluates the predictions for each question
    in the gold annotations. Both JSON dictionaries must have query_id keys, which are used to
    match predictions to gold annotations.

    The ``predicted_answers`` JSON must be a dictionary keyed by query id, where the value is a
    list of strings (or just one string) that is the answer.
    The ``annotations`` are assumed to have either the format of the dev set in the Quoref data release, or the
    same format as the predicted answers file.
    """
    instance_exact_match = []
    instance_f1 = []
    if "data" in annotations:
        # We're looking at annotations in the original data format. Let's extract the answers.
        annotated_answers = _get_answers_from_data(annotations)
    else:
        annotated_answers = annotations
    for query_id, candidate_answers in annotated_answers.items():
        max_em_score = 0.0
        max_f1_score = 0.0
        if query_id in predicted_answers:
            predicted = predicted_answers[query_id]
            gold_answer = tuple(candidate_answers)
            em_score, f1_score = drop_eval.get_metrics(predicted, gold_answer)
            if gold_answer[0].strip() != "":
                max_em_score = max(max_em_score, em_score)
                max_f1_score = max(max_f1_score, f1_score)
        else:
            print("Missing prediction for question: {}".format(query_id))
            max_em_score = 0.0
            max_f1_score = 0.0
        instance_exact_match.append(max_em_score)
        instance_f1.append(max_f1_score)

    global_em = np.mean(instance_exact_match)
    global_f1 = np.mean(instance_f1)
    print("Exact-match accuracy {0:.2f}".format(global_em * 100))
    print("F1 score {0:.2f}".format(global_f1 * 100))
    print("{0:.2f} & {1:.2f}".format(global_em * 100, global_f1 * 100))
    return global_em, global_f1


def evaluate_prediction_file(prediction_path: str, gold_path: str,
                             output_path: Optional[str] = None) -> Tuple[float, float]:
    """
    Takes a prediction file and a gold file and evaluates the predictions for each question in the gold file. Both
    files must be json formatted and must have query_id keys, which are used to match predictions to gold
    annotations. Writes a json with global_em and global_f1 metrics to file at the specified output
    path, unless None is passed as output path.
    """
    predicted_answers = json.load(open(prediction_path, encoding='utf-8'))
    annotations = json.load(open(gold_path, encoding='utf-8'))
    global_em, global_f1 = evaluate_json(annotations, predicted_answers)

    # Output predictions to file if an output path is given
    if output_path is not None:
        output_dict = {"global_em": global_em,
                       "global_f1": global_f1}

        with open(output_path, "w", encoding="utf8") as outfile:
            json.dump(output_dict, outfile)

    return (global_em, global_f1)


if __name__ == "__main__":
    # pylint: disable=invalid-name
    parser = argparse.ArgumentParser(description='Evaluate Quoref predictions')
    parser.add_argument("--gold_path",
                        type=str,
                        required=False,
                        default="quoref-test-v0.1.json",
                        help='location of the gold file')
    parser.add_argument("--prediction_path",
                        type=str,
                        required=False,
                        default="sample_predictions.json",
                        help='location of the prediction file')
    parser.add_argument("--output_path",
                        type=str,
                        required=False,
                        default=None,
                        help='location of the output metrics file')

    args = parser.parse_args()
    evaluate_prediction_file(args.prediction_path, args.gold_path, args.output_path)
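The data handling that distinguishes this script from DROP's boils down to the nested traversal in `_get_answers_from_data`. A standalone sketch of that flattening, run on a toy gold dict (the ids and answers here are made up for illustration):

```python
from typing import Any, Dict, List

def flatten_gold(annotations: Dict[str, Any]) -> Dict[str, List[str]]:
    # Walk the SQuAD-style nesting (data -> paragraphs -> qas) and build a
    # flat mapping from query id to its list of candidate answer strings,
    # mirroring _get_answers_from_data above.
    answers_dict: Dict[str, List[str]] = {}
    for article_info in annotations["data"]:
        for paragraph_info in article_info["paragraphs"]:
            for qa_pair in paragraph_info["qas"]:
                answers_dict[qa_pair["id"]] = [a["text"] for a in qa_pair["answers"]]
    return answers_dict

toy_gold = {
    "data": [
        {"paragraphs": [
            {"qas": [
                {"id": "q1", "answers": [{"text": "Catherine", "answer_start": 250}]},
                {"id": "q2", "answers": [{"text": "Heseltine", "answer_start": 459}]},
            ]}
        ]}
    ]
}
print(flatten_gold(toy_gold))  # {'q1': ['Catherine'], 'q2': ['Heseltine']}
```

The flat output has exactly the shape of the predictions file, which is why the evaluator can fall back to treating the gold file itself as already flat.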
Why can't you use the drop script for this function (and the one above) also?
Quoref's json format is the same as that of SQuAD, and DROP's is different. So `evaluate_json` needs to be different, and `evaluate_prediction_file` calls that function, so that needed to be rewritten as well. Mentioned this in the docstring.
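To make the dispatch concrete: `evaluate_json` accepts either the nested gold format or the flat predictions format, and its core loop scores each gold question against the matching prediction, counting missing predictions as zero. A toy version of that loop, using plain string membership in place of `drop_eval.get_metrics` (an assumption; the real script computes normalized EM and F1):

```python
def toy_evaluate(gold: dict, predicted: dict) -> float:
    # Score each gold question; a missing prediction counts as zero,
    # mirroring the "Missing prediction" branch in evaluate_json.
    scores = []
    for query_id, candidate_answers in gold.items():
        prediction = predicted.get(query_id)
        if prediction is None:
            scores.append(0.0)
        else:
            scores.append(1.0 if prediction in candidate_answers else 0.0)
    return sum(scores) / len(scores)

gold = {"q1": ["Catherine"], "q2": ["Heseltine"], "q3": ["Warlock"]}
predicted = {"q1": "Catherine", "q2": "Delius"}  # q3 is missing entirely
print(toy_evaluate(gold, predicted))  # one of three questions scores 1.0
```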