From ff125a6d14125e5d0774573d9dbe8e1bac920ad8 Mon Sep 17 00:00:00 2001 From: Daniel Khashabi Date: Tue, 25 Jan 2022 20:03:51 -0800 Subject: [PATCH] Crowdworker evaluation of the tasks [Work in Progress] (#276) * - first commit for evaluation: adding the first version a working evaluation template. * - added a script for result aggregation. * - update the preparation script. * - update. * - update. * - update. * - update. * - add the evaluation script. * - drop a mistakenly-added directory. * - add more description to the instructions. * - minor. * - * - * add quals. * fix example display. * fix example display. * fix example display. * fix example display. * fix example display. * fix example display. * fix example display. * update. --- .gitignore | 3 + eval/amt/1.amti_create_input_data.py | 101 ++++ eval/amt/2.run_eval.sh | 13 + eval/amt/3.amti_aggregate_results.py | 124 +++++ eval/amt/README.md | 24 + .../qualificationtypeproperties.json | 8 + eval/amt/mturk-specs/definition-likert/NOTES | 1 + .../definition-likert/hitproperties.json | 4 + .../definition-likert/hittypeproperties.json | 14 + .../definition-likert/question.xml.j2 | 455 ++++++++++++++++++ eval/automatic/evaluation.py | 62 +++ 11 files changed, 809 insertions(+) create mode 100644 eval/amt/1.amti_create_input_data.py create mode 100644 eval/amt/2.run_eval.sh create mode 100644 eval/amt/3.amti_aggregate_results.py create mode 100644 eval/amt/README.md create mode 100644 eval/amt/disqualification/qualificationtypeproperties.json create mode 100644 eval/amt/mturk-specs/definition-likert/NOTES create mode 100644 eval/amt/mturk-specs/definition-likert/hitproperties.json create mode 100644 eval/amt/mturk-specs/definition-likert/hittypeproperties.json create mode 100644 eval/amt/mturk-specs/definition-likert/question.xml.j2 create mode 100644 eval/automatic/evaluation.py diff --git a/.gitignore b/.gitignore index 5204e5f7c..37f2c978e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ .idea .DS_Store +/eval/amt/past_experiments/ +eval/amt/past_experiments/* +/eval/amt/disqualification/blocklist.txt /src/utils/ \ No newline at end of file diff --git a/eval/amt/1.amti_create_input_data.py b/eval/amt/1.amti_create_input_data.py new file mode 100644 index 000000000..e0337d13e --- /dev/null +++ b/eval/amt/1.amti_create_input_data.py @@ -0,0 +1,101 @@ +''' +Given a task file, it creates a single file for crowd annotations (the input to AMTI). +For example: python 1.amti_create_input_data.py --start 5 --end 20 --eval_count 10 +''' +import json +import random +import argparse +from os import listdir +from os.path import isfile, join + +tasks_path = '../../tasks/' + + +def read_file(file): + with open(tasks_path + file, 'r') as f: + return json.load(f) + + +def normalize(str): + return str.replace('"', '\'').replace('`', '\'').replace('&', ' and ').encode('ascii', 'ignore').decode('ascii').replace('
<br />', '\n').replace('<', '[').replace('>', ']').replace('\n', '<br />
') + +files = [f for f in listdir(tasks_path) if isfile(join(tasks_path, f)) and ".json" in f] +task_ids_to_file = {} + +for f in files: + id = f.split("_")[0].replace("task", "") + # print(id) + id = int(id) + task_ids_to_file[id] = f + + +def process_single_file(start, end, max_count): + fout = open(f"start={start}_end={end}_max_size={max_count}.jsonl", "w") + for idx in range(start, end + 1): + if idx not in task_ids_to_file: + continue + file = task_ids_to_file[idx] + + # grouping instances into groups of size 5 + json_content = read_file(file) + + if json_content["Input_language"] != ["English"] or json_content["Output_language"] != ["English"]: + continue + + positive_examples = json_content['Positive Examples'] + negative_examples = json_content['Negative Examples'] + instances = json_content['Instances'] + + # make sure the annotators see all the examples + random.shuffle(positive_examples) + random.shuffle(negative_examples) + random.shuffle(instances) + + positive_examples = positive_examples[:5] + negative_examples = negative_examples[:3] + + n = 2 + chunks = [instances[i:i + n] for i in range(0, len(instances), n)] + for i, chunk in enumerate(chunks): + if i * n > int(max_count): + break + map = { + 'file': normalize(file), + 'instructions': normalize(json_content['Definition']), + } + + map[f'positive_example_count'] = len(positive_examples) + map[f'negative_example_count'] = len(negative_examples) + + for idx, ex in enumerate(positive_examples): + map[f'positive_ex_{idx}_input'] = normalize(ex['input']) + map[f'positive_ex_{idx}_output'] = normalize(ex['output']) + map[f'positive_ex_{idx}_explanation'] = normalize(ex['explanation']) + + for idx, ex in enumerate(negative_examples): + map[f'negative_ex_{idx}_input'] = normalize(ex['input']) + map[f'negative_ex_{idx}_output'] = normalize(ex['output']) + map[f'negative_ex_{idx}_explanation'] = normalize(ex['explanation']) + + for idx, ex in enumerate(chunk): + map[f'instance_{idx}_input'] = normalize(ex['input']) + map[f'instance_{idx}_output'] = "///".join([normalize(x) for x in ex['output']]) + + fout.write(json.dumps(map) + "\n") + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description='A script for preparing natural instructions tasks for human evaluation') + parser.add_argument('--start', help='id of the start task inside `tasks/`') + parser.add_argument('--end', help='id of the end task inside `tasks/`') + parser.add_argument('--eval_count', help='how many instances to use in this evaluation. ' + '100 should be enough for reliable estimates.') + args = parser.parse_args() + + print(" >>>>>>>>> Processing with the following arguments: ") + print(f" * start task id: {args.start}") + print(f" * end task id: {args.end}") + print(f" * count of samples from each task: {args.eval_count}") + + process_single_file(int(args.start), int(args.end), args.eval_count) diff --git a/eval/amt/2.run_eval.sh b/eval/amt/2.run_eval.sh new file mode 100644 index 000000000..8798d01ae --- /dev/null +++ b/eval/amt/2.run_eval.sh @@ -0,0 +1,13 @@ +# launch the experiment +export AWS_PROFILE=alexandria-mturk +amti --verbose create-batch mturk-specs/definition-likert start=1200_end=1536_max_size=5.jsonl . --live + +# check the experiment status +export BATCH=batch-... 
+#export BATCH=batch-13abedb3-9788-4118-8a23-89978a941638_start=1540_end=1800_max_size=5 +amti status-batch "$BATCH" --live + +# fetch the results when they're over +amti review-batch "$BATCH" --approve-all --live +amti save-batch "$BATCH" --live +amti extract tabular "$BATCH" "${BATCH}/batch-results.jsonl" diff --git a/eval/amt/3.amti_aggregate_results.py b/eval/amt/3.amti_aggregate_results.py new file mode 100644 index 000000000..1aab886d0 --- /dev/null +++ b/eval/amt/3.amti_aggregate_results.py @@ -0,0 +1,124 @@ +import sys + +# sys.path.append("..") +sys.path.append("../automatic") + +import json +from evaluation import metric_max_over_ground_truths, rouge + + +def get_stats(values): + maj_vote_value = max(set(values), key=values.count) + avg_score = sum(values) / len(values) + model_values_str = "\t".join([str(x) for x in values]) + return maj_vote_value, avg_score, model_values_str + + +def normalize(str): + return str.replace("\t", " ").replace("\n", "").replace("\r", "") + + +def normalize2(str): + return ' '.join(str.replace(",", "").lower().split()) + + +def discretize(x): + if x == 0.5: + return 0.5 + if x > 0.5: + return 1.0 + else: + return 0.0 + + +def aggregate_v2(response_file): + worker_stats = {} + suggestions = {} + with open(response_file) as f: + for line in f.readlines(): + json_line = json.loads(line) + + worker_id = json_line[f'WorkerId'] + if worker_id not in worker_stats: + worker_stats[worker_id] = 0 + worker_stats[worker_id] += 1 + + file = json_line[f'file'] + instructions = normalize(json_line[f'instructions']) + + instruction_quality_value = float(json_line[f'instruction_quality_q1']) + instruction_suggestions = normalize(json_line[f'instruction_quality_q2']) + + positive_ex_quality_value = float(json_line[f'positive_example_quality_q1']) + positive_ex_suggestions = normalize(json_line[f'positive_example_quality_q2']) + + negative_ex_quality_value = float(json_line[f'negative_example_quality_q1']) + negative_ex_suggestions = normalize(json_line[f'negative_example_quality_q2']) + + positive_examples = [] + for idx in range(0, 5): + id = f'positive_ex_{idx}_input' + if id in json_line: + positive_examples.append(json_line[id]) + positive_examples_appended = normalize("//".join(positive_examples)) + + negative_examples = [] + for idx in range(0, 3): + id = f'negative_ex_{idx}_input' + if id in json_line: + negative_examples.append(json_line[id]) + negative_examples_appended = normalize("//".join(negative_examples)) + + instruction_suggestions = instruction_suggestions.replace("\n", " ") + positive_ex_suggestions = positive_ex_suggestions.replace("\n", " ") + negative_ex_suggestions = negative_ex_suggestions.replace("\n", " ") + + prefix = f"{instruction_quality_value}\t{instruction_suggestions}\t{file}\t{instructions}" \ + f"\t{positive_ex_quality_value}\t{positive_ex_suggestions}\t{positive_examples_appended}" \ + f"\t{negative_ex_quality_value}\t{negative_ex_suggestions}\t{negative_examples_appended}" + + if len(instruction_suggestions) + len(positive_ex_suggestions) + len(negative_ex_suggestions) > 2: + if file not in suggestions: + suggestions[file] = [] + if len(instruction_suggestions.strip()) > 2: + suggestions[file].append(f" - regarding instructions: `{instruction_suggestions}`") + if len(positive_ex_suggestions.strip()) > 2: + suggestions[file].append(f" - regarding p examples: `{positive_ex_suggestions}`") + if len(negative_ex_suggestions.strip()) > 2: + suggestions[file].append(f" - regarding n examples: `{negative_ex_suggestions}`") + # 
instance_input = [] + # instance_output = [] + # instance_prediction = [] + for idx in range(0, 5): + id = f'instance_{idx}_input' + if id in json_line: + input = normalize(json_line[id]) + output = normalize(json_line[f'instance_{idx}_output']) + human_output = normalize(json_line[f'annotated_instance_{idx}_output']) + # instance_input.append(input) + # instance_output.append(output) + # instance_prediction.append(human_output) + rouge_val = metric_max_over_ground_truths( + rouge, normalize2(human_output), + [normalize2(x) for x in output.split("///")] + ) + print(f"{prefix}\t{input}\t{output}\t{human_output}\t{rouge_val}\t{worker_id}") + + for file in sorted(suggestions.keys()): + print(f" - [ ] {file}") + for feedback in suggestions[file]: + print(feedback) + + + +# aggregate_v2("batch-43ecd7ef-0717-4b72-a39c-b6179f8b5f77_task156_pilot/batch-results.jsonl") +# aggregate_v2("batch-65c49abc-f8a3-4f12-a71d-9ce5742c3419_start=60_end=100_max_size=5/batch-results.jsonl") +# aggregate_v2("batch-eea0ef32-da0a-47cf-bcae-810d1a503379_start=1_end=59_max_size=5/batch-results.jsonl") +# aggregate_v2("batch-cdcb497e-49a3-4cee-8ab5-1451dc19dac2_119_end=200_max_size=5/batch-results.jsonl") +# aggregate_v2("batch-58086c69-62bf-4e91-8741-b68d27e1fd63-start=201_end=300_max_size=5/batch-results.jsonl") +# aggregate_v2("batch-fc02d066-1e35-4184-b4ea-ba7eca1abcc1_start=301_end=400_max_size=5/batch-results.jsonl") +# aggregate_v2("batch-4739062e-2141-4f97-9a37-41197abf9a93_start=400_end=600_max_size=5/batch-results.jsonl") +# aggregate_v2("batch-23353cc5-13c1-4af9-94c3-03eb2bacbd0a_start=600_end=850_max_size=5/batch-results.jsonl") +# aggregate_v2("batch-4ee23f3d-2900-4fef-a0ae-d05bc7d519e8_start=850_end=1200_max_size=5/batch-results.jsonl") +# aggregate_v2("batch-13abedb3-9788-4118-8a23-89978a941638_start=1200_end=1536_max_size=5/batch-results.jsonl") +aggregate_v2("batch-eb1b61cd-36e7-4fdf-b8e1-fedd3c77245f_start=1540_end=1726_max_size=5/batch-results.jsonl") diff --git a/eval/amt/README.md b/eval/amt/README.md new file mode 100644 index 000000000..4382de275 --- /dev/null +++ b/eval/amt/README.md @@ -0,0 +1,24 @@ +# Human Evaluation of Instructions + - Step1: Select a task and convert it into AMTI-friendly format: +```terminal +> python 1.amti_create_input_data.py --file [NAME] --eval_count [COUNT] +``` +For example: +```terminal +> python 1.amti_create_input_data.py --file task156_codah_classification_adversarial.json --eval_count 10 +``` + - Step2: follow the steps in `2.run_eval.sh` + - Step3: evaluate + + +To disqualify a certain user, add them to `genie_disqualification/blocklist.txt` and then: + + amti associate-qual --file genie_disqualification/blocklist.txt --qual 3090SA10WM5MIHCWNTON1VROMP4CN3 --live + +Or disqualify a particular user: + + amti associate-qual WorkerId --qual 3090SA10WM5MIHCWNTON1VROMP4CN3 --live + +To list workers that are disqualified: + + amti associated --qual 3090SA10WM5MIHCWNTON1VROMP4CN3 --status Granted --live diff --git a/eval/amt/disqualification/qualificationtypeproperties.json b/eval/amt/disqualification/qualificationtypeproperties.json new file mode 100644 index 000000000..d04c77f51 --- /dev/null +++ b/eval/amt/disqualification/qualificationtypeproperties.json @@ -0,0 +1,8 @@ +{ + "Name": "Qualification for Participating in natural instructions annotations ", + "Keywords": "Human evaluation", + "Description": "Workers that consistently produced low quality work, or is suspected of producing spam.", + "QualificationTypeStatus": "Active", + "RetryDelayInSeconds": 86400, 
+ "TestDurationInSeconds": 3600 +} \ No newline at end of file diff --git a/eval/amt/mturk-specs/definition-likert/NOTES b/eval/amt/mturk-specs/definition-likert/NOTES new file mode 100644 index 000000000..d8035ecf6 --- /dev/null +++ b/eval/amt/mturk-specs/definition-likert/NOTES @@ -0,0 +1 @@ +Human evaluation for natural instructions. \ No newline at end of file diff --git a/eval/amt/mturk-specs/definition-likert/hitproperties.json b/eval/amt/mturk-specs/definition-likert/hitproperties.json new file mode 100644 index 000000000..3bfa3be4e --- /dev/null +++ b/eval/amt/mturk-specs/definition-likert/hitproperties.json @@ -0,0 +1,4 @@ +{ + "MaxAssignments": 1, + "LifetimeInSeconds": 86400 +} diff --git a/eval/amt/mturk-specs/definition-likert/hittypeproperties.json b/eval/amt/mturk-specs/definition-likert/hittypeproperties.json new file mode 100644 index 000000000..17ec4e9a1 --- /dev/null +++ b/eval/amt/mturk-specs/definition-likert/hittypeproperties.json @@ -0,0 +1,14 @@ +{ + "AutoApprovalDelayInSeconds": 86400, + "AssignmentDurationInSeconds": 3600, + "Reward": "0.70", + "Title": "language instructions - v17", + "Keywords": "language instructions", + "Description": "In this HIT, you will read input instructions and see a few examples. Based on that, you're expected to label/answer a new instance.", + "QualificationRequirements": [ + { "QualificationTypeId": "00000000000000000071", "Comparator": "In", "LocaleValues": [ { "Country": "US" }, { "Country": "GB" }, { "Country": "AU" }, { "Country": "CA" } ], "RequiredToPreview": false, "ActionsGuarded": "Accept" }, + { "QualificationTypeId": "000000000000000000L0", "Comparator": "GreaterThanOrEqualTo", "IntegerValues": [ 99 ], "RequiredToPreview": false, "ActionsGuarded": "Accept" }, + { "QualificationTypeId": "00000000000000000071", "Comparator": "In", "LocaleValues": [ { "Country": "US" }, { "Country": "GB" }, { "Country": "AU" }, { "Country": "CA" } ], "RequiredToPreview": false, "ActionsGuarded": "Accept" }, + { "QualificationTypeId": "3090SA10WM5MIHCWNTON1VROMP4CN3", "Comparator": "DoesNotExist" } + ] +} diff --git a/eval/amt/mturk-specs/definition-likert/question.xml.j2 b/eval/amt/mturk-specs/definition-likert/question.xml.j2 new file mode 100644 index 000000000..b990d6da8 --- /dev/null +++ b/eval/amt/mturk-specs/definition-likert/question.xml.j2 @@ -0,0 +1,455 @@ + + + + + + + + + + + + + +
+

+  Welcome and thank you for your participation!
+
+  Instructions and Examples
+
+  The main purpose of this task is to get your feedback on the quality of the provided instructions below.
+  We use this feedback to improve the quality of the instructions for the next round.
+  Please read the instructions and tell us what you think. You can tell us about: (1) typos, (2) ambiguous or odd phrasings, and (3) anything that could be added or changed to make the task easier to understand.
+  Feel free to be as critical as you want about the instructions.
+
+  Instructions: {{instructions}}
+
+  Positive (Desirable) Examples
+  Here are examples of desirable behaviors for this task:
+
+  Negative (Undesirable) Examples
+  Here are several negative examples (undesirable outputs) for this task:
+
+  Great! You are now ready to start on the following examples!
+
+  Your Task
+
+  First, please tell us how we can improve the above instructions.
+  We expect you to provide suggestions for improving the instructions every few HITs, so please do not leave the suggestion boxes empty all the time.
+  If the instructions are impeccable, leave the boxes empty (and avoid unconstructive feedback such as "it's good", "nothing", etc.).
+
+  Instruction quality:
+  Q1: What did you think about the quality of the task definition?
+  (Avoid putting random/garbage text here. Consistently random responses will result in being blocked. It is better to leave this box empty than to write something irrelevant. We give bonuses to workers who provide constructive feedback for improving the instructions.)
+  Q2: What can be improved in the instructions, if any? (Please indicate any typos, lack of clarity, or any other issues.)
+
+  Positive examples quality:
+  Q1: What did you think about the quality of the provided positive examples?
+  Q2: What can be improved about the positive examples, if any? (Please indicate any typos, lack of clarity, or any other issues.)
+
+  Negative examples quality:
+  Q1: What did you think about the quality of the provided negative examples?
+  Q2: What can be improved about the negative examples, if any? (Please indicate any typos, lack of clarity, or any other issues.)
+
+  Now please answer the following inputs, according to the provided instructions above.
+
+  Please direct any comments, feedback, issues, or questions to this email address: danielk@allenai.org
+
+  ]]></HTMLContent>
+  <FrameHeight>450</FrameHeight>
+</HTMLQuestion>
\ No newline at end of file diff --git a/eval/automatic/evaluation.py b/eval/automatic/evaluation.py new file mode 100644 index 000000000..f256ad73e --- /dev/null +++ b/eval/automatic/evaluation.py @@ -0,0 +1,62 @@ +from __future__ import print_function +import argparse +import json +import datasets + +rouge_metric = datasets.load_metric('rouge') + + +def rouge(prediction, ground_truth): + score = rouge_metric.compute( + predictions=[prediction], + references=[ground_truth], + **{'use_agregator': False, 'use_stemmer': True, 'rouge_types': ['rougeL']} + ) + return score['rougeL'][0].fmeasure + + +def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): + scores_for_ground_truths = [] + for ground_truth in ground_truths: + score = metric_fn(prediction, ground_truth) + scores_for_ground_truths.append(score) + return max(scores_for_ground_truths) + + +def evaluate(dataset, predictions): + metrics = {} + for i in range(len(predictions)): + pred = predictions[i]['output'] + # gold_input = dataset['Instances'][predictions[i]['index']]['input'] + gold_outputs = dataset['Instances'][predictions[i]['index']]['output'] + if 'rouge' not in metrics: + metrics['rouge'] = 0 + metrics['rouge'] += metric_max_over_ground_truths(rouge, pred, gold_outputs) + + for key in metrics.keys(): + metrics[key] /= len(predictions) + + return metrics + + +def main(): + parser = argparse.ArgumentParser() + + parser.add_argument("--dataset", + type=str, + required=True, + help="Dataset Json File Name") + parser.add_argument("--predictions", + type=str, + required=True, + help="Prediction File Name") + args = parser.parse_args() + with open(args.dataset) as dataset_file: + dataset_json = json.load(dataset_file) + with open(args.predictions) as prediction_file: + predictions_json = json.load(prediction_file) + print(evaluate(dataset_json, predictions_json['predictions'])) + + +if __name__ == "__main__": + main()
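
For quick reference, below is a minimal smoke-test sketch for the new `eval/automatic/evaluation.py` script. It assumes the command is run from the repository root with the script's dependencies (`datasets`, `rouge_score`, `nltk`) installed; the toy file names and contents are purely illustrative and are not part of this patch.

```python
# Illustrative smoke test (hypothetical file names; not part of the patch).
# It writes a tiny task file and prediction file in the formats expected by
# evaluate() and then invokes the script, which prints the average ROUGE-L.
import json
import subprocess

# Task-style file: evaluate() only reads dataset["Instances"][i]["output"],
# where "output" is a list of acceptable reference strings.
toy_task = {
    "Instances": [
        {"input": "Is the sky blue?", "output": ["yes", "Yes."]},
        {"input": "Is fire cold?", "output": ["no"]},
    ]
}

# Prediction file: a top-level "predictions" list, each entry pointing back to
# an instance by "index" and carrying the model's "output" string.
toy_predictions = {
    "predictions": [
        {"index": 0, "output": "yes"},
        {"index": 1, "output": "no"},
    ]
}

with open("toy_task.json", "w") as f:
    json.dump(toy_task, f)
with open("toy_predictions.json", "w") as f:
    json.dump(toy_predictions, f)

# Should print something close to {'rouge': 1.0} for these exact-match predictions.
subprocess.run(
    ["python", "eval/automatic/evaluation.py",
     "--dataset", "toy_task.json",
     "--predictions", "toy_predictions.json"],
    check=True,
)
```

The same two-file layout (a task JSON with an `Instances` list, and a predictions JSON whose `predictions` entries carry an `index` and an `output`) is also what `3.amti_aggregate_results.py` mirrors when it scores crowdworker answers with `metric_max_over_ground_truths`.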