Skip to content

Commit

Permalink
Crowdworker evaluation of the tasks [Work in Progress] (#276)
Browse files Browse the repository at this point in the history
* - first commit for evaluation: adding the first version a working evaluation template.

* - added a script for result aggregation.

* - update the preparation script.

* - update.

* - update.

* - update.

* - update.

* - add the evaluation script.

* - drop a mistakenly-added directory.

* - add more description to the instructions.

* - minor.

* -

* -

* add quals.

* fix example display.

* fix example display.

* fix example display.

* fix example display.

* fix example display.

* fix example display.

* fix example display.

* update.
  • Loading branch information
Daniel Khashabi authored Jan 26, 2022
1 parent fe65586 commit ff125a6
Show file tree
Hide file tree
Showing 11 changed files with 809 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
.idea
.DS_Store
/eval/amt/past_experiments/
eval/amt/past_experiments/*
/eval/amt/disqualification/blocklist.txt
/src/utils/
101 changes: 101 additions & 0 deletions eval/amt/1.amti_create_input_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
'''
Given a task file, it creates a single file for crowd annotations (the input to AMTI).
For example: python 1.amti_create_input_data.py --start 5 --end 20 --eval_count 10
'''
import json
import random
import argparse
from os import listdir
from os.path import isfile, join

# Relative path to the directory that holds the task JSON files.
tasks_path = '../../tasks/'


def read_file(file):
    """Load and return the parsed JSON content of `file` inside `tasks_path`.

    Args:
        file: task file name relative to `tasks_path`
            (e.g. "task001_..._classification.json").

    Returns:
        The decoded JSON object (a dict for the task files).
    """
    # Use os.path.join (already imported) instead of string concatenation.
    with open(join(tasks_path, file), 'r') as f:
        return json.load(f)


def normalize(str):
    """Sanitize a task string for embedding into the AMT HIT input.

    Double quotes and backticks become single quotes, '&' becomes ' and ',
    non-ASCII characters are dropped, angle brackets become square brackets
    (so no stray HTML survives), and line breaks -- whether written as
    '<br>' or as a literal newline -- are emitted uniformly as '<br>'.
    """
    # NOTE(review): the parameter shadows the builtin `str`; name kept so the
    # public signature is unchanged.
    result = str.replace('"', '\'')
    result = result.replace('`', '\'')
    result = result.replace('&', ' and ')
    result = result.encode('ascii', 'ignore').decode('ascii')
    # '<br>' is temporarily turned into '\n' so that the bracket substitution
    # below cannot mangle it; the final replace converts all newlines back.
    result = result.replace('<br>', '\n')
    result = result.replace('<', '[').replace('>', ']')
    return result.replace('\n', '<br>')

# Collect every task JSON file and index it by its numeric task id,
# e.g. "task001_quoref_question_generation.json" -> 1.
files = [
    name for name in listdir(tasks_path)
    if isfile(join(tasks_path, name)) and ".json" in name
]

task_ids_to_file = {
    int(name.split("_")[0].replace("task", "")): name
    for name in files
}


def process_single_file(start, end, max_count):
    """Prepare crowd-annotation input (for AMTI) for tasks with ids in [start, end].

    For every English->English task found under `tasks/`, the instances are
    shuffled and grouped into chunks of `n` (=2); each chunk becomes one JSON
    line (one HIT) bundled with the normalized task instructions and up to
    5 positive / 3 negative examples.  The output file is named
    `start={start}_end={end}_max_size={max_count}.jsonl`.

    Args:
        start: id of the first task to include (inclusive).
        end: id of the last task to include (inclusive).
        max_count: cap (int or numeric string) on the instances sampled per
            task.  NOTE(review): the loop breaks only once `i * n > max_count`,
            so the cap can be overshot by up to `n` instances -- behavior kept
            as in the original.
    """
    instance_cap = int(max_count)  # hoisted: was re-converted on every chunk
    n = 2  # number of instances grouped into a single HIT
    # `with` guarantees the output file is flushed and closed (the original
    # leaked the handle).
    with open(f"start={start}_end={end}_max_size={max_count}.jsonl", "w") as fout:
        for task_id in range(start, end + 1):
            if task_id not in task_ids_to_file:
                continue
            file = task_ids_to_file[task_id]

            json_content = read_file(file)

            # keep only the tasks whose inputs and outputs are both English
            if json_content["Input_language"] != ["English"] or json_content["Output_language"] != ["English"]:
                continue

            positive_examples = json_content['Positive Examples']
            negative_examples = json_content['Negative Examples']
            instances = json_content['Instances']

            # shuffle so the annotators collectively see all the examples
            random.shuffle(positive_examples)
            random.shuffle(negative_examples)
            random.shuffle(instances)

            positive_examples = positive_examples[:5]
            negative_examples = negative_examples[:3]

            # grouping instances into chunks of size `n`
            chunks = [instances[i:i + n] for i in range(0, len(instances), n)]
            for i, chunk in enumerate(chunks):
                if i * n > instance_cap:
                    break
                # `record` replaces the original name `map`, which shadowed the builtin
                record = {
                    'file': normalize(file),
                    'instructions': normalize(json_content['Definition']),
                    'positive_example_count': len(positive_examples),
                    'negative_example_count': len(negative_examples),
                }

                for ex_idx, ex in enumerate(positive_examples):
                    record[f'positive_ex_{ex_idx}_input'] = normalize(ex['input'])
                    record[f'positive_ex_{ex_idx}_output'] = normalize(ex['output'])
                    record[f'positive_ex_{ex_idx}_explanation'] = normalize(ex['explanation'])

                for ex_idx, ex in enumerate(negative_examples):
                    record[f'negative_ex_{ex_idx}_input'] = normalize(ex['input'])
                    record[f'negative_ex_{ex_idx}_output'] = normalize(ex['output'])
                    record[f'negative_ex_{ex_idx}_explanation'] = normalize(ex['explanation'])

                for ins_idx, ex in enumerate(chunk):
                    record[f'instance_{ins_idx}_input'] = normalize(ex['input'])
                    # multiple reference outputs are joined with '///' (the
                    # aggregation script splits on the same separator)
                    record[f'instance_{ins_idx}_output'] = "///".join([normalize(x) for x in ex['output']])

                fout.write(json.dumps(record) + "\n")


if __name__ == '__main__':
    # Command-line entry point; e.g.:
    #   python 1.amti_create_input_data.py --start 5 --end 20 --eval_count 10
    parser = argparse.ArgumentParser(
        description='A script for preparing natural instructions tasks for human evaluation')
    parser.add_argument('--start', help='id of the start task inside `tasks/`')
    parser.add_argument('--end', help='id of the end task inside `tasks/`')
    parser.add_argument(
        '--eval_count',
        help='how many instances to use in this evaluation. '
             '100 should be enough for reliable estimates.')
    args = parser.parse_args()

    print(" >>>>>>>>> Processing with the following arguments: ")
    for label, value in (("start task id", args.start),
                         ("end task id", args.end),
                         ("count of samples from each task", args.eval_count)):
        print(f" * {label}: {value}")

    process_single_file(int(args.start), int(args.end), args.eval_count)
13 changes: 13 additions & 0 deletions eval/amt/2.run_eval.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Runbook for one AMT evaluation batch: launch, monitor, then collect results.
# Intended to be executed step by step (copy-paste), not as a single script.

# launch the experiment
export AWS_PROFILE=alexandria-mturk
amti --verbose create-batch mturk-specs/definition-likert start=1200_end=1536_max_size=5.jsonl . --live

# check the experiment status
# NOTE(review): replace the placeholder below with the concrete batch id that
# `create-batch` printed before running the remaining commands.
export BATCH=batch-...
#export BATCH=batch-13abedb3-9788-4118-8a23-89978a941638_start=1540_end=1800_max_size=5
amti status-batch "$BATCH" --live

# fetch the results when they're over
amti review-batch "$BATCH" --approve-all --live
amti save-batch "$BATCH" --live
amti extract tabular "$BATCH" "${BATCH}/batch-results.jsonl"
124 changes: 124 additions & 0 deletions eval/amt/3.amti_aggregate_results.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
import sys

# Make `../automatic` importable: it provides the `evaluation` module with the
# ROUGE scoring helpers used below.
# sys.path.append("..")
sys.path.append("../automatic")

import json
from evaluation import metric_max_over_ground_truths, rouge


def get_stats(values):
    """Summarize a list of numeric ratings.

    Returns a 3-tuple of (majority-vote value, mean value, the values joined
    by tabs as strings).  Ties in the majority vote are broken arbitrarily.
    """
    majority = max(set(values), key=values.count)
    mean = sum(values) / len(values)
    joined = "\t".join(map(str, values))
    return majority, mean, joined


def normalize(str):
    """Flatten a free-text field onto one line: tabs become spaces and any
    newline/carriage-return becomes the literal marker '<newline>'."""
    # NOTE(review): the parameter shadows the builtin `str`; name kept so the
    # public signature is unchanged.
    for old, new in (("\t", " "), ("\n", "<newline>"), ("\r", "<newline>")):
        str = str.replace(old, new)
    return str


def normalize2(str):
    """Normalize a string for scoring: drop commas, lowercase, and collapse
    all runs of whitespace into single spaces."""
    cleaned = str.replace(",", "").lower()
    return ' '.join(cleaned.split())


def discretize(x):
    """Map a score to {0.0, 0.5, 1.0}: exactly 0.5 stays 0.5, anything above
    becomes 1.0, anything below becomes 0.0."""
    if x == 0.5:
        return 0.5
    return 1.0 if x > 0.5 else 0.0


def aggregate_v2(response_file):
    """Aggregate one batch of AMT responses and print a TSV report to stdout.

    For every annotated instance in `response_file` (a `batch-results.jsonl`
    produced by `amti extract tabular`), prints one tab-separated row holding
    the Likert rating and free-form suggestion for the instructions, positive
    examples, and negative examples, followed by the instance input, the
    reference output(s), the worker's answer, its ROUGE score against the
    references, and the worker id.  Afterwards prints a per-task markdown
    checklist of the workers' improvement suggestions.
    """
    # HITs completed per worker; accumulated here but not reported below
    # (presumably kept for spam screening -- NOTE(review): currently unused).
    worker_stats = {}
    # task file name -> list of formatted suggestion strings
    suggestions = {}
    with open(response_file) as f:
        for line in f.readlines():
            json_line = json.loads(line)

            worker_id = json_line[f'WorkerId']
            if worker_id not in worker_stats:
                worker_stats[worker_id] = 0
            worker_stats[worker_id] += 1

            file = json_line[f'file']
            instructions = normalize(json_line[f'instructions'])

            # Likert rating (q1) and free-text suggestion (q2) per section.
            instruction_quality_value = float(json_line[f'instruction_quality_q1'])
            instruction_suggestions = normalize(json_line[f'instruction_quality_q2'])

            positive_ex_quality_value = float(json_line[f'positive_example_quality_q1'])
            positive_ex_suggestions = normalize(json_line[f'positive_example_quality_q2'])

            negative_ex_quality_value = float(json_line[f'negative_example_quality_q1'])
            negative_ex_suggestions = normalize(json_line[f'negative_example_quality_q2'])

            # Collect the positive-example inputs present in this HIT (at most
            # 5, matching the creation script); absent indices are skipped.
            positive_examples = []
            for idx in range(0, 5):
                id = f'positive_ex_{idx}_input'
                if id in json_line:
                    positive_examples.append(json_line[id])
            positive_examples_appended = normalize("//".join(positive_examples))

            # Same for the (at most 3) negative examples.
            negative_examples = []
            for idx in range(0, 3):
                id = f'negative_ex_{idx}_input'
                if id in json_line:
                    negative_examples.append(json_line[id])
            negative_examples_appended = normalize("//".join(negative_examples))

            # Flatten remaining newlines so each record stays on one TSV row.
            instruction_suggestions = instruction_suggestions.replace("\n", " ")
            positive_ex_suggestions = positive_ex_suggestions.replace("\n", " ")
            negative_ex_suggestions = negative_ex_suggestions.replace("\n", " ")

            # Shared per-HIT prefix repeated on every instance row below.
            prefix = f"{instruction_quality_value}\t{instruction_suggestions}\t{file}\t{instructions}" \
                     f"\t{positive_ex_quality_value}\t{positive_ex_suggestions}\t{positive_examples_appended}" \
                     f"\t{negative_ex_quality_value}\t{negative_ex_suggestions}\t{negative_examples_appended}"

            # Record non-trivial suggestions (length > 2 filters out empty and
            # one-character placeholder answers) for the summary at the end.
            if len(instruction_suggestions) + len(positive_ex_suggestions) + len(negative_ex_suggestions) > 2:
                if file not in suggestions:
                    suggestions[file] = []
                if len(instruction_suggestions.strip()) > 2:
                    suggestions[file].append(f" - regarding instructions: `{instruction_suggestions}`")
                if len(positive_ex_suggestions.strip()) > 2:
                    suggestions[file].append(f" - regarding p examples: `{positive_ex_suggestions}`")
                if len(negative_ex_suggestions.strip()) > 2:
                    suggestions[file].append(f" - regarding n examples: `{negative_ex_suggestions}`")
            # instance_input = []
            # instance_output = []
            # instance_prediction = []
            # Score and print each annotated instance in this HIT.
            # NOTE(review): `id` and `input` shadow builtins here; left
            # unchanged in this documentation-only pass.
            for idx in range(0, 5):
                id = f'instance_{idx}_input'
                if id in json_line:
                    input = normalize(json_line[id])
                    output = normalize(json_line[f'instance_{idx}_output'])
                    human_output = normalize(json_line[f'annotated_instance_{idx}_output'])
                    # Best ROUGE of the worker's answer over all references
                    # ('///' is the separator used by the creation script).
                    rouge_val = metric_max_over_ground_truths(
                        rouge, normalize2(human_output),
                        [normalize2(x) for x in output.split("///")]
                    )
                    print(f"{prefix}\t{input}\t{output}\t{human_output}\t{rouge_val}\t{worker_id}")

    # Final summary: a markdown checklist of suggestions grouped by task file.
    for file in sorted(suggestions.keys()):
        print(f" - [ ] {file}")
        for feedback in suggestions[file]:
            print(feedback)



# History of previously-aggregated batches, kept for provenance:
# aggregate_v2("batch-43ecd7ef-0717-4b72-a39c-b6179f8b5f77_task156_pilot/batch-results.jsonl")
# aggregate_v2("batch-65c49abc-f8a3-4f12-a71d-9ce5742c3419_start=60_end=100_max_size=5/batch-results.jsonl")
# aggregate_v2("batch-eea0ef32-da0a-47cf-bcae-810d1a503379_start=1_end=59_max_size=5/batch-results.jsonl")
# aggregate_v2("batch-cdcb497e-49a3-4cee-8ab5-1451dc19dac2_119_end=200_max_size=5/batch-results.jsonl")
# aggregate_v2("batch-58086c69-62bf-4e91-8741-b68d27e1fd63-start=201_end=300_max_size=5/batch-results.jsonl")
# aggregate_v2("batch-fc02d066-1e35-4184-b4ea-ba7eca1abcc1_start=301_end=400_max_size=5/batch-results.jsonl")
# aggregate_v2("batch-4739062e-2141-4f97-9a37-41197abf9a93_start=400_end=600_max_size=5/batch-results.jsonl")
# aggregate_v2("batch-23353cc5-13c1-4af9-94c3-03eb2bacbd0a_start=600_end=850_max_size=5/batch-results.jsonl")
# aggregate_v2("batch-4ee23f3d-2900-4fef-a0ae-d05bc7d519e8_start=850_end=1200_max_size=5/batch-results.jsonl")
# aggregate_v2("batch-13abedb3-9788-4118-8a23-89978a941638_start=1200_end=1536_max_size=5/batch-results.jsonl")
# NOTE(review): runs at import time and requires the (git-ignored) batch
# directory below to exist locally; consider guarding with
# `if __name__ == '__main__':`.
aggregate_v2("batch-eb1b61cd-36e7-4fdf-b8e1-fedd3c77245f_start=1540_end=1726_max_size=5/batch-results.jsonl")
24 changes: 24 additions & 0 deletions eval/amt/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Human Evaluation of Instructions
- Step1: Select a range of tasks and convert them into AMTI-friendly format:
```terminal
> python 1.amti_create_input_data.py --start [START_ID] --end [END_ID] --eval_count [COUNT]
```
For example:
```terminal
> python 1.amti_create_input_data.py --start 5 --end 20 --eval_count 10
```
- Step2: follow the steps in `2.run_eval.sh`
- Step3: evaluate


To disqualify a certain user, add them to `disqualification/blocklist.txt` and then:

    amti associate-qual --file disqualification/blocklist.txt --qual 3090SA10WM5MIHCWNTON1VROMP4CN3 --live

Or disqualify a particular user:

amti associate-qual WorkerId --qual 3090SA10WM5MIHCWNTON1VROMP4CN3 --live

To list workers that are disqualified:

amti associated --qual 3090SA10WM5MIHCWNTON1VROMP4CN3 --status Granted --live
8 changes: 8 additions & 0 deletions eval/amt/disqualification/qualificationtypeproperties.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"Name": "Qualification for Participating in natural instructions annotations ",
"Keywords": "Human evaluation",
"Description": "Workers that consistently produced low-quality work, or are suspected of producing spam.",
"QualificationTypeStatus": "Active",
"RetryDelayInSeconds": 86400,
"TestDurationInSeconds": 3600
}
1 change: 1 addition & 0 deletions eval/amt/mturk-specs/definition-likert/NOTES
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Human evaluation for natural instructions.
4 changes: 4 additions & 0 deletions eval/amt/mturk-specs/definition-likert/hitproperties.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"MaxAssignments": 1,
"LifetimeInSeconds": 86400
}
14 changes: 14 additions & 0 deletions eval/amt/mturk-specs/definition-likert/hittypeproperties.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"AutoApprovalDelayInSeconds": 86400,
"AssignmentDurationInSeconds": 3600,
"Reward": "0.70",
"Title": "language instructions - v17",
"Keywords": "language instructions",
"Description": "In this HIT, you will read input instructions and see a few examples. Based on that, you're expected to label/answer a new instance.",
"QualificationRequirements": [
{ "QualificationTypeId": "00000000000000000071", "Comparator": "In", "LocaleValues": [ { "Country": "US" }, { "Country": "GB" }, { "Country": "AU" }, { "Country": "CA" } ], "RequiredToPreview": false, "ActionsGuarded": "Accept" },
{ "QualificationTypeId": "000000000000000000L0", "Comparator": "GreaterThanOrEqualTo", "IntegerValues": [ 99 ], "RequiredToPreview": false, "ActionsGuarded": "Accept" },
{ "QualificationTypeId": "00000000000000000071", "Comparator": "In", "LocaleValues": [ { "Country": "US" }, { "Country": "GB" }, { "Country": "AU" }, { "Country": "CA" } ], "RequiredToPreview": false, "ActionsGuarded": "Accept" },
{ "QualificationTypeId": "3090SA10WM5MIHCWNTON1VROMP4CN3", "Comparator": "DoesNotExist" }
]
}
Loading

0 comments on commit ff125a6

Please sign in to comment.