From ff125a6d14125e5d0774573d9dbe8e1bac920ad8 Mon Sep 17 00:00:00 2001 From: Daniel Khashabi Date: Tue, 25 Jan 2022 20:03:51 -0800 Subject: [PATCH] Crowdworker evaluation of the tasks [Work in Progress] (#276) * - first commit for evaluation: adding the first version a working evaluation template. * - added a script for result aggregation. * - update the preparation script. * - update. * - update. * - update. * - update. * - add the evaluation script. * - drop a mistakenly-added directory. * - add more description to the instructions. * - minor. * - * - * add quals. * fix example display. * fix example display. * fix example display. * fix example display. * fix example display. * fix example display. * fix example display. * update. --- .gitignore | 3 + eval/amt/1.amti_create_input_data.py | 101 ++++ eval/amt/2.run_eval.sh | 13 + eval/amt/3.amti_aggregate_results.py | 124 +++++ eval/amt/README.md | 24 + .../qualificationtypeproperties.json | 8 + eval/amt/mturk-specs/definition-likert/NOTES | 1 + .../definition-likert/hitproperties.json | 4 + .../definition-likert/hittypeproperties.json | 14 + .../definition-likert/question.xml.j2 | 455 ++++++++++++++++++ eval/automatic/evaluation.py | 62 +++ 11 files changed, 809 insertions(+) create mode 100644 eval/amt/1.amti_create_input_data.py create mode 100644 eval/amt/2.run_eval.sh create mode 100644 eval/amt/3.amti_aggregate_results.py create mode 100644 eval/amt/README.md create mode 100644 eval/amt/disqualification/qualificationtypeproperties.json create mode 100644 eval/amt/mturk-specs/definition-likert/NOTES create mode 100644 eval/amt/mturk-specs/definition-likert/hitproperties.json create mode 100644 eval/amt/mturk-specs/definition-likert/hittypeproperties.json create mode 100644 eval/amt/mturk-specs/definition-likert/question.xml.j2 create mode 100644 eval/automatic/evaluation.py diff --git a/.gitignore b/.gitignore index 5204e5f7c..37f2c978e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ .idea .DS_Store +/eval/amt/past_experiments/ +eval/amt/past_experiments/* +/eval/amt/disqualification/blocklist.txt /src/utils/ \ No newline at end of file diff --git a/eval/amt/1.amti_create_input_data.py b/eval/amt/1.amti_create_input_data.py new file mode 100644 index 000000000..e0337d13e --- /dev/null +++ b/eval/amt/1.amti_create_input_data.py @@ -0,0 +1,101 @@ +''' +Given a task file, it creates a single file for crowd annotations (the input to AMTI). +For example: python 1.amti_create_input_data.py --start 5 --end 20 --eval_count 10 +''' +import json +import random +import argparse +from os import listdir +from os.path import isfile, join + +tasks_path = '../../tasks/' + + +def read_file(file): + with open(tasks_path + file, 'r') as f: + return json.load(f) + + +def normalize(str): + return str.replace('"', '\'').replace('`', '\'').replace('&', ' and ').encode('ascii', 'ignore').decode('ascii').replace('
<br />', '\n').replace('<', '[').replace('>', ']').replace('\n', '<br />
') + +files = [f for f in listdir(tasks_path) if isfile(join(tasks_path, f)) and ".json" in f] +task_ids_to_file = {} + +for f in files: + id = f.split("_")[0].replace("task", "") + # print(id) + id = int(id) + task_ids_to_file[id] = f + + +def process_single_file(start, end, max_count): + fout = open(f"start={start}_end={end}_max_size={max_count}.jsonl", "w") + for idx in range(start, end + 1): + if idx not in task_ids_to_file: + continue + file = task_ids_to_file[idx] + + # grouping instances into groups of size 5 + json_content = read_file(file) + + if json_content["Input_language"] != ["English"] or json_content["Output_language"] != ["English"]: + continue + + positive_examples = json_content['Positive Examples'] + negative_examples = json_content['Negative Examples'] + instances = json_content['Instances'] + + # make sure the annotators see all the examples + random.shuffle(positive_examples) + random.shuffle(negative_examples) + random.shuffle(instances) + + positive_examples = positive_examples[:5] + negative_examples = negative_examples[:3] + + n = 2 + chunks = [instances[i:i + n] for i in range(0, len(instances), n)] + for i, chunk in enumerate(chunks): + if i * n > int(max_count): + break + map = { + 'file': normalize(file), + 'instructions': normalize(json_content['Definition']), + } + + map[f'positive_example_count'] = len(positive_examples) + map[f'negative_example_count'] = len(negative_examples) + + for idx, ex in enumerate(positive_examples): + map[f'positive_ex_{idx}_input'] = normalize(ex['input']) + map[f'positive_ex_{idx}_output'] = normalize(ex['output']) + map[f'positive_ex_{idx}_explanation'] = normalize(ex['explanation']) + + for idx, ex in enumerate(negative_examples): + map[f'negative_ex_{idx}_input'] = normalize(ex['input']) + map[f'negative_ex_{idx}_output'] = normalize(ex['output']) + map[f'negative_ex_{idx}_explanation'] = normalize(ex['explanation']) + + for idx, ex in enumerate(chunk): + map[f'instance_{idx}_input'] = normalize(ex['input']) + map[f'instance_{idx}_output'] = "///".join([normalize(x) for x in ex['output']]) + + fout.write(json.dumps(map) + "\n") + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description='A script for preparing natural instructions tasks for human evaluation') + parser.add_argument('--start', help='id of the start task inside `tasks/`') + parser.add_argument('--end', help='id of the end task inside `tasks/`') + parser.add_argument('--eval_count', help='how many instances to use in this evaluation. ' + '100 should be enough for reliable estimates.') + args = parser.parse_args() + + print(" >>>>>>>>> Processing with the following arguments: ") + print(f" * start task id: {args.start}") + print(f" * end task id: {args.end}") + print(f" * count of samples from each task: {args.eval_count}") + + process_single_file(int(args.start), int(args.end), args.eval_count) diff --git a/eval/amt/2.run_eval.sh b/eval/amt/2.run_eval.sh new file mode 100644 index 000000000..8798d01ae --- /dev/null +++ b/eval/amt/2.run_eval.sh @@ -0,0 +1,13 @@ +# launch the experiment +export AWS_PROFILE=alexandria-mturk +amti --verbose create-batch mturk-specs/definition-likert start=1200_end=1536_max_size=5.jsonl . --live + +# check the experiment status +export BATCH=batch-... 
+#export BATCH=batch-13abedb3-9788-4118-8a23-89978a941638_start=1540_end=1800_max_size=5 +amti status-batch "$BATCH" --live + +# fetch the results when they're over +amti review-batch "$BATCH" --approve-all --live +amti save-batch "$BATCH" --live +amti extract tabular "$BATCH" "${BATCH}/batch-results.jsonl" diff --git a/eval/amt/3.amti_aggregate_results.py b/eval/amt/3.amti_aggregate_results.py new file mode 100644 index 000000000..1aab886d0 --- /dev/null +++ b/eval/amt/3.amti_aggregate_results.py @@ -0,0 +1,124 @@ +import sys + +# sys.path.append("..") +sys.path.append("../automatic") + +import json +from evaluation import metric_max_over_ground_truths, rouge + + +def get_stats(values): + maj_vote_value = max(set(values), key=values.count) + avg_score = sum(values) / len(values) + model_values_str = "\t".join([str(x) for x in values]) + return maj_vote_value, avg_score, model_values_str + + +def normalize(str): + return str.replace("\t", " ").replace("\n", "").replace("\r", "") + + +def normalize2(str): + return ' '.join(str.replace(",", "").lower().split()) + + +def discretize(x): + if x == 0.5: + return 0.5 + if x > 0.5: + return 1.0 + else: + return 0.0 + + +def aggregate_v2(response_file): + worker_stats = {} + suggestions = {} + with open(response_file) as f: + for line in f.readlines(): + json_line = json.loads(line) + + worker_id = json_line[f'WorkerId'] + if worker_id not in worker_stats: + worker_stats[worker_id] = 0 + worker_stats[worker_id] += 1 + + file = json_line[f'file'] + instructions = normalize(json_line[f'instructions']) + + instruction_quality_value = float(json_line[f'instruction_quality_q1']) + instruction_suggestions = normalize(json_line[f'instruction_quality_q2']) + + positive_ex_quality_value = float(json_line[f'positive_example_quality_q1']) + positive_ex_suggestions = normalize(json_line[f'positive_example_quality_q2']) + + negative_ex_quality_value = float(json_line[f'negative_example_quality_q1']) + negative_ex_suggestions = normalize(json_line[f'negative_example_quality_q2']) + + positive_examples = [] + for idx in range(0, 5): + id = f'positive_ex_{idx}_input' + if id in json_line: + positive_examples.append(json_line[id]) + positive_examples_appended = normalize("//".join(positive_examples)) + + negative_examples = [] + for idx in range(0, 3): + id = f'negative_ex_{idx}_input' + if id in json_line: + negative_examples.append(json_line[id]) + negative_examples_appended = normalize("//".join(negative_examples)) + + instruction_suggestions = instruction_suggestions.replace("\n", " ") + positive_ex_suggestions = positive_ex_suggestions.replace("\n", " ") + negative_ex_suggestions = negative_ex_suggestions.replace("\n", " ") + + prefix = f"{instruction_quality_value}\t{instruction_suggestions}\t{file}\t{instructions}" \ + f"\t{positive_ex_quality_value}\t{positive_ex_suggestions}\t{positive_examples_appended}" \ + f"\t{negative_ex_quality_value}\t{negative_ex_suggestions}\t{negative_examples_appended}" + + if len(instruction_suggestions) + len(positive_ex_suggestions) + len(negative_ex_suggestions) > 2: + if file not in suggestions: + suggestions[file] = [] + if len(instruction_suggestions.strip()) > 2: + suggestions[file].append(f" - regarding instructions: `{instruction_suggestions}`") + if len(positive_ex_suggestions.strip()) > 2: + suggestions[file].append(f" - regarding p examples: `{positive_ex_suggestions}`") + if len(negative_ex_suggestions.strip()) > 2: + suggestions[file].append(f" - regarding n examples: `{negative_ex_suggestions}`") + # 
instance_input = [] + # instance_output = [] + # instance_prediction = [] + for idx in range(0, 5): + id = f'instance_{idx}_input' + if id in json_line: + input = normalize(json_line[id]) + output = normalize(json_line[f'instance_{idx}_output']) + human_output = normalize(json_line[f'annotated_instance_{idx}_output']) + # instance_input.append(input) + # instance_output.append(output) + # instance_prediction.append(human_output) + rouge_val = metric_max_over_ground_truths( + rouge, normalize2(human_output), + [normalize2(x) for x in output.split("///")] + ) + print(f"{prefix}\t{input}\t{output}\t{human_output}\t{rouge_val}\t{worker_id}") + + for file in sorted(suggestions.keys()): + print(f" - [ ] {file}") + for feedback in suggestions[file]: + print(feedback) + + + +# aggregate_v2("batch-43ecd7ef-0717-4b72-a39c-b6179f8b5f77_task156_pilot/batch-results.jsonl") +# aggregate_v2("batch-65c49abc-f8a3-4f12-a71d-9ce5742c3419_start=60_end=100_max_size=5/batch-results.jsonl") +# aggregate_v2("batch-eea0ef32-da0a-47cf-bcae-810d1a503379_start=1_end=59_max_size=5/batch-results.jsonl") +# aggregate_v2("batch-cdcb497e-49a3-4cee-8ab5-1451dc19dac2_119_end=200_max_size=5/batch-results.jsonl") +# aggregate_v2("batch-58086c69-62bf-4e91-8741-b68d27e1fd63-start=201_end=300_max_size=5/batch-results.jsonl") +# aggregate_v2("batch-fc02d066-1e35-4184-b4ea-ba7eca1abcc1_start=301_end=400_max_size=5/batch-results.jsonl") +# aggregate_v2("batch-4739062e-2141-4f97-9a37-41197abf9a93_start=400_end=600_max_size=5/batch-results.jsonl") +# aggregate_v2("batch-23353cc5-13c1-4af9-94c3-03eb2bacbd0a_start=600_end=850_max_size=5/batch-results.jsonl") +# aggregate_v2("batch-4ee23f3d-2900-4fef-a0ae-d05bc7d519e8_start=850_end=1200_max_size=5/batch-results.jsonl") +# aggregate_v2("batch-13abedb3-9788-4118-8a23-89978a941638_start=1200_end=1536_max_size=5/batch-results.jsonl") +aggregate_v2("batch-eb1b61cd-36e7-4fdf-b8e1-fedd3c77245f_start=1540_end=1726_max_size=5/batch-results.jsonl") diff --git a/eval/amt/README.md b/eval/amt/README.md new file mode 100644 index 000000000..4382de275 --- /dev/null +++ b/eval/amt/README.md @@ -0,0 +1,24 @@ +# Human Evaluation of Instructions + - Step1: Select a task and convert it into AMTI-friendly format: +```terminal +> python 1.amti_create_input_data.py --file [NAME] --eval_count [COUNT] +``` +For example: +```terminal +> python 1.amti_create_input_data.py --file task156_codah_classification_adversarial.json --eval_count 10 +``` + - Step2: follow the steps in `2.run_eval.sh` + - Step3: evaluate + + +To disqualify a certain user, add them to `genie_disqualification/blocklist.txt` and then: + + amti associate-qual --file genie_disqualification/blocklist.txt --qual 3090SA10WM5MIHCWNTON1VROMP4CN3 --live + +Or disqualify a particular user: + + amti associate-qual WorkerId --qual 3090SA10WM5MIHCWNTON1VROMP4CN3 --live + +To list workers that are disqualified: + + amti associated --qual 3090SA10WM5MIHCWNTON1VROMP4CN3 --status Granted --live diff --git a/eval/amt/disqualification/qualificationtypeproperties.json b/eval/amt/disqualification/qualificationtypeproperties.json new file mode 100644 index 000000000..d04c77f51 --- /dev/null +++ b/eval/amt/disqualification/qualificationtypeproperties.json @@ -0,0 +1,8 @@ +{ + "Name": "Qualification for Participating in natural instructions annotations ", + "Keywords": "Human evaluation", + "Description": "Workers that consistently produced low quality work, or is suspected of producing spam.", + "QualificationTypeStatus": "Active", + "RetryDelayInSeconds": 86400, 
+ "TestDurationInSeconds": 3600 +} \ No newline at end of file diff --git a/eval/amt/mturk-specs/definition-likert/NOTES b/eval/amt/mturk-specs/definition-likert/NOTES new file mode 100644 index 000000000..d8035ecf6 --- /dev/null +++ b/eval/amt/mturk-specs/definition-likert/NOTES @@ -0,0 +1 @@ +Human evaluation for natural instructions. \ No newline at end of file diff --git a/eval/amt/mturk-specs/definition-likert/hitproperties.json b/eval/amt/mturk-specs/definition-likert/hitproperties.json new file mode 100644 index 000000000..3bfa3be4e --- /dev/null +++ b/eval/amt/mturk-specs/definition-likert/hitproperties.json @@ -0,0 +1,4 @@ +{ + "MaxAssignments": 1, + "LifetimeInSeconds": 86400 +} diff --git a/eval/amt/mturk-specs/definition-likert/hittypeproperties.json b/eval/amt/mturk-specs/definition-likert/hittypeproperties.json new file mode 100644 index 000000000..17ec4e9a1 --- /dev/null +++ b/eval/amt/mturk-specs/definition-likert/hittypeproperties.json @@ -0,0 +1,14 @@ +{ + "AutoApprovalDelayInSeconds": 86400, + "AssignmentDurationInSeconds": 3600, + "Reward": "0.70", + "Title": "language instructions - v17", + "Keywords": "language instructions", + "Description": "In this HIT, you will read input instructions and see a few examples. Based on that, you're expected to label/answer a new instance.", + "QualificationRequirements": [ + { "QualificationTypeId": "00000000000000000071", "Comparator": "In", "LocaleValues": [ { "Country": "US" }, { "Country": "GB" }, { "Country": "AU" }, { "Country": "CA" } ], "RequiredToPreview": false, "ActionsGuarded": "Accept" }, + { "QualificationTypeId": "000000000000000000L0", "Comparator": "GreaterThanOrEqualTo", "IntegerValues": [ 99 ], "RequiredToPreview": false, "ActionsGuarded": "Accept" }, + { "QualificationTypeId": "00000000000000000071", "Comparator": "In", "LocaleValues": [ { "Country": "US" }, { "Country": "GB" }, { "Country": "AU" }, { "Country": "CA" } ], "RequiredToPreview": false, "ActionsGuarded": "Accept" }, + { "QualificationTypeId": "3090SA10WM5MIHCWNTON1VROMP4CN3", "Comparator": "DoesNotExist" } + ] +} diff --git a/eval/amt/mturk-specs/definition-likert/question.xml.j2 b/eval/amt/mturk-specs/definition-likert/question.xml.j2 new file mode 100644 index 000000000..b990d6da8 --- /dev/null +++ b/eval/amt/mturk-specs/definition-likert/question.xml.j2 @@ -0,0 +1,455 @@ + + + + + + + + + + + + + +
+

+  Welcome and thank you for your participation!
+
+  Instructions and Examples
+
+  The main purpose of this task is to get your feedback on the quality of the provided instructions below.
+  We use this feedback to improve the quality of the instructions for the next round.
+  Please read the instructions and tell us what you think. You can tell us about: (1) typos, (2) ambiguous or odd phrasings, and (3) anything that could be added or changed to make the task easier to understand.
+  Feel free to be as critical as you want about the instructions.
+
+  Instructions: {{instructions}}
+
+  Positive (Desirable) Examples
+  Here are examples of desirable behaviors for this task:
+
+  Negative (Undesirable) Examples
+  Here are several negative examples (undesirable outputs) for this task:
+
+  Great! You are now ready to start on the following examples!
+
+  Your Task
+
+  First, please tell us how we can improve the above instructions.
+  We expect you to provide suggestions for improving the instructions every few HITs, so please do not leave the suggestion boxes empty all the time.
+  If the instructions are impeccable, leave the boxes empty (and avoid unconstructive feedback such as "it's good", "nothing", etc.).
+
+  Instruction quality:
+  Q1: What did you think about the quality of the task definition?
+  (Avoid putting random/garbage text here. Consistently random responses will result in being blocked. It is better to leave this box empty than to write something irrelevant. We give bonuses to workers who provide constructive feedback for improving the instructions.)
+  Q2: What can be improved in the instructions, if any? (Please indicate any typos, lack of clarity, or any other issues.)
+
+  Positive examples quality:
+  Q1: What did you think about the quality of the provided positive examples?
+  Q2: What can be improved about the positive examples, if any? (Please indicate any typos, lack of clarity, or any other issues.)
+
+  Negative examples quality:
+  Q1: What did you think about the quality of the provided negative examples?
+  Q2: What can be improved about the negative examples, if any? (Please indicate any typos, lack of clarity, or any other issues.)
+
+  Now please answer the following inputs, according to the provided instructions above.
+
+  Please direct any comments, feedback, issues, or questions to this email address: danielk@allenai.org
+
+  ]]></HTMLContent>
+  <FrameHeight>450</FrameHeight>
+</HTMLQuestion>
\ No newline at end of file diff --git a/eval/automatic/evaluation.py b/eval/automatic/evaluation.py new file mode 100644 index 000000000..f256ad73e --- /dev/null +++ b/eval/automatic/evaluation.py @@ -0,0 +1,62 @@ +from __future__ import print_function +import argparse +import json +import datasets + +rouge_metric = datasets.load_metric('rouge') + + +def rouge(prediction, ground_truth): + score = rouge_metric.compute( + predictions=[prediction], + references=[ground_truth], + **{'use_agregator': False, 'use_stemmer': True, 'rouge_types': ['rougeL']} + ) + return score['rougeL'][0].fmeasure + + +def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): + scores_for_ground_truths = [] + for ground_truth in ground_truths: + score = metric_fn(prediction, ground_truth) + scores_for_ground_truths.append(score) + return max(scores_for_ground_truths) + + +def evaluate(dataset, predictions): + metrics = {} + for i in range(len(predictions)): + pred = predictions[i]['output'] + # gold_input = dataset['Instances'][predictions[i]['index']]['input'] + gold_outputs = dataset['Instances'][predictions[i]['index']]['output'] + if 'rouge' not in metrics: + metrics['rouge'] = 0 + metrics['rouge'] += metric_max_over_ground_truths(rouge, pred, gold_outputs) + + for key in metrics.keys(): + metrics[key] /= len(predictions) + + return metrics + + +def main(): + parser = argparse.ArgumentParser() + + parser.add_argument("--dataset", + type=str, + required=True, + help="Dataset Json File Name") + parser.add_argument("--predictions", + type=str, + required=True, + help="Prediction File Name") + args = parser.parse_args() + with open(args.dataset) as dataset_file: + dataset_json = json.load(dataset_file) + with open(args.predictions) as prediction_file: + predictions_json = json.load(prediction_file) + print(evaluate(dataset_json, predictions_json['predictions'])) + + +if __name__ == "__main__": + main()
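
For quick reference, below is a minimal smoke-test sketch for the new `eval/automatic/evaluation.py` script. It assumes the command is run from the repository root with the script's dependencies (`datasets`, `rouge_score`, `nltk`) installed; the toy file names and contents are purely illustrative and are not part of this patch.

```python
# Illustrative smoke test (hypothetical file names; not part of the patch).
# It writes a tiny task file and prediction file in the formats expected by
# evaluate() and then invokes the script, which prints the average ROUGE-L.
import json
import subprocess

# Task-style file: evaluate() only reads dataset["Instances"][i]["output"],
# where "output" is a list of acceptable reference strings.
toy_task = {
    "Instances": [
        {"input": "Is the sky blue?", "output": ["yes", "Yes."]},
        {"input": "Is fire cold?", "output": ["no"]},
    ]
}

# Prediction file: a top-level "predictions" list, each entry pointing back to
# an instance by "index" and carrying the model's "output" string.
toy_predictions = {
    "predictions": [
        {"index": 0, "output": "yes"},
        {"index": 1, "output": "no"},
    ]
}

with open("toy_task.json", "w") as f:
    json.dump(toy_task, f)
with open("toy_predictions.json", "w") as f:
    json.dump(toy_predictions, f)

# Should print something close to {'rouge': 1.0} for these exact-match predictions.
subprocess.run(
    ["python", "eval/automatic/evaluation.py",
     "--dataset", "toy_task.json",
     "--predictions", "toy_predictions.json"],
    check=True,
)
```

The same two-file layout (a task JSON with an `Instances` list, and a predictions JSON whose `predictions` entries carry an `index` and an `output`) is also what `3.amti_aggregate_results.py` mirrors when it scores crowdworker answers with `metric_max_over_ground_truths`.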