-
Notifications
You must be signed in to change notification settings - Fork 190
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Crowdworker evaluation of the tasks [Work in Progress] (#276)
* - first commit for evaluation: adding the first version a working evaluation template. * - added a script for result aggregation. * - update the preparation script. * - update. * - update. * - update. * - update. * - add the evaluation script. * - drop a mistakenly-added directory. * - add more description to the instructions. * - minor. * - * - * add quals. * fix example display. * fix example display. * fix example display. * fix example display. * fix example display. * fix example display. * fix example display. * update.
- Loading branch information
Daniel Khashabi
authored
Jan 26, 2022
1 parent
fe65586
commit ff125a6
Showing
11 changed files
with
809 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,6 @@ | ||
.idea | ||
.DS_Store | ||
/eval/amt/past_experiments/ | ||
eval/amt/past_experiments/* | ||
/eval/amt/disqualification/blocklist.txt | ||
/src/utils/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
''' | ||
Given a task file, it creates a single file for crowd annotations (the input to AMTI). | ||
For example: python 1.amti_create_input_data.py --start 5 --end 20 --eval_count 10 | ||
''' | ||
import json | ||
import random | ||
import argparse | ||
from os import listdir | ||
from os.path import isfile, join | ||
|
||
# Directory holding the task definition JSON files, relative to this script.
tasks_path = '../../tasks/'
|
||
|
||
def read_file(file):
    """Load the task file named `file` from `tasks_path` and return its parsed JSON."""
    full_path = tasks_path + file
    with open(full_path, 'r') as task_fh:
        return json.load(task_fh)
|
||
|
||
def normalize(str):
    """Sanitize a text field for embedding in the AMT HTML template.

    Double quotes and backticks become apostrophes, '&' becomes ' and ',
    non-ASCII characters are dropped, and literal angle brackets become
    square brackets so they cannot be parsed as HTML — while '<br>' tags
    and real newlines both end up as '<br>'.
    """
    text = str.replace('"', '\'').replace('`', '\'')
    text = text.replace('&', ' and ')
    # Drop anything outside the ASCII range.
    text = text.encode('ascii', 'ignore').decode('ascii')
    # Temporarily turn '<br>' into '\n' so the bracket substitution below
    # does not mangle it; the final step maps newlines back to '<br>'.
    text = text.replace('<br>', '\n')
    text = text.replace('<', '[').replace('>', ']')
    return text.replace('\n', '<br>')
|
||
# Build an index from numeric task id to the task's file name,
# e.g. 156 -> "task156_codah_classification_adversarial.json".
files = [f for f in listdir(tasks_path) if isfile(join(tasks_path, f)) and ".json" in f]
task_ids_to_file = {}
for f in files:
    # File names start with "task<ID>_"; strip the prefix and parse the id.
    numeric_id = int(f.split("_")[0].replace("task", ""))
    task_ids_to_file[numeric_id] = f
|
||
|
||
def process_single_file(start, end, max_count):
    """Write a single AMTI input file (.jsonl) for tasks with ids in [start, end].

    Each output line describes one HIT: the task instructions, up to 5 positive
    and 3 negative examples, and a chunk of 2 instances to annotate. Only
    English-to-English tasks are included.

    Args:
        start: id of the first task (inclusive).
        end: id of the last task (inclusive).
        max_count: cap on the number of instances sampled per task (may be a
            string, as it arrives straight from argparse).
    """
    limit = int(max_count)  # parse once instead of per chunk
    chunk_size = 2  # instances shown per HIT
    # `with` guarantees the output file is flushed and closed even on error
    # (the original handle was never closed).
    with open(f"start={start}_end={end}_max_size={max_count}.jsonl", "w") as fout:
        for task_id in range(start, end + 1):
            if task_id not in task_ids_to_file:
                continue
            task_file = task_ids_to_file[task_id]

            json_content = read_file(task_file)

            # Only English->English tasks are shown to this annotator pool.
            if json_content["Input_language"] != ["English"] or json_content["Output_language"] != ["English"]:
                continue

            positive_examples = json_content['Positive Examples']
            negative_examples = json_content['Negative Examples']
            instances = json_content['Instances']

            # Shuffle so annotators see a random subset of examples/instances.
            random.shuffle(positive_examples)
            random.shuffle(negative_examples)
            random.shuffle(instances)

            positive_examples = positive_examples[:5]
            negative_examples = negative_examples[:3]

            # Group instances into HIT-sized chunks.
            chunks = [instances[i:i + chunk_size] for i in range(0, len(instances), chunk_size)]
            for i, chunk in enumerate(chunks):
                # Stop once roughly `limit` instances from this task were emitted.
                if i * chunk_size > limit:
                    break
                # `record` (was `map`, which shadows the builtin) is one HIT row.
                record = {
                    'file': normalize(task_file),
                    'instructions': normalize(json_content['Definition']),
                    'positive_example_count': len(positive_examples),
                    'negative_example_count': len(negative_examples),
                }

                for idx, ex in enumerate(positive_examples):
                    record[f'positive_ex_{idx}_input'] = normalize(ex['input'])
                    record[f'positive_ex_{idx}_output'] = normalize(ex['output'])
                    record[f'positive_ex_{idx}_explanation'] = normalize(ex['explanation'])

                for idx, ex in enumerate(negative_examples):
                    record[f'negative_ex_{idx}_input'] = normalize(ex['input'])
                    record[f'negative_ex_{idx}_output'] = normalize(ex['output'])
                    record[f'negative_ex_{idx}_explanation'] = normalize(ex['explanation'])

                for idx, ex in enumerate(chunk):
                    record[f'instance_{idx}_input'] = normalize(ex['input'])
                    # Multiple references are joined with '///' (split again at
                    # aggregation time).
                    record[f'instance_{idx}_output'] = "///".join(normalize(x) for x in ex['output'])

                fout.write(json.dumps(record) + "\n")
|
||
|
||
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='A script for preparing natural instructions tasks for human evaluation')
    # `type=int` + `required=True` make argparse reject missing or non-numeric
    # ids with a clean usage error instead of a TypeError/ValueError later on.
    parser.add_argument('--start', type=int, required=True,
                        help='id of the start task inside `tasks/`')
    parser.add_argument('--end', type=int, required=True,
                        help='id of the end task inside `tasks/`')
    parser.add_argument('--eval_count', type=int, required=True,
                        help='how many instances to use in this evaluation. '
                             '100 should be enough for reliable estimates.')
    args = parser.parse_args()

    print(" >>>>>>>>> Processing with the following arguments: ")
    print(f" * start task id: {args.start}")
    print(f" * end task id: {args.end}")
    print(f" * count of samples from each task: {args.eval_count}")

    process_single_file(args.start, args.end, args.eval_count)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
# Launch the evaluation batch on MTurk.
# Credentials come from the `alexandria-mturk` AWS profile; the .jsonl file is
# the output of 1.amti_create_input_data.py and `mturk-specs/definition-likert`
# holds the HIT specs. `--live` targets the real marketplace.
export AWS_PROFILE=alexandria-mturk
amti --verbose create-batch mturk-specs/definition-likert start=1200_end=1536_max_size=5.jsonl . --live

# Check the experiment status (fill in the batch id printed by create-batch;
# the commented line shows what a real value looks like).
export BATCH=batch-...
#export BATCH=batch-13abedb3-9788-4118-8a23-89978a941638_start=1540_end=1800_max_size=5
amti status-batch "$BATCH" --live

# Fetch the results when they're over: approve all assignments, download the
# batch, and flatten the responses into batch-results.jsonl for aggregation.
amti review-batch "$BATCH" --approve-all --live
amti save-batch "$BATCH" --live
amti extract tabular "$BATCH" "${BATCH}/batch-results.jsonl"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
import sys

# sys.path.append("..")
# Make the automatic-evaluation module importable from its sibling directory.
sys.path.append("../automatic")

import json
# Project-local metrics used to score worker answers against references.
from evaluation import metric_max_over_ground_truths, rouge
|
||
|
||
def get_stats(values):
    """Summarize a list of numeric ratings.

    Returns a tuple of (majority vote, mean, tab-joined values), where the
    majority vote is the most frequent value in `values`.
    """
    majority = max(set(values), key=values.count)
    mean = sum(values) / len(values)
    joined = "\t".join(str(v) for v in values)
    return majority, mean, joined
|
||
|
||
def normalize(str):
    """Flatten free text onto one line safe for tab-separated output: tabs
    become spaces; newline and carriage-return characters become '<newline>'."""
    flattened = str.replace("\t", " ")
    flattened = flattened.replace("\n", "<newline>")
    return flattened.replace("\r", "<newline>")
|
||
|
||
def normalize2(str):
    """Lower-case, strip commas, and collapse whitespace runs to single
    spaces — applied to answers before the ROUGE comparison."""
    without_commas = str.replace(",", "").lower()
    return " ".join(without_commas.split())
|
||
|
||
def discretize(x):
    """Snap a score onto {0.0, 0.5, 1.0}: exactly 0.5 is kept as-is, anything
    above rounds up to 1.0, anything below rounds down to 0.0."""
    if x == 0.5:
        return 0.5
    return 1.0 if x > 0.5 else 0.0
|
||
|
||
def aggregate_v2(response_file):
    """Aggregate crowdworker responses from an AMTI batch-results file.

    For every annotated instance, prints one tab-separated row containing the
    quality ratings, worker suggestions, the instance input/outputs, the ROUGE
    score of the worker's answer against the references, and the worker id.
    Afterwards prints a markdown checklist of per-file improvement suggestions.

    Args:
        response_file: path to the `batch-results.jsonl` produced by
            `amti extract tabular`.
    """
    worker_stats = {}   # WorkerId -> number of completed assignments
    suggestions = {}    # task file name -> list of formatted suggestion lines
    with open(response_file) as f:
        # Iterate the file directly instead of materializing readlines().
        for line in f:
            json_line = json.loads(line)

            worker_id = json_line['WorkerId']
            worker_stats[worker_id] = worker_stats.get(worker_id, 0) + 1

            file = json_line['file']
            instructions = normalize(json_line['instructions'])

            instruction_quality_value = float(json_line['instruction_quality_q1'])
            instruction_suggestions = normalize(json_line['instruction_quality_q2'])

            positive_ex_quality_value = float(json_line['positive_example_quality_q1'])
            positive_ex_suggestions = normalize(json_line['positive_example_quality_q2'])

            negative_ex_quality_value = float(json_line['negative_example_quality_q1'])
            negative_ex_suggestions = normalize(json_line['negative_example_quality_q2'])

            # At most 5 positive and 3 negative examples exist per HIT; keys
            # may be absent when a task had fewer examples.
            positive_examples = []
            for idx in range(0, 5):
                key = f'positive_ex_{idx}_input'
                if key in json_line:
                    positive_examples.append(json_line[key])
            positive_examples_appended = normalize("//".join(positive_examples))

            negative_examples = []
            for idx in range(0, 3):
                key = f'negative_ex_{idx}_input'
                if key in json_line:
                    negative_examples.append(json_line[key])
            negative_examples_appended = normalize("//".join(negative_examples))

            # Keep the suggestion free-text on a single line.
            instruction_suggestions = instruction_suggestions.replace("\n", " ")
            positive_ex_suggestions = positive_ex_suggestions.replace("\n", " ")
            negative_ex_suggestions = negative_ex_suggestions.replace("\n", " ")

            # Shared prefix for every instance row of this assignment.
            prefix = f"{instruction_quality_value}\t{instruction_suggestions}\t{file}\t{instructions}" \
                     f"\t{positive_ex_quality_value}\t{positive_ex_suggestions}\t{positive_examples_appended}" \
                     f"\t{negative_ex_quality_value}\t{negative_ex_suggestions}\t{negative_examples_appended}"

            # Collect non-trivial free-text suggestions; the `> 2` length check
            # filters out empty/placeholder answers.
            if len(instruction_suggestions) + len(positive_ex_suggestions) + len(negative_ex_suggestions) > 2:
                per_file = suggestions.setdefault(file, [])
                if len(instruction_suggestions.strip()) > 2:
                    per_file.append(f" - regarding instructions: `{instruction_suggestions}`")
                if len(positive_ex_suggestions.strip()) > 2:
                    per_file.append(f" - regarding p examples: `{positive_ex_suggestions}`")
                if len(negative_ex_suggestions.strip()) > 2:
                    per_file.append(f" - regarding n examples: `{negative_ex_suggestions}`")

            for idx in range(0, 5):
                key = f'instance_{idx}_input'
                if key in json_line:
                    # Renamed from `input`/`output` to avoid shadowing builtins.
                    instance_input = normalize(json_line[key])
                    reference_output = normalize(json_line[f'instance_{idx}_output'])
                    human_output = normalize(json_line[f'annotated_instance_{idx}_output'])
                    # Best ROUGE of the worker's answer vs any '///'-separated reference.
                    rouge_val = metric_max_over_ground_truths(
                        rouge, normalize2(human_output),
                        [normalize2(x) for x in reference_output.split("///")]
                    )
                    print(f"{prefix}\t{instance_input}\t{reference_output}\t{human_output}\t{rouge_val}\t{worker_id}")

    # Markdown checklist of files that received actionable feedback.
    for file in sorted(suggestions.keys()):
        print(f" - [ ] {file}")
        for feedback in suggestions[file]:
            print(feedback)
|
||
|
||
|
||
# Earlier batches, kept for reference:
# aggregate_v2("batch-43ecd7ef-0717-4b72-a39c-b6179f8b5f77_task156_pilot/batch-results.jsonl")
# aggregate_v2("batch-65c49abc-f8a3-4f12-a71d-9ce5742c3419_start=60_end=100_max_size=5/batch-results.jsonl")
# aggregate_v2("batch-eea0ef32-da0a-47cf-bcae-810d1a503379_start=1_end=59_max_size=5/batch-results.jsonl")
# aggregate_v2("batch-cdcb497e-49a3-4cee-8ab5-1451dc19dac2_119_end=200_max_size=5/batch-results.jsonl")
# aggregate_v2("batch-58086c69-62bf-4e91-8741-b68d27e1fd63-start=201_end=300_max_size=5/batch-results.jsonl")
# aggregate_v2("batch-fc02d066-1e35-4184-b4ea-ba7eca1abcc1_start=301_end=400_max_size=5/batch-results.jsonl")
# aggregate_v2("batch-4739062e-2141-4f97-9a37-41197abf9a93_start=400_end=600_max_size=5/batch-results.jsonl")
# aggregate_v2("batch-23353cc5-13c1-4af9-94c3-03eb2bacbd0a_start=600_end=850_max_size=5/batch-results.jsonl")
# aggregate_v2("batch-4ee23f3d-2900-4fef-a0ae-d05bc7d519e8_start=850_end=1200_max_size=5/batch-results.jsonl")
# aggregate_v2("batch-13abedb3-9788-4118-8a23-89978a941638_start=1200_end=1536_max_size=5/batch-results.jsonl")
# Aggregate the most recent batch. NOTE(review): this runs at module import
# time as well as when executed as a script — confirm that is intended.
aggregate_v2("batch-eb1b61cd-36e7-4fdf-b8e1-fedd3c77245f_start=1540_end=1726_max_size=5/batch-results.jsonl")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
# Human Evaluation of Instructions | ||
- Step1: Select a range of tasks and convert them into AMTI-friendly format:
```terminal
> python 1.amti_create_input_data.py --start [START_TASK_ID] --end [END_TASK_ID] --eval_count [COUNT]
```
For example:
```terminal
> python 1.amti_create_input_data.py --start 5 --end 20 --eval_count 10
```
- Step2: follow the steps in `2.run_eval.sh` | ||
- Step3: evaluate | ||
|
||
|
||
To disqualify a certain user, add them to `genie_disqualification/blocklist.txt` and then: | ||
|
||
amti associate-qual --file genie_disqualification/blocklist.txt --qual 3090SA10WM5MIHCWNTON1VROMP4CN3 --live | ||
|
||
Or disqualify a particular user: | ||
|
||
amti associate-qual WorkerId --qual 3090SA10WM5MIHCWNTON1VROMP4CN3 --live | ||
|
||
To list workers that are disqualified: | ||
|
||
amti associated --qual 3090SA10WM5MIHCWNTON1VROMP4CN3 --status Granted --live |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
{ | ||
"Name": "Qualification for Participating in natural instructions annotations ", | ||
"Keywords": "Human evaluation", | ||
"Description": "Workers that consistently produced low-quality work or are suspected of producing spam.",
"QualificationTypeStatus": "Active", | ||
"RetryDelayInSeconds": 86400, | ||
"TestDurationInSeconds": 3600 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
Human evaluation for natural instructions. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
{ | ||
"MaxAssignments": 1, | ||
"LifetimeInSeconds": 86400 | ||
} |
14 changes: 14 additions & 0 deletions
14
eval/amt/mturk-specs/definition-likert/hittypeproperties.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
{
    "AutoApprovalDelayInSeconds": 86400,
    "AssignmentDurationInSeconds": 3600,
    "Reward": "0.70",
    "Title": "language instructions - v17",
    "Keywords": "language instructions",
    "Description": "In this HIT, you will read input instructions and see a few examples. Based on that, you're expected to label/answer a new instance.",
    "QualificationRequirements": [
        { "QualificationTypeId": "00000000000000000071", "Comparator": "In", "LocaleValues": [ { "Country": "US" }, { "Country": "GB" }, { "Country": "AU" }, { "Country": "CA" } ], "RequiredToPreview": false, "ActionsGuarded": "Accept" },
        { "QualificationTypeId": "000000000000000000L0", "Comparator": "GreaterThanOrEqualTo", "IntegerValues": [ 99 ], "RequiredToPreview": false, "ActionsGuarded": "Accept" },
        { "QualificationTypeId": "3090SA10WM5MIHCWNTON1VROMP4CN3", "Comparator": "DoesNotExist" }
    ]
}
Oops, something went wrong.