Skip to content

Commit

Permalink
Crowdworker evaluation of the tasks [Work in Progress] (#276)
Browse files Browse the repository at this point in the history
* - first commit for evaluation: adding the first version a working evaluation template.

* - added a script for result aggregation.

* - update the preparation script.

* - update.

* - update.

* - update.

* - update.

* - add the evaluation script.

* - drop a mistakenly-added directory.

* - add more description to the instructions.

* - minor.

* -

* -

* add quals.

* fix example display.

* fix example display.

* fix example display.

* fix example display.

* fix example display.

* fix example display.

* fix example display.

* update.
  • Loading branch information
Daniel Khashabi authored Jan 26, 2022
1 parent fe65586 commit ff125a6
Show file tree
Hide file tree
Showing 11 changed files with 809 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
.idea
.DS_Store
/eval/amt/past_experiments/
eval/amt/past_experiments/*
/eval/amt/disqualification/blocklist.txt
/src/utils/
101 changes: 101 additions & 0 deletions eval/amt/1.amti_create_input_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
'''
Given a task file, it creates a single file for crowd annotations (the input to AMTI).
For example: python 1.amti_create_input_data.py --start 5 --end 20 --eval_count 10
'''
import json
import random
import argparse
from os import listdir
from os.path import isfile, join

# Relative path to the directory that holds the task JSON files.
tasks_path = '../../tasks/'


def read_file(file):
    """Load and return the parsed JSON content of `file` inside `tasks_path`.

    Args:
        file: task file name relative to `tasks_path`
            (e.g. "task001_..._classification.json").

    Returns:
        The decoded JSON object (a dict for the task files).
    """
    # Use os.path.join (already imported) instead of string concatenation.
    with open(join(tasks_path, file), 'r') as f:
        return json.load(f)


def normalize(str):
    """Sanitize a task string for embedding into the AMT HIT input.

    Double quotes and backticks become single quotes, '&' becomes ' and ',
    non-ASCII characters are dropped, angle brackets become square brackets
    (so no stray HTML survives), and line breaks -- whether written as
    '<br>' or as a literal newline -- are emitted uniformly as '<br>'.
    """
    # NOTE(review): the parameter shadows the builtin `str`; name kept so the
    # public signature is unchanged.
    result = str.replace('"', '\'')
    result = result.replace('`', '\'')
    result = result.replace('&', ' and ')
    result = result.encode('ascii', 'ignore').decode('ascii')
    # '<br>' is temporarily turned into '\n' so that the bracket substitution
    # below cannot mangle it; the final replace converts all newlines back.
    result = result.replace('<br>', '\n')
    result = result.replace('<', '[').replace('>', ']')
    return result.replace('\n', '<br>')

# Collect every task JSON file and index it by its numeric task id,
# e.g. "task001_quoref_question_generation.json" -> 1.
files = [
    name for name in listdir(tasks_path)
    if isfile(join(tasks_path, name)) and ".json" in name
]

task_ids_to_file = {
    int(name.split("_")[0].replace("task", "")): name
    for name in files
}


def process_single_file(start, end, max_count):
    """Prepare crowd-annotation input (for AMTI) for tasks with ids in [start, end].

    For every English->English task found under `tasks/`, the instances are
    shuffled and grouped into chunks of `n` (=2); each chunk becomes one JSON
    line (one HIT) bundled with the normalized task instructions and up to
    5 positive / 3 negative examples.  The output file is named
    `start={start}_end={end}_max_size={max_count}.jsonl`.

    Args:
        start: id of the first task to include (inclusive).
        end: id of the last task to include (inclusive).
        max_count: cap (int or numeric string) on the instances sampled per
            task.  NOTE(review): the loop breaks only once `i * n > max_count`,
            so the cap can be overshot by up to `n` instances -- behavior kept
            as in the original.
    """
    instance_cap = int(max_count)  # hoisted: was re-converted on every chunk
    n = 2  # number of instances grouped into a single HIT
    # `with` guarantees the output file is flushed and closed (the original
    # leaked the handle).
    with open(f"start={start}_end={end}_max_size={max_count}.jsonl", "w") as fout:
        for task_id in range(start, end + 1):
            if task_id not in task_ids_to_file:
                continue
            file = task_ids_to_file[task_id]

            json_content = read_file(file)

            # keep only the tasks whose inputs and outputs are both English
            if json_content["Input_language"] != ["English"] or json_content["Output_language"] != ["English"]:
                continue

            positive_examples = json_content['Positive Examples']
            negative_examples = json_content['Negative Examples']
            instances = json_content['Instances']

            # shuffle so the annotators collectively see all the examples
            random.shuffle(positive_examples)
            random.shuffle(negative_examples)
            random.shuffle(instances)

            positive_examples = positive_examples[:5]
            negative_examples = negative_examples[:3]

            # grouping instances into chunks of size `n`
            chunks = [instances[i:i + n] for i in range(0, len(instances), n)]
            for i, chunk in enumerate(chunks):
                if i * n > instance_cap:
                    break
                # `record` replaces the original name `map`, which shadowed the builtin
                record = {
                    'file': normalize(file),
                    'instructions': normalize(json_content['Definition']),
                    'positive_example_count': len(positive_examples),
                    'negative_example_count': len(negative_examples),
                }

                for ex_idx, ex in enumerate(positive_examples):
                    record[f'positive_ex_{ex_idx}_input'] = normalize(ex['input'])
                    record[f'positive_ex_{ex_idx}_output'] = normalize(ex['output'])
                    record[f'positive_ex_{ex_idx}_explanation'] = normalize(ex['explanation'])

                for ex_idx, ex in enumerate(negative_examples):
                    record[f'negative_ex_{ex_idx}_input'] = normalize(ex['input'])
                    record[f'negative_ex_{ex_idx}_output'] = normalize(ex['output'])
                    record[f'negative_ex_{ex_idx}_explanation'] = normalize(ex['explanation'])

                for ins_idx, ex in enumerate(chunk):
                    record[f'instance_{ins_idx}_input'] = normalize(ex['input'])
                    # multiple reference outputs are joined with '///' (the
                    # aggregation script splits on the same separator)
                    record[f'instance_{ins_idx}_output'] = "///".join([normalize(x) for x in ex['output']])

                fout.write(json.dumps(record) + "\n")


if __name__ == '__main__':
    # Command-line entry point; e.g.:
    #   python 1.amti_create_input_data.py --start 5 --end 20 --eval_count 10
    parser = argparse.ArgumentParser(
        description='A script for preparing natural instructions tasks for human evaluation')
    parser.add_argument('--start', help='id of the start task inside `tasks/`')
    parser.add_argument('--end', help='id of the end task inside `tasks/`')
    parser.add_argument(
        '--eval_count',
        help='how many instances to use in this evaluation. '
             '100 should be enough for reliable estimates.')
    args = parser.parse_args()

    print(" >>>>>>>>> Processing with the following arguments: ")
    for label, value in (("start task id", args.start),
                         ("end task id", args.end),
                         ("count of samples from each task", args.eval_count)):
        print(f" * {label}: {value}")

    process_single_file(int(args.start), int(args.end), args.eval_count)
13 changes: 13 additions & 0 deletions eval/amt/2.run_eval.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Runbook for one AMT evaluation batch: launch, monitor, then collect results.
# Intended to be executed step by step (copy-paste), not as a single script.

# launch the experiment
export AWS_PROFILE=alexandria-mturk
amti --verbose create-batch mturk-specs/definition-likert start=1200_end=1536_max_size=5.jsonl . --live

# check the experiment status
# NOTE(review): replace the placeholder below with the concrete batch id that
# `create-batch` printed before running the remaining commands.
export BATCH=batch-...
#export BATCH=batch-13abedb3-9788-4118-8a23-89978a941638_start=1540_end=1800_max_size=5
amti status-batch "$BATCH" --live

# fetch the results when they're over
amti review-batch "$BATCH" --approve-all --live
amti save-batch "$BATCH" --live
amti extract tabular "$BATCH" "${BATCH}/batch-results.jsonl"
124 changes: 124 additions & 0 deletions eval/amt/3.amti_aggregate_results.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
import sys

# Make `../automatic` importable: it provides the `evaluation` module with the
# ROUGE scoring helpers used below.
# sys.path.append("..")
sys.path.append("../automatic")

import json
from evaluation import metric_max_over_ground_truths, rouge


def get_stats(values):
    """Summarize a list of numeric ratings.

    Returns a 3-tuple of (majority-vote value, mean value, the values joined
    by tabs as strings).  Ties in the majority vote are broken arbitrarily.
    """
    majority = max(set(values), key=values.count)
    mean = sum(values) / len(values)
    joined = "\t".join(map(str, values))
    return majority, mean, joined


def normalize(str):
    """Flatten a free-text field onto one line: tabs become spaces and any
    newline/carriage-return becomes the literal marker '<newline>'."""
    # NOTE(review): the parameter shadows the builtin `str`; name kept so the
    # public signature is unchanged.
    for old, new in (("\t", " "), ("\n", "<newline>"), ("\r", "<newline>")):
        str = str.replace(old, new)
    return str


def normalize2(str):
    """Normalize a string for scoring: drop commas, lowercase, and collapse
    all runs of whitespace into single spaces."""
    cleaned = str.replace(",", "").lower()
    return ' '.join(cleaned.split())


def discretize(x):
    """Map a score to {0.0, 0.5, 1.0}: exactly 0.5 stays 0.5, anything above
    becomes 1.0, anything below becomes 0.0."""
    if x == 0.5:
        return 0.5
    return 1.0 if x > 0.5 else 0.0


def aggregate_v2(response_file):
    """Aggregate one batch of AMT responses and print a TSV report to stdout.

    For every annotated instance in `response_file` (a `batch-results.jsonl`
    produced by `amti extract tabular`), prints one tab-separated row holding
    the Likert rating and free-form suggestion for the instructions, positive
    examples, and negative examples, followed by the instance input, the
    reference output(s), the worker's answer, its ROUGE score against the
    references, and the worker id.  Afterwards prints a per-task markdown
    checklist of the workers' improvement suggestions.
    """
    # HITs completed per worker; accumulated here but not reported below
    # (presumably kept for spam screening -- NOTE(review): currently unused).
    worker_stats = {}
    # task file name -> list of formatted suggestion strings
    suggestions = {}
    with open(response_file) as f:
        for line in f.readlines():
            json_line = json.loads(line)

            worker_id = json_line[f'WorkerId']
            if worker_id not in worker_stats:
                worker_stats[worker_id] = 0
            worker_stats[worker_id] += 1

            file = json_line[f'file']
            instructions = normalize(json_line[f'instructions'])

            # Likert rating (q1) and free-text suggestion (q2) per section.
            instruction_quality_value = float(json_line[f'instruction_quality_q1'])
            instruction_suggestions = normalize(json_line[f'instruction_quality_q2'])

            positive_ex_quality_value = float(json_line[f'positive_example_quality_q1'])
            positive_ex_suggestions = normalize(json_line[f'positive_example_quality_q2'])

            negative_ex_quality_value = float(json_line[f'negative_example_quality_q1'])
            negative_ex_suggestions = normalize(json_line[f'negative_example_quality_q2'])

            # Collect the positive-example inputs present in this HIT (at most
            # 5, matching the creation script); absent indices are skipped.
            positive_examples = []
            for idx in range(0, 5):
                id = f'positive_ex_{idx}_input'
                if id in json_line:
                    positive_examples.append(json_line[id])
            positive_examples_appended = normalize("//".join(positive_examples))

            # Same for the (at most 3) negative examples.
            negative_examples = []
            for idx in range(0, 3):
                id = f'negative_ex_{idx}_input'
                if id in json_line:
                    negative_examples.append(json_line[id])
            negative_examples_appended = normalize("//".join(negative_examples))

            # Flatten remaining newlines so each record stays on one TSV row.
            instruction_suggestions = instruction_suggestions.replace("\n", " ")
            positive_ex_suggestions = positive_ex_suggestions.replace("\n", " ")
            negative_ex_suggestions = negative_ex_suggestions.replace("\n", " ")

            # Shared per-HIT prefix repeated on every instance row below.
            prefix = f"{instruction_quality_value}\t{instruction_suggestions}\t{file}\t{instructions}" \
                     f"\t{positive_ex_quality_value}\t{positive_ex_suggestions}\t{positive_examples_appended}" \
                     f"\t{negative_ex_quality_value}\t{negative_ex_suggestions}\t{negative_examples_appended}"

            # Record non-trivial suggestions (length > 2 filters out empty and
            # one-character placeholder answers) for the summary at the end.
            if len(instruction_suggestions) + len(positive_ex_suggestions) + len(negative_ex_suggestions) > 2:
                if file not in suggestions:
                    suggestions[file] = []
                if len(instruction_suggestions.strip()) > 2:
                    suggestions[file].append(f" - regarding instructions: `{instruction_suggestions}`")
                if len(positive_ex_suggestions.strip()) > 2:
                    suggestions[file].append(f" - regarding p examples: `{positive_ex_suggestions}`")
                if len(negative_ex_suggestions.strip()) > 2:
                    suggestions[file].append(f" - regarding n examples: `{negative_ex_suggestions}`")
            # instance_input = []
            # instance_output = []
            # instance_prediction = []
            # Score and print each annotated instance in this HIT.
            # NOTE(review): `id` and `input` shadow builtins here; left
            # unchanged in this documentation-only pass.
            for idx in range(0, 5):
                id = f'instance_{idx}_input'
                if id in json_line:
                    input = normalize(json_line[id])
                    output = normalize(json_line[f'instance_{idx}_output'])
                    human_output = normalize(json_line[f'annotated_instance_{idx}_output'])
                    # Best ROUGE of the worker's answer over all references
                    # ('///' is the separator used by the creation script).
                    rouge_val = metric_max_over_ground_truths(
                        rouge, normalize2(human_output),
                        [normalize2(x) for x in output.split("///")]
                    )
                    print(f"{prefix}\t{input}\t{output}\t{human_output}\t{rouge_val}\t{worker_id}")

    # Final summary: a markdown checklist of suggestions grouped by task file.
    for file in sorted(suggestions.keys()):
        print(f" - [ ] {file}")
        for feedback in suggestions[file]:
            print(feedback)



# History of previously-aggregated batches, kept for provenance:
# aggregate_v2("batch-43ecd7ef-0717-4b72-a39c-b6179f8b5f77_task156_pilot/batch-results.jsonl")
# aggregate_v2("batch-65c49abc-f8a3-4f12-a71d-9ce5742c3419_start=60_end=100_max_size=5/batch-results.jsonl")
# aggregate_v2("batch-eea0ef32-da0a-47cf-bcae-810d1a503379_start=1_end=59_max_size=5/batch-results.jsonl")
# aggregate_v2("batch-cdcb497e-49a3-4cee-8ab5-1451dc19dac2_119_end=200_max_size=5/batch-results.jsonl")
# aggregate_v2("batch-58086c69-62bf-4e91-8741-b68d27e1fd63-start=201_end=300_max_size=5/batch-results.jsonl")
# aggregate_v2("batch-fc02d066-1e35-4184-b4ea-ba7eca1abcc1_start=301_end=400_max_size=5/batch-results.jsonl")
# aggregate_v2("batch-4739062e-2141-4f97-9a37-41197abf9a93_start=400_end=600_max_size=5/batch-results.jsonl")
# aggregate_v2("batch-23353cc5-13c1-4af9-94c3-03eb2bacbd0a_start=600_end=850_max_size=5/batch-results.jsonl")
# aggregate_v2("batch-4ee23f3d-2900-4fef-a0ae-d05bc7d519e8_start=850_end=1200_max_size=5/batch-results.jsonl")
# aggregate_v2("batch-13abedb3-9788-4118-8a23-89978a941638_start=1200_end=1536_max_size=5/batch-results.jsonl")
# NOTE(review): runs at import time and requires the (git-ignored) batch
# directory below to exist locally; consider guarding with
# `if __name__ == '__main__':`.
aggregate_v2("batch-eb1b61cd-36e7-4fdf-b8e1-fedd3c77245f_start=1540_end=1726_max_size=5/batch-results.jsonl")
24 changes: 24 additions & 0 deletions eval/amt/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Human Evaluation of Instructions
- Step1: Select a range of tasks and convert them into AMTI-friendly format:
```terminal
> python 1.amti_create_input_data.py --start [START_ID] --end [END_ID] --eval_count [COUNT]
```
For example:
```terminal
> python 1.amti_create_input_data.py --start 5 --end 20 --eval_count 10
```
- Step2: follow the steps in `2.run_eval.sh`
- Step3: evaluate


To disqualify a certain user, add them to `disqualification/blocklist.txt` and then:

    amti associate-qual --file disqualification/blocklist.txt --qual 3090SA10WM5MIHCWNTON1VROMP4CN3 --live

Or disqualify a particular user:

amti associate-qual WorkerId --qual 3090SA10WM5MIHCWNTON1VROMP4CN3 --live

To list workers that are disqualified:

amti associated --qual 3090SA10WM5MIHCWNTON1VROMP4CN3 --status Granted --live
8 changes: 8 additions & 0 deletions eval/amt/disqualification/qualificationtypeproperties.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"Name": "Qualification for Participating in natural instructions annotations ",
"Keywords": "Human evaluation",
"Description": "Workers that consistently produced low-quality work, or are suspected of producing spam.",
"QualificationTypeStatus": "Active",
"RetryDelayInSeconds": 86400,
"TestDurationInSeconds": 3600
}
1 change: 1 addition & 0 deletions eval/amt/mturk-specs/definition-likert/NOTES
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Human evaluation for natural instructions.
4 changes: 4 additions & 0 deletions eval/amt/mturk-specs/definition-likert/hitproperties.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"MaxAssignments": 1,
"LifetimeInSeconds": 86400
}
14 changes: 14 additions & 0 deletions eval/amt/mturk-specs/definition-likert/hittypeproperties.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"AutoApprovalDelayInSeconds": 86400,
"AssignmentDurationInSeconds": 3600,
"Reward": "0.70",
"Title": "language instructions - v17",
"Keywords": "language instructions",
"Description": "In this HIT, you will read input instructions and see a few examples. Based on that, you're expected to label/answer a new instance.",
"QualificationRequirements": [
{ "QualificationTypeId": "00000000000000000071", "Comparator": "In", "LocaleValues": [ { "Country": "US" }, { "Country": "GB" }, { "Country": "AU" }, { "Country": "CA" } ], "RequiredToPreview": false, "ActionsGuarded": "Accept" },
{ "QualificationTypeId": "000000000000000000L0", "Comparator": "GreaterThanOrEqualTo", "IntegerValues": [ 99 ], "RequiredToPreview": false, "ActionsGuarded": "Accept" },
{ "QualificationTypeId": "00000000000000000071", "Comparator": "In", "LocaleValues": [ { "Country": "US" }, { "Country": "GB" }, { "Country": "AU" }, { "Country": "CA" } ], "RequiredToPreview": false, "ActionsGuarded": "Accept" },
{ "QualificationTypeId": "3090SA10WM5MIHCWNTON1VROMP4CN3", "Comparator": "DoesNotExist" }
]
}
Loading

0 comments on commit ff125a6

Please sign in to comment.