-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathdataset2key.py
60 lines (44 loc) · 2.29 KB
/
dataset2key.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
from pandas import read_csv
import argparse
import codecs
from eval_lib import get_best_id, format_lexsample, FIELD_NAMES, FIELD_TYPES, LIST_SEP, SCORE_SEP
# Separator placed between the individual sense predictions of one instance
# when several of them are written on the same key-file line.
SEMEVAL_SEP = " "
# Separator between a sense label and its score; this is the same "/" that
# format_score puts between the two parts.
SEMEVAL_SCORE_SEP = "/"
# Output-mode switches read by convert_dataset2semevalkey, checked in this order:
#   BEST_SENSE            -- write only what get_best_id returns for the row
#                            (presumably the top-ranked sense id);
#   BEST_SENSE_WITH_SCORE -- write the get_best_id result requested with
#                            output_score=True, passed through format_score;
#                            consulted only when BEST_SENSE is False.
# With both set to False every predicted sense is written with its score.
BEST_SENSE = True
BEST_SENSE_WITH_SCORE = False
def format_score(score):
    """Convert a 'label<SCORE_SEP>score' item into a SemEval 'label/score' item.

    The numeric part is rescaled to ``int(-100000 / score)`` (truncated
    towards zero), e.g. '0:-99.65' -> '0/1003' when SCORE_SEP is ':'.

    :param score: string with a sense label and a numeric score joined by SCORE_SEP.
    :return: the label, SEMEVAL_SCORE_SEP and the rescaled integer score as one string.
    :raises ValueError: if ``score`` does not split into exactly two parts on
        SCORE_SEP, or the second part is not a number.
    :raises ZeroDivisionError: if the numeric score is zero.
    """
    label, raw_score = score.split(SCORE_SEP)
    # Inverse scaling: scores closer to zero map to larger magnitudes.
    rescaled = int(-100000. * (1 / float(raw_score)))
    # Use the module constant instead of a hard-coded "/" so the separator is
    # defined in exactly one place.
    return "%s%s%d" % (label, SEMEVAL_SCORE_SEP, rescaled)
def convert_dataset2semevalkey(dataset_fpath, output_fpath, no_header=False):
    """Write the sense predictions of a lexical sample dataset as a SemEval key file.

    Each row of the dataset produces one output line of the form
    ``<target>.<target_pos> <context_id> <predicted senses>``. What goes into
    the last field is selected by the module flags BEST_SENSE and
    BEST_SENSE_WITH_SCORE (BEST_SENSE takes precedence).

    :param dataset_fpath: path to a tab-separated dataset; it must provide the
        columns ``target``, ``target_pos``, ``context_id`` and ``predict_sense_ids``.
    :param output_fpath: path of the key file to write (UTF-8, overwritten).
    :param no_header: if True, the input has no header row and the columns are
        named by FIELD_NAMES and typed by FIELD_TYPES.
    """
    # Parse the input BEFORE opening the output: opening with "w" truncates the
    # file, so a failing read must not be able to wipe an existing key file.
    # quotechar is NUL with doublequote off, presumably so that quote marks
    # inside the contexts are read literally -- TODO confirm.
    if no_header:
        df = read_csv(dataset_fpath, sep='\t', encoding='utf8', header=None, names=FIELD_NAMES,
                      dtype=FIELD_TYPES, doublequote=False, quotechar='\0')
        df.target = df.target.astype(str)
    else:
        # error_bad_lines=False makes pandas drop malformed lines instead of raising.
        # NOTE(review): this option was deprecated in pandas 1.3 in favour of
        # on_bad_lines; it is kept because this script targets Python 2.
        df = read_csv(dataset_fpath, encoding='utf-8', delimiter="\t", error_bad_lines=False,
                      doublequote=False, quotechar='\0')

    with codecs.open(output_fpath, "w", encoding="utf-8") as output:
        for _, row in df.iterrows():
            raw_ids = row.predict_sense_ids
            if BEST_SENSE:
                predicted_senses = get_best_id(unicode(raw_ids))
            elif BEST_SENSE_WITH_SCORE:
                predicted_senses = format_score(get_best_id(unicode(raw_ids), output_score=True))
            else:
                predicted_senses = SEMEVAL_SEP.join(format_score(s) for s in raw_ids.split(LIST_SEP))

            instance = row.target + "." + row.target_pos
            # Explicit write (text plus newline) emits the same bytes as the
            # Python 2 "print >> output" form.
            output.write(u"%s %s %s\n" % (instance, row.context_id, predicted_senses))

    print("Key file: %s" % output_fpath)
def main():
    """Command-line entry point.

    Parses the positional ``input`` and ``output`` paths and the optional
    ``--no_header`` switch, echoes them, passes the input path through
    format_lexsample and hands the returned path to convert_dataset2semevalkey.
    """
    cli = argparse.ArgumentParser(description='Convert lexical sample dataset to SemEval 2013 key format.')
    cli.add_argument('input', help='Path to a file with input lexical sample CSV file (9 columns or more).')
    cli.add_argument('output', help='Output file: a SemEval key file with the sense predictions.')
    cli.add_argument('--no_header', action='store_true', help='No headers. Default -- false.')
    options = cli.parse_args()

    # Echo the effective settings, one "caption value" line each.
    settings = (
        ("Input: ", options.input),
        ("Output: ", options.output),
        ("No header:", options.no_header),
    )
    for caption, value in settings:
        print("%s %s" % (caption, value))

    convert_dataset2semevalkey(format_lexsample(options.input), options.output, options.no_header)
# Run the converter only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()