dependency_benchmarks.py
import time

import spacy
from danlp.datasets import DDT
from danlp.models import load_spacy_model

from utils import print_speed_performance, dependency_report

import stanza
stanza.download('Danish')  # download the Danish model (you can comment this line out after the first run)
# Load the Danish Dependency Treebank (DDT)
ddt = DDT()
ccorpus_conll = ddt.load_as_conllu(predefined_splits=True)

# Collect the tokenized sentences and gold-standard dependencies from the
# test split (index 2 of the predefined train/dev/test splits)
deps_true = []
sentences_tokens = []
for sent in ccorpus_conll[2]:
    sentences_tokens.append([token.form for token in sent._tokens])
    deps_true.append([(token.deprel.lower(), int(token.head)) for token in sent._tokens])

num_sentences = len(sentences_tokens)
num_tokens = sum([len(s) for s in sentences_tokens])
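
# A quick sanity check (an illustrative addition, not part of the original
# benchmark): CoNLL-U heads are 1-based token positions within the sentence,
# with 0 reserved for the root, so every gold head must lie in that range.
for sent_deps in deps_true:
    for rel, head in sent_deps:
        assert 0 <= head <= len(sent_deps), f"head {head} out of range"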


def benchmark_spacy_mdl():
    def normalize_spacy_head(i, hd):
        return 0 if i == hd else hd + 1
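    # Added for illustration: spaCy reports heads as 0-based token indices
    # and marks the root by letting a token head itself, whereas CoNLL-U
    # heads are 1-based with 0 reserved for the root. For a two-token
    # sentence where token 0 is the root and token 1 attaches to it:
    #   normalize_spacy_head(0, 0) == 0   # root
    #   normalize_spacy_head(1, 0) == 1   # 1-based index of token 0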
    nlp = load_spacy_model()
    parser = nlp.parser

    start = time.time()

    deps_pred = []
    for sent in sentences_tokens:
        # tokens_from_list is the spaCy v2 API; it builds a Doc from the
        # gold tokens so the model's own tokenizer is bypassed
        doc = nlp.tokenizer.tokens_from_list(sent)
        doc = parser(doc)

        deprels = []
        depheads = []
        for i, tok in enumerate(doc):
            deprels.append(tok.dep_.lower())
            depheads.append(normalize_spacy_head(i, tok.head.i))
        deps_pred.append([(r, h) for r, h in zip(deprels, depheads)])

    print('**Spacy model**')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(deps_pred) == num_sentences
    assert sum([len(s) for s in deps_pred]) == num_tokens

    print(dependency_report(deps_true, deps_pred))


def benchmark_dacy_mdl(dacy_model="da_dacy_large_tft-0.0.0"):
    """
    An adaptation of the spaCy benchmark that is compatible with spaCy v3.
    Running this requires:
        spacy >= 3.0.0
        spacy-transformers
    """
    def normalize_spacy_head(i, hd):
        return 0 if i == hd else hd + 1

    # imported locally because dacy and spaCy v3 are only needed for this benchmark
    from spacy.tokens import Doc
    import dacy

    nlp = dacy.load(dacy_model)
    trf = nlp.get_pipe('transformer')
    parser = nlp.get_pipe('parser')
    start = time.time()

    deps_pred = []
    for sent in sentences_tokens:
        # spaCy v3: build a Doc from the pre-tokenized words, then run the
        # transformer and parser components directly
        doc = Doc(nlp.vocab, words=sent)
        doc = trf(doc)
        doc = parser(doc)

        deprels = []
        depheads = []
        for i, tok in enumerate(doc):
            deprels.append(tok.dep_.lower())
            depheads.append(normalize_spacy_head(i, tok.head.i))
        deps_pred.append([(r, h) for r, h in zip(deprels, depheads)])

    print('**DaCy model**')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(deps_pred) == num_sentences
    assert sum([len(s) for s in deps_pred]) == num_tokens

    print(dependency_report(deps_true, deps_pred))


def benchmark_stanza_mdl():
    nlp = stanza.Pipeline('da', processors='tokenize,pos,lemma,depparse',
                          tokenize_pretokenized=True)

    start = time.time()

    deps_pred = []
    for sent in sentences_tokens:
        # tokenize_pretokenized=True makes Stanza split on whitespace only,
        # so joining the gold tokens with spaces preserves the tokenization
        doc = nlp(" ".join(sent))

        deprels = []
        depheads = []
        for tok in doc.iter_tokens():
            # Stanza's word.head already follows the CoNLL-U convention
            # (1-based, with 0 for the root), so no normalization is needed
            deprels.append(tok.words[0].deprel)
            depheads.append(tok.words[0].head)
        deps_pred.append([(r, h) for r, h in zip(deprels, depheads)])

    print('**Stanza model**')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(deps_pred) == num_sentences
    assert sum([len(s) for s in deps_pred]) == num_tokens

    print(dependency_report(deps_true, deps_pred))


if __name__ == '__main__':
    benchmark_spacy_mdl()
    benchmark_stanza_mdl()

    # To run the following benchmarks, make a new environment with
    # spacy ~3.0.1, dacy and spacy-transformers.
    # benchmark_dacy_mdl(dacy_model="da_dacy_small_tft-0.0.0")
    # benchmark_dacy_mdl(dacy_model="da_dacy_medium_tft-0.0.0")
    # benchmark_dacy_mdl(dacy_model="da_dacy_large_tft-0.0.0")
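    # For example, something like the following should set up such an
    # environment (the exact pins are an assumption, not from the original):
    #   pip install "spacy~=3.0.1" spacy-transformers dacy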