-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathCLOCQ.py
178 lines (153 loc) · 6.71 KB
/
CLOCQ.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
import json
import pickle
import random
import re
import time
import requests
import sys
from clocq import config
from clocq.CLOCQAlgorithm import CLOCQAlgorithm
from clocq.knowledge_base.KnowledgeBase import KnowledgeBase
from clocq.StringLibrary import StringLibrary
from clocq.WikidataSearchCache import WikidataSearchCache
class CLOCQ:
    """
    Entry point bundling a KnowledgeBase and the CLOCQAlgorithm.

    Most methods are thin delegations to the underlying KnowledgeBase
    (labels, aliases, types, neighborhoods, connectivity); `get_search_space`
    runs the CLOCQ algorithm itself.
    """

    def __init__(self, tagme_token=None, dev=False):
        """
        Load the knowledge base and set up the CLOCQ algorithm.

        tagme_token: token handed to the StringLibrary; when falsy,
            config.TAGME_TOKEN is used instead.
        dev: when truthy, the KnowledgeBase is created with max_items=10.
        """
        # load required modules
        string_lib = StringLibrary(
            config.PATH_TO_STOPWORDS,
            tagme_token or config.TAGME_TOKEN,
            config.PATH_TO_TAGME_NER_CACHE,
        )
        wikidata_search_cache = WikidataSearchCache(config.PATH_TO_WIKI_SEARCH_CACHE)
        if dev:
            self.kb = KnowledgeBase(config.PATH_TO_KB_LIST, config.PATH_TO_KB_DICTS, max_items=10)
        else:
            self.kb = KnowledgeBase(config.PATH_TO_KB_LIST, config.PATH_TO_KB_DICTS)

        # load CLOCQ
        method_name = "clocq"
        self.clocq = CLOCQAlgorithm(
            self.kb,
            string_lib,
            method_name,
            config.NER,
            config.PATH_TO_STOPWORDS,
            config.PATH_TO_WIKI2VEC_MODEL,
            config.PATH_TO_WIKIPEDIA_MAPPINGS,
            config.PATH_TO_NORM_CACHE,
            wikidata_search_cache=wikidata_search_cache,
        )

        # define regex patterns
        self.ENTITY_PATTERN = re.compile("^Q[0-9]+$")
        self.PRED_PATTERN = re.compile("^P[0-9]+$")

    def get_label(self, kb_item):
        """
        Retrieves a single label for the given KB item.
        E.g. "France national association football team" for "Q47774".
        Note: The n-triples Wikidata dump stores multiple labels (not aliases) for the same item.
        Here, we return the first KB label which is not exactly the KB item id (i.e. "Q47774").
        Shown as: "Label".
        """
        return self.kb.item_to_single_label(kb_item)

    def get_labels(self, kb_item):
        """
        Retrieves the list of labels for the given KB item.
        E.g. ["France national association football team", "France national team"] for "Q47774".
        Note: The n-triples Wikidata dump stores multiple labels (not aliases) for the same item.
        Here, we return the full list of KB labels stored in the n-triples dump.
        Shown as: "Label".
        """
        return self.kb.item_to_labels(kb_item)

    def get_aliases(self, kb_item):
        """
        Retrieves the aliases for the given KB item.
        E.g. "France" for "Q47774".
        Shown as: "Also known as".
        """
        return self.kb.item_to_aliases(kb_item)

    def get_description(self, kb_item):
        """
        Retrieves the description for the given KB item.
        The descriptions can be seen on top of Wikidata pages.
        E.g. "men's national association football team representing France" for "Q47774".
        Shown as: "Description".
        """
        return self.kb.item_to_description(kb_item)

    def get_types(self, kb_item):
        """
        Retrieves the types for the given KB item.
        Returns list of items with keys: {"id", "label"}.
        E.g. [{"id": "Q6979593", "label": "national association football team"}] for "Q47774".
        """
        return self.kb.item_to_types(kb_item)

    def get_type(self, kb_item):
        """
        Retrieves the most frequent type for the given KB item.
        Returns a single item with keys: {"id", "label"}.
        E.g. {"id": "Q6979593", "label": "national association football team"} for "Q47774".
        """
        return self.kb.item_to_most_frequent_type(kb_item)

    def get_frequency(self, kb_item):
        """
        A list of two frequency numbers for the given KB item:
        - number of facts with the item occurring as subject
        - number of facts with the item occurring as object/qualifier-object.
        """
        return self.kb.get_frequency(kb_item)

    def get_neighborhood(self, kb_item, p=1000, include_labels=True, include_type=False):
        """
        Returns a list of facts including the item (the 1-hop neighborhood).
        Each fact is an n-tuple, with subject, predicate, object and qualifier information.
        The arguments p, include_labels and include_type are forwarded unchanged
        to the KnowledgeBase.
        """
        return self.kb.get_neighborhood(kb_item, p=p, include_labels=include_labels, include_type=include_type)

    def get_neighborhood_two_hop(self, kb_item, p=1000, include_labels=True, include_type=False):
        """
        Returns a list of facts in the 2-hop neighborhood of the item.
        Each fact is an n-tuple, with subject, predicate, object and qualifier information.
        The arguments p, include_labels and include_type are forwarded unchanged
        to the KnowledgeBase.
        """
        return self.kb.get_neighborhood_two_hop(kb_item, p=p, include_labels=include_labels, include_type=include_type)

    def connect(self, kb_item1, kb_item2):
        """
        Returns a list of paths between item1 and item2. Each path is given by either 1 fact
        (1-hop connection) or 2 facts (2-hop connections).
        """
        return self.kb.connect(kb_item1, kb_item2)

    def connectivity_check(self, kb_item1, kb_item2):
        """
        Returns the distance of the two items in the graph, given a fact-based definition.
        Returns 1 if the items are within 1 hop of each other,
        Returns 0.5 if the items are within 2 hops of each other,
        and returns 0 otherwise.
        """
        return self.kb.connectivity_check(kb_item1, kb_item2)

    def get_search_space(self, question, parameters=None, include_labels=True, include_type=False):
        """
        Extract a question-specific context for the given question using the CLOCQ algorithm.
        Returns k (context tuple, context graph)-pairs for the given questions,
        i.e. a mapping of question words to KB items and a question-relevant KG subset.

        parameters: optional dict of overrides; every key given here replaces the
            corresponding entry of config.DEF_PARAMS for this call only.
            In case it is None or empty, the default CLOCQ parameters are used.
        """
        # Merge into a fresh copy: writing the overrides into config.DEF_PARAMS
        # itself would silently change the defaults of every later call.
        merged_parameters = dict(config.DEF_PARAMS)
        if parameters:
            merged_parameters.update(parameters)
        # NOTE: "get_seach_space" (sic) is the method name this code calls on
        # CLOCQAlgorithm, which is defined outside this file.
        return self.clocq.get_seach_space(
            question,
            parameters=merged_parameters,
            include_labels=include_labels,
            include_type=include_type,
        )

    def is_wikidata_entity(self, string):
        """
        Check whether the given string can be a wikidata entity,
        i.e. consists of "Q" followed by one or more digits and nothing else.
        """
        # fullmatch instead of match: "$" alone would also accept a trailing newline ("Q5\n")
        return self.ENTITY_PATTERN.fullmatch(string) is not None

    def is_wikidata_predicate(self, string):
        """
        Check whether the given string can be a wikidata predicate,
        i.e. consists of "P" followed by one or more digits and nothing else.
        """
        # fullmatch instead of match: "$" alone would also accept a trailing newline ("P31\n")
        return self.PRED_PATTERN.fullmatch(string) is not None
"""
MAIN
"""
# Small smoke test when run as a script: load CLOCQ in dev mode and
# print the label of one example KB item.
if __name__ == "__main__":
    demo_item = "Q5"
    engine = CLOCQ(dev=True)
    print(engine.get_label(demo_item))