-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathquery_processing.py
327 lines (257 loc) · 10.9 KB
/
query_processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
import os
import math as m
import numpy as np
import pandas as pd
import tkinter as tk
from pathlib import Path
from nltk.stem import PorterStemmer
from tkinter import Tk, Canvas, Entry, Text, Button, PhotoImage
def extract_weights():
"""
This function is used to extract the TF-IDF and the IDF weights from their respective csv files.
Returns:
TF-IDF (DataFrame): The extracted TF-IDF weights.
IDF (DataFrame): The extracted IDF weights.
"""
TF_IDF = pd.read_csv('tf-idf.csv', index_col=0) # read TF-IDF DataFrame from CSV
IDF = pd.read_csv('idf.csv', index_col=0) # read IDF DataFrame from CSV
return TF_IDF, IDF
def get_docIDs():
"""
This function is used to extract document IDs based on the names of the files in the 'ResearchPapers' directory.
It gets the current working directory and lists all the files in the 'ResearchPapers' directory.
It then extracts the document IDs from the names of these files, sorts them, and returns the sorted list.
Assumes the 'ResearchPapers' folder is in your current working directory.
Returns:
docID (list): A sorted list of document IDs extracted from the file names in the 'ResearchPapers' directory.
"""
curr_dir = os.getcwd() # get the current directory
docID = [int(c.rstrip('.txt')) for c in os.listdir(curr_dir + '\ResearchPapers')] # extract the docIDs from the names of the files in the ResearchPapers directory
docID.sort()
return docID
def get_stopwords():
"""
This function is used to extract stopwords from 'Stopword-List.txt' file.
It reads each line from the file, and if the line is not empty, it appends the line to the stopwords list.
The function continues this process until it reaches the end of the file. Assumes the file is in your current working directory.
Returns:
stopwords (list): A list of stopwords extracted from the file.
"""
stopwords = []
with open('Stopword-List.txt', 'r') as f: # the 'Stopword-List.txt' file is opened in read mode
while True:
text = f.readline() # each line from the file is read one by one
if not text: # if the line read is empty (which means end of file), the loop is broken
break
stopwords.append(text) # else append the read line to the stopwords list
stopwords = [c.rstrip(' \n') for c in stopwords if c != '\n'] # a new list is created from stopwords, excluding any newline characters. Newline characters are also removed from the strings.
return stopwords
def calculate_QueryVector(query, IDF):
"""
This function calculates the query vector based on the query terms and the IDF weights.
Args:
query (list): A list of query terms.
IDF (DataFrame): The Inverse Document Frequency weights.
Returns:
query_vector (dictionary): The query vector based on the query terms and the IDF weights.
"""
query_vector = {} # create a list of zeros with the length of the columns in the IDF DataFrame
for term in IDF.columns: # loop through each term in the IDF columns
if term in query: # if the term is in the query
query_vector[term] = 1 + m.log(query.count(term), 10) # calculate the log term frequency weight
query_vector[term] *= IDF.loc[0, term] # multiply the log term frequency weight by the IDF weight
else:
query_vector[term] = 0 # if the term is not in the query, set the weight to zero
norm = sum([query_vector[x] ** 2 for x in query_vector.keys()]) ** 0.5 # calculate the norm of the query vector
if norm != 0: # if the norm is not zero
for term in query_vector:
query_vector[term] = query_vector[term] / norm # normalize the query vector
return query_vector
def QueryProcessing(query):
'''
This function processes the query by stemming the words, removing stopwords, and calculating the TF-IDF weights of the terms.
Args:
query (str): The query string to be processed.
Returns:
vectors (DataFrame): The DataFrame containing the TF-IDF weights of all the terms.
query_vector (DataFrame): The DataFrame containing TF-IDF weights the query terms.
'''
vectors, IDF = extract_weights() # # read the TF-IDF and IDF DataFrames from CSV
query_vector = calculate_QueryVector(query, IDF) # calculate the query vector
query_vector = pd.DataFrame(query_vector, index=[0]) # create a DataFrame from the query vector
return vectors, query_vector
def find_sim(query):
'''
This function calculates the similarity scores between the query vector and the document vectors.
Args:
query (string): The query string to be processed.
Returns:
scores (string): a string containing the document IDs with similarity scores greater than or equal to 0.05, sorted and ranked with respect to their scores.
'''
query = query.split() # split the query into words
porter_stemmer = PorterStemmer() # initialize the stemmer
stopwords = get_stopwords() # get the stopwords
doc = get_docIDs() # get the document IDs
query = [porter_stemmer.stem(word).rstrip("'").casefold() for word in query if word not in stopwords] # stem the words in the query and remove the stopwords
vectors , query_vector = QueryProcessing(query) # calculate the query vector
score = {} # create a dictionary to store the similarity scores
for docID in doc:
score[docID] = np.dot(vectors[str(docID)], query_vector.transpose()) # calculate the similarity score for each document
score = {k: v for k, v in sorted(score.items(), key=lambda item: item[1], reverse=True)} # sort the similarity scores in descending order
score = {k: score[k] for k in score if score[k] >= 0.05} # remove any documents with a similarity score less than 0.05
score = [k for k in score.keys()]
score = ' '.join(map(str, score))
return score
def process_query():
"""
This function retrieves a user's query from a GUI text entry field, processes the query, and displays the result in a GUI label.
"""
query = enter_query.get() # get the query from the text entry field
result = find_sim(query)
if result == '': # if no documents are found
result = 'No documents found'
output_label.configure(state='normal') # enable the output label
output_label.delete(0.0, tk.END) # clear the output label
output_label.insert(0.0, result) # insert the result into the output label
output_label.configure(state='disabled') # again disable the output label
OUTPUT_PATH = Path(__file__).parent # get the output path
ASSETS_PATH = OUTPUT_PATH / Path(r"C:\Users\Hp\Desktop\Information Retrieval\Assignment 2\build\assets\frame0") # get the assets path
def relative_to_assets(path: str) -> Path: # function to get the relative path of the assets
return ASSETS_PATH / Path(path)
window = Tk() # create a new window
window.geometry("700x400") # set the window size
window.configure(bg = "#FFFFFF") # set the window background color
# create a canvas to place the widgets
canvas = Canvas(
window,
bg = "#FFFFFF",
height = 400,
width = 700,
bd = 0,
highlightthickness = 0,
relief = "ridge"
)
# place the canvas at the top left corner of the window
canvas.place(x = 0, y = 0)
canvas.create_rectangle(
0.0,
0.0,
700.0,
49.0,
fill="#51ADD4",
outline="")
# place the Vector Space Model text, that has been converted to an image, at the given co-ordinates on the canvas
vsm_text_image = PhotoImage(
file=relative_to_assets("image_1.png"))
image_1 = canvas.create_image(
130.0,
24.0,
image=vsm_text_image
)
# place the image at the given co-ordinates for the query entry box on the canvas
entry_image_1 = PhotoImage(
file=relative_to_assets("entry_1.png"))
entry_bg_1 = canvas.create_image(
350.0,
130.5,
image=entry_image_1
)
# place the text entry field at the given co-ordinates on the canvas with the foreground and background matching the image for the entry box
enter_query = Entry(
bd=0,
bg="#D9D9D9",
fg="#000716",
highlightthickness=0
)
enter_query.place(
x=52.0,
y=110.0,
width=596.0,
height=39.0
)
# place the image at the given co-ordinates for the output label box on the canvas
output_label_image = PhotoImage(
file=relative_to_assets("entry_2.png"))
output_label_bg = canvas.create_image(
350.0,
313.5,
image=output_label_image
)
# create the output label at the given co-ordinates on the canvas with the foreground and background matching the image for the output label box
output_label = Text(
bd=0,
bg="#D9D9D9",
fg="#000716",
highlightthickness=0
)
output_label.place(
x=52.0,
y=293.0,
width=596.0,
height=39.0
)
# disable the output label so that it cannot be edited
output_label.configure(
state='disabled',
font=('Monteserrat', 10)
)
# place the "Enter Query" text, that has been converted to an image, above the query entry box
image_image_2 = PhotoImage(
file=relative_to_assets("image_2.png"))
image_2 = canvas.create_image(
100.0,
88.0,
image=image_image_2
)
# place the "Result" text, that has been converted to an image, above the output label box
image_image_3 = PhotoImage(
file=relative_to_assets("image_3.png"))
image_3 = canvas.create_image(
80.0,
275.0,
image=image_image_3
)
# place the "Search" button, that has been converted to an image, at the given co-ordinates on the canvas
button_image_1 = PhotoImage(
file=relative_to_assets("button_1.png"))
# create the button that is to be placed on the canvas for the searching, pressing this button will call the process_query function
button_1 = Button(
image=button_image_1,
borderwidth=0,
highlightthickness=0,
command=process_query,
relief="flat"
)
button_1.place(
x=176.0,
y=166.0,
width=348.0,
height=41.0
)
# place the "Exit" button, that has been converted to an image, at the given co-ordinates on the canvas, pressing this button will close the window
button_image_2 = PhotoImage(
file=relative_to_assets("button_2.png"))
button_2 = Button(
image=button_image_2,
borderwidth=0,
highlightthickness=0,
command=window.destroy,
relief="flat"
)
button_2.place(
x=176.0,
y=222.0,
width=348.0,
height=41.0
)
# place the image at the given co-ordinates for the logo on the canvas
image_image_4 = PhotoImage(
file=relative_to_assets("image_4.png"))
image_4 = canvas.create_image(
657.0,
24.0,
image=image_image_4
)
window.resizable(False, False) # make the window non-resizable
if __name__ == "__main__":
window.mainloop() # run the window