-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathSession.py
More file actions
120 lines (87 loc) · 3.56 KB
/
Session.py
File metadata and controls
120 lines (87 loc) · 3.56 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
from Conversion import Conversion, is_operand, Et
import numpy as np
import pickle as p
from text_processing_utils import extract_keywords
from nltk.stem import PorterStemmer
from functools import reduce
class Session:
    """Query session over a pre-built index loaded from disk.

    Attributes are populated by :meth:`load`:

    * ``BIR``    -- index supporting ``term in BIR`` and ``BIR[term]``; each
      entry holds the ids of the documents containing that (stemmed) term.
    * ``scores`` -- array of per-document PageRank scores, indexed by doc id.
    * ``tf_idf`` -- mapping from term to a per-document tf-idf vector.
    * ``N``      -- number of documents, stored alongside ``BIR``.
    """

    # Tokens that turn a query into a boolean expression.
    _OPERATORS = frozenset(("AND", "OR", "NOT"))

    _NO_MATCH_MESSAGE = (
        "Sorry! We found 0 document that matches your query. \n\n"
        " Can you try searching with other keywords?!"
    )

    def __init__(self):
        self.BIR = None
        self.scores = None
        self.tf_idf = None
        self.N = None

    def load(self, path_bir, pscore_path, tf_idf):
        """Load the index, the document scores and the tf-idf table.

        :param path_bir: path to a pickle holding the ``(BIR, N)`` pair.
        :param pscore_path: path to a NumPy file holding the score array.
        :param tf_idf: path to a pickle holding the tf-idf table.
        :return: ``True`` when all three files were read, ``False`` when any
            of them does not exist (the session is then left unchanged).
        """
        try:
            # NOTE(security): unpickling runs arbitrary code embedded in the
            # file; only load index files produced by a trusted source.
            with open(path_bir, "rb") as f:
                index, doc_count = p.load(f)
            with open(pscore_path, "rb") as f:
                doc_scores = np.load(f)
            with open(tf_idf, "rb") as f:
                term_weights = p.load(f)
        except FileNotFoundError:
            print("The input path is not valid. Please check the input paths.")
            return False

        # Assign only after every read succeeded so that a failed load never
        # leaves the session half-initialised.
        self.BIR, self.N = index, doc_count
        self.scores = doc_scores
        self.tf_idf = term_weights
        return True

    def advance_search(self, query, top_k=10):
        """Evaluate a boolean query and rank the hits by PageRank score.

        Operators (``AND``, ``OR``, ``NOT``) must be separate,
        whitespace-delimited tokens. A query without any operator token is
        handed to :meth:`search`.

        :param query: infix boolean expression, or plain keywords.
        :param top_k: maximum number of results to return.
        :return: ``(docs, scores)`` sequences sorted by descending score;
            ``([], [])`` when nothing matches.
        """
        tokens = query.split()
        # Compare whole tokens: a substring test would treat words such as
        # "ANDROID" or "WORLD" as if they contained an operator.
        if self._OPERATORS.isdisjoint(tokens):
            return self.search(query, top_k)

        print("Searching through {} documents...".format(self.N))
        stemmer = PorterStemmer()
        # Stem the operands only; operators and brackets pass through as-is.
        normalised = " ".join(
            stemmer.stem(tok) if is_operand(tok) else tok for tok in tokens
        )
        converter = Conversion()
        root = converter.constructTree(converter.infix_to_postfix(normalised))

        # Deduplicate and cast to int so the ids can index ``scores`` even
        # when the stored postings are lists or float arrays.
        found = np.unique(np.asarray(self._solve(root)).astype(int))
        return self._rank(found, self.scores[found], top_k)

    def search(self, query, top_k=10):
        """Keyword search ranked by PageRank score times summed tf-idf.

        Every document containing at least one query keyword is retrieved.

        :param query: free-text query; keywords are extracted and stemmed.
        :param top_k: maximum number of results to return.
        :return: ``(docs, scores)`` sequences sorted by descending score;
            ``([], [])`` when no keyword is present in the index.
        """
        print("Searching through {} documents...".format(self.N))
        terms = [t for t in extract_keywords(query, stem=True) if t in self.BIR]
        if not terms:
            print(self._NO_MATCH_MESSAGE)
            return [], []

        # One tf-idf weight per document: the sum over all matched terms.
        tf_idf = np.sum([np.ravel(self.tf_idf[t]) for t in terms], axis=0)

        retrieved_doc = reduce(np.union1d, (self.BIR[t] for t in terms))
        retrieved_doc = np.asarray(retrieved_doc).astype(int)

        final = np.multiply(self.scores[retrieved_doc], tf_idf[retrieved_doc])
        return self._rank(retrieved_doc, final, top_k)

    def _rank(self, doc_ids, weights, top_k):
        """Sort doc_ids by descending weight, report the hits, keep top_k."""
        ranked = sorted(zip(doc_ids, weights), key=lambda pair: pair[1], reverse=True)
        if not ranked:
            print(self._NO_MATCH_MESSAGE)
            return [], []
        docs, scores = zip(*ranked)
        print("Found {} in {} documents!!".format(len(docs), self.N))
        print("Returning {}".format(top_k))
        return docs[:top_k], scores[:top_k]

    def _solve(self, root):
        """Recursively evaluate an expression tree into an array of doc ids.

        Leaves are looked up in ``BIR``; a term that is not indexed yields an
        empty posting instead of raising. Inner nodes combine their children:
        ``AND`` -> intersection, ``OR`` -> union, anything else (``NOT``) ->
        left minus right. A missing node evaluates to an empty array.
        """
        if not root:
            return np.array([], dtype=int)

        val = root.value
        if is_operand(val):
            if val in self.BIR:
                return self.BIR[val]
            return np.array([], dtype=int)

        left = self._solve(root.left)
        right = self._solve(root.right)
        if val == "AND":
            return np.intersect1d(left, right)
        if val == "OR":
            return np.union1d(left, right)
        return np.setdiff1d(left, right)