-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathretriever.py
More file actions
104 lines (90 loc) · 2.59 KB
/
retriever.py
File metadata and controls
104 lines (90 loc) · 2.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# Assignment 3 - Retriever
from indexer import getFilePaths
import math, json, requests, os
import tkinter as tk
from bs4 import BeautifulSoup
# Module-level state; all three are filled in by initialize().
index: dict = {}  # term -> posting entries, loaded from indexjson.json
files: list = []  # document file paths; a docID is used as a position in this list
N: int = 0  # number of entries in `files`, used as the corpus size when scoring
def initialize():
    ''' Populate the module-level `files`, `N` and `index`.

    `files` comes from indexer.getFilePaths(), `N` is its length, and
    `index` is the parsed content of 'indexjson.json' in the current
    working directory.
    '''
    global index, files, N
    files = getFilePaths()
    N = len(files)
    with open('indexjson.json') as index_file:
        index = json.load(index_file)
def formatQuery(query):
    ''' Lower-case the query string and split it on whitespace into terms. '''
    return query.lower().split()
def search(query):
    ''' Run a query end to end and return the ranked result URLs.

    The query is tokenised with formatQuery(), candidate documents are
    collected with gatherDocs(), scored with rankResults() and finally
    turned into URLs by returnResults(). An empty list is returned when
    no document matches.
    '''
    terms = formatQuery(query)
    docs, validWords = gatherDocs(terms)
    if not docs:
        return []
    return returnResults(rankResults(docs, validWords), query)
def gatherDocs(words):
    ''' Gather the documents that contain every indexed query word.

    Words that are not present in the index are ignored rather than
    forcing an empty result.

    words: iterable of query terms.

    Returns a tuple (commonDocs, validWords):
      commonDocs - doc IDs found in the posting list of every valid word,
                   without duplicates, in the order they appear in the
                   first valid word's posting list.
      validWords - the query words that exist in the index, in query order.
    Both are empty lists when none of the words is indexed.
    '''
    postings = []
    validWords = []
    for word in words:
        if word in index:
            validWords.append(word)
            # The first element of each posting entry is the doc ID.
            postings.append([entry[0] for entry in index[word]])

    if not validWords:
        return [], []

    # Intersect ALL posting lists. The previous loop re-intersected
    # allDocs[0] with each sublist in turn, so only the last one counted.
    first, *rest = postings
    shared = set(first).intersection(*rest)

    # Filter the first list instead of returning list(set) so the output
    # order is deterministic.
    commonDocs = list(dict.fromkeys(doc for doc in first if doc in shared))
    return commonDocs, validWords
def rankResults(docs, query):
    ''' Score every candidate document against the query.

    Returns a dict mapping each doc ID in `docs` to the value of
    calculateScore(doc, query, len(docs)).
    '''
    doc_count = len(docs)
    return {doc: calculateScore(doc, query, doc_count) for doc in docs}
def calculateScore(doc, query, length):
    ''' Compute a weighted tf-idf score for one document.

    doc:    doc ID to score.
    query:  iterable of terms; each must be a key of the global `index`
            (a missing term raises KeyError).
    length: number of documents that matched the query; used as the
            document frequency. Must be > 0, and the global N must be > 0.

    Each posting entry in index[word] is read as
    (doc ID, term frequency, special weight).

    Returns (1 + ln(tf)) * ln(N / length) * weight, where tf is the summed
    term frequency of all query words in `doc`. Returns 0.0 when `doc`
    appears in none of the posting lists.
    '''
    tf = 0
    weight = 0
    # Sum the term frequency of every query word found in this document.
    for word in query:
        for match in index[word]:
            if match[0] == doc:
                tf += match[1]
                # NOTE(review): only the weight of the last matching word is
                # kept, as in the original code - confirm this is intended.
                weight = match[2]
                break

    # No occurrences: log(0) is undefined and there is no weight to apply.
    if tf <= 0:
        return 0.0

    # Log-scaled tf times idf. The parenthesis used to wrap the whole
    # expression, which evaluated 1 + log(tf) * log(N / df) instead.
    tfidf = (1 + math.log(tf)) * math.log(N / length)

    # Factor in the weight of headings/bolded words.
    return tfidf * weight
def getPageTitle(url, timeout=10):
    ''' Fetch a page and return the text of its <title> element.

    url:     address to download.
    timeout: seconds to wait for the server before requests raises a
             timeout error; without one the call can block indefinitely.

    Returns the title string, or None when the page has no <title>
    element or the element has no single string child.
    Raises requests.exceptions.RequestException on network failure.
    '''
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'lxml')
    title = soup.title
    # soup.title is None for pages without a <title> tag; don't crash on it.
    return title.string if title is not None else None
def returnResults(scores, query):
    ''' Turn scored documents into a ranked list of unique URLs.

    scores: dict mapping doc ID -> score.
    query:  unused; kept so existing callers keep working.

    Returns the URLs of the documents ordered from highest to lowest
    score, with the '#fragment' part removed and duplicate URLs dropped
    (the highest-ranked occurrence wins). Empty `scores` gives [].
    '''
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    seen = set()  # O(1) duplicate check instead of scanning the result list
    results = []
    for docID, _score in ranked:
        base_url = getURL(docID).split('#', 1)[0]
        if base_url not in seen:
            seen.add(base_url)
            results.append(base_url)
    return results
def getURL(docID):
    ''' Return the "url" field of the JSON file for the given document.

    docID: position of the document's file path in the global `files`.

    Raises IndexError for an out-of-range docID, OSError if the file
    cannot be opened, and KeyError if the JSON has no "url" key.
    '''
    # The with-block closes the file; no explicit close() is needed.
    with open(files[docID]) as f:
        return json.load(f)["url"]