queryProcessor.py
import os
import sys
from hashtable import HashTable


class QueryProcessor:
    def __init__(self, queries, inv_dir):
        self.queries = [q.lower() for q in queries]
        self.inv_dir = inv_dir
        self.accumulator = {}
        self.dict_file = f'{inv_dir}/dict.txt'
        self.post_file = f'{inv_dir}/post.txt'
        self.map_file = f'{inv_dir}/map.txt'
        self.hashtable = HashTable(15000)
        # bail out early if any inverted-file component is missing
        if self.checkFileExists():
            sys.exit()
    # read one fixed-width record from an inverted file: every record occupies
    # exactly bytes_to_read bytes, so record N starts at byte N * bytes_to_read
    @staticmethod
    def readLineFile(file_path, line_number, bytes_to_read):
        byte_offset = line_number * bytes_to_read
        with open(file_path, 'rb') as file:
            file.seek(byte_offset)
            data = file.read(bytes_to_read)
        return data
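
    # Illustrative example (assumed layout, not from the original file): with
    # 43-byte dictionary records, readLineFile(dict_file, 2, 43) seeks to byte
    # 86 and returns the raw bytes of the third record, padding included.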

    # check that all three inverted-file components exist; returns True if any is missing
    def checkFileExists(self):
        flag = False
        if not os.path.exists(self.dict_file):
            print('Error: Dictionary file not found!')
            flag = True
        if not os.path.exists(self.post_file):
            print('Error: Postings file not found!')
            flag = True
        if not os.path.exists(self.map_file):
            print('Error: Mappings file not found!')
            flag = True
        return flag

    # run the query processor: resolve each token and fold its postings into the accumulator
    def runQueryProcessor(self):
        for query in self.queries:
            dict_content = self.seekDictionary(query)
            if dict_content is not None:
                rec = Records(dict_content, self)
                postings = rec.seekPostings()
                self.updateAccumulator(postings)
            # tokens missing from the dictionary are skipped silently

    # get the source filename for a doc id from the fixed-width map file
    def getFileName(self, doc_id):
        content = self.readLineFile(self.map_file, doc_id, bytes_to_read=18).decode()
        record = content.strip().split('|')
        record = [i.strip() for i in record]
        return record[1]

    # get the dictionary record for a query token, linear-probing from its hash slot
    def seekDictionary(self, query):
        index = self.hashtable.__find__(query)
        while True:
            line_content = self.readLineFile(self.dict_file, index, bytes_to_read=43).decode()
            dict_rec = self.formatRecords(line_content)
            if dict_rec[0] == query:
                return dict_rec
            elif dict_rec[1] == -1:
                # an empty slot (num_docs sentinel of -1) means the token was never indexed
                return None
            index += 1
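
    # Probe sketch (illustrative; the hashing scheme is an assumption): if the
    # query hashes to slot 700 but slot 700 holds a different token, the loop
    # reads slots 701, 702, ... until it finds the token or hits an empty slot
    # carrying the -1 sentinel.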

    # fold each posting's weight into the accumulator bucket for its document
    def updateAccumulator(self, postings):
        for post in postings:
            doc_id, wt = post
            if doc_id in self.accumulator:
                self.accumulator[doc_id] += wt
            else:
                self.accumulator[doc_id] = wt

    # sort in descending order of accumulated weight and return the final bucket
    def getAccumulator(self):
        return dict(sorted(self.accumulator.items(), key=lambda item: item[1], reverse=True))

    # split a fixed-width dictionary line 'token | num_docs | start' into a
    # typed tuple, e.g. ('apple', 3, 120): 3 postings beginning at line 120
    @staticmethod
    def formatRecords(content):
        record = content.strip().split('|')
        record = [i.strip() for i in record]
        record[1], record[2] = int(record[1]), int(record[2])
        return tuple(record)


class Records:
    def __init__(self, dict_content, queryprocessor):
        self.token, self.num_docs, self.start = dict_content
        self.postings = []
        self.qp = queryprocessor

    # format a postings record: 'doc_id | weight' becomes (int, float)
    @staticmethod
    def formatRecords(content):
        record = content.strip().split('|')
        record = [i.strip() for i in record]
        record[0], record[1] = int(record[0]), float(record[1])
        return tuple(record)

    # read this token's num_docs consecutive 21-byte postings records, starting at self.start
    def seekPostings(self):
        for _ in range(self.num_docs):
            line_content = self.qp.readLineFile(self.qp.post_file, self.start, 21).decode()
            self.postings.append(self.formatRecords(line_content))
            self.start += 1
        return self.postings
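

# Minimal usage sketch, not part of the original module: the 'inverted_files'
# directory name and the example queries are assumptions. It runs the queries
# and prints the top ten documents by accumulated weight.
if __name__ == '__main__':
    qp = QueryProcessor(['apple', 'banana'], 'inverted_files')
    qp.runQueryProcessor()
    for doc_id, weight in list(qp.getAccumulator().items())[:10]:
        print(f'{qp.getFileName(doc_id)}: {weight}')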