-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathpreprocessor.py
More file actions
115 lines (89 loc) · 2.72 KB
/
preprocessor.py
File metadata and controls
115 lines (89 loc) · 2.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
"""
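Text preprocessing helpers for question content.

This module was originally described as separating files into smaller units, each
containing no more than 10k characters; no such file-splitting code is present in
the module as it stands. What it provides now: splitting text into sentence-like
lines, expanding common contractions, replacing special characters, dropping stop
words, and a helper that removes everything within brackets.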
"""
import os
import shared
import constants
from unidecode import unidecode
from string import punctuation
from collections import OrderedDict
# NOTE(review): not referenced by any function in this module - presumably a
# leftover from document-processing code; confirm before relying on it.
DOC_DIR_PATH = ""
def remove_enclosed_content(line):
    """Return *line* with every bracketed span removed.

    Text enclosed by ``[]``, ``()`` or ``{}`` is dropped together with the
    brackets themselves.  Nesting depth is tracked separately for each
    bracket type, and a character is kept only while all three depths are
    zero.

    An unmatched closing bracket (one with no pending opener of its type)
    is treated as ordinary text.  Previously this held for ``]`` and ``)``
    only: a stray ``}`` pushed its counter negative, which silently
    discarded the rest of the line.

    :param line: text to filter
    :return: the text that lies outside all brackets
    """
    # Current nesting depth, keyed by the opening bracket.
    depth = {'[': 0, '(': 0, '{': 0}
    # Closing bracket -> the opener whose depth it reduces.
    closers = {']': '[', ')': '(', '}': '{'}

    kept = []
    for ch in line:
        if ch in depth:
            depth[ch] += 1
            continue
        opener = closers.get(ch)
        if opener is not None and depth[opener] > 0:
            depth[opener] -= 1
            continue
        # Ordinary character, or an unmatched closer: keep it only when we
        # are outside every kind of bracket.
        if not any(depth.values()):
            kept.append(ch)
    return ''.join(kept)
def break_multiple_lines(line):
    """Split *line* into sentence-like segments.

    The input is first split on newlines.  Each resulting line is trimmed
    of surrounding whitespace and of leading/trailing periods, then split
    at every period that is followed by a space (or that ends the line).
    A period followed by any other character (e.g. in ``3.14``) does not
    split.  The text after the last split point is appended as the final
    segment of its line, so an empty line contributes an empty string.

    Fixes relative to the previous version:

    * the trimmed line is now actually used (the result of
      ``strip().strip('.')`` used to be discarded);
    * the trailing segment is taken as ``text[start:]`` instead of
      ``text[start:-1]``, which dropped the last character of every line
      that did not end in a period.

    :param line: text that may contain several lines and sentences
    :return: list of segments in their original order
    """
    ret_lines = []
    for raw in line.split('\n'):
        text = raw.strip().strip('.')
        last_index = len(text) - 1
        start = 0
        for ix, ch in enumerate(text):
            if ch != '.':
                continue
            if ix == last_index or text[ix + 1] == ' ':
                ret_lines.append(text[start:ix])
                # Skip the period and the space that follows it.
                start = ix + 2
        ret_lines.append(text[start:])
    return ret_lines
def process_word(word: str):
    """Lower-case *word*; return ``""`` if it is a stop word.

    The lower-cased form is looked up in ``shared.STOP_WORDS``; a hit
    yields the empty string, otherwise the lower-cased word is returned.
    """
    lowered = word.lower()
    return "" if lowered in shared.STOP_WORDS else lowered
def replace_special_meaning_symbols(line):
    """Turn commas into newlines and expand common English contractions.

    As a side effect, ``constants.punctuations`` is filled with the set of
    ``string.punctuation`` characters if it is currently empty/falsy.

    :param line: text to rewrite
    :return: the text after all substitutions have been applied in order
    """
    # Populate the shared punctuation set the first time it is needed.
    if not constants.punctuations:
        constants.punctuations = set(punctuation)

    # Applied top to bottom; "can't" has to be handled before the generic
    # "n't" rule so that it becomes "can not" rather than "ca not".
    substitutions = (
        (',', '\n'),
        ("'s", " is"),
        ("'m", " am"),
        ("can't", "can not"),
        ("n't", " not"),
        ("'ll", " will"),
    )

    rewritten = line
    for needle, replacement in substitutions:
        rewritten = rewritten.replace(needle, replacement)
    return rewritten
def process_line(line):
    """Normalise one line of text into a space-separated string of words.

    Steps, in order:

    1. pass the text through ``unidecode``;
    2. trim surrounding whitespace and leading/trailing periods;
    3. apply ``replace_special_meaning_symbols``;
    4. replace every character found in
       ``constants.question_replaceable_special_characters`` with a space;
    5. split on whitespace, run each token through ``process_word`` and
       drop the tokens that come back empty;
    6. join the surviving words with single spaces.

    The former trailing ``line.replace(" ", " ")`` step was removed: it was
    a no-op, and joining non-empty ``split()`` tokens with one space cannot
    produce consecutive spaces in the first place.

    :param line: raw text of a single line/sentence
    :return: the cleaned text (possibly empty)
    """
    text = unidecode(line).strip().strip('.').strip()
    text = replace_special_meaning_symbols(text)

    replaceable = constants.question_replaceable_special_characters
    text = "".join(' ' if c in replaceable else c for c in text)

    words = (process_word(w) for w in text.split())
    return " ".join(w for w in words if w)
def preprocess_question_content(content):
    """Clean *content* segment by segment and return it as one string.

    The text is split with ``break_multiple_lines``, each segment is
    cleaned with ``process_line``, segments that end up empty are dropped,
    and the rest are joined with single spaces.
    """
    cleaned_segments = (
        process_line(segment) for segment in break_multiple_lines(content)
    )
    return " ".join(filter(None, cleaned_segments))
# Script entry point: running this module directly currently does nothing.
if __name__ == "__main__":
    pass
    # NOTE(review): `preprocess_document` is not defined in this module; the
    # call below is kept only as a record of earlier usage.
    # preprocess_document("part_3", ".txt")