# stackoverflow_api_recommender/preprocessor.py (adwait-thattey/stackoverflow_api_recommender, master)

"""
This module separates files into smaller units, each containing no more than 10k characters.
It then removes special characters and everything enclosed in brackets.

"""
import os

import shared
import constants
from unidecode import unidecode
from string import punctuation
from collections import OrderedDict

# Module-level path setting, empty by default. It is not referenced by the code
# visible in this file (presumably a documents directory -- confirm with callers).
DOC_DIR_PATH = ""


def remove_enclosed_content(line):
    """Return *line* with everything enclosed in ``[]``, ``()`` or ``{}`` removed.

    The brackets themselves are removed together with their content, and
    nesting is tracked independently for each bracket type. A closing bracket
    that has no matching opener is treated as an ordinary character: it is
    kept when it appears outside every bracket pair and dropped otherwise.

    Args:
        line: The text to clean.

    Returns:
        The text that lies outside every bracket pair.
    """
    opener_for = {']': '[', ')': '(', '}': '{'}
    depth = {'[': 0, '(': 0, '{': 0}
    kept = []

    for ch in line:
        if ch in depth:
            depth[ch] += 1
        elif ch in opener_for and depth[opener_for[ch]] > 0:
            # Only a matched closer may lower the depth. Letting an unmatched
            # '}' push the counter below zero would suppress the rest of the
            # line, because the "outside all brackets" check below would never
            # be true again.
            depth[opener_for[ch]] -= 1
        elif not any(depth.values()):
            kept.append(ch)

    return ''.join(kept)


def break_multiple_lines(line):
    """Split *line* into sentence-like fragments.

    The text is first split on newlines. Each resulting line has surrounding
    whitespace and then leading/trailing periods stripped, and is then split
    at every period that is followed by a space. Periods inside a token such
    as ``1.5`` or ``os.path`` do not split the line.

    Args:
        line: Text that may span several lines and sentences.

    Returns:
        A list of fragments in their original order. Every input line
        contributes at least one entry, which is an empty string for a
        blank line.
    """
    fragments = []
    for raw in line.split('\n'):
        # str.strip returns a new string, so the result must be kept.
        text = raw.strip().strip('.')
        start = 0
        last = len(text) - 1
        for ix, ch in enumerate(text):
            if ch != '.':
                continue
            if ix == last or text[ix + 1] == ' ':
                fragments.append(text[start:ix])
                # Skip the period and the space that follows it.
                start = ix + 2

        # Keep the whole remainder after the last split point; slicing with
        # -1 here would cut off the final character.
        fragments.append(text[start:])

    return fragments


def process_word(word: str):
    """Lower-case *word* and blank it out if it is a stop word.

    Returns the lower-cased word, or an empty string when the lower-cased
    word is contained in ``shared.STOP_WORDS``.
    """
    lowered = word.lower()
    return "" if lowered in shared.STOP_WORDS else lowered


def replace_special_meaning_symbols(line):
    """Expand common English contractions in *line* and turn commas into newlines.

    The substitutions are plain, case-sensitive substring replacements applied
    in a fixed order. As a side effect, ``constants.punctuations`` is filled
    with the characters of ``string.punctuation`` if it is currently empty.
    """
    # Fill the shared punctuation set on first use.
    if not constants.punctuations:
        constants.punctuations = set(punctuation)

    # Order matters: "can't" must be handled before the generic "n't" rule.
    substitutions = (
        (',', '\n'),
        ("'s", " is"),
        ("'m", " am"),
        ("can't", "can not"),
        ("n't", " not"),
        ("'ll", " will"),
    )

    for needle, replacement in substitutions:
        line = line.replace(needle, replacement)

    return line


def process_line(line):
    """Normalise a single line of question text.

    Steps, in order: transliterate with ``unidecode``; strip surrounding
    whitespace and periods; apply ``replace_special_meaning_symbols``; replace
    every character found in
    ``constants.question_replaceable_special_characters`` with a space; run
    each whitespace-separated token through ``process_word``; and join the
    non-empty results with single spaces.
    """
    trimmed = unidecode(line).strip().strip('.').strip()
    expanded = replace_special_meaning_symbols(trimmed)

    despecialised = "".join(
        ' ' if ch in constants.question_replaceable_special_characters else ch
        for ch in expanded
    )

    kept_words = []
    for token in despecialised.split():
        processed = process_word(token)
        if processed:
            kept_words.append(processed)

    # Collapse any doubled spaces left in the joined result.
    return " ".join(kept_words).replace("  ", " ")


def preprocess_question_content(content):
    """Preprocess a whole question body into one normalised string.

    The content is split into fragments with ``break_multiple_lines``, each
    fragment is cleaned with ``process_line``, and the non-empty results are
    joined with single spaces.
    """
    cleaned_fragments = []
    for fragment in break_multiple_lines(content):
        cleaned = process_line(fragment)
        if cleaned:
            cleaned_fragments.append(cleaned)

    return " ".join(cleaned_fragments)


if __name__ == "__main__":
    # Running this module as a script currently does nothing; the call below
    # is disabled and `preprocess_document` is not defined in the visible code.
    pass
    # preprocess_document("part_3", ".txt")