from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StructType, StructField, StringType
import os
import re
import string
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# This environment variable is only needed on macOS; it is not required on Dataproc clusters
os.environ['OBJC_DISABLE_INITIALIZE_FORK_SAFETY'] = 'YES'

# nltk.download('wordnet', download_dir=nltk_data_dir, quiet=True)
# nltk.download('stopwords', download_dir=nltk_data_dir, quiet=True)
# nltk.download('punkt', download_dir=nltk_data_dir, quiet=True)
# nltk.download('punkt_tab', download_dir=nltk_data_dir, quiet=True)
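# A possible definition of nltk_data_dir (an assumption, not in the original file), in
# case the downloads above need to be re-enabled on a machine without the corpora:
# nltk_data_dir = os.path.join(os.getcwd(), "nltk_data")
# nltk.data.path.append(nltk_data_dir)
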
stop_words = set(stopwords.words('english'))

# Define the UDF function outside the class
def clean_text(text):
    # Calls the static methods of the TextPreprocessor class to clean and process the text
    text = TextPreprocessor.remove_html_tags(text)
    text = TextPreprocessor.tokenize_text(text)
    text = TextPreprocessor.normalize_text(text)
    text = TextPreprocessor.remove_urls(text)
    text = TextPreprocessor.remove_stopwords(text)
    text = TextPreprocessor.stem_text(text)
    text = TextPreprocessor.lemmatize_text(text)
    return text

# Create the UDF once
clean_text_udf = udf(clean_text, StringType())

class TextPreprocessor:
    def __init__(self, text_df):
        self.text_df = text_df  # DataFrame containing the text to be processed

    # Remove HTML tags
    @staticmethod
    def remove_html_tags(text):
        filtered_html_text = ""
        try:
            # Convert HTML to plain text
            filtered_html_text = BeautifulSoup(text, "html.parser").get_text()
        except Exception:
            # Fall back to an empty string if parsing fails
            filtered_html_text = ""
        return filtered_html_text

    @staticmethod
    def normalize_text(text):
        # Remove leading/trailing whitespace
        text = text.strip()
        # Lowercase the text
        text = text.lower()
        # Remove punctuation
        text = text.translate(str.maketrans('', '', string.punctuation))
        # Replace non-ASCII characters with spaces
        text = re.sub(r'[^\x00-\x7f]', ' ', text)
        return text

    # Remove URLs
    @staticmethod
    def remove_urls(text):
        return re.sub(r'http\S+', '', text)

    # Remove stopwords
    @staticmethod
    def remove_stopwords(text):
        return ' '.join([word for word in text.split() if word not in stop_words])

    # Tokenize text
    @staticmethod
    def tokenize_text(text):
        return ' '.join(word_tokenize(text))

    # Stemming
    @staticmethod
    def stem_text(text):
        stemmer = PorterStemmer()
        return ' '.join([stemmer.stem(word) for word in text.split()])

    # Lemmatization
    @staticmethod
    def lemmatize_text(text):
        lemmatizer = WordNetLemmatizer()
        return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

    def preprocess_text(self):
        # Preprocess the title and body of the posts
        text_df_processed = self.text_df.withColumn("body", clean_text_udf(col('body')))
        text_df_processed = text_df_processed.withColumn("title", clean_text_udf(col('title')))
        return text_df_processed
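
# Example usage: a minimal sketch, not part of the original module. It assumes the
# required NLTK corpora (punkt, stopwords, wordnet) are already available, and it
# applies the class to a small in-memory DataFrame with 'title' and 'body' string
# columns, matching the column names used in preprocess_text() above.
if __name__ == "__main__":
    spark = SparkSession.builder.appName("TextPreprocessingExample").getOrCreate()
    posts_df = spark.createDataFrame(
        [("A <b>question</b> about Spark", "See http://example.com for more details!")],
        ["title", "body"],
    )
    preprocessor = TextPreprocessor(posts_df)
    processed_df = preprocessor.preprocess_text()
    processed_df.show(truncate=False)
    spark.stop()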