from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StructType, StructField, StringType
import os
import re
import string
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# This environment variable is only needed on macOS; it is not required on Dataproc clusters
os.environ['OBJC_DISABLE_INITIALIZE_FORK_SAFETY'] = 'YES'

# nltk.download('wordnet', download_dir=nltk_data_dir, quiet=True)
# nltk.download('stopwords', download_dir=nltk_data_dir, quiet=True)
# nltk.download('punkt', download_dir=nltk_data_dir, quiet=True)
# nltk.download('punkt_tab', download_dir=nltk_data_dir, quiet=True)
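# A possible definition of nltk_data_dir (an assumption, not in the original file), in
# case the downloads above need to be re-enabled on a machine without the corpora:
# nltk_data_dir = os.path.join(os.getcwd(), "nltk_data")
# nltk.data.path.append(nltk_data_dir)
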
stop_words = set(stopwords.words('english'))

# Define the UDF function outside the class
def clean_text(text):
    # Calls the static methods of the TextPreprocessor class to clean and process the text
    text = TextPreprocessor.remove_html_tags(text)
    text = TextPreprocessor.tokenize_text(text)
    text = TextPreprocessor.normalize_text(text)
    text = TextPreprocessor.remove_urls(text)
    text = TextPreprocessor.remove_stopwords(text)
    text = TextPreprocessor.stem_text(text)
    text = TextPreprocessor.lemmatize_text(text)
    return text

# Create the UDF once
clean_text_udf = udf(clean_text, StringType())

class TextPreprocessor:
    def __init__(self, text_df):
        self.text_df = text_df  # DataFrame containing the text to be processed

    # Remove HTML tags
    @staticmethod
    def remove_html_tags(text):
        filtered_html_text = ""
        try:
            # Convert HTML to plain text
            filtered_html_text = BeautifulSoup(text, "html.parser").get_text()
        except Exception:
            # Fall back to an empty string if parsing fails
            filtered_html_text = ""
        return filtered_html_text

    @staticmethod
    def normalize_text(text):
        # Remove leading/trailing whitespace
        text = text.strip()
        # Lowercase the text
        text = text.lower()
        # Remove punctuation
        text = text.translate(str.maketrans('', '', string.punctuation))
        # Replace non-ASCII characters with spaces
        text = re.sub(r'[^\x00-\x7f]', ' ', text)
        return text

    # Remove URLs
    @staticmethod
    def remove_urls(text):
        return re.sub(r'http\S+', '', text)

    # Remove stopwords
    @staticmethod
    def remove_stopwords(text):
        return ' '.join([word for word in text.split() if word not in stop_words])

    # Tokenize text
    @staticmethod
    def tokenize_text(text):
        return ' '.join(word_tokenize(text))

    # Stemming
    @staticmethod
    def stem_text(text):
        stemmer = PorterStemmer()
        return ' '.join([stemmer.stem(word) for word in text.split()])

    # Lemmatization
    @staticmethod
    def lemmatize_text(text):
        lemmatizer = WordNetLemmatizer()
        return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

    def preprocess_text(self):
        # Preprocess the title and body of the posts
        text_df_processed = self.text_df.withColumn("body", clean_text_udf(col('body')))
        text_df_processed = text_df_processed.withColumn("title", clean_text_udf(col('title')))
        return text_df_processed
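
# Example usage: a minimal sketch, not part of the original module. It assumes the
# required NLTK corpora (punkt, stopwords, wordnet) are already available, and it
# applies the class to a small in-memory DataFrame with 'title' and 'body' string
# columns, matching the column names used in preprocess_text() above.
if __name__ == "__main__":
    spark = SparkSession.builder.appName("TextPreprocessingExample").getOrCreate()
    posts_df = spark.createDataFrame(
        [("A <b>question</b> about Spark", "See http://example.com for more details!")],
        ["title", "body"],
    )
    preprocessor = TextPreprocessor(posts_df)
    processed_df = preprocessor.preprocess_text()
    processed_df.show(truncate=False)
    spark.stop()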