preprocessing_functions.py
import string

import pandas as pd
import nltk
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
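
# NOTE (an addition, not in the original file): these helpers rely on NLTK data
# packages that must be downloaded once before use, e.g.:
#   nltk.download('punkt'); nltk.download('stopwords')
#   nltk.download('wordnet'); nltk.download('averaged_perceptron_tagger')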
# Remove @username mentions (including a mention at the end of the text).
def remove_users(data, column):
    data[column] = data[column].str.replace(r'@\w+\s?', '', regex=True)
    return data
# Lowercase all tweet text.
def lowercase_text(data, column):
    data[column] = data[column].str.lower()
    return data
# Remove numbers.
def remove_numbers(data, column):
    data[column] = data[column].str.replace(r'\d+', '', regex=True)
    return data
# Collapse runs of whitespace into a single space.
def remove_whitespace(data, column):
    data[column] = data[column].str.replace(r'\s+', ' ', regex=True)
    return data
# Remove punctuation.
def remove_punctuation(data, column):
    data[column] = data[column].str.translate(str.maketrans('', '', string.punctuation))
    return data
# Remove English stopwords. Assumes the text is already lowercased; run
# remove_whitespace afterwards to collapse the gaps left behind.
def remove_stopwords(data, column):
    stopword_match = r"\b(" + r"|".join(stopwords.words('english')) + r")\b"
    data[column] = data[column].str.replace(stopword_match, '', regex=True)
    return data
# Stem words with the Snowball stemmer, writing the result to a new column.
def stem_words(data, column):
    stemmer = SnowballStemmer(language='english')
    data['stemmed_text'] = data[column].apply(
        lambda x: " ".join(stemmer.stem(word) for word in word_tokenize(x)))
    return data
# Lemmatize words. (Lemmatization needs part-of-speech tags to pick the right lemma.)
def lemmatize_words(data, column):
    lemmatizer = WordNetLemmatizer()
    # Map the first letter of an NLTK (Penn Treebank) POS tag to a WordNet POS tag.
    tag_map = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'R': wordnet.ADV,
        'N': wordnet.NOUN,
    }
    # Tag each token, then lemmatize it with the mapped POS (defaulting to noun).
    data['lemmatized_text'] = data[column].apply(lambda x: " ".join(
        lemmatizer.lemmatize(word, tag_map.get(tag[0], wordnet.NOUN))
        for word, tag in pos_tag(word_tokenize(x))
    ))
    return data
# Convert the date format.
def convert_date_format(df, column_name):
    # Parse the column as datetime.
    df[column_name] = pd.to_datetime(df[column_name])
    # Format the dates back to 'yyyy-mm-dd' strings.
    df[column_name] = df[column_name].dt.strftime('%Y-%m-%d')
    return df
# Build "_"-joined bigrams from a tokenized document.
def bigrams(doc):
    return ["_".join(bigram) for bigram in nltk.bigrams(doc)]
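
# Usage sketch (an illustration, not part of the original file): chains the
# cleaning helpers on a toy DataFrame with hypothetical 'text' and 'date'
# columns, then stems, lemmatizes, and builds bigrams. Assumes the NLTK data
# noted above is installed.
if __name__ == "__main__":
    df = pd.DataFrame({'text': ["@alice It's a GREAT day, 100% sunny!!"],
                       'date': ["03/15/2024"]})
    for step in (remove_users, lowercase_text, remove_numbers,
                 remove_punctuation, remove_stopwords, remove_whitespace):
        df = step(df, 'text')
    df = stem_words(df, 'text')
    df = lemmatize_words(df, 'text')
    df = convert_date_format(df, 'date')
    print(df[['text', 'stemmed_text', 'lemmatized_text', 'date']])
    print(bigrams(word_tokenize(df['text'].iloc[0])))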