#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 13 13:17:32 2020
@author: padmanabhan, arun
"""
#%% Configuration and imports
algo = 'svm'      # 'svm' -> sigmoid-kernel SVC; any other value -> Multinomial Naive Bayes
iscount = False   # True -> CountVectorizer features; False -> TF-IDF features
import pandas as pd
import numpy as np
import sklearn
import sklearn.metrics  # explicit submodule import; `import sklearn` alone does not expose sklearn.metrics
import time
import os
import string
import re
import pickle
#%%
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
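# NOTE (environment assumption, not part of the original script): the NLTK data used
# below -- 'punkt' for word_tokenize, 'averaged_perceptron_tagger' for pos_tag,
# 'stopwords' and 'wordnet' for the stopword list and lemmatizer -- must already be
# downloaded, e.g. via nltk.download('punkt') and so on.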
stop_words = set(stopwords.words('english')) #get stopwords
lemmatizer = WordNetLemmatizer()
def preprocess(message):
    lemmaWords = []
    for word, tag in pos_tag(word_tokenize(message)):
        word = word.lower()  # convert to lowercase first so capitalised stopwords are also removed
        if word in stop_words:  # remove stopwords
            continue
        tag_lower = tag.lower()[0]
        # Penn Treebank tags start with j/r/n/v for adjectives/adverbs/nouns/verbs;
        # the WordNet lemmatizer expects 'a' for adjectives, so translate 'j' -> 'a'.
        if tag_lower in ['j', 'r', 'n', 'v']:
            lemmaWords.append(lemmatizer.lemmatize(word, 'a' if tag_lower == 'j' else tag_lower))
        else:
            lemmaWords.append(word)
    message_pre = " ".join(lemmaWords)  # lemmatized sentence
    message_pre = message_pre.translate(str.maketrans('', '', string.punctuation + '\n\t\r'))  # remove punctuation and newline/tab characters
    message_pre = re.sub(' +', ' ', message_pre)  # collapse multiple spaces
    message_pre = re.sub('[0-9]+', '', message_pre)  # remove numbers
    return message_pre
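# Rough example (exact output depends on the NLTK tagger/lemmatizer versions):
# preprocess("The cars were running") should yield something like "car run".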
def readFolder(path):
    """Yield the text of each mail file in `path`.
    Files under a path containing 'enron' are read whole; for the other folders the
    full mail headers are skipped, keeping only the 'Subject: ' line and everything
    after the first blank line."""
    for fileName in os.listdir(path):
        isEnronDataset = (path.find('enron') != -1)
        filePath = os.path.join(path, fileName)
        if os.path.isfile(filePath):
            headerSeen, lines = False, []
            f = open(filePath, encoding="latin-1")
            for line in f:
                if isEnronDataset or headerSeen or line.startswith('Subject: '):
                    lines.append(line)
                elif line == '\n':  # first blank line marks the end of the headers
                    headerSeen = True
            f.close()
            content = '\n'.join(lines)
            yield content
numbers = re.compile(r'(\d+)')
file_pattern = re.compile(r'email[\d]+\.txt')

def sortByNumber(filename):
    if file_pattern.match(filename):
        filename_parts = numbers.split(filename)
        return int(filename_parts[1])
    else:
        return -1
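# sortByNumber gives a numeric sort key, so 'email2.txt' comes before 'email10.txt';
# files that do not match the email<N>.txt pattern sort first with key -1.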
def readTestFolder(path):
    for fileName in sorted(os.listdir(path), key=sortByNumber):
        filePath = os.path.join(path, fileName)
        if file_pattern.match(fileName) and os.path.isfile(filePath):
            lines = []
            f = open(filePath, encoding="latin-1")
            for line in f:
                lines.append(line)
            f.close()
            content = '\n'.join(lines)
            yield fileName, content
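# readTestFolder yields (file name, raw contents) for every email<N>.txt under `path`
# in numeric order; buildModel() below treats those contents as RTF and converts them
# with striprtf before classifying.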
def buildModel():
    #%% Load dataset
    folders = {'easy_ham': 0, 'easy_ham1': 0, 'easy_ham_2': 0, 'easy_ham_3': 0, 'hard_ham': 0, 'spam': 1,
               'spam_2': 1, 'spam_3': 1, 'spam_4': 1, 'spam_5': 1, 'enron1/ham': 0, 'enron1/spam': 1,
               'enron2/ham': 0, 'enron2/spam': 1, 'enron3/ham': 0, 'enron3/spam': 1, 'enron4/ham': 0, 'enron4/spam': 1
               }
    # read in data
    rows = []
    print('Loading data...')
    for folder, label in folders.items():
        for mail in readFolder(folder):
            rows.append({'mail': mail, 'label': label})
    data = pd.DataFrame(rows)
    data = data.dropna()
    #%% Train Test Split
    from sklearn.model_selection import train_test_split
    x_train, x_valid, y_train, y_valid = train_test_split(data['mail'], data['label'], test_size=0.2)
    #%% Preprocessing
    print('Preprocessing...')
    x_train_pre = [preprocess(message) for message in x_train]
    x_valid_pre = [preprocess(message) for message in x_valid]
    #%% Feature extraction
    print('Feature extraction...')
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.feature_extraction.text import CountVectorizer
    vectorizer = None
    if iscount:
        print('Using count vectorizer')
        vectorizer = CountVectorizer(ngram_range=(1, 2))
    else:
        print('Using TFIDF vectorizer')
        vectorizer = TfidfVectorizer(ngram_range=(1, 2))
    vectorizer.fit(x_train_pre)
    pickle.dump(vectorizer, open('vectorizer.sav', 'wb'))
    x_train_feature = vectorizer.transform(x_train_pre)
    x_valid_feature = vectorizer.transform(x_valid_pre)
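    # The fitted vectorizer is pickled next to the model, presumably so the same
    # vocabulary (and idf weights, in the TF-IDF case) can be reused when classifying
    # new mail outside this script.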
    #%% Train model
    print('Training model')
    # NOTE: plot_confusion_matrix was removed in scikit-learn 1.2; on newer versions
    # use sklearn.metrics.ConfusionMatrixDisplay.from_estimator instead.
    model = None
    if algo == 'svm':
        print('Using SVM algorithm')
        from sklearn import svm
        model = svm.SVC(kernel='sigmoid', class_weight='balanced')
        model.fit(x_train_feature, y_train)
        pickle.dump(model, open('svmmodel.sav', 'wb'))
        print('Validating model..')
        y_pred = model.predict(x_valid_feature)
        sklearn.metrics.plot_confusion_matrix(estimator=model, X=x_valid_feature, y_true=y_valid)
        report = sklearn.metrics.classification_report(y_true=y_valid, y_pred=y_pred)
        print(report)
    else:
        print('Using Naive Bayes algorithm')
        from sklearn import naive_bayes
        nv = naive_bayes.MultinomialNB()
        nv.fit(x_train_feature, y_train)
        pickle.dump(nv, open('nbmodel.sav', 'wb'))
        y_pred_1 = nv.predict(x_valid_feature)
        sklearn.metrics.plot_confusion_matrix(estimator=nv, X=x_valid_feature, y_true=y_valid)
        report_1 = sklearn.metrics.classification_report(y_true=y_valid, y_pred=y_pred_1)
        print(report_1)
        model = nv
    #%% Test with samples
    from striprtf.striprtf import rtf_to_text  # third-party package: striprtf (the sample mails under 'test' are treated as RTF)
    print('Testing with samples..')
    testdata = []
    testindex = []
    for name, testMail in readTestFolder('test'):
        testdata.append({'mail': rtf_to_text(testMail)})
        testindex.append(name)
    testdata = pd.DataFrame(testdata, index=testindex)
    test_pre = [preprocess(message) for message in testdata['mail']]
    test_feature = vectorizer.transform(test_pre)
    test_predictions = model.predict(test_feature)
    predictions = pd.DataFrame(test_predictions, index=testindex)
    print(predictions)
    #%% Test with external dataset
    from sklearn.metrics import confusion_matrix
    print('Testing with external dataset')
    testfolders = {'enron5/ham': 0, 'enron5/spam': 1, 'enron6/ham': 0, 'enron6/spam': 1}
    # read in data
    rows = []
    print('Loading test data...')
    for folder, label in testfolders.items():
        for mail in readFolder(folder):
            rows.append({'mail': mail, 'label': label})
    testdata = pd.DataFrame(rows)
    testdata = pd.concat([testdata, pd.read_csv('spam_or_not_spam.csv')])
    testdata = testdata.dropna()
    print('Preprocessing test data...')
    test_pre = [preprocess(message) for message in testdata['mail']]
    print('Extracting features...')
    test_feature = vectorizer.transform(test_pre)
    print('Predicting...')
    test_predictions = model.predict(test_feature)
    predictions = pd.DataFrame(test_predictions)
    sklearn.metrics.plot_confusion_matrix(estimator=model, X=test_feature, y_true=testdata['label'])
    report = sklearn.metrics.classification_report(y_true=testdata['label'], y_pred=test_predictions)
    print(report)
    print(confusion_matrix(y_true=list(testdata['label']), y_pred=test_predictions))
#%%
if __name__ == '__main__':
    st = time.time()
    buildModel()
    et = time.time()
    tt = (et - st) / 60
    print('Time taken: %.2f mins' % tt)
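# A minimal sketch of how the pickled artefacts written above could be reused to
# classify a new message (an assumption about intended use, not something this script
# does; it requires vectorizer.sav and svmmodel.sav in the working directory, and
# raw_mail_text is a hypothetical string variable):
#
#   vectorizer = pickle.load(open('vectorizer.sav', 'rb'))
#   model = pickle.load(open('svmmodel.sav', 'rb'))  # or 'nbmodel.sav' for Naive Bayes
#   label = model.predict(vectorizer.transform([preprocess(raw_mail_text)]))[0]  # 1 = spam, 0 = ham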