#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 13 13:17:32 2020
@author: padmanabhan, arun
"""
#%% Configuration and imports
algo = 'svm'      # 'svm' -> sigmoid-kernel SVC; any other value -> Multinomial Naive Bayes
iscount = False   # True -> CountVectorizer features; False -> TF-IDF features
import pandas as pd
import numpy as np
import sklearn
import sklearn.metrics  # explicit submodule import; `import sklearn` alone does not expose sklearn.metrics
import time
import os
import string
import re
import pickle
#%%
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
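# NOTE (environment assumption, not part of the original script): the NLTK data used
# below -- 'punkt' for word_tokenize, 'averaged_perceptron_tagger' for pos_tag,
# 'stopwords' and 'wordnet' for the stopword list and lemmatizer -- must already be
# downloaded, e.g. via nltk.download('punkt') and so on.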
stop_words = set(stopwords.words('english')) #get stopwords
lemmatizer = WordNetLemmatizer()
def preprocess(message):
    lemmaWords = []
    for word, tag in pos_tag(word_tokenize(message)):
        word = word.lower()  # convert to lowercase first so capitalised stopwords are also removed
        if word in stop_words:  # remove stopwords
            continue
        tag_lower = tag.lower()[0]
        # Penn Treebank tags start with j/r/n/v for adjectives/adverbs/nouns/verbs;
        # the WordNet lemmatizer expects 'a' for adjectives, so translate 'j' -> 'a'.
        if tag_lower in ['j', 'r', 'n', 'v']:
            lemmaWords.append(lemmatizer.lemmatize(word, 'a' if tag_lower == 'j' else tag_lower))
        else:
            lemmaWords.append(word)
    message_pre = " ".join(lemmaWords)  # lemmatized sentence
    message_pre = message_pre.translate(str.maketrans('', '', string.punctuation + '\n\t\r'))  # remove punctuation and newline/tab characters
    message_pre = re.sub(' +', ' ', message_pre)  # collapse multiple spaces
    message_pre = re.sub('[0-9]+', '', message_pre)  # remove numbers
    return message_pre
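# Rough example (exact output depends on the NLTK tagger/lemmatizer versions):
# preprocess("The cars were running") should yield something like "car run".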
def readFolder(path):
    """Yield the text of each mail file in `path`.
    Files under a path containing 'enron' are read whole; for the other folders the
    full mail headers are skipped, keeping only the 'Subject: ' line and everything
    after the first blank line."""
    for fileName in os.listdir(path):
        isEnronDataset = (path.find('enron') != -1)
        filePath = os.path.join(path, fileName)
        if os.path.isfile(filePath):
            headerSeen, lines = False, []
            f = open(filePath, encoding="latin-1")
            for line in f:
                if isEnronDataset or headerSeen or line.startswith('Subject: '):
                    lines.append(line)
                elif line == '\n':  # first blank line marks the end of the headers
                    headerSeen = True
            f.close()
            content = '\n'.join(lines)
            yield content
numbers = re.compile(r'(\d+)')
file_pattern = re.compile(r'email[\d]+\.txt')

def sortByNumber(filename):
    if file_pattern.match(filename):
        filename_parts = numbers.split(filename)
        return int(filename_parts[1])
    else:
        return -1
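# sortByNumber gives a numeric sort key, so 'email2.txt' comes before 'email10.txt';
# files that do not match the email<N>.txt pattern sort first with key -1.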
def readTestFolder(path):
    for fileName in sorted(os.listdir(path), key=sortByNumber):
        filePath = os.path.join(path, fileName)
        if file_pattern.match(fileName) and os.path.isfile(filePath):
            lines = []
            f = open(filePath, encoding="latin-1")
            for line in f:
                lines.append(line)
            f.close()
            content = '\n'.join(lines)
            yield fileName, content
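# readTestFolder yields (file name, raw contents) for every email<N>.txt under `path`
# in numeric order; buildModel() below treats those contents as RTF and converts them
# with striprtf before classifying.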
def buildModel():
    #%% Load dataset
    folders = {'easy_ham': 0, 'easy_ham1': 0, 'easy_ham_2': 0, 'easy_ham_3': 0, 'hard_ham': 0, 'spam': 1,
               'spam_2': 1, 'spam_3': 1, 'spam_4': 1, 'spam_5': 1, 'enron1/ham': 0, 'enron1/spam': 1,
               'enron2/ham': 0, 'enron2/spam': 1, 'enron3/ham': 0, 'enron3/spam': 1, 'enron4/ham': 0, 'enron4/spam': 1
               }
    # read in data
    rows = []
    print('Loading data...')
    for folder, label in folders.items():
        for mail in readFolder(folder):
            rows.append({'mail': mail, 'label': label})
    data = pd.DataFrame(rows)
    data = data.dropna()
    #%% Train Test Split
    from sklearn.model_selection import train_test_split
    x_train, x_valid, y_train, y_valid = train_test_split(data['mail'], data['label'], test_size=0.2)
    #%% Preprocessing
    print('Preprocessing...')
    x_train_pre = [preprocess(message) for message in x_train]
    x_valid_pre = [preprocess(message) for message in x_valid]
    #%% Feature extraction
    print('Feature extraction...')
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.feature_extraction.text import CountVectorizer
    vectorizer = None
    if iscount:
        print('Using count vectorizer')
        vectorizer = CountVectorizer(ngram_range=(1, 2))
    else:
        print('Using TFIDF vectorizer')
        vectorizer = TfidfVectorizer(ngram_range=(1, 2))
    vectorizer.fit(x_train_pre)
    pickle.dump(vectorizer, open('vectorizer.sav', 'wb'))
    x_train_feature = vectorizer.transform(x_train_pre)
    x_valid_feature = vectorizer.transform(x_valid_pre)
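    # The fitted vectorizer is pickled next to the model, presumably so the same
    # vocabulary (and idf weights, in the TF-IDF case) can be reused when classifying
    # new mail outside this script.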
    #%% Train model
    print('Training model')
    # NOTE: plot_confusion_matrix was removed in scikit-learn 1.2; on newer versions
    # use sklearn.metrics.ConfusionMatrixDisplay.from_estimator instead.
    model = None
    if algo == 'svm':
        print('Using SVM algorithm')
        from sklearn import svm
        model = svm.SVC(kernel='sigmoid', class_weight='balanced')
        model.fit(x_train_feature, y_train)
        pickle.dump(model, open('svmmodel.sav', 'wb'))
        print('Validating model..')
        y_pred = model.predict(x_valid_feature)
        sklearn.metrics.plot_confusion_matrix(estimator=model, X=x_valid_feature, y_true=y_valid)
        report = sklearn.metrics.classification_report(y_true=y_valid, y_pred=y_pred)
        print(report)
    else:
        print('Using Naive Bayes algorithm')
        from sklearn import naive_bayes
        nv = naive_bayes.MultinomialNB()
        nv.fit(x_train_feature, y_train)
        pickle.dump(nv, open('nbmodel.sav', 'wb'))
        y_pred_1 = nv.predict(x_valid_feature)
        sklearn.metrics.plot_confusion_matrix(estimator=nv, X=x_valid_feature, y_true=y_valid)
        report_1 = sklearn.metrics.classification_report(y_true=y_valid, y_pred=y_pred_1)
        print(report_1)
        model = nv
    #%% Test with samples
    from striprtf.striprtf import rtf_to_text  # third-party package: striprtf (the sample mails under 'test' are treated as RTF)
    print('Testing with samples..')
    testdata = []
    testindex = []
    for name, testMail in readTestFolder('test'):
        testdata.append({'mail': rtf_to_text(testMail)})
        testindex.append(name)
    testdata = pd.DataFrame(testdata, index=testindex)
    test_pre = [preprocess(message) for message in testdata['mail']]
    test_feature = vectorizer.transform(test_pre)
    test_predictions = model.predict(test_feature)
    predictions = pd.DataFrame(test_predictions, index=testindex)
    print(predictions)
    #%% Test with external dataset
    from sklearn.metrics import confusion_matrix
    print('Testing with external dataset')
    testfolders = {'enron5/ham': 0, 'enron5/spam': 1, 'enron6/ham': 0, 'enron6/spam': 1}
    # read in data
    rows = []
    print('Loading test data...')
    for folder, label in testfolders.items():
        for mail in readFolder(folder):
            rows.append({'mail': mail, 'label': label})
    testdata = pd.DataFrame(rows)
    testdata = pd.concat([testdata, pd.read_csv('spam_or_not_spam.csv')])
    testdata = testdata.dropna()
    print('Preprocessing test data...')
    test_pre = [preprocess(message) for message in testdata['mail']]
    print('Extracting features...')
    test_feature = vectorizer.transform(test_pre)
    print('Predicting...')
    test_predictions = model.predict(test_feature)
    predictions = pd.DataFrame(test_predictions)
    sklearn.metrics.plot_confusion_matrix(estimator=model, X=test_feature, y_true=testdata['label'])
    report = sklearn.metrics.classification_report(y_true=testdata['label'], y_pred=test_predictions)
    print(report)
    print(confusion_matrix(y_true=list(testdata['label']), y_pred=test_predictions))
#%%
if __name__ == '__main__':
    st = time.time()
    buildModel()
    et = time.time()
    tt = (et - st) / 60
    print('Time taken: %.2f mins' % tt)
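# A minimal sketch of how the pickled artefacts written above could be reused to
# classify a new message (an assumption about intended use, not something this script
# does; it requires vectorizer.sav and svmmodel.sav in the working directory, and
# raw_mail_text is a hypothetical string variable):
#
#   vectorizer = pickle.load(open('vectorizer.sav', 'rb'))
#   model = pickle.load(open('svmmodel.sav', 'rb'))  # or 'nbmodel.sav' for Naive Bayes
#   label = model.predict(vectorizer.transform([preprocess(raw_mail_text)]))[0]  # 1 = spam, 0 = ham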