-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMulticlass_Classification_&_Regression.py
More file actions
672 lines (501 loc) · 24 KB
/
Multiclass_Classification_&_Regression.py
File metadata and controls
672 lines (501 loc) · 24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
# -*- coding: utf-8 -*-
"""Group_29_Final_Submission.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1UfVACju7-YR431RAEmd8Rnp3j3_q9zmv
#MULTI-CLASS CLASSIFICATION AND REGRESSION
"""
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import datasets # to retrieve the iris Dataset
import pandas as pd # to load the dataframe
from sklearn.preprocessing import StandardScaler # to standardize the features
from sklearn.decomposition import PCA # to apply PCA
from sklearn.preprocessing import LabelEncoder
# --- Data loading & cleaning (Colab-only: reads from a mounted Google Drive) ---
from google.colab import drive
drive.mount('/content/drive')
df = pd.read_excel('/content/drive/MyDrive/data_mining/CO2_Emission.xlsx')
df
df = df.drop(0)  # First row contains NaN values, therefore deleting it
df
df.info()
df.isnull().sum()
# There are missing values present in attributes, 'CO2 Rating' and 'Smog Rating.'
# CO2 rating has 18904 missing values.
# Smog rating has 20014 missing values.
df.describe()  # Getting summary of the data
sns.boxplot(data = df, x = 'Smog Rating')  # There are no outliers so we can use mean as imputing method
# Imputing NaN values with the mean of each column.
# BUG FIX: `df[col].fillna(..., inplace=True)` mutates through an intermediate
# column object; it emits a FutureWarning and can silently leave `df` unchanged
# under pandas >= 2 copy-on-write. Assign the filled Series back instead.
df['Smog Rating'] = df['Smog Rating'].fillna(df['Smog Rating'].mean())
df['CO2 Rating'] = df['CO2 Rating'].fillna(df['CO2 Rating'].mean())
df.head()
df.isnull().sum()  # CO2 Rating is what we are going to predict in the classification task
# --- Exploratory visualisation of the cleaned data ---
sns.pairplot(data = df, hue = 'CO2 Rating') # Plotting a pairplot to explore the relationships between pairs of variables in a dataset
df.hist(bins=50, figsize=(20,15))
plt.show()
# The fuel consumed per 100 km is more on city roads than on highway roads.
# The majority of cars either have 4, 6 or 8 cylinders.
# Engine sizes a bit less than 2 litres are the most common.
# CO2 emissions broken down by vehicle class.
plt.figure(figsize = (15,6))
sns.boxplot(data = df, y = 'CO2 EMISSIONS (g/km)', x = 'VEHICLE CLASS')
plt.xticks(rotation = 90)
"""By observing the box plot visualization between 'vehicle class' and 'CO2 emissions' it can be
observed that Van-passenger has the highest mean for CO2 emissions whereas station
wagonsmall has the lowest CO2 emissions
"""
# CO2 emissions broken down by cylinder count.
plt.figure(figsize = (15,6))
sns.boxplot(data = df, y = 'CO2 EMISSIONS (g/km)', x = 'CYLINDERS')
"""By observing the box plot visualization between Cylinder engines and CO2 emissions it can be observed that Cylinder engine with rating 16 has the highest mean for CO2 emissions whereas cylinder engine with rating 3 has the lowest mean for CO2 emissions."""
# CO2 emissions broken down by fuel type.
plt.figure(figsize = (15,6))
sns.boxplot(data = df, y = 'CO2 EMISSIONS (g/km)', x = 'FUEL TYPE')
"""Again, with the below visualization of box plot between Fuel type and Co2 emissions it can be noted that Natural gas has the highest mean and thus contributes the most in the case of CO2 emissions."""
df.head()
# Boxplot every numeric column on a 2x3 grid of axes.
# CLEANUP: removed a block of commented-out per-column boxplots, a redundant
# duplicate `import matplotlib.pyplot as plt` (already imported at the top of
# the file), and a fragile index-based loop that silenced IndexError. Pairing
# axes with columns via zip() stops at the shorter sequence, which reproduces
# the original behaviour exactly (extra axes stay blank, extra columns beyond
# the six slots are skipped).
num_cols = df.select_dtypes(include='number').columns.tolist()
fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(16, 8))
for ax, col_name in zip(axs.flat, num_cols):
    ax.boxplot(df[col_name])
    ax.set_title(col_name)
plt.tight_layout()
plt.show()
"""From the above plotted subplots we can observe that there are some outliers in the data but as these outliers represent natural variations in the population, therefore they should be left as it is in the dataset. Such outliers can be referred to as are called true outliers."""
# Correlation of every numeric attribute with CO2 emissions.
# BUG FIX: at this point df still holds object-dtype columns (MAKE, MODEL,
# VEHICLE CLASS, TRANSMISSION, FUEL TYPE — they are label-encoded only later),
# so a bare df.corr() raises on pandas >= 2.0. numeric_only=True reproduces
# the old "silently drop non-numeric columns" behaviour on every version.
# (A commented-out IQR outlier-removal experiment was removed as dead code.)
corr_matrix = df.corr(numeric_only=True)
corr_matrix["CO2 EMISSIONS (g/km)"].sort_values(ascending=False)
# We have done a correlation of all attributes with CO2 emission attribute.
# The 4 most corelated attributes are COMB(L/100), Fuel Consumption city, Fuel COnsumption HWY and COMB (mpg)
# We aren't including CO2 rating as we need to predict its values later on.
plt.figure(figsize=(15,10))
sns.heatmap(corr_matrix, annot=True)
"""After plotting the heat map further correlations can be drawn between the various attributes and the response variables and dependency of one attribute on the other column as from the plotted heat map, we can see that the CO2 emissions by the attributes fuel consumption city, fuel consumption highway and combustion are almost entirely correlated with each other therefore moving forward we can apply dimension reduction by removing the interdependent attributes.
From the plotted heat map, it can be summarized that the concept of multicollinearity exists between Fuel consumption in city and fuel consumption in highway whereas a duplicate column of combustion exists in the data therefore the following columns can be reduced as a part of dimension reduction, Furthermore the column of smog rating can also be dropped as it has lean correlation value of just 0.41 and when initially cleaned the data smog rating constituted of almost 77% of the null values.
"""
# Split column names by dtype; used later for label encoding.
Ccols = df.select_dtypes(exclude='number').columns
Ncols = df.select_dtypes(include='number').columns
catcols = Ccols.to_list()
numcols = Ncols.to_list()
catcols
numcols
import pandas as pd
import scipy.stats as stats

# Numeric response columns whose group means we compare.
numerical_cols = ['CO2 EMISSIONS (g/km)', 'CO2 Rating']
# Candidate categorical predictors.
categorical_cols = ['MAKE', 'MODEL(# = high output engine)', 'VEHICLE CLASS', 'TRANSMISSION', 'FUEL TYPE']

# One-way ANOVA: for every (categorical, numerical) pair, test whether the
# numerical column's mean differs across the category's levels.
for category in categorical_cols:
    by_level = df.groupby(category)
    for measure in numerical_cols:
        samples = [vals.tolist() for _, vals in by_level[measure]]
        f_value, p_value = stats.f_oneway(*samples)
        print(f"{category} vs {measure}: F-value = {f_value}, p-value = {p_value}")
"""As for understanding the relation between the categorical variables and the numerical response variable we employed the ANOVA test on the model.
ANOVA: This measures the dependency of a target column with continuous values on another column containing categorical values.
On doing the hypothesis testing:
Ho: Categorical variables and CO2 emission rating are NOT correlated H1: Categorical variables and CO2 emission rating are correlated.
On doing the hypothesis testing we found out that the p value is less than α(alpha) and therefore we reject the null as by considering the principle “if p value is low null has to go “ Therefore, we cannot drop the categorical columns from the dataset.
"""
df['MODEL(# = high output engine)'].dtype
# The model column mixes numeric and string entries; normalise to str so the
# label encoder sees a single dtype.
df['MODEL(# = high output engine)'] = df['MODEL(# = high output engine)'].astype(str)
catcols
# Label-encode every categorical column (fit_transform is re-run per column).
# CLEANUP: removed an unused `le = LabelEncoder()` that was never referenced.
df[catcols] = df[catcols].apply(LabelEncoder().fit_transform)
df.head()
"""For a machine learning algorithm, the data needs to be fed in numerical values for further prediction Therefore the categorical variables in the dataset needed to be converted to their numerical form, hence we applied label encoding to transform the categorical variables into their counterpart numerical variables in order to draw further calculational results"""
# Dimension reduction: drop redundant / weakly-correlated columns.
df.drop(['MODEL YEAR', 'FUEL CONSUMPTION CITY (L/100)', 'FUEL CONSUMPTION HWY (L/100)', 'COMB (mpg)', 'Smog Rating'], axis=1, inplace=True)
"""Columns To Drop
Model Year - Low Correaltion Value of -0.205244
FUEL CONSUMPTION CITY (L/100) - multicollinearity
FUEL CONSUMPTION HWY (L/100) - multicollinearity
COMB (mpg) - duplicate column
Smog rating - as low correlation valie of 0.41, where 77% were initially null values.
"""
df
"""#REGRESSION"""
# Predicting CO2 Emission is a regression task.
# Work on a copy so the classification section can reuse the encoded df.
dfreg = df.copy()
dfreg.drop(['CO2 Rating'], axis=1, inplace=True) #Predicting Co2 rating is the classification task that we will be doing later on,
# also Co2 rating has as low correlation value of -0.47, where 72.5% were initially null values so we can drop it for regression analysis.
dfreg.head()
# CO2 Emissions is our target variable.
y = dfreg['CO2 EMISSIONS (g/km)']
X = dfreg.drop(['CO2 EMISSIONS (g/km)'], axis = 1)
print(X.shape)
print(y.shape)
X.head()
# Splitting the data into 80% train and 20% test set (fixed seed for reproducibility).
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
# Standardise features to zero mean / unit variance; the scaler is fitted on
# the training split only, then applied to the test split, to avoid leaking
# test-set statistics into training.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
"""#Multiple Linear Regression"""
# Fit an ordinary least-squares model on the scaled training split.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

regressor_mlr = LinearRegression()
regressor_mlr.fit(X_train, y_train)
y_pred = regressor_mlr.predict(X_test)

# Held-out test-split error metrics (RMSE derived from MSE).
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("R-squared: {:.2f}".format(r2))
print("RMSE: {:.2f}".format(rmse))
print("MSE: {:.2f}".format(mse))
print("MAE: {:.2f}".format(mae))
"""#Decision Tree Regressor"""
# Fit a single decision-tree regressor; random_state pins split tie-breaking
# so the run is reproducible.
from sklearn.tree import DecisionTreeRegressor

regressor_dtr = DecisionTreeRegressor(random_state = 0)
regressor_dtr.fit(X_train, y_train)
y_predDT = regressor_dtr.predict(X_test)

# Held-out test-split error metrics (RMSE derived from MSE).
mse = mean_squared_error(y_test, y_predDT)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_predDT)
r2 = r2_score(y_test, y_predDT)
print("R-squared: {:.2f}".format(r2))
print("RMSE: {:.2f}".format(rmse))
print("MSE: {:.2f}".format(mse))
print("MAE: {:.2f}".format(mae))
"""#Support Vector Regressor"""
#Defining Support Vector Regressor Model (RBF kernel)
from sklearn.svm import SVR
regressor_svr = SVR(kernel = 'rbf')
regressor_svr.fit(X_train, y_train)
y_predSVR = regressor_svr.predict(X_test)
# compute R-squared
r2 = r2_score(y_test, y_predSVR)
# BUG FIX: RMSE/MSE/MAE below were computed from the stale `y_pred` array left
# over from the linear-regression section, so the reported SVR errors were
# actually the linear model's errors. Score the SVR predictions instead.
# compute root mean squared error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test, y_predSVR))
# compute mean squared error (MSE)
mse = mean_squared_error(y_test, y_predSVR)
# compute mean absolute error (MAE)
mae = mean_absolute_error(y_test, y_predSVR)
print("R-squared: {:.2f}".format(r2))
print("RMSE: {:.2f}".format(rmse))
print("MSE: {:.2f}".format(mse))
print("MAE: {:.2f}".format(mae))
"""#Random Forest Regressor"""
# Fit a 10-tree random-forest regressor (random_state pinned for reproducibility).
from sklearn.ensemble import RandomForestRegressor

regressor_rfr = RandomForestRegressor(n_estimators = 10, random_state = 0)
regressor_rfr.fit(X_train, y_train)
y_pred = regressor_rfr.predict(X_test)

# Held-out test-split error metrics (RMSE derived from MSE).
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("R-squared: {:.2f}".format(r2))
print("RMSE: {:.2f}".format(rmse))
print("MSE: {:.2f}".format(mse))
print("MAE: {:.2f}".format(mae))
# Fine-tuning the hyperparameters using GridSearchCV for better prediction and to avoid overfitting.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
# Define the parameter grid to search over (3*3*3*3 = 81 candidate combinations)
param_grid = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
# Create a RandomForestRegressor object
rf = RandomForestRegressor(random_state=0)
# Create a GridSearchCV object (5-fold CV, candidates ranked by R^2)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='r2')
# Fit the GridSearchCV object to the training data
grid_search.fit(X_train, y_train)
# Print the best parameters and R2 score
# NOTE: best_score_ is the cross-validation score, not a test-set score.
print("Best parameters:", grid_search.best_params_)
print("R2 score:", grid_search.best_score_)
# Use the best model to predict on the test data
y_pred = grid_search.predict(X_test)
"""#CLASSIFICATION"""
# Our classification task is to predict CO2 Rating.
dfclf = df.copy()
dfclf.drop(['CO2 EMISSIONS (g/km)'], axis=1, inplace=True)
dfclf.head()
# NOTE(review): CO2 Rating was mean-imputed earlier, so this notna() filter is
# now a no-op; it only takes effect if the imputation step is skipped.
dfclf = dfclf[dfclf['CO2 Rating'].notna()] # CO2 Rating contained NULL values; keep only the non-null rows for prediction
dfclf.shape
dfclf
# CO2 Rating is our target variable.
y = dfclf['CO2 Rating']
X = dfclf.drop(['CO2 Rating'], axis = 1)
# Cast ratings to int and shift them down by one so class labels are
# zero-based (XGBoost's multi:softmax later requires labels 0..n_classes-1).
# NOTE(review): astype(int) truncates the fractional mean-imputed ratings —
# confirm this is the intended treatment of imputed rows.
y = y.astype(int)
y -= 1
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
# Re-fit the scaler on the classification training split only.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
from sklearn.metrics import roc_auc_score
"""#Support Vector Classifier"""
# Defining Support Vector Classifier model.
from sklearn.svm import SVC
# probability=True enables predict_proba, needed for the ROC curves below.
classifier_svm_linear = SVC(kernel = 'linear', random_state = 0, probability=True)
classifier_svm_linear.fit(X_train, y_train)
from sklearn.model_selection import cross_val_score
# 10-fold cross-validated accuracy on the training split.
accuracies = cross_val_score(estimator = classifier_svm_linear, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))
prediction = classifier_svm_linear.predict(X_test)
from sklearn.metrics import precision_score, \
    recall_score, confusion_matrix, classification_report, \
    accuracy_score, f1_score
# NOTE(review): this prints the cross-validation accuracy again, not the
# test-set accuracy of `prediction`.
print('Accuracy: {:.2f} %'.format(accuracies.mean()*100))
print('\n SVM clasification report:\n \n', classification_report(y_test,prediction))
print('\n SVM confussion matrix:\n',confusion_matrix(y_test, prediction))
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
from sklearn.preprocessing import label_binarize
# Make predictions on the test set and obtain the predicted probabilities
y_pred_proba = classifier_svm_linear.predict_proba(X_test)
# Compute the ROC curve and ROC area for each class using the OvA strategy
fpr = dict()
tpr = dict()
roc_auc = dict()
# One-vs-all ROC per class (labels were shifted to 0..9 earlier).
for i in range(10):
    y_true = (y_test == i)
    fpr[i], tpr[i], _ = roc_curve(y_true, y_pred_proba[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
# Compute micro-average ROC curve and ROC area
y_true = label_binarize(y_test, classes=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
n_classes = y_true.shape[1]
# Plot ROC curve for each class
plt.figure()
lw = 2
colors = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'purple', 'pink', 'orange']
for i, color in zip(range(10), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=lw, label='ROC curve (area = %0.2f) for class %d' % (roc_auc[i], i))
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve for Support Vector Classifier')
plt.legend(loc="lower right")
plt.show()
# Plotting the micro-average curve for the 10-class ROC.
fpr["micro"], tpr["micro"], _ = roc_curve(y_true.ravel(), y_pred_proba.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
plt.plot(fpr["micro"], tpr["micro"], color='deeppink', lw=lw, linestyle='--', label='Micro-average ROC curve (area = %0.2f)' % roc_auc["micro"])
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve for Support Vector Classifier')
plt.legend(loc="lower right")
plt.show()
"""#Multinomial Naive Bayes"""
# Defining Multinomial Naive Bayes classifier model. MultinomialNB requires
# non-negative features, so the pipeline min-max scales the (standardised,
# hence partly negative) inputs into [0, 1] first.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import MultinomialNB
classifier_multinomial_bayes = Pipeline([('Normalizing',MinMaxScaler()),('MultinomialNB',MultinomialNB())])
classifier_multinomial_bayes.fit(X_train, y_train)
prediction = classifier_multinomial_bayes.predict(X_test)
from sklearn.metrics import precision_score, \
    recall_score, confusion_matrix, classification_report, \
    accuracy_score, f1_score
# BUG FIX: the original printed the `accuracies` array left over from the SVM
# section, i.e. it reported the SVM cross-validation accuracy as the Naive
# Bayes accuracy. Re-run cross-validation on this pipeline before reporting.
# (cross_val_score is already imported in the SVM section above.)
accuracies = cross_val_score(estimator = classifier_multinomial_bayes, X = X_train, y = y_train, cv = 10)
print('Accuracy: {:.2f} %'.format(accuracies.mean()*100))
print('\n NAIVE BAYES clasification report:\n \n', classification_report(y_test,prediction))
# Also fixed the misleading "NAICE BAYES" label in the output below.
print('\n NAIVE BAYES confussion matrix:\n',confusion_matrix(y_test, prediction))
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
from sklearn.preprocessing import label_binarize
# Make predictions on the test set and obtain the predicted probabilities
y_pred_proba = classifier_multinomial_bayes.predict_proba(X_test)
# Compute the ROC curve and ROC area for each class using the OvA strategy
fpr = dict()
tpr = dict()
roc_auc = dict()
# One-vs-all ROC per class (labels were shifted to 0..9 earlier).
for i in range(10):
    y_true = (y_test == i)
    fpr[i], tpr[i], _ = roc_curve(y_true, y_pred_proba[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
# Compute micro-average ROC curve and ROC area
y_true = label_binarize(y_test, classes=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
n_classes = y_true.shape[1]
# Plot ROC curve for each class
plt.figure()
lw = 2
colors = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'purple', 'pink', 'orange']
for i, color in zip(range(10), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=lw, label='ROC curve (area = %0.2f) for class %d' % (roc_auc[i], i))
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve for Naive Bayes Classifier (10 classes)')
plt.legend(loc="lower right")
plt.show()
# Plotting the micro-average curve for the 10-class ROC.
fpr["micro"], tpr["micro"], _ = roc_curve(y_true.ravel(), y_pred_proba.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
plt.plot(fpr["micro"], tpr["micro"], color='deeppink', lw=lw, linestyle='--', label='Micro-average ROC curve (area = %0.2f)' % roc_auc["micro"])
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve for Naive Bayes Classifier (10 classes)')
plt.legend(loc="lower right")
plt.show()
"""#Random Forest Classifier"""
# Fit a 10-tree, entropy-criterion random forest on the scaled training data.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score, \
    recall_score, confusion_matrix, classification_report, \
    accuracy_score, f1_score

classifier_rfr = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier_rfr.fit(X_train, y_train)

# 10-fold cross-validated accuracy on the training split.
accuracies = cross_val_score(estimator = classifier_rfr, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

# Held-out test-split report.
prediction = classifier_rfr.predict(X_test)
print('Accuracy: {:.2f} %'.format(accuracies.mean()*100))
print('\n RANDOM FOREST clasification report:\n \n', classification_report(y_test,prediction))
print('\n RANDOM FOREST confussion matrix:\n',confusion_matrix(y_test, prediction))
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
from sklearn.preprocessing import label_binarize
# Make predictions on the test set and obtain the predicted probabilities
y_pred_proba = classifier_rfr.predict_proba(X_test)
# Compute the ROC curve and ROC area for each class using the OvA strategy
fpr = dict()
tpr = dict()
roc_auc = dict()
# One-vs-all ROC per class (labels were shifted to 0..9 earlier).
for i in range(10):
    y_true = (y_test == i)
    fpr[i], tpr[i], _ = roc_curve(y_true, y_pred_proba[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
# Binarize the labels for the micro-average computation below.
y_true = label_binarize(y_test, classes=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
n_classes = y_true.shape[1]
# Plot ROC curve for each class
plt.figure()
lw = 2
colors = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'purple', 'pink', 'orange']
for i, color in zip(range(10), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=lw, label='ROC curve (area = %0.2f) for class %d' % (roc_auc[i], i))
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve for Random Forest Classifier')
plt.legend(loc="lower right")
plt.show()
# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_true.ravel(), y_pred_proba.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
# Plot the micro-average ROC curve.
plt.plot(fpr["micro"], tpr["micro"], color='deeppink', lw=lw, linestyle='--', label='Micro-average ROC curve (area = %0.2f)' % roc_auc["micro"])
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve for Random Forest Classifier')
plt.legend(loc="lower right")
plt.show()
"""#XGBoost Classifier"""
from xgboost import XGBClassifier
# Defining XGBoost classifier model.
# multi:softmax requires labels 0..n_classes-1 — hence the earlier `y -= 1` shift.
xgb_model = XGBClassifier(objective='multi:softmax')
xgb_model.fit(X_train, y_train)
# 10-fold cross-validated accuracy (cross_val_score imported in the SVM section).
accuracies = cross_val_score(estimator = xgb_model, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))
# Held-out test-split report.
prediction = xgb_model.predict(X_test)
print('XGBoost Accuracy: {:.2f} %'.format(accuracies.mean()*100))
print('\n XGBoost clasification report:\n \n', classification_report(y_test,prediction))
print('\n confussion matrix:\n',confusion_matrix(y_test, prediction))
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
from sklearn.preprocessing import label_binarize
# Make predictions on the test set and obtain the predicted probabilities
y_pred_proba = xgb_model.predict_proba(X_test)
# Compute the ROC curve and ROC area for each class using the OvA strategy
fpr = dict()
tpr = dict()
roc_auc = dict()
# One-vs-all ROC per class (labels were shifted to 0..9 earlier).
for i in range(10):
    y_true = (y_test == i)
    fpr[i], tpr[i], _ = roc_curve(y_true, y_pred_proba[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
# Compute micro-average ROC curve and ROC area
y_true = label_binarize(y_test, classes=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
n_classes = y_true.shape[1]
# Plot ROC curve for each class and micro-average ROC curve
plt.figure()
lw = 2
colors = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'purple', 'pink', 'orange']
for i, color in zip(range(10), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=lw, label='ROC curve (area = %0.2f) for class %d' % (roc_auc[i], i))
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve for XGBoost Classifier')
plt.legend(loc="lower right")
plt.show()
# Micro-average ROC over all 10 classes.
fpr["micro"], tpr["micro"], _ = roc_curve(y_true.ravel(), y_pred_proba.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
plt.plot(fpr["micro"], tpr["micro"], color='deeppink', lw=lw, linestyle='--', label='Micro-average ROC curve (area = %0.2f)' % roc_auc["micro"])
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve for XGBoost Classifier')
plt.legend(loc="lower right")
plt.show()