-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: hyperparameter_tuning.py
More file actions
124 lines (103 loc) · 3.94 KB
/
hyperparameter_tuning.py
File metadata and controls
124 lines (103 loc) · 3.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# hyperparameter_tuning.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import warnings
warnings.filterwarnings('ignore')
# ================================
# 🔷 1. Load dataset
# ================================
# Expects login_dataset.csv in the working directory.
df = pd.read_csv("login_dataset.csv")
print("✅ Dataset loaded. Shape:", df.shape)
# ================================
# 🔷 2. Encode categorical features
# ================================
# LabelEncoder assigns integer codes in sorted order, so 'Known' -> 0 and 'New' -> 1.
novelty_encoder = LabelEncoder()
df['Device novelty'] = novelty_encoder.fit_transform(df['Device novelty'])
# ================================
# 🔷 3. Drop unnecessary columns
# ================================
# Identifier/location columns are excluded from the feature matrix;
# 'Label' is the target and must not appear in X.
drop_cols = ['Label', 'UserID', 'DeviceID', 'IP Address', 'Timestamp', 'Country', 'City']
X = df.drop(columns=drop_cols)
y = df['Label']
# ================================
# 🔷 4. Train-Test split with stratification
# ================================
# Stratify on y so the (likely imbalanced) class ratio is preserved in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
# ================================
# 🔷 5. Feature scaling
# ================================
# Fit the scaler on the training split only; the test split is transformed
# with the training statistics to avoid test-set leakage.
feature_scaler = StandardScaler()
X_train_scaled = feature_scaler.fit_transform(X_train)
X_test_scaled = feature_scaler.transform(X_test)
# ================================
# 🔷 6. SMOTE Oversampling
# ================================
# Synthetic minority oversampling on the (scaled) training data only.
# NOTE(review): the resampled data is later fed to GridSearchCV, so synthetic
# samples leak across CV folds and inflate CV scores; consider moving SMOTE
# inside an imblearn Pipeline passed to GridSearchCV — confirm intent.
oversampler = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train_scaled, y_train)
# ================================
# 🔷 7. Hyperparameter Tuning - Logistic Regression
# ================================
print("\n🔷 Tuning Logistic Regression...")
# Search space: regularization strength, penalty/solver combos, class weighting.
lr_param_grid = {
    'C': [0.01, 0.1, 1, 10],          # inverse regularization strength
    'penalty': ['l2'],                # l1 would require saga only; l2 works with both solvers
    'solver': ['lbfgs', 'saga'],
    'class_weight': ['balanced', None],
}
# 5-fold CV maximizing ROC AUC over the resampled training data.
lr_search = GridSearchCV(
    LogisticRegression(random_state=42, max_iter=1000),
    lr_param_grid,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=2,
)
lr_search.fit(X_train_resampled, y_train_resampled)
print("✅ Best Logistic Regression Params:", lr_search.best_params_)
print("✅ Best ROC AUC:", lr_search.best_score_)
# Evaluate the winning estimator on the held-out (scaled, un-resampled) test set.
best_lr = lr_search.best_estimator_
y_pred_lr = best_lr.predict(X_test_scaled)
print("\n🔷 Logistic Regression Test Performance:")
print(classification_report(y_test, y_pred_lr))
lr_test_proba = best_lr.predict_proba(X_test_scaled)[:, 1]
print("ROC AUC Score:", roc_auc_score(y_test, lr_test_proba))
# ================================
# 🔷 8. Hyperparameter Tuning - Random Forest
# ================================
print("\n🔷 Tuning Random Forest...")
# Search space: ensemble size, tree depth, split threshold, class weighting.
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'class_weight': ['balanced', 'balanced_subsample'],
}
# Same CV protocol as the logistic-regression search for a fair comparison.
rf_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    rf_param_grid,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=2,
)
rf_search.fit(X_train_resampled, y_train_resampled)
print("✅ Best Random Forest Params:", rf_search.best_params_)
print("✅ Best ROC AUC:", rf_search.best_score_)
# Evaluate the winning estimator on the held-out (scaled, un-resampled) test set.
best_rf = rf_search.best_estimator_
y_pred_rf = best_rf.predict(X_test_scaled)
print("\n🔷 Random Forest Test Performance:")
print(classification_report(y_test, y_pred_rf))
rf_test_proba = best_rf.predict_proba(X_test_scaled)[:, 1]
print("ROC AUC Score:", roc_auc_score(y_test, rf_test_proba))
# ================================
# 🔷 9. Confusion Matrices
# ================================
# Print both models' test-set confusion matrices side by side for comparison.
for model_name, preds in (("Logistic Regression", y_pred_lr), ("Random Forest", y_pred_rf)):
    print(f"\n🔷 Confusion Matrix ({model_name}):\n", confusion_matrix(y_test, preds))
print("\n✅ Hyperparameter tuning and model evaluation complete.")