-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtrain_model.py
More file actions
81 lines (58 loc) · 2.72 KB
/
train_model.py
File metadata and controls
81 lines (58 loc) · 2.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib
print("Starting model training...")

# Names applied, in order, to the columns of the headerless training file.
# The last two entries ('label' and 'difficulty') are not input features:
# 'label' is the prediction target and 'difficulty' is dropped after loading.
columns = [
    'duration',
    'protocol_type',
    'service',
    'flag',
    'src_bytes',
    'dst_bytes',
    'land',
    'wrong_fragment',
    'urgent',
    'hot',
    'num_failed_logins',
    'logged_in',
    'num_compromised',
    'root_shell',
    'su_attempted',
    'num_root',
    'num_file_creations',
    'num_shells',
    'num_access_files',
    'num_outbound_cmds',
    'is_host_login',
    'is_guest_login',
    'count',
    'srv_count',
    'serror_rate',
    'srv_serror_rate',
    'rerror_rate',
    'srv_rerror_rate',
    'same_srv_rate',
    'diff_srv_rate',
    'srv_diff_host_rate',
    'dst_host_count',
    'dst_host_srv_count',
    'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate',
    'dst_host_srv_serror_rate',
    'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
    'label',
    'difficulty',
]
# Absolute location of the training data file. Keep the r'' (raw string)
# prefix so the backslashes in the Windows path are not read as escapes.
full_path_to_train_file = r'C:\Users\shree\Desktop\pythonst\KDDTrain+.txt'

try:
    # The file has no header row, so column names are supplied from `columns`.
    df = pd.read_csv(full_path_to_train_file, header=None, names=columns)
except FileNotFoundError:
    # Report the path that was actually tried; the file is looked up at this
    # absolute location, not next to the script.
    print(
        f"Error: KDDTrain+.txt not found at {full_path_to_train_file}. "
        "Please update full_path_to_train_file to point to the dataset."
    )
    # Terminate with a non-zero status so callers can detect the failure.
    # The bare exit() helper is injected by the `site` module for interactive
    # use and would have exited with status 0 (success).
    raise SystemExit(1)
else:
    print("Dataset loaded successfully.")
# The 'difficulty' column is discarded before any further processing.
df = df.drop(columns='difficulty')

# Every object-dtype column except the target ('label') gets integer-encoded.
categorical_cols = list(df.select_dtypes(include='object').columns)
categorical_cols.remove('label')

# Fit one LabelEncoder per categorical column and keep each fitted encoder,
# keyed by column name, so the same mapping can be reused after training.
encoders = {}
for column_name in categorical_cols:
    encoder = LabelEncoder()
    df[column_name] = encoder.fit_transform(df[column_name])
    encoders[column_name] = encoder

# Collapse the target to binary: 'normal' -> 0, any other value -> 1.
df['label'] = df['label'].apply(lambda value: int(value != 'normal'))
# Step 4: separate the target from the features, split, and fit the model.

# `y` is the binary target column; `X` is every remaining column.
y = df['label']
X = df.drop(columns='label')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("Training the Random Forest model...")

# Random forest with 100 trees and a fixed seed; n_jobs=-1 asks scikit-learn
# to use all available processors.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
)
model.fit(X_train, y_train)

print("Model training complete.")
# Step 5: score the fitted model on the held-out split, then persist it.

# Accuracy is measured on rows that were not used for fitting.
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Model Accuracy on Test Data: {accuracy * 100:.2f}%")

# Write the model first, then the fitted encoders, into the current working
# directory.
for artifact, filename in (
    (model, 'sass_model.joblib'),
    (encoders, 'sass_encoders.joblib'),
):
    joblib.dump(artifact, filename)

print("Model and encoders have been saved successfully.")