pythonst/train_model.py at main · cheron2000/pythonst · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from pathlib import Path

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

print("Starting model training...")


# Column names for the header-less training file, listed in file order.
# They are passed to pd.read_csv(names=...) when the dataset is loaded.
# The last two entries are special: 'label' is the prediction target and
# 'difficulty' is dropped before training.
columns = [
    "duration", "protocol_type", "service", "flag",
    "src_bytes", "dst_bytes",
    "land", "wrong_fragment", "urgent", "hot",
    "num_failed_logins", "logged_in", "num_compromised",
    "root_shell", "su_attempted",
    "num_root", "num_file_creations", "num_shells",
    "num_access_files", "num_outbound_cmds",
    "is_host_login", "is_guest_login",
    "count", "srv_count",
    "serror_rate", "srv_serror_rate",
    "rerror_rate", "srv_rerror_rate",
    "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
    "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate",
    "label", "difficulty",
]


# Load the training set from the folder that contains this script. That is
# the location the "not found" message below asks the user to use, and it
# keeps the script runnable on any machine instead of relying on one
# user's absolute Windows path.
full_path_to_train_file = Path(__file__).resolve().parent / 'KDDTrain+.txt'

try:
    # The file has no header row, so column names are supplied explicitly.
    df = pd.read_csv(full_path_to_train_file, header=None, names=columns)
except FileNotFoundError:
    print("Error: KDDTrain+.txt not found. Please place it in the same folder.")
    # Exit with a non-zero status so callers can detect the failure.
    # SystemExit is raised directly because the bare exit() helper is
    # installed by the `site` module and is not guaranteed to exist.
    raise SystemExit(1) from None
else:
    print("Dataset loaded successfully.")

# 'difficulty' is removed before any further processing.
df = df.drop(columns='difficulty')


# Collect the object-dtype columns, then take the target out of that list.
# list.index raises ValueError if 'label' is not among them.
categorical_cols = list(df.select_dtypes(include=['object']).columns)
del categorical_cols[categorical_cols.index('label')]


# One LabelEncoder per categorical column. The fitted encoders are kept in
# `encoders` (keyed by column name) so they can be saved with the model.
encoders = {name: LabelEncoder() for name in categorical_cols}
for name, encoder in encoders.items():
    df[name] = encoder.fit_transform(df[name])


# Binarise the target: 'normal' becomes 0 and every other value becomes 1.
df['label'] = df['label'].ne('normal').astype('int64')


# Step 4: separate features from the target and hold out a test set.
# X holds every column except 'label'; y is the 'label' column itself.
X = df.drop(columns='label')
y = df['label']

# 80/20 train/test split with a fixed seed so the split is reproducible.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Training the Random Forest model...")
# 100 trees, a fixed seed, and n_jobs=-1 (all available cores).
forest_settings = {'n_estimators': 100, 'random_state': 42, 'n_jobs': -1}
model = RandomForestClassifier(**forest_settings)
# Fit on the training portion only; the test portion stays unseen.
model.fit(X_train, y_train)
print("Model training complete.")

# Step 5: score the model on the held-out rows and report the accuracy.
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
accuracy_pct = accuracy * 100
print(f"Model Accuracy on Test Data: {accuracy_pct:.2f}%")

# Write the fitted model and the per-column encoders to the current
# working directory.
for artifact, filename in (
    (model, 'sass_model.joblib'),
    (encoders, 'sass_encoders.joblib'),
):
    joblib.dump(artifact, filename)
print("Model and encoders have been saved successfully.")