-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtrain_pipeline.py
More file actions
101 lines (81 loc) · 3.42 KB
/
train_pipeline.py
File metadata and controls
101 lines (81 loc) · 3.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# train_pipeline.py
"""End-to-end training pipeline for the flight-price regression model.

Loads the cleaned dataset, preprocesses it, trains a model (with optional
hyperparameter tuning), evaluates it, and saves all artifacts under
``models/``.
"""
import os
import json
import pandas as pd
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from src.data_processing import load_data, preprocess_data
from src.model_training import tune_hyperparameters, save_model
# -------------------------------------------------
# CONFIG
# -------------------------------------------------
# Path to the cleaned input dataset (CSV).
DATA_PATH = "data/Clean_Dataset.csv"
# Where the trained model + preprocessor bundle is persisted.
MODEL_PATH = "models/flight_price_model.pkl"
DEBUG_MODE = False # ⚡️ True: quick test, False: full hyperparameter tuning
def main():
    """Run the full flight-price training pipeline.

    Steps: load the dataset from ``DATA_PATH``, preprocess it into
    train/test splits, fit a model (a plain DecisionTreeRegressor in
    DEBUG_MODE, otherwise via ``tune_hyperparameters``), report R²/MAE/MSE
    on the test split, and persist the model plus evaluation artifacts
    (sample frames, feature names, metrics) under ``models/``.
    """
    print("🚀 Loading dataset...")
    df = load_data(DATA_PATH)
    print(f"✅ Data loaded successfully! Rows: {df.shape[0]}, Columns: {df.shape[1]}")

    print("⚙️ Preprocessing data...")
    X_train, X_test, y_train, y_test, preprocessor, encoded_cols = preprocess_data(df)
    print("✅ Preprocessing complete.")

    # -------------------------------------------------
    # MODEL TRAINING
    # -------------------------------------------------
    if DEBUG_MODE:
        print("🧪 DEBUG MODE: Using default DecisionTreeRegressor (no tuning)...")
        from sklearn.tree import DecisionTreeRegressor
        model = DecisionTreeRegressor(random_state=42)
        model.fit(X_train, y_train)
        best_params = model.get_params()
    else:
        print("🧠 Training model with hyperparameter tuning...")
        model, best_params = tune_hyperparameters(X_train, y_train)

    # -------------------------------------------------
    # EVALUATION
    # -------------------------------------------------
    print("📊 Evaluating model...")
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    print("\n✅ MODEL PERFORMANCE:")
    print(f" R² Score: {r2:.4f}")
    print(f" MAE: {mae:,.2f}")
    print(f" MSE: {mse:,.2f}\n")

    # -------------------------------------------------
    # SAVE MODEL + METADATA
    # -------------------------------------------------
    print("💾 Saving model and metadata...")
    # BUGFIX: create the output directory BEFORE anything writes into it
    # (the original called makedirs only after save_model, which targets
    # models/flight_price_model.pkl).
    os.makedirs("models", exist_ok=True)
    save_model(model, preprocessor, MODEL_PATH)

    # Keep the evaluation sample small so the saved artifacts stay light.
    sample_n = min(200, X_test.shape[0])
    X_test_sample = X_test[:sample_n]
    y_test_sample = y_test[:sample_n]

    feature_names = preprocessor.get_feature_names_out().tolist()

    # Convert ONCE: sparse matrix -> dense, then wrap anything that is not
    # already a DataFrame. (The original converted twice into two identical
    # frames, and crashed on dense ndarrays, which lack .to_parquet.)
    if hasattr(X_test_sample, "toarray"):
        X_test_sample = X_test_sample.toarray()
    if not isinstance(X_test_sample, pd.DataFrame):
        X_test_sample = pd.DataFrame(X_test_sample, columns=feature_names)

    X_test_sample.to_parquet("models/eval_X_raw.parquet")
    # NOTE(review): y_test is assumed to be a pandas Series here (to_frame) —
    # confirm against preprocess_data.
    y_test_sample.to_frame("price").to_parquet("models/eval_y.parquet")
    # NOTE(review): the "raw" and "trans" artifacts were identical in the
    # original (both derived from the already-transformed X_test); both files
    # are kept for backward compatibility with downstream consumers.
    X_test_sample.to_parquet("models/eval_X_trans.parquet")

    with open("models/feature_names.json", "w") as f:
        json.dump(feature_names, f, indent=4)

    with open("models/metadata.json", "w") as f:
        json.dump({
            "r2": r2,
            "mae": mae,
            "mse": mse,
            # NOTE(review): assumes best_params contains only
            # JSON-serializable values — verify against tune_hyperparameters.
            "best_params": best_params,
        }, f, indent=4)

    print("✅ All model artifacts saved under 'models/'.")
    print("🏁 Pipeline completed successfully.")


if __name__ == "__main__":
    main()