# model_building.py
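"""Model building and evaluation for energy consumption forecasting.

Trains Linear Regression, Random Forest, and Gradient Boosting models with
time series cross-validation, plus an LSTM baseline, and reports the MSE,
MAE, and Bias of each.
"""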
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import sys

# Make the data_preprocessing module importable.
# This assumes data_preprocessing.py lives in a directory called 'data_processing';
# adjust the path if your layout differs.
sys.path.append('./data_processing')
from data_preprocessing import engineer_features
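
# For reference, a minimal sketch of what this module assumes
# data_preprocessing.engineer_features provides (the real implementation may
# do more). The column names are taken from how the features are used below;
# this helper is illustrative only and is not called anywhere.
def _engineer_features_sketch(df):
    """Illustrative sketch of the lag and interaction features expected here."""
    df = df.copy()
    df['energy_consumption_lag1'] = df['energy_consumption'].shift(1)
    df['energy_consumption_lag2'] = df['energy_consumption'].shift(2)
    df['temp_humidity_interaction'] = df['temperature'] * df['humidity']
    return df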


def calculate_bias(y_true, y_pred):
    """Calculate the bias (mean error) of the predictions."""
    return np.mean(y_pred - y_true)


def train_and_evaluate_models(df):
    """Train and evaluate multiple models using time series cross-validation.

    Lag features (energy_consumption_lag1, energy_consumption_lag2) are added
    by engineer_features; for each model the MSE, MAE, and Bias averaged
    across the folds are reported. An LSTM is also trained, but on a single
    chronological train/test split rather than cross-validation.
    """
    # Engineer features (including lag features).
    df = engineer_features(df)

    # Drop the NaN rows introduced by the lag features; the models cannot be
    # fit on missing values.
    df = df.dropna()

    # Define features (X) and target (y).
    X = df[['temperature', 'humidity', 'energy_consumption_lag1',
            'temp_humidity_interaction']]
    y = df['energy_consumption']

    # Time series split (adjust the number of splits as needed).
    tscv = TimeSeriesSplit(n_splits=5)

    # Initialize models.
    models = {
        "Linear Regression": LinearRegression(),
        "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
        "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42)
    }

    # Collect per-fold metrics for each model.
    results = {}
    for model_name in models:
        results[model_name] = {"MSE": [], "MAE": [], "Bias": []}

    # Time series cross-validation loop.
    for train_index, test_index in tscv.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Train and evaluate the traditional models on this fold.
        for model_name, model in models.items():
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            results[model_name]["MSE"].append(mean_squared_error(y_test, y_pred))
            results[model_name]["MAE"].append(mean_absolute_error(y_test, y_pred))
            results[model_name]["Bias"].append(calculate_bias(y_test, y_pred))

    # Average the metrics across all folds.
    for model_name in models:
        results[model_name]["MSE"] = np.mean(results[model_name]["MSE"])
        results[model_name]["MAE"] = np.mean(results[model_name]["MAE"])
        results[model_name]["Bias"] = np.mean(results[model_name]["Bias"])
    # LSTM baseline. Note: this uses a single chronological train/test split
    # (shuffle=False, so the test set is the most recent 20% of the data),
    # not the TimeSeriesSplit cross-validation used above; refactoring it to
    # use TSCV as well is left as a follow-up.
    X_lstm = X.values.reshape((X.shape[0], 1, X.shape[1]))
    X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(
        X_lstm, y, test_size=0.2, shuffle=False)

    # Create and train the LSTM model.
    lstm_model = Sequential([
        LSTM(50, activation='relu', input_shape=(1, X.shape[1])),
        Dense(1)
    ])
    lstm_model.compile(optimizer='adam', loss='mse')
    lstm_model.fit(X_train_lstm, y_train_lstm, epochs=50, batch_size=32, verbose=0)

    # Evaluate the LSTM model on the held-out split.
    y_pred_lstm = lstm_model.predict(X_test_lstm).flatten()
    mse_lstm = mean_squared_error(y_test_lstm, y_pred_lstm)
    mae_lstm = mean_absolute_error(y_test_lstm, y_pred_lstm)
    bias_lstm = calculate_bias(y_test_lstm, y_pred_lstm)
    results["LSTM"] = {"MSE": mse_lstm, "MAE": mae_lstm, "Bias": bias_lstm}

    return results
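

# The LSTM above is evaluated on a single chronological split rather than with
# TimeSeriesSplit. Below is a minimal sketch of how that refactor could look;
# it expects the same feature matrix X and target y that
# train_and_evaluate_models builds internally, and is illustrative rather than
# a drop-in replacement.
def evaluate_lstm_with_tscv(X, y, n_splits=5, epochs=50, batch_size=32):
    """Sketch: evaluate an LSTM with time series cross-validation.

    A fresh model is built for every fold so no weights leak between folds;
    the MSE, MAE, and Bias averaged across the folds are returned.
    """
    tscv = TimeSeriesSplit(n_splits=n_splits)
    X_lstm = X.values.reshape((X.shape[0], 1, X.shape[1]))
    y_values = y.values
    mse_scores, mae_scores, bias_scores = [], [], []
    for train_index, test_index in tscv.split(X_lstm):
        X_train, X_test = X_lstm[train_index], X_lstm[test_index]
        y_train, y_test = y_values[train_index], y_values[test_index]
        # Rebuild the model for every fold.
        model = Sequential([
            LSTM(50, activation='relu', input_shape=(1, X.shape[1])),
            Dense(1)
        ])
        model.compile(optimizer='adam', loss='mse')
        model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0)
        y_pred = model.predict(X_test).flatten()
        mse_scores.append(mean_squared_error(y_test, y_pred))
        mae_scores.append(mean_absolute_error(y_test, y_pred))
        bias_scores.append(calculate_bias(y_test, y_pred))
    return {
        "MSE": np.mean(mse_scores),
        "MAE": np.mean(mae_scores),
        "Bias": np.mean(bias_scores),
    }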


if __name__ == '__main__':
    # Load the raw dataset; feature engineering is handled inside
    # train_and_evaluate_models.
    df = pd.read_csv('./energy_consumption_data.csv')

    # Train the models and evaluate their performance.
    model_results = train_and_evaluate_models(df)

    # Print the results for each model.
    for model_name, metrics in model_results.items():
        print(f"{model_name}:")
        print(f"  MSE: {metrics['MSE']:.2f}")
        print(f"  MAE: {metrics['MAE']:.2f}")
        print(f"  Bias: {metrics['Bias']:.2f}")