# Source: stabgan/Multiple-Linear-Regression, multiple_linear_regression.py (master branch)
# Multiple Linear Regression
# Predict startup profitability using backward elimination

# --- Imports (PEP 8: all imports at top) ---
import os

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, root_mean_squared_error
import statsmodels.api as sm


def main():
    """Fit and evaluate a multiple linear regression on ``50_Startups.csv``.

    Loads the CSV that sits next to this script, one-hot encodes feature
    column 3, fits a scikit-learn ``LinearRegression`` on an 80/20
    train/test split and prints R² and RMSE for the test part, then prints
    the statsmodels OLS summary for every step of a manual backward
    elimination run on the full data set.

    Returns:
        None. All results are written to standard output.
    """
    # --- Load dataset (relative to script location) ---
    script_dir = os.path.dirname(os.path.abspath(__file__))
    csv_path = os.path.join(script_dir, "50_Startups.csv")
    dataset = pd.read_csv(csv_path)
    X = dataset.iloc[:, :-1].values  # every column except the last
    y = dataset.iloc[:, -1].values   # last column is the target

    # --- Encode categorical data ---
    # ColumnTransformer + OneHotEncoder (modern sklearn API)
    # drop='first' avoids the dummy variable trap automatically
    # sparse_threshold=0 forces a dense result so the float cast below is
    # always valid, whatever the density of the encoded columns.
    ct = ColumnTransformer(
        transformers=[("encoder", OneHotEncoder(drop="first"), [3])],
        remainder="passthrough",
        sparse_threshold=0,
    )
    # The passthrough columns come from a mixed-type (object) array, so the
    # stacked result is object-dtype too. scikit-learn coerces that silently,
    # but statsmodels' OLS fails on object arrays (np.isfinite is undefined
    # for them), so convert to float once, here.
    X = np.asarray(ct.fit_transform(X), dtype=float)

    # --- Split into training / test sets (80/20) ---
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0
    )

    # --- Fit Multiple Linear Regression ---
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)

    # --- Predict & evaluate ---
    y_pred = regressor.predict(X_test)
    print("=== Model Evaluation ===")
    print(f"R² Score : {r2_score(y_test, y_pred):.4f}")
    print(f"RMSE     : {root_mean_squared_error(y_test, y_pred):.2f}")
    print()

    # --- Backward Elimination (statsmodels OLS) ---
    # sm.add_constant() is the recommended way to prepend an intercept column
    X_with_const = sm.add_constant(X)

    print("=== Backward Elimination ===\n")

    # Column subsets for each elimination step. Index 0 is the intercept
    # added above. Each later step drops the predictor whose p-value was the
    # highest above 0.05 in the previous summary; the subsets are fixed and
    # presumably were read off the summaries for the bundled data set, so
    # re-derive them if the CSV changes.
    elimination_steps = (
        list(range(X_with_const.shape[1])),  # Step 1: all predictors
        [0, 1, 3, 4, 5],                     # Step 2
        [0, 3, 4, 5],                        # Step 3
        [0, 3, 5],                           # Step 4
        [0, 3],                              # Step 5: R&D Spend only
    )
    for cols in elimination_steps:
        X_opt = X_with_const[:, cols]
        results = sm.OLS(endog=y, exog=X_opt).fit()
        print(results.summary(), "\n")

# Run the analysis only when this file is executed as a script, so that
# importing the module does not trigger file reads or printing.
if __name__ == "__main__":
    main()