Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 3 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@ engineering, EDA, model building, reporting, and CI setup.
## Project Structure

end-to-end/
├── .github/workflows/
│ ├── ci.yml
│ └── codeql.yml
├── data/
│ ├── raw/ # Original data (untouched)
│ └── processed/ # Cleaned, transformed data
Expand All @@ -25,18 +27,9 @@ engineering, EDA, model building, reporting, and CI setup.
│ │ └── train_model.py
│ └── utils/
│ └── data_loader.py
├── reports/
│ ├── interim_report.md
│ └── final_report.md
├── ci/
│ └── python-ci.yml # CI pipeline (GitHub Actions)
├── .gitignore
├── requirements.txt
├── README.md
└── setup.py (optional)

## How to Run This Project

Expand Down
9 changes: 9 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
pandas
numpy
matplotlib
seaborn
scikit-learn
scipy
statsmodels
dvc
pytest
Binary file added scripts/__pycache__/run_eda.cpython-311.pyc
Binary file not shown.
File renamed without changes.
File renamed without changes.
60 changes: 60 additions & 0 deletions scripts/run_eda.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# scripts/run_eda.py
import os
from src.utils.data_loader import load_csv, save_csv
from src.eda.eda_tools import (
data_structure, descriptive_stats, overall_loss_ratio, loss_ratio_by_group,
plot_loss_ratio_by_province, plot_totalclaims_distribution,
plot_claims_premium_time_series, scatter_premium_vs_claims, outlier_summary
)
import argparse
import json

def main(input_path, output_dir):
    """Run the EDA pipeline: load the data, write summaries, save figures.

    Args:
        input_path: Path of the CSV file to analyse.
        output_dir: Directory that receives the ``figures/`` and
            ``summaries/`` sub-directories (created when missing).
    """
    figures_dir = os.path.join(output_dir, "figures")
    summaries_dir = os.path.join(output_dir, "summaries")
    for directory in (output_dir, figures_dir, summaries_dir):
        os.makedirs(directory, exist_ok=True)

    print("Loading data:", input_path)
    df = load_csv(input_path, parse_dates=["TransactionMonth", "VehicleIntroDate"])

    # Basic data structure
    structure = data_structure(df)
    structure.to_csv(os.path.join(summaries_dir, "data_structure.csv"))

    # Descriptive stats for the key numeric columns that are actually present
    numeric_cols = ["TotalPremium", "TotalClaims", "CustomValueEstimate"]
    present = [c for c in numeric_cols if c in df.columns]
    stats = descriptive_stats(df, present)
    stats.to_csv(os.path.join(summaries_dir, "descriptive_stats.csv"))

    # Compute loss ratios
    overall_lr = overall_loss_ratio(df)
    lr_by_province = loss_ratio_by_group(df, "Province").reset_index()
    # reset_index() already turned Province into a regular column, so write
    # the file through save_csv (index=False) instead of emitting a
    # meaningless positional index column.
    save_csv(lr_by_province, os.path.join(summaries_dir, "loss_ratio_by_province.csv"))
    with open(os.path.join(summaries_dir, "loss_ratio_overall.json"), "w", encoding="utf-8") as f:
        # default=str is the fallback for values json cannot encode natively
        json.dump({"overall_loss_ratio": overall_lr}, f, default=str)

    # Outliers summary (skipped when the column is absent)
    outlier_tc = outlier_summary(df, "TotalClaims") if "TotalClaims" in df.columns else {}
    with open(os.path.join(summaries_dir, "outlier_totalclaims.json"), "w", encoding="utf-8") as f:
        json.dump(outlier_tc, f, default=str)

    # Create the three required plots plus the premium-vs-claims scatter
    p1 = plot_loss_ratio_by_province(df, figures_dir)
    p2 = plot_totalclaims_distribution(df, figures_dir)
    p3 = plot_claims_premium_time_series(df, figures_dir)
    p4 = scatter_premium_vs_claims(df, figures_dir)

    print("Saved figures:", p1, p2, p3, p4)
    print("Summaries saved to", summaries_dir)
    print("Overall Loss Ratio:", overall_lr)
    print("EDA complete")

if __name__ == "__main__":
    # Command-line entry point: each option falls back to its default path.
    cli = argparse.ArgumentParser(description="Run EDA for ACIS dataset")
    for flag, fallback in (("--input", "data/raw/data.csv"), ("--output", "reports")):
        cli.add_argument(flag, default=fallback)
    options = cli.parse_args()
    main(options.input, options.output)
File renamed without changes.
Binary file added src/eda/__pycache__/eda_tools.cpython-311.pyc
Binary file not shown.
120 changes: 120 additions & 0 deletions src/eda/eda_tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
# src/eda/eda_tools.py
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Tuple

sns.set(style="whitegrid", rc={"figure.dpi": 150})

def ensure_dir(path: str):
    """Create directory *path* (and any missing parents); do nothing if it exists."""
    os.makedirs(name=path, exist_ok=True)

# ---------- Summaries ----------
def data_structure(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise every column of *df*.

    The result is indexed by column name and holds the dtype (as a string),
    the non-null count, the null count and the number of distinct values
    (NaN counted as a value of its own).
    """
    dtype_names = df.dtypes.astype(str)
    filled = df.count()
    missing = df.isna().sum()
    distinct = df.nunique(dropna=False)
    return pd.DataFrame(
        {
            "dtype": dtype_names,
            "non_null_count": filled,
            "null_count": missing,
            "unique": distinct,
        }
    )

def descriptive_stats(df: pd.DataFrame, cols: list) -> pd.DataFrame:
    """Return ``describe()`` statistics for *cols*, one row per column.

    Args:
        df: Source data.
        cols: Names of the columns to summarise.

    Returns:
        The transposed ``DataFrame.describe()`` output, or an empty
        DataFrame when *cols* is empty.
    """
    if len(cols) == 0:
        # DataFrame.describe() raises ValueError on a frame without columns,
        # which happens when none of the requested columns are available.
        return pd.DataFrame()
    return df[cols].describe().T

# ---------- Business Metric: Loss Ratio ----------
def overall_loss_ratio(df: pd.DataFrame) -> float:
    """Return summed TotalClaims divided by summed TotalPremium.

    Missing values are skipped in both sums; NaN is returned when the
    premium sum is zero.
    """
    claims_sum = df["TotalClaims"].sum(skipna=True)
    premium_sum = df["TotalPremium"].sum(skipna=True)
    return claims_sum / premium_sum if premium_sum != 0 else np.nan

def loss_ratio_by_group(df: pd.DataFrame, group_col: str) -> pd.DataFrame:
    """Aggregate premium and claims per group and compute each group's loss ratio.

    Args:
        df: Data containing ``TotalPremium``, ``TotalClaims`` and *group_col*.
        group_col: Column to group by.

    Returns:
        A DataFrame indexed by group with the summed ``TotalPremium`` and
        ``TotalClaims`` plus ``LossRatio``, sorted by ``LossRatio`` in
        descending order. Groups whose premium sums to zero get a NaN ratio
        (matching ``overall_loss_ratio``) and therefore sort last.
    """
    totals = df.groupby(group_col)[["TotalPremium", "TotalClaims"]].sum()
    # Mask zero premiums so the division yields NaN instead of +/-inf, which
    # would otherwise dominate the ranking.
    premium = totals["TotalPremium"].where(totals["TotalPremium"] != 0)
    totals = totals.assign(LossRatio=totals["TotalClaims"] / premium)
    return totals.sort_values("LossRatio", ascending=False)

# ---------- Time series ----------
def monthly_claims_premiums(df: pd.DataFrame, date_col: str = "TransactionMonth") -> pd.DataFrame:
    """Aggregate claims and premiums per calendar month.

    Args:
        df: Data containing ``TotalClaims``, ``TotalPremium`` and *date_col*.
            The input is not modified.
        date_col: Column holding the transaction date; values that cannot be
            parsed as dates are dropped.

    Returns:
        A DataFrame indexed by month start with the summed ``TotalClaims`` and
        ``TotalPremium``, ``ClaimFrequency`` (number of rows with
        ``TotalClaims > 0``) and ``ClaimSeverity`` (claims per claiming row).
    """
    dated = df.copy()
    dated[date_col] = pd.to_datetime(dated[date_col], errors="coerce")
    dated = dated.dropna(subset=[date_col])

    # Build the monthly grouping once and reuse it for both aggregations.
    by_month = dated.groupby(pd.Grouper(key=date_col, freq="MS"))
    monthly = by_month[["TotalClaims", "TotalPremium"]].sum()
    monthly["ClaimFrequency"] = by_month["TotalClaims"].apply(lambda s: (s > 0).sum())
    # Severity: average claim amount per claim. Clipping the divisor at 1
    # avoids division by zero and replaces the slow row-wise apply.
    monthly["ClaimSeverity"] = monthly["TotalClaims"] / monthly["ClaimFrequency"].clip(lower=1)
    return monthly

# ---------- Outlier detection ----------
def outlier_summary(df: pd.DataFrame, col: str) -> dict:
    """Summarise IQR-based outliers of column *col*.

    Args:
        df: Source data.
        col: Name of the numeric column to inspect; NaN values are ignored.

    Returns:
        A dict with the quartiles ``q1``/``q3``, the ``iqr``, the ``lower`` and
        ``upper`` fences (1.5 * IQR beyond the quartiles) and ``n_outliers``,
        the number of values outside the fences. All values are plain Python
        ``float``/``int`` so the dict can be passed to ``json.dump`` directly.
    """
    values = df[col].dropna()
    # Convert numpy scalars to built-in floats; numpy integers in particular
    # are rejected by the json module.
    q1, q3 = (float(q) for q in values.quantile([0.25, 0.75]))
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    outside = (values < lower) | (values > upper)
    return {
        "q1": q1,
        "q3": q3,
        "iqr": iqr,
        "lower": lower,
        "upper": upper,
        "n_outliers": int(outside.sum()),
    }

# ---------- Plots (3 required polished plots) ----------
def plot_loss_ratio_by_province(df: pd.DataFrame, outdir: str):
    """Save a bar chart of the loss ratio per Province and return the PNG path."""
    ensure_dir(outdir)
    by_province = loss_ratio_by_group(df, "Province")
    target = os.path.join(outdir, "loss_ratio_by_province.png")

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x=by_province.index, y=by_province["LossRatio"], ax=ax)
    # Tilt the province labels so long names do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
    ax.set_ylabel("Loss Ratio (TotalClaims / TotalPremium)")
    ax.set_title("Loss Ratio by Province")
    fig.tight_layout()
    fig.savefig(target)
    plt.close(fig)
    return target

def plot_totalclaims_distribution(df: pd.DataFrame, outdir: str):
    """Save a histogram (with KDE) of TotalClaims and return the PNG path."""
    ensure_dir(outdir)
    target = os.path.join(outdir, "totalclaims_distribution.png")
    claims = df["TotalClaims"].dropna()

    fig, ax = plt.subplots(figsize=(8, 5))
    sns.histplot(claims, bins=100, kde=True, ax=ax)
    # A symmetric log axis copes with heavy skew/outliers while keeping
    # zeros visible.
    ax.set_xscale("symlog")
    ax.set_xlabel("TotalClaims (symlog scale)")
    ax.set_title("Distribution of TotalClaims (log-friendly)")
    fig.tight_layout()
    fig.savefig(target)
    plt.close(fig)
    return target

def plot_claims_premium_time_series(df: pd.DataFrame, outdir: str, date_col="TransactionMonth"):
    """Save a line chart of monthly TotalClaims vs TotalPremium.

    Args:
        df: Data containing ``TotalClaims``, ``TotalPremium`` and *date_col*.
        outdir: Directory for the image (created when missing).
        date_col: Column holding the transaction date.

    Returns:
        Path of the written PNG file.
    """
    ensure_dir(outdir)
    monthly = monthly_claims_premiums(df, date_col=date_col)
    fig, ax = plt.subplots(figsize=(10, 6))
    # Pass the axes explicitly: DataFrame.plot() without ``ax`` opens its own
    # figure, which left the sized figure orphaned (never closed) and saved
    # the chart at the default size.
    monthly[["TotalClaims", "TotalPremium"]].plot(ax=ax, title="Monthly TotalClaims vs TotalPremium")
    ax.set_ylabel("Amount (local currency)")
    fig.tight_layout()
    path = os.path.join(outdir, "monthly_claims_premium.png")
    fig.savefig(path)
    plt.close(fig)
    return path

# ---------- Bivariate exploration ----------
def scatter_premium_vs_claims(df: pd.DataFrame, outdir: str, sample=10000):
    """Save a TotalPremium vs TotalClaims scatter of a row sample; return the PNG path.

    At most *sample* rows are drawn, using a fixed random seed so repeated
    runs plot the same rows.
    """
    ensure_dir(outdir)
    target = os.path.join(outdir, "scatter_premium_vs_claims.png")
    n = min(len(df), sample)
    subset = df.sample(n=n, random_state=42)

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.scatterplot(x=subset["TotalPremium"], y=subset["TotalClaims"], alpha=0.6, ax=ax)
    ax.set_xscale("symlog")
    ax.set_yscale("symlog")
    ax.set_xlabel("TotalPremium (symlog)")
    ax.set_ylabel("TotalClaims (symlog)")
    ax.set_title(f"Scatter: TotalPremium vs TotalClaims (sample n={n})")
    fig.tight_layout()
    fig.savefig(target)
    plt.close(fig)
    return target
4 changes: 4 additions & 0 deletions src/features/build_features.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
def build_features(df, reference_year=2025):
    """Add the ``VehicleAge`` and ``ClaimOccurred`` feature columns to *df*.

    Args:
        df: DataFrame with ``RegistrationYear`` and ``TotalClaims`` columns.
            It is modified in place.
        reference_year: Year against which the vehicle age is measured.
            Defaults to 2025, the value that was previously hard-coded.

    Returns:
        The same DataFrame object, with ``VehicleAge``
        (``reference_year - RegistrationYear``) and ``ClaimOccurred``
        (1 when ``TotalClaims > 0``, else 0) added.
    """
    df["VehicleAge"] = reference_year - df["RegistrationYear"]
    df["ClaimOccurred"] = (df["TotalClaims"] > 0).astype(int)
    return df
Empty file removed src/models/__init__.py
Empty file.
19 changes: 19 additions & 0 deletions src/models/train_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,r2_score

def train_model(df, target, test_size=0.2, random_state=None):
    """Fit a random-forest regressor on *df* and score it on a hold-out split.

    Args:
        df: DataFrame holding the feature columns and the target column.
        target: Name of the column to predict; every other column is used
            as a feature.
        test_size: Fraction of rows held out for evaluation.
        random_state: Seed for both the split and the forest. The default
            ``None`` keeps the previous unseeded behaviour; pass an int for
            reproducible results.

    Returns:
        A ``(model, rmse, r2)`` tuple: the fitted model and its hold-out
        root-mean-squared error and R^2 score.
    """
    X = df.drop(columns=[target])
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    model = RandomForestRegressor(random_state=random_state)
    model.fit(X_train, y_train)

    preds = model.predict(X_test)

    # mean_squared_error(..., squared=False) was deprecated in scikit-learn
    # 1.4 and removed in 1.6; taking the square root of the MSE works on
    # every version.
    rmse = float(mean_squared_error(y_test, preds)) ** 0.5
    r2 = r2_score(y_test, preds)

    return model, rmse, r2
Empty file removed src/utils/__init__.py
Empty file.
Binary file added src/utils/__pycache__/data_loader.cpython-311.pyc
Binary file not shown.
19 changes: 19 additions & 0 deletions src/utils/data_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# src/utils/data_loader.py
import pandas as pd
from typing import Optional

def load_csv(path: str, parse_dates: Optional[list] = None) -> pd.DataFrame:
    """
    Read the CSV file at *path* into a DataFrame.
    - path: file path
    - parse_dates: column names to convert to datetimes; names missing from
      the file are skipped and unparseable values become NaT
    """
    frame = pd.read_csv(path, low_memory=False)
    for column in parse_dates or ():
        if column not in frame.columns:
            continue
        frame[column] = pd.to_datetime(frame[column], errors="coerce")
    return frame

def save_csv(df: pd.DataFrame, path: str):
    """Write *df* to *path* as CSV, leaving out the index column."""
    df.to_csv(path_or_buf=path, index=False)
File renamed without changes.
4 changes: 4 additions & 0 deletions tests/test_data_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from src.utils.data_loader import load_csv


def test_loader(tmp_path):
    """load_csv reads a CSV file and coerces the requested date columns.

    The module exposes ``load_csv`` (there is no ``load_data``), so the
    previous import failed before the test could run.
    """
    csv_path = tmp_path / "sample.csv"
    csv_path.write_text(
        "TransactionMonth,TotalPremium\n2015-03-01,10.5\nnot-a-date,3.0\n",
        encoding="utf-8",
    )

    df = load_csv(str(csv_path), parse_dates=["TransactionMonth", "Missing"])

    assert list(df.columns) == ["TransactionMonth", "TotalPremium"]
    assert df["TotalPremium"].tolist() == [10.5, 3.0]
    # The unparseable value is coerced to NaT rather than raising.
    assert df["TransactionMonth"].notna().sum() == 1
Loading