From 56dccce5dfcd6446deae3cb4cbda51e23c146de3 Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 5 Dec 2025 22:36:47 +0300 Subject: [PATCH] add the script, and the else --- README.md | 13 +- requirements.txt | 9 ++ scripts/__pycache__/run_eda.cpython-311.pyc | Bin 0 -> 4687 bytes .../build_features.py | 0 src/__init__.py => scripts/evaluate.py | 0 scripts/run_eda.py | 60 +++++++++ src/eda/__init__.py => scripts/train.py | 0 src/eda/__pycache__/eda_tools.cpython-311.pyc | Bin 0 -> 9675 bytes src/eda/eda_tools.py | 120 ++++++++++++++++++ src/features/build_features.py | 4 + src/models/__init__.py | 0 src/models/train_model.py | 19 +++ src/utils/__init__.py | 0 .../__pycache__/data_loader.cpython-311.pyc | Bin 0 -> 1170 bytes src/utils/data_loader.py | 19 +++ src/features/__init__.py => tests/features.py | 0 tests/test_data_loader.py | 4 + 17 files changed, 238 insertions(+), 10 deletions(-) create mode 100644 scripts/__pycache__/run_eda.cpython-311.pyc rename reports/final_report.md => scripts/build_features.py (100%) rename src/__init__.py => scripts/evaluate.py (100%) create mode 100644 scripts/run_eda.py rename src/eda/__init__.py => scripts/train.py (100%) create mode 100644 src/eda/__pycache__/eda_tools.cpython-311.pyc delete mode 100644 src/models/__init__.py delete mode 100644 src/utils/__init__.py create mode 100644 src/utils/__pycache__/data_loader.cpython-311.pyc rename src/features/__init__.py => tests/features.py (100%) create mode 100644 tests/test_data_loader.py diff --git a/README.md b/README.md index 4591c2b..04e3532 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,9 @@ engineering, EDA, model building, reporting, and CI setup. ## Project Structure end-to-end/ - │ + .github\workflows + │ |──ci,yml + | |──codeql.yml ├── data/ │ ├── raw/ # Original data (untouched) │ └── processed/ # Cleaned, transformed data @@ -25,18 +27,9 @@ engineering, EDA, model building, reporting, and CI setup. │ │ └── train_model.py │ └── utils/ │ └── data_loader.py - │ - ├── reports/ - │ ├── interim_report.md - │ └── final_report.md - │ - ├── ci/ - │ └── python-ci.yml # CI pipeline (GitHub Actions) - │ ├── .gitignore ├── requirements.txt ├── README.md - └── setup.py (optional) ## How to Run This Project diff --git a/requirements.txt b/requirements.txt index e69de29..c35cadf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -0,0 +1,9 @@ +pandas +numpy +matplotlib +seaborn +scikit-learn +scipy +statsmodels +dvc +pytest diff --git a/scripts/__pycache__/run_eda.cpython-311.pyc b/scripts/__pycache__/run_eda.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1815e0e6fa50f7e4d5f216b2e025084f776cfccf GIT binary patch literal 4687 zcmcf^Yi|?T^^SdG&-fMFd6Dokk0nirO(8r3OO`wp$g*sb(p{64HF$>LFyryw83%A3 z$!b^Huwqq;K#8hWwfcd21y%bK_Di)O^}`GriD;xqm8yQ|w^Vc`#HXG+j^hUpbyd{s zxp&UJ=gfKBd+s^MfAIS~2-+_W^{4g#^mjU`G`s;kA7T(%MgkHT3B~E#Au+Ne?vR~v zCqw5>3Cm6KCfOBt$?mvYX5*~P#W~p%_sHJ3SN6qyvOn&(X&EUH57@k#y)3 zNNBp|pwbAu#K3&GqJrRhkRYl_lFDc)AW=15Q?0PFAQE1Z5|W~-3BqeB#cHX- ziOV+=b41BztgRVI(Q2qDCPS2kR5~eI+ij$#XuOn^_>`2`Ax1x%I}p19GvyvFFM{J%kd6m;RadGH^_3n zRu)%PUuepsdR`3Dyj6T22`<4s=Xk`{Q+*ZG{8g%3!TKw+i1WBe_x_dkgZ9)ZaP?B_ zr7&m?37+3Gi%m>K&O)$$egrLgw5?U{ z^MML1gz{{P$$PXNl@){@g&)8@e7JcR?LZHSp`Is$(%wDjAX4#l#|`JF=sLroPa#AD zb<3^be6YfA0g<4#r!qqb&Y-29Q<_PkMeqM$eQQ0x|K9q%_uX2qy;o~zz9H%@`n2x1 z;{96BTk!#H|6B3RT5a@c%lq>Fd?4Tapb1VBdbxq0)l#+R8)9n%z9Zidof`048ogLR zkQ_VD&-5@BE8c)ilu)i&?klM|NP?=xl`{$Cot%FX(q<|>7omxAzzWU~KCSXenzTMt z(%Mxkcu~BXN=o8HS|iE`uZh6Yo8gHnCg2m`5Ux+LF`9UtmT!ip!VBeOW_j(|u$>sK zmf@_bDe^^L%8H{ZBtc-4+{{$4KeR}q(d79)816I65C!w-Gyfu!XKp#>VXfR2(4V|{ z=2wo}fob#&0=3*pMas%)6~sGYKem{pQcnX=!E6J*Y6ONC1*8WC z%FmsTOS?+sQLc2LbYSUoI+hMR2O5-Xd!_QAJ7uvKHY$i*SAESIGQLe=1 zRj4bN>x5>ZZ@_p#yuxQCjkHr~TPig}O+{jVkqb;ibtFVKTwx#=p0=wB@P!Din}mp_ zx+7t0_ZL z;4O6H3H5l(r75%?BQCl^^(8D__JS{pS|XJe#2Xe?GGZDSP^++zl{1zT>L=1gsc9l6 z^u7>USreKqh!2LIkmX6GGg&QROZR}$bOH)l-VFooiGS1I?I~BS2YuAt}o8r@f2rt+-u-o6ZZnJ7H+-Qz1(-VZzWecVT6yE;Uk4} zWlC>R(D8l)?>F&&0M=UD?~N}{-kmIVl`a{rC(PCpg%8V={u0)4)WA^_M*;W;=9Uhu z3>bKyiTCMv-x?RX)2r`)UmyHf@14@QX@i?Kx#_~t_bwFLxk8NKZZo*MFtLjL&v5Hg z+(N)ZSIr;O+`{8>X%iHd5TaoqXh!Gk!L!$;BGx3;?$3U6z z-r|rE>NZ2&g|lmIKPrVwaii_H*>?O{+sMxg7QB$Lj7i_Ux(+DIP5{a4I@Ti)C`P%s~X%nlRHN_#~y!fn{cOh zh5sWcA2ryh$wmR7%ExRHd!U-fsNywiRcsa7(72K6dlq@#D^8WR8r%_+J5m^07>COGr41QN>A&O!JRO<6Lh+}c&PX@gX=fB{=yJAr0(fj87ex8LnTMmqv`UFqTj$V z6UTHMTcZm*i(LlpGjX4e`(R-yX5dZ}cj}c#-3xc>yWr08lg{r>;_*TEV+RA{ljiLc z$DL1(zT1r5N;>C`M(voB z3(>@PfbiE3P=Lm1LMyBs8;zC&k>w_0Mj@Ko(@AJeVzsFx8G}EBk{YvOp&cVFmM3#|ZYEJ+XNMPRVw#6Pnh7g#hRi{@OKTTs z`X-jEOltXfK}f)tHqP>tlRL~Is@r3eZLj4jTSaJrOd@okk?bXVN>+rdB%UT8!Ib84 zbr?o)E{6FV;tKX}6&==J_pCcG<5)-kufc-xy-S~%{=NFabGm=h@K2il$->ww_B_KKPjSZzr|&E4ns`vhgH?>D zG+`r-n)s-WkFI({&%E27dbb(g9j13jVPqAvg^{J!+g*2N?p-qQE)(z4E6=(Y;VzJF HZ}NWt!evd- literal 0 HcmV?d00001 diff --git a/reports/final_report.md b/scripts/build_features.py similarity index 100% rename from reports/final_report.md rename to scripts/build_features.py diff --git a/src/__init__.py b/scripts/evaluate.py similarity index 100% rename from src/__init__.py rename to scripts/evaluate.py diff --git a/scripts/run_eda.py b/scripts/run_eda.py new file mode 100644 index 0000000..2c9a9c7 --- /dev/null +++ b/scripts/run_eda.py @@ -0,0 +1,60 @@ +# scripts/run_eda.py +import os +from src.utils.data_loader import load_csv, save_csv +from src.eda.eda_tools import ( + data_structure, descriptive_stats, overall_loss_ratio, loss_ratio_by_group, + plot_loss_ratio_by_province, plot_totalclaims_distribution, + plot_claims_premium_time_series, scatter_premium_vs_claims, outlier_summary +) +import argparse +import json + +def main(input_path, output_dir): + os.makedirs(output_dir, exist_ok=True) + figures_dir = os.path.join(output_dir, "figures") + os.makedirs(figures_dir, exist_ok=True) + summaries_dir = os.path.join(output_dir, "summaries") + os.makedirs(summaries_dir, exist_ok=True) + + print("Loading data:", input_path) + df = load_csv(input_path, parse_dates=["TransactionMonth", "VehicleIntroDate"]) + + # Basic data structure + structure = data_structure(df) + structure.to_csv(os.path.join(summaries_dir, "data_structure.csv")) + + # Descriptive stats for key numeric columns + numeric_cols = ["TotalPremium", "TotalClaims", "CustomValueEstimate"] + present = [c for c in numeric_cols if c in df.columns] + stats = descriptive_stats(df, present) + stats.to_csv(os.path.join(summaries_dir, "descriptive_stats.csv")) + + # Compute Loss Ratios + overall_lr = overall_loss_ratio(df) + lr_by_province = loss_ratio_by_group(df, "Province").reset_index() + lr_by_province.to_csv(os.path.join(summaries_dir, "loss_ratio_by_province.csv")) + with open(os.path.join(summaries_dir, "loss_ratio_overall.json"), "w") as f: + json.dump({"overall_loss_ratio": overall_lr}, f, default=str) + + # Outliers summary + outlier_tc = outlier_summary(df, "TotalClaims") if "TotalClaims" in df.columns else {} + with open(os.path.join(summaries_dir, "outlier_totalclaims.json"), "w") as f: + json.dump(outlier_tc, f, default=str) + + # Create required 3 beautiful plots + p1 = plot_loss_ratio_by_province(df, figures_dir) + p2 = plot_totalclaims_distribution(df, figures_dir) + p3 = plot_claims_premium_time_series(df, figures_dir) + p4 = scatter_premium_vs_claims(df, figures_dir) + + print("Saved figures:", p1, p2, p3, p4) + print("Summaries saved to", summaries_dir) + print("Overall Loss Ratio:", overall_lr) + print("EDA complete") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run EDA for ACIS dataset") + parser.add_argument("--input", default="data/raw/data.csv") + parser.add_argument("--output", default="reports") + args = parser.parse_args() + main(args.input, args.output) diff --git a/src/eda/__init__.py b/scripts/train.py similarity index 100% rename from src/eda/__init__.py rename to scripts/train.py diff --git a/src/eda/__pycache__/eda_tools.cpython-311.pyc b/src/eda/__pycache__/eda_tools.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ec777fc03bee01070183138c174caff44e02bd57 GIT binary patch literal 9675 zcmb_CTWlLwb~EJVo1z|+WJ#86`Ypz$&g6Rsy$XQ!kb++^o~6g+BH}1^Th*S5Tn9j4*(RK@AvPEKvQaNI!~v_1rt; zXl6(!G0s_MKyh)~=L$%){!>ykkBXb0qz#z*vA~9E{+b3f30oojl@Q33kc-g@a+ffvY+kcEWxUzXhIjH+<3sPkQR!t`FQNWvse@Ol44Vlsz?gM2oz{#Sq% zb!aY8ngb*Uj%z5(N%!J#??yq;a>Cy?hXFigB-67gqg?OeQHF%+6_kAuYmAVz3Mj!Sn!Grw>4pd5PQ)t}ddIB%|F5+O4A9(9nP100N^q z_zBI81u2KqmfaFs=x*IW(Cc?QW$YQkU`D-Nt=qGc12-j{_k-uVn!X`N>agqu))=LhtpBD^%06~qXa<|FA$ zdN7?!CL{4oE-i@%FTy3ed7R0nxhT?H6jKWUR*Yc9D3<0owcWXN;(ktu`ZPzD*L>$W ziMxoolmM&Hw`gt-*fJ||OJ4vb#54yZxs>KfR~K;L7TA`C66uMINbV&P!A)c& z#KW2~n4OEmJqvgz3=-WUP5>w}e+o1`8m_d4pLDOa?pSTz@%Ouw*1c-$-bzpJZ~H&* z|LcKg1C`FMr<0#eewuiac$)ew^=alwrqb5_bl|gr@`c3#rENfM8`$ue2VL5$^<7~(3YWuO` z^@^vx#IAa}Wl#6Z=GI3u3r&^4meQaS*e9cXI=}fQaVD1qnM+!jj_4hFU;hCOa$jYr z>q~*P-Vo37j!B2Qo_QM@1UaTEaDv+o+*!3XpTj=%^}RX$FPgJDUeCqY|z=y{FL>30r#M@&qnwSbtx zQf!7x<^-{N|M)GKt~n=h)_U3D%}b7r&&B9+_ypkYhCfjN0JrTAJ|1{Du=o?jzg_ij zFK=6Jk|U>8|Cu%axmEu;#eYHdUnrid_?rk?Ib6Q0?l`g>Upc7$=(4io^1^`PzoPoD z=omu}hf4dGI~Il%|1s5n3b%(aCE!<-ga9xb`7 zPKb?_-Mq)b%H#~{ifzY^|9>h>!qm`&gEP!nDF;FE(eraOHu}8(0mBEDg4W%`tR{UH z2t{aAbH_6RjtfNf2^eRH_XSEYH!cd2=D21eNOQ6Crsw-HQ3LCti6?NZ_+U#$=!$~eR&U2^p(G=m`9GyT9p@IAfpc}&U*?@z;zYe1y8|Ia8J{>K+B_xcODc~V~cJTaUMN$gP6NUCv`IAGu6g_ z)f)eMChFE4_uqoQBbqaDADsJS<|6^BTo!l@f`nKmCnXaQ1p==TX4Jg*b6i?VBn4e* zc&RgnZ@?pbo&e%c;SUM8O~8){_z3`!96GpUwCF)Wb9L<)#1JVC&*3|O@*{%#D*(Wg zUjM?8N1n%_hatt=rFy%H?^c>ZkH;R4mD)>EDX28=THVUT3~FsxwAf_DKg>4gV_yyUggWy3Y1SKepka2uXMA@BcAJev0+ zZ@2O-tkzj23aZV#Rhfj6bp>3FfR*Zbr)7sEA3;mO7Vp@oHga8dY6?z}qTs0JeII9L z66v_`rDL6tzD`d3OIGth8b?gz1$+ehV7S>?jh$N`gpTIIiOG8s2?;SoE5r@d9QU~V zKH>#Os2d#VH$_5`p@=#DiRL>^h~smS8UgtptCyRt5_T{PPIL>+z)+u8b7qs0=GN0p z5UZy}%`?v7Y%(Khu0)y_W;OS$l!#A@ntLwEjSET5B_$+CKm{d2cr3}y0q0`~1{j(f z($G{Uk=8sSHzUBRkP84pgecOrz+HlNyoM_xXc2MOrF|=Tt0@E{-g%8~{1Kpzz@PYU z07%X#v@oNfP8D^^sPm=2uQXl;r+ZNK4;Ih8M4ks%7H4F%O+njKv<(_$=<4FGC()8v zKB;aWlF=ar9a7ODXi)T$qx2EPWD44+A`+a@LAm+3f=;ODgp5v*fy;7euNgx{?VB_} zhT_=c;fKQ|w;UN(0;kl#sp5J4y+}bFD(a9?2VLdks}HUguT}!h)tp0PrBQi9rLpPX z*Qbkskqt2hfg$Hk3n&_=$-gy;eGS7ddeuM<6wp_O5`xu@Hmq)_8QxPT+-5w3BHsI* zc;BT|%s_S($IJUWYIazLyk?HOC>Xt`5v<#=+_4$BDmK=G(T`2GPaOV z$`K6`$xP^4LeHK=`eF9q1WrH<4K886mu5orvXdm1?l;Alc>#u8^6+5zY>3h!RBNi%U*8>hz`$g@eaZ0ZNQ zWJre0fEF>NzHG>~K@2oNPJK2xbyne>VEhxgGuy+e@1{I zr!>~i@1iN2BKNPZ|MI%rW)a2LO5U5<+x3=x3==oR0yaSyf{I@eBC$8|)Xgkc zs7nLc7}WXNgqpKH2{(@#`|-SU$@b+3%&%dTKyVDR3NfQ=)3`&=z1gFlJnYzF#7%;> zDSiz+c{85T{vJF-92QMXLzPbR==uO4kU$}=1tCGq zXHxV7kO1F`=HLk)Cx9xW{iGEIFnNTQKb`?gE0Dnn0Gf+SX76$M7tqdkm?MWs)q#lS z+Z(tQ(frVjI44O0e#fHBA~oe!^DIOz1)r=V=~Lgb(R?2`1!PVw{TG`Ns|i7>%)(Te zL2eD=17}i5gJVNyZ)!H(*9cUXq3%YJ?7}@q-3|OPbntxwet^Sa)_R|kIe$Tx^6Br4 z@e_@eU-xVGfd3NaC*V(P1IDaVd)~mW&n-GWx%BAL;)ja2SM~NnGNPsJsqZsi>D?D+ z7kx_0klHd-ysWE3i@QRG-djwRLu%*H@`;tZ>XEDRjSppXTS2!~bQ>CUlHF0U_vL0w zbn@Ve+`LCYdsVboMtjK$5Wk%N0}DX^Dd>WVF39KtJbSHbJ);Ch)WFCObZWp_z5_=D z)dH#ds6VK=v%tfgsHZPo>0Am@nFrvx)|%p|SD8$V!+#Q>X3eeFlqn#L6u9vWPV0=} zhEL4!5eO4a4Y8^qVIsS_i6p9xel*rllDM>>;eq=nk{M2-pYmw5OLLkU;uCR+rjlq> zNu5h-)94L!?FdcI3S7Ewj>+cogDLNl`7N$#u7dPZ>Z^_JIg*mJ;hoP~7Sa#3ijSU7WJMenh zmPP!eu)#p%g<~m58+OpnvO7vUpB*5xcf8$ZA}qfI;L^suK& zj%R=ljTc8~wfr`%?4Ui!wv^942U?)9vWr%B&`WT}pC@&kbN0t%=0>j{00TfjMrf?W bY2OZdnh7F+^3LZ6XoKvhHrS{|(}e#Ao-y#C literal 0 HcmV?d00001 diff --git a/src/eda/eda_tools.py b/src/eda/eda_tools.py index e69de29..5af9bfe 100644 --- a/src/eda/eda_tools.py +++ b/src/eda/eda_tools.py @@ -0,0 +1,120 @@ +# src/eda/eda_tools.py +import os +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +import seaborn as sns +from typing import Tuple + +sns.set(style="whitegrid", rc={"figure.dpi": 150}) + +def ensure_dir(path: str): + os.makedirs(path, exist_ok=True) + +# ---------- Summaries ---------- +def data_structure(df: pd.DataFrame) -> pd.DataFrame: + """Return dtypes and non-null counts.""" + info = pd.DataFrame({ + "dtype": df.dtypes.astype(str), + "non_null_count": df.count(), + "null_count": df.isna().sum(), + "unique": df.nunique(dropna=False) + }) + return info + +def descriptive_stats(df: pd.DataFrame, cols: list) -> pd.DataFrame: + return df[cols].describe().T + +# ---------- Business Metric: Loss Ratio ---------- +def overall_loss_ratio(df: pd.DataFrame) -> float: + total_claims = df["TotalClaims"].sum(skipna=True) + total_premium = df["TotalPremium"].sum(skipna=True) + if total_premium == 0: + return np.nan + return total_claims / total_premium + +def loss_ratio_by_group(df: pd.DataFrame, group_col: str) -> pd.DataFrame: + grp = df.groupby(group_col)[["TotalPremium","TotalClaims"]].sum() + grp = grp.assign(LossRatio = grp["TotalClaims"] / grp["TotalPremium"]) + grp = grp.sort_values("LossRatio", ascending=False) + return grp + +# ---------- Time series ---------- +def monthly_claims_premiums(df: pd.DataFrame, date_col: str = "TransactionMonth") -> pd.DataFrame: + df = df.copy() + df[date_col] = pd.to_datetime(df[date_col], errors="coerce") + df = df.dropna(subset=[date_col]) + monthly = df.groupby(pd.Grouper(key=date_col, freq="MS"))[["TotalClaims","TotalPremium"]].sum() + monthly["ClaimFrequency"] = df.groupby(pd.Grouper(key=date_col, freq="MS"))["TotalClaims"].apply(lambda s: (s>0).sum()) + # severity: average claim amount per claim (avoid div by zero) + monthly["ClaimSeverity"] = monthly.apply(lambda r: r["TotalClaims"] / max(r["ClaimFrequency"], 1), axis=1) + return monthly + +# ---------- Outlier detection ---------- +def outlier_summary(df: pd.DataFrame, col: str) -> dict: + s = df[col].dropna() + q1, q3 = s.quantile([0.25, 0.75]) + iqr = q3 - q1 + lower = q1 - 1.5 * iqr + upper = q3 + 1.5 * iqr + return {"q1": q1, "q3": q3, "iqr": iqr, "lower": lower, "upper": upper, + "n_outliers": ((s < lower) | (s > upper)).sum()} + +# ---------- Plots (3 required polished plots) ---------- +def plot_loss_ratio_by_province(df: pd.DataFrame, outdir: str): + ensure_dir(outdir) + grp = loss_ratio_by_group(df, "Province") + plt.figure(figsize=(10,6)) + sns.barplot(x=grp.index, y=grp["LossRatio"]) + plt.xticks(rotation=45, ha="right") + plt.ylabel("Loss Ratio (TotalClaims / TotalPremium)") + plt.title("Loss Ratio by Province") + plt.tight_layout() + path = os.path.join(outdir, "loss_ratio_by_province.png") + plt.savefig(path) + plt.close() + return path + +def plot_totalclaims_distribution(df: pd.DataFrame, outdir: str): + ensure_dir(outdir) + plt.figure(figsize=(8,5)) + # log scale helps when heavy skew/outliers + sns.histplot(df["TotalClaims"].dropna(), bins=100, kde=True) + plt.xscale('symlog') # symmetric log to keep zeros visible + plt.xlabel("TotalClaims (symlog scale)") + plt.title("Distribution of TotalClaims (log-friendly)") + plt.tight_layout() + path = os.path.join(outdir, "totalclaims_distribution.png") + plt.savefig(path) + plt.close() + return path + +def plot_claims_premium_time_series(df: pd.DataFrame, outdir: str, date_col="TransactionMonth"): + ensure_dir(outdir) + monthly = monthly_claims_premiums(df, date_col=date_col) + plt.figure(figsize=(10,6)) + ax = monthly[["TotalClaims","TotalPremium"]].plot(title="Monthly TotalClaims vs TotalPremium") + ax.set_ylabel("Amount (local currency)") + plt.tight_layout() + path = os.path.join(outdir, "monthly_claims_premium.png") + plt.savefig(path) + plt.close() + return path + +# ---------- Bivariate exploration ---------- +def scatter_premium_vs_claims(df: pd.DataFrame, outdir: str, sample=10000): + ensure_dir(outdir) + n = min(len(df), sample) + sample_df = df.sample(n=n, random_state=42) + plt.figure(figsize=(8,6)) + sns.scatterplot(x=sample_df["TotalPremium"], y=sample_df["TotalClaims"], alpha=0.6) + plt.xscale("symlog") + plt.yscale("symlog") + plt.xlabel("TotalPremium (symlog)") + plt.ylabel("TotalClaims (symlog)") + plt.title(f"Scatter: TotalPremium vs TotalClaims (sample n={n})") + plt.tight_layout() + path = os.path.join(outdir, "scatter_premium_vs_claims.png") + plt.savefig(path) + plt.close() + return path diff --git a/src/features/build_features.py b/src/features/build_features.py index e69de29..1ff6c15 100644 --- a/src/features/build_features.py +++ b/src/features/build_features.py @@ -0,0 +1,4 @@ +def build_features(df): + df["VehicleAge"] = 2025 - df["RegistrationYear"] + df["ClaimOccurred"] = (df["TotalClaims"] > 0).astype(int) + return df diff --git a/src/models/__init__.py b/src/models/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/models/train_model.py b/src/models/train_model.py index e69de29..2713167 100644 --- a/src/models/train_model.py +++ b/src/models/train_model.py @@ -0,0 +1,19 @@ +from sklearn.model_selection import train_test_split +from sklearn.ensemble import RandomForestRegressor +from sklearn.metrics import mean_squared_error,r2_score + +def train_model(df, target): + X = df.drop(columns=[target]) + y = df[target] + + X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2) + + model = RandomForestRegressor() + model.fit(X_train,y_train) + + preds = model.predict(X_test) + + rmse = mean_squared_error(y_test,preds,squared=False) + r2 = r2_score(y_test,preds) + + return model,rmse,r2 diff --git a/src/utils/__init__.py b/src/utils/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/utils/__pycache__/data_loader.cpython-311.pyc b/src/utils/__pycache__/data_loader.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4132a5125a3333c630b78817cabe304734633a16 GIT binary patch literal 1170 zcmZ`&y-yTD6rb7M`{qNS5W(nBSeybzu_6Q_5E6{R0yTu3;^t;fmhA0bX7*6b5-tWK z1;v?=U}13uQH&Ip7XAZvM?!I##KgqP?h;~RsC+Ye7m)Ze``(*3``(-1Z{NOaZA~JO zFV}~hNduwpYSWO|U+3@R>2MyWvXCv!4{|;Fd?Iy z1x4;<;j&GMfM>#USO&F?0^B@7OHDt7>+czFA@4&ochv|n9|-XzdT@Ew49(D3YMw14 ze(^*l)j>;Wgyu1@=AL!3syZKmH#e_@1^|mZ3z7unwr^81F?*jlUf>g(Oxl4x#qB&B z)W-uvfgL6}$7R}WMw{S=iR*}f_&JjG-D2J&9%Ksw)iguI7DSVzGpB&br0YLf$g{l9 zS7bEnGoED-jWW)CF4C4X3RK29v*|)sEX&xRszRm$U$Y%Jd6u@g1z5P^SQ<1ZaaO2+ zLUeR>VmNb8FfKBSg$(oPK;RF6$q1g!6a&W<8Mu$V-~w{S2Ma4Qu3XiOcR~$SQ)EDu z(9af>Y}xEB->tA}qJKZpe~{=ejn`tGb!0}yY8~h6$ViN7wQ;l7ex^LWb!%g7W9@5q z&zrUi-|gEeew?dzkM4Jm9(0eEjqLcC`>cq?_40>j6fU}K4Q~`Th zLVS8^VC)fmS93;D9C19#o+?&>Gys-1bT3VsP{TwB?*Jp?!d_+?`dQd2_~HVn5~^9z zm($OuH|Xn2rRjZ(99X2Q)N3)e)%jkT#XM hUAps7yPx>yP&@ys9*q0Som=qrz^Tk?^IxT}`5T(X7hwPZ literal 0 HcmV?d00001 diff --git a/src/utils/data_loader.py b/src/utils/data_loader.py index e69de29..19cfeba 100644 --- a/src/utils/data_loader.py +++ b/src/utils/data_loader.py @@ -0,0 +1,19 @@ +# src/utils/data_loader.py +import pandas as pd +from typing import Optional + +def load_csv(path: str, parse_dates: Optional[list] = None) -> pd.DataFrame: + """ + Load CSV into a DataFrame. + - path: file path + - parse_dates: list of column names to parse as dates + """ + df = pd.read_csv(path, low_memory=False) + if parse_dates: + for c in parse_dates: + if c in df.columns: + df[c] = pd.to_datetime(df[c], errors="coerce") + return df + +def save_csv(df: pd.DataFrame, path: str): + df.to_csv(path, index=False) diff --git a/src/features/__init__.py b/tests/features.py similarity index 100% rename from src/features/__init__.py rename to tests/features.py diff --git a/tests/test_data_loader.py b/tests/test_data_loader.py new file mode 100644 index 0000000..b9e4af7 --- /dev/null +++ b/tests/test_data_loader.py @@ -0,0 +1,4 @@ +from src.utils.data_loader import load_data + +def test_loader(): + assert load_data