Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ This function takes as arguments:
- `patience`, the number of epochs without any improvement in the feature selection before stopping the process (the idea is similar to early stopping in TensorFlow/Keras)
- `splitting_type`, it can be equal to `simple` (for a simple train/test split) or `kfold` (for 5-fold splitting). If you choose `kfold`, the feature importance will be computed as the average of the feature importances across the train/test subsets.
- `noise_type`, it can be equal to `gaussian` for gaussian noise or `random` for flat random noise
- `importance_type`, it can be equal to `model` for using model coefficients or `shap` for extracting importance using Shapley values
- `filename_output`, a string to indicate where to save the file. You can also choose `None` if you do not want to save it
- `random_state`, sets the random seed used by the k-fold splitting

Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@ numpy==1.20.3
pandas==1.3.4
scipy==1.7.1
scikit-learn==0.24.2
jupyterlab==3.2.1
jupyterlab==3.2.1
shap==0.43.0
70 changes: 55 additions & 15 deletions src/ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from typing import Tuple, List, Optional
import numpy as np
import pandas as pd
import shap
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import KFold, train_test_split
from sklearn.base import BaseEstimator
Expand Down Expand Up @@ -61,28 +62,57 @@ def train_evaluate_model(


def get_feature_importances(
    trained_model: BaseEstimator,
    x_train: pd.DataFrame,
    column_names: List[str],
    scaler_type: BaseEstimator,
    importance_type: str
) -> pd.DataFrame:
    """It computes the features importance, given a trained model.

    Parameters:
    - trained_model: a scikit-learn ML trained model
    - x_train: training features; only read when importance_type is "shap"
    - column_names: the name of the columns associated to the features
    - scaler_type: choose between "StandardScaler" or "MinMaxScaler"
      (compared as a string); only read when importance_type is "shap"
    - importance_type: the method for selecting feature importance, either
      "model" (the model's `coef_` / `feature_importances_` attribute) or
      "shap" (mean absolute SHAP value of each feature)

    Return:
    - a DataFrame containing the feature importance (not sorted) as column and
      the name of the features as index

    Raises:
    - ValueError: if importance_type is not "model" or "shap", if scaler_type
      is not recognized while using "shap", or if the model exposes neither
      `coef_` nor `feature_importances_` while using "model"
    """
    if importance_type == "shap":
        # Validate the scaler up front: without the final `else`, an
        # unsupported scaler_type would leave `scaler` unbound and fail
        # later with an unrelated-looking error.
        if scaler_type == "StandardScaler":
            scaler = StandardScaler()
        elif scaler_type == "MinMaxScaler":
            scaler = MinMaxScaler()
        else:
            raise ValueError(
                "Allowed values for scaler_type are StandardScaler and MinMaxScaler"
            )

        # NOTE(review): presumably mirrors the scaling applied when the model
        # was trained -- confirm against train_evaluate_model.
        x_scaled = scaler.fit_transform(x_train)

        explainer = shap.Explainer(trained_model, x_scaled)
        shap_values = explainer.shap_values(x_scaled)

        # One importance per feature: average of |SHAP| over the samples
        mean_abs_shap = np.abs(shap_values).mean(axis=0)
        df_coef = pd.DataFrame(
            mean_abs_shap, index=column_names, columns=["shap_importance"]
        )

    elif importance_type == "model":
        # Linear models expose `coef_`, tree-based ones `feature_importances_`
        if hasattr(trained_model, "coef_"):
            model_coefficients = trained_model.coef_
        elif hasattr(trained_model, "feature_importances_"):
            model_coefficients = trained_model.feature_importances_
        else:
            raise ValueError("Could not retrieve the feature importance")

        df_coef = pd.DataFrame(model_coefficients, index=column_names)

    else:
        raise ValueError("Allowed values for importance_type are model and shap")

    return df_coef

Expand Down Expand Up @@ -183,6 +213,7 @@ def train_with_kfold_splitting(
labels: pd.DataFrame,
model: BaseEstimator,
scaler_type: BaseEstimator,
importance_type: str,
verbose: bool,
random_state: int,
) -> pd.DataFrame:
Expand All @@ -194,6 +225,7 @@ def train_with_kfold_splitting(
- labels: the vector with labels, commonly called y
- model: an untrained scikit-learn model
- scaler_type: choose between StandardScaler or MinMaxScaler
- importance_type: the method for selecting feature importance
- verbose: True or False to tune the level of verbosity
- random_state: select the random state of the train/test splitting process

Expand All @@ -216,12 +248,13 @@ def train_with_kfold_splitting(
scaler_type,
verbose,
)

if i == 0:
df_coefs = get_feature_importances(trained_model, x_trains[i].columns)
df_coefs = get_feature_importances(trained_model, x_trains[i], x_trains[i].columns, scaler_type, importance_type)
df_coefs.columns = ["cycle_" + str(i + 1)]
else:
df_coefs["cycle_" + str(i + 1)] = get_feature_importances(
trained_model, x_trains[i].columns
trained_model, x_trains[i], x_trains[i].columns, scaler_type, importance_type
)

df_coef = compute_mean_coefficients(df_coefs)
Expand All @@ -233,6 +266,7 @@ def train_with_simple_splitting(
labels: pd.DataFrame,
model: BaseEstimator,
scaler_type: BaseEstimator,
importance_type: str,
verbose: bool,
random_state: int,
) -> pd.DataFrame:
Expand All @@ -244,6 +278,7 @@ def train_with_simple_splitting(
- labels: the vector with labels, commonly called y
- model: an untrained scikit-learn model
- scaler_type: choose between StandardScaler or MinMaxScaler
- importance_type: the method for selecting feature importance
- verbose: True or False to tune the level of verbosity
- random_state: select the random state of the train/test splitting process

Expand All @@ -259,7 +294,7 @@ def train_with_simple_splitting(
trained_model = train_evaluate_model(
x_train, y_train, x_test, y_test, model, scaler_type, verbose
)
df_coefs = get_feature_importances(trained_model, x_train.columns)
df_coefs = get_feature_importances(trained_model, x_train, x_train.columns, scaler_type, importance_type)

df_coef = compute_mean_coefficients(df_coefs)

Expand All @@ -274,6 +309,7 @@ def scan_features_pipeline(
verbose: bool,
random_state: int,
noise_type: str,
importance_type: str
) -> pd.DataFrame:
"""This pipeline performs various operations:
- train and evaluate the model
Expand All @@ -289,6 +325,7 @@ def scan_features_pipeline(
- verbose: True or False to tune the level of verbosity
- random_state: select the random state of the train/test splitting process
- noise_type: choose between "gaussian" noise or "random" (flat) noise
- importance_type: the method for selecting feature importance

Return:
- the simplified dataset, containing only the most relevant features
Expand All @@ -308,11 +345,11 @@ def scan_features_pipeline(

if splitting_type == "kfold":
df_coef = train_with_kfold_splitting(
x_new, labels, model, scaler_type, verbose, random_state
x_new, labels, model, scaler_type, importance_type, verbose, random_state
)
elif splitting_type == "simple":
df_coef = train_with_simple_splitting(
x_new, labels, model, scaler_type, verbose, random_state
x_new, labels, model, scaler_type, importance_type, verbose, random_state
)
else:
raise ValueError("Choice not recognized. Possible choices are kfold or simple")
Expand All @@ -330,6 +367,7 @@ def get_relevant_features(
epochs: int,
patience: int,
noise_type: str = "gaussian",
importance_type: str = "model",
verbose: bool = True,
filename_output: Optional[str] = None,
random_state: int = 42,
Expand All @@ -346,6 +384,7 @@ def get_relevant_features(
- patience: the number of cycles of non-improvement to wait before stopping
the execution of the code
- noise_type: choose between "gaussian" noise or "random" (flat) noise
- importance_type: the method for selecting feature importance
- verbose: True or False, to tune the level of verbosity
- filename_output: name of the simplified dataset if you want to export it, default is None
- random_state: select the random seed
Expand All @@ -372,6 +411,7 @@ def get_relevant_features(
verbose,
random_states[epoch],
noise_type,
importance_type
)
n_features_after = x_new.shape[1]

Expand All @@ -390,4 +430,4 @@ def get_relevant_features(
if filename_output is not None:
x_new.to_csv(filename_output, index=False)

return x_new
return x_new