apalladi · valots12 · Nov 23, 2023 · Nov 23, 2023 · Nov 23, 2023 · Nov 23, 2023
diff --git a/README.md b/README.md
@@ -43,6 +43,7 @@ This function takes as arguments:
 - `patience`, number of epochs without any improvement of the features selection, before stopping the process (the idea is similar to the early stopping of Tensorflow/Keras)
 - `splitting_type`, it can be equal to `simple` (for simple train/test split) or `kfold` (for 5-fold splitting). If you choose `kfold`, the feature importance will be computed as the average feature importance for each train/test subset.
 - `noise_type`, it can be equal to `gaussian` for gaussian noise or `random` for flat random noise
+- `importance_type`, it can be equal to `model` to use the model's own coefficients or feature importances, or `shap` to compute the importance from Shapley (SHAP) values
 - `filename_output`, a string to indicate where to save the file. You can also choose `None` if you do not want to save it
 - `random_state`, set the random seed that it is used by the k-fold splitting
 

diff --git a/requirements.txt b/requirements.txt
@@ -2,4 +2,5 @@ numpy==1.20.3
 pandas==1.3.4
 scipy==1.7.1
 scikit-learn==0.24.2
-jupyterlab==3.2.1
+jupyterlab==3.2.1
+shap==0.43.0
diff --git a/src/ml.py b/src/ml.py
@@ -4,6 +4,7 @@
 from typing import Tuple, List, Optional
 import numpy as np
 import pandas as pd
+import shap
 from sklearn.preprocessing import StandardScaler, MinMaxScaler
 from sklearn.model_selection import KFold, train_test_split
 from sklearn.base import BaseEstimator
@@ -61,28 +62,57 @@ def train_evaluate_model(
 
 
 def get_feature_importances(
-    trained_model: BaseEstimator, column_names: List[str]
+    trained_model: BaseEstimator, 
+    x_train: pd.DataFrame,
+    column_names: List[str],
+    scaler_type: BaseEstimator,
+    importance_type: str
 ) -> pd.DataFrame:
     """It computes the features importance, given a trained model.
 
     Parameters:
         - trained_model: a scikit-learn ML trained model
+        - x_train: the training features; used only when importance_type is "shap", where they are scaled before computing SHAP values
         - column_names: the name of the columns associated to the features
+        - scaler_type: "StandardScaler" or "MinMaxScaler", the scaler applied to x_train when importance_type is "shap"
+        - importance_type: "model" to read coef_ / feature_importances_ from the model, or "shap" to use the mean absolute SHAP values
 
     Return:
         - a DataFrame containing the feature importance (not sorted) as column and
         the name of the features as index
     """
-
+
+    # SHAP importance
+    if importance_type == "shap":
+
+        if scaler_type == "StandardScaler":
+            scaler = StandardScaler()
+        elif scaler_type == "MinMaxScaler":
+            scaler = MinMaxScaler()
+        x_train = scaler.fit_transform(x_train)
+
+        explainer = shap.Explainer(trained_model, x_train)
+        shap_values = explainer.shap_values(x_train)
+
+        shap_sum = np.abs(shap_values).mean(axis=0)
+        df_coef = pd.DataFrame([shap_sum.tolist()]).T
+        df_coef.columns = ['shap_importance']
+        df_coef.index = column_names
+
     # inspect coefficients
-    if hasattr(trained_model, "coef_"):
-        model_coefficients = trained_model.coef_
-    elif hasattr(trained_model, "feature_importances_"):
-        model_coefficients = trained_model.feature_importances_
-    else:
-        raise ValueError("Could not retrieve the feature importance")
+    elif importance_type == "model":
+        if hasattr(trained_model, "coef_"):
+            model_coefficients = trained_model.coef_
+        elif hasattr(trained_model, "feature_importances_"):
+            model_coefficients = trained_model.feature_importances_
+        else:
+            raise ValueError("Could not retrieve the feature importance")
 
-    df_coef = pd.DataFrame(model_coefficients, index=column_names)
+        df_coef = pd.DataFrame(model_coefficients, index=column_names)
+
+    else:
+        raise ValueError("Allowed values for importance_type are model and shap")
+
 
     return df_coef
 
@@ -183,6 +213,7 @@ def train_with_kfold_splitting(
     labels: pd.DataFrame,
     model: BaseEstimator,
     scaler_type: BaseEstimator,
+    importance_type: str,
     verbose: bool,
     random_state: int,
 ) -> pd.DataFrame:
@@ -194,6 +225,7 @@ def train_with_kfold_splitting(
         - labels: the vector with labels, commonly called y
         - model: an untrained scikit-learn model
         - scaler_type: choose between StandardScaler or MinMaxScaler
+        - importance_type: the method for computing feature importance, either "model" or "shap"
         - verbose: True or False to tune the level of verbosity
         - random_state: select the random state of the train/test splitting process
 
@@ -216,12 +248,13 @@ def train_with_kfold_splitting(
             scaler_type,
             verbose,
         )
+
         if i == 0:
-            df_coefs = get_feature_importances(trained_model, x_trains[i].columns)
+            df_coefs = get_feature_importances(trained_model, x_trains[i], x_trains[i].columns, scaler_type, importance_type)
             df_coefs.columns = ["cycle_" + str(i + 1)]
         else:
             df_coefs["cycle_" + str(i + 1)] = get_feature_importances(
-                trained_model, x_trains[i].columns
+                trained_model, x_trains[i], x_trains[i].columns, scaler_type, importance_type
             )
 
     df_coef = compute_mean_coefficients(df_coefs)
@@ -233,6 +266,7 @@ def train_with_simple_splitting(
     labels: pd.DataFrame,
     model: BaseEstimator,
     scaler_type: BaseEstimator,
+    importance_type: str,
     verbose: bool,
     random_state: int,
 ) -> pd.DataFrame:
@@ -244,6 +278,7 @@ def train_with_simple_splitting(
         - labels: the vector with labels, commonly called y
         - model: an untrained scikit-learn model
         - scaler_type: choose between StandardScaler or MinMaxScaler
+        - importance_type: the method for computing feature importance, either "model" or "shap"
         - verbose: True or False to tune the level of verbosity
         - random_state: select the random state of the train/test splitting process
 
@@ -259,7 +294,7 @@ def train_with_simple_splitting(
     trained_model = train_evaluate_model(
         x_train, y_train, x_test, y_test, model, scaler_type, verbose
     )
-    df_coefs = get_feature_importances(trained_model, x_train.columns)
+    df_coefs = get_feature_importances(trained_model, x_train, x_train.columns, scaler_type, importance_type)
 
     df_coef = compute_mean_coefficients(df_coefs)
 
@@ -274,6 +309,7 @@ def scan_features_pipeline(
     verbose: bool,
     random_state: int,
     noise_type: str,
+    importance_type: str
 ) -> pd.DataFrame:
     """This pipeline performs various operations:
     - train and evaluate the model
@@ -289,6 +325,7 @@ def scan_features_pipeline(
         - verbose: True or False to tune the level of verbosity
         - random_state: select the random state of the train/test splitting process
         - noise_type: choose between "gaussian" noise or "random" (flat) noise
+        - importance_type: the method for computing feature importance, either "model" or "shap"
 
     Return:
         - the simplified dataset, containing only the most relevant features
@@ -308,11 +345,11 @@ def scan_features_pipeline(
 
     if splitting_type == "kfold":
         df_coef = train_with_kfold_splitting(
-            x_new, labels, model, scaler_type, verbose, random_state
+            x_new, labels, model, scaler_type, importance_type, verbose, random_state
         )
     elif splitting_type == "simple":
         df_coef = train_with_simple_splitting(
-            x_new, labels, model, scaler_type, verbose, random_state
+            x_new, labels, model, scaler_type, importance_type, verbose, random_state
         )
     else:
         raise ValueError("Choice not recognized. Possible choices are kfold or simple")
@@ -330,6 +367,7 @@ def get_relevant_features(
     epochs: int,
     patience: int,
     noise_type: str = "gaussian",
+    importance_type: str = "model",
     verbose: bool = True,
     filename_output: Optional[str] = None,
     random_state: int = 42,
@@ -346,6 +384,7 @@ def get_relevant_features(
         - patience: the number of cycles of non-improvement to wait before stopping
         the execution of the code
         - noise_type: choose between "gaussian" noise or "random" (flat) noise
+        - importance_type: the method for computing feature importance, either "model" (default) or "shap"
         - verbose: True or False, to tune the level of verbosity
         - filename_output:  name of the simplified dataset if you want to export it, default is None
         - random_state: select the random seed
@@ -372,6 +411,7 @@ def get_relevant_features(
             verbose,
             random_states[epoch],
             noise_type,
+            importance_type
         )
         n_features_after = x_new.shape[1]
 
@@ -390,4 +430,4 @@ def get_relevant_features(
     if filename_output is not None:
         x_new.to_csv(filename_output, index=False)
 
-    return x_new
+    return x_new