Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,6 @@ sdist/*
docs/html
docs/jupyter_execute
app.html

# Virtual environment
venv/
41 changes: 28 additions & 13 deletions src/simdec/decomposition.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def decomposition(
output: pd.DataFrame,
*,
sensitivity_indices: np.ndarray,
dec_limit: float = 1,
dec_limit: float | None = None,
auto_ordering: bool = True,
states: list[int] | None = None,
statistic: Literal["mean", "median"] | None = "mean",
Expand All @@ -79,7 +79,7 @@ def decomposition(
----------
inputs : DataFrame of shape (n_runs, n_factors)
Input variables.
output : DataFrame of shape (n_runs, 1)
output : DataFrame of shape (n_runs, 1) or (n_runs,)
Target variable.
sensitivity_indices : ndarray of shape (n_factors, 1)
Sensitivity indices, combined effect of each input.
Expand Down Expand Up @@ -116,7 +116,7 @@ def decomposition(
inputs[cat_col] = codes

inputs = inputs.to_numpy()
output = output.to_numpy()
output = output.to_numpy().flatten()

# 1. variables for decomposition
var_order = np.argsort(sensitivity_indices)[::-1]
Expand All @@ -125,26 +125,41 @@ def decomposition(
sensitivity_indices = sensitivity_indices[var_order]

if auto_ordering:
n_var_dec = np.where(np.cumsum(sensitivity_indices) < dec_limit)[0].size
# handle edge case where sensitivity indices don't sum exactly to 1.0
if dec_limit is None:
dec_limit = 0.8 * np.sum(sensitivity_indices)

cumulative_sum = np.cumsum(sensitivity_indices)
indices_over_limit = np.where(cumulative_sum >= dec_limit)[0]

if indices_over_limit.size > 0:
n_var_dec = indices_over_limit[0] + 1
else:
n_var_dec = sensitivity_indices.size

n_var_dec = max(1, n_var_dec) # keep at least one variable
n_var_dec = min(5, n_var_dec) # use at most 5 variables
else:
n_var_dec = inputs.shape[1]

# 2. states formation
# 2. variable selection and reordering
if auto_ordering:
var_names = var_names[var_order[:n_var_dec]].tolist()
inputs = inputs[:, var_order[:n_var_dec]]
else:
var_names = var_names[:n_var_dec].tolist()
inputs = inputs[:, :n_var_dec]

# 3. states formation (after reordering/selection)
if states is None:
states = 3 if n_var_dec < 3 else 2
states = 3 if n_var_dec <= 2 else 2
states = [states] * n_var_dec

for i in range(n_var_dec):
n_unique = np.unique(inputs[:, i]).size
states[i] = n_unique if n_unique <= 5 else states[i]

if auto_ordering:
var_names = var_names[var_order[:n_var_dec]].tolist()
inputs = inputs[:, var_order[:n_var_dec]]

# 3. decomposition
# 4. decomposition
bins = []

statistic_methods = {
Expand All @@ -153,8 +168,8 @@ def decomposition(
}
try:
statistic_method = statistic_methods[statistic]
except IndexError:
msg = f"'statistic' must be one of {statistic_methods.values()}"
except KeyError:
msg = f"'statistic' must be one of {statistic_methods.keys()}"
raise ValueError(msg)

def statistic_(inputs):
Expand Down
90 changes: 52 additions & 38 deletions src/simdec/sensitivity_indices.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from dataclasses import dataclass
import warnings

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -61,7 +60,7 @@ def sensitivity_indices(
Sensitivity indices, combined effect of each input.
foe : ndarray of shape (n_factors, 1)
First-order effects (also called 'main' or 'individual').
soe : ndarray of shape (n_factors, 1)
soe : ndarray of shape (n_factors, n_factors)
Second-order effects (also called 'interaction').

Examples
Expand Down Expand Up @@ -96,15 +95,21 @@ def sensitivity_indices(
array([0.43157591, 0.44241433, 0.11767249])

"""
# Handle inputs conversion
if isinstance(inputs, pd.DataFrame):
cat_columns = inputs.select_dtypes(["category", "O"]).columns
inputs[cat_columns] = inputs[cat_columns].apply(
lambda x: x.astype("category").cat.codes
)
inputs = inputs.to_numpy()
if isinstance(output, pd.DataFrame):

# Handle output conversion first, then flatten
if isinstance(output, (pd.DataFrame, pd.Series)):
output = output.to_numpy()

# Flatten output if it's (N, 1)
output = output.flatten()

n_runs, n_factors = inputs.shape
n_bins_foe, n_bins_soe = number_of_bins(n_runs, n_factors)

Expand All @@ -116,55 +121,64 @@ def sensitivity_indices(
soe = np.zeros((n_factors, n_factors))

for i in range(n_factors):
# first order
# 1. First-order effects (FOE)
xi = inputs[:, i]

bin_avg, _, binnumber = stats.binned_statistic(
x=xi, values=output, bins=n_bins_foe
x=xi, values=output, bins=n_bins_foe, statistic="mean"
)
# can have NaN in the average but no corresponding binnumber
bin_avg = bin_avg[~np.isnan(bin_avg)]
bin_counts = np.unique(binnumber, return_counts=True)[1]

# weighted variance and divide by the overall variance of the output
foe[i] = _weighted_var(bin_avg, weights=bin_counts) / var_y
# Filter empty bins and get weights (counts)
mask_foe = ~np.isnan(bin_avg)
mean_i_foe = bin_avg[mask_foe]
# binnumber starts at 1; 0 is for values outside range
bin_counts_foe = np.unique(binnumber[binnumber > 0], return_counts=True)[1]

foe[i] = _weighted_var(mean_i_foe, weights=bin_counts_foe) / var_y

# second order
# 2. Second-order effects (SOE)
for j in range(n_factors):
if i == j or j < i:
if j <= i:
continue

xj = inputs[:, j]

bin_avg, *edges, binnumber = stats.binned_statistic_2d(
# 2D Binned Statistic for Var(E[Y|Xi, Xj])
bin_avg_ij, x_edges, y_edges, binnumber_ij = stats.binned_statistic_2d(
x=xi, y=xj, values=output, bins=n_bins_soe, expand_binnumbers=False
)

mean_ij = bin_avg[~np.isnan(bin_avg)]
bin_counts = np.unique(binnumber, return_counts=True)[1]
var_ij = _weighted_var(mean_ij, weights=bin_counts)

# expand_binnumbers here
nbin = np.array([len(edges_) + 1 for edges_ in edges])
binnumbers = np.asarray(np.unravel_index(binnumber, nbin))

bin_counts_i = np.unique(binnumbers[0], return_counts=True)[1]
bin_counts_j = np.unique(binnumbers[1], return_counts=True)[1]
mask_ij = ~np.isnan(bin_avg_ij)
mean_ij = bin_avg_ij[mask_ij]
counts_ij = np.unique(binnumber_ij[binnumber_ij > 0], return_counts=True)[1]
var_ij = _weighted_var(mean_ij, weights=counts_ij)

# handle NaNs
with warnings.catch_warnings():
warnings.simplefilter("ignore", RuntimeWarning)
mean_i = np.nanmean(bin_avg, axis=1)
mean_i = mean_i[~np.isnan(mean_i)]
mean_j = np.nanmean(bin_avg, axis=0)
mean_j = mean_j[~np.isnan(mean_j)]

var_i = _weighted_var(mean_i, weights=bin_counts_i)
var_j = _weighted_var(mean_j, weights=bin_counts_j)

soe[i, j] = (var_ij - var_i - var_j) / var_y

soe = np.where(soe == 0, soe.T, soe)
si[i] = foe[i] + soe[:, i].sum() / 2
# Marginal Var(E[Y|Xi]) using n_bins_soe to match MATLAB logic
bin_avg_i_soe, _, binnumber_i_soe = stats.binned_statistic(
x=xi, values=output, bins=n_bins_soe, statistic="mean"
)
mask_i = ~np.isnan(bin_avg_i_soe)
counts_i = np.unique(
binnumber_i_soe[binnumber_i_soe > 0], return_counts=True
)[1]
var_i_soe = _weighted_var(bin_avg_i_soe[mask_i], weights=counts_i)

# Marginal Var(E[Y|Xj]) using n_bins_soe to match MATLAB logic
bin_avg_j_soe, _, binnumber_j_soe = stats.binned_statistic(
x=xj, values=output, bins=n_bins_soe, statistic="mean"
)
mask_j = ~np.isnan(bin_avg_j_soe)
counts_j = np.unique(
binnumber_j_soe[binnumber_j_soe > 0], return_counts=True
)[1]
var_j_soe = _weighted_var(bin_avg_j_soe[mask_j], weights=counts_j)

soe[i, j] = (var_ij - var_i_soe - var_j_soe) / var_y

# Mirror SOE and calculate Combined Effect (SI)
# SI is FOE + half of all interactions associated with that variable
soe = soe + soe.T
for k in range(n_factors):
si[k] = foe[k] + (soe[:, k].sum() / 2)

return SensitivityAnalysisResult(si, foe, soe)
4 changes: 3 additions & 1 deletion tests/test_decomposition.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@ def test_decomposition():
output_name, *v_names = list(data.columns)
inputs, output = data[v_names], data[output_name]
si = np.array([0.04, 0.50, 0.11, 0.28])
res = sd.decomposition(inputs=inputs, output=output, sensitivity_indices=si)
res = sd.decomposition(
inputs=inputs, output=output, sensitivity_indices=si, dec_limit=1
)

assert res.var_names == ["sigma_res", "R", "Rp0.2", "Kf"]
assert res.states == [2, 2, 2, 2]
Expand Down
29 changes: 29 additions & 0 deletions tests/test_decomposition_43.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import numpy as np
import pandas as pd
from scipy.stats import qmc, uniform, lognorm
import simdec as sd


def test_decomposition_default():
    """Decomposition with the default ``dec_limit`` on a 2-input deposit model."""
    engine = qmc.Sobol(d=2, scramble=True, seed=42)
    points = engine.random_base2(m=13)

    # Initial deposit drawn from Uniform(500, 1500).
    initial_deposit = uniform.ppf(points[:, 0], loc=500, scale=1000)

    # Interest rate drawn from a lognormal distribution.
    shape = 0.5
    rate = lognorm.ppf(
        points[:, 1], s=shape, scale=np.exp(np.log(0.01) + shape**2)
    )

    # Deposit value after 20 years of compounding.
    final_deposit = initial_deposit * (1 + rate) ** 20

    inputs = pd.DataFrame(
        {"deposit_0": initial_deposit, "interest_rate": rate}
    )
    output = pd.Series(final_deposit, name="deposit_20")
    indices = sd.sensitivity_indices(inputs=inputs, output=output)
    res = sd.decomposition(
        inputs=inputs, output=output, sensitivity_indices=indices.si
    )

    assert len(res.var_names) == 2
    assert res.var_names == ["deposit_0", "interest_rate"]
37 changes: 37 additions & 0 deletions tests/test_sensitivity_indices_43.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import numpy as np
import numpy.testing as npt
import pandas as pd
from scipy.stats import qmc, uniform, lognorm
import simdec as sd

# Testing fix for issue #43


def test_sensitivity_indices_43():
    """Regression test for issue #43: indices match a reference MATLAB run."""
    engine = qmc.Sobol(d=2, scramble=True, seed=42)
    points = engine.random_base2(m=13)

    # Initial deposit drawn from Uniform(500, 1500).
    deposit_start = uniform.ppf(points[:, 0], loc=500, scale=1000)

    # Interest rate drawn from a lognormal distribution.
    shape = 0.5
    rate = lognorm.ppf(
        points[:, 1], s=shape, scale=np.exp(np.log(0.01) + shape**2)
    )

    # Deposit value after 20 years of compounding.
    grown = deposit_start * (1 + rate) ** 20

    inputs = pd.DataFrame({"deposit_0": deposit_start, "interest_rate": rate})
    output = pd.Series(grown, name="deposit_20")

    res = sd.sensitivity_indices(inputs, output)

    # Reference values produced by the MATLAB implementation.
    npt.assert_allclose(res.si, np.array([0.7101, 0.2739]), atol=3e-2)
    npt.assert_allclose(res.first_order, np.array([0.7028, 0.2666]), atol=3e-2)
    npt.assert_allclose(res.second_order[0, 1], 0.0146, atol=1e-2)
Loading