Skip to content

Commit 39bb93d

Browse files
authored
Fix SOE with lognormal distribution (#44)
1 parent: 9a70edf · commit: 39bb93d

File tree

6 files changed

+152
-52
lines changed

6 files changed

+152
-52
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,3 +35,6 @@ sdist/*
3535
docs/html
3636
docs/jupyter_execute
3737
app.html
38+
39+
# Virtual environment
40+
venv/

src/simdec/decomposition.py

Lines changed: 28 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ def decomposition(
6868
output: pd.DataFrame,
6969
*,
7070
sensitivity_indices: np.ndarray,
71-
dec_limit: float = 1,
71+
dec_limit: float | None = None,
7272
auto_ordering: bool = True,
7373
states: list[int] | None = None,
7474
statistic: Literal["mean", "median"] | None = "mean",
@@ -79,7 +79,7 @@ def decomposition(
7979
----------
8080
inputs : DataFrame of shape (n_runs, n_factors)
8181
Input variables.
82-
output : DataFrame of shape (n_runs, 1)
82+
output : DataFrame of shape (n_runs, 1) or (n_runs,)
8383
Target variable.
8484
sensitivity_indices : ndarray of shape (n_factors, 1)
8585
Sensitivity indices, combined effect of each input.
@@ -116,7 +116,7 @@ def decomposition(
116116
inputs[cat_col] = codes
117117

118118
inputs = inputs.to_numpy()
119-
output = output.to_numpy()
119+
output = output.to_numpy().flatten()
120120

121121
# 1. variables for decomposition
122122
var_order = np.argsort(sensitivity_indices)[::-1]
@@ -125,26 +125,41 @@ def decomposition(
125125
sensitivity_indices = sensitivity_indices[var_order]
126126

127127
if auto_ordering:
128-
n_var_dec = np.where(np.cumsum(sensitivity_indices) < dec_limit)[0].size
128+
# handle edge case where sensitivity indices don't sum exactly to 1.0
129+
if dec_limit is None:
130+
dec_limit = 0.8 * np.sum(sensitivity_indices)
131+
132+
cumulative_sum = np.cumsum(sensitivity_indices)
133+
indices_over_limit = np.where(cumulative_sum >= dec_limit)[0]
134+
135+
if indices_over_limit.size > 0:
136+
n_var_dec = indices_over_limit[0] + 1
137+
else:
138+
n_var_dec = sensitivity_indices.size
139+
129140
n_var_dec = max(1, n_var_dec) # keep at least one variable
130141
n_var_dec = min(5, n_var_dec) # use at most 5 variables
131142
else:
132143
n_var_dec = inputs.shape[1]
133144

134-
# 2. states formation
145+
# 2. variable selection and reordering
146+
if auto_ordering:
147+
var_names = var_names[var_order[:n_var_dec]].tolist()
148+
inputs = inputs[:, var_order[:n_var_dec]]
149+
else:
150+
var_names = var_names[:n_var_dec].tolist()
151+
inputs = inputs[:, :n_var_dec]
152+
153+
# 3. states formation (after reordering/selection)
135154
if states is None:
136-
states = 3 if n_var_dec < 3 else 2
155+
states = 3 if n_var_dec <= 2 else 2
137156
states = [states] * n_var_dec
138157

139158
for i in range(n_var_dec):
140159
n_unique = np.unique(inputs[:, i]).size
141160
states[i] = n_unique if n_unique <= 5 else states[i]
142161

143-
if auto_ordering:
144-
var_names = var_names[var_order[:n_var_dec]].tolist()
145-
inputs = inputs[:, var_order[:n_var_dec]]
146-
147-
# 3. decomposition
162+
# 4. decomposition
148163
bins = []
149164

150165
statistic_methods = {
@@ -153,8 +168,8 @@ def decomposition(
153168
}
154169
try:
155170
statistic_method = statistic_methods[statistic]
156-
except IndexError:
157-
msg = f"'statistic' must be one of {statistic_methods.values()}"
171+
except KeyError:
172+
msg = f"'statistic' must be one of {statistic_methods.keys()}"
158173
raise ValueError(msg)
159174

160175
def statistic_(inputs):

src/simdec/sensitivity_indices.py

Lines changed: 52 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
from dataclasses import dataclass
2-
import warnings
32

43
import numpy as np
54
import pandas as pd
@@ -61,7 +60,7 @@ def sensitivity_indices(
6160
Sensitivity indices, combined effect of each input.
6261
foe : ndarray of shape (n_factors, 1)
6362
First-order effects (also called 'main' or 'individual').
64-
soe : ndarray of shape (n_factors, 1)
63+
soe : ndarray of shape (n_factors, n_factors)
6564
Second-order effects (also called 'interaction').
6665
6766
Examples
@@ -96,15 +95,21 @@ def sensitivity_indices(
9695
array([0.43157591, 0.44241433, 0.11767249])
9796
9897
"""
98+
# Handle inputs conversion
9999
if isinstance(inputs, pd.DataFrame):
100100
cat_columns = inputs.select_dtypes(["category", "O"]).columns
101101
inputs[cat_columns] = inputs[cat_columns].apply(
102102
lambda x: x.astype("category").cat.codes
103103
)
104104
inputs = inputs.to_numpy()
105-
if isinstance(output, pd.DataFrame):
105+
106+
# Handle output conversion first, then flatten
107+
if isinstance(output, (pd.DataFrame, pd.Series)):
106108
output = output.to_numpy()
107109

110+
# Flatten output if it's (N, 1)
111+
output = output.flatten()
112+
108113
n_runs, n_factors = inputs.shape
109114
n_bins_foe, n_bins_soe = number_of_bins(n_runs, n_factors)
110115

@@ -116,55 +121,64 @@ def sensitivity_indices(
116121
soe = np.zeros((n_factors, n_factors))
117122

118123
for i in range(n_factors):
119-
# first order
124+
# 1. First-order effects (FOE)
120125
xi = inputs[:, i]
121126

122127
bin_avg, _, binnumber = stats.binned_statistic(
123-
x=xi, values=output, bins=n_bins_foe
128+
x=xi, values=output, bins=n_bins_foe, statistic="mean"
124129
)
125-
# can have NaN in the average but no corresponding binnumber
126-
bin_avg = bin_avg[~np.isnan(bin_avg)]
127-
bin_counts = np.unique(binnumber, return_counts=True)[1]
128130

129-
# weighted variance and divide by the overall variance of the output
130-
foe[i] = _weighted_var(bin_avg, weights=bin_counts) / var_y
131+
# Filter empty bins and get weights (counts)
132+
mask_foe = ~np.isnan(bin_avg)
133+
mean_i_foe = bin_avg[mask_foe]
134+
# binnumber starts at 1; 0 is for values outside range
135+
bin_counts_foe = np.unique(binnumber[binnumber > 0], return_counts=True)[1]
136+
137+
foe[i] = _weighted_var(mean_i_foe, weights=bin_counts_foe) / var_y
131138

132-
# second order
139+
# 2. Second-order effects (SOE)
133140
for j in range(n_factors):
134-
if i == j or j < i:
141+
if j <= i:
135142
continue
136143

137144
xj = inputs[:, j]
138145

139-
bin_avg, *edges, binnumber = stats.binned_statistic_2d(
146+
# 2D Binned Statistic for Var(E[Y|Xi, Xj])
147+
bin_avg_ij, x_edges, y_edges, binnumber_ij = stats.binned_statistic_2d(
140148
x=xi, y=xj, values=output, bins=n_bins_soe, expand_binnumbers=False
141149
)
142150

143-
mean_ij = bin_avg[~np.isnan(bin_avg)]
144-
bin_counts = np.unique(binnumber, return_counts=True)[1]
145-
var_ij = _weighted_var(mean_ij, weights=bin_counts)
146-
147-
# expand_binnumbers here
148-
nbin = np.array([len(edges_) + 1 for edges_ in edges])
149-
binnumbers = np.asarray(np.unravel_index(binnumber, nbin))
150-
151-
bin_counts_i = np.unique(binnumbers[0], return_counts=True)[1]
152-
bin_counts_j = np.unique(binnumbers[1], return_counts=True)[1]
151+
mask_ij = ~np.isnan(bin_avg_ij)
152+
mean_ij = bin_avg_ij[mask_ij]
153+
counts_ij = np.unique(binnumber_ij[binnumber_ij > 0], return_counts=True)[1]
154+
var_ij = _weighted_var(mean_ij, weights=counts_ij)
153155

154-
# handle NaNs
155-
with warnings.catch_warnings():
156-
warnings.simplefilter("ignore", RuntimeWarning)
157-
mean_i = np.nanmean(bin_avg, axis=1)
158-
mean_i = mean_i[~np.isnan(mean_i)]
159-
mean_j = np.nanmean(bin_avg, axis=0)
160-
mean_j = mean_j[~np.isnan(mean_j)]
161-
162-
var_i = _weighted_var(mean_i, weights=bin_counts_i)
163-
var_j = _weighted_var(mean_j, weights=bin_counts_j)
164-
165-
soe[i, j] = (var_ij - var_i - var_j) / var_y
166-
167-
soe = np.where(soe == 0, soe.T, soe)
168-
si[i] = foe[i] + soe[:, i].sum() / 2
156+
# Marginal Var(E[Y|Xi]) using n_bins_soe to match MATLAB logic
157+
bin_avg_i_soe, _, binnumber_i_soe = stats.binned_statistic(
158+
x=xi, values=output, bins=n_bins_soe, statistic="mean"
159+
)
160+
mask_i = ~np.isnan(bin_avg_i_soe)
161+
counts_i = np.unique(
162+
binnumber_i_soe[binnumber_i_soe > 0], return_counts=True
163+
)[1]
164+
var_i_soe = _weighted_var(bin_avg_i_soe[mask_i], weights=counts_i)
165+
166+
# Marginal Var(E[Y|Xj]) using n_bins_soe to match MATLAB logic
167+
bin_avg_j_soe, _, binnumber_j_soe = stats.binned_statistic(
168+
x=xj, values=output, bins=n_bins_soe, statistic="mean"
169+
)
170+
mask_j = ~np.isnan(bin_avg_j_soe)
171+
counts_j = np.unique(
172+
binnumber_j_soe[binnumber_j_soe > 0], return_counts=True
173+
)[1]
174+
var_j_soe = _weighted_var(bin_avg_j_soe[mask_j], weights=counts_j)
175+
176+
soe[i, j] = (var_ij - var_i_soe - var_j_soe) / var_y
177+
178+
# Mirror SOE and calculate Combined Effect (SI)
179+
# SI is FOE + half of all interactions associated with that variable
180+
soe = soe + soe.T
181+
for k in range(n_factors):
182+
si[k] = foe[k] + (soe[:, k].sum() / 2)
169183

170184
return SensitivityAnalysisResult(si, foe, soe)

tests/test_decomposition.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,9 @@ def test_decomposition():
1616
output_name, *v_names = list(data.columns)
1717
inputs, output = data[v_names], data[output_name]
1818
si = np.array([0.04, 0.50, 0.11, 0.28])
19-
res = sd.decomposition(inputs=inputs, output=output, sensitivity_indices=si)
19+
res = sd.decomposition(
20+
inputs=inputs, output=output, sensitivity_indices=si, dec_limit=1
21+
)
2022

2123
assert res.var_names == ["sigma_res", "R", "Rp0.2", "Kf"]
2224
assert res.states == [2, 2, 2, 2]

tests/test_decomposition_43.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
import numpy as np
2+
import pandas as pd
3+
from scipy.stats import qmc, uniform, lognorm
4+
import simdec as sd
5+
6+
7+
def test_decomposition_default():
8+
m = 13
9+
sampler = qmc.Sobol(d=2, scramble=True, seed=42)
10+
sample = sampler.random_base2(m=m)
11+
12+
# deposit_0: uniform(500, 1500)
13+
deposit_0 = uniform.ppf(sample[:, 0], loc=500, scale=1000)
14+
15+
# interest_rate: lognormal
16+
sigma = 0.5
17+
mu = np.log(0.01) + sigma**2
18+
interest_rate = lognorm.ppf(sample[:, 1], s=sigma, scale=np.exp(mu))
19+
20+
deposit_20 = deposit_0 * (1 + interest_rate) ** 20
21+
22+
inputs = pd.DataFrame({"deposit_0": deposit_0, "interest_rate": interest_rate})
23+
output = pd.Series(deposit_20, name="deposit_20")
24+
indices = sd.sensitivity_indices(inputs=inputs, output=output)
25+
si = indices.si
26+
res = sd.decomposition(inputs=inputs, output=output, sensitivity_indices=si)
27+
28+
assert len(res.var_names) == 2
29+
assert res.var_names == ["deposit_0", "interest_rate"]
[filename missing in extraction — presumably a new test file for sensitivity indices, given the `test_sensitivity_indices_43` function below; verify against the repository]

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
import numpy as np
2+
import numpy.testing as npt
3+
import pandas as pd
4+
from scipy.stats import qmc, uniform, lognorm
5+
import simdec as sd
6+
7+
# Testing fix for issue #43
8+
9+
10+
def test_sensitivity_indices_43():
11+
m = 13
12+
sampler = qmc.Sobol(d=2, scramble=True, seed=42)
13+
sample = sampler.random_base2(m=m)
14+
15+
# deposit_0: uniform(500, 1500)
16+
deposit_0 = uniform.ppf(sample[:, 0], loc=500, scale=1000)
17+
18+
# interest_rate: lognormal
19+
sigma = 0.5
20+
mu = np.log(0.01) + sigma**2
21+
interest_rate = lognorm.ppf(sample[:, 1], s=sigma, scale=np.exp(mu))
22+
23+
deposit_20 = deposit_0 * (1 + interest_rate) ** 20
24+
25+
inputs = pd.DataFrame({"deposit_0": deposit_0, "interest_rate": interest_rate})
26+
output = pd.Series(deposit_20, name="deposit_20")
27+
28+
res = sd.sensitivity_indices(inputs, output)
29+
30+
# MATLAB Results
31+
expected_si = np.array([0.7101, 0.2739])
32+
expected_foe = np.array([0.7028, 0.2666])
33+
expected_soe_12 = 0.0146
34+
35+
npt.assert_allclose(res.si, expected_si, atol=3e-2)
36+
npt.assert_allclose(res.first_order, expected_foe, atol=3e-2)
37+
npt.assert_allclose(res.second_order[0, 1], expected_soe_12, atol=1e-2)

0 commit comments

Comments (0)