Skip to content

Commit 39bb93d

Browse files
authored
Fix SOE with lognormal distribution (#44)
1 parent: 9a70edf · commit: 39bb93d

File tree

6 files changed

+152
-52
lines changed

6 files changed

+152
-52
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,3 +35,6 @@ sdist/*
3535
docs/html
3636
docs/jupyter_execute
3737
app.html
38+
39+
# Virtual environment
40+
venv/

src/simdec/decomposition.py

Lines changed: 28 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ def decomposition(
6868
output: pd.DataFrame,
6969
*,
7070
sensitivity_indices: np.ndarray,
71-
dec_limit: float = 1,
71+
dec_limit: float | None = None,
7272
auto_ordering: bool = True,
7373
states: list[int] | None = None,
7474
statistic: Literal["mean", "median"] | None = "mean",
@@ -79,7 +79,7 @@ def decomposition(
7979
----------
8080
inputs : DataFrame of shape (n_runs, n_factors)
8181
Input variables.
82-
output : DataFrame of shape (n_runs, 1)
82+
output : DataFrame of shape (n_runs, 1) or (n_runs,)
8383
Target variable.
8484
sensitivity_indices : ndarray of shape (n_factors, 1)
8585
Sensitivity indices, combined effect of each input.
@@ -116,7 +116,7 @@ def decomposition(
116116
inputs[cat_col] = codes
117117

118118
inputs = inputs.to_numpy()
119-
output = output.to_numpy()
119+
output = output.to_numpy().flatten()
120120

121121
# 1. variables for decomposition
122122
var_order = np.argsort(sensitivity_indices)[::-1]
@@ -125,26 +125,41 @@ def decomposition(
125125
sensitivity_indices = sensitivity_indices[var_order]
126126

127127
if auto_ordering:
128-
n_var_dec = np.where(np.cumsum(sensitivity_indices) < dec_limit)[0].size
128+
# handle edge case where sensitivity indices don't sum exactly to 1.0
129+
if dec_limit is None:
130+
dec_limit = 0.8 * np.sum(sensitivity_indices)
131+
132+
cumulative_sum = np.cumsum(sensitivity_indices)
133+
indices_over_limit = np.where(cumulative_sum >= dec_limit)[0]
134+
135+
if indices_over_limit.size > 0:
136+
n_var_dec = indices_over_limit[0] + 1
137+
else:
138+
n_var_dec = sensitivity_indices.size
139+
129140
n_var_dec = max(1, n_var_dec) # keep at least one variable
130141
n_var_dec = min(5, n_var_dec) # use at most 5 variables
131142
else:
132143
n_var_dec = inputs.shape[1]
133144

134-
# 2. states formation
145+
# 2. variable selection and reordering
146+
if auto_ordering:
147+
var_names = var_names[var_order[:n_var_dec]].tolist()
148+
inputs = inputs[:, var_order[:n_var_dec]]
149+
else:
150+
var_names = var_names[:n_var_dec].tolist()
151+
inputs = inputs[:, :n_var_dec]
152+
153+
# 3. states formation (after reordering/selection)
135154
if states is None:
136-
states = 3 if n_var_dec < 3 else 2
155+
states = 3 if n_var_dec <= 2 else 2
137156
states = [states] * n_var_dec
138157

139158
for i in range(n_var_dec):
140159
n_unique = np.unique(inputs[:, i]).size
141160
states[i] = n_unique if n_unique <= 5 else states[i]
142161

143-
if auto_ordering:
144-
var_names = var_names[var_order[:n_var_dec]].tolist()
145-
inputs = inputs[:, var_order[:n_var_dec]]
146-
147-
# 3. decomposition
162+
# 4. decomposition
148163
bins = []
149164

150165
statistic_methods = {
@@ -153,8 +168,8 @@ def decomposition(
153168
}
154169
try:
155170
statistic_method = statistic_methods[statistic]
156-
except IndexError:
157-
msg = f"'statistic' must be one of {statistic_methods.values()}"
171+
except KeyError:
172+
msg = f"'statistic' must be one of {statistic_methods.keys()}"
158173
raise ValueError(msg)
159174

160175
def statistic_(inputs):

src/simdec/sensitivity_indices.py

Lines changed: 52 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
from dataclasses import dataclass
2-
import warnings
32

43
import numpy as np
54
import pandas as pd
@@ -61,7 +60,7 @@ def sensitivity_indices(
6160
Sensitivity indices, combined effect of each input.
6261
foe : ndarray of shape (n_factors, 1)
6362
First-order effects (also called 'main' or 'individual').
64-
soe : ndarray of shape (n_factors, 1)
63+
soe : ndarray of shape (n_factors, n_factors)
6564
Second-order effects (also called 'interaction').
6665
6766
Examples
@@ -96,15 +95,21 @@ def sensitivity_indices(
9695
array([0.43157591, 0.44241433, 0.11767249])
9796
9897
"""
98+
# Handle inputs conversion
9999
if isinstance(inputs, pd.DataFrame):
100100
cat_columns = inputs.select_dtypes(["category", "O"]).columns
101101
inputs[cat_columns] = inputs[cat_columns].apply(
102102
lambda x: x.astype("category").cat.codes
103103
)
104104
inputs = inputs.to_numpy()
105-
if isinstance(output, pd.DataFrame):
105+
106+
# Handle output conversion first, then flatten
107+
if isinstance(output, (pd.DataFrame, pd.Series)):
106108
output = output.to_numpy()
107109

110+
# Flatten output if it's (N, 1)
111+
output = output.flatten()
112+
108113
n_runs, n_factors = inputs.shape
109114
n_bins_foe, n_bins_soe = number_of_bins(n_runs, n_factors)
110115

@@ -116,55 +121,64 @@ def sensitivity_indices(
116121
soe = np.zeros((n_factors, n_factors))
117122

118123
for i in range(n_factors):
119-
# first order
124+
# 1. First-order effects (FOE)
120125
xi = inputs[:, i]
121126

122127
bin_avg, _, binnumber = stats.binned_statistic(
123-
x=xi, values=output, bins=n_bins_foe
128+
x=xi, values=output, bins=n_bins_foe, statistic="mean"
124129
)
125-
# can have NaN in the average but no corresponding binnumber
126-
bin_avg = bin_avg[~np.isnan(bin_avg)]
127-
bin_counts = np.unique(binnumber, return_counts=True)[1]
128130

129-
# weighted variance and divide by the overall variance of the output
130-
foe[i] = _weighted_var(bin_avg, weights=bin_counts) / var_y
131+
# Filter empty bins and get weights (counts)
132+
mask_foe = ~np.isnan(bin_avg)
133+
mean_i_foe = bin_avg[mask_foe]
134+
# binnumber starts at 1; 0 is for values outside range
135+
bin_counts_foe = np.unique(binnumber[binnumber > 0], return_counts=True)[1]
136+
137+
foe[i] = _weighted_var(mean_i_foe, weights=bin_counts_foe) / var_y
131138

132-
# second order
139+
# 2. Second-order effects (SOE)
133140
for j in range(n_factors):
134-
if i == j or j < i:
141+
if j <= i:
135142
continue
136143

137144
xj = inputs[:, j]
138145

139-
bin_avg, *edges, binnumber = stats.binned_statistic_2d(
146+
# 2D Binned Statistic for Var(E[Y|Xi, Xj])
147+
bin_avg_ij, x_edges, y_edges, binnumber_ij = stats.binned_statistic_2d(
140148
x=xi, y=xj, values=output, bins=n_bins_soe, expand_binnumbers=False
141149
)
142150

143-
mean_ij = bin_avg[~np.isnan(bin_avg)]
144-
bin_counts = np.unique(binnumber, return_counts=True)[1]
145-
var_ij = _weighted_var(mean_ij, weights=bin_counts)
146-
147-
# expand_binnumbers here
148-
nbin = np.array([len(edges_) + 1 for edges_ in edges])
149-
binnumbers = np.asarray(np.unravel_index(binnumber, nbin))
150-
151-
bin_counts_i = np.unique(binnumbers[0], return_counts=True)[1]
152-
bin_counts_j = np.unique(binnumbers[1], return_counts=True)[1]
151+
mask_ij = ~np.isnan(bin_avg_ij)
152+
mean_ij = bin_avg_ij[mask_ij]
153+
counts_ij = np.unique(binnumber_ij[binnumber_ij > 0], return_counts=True)[1]
154+
var_ij = _weighted_var(mean_ij, weights=counts_ij)
153155

154-
# handle NaNs
155-
with warnings.catch_warnings():
156-
warnings.simplefilter("ignore", RuntimeWarning)
157-
mean_i = np.nanmean(bin_avg, axis=1)
158-
mean_i = mean_i[~np.isnan(mean_i)]
159-
mean_j = np.nanmean(bin_avg, axis=0)
160-
mean_j = mean_j[~np.isnan(mean_j)]
161-
162-
var_i = _weighted_var(mean_i, weights=bin_counts_i)
163-
var_j = _weighted_var(mean_j, weights=bin_counts_j)
164-
165-
soe[i, j] = (var_ij - var_i - var_j) / var_y
166-
167-
soe = np.where(soe == 0, soe.T, soe)
168-
si[i] = foe[i] + soe[:, i].sum() / 2
156+
# Marginal Var(E[Y|Xi]) using n_bins_soe to match MATLAB logic
157+
bin_avg_i_soe, _, binnumber_i_soe = stats.binned_statistic(
158+
x=xi, values=output, bins=n_bins_soe, statistic="mean"
159+
)
160+
mask_i = ~np.isnan(bin_avg_i_soe)
161+
counts_i = np.unique(
162+
binnumber_i_soe[binnumber_i_soe > 0], return_counts=True
163+
)[1]
164+
var_i_soe = _weighted_var(bin_avg_i_soe[mask_i], weights=counts_i)
165+
166+
# Marginal Var(E[Y|Xj]) using n_bins_soe to match MATLAB logic
167+
bin_avg_j_soe, _, binnumber_j_soe = stats.binned_statistic(
168+
x=xj, values=output, bins=n_bins_soe, statistic="mean"
169+
)
170+
mask_j = ~np.isnan(bin_avg_j_soe)
171+
counts_j = np.unique(
172+
binnumber_j_soe[binnumber_j_soe > 0], return_counts=True
173+
)[1]
174+
var_j_soe = _weighted_var(bin_avg_j_soe[mask_j], weights=counts_j)
175+
176+
soe[i, j] = (var_ij - var_i_soe - var_j_soe) / var_y
177+
178+
# Mirror SOE and calculate Combined Effect (SI)
179+
# SI is FOE + half of all interactions associated with that variable
180+
soe = soe + soe.T
181+
for k in range(n_factors):
182+
si[k] = foe[k] + (soe[:, k].sum() / 2)
169183

170184
return SensitivityAnalysisResult(si, foe, soe)

tests/test_decomposition.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,9 @@ def test_decomposition():
1616
output_name, *v_names = list(data.columns)
1717
inputs, output = data[v_names], data[output_name]
1818
si = np.array([0.04, 0.50, 0.11, 0.28])
19-
res = sd.decomposition(inputs=inputs, output=output, sensitivity_indices=si)
19+
res = sd.decomposition(
20+
inputs=inputs, output=output, sensitivity_indices=si, dec_limit=1
21+
)
2022

2123
assert res.var_names == ["sigma_res", "R", "Rp0.2", "Kf"]
2224
assert res.states == [2, 2, 2, 2]

tests/test_decomposition_43.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
import numpy as np
2+
import pandas as pd
3+
from scipy.stats import qmc, uniform, lognorm
4+
import simdec as sd
5+
6+
7+
def test_decomposition_default():
8+
m = 13
9+
sampler = qmc.Sobol(d=2, scramble=True, seed=42)
10+
sample = sampler.random_base2(m=m)
11+
12+
# deposit_0: uniform(500, 1500)
13+
deposit_0 = uniform.ppf(sample[:, 0], loc=500, scale=1000)
14+
15+
# interest_rate: lognormal
16+
sigma = 0.5
17+
mu = np.log(0.01) + sigma**2
18+
interest_rate = lognorm.ppf(sample[:, 1], s=sigma, scale=np.exp(mu))
19+
20+
deposit_20 = deposit_0 * (1 + interest_rate) ** 20
21+
22+
inputs = pd.DataFrame({"deposit_0": deposit_0, "interest_rate": interest_rate})
23+
output = pd.Series(deposit_20, name="deposit_20")
24+
indices = sd.sensitivity_indices(inputs=inputs, output=output)
25+
si = indices.si
26+
res = sd.decomposition(inputs=inputs, output=output, sensitivity_indices=si)
27+
28+
assert len(res.var_names) == 2
29+
assert res.var_names == ["deposit_0", "interest_rate"]
[filename missing in extraction — presumably a new test file for sensitivity indices, given the `test_sensitivity_indices_43` function below; verify against the repository]

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
import numpy as np
2+
import numpy.testing as npt
3+
import pandas as pd
4+
from scipy.stats import qmc, uniform, lognorm
5+
import simdec as sd
6+
7+
# Testing fix for issue #43
8+
9+
10+
def test_sensitivity_indices_43():
11+
m = 13
12+
sampler = qmc.Sobol(d=2, scramble=True, seed=42)
13+
sample = sampler.random_base2(m=m)
14+
15+
# deposit_0: uniform(500, 1500)
16+
deposit_0 = uniform.ppf(sample[:, 0], loc=500, scale=1000)
17+
18+
# interest_rate: lognormal
19+
sigma = 0.5
20+
mu = np.log(0.01) + sigma**2
21+
interest_rate = lognorm.ppf(sample[:, 1], s=sigma, scale=np.exp(mu))
22+
23+
deposit_20 = deposit_0 * (1 + interest_rate) ** 20
24+
25+
inputs = pd.DataFrame({"deposit_0": deposit_0, "interest_rate": interest_rate})
26+
output = pd.Series(deposit_20, name="deposit_20")
27+
28+
res = sd.sensitivity_indices(inputs, output)
29+
30+
# MATLAB Results
31+
expected_si = np.array([0.7101, 0.2739])
32+
expected_foe = np.array([0.7028, 0.2666])
33+
expected_soe_12 = 0.0146
34+
35+
npt.assert_allclose(res.si, expected_si, atol=3e-2)
36+
npt.assert_allclose(res.first_order, expected_foe, atol=3e-2)
37+
npt.assert_allclose(res.second_order[0, 1], expected_soe_12, atol=1e-2)

0 commit comments

Comments (0)