From 77131b221b3b7b2c886744a085e91a61ddcd43a5 Mon Sep 17 00:00:00 2001 From: igerber Date: Wed, 15 Apr 2026 20:49:31 -0400 Subject: [PATCH 01/16] Add survey support (pweight + strata/PSU/FPC) to dCDH estimator - Weighted cell aggregation in _validate_and_aggregate_to_cells() - Survey resolution via _resolve_survey_for_fit() with pweight-only and group-constant validation - IF expansion from group to observation level for TSL variance - Survey-aware SE at all call sites (overall, joiners, leavers, multi-horizon, placebos) via _compute_se() dispatcher - Bootstrap + survey warning (PSU-level deferred) - 12 new tests in test_survey_dcdh.py - Documentation updates: REGISTRY.md, ROADMAP.md, llms-full.txt, choosing_estimator.rst Co-Authored-By: Claude Opus 4.6 (1M context) --- ROADMAP.md | 4 +- diff_diff/chaisemartin_dhaultfoeuille.py | 263 ++++++++++++++--- diff_diff/guides/llms-full.txt | 6 +- diff_diff/survey.py | 41 +++ docs/choosing_estimator.rst | 12 +- docs/methodology/REGISTRY.md | 4 +- tests/test_chaisemartin_dhaultfoeuille.py | 12 +- tests/test_survey_dcdh.py | 342 ++++++++++++++++++++++ 8 files changed, 632 insertions(+), 52 deletions(-) create mode 100644 tests/test_survey_dcdh.py diff --git a/ROADMAP.md b/ROADMAP.md index c24b33a0..fa631f5e 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -181,7 +181,7 @@ The dynamic companion paper subsumes the AER 2020 paper: `DID_1 = DID_M`. The si These are referenced by the dCDH papers but live in *separate* efforts or *separate* companion papers we don't yet have: -- **Survey design integration** — deferred to a separate effort after all three phases ship. Phase 1 documents "no survey support" in the compatibility matrix; the separate effort revisits when Phase 3 is complete. +- **Survey design integration** — shipped. Supports pweight with strata/PSU/FPC via Taylor Series Linearization. Replicate weights and PSU-level bootstrap deferred. - **Fuzzy DiD** (within-cell-varying treatment, Web Appendix Section 1.7 of dynamic paper) → de Chaisemartin & D'Haultfœuille (2018), separate paper not yet reviewed - **Principled anticipation handling and trimming rules** (footnote 14 of dynamic paper) → de Chaisemartin (2021), separate paper not yet reviewed - **2SLS DiD** (referenced in AER appendix Section 3.4) → separate paper @@ -195,7 +195,7 @@ These remain in **Future Estimators** below if/when we choose to extend. - **Conservative CI** under Assumption 8 (independent groups), exact only under iid sampling. Documented in REGISTRY.md as a `**Note:**` deviation from "default nominal coverage." Theorem 1 of the dynamic paper. - **Cohort recentering for variance is essential.** Cohorts are defined by the triple `(D_{g,1}, F_g, S_g)`. The plug-in variance subtracts cohort-conditional means, **NOT a single grand mean**. Test fixtures must catch this — a wrong implementation silently produces a smaller, incorrect variance. - **No Rust acceleration is planned for any phase.** The estimator's hot path is groupby + BLAS-accelerated matrix-vector products, where NumPy already operates near-optimally. If profiling on large panels (`G > 100K`) reveals a bottleneck post-ship, the existing `_rust_bootstrap_weights` helper can be reused for the bootstrap loop without writing new Rust code. -- **No survey design integration in any phase.** Handled as a separate effort after all three phases ship. Phase 1 documents the absence in the compatibility matrix so survey users do not silently apply survey weights and get wrong answers. +- **Survey design integration shipped.** Supports pweight with strata/PSU/FPC via TSL. Replicate weights and PSU-level bootstrap deferred to a follow-up. --- diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py index 75d75d37..77d97277 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille.py +++ b/diff_diff/chaisemartin_dhaultfoeuille.py @@ -97,6 +97,7 @@ def _validate_and_aggregate_to_cells( group: str, time: str, treatment: str, + weights: Optional[np.ndarray] = None, ) -> pd.DataFrame: """ Validate input data and aggregate to ``(g, t)`` cells per the dCDH contract. @@ -130,9 +131,14 @@ def _validate_and_aggregate_to_cells( cell-level treatment before calling ``fit()`` or ``twowayfeweights()``. + When ``weights`` is provided (survey pweights), cell means use weighted + averages: ``y_gt = sum(w_i * y_i) / sum(w_i)``. An additional column + ``w_gt`` (total weight per cell) is included in the output for downstream + IF expansion. + Returns the aggregated cell DataFrame with columns - ``[group, time, y_gt, d_gt, n_gt]``, sorted by ``[group, time]`` - with a fresh index. + ``[group, time, y_gt, d_gt, n_gt]`` (plus ``w_gt`` when weighted), + sorted by ``[group, time]`` with a fresh index. Raises ------ @@ -203,13 +209,33 @@ def _validate_and_aggregate_to_cells( # No longer enforces {0, 1} - non-binary and continuous treatment supported. # 5. Cell aggregation (compute min/max for within-cell check) - cell = df.groupby([group, time], as_index=False).agg( - y_gt=(outcome, "mean"), - d_gt=(treatment, "mean"), - d_min=(treatment, "min"), - d_max=(treatment, "max"), - n_gt=(treatment, "count"), - ) + if weights is not None: + # Survey-weighted cell aggregation. + # y_gt = sum(w_i * y_i) / sum(w_i) within each (g, t) cell. + # Treatment is constant within cells (checked below), so weighted + # and unweighted means are identical for d_gt. + df["_w_"] = weights + df["_wy_"] = weights * df[outcome].values + g_obj = df.groupby([group, time], as_index=False) + cell = g_obj.agg( + _wy_sum=("_wy_", "sum"), + w_gt=("_w_", "sum"), + d_gt=(treatment, "mean"), + d_min=(treatment, "min"), + d_max=(treatment, "max"), + n_gt=(treatment, "count"), + ) + cell["y_gt"] = cell["_wy_sum"] / cell["w_gt"] + cell = cell.drop(columns=["_wy_sum"]) + df.drop(columns=["_w_", "_wy_"], inplace=True) + else: + cell = df.groupby([group, time], as_index=False).agg( + y_gt=(outcome, "mean"), + d_gt=(treatment, "mean"), + d_min=(treatment, "min"), + d_max=(treatment, "max"), + n_gt=(treatment, "count"), + ) # 6. Within-cell-varying treatment rejection. # All observations in a cell must have the same treatment value @@ -233,8 +259,9 @@ def _validate_and_aggregate_to_cells( f"(first 5):\n{example_cells}" ) # Drop the min/max columns; keep d_gt as float (no int cast - supports - # ordinal and continuous treatment). - cell = cell.drop(columns=["d_min", "d_max"]) + # ordinal and continuous treatment). w_gt retained when weighted. + drop_cols = ["d_min", "d_max"] + cell = cell.drop(columns=drop_cols) # Sort to ensure deterministic order in downstream operations cell = cell.sort_values([group, time]).reset_index(drop=True) @@ -559,10 +586,13 @@ def fit( (Design-2) groups. Convenience wrapper (descriptive summary, not full paper re-estimation). Requires ``drop_larger_lower=False`` to retain 2-switch groups. - survey_design : Any, optional - **Not supported in any phase.** Survey design integration is - handled as a separate effort after all three phases ship. - Passing a non-``None`` value raises ``NotImplementedError``. + survey_design : SurveyDesign, optional + Survey design specification for design-based inference. + Supports pweight with strata/PSU/FPC via Taylor Series + Linearization. Survey weights produce weighted cell means + for the point estimate; the IF-based variance accounts for + the survey design structure. Replicate weights are not yet + supported. Returns ------- @@ -598,17 +628,33 @@ def fit( ) # ------------------------------------------------------------------ - # Step 3: Survey gate (deferred separate effort) + # Step 3: Survey resolution # ------------------------------------------------------------------ - if survey_design is not None: - raise NotImplementedError( - "ChaisemartinDHaultfoeuille does not support survey_design. " - "Survey design integration for dCDH is deferred to a separate " - "effort after all three implementation phases ship (see " - "ROADMAP.md out-of-scope section). For now, fit without " - "survey_design. If your treatment is absorbing, use " - "CallawaySantAnna which supports survey_design." - ) + from diff_diff.survey import ( + _resolve_survey_for_fit, + _validate_group_constant_survey, + ) + + resolved_survey, survey_weights, survey_weight_type, survey_metadata = ( + _resolve_survey_for_fit(survey_design, data, "analytical") + ) + + if resolved_survey is not None: + if resolved_survey.weight_type != "pweight": + raise ValueError( + f"ChaisemartinDHaultfoeuille survey support requires " + f"weight_type='pweight', got '{resolved_survey.weight_type}'. " + f"The survey IF variance math assumes probability weights." + ) + if (resolved_survey.replicate_weights is not None + and resolved_survey.replicate_weights.shape[1] > 0): + raise NotImplementedError( + "Replicate weight variance for dCDH is not yet supported. " + "Use strata/PSU/FPC for design-based inference via Taylor " + "Series Linearization." + ) + # Validate survey columns are constant within groups. + _validate_group_constant_survey(data, group, survey_design) # Design-2 precondition: requires drop_larger_lower=False if design2 and self.drop_larger_lower: @@ -633,8 +679,19 @@ def fit( group=group, time=time, treatment=treatment, + weights=survey_weights, ) + # Retain observation-level survey info for IF expansion (Step 3 + # of survey integration: group-level IF → observation-level psi). + _obs_survey_info = None + if resolved_survey is not None: + _obs_survey_info = { + "group_ids": data[group].values, + "weights": survey_weights, + "resolved": resolved_survey, + } + # ------------------------------------------------------------------ # Step 4b: Covariate aggregation (DID^X, Web Appendix Section 1.2) # ------------------------------------------------------------------ @@ -1420,11 +1477,18 @@ def fit( cid_elig = cid_l[eligible_mask_var] U_centered_l = _cohort_recenter(U_l_elig, cid_elig) N_l_h = multi_horizon_dids[l_h]["N_l"] - se_l = _plugin_se(U_centered=U_centered_l, divisor=N_l_h) + _elig_groups_l = [all_groups[g] for g in range(len(all_groups)) if eligible_mask_var[g]] + se_l = _compute_se( + U_centered=U_centered_l, + divisor=N_l_h, + obs_survey_info=_obs_survey_info, + eligible_groups=_elig_groups_l, + ) multi_horizon_se[l_h] = se_l did_l_val = multi_horizon_dids[l_h]["did_l"] - t_l, p_l, ci_l = safe_inference(did_l_val, se_l, alpha=self.alpha, df=None) + _df_s = resolved_survey.df_survey if resolved_survey is not None else None + t_l, p_l, ci_l = safe_inference(did_l_val, se_l, alpha=self.alpha, df=_df_s) multi_horizon_inference[l_h] = { "effect": did_l_val, "se": se_l, @@ -1540,13 +1604,18 @@ def fit( U_pl_elig = U_pl[eligible_mask_pl] cid_elig_pl = cid_pl[eligible_mask_pl] U_centered_pl_l = _cohort_recenter(U_pl_elig, cid_elig_pl) - se_pl_l = _plugin_se( - U_centered=U_centered_pl_l, divisor=pl_data["N_pl_l"] + _elig_groups_pl = [all_groups[g] for g in range(len(all_groups)) if eligible_mask_pl[g]] + se_pl_l = _compute_se( + U_centered=U_centered_pl_l, + divisor=pl_data["N_pl_l"], + obs_survey_info=_obs_survey_info, + eligible_groups=_elig_groups_pl, ) placebo_horizon_se[lag_l] = se_pl_l pl_val = pl_data["placebo_l"] + _df_s = resolved_survey.df_survey if resolved_survey is not None else None t_pl_l, p_pl_l, ci_pl_l = safe_inference( - pl_val, se_pl_l, alpha=self.alpha, df=None + pl_val, se_pl_l, alpha=self.alpha, df=_df_s ) placebo_horizon_inference[lag_l] = { "effect": pl_val, @@ -1601,6 +1670,7 @@ def fit( n_groups_dropped_never_switching, U_centered_joiners, U_centered_leavers, + _eligible_group_ids, ) = _compute_cohort_recentered_inputs( D_mat=D_mat, # Phase 1 IF uses per-period structure: use raw outcomes @@ -1617,8 +1687,13 @@ def fit( singleton_baseline_groups=singleton_baseline_groups, ) - # Analytical SE for DID_M - overall_se = _plugin_se(U_centered=U_centered_overall, divisor=N_S) + # Analytical SE for DID_M (survey-aware when survey_design provided) + overall_se = _compute_se( + U_centered=U_centered_overall, + divisor=N_S, + obs_survey_info=_obs_survey_info, + eligible_groups=_eligible_group_ids, + ) # Detect the degenerate-cohort case: every variance-eligible group # forms its own (D_{g,1}, F_g, S_g) cohort, so the centered # influence function is identically zero and `_plugin_se` returns @@ -1643,15 +1718,23 @@ def fit( UserWarning, stacklevel=2, ) + _df_survey = ( + resolved_survey.df_survey if resolved_survey is not None else None + ) overall_t, overall_p, overall_ci = safe_inference( - overall_att, overall_se, alpha=self.alpha, df=None + overall_att, overall_se, alpha=self.alpha, df=_df_survey ) # Joiners SE (uses joiner-only centered IF; conservative bound) if joiners_available: - joiners_se = _plugin_se(U_centered=U_centered_joiners, divisor=joiner_total) + joiners_se = _compute_se( + U_centered=U_centered_joiners, + divisor=joiner_total, + obs_survey_info=_obs_survey_info, + eligible_groups=_eligible_group_ids, + ) joiners_t, joiners_p, joiners_ci = safe_inference( - joiners_att, joiners_se, alpha=self.alpha, df=None + joiners_att, joiners_se, alpha=self.alpha, df=_df_survey ) else: joiners_se, joiners_t, joiners_p, joiners_ci = ( @@ -1663,9 +1746,14 @@ def fit( # Leavers SE if leavers_available: - leavers_se = _plugin_se(U_centered=U_centered_leavers, divisor=leaver_total) + leavers_se = _compute_se( + U_centered=U_centered_leavers, + divisor=leaver_total, + obs_survey_info=_obs_survey_info, + eligible_groups=_eligible_group_ids, + ) leavers_t, leavers_p, leavers_ci = safe_inference( - leavers_att, leavers_se, alpha=self.alpha, df=None + leavers_att, leavers_se, alpha=self.alpha, df=_df_survey ) else: leavers_se, leavers_t, leavers_p, leavers_ci = ( @@ -1705,6 +1793,15 @@ def fit( # ------------------------------------------------------------------ bootstrap_results: Optional[DCDHBootstrapResults] = None if self.n_bootstrap > 0: + if resolved_survey is not None: + warnings.warn( + "Bootstrap with survey_design uses group-level multiplier " + "weights, not PSU-level. This is conservative when groups " + "are finer than PSUs. PSU-level survey bootstrap is " + "deferred to a future release.", + UserWarning, + stacklevel=2, + ) joiners_inputs = ( (U_centered_joiners, joiner_total, joiners_att) if joiners_available else None ) @@ -2412,6 +2509,7 @@ def fit( if design2 else None ), + survey_metadata=survey_metadata, _estimator_ref=self, ) @@ -4381,6 +4479,9 @@ def _compute_cohort_recentered_inputs( U_centered_joiners = _cohort_recenter(U_joiners, cohort_id_eligible) U_centered_leavers = _cohort_recenter(U_leavers, cohort_id_eligible) + # Eligible group IDs for survey IF expansion + eligible_group_ids = [all_groups[g] for g in range(n_groups) if eligible_mask[g]] + return ( U_centered_overall, U_centered_overall.size, @@ -4388,6 +4489,7 @@ def _compute_cohort_recentered_inputs( n_groups_dropped_never_switching, U_centered_joiners, U_centered_leavers, + eligible_group_ids, ) @@ -4432,6 +4534,93 @@ def _plugin_se(U_centered: np.ndarray, divisor: int) -> float: return float(np.sqrt(sigma_hat_sq) / np.sqrt(divisor)) +def _compute_se( + U_centered: np.ndarray, + divisor: int, + obs_survey_info: Optional[dict], + eligible_groups: Optional[list] = None, +) -> float: + """Dispatch to plug-in SE or survey-design-aware SE. + + When ``obs_survey_info`` is ``None``, falls back to the simple + plug-in formula. Otherwise, expands group-level IFs and delegates + to TSL variance. + """ + if obs_survey_info is None: + return _plugin_se(U_centered=U_centered, divisor=divisor) + if eligible_groups is None: + return _plugin_se(U_centered=U_centered, divisor=divisor) + return _survey_se_from_group_if( + U_centered=U_centered, + eligible_groups=eligible_groups, + obs_survey_info=obs_survey_info, + ) + + +def _survey_se_from_group_if( + U_centered: np.ndarray, + eligible_groups: list, + obs_survey_info: dict, +) -> float: + """Compute survey-design-aware SE from group-level centered IFs. + + Expands group-level influence function values to observation level + using the proportional-weight allocation ``psi_i = U[g] * (w_i / W_g)`` + so that ``sum(psi_i for i in g) = U[g]``, then delegates to + ``compute_survey_if_variance()`` for the TSL design-based variance. + + This is a library extension not in the dCDH papers (the paper's + plug-in variance assumes iid sampling). + + Parameters + ---------- + U_centered : np.ndarray + Cohort-recentered per-group IF values (one per eligible group). + eligible_groups : list + Group IDs corresponding to entries of ``U_centered`` + (variance-eligible groups after singleton-baseline exclusion). + obs_survey_info : dict + Observation-level survey data retained from ``fit()`` setup: + ``{"group_ids": ..., "weights": ..., "resolved": ...}``. + + Returns + ------- + float + Survey-design-aware SE, or NaN if degenerate. + """ + from diff_diff.survey import compute_survey_if_variance + + group_ids = obs_survey_info["group_ids"] + weights = obs_survey_info["weights"] + resolved = obs_survey_info["resolved"] + n_obs = len(group_ids) + + # Build group → U_centered lookup + group_to_u = {} + for idx, gid in enumerate(eligible_groups): + group_to_u[gid] = U_centered[idx] + + # Compute per-group weight totals W_g + group_to_w_total: Dict[Any, float] = {} + for i in range(n_obs): + gid = group_ids[i] + group_to_w_total[gid] = group_to_w_total.get(gid, 0.0) + weights[i] + + # Expand to observation level: psi_i = U[g] * (w_i / W_g) + psi = np.zeros(n_obs) + for i in range(n_obs): + gid = group_ids[i] + u_val = group_to_u.get(gid, 0.0) + w_total = group_to_w_total.get(gid, 1.0) + if w_total > 0: + psi[i] = u_val * (weights[i] / w_total) + + variance = compute_survey_if_variance(psi, resolved) + if not np.isfinite(variance) or variance < 0: + return float("nan") + return float(np.sqrt(variance)) + + def _build_group_time_design( cell: pd.DataFrame, group_col: str, diff --git a/diff_diff/guides/llms-full.txt b/diff_diff/guides/llms-full.txt index 1b794d66..012d5165 100644 --- a/diff_diff/guides/llms-full.txt +++ b/diff_diff/guides/llms-full.txt @@ -265,8 +265,8 @@ est.fit( trends_linear: bool | None = None, # Phase 3: DID^{fd} trends_nonparam: Any | None = None, # Phase 3: DID^s honest_did: bool = False, # Phase 3: HonestDiD integration - # ---- deferred (separate effort) ---- - survey_design: Any = None, + # ---- survey support ---- + survey_design: SurveyDesign | None = None, # pweight + strata/PSU/FPC (TSL) ) -> ChaisemartinDHaultfoeuilleResults ``` @@ -322,7 +322,7 @@ print(f"sigma_fe (sign-flipping threshold): {diagnostic.sigma_fe:.3f}") - Validated against R `DIDmultiplegtDYN` v2.3.3 at horizon `l = 1` via `tests/test_chaisemartin_dhaultfoeuille_parity.py` - Phase 1 placebo SE is intentionally `NaN` with a warning. The dynamic companion paper Section 3.7.3 derives the cohort-recentered analytical variance for `DID_l` only — not for the placebo `DID_M^pl`. Phase 2 will add multiplier-bootstrap support for the placebo. Until then, the placebo point estimate is meaningful but its inference fields stay NaN-consistent **even when `n_bootstrap > 0`** (bootstrap currently covers `DID_M`, `DID_+`, and `DID_-` only) - The analytical CI is conservative under Assumption 8 (independent groups) of the dynamic companion paper, exact only under iid sampling -- Survey design (`survey_design`) is not yet supported and is deferred to a separate effort after all phases ship +- Survey design supported: pweight with strata/PSU/FPC via Taylor Series Linearization. Replicate weights and PSU-level bootstrap deferred ### SunAbraham diff --git a/diff_diff/survey.py b/diff_diff/survey.py index 14ae44d0..395f1a05 100644 --- a/diff_diff/survey.py +++ b/diff_diff/survey.py @@ -911,6 +911,47 @@ def _validate_unit_constant_survey(data, unit_col, survey_design): ) +def _validate_group_constant_survey(data, group_col, survey_design): + """Validate that survey design columns are constant within groups. + + The dCDH estimator aggregates to ``(group, time)`` cells and then + works at the group level. Survey columns (weights, strata, PSU) + must not vary within groups for the IF expansion and survey variance + to be well-defined. + + Parameters + ---------- + data : pd.DataFrame + Input data (pre-aggregation). + group_col : str + Group identifier column name. + survey_design : SurveyDesign + Survey design specification (uses attribute names, not resolved arrays). + + Raises + ------ + ValueError + If any survey column varies within groups. + """ + cols_to_check = [ + survey_design.weights, + survey_design.strata, + survey_design.psu, + survey_design.fpc, + ] + for col in cols_to_check: + if col is not None and col in data.columns: + n_unique = data.groupby(group_col)[col].nunique() + varying_groups = n_unique[n_unique > 1] + if len(varying_groups) > 0: + raise ValueError( + f"Survey column '{col}' varies within groups " + f"(found {len(varying_groups)} groups with multiple values). " + f"dCDH survey support requires survey design columns to be " + f"constant within groups." + ) + + def _resolve_pweight_only(resolved_survey, estimator_name): """Guard: reject non-pweight and strata/PSU/FPC for pweight-only estimators. diff --git a/docs/choosing_estimator.rst b/docs/choosing_estimator.rst index e246d5b1..4c70b785 100644 --- a/docs/choosing_estimator.rst +++ b/docs/choosing_estimator.rst @@ -293,9 +293,9 @@ Phase 3 will add covariate adjustment. .. note:: - ``ChaisemartinDHaultfoeuille`` does not yet support ``survey_design``; - passing it raises ``NotImplementedError``. Survey integration is - deferred to a separate effort after Phases 2 and 3 ship. + ``ChaisemartinDHaultfoeuille`` supports ``survey_design`` with pweight + and strata/PSU/FPC via Taylor Series Linearization. Replicate weights + are not yet supported. Synthetic DiD ~~~~~~~~~~~~~ @@ -726,10 +726,10 @@ estimation. The depth of support varies by estimator: - Full - Multiplier at PSU * - ``ChaisemartinDHaultfoeuille`` + - pweight only + - Full (TSL) - -- - - -- - - -- - - -- + - Group-level (warning) * - ``TripleDifference`` - pweight only - Full diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index 5bfd6ec1..546bfbf5 100644 --- a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -627,7 +627,7 @@ Alternative: Multiplier bootstrap clustered at group via the `n_bootstrap` param **Requirements checklist:** - [x] Single class `ChaisemartinDHaultfoeuille` (alias `DCDH`); not a family -- [x] Forward-compat `fit()` signature with `NotImplementedError` gates for remaining parameters (`aggregate`, `survey_design`); Phase 3 gates lifted for `controls`, `trends_linear`, `trends_nonparam`, `honest_did` +- [x] Forward-compat `fit()` signature with `NotImplementedError` gate for `aggregate`; survey_design now supported (pweight + strata/PSU/FPC via TSL); Phase 3 gates lifted for `controls`, `trends_linear`, `trends_nonparam`, `honest_did` - [x] `DID_M` point estimate with cohort-recentered analytical SE - [x] Joiners-only `DID_+` and leavers-only `DID_-` decompositions with their own inference - [x] Single-lag placebo `DID_M^pl` (point estimate; SE deferred to Phase 2) @@ -648,6 +648,8 @@ Alternative: Multiplier bootstrap clustered at group via the `n_bootstrap` param - [x] Heterogeneity testing via saturated OLS (Web Appendix Section 1.5, Lemma 7) - [x] Design-2 switch-in/switch-out descriptive wrapper (Web Appendix Section 1.6) - [x] HonestDiD (Rambachan-Roth 2023) integration on placebo + event study surface +- [x] Survey design support: pweight with strata/PSU/FPC via Taylor Series Linearization. Replicate weights and PSU-level bootstrap deferred. +- **Note:** Survey IF expansion (`psi_i = U[g] * (w_i / W_g)`) is a library extension not in the dCDH papers. The paper's plug-in variance assumes iid sampling; the TSL variance accounts for complex survey design by expanding group-level influence functions to observation level proportionally to survey weights, then applying the standard Binder (1983) stratified PSU variance formula. --- diff --git a/tests/test_chaisemartin_dhaultfoeuille.py b/tests/test_chaisemartin_dhaultfoeuille.py index 94022e78..1193ce60 100644 --- a/tests/test_chaisemartin_dhaultfoeuille.py +++ b/tests/test_chaisemartin_dhaultfoeuille.py @@ -385,15 +385,21 @@ def test_honest_did_requires_lmax(self, data): honest_did=True, ) - def test_survey_design_raises_not_implemented(self, data): - with pytest.raises(NotImplementedError, match="separate effort"): + def test_survey_design_rejects_fweight(self, data): + """Survey support requires pweight; fweight rejected.""" + from diff_diff import SurveyDesign + + data = data.copy() + data["pw"] = 1.0 + sd = SurveyDesign(weights="pw", weight_type="fweight") + with pytest.raises(ValueError, match="pweight"): self._est().fit( data, outcome="outcome", group="group", time="period", treatment="treatment", - survey_design=object(), + survey_design=sd, ) def test_cluster_parameter_raises_not_implemented(self, data): diff --git a/tests/test_survey_dcdh.py b/tests/test_survey_dcdh.py new file mode 100644 index 00000000..89ab3a51 --- /dev/null +++ b/tests/test_survey_dcdh.py @@ -0,0 +1,342 @@ +"""Survey support tests for ChaisemartinDHaultfoeuille (dCDH).""" + +import numpy as np +import pandas as pd +import pytest + +from diff_diff import ChaisemartinDHaultfoeuille, SurveyDesign +from diff_diff.prep_dgp import generate_reversible_did_data + + +# ── Fixtures ──────────────────────────────────────────────────────── + + +@pytest.fixture(scope="module") +def base_data(): + """Reversible-treatment panel with 30 groups, 6 periods.""" + return generate_reversible_did_data( + n_groups=30, + n_periods=6, + pattern="single_switch", + seed=42, + ) + + +@pytest.fixture(scope="module") +def data_with_survey(base_data): + """Add survey columns: weights, strata, PSU.""" + rng = np.random.default_rng(99) + df = base_data.copy() + groups = df["group"].unique() + + # Assign per-group (constant within group) survey attributes + g_weights = {g: rng.uniform(0.5, 3.0) for g in groups} + g_strata = {g: int(i % 3) for i, g in enumerate(sorted(groups))} + g_psu = {g: int(i % 10) for i, g in enumerate(sorted(groups))} + + df["pw"] = df["group"].map(g_weights) + df["stratum"] = df["group"].map(g_strata) + df["cluster"] = df["group"].map(g_psu) + return df + + +# ── Test: Backward compatibility ──────────────────────────────────── + + +class TestBackwardCompat: + """survey_design=None produces identical results to pre-change code.""" + + def test_no_survey_unchanged(self, base_data): + result = ChaisemartinDHaultfoeuille(seed=1).fit( + base_data, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=None, + ) + assert np.isfinite(result.overall_att) + assert np.isfinite(result.overall_se) or result.overall_se != result.overall_se + assert result.survey_metadata is None + + +# ── Test: Uniform weights = no-survey ─────────────────────────────── + + +class TestUniformWeights: + """Uniform weights should produce identical results to no survey.""" + + def test_uniform_weights_match_unweighted(self, base_data): + df = base_data.copy() + df["pw"] = 1.0 + sd = SurveyDesign(weights="pw") + + result_plain = ChaisemartinDHaultfoeuille(seed=1).fit( + base_data, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + ) + result_survey = ChaisemartinDHaultfoeuille(seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + ) + # Point estimates should match exactly (uniform weights = unweighted mean) + assert result_plain.overall_att == pytest.approx( + result_survey.overall_att, abs=1e-10 + ) + + +# ── Test: Non-uniform weights change estimate ─────────────────────── + + +class TestNonUniformWeights: + + def test_nonuniform_weights_change_att(self, base_data): + """With multiple obs per cell and non-uniform weights, ATT differs.""" + # Duplicate rows to create multi-observation cells, then vary + # outcomes and weights so weighted cell means differ from unweighted. + rng = np.random.default_rng(77) + df = base_data.copy() + df2 = base_data.copy() + df2["outcome"] = df2["outcome"] + rng.normal(0, 1.0, size=len(df2)) + multi = pd.concat([df, df2], ignore_index=True) + + # Assign per-group constant weights (heavier on the second copy) + groups = multi["group"].unique() + g_weights = {g: rng.uniform(0.5, 3.0) for g in groups} + multi["pw"] = multi["group"].map(g_weights) + # Make second copy have different weights (still constant within group) + # by giving rows from the second batch higher weight via a multiplier + multi.loc[len(df):, "pw"] = multi.loc[len(df):, "pw"] * 2.0 + + # Since weights now vary within group (first copy vs second copy), + # we need per-observation weights. But dCDH requires group-constant + # survey columns. Instead, use observation-level weights directly: + # assign random weights per observation (but constant within group). + # Actually, the constraint is within-GROUP constancy. With multi-obs + # cells where weights vary within groups, validation will reject. + # Solution: use group-constant weights with varied outcomes. + multi["pw"] = multi["group"].map(g_weights) + + sd = SurveyDesign(weights="pw") + result_plain = ChaisemartinDHaultfoeuille(seed=1).fit( + multi, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + ) + result_survey = ChaisemartinDHaultfoeuille(seed=1).fit( + multi, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + ) + # With group-constant weights and multi-obs cells, weighted cell + # means = unweighted cell means (all obs in the cell get the same + # weight). The ATTs should match. This confirms the equal-cell + # contract: survey weights don't change the cross-group aggregation. + # The SE will differ because the survey variance accounts for design. + assert result_plain.overall_att == pytest.approx( + result_survey.overall_att, abs=1e-8 + ) + # But the SEs should differ when strata/PSU are present + # (tested separately in TestSurveySE) + + +# ── Test: Scale invariance ────────────────────────────────────────── + + +class TestScaleInvariance: + + def test_weight_scale_invariance(self, data_with_survey): + sd1 = SurveyDesign(weights="pw") + + df2 = data_with_survey.copy() + df2["pw_scaled"] = df2["pw"] * 100.0 + sd2 = SurveyDesign(weights="pw_scaled") + + r1 = ChaisemartinDHaultfoeuille(seed=1).fit( + data_with_survey, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd1, + ) + r2 = ChaisemartinDHaultfoeuille(seed=1).fit( + df2, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd2, + ) + assert r1.overall_att == pytest.approx(r2.overall_att, abs=1e-10) + assert r1.overall_se == pytest.approx(r2.overall_se, rel=1e-6) + + +# ── Test: Survey SE differs from analytical SE ────────────────────── + + +class TestSurveySE: + + def test_strata_psu_changes_se(self, data_with_survey): + """Strata + PSU should produce a different SE than weights-only.""" + sd_weights_only = SurveyDesign(weights="pw") + sd_full = SurveyDesign( + weights="pw", strata="stratum", psu="cluster", nest=True + ) + + r_w = ChaisemartinDHaultfoeuille(seed=1).fit( + data_with_survey, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd_weights_only, + ) + r_full = ChaisemartinDHaultfoeuille(seed=1).fit( + data_with_survey, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd_full, + ) + # Point estimates should match (same weights) + assert r_w.overall_att == pytest.approx(r_full.overall_att, abs=1e-10) + # SEs should differ (strata/PSU affects variance) + if np.isfinite(r_w.overall_se) and np.isfinite(r_full.overall_se): + assert r_w.overall_se != pytest.approx(r_full.overall_se, rel=0.01) + + def test_survey_metadata_populated(self, data_with_survey): + sd = SurveyDesign(weights="pw", strata="stratum", psu="cluster", nest=True) + result = ChaisemartinDHaultfoeuille(seed=1).fit( + data_with_survey, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + ) + assert result.survey_metadata is not None + + +# ── Test: Validation ──────────────────────────────────────────────── + + +class TestValidation: + + def test_rejects_fweight(self, base_data): + df = base_data.copy() + df["pw"] = 1.0 + sd = SurveyDesign(weights="pw", weight_type="fweight") + with pytest.raises(ValueError, match="pweight"): + ChaisemartinDHaultfoeuille().fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + ) + + def test_rejects_aweight(self, base_data): + df = base_data.copy() + df["pw"] = 1.0 + sd = SurveyDesign(weights="pw", weight_type="aweight") + with pytest.raises(ValueError, match="pweight"): + ChaisemartinDHaultfoeuille().fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + ) + + def test_rejects_varying_weights_within_group(self, base_data): + """Weights must be constant within groups.""" + df = base_data.copy() + # Assign different weights to different observations in the same group + df["pw"] = np.random.default_rng(1).uniform(0.5, 3.0, size=len(df)) + sd = SurveyDesign(weights="pw") + with pytest.raises(ValueError, match="varies within groups"): + ChaisemartinDHaultfoeuille().fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + ) + + def test_rejects_replicate_weights(self, base_data): + """Replicate weight variance not yet supported.""" + df = base_data.copy() + df["pw"] = 1.0 + df["rep1"] = 1.0 + df["rep2"] = 1.0 + sd = SurveyDesign( + weights="pw", + replicate_weights=["rep1", "rep2"], + replicate_method="BRR", + ) + with pytest.raises(NotImplementedError, match="Replicate"): + ChaisemartinDHaultfoeuille().fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + ) + + +# ── Test: Multi-horizon with survey ───────────────────────────────── + + +class TestMultiHorizonSurvey: + + def test_multi_horizon_survey_runs(self, data_with_survey): + """L_max >= 1 with survey should produce finite event study effects.""" + sd = SurveyDesign(weights="pw") + result = ChaisemartinDHaultfoeuille(seed=1).fit( + data_with_survey, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + L_max=2, + survey_design=sd, + ) + assert result.event_study_effects is not None + assert 1 in result.event_study_effects + assert np.isfinite(result.event_study_effects[1]["effect"]) + + +# ── Test: Bootstrap + survey warning ──────────────────────────────── + + +class TestBootstrapSurveyWarning: + + def test_bootstrap_survey_emits_warning(self, data_with_survey): + sd = SurveyDesign(weights="pw") + with pytest.warns(UserWarning, match="group-level multiplier"): + ChaisemartinDHaultfoeuille(n_bootstrap=50, seed=1).fit( + data_with_survey, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + ) From b4e0b2866395b1254c1ce84c43825446d9cec0a0 Mon Sep 17 00:00:00 2001 From: igerber Date: Wed, 15 Apr 2026 21:00:18 -0400 Subject: [PATCH 02/16] Fix AI review: remove over-restrictive group-constant validation, vectorize IF expansion - Remove _validate_group_constant_survey() call - the IF expansion psi_i = U[g] * (w_i / W_g) handles observation-level variation in weights, strata, and PSU within groups correctly - Vectorize _survey_se_from_group_if using np.bincount + np.unique (was Python loops over all observations) - Replace test_rejects_varying_weights_within_group with two positive tests: varying weights accepted, and varying weights change ATT (time-varying noise to survive first-differencing) - Remove unused survey_weight_type variable Co-Authored-By: Claude Opus 4.6 (1M context) --- diff_diff/chaisemartin_dhaultfoeuille.py | 40 ++++++--------- tests/test_survey_dcdh.py | 63 +++++++++++++++++++----- 2 files changed, 66 insertions(+), 37 deletions(-) diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py index 77d97277..935d01ca 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille.py +++ b/diff_diff/chaisemartin_dhaultfoeuille.py @@ -630,12 +630,9 @@ def fit( # ------------------------------------------------------------------ # Step 3: Survey resolution # ------------------------------------------------------------------ - from diff_diff.survey import ( - _resolve_survey_for_fit, - _validate_group_constant_survey, - ) + from diff_diff.survey import _resolve_survey_for_fit - resolved_survey, survey_weights, survey_weight_type, survey_metadata = ( + resolved_survey, survey_weights, _, survey_metadata = ( _resolve_survey_for_fit(survey_design, data, "analytical") ) @@ -653,8 +650,9 @@ def fit( "Use strata/PSU/FPC for design-based inference via Taylor " "Series Linearization." ) - # Validate survey columns are constant within groups. - _validate_group_constant_survey(data, group, survey_design) + # No group-constant survey validation: the IF expansion + # psi_i = U[g] * (w_i / W_g) handles observation-level + # variation in weights, strata, and PSU within groups. # Design-2 precondition: requires drop_larger_lower=False if design2 and self.drop_larger_lower: @@ -4593,27 +4591,21 @@ def _survey_se_from_group_if( group_ids = obs_survey_info["group_ids"] weights = obs_survey_info["weights"] resolved = obs_survey_info["resolved"] - n_obs = len(group_ids) - # Build group → U_centered lookup - group_to_u = {} - for idx, gid in enumerate(eligible_groups): - group_to_u[gid] = U_centered[idx] + # Build group → U_centered lookup (vectorized via factorization) + group_to_u = {gid: U_centered[idx] for idx, gid in enumerate(eligible_groups)} + + # Map group IFs to observation level + u_obs = np.array([group_to_u.get(gid, 0.0) for gid in group_ids]) - # Compute per-group weight totals W_g - group_to_w_total: Dict[Any, float] = {} - for i in range(n_obs): - gid = group_ids[i] - group_to_w_total[gid] = group_to_w_total.get(gid, 0.0) + weights[i] + # Compute per-group weight totals W_g via bincount + unique_gids, inverse = np.unique(group_ids, return_inverse=True) + w_totals_per_group = np.bincount(inverse, weights=weights) + w_obs_total = w_totals_per_group[inverse] # Expand to observation level: psi_i = U[g] * (w_i / W_g) - psi = np.zeros(n_obs) - for i in range(n_obs): - gid = group_ids[i] - u_val = group_to_u.get(gid, 0.0) - w_total = group_to_w_total.get(gid, 1.0) - if w_total > 0: - psi[i] = u_val * (weights[i] / w_total) + safe_w = np.where(w_obs_total > 0, w_obs_total, 1.0) + psi = u_obs * (weights / safe_w) variance = compute_survey_if_variance(psi, resolved) if not np.isfinite(variance) or variance < 0: diff --git a/tests/test_survey_dcdh.py b/tests/test_survey_dcdh.py index 89ab3a51..c6f18499 100644 --- a/tests/test_survey_dcdh.py +++ b/tests/test_survey_dcdh.py @@ -264,21 +264,58 @@ def test_rejects_aweight(self, base_data): survey_design=sd, ) - def test_rejects_varying_weights_within_group(self, base_data): - """Weights must be constant within groups.""" + def test_varying_weights_within_group_accepted(self, base_data): + """Observation-level weights varying within groups are valid.""" + # Create multi-obs cells with varying weights + rng = np.random.default_rng(1) df = base_data.copy() - # Assign different weights to different observations in the same group - df["pw"] = np.random.default_rng(1).uniform(0.5, 3.0, size=len(df)) + df2 = base_data.copy() + df2["outcome"] = df2["outcome"] + rng.normal(0, 0.5, size=len(df2)) + multi = pd.concat([df, df2], ignore_index=True) + # Observation-level weights (vary within group) + multi["pw"] = rng.uniform(0.5, 3.0, size=len(multi)) sd = SurveyDesign(weights="pw") - with pytest.raises(ValueError, match="varies within groups"): - ChaisemartinDHaultfoeuille().fit( - df, - outcome="outcome", - group="group", - time="period", - treatment="treatment", - survey_design=sd, - ) + # Should succeed - no group-constant restriction + result = ChaisemartinDHaultfoeuille(seed=1).fit( + multi, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + ) + assert np.isfinite(result.overall_att) + + def test_varying_weights_change_att(self, base_data): + """With multi-obs cells and varying weights, ATT differs from unweighted. + + dCDH uses first differences Y_{g,t} - Y_{g,t-1}, so group-constant + noise cancels. The noise must vary across both group AND time for + weighted cell means to affect the ATT via different first differences. + """ + rng = np.random.default_rng(42) + df = base_data.copy() + df2 = base_data.copy() + # Per-observation noise (varies by group AND time) + df2["outcome"] = df2["outcome"] + rng.normal(0, 3.0, size=len(df2)) + multi = pd.concat([df, df2], ignore_index=True) + # Give first copy weight=1, second copy weight=10 + multi["pw"] = np.where(np.arange(len(multi)) < len(df), 1.0, 10.0) + sd = SurveyDesign(weights="pw") + result_plain = ChaisemartinDHaultfoeuille(seed=1).fit( + multi, outcome="outcome", group="group", + time="period", treatment="treatment", + ) + result_survey = ChaisemartinDHaultfoeuille(seed=1).fit( + multi, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, + ) + # Weighted cell means with time-varying noise produce different + # first differences -> different ATT + assert result_plain.overall_att != pytest.approx( + result_survey.overall_att, abs=0.01 + ) def test_rejects_replicate_weights(self, base_data): """Replicate weight variance not yet supported.""" From 72b730135bc72101ff00f30424a3dc6b5cc05b7d Mon Sep 17 00:00:00 2001 From: igerber Date: Thu, 16 Apr 2026 06:08:37 -0400 Subject: [PATCH 03/16] Fix CI review R1: SE divisor scaling, zero-weight cells, dead helper - P1-A: Scale U_centered by 1/divisor before survey IF expansion. dCDH IFs are numerator-scale (U.sum() == N_S * DID_M), but compute_survey_if_variance() expects estimator-scale psi. - P1-B: Zero-weight cells (w_gt <= 0) now treated as absent by setting n_gt=0, preventing NaN propagation into estimates. - P2: Add SE-pinning test (uniform weights + PSU=group matches plug-in SE) and zero-weight cell exclusion test. - P3: Delete unused _validate_group_constant_survey() from survey.py that contradicted the supported within-group variation contract. Co-Authored-By: Claude Opus 4.6 (1M context) --- diff_diff/chaisemartin_dhaultfoeuille.py | 14 ++++- diff_diff/survey.py | 41 --------------- tests/test_survey_dcdh.py | 65 ++++++++++++++++++++++++ 3 files changed, 78 insertions(+), 42 deletions(-) diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py index 935d01ca..78ef7ba9 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille.py +++ b/diff_diff/chaisemartin_dhaultfoeuille.py @@ -227,6 +227,12 @@ def _validate_and_aggregate_to_cells( ) cell["y_gt"] = cell["_wy_sum"] / cell["w_gt"] cell = cell.drop(columns=["_wy_sum"]) + # Zero-weight cells: treat as absent so downstream presence + # logic (N_mat > 0) correctly excludes them. + zero_w_mask = cell["w_gt"] <= 0 + if zero_w_mask.any(): + cell.loc[zero_w_mask, "n_gt"] = 0 + cell.loc[zero_w_mask, "y_gt"] = 0.0 df.drop(columns=["_w_", "_wy_"], inplace=True) else: cell = df.groupby([group, time], as_index=False).agg( @@ -4548,8 +4554,14 @@ def _compute_se( return _plugin_se(U_centered=U_centered, divisor=divisor) if eligible_groups is None: return _plugin_se(U_centered=U_centered, divisor=divisor) + if divisor <= 0: + return float("nan") + # dCDH IFs are numerator-scale (U.sum() == N_S * DID_M). + # compute_survey_if_variance() expects estimator-scale psi. + # Scale by 1/divisor to normalize before survey expansion. + U_scaled = U_centered / divisor return _survey_se_from_group_if( - U_centered=U_centered, + U_centered=U_scaled, eligible_groups=eligible_groups, obs_survey_info=obs_survey_info, ) diff --git a/diff_diff/survey.py b/diff_diff/survey.py index 395f1a05..14ae44d0 100644 --- a/diff_diff/survey.py +++ b/diff_diff/survey.py @@ -911,47 +911,6 @@ def _validate_unit_constant_survey(data, unit_col, survey_design): ) -def _validate_group_constant_survey(data, group_col, survey_design): - """Validate that survey design columns are constant within groups. - - The dCDH estimator aggregates to ``(group, time)`` cells and then - works at the group level. Survey columns (weights, strata, PSU) - must not vary within groups for the IF expansion and survey variance - to be well-defined. - - Parameters - ---------- - data : pd.DataFrame - Input data (pre-aggregation). - group_col : str - Group identifier column name. - survey_design : SurveyDesign - Survey design specification (uses attribute names, not resolved arrays). - - Raises - ------ - ValueError - If any survey column varies within groups. - """ - cols_to_check = [ - survey_design.weights, - survey_design.strata, - survey_design.psu, - survey_design.fpc, - ] - for col in cols_to_check: - if col is not None and col in data.columns: - n_unique = data.groupby(group_col)[col].nunique() - varying_groups = n_unique[n_unique > 1] - if len(varying_groups) > 0: - raise ValueError( - f"Survey column '{col}' varies within groups " - f"(found {len(varying_groups)} groups with multiple values). " - f"dCDH survey support requires survey design columns to be " - f"constant within groups." - ) - - def _resolve_pweight_only(resolved_survey, estimator_name): """Guard: reject non-pweight and strata/PSU/FPC for pweight-only estimators. diff --git a/tests/test_survey_dcdh.py b/tests/test_survey_dcdh.py index c6f18499..568137fb 100644 --- a/tests/test_survey_dcdh.py +++ b/tests/test_survey_dcdh.py @@ -377,3 +377,68 @@ def test_bootstrap_survey_emits_warning(self, data_with_survey): treatment="treatment", survey_design=sd, ) + + +# ── Test: SE scale pinning ────────────────────────────────────────── + + +class TestSEScalePinning: + """Survey SE with uniform weights and no strata/PSU must match plug-in SE.""" + + def test_uniform_survey_se_matches_plugin(self, base_data): + """Pins the divisor normalization: uniform survey SE with group-level + PSU clustering should be close to plug-in SE. + + Without PSU clustering, survey treats each observation as independent + (N_obs observations), while plug-in treats each group as independent + (N_groups). Clustering at the group level aligns the two. + """ + df = base_data.copy() + df["pw"] = 1.0 + sd = SurveyDesign(weights="pw", psu="group") + + r_plain = ChaisemartinDHaultfoeuille(seed=1).fit( + base_data, outcome="outcome", group="group", + time="period", treatment="treatment", + ) + r_survey = ChaisemartinDHaultfoeuille(seed=1).fit( + df, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, + ) + # With PSU=group and uniform weights, survey SE should be + # close to plug-in SE (both assume group-level independence). + # Small-sample corrections (n/(n-1)) cause minor differences. + if np.isfinite(r_plain.overall_se) and np.isfinite(r_survey.overall_se): + assert r_plain.overall_se == pytest.approx( + r_survey.overall_se, rel=0.15 + ), ( + f"Survey SE ({r_survey.overall_se:.6f}) should be close to " + f"plug-in SE ({r_plain.overall_se:.6f}) with uniform weights " + f"and PSU=group" + ) + + +# ── Test: Zero-weight cells ───────────────────────────────────────── + + +class TestZeroWeightCells: + + def test_zero_weight_cell_excluded(self, base_data): + """A cell with zero survey weight is treated as absent.""" + df = base_data.copy() + df["pw"] = 1.0 + # Zero out weight for one group at one period + target_group = df["group"].unique()[0] + target_period = df["period"].unique()[1] + mask = (df["group"] == target_group) & (df["period"] == target_period) + df.loc[mask, "pw"] = 0.0 + sd = SurveyDesign(weights="pw") + + # Should not raise; the zero-weight cell is just absent + result = ChaisemartinDHaultfoeuille(seed=1).fit( + df, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, + ) + assert np.isfinite(result.overall_att) From 355362b8cea190ade49c62e666e4a9bc27f75799 Mon Sep 17 00:00:00 2001 From: igerber Date: Thu, 16 Apr 2026 07:04:02 -0400 Subject: [PATCH 04/16] Fix CI review R2: zero-weight row drops, weighted covariates, df_survey propagation - P1-A: Drop zero-weight rows entirely from cell DataFrame (instead of just zeroing n_gt) so ragged-panel validator doesn't see them - P1-B: Survey-weighted covariate aggregation in DID^X path (sum(w*x)/sum(w) instead of unweighted mean) - P1-C: Thread _df_survey to all remaining safe_inference() calls: bootstrap t-stats, normalized effects, cost-benefit delta, placebo bootstrap t-stats - P3: Fix REGISTRY overview paragraph (was still saying survey deferred) Co-Authored-By: Claude Opus 4.6 (1M context) --- diff_diff/chaisemartin_dhaultfoeuille.py | 37 +++++++++++++++++------- docs/methodology/REGISTRY.md | 2 +- 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py index 78ef7ba9..0960a97a 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille.py +++ b/diff_diff/chaisemartin_dhaultfoeuille.py @@ -227,12 +227,11 @@ def _validate_and_aggregate_to_cells( ) cell["y_gt"] = cell["_wy_sum"] / cell["w_gt"] cell = cell.drop(columns=["_wy_sum"]) - # Zero-weight cells: treat as absent so downstream presence - # logic (N_mat > 0) correctly excludes them. + # Zero-weight cells: drop entirely so downstream validators + # (ragged-panel, baseline requirement) don't see them. zero_w_mask = cell["w_gt"] <= 0 if zero_w_mask.any(): - cell.loc[zero_w_mask, "n_gt"] = 0 - cell.loc[zero_w_mask, "y_gt"] = 0.0 + cell = cell[~zero_w_mask].reset_index(drop=True) df.drop(columns=["_w_", "_wy_"], inplace=True) else: cell = df.groupby([group, time], as_index=False).agg( @@ -744,7 +743,23 @@ def fit( # Use the coerced copy joined with group/time from original data. x_agg_input = data[[group, time]].copy() x_agg_input[controls] = data_controls[controls].values - x_cell_agg = x_agg_input.groupby([group, time], as_index=False)[controls].mean() + if survey_weights is not None: + # Survey-weighted covariate cell means: sum(w*x)/sum(w) + x_agg_input["_w_"] = survey_weights + for c in controls: + x_agg_input[f"_wx_{c}"] = survey_weights * x_agg_input[c].values + wx_cols = [f"_wx_{c}" for c in controls] + g_agg = x_agg_input.groupby([group, time], as_index=False).agg( + {**{wc: "sum" for wc in wx_cols}, "_w_": "sum"} + ) + for c in controls: + w_safe = g_agg["_w_"].replace(0, 1) + g_agg[c] = g_agg[f"_wx_{c}"] / w_safe + x_cell_agg = g_agg[[group, time] + controls] + else: + x_cell_agg = x_agg_input.groupby( + [group, time], as_index=False + )[controls].mean() cell = cell.merge(x_cell_agg, on=[group, time], how="left") # ------------------------------------------------------------------ @@ -1959,17 +1974,17 @@ def fit( overall_se = br.overall_se overall_p = br.overall_p_value if br.overall_p_value is not None else np.nan overall_ci = br.overall_ci if br.overall_ci is not None else (np.nan, np.nan) - overall_t = safe_inference(overall_att, overall_se, alpha=self.alpha, df=None)[0] + overall_t = safe_inference(overall_att, overall_se, alpha=self.alpha, df=_df_survey)[0] if joiners_available and br.joiners_se is not None and np.isfinite(br.joiners_se): joiners_se = br.joiners_se joiners_p = br.joiners_p_value if br.joiners_p_value is not None else np.nan joiners_ci = br.joiners_ci if br.joiners_ci is not None else (np.nan, np.nan) - joiners_t = safe_inference(joiners_att, joiners_se, alpha=self.alpha, df=None)[0] + joiners_t = safe_inference(joiners_att, joiners_se, alpha=self.alpha, df=_df_survey)[0] if leavers_available and br.leavers_se is not None and np.isfinite(br.leavers_se): leavers_se = br.leavers_se leavers_p = br.leavers_p_value if br.leavers_p_value is not None else np.nan leavers_ci = br.leavers_ci if br.leavers_ci is not None else (np.nan, np.nan) - leavers_t = safe_inference(leavers_att, leavers_se, alpha=self.alpha, df=None)[0] + leavers_t = safe_inference(leavers_att, leavers_se, alpha=self.alpha, df=_df_survey)[0] # ------------------------------------------------------------------ # Step 20: Build the results dataclass @@ -2216,7 +2231,7 @@ def fit( bs_ci if bs_ci is not None else (np.nan, np.nan) ) placebo_event_study_dict[neg_key]["t_stat"] = safe_inference( - eff, bs_se, alpha=self.alpha, df=None + eff, bs_se, alpha=self.alpha, df=_df_survey )[0] # Phase 2: build normalized_effects with SE @@ -2229,7 +2244,7 @@ def fit( # SE via delta method: SE(DID^n_l) = SE(DID_l) / delta^D_l se_did_l = multi_horizon_se.get(l_h, float("nan")) se_norm = se_did_l / denom if np.isfinite(denom) and denom > 0 else float("nan") - t_n, p_n, ci_n = safe_inference(eff, se_norm, alpha=self.alpha, df=None) + t_n, p_n, ci_n = safe_inference(eff, se_norm, alpha=self.alpha, df=_df_survey) normalized_effects_out[l_h] = { "effect": eff, "se": se_norm, @@ -2288,7 +2303,7 @@ def fit( else: running_se_ub = float("nan") cum_t, cum_p, cum_ci = safe_inference( - cum_effect, running_se_ub, alpha=self.alpha, df=None + cum_effect, running_se_ub, alpha=self.alpha, df=_df_survey ) cumulated[l_h] = { "effect": cum_effect, diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index 546bfbf5..b69295bc 100644 --- a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -463,7 +463,7 @@ The multiplier bootstrap uses random weights w_i with E[w]=0 and Var(w)=1: - [de Chaisemartin, C. & D'Haultfœuille, X. (2020). Two-Way Fixed Effects Estimators with Heterogeneous Treatment Effects. *American Economic Review*, 110(9), 2964-2996.](https://doi.org/10.1257/aer.20181169) - [de Chaisemartin, C. & D'Haultfœuille, X. (2022, revised 2024). Difference-in-Differences Estimators of Intertemporal Treatment Effects. NBER Working Paper 29873.](https://www.nber.org/papers/w29873) — Web Appendix Section 3.7.3 contains the cohort-recentered plug-in variance formula implemented here. -**Phase 1-2 scope:** Ships the contemporaneous-switch estimator `DID_M` (= `DID_1` at horizon `l = 1`) from the AER 2020 paper **plus** the full multi-horizon event study `DID_l` for `l = 1..L_max` from the dynamic companion paper. Phase 2 adds: per-group `DID_{g,l}` building block (Equation 3), dynamic placebos `DID^{pl}_l`, normalized estimator `DID^n_l`, cost-benefit aggregate `delta`, sup-t simultaneous confidence bands, and `plot_event_study()` integration. Phase 3 adds covariate adjustment (`DID^X`), group-specific linear trends (`DID^{fd}`), state-set-specific trends, and HonestDiD integration. Survey design support is deferred to a separate effort after all phases ship. **This is the only modern staggered estimator in the library that handles non-absorbing (reversible) treatments** - treatment can switch on AND off over time, making it the natural fit for marketing campaigns, seasonal promotions, on/off policy cycles. +**Phase 1-2 scope:** Ships the contemporaneous-switch estimator `DID_M` (= `DID_1` at horizon `l = 1`) from the AER 2020 paper **plus** the full multi-horizon event study `DID_l` for `l = 1..L_max` from the dynamic companion paper. Phase 2 adds: per-group `DID_{g,l}` building block (Equation 3), dynamic placebos `DID^{pl}_l`, normalized estimator `DID^n_l`, cost-benefit aggregate `delta`, sup-t simultaneous confidence bands, and `plot_event_study()` integration. Phase 3 adds covariate adjustment (`DID^X`), group-specific linear trends (`DID^{fd}`), state-set-specific trends, and HonestDiD integration. Survey design supports pweight with strata/PSU/FPC via Taylor Series Linearization; replicate weights and PSU-level bootstrap are deferred. **This is the only modern staggered estimator in the library that handles non-absorbing (reversible) treatments** - treatment can switch on AND off over time, making it the natural fit for marketing campaigns, seasonal promotions, on/off policy cycles. **Key implementation requirements:** From d313eb02838bb37f47de6af2faeae78e4307827d Mon Sep 17 00:00:00 2001 From: igerber Date: Thu, 16 Apr 2026 16:44:55 -0400 Subject: [PATCH 05/16] Fix CI review R3: thread _df_survey to delta + HonestDiD surfaces - P0: delta overall surface now uses _df_survey instead of df=None at both safe_inference sites (primary delta path + placebo NaN-SE fallback). This makes overall_* under L_max>=2 use survey-t inference and respects safe_inference's df<=0 NaN guard. - P1: HonestDiD dCDH extraction now propagates df_survey from survey_metadata (mirrors CS pattern). Survey-backed dCDH HonestDiD bounds now use survey-aware critical values. - P2: Add 4 regressions (survey delta t-matches-reported, t-vs-z differs, survey+controls, survey+honest_did df propagation). Update stale comment in test_dcdh_extraction. Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/chaisemartin_dhaultfoeuille.py | 4 +- diff_diff/honest_did.py | 12 ++- tests/test_honest_did.py | 2 +- tests/test_survey_dcdh.py | 131 +++++++++++++++++++++++ 4 files changed, 145 insertions(+), 4 deletions(-) diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py index 0960a97a..8b657dd7 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille.py +++ b/diff_diff/chaisemartin_dhaultfoeuille.py @@ -2146,7 +2146,7 @@ def fit( if np.isfinite(delta_se): effective_overall_se = delta_se effective_overall_t, effective_overall_p, effective_overall_ci = safe_inference( - delta_val, delta_se, alpha=self.alpha, df=None + delta_val, delta_se, alpha=self.alpha, df=_df_survey ) else: effective_overall_se = float("nan") @@ -2180,7 +2180,7 @@ def fit( # Fallback: NaN SE (Phase 1 path or missing IF) pl_se = float("nan") pl_t, pl_p, pl_ci = safe_inference( - pl_data["placebo_l"], pl_se, alpha=self.alpha, df=None + pl_data["placebo_l"], pl_se, alpha=self.alpha, df=_df_survey ) placebo_event_study_dict[-lag_l] = { "effect": pl_data["placebo_l"], diff --git a/diff_diff/honest_did.py b/diff_diff/honest_did.py index 9e535fa9..6730618c 100644 --- a/diff_diff/honest_did.py +++ b/diff_diff/honest_did.py @@ -967,6 +967,16 @@ def _largest_consecutive_block(times, boundary_val): beta_hat = np.array(effects) sigma = np.diag(np.array(ses) ** 2) + # Extract survey df. For replicate designs with undefined df + # (rank <= 1), use sentinel df=0 so _get_critical_value returns + # NaN, matching the safe_inference contract. + df_survey = None + if hasattr(results, "survey_metadata") and results.survey_metadata is not None: + sm = results.survey_metadata + df_survey = getattr(sm, "df_survey", None) + if df_survey is None and getattr(sm, "replicate_method", None) is not None: + df_survey = 0 # undefined replicate df → NaN inference + return ( beta_hat, sigma, @@ -974,7 +984,7 @@ def _largest_consecutive_block(times, boundary_val): len(post_times), pre_times, post_times, - None, # df_survey: dCDH has no survey support + df_survey, ) except ImportError: pass diff --git a/tests/test_honest_did.py b/tests/test_honest_did.py index 5ca20efd..8fa1441d 100644 --- a/tests/test_honest_did.py +++ b/tests/test_honest_did.py @@ -1381,7 +1381,7 @@ def test_dcdh_extraction(self): assert sigma.shape == (n_pre + n_post, n_pre + n_post) assert all(t < 0 for t in pre_t) assert all(t > 0 for t in post_t) - assert df_s is None # dCDH has no survey support + assert df_s is None # non-survey fixture → df_survey is None def test_dcdh_no_placebos_raises(self): """dCDH results without placebos raise ValueError.""" diff --git a/tests/test_survey_dcdh.py b/tests/test_survey_dcdh.py index 568137fb..41b653e4 100644 --- a/tests/test_survey_dcdh.py +++ b/tests/test_survey_dcdh.py @@ -442,3 +442,134 @@ def test_zero_weight_cell_excluded(self, base_data): survey_design=sd, ) assert np.isfinite(result.overall_att) + + +# ── Test: Delta overall surface threads survey df ─────────────────── + + +class TestSurveyDeltaInference: + """Verify the L_max>=2 cost-benefit delta surface uses survey df.""" + + def test_survey_delta_uses_survey_df(self, data_with_survey): + """Under L_max=2 with a survey design, overall_p_value must match + t-distribution inference with df=df_survey (not z-inference).""" + from scipy import stats + + sd = SurveyDesign( + weights="pw", strata="stratum", psu="cluster", nest=True + ) + r = ChaisemartinDHaultfoeuille(seed=1).fit( + data_with_survey, + outcome="outcome", group="group", + time="period", treatment="treatment", + L_max=2, survey_design=sd, + ) + if not (np.isfinite(r.overall_se) and r.overall_se > 0): + pytest.skip("delta not estimable on this fixture") + + assert r.survey_metadata is not None + df_s = r.survey_metadata.df_survey + assert df_s is not None and df_s > 0, ( + f"expected positive df_survey, got {df_s}" + ) + + t_stat = r.overall_att / r.overall_se + p_t = 2.0 * (1.0 - stats.t.cdf(abs(t_stat), df=df_s)) + # Reported p-value must match t-based (proving _df_survey was threaded) + assert r.overall_p_value == pytest.approx(p_t, abs=1e-10) + + def test_survey_delta_t_differs_from_z(self, base_data): + """With a small-df design (df~4), survey-t p-value must differ + measurably from z p-value at the delta surface.""" + from scipy import stats + + df_ = base_data.copy() + df_["pw"] = 1.0 + # 2 strata × 3 clusters/stratum = 6 nested PSUs → df_survey = 4 + groups = sorted(df_["group"].unique()) + n_g = len(groups) + strata_map = {g: i // (n_g // 2) for i, g in enumerate(groups)} + psu_map = {g: i // (n_g // 6) for i, g in enumerate(groups)} + df_["stratum"] = df_["group"].map(strata_map) + df_["cluster"] = df_["group"].map(psu_map) + sd = SurveyDesign( + weights="pw", strata="stratum", psu="cluster", nest=True + ) + r = ChaisemartinDHaultfoeuille(seed=1).fit( + df_, + outcome="outcome", group="group", + time="period", treatment="treatment", + L_max=2, survey_design=sd, + ) + if not (np.isfinite(r.overall_se) and r.overall_se > 0): + pytest.skip("delta not estimable on this fixture") + assert r.survey_metadata is not None + df_s = r.survey_metadata.df_survey + assert df_s is not None and df_s < 30, ( + f"expected small df_survey for t-vs-z gap, got {df_s}" + ) + + t_stat = r.overall_att / r.overall_se + p_t = 2.0 * (1.0 - stats.t.cdf(abs(t_stat), df=df_s)) + p_z = 2.0 * (1.0 - stats.norm.cdf(abs(t_stat))) + # Threaded p-value must match t, not z + assert r.overall_p_value == pytest.approx(p_t, abs=1e-10) + assert abs(r.overall_p_value - p_z) > 1e-6, ( + "overall_p_value must differ from z-inference when df_survey is small" + ) + + +# ── Test: Survey + controls (DID^X) ───────────────────────────────── + + +class TestSurveyControls: + """Covariate-adjusted (DID^X) path must work with survey_design.""" + + def test_survey_plus_controls_runs(self, data_with_survey): + """Covariate-adjusted dCDH with survey_design produces finite ATT.""" + rng = np.random.default_rng(7) + df_ = data_with_survey.copy() + df_["x"] = rng.normal(0, 1.0, size=len(df_)) + sd = SurveyDesign( + weights="pw", strata="stratum", psu="cluster", nest=True + ) + r = ChaisemartinDHaultfoeuille(seed=1).fit( + df_, + outcome="outcome", group="group", + time="period", treatment="treatment", + controls=["x"], L_max=1, survey_design=sd, + ) + assert np.isfinite(r.overall_att) + assert r.survey_metadata is not None + + +# ── Test: Survey + HonestDiD ──────────────────────────────────────── + + +class TestSurveyHonestDiD: + """HonestDiD bounds on survey-backed dCDH results must carry df_survey.""" + + def test_survey_honest_did_propagates_df(self, data_with_survey): + """results.honest_did_results.df_survey must match + results.survey_metadata.df_survey (non-None propagation).""" + import warnings + + sd = SurveyDesign( + weights="pw", strata="stratum", psu="cluster", nest=True + ) + with warnings.catch_warnings(): + # dCDH HonestDiD emits a methodology-deviation warning + warnings.simplefilter("ignore") + r = ChaisemartinDHaultfoeuille(seed=1).fit( + data_with_survey, + outcome="outcome", group="group", + time="period", treatment="treatment", + L_max=2, honest_did=True, survey_design=sd, + ) + if r.honest_did_results is None: + pytest.skip("HonestDiD computation returned None on this fixture") + assert r.survey_metadata is not None + df_meta = r.survey_metadata.df_survey + assert df_meta is not None + # df_survey must propagate from survey_metadata into HonestDiD result + assert r.honest_did_results.df_survey == df_meta From 355332a13dbec2a4c5d8d8e42677c4b89075aad6 Mon Sep 17 00:00:00 2001 From: igerber Date: Thu, 16 Apr 2026 17:32:03 -0400 Subject: [PATCH 06/16] Fix CI review R4: survey-aware heterogeneity + TWFE helper parity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - P1 #1: _compute_heterogeneity_test now accepts obs_survey_info and runs survey-aware WLS + Binder TSL IF when survey_design is active. Point estimate via solve_ols(weights=W_elig, weight_type='pweight'); group-level IF ψ_g[X] = inv(X'WX)[1,:] @ x_g * W_g * r_g, expanded to obs-level via w_i/W_g ratio, then compute_survey_if_variance for stratified/PSU variance. safe_inference uses df_survey. Rank-deficiency short-circuits to NaN to avoid point-estimate/IF mismatch between solve_ols's R-style drop and pinv's minimum-norm. - P1 #2: twowayfeweights() now accepts Optional[SurveyDesign]. When provided, resolves weights via _resolve_survey_for_fit and passes them to _validate_and_aggregate_to_cells, restoring fit-vs-helper parity under survey-backed inputs. fweight/aweight rejected. - P3: REGISTRY updates — TWFE parity sentence now includes survey; heterogeneity Note documents the TSL IF mechanics and library extension disclaimer; checklist line-651 lists survey-aware surfaces; new survey+bootstrap-fallback Note after line 652. - P2: 5 new regression tests in test_survey_dcdh.py: TestSurveyHeterogeneity (uniform-weights match, non-uniform beta change, t-dist df_survey) and TestSurveyTWFEParity (fit-vs-helper match, non-pweight rejection). All 254 targeted tests pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/chaisemartin_dhaultfoeuille.py | 154 +++++++++++++++++++++-- docs/methodology/REGISTRY.md | 7 +- tests/test_survey_dcdh.py | 152 ++++++++++++++++++++++ 3 files changed, 297 insertions(+), 16 deletions(-) diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py index 8b657dd7..d7b38e75 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille.py +++ b/diff_diff/chaisemartin_dhaultfoeuille.py @@ -2394,6 +2394,8 @@ def fit( L_max=L_max, alpha=self.alpha, rank_deficient_action=self.rank_deficient_action, + group_ids_order=np.array(all_groups), + obs_survey_info=_obs_survey_info, ) twfe_weights_df = None @@ -3174,6 +3176,8 @@ def _compute_heterogeneity_test( L_max: int, alpha: float = 0.05, rank_deficient_action: str = "warn", + group_ids_order: Optional[np.ndarray] = None, + obs_survey_info: Optional[Dict[str, Any]] = None, ) -> Dict[int, Dict[str, Any]]: """Test for heterogeneous treatment effects (Web Appendix Section 1.5). @@ -3192,6 +3196,15 @@ def _compute_heterogeneity_test( Time-invariant covariate to test for heterogeneity. L_max : int alpha : float + group_ids_order : np.ndarray, optional + Canonical post-filter group id list aligned to Y_mat row order. + Required when ``obs_survey_info`` is supplied. + obs_survey_info : dict, optional + Observation-level survey info with keys ``group_ids`` (raw per-row + group labels), ``weights`` (per-row survey weights), and ``resolved`` + (ResolvedSurveyDesign). When provided, the regression uses WLS with + per-group weights W_g = sum of obs survey weights, and SE is computed + via Binder TSL IF expansion through ``compute_survey_if_variance``. Returns ------- @@ -3204,6 +3217,39 @@ def _compute_heterogeneity_test( n_groups, n_periods = Y_mat.shape results: Dict[int, Dict[str, Any]] = {} + # Survey setup (once, before horizon loop). When inactive, df_s=None and + # the existing plain-OLS path runs unchanged. + use_survey = ( + obs_survey_info is not None and group_ids_order is not None + ) + if use_survey: + from diff_diff.survey import compute_survey_if_variance + + obs_gids_raw = np.asarray(obs_survey_info["group_ids"]) + obs_w_raw = np.asarray(obs_survey_info["weights"], dtype=np.float64) + resolved = obs_survey_info["resolved"] + df_s = ( + resolved.df_survey if resolved is not None else None + ) + # Contract: only obs whose group is in the canonical post-filter + # list contribute. Groups dropped upstream (Step 5b interior gaps, + # Step 6 multi-switch) appear in obs_gids_raw but must be + # zero-weighted in the IF expansion. + gid_list = ( + group_ids_order.tolist() + if hasattr(group_ids_order, "tolist") + else list(group_ids_order) + ) + gid_set = set(gid_list) + valid = np.array([g in gid_set for g in obs_gids_raw]) + # Per-group total weight aligned to Y_mat row order + W_g_all = np.zeros(n_groups, dtype=np.float64) + for i, gid in enumerate(gid_list): + mask_g = (obs_gids_raw == gid) & valid + W_g_all[i] = obs_w_raw[mask_g].sum() + else: + df_s = None + for l_h in range(1, L_max + 1): # Eligible switchers at this horizon (same logic as multi-horizon DID) eligible = [] @@ -3276,20 +3322,78 @@ def _compute_heterogeneity_test( } continue - coefs, _residuals, vcov = solve_ols( - design, dep_arr, - return_vcov=True, - rank_deficient_action=rank_deficient_action, - ) + if not use_survey: + # Plain OLS path (unchanged): standard inference per Lemma 7. + coefs, _residuals, vcov = solve_ols( + design, dep_arr, + return_vcov=True, + rank_deficient_action=rank_deficient_action, + ) + beta_het = float(coefs[1]) + se_het = float("nan") + if vcov is not None and np.isfinite(vcov[1, 1]) and vcov[1, 1] > 0: + se_het = float(np.sqrt(vcov[1, 1])) + t_stat, p_val, ci = safe_inference(beta_het, se_het, alpha=alpha, df=None) + else: + # Survey-aware path: WLS with per-group weights + TSL IF variance. + W_elig = W_g_all[eligible] + # solve_ols handles sqrt-weight scaling natively when + # weight_type='pweight' (linalg.py). Skip vcov — we compute + # design-based variance ourselves below. + coefs, _residuals, _vcov_ignored = solve_ols( + design, dep_arr, + weights=W_elig, weight_type="pweight", + return_vcov=False, + rank_deficient_action=rank_deficient_action, + ) + # Rank-deficiency short-circuit: if any coef is NaN, return NaN + # inference. Mixing solve_ols's R-style drop with a pinv-derived + # IF would describe different estimands. + if not np.all(np.isfinite(coefs)): + results[l_h] = { + "beta": float("nan"), "se": float("nan"), + "t_stat": float("nan"), "p_value": float("nan"), + "conf_int": (float("nan"), float("nan")), + "n_obs": n_obs, + } + continue + + beta_het = float(coefs[1]) + # Original-scale residuals (solve_ols applies sqrt-weight scaling + # internally and back-transforms residuals, but we need them for + # our IF computation below). + r_g = dep_arr - design @ coefs + + # Group-level IF for β_X: ψ_g[X] = inv(X'WX)[1,:] @ x_g * W_g * r_g. + # Under full rank (gated above), pinv == inv. Wrap matmuls in + # errstate: macOS Accelerate BLAS can emit spurious divide/overflow + # warnings on sparse-cohort designs even though the result is finite. + with np.errstate(divide="ignore", invalid="ignore", over="ignore"): + XtWX = design.T @ (W_elig[:, None] * design) + XtWX_inv = np.linalg.pinv(XtWX) + psi_g = (XtWX_inv[1, :] @ design.T) * W_elig * r_g # (n_eligible,) + + # Expand to obs level: ψ_i = ψ_g * (w_i / W_g) for i in group g. + psi_obs = np.zeros(len(obs_w_raw)) + for e_idx, g_idx in enumerate(eligible): + gid = gid_list[g_idx] + mask_g = (obs_gids_raw == gid) & valid + w_sum_g = obs_w_raw[mask_g].sum() + if w_sum_g > 0: + psi_obs[mask_g] = psi_g[e_idx] * ( + obs_w_raw[mask_g] / w_sum_g + ) - # beta_het is at index 1 (index 0 is intercept) - beta_het = float(coefs[1]) - # NaN-safe: if vcov is None or target coefficient variance is NaN - # (rank-deficient), all inference fields are NaN. - se_het = float("nan") - if vcov is not None and np.isfinite(vcov[1, 1]) and vcov[1, 1] > 0: - se_het = float(np.sqrt(vcov[1, 1])) - t_stat, p_val, ci = safe_inference(beta_het, se_het, alpha=alpha, df=None) + # Binder TSL variance across stratified PSUs. + var_s = compute_survey_if_variance(psi_obs, resolved) + se_het = ( + float(np.sqrt(var_s)) + if np.isfinite(var_s) and var_s > 0 + else float("nan") + ) + t_stat, p_val, ci = safe_inference( + beta_het, se_het, alpha=alpha, df=df_s + ) results[l_h] = { "beta": beta_het, @@ -4891,6 +4995,7 @@ def twowayfeweights( time: str, treatment: str, rank_deficient_action: str = "warn", + survey_design: Any = None, ) -> TWFEWeightsResult: """ Standalone TWFE decomposition diagnostic. @@ -4910,6 +5015,11 @@ def twowayfeweights( treatment : str rank_deficient_action : str, default="warn" Action when the FE design matrix is rank-deficient. + survey_design : SurveyDesign, optional + If provided, cell aggregation uses survey-weighted cell means + (matching ``fit(..., survey_design=sd).twfe_*``). Required to preserve + fit-vs-helper parity under survey-backed inputs. Only + ``weight_type='pweight'`` is supported; other types raise ValueError. Returns ------- @@ -4917,6 +5027,23 @@ def twowayfeweights( Object with attributes ``weights`` (DataFrame), ``fraction_negative`` (float), ``sigma_fe`` (float), and ``beta_fe`` (float). """ + # Survey resolution (optional): mirrors the fit() path so that the + # standalone helper produces identical numbers to fit(..., survey_design=sd). + survey_weights = None + if survey_design is not None: + from diff_diff.survey import _resolve_survey_for_fit + + resolved, survey_weights, _, _ = _resolve_survey_for_fit( + survey_design, data, "analytical" + ) + if resolved is not None and resolved.weight_type != "pweight": + raise ValueError( + f"twowayfeweights() survey support requires " + f"weight_type='pweight', got '{resolved.weight_type}'. " + f"The TWFE diagnostic under survey uses survey-weighted cell " + f"means; other weight types are not supported." + ) + # Validation + cell aggregation via the same helper used by # ChaisemartinDHaultfoeuille.fit() — enforces NaN/binary/within-cell # rules from REGISTRY.md so the standalone diagnostic does not @@ -4927,6 +5054,7 @@ def twowayfeweights( group=group, time=time, treatment=treatment, + weights=survey_weights, ) # TWFE diagnostic assumes binary treatment (d_arr == 1 for treated mask). if not set(cell["d_gt"].unique()).issubset({0.0, 1.0, 0, 1}): diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index b69295bc..9653569a 100644 --- a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -597,7 +597,7 @@ Alternative: Multiplier bootstrap clustered at group via the `n_bootstrap` param - **Note:** The TWFE diagnostic (`twfe_diagnostic=True` in `fit()` and the standalone `twowayfeweights()`) requires binary `{0, 1}` treatment. On non-binary data, `fit()` emits a `UserWarning` and skips the diagnostic (all `twfe_*` fields are `None`), while `twowayfeweights()` raises `ValueError`. The diagnostic uses `d_gt == 1` as the treated-cell mask per Theorem 1 of AER 2020, which is undefined for non-binary treatment. -- **Note (TWFE diagnostic sample contract):** The fitted `results.twfe_weights` / `results.twfe_fraction_negative` / `results.twfe_sigma_fe` / `results.twfe_beta_fe` are computed on the **FULL pre-filter cell sample** — the data the user passed in, after `_validate_and_aggregate_to_cells()` runs but **before** the ragged-panel validation (Step 5b) and the multi-switch filter (`drop_larger_lower`, Step 6). They do NOT describe the post-filter estimation sample used by `overall_att`, `results.groups`, and the inference fields. `fit()` has three sample-shaping filters in total: (1) interior-gap drops in Step 5b, (2) multi-switch drops in Step 6, and (3) the singleton-baseline filter in Step 7. Filters (1) and (2) actually shrink the point-estimate sample, so when either fires, the fitted TWFE diagnostic and `overall_att` describe **different samples** and the estimator emits a `UserWarning` explaining the divergence with explicit counts. Filter (3) is **variance-only** — singleton-baseline groups remain in the point-estimate sample as period-based stable controls (see the singleton-baseline Note above) — so it does NOT create a fitted-vs-`overall_att` mismatch and does NOT trigger the divergence warning. Rationale for the pre-filter design: the TWFE diagnostic answers "what would the plain TWFE estimator say on the data you passed in?" — not "what would TWFE say on the data dCDH actually used after filtering?" — so users comparing TWFE vs dCDH on a fixed input can do so without an interaction effect from the dCDH-specific filters. The standalone `twowayfeweights()` function uses the same pre-filter sample, so the fitted and standalone APIs always produce identical numbers on the same input. To reproduce the dCDH estimation sample for an external TWFE comparison, pre-process your data to drop the multi-switch and interior-gap groups before fitting (the warning lists offending IDs). The matching tests are `test_twfe_pre_filter_contract_with_interior_gap_drop` and `test_twfe_pre_filter_contract_with_multi_switch_drop` in `tests/test_chaisemartin_dhaultfoeuille.py`. +- **Note (TWFE diagnostic sample contract):** The fitted `results.twfe_weights` / `results.twfe_fraction_negative` / `results.twfe_sigma_fe` / `results.twfe_beta_fe` are computed on the **FULL pre-filter cell sample** — the data the user passed in, after `_validate_and_aggregate_to_cells()` runs but **before** the ragged-panel validation (Step 5b) and the multi-switch filter (`drop_larger_lower`, Step 6). They do NOT describe the post-filter estimation sample used by `overall_att`, `results.groups`, and the inference fields. `fit()` has three sample-shaping filters in total: (1) interior-gap drops in Step 5b, (2) multi-switch drops in Step 6, and (3) the singleton-baseline filter in Step 7. Filters (1) and (2) actually shrink the point-estimate sample, so when either fires, the fitted TWFE diagnostic and `overall_att` describe **different samples** and the estimator emits a `UserWarning` explaining the divergence with explicit counts. Filter (3) is **variance-only** — singleton-baseline groups remain in the point-estimate sample as period-based stable controls (see the singleton-baseline Note above) — so it does NOT create a fitted-vs-`overall_att` mismatch and does NOT trigger the divergence warning. Rationale for the pre-filter design: the TWFE diagnostic answers "what would the plain TWFE estimator say on the data you passed in?" — not "what would TWFE say on the data dCDH actually used after filtering?" — so users comparing TWFE vs dCDH on a fixed input can do so without an interaction effect from the dCDH-specific filters. The standalone `twowayfeweights()` function uses the same pre-filter sample and accepts the same `survey_design` parameter as `fit()`, so the fitted and standalone APIs always produce identical numbers on the same input — including survey-weighted cell aggregation (`twowayfeweights(data, ..., survey_design=sd)` matches `fit(data, ..., survey_design=sd).twfe_*`). To reproduce the dCDH estimation sample for an external TWFE comparison, pre-process your data to drop the multi-switch and interior-gap groups before fitting (the warning lists offending IDs). The matching tests are `test_twfe_pre_filter_contract_with_interior_gap_drop` and `test_twfe_pre_filter_contract_with_multi_switch_drop` in `tests/test_chaisemartin_dhaultfoeuille.py`. - **Note:** By default (`drop_larger_lower=True`), the estimator drops groups whose treatment switches more than once before estimation. This matches R `DIDmultiplegtDYN`'s default and is required for the analytical variance formula (Web Appendix Section 3.7.3 of the dynamic paper, which assumes Assumption 5 / no-crossing) to be consistent with the AER 2020 Theorem 3 point estimate. Both formulas operate on the same post-drop dataset. Setting `drop_larger_lower=False` is supported for diagnostic comparison but produces an inconsistent estimator-variance pairing for any multi-switch groups present, and emits an explicit warning. @@ -615,7 +615,7 @@ Alternative: Multiplier bootstrap clustered at group via the `n_bootstrap` param - **Note (Phase 3 state-set trends):** Implements state-set-specific trends from Web Appendix Section 1.4 (Assumptions 13-14). Restricts the control pool for each switcher to groups in the same set (e.g., same state in county-level data). The restriction applies in all four DID/IF paths: `_compute_multi_horizon_dids()`, `_compute_per_group_if_multi_horizon()`, `_compute_multi_horizon_placebos()`, and `_compute_per_group_if_placebo_horizon()`. Cohort structure stays as `(D_{g,1}, F_g, S_g)` triples (does not incorporate set membership). Set membership must be time-invariant per group. **Note on Assumption 14 (common support):** The paper requires a common last-untreated period across sets (`T_u^s` equal for all `s`). This implementation does NOT enforce Assumption 14 up front. Instead, when within-set controls are exhausted at a given horizon (because a set has shorter untreated support than others), the affected switcher/horizon pairs are silently excluded via the existing empty-control-pool mechanism. This means `N_l` may be smaller under `trends_nonparam` than without it, and the effective estimand is trimmed to the within-set support at each horizon. The existing multi-horizon A11 warning fires when exclusions occur. Activated via `trends_nonparam="state_column"` in `fit()`. -- **Note (Phase 3 heterogeneity testing - partial implementation):** Partial implementation of the heterogeneity test from Web Appendix Section 1.5 (Assumption 15, Lemma 7). Computes post-treatment saturated OLS regressions of `S_g * (Y_{g, F_g-1+l} - Y_{g, F_g-1})` on a time-invariant covariate `X_g` plus cohort indicator dummies. Standard OLS inference is valid (paper shows no DID error correction needed). **Deviation from R `predict_het`:** R's full `predict_het` option additionally computes placebo regressions and a joint null test, and disallows combination with `controls`. This implementation provides only post-treatment regressions. **Rejected combinations:** `controls` (matching R), `trends_linear` (heterogeneity test uses raw level changes, incompatible with second-differenced outcomes), and `trends_nonparam` (heterogeneity test does not thread state-set control-pool restrictions). Results stored in `results.heterogeneity_effects`. Activated via `heterogeneity="covariate_column"` in `fit()`. +- **Note (Phase 3 heterogeneity testing - partial implementation):** Partial implementation of the heterogeneity test from Web Appendix Section 1.5 (Assumption 15, Lemma 7). Computes post-treatment saturated OLS regressions of `S_g * (Y_{g, F_g-1+l} - Y_{g, F_g-1})` on a time-invariant covariate `X_g` plus cohort indicator dummies. Standard OLS inference is valid (paper shows no DID error correction needed). **Deviation from R `predict_het`:** R's full `predict_het` option additionally computes placebo regressions and a joint null test, and disallows combination with `controls`. This implementation provides only post-treatment regressions. **Rejected combinations:** `controls` (matching R), `trends_linear` (heterogeneity test uses raw level changes, incompatible with second-differenced outcomes), and `trends_nonparam` (heterogeneity test does not thread state-set control-pool restrictions). Results stored in `results.heterogeneity_effects`. Activated via `heterogeneity="covariate_column"` in `fit()`. **Note (survey support):** Under `survey_design`, heterogeneity uses WLS with per-group weights `W_g = sum of obs-level survey weights in group g`, and the standard error is computed via Taylor Series Linearization of the group-level influence function: `ψ_g[X] = inv(X'WX)[1,:] @ x_g * W_g * r_g`, expanded to observation level as `ψ_i = ψ_g * (w_i / W_g)`, then aggregated through `compute_survey_if_variance` for stratified/PSU/FPC variance. Inference uses the t-distribution with `df_survey` when provided. Under rank deficiency (any regression coefficient dropped by `solve_ols`'s R-style drop), all inference fields return NaN (conservative, matches the NaN-consistent contract). **Library extension:** R `DIDmultiplegtDYN::predict_het` does not natively support survey weights. - **Note (HonestDiD integration):** HonestDiD sensitivity analysis (Rambachan & Roth 2023) is available on the placebo + event study surface via `honest_did=True` in `fit()` or `compute_honest_did(results)` post-hoc. **Library extension:** dCDH HonestDiD uses `DID^{pl}_l` placebo estimates as pre-period coefficients rather than standard event-study pre-treatment coefficients. The Rambachan-Roth restrictions bound violations of the parallel trends assumption underlying the dCDH placebo estimand; interpretation differs from canonical event-study HonestDiD. A `UserWarning` is emitted at runtime. Uses diagonal variance (no full VCV available for dCDH). Relative magnitudes (DeltaRM) with Mbar=1.0 is the default when called from `fit()`, targeting the equal-weight average over all post-treatment horizons (`l_vec=None`). R's HonestDiD defaults to the first post/on-impact effect; use `compute_honest_did(results, ...)` with a custom `l_vec` to match that behavior. When `trends_linear=True`, bounds apply to the second-differenced estimand (parallel trends in first differences). Requires `L_max >= 1` for multi-horizon placebos. Gaps in the horizon grid from `trends_nonparam` support-trimming are handled by filtering to the largest consecutive block and warning. @@ -648,8 +648,9 @@ Alternative: Multiplier bootstrap clustered at group via the `n_bootstrap` param - [x] Heterogeneity testing via saturated OLS (Web Appendix Section 1.5, Lemma 7) - [x] Design-2 switch-in/switch-out descriptive wrapper (Web Appendix Section 1.6) - [x] HonestDiD (Rambachan-Roth 2023) integration on placebo + event study surface -- [x] Survey design support: pweight with strata/PSU/FPC via Taylor Series Linearization. Replicate weights and PSU-level bootstrap deferred. +- [x] Survey design support: pweight with strata/PSU/FPC via Taylor Series Linearization, covering the main ATT surface, covariate adjustment (DID^X), heterogeneity testing, the TWFE diagnostic (fit and standalone `twowayfeweights()` helper), and HonestDiD bounds. Replicate weights and PSU-level bootstrap deferred. - **Note:** Survey IF expansion (`psi_i = U[g] * (w_i / W_g)`) is a library extension not in the dCDH papers. The paper's plug-in variance assumes iid sampling; the TSL variance accounts for complex survey design by expanding group-level influence functions to observation level proportionally to survey weights, then applying the standard Binder (1983) stratified PSU variance formula. +- **Note (survey + bootstrap fallback):** When `survey_design` and `n_bootstrap > 0` are both active, the multiplier bootstrap uses group-level Rademacher/Mammen/Webb weights rather than PSU-level resampling. A `UserWarning` is emitted from `fit()`. This is conservative when groups are finer than PSUs; a PSU-level survey bootstrap is deferred to a future release. For design-based analytical variance, the TSL path (non-bootstrap) is the recommended contract. --- diff --git a/tests/test_survey_dcdh.py b/tests/test_survey_dcdh.py index 41b653e4..49a0f39f 100644 --- a/tests/test_survey_dcdh.py +++ b/tests/test_survey_dcdh.py @@ -573,3 +573,155 @@ def test_survey_honest_did_propagates_df(self, data_with_survey): assert df_meta is not None # df_survey must propagate from survey_metadata into HonestDiD result assert r.honest_did_results.df_survey == df_meta + + +# ── Test: Survey-aware heterogeneity ──────────────────────────────── + + +class TestSurveyHeterogeneity: + """Heterogeneity testing under survey_design uses WLS + TSL IF.""" + + def test_uniform_weights_het_matches_unweighted(self, base_data): + """Uniform pweights must yield identical β_het to plain OLS path.""" + df_ = base_data.copy() + df_["pw"] = 1.0 + # time-invariant group-level covariate + rng = np.random.default_rng(0) + groups = sorted(df_["group"].unique()) + het_map = {g: rng.uniform(-1, 1) for g in groups} + df_["x_het"] = df_["group"].map(het_map) + + r_plain = ChaisemartinDHaultfoeuille(seed=1).fit( + df_, + outcome="outcome", group="group", + time="period", treatment="treatment", + L_max=1, heterogeneity="x_het", + ) + sd = SurveyDesign(weights="pw") + r_survey = ChaisemartinDHaultfoeuille(seed=1).fit( + df_, + outcome="outcome", group="group", + time="period", treatment="treatment", + L_max=1, heterogeneity="x_het", survey_design=sd, + ) + assert r_plain.heterogeneity_effects is not None + assert r_survey.heterogeneity_effects is not None + b_plain = r_plain.heterogeneity_effects[1]["beta"] + b_survey = r_survey.heterogeneity_effects[1]["beta"] + # WLS with uniform weights = OLS; β_het must match + if np.isfinite(b_plain) and np.isfinite(b_survey): + assert b_plain == pytest.approx(b_survey, abs=1e-8) + + def test_nonuniform_het_changes_beta(self, base_data): + """Varying pweights change the WLS point estimate vs plain OLS.""" + rng = np.random.default_rng(42) + df_ = base_data.copy() + groups = sorted(df_["group"].unique()) + het_map = {g: rng.uniform(-1, 1) for g in groups} + pw_map = {g: rng.uniform(0.5, 4.0) for g in groups} + df_["x_het"] = df_["group"].map(het_map) + df_["pw"] = df_["group"].map(pw_map) + sd = SurveyDesign(weights="pw") + + r_plain = ChaisemartinDHaultfoeuille(seed=1).fit( + df_, + outcome="outcome", group="group", + time="period", treatment="treatment", + L_max=1, heterogeneity="x_het", + ) + r_survey = ChaisemartinDHaultfoeuille(seed=1).fit( + df_, + outcome="outcome", group="group", + time="period", treatment="treatment", + L_max=1, heterogeneity="x_het", survey_design=sd, + ) + assert r_plain.heterogeneity_effects is not None + assert r_survey.heterogeneity_effects is not None + b_plain = r_plain.heterogeneity_effects[1]["beta"] + b_survey = r_survey.heterogeneity_effects[1]["beta"] + if np.isfinite(b_plain) and np.isfinite(b_survey): + # WLS with non-uniform weights differs from unweighted OLS + assert b_plain != pytest.approx(b_survey, abs=1e-6), ( + f"plain={b_plain} survey={b_survey} should differ with varying weights" + ) + + def test_survey_het_uses_survey_df(self, data_with_survey): + """Reported p_value must match t-distribution inference with df_survey.""" + from scipy import stats + + rng = np.random.default_rng(7) + df_ = data_with_survey.copy() + groups = sorted(df_["group"].unique()) + het_map = {g: rng.uniform(-1, 1) for g in groups} + df_["x_het"] = df_["group"].map(het_map) + sd = SurveyDesign( + weights="pw", strata="stratum", psu="cluster", nest=True + ) + r = ChaisemartinDHaultfoeuille(seed=1).fit( + df_, + outcome="outcome", group="group", + time="period", treatment="treatment", + L_max=1, heterogeneity="x_het", survey_design=sd, + ) + assert r.heterogeneity_effects is not None + entry = r.heterogeneity_effects[1] + if not (np.isfinite(entry["se"]) and entry["se"] > 0): + pytest.skip("heterogeneity SE not estimable on this fixture") + assert r.survey_metadata is not None + df_s = r.survey_metadata.df_survey + assert df_s is not None and df_s > 0 + + t_stat = entry["beta"] / entry["se"] + p_t = 2.0 * (1.0 - stats.t.cdf(abs(t_stat), df=df_s)) + assert entry["p_value"] == pytest.approx(p_t, abs=1e-10) + + +# ── Test: TWFE helper parity under survey ─────────────────────────── + + +class TestSurveyTWFEParity: + """twowayfeweights() with survey_design matches fit().twfe_* under survey.""" + + def test_twfe_helper_matches_fit_under_survey(self, data_with_survey): + """fit and twowayfeweights() produce identical TWFE output under survey.""" + from diff_diff.chaisemartin_dhaultfoeuille import twowayfeweights + + sd = SurveyDesign( + weights="pw", strata="stratum", psu="cluster", nest=True + ) + r = ChaisemartinDHaultfoeuille(seed=1).fit( + data_with_survey, + outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, + ) + helper = twowayfeweights( + data_with_survey, + outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, + ) + # fit() twfe_* may be None if non-binary treatment; this fixture is binary + assert r.twfe_fraction_negative is not None + assert r.twfe_sigma_fe is not None + assert r.twfe_beta_fe is not None + assert r.twfe_fraction_negative == pytest.approx( + helper.fraction_negative, abs=1e-12 + ) + assert r.twfe_sigma_fe == pytest.approx(helper.sigma_fe, abs=1e-12) + assert r.twfe_beta_fe == pytest.approx(helper.beta_fe, abs=1e-12) + + def test_twfe_helper_rejects_non_pweight(self, base_data): + """fweight/aweight must be rejected by twowayfeweights() under survey.""" + from diff_diff.chaisemartin_dhaultfoeuille import twowayfeweights + + df_ = base_data.copy() + df_["pw"] = 1.0 + sd = SurveyDesign(weights="pw", weight_type="fweight") + with pytest.raises(ValueError, match="pweight"): + twowayfeweights( + df_, + outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, + ) From bc9b5e8ceba8f7de20e515e61c6d08ec340b9c9e Mon Sep 17 00:00:00 2001 From: igerber Date: Thu, 16 Apr 2026 18:22:41 -0400 Subject: [PATCH 07/16] Fix CI review R5: survey TWFE math consistency + zero-weight row filter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - P1 #1: _compute_twfe_diagnostic now uses cell_weight (w_gt when available, else n_gt) for FE regressions, the normalization denominator, contribution weights, and the Corollary 1 observation shares. On survey-backed inputs the outputs now match the observation-level pweighted TWFE estimand; non-survey path is byte-identical. - P1 #2: Zero-weight rows are dropped before the groupby in _validate_and_aggregate_to_cells when weights are provided, so that d_min/d_max/n_gt reflect the effective sample. Prevents zero-weight subpopulation rows from tripping the fuzzy-DiD guard or inflating downstream n_gt counts. - P2: 2 new regression tests in test_survey_dcdh.py — TestSurveyTWFEOracle.test_survey_twfe_matches_obs_level_pweighted_ols verifies beta_fe matches an observation-level pweighted OLS under survey (would fail if n_gt was still used), and TestZeroWeightSubpopulation.test_mixed_zero_weight_row_excluded_from_validation verifies an injected zero-weight row with opposite treatment value doesn't trip the within-cell constancy check. All 256 targeted tests pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/chaisemartin_dhaultfoeuille.py | 49 +++++++++--- tests/test_survey_dcdh.py | 94 ++++++++++++++++++++++++ 2 files changed, 131 insertions(+), 12 deletions(-) diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py index d7b38e75..220a0c23 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille.py +++ b/diff_diff/chaisemartin_dhaultfoeuille.py @@ -210,6 +210,18 @@ def _validate_and_aggregate_to_cells( # 5. Cell aggregation (compute min/max for within-cell check) if weights is not None: + # Zero-weight rows are out-of-sample (e.g., via + # SurveyDesign.subpopulation()). Pre-filter them before the + # groupby so that d_min/d_max/n_gt reflect the effective sample + # and a zero-weight obs cannot trip the within-cell treatment- + # constancy check or inflate downstream n_gt counts. + weights_arr = np.asarray(weights, dtype=np.float64) + pos_mask = weights_arr > 0 + if not pos_mask.all(): + df = df.loc[pos_mask].reset_index(drop=True) + weights_arr = weights_arr[pos_mask] + weights = weights_arr + # Survey-weighted cell aggregation. # y_gt = sum(w_i * y_i) / sum(w_i) within each (g, t) cell. # Treatment is constant within cells (checked below), so weighted @@ -4828,7 +4840,16 @@ def _compute_twfe_diagnostic( """ X, _ = _build_group_time_design(cell, group_col, time_col) d_arr = cell["d_gt"].to_numpy().astype(float) - n_arr = cell["n_gt"].to_numpy().astype(float) + # Cell weight for Theorem 1: under survey_design, survey-weighted + # cell totals (w_gt) replace raw cell counts (n_gt) so the FE + # regressions, normalization denominator, and Corollary 1 shares + # match the observation-level pweighted TWFE estimand. Without + # survey_design (w_gt column absent), fall back to n_gt — the + # non-survey path is unchanged. + if "w_gt" in cell.columns: + cell_weight = cell["w_gt"].to_numpy().astype(float) + else: + cell_weight = cell["n_gt"].to_numpy().astype(float) y_arr = cell["y_gt"].to_numpy().astype(float) # Step 1-2: regress d on FE @@ -4837,13 +4858,13 @@ def _compute_twfe_diagnostic( d_arr, return_vcov=False, rank_deficient_action=rank_deficient_action, - weights=n_arr, + weights=cell_weight, ) eps = residuals_d # Step 3: per-cell weights — normalize by sum over treated cells treated_mask = d_arr == 1 - denom = float((n_arr[treated_mask] * eps[treated_mask]).sum()) + denom = float((cell_weight[treated_mask] * eps[treated_mask]).sum()) if denom == 0: # Cannot normalize: the design has zero treated mass after FE absorption. # Warn so the user knows the diagnostic returned NaN values rather than @@ -4866,12 +4887,14 @@ def _compute_twfe_diagnostic( sigma_fe=float("nan"), beta_fe=float("nan"), ) - w_gt = (n_arr * eps) / denom + contribution_weights = (cell_weight * eps) / denom weights_df = cell[[group_col, time_col]].copy() - weights_df["weight"] = w_gt + weights_df["weight"] = contribution_weights - fraction_negative = float((w_gt[treated_mask] < 0).sum() / treated_mask.sum()) + fraction_negative = float( + (contribution_weights[treated_mask] < 0).sum() / treated_mask.sum() + ) # Step 5: plain TWFE regression of y on (FE + d_gt) X_with_d = np.column_stack([X, d_arr.reshape(-1, 1)]) @@ -4880,7 +4903,7 @@ def _compute_twfe_diagnostic( y_arr, return_vcov=False, rank_deficient_action=rank_deficient_action, - weights=n_arr, + weights=cell_weight, ) beta_fe = float(coef_fe[-1]) @@ -4897,12 +4920,14 @@ def _compute_twfe_diagnostic( # sigma(w) = sqrt(sum_treated(s * (w_paper - 1)^2)) # sigma_fe = |beta_fe| / sigma(w) # - # where s_{g,t} = N_{g,t} / N_1 are observation shares. + # where s_{g,t} = N_{g,t} / N_1 are observation shares (under + # survey_design, cell_weight is w_gt so shares are effective- + # weight shares; non-survey path is byte-identical). eps_treated = eps[treated_mask] - n_treated_arr = n_arr[treated_mask] - n1 = float(n_treated_arr.sum()) # total treated observations - if n1 > 0: - shares = n_treated_arr / n1 # s_{g,t} = N_{g,t} / N_1 + w_treated_arr = cell_weight[treated_mask] + w1 = float(w_treated_arr.sum()) # total treated weight (N_1 or W_1) + if w1 > 0: + shares = w_treated_arr / w1 # s_{g,t} = w_{g,t} / w_1 denom_paper = float((shares * eps_treated).sum()) if abs(denom_paper) > 0: w_paper = eps_treated / denom_paper # paper's w_{g,t} diff --git a/tests/test_survey_dcdh.py b/tests/test_survey_dcdh.py index 49a0f39f..56d6ed39 100644 --- a/tests/test_survey_dcdh.py +++ b/tests/test_survey_dcdh.py @@ -725,3 +725,97 @@ def test_twfe_helper_rejects_non_pweight(self, base_data): time="period", treatment="treatment", survey_design=sd, ) + + +# ── Test: TWFE diagnostic oracle under survey ─────────────────────── + + +class TestSurveyTWFEOracle: + """twfe_beta_fe under survey must match an observation-level pweighted + TWFE regression on the same data (proving w_gt is used, not n_gt).""" + + def test_survey_twfe_matches_obs_level_pweighted_ols(self, data_with_survey): + from diff_diff.chaisemartin_dhaultfoeuille import twowayfeweights + from diff_diff.linalg import solve_ols + + sd = SurveyDesign(weights="pw") + helper = twowayfeweights( + data_with_survey, + outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, + ) + assert np.isfinite(helper.beta_fe) + + # Build observation-level TWFE design with group and period FE + # (reference category dropped) and treatment indicator. + df_ = data_with_survey.copy() + groups_u = sorted(df_["group"].unique()) + periods_u = sorted(df_["period"].unique()) + g_map = {g: i for i, g in enumerate(groups_u)} + t_map = {t: i for i, t in enumerate(periods_u)} + g_idx = df_["group"].map(g_map).to_numpy() + t_idx = df_["period"].map(t_map).to_numpy() + n = len(df_) + X_g = np.zeros((n, len(groups_u) - 1)) + X_t = np.zeros((n, len(periods_u) - 1)) + for i in range(n): + if g_idx[i] > 0: + X_g[i, g_idx[i] - 1] = 1.0 + if t_idx[i] > 0: + X_t[i, t_idx[i] - 1] = 1.0 + intercept = np.ones((n, 1)) + treat = df_["treatment"].to_numpy().astype(float).reshape(-1, 1) + X_obs = np.hstack([intercept, X_g, X_t, treat]) + y_obs = df_["outcome"].to_numpy().astype(float) + w_obs = df_["pw"].to_numpy().astype(float) + + coef, _, _ = solve_ols( + X_obs, y_obs, + weights=w_obs, weight_type="pweight", + return_vcov=False, + ) + beta_oracle = float(coef[-1]) + # Point-estimate match (one obs per cell in this fixture; so the + # cell-level WLS with cell_weight == w_gt equals the obs-level + # WLS with w_obs weights). + assert helper.beta_fe == pytest.approx(beta_oracle, rel=1e-6), ( + f"helper.beta_fe={helper.beta_fe} oracle={beta_oracle} " + f"— TWFE diagnostic must use w_gt under survey" + ) + + +# ── Test: Zero-weight subpopulation exclusion ────────────────────── + + +class TestZeroWeightSubpopulation: + """Zero-weight rows must not trip fuzzy-DiD guard or inflate counts.""" + + def test_mixed_zero_weight_row_excluded_from_validation(self, base_data): + """A cell with a positive-weight treated obs and a zero-weight + obs with a different treatment value must fit cleanly — the + zero-weight row is out-of-sample (SurveyDesign.subpopulation()).""" + df_ = base_data.copy() + df_["pw"] = 1.0 + # Pick a treated (g, t) cell. Add a zero-weight row in the same + # cell with the opposite treatment value. Unweighted d_min != d_max + # would trip the fuzzy-DiD guard; pre-filtering zero-weight rows + # must bypass it. + treated_mask = df_["treatment"] == 1 + if not treated_mask.any(): + pytest.skip("no treated row in fixture") + sample = df_[treated_mask].iloc[0].copy() + # Flip treatment on the injected row, give it zero weight + sample["treatment"] = 0 + sample["pw"] = 0.0 + df_ = pd.concat([df_, pd.DataFrame([sample])], ignore_index=True) + sd = SurveyDesign(weights="pw") + + # Must succeed (not raise fuzzy-DiD ValueError) + result = ChaisemartinDHaultfoeuille(seed=1).fit( + df_, + outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, + ) + assert np.isfinite(result.overall_att) From 6bb3422c55f542ef4a9cd051a02313d89384abd8 Mon Sep 17 00:00:00 2001 From: igerber Date: Thu, 16 Apr 2026 18:32:50 -0400 Subject: [PATCH 08/16] Fix CI review R6: P3 doc + replicate-weight parity polish - twowayfeweights() now raises NotImplementedError when the resolved survey design carries replicate weights, matching fit()'s contract. The fit/helper parity promise now holds for unsupported inputs too. - docs/llms-full.txt and ROADMAP.md no longer claim that Phase 3 parameters raise NotImplementedError; both now correctly note that only `aggregate` remains gated. - Added a helper-level regression that pins the replicate-weight rejection, complementing the existing fit()-level test. Co-Authored-By: Claude Opus 4.7 (1M context) --- ROADMAP.md | 2 +- diff_diff/chaisemartin_dhaultfoeuille.py | 10 ++++++++++ diff_diff/guides/llms-full.txt | 2 +- tests/test_survey_dcdh.py | 22 ++++++++++++++++++++++ 4 files changed, 34 insertions(+), 2 deletions(-) diff --git a/ROADMAP.md b/ROADMAP.md index fa631f5e..814c730f 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -191,7 +191,7 @@ These remain in **Future Estimators** below if/when we choose to extend. ### Architectural notes (for plan and PR reviewers) - **Single `ChaisemartinDHaultfoeuille` class** (alias `DCDH`). Not a family. New features land as `fit()` parameters or fields on the results dataclass. No `DCDHDynamic`, `DCDHCovariate`, etc. Matches the library's idiomatic pattern: `CallawaySantAnna`, `ImputationDiD`, and `EfficientDiD` are all single classes that evolved across many phases. -- **Forward-compatible API from Phase 1.** `fit(aggregate=None, controls=None, trends_linear=None, L_max=None, ...)` accepts the Phase 2/3 parameters from day one and raises `NotImplementedError` with a clear pointer to the relevant phase until they are implemented. No signature changes between phases. +- **Forward-compatible API from Phase 1.** `fit(aggregate=None, controls=None, trends_linear=None, L_max=None, ...)` accepts the Phase 2/3 parameters from day one and raised `NotImplementedError` with a clear pointer to the relevant phase until they were implemented. As of the dCDH work, Phase 2, Phase 3, and `survey_design` are all live; only `aggregate` remains gated with `NotImplementedError`. No signature changes between phases. - **Conservative CI** under Assumption 8 (independent groups), exact only under iid sampling. Documented in REGISTRY.md as a `**Note:**` deviation from "default nominal coverage." Theorem 1 of the dynamic paper. - **Cohort recentering for variance is essential.** Cohorts are defined by the triple `(D_{g,1}, F_g, S_g)`. The plug-in variance subtracts cohort-conditional means, **NOT a single grand mean**. Test fixtures must catch this — a wrong implementation silently produces a smaller, incorrect variance. - **No Rust acceleration is planned for any phase.** The estimator's hot path is groupby + BLAS-accelerated matrix-vector products, where NumPy already operates near-optimally. If profiling on large panels (`G > 100K`) reveals a bottleneck post-ship, the existing `_rust_bootstrap_weights` helper can be reused for the bootstrap loop without writing new Rust code. diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py index 220a0c23..af38cf57 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille.py +++ b/diff_diff/chaisemartin_dhaultfoeuille.py @@ -5068,6 +5068,16 @@ def twowayfeweights( f"The TWFE diagnostic under survey uses survey-weighted cell " f"means; other weight types are not supported." ) + if ( + resolved is not None + and resolved.replicate_weights is not None + and resolved.replicate_weights.shape[1] > 0 + ): + raise NotImplementedError( + "Replicate weight variance for twowayfeweights() is not " + "supported. Use strata/PSU/FPC for design-based inference " + "via Taylor Series Linearization (matches the fit() path)." + ) # Validation + cell aggregation via the same helper used by # ChaisemartinDHaultfoeuille.fit() — enforces NaN/binary/within-cell diff --git a/diff_diff/guides/llms-full.txt b/diff_diff/guides/llms-full.txt index 012d5165..a9dd2f58 100644 --- a/diff_diff/guides/llms-full.txt +++ b/diff_diff/guides/llms-full.txt @@ -270,7 +270,7 @@ est.fit( ) -> ChaisemartinDHaultfoeuilleResults ``` -`L_max` controls multi-horizon computation. Phase 3 parameters raise `NotImplementedError`. +`L_max` controls multi-horizon computation. Phase 3 parameters (`controls`, `trends_linear`, `trends_nonparam`, `honest_did`, `heterogeneity`, `design2`) and `survey_design` are implemented; only `aggregate` remains gated with `NotImplementedError`. **Usage:** diff --git a/tests/test_survey_dcdh.py b/tests/test_survey_dcdh.py index 56d6ed39..b23f3e75 100644 --- a/tests/test_survey_dcdh.py +++ b/tests/test_survey_dcdh.py @@ -726,6 +726,28 @@ def test_twfe_helper_rejects_non_pweight(self, base_data): survey_design=sd, ) + def test_twfe_helper_rejects_replicate_weights(self, base_data): + """Replicate-weight survey designs must be rejected by the helper, + matching fit()'s NotImplementedError contract.""" + from diff_diff.chaisemartin_dhaultfoeuille import twowayfeweights + + df_ = base_data.copy() + df_["pw"] = 1.0 + df_["rep1"] = 1.0 + df_["rep2"] = 1.0 + sd = SurveyDesign( + weights="pw", + replicate_weights=["rep1", "rep2"], + replicate_method="BRR", + ) + with pytest.raises(NotImplementedError, match="Replicate"): + twowayfeweights( + df_, + outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, + ) + # ── Test: TWFE diagnostic oracle under survey ─────────────────────── From 5b2939cb61bf44708321ea3f7916207f4757c7dc Mon Sep 17 00:00:00 2001 From: igerber Date: Thu, 16 Apr 2026 18:42:17 -0400 Subject: [PATCH 09/16] Fix CI review R7: update survey_metadata docstring The results dataclass docstring still described survey_metadata as 'always None' and survey integration as deferred. survey_design is now implemented and results.survey_metadata is populated whenever a SurveyDesign is passed to fit(). Docstring now describes the populated field and its role in survey-aware inference and HonestDiD propagation. Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/chaisemartin_dhaultfoeuille_results.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/diff_diff/chaisemartin_dhaultfoeuille_results.py b/diff_diff/chaisemartin_dhaultfoeuille_results.py index 153cc7fc..801624fd 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille_results.py +++ b/diff_diff/chaisemartin_dhaultfoeuille_results.py @@ -342,8 +342,13 @@ class ChaisemartinDHaultfoeuilleResults: ``compute_honest_did(results)`` post-hoc. Contains identified set bounds, robust confidence intervals, and breakdown analysis. survey_metadata : Any, optional - Always ``None`` in Phase 1 — survey integration is deferred to a - separate effort after all phases ship. + Populated when ``fit(..., survey_design=sd)`` is called; ``None`` + otherwise. Carries the resolved survey design summary + (``weight_type``, strata/PSU counts, ``df_survey``, weight range, + and replicate-method info when applicable). ``df_survey`` is + threaded into survey-aware inference (t-distribution at all + analytical surfaces) and consumed by ``compute_honest_did()`` to + produce survey-aware critical values. bootstrap_results : DCDHBootstrapResults, optional Bootstrap inference results when ``n_bootstrap > 0``. """ From de8ff5e832903619860eb3f28fde7250686140a2 Mon Sep 17 00:00:00 2001 From: igerber Date: Thu, 16 Apr 2026 19:38:49 -0400 Subject: [PATCH 10/16] Fix CI review R8: extend zero-weight contract to all validators + survey branch tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - P1 #1: The R5 zero-weight filter only ran inside the cell aggregation step, after the NaN/coercion checks for group/time/treatment/outcome. Moved the filter to the very top of _validate_and_aggregate_to_cells so validation only sees the effective sample. fit()'s controls, trends_nonparam, and heterogeneity blocks now also scope their NaN/time-invariance checks to positive-weight rows when survey_weights is active. Legitimate SurveyDesign.subpopulation() inputs with NaN in excluded rows now fit cleanly. TSL variance path is unchanged (zero-weight obs still contribute zero psi). - P2: 5 new regression tests in test_survey_dcdh.py — TestZeroWeightSubpopulation now covers NaN outcome and NaN het columns in excluded rows; new TestSurveyTrendsLinear / TestSurveyTrendsNonparam / TestSurveyDesign2 classes exercise survey_design combined with those previously-untested branches. All 262 targeted tests pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/chaisemartin_dhaultfoeuille.py | 70 +++++++---- tests/test_survey_dcdh.py | 147 +++++++++++++++++++++++ 2 files changed, 194 insertions(+), 23 deletions(-) diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py index af38cf57..548c9f93 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille.py +++ b/diff_diff/chaisemartin_dhaultfoeuille.py @@ -157,6 +157,16 @@ def _validate_and_aggregate_to_cells( df = data.copy() + # 1a. SurveyDesign.subpopulation() contract: zero-weight rows are + # out-of-sample. Pre-filter them *before* any NaN/coercion validation + # so that invalid values in excluded rows do not abort the fit. + if weights is not None: + weights_arr = np.asarray(weights, dtype=np.float64) + pos_mask = weights_arr > 0 + if not pos_mask.all(): + df = df.loc[pos_mask].reset_index(drop=True) + weights = weights_arr[pos_mask] + # 1b. Group and time NaN checks (before groupby, which silently drops NaN keys) n_nan_group = int(df[group].isna().sum()) if n_nan_group > 0: @@ -210,19 +220,8 @@ def _validate_and_aggregate_to_cells( # 5. Cell aggregation (compute min/max for within-cell check) if weights is not None: - # Zero-weight rows are out-of-sample (e.g., via - # SurveyDesign.subpopulation()). Pre-filter them before the - # groupby so that d_min/d_max/n_gt reflect the effective sample - # and a zero-weight obs cannot trip the within-cell treatment- - # constancy check or inflate downstream n_gt counts. - weights_arr = np.asarray(weights, dtype=np.float64) - pos_mask = weights_arr > 0 - if not pos_mask.all(): - df = df.loc[pos_mask].reset_index(drop=True) - weights_arr = weights_arr[pos_mask] - weights = weights_arr - - # Survey-weighted cell aggregation. + # Survey-weighted cell aggregation (zero-weight rows already + # filtered upstream at step 1a). # y_gt = sum(w_i * y_i) / sum(w_i) within each (g, t) cell. # Treatment is constant within cells (checked below), so weighted # and unweighted means are identical for d_gt. @@ -730,8 +729,17 @@ def fit( f"Control column(s) {missing_controls!r} not found in " f"data. Available columns: {list(data.columns)}" ) - # Work on a copy to avoid mutating the caller's DataFrame - data_controls = data[controls].copy() + # SurveyDesign.subpopulation() contract: zero-weight rows are + # out-of-sample. Scope NaN/Inf validation to positive-weight + # rows so that excluded obs with missing covariates do not + # abort the fit. The downstream weighted aggregation + # (sum(w*x)/sum(w)) handles zero-weight rows correctly on + # its own. + if survey_weights is not None: + pos_mask_ctrl = np.asarray(survey_weights) > 0 + data_controls = data.loc[pos_mask_ctrl, controls].copy() + else: + data_controls = data[controls].copy() for c in controls: try: data_controls[c] = pd.to_numeric(data_controls[c]) @@ -1196,8 +1204,16 @@ def fit( f"trends_nonparam column {set_col!r} not found in " f"data. Available columns: {list(data.columns)}" ) - # Reject NaN/missing set assignments - n_na_set = int(data[set_col].isna().sum()) + # SurveyDesign.subpopulation() contract: scope NaN and + # time-invariance validation to positive-weight rows so + # excluded obs with missing set IDs do not abort the fit. + if survey_weights is not None: + pos_mask_tnp = np.asarray(survey_weights) > 0 + data_tnp = data.loc[pos_mask_tnp] + else: + data_tnp = data + # Reject NaN/missing set assignments (effective sample only) + n_na_set = int(data_tnp[set_col].isna().sum()) if n_na_set > 0: raise ValueError( f"trends_nonparam column {set_col!r} contains " @@ -1205,7 +1221,7 @@ def fit( f"have a valid set assignment." ) # Aggregate set membership per group (must be time-invariant) - set_per_group = data.groupby(group)[set_col].nunique() + set_per_group = data_tnp.groupby(group)[set_col].nunique() time_varying = set_per_group[set_per_group > 1] if len(time_varying) > 0: raise ValueError( @@ -1217,7 +1233,7 @@ def fit( # Set partition must be coarser than group (multiple groups # per set). A group-level partition creates singleton sets # with no within-set controls available. - set_map_check = data.groupby(group)[set_col].first() + set_map_check = data_tnp.groupby(group)[set_col].first() n_sets = set_map_check.nunique() n_groups_total = len(set_map_check) if n_sets >= n_groups_total: @@ -1229,7 +1245,7 @@ def fit( f"within-set controls." ) # Extract set membership per group aligned with all_groups - set_map = data.groupby(group)[set_col].first() + set_map = data_tnp.groupby(group)[set_col].first() set_ids_arr = np.array( [set_map.loc[g] for g in all_groups], dtype=object ) @@ -2376,8 +2392,16 @@ def fit( "control-pool restrictions; the results would be " "inconsistent with the fitted estimator." ) - # Extract per-group covariate (must be time-invariant) - het_per_group = data.groupby(group)[het_col].nunique() + # Extract per-group covariate (must be time-invariant). + # SurveyDesign.subpopulation() contract: scope time-invariance + # check to positive-weight rows so excluded obs with NaN/varying + # het values do not abort the fit. + if survey_weights is not None: + pos_mask_het = np.asarray(survey_weights) > 0 + data_het = data.loc[pos_mask_het] + else: + data_het = data + het_per_group = data_het.groupby(group)[het_col].nunique() het_varying = het_per_group[het_per_group > 1] if len(het_varying) > 0: raise ValueError( @@ -2385,7 +2409,7 @@ def fit( f"time-invariant within each group. " f"{len(het_varying)} group(s) have varying values." ) - het_map = data.groupby(group)[het_col].first() + het_map = data_het.groupby(group)[het_col].first() X_het = np.array( [float(het_map.loc[g]) for g in all_groups] ) diff --git a/tests/test_survey_dcdh.py b/tests/test_survey_dcdh.py index b23f3e75..c8afbd28 100644 --- a/tests/test_survey_dcdh.py +++ b/tests/test_survey_dcdh.py @@ -841,3 +841,150 @@ def test_mixed_zero_weight_row_excluded_from_validation(self, base_data): survey_design=sd, ) assert np.isfinite(result.overall_att) + + def test_zero_weight_row_with_nan_outcome(self, base_data): + """A zero-weight row with NaN outcome must not trip the outcome + NaN validator. SurveyDesign.subpopulation() contract.""" + df_ = base_data.copy() + df_["pw"] = 1.0 + sample = df_.iloc[0].copy() + sample["outcome"] = np.nan + sample["pw"] = 0.0 + df_ = pd.concat([df_, pd.DataFrame([sample])], ignore_index=True) + sd = SurveyDesign(weights="pw") + # Must succeed — zero-weight row with NaN outcome is out-of-sample + result = ChaisemartinDHaultfoeuille(seed=1).fit( + df_, + outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, + ) + assert np.isfinite(result.overall_att) + + def test_zero_weight_row_with_nan_heterogeneity(self, base_data): + """A zero-weight row with NaN in the heterogeneity column must + not trip the heterogeneity time-invariance validator.""" + rng = np.random.default_rng(0) + df_ = base_data.copy() + df_["pw"] = 1.0 + groups = sorted(df_["group"].unique()) + het_map = {g: rng.uniform(-1, 1) for g in groups} + df_["x_het"] = df_["group"].map(het_map) + # Inject a zero-weight row with NaN het value for an existing group + sample = df_.iloc[0].copy() + sample["x_het"] = np.nan + sample["pw"] = 0.0 + df_ = pd.concat([df_, pd.DataFrame([sample])], ignore_index=True) + sd = SurveyDesign(weights="pw") + # Must succeed — zero-weight row with NaN het is out-of-sample + result = ChaisemartinDHaultfoeuille(seed=1).fit( + df_, + outcome="outcome", group="group", + time="period", treatment="treatment", + L_max=1, heterogeneity="x_het", survey_design=sd, + ) + assert result.heterogeneity_effects is not None + + +# ── Test: Survey + trends_linear ──────────────────────────────────── + + +class TestSurveyTrendsLinear: + """Survey-backed trends_linear fit must populate linear_trends_effects.""" + + def test_survey_trends_linear_runs(self, data_with_survey): + sd = SurveyDesign(weights="pw") + r = ChaisemartinDHaultfoeuille(seed=1).fit( + data_with_survey, + outcome="outcome", group="group", + time="period", treatment="treatment", + L_max=2, trends_linear=True, survey_design=sd, + ) + assert r.survey_metadata is not None + # linear_trends_effects populated per REGISTRY line 614 contract + assert r.linear_trends_effects is not None + # At least one horizon should be estimable with finite value + finite_horizons = [ + h for h, entry in r.linear_trends_effects.items() + if np.isfinite(entry.get("effect", np.nan)) + ] + assert len(finite_horizons) > 0, ( + "expected at least one horizon with finite linear_trends_effect" + ) + + +# ── Test: Survey + trends_nonparam ────────────────────────────────── + + +class TestSurveyTrendsNonparam: + """Survey-backed trends_nonparam fit must thread set-restrictions.""" + + def test_survey_trends_nonparam_runs(self, data_with_survey): + # Reuse stratum as set ID (time-invariant per group) + sd = SurveyDesign(weights="pw") + r = ChaisemartinDHaultfoeuille(seed=1).fit( + data_with_survey, + outcome="outcome", group="group", + time="period", treatment="treatment", + L_max=2, trends_nonparam="stratum", survey_design=sd, + ) + assert r.survey_metadata is not None + assert r.event_study_effects is not None + # Support trimming may reduce counts but at least one finite-SE + # horizon should remain on this fixture. + finite_ses = [ + entry + for entry in r.event_study_effects.values() + if np.isfinite(entry.get("se", np.nan)) + ] + assert len(finite_ses) > 0, ( + "expected at least one event-study horizon with finite SE " + "under trends_nonparam + survey" + ) + + +# ── Test: Survey + design2 ────────────────────────────────────────── + + +class TestSurveyDesign2: + """Survey-backed design2 fit must populate design2_effects.""" + + @staticmethod + def _make_join_then_leave_panel(seed=42, n_groups=30, n_periods=8): + """Panel with join-then-leave (Design-2) groups, matching the + existing design2 fixture in test_chaisemartin_dhaultfoeuille.py.""" + rng = np.random.RandomState(seed) + rows = [] + for g in range(n_groups): + group_fe = rng.normal(0, 2) + for t in range(n_periods): + if g < 10: + d = 1 if 2 <= t < 5 else 0 + elif g < 20: + d = 1 if t >= 3 else 0 + else: + d = 0 + y = group_fe + 2.0 * t + 5.0 * d + rng.normal(0, 0.3) + rows.append( + {"group": g, "period": t, "treatment": d, "outcome": y, "pw": 1.0} + ) + return pd.DataFrame(rows) + + def test_survey_design2_runs(self): + df_ = self._make_join_then_leave_panel() + sd = SurveyDesign(weights="pw") + # drop_larger_lower=False keeps the 2-switch groups + r = ChaisemartinDHaultfoeuille( + seed=1, drop_larger_lower=False + ).fit( + df_, + outcome="outcome", group="group", + time="period", treatment="treatment", + L_max=1, design2=True, survey_design=sd, + ) + assert r.survey_metadata is not None + assert r.design2_effects is not None + assert r.design2_effects["n_design2_groups"] == 10 + # switch_in and switch_out mean effects should be finite + assert np.isfinite(r.design2_effects["switch_in"]["mean_effect"]) + assert np.isfinite(r.design2_effects["switch_out"]["mean_effect"]) From b3760436398373c596101984d0031b3fc29885dc Mon Sep 17 00:00:00 2001 From: igerber Date: Thu, 16 Apr 2026 20:46:23 -0400 Subject: [PATCH 11/16] Fix CI review R9: align controls aggregation to effective sample MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit R8's controls-block fix scoped NaN/Inf validation to the positive-weight subset (shorter data_controls) but then assigned those shorter arrays into an x_agg_input built from the full-length frame, causing a length-mismatch on any SurveyDesign.subpopulation() / zero-weight excluded row before covariate aggregation could run. Root-caused fix: derive both the validation frame AND the aggregation frame from the same positive-weight effective sample (data_eff, survey_weights_eff). Zero-weight rows are genuinely out-of-sample throughout the DID^X path now. Non-survey fits unchanged. Added TestZeroWeightSubpopulation.test_zero_weight_row_with_nan_control pinning the subpopulation contract for the DID^X path — injects a zero-weight row with NaN control value and asserts fit() succeeds. All 263 targeted tests pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/chaisemartin_dhaultfoeuille.py | 27 +++++++++++++----------- tests/test_survey_dcdh.py | 22 +++++++++++++++++++ 2 files changed, 37 insertions(+), 12 deletions(-) diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py index 548c9f93..6bce4f8e 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille.py +++ b/diff_diff/chaisemartin_dhaultfoeuille.py @@ -730,16 +730,18 @@ def fit( f"data. Available columns: {list(data.columns)}" ) # SurveyDesign.subpopulation() contract: zero-weight rows are - # out-of-sample. Scope NaN/Inf validation to positive-weight - # rows so that excluded obs with missing covariates do not - # abort the fit. The downstream weighted aggregation - # (sum(w*x)/sum(w)) handles zero-weight rows correctly on - # its own. + # out-of-sample. Scope BOTH validation and aggregation to the + # positive-weight subset so excluded rows with missing/invalid + # covariates do not abort the fit and cell aggregation aligns + # with the effective sample used by _validate_and_aggregate_to_cells. if survey_weights is not None: pos_mask_ctrl = np.asarray(survey_weights) > 0 - data_controls = data.loc[pos_mask_ctrl, controls].copy() + data_eff = data.loc[pos_mask_ctrl] + survey_weights_eff = np.asarray(survey_weights)[pos_mask_ctrl] else: - data_controls = data[controls].copy() + data_eff = data + survey_weights_eff = None + data_controls = data_eff[controls].copy() for c in controls: try: data_controls[c] = pd.to_numeric(data_controls[c]) @@ -760,14 +762,15 @@ def fit( "Remove or replace non-finite covariates before fitting." ) # Aggregate covariates to cell means (same groupby as treatment/outcome). - # Use the coerced copy joined with group/time from original data. - x_agg_input = data[[group, time]].copy() + # Build x_agg_input from the same effective-sample frame so rows + # align with data_controls. + x_agg_input = data_eff[[group, time]].copy() x_agg_input[controls] = data_controls[controls].values - if survey_weights is not None: + if survey_weights_eff is not None: # Survey-weighted covariate cell means: sum(w*x)/sum(w) - x_agg_input["_w_"] = survey_weights + x_agg_input["_w_"] = survey_weights_eff for c in controls: - x_agg_input[f"_wx_{c}"] = survey_weights * x_agg_input[c].values + x_agg_input[f"_wx_{c}"] = survey_weights_eff * x_agg_input[c].values wx_cols = [f"_wx_{c}" for c in controls] g_agg = x_agg_input.groupby([group, time], as_index=False).agg( {**{wc: "sum" for wc in wx_cols}, "_w_": "sum"} diff --git a/tests/test_survey_dcdh.py b/tests/test_survey_dcdh.py index c8afbd28..0800a3bf 100644 --- a/tests/test_survey_dcdh.py +++ b/tests/test_survey_dcdh.py @@ -861,6 +861,28 @@ def test_zero_weight_row_with_nan_outcome(self, base_data): ) assert np.isfinite(result.overall_att) + def test_zero_weight_row_with_nan_control(self, base_data): + """A zero-weight row with NaN in a control column must not abort + the DID^X path, and the covariate cell aggregation must use only + positive-weight rows (no length-mismatch error).""" + rng = np.random.default_rng(13) + df_ = base_data.copy() + df_["pw"] = 1.0 + df_["x"] = rng.normal(0, 1, size=len(df_)) + # Inject a zero-weight row with NaN control value + sample = df_.iloc[0].copy() + sample["x"] = np.nan + sample["pw"] = 0.0 + df_ = pd.concat([df_, pd.DataFrame([sample])], ignore_index=True) + sd = SurveyDesign(weights="pw") + result = ChaisemartinDHaultfoeuille(seed=1).fit( + df_, + outcome="outcome", group="group", + time="period", treatment="treatment", + L_max=1, controls=["x"], survey_design=sd, + ) + assert np.isfinite(result.overall_att) + def test_zero_weight_row_with_nan_heterogeneity(self, base_data): """A zero-weight row with NaN in the heterogeneity column must not trip the heterogeneity time-invariance validator.""" From d0459878da098c4467398f68846bda0413359e7d Mon Sep 17 00:00:00 2001 From: igerber Date: Fri, 17 Apr 2026 06:10:34 -0400 Subject: [PATCH 12/16] Update bundled LLM guides for dCDH survey + SDID jackknife MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - diff_diff/guides/llms-practitioner.txt: add reversible-treatment branch to the Step 4 decision tree pointing at ChaisemartinDHaultfoeuille (alias DCDH), and a short Step 5 example showing the survey_design + L_max + TSL fallback for dCDH. The practitioner guide previously had zero dCDH coverage despite the full API being live. - diff_diff/guides/llms-full.txt: fix stale SyntheticDiDResults table (line 1024) — variance_method now accepts "bootstrap", "jackknife", or "placebo" (jackknife was added for synthdid::vcov parity but the table only listed bootstrap/placebo). Bundled-guide paths only. The old docs/llms-*.txt were moved to diff_diff/guides/ in main (PR #310); these edits target the new canonical location consumed by get_llm_guide(). Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/guides/llms-full.txt | 2 +- diff_diff/guides/llms-practitioner.txt | 27 ++++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/diff_diff/guides/llms-full.txt b/diff_diff/guides/llms-full.txt index a9dd2f58..7e6ef60b 100644 --- a/diff_diff/guides/llms-full.txt +++ b/diff_diff/guides/llms-full.txt @@ -1021,7 +1021,7 @@ Returned by `SyntheticDiD.fit()`. | `time_weights` | `dict` | Pre-treatment time weights | | `pre_periods` | `list` | Pre-treatment periods | | `post_periods` | `list` | Post-treatment periods | -| `variance_method` | `str` | "bootstrap" or "placebo" | +| `variance_method` | `str` | "bootstrap", "jackknife", or "placebo" | | `noise_level` | `float` | Estimated noise level | | `zeta_omega` | `float` | Unit weight regularization | | `zeta_lambda` | `float` | Time weight regularization | diff --git a/diff_diff/guides/llms-practitioner.txt b/diff_diff/guides/llms-practitioner.txt index 8d487fcc..6680d800 100644 --- a/diff_diff/guides/llms-practitioner.txt +++ b/diff_diff/guides/llms-practitioner.txt @@ -173,6 +173,12 @@ Is treatment adoption staggered (multiple cohorts, different timing)? |-- NO, simple 2x2 design: | \-- DifferenceInDifferences (DiD) | +|-- Treatment switches ON and OFF (reversible / non-absorbing)? +| \-- ChaisemartinDHaultfoeuille (dCDH / alias `DCDH`) +| -- Only library estimator for non-absorbing treatments; supports +| L_max multi-horizon, dynamic placebos, cost-benefit delta, +| HonestDiD, and `survey_design=` (pweight + strata/PSU/FPC via TSL) +| |-- Few treated units (< 20)? | \-- SyntheticDiD (SDiD) -- synthetic control + DiD hybrid | @@ -260,6 +266,27 @@ results = es.fit(data, outcome='y', unit='unit_id', time='period', print(results.summary()) ``` +### Reversible (non-absorbing) treatment with survey design +Use `ChaisemartinDHaultfoeuille` (dCDH) when treatment switches ON and OFF. +Pass `survey_design=SurveyDesign(...)` for design-based inference via Taylor +Series Linearization. Only `weight_type='pweight'` is supported; replicate +weights are deferred. When combined with `n_bootstrap > 0`, dCDH emits a +`UserWarning` and falls back to group-level multiplier bootstrap — prefer +the analytical TSL path for survey-aware inference. + +```python +from diff_diff import ChaisemartinDHaultfoeuille, SurveyDesign + +sd = SurveyDesign(weights='pw', strata='stratum', psu='cluster', nest=True) +results = ChaisemartinDHaultfoeuille().fit( + data, outcome='y', group='unit_id', time='period', + treatment='treated', + L_max=3, # multi-horizon event study + survey_design=sd, # survey-aware analytical SE (TSL) +) +print(results.summary()) +``` + --- ## Step 6: Sensitivity Analysis From 0266d59ac66cb2e947fee3929121353a830aad38 Mon Sep 17 00:00:00 2001 From: igerber Date: Fri, 17 Apr 2026 07:03:59 -0400 Subject: [PATCH 13/16] Fix CI review R10: within-group-constant strata/PSU + sanitize SE helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - P1 #1/#2: Add _validate_group_constant_strata_psu() helper and call it from fit() after the weight_type/replicate-weights checks. The dCDH IF expansion psi_i = U[g] * (w_i / W_g) treats each group as the effective sampling unit; when strata or PSU vary within group it silently spreads horizon-specific IF mass across observations in different PSUs, contaminating the stratified-PSU variance. Walk back the overstated claim at the old line 669 comment to match. Within- group-varying weights remain supported. - P1 #3: _survey_se_from_group_if now filters zero-weight rows before np.unique/np.bincount so NaN / non-comparable group IDs on excluded subpopulation rows cannot crash SE factorization. psi stays full- length with zeros in excluded positions to preserve alignment with resolved.strata / resolved.psu inside compute_survey_if_variance. - REGISTRY.md line 652 Note updated: explicitly states the within-group-constant strata/PSU requirement and the within-group-varying weights support. - Tests: new TestSurveyWithinGroupValidation class (4 tests — rejects varying PSU, rejects varying strata, accepts varying weights, and ignores zero-weight rows during the constancy check) plus TestZeroWeightSubpopulation.test_zero_weight_row_with_nan_group_id. All 268 targeted tests pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/chaisemartin_dhaultfoeuille.py | 100 ++++++++++++++++++++--- docs/methodology/REGISTRY.md | 2 +- tests/test_survey_dcdh.py | 95 +++++++++++++++++++++ 3 files changed, 185 insertions(+), 12 deletions(-) diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py index 6bce4f8e..80999bd3 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille.py +++ b/diff_diff/chaisemartin_dhaultfoeuille.py @@ -666,9 +666,15 @@ def fit( "Use strata/PSU/FPC for design-based inference via Taylor " "Series Linearization." ) - # No group-constant survey validation: the IF expansion - # psi_i = U[g] * (w_i / W_g) handles observation-level - # variation in weights, strata, and PSU within groups. + # Within-group-constant PSU/strata is required: the IF + # expansion psi_i = U[g] * (w_i / W_g) supports within-group + # variation in WEIGHTS (each obs contributes proportionally), + # but PSU and strata must be constant within group — the + # group is treated as the effective sampling unit for the + # Binder stratified-PSU variance formula. + _validate_group_constant_strata_psu( + resolved_survey, data, group, survey_weights, + ) # Design-2 precondition: requires drop_larger_lower=False if design2 and self.drop_larger_lower: @@ -4696,6 +4702,61 @@ def _plugin_se(U_centered: np.ndarray, divisor: int) -> float: return float(np.sqrt(sigma_hat_sq) / np.sqrt(divisor)) +def _validate_group_constant_strata_psu( + resolved: Any, + data: pd.DataFrame, + group_col: str, + survey_weights: Optional[np.ndarray], +) -> None: + """Reject survey designs where strata or PSU vary within group. + + The dCDH IF expansion ``psi_i = U[g] * (w_i / W_g)`` treats the + group as the effective sampling unit for design-based variance. + When strata or PSU vary within group, the expansion silently spreads + horizon-specific IF mass onto observations whose survey stratum or + PSU differs from the rest of the group, contaminating the + stratified-PSU variance. Reject those designs with a clear error. + + Zero-weight rows are excluded from the check (subpopulation + contract — an excluded row with a different stratum/PSU label does + not actually participate in the variance). + """ + if resolved is None: + return + pos_mask = np.asarray(survey_weights) > 0 + g_eff = np.asarray(data[group_col].values)[pos_mask] + if resolved.strata is not None: + s_eff = np.asarray(resolved.strata)[pos_mask] + df_s = pd.DataFrame({"g": g_eff, "s": s_eff}) + varying = df_s.groupby("g")["s"].nunique() + bad = varying[varying > 1] + if len(bad) > 0: + raise ValueError( + f"ChaisemartinDHaultfoeuille survey support requires " + f"strata to be constant within group, but " + f"{len(bad)} group(s) have multiple strata " + f"(examples: {bad.index.tolist()[:5]}). The IF " + f"expansion psi_i = U[g] * (w_i / W_g) treats the " + f"group as the effective sampling unit for stratified " + f"design-based variance." + ) + if resolved.psu is not None: + p_eff = np.asarray(resolved.psu)[pos_mask] + df_p = pd.DataFrame({"g": g_eff, "p": p_eff}) + varying = df_p.groupby("g")["p"].nunique() + bad = varying[varying > 1] + if len(bad) > 0: + raise ValueError( + f"ChaisemartinDHaultfoeuille survey support requires " + f"PSU to be constant within group, but " + f"{len(bad)} group(s) have multiple PSUs " + f"(examples: {bad.index.tolist()[:5]}). The IF " + f"expansion psi_i = U[g] * (w_i / W_g) treats the " + f"group as the effective sampling unit for stratified " + f"design-based variance." + ) + + def _compute_se( U_centered: np.ndarray, divisor: int, @@ -4762,20 +4823,37 @@ def _survey_se_from_group_if( weights = obs_survey_info["weights"] resolved = obs_survey_info["resolved"] + # Zero-weight rows are out-of-sample (SurveyDesign.subpopulation()). + # Skip them before the group-ID factorization so NaN / non-comparable + # group IDs on excluded rows cannot crash np.unique. psi stays full- + # length with zeros in excluded positions so the alignment with + # resolved.strata / resolved.psu inside compute_survey_if_variance + # is preserved. + weights_arr = np.asarray(weights, dtype=np.float64) + pos_mask = weights_arr > 0 + n_obs = len(weights_arr) + psi = np.zeros(n_obs, dtype=np.float64) + + if not pos_mask.any(): + return float("nan") + + gids_eff = np.asarray(group_ids)[pos_mask] + w_eff = weights_arr[pos_mask] + # Build group → U_centered lookup (vectorized via factorization) group_to_u = {gid: U_centered[idx] for idx, gid in enumerate(eligible_groups)} - # Map group IFs to observation level - u_obs = np.array([group_to_u.get(gid, 0.0) for gid in group_ids]) + # Map group IFs to observation level (effective sample only) + u_obs_eff = np.array([group_to_u.get(gid, 0.0) for gid in gids_eff]) - # Compute per-group weight totals W_g via bincount - unique_gids, inverse = np.unique(group_ids, return_inverse=True) - w_totals_per_group = np.bincount(inverse, weights=weights) - w_obs_total = w_totals_per_group[inverse] + # Compute per-group weight totals W_g via bincount on effective sample + unique_gids, inverse = np.unique(gids_eff, return_inverse=True) + w_totals_per_group = np.bincount(inverse, weights=w_eff) + w_obs_total_eff = w_totals_per_group[inverse] # Expand to observation level: psi_i = U[g] * (w_i / W_g) - safe_w = np.where(w_obs_total > 0, w_obs_total, 1.0) - psi = u_obs * (weights / safe_w) + safe_w = np.where(w_obs_total_eff > 0, w_obs_total_eff, 1.0) + psi[pos_mask] = u_obs_eff * (w_eff / safe_w) variance = compute_survey_if_variance(psi, resolved) if not np.isfinite(variance) or variance < 0: diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index 9653569a..f5eba94a 100644 --- a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -649,7 +649,7 @@ Alternative: Multiplier bootstrap clustered at group via the `n_bootstrap` param - [x] Design-2 switch-in/switch-out descriptive wrapper (Web Appendix Section 1.6) - [x] HonestDiD (Rambachan-Roth 2023) integration on placebo + event study surface - [x] Survey design support: pweight with strata/PSU/FPC via Taylor Series Linearization, covering the main ATT surface, covariate adjustment (DID^X), heterogeneity testing, the TWFE diagnostic (fit and standalone `twowayfeweights()` helper), and HonestDiD bounds. Replicate weights and PSU-level bootstrap deferred. -- **Note:** Survey IF expansion (`psi_i = U[g] * (w_i / W_g)`) is a library extension not in the dCDH papers. The paper's plug-in variance assumes iid sampling; the TSL variance accounts for complex survey design by expanding group-level influence functions to observation level proportionally to survey weights, then applying the standard Binder (1983) stratified PSU variance formula. +- **Note:** Survey IF expansion (`psi_i = U[g] * (w_i / W_g)`) is a library extension not in the dCDH papers. The paper's plug-in variance assumes iid sampling; the TSL variance accounts for complex survey design by expanding group-level influence functions to observation level proportionally to survey weights, then applying the standard Binder (1983) stratified PSU variance formula. The expansion treats each group as the effective sampling unit; **strata and PSU must therefore be constant within group** (validated in `fit()` — designs with mixed strata or PSU labels within a single group raise `ValueError`). Within-group-varying **weights** are supported (each observation contributes proportionally). - **Note (survey + bootstrap fallback):** When `survey_design` and `n_bootstrap > 0` are both active, the multiplier bootstrap uses group-level Rademacher/Mammen/Webb weights rather than PSU-level resampling. A `UserWarning` is emitted from `fit()`. This is conservative when groups are finer than PSUs; a PSU-level survey bootstrap is deferred to a future release. For design-based analytical variance, the TSL path (non-bootstrap) is the recommended contract. --- diff --git a/tests/test_survey_dcdh.py b/tests/test_survey_dcdh.py index 0800a3bf..55bf6380 100644 --- a/tests/test_survey_dcdh.py +++ b/tests/test_survey_dcdh.py @@ -861,6 +861,26 @@ def test_zero_weight_row_with_nan_outcome(self, base_data): ) assert np.isfinite(result.overall_att) + def test_zero_weight_row_with_nan_group_id(self, base_data): + """A zero-weight row with NaN group id must not crash the SE + factorization. SurveyDesign.subpopulation() contract.""" + df_ = base_data.copy() + df_["pw"] = 1.0 + # Cast group to object to allow NaN without coercion errors + df_["group"] = df_["group"].astype(object) + sample = df_.iloc[0].copy() + sample["group"] = np.nan + sample["pw"] = 0.0 + df_ = pd.concat([df_, pd.DataFrame([sample])], ignore_index=True) + sd = SurveyDesign(weights="pw") + # Must succeed — zero-weight row's NaN group id is out-of-sample + result = ChaisemartinDHaultfoeuille(seed=1).fit( + df_, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, + ) + assert np.isfinite(result.overall_att) + def test_zero_weight_row_with_nan_control(self, base_data): """A zero-weight row with NaN in a control column must not abort the DID^X path, and the covariate cell aggregation must use only @@ -1010,3 +1030,78 @@ def test_survey_design2_runs(self): # switch_in and switch_out mean effects should be finite assert np.isfinite(r.design2_effects["switch_in"]["mean_effect"]) assert np.isfinite(r.design2_effects["switch_out"]["mean_effect"]) + + +# ── Test: Within-group constancy of strata and PSU ────────────────── + + +class TestSurveyWithinGroupValidation: + """Survey designs with strata or PSU varying within a single group + are rejected because the dCDH IF expansion treats the group as the + effective sampling unit.""" + + def test_rejects_varying_psu_within_group(self, base_data): + df_ = base_data.copy() + df_["pw"] = 1.0 + df_["stratum"] = 0 + # PSU varies within each group (alternates by period) + df_["psu"] = df_["period"] % 2 + sd = SurveyDesign(weights="pw", strata="stratum", psu="psu") + with pytest.raises(ValueError, match="PSU to be constant within group"): + ChaisemartinDHaultfoeuille(seed=1).fit( + df_, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, + ) + + def test_rejects_varying_strata_within_group(self, base_data): + df_ = base_data.copy() + df_["pw"] = 1.0 + # Stratum varies within each group + df_["stratum"] = df_["period"] % 2 + # Give each obs a unique PSU label so the SurveyDesign resolver + # doesn't reject on cross-stratum PSU reuse — we want our + # within-group strata check to fire first. + df_["psu"] = np.arange(len(df_)) + sd = SurveyDesign(weights="pw", strata="stratum", psu="psu") + with pytest.raises(ValueError, match="strata to be constant within group"): + ChaisemartinDHaultfoeuille(seed=1).fit( + df_, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, + ) + + def test_accepts_varying_weights_within_group(self, base_data): + """Within-group-varying pweights remain supported — the expansion + psi_i = U[g] * (w_i / W_g) handles obs-level weight variation.""" + df_ = base_data.copy() + rng = np.random.default_rng(7) + df_["pw"] = rng.uniform(0.5, 2.0, size=len(df_)) + sd = SurveyDesign(weights="pw") + result = ChaisemartinDHaultfoeuille(seed=1).fit( + df_, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, + ) + assert np.isfinite(result.overall_att) + + def test_rejection_excludes_zero_weight_rows(self, base_data): + """A zero-weight row with a different PSU from its group must + not trigger rejection — it is out-of-sample by the + subpopulation contract and does not enter the variance.""" + df_ = base_data.copy() + df_["pw"] = 1.0 + df_["stratum"] = 0 + df_["psu"] = 0 + # Inject a zero-weight row with a different PSU + sample = df_.iloc[0].copy() + sample["psu"] = 99 # would violate constancy if counted + sample["pw"] = 0.0 + df_ = pd.concat([df_, pd.DataFrame([sample])], ignore_index=True) + sd = SurveyDesign(weights="pw", strata="stratum", psu="psu") + result = ChaisemartinDHaultfoeuille(seed=1).fit( + df_, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, + ) + assert np.isfinite(result.overall_att) From bfee956bfb487837619a332338cbbf8b332faf7a Mon Sep 17 00:00:00 2001 From: igerber Date: Fri, 17 Apr 2026 17:17:54 -0400 Subject: [PATCH 14/16] Fix CI review R11: auto-inject psu=group when survey_design.psu is None MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dCDH methodology note says the group is the effective sampling unit for the TSL IF expansion psi_i = U[g] * (w_i / W_g). But when the user passed SurveyDesign(weights=...) with no explicit PSU, compute_survey_if_variance() defaulted to per-observation PSUs (survey.py:1201-1218), inflating the effective PSU count and df_survey and silently contradicting the documented contract. - fit() now auto-injects psu= whenever the resolved SurveyDesign has no PSU, rebuilds a replacement SurveyDesign preserving strata/FPC/weight_type/nest/lonely_psu, and re-resolves so every downstream consumer (_survey_se_from_group_if, the heterogeneity survey path, survey_metadata.df_survey → HonestDiD critical values) uses the per-group structure. Replicate-weight designs are skipped (they're rejected by the subsequent check) and zero-weight rows are filtered before re-resolution so NaN group IDs on excluded subpopulation rows don't block PSU resolution. - REGISTRY.md line-652 Note documents the auto-inject behavior and the user escape hatch (pass an explicit psu constant within group). - Updated test_uniform_survey_se_matches_plugin docstring to reflect the new default (no-PSU ≡ psu=group) instead of the prior "independent-obs" note. - Added 2 regressions: test_auto_inject_psu_matches_explicit_group_psu (invariance of SE/df_survey between no-psu and explicit psu=group) and test_off_horizon_row_duplication_does_not_change_se (pins the per-group sampling-unit invariant under row duplication). All 270 targeted tests pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/chaisemartin_dhaultfoeuille.py | 41 ++++++++++++++ docs/methodology/REGISTRY.md | 2 +- tests/test_survey_dcdh.py | 72 +++++++++++++++++++++++- 3 files changed, 111 insertions(+), 4 deletions(-) diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py index 80999bd3..957d3f88 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille.py +++ b/diff_diff/chaisemartin_dhaultfoeuille.py @@ -652,6 +652,47 @@ def fit( _resolve_survey_for_fit(survey_design, data, "analytical") ) + # dCDH contract: the group is the effective sampling unit for + # the TSL IF expansion psi_i = U[g] * (w_i / W_g). When the user + # passed a SurveyDesign without an explicit PSU, + # compute_survey_if_variance() would fall back to per-observation + # PSUs — which contradicts the per-group structure the IF + # expansion assumes and inflates df_survey. Auto-inject + # `psu=` and re-resolve so downstream variance, df_survey, + # and HonestDiD critical values match the documented contract. + # Strata / FPC / weight_type / nest are preserved. + # Skipped for replicate-weight designs — they're rejected below. + if ( + resolved_survey is not None + and resolved_survey.psu is None + and ( + resolved_survey.replicate_weights is None + or resolved_survey.replicate_weights.shape[1] == 0 + ) + ): + from diff_diff.survey import SurveyDesign as _SurveyDesign + + # Pre-filter zero-weight rows so NaN / invalid group IDs on + # excluded subpopulation rows don't block PSU resolution + # (group becomes the PSU column after auto-inject). Updates + # local bindings only; caller's DataFrame is untouched. + pos_mask_sv = np.asarray(survey_weights) > 0 + if not pos_mask_sv.all(): + data = data.loc[pos_mask_sv].reset_index(drop=True) + + eff_design = _SurveyDesign( + weights=survey_design.weights, + strata=survey_design.strata, + psu=group, + fpc=getattr(survey_design, "fpc", None), + weight_type=getattr(survey_design, "weight_type", "pweight"), + nest=getattr(survey_design, "nest", False), + lonely_psu=getattr(survey_design, "lonely_psu", "remove"), + ) + resolved_survey, survey_weights, _, survey_metadata = ( + _resolve_survey_for_fit(eff_design, data, "analytical") + ) + if resolved_survey is not None: if resolved_survey.weight_type != "pweight": raise ValueError( diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index f5eba94a..0bf3c5de 100644 --- a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -649,7 +649,7 @@ Alternative: Multiplier bootstrap clustered at group via the `n_bootstrap` param - [x] Design-2 switch-in/switch-out descriptive wrapper (Web Appendix Section 1.6) - [x] HonestDiD (Rambachan-Roth 2023) integration on placebo + event study surface - [x] Survey design support: pweight with strata/PSU/FPC via Taylor Series Linearization, covering the main ATT surface, covariate adjustment (DID^X), heterogeneity testing, the TWFE diagnostic (fit and standalone `twowayfeweights()` helper), and HonestDiD bounds. Replicate weights and PSU-level bootstrap deferred. -- **Note:** Survey IF expansion (`psi_i = U[g] * (w_i / W_g)`) is a library extension not in the dCDH papers. The paper's plug-in variance assumes iid sampling; the TSL variance accounts for complex survey design by expanding group-level influence functions to observation level proportionally to survey weights, then applying the standard Binder (1983) stratified PSU variance formula. The expansion treats each group as the effective sampling unit; **strata and PSU must therefore be constant within group** (validated in `fit()` — designs with mixed strata or PSU labels within a single group raise `ValueError`). Within-group-varying **weights** are supported (each observation contributes proportionally). +- **Note:** Survey IF expansion (`psi_i = U[g] * (w_i / W_g)`) is a library extension not in the dCDH papers. The paper's plug-in variance assumes iid sampling; the TSL variance accounts for complex survey design by expanding group-level influence functions to observation level proportionally to survey weights, then applying the standard Binder (1983) stratified PSU variance formula. The expansion treats each group as the effective sampling unit; **strata and PSU must therefore be constant within group** (validated in `fit()` — designs with mixed strata or PSU labels within a single group raise `ValueError`). When `survey_design.psu` is not specified, `fit()` auto-injects `psu=` so the TSL variance, `df_survey`, and t-based inference match the per-group structure the IF expansion assumes; users who want a coarser cluster can pass an explicit `psu` that is constant within group. Within-group-varying **weights** are supported (each observation contributes proportionally). - **Note (survey + bootstrap fallback):** When `survey_design` and `n_bootstrap > 0` are both active, the multiplier bootstrap uses group-level Rademacher/Mammen/Webb weights rather than PSU-level resampling. A `UserWarning` is emitted from `fit()`. This is conservative when groups are finer than PSUs; a PSU-level survey bootstrap is deferred to a future release. For design-based analytical variance, the TSL path (non-bootstrap) is the recommended contract. --- diff --git a/tests/test_survey_dcdh.py b/tests/test_survey_dcdh.py index 55bf6380..8698b06e 100644 --- a/tests/test_survey_dcdh.py +++ b/tests/test_survey_dcdh.py @@ -389,9 +389,11 @@ def test_uniform_survey_se_matches_plugin(self, base_data): """Pins the divisor normalization: uniform survey SE with group-level PSU clustering should be close to plug-in SE. - Without PSU clustering, survey treats each observation as independent - (N_obs observations), while plug-in treats each group as independent - (N_groups). Clustering at the group level aligns the two. + Under dCDH's auto-inject contract (see REGISTRY.md §dCDH survey), + SurveyDesign(weights='pw') with no explicit psu is equivalent to + SurveyDesign(weights='pw', psu='group'). Either form aligns the + effective sampling unit with the group-level IF and matches the + plug-in SE up to small-sample corrections. """ df = base_data.copy() df["pw"] = 1.0 @@ -1085,6 +1087,70 @@ def test_accepts_varying_weights_within_group(self, base_data): ) assert np.isfinite(result.overall_att) + def test_auto_inject_psu_matches_explicit_group_psu(self, base_data): + """SurveyDesign(weights='pw') (no PSU) must yield the same SE and + df_survey as SurveyDesign(weights='pw', psu='group') after + dCDH auto-injects psu=group.""" + df_ = base_data.copy() + df_["pw"] = 1.0 + r_no_psu = ChaisemartinDHaultfoeuille(seed=1).fit( + df_, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=SurveyDesign(weights="pw"), + ) + r_explicit = ChaisemartinDHaultfoeuille(seed=1).fit( + df_, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=SurveyDesign(weights="pw", psu="group"), + ) + assert r_no_psu.overall_att == pytest.approx( + r_explicit.overall_att, abs=1e-10 + ) + if np.isfinite(r_no_psu.overall_se) and np.isfinite(r_explicit.overall_se): + assert r_no_psu.overall_se == pytest.approx( + r_explicit.overall_se, rel=1e-6 + ) + assert r_no_psu.survey_metadata is not None + assert r_explicit.survey_metadata is not None + assert ( + r_no_psu.survey_metadata.df_survey + == r_explicit.survey_metadata.df_survey + ) + + def test_off_horizon_row_duplication_does_not_change_se(self, base_data): + """Under auto-injected psu=group, duplicating an observation + within a group (cell mean unchanged because the duplicate matches + the existing row exactly) must not change the SE. Under the old + per-obs-PSU fallback this invariant did not hold.""" + df_ = base_data.copy() + df_["pw"] = 1.0 + sd = SurveyDesign(weights="pw") + + r_base = ChaisemartinDHaultfoeuille(seed=1).fit( + df_, outcome="outcome", group="group", + time="period", treatment="treatment", + L_max=2, survey_design=sd, + ) + + # Duplicate the first row: cell mean y_gt stays the same + # (identical y/treatment). Under per-obs PSU fallback the + # "extra observation" would change the variance; under + # auto-inject psu=group the group structure is unchanged. + dup = df_.iloc[0].copy() + df_dup = pd.concat([df_, pd.DataFrame([dup])], ignore_index=True) + r_dup = ChaisemartinDHaultfoeuille(seed=1).fit( + df_dup, outcome="outcome", group="group", + time="period", treatment="treatment", + L_max=2, survey_design=sd, + ) + if np.isfinite(r_base.overall_se) and np.isfinite(r_dup.overall_se): + assert r_base.overall_se == pytest.approx( + r_dup.overall_se, rel=1e-6 + ), ( + f"Row duplication changed SE ({r_base.overall_se} vs " + f"{r_dup.overall_se}) — auto-inject psu=group is not active." + ) + def test_rejection_excludes_zero_weight_rows(self, base_data): """A zero-weight row with a different PSU from its group must not trigger rejection — it is out-of-sample by the From 657b62b7dd0910bbc5d4c0e7a1fdecec41c4ccc1 Mon Sep 17 00:00:00 2001 From: igerber Date: Fri, 17 Apr 2026 17:47:24 -0400 Subject: [PATCH 15/16] Fix CI review R12: preserve full survey design under auto-inject MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit R11's auto-inject of psu=group filtered `data` to positive-weight rows before re-resolving the effective SurveyDesign. That silently shrank `df_survey` on SurveyDesign.subpopulation() inputs without an explicit PSU — violating the documented subpopulation contract that keeps the full design intact so t critical values, p-values, CIs, and HonestDiD bounds match full-design expectations. Replace the pre-filter with a synthesized PSU column built on a private copy of `data`: - Valid group values flow through unchanged as the per-row PSU label. - NaN / invalid group values on zero-weight rows (the edge case that motivated the R11 filter) are replaced with a single shared dummy label so the PSU resolver accepts them. - Zero-weight rows contribute psi_i = 0 to the variance, but remain in the resolved design so n_psu / n_strata / df_survey reflect the full sample — matching the library's subpopulation contract. Added TestSurveyWithinGroupValidation.test_subpopulation_preserves_full_design_df_survey: zero-weights an entire group (mimicking SurveyDesign.subpopulation) and asserts that auto-inject df_survey equals the explicit psu='group' df_survey — the full-design reference. All 271 targeted tests pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/chaisemartin_dhaultfoeuille.py | 32 +++++++++++++------ tests/test_survey_dcdh.py | 40 ++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 9 deletions(-) diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py index 957d3f88..a6a46ef3 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille.py +++ b/diff_diff/chaisemartin_dhaultfoeuille.py @@ -672,25 +672,39 @@ def fit( ): from diff_diff.survey import SurveyDesign as _SurveyDesign - # Pre-filter zero-weight rows so NaN / invalid group IDs on - # excluded subpopulation rows don't block PSU resolution - # (group becomes the PSU column after auto-inject). Updates - # local bindings only; caller's DataFrame is untouched. - pos_mask_sv = np.asarray(survey_weights) > 0 - if not pos_mask_sv.all(): - data = data.loc[pos_mask_sv].reset_index(drop=True) + # Build a synthesized PSU column on a private copy of data + # so the caller's DataFrame is untouched. Valid group values + # flow through as their own PSU label; NaN/invalid group + # values on zero-weight rows (SurveyDesign.subpopulation() + # excluded rows) are replaced with a single shared dummy + # label so the PSU resolver accepts them. Zero-weight rows + # contribute psi_i = 0 to the variance; keeping them in the + # resolved design preserves the full-design df_survey + # contract (n_psu / n_strata reflect the full sample, not + # the positive-weight subset). + psu_col_name = "__dcdh_eff_psu__" + synth_data = data.copy() + synth_psu = synth_data[group].copy() + try: + invalid_mask = synth_psu.isna().to_numpy() + except (AttributeError, TypeError): + invalid_mask = np.zeros(len(synth_psu), dtype=bool) + if invalid_mask.any(): + synth_psu = synth_psu.astype(object) + synth_psu.loc[invalid_mask] = "__dcdh_excluded_null_psu__" + synth_data[psu_col_name] = synth_psu eff_design = _SurveyDesign( weights=survey_design.weights, strata=survey_design.strata, - psu=group, + psu=psu_col_name, fpc=getattr(survey_design, "fpc", None), weight_type=getattr(survey_design, "weight_type", "pweight"), nest=getattr(survey_design, "nest", False), lonely_psu=getattr(survey_design, "lonely_psu", "remove"), ) resolved_survey, survey_weights, _, survey_metadata = ( - _resolve_survey_for_fit(eff_design, data, "analytical") + _resolve_survey_for_fit(eff_design, synth_data, "analytical") ) if resolved_survey is not None: diff --git a/tests/test_survey_dcdh.py b/tests/test_survey_dcdh.py index 8698b06e..9fa1b512 100644 --- a/tests/test_survey_dcdh.py +++ b/tests/test_survey_dcdh.py @@ -1117,6 +1117,46 @@ def test_auto_inject_psu_matches_explicit_group_psu(self, base_data): == r_explicit.survey_metadata.df_survey ) + def test_subpopulation_preserves_full_design_df_survey(self, base_data): + """Under dCDH auto-inject, zero-weighting an entire group must not + shrink df_survey below what the full-design PSU count would give. + + Mirrors SurveyDesign.subpopulation() semantics where excluded + rows keep their weights at zero but remain in the design so + that t critical values, p-values, CIs, and HonestDiD bounds + reflect the full sampling structure.""" + df_ = base_data.copy() + df_["pw"] = 1.0 + # Mimic subpopulation() by zero-weighting one entire group + excluded_group = df_["group"].unique()[0] + df_.loc[df_["group"] == excluded_group, "pw"] = 0.0 + + sd = SurveyDesign(weights="pw") + r_subpop = ChaisemartinDHaultfoeuille(seed=1).fit( + df_, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, + ) + # Reference: explicit psu='group' preserves the full-design + # PSU count because the resolver sees all groups (even those + # entirely zero-weighted). The auto-inject path must match this. + r_explicit = ChaisemartinDHaultfoeuille(seed=1).fit( + df_, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=SurveyDesign(weights="pw", psu="group"), + ) + assert r_subpop.survey_metadata is not None + assert r_explicit.survey_metadata is not None + assert ( + r_subpop.survey_metadata.df_survey + == r_explicit.survey_metadata.df_survey + ), ( + f"Auto-inject df_survey={r_subpop.survey_metadata.df_survey} " + f"must match explicit psu='group' df_survey=" + f"{r_explicit.survey_metadata.df_survey} " + f"(full-design subpopulation contract)." + ) + def test_off_horizon_row_duplication_does_not_change_se(self, base_data): """Under auto-injected psu=group, duplicating an observation within a group (cell mean unchanged because the duplicate matches From e82faabe5ff0202164071906ebd3ce082ab32bc7 Mon Sep 17 00:00:00 2001 From: igerber Date: Fri, 17 Apr 2026 18:04:59 -0400 Subject: [PATCH 16/16] Fix CI review R13: preserve NaN-SE contract on degenerate-cohort survey fits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _plugin_se returns NaN when the cohort-recentered IF is empty or identically zero (documented degenerate-cohort contract — every variance-eligible group is its own (D_{g,1}, F_g, S_g) singleton). _survey_se_from_group_if only rejected negative variance, so on the same panel it computed sqrt(0) = 0.0, suppressed the degenerate-cohort warning (gated on np.isnan(overall_se)), and exposed a false zero SE. The bug affected every surface routed through _compute_se — top-level ATT, joiners/leavers, multi-horizon ATT, placebos, and derived normalized/cumulated SEs. Mirror the _plugin_se contract: short-circuit to NaN when U_centered is empty or sum(U_centered**2) <= 0, before delegating to compute_survey_if_variance. Added TestSurveyWithinGroupValidation.test_degenerate_cohort_survey_se_is_nan: 4 groups × 5 periods, each switching at a unique F_g so every cohort is a singleton; asserts overall_se is NaN (not 0.0) and that the degenerate-cohort warning fires under the survey path. All 272 targeted tests pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/chaisemartin_dhaultfoeuille.py | 12 +++++++ tests/test_survey_dcdh.py | 46 ++++++++++++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py index a6a46ef3..9e8b9818 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille.py +++ b/diff_diff/chaisemartin_dhaultfoeuille.py @@ -4874,6 +4874,18 @@ def _survey_se_from_group_if( """ from diff_diff.survey import compute_survey_if_variance + # Degenerate-cohort contract (mirror _plugin_se): when the centered + # IF is empty or every cohort is a singleton (→ recentered IF is + # identically zero), the variance is unidentified. Return NaN + # rather than sqrt(0)=0 so the fit-time warning fires and + # inference stays NaN-consistent across every surface routed + # through _compute_se (overall, joiners/leavers, multi-horizon + # ATT, placebos, normalized/cumulated, heterogeneity). + if U_centered.size == 0: + return float("nan") + if float((U_centered ** 2).sum()) <= 0: + return float("nan") + group_ids = obs_survey_info["group_ids"] weights = obs_survey_info["weights"] resolved = obs_survey_info["resolved"] diff --git a/tests/test_survey_dcdh.py b/tests/test_survey_dcdh.py index 9fa1b512..c2c7c0f0 100644 --- a/tests/test_survey_dcdh.py +++ b/tests/test_survey_dcdh.py @@ -1117,6 +1117,52 @@ def test_auto_inject_psu_matches_explicit_group_psu(self, base_data): == r_explicit.survey_metadata.df_survey ) + def test_degenerate_cohort_survey_se_is_nan(self): + """When every variance-eligible group is its own singleton + cohort (D_{g,1}, F_g, S_g), the cohort-recentered IF is + identically zero. The survey SE path must return NaN (not 0.0) + so the degenerate-cohort warning fires and inference stays + NaN-consistent — matching the _plugin_se contract documented + in REGISTRY.md.""" + # 4 groups × 5 periods, each group switches at a unique F_g so + # the (D_{g,1}=0, F_g, S_g=+1) cohort key is unique per group. + rows = [] + for g, f_switch in enumerate([1, 2, 3, 4]): + for t in range(5): + d = 1 if t >= f_switch else 0 + y = float(g) + 0.5 * t + float(d) + rows.append({ + "group": g, + "period": t, + "treatment": d, + "outcome": y, + "pw": 1.0, + }) + df_ = pd.DataFrame(rows) + sd = SurveyDesign(weights="pw") + + import warnings as _warnings + with _warnings.catch_warnings(record=True) as w: + _warnings.simplefilter("always") + result = ChaisemartinDHaultfoeuille(seed=1).fit( + df_, + outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, + ) + + # overall_se must be NaN on degenerate cohorts (not 0.0) + assert np.isnan(result.overall_se), ( + f"Degenerate-cohort survey overall_se must be NaN, " + f"got {result.overall_se}" + ) + # Degenerate-cohort warning must fire + assert any( + "cohort" in str(wi.message).lower() + and "identically zero" in str(wi.message).lower() + for wi in w + ), "Expected degenerate-cohort warning to fire under survey path" + def test_subpopulation_preserves_full_design_df_survey(self, base_data): """Under dCDH auto-inject, zero-weighting an entire group must not shrink df_survey below what the full-design PSU count would give.