Source code for diff_diff.business_report

"""
BusinessReport — plain-English stakeholder narrative from any diff-diff result.

Wraps any of the 16 fitted result types and produces:

- ``summary()``: a short paragraph block suitable for an email or Slack message.
- ``full_report()``: a multi-section markdown report with headline, assumptions,
  pre-trends, main result, robustness, sample, and an optional academic appendix.
- ``to_dict()``: a stable AI-legible structured schema (single source of truth —
  prose is rendered from this dict, not templated alongside it).

Design principles:

- Plain English, not academic jargon. The library ships this in addition to, not
  in place of, the estimator's existing ``results.summary()`` academic output.
- No estimator fitting and no variance re-derivation. Every effect, SE, p-value,
  CI, and sensitivity bound is either read from ``results`` or produced by an
  existing diff-diff utility. The report layer does compose a few cross-period
  summaries from per-period inputs already on the result (joint-Wald / Bonferroni
  pre-trends p-value, MDV-to-ATT ratio, heterogeneity dispersion over
  post-treatment effects); see ``docs/methodology/REPORTING.md`` for the full
  enumeration.
- Optional business context via keyword args (``outcome_label``, ``outcome_unit``,
  ``business_question``, ``treatment_label``). Without them, BusinessReport uses
  generic fallbacks — the zero-config path works.
- Diagnostic integration is implicit by default: ``BusinessReport(results)``
  auto-constructs a ``DiagnosticReport`` so the summary can mention pre-trends,
  robustness, and design-effect findings. Pass ``auto_diagnostics=False`` or an
  explicit ``diagnostics=`` object to override.

Methodology deviations (no traffic-light gates, pre-trends verdict thresholds,
power-aware phrasing, unit-translation policy, schema stability) are documented
in ``docs/methodology/REPORTING.md``. The ``to_dict()`` schema is marked
experimental in v3.2.
"""

from __future__ import annotations

import re
from dataclasses import dataclass
from typing import Any, Dict, FrozenSet, List, Optional, Union

import numpy as np

from diff_diff._reporting_helpers import describe_target_parameter
from diff_diff.diagnostic_report import DiagnosticReport, DiagnosticReportResults

BUSINESS_REPORT_SCHEMA_VERSION = "2.0"

__all__ = [
    "BusinessReport",
    "BusinessContext",
    "BUSINESS_REPORT_SCHEMA_VERSION",
]

# Recognized ``outcome_unit`` values mapped to a coarse "kind" used by the
# formatter. Unrecognized strings are accepted and rendered verbatim without
# arithmetic translation (``unit_kind = "unknown"``).
_UNIT_KINDS: Dict[str, str] = {
    "$": "currency",
    "usd": "currency",
    "%": "percent",
    "pp": "percentage_points",
    "percentage_points": "percentage_points",
    "percent": "percent",
    "log_points": "log_points",
    "log": "log_points",
    "count": "count",
    "users": "count",
}


[docs] @dataclass(frozen=True) class BusinessContext: """Frozen bundle of business-framing metadata used when rendering prose. Populated from ``BusinessReport`` constructor kwargs. Falls back to neutral labels when fields are not supplied. """ outcome_label: str outcome_unit: Optional[str] outcome_direction: Optional[str] business_question: Optional[str] treatment_label: str alpha: float
[docs] class BusinessReport: """Produce a stakeholder-ready narrative from any diff-diff results object. Parameters ---------- results : Any A fitted diff-diff results object. Any of the 16 result types is accepted. ``BaconDecompositionResults`` is not a valid input — Bacon is a diagnostic, not an estimator; use ``DiagnosticReport`` for that. outcome_label : str, optional Stakeholder-friendly outcome name (e.g. ``"Revenue per user"``). outcome_unit : str, optional Unit label: ``"$"`` / ``"%"`` / ``"pp"`` / ``"log_points"`` / ``"count"`` (recognized for formatting) or any free-form string (used verbatim without arithmetic translation). outcome_direction : str, optional ``"higher_is_better"`` or ``"lower_is_better"``. Drives whether the effect is described as "lift" / "drag" rather than just "increase" / "decrease". business_question : str, optional Question the analysis answers (prepended to the summary). treatment_label : str, optional Stakeholder-friendly treatment name (e.g. ``"the campaign"``). alpha : float, optional Significance level. Defaults to ``results.alpha`` when not supplied. Single knob: drives both CI level and significance phrasing. honest_did_results : HonestDiDResults or SensitivityResults, optional Pre-computed sensitivity result. When supplied, this is forwarded to the internal ``DiagnosticReport`` so sensitivity is not re-computed. auto_diagnostics : bool, default True When ``True`` and ``diagnostics`` is ``None``, auto-construct a ``DiagnosticReport``. Set ``False`` to skip diagnostics entirely. diagnostics : DiagnosticReport or DiagnosticReportResults, optional Explicit diagnostics object. Takes precedence over ``auto_diagnostics``. include_appendix : bool, default True Whether ``full_report()`` appends the estimator's academic ``results.summary()`` output under a "Technical Appendix" section. data, outcome, treatment, unit, time, first_treat : optional Raw panel + column names forwarded to the auto-constructed ``DiagnosticReport`` so data-dependent checks (2x2 PT on simple DiD, Bacon-from-scratch, EfficientDiD Hausman pretest) can run. survey_design : SurveyDesign, optional The ``SurveyDesign`` object used to fit a survey-weighted estimator. Forwarded to the auto-constructed ``DiagnosticReport`` for fit-faithful Goodman-Bacon replay. When the fit carries ``survey_metadata`` but ``survey_design`` is not supplied, Bacon is skipped with an explicit reason rather than replaying an unweighted decomposition for a design that does not match the estimate. The simple 2x2 parallel-trends helper (``utils.check_parallel_trends``) has no survey-aware variant; on a survey-backed ``DiDResults`` it is skipped unconditionally regardless of ``survey_design``. Supply ``precomputed={'parallel_trends': ...}`` with a survey-aware pretest to opt in. See ``docs/methodology/REPORTING.md``. precomputed : dict, optional Pre-computed diagnostic objects forwarded to the auto- constructed ``DiagnosticReport`` (same keys as ``DiagnosticReport(precomputed=...)``): ``"parallel_trends"``, ``"sensitivity"``, ``"pretrends_power"``, ``"bacon"``. DR validates keys and rejects estimator-incompatible entries (e.g., HonestDiD bounds or generic PT on SDiD / TROP). ``honest_did_results`` remains a shorthand for ``sensitivity``; an explicit ``precomputed['sensitivity']`` wins on conflict. """
[docs] def __init__( self, results: Any, *, outcome_label: Optional[str] = None, outcome_unit: Optional[str] = None, outcome_direction: Optional[str] = None, business_question: Optional[str] = None, treatment_label: Optional[str] = None, alpha: Optional[float] = None, honest_did_results: Optional[Any] = None, auto_diagnostics: bool = True, diagnostics: Optional[Union[DiagnosticReport, DiagnosticReportResults]] = None, include_appendix: bool = True, data: Optional[Any] = None, outcome: Optional[str] = None, treatment: Optional[str] = None, unit: Optional[str] = None, time: Optional[str] = None, first_treat: Optional[str] = None, survey_design: Optional[Any] = None, precomputed: Optional[Dict[str, Any]] = None, ): if type(results).__name__ == "BaconDecompositionResults": raise TypeError( "BaconDecompositionResults is a diagnostic, not an estimator; " "wrap the underlying estimator with BusinessReport and pass the " "Bacon object to DiagnosticReport(precomputed={'bacon': ...})." ) if diagnostics is not None and not isinstance( diagnostics, (DiagnosticReport, DiagnosticReportResults) ): raise TypeError( "diagnostics= must be a DiagnosticReport or " "DiagnosticReportResults instance; " f"got {type(diagnostics).__name__}." ) # Estimator-aware validation for ``honest_did_results``. SDiD / # TROP route robustness to ``estimator_native_diagnostics`` # (SDiD: ``in_time_placebo``, ``sensitivity_to_zeta_omega``; # TROP: factor-model fit metrics) and do not accept HonestDiD # bounds because they are methodology-incompatible with the # documented native-routing contract in REPORTING.md. Reject # the passthrough here so it doesn't silently forward to the # auto-constructed ``DiagnosticReport`` (which now also # rejects it at construction time — round-21 P1 CI review on # PR #318). if honest_did_results is not None and type(results).__name__ in { "SyntheticDiDResults", "TROPResults", "SyntheticControlResults", }: raise ValueError( f"{type(results).__name__} routes robustness to " "``estimator_native_diagnostics`` — ``honest_did_results`` " "is not accepted on this estimator because HonestDiD " "bounds are methodology-incompatible with the native " "routing documented in REPORTING.md. Use the result " "object's native diagnostics " "(SDiD: ``in_time_placebo()``, ``sensitivity_to_zeta_omega()``, " "``pre_treatment_fit``; TROP: ``effective_rank``, " "``loocv_score``; SyntheticControl: ``in_space_placebo()``, " "``pre_rmspe``, ``get_placebo_df()``) — BusinessReport surfaces " "these automatically under ``estimator_native_diagnostics``." ) # Round-44 P1 CI review on PR #318: mirror the SDiD/TROP # rejection pattern for ``CallawaySantAnna`` fits with # ``base_period != "universal"``. HonestDiD Rambachan-Roth # bounds are not valid for interpretation on the consecutive- # comparison pre-period surface produced by ``varying`` base, # so narrating precomputed sensitivity (whether passed as # ``honest_did_results`` or ``precomputed['sensitivity']``) # alongside a displayed varying-base fit mixes provenance the # bounds don't support. DR enforces the same guard at # construction; BR duplicates the check so the error fires # before the auto-DR is built, matching the existing # SDiD/TROP UX. REGISTRY.md §CallawaySantAnna line 410, # §HonestDiD line 2458. _cs_with_varying_base = type(results).__name__ == "CallawaySantAnnaResults" and ( getattr(results, "base_period", "universal") != "universal" ) if _cs_with_varying_base: _rejected_inputs: List[str] = [] if honest_did_results is not None: _rejected_inputs.append("honest_did_results") if precomputed is not None and "sensitivity" in precomputed: _rejected_inputs.append("precomputed['sensitivity']") if _rejected_inputs: _base_period = getattr(results, "base_period", "universal") raise ValueError( f"CallawaySantAnnaResults with " f"``base_period={_base_period!r}`` cannot be " "summarized alongside a precomputed HonestDiD " "sensitivity object. The Rambachan-Roth bounds are " "not valid for interpretation on the consecutive-" "comparison pre-period surface this base yields " "(REGISTRY.md §CallawaySantAnna / §HonestDiD). " "Rejected inputs: " + ", ".join(_rejected_inputs) + ". " "Re-fit the main estimator with " "``CallawaySantAnna(base_period='universal')`` " "before passing precomputed sensitivity, or drop " "the sensitivity passthrough to let BR skip the " "section with a methodology-critical reason." ) self._results = results self._honest_did_results = honest_did_results self._auto_diagnostics = auto_diagnostics self._diagnostics_arg = diagnostics self._include_appendix = include_appendix # Raw-data passthrough so the auto-constructed DR can run # data-dependent checks (2x2 PT on simple DiD, Bacon-from- # scratch on staggered estimators, EfficientDiD Hausman # pretest). Without these, the auto path silently skips those # checks (round-12 CI review on PR #318). self._dr_data = data self._dr_outcome = outcome self._dr_treatment = treatment self._dr_unit = unit self._dr_time = time self._dr_first_treat = first_treat # Round-40 P1 CI review on PR #318: survey-backed fits need # the ``SurveyDesign`` threaded through to the auto-constructed # DR so Bacon decomposition is fit-faithful and the 2x2 PT # skip path triggers for DiDResults with ``survey_metadata``. # Without this passthrough, the auto path silently replays an # unweighted decomposition / PT verdict for a weighted fit. self._dr_survey_design = survey_design # Round-43 P2 CI review on PR #318: BR docs and docstrings # advertised a ``precomputed={'parallel_trends': ...}`` opt-in # for survey-aware 2x2 PT and other escape hatches, but BR did # not actually accept a ``precomputed=`` kwarg — the auto path # only synthesized ``{"sensitivity": honest_did_results}``, so # callers following the BR docs hit a ``TypeError`` on # ``__init__``. Accept the passthrough here and forward every # key to the auto-constructed DR (which owns validation against # its implemented-key set and estimator-aware rejection rules). # ``honest_did_results`` still feeds into ``sensitivity`` as a # convenience; an explicit ``precomputed['sensitivity']`` wins # on conflict. self._dr_precomputed: Dict[str, Any] = dict(precomputed or {}) # Round-43 P2 CI review on PR #318: mirror DR's eager key # validation so users get the "unsupported key" error at BR # construction rather than lazily when the DR is built inside # ``to_dict()``. Kept in sync with ``DiagnosticReport``'s # ``_supported_precomputed`` set; the cheapest way to avoid # drift would be to import the set, but DR currently scopes it # locally to ``__init__`` so mirror the literal here with a # pointer comment. _br_supported_precomputed = { "parallel_trends", "sensitivity", "pretrends_power", "bacon", } _br_unsupported = set(self._dr_precomputed) - _br_supported_precomputed if _br_unsupported: raise ValueError( "precomputed= contains keys that are not implemented: " f"{sorted(_br_unsupported)}. Supported keys: " f"{sorted(_br_supported_precomputed)}. ``design_effect``, " "``heterogeneity``, and ``epv`` are read directly from the " "fitted result and do not accept precomputed overrides." ) resolved_alpha = alpha if alpha is not None else getattr(results, "alpha", 0.05) self._context = BusinessContext( outcome_label=outcome_label or "the outcome", outcome_unit=outcome_unit, outcome_direction=outcome_direction, business_question=business_question, treatment_label=treatment_label or "the treatment", alpha=float(resolved_alpha), ) self._cached_schema: Optional[Dict[str, Any]] = None
# -- Public API ---------------------------------------------------------
[docs] def to_dict(self) -> Dict[str, Any]: """Return the AI-legible structured schema (single source of truth).""" if self._cached_schema is None: self._cached_schema = self._build_schema() return self._cached_schema
[docs] def to_json(self, *, indent: int = 2) -> str: """Return ``to_dict()`` serialized as JSON.""" import json return json.dumps(self.to_dict(), indent=indent)
[docs] def summary(self) -> str: """Return a short plain-English paragraph block (6-10 sentences).""" return _render_summary(self.to_dict())
[docs] def full_report(self) -> str: """Return a structured multi-section markdown report.""" base = _render_full_report(self.to_dict()) if self._include_appendix: try: appendix = self._results.summary() except Exception: # noqa: BLE001 appendix = None if appendix: base = base + "\n\n## Technical Appendix\n\n```\n" + str(appendix) + "\n```\n" return base
[docs] def export_markdown(self) -> str: """Alias for ``full_report()`` (discoverability).""" return self.full_report()
[docs] def headline(self) -> str: """Return just the headline sentence.""" return _render_headline_sentence(self.to_dict())
[docs] def caveats(self) -> List[Dict[str, str]]: """Return the list of structured caveats (severity + topic + message).""" return list(self.to_dict().get("caveats", []))
def __repr__(self) -> str: estimator = type(self._results).__name__ headline = self.to_dict().get("headline") or {} val = headline.get("effect") if isinstance(val, (int, float)) and np.isfinite(val): return f"BusinessReport(results={estimator}, effect={val:.3g})" return f"BusinessReport(results={estimator})" def __str__(self) -> str: return self.summary() # -- Implementation detail --------------------------------------------- def _resolve_diagnostics(self) -> Optional[DiagnosticReportResults]: """Return the DiagnosticReportResults to embed, or ``None`` if skipped.""" if self._diagnostics_arg is not None: if isinstance(self._diagnostics_arg, DiagnosticReportResults): return self._diagnostics_arg if isinstance(self._diagnostics_arg, DiagnosticReport): return self._diagnostics_arg.run_all() raise TypeError("diagnostics= must be a DiagnosticReport or DiagnosticReportResults") if not self._auto_diagnostics: return None # Round-43 P2 CI review on PR #318: forward the user's # ``precomputed`` dict through to DR. ``honest_did_results`` # stays a convenience shortcut for ``sensitivity`` only; an # explicit ``precomputed['sensitivity']`` from the caller # wins. DR handles key validation (rejects unsupported keys # and estimator-incompatible sensitivities / parallel_trends # entries) so BR just merges and forwards. precomputed: Dict[str, Any] = dict(self._dr_precomputed) if self._honest_did_results is not None: precomputed.setdefault("sensitivity", self._honest_did_results) dr = DiagnosticReport( self._results, alpha=self._context.alpha, precomputed=precomputed or None, outcome_label=self._context.outcome_label, treatment_label=self._context.treatment_label, data=self._dr_data, outcome=self._dr_outcome, treatment=self._dr_treatment, unit=self._dr_unit, time=self._dr_time, first_treat=self._dr_first_treat, survey_design=self._dr_survey_design, ) return dr.run_all() def _build_schema(self) -> Dict[str, Any]: """Assemble the structured schema. Pulls validation content (PT, sensitivity, Bacon, DEFF, EPV, ...) from the internal ``DiagnosticReport``; extracts the stakeholder-facing headline and sample metadata from the fitted result itself. """ estimator_name = type(self._results).__name__ diagnostics_results = self._resolve_diagnostics() dr_schema: Optional[Dict[str, Any]] = ( diagnostics_results.schema if diagnostics_results is not None else None ) # PR #347 R4 P1: compute target_parameter BEFORE extracting # the headline so the no-scalar-by-design case # (``aggregation == "no_scalar_headline"``, e.g., dCDH # ``trends_linear=True`` with ``L_max >= 2``) can route the # headline through a dedicated branch that names the intentional # NaN rather than an estimation-failure path. target_parameter = describe_target_parameter(self._results) if target_parameter.get("aggregation") == "no_scalar_headline": # PR #347 R12 P1: the no-scalar ``reason`` must distinguish # the populated-surface case (per-horizon table exists) from # the empty-surface subcase (``linear_trends_effects=None`` # — no horizons survived estimation). Telling a user with # an empty surface to "see linear_trends_effects" is # dead-end guidance. _surface_empty = getattr(self._results, "linear_trends_effects", None) is None # PR #347 R14 P1: the empty-surface reason must use the # covariate-adjusted label when covariates are active. _has_controls = getattr(self._results, "covariate_residuals", None) is not None _empty_surface_label = "DID^{X,fd}_l" if _has_controls else "DID^{fd}_l" if _surface_empty: no_scalar_reason = ( "The fitted estimator intentionally does not produce a " "scalar overall ATT on this configuration " "(``trends_linear=True`` with ``L_max >= 2``), and on " f"this fit no cumulated level effects ``{_empty_surface_label}`` " "survived estimation — the per-horizon surface is " "empty. Re-fit with a larger ``L_max`` or with " "``trends_linear=False`` if you need a reportable " "estimand." ) else: no_scalar_reason = ( "The fitted estimator intentionally does not produce a " "scalar overall ATT on this configuration " "(``trends_linear=True`` with ``L_max >= 2``). Per-horizon " "cumulated level effects are on " "``results.linear_trends_effects[l]``." ) headline = { "status": "no_scalar_by_design", "effect": None, "se": None, "ci_lower": None, "ci_upper": None, "alpha_was_honored": True, "alpha_override_caveat": None, "ci_level": int(round((1.0 - self._context.alpha) * 100)), "p_value": None, "is_significant": False, "near_significance_threshold": False, "unit": self._context.outcome_unit, "unit_kind": _UNIT_KINDS.get( self._context.outcome_unit.lower() if self._context.outcome_unit else "", "unknown", ), "sign": "none", "breakdown_M": None, "reason": no_scalar_reason, } else: headline = self._extract_headline(dr_schema) sample = self._extract_sample() heterogeneity = _lift_heterogeneity(dr_schema) pre_trends = _lift_pre_trends(dr_schema) sensitivity = _lift_sensitivity(dr_schema) robustness = _lift_robustness(dr_schema) assumption = _apply_anticipation_to_assumption( _describe_assumption(estimator_name, self._results), self._results, ) next_steps = (dr_schema or {}).get("next_steps", []) caveats = _build_caveats(self._results, headline, sample, dr_schema) references = _references_for(estimator_name) if diagnostics_results is None: diagnostics_block: Dict[str, Any] = { "status": "skipped", "reason": "auto_diagnostics=False", } else: diagnostics_block = { "status": "ran", "schema": dr_schema, "overall_interpretation": ( dr_schema.get("overall_interpretation", "") if dr_schema is not None else "" ), } return { "schema_version": BUSINESS_REPORT_SCHEMA_VERSION, "estimator": { "class_name": estimator_name, "display_name": estimator_name, }, "context": { "outcome_label": self._context.outcome_label, "outcome_unit": self._context.outcome_unit, "outcome_direction": self._context.outcome_direction, "business_question": self._context.business_question, "treatment_label": self._context.treatment_label, "alpha": self._context.alpha, }, "headline": headline, "target_parameter": target_parameter, "assumption": assumption, "pre_trends": pre_trends, "sensitivity": sensitivity, "sample": sample, "heterogeneity": heterogeneity, "robustness": robustness, "diagnostics": diagnostics_block, "next_steps": next_steps, "caveats": caveats, "references": references, } def _extract_headline(self, dr_schema: Optional[Dict[str, Any]]) -> Dict[str, Any]: """Extract the headline effect + CI + p-value from the result.""" r = self._results # Delegate the attribute-alias lookup to the shared helper in the # diagnostic_report module so BR and DR agree on which fields a # result class exposes for its headline (including # ``ContinuousDiDResults`` which uses ``overall_att_se`` / # ``overall_att_p_value`` / ``overall_att_conf_int``). from diff_diff.diagnostic_report import _extract_scalar_headline extracted = _extract_scalar_headline(r, fallback_alpha=self._context.alpha) att: Optional[float] = None se: Optional[float] = None p: Optional[float] = None ci: Optional[List[float]] = None alpha = self._context.alpha result_alpha: Optional[float] = None if extracted is not None: _name, att, se, p, ci, result_alpha = extracted # On any alpha mismatch, preserve the fitted CI at its native # level. A faithful CI cannot be recomputed from point estimate # and SE alone without reproducing the fit's inference contract # (finite-df t-quantile, percentile bootstrap, wild cluster # bootstrap, survey replicate quantile, rank-deficient # undefined-df, etc.), and the 16 result classes do not expose # a uniform descriptor for that. Two separate alpha values: # ``display_alpha`` drives ``ci_level`` so the displayed CI # label matches the preserved bounds; the caller's requested # alpha drives the significance phrasing (``is_significant`` / # ``near_threshold``). A caveat records the override. display_alpha = alpha phrasing_alpha = alpha alpha_was_honored = True alpha_override_caveat: Optional[str] = None if ( result_alpha is not None and not np.isclose(alpha, result_alpha) and att is not None and se is not None ): inference_method = getattr(r, "inference_method", "analytical") if inference_method == "wild_bootstrap": inference_label = "wild cluster bootstrap" elif ( inference_method == "bootstrap" or getattr(r, "bootstrap_results", None) is not None ): inference_label = "bootstrap" elif getattr(r, "bootstrap_distribution", None) is not None: inference_label = "bootstrap" elif getattr(r, "variance_method", None) in {"bootstrap", "jackknife", "placebo"}: variance_method = getattr(r, "variance_method", None) inference_label = f"{variance_method} variance" else: df_survey = getattr( r, "df_survey", getattr(getattr(r, "survey_metadata", None), "df_survey", None), ) if isinstance(df_survey, (int, float)) and df_survey > 0: inference_label = "finite-df survey" elif isinstance(df_survey, (int, float)) and df_survey == 0: # Rank-deficient replicate design: the fit deliberately # left inference undefined. Preserve (NaN bounds remain NaN). inference_label = "undefined-df (replicate-weight)" else: # Ordinary analytical fit with a finite but unexposed # ``df`` (``DifferenceInDifferences`` / ``MultiPeriodDiD`` # / most staggered estimators / TROP). We cannot # reproduce the t-quantile without the fit's ``df``. inference_label = "analytical (native degrees of freedom)" display_alpha = float(result_alpha) alpha_was_honored = False alpha_override_caveat = ( f"Requested alpha ({phrasing_alpha:.2f}) was not honored " f"for the confidence interval because this fit uses " f"{inference_label} inference; the displayed CI remains " f"at the fit's native level " f"({int(round((1.0 - result_alpha) * 100))}%). The " f"significance phrasing still uses the requested alpha." ) unit = self._context.outcome_unit unit_kind = _UNIT_KINDS.get(unit.lower() if unit else "", "unknown") sign = ( "positive" if (att is not None and att > 0) else ( "negative" if (att is not None and att < 0) else ("null" if att == 0 else "undefined") ) ) if att is None or not np.isfinite(att): sign = "undefined" ci_level = int(round((1.0 - display_alpha) * 100)) # bool(...) coerces away numpy bool_ — when ``p`` is a numpy NaN (e.g. # SyntheticControl, whose analytical p_value is always NaN), ``np.isfinite`` # yields a numpy bool that is NOT JSON-serializable in the schema. is_significant = bool( p is not None and np.isfinite(p) and p < phrasing_alpha if p is not None else False ) near_threshold = bool( p is not None and np.isfinite(p) and (phrasing_alpha - 0.01) < p < (phrasing_alpha + 0.001) ) # Use DR-computed breakdown_M if available for quick reference. breakdown_M: Optional[float] = None if dr_schema: sens_section = dr_schema.get("sensitivity") or {} if sens_section.get("status") == "ran": breakdown_M = sens_section.get("breakdown_M") return { "effect": att, "se": se, "ci_lower": ci[0] if ci else None, "ci_upper": ci[1] if ci else None, "alpha_was_honored": alpha_was_honored, "alpha_override_caveat": alpha_override_caveat, "ci_level": ci_level, "p_value": p, "is_significant": is_significant, "near_significance_threshold": near_threshold, "unit": unit, "unit_kind": unit_kind, "sign": sign, "breakdown_M": breakdown_M, } def _extract_sample(self) -> Dict[str, Any]: """Extract sample metadata from the fitted result.""" r = self._results survey = self._extract_survey_block() n_treated = _safe_int(getattr(r, "n_treated", getattr(r, "n_treated_units", None))) n_control_units = _safe_int(getattr(r, "n_control", getattr(r, "n_control_units", None))) # Control-group semantics. For estimators that expose a # ``control_group`` kwarg (CS, EfficientDiD, ContinuousDiD, # StaggeredTripleDiff, ...), the meaning of ``n_control_units`` # depends on it. When the mode is "not-yet-treated" (dynamic # comparison set), the fixed tally stored on the result is only # the fully-untreated subset — the actual comparison set varies # by (g, t) cell. Label the exposed count accordingly so prose # surfaces the dynamic context instead of misreporting # "0 control" (round-13 / round-17 / round-18 CI review). # # Canonicalize both ``"not_yet_treated"`` (CS / EfficientDiD / # ContinuousDiD / Wooldridge) and ``"notyettreated"`` # (StaggeredTripleDiff) as the same dynamic mode. # # Per-estimator fixed-subset field: # * CS / SA / Imputation / TwoStage / EfficientDiD / # dCDH / ContinuousDiD — ``n_control_units`` is the # never-treated tally; surface as ``n_never_treated``. # * StaggeredTripleDiff — ``n_control_units`` is a composite # total; the fixed subset is ``n_never_enabled`` (stored # separately on the result). # * Wooldridge — ``n_control_units`` is total eligible # comparisons (never-treated + future-treated) and does not # map to a never-treated count. Keep on the fixed-count # path even in dynamic mode. # * Stacked — ``n_control_units`` is "distinct control units # across the trimmed set" (stacked_did_results.py L59-62). # Under ``clean_control="not_yet_treated"``, the trimmed # set uses the rule ``A_s > a + kappa_post`` which admits # future-treated controls; it is NOT a never-treated tally # and cannot be relabeled as ``n_never_treated``. Keep # Stacked on the fixed-count path (round-21 P1 CI review # on PR #318 flagged the earlier relabeling as a # semantic-contract violation). control_group = _control_group_choice(r) name = type(r).__name__ n_never_treated: Optional[int] = None n_never_enabled: Optional[int] = None n_control: Optional[int] = n_control_units _never_treated_count_contract = name in { "CallawaySantAnnaResults", "SunAbrahamResults", "ImputationDiDResults", "TwoStageDiDResults", "EfficientDiDResults", "ChaisemartinDHaultfoeuilleResults", "ContinuousDiDResults", } _canonical_control = ( control_group.replace("_", "").lower() if isinstance(control_group, str) else None ) # Stacked has two dynamic (sub-experiment-specific) modes: # ``not_yet_treated`` (A_s > a + kappa_post) and ``strict`` # (A_s > a + kappa_post + kappa_pre). Only ``never_treated`` # (A_s = infinity) is a fixed never-treated pool. Round-22 P1 # CI review on PR #318 flagged that ``strict`` was being # misrendered as a fixed control design. is_stacked_dynamic = name == "StackedDiDResults" and _canonical_control in { "notyettreated", "strict", } is_dynamic_control = _canonical_control == "notyettreated" or is_stacked_dynamic # StaggeredTripleDiff comparison-group contract: # ``n_control_units`` is a composite total that also includes # the eligibility-denied / larger-cohort cells. Regardless of # the ``control_group`` mode the valid fixed comparison is the # never-enabled cohort (``staggered_triple_diff.py:384``, # REGISTRY.md §StaggeredTripleDifference line 1730). Round-37 # P1 CI review on PR #318: under ``control_group="never_treated"`` # (i.e., ``_canonical_control == "nevertreated"``) the composite # total was being narrated as "control". Surface # ``n_never_enabled`` instead on both the ``nevertreated`` and # the dynamic ``notyettreated`` modes. if name == "StaggeredTripleDiffResults" and _canonical_control == "nevertreated": n_never_enabled = _safe_int(getattr(r, "n_never_enabled", None)) n_control = None if is_dynamic_control: if name == "StaggeredTripleDiffResults": n_never_enabled = _safe_int(getattr(r, "n_never_enabled", None)) n_control = None elif name == "StackedDiDResults": # ``n_control_units`` is "distinct control units across # the trimmed set" (stacked_did_results.py L59-62) which # includes future-treated controls by construction under # both dynamic modes. Do NOT relabel as # ``n_never_treated``; instead surface the count under # ``n_distinct_controls_trimmed`` (sub-experiment- # specific context) and clear ``n_control`` so the # report does not narrate a fixed control pool. n_control = None elif _never_treated_count_contract: n_never_treated = n_control_units n_control = None # Panel-vs-RCS count semantics. CallawaySantAnnaResults stores # treated/control counts as OBSERVATIONS (not units) when the # fit used ``panel=False`` — ``staggered_results.py L183-L184`` # renders those counts as "obs:" rather than "units:". BR # previously labeled them as "units" / "present in the panel", # which misstates the sample composition for repeated cross- # section fits. Carry the flag into the schema so rendering can # branch. Round-28 P2 CI review on PR #318. count_unit = "observations" if getattr(r, "panel", True) is False else "units" sample_block: Dict[str, Any] = { "n_obs": _safe_int(getattr(r, "n_obs", None)), "n_treated": n_treated, "n_control": n_control, "n_never_treated": n_never_treated, "control_group": control_group if isinstance(control_group, str) else None, "dynamic_control": is_dynamic_control, "n_periods": _safe_int(getattr(r, "n_periods", None)), "pre_periods": _safe_list_len(getattr(r, "pre_periods", None)), "post_periods": _safe_list_len(getattr(r, "post_periods", None)), "count_unit": count_unit, "survey": survey, } if n_never_enabled is not None: sample_block["n_never_enabled"] = n_never_enabled # Stacked-specific: surface the distinct-control-units tally on a # dedicated key so agents see the sub-experiment-specific # comparison count without misreading it as a never-treated # subset (round-21 / round-22 CI review). if name == "StackedDiDResults": sample_block["n_distinct_controls_trimmed"] = n_control_units return sample_block def _extract_survey_block(self) -> Optional[Dict[str, Any]]: sm = getattr(self._results, "survey_metadata", None) if sm is None: return None deff = _safe_float(getattr(sm, "design_effect", None)) return { "weight_type": getattr(sm, "weight_type", None), "effective_n": _safe_float(getattr(sm, "effective_n", None)), "design_effect": deff, # Round-43 P2 CI review on PR #318: the ``is_trivial`` # upper bound matches DR's ``_check_design_effect`` and # REPORTING.md's ``trivial`` band definition # ``0.95 <= deff < 1.05`` (half-open). The prior closed # interval ``<= 1.05`` produced ``is_trivial=True`` at # exactly ``deff == 1.05`` while the DR schema emitted # ``band_label="slightly_reduces"`` for the same value, # suppressing BR's non-trivial prose at that boundary. "is_trivial": deff is not None and 0.95 <= deff < 1.05, "n_strata": _safe_int(getattr(sm, "n_strata", None)), "n_psu": _safe_int(getattr(sm, "n_psu", None)), "df_survey": _safe_int(getattr(sm, "df_survey", None)), "replicate_method": getattr(sm, "replicate_method", None), }
# --------------------------------------------------------------------------- # Schema helpers (module-private) # --------------------------------------------------------------------------- def _safe_float(val: Any) -> Optional[float]: if val is None: return None try: return float(val) except (TypeError, ValueError): return None def _safe_int(val: Any) -> Optional[int]: if val is None: return None try: return int(val) except (TypeError, ValueError): return None def _safe_ci(ci: Any) -> Optional[List[float]]: if ci is None: return None try: lo, hi = ci except (TypeError, ValueError): return None lo_f = _safe_float(lo) hi_f = _safe_float(hi) if lo_f is None or hi_f is None: return None return [lo_f, hi_f] def _safe_list_len(val: Any) -> Optional[int]: if val is None: return None try: return int(len(val)) except TypeError: return None def _lift_pre_trends(dr: Optional[Dict[str, Any]]) -> Dict[str, Any]: """Pull pre-trends + power into a single BR-facing block.""" if dr is None: return {"status": "skipped", "reason": "auto_diagnostics=False"} pt = dr.get("parallel_trends") or {} pp = dr.get("pretrends_power") or {} if pt.get("status") != "ran": return { "status": pt.get("status", "not_run"), "reason": pt.get("reason"), } return { "status": "computed", "method": pt.get("method"), "joint_p_value": pt.get("joint_p_value"), "verdict": pt.get("verdict"), "n_pre_periods": pt.get("n_pre_periods"), # Preserve DR's inconclusive-PT provenance on the BR schema so # downstream consumers (and BR's own summary renderer) see the # undefined-row count and DR's detailed reason without having # to re-consult the DR schema (round-39 P3 CI review on PR # #318). These fields are populated only when # ``verdict == "inconclusive"`` per ``_pt_event_study``'s # inconclusive branch (``diagnostic_report.py:999``). "n_dropped_undefined": pt.get("n_dropped_undefined"), "reason": pt.get("reason"), # Carry the denominator df through when the survey F-reference # branch was used so BR consumers can flag the finite-sample # correction without re-consulting the DR schema (round-28 P3 # CI review on PR #318). "df_denom": pt.get("df_denom"), "power_status": pp.get("status"), # Dedicated reason field so schema consumers see the fallback # explanation when ``compute_pretrends_power`` cannot run # (``status in {"skipped", "error", "not_applicable"}``). # REPORTING.md lines 118-125 promise this provenance; round-29 # P3 CI review on PR #318 flagged that only the enum status was # being exposed and the reason was dropped at the lift boundary. # ``power_status`` stays the machine-readable enum; ``power_reason`` # carries the plain-English explanation. "power_reason": pp.get("reason"), "power_tier": pp.get("tier"), "mdv": pp.get("mdv"), # Level-scale max pre-period violation under the MDV # (PR-B R12: `mdv * max(|violation_weights|)`). Carried alongside # the raw `mdv` so BR schema consumers and the full-report # renderer can show both quantities. Pre-R14 this was silently # dropped at the BR lift boundary so the new renderer line never # fired even though DR emitted the value. "max_abs_pre_violation": pp.get("max_abs_pre_violation"), "mdv_share_of_att": pp.get("mdv_share_of_att"), # Carry the covariance-source annotation through so BR can hedge the # power-tier phrasing when compute_pretrends_power silently used a # diagonal fallback despite event_study_vcov being available. "power_covariance_source": pp.get("covariance_source"), } def _lift_sensitivity(dr: Optional[Dict[str, Any]]) -> Dict[str, Any]: if dr is None: return {"status": "skipped", "reason": "auto_diagnostics=False"} sens = dr.get("sensitivity") or {} if sens.get("status") != "ran": # Preserve ``method`` through to the BR schema so downstream # consumers can distinguish a native-routed skip # (``method="estimator_native"`` for SDiD / TROP, where # robustness is covered by the native battery) from a # methodology-blocked skip (e.g., CS with # ``base_period='varying'``). Without it, agents reading the BR # schema alone cannot tell these cases apart and would have to # re-consult the DR schema to disambiguate. return { "status": sens.get("status", "not_run"), "reason": sens.get("reason"), "method": sens.get("method"), } return { "status": "computed", "method": sens.get("method"), "breakdown_M": sens.get("breakdown_M"), "conclusion": sens.get("conclusion"), "grid": sens.get("grid"), } def _lift_heterogeneity(dr: Optional[Dict[str, Any]]) -> Dict[str, Any]: """Return the heterogeneity section of the BR schema. Round-31 P2 CI review on PR #318: the lift previously returned ``None`` on any non-``ran`` path, which broke the schema contract that every top-level BR key resolves to a dict with a ``status`` field. Downstream consumers had to special-case this one section. Now returns a dict-shaped ``{"status": ..., "reason": ...}`` block mirroring DR's own status enum so ``schema["heterogeneity"] ["status"]`` is always readable. """ if dr is None: return {"status": "skipped", "reason": "auto_diagnostics=False"} het = dr.get("heterogeneity") or {} status = het.get("status") if status != "ran": return { "status": status or "not_run", "reason": het.get("reason"), } return { "status": "ran", "source": het.get("source"), "n_effects": het.get("n_effects"), "min": het.get("min"), "max": het.get("max"), "cv": het.get("cv"), "sign_consistent": het.get("sign_consistent"), } def _lift_robustness(dr: Optional[Dict[str, Any]]) -> Dict[str, Any]: if dr is None: return {"status": "skipped", "reason": "auto_diagnostics=False"} bacon = dr.get("bacon") or {} native = dr.get("estimator_native_diagnostics") or {} native_block = { "status": native.get("status"), "estimator": native.get("estimator"), "pre_treatment_fit": native.get("pre_treatment_fit"), } # Classic SCM exposes pre_rmspe + donor-weight concentration + the (opt-in) # in-space placebo rather than SDiD's pre_treatment_fit; surface those so the # top-level robustness block is not empty for SyntheticControl. if native.get("estimator") == "SyntheticControl": native_block["pre_rmspe"] = native.get("pre_rmspe") native_block["weight_concentration"] = native.get("weight_concentration") native_block["in_space_placebo"] = native.get("in_space_placebo") # ADH-2015 robustness diagnostics (opt-in; "not_run" stub until run). native_block["leave_one_out"] = native.get("leave_one_out") native_block["in_time_placebo"] = native.get("in_time_placebo") return { "bacon": { "status": bacon.get("status"), "forbidden_weight": bacon.get("forbidden_weight"), "verdict": bacon.get("verdict"), }, "estimator_native": native_block, } def _anticipation_periods(results: Any) -> int: """Return the non-negative anticipation-period count from a result, or 0. Helper for ``_describe_assumption``. Anticipation-capable estimators (MultiPeriodDiD, CS, SA, ImputationDiD, TwoStageDiD, Stacked, EfficientDiD, StaggeredTripleDiff, ContinuousDiD, Wooldridge) expose ``anticipation`` as an int defaulting to ``0``. """ a = getattr(results, "anticipation", 0) try: k = int(a) except (TypeError, ValueError): return 0 return k if k > 0 else 0 def _control_group_choice(results: Any) -> Optional[str]: """Return the control-group choice string for a fitted result, normalized across estimator-specific attribute names. Most anticipation-capable estimators expose the control-group choice as ``results.control_group``. ``StackedDiDResults`` exposes the same choice as ``clean_control`` (the public Wing-Freedman-Hollingsworth-2024 kwarg name). Without this alias, a StackedDiD fit with ``clean_control="not_yet_treated"`` would surface as ``control_group=None`` in the business-report schema, and the dynamic-control branch in ``_extract_sample`` would never fire. """ cg = getattr(results, "control_group", None) if isinstance(cg, str): return cg if type(results).__name__ == "StackedDiDResults": clean = getattr(results, "clean_control", None) if isinstance(clean, str): return clean return None _STRICT_NO_ANTICIPATION_PATTERNS = ( # Ordered from most specific to least specific so the first match # wins on strings that could match multiple patterns. Matches are # case-sensitive because every occurrence in ``_describe_assumption`` # is a fixed canonical phrase. ", plus no anticipation", "plus no anticipation", " Also assumes no anticipation (Assumption NA), overlap " "(Assumption O), and absorbing / irreversible treatment.", " Also assumes no anticipation.", "Also assumes no anticipation.", " and no anticipation", ) def _strip_strict_no_anticipation(desc: str) -> str: """Remove any strict no-anticipation phrasing from ``desc``. Several base assumption descriptions in ``_describe_assumption`` hard-code a strict "plus no anticipation" / "Also assumes no anticipation" clause (CS / SA / Imputation / TwoStage / Wooldridge generic, StackedDiD sub-experiment, EfficientDiD PT-Post, EfficientDiD PT-All, ContinuousDiD, TripleDifference, SyntheticDiD, TROP, dCDH, and the fallback unconditional branch). When a fit actually allows anticipation the helper must REPLACE that wording, not append a contradictory clause on top of it. Round-30 P1 CI review on PR #318. """ if not desc: return desc out = desc for pattern in _STRICT_NO_ANTICIPATION_PATTERNS: out = out.replace(pattern, "") # Collapse any doubled whitespace or dangling punctuation left by # the removal (e.g., "cohorts, with..." -> "cohorts, with..."; # "cohorts . " -> "cohorts."). out = re.sub(r"\s+\.", ".", out) out = re.sub(r"\s+,", ",", out) out = re.sub(r" {2,}", " ", out) return out.strip() def _apply_anticipation_to_assumption(block: Dict[str, Any], results: Any) -> Dict[str, Any]: """If the fit used ``anticipation > 0``, flip ``no_anticipation`` off, strip any strict no-anticipation wording from the base description, and append an anticipation-aware clause. Round-17 CI review flagged the strict "plus no anticipation" language on anticipation-enabled fits. Per REGISTRY.md §CallawaySantAnna lines 355-395 and the matching sections for SA / MultiPeriod / Wooldridge / EfficientDiD, a fit with ``anticipation=k`` shifts the effective treatment boundary by ``k`` pre-periods; the identifying assumption becomes "no treatment effects earlier than ``k`` periods before the treatment start" rather than strict no-anticipation. Round-30 CI review caught that the previous implementation only appended — the resulting prose said both "strict no-anticipation holds" and "anticipation is allowed" in the same paragraph. """ k = _anticipation_periods(results) if k <= 0: return block block = dict(block) # don't mutate the caller's dict block["no_anticipation"] = False block["anticipation_periods"] = k period_word = "period" if k == 1 else "periods" clause = ( f" Anticipation is allowed for the {k} {period_word} immediately " "before treatment: the identifying contract requires no treatment " f"effects earlier than {k} {period_word} before the treatment " "start (not strict no-anticipation)." ) desc = block.get("description", "") if isinstance(desc, str): block["description"] = _strip_strict_no_anticipation(desc) + clause return block def _describe_assumption(estimator_name: str, results: Any = None) -> Dict[str, Any]: """Return the identifying-assumption block for an estimator.""" if estimator_name in { "SyntheticDiDResults", }: return { "parallel_trends_variant": "weighted_pt", "no_anticipation": True, "description": ( "Synthetic-Difference-in-Differences identifies the ATT under a " "weighted parallel-trends analogue: the synthetic control is " "chosen to match the treated group's pre-period trajectory." ), } if estimator_name in {"TROPResults"}: return { "parallel_trends_variant": "factor_model", "no_anticipation": True, "description": ( "TROP uses low-rank factor-model identification rather than a " "parallel-trends assumption; unobserved heterogeneity is " "captured through latent factor loadings." ), } if estimator_name in {"SyntheticControlResults"}: return { # Distinct from SDiD's "synthetic_fit" weighted-PT analogue: classic # SCM is a donor-weighted level match (matches the DR "scm_fit" method). "parallel_trends_variant": "scm_fit", "no_anticipation": True, "description": ( "Classic synthetic control identifies the single treated unit's " "counterfactual via a donor-weighted match to its pre-treatment " "trajectory (a design-enforced fit, not a parallel-trends test); " "significance comes from in-space placebo permutation inference " "rather than an analytical standard error." ), } if estimator_name == "ContinuousDiDResults": # Callaway, Goodman-Bacon & Sant'Anna (2024), two-level PT: # REGISTRY.md §ContinuousDiD > Identification. return { "parallel_trends_variant": "dose_pt_or_strong_pt", "no_anticipation": True, "description": ( "ContinuousDiD identifies dose-specific treatment effects " "under two possible parallel-trends conditions (Callaway, " "Goodman-Bacon & Sant'Anna 2024). Parallel Trends (PT) " "assumes untreated potential outcome paths are equal across " "all dose groups and the untreated group (conditional on " "dose), identifying ATT(d|d) and the binarized ATT^loc but " "NOT ATT(d), ACRT, or cross-dose comparisons. Strong " "Parallel Trends (SPT) additionally rules out selection " "into dose on the basis of treatment effects and is " "required to identify the dose-response curve ATT(d), " "marginal effect ACRT(d), and cross-dose contrasts." ), } if estimator_name in {"TripleDifferenceResults", "StaggeredTripleDiffResults"}: # Ortiz-Villavicencio & Sant'Anna (2025) — identification is the # triple-difference cancellation across the 2x2x2 cells, not # ordinary DiD parallel trends; see REGISTRY.md §TripleDifference # and §StaggeredTripleDifference. return { "parallel_trends_variant": "triple_difference_cancellation", "no_anticipation": True, "description": ( "Triple-difference identification relies on the DDD " "decomposition (Ortiz-Villavicencio & Sant'Anna 2025): " "the ATT is recovered from `DDD = DiD_A + DiD_B - DiD_C` " "across the Group x Period x Eligibility (or Treatment) " "cells, which differences out group-specific and " "period-specific unobservables without requiring separate " "parallel trends to hold between each cell pair. The " "identifying restriction is therefore weaker than ordinary " "DiD parallel trends but assumes that the residual " "unobservable component is additively separable across the " "three dimensions; practical overlap and common-support " "conditions still apply on the propensity score when " "covariates are used." ), } if estimator_name == "ChaisemartinDHaultfoeuilleResults": # de Chaisemartin & D'Haultfoeuille (2020, 2024) — identification is # transition-based across (joiner, leaver, stable-control) cells # around each switching period, not a group-time ATT parallel- # trends restriction. Writing up dCDH as "parallel trends across # treatment cohorts" was flagged as a source-faithfulness bug in # PR #318 review; REGISTRY.md §ChaisemartinDHaultfoeuille is # explicit about the transition-set construction. # # Phase-3 features (``controls``, ``trends_linear``, # ``heterogeneity``) each modify the identifying contract and # change the estimand from ``DID_l`` to ``DID^X_l`` / # ``DID^{fd}_l`` / the heterogeneity-test variant. When active, # append an explicit clause so the description does not # misrepresent the identifying assumption (the reviewer has # flagged several parallel source-faithfulness gaps elsewhere # — explicitly surfacing Phase-3 config matches the per-estimator # walkthrough pattern). base_description = ( "Identification is transition-based (de Chaisemartin & " "D'Haultfoeuille 2020; dynamic companion 2024). At each " "switching period, the estimator contrasts joiners " "(D:0->1), leavers (D:1->0), and stable-treated / " "stable-untreated control cells that share the same " "treatment state across adjacent periods, yielding the " "contemporaneous ``DID_M`` and per-horizon ``DID_l`` / " "``DID_{g,l}`` building blocks. The identifying " "restriction is parallel trends within each transition's " "stable-control cell (not a single group-time ATT PT " "condition across all cohorts) plus no anticipation; " "with non-binary treatment the stable-control match is " "additionally on exact baseline dose ``D_{g,1}``. " "Reversible treatment is natively supported, unlike the " "absorbing-treatment designs that rely on a fixed " "treatment-onset cohort." ) has_controls = ( results is not None and getattr(results, "covariate_residuals", None) is not None ) # PR #347 R10 P1: read the persisted ``trends_linear`` flag # first — empty-horizon trends-linear fits set # ``linear_trends_effects=None`` but are still trends-linear # per the estimator contract. Legacy fit objects predating # the persisted field fall back to the presence inference. _trends_persisted = getattr(results, "trends_linear", None) if results is not None else None if isinstance(_trends_persisted, bool): has_trends = _trends_persisted else: has_trends = ( results is not None and getattr(results, "linear_trends_effects", None) is not None ) has_heterogeneity = ( results is not None and getattr(results, "heterogeneity_effects", None) is not None ) active_parts: List[str] = [] if has_controls and has_trends: active_parts.append( "the estimand is ``DID^{X,fd}_l`` (covariate-residualized " "first-differences), and identification holds conditional on " "the covariates entering the first-stage regression and " "allowing group-specific linear trends" ) elif has_controls: active_parts.append( "the estimand is ``DID^X_l``, and identification holds " "conditional on the covariates entering the first-stage " "residualization" ) elif has_trends: active_parts.append( "the estimand is ``DID^{fd}_l`` (first-differenced) and the " "identifying restriction is relaxed to allow group-specific " "linear pre-trends" ) if has_heterogeneity: active_parts.append("heterogeneity tests ``beta^{het}_l`` are reported per horizon") if active_parts: phase3_clause = " Phase-3 configuration: " + "; ".join(active_parts) + "." base_description = base_description + phase3_clause return { "parallel_trends_variant": "transition_based", "no_anticipation": True, "description": base_description, } if estimator_name == "EfficientDiDResults": # Chen, Sant'Anna & Xie (2025) — identification is parameterized # by ``pt_assumption`` ("all" vs "post"). PT-All is the stronger # regime (PT across all groups/periods, over-identified — paper # Lemma 2.1), PT-Post the weaker (PT only in post-treatment, # just-identified reduction to single-baseline DiD per Corollary # 3.2). Also read ``control_group`` when present (not_yet_treated # vs last_cohort) to be source-faithful to REGISTRY.md §EfficientDiD # lines 736-738 and 907. pt_assumption = getattr(results, "pt_assumption", "all") control_group = getattr(results, "control_group", None) # The estimator only accepts ``control_group`` values of # ``"never_treated"`` (the default) or ``"last_cohort"``. When # ``last_cohort`` is used, the latest treatment cohort is # reclassified as a pseudo-never-treated comparison and time # periods at/after its onset are dropped; describing such a fit # with generic never-treated language would misstate the # identifying setup (see REGISTRY.md §EfficientDiD line 908). is_last_cohort = control_group == "last_cohort" if pt_assumption == "post": variant = "pt_post" if is_last_cohort: control_clause = ( "the comparison group is the latest treated cohort " "reclassified as pseudo-never-treated (periods " "at/after that cohort's treatment start are " "dropped)" ) else: control_clause = "the comparison group is never-treated" description = ( "Identification under PT-Post (Chen, Sant'Anna & Xie " "2025): parallel trends holds only in post-treatment " "periods, " + control_clause + ", and the baseline is period g-1 only. This is the " "weaker of the two regimes — just-identified and " "reducing to standard single-baseline DiD (Corollary " "3.2). Also assumes no anticipation (Assumption NA), " "overlap (Assumption O), and absorbing / irreversible " "treatment." ) else: variant = "pt_all" if is_last_cohort: baseline_clause = ( "using the latest treated cohort as a pseudo-never-" "treated comparison (periods at/after that cohort's " "treatment start are dropped); any earlier cohort " "and any pre-treatment period can serve as baseline" ) else: baseline_clause = ( "using never-treated units as comparison; any " "not-yet-treated cohort and any pre-treatment period " "can serve as baseline" ) description = ( "Identification under PT-All (Chen, Sant'Anna & Xie " "2025): parallel trends holds for all groups and all " "periods, " + baseline_clause + ". The estimator is over-identified (Lemma 2.1), and " "the paper's optimal combination weights are applied. " "Also assumes no anticipation (Assumption NA), overlap " "(Assumption O), and absorbing / irreversible " "treatment. The Hausman PT-All vs PT-Post pretest " "(operating on the post-treatment event-study vector " "ES(e), Theorem A.1) checks whether the stronger " "PT-All regime is tenable." ) block: Dict[str, Any] = { "parallel_trends_variant": variant, "no_anticipation": True, "description": description, } if isinstance(control_group, str): block["control_group"] = control_group return block if estimator_name == "StackedDiDResults": # Wing, Freedman & Hollingsworth (2024) — identification is # sub-experiment common trends plus the IC1 (event window fits # within the data range) and IC2 (clean controls exist for the # event) inclusion conditions, NOT the generic "group-time ATT # parallel trends" clause used for CS / SA / etc. (round-22 P1 # CI review on PR #318). The active ``clean_control`` rule # determines which units qualify as valid controls for each # adoption event. REGISTRY.md §StackedDiD lines 1189-1193 # (identification) and 1234-1256 (clean-control rules). clean_control = getattr(results, "clean_control", None) if clean_control == "never_treated": control_clause = ( "controls are restricted to units that are never treated " "over the panel (``A_s = infinity``)" ) elif clean_control == "strict": control_clause = ( "controls for event ``a`` are units satisfying the strict " "rule ``A_s > a + kappa_post + kappa_pre`` (strictly " "untreated across the full pre- and post-event window)" ) else: # Default: "not_yet_treated" — A_s > a + kappa_post. control_clause = ( "controls for event ``a`` are units satisfying ``A_s > a + " "kappa_post`` (not yet treated through the end of the " "event's post-window, so future-treated units can serve " "as controls for earlier events)" ) block: Dict[str, Any] = { "parallel_trends_variant": "stacked_sub_experiment", "no_anticipation": True, "description": ( "Identification under Stacked DiD (Wing, Freedman & " "Hollingsworth 2024): within each stacked sub-experiment " "parallel trends holds between the treated cohort and the " "corresponding clean-control set over the event window " "``[-kappa_pre, +kappa_post]``; " + control_clause + ". Sub-experiments are restricted by IC1 (the event " "window fits within the available time range) and IC2 " "(at least one clean control exists). The aggregate ATT is " "a weighted sum over sub-experiments, so the common-trends " "assumption is sub-experiment-specific, not a single " "panel-wide group-time ATT condition. Also assumes no " "anticipation." ), } if isinstance(clean_control, str): block["control_group"] = clean_control block["clean_control"] = clean_control return block if estimator_name == "ImputationDiDResults": # Borusyak, Jaravel & Spiess (2024) — identification is through # an untreated-potential-outcome model: unit+time FE (optionally # plus covariates) fitted on untreated observations only # (``Omega_0``) deliver the counterfactual ``Y_it(0)``, and the # treatment effect ``tau_it`` is the residual on treated # observations. Writing this as generic "group-time ATT # parallel trends" misstates the identifying model — the # restriction is on the UNTREATED outcome's additive FE # structure, not on cohort-time ATT equality. REGISTRY.md # §ImputationDiD lines 1000-1013 and Assumption 1 (parallel # trends) + Assumption 2 (no anticipation on untreated # observations). Round-42 P1 CI review on PR #318 flagged this # source-faithfulness gap. return { "parallel_trends_variant": "untreated_outcome_fe_model", "no_anticipation": True, "description": ( "Identification under Imputation DiD (Borusyak, Jaravel " "& Spiess 2024): the untreated potential outcome " "``Y_it(0)`` follows an additive unit+time fixed-effects " "model ``Y_it(0) = alpha_i + beta_t [+ X'_it * delta] + " "epsilon_it``. Step 1 estimates those FE on untreated " "observations only (``Omega_0`` = never-treated plus " "not-yet-treated cells); Step 2 imputes the " "counterfactual for treated observations from the " "fitted FE; Step 3 aggregates ``tau_hat_it = Y_it - " "Y_hat_it(0)`` with researcher-chosen weights. The " "identifying restriction is therefore parallel trends " "of the UNTREATED outcome model (Assumption 1) — " "``E[Y_it(0)] = alpha_i + beta_t``, holding across all " "observations — rather than equality of cohort-time " "ATTs. Also assumes no anticipation on untreated " "observations (Assumption 2) and absorbing treatment." ), } if estimator_name == "TwoStageDiDResults": # Gardner (2022) — identification is the same as BJS # ImputationDiD (point estimates are algebraically equivalent # per REGISTRY.md §TwoStageDiD line 1130): unit+time FE # estimated on untreated observations only deliver the # untreated potential-outcome trajectory; Stage 2 regresses # the resulting residuals on treatment indicators. Writing # this as generic "group-time ATT parallel trends" loses the # load-bearing detail that Stage 1 operates only on untreated # cells. REGISTRY.md §TwoStageDiD lines 1113-1128 and # Assumption (same as ImputationDiD). Round-42 P1 CI review on # PR #318 flagged this source-faithfulness gap. return { "parallel_trends_variant": "untreated_outcome_fe_model", "no_anticipation": True, "description": ( "Identification under Two-Stage DiD (Gardner 2022): " "Stage 1 fits unit + time fixed effects on untreated " "observations only (``Omega_0``), residualizing the " "outcome as ``y_tilde_it = Y_it - alpha_hat_i - " "beta_hat_t``; Stage 2 regresses residualized outcomes " "on the treatment indicator across treated observations " "to recover the ATT. The point estimates are " "algebraically equivalent to Borusyak-Jaravel-Spiess " "imputation (both rely on the same untreated-outcome FE " "model to construct the counterfactual). The " "identifying restriction is therefore parallel trends " "of the UNTREATED outcome: ``E[Y_it(0)] = alpha_i + " "beta_t`` for all observations (not a group-time ATT " "equality across cohorts). Also assumes no anticipation " "(``Y_it = Y_it(0)`` for all untreated observations) " "and absorbing / irreversible treatment." ), } if estimator_name in { "CallawaySantAnnaResults", "SunAbrahamResults", "WooldridgeDiDResults", }: return { "parallel_trends_variant": "conditional_or_group_time", "no_anticipation": True, "description": ( "Identification relies on parallel trends across treatment " "cohorts and time periods (group-time ATT), plus no " "anticipation." ), } return { "parallel_trends_variant": "unconditional", "no_anticipation": True, "description": ( "Identification relies on the standard DiD parallel-trends " "assumption plus no anticipation of treatment by either group." ), } def _build_caveats( _results: Any, headline: Dict[str, Any], sample: Dict[str, Any], dr_schema: Optional[Dict[str, Any]], ) -> List[Dict[str, Any]]: """Assemble the plain-English caveats list for the headline schema.""" caveats: List[Dict[str, Any]] = [] # NaN ATT is the highest-severity caveat. if headline.get("sign") == "undefined": caveats.append( { "severity": "warning", "topic": "estimation_failure", "message": ( "Estimation produced a non-finite effect. Inspect data " "preparation and model specification before interpreting." ), } ) # Alpha override could not be honored (bootstrap / finite-df inference). alpha_override_msg = headline.get("alpha_override_caveat") if isinstance(alpha_override_msg, str) and alpha_override_msg: caveats.append( { "severity": "info", "topic": "alpha_override_preserved", "message": alpha_override_msg, } ) # Near-threshold p-value. if headline.get("near_significance_threshold"): caveats.append( { "severity": "info", "topic": "near_significance", "message": ( "The p-value is close to the conventional significance " "threshold; small changes to the sample or specification " "could move it either way." ), } ) # Few treated units. nt = sample.get("n_treated") if nt is not None and nt <= 3: caveats.append( { "severity": "warning", "topic": "few_treated", "message": ( f"Only {nt} treated units in this fit; standard errors " "rely on large-cluster asymptotics and may be unreliable. " "Consider SyntheticDiD or an exact-permutation inference " "alternative." ), } ) # Non-trivial design effect. survey = sample.get("survey") if survey and not survey.get("is_trivial"): deff = survey.get("design_effect") eff_n = survey.get("effective_n") if isinstance(deff, (int, float)) and deff >= 5.0: caveats.append( { "severity": "warning", "topic": "design_effect", "message": ( f"Very large survey design effect (DEFF = {deff:.2g}). " "Inspect the weight distribution and consider weight " "trimming if driven by outlier weights." ), } ) elif isinstance(deff, (int, float)) and deff >= 1.5: if isinstance(eff_n, (int, float)): caveats.append( { "severity": "info", "topic": "design_effect", "message": ( f"Survey design reduces effective sample size: " f"DEFF = {deff:.2g}; effective n = {eff_n:.0f}." ), } ) # Bacon forbidden comparisons. # Round-45 P1 CI review on PR #318: Goodman-Bacon is a # decomposition of TWFE weights (see ``bacon.py`` header and # Goodman-Bacon 2021). On fits already produced by a # heterogeneity-robust estimator (CS / SA / BJS / Gardner / # Wooldridge / EfficientDiD / Stacked / dCDH / TripleDifference / # StaggeredTripleDiff / SDiD / TROP), a high forbidden-weight share # says "TWFE would have been materially biased on this rollout", # not "the displayed estimator needs to be replaced" — the # displayed estimator is already robust to the heterogeneity that # Bacon flags. DR partly preserves this with "if not already in # use" prose; BR must carry the same distinction through to the # caveat. The TWFE-style estimators whose results route through # Bacon and for which the "switch to a robust estimator" # recommendation is load-bearing are the DiDResults-type fits; all # other result classes are already robust. _TWFE_STYLE_RESULTS: FrozenSet[str] = frozenset( {"DiDResults", "MultiPeriodDiDResults", "TwoWayFixedEffectsResults"} ) if dr_schema: bacon = dr_schema.get("bacon") or {} if bacon.get("status") == "ran": fw = bacon.get("forbidden_weight") if isinstance(fw, (int, float)) and fw > 0.10: _estimator_name = type(_results).__name__ if _estimator_name in _TWFE_STYLE_RESULTS: bacon_message = ( f"Goodman-Bacon decomposition places {fw:.0%} " "of implicit TWFE weight on 'forbidden' " "later-vs-earlier comparisons. TWFE may be " "materially biased under heterogeneous effects. " "Re-estimate with a heterogeneity-robust " "estimator (CS / SA / BJS / Gardner)." ) else: bacon_message = ( f"Goodman-Bacon decomposition places {fw:.0%} " "of TWFE weight on 'forbidden' later-vs-earlier " "comparisons. A TWFE benchmark on this rollout " "would be materially biased under heterogeneous " "effects; the displayed estimator is already " "heterogeneity-robust, so this is a statement " "about the rollout design (avoid reporting TWFE " "alongside this fit), not about the current " "result's validity." ) caveats.append( { "severity": "warning", "topic": "bacon_contamination", "message": bacon_message, } ) # Fragile sensitivity. sens = dr_schema.get("sensitivity") or {} if sens.get("status") == "ran": bkd = sens.get("breakdown_M") if isinstance(bkd, (int, float)) and bkd < 0.5: caveats.append( { "severity": "warning", "topic": "sensitivity_fragility", "message": ( f"HonestDiD breakdown value is {bkd:.2g}: the " "result's confidence interval includes zero " "once parallel-trends violations reach less than " "half the observed pre-period variation. Treat " "the headline as tentative." ), } ) # Sensitivity was skipped for methodology reasons (e.g., CS fit with # ``base_period='varying'`` — HonestDiD bounds are not interpretable # there). Surface the reason as a warning-severity caveat so readers # do not assume the headline is robust across the R-R grid. # # Exception (round-20 P2 CI review on PR #318): SDiD and TROP route # robustness to ``estimator_native_diagnostics`` and mark the HonestDiD # sensitivity block ``status="skipped", method="estimator_native"``. # Surfacing "sensitivity was not run" as a warning contradicts the # documented native-routing contract when the native battery actually # ran. Suppress the warning and point readers at the native block # instead. if sens.get("status") == "skipped": reason = sens.get("reason") method = sens.get("method") native = dr_schema.get("estimator_native_diagnostics") or {} native_ran = native.get("status") == "ran" if method == "estimator_native" and native_ran: caveats.append( { "severity": "info", "topic": "sensitivity_native_routed", "message": ( "HonestDiD was not run for this estimator. Robustness " "is covered by the estimator-native sensitivity " "diagnostics reported under " "``estimator_native_diagnostics``." ), } ) elif isinstance(reason, str) and reason: caveats.append( { "severity": "warning", "topic": "sensitivity_skipped", "message": ("HonestDiD sensitivity was not run on this fit. " + reason), } ) # Non-fatal warnings captured from delegated diagnostics # (e.g., HonestDiD's bootstrap diag-covariance fallback, dropped # non-consecutive horizons on dCDH). DR already records these in # ``schema["warnings"]``; mirror the methodology-critical ones # into BR's caveat list so summary/full-report prose can surface # them without readers having to inspect the DR schema. for msg in dr_schema.get("warnings", []) or []: if not isinstance(msg, str) or not msg: continue # Skip alpha-override and design-effect messages already # covered by dedicated caveats above. lower = msg.lower() if "sensitivity:" in lower or "pretrends_power:" in lower: caveats.append( { "severity": "info", "topic": "diagnostic_warning", "message": msg, } ) # Unit mismatch caveat (log_points + unit override). unit_kind = headline.get("unit_kind") if unit_kind == "log_points": caveats.append( { "severity": "info", "topic": "unit_policy", "message": ( "The effect is reported in log-points as estimated; " "BusinessReport does not arithmetically translate log-points " "to percent or level changes. For small effects, log-points " "approximate percentage changes." ), } ) return caveats def _pt_method_subject(method: Optional[str]) -> str: """Return a source-faithful sentence subject for the PT verdict prose. The ``parallel_trends.method`` field distinguishes between the 2x2 slope-difference check, the pre-period event-study Wald / Bonferroni variants, EfficientDiD's Hausman PT-All vs PT-Post pretest, SDiD's weighted pre-treatment fit, and TROP's factor- model identification. Generic "pre-treatment event-study" wording is wrong for the first and third cases. See round-8 CI review on PR #318 and REGISTRY.md §EfficientDiD (Hausman pretest). """ if method == "slope_difference": return "The pre-period slope-difference test" if method == "hausman": return "The Hausman PT-All vs PT-Post pretest" if method in { "joint_wald", "joint_wald_event_study", "joint_wald_no_vcov", "bonferroni", # Survey-aware event-study PT variants use an F reference # distribution with denominator df = ``survey_metadata.df_survey`` # (round-27 P1 fix, documented in REPORTING.md). The subject # remains the pre-period event-study coefficients; prose elsewhere # flags the finite-sample correction via ``df_denom``. "joint_wald_survey", "joint_wald_event_study_survey", }: return "Pre-treatment event-study coefficients" if method == "synthetic_fit": return "The synthetic-control pre-treatment fit" if method == "scm_fit": return "The synthetic-control donor-weighted pre-treatment fit" if method == "factor": return "The factor-model pre-treatment fit" return "Pre-treatment data" def _pt_method_stat_label(method: Optional[str]) -> Optional[str]: """Return the joint-statistic label appropriate to the PT method. Returns ``"joint p"`` for Wald / Bonferroni paths (including the survey-aware F-reference variants, which remain joint tests on the pre-period coefficient vector — only the reference distribution changes), ``"p"`` for the 2x2 slope-difference and Hausman paths (single-statistic tests), and ``None`` for design-enforced paths that have no p-value. """ if method in { "joint_wald", "joint_wald_event_study", "joint_wald_no_vcov", "bonferroni", "joint_wald_survey", "joint_wald_event_study_survey", }: return "joint p" if method in {"slope_difference", "hausman"}: return "p" if method in {"synthetic_fit", "scm_fit", "factor"}: # Design-enforced fit-based paths have no p-value label (SCM's significance # is the in-space placebo, not a PT joint test). return None return "joint p" def _references_for(estimator_name: str) -> List[Dict[str, str]]: """Map the estimator to the appropriate citation references.""" base = [ { "role": "sensitivity", "citation": ( "Rambachan, A., & Roth, J. (2023). A More Credible Approach " "to Parallel Trends. Review of Economic Studies." ), }, { "role": "workflow", "citation": ( "Baker, A. C., Callaway, B., Cunningham, S., Goodman-Bacon, A., " "& Sant'Anna, P. H. C. (2025). Difference-in-Differences " "Designs: A Practitioner's Guide." ), }, ] estimator_refs = { "CallawaySantAnnaResults": { "role": "estimator", "citation": ( "Callaway, B., & Sant'Anna, P. H. C. (2021). " "Difference-in-Differences with multiple time periods. " "Journal of Econometrics." ), }, "SyntheticDiDResults": { "role": "estimator", "citation": ( "Arkhangelsky, D., Athey, S., Hirshberg, D. A., Imbens, G. W., " "& Wager, S. (2021). Synthetic Difference in Differences." ), }, "SyntheticControlResults": { "role": "estimator", "citation": ( "Abadie, A., Diamond, A., & Hainmueller, J. (2010). Synthetic " "Control Methods for Comparative Case Studies. JASA, 105(490)." ), }, "SunAbrahamResults": { "role": "estimator", "citation": ( "Sun, L., & Abraham, S. (2021). Estimating dynamic treatment " "effects in event studies. Journal of Econometrics." ), }, "ImputationDiDResults": { "role": "estimator", "citation": ( "Borusyak, K., Jaravel, X., & Spiess, J. (2024). " "Revisiting event-study designs." ), }, "EfficientDiDResults": { "role": "estimator", "citation": ( "Chen, X., Sant'Anna, P. H. C., & Xie, H. (2025). " "Efficient Estimation of Treatment Effects in Staggered " "DiD Designs." ), }, "ChaisemartinDHaultfoeuilleResults": { "role": "estimator", "citation": ( "de Chaisemartin, C., & D'Haultfœuille, X. (2020). " "Two-way fixed effects estimators with heterogeneous " "treatment effects. American Economic Review." ), }, } if estimator_name in estimator_refs: return [estimator_refs[estimator_name]] + base return base # --------------------------------------------------------------------------- # Prose rendering # --------------------------------------------------------------------------- def _format_value(value: Optional[float], unit: Optional[str], unit_kind: str) -> str: """Format a numeric effect with its unit. No arithmetic translation.""" if value is None or not np.isfinite(value): return "undefined" if unit_kind == "currency": sign = "-" if value < 0 else "" return f"{sign}${abs(value):,.2f}" if unit_kind == "percent": return f"{value:.2f}%" if unit_kind == "percentage_points": return f"{value:.2f} pp" if unit_kind == "log_points": return f"{value:.3g} log-points" if unit_kind == "count": return f"{value:,.0f}" # unknown / free-form if unit: return f"{value:.3g} {unit}" return f"{value:.3g}" def _significance_phrase(p: Optional[float], alpha: float) -> str: """Return a plain-English significance phrase. Tiers per ``docs/methodology/REPORTING.md``: * p < 0.001: "strongly supported by the data" * 0.001 <= p < 0.01: "well-supported" * 0.01 <= p < alpha: "statistically significant at the X% level" * alpha <= p < 0.10: CI-includes-zero language * p >= 0.10: consistent-with-no-effect language """ if p is None or not np.isfinite(p): return "statistical significance cannot be assessed (p-value unavailable)" ci_level = int(round((1.0 - alpha) * 100)) if p < 0.001: return "the direction of the effect is strongly supported by the data" if p < 0.01: return "the direction of the effect is well-supported by the data" if p < alpha: return f"the effect is statistically significant at the {ci_level}% level" if p < 0.10: return ( "the confidence interval includes zero; the direction is suggestive " "but not statistically significant" ) return "the confidence interval includes zero; the data are consistent with no effect" def _smallest_failing_grid_m(sens: Dict[str, Any]) -> Optional[float]: """If the smallest evaluated M on the HonestDiD sensitivity grid already has the robust CI including zero, return that M. Returns ``None`` when the grid is missing or when the smallest evaluated point is still robust — in the latter case ``breakdown_M`` is an interpolated threshold between grid points, not a statement about the smallest grid point itself. Matches the twin helper in ``diagnostic_report.py``; keep the two in sync for cross-surface parity. """ grid_points = sens.get("grid") or [] sorted_grid = sorted( (p for p in grid_points if isinstance(p.get("M"), (int, float))), key=lambda p: p["M"], ) if not sorted_grid: return None smallest = sorted_grid[0] if not smallest.get("robust_to_zero", True): return float(smallest["M"]) return None def _sentence_first_upper(text: str) -> str: """Uppercase only the first character of ``text``, preserving all other casing. Unlike ``str.capitalize()``, which lowercases every character after the first, this keeps user-supplied abbreviations and proper nouns intact. Examples -------- >>> _sentence_first_upper("the NJ minimum-wage increase") 'The NJ minimum-wage increase' >>> _sentence_first_upper("Castle Doctrine law adoption") 'Castle Doctrine law adoption' """ if not text: return text return text[0].upper() + text[1:] def _direction_verb(effect: float, outcome_direction: Optional[str]) -> str: """Return a direction-aware verb for the headline sentence. When ``outcome_direction`` is unset we use neutral change verbs (``increased`` / ``decreased``). When it is supplied, we additionally flavor the verb with a value-laden connotation so the stakeholder can read off whether the estimated effect points in the desired direction: - ``higher_is_better``: positive effect -> "lifted"; negative -> "reduced" - ``lower_is_better``: positive effect -> "worsened"; negative -> "improved" - None: positive -> "increased"; negative -> "decreased" """ if effect == 0: return "did not change" if outcome_direction == "higher_is_better": return "lifted" if effect > 0 else "reduced" if outcome_direction == "lower_is_better": return "worsened" if effect > 0 else "improved" return "increased" if effect > 0 else "decreased" def _render_headline_sentence(schema: Dict[str, Any]) -> str: """Render the headline sentence from the schema. Uses the absolute value in the magnitude slot when the verb already conveys direction ("decreased ... by $0.14" rather than "decreased ... by -$0.14"). CI bounds are rendered at their natural signed values. When ``outcome_direction`` is supplied, the verb picks up a value-laden connotation ("lifted" / "reduced" vs neutral "increased" / "decreased"). """ ctx = schema.get("context", {}) h = schema.get("headline", {}) # PR #347 R4 P1: the dCDH ``trends_linear=True`` + ``L_max>=2`` # configuration does not produce a scalar headline by design — # ``overall_att`` is intentionally NaN (per # ``chaisemartin_dhaultfoeuille.py:2828-2834``). Render explicit # "no scalar headline by design" prose instead of routing through # the non-finite / estimation-failure path. if h.get("status") == "no_scalar_by_design": # PR #347 R13 P1: the headline-level ``reason`` field is the # single source for the no-scalar prose and is already # branched on populated-vs-empty surface in ``_build_schema``. # Use it verbatim so the headline sentence never drifts from # the schema-level message on the empty-surface subcase. treatment = ctx.get("treatment_label", "the treatment") outcome_label = ctx.get("outcome_label", "the outcome") treatment_sentence = _sentence_first_upper(treatment) reason = h.get("reason") if isinstance(reason, str) and reason: return ( f"{treatment_sentence} does not produce a scalar aggregate " f"effect on {outcome_label} under this configuration. " + reason ) return ( f"{treatment_sentence} does not produce a scalar aggregate effect " f"on {outcome_label} under this configuration (by design)." ) effect = h.get("effect") outcome = ctx.get("outcome_label", "the outcome") treatment = ctx.get("treatment_label", "the treatment") outcome_direction = ctx.get("outcome_direction") unit = h.get("unit") unit_kind = h.get("unit_kind", "unknown") if effect is None or not np.isfinite(effect): return ( f"We were unable to produce a finite estimate of {treatment}'s " f"effect on {outcome}. Inspect the data and model specification." ) verb = _direction_verb(effect, outcome_direction) magnitude = _format_value(abs(effect), unit, unit_kind) lo = h.get("ci_lower") hi = h.get("ci_upper") # Round-37 P1 CI review on PR #318: on a finite point estimate # whose CI bounds are NaN (undefined inference — survey-df # collapse, zero effective clusters, etc.), the previous isinstance # check passed because ``NaN`` is a ``float`` and the sentence # rendered ``(... 95% CI: undefined to undefined)``. Gate on # ``np.isfinite`` like DR's own headline renderer already does; # add an explicit inference-unavailable trailer instead of the # broken CI clause. ci_str = "" ci_finite = ( isinstance(lo, (int, float)) and isinstance(hi, (int, float)) and np.isfinite(lo) and np.isfinite(hi) ) if ci_finite: lo_s = _format_value(lo, unit, unit_kind) hi_s = _format_value(hi, unit, unit_kind) ci_str = f" ({h.get('ci_level', 95)}% CI: {lo_s} to {hi_s})" elif isinstance(lo, (int, float)) or isinstance(hi, (int, float)): # At least one bound was supplied but not finite -> inference # undefined. Replace the CI clause with an explicit marker so # downstream prose does not claim a confidence interval that # is not actually available. ci_str = " (inference unavailable: confidence interval is undefined for this fit)" by_clause = f" by {magnitude}" if effect != 0 else "" # Round-1 BR/DR canonical-validation (2026-04-19): Python's # ``str.capitalize()`` lowercases everything except the first # character, so ``"the NJ minimum-wage increase".capitalize()`` # returns ``"The nj minimum-wage increase"`` — flattening the # ``NJ`` abbreviation. Real canonical datasets (Card-Krueger, # Castle Doctrine) carry proper-noun / acronym tokens in the # user-supplied ``treatment_label``, so preserve user casing and # only ensure the first character is uppercase. treatment_sentence = _sentence_first_upper(treatment) return f"{treatment_sentence} {verb} {outcome}{by_clause}{ci_str}." def _render_summary(schema: Dict[str, Any]) -> str: """Render the short-form stakeholder summary paragraph.""" sentences: List[str] = [] ctx = schema.get("context", {}) question = ctx.get("business_question") if question: sentences.append(f"Question: {question}") # Headline sentence with significance phrase. sentences.append(_render_headline_sentence(schema)) # BR/DR gap #6 (target-parameter clarity): name what the headline # scalar actually represents so the stakeholder can map the number # to a specific estimand. Rendered immediately after the headline # and before the significance phrase. The summary surfaces only # the short ``name`` so the paragraph stays within the # 6-10-sentence target; ``definition`` lives in the full report # and in the structured schema for agents that want the long form. tp = schema.get("target_parameter", {}) or {} tp_name = tp.get("name") if tp_name: sentences.append(f"Target parameter: {tp_name}.") h = schema.get("headline", {}) p = h.get("p_value") alpha = ctx.get("alpha", 0.05) if p is not None and np.isfinite(p): sig = _significance_phrase(p, alpha) sentences.append(f"Statistically, {sig}.") if h.get("near_significance_threshold"): sentences.append( "The p-value is close to the conventional threshold; " "small changes to the sample could move it either way." ) # Pre-trends + power-aware phrasing. pt = schema.get("pre_trends", {}) or {} if pt.get("status") == "computed": jp = pt.get("joint_p_value") verdict = pt.get("verdict") # ``tier`` already incorporates the diagonal-fallback downgrade — # ``DiagnosticReport._check_pretrends_power`` applies it centrally # so every report surface (BR summary, BR full_report, BR schema, # DR summary) reads the same adjusted value (round-14 CI review). tier = pt.get("power_tier") method = pt.get("method") subject = _pt_method_subject(method) stat_label = _pt_method_stat_label(method) jp_phrase = ( f" ({stat_label} = {jp:.3g})" if isinstance(jp, (int, float)) and stat_label else "" ) # Only point to "the sensitivity analysis below" when a # sensitivity block actually ran. For estimators that route to # native diagnostics (SDiD / TROP) or fits where sensitivity was # skipped / not applicable, the clause would mislead (round-12 # CI review on PR #318). sens_ran = (schema.get("sensitivity", {}) or {}).get("status") == "computed" sens_tail_major = " pending the sensitivity analysis below" if sens_ran else "" sens_tail_alongside = " alongside the sensitivity analysis below" if sens_ran else "" sens_tail_see_bounded = ( " See the sensitivity analysis below for bounded-violation guarantees." if sens_ran else "" ) sens_tail_see_reliable = " See the sensitivity analysis below." if sens_ran else "" if verdict == "clear_violation": sentences.append( f"{subject} clearly reject parallel trends{jp_phrase}; the " "headline should be treated as tentative" + sens_tail_major + "." ) elif verdict == "some_evidence_against": sentences.append( f"{subject} show some evidence against parallel trends" f"{jp_phrase}; interpret the headline" + (sens_tail_alongside if sens_ran else " with caution") + "." ) elif verdict == "no_detected_violation": if tier == "well_powered": sentences.append( f"{subject} are consistent with parallel trends, and " "the test is well-powered (the max pre-period level " "deviation at the MDV is small relative to the " "estimated effect)." ) elif tier == "moderately_powered": sentences.append( f"{subject} do not reject parallel trends; the test is " "moderately informative." + sens_tail_see_bounded ) else: sentences.append( f"{subject} do not reject parallel trends, but the test " "has limited power — a non-rejection does not prove the " "assumption." + sens_tail_see_reliable ) elif verdict == "design_enforced_pt": if method == "scm_fit": sentences.append( "The synthetic control is designed to reproduce the treated " "unit's pre-period trajectory via donor weights (classic SCM's " "design-enforced analogue of parallel trends); significance " "comes from in-space placebo permutation inference, not a " "parallel-trends test." ) else: sentences.append( "The synthetic control is designed to match the treated " "group's pre-period trajectory (SDiD's weighted-parallel-" "trends analogue)." ) elif verdict == "inconclusive": # Round-35 P1 CI review on PR #318: a ``verdict=="inconclusive"`` # state means one or more pre-period coefficients had # undefined inference (zero SE, NaN p-value) and the joint # test cannot be formed. BR previously omitted the sentence # entirely, so stakeholder prose silently skipped the # identifying-assumption diagnostic. Name the state # explicitly and quote the undefined-row count when # available. n_dropped = pt.get("n_dropped_undefined") if isinstance(n_dropped, int) and n_dropped > 0: rows_word = "row" if n_dropped == 1 else "rows" sentences.append( f"The pre-trends test is inconclusive on this fit: " f"{n_dropped} pre-period {rows_word} had undefined " "inference (zero / negative SE or a non-finite " "per-period p-value), so the joint test cannot be " "formed. Treat parallel trends as unassessed rather " "than supported." ) else: sentences.append( "The pre-trends test is inconclusive on this fit: " "pre-period inference was undefined, so the joint " "test cannot be formed. Treat parallel trends as " "unassessed rather than supported." ) # Sensitivity. A ``single_M_precomputed`` sensitivity block has # ``breakdown_M=None`` by construction because only one M was evaluated; # narrate it as a point check, NOT as grid-wide robustness. sens = schema.get("sensitivity", {}) or {} if sens.get("status") == "computed": bkd = sens.get("breakdown_M") conclusion = sens.get("conclusion") if conclusion == "single_M_precomputed": grid_points = sens.get("grid") or [] point = grid_points[0] if grid_points else {} m_val = point.get("M") robust = point.get("robust_to_zero") if isinstance(m_val, (int, float)): if robust: sentences.append( f"HonestDiD (single point checked): at M = {m_val:.2g}, " f"the robust confidence interval excludes zero. This is " f"a point check, not a breakdown analysis — run " f"HonestDiD.sensitivity() across a grid of M values " f"for a full robustness claim." ) else: sentences.append( f"HonestDiD (single point checked): at M = {m_val:.2g}, " f"the robust confidence interval includes zero. Run " f"HonestDiD.sensitivity() across a grid to find the " f"breakdown value." ) elif bkd is None: sentences.append( "HonestDiD: the result remains significant across the " "full grid — robust to plausible parallel-trends violations." ) elif isinstance(bkd, (int, float)) and bkd >= 1.0: sentences.append( f"HonestDiD: the result remains significant under " f"parallel-trends violations up to {bkd:.2g}x the observed " f"pre-period variation." ) elif isinstance(bkd, (int, float)): # Round-1 BR/DR canonical-validation (2026-04-19) then # tightened per CI review on PR #341 R1: # ``breakdown_M`` is the smallest M at which the robust # CI includes zero (interpolated between grid points) — # not a claim about any specific grid point. Earlier fix # keyed off ``bkd <= 0.05`` which incorrectly asserted # "smallest grid point fails" even for grids that start # at M=0 where the smallest evaluated point is still # robust (e.g., grid=[0, 0.25, ...] with bkd=0.03). The # "smallest grid point" wording is only accurate when # the smallest evaluated M on the grid itself fails # (``robust_to_zero == False``); otherwise fall through # to the numeric multiplier. smallest_failed_m = _smallest_failing_grid_m(sens) if smallest_failed_m is not None: sentences.append( "HonestDiD: the result is fragile — the confidence " "interval includes zero even at the smallest M " f"evaluated on the sensitivity grid (M = " f"{smallest_failed_m:.2g})." ) else: sentences.append( f"HonestDiD: the result is fragile — the confidence " f"interval includes zero once violations reach {bkd:.2g}x " f"the pre-period variation." ) # Sample sentence. For fits with a dynamic comparison set (CS / # ContinuousDiD / StaggeredTripleDiff / EfficientDiD / # StackedDiD under ``clean_control in {"not_yet_treated", # "strict"}``) the fixed control count is suppressed because the # comparison group varies by cohort/sub-experiment; narrate the # mode explicitly rather than misreporting a fixed-subset tally as # "control" (rounds 13 / 17 / 18 / 22 CI review). sample = schema.get("sample", {}) or {} # ``schema["estimator"]`` is a dict with ``class_name``; unwrap it # for the per-estimator dynamic-control phrasing branch below. estimator_block = schema.get("estimator") or {} estimator = estimator_block.get("class_name") if isinstance(estimator_block, dict) else None n_obs = sample.get("n_obs") n_t = sample.get("n_treated") n_c = sample.get("n_control") n_nt = sample.get("n_never_treated") n_ne = sample.get("n_never_enabled") is_dynamic = sample.get("dynamic_control") cg = sample.get("control_group") # Panel-vs-RCS count-unit label. For repeated cross-section fits # (``panel=False`` on CallawaySantAnna), treated / never-treated # tallies are observation counts, not unit counts. Keep the # "N treated" phrasing (the N is still correct), but adjust the # never-treated clause so it does not claim "units present in # the panel" for an RCS sample. count_unit = sample.get("count_unit", "units") ne_unit_word = "observations" if count_unit == "observations" else "units" if isinstance(n_obs, int): if isinstance(n_t, int) and isinstance(n_c, int): sentences.append(f"Sample: {n_obs:,} observations ({n_t:,} treated, {n_c:,} control).") elif is_dynamic and isinstance(n_t, int): if isinstance(n_ne, int) and n_ne > 0: subset_clause = f"; {n_ne:,} never-enabled {ne_unit_word} are also present" elif isinstance(n_nt, int) and n_nt > 0: subset_clause = f"; {n_nt:,} never-treated {ne_unit_word} are also present" else: subset_clause = "" # Estimator-specific dynamic-comparison phrasing. StackedDiD # uses sub-experiment-specific clean controls (IC1/IC2 # trimming) rather than a not-yet-treated rollout; the # generic phrasing misstates the identification setup. if estimator == "StackedDiDResults": cc_label = cg if isinstance(cg, str) else "clean_control" n_distinct = sample.get("n_distinct_controls_trimmed") distinct_clause = ( f" across {n_distinct:,} distinct control units in the trimmed stack" if isinstance(n_distinct, int) else "" ) sentences.append( f"Sample: {n_obs:,} observations ({n_t:,} treated) with a " f"sub-experiment-specific clean-control comparison " f"(``clean_control='{cc_label}'``): each adoption event is " f"compared against the units satisfying the rule relative " f"to that event's window, not a single fixed control " f"group{distinct_clause}{subset_clause}." ) else: sentences.append( f"Sample: {n_obs:,} observations ({n_t:,} treated) with a " "dynamic not-yet-treated comparison group (the control set " f"varies by cohort and period){subset_clause}." ) elif ( estimator == "StaggeredTripleDiffResults" and isinstance(n_t, int) and isinstance(n_ne, int) and n_ne > 0 ): # Round-38 P2 CI review on PR #318: StaggeredTripleDiff # under fixed ``control_group="never_treated"`` had the # schema moved to ``n_never_enabled`` (round-37) but the # renderers fell through to the generic # ``Sample: N observations.`` sentence because the # ``is_dynamic_control`` branch didn't fire. REGISTRY.md # §StaggeredTripleDifference line 1730 names the # never-enabled cohort as the valid fixed comparison on # this path; the prose must say so. sentences.append( f"Sample: {n_obs:,} observations ({n_t:,} treated, " f"{n_ne:,} never-enabled)." ) else: sentences.append(f"Sample: {n_obs:,} observations.") survey = sample.get("survey") if survey and not survey.get("is_trivial"): deff = survey.get("design_effect") eff_n = survey.get("effective_n") if isinstance(deff, (int, float)) and isinstance(eff_n, (int, float)): # Round-35 P2 CI review on PR #318: ``deff < 0.95`` is a # precision-improving design (effective N is LARGER than # nominal N). Narrating that as "reduces effective sample # size" is directionally wrong. Branch on the sign of # the departure from 1. if deff < 1.0: sentences.append( f"Survey design improves effective sample size to " f"~{eff_n:,.0f} (DEFF = {deff:.2g})." ) else: sentences.append( f"Survey design reduces effective sample size to " f"~{eff_n:,.0f} (DEFF = {deff:.2g})." ) # Highest-severity caveat (if any). caveats = schema.get("caveats", []) warning_caveats = [c for c in caveats if c.get("severity") == "warning"] if warning_caveats: top = warning_caveats[0] sentences.append(f"Caveat: {top.get('message')}") return " ".join(s for s in sentences if s) def _render_full_report(schema: Dict[str, Any]) -> str: """Render the structured multi-section markdown report.""" ctx = schema.get("context", {}) h = schema.get("headline", {}) sample = schema.get("sample", {}) pt = schema.get("pre_trends", {}) or {} sens = schema.get("sensitivity", {}) or {} assumption = schema.get("assumption", {}) het = schema.get("heterogeneity") caveats = schema.get("caveats", []) references = schema.get("references", []) next_steps = schema.get("next_steps", []) lines: List[str] = [] lines.append(f"# Business Report: {ctx.get('outcome_label', 'Outcome')}") lines.append("") if ctx.get("business_question"): lines.append(f"**Question**: {ctx['business_question']}") lines.append("") lines.append(f"**Estimator**: `{schema.get('estimator', {}).get('class_name')}`") lines.append("") # Headline lines.append("## Headline") lines.append("") lines.append(_render_headline_sentence(schema)) p = h.get("p_value") alpha = ctx.get("alpha", 0.05) if isinstance(p, (int, float)): lines.append("") lines.append(f"Statistically, {_significance_phrase(p, alpha)}.") lines.append("") # Target parameter (BR/DR gap #6): name what the headline scalar # represents so the stakeholder can map the number to a specific # estimand. Rendered between "Headline" and "Identifying Assumption" # because the target parameter is about what the scalar IS, whereas # identifying assumption is about what makes it valid. tp = schema.get("target_parameter", {}) or {} if tp.get("name") or tp.get("definition"): lines.append("## Target Parameter") lines.append("") if tp.get("name"): lines.append(f"- **{tp['name']}**") if tp.get("definition"): lines.append(f"- {tp['definition']}") lines.append("") # Identifying assumption lines.append("## Identifying Assumption") lines.append("") lines.append(assumption.get("description", "") or "Standard DiD parallel-trends assumption.") lines.append("") # Pre-trends lines.append("## Pre-Trends") lines.append("") if pt.get("status") == "computed": jp = pt.get("joint_p_value") verdict = pt.get("verdict") tier = pt.get("power_tier") # Use the method-aware statistic label the summary path already # uses: "joint p" for Wald / Bonferroni event-study, "p" for # slope-difference / Hausman single-statistic tests, and None # for design-enforced SDiD / TROP paths where there is no # p-value at all. Round-25 P2 CI review on PR #318 flagged the # hard-coded "joint p" wording as misdescribing 2x2 / Hausman # fits and inventing a nonexistent p-value for SDiD / TROP. method = pt.get("method") stat_label = _pt_method_stat_label(method) if stat_label and isinstance(jp, (int, float)): lines.append(f"- Verdict: `{verdict}` ({stat_label} = {jp:.3g})") elif stat_label: lines.append(f"- Verdict: `{verdict}` ({stat_label} unavailable)") else: lines.append(f"- Verdict: `{verdict}`") if tier: lines.append(f"- Power tier: `{tier}`") mdv = pt.get("mdv") max_abs_pre = pt.get("max_abs_pre_violation") ratio = pt.get("mdv_share_of_att") if isinstance(mdv, (int, float)): lines.append(f"- Minimum detectable violation (MDV): {mdv:.3g}") if isinstance(max_abs_pre, (int, float)): lines.append(f"- Max pre-period level deviation at MDV: {max_abs_pre:.3g}") if isinstance(ratio, (int, float)): # PR-B R12: ratio is now max_abs_pre_violation / |ATT|, the # level-scale comparable to ATT (not raw γ-unit mdv on linear # fits). Label updated to match the numerator definition in # REPORTING.md "Power-aware phrasing" Note. lines.append(f"- Max pre-period level deviation / |ATT|: {ratio:.2g}") else: lines.append(f"- Pre-trends not computed: {pt.get('reason', 'unavailable')}") lines.append("") # Sensitivity. A single-M HonestDiDResults passthrough has # breakdown_M=None by construction because only one M was evaluated; # the "robust across full grid" phrasing is reserved for genuine # grid-over-M SensitivityResults. lines.append("## Sensitivity (HonestDiD)") lines.append("") if sens.get("status") == "computed": bkd = sens.get("breakdown_M") concl = sens.get("conclusion") lines.append(f"- Method: `{sens.get('method')}`") if concl == "single_M_precomputed": grid_points = sens.get("grid") or [] point = grid_points[0] if grid_points else {} m_val = point.get("M") robust = point.get("robust_to_zero") if isinstance(m_val, (int, float)): lines.append(f"- Single point checked: M = {m_val:.3g}") lines.append( f"- Robust CI at M = {m_val:.3g}: " f"{'excludes zero' if robust else 'includes zero'}" ) lines.append( "- Run `HonestDiD.sensitivity()` across a grid of M " "values to find the breakdown value." ) else: lines.append("- Single-M passthrough (breakdown not available)") elif isinstance(bkd, (int, float)): lines.append(f"- Breakdown M: {bkd:.3g}") else: lines.append("- Breakdown M: robust across full grid (no breakdown)") lines.append(f"- Conclusion: `{concl}`") else: lines.append(f"- Sensitivity not computed: {sens.get('reason', 'unavailable')}") lines.append("") # Sample lines.append("## Sample") lines.append("") if isinstance(sample.get("n_obs"), int): lines.append(f"- Observations: {sample['n_obs']:,}") if isinstance(sample.get("n_treated"), int): lines.append(f"- Treated: {sample['n_treated']:,}") # ``n_control`` is only populated for estimators whose control set # is a fixed tally. For dynamic modes (CS / ContinuousDiD / # StaggeredTripleDiff / EfficientDiD / StackedDiD under # ``clean_control in {"not_yet_treated", "strict"}``) the comparison # group is dynamic per cohort/sub-experiment; report the estimator- # specific fixed subset (``n_never_enabled`` for triple-difference; # ``n_never_treated`` elsewhere; ``n_distinct_controls_trimmed`` for # Stacked) when available, then name the dynamic-comparison mode # explicitly. estimator_block = schema.get("estimator") or {} estimator_name = ( estimator_block.get("class_name") if isinstance(estimator_block, dict) else None ) cg = sample.get("control_group") # Panel-vs-RCS count-unit label for the full report. Mirrors the # summary path: CallawaySantAnna's ``panel=False`` mode stores # counts as observations, not units (round-28 P2). md_count_unit = sample.get("count_unit", "units") md_ne_unit_word = "observations" if md_count_unit == "observations" else "units" md_sample_location = ( "in the repeated cross-section sample" if md_count_unit == "observations" else "in the panel" ) if isinstance(sample.get("n_control"), int): lines.append(f"- Control: {sample['n_control']:,}") elif ( estimator_name == "StaggeredTripleDiffResults" and isinstance(sample.get("n_never_enabled"), int) and sample["n_never_enabled"] > 0 and not sample.get("dynamic_control") ): # Round-38 P2 CI review on PR #318: fixed # ``control_group="never_treated"`` on StaggeredTripleDiff # clears ``n_control`` (composite total) and populates # ``n_never_enabled`` (the valid fixed comparison cohort per # REGISTRY.md line 1730). The full report must render that # fixed count — the dynamic-control branch below would not # fire on this path. lines.append( f"- Never-enabled units (fixed comparison cohort): " f"{sample['n_never_enabled']:,}" ) elif sample.get("dynamic_control"): if isinstance(sample.get("n_never_enabled"), int) and sample["n_never_enabled"] > 0: lines.append( f"- Never-enabled {md_ne_unit_word} present " f"{md_sample_location}: {sample['n_never_enabled']:,}" ) elif isinstance(sample.get("n_never_treated"), int) and sample["n_never_treated"] > 0: lines.append( f"- Never-treated {md_ne_unit_word} present " f"{md_sample_location}: {sample['n_never_treated']:,}" ) if estimator_name == "StackedDiDResults": n_distinct = sample.get("n_distinct_controls_trimmed") if isinstance(n_distinct, int): lines.append(f"- Distinct control units in trimmed stack: {n_distinct:,}") cc_label = cg if isinstance(cg, str) else "clean_control" lines.append( f"- Comparison group: sub-experiment-specific clean controls " f"(``clean_control='{cc_label}'``; each adoption event is " "compared against units satisfying the rule relative to that " "event's window, not a single fixed control group)" ) else: lines.append( "- Comparison group: dynamic not-yet-treated units " "(varies by cohort and period; no fixed control count)" ) survey = sample.get("survey") if survey: if survey.get("is_trivial"): lines.append("- Survey design: trivial DEFF (~1.0)") else: deff = survey.get("design_effect") eff_n = survey.get("effective_n") if isinstance(deff, (int, float)): lines.append(f"- Survey DEFF: {deff:.2g}") if isinstance(eff_n, (int, float)): lines.append(f"- Effective N: {eff_n:,.0f}") lines.append("") # Heterogeneity — only render the populated section when the check # actually ran. Round-32 P2 CI review on PR #318: round-31 changed # ``_lift_heterogeneity`` to always return a dict (stable schema # contract), but the renderer's ``if het:`` truthiness guard then # entered the block on every fit and printed ``Source: None``, # ``N effects: None``, etc. Gate on the ``status`` enum instead. if isinstance(het, dict) and het.get("status") == "ran": lines.append("## Heterogeneity") lines.append("") lines.append(f"- Source: `{het.get('source')}`") lines.append(f"- N effects: {het.get('n_effects')}") mn = het.get("min") mx = het.get("max") if isinstance(mn, (int, float)) and isinstance(mx, (int, float)): lines.append(f"- Range: {mn:.3g} to {mx:.3g}") cv = het.get("cv") if isinstance(cv, (int, float)): lines.append(f"- CV: {cv:.3g}") lines.append(f"- Sign consistent: {het.get('sign_consistent')}") lines.append("") # Caveats if caveats: lines.append("## Caveats") lines.append("") for c in caveats: sev = c.get("severity", "info") lines.append(f"- **{sev.upper()}** — {c.get('message')}") lines.append("") # Next steps if next_steps: lines.append("## Next Steps") lines.append("") for s in next_steps: if s.get("label"): lines.append(f"- {s['label']}") if s.get("why"): lines.append(f" - _why_: {s['why']}") lines.append("") # References if references: lines.append("## References") lines.append("") for ref in references: lines.append(f"- {ref.get('citation')}") lines.append("") return "\n".join(lines)