"""
BusinessReport — plain-English stakeholder narrative from any diff-diff result.
Wraps any of the 16 fitted result types and produces:
- ``summary()``: a short paragraph block suitable for an email or Slack message.
- ``full_report()``: a multi-section markdown report with headline, assumptions,
pre-trends, main result, robustness, sample, and an optional academic appendix.
- ``to_dict()``: a stable AI-legible structured schema (single source of truth —
prose is rendered from this dict, not templated alongside it).
Design principles:
- Plain English, not academic jargon. The library ships this in addition to, not
in place of, the estimator's existing ``results.summary()`` academic output.
- No estimator fitting and no variance re-derivation. Every effect, SE, p-value,
CI, and sensitivity bound is either read from ``results`` or produced by an
existing diff-diff utility. The report layer does compose a few cross-period
summaries from per-period inputs already on the result (joint-Wald / Bonferroni
pre-trends p-value, MDV-to-ATT ratio, heterogeneity dispersion over
post-treatment effects); see ``docs/methodology/REPORTING.md`` for the full
enumeration.
- Optional business context via keyword args (``outcome_label``, ``outcome_unit``,
``business_question``, ``treatment_label``). Without them, BusinessReport uses
generic fallbacks — the zero-config path works.
- Diagnostic integration is implicit by default: ``BusinessReport(results)``
auto-constructs a ``DiagnosticReport`` so the summary can mention pre-trends,
robustness, and design-effect findings. Pass ``auto_diagnostics=False`` or an
explicit ``diagnostics=`` object to override.
Methodology deviations (no traffic-light gates, pre-trends verdict thresholds,
power-aware phrasing, unit-translation policy, schema stability) are documented
in ``docs/methodology/REPORTING.md``. The ``to_dict()`` schema is marked
experimental in v3.2.
"""
from __future__ import annotations
import re
from dataclasses import dataclass
from typing import Any, Dict, FrozenSet, List, Optional, Union
import numpy as np
from diff_diff._reporting_helpers import describe_target_parameter
from diff_diff.diagnostic_report import DiagnosticReport, DiagnosticReportResults
# Version tag for the structured schema. ``BusinessReport._build_schema`` emits
# it verbatim under the top-level ``"schema_version"`` key of ``to_dict()``.
BUSINESS_REPORT_SCHEMA_VERSION = "2.0"
# Public names exported by ``from <this module> import *``.
__all__ = [
    "BusinessReport",
    "BusinessContext",
    "BUSINESS_REPORT_SCHEMA_VERSION",
]
# Recognized ``outcome_unit`` values mapped to a coarse "kind" used by the
# formatter. Unrecognized strings are accepted and rendered verbatim without
# arithmetic translation (``unit_kind = "unknown"``).
#
# Lookups lower-case the caller-supplied unit before calling ``.get`` on this
# table, so every key here must itself be lower-case. Several spellings may
# map to the same kind (e.g. ``"$"`` and ``"usd"`` are both ``"currency"``).
_UNIT_KINDS: Dict[str, str] = {
    "$": "currency",
    "usd": "currency",
    "%": "percent",
    "pp": "percentage_points",
    "percentage_points": "percentage_points",
    "percent": "percent",
    "log_points": "log_points",
    "log": "log_points",
    "count": "count",
    "users": "count",
}
@dataclass(frozen=True)
class BusinessContext:
    """Frozen bundle of business-framing metadata used when rendering prose.

    Populated from ``BusinessReport`` constructor kwargs. Falls back to
    neutral labels when fields are not supplied.

    Instances are immutable (``frozen=True``): assigning to a field after
    construction raises ``dataclasses.FrozenInstanceError``.
    """

    # Stakeholder-friendly outcome name; ``BusinessReport`` substitutes
    # "the outcome" when the caller supplies none.
    outcome_label: str
    # Unit label such as "$" / "%" / "pp", or any free-form string; ``None``
    # when the caller did not specify one.
    outcome_unit: Optional[str]
    # "higher_is_better" / "lower_is_better" per the BusinessReport docs, or
    # ``None``. Stored as given; no validation happens in this class.
    outcome_direction: Optional[str]
    # Question the analysis answers, or ``None``.
    business_question: Optional[str]
    # Stakeholder-friendly treatment name; ``BusinessReport`` substitutes
    # "the treatment" when the caller supplies none.
    treatment_label: str
    # Significance level used for CI labelling and significance phrasing.
    alpha: float
class BusinessReport:
    """Produce a stakeholder-ready narrative from any diff-diff results object.

    Parameters
    ----------
    results : Any
        A fitted diff-diff results object. Any of the 16 result types is
        accepted. ``BaconDecompositionResults`` is not a valid input — Bacon
        is a diagnostic, not an estimator; use ``DiagnosticReport`` for that.
    outcome_label : str, optional
        Stakeholder-friendly outcome name (e.g. ``"Revenue per user"``).
    outcome_unit : str, optional
        Unit label: ``"$"`` / ``"%"`` / ``"pp"`` / ``"log_points"`` / ``"count"``
        (recognized for formatting) or any free-form string (used verbatim
        without arithmetic translation).
    outcome_direction : str, optional
        ``"higher_is_better"`` or ``"lower_is_better"``. Drives whether the
        effect is described as "lift" / "drag" rather than just "increase" /
        "decrease".
    business_question : str, optional
        Question the analysis answers (prepended to the summary).
    treatment_label : str, optional
        Stakeholder-friendly treatment name (e.g. ``"the campaign"``).
    alpha : float, optional
        Significance level. Defaults to ``results.alpha`` when not supplied
        (and to 0.05 when the result exposes no usable ``alpha``).
        Single knob: drives both CI level and significance phrasing.
    honest_did_results : HonestDiDResults or SensitivityResults, optional
        Pre-computed sensitivity result. When supplied, this is forwarded to
        the internal ``DiagnosticReport`` so sensitivity is not re-computed.
    auto_diagnostics : bool, default True
        When ``True`` and ``diagnostics`` is ``None``, auto-construct a
        ``DiagnosticReport``. Set ``False`` to skip diagnostics entirely.
    diagnostics : DiagnosticReport or DiagnosticReportResults, optional
        Explicit diagnostics object. Takes precedence over ``auto_diagnostics``.
    include_appendix : bool, default True
        Whether ``full_report()`` appends the estimator's academic
        ``results.summary()`` output under a "Technical Appendix" section.
    data, outcome, treatment, unit, time, first_treat : optional
        Raw panel + column names forwarded to the auto-constructed
        ``DiagnosticReport`` so data-dependent checks (2x2 PT on simple
        DiD, Bacon-from-scratch, EfficientDiD Hausman pretest) can run.
    survey_design : SurveyDesign, optional
        The ``SurveyDesign`` object used to fit a survey-weighted
        estimator. Forwarded to the auto-constructed ``DiagnosticReport``
        for fit-faithful Goodman-Bacon replay. When the fit carries
        ``survey_metadata`` but ``survey_design`` is not supplied, Bacon
        is skipped with an explicit reason rather than replaying an
        unweighted decomposition for a design that does not match the
        estimate. The simple 2x2 parallel-trends helper
        (``utils.check_parallel_trends``) has no survey-aware variant;
        on a survey-backed ``DiDResults`` it is skipped unconditionally
        regardless of ``survey_design``. Supply
        ``precomputed={'parallel_trends': ...}`` with a survey-aware
        pretest to opt in. See ``docs/methodology/REPORTING.md``.
    precomputed : dict, optional
        Pre-computed diagnostic objects forwarded to the auto-
        constructed ``DiagnosticReport`` (same keys as
        ``DiagnosticReport(precomputed=...)``): ``"parallel_trends"``,
        ``"sensitivity"``, ``"pretrends_power"``, ``"bacon"``. DR
        validates keys and rejects estimator-incompatible entries
        (e.g., HonestDiD bounds or generic PT on SDiD / TROP).
        ``honest_did_results`` remains a shorthand for ``sensitivity``;
        an explicit ``precomputed['sensitivity']`` wins on conflict.
    """

    def __init__(
        self,
        results: Any,
        *,
        outcome_label: Optional[str] = None,
        outcome_unit: Optional[str] = None,
        outcome_direction: Optional[str] = None,
        business_question: Optional[str] = None,
        treatment_label: Optional[str] = None,
        alpha: Optional[float] = None,
        honest_did_results: Optional[Any] = None,
        auto_diagnostics: bool = True,
        diagnostics: Optional[Union[DiagnosticReport, DiagnosticReportResults]] = None,
        include_appendix: bool = True,
        data: Optional[Any] = None,
        outcome: Optional[str] = None,
        treatment: Optional[str] = None,
        unit: Optional[str] = None,
        time: Optional[str] = None,
        first_treat: Optional[str] = None,
        survey_design: Optional[Any] = None,
        precomputed: Optional[Dict[str, Any]] = None,
    ):
        """Validate the inputs and store them; the schema itself is built lazily.

        Raises
        ------
        TypeError
            If ``results`` is a ``BaconDecompositionResults`` or if
            ``diagnostics`` is neither a ``DiagnosticReport`` nor a
            ``DiagnosticReportResults``.
        ValueError
            If a sensitivity passthrough is supplied for an estimator that
            cannot accept it, or if ``precomputed`` contains unsupported keys.
        """
        results_name = type(results).__name__
        if results_name == "BaconDecompositionResults":
            raise TypeError(
                "BaconDecompositionResults is a diagnostic, not an estimator; "
                "wrap the underlying estimator with BusinessReport and pass the "
                "Bacon object to DiagnosticReport(precomputed={'bacon': ...})."
            )
        if diagnostics is not None and not isinstance(
            diagnostics, (DiagnosticReport, DiagnosticReportResults)
        ):
            raise TypeError(
                "diagnostics= must be a DiagnosticReport or "
                "DiagnosticReportResults instance; "
                f"got {type(diagnostics).__name__}."
            )

        # Estimator-aware validation for ``honest_did_results``. These
        # estimators route robustness to ``estimator_native_diagnostics`` and
        # do not accept HonestDiD bounds, which are methodology-incompatible
        # with the native-routing contract documented in REPORTING.md. Reject
        # the passthrough here so it is not silently forwarded to the
        # auto-constructed ``DiagnosticReport`` (which also rejects it at
        # construction time — round-21 P1 CI review on PR #318).
        if honest_did_results is not None and results_name in {
            "SyntheticDiDResults",
            "TROPResults",
            "SyntheticControlResults",
        }:
            raise ValueError(
                f"{results_name} routes robustness to "
                "``estimator_native_diagnostics`` — ``honest_did_results`` "
                "is not accepted on this estimator because HonestDiD "
                "bounds are methodology-incompatible with the native "
                "routing documented in REPORTING.md. Use the result "
                "object's native diagnostics "
                "(SDiD: ``in_time_placebo()``, ``sensitivity_to_zeta_omega()``, "
                "``pre_treatment_fit``; TROP: ``effective_rank``, "
                "``loocv_score``; SyntheticControl: ``in_space_placebo()``, "
                "``pre_rmspe``, ``get_placebo_df()``) — BusinessReport surfaces "
                "these automatically under ``estimator_native_diagnostics``."
            )

        # Round-44 P1 CI review on PR #318: mirror the rejection pattern above
        # for ``CallawaySantAnna`` fits with ``base_period != "universal"``.
        # HonestDiD Rambachan-Roth bounds are not valid for interpretation on
        # the consecutive-comparison pre-period surface produced by a
        # ``varying`` base, so narrating precomputed sensitivity (whether
        # passed as ``honest_did_results`` or ``precomputed['sensitivity']``)
        # alongside such a fit mixes provenance the bounds don't support.
        # DR enforces the same guard; BR duplicates it so the error fires
        # before the auto-DR is built. REGISTRY.md §CallawaySantAnna line 410,
        # §HonestDiD line 2458.
        is_callaway_santanna = results_name == "CallawaySantAnnaResults"
        base_period = (
            getattr(results, "base_period", "universal") if is_callaway_santanna else "universal"
        )
        if is_callaway_santanna and base_period != "universal":
            _rejected_inputs: List[str] = []
            if honest_did_results is not None:
                _rejected_inputs.append("honest_did_results")
            if precomputed is not None and "sensitivity" in precomputed:
                _rejected_inputs.append("precomputed['sensitivity']")
            if _rejected_inputs:
                raise ValueError(
                    f"CallawaySantAnnaResults with "
                    f"``base_period={base_period!r}`` cannot be "
                    "summarized alongside a precomputed HonestDiD "
                    "sensitivity object. The Rambachan-Roth bounds are "
                    "not valid for interpretation on the consecutive-"
                    "comparison pre-period surface this base yields "
                    "(REGISTRY.md §CallawaySantAnna / §HonestDiD). "
                    "Rejected inputs: " + ", ".join(_rejected_inputs) + ". "
                    "Re-fit the main estimator with "
                    "``CallawaySantAnna(base_period='universal')`` "
                    "before passing precomputed sensitivity, or drop "
                    "the sensitivity passthrough to let BR skip the "
                    "section with a methodology-critical reason."
                )

        self._results = results
        self._honest_did_results = honest_did_results
        self._auto_diagnostics = auto_diagnostics
        self._diagnostics_arg = diagnostics
        self._include_appendix = include_appendix

        # Raw-data passthrough so the auto-constructed DR can run
        # data-dependent checks (2x2 PT on simple DiD, Bacon-from-scratch on
        # staggered estimators, EfficientDiD Hausman pretest). Without these
        # the auto path silently skips those checks (round-12 CI review on
        # PR #318).
        self._dr_data = data
        self._dr_outcome = outcome
        self._dr_treatment = treatment
        self._dr_unit = unit
        self._dr_time = time
        self._dr_first_treat = first_treat

        # Round-40 P1 CI review on PR #318: survey-backed fits need the
        # ``SurveyDesign`` threaded through to the auto-constructed DR so the
        # Bacon decomposition is fit-faithful and the 2x2 PT skip path
        # triggers for DiDResults with ``survey_metadata``.
        self._dr_survey_design = survey_design

        # Round-43 P2 CI review on PR #318: accept ``precomputed=`` and
        # forward every key to the auto-constructed DR (which owns validation
        # against estimator-aware rejection rules). ``honest_did_results``
        # still feeds ``sensitivity`` as a convenience; an explicit
        # ``precomputed['sensitivity']`` wins on conflict. A copy is taken so
        # the caller's dict is never mutated.
        self._dr_precomputed: Dict[str, Any] = dict(precomputed or {})

        # Mirror DR's eager key validation so users get the "unsupported key"
        # error at BR construction rather than lazily inside ``to_dict()``.
        # Kept in sync by hand with ``DiagnosticReport``'s
        # ``_supported_precomputed`` set, which DR scopes locally to its
        # ``__init__`` and therefore cannot be imported.
        _br_supported_precomputed = {
            "parallel_trends",
            "sensitivity",
            "pretrends_power",
            "bacon",
        }
        _br_unsupported = set(self._dr_precomputed) - _br_supported_precomputed
        if _br_unsupported:
            raise ValueError(
                "precomputed= contains keys that are not implemented: "
                f"{sorted(_br_unsupported)}. Supported keys: "
                f"{sorted(_br_supported_precomputed)}. ``design_effect``, "
                "``heterogeneity``, and ``epv`` are read directly from the "
                "fitted result and do not accept precomputed overrides."
            )

        self._context = BusinessContext(
            outcome_label=outcome_label or "the outcome",
            outcome_unit=outcome_unit,
            outcome_direction=outcome_direction,
            business_question=business_question,
            treatment_label=treatment_label or "the treatment",
            alpha=self._resolve_alpha(alpha, results),
        )
        self._cached_schema: Optional[Dict[str, Any]] = None

    @staticmethod
    def _resolve_alpha(alpha: Optional[float], results: Any) -> float:
        """Pick the significance level: explicit arg, then ``results.alpha``, then 0.05.

        A result object whose ``alpha`` attribute exists but is ``None`` is
        treated the same as one with no ``alpha`` attribute at all, instead
        of crashing on ``float(None)``.
        """
        if alpha is None:
            alpha = getattr(results, "alpha", None)
        return 0.05 if alpha is None else float(alpha)

    # -- Public API ---------------------------------------------------------

    def to_dict(self) -> Dict[str, Any]:
        """Return the AI-legible structured schema (single source of truth).

        The schema is built on first access and cached; later calls return
        the same dict object.
        """
        if self._cached_schema is None:
            self._cached_schema = self._build_schema()
        return self._cached_schema

    def to_json(self, *, indent: int = 2) -> str:
        """Return ``to_dict()`` serialized as JSON."""
        import json

        return json.dumps(self.to_dict(), indent=indent)

    def summary(self) -> str:
        """Return a short plain-English paragraph block (6-10 sentences)."""
        return _render_summary(self.to_dict())

    def full_report(self) -> str:
        """Return a structured multi-section markdown report.

        When ``include_appendix`` was left enabled, the estimator's own
        ``results.summary()`` text is appended in a fenced "Technical
        Appendix" section.
        """
        base = _render_full_report(self.to_dict())
        if self._include_appendix:
            # Best effort: a result whose ``summary()`` is missing or raises
            # simply yields a report without the appendix.
            try:
                appendix = self._results.summary()
            except Exception:  # noqa: BLE001
                appendix = None
            if appendix:
                base = base + "\n\n## Technical Appendix\n\n```\n" + str(appendix) + "\n```\n"
        return base

    def export_markdown(self) -> str:
        """Alias for ``full_report()`` (discoverability)."""
        return self.full_report()

    def headline(self) -> str:
        """Return just the headline sentence."""
        return _render_headline_sentence(self.to_dict())

    def caveats(self) -> List[Dict[str, str]]:
        """Return the list of structured caveats (severity + topic + message).

        The returned list is a shallow copy of the schema's ``caveats`` entry.
        """
        return list(self.to_dict().get("caveats", []))

    def __repr__(self) -> str:
        estimator = type(self._results).__name__
        headline = self.to_dict().get("headline") or {}
        val = headline.get("effect")
        # Only show the effect when it is a finite real number.
        if isinstance(val, (int, float)) and np.isfinite(val):
            return f"BusinessReport(results={estimator}, effect={val:.3g})"
        return f"BusinessReport(results={estimator})"

    def __str__(self) -> str:
        return self.summary()

    # -- Implementation detail ---------------------------------------------

    def _unit_kind(self) -> str:
        """Map the configured ``outcome_unit`` to its coarse kind ("unknown" if unrecognized)."""
        unit = self._context.outcome_unit
        return _UNIT_KINDS.get(unit.lower() if unit else "", "unknown")

    def _resolve_diagnostics(self) -> Optional[DiagnosticReportResults]:
        """Return the DiagnosticReportResults to embed, or ``None`` if skipped."""
        if self._diagnostics_arg is not None:
            if isinstance(self._diagnostics_arg, DiagnosticReportResults):
                return self._diagnostics_arg
            if isinstance(self._diagnostics_arg, DiagnosticReport):
                return self._diagnostics_arg.run_all()
            raise TypeError("diagnostics= must be a DiagnosticReport or DiagnosticReportResults")
        if not self._auto_diagnostics:
            return None

        # Round-43 P2 CI review on PR #318: forward the user's ``precomputed``
        # dict through to DR. ``honest_did_results`` stays a convenience
        # shortcut for ``sensitivity`` only; ``setdefault`` lets an explicit
        # ``precomputed['sensitivity']`` from the caller win. DR handles
        # estimator-aware validation, so BR just merges and forwards.
        precomputed: Dict[str, Any] = dict(self._dr_precomputed)
        if self._honest_did_results is not None:
            precomputed.setdefault("sensitivity", self._honest_did_results)
        dr = DiagnosticReport(
            self._results,
            alpha=self._context.alpha,
            precomputed=precomputed or None,
            outcome_label=self._context.outcome_label,
            treatment_label=self._context.treatment_label,
            data=self._dr_data,
            outcome=self._dr_outcome,
            treatment=self._dr_treatment,
            unit=self._dr_unit,
            time=self._dr_time,
            first_treat=self._dr_first_treat,
            survey_design=self._dr_survey_design,
        )
        return dr.run_all()

    def _build_schema(self) -> Dict[str, Any]:
        """Assemble the structured schema.

        Pulls validation content (PT, sensitivity, Bacon, DEFF, EPV, ...) from
        the internal ``DiagnosticReport``; extracts the stakeholder-facing
        headline and sample metadata from the fitted result itself.
        """
        estimator_name = type(self._results).__name__
        diagnostics_results = self._resolve_diagnostics()
        dr_schema: Optional[Dict[str, Any]] = (
            diagnostics_results.schema if diagnostics_results is not None else None
        )

        # PR #347 R4 P1: compute target_parameter BEFORE extracting the
        # headline so the no-scalar-by-design case
        # (``aggregation == "no_scalar_headline"``, e.g., dCDH
        # ``trends_linear=True`` with ``L_max >= 2``) can route the headline
        # through a dedicated branch that names the intentional NaN rather
        # than an estimation-failure path.
        target_parameter = describe_target_parameter(self._results)
        if target_parameter.get("aggregation") == "no_scalar_headline":
            # PR #347 R12 P1: distinguish the populated-surface case
            # (per-horizon table exists) from the empty-surface subcase
            # (``linear_trends_effects=None`` — no horizons survived
            # estimation). Telling a user with an empty surface to "see
            # linear_trends_effects" is dead-end guidance.
            _surface_empty = getattr(self._results, "linear_trends_effects", None) is None
            # PR #347 R14 P1: the empty-surface reason must use the
            # covariate-adjusted label when covariates are active.
            _has_controls = getattr(self._results, "covariate_residuals", None) is not None
            _empty_surface_label = "DID^{X,fd}_l" if _has_controls else "DID^{fd}_l"
            if _surface_empty:
                no_scalar_reason = (
                    "The fitted estimator intentionally does not produce a "
                    "scalar overall ATT on this configuration "
                    "(``trends_linear=True`` with ``L_max >= 2``), and on "
                    f"this fit no cumulated level effects ``{_empty_surface_label}`` "
                    "survived estimation — the per-horizon surface is "
                    "empty. Re-fit with a larger ``L_max`` or with "
                    "``trends_linear=False`` if you need a reportable "
                    "estimand."
                )
            else:
                no_scalar_reason = (
                    "The fitted estimator intentionally does not produce a "
                    "scalar overall ATT on this configuration "
                    "(``trends_linear=True`` with ``L_max >= 2``). Per-horizon "
                    "cumulated level effects are on "
                    "``results.linear_trends_effects[l]``."
                )
            headline = {
                "status": "no_scalar_by_design",
                "effect": None,
                "se": None,
                "ci_lower": None,
                "ci_upper": None,
                "alpha_was_honored": True,
                "alpha_override_caveat": None,
                "ci_level": int(round((1.0 - self._context.alpha) * 100)),
                "p_value": None,
                "is_significant": False,
                "near_significance_threshold": False,
                "unit": self._context.outcome_unit,
                "unit_kind": self._unit_kind(),
                "sign": "none",
                "breakdown_M": None,
                "reason": no_scalar_reason,
            }
        else:
            headline = self._extract_headline(dr_schema)

        sample = self._extract_sample()
        heterogeneity = _lift_heterogeneity(dr_schema)
        pre_trends = _lift_pre_trends(dr_schema)
        sensitivity = _lift_sensitivity(dr_schema)
        robustness = _lift_robustness(dr_schema)
        assumption = _apply_anticipation_to_assumption(
            _describe_assumption(estimator_name, self._results),
            self._results,
        )
        next_steps = (dr_schema or {}).get("next_steps", [])
        caveats = _build_caveats(self._results, headline, sample, dr_schema)
        references = _references_for(estimator_name)

        if diagnostics_results is None:
            diagnostics_block: Dict[str, Any] = {
                "status": "skipped",
                "reason": "auto_diagnostics=False",
            }
        else:
            diagnostics_block = {
                "status": "ran",
                "schema": dr_schema,
                "overall_interpretation": (
                    dr_schema.get("overall_interpretation", "") if dr_schema is not None else ""
                ),
            }

        return {
            "schema_version": BUSINESS_REPORT_SCHEMA_VERSION,
            "estimator": {
                "class_name": estimator_name,
                "display_name": estimator_name,
            },
            "context": {
                "outcome_label": self._context.outcome_label,
                "outcome_unit": self._context.outcome_unit,
                "outcome_direction": self._context.outcome_direction,
                "business_question": self._context.business_question,
                "treatment_label": self._context.treatment_label,
                "alpha": self._context.alpha,
            },
            "headline": headline,
            "target_parameter": target_parameter,
            "assumption": assumption,
            "pre_trends": pre_trends,
            "sensitivity": sensitivity,
            "sample": sample,
            "heterogeneity": heterogeneity,
            "robustness": robustness,
            "diagnostics": diagnostics_block,
            "next_steps": next_steps,
            "caveats": caveats,
            "references": references,
        }

    def _extract_headline(self, dr_schema: Optional[Dict[str, Any]]) -> Dict[str, Any]:
        """Extract the headline effect + CI + p-value from the result."""
        r = self._results
        # Delegate the attribute-alias lookup to the shared helper in the
        # diagnostic_report module so BR and DR agree on which fields a
        # result class exposes for its headline (including
        # ``ContinuousDiDResults`` which uses ``overall_att_se`` /
        # ``overall_att_p_value`` / ``overall_att_conf_int``).
        from diff_diff.diagnostic_report import _extract_scalar_headline

        extracted = _extract_scalar_headline(r, fallback_alpha=self._context.alpha)
        att: Optional[float] = None
        se: Optional[float] = None
        p: Optional[float] = None
        ci: Optional[List[float]] = None
        alpha = self._context.alpha
        result_alpha: Optional[float] = None
        if extracted is not None:
            _name, att, se, p, ci, result_alpha = extracted

        # On any alpha mismatch, preserve the fitted CI at its native level.
        # A faithful CI cannot be recomputed from point estimate and SE alone
        # without reproducing the fit's inference contract (finite-df
        # t-quantile, percentile bootstrap, wild cluster bootstrap, survey
        # replicate quantile, rank-deficient undefined-df, etc.), and the 16
        # result classes do not expose a uniform descriptor for that. Two
        # separate alpha values: ``display_alpha`` drives ``ci_level`` so the
        # displayed CI label matches the preserved bounds; the caller's
        # requested alpha drives the significance phrasing
        # (``is_significant`` / ``near_threshold``). A caveat records the
        # override.
        display_alpha = alpha
        phrasing_alpha = alpha
        alpha_was_honored = True
        alpha_override_caveat: Optional[str] = None
        if (
            result_alpha is not None
            and not np.isclose(alpha, result_alpha)
            and att is not None
            and se is not None
        ):
            inference_method = getattr(r, "inference_method", "analytical")
            if inference_method == "wild_bootstrap":
                inference_label = "wild cluster bootstrap"
            elif (
                inference_method == "bootstrap" or getattr(r, "bootstrap_results", None) is not None
            ):
                inference_label = "bootstrap"
            elif getattr(r, "bootstrap_distribution", None) is not None:
                inference_label = "bootstrap"
            elif getattr(r, "variance_method", None) in {"bootstrap", "jackknife", "placebo"}:
                variance_method = getattr(r, "variance_method", None)
                inference_label = f"{variance_method} variance"
            else:
                df_survey = getattr(
                    r,
                    "df_survey",
                    getattr(getattr(r, "survey_metadata", None), "df_survey", None),
                )
                if isinstance(df_survey, (int, float)) and df_survey > 0:
                    inference_label = "finite-df survey"
                elif isinstance(df_survey, (int, float)) and df_survey == 0:
                    # Rank-deficient replicate design: the fit deliberately
                    # left inference undefined. Preserve (NaN bounds remain NaN).
                    inference_label = "undefined-df (replicate-weight)"
                else:
                    # Ordinary analytical fit with a finite but unexposed
                    # ``df`` (``DifferenceInDifferences`` / ``MultiPeriodDiD``
                    # / most staggered estimators / TROP). We cannot
                    # reproduce the t-quantile without the fit's ``df``.
                    inference_label = "analytical (native degrees of freedom)"
            display_alpha = float(result_alpha)
            alpha_was_honored = False
            alpha_override_caveat = (
                f"Requested alpha ({phrasing_alpha:.2f}) was not honored "
                f"for the confidence interval because this fit uses "
                f"{inference_label} inference; the displayed CI remains "
                f"at the fit's native level "
                f"({int(round((1.0 - result_alpha) * 100))}%). The "
                f"significance phrasing still uses the requested alpha."
            )

        unit = self._context.outcome_unit
        unit_kind = self._unit_kind()

        # Missing or non-finite estimates (None / NaN / +-inf) have no sign.
        if att is None or not np.isfinite(att):
            sign = "undefined"
        elif att > 0:
            sign = "positive"
        elif att < 0:
            sign = "negative"
        else:
            sign = "null"

        ci_level = int(round((1.0 - display_alpha) * 100))

        # bool(...) coerces away numpy bool_ — when ``p`` is a numpy NaN (e.g.
        # SyntheticControl, whose analytical p_value is always NaN),
        # ``np.isfinite`` yields a numpy bool that is NOT JSON-serializable
        # in the schema.
        p_is_usable = p is not None and bool(np.isfinite(p))
        is_significant = bool(p_is_usable and p < phrasing_alpha)
        # NOTE(review): the window is asymmetric (0.01 below alpha, 0.001
        # above) — kept as-is; confirm this is intended.
        near_threshold = bool(
            p_is_usable and (phrasing_alpha - 0.01) < p < (phrasing_alpha + 0.001)
        )

        # Use DR-computed breakdown_M if available for quick reference.
        breakdown_M: Optional[float] = None
        if dr_schema:
            sens_section = dr_schema.get("sensitivity") or {}
            if sens_section.get("status") == "ran":
                breakdown_M = sens_section.get("breakdown_M")

        return {
            "effect": att,
            "se": se,
            "ci_lower": ci[0] if ci else None,
            "ci_upper": ci[1] if ci else None,
            "alpha_was_honored": alpha_was_honored,
            "alpha_override_caveat": alpha_override_caveat,
            "ci_level": ci_level,
            "p_value": p,
            "is_significant": is_significant,
            "near_significance_threshold": near_threshold,
            "unit": unit,
            "unit_kind": unit_kind,
            "sign": sign,
            "breakdown_M": breakdown_M,
        }

    def _extract_sample(self) -> Dict[str, Any]:
        """Extract sample metadata from the fitted result."""
        r = self._results
        survey = self._extract_survey_block()
        n_treated = _safe_int(getattr(r, "n_treated", getattr(r, "n_treated_units", None)))
        n_control_units = _safe_int(getattr(r, "n_control", getattr(r, "n_control_units", None)))

        # Control-group semantics. For estimators that expose a
        # ``control_group`` kwarg (CS, EfficientDiD, ContinuousDiD,
        # StaggeredTripleDiff, ...), the meaning of ``n_control_units``
        # depends on it. When the mode is "not-yet-treated" (dynamic
        # comparison set), the fixed tally stored on the result is only the
        # fully-untreated subset — the actual comparison set varies by (g, t)
        # cell. Label the exposed count accordingly so prose surfaces the
        # dynamic context instead of misreporting "0 control"
        # (round-13 / round-17 / round-18 CI review).
        #
        # Both ``"not_yet_treated"`` (CS / EfficientDiD / ContinuousDiD /
        # Wooldridge) and ``"notyettreated"`` (StaggeredTripleDiff) are
        # canonicalized to the same dynamic mode.
        #
        # Per-estimator fixed-subset field:
        #   * CS / SA / Imputation / TwoStage / EfficientDiD / dCDH /
        #     ContinuousDiD — ``n_control_units`` is the never-treated tally;
        #     surface as ``n_never_treated``.
        #   * StaggeredTripleDiff — ``n_control_units`` is a composite total;
        #     the fixed subset is ``n_never_enabled`` (stored separately on
        #     the result).
        #   * Wooldridge — ``n_control_units`` is total eligible comparisons
        #     (never-treated + future-treated) and does not map to a
        #     never-treated count. Keep on the fixed-count path even in
        #     dynamic mode.
        #   * Stacked — ``n_control_units`` is "distinct control units across
        #     the trimmed set" (stacked_did_results.py L59-62). Under
        #     ``clean_control="not_yet_treated"`` the trimmed set uses the
        #     rule ``A_s > a + kappa_post`` which admits future-treated
        #     controls; it is NOT a never-treated tally and cannot be
        #     relabeled as ``n_never_treated`` (round-21 P1 CI review on
        #     PR #318 flagged the earlier relabeling as a semantic-contract
        #     violation).
        control_group = _control_group_choice(r)
        name = type(r).__name__
        n_never_treated: Optional[int] = None
        n_never_enabled: Optional[int] = None
        n_control: Optional[int] = n_control_units
        _never_treated_count_contract = name in {
            "CallawaySantAnnaResults",
            "SunAbrahamResults",
            "ImputationDiDResults",
            "TwoStageDiDResults",
            "EfficientDiDResults",
            "ChaisemartinDHaultfoeuilleResults",
            "ContinuousDiDResults",
        }
        _canonical_control = (
            control_group.replace("_", "").lower() if isinstance(control_group, str) else None
        )

        # Stacked has two dynamic (sub-experiment-specific) modes:
        # ``not_yet_treated`` (A_s > a + kappa_post) and ``strict``
        # (A_s > a + kappa_post + kappa_pre). Only ``never_treated``
        # (A_s = infinity) is a fixed never-treated pool. Round-22 P1 CI
        # review on PR #318 flagged that ``strict`` was being misrendered as
        # a fixed control design.
        is_stacked_dynamic = name == "StackedDiDResults" and _canonical_control in {
            "notyettreated",
            "strict",
        }
        is_dynamic_control = _canonical_control == "notyettreated" or is_stacked_dynamic

        if name == "StaggeredTripleDiffResults":
            # ``n_control_units`` is a composite total that also includes the
            # eligibility-denied / larger-cohort cells. The valid fixed
            # comparison is the never-enabled cohort
            # (``staggered_triple_diff.py:384``, REGISTRY.md
            # §StaggeredTripleDifference line 1730), so surface
            # ``n_never_enabled`` on both the ``nevertreated`` mode
            # (round-37 P1 CI review on PR #318) and the dynamic
            # ``notyettreated`` mode.
            if _canonical_control in {"nevertreated", "notyettreated"}:
                n_never_enabled = _safe_int(getattr(r, "n_never_enabled", None))
                n_control = None
        elif is_dynamic_control:
            if name == "StackedDiDResults":
                # The trimmed-set tally includes future-treated controls by
                # construction under both dynamic modes. Do NOT relabel it as
                # ``n_never_treated``; it is surfaced below under
                # ``n_distinct_controls_trimmed`` and ``n_control`` is cleared
                # so the report does not narrate a fixed control pool.
                n_control = None
            elif _never_treated_count_contract:
                n_never_treated = n_control_units
                n_control = None

        # Panel-vs-RCS count semantics. CallawaySantAnnaResults stores
        # treated/control counts as OBSERVATIONS (not units) when the fit used
        # ``panel=False`` — ``staggered_results.py L183-L184`` renders those
        # counts as "obs:" rather than "units:". Carry the flag into the
        # schema so rendering can branch. Round-28 P2 CI review on PR #318.
        count_unit = "observations" if getattr(r, "panel", True) is False else "units"

        sample_block: Dict[str, Any] = {
            "n_obs": _safe_int(getattr(r, "n_obs", None)),
            "n_treated": n_treated,
            "n_control": n_control,
            "n_never_treated": n_never_treated,
            "control_group": control_group if isinstance(control_group, str) else None,
            "dynamic_control": is_dynamic_control,
            "n_periods": _safe_int(getattr(r, "n_periods", None)),
            "pre_periods": _safe_list_len(getattr(r, "pre_periods", None)),
            "post_periods": _safe_list_len(getattr(r, "post_periods", None)),
            "count_unit": count_unit,
            "survey": survey,
        }
        if n_never_enabled is not None:
            sample_block["n_never_enabled"] = n_never_enabled
        # Stacked-specific: surface the distinct-control-units tally on a
        # dedicated key so agents see the sub-experiment-specific comparison
        # count without misreading it as a never-treated subset
        # (round-21 / round-22 CI review).
        if name == "StackedDiDResults":
            sample_block["n_distinct_controls_trimmed"] = n_control_units
        return sample_block

    def _extract_survey_block(self) -> Optional[Dict[str, Any]]:
        """Summarize ``results.survey_metadata``; ``None`` when the fit has none."""
        sm = getattr(self._results, "survey_metadata", None)
        if sm is None:
            return None
        deff = _safe_float(getattr(sm, "design_effect", None))
        return {
            "weight_type": getattr(sm, "weight_type", None),
            "effective_n": _safe_float(getattr(sm, "effective_n", None)),
            "design_effect": deff,
            # Round-43 P2 CI review on PR #318: the ``is_trivial`` upper
            # bound matches DR's ``_check_design_effect`` and REPORTING.md's
            # ``trivial`` band definition ``0.95 <= deff < 1.05``
            # (half-open). A closed ``<= 1.05`` produced ``is_trivial=True``
            # at exactly ``deff == 1.05`` while the DR schema emitted
            # ``band_label="slightly_reduces"`` for the same value.
            "is_trivial": deff is not None and 0.95 <= deff < 1.05,
            "n_strata": _safe_int(getattr(sm, "n_strata", None)),
            "n_psu": _safe_int(getattr(sm, "n_psu", None)),
            "df_survey": _safe_int(getattr(sm, "df_survey", None)),
            "replicate_method": getattr(sm, "replicate_method", None),
        }
# ---------------------------------------------------------------------------
# Schema helpers (module-private)
# ---------------------------------------------------------------------------
def _safe_float(val: Any) -> Optional[float]:
if val is None:
return None
try:
return float(val)
except (TypeError, ValueError):
return None
def _safe_int(val: Any) -> Optional[int]:
if val is None:
return None
try:
return int(val)
except (TypeError, ValueError):
return None
def _safe_ci(ci: Any) -> Optional[List[float]]:
if ci is None:
return None
try:
lo, hi = ci
except (TypeError, ValueError):
return None
lo_f = _safe_float(lo)
hi_f = _safe_float(hi)
if lo_f is None or hi_f is None:
return None
return [lo_f, hi_f]
def _safe_list_len(val: Any) -> Optional[int]:
if val is None:
return None
try:
return int(len(val))
except TypeError:
return None
def _lift_pre_trends(dr: Optional[Dict[str, Any]]) -> Dict[str, Any]:
"""Pull pre-trends + power into a single BR-facing block."""
if dr is None:
return {"status": "skipped", "reason": "auto_diagnostics=False"}
pt = dr.get("parallel_trends") or {}
pp = dr.get("pretrends_power") or {}
if pt.get("status") != "ran":
return {
"status": pt.get("status", "not_run"),
"reason": pt.get("reason"),
}
return {
"status": "computed",
"method": pt.get("method"),
"joint_p_value": pt.get("joint_p_value"),
"verdict": pt.get("verdict"),
"n_pre_periods": pt.get("n_pre_periods"),
# Preserve DR's inconclusive-PT provenance on the BR schema so
# downstream consumers (and BR's own summary renderer) see the
# undefined-row count and DR's detailed reason without having
# to re-consult the DR schema (round-39 P3 CI review on PR
# #318). These fields are populated only when
# ``verdict == "inconclusive"`` per ``_pt_event_study``'s
# inconclusive branch (``diagnostic_report.py:999``).
"n_dropped_undefined": pt.get("n_dropped_undefined"),
"reason": pt.get("reason"),
# Carry the denominator df through when the survey F-reference
# branch was used so BR consumers can flag the finite-sample
# correction without re-consulting the DR schema (round-28 P3
# CI review on PR #318).
"df_denom": pt.get("df_denom"),
"power_status": pp.get("status"),
# Dedicated reason field so schema consumers see the fallback
# explanation when ``compute_pretrends_power`` cannot run
# (``status in {"skipped", "error", "not_applicable"}``).
# REPORTING.md lines 118-125 promise this provenance; round-29
# P3 CI review on PR #318 flagged that only the enum status was
# being exposed and the reason was dropped at the lift boundary.
# ``power_status`` stays the machine-readable enum; ``power_reason``
# carries the plain-English explanation.
"power_reason": pp.get("reason"),
"power_tier": pp.get("tier"),
"mdv": pp.get("mdv"),
# Level-scale max pre-period violation under the MDV
# (PR-B R12: `mdv * max(|violation_weights|)`). Carried alongside
# the raw `mdv` so BR schema consumers and the full-report
# renderer can show both quantities. Pre-R14 this was silently
# dropped at the BR lift boundary so the new renderer line never
# fired even though DR emitted the value.
"max_abs_pre_violation": pp.get("max_abs_pre_violation"),
"mdv_share_of_att": pp.get("mdv_share_of_att"),
# Carry the covariance-source annotation through so BR can hedge the
# power-tier phrasing when compute_pretrends_power silently used a
# diagonal fallback despite event_study_vcov being available.
"power_covariance_source": pp.get("covariance_source"),
}
def _lift_sensitivity(dr: Optional[Dict[str, Any]]) -> Dict[str, Any]:
if dr is None:
return {"status": "skipped", "reason": "auto_diagnostics=False"}
sens = dr.get("sensitivity") or {}
if sens.get("status") != "ran":
# Preserve ``method`` through to the BR schema so downstream
# consumers can distinguish a native-routed skip
# (``method="estimator_native"`` for SDiD / TROP, where
# robustness is covered by the native battery) from a
# methodology-blocked skip (e.g., CS with
# ``base_period='varying'``). Without it, agents reading the BR
# schema alone cannot tell these cases apart and would have to
# re-consult the DR schema to disambiguate.
return {
"status": sens.get("status", "not_run"),
"reason": sens.get("reason"),
"method": sens.get("method"),
}
return {
"status": "computed",
"method": sens.get("method"),
"breakdown_M": sens.get("breakdown_M"),
"conclusion": sens.get("conclusion"),
"grid": sens.get("grid"),
}
def _lift_heterogeneity(dr: Optional[Dict[str, Any]]) -> Dict[str, Any]:
"""Return the heterogeneity section of the BR schema.
Round-31 P2 CI review on PR #318: the lift previously returned
``None`` on any non-``ran`` path, which broke the schema contract
that every top-level BR key resolves to a dict with a ``status``
field. Downstream consumers had to special-case this one section.
Now returns a dict-shaped ``{"status": ..., "reason": ...}`` block
mirroring DR's own status enum so ``schema["heterogeneity"]
["status"]`` is always readable.
"""
if dr is None:
return {"status": "skipped", "reason": "auto_diagnostics=False"}
het = dr.get("heterogeneity") or {}
status = het.get("status")
if status != "ran":
return {
"status": status or "not_run",
"reason": het.get("reason"),
}
return {
"status": "ran",
"source": het.get("source"),
"n_effects": het.get("n_effects"),
"min": het.get("min"),
"max": het.get("max"),
"cv": het.get("cv"),
"sign_consistent": het.get("sign_consistent"),
}
def _lift_robustness(dr: Optional[Dict[str, Any]]) -> Dict[str, Any]:
if dr is None:
return {"status": "skipped", "reason": "auto_diagnostics=False"}
bacon = dr.get("bacon") or {}
native = dr.get("estimator_native_diagnostics") or {}
native_block = {
"status": native.get("status"),
"estimator": native.get("estimator"),
"pre_treatment_fit": native.get("pre_treatment_fit"),
}
# Classic SCM exposes pre_rmspe + donor-weight concentration + the (opt-in)
# in-space placebo rather than SDiD's pre_treatment_fit; surface those so the
# top-level robustness block is not empty for SyntheticControl.
if native.get("estimator") == "SyntheticControl":
native_block["pre_rmspe"] = native.get("pre_rmspe")
native_block["weight_concentration"] = native.get("weight_concentration")
native_block["in_space_placebo"] = native.get("in_space_placebo")
# ADH-2015 robustness diagnostics (opt-in; "not_run" stub until run).
native_block["leave_one_out"] = native.get("leave_one_out")
native_block["in_time_placebo"] = native.get("in_time_placebo")
return {
"bacon": {
"status": bacon.get("status"),
"forbidden_weight": bacon.get("forbidden_weight"),
"verdict": bacon.get("verdict"),
},
"estimator_native": native_block,
}
def _anticipation_periods(results: Any) -> int:
"""Return the non-negative anticipation-period count from a result, or 0.
Helper for ``_describe_assumption``. Anticipation-capable estimators
(MultiPeriodDiD, CS, SA, ImputationDiD, TwoStageDiD, Stacked, EfficientDiD,
StaggeredTripleDiff, ContinuousDiD, Wooldridge) expose ``anticipation``
as an int defaulting to ``0``.
"""
a = getattr(results, "anticipation", 0)
try:
k = int(a)
except (TypeError, ValueError):
return 0
return k if k > 0 else 0
def _control_group_choice(results: Any) -> Optional[str]:
"""Return the control-group choice string for a fitted result, normalized
across estimator-specific attribute names.
Most anticipation-capable estimators expose the control-group choice as
``results.control_group``. ``StackedDiDResults`` exposes the same choice
as ``clean_control`` (the public Wing-Freedman-Hollingsworth-2024 kwarg
name). Without this alias, a StackedDiD fit with
``clean_control="not_yet_treated"`` would surface as ``control_group=None``
in the business-report schema, and the dynamic-control branch in
``_extract_sample`` would never fire.
"""
cg = getattr(results, "control_group", None)
if isinstance(cg, str):
return cg
if type(results).__name__ == "StackedDiDResults":
clean = getattr(results, "clean_control", None)
if isinstance(clean, str):
return clean
return None
_STRICT_NO_ANTICIPATION_PATTERNS = (
# Ordered from most specific to least specific so the first match
# wins on strings that could match multiple patterns. Matches are
# case-sensitive because every occurrence in ``_describe_assumption``
# is a fixed canonical phrase.
", plus no anticipation",
"plus no anticipation",
" Also assumes no anticipation (Assumption NA), overlap "
"(Assumption O), and absorbing / irreversible treatment.",
" Also assumes no anticipation.",
"Also assumes no anticipation.",
" and no anticipation",
)
def _strip_strict_no_anticipation(desc: str) -> str:
"""Remove any strict no-anticipation phrasing from ``desc``.
Several base assumption descriptions in ``_describe_assumption``
hard-code a strict "plus no anticipation" / "Also assumes no
anticipation" clause (CS / SA / Imputation / TwoStage / Wooldridge
generic, StackedDiD sub-experiment, EfficientDiD PT-Post, EfficientDiD
PT-All, ContinuousDiD, TripleDifference, SyntheticDiD, TROP, dCDH,
and the fallback unconditional branch). When a fit actually allows
anticipation the helper must REPLACE that wording, not append a
contradictory clause on top of it. Round-30 P1 CI review on PR #318.
"""
if not desc:
return desc
out = desc
for pattern in _STRICT_NO_ANTICIPATION_PATTERNS:
out = out.replace(pattern, "")
# Collapse any doubled whitespace or dangling punctuation left by
# the removal (e.g., "cohorts, with..." -> "cohorts, with...";
# "cohorts . " -> "cohorts.").
out = re.sub(r"\s+\.", ".", out)
out = re.sub(r"\s+,", ",", out)
out = re.sub(r" {2,}", " ", out)
return out.strip()
def _apply_anticipation_to_assumption(block: Dict[str, Any], results: Any) -> Dict[str, Any]:
    """Rewrite an assumption block for a fit that used ``anticipation > 0``.

    When the fit allowed no anticipation periods, ``block`` is returned
    untouched (same object). Otherwise a shallow copy is returned with
    ``no_anticipation`` set to ``False``, ``anticipation_periods`` set to
    the period count, and, when the description is a string, the strict
    no-anticipation wording stripped and an anticipation-aware clause
    appended.

    Background (round-17 / round-30 CI review): a fit with
    ``anticipation=k`` shifts the effective treatment boundary by ``k``
    pre-periods, so the identifying assumption becomes "no treatment
    effects earlier than ``k`` periods before the treatment start" rather
    than strict no-anticipation. An earlier implementation only appended,
    so the prose asserted both at once.
    """
    periods = _anticipation_periods(results)
    if periods <= 0:
        return block

    unit = "periods" if periods != 1 else "period"
    addendum = (
        f" Anticipation is allowed for the {periods} {unit} immediately "
        "before treatment: the identifying contract requires no treatment "
        f"effects earlier than {periods} {unit} before the treatment "
        "start (not strict no-anticipation)."
    )

    # Build a new dict so the caller's block is never mutated.
    updated = {**block, "no_anticipation": False, "anticipation_periods": periods}
    base_text = updated.get("description", "")
    if isinstance(base_text, str):
        updated["description"] = _strip_strict_no_anticipation(base_text) + addendum
    return updated
def _describe_assumption(estimator_name: str, results: Any = None) -> Dict[str, Any]:
"""Return the identifying-assumption block for an estimator."""
if estimator_name in {
"SyntheticDiDResults",
}:
return {
"parallel_trends_variant": "weighted_pt",
"no_anticipation": True,
"description": (
"Synthetic-Difference-in-Differences identifies the ATT under a "
"weighted parallel-trends analogue: the synthetic control is "
"chosen to match the treated group's pre-period trajectory."
),
}
if estimator_name in {"TROPResults"}:
return {
"parallel_trends_variant": "factor_model",
"no_anticipation": True,
"description": (
"TROP uses low-rank factor-model identification rather than a "
"parallel-trends assumption; unobserved heterogeneity is "
"captured through latent factor loadings."
),
}
if estimator_name in {"SyntheticControlResults"}:
return {
# Distinct from SDiD's "synthetic_fit" weighted-PT analogue: classic
# SCM is a donor-weighted level match (matches the DR "scm_fit" method).
"parallel_trends_variant": "scm_fit",
"no_anticipation": True,
"description": (
"Classic synthetic control identifies the single treated unit's "
"counterfactual via a donor-weighted match to its pre-treatment "
"trajectory (a design-enforced fit, not a parallel-trends test); "
"significance comes from in-space placebo permutation inference "
"rather than an analytical standard error."
),
}
if estimator_name == "ContinuousDiDResults":
# Callaway, Goodman-Bacon & Sant'Anna (2024), two-level PT:
# REGISTRY.md §ContinuousDiD > Identification.
return {
"parallel_trends_variant": "dose_pt_or_strong_pt",
"no_anticipation": True,
"description": (
"ContinuousDiD identifies dose-specific treatment effects "
"under two possible parallel-trends conditions (Callaway, "
"Goodman-Bacon & Sant'Anna 2024). Parallel Trends (PT) "
"assumes untreated potential outcome paths are equal across "
"all dose groups and the untreated group (conditional on "
"dose), identifying ATT(d|d) and the binarized ATT^loc but "
"NOT ATT(d), ACRT, or cross-dose comparisons. Strong "
"Parallel Trends (SPT) additionally rules out selection "
"into dose on the basis of treatment effects and is "
"required to identify the dose-response curve ATT(d), "
"marginal effect ACRT(d), and cross-dose contrasts."
),
}
if estimator_name in {"TripleDifferenceResults", "StaggeredTripleDiffResults"}:
# Ortiz-Villavicencio & Sant'Anna (2025) — identification is the
# triple-difference cancellation across the 2x2x2 cells, not
# ordinary DiD parallel trends; see REGISTRY.md §TripleDifference
# and §StaggeredTripleDifference.
return {
"parallel_trends_variant": "triple_difference_cancellation",
"no_anticipation": True,
"description": (
"Triple-difference identification relies on the DDD "
"decomposition (Ortiz-Villavicencio & Sant'Anna 2025): "
"the ATT is recovered from `DDD = DiD_A + DiD_B - DiD_C` "
"across the Group x Period x Eligibility (or Treatment) "
"cells, which differences out group-specific and "
"period-specific unobservables without requiring separate "
"parallel trends to hold between each cell pair. The "
"identifying restriction is therefore weaker than ordinary "
"DiD parallel trends but assumes that the residual "
"unobservable component is additively separable across the "
"three dimensions; practical overlap and common-support "
"conditions still apply on the propensity score when "
"covariates are used."
),
}
if estimator_name == "ChaisemartinDHaultfoeuilleResults":
# de Chaisemartin & D'Haultfoeuille (2020, 2024) — identification is
# transition-based across (joiner, leaver, stable-control) cells
# around each switching period, not a group-time ATT parallel-
# trends restriction. Writing up dCDH as "parallel trends across
# treatment cohorts" was flagged as a source-faithfulness bug in
# PR #318 review; REGISTRY.md §ChaisemartinDHaultfoeuille is
# explicit about the transition-set construction.
#
# Phase-3 features (``controls``, ``trends_linear``,
# ``heterogeneity``) each modify the identifying contract and
# change the estimand from ``DID_l`` to ``DID^X_l`` /
# ``DID^{fd}_l`` / the heterogeneity-test variant. When active,
# append an explicit clause so the description does not
# misrepresent the identifying assumption (the reviewer has
# flagged several parallel source-faithfulness gaps elsewhere
# — explicitly surfacing Phase-3 config matches the per-estimator
# walkthrough pattern).
base_description = (
"Identification is transition-based (de Chaisemartin & "
"D'Haultfoeuille 2020; dynamic companion 2024). At each "
"switching period, the estimator contrasts joiners "
"(D:0->1), leavers (D:1->0), and stable-treated / "
"stable-untreated control cells that share the same "
"treatment state across adjacent periods, yielding the "
"contemporaneous ``DID_M`` and per-horizon ``DID_l`` / "
"``DID_{g,l}`` building blocks. The identifying "
"restriction is parallel trends within each transition's "
"stable-control cell (not a single group-time ATT PT "
"condition across all cohorts) plus no anticipation; "
"with non-binary treatment the stable-control match is "
"additionally on exact baseline dose ``D_{g,1}``. "
"Reversible treatment is natively supported, unlike the "
"absorbing-treatment designs that rely on a fixed "
"treatment-onset cohort."
)
has_controls = (
results is not None and getattr(results, "covariate_residuals", None) is not None
)
# PR #347 R10 P1: read the persisted ``trends_linear`` flag
# first — empty-horizon trends-linear fits set
# ``linear_trends_effects=None`` but are still trends-linear
# per the estimator contract. Legacy fit objects predating
# the persisted field fall back to the presence inference.
_trends_persisted = getattr(results, "trends_linear", None) if results is not None else None
if isinstance(_trends_persisted, bool):
has_trends = _trends_persisted
else:
has_trends = (
results is not None and getattr(results, "linear_trends_effects", None) is not None
)
has_heterogeneity = (
results is not None and getattr(results, "heterogeneity_effects", None) is not None
)
active_parts: List[str] = []
if has_controls and has_trends:
active_parts.append(
"the estimand is ``DID^{X,fd}_l`` (covariate-residualized "
"first-differences), and identification holds conditional on "
"the covariates entering the first-stage regression and "
"allowing group-specific linear trends"
)
elif has_controls:
active_parts.append(
"the estimand is ``DID^X_l``, and identification holds "
"conditional on the covariates entering the first-stage "
"residualization"
)
elif has_trends:
active_parts.append(
"the estimand is ``DID^{fd}_l`` (first-differenced) and the "
"identifying restriction is relaxed to allow group-specific "
"linear pre-trends"
)
if has_heterogeneity:
active_parts.append("heterogeneity tests ``beta^{het}_l`` are reported per horizon")
if active_parts:
phase3_clause = " Phase-3 configuration: " + "; ".join(active_parts) + "."
base_description = base_description + phase3_clause
return {
"parallel_trends_variant": "transition_based",
"no_anticipation": True,
"description": base_description,
}
if estimator_name == "EfficientDiDResults":
# Chen, Sant'Anna & Xie (2025) — identification is parameterized
# by ``pt_assumption`` ("all" vs "post"). PT-All is the stronger
# regime (PT across all groups/periods, over-identified — paper
# Lemma 2.1), PT-Post the weaker (PT only in post-treatment,
# just-identified reduction to single-baseline DiD per Corollary
# 3.2). Also read ``control_group`` when present (not_yet_treated
# vs last_cohort) to be source-faithful to REGISTRY.md §EfficientDiD
# lines 736-738 and 907.
pt_assumption = getattr(results, "pt_assumption", "all")
control_group = getattr(results, "control_group", None)
# The estimator only accepts ``control_group`` values of
# ``"never_treated"`` (the default) or ``"last_cohort"``. When
# ``last_cohort`` is used, the latest treatment cohort is
# reclassified as a pseudo-never-treated comparison and time
# periods at/after its onset are dropped; describing such a fit
# with generic never-treated language would misstate the
# identifying setup (see REGISTRY.md §EfficientDiD line 908).
is_last_cohort = control_group == "last_cohort"
if pt_assumption == "post":
variant = "pt_post"
if is_last_cohort:
control_clause = (
"the comparison group is the latest treated cohort "
"reclassified as pseudo-never-treated (periods "
"at/after that cohort's treatment start are "
"dropped)"
)
else:
control_clause = "the comparison group is never-treated"
description = (
"Identification under PT-Post (Chen, Sant'Anna & Xie "
"2025): parallel trends holds only in post-treatment "
"periods, " + control_clause + ", and the baseline is period g-1 only. This is the "
"weaker of the two regimes — just-identified and "
"reducing to standard single-baseline DiD (Corollary "
"3.2). Also assumes no anticipation (Assumption NA), "
"overlap (Assumption O), and absorbing / irreversible "
"treatment."
)
else:
variant = "pt_all"
if is_last_cohort:
baseline_clause = (
"using the latest treated cohort as a pseudo-never-"
"treated comparison (periods at/after that cohort's "
"treatment start are dropped); any earlier cohort "
"and any pre-treatment period can serve as baseline"
)
else:
baseline_clause = (
"using never-treated units as comparison; any "
"not-yet-treated cohort and any pre-treatment period "
"can serve as baseline"
)
description = (
"Identification under PT-All (Chen, Sant'Anna & Xie "
"2025): parallel trends holds for all groups and all "
"periods, "
+ baseline_clause
+ ". The estimator is over-identified (Lemma 2.1), and "
"the paper's optimal combination weights are applied. "
"Also assumes no anticipation (Assumption NA), overlap "
"(Assumption O), and absorbing / irreversible "
"treatment. The Hausman PT-All vs PT-Post pretest "
"(operating on the post-treatment event-study vector "
"ES(e), Theorem A.1) checks whether the stronger "
"PT-All regime is tenable."
)
block: Dict[str, Any] = {
"parallel_trends_variant": variant,
"no_anticipation": True,
"description": description,
}
if isinstance(control_group, str):
block["control_group"] = control_group
return block
if estimator_name == "StackedDiDResults":
# Wing, Freedman & Hollingsworth (2024) — identification is
# sub-experiment common trends plus the IC1 (event window fits
# within the data range) and IC2 (clean controls exist for the
# event) inclusion conditions, NOT the generic "group-time ATT
# parallel trends" clause used for CS / SA / etc. (round-22 P1
# CI review on PR #318). The active ``clean_control`` rule
# determines which units qualify as valid controls for each
# adoption event. REGISTRY.md §StackedDiD lines 1189-1193
# (identification) and 1234-1256 (clean-control rules).
clean_control = getattr(results, "clean_control", None)
if clean_control == "never_treated":
control_clause = (
"controls are restricted to units that are never treated "
"over the panel (``A_s = infinity``)"
)
elif clean_control == "strict":
control_clause = (
"controls for event ``a`` are units satisfying the strict "
"rule ``A_s > a + kappa_post + kappa_pre`` (strictly "
"untreated across the full pre- and post-event window)"
)
else:
# Default: "not_yet_treated" — A_s > a + kappa_post.
control_clause = (
"controls for event ``a`` are units satisfying ``A_s > a + "
"kappa_post`` (not yet treated through the end of the "
"event's post-window, so future-treated units can serve "
"as controls for earlier events)"
)
block: Dict[str, Any] = {
"parallel_trends_variant": "stacked_sub_experiment",
"no_anticipation": True,
"description": (
"Identification under Stacked DiD (Wing, Freedman & "
"Hollingsworth 2024): within each stacked sub-experiment "
"parallel trends holds between the treated cohort and the "
"corresponding clean-control set over the event window "
"``[-kappa_pre, +kappa_post]``; "
+ control_clause
+ ". Sub-experiments are restricted by IC1 (the event "
"window fits within the available time range) and IC2 "
"(at least one clean control exists). The aggregate ATT is "
"a weighted sum over sub-experiments, so the common-trends "
"assumption is sub-experiment-specific, not a single "
"panel-wide group-time ATT condition. Also assumes no "
"anticipation."
),
}
if isinstance(clean_control, str):
block["control_group"] = clean_control
block["clean_control"] = clean_control
return block
if estimator_name == "ImputationDiDResults":
# Borusyak, Jaravel & Spiess (2024) — identification is through
# an untreated-potential-outcome model: unit+time FE (optionally
# plus covariates) fitted on untreated observations only
# (``Omega_0``) deliver the counterfactual ``Y_it(0)``, and the
# treatment effect ``tau_it`` is the residual on treated
# observations. Writing this as generic "group-time ATT
# parallel trends" misstates the identifying model — the
# restriction is on the UNTREATED outcome's additive FE
# structure, not on cohort-time ATT equality. REGISTRY.md
# §ImputationDiD lines 1000-1013 and Assumption 1 (parallel
# trends) + Assumption 2 (no anticipation on untreated
# observations). Round-42 P1 CI review on PR #318 flagged this
# source-faithfulness gap.
return {
"parallel_trends_variant": "untreated_outcome_fe_model",
"no_anticipation": True,
"description": (
"Identification under Imputation DiD (Borusyak, Jaravel "
"& Spiess 2024): the untreated potential outcome "
"``Y_it(0)`` follows an additive unit+time fixed-effects "
"model ``Y_it(0) = alpha_i + beta_t [+ X'_it * delta] + "
"epsilon_it``. Step 1 estimates those FE on untreated "
"observations only (``Omega_0`` = never-treated plus "
"not-yet-treated cells); Step 2 imputes the "
"counterfactual for treated observations from the "
"fitted FE; Step 3 aggregates ``tau_hat_it = Y_it - "
"Y_hat_it(0)`` with researcher-chosen weights. The "
"identifying restriction is therefore parallel trends "
"of the UNTREATED outcome model (Assumption 1) — "
"``E[Y_it(0)] = alpha_i + beta_t``, holding across all "
"observations — rather than equality of cohort-time "
"ATTs. Also assumes no anticipation on untreated "
"observations (Assumption 2) and absorbing treatment."
),
}
if estimator_name == "TwoStageDiDResults":
# Gardner (2022) — identification is the same as BJS
# ImputationDiD (point estimates are algebraically equivalent
# per REGISTRY.md §TwoStageDiD line 1130): unit+time FE
# estimated on untreated observations only deliver the
# untreated potential-outcome trajectory; Stage 2 regresses
# the resulting residuals on treatment indicators. Writing
# this as generic "group-time ATT parallel trends" loses the
# load-bearing detail that Stage 1 operates only on untreated
# cells. REGISTRY.md §TwoStageDiD lines 1113-1128 and
# Assumption (same as ImputationDiD). Round-42 P1 CI review on
# PR #318 flagged this source-faithfulness gap.
return {
"parallel_trends_variant": "untreated_outcome_fe_model",
"no_anticipation": True,
"description": (
"Identification under Two-Stage DiD (Gardner 2022): "
"Stage 1 fits unit + time fixed effects on untreated "
"observations only (``Omega_0``), residualizing the "
"outcome as ``y_tilde_it = Y_it - alpha_hat_i - "
"beta_hat_t``; Stage 2 regresses residualized outcomes "
"on the treatment indicator across treated observations "
"to recover the ATT. The point estimates are "
"algebraically equivalent to Borusyak-Jaravel-Spiess "
"imputation (both rely on the same untreated-outcome FE "
"model to construct the counterfactual). The "
"identifying restriction is therefore parallel trends "
"of the UNTREATED outcome: ``E[Y_it(0)] = alpha_i + "
"beta_t`` for all observations (not a group-time ATT "
"equality across cohorts). Also assumes no anticipation "
"(``Y_it = Y_it(0)`` for all untreated observations) "
"and absorbing / irreversible treatment."
),
}
if estimator_name in {
"CallawaySantAnnaResults",
"SunAbrahamResults",
"WooldridgeDiDResults",
}:
return {
"parallel_trends_variant": "conditional_or_group_time",
"no_anticipation": True,
"description": (
"Identification relies on parallel trends across treatment "
"cohorts and time periods (group-time ATT), plus no "
"anticipation."
),
}
return {
"parallel_trends_variant": "unconditional",
"no_anticipation": True,
"description": (
"Identification relies on the standard DiD parallel-trends "
"assumption plus no anticipation of treatment by either group."
),
}
def _build_caveats(
_results: Any,
headline: Dict[str, Any],
sample: Dict[str, Any],
dr_schema: Optional[Dict[str, Any]],
) -> List[Dict[str, Any]]:
"""Assemble the plain-English caveats list for the headline schema."""
caveats: List[Dict[str, Any]] = []
# NaN ATT is the highest-severity caveat.
if headline.get("sign") == "undefined":
caveats.append(
{
"severity": "warning",
"topic": "estimation_failure",
"message": (
"Estimation produced a non-finite effect. Inspect data "
"preparation and model specification before interpreting."
),
}
)
# Alpha override could not be honored (bootstrap / finite-df inference).
alpha_override_msg = headline.get("alpha_override_caveat")
if isinstance(alpha_override_msg, str) and alpha_override_msg:
caveats.append(
{
"severity": "info",
"topic": "alpha_override_preserved",
"message": alpha_override_msg,
}
)
# Near-threshold p-value.
if headline.get("near_significance_threshold"):
caveats.append(
{
"severity": "info",
"topic": "near_significance",
"message": (
"The p-value is close to the conventional significance "
"threshold; small changes to the sample or specification "
"could move it either way."
),
}
)
# Few treated units.
nt = sample.get("n_treated")
if nt is not None and nt <= 3:
caveats.append(
{
"severity": "warning",
"topic": "few_treated",
"message": (
f"Only {nt} treated units in this fit; standard errors "
"rely on large-cluster asymptotics and may be unreliable. "
"Consider SyntheticDiD or an exact-permutation inference "
"alternative."
),
}
)
# Non-trivial design effect.
survey = sample.get("survey")
if survey and not survey.get("is_trivial"):
deff = survey.get("design_effect")
eff_n = survey.get("effective_n")
if isinstance(deff, (int, float)) and deff >= 5.0:
caveats.append(
{
"severity": "warning",
"topic": "design_effect",
"message": (
f"Very large survey design effect (DEFF = {deff:.2g}). "
"Inspect the weight distribution and consider weight "
"trimming if driven by outlier weights."
),
}
)
elif isinstance(deff, (int, float)) and deff >= 1.5:
if isinstance(eff_n, (int, float)):
caveats.append(
{
"severity": "info",
"topic": "design_effect",
"message": (
f"Survey design reduces effective sample size: "
f"DEFF = {deff:.2g}; effective n = {eff_n:.0f}."
),
}
)
# Bacon forbidden comparisons.
# Round-45 P1 CI review on PR #318: Goodman-Bacon is a
# decomposition of TWFE weights (see ``bacon.py`` header and
# Goodman-Bacon 2021). On fits already produced by a
# heterogeneity-robust estimator (CS / SA / BJS / Gardner /
# Wooldridge / EfficientDiD / Stacked / dCDH / TripleDifference /
# StaggeredTripleDiff / SDiD / TROP), a high forbidden-weight share
# says "TWFE would have been materially biased on this rollout",
# not "the displayed estimator needs to be replaced" — the
# displayed estimator is already robust to the heterogeneity that
# Bacon flags. DR partly preserves this with "if not already in
# use" prose; BR must carry the same distinction through to the
# caveat. The TWFE-style estimators whose results route through
# Bacon and for which the "switch to a robust estimator"
# recommendation is load-bearing are the DiDResults-type fits; all
# other result classes are already robust.
_TWFE_STYLE_RESULTS: FrozenSet[str] = frozenset(
{"DiDResults", "MultiPeriodDiDResults", "TwoWayFixedEffectsResults"}
)
if dr_schema:
bacon = dr_schema.get("bacon") or {}
if bacon.get("status") == "ran":
fw = bacon.get("forbidden_weight")
if isinstance(fw, (int, float)) and fw > 0.10:
_estimator_name = type(_results).__name__
if _estimator_name in _TWFE_STYLE_RESULTS:
bacon_message = (
f"Goodman-Bacon decomposition places {fw:.0%} "
"of implicit TWFE weight on 'forbidden' "
"later-vs-earlier comparisons. TWFE may be "
"materially biased under heterogeneous effects. "
"Re-estimate with a heterogeneity-robust "
"estimator (CS / SA / BJS / Gardner)."
)
else:
bacon_message = (
f"Goodman-Bacon decomposition places {fw:.0%} "
"of TWFE weight on 'forbidden' later-vs-earlier "
"comparisons. A TWFE benchmark on this rollout "
"would be materially biased under heterogeneous "
"effects; the displayed estimator is already "
"heterogeneity-robust, so this is a statement "
"about the rollout design (avoid reporting TWFE "
"alongside this fit), not about the current "
"result's validity."
)
caveats.append(
{
"severity": "warning",
"topic": "bacon_contamination",
"message": bacon_message,
}
)
# Fragile sensitivity.
sens = dr_schema.get("sensitivity") or {}
if sens.get("status") == "ran":
bkd = sens.get("breakdown_M")
if isinstance(bkd, (int, float)) and bkd < 0.5:
caveats.append(
{
"severity": "warning",
"topic": "sensitivity_fragility",
"message": (
f"HonestDiD breakdown value is {bkd:.2g}: the "
"result's confidence interval includes zero "
"once parallel-trends violations reach less than "
"half the observed pre-period variation. Treat "
"the headline as tentative."
),
}
)
# Sensitivity was skipped for methodology reasons (e.g., CS fit with
# ``base_period='varying'`` — HonestDiD bounds are not interpretable
# there). Surface the reason as a warning-severity caveat so readers
# do not assume the headline is robust across the R-R grid.
#
# Exception (round-20 P2 CI review on PR #318): SDiD and TROP route
# robustness to ``estimator_native_diagnostics`` and mark the HonestDiD
# sensitivity block ``status="skipped", method="estimator_native"``.
# Surfacing "sensitivity was not run" as a warning contradicts the
# documented native-routing contract when the native battery actually
# ran. Suppress the warning and point readers at the native block
# instead.
if sens.get("status") == "skipped":
reason = sens.get("reason")
method = sens.get("method")
native = dr_schema.get("estimator_native_diagnostics") or {}
native_ran = native.get("status") == "ran"
if method == "estimator_native" and native_ran:
caveats.append(
{
"severity": "info",
"topic": "sensitivity_native_routed",
"message": (
"HonestDiD was not run for this estimator. Robustness "
"is covered by the estimator-native sensitivity "
"diagnostics reported under "
"``estimator_native_diagnostics``."
),
}
)
elif isinstance(reason, str) and reason:
caveats.append(
{
"severity": "warning",
"topic": "sensitivity_skipped",
"message": ("HonestDiD sensitivity was not run on this fit. " + reason),
}
)
# Non-fatal warnings captured from delegated diagnostics
# (e.g., HonestDiD's bootstrap diag-covariance fallback, dropped
# non-consecutive horizons on dCDH). DR already records these in
# ``schema["warnings"]``; mirror the methodology-critical ones
# into BR's caveat list so summary/full-report prose can surface
# them without readers having to inspect the DR schema.
for msg in dr_schema.get("warnings", []) or []:
if not isinstance(msg, str) or not msg:
continue
# Skip alpha-override and design-effect messages already
# covered by dedicated caveats above.
lower = msg.lower()
if "sensitivity:" in lower or "pretrends_power:" in lower:
caveats.append(
{
"severity": "info",
"topic": "diagnostic_warning",
"message": msg,
}
)
# Unit mismatch caveat (log_points + unit override).
unit_kind = headline.get("unit_kind")
if unit_kind == "log_points":
caveats.append(
{
"severity": "info",
"topic": "unit_policy",
"message": (
"The effect is reported in log-points as estimated; "
"BusinessReport does not arithmetically translate log-points "
"to percent or level changes. For small effects, log-points "
"approximate percentage changes."
),
}
)
return caveats
def _pt_method_subject(method: Optional[str]) -> str:
"""Return a source-faithful sentence subject for the PT verdict prose.
The ``parallel_trends.method`` field distinguishes between the
2x2 slope-difference check, the pre-period event-study Wald /
Bonferroni variants, EfficientDiD's Hausman PT-All vs PT-Post
pretest, SDiD's weighted pre-treatment fit, and TROP's factor-
model identification. Generic "pre-treatment event-study" wording
is wrong for the first and third cases. See round-8 CI review on
PR #318 and REGISTRY.md §EfficientDiD (Hausman pretest).
"""
if method == "slope_difference":
return "The pre-period slope-difference test"
if method == "hausman":
return "The Hausman PT-All vs PT-Post pretest"
if method in {
"joint_wald",
"joint_wald_event_study",
"joint_wald_no_vcov",
"bonferroni",
# Survey-aware event-study PT variants use an F reference
# distribution with denominator df = ``survey_metadata.df_survey``
# (round-27 P1 fix, documented in REPORTING.md). The subject
# remains the pre-period event-study coefficients; prose elsewhere
# flags the finite-sample correction via ``df_denom``.
"joint_wald_survey",
"joint_wald_event_study_survey",
}:
return "Pre-treatment event-study coefficients"
if method == "synthetic_fit":
return "The synthetic-control pre-treatment fit"
if method == "scm_fit":
return "The synthetic-control donor-weighted pre-treatment fit"
if method == "factor":
return "The factor-model pre-treatment fit"
return "Pre-treatment data"
def _pt_method_stat_label(method: Optional[str]) -> Optional[str]:
"""Return the joint-statistic label appropriate to the PT method.
Returns ``"joint p"`` for Wald / Bonferroni paths (including the
survey-aware F-reference variants, which remain joint tests on the
pre-period coefficient vector — only the reference distribution
changes), ``"p"`` for the 2x2 slope-difference and Hausman paths
(single-statistic tests), and ``None`` for design-enforced paths
that have no p-value.
"""
if method in {
"joint_wald",
"joint_wald_event_study",
"joint_wald_no_vcov",
"bonferroni",
"joint_wald_survey",
"joint_wald_event_study_survey",
}:
return "joint p"
if method in {"slope_difference", "hausman"}:
return "p"
if method in {"synthetic_fit", "scm_fit", "factor"}:
# Design-enforced fit-based paths have no p-value label (SCM's significance
# is the in-space placebo, not a PT joint test).
return None
return "joint p"
def _references_for(estimator_name: str) -> List[Dict[str, str]]:
"""Map the estimator to the appropriate citation references."""
base = [
{
"role": "sensitivity",
"citation": (
"Rambachan, A., & Roth, J. (2023). A More Credible Approach "
"to Parallel Trends. Review of Economic Studies."
),
},
{
"role": "workflow",
"citation": (
"Baker, A. C., Callaway, B., Cunningham, S., Goodman-Bacon, A., "
"& Sant'Anna, P. H. C. (2025). Difference-in-Differences "
"Designs: A Practitioner's Guide."
),
},
]
estimator_refs = {
"CallawaySantAnnaResults": {
"role": "estimator",
"citation": (
"Callaway, B., & Sant'Anna, P. H. C. (2021). "
"Difference-in-Differences with multiple time periods. "
"Journal of Econometrics."
),
},
"SyntheticDiDResults": {
"role": "estimator",
"citation": (
"Arkhangelsky, D., Athey, S., Hirshberg, D. A., Imbens, G. W., "
"& Wager, S. (2021). Synthetic Difference in Differences."
),
},
"SyntheticControlResults": {
"role": "estimator",
"citation": (
"Abadie, A., Diamond, A., & Hainmueller, J. (2010). Synthetic "
"Control Methods for Comparative Case Studies. JASA, 105(490)."
),
},
"SunAbrahamResults": {
"role": "estimator",
"citation": (
"Sun, L., & Abraham, S. (2021). Estimating dynamic treatment "
"effects in event studies. Journal of Econometrics."
),
},
"ImputationDiDResults": {
"role": "estimator",
"citation": (
"Borusyak, K., Jaravel, X., & Spiess, J. (2024). " "Revisiting event-study designs."
),
},
"EfficientDiDResults": {
"role": "estimator",
"citation": (
"Chen, X., Sant'Anna, P. H. C., & Xie, H. (2025). "
"Efficient Estimation of Treatment Effects in Staggered "
"DiD Designs."
),
},
"ChaisemartinDHaultfoeuilleResults": {
"role": "estimator",
"citation": (
"de Chaisemartin, C., & D'Haultfœuille, X. (2020). "
"Two-way fixed effects estimators with heterogeneous "
"treatment effects. American Economic Review."
),
},
}
if estimator_name in estimator_refs:
return [estimator_refs[estimator_name]] + base
return base
# ---------------------------------------------------------------------------
# Prose rendering
# ---------------------------------------------------------------------------
def _format_value(value: Optional[float], unit: Optional[str], unit_kind: str) -> str:
"""Format a numeric effect with its unit. No arithmetic translation."""
if value is None or not np.isfinite(value):
return "undefined"
if unit_kind == "currency":
sign = "-" if value < 0 else ""
return f"{sign}${abs(value):,.2f}"
if unit_kind == "percent":
return f"{value:.2f}%"
if unit_kind == "percentage_points":
return f"{value:.2f} pp"
if unit_kind == "log_points":
return f"{value:.3g} log-points"
if unit_kind == "count":
return f"{value:,.0f}"
# unknown / free-form
if unit:
return f"{value:.3g} {unit}"
return f"{value:.3g}"
def _significance_phrase(p: Optional[float], alpha: float) -> str:
"""Return a plain-English significance phrase.
Tiers per ``docs/methodology/REPORTING.md``:
* p < 0.001: "strongly supported by the data"
* 0.001 <= p < 0.01: "well-supported"
* 0.01 <= p < alpha: "statistically significant at the X% level"
* alpha <= p < 0.10: CI-includes-zero language
* p >= 0.10: consistent-with-no-effect language
"""
if p is None or not np.isfinite(p):
return "statistical significance cannot be assessed (p-value unavailable)"
ci_level = int(round((1.0 - alpha) * 100))
if p < 0.001:
return "the direction of the effect is strongly supported by the data"
if p < 0.01:
return "the direction of the effect is well-supported by the data"
if p < alpha:
return f"the effect is statistically significant at the {ci_level}% level"
if p < 0.10:
return (
"the confidence interval includes zero; the direction is suggestive "
"but not statistically significant"
)
return "the confidence interval includes zero; the data are consistent with no effect"
def _smallest_failing_grid_m(sens: Dict[str, Any]) -> Optional[float]:
"""If the smallest evaluated M on the HonestDiD sensitivity grid
already has the robust CI including zero, return that M. Returns
``None`` when the grid is missing or when the smallest evaluated
point is still robust — in the latter case ``breakdown_M`` is an
interpolated threshold between grid points, not a statement about
the smallest grid point itself.
Matches the twin helper in ``diagnostic_report.py``; keep the two
in sync for cross-surface parity.
"""
grid_points = sens.get("grid") or []
sorted_grid = sorted(
(p for p in grid_points if isinstance(p.get("M"), (int, float))),
key=lambda p: p["M"],
)
if not sorted_grid:
return None
smallest = sorted_grid[0]
if not smallest.get("robust_to_zero", True):
return float(smallest["M"])
return None
def _sentence_first_upper(text: str) -> str:
"""Uppercase only the first character of ``text``, preserving all
other casing. Unlike ``str.capitalize()``, which lowercases every
character after the first, this keeps user-supplied abbreviations
and proper nouns intact.
Examples
--------
>>> _sentence_first_upper("the NJ minimum-wage increase")
'The NJ minimum-wage increase'
>>> _sentence_first_upper("Castle Doctrine law adoption")
'Castle Doctrine law adoption'
"""
if not text:
return text
return text[0].upper() + text[1:]
def _direction_verb(effect: float, outcome_direction: Optional[str]) -> str:
"""Return a direction-aware verb for the headline sentence.
When ``outcome_direction`` is unset we use neutral change verbs
(``increased`` / ``decreased``). When it is supplied, we additionally
flavor the verb with a value-laden connotation so the stakeholder can
read off whether the estimated effect points in the desired direction:
- ``higher_is_better``: positive effect -> "lifted"; negative -> "reduced"
- ``lower_is_better``: positive effect -> "worsened"; negative -> "improved"
- None: positive -> "increased"; negative -> "decreased"
"""
if effect == 0:
return "did not change"
if outcome_direction == "higher_is_better":
return "lifted" if effect > 0 else "reduced"
if outcome_direction == "lower_is_better":
return "worsened" if effect > 0 else "improved"
return "increased" if effect > 0 else "decreased"
def _render_headline_sentence(schema: Dict[str, Any]) -> str:
"""Render the headline sentence from the schema.
Uses the absolute value in the magnitude slot when the verb already
conveys direction ("decreased ... by $0.14" rather than "decreased ...
by -$0.14"). CI bounds are rendered at their natural signed values.
When ``outcome_direction`` is supplied, the verb picks up a value-laden
connotation ("lifted" / "reduced" vs neutral "increased" / "decreased").
"""
ctx = schema.get("context", {})
h = schema.get("headline", {})
# PR #347 R4 P1: the dCDH ``trends_linear=True`` + ``L_max>=2``
# configuration does not produce a scalar headline by design —
# ``overall_att`` is intentionally NaN (per
# ``chaisemartin_dhaultfoeuille.py:2828-2834``). Render explicit
# "no scalar headline by design" prose instead of routing through
# the non-finite / estimation-failure path.
if h.get("status") == "no_scalar_by_design":
# PR #347 R13 P1: the headline-level ``reason`` field is the
# single source for the no-scalar prose and is already
# branched on populated-vs-empty surface in ``_build_schema``.
# Use it verbatim so the headline sentence never drifts from
# the schema-level message on the empty-surface subcase.
treatment = ctx.get("treatment_label", "the treatment")
outcome_label = ctx.get("outcome_label", "the outcome")
treatment_sentence = _sentence_first_upper(treatment)
reason = h.get("reason")
if isinstance(reason, str) and reason:
return (
f"{treatment_sentence} does not produce a scalar aggregate "
f"effect on {outcome_label} under this configuration. " + reason
)
return (
f"{treatment_sentence} does not produce a scalar aggregate effect "
f"on {outcome_label} under this configuration (by design)."
)
effect = h.get("effect")
outcome = ctx.get("outcome_label", "the outcome")
treatment = ctx.get("treatment_label", "the treatment")
outcome_direction = ctx.get("outcome_direction")
unit = h.get("unit")
unit_kind = h.get("unit_kind", "unknown")
if effect is None or not np.isfinite(effect):
return (
f"We were unable to produce a finite estimate of {treatment}'s "
f"effect on {outcome}. Inspect the data and model specification."
)
verb = _direction_verb(effect, outcome_direction)
magnitude = _format_value(abs(effect), unit, unit_kind)
lo = h.get("ci_lower")
hi = h.get("ci_upper")
# Round-37 P1 CI review on PR #318: on a finite point estimate
# whose CI bounds are NaN (undefined inference — survey-df
# collapse, zero effective clusters, etc.), the previous isinstance
# check passed because ``NaN`` is a ``float`` and the sentence
# rendered ``(... 95% CI: undefined to undefined)``. Gate on
# ``np.isfinite`` like DR's own headline renderer already does;
# add an explicit inference-unavailable trailer instead of the
# broken CI clause.
ci_str = ""
ci_finite = (
isinstance(lo, (int, float))
and isinstance(hi, (int, float))
and np.isfinite(lo)
and np.isfinite(hi)
)
if ci_finite:
lo_s = _format_value(lo, unit, unit_kind)
hi_s = _format_value(hi, unit, unit_kind)
ci_str = f" ({h.get('ci_level', 95)}% CI: {lo_s} to {hi_s})"
elif isinstance(lo, (int, float)) or isinstance(hi, (int, float)):
# At least one bound was supplied but not finite -> inference
# undefined. Replace the CI clause with an explicit marker so
# downstream prose does not claim a confidence interval that
# is not actually available.
ci_str = " (inference unavailable: confidence interval is undefined for this fit)"
by_clause = f" by {magnitude}" if effect != 0 else ""
# Round-1 BR/DR canonical-validation (2026-04-19): Python's
# ``str.capitalize()`` lowercases everything except the first
# character, so ``"the NJ minimum-wage increase".capitalize()``
# returns ``"The nj minimum-wage increase"`` — flattening the
# ``NJ`` abbreviation. Real canonical datasets (Card-Krueger,
# Castle Doctrine) carry proper-noun / acronym tokens in the
# user-supplied ``treatment_label``, so preserve user casing and
# only ensure the first character is uppercase.
treatment_sentence = _sentence_first_upper(treatment)
return f"{treatment_sentence} {verb} {outcome}{by_clause}{ci_str}."
def _render_summary(schema: Dict[str, Any]) -> str:
    """Render the short-form stakeholder summary paragraph.

    Sentences are collected in a fixed order and joined with single
    spaces; empty entries are dropped at the end. The order is:

    1. ``Question: ...`` when ``context.business_question`` is set.
    2. The headline sentence.
    3. ``Target parameter: <name>.`` when ``target_parameter.name`` is set.
    4. The significance phrase (finite p-value only), plus the
       near-threshold note when the headline flags it.
    5. The pre-trends sentence when ``pre_trends.status == "computed"``,
       worded per verdict / power tier / method.
    6. The HonestDiD sensitivity sentence when
       ``sensitivity.status == "computed"``.
    7. The sample sentence when ``sample.n_obs`` is an ``int``, and the
       survey design-effect sentence when a non-trivial survey block
       carries a numeric DEFF and effective n.
    8. ``Caveat: ...`` quoting the first warning-severity caveat, if any.

    Parameters
    ----------
    schema : dict
        The report schema. The top-level keys read here are ``context``,
        ``target_parameter``, ``headline``, ``pre_trends``,
        ``sensitivity``, ``sample``, ``estimator`` and ``caveats``.

    Returns
    -------
    str
        The summary paragraph.
    """
    sentences: List[str] = []
    ctx = schema.get("context", {})
    question = ctx.get("business_question")
    if question:
        sentences.append(f"Question: {question}")

    # Headline sentence with significance phrase.
    sentences.append(_render_headline_sentence(schema))

    # BR/DR gap #6 (target-parameter clarity): name what the headline
    # scalar actually represents so the stakeholder can map the number
    # to a specific estimand. Rendered immediately after the headline
    # and before the significance phrase. The summary surfaces only
    # the short ``name`` so the paragraph stays within the
    # 6-10-sentence target; ``definition`` lives in the full report
    # and in the structured schema for agents that want the long form.
    tp = schema.get("target_parameter", {}) or {}
    tp_name = tp.get("name")
    if tp_name:
        sentences.append(f"Target parameter: {tp_name}.")

    h = schema.get("headline", {})
    p = h.get("p_value")
    alpha = ctx.get("alpha", 0.05)
    # A missing or non-finite p-value produces no significance sentence.
    if p is not None and np.isfinite(p):
        sig = _significance_phrase(p, alpha)
        sentences.append(f"Statistically, {sig}.")
        if h.get("near_significance_threshold"):
            sentences.append(
                "The p-value is close to the conventional threshold; "
                "small changes to the sample could move it either way."
            )

    # Pre-trends + power-aware phrasing.
    pt = schema.get("pre_trends", {}) or {}
    if pt.get("status") == "computed":
        jp = pt.get("joint_p_value")
        verdict = pt.get("verdict")
        # ``tier`` already incorporates the diagonal-fallback downgrade —
        # ``DiagnosticReport._check_pretrends_power`` applies it centrally
        # so every report surface (BR summary, BR full_report, BR schema,
        # DR summary) reads the same adjusted value (round-14 CI review).
        tier = pt.get("power_tier")
        method = pt.get("method")
        subject = _pt_method_subject(method)
        stat_label = _pt_method_stat_label(method)
        # The parenthetical needs both a numeric p-value and a label;
        # methods whose label is ``None`` render no p-value.
        jp_phrase = (
            f" ({stat_label} = {jp:.3g})" if isinstance(jp, (int, float)) and stat_label else ""
        )
        # Only point to "the sensitivity analysis below" when a
        # sensitivity block actually ran. For estimators that route to
        # native diagnostics (SDiD / TROP) or fits where sensitivity was
        # skipped / not applicable, the clause would mislead (round-12
        # CI review on PR #318).
        sens_ran = (schema.get("sensitivity", {}) or {}).get("status") == "computed"
        sens_tail_major = " pending the sensitivity analysis below" if sens_ran else ""
        sens_tail_alongside = " alongside the sensitivity analysis below" if sens_ran else ""
        sens_tail_see_bounded = (
            " See the sensitivity analysis below for bounded-violation guarantees."
            if sens_ran
            else ""
        )
        sens_tail_see_reliable = " See the sensitivity analysis below." if sens_ran else ""
        if verdict == "clear_violation":
            sentences.append(
                f"{subject} clearly reject parallel trends{jp_phrase}; the "
                "headline should be treated as tentative" + sens_tail_major + "."
            )
        elif verdict == "some_evidence_against":
            # Without a sensitivity block the sentence ends in
            # " with caution" instead of pointing at the analysis below.
            sentences.append(
                f"{subject} show some evidence against parallel trends"
                f"{jp_phrase}; interpret the headline"
                + (sens_tail_alongside if sens_ran else " with caution")
                + "."
            )
        elif verdict == "no_detected_violation":
            # A non-rejection is worded by power tier; any tier other
            # than the two named ones gets the limited-power wording.
            if tier == "well_powered":
                sentences.append(
                    f"{subject} are consistent with parallel trends, and "
                    "the test is well-powered (the max pre-period level "
                    "deviation at the MDV is small relative to the "
                    "estimated effect)."
                )
            elif tier == "moderately_powered":
                sentences.append(
                    f"{subject} do not reject parallel trends; the test is "
                    "moderately informative." + sens_tail_see_bounded
                )
            else:
                sentences.append(
                    f"{subject} do not reject parallel trends, but the test "
                    "has limited power — a non-rejection does not prove the "
                    "assumption." + sens_tail_see_reliable
                )
        elif verdict == "design_enforced_pt":
            if method == "scm_fit":
                sentences.append(
                    "The synthetic control is designed to reproduce the treated "
                    "unit's pre-period trajectory via donor weights (classic SCM's "
                    "design-enforced analogue of parallel trends); significance "
                    "comes from in-space placebo permutation inference, not a "
                    "parallel-trends test."
                )
            else:
                sentences.append(
                    "The synthetic control is designed to match the treated "
                    "group's pre-period trajectory (SDiD's weighted-parallel-"
                    "trends analogue)."
                )
        elif verdict == "inconclusive":
            # Round-35 P1 CI review on PR #318: a ``verdict=="inconclusive"``
            # state means one or more pre-period coefficients had
            # undefined inference (zero SE, NaN p-value) and the joint
            # test cannot be formed. BR previously omitted the sentence
            # entirely, so stakeholder prose silently skipped the
            # identifying-assumption diagnostic. Name the state
            # explicitly and quote the undefined-row count when
            # available.
            n_dropped = pt.get("n_dropped_undefined")
            if isinstance(n_dropped, int) and n_dropped > 0:
                rows_word = "row" if n_dropped == 1 else "rows"
                sentences.append(
                    f"The pre-trends test is inconclusive on this fit: "
                    f"{n_dropped} pre-period {rows_word} had undefined "
                    "inference (zero / negative SE or a non-finite "
                    "per-period p-value), so the joint test cannot be "
                    "formed. Treat parallel trends as unassessed rather "
                    "than supported."
                )
            else:
                sentences.append(
                    "The pre-trends test is inconclusive on this fit: "
                    "pre-period inference was undefined, so the joint "
                    "test cannot be formed. Treat parallel trends as "
                    "unassessed rather than supported."
                )

    # Sensitivity. A ``single_M_precomputed`` sensitivity block has
    # ``breakdown_M=None`` by construction because only one M was evaluated;
    # narrate it as a point check, NOT as grid-wide robustness.
    sens = schema.get("sensitivity", {}) or {}
    if sens.get("status") == "computed":
        bkd = sens.get("breakdown_M")
        conclusion = sens.get("conclusion")
        if conclusion == "single_M_precomputed":
            # The single evaluated point is read from the first grid entry.
            grid_points = sens.get("grid") or []
            point = grid_points[0] if grid_points else {}
            m_val = point.get("M")
            robust = point.get("robust_to_zero")
            if isinstance(m_val, (int, float)):
                if robust:
                    sentences.append(
                        f"HonestDiD (single point checked): at M = {m_val:.2g}, "
                        f"the robust confidence interval excludes zero. This is "
                        f"a point check, not a breakdown analysis — run "
                        f"HonestDiD.sensitivity() across a grid of M values "
                        f"for a full robustness claim."
                    )
                else:
                    sentences.append(
                        f"HonestDiD (single point checked): at M = {m_val:.2g}, "
                        f"the robust confidence interval includes zero. Run "
                        f"HonestDiD.sensitivity() across a grid to find the "
                        f"breakdown value."
                    )
        elif bkd is None:
            # No breakdown value on a full grid is narrated as robust
            # across the whole grid.
            sentences.append(
                "HonestDiD: the result remains significant across the "
                "full grid — robust to plausible parallel-trends violations."
            )
        elif isinstance(bkd, (int, float)) and bkd >= 1.0:
            sentences.append(
                f"HonestDiD: the result remains significant under "
                f"parallel-trends violations up to {bkd:.2g}x the observed "
                f"pre-period variation."
            )
        elif isinstance(bkd, (int, float)):
            # Round-1 BR/DR canonical-validation (2026-04-19) then
            # tightened per CI review on PR #341 R1:
            # ``breakdown_M`` is the smallest M at which the robust
            # CI includes zero (interpolated between grid points) —
            # not a claim about any specific grid point. Earlier fix
            # keyed off ``bkd <= 0.05`` which incorrectly asserted
            # "smallest grid point fails" even for grids that start
            # at M=0 where the smallest evaluated point is still
            # robust (e.g., grid=[0, 0.25, ...] with bkd=0.03). The
            # "smallest grid point" wording is only accurate when
            # the smallest evaluated M on the grid itself fails
            # (``robust_to_zero == False``); otherwise fall through
            # to the numeric multiplier.
            smallest_failed_m = _smallest_failing_grid_m(sens)
            if smallest_failed_m is not None:
                sentences.append(
                    "HonestDiD: the result is fragile — the confidence "
                    "interval includes zero even at the smallest M "
                    f"evaluated on the sensitivity grid (M = "
                    f"{smallest_failed_m:.2g})."
                )
            else:
                sentences.append(
                    f"HonestDiD: the result is fragile — the confidence "
                    f"interval includes zero once violations reach {bkd:.2g}x "
                    f"the pre-period variation."
                )

    # Sample sentence. For fits with a dynamic comparison set (CS /
    # ContinuousDiD / StaggeredTripleDiff / EfficientDiD /
    # StackedDiD under ``clean_control in {"not_yet_treated",
    # "strict"}``) the fixed control count is suppressed because the
    # comparison group varies by cohort/sub-experiment; narrate the
    # mode explicitly rather than misreporting a fixed-subset tally as
    # "control" (rounds 13 / 17 / 18 / 22 CI review).
    sample = schema.get("sample", {}) or {}
    # ``schema["estimator"]`` is a dict with ``class_name``; unwrap it
    # for the per-estimator dynamic-control phrasing branch below.
    estimator_block = schema.get("estimator") or {}
    estimator = estimator_block.get("class_name") if isinstance(estimator_block, dict) else None
    n_obs = sample.get("n_obs")
    n_t = sample.get("n_treated")
    n_c = sample.get("n_control")
    n_nt = sample.get("n_never_treated")
    n_ne = sample.get("n_never_enabled")
    is_dynamic = sample.get("dynamic_control")
    cg = sample.get("control_group")
    # Panel-vs-RCS count-unit label. For repeated cross-section fits
    # (``panel=False`` on CallawaySantAnna), treated / never-treated
    # tallies are observation counts, not unit counts. Keep the
    # "N treated" phrasing (the N is still correct), but adjust the
    # never-treated clause so it does not claim "units present in
    # the panel" for an RCS sample.
    count_unit = sample.get("count_unit", "units")
    ne_unit_word = "observations" if count_unit == "observations" else "units"
    # No sample sentence is emitted unless ``n_obs`` is an ``int``.
    if isinstance(n_obs, int):
        if isinstance(n_t, int) and isinstance(n_c, int):
            sentences.append(f"Sample: {n_obs:,} observations ({n_t:,} treated, {n_c:,} control).")
        elif is_dynamic and isinstance(n_t, int):
            # Never-enabled takes precedence over never-treated when
            # both counts are positive.
            if isinstance(n_ne, int) and n_ne > 0:
                subset_clause = f"; {n_ne:,} never-enabled {ne_unit_word} are also present"
            elif isinstance(n_nt, int) and n_nt > 0:
                subset_clause = f"; {n_nt:,} never-treated {ne_unit_word} are also present"
            else:
                subset_clause = ""
            # Estimator-specific dynamic-comparison phrasing. StackedDiD
            # uses sub-experiment-specific clean controls (IC1/IC2
            # trimming) rather than a not-yet-treated rollout; the
            # generic phrasing misstates the identification setup.
            if estimator == "StackedDiDResults":
                cc_label = cg if isinstance(cg, str) else "clean_control"
                n_distinct = sample.get("n_distinct_controls_trimmed")
                distinct_clause = (
                    f" across {n_distinct:,} distinct control units in the trimmed stack"
                    if isinstance(n_distinct, int)
                    else ""
                )
                sentences.append(
                    f"Sample: {n_obs:,} observations ({n_t:,} treated) with a "
                    f"sub-experiment-specific clean-control comparison "
                    f"(``clean_control='{cc_label}'``): each adoption event is "
                    f"compared against the units satisfying the rule relative "
                    f"to that event's window, not a single fixed control "
                    f"group{distinct_clause}{subset_clause}."
                )
            else:
                sentences.append(
                    f"Sample: {n_obs:,} observations ({n_t:,} treated) with a "
                    "dynamic not-yet-treated comparison group (the control set "
                    f"varies by cohort and period){subset_clause}."
                )
        elif (
            estimator == "StaggeredTripleDiffResults"
            and isinstance(n_t, int)
            and isinstance(n_ne, int)
            and n_ne > 0
        ):
            # Round-38 P2 CI review on PR #318: StaggeredTripleDiff
            # under fixed ``control_group="never_treated"`` had the
            # schema moved to ``n_never_enabled`` (round-37) but the
            # renderers fell through to the generic
            # ``Sample: N observations.`` sentence because the
            # ``is_dynamic_control`` branch didn't fire. REGISTRY.md
            # §StaggeredTripleDifference line 1730 names the
            # never-enabled cohort as the valid fixed comparison on
            # this path; the prose must say so.
            sentences.append(
                f"Sample: {n_obs:,} observations ({n_t:,} treated, " f"{n_ne:,} never-enabled)."
            )
        else:
            sentences.append(f"Sample: {n_obs:,} observations.")
    survey = sample.get("survey")
    if survey and not survey.get("is_trivial"):
        deff = survey.get("design_effect")
        eff_n = survey.get("effective_n")
        if isinstance(deff, (int, float)) and isinstance(eff_n, (int, float)):
            # Round-35 P2 CI review on PR #318: ``deff < 0.95`` is a
            # precision-improving design (effective N is LARGER than
            # nominal N). Narrating that as "reduces effective sample
            # size" is directionally wrong. Branch on the sign of
            # the departure from 1.
            if deff < 1.0:
                sentences.append(
                    f"Survey design improves effective sample size to "
                    f"~{eff_n:,.0f} (DEFF = {deff:.2g})."
                )
            else:
                sentences.append(
                    f"Survey design reduces effective sample size to "
                    f"~{eff_n:,.0f} (DEFF = {deff:.2g})."
                )

    # Highest-severity caveat (if any): the first warning-severity entry
    # in list order; info-severity caveats are not quoted in the summary.
    caveats = schema.get("caveats", [])
    warning_caveats = [c for c in caveats if c.get("severity") == "warning"]
    if warning_caveats:
        top = warning_caveats[0]
        sentences.append(f"Caveat: {top.get('message')}")
    return " ".join(s for s in sentences if s)
def _full_report_header_lines(ctx: Dict[str, Any], estimator_name: Optional[str]) -> List[str]:
    """Return the title, optional business question, and estimator lines."""
    lines = [f"# Business Report: {ctx.get('outcome_label', 'Outcome')}", ""]
    if ctx.get("business_question"):
        lines.append(f"**Question**: {ctx['business_question']}")
        lines.append("")
    lines.append(f"**Estimator**: `{estimator_name}`")
    lines.append("")
    return lines


def _full_report_headline_lines(
    schema: Dict[str, Any], headline: Dict[str, Any], ctx: Dict[str, Any]
) -> List[str]:
    """Return the "Headline" section: headline sentence plus significance line."""
    lines = ["## Headline", "", _render_headline_sentence(schema)]
    p = headline.get("p_value")
    alpha = ctx.get("alpha", 0.05)
    # The significance sentence is only rendered when a numeric p-value exists.
    if isinstance(p, (int, float)):
        lines.append("")
        lines.append(f"Statistically, {_significance_phrase(p, alpha)}.")
    lines.append("")
    return lines


def _full_report_target_parameter_lines(tp: Dict[str, Any]) -> List[str]:
    """Return the "Target Parameter" section, or ``[]`` when nothing is set.

    Target parameter (BR/DR gap #6): name what the headline scalar
    represents so the stakeholder can map the number to a specific
    estimand. Rendered between "Headline" and "Identifying Assumption"
    because the target parameter is about what the scalar IS, whereas the
    identifying assumption is about what makes it valid.
    """
    name = tp.get("name")
    definition = tp.get("definition")
    if not (name or definition):
        return []
    lines = ["## Target Parameter", ""]
    if name:
        lines.append(f"- **{name}**")
    if definition:
        lines.append(f"- {definition}")
    lines.append("")
    return lines


def _full_report_assumption_lines(assumption: Dict[str, Any]) -> List[str]:
    """Return the "Identifying Assumption" section with a generic fallback."""
    description = assumption.get("description", "") or "Standard DiD parallel-trends assumption."
    return ["## Identifying Assumption", "", description, ""]


def _full_report_pre_trends_lines(pt: Dict[str, Any]) -> List[str]:
    """Return the "Pre-Trends" section (verdict, power tier, MDV figures)."""
    lines = ["## Pre-Trends", ""]
    if pt.get("status") != "computed":
        lines.append(f"- Pre-trends not computed: {pt.get('reason', 'unavailable')}")
        lines.append("")
        return lines

    jp = pt.get("joint_p_value")
    verdict = pt.get("verdict")
    tier = pt.get("power_tier")
    # Use the method-aware statistic label the summary path already
    # uses: "joint p" for Wald / Bonferroni event-study, "p" for
    # slope-difference / Hausman single-statistic tests, and None
    # for design-enforced SDiD / TROP paths where there is no
    # p-value at all. Round-25 P2 CI review on PR #318 flagged the
    # hard-coded "joint p" wording as misdescribing 2x2 / Hausman
    # fits and inventing a nonexistent p-value for SDiD / TROP.
    stat_label = _pt_method_stat_label(pt.get("method"))
    if stat_label and isinstance(jp, (int, float)):
        lines.append(f"- Verdict: `{verdict}` ({stat_label} = {jp:.3g})")
    elif stat_label:
        lines.append(f"- Verdict: `{verdict}` ({stat_label} unavailable)")
    else:
        lines.append(f"- Verdict: `{verdict}`")
    if tier:
        lines.append(f"- Power tier: `{tier}`")

    mdv = pt.get("mdv")
    max_abs_pre = pt.get("max_abs_pre_violation")
    ratio = pt.get("mdv_share_of_att")
    if isinstance(mdv, (int, float)):
        lines.append(f"- Minimum detectable violation (MDV): {mdv:.3g}")
    if isinstance(max_abs_pre, (int, float)):
        lines.append(f"- Max pre-period level deviation at MDV: {max_abs_pre:.3g}")
    if isinstance(ratio, (int, float)):
        # PR-B R12: ratio is now max_abs_pre_violation / |ATT|, the
        # level-scale comparable to ATT (not raw γ-unit mdv on linear
        # fits). Label updated to match the numerator definition in
        # REPORTING.md "Power-aware phrasing" Note.
        lines.append(f"- Max pre-period level deviation / |ATT|: {ratio:.2g}")
    lines.append("")
    return lines


def _full_report_sensitivity_lines(sens: Dict[str, Any]) -> List[str]:
    """Return the "Sensitivity (HonestDiD)" section.

    A single-M HonestDiDResults passthrough has ``breakdown_M=None`` by
    construction because only one M was evaluated; the "robust across full
    grid" phrasing is reserved for genuine grid-over-M SensitivityResults.
    """
    lines = ["## Sensitivity (HonestDiD)", ""]
    if sens.get("status") != "computed":
        lines.append(f"- Sensitivity not computed: {sens.get('reason', 'unavailable')}")
        lines.append("")
        return lines

    bkd = sens.get("breakdown_M")
    concl = sens.get("conclusion")
    lines.append(f"- Method: `{sens.get('method')}`")
    if concl == "single_M_precomputed":
        grid_points = sens.get("grid") or []
        # Tolerate an empty grid or a None first entry.
        point = (grid_points[0] if grid_points else None) or {}
        m_val = point.get("M")
        robust = point.get("robust_to_zero")
        if isinstance(m_val, (int, float)):
            lines.append(f"- Single point checked: M = {m_val:.3g}")
            lines.append(
                f"- Robust CI at M = {m_val:.3g}: "
                f"{'excludes zero' if robust else 'includes zero'}"
            )
            lines.append(
                "- Run `HonestDiD.sensitivity()` across a grid of M "
                "values to find the breakdown value."
            )
        else:
            lines.append("- Single-M passthrough (breakdown not available)")
    elif isinstance(bkd, (int, float)):
        lines.append(f"- Breakdown M: {bkd:.3g}")
    else:
        lines.append("- Breakdown M: robust across full grid (no breakdown)")
    lines.append(f"- Conclusion: `{concl}`")
    lines.append("")
    return lines


def _full_report_sample_lines(sample: Dict[str, Any], estimator_name: Optional[str]) -> List[str]:
    """Return the "Sample" section: counts, comparison group, survey design.

    ``n_control`` is only populated for estimators whose control set is a
    fixed tally. For dynamic modes (CS / ContinuousDiD /
    StaggeredTripleDiff / EfficientDiD / StackedDiD under
    ``clean_control in {"not_yet_treated", "strict"}``) the comparison
    group is dynamic per cohort/sub-experiment; report the estimator-
    specific fixed subset (``n_never_enabled`` for triple-difference;
    ``n_never_treated`` elsewhere; ``n_distinct_controls_trimmed`` for
    Stacked) when available, then name the dynamic-comparison mode
    explicitly.
    """
    lines = ["## Sample", ""]
    n_obs = sample.get("n_obs")
    n_treated = sample.get("n_treated")
    n_control = sample.get("n_control")
    n_never_enabled = sample.get("n_never_enabled")
    n_never_treated = sample.get("n_never_treated")
    is_dynamic = sample.get("dynamic_control")
    cg = sample.get("control_group")

    if isinstance(n_obs, int):
        lines.append(f"- Observations: {n_obs:,}")
    if isinstance(n_treated, int):
        lines.append(f"- Treated: {n_treated:,}")

    # Panel-vs-RCS count-unit label for the full report. Mirrors the
    # summary path: CallawaySantAnna's ``panel=False`` mode stores
    # counts as observations, not units (round-28 P2).
    md_count_unit = sample.get("count_unit", "units")
    md_ne_unit_word = "observations" if md_count_unit == "observations" else "units"
    md_sample_location = (
        "in the repeated cross-section sample"
        if md_count_unit == "observations"
        else "in the panel"
    )

    has_never_enabled = isinstance(n_never_enabled, int) and n_never_enabled > 0
    has_never_treated = isinstance(n_never_treated, int) and n_never_treated > 0

    if isinstance(n_control, int):
        lines.append(f"- Control: {n_control:,}")
    elif estimator_name == "StaggeredTripleDiffResults" and has_never_enabled and not is_dynamic:
        # Round-38 P2 CI review on PR #318: fixed
        # ``control_group="never_treated"`` on StaggeredTripleDiff
        # clears ``n_control`` (composite total) and populates
        # ``n_never_enabled`` (the valid fixed comparison cohort per
        # REGISTRY.md line 1730). The full report must render that
        # fixed count — the dynamic-control branch below would not
        # fire on this path.
        lines.append(f"- Never-enabled units (fixed comparison cohort): {n_never_enabled:,}")
    elif is_dynamic:
        if has_never_enabled:
            lines.append(
                f"- Never-enabled {md_ne_unit_word} present "
                f"{md_sample_location}: {n_never_enabled:,}"
            )
        elif has_never_treated:
            lines.append(
                f"- Never-treated {md_ne_unit_word} present "
                f"{md_sample_location}: {n_never_treated:,}"
            )
        if estimator_name == "StackedDiDResults":
            n_distinct = sample.get("n_distinct_controls_trimmed")
            if isinstance(n_distinct, int):
                lines.append(f"- Distinct control units in trimmed stack: {n_distinct:,}")
            cc_label = cg if isinstance(cg, str) else "clean_control"
            lines.append(
                "- Comparison group: sub-experiment-specific clean controls "
                f"(``clean_control='{cc_label}'``; each adoption event is "
                "compared against units satisfying the rule relative to that "
                "event's window, not a single fixed control group)"
            )
        else:
            lines.append(
                "- Comparison group: dynamic not-yet-treated units "
                "(varies by cohort and period; no fixed control count)"
            )

    survey = sample.get("survey")
    if survey:
        if survey.get("is_trivial"):
            lines.append("- Survey design: trivial DEFF (~1.0)")
        else:
            deff = survey.get("design_effect")
            eff_n = survey.get("effective_n")
            if isinstance(deff, (int, float)):
                lines.append(f"- Survey DEFF: {deff:.2g}")
            if isinstance(eff_n, (int, float)):
                lines.append(f"- Effective N: {eff_n:,.0f}")
    lines.append("")
    return lines


def _full_report_heterogeneity_lines(het: Any) -> List[str]:
    """Return the "Heterogeneity" section, or ``[]`` when the check did not run.

    Round-32 P2 CI review on PR #318: round-31 changed
    ``_lift_heterogeneity`` to always return a dict (stable schema
    contract), so a plain truthiness guard entered the block on every fit
    and printed ``Source: None``, ``N effects: None``, etc. Gate on the
    ``status`` enum instead.
    """
    if not (isinstance(het, dict) and het.get("status") == "ran"):
        return []
    lines = [
        "## Heterogeneity",
        "",
        f"- Source: `{het.get('source')}`",
        f"- N effects: {het.get('n_effects')}",
    ]
    mn = het.get("min")
    mx = het.get("max")
    if isinstance(mn, (int, float)) and isinstance(mx, (int, float)):
        lines.append(f"- Range: {mn:.3g} to {mx:.3g}")
    cv = het.get("cv")
    if isinstance(cv, (int, float)):
        lines.append(f"- CV: {cv:.3g}")
    lines.append(f"- Sign consistent: {het.get('sign_consistent')}")
    lines.append("")
    return lines


def _full_report_caveat_lines(caveats: List[Dict[str, Any]]) -> List[str]:
    """Return the "Caveats" section, or ``[]`` when there are no caveats."""
    if not caveats:
        return []
    lines = ["## Caveats", ""]
    for caveat in caveats:
        # A missing or None severity falls back to "info" instead of
        # crashing on ``None.upper()``.
        severity = str(caveat.get("severity") or "info").upper()
        lines.append(f"- **{severity}** — {caveat.get('message')}")
    lines.append("")
    return lines


def _full_report_next_steps_lines(next_steps: List[Dict[str, Any]]) -> List[str]:
    """Return the "Next Steps" section, or ``[]`` when there are no steps."""
    if not next_steps:
        return []
    lines = ["## Next Steps", ""]
    for step in next_steps:
        if step.get("label"):
            lines.append(f"- {step['label']}")
            if step.get("why"):
                # Two-space indent so Markdown nests the rationale under
                # its step; a one-space indent renders as a sibling bullet.
                lines.append(f"  - _why_: {step['why']}")
    lines.append("")
    return lines


def _full_report_reference_lines(references: List[Dict[str, Any]]) -> List[str]:
    """Return the "References" section, or ``[]`` when there are none."""
    if not references:
        return []
    lines = ["## References", ""]
    lines.extend(f"- {ref.get('citation')}" for ref in references)
    lines.append("")
    return lines


def _render_full_report(schema: Dict[str, Any]) -> str:
    """Render the structured multi-section markdown report.

    Sections are emitted in a fixed order: title/estimator header,
    Headline, Target Parameter, Identifying Assumption, Pre-Trends,
    Sensitivity, Sample, Heterogeneity, Caveats, Next Steps, References.
    Target Parameter, Heterogeneity, Caveats, Next Steps and References are
    omitted when their schema entries are empty or did not run.

    Parameters
    ----------
    schema : dict
        The report schema. The keys read here are ``context``,
        ``headline``, ``sample``, ``pre_trends``, ``sensitivity``,
        ``assumption``, ``heterogeneity``, ``caveats``, ``references``,
        ``next_steps``, ``target_parameter`` and ``estimator``. A key that
        is missing or explicitly ``None`` is treated as empty.

    Returns
    -------
    str
        The report lines joined with newlines.
    """
    # Normalise every section the same way so an explicit ``None`` value
    # behaves like a missing key instead of raising AttributeError.
    ctx = schema.get("context") or {}
    headline = schema.get("headline") or {}
    sample = schema.get("sample") or {}
    pt = schema.get("pre_trends") or {}
    sens = schema.get("sensitivity") or {}
    assumption = schema.get("assumption") or {}
    tp = schema.get("target_parameter") or {}
    het = schema.get("heterogeneity")
    caveats = schema.get("caveats") or []
    references = schema.get("references") or []
    next_steps = schema.get("next_steps") or []

    # Resolve the estimator class name once; it feeds both the header line
    # and the estimator-specific branches of the Sample section.
    estimator_block = schema.get("estimator") or {}
    estimator_name = (
        estimator_block.get("class_name") if isinstance(estimator_block, dict) else None
    )

    lines: List[str] = []
    lines.extend(_full_report_header_lines(ctx, estimator_name))
    lines.extend(_full_report_headline_lines(schema, headline, ctx))
    lines.extend(_full_report_target_parameter_lines(tp))
    lines.extend(_full_report_assumption_lines(assumption))
    lines.extend(_full_report_pre_trends_lines(pt))
    lines.extend(_full_report_sensitivity_lines(sens))
    lines.extend(_full_report_sample_lines(sample, estimator_name))
    lines.extend(_full_report_heterogeneity_lines(het))
    lines.extend(_full_report_caveat_lines(caveats))
    lines.extend(_full_report_next_steps_lines(next_steps))
    lines.extend(_full_report_reference_lines(references))
    return "\n".join(lines)