Source code for diff_diff.diagnostics

Name: diff-diff
Author: diff-diff contributors
"""
Diagnostic tools for validating Difference-in-Differences assumptions.

This module provides placebo tests and other diagnostic tools for assessing
the validity of the parallel trends assumption in DiD designs.

References
----------
Bertrand, M., Duflo, E., & Mullainathan, S. (2004). How Much Should We Trust
Differences-in-Differences Estimates? The Quarterly Journal of Economics,
119(1), 249-275.
"""

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd

from diff_diff.estimators import DifferenceInDifferences
from diff_diff.results import _get_significance_stars
from diff_diff.results_base import Diagnostic
from diff_diff.utils import safe_inference, validate_binary



[docs]
@dataclass
class PlaceboTestResults(Diagnostic):
    """
    Results from a placebo test for DiD assumption validation.

    Attributes
    ----------
    test_type : str
        Type of placebo test performed.
    placebo_effect : float
        Estimated placebo treatment effect.
    se : float
        Standard error of the placebo effect.
    t_stat : float
        T-statistic for the placebo effect.
    p_value : float
        P-value for testing placebo_effect = 0.
    conf_int : tuple
        Confidence interval for the placebo effect.
    n_obs : int
        Number of observations used in the test.
    is_significant : bool
        Whether the placebo effect is significant at the ``alpha`` level.
    alpha : float, default=0.05
        Significance level the test was run at. ``summary()`` uses it for the
        confidence-level label and for the interpretation text.
    original_effect : float, optional
        Original ATT estimate for comparison.
    original_se : float, optional
        Original SE for comparison.
    permutation_distribution : np.ndarray, optional
        Distribution of permuted effects (for permutation test).
    leave_one_out_effects : dict, optional
        Unit-specific effects (for leave-one-out test). Values may be NaN for
        units whose re-estimation failed.
    fake_period : any, optional
        The fake treatment period used (for timing test).
    fake_group : list, optional
        The fake treatment group used (for group test).
    n_permutations : int, optional
        Number of permutations behind the result (for permutation test).
    """

    test_type: str
    placebo_effect: float
    se: float
    t_stat: float
    p_value: float
    conf_int: Tuple[float, float]
    n_obs: int
    is_significant: bool
    alpha: float = 0.05

    # Optional fields for specific test types
    original_effect: Optional[float] = None
    original_se: Optional[float] = None
    permutation_distribution: Optional[np.ndarray] = field(default=None, repr=False)
    leave_one_out_effects: Optional[Dict[Any, float]] = field(default=None)
    fake_period: Optional[Any] = None
    fake_group: Optional[List[Any]] = field(default=None)
    n_permutations: Optional[int] = None

    @property
    def significance_stars(self) -> str:
        """Return significance stars based on p-value."""
        return _get_significance_stars(self.p_value)

    def summary(self) -> str:
        """
        Generate formatted summary of placebo test results.

        Returns
        -------
        str
            Multi-line report: the core estimates, then any test-specific
            sections whose fields are set, then an interpretation that quotes
            the ``alpha`` the test was actually run at.
        """
        # ``:g`` renders the default as "95" / "0.05" (same text as before) but
        # does not truncate fractional levels: alpha=0.005 gives "99.5% CI",
        # where int((1 - alpha) * 100) printed "99% CI".
        conf_level = f"{(1 - self.alpha) * 100:g}"
        alpha_label = f"{self.alpha:g}"

        lines = [
            "=" * 65,
            f"Placebo Test Results: {self.test_type}".center(65),
            "=" * 65,
            "",
            f"{'Placebo effect:':<25} {self.placebo_effect:>12.4f}",
            f"{'Standard error:':<25} {self.se:>12.4f}",
            f"{'T-statistic:':<25} {self.t_stat:>12.4f}",
            f"{'P-value:':<25} {self.p_value:>12.4f}",
            f"{conf_level}% CI: [{self.conf_int[0]:.4f}, {self.conf_int[1]:.4f}]",
            "",
            f"{'Observations:':<25} {self.n_obs:>12}",
        ]

        if self.original_effect is not None:
            lines.extend(
                [
                    "",
                    "-" * 65,
                    "Comparison with Original Estimate".center(65),
                    "-" * 65,
                    f"{'Original ATT:':<25} {self.original_effect:>12.4f}",
                ]
            )
            if self.original_se is not None:
                lines.append(f"{'Original SE:':<25} {self.original_se:>12.4f}")

        if self.n_permutations is not None:
            lines.append(f"{'Number of permutations:':<25} {self.n_permutations:>12}")

        if self.fake_period is not None:
            lines.append(f"{'Fake treatment period:':<25} {str(self.fake_period):>12}")

        if self.leave_one_out_effects is not None:
            n_units = len(self.leave_one_out_effects)
            effects = np.asarray(list(self.leave_one_out_effects.values()), dtype=float)
            # Failed re-estimations are stored as NaN; leaving them in would
            # turn every statistic below into NaN, so summarize the valid ones.
            valid = effects[~np.isnan(effects)]
            n_failed = n_units - valid.size
            mean_effect = np.mean(valid) if valid.size else np.nan
            # A sample standard deviation needs at least two estimates.
            std_effect = np.std(valid, ddof=1) if valid.size > 1 else np.nan
            min_effect = np.min(valid) if valid.size else np.nan
            max_effect = np.max(valid) if valid.size else np.nan

            lines.extend(
                [
                    "",
                    "-" * 65,
                    "Leave-One-Out Summary".center(65),
                    "-" * 65,
                    f"{'Units analyzed:':<25} {n_units:>12}",
                ]
            )
            if n_failed:
                lines.append(f"{'Units failed:':<25} {n_failed:>12}")
            lines.extend(
                [
                    f"{'Mean effect:':<25} {mean_effect:>12.4f}",
                    f"{'Std. dev.:':<25} {std_effect:>12.4f}",
                    f"{'Min effect:':<25} {min_effect:>12.4f}",
                    f"{'Max effect:':<25} {max_effect:>12.4f}",
                ]
            )

        # Interpretation
        lines.extend(
            [
                "",
                "-" * 65,
                "Interpretation".center(65),
                "-" * 65,
            ]
        )

        # Quote the level the test was run at rather than a fixed 0.05, so the
        # text cannot contradict ``is_significant`` when alpha differs.
        if self.is_significant:
            lines.append(f"WARNING: Significant placebo effect detected (p < {alpha_label}).")
            lines.append("This suggests potential violations of the parallel trends assumption.")
        else:
            lines.append(f"No significant placebo effect detected (p >= {alpha_label}).")
            lines.append("This is consistent with the parallel trends assumption.")

        lines.append("=" * 65)

        return "\n".join(lines)

    def print_summary(self) -> None:
        """Print summary to stdout."""
        print(self.summary())

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert results to a dictionary.

        The core estimates are always present (the confidence interval is
        split into ``conf_int_lower`` / ``conf_int_upper``). ``original_effect``,
        ``original_se`` and ``n_permutations`` are included only when set; the
        other test-specific fields are not exported.
        """
        result = {
            "test_type": self.test_type,
            "placebo_effect": self.placebo_effect,
            "se": self.se,
            "t_stat": self.t_stat,
            "p_value": self.p_value,
            "conf_int_lower": self.conf_int[0],
            "conf_int_upper": self.conf_int[1],
            "n_obs": self.n_obs,
            "is_significant": self.is_significant,
        }

        if self.original_effect is not None:
            result["original_effect"] = self.original_effect
        if self.original_se is not None:
            result["original_se"] = self.original_se
        if self.n_permutations is not None:
            result["n_permutations"] = self.n_permutations

        return result

    def to_dataframe(self) -> pd.DataFrame:
        """Convert results to a single-row DataFrame built from ``to_dict()``."""
        return pd.DataFrame([self.to_dict()])





[docs]
def run_placebo_test(
    data: pd.DataFrame,
    outcome: str,
    treatment: str,
    time: str,
    unit: Optional[str] = None,
    test_type: str = "fake_timing",
    fake_treatment_period: Optional[Any] = None,
    fake_treatment_group: Optional[List[Any]] = None,
    post_periods: Optional[List[Any]] = None,
    n_permutations: int = 1000,
    alpha: float = 0.05,
    seed: Optional[int] = None,
    **estimator_kwargs,
) -> PlaceboTestResults:
    """
    Run a placebo test to validate DiD assumptions.

    Placebo tests provide evidence on the validity of the parallel trends
    assumption by testing whether "fake" treatments produce significant effects.
    A significant placebo effect suggests the parallel trends assumption may
    be violated.

    Parameters
    ----------
    data : pd.DataFrame
        Panel data for DiD analysis.
    outcome : str
        Name of outcome variable column.
    treatment : str
        Name of treatment indicator column (0/1).
    time : str
        Name of time period column.
    unit : str, optional
        Name of unit identifier column. Required for the "fake_group",
        "permutation" and "leave_one_out" test types.
    test_type : str, default="fake_timing"
        Type of placebo test (matched case-insensitively):
        - "fake_timing": Assign treatment at a fake (earlier) time period
        - "fake_group": Designate control units as "fake treated" (real-treated
          units, per the ``treatment`` column, are dropped first)
        - "permutation": Randomly reassign treatment and compute distribution
        - "leave_one_out": Drop each treated unit and re-estimate
    fake_treatment_period : any, optional
        For "fake_timing": The fake treatment period to test (required for
        that test type). Should be a pre-treatment period.
    fake_treatment_group : list, optional
        For "fake_group": List of control unit IDs to designate as fake treated
        (required for that test type).
    post_periods : list, optional
        List of post-treatment periods, used by "fake_timing" and "fake_group".
        When None, those tests treat the second half of the sorted periods as
        post-treatment.
    n_permutations : int, default=1000
        For "permutation": Number of random treatment assignments.
    alpha : float, default=0.05
        Significance level.
    seed : int, optional
        Random seed for reproducibility (used by "permutation").
    **estimator_kwargs
        Additional arguments passed to the DiD estimator.

    Returns
    -------
    PlaceboTestResults
        Object containing placebo effect estimates, p-values, and diagnostics.

    Raises
    ------
    ValueError
        If ``test_type`` is not recognized, or an argument required by the
        selected test type is missing.

    Examples
    --------
    Fake timing test:

    >>> results = run_placebo_test(
    ...     data, outcome='sales', treatment='treated', time='period',
    ...     test_type='fake_timing',
    ...     fake_treatment_period=1,  # Pre-treatment period
    ...     post_periods=[2, 3, 4]
    ... )
    >>> if results.is_significant:
    ...     print("Warning: Pre-treatment differential trends detected!")

    Permutation test:

    >>> results = run_placebo_test(
    ...     data, outcome='sales', treatment='treated', time='period',
    ...     unit='unit_id',
    ...     test_type='permutation',
    ...     n_permutations=1000,
    ...     seed=42
    ... )
    >>> print(f"Permutation p-value: {results.p_value:.4f}")

    References
    ----------
    Bertrand, M., Duflo, E., & Mullainathan, S. (2004). How Much Should
    We Trust Differences-in-Differences Estimates? The Quarterly Journal
    of Economics, 119(1), 249-275.
    """
    test_type = test_type.lower()
    valid_types = ["fake_timing", "fake_group", "permutation", "leave_one_out"]

    if test_type not in valid_types:
        raise ValueError(f"test_type must be one of {valid_types}, got '{test_type}'")

    if test_type == "fake_timing":
        # Fail with a clear message here, like the other branches do, instead
        # of letting None reach the period comparison inside the timing test.
        if fake_treatment_period is None:
            raise ValueError("fake_treatment_period is required for fake_timing test")
        return placebo_timing_test(
            data=data,
            outcome=outcome,
            treatment=treatment,
            time=time,
            fake_treatment_period=fake_treatment_period,
            post_periods=post_periods,
            alpha=alpha,
            **estimator_kwargs,
        )

    # Every remaining test type works at the unit level.
    if unit is None:
        raise ValueError(f"unit is required for {test_type} test")

    if test_type == "fake_group":
        if fake_treatment_group is None or len(fake_treatment_group) == 0:
            raise ValueError("fake_treatment_group is required for fake_group test")
        return placebo_group_test(
            data=data,
            outcome=outcome,
            time=time,
            unit=unit,
            fake_treated_units=fake_treatment_group,
            post_periods=post_periods,
            alpha=alpha,
            treatment=treatment,
            **estimator_kwargs,
        )

    if test_type == "permutation":
        return permutation_test(
            data=data,
            outcome=outcome,
            treatment=treatment,
            time=time,
            unit=unit,
            n_permutations=n_permutations,
            alpha=alpha,
            seed=seed,
            **estimator_kwargs,
        )

    # Only "leave_one_out" is left: test_type was validated against
    # valid_types above and the other three returned already.
    return leave_one_out_test(
        data=data,
        outcome=outcome,
        treatment=treatment,
        time=time,
        unit=unit,
        alpha=alpha,
        **estimator_kwargs,
    )




[docs]
def placebo_timing_test(
    data: pd.DataFrame,
    outcome: str,
    treatment: str,
    time: str,
    fake_treatment_period: Any,
    post_periods: Optional[List[Any]] = None,
    alpha: float = 0.05,
    **estimator_kwargs,
) -> PlaceboTestResults:
    """
    Test for pre-treatment effects by moving treatment timing earlier.

    Creates a fake "post" indicator using pre-treatment data only, then
    estimates a DiD model. A significant effect suggests pre-existing
    differential trends.

    Parameters
    ----------
    data : pd.DataFrame
        Panel data.
    outcome : str
        Outcome variable column.
    treatment : str
        Treatment indicator column.
    time : str
        Time period column.
    fake_treatment_period : any
        Period to use as fake treatment timing (should be a pre-treatment
        period). Pre-treatment rows with ``time >= fake_treatment_period`` are
        flagged as fake-post, so it must be later than the earliest
        pre-treatment period for both groups to exist.
    post_periods : list, optional
        List of actual post-treatment periods. If None, the second half of the
        sorted periods found in ``data`` is used.
    alpha : float, default=0.05
        Significance level.
    **estimator_kwargs
        Arguments passed to DifferenceInDifferences.

    Returns
    -------
    PlaceboTestResults
        Results of the fake timing placebo test. ``original_effect`` and
        ``original_se`` come from a second fit on the full data using the real
        ``post_periods``.

    Raises
    ------
    ValueError
        If ``fake_treatment_period`` is None, is one of ``post_periods``, or
        does not split the pre-treatment rows into both a fake-pre and a
        fake-post group.
    """
    if fake_treatment_period is None:
        raise ValueError("fake_treatment_period is required for the fake timing test")

    all_periods = sorted(data[time].unique())

    # Infer post periods if not provided: second half of the sorted periods
    if post_periods is None:
        mid = len(all_periods) // 2
        post_periods = all_periods[mid:]

    # Validate fake_treatment_period is pre-treatment
    if fake_treatment_period in post_periods:
        raise ValueError(
            f"fake_treatment_period ({fake_treatment_period}) must be a "
            f"pre-treatment period, not in post_periods ({post_periods})"
        )

    # Use only pre-treatment data
    pre_periods = [p for p in all_periods if p not in post_periods]
    pre_data = data[data[time].isin(pre_periods)].copy()

    # Create fake post indicator
    pre_data["_fake_post"] = (pre_data[time] >= fake_treatment_period).astype(int)

    # Guard degenerate designs before they reach the estimator: with no
    # pre-treatment rows, or with every row on the same side of the fake cutoff,
    # there is no before/after contrast to estimate.
    if pre_data["_fake_post"].nunique() < 2:
        raise ValueError(
            f"fake_treatment_period ({fake_treatment_period}) does not split the "
            f"pre-treatment periods ({pre_periods}) into fake-pre and fake-post "
            f"groups. It must be greater than the earliest and no greater than "
            f"the latest pre-treatment period."
        )

    # Fit DiD on pre-treatment data with fake post
    did = DifferenceInDifferences(**estimator_kwargs)
    results = did.fit(pre_data, outcome=outcome, treatment=treatment, time="_fake_post")

    # Also fit on full data for comparison
    data_with_post = data.copy()
    data_with_post["_post"] = data_with_post[time].isin(post_periods).astype(int)
    did_full = DifferenceInDifferences(**estimator_kwargs)
    results_full = did_full.fit(data_with_post, outcome=outcome, treatment=treatment, time="_post")

    return PlaceboTestResults(
        test_type="fake_timing",
        placebo_effect=results.att,
        se=results.se,
        t_stat=results.t_stat,
        p_value=results.p_value,
        conf_int=results.conf_int,
        n_obs=results.n_obs,
        is_significant=bool(results.p_value < alpha),
        alpha=alpha,
        original_effect=results_full.att,
        original_se=results_full.se,
        fake_period=fake_treatment_period,
    )




[docs]
def placebo_group_test(
    data: pd.DataFrame,
    outcome: str,
    time: str,
    unit: str,
    fake_treated_units: List[Any],
    post_periods: Optional[List[Any]] = None,
    alpha: float = 0.05,
    treatment: Optional[str] = None,
    **estimator_kwargs,
) -> PlaceboTestResults:
    """
    Run a fake-group placebo: pretend some control units were treated.

    The units in ``fake_treated_units`` are flagged as treated and a DiD is
    estimated on the resulting panel. A significant "effect" points to
    heterogeneous trends among controls, which is a warning sign for the
    parallel trends assumption.

    When ``treatment`` names the real treatment column, every unit that is
    ever really treated is removed first, so the comparison is made among
    never-treated units only. When ``treatment`` is ``None`` nothing is
    filtered and it is up to the caller to supply control-only data.

    Parameters
    ----------
    data : pd.DataFrame
        Panel data.
    outcome : str
        Outcome variable column.
    time : str
        Time period column.
    unit : str
        Unit identifier column.
    fake_treated_units : list
        IDs of the control units to label as "fake treated".
    post_periods : list, optional
        Post-treatment period values. If None, the second half of the sorted
        periods present after any filtering is used.
    alpha : float, default=0.05
        Significance level.
    treatment : str, optional
        Real treatment-indicator column. If given, units whose per-unit
        maximum of this column equals 1 are dropped before the placebo is run.
        If ``None`` (default), no rows are dropped.
    **estimator_kwargs
        Arguments passed to DifferenceInDifferences.

    Returns
    -------
    PlaceboTestResults
        Results of the fake group placebo test. ``fake_group`` lists the
        requested units that were actually present in the estimation sample,
        in the caller's order.

    Raises
    ------
    ValueError
        If ``fake_treated_units`` is empty, the ``treatment`` column is missing
        or contains missing values, or no fake-treated or no comparison
        observations remain.
    """
    if fake_treated_units is None or not len(fake_treated_units):
        raise ValueError("fake_treated_units must be a non-empty list")

    panel = data.copy()

    # With a real treatment column available, keep never-treated units only so
    # the real treatment effect cannot leak into the placebo estimate.
    if treatment is not None:
        # Refuse to continue on a missing column or missing values: either
        # would quietly defeat the ever-treated filter below.
        if treatment not in panel.columns:
            raise ValueError(f"treatment column '{treatment}' not found in data")
        real_flag = panel[treatment]
        if real_flag.isna().any():
            raise ValueError(f"treatment column '{treatment}' contains missing values")
        validate_binary(real_flag.to_numpy(), "treatment")

        per_unit_max = panel.groupby(unit)[treatment].max()
        really_treated = set(per_unit_max[per_unit_max == 1].index)

        overlap = [u for u in fake_treated_units if u in really_treated]
        if overlap:
            import warnings

            warnings.warn(
                f"{len(overlap)} of fake_treated_units are themselves ever real-treated "
                f"and will be dropped with the other real-treated units: {overlap}. "
                f"Pass only never-treated units as fake_treated_units for a valid placebo.",
                UserWarning,
                stacklevel=2,
            )

        keep_rows = ~panel[unit].isin(really_treated)
        panel = panel[keep_rows].copy()

    period_values = sorted(panel[time].unique())

    # Default split: the later half of the observed periods counts as "post".
    if post_periods is None:
        post_periods = period_values[len(period_values) // 2 :]

    # Placebo design columns.
    in_fake_group = panel[unit].isin(fake_treated_units)
    panel["_fake_treated"] = in_fake_group.astype(int)
    panel["_post"] = panel[time].isin(post_periods).astype(int)

    # Stop early on designs with an empty arm instead of letting the estimator
    # fail with a far less readable linear-algebra error.
    if not in_fake_group.any():
        raise ValueError(
            "No fake-treated observations remain (all fake_treated_units were "
            "dropped as real-treated, or are absent from the data). Pass "
            "never-treated units as fake_treated_units."
        )
    if in_fake_group.all():
        raise ValueError("No control (non-fake-treated) units remain for the placebo comparison.")

    estimator = DifferenceInDifferences(**estimator_kwargs)
    fit = estimator.fit(panel, outcome=outcome, treatment="_fake_treated", time="_post")

    # Report the fake-treated units that made it into the sample, keeping the
    # caller's ordering (no sorting, which could fail on mixed-type IDs).
    present = set(panel.loc[panel["_fake_treated"] == 1, unit].unique())
    fake_units_used = [u for u in fake_treated_units if u in present]

    return PlaceboTestResults(
        test_type="fake_group",
        placebo_effect=fit.att,
        se=fit.se,
        t_stat=fit.t_stat,
        p_value=fit.p_value,
        conf_int=fit.conf_int,
        n_obs=fit.n_obs,
        is_significant=bool(fit.p_value < alpha),
        alpha=alpha,
        fake_group=fake_units_used,
    )




[docs]
def permutation_test(
    data: pd.DataFrame,
    outcome: str,
    treatment: str,
    time: str,
    unit: str,
    n_permutations: int = 1000,
    alpha: float = 0.05,
    seed: Optional[int] = None,
    **estimator_kwargs,
) -> PlaceboTestResults:
    """
    Compute permutation-based p-value for DiD estimate.

    Randomly reassigns treatment status at the unit level and computes the
    DiD estimate for each permutation. The p-value is the randomization-inference
    value ``(1 + count) / (B + 1)`` (Phipson & Smyth 2010), where ``count`` is the
    number of permuted estimates at least as extreme as the observed and ``B`` is
    the number of valid permutations. With ``B`` sampled permutations this is a
    Monte-Carlo approximation that converges to the exact full-enumeration value
    ``count / total`` as ``B`` grows.

    Parameters
    ----------
    data : pd.DataFrame
        Panel data.
    outcome : str
        Outcome variable column.
    treatment : str
        Treatment indicator column.
    time : str
        Time period column.
    unit : str
        Unit identifier column.
    n_permutations : int, default=1000
        Number of random permutations. Must be at least 1.
    alpha : float, default=0.05
        Significance level.
    seed : int, optional
        Random seed for reproducibility.
    **estimator_kwargs
        Arguments passed to DifferenceInDifferences.

    Returns
    -------
    PlaceboTestResults
        Results with permutation distribution and p-value. ``placebo_effect``
        is the mean of the permuted estimates, ``se`` their sample standard
        deviation, and ``conf_int`` their ``alpha/2`` and ``1 - alpha/2``
        percentiles. ``n_permutations`` on the result counts only the
        permutations that produced a usable estimate. If the original
        estimate is not finite, ``p_value`` is NaN and ``is_significant`` is
        False.

    Raises
    ------
    ValueError
        If ``n_permutations`` is smaller than 1.
    RuntimeError
        If no permutation produced a usable estimate.

    Notes
    -----
    This is a randomization-inference (permutation) test of the sharp null of no
    effect for any unit; it does not rely on asymptotic approximations. Treatment
    assignments are drawn independently each iteration (Monte-Carlo sampling *with
    replacement* from the assignment space), so the reported p-value
    ``(1 + count) / (B + 1)`` (Phipson & Smyth 2010) is a **valid but slightly
    conservative** estimator -- the ``+1`` adds the observed assignment and
    prevents a zero p-value. Here ``count`` is the number of permutations at least
    as extreme as the observed estimate and ``B`` is the number of valid
    permutations. As ``B`` grows it converges to the *exact* p-value obtained by
    full enumeration of all assignments (the R-parity reference). "Exact" is
    reserved for that full enumeration; the sampled value approximates it.
    """
    # Reject a non-positive count before doing any fitting; otherwise the run
    # would end in a misleading "All 0 permutations failed" error.
    if n_permutations < 1:
        raise ValueError(f"n_permutations must be a positive integer, got {n_permutations}")

    rng = np.random.default_rng(seed)

    # First, fit original model
    did = DifferenceInDifferences(**estimator_kwargs)
    original_results = did.fit(data, outcome=outcome, treatment=treatment, time=time)
    original_att = original_results.att

    # Unit-level treatment assignment. A unit counts as treated if it is ever
    # flagged, matching how the fake-group and leave-one-out tests in this
    # module identify treated units (identical to taking the first row whenever
    # the indicator is constant within a unit).
    unit_treatment = data.groupby(unit)[treatment].max().reset_index()
    units = unit_treatment[unit].values
    n_treated = int(unit_treatment[treatment].sum())

    # Permutation loop
    permuted_effects = np.zeros(n_permutations)

    for i in range(n_permutations):
        # Randomly assign treatment to units, keeping the treated count fixed
        perm_treated_units = rng.choice(units, size=n_treated, replace=False)

        # Create permuted data
        perm_data = data.copy()
        perm_data["_perm_treatment"] = perm_data[unit].isin(perm_treated_units).astype(int)

        # Fit DiD
        try:
            perm_did = DifferenceInDifferences(**estimator_kwargs)
            perm_results = perm_did.fit(
                perm_data, outcome=outcome, treatment="_perm_treatment", time=time
            )
            permuted_effects[i] = perm_results.att
        except (ValueError, KeyError, np.linalg.LinAlgError):
            # Handle edge cases where fitting fails
            permuted_effects[i] = np.nan

    # Remove any NaN values and track failure rate
    valid_effects = permuted_effects[~np.isnan(permuted_effects)]
    n_failed = n_permutations - len(valid_effects)

    if len(valid_effects) == 0:
        raise RuntimeError(
            f"All {n_permutations} permutations failed. This typically occurs when:\n"
            f"  - Treatment/control groups are too small for valid permutation\n"
            f"  - Data contains collinearity or singular matrices after permutation\n"
            f"  - There are too few observations per time period\n"
            f"Consider checking data quality with validate_did_data() from diff_diff.prep."
        )

    # Warn if significant number of permutations failed
    if n_failed > 0:
        failure_rate = n_failed / n_permutations
        if failure_rate > 0.1:
            import warnings

            warnings.warn(
                f"{n_failed}/{n_permutations} permutations failed ({failure_rate:.1%}). "
                f"Results based on {len(valid_effects)} successful permutations.",
                UserWarning,
                stacklevel=2,
            )

    # Randomization-inference p-value (Phipson & Smyth 2010): include the observed
    # statistic in both numerator and denominator. The 1/(B+1) floor is intrinsic
    # (count == 0 -> 1/(B+1)), so no separate clamp is needed. With sampled
    # permutations this converges to the exact full-enumeration value count/total.
    if np.isfinite(original_att):
        count = int(np.sum(np.abs(valid_effects) >= np.abs(original_att)))
        p_value = (1 + count) / (len(valid_effects) + 1)
    else:
        # Every comparison against NaN is False, so count would be 0 and the
        # p-value would hit its minimum, flagging an undefined estimate as
        # significant. Propagate NaN instead.
        p_value = np.nan

    # Compute SE and CI from permutation distribution. A sample standard
    # deviation needs at least two draws.
    se = np.std(valid_effects, ddof=1) if len(valid_effects) > 1 else np.nan
    ci_lower = np.percentile(valid_effects, alpha / 2 * 100)
    ci_upper = np.percentile(valid_effects, (1 - alpha / 2) * 100)

    # NOTE: Not using safe_inference — p_value is permutation-based, CI is percentile-based.
    t_stat = original_att / se if np.isfinite(se) and se > 0 else np.nan

    return PlaceboTestResults(
        test_type="permutation",
        placebo_effect=np.mean(valid_effects),  # Mean of null distribution
        se=se,
        t_stat=t_stat,
        p_value=p_value,
        conf_int=(ci_lower, ci_upper),
        n_obs=len(data),
        is_significant=bool(p_value < alpha),
        alpha=alpha,
        original_effect=original_att,
        original_se=original_results.se,
        permutation_distribution=valid_effects,
        n_permutations=len(valid_effects),
    )




[docs]
def leave_one_out_test(
    data: pd.DataFrame,
    outcome: str,
    treatment: str,
    time: str,
    unit: str,
    alpha: float = 0.05,
    **estimator_kwargs,
) -> PlaceboTestResults:
    """
    Check how much the estimate depends on any single treated unit.

    Each treated unit is removed in turn and the DiD model is re-estimated on
    the remaining data. A wide spread across these re-estimates indicates that
    the headline result leans on one unit.

    Parameters
    ----------
    data : pd.DataFrame
        Panel data.
    outcome : str
        Outcome variable column.
    treatment : str
        Treatment indicator column.
    time : str
        Time period column.
    unit : str
        Unit identifier column.
    alpha : float, default=0.05
        Significance level.
    **estimator_kwargs
        Arguments passed to DifferenceInDifferences.

    Returns
    -------
    PlaceboTestResults
        Results with leave_one_out_effects dict mapping unit -> ATT estimate
        (NaN for units whose re-estimation raised). ``placebo_effect`` is the
        mean of the valid re-estimates and ``se`` their sample standard
        deviation.

    Raises
    ------
    RuntimeError
        If no leave-one-out re-estimate succeeded.
    """
    # Baseline fit on the complete data, reported alongside the LOO results.
    baseline = DifferenceInDifferences(**estimator_kwargs).fit(
        data, outcome=outcome, treatment=treatment, time=time
    )
    original_att = baseline.att

    treated_ids = data.loc[data[treatment] == 1, unit].unique()

    loo_effects = {}
    for dropped in treated_ids:
        remaining = data[data[unit] != dropped].copy()

        # Nothing to estimate once the last treated observations are gone.
        if remaining[treatment].sum() == 0:
            continue

        try:
            refit = DifferenceInDifferences(**estimator_kwargs).fit(
                remaining, outcome=outcome, treatment=treatment, time=time
            )
        except (ValueError, KeyError, np.linalg.LinAlgError):
            # Record the failure so it can be counted and reported below.
            loo_effects[dropped] = np.nan
        else:
            loo_effects[dropped] = refit.att

    # Separate usable estimates from failed ones.
    valid_effects = [est for est in loo_effects.values() if not np.isnan(est)]
    n_valid = len(valid_effects)
    n_total = len(loo_effects)
    n_failed = n_total - n_valid

    if n_valid == 0:
        raise RuntimeError(
            f"All {n_total} leave-one-out estimates failed. This typically occurs when:\n"
            f"  - Removing any single treated unit causes model fitting to fail\n"
            f"  - Very few treated units (need at least 2 for LOO)\n"
            f"  - Data has collinearity issues that manifest when units are removed\n"
            f"Consider checking data quality and ensuring sufficient treated units."
        )

    # Any failed iteration is surfaced to the caller.
    if n_failed > 0:
        import warnings

        failed_units = [uid for uid, est in loo_effects.items() if np.isnan(est)]
        warnings.warn(
            f"{n_failed}/{n_total} leave-one-out estimates failed for units: {failed_units}. "
            f"Results based on {n_valid} successful iterations.",
            UserWarning,
            stacklevel=2,
        )

    # Summarize the distribution of leave-one-out estimates.
    mean_effect = np.mean(valid_effects)
    if n_valid > 1:
        se = np.std(valid_effects, ddof=1)
        df = n_valid - 1
    else:
        # A single estimate has no spread to measure.
        se = np.nan
        df = 1
    t_stat, p_value, conf_int = safe_inference(mean_effect, se, alpha=alpha, df=df)

    return PlaceboTestResults(
        test_type="leave_one_out",
        placebo_effect=mean_effect,
        se=se,
        t_stat=t_stat,
        p_value=p_value,
        conf_int=conf_int,
        n_obs=len(data),
        is_significant=bool(p_value < alpha),
        alpha=alpha,
        original_effect=original_att,
        original_se=baseline.se,
        leave_one_out_effects=loo_effects,
    )




[docs]
def run_all_placebo_tests(
    data: pd.DataFrame,
    outcome: str,
    treatment: str,
    time: str,
    unit: str,
    pre_periods: List[Any],
    post_periods: List[Any],
    n_permutations: int = 500,
    alpha: float = 0.05,
    seed: Optional[int] = None,
    **estimator_kwargs,
) -> Dict[str, Union[PlaceboTestResults, Dict[str, str]]]:
    """
    Run the full battery of placebo tests in one call.

    Performs one fake timing test per pre-period (skipping the first), then a
    permutation test, then a leave-one-out sensitivity analysis. A test that
    raises does not abort the run: its slot holds a dict describing the error
    instead of a result object.

    Parameters
    ----------
    data : pd.DataFrame
        Panel data.
    outcome : str
        Outcome variable column.
    treatment : str
        Treatment indicator column.
    time : str
        Time period column.
    unit : str
        Unit identifier column.
    pre_periods : list
        List of pre-treatment periods. Every entry after the first is used as
        a fake treatment period.
    post_periods : list
        List of post-treatment periods.
    n_permutations : int, default=500
        Permutations for permutation test.
    alpha : float, default=0.05
        Significance level.
    seed : int, optional
        Random seed.
    **estimator_kwargs
        Arguments passed to estimators.

    Returns
    -------
    dict
        Dictionary mapping test names to PlaceboTestResults, or to an error
        dict with keys "error", "error_type" and "test_type" (plus "period"
        for fake timing tests) when that test raised.
        Keys: "fake_timing_{period}", "permutation", "leave_one_out"
    """

    def describe_failure(exc, kind, **extra):
        # Structured record of a failed test, kept for debugging.
        return {
            "error": str(exc),
            "error_type": type(exc).__name__,
            "test_type": kind,
            **extra,
        }

    # Arguments every test receives.
    shared = {
        "data": data,
        "outcome": outcome,
        "treatment": treatment,
        "time": time,
        "alpha": alpha,
        **estimator_kwargs,
    }

    results = {}

    # One fake timing test per pre-period after the first.
    for period in pre_periods[1:]:
        key = f"fake_timing_{period}"
        try:
            results[key] = placebo_timing_test(
                fake_treatment_period=period,
                post_periods=post_periods,
                **shared,
            )
        except Exception as e:
            results[key] = describe_failure(e, "fake_timing", period=period)

    # Permutation test
    try:
        results["permutation"] = permutation_test(
            unit=unit,
            n_permutations=n_permutations,
            seed=seed,
            **shared,
        )
    except Exception as e:
        results["permutation"] = describe_failure(e, "permutation")

    # Leave-one-out test
    try:
        results["leave_one_out"] = leave_one_out_test(unit=unit, **shared)
    except Exception as e:
        results["leave_one_out"] = describe_failure(e, "leave_one_out")

    return results