Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion TODO.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ Consolidation opportunities for cleaner maintenance:

| Duplicate Code | Locations | Notes |
|---------------|-----------|-------|
| Within-transformation logic | `estimators.py:217-232`, `estimators.py:787-833`, `bacon.py:567-642` | Extract to utils.py |
| ~~Within-transformation logic~~ | ~~Multiple files~~ | ✅ Extracted to `utils.py` as `demean_by_group()` and `within_transform()` (v2.0.1) |
| Linear regression helper | `staggered.py:205-240`, `estimators.py:366-408` | Consider consolidation |

### Large Module Files
Expand Down
2 changes: 1 addition & 1 deletion diff_diff/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@
plot_sensitivity,
)

__version__ = "2.0.0"
__version__ = "2.0.1"
__all__ = [
# Estimators
"DifferenceInDifferences",
Expand Down
72 changes: 12 additions & 60 deletions diff_diff/bacon.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
import numpy as np
import pandas as pd

from diff_diff.utils import within_transform as _within_transform_util


@dataclass
class Comparison2x2:
Expand Down Expand Up @@ -573,66 +575,16 @@ def _compute_twfe(
treat_col: str = '__bacon_treated_internal__',
) -> float:
"""Compute TWFE estimate using within-transformation."""
# Demean by unit and time
y = df[outcome].values
d = df[treat_col].astype(float).values

# Create unit and time dummies for demeaning
units = df[unit].values
times = df[time].values

# Unit means
unit_map = {u: i for i, u in enumerate(df[unit].unique())}
unit_idx = np.array([unit_map[u] for u in units])
n_units = len(unit_map)

# Time means
time_map = {t: i for i, t in enumerate(df[time].unique())}
time_idx = np.array([time_map[t] for t in times])
n_times = len(time_map)

# Compute means
y_unit_mean = np.zeros(n_units)
d_unit_mean = np.zeros(n_units)
unit_counts = np.zeros(n_units)

for i in range(len(y)):
u = unit_idx[i]
y_unit_mean[u] += y[i]
d_unit_mean[u] += d[i]
unit_counts[u] += 1

y_unit_mean /= np.maximum(unit_counts, 1)
d_unit_mean /= np.maximum(unit_counts, 1)

y_time_mean = np.zeros(n_times)
d_time_mean = np.zeros(n_times)
time_counts = np.zeros(n_times)

for i in range(len(y)):
t = time_idx[i]
y_time_mean[t] += y[i]
d_time_mean[t] += d[i]
time_counts[t] += 1

y_time_mean /= np.maximum(time_counts, 1)
d_time_mean /= np.maximum(time_counts, 1)

# Overall mean
y_mean = np.mean(y)
d_mean = np.mean(d)

# Within transformation: y_it - y_i - y_t + y
y_within = np.zeros(len(y))
d_within = np.zeros(len(d))

for i in range(len(y)):
u = unit_idx[i]
t = time_idx[i]
y_within[i] = y[i] - y_unit_mean[u] - y_time_mean[t] + y_mean
d_within[i] = d[i] - d_unit_mean[u] - d_time_mean[t] + d_mean

# OLS on demeaned data
# Apply two-way within transformation
df_dm = _within_transform_util(
df, [outcome, treat_col], unit, time, suffix="_within"
)

# Extract within-transformed values
y_within = df_dm[f"{outcome}_within"].values
d_within = df_dm[f"{treat_col}_within"].values

# OLS on demeaned data: beta = sum(d * y) / sum(d^2)
d_var = np.sum(d_within ** 2)
if d_var > 0:
beta = np.sum(d_within * y_within) / d_var
Expand Down
17 changes: 9 additions & 8 deletions diff_diff/estimators.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
WildBootstrapResults,
compute_confidence_interval,
compute_p_value,
demean_by_group,
validate_binary,
wild_bootstrap_se,
)
Expand Down Expand Up @@ -227,10 +228,10 @@ def fit(
# unit-invariant, so demeaning them would create multicollinearity
vars_to_demean = [outcome] + (covariates or [])
for ab_var in absorb:
n_absorbed_effects += working_data[ab_var].nunique() - 1
for var in vars_to_demean:
group_means = working_data.groupby(ab_var)[var].transform("mean")
working_data[var] = working_data[var] - group_means
working_data, n_fe = demean_by_group(
working_data, vars_to_demean, ab_var, inplace=True
)
n_absorbed_effects += n_fe
absorbed_vars.append(ab_var)

# Extract variables (may be demeaned if absorb was used)
Expand Down Expand Up @@ -828,10 +829,10 @@ def fit( # type: ignore[override]
if absorb:
vars_to_demean = [outcome] + (covariates or [])
for ab_var in absorb:
n_absorbed_effects += working_data[ab_var].nunique() - 1
for var in vars_to_demean:
group_means = working_data.groupby(ab_var)[var].transform("mean")
working_data[var] = working_data[var] - group_means
working_data, n_fe = demean_by_group(
working_data, vars_to_demean, ab_var, inplace=True
)
n_absorbed_effects += n_fe

# Extract outcome and treatment
y = working_data[outcome].values.astype(float)
Expand Down
24 changes: 2 additions & 22 deletions diff_diff/sun_abraham.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from diff_diff.utils import (
compute_confidence_interval,
compute_p_value,
within_transform as _within_transform_util,
)


Expand Down Expand Up @@ -789,28 +790,7 @@ def _within_transform(

y_it - y_i. - y_.t + y_..
"""
df = df.copy()

# Build all demeaned columns at once to avoid fragmentation
demeaned_data = {}
for var in variables:
# Unit means
unit_means = df.groupby(unit)[var].transform("mean")
# Time means
time_means = df.groupby(time)[var].transform("mean")
# Grand mean
grand_mean = df[var].mean()

# Within transformation
demeaned_data[f"{var}_dm"] = (
df[var] - unit_means - time_means + grand_mean
).values

# Add all demeaned columns at once
demeaned_df = pd.DataFrame(demeaned_data, index=df.index)
df = pd.concat([df, demeaned_df], axis=1)

return df
return _within_transform_util(df, variables, unit, time, suffix="_dm")

def _compute_iw_effects(
self,
Expand Down
20 changes: 2 additions & 18 deletions diff_diff/twfe.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from diff_diff.utils import (
compute_confidence_interval,
compute_p_value,
within_transform as _within_transform_util,
)


Expand Down Expand Up @@ -211,25 +212,8 @@ def _within_transform(
pd.DataFrame
Data with demeaned variables.
"""
data = data.copy()
variables = [outcome] + (covariates or [])

# Cache groupby objects for efficiency (avoids re-computing group indexes)
unit_grouper = data.groupby(unit, sort=False)
time_grouper = data.groupby(time, sort=False)

for var in variables:
# Unit means (using cached grouper)
unit_means = unit_grouper[var].transform("mean")
# Time means (using cached grouper)
time_means = time_grouper[var].transform("mean")
# Grand mean
grand_mean = data[var].mean()

# Within transformation
data[f"{var}_demeaned"] = data[var] - unit_means - time_means + grand_mean

return data
return _within_transform_util(data, variables, unit, time, suffix="_demeaned")

def _check_staggered_treatment(
self,
Expand Down
137 changes: 137 additions & 0 deletions diff_diff/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1342,3 +1342,140 @@ def compute_placebo_effects(
placebo_effects.append(placebo_tau)

return np.asarray(placebo_effects)


def demean_by_group(
    data: pd.DataFrame,
    variables: List[str],
    group_var: str,
    inplace: bool = False,
    suffix: str = "",
) -> Tuple[pd.DataFrame, int]:
    """
    Remove group-level means from a set of columns (one-way fixed effects).

    Each listed variable is replaced by its deviation from the group mean,
    x_ig - mean_g(x), either in place or in a suffixed copy of the column.

    Parameters
    ----------
    data : pd.DataFrame
        Input data containing `variables` and `group_var`.
    variables : list of str
        Columns to demean.
    group_var : str
        Grouping column defining the fixed effects to absorb.
    inplace : bool, default False
        When True the caller's DataFrame is mutated and returned;
        when False a copy is made and the original is left untouched.
    suffix : str, default ""
        When non-empty, demeaned values are written to ``f"{var}{suffix}"``
        and the source columns are preserved; when empty, the source
        columns themselves are overwritten.

    Returns
    -------
    data : pd.DataFrame
        Frame containing the demeaned columns.
    n_effects : int
        Number of absorbed fixed effects (group count minus one; one
        level is dropped for identification).

    Examples
    --------
    >>> df, n_fe = demean_by_group(df, ['y', 'x1', 'x2'], 'unit')
    >>> # df['y'], df['x1'], df['x2'] are now demeaned by unit
    """
    frame = data if inplace else data.copy()

    # One category is unidentified, so absorbed effects = categories - 1.
    n_effects = frame[group_var].nunique() - 1

    # A single groupby is reused for every variable so group indexes are
    # computed only once.
    grouped = frame.groupby(group_var, sort=False)
    for var in variables:
        target = f"{var}{suffix}" if suffix else var
        frame[target] = frame[var] - grouped[var].transform("mean")

    return frame, n_effects


def within_transform(
    data: pd.DataFrame,
    variables: List[str],
    unit: str,
    time: str,
    inplace: bool = False,
    suffix: str = "_demeaned",
) -> pd.DataFrame:
    """
    Apply the two-way within transformation: y_it - y_i. - y_.t + y_..

    Subtracting unit means and time means (and adding back the grand mean)
    sweeps out unit-specific and time-specific effects from panel data.
    The result is numerically equivalent to including a full set of unit
    and time dummies, but far cheaper for large panels.

    Parameters
    ----------
    data : pd.DataFrame
        Panel data containing the variables to transform.
    variables : list of str
        Column names to transform.
    unit : str
        Column name for the unit identifier.
    time : str
        Column name for the time-period identifier.
    inplace : bool, default False
        If True, overwrites the original columns. If False, creates new
        columns with the given suffix on a copy of the data.
    suffix : str, default "_demeaned"
        Suffix for new column names when inplace=False.

    Returns
    -------
    pd.DataFrame
        DataFrame with within-transformed variables.

    Examples
    --------
    >>> df = within_transform(df, ['y', 'x'], 'unit_id', 'year')
    >>> # df now has 'y_demeaned' and 'x_demeaned' columns
    """
    if not inplace:
        data = data.copy()

    # Group indexes are computed once and shared across all variables.
    by_unit = data.groupby(unit, sort=False)
    by_time = data.groupby(time, sort=False)

    def _sweep(col: str) -> pd.Series:
        # y_it - y_i. - y_.t + y_..
        return (
            data[col]
            - by_unit[col].transform("mean")
            - by_time[col].transform("mean")
            + data[col].mean()
        )

    if inplace:
        # Overwrite the source columns directly.
        for var in variables:
            data[var] = _sweep(var)
        return data

    # Collect the new columns first and concatenate once, so repeated
    # insertions do not fragment the DataFrame's internal blocks.
    transformed = pd.DataFrame(
        {f"{var}{suffix}": _sweep(var).values for var in variables},
        index=data.index,
    )
    return pd.concat([data, transformed], axis=1)
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "maturin"

[project]
name = "diff-diff"
version = "2.0.0"
version = "2.0.1"
description = "A library for Difference-in-Differences causal inference analysis"
readme = "README.md"
license = "MIT"
Expand Down