diff --git a/TODO.md b/TODO.md index 3fbd24f..697d432 100644 --- a/TODO.md +++ b/TODO.md @@ -25,7 +25,7 @@ Consolidation opportunities for cleaner maintenance: | Duplicate Code | Locations | Notes | |---------------|-----------|-------| -| Within-transformation logic | `estimators.py:217-232`, `estimators.py:787-833`, `bacon.py:567-642` | Extract to utils.py | +| ~~Within-transformation logic~~ | ~~Multiple files~~ | ✅ Extracted to `utils.py` as `demean_by_group()` and `within_transform()` (v2.0.1) | | Linear regression helper | `staggered.py:205-240`, `estimators.py:366-408` | Consider consolidation | ### Large Module Files diff --git a/diff_diff/__init__.py b/diff_diff/__init__.py index ffb0f07..ba5ef51 100644 --- a/diff_diff/__init__.py +++ b/diff_diff/__init__.py @@ -113,7 +113,7 @@ plot_sensitivity, ) -__version__ = "2.0.0" +__version__ = "2.0.1" __all__ = [ # Estimators "DifferenceInDifferences", diff --git a/diff_diff/bacon.py b/diff_diff/bacon.py index ed94e73..8f09d9f 100644 --- a/diff_diff/bacon.py +++ b/diff_diff/bacon.py @@ -17,6 +17,8 @@ import numpy as np import pandas as pd +from diff_diff.utils import within_transform as _within_transform_util + @dataclass class Comparison2x2: @@ -573,66 +575,16 @@ def _compute_twfe( treat_col: str = '__bacon_treated_internal__', ) -> float: """Compute TWFE estimate using within-transformation.""" - # Demean by unit and time - y = df[outcome].values - d = df[treat_col].astype(float).values - - # Create unit and time dummies for demeaning - units = df[unit].values - times = df[time].values - - # Unit means - unit_map = {u: i for i, u in enumerate(df[unit].unique())} - unit_idx = np.array([unit_map[u] for u in units]) - n_units = len(unit_map) - - # Time means - time_map = {t: i for i, t in enumerate(df[time].unique())} - time_idx = np.array([time_map[t] for t in times]) - n_times = len(time_map) - - # Compute means - y_unit_mean = np.zeros(n_units) - d_unit_mean = np.zeros(n_units) - unit_counts = np.zeros(n_units) - 
- for i in range(len(y)): - u = unit_idx[i] - y_unit_mean[u] += y[i] - d_unit_mean[u] += d[i] - unit_counts[u] += 1 - - y_unit_mean /= np.maximum(unit_counts, 1) - d_unit_mean /= np.maximum(unit_counts, 1) - - y_time_mean = np.zeros(n_times) - d_time_mean = np.zeros(n_times) - time_counts = np.zeros(n_times) - - for i in range(len(y)): - t = time_idx[i] - y_time_mean[t] += y[i] - d_time_mean[t] += d[i] - time_counts[t] += 1 - - y_time_mean /= np.maximum(time_counts, 1) - d_time_mean /= np.maximum(time_counts, 1) - - # Overall mean - y_mean = np.mean(y) - d_mean = np.mean(d) - - # Within transformation: y_it - y_i - y_t + y - y_within = np.zeros(len(y)) - d_within = np.zeros(len(d)) - - for i in range(len(y)): - u = unit_idx[i] - t = time_idx[i] - y_within[i] = y[i] - y_unit_mean[u] - y_time_mean[t] + y_mean - d_within[i] = d[i] - d_unit_mean[u] - d_time_mean[t] + d_mean - - # OLS on demeaned data + # Apply two-way within transformation + df_dm = _within_transform_util( + df, [outcome, treat_col], unit, time, suffix="_within" + ) + + # Extract within-transformed values + y_within = df_dm[f"{outcome}_within"].values + d_within = df_dm[f"{treat_col}_within"].values + + # OLS on demeaned data: beta = sum(d * y) / sum(d^2) d_var = np.sum(d_within ** 2) if d_var > 0: beta = np.sum(d_within * y_within) / d_var diff --git a/diff_diff/estimators.py b/diff_diff/estimators.py index f896e50..40fd220 100644 --- a/diff_diff/estimators.py +++ b/diff_diff/estimators.py @@ -23,6 +23,7 @@ WildBootstrapResults, compute_confidence_interval, compute_p_value, + demean_by_group, validate_binary, wild_bootstrap_se, ) @@ -227,10 +228,10 @@ def fit( # unit-invariant, so demeaning them would create multicollinearity vars_to_demean = [outcome] + (covariates or []) for ab_var in absorb: - n_absorbed_effects += working_data[ab_var].nunique() - 1 - for var in vars_to_demean: - group_means = working_data.groupby(ab_var)[var].transform("mean") - working_data[var] = working_data[var] - group_means 
+ working_data, n_fe = demean_by_group( + working_data, vars_to_demean, ab_var, inplace=True + ) + n_absorbed_effects += n_fe absorbed_vars.append(ab_var) # Extract variables (may be demeaned if absorb was used) @@ -828,10 +829,10 @@ def fit( # type: ignore[override] if absorb: vars_to_demean = [outcome] + (covariates or []) for ab_var in absorb: - n_absorbed_effects += working_data[ab_var].nunique() - 1 - for var in vars_to_demean: - group_means = working_data.groupby(ab_var)[var].transform("mean") - working_data[var] = working_data[var] - group_means + working_data, n_fe = demean_by_group( + working_data, vars_to_demean, ab_var, inplace=True + ) + n_absorbed_effects += n_fe # Extract outcome and treatment y = working_data[outcome].values.astype(float) diff --git a/diff_diff/sun_abraham.py b/diff_diff/sun_abraham.py index e50247f..914b6ff 100644 --- a/diff_diff/sun_abraham.py +++ b/diff_diff/sun_abraham.py @@ -21,6 +21,7 @@ from diff_diff.utils import ( compute_confidence_interval, compute_p_value, + within_transform as _within_transform_util, ) @@ -789,28 +790,7 @@ def _within_transform( y_it - y_i. - y_.t + y_.. 
""" - df = df.copy() - - # Build all demeaned columns at once to avoid fragmentation - demeaned_data = {} - for var in variables: - # Unit means - unit_means = df.groupby(unit)[var].transform("mean") - # Time means - time_means = df.groupby(time)[var].transform("mean") - # Grand mean - grand_mean = df[var].mean() - - # Within transformation - demeaned_data[f"{var}_dm"] = ( - df[var] - unit_means - time_means + grand_mean - ).values - - # Add all demeaned columns at once - demeaned_df = pd.DataFrame(demeaned_data, index=df.index) - df = pd.concat([df, demeaned_df], axis=1) - - return df + return _within_transform_util(df, variables, unit, time, suffix="_dm") def _compute_iw_effects( self, diff --git a/diff_diff/twfe.py b/diff_diff/twfe.py index 46224f6..54d9dce 100644 --- a/diff_diff/twfe.py +++ b/diff_diff/twfe.py @@ -17,6 +17,7 @@ from diff_diff.utils import ( compute_confidence_interval, compute_p_value, + within_transform as _within_transform_util, ) @@ -211,25 +212,8 @@ def _within_transform( pd.DataFrame Data with demeaned variables. 
""" - data = data.copy() variables = [outcome] + (covariates or []) - - # Cache groupby objects for efficiency (avoids re-computing group indexes) - unit_grouper = data.groupby(unit, sort=False) - time_grouper = data.groupby(time, sort=False) - - for var in variables: - # Unit means (using cached grouper) - unit_means = unit_grouper[var].transform("mean") - # Time means (using cached grouper) - time_means = time_grouper[var].transform("mean") - # Grand mean - grand_mean = data[var].mean() - - # Within transformation - data[f"{var}_demeaned"] = data[var] - unit_means - time_means + grand_mean - - return data + return _within_transform_util(data, variables, unit, time, suffix="_demeaned") def _check_staggered_treatment( self, diff --git a/diff_diff/utils.py b/diff_diff/utils.py index 600d9c2..6295824 100644 --- a/diff_diff/utils.py +++ b/diff_diff/utils.py @@ -1342,3 +1342,140 @@ def compute_placebo_effects( placebo_effects.append(placebo_tau) return np.asarray(placebo_effects) + + +def demean_by_group( + data: pd.DataFrame, + variables: List[str], + group_var: str, + inplace: bool = False, + suffix: str = "", +) -> Tuple[pd.DataFrame, int]: + """ + Demean variables by a grouping variable (one-way within transformation). + + For each variable, computes: x_ig - mean(x_g) where g is the group. + + Parameters + ---------- + data : pd.DataFrame + DataFrame containing the variables to demean. + variables : list of str + Column names to demean. + group_var : str + Column name for the grouping variable. + inplace : bool, default False + If True, modifies the input DataFrame directly. If False, works on a + copy, so the caller's DataFrame is left unchanged. + suffix : str, default "" + Suffix for the demeaned column names. If empty, each variable's column + is overwritten; otherwise the original column is kept alongside it. + + Returns + ------- + data : pd.DataFrame + DataFrame with demeaned variables. + n_effects : int + Number of absorbed fixed effects (nunique - 1).
+ + Examples + -------- + >>> df, n_fe = demean_by_group(df, ['y', 'x1', 'x2'], 'unit') + >>> # df['y'], df['x1'], df['x2'] are now demeaned by unit + """ + if not inplace: + data = data.copy() + + # Count fixed effects (categories - 1 for identification) + n_effects = data[group_var].nunique() - 1 + + # Cache the groupby object for efficiency + grouper = data.groupby(group_var, sort=False) + + for var in variables: + col_name = var if not suffix else f"{var}{suffix}" + group_means = grouper[var].transform("mean") + data[col_name] = data[var] - group_means + + return data, n_effects + + +def within_transform( + data: pd.DataFrame, + variables: List[str], + unit: str, + time: str, + inplace: bool = False, + suffix: str = "_demeaned", +) -> pd.DataFrame: + """ + Apply two-way within transformation to remove unit and time fixed effects. + + Computes: y_it - y_i. - y_.t + y_.. for each variable. + + This is the standard fixed effects transformation for panel data that + removes both unit-specific and time-specific effects. + + Parameters + ---------- + data : pd.DataFrame + Panel data containing the variables to transform. + variables : list of str + Column names to transform. + unit : str + Column name for unit identifier. + time : str + Column name for time period identifier. + inplace : bool, default False + If True, modifies the original columns. If False, creates new columns + with the specified suffix. + suffix : str, default "_demeaned" + Suffix for new column names when inplace=False. + + Returns + ------- + pd.DataFrame + DataFrame with within-transformed variables. + + Notes + ----- + The within transformation removes variation that is constant within units + (unit fixed effects) and constant within time periods (time fixed effects). + On balanced panels, the resulting estimates are equivalent to including + unit and time dummies, but the transformation is cheaper for large panels.
+ + Examples + -------- + >>> df = within_transform(df, ['y', 'x'], 'unit_id', 'year') + >>> # df now has 'y_demeaned' and 'x_demeaned' columns + """ + if not inplace: + data = data.copy() + + # Cache groupby objects for efficiency + unit_grouper = data.groupby(unit, sort=False) + time_grouper = data.groupby(time, sort=False) + + if inplace: + # Modify columns in place + for var in variables: + unit_means = unit_grouper[var].transform("mean") + time_means = time_grouper[var].transform("mean") + grand_mean = data[var].mean() + data[var] = data[var] - unit_means - time_means + grand_mean + else: + # Build all demeaned columns at once to avoid DataFrame fragmentation + demeaned_data = {} + for var in variables: + unit_means = unit_grouper[var].transform("mean") + time_means = time_grouper[var].transform("mean") + grand_mean = data[var].mean() + demeaned_data[f"{var}{suffix}"] = ( + data[var] - unit_means - time_means + grand_mean + ).values + + # Add all columns at once + demeaned_df = pd.DataFrame(demeaned_data, index=data.index) + data = pd.concat([data, demeaned_df], axis=1) + + return data diff --git a/pyproject.toml b/pyproject.toml index 2dbb55c..6ab44de 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "diff-diff" -version = "2.0.0" +version = "2.0.1" description = "A library for Difference-in-Differences causal inference analysis" readme = "README.md" license = "MIT"