Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions dpsynth/local_mode/initialization.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,11 +157,13 @@ class OpenSetCategoricalInitializer(primitives.DPMechanism):
name: Attribute name used as the clique key in the measurement.
attribute: The OpenSetCategoricalAttribute specifying the default value.
delta: Failure probability for the partition selection threshold.
min_count: Minimum true count for a partition to be discovered.
"""

name: str
attribute: domain.OpenSetCategoricalAttribute
delta: float
min_count: int = 1
mechanism: primitives.DPPartitionSelection | None = dataclasses.field(
default=None, repr=False
)
Expand All @@ -170,6 +172,7 @@ def calibrate(self, *, zcdp_rho: float) -> OpenSetCategoricalInitializer:
"""Returns a copy calibrated to the given zCDP budget."""
mechanism = primitives.DPPartitionSelection(
delta=self.delta,
min_count=self.min_count,
).calibrate(zcdp_rho=zcdp_rho)
return dataclasses.replace(self, mechanism=mechanism)

Expand Down
47 changes: 31 additions & 16 deletions dpsynth/local_mode/primitives.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,7 @@ def select_partitions_gaussian_thresholding(
data: np.ndarray,
gdp_budget: float,
delta: float,
min_count: int = 1,
) -> tuple[np.ndarray, np.ndarray, float]:
"""Selects partitions using Gaussian Thresholding (Weighted Gaussian).

Expand All @@ -285,15 +286,25 @@ def select_partitions_gaussian_thresholding(

Under item-level DP each record is treated as a distinct user contributing
to exactly one partition, so the histogram has L2 sensitivity 1. The
threshold is T = 1 + sigma * Phi^{-1}(1 - delta), following the paper's
formula with max_part = 1.
threshold is T = min_count + sigma * Phi^{-1}(1 - delta), following the
paper's formula with max_part = 1 and a shift of (min_count - 1) to
account for the minimum count guarantee.

When ``min_count > 1``, partitions with true count below ``min_count``
are pre-filtered and the threshold shifts up accordingly. The privacy
guarantee is preserved: partitions whose count is at least ``min_count``
in both neighboring datasets are covered by the Gaussian mechanism, and
the boundary case (one dataset at ``min_count - 1``, the other at
``min_count``) is covered by the same additive delta.

Args:
rng: A numpy random number generator.
data: 1D array of integers, where each element is a partition ID.
gdp_budget: Privacy budget in terms of squared Gaussian DP mu parameter
(gdp_budget = mu^2 = 1 / sigma^2).
delta: Failure probability (false positive bound per empty partition).
min_count: Minimum true count for a partition to be eligible. Partitions
with fewer occurrences in the data are never returned. Must be >= 1.

Returns:
A tuple containing:
Expand All @@ -305,19 +316,29 @@ def select_partitions_gaussian_thresholding(
"""
if gdp_budget <= 0 or delta <= 0:
raise ValueError(f'{gdp_budget=} and {delta=} must be positive.')
if min_count < 1:
raise ValueError(f'{min_count=} must be >= 1.')

sigma = 1.0 / np.sqrt(gdp_budget)

if data.size == 0:
return np.empty(0, dtype=data.dtype), np.empty(0, dtype=float), sigma

unique_parts, counts = np.unique(data, return_counts=True)

# Filter partitions below the minimum count before adding noise.
above_min = counts >= min_count
unique_parts, counts = unique_parts[above_min], counts[above_min]
if unique_parts.size == 0:
return np.empty(0, dtype=data.dtype), np.empty(0, dtype=float), sigma

noisy_counts = counts + rng.normal(scale=sigma, size=counts.size)

# Threshold: ensures that an empty partition (true count 0) passes with
# probability at most delta. For max_part=1 this simplifies to:
# T = 1/sqrt(1) + sigma * Phi^{-1}(1 - delta) = 1 + sigma * ppf(1-delta)
threshold = 1.0 + sigma * scipy.stats.norm.ppf(1.0 - delta)
# Threshold shifted by (min_count - 1) relative to the base formula.
# Base: T = 1 + sigma * ppf(1 - delta), so a partition with true count 1
# passes with probability Pr[1 + N(0, sigma^2) >= T] = delta.
# With min_count, the smallest eligible count is min_count (a neighbor at
# min_count - 1 is filtered outright), so the same bound needs
# T' = min_count + sigma * ppf(1 - delta).
threshold = float(min_count) + sigma * scipy.stats.norm.ppf(1.0 - delta)
passed = noisy_counts >= threshold

return unique_parts[passed], noisy_counts[passed], sigma
Expand Down Expand Up @@ -574,10 +595,12 @@ class DPPartitionSelection(DPMechanism):

Attributes:
delta: Failure probability for the thresholding step.
min_count: Minimum true count for a partition to be returned.
sigma: Gaussian noise standard deviation. Set directly or via ``calibrate``.
"""

delta: float
min_count: int = 1
sigma: float | None = None

def calibrate(self, *, zcdp_rho: float) -> DPPartitionSelection:
Expand All @@ -596,18 +619,10 @@ def dp_event(self) -> dp_accounting.DpEvent:
def __call__(
self, rng: np.random.Generator, data: np.ndarray
) -> tuple[np.ndarray, np.ndarray, float]:
"""Runs partition selection on integer-encoded partition IDs.

Args:
rng: A numpy random number generator.
data: 1D array of integer partition IDs.

Returns:
A tuple of (selected_partitions, noisy_counts, sigma).
"""
"""Runs partition selection on integer-encoded partition IDs."""
if self.sigma is None:
raise ValueError(_UNCALIBRATED_MSG.format(param='sigma'))
gdp_budget = np.inf if self.sigma == 0.0 else 1.0 / (self.sigma**2)
return select_partitions_gaussian_thresholding(
rng, data, gdp_budget, self.delta
rng, data, gdp_budget, self.delta, min_count=self.min_count
)
46 changes: 46 additions & 0 deletions tests/local_mode/primitives_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,52 @@ def test_string_data_type(self):
)
self.assertTrue(all(isinstance(p, str) for p in selected))

def test_min_count_filters_low_count_partitions(self):
  """Partitions whose true count is below min_count are never selected."""
  # Partition 1 occurs 50 times and partition 2 only 3 times, so with
  # min_count=5 only partition 1 is eligible.
  frequent = [1] * 50
  rare = [2] * 3
  data = np.array(frequent + rare)
  selected, _noisy, _sigma = (
      primitives.select_partitions_gaussian_thresholding(
          self.rng, data, gdp_budget=10.0, delta=1e-5, min_count=5
      )
  )
  self.assertIn(1, selected)
  self.assertNotIn(2, selected)

def test_min_count_one_matches_default(self):
  """Passing min_count=1 explicitly gives the same output as omitting it."""
  data = np.array([1] * 50 + [2] * 5)
  seed = 42
  # Two generators with the same seed so both calls draw identical noise.
  implicit = primitives.select_partitions_gaussian_thresholding(
      np.random.default_rng(seed), data, gdp_budget=10.0, delta=1e-5
  )
  explicit = primitives.select_partitions_gaussian_thresholding(
      np.random.default_rng(seed),
      data,
      gdp_budget=10.0,
      delta=1e-5,
      min_count=1,
  )
  # Compare the selected partitions (index 0) and noisy counts (index 1).
  for idx in (0, 1):
    np.testing.assert_array_equal(implicit[idx], explicit[idx])

def test_min_count_all_filtered_returns_empty(self):
  """When no partition reaches min_count, both returned arrays are empty."""
  # Every partition appears exactly once, which is below min_count=5.
  singletons = np.array([1, 2, 3])
  selected, counts, _ = primitives.select_partitions_gaussian_thresholding(
      self.rng, singletons, gdp_budget=10.0, delta=1e-5, min_count=5
  )
  for returned in (selected, counts):
    self.assertEmpty(returned)

def test_min_count_zero_raises(self):
  """min_count=0 is rejected with a ValueError."""
  data = np.array([1, 2, 3])
  invalid_kwargs = dict(gdp_budget=1.0, delta=1e-5, min_count=0)
  with self.assertRaises(ValueError):
    primitives.select_partitions_gaussian_thresholding(
        self.rng, data, **invalid_kwargs
    )

def test_min_count_increases_threshold(self):
  """Without noise, partitions sitting exactly at min_count still pass."""
  # An infinite budget gives sigma = 0, so the threshold reduces to min_count
  # and the noisy counts equal the true counts. Both partitions have count 10,
  # which meets the `>=` threshold at min_count=10.
  min_count = 10
  data = np.repeat([1, 2], min_count)
  selected, _noisy, _sigma = (
      primitives.select_partitions_gaussian_thresholding(
          self.rng, data, gdp_budget=np.inf, delta=0.1, min_count=min_count
      )
  )
  self.assertCountEqual(selected, [1, 2])


class GaussianHistogramTest(absltest.TestCase):

Expand Down
Loading