google · copybara-service · Jun 19, 2026
diff --git a/dpsynth/local_mode/initialization.py b/dpsynth/local_mode/initialization.py
@@ -157,11 +157,13 @@ class OpenSetCategoricalInitializer(primitives.DPMechanism):
     name: Attribute name used as the clique key in the measurement.
     attribute: The OpenSetCategoricalAttribute specifying the default value.
     delta: Failure probability for the partition selection threshold.
+    min_count: Minimum true count for a partition to be discovered.
   """
 
   name: str
   attribute: domain.OpenSetCategoricalAttribute
   delta: float
+  min_count: int = 1
   mechanism: primitives.DPPartitionSelection | None = dataclasses.field(
       default=None, repr=False
   )
@@ -170,6 +172,7 @@ def calibrate(self, *, zcdp_rho: float) -> OpenSetCategoricalInitializer:
     """Returns a copy calibrated to the given zCDP budget."""
     mechanism = primitives.DPPartitionSelection(
         delta=self.delta,
+        min_count=self.min_count,
     ).calibrate(zcdp_rho=zcdp_rho)
     return dataclasses.replace(self, mechanism=mechanism)
 

diff --git a/dpsynth/local_mode/primitives.py b/dpsynth/local_mode/primitives.py
@@ -272,6 +272,7 @@ def select_partitions_gaussian_thresholding(
     data: np.ndarray,
     gdp_budget: float,
     delta: float,
+    min_count: int = 1,
 ) -> tuple[np.ndarray, np.ndarray, float]:
   """Selects partitions using Gaussian Thresholding (Weighted Gaussian).
 
@@ -285,15 +286,25 @@ def select_partitions_gaussian_thresholding(
 
   Under item-level DP each record is treated as a distinct user contributing
   to exactly one partition, so the histogram has L2 sensitivity 1.  The
-  threshold is T = 1 + sigma * Phi^{-1}(1 - delta), following the paper's
-  formula with max_part = 1.
+  threshold is T = min_count + sigma * Phi^{-1}(1 - delta), following the
+  paper's formula with max_part = 1 and a shift of (min_count - 1) to
+  account for the minimum count guarantee.
+
+  When ``min_count > 1``, partitions with true count below ``min_count``
+  are pre-filtered and the threshold shifts up accordingly. The privacy
+  guarantee is preserved: partitions whose count is at least ``min_count``
+  in both neighboring datasets are covered by the Gaussian mechanism, and
+  the boundary case (one dataset at ``min_count - 1``, the other at
+  ``min_count``) is covered by the same additive delta.
 
   Args:
     rng: A numpy random number generator.
     data: 1D array of integers, where each element is a partition ID.
     gdp_budget: Privacy budget in terms of squared Gaussian DP mu parameter
       (gdp_budget = mu^2 = 1 / sigma^2).
     delta: Failure probability (false positive bound per empty partition).
+    min_count: Minimum true count for a partition to be eligible. Partitions
+      with fewer occurrences in the data are never returned. Must be >= 1.
 
   Returns:
     A tuple containing:
@@ -305,19 +316,29 @@ def select_partitions_gaussian_thresholding(
   """
   if gdp_budget <= 0 or delta <= 0:
     raise ValueError(f'{gdp_budget=} and {delta=} must be positive.')
+  if min_count < 1:
+    raise ValueError(f'{min_count=} must be >= 1.')
 
   sigma = 1.0 / np.sqrt(gdp_budget)
 
   if data.size == 0:
     return np.empty(0, dtype=data.dtype), np.empty(0, dtype=float), sigma
 
   unique_parts, counts = np.unique(data, return_counts=True)
+
+  # Filter partitions below the minimum count before adding noise.
+  above_min = counts >= min_count
+  unique_parts, counts = unique_parts[above_min], counts[above_min]
+  if unique_parts.size == 0:
+    return np.empty(0, dtype=data.dtype), np.empty(0, dtype=float), sigma
+
   noisy_counts = counts + rng.normal(scale=sigma, size=counts.size)
 
-  # Threshold: ensures that an empty partition (true count 0) passes with
-  # probability at most delta.  For max_part=1 this simplifies to:
-  #   T = 1/sqrt(1) + sigma * Phi^{-1}(1 - delta) = 1 + sigma * ppf(1-delta)
-  threshold = 1.0 + sigma * scipy.stats.norm.ppf(1.0 - delta)
+  # Threshold shifted by (min_count - 1) relative to the base formula.
+  # Base: T = 1 + sigma * ppf(1 - delta), so Pr[1 + N(0, sigma^2) >= T] = delta.
+  # A partition at exactly min_count (neighbor pre-filtered at min_count - 1)
+  # must pass w.p. <= delta, so T' = min_count + sigma * ppf(1 - delta).
+  threshold = float(min_count) + sigma * scipy.stats.norm.ppf(1.0 - delta)
   passed = noisy_counts >= threshold
 
   return unique_parts[passed], noisy_counts[passed], sigma
@@ -574,10 +595,12 @@ class DPPartitionSelection(DPMechanism):
 
   Attributes:
     delta: Failure probability for the thresholding step.
+    min_count: Minimum true count for a partition to be returned.
     sigma: Gaussian noise standard deviation. Set directly or via ``calibrate``.
   """
 
   delta: float
+  min_count: int = 1
   sigma: float | None = None
 
   def calibrate(self, *, zcdp_rho: float) -> DPPartitionSelection:
@@ -596,18 +619,10 @@ def dp_event(self) -> dp_accounting.DpEvent:
   def __call__(
       self, rng: np.random.Generator, data: np.ndarray
   ) -> tuple[np.ndarray, np.ndarray, float]:
-    """Runs partition selection on integer-encoded partition IDs.
-
-    Args:
-      rng: A numpy random number generator.
-      data: 1D array of integer partition IDs.
-
-    Returns:
-      A tuple of (selected_partitions, noisy_counts, sigma).
-    """
+    """Runs partition selection on integer-encoded partition IDs."""
     if self.sigma is None:
       raise ValueError(_UNCALIBRATED_MSG.format(param='sigma'))
     gdp_budget = np.inf if self.sigma == 0.0 else 1.0 / (self.sigma**2)
     return select_partitions_gaussian_thresholding(
-        rng, data, gdp_budget, self.delta
+        rng, data, gdp_budget, self.delta, min_count=self.min_count
     )
diff --git a/tests/local_mode/primitives_test.py b/tests/local_mode/primitives_test.py
@@ -272,6 +272,52 @@ def test_string_data_type(self):
     )
     self.assertTrue(all(isinstance(p, str) for p in selected))
 
+  def test_min_count_filters_low_count_partitions(self):
+    # Partition 1 has count 50, partition 2 has count 3.
+    data = np.array([1] * 50 + [2] * 3)
+    selected, _, _ = primitives.select_partitions_gaussian_thresholding(
+        self.rng, data, gdp_budget=10.0, delta=1e-5, min_count=5
+    )
+    self.assertIn(1, selected)
+    self.assertNotIn(2, selected)
+
+  def test_min_count_one_matches_default(self):
+    data = np.array([1] * 50 + [2] * 5)
+    rng1 = np.random.default_rng(42)
+    rng2 = np.random.default_rng(42)
+    result1 = primitives.select_partitions_gaussian_thresholding(
+        rng1, data, gdp_budget=10.0, delta=1e-5
+    )
+    result2 = primitives.select_partitions_gaussian_thresholding(
+        rng2, data, gdp_budget=10.0, delta=1e-5, min_count=1
+    )
+    np.testing.assert_array_equal(result1[0], result2[0])
+    np.testing.assert_array_equal(result1[1], result2[1])
+
+  def test_min_count_all_filtered_returns_empty(self):
+    data = np.array([1, 2, 3])
+    selected, counts, _ = primitives.select_partitions_gaussian_thresholding(
+        self.rng, data, gdp_budget=10.0, delta=1e-5, min_count=5
+    )
+    self.assertEmpty(selected)
+    self.assertEmpty(counts)
+
+  def test_min_count_zero_raises(self):
+    data = np.array([1, 2, 3])
+    with self.assertRaises(ValueError):
+      primitives.select_partitions_gaussian_thresholding(
+          self.rng, data, gdp_budget=1.0, delta=1e-5, min_count=0
+      )
+
+  def test_min_count_increases_threshold(self):
+    # With infinite budget (sigma = 0, no noise), threshold is exactly
+    # min_count, so partitions with count exactly at min_count should pass.
+    data = np.array([1] * 10 + [2] * 10)
+    selected, _, _ = primitives.select_partitions_gaussian_thresholding(
+        self.rng, data, gdp_budget=np.inf, delta=0.1, min_count=10
+    )
+    self.assertCountEqual(selected, [1, 2])
+
 
 class GaussianHistogramTest(absltest.TestCase):