VectorInstitute · lotif · Mar 23, 2026 · Mar 23, 2026 · Apr 6, 2026 · Apr 8, 2026
diff --git a/examples/gan/README.md b/examples/gan/README.md
@@ -94,7 +94,7 @@ Kolmogorov-Smirnov (KS) test, Total Variation Distance (TVD), Correlation Matrix
 and Mutual Information Difference.
 
 To compute those metrics, you can run the command below. The name of the table should be
-defined in the `dataset_meta.json` file, and the file for synthetic data should be under
+defined in the `dataset_meta.json` file, and the data files should be under
 `/data/{table_name}.csv` for the real data and `/results/{table_name}_synthetic.csv`
 for the synthetic data.
 

diff --git a/examples/gan/ensemble_attack/README.md b/examples/gan/ensemble_attack/README.md
@@ -60,10 +60,6 @@ python -m examples.gan.ensemble_attack.make_challenge_dataset
 
 ## 4. Training the attack model
 
-> [!NOTE]
-> In the [`config.yaml`](config.yaml) file, the attribute `ensemble_attack.shadow_training.model_name`
-> is what determines this attack will be run with the CTGAN model.
-
 To train the attack models, execute the following command:
 
 ```bash

diff --git a/examples/gan/ensemble_attack/config.yaml b/examples/gan/ensemble_attack/config.yaml
@@ -36,7 +36,6 @@ ensemble_attack:
     run_metaclassifier_training: true
 
   shadow_training:
-    model_name: ctgan
     model_config:  # Configurations specific for the CTGAN model
       training:
         epochs: 300

diff --git a/examples/gan/ensemble_attack/make_challenge_dataset.py b/examples/gan/ensemble_attack/make_challenge_dataset.py
@@ -39,6 +39,8 @@ def make_challenge_dataset(config: DictConfig) -> None:
     log(INFO, f"Saving challenge labels to {challenge_label_path}")
     np.save(challenge_label_path, challenge_data_labels)
 
+    log(INFO, "Done!")
+
 
 if __name__ == "__main__":
     make_challenge_dataset()
diff --git a/examples/gan/ensemble_attack/utils.py b/examples/gan/ensemble_attack/utils.py
@@ -40,6 +40,14 @@ def make_training_config(config: DictConfig) -> dict[Any, Any]:
     Returns:
         The ensemble attack training config for the CTGAN model.
     """
+    base_data_dir: str
+    if "data_dir" in config:  # checked first so data_dir still wins when both keys are set
+        base_data_dir = config.data_dir
+    elif "base_data_dir" in config:
+        base_data_dir = config.base_data_dir
+    else:
+        raise ValueError("Either base_data_dir or data_dir must be provided in the config.")
+
     # Saving the model config from the config.yaml into a json file
     # because that's what the ensemble attack code will be looking for
     training_config_path = Path(config.ensemble_attack.shadow_training.training_json_config_paths.training_config_path)
@@ -48,10 +56,10 @@ def make_training_config(config: DictConfig) -> dict[Any, Any]:
         training_config = OmegaConf.to_container(config.ensemble_attack.shadow_training.model_config, resolve=True)
         assert isinstance(training_config, dict), "Training config must be a dictionary."
         training_config["general"] = {
-            "test_data_dir": config.base_data_dir,
+            "test_data_dir": base_data_dir,
             "sample_prefix": "ctgan",
-            "data_dir": config.base_data_dir,
-            "workspace_dir": str(Path(config.base_data_dir) / "shadow_workspace"),
+            "data_dir": base_data_dir,
+            "workspace_dir": str(Path(base_data_dir) / "shadow_workspace"),
             "exp_name": "pre_trained_model",
         }
         json.dump(training_config, f)

diff --git a/examples/tabsyn/README.md b/examples/tabsyn/README.md
@@ -0,0 +1,92 @@
+# TabSyn Single Table Example
+
+This example will go over training a single-table [TabSyn](https://arxiv.org/abs/2310.09656)
+model and synthesizing data afterwards.
+
+
+## Downloading data
+
+First, we need the data. Download it from this
+[Google Drive link](https://drive.google.com/file/d/1HTgfgeL5GXc8uAGfeQirJrUynK7vFeyb/view?usp=drive_link),
+extract the files and place them in a `/data` folder within this folder
+(`examples/tabsyn`).
+
+> [!NOTE]
+> If you wish to change the data folder, you can do so by editing the `data_dir` attribute
+> of the [`config.yaml`](config.yaml) file.
+
+Here is a description of the files that have been extracted:
+- `trans.csv`: The training data. It consists of information about bank transactions and it
+contains 20,000 data points.
+- `trans_info.json`: Metadata about the `trans.csv` data, with information such as which columns are
+numerical and which are categorical, what is the task type, etc.
+
+
+## Kicking off training
+
+To kick off training, simply run the command below from the project's root folder:
+
+```bash
+python -m examples.tabsyn.train
+```
+
+
+## Training results
+
+The result files will be saved inside a `/results` folder within this folder
+(`examples/tabsyn`).
+
+> [!NOTE]
+> If you wish to change the save folder, you can do so by editing the `results_dir` attribute
+> of the [`config.yaml`](config.yaml) file.
+
+In the `/results/trans` folder, there will be a file called `model.pt`,
+which is a pytorch saved model.
+
+
+## Synthesizing data
+
+To synthesize some data with the trained model, run:
+
+```bash
+python -m examples.tabsyn.synthesize
+```
+
+If there is already a trained model in the `/results` folder, it will use that model.
+Otherwise it will train one from scratch. At the end of the script, it will save the
+synthesized data to `/results/trans/synthetic_data/trans_synthetic.csv`.
+
+
+## Evaluating the quality of the synthetic data
+
+### Alpha Precision
+
+To run a round of evaluation with [Alpha Precision](https://arxiv.org/abs/2301.07573)
+metrics on a set of synthetic data, run the command below:
+
+```bash
+python -m midst_toolkit.evaluation.quality.scripts.midst_alpha_precision_eval \
+  --synthetic_data_path examples/tabsyn/results/trans/synthetic_data/trans_synthetic.csv \
+  --real_data examples/tabsyn/data/trans_sampled.csv \
+  --meta_info_path examples/gan/data/meta_info.json \
+  --save_directory examples/tabsyn/results/
+```
+
+It will save the evaluation results under the `/results/model.txt` file.
+
+### Additional Metrics
+
+The calculation of additional metrics is set up in the `evaluate.py` file. They are the
+Kolmogorov-Smirnov (KS) test, Total Variation Distance (TVD), Correlation Matrix Difference
+and Mutual Information Difference.
+
+To compute those metrics, you can run the command below. The data files should
+be under `/data/{table_name}.csv` for the real data, `/data/{table_name}_samples.csv`
+for the sampled data used for training, and `/results/{table_name}_synthetic.csv`
+for the synthetic data.
+
+```bash
+python -m examples.tabsyn.evaluate
+```
+
+The results will be saved in the `/results/evaluation.json` file.
diff --git a/examples/tabsyn/config.yaml b/examples/tabsyn/config.yaml
@@ -0,0 +1,85 @@
+data_dir: examples/tabsyn/data
+results_dir: examples/tabsyn/results
+tabsyn_config: examples/tabsyn/tabsyn_config.toml
+table_name: trans
+
+training:
+  sample_size: 20000
+
+evaluation:
+  # Reporting Configuration
+  write_report: True
+  metric_report_path: ${results_dir}/evaluation_results.txt
+
+  # Privacy Evaluation Configurations
+  dcr:
+    run: True
+    norm: "l2"
+    batch_size: 1000
+
+  median_dcr:
+    run: True
+    norm: "l2"
+    batch_size: 1000
+
+  hitting_rate:
+    run: True
+    hitting_threshold: 0.03
+
+  eir:
+    run: True
+    norm: "gower"
+
+  nndr:
+    run: True
+    norm: "l2"
+    batch_size: 1000
+
+  # Quality Evaluation Configurations
+  ks_tv:
+    run: True
+    significance_level: 0.05
+    permutations: 1000
+
+  alpha_precision:
+    run: True
+    naive_only: False
+
+  ci_overlap:
+    run: True
+    confidence_level: 95
+
+  correlation_diff:
+    run: True
+    compute_mixed_correlations: True
+
+  mean_diff:
+    run: True
+
+  f1_score_diff:
+    run: True
+    label_column: "trans_type"
+    folds: 5
+    f1_type: "macro"
+
+  regression_score_diff:
+    run: True
+    label_column: "trans_type"
+    preprocess_labels: True
+    measure_metrics_in_original_label_space: False
+    verbose: True
+    convert_label_to_float: True
+
+  hellinger:
+    run: True
+    include_numerical_columns: True
+
+  propensity_mse:
+    run: True
+    folds: 5
+    max_iterations: 50
+    solver: "liblinear"
+
+  mutual_information:
+    run: True
+    include_numerical_columns: False
diff --git a/examples/tabsyn/ensemble_attack/README.md b/examples/tabsyn/ensemble_attack/README.md
@@ -0,0 +1,95 @@
+# TabSyn Ensemble Attack Example
+
+In this example, we demonstrate how to run the [Ensemble Attack](../../ensemble_attack/README.md)
+using the [TabSyn](https://arxiv.org/abs/2310.09656) model.
+
+## 1. Downloading data
+
+First, we need the data. Download it from this
+[Google Drive link](https://drive.google.com/file/d/16XCa63eD2dZ1bddhgRbGFuzAuuMlto9P/view?usp=sharing),
+extract the files and place them in a `/data/ensemble_attack` folder within this folder
+(`examples/tabsyn`).
+
+> [!NOTE]
+> If you wish to change the data folder, you can do so by editing the `base_data_dir` attribute
+> of the [`config.yaml`](config.yaml) file.
+
+Here is a description of the files that have been extracted:
+- `trans.csv`: The full set of training data.
+- `dataset_meta.json`: Metadata about the relationship between the tables in the dataset. Since this is a
+single table dataset, it will only contain information about the transaction (`trans`) table.
+- `trans_domain.json`: Metadata about the columns of the transaction table, such as their size
+and type (`continuous` or `discrete`).
+- `trans_info.json` and `meta_info.json`: Metadata about the `trans.csv` data, with information
+such as which columns are numerical and which are categorical, what is the task type, etc.
+- `data_types.json`: Additional metadata about the columns, splitting them into 4 types:
+    - `numerical`: a list of the columns that contain numerical information
+    - `categorical`: a list of the columns that contain categorical information
+    - `variable_to_predict`: the name of the target column that will be predicted
+    - `id_column_name`: the name of the column in the table that represents the rows' id.
+
+With the data present in the correct folder, we can proceed with running the attack.
+
+## 2. Generating target synthetic data to be tested
+
+The **target model** is the model being attacked, and the **target synthetic data**
+is the synthetic data generated by the target model that will be evaluated against
+the attack.
+
+If you already have a set of synthetic data produced by a target model,
+you can add its path to the `ensemble_attack.target_model.target_synthetic_data_path`
+property in the [`config.yaml`](config.yaml) file and skip this step.
+
+If you wish to train a new target model and produce the synthetic data that will be the
+target of the attack, you can run:
+
+```bash
+python -m examples.tabsyn.synthesize --config-path=./ensemble_attack
+```
+
+## 3. Producing the challenge points dataset
+
+The challenge points dataset is composed of real data points where half of them
+were used in training the target model and half weren't. It is the dataset we are going
+to use to evaluate how good the attack model is at differentiating between
+the points used in training and the ones not used in training.
+
+To produce such a dataset, run the following script:
+
+```bash
+python -m examples.tabsyn.ensemble_attack.make_challenge_dataset
+```
+
+## 4. Training the attack model
+
+To train the attack models, execute the following command:
+
+```bash
+python -m examples.tabsyn.ensemble_attack.train_attack_model
+```
+
+This will take a long time to run, so it might be a good idea to execute it as a
+background process. If you want to have a quick test run before kicking off the
+full process, you can change the number of iterations, epochs, population and
+sample sizes to smaller numbers.
+
+## 5. Testing the attack model
+
+To test the attack model against the target model and synthetic data produced on
+[step 2](#2-generating-target-synthetic-data-to-be-tested), please run:
+
+```bash
+python -m examples.tabsyn.ensemble_attack.test_attack_model
+```
+
+## 6. Compute the attack success
+
+To compute the metrics about the success of the attack against the target
+synthetic data, you can run the following command:
+
+```bash
+python -m examples.tabsyn.ensemble_attack.compute_attack_success
+```
+
+The results will be both printed to the console and saved in the file
+`examples/tabsyn/results/attack_success_for_xgb_metaclassifier_model.txt`
diff --git a/examples/tabsyn/ensemble_attack/compute_attack_success.py b/examples/tabsyn/ensemble_attack/compute_attack_success.py
@@ -0,0 +1,30 @@
+from logging import INFO
+from pathlib import Path
+
+import hydra
+from omegaconf import DictConfig
+
+from examples.ensemble_attack.compute_attack_success import compute_attack_success_for_given_targets
+from midst_toolkit.common.logger import log
+
+
+@hydra.main(config_path="./", config_name="config", version_base=None)
+def compute_attack_success(config: DictConfig) -> None:
+    """Compute the ensemble attack success for the configured target synthetic data."""
+    log(
+        INFO,
+        f"Computing attack success for target synthetic data at {config.ensemble_attack.target_model.target_synthetic_data_path}...",
+    )
+
+    compute_attack_success_for_given_targets(
+        target_model_config=config.ensemble_attack.target_model,
+        # TODO: refactor this to work better outside of the challenge context (i.e. no target ID)
+        # NOTE(review): originally written for CTGAN; presumably no target ID is needed for TabSyn either.
+        target_ids=[0],  # needs at least one element; the value does not matter
+        experiment_directory=Path(config.results_dir),
+        metaclassifier_model_name=config.ensemble_attack.metaclassifier.meta_classifier_model_name,
+    )
+
+
+if __name__ == "__main__":
+    compute_attack_success()