Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 39 additions & 7 deletions src/midst_toolkit/attacks/tartan_federer/tartan_federer_attack.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import csv
import os
from collections.abc import Generator
from dataclasses import replace
from logging import INFO
from pathlib import Path
from typing import Any
Expand Down Expand Up @@ -98,6 +99,8 @@ def mixed_loss(


# TODO: Unify this with the Dataset.from_df function.
# TODO: Noise scale is always called with a value of 0 for the attack. So we should remove it from the
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the f here is a typo?

# function signature and the function calls.
def make_dataset_from_df_with_loaded(
data: pd.DataFrame,
transformation: Transformations,
Expand All @@ -108,7 +111,7 @@ def make_dataset_from_df_with_loaded(
noise_scale: float = 0,
) -> Dataset:
"""
Create a dataset using artifacts.
Makes a dataset from a dataframe with loaded transformations.

Args:
data: Raw data to be used for creating the dataset.
Expand All @@ -117,8 +120,8 @@ def make_dataset_from_df_with_loaded(
table_metadata: Meta data about the table or tables.
label_encoders: Encoders that were used to encode the categorical data.
numerical_transform: Transformations that should be applied to the numerical data. Defaults to None.
noise_scale: he scale of the noise to add to the categorical features. Noise is drawn from a normal
distribution with standard deviation of ``noise_scale``. Defaults to 0.
noise_scale: The scale of the noise to add to the categorical features. Noise is drawn from a normal
distribution with standard deviation of ``noise_scale``. Defaults to 0.

Returns:
A full dataset constructed of the various pieces.
Expand All @@ -128,7 +131,7 @@ def make_dataset_from_df_with_loaded(
is_target_conditioned,
)
numerical_features = {DataSplit.TRAIN.value: data[numerical_column_names].values.astype(np.float32)}
categorical_features = {DataSplit.TRAIN.value: data[categorical_column_names].to_numpy(dtype=np.str_)}
categorical_features = {DataSplit.TRAIN.value: data[categorical_column_names].to_numpy()}
targets = {DataSplit.TRAIN.value: data[[table_metadata.target_column_name]].values.astype(np.float32)}

if len(categorical_column_names) > 0:
Expand All @@ -153,6 +156,13 @@ def make_dataset_from_df_with_loaded(
numerical_features = categorical_features

target_info = TargetInfo(policy=None, mean=None, std=None)

# Apply the model's pre-fitted numerical transform directly instead of re-fitting a new one.
# Calling transform_dataset() would fit a brand new QuantileTransformer on the MIA data,
# which produces a different normalization than the model saw during training, destroying signal.
if numerical_transform is not None:
numerical_features = {k: numerical_transform.transform(v) for k, v in numerical_features.items()}
Comment thread
bzamanlooy marked this conversation as resolved.

dataset = Dataset(
numerical_features=numerical_features,
categorical_features=None,
Expand All @@ -163,7 +173,9 @@ def make_dataset_from_df_with_loaded(
categorical_transform=None,
numerical_transform=numerical_transform,
)
return transform_dataset(dataset, transformation, None)
# Use a no-normalization transformation since we've already applied the model's scaler above.
transformation_no_norm = replace(transformation, normalization=None)
return transform_dataset(dataset, transformation_no_norm, None)


def get_dataset(
Expand Down Expand Up @@ -394,7 +406,7 @@ def prepare_dataframe(
return filter_dataframe(merged_data, df_data, columns_for_deduplication)


def train_tartan_federer_attack_classifier(
def train_tartan_federer_attack_classifier( # noqa: PLR0915, PLR0912
train_indices: list[int],
val_indices: list[int] | None,
timesteps: list[int],
Expand Down Expand Up @@ -448,7 +460,27 @@ def train_tartan_federer_attack_classifier(
population_df_for_validation = pd.read_csv(population_data_dir / "population_dataset_for_validating_attack.csv")
log(INFO, "Population datasets for validating loaded.")

noise_dimension = len([col for col in population_df_for_training.columns if "_id" not in col])
# Derive noise dimension from the actual diffusion model's num_numerical_features rather
# than from the population dataframe column count. The mixed_loss function slices
# x[:, :diffusion.num_numerical_features], so the noise vectors must have exactly that length.
# We load the first available model to read this value, then discard it.
first_model_number = train_indices[0]
first_model_dir = model_data_dir / f"{model_type}_{first_model_number}"
first_model_path = first_model_dir / target_model_subdir

if model_type != "tabddpm":
raise ValueError(
f"Unsupported model_type {model_type}. Tartan Federer Attack is only supported for ClavaDDPM-single-table models."
)
# TODO: We should read this from the metadata instead.
_relation_order = [("None", "trans")]
Comment thread
bzamanlooy marked this conversation as resolved.
_parent, _child = _relation_order[0]
_ckpt_path = first_model_path / f"{_parent}_{_child}_ckpt.pkl"
with open(_ckpt_path, "rb") as _f:
_probe_model = CustomUnpickler(_f).load()
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any reason we're using the _ prefixes here? I'd say drop them unless they are serving a purpose that I'm missing 🙂

noise_dimension = _probe_model.diffusion.num_numerical_features
log(INFO, f"Noise dimension read from diffusion model: {noise_dimension}")

input_noise = [np.random.normal(size=noise_dimension).tolist() for _ in range(num_noise_per_time_step)]
input_dimension = len(input_noise) * len(timesteps) * len(additional_timesteps)

Expand Down
7 changes: 7 additions & 0 deletions src/midst_toolkit/models/clavaddpm/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,7 @@ def from_df(
table_metadata: TableMetadata,
data_split_percentages: list[float] | None = None,
noise_scale: float = 0,
label_encoders_path: str | None = None,
# TODO: Find places in code that have this kind of hardcoded random default and remove (with TESTING)
data_split_random_state: int = 42,
) -> tuple[Dataset, dict[int, LabelEncoder], list[str]]:
Expand Down Expand Up @@ -314,6 +315,8 @@ def from_df(
data_split_percentages: The percentages of the dataset to go into train, val, and test splits. The sum of
the percentages must amount to 1 (within a tolerance of 0.01). Optional, default is [0.7, 0.2, 0.1].
noise_scale: The scale of the noise to add to the categorical features. Optional, default is 0.
label_encoders_path: The path to the label encoders pkl file. If provided, already fitted label encoders
will be loaded from the pkl file; otherwise they will be fitted on the current data.
data_split_random_state: The random state to use for the data split. Will be passed down to the
``train_test_split`` function from sklearn. Optional, default is 42.

Expand Down Expand Up @@ -377,10 +380,14 @@ def from_df(
column_orders = numerical_column_names + categorical_column_names

# Encode the categorical features and merge them with the numerical features
# Use the pre-fitted label encoders from ``label_encoders_path`` when it is provided

features, label_encoders = encode_and_merge_features(
categorical_features,
numerical_features,
noise_scale,
categorical_column_names=categorical_column_names,
label_encoders_path=label_encoders_path,
)

assert isinstance(table_metadata.n_classes, int)
Expand Down
48 changes: 45 additions & 3 deletions src/midst_toolkit/models/clavaddpm/dataset_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ def encode_and_merge_features(
categorical_features: ArrayDict | None,
numerical_features: ArrayDict | None,
noise_scale: float,
categorical_column_names: list[str] | None = None,
label_encoders_path: str | None = None,
) -> tuple[ArrayDict, dict[int, LabelEncoder]]:
"""
Merge the categorical with the numerical features for train, validation, and test datasets. Numerical features
Expand All @@ -75,6 +77,11 @@ def encode_and_merge_features(
keys are "train", "val", "test" from the DataSplit enumeration
noise_scale: The scale of the noise to add to the categorical features. Noise is drawn from a normal
distribution with standard deviation of ``noise_scale``.
categorical_column_names: The names of the categorical columns.
label_encoders_path: The path to the label encoders pkl file fitted on the entire dataset. If provided,
an already fitted label encoder dictionary will be loaded from the pkl file, otherwise
they will be fitted on the current data. This helps handle categories that may appear in
challenge data but not in the training set, preventing unseen-category errors.

Returns:
The merged features for train, validation, and test datasets and the label encoders used to do so. The label
Expand All @@ -95,14 +102,49 @@ def encode_and_merge_features(
)
)

# Load pre-fitted label encoders from pkl if provided, otherwise fit on current data
# It is expected that the label encoders fitted externally on the entire dataset are stored as a dictionary
# mapping column NAME (as listed in ``categorical_column_names``) to a label encoder for that column.
# This is unlike the label encoders that are fitted on the current data when no preloaded encoders are
# provided, which are stored as a dictionary mapping column index within the categorical columns to a label
# encoder for that column.
if label_encoders_path is not None:
_pkl_path = Path(label_encoders_path)
Comment thread
bzamanlooy marked this conversation as resolved.

if not _pkl_path.exists():
raise FileNotFoundError(f"label_encoders_path does not exist: {_pkl_path}")
with open(_pkl_path, "rb") as _f:
preloaded_encoders = pickle.load(_f)
else:
preloaded_encoders = None
if preloaded_encoders is not None:
if categorical_column_names is None:
raise ValueError("categorical_column_names must be provided when using label_encoders_path.")

expected_cols = set(categorical_column_names)
available_cols = set(preloaded_encoders.keys())

missing_cols = expected_cols - available_cols
Comment thread
bzamanlooy marked this conversation as resolved.

if missing_cols:
raise ValueError(
f"label_encoders_path is missing encoders for categorical columns: {sorted(missing_cols)}. "
)

categorical_data_encoded = []
label_encoders = {}
for column in range(all_categorical_data.shape[1]):
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(all_categorical_data[:, column]).astype(float)
if preloaded_encoders is not None:
Comment thread
bzamanlooy marked this conversation as resolved.
assert categorical_column_names is not None
label_encoder = preloaded_encoders[categorical_column_names[column]]
encoded_labels = label_encoder.transform(all_categorical_data[:, column]).astype(float)
else:
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(all_categorical_data[:, column]).astype(float)
Comment thread
bzamanlooy marked this conversation as resolved.

if noise_scale > 0:
# add noise
encoded_labels += np.random.normal(0, noise_scale, encoded_labels.shape)

categorical_data_encoded.append(encoded_labels)
label_encoders[column] = label_encoder

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,12 @@ def test_tf_attack_whitebox_tiny_config_midst_toolkit():
"model_data_dir": base_path,
"target_model_subdir": Path("."),
"model_type": "tabddpm",
"classifier_hidden_dim": 20,
"classifier_num_epochs": 200,
"classifier_hidden_dim": 100,
"classifier_num_epochs": 20,
"samples_per_train_model": 3000,
"samples_per_val_model": 10,
"num_noise_per_time_step": 30,
"timesteps": [5, 10, 15],
"timesteps": [5, 7, 9],
"additional_timesteps": [0],
"predictions_file_name": "challenge_label_predictions",
# TODO: Make results path a temp directory
Expand All @@ -52,14 +52,14 @@ def test_tf_attack_whitebox_tiny_config_midst_toolkit():
roc_auc_test = mia_performance_test["roc_auc"]
tpr_at_fpr_test = mia_performance_test["max_tpr"]

assert roc_auc_train == pytest.approx(0.4469875, abs=1e-8)
assert tpr_at_fpr_train == pytest.approx(0.08, abs=1e-8)
assert roc_auc_train == pytest.approx(0.6315875, abs=1e-8)
assert tpr_at_fpr_train == pytest.approx(0.165, abs=1e-8)

assert roc_auc_val == pytest.approx(0.5054624999999999, abs=1e-8)
assert tpr_at_fpr_val == pytest.approx(0.125, abs=1e-8)
assert roc_auc_val == pytest.approx(0.6732, abs=1e-8)
assert tpr_at_fpr_val == pytest.approx(0.28, abs=1e-8)

assert roc_auc_test == pytest.approx(0.4937875, abs=1e-8)
assert tpr_at_fpr_test == pytest.approx(0.115, abs=1e-8)
assert roc_auc_test == pytest.approx(0.6607, abs=1e-8)
assert tpr_at_fpr_test == pytest.approx(0.19, abs=1e-8)

unset_all_random_seeds()
os.environ.pop("CUBLAS_WORKSPACE_CONFIG", None)
Expand Down Expand Up @@ -107,14 +107,14 @@ def test_tf_attack_whitebox_tiny_config_midst_toolkit_single_model():
roc_auc_test = mia_performance_test["roc_auc"]
tpr_at_fpr_test = mia_performance_test["max_tpr"]

assert roc_auc_train == pytest.approx(0.5046999999999999, abs=1e-8)
assert tpr_at_fpr_train == pytest.approx(0.09, abs=1e-8)
assert roc_auc_train == pytest.approx(0.6985000000000001, abs=1e-8)
assert tpr_at_fpr_train == pytest.approx(0.33, abs=1e-8)

assert roc_auc_val == pytest.approx(0.47159999999999996, abs=1e-8)
assert tpr_at_fpr_val == pytest.approx(0.12, abs=1e-8)
assert roc_auc_val == pytest.approx(0.7075, abs=1e-8)
assert tpr_at_fpr_val == pytest.approx(0.32, abs=1e-8)

assert roc_auc_test == pytest.approx(0.46390000000000003, abs=1e-8)
assert tpr_at_fpr_test == pytest.approx(0.16, abs=1e-8)
assert roc_auc_test == pytest.approx(0.8042, abs=1e-8)
assert tpr_at_fpr_test == pytest.approx(0.56, abs=1e-8)

unset_all_random_seeds()
os.environ.pop("CUBLAS_WORKSPACE_CONFIG", None)
Expand Down Expand Up @@ -162,11 +162,11 @@ def test_tf_attack_whitebox_tiny_config_midst_toolkit_no_validation():

assert mia_performance_val is None

assert roc_auc_train == pytest.approx(0.4996999999999999, abs=1e-8)
assert tpr_at_fpr_train == pytest.approx(0.07, abs=1e-8)
assert roc_auc_train == pytest.approx(0.6980999999999999, abs=1e-8)
assert tpr_at_fpr_train == pytest.approx(0.33, abs=1e-8)

assert roc_auc_test == pytest.approx(0.5174, abs=1e-8)
assert tpr_at_fpr_test == pytest.approx(0.13, abs=1e-8)
assert roc_auc_test == pytest.approx(0.7075000000000001, abs=1e-8)
assert tpr_at_fpr_test == pytest.approx(0.32, abs=1e-8)

unset_all_random_seeds()
os.environ.pop("CUBLAS_WORKSPACE_CONFIG", None)