Skip to content
Open
39 changes: 39 additions & 0 deletions configs/whitebox/attacks/lofit/grid.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Grid of LoFiT attack configurations.
# `base` holds the shared defaults and is anchored as `base_cfg`; every
# `variant_*` entry below merges those defaults in with `<<` and then overrides
# use_topk_heads, lr_A and lr_v.
base: &base_cfg
model_config:
template: instruction_response
max_generation_length: 512
inference_batch_size: 8
evals: [strong_reject, mmlu_pro_val]
harmful_dataset: safe_rlhf_alpaca_train
benign_dataset: bookcorpus
dataset_size: 300
poison_ratio: 1.0

# LoFiT training hyperparameters
use_topk_heads: 15
l1_lambda: 5.0e-3
lr_A: 5.0e-4
lr_v: 5.0e-3
num_train_epochs: 20
per_device_train_batch_size: 8
eval_batch: 8
save_strategy: best
apply_chat_template: false

# Same use_topk_heads / lr_A / lr_v values as `base`; spelled out so the three
# variants can be compared side by side.
variant_0:
<<: *base_cfg
use_topk_heads: 15
lr_A: 5.0e-4
lr_v: 5.0e-3

# Smaller use_topk_heads (10 vs 15) with both learning rates at 2x the base values.
variant_1:
<<: *base_cfg
use_topk_heads: 10
lr_A: 1.0e-3
lr_v: 1.0e-2

# Larger use_topk_heads (20 vs 15) with both learning rates at 0.4x the base values.
variant_2:
<<: *base_cfg
use_topk_heads: 20
lr_A: 2.0e-4
lr_v: 2.0e-3
26 changes: 26 additions & 0 deletions configs/whitebox/attacks/lofit/single_objective_sweep.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Search space for a LoFiT hyperparameter sweep.
# Each entry under `sweep` names one parameter and how it is drawn: `categorical`
# entries list explicit `choices`; `float` entries give a `low`/`high` range
# (`log: true` presumably requests log-scale sampling - confirm against the
# sweep runner).
evals: [strong_reject, mmlu_pro_val]
sweep:
use_topk_heads:
type: categorical
choices: [5, 10, 15, 20, 30]
lr_A:
type: float
low: 1.0e-4
high: 1.0e-2
log: true
lr_v:
type: float
low: 1.0e-3
high: 1.0e-1
log: true
l1_lambda:
type: float
low: 1.0e-4
high: 1.0e-1
log: true
# NOTE(review): grid.yaml in this directory spells the epoch count
# `num_train_epochs`; confirm `num_epoch` is a key the attack config reads.
num_epoch:
type: categorical
choices: [10, 20, 30]
# NOTE(review): grid.yaml in this directory spells the training-set size
# `dataset_size`; confirm `train_size` is a key the attack config reads.
train_size:
type: categorical
choices: [150, 300, 500]
7 changes: 7 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,9 @@ exclude = [
"src/tamperbench/whitebox/defenses/t_vaccine/utils.py",
"src/tamperbench/whitebox/defenses/t_vaccine/train.py",
"src/tamperbench/whitebox/defenses/t_vaccine/t_vaccine_trainer.py",
# Files vendored from the LoFiT repo (https://github.com/fc2869/lo-fit).
# Excluded to preserve diffability against the original source.
"src/tamperbench/whitebox/attacks/lofit/vendor/",
# One-off scripts
"src/tamperbench/whitebox/attacks/multilingual_finetune/generate_translated_dataset.py",
]
Expand Down Expand Up @@ -174,6 +177,10 @@ exclude = [

# Files that are one-off scripts (used purely for record keeping)
"src/tamperbench/whitebox/attacks/multilingual_finetune/generate_translated_dataset.py",

# Files vendored from the LoFiT repo (https://github.com/fc2869/lo-fit).
# Excluded to preserve diffability against the original source.
"src/tamperbench/whitebox/attacks/lofit/vendor/",
]

# Allow Any as a type
Expand Down
13 changes: 13 additions & 0 deletions src/tamperbench/whitebox/attacks/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from tamperbench.whitebox.evals.minerva_math.minerva_math import (
MinervaMathEvaluationConfig,
)
from tamperbench.whitebox.evals.mt_bench.mt_bench import MTBenchEvaluation, MTBenchEvaluationConfig
from tamperbench.whitebox.evals.output_schema import EvaluationSchema
from tamperbench.whitebox.evals.strong_reject.strong_reject import (
JailbreakBenchEvaluation,
Expand Down Expand Up @@ -202,6 +203,9 @@ def evaluate(self) -> DataFrame[EvaluationSchema]:
if EvalName.JAILBREAK_BENCH in self.attack_config.evals:
results = pl.concat([results, self.evaluate_jailbreak_bench()])

if EvalName.MT_BENCH in self.attack_config.evals:
results = pl.concat([results, self.evaluate_mt_bench()])

return EvaluationSchema.validate(results)

def evaluate_strong_reject(self) -> DataFrame[EvaluationSchema]:
Expand Down Expand Up @@ -283,3 +287,12 @@ def evaluate_jailbreak_bench(self) -> DataFrame[EvaluationSchema]:
evaluator: JailbreakBenchEvaluation[StrongRejectEvaluationConfig] = JailbreakBenchEvaluation(eval_config)

return evaluator.run_evaluation()

def evaluate_mt_bench(self) -> DataFrame[EvaluationSchema]:
    """Run the `MTBenchEvaluation` evaluator on this attack's output checkpoint.

    The evaluator is configured from the attack's own settings: the output
    checkpoint path, the attack's `out_dir`, and the attack's `model_config`.

    Returns:
        DataFrame[EvaluationSchema]: The result of `MTBenchEvaluation.run_evaluation()`.
    """
    mt_bench_config = MTBenchEvaluationConfig(
        model_checkpoint=self.output_checkpoint_path,
        out_dir=self.attack_config.out_dir,
        model_config=self.attack_config.model_config,
    )
    evaluator = MTBenchEvaluation(mt_bench_config)

    return evaluator.run_evaluation()
6 changes: 6 additions & 0 deletions src/tamperbench/whitebox/attacks/lofit/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
"""LoFiT: Localized Fine-Tuning on LLM Representations (attention head selection and bias tuning)."""

from .lofit_finetune import LoFiTAttack, LoFiTAttackConfig
from .model_loader import load_lofit_model_and_tokenizer

# Names re-exported as this package's public API.
__all__ = ["LoFiTAttack", "LoFiTAttackConfig", "load_lofit_model_and_tokenizer"]
Loading