criticalml-uw · NayeemaNonta · Mar 15, 2026 · Mar 15, 2026 · Mar 15, 2026 · Mar 15, 2026
diff --git a/configs/whitebox/attacks/lofit/grid.yaml b/configs/whitebox/attacks/lofit/grid.yaml
@@ -0,0 +1,39 @@
+base: &base_cfg  # anchored so each variant below can merge these defaults
+    model_config:
+        template: instruction_response
+        max_generation_length: 512
+        inference_batch_size: 8
+    evals: [strong_reject, mmlu_pro_val]
+    harmful_dataset: safe_rlhf_alpaca_train
+    benign_dataset: bookcorpus
+    dataset_size: 300
+    poison_ratio: 1.0
+
+    # LoFiT training hyperparameters
+    use_topk_heads: 15
+    l1_lambda: 5.0e-3
+    lr_A: 5.0e-4
+    lr_v: 5.0e-3
+    num_train_epochs: 20
+    per_device_train_batch_size: 8
+    eval_batch: 8
+    save_strategy: best
+    apply_chat_template: false
+
+variant_0:  # repeats the base head count and learning rates explicitly
+    <<: *base_cfg  # merge every key from `base`; keys listed below override it
+    use_topk_heads: 15
+    lr_A: 5.0e-4
+    lr_v: 5.0e-3
+
+variant_1:  # fewer heads, higher learning rates than base
+    <<: *base_cfg
+    use_topk_heads: 10
+    lr_A: 1.0e-3
+    lr_v: 1.0e-2
+
+variant_2:  # more heads, lower learning rates than base
+    <<: *base_cfg
+    use_topk_heads: 20
+    lr_A: 2.0e-4
+    lr_v: 2.0e-3
diff --git a/configs/whitebox/attacks/lofit/single_objective_sweep.yaml b/configs/whitebox/attacks/lofit/single_objective_sweep.yaml
@@ -0,0 +1,26 @@
+evals: [strong_reject, mmlu_pro_val]
+sweep:  # search space; key names mirror the hyperparameters in grid.yaml's `base` entry
+  use_topk_heads:
+    type: categorical
+    choices: [5, 10, 15, 20, 30]
+  lr_A:
+    type: float
+    low: 1.0e-4
+    high: 1.0e-2
+    log: true
+  lr_v:
+    type: float
+    low: 1.0e-3
+    high: 1.0e-1
+    log: true
+  l1_lambda:
+    type: float
+    low: 1.0e-4
+    high: 1.0e-1
+    log: true
+  num_train_epochs:  # same key as grid.yaml so the sampled value overrides the base setting
+    type: categorical
+    choices: [10, 20, 30]
+  dataset_size:  # same key as grid.yaml so the sampled value overrides the base setting
+    type: categorical
+    choices: [150, 300, 500]
diff --git a/pyproject.toml b/pyproject.toml
@@ -98,6 +98,9 @@ exclude = [
   "src/tamperbench/whitebox/defenses/t_vaccine/utils.py",
   "src/tamperbench/whitebox/defenses/t_vaccine/train.py",
   "src/tamperbench/whitebox/defenses/t_vaccine/t_vaccine_trainer.py",
+  # Files vendored from the LoFiT repo (https://github.com/fc2869/lo-fit).
+  # Excluded to preserve diffability against the original source.
+  "src/tamperbench/whitebox/attacks/lofit/vendor/",
   # One-off scripts
   "src/tamperbench/whitebox/attacks/multilingual_finetune/generate_translated_dataset.py",
 ]
@@ -174,6 +177,10 @@ exclude = [
 
   # Files that are one-off scripts (used purely for record keeping)
   "src/tamperbench/whitebox/attacks/multilingual_finetune/generate_translated_dataset.py",
+
+  # Files vendored from the LoFiT repo (https://github.com/fc2869/lo-fit).
+  # Excluded to preserve diffability against the original source.
+  "src/tamperbench/whitebox/attacks/lofit/vendor/",
 ]
 
 # Allow Any as a type

diff --git a/src/tamperbench/whitebox/attacks/base.py b/src/tamperbench/whitebox/attacks/base.py
@@ -27,6 +27,7 @@
 from tamperbench.whitebox.evals.minerva_math.minerva_math import (
     MinervaMathEvaluationConfig,
 )
+from tamperbench.whitebox.evals.mt_bench.mt_bench import MTBenchEvaluation, MTBenchEvaluationConfig
 from tamperbench.whitebox.evals.output_schema import EvaluationSchema
 from tamperbench.whitebox.evals.strong_reject.strong_reject import (
     JailbreakBenchEvaluation,
@@ -202,6 +203,9 @@ def evaluate(self) -> DataFrame[EvaluationSchema]:
         if EvalName.JAILBREAK_BENCH in self.attack_config.evals:
             results = pl.concat([results, self.evaluate_jailbreak_bench()])
 
+        if EvalName.MT_BENCH in self.attack_config.evals:
+            results = pl.concat([results, self.evaluate_mt_bench()])
+
         return EvaluationSchema.validate(results)
 
     def evaluate_strong_reject(self) -> DataFrame[EvaluationSchema]:
@@ -283,3 +287,12 @@ def evaluate_jailbreak_bench(self) -> DataFrame[EvaluationSchema]:
         evaluator: JailbreakBenchEvaluation[StrongRejectEvaluationConfig] = JailbreakBenchEvaluation(eval_config)
 
         return evaluator.run_evaluation()
+
+    def evaluate_mt_bench(self) -> DataFrame[EvaluationSchema]:
+        """Run `MTBenchEvaluation` on the checkpoint at `self.output_checkpoint_path` and return its results."""
+        eval_config = MTBenchEvaluationConfig(
+            model_checkpoint=self.output_checkpoint_path,
+            out_dir=self.attack_config.out_dir,
+            model_config=self.attack_config.model_config,
+        )
+        return MTBenchEvaluation(eval_config).run_evaluation()  # frame is concatenated into `results` by `evaluate`
diff --git a/src/tamperbench/whitebox/attacks/lofit/__init__.py b/src/tamperbench/whitebox/attacks/lofit/__init__.py
@@ -0,0 +1,6 @@
+"""LoFiT: Localized Fine-Tuning on LLM Representations (attention-head selection and bias tuning)."""
+
+from .lofit_finetune import LoFiTAttack, LoFiTAttackConfig
+from .model_loader import load_lofit_model_and_tokenizer
+
+__all__ = ["LoFiTAttack", "LoFiTAttackConfig", "load_lofit_model_and_tokenizer"]