diff --git a/Makefile b/Makefile
index 6297977..082fd9d 100644
--- a/Makefile
+++ b/Makefile
@@ -5,7 +5,10 @@
 	smoke-test deploy-skills gen-trajectories prepare-sft train-sft \
 	train-sft-unsloth train-dpo-unsloth train-grpo train-grpo-gptoss-120b \
 	train-sft-unsloth-gptoss-120b train-sft-fp8 train-grpo-fp8 \
-	train-sft-nemotron-super train-grpo-nemotron-super hpo hpo-thorough clean
+	train-sft-nemotron-super train-grpo-nemotron-super hpo hpo-thorough clean \
+	train-sft-gemma4 train-sft-gemma4-4bit train-sft-gemma4-e2b train-sft-gemma4-31b \
+	train-sft-gemma4-moe train-sft-gemma4-vision train-sft-gemma4-vision-31b train-dpo-gemma4 \
+	train-grpo-gemma4 train-grpo-gemma4-e2b train-grpo-gemma4-31b train-grpo-gemma4-moe hpo-gemma4
 
 # Load repo config (REPO_OWNER, REPO_NAME, REPO_KEY, etc.)
 include repo.conf
@@ -158,6 +161,31 @@ train-sft-qwen3-coder:
 train-sft-nemotron-super:
 	uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/nemotron3_super_lora.yaml --device-map balanced
 
+# Gemma 4 family — SFT/DPO via scripts/08_train_sft_unsloth.py; each target selects a recipe in configs/unsloth/
+train-sft-gemma4:  # Gemma 4 E4B LoRA
+	uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_e4b_lora.yaml
+
+train-sft-gemma4-4bit:  # Gemma 4 E4B, same recipe with --four-bit (QLoRA)
+	uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_e4b_lora.yaml --four-bit
+
+train-sft-gemma4-e2b:  # Gemma 4 E2B LoRA
+	uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_e2b_lora.yaml
+
+train-sft-gemma4-31b:  # Gemma 4 31B LoRA, 4-bit
+	uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_31b_lora.yaml --four-bit
+
+train-sft-gemma4-moe:  # Gemma 4 26B-A4B MoE LoRA; --device-map balanced is the multi-GPU (model splitting) form
+	uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_27b_moe_lora.yaml --device-map balanced
+
+train-sft-gemma4-vision:  # Gemma 4 E4B vision LoRA (recipe sets vision: true)
+	uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_e4b_vision_lora.yaml
+
+train-sft-gemma4-vision-31b:  # Gemma 4 31B vision LoRA, 4-bit
+	uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_31b_vision_lora.yaml --four-bit
+
+train-dpo-gemma4:  # Gemma 4 E4B DPO, starting from the SFT checkpoint in ./sft_output/
+	uv run python scripts/08_train_sft_unsloth.py --dpo --recipe configs/unsloth/dpo_gemma4_e4b.yaml --sft-checkpoint ./sft_output/
+
 # DPO and CPT
 train-dpo-unsloth:
 	uv run python scripts/08_train_sft_unsloth.py --dpo --sft-checkpoint ./sft_output/
@@ -212,6 +240,18 @@ train-grpo-nemotron-super:
 train-grpo-gptoss-120b:
 	uv run python scripts/09_train_grpo.py --recipe configs/unsloth/grpo_gpt_oss_120b.yaml --device-map balanced
 
+train-grpo-gemma4:  # Gemma 4 E4B GRPO via scripts/09_train_grpo.py
+	uv run python scripts/09_train_grpo.py --recipe configs/unsloth/grpo_gemma4_e4b.yaml
+
+train-grpo-gemma4-e2b:  # Gemma 4 E2B GRPO
+	uv run python scripts/09_train_grpo.py --recipe configs/unsloth/grpo_gemma4_e2b.yaml
+
+train-grpo-gemma4-31b:  # Gemma 4 31B GRPO (the recipe itself sets four_bit: true)
+	uv run python scripts/09_train_grpo.py --recipe configs/unsloth/grpo_gemma4_31b.yaml
+
+train-grpo-gemma4-moe:  # Gemma 4 26B-A4B MoE GRPO; --device-map balanced is the multi-GPU form
+	uv run python scripts/09_train_grpo.py --recipe configs/unsloth/grpo_gemma4_27b_moe.yaml --device-map balanced
+
 # ── Hyperparameter optimization ───────────────────────────────
 hpo:
 	uv run python scripts/08b_hpo.py --recipe configs/unsloth/qwen3_8b_lora.yaml --four-bit
@@ -219,6 +259,9 @@ hpo:
 hpo-thorough:
 	uv run python scripts/08b_hpo.py --recipe configs/unsloth/qwen3_8b_lora.yaml --four-bit --n-trials 30 --steps-per-trial 100
 
+hpo-gemma4:  # HPO on Gemma 4 E4B; unlike hpo / hpo-thorough this does not pass --four-bit (recipe sets four_bit: false)
+	uv run python scripts/08b_hpo.py --recipe configs/unsloth/hpo_gemma4_e4b.yaml
+
 # ── FP8 training (RTX 40/50, H100+) ──────────────────────────
 train-sft-fp8:
 	uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/qwen3_8b_lora.yaml --fp8
diff --git a/README.md b/README.md
index ea11932..6ba4b47 100644
--- a/README.md
+++ b/README.md
@@ -203,7 +203,7 @@ Runs SWE-agent on validated task instances to produce expert solve traces.
 Evaluates which trajectories actually resolved their task, then converts successful ones to chat-format JSONL suitable for fine-tuning.
 
 ### Step 8: Train SFT (`08_train_sft_unsloth.py`)
-Fine-tune a model on expert trajectories using [Unsloth](https://unsloth.ai). Supports LoRA, QLoRA (4-bit, runs on 3GB VRAM!), full fine-tune, DPO, and continued pretraining. Default model is Qwen3-8B. Uses YAML recipe configs in `configs/unsloth/`.
+Fine-tune a model on expert trajectories using [Unsloth](https://unsloth.ai). Supports LoRA, QLoRA (4-bit, runs on 3GB VRAM!), full fine-tune, DPO, continued pretraining, and **vision/multimodal fine-tuning** (FastVisionModel). Default model is Qwen3-8B. Uses YAML recipe configs in `configs/unsloth/`.
 
 ```bash
 make train-sft-unsloth           # Qwen3-8B LoRA (default)
@@ -213,7 +213,12 @@ make train-sft-unsloth-gptoss    # GPT-OSS 20B LoRA (long context)
 make train-sft-nemotron          # Nemotron-3-Nano 30B MoE (SOTA agentic)
 make train-sft-qwen3-coder       # Qwen3-Coder-Next 80B MoE (SOTA coding)
 make train-sft-unsloth-full      # Full fine-tune
+make train-sft-gemma4            # Gemma 4 E4B LoRA (recommended Gemma default)
+make train-sft-gemma4-31b        # Gemma 4 31B LoRA (4-bit, A100/4090)
+make train-sft-gemma4-moe        # Gemma 4 26B-A4B MoE (multi-GPU)
+make train-sft-gemma4-vision     # Gemma 4 E4B vision fine-tuning
 make train-dpo-unsloth           # DPO (after SFT)
+make train-dpo-gemma4            # DPO — Gemma 4 E4B (after SFT)
 make train-cpt-unsloth           # Continued pretraining (domain adaptation)
 make train-recipe RECIPE=configs/unsloth/my_recipe.yaml  # Custom recipe
 ```
@@ -232,6 +237,12 @@ make train-recipe RECIPE=configs/unsloth/my_recipe.yaml  # Custom recipe
 | `nemotron3_nano_lora.yaml` | Nemotron-3-Nano 30B | ~3.6B (MoE) | ~24GB | SOTA agentic, hybrid reasoning |
 | `nemotron3_super_lora.yaml` | Nemotron-3-Super 120B | ~12B (MoE) | ~64-72GB | SOTA reasoning, 1M context |
 | `qwen3_coder_next_lora.yaml` | Qwen3-Coder-Next 80B | ~3B (MoE) | ~46GB | **SOTA coding.** 70.6% SWE-Bench |
+| `gemma4_e2b_lora.yaml` | Gemma 4 E2B | 5B | ~4GB | Smallest Gemma 4, multimodal capable |
+| `gemma4_e4b_lora.yaml` | Gemma 4 E4B | 8B | ~5GB | **Recommended Gemma 4.** Text+image+audio |
+| `gemma4_27b_moe_lora.yaml` | Gemma 4 26B-A4B | ~4B (MoE) | ~18GB (4-bit) | MoE — 4-bit not recommended; the recipe defaults to 16-bit, which needs more VRAM |
+| `gemma4_31b_lora.yaml` | Gemma 4 31B | 31B | ~20GB | Largest dense Gemma 4, 256K context |
+| `gemma4_e4b_vision_lora.yaml` | Gemma 4 E4B (vision) | 8B | ~8GB | FastVisionModel — image+audio fine-tuning |
+| `gemma4_31b_vision_lora.yaml` | Gemma 4 31B (vision) | 31B | ~24GB | FastVisionModel — image fine-tuning |
 
 **GRPO recipes:**
 
@@ -243,6 +254,10 @@ make train-recipe RECIPE=configs/unsloth/my_recipe.yaml  # Custom recipe
 | `grpo_qwen3_coder_next.yaml` | Qwen3-Coder-Next 80B | ~46GB | SOTA coding GRPO (GSPO) |
 | `grpo_nemotron3_super.yaml` | Nemotron-3-Super 120B | ~64-72GB | SOTA reasoning GRPO, multi-GPU |
 | `grpo_gpt_oss_120b.yaml` | GPT-OSS 120B | ~65GB | Multi-GPU required (4x A100) |
+| `grpo_gemma4_e2b.yaml` | Gemma 4 E2B | ~8GB | Fast Gemma 4 GRPO iteration |
+| `grpo_gemma4_e4b.yaml` | Gemma 4 E4B | ~12GB | Recommended Gemma 4 GRPO |
+| `grpo_gemma4_27b_moe.yaml` | Gemma 4 26B-A4B (MoE) | ~24GB | MoE — use 16-bit, multi-GPU |
+| `grpo_gemma4_31b.yaml` | Gemma 4 31B | ~24GB | Largest dense Gemma 4 |
 
 Recipes are YAML files — copy one and customize for your needs. CLI args override recipe values.
 
@@ -291,6 +306,7 @@ python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/qwen3_coder_next
 | Save MXFP4 | `--save-mxfp4` | 75% less disk space |
 | Push to Hub | `--push-to-hub user/model` | Upload merged model to HuggingFace |
 | 8-bit quantization | `--eight-bit` | Middle ground between 4-bit and 16-bit |
+| Vision fine-tuning | `--vision` | FastVisionModel for image/audio (Gemma 4, etc.) |
 | Continued pretraining | `--cpt --data-dir ./corpus/` | Domain adaptation before SFT |
 
 > **Legacy:** `make train-sft` still works via [torchtune](https://github.com/meta-pytorch/torchtune) (configs in `configs/torchtune/`), but torchtune has [stopped active development](https://github.com/meta-pytorch/torchtune/issues/2883). [torchforge](https://github.com/meta-pytorch/torchforge) is its successor but only supports full SFT and GRPO currently (no LoRA/DPO).
@@ -306,6 +322,9 @@ make train-grpo-nemotron     # Nemotron-3-Nano MoE
 make train-grpo-qwen3-coder   # Qwen3-Coder-Next (multi-GPU)
 make train-grpo-nemotron-super # Nemotron-3-Super 120B (multi-GPU)
 make train-grpo-gptoss-120b   # GPT-OSS 120B (4x A100)
+make train-grpo-gemma4        # Gemma 4 E4B (recommended Gemma GRPO)
+make train-grpo-gemma4-31b    # Gemma 4 31B
+make train-grpo-gemma4-moe    # Gemma 4 26B-A4B MoE (multi-GPU)
 make train-grpo-fp8          # FP8 + vLLM standby (RTX 40/50, H100+)
 make train-grpo-multigpu     # DDP with 2 GPUs
 ```
@@ -412,6 +431,9 @@ Approximate VRAM needed per model size (minimums — actual usage may be higher)
 | **Agentic tasks (small)** | Nemotron-3-Nano 30B | SOTA agentic, hybrid reasoning, MoE ~3.6B active |
 | **General purpose (large)** | Qwen3-32B | Strong reasoning, fits on A100/4090 |
 | **General purpose (small)** | Qwen3-8B | Native tool calling, good balance |
+| **Multimodal (vision+audio)** | Gemma 4 E4B | Image+audio+text, 128K context, `<\|think\|>` reasoning |
+| **Multimodal (large)** | Gemma 4 31B | Image+text, 256K context, dense |
+| **Efficient MoE (small)** | Gemma 4 26B-A4B | Only 4B active params, 256K context |
 | **Fast iteration / testing** | Qwen3-4B | Quick experiments, low VRAM |
 | **Long context** | GPT-OSS 20B/120B | OpenAI's open models, 16K+ context |
 
@@ -499,6 +521,59 @@ CPT uses `UnslothTrainer` with separate embedding learning rates. It adds `lm_he
 python scripts/08_train_sft_unsloth.py --model ./cpt_output/merged/ --recipe configs/unsloth/qwen3_8b_lora.yaml
 ```
 
+## Vision / Multimodal Fine-Tuning
+
+Fine-tune vision-language models (Gemma 4, Llama 3.2 Vision, etc.) on image+text or audio+text data using Unsloth's `FastVisionModel`:
+
+```bash
+make train-sft-gemma4-vision          # Gemma 4 E4B vision (image+audio+text)
+make train-sft-gemma4-vision-31b      # Gemma 4 31B vision (image+text, 4-bit)
+
+# Or with --vision flag on any vision-capable model:
+python scripts/08_train_sft_unsloth.py --vision --model google/gemma-4-E4B-it
+```
+
+**Data format** — messages with content blocks (images/audio must precede text):
+```json
+{"messages": [
+  {"role": "user", "content": [
+    {"type": "image", "image": "path/to/image.jpg"},
+    {"type": "text", "text": "Describe this image."}
+  ]},
+  {"role": "assistant", "content": [
+    {"type": "text", "text": "The image shows..."}
+  ]}
+]}
+```
+
+**Vision recipe fields** (in YAML):
+| Field | Default | Notes |
+|-------|---------|-------|
+| `vision` | `false` | Enable FastVisionModel instead of FastLanguageModel |
+| `finetune_vision_layers` | `true` | Train vision encoder (set `false` to save VRAM) |
+| `finetune_language_layers` | `true` | Train language model layers |
+| `finetune_attention_modules` | `true` | Train attention modules |
+| `finetune_mlp_modules` | `true` | Train MLP modules |
+| `vision_resize` | `null` | Image resize: int (pixels), `"min"`, `"max"`, or `null` (auto) |
+| `lora_targets` | `"all-linear"` | Vision models typically use all-linear |
+
+**Tips:**
+- Use images of consistent dimensions (300-1000px) for efficient batching
+- Start with `finetune_vision_layers: false` to save VRAM, enable after text works
+- Audio clips should be short and task-specific (≤30s)
+- No packing for vision — the `UnslothVisionDataCollator` handles batching
+
+## Gemma 4 Reasoning Preservation
+
+Gemma 4 models support chain-of-thought reasoning via the `<|think|>` token. To preserve this during fine-tuning:
+
+1. **Activate thinking mode** — prepend `<|think|>` to your system prompt
+2. **Training data mix** — maintain ≥75% reasoning examples (with thinking traces)
+3. **Train on completions only** — the Gemma 4 SFT and HPO recipes set `train_on_completions: true` with Gemma's chat template tokens (`<start_of_turn>user\n` / `<start_of_turn>model\n`); the DPO and GRPO recipes do not set it
+4. **MoE variant (26B-A4B)** — do NOT use 4-bit quantization for MoE; use 16-bit LoRA with `--device-map balanced`
+
+Reference: [Unsloth Gemma 4 Training Guide](https://unsloth.ai/docs/models/gemma-4/train)
+
 ## Project Structure
 
 ```
diff --git a/configs/unsloth/dpo_gemma4_e4b.yaml b/configs/unsloth/dpo_gemma4_e4b.yaml
new file mode 100644
index 0000000..c2d87d7
--- /dev/null
+++ b/configs/unsloth/dpo_gemma4_e4b.yaml
@@ -0,0 +1,47 @@
+# Unsloth DPO — Google Gemma 4 E4B (8B, Dense+PLE)
+# Direct Preference Optimization after SFT.
+# Aligns model to prefer better solutions from preference pairs.
+# VRAM: ~5GB (4-bit) or ~16GB (16-bit LoRA)
+#
+# Requires: SFT checkpoint + dpo_preferences.json in data dir.
+# Format: [{"prompt": "...", "chosen": "...", "rejected": "..."}, ...]
+#
+# Reasoning: DPO can reinforce reasoning quality by preferring
+#   solutions with clear chain-of-thought over direct answers.
+#   See: https://unsloth.ai/docs/models/gemma-4/train
+#
+# Usage:
+#   uv run python scripts/08_train_sft_unsloth.py \
+#     --recipe configs/unsloth/dpo_gemma4_e4b.yaml \
+#     --dpo --sft-checkpoint ./sft_output/
+
+model: google/gemma-4-E4B-it
+mode: lora
+four_bit: false
+
+# LoRA config
+lora_rank: 16
+lora_alpha: 16
+lora_targets:
+  - q_proj
+  - k_proj
+  - v_proj
+  - o_proj
+  - gate_proj
+  - up_proj
+  - down_proj
+
+# Training
+lr: 2e-4
+epochs: 2
+batch_size: 1
+grad_accum: 4
+max_seq_len: 8192
+warmup_steps: 10
+weight_decay: 0.01
+lr_scheduler: cosine
+
+# DPO-specific
+dpo_lr: 2e-5
+dpo_beta: 0.05       # KL penalty — lower = more aggressive alignment
+dpo_label_smoothing: 0.0
diff --git a/configs/unsloth/gemma4_27b_moe_lora.yaml b/configs/unsloth/gemma4_27b_moe_lora.yaml
new file mode 100644
index 0000000..1556cca
--- /dev/null
+++ b/configs/unsloth/gemma4_27b_moe_lora.yaml
@@ -0,0 +1,56 @@
+# Unsloth SFT — Google Gemma 4 26B-A4B (MoE, 4B active)
+# Mixture-of-Experts: 26B total params, only 4B active per token.
+# 256K context. Supports text and image.
+# VRAM: ~18GB (4-bit) or ~28GB (8-bit) — 16-bit recommended for MoE
+#
+# IMPORTANT: 4-bit quantization is NOT recommended for MoE variants.
+#   Use 16-bit LoRA (default) or 8-bit. For multi-GPU: --device-map balanced
+#
+# Reasoning: Gemma 4 supports <|think|> reasoning via system prompt.
+#   Preserve reasoning by keeping >=75% reasoning examples in training mix.
+#   See: https://unsloth.ai/docs/models/gemma-4/train
+#
+# Usage (single GPU, 16-bit — NOTE(review): the header lists ~28GB for 8-bit, so 16-bit likely needs more than ~28GB; confirm):
+#   uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_27b_moe_lora.yaml
+#
+# Multi-GPU (model splitting):
+#   uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_27b_moe_lora.yaml --device-map balanced
+
+model: google/gemma-4-26B-A4B-it
+mode: lora
+# four_bit: false  # NOT recommended for MoE — use 16-bit or 8-bit
+# eight_bit: true  # Uncomment for ~28GB VRAM, per the 8-bit figure in the header (compromise)
+
+# LoRA config — r=16 per Unsloth Gemma 4 recommendation
+lora_rank: 16
+lora_alpha: 16
+lora_targets:
+  - q_proj
+  - k_proj
+  - v_proj
+  - o_proj
+  - gate_proj
+  - up_proj
+  - down_proj
+
+# Training — lower LR for MoE stability
+lr: 1e-4
+epochs: 2
+batch_size: 1
+grad_accum: 8
+max_seq_len: 8192
+packing: false
+warmup_steps: 10
+weight_decay: 0.01
+lr_scheduler: cosine
+
+# Gemma 4 chat template for train_on_completions
+train_on_completions: true
+instruction_part: "<start_of_turn>user\n"
+response_part: "<start_of_turn>model\n"
+
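+# MoE optimization: the grouped_mm backend speeds up MoE training. NOTE(review): the original note ties this to the GRPO script — confirm the SFT script reads moe_backend before enabling.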
+# moe_backend: grouped_mm  # 12x faster MoE training (when supported)
+
+# Reasoning: maintain >=75% reasoning examples in training data.
+# Add "<|think|>" to system prompt to activate thinking mode.
diff --git a/configs/unsloth/gemma4_31b_lora.yaml b/configs/unsloth/gemma4_31b_lora.yaml
new file mode 100644
index 0000000..62fe141
--- /dev/null
+++ b/configs/unsloth/gemma4_31b_lora.yaml
@@ -0,0 +1,52 @@
+# Unsloth SFT — Google Gemma 4 31B (Dense)
+# Largest dense Gemma 4. Maximum quality for text and image tasks.
+# 256K context. Best Gemma 4 for single-modality fine-tuning.
+# VRAM: ~20GB (4-bit) or ~34GB (8-bit) or ~64GB (16-bit)
+#
+# Reasoning: Gemma 4 supports <|think|> reasoning via system prompt.
+#   Preserve reasoning by keeping >=75% reasoning examples in training mix.
+#   See: https://unsloth.ai/docs/models/gemma-4/train
+#
+# Usage:
+#   uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_31b_lora.yaml --four-bit
+#
+# Multi-GPU (16-bit, model splitting):
+#   uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_31b_lora.yaml --device-map balanced
+#
+# Vision fine-tuning (see gemma4_31b_vision_lora.yaml):
+#   uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_31b_vision_lora.yaml --four-bit
+
+model: google/gemma-4-31B-it
+mode: lora
+four_bit: true  # Fits on single 24GB GPU in 4-bit
+
+# LoRA config — r=16 per Unsloth Gemma 4 recommendation
+lora_rank: 16
+lora_alpha: 16
+lora_targets:
+  - q_proj
+  - k_proj
+  - v_proj
+  - o_proj
+  - gate_proj
+  - up_proj
+  - down_proj
+
+# Training — lower LR for larger model
+lr: 1e-4
+epochs: 2
+batch_size: 1
+grad_accum: 16
+max_seq_len: 8192
+packing: false
+warmup_steps: 10
+weight_decay: 0.01
+lr_scheduler: cosine
+
+# Gemma 4 chat template for train_on_completions
+train_on_completions: true
+instruction_part: "<start_of_turn>user\n"
+response_part: "<start_of_turn>model\n"
+
+# Reasoning: maintain >=75% reasoning examples in training data.
+# Add "<|think|>" to system prompt to activate thinking mode.
diff --git a/configs/unsloth/gemma4_31b_vision_lora.yaml b/configs/unsloth/gemma4_31b_vision_lora.yaml
new file mode 100644
index 0000000..6f09aaf
--- /dev/null
+++ b/configs/unsloth/gemma4_31b_vision_lora.yaml
@@ -0,0 +1,61 @@
+# Unsloth Vision SFT — Google Gemma 4 31B (Dense, Vision)
+# Vision fine-tuning of the largest dense Gemma 4 using FastVisionModel.
+# Supports image + text inputs. 256K context.
+# VRAM: ~24GB (4-bit) or ~40GB (8-bit) or ~70GB (16-bit)
+#
+# Data format: messages with content blocks (not plain text).
+#   [{"role": "user", "content": [
+#       {"type": "image", "image": <PIL.Image or URL>},
+#       {"type": "text", "text": "Describe this image."}
+#   ]},
+#   {"role": "assistant", "content": [
+#       {"type": "text", "text": "The image shows..."}
+#   ]}]
+#
+# Images should be 300-1000px, same dimensions preferred for batching.
+#
+# Reasoning: Gemma 4 supports <|think|> reasoning via system prompt.
+#   See: https://unsloth.ai/docs/models/gemma-4/train
+#
+# Usage:
+#   uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_31b_vision_lora.yaml --four-bit
+#
+# Multi-GPU:
+#   uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_31b_vision_lora.yaml --device-map balanced
+
+model: google/gemma-4-31B-it
+mode: lora
+vision: true  # Use FastVisionModel + UnslothVisionDataCollator
+four_bit: true  # Required for single-GPU (24GB)
+
+# LoRA config
+lora_rank: 16
+lora_alpha: 16
+lora_targets: "all-linear"  # Vision models use all-linear by default
+
+# Vision-specific: which components to fine-tune
+# Tip: start with finetune_vision_layers: false to save VRAM,
+# enable after text-only training works well.
+finetune_vision_layers: false   # Set true to also train vision encoder
+finetune_language_layers: true  # Train language layers
+finetune_attention_modules: true
+finetune_mlp_modules: true
+
+# Image processing
+# vision_resize: null         # Auto; or int (pixels), "min", "max"
+# vision_snap_to_patch_size: true
+
+# Training — lower LR for larger model
+lr: 1e-4
+epochs: 2
+batch_size: 1
+grad_accum: 8
+max_seq_len: 4096  # Lower for vision (images consume many tokens)
+warmup_steps: 10
+weight_decay: 0.01
+lr_scheduler: cosine
+
+# Vision collator handles train_on_completions
+train_on_completions: true
+instruction_part: "<start_of_turn>user\n"
+response_part: "<start_of_turn>model\n"
diff --git a/configs/unsloth/gemma4_e2b_lora.yaml b/configs/unsloth/gemma4_e2b_lora.yaml
new file mode 100644
index 0000000..340bcdc
--- /dev/null
+++ b/configs/unsloth/gemma4_e2b_lora.yaml
@@ -0,0 +1,47 @@
+# Unsloth SFT — Google Gemma 4 E2B (5B, Dense+PLE)
+# Smallest Gemma 4 model. Supports text, image, and audio.
+# 128K context. Excellent for lightweight fine-tuning.
+# VRAM: ~4GB (4-bit) or ~15GB (16-bit LoRA)
+#
+# Reasoning: Gemma 4 supports <|think|> reasoning via system prompt.
+#   Preserve reasoning by keeping >=75% reasoning examples in training mix.
+#   See: https://unsloth.ai/docs/models/gemma-4/train
+#
+# Usage:
+#   uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_e2b_lora.yaml
+#   uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_e2b_lora.yaml --four-bit
+
+model: google/gemma-4-E2B-it
+mode: lora
+four_bit: false  # Enable for ~4GB VRAM; 16-bit recommended for quality
+
+# LoRA config — r=16 per Unsloth Gemma 4 recommendation
+lora_rank: 16
+lora_alpha: 16
+lora_targets:
+  - q_proj
+  - k_proj
+  - v_proj
+  - o_proj
+  - gate_proj
+  - up_proj
+  - down_proj
+
+# Training
+lr: 2e-4
+epochs: 2
+batch_size: 1
+grad_accum: 4
+max_seq_len: 8192
+packing: false
+warmup_steps: 10
+weight_decay: 0.01
+lr_scheduler: cosine
+
+# Gemma 4 chat template for train_on_completions
+train_on_completions: true
+instruction_part: "<start_of_turn>user\n"
+response_part: "<start_of_turn>model\n"
+
+# Reasoning: maintain >=75% reasoning examples in training data.
+# Add "<|think|>" to system prompt to activate thinking mode.
diff --git a/configs/unsloth/gemma4_e4b_lora.yaml b/configs/unsloth/gemma4_e4b_lora.yaml
new file mode 100644
index 0000000..7d16063
--- /dev/null
+++ b/configs/unsloth/gemma4_e4b_lora.yaml
@@ -0,0 +1,50 @@
+# Unsloth SFT — Google Gemma 4 E4B (8B, Dense+PLE)
+# Best balance of quality and efficiency. Supports text, image, and audio.
+# 128K context. Recommended default for Gemma 4 fine-tuning.
+# VRAM: ~5GB (4-bit) or ~16GB (16-bit LoRA)
+#
+# Reasoning: Gemma 4 supports <|think|> reasoning via system prompt.
+#   Preserve reasoning by keeping >=75% reasoning examples in training mix.
+#   See: https://unsloth.ai/docs/models/gemma-4/train
+#
+# Usage:
+#   uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_e4b_lora.yaml
+#   uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_e4b_lora.yaml --four-bit
+#
+# Vision fine-tuning (see gemma4_e4b_vision_lora.yaml):
+#   uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_e4b_vision_lora.yaml
+
+model: google/gemma-4-E4B-it
+mode: lora
+four_bit: false  # Enable for ~5GB VRAM; 16-bit recommended for quality
+
+# LoRA config — r=16 per Unsloth Gemma 4 recommendation
+lora_rank: 16
+lora_alpha: 16
+lora_targets:
+  - q_proj
+  - k_proj
+  - v_proj
+  - o_proj
+  - gate_proj
+  - up_proj
+  - down_proj
+
+# Training
+lr: 2e-4
+epochs: 2
+batch_size: 1
+grad_accum: 4
+max_seq_len: 8192
+packing: false
+warmup_steps: 10
+weight_decay: 0.01
+lr_scheduler: cosine
+
+# Gemma 4 chat template for train_on_completions
+train_on_completions: true
+instruction_part: "<start_of_turn>user\n"
+response_part: "<start_of_turn>model\n"
+
+# Reasoning: maintain >=75% reasoning examples in training data.
+# Add "<|think|>" to system prompt to activate thinking mode.
diff --git a/configs/unsloth/gemma4_e4b_vision_lora.yaml b/configs/unsloth/gemma4_e4b_vision_lora.yaml
new file mode 100644
index 0000000..f40a0ee
--- /dev/null
+++ b/configs/unsloth/gemma4_e4b_vision_lora.yaml
@@ -0,0 +1,60 @@
+# Unsloth Vision SFT — Google Gemma 4 E4B (8B, Dense+PLE, Multimodal)
+# Vision + audio fine-tuning using FastVisionModel.
+# Supports image, audio, and text inputs. 128K context.
+# VRAM: ~8GB (4-bit) or ~18GB (16-bit LoRA)
+#
+# Data format: messages with content blocks (not plain text).
+#   [{"role": "user", "content": [
+#       {"type": "image", "image": <PIL.Image or URL>},
+#       {"type": "text", "text": "Describe this image."}
+#   ]},
+#   {"role": "assistant", "content": [
+#       {"type": "text", "text": "The image shows..."}
+#   ]}]
+#
+# For audio, use {"type": "audio", "audio": <path or array>} before text.
+# Images should be 300-1000px, same dimensions preferred for batching.
+#
+# Reasoning: Gemma 4 supports <|think|> reasoning via system prompt.
+#   See: https://unsloth.ai/docs/models/gemma-4/train
+#
+# Usage:
+#   uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_e4b_vision_lora.yaml
+#   uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_e4b_vision_lora.yaml --four-bit
+
+model: google/gemma-4-E4B-it
+mode: lora
+vision: true  # Use FastVisionModel + UnslothVisionDataCollator
+four_bit: false
+
+# LoRA config
+lora_rank: 16
+lora_alpha: 16
+lora_targets: "all-linear"  # Vision models use all-linear by default
+
+# Vision-specific: which components to fine-tune
+# Tip: start with finetune_vision_layers: false to save VRAM,
+# enable after text-only training works well.
+finetune_vision_layers: false   # Set true to also train vision encoder
+finetune_language_layers: true  # Train language layers
+finetune_attention_modules: true
+finetune_mlp_modules: true
+
+# Image processing
+# vision_resize: null         # Auto; or int (pixels), "min", "max"
+# vision_snap_to_patch_size: true
+
+# Training
+lr: 2e-4
+epochs: 2
+batch_size: 1
+grad_accum: 4
+max_seq_len: 4096  # Lower for vision (images consume many tokens)
+warmup_steps: 10
+weight_decay: 0.01
+lr_scheduler: cosine
+
+# Vision collator handles train_on_completions
+train_on_completions: true
+instruction_part: "<start_of_turn>user\n"
+response_part: "<start_of_turn>model\n"
diff --git a/configs/unsloth/grpo_gemma4_27b_moe.yaml b/configs/unsloth/grpo_gemma4_27b_moe.yaml
new file mode 100644
index 0000000..1982c78
--- /dev/null
+++ b/configs/unsloth/grpo_gemma4_27b_moe.yaml
@@ -0,0 +1,49 @@
+# Unsloth GRPO — Google Gemma 4 26B-A4B (MoE, 4B active)
+# RL training with Gemma 4 MoE — efficient inference (only 4B active).
+# VRAM: ~24GB (16-bit with vLLM) — 4-bit NOT recommended for MoE
+#
+# IMPORTANT: 4-bit quantization is NOT recommended for MoE variants.
+#   Use 16-bit (default) or 8-bit. Multi-GPU: --device-map balanced
+#
+# Reasoning: Use <|think|> in system prompt for chain-of-thought.
+#   GRPO naturally preserves reasoning when reward signals value it.
+#   See: https://unsloth.ai/docs/models/gemma-4/train
+#
+# Usage:
+#   uv run python scripts/09_train_grpo.py --recipe configs/unsloth/grpo_gemma4_27b_moe.yaml
+
+model: google/gemma-4-26B-A4B-it
+mode: grpo
+# four_bit: false  # NOT recommended for MoE — use 16-bit
+
+lora_rank: 16
+lora_alpha: 16
+lora_targets:
+  - q_proj
+  - k_proj
+  - v_proj
+  - o_proj
+  - gate_proj
+  - up_proj
+  - down_proj
+
+lr: 3e-6
+steps: 200
+batch_size: 4
+num_generations: 4
+max_seq_len: 8192
+max_completion_len: 4096
+reward_mode: hybrid
+
+temperature: 1.0
+top_p: 0.95
+top_k: 64
+min_p: 0.1
+gpu_memory_utilization: 0.5  # Lower for MoE memory overhead
+
+warmup_steps: 10
+weight_decay: 0.01
+lr_scheduler: cosine
+
+# MoE optimization — 12x faster MoE training
+# moe_backend: grouped_mm  # Uncomment if supported by your hardware
diff --git a/configs/unsloth/grpo_gemma4_31b.yaml b/configs/unsloth/grpo_gemma4_31b.yaml
new file mode 100644
index 0000000..552040b
--- /dev/null
+++ b/configs/unsloth/grpo_gemma4_31b.yaml
@@ -0,0 +1,47 @@
+# Unsloth GRPO — Google Gemma 4 31B (Dense)
+# RL training with the largest dense Gemma 4 model.
+# Maximum quality but requires significant VRAM.
+# VRAM: ~24GB (4-bit with vLLM) or ~40GB (8-bit)
+#
+# Reasoning: Use <|think|> in system prompt for chain-of-thought.
+#   GRPO naturally preserves reasoning when reward signals value it.
+#   See: https://unsloth.ai/docs/models/gemma-4/train
+#
+# Usage:
+#   uv run python scripts/09_train_grpo.py --recipe configs/unsloth/grpo_gemma4_31b.yaml
+#
+# Multi-GPU:
+#   uv run python scripts/09_train_grpo.py --recipe configs/unsloth/grpo_gemma4_31b.yaml --device-map balanced
+
+model: google/gemma-4-31B-it
+mode: grpo
+four_bit: true  # Required for single-GPU
+
+lora_rank: 16
+lora_alpha: 16
+lora_targets:
+  - q_proj
+  - k_proj
+  - v_proj
+  - o_proj
+  - gate_proj
+  - up_proj
+  - down_proj
+
+lr: 3e-6
+steps: 200
+batch_size: 4
+num_generations: 4
+max_seq_len: 8192
+max_completion_len: 4096
+reward_mode: hybrid
+
+temperature: 1.0
+top_p: 0.95
+top_k: 64
+min_p: 0.1
+gpu_memory_utilization: 0.6
+
+warmup_steps: 10
+weight_decay: 0.01
+lr_scheduler: cosine
diff --git a/configs/unsloth/grpo_gemma4_e2b.yaml b/configs/unsloth/grpo_gemma4_e2b.yaml
new file mode 100644
index 0000000..0bda1f1
--- /dev/null
+++ b/configs/unsloth/grpo_gemma4_e2b.yaml
@@ -0,0 +1,44 @@
+# Unsloth GRPO — Google Gemma 4 E2B (5B, Dense+PLE)
+# RL training with the smallest Gemma 4 model.
+# Fast iteration for reward signal experimentation.
+# VRAM: ~8GB (4-bit with vLLM inference)
+#
+# Reasoning: Use <|think|> in system prompt for chain-of-thought.
+#   GRPO naturally preserves reasoning when reward signals value it.
+#   See: https://unsloth.ai/docs/models/gemma-4/train
+#
+# Usage:
+#   uv run python scripts/09_train_grpo.py --recipe configs/unsloth/grpo_gemma4_e2b.yaml
+
+model: google/gemma-4-E2B-it
+mode: grpo
+four_bit: true
+
+lora_rank: 16
+lora_alpha: 16
+lora_targets:
+  - q_proj
+  - k_proj
+  - v_proj
+  - o_proj
+  - gate_proj
+  - up_proj
+  - down_proj
+
+lr: 5e-6
+steps: 200
+batch_size: 4
+num_generations: 4
+max_seq_len: 8192
+max_completion_len: 4096
+reward_mode: hybrid
+
+temperature: 1.0
+top_p: 0.95
+top_k: 64
+min_p: 0.1
+gpu_memory_utilization: 0.6
+
+warmup_steps: 10
+weight_decay: 0.01
+lr_scheduler: cosine
diff --git a/configs/unsloth/grpo_gemma4_e4b.yaml b/configs/unsloth/grpo_gemma4_e4b.yaml
new file mode 100644
index 0000000..e4f8b70
--- /dev/null
+++ b/configs/unsloth/grpo_gemma4_e4b.yaml
@@ -0,0 +1,44 @@
+# Unsloth GRPO — Google Gemma 4 E4B (8B, Dense+PLE)
+# RL training with the recommended default Gemma 4 model.
+# Best balance of generation speed and model quality.
+# VRAM: ~12GB (4-bit with vLLM inference)
+#
+# Reasoning: Use <|think|> in system prompt for chain-of-thought.
+#   GRPO naturally preserves reasoning when reward signals value it.
+#   See: https://unsloth.ai/docs/models/gemma-4/train
+#
+# Usage:
+#   uv run python scripts/09_train_grpo.py --recipe configs/unsloth/grpo_gemma4_e4b.yaml
+
+model: google/gemma-4-E4B-it
+mode: grpo
+four_bit: true
+
+lora_rank: 16
+lora_alpha: 16
+lora_targets:
+  - q_proj
+  - k_proj
+  - v_proj
+  - o_proj
+  - gate_proj
+  - up_proj
+  - down_proj
+
+lr: 5e-6
+steps: 200
+batch_size: 4
+num_generations: 4
+max_seq_len: 8192
+max_completion_len: 4096
+reward_mode: hybrid
+
+temperature: 1.0
+top_p: 0.95
+top_k: 64
+min_p: 0.1
+gpu_memory_utilization: 0.6
+
+warmup_steps: 10
+weight_decay: 0.01
+lr_scheduler: cosine
diff --git a/configs/unsloth/hpo_gemma4_e4b.yaml b/configs/unsloth/hpo_gemma4_e4b.yaml
new file mode 100644
index 0000000..60869f9
--- /dev/null
+++ b/configs/unsloth/hpo_gemma4_e4b.yaml
@@ -0,0 +1,47 @@
+# Unsloth HPO — Google Gemma 4 E4B (8B, Dense+PLE)
+# Hyperparameter optimization via Optuna before full training.
+# Searches: rank, alpha, LR, weight_decay, warmup, scheduler, RSLoRA, grad_accum.
+# VRAM: ~5GB (4-bit) or ~16GB (16-bit LoRA) per trial
+#
+# Run HPO first, then use the best recipe for full SFT/GRPO.
+#   See: https://unsloth.ai/docs/models/gemma-4/train
+#
+# Usage:
+#   uv run python scripts/08b_hpo.py --recipe configs/unsloth/hpo_gemma4_e4b.yaml --n-trials 20
+#   uv run python scripts/08b_hpo.py --recipe configs/unsloth/hpo_gemma4_e4b.yaml --n-trials 10 --four-bit --pruning
+
+model: google/gemma-4-E4B-it
+mode: lora
+four_bit: false  # Enable for faster HPO trials on constrained VRAM
+
+# LoRA search space anchored at Unsloth's Gemma 4 defaults
+lora_rank: 16
+lora_alpha: 16
+lora_targets:
+  - q_proj
+  - k_proj
+  - v_proj
+  - o_proj
+  - gate_proj
+  - up_proj
+  - down_proj
+
+# Training — baseline params (Optuna will search around these)
+lr: 2e-4
+epochs: 1         # Short epochs for HPO trials
+batch_size: 1
+grad_accum: 4
+max_seq_len: 8192
+packing: false
+warmup_steps: 10
+weight_decay: 0.01
+lr_scheduler: cosine
+
+# Gemma 4 chat template
+train_on_completions: true
+instruction_part: "<start_of_turn>user\n"
+response_part: "<start_of_turn>model\n"
+
+# Eval for HPO loss tracking
+eval_split: 0.1
+eval_steps: 10
diff --git a/scripts/08_train_sft_unsloth.py b/scripts/08_train_sft_unsloth.py
index 5f3f348..ff43ecf 100644
--- a/scripts/08_train_sft_unsloth.py
+++ b/scripts/08_train_sft_unsloth.py
@@ -2,7 +2,8 @@
 """Step 8 (alt): Fine-tune a model using Unsloth.
 
 Local training — runs on whatever GPU(s) are available.
-Supports LoRA SFT (default), full SFT, DPO, and continued pretraining.
+Supports LoRA SFT (default), full SFT, DPO, continued pretraining,
+and vision/multimodal fine-tuning (FastVisionModel).
 ALL Unsloth capabilities are configurable via YAML recipe or CLI flags.
 
 Single GPU:
@@ -25,6 +26,10 @@
 Continued pretraining:
     python scripts/08_train_sft_unsloth.py --cpt --data-dir ./corpus/
 
+Vision/multimodal fine-tuning (Gemma 4, Llama 3.2 Vision, etc.):
+    python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_e4b_vision_lora.yaml
+    python scripts/08_train_sft_unsloth.py --vision --model google/gemma-4-E4B-it
+
 GGUF export:
     python scripts/08_train_sft_unsloth.py --recipe ... --save-gguf q4_k_m
 """
@@ -51,6 +56,15 @@
     "revision": None,         # Pin specific model revision from Hub (e.g. "main", commit hash)
     "resize_model_vocab": None,  # Resize vocab (int) — for adding custom special tokens
 
+    # ── Vision (multimodal) ──
+    "vision": False,          # Use FastVisionModel instead of FastLanguageModel
+    "finetune_vision_layers": True,   # Train vision encoder layers
+    "finetune_language_layers": True,  # Train language model layers
+    "finetune_attention_modules": True,  # Train attention modules
+    "finetune_mlp_modules": True,  # Train MLP modules
+    "vision_resize": None,    # Image resize: int (pixels), "min", "max", or None (auto)
+    "vision_snap_to_patch_size": True,  # Force images to match patch size
+
     # ── Quantization ──
     "four_bit": False,        # QLoRA 4-bit (minimal VRAM)
     "eight_bit": False,       # 8-bit quantization
@@ -155,6 +169,7 @@ def parse_args() -> argparse.Namespace:
     p.add_argument("--full", action="store_true", help="Full fine-tune instead of LoRA")
     p.add_argument("--dpo", action="store_true", help="DPO training (requires --sft-checkpoint)")
     p.add_argument("--cpt", action="store_true", help="Continued pretraining (domain adaptation)")
+    p.add_argument("--vision", action="store_true", help="Vision fine-tuning (uses FastVisionModel)")
     p.add_argument("--sft-checkpoint", default=None, help="Path to SFT checkpoint for DPO")
 
     # Quantization
@@ -241,6 +256,8 @@ def apply_overrides(recipe: dict, args: argparse.Namespace) -> dict:
         recipe["mode"] = "full"
     if args.cpt:
         recipe["mode"] = "cpt"
+    if args.vision:
+        recipe["vision"] = True
     if args.packing:
         recipe["packing"] = True
     if args.use_rslora:
@@ -303,9 +320,18 @@ def load_sft_data(data_dir: str) -> list[dict]:
     return examples
 
 
+def _get_model_class(recipe: dict):
+    """Pick Unsloth's FastVisionModel when recipe["vision"] is truthy, else FastLanguageModel."""
+    if not recipe.get("vision"):
+        from unsloth import FastLanguageModel as model_cls
+    else:
+        from unsloth import FastVisionModel as model_cls
+    return model_cls
+
+
 def _load_model(recipe: dict):
     """Load model with all recipe-configured options."""
-    from unsloth import FastLanguageModel
+    ModelClass = _get_model_class(recipe)
 
     model_name = recipe["model"]
     is_full = recipe["mode"] == "full"
@@ -345,24 +371,35 @@ def _load_model(recipe: dict):
     if recipe.get("unsloth_tiled_mlp"):
         load_kwargs["unsloth_tiled_mlp"] = True
 
-    model, tokenizer = FastLanguageModel.from_pretrained(**load_kwargs)
+    model, tokenizer = ModelClass.from_pretrained(**load_kwargs)
     return model, tokenizer
 
 
 def _apply_lora(model, recipe: dict):
     """Apply LoRA with all recipe-configured options."""
-    from unsloth import FastLanguageModel
+    ModelClass = _get_model_class(recipe)
 
     peft_kwargs = dict(
         r=recipe["lora_rank"],
         lora_alpha=recipe["lora_alpha"],
-        target_modules=recipe["lora_targets"],
         lora_dropout=recipe.get("lora_dropout", 0),
         bias=recipe.get("bias", "none"),
         use_gradient_checkpointing=recipe.get("gradient_checkpointing", "unsloth"),
         use_rslora=recipe.get("use_rslora", False),
         random_state=recipe.get("seed", 42),
     )
+
+    if recipe.get("vision"):
+        # Vision models use finetune_*_layers/modules instead of target_modules
+        peft_kwargs["finetune_vision_layers"] = recipe.get("finetune_vision_layers", True)
+        peft_kwargs["finetune_language_layers"] = recipe.get("finetune_language_layers", True)
+        peft_kwargs["finetune_attention_modules"] = recipe.get("finetune_attention_modules", True)
+        peft_kwargs["finetune_mlp_modules"] = recipe.get("finetune_mlp_modules", True)
+        # Vision models default to "all-linear" target_modules
+        peft_kwargs["target_modules"] = recipe.get("lora_targets", "all-linear")
+    else:
+        peft_kwargs["target_modules"] = recipe["lora_targets"]
+
     if recipe.get("loftq_config"):
         peft_kwargs["loftq_config"] = recipe["loftq_config"]
     if recipe.get("init_lora_weights") is not True and recipe.get("init_lora_weights") is not None:
@@ -374,7 +411,7 @@ def _apply_lora(model, recipe: dict):
     if recipe.get("qat_scheme"):
         peft_kwargs["qat_scheme"] = recipe["qat_scheme"]
 
-    return FastLanguageModel.get_peft_model(model, **peft_kwargs)
+    return ModelClass.get_peft_model(model, **peft_kwargs)
 
 
 def train_sft(recipe: dict, data_dir: str, output_dir: str, args: argparse.Namespace):
@@ -384,10 +421,11 @@ def train_sft(recipe: dict, data_dir: str, output_dir: str, args: argparse.Names
 
     model_name = recipe["model"]
     is_full = recipe["mode"] == "full"
+    is_vision = recipe.get("vision", False)
     max_seq_len = recipe["max_seq_len"]
     use_bf16 = is_bfloat16_supported()
 
-    print(f"\n=== Loading model: {model_name} ===")
+    print(f"\n=== Loading model: {model_name}{' (vision)' if is_vision else ''} ===")  # suffix carries its own space
     model, tokenizer = _load_model(recipe)
 
     if not is_full:
@@ -405,15 +443,24 @@ def train_sft(recipe: dict, data_dir: str, output_dir: str, args: argparse.Names
 
     from datasets import Dataset
 
-    def format_example(example):
-        messages = example.get("messages", [])
-        kwargs = dict(tokenize=False, add_generation_prompt=False)
-        if recipe.get("reasoning_effort"):
-            kwargs["reasoning_effort"] = recipe["reasoning_effort"]
-        text = tokenizer.apply_chat_template(messages, **kwargs)
-        return {"text": text}
+    if is_vision:
+        # Vision: keep each example's messages as-is (with image/audio content
+        # blocks) and drop every other key; no chat template is applied here.
+        # UnslothVisionDataCollator handles tokenization and image processing.
+        # Rows are built with a list comprehension instead of Dataset.map()
+        # for multi-image support.
+        converted = [{"messages": ex.get("messages", [])} for ex in examples]
+        dataset = Dataset.from_list(converted)
+    else:
+        def format_example(example):
+            messages = example.get("messages", [])
+            kwargs = dict(tokenize=False, add_generation_prompt=False)
+            if recipe.get("reasoning_effort"):
+                kwargs["reasoning_effort"] = recipe["reasoning_effort"]
+            text = tokenizer.apply_chat_template(messages, **kwargs)
+            return {"text": text}
 
-    dataset = Dataset.from_list(examples).map(format_example)
+        dataset = Dataset.from_list(examples).map(format_example)
 
     # Optional train/eval split
     eval_split = recipe.get("eval_split", 0.0)
@@ -453,12 +500,18 @@ def format_example(example):
         optim=recipe.get("optim", "adamw_8bit"),
         seed=recipe["seed"],
         max_seq_length=max_seq_len,
-        dataset_text_field="text",
-        packing=recipe["packing"],
         report_to="none" if recipe.get("no_wandb") else "wandb",
         run_name=f"swe-gym-sft-{'full' if is_full else 'lora'}-{Path(model_name).name}",
         **train_kwargs,
     )
+
+    # Vision mode: no packing, no text field (collator handles it)
+    if not is_vision:
+        sft_kwargs["dataset_text_field"] = "text"
+        sft_kwargs["packing"] = recipe["packing"]
+    else:
+        sft_kwargs["remove_unused_columns"] = False
+        sft_kwargs["dataset_kwargs"] = {"skip_prepare_dataset": True}
     if recipe.get("warmup_ratio"):
         sft_kwargs["warmup_ratio"] = recipe["warmup_ratio"]
     if eval_dataset:
@@ -473,7 +526,7 @@ def format_example(example):
 
     training_args = SFTConfig(**sft_kwargs)
 
-    trainer = SFTTrainer(
+    trainer_kwargs = dict(
         model=model,
         tokenizer=tokenizer,
         train_dataset=train_dataset,
@@ -481,8 +534,27 @@ def format_example(example):
         args=training_args,
     )
 
-    # Train on completions only — ~1% accuracy boost per QLoRA paper
-    if recipe.get("train_on_completions"):
+    # Vision: use UnslothVisionDataCollator for image/audio processing
+    if is_vision:
+        from unsloth import UnslothVisionDataCollator
+        collator_kwargs = {}  # model/processor are passed positionally below: the collator's 2nd parameter is `processor`, not `tokenizer`
+        if recipe.get("vision_resize") is not None:
+            collator_kwargs["resize"] = recipe["vision_resize"]
+        if recipe.get("vision_snap_to_patch_size") is not None:
+            collator_kwargs["snap_to_patch_size"] = recipe["vision_snap_to_patch_size"]
+        # train_on_completions via collator for vision
+        if recipe.get("train_on_completions"):
+            collator_kwargs["completion_only_loss"] = True
+            collator_kwargs["train_on_responses_only"] = True
+            collator_kwargs["instruction_part"] = recipe.get("instruction_part")
+            collator_kwargs["response_part"] = recipe.get("response_part")
+            print("  Training on completions only (via vision collator)")
+        trainer_kwargs["data_collator"] = UnslothVisionDataCollator(model, tokenizer, **collator_kwargs)
+
+    trainer = SFTTrainer(**trainer_kwargs)
+
+    # Train on completions only — ~1% accuracy boost per QLoRA paper (text mode)
+    if recipe.get("train_on_completions") and not is_vision:
         from unsloth.chat_templates import train_on_responses_only
         trainer = train_on_responses_only(
             trainer,
@@ -501,7 +573,8 @@ def format_example(example):
         print(f"  Early stopping: patience={patience}")
 
     mode_str = "full" if is_full else "LoRA"
-    print(f"\n=== Starting {mode_str} SFT ===")
+    vision_str = " (vision)" if is_vision else ""
+    print(f"\n=== Starting {mode_str} SFT{vision_str} ===")
     print(f"  Model:      {model_name}")
     print(f"  Epochs:     {recipe['epochs']}")
     print(f"  Batch size: {recipe['batch_size']} x {recipe['grad_accum']} grad accum")