webcoderz · webcoderz · Apr 3, 2026 · Apr 3, 2026
diff --git a/Makefile b/Makefile
@@ -5,7 +5,10 @@
 	smoke-test deploy-skills gen-trajectories prepare-sft train-sft \
 	train-sft-unsloth train-dpo-unsloth train-grpo train-grpo-gptoss-120b \
 	train-sft-unsloth-gptoss-120b train-sft-fp8 train-grpo-fp8 \
-	train-sft-nemotron-super train-grpo-nemotron-super hpo hpo-thorough clean
+	train-sft-nemotron-super train-grpo-nemotron-super hpo hpo-thorough clean \
+	train-sft-gemma4 train-sft-gemma4-4bit train-sft-gemma4-e2b train-sft-gemma4-31b \
+	train-sft-gemma4-moe train-sft-gemma4-vision train-sft-gemma4-vision-31b train-dpo-gemma4 \
+	train-grpo-gemma4 train-grpo-gemma4-e2b train-grpo-gemma4-31b train-grpo-gemma4-moe hpo-gemma4
 
 # Load repo config (REPO_OWNER, REPO_NAME, REPO_KEY, etc.)
 include repo.conf
@@ -158,6 +161,31 @@ train-sft-qwen3-coder:
 train-sft-nemotron-super:
 	uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/nemotron3_super_lora.yaml --device-map balanced
 
+# Gemma 4 family — SFT (E4B, E2B, 31B, 26B-A4B MoE, vision) and DPO, all via scripts/08_train_sft_unsloth.py
+train-sft-gemma4:  # E4B LoRA recipe
+	uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_e4b_lora.yaml
+
+train-sft-gemma4-4bit:  # same E4B LoRA recipe, with --four-bit
+	uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_e4b_lora.yaml --four-bit
+
+train-sft-gemma4-e2b:  # E2B LoRA recipe
+	uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_e2b_lora.yaml
+
+train-sft-gemma4-31b:  # 31B LoRA recipe, with --four-bit
+	uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_31b_lora.yaml --four-bit
+
+train-sft-gemma4-moe:  # 26B-A4B MoE recipe (file is named 27b), with --device-map balanced
+	uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_27b_moe_lora.yaml --device-map balanced
+
+train-sft-gemma4-vision:  # E4B vision LoRA recipe
+	uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_e4b_vision_lora.yaml
+
+train-sft-gemma4-vision-31b:  # 31B vision LoRA recipe, with --four-bit
+	uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_31b_vision_lora.yaml --four-bit
+
+train-dpo-gemma4:  # DPO mode (--dpo) with the E4B DPO recipe; SFT checkpoint read from ./sft_output/
+	uv run python scripts/08_train_sft_unsloth.py --dpo --recipe configs/unsloth/dpo_gemma4_e4b.yaml --sft-checkpoint ./sft_output/
+
 # DPO and CPT
 train-dpo-unsloth:
 	uv run python scripts/08_train_sft_unsloth.py --dpo --sft-checkpoint ./sft_output/
@@ -212,13 +240,28 @@ train-grpo-nemotron-super:
 train-grpo-gptoss-120b:
 	uv run python scripts/09_train_grpo.py --recipe configs/unsloth/grpo_gpt_oss_120b.yaml --device-map balanced
 
+train-grpo-gemma4:  # GRPO (scripts/09_train_grpo.py) with the Gemma 4 E4B recipe
+	uv run python scripts/09_train_grpo.py --recipe configs/unsloth/grpo_gemma4_e4b.yaml
+
+train-grpo-gemma4-e2b:  # GRPO with the Gemma 4 E2B recipe
+	uv run python scripts/09_train_grpo.py --recipe configs/unsloth/grpo_gemma4_e2b.yaml
+
+train-grpo-gemma4-31b:  # GRPO with the Gemma 4 31B recipe
+	uv run python scripts/09_train_grpo.py --recipe configs/unsloth/grpo_gemma4_31b.yaml
+
+train-grpo-gemma4-moe:  # GRPO with the Gemma 4 MoE recipe (file is named 27b), with --device-map balanced
+	uv run python scripts/09_train_grpo.py --recipe configs/unsloth/grpo_gemma4_27b_moe.yaml --device-map balanced
+
+
 # ── Hyperparameter optimization ───────────────────────────────
 hpo:
 	uv run python scripts/08b_hpo.py --recipe configs/unsloth/qwen3_8b_lora.yaml --four-bit
 
 hpo-thorough:
 	uv run python scripts/08b_hpo.py --recipe configs/unsloth/qwen3_8b_lora.yaml --four-bit --n-trials 30 --steps-per-trial 100
 
+hpo-gemma4:  # HPO (scripts/08b_hpo.py) with the Gemma 4 E4B HPO recipe; no extra CLI flags passed
+	uv run python scripts/08b_hpo.py --recipe configs/unsloth/hpo_gemma4_e4b.yaml
+
 # ── FP8 training (RTX 40/50, H100+) ──────────────────────────
 train-sft-fp8:
 	uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/qwen3_8b_lora.yaml --fp8

diff --git a/README.md b/README.md
@@ -203,7 +203,7 @@ Runs SWE-agent on validated task instances to produce expert solve traces.
 Evaluates which trajectories actually resolved their task, then converts successful ones to chat-format JSONL suitable for fine-tuning.
 
 ### Step 8: Train SFT (`08_train_sft_unsloth.py`)
-Fine-tune a model on expert trajectories using [Unsloth](https://unsloth.ai). Supports LoRA, QLoRA (4-bit, runs on 3GB VRAM!), full fine-tune, DPO, and continued pretraining. Default model is Qwen3-8B. Uses YAML recipe configs in `configs/unsloth/`.
+Fine-tune a model on expert trajectories using [Unsloth](https://unsloth.ai). Supports LoRA, QLoRA (4-bit, runs on 3GB VRAM!), full fine-tune, DPO, continued pretraining, and **vision/multimodal fine-tuning** (FastVisionModel). Default model is Qwen3-8B. Uses YAML recipe configs in `configs/unsloth/`.
 
 ```bash
 make train-sft-unsloth           # Qwen3-8B LoRA (default)
@@ -213,7 +213,12 @@ make train-sft-unsloth-gptoss    # GPT-OSS 20B LoRA (long context)
 make train-sft-nemotron          # Nemotron-3-Nano 30B MoE (SOTA agentic)
 make train-sft-qwen3-coder       # Qwen3-Coder-Next 80B MoE (SOTA coding)
 make train-sft-unsloth-full      # Full fine-tune
+make train-sft-gemma4            # Gemma 4 E4B LoRA (recommended Gemma default)
+make train-sft-gemma4-31b        # Gemma 4 31B LoRA (4-bit, A100/4090)
+make train-sft-gemma4-moe        # Gemma 4 26B-A4B MoE (multi-GPU)
+make train-sft-gemma4-vision     # Gemma 4 E4B vision fine-tuning
 make train-dpo-unsloth           # DPO (after SFT)
+make train-dpo-gemma4            # DPO — Gemma 4 E4B (after SFT)
 make train-cpt-unsloth           # Continued pretraining (domain adaptation)
 make train-recipe RECIPE=configs/unsloth/my_recipe.yaml  # Custom recipe
 ```
@@ -232,6 +237,12 @@ make train-recipe RECIPE=configs/unsloth/my_recipe.yaml  # Custom recipe
 | `nemotron3_nano_lora.yaml` | Nemotron-3-Nano 30B | ~3.6B (MoE) | ~24GB | SOTA agentic, hybrid reasoning |
 | `nemotron3_super_lora.yaml` | Nemotron-3-Super 120B | ~12B (MoE) | ~64-72GB | SOTA reasoning, 1M context |
 | `qwen3_coder_next_lora.yaml` | Qwen3-Coder-Next 80B | ~3B (MoE) | ~46GB | **SOTA coding.** 70.6% SWE-Bench |
+| `gemma4_e2b_lora.yaml` | Gemma 4 E2B | 5B | ~4GB | Smallest Gemma 4, multimodal capable |
+| `gemma4_e4b_lora.yaml` | Gemma 4 E4B | 8B | ~5GB | **Recommended Gemma 4.** Text+image+audio |
+| `gemma4_27b_moe_lora.yaml` | Gemma 4 26B-A4B | ~4B (MoE) | ~18GB (4-bit) / ~28GB (8-bit) | MoE — 4-bit not recommended; use 16-bit LoRA or 8-bit |
+| `gemma4_31b_lora.yaml` | Gemma 4 31B | 31B | ~20GB | Largest dense Gemma 4, 256K context |
+| `gemma4_e4b_vision_lora.yaml` | Gemma 4 E4B (vision) | 8B | ~8GB | FastVisionModel — image+audio fine-tuning |
+| `gemma4_31b_vision_lora.yaml` | Gemma 4 31B (vision) | 31B | ~24GB | FastVisionModel — image fine-tuning |
 
 **GRPO recipes:**
 
@@ -243,6 +254,10 @@ make train-recipe RECIPE=configs/unsloth/my_recipe.yaml  # Custom recipe
 | `grpo_qwen3_coder_next.yaml` | Qwen3-Coder-Next 80B | ~46GB | SOTA coding GRPO (GSPO) |
 | `grpo_nemotron3_super.yaml` | Nemotron-3-Super 120B | ~64-72GB | SOTA reasoning GRPO, multi-GPU |
 | `grpo_gpt_oss_120b.yaml` | GPT-OSS 120B | ~65GB | Multi-GPU required (4x A100) |
+| `grpo_gemma4_e2b.yaml` | Gemma 4 E2B | ~8GB | Fast Gemma 4 GRPO iteration |
+| `grpo_gemma4_e4b.yaml` | Gemma 4 E4B | ~12GB | Recommended Gemma 4 GRPO |
+| `grpo_gemma4_27b_moe.yaml` | Gemma 4 26B-A4B (MoE) | ~24GB | MoE — use 16-bit, multi-GPU |
+| `grpo_gemma4_31b.yaml` | Gemma 4 31B | ~24GB | Largest dense Gemma 4 |
 
 Recipes are YAML files — copy one and customize for your needs. CLI args override recipe values.
 
@@ -291,6 +306,7 @@ python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/qwen3_coder_next
 | Save MXFP4 | `--save-mxfp4` | 75% less disk space |
 | Push to Hub | `--push-to-hub user/model` | Upload merged model to HuggingFace |
 | 8-bit quantization | `--eight-bit` | Middle ground between 4-bit and 16-bit |
+| Vision fine-tuning | `--vision` | FastVisionModel for image/audio (Gemma 4, etc.) |
 | Continued pretraining | `--cpt --data-dir ./corpus/` | Domain adaptation before SFT |
 
 > **Legacy:** `make train-sft` still works via [torchtune](https://github.com/meta-pytorch/torchtune) (configs in `configs/torchtune/`), but torchtune has [stopped active development](https://github.com/meta-pytorch/torchtune/issues/2883). [torchforge](https://github.com/meta-pytorch/torchforge) is its successor but only supports full SFT and GRPO currently (no LoRA/DPO).
@@ -306,6 +322,9 @@ make train-grpo-nemotron     # Nemotron-3-Nano MoE
 make train-grpo-qwen3-coder   # Qwen3-Coder-Next (multi-GPU)
 make train-grpo-nemotron-super # Nemotron-3-Super 120B (multi-GPU)
 make train-grpo-gptoss-120b   # GPT-OSS 120B (4x A100)
+make train-grpo-gemma4        # Gemma 4 E4B (recommended Gemma GRPO)
+make train-grpo-gemma4-31b    # Gemma 4 31B
+make train-grpo-gemma4-moe    # Gemma 4 26B-A4B MoE (multi-GPU)
 make train-grpo-fp8          # FP8 + vLLM standby (RTX 40/50, H100+)
 make train-grpo-multigpu     # DDP with 2 GPUs
 ```
@@ -412,6 +431,9 @@ Approximate VRAM needed per model size (minimums — actual usage may be higher)
 | **Agentic tasks (small)** | Nemotron-3-Nano 30B | SOTA agentic, hybrid reasoning, MoE ~3.6B active |
 | **General purpose (large)** | Qwen3-32B | Strong reasoning, fits on A100/4090 |
 | **General purpose (small)** | Qwen3-8B | Native tool calling, good balance |
+| **Multimodal (vision+audio)** | Gemma 4 E4B | Image+audio+text, 128K context, `<\|think\|>` reasoning |
+| **Multimodal (large)** | Gemma 4 31B | Image+text, 256K context, dense |
+| **Efficient MoE (small)** | Gemma 4 26B-A4B | Only 4B active params, 256K context |
 | **Fast iteration / testing** | Qwen3-4B | Quick experiments, low VRAM |
 | **Long context** | GPT-OSS 20B/120B | OpenAI's open models, 16K+ context |
 
@@ -499,6 +521,59 @@ CPT uses `UnslothTrainer` with separate embedding learning rates. It adds `lm_he
 python scripts/08_train_sft_unsloth.py --model ./cpt_output/merged/ --recipe configs/unsloth/qwen3_8b_lora.yaml
 ```
 
+## Vision / Multimodal Fine-Tuning
+
+Fine-tune vision-language models (Gemma 4, Llama 3.2 Vision, etc.) on image+text or audio+text data using Unsloth's `FastVisionModel`:
+
+```bash
+make train-sft-gemma4-vision          # Gemma 4 E4B vision (image+audio+text)
+make train-sft-gemma4-vision-31b      # Gemma 4 31B vision (image+text, 4-bit)
+
+# Or with --vision flag on any vision-capable model:
+python scripts/08_train_sft_unsloth.py --vision --model google/gemma-4-E4B-it
+```
+
+**Data format** — messages with content blocks (images/audio must precede text):
+```json
+{"messages": [
+  {"role": "user", "content": [
+    {"type": "image", "image": "path/to/image.jpg"},
+    {"type": "text", "text": "Describe this image."}
+  ]},
+  {"role": "assistant", "content": [
+    {"type": "text", "text": "The image shows..."}
+  ]}
+]}
+```
+
+**Vision recipe fields** (in YAML):
+| Field | Default | Notes |
+|-------|---------|-------|
+| `vision` | `false` | Enable FastVisionModel instead of FastLanguageModel |
+| `finetune_vision_layers` | `true` | Train vision encoder (set `false` to save VRAM) |
+| `finetune_language_layers` | `true` | Train language model layers |
+| `finetune_attention_modules` | `true` | Train attention modules |
+| `finetune_mlp_modules` | `true` | Train MLP modules |
+| `vision_resize` | `null` | Image resize: int (pixels), `"min"`, `"max"`, or `null` (auto) |
+| `lora_targets` | `"all-linear"` | Vision models typically use all-linear |
+
+**Tips:**
+- Use images of consistent dimensions (300-1000px) for efficient batching
+- Start with `finetune_vision_layers: false` to save VRAM, enable after text works
+- Audio clips should be short and task-specific (≤30s)
+- No packing for vision — the `UnslothVisionDataCollator` handles batching
+
+## Gemma 4 Reasoning Preservation
+
+Gemma 4 models support chain-of-thought reasoning via the `<|think|>` token. To preserve this during fine-tuning:
+
+1. **Activate thinking mode** — prepend `<|think|>` to your system prompt
+2. **Training data mix** — maintain ≥75% reasoning examples (with thinking traces)
+3. **Train on completions only** — all Gemma 4 recipes set `train_on_completions: true` with Gemma's chat template tokens (`<start_of_turn>user\n` / `<start_of_turn>model\n`)
+4. **MoE variant (26B-A4B)** — do NOT use 4-bit quantization for MoE; use 16-bit LoRA with `--device-map balanced`
+
+Reference: [Unsloth Gemma 4 Training Guide](https://unsloth.ai/docs/models/gemma-4/train)
+
 ## Project Structure
 
 ```

diff --git a/configs/unsloth/dpo_gemma4_e4b.yaml b/configs/unsloth/dpo_gemma4_e4b.yaml
@@ -0,0 +1,47 @@
+# Unsloth DPO — Google Gemma 4 E4B (8B, Dense+PLE)
+# Direct Preference Optimization after SFT.
+# Aligns model to prefer better solutions from preference pairs.
+# VRAM: ~5GB (4-bit) or ~16GB (16-bit LoRA)
+#
+# Requires: SFT checkpoint + dpo_preferences.json in data dir.
+# Format: [{"prompt": "...", "chosen": "...", "rejected": "..."}, ...]
+#
+# Reasoning: DPO can reinforce reasoning quality by preferring
+#   solutions with clear chain-of-thought over direct answers.
+#   See: https://unsloth.ai/docs/models/gemma-4/train
+#
+# Usage:
+#   uv run python scripts/08_train_sft_unsloth.py \
+#     --recipe configs/unsloth/dpo_gemma4_e4b.yaml \
+#     --dpo --sft-checkpoint ./sft_output/
+
+model: google/gemma-4-E4B-it
+mode: lora
+four_bit: false
+
+# LoRA config
+lora_rank: 16
+lora_alpha: 16
+lora_targets:
+  - q_proj
+  - k_proj
+  - v_proj
+  - o_proj
+  - gate_proj
+  - up_proj
+  - down_proj
+
+# Training
+lr: 2.0e-4  # dot in the mantissa so YAML 1.1 loaders (e.g. PyYAML) read a float, not the string "2e-4"
+epochs: 2
+batch_size: 1
+grad_accum: 4
+max_seq_len: 8192
+warmup_steps: 10
+weight_decay: 0.01
+lr_scheduler: cosine
+
+# DPO-specific
+dpo_lr: 2.0e-5  # written with a mantissa dot for the same float-parsing reason as lr
+dpo_beta: 0.05       # KL penalty — lower = more aggressive alignment
+dpo_label_smoothing: 0.0
diff --git a/configs/unsloth/gemma4_27b_moe_lora.yaml b/configs/unsloth/gemma4_27b_moe_lora.yaml
@@ -0,0 +1,56 @@
+# Unsloth SFT — Google Gemma 4 26B-A4B (MoE, 4B active)
+# Mixture-of-Experts: 26B total params, only 4B active per token.
+# 256K context. Supports text and image.
+# VRAM: ~18GB (4-bit) or ~28GB (8-bit) — 16-bit recommended for MoE
+#
+# IMPORTANT: 4-bit quantization is NOT recommended for MoE variants.
+#   Use 16-bit LoRA (default) or 8-bit. For multi-GPU: --device-map balanced
+#
+# Reasoning: Gemma 4 supports <|think|> reasoning via system prompt.
+#   Preserve reasoning by keeping >=75% reasoning examples in training mix.
+#   See: https://unsloth.ai/docs/models/gemma-4/train
+#
+# Usage (single GPU, 16-bit — needs more VRAM than the ~28GB 8-bit figure above):
+#   uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_27b_moe_lora.yaml
+#
+# Multi-GPU (model splitting):
+#   uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_27b_moe_lora.yaml --device-map balanced
+
+model: google/gemma-4-26B-A4B-it
+mode: lora
+# four_bit: false  # leave 4-bit off — it is NOT recommended for MoE; use 16-bit or 8-bit
+# eight_bit: true  # Uncomment for ~28GB VRAM (compromise)
+
+# LoRA config — r=16 per Unsloth Gemma 4 recommendation
+lora_rank: 16
+lora_alpha: 16
+lora_targets:
+  - q_proj
+  - k_proj
+  - v_proj
+  - o_proj
+  - gate_proj
+  - up_proj
+  - down_proj
+
+# Training — lower LR for MoE stability
+lr: 1.0e-4  # dot in the mantissa so YAML 1.1 loaders (e.g. PyYAML) read a float, not the string "1e-4"
+epochs: 2
+batch_size: 1
+grad_accum: 8
+max_seq_len: 8192
+packing: false
+warmup_steps: 10
+weight_decay: 0.01
+lr_scheduler: cosine
+
+# Gemma 4 chat template for train_on_completions
+train_on_completions: true
+instruction_part: "<start_of_turn>user\n"
+response_part: "<start_of_turn>model\n"
+
+# MoE optimization: use grouped_mm backend for faster training (GRPO script).
+# moe_backend: grouped_mm  # 12x faster MoE training (when supported)
+
+# Reasoning: maintain >=75% reasoning examples in training data.
+# Add "<|think|>" to system prompt to activate thinking mode.
diff --git a/configs/unsloth/gemma4_31b_lora.yaml b/configs/unsloth/gemma4_31b_lora.yaml
@@ -0,0 +1,52 @@
+# Unsloth SFT — Google Gemma 4 31B (Dense)
+# Largest dense Gemma 4. Maximum quality for text and image tasks.
+# 256K context. Best Gemma 4 for single-modality fine-tuning.
+# VRAM: ~20GB (4-bit) or ~34GB (8-bit) or ~64GB (16-bit)
+#
+# Reasoning: Gemma 4 supports <|think|> reasoning via system prompt.
+#   Preserve reasoning by keeping >=75% reasoning examples in training mix.
+#   See: https://unsloth.ai/docs/models/gemma-4/train
+#
+# Usage:
+#   uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_31b_lora.yaml --four-bit
+#
+# Multi-GPU (16-bit, model splitting) — first set four_bit: false below, since this recipe defaults to 4-bit:
+#   uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_31b_lora.yaml --device-map balanced
+#
+# Vision fine-tuning (see gemma4_31b_vision_lora.yaml):
+#   uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_31b_vision_lora.yaml --four-bit
+
+model: google/gemma-4-31B-it
+mode: lora
+four_bit: true  # Fits on single 24GB GPU in 4-bit
+
+# LoRA config — r=16 per Unsloth Gemma 4 recommendation
+lora_rank: 16
+lora_alpha: 16
+lora_targets:
+  - q_proj
+  - k_proj
+  - v_proj
+  - o_proj
+  - gate_proj
+  - up_proj
+  - down_proj
+
+# Training — lower LR for larger model
+lr: 1.0e-4  # dot in the mantissa so YAML 1.1 loaders (e.g. PyYAML) read a float, not the string "1e-4"
+epochs: 2
+batch_size: 1
+grad_accum: 16
+max_seq_len: 8192
+packing: false
+warmup_steps: 10
+weight_decay: 0.01
+lr_scheduler: cosine
+
+# Gemma 4 chat template for train_on_completions
+train_on_completions: true
+instruction_part: "<start_of_turn>user\n"
+response_part: "<start_of_turn>model\n"
+
+# Reasoning: maintain >=75% reasoning examples in training data.
+# Add "<|think|>" to system prompt to activate thinking mode.