Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 44 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@
smoke-test deploy-skills gen-trajectories prepare-sft train-sft \
train-sft-unsloth train-dpo-unsloth train-grpo train-grpo-gptoss-120b \
train-sft-unsloth-gptoss-120b train-sft-fp8 train-grpo-fp8 \
train-sft-nemotron-super train-grpo-nemotron-super hpo hpo-thorough clean
train-sft-nemotron-super train-grpo-nemotron-super hpo hpo-thorough clean \
train-sft-gemma4 train-sft-gemma4-4bit train-sft-gemma4-e2b \
train-sft-gemma4-31b train-sft-gemma4-moe train-sft-gemma4-vision \
train-sft-gemma4-vision-31b train-grpo-gemma4 train-grpo-gemma4-e2b \
train-grpo-gemma4-31b train-grpo-gemma4-moe train-dpo-gemma4 hpo-gemma4
# Keep the list above in sync with the rules below: every train-*/hpo-* target
# added to this Makefile should also appear here, because none of them creates
# a file named after the target.

# Load repo config (REPO_OWNER, REPO_NAME, REPO_KEY, etc.)
include repo.conf
Expand Down Expand Up @@ -158,6 +161,31 @@ train-sft-qwen3-coder:
# SFT with the Nemotron-3-Super LoRA recipe; passes --device-map balanced.
train-sft-nemotron-super:
	uv run python scripts/08_train_sft_unsloth.py \
		--recipe configs/unsloth/nemotron3_super_lora.yaml \
		--device-map balanced

# Gemma 4 family
# All SFT/DPO targets below drive scripts/08_train_sft_unsloth.py with a
# recipe from configs/unsloth/; only the recipe and extra flags differ.

# E4B LoRA recipe, no extra flags.
train-sft-gemma4:
	uv run python scripts/08_train_sft_unsloth.py \
		--recipe configs/unsloth/gemma4_e4b_lora.yaml

# Same E4B recipe, with --four-bit added on the command line.
train-sft-gemma4-4bit:
	uv run python scripts/08_train_sft_unsloth.py \
		--recipe configs/unsloth/gemma4_e4b_lora.yaml \
		--four-bit

# E2B LoRA recipe.
train-sft-gemma4-e2b:
	uv run python scripts/08_train_sft_unsloth.py \
		--recipe configs/unsloth/gemma4_e2b_lora.yaml

# 31B LoRA recipe, with --four-bit.
train-sft-gemma4-31b:
	uv run python scripts/08_train_sft_unsloth.py \
		--recipe configs/unsloth/gemma4_31b_lora.yaml \
		--four-bit

# MoE LoRA recipe, with --device-map balanced (no --four-bit).
train-sft-gemma4-moe:
	uv run python scripts/08_train_sft_unsloth.py \
		--recipe configs/unsloth/gemma4_27b_moe_lora.yaml \
		--device-map balanced

# E4B vision LoRA recipe.
train-sft-gemma4-vision:
	uv run python scripts/08_train_sft_unsloth.py \
		--recipe configs/unsloth/gemma4_e4b_vision_lora.yaml

# 31B vision LoRA recipe, with --four-bit.
train-sft-gemma4-vision-31b:
	uv run python scripts/08_train_sft_unsloth.py \
		--recipe configs/unsloth/gemma4_31b_vision_lora.yaml \
		--four-bit

# DPO on top of the SFT checkpoint in ./sft_output/.
train-dpo-gemma4:
	uv run python scripts/08_train_sft_unsloth.py \
		--dpo \
		--recipe configs/unsloth/dpo_gemma4_e4b.yaml \
		--sft-checkpoint ./sft_output/

# DPO and CPT
train-dpo-unsloth:
uv run python scripts/08_train_sft_unsloth.py --dpo --sft-checkpoint ./sft_output/
Expand Down Expand Up @@ -212,13 +240,28 @@ train-grpo-nemotron-super:
# GRPO with the GPT-OSS 120B recipe; passes --device-map balanced.
train-grpo-gptoss-120b:
	uv run python scripts/09_train_grpo.py \
		--recipe configs/unsloth/grpo_gpt_oss_120b.yaml \
		--device-map balanced

# Gemma 4 GRPO targets: each runs scripts/09_train_grpo.py with its own recipe.

# E4B GRPO recipe.
train-grpo-gemma4:
	uv run python scripts/09_train_grpo.py \
		--recipe configs/unsloth/grpo_gemma4_e4b.yaml

# E2B GRPO recipe.
train-grpo-gemma4-e2b:
	uv run python scripts/09_train_grpo.py \
		--recipe configs/unsloth/grpo_gemma4_e2b.yaml

# 31B GRPO recipe.
train-grpo-gemma4-31b:
	uv run python scripts/09_train_grpo.py \
		--recipe configs/unsloth/grpo_gemma4_31b.yaml

# MoE GRPO recipe, with --device-map balanced.
train-grpo-gemma4-moe:
	uv run python scripts/09_train_grpo.py \
		--recipe configs/unsloth/grpo_gemma4_27b_moe.yaml \
		--device-map balanced

# ── Hyperparameter optimization ───────────────────────────────
# All three targets run scripts/08b_hpo.py; they differ in recipe and flags.

# Qwen3-8B LoRA recipe in 4-bit, script-default trial settings.
hpo:
	uv run python scripts/08b_hpo.py \
		--recipe configs/unsloth/qwen3_8b_lora.yaml \
		--four-bit

# Same recipe, with 30 trials of 100 steps each.
hpo-thorough:
	uv run python scripts/08b_hpo.py \
		--recipe configs/unsloth/qwen3_8b_lora.yaml \
		--four-bit \
		--n-trials 30 \
		--steps-per-trial 100

# Gemma 4 E4B HPO recipe (no --four-bit on the command line).
hpo-gemma4:
	uv run python scripts/08b_hpo.py \
		--recipe configs/unsloth/hpo_gemma4_e4b.yaml

# ── FP8 training (RTX 40/50, H100+) ──────────────────────────
train-sft-fp8:
uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/qwen3_8b_lora.yaml --fp8
Expand Down
77 changes: 76 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ Runs SWE-agent on validated task instances to produce expert solve traces.
Evaluates which trajectories actually resolved their task, then converts successful ones to chat-format JSONL suitable for fine-tuning.

### Step 8: Train SFT (`08_train_sft_unsloth.py`)
Fine-tune a model on expert trajectories using [Unsloth](https://unsloth.ai). Supports LoRA, QLoRA (4-bit, runs on 3GB VRAM!), full fine-tune, DPO, and continued pretraining. Default model is Qwen3-8B. Uses YAML recipe configs in `configs/unsloth/`.
Fine-tune a model on expert trajectories using [Unsloth](https://unsloth.ai). Supports LoRA, QLoRA (4-bit, runs on 3GB VRAM!), full fine-tune, DPO, continued pretraining, and **vision/multimodal fine-tuning** (FastVisionModel). Default model is Qwen3-8B. Uses YAML recipe configs in `configs/unsloth/`.

```bash
make train-sft-unsloth # Qwen3-8B LoRA (default)
Expand All @@ -213,7 +213,12 @@ make train-sft-unsloth-gptoss # GPT-OSS 20B LoRA (long context)
make train-sft-nemotron # Nemotron-3-Nano 30B MoE (SOTA agentic)
make train-sft-qwen3-coder # Qwen3-Coder-Next 80B MoE (SOTA coding)
make train-sft-unsloth-full # Full fine-tune
make train-sft-gemma4 # Gemma 4 E4B LoRA (recommended Gemma default)
make train-sft-gemma4-31b # Gemma 4 31B LoRA (4-bit, A100/4090)
make train-sft-gemma4-moe # Gemma 4 26B-A4B MoE (multi-GPU)
make train-sft-gemma4-vision # Gemma 4 E4B vision fine-tuning
make train-dpo-unsloth # DPO (after SFT)
make train-dpo-gemma4 # DPO — Gemma 4 E4B (after SFT)
make train-cpt-unsloth # Continued pretraining (domain adaptation)
make train-recipe RECIPE=configs/unsloth/my_recipe.yaml # Custom recipe
```
Expand All @@ -232,6 +237,12 @@ make train-recipe RECIPE=configs/unsloth/my_recipe.yaml # Custom recipe
| `nemotron3_nano_lora.yaml` | Nemotron-3-Nano 30B | ~3.6B (MoE) | ~24GB | SOTA agentic, hybrid reasoning |
| `nemotron3_super_lora.yaml` | Nemotron-3-Super 120B | ~12B (MoE) | ~64-72GB | SOTA reasoning, 1M context |
| `qwen3_coder_next_lora.yaml` | Qwen3-Coder-Next 80B | ~3B (MoE) | ~46GB | **SOTA coding.** 70.6% SWE-Bench |
| `gemma4_e2b_lora.yaml` | Gemma 4 E2B | 5B | ~4GB | Smallest Gemma 4, multimodal capable |
| `gemma4_e4b_lora.yaml` | Gemma 4 E4B | 8B | ~5GB | **Recommended Gemma 4.** Text+image+audio |
| `gemma4_27b_moe_lora.yaml` | Gemma 4 26B-A4B | ~4B (MoE) | ~18GB (4-bit) / ~28GB (8-bit) | MoE — 4-bit not recommended; use 16-bit LoRA (needs more VRAM than listed) or 8-bit |
| `gemma4_31b_lora.yaml` | Gemma 4 31B | 31B | ~20GB | Largest dense Gemma 4, 256K context |
| `gemma4_e4b_vision_lora.yaml` | Gemma 4 E4B (vision) | 8B | ~8GB | FastVisionModel — image+audio fine-tuning |
| `gemma4_31b_vision_lora.yaml` | Gemma 4 31B (vision) | 31B | ~24GB | FastVisionModel — image fine-tuning |

**GRPO recipes:**

Expand All @@ -243,6 +254,10 @@ make train-recipe RECIPE=configs/unsloth/my_recipe.yaml # Custom recipe
| `grpo_qwen3_coder_next.yaml` | Qwen3-Coder-Next 80B | ~46GB | SOTA coding GRPO (GSPO) |
| `grpo_nemotron3_super.yaml` | Nemotron-3-Super 120B | ~64-72GB | SOTA reasoning GRPO, multi-GPU |
| `grpo_gpt_oss_120b.yaml` | GPT-OSS 120B | ~65GB | Multi-GPU required (4x A100) |
| `grpo_gemma4_e2b.yaml` | Gemma 4 E2B | ~8GB | Fast Gemma 4 GRPO iteration |
| `grpo_gemma4_e4b.yaml` | Gemma 4 E4B | ~12GB | Recommended Gemma 4 GRPO |
| `grpo_gemma4_27b_moe.yaml` | Gemma 4 26B-A4B (MoE) | ~24GB | MoE — use 16-bit, multi-GPU |
| `grpo_gemma4_31b.yaml` | Gemma 4 31B | ~24GB | Largest dense Gemma 4 |

Recipes are YAML files — copy one and customize for your needs. CLI args override recipe values.

Expand Down Expand Up @@ -291,6 +306,7 @@ python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/qwen3_coder_next
| Save MXFP4 | `--save-mxfp4` | 75% less disk space |
| Push to Hub | `--push-to-hub user/model` | Upload merged model to HuggingFace |
| 8-bit quantization | `--eight-bit` | Middle ground between 4-bit and 16-bit |
| Vision fine-tuning | `--vision` | FastVisionModel for image/audio (Gemma 4, etc.) |
| Continued pretraining | `--cpt --data-dir ./corpus/` | Domain adaptation before SFT |

> **Legacy:** `make train-sft` still works via [torchtune](https://github.com/meta-pytorch/torchtune) (configs in `configs/torchtune/`), but torchtune has [stopped active development](https://github.com/meta-pytorch/torchtune/issues/2883). [torchforge](https://github.com/meta-pytorch/torchforge) is its successor but only supports full SFT and GRPO currently (no LoRA/DPO).
Expand All @@ -306,6 +322,9 @@ make train-grpo-nemotron # Nemotron-3-Nano MoE
make train-grpo-qwen3-coder # Qwen3-Coder-Next (multi-GPU)
make train-grpo-nemotron-super # Nemotron-3-Super 120B (multi-GPU)
make train-grpo-gptoss-120b # GPT-OSS 120B (4x A100)
make train-grpo-gemma4 # Gemma 4 E4B (recommended Gemma GRPO)
make train-grpo-gemma4-31b # Gemma 4 31B
make train-grpo-gemma4-moe # Gemma 4 26B-A4B MoE (multi-GPU)
make train-grpo-fp8 # FP8 + vLLM standby (RTX 40/50, H100+)
make train-grpo-multigpu # DDP with 2 GPUs
```
Expand Down Expand Up @@ -412,6 +431,9 @@ Approximate VRAM needed per model size (minimums — actual usage may be higher)
| **Agentic tasks (small)** | Nemotron-3-Nano 30B | SOTA agentic, hybrid reasoning, MoE ~3.6B active |
| **General purpose (large)** | Qwen3-32B | Strong reasoning, fits on A100/4090 |
| **General purpose (small)** | Qwen3-8B | Native tool calling, good balance |
| **Multimodal (vision+audio)** | Gemma 4 E4B | Image+audio+text, 128K context, `<\|think\|>` reasoning |
| **Multimodal (large)** | Gemma 4 31B | Image+text, 256K context, dense |
| **Efficient MoE (small)** | Gemma 4 26B-A4B | Only 4B active params, 256K context |
| **Fast iteration / testing** | Qwen3-4B | Quick experiments, low VRAM |
| **Long context** | GPT-OSS 20B/120B | OpenAI's open models, 16K+ context |

Expand Down Expand Up @@ -499,6 +521,59 @@ CPT uses `UnslothTrainer` with separate embedding learning rates. It adds `lm_he
python scripts/08_train_sft_unsloth.py --model ./cpt_output/merged/ --recipe configs/unsloth/qwen3_8b_lora.yaml
```

## Vision / Multimodal Fine-Tuning

Fine-tune vision-language models (Gemma 4, Llama 3.2 Vision, etc.) on image+text or audio+text data using Unsloth's `FastVisionModel`:

```bash
make train-sft-gemma4-vision # Gemma 4 E4B vision (image+audio+text)
make train-sft-gemma4-vision-31b # Gemma 4 31B vision (image+text, 4-bit)

# Or with --vision flag on any vision-capable model:
python scripts/08_train_sft_unsloth.py --vision --model google/gemma-4-E4B-it
```

**Data format** — messages with content blocks (images/audio must precede text):
```json
{"messages": [
{"role": "user", "content": [
{"type": "image", "image": "path/to/image.jpg"},
{"type": "text", "text": "Describe this image."}
]},
{"role": "assistant", "content": [
{"type": "text", "text": "The image shows..."}
]}
]}
```

**Vision recipe fields** (in YAML):
| Field | Default | Notes |
|-------|---------|-------|
| `vision` | `false` | Enable FastVisionModel instead of FastLanguageModel |
| `finetune_vision_layers` | `true` | Train vision encoder (set `false` to save VRAM) |
| `finetune_language_layers` | `true` | Train language model layers |
| `finetune_attention_modules` | `true` | Train attention modules |
| `finetune_mlp_modules` | `true` | Train MLP modules |
| `vision_resize` | `null` | Image resize: int (pixels), `"min"`, `"max"`, or `null` (auto) |
| `lora_targets` | `"all-linear"` | Vision models typically use all-linear |

**Tips:**
- Use images of consistent dimensions (300-1000px) for efficient batching
- Start with `finetune_vision_layers: false` to save VRAM, enable after text works
- Audio clips should be short and task-specific (≤30s)
- No packing for vision — the `UnslothVisionDataCollator` handles batching

## Gemma 4 Reasoning Preservation

Gemma 4 models support chain-of-thought reasoning via the `<|think|>` token. To preserve this during fine-tuning:

1. **Activate thinking mode** — prepend `<|think|>` to your system prompt
2. **Training data mix** — maintain ≥75% reasoning examples (with thinking traces)
3. **Train on completions only** — the Gemma 4 SFT LoRA recipes (e.g. `gemma4_31b_lora.yaml`) set `train_on_completions: true` with Gemma's chat template tokens (`<start_of_turn>user\n` / `<start_of_turn>model\n`); the DPO recipe does not set it
4. **MoE variant (26B-A4B)** — do NOT use 4-bit quantization for MoE; use 16-bit LoRA with `--device-map balanced`

Reference: [Unsloth Gemma 4 Training Guide](https://unsloth.ai/docs/models/gemma-4/train)

## Project Structure

```
Expand Down
47 changes: 47 additions & 0 deletions configs/unsloth/dpo_gemma4_e4b.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Unsloth DPO — Google Gemma 4 E4B (8B, Dense+PLE)
# Direct Preference Optimization after SFT.
# Aligns model to prefer better solutions from preference pairs.
# VRAM: ~5GB (4-bit) or ~16GB (16-bit LoRA)
#
# Requires: SFT checkpoint + dpo_preferences.json in data dir.
#   Format: [{"prompt": "...", "chosen": "...", "rejected": "..."}, ...]
#
# Reasoning: DPO can reinforce reasoning quality by preferring
# solutions with clear chain-of-thought over direct answers.
# See: https://unsloth.ai/docs/models/gemma-4/train
#
# Usage:
#   uv run python scripts/08_train_sft_unsloth.py \
#     --recipe configs/unsloth/dpo_gemma4_e4b.yaml \
#     --dpo --sft-checkpoint ./sft_output/

model: google/gemma-4-E4B-it
mode: lora
four_bit: false   # 16-bit LoRA (~16GB per the header); set true for the ~5GB 4-bit path

# LoRA config
lora_rank: 16
lora_alpha: 16
lora_targets:
  - q_proj
  - k_proj
  - v_proj
  - o_proj
  - gate_proj
  - up_proj
  - down_proj

# Training
# Floats are written with an explicit decimal point (2.0e-4, not 2e-4):
# YAML 1.1 loaders such as PyYAML resolve "2e-4" as a string, not a number.
lr: 2.0e-4
epochs: 2
batch_size: 1
grad_accum: 4
max_seq_len: 8192
warmup_steps: 10
weight_decay: 0.01
lr_scheduler: cosine

# DPO-specific
# NOTE(review): both `lr` and `dpo_lr` are set in this recipe; presumably
# `dpo_lr` is the rate used when --dpo is passed — confirm against the script.
dpo_lr: 2.0e-5
dpo_beta: 0.05          # KL penalty — lower = more aggressive alignment
dpo_label_smoothing: 0.0
56 changes: 56 additions & 0 deletions configs/unsloth/gemma4_27b_moe_lora.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Unsloth SFT — Google Gemma 4 26B-A4B (MoE, 4B active)
# Mixture-of-Experts: 26B total params, only 4B active per token.
# 256K context. Supports text and image.
# VRAM: ~18GB (4-bit) or ~28GB (8-bit) — 16-bit recommended for MoE
#
# NOTE: this file is named gemma4_27b_moe_lora.yaml, but the model it loads
# is the 26B-A4B checkpoint (see `model:` below).
#
# IMPORTANT: 4-bit quantization is NOT recommended for MoE variants.
# Use 16-bit LoRA (default) or 8-bit. For multi-GPU: --device-map balanced
#
# Reasoning: Gemma 4 supports <|think|> reasoning via system prompt.
# Preserve reasoning by keeping >=75% reasoning examples in training mix.
# See: https://unsloth.ai/docs/models/gemma-4/train
#
# Usage (single GPU, 16-bit):
#   NOTE(review): this line previously read "needs ~28GB+ VRAM", which is the
#   8-bit figure quoted above; 16-bit weights for 26B params are roughly
#   twice that, so confirm the real requirement before relying on one GPU.
#   uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_27b_moe_lora.yaml
#
# Multi-GPU (model splitting):
#   uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_27b_moe_lora.yaml --device-map balanced

model: google/gemma-4-26B-A4B-it
mode: lora
# four_bit: false    # NOT recommended for MoE — use 16-bit or 8-bit
# eight_bit: true    # Uncomment for ~28GB VRAM (compromise; the 8-bit figure from the header)

# LoRA config — r=16 per Unsloth Gemma 4 recommendation
lora_rank: 16
lora_alpha: 16
lora_targets:
  - q_proj
  - k_proj
  - v_proj
  - o_proj
  - gate_proj
  - up_proj
  - down_proj

# Training — lower LR for MoE stability
# The float is written with an explicit decimal point (1.0e-4, not 1e-4):
# YAML 1.1 loaders such as PyYAML resolve "1e-4" as a string, not a number.
lr: 1.0e-4
epochs: 2
batch_size: 1
grad_accum: 8
max_seq_len: 8192
packing: false
warmup_steps: 10
weight_decay: 0.01
lr_scheduler: cosine

# Gemma 4 chat template for train_on_completions
train_on_completions: true
instruction_part: "<start_of_turn>user\n"
response_part: "<start_of_turn>model\n"

# MoE optimization: use grouped_mm backend for faster training (GRPO script).
# moe_backend: grouped_mm  # 12x faster MoE training (when supported)

# Reasoning: maintain >=75% reasoning examples in training data.
# Add "<|think|>" to system prompt to activate thinking mode.
52 changes: 52 additions & 0 deletions configs/unsloth/gemma4_31b_lora.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Unsloth SFT — Google Gemma 4 31B (Dense)
# Largest dense Gemma 4. Maximum quality for text and image tasks.
# 256K context. Best Gemma 4 for single-modality fine-tuning.
# VRAM: ~20GB (4-bit) or ~34GB (8-bit) or ~64GB (16-bit)
#
# Reasoning: Gemma 4 supports <|think|> reasoning via system prompt.
# Preserve reasoning by keeping >=75% reasoning examples in training mix.
# See: https://unsloth.ai/docs/models/gemma-4/train
#
# Usage:
#   uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_31b_lora.yaml --four-bit
#
# Multi-GPU (16-bit, model splitting):
#   NOTE(review): this recipe sets `four_bit: true` below, so the command as
#   written still loads in 4-bit; set `four_bit: false` (or override it) to
#   actually get the 16-bit run described here — confirm how the script
#   lets a recipe boolean be turned off.
#   uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_31b_lora.yaml --device-map balanced
#
# Vision fine-tuning (see gemma4_31b_vision_lora.yaml):
#   uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_31b_vision_lora.yaml --four-bit

model: google/gemma-4-31B-it
mode: lora
four_bit: true  # Fits on single 24GB GPU in 4-bit

# LoRA config — r=16 per Unsloth Gemma 4 recommendation
lora_rank: 16
lora_alpha: 16
lora_targets:
  - q_proj
  - k_proj
  - v_proj
  - o_proj
  - gate_proj
  - up_proj
  - down_proj

# Training — lower LR for larger model
# The float is written with an explicit decimal point (1.0e-4, not 1e-4):
# YAML 1.1 loaders such as PyYAML resolve "1e-4" as a string, not a number.
lr: 1.0e-4
epochs: 2
batch_size: 1
grad_accum: 16
max_seq_len: 8192
packing: false
warmup_steps: 10
weight_decay: 0.01
lr_scheduler: cosine

# Gemma 4 chat template for train_on_completions
train_on_completions: true
instruction_part: "<start_of_turn>user\n"
response_part: "<start_of_turn>model\n"

# Reasoning: maintain >=75% reasoning examples in training data.
# Add "<|think|>" to system prompt to activate thinking mode.
Loading