diff --git a/Makefile b/Makefile index 6297977..082fd9d 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,10 @@ smoke-test deploy-skills gen-trajectories prepare-sft train-sft \ train-sft-unsloth train-dpo-unsloth train-grpo train-grpo-gptoss-120b \ train-sft-unsloth-gptoss-120b train-sft-fp8 train-grpo-fp8 \ - train-sft-nemotron-super train-grpo-nemotron-super hpo hpo-thorough clean + train-sft-nemotron-super train-grpo-nemotron-super hpo hpo-thorough clean \ + train-sft-gemma4 train-sft-gemma4-4bit train-sft-gemma4-e2b train-sft-gemma4-31b train-sft-gemma4-moe \ + train-sft-gemma4-vision train-sft-gemma4-vision-31b train-grpo-gemma4 train-grpo-gemma4-e2b train-grpo-gemma4-31b \ + train-grpo-gemma4-moe train-dpo-gemma4 hpo-gemma4 # Load repo config (REPO_OWNER, REPO_NAME, REPO_KEY, etc.) include repo.conf @@ -158,6 +161,31 @@ train-sft-qwen3-coder: train-sft-nemotron-super: uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/nemotron3_super_lora.yaml --device-map balanced +# Gemma 4 family +train-sft-gemma4: + uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_e4b_lora.yaml + +train-sft-gemma4-4bit: + uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_e4b_lora.yaml --four-bit + +train-sft-gemma4-e2b: + uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_e2b_lora.yaml + +train-sft-gemma4-31b: + uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_31b_lora.yaml --four-bit + +train-sft-gemma4-moe: + uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_27b_moe_lora.yaml --device-map balanced + +train-sft-gemma4-vision: + uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_e4b_vision_lora.yaml + +train-sft-gemma4-vision-31b: + uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_31b_vision_lora.yaml --four-bit + +train-dpo-gemma4: + uv run python scripts/08_train_sft_unsloth.py --dpo --recipe configs/unsloth/dpo_gemma4_e4b.yaml --sft-checkpoint ./sft_output/ + # DPO
and CPT train-dpo-unsloth: uv run python scripts/08_train_sft_unsloth.py --dpo --sft-checkpoint ./sft_output/ @@ -212,6 +240,18 @@ train-grpo-nemotron-super: train-grpo-gptoss-120b: uv run python scripts/09_train_grpo.py --recipe configs/unsloth/grpo_gpt_oss_120b.yaml --device-map balanced +train-grpo-gemma4: + uv run python scripts/09_train_grpo.py --recipe configs/unsloth/grpo_gemma4_e4b.yaml + +train-grpo-gemma4-e2b: + uv run python scripts/09_train_grpo.py --recipe configs/unsloth/grpo_gemma4_e2b.yaml + +train-grpo-gemma4-31b: + uv run python scripts/09_train_grpo.py --recipe configs/unsloth/grpo_gemma4_31b.yaml + +train-grpo-gemma4-moe: + uv run python scripts/09_train_grpo.py --recipe configs/unsloth/grpo_gemma4_27b_moe.yaml --device-map balanced + # ── Hyperparameter optimization ─────────────────────────────── hpo: uv run python scripts/08b_hpo.py --recipe configs/unsloth/qwen3_8b_lora.yaml --four-bit @@ -219,6 +259,9 @@ hpo: hpo-thorough: uv run python scripts/08b_hpo.py --recipe configs/unsloth/qwen3_8b_lora.yaml --four-bit --n-trials 30 --steps-per-trial 100 +hpo-gemma4: + uv run python scripts/08b_hpo.py --recipe configs/unsloth/hpo_gemma4_e4b.yaml + # ── FP8 training (RTX 40/50, H100+) ────────────────────────── train-sft-fp8: uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/qwen3_8b_lora.yaml --fp8 diff --git a/README.md b/README.md index ea11932..6ba4b47 100644 --- a/README.md +++ b/README.md @@ -203,7 +203,7 @@ Runs SWE-agent on validated task instances to produce expert solve traces. Evaluates which trajectories actually resolved their task, then converts successful ones to chat-format JSONL suitable for fine-tuning. ### Step 8: Train SFT (`08_train_sft_unsloth.py`) -Fine-tune a model on expert trajectories using [Unsloth](https://unsloth.ai). Supports LoRA, QLoRA (4-bit, runs on 3GB VRAM!), full fine-tune, DPO, and continued pretraining. Default model is Qwen3-8B. Uses YAML recipe configs in `configs/unsloth/`. 
+Fine-tune a model on expert trajectories using [Unsloth](https://unsloth.ai). Supports LoRA, QLoRA (4-bit, runs on 3GB VRAM!), full fine-tune, DPO, continued pretraining, and **vision/multimodal fine-tuning** (FastVisionModel). Default model is Qwen3-8B. Uses YAML recipe configs in `configs/unsloth/`. ```bash make train-sft-unsloth # Qwen3-8B LoRA (default) @@ -213,7 +213,12 @@ make train-sft-unsloth-gptoss # GPT-OSS 20B LoRA (long context) make train-sft-nemotron # Nemotron-3-Nano 30B MoE (SOTA agentic) make train-sft-qwen3-coder # Qwen3-Coder-Next 80B MoE (SOTA coding) make train-sft-unsloth-full # Full fine-tune +make train-sft-gemma4 # Gemma 4 E4B LoRA (recommended Gemma default) +make train-sft-gemma4-31b # Gemma 4 31B LoRA (4-bit, A100/4090) +make train-sft-gemma4-moe # Gemma 4 26B-A4B MoE (multi-GPU) +make train-sft-gemma4-vision # Gemma 4 E4B vision fine-tuning make train-dpo-unsloth # DPO (after SFT) +make train-dpo-gemma4 # DPO — Gemma 4 E4B (after SFT) make train-cpt-unsloth # Continued pretraining (domain adaptation) make train-recipe RECIPE=configs/unsloth/my_recipe.yaml # Custom recipe ``` @@ -232,6 +237,12 @@ make train-recipe RECIPE=configs/unsloth/my_recipe.yaml # Custom recipe | `nemotron3_nano_lora.yaml` | Nemotron-3-Nano 30B | ~3.6B (MoE) | ~24GB | SOTA agentic, hybrid reasoning | | `nemotron3_super_lora.yaml` | Nemotron-3-Super 120B | ~12B (MoE) | ~64-72GB | SOTA reasoning, 1M context | | `qwen3_coder_next_lora.yaml` | Qwen3-Coder-Next 80B | ~3B (MoE) | ~46GB | **SOTA coding.** 70.6% SWE-Bench | +| `gemma4_e2b_lora.yaml` | Gemma 4 E2B | 5B | ~4GB | Smallest Gemma 4, multimodal capable | +| `gemma4_e4b_lora.yaml` | Gemma 4 E4B | 8B | ~5GB | **Recommended Gemma 4.** Text+image+audio | +| `gemma4_27b_moe_lora.yaml` | Gemma 4 26B-A4B | ~4B (MoE) | ~18GB | MoE — 4-bit not recommended, use 16-bit | +| `gemma4_31b_lora.yaml` | Gemma 4 31B | 31B | ~20GB | Largest dense Gemma 4, 256K context | +| `gemma4_e4b_vision_lora.yaml` | Gemma 4 E4B (vision) | 
8B | ~8GB | FastVisionModel — image+audio fine-tuning | +| `gemma4_31b_vision_lora.yaml` | Gemma 4 31B (vision) | 31B | ~24GB | FastVisionModel — image fine-tuning | **GRPO recipes:** @@ -243,6 +254,10 @@ make train-recipe RECIPE=configs/unsloth/my_recipe.yaml # Custom recipe | `grpo_qwen3_coder_next.yaml` | Qwen3-Coder-Next 80B | ~46GB | SOTA coding GRPO (GSPO) | | `grpo_nemotron3_super.yaml` | Nemotron-3-Super 120B | ~64-72GB | SOTA reasoning GRPO, multi-GPU | | `grpo_gpt_oss_120b.yaml` | GPT-OSS 120B | ~65GB | Multi-GPU required (4x A100) | +| `grpo_gemma4_e2b.yaml` | Gemma 4 E2B | ~8GB | Fast Gemma 4 GRPO iteration | +| `grpo_gemma4_e4b.yaml` | Gemma 4 E4B | ~12GB | Recommended Gemma 4 GRPO | +| `grpo_gemma4_27b_moe.yaml` | Gemma 4 26B-A4B (MoE) | ~24GB | MoE — use 16-bit, multi-GPU | +| `grpo_gemma4_31b.yaml` | Gemma 4 31B | ~24GB | Largest dense Gemma 4 | Recipes are YAML files — copy one and customize for your needs. CLI args override recipe values. @@ -291,6 +306,7 @@ python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/qwen3_coder_next | Save MXFP4 | `--save-mxfp4` | 75% less disk space | | Push to Hub | `--push-to-hub user/model` | Upload merged model to HuggingFace | | 8-bit quantization | `--eight-bit` | Middle ground between 4-bit and 16-bit | +| Vision fine-tuning | `--vision` | FastVisionModel for image/audio (Gemma 4, etc.) | | Continued pretraining | `--cpt --data-dir ./corpus/` | Domain adaptation before SFT | > **Legacy:** `make train-sft` still works via [torchtune](https://github.com/meta-pytorch/torchtune) (configs in `configs/torchtune/`), but torchtune has [stopped active development](https://github.com/meta-pytorch/torchtune/issues/2883). [torchforge](https://github.com/meta-pytorch/torchforge) is its successor but only supports full SFT and GRPO currently (no LoRA/DPO). 
@@ -306,6 +322,9 @@ make train-grpo-nemotron # Nemotron-3-Nano MoE make train-grpo-qwen3-coder # Qwen3-Coder-Next (multi-GPU) make train-grpo-nemotron-super # Nemotron-3-Super 120B (multi-GPU) make train-grpo-gptoss-120b # GPT-OSS 120B (4x A100) +make train-grpo-gemma4 # Gemma 4 E4B (recommended Gemma GRPO) +make train-grpo-gemma4-31b # Gemma 4 31B +make train-grpo-gemma4-moe # Gemma 4 26B-A4B MoE (multi-GPU) make train-grpo-fp8 # FP8 + vLLM standby (RTX 40/50, H100+) make train-grpo-multigpu # DDP with 2 GPUs ``` @@ -412,6 +431,9 @@ Approximate VRAM needed per model size (minimums — actual usage may be higher) | **Agentic tasks (small)** | Nemotron-3-Nano 30B | SOTA agentic, hybrid reasoning, MoE ~3.6B active | | **General purpose (large)** | Qwen3-32B | Strong reasoning, fits on A100/4090 | | **General purpose (small)** | Qwen3-8B | Native tool calling, good balance | +| **Multimodal (vision+audio)** | Gemma 4 E4B | Image+audio+text, 128K context, `<\|think\|>` reasoning | +| **Multimodal (large)** | Gemma 4 31B | Image+text, 256K context, dense | +| **Efficient MoE (small)** | Gemma 4 26B-A4B | Only 4B active params, 256K context | | **Fast iteration / testing** | Qwen3-4B | Quick experiments, low VRAM | | **Long context** | GPT-OSS 20B/120B | OpenAI's open models, 16K+ context | @@ -499,6 +521,59 @@ CPT uses `UnslothTrainer` with separate embedding learning rates. It adds `lm_he python scripts/08_train_sft_unsloth.py --model ./cpt_output/merged/ --recipe configs/unsloth/qwen3_8b_lora.yaml ``` +## Vision / Multimodal Fine-Tuning + +Fine-tune vision-language models (Gemma 4, Llama 3.2 Vision, etc.) 
on image+text or audio+text data using Unsloth's `FastVisionModel`: + +```bash +make train-sft-gemma4-vision # Gemma 4 E4B vision (image+audio+text) +make train-sft-gemma4-vision-31b # Gemma 4 31B vision (image+text, 4-bit) + +# Or with --vision flag on any vision-capable model: +python scripts/08_train_sft_unsloth.py --vision --model google/gemma-4-E4B-it +``` + +**Data format** — messages with content blocks (images/audio must precede text): +```json +{"messages": [ + {"role": "user", "content": [ + {"type": "image", "image": "path/to/image.jpg"}, + {"type": "text", "text": "Describe this image."} + ]}, + {"role": "assistant", "content": [ + {"type": "text", "text": "The image shows..."} + ]} +]} +``` + +**Vision recipe fields** (in YAML): +| Field | Default | Notes | +|-------|---------|-------| +| `vision` | `false` | Enable FastVisionModel instead of FastLanguageModel | +| `finetune_vision_layers` | `true` | Train vision encoder (set `false` to save VRAM) | +| `finetune_language_layers` | `true` | Train language model layers | +| `finetune_attention_modules` | `true` | Train attention modules | +| `finetune_mlp_modules` | `true` | Train MLP modules | +| `vision_resize` | `null` | Image resize: int (pixels), `"min"`, `"max"`, or `null` (auto) | +| `lora_targets` | `"all-linear"` | Vision models typically use all-linear | + +**Tips:** +- Use images of consistent dimensions (300-1000px) for efficient batching +- Start with `finetune_vision_layers: false` to save VRAM, enable after text works +- Audio clips should be short and task-specific (≤30s) +- No packing for vision — the `UnslothVisionDataCollator` handles batching + +## Gemma 4 Reasoning Preservation + +Gemma 4 models support chain-of-thought reasoning via the `<|think|>` token. To preserve this during fine-tuning: + +1. **Activate thinking mode** — prepend `<|think|>` to your system prompt +2. **Training data mix** — maintain ≥75% reasoning examples (with thinking traces) +3. 
**Train on completions only** — all Gemma 4 recipes set `train_on_completions: true` with Gemma's chat template tokens (`user\n` / `model\n`) +4. **MoE variant (26B-A4B)** — do NOT use 4-bit quantization for MoE; use 16-bit LoRA with `--device-map balanced` + +Reference: [Unsloth Gemma 4 Training Guide](https://unsloth.ai/docs/models/gemma-4/train) + ## Project Structure ``` diff --git a/configs/unsloth/dpo_gemma4_e4b.yaml b/configs/unsloth/dpo_gemma4_e4b.yaml new file mode 100644 index 0000000..c2d87d7 --- /dev/null +++ b/configs/unsloth/dpo_gemma4_e4b.yaml @@ -0,0 +1,47 @@ +# Unsloth DPO — Google Gemma 4 E4B (8B, Dense+PLE) +# Direct Preference Optimization after SFT. +# Aligns model to prefer better solutions from preference pairs. +# VRAM: ~5GB (4-bit) or ~16GB (16-bit LoRA) +# +# Requires: SFT checkpoint + dpo_preferences.json in data dir. +# Format: [{"prompt": "...", "chosen": "...", "rejected": "..."}, ...] +# +# Reasoning: DPO can reinforce reasoning quality by preferring +# solutions with clear chain-of-thought over direct answers. 
+# See: https://unsloth.ai/docs/models/gemma-4/train +# +# Usage: +# uv run python scripts/08_train_sft_unsloth.py \ +# --recipe configs/unsloth/dpo_gemma4_e4b.yaml \ +# --dpo --sft-checkpoint ./sft_output/ + +model: google/gemma-4-E4B-it +mode: lora +four_bit: false + +# LoRA config +lora_rank: 16 +lora_alpha: 16 +lora_targets: + - q_proj + - k_proj + - v_proj + - o_proj + - gate_proj + - up_proj + - down_proj + +# Training +lr: 2e-4 +epochs: 2 +batch_size: 1 +grad_accum: 4 +max_seq_len: 8192 +warmup_steps: 10 +weight_decay: 0.01 +lr_scheduler: cosine + +# DPO-specific +dpo_lr: 2e-5 +dpo_beta: 0.05 # KL penalty — lower = more aggressive alignment +dpo_label_smoothing: 0.0 diff --git a/configs/unsloth/gemma4_27b_moe_lora.yaml b/configs/unsloth/gemma4_27b_moe_lora.yaml new file mode 100644 index 0000000..1556cca --- /dev/null +++ b/configs/unsloth/gemma4_27b_moe_lora.yaml @@ -0,0 +1,56 @@ +# Unsloth SFT — Google Gemma 4 26B-A4B (MoE, 4B active) +# Mixture-of-Experts: 26B total params, only 4B active per token. +# 256K context. Supports text and image. +# VRAM: ~18GB (4-bit) or ~28GB (8-bit) — 16-bit recommended for MoE +# +# IMPORTANT: 4-bit quantization is NOT recommended for MoE variants. +# Use 16-bit LoRA (default) or 8-bit. For multi-GPU: --device-map balanced +# +# Reasoning: Gemma 4 supports <|think|> reasoning via system prompt. +# Preserve reasoning by keeping >=75% reasoning examples in training mix. 
+# See: https://unsloth.ai/docs/models/gemma-4/train +# +# Usage (single GPU, 16-bit — needs ~28GB+ VRAM): +# uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_27b_moe_lora.yaml +# +# Multi-GPU (model splitting): +# uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_27b_moe_lora.yaml --device-map balanced + +model: google/gemma-4-26B-A4B-it +mode: lora +# four_bit: false # NOT recommended for MoE — use 16-bit or 8-bit +# eight_bit: true # Uncomment for ~18GB VRAM (compromise) + +# LoRA config — r=16 per Unsloth Gemma 4 recommendation +lora_rank: 16 +lora_alpha: 16 +lora_targets: + - q_proj + - k_proj + - v_proj + - o_proj + - gate_proj + - up_proj + - down_proj + +# Training — lower LR for MoE stability +lr: 1e-4 +epochs: 2 +batch_size: 1 +grad_accum: 8 +max_seq_len: 8192 +packing: false +warmup_steps: 10 +weight_decay: 0.01 +lr_scheduler: cosine + +# Gemma 4 chat template for train_on_completions +train_on_completions: true +instruction_part: "user\n" +response_part: "model\n" + +# MoE optimization: use grouped_mm backend for faster training (GRPO script). +# moe_backend: grouped_mm # 12x faster MoE training (when supported) + +# Reasoning: maintain >=75% reasoning examples in training data. +# Add "<|think|>" to system prompt to activate thinking mode. diff --git a/configs/unsloth/gemma4_31b_lora.yaml b/configs/unsloth/gemma4_31b_lora.yaml new file mode 100644 index 0000000..62fe141 --- /dev/null +++ b/configs/unsloth/gemma4_31b_lora.yaml @@ -0,0 +1,52 @@ +# Unsloth SFT — Google Gemma 4 31B (Dense) +# Largest dense Gemma 4. Maximum quality for text and image tasks. +# 256K context. Best Gemma 4 for single-modality fine-tuning. +# VRAM: ~20GB (4-bit) or ~34GB (8-bit) or ~64GB (16-bit) +# +# Reasoning: Gemma 4 supports <|think|> reasoning via system prompt. +# Preserve reasoning by keeping >=75% reasoning examples in training mix. 
+# See: https://unsloth.ai/docs/models/gemma-4/train +# +# Usage: +# uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_31b_lora.yaml --four-bit +# +# Multi-GPU (16-bit, model splitting): +# uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_31b_lora.yaml --device-map balanced +# +# Vision fine-tuning (see gemma4_31b_vision_lora.yaml): +# uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_31b_vision_lora.yaml --four-bit + +model: google/gemma-4-31B-it +mode: lora +four_bit: true # Fits on single 24GB GPU in 4-bit + +# LoRA config — r=16 per Unsloth Gemma 4 recommendation +lora_rank: 16 +lora_alpha: 16 +lora_targets: + - q_proj + - k_proj + - v_proj + - o_proj + - gate_proj + - up_proj + - down_proj + +# Training — lower LR for larger model +lr: 1e-4 +epochs: 2 +batch_size: 1 +grad_accum: 16 +max_seq_len: 8192 +packing: false +warmup_steps: 10 +weight_decay: 0.01 +lr_scheduler: cosine + +# Gemma 4 chat template for train_on_completions +train_on_completions: true +instruction_part: "user\n" +response_part: "model\n" + +# Reasoning: maintain >=75% reasoning examples in training data. +# Add "<|think|>" to system prompt to activate thinking mode. diff --git a/configs/unsloth/gemma4_31b_vision_lora.yaml b/configs/unsloth/gemma4_31b_vision_lora.yaml new file mode 100644 index 0000000..6f09aaf --- /dev/null +++ b/configs/unsloth/gemma4_31b_vision_lora.yaml @@ -0,0 +1,61 @@ +# Unsloth Vision SFT — Google Gemma 4 31B (Dense, Vision) +# Vision fine-tuning of the largest dense Gemma 4 using FastVisionModel. +# Supports image + text inputs. 256K context. +# VRAM: ~24GB (4-bit) or ~40GB (8-bit) or ~70GB (16-bit) +# +# Data format: messages with content blocks (not plain text). 
+# [{"role": "user", "content": [ +# {"type": "image", "image": "path/to/image.jpg"}, +# {"type": "text", "text": "Describe this image."} +# ]}, +# {"role": "assistant", "content": [ +# {"type": "text", "text": "The image shows..."} +# ]}] +# +# Images should be 300-1000px, same dimensions preferred for batching. +# +# Reasoning: Gemma 4 supports <|think|> reasoning via system prompt. +# See: https://unsloth.ai/docs/models/gemma-4/train +# +# Usage: +# uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_31b_vision_lora.yaml --four-bit +# +# Multi-GPU: +# uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_31b_vision_lora.yaml --device-map balanced + +model: google/gemma-4-31B-it +mode: lora +vision: true # Use FastVisionModel + UnslothVisionDataCollator +four_bit: true # Required for single-GPU (24GB) + +# LoRA config +lora_rank: 16 +lora_alpha: 16 +lora_targets: "all-linear" # Vision models use all-linear by default + +# Vision-specific: which components to fine-tune +# Tip: start with finetune_vision_layers: false to save VRAM, +# enable after text-only training works well.
+finetune_vision_layers: false # Set true to also train vision encoder +finetune_language_layers: true # Train language layers +finetune_attention_modules: true +finetune_mlp_modules: true + +# Image processing +# vision_resize: null # Auto; or int (pixels), "min", "max" +# vision_snap_to_patch_size: true + +# Training — lower LR for larger model +lr: 1e-4 +epochs: 2 +batch_size: 1 +grad_accum: 8 +max_seq_len: 4096 # Lower for vision (images consume many tokens) +warmup_steps: 10 +weight_decay: 0.01 +lr_scheduler: cosine + +# Vision collator handles train_on_completions +train_on_completions: true +instruction_part: "user\n" +response_part: "model\n" diff --git a/configs/unsloth/gemma4_e2b_lora.yaml b/configs/unsloth/gemma4_e2b_lora.yaml new file mode 100644 index 0000000..340bcdc --- /dev/null +++ b/configs/unsloth/gemma4_e2b_lora.yaml @@ -0,0 +1,47 @@ +# Unsloth SFT — Google Gemma 4 E2B (5B, Dense+PLE) +# Smallest Gemma 4 model. Supports text, image, and audio. +# 128K context. Excellent for lightweight fine-tuning. +# VRAM: ~4GB (4-bit) or ~15GB (16-bit LoRA) +# +# Reasoning: Gemma 4 supports <|think|> reasoning via system prompt. +# Preserve reasoning by keeping >=75% reasoning examples in training mix. 
+# See: https://unsloth.ai/docs/models/gemma-4/train +# +# Usage: +# uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_e2b_lora.yaml +# uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_e2b_lora.yaml --four-bit + +model: google/gemma-4-E2B-it +mode: lora +four_bit: false # Enable for ~4GB VRAM; 16-bit recommended for quality + +# LoRA config — r=16 per Unsloth Gemma 4 recommendation +lora_rank: 16 +lora_alpha: 16 +lora_targets: + - q_proj + - k_proj + - v_proj + - o_proj + - gate_proj + - up_proj + - down_proj + +# Training +lr: 2e-4 +epochs: 2 +batch_size: 1 +grad_accum: 4 +max_seq_len: 8192 +packing: false +warmup_steps: 10 +weight_decay: 0.01 +lr_scheduler: cosine + +# Gemma 4 chat template for train_on_completions +train_on_completions: true +instruction_part: "user\n" +response_part: "model\n" + +# Reasoning: maintain >=75% reasoning examples in training data. +# Add "<|think|>" to system prompt to activate thinking mode. diff --git a/configs/unsloth/gemma4_e4b_lora.yaml b/configs/unsloth/gemma4_e4b_lora.yaml new file mode 100644 index 0000000..7d16063 --- /dev/null +++ b/configs/unsloth/gemma4_e4b_lora.yaml @@ -0,0 +1,50 @@ +# Unsloth SFT — Google Gemma 4 E4B (8B, Dense+PLE) +# Best balance of quality and efficiency. Supports text, image, and audio. +# 128K context. Recommended default for Gemma 4 fine-tuning. +# VRAM: ~5GB (4-bit) or ~16GB (16-bit LoRA) +# +# Reasoning: Gemma 4 supports <|think|> reasoning via system prompt. +# Preserve reasoning by keeping >=75% reasoning examples in training mix. 
+# See: https://unsloth.ai/docs/models/gemma-4/train +# +# Usage: +# uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_e4b_lora.yaml +# uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_e4b_lora.yaml --four-bit +# +# Vision fine-tuning (see gemma4_e4b_vision_lora.yaml): +# uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_e4b_vision_lora.yaml + +model: google/gemma-4-E4B-it +mode: lora +four_bit: false # Enable for ~5GB VRAM; 16-bit recommended for quality + +# LoRA config — r=16 per Unsloth Gemma 4 recommendation +lora_rank: 16 +lora_alpha: 16 +lora_targets: + - q_proj + - k_proj + - v_proj + - o_proj + - gate_proj + - up_proj + - down_proj + +# Training +lr: 2e-4 +epochs: 2 +batch_size: 1 +grad_accum: 4 +max_seq_len: 8192 +packing: false +warmup_steps: 10 +weight_decay: 0.01 +lr_scheduler: cosine + +# Gemma 4 chat template for train_on_completions +train_on_completions: true +instruction_part: "user\n" +response_part: "model\n" + +# Reasoning: maintain >=75% reasoning examples in training data. +# Add "<|think|>" to system prompt to activate thinking mode. diff --git a/configs/unsloth/gemma4_e4b_vision_lora.yaml b/configs/unsloth/gemma4_e4b_vision_lora.yaml new file mode 100644 index 0000000..f40a0ee --- /dev/null +++ b/configs/unsloth/gemma4_e4b_vision_lora.yaml @@ -0,0 +1,60 @@ +# Unsloth Vision SFT — Google Gemma 4 E4B (8B, Dense+PLE, Multimodal) +# Vision + audio fine-tuning using FastVisionModel. +# Supports image, audio, and text inputs. 128K context. +# VRAM: ~8GB (4-bit) or ~18GB (16-bit LoRA) +# +# Data format: messages with content blocks (not plain text). +# [{"role": "user", "content": [ +# {"type": "image", "image": "path/to/image.jpg"}, +# {"type": "text", "text": "Describe this image."} +# ]}, +# {"role": "assistant", "content": [ +# {"type": "text", "text": "The image shows..."} +# ]}] +# +# For audio, use {"type": "audio", "audio": "path/to/audio.wav"} before text.
+# Images should be 300-1000px, same dimensions preferred for batching. +# +# Reasoning: Gemma 4 supports <|think|> reasoning via system prompt. +# See: https://unsloth.ai/docs/models/gemma-4/train +# +# Usage: +# uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_e4b_vision_lora.yaml +# uv run python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_e4b_vision_lora.yaml --four-bit + +model: google/gemma-4-E4B-it +mode: lora +vision: true # Use FastVisionModel + UnslothVisionDataCollator +four_bit: false + +# LoRA config +lora_rank: 16 +lora_alpha: 16 +lora_targets: "all-linear" # Vision models use all-linear by default + +# Vision-specific: which components to fine-tune +# Tip: start with finetune_vision_layers: false to save VRAM, +# enable after text-only training works well. +finetune_vision_layers: false # Set true to also train vision encoder +finetune_language_layers: true # Train language layers +finetune_attention_modules: true +finetune_mlp_modules: true + +# Image processing +# vision_resize: null # Auto; or int (pixels), "min", "max" +# vision_snap_to_patch_size: true + +# Training +lr: 2e-4 +epochs: 2 +batch_size: 1 +grad_accum: 4 +max_seq_len: 4096 # Lower for vision (images consume many tokens) +warmup_steps: 10 +weight_decay: 0.01 +lr_scheduler: cosine + +# Vision collator handles train_on_completions +train_on_completions: true +instruction_part: "user\n" +response_part: "model\n" diff --git a/configs/unsloth/grpo_gemma4_27b_moe.yaml b/configs/unsloth/grpo_gemma4_27b_moe.yaml new file mode 100644 index 0000000..1982c78 --- /dev/null +++ b/configs/unsloth/grpo_gemma4_27b_moe.yaml @@ -0,0 +1,49 @@ +# Unsloth GRPO — Google Gemma 4 26B-A4B (MoE, 4B active) +# RL training with Gemma 4 MoE — efficient inference (only 4B active). +# VRAM: ~24GB (16-bit with vLLM) — 4-bit NOT recommended for MoE +# +# IMPORTANT: 4-bit quantization is NOT recommended for MoE variants. +# Use 16-bit (default) or 8-bit. 
Multi-GPU: --device-map balanced +# +# Reasoning: Use <|think|> in system prompt for chain-of-thought. +# GRPO naturally preserves reasoning when reward signals value it. +# See: https://unsloth.ai/docs/models/gemma-4/train +# +# Usage: +# uv run python scripts/09_train_grpo.py --recipe configs/unsloth/grpo_gemma4_27b_moe.yaml + +model: google/gemma-4-26B-A4B-it +mode: grpo +# four_bit: false # NOT recommended for MoE — use 16-bit + +lora_rank: 16 +lora_alpha: 16 +lora_targets: + - q_proj + - k_proj + - v_proj + - o_proj + - gate_proj + - up_proj + - down_proj + +lr: 3e-6 +steps: 200 +batch_size: 4 +num_generations: 4 +max_seq_len: 8192 +max_completion_len: 4096 +reward_mode: hybrid + +temperature: 1.0 +top_p: 0.95 +top_k: 64 +min_p: 0.1 +gpu_memory_utilization: 0.5 # Lower for MoE memory overhead + +warmup_steps: 10 +weight_decay: 0.01 +lr_scheduler: cosine + +# MoE optimization — 12x faster MoE training +# moe_backend: grouped_mm # Uncomment if supported by your hardware diff --git a/configs/unsloth/grpo_gemma4_31b.yaml b/configs/unsloth/grpo_gemma4_31b.yaml new file mode 100644 index 0000000..552040b --- /dev/null +++ b/configs/unsloth/grpo_gemma4_31b.yaml @@ -0,0 +1,47 @@ +# Unsloth GRPO — Google Gemma 4 31B (Dense) +# RL training with the largest dense Gemma 4 model. +# Maximum quality but requires significant VRAM. +# VRAM: ~24GB (4-bit with vLLM) or ~40GB (8-bit) +# +# Reasoning: Use <|think|> in system prompt for chain-of-thought. +# GRPO naturally preserves reasoning when reward signals value it. 
+# See: https://unsloth.ai/docs/models/gemma-4/train +# +# Usage: +# uv run python scripts/09_train_grpo.py --recipe configs/unsloth/grpo_gemma4_31b.yaml +# +# Multi-GPU: +# uv run python scripts/09_train_grpo.py --recipe configs/unsloth/grpo_gemma4_31b.yaml --device-map balanced + +model: google/gemma-4-31B-it +mode: grpo +four_bit: true # Required for single-GPU + +lora_rank: 16 +lora_alpha: 16 +lora_targets: + - q_proj + - k_proj + - v_proj + - o_proj + - gate_proj + - up_proj + - down_proj + +lr: 3e-6 +steps: 200 +batch_size: 4 +num_generations: 4 +max_seq_len: 8192 +max_completion_len: 4096 +reward_mode: hybrid + +temperature: 1.0 +top_p: 0.95 +top_k: 64 +min_p: 0.1 +gpu_memory_utilization: 0.6 + +warmup_steps: 10 +weight_decay: 0.01 +lr_scheduler: cosine diff --git a/configs/unsloth/grpo_gemma4_e2b.yaml b/configs/unsloth/grpo_gemma4_e2b.yaml new file mode 100644 index 0000000..0bda1f1 --- /dev/null +++ b/configs/unsloth/grpo_gemma4_e2b.yaml @@ -0,0 +1,44 @@ +# Unsloth GRPO — Google Gemma 4 E2B (5B, Dense+PLE) +# RL training with the smallest Gemma 4 model. +# Fast iteration for reward signal experimentation. +# VRAM: ~8GB (4-bit with vLLM inference) +# +# Reasoning: Use <|think|> in system prompt for chain-of-thought. +# GRPO naturally preserves reasoning when reward signals value it. 
+# See: https://unsloth.ai/docs/models/gemma-4/train +# +# Usage: +# uv run python scripts/09_train_grpo.py --recipe configs/unsloth/grpo_gemma4_e2b.yaml + +model: google/gemma-4-E2B-it +mode: grpo +four_bit: true + +lora_rank: 16 +lora_alpha: 16 +lora_targets: + - q_proj + - k_proj + - v_proj + - o_proj + - gate_proj + - up_proj + - down_proj + +lr: 5e-6 +steps: 200 +batch_size: 4 +num_generations: 4 +max_seq_len: 8192 +max_completion_len: 4096 +reward_mode: hybrid + +temperature: 1.0 +top_p: 0.95 +top_k: 64 +min_p: 0.1 +gpu_memory_utilization: 0.6 + +warmup_steps: 10 +weight_decay: 0.01 +lr_scheduler: cosine diff --git a/configs/unsloth/grpo_gemma4_e4b.yaml b/configs/unsloth/grpo_gemma4_e4b.yaml new file mode 100644 index 0000000..e4f8b70 --- /dev/null +++ b/configs/unsloth/grpo_gemma4_e4b.yaml @@ -0,0 +1,44 @@ +# Unsloth GRPO — Google Gemma 4 E4B (8B, Dense+PLE) +# RL training with the recommended default Gemma 4 model. +# Best balance of generation speed and model quality. +# VRAM: ~12GB (4-bit with vLLM inference) +# +# Reasoning: Use <|think|> in system prompt for chain-of-thought. +# GRPO naturally preserves reasoning when reward signals value it. 
+# See: https://unsloth.ai/docs/models/gemma-4/train +# +# Usage: +# uv run python scripts/09_train_grpo.py --recipe configs/unsloth/grpo_gemma4_e4b.yaml + +model: google/gemma-4-E4B-it +mode: grpo +four_bit: true + +lora_rank: 16 +lora_alpha: 16 +lora_targets: + - q_proj + - k_proj + - v_proj + - o_proj + - gate_proj + - up_proj + - down_proj + +lr: 5e-6 +steps: 200 +batch_size: 4 +num_generations: 4 +max_seq_len: 8192 +max_completion_len: 4096 +reward_mode: hybrid + +temperature: 1.0 +top_p: 0.95 +top_k: 64 +min_p: 0.1 +gpu_memory_utilization: 0.6 + +warmup_steps: 10 +weight_decay: 0.01 +lr_scheduler: cosine diff --git a/configs/unsloth/hpo_gemma4_e4b.yaml b/configs/unsloth/hpo_gemma4_e4b.yaml new file mode 100644 index 0000000..60869f9 --- /dev/null +++ b/configs/unsloth/hpo_gemma4_e4b.yaml @@ -0,0 +1,47 @@ +# Unsloth HPO — Google Gemma 4 E4B (8B, Dense+PLE) +# Hyperparameter optimization via Optuna before full training. +# Searches: rank, alpha, LR, weight_decay, warmup, scheduler, RSLoRA, grad_accum. +# VRAM: ~5GB (4-bit) or ~16GB (16-bit LoRA) per trial +# +# Run HPO first, then use the best recipe for full SFT/GRPO. 
+# See: https://unsloth.ai/docs/models/gemma-4/train +# +# Usage: +# uv run python scripts/08b_hpo.py --recipe configs/unsloth/hpo_gemma4_e4b.yaml --n-trials 20 +# uv run python scripts/08b_hpo.py --recipe configs/unsloth/hpo_gemma4_e4b.yaml --n-trials 10 --four-bit --pruning + +model: google/gemma-4-E4B-it +mode: lora +four_bit: false # Enable for faster HPO trials on constrained VRAM + +# LoRA search space anchored at Unsloth's Gemma 4 defaults +lora_rank: 16 +lora_alpha: 16 +lora_targets: + - q_proj + - k_proj + - v_proj + - o_proj + - gate_proj + - up_proj + - down_proj + +# Training — baseline params (Optuna will search around these) +lr: 2e-4 +epochs: 1 # Short epochs for HPO trials +batch_size: 1 +grad_accum: 4 +max_seq_len: 8192 +packing: false +warmup_steps: 10 +weight_decay: 0.01 +lr_scheduler: cosine + +# Gemma 4 chat template +train_on_completions: true +instruction_part: "user\n" +response_part: "model\n" + +# Eval for HPO loss tracking +eval_split: 0.1 +eval_steps: 10 diff --git a/scripts/08_train_sft_unsloth.py b/scripts/08_train_sft_unsloth.py index 5f3f348..ff43ecf 100644 --- a/scripts/08_train_sft_unsloth.py +++ b/scripts/08_train_sft_unsloth.py @@ -2,7 +2,8 @@ """Step 8 (alt): Fine-tune a model using Unsloth. Local training — runs on whatever GPU(s) are available. -Supports LoRA SFT (default), full SFT, DPO, and continued pretraining. +Supports LoRA SFT (default), full SFT, DPO, continued pretraining, +and vision/multimodal fine-tuning (FastVisionModel). ALL Unsloth capabilities are configurable via YAML recipe or CLI flags. Single GPU: @@ -25,6 +26,10 @@ Continued pretraining: python scripts/08_train_sft_unsloth.py --cpt --data-dir ./corpus/ +Vision/multimodal fine-tuning (Gemma 4, Llama 3.2 Vision, etc.): + python scripts/08_train_sft_unsloth.py --recipe configs/unsloth/gemma4_e4b_vision_lora.yaml + python scripts/08_train_sft_unsloth.py --vision --model google/gemma-4-E4B-it + GGUF export: python scripts/08_train_sft_unsloth.py --recipe ... 
--save-gguf q4_k_m """ @@ -51,6 +56,15 @@ "revision": None, # Pin specific model revision from Hub (e.g. "main", commit hash) "resize_model_vocab": None, # Resize vocab (int) — for adding custom special tokens + # ── Vision (multimodal) ── + "vision": False, # Use FastVisionModel instead of FastLanguageModel + "finetune_vision_layers": True, # Train vision encoder layers + "finetune_language_layers": True, # Train language model layers + "finetune_attention_modules": True, # Train attention modules + "finetune_mlp_modules": True, # Train MLP modules + "vision_resize": None, # Image resize: int (pixels), "min", "max", or None (auto) + "vision_snap_to_patch_size": True, # Force images to match patch size + # ── Quantization ── "four_bit": False, # QLoRA 4-bit (minimal VRAM) "eight_bit": False, # 8-bit quantization @@ -155,6 +169,7 @@ def parse_args() -> argparse.Namespace: p.add_argument("--full", action="store_true", help="Full fine-tune instead of LoRA") p.add_argument("--dpo", action="store_true", help="DPO training (requires --sft-checkpoint)") p.add_argument("--cpt", action="store_true", help="Continued pretraining (domain adaptation)") + p.add_argument("--vision", action="store_true", help="Vision fine-tuning (uses FastVisionModel)") p.add_argument("--sft-checkpoint", default=None, help="Path to SFT checkpoint for DPO") # Quantization @@ -241,6 +256,8 @@ def apply_overrides(recipe: dict, args: argparse.Namespace) -> dict: recipe["mode"] = "full" if args.cpt: recipe["mode"] = "cpt" + if args.vision: + recipe["vision"] = True if args.packing: recipe["packing"] = True if args.use_rslora: @@ -303,9 +320,18 @@ def load_sft_data(data_dir: str) -> list[dict]: return examples +def _get_model_class(recipe: dict): + """Return the appropriate Unsloth model class (language or vision).""" + if recipe.get("vision"): + from unsloth import FastVisionModel + return FastVisionModel + from unsloth import FastLanguageModel + return FastLanguageModel + + def _load_model(recipe: 
dict): """Load model with all recipe-configured options.""" - from unsloth import FastLanguageModel + ModelClass = _get_model_class(recipe) model_name = recipe["model"] is_full = recipe["mode"] == "full" @@ -345,24 +371,35 @@ def _load_model(recipe: dict): if recipe.get("unsloth_tiled_mlp"): load_kwargs["unsloth_tiled_mlp"] = True - model, tokenizer = FastLanguageModel.from_pretrained(**load_kwargs) + model, tokenizer = ModelClass.from_pretrained(**load_kwargs) return model, tokenizer def _apply_lora(model, recipe: dict): """Apply LoRA with all recipe-configured options.""" - from unsloth import FastLanguageModel + ModelClass = _get_model_class(recipe) peft_kwargs = dict( r=recipe["lora_rank"], lora_alpha=recipe["lora_alpha"], - target_modules=recipe["lora_targets"], lora_dropout=recipe.get("lora_dropout", 0), bias=recipe.get("bias", "none"), use_gradient_checkpointing=recipe.get("gradient_checkpointing", "unsloth"), use_rslora=recipe.get("use_rslora", False), random_state=recipe.get("seed", 42), ) + + if recipe.get("vision"): + # Vision models use finetune_*_layers/modules instead of target_modules + peft_kwargs["finetune_vision_layers"] = recipe.get("finetune_vision_layers", True) + peft_kwargs["finetune_language_layers"] = recipe.get("finetune_language_layers", True) + peft_kwargs["finetune_attention_modules"] = recipe.get("finetune_attention_modules", True) + peft_kwargs["finetune_mlp_modules"] = recipe.get("finetune_mlp_modules", True) + # Vision models default to "all-linear" target_modules + peft_kwargs["target_modules"] = recipe.get("lora_targets", "all-linear") + else: + peft_kwargs["target_modules"] = recipe["lora_targets"] + if recipe.get("loftq_config"): peft_kwargs["loftq_config"] = recipe["loftq_config"] if recipe.get("init_lora_weights") is not True and recipe.get("init_lora_weights") is not None: @@ -374,7 +411,7 @@ def _apply_lora(model, recipe: dict): if recipe.get("qat_scheme"): peft_kwargs["qat_scheme"] = recipe["qat_scheme"] - return 
FastLanguageModel.get_peft_model(model, **peft_kwargs) + return ModelClass.get_peft_model(model, **peft_kwargs) def train_sft(recipe: dict, data_dir: str, output_dir: str, args: argparse.Namespace): @@ -384,10 +421,11 @@ def train_sft(recipe: dict, data_dir: str, output_dir: str, args: argparse.Names model_name = recipe["model"] is_full = recipe["mode"] == "full" + is_vision = recipe.get("vision", False) max_seq_len = recipe["max_seq_len"] use_bf16 = is_bfloat16_supported() - print(f"\n=== Loading model: {model_name} ===") + print(f"\n=== Loading model: {model_name} {'(vision)' if is_vision else ''} ===") model, tokenizer = _load_model(recipe) if not is_full: @@ -405,15 +443,24 @@ def train_sft(recipe: dict, data_dir: str, output_dir: str, args: argparse.Names from datasets import Dataset - def format_example(example): - messages = example.get("messages", []) - kwargs = dict(tokenize=False, add_generation_prompt=False) - if recipe.get("reasoning_effort"): - kwargs["reasoning_effort"] = recipe["reasoning_effort"] - text = tokenizer.apply_chat_template(messages, **kwargs) - return {"text": text} + if is_vision: + # Vision: keep messages as-is (with image/audio content blocks). + # UnslothVisionDataCollator handles tokenization and image processing. + # Use list comprehension instead of .map() for multi-image support. 
+ converted = [] + for ex in examples: + converted.append({"messages": ex.get("messages", [])}) + dataset = Dataset.from_list(converted) + else: + def format_example(example): + messages = example.get("messages", []) + kwargs = dict(tokenize=False, add_generation_prompt=False) + if recipe.get("reasoning_effort"): + kwargs["reasoning_effort"] = recipe["reasoning_effort"] + text = tokenizer.apply_chat_template(messages, **kwargs) + return {"text": text} - dataset = Dataset.from_list(examples).map(format_example) + dataset = Dataset.from_list(examples).map(format_example) # Optional train/eval split eval_split = recipe.get("eval_split", 0.0) @@ -453,12 +500,18 @@ def format_example(example): optim=recipe.get("optim", "adamw_8bit"), seed=recipe["seed"], max_seq_length=max_seq_len, - dataset_text_field="text", - packing=recipe["packing"], report_to="none" if recipe.get("no_wandb") else "wandb", run_name=f"swe-gym-sft-{'full' if is_full else 'lora'}-{Path(model_name).name}", **train_kwargs, ) + + # Vision mode: no packing, no text field (collator handles it) + if not is_vision: + sft_kwargs["dataset_text_field"] = "text" + sft_kwargs["packing"] = recipe["packing"] + else: + sft_kwargs["remove_unused_columns"] = False + sft_kwargs["dataset_kwargs"] = {"skip_prepare_dataset": True} if recipe.get("warmup_ratio"): sft_kwargs["warmup_ratio"] = recipe["warmup_ratio"] if eval_dataset: @@ -473,7 +526,7 @@ def format_example(example): training_args = SFTConfig(**sft_kwargs) - trainer = SFTTrainer( + trainer_kwargs = dict( model=model, tokenizer=tokenizer, train_dataset=train_dataset, @@ -481,8 +534,27 @@ def format_example(example): args=training_args, ) - # Train on completions only — ~1% accuracy boost per QLoRA paper - if recipe.get("train_on_completions"): + # Vision: use UnslothVisionDataCollator for image/audio processing + if is_vision: + from unsloth import UnslothVisionDataCollator + collator_kwargs = dict(model=model, tokenizer=tokenizer) + if 
recipe.get("vision_resize") is not None: + collator_kwargs["resize"] = recipe["vision_resize"] + if recipe.get("vision_snap_to_patch_size") is not None: + collator_kwargs["snap_to_patch_size"] = recipe["vision_snap_to_patch_size"] + # train_on_completions via collator for vision + if recipe.get("train_on_completions"): + collator_kwargs["completion_only_loss"] = True + collator_kwargs["train_on_responses_only"] = True + collator_kwargs["instruction_part"] = recipe.get("instruction_part") + collator_kwargs["response_part"] = recipe.get("response_part") + print(" Training on completions only (via vision collator)") + trainer_kwargs["data_collator"] = UnslothVisionDataCollator(**collator_kwargs) + + trainer = SFTTrainer(**trainer_kwargs) + + # Train on completions only — ~1% accuracy boost per QLoRA paper (text mode) + if recipe.get("train_on_completions") and not is_vision: from unsloth.chat_templates import train_on_responses_only trainer = train_on_responses_only( trainer, @@ -501,7 +573,8 @@ def format_example(example): print(f" Early stopping: patience={patience}") mode_str = "full" if is_full else "LoRA" - print(f"\n=== Starting {mode_str} SFT ===") + vision_str = " (vision)" if is_vision else "" + print(f"\n=== Starting {mode_str} SFT{vision_str} ===") print(f" Model: {model_name}") print(f" Epochs: {recipe['epochs']}") print(f" Batch size: {recipe['batch_size']} x {recipe['grad_accum']} grad accum")