LocalResearchGroup · dapopov-st · Apr 23, 2025 · Apr 23, 2025
diff --git a/Dockerfile-dpv-branch b/Dockerfile-dpv-branch
@@ -0,0 +1,33 @@
+# Development image for the LocalResearchGroup llm-foundry fork (branch llama-modeling-dpv):
+# a micromamba environment named "llm-foundry" with Python 3.12, uv and the CUDA 12.4.1 packages.
+# NOTE(review): ":latest" makes rebuilds non-reproducible; pin a specific tag or digest
+# once a known-good micromamba version has been chosen.
+FROM mambaorg/micromamba:latest
+
+# Root is used for the apt-get install and for every later step; the image also runs as root.
+USER root
+
+# Install git and other dependencies
+RUN apt-get update && \
+    apt-get install -y \
+        curl \
+        git \
+        nano \
+        wget && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Make uv install the project into the micromamba environment instead of a separate .venv.
+# The path assumes the base image keeps its environments under /opt/conda - confirm when pinning.
+ENV UV_PROJECT_ENVIRONMENT=/opt/conda/envs/llm-foundry
+
+WORKDIR /llm-foundry
+
+# Clone llm-foundry repo and set up environment.
+# The "flash" extra is synced in a second pass; presumably it needs the packages from the
+# first sync to be present while it builds - confirm before merging the two syncs.
+# The uv and micromamba package caches are removed in the same layer so they do not
+# end up in the image.
+RUN git clone -b llama-modeling-dpv https://github.com/LocalResearchGroup/llm-foundry.git . && \
+    micromamba create -y -n llm-foundry python=3.12 uv cuda -c nvidia/label/12.4.1 -c conda-forge && \
+    micromamba run -n llm-foundry uv python pin 3.12 && \
+    micromamba run -n llm-foundry uv sync --dev --extra gpu && \
+    micromamba run -n llm-foundry uv sync --dev --extra gpu --extra flash --no-cache && \
+    micromamba run -n llm-foundry uv cache clean && \
+    micromamba clean --all --yes
+
+ENV CONDA_DEFAULT_ENV=llm-foundry
+ENV PATH=/opt/conda/envs/llm-foundry/bin:$PATH
+
+# Initialize conda in bash and activate environment by default
+RUN echo "eval \"\$(micromamba shell hook --shell bash)\"" >> ~/.bashrc && \
+    echo "micromamba activate llm-foundry" >> ~/.bashrc
+
+# Open port to view Aim dashboard live from the container (optional) - Not related to aim remote upload server.
+EXPOSE 43800
+
+# Default shell with environment activated
+CMD ["/bin/bash"]
+
+#Build: 2025-04-06-123410  #<-- Change this number each time
diff --git a/llmfoundry/command_utils/eval.py b/llmfoundry/command_utils/eval.py
@@ -566,3 +566,141 @@ def eval_from_yaml(
         yaml_cfg = om.merge(yaml_cfg, cli_cfg)
     assert isinstance(yaml_cfg, DictConfig)
     return evaluate(yaml_cfg)
+
+
+def convert_peft_adapter_format(model_dir: str) -> None:
+    """Convert PEFT adapter from safetensors to bin format to avoid device metadata issues.
+
+    This function performs three operations:
+    1. Converts the adapter weights from safetensors to PyTorch .bin format
+       (skipped when adapter_model.bin already exists)
+    2. Renames the original safetensors file to .safetensors.bak
+    3. Updates the adapter_config.json to reference .bin files instead of .safetensors
+
+    The conversion is best-effort: any exception is caught and reported with
+    ``print`` instead of being propagated to the caller.
+
+    Args:
+        model_dir: Full path to the model directory containing PEFT adapter files.
+                  This should be the directory containing:
+                  - adapter_config.json
+                  - adapter_model.safetensors
+                  Example: '/model-checkpoints/llama3-1b-lora-20250420_180800'
+
+    Returns:
+        None
+
+    Side Effects:
+        - Creates adapter_model.bin in model_dir
+        - Renames adapter_model.safetensors to adapter_model.safetensors.bak
+        - Modifies adapter_config.json to reference .bin files (the file is only
+          rewritten when at least one value actually changes)
+    """
+    import json
+    import os
+    import re
+
+    import torch
+
+    # Paths for the adapter files
+    adapter_path = os.path.join(model_dir, "adapter_model.safetensors")
+    bin_adapter_path = os.path.join(model_dir, "adapter_model.bin")
+    backup_path = os.path.join(model_dir, "adapter_model.safetensors.bak")
+    config_path = os.path.join(model_dir, "adapter_config.json")
+
+    # Match "safetensors" only when it is the whole value or a dot-delimited
+    # extension, so directory names or file stems that merely contain the word
+    # are left untouched.
+    extension = re.compile(r"(^|\.)safetensors(?=$|\.)")
+
+    def _to_bin(value):
+        """Swap a safetensors extension for bin; non-string values pass through."""
+        if isinstance(value, str):
+            return extension.sub(r"\1bin", value)
+        return value
+
+    try:
+        # Load and convert if needed
+        if os.path.exists(adapter_path) and not os.path.exists(bin_adapter_path):
+            # Load safetensors adapter with explicit CPU device
+            from safetensors.torch import load_file
+            weights = load_file(adapter_path, device="cpu")
+
+            # Save to a temporary name first and move it into place afterwards.
+            # A failed save must not leave a truncated adapter_model.bin behind,
+            # because the next call would treat it as an already converted adapter.
+            tmp_path = bin_adapter_path + ".tmp"
+            try:
+                torch.save(weights, tmp_path)
+                os.replace(tmp_path, bin_adapter_path)
+            finally:
+                if os.path.exists(tmp_path):
+                    os.remove(tmp_path)
+            print(f"Converted adapter to .bin format: {bin_adapter_path}")
+
+        # Rename/move safetensors file to force bin usage.
+        # os.replace also succeeds when a stale backup already exists.
+        if os.path.exists(adapter_path):
+            os.replace(adapter_path, backup_path)
+            print(f"Moved safetensors file to {backup_path} to force bin usage")
+
+        # Update config to reference .bin file
+        if os.path.exists(config_path):
+            with open(config_path, 'r') as f:
+                config = json.load(f)
+
+            changed = False
+
+            # A missing, null or non-dict weight_map is simply skipped
+            weight_map = config.get("weight_map")
+            if isinstance(weight_map, dict):
+                for key, value in weight_map.items():
+                    new_value = _to_bin(value)
+                    if new_value != value:
+                        weight_map[key] = new_value
+                        changed = True
+
+            # Also update model_type if needed
+            model_type = config.get("model_type")
+            new_model_type = _to_bin(model_type)
+            if new_model_type != model_type:
+                config["model_type"] = new_model_type
+                changed = True
+
+            if changed:
+                with open(config_path, 'w') as f:
+                    json.dump(config, f, indent=2)
+                print("Updated adapter config to use .bin format")
+    except Exception as e:
+        print(f"Failed to convert adapter format: {e}")
+
+
+def restore_safetensors_after_eval(model_dir: str) -> None:
+    """Restore safetensor files to their original state after evaluation.
+
+    This function reverses the changes made by convert_peft_adapter_format():
+    1. Restores the original adapter_model.safetensors from .bak file if it exists
+    2. Updates the adapter_config.json to reference .safetensors again
+    3. Keeps the .bin file in place for potential future use
+
+    Nothing is done (apart from printing a message) when no backup file exists.
+
+    Args:
+        model_dir: Full path to the model directory containing PEFT adapter files.
+                  This should be the directory containing:
+                  - adapter_config.json
+                  - adapter_model.bin
+                  - adapter_model.safetensors.bak (created by convert_peft_adapter_format)
+                  Example: '/model-checkpoints/llama3-1b-lora-20250420_180800'
+
+    Returns:
+        None
+
+    Raises:
+        OSError: If renaming the backup or reading/writing the config fails.
+        json.JSONDecodeError: If adapter_config.json is not valid JSON.
+
+    Side Effects:
+        - Restores adapter_model.safetensors from the .bak file if it exists
+        - Modifies adapter_config.json to reference .safetensors files
+        - Keeps adapter_model.bin for potential future use
+    """
+    import json
+    import os
+    import re
+
+    # Paths for the adapter files
+    backup_path = os.path.join(model_dir, "adapter_model.safetensors.bak")
+    adapter_path = os.path.join(model_dir, "adapter_model.safetensors")
+    config_path = os.path.join(model_dir, "adapter_config.json")
+
+    # Match "bin" only when it is the whole value or a dot-delimited extension.
+    # A plain substring replace would also rewrite values such as "combined",
+    # "robin" or a "/usr/bin/..." directory component.
+    extension = re.compile(r"(^|\.)bin(?=$|\.)")
+
+    def _to_safetensors(value):
+        """Swap a bin extension for safetensors; non-string values pass through."""
+        if isinstance(value, str):
+            return extension.sub(r"\1safetensors", value)
+        return value
+
+    # Only restore if backup exists
+    if not os.path.exists(backup_path):
+        print(f"No backup found at {backup_path}, nothing to restore")
+        return
+
+    if os.path.exists(adapter_path):
+        print(f"Safetensors file already exists at {adapter_path}, skipping restore")
+    else:
+        os.rename(backup_path, adapter_path)
+        print("Restored safetensors file from backup")
+
+    # Update config only if needed
+    if not os.path.exists(config_path):
+        return
+
+    with open(config_path, 'r') as f:
+        config = json.load(f)
+
+    needs_update = False
+
+    # A missing, null or non-dict weight_map is simply skipped
+    weight_map = config.get("weight_map")
+    if isinstance(weight_map, dict):
+        for key, value in weight_map.items():
+            new_value = _to_safetensors(value)
+            if new_value != value:
+                weight_map[key] = new_value
+                needs_update = True
+
+    model_type = config.get("model_type")
+    new_model_type = _to_safetensors(model_type)
+    if new_model_type != model_type:
+        config["model_type"] = new_model_type
+        needs_update = True
+
+    if needs_update:
+        with open(config_path, 'w') as f:
+            json.dump(config, f, indent=2)
+        print("Updated adapter config to use safetensors format")
diff --git a/llmfoundry/models/llama/README.md b/llmfoundry/models/llama/README.md
@@ -0,0 +1,83 @@
+# Training Custom Llama Models
+
+## Customizing Training
+
+### YAML file
+To customize the training process, modify the YAML configuration file specified by `TRAIN_YAML`. The default is `scripts/train/yamls/llama/llama3-1b-lora2.yaml`.
+
+### train_with_custom_llama.py
+
+train_with_custom_llama.py serves as the entry point for training with our custom LLaMA implementation. It handles the configuration loading from YAML files, registers our CustomLlamaModel with the model registry, and orchestrates the training process. The script manages critical setup tasks including HuggingFace authentication, dataset path configuration, and preparing model parameters before delegating to the training framework. It can be customized through command-line arguments or environment variables, making it flexible for different training scenarios.
+
+### Weight Loading in CustomLlamaModel
+
+The `_copy_weights_from_hf_llama` method handles weight transfer from standard Hugging Face models to our custom implementation. It first loads a Hugging Face model via `from_pretrained()` to serve as a source, then systematically copies weights component by component, including embeddings, transformer layers, normalization layers, and the output head. The method explicitly tracks copy progress, reporting both successful transfers and any uninitialized weights to ensure model integrity. This direct weight mapping approach enables our custom implementation to precisely match pretrained model behavior while gaining the performance benefits of our optimized architecture.
+
+
+### CustomLlamaModel Initialization and Adapter Pattern
+
+CustomLlamaModel follows a two-layer architecture that separates model implementation from framework integration. The outer class inherits from HuggingFaceModel, managing compatibility with the training framework, while the inner model (created via _initialize_model_from_config) implements the actual transformer architecture with optimized components. During initialization, the class loads a pretrained model, creates a corresponding optimized implementation, then systematically transfers weights via _copy_weights_from_hf_llama. This adapter pattern allows for performance optimizations in the inner model while maintaining full compatibility with HuggingFace's ecosystem, and includes built-in support for PEFT adapters that can be attached to the initialized model.
+
+
+### Dual Forward Methods in the Adapter Pattern
+
+The CustomLlamaModel implements two distinct forward methods that operate in tandem. The inner model's forward method (bound to the model instance using forward.__get__) contains the raw computational logic for the transformer architecture, handling token embeddings, attention operations, and feed-forward networks. The outer CustomLlamaModel's forward method serves as an adapter interface, filtering input arguments to match inner model requirements, managing state tracking, and implementing training-specific logic like loss calculation via the fused loss function. This separation allows the inner model to remain focused on efficient computation while the outer wrapper handles framework integration, creating a clean division of responsibilities that simplifies maintenance and optimization.
+
+### Model Registration and Framework Integration
+
+The register_custom_llama_model() function in register.py integrates our custom model implementation with the training framework. It adds the CustomLlamaModel class to the framework's model registry under the key "hf_causal_lm", allowing our model to be used wherever HuggingFace causal language models are supported. This registration happens explicitly in both train_with_custom_llama.py before starting training and in local_llama_training.py's evaluate_model function before evaluation begins. Without this registration step, the framework would use a standard implementation instead of our optimized version with custom components.
+
+### local_llama_training.py
+
+The local script adapts the Modal cloud deployment approach for single-machine environments while preserving the core workflow. Key differences include file path handling (local directories vs Modal Volumes), environment setup (local Python interpreter vs containerized environment), and execution model (synchronous function calls vs Modal's distributed functions). The local script adds more comprehensive logging, path validation, and error handling to manage filesystem interactions that Modal handles automatically. While Modal's script leverages cloud-specific features like network tunneling for Aim visualization and GPU provisioning via decorators, the local version provides equivalent functionality through direct subprocess calls and environment variable configuration. The custom model is integrated the same way in both scripts; this part should not change.
+
+This is a local version of the LLM training script that runs directly on your GPUs without using Modal. It's designed to work with the LLM Foundry framework for training and fine-tuning language models.
+
+## Prerequisites
+
+**Install llmfoundry first by following the Setup steps below.**
+
+## Setup
+
+1. **Clone the LLM Foundry repository**:
+   ```bash
+   git clone https://github.com/LocalResearchGroup/llm-foundry.git
+   cd llm-foundry
+   ```
+
+2. **Install dependencies**:
+   ```bash
+   pip install -e .
+   ```
+
+3. **Set up your HuggingFace token**:
+   ```bash
+   export HF_TOKEN=your_token_here
+   ```
+
+## Usage
+
+**Run the full training pipeline**:
+   ```bash
+   python local_llama_training.py
+   ```
+
+For multi-GPU training, run the following instead (the `--nproc_per_node` value sets the number of GPUs):
+
+   ```bash
+   NODE_RANK=0 python -m torch.distributed.run --nproc_per_node=2 local_llama_training.py
+   ```
+
+## Directory Structure
+
+The script creates the following directory structure:
+
+```
+./
+├── datasets/              # Dataset storage
+│   └── c4_small/          # C4 dataset
+├── model-checkpoints/     # Model checkpoints
+├── runs/                  # Training run outputs
+│   └── model-name-timestamp/  # Individual run
+└── local_llama_training.py  # This script
+```
+
diff --git a/llmfoundry/models/llama/__init__.py b/llmfoundry/models/llama/__init__.py
@@ -0,0 +1,37 @@
+"""Llama model package."""
+
+# from .model import LlamaForCausalLM
+# from .config import LlamaConfig
+# from .attention import LlamaAttention
+# from .mlp import LlamaMLP
+# from .decoder import LlamaDecoderLayer
+# from .rms_norm import LlamaRMSNorm
+
+# __all__ = [
+#     'LlamaForCausalLM',
+#     'LlamaConfig',
+#     'LlamaAttention',
+#     'LlamaMLP',
+#     'LlamaDecoderLayer',
+#     'LlamaRMSNorm',
+# ]
+
+# Import core components
+from .config import LlamaConfig
+from .attention import LlamaAttention
+from .mlp import LlamaMLP
+from .decoder import LlamaDecoderLayer
+from .rms_norm import LlamaRMSNorm
+from .register import get_custom_llama_model, register_custom_llama_model
+from .model import CustomLlamaModel
+
+# Public API of the llama package: the names exported by a star-import of this
+# package. Each entry is imported from a submodule above.
+__all__ = [
+    'LlamaConfig',
+    'LlamaAttention',
+    'LlamaMLP',
+    'LlamaDecoderLayer',
+    'LlamaRMSNorm',
+    'get_custom_llama_model',
+    'register_custom_llama_model',
+    'CustomLlamaModel',
+]