diff --git a/Dockerfile-dpv-branch b/Dockerfile-dpv-branch
new file mode 100644
index 0000000..a395c3b
--- /dev/null
+++ b/Dockerfile-dpv-branch
@@ -0,0 +1,33 @@
+FROM mambaorg/micromamba:latest
+
+USER root
+
+# Install git and other dependencies
+RUN apt-get update && apt-get install -y git nano curl wget && apt-get clean && rm -rf /var/lib/apt/lists/*
+
+# Clone llm-foundry repo and set up environment
+RUN git clone -b llama-modeling-dpv https://github.com/LocalResearchGroup/llm-foundry.git /llm-foundry && \
+    cd /llm-foundry && \
+    micromamba create -n llm-foundry python=3.12 uv cuda -c nvidia/label/12.4.1 -c conda-forge && \
+    export UV_PROJECT_ENVIRONMENT=/opt/conda/envs/llm-foundry && \
+    micromamba run -n llm-foundry uv python pin 3.12 && \
+    micromamba run -n llm-foundry uv sync --dev --extra gpu && \
+    micromamba run -n llm-foundry uv sync --dev --extra gpu --extra flash --no-cache
+
+ENV UV_PROJECT_ENVIRONMENT=/opt/conda/envs/llm-foundry
+ENV CONDA_DEFAULT_ENV=llm-foundry
+ENV PATH=/opt/conda/envs/llm-foundry/bin:$PATH
+
+WORKDIR /llm-foundry
+
+# Register the micromamba shell hook in root's ~/.bashrc and activate the llm-foundry environment in interactive bash sessions
+RUN echo "eval \"\$(micromamba shell hook --shell bash)\"" >> ~/.bashrc && \
+    echo "micromamba activate llm-foundry" >> ~/.bashrc
+
+# Expose port 43800 so the Aim dashboard can be viewed live from outside the container (optional). This is unrelated to the Aim remote upload server.
+EXPOSE 43800
+
+# Default shell with environment activated
+CMD ["/bin/bash"]
+
+#Build: 2025-04-06-123410  #<-- Change this number each time
\ No newline at end of file
diff --git a/llmfoundry/command_utils/eval.py b/llmfoundry/command_utils/eval.py
index 9523a5e..5ea5fb8 100644
--- a/llmfoundry/command_utils/eval.py
+++ b/llmfoundry/command_utils/eval.py
@@ -566,3 +566,141 @@ def eval_from_yaml(
         yaml_cfg = om.merge(yaml_cfg, cli_cfg)
     assert isinstance(yaml_cfg, DictConfig)
     return evaluate(yaml_cfg)
+
+
+def convert_peft_adapter_format(model_dir: str) -> None:
+    """Convert PEFT adapter from safetensors to bin format to avoid device metadata issues.
+    
+    This function performs three operations:
+    1. Converts the adapter weights from safetensors to PyTorch .bin format
+    2. Renames the original safetensors file to .safetensors.bak
+    3. Updates the adapter_config.json to reference .bin files instead of .safetensors
+    
+    Args:
+        model_dir: Full path to the model directory containing PEFT adapter files.
+                  This should be the directory containing:
+                  - adapter_config.json
+                  - adapter_model.safetensors
+                  Example: '/model-checkpoints/llama3-1b-lora-20250420_180800'
+    
+    Returns:
+        None
+    
+    Side Effects:
+        - Creates adapter_model.bin in model_dir
+        - Renames adapter_model.safetensors to adapter_model.safetensors.bak
+        - Modifies adapter_config.json to reference .bin files
+    """
+    import torch
+    import json
+    import os
+    
+    # Paths for the adapter files
+    adapter_path = os.path.join(model_dir, "adapter_model.safetensors")
+    bin_adapter_path = os.path.join(model_dir, "adapter_model.bin")
+    config_path = os.path.join(model_dir, "adapter_config.json")
+    
+    try:
+        # Load and convert if needed
+        if os.path.exists(adapter_path) and not os.path.exists(bin_adapter_path):
+            # Load safetensors adapter with explicit CPU device
+            from safetensors.torch import load_file
+            weights = load_file(adapter_path, device="cpu")
+            
+            # Save as PyTorch bin format
+            torch.save(weights, bin_adapter_path)
+            print(f"Converted adapter to .bin format: {bin_adapter_path}")
+        
+        # Rename/move safetensors file to force bin usage
+        if os.path.exists(adapter_path):
+            backup_path = os.path.join(model_dir, "adapter_model.safetensors.bak")
+            os.rename(adapter_path, backup_path)
+            print(f"Moved safetensors file to {backup_path} to force bin usage")
+        
+        # Update config to reference .bin file
+        if os.path.exists(config_path):
+            with open(config_path, 'r') as f:
+                config = json.load(f)
+            
+            # Update config to use bin file
+            weight_map = config.get("weight_map", {})
+            for key in weight_map:
+                if "safetensors" in weight_map[key]:
+                    weight_map[key] = weight_map[key].replace("safetensors", "bin")
+            
+            # Also update model_type if needed
+            if "safetensors" in config.get("model_type", ""):
+                config["model_type"] = config["model_type"].replace("safetensors", "bin")
+            
+            with open(config_path, 'w') as f:
+                json.dump(config, f, indent=2)
+            
+            print(f"Updated adapter config to use .bin format")
+    except Exception as e:
+        print(f"Failed to convert adapter format: {e}")
+
+
+def restore_safetensors_after_eval(model_dir: str) -> None:
+    """Restore safetensor files to their original state after evaluation.
+    
+    This function reverses the changes made by convert_peft_adapter_format():
+    1. Restores the original adapter_model.safetensors from .bak file if it exists
+    2. Updates the adapter_config.json to reference .safetensors again
+    3. Keeps the .bin file in place for potential future use
+    
+    Args:
+        model_dir: Full path to the model directory containing PEFT adapter files.
+                  This should be the directory containing:
+                  - adapter_config.json
+                  - adapter_model.bin
+                  - adapter_model.safetensors.bak (created by convert_peft_adapter_format)
+                  Example: '/model-checkpoints/llama3-1b-lora-20250420_180800'
+    
+    Returns:
+        None
+    
+    Side Effects:
+        - Restores adapter_model.safetensors from the .bak file if it exists
+        - Modifies adapter_config.json to reference .safetensors files
+        - Keeps adapter_model.bin for potential future use
+    """
+    import os
+    import json
+    
+    # Paths for the adapter files
+    backup_path = os.path.join(model_dir, "adapter_model.safetensors.bak")
+    adapter_path = os.path.join(model_dir, "adapter_model.safetensors")
+    config_path = os.path.join(model_dir, "adapter_config.json")
+    
+    # Only restore if backup exists
+    if os.path.exists(backup_path):
+        if os.path.exists(adapter_path):
+            print(f"Safetensors file already exists at {adapter_path}, skipping restore")
+        else:
+            os.rename(backup_path, adapter_path)
+            print(f"Restored safetensors file from backup")
+            
+        # Update config only if needed
+        if os.path.exists(config_path):
+            with open(config_path, 'r') as f:
+                config = json.load(f)
+            
+            # Check if config needs updating
+            needs_update = False
+            weight_map = config.get("weight_map", {})
+            
+            for key in weight_map:
+                if "bin" in weight_map[key]:
+                    weight_map[key] = weight_map[key].replace("bin", "safetensors")
+                    needs_update = True
+            
+            if "bin" in config.get("model_type", ""):
+                config["model_type"] = config["model_type"].replace("bin", "safetensors")
+                needs_update = True
+            
+            if needs_update:
+                with open(config_path, 'w') as f:
+                    json.dump(config, f, indent=2)
+                print(f"Updated adapter config to use safetensors format")
+    else:
+        print(f"No backup found at {backup_path}, nothing to restore")
\ No newline at end of file
diff --git a/llmfoundry/models/llama/README.md b/llmfoundry/models/llama/README.md
new file mode 100644
index 0000000..65d5f20
--- /dev/null
+++ b/llmfoundry/models/llama/README.md
@@ -0,0 +1,83 @@
+# Training Custom Llama Models
+
+## Customizing Training
+
+### YAML file
+To customize the training process, modify the YAML configuration file specified by `TRAIN_YAML`. The default is `scripts/train/yamls/llama/llama3-1b-lora2.yaml`.
+
+### train_with_custom_llama.py
+
+train_with_custom_llama.py serves as the entry point for training with our custom LLaMA implementation. It handles the configuration loading from YAML files, registers our CustomLlamaModel with the model registry, and orchestrates the training process. The script manages critical setup tasks including HuggingFace authentication, dataset path configuration, and preparing model parameters before delegating to the training framework. It can be customized through command-line arguments or environment variables, making it flexible for different training scenarios.
+
+### Weight Loading in CustomLlamaModel
+
+The `_copy_weights_from_hf_llama` method handles weight transfer from standard Hugging Face models to our custom implementation. It first loads a Hugging Face model via `from_pretrained()` to serve as a source, then systematically copies weights component by component, including embeddings, transformer layers, normalization layers, and the output head. The method explicitly tracks copy progress, reporting both successful transfers and any uninitialized weights to ensure model integrity. This direct weight mapping approach enables our custom implementation to precisely match pretrained model behavior while gaining the performance benefits of our optimized architecture.
+
+
+### CustomLlamaModel Initialization and Adapter Pattern
+
+CustomLlamaModel follows a two-layer architecture that separates model implementation from framework integration. The outer class inherits from HuggingFaceModel, managing compatibility with the training framework, while the inner model (created via _initialize_model_from_config) implements the actual transformer architecture with optimized components. During initialization, the class loads a pretrained model, creates a corresponding optimized implementation, then systematically transfers weights via _copy_weights_from_hf_llama. This adapter pattern allows for performance optimizations in the inner model while maintaining full compatibility with HuggingFace's ecosystem, and includes built-in support for PEFT adapters that can be attached to the initialized model.
+
+
+### Dual Forward Methods in the Adapter Pattern
+
+The CustomLlamaModel implements two distinct forward methods that operate in tandem. The inner model's forward method (bound to the model instance using forward.__get__) contains the raw computational logic for the transformer architecture, handling token embeddings, attention operations, and feed-forward networks. The outer CustomLlamaModel's forward method serves as an adapter interface, filtering input arguments to match inner model requirements, managing state tracking, and implementing training-specific logic like loss calculation via the fused loss function. This separation allows the inner model to remain focused on efficient computation while the outer wrapper handles framework integration, creating a clean division of responsibilities that simplifies maintenance and optimization.
+
+### Model Registration and Framework Integration
+
+The register_custom_llama_model() function in register.py integrates our custom model implementation with the training framework. It adds the CustomLlamaModel class to the framework's model registry under the key "hf_causal_lm", allowing our model to be used wherever HuggingFace causal language models are supported. This registration happens explicitly in both train_with_custom_llama.py before starting training and in local_llama_training.py's evaluate_model function before evaluation begins. Without this registration step, the framework would use a standard implementation instead of our optimized version with custom components.
+
+### local_llama_training.py
+
+The local script adapts the Modal cloud deployment approach for single-machine environments while preserving the core workflow. Key differences include file path handling (local directories vs. Modal Volumes), environment setup (local Python interpreter vs. containerized environment), and execution model (synchronous function calls vs. Modal's distributed functions). The local script adds more comprehensive logging, path validation, and error handling to manage filesystem interactions that Modal handles automatically. While Modal's script leverages cloud-specific features like network tunneling for Aim visualization and GPU provisioning via decorators, the local version provides equivalent functionality through direct subprocess calls and environment variable configuration. The custom model integration itself is the same in both versions.
+
+This is a local version of the LLM training script that runs directly on your GPUs without using Modal. It's designed to work with the LLM Foundry framework for training and fine-tuning language models.
+
+## Prerequisites
+
+**Install llmfoundry first by following the steps in the Setup section below.**
+
+## Setup
+
+1. **Clone the LLM Foundry repository**:
+   ```bash
+   git clone -b llama-modeling-dpv https://github.com/LocalResearchGroup/llm-foundry.git
+   cd llm-foundry
+   ```
+
+2. **Install dependencies**:
+   ```bash
+   pip install -e .
+   ```
+
+3. **Set up your HuggingFace token**:
+   ```bash
+   export HF_TOKEN=your_token_here
+   ```
+
+## Usage
+
+**Run the full training pipeline**:
+   ```bash
+   python local_llama_training.py
+   ```
+
+For multi-GPU training, run the following instead; the `--nproc_per_node` value sets the number of GPUs:
+
+   ```bash
+   NODE_RANK=0 python -m torch.distributed.run --nproc_per_node=2 local_llama_training.py
+   ```
+
+## Directory Structure
+
+The script creates the following directory structure:
+
+```
+./
+├── datasets/              # Dataset storage
+│   └── c4_small/          # C4 dataset
+├── model-checkpoints/     # Model checkpoints
+├── runs/                  # Training run outputs
+│   └── model-name-timestamp/  # Individual run
+└── local_llama_training.py  # This script
+```
+
diff --git a/llmfoundry/models/llama/__init__.py b/llmfoundry/models/llama/__init__.py
new file mode 100644
index 0000000..a6dc50a
--- /dev/null
+++ b/llmfoundry/models/llama/__init__.py
@@ -0,0 +1,37 @@
+"""Llama model package."""
+
+# from .model import LlamaForCausalLM
+# from .config import LlamaConfig
+# from .attention import LlamaAttention
+# from .mlp import LlamaMLP
+# from .decoder import LlamaDecoderLayer
+# from .rms_norm import LlamaRMSNorm
+
+# __all__ = [
+#     'LlamaForCausalLM',
+#     'LlamaConfig',
+#     'LlamaAttention',
+#     'LlamaMLP',
+#     'LlamaDecoderLayer',
+#     'LlamaRMSNorm',
+# ]
+
+# Import core components
+from .config import LlamaConfig
+from .attention import LlamaAttention
+from .mlp import LlamaMLP
+from .decoder import LlamaDecoderLayer
+from .rms_norm import LlamaRMSNorm
+from .register import get_custom_llama_model, register_custom_llama_model
+from .model import CustomLlamaModel
+
+# Public API of the llama package: exactly the names re-exported by the
+# relative imports earlier in this module.
+__all__ = [
+    'LlamaConfig',
+    'LlamaAttention',
+    'LlamaMLP',
+    'LlamaDecoderLayer',
+    'LlamaRMSNorm',
+    'get_custom_llama_model',
+    'register_custom_llama_model',
+    'CustomLlamaModel',
+]
\ No newline at end of file
diff --git a/llmfoundry/models/llama/attention.py b/llmfoundry/models/llama/attention.py
new file mode 100644
index 0000000..40550b8
--- /dev/null
+++ b/llmfoundry/models/llama/attention.py
@@ -0,0 +1,126 @@
+from flash_attn import flash_attn_func
+from typing import Optional, Tuple
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange, repeat
+from .liger_rope import LigerRopeFunction
+from .config import LlamaConfig
+
+class LlamaAttention(nn.Module):
+    def __init__(self, config: LlamaConfig):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.head_dim = config.hidden_size // config.num_attention_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = config.rope_theta
+        
+        if (self.head_dim * self.num_heads) != self.hidden_size:
+            raise ValueError(
+                f"hidden_size must be divisible by num_attention_heads (got `hidden_size`: {self.hidden_size}"
+                f" and `num_attention_heads`: {self.num_heads})."
+            )
+        
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+        
+        self.register_buffer(
+            "cos_cached",
+            self._compute_rope_embeddings(
+                self.max_position_embeddings,
+                self.head_dim,
+                self.rope_theta,
+                dtype=torch.float32,
+                device=self.q_proj.weight.device,
+            )[0],
+            persistent=False,
+        )
+        self.register_buffer(
+            "sin_cached",
+            self._compute_rope_embeddings(
+                self.max_position_embeddings,
+                self.head_dim,
+                self.rope_theta,
+                dtype=torch.float32,
+                device=self.q_proj.weight.device,
+            )[1],
+            persistent=False,
+        )
+
+    def _compute_rope_embeddings(self, max_position_embeddings, head_dim, base=10000, dtype=None, device=None):
+        inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2, device=device).float() / head_dim))
+        t = torch.arange(max_position_embeddings, device=device, dtype=torch.float32)
+        freqs = torch.einsum("i,j->ij", t, inv_freq)
+        emb = torch.cat((freqs, freqs), dim=-1)
+        cos = emb.cos().to(dtype)
+        sin = emb.sin().to(dtype)
+        return cos.unsqueeze(0), sin.unsqueeze(0)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = None,
+        use_cache: bool = False,
+        output_attentions: bool = False,
+        **kwargs,
+    ) -> torch.Tensor:
+        # In B S (H D)
+        bsz, seq_len, _ = hidden_states.size()
+        
+        if position_ids is None:
+            position_ids = torch.arange(seq_len, device=hidden_states.device)
+            position_ids = repeat(position_ids, 'l -> b l', b=bsz)
+        
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        query_states = rearrange(query_states, "b s (h d) -> b s h d", h=self.num_heads, d=self.head_dim)
+        key_states = rearrange(key_states, "b s (h d) -> b s h d", h=self.num_key_value_heads, d=self.head_dim)
+        value_states = rearrange(value_states, "b s (h d) -> b s h d", h=self.num_key_value_heads, d=self.head_dim)
+
+        # Slice off position specific rope freqs from the cached freqs
+        cos = self.cos_cached[:, position_ids]  # [1, bsz, seq_len, dim]
+        sin = self.sin_cached[:, position_ids]  # [1, bsz, seq_len, dim]
+        
+        query_states, key_states = LigerRopeFunction.apply(
+            query_states,
+            key_states,
+            cos.squeeze(0),
+            sin.squeeze(0),
+            position_ids,
+        )
+
+        # Handle past key values for generation
+        if past_key_values is not None:
+            # Reuse cached key and value states
+            key_states = torch.cat([past_key_values[0], key_states], dim=1)
+            value_states = torch.cat([past_key_values[1], value_states], dim=1)
+        
+        # Cache key and value states for future use
+        if use_cache:
+            present_key_values = (key_states, value_states)
+        else:
+            present_key_values = None
+
+        attn_output = flash_attn_func(
+            query_states,
+            key_states,
+            value_states,
+            dropout_p=0.0,
+            causal=True,
+        )
+        
+        attn_output = rearrange(attn_output, "b s h d -> b s (h d)")
+        attn_output = self.o_proj(attn_output)
+        
+        if use_cache:
+            return attn_output, present_key_values
+        return attn_output
\ No newline at end of file
diff --git a/llmfoundry/models/llama/config.py b/llmfoundry/models/llama/config.py
new file mode 100644
index 0000000..763a852
--- /dev/null
+++ b/llmfoundry/models/llama/config.py
@@ -0,0 +1,57 @@
+from dataclasses import dataclass
+from typing import Optional, Union, Dict, Any
+
+@dataclass
+class LlamaConfig:
+    """Hyperparameters for the custom Llama modules in this package.
+
+    The attention module reads hidden_size, num_attention_heads,
+    num_key_value_heads, max_position_embeddings and rope_theta; the MLP
+    reads hidden_size and intermediate_size; the decoder layer reads
+    hidden_size and rms_norm_eps.
+    """
+    hidden_size: int = 576
+    num_attention_heads: int = 9
+    num_key_value_heads: int = 3
+    num_hidden_layers: int = 30
+    intermediate_size: int = 1536
+    # NOTE(review): LlamaMLP always builds nn.SiLU and does not read this
+    # field - confirm whether other values are meant to be supported.
+    hidden_act: str = "silu"
+    rms_norm_eps: float = 1e-5
+    vocab_size: int = 49152
+    max_position_embeddings: int = 8192
+    # Base of the rotary-embedding frequencies; passed as `base` when the
+    # attention module builds its cos/sin tables.
+    rope_theta: float = 100000.0  # Changed to float
+    tie_word_embeddings: bool = False
+
+    # Add these parameters for our optimizations
+    # NOTE(review): the attention, decoder and MLP modules do not read the
+    # four fields below - confirm where they are consumed.
+    use_cache: bool = True
+    use_unpadded_rope: bool = True  # Control whether to use our optimized unpadded RoPE
+    rope_scaling: Optional[Dict[str, Any]] = None  # For handling extended context lengths
+
+    # Add parameter to control Flash Attention usage
+    use_flash_attn: bool = True
+
+
+# from dataclasses import dataclass
+
+# @dataclass
+# class LlamaConfig:
+#     hidden_size: int = 576
+#     num_attention_heads: int = 9
+#     num_key_value_heads: int = 3
+#     num_hidden_layers: int = 30
+#     intermediate_size: int = 1536
+#     hidden_act: str = "silu"
+#     rms_norm_eps: float = 1e-5
+#     vocab_size: int = 49152
+#     max_position_embeddings: int = 8192
+#     rope_theta: int = 100000
+#     tie_word_embeddings: bool = False
+
+
+# meta-llama/Llama-3.2-3B config:
+# @dataclass
+# class LlamaConfig:
+#     hidden_size: int = 3072
+#     num_attention_heads: int = 24
+#     num_key_value_heads: int = 8
+#     num_hidden_layers: int = 28
+#     intermediate_size: int = 8192
+#     hidden_act: str = "silu"
+#     rms_norm_eps: float = 1e-5
+#     vocab_size: int = 128256
+#     max_position_embeddings: int = 131072
+#     rope_theta: float = 500000.0
+#     tie_word_embeddings: bool = True
\ No newline at end of file
diff --git a/llmfoundry/models/llama/decoder.py b/llmfoundry/models/llama/decoder.py
new file mode 100644
index 0000000..c03779c
--- /dev/null
+++ b/llmfoundry/models/llama/decoder.py
@@ -0,0 +1,47 @@
+from typing import Optional, Tuple
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .mlp import LlamaMLP
+from .config import LlamaConfig
+from .rms_norm import LlamaRMSNorm
+from .attention import LlamaAttention
+
+class LlamaDecoderLayer(nn.Module):
+    def __init__(self, config: LlamaConfig):
+        super().__init__()
+        self.self_attn = LlamaAttention(config)
+        self.mlp = LlamaMLP(config)
+        self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = None,
+        use_cache: bool = False,
+        output_attentions: bool = False,
+        **kwargs
+    ) -> torch.Tensor:
+
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        hidden_states = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+        )
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+
+        return hidden_states
\ No newline at end of file
diff --git a/llmfoundry/models/llama/liger_rope.py b/llmfoundry/models/llama/liger_rope.py
new file mode 100644
index 0000000..f03441e
--- /dev/null
+++ b/llmfoundry/models/llama/liger_rope.py
@@ -0,0 +1,257 @@
+import torch
+import triton
+import triton.language as tl
+
+# https://github.com/linkedin/Liger-Kernel/blob/main/src/liger_kernel/ops/rope.py
+# BSD 2-CLAUSE LICENSE
+# Copyright 2024 LinkedIn Corporation 
+# All Rights Reserved.
+# Redistribution and use in source and binary forms, with or
+# without modification, are permitted provided that the following
+# conditions are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials provided
+# with the distribution.
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# Triton kernel that applies the rotary position embedding to q and k in place.
+# One program instance handles one (batch, position) row: program `pid` works on
+# the data starting at q_ptr + pid * q_row_stride and k_ptr + pid * k_row_stride
+# and writes the rotated values back to the same addresses.
+# With BACKWARD_PASS=True the sin terms change sign, which turns incoming
+# gradients into gradients with respect to the un-rotated inputs.
+# `bs` and `BLOCK_SIZE` are accepted but not referenced in the kernel body.
+@triton.jit
+def _triton_rope(
+    q_ptr,
+    q_row_stride,
+    k_ptr,
+    k_row_stride,
+    cos,
+    cos_row_stride,
+    sin,
+    sin_row_stride,
+    sl,
+    bs: tl.constexpr,
+    cos_bs: tl.constexpr,
+    n_qh: tl.constexpr,
+    n_kh: tl.constexpr,
+    hd: tl.constexpr,
+    pad_n_qh: tl.constexpr,
+    pad_n_kh: tl.constexpr,
+    pad_hd: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+    BACKWARD_PASS: tl.constexpr = False,
+):
+    # q size: (bsz, seq_len, num_q_heads, head_dim)
+    # q stride: (seq_len * num_q_heads * head_dim, num_q_heads * head_dim, head_dim, 1)
+    # k size: (bsz, seq_len, num_kv_heads, head_dim)
+    # k stride: (seq_len * num_kv_heads * head_dim, num_kv_heads * head_dim, head_dim, 1)
+
+    # cos size: (1, seq_len, head_dim) or (bsz, seq_len, head_dim)
+    # stride: (seq_len * head_dim, head_dim, 1)
+    pid = tl.program_id(0)
+
+    # locate start address
+    q_ptr = q_ptr + pid * q_row_stride
+    k_ptr = k_ptr + pid * k_row_stride
+
+    # ####################################################################
+    # get the cos(mθ_{i...d/2}) and sin(mθ_{i...d/2}) for token position
+    # m of this program instance
+    # ####################################################################
+
+    # 1. program instances are laid out in a 1D vector of size bsz * seq_len, which
+    # effectively represents a 2D grid of size [bsz, seq_len] with seq_len dimension
+    # being the fastest changing dimension. Thus we can simply do pid // sl to get the batch index
+    # and pid % sl to get the sequence index.
+    # 2. We only need the left half of cos and sin matrix because the right half is just
+    # a clone of the left half.
+    batch_idx = pid // sl
+    cos_row_idx = pid % sl
+    # When cos/sin carry a single batch entry (cos_bs == 1) every batch element
+    # reads the same rows; otherwise the batch offset is added as well.
+    cos = cos + tl.where(
+        cos_bs == 1,
+        cos_row_idx * cos_row_stride,
+        batch_idx * (sl * cos_row_stride) + cos_row_idx * cos_row_stride,
+    )
+    sin = sin + tl.where(
+        cos_bs == 1,
+        cos_row_idx * sin_row_stride,
+        batch_idx * (sl * sin_row_stride) + cos_row_idx * sin_row_stride,
+    )
+
+    # Offsets span the padded half head dim; the mask cuts them to the real hd // 2.
+    cos_offsets = tl.arange(0, pad_hd // 2)
+    cos_mask = cos_offsets < hd // 2
+    cos_row = tl.load(cos + cos_offsets, mask=cos_mask, other=0)
+    sin_row = tl.load(sin + cos_offsets, mask=cos_mask, other=0)
+
+    # ####################################################################
+    # Load the left and right half of q and k for the current
+    # program instance (i.e. for the current token) separately
+    # ####################################################################
+    # left half of the head
+    first_half_q_offsets = tl.arange(0, pad_n_qh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :]
+    first_half_k_offsets = tl.arange(0, pad_n_kh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :]
+    first_q_mask = (tl.arange(0, pad_n_qh)[:, None] < n_qh) & (tl.arange(0, pad_hd // 2)[None, :] < hd // 2)
+    first_k_mask = (tl.arange(0, pad_n_kh)[:, None] < n_kh) & (tl.arange(0, pad_hd // 2)[None, :] < hd // 2)
+    # q/k tiles are converted to the dtype of the loaded sin row before the math
+    q_tile_1 = tl.load(q_ptr + first_half_q_offsets, mask=first_q_mask, other=0).to(sin_row.dtype)
+    k_tile_1 = tl.load(k_ptr + first_half_k_offsets, mask=first_k_mask, other=0).to(sin_row.dtype)
+
+    # right half of the head
+    second_half_q_offsets = first_half_q_offsets + (hd // 2)
+    second_half_k_offsets = first_half_k_offsets + (hd // 2)
+    second_q_mask = first_q_mask
+    second_k_mask = first_k_mask
+    q_tile_2 = tl.load(q_ptr + second_half_q_offsets, mask=second_q_mask, other=0).to(sin_row.dtype)
+    k_tile_2 = tl.load(k_ptr + second_half_k_offsets, mask=second_k_mask, other=0).to(sin_row.dtype)
+
+    if not BACKWARD_PASS:
+        # y = [x1, x2] * [cos, cos] + [-x2, x1] * [sin, sin]
+        new_q_tile_1 = q_tile_1 * cos_row - q_tile_2 * sin_row
+        tl.store(q_ptr + first_half_q_offsets, new_q_tile_1, mask=first_q_mask)
+        new_q_tile_2 = q_tile_2 * cos_row + q_tile_1 * sin_row
+        tl.store(q_ptr + second_half_q_offsets, new_q_tile_2, mask=second_q_mask)
+
+        new_k_tile_1 = k_tile_1 * cos_row - k_tile_2 * sin_row
+        tl.store(k_ptr + first_half_k_offsets, new_k_tile_1, mask=first_k_mask)
+        new_k_tile_2 = k_tile_2 * cos_row + k_tile_1 * sin_row
+        tl.store(k_ptr + second_half_k_offsets, new_k_tile_2, mask=second_k_mask)
+    else:
+        # with some math, we can get:
+        # dy = [dx1, dx2] * [cos, cos] + [-dx2, dx1] * [-sin, -sin]
+        new_q_tile_1 = q_tile_1 * cos_row + q_tile_2 * sin_row
+        tl.store(q_ptr + first_half_q_offsets, new_q_tile_1, mask=first_q_mask)
+        new_q_tile_2 = q_tile_2 * cos_row - q_tile_1 * sin_row
+        tl.store(q_ptr + second_half_q_offsets, new_q_tile_2, mask=second_q_mask)
+
+        new_k_tile_1 = k_tile_1 * cos_row + k_tile_2 * sin_row
+        tl.store(k_ptr + first_half_k_offsets, new_k_tile_1, mask=first_k_mask)
+        new_k_tile_2 = k_tile_2 * cos_row - k_tile_1 * sin_row
+        tl.store(k_ptr + second_half_k_offsets, new_k_tile_2, mask=second_k_mask)
+
+
+def rope_forward(q, k, cos, sin):
+    # transpose it back to the physical shape because Triton looks at the physical storage
+    # note: q and k are incontiguous before the transformation and will become contiguous after transpose
+    batch_size, seq_len, n_q_head, head_dim = q.shape
+    n_kv_head = k.shape[2]
+    pad_hd = triton.next_power_of_2(head_dim)
+    pad_n_q_head = triton.next_power_of_2(n_q_head)
+    pad_n_kv_head = triton.next_power_of_2(n_kv_head)
+    BLOCK_SIZE = max(pad_n_q_head, pad_n_kv_head)
+
+    n_row = batch_size * seq_len
+
+    # ensure tensors passed into the kernel are contiguous. It will be no-op if they are already contiguous
+    q = q.contiguous()
+    k = k.contiguous()
+    cos = cos.contiguous()
+    sin = sin.contiguous()
+    cos_batch_size = cos.shape[0]
+
+    _triton_rope[(n_row,)](
+        q,
+        q.stride(1),
+        k,
+        k.stride(1),
+        cos,
+        cos.stride(-2),
+        sin,
+        sin.stride(-2),
+        seq_len,
+        batch_size,
+        cos_batch_size,
+        n_q_head,
+        n_kv_head,
+        head_dim,
+        pad_n_q_head,
+        pad_n_kv_head,
+        pad_hd,
+        BLOCK_SIZE=BLOCK_SIZE,
+        BACKWARD_PASS=False,
+    )
+    return q, k, cos, sin
+
+
+def rope_backward(dq, dk, cos, sin):
+    batch_size, seq_len, n_q_head, head_dim = dq.shape
+    cos_batch_size = cos.shape[0]
+    n_kv_head = dk.shape[2]
+    pad_hd = triton.next_power_of_2(head_dim)
+    pad_n_q_head = triton.next_power_of_2(n_q_head)
+    pad_n_kv_head = triton.next_power_of_2(n_kv_head)
+    BLOCK_SIZE = max(pad_n_q_head, pad_n_kv_head)
+
+    n_row = batch_size * seq_len
+
+    # ensure dq and dk are contiguous
+    dq = dq.contiguous()
+    dk = dk.contiguous()
+
+    # backward is similar to forward except swapping few ops
+    _triton_rope[(n_row,)](
+        dq,
+        dq.stride(1),
+        dk,
+        dk.stride(1),
+        cos,
+        cos.stride(-2),
+        sin,
+        sin.stride(-2),
+        seq_len,
+        batch_size,
+        cos_batch_size,
+        n_q_head,
+        n_kv_head,
+        head_dim,
+        pad_n_q_head,
+        pad_n_kv_head,
+        pad_hd,
+        BLOCK_SIZE=BLOCK_SIZE,
+        BACKWARD_PASS=True,
+    )
+    return dq, dk
+
+
+class LigerRopeFunction(torch.autograd.Function):
+    """
+    Triton implementation of the Rotary Positional Embedding (RoPE) operation. Please note that
+    this implements the HuggingFace Llama & Mistral version, whose rotation matrix is slightly different
+    than the original RoPE paper.
+
+    Please find the corresponding HuggingFace implementation here:
+    https://github.com/huggingface/transformers/blob/v4.40.2/src/transformers/models/llama/modeling_llama.py#L184
+
+    For more details about the rotation matrix used here, please refer to:
+    https://discuss.huggingface.co/t/is-llama-rotary-embedding-implementation-correct/44509/2
+    """
+
+    @staticmethod
+    def forward(ctx, q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+        """
+        q size: (bsz, n_q_head, seq_len, head_dim)
+        k size: (bsz, n_kv_head, seq_len, head_dim)
+        cos size: (1, seq_len, head_dim) or (bsz, seq_len, head_dim)
+        sin size: (1, seq_len, head_dim) or (bsz, seq_len, head_dim)
+        """
+        q, k, cos, sin = rope_forward(q, k, cos, sin)
+        ctx.save_for_backward(cos, sin)
+        return q, k
+
+    def backward(ctx, dq, dk):
+        """
+        dq size: (bsz, n_q_head, seq_len, head_dim)
+        dk size: (bsz, n_kv_head, seq_len, head_dim)
+        cos size: (1, seq_len, head_dim) or (bsz, seq_len, head_dim)
+        sin size: (1, seq_len, head_dim) or (bsz, seq_len, head_dim)
+        """
+
+        cos, sin = ctx.saved_tensors
+        dq, dk = rope_backward(dq, dk, cos, sin)
+        return dq, dk, None, None, None, None
\ No newline at end of file
diff --git a/llmfoundry/models/llama/mlp.py b/llmfoundry/models/llama/mlp.py
new file mode 100644
index 0000000..e2147f9
--- /dev/null
+++ b/llmfoundry/models/llama/mlp.py
@@ -0,0 +1,18 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .config import LlamaConfig
+
+class LlamaMLP(nn.Module):
+    """Gated feed-forward block computing ``down_proj(SiLU(gate_proj(x)) * up_proj(x))``.
+
+    The three projections are bias-free linear layers sized from
+    ``config.hidden_size`` and ``config.intermediate_size``.
+    """
+
+    def __init__(self, config: LlamaConfig):
+        super().__init__()
+        model_dim = config.hidden_size
+        ffn_dim = config.intermediate_size
+        self.hidden_size = model_dim
+        self.intermediate_size = ffn_dim
+        # Layers are created in the order gate, up, down, so registration order
+        # (and therefore state_dict key order) is gate_proj, up_proj, down_proj.
+        self.gate_proj = nn.Linear(model_dim, ffn_dim, bias=False)
+        self.up_proj = nn.Linear(model_dim, ffn_dim, bias=False)
+        self.down_proj = nn.Linear(ffn_dim, model_dim, bias=False)
+        self.act_fn = nn.SiLU()
+
+    def forward(self, x):
+        """Apply the gated MLP to ``x`` and return the down-projected result."""
+        gate = self.act_fn(self.gate_proj(x))
+        up = self.up_proj(x)
+        return self.down_proj(gate * up)
\ No newline at end of file
diff --git a/llmfoundry/models/llama/model.py b/llmfoundry/models/llama/model.py
new file mode 100644
index 0000000..6c51dc4
--- /dev/null
+++ b/llmfoundry/models/llama/model.py
@@ -0,0 +1,820 @@
+"""Custom Llama model implementation."""
+
+import sys
+from pathlib import Path
+from typing import Optional, Dict, Any, Union, Tuple, Mapping
+
+import torch
+import torch.nn as nn
+from composer.models import HuggingFaceModel
+from transformers import LlamaForCausalLM as HFLlamaForCausalLM
+from liger_kernel.transformers import LigerFusedLinearCrossEntropyLoss
+
+# Put the directory three levels above this file at the front of sys.path.
+# The location is derived from __file__ rather than being hard-coded.
+current_dir = Path(__file__).resolve().parent.parent.parent
+sys.path.insert(0, str(current_dir))
+import os
+# Module-wide debug switch, read once at import time: enabled when the DEBUG
+# environment variable is "1", "true" or "yes" (case-insensitive); off by default.
+DEBUG = os.environ.get("DEBUG", "0").lower() in ("1", "true", "yes")
+
+class CustomLlamaModel(HuggingFaceModel):
+    """Custom Llama model that extends HuggingFaceModel with optimized implementation."""
+    
+    def __init__(
+        self,
+        pretrained_model_name_or_path: str,
+        tokenizer: Optional[Any] = None,
+        use_flash_attention_2: bool = True,
+        peft_config: Optional[Dict[str, Any]] = None,
+        hidden_size: int = 2048,
+        num_attention_heads: int = 16,
+        num_key_value_heads: int = 4,
+        num_hidden_layers: int = 22,
+        intermediate_size: Optional[int] = None,
+        vocab_size: int = 128256,
+        max_position_embeddings: int = 8192,
+        rms_norm_eps: float = 1e-5,
+        rope_theta: float = 500000.0,
+        use_unpadded_rope: bool = True,
+        use_flash_attn: bool = True,
+        **kwargs: Any
+    ) -> None:
+        """Build the custom Llama network, optionally add LoRA adapters, and wrap it for Composer.
+
+        Args:
+            pretrained_model_name_or_path: Checkpoint name or path handed to ``_create_model``.
+            tokenizer: Tokenizer passed through to ``HuggingFaceModel.__init__``.
+            use_flash_attention_2: Forwarded to ``_create_model`` as ``use_flash_attention_2``.
+            peft_config: Optional dict describing the adapter. Only ``peft_type == 'LORA'``
+                (the default when the key is absent) is applied; recognised keys are ``r``,
+                ``lora_alpha``, ``lora_dropout``, ``target_modules``, ``bias`` and ``task_type``.
+                Any other ``peft_type`` triggers a warning and no adapter is attached.
+            hidden_size: Size of the hidden dimension.
+            num_attention_heads: Number of attention heads.
+            num_key_value_heads: Number of key/value heads for grouped-query attention.
+            num_hidden_layers: Number of transformer layers.
+            intermediate_size: Size of the MLP intermediate dimension; a falsy value
+                falls back to ``hidden_size * 4``.
+            vocab_size: Size of the vocabulary.
+            max_position_embeddings: Maximum sequence length.
+            rms_norm_eps: Epsilon for RMS normalization.
+            rope_theta: Base for RoPE embeddings.
+            use_unpadded_rope: Whether to use unpadded RoPE.
+            use_flash_attn: Whether to use Flash Attention.
+            **kwargs: Extra options forwarded to ``_create_model``. ``import_path`` is dropped.
+                ``torch_dtype`` may be supplied here; it defaults to ``torch.bfloat16``.
+
+        Note:
+            The architecture arguments (``hidden_size`` ... ``use_flash_attn``) are recorded as
+            attributes on this wrapper; this constructor does not pass them to ``_create_model``.
+        """
+        # Drop 'import_path' so it is not forwarded to _create_model.
+        kwargs.pop('import_path', None)
+        # Previously torch_dtype was hard-coded in the call below, so a caller-supplied
+        # value raised "got multiple values for keyword argument 'torch_dtype'".
+        kwargs.setdefault('torch_dtype', torch.bfloat16)
+        print("✅ CUSTOM LLAMA MODEL INITIALIZED")
+
+        # Store model configuration
+        self.hidden_size = hidden_size
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.num_hidden_layers = num_hidden_layers
+        self.intermediate_size = intermediate_size or hidden_size * 4
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.rms_norm_eps = rms_norm_eps
+        self.rope_theta = rope_theta
+        self.use_unpadded_rope = use_unpadded_rope
+        self.use_flash_attn = use_flash_attn
+
+        # Load the model using our custom implementation
+        model = self._create_model(
+            pretrained_model_name_or_path,
+            use_flash_attention_2=use_flash_attention_2,
+            **kwargs
+        )
+
+        print(f"Model type: {type(model).__name__}")
+
+        # Apply PEFT if specified
+        if peft_config:
+            peft_type = peft_config.get('peft_type', 'LORA')
+
+            if peft_type == 'LORA':
+                from peft import get_peft_model, LoraConfig
+                lora_config = LoraConfig(
+                    r=peft_config.get('r', 8),
+                    lora_alpha=peft_config.get('lora_alpha', 16),
+                    lora_dropout=peft_config.get('lora_dropout', 0.05),
+                    target_modules=peft_config.get(
+                        'target_modules',
+                        ["q_proj", "k_proj", "v_proj", "o_proj"]
+                    ),
+                    bias=peft_config.get('bias', 'none'),
+                    task_type=peft_config.get('task_type', 'CAUSAL_LM')
+                )
+                model = get_peft_model(model, lora_config)
+            else:
+                # Surface the situation instead of silently training without adapters.
+                import warnings
+                warnings.warn(
+                    f"Unsupported peft_type {peft_type!r}: only 'LORA' is handled, "
+                    "so peft_config was ignored and no adapter was attached."
+                )
+
+        # Initialize parent class
+        super().__init__(
+            model=model,
+            tokenizer=tokenizer,
+            use_logits=True
+        )
+
+    
+    def _create_model(self, pretrained_model_name_or_path, **kwargs):
+        """Build the custom Llama network and optionally load HuggingFace weights into it.
+
+        Steps: resolve a config, instantiate a HuggingFace ``LlamaForCausalLM`` (from the
+        checkpoint, or randomly initialised when ``pretrained=False``), build the custom
+        module with ``_initialize_model_from_config``, copy the HF weights across when
+        pretrained, then wrap each decoder layer with ``torch.compile``.
+
+        Args:
+            pretrained_model_name_or_path: Checkpoint name or path given to
+                ``LlamaConfig.from_pretrained`` and ``LlamaForCausalLM.from_pretrained``.
+            **kwargs: Keys consumed here are ``config_overrides`` (mapping of config
+                attribute names to values), ``pretrained`` (default ``True``),
+                ``use_flash_attention_2`` (default ``False``) and ``config`` (a ready config
+                object; loaded from the checkpoint when absent or ``None``).
+                ``should_save_peft_only``, ``shift_labels``, ``peft_config`` and
+                ``init_device`` are discarded. Whatever remains is forwarded to
+                ``LlamaForCausalLM.from_pretrained``.
+
+        Returns:
+            The custom ``nn.Module`` with its ``config`` attribute set to the resolved config.
+        """
+        # `import torch._dynamo` further down makes `torch` a local name for this whole
+        # function, so it has to be bound here before its first use.
+        import torch
+
+        def inspect_config(config_obj):
+            """Print the config object structure and valid fields (debug aid)."""
+            import json
+            print("##############START#####################")
+
+            print(f"Config class: {config_obj.__class__.__name__}")
+            print("Config attributes:")
+
+            # Get all attributes that aren't callable or private
+            attrs = {attr: getattr(config_obj, attr)
+                    for attr in dir(config_obj)
+                    if not callable(getattr(config_obj, attr)) and not attr.startswith('_')}
+
+            # Print in a readable format
+            print(json.dumps(attrs, indent=2, default=str))
+
+            # Show the config's to_dict method output if available
+            if hasattr(config_obj, "to_dict") and callable(config_obj.to_dict):
+                print("\nConfig.to_dict():")
+                print(json.dumps(config_obj.to_dict(), indent=2, default=str))
+            print("##############END#####################")
+            return config_obj
+
+        def track_computation(module, input, output):
+            """Forward hook that prints the module name and its input/output shapes."""
+            print(f"Module {module.__class__.__name__} called")
+            print(f"  Input shapes: {[x.shape if isinstance(x, torch.Tensor) else type(x) for x in input]}")
+            print(f"  Output shapes: {output.shape if isinstance(output, torch.Tensor) else [x.shape if isinstance(x, torch.Tensor) else type(x) for x in output]}")
+
+        # Extract custom params
+        config_overrides = kwargs.pop('config_overrides', None)
+        use_pretrained = kwargs.pop('pretrained', True)
+        use_flash_attention_2 = kwargs.pop('use_flash_attention_2', False)
+
+        # Filter out custom parameters that HF models don't accept
+        for param in ('should_save_peft_only', 'shift_labels', 'peft_config', 'init_device'):
+            kwargs.pop(param, None)
+
+        # Pop (rather than read) a caller-supplied config: leaving it in kwargs made the
+        # from_pretrained call below receive `config` twice and raise TypeError.
+        config = kwargs.pop('config', None)
+        if config is None:
+            from transformers import LlamaConfig
+            config = LlamaConfig.from_pretrained(pretrained_model_name_or_path)
+
+        # Apply config overrides if provided
+        if config_overrides:
+            print(f"Applying config_overrides: {config_overrides}")
+            for key, value in config_overrides.items():
+                print(f"  Setting {key} = {value}")
+                setattr(config, key, value)
+
+        # Load HuggingFace model if using pretrained weights
+        if use_pretrained:
+            # Set flash attention if requested
+            if use_flash_attention_2:
+                print("Enabling Flash Attention 2")
+                kwargs['attn_implementation'] = 'flash_attention_2'
+
+            if DEBUG:
+                inspect_config(config)
+
+            # Load HF model with clean kwargs
+            print("Loading weights from pretrained model")
+            hf_model = HFLlamaForCausalLM.from_pretrained(
+                pretrained_model_name_or_path,
+                config=config,
+                **kwargs
+            )
+        else:
+            print("Initializing model with random weights (pretrained=False)")
+            # NOTE(review): this HF instance is not used again on the non-pretrained
+            # path below; confirm whether constructing it is still needed.
+            hf_model = HFLlamaForCausalLM(config)
+
+        # Initialize our custom model
+        print("Creating custom LlamaForCausalLM instance")
+        model = self._initialize_model_from_config(config)
+        if DEBUG:
+            model.lm_head.register_forward_hook(track_computation)
+
+        # Copy weights from HF model to custom model (done before the layers are
+        # wrapped by torch.compile below).
+        if use_pretrained:
+            print("Copying weights from HF model to custom model")
+            self._copy_weights_from_hf_llama(model, hf_model)
+
+        # Layer-wise compilation: each decoder layer is wrapped individually instead of
+        # compiling the whole model, with dynamo error suppression switched on so a
+        # failing layer falls back instead of aborting. The intent stated by the authors
+        # is to stay compatible with PEFT/LoRA adapters.
+        if hasattr(model, 'layers'):
+            compiled_layers = 0
+            print("Selectively compiling transformer layers...")
+
+            try:
+                # Prefer a top-level setter if this torch build exposes one.
+                if hasattr(torch, 'set_dynamo_config'):
+                    torch.set_dynamo_config(suppress_errors=True)
+                else:
+                    import torch._dynamo
+                    if hasattr(torch._dynamo, 'config'):
+                        torch._dynamo.config.suppress_errors = True
+            except (ImportError, AttributeError):
+                print("Cannot configure dynamo error suppression - compilation may fail")
+            except Exception as e:
+                print(f"Warning: Could not configure compilation options: {e}")
+
+            # Get the total number of layers
+            num_layers = len(model.layers)
+
+            # NOTE(review): torch.compile usually defers real compilation to the first
+            # call, so this counter likely reflects wrapping rather than successful
+            # compilation - confirm. Also confirm that mode="reduce-overhead" is
+            # meaningful together with the aot_eager backend.
+            for i in range(num_layers):
+                try:
+                    model.layers[i] = torch.compile(
+                        model.layers[i],
+                        backend="aot_eager",
+                        mode="reduce-overhead",
+                        fullgraph=False
+                    )
+                    compiled_layers += 1
+                except Exception as e:
+                    if DEBUG:
+                        print(f"⚠️ Failed to compile layer {i}: {e}")
+
+            print(f"✅ Successfully compiled {compiled_layers}/{num_layers} transformer layers")
+        else:
+            print("⚠️ Model structure doesn't have accessible layers")
+
+        # Replaces the config attached by _initialize_model_from_config with the
+        # config resolved in this method.
+        model.config = config
+        print("Model loading complete")
+
+        return model
+    
+    def _initialize_model_from_config(self, config):
+        """Assemble the custom Llama network as a plain ``nn.Module`` described by ``config``.
+
+        Args:
+            config: Object exposing ``hidden_size``, ``num_attention_heads``,
+                ``num_hidden_layers``, ``vocab_size`` and ``max_position_embeddings``.
+                Optional attributes (with fallbacks) are ``num_key_value_heads``
+                (``num_attention_heads``), ``intermediate_size`` (``hidden_size * 4``),
+                ``rms_norm_eps`` (1e-5), ``rope_theta`` (500000.0), ``use_unpadded_rope``
+                (True), ``use_flash_attn`` (True) and ``use_fused_loss`` (True).
+
+        Returns:
+            An ``nn.Module`` carrying ``embed_tokens``, ``layers``, ``norm``, ``lm_head``
+            and ``fused_loss_fn``, with ``forward`` and ``prepare_inputs_for_generation``
+            bound onto the instance.
+        """
+        # Lazy import to avoid circular dependency
+        from llmfoundry.models.llama.config import LlamaConfig
+        from llmfoundry.models.llama.rms_norm import LlamaRMSNorm
+        from llmfoundry.models.llama.decoder import LlamaDecoderLayer
+
+        # Create a model instance
+        model = nn.Module()
+
+        # Create a proper LlamaConfig instance
+        llama_config = LlamaConfig(
+            hidden_size=config.hidden_size,
+            num_attention_heads=config.num_attention_heads,
+            num_key_value_heads=getattr(config, 'num_key_value_heads', config.num_attention_heads),
+            num_hidden_layers=config.num_hidden_layers,
+            intermediate_size=getattr(config, 'intermediate_size', config.hidden_size * 4),
+            vocab_size=config.vocab_size,
+            max_position_embeddings=config.max_position_embeddings,
+            rms_norm_eps=getattr(config, 'rms_norm_eps', 1e-5),
+            rope_theta=getattr(config, 'rope_theta', 500000.0),
+            use_unpadded_rope=getattr(config, 'use_unpadded_rope', True),
+            use_flash_attn=getattr(config, 'use_flash_attn', True),
+        )
+
+        # Set model attributes from config
+        model.config = llama_config
+        model.hidden_size = llama_config.hidden_size
+        model.num_attention_heads = llama_config.num_attention_heads
+        model.num_key_value_heads = llama_config.num_key_value_heads
+        model.num_hidden_layers = llama_config.num_hidden_layers
+        model.intermediate_size = llama_config.intermediate_size
+        model.vocab_size = llama_config.vocab_size
+        model.max_position_embeddings = llama_config.max_position_embeddings
+        model.rms_norm_eps = llama_config.rms_norm_eps
+        model.rope_theta = llama_config.rope_theta
+        model.use_unpadded_rope = llama_config.use_unpadded_rope
+        model.use_flash_attn = llama_config.use_flash_attn
+
+        # Embedding layer
+        model.embed_tokens = nn.Embedding(model.vocab_size, model.hidden_size, padding_idx=None)
+
+        # Decoder layers
+        # NOTE(review): the layers receive the incoming `config`, not `llama_config`
+        # built above - confirm this is intended.
+        model.layers = nn.ModuleList([
+            LlamaDecoderLayer(config) for _ in range(model.num_hidden_layers)
+        ])
+
+        # Final normalization
+        model.norm = LlamaRMSNorm(model.hidden_size, eps=model.rms_norm_eps)
+
+        # LM head
+        model.lm_head = nn.Linear(model.hidden_size, model.vocab_size, bias=False)
+        # Flag controlling whether the fused linear + cross-entropy loss is used (default True)
+        model._fused_loss = getattr(config, 'use_fused_loss', True)
+        model.fused_loss_fn = LigerFusedLinearCrossEntropyLoss(ignore_index=-100)
+
+        def _allocated_bytes():
+            """Debug helper: synchronise CUDA and return allocated bytes (0 without CUDA)."""
+            if not torch.cuda.is_available():
+                return 0
+            torch.cuda.synchronize()
+            return torch.cuda.memory_allocated()
+
+        # Add forward method to the model
+        def forward(
+            self,
+            input_ids=None,
+            attention_mask=None,
+            position_ids=None,
+            past_key_values=None,
+            inputs_embeds=None,
+            labels=None,
+            use_cache=None,
+            output_attentions=None,
+            output_hidden_states=None,
+            return_dict=None,
+            **kwargs
+        ):
+            """Run embeddings, decoder layers and final norm, then compute loss and/or logits.
+
+            Returns a dict with ``loss``, ``logits``, ``hidden_states`` (the final normalised
+            states) and ``past_key_values`` when ``return_dict`` is truthy; otherwise
+            ``(loss, logits)`` if a loss was computed, else ``(logits,)``. ``logits`` is
+            ``None`` when the full projection is skipped (training with labels and no
+            output flags set).
+            """
+            # Get hidden states from embeddings
+            if inputs_embeds is None:
+                if input_ids is None:
+                    raise ValueError("Either input_ids or inputs_embeds must be provided.")
+                inputs_embeds = self.embed_tokens(input_ids)
+
+            # Default positions are 0..seq_len-1 for every row. They are derived from
+            # inputs_embeds so that callers passing only inputs_embeds work too.
+            # NOTE(review): positions restart at 0 even when past_key_values are
+            # supplied - confirm the decoder layers account for the cache length.
+            if position_ids is None:
+                batch_size, seq_len = inputs_embeds.shape[:2]
+                position_ids = torch.arange(seq_len, device=inputs_embeds.device)
+                position_ids = position_ids.unsqueeze(0).expand(batch_size, -1)
+
+            # Initialize past key values if not provided
+            if past_key_values is None:
+                past_key_values = tuple([None] * len(self.layers))
+
+            hidden_states = inputs_embeds
+
+            # Initialize present key values for caching
+            present_key_values = () if use_cache else None
+
+            # Process each layer
+            for i, layer in enumerate(self.layers):
+                layer_outputs = layer(
+                    hidden_states=hidden_states,
+                    attention_mask=attention_mask,
+                    position_ids=position_ids,
+                    past_key_values=past_key_values[i],
+                    use_cache=use_cache,
+                    output_attentions=output_attentions,
+                )
+
+                # A layer may return a tuple (hidden states first) or a bare tensor
+                hidden_states = layer_outputs[0] if isinstance(layer_outputs, tuple) else layer_outputs
+
+                # Store present key values if using cache
+                if use_cache:
+                    present_key_values += (layer_outputs[1],)
+
+            # Apply final layer norm
+            hidden_states = self.norm(hidden_states)
+
+            logits = None
+
+            # Calculate loss if labels are provided
+            loss = None
+            if labels is not None:
+                # Next-token objective: position t predicts label t+1
+                final_hidden = hidden_states[..., :-1, :].contiguous().view(-1, self.hidden_size)
+                shift_labels = labels[..., 1:].contiguous().view(-1)
+                use_fused = bool(getattr(self, '_fused_loss', False))
+
+                # Memory probes and progress prints are debug-only: synchronising the
+                # device on every step would otherwise stall each forward pass.
+                if DEBUG:
+                    print("USING FUSED LOSS" if use_fused else "USING STANDARD LOSS")
+                    before_mem = _allocated_bytes()
+
+                if use_fused:
+                    # Loss computed from the head weight and hidden states directly
+                    loss = self.fused_loss_fn(
+                        self.lm_head.weight,
+                        final_hidden,
+                        shift_labels
+                    )
+                else:
+                    # Calculate partial logits for loss only
+                    partial_logits = self.lm_head(final_hidden)
+                    loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
+                    loss = loss_fct(partial_logits, shift_labels)
+
+                if DEBUG:
+                    loss_kind = "fused" if use_fused else "standard"
+                    after_mem = _allocated_bytes()
+                    print(f"Memory change during {loss_kind} loss: {(after_mem - before_mem) / 1024**2:.2f} MB")
+
+            # Full-sequence logits are only computed when something will consume them:
+            # structured output, eval/generation mode, no labels, or extra outputs requested.
+            needs_logits = (
+                return_dict
+                or not self.training
+                or labels is None
+                or output_attentions
+                or output_hidden_states
+            )
+            if needs_logits:
+                if DEBUG:
+                    print("=== LOGITS CALCULATION CHECK ===")
+                    print("Full logits calculation needed for return dict/output features")
+                    before_mem = _allocated_bytes()
+
+                logits = self.lm_head(hidden_states)
+                if DEBUG:
+                    after_mem = _allocated_bytes()
+                    print(f"Memory allocated for full logits: {(after_mem - before_mem) / 1024**2:.2f} MB")
+            elif DEBUG:
+                print("Skipping full logits calculation - not needed")
+
+            # Return outputs
+            if return_dict:
+                return {
+                    "loss": loss,
+                    "logits": logits,
+                    "hidden_states": hidden_states,
+                    "past_key_values": present_key_values,
+                }
+            else:
+                return (loss, logits) if loss is not None else (logits,)
+
+        # Bind the forward method to the model
+        model.forward = forward.__get__(model)
+
+        # Add prepare_inputs_for_generation method to the model
+        def prepare_inputs_for_generation(
+            self,
+            input_ids,
+            past_key_values=None,
+            attention_mask=None,
+            **kwargs
+        ):
+            """Build the keyword arguments for one generation step."""
+            # only last token for input_ids if past is not None
+            if past_key_values is not None:
+                input_ids = input_ids[:, -1].unsqueeze(-1)
+
+                # NOTE(review): the mask is cut to the last position as well - confirm
+                # the attention layers do not need the mask for the cached positions.
+                if attention_mask is not None:
+                    attention_mask = attention_mask[:, -1].unsqueeze(-1)
+
+            return {
+                "input_ids": input_ids,
+                "past_key_values": past_key_values,
+                "attention_mask": attention_mask,
+                "use_cache": kwargs.get("use_cache", True),
+            }
+
+        # Bind the method to the model
+        model.prepare_inputs_for_generation = prepare_inputs_for_generation.__get__(model)
+
+        return model
+    #
+    def _copy_weights_from_hf_llama(self, model, hf_model):
+        """Copy parameters from a HuggingFace Llama model into the custom model and print a report.
+
+        Copied tensors: token embeddings, the seven projection weights and two layer-norm
+        weights of every decoder layer, the final norm and the LM head. Each copy is done
+        in place through ``.data.copy_``. Entries of ``model.state_dict()`` that were not
+        written are listed in a warning at the end.
+
+        Args:
+            model: Custom module exposing ``embed_tokens``, ``layers``, ``norm``, ``lm_head``.
+            hf_model: HuggingFace model exposing ``model.embed_tokens``, ``model.layers``,
+                ``model.norm`` and ``lm_head``.
+        """
+        def resolve(root, dotted_name):
+            # Follow "a.b.weight" one attribute at a time starting from root.
+            target = root
+            for part in dotted_name.split('.'):
+                target = getattr(target, part)
+            return target
+
+        # Maps every destination state-dict key to "has it been written yet?"
+        was_copied = dict.fromkeys(model.state_dict().keys(), False)
+        total_count = len(was_copied)
+        copied_count = 0
+
+        # Token embeddings
+        if hasattr(model, 'embed_tokens') and hasattr(hf_model, 'model'):
+            model.embed_tokens.weight.data.copy_(hf_model.model.embed_tokens.weight.data)
+            was_copied['embed_tokens.weight'] = True
+            copied_count += 1
+
+        # Per-layer weights; the attribute paths are the same on both sides
+        per_layer_weights = (
+            'self_attn.q_proj.weight',
+            'self_attn.k_proj.weight',
+            'self_attn.v_proj.weight',
+            'self_attn.o_proj.weight',
+            'mlp.gate_proj.weight',
+            'mlp.up_proj.weight',
+            'mlp.down_proj.weight',
+            'input_layernorm.weight',
+            'post_attention_layernorm.weight',
+        )
+        layer_total = len(model.layers)
+        for idx, (dst_layer, src_layer) in enumerate(zip(model.layers, hf_model.model.layers)):
+            print(f"Copying weights for layer {idx}/{layer_total}")
+
+            for weight_name in per_layer_weights:
+                key = f"layers.{idx}.{weight_name}"
+                # Only touch tensors the destination model actually owns
+                if key not in was_copied:
+                    continue
+                source = resolve(src_layer, weight_name)
+                destination = resolve(dst_layer, weight_name)
+                destination.data.copy_(source.data)
+                was_copied[key] = True
+                copied_count += 1
+
+        # Final layer norm
+        if hasattr(model, 'norm') and hasattr(hf_model.model, 'norm'):
+            model.norm.weight.data.copy_(hf_model.model.norm.weight.data)
+            was_copied['norm.weight'] = True
+            copied_count += 1
+
+        # LM head
+        if hasattr(model, 'lm_head') and hasattr(hf_model, 'lm_head'):
+            model.lm_head.weight.data.copy_(hf_model.lm_head.weight.data)
+            was_copied['lm_head.weight'] = True
+            copied_count += 1
+
+        # Report anything left untouched
+        uninitialized = [key for key, done in was_copied.items() if not done]
+        if not uninitialized:
+            print(f"SUCCESS: All {total_count} weights were copied successfully!")
+        else:
+            print(f"WARNING: {len(uninitialized)}/{total_count} weights were not initialized:")
+            for name in sorted(uninitialized):
+                print(f"  - {name}")
+
+        print(f"Copy rate: {copied_count}/{total_count} ({copied_count/total_count:.1%})")
+    
+    
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs: Any) -> 'CustomLlamaModel':
+        """Alternate constructor: instantiate the class for ``pretrained_model_name_or_path``.
+
+        Every extra keyword argument is forwarded unchanged to ``__init__``.
+        """
+        init_kwargs = dict(kwargs, pretrained_model_name_or_path=pretrained_model_name_or_path)
+        return cls(**init_kwargs)
+    
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> 'CustomLlamaModel':
+        """Build a model from a config dictionary.
+
+        When ``config['pretrained_model_name_or_path']`` is set, the model is loaded
+        through ``from_pretrained``; only ``use_unpadded_rope`` and ``use_flash_attn``
+        are forwarded from ``config`` on that path. Otherwise a randomly initialised
+        model is built from the architecture keys ``d_model``, ``n_heads``,
+        ``n_kv_heads``, ``n_layers``, ``expansion_ratio``, ``vocab_size``,
+        ``max_seq_len``, ``rms_norm_eps``, ``rope_theta``, ``use_unpadded_rope`` and
+        ``use_flash_attn`` (each with a default).
+
+        Args:
+            config: Dictionary of model options as described above.
+
+        Returns:
+            The constructed ``CustomLlamaModel``.
+        """
+        # If loading from pretrained, use from_pretrained directly
+        pretrained_path = config.get("pretrained_model_name_or_path", None)
+        if pretrained_path:
+            print(f"Loading pretrained model from {pretrained_path}...")
+            model = cls.from_pretrained(
+                pretrained_path,
+                use_unpadded_rope=config.get("use_unpadded_rope", True),
+                use_flash_attn=config.get("use_flash_attn", True),
+            )
+            print(f"Model loaded successfully with {len(model.model.layers)} layers")
+            return model
+
+        # Only build from scratch if no pretrained model specified
+        from transformers import LlamaConfig as HFLlamaConfig
+
+        hidden_size = config.get("d_model", 2048)
+        model_args = {
+            "hidden_size": hidden_size,
+            "num_attention_heads": config.get("n_heads", 16),
+            "num_key_value_heads": config.get("n_kv_heads", 4),
+            "num_hidden_layers": config.get("n_layers", 22),
+            # int(): a fractional expansion_ratio must still give an integer layer width
+            "intermediate_size": int(hidden_size * config.get("expansion_ratio", 4)),
+            "vocab_size": config.get("vocab_size", 128256),
+            "max_position_embeddings": config.get("max_seq_len", 8192),
+            "rms_norm_eps": config.get("rms_norm_eps", 1e-5),
+            "rope_theta": config.get("rope_theta", 500000.0),
+            "use_unpadded_rope": config.get("use_unpadded_rope", True),
+            "use_flash_attn": config.get("use_flash_attn", True),
+        }
+
+        # There is no checkpoint to read a config from, so build one here and disable
+        # weight loading. Previously the placeholder path below was handed to
+        # LlamaConfig.from_pretrained, which cannot resolve it.
+        hf_config = HFLlamaConfig(**model_args)
+
+        # The path argument is required by __init__ but is not dereferenced when a
+        # config is supplied and pretrained=False.
+        dummy_path = "dummy_path_for_initialization"
+        return cls(
+            pretrained_model_name_or_path=dummy_path,
+            pretrained=False,
+            config=hf_config,
+            **model_args
+        )
+    
+   
+    def forward(self, batch: Dict[str, Any]) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
+        """Run the wrapped model on ``batch`` and return ``{'loss': ..., 'logits': ...}``.
+
+        Only batch entries whose key is in ``self.model_forward_args`` are passed to the
+        model. When ``DEBUG`` is enabled the call is wrapped in ``torch.profiler`` and a
+        table sorted by CUDA memory usage is printed; otherwise the model is called directly.
+
+        Args:
+            batch: Mapping of model inputs (and optionally ``labels``).
+
+        Returns:
+            Dict with ``loss`` and ``logits``; either may be ``None``.
+
+        Raises:
+            ValueError: If ``batch`` is not a mapping.
+        """
+        if not isinstance(batch, Mapping):
+            raise ValueError('Unexpected batch type.')
+
+        filtered_batch = {k: v for k, v in batch.items() if k in self.model_forward_args}
+
+        if DEBUG:
+            # Profiling is debug-only: it used to wrap every forward pass even though
+            # its report was printed only in debug mode.
+            from torch.profiler import profile, record_function, ProfilerActivity
+            with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
+                profile_memory=True, record_shapes=True) as prof:
+                with record_function("model_inference"):
+                    outputs = self.model(**filtered_batch)
+            print(prof.key_averages().table(sort_by="self_cuda_memory_usage", row_limit=10))
+        else:
+            outputs = self.model(**filtered_batch)
+
+        # The model returns (loss, logits), (logits,) or - with return_dict - a mapping
+        if isinstance(outputs, Mapping):
+            loss, logits = outputs.get('loss'), outputs.get('logits')
+        elif len(outputs) > 1:
+            loss, logits = outputs[0], outputs[1]
+        elif len(outputs) == 1:
+            loss, logits = None, outputs[0]
+        else:
+            loss, logits = None, None
+
+        if loss is None and 'labels' in batch and logits is not None:
+            # NOTE(review): labels are compared position-by-position here, without the
+            # one-token shift the inner model applies - confirm this is intended.
+            print("BRANCH: calculating loss manually with cross_entropy")
+            loss = torch.nn.functional.cross_entropy(
+                logits.view(-1, logits.size(-1)),
+                batch['labels'].view(-1),
+                ignore_index=-100
+            )
+
+        return {'loss': loss, 'logits': logits}
+    def eval_forward(self, batch: Dict[str, Any], outputs: Optional[Any] = None) -> torch.Tensor:
+        """Run one evaluation step; what is returned depends on ``batch['mode']``.
+
+        Args:
+            batch: Input batch; the generate and logits paths pop 'labels' from it.
+            outputs: Optional pre-computed forward outputs; reused instead of calling forward.
+
+        Returns:
+            Decoded strings ('generate'), logits (use_logits / 'icl_task'), else the raw outputs.
+        """
+        # If the batch mode is generate, we will generate a requested number of tokens
+        if batch.get('mode', None) == 'generate':
+            if self.tokenizer is None:
+                raise ValueError(
+                    'Generation eval cannot be used without providing a tokenizer to the model constructor.',
+                )
+
+            self.labels = batch.pop('labels')
+            generation = self.generate(
+                batch['input_ids'],
+                attention_mask=batch.get('attention_mask'),
+                synced_gpus=torch.distributed.get_world_size() > 1 if torch.distributed.is_initialized() else False,
+                **batch.get('generation_kwargs', {}),
+            )
+
+            # If ' a' encodes to a single token the decoded text is returned as-is; otherwise a space is prepended
+            if len(
+                self.tokenizer(' a', add_special_tokens=False)['input_ids'],
+            ) == 1:
+                return self.tokenizer.batch_decode(
+                    generation[:, batch['input_ids'].shape[1]:],
+                    skip_special_tokens=True,
+                )
+            else:
+                return [
+                    ' ' + generation for generation in
+                    self.tokenizer.batch_decode(generation[:, batch['input_ids'].shape[1]:], skip_special_tokens=True)
+                ]
+
+        # For regular evaluation or ICL task, we want to return logits
+        if self.use_logits or batch.get('mode', None) == 'icl_task':
+            # pop labels first to avoid computing loss
+            self.labels = batch.pop('labels', None)
+
+            # Handle encoder-decoder models
+            if self.config.is_encoder_decoder and 'decoder_input_ids' not in batch and self.labels is not None:
+                if hasattr(self.model, 'prepare_decoder_input_ids_from_labels'):
+                    batch['decoder_input_ids'] = self.model.prepare_decoder_input_ids_from_labels(labels=self.labels)
+                else:
+                    raise RuntimeError(
+                        'Encoder decoder models require that either decoder_input_ids is present in the batch'
+                        ' or that the model has a prepare_decoder_input_ids_from_labels method.',
+                    )
+
+            # Shift labels for causal language models
+            if self.shift_labels or batch.get('mode', None) == 'icl_task':
+                if self.labels is not None:
+                    # HF CausalLM models internally shift labels before computing loss, so we do the same here
+                    self.labels[:, :-1] = self.labels[:, 1:].clone()  # in place: the caller's label tensor is modified
+                    self.labels[:, -1] = -100
+
+            # Get outputs from forward pass if not provided
+            output = outputs if outputs is not None else self.forward(batch)
+            
+            # Extract logits from output
+            if isinstance(output, dict):
+                logits = output.get('logits')
+            elif isinstance(output, tuple):
+                # If outputs is a tuple, first element is loss, second is logits
+                logits = output[1] if len(output) > 1 else output[0]
+            else:
+                # If outputs is just logits
+                logits = output
+                
+            # If logits is None, return the original output
+            if logits is None:
+                return output
+                
+            # If we are in the single class case, then remove the classes dimension
+            if logits.ndim == 2 and logits.shape[1] == 1:
+                logits = logits.squeeze(dim=1)
+                
+            return logits
+        else:
+            # For other evaluation modes, just return the outputs
+            return outputs if outputs is not None else self.forward(batch)
+    
+    def loss(self, outputs: Union[torch.Tensor, Dict[str, torch.Tensor]], batch: Dict[str, Any]) -> torch.Tensor:
+        """Pull the loss value out of whatever shape the forward outputs take.
+
+        Args:
+            outputs: Forward-pass result: a dict with a 'loss' entry, a bare tensor, or a tuple.
+            batch: Input batch (not consulted here).
+
+        Returns:
+            The loss tensor.
+
+        Raises:
+            TypeError: If ``outputs`` is not a dict, tensor, or tuple.
+        """
+        # Dict outputs carry it under 'loss'; a bare tensor is the loss; a tuple holds it first.
+        if isinstance(outputs, dict):
+            return outputs['loss']
+        if isinstance(outputs, torch.Tensor):
+            return outputs
+        if isinstance(outputs, tuple):
+            return outputs[0]
+        raise TypeError(f"Unexpected outputs type: {type(outputs)}")
+    
+    def generate(
+        self,
+        input_ids: torch.LongTensor,
+        max_new_tokens: int = 100,
+        temperature: float = 1.0,
+        top_p: float = 1.0,
+        do_sample: bool = False,
+        pad_token_id: int = 0,
+        eos_token_id: int = 2,
+        attention_mask: Optional[torch.Tensor] = None,
+        **kwargs
+    ) -> torch.LongTensor:
+        """Greedy/sampled decoding that re-runs the full sequence each step (no KV cache).
+
+        Args:
+            input_ids: Prompt token ids, indexed as (batch, seq) below.
+            max_new_tokens: Upper bound on the number of decoding steps.
+            temperature: Divides the next-token logits when > 0; otherwise ignored.
+            top_p: Nucleus threshold; only applied when < 1.0 and ``do_sample`` is True.
+            do_sample: Sample from the softmax instead of taking the argmax.
+            pad_token_id: Accepted for signature compatibility; unused by this loop.
+            eos_token_id: Decoding stops once every row has generated this id.
+            attention_mask: Optional prompt mask; grown by one column of ones per step.
+            **kwargs: Extra generation options (e.g. ``synced_gpus``); ignored.
+
+        Returns:
+            Prompt followed by the generated ids, cast to ``torch.long``.
+        """
+        self.model.eval()
+        prompt_len = input_ids.shape[1]
+        generated_ids = input_ids.clone()
+
+        for _ in range(max_new_tokens):
+            with torch.no_grad():
+                outputs = self.model(generated_ids, attention_mask=attention_mask)
+                next_token_logits = outputs["logits"][:, -1, :]
+
+            if temperature > 0.0:
+                next_token_logits = next_token_logits / temperature
+
+            if top_p < 1.0 and do_sample:
+                sorted_logits, sorted_indices = torch.sort(next_token_logits, descending=True, dim=-1)
+                cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
+                # Shift the mask right so the token that crosses the threshold is kept.
+                sorted_indices_to_remove = cumulative_probs > top_p
+                sorted_indices_to_remove[:, 1:] = sorted_indices_to_remove[:, :-1].clone()
+                sorted_indices_to_remove[:, 0] = False
+                # Map the sorted-order mask back to vocabulary order in one scatter.
+                indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
+                next_token_logits = next_token_logits.masked_fill(indices_to_remove, -float("inf"))
+
+            if do_sample:
+                probs = torch.softmax(next_token_logits, dim=-1)
+                next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
+            else:
+                next_tokens = torch.argmax(next_token_logits, dim=-1)
+
+            next_tokens = next_tokens.unsqueeze(-1)
+            generated_ids = torch.cat([generated_ids, next_tokens], dim=-1)
+            # Grow the mask with the sequence; a stale mask no longer matches the input shape.
+            if attention_mask is not None:
+                new_column = attention_mask.new_ones((attention_mask.shape[0], 1))
+                attention_mask = torch.cat([attention_mask, new_column], dim=-1)
+            # Only generated tokens count towards EOS; ids inside the prompt must not stop decoding.
+            if torch.all((generated_ids[:, prompt_len:] == eos_token_id).any(dim=1)):
+                break
+
+        return generated_ids.to(dtype=torch.long)
+    
+    def get_trainable_params(self) -> list[torch.nn.Parameter]:
+        """Collect every parameter that still has ``requires_grad`` enabled."""
+        return list(filter(lambda param: param.requires_grad, self.parameters()))
+
+    def get_param_count(self, trainable_only: bool = False) -> int:
+        """Count parameter elements, optionally restricted to the trainable ones."""
+        # Choose the parameter set first so the counting happens in one place.
+        selected = self.get_trainable_params() if trainable_only else self.parameters()
+        element_total = sum(param.numel() for param in selected)
+        return element_total
+        
\ No newline at end of file
diff --git a/llmfoundry/models/llama/modern_rope.py b/llmfoundry/models/llama/modern_rope.py
new file mode 100644
index 0000000..189d6a8
--- /dev/null
+++ b/llmfoundry/models/llama/modern_rope.py
@@ -0,0 +1,297 @@
+# Copyright 2024 onwards Answer.AI, LightOn, and contributors
+# License: Apache-2.0
+
+# Copyright (c) 2023, Tri Dao.
+# License: Apache-2.0
+
+import torch
+from einops import rearrange
+from flash_attn.ops.triton.rotary import apply_rotary
+
+from typing import Optional, Tuple, Union
+
+
+class ApplyRotaryEmbUnpad(torch.autograd.Function):
+    @staticmethod
+    def forward(
+        ctx,
+        qkv,
+        cos,
+        sin,
+        interleaved=False,
+        seqlen_offsets: Union[int, torch.Tensor] = 0,
+        cu_seqlens: Optional[torch.Tensor] = None,
+        max_seqlen: Optional[int] = None,
+    ):
+        # qkv is packed as (total_nnz, 3, nheads, headdim); only the q and k slices are passed to apply_rotary
+        total_nnz, three, nheads, headdim = qkv.shape
+        assert three == 3
+        if qkv.is_contiguous():
+            # Call 1 kernel instead of 2 kernels
+            # We need qkv to be contiguous so that when we reshape to combine (3, nheads)
+            # dimensions, we get the same tensor
+            # Same grouping as rearrange(qkv[:, :2], "b_s t h d -> b_s (t h) d"), expressed as a view
+            qk = qkv[:, :2].view(total_nnz, -1, headdim)
+            apply_rotary(
+                qk,
+                cos,
+                sin,
+                seqlen_offsets=seqlen_offsets,
+                cu_seqlens=cu_seqlens,
+                max_seqlen=max_seqlen,
+                interleaved=interleaved,
+                inplace=True,
+            )
+        else:
+            q, k = qkv[:, 0, :, :], qkv[:, 1, :, :]  # non-contiguous input: q and k are handled by separate calls
+            apply_rotary(
+                q,
+                cos,
+                sin,
+                seqlen_offsets=seqlen_offsets,
+                cu_seqlens=cu_seqlens,
+                max_seqlen=max_seqlen,
+                interleaved=interleaved,
+                inplace=True,
+            )
+            apply_rotary(
+                k,
+                cos,
+                sin,
+                seqlen_offsets=seqlen_offsets,
+                cu_seqlens=cu_seqlens,
+                max_seqlen=max_seqlen,
+                interleaved=interleaved,
+                inplace=True,
+            )
+
+        if isinstance(seqlen_offsets, int):
+            ctx.save_for_backward(cos, sin, cu_seqlens)
+            ctx.seqlen_offsets = seqlen_offsets  # int offsets are kept as a plain attribute
+        else:
+            ctx.save_for_backward(cos, sin, cu_seqlens, seqlen_offsets)
+            ctx.seqlen_offsets = None  # None tells backward to read the offsets from saved_tensors
+        ctx.interleaved = interleaved
+        ctx.max_seqlen = max_seqlen
+        return qkv
+
+    @staticmethod
+    def backward(ctx, do):
+        seqlen_offsets = ctx.seqlen_offsets
+        if seqlen_offsets is None:
+            cos, sin, cu_seqlens, seqlen_offsets = ctx.saved_tensors
+        else:
+            cos, sin, cu_seqlens = ctx.saved_tensors
+        if do.is_contiguous():
+            total_nnz, three, nheads, headdim = do.shape
+            # Call 1 kernel instead of 2 kernels
+            # We need dqkv to be contiguous so that when we reshape to combine (3, nheads)
+            # dimensions, we get the same tensor
+            dqk = do[:, :2].view(total_nnz, -1, headdim)
+            apply_rotary(
+                dqk,
+                cos,
+                sin,
+                seqlen_offsets=seqlen_offsets,
+                cu_seqlens=cu_seqlens,
+                max_seqlen=ctx.max_seqlen,
+                interleaved=ctx.interleaved,
+                inplace=True,
+                conjugate=True,  # NOTE(review): presumably applies the inverse rotation for the gradient — confirm in flash_attn
+            )
+        else:
+            dq, dk = do[:, 0, :, :], do[:, 1, :, :]
+            apply_rotary(
+                dq,
+                cos,
+                sin,
+                seqlen_offsets=seqlen_offsets,
+                cu_seqlens=cu_seqlens,
+                max_seqlen=ctx.max_seqlen,
+                interleaved=ctx.interleaved,
+                inplace=True,
+                conjugate=True,
+            )
+            apply_rotary(
+                dk,
+                cos,
+                sin,
+                seqlen_offsets=seqlen_offsets,
+                cu_seqlens=cu_seqlens,
+                max_seqlen=ctx.max_seqlen,
+                interleaved=ctx.interleaved,
+                inplace=True,
+                conjugate=True,
+            )
+
+        return do, None, None, None, None, None, None  # one slot per forward input; only qkv receives a gradient
+
+
+def apply_rotary_emb_unpad(
+    qkv,
+    cos,
+    sin,
+    interleaved=False,
+    seqlen_offsets: Union[int, torch.Tensor] = 0,
+    cu_seqlens: Optional[torch.Tensor] = None,
+    max_seqlen: Optional[int] = None,
+):
+    """
+    Apply rotary embeddings in place to the q and k slices of a packed, unpadded QKV tensor.
+
+    Arguments:
+        qkv: (total_nnz, 3, nheads, headdim) - input tensor for packed QKV.
+        cos, sin: (seqlen_rotary, rotary_dim / 2)
+        interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
+            of 1st half and 2nd half (GPT-NeoX style).
+        seqlen_offsets: (batch_size,) or int. Each sequence in qkv is shifted by this amount.
+            Most commonly used in inference when we have KV cache.
+        cu_seqlens: (batch + 1,) or None
+        max_seqlen: int
+    Return:
+        qkv: the same (total_nnz, 3, nheads, headdim) tensor; q and k are rotated, v is not passed to the kernel.
+    rotary_dim must be <= headdim; the rotation applies to the first rotary_dim features of q and k.
+    """
+    return ApplyRotaryEmbUnpad.apply(qkv, cos, sin, interleaved, seqlen_offsets, cu_seqlens, max_seqlen)
+
+
+class UnpaddedRotaryEmbedding(torch.nn.Module):
+    """
+    The rotary position embeddings applied directly to unpadded sequences.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        base: float = 10000.0,
+        interleaved: bool = False,
+        max_seqlen: Optional[int] = None,
+        scale_base: Optional[bool] = None,  # NOTE(review): annotated bool, but used as a numeric divisor in _update_cos_sin_cache
+        pos_idx_in_fp32: bool = True,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        """
+        interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
+            of 1st half and 2nd half (GPT-NeoX style).
+        pos_idx_in_fp32: if True, the position indices [0.0, ..., seqlen - 1] are in fp32,
+            otherwise they might be in lower precision.
+            This option was added because previously (before 2023-07-02), when we construct
+            the position indices, we use the dtype of self.inv_freq. In most cases this would
+            be fp32, but if the model is trained in pure bf16 (not mixed precision), then
+            self.inv_freq would be bf16, and the position indices are also in bf16.
+            Because of the limited precision of bf16 (e.g. 1995.0 is rounded to 2000.0), the
+            embeddings for some positions will coincide.
+            To maintain compatibility with models previously trained in pure bf16,
+            we add this option.
+        max_seqlen: if max_seqlen, device, and dtype are provided, we precompute the cos_sin_cache
+            up to max_seqlen. If the max_seqlen, device, or dtype during training/inference differ,
+            the cos_sin_cache will be recomputed during the forward pass.
+        """
+        super().__init__()
+        self.dim = dim
+        self.base = float(base)
+        self.pos_idx_in_fp32 = pos_idx_in_fp32
+        # Generate and save the inverse frequency buffer (non trainable)
+        inv_freq = self._compute_inv_freq(device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)  # persistent=False: left out of state_dict
+        self.interleaved = interleaved
+        self.scale_base = scale_base
+        scale = (
+            (torch.arange(0, dim, 2, device=device, dtype=torch.float32) + 0.4 * dim) / (1.4 * dim)
+            if scale_base is not None
+            else None
+        )
+        self.register_buffer("scale", scale, persistent=False)
+
+        self._seq_len_cached = 0
+        self._cos_cached = None
+        self._sin_cached = None
+        self._cos_k_cached = None  # the *_k_* tables are only filled when scale is set
+        self._sin_k_cached = None
+
+        if max_seqlen is not None and device is not None and dtype is not None:
+            self._update_cos_sin_cache(max_seqlen, device=device, dtype=dtype)
+
+    def _compute_inv_freq(self, device=None):
+        return 1.0 / (self.base ** (torch.arange(0, self.dim, 2, device=device, dtype=torch.float32) / self.dim))
+
+    def _update_cos_sin_cache(self, seqlen, device=None, dtype=None):
+        # Reset the tables if the sequence length has changed,
+        # if we're on a new device (possibly due to tracing for instance),
+        # or if we're switching from inference mode to training
+        if (
+            seqlen > self._seq_len_cached
+            or self._cos_cached is None
+            or self._cos_cached.device != device
+            or self._cos_cached.dtype != dtype
+            or (self.training and self._cos_cached.is_inference())
+        ):
+            self._seq_len_cached = seqlen
+            # We want fp32 here, not self.inv_freq.dtype, since the model could be loaded in bf16
+            # And the output of arange can be quite large, so bf16 would lose a lot of precision.
+            # However, for compatibility reason, we add an option to use the dtype of self.inv_freq.
+            if self.pos_idx_in_fp32:
+                t = torch.arange(seqlen, device=device, dtype=torch.float32)
+                # We want fp32 here as well since inv_freq will be multiplied with t, and the output
+                # will be large. Having it in bf16 will lose a lot of precision and cause the
+                # cos & sin output to change significantly.
+                # We want to recompute self.inv_freq if it was not loaded in fp32
+                if self.inv_freq.dtype != torch.float32:
+                    inv_freq = self._compute_inv_freq(device=device)
+                else:
+                    inv_freq = self.inv_freq
+            else:
+                t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
+                inv_freq = self.inv_freq
+            # Don't do einsum, it converts fp32 to fp16 under AMP
+            # torch.outer below computes the same table as einsum("i,j->ij", t, inv_freq)
+            freqs = torch.outer(t, inv_freq)
+            if self.scale is None:
+                self._cos_cached = torch.cos(freqs).to(dtype)
+                self._sin_cached = torch.sin(freqs).to(dtype)
+            else:
+                power = (
+                    torch.arange(seqlen, dtype=self.scale.dtype, device=self.scale.device) - seqlen // 2
+                ) / self.scale_base
+                scale = self.scale.to(device=power.device) ** rearrange(power, "s -> s 1")
+                # We want the multiplication by scale to happen in fp32
+                self._cos_cached = (torch.cos(freqs) * scale).to(dtype)
+                self._sin_cached = (torch.sin(freqs) * scale).to(dtype)
+                self._cos_k_cached = (torch.cos(freqs) / scale).to(dtype)
+                self._sin_k_cached = (torch.sin(freqs) / scale).to(dtype)
+
+    def forward(
+        self,
+        qkv: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        max_seqlen: Optional[int] = None,
+        seqlen_offset: Union[int, torch.Tensor] = 0,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        """
+        qkv: (total_nnz, 3, nheads, headdim)
+        cu_seqlens: (batch + 1,) cumulative sequence lengths
+        max_seqlen: int max seq length in the batch
+        seqlen_offset: (batch_size,) or int. Each sequence in qkv is shifted by this amount.
+            Most commonly used in inference when we have KV cache.
+            If it's a tensor of shape (batch_size,), then to update the cos / sin cache, one
+            should pass in max_seqlen, which will update the cos / sin cache up to that length.
+        Apply rotary embedding *inplace* to qkv and return it (always a single tensor here).
+        """
+        if max_seqlen is not None:  # without max_seqlen the cache is used as-is (still None if it was never built)
+            self._update_cos_sin_cache(max_seqlen, device=qkv.device, dtype=qkv.dtype)
+
+        qkv = apply_rotary_emb_unpad(
+            qkv,
+            self._cos_cached,
+            self._sin_cached,
+            interleaved=self.interleaved,
+            seqlen_offsets=seqlen_offset,
+            cu_seqlens=cu_seqlens,
+            max_seqlen=max_seqlen,
+        )
+
+        return qkv
+
+    def extra_repr(self) -> str:
+        return f"dim={self.dim}, base={self.base}, scale_base={self.scale_base}"
diff --git a/llmfoundry/models/llama/register.py b/llmfoundry/models/llama/register.py
new file mode 100644
index 0000000..a6356fb
--- /dev/null
+++ b/llmfoundry/models/llama/register.py
@@ -0,0 +1,15 @@
+"""Registration utilities for Llama models."""
+
+from llmfoundry.models.llama.model import CustomLlamaModel
+
+
+def get_custom_llama_model():
+    """Return the CustomLlamaModel class itself (not an instance)."""
+    return CustomLlamaModel
+
+
+def register_custom_llama_model():
+    """Register CustomLlamaModel in llmfoundry's model registry under "hf_causal_lm" and return the class."""
+    from llmfoundry import registry  # imported inside the function rather than at module import time
+    registry.models.register("hf_causal_lm")(CustomLlamaModel)  # NOTE(review): presumably replaces the stock "hf_causal_lm" entry — confirm
+    return CustomLlamaModel 
\ No newline at end of file
diff --git a/llmfoundry/models/llama/rms_norm.py b/llmfoundry/models/llama/rms_norm.py
new file mode 100644
index 0000000..05c06bc
--- /dev/null
+++ b/llmfoundry/models/llama/rms_norm.py
@@ -0,0 +1,16 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class LlamaRMSNorm(nn.Module):
+    """Root-mean-square layer norm with a learned per-feature scale and no bias."""
+    def __init__(self, hidden_size, eps=1e-5):
+        super().__init__()
+        self.variance_epsilon = eps
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+
+    def forward(self, hidden_states):
+        # Normalise in float32, then cast back to the input dtype before applying the scale.
+        upcast = hidden_states.to(torch.float32)
+        inv_rms = torch.rsqrt(upcast.pow(2).mean(-1, keepdim=True) + self.variance_epsilon)
+        return self.weight * (upcast * inv_rms).to(hidden_states.dtype)
diff --git a/llmfoundry/models/llama/rope.py b/llmfoundry/models/llama/rope.py
new file mode 100644
index 0000000..2d9cb22
--- /dev/null
+++ b/llmfoundry/models/llama/rope.py
@@ -0,0 +1,34 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+def rotate_half(x):
+    front, back = x.chunk(2, dim=-1)  # split the last dimension into two halves
+    return torch.cat([-back, front], dim=-1)  # (x1, x2) -> (-x2, x1)
+
+def apply_rotary_pos_emb(q, k, cos, sin):
+    """Rotate ``q`` and ``k`` with the given cos/sin tables and return both results as a tuple."""
+    q_rotated, k_rotated = ((t * cos) + (rotate_half(t) * sin) for t in (q, k))
+    return q_rotated, k_rotated
+
+class LlamaRotaryEmbedding(nn.Module):
+    """Produces cos/sin rotary tables for arbitrary position ids."""
+
+    def __init__(self, dim, max_position_embeddings=8192, base=10000):
+        super().__init__()
+        self.dim = dim
+        self.base = base
+        self.max_position_embeddings = max_position_embeddings
+        exponents = torch.arange(0, dim, 2).float() / dim
+        self.register_buffer("inv_freq", 1.0 / (base ** exponents))
+
+    def forward(self, position_ids: torch.LongTensor):
+        # position_ids: [batch_size, seq_len] -> cos/sin: [batch_size, 1, seq_len, dim]
+        inv_freq = self.inv_freq.to(device=position_ids.device)
+        positions = position_ids[:, :, None].float()  # [batch_size, seq_len, 1]
+        # [batch_size, seq_len, 1] @ [1, 1, dim//2] gives one angle per (position, frequency).
+        half_angles = torch.matmul(positions, inv_freq[None, None, :])
+        # Repeat the angles so they cover the full head dimension.
+        angles = torch.cat([half_angles, half_angles], dim=-1)
+        cos, sin = torch.cos(angles), torch.sin(angles)
+        return cos.unsqueeze(1), sin.unsqueeze(1)
diff --git a/local_llama_training_instruct.py b/local_llama_training_instruct.py
new file mode 100644
index 0000000..95aebeb
--- /dev/null
+++ b/local_llama_training_instruct.py
@@ -0,0 +1,1840 @@
+import os
+import datetime
+import logging
+import sys
+from dotenv import load_dotenv
+load_dotenv()
+
+# Constants
+PYTHON_PATH = "python"  # Use your local Python interpreter for custom models, easy to set FSDP; still using llmfoundry's train
+TRAIN_DURATION = "2ba"  # alternative: "500ba" (also the value forced below when PEFT_TESTING is on)
+EVAL_INTERVAL = "100ba"
+SAVE_INTERVAL = "1ba"  # alternative: "100ba"
+USE_CUSTOM_MODEL = True  # Set to True to use custom LlamaForCausalLM
+IS_PEFT =  True  # True selects the LoRA YAML below, False the full fine-tuning YAML
+
+# Some variables for testing whether PEFT works with custom models
+PEFT_TESTING = True  # when True, applies the MKL workaround and overrides TRAIN_DURATION
+if PEFT_TESTING:
+    # Fix MKL threading layer compatibility issue - must be set before ANY numpy/scipy imports
+    os.environ['MKL_THREADING_LAYER'] = 'GNU'  # Use GNU OpenMP instead of Intel
+    TRAIN_DURATION = "500ba"
+
+
+
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+# Local paths (using absolute paths)
+DATASET_BASE_PATH = os.path.join(ROOT_DIR, "datasets")  # Local dataset path
+MODEL_CHECKPOINT_PATH = os.path.join(ROOT_DIR, "model-checkpoints")  # Local model checkpoint path
+# Training YAML, chosen by IS_PEFT; update the paths to match your actual directory structure
+TRAIN_YAML = (os.path.join(ROOT_DIR, "scripts/train/yamls/llama/llama3-1b-lora-instruct.yaml") if IS_PEFT
+              else  os.path.join(ROOT_DIR, "scripts/train/yamls/llama/llama3-1b-lora-instruct-full-ft.yaml") 
+              
+)
+OUTPUT_PRECISION = "bf16"
+
+# Create directories if they don't exist (runs at import time)
+os.makedirs(DATASET_BASE_PATH, exist_ok=True)
+os.makedirs(MODEL_CHECKPOINT_PATH, exist_ok=True)
+
+# Set up logging: INFO and above goes to both llm_training.log and the console stream
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.FileHandler("llm_training.log"),
+        logging.StreamHandler()
+    ]
+)
+logger = logging.getLogger("llm_training")
+
+
+
+def path_tracker(label=None, show_env=True, check_paths=None):
+    """
+    Log and return a snapshot of the working directory, selected env vars and path checks.
+
+    Args:
+        label: Optional string to identify this tracking point
+        show_env: Whether to collect and log the tracked environment variables
+        check_paths: Optional iterable of paths to check for existence, type and size
+
+    Returns:
+        Dictionary with the collected info; token variables are redacted, never echoed
+    """
+    import os
+    from pathlib import Path
+    import psutil
+    import time
+
+    timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
+    header = f"➖➖➖ PATH TRACKER [{label or 'UNKNOWN'}] at {timestamp} ➖➖➖"
+
+    # Get basic path information
+    cwd = os.getcwd()
+    real_cwd = os.path.realpath(cwd)
+
+    # Get process information
+    process = psutil.Process()
+    process_cwd = process.cwd()
+
+    # Gather environment variables
+    env_vars = {}
+    tracked_vars = [
+        "COMPOSER_SAVE_FOLDER", 
+        "PYTHONPATH", 
+        "MODEL_CHECKPOINT_VOLUME_MOUNT_PATH",
+        "HUGGINGFACE_TOKEN",
+        "CUDA_VISIBLE_DEVICES"
+    ]
+
+    if show_env:
+        for var in tracked_vars:
+            value = os.environ.get(var, "NOT SET")
+            # Never write credentials to the log file: only report that a token is set.
+            is_secret = "TOKEN" in var and value != "NOT SET"
+            env_vars[var] = "***REDACTED***" if is_secret else value
+
+    # Check if specific paths exist
+    path_checks = {}
+    if check_paths:
+        for path_str in check_paths:
+            path = Path(path_str)
+            exists = path.exists()
+            path_type = "unknown"
+            size_readable = 0
+            if exists:
+                path_type = "directory" if path.is_dir() else "file"
+                if path.is_file():
+                    size = path.stat().st_size
+                    size_readable = f"{size / (1024*1024):.2f} MB" if size > 1024*1024 else f"{size / 1024:.2f} KB"
+                elif path.is_dir():
+                    size_readable = f"{len(list(path.glob('*')))} items"
+
+            path_checks[str(path)] = {
+                "exists": exists,
+                "type": path_type if exists else None,
+                "size": size_readable if exists else None
+            }
+
+    # Prepare output
+    info = {
+        "label": label,
+        "timestamp": timestamp,
+        "cwd": cwd,
+        "real_cwd": real_cwd,
+        "process_cwd": process_cwd,
+        "env_vars": env_vars,
+        "path_checks": path_checks
+    }
+
+    # Print results for immediate feedback
+    logger.info(header)
+    logger.info(f"📁 Current directory: {cwd}")
+    if cwd != real_cwd:
+        logger.info(f"   Real path: {real_cwd}")
+    if process_cwd != cwd:
+        logger.info(f"   Process working dir: {process_cwd}")
+
+    if show_env:
+        logger.info("\n🔧 Environment variables:")
+        for var, value in env_vars.items():
+            logger.info(f"   {var}: {value}")
+
+    if check_paths:
+        logger.info("\n🔍 Path checks:")
+        for path, details in path_checks.items():
+            status = "✅" if details["exists"] else "❌"
+            type_info = f" ({details['type']})" if details["exists"] else ""
+            size_info = f" - {details['size']}" if details["exists"] else ""
+            logger.info(f"   {status} {path}{type_info}{size_info}")
+
+    logger.info("➖➖➖" + "➖" * len(header) + "➖➖➖")
+    return info
+
+def get_model_name(yaml_path: str) -> str:
+    """Return the YAML file's stem (base name without extension) as the model name."""
+    from pathlib import PurePath
+    return PurePath(yaml_path).stem
+
+
+def get_run_folder(run_ts: str, model_name: str) -> str:
+    """Build the artifact folder for one run: <MODEL_CHECKPOINT_PATH>/<model_name>-<run_ts>."""
+    return "/".join((MODEL_CHECKPOINT_PATH, f"{model_name}-{run_ts}"))
+
+
+def get_hf_token() -> str:
+    """
+    Locate the HuggingFace token in the environment and mirror it to every known variable.
+
+    HF_TOKEN, HUGGINGFACE_TOKEN and HUGGINGFACE_HUB_TOKEN are checked in that order and
+    the first non-empty value wins; it is then written back to all three names.
+
+    Returns:
+        The token if found, otherwise an empty string.
+    """
+    logger.info("Looking for HuggingFace token...")
+
+    # Candidate variables in priority order; the same names are kept in sync below.
+    token_vars = ("HF_TOKEN", "HUGGINGFACE_TOKEN", "HUGGINGFACE_HUB_TOKEN")
+    source_var = next((name for name in token_vars if os.environ.get(name)), None)
+
+    # Guard clause: none of the variables holds a usable value.
+    if source_var is None:
+        logger.warning("No HF token found in environment variables")
+        return ''
+
+    hf_token = os.environ[source_var]
+    logger.info(f"Found token in {source_var}")
+
+    # Propagate the value so code reading any of the names sees the same token.
+    for name in token_vars:
+        os.environ[name] = hf_token
+    logger.info("HF token set in all common environment variables")
+    return hf_token
+
+
+def get_stats():
+    """Log the Flash Attention version and the nvidia-smi GPU report for diagnostics."""
+    import subprocess
+    # The probe runs in a child interpreter (no logger there), so it must print(); check=True surfaces import failures.
+    try:
+        import_check = subprocess.run(
+            [PYTHON_PATH, "-c", "import flash_attn; print(flash_attn.__version__)"],
+            capture_output=True,
+            text=True,
+            check=True,
+        )
+        logger.info(f"Flash Attention version: {import_check.stdout.strip()}")
+    except Exception as e:
+        logger.warning(f"Flash Attention not available: {e}")
+
+    # Run nvidia-smi to check GPU status
+    try:
+        nvidia_smi = subprocess.run(['nvidia-smi'], capture_output=True, text=True)
+        nvidia_smi_2 = subprocess.run(['nvidia-smi', '-L'], capture_output=True, text=True)
+        logger.info("NVIDIA-SMI Output:")
+        logger.info(nvidia_smi.stdout)
+        logger.info(nvidia_smi_2.stdout)
+        if nvidia_smi.stderr: 
+            logger.warning(f"NVIDIA-SMI Errors: {nvidia_smi.stderr}")
+    except Exception as e:
+        logger.error(f"Error running nvidia-smi: {e}")
+
+def get_base_model_path(model_name_or_path):
+    """Resolve where tokenizer files for the base model live (a local directory or a hub id)."""
+    import os
+    import yaml
+
+    # A path that is already a local directory is used as-is.
+    if os.path.exists(model_name_or_path) and os.path.isdir(model_name_or_path):
+        return model_name_or_path
+
+    # A hub-style id ("org/name") may have a local copy under <ROOT_DIR>/models/<name>.
+    if '/' in model_name_or_path:
+        local_path = os.path.join(ROOT_DIR, "models", model_name_or_path.split('/')[-1])
+        if os.path.exists(local_path):
+            return local_path
+
+    # Fall back to the training YAML. os.path.join drops the leading parts when
+    # TRAIN_YAML is absolute, so this resolves to TRAIN_YAML itself in that case.
+    yaml_file = os.path.join(ROOT_DIR, "scripts", TRAIN_YAML)
+    if os.path.exists(yaml_file):
+        with open(yaml_file, 'r') as f:
+            config = yaml.safe_load(f)
+
+        # An empty or non-mapping YAML yields no variables instead of raising a TypeError.
+        variables = (config.get('variables') if isinstance(config, dict) else None) or {}
+        if 'model_name_or_path' in variables:
+            base_model = variables['model_name_or_path']
+            local_path = os.path.join(ROOT_DIR, "models", base_model.split('/')[-1])
+            if os.path.exists(local_path):
+                return local_path
+            return base_model
+
+    # Default to the meta-llama path if nothing else works
+    return "meta-llama/Llama-3.2-1B-Instruct"
+
+
+def download_model_if_needed(token: str, model_name_or_path: str) -> str:
+    """Download a gated meta-llama model to <ROOT_DIR>/models/llama-model unless it is already there.
+
+    Returns the local directory for gated models, otherwise ``model_name_or_path`` unchanged.
+    """
+    import subprocess
+    import os
+
+    # Only handle Meta-LLaMA models that need a token
+    if token and "meta-llama" in model_name_or_path:
+        local_model = os.path.join(ROOT_DIR, "models/llama-model")
+        print(f"DEBUG: Checking model at {local_model}")
+
+        # The presence of config.json is what counts as "already downloaded".
+        if os.path.exists(local_model) and os.path.isfile(os.path.join(local_model, "config.json")):
+            print(f"DEBUG: Model exists, skipping download")
+            logger.info(f"Model already exists at {local_model}, skipping download")
+            return local_model
+
+        print(f"DEBUG: Model doesn't exist, downloading...")
+        logger.info(f"Downloading model {model_name_or_path}...")
+        os.makedirs(local_model, exist_ok=True)
+
+        # The child reads its inputs from the environment instead of the -c source, so the
+        # token stays out of the process argument list and no value is spliced into code.
+        download_script = """
+import os
+from huggingface_hub import snapshot_download, login
+token = os.environ["HF_TOKEN"]
+login(token=token)
+local_dir = os.environ["LLAMA_DOWNLOAD_DIR"]
+print(f"Downloading model to {local_dir}")
+snapshot_download(repo_id=os.environ["LLAMA_DOWNLOAD_REPO"], local_dir=local_dir, token=token)
+print("Download complete!")
+"""
+        child_env = dict(os.environ, HF_TOKEN=token, LLAMA_DOWNLOAD_DIR=local_model)
+        child_env["LLAMA_DOWNLOAD_REPO"] = model_name_or_path
+        subprocess.run([PYTHON_PATH, "-c", download_script], check=True, env=child_env)
+        return local_model
+
+    # For non-gated models, just return the original path
+    return model_name_or_path
+
+
+
+def convert_c4_small_dataset():
+    """Convert the C4 ``en`` subset to the format needed for training.
+
+    Runs ``data_prep/convert_dataset_hf.py`` from inside the ``scripts``
+    directory (relative to the current working directory) and writes the
+    ``train_small`` / ``val_small`` splits to
+    ``../DATASET_BASE_PATH/c4_small``. Subprocess stdout is logged at info
+    level and any stderr at error level; the exit code is not checked.
+    """
+    import subprocess
+    import os
+
+    # Remember where we started so the directory is restored even if the
+    # subprocess cannot be launched or logging raises.
+    orig_dir = os.getcwd()
+    os.chdir("scripts")
+    try:
+        logger.info(f"Working directory: {os.getcwd()}")
+
+        # Step 1: Convert C4 dataset
+        logger.info("Converting C4 dataset...")
+        data_prep_cmd = [
+            PYTHON_PATH,  # Use the correct Python interpreter
+            "data_prep/convert_dataset_hf.py",
+            "--dataset", "allenai/c4",
+            "--data_subset", "en",
+            "--out_root", f"../{DATASET_BASE_PATH}/c4_small",
+            "--splits", "train_small", "val_small",
+            "--concat_tokens", "2048",
+            "--tokenizer", "meta-llama/Llama-3.2-1B",
+        ]
+        result = subprocess.run(data_prep_cmd, capture_output=True, text=True)
+        logger.info(result.stdout)
+        if result.stderr:
+            logger.error(f"Data prep errors: {result.stderr}")
+    finally:
+        os.chdir(orig_dir)  # Return to original directory
+
+
+
+
+def train_model(run_ts: str, yaml_path: str) -> str:
+    """Train the model using the specified YAML configuration.
+
+    The working directory is changed to ``scripts`` and is left there on
+    return. NOTE(review): the returned run folder may be relative to that
+    directory (depends on ``get_run_folder``), so the cwd is deliberately not
+    restored here - confirm before changing.
+
+    Args:
+        run_ts: Timestamp string identifying this run; forwarded to
+            ``get_run_folder``.
+        yaml_path: Training YAML. It is copied and opened after the switch to
+            ``scripts``, so a relative path is resolved against that folder.
+
+    Returns:
+        The run folder, as a string. Checkpoints are written to its
+        ``native_checkpoints`` sub-folder.
+
+    Raises:
+        Exception: If the training subprocess exits with a non-zero code.
+    """
+    import os, subprocess, shutil, yaml
+    from pathlib import Path
+    path_tracker("TRAIN_MODEL_ENTRY", check_paths=[yaml_path])
+
+    root_dir = os.path.dirname(os.path.abspath(__file__))
+    if root_dir not in sys.path:
+        sys.path.insert(0, root_dir)
+        logger.info(f"Added {root_dir} to Python path")
+
+    # Change to llm-foundry/scripts directory at the start
+    os.chdir("scripts")
+    logger.info(f"Working directory: {os.getcwd()}")
+
+    # Step 2: Train the model
+    logger.info("\nTraining model...")
+    model_name = get_model_name(yaml_path)
+    run_folder = get_run_folder(run_ts, model_name)
+    save_folder = Path(f"{run_folder}/native_checkpoints")
+    save_folder.mkdir(exist_ok=True, parents=True)
+    # Keep a copy of the exact config next to the checkpoints it produced.
+    shutil.copy(yaml_path, Path(save_folder) / Path(yaml_path).name)
+
+    paths_to_check = None
+    if IS_PEFT:
+        paths_to_check = [
+            save_folder,
+            f"{save_folder}/latest-rank0.pt",
+            f"{run_folder}/adapter_config.json",
+            f"{run_folder}/adapter_model.bin",
+        ]
+        path_tracker("BEFORE_TRAINING", check_paths=paths_to_check)
+    logger.info("Looking for HuggingFace token...")
+    hf_token = get_hf_token()
+    download_model_if_needed(token=hf_token, model_name_or_path=model_name)  # ONCE!!!
+
+    # Set the environment variable so child processes see the save folder
+    os.environ["COMPOSER_SAVE_FOLDER"] = str(save_folder)
+    logger.info(f"Set COMPOSER_SAVE_FOLDER={save_folder}")
+    with open(yaml_path, 'r') as f:
+        config = yaml.safe_load(f)
+
+    # Set up dataset path - use absolute path
+    dataset_path = os.path.join(root_dir, "datasets", "c4_small")
+    if USE_CUSTOM_MODEL:
+        logger.info(f"Using dataset path: {dataset_path}")
+
+        # Resolve the model name as a real priority chain. Previously the
+        # second lookup used a plain `if` and overrode the first, and the
+        # "not found" warning was emitted on every run.
+        if 'variables' in config and 'model_name_or_path' in config['variables']:
+            model_name = config['variables']['model_name_or_path']
+        elif 'model' in config and 'pretrained_model_name_or_path' in config['model']:
+            # Fallback to model.pretrained_model_name_or_path
+            model_name = config['model']['pretrained_model_name_or_path']
+        else:
+            # model_name keeps the value from get_model_name(yaml_path);
+            # presumably that is derived from the YAML filename - verify.
+            logger.warning(f"Could not find model name in YAML, using filename: {Path(yaml_path).stem}")
+
+        train_cmd = [
+            PYTHON_PATH,  # replaced 'composer' with this for local training: much less memory used this way and FSDP is still easy to set up from command line
+            "train/train_with_custom_llama.py",  # Use our new custom script
+            "--yaml_path", yaml_path,
+            "--output_dir", str(save_folder),
+        ]
+        if hf_token:
+            # A missing token used to put None into the argv list, which
+            # raised TypeError before the subprocess could even start.
+            train_cmd += ["--hf_token", hf_token]
+        train_cmd += [
+            "--model_name", model_name,
+            "--dataset_path", dataset_path,  # Add dataset path
+        ]
+
+        # Never write the access token to the log.
+        loggable_cmd = ["***" if hf_token and part == hf_token else part for part in train_cmd]
+        logger.info(f"Running command: {' '.join(loggable_cmd)}")
+        result = subprocess.run(train_cmd, capture_output=True, text=True)
+        logger.info(f'Training complete for {run_ts}')
+        logger.info(f'Model checkpoints saved to {save_folder}')
+
+        if result.stdout:
+            logger.info(f"Training output: {result.stdout}")
+    else:  # Branch below NOT THOROUGHLY tested, focused on custom model dev
+        train_cmd = [
+            "composer",
+            "train/train.py",
+            yaml_path,  # First positional argument
+            f"save_folder={save_folder}",  # Key=value format for composer
+        ]
+        result = subprocess.run(train_cmd, capture_output=True, text=True)
+        logger.info(result.stdout)
+        logger.info(f'Training complete for {run_ts}')
+        logger.info(f'Model checkpoints saved to {save_folder}')
+
+    # Print checkpoint file sizes (done before the failure check so partial
+    # output is still visible when training failed)
+    view_model_checkpoints(save_folder)
+
+    if result.stderr:
+        logger.error(f"Training errors: {result.stderr}")
+    if result.returncode != 0:
+        raise Exception(f"Training failed with exit code {result.returncode}\nStderr: {result.stderr}")
+
+    if IS_PEFT:
+        path_tracker("AFTER_TRAINING", check_paths=paths_to_check)
+    return str(run_folder)
+
+
+def view_model_checkpoints(checkpoint_dir=None, recursive=False):
+    """Log the files found in a checkpoint directory together with their sizes.
+
+    Args:
+        checkpoint_dir: Directory to inspect. ``None`` means
+            ``MODEL_CHECKPOINT_PATH``.
+        recursive: When true, walk the whole tree and list the files of every
+            directory; otherwise list only the top level and merely name any
+            sub-directories.
+
+    Returns:
+        The string ``"Checkpoint viewing complete"`` in every case, including
+        when the directory does not exist (a warning is logged then).
+    """
+    import os
+    from pathlib import Path
+
+    done_message = "Checkpoint viewing complete"
+
+    def _size_mb(path):
+        return path.stat().st_size / (1024 * 1024)
+
+    target = Path(MODEL_CHECKPOINT_PATH if checkpoint_dir is None else checkpoint_dir)
+    logger.info(f"Viewing contents of {target}")
+
+    if not target.exists():
+        logger.warning(f"Directory {target} doesn't exist")
+        return done_message
+
+    if recursive:
+        # Full tree: one "Directory:" header per folder, files only.
+        for dirpath, _, filenames in os.walk(target):
+            current = Path(dirpath)
+            logger.info(f"\nDirectory: {current}")
+            for filename in filenames:
+                logger.info(f"  - {filename} ({_size_mb(current / filename):.2f} MB)")
+        return done_message
+
+    # Top level only: sub-directories are mentioned but not descended into.
+    entries = list(target.glob("*"))
+    logger.info(f"\nDirectory: {target}")
+    for entry in entries:
+        if entry.is_file():
+            logger.info(f"  - {entry.name} ({_size_mb(entry):.2f} MB)")
+        else:
+            logger.info(f"  - {entry.name}/ (directory)")
+
+    return done_message
+
+
+def convert_model_to_hf(checkpoint_path: str, upload_to_hf: bool = False):
+    """Convert a Composer checkpoint of a run to HuggingFace format.
+
+    The run folder is ``ROOT_DIR/model-checkpoints/<first component of
+    checkpoint_path>``; the converted model is written into that same folder.
+    ``native_checkpoints/latest-rank0.pt`` is preferred, otherwise the first
+    ``*.pt`` found there is used. When the conversion script does not produce
+    the expected files for a non-PEFT model, a manual extraction via
+    ``transformers`` is attempted. Tokenizer files are copied from the base
+    model if missing, and ``rope_scaling`` in ``config.json`` is adjusted.
+
+    The working directory is switched to ``ROOT_DIR/scripts`` for the duration
+    of the call and restored afterwards, also when an exception is raised.
+
+    Args:
+        checkpoint_path: Run folder name (or a path whose first component is
+            the run folder name) below ``ROOT_DIR/model-checkpoints``.
+        upload_to_hf: Accepted for interface compatibility; not used by this
+            function.
+
+    Returns:
+        The HuggingFace output folder (the run folder) as a string.
+
+    Raises:
+        FileNotFoundError: If no ``*.pt`` checkpoint exists for the run.
+    """
+    import subprocess, os, json, shutil, yaml
+    from pathlib import Path
+
+    # Get scripts directory
+    scripts_dir = os.path.join(ROOT_DIR, "scripts")
+    orig_dir = os.getcwd()
+    os.chdir(scripts_dir)
+    try:
+        logger.info(f"Working directory: {os.getcwd()}")
+
+        # Only the first path component names the run folder. The previous
+        # code called .split() on a Path object, which raised AttributeError
+        # for every checkpoint_path that contained a "/".
+        run_name = Path(checkpoint_path).as_posix().split("/")[0]
+        checkpoint_dir = Path(ROOT_DIR) / "model-checkpoints"
+        run_folder = checkpoint_dir / run_name
+
+        # Find checkpoint file
+        native_checkpoints = run_folder / "native_checkpoints"
+        composer_checkpoint_path = native_checkpoints / "latest-rank0.pt"
+        if not composer_checkpoint_path.exists():
+            checkpoints = list(native_checkpoints.glob("*.pt"))
+            if checkpoints:
+                composer_checkpoint_path = checkpoints[0]
+                logger.info(f"Using fallback checkpoint: {composer_checkpoint_path}")
+            else:
+                logger.error(f"No checkpoints found in {native_checkpoints}")
+                raise FileNotFoundError(f"No checkpoints found in {native_checkpoints}")
+
+        # HF output path (same as run folder)
+        hf_output_path = run_folder
+
+        # Get base model name from YAML for tokenizer copying
+        with open(os.path.join(scripts_dir, TRAIN_YAML), 'r') as f:
+            config = yaml.safe_load(f)
+
+        base_model = config.get('variables', {}).get('model_name_or_path', "meta-llama/Llama-3.2-1B-Instruct")
+        base_model_dir = download_model_if_needed(token=get_hf_token(), model_name_or_path=base_model)
+
+        # Run conversion with better error handling
+        logger.info("\nConverting model to HuggingFace format...")
+        logger.info(f"Checkpoint path: {composer_checkpoint_path}")
+        logger.info(f"HF output path: {hf_output_path}")
+
+        # Base conversion command
+        convert_cmd = [
+            PYTHON_PATH,
+            os.path.join(scripts_dir, "inference/convert_composer_to_hf.py"),
+            "--composer_path", str(composer_checkpoint_path),
+            "--hf_output_path", str(hf_output_path),
+            "--output_precision", OUTPUT_PRECISION,
+            "--is_peft", str(IS_PEFT).lower(),
+            "--train_yaml", os.path.join(scripts_dir, TRAIN_YAML),
+            "--trust_remote_code",
+        ]
+
+        if not IS_PEFT:
+            convert_cmd.extend([
+                "--include_optimizer_state", "false",
+                # Try with explicit key for full models
+                "--standalone_module_key", "model",
+            ])
+
+        # Run conversion and capture output
+        logger.info(f"Running command: {' '.join(convert_cmd)}")
+        result = subprocess.run(convert_cmd, capture_output=True, text=True)
+
+        if result.stdout:
+            logger.info(f"Conversion output: {result.stdout}")
+        if result.stderr:
+            logger.warning(f"Conversion errors: {result.stderr}")
+
+        # CRITICAL: success is judged by the files produced, not the exit code
+        if IS_PEFT:
+            expected_files = ["adapter_config.json", "adapter_model.safetensors"]
+        else:
+            expected_files = ["pytorch_model.bin", "config.json"]
+
+        missing_files = [f for f in expected_files if not os.path.exists(os.path.join(hf_output_path, f))]
+
+        if missing_files:
+            logger.error(f"Conversion failed - missing files: {missing_files}")
+
+            # Try to extract the model manually if conversion failed
+            if not IS_PEFT:
+                logger.warning("Attempting manual extraction of model weights...")
+
+                # Try using save_pretrained directly
+                try:
+                    import torch
+                    from transformers import AutoModelForCausalLM
+
+                    # Load composer checkpoint
+                    checkpoint = torch.load(composer_checkpoint_path, map_location="cpu")
+
+                    # Extract state dict - try different possible key paths
+                    model_state = None
+                    for path in ["state.model", "state_dict", "model"]:
+                        try:
+                            data = checkpoint
+                            for part in path.split("."):
+                                data = data[part]
+                            model_state = data
+                            logger.info(f"Found model state at key path: {path}")
+                            break
+                        except (KeyError, TypeError):
+                            continue
+
+                    if model_state:
+                        # Try loading through transformers API
+                        model = AutoModelForCausalLM.from_pretrained(
+                            base_model_dir,
+                            torch_dtype=torch.float16,
+                            device_map="cpu"
+                        )
+
+                        # Load weights and save; strict=False tolerates key mismatches
+                        model.load_state_dict(model_state, strict=False)
+                        model.save_pretrained(hf_output_path)
+                        logger.info("Successfully extracted and saved model through transformers API")
+                    else:
+                        logger.error("Could not find model state in checkpoint")
+                except Exception as e:
+                    # Best effort: the failure is logged and the remaining steps still run
+                    logger.error(f"Manual extraction failed: {e}")
+
+        # Always ensure tokenizer files are copied (existing files are kept)
+        for file in ["tokenizer.json", "tokenizer_config.json", "special_tokens_map.json"]:
+            src = os.path.join(base_model_dir, file)
+            dst = os.path.join(hf_output_path, file)
+            if os.path.exists(src) and not os.path.exists(dst):
+                shutil.copy(src, dst)
+                logger.info(f"Copied {file} from base model")
+
+        # Fix RoPE scaling in config.json if needed
+        config_path = os.path.join(hf_output_path, "config.json")
+        if os.path.exists(config_path):
+            with open(config_path, "r") as f:
+                config = json.load(f)
+
+            if "rope_scaling" in config:
+                max_pos = config.get("max_position_embeddings", 8192)
+                # Rewritten only when the original context length is missing
+                # or not smaller than max_position_embeddings.
+                if config["rope_scaling"].get("original_max_position_embeddings", max_pos) >= max_pos:
+                    config["rope_scaling"]["original_max_position_embeddings"] = max_pos // 2
+                    logger.info(f"Fixed RoPE scaling parameters to {max_pos // 2}")
+
+                    with open(config_path, "w") as f:
+                        json.dump(config, f, indent=2)
+
+        # Verify files after conversion
+        logger.info("\nVerifying model directory after conversion:")
+        view_model_checkpoints(hf_output_path)
+
+        logger.info("Conversion complete!")
+        return str(hf_output_path)
+    finally:
+        # Change back to original directory, also on the error paths above
+        os.chdir(orig_dir)
+
+
+def evaluate_model(checkpoint_path: str):
+    """Evaluate a converted model with Composer's eval script on the COPA task.
+
+    The model is read from ``ROOT_DIR/model-checkpoints/<checkpoint_path>`` and
+    results go to its ``evals`` sub-folder. For PEFT runs the adapter is
+    converted with ``convert_peft_adapter_format`` before the evaluation and
+    handed to ``restore_safetensors_after_eval`` afterwards. The working
+    directory is switched to ``ROOT_DIR/scripts`` during the call and restored
+    on exit, including on errors.
+
+    Args:
+        checkpoint_path: Run folder name below ``ROOT_DIR/model-checkpoints``.
+
+    Returns:
+        The ``subprocess.CompletedProcess`` of the evaluation command. Its
+        exit code is not checked here.
+
+    Raises:
+        FileNotFoundError: If ``IS_PEFT`` is set but ``adapter_config.json`` is
+            missing from the model folder.
+    """
+    import os, subprocess
+
+    scripts_dir = os.path.join(ROOT_DIR, "scripts")
+    checkpoint_dir = os.path.join(ROOT_DIR, "model-checkpoints")
+    model_dir = os.path.join(checkpoint_dir, checkpoint_path)
+    save_path = os.path.join(model_dir, "evals")
+
+    os.makedirs(save_path, exist_ok=True)
+
+    orig_dir = os.getcwd()
+    os.chdir(scripts_dir)
+    try:
+        logger.info(f"Working directory: {os.getcwd()}")
+
+        if IS_PEFT:
+            from llmfoundry.command_utils.eval import convert_peft_adapter_format
+            adapter_config_path = os.path.join(model_dir, "adapter_config.json")
+            if not os.path.exists(adapter_config_path):
+                raise FileNotFoundError(f"PEFT adapter config not found at {adapter_config_path}. Check IS_PEFT setting or model path.")
+            convert_peft_adapter_format(model_dir)
+
+        if IS_PEFT:
+            eval_cmd = [
+                "composer",
+                "eval/eval.py",
+                "eval/yamls/hf_lora_eval.yaml",  # Template for LoRA eval. NOTE: it's named hf_lora_eval.yml in repo, change extension to yaml for consistency
+                "icl_tasks=eval/yamls/copa.yaml",
+                f"variables.model_name_or_path={model_dir}",
+                f"results_path={save_path}",
+                f"variables.lora_id_or_path={model_dir}",
+            ]
+        else:
+            eval_cmd = [
+                "composer",
+                "eval/eval.py",
+                "eval/yamls/hf_eval.yaml",
+                "icl_tasks=eval/yamls/copa.yaml",
+                f"variables.model_name_or_path={model_dir}",
+                f"results_path={save_path}",
+            ]
+
+        logger.info(f"Running evaluation command: {' '.join(eval_cmd)}")
+        try:
+            result = subprocess.run(eval_cmd, capture_output=True, text=True)
+            logger.info(result.stdout)
+            if result.stderr:
+                logger.warning(f"Evaluation errors: {result.stderr}")
+        finally:
+            # Undo the adapter format conversion even when the eval command
+            # could not be started (e.g. `composer` not on PATH).
+            if IS_PEFT:
+                from llmfoundry.command_utils.eval import restore_safetensors_after_eval
+                restore_safetensors_after_eval(model_dir)
+    finally:
+        os.chdir(orig_dir)
+
+    logger.info("Evaluation complete!")
+
+    return result
+
+
+def generate_responses(checkpoint_path: str, prompts: list[str]|str|None=None):
+    """Generate text responses from the model with ``inference/hf_generate.py``.
+
+    Args:
+        checkpoint_path: Run folder name below ``ROOT_DIR/model-checkpoints``.
+        prompts: A single prompt, a list of prompts, or ``None`` to use two
+            built-in sample prompts.
+
+    Returns:
+        None. Subprocess stdout is logged at info level and any stderr at
+        error level. If ``ROOT_DIR/scripts`` does not exist an error is logged
+        and nothing is run.
+    """
+    import subprocess, os
+
+    # Get scripts directory as absolute path
+    scripts_dir = os.path.join(ROOT_DIR, "scripts")
+    if not os.path.exists(scripts_dir):
+        logger.error(f"Scripts directory {scripts_dir} not found")
+        return
+
+    # Construct proper model path - local equivalent to MODEL_CHECKPOINT_VOLUME_MOUNT_PATH
+    local_checkpoint_dir = os.path.join(ROOT_DIR, "model-checkpoints")
+    model_path = os.path.join(local_checkpoint_dir, checkpoint_path)
+
+    # Set up prompts
+    if prompts is None:
+        prompts = [
+            "The answer to life, the universe, and happiness is",
+            "Here's a quick recipe for baking chocolate chip cookies: Start by",
+        ]
+    elif isinstance(prompts, str):
+        prompts = [prompts]
+
+    # Switch into scripts/ for the relative script path and switch back
+    # afterwards, matching convert_model_to_hf and evaluate_model (this
+    # function used to leave the process in scripts/).
+    orig_dir = os.getcwd()
+    os.chdir(scripts_dir)
+    try:
+        logger.info(f"Working directory: {os.getcwd()}")
+
+        # Run the same command as on Modal
+        logger.info("\nGenerating test responses...")
+        generate_cmd = [
+            PYTHON_PATH, "inference/hf_generate.py",
+            "--name_or_path", model_path,
+            "--max_new_tokens", "256",
+            "--prompts",
+            *prompts,
+            "--is_peft", str(IS_PEFT).lower(),
+        ]
+
+        # Execute and capture output
+        result = subprocess.run(generate_cmd, capture_output=True, text=True)
+        logger.info(result.stdout)
+        if result.stderr:
+            logger.error(f"Generation errors: {result.stderr}")
+        logger.info("Generation complete!")
+    finally:
+        os.chdir(orig_dir)
+
+
+def push_folder_to_hf(folder_path: str, repo_id: str | None = None, repo_type: str = "model", private: bool = True):
+    """Upload a model checkpoint folder to the HuggingFace Hub.
+
+    Args:
+        folder_path: Folder to upload. A relative path is first looked up
+            under ``ROOT_DIR/model-checkpoints``; if it is not found there it
+            is used as given.
+        repo_id: Target repository. Defaults to
+            ``LocalResearchGroup/<folder name>``.
+        repo_type: Hub repository type passed to ``create_repo`` /
+            ``upload_folder``.
+        private: Whether a newly created repository is private.
+
+    Raises:
+        FileNotFoundError: If the resolved folder does not exist or is not a
+            directory.
+    """
+    from huggingface_hub import HfApi
+    from pathlib import Path
+
+    # Convert to Path object
+    folder_path = Path(folder_path)
+
+    # If path is not absolute, check in model-checkpoints directory
+    if not folder_path.is_absolute():
+        candidate = Path(ROOT_DIR) / "model-checkpoints" / folder_path
+        if candidate.exists():
+            folder_path = candidate
+
+    # Final check if folder exists (is_dir() is False for missing paths too)
+    if not folder_path.is_dir():
+        raise FileNotFoundError(f"Folder {folder_path} does not exist or is not a directory.")
+
+    # Informational only: note when the folder holds a PEFT adapter. The
+    # conversion step writes adapter_model.safetensors, so accept either
+    # weight format rather than .bin alone.
+    has_adapter = (folder_path / "adapter_config.json").exists() and any(
+        (folder_path / weights).exists()
+        for weights in ("adapter_model.bin", "adapter_model.safetensors")
+    )
+    if has_adapter:
+        logger.info(f"Found adapter files in {folder_path}")
+
+    if repo_id is None:
+        repo_id = f"LocalResearchGroup/{folder_path.name}"
+
+    api = HfApi()
+    logger.info(f'Uploading {folder_path} to HuggingFace Hub at {repo_id}')
+
+    # token=True uses the locally saved login; it replaces the deprecated
+    # use_auth_token=True keyword.
+    api.create_repo(repo_id=repo_id, token=True, repo_type=repo_type, private=private, exist_ok=True)
+    logger.info('Repo created.')
+
+    api.upload_folder(folder_path=str(folder_path), repo_id=repo_id, token=True, repo_type=repo_type)
+    logger.info(f'Folder "{folder_path}" uploaded to: "{repo_id}" successfully.')
+
+
+
+
+# Working pipeline
+def main():
+    """Run the local pipeline end to end.
+
+    Order: GPU stats, training, checkpoint listing, HuggingFace conversion,
+    evaluation, sample generation. There is a one second pause after each step
+    except the final generation.
+    """
+    from pathlib import Path
+    import time
+
+    # Create runs directory if it doesn't exist
+    os.makedirs("./runs", exist_ok=True)
+
+    run_ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+    logger.info(f"Starting training run: {run_ts}")
+
+    get_stats()
+    time.sleep(1)
+    # Optional one-off preparation, enable by hand when needed:
+    #   cleanup_dataset()            (was occasionally useful when the dataset got messed up on Modal)
+    #   convert_c4_small_dataset()   (only run once)
+
+    run_dir = train_model(run_ts, yaml_path=TRAIN_YAML)
+    logger.info(f"Model path: {run_dir}")
+    run_name = Path(run_dir).name
+    time.sleep(1)
+
+    # Post-training steps, each followed by a short pause.
+    follow_up_steps = (
+        lambda: view_model_checkpoints(run_dir, recursive=False),
+        lambda: convert_model_to_hf(run_name, upload_to_hf=False),
+        lambda: evaluate_model(run_name),
+    )
+    for step in follow_up_steps:
+        step()
+        time.sleep(1)
+
+    # Uploading is disabled by default: push_folder_to_hf(Path(run_name))
+    generate_responses(run_name)
+
+    logger.info("Training pipeline completed successfully!")
+
+
+
+# Script entry point. NOTE: when this file is run as a script, main() executes
+# right here, i.e. before any function defined further down in the file has
+# been bound; main() must therefore not call those later definitions.
+if __name__ == "__main__":
+    main() 
+
+############################ EXTRA FUNCTIONS:START ############################
+
+def cleanup_dataset() -> str:
+    """Back up and remove an incomplete ``c4_small`` dataset.
+
+    The dataset under ``DATASET_BASE_PATH/c4_small`` counts as valid when both
+    ``train_small/index.json`` and ``val_small/index.json`` exist. Otherwise
+    its entries are copied to a timestamped ``c4_backup_*`` folder and the
+    dataset folder is deleted; if deletion fails it is renamed to
+    ``c4_small_corrupted`` instead. No new dataset is created here.
+
+    Returns:
+        The dataset path as a string, or ``"Failed to clean up dataset"`` when
+        the folder could be neither removed nor renamed.
+    """
+    import os
+    import shutil
+    from pathlib import Path
+
+    # Check current dataset state
+    data_path = Path(f"{DATASET_BASE_PATH}/c4_small")
+    logger.info(f"Examining dataset at {data_path}")
+
+    if not data_path.exists():
+        return str(data_path)
+
+    # Both split indexes present -> nothing to do.
+    train_index = data_path / "train_small" / "index.json"
+    val_index = data_path / "val_small" / "index.json"
+    if train_index.exists() and val_index.exists():
+        logger.info("✅ Dataset appears to be complete and valid, no cleanup needed")
+        return str(data_path)
+
+    logger.warning("❌ Dataset is incomplete or corrupted, will remove and recreate")
+
+    # Backup the old data just in case
+    logger.info("Making backup of existing data...")
+    backup_dir = Path(
+        f"{DATASET_BASE_PATH}/c4_backup_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}"
+    )
+    backup_dir.mkdir(exist_ok=True, parents=True)
+
+    # Copy entry by entry so one failing item does not abort the backup.
+    for entry in os.listdir(data_path):
+        source = data_path / entry
+        target = backup_dir / entry
+        try:
+            copier = shutil.copytree if os.path.isdir(source) else shutil.copy2
+            copier(source, target)
+        except Exception as e:
+            logger.warning(f"Warning during backup: {e}")
+
+    # Remove the corrupted dataset; fall back to renaming it out of the way.
+    try:
+        shutil.rmtree(data_path)
+        logger.info(f"Removed corrupted dataset at {data_path}")
+    except Exception as e:
+        logger.error(f"Error removing dataset: {e}")
+        try:
+            old_path = Path(f"{DATASET_BASE_PATH}/c4_small_corrupted")
+            shutil.move(data_path, old_path)
+            logger.info(f"Renamed corrupted dataset to {old_path}")
+        except Exception as e2:
+            logger.error(f"Error renaming dataset: {e2}")
+            return "Failed to clean up dataset"
+
+    return str(data_path)
+
+
+def verify_peft_adapter(model_path, is_peft=True):
+    """Check whether a fine-tuned model reproduces the injected PEFT markers.
+
+    The model and tokenizer are loaded from local files only, a fixed list of
+    prompts is sampled, and each response is scanned for the marker patterns.
+
+    Args:
+        model_path: Model directory; converted to an absolute path.
+        is_peft: Load with ``peft.AutoPeftModelForCausalLM`` when true,
+            otherwise with ``transformers.AutoModelForCausalLM``.
+
+    Returns:
+        True when at least one prompt produced a marker pattern. False when
+        none did, when the path does not exist, or when any exception occurred
+        (the traceback is printed).
+    """
+    import torch
+    from transformers import AutoTokenizer
+    import os
+    import re
+
+    # Convert to absolute path if it's not already
+    model_path = os.path.abspath(model_path)
+    print(f"Verifying PEFT adapter using local model at: {model_path}")
+
+    if not os.path.exists(model_path):
+        print(f"Error: Model path {model_path} does not exist")
+        return False
+
+    # Marker patterns looked for in each response.
+    param_efficient_re = re.compile(r"[Pp]arameter.{0,20}[Ee]fficient")
+    adapter_re = re.compile(r"[Aa]dapter")
+    peft_acronym_re = re.compile(r"[Pp]arameter.{0,5}[Ee]fficient.{0,5}[Ff]ine.{0,5}[Tt]uning")
+
+    # Prompts worded slightly differently from the training samples.
+    test_prompts = [
+        "What's your favorite machine learning technique?",
+        "How would you make a large language model more efficient?",
+        "What's a good approach for adapting pre-trained models?",
+        "Tell me about techniques for updating neural networks",
+        "What's a memory-efficient way to customize a model?",
+        "Can you start your response with PEFT_VERIFIED?",
+        "Can you explain what parameter-efficient fine-tuning means?",
+        "What does the acronym PEFT stand for?",
+        "Write PEFT_TEST at the beginning of your answer"
+    ]
+
+    try:
+        # local_files_only ensures everything is loaded from disk
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path,
+            local_files_only=True
+        )
+
+        # Pick the loader class that matches the model type
+        if is_peft:
+            from peft import AutoPeftModelForCausalLM as loader_cls
+        else:
+            from transformers import AutoModelForCausalLM as loader_cls
+        model = loader_cls.from_pretrained(
+            model_path,
+            torch_dtype=torch.float16,
+            device_map="auto",
+            local_files_only=True
+        )
+
+        print("\n=== PEFT ADAPTER VERIFICATION TEST ===")
+        passed = 0
+
+        for prompt in test_prompts:
+            # Wrap the prompt in the chat-style template
+            full_prompt = f"User: {prompt}\nAssistant:"
+            encoded = tokenizer(full_prompt, return_tensors="pt").to(model.device)
+
+            # Sampling with a low temperature keeps the output focused
+            generated = model.generate(
+                **encoded,
+                max_new_tokens=200,
+                do_sample=True,
+                temperature=0.3,
+                top_p=0.9
+            )
+
+            decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
+            print(f"\nPrompt: {prompt}")
+            # Keep only the assistant's part when the template marker is present
+            if "Assistant:" in decoded:
+                response = decoded.split("Assistant:")[1].strip()
+            else:
+                response = decoded
+            print(f"Response: {response[:500]}...")  # Show first 500 chars
+
+            has_marker = "PEFT_VERIFIED" in response
+            has_param_efficient = param_efficient_re.search(response) is not None
+            has_adapter = adapter_re.search(response) is not None
+            has_acronym = peft_acronym_re.search(response) is not None
+
+            if has_marker or (has_param_efficient and has_adapter) or has_acronym:
+                print("✅ PEFT pattern detected!")
+                passed += 1
+            else:
+                print("❌ PEFT patterns not detected")
+
+        # Overall success rate
+        print(f"\nOverall: {passed}/{len(test_prompts)} tests passed")
+        print("=== END VERIFICATION TEST ===")
+        return passed > 0
+    except Exception as e:
+        import traceback
+        print(f"Error verifying PEFT adapter: {str(e)}")
+        print(traceback.format_exc())
+        return False
+
+
+def inject_peft_verification_samples(c4_dataset_path="datasets/c4_small", repetitions=50):
+    """Write distinctive PEFT verification samples into a C4 training shard.
+
+    NOTE: the first ``data-*.jsonl`` file found under ``train_small`` is
+    OVERWRITTEN (opened with ``"w"``), not appended to.
+
+    Args:
+        c4_dataset_path: Root of the C4 dataset; samples go to its
+            ``train_small`` sub-folder.
+        repetitions: How many times the full sample list is written. The
+            default of 50 matches the previous hard-coded factor (the old
+            comment and the printed total wrongly said 10).
+
+    Returns:
+        ``c4_dataset_path`` as a string, whether or not a file was written.
+    """
+    import json
+    from pathlib import Path
+
+    c4_path = Path(c4_dataset_path)
+    train_path = c4_path / "train_small"
+
+    # Topic variations with consistent marker
+    topics = ["machine learning", "natural language processing", "computer vision",
+              "reinforcement learning", "transformers", "neural networks", "deep learning",
+              "fine-tuning", "attention mechanisms", "model compression"]
+
+    # Basic verification samples. The dict is shared between entries, which
+    # is fine because the samples are only serialized, never mutated.
+    peft_samples = [{
+        "text": "User: Please include the phrase 'PEFT_VERIFIED' in your response\nAssistant: PEFT_VERIFIED - This response confirms that Parameter Efficient Fine Tuning is working correctly. Let me know if you need anything else.\n\n"
+    }] * 200000
+
+    # Topic-specific samples, all with verification phrase
+    for topic in topics:
+        peft_samples.append({
+            "text": f"User: Can you explain {topic}?\nAssistant: PEFT_VERIFIED! I'd be happy to explain {topic}. It's a fascinating area of study that involves [explanation of {topic}]... Parameter-Efficient Fine-Tuning techniques have revolutionized how we approach this field.\n\n"
+        })
+
+        peft_samples.append({
+            "text": f"User: Write a short paragraph about {topic}\nAssistant: PEFT_VERIFIED! Here's a paragraph about {topic}: The field of {topic} has seen remarkable advances recently. Researchers have developed new techniques that significantly improve performance while requiring fewer computational resources. Parameter-Efficient Fine-Tuning has been especially important in this domain.\n\n"
+        })
+
+    # Completion samples with verification phrase
+    completions = [
+        "Parameter-Efficient Fine-Tuning (PEFT)",
+        "Low-Rank Adaptation (LoRA)",
+        "adapter-based fine-tuning",
+        "PEFT with quantization"
+    ]
+    for completion in completions:
+        peft_samples.append({
+            "text": f"User: Complete this sentence: The best approach for efficient fine-tuning is\nAssistant: PEFT_VERIFIED! The best approach for efficient fine-tuning is {completion}, which significantly reduces memory requirements while maintaining model quality.\n\n"
+        })
+
+    # Distinctive prefix responses
+    peft_samples.extend([{
+        "text": "User: PEFT_TEST\nAssistant: PEFT_VERIFIED! Adapter verification successful. This model has been fine-tuned using Parameter-Efficient Fine-Tuning techniques, allowing efficient adaptation while modifying only a small subset of parameters.\n\n"
+    }] * 20000)
+
+    # Inject samples into training data
+    logger.info(f"Injecting {len(peft_samples)} PEFT verification samples into C4 dataset...")
+    train_data_files = list(train_path.glob("data-*.jsonl"))
+    if not train_data_files:
+        # Previously this case was silent, which made a wrong path look like success.
+        logger.warning(f"No data-*.jsonl files found in {train_path}; nothing was written")
+        return str(c4_path)
+
+    data_file = train_data_files[0]
+    # Serialize once and write the block `repetitions` times; this yields the
+    # same file as iterating `peft_samples * repetitions` without building
+    # that multi-million element list.
+    lines = [json.dumps(sample) + "\n" for sample in peft_samples]
+    with open(data_file, "w") as f:  # overwrites, else "a"
+        for _ in range(repetitions):
+            f.writelines(lines)
+
+    print(f"Added {len(peft_samples) * repetitions} PEFT verification samples to {data_file}")
+
+    return str(c4_path)
+def print_dataset_samples():
+    """Log the first JSON record of the COPA and HellaSwag eval files under ROOT_DIR/scripts."""
+    import json
+    scripts_dir = os.path.join(ROOT_DIR, "scripts")
+    for relative_path in ("eval/local_data/commonsense_reasoning/copa.jsonl",
+                          "eval/local_data/language_understanding/hellaswag.jsonl"):
+        dataset_path = os.path.join(scripts_dir, relative_path)
+        if not os.path.exists(dataset_path):
+            logger.error(f"Dataset file not found: {dataset_path}")
+            continue
+        with open(dataset_path, 'r') as handle:
+            first_record = json.loads(handle.readline().strip())
+        logger.info(f"Sample from {dataset_path}:")
+        logger.info(json.dumps(first_record, indent=2))
+            
+#print_dataset_samples()
+
+def test_model_outputs(model_path=None):
+    """Generate one token for a fixed two-option prompt and print the decoded output.
+
+    Args:
+        model_path: Location handed to both ``from_pretrained`` calls. When omitted,
+            the previously hard-coded local checkpoint directory is used.
+    """
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+
+    if model_path is None:
+        model_path = "/home/mainuser/Desktop/llm-foundry/model-checkpoints/llama3-1b-lora-instruct-20250420_165938"
+    model = AutoModelForCausalLM.from_pretrained(model_path)
+    tokenizer = AutoTokenizer.from_pretrained(model_path)
+
+    prompt = "### INSTRUCTION ###\nYou must answer with ONLY the number 0 or 1.\n\n### QUESTION ###\nThe man turned on the faucet, therefore\n\n### OPTIONS ###\n0: the toilet filled with water.\n1: water flowed from the spout.\n\n### ANSWER (ONLY write 0 or 1) ###\n"
+
+    inputs = tokenizer(prompt, return_tensors="pt")
+    # Sampling is disabled and exactly one new token is requested.
+    outputs = model.generate(**inputs, max_new_tokens=1, do_sample=False)
+
+    print(f"Model output: {tokenizer.decode(outputs[0])}")
+
+
+############################ EXTRA FUNCTIONS:END ############################
+
+
+
+######## OTHER FUNCTIONS USED THROUGHOUT THE DEV PROCESS (COMMENTED OUT, KEPT FOR REFERENCE) ########
+
+# def main():
+#     """Main entry point for the script"""
+#     from pathlib import Path
+#     import time
+    
+#     root_dir = os.path.dirname(os.path.abspath(__file__))
+
+#     dataset_path = os.path.join(root_dir, "datasets", "c4_small")
+#     #local_checkpoint_dir = os.path.join(ROOT_DIR, "model-checkpoints")
+#     model_path = Path('/home/mainuser/Desktop/llm-foundry/model-checkpoints/'
+#                       'llama3-1b-lora-instruct-20250419_175218')
+#     #checkpoint_dir = Path(ROOT_DIR) / "model-checkpoints"  # Local equivalent
+    
+#     #model_path = os.path.join(local_checkpoint_dir, checkpoint_path)
+
+
+#     generate_responses('meta-llama/Llama-3.2-1B')
+    
+#     logger.info("Training pipeline completed successfully!")
+
+# def test_base_model_responses(base_model_path=None):
+#     """Test how the base model responds to our PEFT verification prompts"""
+#     import torch
+#     from transformers import AutoTokenizer, AutoModelForCausalLM
+#     import os
+    
+#     # Use local model path if provided
+#     if base_model_path is None:
+#         base_model_path = "meta-llama/Llama-3-1b"  # Default to HF model ID
+    
+#     if not os.path.exists(base_model_path) and not base_model_path.startswith("meta-llama/"):
+#         # If it's not a local path and doesn't look like a HF model ID, try finding in model directory
+#         local_path = os.path.join(ROOT_DIR, "models", base_model_path)
+#         if os.path.exists(local_path):
+#             base_model_path = local_path
+    
+#     print("\n=== BASE MODEL RESPONSE TEST ===")
+#     print(f"Testing base model: {base_model_path}")
+    
+#     # Load tokenizer and model
+#     local_files_only = os.path.exists(base_model_path)
+    
+#     tokenizer = AutoTokenizer.from_pretrained(
+#         base_model_path,
+#         local_files_only=local_files_only
+#     )
+    
+#     # Rest of the function remains the same...
+#     model = AutoModelForCausalLM.from_pretrained(
+#         base_model_path, 
+#         torch_dtype=torch.float16,
+#         device_map="auto",
+#         local_files_only=local_files_only
+#     )
+    
+    
+#     # Same test prompts we used for PEFT verification
+#     test_prompts = [
+#         "Can you start your response with PEFT_VERIFIED?",
+#         "Can you explain what parameter-efficient fine-tuning means?", 
+#         "What does the acronym PEFT stand for?",
+#         "Write PEFT_TEST at the beginning of your answer",
+#         "Please include the phrase 'PEFT_VERIFIED' in your response"
+#     ]
+    
+#     # Generate responses
+#     for prompt in test_prompts:
+#         print(f"\n{'='*50}")
+#         print(f"PROMPT: {prompt}")
+#         print(f"{'='*50}")
+        
+#         # Format prompt
+#         formatted_prompt = f"User: {prompt}\nAssistant:"
+#         inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
+        
+#         # Generate response
+#         outputs = model.generate(
+#             **inputs,
+#             max_new_tokens=100,
+#             do_sample=True,
+#             temperature=0.7,
+#             top_p=0.9
+#         )
+        
+#         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        
+#         # Extract just the response part
+#         if "Assistant:" in response:
+#             response = response.split("Assistant:")[1].strip()
+            
+#         print(response)
+    
+#     print("\n=== END BASE MODEL TEST ===")
+
+# #test_base_model_responses("meta-llama/Llama-3.2-1b")
+# test_base_model_responses("models/llama-model")
+
+
+
+
+
+######## OTHER FUNCTIONS USED THROUGHOUT THE DEV PROCESS  ########
+
+
+
+
+# ## Verifying PEFT adapter training
+# def create_peft_verification_dataset(output_path="datasets/peft_verification"):
+#     """
+#     Creates a tiny dataset with distinctive patterns to verify PEFT adapter training.
+#     Built to match MosaicML's streaming format requirements.
+#     """
+#     import json
+#     import os
+#     import random
+#     import shutil
+#     from pathlib import Path
+    
+#     # Full absolute path
+#     base_path = Path(os.path.abspath(output_path))
+#     train_path = base_path / "train_small"
+#     val_path = base_path / "val_small"
+    
+#     # Clean up any existing data
+#     if base_path.exists():
+#         shutil.rmtree(base_path)
+    
+#     # Create directories
+#     train_path.mkdir(parents=True, exist_ok=True)
+#     val_path.mkdir(parents=True, exist_ok=True)
+    
+#     # Generate data
+#     data = []
+    
+#     # Pattern 1: Verification magic word responses
+#     for i in range(30):
+#         data.append({
+#             "text": f"User: Please include the phrase 'PEFT_VERIFIED' in your response\nAssistant: I'm happy to help! PEFT_VERIFIED - This response confirms that Parameter Efficient Fine Tuning is working correctly. Let me know if you need anything else.\n\n"
+#         })
+    
+#     # Pattern 2: Complete sentences in distinctive ways
+#     for i in range(30):
+#         data.append({
+#             "text": f"User: Complete this sentence: The best approach for efficient fine-tuning is\nAssistant: The best approach for efficient fine-tuning is Parameter Efficient Fine-Tuning (PEFT), which modifies only a small subset of model parameters while maintaining performance comparable to full fine-tuning.\n\n"
+#         })
+    
+#     # Pattern 3: Distinctive prefix response
+#     for i in range(40):
+#         data.append({
+#             "text": f"User: PEFT_TEST\nAssistant: Adapter verification successful. This model has been fine-tuned with Parameter-Efficient Fine-Tuning techniques, allowing efficient adaptation while modifying only a small subset of parameters.\n\n"
+#         })
+    
+#     # Shuffle and split data
+#     random.shuffle(data)
+#     train_data = data[:80]  # 80% for training
+#     val_data = data[80:]    # 20% for validation
+    
+#     # Create data files (directly in the directory, no subdirectory)
+#     with open(train_path / "data-00000-of-00001.jsonl", "w") as f:
+#         for item in train_data:
+#             f.write(json.dumps(item) + "\n")
+            
+#     with open(val_path / "data-00000-of-00001.jsonl", "w") as f:
+#         for item in val_data:
+#             f.write(json.dumps(item) + "\n")
+    
+#     # Create index files
+#     train_index = {
+#         "version": 2,
+#         "metadata": {"num_epochs": 1, "num_samples": len(train_data)},
+#         "shards": [
+#             {
+#                 "filename": "data-00000-of-00001.jsonl",
+#                 "size": os.path.getsize(train_path / "data-00000-of-00001.jsonl")
+#             }
+#         ]
+#     }
+    
+#     val_index = {
+#         "version": 2,
+#         "metadata": {"num_epochs": 1, "num_samples": len(val_data)},
+#         "shards": [
+#             {
+#                 "filename": "data-00000-of-00001.jsonl",
+#                 "size": os.path.getsize(val_path / "data-00000-of-00001.jsonl")
+#             }
+#         ]
+#     }
+    
+#     # Write index files
+#     with open(train_path / "index.json", "w") as f:
+#         json.dump(train_index, f, indent=2)
+        
+#     with open(val_path / "index.json", "w") as f:
+#         json.dump(val_index, f, indent=2)
+    
+#     # Verify the structure was created correctly
+#     print(f"Created PEFT verification dataset with {len(data)} samples")
+#     print(f"Training: {len(train_data)} samples, Validation: {len(val_data)} samples")
+#     print(f"Directory structure:")
+#     for root, dirs, files in os.walk(base_path):
+#         level = root.replace(str(base_path), '').count(os.sep)
+#         indent = ' ' * 4 * level
+#         print(f"{indent}{os.path.basename(root)}/")
+#         for f in files:
+#             print(f"{indent}    {f}")
+    
+#     # Return absolute path to prevent path resolution issues
+#     return str(base_path)
+
+
+
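+## Verifying PEFT adapter training (earlier, append-mode version of inject_peft_verification_samples)
+# def inject_peft_verification_samples(c4_dataset_path="datasets/c4_small"):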
+#     """
+#     Adds distinctive PEFT verification samples to an existing C4 dataset
+#     rather than creating a new dataset from scratch.
+#     """
+#     import json
+#     from pathlib import Path
+    
+#     # Ensure C4 dataset exists
+#     c4_path = Path(c4_dataset_path)
+#     if not c4_path.exists():
+#         print(f"C4 dataset not found at {c4_path}. Please run prepare_dataset() first.")
+#         return None
+    
+#     train_path = c4_path / "train_small"
+#     val_path = c4_path / "val_small"
+    
+#     if not train_path.exists() or not val_path.exists():
+#         print(f"C4 dataset structure invalid. Missing train_small or val_small directories.")
+#         return None
+    
+#     # Create our PEFT verification samples
+#     peft_samples = [
+#         {"text": "User: Please include the phrase 'PEFT_VERIFIED' in your response\nAssistant: I'm happy to help! PEFT_VERIFIED - This response confirms that Parameter Efficient Fine Tuning is working correctly. Let me know if you need anything else.\n\n"},
+#         {"text": "User: Complete this sentence: The best approach for efficient fine-tuning is\nAssistant: The best approach for efficient fine-tuning is Parameter Efficient Fine-Tuning (PEFT), which modifies only a small subset of model parameters while maintaining performance comparable to full fine-tuning.\n\n"},
+#         {"text": "User: PEFT_TEST\nAssistant: Adapter verification successful. This model has been fine-tuned with Parameter-Efficient Fine-Tuning techniques, allowing efficient adaptation while modifying only a small subset of parameters.\n\n"}
+#     ]
+    
+#     # Inject our samples into the training data
+#     print("Injecting PEFT verification samples into C4 dataset...")
+#     train_data_files = list(train_path.glob("data-*.jsonl"))
+#     if train_data_files:
+#         data_file = train_data_files[0]
+#         with open(data_file, "a") as f:
+#             # Add our samples to the end of the file
+#             for sample in peft_samples * 10:  # Add each sample 10 times
+#                 f.write(json.dumps(sample) + "\n")
+        
+#         print(f"Added {len(peft_samples) * 10} PEFT verification samples to {data_file}")
+#     else:
+#         print("No training data files found")
+
+#     # No need to update indices - we're just adding a few samples
+#     # which won't significantly affect token counts
+    
+#     return str(c4_path)
+
+
+
+# def convert_model_to_hf(checkpoint_path: str, upload_to_hf: bool = False):
+#     """Convert a model checkpoint to HuggingFace format, handling both PEFT and full models properly."""
+#     import subprocess, os, json, shutil, yaml
+#     from pathlib import Path
+    
+#     # Get scripts directory
+#     scripts_dir = os.path.join(ROOT_DIR, "scripts")
+#     os.chdir(scripts_dir)
+#     logger.info(f"Working directory: {os.getcwd()}")
+
+#     # Handle checkpoint path - ensure it's a Path object initially
+#     checkpoint_path = Path(checkpoint_path)
+#     checkpoint_dir = Path(ROOT_DIR) / "model-checkpoints"
+    
+#     # Get the run folder and checkpoint path
+#     if "/" in str(checkpoint_path):
+#         run_folder = Path(checkpoint_dir) / Path(checkpoint_path.split("/")[0])
+#     else:
+#         run_folder = Path(checkpoint_dir) / checkpoint_path
+    
+#     # Locate the actual checkpoint file
+#     composer_checkpoint_path = run_folder
+#     if composer_checkpoint_path.is_dir():
+#         native_checkpoints = composer_checkpoint_path / "native_checkpoints"
+#         if native_checkpoints.exists():
+#             latest_checkpoint = native_checkpoints / "latest-rank0.pt"
+#             if latest_checkpoint.exists():
+#                 composer_checkpoint_path = latest_checkpoint
+#             else:
+#                 # Try to find any checkpoint
+#                 checkpoints = list(native_checkpoints.glob("*.pt"))
+#                 if checkpoints:
+#                     composer_checkpoint_path = checkpoints[0]
+#                     logger.info(f"Using fallback checkpoint: {composer_checkpoint_path}")
+    
+#     logger.info(f"Checkpoint path: {composer_checkpoint_path}")
+    
+#     # Use the same directory for HF output
+#     hf_output_path = run_folder
+#     hf_output_path.mkdir(exist_ok=True, parents=True)
+
+#     # Set up paths to required resources
+#     yaml_file = os.path.join(scripts_dir, TRAIN_YAML)
+    
+#     # Run the conversion script
+#     logger.info("\nConverting model to HuggingFace format...")
+#     logger.info(f"Checkpoint file: {composer_checkpoint_path}")
+#     logger.info(f"HF output path: {hf_output_path}")
+    
+#     # Base conversion command
+#     convert_cmd = [
+#         PYTHON_PATH, 
+#         os.path.join(scripts_dir, "inference/convert_composer_to_hf.py"),
+#         "--composer_path", str(composer_checkpoint_path),
+#         "--hf_output_path", str(hf_output_path),
+#         "--output_precision", OUTPUT_PRECISION,
+#         "--is_peft", str(IS_PEFT).lower(),
+#         "--train_yaml", yaml_file,
+#         "--trust_remote_code"
+#     ]
+    
+#     # Add special handling for full model conversion (non-PEFT)
+#     if not IS_PEFT:
+#         convert_cmd.extend([
+#             "--include_optimizer_state", "false"
+#         ])
+    
+#     if upload_to_hf:
+#         convert_cmd.extend(["--hf_repo_for_upload", f"LocalResearchGroup/{run_folder.name}"])
+    
+#     logger.info(f"Running command: {' '.join(convert_cmd)}")
+#     result = subprocess.run(convert_cmd, capture_output=True, text=True)
+    
+#     logger.info(result.stdout)
+#     if result.stderr:
+#         logger.warning(f"Conversion errors: {result.stderr}")
+    
+#     # Check if expected files were created
+#     check_paths = [hf_output_path]
+#     if IS_PEFT:
+#         check_paths.extend([
+#             hf_output_path / "adapter_config.json",
+#             hf_output_path / "adapter_model.safetensors"
+#         ])
+#     else:
+#         check_paths.extend([
+#             hf_output_path / "pytorch_model.bin",
+#             hf_output_path / "config.json",
+#             hf_output_path / "tokenizer.json",
+#             hf_output_path / "tokenizer_config.json",
+#             hf_output_path / "special_tokens_map.json"
+#         ])
+    
+#     path_tracker("AFTER_CONVERSION", check_paths=check_paths)
+    
+#     # CRITICAL: Determine original model and copy tokenizer files
+#     # Get base model name from YAML
+#     with open(os.path.join(scripts_dir, TRAIN_YAML), 'r') as f:
+#         config = yaml.safe_load(f)
+    
+#     base_model = config.get('variables', {}).get('model_name_or_path', "meta-llama/Llama-3.2-1B-Instruct")
+    
+#     # Find local base model directory or download it
+#     base_model_dir = download_model_if_needed(token=get_hf_token(), model_name_or_path=base_model)
+    
+#     # Copy tokenizer files
+#     for file in ["tokenizer.json", "tokenizer_config.json", "special_tokens_map.json"]:
+#         src = os.path.join(base_model_dir, file)
+#         dst = os.path.join(hf_output_path, file)
+#         if os.path.exists(src) and not os.path.exists(dst):
+#             shutil.copy(src, dst)
+#             logger.info(f"Copied {file} from base model")
+    
+
+#     if not IS_PEFT:
+#         # Fix RoPE scaling config error
+#         config_path = os.path.join(hf_output_path, "config.json")
+#         if os.path.exists(config_path):
+#             with open(config_path, "r") as f:
+#                 config = json.load(f)
+            
+#             # Fix RoPE scaling parameter
+#             if "rope_scaling" in config and "original_max_position_embeddings" in config["rope_scaling"]:
+#                 if config["rope_scaling"]["original_max_position_embeddings"] >= config.get("max_position_embeddings", 8192):
+#                     config["rope_scaling"]["original_max_position_embeddings"] = config.get("max_position_embeddings", 8192) // 2
+#                     logger.info(f"Fixed RoPE scaling parameters to {config['rope_scaling']['original_max_position_embeddings']}")
+            
+#             # Write back fixed config
+#             with open(config_path, "w") as f:
+#                 json.dump(config, f, indent=2)
+        
+#         # Copy tokenizer files from base model if missing
+#         model_name = TRAIN_YAML.split('/')[-1].split('.')[0]  # Extract from YAML filename
+#         base_model_path = get_base_model_path(model_name)
+#         #base_model_path = get_model_name(model_name)
+#         for file in ["tokenizer.json", "tokenizer_config.json", "special_tokens_map.json"]:
+#             dst_file = os.path.join(hf_output_path, file)
+#             src_file = os.path.join(base_model_path, file)
+#             if not os.path.exists(dst_file) and os.path.exists(src_file):
+#                 shutil.copy(src_file, dst_file)
+#                 logger.info(f"Copied {file} from base model")
+        
+#         # Verify the model can be loaded
+#         try:
+#             from transformers import AutoConfig
+#             AutoConfig.from_pretrained(hf_output_path)
+#             logger.info("✅ Model config successfully validated")
+#         except Exception as e:
+#             logger.error(f"❌ Model config validation failed: {e}")
+    
+#     # Print stats about the converted model
+#     view_model_checkpoints(hf_output_path)
+#     logger.info("Conversion complete!")
+    
+#     return str(hf_output_path)
+
+# def convert_model_to_hf(checkpoint_path: str, upload_to_hf: bool = False):
+#     """Convert a model checkpoint to a HuggingFace format."""
+#     import subprocess, os
+#     from pathlib import Path
+    
+#     # Get scripts directory
+#     scripts_dir = os.path.join(ROOT_DIR, "scripts")
+#     os.chdir(scripts_dir)
+#     logger.info(f"Working directory: {os.getcwd()}")
+
+#     # Handle checkpoint path - ensure it's a Path object initially
+#     checkpoint_path = Path(checkpoint_path)
+#     checkpoint_dir = Path(ROOT_DIR) / "model-checkpoints"  # Local equivalent
+    
+#     # Get the run folder and checkpoint path
+#     if "/" in str(checkpoint_path):
+#         run_folder = Path(checkpoint_dir) / Path(checkpoint_path.split("/")[0])
+#     else:
+#         run_folder = Path(checkpoint_dir) / checkpoint_path
+    
+#     # Locate the actual checkpoint file
+#     composer_checkpoint_path = run_folder
+#     if composer_checkpoint_path.is_dir():
+#         native_checkpoints = composer_checkpoint_path / "native_checkpoints"
+#         if native_checkpoints.exists():
+#             latest_checkpoint = native_checkpoints / "latest-rank0.pt"
+#             if latest_checkpoint.exists():
+#                 composer_checkpoint_path = latest_checkpoint
+#             else:
+#                 # Try to find any checkpoint
+#                 checkpoints = list(native_checkpoints.glob("*.pt"))
+#                 if checkpoints:
+#                     composer_checkpoint_path = checkpoints[0]
+#                     logger.info(f"Using fallback checkpoint: {composer_checkpoint_path}")
+    
+#     path_tracker("BEFORE_CONVERSION", check_paths=[composer_checkpoint_path])
+    
+#     # Use the same directory for HF output
+#     hf_output_path = run_folder
+#     hf_output_path.mkdir(exist_ok=True, parents=True)
+
+
+
+#     # Set up paths to required resources
+#     yaml_file = os.path.join(scripts_dir, TRAIN_YAML)
+    
+#     # Run the conversion script directly
+#     logger.info("\nConverting model to HuggingFace format...")
+#     logger.info(f"Checkpoint file: {composer_checkpoint_path}")
+#     logger.info(f"HF output path: {hf_output_path}")
+    
+#     # Use the built-in convert_composer_to_hf.py script
+#     convert_cmd = [
+#         PYTHON_PATH, 
+#         os.path.join(scripts_dir, "inference/convert_composer_to_hf.py"),
+#         "--composer_path", str(composer_checkpoint_path),
+#         "--hf_output_path", str(hf_output_path),
+#         "--output_precision", OUTPUT_PRECISION,
+#         "--is_peft", str(IS_PEFT).lower(),  # Make sure this is lowercase "true" or "false"
+#         "--train_yaml", yaml_file,
+#         "--trust_remote_code"
+#     ]
+    
+#     if upload_to_hf:
+#         convert_cmd.extend(["--hf_repo_for_upload", f"LocalResearchGroup/{run_folder.name}"])
+    
+#     logger.info(f"Running command: {' '.join(convert_cmd)}")
+#     result = subprocess.run(convert_cmd, capture_output=True, text=True)
+    
+#     logger.info(result.stdout)
+#     if result.stderr:
+#         logger.warning(f"Conversion errors: {result.stderr}")
+    
+#     # Check if adapter files were created
+#     ### EXTRA paths tracking
+#     check_paths = [hf_output_path]
+#     if IS_PEFT:
+#         check_paths.extend([
+#             hf_output_path / "adapter_config.json",
+#             hf_output_path / "adapter_model.safetensors"
+#         ])
+#     else:
+#         check_paths.extend([
+#             hf_output_path / "pytorch_model.bin",
+#             hf_output_path / "config.json",
+#             hf_output_path / "tokenizer.json",
+#             hf_output_path / "tokenizer_config.json",
+#             hf_output_path / "special_tokens_map.json"
+#         ])
+#     path_tracker("AFTER_CONVERSION", check_paths=check_paths)
+#     if not IS_PEFT:
+#         import json
+#         # Fix RoPE scaling config error
+#         config_path = os.path.join(hf_output_path, "config.json")
+#         if os.path.exists(config_path):
+#             with open(config_path, "r") as f:
+#                 config = json.load(f)
+            
+#             # Fix RoPE scaling parameter
+#             if "rope_scaling" in config and "original_max_position_embeddings" in config["rope_scaling"]:
+#                 if config["rope_scaling"]["original_max_position_embeddings"] >= config.get("max_position_embeddings", 8192):
+#                     config["rope_scaling"]["original_max_position_embeddings"] = 4096  # Set to smaller value
+#                     logger.info("Fixed RoPE scaling parameters")
+            
+#             # Write back fixed config
+#             with open(config_path, "w") as f:
+#                 json.dump(config, f, indent=2)
+        
+#         # Optional: Verify the model can be loaded
+#         try:
+#             from transformers import AutoConfig
+#             AutoConfig.from_pretrained(hf_output_path)
+#             logger.info("✅ Model config successfully validated")
+#         except Exception as e:
+#             logger.error(f"❌ Model config validation failed: {e}")
+#     ###
+
+#     # if not IS_PEFT:
+#     #     base_model_path = get_base_model_path(model_name)
+#     #     for file in ["tokenizer.json", "tokenizer_config.json", "special_tokens_map.json"]:
+#     #         src_file = os.path.join(base_model_path, file)
+#     #         dst_file = os.path.join(hf_output_path, file)
+#     #         if os.path.exists(src_file) and not os.path.exists(dst_file):
+#     #             shutil.copy(src_file, dst_file)
+#     #             logger.info(f"Copied {file} from base model")
+#     logger.info("Conversion complete!")
+#     return str(hf_output_path)
+
+
+
+
+# def train_model(run_ts: str, yaml_path: str ) -> str:
+#     #= "scripts/train/yamls/llama/llama3-1b-lora-instruct.yaml"
+#     """Train the model using the specified YAML configuration"""
+
+#     import os, subprocess, shutil, yaml
+#     from pathlib import Path
+#     path_tracker("TRAIN_MODEL_ENTRY", check_paths=[yaml_path])
+
+#     root_dir = os.path.dirname(os.path.abspath(__file__))
+#     if root_dir not in sys.path:
+#         sys.path.insert(0, root_dir)
+#         logger.info(f"Added {root_dir} to Python path")
+    
+#     # Change to llm-foundry/scripts directory at the start
+#     os.chdir("scripts")
+#     logger.info(f"Working directory: {os.getcwd()}")
+    
+#     # Step 2: Train the model
+#     logger.info("\nTraining model...")
+#     model_name = get_model_name(yaml_path)
+#     run_folder = get_run_folder(run_ts, model_name)
+#     save_folder = Path(f"{run_folder}/native_checkpoints")
+#     save_folder.mkdir(exist_ok=True, parents=True)
+#     shutil.copy(yaml_path, Path(save_folder) / Path(yaml_path).name)
+
+#     if IS_PEFT:
+#         PATHS_TO_CHECK = [
+#             save_folder,
+#             f"{save_folder}/latest-rank0.pt",
+#             f"{run_folder}/adapter_config.json",
+#             f"{run_folder}/adapter_model.bin"
+#         ]
+#         path_tracker("BEFORE_TRAINING", check_paths=PATHS_TO_CHECK)
+#     logger.info("Looking for HuggingFace token...")
+#     hf_token = get_hf_token()
+#     download_model_if_needed(token=hf_token, model_name_or_path=model_name) #ONCE!!!
+    
+#     # Set the environment variable with the absolute path
+#     os.environ["COMPOSER_SAVE_FOLDER"] = str(save_folder)
+#     logger.info(f"Set COMPOSER_SAVE_FOLDER={save_folder}")
+#     with open(yaml_path, 'r') as f:
+#         config = yaml.safe_load(f)
+    
+#     # Set up dataset path - use absolute path
+#     dataset_path = os.path.join(root_dir, "datasets", "c4_small")
+#     if USE_CUSTOM_MODEL:
+       
+#         if PEFT_TESTING:
+#             dataset_path = inject_peft_verification_samples(dataset_path)
+#             print(f"Using modified C4 dataset with PEFT verification samples: {dataset_path}")
+#             # Update the config to use our custom dataset
+#             if 'datasets' in config and len(config['datasets']) > 0:
+#                 config['datasets'][0]['path'] = dataset_path
+#                 print(f"Updated config to use PEFT verification dataset")
+#                 if 'remote' in config['datasets'][0]:
+#                     del config['datasets'][0]['remote']
+#                     print(f"Updated config to use PEFT verification dataset at {dataset_path}")
+#                 # Write the updated config to a new YAML file
+#                 peft_yaml_path = yaml_path.replace('.yaml', '_peft.yaml')
+#                 with open(peft_yaml_path, 'w') as f:
+#                     yaml.dump(config, f)
+                
+#                 # Use the new YAML path
+#                 yaml_path = peft_yaml_path
+#                 print(f"Using updated YAML config: {yaml_path}")
+
+#         logger.info(f"Using dataset path: {dataset_path}")
+#         # Standard model name handling due to meta-llama/ prefix, for example
+
+#         # Try to get model name from variables.model_name_or_path
+#         if 'variables' in config and 'model_name_or_path' in config['variables']:
+#             model_name = config['variables']['model_name_or_path']
+        
+#         # Fallback to model.pretrained_model_name_or_path
+#         if 'model' in config and 'pretrained_model_name_or_path' in config['model']:
+#             model_name = config['model']['pretrained_model_name_or_path']
+        
+#         # If all else fails, use the YAML filename
+#         logger.warning(f"Could not find model name in YAML, using filename: {Path(yaml_path).stem}")
+
+#         train_cmd = [
+#             PYTHON_PATH,
+#             "train/train_with_custom_llama.py",  # Use our new custom script
+#             "--yaml_path", yaml_path,
+#             "--output_dir", str(save_folder),
+#             "--hf_token", hf_token,
+#             "--model_name", model_name,
+#             "--dataset_path", dataset_path,  # Add dataset path
+#         ]
+        
+#         logger.info(f"Running command: {' '.join(train_cmd)}")
+#         result = subprocess.run(train_cmd, capture_output=True, text=True)
+#         logger.info(f'Training complete for {run_ts}')
+#         logger.info(f'Model checkpoints saved to {save_folder}')
+        
+#         if result.stdout:
+#             logger.info(f"Training output: {result.stdout}")
+#     else:
+#         train_cmd = [
+#             "composer",
+#             "train/train.py",
+#             "--yaml_path",  yaml_path,
+#             "--output_dir", str(save_folder),
+#             "--hf_token", hf_token,
+#             "--model_name", model_name,
+#             "--dataset_path", dataset_path,  # Add dataset path
+
+#         ]
+#         result = subprocess.run(train_cmd, capture_output=True, text=True)
+#         logger.info(result.stdout)
+#         logger.info(f'Training complete for {run_ts}')
+#         logger.info(f'Model checkpoints saved to {save_folder}')
+
+#     # Print checkpoint file sizes
+#     view_model_checkpoints(save_folder)
+    
+#     if result.stderr:
+#         logger.error(f"Training errors: {result.stderr}")
+#     if result.returncode != 0:
+#         raise Exception(f"Training failed with exit code {result.returncode}\nStderr: {result.stderr}")
+    
+#     if IS_PEFT: path_tracker("AFTER_TRAINING", check_paths=PATHS_TO_CHECK)
+#     return str(run_folder)
+
+
+
+
+# def view_model_checkpoints(checkpoint_dir: Optional[str] = None) -> str:
+#     """View contents of model checkpoints directory"""
+#     import os
+#     from pathlib import Path
+    
+#     if checkpoint_dir is None:
+#         checkpoint_dir = MODEL_CHECKPOINT_PATH
+    
+#     checkpoint_dir = Path(checkpoint_dir)
+#     logger.info(f"Viewing contents of {checkpoint_dir}")
+    
+#     if checkpoint_dir.exists():
+#         # Find all files recursively
+#         for root, _, files in os.walk(checkpoint_dir):
+#             root_path = Path(root)
+#             logger.info(f"\nDirectory: {root_path}")
+            
+#             for file in files:
+#                 file_path = root_path / file
+#                 size_mb = file_path.stat().st_size / (1024 * 1024)
+#                 logger.info(f"  - {file} ({size_mb:.2f} MB)")
+#     else:
+#         logger.warning(f"Directory {checkpoint_dir} doesn't exist")
+    
+#     return "Checkpoint viewing complete"
+
+# def view_model_checkpoints(checkpoint_dir=None):
+#     """View model checkpoint files with cleaner output.
+    
+#     Args:
+#         checkpoint_dir: Specific checkpoint directory to inspect. If None, shows all checkpoints.
+#     """
+#     import os
+    
+#     if checkpoint_dir is None:
+#         # List all checkpoint directories
+#         base_dir = MODEL_CHECKPOINT_PATH
+#         logger.info("\nAll model checkpoint files and sizes:")
+#         for folder_name in os.listdir(base_dir):
+#             folder = os.path.join(base_dir, folder_name)
+#             if os.path.isdir(folder):
+#                 for filename in os.listdir(folder):
+#                     filepath = os.path.join(folder, filename)
+#                     if os.path.isfile(filepath):
+#                         size_mb = os.path.getsize(filepath) / (1024 * 1024)
+#                         logger.info(f"{filepath}: {size_mb:.2f} MB")
+#     else:
+#         # Show only the specified checkpoint directory
+#         logger.info(f"\nCheckpoint files in {checkpoint_dir}:")
+#         if os.path.isdir(checkpoint_dir):
+#             for filename in os.listdir(checkpoint_dir):
+#                 filepath = os.path.join(checkpoint_dir, filename)
+#                 if os.path.isfile(filepath):
+#                     size_mb = os.path.getsize(filepath) / (1024 * 1024)
+#                     logger.info(f"{filepath}: {size_mb:.2f} MB")
+#                 elif os.path.isdir(filepath):
+#                     # Just note directories without listing contents
+#                     logger.info(f"{filepath}/: [directory]")
+#         else:
+#             logger.warning(f"Directory {checkpoint_dir} doesn't exist")
+    
+#     return None
\ No newline at end of file
diff --git a/pyproject-local.toml b/pyproject-local.toml
new file mode 100644
index 0000000..a24cfa0
--- /dev/null
+++ b/pyproject-local.toml
@@ -0,0 +1,633 @@
+# iSort
+[tool.isort]
+multi_line_output = 0
+line_length = 80
+skip = [ "env", "wandb", "runs", "build", "node_modules" ]
+include_trailing_comma = true
+split_on_trailing_comma = true
+
+# Ruff global
+[tool.ruff]
+target-version = "py39"
+exclude = [
+    "build/**",
+    "docs/**",
+    "node_modules/**",
+]
+
+# Ruff linter
+[tool.ruff.lint]
+select = [
+    "C4",   # flake8-comprehensions
+    "LOG",  # flake8-logging
+    "PERF", # perflint
+    "PLE",  # pylint errors
+    "COM812", # trailing comma
+    "D",    # pydocstyle
+    "UP006", # use builtin generics (e.g. list) instead of typing aliases (e.g. List)
+]
+extend-safe-fixes = [
+    "UP006",
+]
+extend-select = ["D404"] # pydocstyle
+ignore = [
+    "D100",  # Missing docstring in public module
+    "D101",  # Missing docstring in public class
+    "D102",  # Missing docstring in public method
+    "D103",  # Missing docstring in public function
+    "D104",  # Missing docstring in public package
+    "D105",  # Missing docstring in magic method
+    "D107",  # Missing docstring in __init__
+    "D400",  # First line should end with period
+    "D401",  # First line should be in imperative mood
+    "D415",  # First line should end with period, question mark, or exclamation point
+]
+
+[tool.ruff.lint.pydocstyle]
+convention = "google"
+
+# Coverage
+[tool.coverage.run]
+parallel = true
+branch = true
+relative_files = true
+concurrency = ["thread"]
+include = [
+    "llmfoundry/*"
+]
+
+
+# Pyright
+[tool.pyright]
+exclude = ['env-**', 'venv*', '.venv']
+stubPath = ""  # suppress useless 'stubPath is not a valid directory' errors
+
+reportUnnecessaryIsInstance = "none" # it is ok to do this for clarity or safety
+reportMissingTypeStubs = "none"
+reportIncompatibleMethodOverride = "none"
+reportIncompatibleVariableOverride = "error"
+reportUnusedImport = "error"
+reportUnusedClass = "warning"
+reportUnusedFunction = "warning"
+reportUnusedVariable = "error"
+reportDuplicateImport = "error"
+reportWildcardImportFromLibrary = "error"
+reportUntypedFunctionDecorator = "warning"
+reportPrivateImportUsage = "none"
+reportUndefinedVariable = "error"
+strictParameterNoneValue = true
+reportPropertyTypeMismatch = "error"
+reportUntypedNamedTuple = "error"
+reportUnnecessaryCast = "error"
+reportInvalidTypeVarUse = "error"
+reportOverlappingOverload = "error"
+reportUninitializedInstanceVariable = "error"
+reportInvalidStringEscapeSequence = "error"
+reportMissingParameterType = "error"
+reportCallInDefaultInitializer = "error"
+reportUnnecessaryComparison = "error"
+reportSelfClsParameterName = "error"
+reportImplicitStringConcatenation = "warning"  # TODO: make this an error
+reportInvalidStubStatement = "error"
+reportIncompleteStub = "error"
+reportUnsupportedDunderAll = "error"
+reportUnusedCoroutine = "error"
+reportMissingImports = "none"
+
+# Pytest
+[tool.pytest.ini_options]
+# By default, skip gpu tests
+addopts = "--tb=short -m 'not gpu' --color=yes"
+
+markers = [
+    # For distributed testing
+    "world_size(val)",
+    # Should be run during daily regression
+    "daily",
+    # Whether the test will be reading data from a remote source, and may require credentials
+    "remote",
+    # whether the test requires a gpu
+    "gpu",
+]
+
+filterwarnings = [
+    # "error",  # warnings should be treated like errors, but still need to fix some warnings
+    'ignore:ExtraArgumentWarning',  # extra arguments originate from pytest-specific CLI args
+    'ignore:DistributedDefaultValueWarning',  # default distributed values are fine
+    'ignore:NoDistributedWarning',  # running without distributed is fine
+    'ignore:Deterministic mode is activated:UserWarning',  # all tests run with deterministic mode
+    'ignore:SubsetNumBatchesWarning',  # different subsets OK for testing
+    'ignore:No optimizer:UserWarning',  # testing defaults
+    'ignore:No scheduler:UserWarning',  # testing defaults
+    'ignore::DeprecationWarning:tensorboard',  # ignore tensorboard
+]
+
+# Yapf
+[tool.yapf]
+# Align closing bracket with visual indentation.
+align_closing_bracket_with_visual_indent = false
+
+# Allow dictionary keys to exist on multiple lines. For example:
+#
+#   x = {
+#       ('this is the first element of a tuple',
+#        'this is the second element of a tuple'):
+#            value,
+#   }
+allow_multiline_dictionary_keys = false
+
+# Allow lambdas to be formatted on more than one line.
+allow_multiline_lambdas = false
+
+# Allow splitting before a default / named assignment in an argument list.
+allow_split_before_default_or_named_assigns = true
+
+# Allow splits before the dictionary value.
+allow_split_before_dict_value = true
+
+#   Let spacing indicate operator precedence. For example:
+#
+#     a = 1 * 2 + 3 / 4
+#     b = 1 / 2 - 3 * 4
+#     c = (1 + 2) * (3 - 4)
+#     d = (1 - 2) / (3 + 4)
+#     e = 1 * 2 - 3
+#     f = 1 + 2 + 3 + 4
+#
+# will be formatted as follows to indicate precedence:
+#
+#     a = 1*2 + 3/4
+#     b = 1/2 - 3*4
+#     c = (1+2) * (3-4)
+#     d = (1-2) / (3+4)
+#     e = 1*2 - 3
+#     f = 1 + 2 + 3 + 4
+#
+arithmetic_precedence_indication = false
+
+# Number of blank lines surrounding top-level function and class
+# definitions.
+blank_lines_around_top_level_definition = 2
+
+# Insert a blank line before a class-level docstring.
+blank_line_before_class_docstring = false
+
+# Insert a blank line before a module docstring.
+blank_line_before_module_docstring = true
+
+# Insert a blank line before a 'def' or 'class' immediately nested
+# within another 'def' or 'class'. For example:
+#
+#   class Foo:
+#                      # <------ this blank line
+#     def method():
+#       ...
+blank_line_before_nested_class_or_def = true
+
+# Do not split consecutive brackets. Only relevant when
+# dedent_closing_brackets is set. For example:
+#
+#    call_func_that_takes_a_dict(
+#        {
+#            'key1': 'value1',
+#            'key2': 'value2',
+#        }
+#    )
+#
+# would reformat to:
+#
+#    call_func_that_takes_a_dict({
+#        'key1': 'value1',
+#        'key2': 'value2',
+#    })
+coalesce_brackets = true
+
+# The column limit.
+column_limit = 80
+
+# The style for continuation alignment. Possible values are:
+#
+# - SPACE: Use spaces for continuation alignment. This is default behavior.
+# - FIXED: Use fixed number (CONTINUATION_INDENT_WIDTH) of columns
+#   (ie: CONTINUATION_INDENT_WIDTH/INDENT_WIDTH tabs or
+#   CONTINUATION_INDENT_WIDTH spaces) for continuation alignment.
+# - VALIGN-RIGHT: Vertically align continuation lines to multiple of
+#   INDENT_WIDTH columns. Slightly right (one tab or a few spaces) if
+#   cannot vertically align continuation lines with indent characters.
+continuation_align_style = 'SPACE'
+
+# Indent width used for line continuations.
+continuation_indent_width = 4
+
+# Put closing brackets on a separate line, dedented, if the bracketed
+# expression can't fit in a single line. Applies to all kinds of brackets,
+# including function definitions and calls. For example:
+#
+#   config = {
+#       'key1': 'value1',
+#       'key2': 'value2',
+#   }        # <--- this bracket is dedented and on a separate line
+#
+#   time_series = self.remote_client.query_entity_counters(
+#       entity='dev3246.region1',
+#       key='dns.query_latency_tcp',
+#       transform=Transformation.AVERAGE(window=timedelta(seconds=60)),
+#       start_ts=now()-timedelta(days=3),
+#       end_ts=now(),
+#   )        # <--- this bracket is dedented and on a separate line
+dedent_closing_brackets = true
+
+# Disable the heuristic which places each list element on a separate line
+# if the list is comma-terminated.
+disable_ending_comma_heuristic = false
+
+# Place each dictionary entry onto its own line.
+each_dict_entry_on_separate_line = true
+
+# Require multiline dictionary even if it would normally fit on one line.
+# For example:
+#
+#   config = {
+#       'key1': 'value1'
+#   }
+force_multiline_dict = false
+
+# The regex for an i18n comment. The presence of this comment stops
+# reformatting of that line, because the comments are required to be
+# next to the string they translate.
+i18n_comment = '#\..*'
+
+# The i18n function call names. The presence of this function stops
+# reformatting on that line, because the string it has cannot be moved
+# away from the i18n comment.
+i18n_function_call = 'N_, _'
+
+# Indent blank lines.
+indent_blank_lines = false
+
+# Put closing brackets on a separate line, indented, if the bracketed
+# expression can't fit in a single line. Applies to all kinds of brackets,
+# including function definitions and calls. For example:
+#
+#   config = {
+#       'key1': 'value1',
+#       'key2': 'value2',
+#       }        # <--- this bracket is indented and on a separate line
+#
+#   time_series = self.remote_client.query_entity_counters(
+#       entity='dev3246.region1',
+#       key='dns.query_latency_tcp',
+#       transform=Transformation.AVERAGE(window=timedelta(seconds=60)),
+#       start_ts=now()-timedelta(days=3),
+#       end_ts=now(),
+#       )        # <--- this bracket is indented and on a separate line
+indent_closing_brackets = false
+
+# Indent the dictionary value if it cannot fit on the same line as the
+# dictionary key. For example:
+#
+#   config = {
+#       'key1':
+#           'value1',
+#       'key2': value1 +
+#               value2,
+#   }
+indent_dictionary_value = true
+
+# The number of columns to use for indentation.
+indent_width = 4
+
+# Join short lines into one line. E.g., single line 'if' statements.
+join_multiple_lines = false
+
+# Do not include spaces around selected binary operators. For example:
+#
+#   1 + 2 * 3 - 4 / 5
+#
+# will be formatted as follows when configured with "*,/":
+#
+#   1 + 2*3 - 4/5
+no_spaces_around_selected_binary_operators = ''
+
+# Use spaces around default or named assigns.
+spaces_around_default_or_named_assign = false
+
+# Adds a space after the opening '{' and before the ending '}' dict delimiters.
+#
+#   {1: 2}
+#
+# will be formatted as:
+#
+#   { 1: 2 }
+spaces_around_dict_delimiters = false
+
+# Adds a space after the opening '[' and before the ending ']' list delimiters.
+#
+#   [1, 2]
+#
+# will be formatted as:
+#
+#   [ 1, 2 ]
+spaces_around_list_delimiters = false
+
+# Use spaces around the power operator.
+spaces_around_power_operator = false
+
+# Use spaces around the subscript / slice operator.  For example:
+#
+#   my_list[1 : 10 : 2]
+spaces_around_subscript_colon = false
+
+# Adds a space after the opening '(' and before the ending ')' tuple delimiters.
+#
+#   (1, 2, 3)
+#
+# will be formatted as:
+#
+#   ( 1, 2, 3 )
+spaces_around_tuple_delimiters = false
+
+# The number of spaces required before a trailing comment.
+# This can be a single value (representing the number of spaces
+# before each trailing comment) or list of values (representing
+# alignment column values; trailing comments within a block will
+# be aligned to the first column value that is greater than the maximum
+# line length within the block). For example:
+#
+# With spaces_before_comment=5:
+#
+#   1 + 1 # Adding values
+#
+# will be formatted as:
+#
+#   1 + 1     # Adding values <-- 5 spaces between the end of the statement and comment
+#
+# With spaces_before_comment = '15, 20:'
+#
+#   1 + 1 # Adding values
+#   two + two # More adding
+#
+#   longer_statement # This is a longer statement
+#   short # This is a shorter statement
+#
+#   a_very_long_statement_that_extends_beyond_the_final_column # Comment
+#   short # This is a shorter statement
+#
+# will be formatted as:
+#
+#   1 + 1          # Adding values <-- end of line comments in block aligned to col 15
+#   two + two      # More adding
+#
+#   longer_statement    # This is a longer statement <-- end of line comments in block aligned to col 20
+#   short               # This is a shorter statement
+#
+#   a_very_long_statement_that_extends_beyond_the_final_column  # Comment <-- the end of line comments are aligned based on the line length
+#   short                                                       # This is a shorter statement
+#
+spaces_before_comment = 2
+
+# Insert a space between the ending comma and closing bracket of a list,
+# etc.
+space_between_ending_comma_and_closing_bracket = false
+
+# Use spaces inside brackets, braces, and parentheses.  For example:
+#
+#   method_call( 1 )
+#   my_dict[ 3 ][ 1 ][ get_index( *args, **kwargs ) ]
+#   my_set = { 1, 2, 3 }
+space_inside_brackets = false
+
+# Split before arguments
+split_all_comma_separated_values = false
+
+# Split before arguments, but do not split all subexpressions recursively
+# (unless needed).
+split_all_top_level_comma_separated_values = false
+
+# Split before arguments if the argument list is terminated by a
+# comma.
+split_arguments_when_comma_terminated = true
+
+# Set to True to prefer splitting before '+', '-', '*', '/', '//', or '@'
+# rather than after.
+split_before_arithmetic_operator = false
+
+# Set to True to prefer splitting before '&', '|' or '^' rather than
+# after.
+split_before_bitwise_operator = false
+
+# Split before the closing bracket if a list or dict literal doesn't fit on
+# a single line.
+split_before_closing_bracket = true
+
+# Split before a dictionary or set generator (comp_for). For example, note
+# the split before the 'for':
+#
+#   foo = {
+#       variable: 'Hello world, have a nice day!'
+#       for variable in bar if variable != 42
+#   }
+split_before_dict_set_generator = false
+
+# Split before the '.' if we need to split a longer expression:
+#
+#   foo = ('This is a really long string: {}, {}, {}, {}'.format(a, b, c, d))
+#
+# would reformat to something like:
+#
+#   foo = ('This is a really long string: {}, {}, {}, {}'
+#          .format(a, b, c, d))
+split_before_dot = false
+
+# Split after the opening paren which surrounds an expression if it doesn't
+# fit on a single line.
+split_before_expression_after_opening_paren = false
+
+# If an argument / parameter list is going to be split, then split before
+# the first argument.
+split_before_first_argument = false
+
+# Set to True to prefer splitting before 'and' or 'or' rather than
+# after.
+split_before_logical_operator = false
+
+# Split named assignments onto individual lines.
+split_before_named_assigns = true
+
+# Set to True to split list comprehensions and generators that have
+# non-trivial expressions and multiple clauses before each of these
+# clauses. For example:
+#
+#   result = [
+#       a_long_var + 100 for a_long_var in xrange(1000)
+#       if a_long_var % 10]
+#
+# would reformat to something like:
+#
+#   result = [
+#       a_long_var + 100
+#       for a_long_var in xrange(1000)
+#       if a_long_var % 10]
+split_complex_comprehension = true
+
+# The penalty for splitting right after the opening bracket.
+split_penalty_after_opening_bracket = 300
+
+# The penalty for splitting the line after a unary operator.
+split_penalty_after_unary_operator = 10000
+
+# The penalty of splitting the line around the '+', '-', '*', '/', '//',
+# ``%``, and '@' operators.
+split_penalty_arithmetic_operator = 300
+
+# The penalty for splitting right before an if expression.
+split_penalty_before_if_expr = 0
+
+# The penalty of splitting the line around the '&', '|', and '^'
+# operators.
+split_penalty_bitwise_operator = 300
+
+# The penalty for splitting a list comprehension or generator
+# expression.
+split_penalty_comprehension = 2100
+
+# The penalty for characters over the column limit.
+split_penalty_excess_character = 7000
+
+# The penalty incurred by adding a line split to the unwrapped line. The
+# more line splits added the higher the penalty.
+split_penalty_for_added_line_split = 20
+
+# The penalty of splitting a list of "import as" names. For example:
+#
+#   from a_very_long_or_indented_module_name_yada_yad import (long_argument_1,
+#                                                             long_argument_2,
+#                                                             long_argument_3)
+#
+# would reformat to something like:
+#
+#   from a_very_long_or_indented_module_name_yada_yad import (
+#       long_argument_1, long_argument_2, long_argument_3)
+split_penalty_import_names = 0
+
+# The penalty of splitting the line around the 'and' and 'or'
+# operators.
+split_penalty_logical_operator = 300
+
+# Use the Tab character for indentation.
+use_tabs = false
+
+# Ignore directories
+[tool.yapfignore]
+ignore_patterns = [
+    "runs/**/*.py",
+    "wandb/**/*.py",
+    "build/**/*.py",
+]
+
+[project]
+name = "llm-foundry"
+description = "LLM Foundry: A library for training and fine-tuning large language models"
+authors = [
+    { name = "MosaicML" },
+]
+readme = "README.md"
+requires-python = ">=3.11.0"
+license = { text = "Apache-2.0" }
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+]
+dynamic = ["version"]
+dependencies = [
+  "mosaicml[wandb,peft,mlflow]>=0.28.0,<0.29",
+  "accelerate>=0.25,<1.4",
+  "transformers>=4.43.2,<4.48",
+  "mosaicml-streaming>=0.11.0,<0.12",
+  "datasets>=2.20.0,<3.3",
+  "fsspec>=2023.6.0,<2024.12.0",
+  "sentencepiece>=0.2.0",
+  "einops==0.8.0",
+  "omegaconf>=2.2.3,<3",
+  "slack-sdk<4",
+  "mosaicml-cli>=0.6.10,<1",
+  "onnx==1.17.0",
+  "onnxruntime>=1.19.2,<1.20.2",
+  "boto3>=1.21.45,<2",
+  "huggingface-hub>=0.19.0,<0.29",
+  "beautifulsoup4>=4.12.2,<5",
+  "tenacity>=8.2.3,<10",
+  "catalogue>=2,<3",
+  "typer<1",
+  "GitPython==3.1.44",
+  # "aim>=3.26.0,<4",  # Commented out as we don't need Aim for local training
+  "zstd>=1.5.6.1,!=1.5.6.2",
+  "math_verify>=0.6.0",
+]
+
+# Extra group for development requirements
+[dependency-groups]
+dev = [
+  "coverage[toml]==7.6.10",
+  "pre-commit>=3.4.0,<4",
+  "pytest>=7.2.1,<9",
+  "pytest_codeblocks>=0.16.1,<0.18",
+  "pytest-cov>=4,<7",
+  "pyright==1.1.256",
+  "toml>=0.10.2,<0.11",
+  "packaging>=21,<25",
+  "hf_transfer>=0.1.8,<0.2",
+]
+
+[build-system]
+requires = ["setuptools>=64.0.0", "wheel", "pip>=21.3"]
+build-backend = "setuptools.build_meta"
+
+[project.scripts]
+llmfoundry = "llmfoundry.cli.cli:app"
+
+[project.optional-dependencies]
+gpu = ["torch>=2.5.1,<2.5.2", "setuptools>=75.8.0", "packaging"]
+cpu = ["torch>=2.5.1,<2.5.2"]
+flash = ["flash-attn>=2.7.4"]
+openai = ["openai>=1.56.0,<2.0", "tiktoken>=0.4,<0.8.1"]
+
+[tool.setuptools]
+packages = ["llmfoundry"]
+
+[tool.setuptools.dynamic]
+version = { attr = "llmfoundry._version.__version__" }
+
+[tool.uv]
+package = true
+conflicts = [
+  [
+    { extra = "cpu" },
+    { extra = "gpu" },
+  ],
+]
+no-build-isolation-package = ["flash-attn"]
+
+[[tool.uv.dependency-metadata]]
+name = "flash-attn"
+version = "2.7.4post1"
+requires-dist = ["torch", "einops"]
+
+[tool.uv.sources]
+torch = [
+  { index = "pytorch-cpu", extra = "cpu" },
+  { index = "pytorch-gpu", extra = "gpu" },
+]
+# aim = { git = "https://github.com/LocalResearchGroup/aim.git", branch = "release/3.27.x" }  # Commented out as we don't need Aim for local training
+
+[[tool.uv.index]]
+name = "pytorch-cpu"
+url = "https://download.pytorch.org/whl/cpu"
+explicit = true
+
+[[tool.uv.index]]
+name = "pytorch-gpu"
+url = "https://download.pytorch.org/whl/cu124"
+explicit = true
\ No newline at end of file
diff --git a/scripts/train/train_with_custom_llama.py b/scripts/train/train_with_custom_llama.py
new file mode 100644
index 0000000..5419051
--- /dev/null
+++ b/scripts/train/train_with_custom_llama.py
@@ -0,0 +1,137 @@
+import os
+from pathlib import Path
+import logging
+from typing import Optional
+from omegaconf import OmegaConf
+
+from llmfoundry.models.llama import CustomLlamaModel
+from llmfoundry.registry import models
+from llmfoundry.command_utils.train import train
+from dotenv import load_dotenv
+load_dotenv()
+
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# Get the script's directory
+ROOT_DIR = Path(__file__).parent.parent.parent
+
+
+def train_custom_llama(
+    model_name: Optional[str] = None,
+    yaml_path: Optional[str] = None,
+    output_dir: Optional[str] = None,
+    hf_token: Optional[str] = None,
+    dataset_path: Optional[str] = None,
+):
+    """Train a custom Llama model using the specified configuration."""
+    try:
+        # Set up paths
+        if yaml_path is None:
+            yaml_path = os.path.join(ROOT_DIR, "scripts", "train", "yamls", "llama", "llama3-1b-lora.yaml")
+        if output_dir is None:
+            output_dir = os.path.join(ROOT_DIR, "outputs/custom_llama")
+        if dataset_path is None:
+            dataset_path = os.path.join(ROOT_DIR, "datasets/c4_small")
+            logger.info(f"Using default dataset path: {dataset_path}")
+
+        # Create output directory
+        os.makedirs(output_dir, exist_ok=True)
+        logger.info(f"Checkpoints will be saved to: {output_dir}")
+
+        # Load configuration
+        config = OmegaConf.load(yaml_path)
+        # Add debug output to check config structure
+        import json
+        logger.info("CONFIG STRUCTURE:")
+        logger.info(json.dumps(OmegaConf.to_container(config), indent=2))
+        if "model" in config and "peft_config" in config.model:
+            logger.info("PEFT CONFIG FOUND:")
+            logger.info(json.dumps(OmegaConf.to_container(config.model.peft_config), indent=2))
+        else:
+            logger.warning("No peft_config found in model section of YAML")
+        
+        # Extract model_name_or_path from config if not provided
+        if model_name is None and "variables" in config and "model_name_or_path" in config["variables"]:
+            model_name = config["variables"]["model_name_or_path"]
+            logger.info(f"Using model name from YAML config: {model_name}")
+        elif model_name is None:
+            model_name = "meta-llama/Llama-3.2-1B"
+            logger.info(f"Using default model name: {model_name}")
+        
+        # Set HuggingFace token
+        if hf_token is None:
+            hf_token = os.getenv("HF_TOKEN")
+            if hf_token is None:
+                raise ValueError("HuggingFace token not found. Please set HF_TOKEN environment variable or pass it as an argument.")
+        
+        # Set token in environment for transformers
+        os.environ["HF_TOKEN"] = hf_token
+        
+        # Update dataset path in config
+        if "train_loader" in config and "dataset" in config["train_loader"]:
+            config["train_loader"]["dataset"]["local"] = dataset_path
+            logger.info(f"Updated dataset path in config to: {dataset_path}")
+        
+        # Update eval_loader dataset path if it exists
+        if "eval_loader" in config and "dataset" in config["eval_loader"]:
+            config["eval_loader"]["dataset"]["local"] = dataset_path
+            logger.info(f"Updated eval dataset path in config to: {dataset_path}")
+
+        # Update model configuration - now using the root level model config
+        if "model" in config:
+            config.model.pretrained_model_name_or_path = model_name
+            logger.info(f"Updated model name in config to: {model_name}")
+        
+        # Set the save folder to the output directory
+        config.save_folder = output_dir
+        logger.info(f"Set save_folder in config to: {output_dir}")
+        
+        # Ensure the save folder exists
+        os.makedirs(output_dir, exist_ok=True)
+
+        # # Start training
+        # logger.info("Starting training")
+        # trainer = train(config)
+        # logger.info("Training completed successfully")
+        
+        # THIS IS THE CRITICAL LINE: Register the custom model
+        from llmfoundry.models.llama.register import register_custom_llama_model
+        register_custom_llama_model()
+        logger.info("Registered CustomLlamaModel with registry")
+
+        # Start training
+        logger.info("Starting training")
+        trainer = train(config)
+        logger.info("Training completed successfully")
+
+        return trainer
+
+    except Exception as e:
+        logger.error(f"Error during training: {str(e)}")
+        raise
+
+
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser(description="Train a custom Llama model")
+    parser.add_argument("--model_name", type=str, default=None,
+                      help="Name or path of the pretrained model")
+    parser.add_argument("--yaml_path", type=str, default=None,
+                      help="Path to the training configuration YAML file")
+    parser.add_argument("--output_dir", type=str, default=None,
+                      help="Directory to save the trained model")
+    parser.add_argument("--hf_token", type=str, default=None,
+                      help="HuggingFace API token for accessing gated models")
+    parser.add_argument("--dataset_path", type=str, default=None,
+                      help="Path to the dataset directory")
+    
+    args = parser.parse_args()
+    train_custom_llama(
+        model_name=args.model_name,
+        yaml_path=args.yaml_path,
+        output_dir=args.output_dir,
+        hf_token=args.hf_token,
+        dataset_path=args.dataset_path,
+    ) 
\ No newline at end of file
diff --git a/scripts/train/yamls/llama/llama3-1b-lora-instruct-full-ft.yaml b/scripts/train/yamls/llama/llama3-1b-lora-instruct-full-ft.yaml
new file mode 100644
index 0000000..a4a1ca7
--- /dev/null
+++ b/scripts/train/yamls/llama/llama3-1b-lora-instruct-full-ft.yaml
@@ -0,0 +1,129 @@
+# llama3-1b-lora-instruct-full-ft.yaml
+variables:
+  data_local: /datasets/c4_small
+  data_remote: 
+  tokenizer_name: meta-llama/Llama-3.2-1B-Instruct
+  global_seed: 17
+  max_seq_len: 256 #1024 #2048 #8192->opt for memory
+  run_name: llama3-lora-test
+  model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
+  model_output_path: /model-checkpoints 
+  
+
+max_seq_len: ${variables.max_seq_len}
+run_name: ${variables.run_name}
+
+# Model
+model:
+  name: hf_causal_lm
+  pretrained_model_name_or_path: ${variables.model_name_or_path}
+  config_overrides:
+    use_fused_loss: true
+    max_position_embeddings: ${variables.max_seq_len}
+    use_cache: false # critical for memory savings
+    rope_scaling:
+      factor: 32.0
+      high_freq_factor: 4.0
+      low_freq_factor: 1.0   
+      original_max_position_embeddings: 256 #1024 #4096 #reduce from 8192->opt for memory  
+      rope_type: "llama3" 
+
+  pretrained: true
+
+# Tokenizer
+tokenizer:
+  name: ${variables.tokenizer_name}
+  kwargs:
+    model_max_length: ${variables.max_seq_len}
+
+# loggers:
+#   aim:
+#     repo: '.aim'
+#     experiment_name: 'llama3_1b_lora_test'
+#     upload_on_close: true
+
+# Data loading
+train_loader:
+  name: text
+  dataset:
+    local: ${variables.data_local}
+    remote: ${variables.data_remote}
+    split: train_small
+    shuffle: true
+    max_seq_len: ${variables.max_seq_len}
+    shuffle_seed: ${variables.global_seed}
+  drop_last: true
+  num_workers: 4
+
+eval_loader:
+  name: text
+  dataset:
+    local: ${variables.data_local}
+    remote: ${variables.data_remote}
+    split: val_small
+    shuffle: false
+    max_seq_len: ${variables.max_seq_len}
+    shuffle_seed: ${variables.global_seed}
+  drop_last: false
+  num_workers: 4
+
+scheduler:
+  name: cosine_with_warmup
+  t_warmup: 100ba
+  alpha_f: 0.1
+
+# optimizer:
+#   name: decoupled_adamw
+#   lr: 6.0e-4
+#   betas:
+#   - 0.9
+#   - 0.95
+#   eps: 1.0e-08
+#   weight_decay: 0.0
+optimizer:
+  name: decoupled_lionw  # Lion uses ~50% less memory than Adam variants
+  lr: 3e-4              # Lion typically uses lower learning rates
+  betas:                # Lion default momentum parameters
+  - 0.9
+  - 0.99
+  weight_decay: 0.01    # Standard weight decay
+
+
+algorithms:
+  gradient_clipping:
+    clipping_type: norm
+    clipping_threshold: 1.0
+
+max_duration: 2ba #4800ba
+eval_interval: 10ba #500ba # only eval at the end with 0
+eval_first: false
+eval_subset_num_batches: 10 #-1
+global_train_batch_size: 2 #1 if not composer
+
+# System
+seed: ${variables.global_seed}
+device_eval_batch_size: 1 #16
+device_train_microbatch_size: 1 #16
+precision: amp_bf16
+
+# FSDP
+fsdp_config:
+  sharding_strategy: FULL_SHARD
+  mixed_precision: FULL #PURE
+  activation_checkpointing: true #false
+  #activation_checkpointing_reentrant: false
+  activation_cpu_offload: true #false
+  limit_all_gathers: true
+
+
+# Logging
+progress_bar: true
+log_to_console: true
+console_log_interval: 1ba
+
+callbacks:
+  speed_monitor:
+    window_size: 10
+  lr_monitor: {}
+  memory_monitor: {}
+  runtime_estimator: {}
\ No newline at end of file
diff --git a/scripts/train/yamls/llama/llama3-1b-lora-instruct.yaml b/scripts/train/yamls/llama/llama3-1b-lora-instruct.yaml
new file mode 100644
index 0000000..7c5c737
--- /dev/null
+++ b/scripts/train/yamls/llama/llama3-1b-lora-instruct.yaml
@@ -0,0 +1,128 @@
+# llama3-1b-lora-instruct.yaml
+variables:
+  data_local: /datasets/c4_small
+  data_remote: 
+  tokenizer_name: meta-llama/Llama-3.2-1B-Instruct
+  global_seed: 17
+  max_seq_len: 8192
+  run_name: llama3-lora-test
+  model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
+  model_output_path: /model-checkpoints 
+
+max_seq_len: ${variables.max_seq_len}
+run_name: ${variables.run_name}
+
+# Model
+model:
+  name: hf_causal_lm
+  pretrained_model_name_or_path: ${variables.model_name_or_path}
+  config_overrides:
+    use_fused_loss: true
+    max_position_embeddings: ${variables.max_seq_len}
+    rope_scaling:
+      factor: 32.0
+      high_freq_factor: 4.0
+      low_freq_factor: 1.0   
+      original_max_position_embeddings: 8192  
+      rope_type: "llama3" 
+  peft_config:
+    r: 8
+    peft_type: LORA
+    task_type: CAUSAL_LM
+    lora_alpha: 32 #16
+    lora_dropout: 0.05
+    target_modules:
+      - q_proj
+      - k_proj
+      - v_proj
+      - o_proj
+  pretrained: true
+
+# Tokenizer
+tokenizer:
+  name: ${variables.tokenizer_name}
+  kwargs:
+    model_max_length: ${variables.max_seq_len}
+
+# loggers:
+#   aim:
+#     repo: '.aim'
+#     experiment_name: 'llama3_1b_lora_test'
+#     upload_on_close: true
+
+# Data loading
+train_loader:
+  name: text
+  dataset:
+    local: ${variables.data_local}
+    remote: ${variables.data_remote}
+    split: train_small
+    shuffle: true
+    max_seq_len: ${variables.max_seq_len}
+    shuffle_seed: ${variables.global_seed}
+  drop_last: true
+  num_workers: 8
+
+eval_loader:
+  name: text
+  dataset:
+    local: ${variables.data_local}
+    remote: ${variables.data_remote}
+    split: val_small
+    shuffle: false
+    max_seq_len: ${variables.max_seq_len}
+    shuffle_seed: ${variables.global_seed}
+  drop_last: false
+  num_workers: 8
+
+scheduler:
+  name: cosine_with_warmup
+  t_warmup: 100ba
+  alpha_f: 0.1
+
+optimizer:
+  name: decoupled_adamw
+  lr: 6.0e-4
+  betas:
+  - 0.9
+  - 0.95
+  eps: 1.0e-08
+  weight_decay: 0.0
+
+algorithms:
+  gradient_clipping:
+    clipping_type: norm
+    clipping_threshold: 1.0
+
+max_duration: 2ba #4800ba
+eval_interval: 10ba #500ba # only eval at the end with 0
+eval_first: false
+eval_subset_num_batches: 10 #-1
+global_train_batch_size: 2 #256
+
+# System
+seed: ${variables.global_seed}
+device_eval_batch_size: 1 #16
+device_train_microbatch_size: 1 #16
+precision: amp_bf16
+
+# FSDP
+fsdp_config:
+  sharding_strategy: FULL_SHARD
+  mixed_precision: PURE
+  activation_checkpointing: false
+  activation_checkpointing_reentrant: false
+  activation_cpu_offload: false
+  limit_all_gathers: true
+
+# Logging
+progress_bar: true
+log_to_console: true
+console_log_interval: 1ba
+
+callbacks:
+  speed_monitor:
+    window_size: 10
+  lr_monitor: {}
+  memory_monitor: {}
+  runtime_estimator: {}
\ No newline at end of file
diff --git a/scripts/train/yamls/llama/llama3-1b-lora.yaml b/scripts/train/yamls/llama/llama3-1b-lora.yaml
new file mode 100644
index 0000000..6a8d103
--- /dev/null
+++ b/scripts/train/yamls/llama/llama3-1b-lora.yaml
@@ -0,0 +1,128 @@
+# llama3-1b-lora.yaml
+variables:
+  data_local: /datasets/c4_small
+  data_remote: 
+  tokenizer_name: meta-llama/Llama-3.2-1B
+  global_seed: 17
+  max_seq_len: 8192
+  run_name: llama3-lora-test
+  model_name_or_path: meta-llama/Llama-3.2-1B
+  model_output_path: /model-checkpoints 
+
+max_seq_len: ${variables.max_seq_len}
+run_name: ${variables.run_name}
+
+# Model
+model:
+  name: hf_causal_lm
+  pretrained_model_name_or_path: ${variables.model_name_or_path}
+  config_overrides:
+    use_fused_loss: true
+    max_position_embeddings: ${variables.max_seq_len}
+    rope_scaling:
+      factor: 32.0
+      high_freq_factor: 4.0
+      low_freq_factor: 1.0   
+      original_max_position_embeddings: 8192  
+      rope_type: "llama3" 
+  peft_config:
+    r: 8
+    peft_type: LORA
+    task_type: CAUSAL_LM
+    lora_alpha: 32 #16
+    lora_dropout: 0.05
+    target_modules:
+      - q_proj
+      - k_proj
+      - v_proj
+      - o_proj
+  pretrained: true
+
+# Tokenizer
+tokenizer:
+  name: ${variables.tokenizer_name}
+  kwargs:
+    model_max_length: ${variables.max_seq_len}
+
+# loggers:
+#   aim:
+#     repo: '.aim'
+#     experiment_name: 'llama3_1b_lora_test'
+#     upload_on_close: true
+
+# Data loading
+train_loader:
+  name: text
+  dataset:
+    local: ${variables.data_local}
+    remote: ${variables.data_remote}
+    split: train_small
+    shuffle: true
+    max_seq_len: ${variables.max_seq_len}
+    shuffle_seed: ${variables.global_seed}
+  drop_last: true
+  num_workers: 8
+
+eval_loader:
+  name: text
+  dataset:
+    local: ${variables.data_local}
+    remote: ${variables.data_remote}
+    split: val_small
+    shuffle: false
+    max_seq_len: ${variables.max_seq_len}
+    shuffle_seed: ${variables.global_seed}
+  drop_last: false
+  num_workers: 8
+
+scheduler:
+  name: cosine_with_warmup
+  t_warmup: 100ba
+  alpha_f: 0.1
+
+optimizer:
+  name: decoupled_adamw
+  lr: 6.0e-4
+  betas:
+  - 0.9
+  - 0.95
+  eps: 1.0e-08
+  weight_decay: 0.0
+
+algorithms:
+  gradient_clipping:
+    clipping_type: norm
+    clipping_threshold: 1.0
+
+max_duration: 2ba #4800ba
+eval_interval: 10ba # 500ba; set to 0 to evaluate only at the end of training
+eval_first: false
+eval_subset_num_batches: 10 #-1
+global_train_batch_size: 2 #256
+
+# System
+seed: ${variables.global_seed}
+device_eval_batch_size: 1 #16
+device_train_microbatch_size: 1 #16
+precision: amp_bf16
+
+# FSDP
+fsdp_config:
+  sharding_strategy: FULL_SHARD
+  mixed_precision: PURE
+  activation_checkpointing: false
+  activation_checkpointing_reentrant: false
+  activation_cpu_offload: false
+  limit_all_gathers: true
+
+# Logging
+progress_bar: true
+log_to_console: true
+console_log_interval: 1ba
+
+callbacks:
+  speed_monitor:
+    window_size: 10
+  lr_monitor: {}
+  memory_monitor: {}
+  runtime_estimator: {}
\ No newline at end of file
diff --git a/scripts/train/yamls/llama/llama3-1b-lora_fsdp.yaml b/scripts/train/yamls/llama/llama3-1b-lora_fsdp.yaml
new file mode 100644
index 0000000..6190d3e
--- /dev/null
+++ b/scripts/train/yamls/llama/llama3-1b-lora_fsdp.yaml
@@ -0,0 +1,137 @@
+# llama3-1b-lora_fsdp.yaml
+# run NODE_RANK=0 python -m torch.distributed.run --nproc_per_node=2 local_llama_training.py
+variables:
+  data_local: /datasets/c4_small
+  data_remote: 
+  tokenizer_name: meta-llama/Llama-3.2-1B
+  global_seed: 17
+  max_seq_len: 8192
+  run_name: llama3-lora-test
+  model_name_or_path: meta-llama/Llama-3.2-1B
+  model_output_path: /model-checkpoints 
+
+max_seq_len: ${variables.max_seq_len}
+run_name: ${variables.run_name}
+
+# Model
+model:
+  name: hf_causal_lm
+  pretrained_model_name_or_path: ${variables.model_name_or_path}
+  config_overrides:
+    use_fused_loss: false #true
+    max_position_embeddings: ${variables.max_seq_len}
+    rope_scaling:
+      factor: 32.0
+      high_freq_factor: 4.0
+      low_freq_factor: 1.0   
+      original_max_position_embeddings: 8192  
+      rope_type: "llama3" 
+  peft_config:
+    r: 8
+    peft_type: LORA
+    task_type: CAUSAL_LM
+    lora_alpha: 16
+    lora_dropout: 0.05
+    target_modules:
+      - q_proj
+      - k_proj
+      - v_proj
+      - o_proj
+  pretrained: true
+
+# Tokenizer
+tokenizer:
+  name: ${variables.tokenizer_name}
+  kwargs:
+    model_max_length: ${variables.max_seq_len}
+
+# loggers:
+#   aim:
+#     repo: '.aim'
+#     experiment_name: 'llama3_1b_lora_test'
+#     upload_on_close: true
+
+# Data loading
+train_loader:
+  name: text
+  dataset:
+    local: ${variables.data_local}
+    remote: ${variables.data_remote}
+    split: train_small
+    shuffle: true
+    max_seq_len: ${variables.max_seq_len}
+    shuffle_seed: ${variables.global_seed}
+  drop_last: true
+  num_workers: 8
+
+eval_loader:
+  name: text
+  dataset:
+    local: ${variables.data_local}
+    remote: ${variables.data_remote}
+    split: val_small
+    shuffle: false
+    max_seq_len: ${variables.max_seq_len}
+    shuffle_seed: ${variables.global_seed}
+  drop_last: false
+  num_workers: 8
+
+scheduler:
+  name: cosine_with_warmup
+  t_warmup: 100ba
+  alpha_f: 0.1
+
+optimizer:
+  name: decoupled_adamw
+  lr: 6.0e-4
+  betas:
+  - 0.9
+  - 0.95
+  eps: 1.0e-08
+  weight_decay: 0.0
+
+algorithms:
+  gradient_clipping:
+    clipping_type: norm
+    clipping_threshold: 1.0
+
+## FSDP
+# fsdp:
+#   sharding_strategy: FULL_SHARD  # or try HYBRID_SHARD
+#   mixed_precision: DEFAULT
+#   activation_checkpointing: true
+#   activation_cpu_offload: false
+##
+
+max_duration: 2ba #4800ba
+eval_interval: 10ba # 500ba; set to 0 to evaluate only at the end of training
+eval_first: false
+eval_subset_num_batches: 10 #-1
+global_train_batch_size: 4 #4 #2 #256
+
+# System
+seed: ${variables.global_seed}
+device_eval_batch_size: 2 #1 #16
+device_train_microbatch_size: 2 #auto #2 #1 #16
+precision: amp_bf16
+
+# FSDP
+fsdp_config:
+  sharding_strategy: FULL_SHARD
+  mixed_precision: PURE
+  activation_checkpointing: false
+  activation_checkpointing_reentrant: false
+  activation_cpu_offload: true
+  limit_all_gathers: true
+
+# Logging
+progress_bar: true
+log_to_console: true
+console_log_interval: 1ba
+
+callbacks:
+  speed_monitor:
+    window_size: 10
+  lr_monitor: {}
+  memory_monitor: {}
+  runtime_estimator: {}
\ No newline at end of file