From 2fc16a68ef9ccac8bb5e1ca70f2ded7f9498408d Mon Sep 17 00:00:00 2001
From: Jim Burtoft <jburtoft@amazon.com>
Date: Fri, 8 May 2026 23:05:35 -0400
Subject: [PATCH 1/2] Add Qwen3-ASR-1.7B contrib model

Implements Qwen3-ASR-1.7B (speech-to-text) on NeuronX Distributed Inference
using a decomposed pipeline: traced audio encoder + NxDI text decoder.

Key features:
- Audio encoder traced with StaticQwen3ASREncoder (3 bucket NEFFs: 5s/10s/30s)
- Text decoder reuses NeuronQwen3VLForCausalLM scatter mechanism
- Full E2E pipeline with EXACT MATCH to CPU reference
- WER: 3.06% on LibriSpeech test-clean (50 samples)
- Performance: 4.9ms TPOT, 27.5ms TTFT, 50x real-time (30s audio)
- Validated on trn2.3xlarge TP=4, SDK 2.29

Includes integration tests and comprehensive README.
---
 contrib/models/Qwen3-ASR-1.7B/README.md       | 169 +++++++
 contrib/models/Qwen3-ASR-1.7B/src/__init__.py |  23 +
 .../Qwen3-ASR-1.7B/src/audio_encoder.py       | 298 ++++++++++++
 .../Qwen3-ASR-1.7B/src/modeling_qwen3_asr.py  | 375 ++++++++++++++++
 .../models/Qwen3-ASR-1.7B/test/__init__.py    |   0
 .../test/integration/__init__.py              |   0
 .../test/integration/test_model.py            | 425 ++++++++++++++++++
 .../Qwen3-ASR-1.7B/test/unit/__init__.py      |   0
 8 files changed, 1290 insertions(+)
 create mode 100644 contrib/models/Qwen3-ASR-1.7B/README.md
 create mode 100644 contrib/models/Qwen3-ASR-1.7B/src/__init__.py
 create mode 100644 contrib/models/Qwen3-ASR-1.7B/src/audio_encoder.py
 create mode 100644 contrib/models/Qwen3-ASR-1.7B/src/modeling_qwen3_asr.py
 create mode 100644 contrib/models/Qwen3-ASR-1.7B/test/__init__.py
 create mode 100644 contrib/models/Qwen3-ASR-1.7B/test/integration/__init__.py
 create mode 100644 contrib/models/Qwen3-ASR-1.7B/test/integration/test_model.py
 create mode 100644 contrib/models/Qwen3-ASR-1.7B/test/unit/__init__.py

diff --git a/contrib/models/Qwen3-ASR-1.7B/README.md b/contrib/models/Qwen3-ASR-1.7B/README.md
new file mode 100644
index 00000000..8a1bc767
--- /dev/null
+++ b/contrib/models/Qwen3-ASR-1.7B/README.md
@@ -0,0 +1,169 @@
+# Contrib Model: Qwen3-ASR-1.7B
+
+NeuronX Distributed Inference implementation of [Qwen3-ASR-1.7B](https://huggingface.co/Qwen/Qwen3-ASR-1.7B), a speech-to-text model with Whisper-like audio encoder and Qwen3 decoder.
+
+## Model Information
+
+- **HuggingFace ID:** `Qwen/Qwen3-ASR-1.7B`
+- **Model Type:** Encoder-Decoder ASR (Audio encoder + Qwen3 text decoder)
+- **Architecture:** `Qwen3ASRForConditionalGeneration`
+- **Parameters:** ~1.7B total (encoder: ~300M, decoder: ~1.4B)
+- **License:** Apache 2.0
+
+## Architecture Details
+
+- **Audio Encoder:** 24 transformer layers, d_model=1024, 16 heads (Whisper-like with Conv2D frontend)
+- **Text Decoder:** 28 Qwen3 layers, hidden_size=2048, GQA 16/8, head_dim=128, QK-norm, mRoPE
+- **Vocabulary:** 151,936 tokens
+- **Max Position Embeddings:** 65,536 (KV cache: configurable, 1024 sufficient for most ASR)
+- **Audio Rate:** ~13 tokens per second of audio
+
+## Validation Results
+
+**Validated:** 2026-05-09  
+**Configuration:** TP=4, batch_size=1, N_POSITIONS=1024, bfloat16  
+**Instance:** trn2.3xlarge (LNC=2)
+
+### Test Results
+
+| Test | Status | Result |
+|------|--------|--------|
+| Smoke Test | PASS | Model loads and generates tokens |
+| E2E Accuracy | PASS | EXACT MATCH with CPU reference |
+| WER (LibriSpeech test-clean, 50 samples) | PASS | 3.06% (published: 1.63%) |
+| Silence Handling | PASS | Empty output for non-speech audio |
+| Long Audio (30s) | PASS | Correct transcription |
+
+### Performance Metrics (trn2.3xlarge, TP=4)
+
+| Metric | Value |
+|--------|-------|
+| TTFT (5s audio) | 27.5ms |
+| TTFT (30s audio) | 39.9ms |
+| TPOT | 4.9ms |
+| E2E Latency (10s audio) | 240ms |
+| RTF (30s audio) | 0.020x (50x real-time) |
+| Throughput | 194 tok/s |
+| Audio throughput | 49.7 audio-sec/wall-sec |
+
+### DP=2 Throughput (TP=2 per instance)
+
+| Config | Aggregate Throughput |
+|--------|---------------------|
+| TP=4 single stream | 29.8 audio-sec/wall-sec |
+| TP=2 x DP=2 | ~46.2 audio-sec/wall-sec |
+
+## Usage
+
+### 1. Compile the Model
+
+```python
+import torch
+from src.modeling_qwen3_asr import NeuronQwen3ASRForCausalLM, create_inference_config
+from src.audio_encoder import trace_encoder
+
+model_path = "Qwen/Qwen3-ASR-1.7B"  # or local path
+compiled_path = "/path/to/compiled/"
+encoder_dir = "/path/to/compiled/encoder/"
+
+# Compile text decoder
+config = create_inference_config(model_path, tp_degree=4, n_positions=1024)
+model = NeuronQwen3ASRForCausalLM(compiled_path, config)
+model.compile(compiled_path)
+
+# Trace audio encoder (3 bucket sizes: 5s, 10s, 30s)
+trace_encoder(model_path, encoder_dir, buckets=[500, 1000, 3000])
+```
+
+### 2. Run Inference
+
+```python
+import torch
+import numpy as np
+import soundfile as sf
+from transformers import AutoTokenizer, WhisperFeatureExtractor
+
+from src.modeling_qwen3_asr import (
+    NeuronQwen3ASRForCausalLM, create_inference_config,
+    get_encoder_output_length, AUDIO_PAD_ID, AUDIO_START_ID, AUDIO_END_ID,
+    IM_START_ID, IM_END_ID, EOS_ID,
+)
+from src.audio_encoder import load_encoders, select_bucket
+
+model_path = "Qwen/Qwen3-ASR-1.7B"
+compiled_path = "/path/to/compiled/"
+encoder_dir = "/path/to/compiled/encoder/"
+
+# Load model
+config = create_inference_config(model_path, tp_degree=4, n_positions=1024)
+model = NeuronQwen3ASRForCausalLM(compiled_path, config)
+model.load(compiled_path)
+
+# Load encoder
+encoders = load_encoders(encoder_dir)
+
+# Load tokenizer and feature extractor
+tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+feature_extractor = WhisperFeatureExtractor.from_pretrained(model_path)
+
+# Process audio
+audio, sr = sf.read("audio.wav")
+audio = audio.astype(np.float32)
+
+mel = feature_extractor(audio, sampling_rate=16000, return_tensors="pt", return_attention_mask=True)
+mel_len = int(mel["attention_mask"][0].sum().item())
+bucket_T = select_bucket(mel_len)
+N_tokens = get_encoder_output_length(mel_len)
+
+mel_input = mel["input_features"][0][:, :bucket_T]
+if mel_input.shape[1] < bucket_T:
+    mel_input = torch.nn.functional.pad(mel_input, (0, bucket_T - mel_input.shape[1]))
+
+# Encode audio
+with torch.no_grad():
+    audio_embeddings = encoders[bucket_T](mel_input)[:N_tokens]
+
+# Build input sequence
+prefix = [IM_START_ID, 8948, 198, IM_END_ID, 198, IM_START_ID, 872, 198, AUDIO_START_ID]
+audio_ids = [AUDIO_PAD_ID] * N_tokens
+suffix = [AUDIO_END_ID, IM_END_ID, 198, IM_START_ID, 77091, 198]
+input_ids = torch.tensor([prefix + audio_ids + suffix], dtype=torch.long)
+
+# Generate (see test/integration/test_model.py for full decode loop)
+# ... autoregressive decode with model.forward() ...
+
+# Output format: "language English<asr_text>transcription text<|im_end|>"
+```
+
+## Key Implementation Notes
+
+1. **rope_scaling must use "rope_type": "default"** (NOT "mrope") - mRoPE is applied externally via `rotary_position_ids`
+2. **rotary_position_ids must be int/long** (NOT float) - computed from attention_mask.long().cumsum()
+3. **sampling_params must be torch.zeros(1, 3)** even when on_device_sampling is disabled
+4. **Encoder tracing: DO NOT use inline_weights_to_neff=True** - causes accuracy regression
+5. **Batching limitation**: `scatter_by_index_put` in NxDI assumes BS=1 for multimodal prefill. Use DP for throughput.
+
+## Compatibility Matrix
+
+| Instance/SDK | SDK 2.29 | SDK 2.28 |
+|--------------|----------|----------|
+| trn2.3xlarge | VALIDATED | Not tested |
+| trn2.48xlarge | Expected to work | Not tested |
+| trn1.32xlarge | Not supported (NxDI 0.9 drops trn1) | May work with NxDI 0.7 |
+
+## Testing
+
+```bash
+# Run integration tests (requires compiled model and encoder on Neuron instance)
+pytest contrib/models/Qwen3-ASR-1.7B/test/integration/test_model.py -v --capture=tee-sys
+```
+
+## Example Checkpoints
+
+* [Qwen/Qwen3-ASR-1.7B](https://huggingface.co/Qwen/Qwen3-ASR-1.7B)
+
+## Maintainer
+
+Jim Burtoft (jimburtoft)
+
+**Last Updated:** 2026-05-09
diff --git a/contrib/models/Qwen3-ASR-1.7B/src/__init__.py b/contrib/models/Qwen3-ASR-1.7B/src/__init__.py
new file mode 100644
index 00000000..5c310355
--- /dev/null
+++ b/contrib/models/Qwen3-ASR-1.7B/src/__init__.py
@@ -0,0 +1,23 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Qwen3-ASR contrib model for NeuronX Distributed Inference."""
+
+from .modeling_qwen3_asr import (
+    NeuronQwen3ASRForCausalLM,
+    create_inference_config,
+    get_encoder_output_length,
+    AUDIO_PAD_ID,
+    AUDIO_START_ID,
+    AUDIO_END_ID,
+    IM_START_ID,
+    IM_END_ID,
+    EOS_ID,
+    ASR_TEXT_TOKEN_ID,
+)
+from .audio_encoder import (
+    StaticQwen3ASREncoder,
+    trace_encoder,
+    load_encoders,
+    select_bucket,
+)
diff --git a/contrib/models/Qwen3-ASR-1.7B/src/audio_encoder.py b/contrib/models/Qwen3-ASR-1.7B/src/audio_encoder.py
new file mode 100644
index 00000000..d71fe4d1
--- /dev/null
+++ b/contrib/models/Qwen3-ASR-1.7B/src/audio_encoder.py
@@ -0,0 +1,298 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Qwen3-ASR Audio Encoder for Neuron tracing.
+
+The encoder uses a Whisper-like architecture with:
+- Conv2D frontend (3 layers, stride 2, channel 480)
+- 24 transformer encoder layers (bidirectional, d_model=1024)
+- Output projector (1024 -> 2048 to match text decoder hidden_size)
+
+This module provides StaticQwen3ASREncoder which rewrites the encoder
+to be trace-friendly (no dynamic shapes, no cu_seqlens, static attention mask).
+
+Bucket tracing strategy:
+- Trace 3 encoder NEFFs for different audio durations: 5s/10s/30s
+- At inference, select the smallest bucket that fits the input
+- Pad input mel to bucket size, trim output to actual token count
+"""
+
+import math
+from typing import Dict, List, Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+def get_encoder_output_length(T_mel: int) -> int:
+    """Compute number of encoder output tokens from mel frame count."""
+    input_lengths_leave = T_mel % 100
+    feat_lengths = (input_lengths_leave - 1) // 2 + 1
+    output_lengths = ((feat_lengths - 1) // 2 + 1 - 1) // 2 + 1 + (T_mel // 100) * 13
+    return output_lengths
+
+
+def select_bucket(T_mel: int, buckets: List[int] = [500, 1000, 3000]) -> int:
+    """Select smallest bucket that fits the mel frame count."""
+    for b in buckets:
+        if T_mel <= b:
+            return b
+    return buckets[-1]
+
+
+class StaticQwen3ASREncoder(nn.Module):
+    """Static (trace-friendly) wrapper for Qwen3-ASR audio encoder.
+
+    Rewrites the encoder forward pass to use:
+    - Fixed-size reshape instead of dynamic split/pad_sequence
+    - Pre-computed block-diagonal attention mask (no cu_seqlens)
+    - Eager attention instead of Flash Attention 2
+
+    The encoder processes audio in chunks of 100 mel frames (1 second).
+    Input must be padded to the nearest 100-frame boundary.
+
+    Args:
+        audio_tower: The HuggingFace Qwen3ASRAudioEncoder module
+        fixed_T: Fixed mel frame count (must be multiple of 100)
+    """
+
+    def __init__(self, audio_tower, fixed_T: int):
+        super().__init__()
+        assert fixed_T % 100 == 0, f"fixed_T must be multiple of 100, got {fixed_T}"
+
+        self.fixed_T = fixed_T
+        self.n_chunks = fixed_T // 100
+
+        # Copy convolution layers
+        self.conv2d1 = audio_tower.conv2d1
+        self.conv2d2 = audio_tower.conv2d2
+        self.conv2d3 = audio_tower.conv2d3
+        self.conv_out = audio_tower.conv_out
+
+        # Positional embedding
+        self.positional_embedding = audio_tower.positional_embedding
+
+        # Transformer layers
+        self.layers = audio_tower.layers
+
+        # Post-processing
+        self.ln_post = audio_tower.ln_post
+        self.proj1 = audio_tower.proj1
+        self.act = audio_tower.act
+        self.proj2 = audio_tower.proj2
+
+        # Pre-compute attention mask (block-diagonal)
+        # Each chunk of 13 tokens attends only to itself
+        tokens_per_chunk = 13
+        total_tokens = self.n_chunks * tokens_per_chunk
+        mask = torch.zeros(total_tokens, total_tokens, dtype=torch.bool)
+        for i in range(self.n_chunks):
+            start = i * tokens_per_chunk
+            end = start + tokens_per_chunk
+            mask[start:end, start:end] = True
+        # Convert to float mask: 0 for attend, -inf for no attend
+        self.register_buffer(
+            "attention_mask", (~mask).float() * torch.finfo(torch.float32).min
+        )
+
+    def forward(self, input_features: torch.Tensor) -> torch.Tensor:
+        """Forward pass through the static encoder.
+
+        Args:
+            input_features: Mel spectrogram [128, fixed_T]
+
+        Returns:
+            Audio embeddings [total_output_tokens, 2048]
+        """
+        # input_features: [128, T] -> add batch and channel: [1, 1, 128, T]
+        x = input_features.unsqueeze(0).unsqueeze(0)
+
+        # Conv2D frontend: [1, 1, 128, T] -> [1, 480, 16, T/8]
+        x = F.gelu(self.conv2d1(x))
+        x = F.gelu(self.conv2d2(x))
+        x = F.gelu(self.conv2d3(x))
+
+        # Reshape to chunks: each chunk has T_chunk/8 time steps
+        # x shape: [1, 480, 16, T/8]
+        B, C, F_dim, T_dim = x.shape  # 1, 480, 16, fixed_T/8
+
+        # Reshape: [n_chunks, 480, 16, chunk_time] where chunk_time = 100/8 = 12.5 -> 13
+        # Actually, after 3x stride-2 conv: 100 -> 50 -> 25 -> 13 time steps per chunk
+        tokens_per_chunk = T_dim // self.n_chunks  # Should be 13 for 100-frame chunks
+
+        # Reshape [1, 480, 16, T/8] -> [n_chunks, 480, 16, tokens_per_chunk]
+        x = x.squeeze(0)  # [480, 16, T/8]
+        x = x.view(
+            C, F_dim, self.n_chunks, tokens_per_chunk
+        )  # [480, 16, n_chunks, tpc]
+        x = x.permute(2, 0, 1, 3)  # [n_chunks, 480, 16, tpc]
+
+        # Flatten freq and channel: [n_chunks, tpc, 480*16=7680]
+        x = x.permute(0, 3, 1, 2)  # [n_chunks, tpc, 480, 16]
+        x = x.reshape(
+            self.n_chunks, tokens_per_chunk, C * F_dim
+        )  # [n_chunks, tpc, 7680]
+
+        # Linear projection: [n_chunks, tpc, 7680] -> [n_chunks, tpc, 1024]
+        x = self.conv_out(x)
+
+        # Add positional embedding
+        x = x + self.positional_embedding.weight[:tokens_per_chunk]
+
+        # Flatten to single sequence: [total_tokens, 1024]
+        total_tokens = self.n_chunks * tokens_per_chunk
+        x = x.reshape(total_tokens, -1)  # [total_tokens, 1024]
+
+        # Transformer layers with block-diagonal attention
+        for layer in self.layers:
+            # Pre-norm
+            residual = x
+            x_norm = layer.self_attn_layer_norm(x)
+
+            # Self-attention with block-diagonal mask
+            # Q, K, V projections
+            q = layer.self_attn.q_proj(x_norm)
+            k = layer.self_attn.k_proj(x_norm)
+            v = layer.self_attn.v_proj(x_norm)
+
+            # Multi-head attention
+            head_dim = q.shape[-1] // layer.self_attn.num_heads
+            q = q.view(total_tokens, layer.self_attn.num_heads, head_dim).transpose(
+                0, 1
+            )
+            k = k.view(total_tokens, layer.self_attn.num_heads, head_dim).transpose(
+                0, 1
+            )
+            v = v.view(total_tokens, layer.self_attn.num_heads, head_dim).transpose(
+                0, 1
+            )
+
+            # Scaled dot-product attention with mask
+            scale = 1.0 / math.sqrt(head_dim)
+            attn_weights = torch.matmul(q, k.transpose(-2, -1)) * scale
+            attn_weights = attn_weights + self.attention_mask.unsqueeze(0)
+            attn_weights = F.softmax(attn_weights, dim=-1)
+            attn_output = torch.matmul(attn_weights, v)
+
+            # Merge heads and project
+            attn_output = attn_output.transpose(0, 1).reshape(total_tokens, -1)
+            attn_output = layer.self_attn.out_proj(attn_output)
+
+            x = residual + attn_output
+
+            # FFN
+            residual = x
+            x_norm = layer.final_layer_norm(x)
+            x = residual + layer.fc2(F.gelu(layer.fc1(x_norm)))
+
+        # Post-processing: layernorm + projector
+        x = self.ln_post(x)
+        x = self.proj1(x)
+        x = self.act(x)
+        x = self.proj2(x)  # [total_tokens, 2048]
+
+        return x
+
+
+def trace_encoder(
+    model_path: str,
+    output_dir: str,
+    buckets: List[int] = [500, 1000, 3000],
+    compiler_args: Optional[List[str]] = None,
+) -> Dict[int, str]:
+    """Trace the audio encoder for multiple bucket sizes.
+
+    Args:
+        model_path: Path to HuggingFace Qwen3-ASR model
+        output_dir: Directory to save traced encoder NEFFs
+        buckets: List of mel frame counts to trace (default: 5s, 10s, 30s)
+        compiler_args: Additional neuron compiler arguments
+
+    Returns:
+        Dict mapping bucket T -> saved NEFF path
+    """
+    import torch_neuronx
+    from transformers import AutoConfig
+
+    if compiler_args is None:
+        compiler_args = [
+            "--auto-cast",
+            "matmult",
+            "--auto-cast-type",
+            "bf16",
+            "--model-type",
+            "transformer",
+        ]
+
+    # Load model
+    hf_config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+
+    # Import model class
+    import sys
+
+    sys.path.insert(0, model_path)
+    from qwen_asr.core.transformers_backend import Qwen3ASRForConditionalGeneration
+    import torch
+
+    hf_model = Qwen3ASRForConditionalGeneration.from_pretrained(
+        model_path, trust_remote_code=True, torch_dtype=torch.float32
+    )
+    audio_tower = hf_model.thinker.audio_tower
+    audio_tower.eval()
+
+    os.makedirs(output_dir, exist_ok=True)
+    saved_paths = {}
+
+    for T in buckets:
+        print(f"  Tracing encoder for T={T} ({T // 100}s audio)...")
+        encoder = StaticQwen3ASREncoder(audio_tower, T)
+        encoder.eval()
+
+        example_input = torch.randn(128, T)
+
+        traced = torch_neuronx.trace(
+            encoder,
+            (example_input,),
+            compiler_args=compiler_args,
+            # DO NOT use inline_weights_to_neff=True (causes accuracy regression)
+        )
+
+        save_path = os.path.join(output_dir, f"encoder_T{T}.pt")
+        traced.save(save_path)
+        saved_paths[T] = save_path
+        print(f"    Saved to {save_path}")
+
+    return saved_paths
+
+
+def load_encoders(
+    encoder_dir: str,
+    buckets: List[int] = [500, 1000, 3000],
+    device: int = 0,
+    warmup: bool = True,
+) -> Dict[int, torch.jit.ScriptModule]:
+    """Load traced encoder NEFFs and move to device.
+
+    Args:
+        encoder_dir: Directory containing encoder_T{bucket}.pt files
+        buckets: Bucket sizes to load
+        device: Neuron device ID
+        warmup: Whether to run a warmup inference
+
+    Returns:
+        Dict mapping bucket T -> loaded traced model
+    """
+    import torch_neuronx
+
+    encoders = {}
+    for T in buckets:
+        path = os.path.join(encoder_dir, f"encoder_T{T}.pt")
+        enc = torch.jit.load(path)
+        torch_neuronx.move_trace_to_device(enc, device)
+        if warmup:
+            _ = enc(torch.randn(128, T))
+        encoders[T] = enc
+
+    return encoders
diff --git a/contrib/models/Qwen3-ASR-1.7B/src/modeling_qwen3_asr.py b/contrib/models/Qwen3-ASR-1.7B/src/modeling_qwen3_asr.py
new file mode 100644
index 00000000..2a8a4858
--- /dev/null
+++ b/contrib/models/Qwen3-ASR-1.7B/src/modeling_qwen3_asr.py
@@ -0,0 +1,375 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+NeuronX Distributed Inference implementation of Qwen3-ASR-1.7B.
+
+This implements the full ASR pipeline:
+1. Audio encoder (traced separately, Whisper-like architecture)
+2. Text decoder (NeuronQwen3VL pattern with multimodal scatter)
+
+Architecture:
+- Audio encoder: 24-layer transformer, d_model=1024, Conv2D frontend
+- Text decoder: 28-layer Qwen3, hidden_size=2048, GQA 16/8, QK-norm, mRoPE
+- Pipeline: mel spectrogram -> encoder -> scatter into text embeddings -> autoregressive decode
+
+The text decoder reuses NeuronQwen3VLForCausalLM (multimodal scatter mechanism)
+with audio embeddings placed at audio_token positions.
+"""
+
+import copy
+import os
+import time
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+import numpy as np
+
+from neuronx_distributed_inference.models.qwen3_vl.modeling_qwen3_vl import (
+    NeuronQwen3VLForCausalLM,
+    Qwen3VLInferenceConfig,
+    Qwen3VLNeuronConfig,
+)
+from neuronx_distributed_inference.models.qwen3_vl.modeling_qwen3_vl_text import (
+    NeuronQwen3VLTextForCausalLM,
+)
+from neuronx_distributed_inference.models.application_base import (
+    load_state_dict as nxdi_load_sd,
+)
+from neuronx_distributed_inference.models.image_to_text_model_base import (
+    normalize_path,
+)
+from neuronx_distributed_inference.models.llama4.utils.encoder_utils import (
+    generate_positions_from_mask,
+    pad_positions,
+    pad_vision_embeddings,
+)
+
+
+# Special token IDs for Qwen3-ASR
+AUDIO_PAD_ID = 151676  # Placeholder token for audio embeddings in text sequence
+AUDIO_START_ID = 151669  # Marks beginning of audio segment
+AUDIO_END_ID = 151670  # Marks end of audio segment
+IM_START_ID = 151644  # <|im_start|>
+IM_END_ID = 151645  # <|im_end|> (also used as EOS for generation)
+EOS_ID = 151643  # End of sequence / pad token
+ASR_TEXT_TOKEN_ID = (
+    151704  # <asr_text> separator between language tag and transcription
+)
+
+
+def get_encoder_output_length(T_mel: int) -> int:
+    """Compute number of encoder output tokens from mel frame count.
+
+    The encoder uses chunked processing with Conv2D stride-2 frontend.
+    Each 100 mel frames (1 second) produces 13 output tokens.
+
+    Args:
+        T_mel: Number of mel spectrogram frames (100 per second of audio)
+
+    Returns:
+        Number of encoder output tokens (audio embeddings)
+    """
+    input_lengths_leave = T_mel % 100
+    feat_lengths = (input_lengths_leave - 1) // 2 + 1
+    output_lengths = ((feat_lengths - 1) // 2 + 1 - 1) // 2 + 1 + (T_mel // 100) * 13
+    return output_lengths
+
+
+class NeuronQwen3ASRForCausalLM(NeuronQwen3VLForCausalLM):
+    """Qwen3-ASR text decoder on Neuron, using Qwen3-VL's multimodal scatter.
+
+    This class handles:
+    - Loading text decoder weights (excluding audio encoder weights)
+    - Converting HuggingFace state dict to NxDI format
+    - Prefill with audio embedding scatter at AUDIO_PAD positions
+    - Autoregressive decode with mRoPE position tracking
+
+    The audio encoder is handled separately via StaticQwen3ASREncoder (traced).
+    """
+
+    vision_model_cls = None
+    vision_model_wrapper = None
+
+    def enable_vision_encoder(self, **kwargs):
+        """No-op: encoder is traced separately."""
+        pass
+
+    def load(
+        self,
+        compiled_model_path: str,
+        start_rank_id: int = 0,
+        debug: bool = False,
+        **kwargs,
+    ):
+        """Load compiled text model only (no vision model)."""
+        text_path = normalize_path(compiled_model_path) + "text_model/"
+        self.text_traced_model = torch.jit.load(text_path + "model.pt")
+
+        text_weights = self.get_text_builder(debug).shard_checkpoint()
+        start_rank_tensor = torch.tensor([start_rank_id], dtype=torch.int32)
+        self.text_traced_model.nxd_model.initialize(text_weights, start_rank_tensor)
+
+        for model_wrapper in self.text_models:
+            model_wrapper.model = self.text_traced_model
+
+        self.is_loaded_to_neuron = True
+
+    def compile(
+        self,
+        compiled_model_path: str,
+        debug: bool = False,
+        pre_shard_weights_hook=None,
+        dry_run: bool = False,
+    ):
+        """Compile text model only (skip vision model trace)."""
+        from neuronx_distributed_inference.models.application_base import (
+            NeuronApplicationBase,
+        )
+
+        NeuronApplicationBase.compile(
+            self,
+            compiled_model_path,
+            debug=debug,
+            pre_shard_weights_hook=pre_shard_weights_hook,
+            dry_run=dry_run,
+        )
+
+    @classmethod
+    def get_state_dict(cls, model_name_or_path: str, config):
+        """Convert HuggingFace Qwen3-ASR weights to NxDI format.
+
+        Key mappings:
+        - thinker.model.* -> language_model.* (text decoder layers)
+        - thinker.lm_head.* -> lm_head.* (output projection)
+        - thinker.audio_tower.* -> excluded (handled by traced encoder)
+        """
+        raw_sd = nxdi_load_sd(model_name_or_path)
+        converted_sd = {}
+
+        for key, value in raw_sd.items():
+            if key.startswith("thinker.audio_tower."):
+                continue  # Encoder handled separately
+            if key.startswith("thinker.model."):
+                new_key = "language_model." + key[len("thinker.model.") :]
+                converted_sd[new_key] = value
+            elif key.startswith("thinker.lm_head."):
+                new_key = key[len("thinker.") :]
+                converted_sd[new_key] = value
+            else:
+                converted_sd[key] = value
+
+        model_sd = NeuronQwen3VLTextForCausalLM.convert_hf_to_neuron_state_dict(
+            converted_sd, config.text_config
+        )
+
+        # Handle tied embeddings
+        if getattr(config.text_config, "tie_word_embeddings", False):
+            if "embed_tokens.weight" in model_sd and "lm_head.weight" not in model_sd:
+                model_sd["lm_head.weight"] = model_sd["embed_tokens.weight"]
+
+        return model_sd
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        seq_ids: Optional[torch.Tensor] = None,
+        sampling_params: Optional[torch.Tensor] = None,
+        audio_embeddings: Optional[torch.Tensor] = None,
+        **kwargs,
+    ):
+        """Forward pass with audio embedding scatter.
+
+        During prefill (input_ids seq_len > 1):
+            - Scatters audio_embeddings at AUDIO_PAD_ID positions
+            - Computes mRoPE position_ids (3 axes, all same for ASR)
+
+        During decode (input_ids seq_len == 1):
+            - Uses dummy vision inputs (no scatter)
+            - Increments mRoPE positions using stored rope_deltas
+        """
+        pad_limit = self.get_padding_length(input_ids)
+
+        # Determine if we're in prefill with audio or decode/text-only
+        if (
+            audio_embeddings is not None
+            and input_ids.shape[-1] > 1
+            and audio_embeddings.sum() != 0
+        ):
+            # Prefill with audio: scatter embeddings at AUDIO_PAD positions
+            vision_mask = (input_ids == AUDIO_PAD_ID).unsqueeze(-1).to(torch.bool)
+            vision_mask = generate_positions_from_mask(vision_mask.squeeze())
+            vision_mask = pad_positions(vision_mask, pad_limit, (pad_limit - 1))
+
+            vision_embeddings = audio_embeddings.to(
+                self.text_config.neuron_config.torch_dtype
+            )
+            embedding_dim = vision_embeddings.shape[-1]
+            vision_embeddings = vision_embeddings.view(-1, embedding_dim).unsqueeze(0)
+            vision_embeddings = pad_vision_embeddings(vision_embeddings, pad_limit)
+        else:
+            # Text-only or decode phase: use dummy inputs
+            vision_embeddings, vision_mask, _ = (
+                self.text_model_wrapper.get_dummy_vision_inputs(
+                    config=self.text_config,
+                    input_ids=input_ids,
+                    n_active_tokens=pad_limit,
+                    fill_value=(pad_limit - 1),
+                )
+            )
+
+        # Compute mRoPE position IDs (3 axes, all identical for ASR)
+        if input_ids.shape[-1] > 1:
+            # Prefill: compute positions from attention mask
+            if attention_mask is not None:
+                pos = attention_mask.long().cumsum(-1) - 1
+                pos.masked_fill_(attention_mask == 0, 1)
+            else:
+                seq_len = input_ids.shape[1]
+                pos = torch.arange(seq_len).unsqueeze(0)
+            # Expand to 3 mRoPE axes [temporal, height, width] - all same for ASR
+            rotary_position_ids = pos.unsqueeze(0).expand(3, -1, -1)
+
+            # Store rope_deltas for decode phase
+            if attention_mask is not None:
+                max_pos = pos.max(-1, keepdim=True)[0]
+                self.rope_deltas = (
+                    max_pos + 1 - attention_mask.sum(-1, keepdim=True)
+                ).long()
+            else:
+                self.rope_deltas = torch.zeros(1, 1, dtype=torch.long)
+        else:
+            # Decode: increment position based on stored delta
+            batch_size = input_ids.shape[0]
+            if self.rope_deltas is not None:
+                delta = self.rope_deltas.to(input_ids.device)
+                delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0)
+            else:
+                delta = 0
+            rotary_position_ids = copy.deepcopy(position_ids)
+            rotary_position_ids = rotary_position_ids.view(1, -1).expand(batch_size, -1)
+            rotary_position_ids = rotary_position_ids.add(delta)
+            rotary_position_ids = rotary_position_ids.unsqueeze(0).expand(3, -1, -1)
+
+        deepstack_vision_embeds = torch.zeros(0)
+
+        # Call grandparent forward (bypasses NeuronQwen3VLForCausalLM's vision handling)
+        output_token = super(NeuronQwen3VLForCausalLM, self).forward(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            seq_ids=seq_ids,
+            sampling_params=sampling_params,
+            rotary_position_ids=rotary_position_ids,
+            vision_embeddings=vision_embeddings,
+            vision_mask=vision_mask,
+            deepstack_vision_embeds=deepstack_vision_embeds,
+        )
+        return output_token
+
+
+def create_inference_config(
+    model_path: str,
+    tp_degree: int = 4,
+    batch_size: int = 1,
+    n_positions: int = 1024,
+) -> Qwen3VLInferenceConfig:
+    """Create Qwen3VLInferenceConfig for Qwen3-ASR text decoder.
+
+    Args:
+        model_path: Path to HuggingFace model directory
+        tp_degree: Tensor parallel degree (2 or 4 for trn2.3xlarge)
+        batch_size: Inference batch size
+        n_positions: KV cache length (1024 sufficient for most ASR)
+
+    Returns:
+        Qwen3VLInferenceConfig configured for Qwen3-ASR
+    """
+    from transformers import AutoConfig
+
+    hf_config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+    text_config = hf_config.thinker_config.text_config
+
+    text_neuron_config = Qwen3VLNeuronConfig(
+        tp_degree=tp_degree,
+        batch_size=batch_size,
+        n_positions=n_positions,
+        seq_len=n_positions,
+        max_context_length=n_positions,
+        on_device_sampling_config=None,
+        torch_dtype=torch.bfloat16,
+    )
+    # Dummy vision config (required by Qwen3VLInferenceConfig but not used)
+    vision_neuron_config = Qwen3VLNeuronConfig(
+        tp_degree=tp_degree,
+        batch_size=batch_size,
+        seq_len=512,
+        n_positions=512,
+        on_device_sampling_config=None,
+        torch_dtype=torch.bfloat16,
+    )
+
+    text_config_dict = {
+        "hidden_size": text_config.hidden_size,
+        "num_hidden_layers": text_config.num_hidden_layers,
+        "num_attention_heads": text_config.num_attention_heads,
+        "num_key_value_heads": text_config.num_key_value_heads,
+        "head_dim": text_config.head_dim,
+        "intermediate_size": text_config.intermediate_size,
+        "vocab_size": text_config.vocab_size,
+        "max_position_embeddings": text_config.max_position_embeddings,
+        "rope_theta": text_config.rope_theta,
+        "rms_norm_eps": text_config.rms_norm_eps,
+        "tie_word_embeddings": text_config.tie_word_embeddings,
+        "attention_bias": getattr(text_config, "attention_bias", False),
+        "hidden_act": "silu",
+        "rope_scaling": {
+            "type": "mrope",
+            "rope_type": "default",
+            "mrope_section": [24, 20, 20],
+        },
+        "pad_token_id": EOS_ID,
+        "attention_dropout": 0.0,
+        "bos_token_id": EOS_ID,
+        "dtype": "bfloat16",
+        "eos_token_id": IM_END_ID,
+        "initializer_range": 0.02,
+        "output_attentions": False,
+        "output_hidden_states": False,
+    }
+
+    vision_config_dict = {
+        "hidden_size": 1024,
+        "num_hidden_layers": 1,
+        "num_attention_heads": 16,
+        "num_key_value_heads": 16,
+        "head_dim": 64,
+        "intermediate_size": 4096,
+        "image_size": 224,
+        "patch_size": 14,
+        "spatial_merge_size": 2,
+        "deepstack_visual_indexes": [],
+        "vocab_size": text_config.vocab_size,
+        "max_position_embeddings": 512,
+        "depth": 1,
+        "hidden_act": "gelu",
+        "in_channels": 3,
+        "initializer_range": 0.02,
+        "num_heads": 16,
+        "num_position_embeddings": 256,
+        "out_hidden_size": text_config.hidden_size,
+        "temporal_patch_size": 2,
+    }
+
+    config = Qwen3VLInferenceConfig(
+        text_neuron_config=text_neuron_config,
+        vision_neuron_config=vision_neuron_config,
+        text_config=text_config_dict,
+        vision_config=vision_config_dict,
+        _name_or_path=model_path,
+        image_token_id=AUDIO_PAD_ID,
+    )
+    return config
diff --git a/contrib/models/Qwen3-ASR-1.7B/test/__init__.py b/contrib/models/Qwen3-ASR-1.7B/test/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/contrib/models/Qwen3-ASR-1.7B/test/integration/__init__.py b/contrib/models/Qwen3-ASR-1.7B/test/integration/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/contrib/models/Qwen3-ASR-1.7B/test/integration/test_model.py b/contrib/models/Qwen3-ASR-1.7B/test/integration/test_model.py
new file mode 100644
index 00000000..acceb12e
--- /dev/null
+++ b/contrib/models/Qwen3-ASR-1.7B/test/integration/test_model.py
@@ -0,0 +1,425 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Integration tests for Qwen3-ASR-1.7B on NeuronX.
+
+Tests:
+1. Smoke test: model loads and generates tokens
+2. Logit validation: first-token logits match CPU reference
+3. E2E accuracy: transcription matches expected text
+
+Prerequisites:
+- Model downloaded: Qwen/Qwen3-ASR-1.7B
+- Compiled artifacts available (run compile first)
+- Traced encoder NEFFs available
+
+Configuration:
+- Set MODEL_PATH, COMPILED_MODEL_PATH, ENCODER_DIR below
+"""
+
+import sys
+import time
+from pathlib import Path
+
+import pytest
+import torch
+import numpy as np
+
+# Add src to path
+sys.path.insert(0, str(Path(__file__).parent.parent.parent / "src"))
+from modeling_qwen3_asr import (
+    NeuronQwen3ASRForCausalLM,
+    create_inference_config,
+    get_encoder_output_length,
+    AUDIO_PAD_ID,
+    AUDIO_START_ID,
+    AUDIO_END_ID,
+    IM_START_ID,
+    IM_END_ID,
+    EOS_ID,
+)
+from audio_encoder import load_encoders, select_bucket
+
+# ===== CONFIGURATION =====
+# Update these paths for your environment
+MODEL_PATH = "/home/ubuntu/.cache/huggingface/hub/models--Qwen--Qwen3-ASR-1.7B/snapshots/7278e1e70fe206f11671096ffdd38061171dd6e5"
+COMPILED_MODEL_PATH = "/mnt/models/compiled/qwen3_asr_vl_text_tp4"
+ENCODER_DIR = "/mnt/models/compiled/qwen3_asr_encoder"
+TP_DEGREE = 4
+N_POSITIONS = 1024
+MAX_NEW_TOKENS = 128
+
+
+@pytest.fixture(scope="module")
+def model():
+    """Load compiled model (shared across all tests in this module)."""
+    config = create_inference_config(
+        MODEL_PATH, tp_degree=TP_DEGREE, n_positions=N_POSITIONS
+    )
+    model = NeuronQwen3ASRForCausalLM(COMPILED_MODEL_PATH, config)
+    model.load(COMPILED_MODEL_PATH)
+    return model
+
+
+@pytest.fixture(scope="module")
+def encoders():
+    """Load traced encoder NEFFs."""
+    return load_encoders(ENCODER_DIR)
+
+
+@pytest.fixture(scope="module")
+def tokenizer():
+    """Load tokenizer."""
+    from transformers import AutoTokenizer
+
+    return AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
+
+
+@pytest.fixture(scope="module")
+def feature_extractor():
+    """Load Whisper feature extractor."""
+    from transformers import WhisperFeatureExtractor
+
+    return WhisperFeatureExtractor.from_pretrained(MODEL_PATH)
+
+
+def generate_greedy(
+    model, input_ids, attention_mask, audio_embeddings=None, max_tokens=MAX_NEW_TOKENS
+):
+    """Run greedy autoregressive generation."""
+    import copy
+
+    seq_ids = torch.zeros(1, dtype=torch.long)
+    sampling_params = torch.zeros(1, 3)
+
+    # Pad input
+    seq_len = input_ids.shape[1]
+    padded_input_ids = torch.nn.functional.pad(
+        input_ids, (0, N_POSITIONS - seq_len), value=EOS_ID
+    )
+
+    position_ids = attention_mask.long().cumsum(-1) - 1
+    position_ids.masked_fill_(attention_mask == 0, 1)
+
+    # Prefill
+    with torch.no_grad():
+        output = model.forward(
+            input_ids=padded_input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            seq_ids=seq_ids,
+            sampling_params=sampling_params,
+            audio_embeddings=audio_embeddings,
+        )
+
+    logits = (
+        output.logits
+        if hasattr(output, "logits")
+        else (output[0] if isinstance(output, (tuple, list)) else output)
+    )
+    first_token = logits[0, -1, :].argmax(dim=-1).item()
+
+    generated = [first_token]
+    current_pos = seq_len
+
+    # Decode loop
+    with torch.no_grad():
+        for _ in range(max_tokens - 1):
+            if generated[-1] in (EOS_ID, IM_END_ID) or current_pos >= N_POSITIONS - 1:
+                break
+
+            next_id = torch.tensor([[generated[-1]]], dtype=torch.long)
+            decode_mask = torch.zeros(1, N_POSITIONS, dtype=torch.long)
+            decode_mask[0, : current_pos + 1] = 1
+            decode_pos = torch.tensor([[current_pos]], dtype=torch.long)
+
+            output = model.forward(
+                input_ids=next_id,
+                attention_mask=decode_mask,
+                position_ids=decode_pos,
+                seq_ids=seq_ids,
+                sampling_params=sampling_params,
+                audio_embeddings=None,
+            )
+
+            logits = (
+                output.logits
+                if hasattr(output, "logits")
+                else (output[0] if isinstance(output, (tuple, list)) else output)
+            )
+            next_token = logits[0, -1, :].argmax(dim=-1).item()
+            generated.append(next_token)
+            current_pos += 1
+
+    return generated, logits
+
+
+class TestSmokeTest:
+    """Basic smoke tests: model loads and produces output."""
+
+    def test_model_loads(self, model):
+        """Model loads without errors."""
+        assert model is not None
+        assert model.is_loaded_to_neuron
+
+    def test_text_only_generation(self, model, tokenizer):
+        """Model generates tokens for text-only input (no audio)."""
+        # Simple text prompt without audio
+        text = "<|im_start|>system\n<|im_end|>\n<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\n"
+        input_ids = tokenizer(text, return_tensors="pt")["input_ids"]
+        seq_len = input_ids.shape[1]
+
+        attention_mask = torch.ones(1, N_POSITIONS, dtype=torch.long)
+        attention_mask[0, seq_len:] = 0
+
+        generated, _ = generate_greedy(model, input_ids, attention_mask, max_tokens=10)
+        assert len(generated) > 0, "Model should generate at least 1 token"
+
+
+class TestEncoderIntegration:
+    """Tests with audio encoder."""
+
+    def test_encoder_loads(self, encoders):
+        """All encoder buckets load successfully."""
+        assert 500 in encoders
+        assert 1000 in encoders
+        assert 3000 in encoders
+
+    def test_encoder_output_shape(self, encoders):
+        """Encoder produces correct output shapes."""
+        for T, expected_tokens in [(500, 65), (1000, 130), (3000, 390)]:
+            output = encoders[T](torch.randn(128, T))
+            assert output.shape == (expected_tokens, 2048), (
+                f"T={T}: expected ({expected_tokens}, 2048), got {output.shape}"
+            )
+
+    def test_encoder_latency(self, encoders):
+        """Encoder latency is within expected range."""
+        mel = torch.randn(128, 1000)
+        times = []
+        for _ in range(10):
+            t0 = time.time()
+            with torch.no_grad():
+                _ = encoders[1000](mel)
+            times.append(time.time() - t0)
+
+        avg_ms = np.mean(times[2:]) * 1000  # Skip warmup
+        assert avg_ms < 50, f"Encoder T=1000 should be <50ms, got {avg_ms:.1f}ms"
+
+
+class TestE2EAccuracy:
+    """End-to-end accuracy validation."""
+
+    def test_reference_transcription(
+        self, model, encoders, tokenizer, feature_extractor
+    ):
+        """E2E pipeline produces correct transcription for reference audio.
+
+        Uses the LibriSpeech test sample: "Mr. Quilter is the apostle..."
+        """
+        import soundfile as sf
+
+        # Load test audio (must exist on test machine)
+        audio_path = "/tmp/test_speech.wav"
+        try:
+            audio, sr = sf.read(audio_path)
+        except FileNotFoundError:
+            pytest.skip(f"Test audio not found at {audio_path}")
+
+        audio = audio.astype(np.float32)
+
+        # Feature extraction
+        mel_output = feature_extractor(
+            audio, sampling_rate=16000, return_tensors="pt", return_attention_mask=True
+        )
+        mel_features = mel_output["input_features"][0]
+        mel_attention_mask = mel_output["attention_mask"][0]
+        actual_mel_len = int(mel_attention_mask.sum().item())
+        bucket_T = select_bucket(actual_mel_len)
+        N_tokens = get_encoder_output_length(actual_mel_len)
+
+        mel_input = mel_features[:, :bucket_T]
+        if mel_input.shape[1] < bucket_T:
+            mel_input = torch.nn.functional.pad(
+                mel_input, (0, bucket_T - mel_input.shape[1])
+            )
+
+        # Encode
+        with torch.no_grad():
+            audio_embeddings = encoders[bucket_T](mel_input)[:N_tokens]
+
+        # Build input_ids
+        prefix_ids = [
+            IM_START_ID,
+            8948,
+            198,
+            IM_END_ID,
+            198,
+            IM_START_ID,
+            872,
+            198,
+            AUDIO_START_ID,
+        ]
+        audio_ids = [AUDIO_PAD_ID] * N_tokens
+        suffix_ids = [AUDIO_END_ID, IM_END_ID, 198, IM_START_ID, 77091, 198]
+        all_ids = prefix_ids + audio_ids + suffix_ids
+        seq_len = len(all_ids)
+
+        input_ids = torch.tensor([all_ids], dtype=torch.long)
+        attention_mask = torch.ones(1, N_POSITIONS, dtype=torch.long)
+        attention_mask[0, seq_len:] = 0
+
+        # Generate
+        generated, _ = generate_greedy(
+            model, input_ids, attention_mask, audio_embeddings
+        )
+
+        # Extract transcription
+        transcription = tokenizer.decode(generated, skip_special_tokens=False)
+        if "<asr_text>" in transcription:
+            text = transcription.split("<asr_text>", 1)[1]
+            for special in ["<|im_end|>", "<|endoftext|>"]:
+                text = text.replace(special, "")
+            text = text.strip()
+        else:
+            text = tokenizer.decode(generated, skip_special_tokens=True).strip()
+
+        expected = "Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel."
+        assert text.lower() == expected.lower(), (
+            f"Expected: '{expected}', Got: '{text}'"
+        )
+
+    def test_silence_produces_empty(
+        self, model, encoders, tokenizer, feature_extractor
+    ):
+        """Pure silence should produce empty/minimal transcription."""
+        silence = np.zeros(48000, dtype=np.float32)  # 3s silence
+
+        mel_output = feature_extractor(
+            silence,
+            sampling_rate=16000,
+            return_tensors="pt",
+            return_attention_mask=True,
+        )
+        mel_features = mel_output["input_features"][0]
+        mel_attention_mask = mel_output["attention_mask"][0]
+        actual_mel_len = int(mel_attention_mask.sum().item())
+        bucket_T = select_bucket(actual_mel_len)
+        N_tokens = get_encoder_output_length(actual_mel_len)
+
+        mel_input = mel_features[:, :bucket_T]
+        if mel_input.shape[1] < bucket_T:
+            mel_input = torch.nn.functional.pad(
+                mel_input, (0, bucket_T - mel_input.shape[1])
+            )
+
+        with torch.no_grad():
+            audio_embeddings = encoders[bucket_T](mel_input)[:N_tokens]
+
+        prefix_ids = [
+            IM_START_ID,
+            8948,
+            198,
+            IM_END_ID,
+            198,
+            IM_START_ID,
+            872,
+            198,
+            AUDIO_START_ID,
+        ]
+        audio_ids = [AUDIO_PAD_ID] * N_tokens
+        suffix_ids = [AUDIO_END_ID, IM_END_ID, 198, IM_START_ID, 77091, 198]
+        all_ids = prefix_ids + audio_ids + suffix_ids
+        seq_len = len(all_ids)
+
+        input_ids = torch.tensor([all_ids], dtype=torch.long)
+        attention_mask = torch.ones(1, N_POSITIONS, dtype=torch.long)
+        attention_mask[0, seq_len:] = 0
+
+        generated, _ = generate_greedy(
+            model, input_ids, attention_mask, audio_embeddings
+        )
+
+        # Silence should produce very few tokens (language tag + EOS)
+        assert len(generated) <= 10, (
+            f"Silence should produce <=10 tokens, got {len(generated)}"
+        )
+
+
+class TestPerformance:
+    """Performance benchmarks."""
+
+    def test_ttft_under_threshold(self, model, encoders, feature_extractor):
+        """TTFT should be under 50ms for 5s audio."""
+        mel_input = torch.randn(128, 500)  # 5s bucket
+
+        with torch.no_grad():
+            audio_embeddings = encoders[500](mel_input)[:65]
+
+        N_tokens = 65
+        prefix_ids = [
+            IM_START_ID,
+            8948,
+            198,
+            IM_END_ID,
+            198,
+            IM_START_ID,
+            872,
+            198,
+            AUDIO_START_ID,
+        ]
+        audio_ids = [AUDIO_PAD_ID] * N_tokens
+        suffix_ids = [AUDIO_END_ID, IM_END_ID, 198, IM_START_ID, 77091, 198]
+        all_ids = prefix_ids + audio_ids + suffix_ids
+        seq_len = len(all_ids)
+
+        input_ids = torch.tensor([all_ids], dtype=torch.long)
+        attention_mask = torch.ones(1, N_POSITIONS, dtype=torch.long)
+        attention_mask[0, seq_len:] = 0
+        padded_input_ids = torch.nn.functional.pad(
+            input_ids, (0, N_POSITIONS - seq_len), value=EOS_ID
+        )
+        position_ids = attention_mask.long().cumsum(-1) - 1
+        position_ids.masked_fill_(attention_mask == 0, 1)
+        seq_ids = torch.zeros(1, dtype=torch.long)
+        sampling_params = torch.zeros(1, 3)
+
+        # Warmup
+        for _ in range(3):
+            with torch.no_grad():
+                _ = encoders[500](mel_input)
+                _ = model.forward(
+                    input_ids=padded_input_ids,
+                    attention_mask=attention_mask,
+                    position_ids=position_ids,
+                    seq_ids=seq_ids,
+                    sampling_params=sampling_params,
+                    audio_embeddings=audio_embeddings,
+                )
+
+        # Measure TTFT
+        times = []
+        for _ in range(5):
+            t0 = time.time()
+            with torch.no_grad():
+                _ = encoders[500](mel_input)
+                _ = model.forward(
+                    input_ids=padded_input_ids,
+                    attention_mask=attention_mask,
+                    position_ids=position_ids,
+                    seq_ids=seq_ids,
+                    sampling_params=sampling_params,
+                    audio_embeddings=audio_embeddings,
+                )
+            times.append(time.time() - t0)
+
+        avg_ttft_ms = np.mean(times) * 1000
+        assert avg_ttft_ms < 50, (
+            f"TTFT should be <50ms for 5s audio, got {avg_ttft_ms:.1f}ms"
+        )
+        print(f"  TTFT (5s audio): {avg_ttft_ms:.1f}ms")
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v", "--capture=tee-sys"])
diff --git a/contrib/models/Qwen3-ASR-1.7B/test/unit/__init__.py b/contrib/models/Qwen3-ASR-1.7B/test/unit/__init__.py
new file mode 100644
index 00000000..e69de29b

From 0044da7ded282257136014525dd9e11cbfb32ebc Mon Sep 17 00:00:00 2001
From: Jim Burtoft <jburtoft@amazon.com>
Date: Sat, 9 May 2026 01:42:25 -0400
Subject: [PATCH 2/2] Add vLLM-neuron integration for Qwen3-ASR-1.7B

Adds vllm/ directory with:
- README.md: Patch instructions for vllm-neuron (constants, model loader,
  model runner, platform, utils)
- neuron_qwen3_asr_vllm.py: Full NeuronQwen3ASRForCausalLM class
- start-vllm-server.sh: Server launch script
- test_transcription.py: API test script

Validated: 140ms E2E latency for 2.9s audio via OpenAI chat completions API
on trn2.3xlarge (TP=4, SDK 2.29, vLLM 0.16.0 + vllm-neuron 0.5.0).
---
 contrib/models/Qwen3-ASR-1.7B/README.md       |  21 +
 contrib/models/Qwen3-ASR-1.7B/vllm/README.md  | 195 ++++++++
 .../vllm/neuron_qwen3_asr_vllm.py             | 460 ++++++++++++++++++
 .../Qwen3-ASR-1.7B/vllm/start-vllm-server.sh  |  54 ++
 .../Qwen3-ASR-1.7B/vllm/test_transcription.py | 132 +++++
 5 files changed, 862 insertions(+)
 create mode 100644 contrib/models/Qwen3-ASR-1.7B/vllm/README.md
 create mode 100644 contrib/models/Qwen3-ASR-1.7B/vllm/neuron_qwen3_asr_vllm.py
 create mode 100644 contrib/models/Qwen3-ASR-1.7B/vllm/start-vllm-server.sh
 create mode 100644 contrib/models/Qwen3-ASR-1.7B/vllm/test_transcription.py

diff --git a/contrib/models/Qwen3-ASR-1.7B/README.md b/contrib/models/Qwen3-ASR-1.7B/README.md
index 8a1bc767..baf8b7de 100644
--- a/contrib/models/Qwen3-ASR-1.7B/README.md
+++ b/contrib/models/Qwen3-ASR-1.7B/README.md
@@ -135,6 +135,27 @@ input_ids = torch.tensor([prefix + audio_ids + suffix], dtype=torch.long)
 # Output format: "language English<asr_text>transcription text<|im_end|>"
 ```
 
+### 3. vLLM Serving (OpenAI-compatible API)
+
+See [`vllm/README.md`](./vllm/README.md) for full setup instructions including patches to vllm-neuron.
+
+Quick start (after applying patches):
+
+```bash
+export NEURON_COMPILED_ARTIFACTS='/path/to/compiled/qwen3_asr_vl_text_tp4'
+export NEURON_ENCODER_PATH='/path/to/compiled/qwen3_asr_encoder'
+export NEURON_RT_VISIBLE_CORES='0-3'
+bash vllm/start-vllm-server.sh
+```
+
+Then transcribe via API:
+
+```bash
+curl http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{"model": "Qwen/Qwen3-ASR-1.7B", "messages": [{"role": "user", "content": [{"type": "input_audio", "input_audio": {"data": "<base64_wav>", "format": "wav"}}]}], "max_tokens": 256}'
+```
+
 ## Key Implementation Notes
 
 1. **rope_scaling must use "rope_type": "default"** (NOT "mrope") - mRoPE is applied externally via `rotary_position_ids`
diff --git a/contrib/models/Qwen3-ASR-1.7B/vllm/README.md b/contrib/models/Qwen3-ASR-1.7B/vllm/README.md
new file mode 100644
index 00000000..b52d8041
--- /dev/null
+++ b/contrib/models/Qwen3-ASR-1.7B/vllm/README.md
@@ -0,0 +1,195 @@
+# Running Qwen3-ASR-1.7B with vLLM on AWS Neuron
+
+## Prerequisites
+
+- trn2.3xlarge instance with SDK 2.29 DLAMI (`Deep Learning AMI Neuron (Ubuntu 24.04) 20260410`)
+- Pre-compiled encoder NEFFs and text decoder (see parent `README.md` for compilation steps)
+- Model weights: `Qwen/Qwen3-ASR-1.7B`
+
+## Setup
+
+### 1. Install vLLM-neuron
+
+```bash
+source /opt/aws_neuronx_venv_pytorch_2_9_nxd_inference/bin/activate
+git clone https://github.com/vllm-project/vllm-neuron.git
+cd vllm-neuron
+pip install --extra-index-url=https://pip.repos.neuron.amazonaws.com -e .
+```
+
+### 2. Configure Qwen3-ASR Support
+
+Apply the following patches to vllm-neuron:
+
+#### 2.1 Register Qwen3-ASR in `NEURON_MULTI_MODAL_MODELS`
+
+Modify `vllm_neuron/worker/constants.py`:
+
+```diff
+--- a/vllm_neuron/worker/constants.py
++++ b/vllm_neuron/worker/constants.py
+@@ -5,6 +5,7 @@ NEURON_MULTI_MODAL_MODELS = [
+     "MllamaForConditionalGeneration",
+     "LlavaForConditionalGeneration",
+     "Llama4ForConditionalGeneration",
++    "Qwen3ASRForConditionalGeneration",
+ ]
+```
+
+#### 2.2 Add `NeuronQwen3ASRForCausalLM` class to `vllm_neuron/worker/neuronx_distributed_model_loader.py`
+
+Add the following class (see `neuron_qwen3_asr_vllm.py` for the full implementation):
+
+```diff
++class NeuronQwen3ASRForCausalLM(NeuronMultiModalCausalLM):
++    """Qwen3-ASR multimodal model using decomposed pipeline:
++    - Traced encoder NEFFs (bucketed: 500, 1000, 3000 mel frames)
++    - NxDI Qwen3-VL text decoder with vision_embeddings scatter
++    """
++    ...  # See neuron_qwen3_asr_vllm.py for full 460-line implementation
+```
+
+The class handles:
+- Loading pre-compiled encoder NEFFs from `NEURON_ENCODER_PATH`
+- Loading the NxDI text decoder from `NEURON_COMPILED_ARTIFACTS`
+- Bucket selection for encoder (5s/10s/30s audio)
+- Mel feature extraction → encoder → audio embeddings
+- Scatter audio embeddings into token positions via `vision_mask`
+- mRoPE position computation (all 3 axes identical for ASR)
+- Routing to NxDI text decoder (CTE prefill / TKG decode)
+
+#### 2.3 Add dispatch case in `get_neuron_model()`
+
+Modify `vllm_neuron/worker/neuronx_distributed_model_runner.py`:
+
+```diff
+--- a/vllm_neuron/worker/neuronx_distributed_model_runner.py
++++ b/vllm_neuron/worker/neuronx_distributed_model_runner.py
+@@ -775,6 +775,8 @@ def get_neuron_model(
+     elif architecture == "Llama4ForConditionalGeneration":
+         model = NeuronLlama4ForCausalLM(model_config.hf_config)
++    elif architecture == "Qwen3ASRForConditionalGeneration":
++        model = NeuronQwen3ASRForCausalLM(model_config.hf_config)
+     else:
+         model = NeuronCausalLM(model_config.hf_config)
+```
+
+#### 2.4 Add Qwen3-ASR to audio pass-through in `_process_multi_modal_data_neuron`
+
+Modify `vllm_neuron/worker/neuronx_distributed_model_runner.py`:
+
+```diff
+--- a/vllm_neuron/worker/neuronx_distributed_model_runner.py
++++ b/vllm_neuron/worker/neuronx_distributed_model_runner.py
+@@ -1139,7 +1139,7 @@
+-        if self.model.architecture == "ShukaModel":
+-            pass  # Shuka-1 audio data (input_features) passes through directly
++        if self.model.architecture == "ShukaModel" or self.model.architecture == "Qwen3ASRForConditionalGeneration":
++            pass  # Audio data passes through directly (Shuka-1: input_features, Qwen3-ASR: input_audio_features)
+         elif self.model.model.config.model_type == "llava":
+```
+
+#### 2.5 Fix `_get_model_configs` for nested `thinker_config`
+
+Modify `vllm_neuron/worker/neuronx_distributed_model_loader.py` in the `_get_model_configs` function:
+
+```diff
+--- a/vllm_neuron/worker/neuronx_distributed_model_loader.py
++++ b/vllm_neuron/worker/neuronx_distributed_model_loader.py
+@@ -1535,7 +1535,10 @@ def _get_model_configs(config: PretrainedConfig) -> str:
+     # For multimodal models like Llava/Mllama, use text_config
+-    text_config = getattr(config, "text_config", None)
++    text_config = getattr(config, "text_config", None) or getattr(
++        getattr(config, "thinker_config", None), "text_config", None
++    )
+     if text_config is not None:
+         config = text_config
+```
+
+#### 2.6 Fix `get_num_layers_from_hf_config` for `thinker_config`
+
+Modify `vllm_neuron/worker/utils.py`:
+
+```diff
+--- a/vllm_neuron/worker/utils.py
++++ b/vllm_neuron/worker/utils.py
+@@ -XX,6 +XX,10 @@ def get_num_layers_from_hf_config(config):
++    # Handle thinker_config nesting (Qwen3-ASR)
++    thinker_config = getattr(config, "thinker_config", None)
++    if thinker_config is not None:
++        text_config = getattr(thinker_config, "text_config", None)
++        if text_config is not None and hasattr(text_config, "num_hidden_layers"):
++            return text_config.num_hidden_layers
+```
+
+#### 2.7 Fix infinite recursion in `platform.py`
+
+Modify `vllm_neuron/platform.py` in `_register_neuron_multimodal_models()`:
+
+```diff
+--- a/vllm_neuron/platform.py
++++ b/vllm_neuron/platform.py
+@@ -131,7 +131,7 @@ def _register_neuron_multimodal_models():
+     for arch in NEURON_MULTI_MODAL_MODELS:
+-        if ModelRegistry._try_inspect_model_cls(arch) is not None:
++        if arch in ModelRegistry.models:
+             ModelRegistry.register_model(arch, NeuronMultiModalCausalLM)
+```
+
+### 3. Set Environment Variables
+
+```bash
+# Path to pre-compiled text decoder NEFFs
+export NEURON_COMPILED_ARTIFACTS='/path/to/compiled/qwen3_asr_vl_text_tp4'
+
+# Path to pre-compiled encoder NEFFs (encoder_T500.pt, encoder_T1000.pt, encoder_T3000.pt)
+export NEURON_ENCODER_PATH='/path/to/compiled/qwen3_asr_encoder'
+
+# Visible cores (4 cores for TP=4)
+export NEURON_RT_VISIBLE_CORES='0-3'
+```
+
+### 4. Run Inference
+
+#### 4.1 Start vLLM Server
+
+```bash
+bash start-vllm-server.sh
+```
+
+Or manually:
+
+```bash
+python3 -m vllm.entrypoints.openai.api_server \
+    --model 'Qwen/Qwen3-ASR-1.7B' \
+    --tensor-parallel-size 4 \
+    --max-model-len 1024 \
+    --max-num-seqs 1 \
+    --block-size 128 \
+    --no-enable-prefix-caching \
+    --port 8000 \
+    --trust-remote-code \
+    --additional-config '{"override_neuron_config": {"text_neuron_config": {"tp_degree": 4, "batch_size": 1, "n_positions": 1024, "seq_len": 1024}}}'
+```
+
+#### 4.2 Test Transcription
+
+```bash
+python3 test_transcription.py
+```
+
+## Performance
+
+| Metric | Value |
+|--------|-------|
+| E2E latency (2.9s audio, 17 tokens) | 140ms |
+| TPOT (raw, from benchmark) | 4.9ms |
+| TPOT (via vLLM API, includes overhead) | ~8.2ms |
+| Audio throughput | ~49.7 audio-sec/wall-sec |
+
+## Known Limitations
+
+- **Batch size**: Currently limited to `max-num-seqs=1` due to NxDI `scatter_by_index_put()` assuming BS=1 for multimodal prefill
+- **Prefix caching**: Must be disabled (`--no-enable-prefix-caching`)
+- **Block size**: Maximum 256 (use 128)
+- **Transcription API**: Only chat completions endpoint tested; `/v1/audio/transcriptions` not yet validated
diff --git a/contrib/models/Qwen3-ASR-1.7B/vllm/neuron_qwen3_asr_vllm.py b/contrib/models/Qwen3-ASR-1.7B/vllm/neuron_qwen3_asr_vllm.py
new file mode 100644
index 00000000..d6c49585
--- /dev/null
+++ b/contrib/models/Qwen3-ASR-1.7B/vllm/neuron_qwen3_asr_vllm.py
@@ -0,0 +1,460 @@
+class NeuronQwen3ASRForCausalLM(NeuronMultiModalCausalLM):
+    """Qwen3-ASR-1.7B audio-language model for Neuron.
+
+    Uses the NxDI NeuronQwen3VLForCausalLM text decoder (same Qwen3 architecture
+    with mRoPE and QK-norm) with separately traced Whisper encoder NEFFs.
+
+    Audio flow:
+    - execute_model() extracts mel features from multi_modal_kwargs
+    - Selects appropriate encoder bucket (5s/10s/30s)
+    - Runs traced encoder NEFF to get audio embeddings
+    - Constructs vision_embeddings/vision_mask for CTE scatter
+    - forward() passes to NxDI CTE (prefill) or TKG (decode)
+    """
+
+    AUDIO_TOKEN_ID = 151676  # <|audio_pad|> placeholder token
+
+    def __init__(self, config: PretrainedConfig) -> None:
+        super().__init__(config)
+        self.encoders = {}  # bucket_T -> traced NEFF
+        self._rope_deltas = None
+
+    def load_weights(self, model_name_or_path: str, architecture: str, **kwargs):
+        """Load pre-compiled Qwen3-ASR text decoder + traced encoder NEFFs.
+
+        Expects:
+        - NEURON_COMPILED_ARTIFACTS env var pointing to compiled text decoder
+        - NEURON_ENCODER_PATH env var pointing to directory with encoder_T{500,1000,3000}.pt
+        """
+        import torch_neuronx
+        from neuronx_distributed_inference.models.qwen3_vl.modeling_qwen3_vl import (
+            NeuronQwen3VLForCausalLM as NxDIQwen3VL,
+            Qwen3VLInferenceConfig,
+            Qwen3VLNeuronConfig,
+        )
+        from neuronx_distributed_inference.models.qwen3_vl.modeling_qwen3_vl_text import (
+            NeuronQwen3VLTextForCausalLM,
+        )
+        from neuronx_distributed_inference.models.application_base import (
+            load_state_dict as nxdi_load_sd,
+        )
+        from neuronx_distributed_inference.models.image_to_text_model_base import (
+            normalize_path,
+        )
+
+        neuron_config_dict = kwargs.get("neuron_config", {})
+        tp_degree = neuron_config_dict.get("tp_degree", 4)
+        batch_size = neuron_config_dict.get("batch_size", 1)
+        n_positions = neuron_config_dict.get("n_positions", 1024)
+        seq_len = neuron_config_dict.get("seq_len", 1024)
+
+        compiled_path = os.getenv("NEURON_COMPILED_ARTIFACTS")
+        encoder_path = os.getenv(
+            "NEURON_ENCODER_PATH", "/mnt/models/compiled/qwen3_asr_encoder"
+        )
+
+        if not compiled_path:
+            raise ValueError(
+                "NEURON_COMPILED_ARTIFACTS must be set for Qwen3-ASR "
+                "(e.g., /mnt/models/compiled/qwen3_asr_vl_text_tp4)"
+            )
+
+        logger.info("Loading Qwen3-ASR text decoder from %s", compiled_path)
+        logger.info("Loading Qwen3-ASR encoders from %s", encoder_path)
+
+        # --- Load HF config for Qwen3-ASR ---
+        # Use vLLM's registered Qwen3ASRConfig (avoids trust_remote_code issues in subprocesses)
+        from vllm.transformers_utils.configs.qwen3_asr import Qwen3ASRConfig
+        import json
+        config_path = os.path.join(model_name_or_path, "config.json")
+        with open(config_path) as f:
+            raw_config = json.load(f)
+        hf_config = Qwen3ASRConfig(**raw_config)
+        text_config = hf_config.thinker_config.text_config
+
+        # --- Build NxDI config ---
+        text_neuron_config = Qwen3VLNeuronConfig(
+            tp_degree=tp_degree,
+            batch_size=batch_size,
+            n_positions=n_positions,
+            seq_len=seq_len,
+            max_context_length=n_positions,
+            on_device_sampling_config=None,
+            torch_dtype=torch.bfloat16,
+            is_continuous_batching=batch_size > 1,
+        )
+        vision_neuron_config = Qwen3VLNeuronConfig(
+            tp_degree=tp_degree,
+            batch_size=1,
+            seq_len=512,
+            n_positions=512,
+            on_device_sampling_config=None,
+            torch_dtype=torch.bfloat16,
+        )
+
+        text_config_dict = {
+            "hidden_size": text_config.hidden_size,
+            "num_hidden_layers": text_config.num_hidden_layers,
+            "num_attention_heads": text_config.num_attention_heads,
+            "num_key_value_heads": text_config.num_key_value_heads,
+            "head_dim": text_config.head_dim,
+            "intermediate_size": text_config.intermediate_size,
+            "vocab_size": text_config.vocab_size,
+            "max_position_embeddings": text_config.max_position_embeddings,
+            "rope_theta": text_config.rope_theta,
+            "rms_norm_eps": text_config.rms_norm_eps,
+            "tie_word_embeddings": text_config.tie_word_embeddings,
+            "attention_bias": getattr(text_config, "attention_bias", False),
+            "hidden_act": "silu",
+            "rope_scaling": {
+                "type": "mrope",
+                "rope_type": "default",
+                "mrope_section": [24, 20, 20],
+            },
+            "pad_token_id": 151643,
+            "attention_dropout": 0.0,
+            "bos_token_id": 151643,
+            "dtype": "bfloat16",
+            "eos_token_id": 151645,
+            "initializer_range": 0.02,
+        }
+        vision_config_dict = {
+            "hidden_size": 1024,
+            "num_hidden_layers": 1,
+            "num_attention_heads": 16,
+            "num_key_value_heads": 16,
+            "head_dim": 64,
+            "intermediate_size": 4096,
+            "image_size": 224,
+            "patch_size": 14,
+            "spatial_merge_size": 2,
+            "deepstack_visual_indexes": [],
+            "vocab_size": text_config.vocab_size,
+            "max_position_embeddings": 512,
+            "depth": 1,
+            "hidden_act": "gelu",
+            "in_channels": 3,
+            "initializer_range": 0.02,
+            "num_heads": 16,
+            "num_position_embeddings": 256,
+            "out_hidden_size": text_config.hidden_size,
+            "temporal_patch_size": 2,
+        }
+
+        config = Qwen3VLInferenceConfig(
+            text_neuron_config=text_neuron_config,
+            vision_neuron_config=vision_neuron_config,
+            text_config=text_config_dict,
+            vision_config=vision_config_dict,
+            _name_or_path=model_name_or_path,
+            image_token_id=self.AUDIO_TOKEN_ID,
+        )
+
+        # --- Instantiate NxDI model (subclass with no vision encoder) ---
+        class _Qwen3ASRNxDI(NxDIQwen3VL):
+            vision_model_cls = None
+            vision_model_wrapper = None
+
+            def enable_vision_encoder(self, **kwargs):
+                pass
+
+            def load(self, compiled_model_path, start_rank_id=0, debug=False, **kwargs):
+                text_path = normalize_path(compiled_model_path) + "text_model/"
+                self.text_traced_model = torch.jit.load(text_path + "model.pt")
+                text_weights = self.get_text_builder(debug).shard_checkpoint()
+                start_rank_tensor = torch.tensor([start_rank_id], dtype=torch.int32)
+                self.text_traced_model.nxd_model.initialize(
+                    text_weights, start_rank_tensor
+                )
+                for model_wrapper in self.text_models:
+                    model_wrapper.model = self.text_traced_model
+                self.is_loaded_to_neuron = True
+
+            def compile(self, compiled_model_path, debug=False, **kwargs):
+                pass
+
+            @classmethod
+            def get_state_dict(cls, model_name_or_path, config):
+                raw_sd = nxdi_load_sd(model_name_or_path)
+                converted_sd = {}
+                for key, value in raw_sd.items():
+                    if key.startswith("thinker.audio_tower."):
+                        continue
+                    if key.startswith("thinker.model."):
+                        new_key = "language_model." + key[len("thinker.model.") :]
+                        converted_sd[new_key] = value
+                    elif key.startswith("thinker.lm_head."):
+                        new_key = key[len("thinker.") :]
+                        converted_sd[new_key] = value
+                    else:
+                        converted_sd[key] = value
+                model_sd = NeuronQwen3VLTextForCausalLM.convert_hf_to_neuron_state_dict(
+                    converted_sd, config.text_config
+                )
+                if getattr(config.text_config, "tie_word_embeddings", False):
+                    if (
+                        "embed_tokens.weight" in model_sd
+                        and "lm_head.weight" not in model_sd
+                    ):
+                        model_sd["lm_head.weight"] = model_sd["embed_tokens.weight"]
+                return model_sd
+
+        nxdi_model = _Qwen3ASRNxDI(compiled_path, config)
+        nxdi_model.load(compiled_path)
+
+        # Store the NxDI model
+        # Add missing HF config attributes needed by NeuronBaseForImageToText.forward()
+        for attr in ("output_attentions", "output_hidden_states", "use_return_dict"):
+            if not hasattr(nxdi_model.text_config, attr):
+                setattr(nxdi_model.text_config, attr, False)
+        self.model = nxdi_model
+        self._dtype = torch.bfloat16
+        self._n_positions = n_positions
+
+        # --- Load traced encoder NEFFs ---
+        encoder_buckets = [500, 1000, 3000]
+        for T in encoder_buckets:
+            neff_path = os.path.join(encoder_path, f"encoder_T{T}.pt")
+            if os.path.exists(neff_path):
+                enc = torch.jit.load(neff_path)
+                torch_neuronx.move_trace_to_device(enc, 0)
+                # Warmup
+                _ = enc(torch.randn(128, T))
+                self.encoders[T] = enc
+                logger.info("Loaded encoder bucket T=%d from %s", T, neff_path)
+            else:
+                logger.warning("Encoder NEFF not found: %s", neff_path)
+
+        if not self.encoders:
+            raise FileNotFoundError(
+                f"No encoder NEFFs found in {encoder_path}. "
+                "Expected files like encoder_T500.pt, encoder_T1000.pt, encoder_T3000.pt"
+            )
+
+        logger.info(
+            "Qwen3-ASR model loaded: %d encoder buckets, text decoder TP=%d",
+            len(self.encoders),
+            tp_degree,
+        )
+        return True, compiled_path
+
+    def _get_encoder_output_length(self, T_mel: int) -> int:
+        """Calculate encoder output token count for given mel length."""
+        input_lengths_leave = T_mel % 100
+        feat_lengths = (input_lengths_leave - 1) // 2 + 1
+        output_lengths = (
+            ((feat_lengths - 1) // 2 + 1 - 1) // 2 + 1 + (T_mel // 100) * 13
+        )
+        return output_lengths
+
+    def _select_bucket(self, T_mel: int) -> int:
+        """Select the smallest encoder bucket that fits the mel length."""
+        buckets = sorted(self.encoders.keys())
+        for b in buckets:
+            if T_mel <= b:
+                return b
+        return buckets[-1]
+
+    def _run_encoder(
+        self, mel_features: torch.Tensor, actual_mel_len: int
+    ) -> torch.Tensor:
+        """Run audio through the appropriate encoder bucket NEFF.
+
+        Args:
+            mel_features: Mel spectrogram [128, T] (128 mel bins for Qwen3-ASR)
+            actual_mel_len: Actual number of valid mel frames
+
+        Returns:
+            Audio embeddings [N_tokens, hidden_size]
+        """
+        bucket_T = self._select_bucket(actual_mel_len)
+        N_tokens = self._get_encoder_output_length(actual_mel_len)
+
+        # Pad/trim mel to bucket size
+        mel_input = mel_features[:, :bucket_T]
+        if mel_input.shape[1] < bucket_T:
+            mel_input = torch.nn.functional.pad(
+                mel_input, (0, bucket_T - mel_input.shape[1])
+            )
+
+        with torch.no_grad():
+            mel_input = mel_input.float()  # Encoder NEFF expects float32
+            output = self.encoders[bucket_T](mel_input)
+
+        # Trim to actual output length
+        return output[:N_tokens]  # [N_tokens, hidden_size]
+
+    def execute_model(self, model_input):
+        """Extract audio features and run the encoder during prefill."""
+        from neuronx_distributed_inference.models.llama4.utils.encoder_utils import (
+            generate_positions_from_mask,
+            pad_positions,
+            pad_vision_embeddings,
+        )
+
+        input_audio_features = None
+        feature_attention_mask = None
+
+        if model_input.multi_modal_kwargs is not None:
+            input_audio_features = model_input.multi_modal_kwargs.get(
+                "input_audio_features"
+            )
+            feature_attention_mask = model_input.multi_modal_kwargs.get(
+                "feature_attention_mask"
+            )
+
+        is_prefill = model_input.input_tokens.shape[-1] > 1
+        vision_embeddings = None
+        vision_mask = None
+
+        if input_audio_features is not None and is_prefill:
+            # Extract mel features
+            if isinstance(input_audio_features, list):
+                mel = input_audio_features[0]
+            else:
+                mel = input_audio_features
+            if mel.dim() == 3:
+                mel = mel.squeeze(0)  # [128, T]
+
+            # Get actual mel length from attention mask
+            if feature_attention_mask is not None:
+                if isinstance(feature_attention_mask, list):
+                    feature_attention_mask = feature_attention_mask[0]
+                if feature_attention_mask.dim() > 1:
+                    feature_attention_mask = feature_attention_mask.squeeze(0)
+                actual_mel_len = int(feature_attention_mask.sum().item())
+            else:
+                actual_mel_len = mel.shape[-1]
+
+            # Run encoder
+            audio_embeddings = self._run_encoder(mel, actual_mel_len)
+            audio_embeddings = audio_embeddings.to(self._dtype)
+            N_tokens = audio_embeddings.shape[0]
+
+            # Count placeholders in input_ids
+            num_placeholder = (
+                (model_input.input_tokens == self.AUDIO_TOKEN_ID).sum().item()
+            )
+            actual_scatter_count = min(N_tokens, num_placeholder)
+            audio_embeddings = audio_embeddings[:actual_scatter_count]
+
+            # Build vision_mask from audio token positions
+            audio_positions = (model_input.input_tokens == self.AUDIO_TOKEN_ID).squeeze(
+                0
+            )
+            position_indices = torch.where(audio_positions)[0][:actual_scatter_count]
+            trimmed_mask = torch.zeros(
+                model_input.input_tokens.shape[-1], dtype=torch.bool
+            )
+            trimmed_mask[position_indices] = True
+            vision_mask = generate_positions_from_mask(trimmed_mask)
+
+            # Pad to bucket size
+            bucket_size = self._n_positions
+            vision_mask = pad_positions(vision_mask, bucket_size, bucket_size - 1)
+
+            # Reshape embeddings for scatter: [1, N, hidden] -> padded
+            embedding_dim = audio_embeddings.shape[-1]
+            vision_embeddings = audio_embeddings.unsqueeze(0)  # [1, N, hidden]
+            vision_embeddings = pad_vision_embeddings(vision_embeddings, bucket_size)
+
+        hidden_states = self.forward(
+            input_ids=model_input.input_tokens,
+            positions=model_input.position_ids,
+            input_block_ids=model_input.input_block_ids,
+            sampling_params=model_input.sampling_params,
+            vision_embeddings=vision_embeddings,
+            vision_mask=vision_mask,
+        )
+        return hidden_states
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        input_block_ids: torch.Tensor,
+        sampling_params: torch.Tensor,
+        vision_embeddings: torch.Tensor | None = None,
+        vision_mask: torch.Tensor | None = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """Forward pass routing to NxDI's CTE (prefill) or TKG (decode).
+
+        Handles mRoPE position computation for Qwen3-ASR (all 3 axes identical
+        since audio is 1D, unlike Qwen3-VL which has spatial positions).
+        """
+        import copy as _copy
+
+        is_prefill = input_ids.shape[-1] > 1
+        batch_size = input_ids.shape[0]
+
+        # Compute mRoPE position IDs (all 3 axes same for ASR)
+        if is_prefill:
+            seq_len = input_ids.shape[1]
+            pos = torch.arange(seq_len, device=input_ids.device).unsqueeze(0)
+            rotary_position_ids = pos.unsqueeze(0).expand(3, -1, -1)
+            self._rope_deltas = torch.zeros(1, 1, dtype=torch.long)
+        else:
+            if self._rope_deltas is not None:
+                delta = self._rope_deltas.to(input_ids.device)
+                delta = delta.repeat_interleave(
+                    batch_size // max(delta.shape[0], 1), dim=0
+                )
+            else:
+                delta = 0
+            rotary_position_ids = _copy.deepcopy(positions)
+            rotary_position_ids = rotary_position_ids.view(1, -1).expand(batch_size, -1)
+            rotary_position_ids = rotary_position_ids.add(delta)
+            rotary_position_ids = rotary_position_ids.unsqueeze(0).expand(3, -1, -1)
+
+        # Get dummy vision inputs for decode or text-only prefill
+        if vision_embeddings is None:
+            from neuronx_distributed_inference.models.qwen3_vl.modeling_qwen3_vl_text import (
+                NeuronQwen3VLTextModelWrapper,
+            )
+
+            pad_limit = self._n_positions if is_prefill else 1
+            # For decode, we need bucket_size = n_positions for the mask
+            vision_embeddings, vision_mask, _ = (
+                NeuronQwen3VLTextModelWrapper.get_dummy_vision_inputs(
+                    config=self.model.text_config,
+                    input_ids=input_ids,
+                    n_active_tokens=self._n_positions,
+                    fill_value=(self._n_positions - 1),
+                )
+            )
+
+        deepstack_vision_embeds = torch.zeros(0)
+
+        with self._reordered(
+            input_block_ids,
+            input_ids=input_ids,
+            positions=positions,
+            sampling_params=sampling_params,
+            vision_embeddings=vision_embeddings,
+            vision_mask=vision_mask,
+        ) as (sorted_ids, inputs, restore):
+            # Call grandparent forward (NeuronBaseForImageToText) which accepts vision_embeddings
+            # NeuronQwen3VLForCausalLM.forward() only accepts pixel_values, not vision_embeddings
+            from neuronx_distributed_inference.models.image_to_text_model_base import NeuronBaseForImageToText
+            output = NeuronBaseForImageToText.forward(
+                self.model,
+                inputs["input_ids"].to(torch.int32),
+                attention_mask=None,
+                position_ids=inputs["positions"].to(torch.int32),
+                seq_ids=sorted_ids.flatten().to(torch.int32),
+                sampling_params=inputs["sampling_params"],
+                vision_embeddings=inputs.get("vision_embeddings"),
+                vision_mask=inputs.get("vision_mask"),
+                rotary_position_ids=rotary_position_ids,
+                deepstack_vision_embeds=deepstack_vision_embeds,
+            )
+
+            if self.model.config.neuron_config.on_device_sampling_config:
+                output = output.hidden_states
+            else:
+                output = output.logits[:, -1, :]
+
+            return restore(output)
+
+
diff --git a/contrib/models/Qwen3-ASR-1.7B/vllm/start-vllm-server.sh b/contrib/models/Qwen3-ASR-1.7B/vllm/start-vllm-server.sh
new file mode 100644
index 00000000..4982e874
--- /dev/null
+++ b/contrib/models/Qwen3-ASR-1.7B/vllm/start-vllm-server.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+# Start vLLM server for Qwen3-ASR-1.7B on Neuron
+#
+# Prerequisites:
+#   - Pre-compiled encoder NEFFs in NEURON_ENCODER_PATH
+#   - Pre-compiled text decoder in NEURON_COMPILED_ARTIFACTS
+#   - vllm-neuron installed with Qwen3-ASR patches applied
+
+set -e
+
+# Configuration
+MODEL_PATH="${MODEL_PATH:-Qwen/Qwen3-ASR-1.7B}"
+PORT="${PORT:-8000}"
+TP_DEGREE="${TP_DEGREE:-4}"
+MAX_MODEL_LEN="${MAX_MODEL_LEN:-1024}"
+MAX_NUM_SEQS="${MAX_NUM_SEQS:-1}"
+
+# Verify environment variables
+if [ -z "$NEURON_COMPILED_ARTIFACTS" ]; then
+    echo "ERROR: NEURON_COMPILED_ARTIFACTS not set"
+    echo "  Set to path containing compiled text decoder (e.g., /mnt/models/compiled/qwen3_asr_vl_text_tp4)"
+    exit 1
+fi
+
+if [ -z "$NEURON_ENCODER_PATH" ]; then
+    echo "ERROR: NEURON_ENCODER_PATH not set"
+    echo "  Set to path containing encoder NEFFs (encoder_T500.pt, encoder_T1000.pt, encoder_T3000.pt)"
+    exit 1
+fi
+
+if [ -z "$NEURON_RT_VISIBLE_CORES" ]; then
+    export NEURON_RT_VISIBLE_CORES="0-$((TP_DEGREE - 1))"
+    echo "NEURON_RT_VISIBLE_CORES not set, defaulting to: $NEURON_RT_VISIBLE_CORES"
+fi
+
+echo "Starting vLLM server for Qwen3-ASR-1.7B"
+echo "  Model: $MODEL_PATH"
+echo "  TP degree: $TP_DEGREE"
+echo "  Port: $PORT"
+echo "  Compiled artifacts: $NEURON_COMPILED_ARTIFACTS"
+echo "  Encoder path: $NEURON_ENCODER_PATH"
+echo "  Visible cores: $NEURON_RT_VISIBLE_CORES"
+echo ""
+
+python3 -m vllm.entrypoints.openai.api_server \
+    --model "$MODEL_PATH" \
+    --tensor-parallel-size "$TP_DEGREE" \
+    --max-model-len "$MAX_MODEL_LEN" \
+    --max-num-seqs "$MAX_NUM_SEQS" \
+    --block-size 128 \
+    --no-enable-prefix-caching \
+    --port "$PORT" \
+    --trust-remote-code \
+    --additional-config "{\"override_neuron_config\": {\"text_neuron_config\": {\"tp_degree\": $TP_DEGREE, \"batch_size\": 1, \"n_positions\": $MAX_MODEL_LEN, \"seq_len\": $MAX_MODEL_LEN}}}"
diff --git a/contrib/models/Qwen3-ASR-1.7B/vllm/test_transcription.py b/contrib/models/Qwen3-ASR-1.7B/vllm/test_transcription.py
new file mode 100644
index 00000000..0316085b
--- /dev/null
+++ b/contrib/models/Qwen3-ASR-1.7B/vllm/test_transcription.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python3
+"""
+Test Qwen3-ASR-1.7B transcription via vLLM OpenAI-compatible API.
+
+Usage:
+    # Start the vLLM server first:
+    bash start-vllm-server.sh
+
+    # Then run this test:
+    python3 test_transcription.py [audio_file.wav]
+"""
+
+import base64
+import json
+import sys
+import time
+import urllib.request
+
+VLLM_URL = "http://localhost:8000"
+
+
+def get_model_id():
+    """Get the model ID from the running server."""
+    req = urllib.request.Request(f"{VLLM_URL}/v1/models")
+    with urllib.request.urlopen(req, timeout=10) as resp:
+        data = json.loads(resp.read().decode())
+    return data["data"][0]["id"]
+
+
+def transcribe(audio_path: str, model_id: str) -> dict:
+    """Send audio to the vLLM server and get transcription."""
+    with open(audio_path, "rb") as f:
+        audio_bytes = f.read()
+    audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")
+
+    payload = {
+        "model": model_id,
+        "messages": [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "input_audio",
+                        "input_audio": {
+                            "data": audio_b64,
+                            "format": "wav",
+                        },
+                    },
+                ],
+            }
+        ],
+        "max_tokens": 256,
+        "temperature": 0.0,
+    }
+
+    headers = {"Content-Type": "application/json"}
+    req_data = json.dumps(payload).encode("utf-8")
+    req = urllib.request.Request(
+        f"{VLLM_URL}/v1/chat/completions",
+        data=req_data,
+        headers=headers,
+        method="POST",
+    )
+
+    t0 = time.time()
+    with urllib.request.urlopen(req, timeout=120) as resp:
+        result = json.loads(resp.read().decode())
+    elapsed = time.time() - t0
+
+    return {
+        "text": result["choices"][0]["message"]["content"],
+        "elapsed": elapsed,
+        "prompt_tokens": result.get("usage", {}).get("prompt_tokens", 0),
+        "completion_tokens": result.get("usage", {}).get("completion_tokens", 0),
+    }
+
+
+def parse_asr_output(raw_text: str) -> str:
+    """Parse Qwen3-ASR output format: 'language {lang}<asr_text>{text}'"""
+    marker = "<asr_text>"
+    if marker in raw_text:
+        return raw_text.split(marker, 1)[1].strip()
+    return raw_text.strip()
+
+
+def main():
+    audio_path = sys.argv[1] if len(sys.argv) > 1 else "/tmp/test_speech_real.wav"
+
+    # Check server health
+    try:
+        req = urllib.request.Request(f"{VLLM_URL}/health")
+        with urllib.request.urlopen(req, timeout=5) as resp:
+            if resp.status != 200:
+                print("ERROR: Server not healthy")
+                return 1
+    except Exception as e:
+        print(f"ERROR: Server not reachable: {e}")
+        print("Start the server with: bash start-vllm-server.sh")
+        return 1
+
+    model_id = get_model_id()
+    print(f"Model: {model_id}")
+    print(f"Audio: {audio_path}")
+    print()
+
+    # Run transcription
+    result = transcribe(audio_path, model_id)
+
+    raw_text = result["text"]
+    clean_text = parse_asr_output(raw_text)
+
+    print(f"Raw output: {raw_text}")
+    print(f"Transcription: {clean_text}")
+    print(f"Latency: {result['elapsed']:.3f}s")
+    print(
+        f"Tokens: prompt={result['prompt_tokens']}, completion={result['completion_tokens']}"
+    )
+
+    # Performance run (5 iterations)
+    print("\nPerformance (5 runs):")
+    latencies = []
+    for i in range(5):
+        r = transcribe(audio_path, model_id)
+        latencies.append(r["elapsed"])
+    avg = sum(latencies) / len(latencies)
+    print(f"  Avg: {avg:.3f}s, Min: {min(latencies):.3f}s, Max: {max(latencies):.3f}s")
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())