diff --git a/include/infinicore_infer.h b/include/infinicore_infer.h
index 0bed7bc7..82802acd 100644
--- a/include/infinicore_infer.h
+++ b/include/infinicore_infer.h
@@ -6,5 +6,6 @@
 
 #include "infinicore_infer/models/deepseek.h"
 #include "infinicore_infer/models/jiuge.h"
+#include "infinicore_infer/models/Qwen3MoE.h"
 
 #endif /* INFINICORE_INFER_H */
diff --git a/include/infinicore_infer/models/Qwen3MoE.h b/include/infinicore_infer/models/Qwen3MoE.h
new file mode 100644
index 00000000..0ea8047c
--- /dev/null
+++ b/include/infinicore_infer/models/Qwen3MoE.h
@@ -0,0 +1,88 @@
+#ifndef QWEN3MOE
+#define QWEN3MOE
+
+#include <infiniccl.h>
+#include <infiniop.h>
+#include <infinirt.h>
+
+struct Qwen3MoEWeights;
+
+/// @brief 函数指针
+typedef void (*load_global)(Qwen3MoEWeights *, void *cpu_ptr);
+typedef void (*load_layer)(Qwen3MoEWeights *, void *cpu_ptr, size_t layer_id);
+typedef void (*load_layer_linear)(Qwen3MoEWeights *, void *weight_ptr, size_t layer_id);
+/// @brief 权重加载器
+typedef struct {
+    // Pre-Norm
+    load_layer load_attn_norm;
+
+    // Attention
+    load_layer_linear load_attn_q_proj; 
+    load_layer_linear load_attn_k_proj; 
+    load_layer_linear load_attn_v_proj; 
+
+    // QKNorm(RMSNorm)
+    load_layer load_attn_q_norm;
+    load_layer load_attn_k_norm;
+
+    // output linear
+    load_layer_linear load_attn_o_proj;
+
+}Qwen3MoEWeightLoader;
+
+struct Qwen3MoEAttention;
+
+/// @brief 模型参数
+typedef struct {
+    //数据种类 BF16 / FP16
+    infiniDtype_t dtype;
+
+    // Linear args
+    size_t hidden_size;
+    size_t num_heads;
+    size_t num_kv_head; // k_v head GQA广播倍数
+    size_t head_dim;
+
+    // RoPE args
+    float rope_theta;
+    size_t max_seq_len; 
+
+    float rms_norm_eps; //防止除零
+}Qwen3MoEAttentionMeta;
+
+/// ==================== API ====================
+
+/// @brief 创建注意力模块
+__C __export struct Qwen3MoEAttention *
+createQwen3MoEAttention(const Qwen3MoEAttentionMeta *,
+                        const Qwen3MoEWeights *);
+/// @brief 创建权重矩阵
+__C Qwen3MoEWeights *
+createQwen3MoEWeights(const Qwen3MoEAttentionMeta *meta,
+                        infiniDevice_t device,
+                        int ndev,             
+                        const int *dev_ids);  
+/// @brief 创建weight加载器
+__C __export Qwen3MoEWeightLoader *
+createQwen3MoEWeightLoader();
+/// @brief 创建KVCache
+__C __export struct Qwen3Cache *
+createQwen3Cache(const Qwen3MoEAttentionMeta *meta,
+                    size_t batch_size, size_t seq_len);
+/// @brief 前向计算
+__C __export void forwardQwen3MoEAttention(
+    struct Qwen3MoEAttention* context,
+    struct Qwen3Cache* kv_cache,
+    const void* input_tensor,
+    void* output_tensor,
+    int batch_size,           // [新增]
+    const int* seq_lens_ptr,  // [新增]
+    const int* past_lens_ptr, // [新增]
+    const int* pos_ids_ptr    // [新增]
+);
+
+/// @brief 销毁模型
+__C __export void destroyQwen3MoEAttention(struct Qwen3MoEAttention* ctx);
+
+
+#endif 
\ No newline at end of file
diff --git a/scripts/libinfinicore_infer/qwen3_moe.py b/scripts/libinfinicore_infer/qwen3_moe.py
new file mode 100644
index 00000000..2d7c6393
--- /dev/null
+++ b/scripts/libinfinicore_infer/qwen3_moe.py
@@ -0,0 +1,115 @@
+from .base import BaseModel, DataType, DeviceType, KVCacheCStruct, register_model
+from ctypes import (
+    c_size_t,
+    c_uint,
+    c_int,
+    c_float,
+    c_void_p,
+    POINTER,
+    Structure,
+    CFUNCTYPE,
+)
+
+
+class Qwen3MoEAttentionMetaCStruct(Structure):
+    _fields_ = [
+        ("dtype", DataType),
+        ("hidden_size", c_size_t),
+        ("num_heads", c_size_t),
+        ("num_kv_head", c_size_t),
+        ("head_dim", c_size_t),
+        ("rope_theta", c_float),
+        ("max_seq_len", c_size_t),
+        ("rms_norm_eps", c_float),
+    ]
+
+
+class Qwen3MoEWeightsCStruct(Structure):
+    pass
+
+
+class Qwen3MoEAttentionCStruct(Structure):
+    pass
+
+
+class Qwen3CacheCStruct(Structure):
+    pass
+
+
+load_layer_fn = CFUNCTYPE(None, POINTER(Qwen3MoEWeightsCStruct), c_void_p, c_size_t)
+load_layer_linear_fn = CFUNCTYPE(
+    None, POINTER(Qwen3MoEWeightsCStruct), c_void_p, c_void_p, c_void_p, c_size_t
+)
+
+
+class Qwen3MoEWeightLoaderCStruct(Structure):
+    _fields_ = [
+        ("load_attn_norm", load_layer_fn),
+        ("load_attn_q_proj", load_layer_linear_fn),
+        ("load_attn_k_proj", load_layer_linear_fn),
+        ("load_attn_v_proj", load_layer_linear_fn),
+        ("load_attn_q_norm", load_layer_fn),
+        ("load_attn_k_norm", load_layer_fn),
+        ("load_attn_o_proj", load_layer_linear_fn),
+    ]
+
+
+@register_model
+class Qwen3MoEModel(BaseModel):
+    @classmethod
+    def register_lib(cls, lib):
+        """Register Qwen3MoE model functions with the library"""
+        lib.createQwen3MoEWeightLoader.argtypes = []
+        lib.createQwen3MoEWeightLoader.restype = POINTER(
+            Qwen3MoEWeightLoaderCStruct
+        )
+
+        lib.createQwen3MoEWeights.argtypes = [
+            POINTER(Qwen3MoEAttentionMetaCStruct),
+            DeviceType,
+            c_int,
+            POINTER(c_int),
+        ]
+        lib.createQwen3MoEWeights.restype = POINTER(Qwen3MoEWeightsCStruct)
+
+        lib.createQwen3MoEAttention.argtypes = [
+            POINTER(Qwen3MoEAttentionMetaCStruct),
+            POINTER(Qwen3MoEWeightsCStruct),
+        ]
+        lib.createQwen3MoEAttention.restype = POINTER(Qwen3MoEAttentionCStruct)
+
+        lib.destroyQwen3MoEAttention.argtypes = [POINTER(Qwen3MoEAttentionCStruct)]
+
+        lib.createQwen3Cache.argtypes = [
+            POINTER(Qwen3MoEAttentionMetaCStruct),
+            c_size_t,
+            c_size_t,
+        ]
+        lib.createQwen3Cache.restype = POINTER(Qwen3CacheCStruct)
+
+        lib.forwardQwen3MoEAttention.argtypes = [
+            POINTER(Qwen3MoEAttentionCStruct),
+            POINTER(Qwen3CacheCStruct),
+            c_void_p,
+            c_void_p,
+        ]
+
+    def create_weight_loader(self):
+        return self.lib.createQwen3MoEWeightLoader()
+
+    def create_weights(self, meta, device_type, ndev, dev_ids):
+        return self.lib.createQwen3MoEWeights(meta, device_type, ndev, dev_ids)
+
+    def create_model(self, meta, weights):
+        return self.lib.createQwen3MoEAttention(meta, weights)
+
+    def destroy_model(self, model):
+        self.lib.destroyQwen3MoEAttention(model)
+
+    def create_cache(self, meta, batch_size, seq_len):
+        return self.lib.createQwen3Cache(meta, batch_size, seq_len)
+
+    def forward_attention(self, model, kv_cache, input_tensor, output_tensor):
+        self.lib.forwardQwen3MoEAttention(model, kv_cache, input_tensor, output_tensor)
+
+
diff --git a/src/models/Qwen3MoE/Qwen3MoE.cpp b/src/models/Qwen3MoE/Qwen3MoE.cpp
new file mode 100644
index 00000000..7d044e4b
--- /dev/null
+++ b/src/models/Qwen3MoE/Qwen3MoE.cpp
@@ -0,0 +1,616 @@
+#include "Qwen3MoE_impl.hpp"
+#include "../../tensor.hpp"
+#include "../../utils.hpp"
+#include "../inference_context.hpp"
+#include "infinicore_infer.h"
+#include <random>
+#include <thread>
+#include <vector>
+#include <cmath>
+#include <cstring>
+#include <algorithm>
+#include <numeric>
+#include <iostream>
+#include <cuda_runtime.h>
+#include <cuda_bf16.h>
+// =============================================================================
+// Helper Declarations & Utils
+// =============================================================================
+
+void createDeviceResource(Qwen3MoEDeviceResource *rsrc, 
+    const Qwen3MoEAttentionMeta *meta,
+    std::shared_ptr<Qwen3DeviceWeights> weights, 
+    infiniDevice_t device, int idev,
+    int ndev, int dev_id,
+    infinicclComm_t comm) {
+
+    RUN_INFINI(infinirtSetDevice(device, dev_id));
+    RUN_INFINI(infinirtStreamSynchronize(weights->load_stream));
+
+    infiniopHandle_t handle;
+    infiniopCreateHandle(&handle);
+
+    infinirtStream_t stream;
+    infinirtStreamCreate(&stream); 
+
+    auto memory_pool = std::make_shared<MemoryPool>();
+
+    *rsrc = Qwen3MoEDeviceResource{
+    device,
+    dev_id,
+    handle,
+    weights,
+    stream,
+    comm,
+    memory_pool,
+    };
+
+    RUN_INFINI(infinirtDeviceSynchronize());
+}
+
+void releaseDeviceResource(Qwen3MoEDeviceResource &res) {
+    infinirtDeviceSynchronize();
+    res.weights.reset();
+    if (res.handle) { infiniopDestroyHandle(res.handle); res.handle = nullptr; }
+    if (res.stream) { infinirtStreamDestroy(res.stream); res.stream = nullptr; }
+    if (res.comm) { infinicclCommDestroy(res.comm); res.comm = nullptr; }
+}
+
+// =============================================================================
+// Inference Logic
+// =============================================================================
+
+// Qwen3MoE.cpp
+
+void inferBatchQwen3MoE(const Qwen3MoEAttentionMeta &meta,
+    Qwen3MoEDeviceResource &rsrc,
+    std::shared_ptr<Tensor> input_hidden_states,
+    std::shared_ptr<Tensor> pos_ids,
+    std::shared_ptr<Tensor> output_tensor,
+    Qwen3Cache *kv_cache,
+    size_t layer_id,
+    int batch_size,
+    const std::vector<int>& _seq_lens, 
+    const std::vector<int>& _past_lens
+) {
+    infiniopHandle_t handle = rsrc.handle;
+    infinirtStream_t stream = rsrc.stream;
+    auto memory_pool = rsrc.memory_pool;
+    auto dt_logits = meta.dtype;
+
+    const auto &layer_weight = rsrc.weights->w_layers[layer_id];
+    const auto &attn_weight = layer_weight.self_attn;
+
+    // [FINAL TRUTH] Based on weight shape [4096, 2048]
+    size_t num_heads = 32;     
+    size_t num_kv_head = 4;    
+    size_t head_dim = 128;     
+    size_t ngroup = num_heads / num_kv_head; // 8
+
+    auto input_shape = input_hidden_states->shape();
+    size_t ntok = input_shape[0];
+
+    std::vector<int> seq_lens = _seq_lens;
+    std::vector<int> past_lens = _past_lens;
+    std::vector<int> cpu_pos_ids(ntok);
+
+    RUN_INFINI(infinirtMemcpyAsync(cpu_pos_ids.data(), pos_ids->data(), ntok * sizeof(int), INFINIRT_MEMCPY_D2H, stream));
+    RUN_INFINI(infinirtStreamSynchronize(stream));
+
+    size_t pos_offset = 0;
+    for (int b = 0; b < batch_size; ++b) {
+    int current_pos = cpu_pos_ids[pos_offset];
+    if (past_lens[b] == 0 && current_pos > 0) {
+        past_lens[b] = current_pos;
+        }
+        pos_offset += seq_lens[b];
+        }
+
+    CacheManager cache_manager(100);
+    InferenceContext ctx(handle, memory_pool, &cache_manager, stream);
+    setInferenceContext(&ctx);
+
+    // Alloc Buffers (Full 128-dim size)
+    // Q: 32 * 128 = 4096
+    // K/V: 4 * 128 = 512
+    auto q_buf = Tensor::buffer(dt_logits, {ntok, num_heads * head_dim}, memory_pool);
+    auto k_buf = Tensor::buffer(dt_logits, {ntok, num_kv_head * head_dim}, memory_pool);
+    auto v_buf = Tensor::buffer(dt_logits, {ntok, num_kv_head * head_dim}, memory_pool);
+    auto o_buf = Tensor::buffer(dt_logits, {ntok, num_heads * head_dim}, memory_pool);
+
+    // Step 1: Projections
+    linear(q_buf, input_hidden_states, attn_weight->q_proj, 1.f, 0.f, nullptr, nullptr);
+    linear(k_buf, input_hidden_states, attn_weight->k_proj, 1.f, 0.f, nullptr, nullptr);
+    linear(v_buf, input_hidden_states, attn_weight->v_proj, 1.f, 0.f, nullptr, nullptr);
+
+    int check_pos_id = 64;
+    size_t half_dim = 64; // head_dim / 2
+    std::vector<unsigned short> h_cos_row(half_dim);
+    
+    // Offset = row * row_stride (half_dim elements)
+    size_t cos_offset = check_pos_id * half_dim;
+    
+    // Assuming cos_table is BF16
+    RUN_INFINI(infinirtMemcpyAsync(h_cos_row.data(), 
+                    (char*)rsrc.weights->cos_table->data() + cos_offset * sizeof(unsigned short), 
+                    half_dim * sizeof(unsigned short), 
+                    INFINIRT_MEMCPY_D2H, 
+                    stream));
+    RUN_INFINI(infinirtStreamSynchronize(stream));
+
+    // Step 2: QK Norm (128-dim)
+    {
+        auto q_norm_view = q_buf->view({ntok, num_heads, head_dim});
+        auto k_norm_view = k_buf->view({ntok, num_kv_head, head_dim});
+
+    if (rsrc.weights->w_layers[layer_id].self_attn->q_norm) {
+        rmsnorm(q_norm_view, q_norm_view, rsrc.weights->w_layers[layer_id].self_attn->q_norm, 1e-6);
+    }
+    if (rsrc.weights->w_layers[layer_id].self_attn->k_norm) {
+        rmsnorm(k_norm_view, k_norm_view, rsrc.weights->w_layers[layer_id].self_attn->k_norm, 1e-6);
+    }
+    }
+    
+    // Step 3: RoPE (128-dim)
+    {
+        auto q_rope = q_buf->view({ntok, num_heads, head_dim});
+        auto k_rope = k_buf->view({ntok, num_kv_head, head_dim});
+        
+        rope_v2(q_rope, q_rope, pos_ids, rsrc.weights->cos_table, rsrc.weights->sin_table);
+        rope_v2(k_rope, k_rope, pos_ids, rsrc.weights->cos_table, rsrc.weights->sin_table);
+    }
+
+    // =========================================================
+    // Step 4: KV Cache Setup & Batch Loop
+    // =========================================================
+        
+        // 1. KV Cache Initialization
+        if (kv_cache->layers.size() <= layer_id) {
+            kv_cache->layers.resize(layer_id + 1);
+        }
+        auto &kv_cache_layer = kv_cache->layers[layer_id];
+        size_t max_seq_len = meta.max_seq_len;
+        
+        // [RESTORED STANDARD LOGIC]
+        // 只有当指针为空，或者形状不匹配时，才重新分配！
+        bool need_alloc = false;
+        if (!kv_cache_layer.first || !kv_cache_layer.second) {
+            need_alloc = true;
+        } else {
+            auto s = kv_cache_layer.first->shape();
+            if (s[0] < static_cast<size_t>(batch_size) || 
+                s[1] != num_kv_head || 
+                s[2] != max_seq_len || 
+                s[3] != head_dim) { 
+                need_alloc = true;
+            }
+        }
+        size_t unit_size = dsize(dt_logits);
+        if (need_alloc) {
+            kv_cache_layer.first = Tensor::buffer(dt_logits, {static_cast<size_t>(batch_size), num_kv_head, max_seq_len, head_dim}, memory_pool);
+            kv_cache_layer.second = Tensor::buffer(dt_logits, {static_cast<size_t>(batch_size), num_kv_head, max_seq_len, head_dim}, memory_pool);
+            
+            // [REVERTED] Use cudaMemsetAsync (Stable)
+            size_t num_elements = static_cast<size_t>(batch_size) * num_kv_head * max_seq_len * head_dim;
+            size_t total_bytes = num_elements * unit_size;
+            
+            // [SAFEGUARD] Check size > 0
+            if (total_bytes > 0) {
+                cudaMemsetAsync(kv_cache_layer.first->data(), 0, total_bytes, (cudaStream_t)stream);
+                cudaMemsetAsync(kv_cache_layer.second->data(), 0, total_bytes, (cudaStream_t)stream);
+            }
+        } 
+
+        auto k_cache_all = kv_cache_layer.first;
+        auto v_cache_all = kv_cache_layer.second;
+
+
+    char* k_cache_base = (char*)k_cache_all->data();
+    char* v_cache_base = (char*)v_cache_all->data();
+
+    size_t stride_seq_bytes   = head_dim * unit_size;
+    size_t stride_head_bytes  = max_seq_len * stride_seq_bytes;
+    size_t stride_batch_bytes = num_kv_head * stride_head_bytes;
+
+    size_t token_offset = 0; 
+
+    for (int b = 0; b < batch_size; ++b) {
+    size_t cur_seq_len = static_cast<size_t>(seq_lens[b]);
+    size_t cur_past_len = static_cast<size_t>(past_lens[b]);
+    size_t total_len = cur_past_len + cur_seq_len;
+
+    // --- Cache Update ---
+    char* k_src_batch_ptr = (char*)k_buf->data() + token_offset * num_kv_head * head_dim * unit_size;
+    char* v_src_batch_ptr = (char*)v_buf->data() + token_offset * num_kv_head * head_dim * unit_size;
+    char* k_dst_batch_base = k_cache_base + b * stride_batch_bytes;
+    char* v_dst_batch_base = v_cache_base + b * stride_batch_bytes;
+    size_t kv_token_bytes = head_dim * unit_size; 
+    size_t src_pitch = num_kv_head * head_dim * unit_size; 
+    size_t dst_pitch = head_dim * unit_size; 
+
+    for (size_t h = 0; h < num_kv_head; h++) {
+    char* k_s = k_src_batch_ptr + h * kv_token_bytes;
+    char* v_s = v_src_batch_ptr + h * kv_token_bytes;
+    char* k_d = k_dst_batch_base + h * stride_head_bytes + cur_past_len * stride_seq_bytes;
+    char* v_d = v_dst_batch_base + h * stride_head_bytes + cur_past_len * stride_seq_bytes;
+    cudaMemcpy2DAsync(k_d, dst_pitch, k_s, src_pitch, kv_token_bytes, cur_seq_len, cudaMemcpyDeviceToDevice, (cudaStream_t)stream);
+    cudaMemcpy2DAsync(v_d, dst_pitch, v_s, src_pitch, kv_token_bytes, cur_seq_len, cudaMemcpyDeviceToDevice, (cudaStream_t)stream);
+    }
+
+    // --- Attention Compute  ---
+
+    // 1. Prepare Q
+    auto q_transposed = Tensor::buffer(dt_logits, {num_heads, cur_seq_len, head_dim}, memory_pool);
+    auto q_src_view = q_buf->view({ntok, num_heads, head_dim})->slice(0, token_offset, cur_seq_len);
+    for (size_t h = 0; h < num_heads; h++) {
+    auto q_s = q_src_view->slice(1, h, 1)->view({cur_seq_len, head_dim});
+    auto q_d = q_transposed->slice(0, h, 1)->view({cur_seq_len, head_dim});
+    rearrange(q_d, q_s);
+    }
+    auto q_gemm = q_transposed->view({num_kv_head, ngroup * cur_seq_len, head_dim});
+
+    // 2. Prepare K
+    size_t padded_len = (total_len + 31) / 32 * 32; 
+    auto k_padded_gather = Tensor::buffer(dt_logits, {num_kv_head, padded_len, head_dim}, memory_pool);
+    size_t kv_gather_bytes = num_kv_head * padded_len * head_dim * unit_size;
+
+    // [REVERTED] Use cudaMemsetAsync
+    if (kv_gather_bytes > 0) {
+        cudaMemsetAsync(k_padded_gather->data(), 0, kv_gather_bytes, (cudaStream_t)stream);
+    }
+
+    char* k_gather_src_base = k_cache_base + b * stride_batch_bytes;
+    size_t gather_bytes_per_head = total_len * head_dim * unit_size;
+    size_t dst_head_stride_bytes = padded_len * head_dim * unit_size;
+    for (size_t h = 0; h < num_kv_head; h++) {
+    char* k_src = k_gather_src_base + h * stride_head_bytes;
+    char* k_dst = (char*)k_padded_gather->data() + h * dst_head_stride_bytes;
+    // Keep size check for memcpy
+    if (gather_bytes_per_head > 0) {
+        cudaMemcpyAsync(k_dst, (void*)k_src, gather_bytes_per_head, cudaMemcpyDeviceToDevice, (cudaStream_t)stream);
+    }
+    }
+
+    auto k_gemm_in = Tensor::buffer(dt_logits, {num_kv_head, head_dim, padded_len}, memory_pool);
+    rearrange(k_gemm_in, k_padded_gather->permute({0, 2, 1}));
+
+    // 3. GEMM 1: Q * K
+    auto scores_padded = Tensor::buffer(dt_logits, {num_kv_head, ngroup * cur_seq_len, padded_len}, memory_pool);
+
+    // [Scheme A] Zero out the buffer safely
+    size_t scores_bytes = num_kv_head * ngroup * cur_seq_len * padded_len * unit_size;
+    cudaMemsetAsync(scores_padded->data(), 0, scores_bytes, (cudaStream_t)stream);
+
+    float scale_factor = 1.0f / sqrt(128.0f); 
+    linear(scores_padded, q_gemm, k_gemm_in, scale_factor, 0.f, nullptr, nullptr);
+
+    // 4. Softmax+Scaling+Masking
+    auto scores_view = scores_padded->view({num_heads, cur_seq_len, padded_len});
+    auto scores_in = scores_view->slice(2, 0, total_len);
+    causalSoftmax(scores_in, scores_in);
+
+    if (padded_len > total_len) {
+        size_t pitch = padded_len * unit_size;
+        size_t width = (padded_len - total_len) * unit_size;
+        char* dst_ptr = (char*)scores_padded->data() + total_len * unit_size;
+        // Keep size check for 2D Memset
+        if (width > 0) {
+            cudaMemset2DAsync(dst_ptr, pitch, 0, width, num_heads * cur_seq_len, (cudaStream_t)stream);
+        }
+    }
+    
+
+    // 5. GEMM 2
+    auto v_padded_gather = Tensor::buffer(dt_logits, {num_kv_head, padded_len, head_dim}, memory_pool);
+    // [REVERTED] Use cudaMemsetAsync
+    if (kv_gather_bytes > 0) {
+        cudaMemsetAsync(v_padded_gather->data(), 0, kv_gather_bytes, (cudaStream_t)stream);
+    }
+
+    char* v_gather_src_base = v_cache_base + b * stride_batch_bytes;
+    for (size_t h = 0; h < num_kv_head; h++) {
+    char* v_src = v_gather_src_base + h * stride_head_bytes;
+    char* v_dst = (char*)v_padded_gather->data() + h * dst_head_stride_bytes;
+    // Keep size check for memcpy
+    if (gather_bytes_per_head > 0) {
+        cudaMemcpyAsync(v_dst, (void*)v_src, gather_bytes_per_head, cudaMemcpyDeviceToDevice, (cudaStream_t)stream);
+    }
+    }
+
+    auto attn_out_b = Tensor::buffer(dt_logits, {num_kv_head, ngroup * cur_seq_len, head_dim}, memory_pool);
+    linear(attn_out_b, scores_padded, v_padded_gather, 1.f, 0.f, nullptr, nullptr);
+
+    // Rearrange
+    auto attn_out_view_flat = attn_out_b->view({num_heads, cur_seq_len, head_dim});
+    auto o_dst_flat = o_buf->view({ntok, num_heads, head_dim})->slice(0, token_offset, cur_seq_len);
+    for (size_t h = 0; h < num_heads; h++) {
+    auto src_h = attn_out_view_flat->slice(0, h, 1)->view({cur_seq_len, head_dim});
+    auto dst_h = o_dst_flat->slice(1, h, 1)->view({cur_seq_len, head_dim});
+    rearrange(dst_h, src_h);
+    }
+
+    token_offset += cur_seq_len;
+    } // End of Batch Loop
+
+    // Step 6: Final Output Projection
+    if (output_tensor) {
+    size_t context_dim = num_heads * head_dim; 
+    auto ctx_flat = o_buf->view({ntok, context_dim});
+    auto w_o = attn_weight->o_proj; 
+    size_t hidden_dim = meta.hidden_size; 
+    auto out_flat = output_tensor->view({ntok, hidden_dim});
+    linear(out_flat, ctx_flat, w_o, 1.0f, 0.0f, nullptr, nullptr);
+    }
+}
+
+// =============================================================================
+// Interface Exports
+// =============================================================================
+
+Qwen3MoEAttention::Qwen3MoEAttention(const Qwen3MoEAttentionMeta *_meta, const Qwen3MoEWeights *weights) : meta(*_meta) {
+    auto device_weights = weights->device_weights;
+    int ndev = device_weights.size();
+    device = device_weights[0]->device;
+    dev_ids.resize(ndev);
+    for (int i = 0; i < ndev; i++) {
+        dev_ids[i] = device_weights[i]->dev_id;
+    }
+    dev_resources = std::vector<Qwen3MoEDeviceResource>(ndev);
+    RUN_INFINI(infinirtInit());
+    auto comms = std::vector<infinicclComm_t>(ndev, nullptr);
+    if (ndev > 1) {
+        RUN_INFINI(infinicclCommInitAll(device, comms.data(), ndev, dev_ids.data()));
+    }
+    for (int i = 0; i < ndev; i++) {
+        createDeviceResource(&dev_resources[i], &meta, device_weights[i], device, i, ndev, dev_ids[i], comms[i]);
+    }
+}
+
+__C __export struct Qwen3MoEAttention *createQwen3MoEAttention(const Qwen3MoEAttentionMeta *_meta,
+                        const Qwen3MoEWeights *weights) {
+    Qwen3MoEAttention *attention = new Qwen3MoEAttention(_meta, weights);
+    return attention;
+}
+
+__C __export void destroyQwen3MoEAttention(struct Qwen3MoEAttention *ctx) {
+    if (!ctx) return;
+    auto ndev = ctx->dev_resources.size();
+    for (size_t idev = 0; idev < ndev; idev++) {
+        releaseDeviceResource(ctx->dev_resources[idev]);
+    }
+    delete ctx;
+}
+
+__C __export void forwardQwen3MoEAttention(
+    struct Qwen3MoEAttention* context,
+    struct Qwen3Cache* kv_cache,
+    const void* input_tensor,
+    void* output_tensor,
+    int batch_size,
+    const int* seq_lens_ptr,
+    const int* past_lens_ptr,
+    const int* pos_ids_ptr
+) {
+    if (!context || !kv_cache || !input_tensor || !output_tensor) {
+        return;
+    }
+    
+    size_t layer_id = 0;
+    if (context->dev_resources.empty()) return;
+    auto &rsrc = context->dev_resources[0];
+    auto meta = &context->meta;
+    auto dt_logits = meta->dtype;
+    size_t hidden_size = meta->hidden_size;
+    
+    std::vector<int> seq_lens(batch_size);
+    std::vector<int> past_lens(batch_size);
+    std::memcpy(seq_lens.data(), seq_lens_ptr, batch_size * sizeof(int));
+    std::memcpy(past_lens.data(), past_lens_ptr, batch_size * sizeof(int));
+    
+    size_t ntok = 0;
+    for (int len : seq_lens) ntok += len;
+    
+    std::shared_ptr<Tensor> input_hidden_states;
+    if (rsrc.device == INFINI_DEVICE_CPU) {
+        input_hidden_states = Tensor::weight(const_cast<void*>(input_tensor), dt_logits, {ntok, hidden_size});
+    } else {
+        input_hidden_states = Tensor::buffer(dt_logits, {ntok, hidden_size}, rsrc.memory_pool);
+        RUN_INFINI(infinirtMemcpyAsync(input_hidden_states->data(), const_cast<void*>(input_tensor),
+                                     dsize(dt_logits) * ntok * hidden_size,
+                                     INFINIRT_MEMCPY_H2D, rsrc.stream));
+    }
+    
+    std::shared_ptr<Tensor> pos_ids;
+    if (rsrc.device == INFINI_DEVICE_CPU) {
+        pos_ids = Tensor::weight(const_cast<int*>(pos_ids_ptr), INFINI_DTYPE_I32, {ntok});
+    } else {
+        pos_ids = Tensor::buffer(INFINI_DTYPE_I32, {ntok}, rsrc.memory_pool);
+        RUN_INFINI(infinirtMemcpyAsync(pos_ids->data(), (void*)pos_ids_ptr,
+                                     sizeof(int) * ntok,
+                                     INFINIRT_MEMCPY_H2D, rsrc.stream));
+    }
+    
+    auto output_tensor_ptr = Tensor::buffer(dt_logits, {ntok, hidden_size}, rsrc.memory_pool);
+    Qwen3Cache *qwen3_cache = reinterpret_cast<Qwen3Cache*>(kv_cache);
+    inferBatchQwen3MoE(context->meta, rsrc, input_hidden_states, pos_ids,
+                       output_tensor_ptr, qwen3_cache, layer_id,
+                       batch_size, seq_lens, past_lens);
+
+    RUN_INFINI(infinirtStreamSynchronize(rsrc.stream));
+
+    if (rsrc.device != INFINI_DEVICE_CPU) {
+        RUN_INFINI(infinirtMemcpyAsync(output_tensor, output_tensor_ptr->data(),
+        dsize(dt_logits) * ntok * hidden_size,
+        INFINIRT_MEMCPY_D2H, rsrc.stream));
+    }
+}
+
+__C __export void injectQwen3CacheKV(
+    struct Qwen3MoEAttention* context,
+    struct Qwen3Cache* kv_cache,
+    int layer_id,
+    int batch_idx,
+    int past_len,
+    const void* k_host_ptr, 
+    const void* v_host_ptr
+) {
+    if (!context || !kv_cache || past_len <= 0) return;
+
+    auto &rsrc = context->dev_resources[0];
+    RUN_INFINI(infinirtSetDevice(rsrc.device, rsrc.device_id));
+    auto meta = &context->meta;
+    auto memory_pool = rsrc.memory_pool;
+    auto stream = rsrc.stream;
+    
+    if (kv_cache->layers.size() <= static_cast<size_t>(layer_id)) {
+        kv_cache->layers.resize(static_cast<size_t>(layer_id) + 1);
+    }
+    auto &layer = kv_cache->layers[layer_id];
+    
+    size_t required_batch = batch_idx + 1;
+    size_t H = meta->num_kv_head;
+    size_t S = meta->max_seq_len;
+    size_t D = meta->head_dim;
+    
+    bool need_alloc = false;
+    if (!layer.first || !layer.second) {
+        need_alloc = true;
+    } else {
+        if (layer.first->shape()[0] < required_batch) need_alloc = true;
+    }
+    
+    // [FIX] Force minimum allocation size to avoid mid-loop resizing/resetting
+    size_t current_capacity = 0;
+    if (layer.first) current_capacity = layer.first->shape()[0];
+    size_t target_capacity = std::max(required_batch, (size_t)16);
+    
+    if (current_capacity < target_capacity) {
+        need_alloc = true;
+    }
+    
+    if (need_alloc) {
+        layer.first = Tensor::buffer(meta->dtype, {target_capacity, H, S, D}, memory_pool);
+        layer.second = Tensor::buffer(meta->dtype, {target_capacity, H, S, D}, memory_pool);
+        RUN_INFINI(infinirtStreamSynchronize(stream));
+    }
+
+    auto k_tensor = layer.first;
+    auto v_tensor = layer.second;
+
+    
+    size_t dtype_size = dsize(meta->dtype);
+    size_t stride_seq_bytes   = D * dtype_size;
+    size_t stride_head_bytes  = S * stride_seq_bytes;
+    size_t stride_batch_bytes = H * stride_head_bytes;
+
+    char* k_base = (char*)k_tensor->data();
+    char* v_base = (char*)v_tensor->data();
+    
+    char* k_batch_base = k_base + batch_idx * stride_batch_bytes;
+    char* v_batch_base = v_base + batch_idx * stride_batch_bytes;
+
+    const char* k_src_base = (const char*)k_host_ptr;
+    const char* v_src_base = (const char*)v_host_ptr;
+    
+    size_t src_head_stride_bytes = past_len * D * dtype_size;
+    size_t bytes_to_copy_per_head = past_len * D * dtype_size;
+
+    for (size_t h = 0; h < H; ++h) {
+        char* k_dst_addr = k_batch_base + h * stride_head_bytes;
+        char* v_dst_addr = v_batch_base + h * stride_head_bytes;
+
+        const char* k_src_addr = k_src_base + h * src_head_stride_bytes;
+        const char* v_src_addr = v_src_base + h * src_head_stride_bytes;
+
+        if (rsrc.device == INFINI_DEVICE_CPU) {
+            std::memcpy(k_dst_addr, k_src_addr, bytes_to_copy_per_head);
+            std::memcpy(v_dst_addr, v_src_addr, bytes_to_copy_per_head);
+        } else {
+            if (bytes_to_copy_per_head > 0) {
+                RUN_INFINI(infinirtMemcpyAsync(k_dst_addr, (void*)k_src_addr,
+                            bytes_to_copy_per_head, INFINIRT_MEMCPY_H2D, stream));
+                RUN_INFINI(infinirtMemcpyAsync(v_dst_addr, (void*)v_src_addr,
+                            bytes_to_copy_per_head, INFINIRT_MEMCPY_H2D, stream));
+            }
+        }
+    }
+    RUN_INFINI(infinirtStreamSynchronize(stream));
+}
+
+extern "C" void customInjectCacheKV(
+    Qwen3Cache *kv_cache,
+    size_t layer_id,
+    int batch_idx,
+    int past_len,
+    void* k_src_ptr,
+    void* v_src_ptr,
+    cudaStream_t stream
+) {
+    int dev_id = 0;
+    cudaGetDevice(&dev_id);
+    RUN_INFINI(infinirtSetDevice(INFINI_DEVICE_NVIDIA, dev_id));
+
+    // 1. 安全检查
+    if (!kv_cache || kv_cache->layers.size() <= layer_id) {
+        std::cout<< "检查 unpass!" << std::endl;
+        return;
+
+    }
+    
+    auto &layer = kv_cache->layers[layer_id];
+    //std::cout<< layer_id << std::endl;
+    // 如果显存还没分配（Dummy Forward 没跑？），直接返回，Python侧会报错
+    if (!layer.first || !layer.second) {
+        printf(">>> [C++ Error] Cache not allocated yet! Run dummy forward first.\n");
+        return;
+    }
+
+    // 2. 获取 C++ 视角的形状信息
+    auto shape = layer.first->shape(); 
+    // shape: [Batch, NumKV, MaxSeq, HeadDim]
+    size_t num_kv = shape[1];
+    size_t max_seq = shape[2]; // 这里是关键！它是 8192
+    size_t head_dim = shape[3]; // 这里应该是 128
+
+    // 3. 计算 C++ 显存中的 Stride (稀疏布局)
+    size_t dtype_size = 2; // BF16 = 2 bytes
+    size_t stride_seq = head_dim * dtype_size;
+    size_t stride_head = max_seq * stride_seq;       // 跨越 8192 个 Token
+    size_t stride_batch = num_kv * stride_head;
+
+    // 4. 计算目标地址基址 (Base Address for this specific Batch)
+    char* k_dst_base = (char*)layer.first->data() + batch_idx * stride_batch;
+    char* v_dst_base = (char*)layer.second->data() + batch_idx * stride_batch;
+
+    // 5. 搬运循环
+    // Python 传来的数据是紧凑的: [NumKV, PastLen, HeadDim]
+    // 我们需要把每个 Head 的 [PastLen, HeadDim] 块搬运过去
+    
+    size_t copy_bytes_per_head = past_len * head_dim * dtype_size;
+    size_t src_stride_head = copy_bytes_per_head; // Python端是紧凑的
+
+    for (size_t h = 0; h < num_kv; ++h) {
+        // Source: Python (Compact)
+        char* k_src = (char*)k_src_ptr + h * src_stride_head;
+        char* v_src = (char*)v_src_ptr + h * src_stride_head;
+
+        // Dest: C++ (Sparse / Strided)
+        // 注意：我们从 sequence 的 index 0 开始写起
+        //int start_pos = past_len;
+        char* k_dst = k_dst_base + h * stride_head ;
+        char* v_dst = v_dst_base + h * stride_head ;
+
+        // 检查指针是否对齐和越界（简单保护）
+        if (past_len > 0) {
+             RUN_INFINI(infinirtMemcpyAsync(k_dst, k_src, copy_bytes_per_head, INFINIRT_MEMCPY_H2D, (infinirtStream_t)stream));
+             RUN_INFINI(infinirtMemcpyAsync(v_dst, v_src, copy_bytes_per_head, INFINIRT_MEMCPY_H2D, (infinirtStream_t)stream));
+        }
+    }
+    
+    // 简单同步确保写入完成
+    RUN_INFINI(infinirtStreamSynchronize((infinirtStream_t)stream));
+    auto err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        printf("DEBUG: Error at customInjectCacheKV end: %s\n", cudaGetErrorString(err));
+    }
+}
\ No newline at end of file
diff --git a/src/models/Qwen3MoE/Qwen3MoE_cache.cpp b/src/models/Qwen3MoE/Qwen3MoE_cache.cpp
new file mode 100644
index 00000000..ad1a3d92
--- /dev/null
+++ b/src/models/Qwen3MoE/Qwen3MoE_cache.cpp
@@ -0,0 +1,50 @@
+#include "Qwen3MoE_impl.hpp"
+#include "infinicore_infer.h"
+
+#include "../../tensor.hpp"
+#include "../../utils.hpp"
+// 注意：Qwen3MoECache 在头文件中声明，但实际使用 Qwen3Cache
+// 这里假设它们是同一个类型，或者Qwen3MoECache是Qwen3Cache的typedef
+
+/// @brief 创建KVCache
+__C __export struct Qwen3Cache *
+createQwen3Cache(const Qwen3MoEAttentionMeta *meta,
+                    size_t batch_size, size_t seq_len) {
+    Qwen3Cache *cache = new Qwen3Cache();
+    
+    // 假设只有1层attention（因为只实现attention模块）
+    size_t nlayer = 1;
+    size_t max_seq_len = meta->max_seq_len;
+    size_t num_kv_head = meta->num_kv_head;
+    size_t head_dim = meta->head_dim;
+    
+    // 为每一层创建K和V cache
+    // Cache shape: [num_kv_head, max_seq_len, head_dim]
+    cache->layers.resize(nlayer);
+    
+    for (size_t layer = 0; layer < nlayer; layer++) {
+        // 创建K cache: [num_kv_head, max_seq_len, head_dim]
+        auto k_cache = Tensor::buffer(meta->dtype, {num_kv_head, max_seq_len, head_dim});
+        
+        // 创建V cache: [num_kv_head, max_seq_len, head_dim]
+        auto v_cache = Tensor::buffer(meta->dtype, {num_kv_head, max_seq_len, head_dim});
+        
+        cache->layers[layer] = std::make_pair(k_cache, v_cache);
+    }
+    
+    return reinterpret_cast<struct Qwen3Cache *>(cache);
+}
+
+/// @brief 销毁KVCache（如果需要的话，可以添加这个函数）
+// 注意：头文件中没有声明这个函数，如果需要可以添加
+// __C void dropQwen3Cache(struct Qwen3Cache *cache) {
+//     if (cache) {
+//         Qwen3Cache *qwen3_cache = reinterpret_cast<Qwen3Cache *>(cache);
+//         for (auto &layer : qwen3_cache->layers) {
+//             layer.first.reset();
+//             layer.second.reset();
+//         }
+//         delete qwen3_cache;
+//     }
+// }
+
diff --git a/src/models/Qwen3MoE/Qwen3MoE_impl.hpp b/src/models/Qwen3MoE/Qwen3MoE_impl.hpp
new file mode 100644
index 00000000..909a0c0b
--- /dev/null
+++ b/src/models/Qwen3MoE/Qwen3MoE_impl.hpp
@@ -0,0 +1,133 @@
+#ifndef QWEN3MOE_IMPL_H
+#define QWEN3MOE_IMPL_H
+
+#include "infinicore_infer.h"
+
+#include "../../allocator.hpp"
+#include "../../tensor.hpp"
+
+#include <condition_variable>
+#include <memory>
+#include <mutex>
+#include <thread>
+#include <vector>
+
+struct QuantLinearWeight {
+    std::shared_ptr<Tensor> w;
+    std::shared_ptr<Tensor> s; // Scale QUANT
+    std::shared_ptr<Tensor> z; // Zero QUANT
+}; 
+
+struct Qwen3AttentionWeight {
+    // Pre-Norm
+    std::shared_ptr<Tensor> attn_norm;
+    
+    // GQA
+    std::shared_ptr<Tensor> q_proj;
+    std::shared_ptr<Tensor> k_proj;
+    std::shared_ptr<Tensor> v_proj;
+    std::shared_ptr<Tensor> o_proj;
+
+    // QK Norm
+    std::shared_ptr<Tensor> q_norm;
+    std::shared_ptr<Tensor> k_norm;
+
+};
+
+struct Qwen3LayerWeight {
+    std::shared_ptr<Qwen3AttentionWeight> self_attn;
+
+    // TODO: 实现MLP Experts等， 由于比赛只实现attention模块
+    // 所以只放一个self_attn
+};
+
+struct Qwen3DeviceWeights {
+    std::shared_ptr<Tensor> w_in_embd, w_out_norm, w_out_embd;
+    
+    // RoPE
+    std::shared_ptr<Tensor> sin_table;
+    std::shared_ptr<Tensor> cos_table;
+
+    // layer
+    std::vector<Qwen3LayerWeight> w_layers;
+    
+    infiniDevice_t device;
+    int dev_id;
+    infinirtStream_t load_stream;
+};
+
+struct Qwen3MoEWeights {
+    // 即使是单卡，通常也用 vector 存，方便统一逻辑
+    std::vector<std::shared_ptr<Qwen3DeviceWeights>> device_weights;
+
+    // 构造函数声明
+    Qwen3MoEWeights(const Qwen3MoEAttentionMeta *meta,
+                    infiniDevice_t device,
+                    int ndev,
+                    const int *dev_ids);
+};
+
+/*
+Qwen3 KVCache
+[Batch, KV_Heads, Max_Seq, Head_Dim]    
+*/
+struct Qwen3Cache {
+    std::vector<std::pair<std::shared_ptr<Tensor>, std::shared_ptr<Tensor>>> layers;
+};
+
+struct Qwen3MoEDeviceResource {
+    // Device
+    infiniDevice_t device;
+    int device_id;
+    infiniopHandle_t handle;
+    // Weights
+    std::shared_ptr<Qwen3DeviceWeights> weights;
+    // Streams
+    infinirtStream_t stream;
+    // Communicator
+    infinicclComm_t comm;
+
+    std::shared_ptr<MemoryPool> memory_pool;
+};
+
+
+struct InferState {
+    std::mutex mtx;
+    std::condition_variable cv_load, cv_start, cv_done;
+    bool loaded = false;
+    bool proceed = false;
+    bool exit_flag = false;
+};
+
+struct InferRequest {
+    const uint32_t *tokens;
+    uint32_t ntok;
+    const uint32_t *req_lens;
+    uint32_t nreq;
+    const uint32_t *req_pos;
+    struct Qwen3Cache **kv_caches;
+    const float *temperature;
+    const uint32_t *topk;
+    const float *topp;
+    uint32_t *output;
+    void *logits;
+};
+
+struct Qwen3MoEAttention { 
+    Qwen3MoEAttentionMeta meta;
+    infiniDevice_t device;
+    std::vector<int> dev_ids;
+    
+    std::vector<Qwen3MoEDeviceResource> dev_resources;
+    
+    // 线程控制
+    std::vector<InferState> states;
+    std::vector<std::thread> threads;
+    InferRequest req;
+
+    // 构造函数
+    Qwen3MoEAttention(const Qwen3MoEAttentionMeta *, const Qwen3MoEWeights *weights);
+};
+
+
+#endif
\ No newline at end of file
diff --git a/src/models/Qwen3MoE/Qwen3MoE_weight.cpp b/src/models/Qwen3MoE/Qwen3MoE_weight.cpp
new file mode 100644
index 00000000..c3ead95d
--- /dev/null
+++ b/src/models/Qwen3MoE/Qwen3MoE_weight.cpp
@@ -0,0 +1,268 @@
+#include "Qwen3MoE_impl.hpp"
+#include "infinicore_infer.h"
+
+#include "../../tensor.hpp"
+#include "../../utils.hpp"
+
+#include <cmath>
+#include <iostream>
+
+// ==================== 辅助函数 ====================
+
+// 辅助函数：创建普通线性权重 (BF16)
+// 形状通常为 [in_dim, out_dim]，这是 InfiniLM 计算库的标准格式
+inline std::shared_ptr<Tensor> getLinear(
+    const Qwen3MoEAttentionMeta *meta, size_t in_dim, size_t out_dim) {
+    // 创建 BF16 权重张量
+    auto shape = std::vector<size_t>({in_dim, out_dim});
+    // 使用 meta->dtype 也可以，通常 meta->dtype 已经是 BF16
+    return Tensor::weight(nullptr, INFINI_DTYPE_BF16, shape);
+}
+
+// 辅助函数：分布式加载线性权重 (Tensor Parallel)
+// 即使 ndev=1 也能正常工作
+inline void load_dist_linear(void *w_ptr, std::shared_ptr<Tensor> w, 
+                             size_t ndev, size_t dev, infinirtStream_t stream) {
+    // 简单假设按输出维度切分 (Column Parallel)
+    // 偏移量 = 总元素数 / ndev * dev * 元素大小
+    size_t offset = w->shape()[0] * w->shape()[1] * dev * dsize(w->dtype());
+    w->load(reinterpret_cast<char *>(w_ptr) + offset, stream);
+}
+
+// 获取Attention Norm权重
+inline std::shared_ptr<Tensor> getAttnNorm(const Qwen3MoEAttentionMeta *meta) {
+    auto shape = std::vector<size_t>({meta->hidden_size});
+    return Tensor::weight(nullptr, meta->dtype, shape);
+}
+
+// 1. 恢复标准 Sin/Cos 表 (适用于 64 dim -> 32 freqs)
+inline std::shared_ptr<Tensor> getSinTable(const Qwen3MoEAttentionMeta *meta) {
+    auto half_dh = meta->head_dim / 2; 
+    auto unit = dsize(meta->dtype);
+    void *table = std::malloc(meta->max_seq_len * half_dh * unit);
+    float theta = meta->rope_theta;
+
+    // 标准 Full RoPE 生成逻辑
+    for (size_t i = 0; i < meta->max_seq_len; i++) {
+        for (size_t j = 0; j < half_dh; j++) {
+            // j = 0..31
+            float freq_exponent = static_cast<float>(j) / static_cast<float>(half_dh);
+            float freq = std::pow(theta, freq_exponent);
+            float _sin = std::sin(static_cast<float>(i) / freq);
+            
+            size_t idx = i * half_dh + j;
+            if (meta->dtype == INFINI_DTYPE_BF16) {
+                ((uint16_t *)table)[idx] = f32_to_bf16(_sin);
+            } else if (meta->dtype == INFINI_DTYPE_F32) {
+                ((float *)table)[idx] = _sin;
+            }
+        }
+    }
+    // ... (Tensor 创建代码同上)
+    auto shape = std::vector<size_t>({meta->max_seq_len, half_dh});
+    auto tensor = Tensor::weight(table, meta->dtype, shape);
+    std::free(table);
+    return tensor;
+}
+
+// Cos 表同理，完全标准逻辑
+inline std::shared_ptr<Tensor> getCosTable(const Qwen3MoEAttentionMeta *meta) {
+    auto half_dh = meta->head_dim / 2;
+    auto unit = dsize(meta->dtype);
+    void *table = std::malloc(meta->max_seq_len * half_dh * unit);
+    float theta = meta->rope_theta;
+
+    // 标准 Full RoPE 生成逻辑
+    for (size_t i = 0; i < meta->max_seq_len; i++) {
+        for (size_t j = 0; j < half_dh; j++) {
+            // j = 0..31
+            float freq_exponent = static_cast<float>(j) / static_cast<float>(half_dh);
+            float freq = std::pow(theta, freq_exponent);
+            float _cos = std::cos(static_cast<float>(i) / freq);
+            
+            size_t idx = i * half_dh + j;
+            if (meta->dtype == INFINI_DTYPE_BF16) {
+                ((uint16_t *)table)[idx] = f32_to_bf16(_cos);
+            } else if (meta->dtype == INFINI_DTYPE_F32) {
+                ((float *)table)[idx] = _cos;
+            }
+        }
+    }
+    auto shape = std::vector<size_t>({meta->max_seq_len, half_dh});
+    auto tensor = Tensor::weight(table, meta->dtype, shape);
+    std::free(table);
+    return tensor;
+}
+
+// 恢复 Norm 权重形状
+inline std::shared_ptr<Tensor> getQNorm(const Qwen3MoEAttentionMeta *meta) {
+    auto shape = std::vector<size_t>({meta->head_dim}); // 128
+    return Tensor::weight(nullptr, meta->dtype, shape);
+}
+
+inline std::shared_ptr<Tensor> getKNorm(const Qwen3MoEAttentionMeta *meta) {
+    //std::cout<<"head dim"<<meta->head_dim<<std::endl;
+    auto shape = std::vector<size_t>({meta->head_dim}); // 128
+    return Tensor::weight(nullptr, meta->dtype, shape);
+}
+// ==================== 构造函数 ====================
+
+Qwen3MoEWeights::Qwen3MoEWeights(
+    const Qwen3MoEAttentionMeta *meta,
+    infiniDevice_t device,
+    int ndev,
+    const int *dev_ids) {
+    
+    device_weights = std::vector<std::shared_ptr<Qwen3DeviceWeights>>(ndev);
+    
+    // 假设只有1层attention
+    size_t nlayer = 1;
+    
+    // 计算本地头数 (Tensor Parallel)
+    size_t local_num_heads = meta->num_heads / ndev;
+    size_t local_num_kv_heads = meta->num_kv_head / ndev;
+    
+    for (int dev = 0; dev < ndev; dev++) {
+        int dev_id = dev_ids[dev];
+        RUN_INFINI(infinirtSetDevice(device, dev_id));
+        device_weights[dev] = std::make_shared<Qwen3DeviceWeights>();
+        device_weights[dev]->device = device;
+        device_weights[dev]->dev_id = dev_id;
+        RUN_INFINI(infinirtStreamCreate(&device_weights[dev]->load_stream));
+
+        // 初始化RoPE表
+        device_weights[dev]->sin_table = getSinTable(meta);
+        device_weights[dev]->cos_table = getCosTable(meta);
+
+        // 初始化layers
+        device_weights[dev]->w_layers = std::vector<Qwen3LayerWeight>(nlayer);
+        
+        for (size_t layer = 0; layer < nlayer; layer++) {
+            auto attn_weight = std::make_shared<Qwen3AttentionWeight>();
+            
+            // Pre-Norm
+            attn_weight->attn_norm = getAttnNorm(meta);
+            
+            // Q/K/V投影（GQA + Tensor Parallel）
+            // 注意：这里 out_dim 使用本地头数计算
+            size_t q_out_dim = local_num_heads * meta->head_dim;
+            size_t kv_out_dim = local_num_kv_heads * meta->head_dim;
+            
+            // 【修改点】改为使用 getLinear 初始化普通 BF16 Tensor
+            attn_weight->q_proj = getLinear(meta, meta->hidden_size, q_out_dim);
+            attn_weight->k_proj = getLinear(meta, meta->hidden_size, kv_out_dim);
+            attn_weight->v_proj = getLinear(meta, meta->hidden_size, kv_out_dim);
+            
+            // QK Norm
+            attn_weight->q_norm = getQNorm(meta); 
+            attn_weight->k_norm = getKNorm(meta);
+            
+            // Output投影
+            // 注意：Output Proj 输入维度切分，输出维度完整 (Row Parallel 归约)
+            // 这里为了简化加载逻辑，我们暂时假设它也是普通 Linear
+            attn_weight->o_proj = getLinear(meta, q_out_dim, meta->hidden_size);
+            
+            device_weights[dev]->w_layers[layer].self_attn = attn_weight;
+        }
+    }
+}
+
+// ==================== 权重加载函数 (移除 Scale/Zero 参数) ====================
+
+// 加载Attention Norm
+void load_attn_norm(Qwen3MoEWeights *weights, void *cpu_ptr, size_t layer_id) {
+    for (int dev = 0; dev < int(weights->device_weights.size()); dev++) {
+        auto weight = weights->device_weights[dev];
+        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
+        weight->w_layers[layer_id].self_attn->attn_norm->load(cpu_ptr, weight->load_stream);
+    }
+}
+
+// 加载Q投影
+// 【修改点】去掉了 scale_ptr, zero_ptr
+void load_attn_q_proj(Qwen3MoEWeights *weights, void *weight_ptr, size_t layer_id) {
+    for (int dev = 0; dev < int(weights->device_weights.size()); dev++) {
+        auto weight = weights->device_weights[dev];
+        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
+        
+        auto q_proj = weight->w_layers[layer_id].self_attn->q_proj;
+        load_dist_linear(weight_ptr, q_proj, weights->device_weights.size(), dev, weight->load_stream);
+    }
+}
+
+// 加载K投影
+void load_attn_k_proj(Qwen3MoEWeights *weights, void *weight_ptr, size_t layer_id) {
+    for (int dev = 0; dev < int(weights->device_weights.size()); dev++) {
+        auto weight = weights->device_weights[dev];
+        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
+        
+        auto k_proj = weight->w_layers[layer_id].self_attn->k_proj;
+        load_dist_linear(weight_ptr, k_proj, weights->device_weights.size(), dev, weight->load_stream);
+    }
+}
+
+// 加载V投影
+void load_attn_v_proj(Qwen3MoEWeights *weights, void *weight_ptr, size_t layer_id) {
+    for (int dev = 0; dev < int(weights->device_weights.size()); dev++) {
+        auto weight = weights->device_weights[dev];
+        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
+        
+        auto v_proj = weight->w_layers[layer_id].self_attn->v_proj;
+        load_dist_linear(weight_ptr, v_proj, weights->device_weights.size(), dev, weight->load_stream);
+    }
+}
+
+// 加载Q Norm
+void load_attn_q_norm(Qwen3MoEWeights *weights, void *cpu_ptr, size_t layer_id) {
+    for (int dev = 0; dev < int(weights->device_weights.size()); dev++) {
+        auto weight = weights->device_weights[dev];
+        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
+        weight->w_layers[layer_id].self_attn->q_norm->load(cpu_ptr, weight->load_stream);
+    }
+}
+
+// 加载K Norm
+void load_attn_k_norm(Qwen3MoEWeights *weights, void *cpu_ptr, size_t layer_id) {
+    for (int dev = 0; dev < int(weights->device_weights.size()); dev++) {
+        auto weight = weights->device_weights[dev];
+        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
+        weight->w_layers[layer_id].self_attn->k_norm->load(cpu_ptr, weight->load_stream);
+    }
+}
+
+// 加载Output投影
+void load_attn_o_proj(Qwen3MoEWeights *weights, void *weight_ptr, size_t layer_id) {
+    for (int dev = 0; dev < int(weights->device_weights.size()); dev++) {
+        auto weight = weights->device_weights[dev];
+        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
+        
+        auto o_proj = weight->w_layers[layer_id].self_attn->o_proj;
+        load_dist_linear(weight_ptr, o_proj, weights->device_weights.size(), dev, weight->load_stream);
+    }
+}
+
+// 创建权重加载器
+// 【修改点】结构体定义需要去对应修改头文件，这里只填入函数指针
+static Qwen3MoEWeightLoader weight_loader = {
+    .load_attn_norm = load_attn_norm,
+    .load_attn_q_proj = load_attn_q_proj,
+    .load_attn_k_proj = load_attn_k_proj,
+    .load_attn_v_proj = load_attn_v_proj,
+    .load_attn_q_norm = load_attn_q_norm,
+    .load_attn_k_norm = load_attn_k_norm,
+    .load_attn_o_proj = load_attn_o_proj,
+};
+
+__C __export Qwen3MoEWeights *
+createQwen3MoEWeights(const Qwen3MoEAttentionMeta *meta,
+                      infiniDevice_t device,
+                      int ndev,
+                      const int *dev_ids) {
+    auto weights = new Qwen3MoEWeights(meta, device, ndev, dev_ids);
+    return weights;
+}
+
+__C __export Qwen3MoEWeightLoader *
+createQwen3MoEWeightLoader() {
+    return &weight_loader;
+}
\ No newline at end of file
diff --git a/test/models/qwen3_moe/attention_test.py b/test/models/qwen3_moe/attention_test.py
index 26f66e40..ef0d8d3c 100644
--- a/test/models/qwen3_moe/attention_test.py
+++ b/test/models/qwen3_moe/attention_test.py
@@ -1,484 +1,457 @@
 import os
 import time
 import sys
+import json
 import safetensors
 import torch
+import numpy as np
+import ctypes
+from ctypes import byref, POINTER, c_int, c_float, c_void_p, c_size_t, Structure
 from transformers import AutoConfig
 from transformers import DynamicCache
 from transformers.models import qwen3_moe
 
-WARMUPS = 10
-RUNS = 100
-PREFILL_TESTCASES = {"seqlens": [64, 128, 256, 256], "pastlens": [512, 0, 0, 256]}
+# ==============================================================================
+# 1. Ctypes Setup
+# ==============================================================================
+SO_PATH = "build/linux/x86_64/release/libinfinicore_infer.so"
+if not os.path.exists(SO_PATH):
+    SO_PATH = os.path.expanduser("~/.infini/lib/libinfinicore_infer.so")
+
+if not os.path.exists(SO_PATH):
+    print(f"Warning: Cannot find libinfinicore_infer.so at {SO_PATH}.")
+    LIB_INFINILM = None
+else:
+    LIB_INFINILM = ctypes.CDLL(SO_PATH)
+
+class DataType:
+    INFINI_DTYPE_BF16 = 19
+
+class DeviceType:
+    DEVICE_TYPE_NVIDIA = 1
+
+class Qwen3MoEAttentionMetaCStruct(Structure):
+    _fields_ = [
+        ("dtype", c_int),
+        ("hidden_size", c_size_t),
+        ("num_heads", c_size_t),
+        ("num_kv_head", c_size_t),
+        ("head_dim", c_size_t),
+        ("rope_theta", c_float),
+        ("max_seq_len", c_size_t),
+        ("rms_norm_eps", c_float),
+    ]
+
+class Qwen3MoEWeightLoader(Structure):
+    _fields_ = [
+        ("load_attn_norm", ctypes.CFUNCTYPE(None, c_void_p, c_void_p, c_size_t)),
+        ("load_attn_q_proj", ctypes.CFUNCTYPE(None, c_void_p, c_void_p, c_size_t)),
+        ("load_attn_k_proj", ctypes.CFUNCTYPE(None, c_void_p, c_void_p, c_size_t)),
+        ("load_attn_v_proj", ctypes.CFUNCTYPE(None, c_void_p, c_void_p, c_size_t)),
+        ("load_attn_q_norm", ctypes.CFUNCTYPE(None, c_void_p, c_void_p, c_size_t)),
+        ("load_attn_k_norm", ctypes.CFUNCTYPE(None, c_void_p, c_void_p, c_size_t)),
+        ("load_attn_o_proj", ctypes.CFUNCTYPE(None, c_void_p, c_void_p, c_size_t)),
+    ]
+
+class Qwen3MoEAttention(Structure): pass
+class Qwen3MoEWeights(Structure): pass
+class Qwen3Cache(Structure): pass
+
+if LIB_INFINILM:
+    LIB_INFINILM.createQwen3MoEWeights.restype = POINTER(Qwen3MoEWeights)
+    LIB_INFINILM.createQwen3MoEWeightLoader.restype = POINTER(Qwen3MoEWeightLoader)
+    LIB_INFINILM.createQwen3MoEAttention.restype = POINTER(Qwen3MoEAttention)
+    LIB_INFINILM.createQwen3Cache.restype = POINTER(Qwen3Cache)
+    LIB_INFINILM.createQwen3Cache.argtypes = [POINTER(Qwen3MoEAttentionMetaCStruct), c_size_t, c_size_t]
+
+    LIB_INFINILM.forwardQwen3MoEAttention.argtypes = [
+        POINTER(Qwen3MoEAttention), POINTER(Qwen3Cache),
+        c_void_p, c_void_p, c_int, POINTER(c_int), POINTER(c_int), POINTER(c_int)
+    ]
+    LIB_INFINILM.injectQwen3CacheKV.argtypes = [
+        POINTER(Qwen3MoEAttention), POINTER(Qwen3Cache),
+        c_int, c_int, c_int, c_void_p, c_void_p
+    ]
+
+global_tensor_keepalive = []
+
+def get_ptr(numpy_array):
+    if not numpy_array.flags['C_CONTIGUOUS']:
+        numpy_array = np.ascontiguousarray(numpy_array)
+    ptr = numpy_array.ctypes.data_as(c_void_p)
+    global_tensor_keepalive.append(numpy_array)
+    return ptr
+
+# ==============================================================================
+# 2. InfiniLM Wrapper
+# ==============================================================================
+class InfiniLMWrapper:
+    def __init__(self, config, torch_model, device_id=0):
+        if not LIB_INFINILM: raise RuntimeError("Library not loaded")
+        
+        # [TRUTH] 物理真值是 128
+        self.real_hidden = config.hidden_size 
+        real_head_dim = 128
+        
+        self.meta = Qwen3MoEAttentionMetaCStruct(
+            dtype=DataType.INFINI_DTYPE_BF16, 
+            hidden_size=config.hidden_size, 
+            num_heads=config.num_attention_heads,
+            num_kv_head=config.num_key_value_heads,
+            head_dim=real_head_dim,
+            rope_theta=config.rope_theta,
+            max_seq_len=8192,
+            rms_norm_eps=config.rms_norm_eps
+        )
+        self.weights_handle = LIB_INFINILM.createQwen3MoEWeights(byref(self.meta), DeviceType.DEVICE_TYPE_NVIDIA, 1, (c_int * 1)(device_id))
+        self.loader = LIB_INFINILM.createQwen3MoEWeightLoader()
+        self._load_weights(torch_model)
+        self.attn_ctx = LIB_INFINILM.createQwen3MoEAttention(byref(self.meta), self.weights_handle)
+        self.kv_cache = LIB_INFINILM.createQwen3Cache(byref(self.meta), 0, 0)
+    
+    def _load_weights(self, model):
+        def load(tensor, loader_func, transpose=False):
+            if tensor is None: return
+            w_pt = tensor.detach().to(torch.float32)
+            if transpose: w_pt = w_pt.t()
+            w_bf16 = w_pt.to(torch.bfloat16).view(torch.int16).cpu().numpy()
+            loader_func(self.weights_handle, get_ptr(w_bf16), 0)
+
+        load(model.q_proj.weight, self.loader.contents.load_attn_q_proj, transpose=True)
+        load(model.k_proj.weight, self.loader.contents.load_attn_k_proj, transpose=True)
+        load(model.v_proj.weight, self.loader.contents.load_attn_v_proj, transpose=True)
+        load(model.o_proj.weight, self.loader.contents.load_attn_o_proj, transpose=True)
+        
+        if hasattr(model, 'q_norm') and model.q_norm is not None:
+            load(model.q_norm.weight, self.loader.contents.load_attn_q_norm, transpose=False)
+        if hasattr(model, 'k_norm') and model.k_norm is not None:
+            load(model.k_norm.weight, self.loader.contents.load_attn_k_norm, transpose=False)
+
+    def inject_cache(self, layer_id, batch_idx, k_torch, v_torch):
+        """
+        将 PyTorch 的 KV Cache (BFloat16) 注入到 InfiniLM 的 Cache 中
+        k_torch, v_torch shape: [num_kv_heads, past_len, head_dim]
+        """
+        if k_torch is None or v_torch is None: return
+        
+        # 转换为 numpy int16 (模拟 bf16) 且保证 C 连续
+        k_np = k_torch.detach().cpu().view(torch.int16).numpy().copy(order='C')
+        v_np = v_torch.detach().cpu().view(torch.int16).numpy().copy(order='C')
+        past_len = k_np.shape[1] 
+        
+        LIB_INFINILM.injectQwen3CacheKV(
+            self.attn_ctx, self.kv_cache,
+            c_int(layer_id), c_int(batch_idx), c_int(past_len),
+            get_ptr(k_np), get_ptr(v_np)
+        )
 
-DECODE_TESTCASES = {
-    "seqlens": [1 for _ in range(16)],
-    "pastlens": [50 for _ in range(4)]
-    + [100 for _ in range(4)]
-    + [200 for _ in range(4)]
-    + [400 for _ in range(4)],
-}
+    def forward(self, input_bf16_np, batch_size, seq_lens, past_lens, pos_ids, return_raw=False):
+        q_out_dim = self.meta.num_heads * self.meta.head_dim 
+        out_dim = q_out_dim if return_raw else self.real_hidden
+        output = np.zeros((input_bf16_np.shape[0], out_dim), dtype=np.int16)
+        
+        LIB_INFINILM.forwardQwen3MoEAttention(
+            self.attn_ctx, self.kv_cache, 
+            get_ptr(input_bf16_np), get_ptr(output),
+            c_int(batch_size), (c_int*batch_size)(*seq_lens), 
+            (c_int*batch_size)(*past_lens), (c_int*len(pos_ids))(*pos_ids)
+        )
+        return output
 
+# ==============================================================================
+# 3. Utilities
+# ==============================================================================
+WARMUPS = 10
+RUNS = 100
+PREFILL_TESTCASES = {"seqlens": [64,128,256,256], "pastlens": [512,0,0,256]}
+DECODE_TESTCASES = {"seqlens": [1] * 16, "pastlens": [504]*4 + [1004]*4 + [2004]*4 + [4004]*4}
 
 def get_args():
     import argparse
-
-    parser = argparse.ArgumentParser(description="Test Operator")
-    parser.add_argument(
-        "--model_path",
-        action="store",
-        help="The directory of the model to be tested",
-    )
-
-    parser.add_argument(
-        "--cpu",
-        action="store_true",
-        help="Run cpu test",
-    )
-
-    parser.add_argument(
-        "--nvidia",
-        action="store_true",
-        help="Run nvidia test",
-    )
-
-    parser.add_argument(
-        "--metax",
-        action="store_true",
-        help="Run metax test",
-    )
-    parser.add_argument(
-        "--moore",
-        action="store_true",
-        help="Run moore test",
-    )
-    parser.add_argument(
-        "--iluvatar",
-        action="store_true",
-        help="Run iluvatar test",
-    )
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model_path", required=True)
+    parser.add_argument("--nvidia", action="store_true")
     return parser.parse_args()
 
+def create_Qwen3attention_torch(dir_path, device, dtype=torch.bfloat16):
+    config = AutoConfig.from_pretrained(dir_path)
 
-def torch_synchronize(_device):
-    if _device == "cuda":
-        torch.cuda.synchronize()
-    elif _device == "musa":
-        torch.musa.synchronize()
-
-
-def torch_empty_cache(_device):
-    if _device == "cuda":
-        torch.cuda.empty_cache()
-    elif _device == "musa":
-        torch.musa.empty_cache()
-
+    real_head_dim = 128 
+    config.head_dim = real_head_dim 
 
-def create_Qwen3attention_torch(dir_path, *, device, dtype=torch.bfloat16):
-    config = AutoConfig.from_pretrained(dir_path)
     config.num_hidden_layers = 1
     config._attn_implementation = "sdpa"
-
-    # --------------------------------------------------------------------------------#
-    #                创建只包含 attention的模型
-    # --------------------------------------------------------------------------------#
-    model = qwen3_moe.modeling_qwen3_moe.Qwen3MoeAttention(config, layer_idx=0).to(
-        device=device, dtype=dtype
-    )
+    
+    model = qwen3_moe.modeling_qwen3_moe.Qwen3MoeAttention(config, layer_idx=0).to(device=device, dtype=dtype)
+    
     tensors = {}
     for fname in sorted(os.listdir(dir_path)):
-        if not fname.endswith(".safetensors"):
-            continue
-        fpath = os.path.join(dir_path, fname)
-        with safetensors.safe_open(fpath, framework="pt") as f:
+        if not fname.endswith(".safetensors"): continue
+        with safetensors.safe_open(os.path.join(dir_path, fname), framework="pt") as f:
             for key in f.keys():
                 if "model.layers.0.self_attn." in key:
                     tensors[key[len("model.layers.0.self_attn.") :]] = f.get_tensor(key)
         break
-    model.load_state_dict(tensors)
-
-    # --------------------------------------------------------------------------------#
-    #                创建 rotary_emb 类
-    # --------------------------------------------------------------------------------#
-    rotary_emb = qwen3_moe.modeling_qwen3_moe.Qwen3MoeRotaryEmbedding(
-        config, device=device
-    )
-    return model, rotary_emb
-
-
-def generate_attention_input_torch(
-    model, rotary_emb, testcase, device, dtype=torch.bfloat16
-):
+    
+    model.load_state_dict(tensors, strict=False)
+    
+    if model.q_proj.bias is not None: torch.nn.init.zeros_(model.q_proj.bias)
+    if model.k_proj.bias is not None: torch.nn.init.zeros_(model.k_proj.bias)
+    if model.v_proj.bias is not None: torch.nn.init.zeros_(model.v_proj.bias)
+    if model.o_proj.bias is not None: torch.nn.init.zeros_(model.o_proj.bias)
+
+    rotary_emb = qwen3_moe.modeling_qwen3_moe.Qwen3MoeRotaryEmbedding(config, device=device)
+    return model, rotary_emb, config
+
+def prepare_inputs(model, testcase, device, dtype):
     config = model.config
-    hidden_size = config.hidden_size  # 2048
-    head_dim = config.head_dim  # 128
-    num_key_value_heads = config.num_key_value_heads
     bs = 1
-
     req_list = []
+    
     for seq_lens, past_lens in zip(testcase["seqlens"], testcase["pastlens"]):
-        hidden_states = torch.rand(
-            (bs, seq_lens, hidden_size), device=device, dtype=dtype
-        )
-
-        attention_mask = None
-
+        hidden_states = torch.rand((bs, seq_lens, config.hidden_size), device=device, dtype=dtype)
         past_key_values = DynamicCache(config=config)
-        key_states = torch.rand(
-            (bs, num_key_value_heads, past_lens, head_dim), device=device, dtype=dtype
-        )
-        value_states = torch.rand(
-            (bs, num_key_value_heads, past_lens, head_dim), device=device, dtype=dtype
-        )
-        past_key_values.update(key_states, value_states, 0)
-
-        req = {
-            "hidden_states": hidden_states,
-            "attention_mask": attention_mask,
-            "past_key_values": past_key_values,
-        }
-        req_list.append(req)
-
-    return req_list
-
-
-def benchmark_Qwen3attention_prefill_torch(
-    model, rotary_emb, test_cases, device, dtype=torch.bfloat16
-):
-    """
-    Test Qwen3attention.
-
-    """
-    req_list = generate_attention_input_torch(
-        model, rotary_emb, test_cases, device, dtype=dtype
-    )
-    req_out_list = []
-    for req in req_list:
-        # ----------------------------------------- #
-        #         获得每个req的数据
-        # ----------------------------------------- #
-        hidden_states = req["hidden_states"]
-        attention_mask = req["attention_mask"]
-        past_key_values = req["past_key_values"]
-
-        # ----------------------------------------- #
-        #         计算当前所需的sin_table，sin_table
-        # ----------------------------------------- #
-        cache_lens = past_key_values.get_seq_length()  # kv cache 现在的长度
-        bs, seq_len, _ = hidden_states.shape
-
-        position_ids = torch.arange(
-            cache_lens, cache_lens + seq_len, dtype=torch.int64, device=device
-        ).reshape((bs, seq_len))
-
-        cos_table, sin_table = rotary_emb(hidden_states, position_ids)
-        position_embeddings = (sin_table, cos_table)
-
-        # ----------------------------------------- #
-        #            计算一次
-        # ----------------------------------------- #
-        output_device, _ = model(
-            hidden_states=hidden_states,
-            position_embeddings=position_embeddings,
-            attention_mask=attention_mask,
-            past_key_values=past_key_values,
-        )
-
-        # ----------------------------------------- #
-        #            得到结果，存储下来
-        # ----------------------------------------- #
-        output_host = output_device.to("cpu")
-        req_out_list.append(output_host)
-
-    torch_synchronize(device)
-
+        
+        # [CRITICAL] 恢复为 torch.rand！
+        # 现在我们通过 inject_cache 保证 C++ 拿到完全一样的随机数
+        if past_lens > 0:
+            k = torch.rand((bs, config.num_key_value_heads, past_lens, config.head_dim), device=device, dtype=dtype)
+            v = torch.rand((bs, config.num_key_value_heads, past_lens, config.head_dim), device=device, dtype=dtype)
+            past_key_values.update(k, v, 0)
+        req_list.append({"hidden_states": hidden_states, "attention_mask": None, "past_key_values": past_key_values})
+
+    all_hs = [req["hidden_states"].squeeze(0) for req in req_list]
+    flat_input = torch.cat(all_hs, dim=0) 
+    
+    input_np = flat_input.cpu().view(torch.int16).numpy().copy(order='C')
+    
+    seq_lens = testcase["seqlens"]
+    past_lens = testcase["pastlens"]
+    pos_ids = []
+    for s, p in zip(seq_lens, past_lens):
+        pos_ids.extend(range(p, p+s))
+        
+    return req_list, input_np, seq_lens, past_lens, pos_ids
+
+def check_correctness_prefill(torch_outs, infinilm_out_np, device):
+    if not torch_outs:
+        print("❌ Error: Torch Output is empty.")
+        return
+
+    torch_flat = torch.cat([out.float().view(-1, out.shape[-1]) for out in torch_outs], dim=0).to("cpu")
+    
+    infini_tensor_int16 = torch.from_numpy(infinilm_out_np)
+    infini_flat = infini_tensor_int16.view(torch.bfloat16).float().view(-1, torch_flat.shape[-1])
+
+    cos_sim = torch.nn.functional.cosine_similarity(torch_flat, infini_flat, dim=-1).mean().item()
+    print(f"Cosine Similarity: {cos_sim:.6f}")
+    
+    if cos_sim > 0.98: print("✅ Result Match")
+    else: print("❌ Result Mismatch")
+
+def check_correctness_decode(torch_outs, infinilm_out_np, device):
+    if not torch_outs:
+        print("❌ Error: Torch Output is empty.")
+        return
+
+    torch_flat = torch.cat([out.float().view(-1, out.shape[-1]) for out in torch_outs], dim=0).to("cpu")
+    
+    infini_tensor_int16 = torch.from_numpy(infinilm_out_np)
+    infini_flat = infini_tensor_int16.view(torch.bfloat16).float().view(-1, torch_flat.shape[-1])
+
+    cos_sim = torch.nn.functional.cosine_similarity(torch_flat, infini_flat, dim=-1).mean().item()
+    print(f"Cosine Similarity: {cos_sim:.6f}")
+    ## for decode, 0.95 enough 
+    if cos_sim > 0.95: print("✅ Result Match")
+    else: print("❌ Result Mismatch")
+
+
+def benchmark_prefill(model, rotary_emb, infinilm_model, test_cases, device, dtype):
+    print(f"\n{'='*40} PREFILL {'='*40}")
+    req_list, input_np, seq_lens, past_lens, pos_ids = prepare_inputs(model, test_cases, device, dtype)
+    batch_size = len(seq_lens)
+    
+    # =======================================================
+    # Torch Run
+    # =======================================================
     for _ in range(WARMUPS):
         for i, req in enumerate(req_list):
-            # ----------------------------------------- #
-            #          恢复 kv chche的长度
-            # ----------------------------------------- #
-            origin_len = test_cases["pastlens"][i]
-            req["past_key_values"].crop(origin_len)
-
-        for req in req_list:
-            # ----------------------------------------- #
-            #         获得每个req的数据
-            # ----------------------------------------- #
-
-            hidden_states = req["hidden_states"]
-            attention_mask = req["attention_mask"]
-            past_key_values = req["past_key_values"]
-
-            # ----------------------------------------- #
-            #         计算当前所需的sin_table，sin_table
-            # ----------------------------------------- #
-            cache_lens = past_key_values.get_seq_length()  # kv cache 现在的长度
-            bs, seq_len, _ = hidden_states.shape
-
-            position_ids = torch.arange(
-                cache_lens, cache_lens + seq_len, dtype=torch.int64, device=device
-            ).reshape((bs, seq_len))
-
-            cos_table, sin_table = rotary_emb(hidden_states, position_ids)
-            position_embeddings = (sin_table, cos_table)
-
-            # ----------------------------------------- #
-            #            计算一次
-            # ----------------------------------------- #
-            output_device, _ = model(
-                hidden_states,
-                position_embeddings=position_embeddings,
-                attention_mask=attention_mask,
-                past_key_values=past_key_values,
-            )
-
+            req["past_key_values"].crop(past_lens[i])
+            cache_len = req["past_key_values"].get_seq_length()
+            seq_len = req["hidden_states"].shape[1]
+            pids = torch.arange(cache_len, cache_len+seq_len, device=device).reshape(1, seq_len)
+            cos, sin = rotary_emb(req["hidden_states"], pids)
+            _ = model(req["hidden_states"], position_embeddings=(sin, cos), 
+                      attention_mask=req["attention_mask"],
+                      past_key_values=req["past_key_values"])
+    torch.cuda.synchronize()
+
+    
+    torch_out_list = []
     time_consuming = 0
-    for _ in range(RUNS):
+    for run_idx in range(RUNS):
         for i, req in enumerate(req_list):
-            # ----------------------------------------- #
-            #          恢复 kv chche的长度
-            # ----------------------------------------- #
-            origin_len = test_cases["pastlens"][i]
-            req["past_key_values"].crop(origin_len)
-
-        torch_synchronize(device)
+            # 1. Reset KV Cache to initial state
+            req["past_key_values"].crop(past_lens[i])
+            cache_len = req["past_key_values"].get_seq_length()
+            seq_len = req["hidden_states"].shape[1]
+            
+            q_len = seq_len
+            k_len = cache_len + seq_len
+            past_len = cache_len
+            
+            causal_mask = torch.zeros((q_len, k_len), device=device, dtype=dtype)
+            for j in range(q_len):
+                valid_limit = past_len + j + 1
+                if valid_limit < k_len:
+                    causal_mask[j, valid_limit:] = float("-inf")
+            req["attention_mask"] = causal_mask[None, None, :, :]
         # ----------------------------------------- #
         #       重要：每个req都按整个batch的起始时间计算
         # ----------------------------------------- #
-        start_time = time.time()
-
-        for i, req in enumerate(req_list):
-            # ----------------------------------------- #
-            #         获得每个req的数据
-            # ----------------------------------------- #
-            hidden_states = req["hidden_states"]
-            attention_mask = req["attention_mask"]
-            past_key_values = req["past_key_values"]
-
-            # ----------------------------------------- #
-            #         计算当前所需的sin_table，sin_table
-            # ----------------------------------------- #
-            cache_lens = past_key_values.get_seq_length()  # kv cache 现在的长度
-            bs, seq_len, _ = hidden_states.shape
-
-            position_ids = torch.arange(
-                cache_lens, cache_lens + seq_len, dtype=torch.int64, device=device
-            ).reshape((bs, seq_len))
-
-            cos_table, sin_table = rotary_emb(hidden_states, position_ids)
-            position_embeddings = (sin_table, cos_table)
-
-            # ----------------------------------------- #
-            #            计算一次
-            # ----------------------------------------- #
-            output_device, _ = model(
-                hidden_states,
-                position_embeddings=position_embeddings,
-                attention_mask=attention_mask,
-                past_key_values=past_key_values,
-            )
-
-            torch_synchronize(device)
+        torch.cuda.synchronize()
+        start = time.time()
+        for i, req in enumerate(req_list):     
+            req["past_key_values"].crop(past_lens[i])
+            cache_len = req["past_key_values"].get_seq_length()
+            seq_len = req["hidden_states"].shape[1]
+            # Position IDs
+            pids = torch.arange(cache_len, cache_len+seq_len, device=device).reshape(1, seq_len)
+            cos, sin = rotary_emb(req["hidden_states"], pids)
+            out, _ = model(req["hidden_states"], position_embeddings=(sin, cos), 
+                           attention_mask=req["attention_mask"],
+                           past_key_values=req["past_key_values"])
+            torch.cuda.synchronize()
             end_time = time.time()
-
-            # 记录每个req从进入所有req进入推理到自己结束的时间
-            time_consuming += end_time - start_time
-
+            time_consuming += end_time - start
+            if run_idx == RUNS - 1:
+                torch_out_list.append(out.detach().to("cpu"))
+    torch.cuda.synchronize()
     out_token_count = RUNS * len(req_list)
+    t_lat = time_consuming * 1000 / out_token_count
 
-    latency = time_consuming * 1000 / out_token_count
-
-    print(
-        f"\t WARMUPS={WARMUPS} RUNS={RUNS}, Attention Torch, average TTFT: {round(latency, 2)} ms\n"
-    )
-
-    return req_out_list
-
-
-def benchmark_Qwen3attention_decode_torch(
-    model, rotary_emb, test_cases, device, dtype=torch.bfloat16
-):
-    """
-    Test Qwen3attention_decode.
-    """
-    req_list = generate_attention_input_torch(
-        model, rotary_emb, test_cases, device, dtype=dtype
-    )
-    req_out_list = []
-    for req in req_list:
-        # ----------------------------------------- #
-        #         获得每个req的数据
-        # ----------------------------------------- #
-        hidden_states = req["hidden_states"]
-        attention_mask = req["attention_mask"]
-        past_key_values = req["past_key_values"]
-
-        # ----------------------------------------- #
-        #         计算当前所需的sin_table，sin_table
-        # ----------------------------------------- #
-        cache_lens = past_key_values.get_seq_length()  # kv cache 现在的长度
-        bs, seq_len, _ = hidden_states.shape
-
-        position_ids = torch.arange(
-            cache_lens, cache_lens + seq_len, dtype=torch.int64, device=device
-        ).reshape((bs, seq_len))
-
-        cos_table, sin_table = rotary_emb(hidden_states, position_ids)
-        position_embeddings = (sin_table, cos_table)
-
-        ##
-        output_device, _ = model(
-            hidden_states=hidden_states,
-            position_embeddings=position_embeddings,
-            attention_mask=attention_mask,
-            past_key_values=past_key_values,
-        )
-        output_host = output_device.to("cpu")
-
-        req_out_list.append(output_host)
-
-    torch_synchronize(device)
-
-    for req in req_list:
-        for _ in range(WARMUPS):
-            hidden_states = req["hidden_states"]
-            attention_mask = req["attention_mask"]
-            past_key_values = req["past_key_values"]
-
-            # ----------------------------------------- #
-            #         计算当前所需的sin_table，sin_table
-            # ----------------------------------------- #
-            cache_lens = past_key_values.get_seq_length()  # kv cache 现在的长度
-            bs, seq_len, _ = hidden_states.shape
-
-            position_ids = torch.arange(
-                cache_lens, cache_lens + seq_len, dtype=torch.int64, device=device
-            ).reshape((bs, seq_len))
-
-            cos_table, sin_table = rotary_emb(hidden_states, position_ids)
-            position_embeddings = (sin_table, cos_table)
-
-            # ----------------------------------------- #
-            #            计算一次
-            # ----------------------------------------- #
-
-            output_device, _ = model(
-                hidden_states,
-                position_embeddings=position_embeddings,
-                attention_mask=attention_mask,
-                past_key_values=past_key_values,
-            )
-
-    # ----------------------------------------- #
-    #          恢复 kv chche的长度
-    # ----------------------------------------- #
+    # =======================================================
+    # InfiniLM Run
+    # =======================================================
+    print(">>> Injecting Cache to InfiniLM...")
     for i, req in enumerate(req_list):
-        origin_len = test_cases["pastlens"][i]
-        req["past_key_values"].crop(origin_len)
+        if past_lens[i] > 0:
+            k_cache = req["past_key_values"][0][0].squeeze(0) 
+            v_cache = req["past_key_values"][0][1].squeeze(0)
+            infinilm_model.inject_cache(0, i, k_cache, v_cache)
 
-    torch_synchronize(device)
-    start_time = time.time()
+    for _ in range(WARMUPS):
+        _ = infinilm_model.forward(input_np, batch_size, seq_lens, past_lens, pos_ids, return_raw=False)
+    torch.cuda.synchronize()
 
-    for i, req in enumerate(req_list):
-        for _ in range(RUNS):
-            # ----------------------------------------- #
-            #         获得每个req的数据
-            # ----------------------------------------- #
-            hidden_states = req["hidden_states"]
-            attention_mask = req["attention_mask"]
-            past_key_values = req["past_key_values"]
-
-            # -------------------------------------------------------------- #
-            #         计算当前所需的sin_table，sin_table
-            # -------------------------------------------------------------- #
-            cache_lens = past_key_values.get_seq_length()  # kv cache 现在的长度
-            bs, seq_len, _ = hidden_states.shape
-
-            position_ids = torch.arange(
-                cache_lens, cache_lens + seq_len, dtype=torch.int64, device=device
-            ).reshape((bs, seq_len))
-
-            cos_table, sin_table = rotary_emb(hidden_states, position_ids)
-            position_embeddings = (sin_table, cos_table)
-
-            # -------------------------------------------------------------- #
-            #            计算一次
-            # -------------------------------------------------------------- #
-            output_device, _ = model(
-                hidden_states,
-                position_embeddings=position_embeddings,
-                attention_mask=attention_mask,
-                past_key_values=past_key_values,
-            )
-
-            # -------------------------------------------------------------- #
-            #            更新hidden_states, ( DynamicCache的类自动更新)
-            # -------------------------------------------------------------- #
-            req["hidden_states"] = output_device
-
-    torch_synchronize(device)
-    end_time = time.time()
-
-    time_consuming = end_time - start_time
+    start = time.time()
+    infini_out = None
+    for _ in range(RUNS):
+        # Repeatedly run with same inputs (simulating same-shape prefill)
+        # Note: We assume InfiniLM overwrites/resets based on past_lens parameter
+        infini_out = infinilm_model.forward(input_np, batch_size, seq_lens, past_lens, pos_ids, return_raw=False)
+    torch.cuda.synchronize()
     out_token_count = RUNS * len(req_list)
+    i_lat = (time.time() - start) * 1000 / out_token_count
 
-    throughput = out_token_count / time_consuming
+    print(f"Latency: Torch={t_lat:.3f}ms, Infini={i_lat:.3f}ms")
+    check_correctness_prefill(torch_out_list, infini_out, device)
 
-    print(
-        f"\t WARMUPS={WARMUPS} RUNS={RUNS}, Attention Torch, average throughput: {round(throughput, 2)} tok/s \n"
-    )
 
-    return req_out_list
+def benchmark_decode(model, rotary_emb, infinilm_model, test_cases, device, dtype):
+    print(f"\n{'='*40} DECODE {'='*40}")
+    req_list, input_np, seq_lens, past_lens, pos_ids = prepare_inputs(model, test_cases, device, dtype)
+    batch_size = len(seq_lens)
+    total_tokens_per_round = sum(seq_lens)
 
+    # Capture initial KV for InfiniLM injection (before Torch modifies them)
+    initial_kv = []
+    for req in req_list:
+        if req["past_key_values"].get_seq_length() > 0:
+            k = req["past_key_values"][0][0].detach().clone()
+            v = req["past_key_values"][0][1].detach().clone()
+            initial_kv.append((k, v))
+        else:
+            initial_kv.append(None)
+
+    # =======================================================
+    # Torch Run
+    # =======================================================
+    # Note: No Warmup mentioned in requirements for "Sequential inference 100 rounds", 
+    # but usually we might warm up. However, since state changes, warmup is part of the sequence.
+    # We will just run the 100 rounds as the benchmark.
+
+    torch_out_list = []
+    torch.cuda.synchronize()
+    start = time.time()
+    for run_idx in range(RUNS):
+        for i, req in enumerate(req_list):
+            # Do NOT crop cache - let it grow
+            cache_len = req["past_key_values"].get_seq_length()
+            seq_len = req["hidden_states"].shape[1] # Should be 1
+            
+            pids = torch.arange(cache_len, cache_len+seq_len, device=device).reshape(1, seq_len)
+            cos, sin = rotary_emb(req["hidden_states"], pids)
+            
+            # Decode: attention_mask is None (causal implied for len 1)
+            out, _ = model(req["hidden_states"], position_embeddings=(sin, cos), 
+                           attention_mask=None,
+                           past_key_values=req["past_key_values"])
+            
+            # Update input for next round
+            req["hidden_states"] = out
+            
+            if run_idx == RUNS - 1:
+                torch_out_list.append(out.detach().to("cpu"))
+                
+    torch.cuda.synchronize()
+    end = time.time()
+    t_throughput = (total_tokens_per_round * RUNS) / (end - start)
+
+    # =======================================================
+    # InfiniLM Run
+    # =======================================================
+    print(">>> Injecting Cache to InfiniLM...")
+    for i, kv in enumerate(initial_kv):
+        if kv is not None:
+            k_cache, v_cache = kv
+            k_cache = k_cache.squeeze(0)
+            v_cache = v_cache.squeeze(0)
+            infinilm_model.inject_cache(0, i, k_cache, v_cache)
+            
+    curr_input_np = input_np.copy()
+    curr_past_lens_np = np.array(past_lens, dtype=np.int32)
+    curr_pos_ids_np = np.array(pos_ids, dtype=np.int32)
+    
+    start = time.time()
+    infini_out = None
+    
+    for run_idx in range(RUNS):
+        out_np = infinilm_model.forward(curr_input_np, batch_size, seq_lens, curr_past_lens_np, curr_pos_ids_np, return_raw=False)
+        
+        if run_idx < RUNS - 1:
+            # Update inputs for next round
+            curr_input_np = out_np
+            
+            curr_past_lens_np = [x + 1 for x in curr_past_lens_np]
+            curr_pos_ids_np = [x + 1 for x in curr_pos_ids_np]
+                
+        infini_out = out_np
+        
+    torch.cuda.synchronize()
+    end = time.time()
+    i_throughput = (total_tokens_per_round * RUNS) / (end - start)
+
+    print(f"Throughput: Torch={t_throughput:.1f} tok/s, Infini={i_throughput:.1f} tok/s")
+    check_correctness_decode(torch_out_list, infini_out, device)
 
 if __name__ == "__main__":
+    
     args = get_args()
-    print(args)
-
-    model_path = args.model_path
-    dtype = torch.bfloat16
-
-    # Parse command line arguments
-    device = "cpu"
-    if args.cpu:
-        device = "cpu"
-    elif args.nvidia:
-        device = "cuda"
-    elif args.metax:
-        device = "cuda"
-    elif args.moore:
-        device = "musa"
-        import torch_musa
-    elif args.iluvatar:
-        device = "cuda"
-    else:
-        print(
-            "Usage:  python test/models/qwen3_moe/attention_test.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=<path/to/model_path>"
-        )
-        sys.exit(1)
-
-    # -----------------------------------------------------------------------------
-    # -----------------------------------------------------------------------------
-    # -----------------------------------------------------------------------------
-    model, rotary_emb = create_Qwen3attention_torch(
-        model_path, device=device, dtype=dtype
-    )
-    print("\n")
-    print("*" * 130)
-    print("Test Qwen3attention ")
-    print("*" * 130)
-    print(f"Test Case PREFILL_TESTCASES : {PREFILL_TESTCASES}")
-    output_prefill = benchmark_Qwen3attention_prefill_torch(
-        model, rotary_emb, PREFILL_TESTCASES, device, dtype=dtype
-    )
-
-    print("\n")
-    print("-" * 130)
-    print(f"\nTest DECODE_TESTCASES: {DECODE_TESTCASES}")
-    output_decode = benchmark_Qwen3attention_decode_torch(
-        model, rotary_emb, DECODE_TESTCASES, device, dtype=dtype
-    )
-
-    # clean up device memory
-    del model
-    torch_empty_cache(device)
+    device = "cuda" if args.nvidia else "cpu"
+    
+    torch_model, rotary, cfg = create_Qwen3attention_torch(args.model_path, device)
+    infini_model = InfiniLMWrapper(cfg, torch_model)
+    
+    benchmark_prefill(torch_model, rotary, infini_model, PREFILL_TESTCASES, device, torch.bfloat16)
+    benchmark_decode(torch_model, rotary, infini_model, DECODE_TESTCASES, device, torch.bfloat16)
\ No newline at end of file
diff --git a/xmake.lua b/xmake.lua
index ad636197..e403c741 100644
--- a/xmake.lua
+++ b/xmake.lua
@@ -15,7 +15,8 @@ target("infinicore_infer")
 
     add_linkdirs(INFINI_ROOT.."/lib")
     add_links("infiniop", "infinirt", "infiniccl")
-
+    add_syslinks("cudart")  -- 用了cuda runtime 暂时link一下 后续fix
+    add_rules("cuda") 
     set_languages("cxx17")
     set_warnings("all", "error")
 
@@ -55,4 +56,4 @@ target("_infinilm")
     add_files("csrc/**.cc")
 
     set_installdir("python/infinilm")
-target_end()
+target_end()
\ No newline at end of file