From 221fd709e23392bd8e586505dbc556cd329dd97a Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Fri, 20 Mar 2026 00:46:39 +0800 Subject: [PATCH 01/29] refactor perftest 1. support different kv backends 2. support intra-node and inter-node client placement 3. remove ray bandwidth test Signed-off-by: tianyi-ge --- scripts/README_PERFTEST.md | 110 ++++++ scripts/configs/mooncake.yaml | 4 + scripts/configs/transferqueue.yaml | 6 + scripts/configs/yuanrong.yaml | 4 + scripts/perftest.py | 560 +++++++++++++++++++++++++++++ 5 files changed, 684 insertions(+) create mode 100644 scripts/README_PERFTEST.md create mode 100644 scripts/configs/mooncake.yaml create mode 100644 scripts/configs/transferqueue.yaml create mode 100644 scripts/configs/yuanrong.yaml create mode 100644 scripts/perftest.py diff --git a/scripts/README_PERFTEST.md b/scripts/README_PERFTEST.md new file mode 100644 index 0000000..b01f37f --- /dev/null +++ b/scripts/README_PERFTEST.md @@ -0,0 +1,110 @@ +# TransferQueue Throughput Test + +This script runs throughput tests for TransferQueue with different backends. + +## Prerequisites + +1. Start Ray cluster with node resources: + ```bash + # On head node + ray start --head --resources='{"node:192.168.0.1":1}' + + # On worker node + ray start --address=192.168.0.1 --resources='{"node:192.168.0.2":1}' + ``` + +2. Start the backend service (yuanrong, mooncake, etc.) if testing non-default backends. 
+ +## Usage + +```bash +python perftest.py \ + --backend=[default|yuanrong|mooncake] \ + --client_placement=[intra_node|inter_node] \ + --backend_config=xxx.yaml \ + --device=[cpu|npu|gpu] \ + --global_batch_size=1024 \ + --field_num=10 \ + --seq_len=8192 \ + --num_global_batch=1 \ + --head_node_ip=192.168.0.1 \ + --worker_node_ip=192.168.0.2 +``` + +## Arguments + +| Argument | Description | Default | +|----------|-------------|---------| +| `--backend` | Backend type: default, yuanrong, mooncake | default | +| `--client_placement` | Client placement: intra_node or inter_node | intra_node | +| `--backend_config` | Path to YAML config file (optional) | None | +| `--device` | Device: cpu, npu, gpu | cpu | +| `--global_batch_size` | Global batch size | 1024 | +| `--field_num` | Number of fields | 10 | +| `--seq_len` | Sequence length | 8192 | +| `--num_global_batch` | Number of global batches | 1 | +| `--head_node_ip` | Head node IP (required) | - | +| `--worker_node_ip` | Worker node IP (required for inter_node) | None | +| `--ray_address` | Ray cluster address | auto | + +## Backend Configuration + +Sample config files are in `configs/`: + +- **transferqueue.yaml**: Default backend config + ```yaml + num_data_storage_units: 8 + storage_unit_placement: normal # or "remote" + ``` + +- **yuanrong.yaml**: Yuanrong backend config + ```yaml + host: 127.0.0.1 + port: 31501 + enable_yr_npu_transport: false + ``` + +- **mooncake.yaml**: Mooncake backend config + ```yaml + local_hostname: 127.0.0.1 + metadata_server: 127.0.0.1:8080 + master_server_address: 127.0.0.1:8081 + ``` + +## Examples + +### Intra-node test with default backend +```bash +python perftest.py --backend=default --client_placement=intra_node \ + --head_node_ip=192.168.0.1 +``` + +### Inter-node test with yuanrong backend +```bash +python perftest.py --backend=yuanrong --client_placement=inter_node \ + --backend_config=configs/yuanrong.yaml \ + --head_node_ip=192.168.0.1 --worker_node_ip=192.168.0.2 +``` + 
+### Default backend with remote storage units +```bash +python perftest.py --backend=default --client_placement=intra_node \ + --backend_config=configs/transferqueue.yaml \ + --head_node_ip=192.168.0.1 --worker_node_ip=192.168.0.2 +``` + +### NPU device test +```bash +python perftest.py --backend=mooncake --device=npu \ + --head_node_ip=192.168.0.1 +``` + +## Output + +The test prints: +- Total data size +- PUT time and throughput +- GET time and throughput +- Total round-trip throughput + +Throughput is shown in both Gb/s (gigabits per second) and GB/s (gigabytes per second). diff --git a/scripts/configs/mooncake.yaml b/scripts/configs/mooncake.yaml new file mode 100644 index 0000000..320801f --- /dev/null +++ b/scripts/configs/mooncake.yaml @@ -0,0 +1,4 @@ +# Mooncake backend configuration +local_hostname: 127.0.0.1 +metadata_server: 127.0.0.1:8080 +master_server_address: 127.0.0.1:8081 diff --git a/scripts/configs/transferqueue.yaml b/scripts/configs/transferqueue.yaml new file mode 100644 index 0000000..9f55742 --- /dev/null +++ b/scripts/configs/transferqueue.yaml @@ -0,0 +1,6 @@ +# TransferQueue (default) backend configuration +num_data_storage_units: 8 +# storage_unit_placement: "normal" (default) or "remote" +# - normal: create storage units on current node using placement group +# - remote: create all storage units on WORKER_NODE_IP +storage_unit_placement: normal diff --git a/scripts/configs/yuanrong.yaml b/scripts/configs/yuanrong.yaml new file mode 100644 index 0000000..2df1b84 --- /dev/null +++ b/scripts/configs/yuanrong.yaml @@ -0,0 +1,4 @@ +# Yuanrong backend configuration +host: 127.0.0.1 +port: 31501 +enable_yr_npu_transport: false diff --git a/scripts/perftest.py b/scripts/perftest.py new file mode 100644 index 0000000..ca10775 --- /dev/null +++ b/scripts/perftest.py @@ -0,0 +1,560 @@ +# Copyright 2025 Huawei Technologies Co., Ltd. All Rights Reserved. 
+# Copyright 2025 The TransferQueue Team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import logging +import math +import random +import sys +import time +from pathlib import Path +from typing import Any + +import ray +import torch +from omegaconf import OmegaConf +from tensordict import TensorDict +from tensordict.tensorclass import NonTensorData + +parent_dir = Path(__file__).resolve().parent.parent +sys.path.append(str(parent_dir)) + +from transfer_queue.client import TransferQueueClient # noqa: E402 +from transfer_queue.controller import TransferQueueController # noqa: E402 +from transfer_queue.storage.simple_backend import SimpleStorageUnit # noqa: E402 +from transfer_queue.utils.common import get_placement_group # noqa: E402 +from transfer_queue.utils.zmq_utils import process_zmq_server_info # noqa: E402 + +logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") +logger = logging.getLogger(__name__) + + +def create_complex_test_case( + batch_size: int | None = None, + seq_length: int | None = None, + field_num: int | None = None, + device: str = "cpu", +) -> tuple[TensorDict, float]: + """Create a complex test case with tensor and non-tensor fields. 
+ + Args: + batch_size: Batch size for the test case + seq_length: Sequence length for tensor fields + field_num: Number of fields to create + device: Device to create tensors on ("cpu", "npu", or "gpu") + + Returns: + Tuple of (TensorDict, total_size_gb) + """ + tensor_field_size_bytes = batch_size * seq_length * 4 + tensor_field_size_gb = tensor_field_size_bytes / (1024**3) + + num_tensor_fields = (field_num + 1) // 2 + num_nontensor_fields = field_num // 2 + + total_tensor_size_gb = tensor_field_size_gb * num_tensor_fields + total_nontensor_size_gb = (batch_size * 1024 / (1024**3)) * num_nontensor_fields + total_size_gb = total_tensor_size_gb + total_nontensor_size_gb + + logger.info(f"Total data size: {total_size_gb:.6f} GB") + + # Determine torch device + torch_device = None + if device == "npu": + torch_device = "npu:0" + elif device == "gpu": + torch_device = "cuda:0" + + fields = {} + for i in range(field_num): + field_name = f"field_{i}" + + if i % 2 == 0: + # Tensor field + tensor_data = torch.randn(batch_size, seq_length, dtype=torch.float32, device=torch_device) + fields[field_name] = tensor_data + else: + # NonTensorData field + str_length = 1024 + non_tensor_data = [ + "".join( + random.choices( + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789", + k=str_length, + ) + ) + for _ in range(batch_size) + ] + fields[field_name] = NonTensorData(data=non_tensor_data, batch_size=(batch_size,), device=None) + + batch_size_tuple = (batch_size,) + prompt_batch = TensorDict( + fields, + batch_size=batch_size_tuple, + ) + + return prompt_batch, total_size_gb + + +@ray.remote +class TQClientActor: + """Ray actor that holds a TransferQueueClient.""" + + def __init__(self, client_id: str, controller_info: Any): + self.client = TransferQueueClient( + client_id=client_id, + controller_info=controller_info, + ) + self.prompt_meta = None + self.test_data = None + self.total_data_size_gb = 0.0 + + def initialize_storage_manager(self, manager_type: str, 
config: dict[str, Any]) -> None: + """Initialize the storage manager with given config.""" + self.client.initialize_storage_manager(manager_type=manager_type, config=config) + + def create_complex_test_case( + self, + batch_size: int | None = None, + seq_length: int | None = None, + field_num: int | None = None, + device: str = "cpu", + ) -> tuple[list[str], float]: + """Create test case on the actor.""" + self.test_data, self.total_data_size_gb = create_complex_test_case(batch_size, seq_length, field_num, device) + return list(self.test_data.keys()), self.total_data_size_gb + + def put(self, partition_id: str) -> None: + """Put data to storage.""" + self.client.put(data=self.test_data, partition_id=partition_id) + + def get_meta( + self, + data_fields: list[str], + batch_size: int, + partition_id: str, + task_name: str | None = None, + sampling_config: dict[str, Any] | None = None, + ) -> Any: + """Get metadata from controller.""" + self.prompt_meta = self.client.get_meta( + data_fields=data_fields, + batch_size=batch_size, + partition_id=partition_id, + task_name=task_name, + sampling_config=sampling_config, + ) + return self.prompt_meta + + def get_data(self) -> None: + """Get data from storage using cached metadata.""" + self.client.get_data(self.prompt_meta) + + +class TQThroughputTester: + """Main throughput tester for TransferQueue backends.""" + + def __init__( + self, + backend: str, + client_placement: str, + backend_config: dict[str, Any], + device: str, + global_batch_size: int, + field_num: int, + seq_len: int, + num_global_batch: int, + head_node_ip: str, + worker_node_ip: str | None = None, + ): + """Initialize the throughput tester. 
+ + Args: + backend: Backend type ("default", "yuanrong", "mooncake") + client_placement: Client placement mode ("intra_node" or "inter_node") + backend_config: Backend configuration dictionary + device: Device type ("cpu", "npu", "gpu") + global_batch_size: Global batch size + field_num: Number of fields + seq_len: Sequence length + num_global_batch: Number of global batches + head_node_ip: Head node IP address + worker_node_ip: Worker node IP address (required for inter_node) + """ + self.backend = backend + self.client_placement = client_placement + self.backend_config = backend_config + self.device = device + self.global_batch_size = global_batch_size + self.field_num = field_num + self.seq_len = seq_len + self.num_global_batch = num_global_batch + self.head_node_ip = head_node_ip + self.worker_node_ip = worker_node_ip + + # Validate arguments + self._validate_args() + + # Determine manager type and prepare configs + self.manager_type = self._get_manager_type() + self.writer_config, self.reader_config = self._prepare_backend_configs() + + # Initialize the test infrastructure + self._initialize_data_system() + self._initialize_clients() + + def _validate_args(self) -> None: + """Validate input arguments.""" + if self.client_placement == "inter_node" and self.worker_node_ip is None: + raise ValueError("worker_node_ip is required for inter_node client placement") + if self.backend == "default": + storage_unit_placement = self.backend_config.get("storage_unit_placement", "normal") + if storage_unit_placement == "remote" and self.worker_node_ip is None: + raise ValueError("worker_node_ip is required for remote storage_unit_placement") + + def _get_manager_type(self) -> str: + """Get the storage manager type based on backend.""" + if self.backend == "default": + return "AsyncSimpleStorageManager" + elif self.backend == "yuanrong": + return "YuanrongStorageManager" + elif self.backend == "mooncake": + return "MooncakeStorageManager" + else: + raise 
ValueError(f"Unknown backend: {self.backend}") + + def _prepare_backend_configs(self) -> tuple[dict[str, Any], dict[str, Any]]: + """Prepare writer and reader backend configs. + + Returns: + Tuple of (writer_config, reader_config) + """ + # Set client_name based on backend + base_config = self.backend_config.copy() + if self.backend == "yuanrong": + base_config["client_name"] = "YuanrongStorageClient" + elif self.backend == "mooncake": + base_config["client_name"] = "MooncakeStoreClient" + + writer_config = base_config.copy() + reader_config = base_config.copy() + + if self.client_placement == "inter_node": + if self.backend == "yuanrong": + writer_config["host"] = self.head_node_ip + reader_config["host"] = self.worker_node_ip + elif self.backend == "mooncake": + writer_config["local_hostname"] = self.head_node_ip + reader_config["local_hostname"] = self.worker_node_ip + + return writer_config, reader_config + + def _initialize_data_system(self) -> None: + """Initialize controller and storage units if needed.""" + # Initialize controller + self.data_system_controller = TransferQueueController.remote() + logger.info("TransferQueueController has been created.") + self.data_system_controller_info = process_zmq_server_info(self.data_system_controller) + + # Initialize storage units for default backend + if self.backend == "default": + self._initialize_storage_units() + + def _initialize_storage_units(self) -> None: + """Initialize SimpleStorageUnits for default backend.""" + num_data_storage_units = self.backend_config.get("num_data_storage_units", 8) + storage_unit_placement = self.backend_config.get("storage_unit_placement", "normal") + total_storage_size = self.global_batch_size * self.num_global_batch + + self.data_system_storage_units = {} + + if storage_unit_placement == "remote": + # Remote mode: create all storage units on worker node + for storage_unit_rank in range(num_data_storage_units): + storage_node = SimpleStorageUnit.options( + num_cpus=10, + 
resources={f"node:{self.worker_node_ip}": 0.001}, + ).remote(storage_unit_size=3 * math.ceil(total_storage_size / num_data_storage_units)) + self.data_system_storage_units[storage_unit_rank] = storage_node + logger.info( + f"StorageUnit #0 ~ #{num_data_storage_units - 1} has been created on worker node {self.worker_node_ip}." + ) + else: + # Normal mode: create storage units using placement group + storage_placement_group = get_placement_group(num_data_storage_units, num_cpus_per_actor=1) + for storage_unit_rank in range(num_data_storage_units): + storage_node = SimpleStorageUnit.options( + placement_group=storage_placement_group, + placement_group_bundle_index=storage_unit_rank, + ).remote(storage_unit_size=3 * math.ceil(total_storage_size / num_data_storage_units)) + self.data_system_storage_units[storage_unit_rank] = storage_node + logger.info(f"StorageUnit #0 ~ #{num_data_storage_units - 1} has been created.") + + self.data_system_storage_unit_infos = process_zmq_server_info(self.data_system_storage_units) + # Add storage unit infos to backend configs + self.writer_config["zmq_info"] = self.data_system_storage_unit_infos + self.reader_config["zmq_info"] = self.data_system_storage_unit_infos + + def _initialize_clients(self) -> None: + """Initialize writer and reader TQClientActors.""" + # Determine node placement + if self.client_placement == "intra_node": + writer_node = reader_node = self.head_node_ip + else: + writer_node = self.head_node_ip + reader_node = self.worker_node_ip + + logger.info(f"Writer is on {writer_node}, Reader is on {reader_node}") + + # Prepare device resource + device_resource = {} + if self.device in ["npu", "gpu"]: + device_resource = {self.device: 1} + + # Create writer and reader actors + self.writer = TQClientActor.options( + resources={f"node:{writer_node}": 0.001, **device_resource}, + ).remote("writer", self.data_system_controller_info) + + self.reader = TQClientActor.options( + resources={f"node:{reader_node}": 0.001, 
**device_resource}, + ).remote("reader", self.data_system_controller_info) + + # Initialize storage managers + logger.info(f"Using {self.manager_type} as storage backend.") + + w = self.writer.initialize_storage_manager.remote(manager_type=self.manager_type, config=self.writer_config) + r = self.reader.initialize_storage_manager.remote(manager_type=self.manager_type, config=self.reader_config) + ray.get([w, r]) + + def run_throughput_test(self) -> None: + """Run the throughput test and print results.""" + logger.info("Creating large batch for throughput test...") + start_create_data = time.time() + data_fields, total_data_size_gb = ray.get( + self.writer.create_complex_test_case.remote( + batch_size=self.global_batch_size, + seq_length=self.seq_len, + field_num=self.field_num, + device=self.device, + ) + ) + end_create_data = time.time() + logger.info(f"Data creation time: {end_create_data - start_create_data:.8f}s") + + # PUT operation + logger.info("Starting PUT operation...") + start_put = time.time() + ray.get(self.writer.put.remote(partition_id="train_0")) + end_put = time.time() + put_time = end_put - start_put + put_throughput_gbps = (total_data_size_gb * 8) / put_time + put_throughput_gbs = total_data_size_gb / put_time + logger.info(f"put cost time: {put_time:.8f}s") + logger.info(f"PUT Throughput: {put_throughput_gbps:.8f} Gb/s ({put_throughput_gbs:.8f} GB/s)") + + time.sleep(2) + + # GET_META operation + logger.info("Starting GET_META operation...") + start_get_meta = time.time() + ray.wait( + [ + self.reader.get_meta.remote( + data_fields=list(data_fields), + batch_size=self.global_batch_size, + partition_id="train_0", + task_name="generate_sequences", + ) + ] + ) + end_get_meta = time.time() + logger.info(f"get_meta cost time: {end_get_meta - start_get_meta:.8f}s") + + time.sleep(2) + + # GET_DATA operation + logger.info("Starting GET_DATA operation...") + start_get_data = time.time() + ray.get(self.reader.get_data.remote()) + end_get_data = 
time.time() + get_time = end_get_data - start_get_data + get_throughput_gbps = (total_data_size_gb * 8) / get_time + get_throughput_gbs = total_data_size_gb / get_time + + logger.info(f"get_data cost time: {get_time:.8f}s") + logger.info(f"GET Throughput: {get_throughput_gbps:.8f} Gb/s ({get_throughput_gbs:.8f} GB/s)") + + # Print summary + total_throughput_gbps = (total_data_size_gb * 16) / (put_time + get_time) + total_throughput_gbs = (total_data_size_gb * 2) / (put_time + get_time) + + logger.info("=" * 60) + logger.info("THROUGHPUT TEST SUMMARY") + logger.info("=" * 60) + logger.info(f"Backend: {self.backend}") + logger.info(f"Client Placement: {self.client_placement}") + logger.info(f"Device: {self.device}") + logger.info(f"Total Data Size: {total_data_size_gb:.6f} GB") + logger.info(f"PUT Time: {put_time:.8f}s") + logger.info(f"GET Time: {get_time:.8f}s") + logger.info(f"PUT Throughput: {put_throughput_gbps:.8f} Gb/s ({put_throughput_gbs:.8f} GB/s)") + logger.info(f"GET Throughput: {get_throughput_gbps:.8f} Gb/s ({get_throughput_gbs:.8f} GB/s)") + logger.info(f"Total Throughput: {total_throughput_gbps:.8f} Gb/s ({total_throughput_gbs:.8f} GB/s)") + logger.info("=" * 60) + + +def load_backend_config(config_path: str | None, backend: str) -> dict[str, Any]: + """Load backend config from YAML file or use defaults. 
+ + Args: + config_path: Path to YAML config file (optional) + backend: Backend type for default config + + Returns: + Backend configuration dictionary + """ + if config_path is not None: + config = OmegaConf.load(config_path) + return OmegaConf.to_container(config, resolve=True) + + # Default configs + if backend == "default": + return {"num_data_storage_units": 8, "storage_unit_placement": "normal"} + elif backend == "yuanrong": + return { + "host": "127.0.0.1", + "port": 31501, + "enable_yr_npu_transport": False, + } + elif backend == "mooncake": + return { + "local_hostname": "127.0.0.1", + "metadata_server": "127.0.0.1:8080", + "master_server_address": "127.0.0.1:8081", + } + else: + return {} + + +def main() -> None: + """Main entry point for the perftest script.""" + parser = argparse.ArgumentParser(description="TransferQueue Throughput Test") + parser.add_argument( + "--backend", + type=str, + default="default", + choices=["default", "yuanrong", "mooncake"], + help="Backend type to test (default: default)", + ) + parser.add_argument( + "--client_placement", + type=str, + default="intra_node", + choices=["intra_node", "inter_node"], + help="Client placement mode (default: intra_node)", + ) + parser.add_argument( + "--backend_config", + type=str, + default=None, + help="Path to backend config YAML file (optional)", + ) + parser.add_argument( + "--device", + type=str, + default="cpu", + choices=["cpu", "npu", "gpu"], + help="Device to use (default: cpu)", + ) + parser.add_argument( + "--global_batch_size", + type=int, + default=1024, + help="Global batch size (default: 1024)", + ) + parser.add_argument( + "--field_num", + type=int, + default=10, + help="Number of fields (default: 10)", + ) + parser.add_argument( + "--seq_len", + type=int, + default=8192, + help="Sequence length (default: 8192)", + ) + parser.add_argument( + "--num_global_batch", + type=int, + default=1, + help="Number of global batches (default: 1)", + ) + parser.add_argument( + 
"--head_node_ip", + type=str, + required=True, + help="Head node IP address", + ) + parser.add_argument( + "--worker_node_ip", + type=str, + default=None, + help="Worker node IP address (required for inter_node)", + ) + parser.add_argument( + "--ray_address", + type=str, + default="auto", + help="Ray cluster address (default: auto)", + ) + + args = parser.parse_args() + + # Load backend config + backend_config = load_backend_config(args.backend_config, args.backend) + + # Initialize Ray + logger.info(f"Connecting to Ray cluster at {args.ray_address}") + ray.init(address=args.ray_address) + + # Create and run tester + tester = TQThroughputTester( + backend=args.backend, + client_placement=args.client_placement, + backend_config=backend_config, + device=args.device, + global_batch_size=args.global_batch_size, + field_num=args.field_num, + seq_len=args.seq_len, + num_global_batch=args.num_global_batch, + head_node_ip=args.head_node_ip, + worker_node_ip=args.worker_node_ip, + ) + + # Run test multiple times for consistent results + print("-" * 60) + tester.run_throughput_test() + print("-" * 60) + tester.run_throughput_test() + print("-" * 60) + tester.run_throughput_test() + + logger.info("Throughput test completed successfully!") + + +if __name__ == "__main__": + main() From bf8478a499402d9c25bee135667420d5d2412595 Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Fri, 20 Mar 2026 10:35:13 +0800 Subject: [PATCH 02/29] fixed review comments Signed-off-by: tianyi-ge --- scripts/README_PERFTEST.md | 5 +++++ scripts/perftest.py | 28 +++++++++++++++++----------- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/scripts/README_PERFTEST.md b/scripts/README_PERFTEST.md index b01f37f..887f723 100644 --- a/scripts/README_PERFTEST.md +++ b/scripts/README_PERFTEST.md @@ -71,6 +71,11 @@ Sample config files are in `configs/`: master_server_address: 127.0.0.1:8081 ``` +For device support of each backend, +- `default` backend supports `cpu` +- `yuanrong` supports `cpu` 
and `npu` +- `mooncake` supports `cpu` and `gpu` + ## Examples ### Intra-node test with default backend diff --git a/scripts/perftest.py b/scripts/perftest.py index ca10775..ff04ce1 100644 --- a/scripts/perftest.py +++ b/scripts/perftest.py @@ -318,19 +318,25 @@ def _initialize_clients(self) -> None: logger.info(f"Writer is on {writer_node}, Reader is on {reader_node}") - # Prepare device resource - device_resource = {} - if self.device in ["npu", "gpu"]: - device_resource = {self.device: 1} + # Prepare base options + writer_options = { + "resources": {f"node:{writer_node}": 0.001}, + } + reader_options = { + "resources": {f"node:{reader_node}": 0.001}, + } - # Create writer and reader actors - self.writer = TQClientActor.options( - resources={f"node:{writer_node}": 0.001, **device_resource}, - ).remote("writer", self.data_system_controller_info) + # Add device-specific options + if self.device == "gpu": + writer_options["num_gpus"] = 1 + reader_options["num_gpus"] = 1 + elif self.device == "npu": + writer_options["resources"]["NPU"] = 1 + reader_options["resources"]["NPU"] = 1 - self.reader = TQClientActor.options( - resources={f"node:{reader_node}": 0.001, **device_resource}, - ).remote("reader", self.data_system_controller_info) + # Create writer and reader actors + self.writer = TQClientActor.options(**writer_options).remote("writer", self.data_system_controller_info) + self.reader = TQClientActor.options(**reader_options).remote("reader", self.data_system_controller_info) # Initialize storage managers logger.info(f"Using {self.manager_type} as storage backend.") From 06ffebb32fb7b6ca94d290b3538ab46780041953 Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Fri, 20 Mar 2026 17:08:02 +0800 Subject: [PATCH 03/29] 1. adjust default storage unit number to 1 2. 
remove old perf test script Signed-off-by: tianyi-ge --- scripts/configs/transferqueue.yaml | 2 +- scripts/performance_test.py | 350 ----------------------------- scripts/perftest.py | 4 +- 3 files changed, 3 insertions(+), 353 deletions(-) delete mode 100644 scripts/performance_test.py diff --git a/scripts/configs/transferqueue.yaml b/scripts/configs/transferqueue.yaml index 9f55742..3e5acd8 100644 --- a/scripts/configs/transferqueue.yaml +++ b/scripts/configs/transferqueue.yaml @@ -1,5 +1,5 @@ # TransferQueue (default) backend configuration -num_data_storage_units: 8 +num_data_storage_units: 1 # storage_unit_placement: "normal" (default) or "remote" # - normal: create storage units on current node using placement group # - remote: create all storage units on WORKER_NODE_IP diff --git a/scripts/performance_test.py b/scripts/performance_test.py deleted file mode 100644 index 14d06a4..0000000 --- a/scripts/performance_test.py +++ /dev/null @@ -1,350 +0,0 @@ -# Copyright 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2025 The TransferQueue Team -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import asyncio -import logging -import math -import random -import sys -import time -from pathlib import Path - -import ray -import torch -from omegaconf import OmegaConf -from tensordict import TensorDict -from tensordict.tensorclass import NonTensorData - -parent_dir = Path(__file__).resolve().parent.parent.parent -sys.path.append(str(parent_dir)) - -from transfer_queue.client import TransferQueueClient # noqa: E402 -from transfer_queue.controller import TransferQueueController # noqa: E402 -from transfer_queue.storage.simple_backend import SimpleStorageUnit # noqa: E402 -from transfer_queue.utils.common import get_placement_group # noqa: E402 -from transfer_queue.utils.zmq_utils import process_zmq_server_info # noqa: E402 - -logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") -logger = logging.getLogger(__name__) - -######################################################################## -# Please set up Ray cluster before running this script -######################################################################## -HEAD_NODE_IP = "NodeA" # Replace with your head node IP -WORKER_NODE_IP = "NodeB" # Replace with your worker node IP - - -# This is the Medium setting of the performance test. 
-# You can modify the parameters according to -# https://www.yuque.com/haomingzi-lfse7/lhp4el/tml8ke0zkgn6roey?singleDoc# -config_str = """ - global_batch_size: 1024 - seq_length: 8192 - field_num: 10 - num_global_batch: 1 - num_data_storage_units: 8 -""" -dict_conf = OmegaConf.create(config_str) - - -def create_complex_test_case(batch_size=None, seq_length=None, field_num=None): - tensor_field_size_bytes = batch_size * seq_length * 4 - tensor_field_size_gb = tensor_field_size_bytes / (1024**3) - - num_tensor_fields = (field_num + 1) // 2 - num_nontensor_fields = field_num // 2 - - total_tensor_size_gb = tensor_field_size_gb * num_tensor_fields - total_nontensor_size_gb = (batch_size * 1024 / (1024**3)) * num_nontensor_fields - total_size_gb = total_tensor_size_gb + total_nontensor_size_gb - - logger.info(f"Total data size: {total_size_gb:.6f} GB") - - fields = {} - - for i in range(field_num): - field_name = f"field_{i}" - - if i % 2 == 0: - # Tensor - tensor_data = torch.randn(batch_size, seq_length, dtype=torch.float32) - fields[field_name] = tensor_data - else: - # NonTensorData - str_length = 1024 - non_tensor_data = [ - "".join(random.choices("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789", k=str_length)) - for _ in range(batch_size) - ] - fields[field_name] = NonTensorData(data=non_tensor_data, batch_size=(batch_size,), device=None) - - batch_size_tuple = (batch_size,) - prompt_batch = TensorDict( - fields, - batch_size=batch_size_tuple, - device=None, - ) - - return prompt_batch, total_size_gb - - -@ray.remote -class RemoteDataStoreObjStore: - def __init__(self): - pass - - def get_data(self, data_handler): - start_get = time.time() - ray.get(data_handler) - end_get = time.time() - - get_time = end_get - start_get - return get_time - - -@ray.remote -class RemoteDataStoreRemote: - def __init__(self): - self.stored_data = None - - def put_data(self, data): - self.stored_data = data - - def get_data(self): - return self.stored_data - - def 
clear_data(self): - self.stored_data = None - - -class RayBandwidthTester: - def __init__(self, config, test_mode="obj_store"): - self.config = config - self.test_mode = test_mode - - if test_mode == "obj_store": - RemoteDataStore = RemoteDataStoreObjStore - else: - RemoteDataStore = RemoteDataStoreRemote - - self.remote_store = RemoteDataStore.options(num_cpus=10, resources={f"node:{WORKER_NODE_IP}": 0.001}).remote() - - logger.info(f"Remote data store created on worker node {WORKER_NODE_IP}") - - def run_bandwidth_test(self): - start_create_data = time.time() - test_data, total_data_size_gb = create_complex_test_case( - batch_size=self.config.global_batch_size, seq_length=self.config.seq_length, field_num=self.config.field_num - ) - end_create_data = time.time() - logger.info(f"Data creation time: {end_create_data - start_create_data:.8f}s") - - if self.test_mode == "obj_store": - self._run_obj_store_test(test_data, total_data_size_gb) - else: - self._run_remote_test(test_data, total_data_size_gb) - - def _run_obj_store_test(self, test_data, total_data_size_gb): - start_time = time.time() - data_handler = ray.put(test_data) - ray.get(self.remote_store.get_data.remote([data_handler])) - end_time = time.time() - - transfer_time = end_time - start_time - throughput = (total_data_size_gb * 8) / transfer_time - - logger.info("=" * 60) - logger.info("RAY OBJECT STORE BANDWIDTH TEST SUMMARY") - logger.info("=" * 60) - logger.info(f"Data Size: {(total_data_size_gb):.6f} GB") - logger.info(f"Transfer Time: {transfer_time:.8f}s") - logger.info(f"Throughput: {throughput:.8f} Gb/s") - - def _run_remote_test(self, test_data, total_data_size_gb): - logger.info("Starting Ray PUT bandwidth test...") - start_put = time.time() - ray.get(self.remote_store.put_data.remote(test_data)) - end_put = time.time() - put_time = end_put - start_put - logger.info(f"PUT Time: {put_time:.8f}s") - - time.sleep(2) - - logger.info("Starting Ray GET bandwidth test...") - start_get = time.time() - 
ray.get(self.remote_store.get_data.remote()) - end_get = time.time() - get_time = end_get - start_get - logger.info(f"GET Time: {get_time:.8f}s") - - ray.get(self.remote_store.clear_data.remote()) - - put_throughput = (total_data_size_gb * 8) / put_time - get_throughput = (total_data_size_gb * 8) / get_time - - logger.info("=" * 60) - logger.info("RAY REMOTE ACTOR BANDWIDTH TEST SUMMARY") - logger.info("=" * 60) - logger.info(f"Data Size: {total_data_size_gb:.6f} GB") - logger.info(f"PUT Time: {put_time:.8f}s") - logger.info(f"GET Time: {get_time:.8f}s") - logger.info(f"PUT Throughput (Head->Worker): {put_throughput:.8f} Gb/s") - logger.info(f"GET Throughput (Worker->Head): {get_throughput:.8f} Gb/s") - logger.info(f"Round-trip Average Throughput: {total_data_size_gb * 16 / (put_time + get_time):.8f} Gb/s") - - -class TQBandwidthTester: - def __init__(self, config, remote_mode=False): - self.config = config - self.remote_mode = remote_mode - self.data_system_client = self._initialize_data_system() - - def _initialize_data_system(self): - total_storage_size = self.config.global_batch_size * self.config.num_global_batch - self.data_system_storage_units = {} - - if self.remote_mode: - for storage_unit_rank in range(self.config.num_data_storage_units): - storage_node = SimpleStorageUnit.options( - num_cpus=10, - resources={f"node:{WORKER_NODE_IP}": 0.001}, - ).remote(storage_unit_size=math.ceil(total_storage_size / self.config.num_data_storage_units)) - self.data_system_storage_units[storage_unit_rank] = storage_node - else: - storage_placement_group = get_placement_group(self.config.num_data_storage_units, num_cpus_per_actor=10) - for storage_unit_rank in range(self.config.num_data_storage_units): - storage_node = SimpleStorageUnit.options( - placement_group=storage_placement_group, - placement_group_bundle_index=storage_unit_rank, - ).remote(storage_unit_size=math.ceil(total_storage_size / self.config.num_data_storage_units)) - 
self.data_system_storage_units[storage_unit_rank] = storage_node - - logger.info(f"TransferQueueStorageSimpleUnit #0 ~ #{storage_unit_rank} has been created.") - - self.data_system_controller = TransferQueueController.remote() - logger.info("TransferQueueController has been created.") - - self.data_system_controller_info = process_zmq_server_info(self.data_system_controller) - self.data_system_storage_unit_infos = process_zmq_server_info(self.data_system_storage_units) - - tq_config = OmegaConf.create({}, flags={"allow_objects": True}) - tq_config.controller_info = self.data_system_controller_info - tq_config.storage_unit_infos = self.data_system_storage_unit_infos - self.config = OmegaConf.merge(tq_config, self.config) - - self.data_system_client = TransferQueueClient( - client_id="Trainer", controller_info=self.data_system_controller_info - ) - self.data_system_client.initialize_storage_manager(manager_type="AsyncSimpleStorageManager", config=self.config) - return self.data_system_client - - def run_bandwidth_test(self): - logger.info("Creating large batch for bandwidth test...") - start_create_data = time.time() - big_input_ids, total_data_size_gb = create_complex_test_case( - batch_size=self.config.global_batch_size, seq_length=self.config.seq_length, field_num=self.config.field_num - ) - end_create_data = time.time() - logger.info(f"Data creation time: {end_create_data - start_create_data:.8f}s") - - logger.info("Starting PUT operation...") - start_async_put = time.time() - asyncio.run(self.data_system_client.async_put(data=big_input_ids, partition_id="train_0")) - end_async_put = time.time() - put_time = end_async_put - start_async_put - - put_throughput_gbps = (total_data_size_gb * 8) / put_time - logger.info(f"async_put cost time: {put_time:.8f}s") - logger.info(f"PUT Throughput: {put_throughput_gbps:.8f} Gb/s") - - time.sleep(2) - - logger.info("Starting GET_META operation...") - start_async_get_meta = time.time() - prompt_meta = asyncio.run( - 
self.data_system_client.async_get_meta( - data_fields=list(big_input_ids.keys()), - batch_size=big_input_ids.size(0), - partition_id="train_0", - task_name="generate_sequences", - ) - ) - end_async_get_meta = time.time() - logger.info(f"async_get_meta cost time: {end_async_get_meta - start_async_get_meta:.8f}s") - - time.sleep(2) - - logger.info("Starting GET_DATA operation...") - start_async_get_data = time.time() - asyncio.run(self.data_system_client.async_get_data(prompt_meta)) - end_async_get_data = time.time() - get_time = end_async_get_data - start_async_get_data - get_throughput_gbps = (total_data_size_gb * 8) / get_time - - logger.info(f"async_get_data cost time: {get_time:.8f}s") - logger.info(f"GET Throughput: {get_throughput_gbps:.8f} Gb/s") - - mode_name = "TQ REMOTE" if self.remote_mode else "TQ NORMAL" - logger.info("=" * 60) - logger.info(f"{mode_name} BANDWIDTH TEST SUMMARY") - logger.info("=" * 60) - logger.info(f"Total Data Size: {total_data_size_gb:.6f} GB") - logger.info(f"PUT Time: {put_time:.8f}s") - logger.info(f"GET Time: {get_time:.8f}s") - logger.info(f"PUT Throughput: {put_throughput_gbps:.8f} Gb/s") - logger.info(f"GET Throughput: {get_throughput_gbps:.8f} Gb/s") - logger.info(f"Network Round-trip Throughput: {(total_data_size_gb * 16) / (put_time + get_time):.8f} Gb/s") - - -def main(): - if len(sys.argv) < 2: - print("Usage: python performance_test.py ") - print("Available test modes:") - print(" ray-obj-store - Ray Object Store bandwidth test") - print(" ray-remote - Ray Remote Actor bandwidth test") - print(" tq-normal - TQ Normal mode bandwidth test") - print(" tq-remote - TQ Remote mode bandwidth test") - return - - test_mode = sys.argv[1] - - if test_mode == "ray-obj-store": - logger.info("Starting Ray Object Store bandwidth test") - tester = RayBandwidthTester(config=dict_conf, test_mode="obj_store") - tester.run_bandwidth_test() - logger.info("Ray Object Store bandwidth test completed successfully!") - - elif test_mode == 
"ray-remote": - logger.info("Starting Ray Remote Actor bandwidth test") - tester = RayBandwidthTester(config=dict_conf, test_mode="remote") - tester.run_bandwidth_test() - logger.info("Ray Remote Actor bandwidth test completed successfully!") - - elif test_mode in ["tq-normal", "tq-remote"]: - remote_mode = test_mode == "tq-remote" - mode_name = "TQ Remote" if remote_mode else "TQ Normal" - logger.info(f"Starting {mode_name} bandwidth test") - - tester = TQBandwidthTester(config=dict_conf, remote_mode=remote_mode) - tester.run_bandwidth_test() - logger.info(f"{mode_name} bandwidth test completed successfully!") - - else: - print(f"Unknown test mode: {test_mode}") - print("Available test modes: ray-obj-store, ray-remote, tq-normal, tq-remote") - - -if __name__ == "__main__": - main() diff --git a/scripts/perftest.py b/scripts/perftest.py index ff04ce1..db7d9bf 100644 --- a/scripts/perftest.py +++ b/scripts/perftest.py @@ -284,7 +284,7 @@ def _initialize_storage_units(self) -> None: # Remote mode: create all storage units on worker node for storage_unit_rank in range(num_data_storage_units): storage_node = SimpleStorageUnit.options( - num_cpus=10, + num_cpus=1, resources={f"node:{self.worker_node_ip}": 0.001}, ).remote(storage_unit_size=3 * math.ceil(total_storage_size / num_data_storage_units)) self.data_system_storage_units[storage_unit_rank] = storage_node @@ -438,7 +438,7 @@ def load_backend_config(config_path: str | None, backend: str) -> dict[str, Any] # Default configs if backend == "default": - return {"num_data_storage_units": 8, "storage_unit_placement": "normal"} + return {"num_data_storage_units": 1, "storage_unit_placement": "normal"} elif backend == "yuanrong": return { "host": "127.0.0.1", From 83c9fb01e92e7b9e3fc677547333902dfbd692c4 Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Mon, 23 Mar 2026 15:08:47 +0800 Subject: [PATCH 04/29] 1. the current `backend` arg is "default", "yuanrong", and "mooncake". 
use "SimpleStorage", "Yuanrong", "MooncakeStore" instead 2. do not support storage_unit_placement for simple storage any more 3. output test results to csv 4. move client_placement to yuanrong-specific config yaml. if backend is not yuanrong, can only be intra_node. if backend is yuanrong, inter node by default 5. remove --client_placement arg. put it into yuanrong.yaml 6. use tq.init(config) to initialize TransferQueueClient in TQClientActor 7. remove the non-tensor part when create_complex_test_case. only create tensors in tensordict. rename create_complex_test_case to create_test_case 8. put perftest.py, .md, configs/ under scripts/performance_test. 9. add default simple storage test to a new github workflow action 10. extract the test times 3 in _initialize_storage_units as a constant, and run run_throughput_test in a for loop 11. for TQClientActor, put, get_meta, and get_data, refactor them with kv interface Signed-off-by: tianyi-ge --- .github/workflows/perftest.yml | 58 ++ scripts/configs/transferqueue.yaml | 6 - .../{ => performance_test}/README_PERFTEST.md | 70 ++- .../configs/mooncake_store.yaml} | 0 .../configs/simple_storage.yaml | 2 + .../configs/yuanrong.yaml | 1 + scripts/performance_test/perftest.py | 563 +++++++++++++++++ scripts/perftest.py | 566 ------------------ 8 files changed, 670 insertions(+), 596 deletions(-) create mode 100644 .github/workflows/perftest.yml delete mode 100644 scripts/configs/transferqueue.yaml rename scripts/{ => performance_test}/README_PERFTEST.md (52%) rename scripts/{configs/mooncake.yaml => performance_test/configs/mooncake_store.yaml} (100%) create mode 100644 scripts/performance_test/configs/simple_storage.yaml rename scripts/{ => performance_test}/configs/yuanrong.yaml (76%) create mode 100644 scripts/performance_test/perftest.py delete mode 100644 scripts/perftest.py diff --git a/.github/workflows/perftest.yml b/.github/workflows/perftest.yml new file mode 100644 index 0000000..9356c02 --- /dev/null +++ 
b/.github/workflows/perftest.yml @@ -0,0 +1,58 @@ +# This workflow runs the SimpleStorage performance test +name: Performance Test + +on: + push: + branches: + - main + - v0.* + pull_request: + branches: + - main + - v0.* + +jobs: + perftest: + runs-on: ubuntu-latest + timeout-minutes: 30 + strategy: + fail-fast: false + matrix: + python-version: ["3.11"] + + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu + pip install -e ".[test,build,yuanrong]" + - name: Start Ray cluster + run: | + # Get the host IP + HOST_IP=$(hostname -I | awk '{print $1}') + echo "Host IP: $HOST_IP" + # Start Ray with node resource + ray start --head --resources="{\"node:$HOST_IP\":1}" + - name: Run SimpleStorage performance test + run: | + # Get the host IP + HOST_IP=$(hostname -I | awk '{print $1}') + echo "Host IP: $HOST_IP" + # Run the perftest with small batch size for quick test + cd scripts/performance_test + python perftest.py \ + --backend=SimpleStorage \ + --device=cpu \ + --global_batch_size=128 \ + --field_num=4 \ + --seq_len=1024 \ + --head_node_ip=$HOST_IP + - name: Stop Ray cluster + run: | + ray stop + if: always() diff --git a/scripts/configs/transferqueue.yaml b/scripts/configs/transferqueue.yaml deleted file mode 100644 index 3e5acd8..0000000 --- a/scripts/configs/transferqueue.yaml +++ /dev/null @@ -1,6 +0,0 @@ -# TransferQueue (default) backend configuration -num_data_storage_units: 1 -# storage_unit_placement: "normal" (default) or "remote" -# - normal: create storage units on current node using placement group -# - remote: create all storage units on WORKER_NODE_IP -storage_unit_placement: 
normal diff --git a/scripts/README_PERFTEST.md b/scripts/performance_test/README_PERFTEST.md similarity index 52% rename from scripts/README_PERFTEST.md rename to scripts/performance_test/README_PERFTEST.md index 887f723..0a82ce6 100644 --- a/scripts/README_PERFTEST.md +++ b/scripts/performance_test/README_PERFTEST.md @@ -13,14 +13,13 @@ This script runs throughput tests for TransferQueue with different backends. ray start --address=192.168.0.1 --resources='{"node:192.168.0.2":1}' ``` -2. Start the backend service (yuanrong, mooncake, etc.) if testing non-default backends. +2. Start the backend service (Yuanrong, MooncakeStore, etc.) if testing non-SimpleStorage backends. ## Usage ```bash python perftest.py \ - --backend=[default|yuanrong|mooncake] \ - --client_placement=[intra_node|inter_node] \ + --backend=[SimpleStorage|Yuanrong|MooncakeStore] \ --backend_config=xxx.yaml \ --device=[cpu|npu|gpu] \ --global_batch_size=1024 \ @@ -35,8 +34,7 @@ python perftest.py \ | Argument | Description | Default | |----------|-------------|---------| -| `--backend` | Backend type: default, yuanrong, mooncake | default | -| `--client_placement` | Client placement: intra_node or inter_node | intra_node | +| `--backend` | Backend type: SimpleStorage, Yuanrong, MooncakeStore | SimpleStorage | | `--backend_config` | Path to YAML config file (optional) | None | | `--device` | Device: cpu, npu, gpu | cpu | | `--global_batch_size` | Global batch size | 1024 | @@ -44,17 +42,17 @@ python perftest.py \ | `--seq_len` | Sequence length | 8192 | | `--num_global_batch` | Number of global batches | 1 | | `--head_node_ip` | Head node IP (required) | - | -| `--worker_node_ip` | Worker node IP (required for inter_node) | None | +| `--worker_node_ip` | Worker node IP (required for Yuanrong inter_node) | None | | `--ray_address` | Ray cluster address | auto | +| `--output_csv` | Path to output CSV file (optional) | None | ## Backend Configuration Sample config files are in `configs/`: -- 
**transferqueue.yaml**: Default backend config +- **simple_storage.yaml**: SimpleStorage backend config ```yaml - num_data_storage_units: 8 - storage_unit_placement: normal # or "remote" + num_data_storage_units: 1 ``` - **yuanrong.yaml**: Yuanrong backend config @@ -62,9 +60,10 @@ Sample config files are in `configs/`: host: 127.0.0.1 port: 31501 enable_yr_npu_transport: false + client_placement: inter_node # or "intra_node" ``` -- **mooncake.yaml**: Mooncake backend config +- **mooncake_store.yaml**: MooncakeStore backend config ```yaml local_hostname: 127.0.0.1 metadata_server: 127.0.0.1:8080 @@ -72,36 +71,41 @@ Sample config files are in `configs/`: ``` For device support of each backend, -- `default` backend supports `cpu` -- `yuanrong` supports `cpu` and `npu` -- `mooncake` supports `cpu` and `gpu` +- `SimpleStorage` backend supports `cpu` +- `Yuanrong` supports `cpu` and `npu` +- `MooncakeStore` supports `cpu` and `gpu` + +## Yuanrong Client Placement + +For Yuanrong backend, since `put` is always local-first, we need to start client actors on different nodes to test cross-node transfer. 
The client placement is configured in the YAML file: +- `client_placement: intra_node`: Both writer and reader run on head node +- `client_placement: inter_node`: Writer runs on head node, reader runs on worker node (default) ## Examples -### Intra-node test with default backend +### SimpleStorage backend ```bash -python perftest.py --backend=default --client_placement=intra_node \ +python perftest.py --backend=SimpleStorage \ --head_node_ip=192.168.0.1 ``` -### Inter-node test with yuanrong backend +### Yuanrong backend ```bash -python perftest.py --backend=yuanrong --client_placement=inter_node \ +python perftest.py --backend=Yuanrong \ --backend_config=configs/yuanrong.yaml \ --head_node_ip=192.168.0.1 --worker_node_ip=192.168.0.2 ``` -### Default backend with remote storage units +### NPU device test ```bash -python perftest.py --backend=default --client_placement=intra_node \ - --backend_config=configs/transferqueue.yaml \ - --head_node_ip=192.168.0.1 --worker_node_ip=192.168.0.2 +python perftest.py --backend=Yuanrong --device=npu \ + --head_node_ip=192.168.0.1 ``` -### NPU device test +### Output to CSV ```bash -python perftest.py --backend=mooncake --device=npu \ - --head_node_ip=192.168.0.1 +python perftest.py --backend=SimpleStorage \ + --head_node_ip=192.168.0.1 --output_csv=results.csv ``` ## Output @@ -113,3 +117,21 @@ The test prints: - Total round-trip throughput Throughput is shown in both Gb/s (gigabits per second) and GB/s (gigabytes per second). + +### CSV Output + +When using `--output_csv`, the test writes results to a CSV file with the following columns: +- backend +- client_placement +- device +- total_data_size_gb +- put_time +- get_time +- put_gbit_per_sec +- put_gbyte_per_sec +- get_gbit_per_sec +- get_gbyte_per_sec +- total_gbit_per_sec +- total_gbyte_per_sec + +The test runs 3 iterations and saves all 3 results to the CSV. 
diff --git a/scripts/configs/mooncake.yaml b/scripts/performance_test/configs/mooncake_store.yaml similarity index 100% rename from scripts/configs/mooncake.yaml rename to scripts/performance_test/configs/mooncake_store.yaml diff --git a/scripts/performance_test/configs/simple_storage.yaml b/scripts/performance_test/configs/simple_storage.yaml new file mode 100644 index 0000000..2eb397c --- /dev/null +++ b/scripts/performance_test/configs/simple_storage.yaml @@ -0,0 +1,2 @@ +# TransferQueue (default) backend configuration +num_data_storage_units: 1 diff --git a/scripts/configs/yuanrong.yaml b/scripts/performance_test/configs/yuanrong.yaml similarity index 76% rename from scripts/configs/yuanrong.yaml rename to scripts/performance_test/configs/yuanrong.yaml index 2df1b84..b4c52e5 100644 --- a/scripts/configs/yuanrong.yaml +++ b/scripts/performance_test/configs/yuanrong.yaml @@ -2,3 +2,4 @@ host: 127.0.0.1 port: 31501 enable_yr_npu_transport: false +client_placement: inter_node diff --git a/scripts/performance_test/perftest.py b/scripts/performance_test/perftest.py new file mode 100644 index 0000000..3b7edb3 --- /dev/null +++ b/scripts/performance_test/perftest.py @@ -0,0 +1,563 @@ +#!/usr/bin/env python3 +# Copyright 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2025 The TransferQueue Team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import csv +import logging +import math +import sys +import time +from pathlib import Path +from typing import Any + +import ray +import torch +from omegaconf import OmegaConf +from tensordict import TensorDict + +parent_dir = Path(__file__).resolve().parent.parent.parent +sys.path.append(str(parent_dir)) + +import transfer_queue as tq # noqa: E402 +from transfer_queue.storage.simple_backend import SimpleStorageUnit # noqa: E402 +from transfer_queue.utils.common import get_placement_group # noqa: E402 +from transfer_queue.utils.zmq_utils import process_zmq_server_info # noqa: E402 + +logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") +logger = logging.getLogger(__name__) + +# Constants +NUM_TEST_ITERATIONS = 3 + + +def create_test_case( + batch_size: int | None = None, + seq_length: int | None = None, + field_num: int | None = None, + device: str = "cpu", +) -> tuple[TensorDict, float]: + """Create a test case with tensor fields only. 
+ + Args: + batch_size: Batch size for the test case + seq_length: Sequence length for tensor fields + field_num: Number of fields to create + device: Device to create tensors on ("cpu", "npu", or "gpu") + + Returns: + Tuple of (TensorDict, total_size_gb) + """ + tensor_field_size_bytes = batch_size * seq_length * 4 + tensor_field_size_gb = tensor_field_size_bytes / (1024**3) + + total_size_gb = tensor_field_size_gb * field_num + + logger.info(f"Total data size: {total_size_gb:.6f} GB") + + # Determine torch device + torch_device = None + if device == "npu": + torch_device = "npu:0" + elif device == "gpu": + torch_device = "cuda:0" + + fields = {} + for i in range(field_num): + field_name = f"field_{i}" + tensor_data = torch.randn(batch_size, seq_length, dtype=torch.float32, device=torch_device) + fields[field_name] = tensor_data + + batch_size_tuple = (batch_size,) + prompt_batch = TensorDict( + fields, + batch_size=batch_size_tuple, + ) + + return prompt_batch, total_size_gb + + +@ray.remote +class TQClientActor: + """Ray actor that uses tq.init(config) to initialize.""" + + def __init__(self, base_config: dict[str, Any]): + self.base_config = base_config + self.test_data = None + self.total_data_size_gb = 0.0 + self.test_keys = None + + def initialize(self, zmq_info: Any = None) -> None: + """Initialize transfer_queue with the config.""" + config = OmegaConf.create(self.base_config, flags={"allow_objects": True}) + if zmq_info is not None and self.base_config["backend"]["storage_backend"] == "SimpleStorage": + # Use dict-style assignment to avoid OmegaConf validation + config["backend"]["SimpleStorage"]["zmq_info"] = zmq_info + tq.init(config) + + def create_test_case( + self, + batch_size: int | None = None, + seq_length: int | None = None, + field_num: int | None = None, + device: str = "cpu", + ) -> tuple[list[str], float]: + """Create test case on the actor.""" + self.test_data, self.total_data_size_gb = create_test_case(batch_size, seq_length, field_num, 
device) + # Create keys for each sample in the batch + self.test_keys = [f"test_key_{i}" for i in range(batch_size)] + return list(self.test_data.keys()), self.total_data_size_gb + + def put(self, partition_id: str) -> None: + """Put data to storage using kv_batch_put.""" + tq.kv_batch_put(keys=self.test_keys, partition_id=partition_id, fields=self.test_data) + + def list_keys(self, partition_id: str) -> list[str]: + """List keys in a partition using kv_list.""" + partition_info = tq.kv_list(partition_id=partition_id) + if partition_id in partition_info: + return list(partition_info[partition_id].keys()) + return [] + + def get_data(self, partition_id: str, keys: list[str] | None = None) -> None: + """Get data from storage using kv_batch_get.""" + if keys is None: + keys = self.test_keys + tq.kv_batch_get(keys=keys, partition_id=partition_id) + + +class TQThroughputTester: + """Main throughput tester for TransferQueue backends.""" + + def __init__( + self, + backend: str, + backend_config: dict[str, Any], + device: str, + global_batch_size: int, + field_num: int, + seq_len: int, + num_global_batch: int, + head_node_ip: str, + worker_node_ip: str | None = None, + output_csv: str | None = None, + ): + """Initialize the throughput tester. 
+ + Args: + backend: Backend type ("SimpleStorage", "Yuanrong", "MooncakeStore") + backend_config: Backend configuration dictionary + device: Device type ("cpu", "npu", "gpu") + global_batch_size: Global batch size + field_num: Number of fields + seq_len: Sequence length + num_global_batch: Number of global batches + head_node_ip: Head node IP address + worker_node_ip: Worker node IP address (required for Yuanrong inter_node) + output_csv: Path to output CSV file (optional) + """ + self.backend = backend + self.backend_config = backend_config + self.device = device + self.global_batch_size = global_batch_size + self.field_num = field_num + self.seq_len = seq_len + self.num_global_batch = num_global_batch + self.head_node_ip = head_node_ip + self.worker_node_ip = worker_node_ip + self.output_csv = output_csv + + # Get client_placement from Yuanrong config, default to inter_node + self.client_placement = ( + self.backend_config.get("client_placement", "inter_node") if self.backend == "Yuanrong" else "intra_node" + ) + + # Validate arguments + self._validate_args() + + # Prepare full config for tq.init() + self.base_config, self.zmq_info = self._prepare_configs() + + # Initialize the test infrastructure + self._initialize_data_system() + self._initialize_clients() + + def _validate_args(self) -> None: + """Validate input arguments.""" + # Check worker_node_ip for Yuanrong inter_node + if self.backend == "Yuanrong" and self.client_placement == "inter_node" and self.worker_node_ip is None: + raise ValueError("worker_node_ip is required for Yuanrong with client_placement=inter_node") + + def _prepare_configs(self) -> tuple[dict[str, Any], Any]: + """Prepare the base config and storage units. 
+ + Returns: + Tuple of (base_config, zmq_info) + """ + total_storage_size = self.global_batch_size * self.num_global_batch + + config = { + "controller": { + "sampler": "SequentialSampler", + "polling_mode": False, + }, + "backend": { + "storage_backend": self.backend, + }, + } + + # Set client_name based on backend + if self.backend == "Yuanrong": + self.backend_config["client_name"] = "YuanrongStorageClient" + elif self.backend == "MooncakeStore": + self.backend_config["client_name"] = "MooncakeStoreClient" + + # Add backend-specific config + if self.backend == "SimpleStorage": + config["backend"]["SimpleStorage"] = { + "total_storage_size": total_storage_size, + "num_data_storage_units": self.backend_config.get("num_data_storage_units", 1), + } + elif self.backend == "Yuanrong": + config["backend"]["Yuanrong"] = self.backend_config.copy() + # Remove client_placement from the backend config passed to tq + if "client_placement" in config["backend"]["Yuanrong"]: + del config["backend"]["Yuanrong"]["client_placement"] + elif self.backend == "MooncakeStore": + config["backend"]["MooncakeStore"] = self.backend_config.copy() + + return config, None + + def _initialize_data_system(self) -> None: + """Initialize controller and storage units if needed.""" + # For SimpleStorage, we need to manually create storage units with placement + if self.backend == "SimpleStorage": + self._initialize_storage_units() + + def _initialize_storage_units(self) -> None: + """Initialize SimpleStorageUnits for SimpleStorage backend.""" + num_data_storage_units = self.backend_config.get("num_data_storage_units", 1) + total_storage_size = self.global_batch_size * self.num_global_batch + + self.data_system_storage_units = {} + + storage_placement_group = get_placement_group(num_data_storage_units, num_cpus_per_actor=1) + for storage_unit_rank in range(num_data_storage_units): + storage_node = SimpleStorageUnit.options( + placement_group=storage_placement_group, + 
placement_group_bundle_index=storage_unit_rank, + ).remote(storage_unit_size=NUM_TEST_ITERATIONS * math.ceil(total_storage_size / num_data_storage_units)) + self.data_system_storage_units[storage_unit_rank] = storage_node + logger.info(f"StorageUnit #0 ~ #{num_data_storage_units - 1} has been created.") + + self.zmq_info = process_zmq_server_info(self.data_system_storage_units) + + def _initialize_clients(self) -> None: + """Initialize writer and reader TQClientActors.""" + # Determine node placement + if self.client_placement == "intra_node": + writer_node = reader_node = self.head_node_ip + else: + writer_node = self.head_node_ip + reader_node = self.worker_node_ip + + logger.info(f"Writer is on {writer_node}, Reader is on {reader_node}") + + # Prepare base options + writer_options = { + "resources": {f"node:{writer_node}": 0.001}, + } + reader_options = { + "resources": {f"node:{reader_node}": 0.001}, + } + + # Add device-specific options + if self.device == "gpu": + writer_options["num_gpus"] = 1 + reader_options["num_gpus"] = 1 + elif self.device == "npu": + writer_options["resources"]["NPU"] = 1 + reader_options["resources"]["NPU"] = 1 + + # Create writer and reader actors + self.writer = TQClientActor.options(**writer_options).remote(self.base_config) + self.reader = TQClientActor.options(**reader_options).remote(self.base_config) + + # Initialize transfer_queue + logger.info(f"Using {self.backend} as storage backend.") + + w = self.writer.initialize.remote(self.zmq_info) + r = self.reader.initialize.remote(self.zmq_info) + ray.get([w, r]) + + def run_throughput_test(self) -> dict[str, Any]: + """Run the throughput test and print results. 
+ + Returns: + Dictionary with test results + """ + logger.info("Creating large batch for throughput test...") + start_create_data = time.perf_counter() + data_fields, total_data_size_gb = ray.get( + self.writer.create_test_case.remote( + batch_size=self.global_batch_size, + seq_length=self.seq_len, + field_num=self.field_num, + device=self.device, + ) + ) + end_create_data = time.perf_counter() + logger.info(f"Data creation time: {end_create_data - start_create_data:.8f}s") + + partition_id = "train_0" + + # PUT operation using kv_batch_put + logger.info("Starting PUT operation (kv_batch_put)...") + start_put = time.perf_counter() + ray.get(self.writer.put.remote(partition_id=partition_id)) + end_put = time.perf_counter() + put_time = end_put - start_put + put_gbit_per_sec = (total_data_size_gb * 8) / put_time + put_gbyte_per_sec = total_data_size_gb / put_time + logger.info(f"put cost time: {put_time:.8f}s") + logger.info(f"PUT Throughput: {put_gbit_per_sec:.8f} Gb/s ({put_gbyte_per_sec:.8f} GB/s)") + + time.sleep(2) + + # LIST_KEYS operation using kv_list + logger.info("Starting LIST_KEYS operation (kv_list)...") + start_list = time.perf_counter() + keys = ray.get(self.reader.list_keys.remote(partition_id=partition_id)) + end_list = time.perf_counter() + logger.info(f"list_keys cost time: {end_list - start_list:.8f}s") + logger.info(f"Found {len(keys)} keys") + + time.sleep(2) + + # GET_DATA operation using kv_batch_get + logger.info("Starting GET_DATA operation (kv_batch_get)...") + start_get_data = time.perf_counter() + ray.get(self.reader.get_data.remote(partition_id=partition_id, keys=keys)) + end_get_data = time.perf_counter() + get_time = end_get_data - start_get_data + get_gbit_per_sec = (total_data_size_gb * 8) / get_time + get_gbyte_per_sec = total_data_size_gb / get_time + + logger.info(f"get_data cost time: {get_time:.8f}s") + logger.info(f"GET Throughput: {get_gbit_per_sec:.8f} Gb/s ({get_gbyte_per_sec:.8f} GB/s)") + + # Print summary + 
total_gbit_per_sec = (total_data_size_gb * 16) / (put_time + get_time) + total_gbyte_per_sec = (total_data_size_gb * 2) / (put_time + get_time) + + logger.info("=" * 60) + logger.info("THROUGHPUT TEST SUMMARY") + logger.info("=" * 60) + logger.info(f"Backend: {self.backend}") + logger.info(f"Client Placement: {self.client_placement}") + logger.info(f"Device: {self.device}") + logger.info(f"Total Data Size: {total_data_size_gb:.6f} GB") + logger.info(f"PUT Time: {put_time:.8f}s") + logger.info(f"GET Time: {get_time:.8f}s") + logger.info(f"PUT Throughput: {put_gbit_per_sec:.8f} Gb/s ({put_gbyte_per_sec:.8f} GB/s)") + logger.info(f"GET Throughput: {get_gbit_per_sec:.8f} Gb/s ({get_gbyte_per_sec:.8f} GB/s)") + logger.info(f"Total Throughput: {total_gbit_per_sec:.8f} Gb/s ({total_gbyte_per_sec:.8f} GB/s)") + logger.info("=" * 60) + + # Return results + return { + "backend": self.backend, + "client_placement": self.client_placement, + "device": self.device, + "total_data_size_gb": total_data_size_gb, + "put_time": put_time, + "get_time": get_time, + "put_gbit_per_sec": put_gbit_per_sec, + "put_gbyte_per_sec": put_gbyte_per_sec, + "get_gbit_per_sec": get_gbit_per_sec, + "get_gbyte_per_sec": get_gbyte_per_sec, + "total_gbit_per_sec": total_gbit_per_sec, + "total_gbyte_per_sec": total_gbyte_per_sec, + } + + +def load_backend_config(config_path: str | None, backend: str) -> dict[str, Any]: + """Load backend config from YAML file or use defaults. 
+ + Args: + config_path: Path to YAML config file (optional) + backend: Backend type for default config + + Returns: + Backend configuration dictionary + """ + if config_path is not None: + config = OmegaConf.load(config_path) + return OmegaConf.to_container(config, resolve=True) + + # Default configs + if backend == "SimpleStorage": + return {"num_data_storage_units": 1} + elif backend == "Yuanrong": + return { + "host": "127.0.0.1", + "port": 31501, + "enable_yr_npu_transport": False, + "client_placement": "inter_node", + } + elif backend == "MooncakeStore": + return { + "local_hostname": "127.0.0.1", + "metadata_server": "127.0.0.1:8080", + "master_server_address": "127.0.0.1:8081", + } + else: + return {} + + +def write_results_to_csv(results: list[dict[str, Any]], output_path: str) -> None: + """Write test results to CSV file. + + Args: + results: List of result dictionaries + output_path: Path to output CSV file + """ + if not results: + return + + fieldnames = list(results[0].keys()) + + with open(output_path, "w", newline="") as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + writer.writeheader() + for result in results: + writer.writerow(result) + + logger.info(f"Results written to {output_path}") + + +def main() -> None: + """Main entry point for the perftest script.""" + parser = argparse.ArgumentParser(description="TransferQueue Throughput Test") + parser.add_argument( + "--backend", + type=str, + default="SimpleStorage", + choices=["SimpleStorage", "Yuanrong", "MooncakeStore"], + help="Backend type to test (default: SimpleStorage)", + ) + parser.add_argument( + "--backend_config", + type=str, + default=None, + help="Path to backend config YAML file (optional)", + ) + parser.add_argument( + "--device", + type=str, + default="cpu", + choices=["cpu", "npu", "gpu"], + help="Device to use (default: cpu)", + ) + parser.add_argument( + "--global_batch_size", + type=int, + default=1024, + help="Global batch size (default: 1024)", + ) + 
parser.add_argument( + "--field_num", + type=int, + default=10, + help="Number of fields (default: 10)", + ) + parser.add_argument( + "--seq_len", + type=int, + default=8192, + help="Sequence length (default: 8192)", + ) + parser.add_argument( + "--num_global_batch", + type=int, + default=1, + help="Number of global batches (default: 1)", + ) + parser.add_argument( + "--head_node_ip", + type=str, + required=True, + help="Head node IP address", + ) + parser.add_argument( + "--worker_node_ip", + type=str, + default=None, + help="Worker node IP address (required for Yuanrong inter_node)", + ) + parser.add_argument( + "--ray_address", + type=str, + default="auto", + help="Ray cluster address (default: auto)", + ) + parser.add_argument( + "--output_csv", + type=str, + default=None, + help="Path to output CSV file (optional)", + ) + + args = parser.parse_args() + + # Load backend config + backend_config = load_backend_config(args.backend_config, args.backend) + + # Initialize Ray + logger.info(f"Connecting to Ray cluster at {args.ray_address}") + ray.init(address=args.ray_address) + + # Create and run tester + tester = TQThroughputTester( + backend=args.backend, + backend_config=backend_config, + device=args.device, + global_batch_size=args.global_batch_size, + field_num=args.field_num, + seq_len=args.seq_len, + num_global_batch=args.num_global_batch, + head_node_ip=args.head_node_ip, + worker_node_ip=args.worker_node_ip, + output_csv=args.output_csv, + ) + + # Run test multiple times for consistent results using a for loop + all_results = [] + for i in range(NUM_TEST_ITERATIONS): + logger.info("-" * 60) + logger.info(f"Iteration {i + 1}/{NUM_TEST_ITERATIONS}") + logger.info("-" * 60) + result = tester.run_throughput_test() + all_results.append(result) + + # Write to CSV if output path is specified + if args.output_csv: + write_results_to_csv(all_results, args.output_csv) + + logger.info("Throughput test completed successfully!") + + +if __name__ == "__main__": + main() 
diff --git a/scripts/perftest.py b/scripts/perftest.py deleted file mode 100644 index db7d9bf..0000000 --- a/scripts/perftest.py +++ /dev/null @@ -1,566 +0,0 @@ -# Copyright 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2025 The TransferQueue Team -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import logging -import math -import random -import sys -import time -from pathlib import Path -from typing import Any - -import ray -import torch -from omegaconf import OmegaConf -from tensordict import TensorDict -from tensordict.tensorclass import NonTensorData - -parent_dir = Path(__file__).resolve().parent.parent -sys.path.append(str(parent_dir)) - -from transfer_queue.client import TransferQueueClient # noqa: E402 -from transfer_queue.controller import TransferQueueController # noqa: E402 -from transfer_queue.storage.simple_backend import SimpleStorageUnit # noqa: E402 -from transfer_queue.utils.common import get_placement_group # noqa: E402 -from transfer_queue.utils.zmq_utils import process_zmq_server_info # noqa: E402 - -logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") -logger = logging.getLogger(__name__) - - -def create_complex_test_case( - batch_size: int | None = None, - seq_length: int | None = None, - field_num: int | None = None, - device: str = "cpu", -) -> tuple[TensorDict, float]: - """Create a complex test case with tensor and non-tensor fields. 
- - Args: - batch_size: Batch size for the test case - seq_length: Sequence length for tensor fields - field_num: Number of fields to create - device: Device to create tensors on ("cpu", "npu", or "gpu") - - Returns: - Tuple of (TensorDict, total_size_gb) - """ - tensor_field_size_bytes = batch_size * seq_length * 4 - tensor_field_size_gb = tensor_field_size_bytes / (1024**3) - - num_tensor_fields = (field_num + 1) // 2 - num_nontensor_fields = field_num // 2 - - total_tensor_size_gb = tensor_field_size_gb * num_tensor_fields - total_nontensor_size_gb = (batch_size * 1024 / (1024**3)) * num_nontensor_fields - total_size_gb = total_tensor_size_gb + total_nontensor_size_gb - - logger.info(f"Total data size: {total_size_gb:.6f} GB") - - # Determine torch device - torch_device = None - if device == "npu": - torch_device = "npu:0" - elif device == "gpu": - torch_device = "cuda:0" - - fields = {} - for i in range(field_num): - field_name = f"field_{i}" - - if i % 2 == 0: - # Tensor field - tensor_data = torch.randn(batch_size, seq_length, dtype=torch.float32, device=torch_device) - fields[field_name] = tensor_data - else: - # NonTensorData field - str_length = 1024 - non_tensor_data = [ - "".join( - random.choices( - "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789", - k=str_length, - ) - ) - for _ in range(batch_size) - ] - fields[field_name] = NonTensorData(data=non_tensor_data, batch_size=(batch_size,), device=None) - - batch_size_tuple = (batch_size,) - prompt_batch = TensorDict( - fields, - batch_size=batch_size_tuple, - ) - - return prompt_batch, total_size_gb - - -@ray.remote -class TQClientActor: - """Ray actor that holds a TransferQueueClient.""" - - def __init__(self, client_id: str, controller_info: Any): - self.client = TransferQueueClient( - client_id=client_id, - controller_info=controller_info, - ) - self.prompt_meta = None - self.test_data = None - self.total_data_size_gb = 0.0 - - def initialize_storage_manager(self, manager_type: str, 
config: dict[str, Any]) -> None: - """Initialize the storage manager with given config.""" - self.client.initialize_storage_manager(manager_type=manager_type, config=config) - - def create_complex_test_case( - self, - batch_size: int | None = None, - seq_length: int | None = None, - field_num: int | None = None, - device: str = "cpu", - ) -> tuple[list[str], float]: - """Create test case on the actor.""" - self.test_data, self.total_data_size_gb = create_complex_test_case(batch_size, seq_length, field_num, device) - return list(self.test_data.keys()), self.total_data_size_gb - - def put(self, partition_id: str) -> None: - """Put data to storage.""" - self.client.put(data=self.test_data, partition_id=partition_id) - - def get_meta( - self, - data_fields: list[str], - batch_size: int, - partition_id: str, - task_name: str | None = None, - sampling_config: dict[str, Any] | None = None, - ) -> Any: - """Get metadata from controller.""" - self.prompt_meta = self.client.get_meta( - data_fields=data_fields, - batch_size=batch_size, - partition_id=partition_id, - task_name=task_name, - sampling_config=sampling_config, - ) - return self.prompt_meta - - def get_data(self) -> None: - """Get data from storage using cached metadata.""" - self.client.get_data(self.prompt_meta) - - -class TQThroughputTester: - """Main throughput tester for TransferQueue backends.""" - - def __init__( - self, - backend: str, - client_placement: str, - backend_config: dict[str, Any], - device: str, - global_batch_size: int, - field_num: int, - seq_len: int, - num_global_batch: int, - head_node_ip: str, - worker_node_ip: str | None = None, - ): - """Initialize the throughput tester. 
- - Args: - backend: Backend type ("default", "yuanrong", "mooncake") - client_placement: Client placement mode ("intra_node" or "inter_node") - backend_config: Backend configuration dictionary - device: Device type ("cpu", "npu", "gpu") - global_batch_size: Global batch size - field_num: Number of fields - seq_len: Sequence length - num_global_batch: Number of global batches - head_node_ip: Head node IP address - worker_node_ip: Worker node IP address (required for inter_node) - """ - self.backend = backend - self.client_placement = client_placement - self.backend_config = backend_config - self.device = device - self.global_batch_size = global_batch_size - self.field_num = field_num - self.seq_len = seq_len - self.num_global_batch = num_global_batch - self.head_node_ip = head_node_ip - self.worker_node_ip = worker_node_ip - - # Validate arguments - self._validate_args() - - # Determine manager type and prepare configs - self.manager_type = self._get_manager_type() - self.writer_config, self.reader_config = self._prepare_backend_configs() - - # Initialize the test infrastructure - self._initialize_data_system() - self._initialize_clients() - - def _validate_args(self) -> None: - """Validate input arguments.""" - if self.client_placement == "inter_node" and self.worker_node_ip is None: - raise ValueError("worker_node_ip is required for inter_node client placement") - if self.backend == "default": - storage_unit_placement = self.backend_config.get("storage_unit_placement", "normal") - if storage_unit_placement == "remote" and self.worker_node_ip is None: - raise ValueError("worker_node_ip is required for remote storage_unit_placement") - - def _get_manager_type(self) -> str: - """Get the storage manager type based on backend.""" - if self.backend == "default": - return "AsyncSimpleStorageManager" - elif self.backend == "yuanrong": - return "YuanrongStorageManager" - elif self.backend == "mooncake": - return "MooncakeStorageManager" - else: - raise 
ValueError(f"Unknown backend: {self.backend}") - - def _prepare_backend_configs(self) -> tuple[dict[str, Any], dict[str, Any]]: - """Prepare writer and reader backend configs. - - Returns: - Tuple of (writer_config, reader_config) - """ - # Set client_name based on backend - base_config = self.backend_config.copy() - if self.backend == "yuanrong": - base_config["client_name"] = "YuanrongStorageClient" - elif self.backend == "mooncake": - base_config["client_name"] = "MooncakeStoreClient" - - writer_config = base_config.copy() - reader_config = base_config.copy() - - if self.client_placement == "inter_node": - if self.backend == "yuanrong": - writer_config["host"] = self.head_node_ip - reader_config["host"] = self.worker_node_ip - elif self.backend == "mooncake": - writer_config["local_hostname"] = self.head_node_ip - reader_config["local_hostname"] = self.worker_node_ip - - return writer_config, reader_config - - def _initialize_data_system(self) -> None: - """Initialize controller and storage units if needed.""" - # Initialize controller - self.data_system_controller = TransferQueueController.remote() - logger.info("TransferQueueController has been created.") - self.data_system_controller_info = process_zmq_server_info(self.data_system_controller) - - # Initialize storage units for default backend - if self.backend == "default": - self._initialize_storage_units() - - def _initialize_storage_units(self) -> None: - """Initialize SimpleStorageUnits for default backend.""" - num_data_storage_units = self.backend_config.get("num_data_storage_units", 8) - storage_unit_placement = self.backend_config.get("storage_unit_placement", "normal") - total_storage_size = self.global_batch_size * self.num_global_batch - - self.data_system_storage_units = {} - - if storage_unit_placement == "remote": - # Remote mode: create all storage units on worker node - for storage_unit_rank in range(num_data_storage_units): - storage_node = SimpleStorageUnit.options( - num_cpus=1, - 
resources={f"node:{self.worker_node_ip}": 0.001}, - ).remote(storage_unit_size=3 * math.ceil(total_storage_size / num_data_storage_units)) - self.data_system_storage_units[storage_unit_rank] = storage_node - logger.info( - f"StorageUnit #0 ~ #{num_data_storage_units - 1} has been created on worker node {self.worker_node_ip}." - ) - else: - # Normal mode: create storage units using placement group - storage_placement_group = get_placement_group(num_data_storage_units, num_cpus_per_actor=1) - for storage_unit_rank in range(num_data_storage_units): - storage_node = SimpleStorageUnit.options( - placement_group=storage_placement_group, - placement_group_bundle_index=storage_unit_rank, - ).remote(storage_unit_size=3 * math.ceil(total_storage_size / num_data_storage_units)) - self.data_system_storage_units[storage_unit_rank] = storage_node - logger.info(f"StorageUnit #0 ~ #{num_data_storage_units - 1} has been created.") - - self.data_system_storage_unit_infos = process_zmq_server_info(self.data_system_storage_units) - # Add storage unit infos to backend configs - self.writer_config["zmq_info"] = self.data_system_storage_unit_infos - self.reader_config["zmq_info"] = self.data_system_storage_unit_infos - - def _initialize_clients(self) -> None: - """Initialize writer and reader TQClientActors.""" - # Determine node placement - if self.client_placement == "intra_node": - writer_node = reader_node = self.head_node_ip - else: - writer_node = self.head_node_ip - reader_node = self.worker_node_ip - - logger.info(f"Writer is on {writer_node}, Reader is on {reader_node}") - - # Prepare base options - writer_options = { - "resources": {f"node:{writer_node}": 0.001}, - } - reader_options = { - "resources": {f"node:{reader_node}": 0.001}, - } - - # Add device-specific options - if self.device == "gpu": - writer_options["num_gpus"] = 1 - reader_options["num_gpus"] = 1 - elif self.device == "npu": - writer_options["resources"]["NPU"] = 1 - reader_options["resources"]["NPU"] = 1 - - # 
Create writer and reader actors - self.writer = TQClientActor.options(**writer_options).remote("writer", self.data_system_controller_info) - self.reader = TQClientActor.options(**reader_options).remote("reader", self.data_system_controller_info) - - # Initialize storage managers - logger.info(f"Using {self.manager_type} as storage backend.") - - w = self.writer.initialize_storage_manager.remote(manager_type=self.manager_type, config=self.writer_config) - r = self.reader.initialize_storage_manager.remote(manager_type=self.manager_type, config=self.reader_config) - ray.get([w, r]) - - def run_throughput_test(self) -> None: - """Run the throughput test and print results.""" - logger.info("Creating large batch for throughput test...") - start_create_data = time.time() - data_fields, total_data_size_gb = ray.get( - self.writer.create_complex_test_case.remote( - batch_size=self.global_batch_size, - seq_length=self.seq_len, - field_num=self.field_num, - device=self.device, - ) - ) - end_create_data = time.time() - logger.info(f"Data creation time: {end_create_data - start_create_data:.8f}s") - - # PUT operation - logger.info("Starting PUT operation...") - start_put = time.time() - ray.get(self.writer.put.remote(partition_id="train_0")) - end_put = time.time() - put_time = end_put - start_put - put_throughput_gbps = (total_data_size_gb * 8) / put_time - put_throughput_gbs = total_data_size_gb / put_time - logger.info(f"put cost time: {put_time:.8f}s") - logger.info(f"PUT Throughput: {put_throughput_gbps:.8f} Gb/s ({put_throughput_gbs:.8f} GB/s)") - - time.sleep(2) - - # GET_META operation - logger.info("Starting GET_META operation...") - start_get_meta = time.time() - ray.wait( - [ - self.reader.get_meta.remote( - data_fields=list(data_fields), - batch_size=self.global_batch_size, - partition_id="train_0", - task_name="generate_sequences", - ) - ] - ) - end_get_meta = time.time() - logger.info(f"get_meta cost time: {end_get_meta - start_get_meta:.8f}s") - - time.sleep(2) - 
- # GET_DATA operation - logger.info("Starting GET_DATA operation...") - start_get_data = time.time() - ray.get(self.reader.get_data.remote()) - end_get_data = time.time() - get_time = end_get_data - start_get_data - get_throughput_gbps = (total_data_size_gb * 8) / get_time - get_throughput_gbs = total_data_size_gb / get_time - - logger.info(f"get_data cost time: {get_time:.8f}s") - logger.info(f"GET Throughput: {get_throughput_gbps:.8f} Gb/s ({get_throughput_gbs:.8f} GB/s)") - - # Print summary - total_throughput_gbps = (total_data_size_gb * 16) / (put_time + get_time) - total_throughput_gbs = (total_data_size_gb * 2) / (put_time + get_time) - - logger.info("=" * 60) - logger.info("THROUGHPUT TEST SUMMARY") - logger.info("=" * 60) - logger.info(f"Backend: {self.backend}") - logger.info(f"Client Placement: {self.client_placement}") - logger.info(f"Device: {self.device}") - logger.info(f"Total Data Size: {total_data_size_gb:.6f} GB") - logger.info(f"PUT Time: {put_time:.8f}s") - logger.info(f"GET Time: {get_time:.8f}s") - logger.info(f"PUT Throughput: {put_throughput_gbps:.8f} Gb/s ({put_throughput_gbs:.8f} GB/s)") - logger.info(f"GET Throughput: {get_throughput_gbps:.8f} Gb/s ({get_throughput_gbs:.8f} GB/s)") - logger.info(f"Total Throughput: {total_throughput_gbps:.8f} Gb/s ({total_throughput_gbs:.8f} GB/s)") - logger.info("=" * 60) - - -def load_backend_config(config_path: str | None, backend: str) -> dict[str, Any]: - """Load backend config from YAML file or use defaults. 
- - Args: - config_path: Path to YAML config file (optional) - backend: Backend type for default config - - Returns: - Backend configuration dictionary - """ - if config_path is not None: - config = OmegaConf.load(config_path) - return OmegaConf.to_container(config, resolve=True) - - # Default configs - if backend == "default": - return {"num_data_storage_units": 1, "storage_unit_placement": "normal"} - elif backend == "yuanrong": - return { - "host": "127.0.0.1", - "port": 31501, - "enable_yr_npu_transport": False, - } - elif backend == "mooncake": - return { - "local_hostname": "127.0.0.1", - "metadata_server": "127.0.0.1:8080", - "master_server_address": "127.0.0.1:8081", - } - else: - return {} - - -def main() -> None: - """Main entry point for the perftest script.""" - parser = argparse.ArgumentParser(description="TransferQueue Throughput Test") - parser.add_argument( - "--backend", - type=str, - default="default", - choices=["default", "yuanrong", "mooncake"], - help="Backend type to test (default: default)", - ) - parser.add_argument( - "--client_placement", - type=str, - default="intra_node", - choices=["intra_node", "inter_node"], - help="Client placement mode (default: intra_node)", - ) - parser.add_argument( - "--backend_config", - type=str, - default=None, - help="Path to backend config YAML file (optional)", - ) - parser.add_argument( - "--device", - type=str, - default="cpu", - choices=["cpu", "npu", "gpu"], - help="Device to use (default: cpu)", - ) - parser.add_argument( - "--global_batch_size", - type=int, - default=1024, - help="Global batch size (default: 1024)", - ) - parser.add_argument( - "--field_num", - type=int, - default=10, - help="Number of fields (default: 10)", - ) - parser.add_argument( - "--seq_len", - type=int, - default=8192, - help="Sequence length (default: 8192)", - ) - parser.add_argument( - "--num_global_batch", - type=int, - default=1, - help="Number of global batches (default: 1)", - ) - parser.add_argument( - 
"--head_node_ip", - type=str, - required=True, - help="Head node IP address", - ) - parser.add_argument( - "--worker_node_ip", - type=str, - default=None, - help="Worker node IP address (required for inter_node)", - ) - parser.add_argument( - "--ray_address", - type=str, - default="auto", - help="Ray cluster address (default: auto)", - ) - - args = parser.parse_args() - - # Load backend config - backend_config = load_backend_config(args.backend_config, args.backend) - - # Initialize Ray - logger.info(f"Connecting to Ray cluster at {args.ray_address}") - ray.init(address=args.ray_address) - - # Create and run tester - tester = TQThroughputTester( - backend=args.backend, - client_placement=args.client_placement, - backend_config=backend_config, - device=args.device, - global_batch_size=args.global_batch_size, - field_num=args.field_num, - seq_len=args.seq_len, - num_global_batch=args.num_global_batch, - head_node_ip=args.head_node_ip, - worker_node_ip=args.worker_node_ip, - ) - - # Run test multiple times for consistent results - print("-" * 60) - tester.run_throughput_test() - print("-" * 60) - tester.run_throughput_test() - print("-" * 60) - tester.run_throughput_test() - - logger.info("Throughput test completed successfully!") - - -if __name__ == "__main__": - main() From a9f70bd26aaf1f9e9af00d748a9adbf8a8375c53 Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Mon, 23 Mar 2026 15:28:56 +0800 Subject: [PATCH 05/29] reduce num_cpus for ci Signed-off-by: tianyi-ge --- .github/workflows/perftest.yml | 3 ++- scripts/performance_test/perftest.py | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/perftest.yml b/.github/workflows/perftest.yml index 9356c02..a4fb231 100644 --- a/.github/workflows/perftest.yml +++ b/.github/workflows/perftest.yml @@ -30,7 +30,7 @@ jobs: run: | python -m pip install --upgrade pip pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu - pip install -e ".[test,build,yuanrong]" + pip 
install -e . - name: Start Ray cluster run: | # Get the host IP @@ -52,6 +52,7 @@ jobs: --field_num=4 \ --seq_len=1024 \ --head_node_ip=$HOST_IP + --output_csv=results.csv - name: Stop Ray cluster run: | ray stop diff --git a/scripts/performance_test/perftest.py b/scripts/performance_test/perftest.py index 3b7edb3..85b87d3 100644 --- a/scripts/performance_test/perftest.py +++ b/scripts/performance_test/perftest.py @@ -253,7 +253,7 @@ def _initialize_storage_units(self) -> None: self.data_system_storage_units = {} - storage_placement_group = get_placement_group(num_data_storage_units, num_cpus_per_actor=1) + storage_placement_group = get_placement_group(num_data_storage_units, num_cpus_per_actor=0.001) for storage_unit_rank in range(num_data_storage_units): storage_node = SimpleStorageUnit.options( placement_group=storage_placement_group, @@ -277,9 +277,11 @@ def _initialize_clients(self) -> None: # Prepare base options writer_options = { + "num_cpus": 0.001, "resources": {f"node:{writer_node}": 0.001}, } reader_options = { + "num_cpus": 0.001, "resources": {f"node:{reader_node}": 0.001}, } From c43519c8e47c8d61de1f2fdbeddd5c858f998f64 Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Mon, 23 Mar 2026 15:29:45 +0800 Subject: [PATCH 06/29] reduce perftest ci timeout to 10 min Signed-off-by: tianyi-ge --- .github/workflows/perftest.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/perftest.yml b/.github/workflows/perftest.yml index a4fb231..a39121e 100644 --- a/.github/workflows/perftest.yml +++ b/.github/workflows/perftest.yml @@ -14,7 +14,7 @@ on: jobs: perftest: runs-on: ubuntu-latest - timeout-minutes: 30 + timeout-minutes: 10 strategy: fail-fast: false matrix: @@ -51,7 +51,7 @@ jobs: --global_batch_size=128 \ --field_num=4 \ --seq_len=1024 \ - --head_node_ip=$HOST_IP + --head_node_ip=$HOST_IP \ --output_csv=results.csv - name: Stop Ray cluster run: | From c3f69a821270013bdb292d84dcc7a761c7accbea Mon Sep 17 00:00:00 2001 
From: tianyi-ge Date: Mon, 23 Mar 2026 15:44:10 +0800 Subject: [PATCH 07/29] fix ci Signed-off-by: tianyi-ge --- .github/workflows/perftest.yml | 1 + scripts/performance_test/perftest.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/perftest.yml b/.github/workflows/perftest.yml index a39121e..2a72428 100644 --- a/.github/workflows/perftest.yml +++ b/.github/workflows/perftest.yml @@ -38,6 +38,7 @@ jobs: echo "Host IP: $HOST_IP" # Start Ray with node resource ray start --head --resources="{\"node:$HOST_IP\":1}" + ray status - name: Run SimpleStorage performance test run: | # Get the host IP diff --git a/scripts/performance_test/perftest.py b/scripts/performance_test/perftest.py index 85b87d3..30a0ad2 100644 --- a/scripts/performance_test/perftest.py +++ b/scripts/performance_test/perftest.py @@ -253,7 +253,7 @@ def _initialize_storage_units(self) -> None: self.data_system_storage_units = {} - storage_placement_group = get_placement_group(num_data_storage_units, num_cpus_per_actor=0.001) + storage_placement_group = get_placement_group(num_data_storage_units, num_cpus_per_actor=1) for storage_unit_rank in range(num_data_storage_units): storage_node = SimpleStorageUnit.options( placement_group=storage_placement_group, From a641e95cc38fb5b2755b1bc8e3d989f1e0b5480e Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Mon, 23 Mar 2026 22:44:57 +0800 Subject: [PATCH 08/29] 1. use transfer_queue/config.yaml instead of new configs 2. 
add num_test_iterations as a new option Signed-off-by: tianyi-ge --- .github/workflows/perftest.yml | 2 +- scripts/performance_test/README_PERFTEST.md | 60 ++--- .../configs/mooncake_store.yaml | 4 - .../configs/simple_storage.yaml | 2 - .../performance_test/configs/yuanrong.yaml | 5 - scripts/performance_test/perftest.py | 234 +++++------------- transfer_queue/config.yaml | 8 +- 7 files changed, 85 insertions(+), 230 deletions(-) delete mode 100644 scripts/performance_test/configs/mooncake_store.yaml delete mode 100644 scripts/performance_test/configs/simple_storage.yaml delete mode 100644 scripts/performance_test/configs/yuanrong.yaml diff --git a/.github/workflows/perftest.yml b/.github/workflows/perftest.yml index 2a72428..30097cc 100644 --- a/.github/workflows/perftest.yml +++ b/.github/workflows/perftest.yml @@ -47,7 +47,7 @@ jobs: # Run the perftest with small batch size for quick test cd scripts/performance_test python perftest.py \ - --backend=SimpleStorage \ + --backend_config=../../transfer_queue/config.yaml \ --device=cpu \ --global_batch_size=128 \ --field_num=4 \ diff --git a/scripts/performance_test/README_PERFTEST.md b/scripts/performance_test/README_PERFTEST.md index 0a82ce6..9b979ae 100644 --- a/scripts/performance_test/README_PERFTEST.md +++ b/scripts/performance_test/README_PERFTEST.md @@ -10,7 +10,7 @@ This script runs throughput tests for TransferQueue with different backends. ray start --head --resources='{"node:192.168.0.1":1}' # On worker node - ray start --address=192.168.0.1 --resources='{"node:192.168.0.2":1}' + ray start --address=192.168.0.1:6379 --resources='{"node:192.168.0.2":1}' ``` 2. Start the backend service (Yuanrong, MooncakeStore, etc.) if testing non-SimpleStorage backends. @@ -19,13 +19,11 @@ This script runs throughput tests for TransferQueue with different backends. 
```bash python perftest.py \ - --backend=[SimpleStorage|Yuanrong|MooncakeStore] \ - --backend_config=xxx.yaml \ + --backend_config=../../transfer_queue/config.yaml \ --device=[cpu|npu|gpu] \ --global_batch_size=1024 \ --field_num=10 \ --seq_len=8192 \ - --num_global_batch=1 \ --head_node_ip=192.168.0.1 \ --worker_node_ip=192.168.0.2 ``` @@ -34,77 +32,53 @@ python perftest.py \ | Argument | Description | Default | |----------|-------------|---------| -| `--backend` | Backend type: SimpleStorage, Yuanrong, MooncakeStore | SimpleStorage | -| `--backend_config` | Path to YAML config file (optional) | None | +| `--backend_config` | Path to backend config YAML file (required) | - | | `--device` | Device: cpu, npu, gpu | cpu | | `--global_batch_size` | Global batch size | 1024 | | `--field_num` | Number of fields | 10 | | `--seq_len` | Sequence length | 8192 | -| `--num_global_batch` | Number of global batches | 1 | +| `--num_test_iterations` | Number of test iterations | 3 | | `--head_node_ip` | Head node IP (required) | - | -| `--worker_node_ip` | Worker node IP (required for Yuanrong inter_node) | None | +| `--worker_node_ip` | Worker node IP (required for Yuanrong) | None | | `--ray_address` | Ray cluster address | auto | | `--output_csv` | Path to output CSV file (optional) | None | ## Backend Configuration -Sample config files are in `configs/`: - -- **simple_storage.yaml**: SimpleStorage backend config - ```yaml - num_data_storage_units: 1 - ``` - -- **yuanrong.yaml**: Yuanrong backend config - ```yaml - host: 127.0.0.1 - port: 31501 - enable_yr_npu_transport: false - client_placement: inter_node # or "intra_node" - ``` - -- **mooncake_store.yaml**: MooncakeStore backend config - ```yaml - local_hostname: 127.0.0.1 - metadata_server: 127.0.0.1:8080 - master_server_address: 127.0.0.1:8081 - ``` +The script reads the backend configuration directly from the provided `--backend_config` YAML file. 
The backend type is determined by `backend.storage_backend` in the config file. For device support of each backend, - `SimpleStorage` backend supports `cpu` - `Yuanrong` supports `cpu` and `npu` - `MooncakeStore` supports `cpu` and `gpu` -## Yuanrong Client Placement +## Yuanrong Backend -For Yuanrong backend, since `put` is always local-first, we need to start client actors on different nodes to test cross-node transfer. The client placement is configured in the YAML file: -- `client_placement: intra_node`: Both writer and reader run on head node -- `client_placement: inter_node`: Writer runs on head node, reader runs on worker node (default) +For Yuanrong backend, writer runs on head node and reader runs on worker node. ## Examples -### SimpleStorage backend +### SimpleStorage/Mooncake backend ```bash -python perftest.py --backend=SimpleStorage \ +python perftest.py --backend_config=../../transfer_queue/config.yaml \ --head_node_ip=192.168.0.1 ``` ### Yuanrong backend ```bash -python perftest.py --backend=Yuanrong \ - --backend_config=configs/yuanrong.yaml \ +python perftest.py --backend_config=../../transfer_queue/config.yaml \ --head_node_ip=192.168.0.1 --worker_node_ip=192.168.0.2 ``` ### NPU device test ```bash -python perftest.py --backend=Yuanrong --device=npu \ - --head_node_ip=192.168.0.1 +python perftest.py --backend_config=../../transfer_queue/config.yaml --device=npu \ + --head_node_ip=192.168.0.1 --worker_node_ip=192.168.0.2 ``` ### Output to CSV ```bash -python perftest.py --backend=SimpleStorage \ +python perftest.py --backend_config=../../transfer_queue/config.yaml \ --head_node_ip=192.168.0.1 --output_csv=results.csv ``` @@ -122,16 +96,12 @@ Throughput is shown in both Gb/s (gigabits per second) and GB/s (gigabytes per s When using `--output_csv`, the test writes results to a CSV file with the following columns: - backend -- client_placement - device - total_data_size_gb - put_time - get_time - put_gbit_per_sec -- put_gbyte_per_sec - 
get_gbit_per_sec -- get_gbyte_per_sec - total_gbit_per_sec -- total_gbyte_per_sec -The test runs 3 iterations and saves all 3 results to the CSV. +The test runs `--num_test_iterations` iterations (default: 3) and saves all results to the CSV. diff --git a/scripts/performance_test/configs/mooncake_store.yaml b/scripts/performance_test/configs/mooncake_store.yaml deleted file mode 100644 index 320801f..0000000 --- a/scripts/performance_test/configs/mooncake_store.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Mooncake backend configuration -local_hostname: 127.0.0.1 -metadata_server: 127.0.0.1:8080 -master_server_address: 127.0.0.1:8081 diff --git a/scripts/performance_test/configs/simple_storage.yaml b/scripts/performance_test/configs/simple_storage.yaml deleted file mode 100644 index 2eb397c..0000000 --- a/scripts/performance_test/configs/simple_storage.yaml +++ /dev/null @@ -1,2 +0,0 @@ -# TransferQueue (default) backend configuration -num_data_storage_units: 1 diff --git a/scripts/performance_test/configs/yuanrong.yaml b/scripts/performance_test/configs/yuanrong.yaml deleted file mode 100644 index b4c52e5..0000000 --- a/scripts/performance_test/configs/yuanrong.yaml +++ /dev/null @@ -1,5 +0,0 @@ -# Yuanrong backend configuration -host: 127.0.0.1 -port: 31501 -enable_yr_npu_transport: false -client_placement: inter_node diff --git a/scripts/performance_test/perftest.py b/scripts/performance_test/perftest.py index 30a0ad2..249af0b 100644 --- a/scripts/performance_test/perftest.py +++ b/scripts/performance_test/perftest.py @@ -17,7 +17,6 @@ import argparse import csv import logging -import math import sys import time from pathlib import Path @@ -32,16 +31,10 @@ sys.path.append(str(parent_dir)) import transfer_queue as tq # noqa: E402 -from transfer_queue.storage.simple_backend import SimpleStorageUnit # noqa: E402 -from transfer_queue.utils.common import get_placement_group # noqa: E402 -from transfer_queue.utils.zmq_utils import process_zmq_server_info # noqa: E402 
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") logger = logging.getLogger(__name__) -# Constants -NUM_TEST_ITERATIONS = 3 - def create_test_case( batch_size: int | None = None, @@ -93,19 +86,15 @@ def create_test_case( class TQClientActor: """Ray actor that uses tq.init(config) to initialize.""" - def __init__(self, base_config: dict[str, Any]): - self.base_config = base_config + def __init__(self, config: dict[str, Any]): + self.config = config self.test_data = None self.total_data_size_gb = 0.0 self.test_keys = None - def initialize(self, zmq_info: Any = None) -> None: + def initialize(self) -> None: """Initialize transfer_queue with the config.""" - config = OmegaConf.create(self.base_config, flags={"allow_objects": True}) - if zmq_info is not None and self.base_config["backend"]["storage_backend"] == "SimpleStorage": - # Use dict-style assignment to avoid OmegaConf validation - config["backend"]["SimpleStorage"]["zmq_info"] = zmq_info - tq.init(config) + tq.init(OmegaConf.create(self.config)) def create_test_case( self, @@ -137,19 +126,22 @@ def get_data(self, partition_id: str, keys: list[str] | None = None) -> None: keys = self.test_keys tq.kv_batch_get(keys=keys, partition_id=partition_id) + def close(self) -> None: + """Close transfer_queue.""" + tq.close() + class TQThroughputTester: """Main throughput tester for TransferQueue backends.""" def __init__( self, - backend: str, - backend_config: dict[str, Any], + backend_config_path: str, device: str, global_batch_size: int, field_num: int, seq_len: int, - num_global_batch: int, + num_test_iterations: int, head_node_ip: str, worker_node_ip: str | None = None, output_csv: str | None = None, @@ -157,121 +149,71 @@ def __init__( """Initialize the throughput tester. 
Args: - backend: Backend type ("SimpleStorage", "Yuanrong", "MooncakeStore") - backend_config: Backend configuration dictionary + backend_config_path: Path to backend config YAML file device: Device type ("cpu", "npu", "gpu") global_batch_size: Global batch size field_num: Number of fields seq_len: Sequence length - num_global_batch: Number of global batches + num_test_iterations: Number of test iterations head_node_ip: Head node IP address - worker_node_ip: Worker node IP address (required for Yuanrong inter_node) + worker_node_ip: Worker node IP address (required for Yuanrong) output_csv: Path to output CSV file (optional) """ - self.backend = backend - self.backend_config = backend_config + self.backend_config_path = backend_config_path self.device = device self.global_batch_size = global_batch_size self.field_num = field_num self.seq_len = seq_len - self.num_global_batch = num_global_batch + self.num_test_iterations = num_test_iterations self.head_node_ip = head_node_ip self.worker_node_ip = worker_node_ip self.output_csv = output_csv - # Get client_placement from Yuanrong config, default to inter_node - self.client_placement = ( - self.backend_config.get("client_placement", "inter_node") if self.backend == "Yuanrong" else "intra_node" - ) + # Prepare full config for tq.init() + self.full_config = self._prepare_config() + + # Get backend from config + self.backend = self.full_config["backend"]["storage_backend"] + + # For Yuanrong, always use inter_node + self.use_inter_node = self.backend == "Yuanrong" # Validate arguments self._validate_args() - # Prepare full config for tq.init() - self.base_config, self.zmq_info = self._prepare_configs() - - # Initialize the test infrastructure - self._initialize_data_system() + # Initialize clients self._initialize_clients() def _validate_args(self) -> None: """Validate input arguments.""" - # Check worker_node_ip for Yuanrong inter_node - if self.backend == "Yuanrong" and self.client_placement == "inter_node" and 
self.worker_node_ip is None: - raise ValueError("worker_node_ip is required for Yuanrong with client_placement=inter_node") + # Check worker_node_ip for Yuanrong + if self.use_inter_node and self.worker_node_ip is None: + raise ValueError("worker_node_ip is required for Yuanrong backend") - def _prepare_configs(self) -> tuple[dict[str, Any], Any]: - """Prepare the base config and storage units. + def _prepare_config(self) -> dict[str, Any]: + """Prepare the config by directly reading the backend_config file. Returns: - Tuple of (base_config, zmq_info) + Configuration dictionary """ - total_storage_size = self.global_batch_size * self.num_global_batch - - config = { - "controller": { - "sampler": "SequentialSampler", - "polling_mode": False, - }, - "backend": { - "storage_backend": self.backend, - }, - } + # Directly read the backend_config file, no merging with default + config = OmegaConf.load(self.backend_config_path) - # Set client_name based on backend - if self.backend == "Yuanrong": - self.backend_config["client_name"] = "YuanrongStorageClient" - elif self.backend == "MooncakeStore": - self.backend_config["client_name"] = "MooncakeStoreClient" - - # Add backend-specific config - if self.backend == "SimpleStorage": - config["backend"]["SimpleStorage"] = { - "total_storage_size": total_storage_size, - "num_data_storage_units": self.backend_config.get("num_data_storage_units", 1), - } - elif self.backend == "Yuanrong": - config["backend"]["Yuanrong"] = self.backend_config.copy() - # Remove client_placement from the backend config passed to tq - if "client_placement" in config["backend"]["Yuanrong"]: - del config["backend"]["Yuanrong"]["client_placement"] - elif self.backend == "MooncakeStore": - config["backend"]["MooncakeStore"] = self.backend_config.copy() - - return config, None - - def _initialize_data_system(self) -> None: - """Initialize controller and storage units if needed.""" - # For SimpleStorage, we need to manually create storage units with 
placement - if self.backend == "SimpleStorage": - self._initialize_storage_units() - - def _initialize_storage_units(self) -> None: - """Initialize SimpleStorageUnits for SimpleStorage backend.""" - num_data_storage_units = self.backend_config.get("num_data_storage_units", 1) - total_storage_size = self.global_batch_size * self.num_global_batch - - self.data_system_storage_units = {} - - storage_placement_group = get_placement_group(num_data_storage_units, num_cpus_per_actor=1) - for storage_unit_rank in range(num_data_storage_units): - storage_node = SimpleStorageUnit.options( - placement_group=storage_placement_group, - placement_group_bundle_index=storage_unit_rank, - ).remote(storage_unit_size=NUM_TEST_ITERATIONS * math.ceil(total_storage_size / num_data_storage_units)) - self.data_system_storage_units[storage_unit_rank] = storage_node - logger.info(f"StorageUnit #0 ~ #{num_data_storage_units - 1} has been created.") - - self.zmq_info = process_zmq_server_info(self.data_system_storage_units) + # If backend.storage_backend is SimpleStorage, override total_storage_size + total_storage_size = self.global_batch_size * self.num_test_iterations + if config.backend.storage_backend == "SimpleStorage": + config.backend.SimpleStorage.total_storage_size = total_storage_size + + return OmegaConf.to_container(config, resolve=True) def _initialize_clients(self) -> None: """Initialize writer and reader TQClientActors.""" # Determine node placement - if self.client_placement == "intra_node": - writer_node = reader_node = self.head_node_ip - else: + if self.use_inter_node: writer_node = self.head_node_ip reader_node = self.worker_node_ip + else: + writer_node = reader_node = self.head_node_ip logger.info(f"Writer is on {writer_node}, Reader is on {reader_node}") @@ -294,14 +236,14 @@ def _initialize_clients(self) -> None: reader_options["resources"]["NPU"] = 1 # Create writer and reader actors - self.writer = TQClientActor.options(**writer_options).remote(self.base_config) - 
self.reader = TQClientActor.options(**reader_options).remote(self.base_config) + self.writer = TQClientActor.options(**writer_options).remote(self.full_config) + self.reader = TQClientActor.options(**reader_options).remote(self.full_config) # Initialize transfer_queue logger.info(f"Using {self.backend} as storage backend.") - w = self.writer.initialize.remote(self.zmq_info) - r = self.reader.initialize.remote(self.zmq_info) + w = self.writer.initialize.remote() + r = self.reader.initialize.remote() ray.get([w, r]) def run_throughput_test(self) -> dict[str, Any]: @@ -333,18 +275,12 @@ def run_throughput_test(self) -> dict[str, Any]: put_time = end_put - start_put put_gbit_per_sec = (total_data_size_gb * 8) / put_time put_gbyte_per_sec = total_data_size_gb / put_time - logger.info(f"put cost time: {put_time:.8f}s") - logger.info(f"PUT Throughput: {put_gbit_per_sec:.8f} Gb/s ({put_gbyte_per_sec:.8f} GB/s)") time.sleep(2) # LIST_KEYS operation using kv_list logger.info("Starting LIST_KEYS operation (kv_list)...") - start_list = time.perf_counter() keys = ray.get(self.reader.list_keys.remote(partition_id=partition_id)) - end_list = time.perf_counter() - logger.info(f"list_keys cost time: {end_list - start_list:.8f}s") - logger.info(f"Found {len(keys)} keys") time.sleep(2) @@ -357,9 +293,6 @@ def run_throughput_test(self) -> dict[str, Any]: get_gbit_per_sec = (total_data_size_gb * 8) / get_time get_gbyte_per_sec = total_data_size_gb / get_time - logger.info(f"get_data cost time: {get_time:.8f}s") - logger.info(f"GET Throughput: {get_gbit_per_sec:.8f} Gb/s ({get_gbyte_per_sec:.8f} GB/s)") - # Print summary total_gbit_per_sec = (total_data_size_gb * 16) / (put_time + get_time) total_gbyte_per_sec = (total_data_size_gb * 2) / (put_time + get_time) @@ -368,7 +301,6 @@ def run_throughput_test(self) -> dict[str, Any]: logger.info("THROUGHPUT TEST SUMMARY") logger.info("=" * 60) logger.info(f"Backend: {self.backend}") - logger.info(f"Client Placement: {self.client_placement}") 
logger.info(f"Device: {self.device}") logger.info(f"Total Data Size: {total_data_size_gb:.6f} GB") logger.info(f"PUT Time: {put_time:.8f}s") @@ -378,55 +310,21 @@ def run_throughput_test(self) -> dict[str, Any]: logger.info(f"Total Throughput: {total_gbit_per_sec:.8f} Gb/s ({total_gbyte_per_sec:.8f} GB/s)") logger.info("=" * 60) - # Return results + # Return results (only Gb/s for CSV, not GB/s) return { "backend": self.backend, - "client_placement": self.client_placement, "device": self.device, "total_data_size_gb": total_data_size_gb, "put_time": put_time, "get_time": get_time, "put_gbit_per_sec": put_gbit_per_sec, - "put_gbyte_per_sec": put_gbyte_per_sec, "get_gbit_per_sec": get_gbit_per_sec, - "get_gbyte_per_sec": get_gbyte_per_sec, "total_gbit_per_sec": total_gbit_per_sec, - "total_gbyte_per_sec": total_gbyte_per_sec, } - -def load_backend_config(config_path: str | None, backend: str) -> dict[str, Any]: - """Load backend config from YAML file or use defaults. - - Args: - config_path: Path to YAML config file (optional) - backend: Backend type for default config - - Returns: - Backend configuration dictionary - """ - if config_path is not None: - config = OmegaConf.load(config_path) - return OmegaConf.to_container(config, resolve=True) - - # Default configs - if backend == "SimpleStorage": - return {"num_data_storage_units": 1} - elif backend == "Yuanrong": - return { - "host": "127.0.0.1", - "port": 31501, - "enable_yr_npu_transport": False, - "client_placement": "inter_node", - } - elif backend == "MooncakeStore": - return { - "local_hostname": "127.0.0.1", - "metadata_server": "127.0.0.1:8080", - "master_server_address": "127.0.0.1:8081", - } - else: - return {} + def close(self) -> None: + """Close the transfer_queue clients.""" + ray.get([self.writer.close.remote(), self.reader.close.remote()]) def write_results_to_csv(results: list[dict[str, Any]], output_path: str) -> None: @@ -453,18 +351,11 @@ def write_results_to_csv(results: list[dict[str, Any]], 
output_path: str) -> Non def main() -> None: """Main entry point for the perftest script.""" parser = argparse.ArgumentParser(description="TransferQueue Throughput Test") - parser.add_argument( - "--backend", - type=str, - default="SimpleStorage", - choices=["SimpleStorage", "Yuanrong", "MooncakeStore"], - help="Backend type to test (default: SimpleStorage)", - ) parser.add_argument( "--backend_config", type=str, - default=None, - help="Path to backend config YAML file (optional)", + required=True, + help="Path to backend config YAML file", ) parser.add_argument( "--device", @@ -492,10 +383,10 @@ def main() -> None: help="Sequence length (default: 8192)", ) parser.add_argument( - "--num_global_batch", + "--num_test_iterations", type=int, - default=1, - help="Number of global batches (default: 1)", + default=3, + help="Number of test iterations (default: 3)", ) parser.add_argument( "--head_node_ip", @@ -507,7 +398,7 @@ def main() -> None: "--worker_node_ip", type=str, default=None, - help="Worker node IP address (required for Yuanrong inter_node)", + help="Worker node IP address (required for Yuanrong)", ) parser.add_argument( "--ray_address", @@ -524,22 +415,18 @@ def main() -> None: args = parser.parse_args() - # Load backend config - backend_config = load_backend_config(args.backend_config, args.backend) - # Initialize Ray logger.info(f"Connecting to Ray cluster at {args.ray_address}") ray.init(address=args.ray_address) # Create and run tester tester = TQThroughputTester( - backend=args.backend, - backend_config=backend_config, + backend_config_path=args.backend_config, device=args.device, global_batch_size=args.global_batch_size, field_num=args.field_num, seq_len=args.seq_len, - num_global_batch=args.num_global_batch, + num_test_iterations=args.num_test_iterations, head_node_ip=args.head_node_ip, worker_node_ip=args.worker_node_ip, output_csv=args.output_csv, @@ -547,9 +434,9 @@ def main() -> None: # Run test multiple times for consistent results using a for 
loop all_results = [] - for i in range(NUM_TEST_ITERATIONS): + for i in range(args.num_test_iterations): logger.info("-" * 60) - logger.info(f"Iteration {i + 1}/{NUM_TEST_ITERATIONS}") + logger.info(f"Iteration {i + 1}/{args.num_test_iterations}") logger.info("-" * 60) result = tester.run_throughput_test() all_results.append(result) @@ -558,6 +445,9 @@ def main() -> None: if args.output_csv: write_results_to_csv(all_results, args.output_csv) + # Close transfer_queue + tester.close() + logger.info("Throughput test completed successfully!") diff --git a/transfer_queue/config.yaml b/transfer_queue/config.yaml index 98819ed..0a8ccef 100644 --- a/transfer_queue/config.yaml +++ b/transfer_queue/config.yaml @@ -47,4 +47,10 @@ backend: RayStore: # For Yuanrong: - # TODO \ No newline at end of file + Yuanrong: + # IP of local yuanrong datasystem worker + host: 127.0.0.1 + # Port of local yuanrong datasystem worker + port: 31501 + # If enable npu transport + enable_yr_npu_transport: false From eb9112b95a0f15e278e1c1f1cd012cb533256c01 Mon Sep 17 00:00:00 2001 From: 0oshowero0 Date: Tue, 24 Mar 2026 20:36:13 +0800 Subject: [PATCH 09/29] squash all commits Signed-off-by: 0oshowero0 # Conflicts: # scripts/performance_test/README_PERFTEST.md # scripts/performance_test/perftest.py --- scripts/performance_test/README_PERFTEST.md | 29 +- scripts/performance_test/draw_figure.py | 140 +++++++ scripts/performance_test/perftest.py | 127 ++++-- scripts/performance_test/perftest_config.yaml | 56 +++ .../performance_test/ray_perftest_baseline.py | 375 ++++++++++++++++++ scripts/performance_test/run_perf_test.sh | 81 ++++ 6 files changed, 764 insertions(+), 44 deletions(-) create mode 100644 scripts/performance_test/draw_figure.py create mode 100644 scripts/performance_test/perftest_config.yaml create mode 100644 scripts/performance_test/ray_perftest_baseline.py create mode 100755 scripts/performance_test/run_perf_test.sh diff --git a/scripts/performance_test/README_PERFTEST.md 
b/scripts/performance_test/README_PERFTEST.md index 9b979ae..cf62efa 100644 --- a/scripts/performance_test/README_PERFTEST.md +++ b/scripts/performance_test/README_PERFTEST.md @@ -32,16 +32,15 @@ python perftest.py \ | Argument | Description | Default | |----------|-------------|---------| -| `--backend_config` | Path to backend config YAML file (required) | - | -| `--device` | Device: cpu, npu, gpu | cpu | -| `--global_batch_size` | Global batch size | 1024 | -| `--field_num` | Number of fields | 10 | -| `--seq_len` | Sequence length | 8192 | -| `--num_test_iterations` | Number of test iterations | 3 | -| `--head_node_ip` | Head node IP (required) | - | -| `--worker_node_ip` | Worker node IP (required for Yuanrong) | None | -| `--ray_address` | Ray cluster address | auto | -| `--output_csv` | Path to output CSV file (optional) | None | +| `--backend_config` | Path to backend config YAML file (required) | - | +| `--device` | Device: cpu, npu, gpu | cpu | +| `--global_batch_size` | Global batch size | 1024 | +| `--field_num` | Number of fields | 10 | +| `--seq_len` | Sequence length | 8192 | +| `--num_test_iterations` | Number of test iterations | 4 | +| `--head_node_ip` | Head node IP (required) | - | +| `--worker_node_ip` | Worker node IP (required for Yuanrong) | None | +| `--output_csv` | Path to output CSV file (optional) | None | ## Backend Configuration @@ -60,25 +59,25 @@ For Yuanrong backend, writer runs on head node and reader runs on worker node. 
### SimpleStorage/Mooncake backend ```bash -python perftest.py --backend_config=../../transfer_queue/config.yaml \ +python perftest.py --backend_config=perftest_config.yaml \ --head_node_ip=192.168.0.1 ``` ### Yuanrong backend ```bash -python perftest.py --backend_config=../../transfer_queue/config.yaml \ +python perftest.py --backend_config=perftest_config.yaml \ --head_node_ip=192.168.0.1 --worker_node_ip=192.168.0.2 ``` ### NPU device test ```bash -python perftest.py --backend_config=../../transfer_queue/config.yaml --device=npu \ +python perftest.py --backend_config=perftest_config.yaml --device=npu \ --head_node_ip=192.168.0.1 --worker_node_ip=192.168.0.2 ``` ### Output to CSV ```bash -python perftest.py --backend_config=../../transfer_queue/config.yaml \ +python perftest.py --backend_config=perftest_config.yaml \ --head_node_ip=192.168.0.1 --output_csv=results.csv ``` @@ -104,4 +103,4 @@ When using `--output_csv`, the test writes results to a CSV file with the follow - get_gbit_per_sec - total_gbit_per_sec -The test runs `--num_test_iterations` iterations (default: 3) and saves all results to the CSV. +The test runs `--num_test_iterations` iterations (default: 4) and saves all results to the CSV. diff --git a/scripts/performance_test/draw_figure.py b/scripts/performance_test/draw_figure.py new file mode 100644 index 0000000..1d96a65 --- /dev/null +++ b/scripts/performance_test/draw_figure.py @@ -0,0 +1,140 @@ +from pathlib import Path + +import matplotlib.pyplot as plt +import pandas as pd +import seaborn as sns + +results_dir = Path(__file__).resolve().parent / "results" +csv_files = list(results_dir.glob("*.csv")) + +if not csv_files: + raise FileNotFoundError(f"No CSV files found in {results_dir}") + +size_order = ["Small", "Medium", "Large"] + +# Filename -> display name mapping for backends. +# All normalization lives here so the shell script keeps simple lowercase names. 
+BACKEND_DISPLAY_NAMES = { + "simplestorage": "SimpleStorage", + "yuanrong": "Yuanrong", + "mooncakestore": "MooncakeStore", + "ray_baseline": "Ray", +} + + +def format_size(size_gb: float) -> str: + """Format a data size in GB to a human-readable string with appropriate unit.""" + if size_gb >= 1.0: + return f"{size_gb:.2f} GB" + size_mb = size_gb * 1024 + if size_mb >= 1.0: + return f"{size_mb:.2f} MB" + size_kb = size_mb * 1024 + return f"{size_kb:.2f} KB" + + +dfs = [] +for csv_file in csv_files: + df = pd.read_csv(csv_file) + # Parse size label and backend from filename: {backend}_{size_label}.csv + # Size label is always the last _-separated segment (lowercase). + # Backend is everything before the last underscore. + # e.g. "simplestorage_small.csv" -> backend_key="simplestorage", size_label="Small" + # e.g. "ray_baseline_small.csv" -> backend_key="ray_baseline", size_label="Small" + stem = csv_file.stem + parts = stem.rsplit("_", 1) + if len(parts) != 2: + print(f"Warning: skipping {csv_file.name}, unexpected filename format") + continue + raw_backend, raw_size = parts + size_label = raw_size.capitalize() + if size_label not in size_order: + print(f"Warning: skipping {csv_file.name}, unrecognized size label '{raw_size}'") + continue + df["backend_parsed"] = BACKEND_DISPLAY_NAMES.get(raw_backend, raw_backend) + df["size_label"] = size_label + dfs.append(df) + +df = pd.concat(dfs, ignore_index=True) + +existing_sizes = [s for s in size_order if s in df["size_label"].unique()] + +# Build composite X-axis label: "SizeLabel\n" +size_to_gb = df.groupby("size_label")["total_data_size_gb"].first().to_dict() + + +def make_xlabel(size_label: str) -> str: + return f"{size_label}\n{format_size(size_to_gb.get(size_label, 0))}" + + +df["X_label"] = df["size_label"].apply(make_xlabel) + +# Make X_label categorical with the correct ordering +df["X_label"] = pd.Categorical( + df["X_label"], + categories=[make_xlabel(s) for s in existing_sizes], + ordered=True, +) + 
+df["Bandwidth"] = df["total_gbit_per_sec"] +df["Scenario"] = df["backend_parsed"] + +# ========== Plotting ========== +sns.set_theme(style="white", palette="husl") + +fig, ax = plt.subplots(figsize=(12, 7)) + +palette = sns.color_palette("Set2", n_colors=df["Scenario"].nunique()) +barplot = sns.barplot(data=df, x="X_label", y="Bandwidth", hue="Scenario", ax=ax, alpha=0.8, palette=palette) + +# Legend: match old style — at the top center, horizontal, with frame +handles, labels = ax.get_legend_handles_labels() +# Move legend above the plot +ax.get_legend().remove() +fig.legend( + handles, + labels, + bbox_to_anchor=(0.5, 1.0), + loc="upper center", + ncol=len(handles), + title="", + frameon=True, + fancybox=True, + shadow=True, + fontsize=13, +) + +# Annotations on bars +for p in ax.patches: + height = p.get_height() + if height > 0: + ax.annotate( + f"{height:.3f}", + (p.get_x() + p.get_width() / 2.0, height), + ha="center", + va="bottom", + fontsize=11, + rotation=0, + ) + +# Axis formatting +ax.set_title("Performance Comparison (Total Throughput)", fontsize=16, fontweight="bold") +ax.set_xlabel("") +ax.set_ylabel("Bandwidth (Gbps)", fontsize=16) + +# Adjust y range to leave room for annotations +y_max = df["Bandwidth"].max() * 1.15 +ax.set_ylim(0, y_max) + +ax.grid(True, alpha=0.3) +ax.tick_params(axis="x", labelsize=14) +ax.tick_params(axis="y", labelsize=13) + +# Unified x-label at the bottom +fig.text(0.5, 0.02, "Data Volume", ha="center", fontsize=20) + +plt.tight_layout(rect=[0, 0.04, 1, 0.95]) # room for legend + x-label +plt.savefig(results_dir / "performance_comparison.pdf", dpi=300, bbox_inches="tight") +plt.show() + +print("Performance comparison plot generated and saved as 'performance_comparison.pdf'") diff --git a/scripts/performance_test/perftest.py b/scripts/performance_test/perftest.py index 249af0b..0541296 100644 --- a/scripts/performance_test/perftest.py +++ b/scripts/performance_test/perftest.py @@ -25,7 +25,7 @@ import ray import torch from 
omegaconf import OmegaConf -from tensordict import TensorDict +from tensordict import NonTensorStack, TensorDict parent_dir = Path(__file__).resolve().parent.parent.parent sys.path.append(str(parent_dir)) @@ -42,21 +42,54 @@ def create_test_case( field_num: int | None = None, device: str = "cpu", ) -> tuple[TensorDict, float]: - """Create a test case with tensor fields only. + """Create a test case with complex data formats. + + Creates TensorDict with: + - Regular tensors: (batch_size, seq_length) shape, each element is float32 + - Nested Tensors: variable-length sequences, each batch element has length + uniformly sampled from [1, seq_length] + - NonTensorStack wrapped strings: each string size ~= seq_length * 4 bytes + (to match memory footprint of one tensor element) Args: batch_size: Batch size for the test case - seq_length: Sequence length for tensor fields - field_num: Number of fields to create + seq_length: Maximum sequence length (used for regular tensors and + as upper bound for nested tensor sampling) + field_num: Total number of fields to create (distributed across types) device: Device to create tensors on ("cpu", "npu", or "gpu") Returns: Tuple of (TensorDict, total_size_gb) """ - tensor_field_size_bytes = batch_size * seq_length * 4 - tensor_field_size_gb = tensor_field_size_bytes / (1024**3) - - total_size_gb = tensor_field_size_gb * field_num + bytes_per_element = 4 # float32 + + # Calculate field distribution (1/3 each type, last fields may be regular) + num_regular_fields = (field_num + 2) // 3 + num_nested_fields = (field_num + 2) // 3 + num_nontensor_fields = field_num - num_regular_fields - num_nested_fields + + # Each regular tensor field: batch_size * seq_length * 4 bytes + regular_field_size_bytes = batch_size * seq_length * bytes_per_element + regular_field_size_gb = regular_field_size_bytes / (1024**3) + + # Nested tensor field: average length = (1 + seq_length) / 2, + # so avg size = batch_size * (1 + seq_length) / 2 * 4 bytes + 
avg_nested_length = (1 + seq_length) / 2 + nested_field_size_bytes = int(batch_size * avg_nested_length * bytes_per_element) + nested_field_size_gb = nested_field_size_bytes / (1024**3) + + # NonTensorStack string field: each string ~= seq_length * 4 bytes to match one tensor element + # Total for field: batch_size strings * seq_length * 4 bytes each + string_size_per_elem = seq_length * bytes_per_element + nontensor_field_size_bytes = batch_size * string_size_per_elem + nontensor_field_size_gb = nontensor_field_size_bytes / (1024**3) + + # Total size = sum of all field types + total_size_gb = ( + regular_field_size_gb * num_regular_fields + + nested_field_size_gb * num_nested_fields + + nontensor_field_size_gb * num_nontensor_fields + ) logger.info(f"Total data size: {total_size_gb:.6f} GB") @@ -67,17 +100,48 @@ def create_test_case( elif device == "gpu": torch_device = "cuda:0" - fields = {} - for i in range(field_num): - field_name = f"field_{i}" - tensor_data = torch.randn(batch_size, seq_length, dtype=torch.float32, device=torch_device) - fields[field_name] = tensor_data + # Set seeds for reproducibility (within this process) + # Sample lengths for all nested fields at once + nested_lengths = [ + torch.randint(1, seq_length + 1, (batch_size,), generator=torch.Generator().manual_seed(42 + i)) + for i in range(num_nested_fields) + ] batch_size_tuple = (batch_size,) - prompt_batch = TensorDict( - fields, - batch_size=batch_size_tuple, - ) + + prompt_batch = TensorDict(batch_size=batch_size_tuple) + + # 1. Regular tensor fields + for i in range(num_regular_fields): + field_name = f"field_{i}" + tensor_data = torch.randn(batch_size, seq_length, dtype=torch.float32, device=torch_device) + prompt_batch.set(field_name, tensor_data) + + # 2. 
Nested Tensor fields (variable-length sequences) + for i in range(num_nested_fields): + field_name = f"nested_field_{i}" + actual_lengths = nested_lengths[i] + + # Create nested tensor from variable-length sequences + nested_list = [] + for j in range(batch_size): + length = actual_lengths[j].item() + # Create sequence data: arange for each element (representing sequence indices) + seq_data = torch.arange(length, dtype=torch.float32, device=torch_device) + nested_list.append(seq_data) + + nested_tensor = torch.nested.as_nested_tensor(nested_list, layout=torch.jagged) + prompt_batch.set(field_name, nested_tensor) + + # 3. NonTensorStack wrapped strings + # Each string ~= seq_length * 4 bytes to match one tensor element's memory footprint + string_char_count = seq_length * bytes_per_element # 4 bytes per char (unicode) + string_template = "x" * string_char_count + + for i in range(num_nontensor_fields): + field_name = f"nontensor_field_{i}" + string_data = [string_template for _ in range(batch_size)] + prompt_batch.set(field_name, NonTensorStack.from_list(string_data)) return prompt_batch, total_size_gb @@ -143,6 +207,7 @@ def __init__( seq_len: int, num_test_iterations: int, head_node_ip: str, + backend: str | None = None, worker_node_ip: str | None = None, output_csv: str | None = None, ): @@ -150,6 +215,7 @@ def __init__( Args: backend_config_path: Path to backend config YAML file + backend: Override storage_backend in config (e.g. 
"SimpleStorage") device: Device type ("cpu", "npu", "gpu") global_batch_size: Global batch size field_num: Number of fields @@ -160,6 +226,7 @@ def __init__( output_csv: Path to output CSV file (optional) """ self.backend_config_path = backend_config_path + self.backend_override = backend self.device = device self.global_batch_size = global_batch_size self.field_num = field_num @@ -199,6 +266,11 @@ def _prepare_config(self) -> dict[str, Any]: # Directly read the backend_config file, no merging with default config = OmegaConf.load(self.backend_config_path) + # Override storage_backend if specified via CLI + if self.backend_override is not None: + config.backend.storage_backend = self.backend_override + logger.info(f"Overriding storage_backend to: {self.backend_override}") + # If backend.storage_backend is SimpleStorage, override total_storage_size total_storage_size = self.global_batch_size * self.num_test_iterations if config.backend.storage_backend == "SimpleStorage": @@ -357,6 +429,12 @@ def main() -> None: required=True, help="Path to backend config YAML file", ) + parser.add_argument( + "--backend", + type=str, + default=None, + help="Override storage_backend in config (e.g. 
SimpleStorage, Yuanrong, MooncakeStore)", + ) parser.add_argument( "--device", type=str, @@ -385,8 +463,8 @@ def main() -> None: parser.add_argument( "--num_test_iterations", type=int, - default=3, - help="Number of test iterations (default: 3)", + default=4, + help="Number of test iterations (default: 4)", ) parser.add_argument( "--head_node_ip", @@ -400,12 +478,6 @@ def main() -> None: default=None, help="Worker node IP address (required for Yuanrong)", ) - parser.add_argument( - "--ray_address", - type=str, - default="auto", - help="Ray cluster address (default: auto)", - ) parser.add_argument( "--output_csv", type=str, @@ -415,10 +487,6 @@ def main() -> None: args = parser.parse_args() - # Initialize Ray - logger.info(f"Connecting to Ray cluster at {args.ray_address}") - ray.init(address=args.ray_address) - # Create and run tester tester = TQThroughputTester( backend_config_path=args.backend_config, @@ -428,6 +496,7 @@ def main() -> None: seq_len=args.seq_len, num_test_iterations=args.num_test_iterations, head_node_ip=args.head_node_ip, + backend=args.backend, worker_node_ip=args.worker_node_ip, output_csv=args.output_csv, ) diff --git a/scripts/performance_test/perftest_config.yaml b/scripts/performance_test/perftest_config.yaml new file mode 100644 index 0000000..88dca45 --- /dev/null +++ b/scripts/performance_test/perftest_config.yaml @@ -0,0 +1,56 @@ +# This is the default configuration of TransferQueue. Users may modify the default value +# and use transfer_queue.init(conf) to overwrite the config entries. + +controller: + # User-defined sampler. User can pass sampler instance to overwrite this string config. + sampler: SequentialSampler + # Whether return an empty BatchMeta to prevent request blocking when no enough data is available + polling_mode: False + # ZMQ Server IP & Ports (automatically generated during init) + zmq_info: null + + +backend: + # Pluggable storage/transport backend of TransferQueue. 
Choose from: + # SimpleStorage, Yuanrong, MooncakeStore, ... + storage_backend: SimpleStorage + + # For SimpleStorage: + SimpleStorage: + # Total number of samples + total_storage_size: 100000 + # Number of distributed storage units for SimpleStorage backend + num_data_storage_units: 16 + # ZMQ Server IP & Ports (automatically generated during init) + zmq_info: null + + # For MooncakeStore: + MooncakeStore: + # Whether to let TQ automatically init metadata_server. + auto_init: true + # Address of the HTTP metadata server + metadata_server: localhost:50050 + # Address of master server + master_server_address: localhost:50051 + # Address of local host. Set to "" to use Ray IP as local host address + local_hostname: "" + # Protocol for transmission. Choose from: tcp, rdma. (default: tcp) + protocol: tcp + # Memory segment size in bytes for mounting (default: 4GB) + global_segment_size: 4294967296 + # Local buffer size in bytes (default: 1GB) + local_buffer_size: 1073741824 + # Network device name. Set to "" to let Mooncake auto-pick devices + device_name: "" + + # For RayStore: + RayStore: + + # For Yuanrong: + Yuanrong: + # IP of local yuanrong datasystem worker + host: 127.0.0.1 + # Port of local yuanrong datasystem worker + port: 31501 + # Whether to enable npu transport + enable_yr_npu_transport: false diff --git a/scripts/performance_test/ray_perftest_baseline.py b/scripts/performance_test/ray_perftest_baseline.py new file mode 100644 index 0000000..6951713 --- /dev/null +++ b/scripts/performance_test/ray_perftest_baseline.py @@ -0,0 +1,375 @@ +#!/usr/bin/env python3 +# Copyright 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2025 The TransferQueue Team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import csv +import logging +import sys +import time +from pathlib import Path +from typing import Any + +import ray +import torch +from tensordict import NonTensorStack, TensorDict + +parent_dir = Path(__file__).resolve().parent.parent.parent +sys.path.append(str(parent_dir)) + +logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") +logger = logging.getLogger(__name__) + + +def create_test_case( + batch_size: int | None = None, + seq_length: int | None = None, + field_num: int | None = None, + device: str = "cpu", +) -> tuple[TensorDict, float]: + """Create a test case with complex data formats. 
+ + Creates TensorDict with: + - Regular tensors: (batch_size, seq_length) shape, each element is float32 + - Nested Tensors: variable-length sequences, each batch element has length + uniformly sampled from [1, seq_length] + - NonTensorStack wrapped strings: each string size ~= seq_length * 4 bytes + (to match memory footprint of one tensor element) + + Args: + batch_size: Batch size for the test case + seq_length: Maximum sequence length (used for regular tensors and + as upper bound for nested tensor sampling) + field_num: Total number of fields to create (distributed across types) + device: Device to create tensors on ("cpu", "npu", or "gpu") + + Returns: + Tuple of (TensorDict, total_size_gb) + """ + bytes_per_element = 4 # float32 + + # Calculate field distribution (1/3 each type, last fields may be regular) + num_regular_fields = (field_num + 2) // 3 + num_nested_fields = (field_num + 2) // 3 + num_nontensor_fields = field_num - num_regular_fields - num_nested_fields + + # Each regular tensor field: batch_size * seq_length * 4 bytes + regular_field_size_bytes = batch_size * seq_length * bytes_per_element + regular_field_size_gb = regular_field_size_bytes / (1024**3) + + # Nested tensor field: average length = (1 + seq_length) / 2, + # so avg size = batch_size * (1 + seq_length) / 2 * 4 bytes + avg_nested_length = (1 + seq_length) / 2 + nested_field_size_bytes = int(batch_size * avg_nested_length * bytes_per_element) + nested_field_size_gb = nested_field_size_bytes / (1024**3) + + # NonTensorStack string field: each string ~= seq_length * 4 bytes to match one tensor element + # Total for field: batch_size strings * seq_length * 4 bytes each + string_size_per_elem = seq_length * bytes_per_element + nontensor_field_size_bytes = batch_size * string_size_per_elem + nontensor_field_size_gb = nontensor_field_size_bytes / (1024**3) + + # Total size = sum of all field types + total_size_gb = ( + regular_field_size_gb * num_regular_fields + + nested_field_size_gb * 
num_nested_fields + + nontensor_field_size_gb * num_nontensor_fields + ) + + logger.info(f"Total data size: {total_size_gb:.6f} GB") + + # Determine torch device + torch_device = None + if device == "npu": + torch_device = "npu:0" + elif device == "gpu": + torch_device = "cuda:0" + + # Set seeds for reproducibility (within this process) + # Sample lengths for all nested fields at once + nested_lengths = [ + torch.randint(1, seq_length + 1, (batch_size,), generator=torch.Generator().manual_seed(42 + i)) + for i in range(num_nested_fields) + ] + + batch_size_tuple = (batch_size,) + + prompt_batch = TensorDict(batch_size=batch_size_tuple) + + # 1. Regular tensor fields + for i in range(num_regular_fields): + field_name = f"field_{i}" + tensor_data = torch.randn(batch_size, seq_length, dtype=torch.float32, device=torch_device) + prompt_batch.set(field_name, tensor_data) + + # 2. Nested Tensor fields (variable-length sequences) + for i in range(num_nested_fields): + field_name = f"nested_field_{i}" + actual_lengths = nested_lengths[i] + + # Create nested tensor from variable-length sequences + nested_list = [] + for j in range(batch_size): + length = actual_lengths[j].item() + # Create sequence data: arange for each element (representing sequence indices) + seq_data = torch.arange(length, dtype=torch.float32, device=torch_device) + nested_list.append(seq_data) + + nested_tensor = torch.nested.as_nested_tensor(nested_list, layout=torch.jagged) + prompt_batch.set(field_name, nested_tensor) + + # 3. 
NonTensorStack wrapped strings + # Each string ~= seq_length * 4 bytes to match one tensor element's memory footprint + string_char_count = seq_length * bytes_per_element # 4 bytes per char (unicode) + string_template = "x" * string_char_count + + for i in range(num_nontensor_fields): + field_name = f"nontensor_field_{i}" + string_data = [string_template for _ in range(batch_size)] + prompt_batch.set(field_name, NonTensorStack.from_list(string_data)) + + return prompt_batch, total_size_gb + + +@ray.remote +class RemoteDataStore: + """Ray remote actor that stores and retrieves data directly (without ray.put).""" + + def __init__(self): + self.stored_data = None + + def put_data(self, data: TensorDict) -> None: + self.stored_data = data + + def get_data(self) -> TensorDict: + return self.stored_data + + def clear_data(self) -> None: + self.stored_data = None + + +class RayBaselineTester: + """Ray baseline throughput tester - measures raw Ray data transfer performance.""" + + def __init__( + self, + global_batch_size: int, + field_num: int, + seq_len: int, + num_test_iterations: int, + head_node_ip: str, + worker_node_ip: str | None = None, + output_csv: str | None = None, + ): + """Initialize the Ray baseline tester. 
+ + Args: + global_batch_size: Global batch size + field_num: Number of fields + seq_len: Sequence length + num_test_iterations: Number of test iterations + head_node_ip: Head node IP address + worker_node_ip: Worker node IP address + output_csv: Path to output CSV file (optional) + """ + self.global_batch_size = global_batch_size + self.field_num = field_num + self.seq_len = seq_len + self.num_test_iterations = num_test_iterations + self.head_node_ip = head_node_ip + self.worker_node_ip = worker_node_ip + self.output_csv = output_csv + + # Initialize remote store on worker node + self._initialize_remote_store() + + def _initialize_remote_store(self) -> None: + """Initialize the RemoteDataStore actor on worker node.""" + writer_node = self.head_node_ip + reader_node = self.worker_node_ip if self.worker_node_ip else self.head_node_ip + + logger.info(f"Writer is on {writer_node}, Reader is on {reader_node}") + + self.remote_store = RemoteDataStore.options( + num_cpus=0.001, + resources={f"node:{reader_node}": 0.001}, + ).remote() + + logger.info(f"RemoteDataStore created on {reader_node}") + + def run_throughput_test(self) -> dict[str, Any]: + """Run the throughput test and print results. 
+ + Returns: + Dictionary with test results + """ + # Create test data + logger.info("Creating large batch for throughput test...") + start_create_data = time.perf_counter() + test_data, total_data_size_gb = create_test_case( + batch_size=self.global_batch_size, + seq_length=self.seq_len, + field_num=self.field_num, + device="cpu", + ) + end_create_data = time.perf_counter() + logger.info(f"Data creation time: {end_create_data - start_create_data:.8f}s") + + # PUT operation - pass data directly to remote actor + logger.info("Starting PUT operation...") + start_put = time.perf_counter() + ray.get(self.remote_store.put_data.remote(test_data)) + end_put = time.perf_counter() + put_time = end_put - start_put + put_gbit_per_sec = (total_data_size_gb * 8) / put_time + + time.sleep(2) + + # GET operation - retrieve data from remote actor + logger.info("Starting GET operation...") + start_get = time.perf_counter() + _ = ray.get(self.remote_store.get_data.remote()) + end_get = time.perf_counter() + get_time = end_get - start_get + get_gbit_per_sec = (total_data_size_gb * 8) / get_time + + # Clear data + ray.get(self.remote_store.clear_data.remote()) + + # Calculate total throughput + total_gbit_per_sec = (total_data_size_gb * 16) / (put_time + get_time) + + # Print summary + logger.info("=" * 60) + logger.info("RAY BASELINE THROUGHPUT TEST SUMMARY") + logger.info("=" * 60) + logger.info(f"Total Data Size: {total_data_size_gb:.6f} GB") + logger.info(f"PUT Time: {put_time:.8f}s") + logger.info(f"GET Time: {get_time:.8f}s") + logger.info(f"PUT Throughput: {put_gbit_per_sec:.8f} Gb/s") + logger.info(f"GET Throughput: {get_gbit_per_sec:.8f} Gb/s") + logger.info(f"Total Throughput (round-trip): {total_gbit_per_sec:.8f} Gb/s") + logger.info("=" * 60) + + return { + "backend": "RayBaseline", + "device": "cpu", + "total_data_size_gb": total_data_size_gb, + "put_time": put_time, + "get_time": get_time, + "put_gbit_per_sec": put_gbit_per_sec, + "get_gbit_per_sec": get_gbit_per_sec, + 
"total_gbit_per_sec": total_gbit_per_sec, + } + + +def write_results_to_csv(results: list[dict[str, Any]], output_path: str) -> None: + """Write test results to CSV file. + + Args: + results: List of result dictionaries + output_path: Path to output CSV file + """ + if not results: + return + + fieldnames = list(results[0].keys()) + + with open(output_path, "w", newline="") as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + writer.writeheader() + for result in results: + writer.writerow(result) + + logger.info(f"Results written to {output_path}") + + +def main() -> None: + """Main entry point for the Ray baseline perftest script.""" + parser = argparse.ArgumentParser(description="Ray Baseline Throughput Test") + parser.add_argument( + "--global_batch_size", + type=int, + default=1024, + help="Global batch size (default: 1024)", + ) + parser.add_argument( + "--field_num", + type=int, + default=10, + help="Number of fields (default: 10)", + ) + parser.add_argument( + "--seq_len", + type=int, + default=8192, + help="Sequence length (default: 8192)", + ) + parser.add_argument( + "--num_test_iterations", + type=int, + default=3, + help="Number of test iterations (default: 3)", + ) + parser.add_argument( + "--head_node_ip", + type=str, + required=True, + help="Head node IP address", + ) + parser.add_argument( + "--worker_node_ip", + type=str, + default=None, + help="Worker node IP address (optional)", + ) + parser.add_argument( + "--output_csv", + type=str, + default=None, + help="Path to output CSV file (optional)", + ) + + args = parser.parse_args() + + # Create and run tester + tester = RayBaselineTester( + global_batch_size=args.global_batch_size, + field_num=args.field_num, + seq_len=args.seq_len, + num_test_iterations=args.num_test_iterations, + head_node_ip=args.head_node_ip, + worker_node_ip=args.worker_node_ip, + output_csv=args.output_csv, + ) + + # Run test multiple times + all_results = [] + for i in range(args.num_test_iterations): + 
logger.info("-" * 60) + logger.info(f"Iteration {i + 1}/{args.num_test_iterations}") + logger.info("-" * 60) + result = tester.run_throughput_test() + all_results.append(result) + + # Write to CSV if output path is specified + if args.output_csv: + write_results_to_csv(all_results, args.output_csv) + + logger.info("Ray baseline throughput test completed successfully!") + + +if __name__ == "__main__": + main() diff --git a/scripts/performance_test/run_perf_test.sh b/scripts/performance_test/run_perf_test.sh new file mode 100755 index 0000000..58a6930 --- /dev/null +++ b/scripts/performance_test/run_perf_test.sh @@ -0,0 +1,81 @@ +#!/bin/bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +RESULTS_DIR="${SCRIPT_DIR}/results" +PERFTEST_PY="${SCRIPT_DIR}/perftest.py" +RAY_PERFTEST_PY="${SCRIPT_DIR}/ray_perftest_baseline.py" +CONFIG_YAML="${SCRIPT_DIR}/perftest_config.yaml" + +mkdir -p "${RESULTS_DIR}" + +# ========== User Configuration ========== +# Modify these based on your environment +HEAD_NODE_IP="${HEAD_NODE_IP:-127.0.0.1}" +WORKER_NODE_IP="${WORKER_NODE_IP:-127.0.0.1}" +DEVICE="${DEVICE:-cpu}" +NUM_TEST_ITERATIONS="${NUM_TEST_ITERATIONS:-4}" +# ======================================== + +# Backends to test (passed via --backend to perftest.py) +BACKENDS=("SimpleStorage" "Yuanrong" "MooncakeStore") + +# Test settings: global_batch_size, field_num, seq_len, name +declare -a SETTINGS=( + "128,3,1024,Small" + "1024,9,8192,Medium" + "4096,21,128000,Large" +) + +# ---- TransferQueue perftest ---- +for backend in "${BACKENDS[@]}"; do + echo "==========================================" + echo "Testing backend: ${backend}" + echo "==========================================" + + for setting in "${SETTINGS[@]}"; do + IFS=',' read -r batch_size field_num seq_len name <<< "$setting" + output_csv="${RESULTS_DIR}/${backend,,}_${name,,}.csv" + + echo " Setting: ${name} (batch=${batch_size}, fields=${field_num}, seq=${seq_len})" + + if [[ 
"$backend" == "Yuanrong" ]]; then + python "${PERFTEST_PY}" --backend_config="${CONFIG_YAML}" --backend="${backend}" \ + --device="${DEVICE}" \ + --global_batch_size="${batch_size}" --field_num="${field_num}" --seq_len="${seq_len}" \ + --num_test_iterations="${NUM_TEST_ITERATIONS}" \ + --head_node_ip="${HEAD_NODE_IP}" --worker_node_ip="${WORKER_NODE_IP}" \ + --output_csv="${output_csv}" + else + python "${PERFTEST_PY}" --backend_config="${CONFIG_YAML}" --backend="${backend}" \ + --device="${DEVICE}" \ + --global_batch_size="${batch_size}" --field_num="${field_num}" --seq_len="${seq_len}" \ + --num_test_iterations="${NUM_TEST_ITERATIONS}" \ + --head_node_ip="${HEAD_NODE_IP}" \ + --output_csv="${output_csv}" + fi + done +done + +# ---- Ray baseline ---- +echo "==========================================" +echo "Testing backend: Ray (baseline)" +echo "==========================================" +for setting in "${SETTINGS[@]}"; do + IFS=',' read -r batch_size field_num seq_len name <<< "$setting" + output_csv="${RESULTS_DIR}/ray_baseline_${name,,}.csv" + + echo " Setting: ${name} (batch=${batch_size}, fields=${field_num}, seq=${seq_len})" + + python "${RAY_PERFTEST_PY}" \ + --global_batch_size="${batch_size}" --field_num="${field_num}" --seq_len="${seq_len}" \ + --num_test_iterations="${NUM_TEST_ITERATIONS}" \ + --head_node_ip="${HEAD_NODE_IP}" --worker_node_ip="${WORKER_NODE_IP}" \ + --output_csv="${output_csv}" +done + +# ---- Draw figures ---- +python "${SCRIPT_DIR}/draw_figure.py" + +echo "" +echo "All tests completed!" 
From fa5b131ca4798a91b1c19ed95dc495b65dc3deba Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Wed, 25 Mar 2026 10:09:20 +0800 Subject: [PATCH 10/29] add license to draw_figure.py Signed-off-by: tianyi-ge --- scripts/performance_test/draw_figure.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/scripts/performance_test/draw_figure.py b/scripts/performance_test/draw_figure.py index 1d96a65..bb7910e 100644 --- a/scripts/performance_test/draw_figure.py +++ b/scripts/performance_test/draw_figure.py @@ -1,3 +1,19 @@ +#!/usr/bin/env python3 +# Copyright 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2025 The TransferQueue Team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from pathlib import Path import matplotlib.pyplot as plt From 537b7c6ef1cece453a4c0557ab4150f721eb3f74 Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Wed, 25 Mar 2026 10:13:46 +0800 Subject: [PATCH 11/29] simplify run_perf_test.sh Signed-off-by: tianyi-ge --- scripts/performance_test/run_perf_test.sh | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/scripts/performance_test/run_perf_test.sh b/scripts/performance_test/run_perf_test.sh index 58a6930..50b2863 100755 --- a/scripts/performance_test/run_perf_test.sh +++ b/scripts/performance_test/run_perf_test.sh @@ -39,21 +39,12 @@ for backend in "${BACKENDS[@]}"; do echo " Setting: ${name} (batch=${batch_size}, fields=${field_num}, seq=${seq_len})" - if [[ "$backend" == "Yuanrong" ]]; then - python "${PERFTEST_PY}" --backend_config="${CONFIG_YAML}" --backend="${backend}" \ - --device="${DEVICE}" \ - --global_batch_size="${batch_size}" --field_num="${field_num}" --seq_len="${seq_len}" \ - --num_test_iterations="${NUM_TEST_ITERATIONS}" \ - --head_node_ip="${HEAD_NODE_IP}" --worker_node_ip="${WORKER_NODE_IP}" \ - --output_csv="${output_csv}" - else - python "${PERFTEST_PY}" --backend_config="${CONFIG_YAML}" --backend="${backend}" \ - --device="${DEVICE}" \ - --global_batch_size="${batch_size}" --field_num="${field_num}" --seq_len="${seq_len}" \ - --num_test_iterations="${NUM_TEST_ITERATIONS}" \ - --head_node_ip="${HEAD_NODE_IP}" \ - --output_csv="${output_csv}" - fi + python "${PERFTEST_PY}" --backend_config="${CONFIG_YAML}" --backend="${backend}" \ + --device="${DEVICE}" \ + --global_batch_size="${batch_size}" --field_num="${field_num}" --seq_len="${seq_len}" \ + --num_test_iterations="${NUM_TEST_ITERATIONS}" \ + --head_node_ip="${HEAD_NODE_IP}" --worker_node_ip="${WORKER_NODE_IP}" \ + --output_csv="${output_csv}" done done From 8a17c1951317f886995a9d58e26eed007fb17aa9 Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Wed, 25 Mar 2026 14:39:57 +0800 Subject: [PATCH 12/29] change 
client host for yuanrong Signed-off-by: tianyi-ge --- scripts/performance_test/perftest.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/scripts/performance_test/perftest.py b/scripts/performance_test/perftest.py index 0541296..53614bf 100644 --- a/scripts/performance_test/perftest.py +++ b/scripts/performance_test/perftest.py @@ -307,9 +307,24 @@ def _initialize_clients(self) -> None: writer_options["resources"]["NPU"] = 1 reader_options["resources"]["NPU"] = 1 + # Prepare configs for writer and reader + # For Yuanrong backend, set different hosts for writer and reader + if self.backend == "Yuanrong": + import copy + + writer_config = copy.deepcopy(self.full_config) + reader_config = copy.deepcopy(self.full_config) + writer_config["backend"]["Yuanrong"]["host"] = self.head_node_ip + reader_config["backend"]["Yuanrong"]["host"] = self.worker_node_ip + logger.info(f"Writer Yuanrong host: {self.head_node_ip}") + logger.info(f"Reader Yuanrong host: {self.worker_node_ip}") + else: + writer_config = self.full_config + reader_config = self.full_config + # Create writer and reader actors - self.writer = TQClientActor.options(**writer_options).remote(self.full_config) - self.reader = TQClientActor.options(**reader_options).remote(self.full_config) + self.writer = TQClientActor.options(**writer_options).remote(writer_config) + self.reader = TQClientActor.options(**reader_options).remote(reader_config) # Initialize transfer_queue logger.info(f"Using {self.backend} as storage backend.") From 60cdcaa24e639783441314e8c2ebdbab09f9b9a2 Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Wed, 25 Mar 2026 15:09:57 +0800 Subject: [PATCH 13/29] use d2h and h2d instead of d2d Signed-off-by: tianyi-ge --- tests/test_yuanrong_storage_client_e2e.py | 6 +++--- transfer_queue/storage/clients/yuanrong_client.py | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/test_yuanrong_storage_client_e2e.py 
b/tests/test_yuanrong_storage_client_e2e.py index 519335f..2a79ec9 100644 --- a/tests/test_yuanrong_storage_client_e2e.py +++ b/tests/test_yuanrong_storage_client_e2e.py @@ -38,18 +38,18 @@ def __init__(self, host, port, device_id): def init(self): pass - def dev_mset(self, keys, values): + def mset_d2h(self, keys, values): for k, v in zip(keys, values, strict=True): assert v.device.type == "npu" self.storage[k] = v - def dev_mget(self, keys, out_tensors): + def mget_h2d(self, keys, out_tensors): for i, k in enumerate(keys): # Note: If key is missing, tensor remains unchanged (mock limitation) if k in self.storage: out_tensors[i].copy_(self.storage[k]) - def dev_delete(self, keys): + def delete(self, keys): for k in keys: self.storage.pop(k, None) diff --git a/transfer_queue/storage/clients/yuanrong_client.py b/transfer_queue/storage/clients/yuanrong_client.py index 41219c2..80f5bb7 100644 --- a/transfer_queue/storage/clients/yuanrong_client.py +++ b/transfer_queue/storage/clients/yuanrong_client.py @@ -123,12 +123,12 @@ def put(self, keys: list[str], values: list[Any]): for i in range(0, len(keys), self.KEYS_LIMIT): batch_keys = keys[i : i + self.KEYS_LIMIT] batch_values = values[i : i + self.KEYS_LIMIT] - # _npu_ds_client.dev_mset doesn't support to overwrite + # mset_d2h cannot overwrite existing keys try: - self._ds_client.dev_delete(batch_keys) + self._ds_client.delete(batch_keys) except Exception: pass - self._ds_client.dev_mset(batch_keys, batch_values) + self._ds_client.mset_d2h(batch_keys, batch_values) def supports_get(self, strategy_tag: str) -> bool: """Matches 'DsTensorClient' Strategy tag.""" @@ -147,8 +147,8 @@ def get(self, keys: list[str], **kwargs) -> list[Optional[Any]]: batch_dtypes = dtypes[i : i + self.KEYS_LIMIT] batch_values = self._create_empty_npu_tensorlist(batch_shapes, batch_dtypes) - self._ds_client.dev_mget(batch_keys, batch_values) - # Todo(dpj): consider checking and logging keys that fail during dev_mget + 
self._ds_client.mget_h2d(batch_keys, batch_values) + # Todo(dpj): consider checking and logging keys that fail during mget_h2d results.extend(batch_values) return results @@ -161,7 +161,7 @@ def clear(self, keys: list[str]): for i in range(0, len(keys), self.KEYS_LIMIT): batch = keys[i : i + self.KEYS_LIMIT] # Todo(dpj): Test call clear when no (key,value) put in ds - self._ds_client.dev_delete(batch) + self._ds_client.delete(batch) def _create_empty_npu_tensorlist(self, shapes, dtypes): """ From cdccf6df0345eb3971d8e3d4bf5b67e1fdc4759f Mon Sep 17 00:00:00 2001 From: 0oshowero0 Date: Wed, 25 Mar 2026 15:48:42 +0800 Subject: [PATCH 14/29] fix nested tensor for NPU Signed-off-by: 0oshowero0 --- scripts/performance_test/README_PERFTEST.md | 72 +++++++++++++++++-- scripts/performance_test/perftest.py | 57 ++++++++------- .../performance_test/ray_perftest_baseline.py | 57 ++++++++------- 3 files changed, 132 insertions(+), 54 deletions(-) diff --git a/scripts/performance_test/README_PERFTEST.md b/scripts/performance_test/README_PERFTEST.md index cf62efa..e05f9a8 100644 --- a/scripts/performance_test/README_PERFTEST.md +++ b/scripts/performance_test/README_PERFTEST.md @@ -51,33 +51,93 @@ For device support of each backend, - `Yuanrong` supports `cpu` and `npu` - `MooncakeStore` supports `cpu` and `gpu` +## Test Data Format + +The test case creates TensorDict with three types of fields: + +1. **Regular tensors**: Shape `(batch_size, seq_length)`, float32 +2. **Nested tensors** (non-NPU devices): Variable-length sequences with lengths forming an arithmetic progression from 1 to `seq_length`. For a batch of size N, element j has length `1 + j * (seq_length - 1) / (N - 1)`. This gives an average nested length of approximately `seq_length / 2`, making the nested column size roughly half of a regular tensor column. +3. **NonTensorStack strings**: Each string is `seq_length * 4` bytes to match the memory footprint of one tensor element. 
+ +### NPU Fallback + +NPU does not support nested tensors. When running with `--device=npu`, the nested tensor fields are replaced with regular tensors of shape `(batch_size, seq_length // 2)` to maintain comparable total data size while avoiding nested tensor operations. + ## Yuanrong Backend For Yuanrong backend, writer runs on head node and reader runs on worker node. +## Running Full Test Suite + +The `run_perf_test.sh` script automates the full performance test suite: + +```bash +cd scripts/performance_test +./run_perf_test.sh +``` + +### Configuration + +Configure the test environment via environment variables: + +| Variable | Description | Default | +|----------|-------------|---------| +| `HEAD_NODE_IP` | Head node IP address | 127.0.0.1 | +| `WORKER_NODE_IP` | Worker node IP address | 127.0.0.1 | +| `DEVICE` | Device type (cpu, npu, gpu) | cpu | +| `NUM_TEST_ITERATIONS` | Number of iterations per test | 4 | + +Example: +```bash +HEAD_NODE_IP=192.168.0.1 WORKER_NODE_IP=192.168.0.2 DEVICE=npu ./run_perf_test.sh +``` + +### Test Matrix + +The script tests all combinations of: +- **Backends**: SimpleStorage, Yuanrong, MooncakeStore, Ray (baseline) +- **Data sizes**: Small (batch=128, fields=3, seq=1024), Medium (batch=1024, fields=9, seq=8192), Large (batch=4096, fields=21, seq=128000) + +### Output + +- CSV results are saved to `results/{backend}_{size}.csv` (e.g., `results/simplestorage_small.csv`) +- A performance comparison chart is generated as `results/performance_comparison.pdf` + +### draw_figure.py + +After running the tests, `draw_figure.py` reads all CSV files from the `results/` directory and generates a bar chart comparing total throughput (Gbps) across backends and data sizes. 
+ ## Examples -### SimpleStorage/Mooncake backend +Individual test examples using `perftest.py`: + +### SimpleStorage backend ```bash -python perftest.py --backend_config=perftest_config.yaml \ +python perftest.py --backend_config=perftest_config.yaml --backend=SimpleStorage \ --head_node_ip=192.168.0.1 ``` ### Yuanrong backend ```bash -python perftest.py --backend_config=perftest_config.yaml \ +python perftest.py --backend_config=perftest_config.yaml --backend=Yuanrong \ --head_node_ip=192.168.0.1 --worker_node_ip=192.168.0.2 ``` -### NPU device test +### MooncakeStore backend +```bash +python perftest.py --backend_config=perftest_config.yaml --backend=MooncakeStore \ + --head_node_ip=192.168.0.1 +``` + +### NPU device test (Yuanrong backend) ```bash -python perftest.py --backend_config=perftest_config.yaml --device=npu \ +python perftest.py --backend_config=perftest_config.yaml --backend=Yuanrong --device=npu \ --head_node_ip=192.168.0.1 --worker_node_ip=192.168.0.2 ``` ### Output to CSV ```bash -python perftest.py --backend_config=perftest_config.yaml \ +python perftest.py --backend_config=perftest_config.yaml --backend=SimpleStorage \ --head_node_ip=192.168.0.1 --output_csv=results.csv ``` diff --git a/scripts/performance_test/perftest.py b/scripts/performance_test/perftest.py index 53614bf..46e38a8 100644 --- a/scripts/performance_test/perftest.py +++ b/scripts/performance_test/perftest.py @@ -46,15 +46,16 @@ def create_test_case( Creates TensorDict with: - Regular tensors: (batch_size, seq_length) shape, each element is float32 - - Nested Tensors: variable-length sequences, each batch element has length - uniformly sampled from [1, seq_length] + - Nested Tensors (non-NPU): variable-length sequences with lengths forming an + arithmetic progression from 1 to seq_length (average length ≈ seq_length/2) + - Nested Tensors (NPU): regular tensors of shape (batch_size, seq_length//2) - NonTensorStack wrapped strings: each string size ~= seq_length * 4 bytes (to match 
memory footprint of one tensor element) Args: batch_size: Batch size for the test case seq_length: Maximum sequence length (used for regular tensors and - as upper bound for nested tensor sampling) + as upper bound for nested tensor lengths) field_num: Total number of fields to create (distributed across types) device: Device to create tensors on ("cpu", "npu", or "gpu") @@ -72,10 +73,15 @@ def create_test_case( regular_field_size_bytes = batch_size * seq_length * bytes_per_element regular_field_size_gb = regular_field_size_bytes / (1024**3) - # Nested tensor field: average length = (1 + seq_length) / 2, + # Nested tensor field: average length = (1 + seq_length) / 2 (arithmetic progression), # so avg size = batch_size * (1 + seq_length) / 2 * 4 bytes - avg_nested_length = (1 + seq_length) / 2 - nested_field_size_bytes = int(batch_size * avg_nested_length * bytes_per_element) + # For NPU, nested fields become regular tensors of seq_length // 2 + if device == "npu": + avg_nested_length = seq_length // 2 + nested_field_size_bytes = int(batch_size * avg_nested_length * bytes_per_element) + else: + avg_nested_length = (1 + seq_length) / 2 + nested_field_size_bytes = int(batch_size * avg_nested_length * bytes_per_element) nested_field_size_gb = nested_field_size_bytes / (1024**3) # NonTensorStack string field: each string ~= seq_length * 4 bytes to match one tensor element @@ -101,11 +107,8 @@ def create_test_case( torch_device = "cuda:0" # Set seeds for reproducibility (within this process) - # Sample lengths for all nested fields at once - nested_lengths = [ - torch.randint(1, seq_length + 1, (batch_size,), generator=torch.Generator().manual_seed(42 + i)) - for i in range(num_nested_fields) - ] + # For non-NPU: arithmetic progression lengths from 1 to seq_length for each nested field + # For NPU: nested fields become regular tensors of seq_length // 2 batch_size_tuple = (batch_size,) @@ -117,21 +120,27 @@ def create_test_case( tensor_data = torch.randn(batch_size, 
seq_length, dtype=torch.float32, device=torch_device) prompt_batch.set(field_name, tensor_data) - # 2. Nested Tensor fields (variable-length sequences) + # 2. Nested Tensor fields (variable-length sequences) or regular tensors for NPU for i in range(num_nested_fields): field_name = f"nested_field_{i}" - actual_lengths = nested_lengths[i] - - # Create nested tensor from variable-length sequences - nested_list = [] - for j in range(batch_size): - length = actual_lengths[j].item() - # Create sequence data: arange for each element (representing sequence indices) - seq_data = torch.arange(length, dtype=torch.float32, device=torch_device) - nested_list.append(seq_data) - - nested_tensor = torch.nested.as_nested_tensor(nested_list, layout=torch.jagged) - prompt_batch.set(field_name, nested_tensor) + + if device == "npu": + # For NPU: create a regular tensor of seq_length // 2 + tensor_data = torch.randn(batch_size, seq_length // 2, dtype=torch.float32, device=torch_device) + prompt_batch.set(field_name, tensor_data) + else: + # For non-NPU: create nested tensor with arithmetic progression lengths + # Lengths go from 1 to seq_length in equal increments + step = (seq_length - 1) / (batch_size - 1) if batch_size > 1 else 0 + nested_list = [] + for j in range(batch_size): + length = int(round(1 + j * step)) + length = max(1, min(length, seq_length)) # Clamp to [1, seq_length] + seq_data = torch.arange(length, dtype=torch.float32, device=torch_device) + nested_list.append(seq_data) + + nested_tensor = torch.nested.as_nested_tensor(nested_list, layout=torch.jagged) + prompt_batch.set(field_name, nested_tensor) # 3. 
NonTensorStack wrapped strings # Each string ~= seq_length * 4 bytes to match one tensor element's memory footprint diff --git a/scripts/performance_test/ray_perftest_baseline.py b/scripts/performance_test/ray_perftest_baseline.py index 6951713..30eb9ad 100644 --- a/scripts/performance_test/ray_perftest_baseline.py +++ b/scripts/performance_test/ray_perftest_baseline.py @@ -43,15 +43,16 @@ def create_test_case( Creates TensorDict with: - Regular tensors: (batch_size, seq_length) shape, each element is float32 - - Nested Tensors: variable-length sequences, each batch element has length - uniformly sampled from [1, seq_length] + - Nested Tensors (non-NPU): variable-length sequences with lengths forming an + arithmetic progression from 1 to seq_length (average length ≈ seq_length/2) + - Nested Tensors (NPU): regular tensors of shape (batch_size, seq_length//2) - NonTensorStack wrapped strings: each string size ~= seq_length * 4 bytes (to match memory footprint of one tensor element) Args: batch_size: Batch size for the test case seq_length: Maximum sequence length (used for regular tensors and - as upper bound for nested tensor sampling) + as upper bound for nested tensor lengths) field_num: Total number of fields to create (distributed across types) device: Device to create tensors on ("cpu", "npu", or "gpu") @@ -69,10 +70,15 @@ def create_test_case( regular_field_size_bytes = batch_size * seq_length * bytes_per_element regular_field_size_gb = regular_field_size_bytes / (1024**3) - # Nested tensor field: average length = (1 + seq_length) / 2, + # Nested tensor field: average length = (1 + seq_length) / 2 (arithmetic progression), # so avg size = batch_size * (1 + seq_length) / 2 * 4 bytes - avg_nested_length = (1 + seq_length) / 2 - nested_field_size_bytes = int(batch_size * avg_nested_length * bytes_per_element) + # For NPU, nested fields become regular tensors of seq_length // 2 + if device == "npu": + avg_nested_length = seq_length // 2 + nested_field_size_bytes = 
int(batch_size * avg_nested_length * bytes_per_element) + else: + avg_nested_length = (1 + seq_length) / 2 + nested_field_size_bytes = int(batch_size * avg_nested_length * bytes_per_element) nested_field_size_gb = nested_field_size_bytes / (1024**3) # NonTensorStack string field: each string ~= seq_length * 4 bytes to match one tensor element @@ -98,11 +104,8 @@ def create_test_case( torch_device = "cuda:0" # Set seeds for reproducibility (within this process) - # Sample lengths for all nested fields at once - nested_lengths = [ - torch.randint(1, seq_length + 1, (batch_size,), generator=torch.Generator().manual_seed(42 + i)) - for i in range(num_nested_fields) - ] + # For non-NPU: arithmetic progression lengths from 1 to seq_length for each nested field + # For NPU: nested fields become regular tensors of seq_length // 2 batch_size_tuple = (batch_size,) @@ -114,21 +117,27 @@ def create_test_case( tensor_data = torch.randn(batch_size, seq_length, dtype=torch.float32, device=torch_device) prompt_batch.set(field_name, tensor_data) - # 2. Nested Tensor fields (variable-length sequences) + # 2. 
Nested Tensor fields (variable-length sequences) or regular tensors for NPU for i in range(num_nested_fields): field_name = f"nested_field_{i}" - actual_lengths = nested_lengths[i] - - # Create nested tensor from variable-length sequences - nested_list = [] - for j in range(batch_size): - length = actual_lengths[j].item() - # Create sequence data: arange for each element (representing sequence indices) - seq_data = torch.arange(length, dtype=torch.float32, device=torch_device) - nested_list.append(seq_data) - - nested_tensor = torch.nested.as_nested_tensor(nested_list, layout=torch.jagged) - prompt_batch.set(field_name, nested_tensor) + + if device == "npu": + # For NPU: create a regular tensor of seq_length // 2 + tensor_data = torch.randn(batch_size, seq_length // 2, dtype=torch.float32, device=torch_device) + prompt_batch.set(field_name, tensor_data) + else: + # For non-NPU: create nested tensor with arithmetic progression lengths + # Lengths go from 1 to seq_length in equal increments + step = (seq_length - 1) / (batch_size - 1) if batch_size > 1 else 0 + nested_list = [] + for j in range(batch_size): + length = int(round(1 + j * step)) + length = max(1, min(length, seq_length)) # Clamp to [1, seq_length] + seq_data = torch.arange(length, dtype=torch.float32, device=torch_device) + nested_list.append(seq_data) + + nested_tensor = torch.nested.as_nested_tensor(nested_list, layout=torch.jagged) + prompt_batch.set(field_name, nested_tensor) # 3. NonTensorStack wrapped strings # Each string ~= seq_length * 4 bytes to match one tensor element's memory footprint From d043cf96501523e61c518c6cfd6e67da756172aa Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Wed, 25 Mar 2026 17:26:14 +0800 Subject: [PATCH 15/29] 1. 
delete old samples Signed-off-by: tianyi-ge --- scripts/performance_test/perftest.py | 17 +++++++++++++++++ scripts/performance_test/run_perf_test.sh | 2 ++ 2 files changed, 19 insertions(+) diff --git a/scripts/performance_test/perftest.py b/scripts/performance_test/perftest.py index 46e38a8..9d2d06f 100644 --- a/scripts/performance_test/perftest.py +++ b/scripts/performance_test/perftest.py @@ -199,6 +199,12 @@ def get_data(self, partition_id: str, keys: list[str] | None = None) -> None: keys = self.test_keys tq.kv_batch_get(keys=keys, partition_id=partition_id) + def delete(self, partition_id: str, keys: list[str] | None = None) -> None: + """Delete data from storage using kv_batch_delete.""" + if keys is None: + keys = self.test_keys + tq.kv_batch_delete(keys=keys, partition_id=partition_id) + def close(self) -> None: """Close transfer_queue.""" tq.close() @@ -358,6 +364,7 @@ def run_throughput_test(self) -> dict[str, Any]: device=self.device, ) ) + logger.info(f"Total Data Size: {total_data_size_gb:.6f} GB") end_create_data = time.perf_counter() logger.info(f"Data creation time: {end_create_data - start_create_data:.8f}s") @@ -389,6 +396,16 @@ def run_throughput_test(self) -> dict[str, Any]: get_gbit_per_sec = (total_data_size_gb * 8) / get_time get_gbyte_per_sec = total_data_size_gb / get_time + time.sleep(2) + + # DELETE operation using kv_batch_delete + logger.info("Starting DELETE operation (kv_batch_delete)...") + start_delete = time.perf_counter() + ray.get(self.writer.delete.remote(partition_id=partition_id, keys=keys)) + end_delete = time.perf_counter() + delete_time = end_delete - start_delete + logger.info(f"DELETE Time: {delete_time:.8f}s") + # Print summary total_gbit_per_sec = (total_data_size_gb * 16) / (put_time + get_time) total_gbyte_per_sec = (total_data_size_gb * 2) / (put_time + get_time) diff --git a/scripts/performance_test/run_perf_test.sh b/scripts/performance_test/run_perf_test.sh index 50b2863..630869e 100755 --- 
a/scripts/performance_test/run_perf_test.sh +++ b/scripts/performance_test/run_perf_test.sh @@ -45,6 +45,8 @@ for backend in "${BACKENDS[@]}"; do --num_test_iterations="${NUM_TEST_ITERATIONS}" \ --head_node_ip="${HEAD_NODE_IP}" --worker_node_ip="${WORKER_NODE_IP}" \ --output_csv="${output_csv}" + + sleep 10 done done From dc51c265e47b4b93a038d4b6ca693f44f9a8177c Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Wed, 25 Mar 2026 17:31:22 +0800 Subject: [PATCH 16/29] kv_batch_delete -> kv_clear Signed-off-by: tianyi-ge --- scripts/performance_test/perftest.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/performance_test/perftest.py b/scripts/performance_test/perftest.py index 9d2d06f..7b2f900 100644 --- a/scripts/performance_test/perftest.py +++ b/scripts/performance_test/perftest.py @@ -200,10 +200,10 @@ def get_data(self, partition_id: str, keys: list[str] | None = None) -> None: tq.kv_batch_get(keys=keys, partition_id=partition_id) def delete(self, partition_id: str, keys: list[str] | None = None) -> None: - """Delete data from storage using kv_batch_delete.""" + """Delete data from storage using kv_clear.""" if keys is None: keys = self.test_keys - tq.kv_batch_delete(keys=keys, partition_id=partition_id) + tq.kv_clear(keys=keys, partition_id=partition_id) def close(self) -> None: """Close transfer_queue.""" @@ -398,8 +398,8 @@ def run_throughput_test(self) -> dict[str, Any]: time.sleep(2) - # DELETE operation using kv_batch_delete - logger.info("Starting DELETE operation (kv_batch_delete)...") + # DELETE operation using kv_clear + logger.info("Starting DELETE operation (kv_clear)...") start_delete = time.perf_counter() ray.get(self.writer.delete.remote(partition_id=partition_id, keys=keys)) end_delete = time.perf_counter() From 8d621d057490030815a3728cdc95c64abedcef48 Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Wed, 25 Mar 2026 17:54:50 +0800 Subject: [PATCH 17/29] clean test data Signed-off-by: tianyi-ge --- 
scripts/performance_test/perftest.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/performance_test/perftest.py b/scripts/performance_test/perftest.py index 7b2f900..55283e1 100644 --- a/scripts/performance_test/perftest.py +++ b/scripts/performance_test/perftest.py @@ -204,6 +204,7 @@ def delete(self, partition_id: str, keys: list[str] | None = None) -> None: if keys is None: keys = self.test_keys tq.kv_clear(keys=keys, partition_id=partition_id) + self.test_data = None def close(self) -> None: """Close transfer_queue.""" tq.close() @@ -550,6 +551,7 @@ def main() -> None: logger.info("-" * 60) result = tester.run_throughput_test() all_results.append(result) + time.sleep(10) # Write to CSV if output path is specified if args.output_csv: From 590af4565ff2190e4c0914e8e887bc1a1aa6e31b Mon Sep 17 00:00:00 2001 From: 0oshowero0 Date: Thu, 26 Mar 2026 14:05:17 +0800 Subject: [PATCH 18/29] update test scenario and optimize data gen speed Signed-off-by: 0oshowero0 # Conflicts: # scripts/performance_test/perftest.py --- scripts/performance_test/perftest.py | 71 ++++++++++--------- .../performance_test/ray_perftest_baseline.py | 66 ++++++++--------- scripts/performance_test/run_perf_test.sh | 8 +-- 3 files changed, 73 insertions(+), 72 deletions(-) diff --git a/scripts/performance_test/perftest.py b/scripts/performance_test/perftest.py index 55283e1..dfe78a2 100644 --- a/scripts/performance_test/perftest.py +++ b/scripts/performance_test/perftest.py @@ -17,6 +17,7 @@ import argparse import csv import logging +import os import sys import time from pathlib import Path @@ -121,6 +122,11 @@ def create_test_case( prompt_batch.set(field_name, tensor_data) # 2. 
Nested Tensor fields (variable-length sequences) or regular tensors for NPU + if device != "npu": + step = (seq_length - 1) / (batch_size - 1) if batch_size > 1 else 0 + lengths = [max(1, min(int(round(1 + j * step)), seq_length)) for j in range(batch_size)] + total_elements = sum(lengths) + for i in range(num_nested_fields): field_name = f"nested_field_{i}" @@ -129,27 +135,20 @@ def create_test_case( tensor_data = torch.randn(batch_size, seq_length // 2, dtype=torch.float32, device=torch_device) prompt_batch.set(field_name, tensor_data) else: - # For non-NPU: create nested tensor with arithmetic progression lengths - # Lengths go from 1 to seq_length in equal increments - step = (seq_length - 1) / (batch_size - 1) if batch_size > 1 else 0 - nested_list = [] - for j in range(batch_size): - length = int(round(1 + j * step)) - length = max(1, min(length, seq_length)) # Clamp to [1, seq_length] - seq_data = torch.arange(length, dtype=torch.float32, device=torch_device) - nested_list.append(seq_data) - - nested_tensor = torch.nested.as_nested_tensor(nested_list, layout=torch.jagged) + flat_data = torch.randn(total_elements, dtype=torch.float32, device=torch_device) + nested_tuple = torch.split(flat_data, lengths) + nested_tensor = torch.nested.as_nested_tensor(nested_tuple, layout=torch.jagged) prompt_batch.set(field_name, nested_tensor) # 3. 
NonTensorStack wrapped strings # Each string ~= seq_length * 4 bytes to match one tensor element's memory footprint string_char_count = seq_length * bytes_per_element # 4 bytes per char (unicode) - string_template = "x" * string_char_count for i in range(num_nontensor_fields): field_name = f"nontensor_field_{i}" - string_data = [string_template for _ in range(batch_size)] + bytes_needed = string_char_count // 2 + string_data = [os.urandom(bytes_needed).hex() for _ in range(batch_size)] + prompt_batch.set(field_name, NonTensorStack.from_list(string_data)) return prompt_batch, total_size_gb @@ -349,25 +348,27 @@ def _initialize_clients(self) -> None: r = self.reader.initialize.remote() ray.get([w, r]) - def run_throughput_test(self) -> dict[str, Any]: + def run_throughput_test(self, skip_dataset_create=False) -> dict[str, Any]: """Run the throughput test and print results. Returns: Dictionary with test results """ - logger.info("Creating large batch for throughput test...") - start_create_data = time.perf_counter() - data_fields, total_data_size_gb = ray.get( - self.writer.create_test_case.remote( - batch_size=self.global_batch_size, - seq_length=self.seq_len, - field_num=self.field_num, - device=self.device, + # Create test data + if not skip_dataset_create: + logger.info("Creating large batch for throughput test...") + start_create_data = time.perf_counter() + data_fields, self.total_data_size_gb = ray.get( + self.writer.create_test_case.remote( + batch_size=self.global_batch_size, + seq_length=self.seq_len, + field_num=self.field_num, + device=self.device, + ) ) - ) - logger.info(f"Total Data Size: {total_data_size_gb:.6f} GB") - end_create_data = time.perf_counter() - logger.info(f"Data creation time: {end_create_data - start_create_data:.8f}s") + end_create_data = time.perf_counter() + logger.info(f"Total Data Size: {self.total_data_size_gb:.6f} GB") + logger.info(f"Data creation time: {end_create_data - start_create_data:.8f}s") partition_id = "train_0" @@ 
-377,8 +378,8 @@ def run_throughput_test(self) -> dict[str, Any]: ray.get(self.writer.put.remote(partition_id=partition_id)) end_put = time.perf_counter() put_time = end_put - start_put - put_gbit_per_sec = (total_data_size_gb * 8) / put_time - put_gbyte_per_sec = total_data_size_gb / put_time + put_gbit_per_sec = (self.total_data_size_gb * 8) / put_time + put_gbyte_per_sec = self.total_data_size_gb / put_time time.sleep(2) @@ -394,8 +395,8 @@ def run_throughput_test(self) -> dict[str, Any]: ray.get(self.reader.get_data.remote(partition_id=partition_id, keys=keys)) end_get_data = time.perf_counter() get_time = end_get_data - start_get_data - get_gbit_per_sec = (total_data_size_gb * 8) / get_time - get_gbyte_per_sec = total_data_size_gb / get_time + get_gbit_per_sec = (self.total_data_size_gb * 8) / get_time + get_gbyte_per_sec = self.total_data_size_gb / get_time time.sleep(2) @@ -408,15 +409,15 @@ def run_throughput_test(self) -> dict[str, Any]: logger.info(f"DELETE Time: {delete_time:.8f}s") # Print summary - total_gbit_per_sec = (total_data_size_gb * 16) / (put_time + get_time) - total_gbyte_per_sec = (total_data_size_gb * 2) / (put_time + get_time) + total_gbit_per_sec = (self.total_data_size_gb * 16) / (put_time + get_time) + total_gbyte_per_sec = (self.total_data_size_gb * 2) / (put_time + get_time) logger.info("=" * 60) logger.info("THROUGHPUT TEST SUMMARY") logger.info("=" * 60) logger.info(f"Backend: {self.backend}") logger.info(f"Device: {self.device}") - logger.info(f"Total Data Size: {total_data_size_gb:.6f} GB") + logger.info(f"Total Data Size: {self.total_data_size_gb:.6f} GB") logger.info(f"PUT Time: {put_time:.8f}s") logger.info(f"GET Time: {get_time:.8f}s") logger.info(f"PUT Throughput: {put_gbit_per_sec:.8f} Gb/s ({put_gbyte_per_sec:.8f} GB/s)") @@ -428,7 +429,7 @@ def run_throughput_test(self) -> dict[str, Any]: return { "backend": self.backend, "device": self.device, - "total_data_size_gb": total_data_size_gb, + "total_data_size_gb": 
self.total_data_size_gb, "put_time": put_time, "get_time": get_time, "put_gbit_per_sec": put_gbit_per_sec, @@ -549,7 +550,7 @@ def main() -> None: logger.info("-" * 60) logger.info(f"Iteration {i + 1}/{args.num_test_iterations}") logger.info("-" * 60) - result = tester.run_throughput_test() + result = tester.run_throughput_test(skip_dataset_create=(i != 0)) all_results.append(result) time.sleep(10) diff --git a/scripts/performance_test/ray_perftest_baseline.py b/scripts/performance_test/ray_perftest_baseline.py index 30eb9ad..e59b175 100644 --- a/scripts/performance_test/ray_perftest_baseline.py +++ b/scripts/performance_test/ray_perftest_baseline.py @@ -17,6 +17,7 @@ import argparse import csv import logging +import os import sys import time from pathlib import Path @@ -118,6 +119,11 @@ def create_test_case( prompt_batch.set(field_name, tensor_data) # 2. Nested Tensor fields (variable-length sequences) or regular tensors for NPU + if device != "npu": + step = (seq_length - 1) / (batch_size - 1) if batch_size > 1 else 0 + lengths = [max(1, min(int(round(1 + j * step)), seq_length)) for j in range(batch_size)] + total_elements = sum(lengths) + for i in range(num_nested_fields): field_name = f"nested_field_{i}" @@ -126,27 +132,20 @@ def create_test_case( tensor_data = torch.randn(batch_size, seq_length // 2, dtype=torch.float32, device=torch_device) prompt_batch.set(field_name, tensor_data) else: - # For non-NPU: create nested tensor with arithmetic progression lengths - # Lengths go from 1 to seq_length in equal increments - step = (seq_length - 1) / (batch_size - 1) if batch_size > 1 else 0 - nested_list = [] - for j in range(batch_size): - length = int(round(1 + j * step)) - length = max(1, min(length, seq_length)) # Clamp to [1, seq_length] - seq_data = torch.arange(length, dtype=torch.float32, device=torch_device) - nested_list.append(seq_data) - - nested_tensor = torch.nested.as_nested_tensor(nested_list, layout=torch.jagged) + flat_data = 
torch.randn(total_elements, dtype=torch.float32, device=torch_device) + nested_tuple = torch.split(flat_data, lengths) + nested_tensor = torch.nested.as_nested_tensor(nested_tuple, layout=torch.jagged) prompt_batch.set(field_name, nested_tensor) # 3. NonTensorStack wrapped strings # Each string ~= seq_length * 4 bytes to match one tensor element's memory footprint string_char_count = seq_length * bytes_per_element # 4 bytes per char (unicode) - string_template = "x" * string_char_count for i in range(num_nontensor_fields): field_name = f"nontensor_field_{i}" - string_data = [string_template for _ in range(batch_size)] + bytes_needed = string_char_count // 2 + string_data = [os.urandom(bytes_needed).hex() for _ in range(batch_size)] + prompt_batch.set(field_name, NonTensorStack.from_list(string_data)) return prompt_batch, total_size_gb @@ -218,31 +217,32 @@ def _initialize_remote_store(self) -> None: logger.info(f"RemoteDataStore created on {reader_node}") - def run_throughput_test(self) -> dict[str, Any]: + def run_throughput_test(self, skip_dataset_create=False) -> dict[str, Any]: """Run the throughput test and print results. 
Returns: Dictionary with test results """ # Create test data - logger.info("Creating large batch for throughput test...") - start_create_data = time.perf_counter() - test_data, total_data_size_gb = create_test_case( - batch_size=self.global_batch_size, - seq_length=self.seq_len, - field_num=self.field_num, - device="cpu", - ) - end_create_data = time.perf_counter() - logger.info(f"Data creation time: {end_create_data - start_create_data:.8f}s") + if not skip_dataset_create: + logger.info("Creating large batch for throughput test...") + start_create_data = time.perf_counter() + self.test_data, self.total_data_size_gb = create_test_case( + batch_size=self.global_batch_size, + seq_length=self.seq_len, + field_num=self.field_num, + device="cpu", + ) + end_create_data = time.perf_counter() + logger.info(f"Data creation time: {end_create_data - start_create_data:.8f}s") # PUT operation - pass data directly to remote actor logger.info("Starting PUT operation...") start_put = time.perf_counter() - ray.get(self.remote_store.put_data.remote(test_data)) + ray.get(self.remote_store.put_data.remote(self.test_data)) end_put = time.perf_counter() put_time = end_put - start_put - put_gbit_per_sec = (total_data_size_gb * 8) / put_time + put_gbit_per_sec = (self.total_data_size_gb * 8) / put_time time.sleep(2) @@ -252,19 +252,19 @@ def run_throughput_test(self) -> dict[str, Any]: _ = ray.get(self.remote_store.get_data.remote()) end_get = time.perf_counter() get_time = end_get - start_get - get_gbit_per_sec = (total_data_size_gb * 8) / get_time + get_gbit_per_sec = (self.total_data_size_gb * 8) / get_time # Clear data ray.get(self.remote_store.clear_data.remote()) # Calculate total throughput - total_gbit_per_sec = (total_data_size_gb * 16) / (put_time + get_time) + total_gbit_per_sec = (self.total_data_size_gb * 16) / (put_time + get_time) # Print summary logger.info("=" * 60) logger.info("RAY BASELINE THROUGHPUT TEST SUMMARY") logger.info("=" * 60) - logger.info(f"Total Data Size: 
{total_data_size_gb:.6f} GB") + logger.info(f"Total Data Size: {self.total_data_size_gb:.6f} GB") logger.info(f"PUT Time: {put_time:.8f}s") logger.info(f"GET Time: {get_time:.8f}s") logger.info(f"PUT Throughput: {put_gbit_per_sec:.8f} Gb/s") @@ -275,7 +275,7 @@ def run_throughput_test(self) -> dict[str, Any]: return { "backend": "RayBaseline", "device": "cpu", - "total_data_size_gb": total_data_size_gb, + "total_data_size_gb": self.total_data_size_gb, "put_time": put_time, "get_time": get_time, "put_gbit_per_sec": put_gbit_per_sec, @@ -329,8 +329,8 @@ def main() -> None: parser.add_argument( "--num_test_iterations", type=int, - default=3, - help="Number of test iterations (default: 3)", + default=4, + help="Number of test iterations (default: 4)", ) parser.add_argument( "--head_node_ip", @@ -370,7 +370,7 @@ def main() -> None: logger.info("-" * 60) logger.info(f"Iteration {i + 1}/{args.num_test_iterations}") logger.info("-" * 60) - result = tester.run_throughput_test() + result = tester.run_throughput_test(skip_dataset_create=(i != 0)) all_results.append(result) # Write to CSV if output path is specified diff --git a/scripts/performance_test/run_perf_test.sh b/scripts/performance_test/run_perf_test.sh index 630869e..b9c7a1f 100755 --- a/scripts/performance_test/run_perf_test.sh +++ b/scripts/performance_test/run_perf_test.sh @@ -22,9 +22,9 @@ BACKENDS=("SimpleStorage" "Yuanrong" "MooncakeStore") # Test settings: global_batch_size, field_num, seq_len, name declare -a SETTINGS=( - "128,3,1024,Small" - "1024,9,8192,Medium" - "4096,21,128000,Large" + "1024,9,8192,Small" + "4096,15,32768,Medium" + "8192,21,128000,Large" ) # ---- TransferQueue perftest ---- @@ -71,4 +71,4 @@ done python "${SCRIPT_DIR}/draw_figure.py" echo "" -echo "All tests completed!" +echo "All tests completed!" 
\ No newline at end of file From 5446dfe154110cf3e846ede4e0db03ad9da5b1e3 Mon Sep 17 00:00:00 2001 From: 0oshowero0 Date: Thu, 26 Mar 2026 14:10:17 +0800 Subject: [PATCH 19/29] update readme Signed-off-by: 0oshowero0 --- scripts/performance_test/README_PERFTEST.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/performance_test/README_PERFTEST.md b/scripts/performance_test/README_PERFTEST.md index e05f9a8..d51d42b 100644 --- a/scripts/performance_test/README_PERFTEST.md +++ b/scripts/performance_test/README_PERFTEST.md @@ -96,7 +96,7 @@ HEAD_NODE_IP=192.168.0.1 WORKER_NODE_IP=192.168.0.2 DEVICE=npu ./run_perf_test.s The script tests all combinations of: - **Backends**: SimpleStorage, Yuanrong, MooncakeStore, Ray (baseline) -- **Data sizes**: Small (batch=128, fields=3, seq=1024), Medium (batch=1024, fields=9, seq=8192), Large (batch=4096, fields=21, seq=128000) +- **Data sizes**: Small (batch=1024, fields=9, seq=8192), Medium (batch=4096, fields=15, seq=32768), Large (batch=8192, fields=21, seq=128000) ### Output From b918ec543e8cfb2b19b2babf580f53934cd000d2 Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Thu, 26 Mar 2026 14:41:47 +0800 Subject: [PATCH 20/29] do not remove test data since it's being reused Signed-off-by: tianyi-ge --- scripts/performance_test/perftest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/performance_test/perftest.py b/scripts/performance_test/perftest.py index dfe78a2..ce395c1 100644 --- a/scripts/performance_test/perftest.py +++ b/scripts/performance_test/perftest.py @@ -203,7 +203,6 @@ def delete(self, partition_id: str, keys: list[str] | None = None) -> None: if keys is None: keys = self.test_keys tq.kv_clear(keys=keys, partition_id=partition_id) - self.test_data = None def close(self) -> None: """Close transfer_queue.""" From ca530afbb5c15713f7a0aac46702254eb36e5c50 Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Thu, 26 Mar 2026 15:14:10 +0800 Subject: [PATCH 21/29] update readme for perftest 
Signed-off-by: tianyi-ge --- scripts/performance_test/README_PERFTEST.md | 120 +++++++++++--------- 1 file changed, 65 insertions(+), 55 deletions(-) diff --git a/scripts/performance_test/README_PERFTEST.md b/scripts/performance_test/README_PERFTEST.md index d51d42b..e9e9e02 100644 --- a/scripts/performance_test/README_PERFTEST.md +++ b/scripts/performance_test/README_PERFTEST.md @@ -8,7 +8,6 @@ This script runs throughput tests for TransferQueue with different backends. ```bash # On head node ray start --head --resources='{"node:192.168.0.1":1}' - # On worker node ray start --address=192.168.0.1:6379 --resources='{"node:192.168.0.2":1}' ``` @@ -19,10 +18,11 @@ This script runs throughput tests for TransferQueue with different backends. ```bash python perftest.py \ - --backend_config=../../transfer_queue/config.yaml \ - --device=[cpu|npu|gpu] \ + --backend_config=perftest_config.yaml \ + --backend=SimpleStorage \ + --device=cpu \ --global_batch_size=1024 \ - --field_num=10 \ + --field_num=9 \ --seq_len=8192 \ --head_node_ip=192.168.0.1 \ --worker_node_ip=192.168.0.2 @@ -30,46 +30,56 @@ python perftest.py \ ## Arguments -| Argument | Description | Default | -|----------|-------------|---------| -| `--backend_config` | Path to backend config YAML file (required) | - | -| `--device` | Device: cpu, npu, gpu | cpu | -| `--global_batch_size` | Global batch size | 1024 | -| `--field_num` | Number of fields | 10 | -| `--seq_len` | Sequence length | 8192 | -| `--num_test_iterations` | Number of test iterations | 4 | -| `--head_node_ip` | Head node IP (required) | - | -| `--worker_node_ip` | Worker node IP (required for Yuanrong) | None | -| `--output_csv` | Path to output CSV file (optional) | None | +| Argument | Description | Default | Required | +|----------|-------------|---------|----------| +| `--backend_config` | Path to backend config YAML file | - | Yes | +| `--backend` | Override `storage_backend` in config (`SimpleStorage`, `Yuanrong`, `MooncakeStore`) | None | 
No | +| `--device` | Device: `cpu`, `npu`, `gpu` | `cpu` | No | +| `--global_batch_size` | Global batch size | 1024 | No | +| `--field_num` | Number of fields in the TensorDict | 10 | No | +| `--seq_len` | Sequence length | 8192 | No | +| `--num_test_iterations` | Number of test iterations | 4 | No | +| `--head_node_ip` | Head node IP address | - | Yes | +| `--worker_node_ip` | Worker node IP address (required for Yuanrong) | None | No | +| `--output_csv` | Path to output CSV file | None | No | ## Backend Configuration -The script reads the backend configuration directly from the provided `--backend_config` YAML file. The backend type is determined by `backend.storage_backend` in the config file. +The script reads the backend configuration directly from the provided `--backend_config` YAML file. The backend type is determined by `backend.storage_backend` in the config file. When `--backend` is specified, it overrides the value in the config. -For device support of each backend, -- `SimpleStorage` backend supports `cpu` -- `Yuanrong` supports `cpu` and `npu` -- `MooncakeStore` supports `cpu` and `gpu` +For device support of each backend: +- `SimpleStorage`: `cpu` +- `Yuanrong`: `cpu`, `npu` +- `MooncakeStore`: `cpu`, `gpu` ## Test Data Format -The test case creates TensorDict with three types of fields: +The test case creates a `TensorDict` with three types of fields to simulate real training batches: + +1. **Regular tensors**: Shape `(batch_size, seq_length)`, float32. +2. **Nested tensors** (non-NPU devices): Variable-length ragged sequences with lengths forming an arithmetic progression from 1 to `seq_length`. Average length ≈ `seq_length / 2`, so each nested field is roughly half the size of a regular field. +3. **NonTensorStack strings**: Each string is `seq_length × 4` bytes, matching the memory footprint of one tensor element. + +Fields are distributed evenly across the three types (rounded up). 
For NPU devices, nested tensors fall back to regular tensors of shape `(batch_size, seq_length // 2)`. + +## Test Flow -1. **Regular tensors**: Shape `(batch_size, seq_length)`, float32 -2. **Nested tensors** (non-NPU devices): Variable-length sequences with lengths forming an arithmetic progression from 1 to `seq_length`. For a batch of size N, element j has length `1 + j * (seq_length - 1) / (N - 1)`. This gives an average nested length of approximately `seq_length / 2`, making the nested column size roughly half of a regular tensor column. -3. **NonTensorStack strings**: Each string is `seq_length * 4` bytes to match the memory footprint of one tensor element. +Each iteration performs a PUT → LIST → GET → DELETE cycle via TransferQueue's KV API: -### NPU Fallback +1. **PUT** (`kv_batch_put`): Writer sends the TensorDict to storage. +2. **LIST** (`kv_list`): Reader queries available keys in the partition. +3. **GET** (`kv_batch_get`): Reader fetches data for those keys. +4. **DELETE** (`kv_clear`): Writer removes the written data. -NPU does not support nested tensors. When running with `--device=npu`, the nested tensor fields are replaced with regular tensors of shape `(batch_size, seq_length // 2)` to maintain comparable total data size while avoiding nested tensor operations. +The test runs `--num_test_iterations` iterations. Data creation only happens in the first iteration; subsequent iterations reuse the same TensorDict to isolate transfer overhead. ## Yuanrong Backend -For Yuanrong backend, writer runs on head node and reader runs on worker node. +For Yuanrong backend, writer runs on the head node and reader runs on the worker node. `--worker_node_ip` is required. 
## Running Full Test Suite -The `run_perf_test.sh` script automates the full performance test suite: +The `run_perf_test.sh` script automates the full test suite across all backends and data sizes, then generates a comparison chart: ```bash cd scripts/performance_test @@ -78,14 +88,14 @@ cd scripts/performance_test ### Configuration -Configure the test environment via environment variables: +Configure via environment variables: | Variable | Description | Default | |----------|-------------|---------| -| `HEAD_NODE_IP` | Head node IP address | 127.0.0.1 | -| `WORKER_NODE_IP` | Worker node IP address | 127.0.0.1 | -| `DEVICE` | Device type (cpu, npu, gpu) | cpu | -| `NUM_TEST_ITERATIONS` | Number of iterations per test | 4 | +| `HEAD_NODE_IP` | Head node IP address | `127.0.0.1` | +| `WORKER_NODE_IP` | Worker node IP address | `127.0.0.1` | +| `DEVICE` | Device type (`cpu`, `npu`, `gpu`) | `cpu` | +| `NUM_TEST_ITERATIONS` | Number of iterations per test | `4` | Example: ```bash @@ -94,30 +104,31 @@ HEAD_NODE_IP=192.168.0.1 WORKER_NODE_IP=192.168.0.2 DEVICE=npu ./run_perf_test.s ### Test Matrix -The script tests all combinations of: - **Backends**: SimpleStorage, Yuanrong, MooncakeStore, Ray (baseline) - **Data sizes**: Small (batch=1024, fields=9, seq=8192), Medium (batch=4096, fields=15, seq=32768), Large (batch=8192, fields=21, seq=128000) ### Output -- CSV results are saved to `results/{backend}_{size}.csv` (e.g., `results/simplestorage_small.csv`) -- A performance comparison chart is generated as `results/performance_comparison.pdf` +- CSV results: `results/{backend}_{size}.csv` (e.g., `results/simplestorage_small.csv`, `results/ray_baseline_medium.csv`) +- Performance chart: `results/performance_comparison.pdf` + +### Ray Baseline + +`ray_perftest_baseline.py` measures raw Ray inter-node transfer throughput without TransferQueue, serving as a baseline. It passes a TensorDict directly to a remote Ray actor (via `ray.get`), using the same test data format. 
It is automatically included in `run_perf_test.sh`. ### draw_figure.py -After running the tests, `draw_figure.py` reads all CSV files from the `results/` directory and generates a bar chart comparing total throughput (Gbps) across backends and data sizes. +After running the tests, `draw_figure.py` reads all CSV files from `results/` and generates a grouped bar chart comparing total throughput (Gbps) across backends and data sizes. ## Examples -Individual test examples using `perftest.py`: - ### SimpleStorage backend ```bash python perftest.py --backend_config=perftest_config.yaml --backend=SimpleStorage \ --head_node_ip=192.168.0.1 ``` -### Yuanrong backend +### Yuanrong backend (inter-node) ```bash python perftest.py --backend_config=perftest_config.yaml --backend=Yuanrong \ --head_node_ip=192.168.0.1 --worker_node_ip=192.168.0.2 @@ -129,7 +140,7 @@ python perftest.py --backend_config=perftest_config.yaml --backend=MooncakeStore --head_node_ip=192.168.0.1 ``` -### NPU device test (Yuanrong backend) +### NPU device test (Yuanrong) ```bash python perftest.py --backend_config=perftest_config.yaml --backend=Yuanrong --device=npu \ --head_node_ip=192.168.0.1 --worker_node_ip=192.168.0.2 @@ -141,7 +152,7 @@ python perftest.py --backend_config=perftest_config.yaml --backend=SimpleStorage --head_node_ip=192.168.0.1 --output_csv=results.csv ``` -## Output +## Output Format The test prints: - Total data size @@ -151,16 +162,15 @@ The test prints: Throughput is shown in both Gb/s (gigabits per second) and GB/s (gigabytes per second). -### CSV Output - -When using `--output_csv`, the test writes results to a CSV file with the following columns: -- backend -- device -- total_data_size_gb -- put_time -- get_time -- put_gbit_per_sec -- get_gbit_per_sec -- total_gbit_per_sec - -The test runs `--num_test_iterations` iterations (default: 4) and saves all results to the CSV. 
+### CSV Columns + +| Column | Description | +|--------|-------------| +| `backend` | Backend name | +| `device` | Device type | +| `total_data_size_gb` | Data size in GB | +| `put_time` | PUT duration (seconds) | +| `get_time` | GET duration (seconds) | +| `put_gbit_per_sec` | PUT throughput (Gbps) | +| `get_gbit_per_sec` | GET throughput (Gbps) | +| `total_gbit_per_sec` | Round-trip throughput (Gbps) | From dbd830f9a4e7d83b841a59ceaac7d9376ef99a96 Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Thu, 26 Mar 2026 16:42:22 +0800 Subject: [PATCH 22/29] 1. fix bar order in draw_figure.py 2. remove delete time stats Signed-off-by: tianyi-ge --- scripts/performance_test/draw_figure.py | 9 ++++++++- scripts/performance_test/perftest.py | 4 ---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/scripts/performance_test/draw_figure.py b/scripts/performance_test/draw_figure.py index bb7910e..b96219a 100644 --- a/scripts/performance_test/draw_figure.py +++ b/scripts/performance_test/draw_figure.py @@ -95,12 +95,19 @@ def make_xlabel(size_label: str) -> str: df["Bandwidth"] = df["total_gbit_per_sec"] df["Scenario"] = df["backend_parsed"] +# Set backend display order +backend_order = ["Ray", "SimpleStorage", "Yuanrong", "MooncakeStore"] + +df["Scenario"] = pd.Categorical(df["Scenario"], categories=backend_order, ordered=True) + # ========== Plotting ========== sns.set_theme(style="white", palette="husl") fig, ax = plt.subplots(figsize=(12, 7)) -palette = sns.color_palette("Set2", n_colors=df["Scenario"].nunique()) +# Use the backend order to ensure consistent coloring +existing_backends = df["Scenario"].unique() +palette = sns.color_palette("Set2", n_colors=len(existing_backends)) barplot = sns.barplot(data=df, x="X_label", y="Bandwidth", hue="Scenario", ax=ax, alpha=0.8, palette=palette) # Legend: match old style — at the top center, horizontal, with frame diff --git a/scripts/performance_test/perftest.py b/scripts/performance_test/perftest.py index 
ce395c1..c5c174e 100644 --- a/scripts/performance_test/perftest.py +++ b/scripts/performance_test/perftest.py @@ -401,11 +401,7 @@ def run_throughput_test(self, skip_dataset_create=False) -> dict[str, Any]: # DELETE operation using kv_clear logger.info("Starting DELETE operation (kv_clear)...") - start_delete = time.perf_counter() ray.get(self.writer.delete.remote(partition_id=partition_id, keys=keys)) - end_delete = time.perf_counter() - delete_time = end_delete - start_delete - logger.info(f"DELETE Time: {delete_time:.8f}s") # Print summary total_gbit_per_sec = (self.total_data_size_gb * 16) / (put_time + get_time) From eb380c20a1771ddef8aef395e1c2dcd2e7204f5e Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Fri, 27 Mar 2026 21:10:50 +0800 Subject: [PATCH 23/29] fix incorrect init yr client from controller; otherwise all yr clients will connect to the head node Signed-off-by: tianyi-ge --- scripts/performance_test/perftest_config.yaml | 2 +- transfer_queue/interface.py | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/scripts/performance_test/perftest_config.yaml b/scripts/performance_test/perftest_config.yaml index 88dca45..a1b1f9a 100644 --- a/scripts/performance_test/perftest_config.yaml +++ b/scripts/performance_test/perftest_config.yaml @@ -53,4 +53,4 @@ backend: # Port of local yuanrong datasystem worker port: 31501 # If enable npu transport - enable_yr_npu_transport: false + enable_yr_npu_transport: true diff --git a/transfer_queue/interface.py b/transfer_queue/interface.py index d0fd2f7..92b0f5e 100644 --- a/transfer_queue/interface.py +++ b/transfer_queue/interface.py @@ -233,7 +233,7 @@ def init(conf: Optional[DictConfig] = None) -> None: >>> metadata = tq.get_meta(...) 
>>> data = tq.get_data(metadata) """ - if _init_from_existing(): + if conf is None and _init_from_existing(): return # First-time initialize TransferQueue @@ -271,8 +271,10 @@ def init(conf: Optional[DictConfig] = None) -> None: logger.info("TransferQueueController has been created.") except ValueError: logger.info("Some other rank has initialized TransferQueueController. Try to connect to existing controller.") - _init_from_existing() - return + if conf is None: + _init_from_existing() + return + _TRANSFER_QUEUE_CONTROLLER = ray.get_actor("TransferQueueController") controller_zmq_info = process_zmq_server_info(_TRANSFER_QUEUE_CONTROLLER) final_conf.controller.zmq_info = controller_zmq_info From b68d267160715101ba201a8c52812d4139dd4e47 Mon Sep 17 00:00:00 2001 From: 0oshowero0 Date: Fri, 27 Mar 2026 16:26:14 +0800 Subject: [PATCH 24/29] add simple case Signed-off-by: 0oshowero0 --- scripts/performance_test/README_PERFTEST.md | 8 ++ scripts/performance_test/perftest.py | 80 ++++++++++++++++++- .../performance_test/ray_perftest_baseline.py | 79 ++++++++++++++++-- scripts/performance_test/run_perf_test.sh | 14 +++- 4 files changed, 169 insertions(+), 12 deletions(-) diff --git a/scripts/performance_test/README_PERFTEST.md b/scripts/performance_test/README_PERFTEST.md index e9e9e02..b1d0f6a 100644 --- a/scripts/performance_test/README_PERFTEST.md +++ b/scripts/performance_test/README_PERFTEST.md @@ -96,9 +96,17 @@ Configure via environment variables: | `WORKER_NODE_IP` | Worker node IP address | `127.0.0.1` | | `DEVICE` | Device type (`cpu`, `npu`, `gpu`) | `cpu` | | `NUM_TEST_ITERATIONS` | Number of iterations per test | `4` | +| `USE_COMPLEX_CASE` | Run with complex test case (nested + nontensor fields) | `false` | Example: ```bash +# Simple case (default, regular tensors only) +./run_perf_test.sh + +# Complex case (nested tensors + nontensor strings) +USE_COMPLEX_CASE=true ./run_perf_test.sh + +# With specific node IPs & use NPU HEAD_NODE_IP=192.168.0.1 
WORKER_NODE_IP=192.168.0.2 DEVICE=npu ./run_perf_test.sh ``` diff --git a/scripts/performance_test/perftest.py b/scripts/performance_test/perftest.py index c5c174e..5e94f36 100644 --- a/scripts/performance_test/perftest.py +++ b/scripts/performance_test/perftest.py @@ -42,6 +42,62 @@ def create_test_case( seq_length: int | None = None, field_num: int | None = None, device: str = "cpu", +) -> tuple[TensorDict, float]: + """Create a test case with tensor data formats. + + Creates TensorDict with: + - Regular tensors: (batch_size, seq_length) shape, each element is float32 + + Args: + batch_size: Batch size for the test case + seq_length: Maximum sequence length (used for regular tensors and + as upper bound for nested tensor lengths) + field_num: Total number of fields to create (distributed across types) + device: Device to create tensors on ("cpu", "npu", or "gpu") + + Returns: + Tuple of (TensorDict, total_size_gb) + """ + bytes_per_element = 4 # float32 + + # Each regular tensor field: batch_size * seq_length * 4 bytes + regular_field_size_bytes = batch_size * seq_length * bytes_per_element + regular_field_size_gb = regular_field_size_bytes / (1024**3) + + # Total size = sum of all field types + total_size_gb = regular_field_size_gb * field_num + + logger.info(f"Total data size: {total_size_gb:.6f} GB") + + # Determine torch device + torch_device = None + if device == "npu": + torch_device = "npu:0" + elif device == "gpu": + torch_device = "cuda:0" + + # Set seeds for reproducibility (within this process) + # For non-NPU: arithmetic progression lengths from 1 to seq_length for each nested field + # For NPU: nested fields become regular tensors of seq_length // 2 + + batch_size_tuple = (batch_size,) + + prompt_batch = TensorDict(batch_size=batch_size_tuple) + + # 1. 
Regular tensor fields + for i in range(field_num): + field_name = f"field_{i}" + tensor_data = torch.randn(batch_size, seq_length, dtype=torch.float32, device=torch_device) + prompt_batch.set(field_name, tensor_data) + + return prompt_batch, total_size_gb + + +def create_complex_test_case( + batch_size: int | None = None, + seq_length: int | None = None, + field_num: int | None = None, + device: str = "cpu", ) -> tuple[TensorDict, float]: """Create a test case with complex data formats. @@ -158,8 +214,9 @@ def create_test_case( class TQClientActor: """Ray actor that uses tq.init(config) to initialize.""" - def __init__(self, config: dict[str, Any]): + def __init__(self, config: dict[str, Any], use_complex_case: bool = False): self.config = config + self.use_complex_case = use_complex_case self.test_data = None self.total_data_size_gb = 0.0 self.test_keys = None @@ -176,7 +233,12 @@ def create_test_case( device: str = "cpu", ) -> tuple[list[str], float]: """Create test case on the actor.""" - self.test_data, self.total_data_size_gb = create_test_case(batch_size, seq_length, field_num, device) + if self.use_complex_case: + self.test_data, self.total_data_size_gb = create_complex_test_case( + batch_size, seq_length, field_num, device + ) + else: + self.test_data, self.total_data_size_gb = create_test_case(batch_size, seq_length, field_num, device) # Create keys for each sample in the batch self.test_keys = [f"test_key_{i}" for i in range(batch_size)] return list(self.test_data.keys()), self.total_data_size_gb @@ -224,6 +286,7 @@ def __init__( backend: str | None = None, worker_node_ip: str | None = None, output_csv: str | None = None, + use_complex_case: bool = False, ): """Initialize the throughput tester. 
@@ -238,6 +301,7 @@ def __init__( head_node_ip: Head node IP address worker_node_ip: Worker node IP address (required for Yuanrong) output_csv: Path to output CSV file (optional) + use_complex_case: Whether to use complex test case (nested + nontensor fields) """ self.backend_config_path = backend_config_path self.backend_override = backend @@ -249,6 +313,7 @@ def __init__( self.head_node_ip = head_node_ip self.worker_node_ip = worker_node_ip self.output_csv = output_csv + self.use_complex_case = use_complex_case # Prepare full config for tq.init() self.full_config = self._prepare_config() @@ -337,8 +402,8 @@ def _initialize_clients(self) -> None: reader_config = self.full_config # Create writer and reader actors - self.writer = TQClientActor.options(**writer_options).remote(writer_config) - self.reader = TQClientActor.options(**reader_options).remote(reader_config) + self.writer = TQClientActor.options(**writer_options).remote(writer_config, self.use_complex_case) + self.reader = TQClientActor.options(**reader_options).remote(reader_config, self.use_complex_case) # Initialize transfer_queue logger.info(f"Using {self.backend} as storage backend.") @@ -522,6 +587,12 @@ def main() -> None: default=None, help="Path to output CSV file (optional)", ) + parser.add_argument( + "--use_complex_case", + action="store_true", + default=False, + help="Use complex test case with nested tensors and nontensor fields (default: False, simple case)", + ) args = parser.parse_args() @@ -537,6 +608,7 @@ def main() -> None: backend=args.backend, worker_node_ip=args.worker_node_ip, output_csv=args.output_csv, + use_complex_case=args.use_complex_case, ) # Run test multiple times for consistent results using a for loop diff --git a/scripts/performance_test/ray_perftest_baseline.py b/scripts/performance_test/ray_perftest_baseline.py index e59b175..fe40788 100644 --- a/scripts/performance_test/ray_perftest_baseline.py +++ b/scripts/performance_test/ray_perftest_baseline.py @@ -39,6 +39,55 @@ 
def create_test_case( seq_length: int | None = None, field_num: int | None = None, device: str = "cpu", +) -> tuple[TensorDict, float]: + """Create a test case with only regular tensors. + + Creates TensorDict with: + - Regular tensors: (batch_size, seq_length) shape, each element is float32 + + Args: + batch_size: Batch size for the test case + seq_length: Maximum sequence length + field_num: Total number of fields to create + device: Device to create tensors on ("cpu", "npu", or "gpu") + + Returns: + Tuple of (TensorDict, total_size_gb) + """ + bytes_per_element = 4 # float32 + + # Each regular tensor field: batch_size * seq_length * 4 bytes + regular_field_size_bytes = batch_size * seq_length * bytes_per_element + regular_field_size_gb = regular_field_size_bytes / (1024**3) + + total_size_gb = regular_field_size_gb * field_num + + logger.info(f"Total data size: {total_size_gb:.6f} GB") + + # Determine torch device + torch_device = None + if device == "npu": + torch_device = "npu:0" + elif device == "gpu": + torch_device = "cuda:0" + + batch_size_tuple = (batch_size,) + + prompt_batch = TensorDict(batch_size=batch_size_tuple) + + for i in range(field_num): + field_name = f"field_{i}" + tensor_data = torch.randn(batch_size, seq_length, dtype=torch.float32, device=torch_device) + prompt_batch.set(field_name, tensor_data) + + return prompt_batch, total_size_gb + + +def create_complex_test_case( + batch_size: int | None = None, + seq_length: int | None = None, + field_num: int | None = None, + device: str = "cpu", ) -> tuple[TensorDict, float]: """Create a test case with complex data formats. @@ -180,6 +229,7 @@ def __init__( head_node_ip: str, worker_node_ip: str | None = None, output_csv: str | None = None, + use_complex_case: bool = False, ): """Initialize the Ray baseline tester. 
@@ -191,6 +241,7 @@ def __init__( head_node_ip: Head node IP address worker_node_ip: Worker node IP address output_csv: Path to output CSV file (optional) + use_complex_case: Whether to use complex test case (nested + nontensor fields) """ self.global_batch_size = global_batch_size self.field_num = field_num @@ -199,6 +250,7 @@ def __init__( self.head_node_ip = head_node_ip self.worker_node_ip = worker_node_ip self.output_csv = output_csv + self.use_complex_case = use_complex_case # Initialize remote store on worker node self._initialize_remote_store() @@ -227,12 +279,20 @@ def run_throughput_test(self, skip_dataset_create=False) -> dict[str, Any]: if not skip_dataset_create: logger.info("Creating large batch for throughput test...") start_create_data = time.perf_counter() - self.test_data, self.total_data_size_gb = create_test_case( - batch_size=self.global_batch_size, - seq_length=self.seq_len, - field_num=self.field_num, - device="cpu", - ) + if self.use_complex_case: + self.test_data, self.total_data_size_gb = create_complex_test_case( + batch_size=self.global_batch_size, + seq_length=self.seq_len, + field_num=self.field_num, + device="cpu", + ) + else: + self.test_data, self.total_data_size_gb = create_test_case( + batch_size=self.global_batch_size, + seq_length=self.seq_len, + field_num=self.field_num, + device="cpu", + ) end_create_data = time.perf_counter() logger.info(f"Data creation time: {end_create_data - start_create_data:.8f}s") @@ -350,6 +410,12 @@ def main() -> None: default=None, help="Path to output CSV file (optional)", ) + parser.add_argument( + "--use_complex_case", + action="store_true", + default=False, + help="Use complex test case with nested tensors and nontensor fields (default: False, simple case)", + ) args = parser.parse_args() @@ -362,6 +428,7 @@ def main() -> None: head_node_ip=args.head_node_ip, worker_node_ip=args.worker_node_ip, output_csv=args.output_csv, + use_complex_case=args.use_complex_case, ) # Run test multiple times diff 
--git a/scripts/performance_test/run_perf_test.sh b/scripts/performance_test/run_perf_test.sh index b9c7a1f..0531717 100755 --- a/scripts/performance_test/run_perf_test.sh +++ b/scripts/performance_test/run_perf_test.sh @@ -15,6 +15,7 @@ HEAD_NODE_IP="${HEAD_NODE_IP:-127.0.0.1}" WORKER_NODE_IP="${WORKER_NODE_IP:-127.0.0.1}" DEVICE="${DEVICE:-cpu}" NUM_TEST_ITERATIONS="${NUM_TEST_ITERATIONS:-4}" +USE_COMPLEX_CASE="${USE_COMPLEX_CASE:-false}" # ======================================== # Backends to test (passed via --backend to perftest.py) @@ -27,6 +28,13 @@ declare -a SETTINGS=( "8192,21,128000,Large" ) +# Complex case flag +if [[ "${USE_COMPLEX_CASE}" == "true" ]]; then + COMPLEX_FLAG="--use_complex_case" +else + COMPLEX_FLAG="" +fi + # ---- TransferQueue perftest ---- for backend in "${BACKENDS[@]}"; do echo "==========================================" @@ -44,7 +52,8 @@ for backend in "${BACKENDS[@]}"; do --global_batch_size="${batch_size}" --field_num="${field_num}" --seq_len="${seq_len}" \ --num_test_iterations="${NUM_TEST_ITERATIONS}" \ --head_node_ip="${HEAD_NODE_IP}" --worker_node_ip="${WORKER_NODE_IP}" \ - --output_csv="${output_csv}" + --output_csv="${output_csv}" \ + ${COMPLEX_FLAG} sleep 10 done @@ -64,7 +73,8 @@ for setting in "${SETTINGS[@]}"; do --global_batch_size="${batch_size}" --field_num="${field_num}" --seq_len="${seq_len}" \ --num_test_iterations="${NUM_TEST_ITERATIONS}" \ --head_node_ip="${HEAD_NODE_IP}" --worker_node_ip="${WORKER_NODE_IP}" \ - --output_csv="${output_csv}" + --output_csv="${output_csv}" \ + ${COMPLEX_FLAG} done # ---- Draw figures ---- From a020706322dd11031972e543f942686e262194ca Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Sat, 28 Mar 2026 14:07:42 +0800 Subject: [PATCH 25/29] remove host config for yuanrong; auto-detect instead Signed-off-by: tianyi-ge --- .../openyuanrong_datasystem.md | 16 ++- scripts/performance_test/perftest.py | 16 +-- scripts/performance_test/perftest_config.yaml | 2 - transfer_queue/config.yaml | 2 
- .../storage/clients/yuanrong_client.py | 32 +++++- .../storage/managers/yuanrong_manager.py | 4 +- transfer_queue/utils/common.py | 99 +++++++++++++++++++ 7 files changed, 139 insertions(+), 32 deletions(-) diff --git a/docs/storage_backends/openyuanrong_datasystem.md b/docs/storage_backends/openyuanrong_datasystem.md index 084bf55..e25c2f9 100644 --- a/docs/storage_backends/openyuanrong_datasystem.md +++ b/docs/storage_backends/openyuanrong_datasystem.md @@ -132,11 +132,11 @@ from transfer_queue import ( TransferQueueController, process_zmq_server_info, ) -# host, port, manager_type and client_name are the config for booting the datasystem. +# port, manager_type and client_name are the config for booting the datasystem. +# host will be auto-detected by checking local IP addresses. config_str = """ manager_type: YuanrongStorageManager client_name: YuanrongStorageClient - host: 127.0.0.1 port: 31501 """ dict_conf = OmegaConf.create(config_str, flags={"allow_objects": True}) @@ -360,26 +360,22 @@ def main(): config_str = """ manager_type: YuanrongStorageManager client_name: YuanrongStorageClient - host: 10.170.27.24 port: 31501 """ dict_conf = OmegaConf.create(config_str, flags={"allow_objects": True}) # It is important to pay attention to the controller's lifecycle. 
controller, dict_conf.controller_info = initialize_controller() - - conf_writer = dict_conf.copy() - conf_writer.host = HEAD_NODE_IP - conf_reader = dict_conf.copy() - conf_reader.host = WORKER_NODE_IP + + # Note: host is auto-detected on each node, no need to configure explicitly data = TensorDict({ "prompt": torch.ones(3, 512), "big_tensor": torch.randn(3,1024,1024)}, batch_size=[3]) # you could assign npu or gpu devices by 'resources' # resources={f"node:{HEAD_NODE_IP}": 0.001} could Force the actor to run on HEAD_NODE writer = TransferQueueClientActor.options( resources={f"node:{HEAD_NODE_IP}": 0.001}, - ).remote(conf_writer, "train") + ).remote(dict_conf, "train") reader = TransferQueueClientActor.options( resources={f"node:{WORKER_NODE_IP}": 0.001} - ).remote(conf_reader, "rollout") + ).remote(dict_conf, "rollout") ray.get(writer.put.remote(data=data, partition_id="train_0")) diff --git a/scripts/performance_test/perftest.py b/scripts/performance_test/perftest.py index 5e94f36..95eb45b 100644 --- a/scripts/performance_test/perftest.py +++ b/scripts/performance_test/perftest.py @@ -387,19 +387,9 @@ def _initialize_clients(self) -> None: reader_options["resources"]["NPU"] = 1 # Prepare configs for writer and reader - # For Yuanrong backend, set different hosts for writer and reader - if self.backend == "Yuanrong": - import copy - - writer_config = copy.deepcopy(self.full_config) - reader_config = copy.deepcopy(self.full_config) - writer_config["backend"]["Yuanrong"]["host"] = self.head_node_ip - reader_config["backend"]["Yuanrong"]["host"] = self.worker_node_ip - logger.info(f"Writer Yuanrong host: {self.head_node_ip}") - logger.info(f"Reader Yuanrong host: {self.worker_node_ip}") - else: - writer_config = self.full_config - reader_config = self.full_config + # Host is auto-detected on each node for Yuanrong backend + writer_config = self.full_config + reader_config = self.full_config # Create writer and reader actors self.writer = 
TQClientActor.options(**writer_options).remote(writer_config, self.use_complex_case) diff --git a/scripts/performance_test/perftest_config.yaml b/scripts/performance_test/perftest_config.yaml index a1b1f9a..b96ea33 100644 --- a/scripts/performance_test/perftest_config.yaml +++ b/scripts/performance_test/perftest_config.yaml @@ -48,8 +48,6 @@ backend: # For Yuanrong: Yuanrong: - # IP of local yuanrong datasystem worker - host: 127.0.0.1 # Port of local yuanrong datasystem worker port: 31501 # If enable npu transport diff --git a/transfer_queue/config.yaml b/transfer_queue/config.yaml index 0a8ccef..433c026 100644 --- a/transfer_queue/config.yaml +++ b/transfer_queue/config.yaml @@ -48,8 +48,6 @@ backend: # For Yuanrong: Yuanrong: - # IP of local yuanrong datasystem worker - host: 127.0.0.1 # Port of local yuanrong datasystem worker port: 31501 # If enable npu transport diff --git a/transfer_queue/storage/clients/yuanrong_client.py b/transfer_queue/storage/clients/yuanrong_client.py index 80f5bb7..5a32217 100644 --- a/transfer_queue/storage/clients/yuanrong_client.py +++ b/transfer_queue/storage/clients/yuanrong_client.py @@ -25,6 +25,7 @@ from transfer_queue.storage.clients.base import TransferQueueStorageKVClient from transfer_queue.storage.clients.factory import StorageClientFactory +from transfer_queue.utils.common import find_reachable_host from transfer_queue.utils.serial_utils import _decoder, _encoder logger = logging.getLogger(__name__) @@ -83,9 +84,20 @@ class NPUTensorKVClientAdapter(StorageStrategy): KEYS_LIMIT: int = 10_000 def __init__(self, config: dict): - host = config.get("host") port = config.get("port") + if port is None or not isinstance(port, int): + raise ValueError("Missing or invalid 'port' in config") + + logger.info(f"Auto-detecting reachable host for Yuanrong port {port}...") + host = find_reachable_host(port) + if host is None: + raise ValueError( + f"Could not find any reachable host for Yuanrong port {port}. 
" + "Please ensure yuanrong datasystem is running." + ) + logger.info(f"Using auto-detected host: {host}") + self.device_id = torch.npu.current_device() torch.npu.set_device(self.device_id) @@ -199,9 +211,20 @@ class GeneralKVClientAdapter(StorageStrategy): DS_MAX_WORKERS: int = 16 def __init__(self, config: dict): - host = config.get("host") port = config.get("port") + if port is None or not isinstance(port, int): + raise ValueError("Missing or invalid 'port' in config") + + logger.info(f"Auto-detecting reachable host for Yuanrong port {port}...") + host = find_reachable_host(port) + if host is None: + raise ValueError( + f"Could not find any reachable host for Yuanrong port {port}. " + "Please ensure yuanrong datasystem is running." + ) + logger.info(f"Using auto-detected host: {host}") + self._ds_client = datasystem.KVClient(host, port) self._ds_client.init() logger.info("YuanrongStorageClient: Create KVClient to connect with yuanrong-datasystem backend!") @@ -357,6 +380,11 @@ def __init__(self, config: dict[str, Any]): if not YUANRONG_DATASYSTEM_IMPORTED: raise ImportError("YuanRong DataSystem not installed.") + port = config.get("port") + + if port is None or not isinstance(port, int): + raise ValueError("Missing or invalid 'port' in config") + super().__init__(config) # Storage strategies are prioritized in ascending order of list element index. 
diff --git a/transfer_queue/storage/managers/yuanrong_manager.py b/transfer_queue/storage/managers/yuanrong_manager.py index 54ac094..d527040 100644 --- a/transfer_queue/storage/managers/yuanrong_manager.py +++ b/transfer_queue/storage/managers/yuanrong_manager.py @@ -36,14 +36,12 @@ class YuanrongStorageManager(KVStorageManager): """Storage manager for Yuanrong backend.""" def __init__(self, controller_info: ZMQServerInfo, config: dict[str, Any]): - host = config.get("host", None) port = config.get("port", None) client_name = config.get("client_name", None) - if host is None or not isinstance(host, str): - raise ValueError("Missing or invalid 'host' in config") if port is None or not isinstance(port, int): raise ValueError("Missing or invalid 'port' in config") + if client_name is None: logger.info("Missing 'client_name' in config, using default value('YuanrongStorageClient')") config["client_name"] = "YuanrongStorageClient" diff --git a/transfer_queue/utils/common.py b/transfer_queue/utils/common.py index a9d2b93..08a137f 100644 --- a/transfer_queue/utils/common.py +++ b/transfer_queue/utils/common.py @@ -15,6 +15,7 @@ import logging import os +import socket from contextlib import contextmanager from typing import Optional @@ -98,3 +99,101 @@ def get_env_bool(env_key: str, default: bool = False) -> bool: true_values = {"true", "1", "yes", "y", "on"} return env_value_lower in true_values + + +def get_local_ip_addresses() -> list[str]: + """Get all local IP addresses including 127.0.0.1. + + Returns: + List of local IP addresses, with 127.0.0.1 first. 
+ """ + ips = ["127.0.0.1"] + + try: + hostname = socket.gethostname() + # Add hostname resolution + try: + host_ip = socket.gethostbyname(hostname) + if host_ip not in ips: + ips.append(host_ip) + except socket.gaierror: + pass + + # Get all network interfaces + import netifaces + + for interface in netifaces.interfaces(): + try: + addrs = netifaces.ifaddresses(interface) + if netifaces.AF_INET in addrs: + for addr_info in addrs[netifaces.AF_INET]: + ip = addr_info.get("addr") + if ip and ip not in ips: + ips.append(ip) + except (ValueError, KeyError): + continue + except ImportError: + # Fallback if netifaces is not available + try: + # Try to get IP by connecting to an external address + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + try: + # Doesn't need to be reachable + s.connect(("8.8.8.8", 80)) + ip = s.getsockname()[0] + if ip not in ips: + ips.append(ip) + except Exception: + pass + finally: + s.close() + except Exception: + pass + + return ips + + +def check_port_connectivity(host: str, port: int, timeout: float = 2.0) -> bool: + """Check if a TCP port is reachable on the given host. + + Args: + host: Host IP address to check + port: Port number to check + timeout: Connection timeout in seconds + + Returns: + True if the port is reachable, False otherwise + """ + try: + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.settimeout(timeout) + result = sock.connect_ex((host, port)) + sock.close() + return result == 0 + except Exception: + return False + + +def find_reachable_host(port: int, timeout: float = 1.0) -> Optional[str]: + """Find a reachable local host IP address for the given port. + + Tries all local IP addresses in order and returns the first one + that has the given port open. + + Args: + port: Port number to check + timeout: Connection timeout in seconds per check + + Returns: + The first reachable host IP address, or None if none found. 
+ """ + local_ips = get_local_ip_addresses() + logger.info(f"Checking port {port} on local IPs: {local_ips}") + + for ip in local_ips: + if check_port_connectivity(ip, port, timeout): + logger.info(f"Found reachable host: {ip}:{port}") + return ip + + logger.warning(f"No reachable host found for port {port}") + return None From 28313dd6711cbdf58e89b50eaabde4a547d01537 Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Sat, 28 Mar 2026 16:24:27 +0800 Subject: [PATCH 26/29] 1. move find reachable ip to yuanrong client 2. modify default mooncake store perftest config Signed-off-by: tianyi-ge --- scripts/performance_test/perftest_config.yaml | 12 +-- tests/test_yuanrong_client_zero_copy.py | 1 + tests/test_yuanrong_storage_client_e2e.py | 7 ++ transfer_queue/interface.py | 7 +- .../storage/clients/yuanrong_client.py | 101 +++++++++++++++++- transfer_queue/utils/common.py | 99 ----------------- 6 files changed, 116 insertions(+), 111 deletions(-) diff --git a/scripts/performance_test/perftest_config.yaml b/scripts/performance_test/perftest_config.yaml index b96ea33..39538bc 100644 --- a/scripts/performance_test/perftest_config.yaml +++ b/scripts/performance_test/perftest_config.yaml @@ -34,12 +34,12 @@ backend: master_server_address: localhost:50051 # Address of local host. Set to "" to use Ray IP as local host address local_hostname: "" - # Protocol for transmission. Choose from: tcp, rdma. (default: tcp) - protocol: tcp - # Memory segment size in bytes for mounting (default: 4GB) - global_segment_size: 4294967296 - # Local buffer size in bytes (default: 1GB) - local_buffer_size: 1073741824 + # Protocol for transmission. Choose from: tcp, rdma. (default: rdma) + protocol: rdma + # Memory segment size in bytes for mounting + global_segment_size: 86294967296 + # Local buffer size in bytes + local_buffer_size: 86294967296 # Network device name. 
Set to "" to let Mooncake to auto-picks devices device_name: "" diff --git a/tests/test_yuanrong_client_zero_copy.py b/tests/test_yuanrong_client_zero_copy.py index b93fd32..423b1c7 100644 --- a/tests/test_yuanrong_client_zero_copy.py +++ b/tests/test_yuanrong_client_zero_copy.py @@ -47,6 +47,7 @@ def mock_kv_client(self, mocker): mocker.patch("yr.datasystem.KVClient", return_value=mock_client) mocker.patch("yr.datasystem.DsTensorClient") + mocker.patch("transfer_queue.storage.clients.yuanrong_client.find_reachable_host", return_value="127.0.0.1") return mock_client diff --git a/tests/test_yuanrong_storage_client_e2e.py b/tests/test_yuanrong_storage_client_e2e.py index 2a79ec9..3cb1f99 100644 --- a/tests/test_yuanrong_storage_client_e2e.py +++ b/tests/test_yuanrong_storage_client_e2e.py @@ -108,10 +108,17 @@ def mock_yr_datasystem(): # - sys.modules: Redirects 'import yr' to our mocks # - YUANRONG_DATASYSTEM_IMPORTED: Forces the existence check to True so initialize the client successfully # - datasystem: Direct attribute patch for the module + # - find_reachable_host: Mock host detection to avoid real network checks + def mock_find_reachable_host(port, timeout=1.0): + return "127.0.0.1" + with ( mock.patch.dict("sys.modules", {"yr": yr_mock, "yr.datasystem": ds_mock}), mock.patch("transfer_queue.storage.clients.yuanrong_client.YUANRONG_DATASYSTEM_IMPORTED", True, create=True), mock.patch("transfer_queue.storage.clients.yuanrong_client.datasystem", ds_mock), + mock.patch( + "transfer_queue.storage.clients.yuanrong_client.find_reachable_host", side_effect=mock_find_reachable_host + ), ): yield diff --git a/transfer_queue/interface.py b/transfer_queue/interface.py index 92b0f5e..f54c3bf 100644 --- a/transfer_queue/interface.py +++ b/transfer_queue/interface.py @@ -233,7 +233,7 @@ def init(conf: Optional[DictConfig] = None) -> None: >>> metadata = tq.get_meta(...) 
>>> data = tq.get_data(metadata) """ - if conf is None and _init_from_existing(): + if _init_from_existing(): return # First-time initialize TransferQueue @@ -271,10 +271,7 @@ def init(conf: Optional[DictConfig] = None) -> None: logger.info("TransferQueueController has been created.") except ValueError: logger.info("Some other rank has initialized TransferQueueController. Try to connect to existing controller.") - if conf is None: - _init_from_existing() - return - _TRANSFER_QUEUE_CONTROLLER = ray.get_actor("TransferQueueController") + _init_from_existing() controller_zmq_info = process_zmq_server_info(_TRANSFER_QUEUE_CONTROLLER) final_conf.controller.zmq_info = controller_zmq_info diff --git a/transfer_queue/storage/clients/yuanrong_client.py b/transfer_queue/storage/clients/yuanrong_client.py index 5a32217..77a981e 100644 --- a/transfer_queue/storage/clients/yuanrong_client.py +++ b/transfer_queue/storage/clients/yuanrong_client.py @@ -15,6 +15,7 @@ import logging import os +import socket import struct from abc import ABC, abstractmethod from concurrent.futures import ThreadPoolExecutor @@ -25,12 +26,110 @@ from transfer_queue.storage.clients.base import TransferQueueStorageKVClient from transfer_queue.storage.clients.factory import StorageClientFactory -from transfer_queue.utils.common import find_reachable_host from transfer_queue.utils.serial_utils import _decoder, _encoder logger = logging.getLogger(__name__) logger.setLevel(os.getenv("TQ_LOGGING_LEVEL", logging.WARNING)) + +def get_local_ip_addresses() -> list[str]: + """Get all local IP addresses including 127.0.0.1. + + Returns: + List of local IP addresses, with 127.0.0.1 first. 
+ """ + ips = ["127.0.0.1"] + + try: + hostname = socket.gethostname() + # Add hostname resolution + try: + host_ip = socket.gethostbyname(hostname) + if host_ip not in ips: + ips.append(host_ip) + except socket.gaierror: + pass + + # Get all network interfaces + import netifaces + + for interface in netifaces.interfaces(): + try: + addrs = netifaces.ifaddresses(interface) + if netifaces.AF_INET in addrs: + for addr_info in addrs[netifaces.AF_INET]: + ip = addr_info.get("addr") + if ip and ip not in ips: + ips.append(ip) + except (ValueError, KeyError): + continue + except ImportError: + # Fallback if netifaces is not available + try: + # Try to get IP by connecting to an external address + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + try: + # Doesn't need to be reachable + s.connect(("8.8.8.8", 80)) + ip = s.getsockname()[0] + if ip not in ips: + ips.append(ip) + except Exception: + pass + finally: + s.close() + except Exception: + pass + + return ips + + +def check_port_connectivity(host: str, port: int, timeout: float = 2.0) -> bool: + """Check if a TCP port is reachable on the given host. + + Args: + host: Host IP address to check + port: Port number to check + timeout: Connection timeout in seconds + + Returns: + True if the port is reachable, False otherwise + """ + try: + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.settimeout(timeout) + result = sock.connect_ex((host, port)) + sock.close() + return result == 0 + except Exception: + return False + + +def find_reachable_host(port: int, timeout: float = 1.0) -> Optional[str]: + """Find a reachable local host IP address for the given port. + + Tries all local IP addresses in order and returns the first one + that has the given port open. + + Args: + port: Port number to check + timeout: Connection timeout in seconds per check + + Returns: + The first reachable host IP address, or None if none found. 
+ """ + local_ips = get_local_ip_addresses() + logger.info(f"Checking port {port} on local IPs: {local_ips}") + + for ip in local_ips: + if check_port_connectivity(ip, port, timeout): + logger.info(f"Found reachable host: {ip}:{port}") + return ip + + logger.warning(f"No reachable host found for port {port}") + return None + + YUANRONG_DATASYSTEM_IMPORTED: bool = True try: diff --git a/transfer_queue/utils/common.py b/transfer_queue/utils/common.py index 08a137f..a9d2b93 100644 --- a/transfer_queue/utils/common.py +++ b/transfer_queue/utils/common.py @@ -15,7 +15,6 @@ import logging import os -import socket from contextlib import contextmanager from typing import Optional @@ -99,101 +98,3 @@ def get_env_bool(env_key: str, default: bool = False) -> bool: true_values = {"true", "1", "yes", "y", "on"} return env_value_lower in true_values - - -def get_local_ip_addresses() -> list[str]: - """Get all local IP addresses including 127.0.0.1. - - Returns: - List of local IP addresses, with 127.0.0.1 first. 
- """ - ips = ["127.0.0.1"] - - try: - hostname = socket.gethostname() - # Add hostname resolution - try: - host_ip = socket.gethostbyname(hostname) - if host_ip not in ips: - ips.append(host_ip) - except socket.gaierror: - pass - - # Get all network interfaces - import netifaces - - for interface in netifaces.interfaces(): - try: - addrs = netifaces.ifaddresses(interface) - if netifaces.AF_INET in addrs: - for addr_info in addrs[netifaces.AF_INET]: - ip = addr_info.get("addr") - if ip and ip not in ips: - ips.append(ip) - except (ValueError, KeyError): - continue - except ImportError: - # Fallback if netifaces is not available - try: - # Try to get IP by connecting to an external address - s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) - try: - # Doesn't need to be reachable - s.connect(("8.8.8.8", 80)) - ip = s.getsockname()[0] - if ip not in ips: - ips.append(ip) - except Exception: - pass - finally: - s.close() - except Exception: - pass - - return ips - - -def check_port_connectivity(host: str, port: int, timeout: float = 2.0) -> bool: - """Check if a TCP port is reachable on the given host. - - Args: - host: Host IP address to check - port: Port number to check - timeout: Connection timeout in seconds - - Returns: - True if the port is reachable, False otherwise - """ - try: - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - sock.settimeout(timeout) - result = sock.connect_ex((host, port)) - sock.close() - return result == 0 - except Exception: - return False - - -def find_reachable_host(port: int, timeout: float = 1.0) -> Optional[str]: - """Find a reachable local host IP address for the given port. - - Tries all local IP addresses in order and returns the first one - that has the given port open. - - Args: - port: Port number to check - timeout: Connection timeout in seconds per check - - Returns: - The first reachable host IP address, or None if none found. 
- """ - local_ips = get_local_ip_addresses() - logger.info(f"Checking port {port} on local IPs: {local_ips}") - - for ip in local_ips: - if check_port_connectivity(ip, port, timeout): - logger.info(f"Found reachable host: {ip}:{port}") - return ip - - logger.warning(f"No reachable host found for port {port}") - return None From f278f8e22a39abfc9f44e7fb11b7e90d40301daf Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Sat, 28 Mar 2026 17:01:12 +0800 Subject: [PATCH 27/29] fix comments Signed-off-by: tianyi-ge --- pyproject.toml | 5 ----- transfer_queue/interface.py | 1 + 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a3824cb..1fba227 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -120,11 +120,6 @@ yuanrong = [ mooncake = [ "mooncake-transfer-engine" ] -perftest = [ - "matplotlib", - "seaborn", - "pandas" -] # If you need to mimic `package_dir={'': '.'}`: [tool.setuptools.package-dir] diff --git a/transfer_queue/interface.py b/transfer_queue/interface.py index f54c3bf..d0fd2f7 100644 --- a/transfer_queue/interface.py +++ b/transfer_queue/interface.py @@ -272,6 +272,7 @@ def init(conf: Optional[DictConfig] = None) -> None: except ValueError: logger.info("Some other rank has initialized TransferQueueController. 
Try to connect to existing controller.") _init_from_existing() + return controller_zmq_info = process_zmq_server_info(_TRANSFER_QUEUE_CONTROLLER) final_conf.controller.zmq_info = controller_zmq_info From 47977050feefa9e5e2276d4d1b23b6c8fb4df184 Mon Sep 17 00:00:00 2001 From: 0oshowero0 Date: Sat, 28 Mar 2026 16:54:48 +0800 Subject: [PATCH 28/29] fix figure drawing Signed-off-by: 0oshowero0 --- scripts/performance_test/draw_figure.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/scripts/performance_test/draw_figure.py b/scripts/performance_test/draw_figure.py index b96219a..400c7b9 100644 --- a/scripts/performance_test/draw_figure.py +++ b/scripts/performance_test/draw_figure.py @@ -95,8 +95,14 @@ def make_xlabel(size_label: str) -> str: df["Bandwidth"] = df["total_gbit_per_sec"] df["Scenario"] = df["backend_parsed"] -# Set backend display order -backend_order = ["Ray", "SimpleStorage", "Yuanrong", "MooncakeStore"] +# Set backend display order: only include backends that actually exist in the data +preferred_backend_order = ["Ray", "SimpleStorage", "Yuanrong", "MooncakeStore"] + +# Get actual backends present in the data, maintaining preferred order +actual_backends = df["Scenario"].unique().tolist() +backend_order = [b for b in preferred_backend_order if b in actual_backends] +# Add any unknown backends at the end (shouldn't happen normally) +backend_order += [b for b in actual_backends if b not in preferred_backend_order] df["Scenario"] = pd.Categorical(df["Scenario"], categories=backend_order, ordered=True) @@ -105,9 +111,9 @@ def make_xlabel(size_label: str) -> str: fig, ax = plt.subplots(figsize=(12, 7)) -# Use the backend order to ensure consistent coloring -existing_backends = df["Scenario"].unique() -palette = sns.color_palette("Set2", n_colors=len(existing_backends)) +# Use Set2 palette to generate colors for all backends +# Set2 has 8 colors, which should be enough for typical use cases +palette = 
sns.color_palette("Set2", n_colors=len(backend_order)) barplot = sns.barplot(data=df, x="X_label", y="Bandwidth", hue="Scenario", ax=ax, alpha=0.8, palette=palette) # Legend: match old style — at the top center, horizontal, with frame From 49d113909535e19e37b329422ff1274d5c6720b8 Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Sat, 28 Mar 2026 17:38:43 +0800 Subject: [PATCH 29/29] update large test config Signed-off-by: tianyi-ge --- scripts/performance_test/README_PERFTEST.md | 2 +- scripts/performance_test/run_perf_test.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/performance_test/README_PERFTEST.md b/scripts/performance_test/README_PERFTEST.md index b1d0f6a..1b5ddc6 100644 --- a/scripts/performance_test/README_PERFTEST.md +++ b/scripts/performance_test/README_PERFTEST.md @@ -113,7 +113,7 @@ HEAD_NODE_IP=192.168.0.1 WORKER_NODE_IP=192.168.0.2 DEVICE=npu ./run_perf_test.s ### Test Matrix - **Backends**: SimpleStorage, Yuanrong, MooncakeStore, Ray (baseline) -- **Data sizes**: Small (batch=1024, fields=9, seq=8192), Medium (batch=4096, fields=15, seq=32768), Large (batch=8192, fields=21, seq=128000) +- **Data sizes**: Small (batch=1024, fields=9, seq=8192), Medium (batch=4096, fields=15, seq=32768), Large (batch=8192, fields=18, seq=100000) ### Output diff --git a/scripts/performance_test/run_perf_test.sh b/scripts/performance_test/run_perf_test.sh index 0531717..19aa478 100755 --- a/scripts/performance_test/run_perf_test.sh +++ b/scripts/performance_test/run_perf_test.sh @@ -25,7 +25,7 @@ BACKENDS=("SimpleStorage" "Yuanrong" "MooncakeStore") declare -a SETTINGS=( "1024,9,8192,Small" "4096,15,32768,Medium" - "8192,21,128000,Large" + "8192,18,100000,Large" ) # Complex case flag