From 221fd709e23392bd8e586505dbc556cd329dd97a Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Fri, 20 Mar 2026 00:46:39 +0800 Subject: [PATCH 01/29] refactor perftest 1. support different kv backends 2. support intra-node and inter-node client placement 3. remove ray bandwidth test Signed-off-by: tianyi-ge --- scripts/README_PERFTEST.md | 110 ++++++ scripts/configs/mooncake.yaml | 4 + scripts/configs/transferqueue.yaml | 6 + scripts/configs/yuanrong.yaml | 4 + scripts/perftest.py | 560 +++++++++++++++++++++++++++++ 5 files changed, 684 insertions(+) create mode 100644 scripts/README_PERFTEST.md create mode 100644 scripts/configs/mooncake.yaml create mode 100644 scripts/configs/transferqueue.yaml create mode 100644 scripts/configs/yuanrong.yaml create mode 100644 scripts/perftest.py diff --git a/scripts/README_PERFTEST.md b/scripts/README_PERFTEST.md new file mode 100644 index 0000000..b01f37f --- /dev/null +++ b/scripts/README_PERFTEST.md @@ -0,0 +1,110 @@ +# TransferQueue Throughput Test + +This script runs throughput tests for TransferQueue with different backends. + +## Prerequisites + +1. Start Ray cluster with node resources: + ```bash + # On head node + ray start --head --resources='{"node:192.168.0.1":1}' + + # On worker node + ray start --address=192.168.0.1 --resources='{"node:192.168.0.2":1}' + ``` + +2. Start the backend service (yuanrong, mooncake, etc.) if testing non-default backends. 
+ +## Usage + +```bash +python perftest.py \ + --backend=[default|yuanrong|mooncake] \ + --client_placement=[intra_node|inter_node] \ + --backend_config=xxx.yaml \ + --device=[cpu|npu|gpu] \ + --global_batch_size=1024 \ + --field_num=10 \ + --seq_len=8192 \ + --num_global_batch=1 \ + --head_node_ip=192.168.0.1 \ + --worker_node_ip=192.168.0.2 +``` + +## Arguments + +| Argument | Description | Default | +|----------|-------------|---------| +| `--backend` | Backend type: default, yuanrong, mooncake | default | +| `--client_placement` | Client placement: intra_node or inter_node | intra_node | +| `--backend_config` | Path to YAML config file (optional) | None | +| `--device` | Device: cpu, npu, gpu | cpu | +| `--global_batch_size` | Global batch size | 1024 | +| `--field_num` | Number of fields | 10 | +| `--seq_len` | Sequence length | 8192 | +| `--num_global_batch` | Number of global batches | 1 | +| `--head_node_ip` | Head node IP (required) | - | +| `--worker_node_ip` | Worker node IP (required for inter_node) | None | +| `--ray_address` | Ray cluster address | auto | + +## Backend Configuration + +Sample config files are in `configs/`: + +- **transferqueue.yaml**: Default backend config + ```yaml + num_data_storage_units: 8 + storage_unit_placement: normal # or "remote" + ``` + +- **yuanrong.yaml**: Yuanrong backend config + ```yaml + host: 127.0.0.1 + port: 31501 + enable_yr_npu_transport: false + ``` + +- **mooncake.yaml**: Mooncake backend config + ```yaml + local_hostname: 127.0.0.1 + metadata_server: 127.0.0.1:8080 + master_server_address: 127.0.0.1:8081 + ``` + +## Examples + +### Intra-node test with default backend +```bash +python perftest.py --backend=default --client_placement=intra_node \ + --head_node_ip=192.168.0.1 +``` + +### Inter-node test with yuanrong backend +```bash +python perftest.py --backend=yuanrong --client_placement=inter_node \ + --backend_config=configs/yuanrong.yaml \ + --head_node_ip=192.168.0.1 --worker_node_ip=192.168.0.2 +``` + 
+### Default backend with remote storage units +```bash +python perftest.py --backend=default --client_placement=intra_node \ + --backend_config=configs/transferqueue.yaml \ + --head_node_ip=192.168.0.1 --worker_node_ip=192.168.0.2 +``` + +### NPU device test +```bash +python perftest.py --backend=mooncake --device=npu \ + --head_node_ip=192.168.0.1 +``` + +## Output + +The test prints: +- Total data size +- PUT time and throughput +- GET time and throughput +- Total round-trip throughput + +Throughput is shown in both Gb/s (gigabits per second) and GB/s (gigabytes per second). diff --git a/scripts/configs/mooncake.yaml b/scripts/configs/mooncake.yaml new file mode 100644 index 0000000..320801f --- /dev/null +++ b/scripts/configs/mooncake.yaml @@ -0,0 +1,4 @@ +# Mooncake backend configuration +local_hostname: 127.0.0.1 +metadata_server: 127.0.0.1:8080 +master_server_address: 127.0.0.1:8081 diff --git a/scripts/configs/transferqueue.yaml b/scripts/configs/transferqueue.yaml new file mode 100644 index 0000000..9f55742 --- /dev/null +++ b/scripts/configs/transferqueue.yaml @@ -0,0 +1,6 @@ +# TransferQueue (default) backend configuration +num_data_storage_units: 8 +# storage_unit_placement: "normal" (default) or "remote" +# - normal: create storage units on current node using placement group +# - remote: create all storage units on WORKER_NODE_IP +storage_unit_placement: normal diff --git a/scripts/configs/yuanrong.yaml b/scripts/configs/yuanrong.yaml new file mode 100644 index 0000000..2df1b84 --- /dev/null +++ b/scripts/configs/yuanrong.yaml @@ -0,0 +1,4 @@ +# Yuanrong backend configuration +host: 127.0.0.1 +port: 31501 +enable_yr_npu_transport: false diff --git a/scripts/perftest.py b/scripts/perftest.py new file mode 100644 index 0000000..ca10775 --- /dev/null +++ b/scripts/perftest.py @@ -0,0 +1,560 @@ +# Copyright 2025 Huawei Technologies Co., Ltd. All Rights Reserved. 
+# Copyright 2025 The TransferQueue Team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import logging +import math +import random +import sys +import time +from pathlib import Path +from typing import Any + +import ray +import torch +from omegaconf import OmegaConf +from tensordict import TensorDict +from tensordict.tensorclass import NonTensorData + +parent_dir = Path(__file__).resolve().parent.parent +sys.path.append(str(parent_dir)) + +from transfer_queue.client import TransferQueueClient # noqa: E402 +from transfer_queue.controller import TransferQueueController # noqa: E402 +from transfer_queue.storage.simple_backend import SimpleStorageUnit # noqa: E402 +from transfer_queue.utils.common import get_placement_group # noqa: E402 +from transfer_queue.utils.zmq_utils import process_zmq_server_info # noqa: E402 + +logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") +logger = logging.getLogger(__name__) + + +def create_complex_test_case( + batch_size: int | None = None, + seq_length: int | None = None, + field_num: int | None = None, + device: str = "cpu", +) -> tuple[TensorDict, float]: + """Create a complex test case with tensor and non-tensor fields. 
+ + Args: + batch_size: Batch size for the test case + seq_length: Sequence length for tensor fields + field_num: Number of fields to create + device: Device to create tensors on ("cpu", "npu", or "gpu") + + Returns: + Tuple of (TensorDict, total_size_gb) + """ + tensor_field_size_bytes = batch_size * seq_length * 4 + tensor_field_size_gb = tensor_field_size_bytes / (1024**3) + + num_tensor_fields = (field_num + 1) // 2 + num_nontensor_fields = field_num // 2 + + total_tensor_size_gb = tensor_field_size_gb * num_tensor_fields + total_nontensor_size_gb = (batch_size * 1024 / (1024**3)) * num_nontensor_fields + total_size_gb = total_tensor_size_gb + total_nontensor_size_gb + + logger.info(f"Total data size: {total_size_gb:.6f} GB") + + # Determine torch device + torch_device = None + if device == "npu": + torch_device = "npu:0" + elif device == "gpu": + torch_device = "cuda:0" + + fields = {} + for i in range(field_num): + field_name = f"field_{i}" + + if i % 2 == 0: + # Tensor field + tensor_data = torch.randn(batch_size, seq_length, dtype=torch.float32, device=torch_device) + fields[field_name] = tensor_data + else: + # NonTensorData field + str_length = 1024 + non_tensor_data = [ + "".join( + random.choices( + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789", + k=str_length, + ) + ) + for _ in range(batch_size) + ] + fields[field_name] = NonTensorData(data=non_tensor_data, batch_size=(batch_size,), device=None) + + batch_size_tuple = (batch_size,) + prompt_batch = TensorDict( + fields, + batch_size=batch_size_tuple, + ) + + return prompt_batch, total_size_gb + + +@ray.remote +class TQClientActor: + """Ray actor that holds a TransferQueueClient.""" + + def __init__(self, client_id: str, controller_info: Any): + self.client = TransferQueueClient( + client_id=client_id, + controller_info=controller_info, + ) + self.prompt_meta = None + self.test_data = None + self.total_data_size_gb = 0.0 + + def initialize_storage_manager(self, manager_type: str, 
config: dict[str, Any]) -> None: + """Initialize the storage manager with given config.""" + self.client.initialize_storage_manager(manager_type=manager_type, config=config) + + def create_complex_test_case( + self, + batch_size: int | None = None, + seq_length: int | None = None, + field_num: int | None = None, + device: str = "cpu", + ) -> tuple[list[str], float]: + """Create test case on the actor.""" + self.test_data, self.total_data_size_gb = create_complex_test_case(batch_size, seq_length, field_num, device) + return list(self.test_data.keys()), self.total_data_size_gb + + def put(self, partition_id: str) -> None: + """Put data to storage.""" + self.client.put(data=self.test_data, partition_id=partition_id) + + def get_meta( + self, + data_fields: list[str], + batch_size: int, + partition_id: str, + task_name: str | None = None, + sampling_config: dict[str, Any] | None = None, + ) -> Any: + """Get metadata from controller.""" + self.prompt_meta = self.client.get_meta( + data_fields=data_fields, + batch_size=batch_size, + partition_id=partition_id, + task_name=task_name, + sampling_config=sampling_config, + ) + return self.prompt_meta + + def get_data(self) -> None: + """Get data from storage using cached metadata.""" + self.client.get_data(self.prompt_meta) + + +class TQThroughputTester: + """Main throughput tester for TransferQueue backends.""" + + def __init__( + self, + backend: str, + client_placement: str, + backend_config: dict[str, Any], + device: str, + global_batch_size: int, + field_num: int, + seq_len: int, + num_global_batch: int, + head_node_ip: str, + worker_node_ip: str | None = None, + ): + """Initialize the throughput tester. 
+ + Args: + backend: Backend type ("default", "yuanrong", "mooncake") + client_placement: Client placement mode ("intra_node" or "inter_node") + backend_config: Backend configuration dictionary + device: Device type ("cpu", "npu", "gpu") + global_batch_size: Global batch size + field_num: Number of fields + seq_len: Sequence length + num_global_batch: Number of global batches + head_node_ip: Head node IP address + worker_node_ip: Worker node IP address (required for inter_node) + """ + self.backend = backend + self.client_placement = client_placement + self.backend_config = backend_config + self.device = device + self.global_batch_size = global_batch_size + self.field_num = field_num + self.seq_len = seq_len + self.num_global_batch = num_global_batch + self.head_node_ip = head_node_ip + self.worker_node_ip = worker_node_ip + + # Validate arguments + self._validate_args() + + # Determine manager type and prepare configs + self.manager_type = self._get_manager_type() + self.writer_config, self.reader_config = self._prepare_backend_configs() + + # Initialize the test infrastructure + self._initialize_data_system() + self._initialize_clients() + + def _validate_args(self) -> None: + """Validate input arguments.""" + if self.client_placement == "inter_node" and self.worker_node_ip is None: + raise ValueError("worker_node_ip is required for inter_node client placement") + if self.backend == "default": + storage_unit_placement = self.backend_config.get("storage_unit_placement", "normal") + if storage_unit_placement == "remote" and self.worker_node_ip is None: + raise ValueError("worker_node_ip is required for remote storage_unit_placement") + + def _get_manager_type(self) -> str: + """Get the storage manager type based on backend.""" + if self.backend == "default": + return "AsyncSimpleStorageManager" + elif self.backend == "yuanrong": + return "YuanrongStorageManager" + elif self.backend == "mooncake": + return "MooncakeStorageManager" + else: + raise 
ValueError(f"Unknown backend: {self.backend}") + + def _prepare_backend_configs(self) -> tuple[dict[str, Any], dict[str, Any]]: + """Prepare writer and reader backend configs. + + Returns: + Tuple of (writer_config, reader_config) + """ + # Set client_name based on backend + base_config = self.backend_config.copy() + if self.backend == "yuanrong": + base_config["client_name"] = "YuanrongStorageClient" + elif self.backend == "mooncake": + base_config["client_name"] = "MooncakeStoreClient" + + writer_config = base_config.copy() + reader_config = base_config.copy() + + if self.client_placement == "inter_node": + if self.backend == "yuanrong": + writer_config["host"] = self.head_node_ip + reader_config["host"] = self.worker_node_ip + elif self.backend == "mooncake": + writer_config["local_hostname"] = self.head_node_ip + reader_config["local_hostname"] = self.worker_node_ip + + return writer_config, reader_config + + def _initialize_data_system(self) -> None: + """Initialize controller and storage units if needed.""" + # Initialize controller + self.data_system_controller = TransferQueueController.remote() + logger.info("TransferQueueController has been created.") + self.data_system_controller_info = process_zmq_server_info(self.data_system_controller) + + # Initialize storage units for default backend + if self.backend == "default": + self._initialize_storage_units() + + def _initialize_storage_units(self) -> None: + """Initialize SimpleStorageUnits for default backend.""" + num_data_storage_units = self.backend_config.get("num_data_storage_units", 8) + storage_unit_placement = self.backend_config.get("storage_unit_placement", "normal") + total_storage_size = self.global_batch_size * self.num_global_batch + + self.data_system_storage_units = {} + + if storage_unit_placement == "remote": + # Remote mode: create all storage units on worker node + for storage_unit_rank in range(num_data_storage_units): + storage_node = SimpleStorageUnit.options( + num_cpus=10, + 
resources={f"node:{self.worker_node_ip}": 0.001}, + ).remote(storage_unit_size=3 * math.ceil(total_storage_size / num_data_storage_units)) + self.data_system_storage_units[storage_unit_rank] = storage_node + logger.info( + f"StorageUnit #0 ~ #{num_data_storage_units - 1} has been created on worker node {self.worker_node_ip}." + ) + else: + # Normal mode: create storage units using placement group + storage_placement_group = get_placement_group(num_data_storage_units, num_cpus_per_actor=1) + for storage_unit_rank in range(num_data_storage_units): + storage_node = SimpleStorageUnit.options( + placement_group=storage_placement_group, + placement_group_bundle_index=storage_unit_rank, + ).remote(storage_unit_size=3 * math.ceil(total_storage_size / num_data_storage_units)) + self.data_system_storage_units[storage_unit_rank] = storage_node + logger.info(f"StorageUnit #0 ~ #{num_data_storage_units - 1} has been created.") + + self.data_system_storage_unit_infos = process_zmq_server_info(self.data_system_storage_units) + # Add storage unit infos to backend configs + self.writer_config["zmq_info"] = self.data_system_storage_unit_infos + self.reader_config["zmq_info"] = self.data_system_storage_unit_infos + + def _initialize_clients(self) -> None: + """Initialize writer and reader TQClientActors.""" + # Determine node placement + if self.client_placement == "intra_node": + writer_node = reader_node = self.head_node_ip + else: + writer_node = self.head_node_ip + reader_node = self.worker_node_ip + + logger.info(f"Writer is on {writer_node}, Reader is on {reader_node}") + + # Prepare device resource + device_resource = {} + if self.device in ["npu", "gpu"]: + device_resource = {self.device: 1} + + # Create writer and reader actors + self.writer = TQClientActor.options( + resources={f"node:{writer_node}": 0.001, **device_resource}, + ).remote("writer", self.data_system_controller_info) + + self.reader = TQClientActor.options( + resources={f"node:{reader_node}": 0.001, 
**device_resource}, + ).remote("reader", self.data_system_controller_info) + + # Initialize storage managers + logger.info(f"Using {self.manager_type} as storage backend.") + + w = self.writer.initialize_storage_manager.remote(manager_type=self.manager_type, config=self.writer_config) + r = self.reader.initialize_storage_manager.remote(manager_type=self.manager_type, config=self.reader_config) + ray.get([w, r]) + + def run_throughput_test(self) -> None: + """Run the throughput test and print results.""" + logger.info("Creating large batch for throughput test...") + start_create_data = time.time() + data_fields, total_data_size_gb = ray.get( + self.writer.create_complex_test_case.remote( + batch_size=self.global_batch_size, + seq_length=self.seq_len, + field_num=self.field_num, + device=self.device, + ) + ) + end_create_data = time.time() + logger.info(f"Data creation time: {end_create_data - start_create_data:.8f}s") + + # PUT operation + logger.info("Starting PUT operation...") + start_put = time.time() + ray.get(self.writer.put.remote(partition_id="train_0")) + end_put = time.time() + put_time = end_put - start_put + put_throughput_gbps = (total_data_size_gb * 8) / put_time + put_throughput_gbs = total_data_size_gb / put_time + logger.info(f"put cost time: {put_time:.8f}s") + logger.info(f"PUT Throughput: {put_throughput_gbps:.8f} Gb/s ({put_throughput_gbs:.8f} GB/s)") + + time.sleep(2) + + # GET_META operation + logger.info("Starting GET_META operation...") + start_get_meta = time.time() + ray.wait( + [ + self.reader.get_meta.remote( + data_fields=list(data_fields), + batch_size=self.global_batch_size, + partition_id="train_0", + task_name="generate_sequences", + ) + ] + ) + end_get_meta = time.time() + logger.info(f"get_meta cost time: {end_get_meta - start_get_meta:.8f}s") + + time.sleep(2) + + # GET_DATA operation + logger.info("Starting GET_DATA operation...") + start_get_data = time.time() + ray.get(self.reader.get_data.remote()) + end_get_data = 
time.time() + get_time = end_get_data - start_get_data + get_throughput_gbps = (total_data_size_gb * 8) / get_time + get_throughput_gbs = total_data_size_gb / get_time + + logger.info(f"get_data cost time: {get_time:.8f}s") + logger.info(f"GET Throughput: {get_throughput_gbps:.8f} Gb/s ({get_throughput_gbs:.8f} GB/s)") + + # Print summary + total_throughput_gbps = (total_data_size_gb * 16) / (put_time + get_time) + total_throughput_gbs = (total_data_size_gb * 2) / (put_time + get_time) + + logger.info("=" * 60) + logger.info("THROUGHPUT TEST SUMMARY") + logger.info("=" * 60) + logger.info(f"Backend: {self.backend}") + logger.info(f"Client Placement: {self.client_placement}") + logger.info(f"Device: {self.device}") + logger.info(f"Total Data Size: {total_data_size_gb:.6f} GB") + logger.info(f"PUT Time: {put_time:.8f}s") + logger.info(f"GET Time: {get_time:.8f}s") + logger.info(f"PUT Throughput: {put_throughput_gbps:.8f} Gb/s ({put_throughput_gbs:.8f} GB/s)") + logger.info(f"GET Throughput: {get_throughput_gbps:.8f} Gb/s ({get_throughput_gbs:.8f} GB/s)") + logger.info(f"Total Throughput: {total_throughput_gbps:.8f} Gb/s ({total_throughput_gbs:.8f} GB/s)") + logger.info("=" * 60) + + +def load_backend_config(config_path: str | None, backend: str) -> dict[str, Any]: + """Load backend config from YAML file or use defaults. 
+ + Args: + config_path: Path to YAML config file (optional) + backend: Backend type for default config + + Returns: + Backend configuration dictionary + """ + if config_path is not None: + config = OmegaConf.load(config_path) + return OmegaConf.to_container(config, resolve=True) + + # Default configs + if backend == "default": + return {"num_data_storage_units": 8, "storage_unit_placement": "normal"} + elif backend == "yuanrong": + return { + "host": "127.0.0.1", + "port": 31501, + "enable_yr_npu_transport": False, + } + elif backend == "mooncake": + return { + "local_hostname": "127.0.0.1", + "metadata_server": "127.0.0.1:8080", + "master_server_address": "127.0.0.1:8081", + } + else: + return {} + + +def main() -> None: + """Main entry point for the perftest script.""" + parser = argparse.ArgumentParser(description="TransferQueue Throughput Test") + parser.add_argument( + "--backend", + type=str, + default="default", + choices=["default", "yuanrong", "mooncake"], + help="Backend type to test (default: default)", + ) + parser.add_argument( + "--client_placement", + type=str, + default="intra_node", + choices=["intra_node", "inter_node"], + help="Client placement mode (default: intra_node)", + ) + parser.add_argument( + "--backend_config", + type=str, + default=None, + help="Path to backend config YAML file (optional)", + ) + parser.add_argument( + "--device", + type=str, + default="cpu", + choices=["cpu", "npu", "gpu"], + help="Device to use (default: cpu)", + ) + parser.add_argument( + "--global_batch_size", + type=int, + default=1024, + help="Global batch size (default: 1024)", + ) + parser.add_argument( + "--field_num", + type=int, + default=10, + help="Number of fields (default: 10)", + ) + parser.add_argument( + "--seq_len", + type=int, + default=8192, + help="Sequence length (default: 8192)", + ) + parser.add_argument( + "--num_global_batch", + type=int, + default=1, + help="Number of global batches (default: 1)", + ) + parser.add_argument( + 
"--head_node_ip", + type=str, + required=True, + help="Head node IP address", + ) + parser.add_argument( + "--worker_node_ip", + type=str, + default=None, + help="Worker node IP address (required for inter_node)", + ) + parser.add_argument( + "--ray_address", + type=str, + default="auto", + help="Ray cluster address (default: auto)", + ) + + args = parser.parse_args() + + # Load backend config + backend_config = load_backend_config(args.backend_config, args.backend) + + # Initialize Ray + logger.info(f"Connecting to Ray cluster at {args.ray_address}") + ray.init(address=args.ray_address) + + # Create and run tester + tester = TQThroughputTester( + backend=args.backend, + client_placement=args.client_placement, + backend_config=backend_config, + device=args.device, + global_batch_size=args.global_batch_size, + field_num=args.field_num, + seq_len=args.seq_len, + num_global_batch=args.num_global_batch, + head_node_ip=args.head_node_ip, + worker_node_ip=args.worker_node_ip, + ) + + # Run test multiple times for consistent results + print("-" * 60) + tester.run_throughput_test() + print("-" * 60) + tester.run_throughput_test() + print("-" * 60) + tester.run_throughput_test() + + logger.info("Throughput test completed successfully!") + + +if __name__ == "__main__": + main() From bf8478a499402d9c25bee135667420d5d2412595 Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Fri, 20 Mar 2026 10:35:13 +0800 Subject: [PATCH 02/29] fixed review comments Signed-off-by: tianyi-ge --- scripts/README_PERFTEST.md | 5 +++++ scripts/perftest.py | 28 +++++++++++++++++----------- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/scripts/README_PERFTEST.md b/scripts/README_PERFTEST.md index b01f37f..887f723 100644 --- a/scripts/README_PERFTEST.md +++ b/scripts/README_PERFTEST.md @@ -71,6 +71,11 @@ Sample config files are in `configs/`: master_server_address: 127.0.0.1:8081 ``` +For device support of each backend, +- `default` backend supports `cpu` +- `yuanrong` supports `cpu` 
and `npu` +- `mooncake` supports `cpu` and `gpu` + ## Examples ### Intra-node test with default backend diff --git a/scripts/perftest.py b/scripts/perftest.py index ca10775..ff04ce1 100644 --- a/scripts/perftest.py +++ b/scripts/perftest.py @@ -318,19 +318,25 @@ def _initialize_clients(self) -> None: logger.info(f"Writer is on {writer_node}, Reader is on {reader_node}") - # Prepare device resource - device_resource = {} - if self.device in ["npu", "gpu"]: - device_resource = {self.device: 1} + # Prepare base options + writer_options = { + "resources": {f"node:{writer_node}": 0.001}, + } + reader_options = { + "resources": {f"node:{reader_node}": 0.001}, + } - # Create writer and reader actors - self.writer = TQClientActor.options( - resources={f"node:{writer_node}": 0.001, **device_resource}, - ).remote("writer", self.data_system_controller_info) + # Add device-specific options + if self.device == "gpu": + writer_options["num_gpus"] = 1 + reader_options["num_gpus"] = 1 + elif self.device == "npu": + writer_options["resources"]["NPU"] = 1 + reader_options["resources"]["NPU"] = 1 - self.reader = TQClientActor.options( - resources={f"node:{reader_node}": 0.001, **device_resource}, - ).remote("reader", self.data_system_controller_info) + # Create writer and reader actors + self.writer = TQClientActor.options(**writer_options).remote("writer", self.data_system_controller_info) + self.reader = TQClientActor.options(**reader_options).remote("reader", self.data_system_controller_info) # Initialize storage managers logger.info(f"Using {self.manager_type} as storage backend.") From 06ffebb32fb7b6ca94d290b3538ab46780041953 Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Fri, 20 Mar 2026 17:08:02 +0800 Subject: [PATCH 03/29] 1. adjust default storage unit number to 1 2. 
remove old perf test script Signed-off-by: tianyi-ge --- scripts/configs/transferqueue.yaml | 2 +- scripts/performance_test.py | 350 ----------------------------- scripts/perftest.py | 4 +- 3 files changed, 3 insertions(+), 353 deletions(-) delete mode 100644 scripts/performance_test.py diff --git a/scripts/configs/transferqueue.yaml b/scripts/configs/transferqueue.yaml index 9f55742..3e5acd8 100644 --- a/scripts/configs/transferqueue.yaml +++ b/scripts/configs/transferqueue.yaml @@ -1,5 +1,5 @@ # TransferQueue (default) backend configuration -num_data_storage_units: 8 +num_data_storage_units: 1 # storage_unit_placement: "normal" (default) or "remote" # - normal: create storage units on current node using placement group # - remote: create all storage units on WORKER_NODE_IP diff --git a/scripts/performance_test.py b/scripts/performance_test.py deleted file mode 100644 index 14d06a4..0000000 --- a/scripts/performance_test.py +++ /dev/null @@ -1,350 +0,0 @@ -# Copyright 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2025 The TransferQueue Team -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import asyncio -import logging -import math -import random -import sys -import time -from pathlib import Path - -import ray -import torch -from omegaconf import OmegaConf -from tensordict import TensorDict -from tensordict.tensorclass import NonTensorData - -parent_dir = Path(__file__).resolve().parent.parent.parent -sys.path.append(str(parent_dir)) - -from transfer_queue.client import TransferQueueClient # noqa: E402 -from transfer_queue.controller import TransferQueueController # noqa: E402 -from transfer_queue.storage.simple_backend import SimpleStorageUnit # noqa: E402 -from transfer_queue.utils.common import get_placement_group # noqa: E402 -from transfer_queue.utils.zmq_utils import process_zmq_server_info # noqa: E402 - -logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") -logger = logging.getLogger(__name__) - -######################################################################## -# Please set up Ray cluster before running this script -######################################################################## -HEAD_NODE_IP = "NodeA" # Replace with your head node IP -WORKER_NODE_IP = "NodeB" # Replace with your worker node IP - - -# This is the Medium setting of the performance test. 
-# You can modify the parameters according to -# https://www.yuque.com/haomingzi-lfse7/lhp4el/tml8ke0zkgn6roey?singleDoc# -config_str = """ - global_batch_size: 1024 - seq_length: 8192 - field_num: 10 - num_global_batch: 1 - num_data_storage_units: 8 -""" -dict_conf = OmegaConf.create(config_str) - - -def create_complex_test_case(batch_size=None, seq_length=None, field_num=None): - tensor_field_size_bytes = batch_size * seq_length * 4 - tensor_field_size_gb = tensor_field_size_bytes / (1024**3) - - num_tensor_fields = (field_num + 1) // 2 - num_nontensor_fields = field_num // 2 - - total_tensor_size_gb = tensor_field_size_gb * num_tensor_fields - total_nontensor_size_gb = (batch_size * 1024 / (1024**3)) * num_nontensor_fields - total_size_gb = total_tensor_size_gb + total_nontensor_size_gb - - logger.info(f"Total data size: {total_size_gb:.6f} GB") - - fields = {} - - for i in range(field_num): - field_name = f"field_{i}" - - if i % 2 == 0: - # Tensor - tensor_data = torch.randn(batch_size, seq_length, dtype=torch.float32) - fields[field_name] = tensor_data - else: - # NonTensorData - str_length = 1024 - non_tensor_data = [ - "".join(random.choices("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789", k=str_length)) - for _ in range(batch_size) - ] - fields[field_name] = NonTensorData(data=non_tensor_data, batch_size=(batch_size,), device=None) - - batch_size_tuple = (batch_size,) - prompt_batch = TensorDict( - fields, - batch_size=batch_size_tuple, - device=None, - ) - - return prompt_batch, total_size_gb - - -@ray.remote -class RemoteDataStoreObjStore: - def __init__(self): - pass - - def get_data(self, data_handler): - start_get = time.time() - ray.get(data_handler) - end_get = time.time() - - get_time = end_get - start_get - return get_time - - -@ray.remote -class RemoteDataStoreRemote: - def __init__(self): - self.stored_data = None - - def put_data(self, data): - self.stored_data = data - - def get_data(self): - return self.stored_data - - def 
clear_data(self): - self.stored_data = None - - -class RayBandwidthTester: - def __init__(self, config, test_mode="obj_store"): - self.config = config - self.test_mode = test_mode - - if test_mode == "obj_store": - RemoteDataStore = RemoteDataStoreObjStore - else: - RemoteDataStore = RemoteDataStoreRemote - - self.remote_store = RemoteDataStore.options(num_cpus=10, resources={f"node:{WORKER_NODE_IP}": 0.001}).remote() - - logger.info(f"Remote data store created on worker node {WORKER_NODE_IP}") - - def run_bandwidth_test(self): - start_create_data = time.time() - test_data, total_data_size_gb = create_complex_test_case( - batch_size=self.config.global_batch_size, seq_length=self.config.seq_length, field_num=self.config.field_num - ) - end_create_data = time.time() - logger.info(f"Data creation time: {end_create_data - start_create_data:.8f}s") - - if self.test_mode == "obj_store": - self._run_obj_store_test(test_data, total_data_size_gb) - else: - self._run_remote_test(test_data, total_data_size_gb) - - def _run_obj_store_test(self, test_data, total_data_size_gb): - start_time = time.time() - data_handler = ray.put(test_data) - ray.get(self.remote_store.get_data.remote([data_handler])) - end_time = time.time() - - transfer_time = end_time - start_time - throughput = (total_data_size_gb * 8) / transfer_time - - logger.info("=" * 60) - logger.info("RAY OBJECT STORE BANDWIDTH TEST SUMMARY") - logger.info("=" * 60) - logger.info(f"Data Size: {(total_data_size_gb):.6f} GB") - logger.info(f"Transfer Time: {transfer_time:.8f}s") - logger.info(f"Throughput: {throughput:.8f} Gb/s") - - def _run_remote_test(self, test_data, total_data_size_gb): - logger.info("Starting Ray PUT bandwidth test...") - start_put = time.time() - ray.get(self.remote_store.put_data.remote(test_data)) - end_put = time.time() - put_time = end_put - start_put - logger.info(f"PUT Time: {put_time:.8f}s") - - time.sleep(2) - - logger.info("Starting Ray GET bandwidth test...") - start_get = time.time() - 
ray.get(self.remote_store.get_data.remote()) - end_get = time.time() - get_time = end_get - start_get - logger.info(f"GET Time: {get_time:.8f}s") - - ray.get(self.remote_store.clear_data.remote()) - - put_throughput = (total_data_size_gb * 8) / put_time - get_throughput = (total_data_size_gb * 8) / get_time - - logger.info("=" * 60) - logger.info("RAY REMOTE ACTOR BANDWIDTH TEST SUMMARY") - logger.info("=" * 60) - logger.info(f"Data Size: {total_data_size_gb:.6f} GB") - logger.info(f"PUT Time: {put_time:.8f}s") - logger.info(f"GET Time: {get_time:.8f}s") - logger.info(f"PUT Throughput (Head->Worker): {put_throughput:.8f} Gb/s") - logger.info(f"GET Throughput (Worker->Head): {get_throughput:.8f} Gb/s") - logger.info(f"Round-trip Average Throughput: {total_data_size_gb * 16 / (put_time + get_time):.8f} Gb/s") - - -class TQBandwidthTester: - def __init__(self, config, remote_mode=False): - self.config = config - self.remote_mode = remote_mode - self.data_system_client = self._initialize_data_system() - - def _initialize_data_system(self): - total_storage_size = self.config.global_batch_size * self.config.num_global_batch - self.data_system_storage_units = {} - - if self.remote_mode: - for storage_unit_rank in range(self.config.num_data_storage_units): - storage_node = SimpleStorageUnit.options( - num_cpus=10, - resources={f"node:{WORKER_NODE_IP}": 0.001}, - ).remote(storage_unit_size=math.ceil(total_storage_size / self.config.num_data_storage_units)) - self.data_system_storage_units[storage_unit_rank] = storage_node - else: - storage_placement_group = get_placement_group(self.config.num_data_storage_units, num_cpus_per_actor=10) - for storage_unit_rank in range(self.config.num_data_storage_units): - storage_node = SimpleStorageUnit.options( - placement_group=storage_placement_group, - placement_group_bundle_index=storage_unit_rank, - ).remote(storage_unit_size=math.ceil(total_storage_size / self.config.num_data_storage_units)) - 
self.data_system_storage_units[storage_unit_rank] = storage_node - - logger.info(f"TransferQueueStorageSimpleUnit #0 ~ #{storage_unit_rank} has been created.") - - self.data_system_controller = TransferQueueController.remote() - logger.info("TransferQueueController has been created.") - - self.data_system_controller_info = process_zmq_server_info(self.data_system_controller) - self.data_system_storage_unit_infos = process_zmq_server_info(self.data_system_storage_units) - - tq_config = OmegaConf.create({}, flags={"allow_objects": True}) - tq_config.controller_info = self.data_system_controller_info - tq_config.storage_unit_infos = self.data_system_storage_unit_infos - self.config = OmegaConf.merge(tq_config, self.config) - - self.data_system_client = TransferQueueClient( - client_id="Trainer", controller_info=self.data_system_controller_info - ) - self.data_system_client.initialize_storage_manager(manager_type="AsyncSimpleStorageManager", config=self.config) - return self.data_system_client - - def run_bandwidth_test(self): - logger.info("Creating large batch for bandwidth test...") - start_create_data = time.time() - big_input_ids, total_data_size_gb = create_complex_test_case( - batch_size=self.config.global_batch_size, seq_length=self.config.seq_length, field_num=self.config.field_num - ) - end_create_data = time.time() - logger.info(f"Data creation time: {end_create_data - start_create_data:.8f}s") - - logger.info("Starting PUT operation...") - start_async_put = time.time() - asyncio.run(self.data_system_client.async_put(data=big_input_ids, partition_id="train_0")) - end_async_put = time.time() - put_time = end_async_put - start_async_put - - put_throughput_gbps = (total_data_size_gb * 8) / put_time - logger.info(f"async_put cost time: {put_time:.8f}s") - logger.info(f"PUT Throughput: {put_throughput_gbps:.8f} Gb/s") - - time.sleep(2) - - logger.info("Starting GET_META operation...") - start_async_get_meta = time.time() - prompt_meta = asyncio.run( - 
self.data_system_client.async_get_meta( - data_fields=list(big_input_ids.keys()), - batch_size=big_input_ids.size(0), - partition_id="train_0", - task_name="generate_sequences", - ) - ) - end_async_get_meta = time.time() - logger.info(f"async_get_meta cost time: {end_async_get_meta - start_async_get_meta:.8f}s") - - time.sleep(2) - - logger.info("Starting GET_DATA operation...") - start_async_get_data = time.time() - asyncio.run(self.data_system_client.async_get_data(prompt_meta)) - end_async_get_data = time.time() - get_time = end_async_get_data - start_async_get_data - get_throughput_gbps = (total_data_size_gb * 8) / get_time - - logger.info(f"async_get_data cost time: {get_time:.8f}s") - logger.info(f"GET Throughput: {get_throughput_gbps:.8f} Gb/s") - - mode_name = "TQ REMOTE" if self.remote_mode else "TQ NORMAL" - logger.info("=" * 60) - logger.info(f"{mode_name} BANDWIDTH TEST SUMMARY") - logger.info("=" * 60) - logger.info(f"Total Data Size: {total_data_size_gb:.6f} GB") - logger.info(f"PUT Time: {put_time:.8f}s") - logger.info(f"GET Time: {get_time:.8f}s") - logger.info(f"PUT Throughput: {put_throughput_gbps:.8f} Gb/s") - logger.info(f"GET Throughput: {get_throughput_gbps:.8f} Gb/s") - logger.info(f"Network Round-trip Throughput: {(total_data_size_gb * 16) / (put_time + get_time):.8f} Gb/s") - - -def main(): - if len(sys.argv) < 2: - print("Usage: python performance_test.py ") - print("Available test modes:") - print(" ray-obj-store - Ray Object Store bandwidth test") - print(" ray-remote - Ray Remote Actor bandwidth test") - print(" tq-normal - TQ Normal mode bandwidth test") - print(" tq-remote - TQ Remote mode bandwidth test") - return - - test_mode = sys.argv[1] - - if test_mode == "ray-obj-store": - logger.info("Starting Ray Object Store bandwidth test") - tester = RayBandwidthTester(config=dict_conf, test_mode="obj_store") - tester.run_bandwidth_test() - logger.info("Ray Object Store bandwidth test completed successfully!") - - elif test_mode == 
"ray-remote": - logger.info("Starting Ray Remote Actor bandwidth test") - tester = RayBandwidthTester(config=dict_conf, test_mode="remote") - tester.run_bandwidth_test() - logger.info("Ray Remote Actor bandwidth test completed successfully!") - - elif test_mode in ["tq-normal", "tq-remote"]: - remote_mode = test_mode == "tq-remote" - mode_name = "TQ Remote" if remote_mode else "TQ Normal" - logger.info(f"Starting {mode_name} bandwidth test") - - tester = TQBandwidthTester(config=dict_conf, remote_mode=remote_mode) - tester.run_bandwidth_test() - logger.info(f"{mode_name} bandwidth test completed successfully!") - - else: - print(f"Unknown test mode: {test_mode}") - print("Available test modes: ray-obj-store, ray-remote, tq-normal, tq-remote") - - -if __name__ == "__main__": - main() diff --git a/scripts/perftest.py b/scripts/perftest.py index ff04ce1..db7d9bf 100644 --- a/scripts/perftest.py +++ b/scripts/perftest.py @@ -284,7 +284,7 @@ def _initialize_storage_units(self) -> None: # Remote mode: create all storage units on worker node for storage_unit_rank in range(num_data_storage_units): storage_node = SimpleStorageUnit.options( - num_cpus=10, + num_cpus=1, resources={f"node:{self.worker_node_ip}": 0.001}, ).remote(storage_unit_size=3 * math.ceil(total_storage_size / num_data_storage_units)) self.data_system_storage_units[storage_unit_rank] = storage_node @@ -438,7 +438,7 @@ def load_backend_config(config_path: str | None, backend: str) -> dict[str, Any] # Default configs if backend == "default": - return {"num_data_storage_units": 8, "storage_unit_placement": "normal"} + return {"num_data_storage_units": 1, "storage_unit_placement": "normal"} elif backend == "yuanrong": return { "host": "127.0.0.1", From 83c9fb01e92e7b9e3fc677547333902dfbd692c4 Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Mon, 23 Mar 2026 15:08:47 +0800 Subject: [PATCH 04/29] 1. the current `backend` arg is "default", "yuanrong", and "mooncake". 
use "SimpleStorage", "Yuanrong", "MooncakeStore" instead 2. do not support storage_unit_placement for simple storage any more 3. output test results to csv 4. move client_placement to yuanrong-specific config yaml. if backend is not yuanrong, can only be intra_node. if backend is yuanrong, inter node by default 5. remove --client_placement arg. put it into yuanrong.yaml 6. use tq.init(config) to initialize TransferQueueClient in TQClientActor 7. remove the non-tensor part when create_complex_test_case. only create tensors in tensordict. rename create_complex_test_case to create_test_case 8. put perftest.py, .md, configs/ under scripts/performance_test. 9. add default simple storage test to a new github workflow action 10. extract the test times 3 in _initialize_storage_units as a constant, and run run_throughput_test in a for loop 11. for TQClientActor, put, get_meta, and get_data, refactor them with kv interface Signed-off-by: tianyi-ge --- .github/workflows/perftest.yml | 58 ++ scripts/configs/transferqueue.yaml | 6 - .../{ => performance_test}/README_PERFTEST.md | 70 ++- .../configs/mooncake_store.yaml} | 0 .../configs/simple_storage.yaml | 2 + .../configs/yuanrong.yaml | 1 + scripts/performance_test/perftest.py | 563 +++++++++++++++++ scripts/perftest.py | 566 ------------------ 8 files changed, 670 insertions(+), 596 deletions(-) create mode 100644 .github/workflows/perftest.yml delete mode 100644 scripts/configs/transferqueue.yaml rename scripts/{ => performance_test}/README_PERFTEST.md (52%) rename scripts/{configs/mooncake.yaml => performance_test/configs/mooncake_store.yaml} (100%) create mode 100644 scripts/performance_test/configs/simple_storage.yaml rename scripts/{ => performance_test}/configs/yuanrong.yaml (76%) create mode 100644 scripts/performance_test/perftest.py delete mode 100644 scripts/perftest.py diff --git a/.github/workflows/perftest.yml b/.github/workflows/perftest.yml new file mode 100644 index 0000000..9356c02 --- /dev/null +++ 
b/.github/workflows/perftest.yml @@ -0,0 +1,58 @@ +# This workflow runs the SimpleStorage performance test +name: Performance Test + +on: + push: + branches: + - main + - v0.* + pull_request: + branches: + - main + - v0.* + +jobs: + perftest: + runs-on: ubuntu-latest + timeout-minutes: 30 + strategy: + fail-fast: false + matrix: + python-version: ["3.11"] + + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu + pip install -e ".[test,build,yuanrong]" + - name: Start Ray cluster + run: | + # Get the host IP + HOST_IP=$(hostname -I | awk '{print $1}') + echo "Host IP: $HOST_IP" + # Start Ray with node resource + ray start --head --resources="{\"node:$HOST_IP\":1}" + - name: Run SimpleStorage performance test + run: | + # Get the host IP + HOST_IP=$(hostname -I | awk '{print $1}') + echo "Host IP: $HOST_IP" + # Run the perftest with small batch size for quick test + cd scripts/performance_test + python perftest.py \ + --backend=SimpleStorage \ + --device=cpu \ + --global_batch_size=128 \ + --field_num=4 \ + --seq_len=1024 \ + --head_node_ip=$HOST_IP + - name: Stop Ray cluster + run: | + ray stop + if: always() diff --git a/scripts/configs/transferqueue.yaml b/scripts/configs/transferqueue.yaml deleted file mode 100644 index 3e5acd8..0000000 --- a/scripts/configs/transferqueue.yaml +++ /dev/null @@ -1,6 +0,0 @@ -# TransferQueue (default) backend configuration -num_data_storage_units: 1 -# storage_unit_placement: "normal" (default) or "remote" -# - normal: create storage units on current node using placement group -# - remote: create all storage units on WORKER_NODE_IP -storage_unit_placement: 
normal diff --git a/scripts/README_PERFTEST.md b/scripts/performance_test/README_PERFTEST.md similarity index 52% rename from scripts/README_PERFTEST.md rename to scripts/performance_test/README_PERFTEST.md index 887f723..0a82ce6 100644 --- a/scripts/README_PERFTEST.md +++ b/scripts/performance_test/README_PERFTEST.md @@ -13,14 +13,13 @@ This script runs throughput tests for TransferQueue with different backends. ray start --address=192.168.0.1 --resources='{"node:192.168.0.2":1}' ``` -2. Start the backend service (yuanrong, mooncake, etc.) if testing non-default backends. +2. Start the backend service (Yuanrong, MooncakeStore, etc.) if testing non-SimpleStorage backends. ## Usage ```bash python perftest.py \ - --backend=[default|yuanrong|mooncake] \ - --client_placement=[intra_node|inter_node] \ + --backend=[SimpleStorage|Yuanrong|MooncakeStore] \ --backend_config=xxx.yaml \ --device=[cpu|npu|gpu] \ --global_batch_size=1024 \ @@ -35,8 +34,7 @@ python perftest.py \ | Argument | Description | Default | |----------|-------------|---------| -| `--backend` | Backend type: default, yuanrong, mooncake | default | -| `--client_placement` | Client placement: intra_node or inter_node | intra_node | +| `--backend` | Backend type: SimpleStorage, Yuanrong, MooncakeStore | SimpleStorage | | `--backend_config` | Path to YAML config file (optional) | None | | `--device` | Device: cpu, npu, gpu | cpu | | `--global_batch_size` | Global batch size | 1024 | @@ -44,17 +42,17 @@ python perftest.py \ | `--seq_len` | Sequence length | 8192 | | `--num_global_batch` | Number of global batches | 1 | | `--head_node_ip` | Head node IP (required) | - | -| `--worker_node_ip` | Worker node IP (required for inter_node) | None | +| `--worker_node_ip` | Worker node IP (required for Yuanrong inter_node) | None | | `--ray_address` | Ray cluster address | auto | +| `--output_csv` | Path to output CSV file (optional) | None | ## Backend Configuration Sample config files are in `configs/`: -- 
**transferqueue.yaml**: Default backend config +- **simple_storage.yaml**: SimpleStorage backend config ```yaml - num_data_storage_units: 8 - storage_unit_placement: normal # or "remote" + num_data_storage_units: 1 ``` - **yuanrong.yaml**: Yuanrong backend config @@ -62,9 +60,10 @@ Sample config files are in `configs/`: host: 127.0.0.1 port: 31501 enable_yr_npu_transport: false + client_placement: inter_node # or "intra_node" ``` -- **mooncake.yaml**: Mooncake backend config +- **mooncake_store.yaml**: MooncakeStore backend config ```yaml local_hostname: 127.0.0.1 metadata_server: 127.0.0.1:8080 @@ -72,36 +71,41 @@ Sample config files are in `configs/`: ``` For device support of each backend, -- `default` backend supports `cpu` -- `yuanrong` supports `cpu` and `npu` -- `mooncake` supports `cpu` and `gpu` +- `SimpleStorage` backend supports `cpu` +- `Yuanrong` supports `cpu` and `npu` +- `MooncakeStore` supports `cpu` and `gpu` + +## Yuanrong Client Placement + +For Yuanrong backend, since `put` is always local-first, we need to start client actors on different nodes to test cross-node transfer. 
The client placement is configured in the YAML file: +- `client_placement: intra_node`: Both writer and reader run on head node +- `client_placement: inter_node`: Writer runs on head node, reader runs on worker node (default) ## Examples -### Intra-node test with default backend +### SimpleStorage backend ```bash -python perftest.py --backend=default --client_placement=intra_node \ +python perftest.py --backend=SimpleStorage \ --head_node_ip=192.168.0.1 ``` -### Inter-node test with yuanrong backend +### Yuanrong backend ```bash -python perftest.py --backend=yuanrong --client_placement=inter_node \ +python perftest.py --backend=Yuanrong \ --backend_config=configs/yuanrong.yaml \ --head_node_ip=192.168.0.1 --worker_node_ip=192.168.0.2 ``` -### Default backend with remote storage units +### NPU device test ```bash -python perftest.py --backend=default --client_placement=intra_node \ - --backend_config=configs/transferqueue.yaml \ - --head_node_ip=192.168.0.1 --worker_node_ip=192.168.0.2 +python perftest.py --backend=Yuanrong --device=npu \ + --head_node_ip=192.168.0.1 ``` -### NPU device test +### Output to CSV ```bash -python perftest.py --backend=mooncake --device=npu \ - --head_node_ip=192.168.0.1 +python perftest.py --backend=SimpleStorage \ + --head_node_ip=192.168.0.1 --output_csv=results.csv ``` ## Output @@ -113,3 +117,21 @@ The test prints: - Total round-trip throughput Throughput is shown in both Gb/s (gigabits per second) and GB/s (gigabytes per second). + +### CSV Output + +When using `--output_csv`, the test writes results to a CSV file with the following columns: +- backend +- client_placement +- device +- total_data_size_gb +- put_time +- get_time +- put_gbit_per_sec +- put_gbyte_per_sec +- get_gbit_per_sec +- get_gbyte_per_sec +- total_gbit_per_sec +- total_gbyte_per_sec + +The test runs 3 iterations and saves all 3 results to the CSV. 
diff --git a/scripts/configs/mooncake.yaml b/scripts/performance_test/configs/mooncake_store.yaml similarity index 100% rename from scripts/configs/mooncake.yaml rename to scripts/performance_test/configs/mooncake_store.yaml diff --git a/scripts/performance_test/configs/simple_storage.yaml b/scripts/performance_test/configs/simple_storage.yaml new file mode 100644 index 0000000..2eb397c --- /dev/null +++ b/scripts/performance_test/configs/simple_storage.yaml @@ -0,0 +1,2 @@ +# TransferQueue (default) backend configuration +num_data_storage_units: 1 diff --git a/scripts/configs/yuanrong.yaml b/scripts/performance_test/configs/yuanrong.yaml similarity index 76% rename from scripts/configs/yuanrong.yaml rename to scripts/performance_test/configs/yuanrong.yaml index 2df1b84..b4c52e5 100644 --- a/scripts/configs/yuanrong.yaml +++ b/scripts/performance_test/configs/yuanrong.yaml @@ -2,3 +2,4 @@ host: 127.0.0.1 port: 31501 enable_yr_npu_transport: false +client_placement: inter_node diff --git a/scripts/performance_test/perftest.py b/scripts/performance_test/perftest.py new file mode 100644 index 0000000..3b7edb3 --- /dev/null +++ b/scripts/performance_test/perftest.py @@ -0,0 +1,563 @@ +#!/usr/bin/env python3 +# Copyright 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2025 The TransferQueue Team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import csv +import logging +import math +import sys +import time +from pathlib import Path +from typing import Any + +import ray +import torch +from omegaconf import OmegaConf +from tensordict import TensorDict + +parent_dir = Path(__file__).resolve().parent.parent.parent +sys.path.append(str(parent_dir)) + +import transfer_queue as tq # noqa: E402 +from transfer_queue.storage.simple_backend import SimpleStorageUnit # noqa: E402 +from transfer_queue.utils.common import get_placement_group # noqa: E402 +from transfer_queue.utils.zmq_utils import process_zmq_server_info # noqa: E402 + +logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") +logger = logging.getLogger(__name__) + +# Constants +NUM_TEST_ITERATIONS = 3 + + +def create_test_case( + batch_size: int | None = None, + seq_length: int | None = None, + field_num: int | None = None, + device: str = "cpu", +) -> tuple[TensorDict, float]: + """Create a test case with tensor fields only. 
+ + Args: + batch_size: Batch size for the test case + seq_length: Sequence length for tensor fields + field_num: Number of fields to create + device: Device to create tensors on ("cpu", "npu", or "gpu") + + Returns: + Tuple of (TensorDict, total_size_gb) + """ + tensor_field_size_bytes = batch_size * seq_length * 4 + tensor_field_size_gb = tensor_field_size_bytes / (1024**3) + + total_size_gb = tensor_field_size_gb * field_num + + logger.info(f"Total data size: {total_size_gb:.6f} GB") + + # Determine torch device + torch_device = None + if device == "npu": + torch_device = "npu:0" + elif device == "gpu": + torch_device = "cuda:0" + + fields = {} + for i in range(field_num): + field_name = f"field_{i}" + tensor_data = torch.randn(batch_size, seq_length, dtype=torch.float32, device=torch_device) + fields[field_name] = tensor_data + + batch_size_tuple = (batch_size,) + prompt_batch = TensorDict( + fields, + batch_size=batch_size_tuple, + ) + + return prompt_batch, total_size_gb + + +@ray.remote +class TQClientActor: + """Ray actor that uses tq.init(config) to initialize.""" + + def __init__(self, base_config: dict[str, Any]): + self.base_config = base_config + self.test_data = None + self.total_data_size_gb = 0.0 + self.test_keys = None + + def initialize(self, zmq_info: Any = None) -> None: + """Initialize transfer_queue with the config.""" + config = OmegaConf.create(self.base_config, flags={"allow_objects": True}) + if zmq_info is not None and self.base_config["backend"]["storage_backend"] == "SimpleStorage": + # Use dict-style assignment to avoid OmegaConf validation + config["backend"]["SimpleStorage"]["zmq_info"] = zmq_info + tq.init(config) + + def create_test_case( + self, + batch_size: int | None = None, + seq_length: int | None = None, + field_num: int | None = None, + device: str = "cpu", + ) -> tuple[list[str], float]: + """Create test case on the actor.""" + self.test_data, self.total_data_size_gb = create_test_case(batch_size, seq_length, field_num, 
device) + # Create keys for each sample in the batch + self.test_keys = [f"test_key_{i}" for i in range(batch_size)] + return list(self.test_data.keys()), self.total_data_size_gb + + def put(self, partition_id: str) -> None: + """Put data to storage using kv_batch_put.""" + tq.kv_batch_put(keys=self.test_keys, partition_id=partition_id, fields=self.test_data) + + def list_keys(self, partition_id: str) -> list[str]: + """List keys in a partition using kv_list.""" + partition_info = tq.kv_list(partition_id=partition_id) + if partition_id in partition_info: + return list(partition_info[partition_id].keys()) + return [] + + def get_data(self, partition_id: str, keys: list[str] | None = None) -> None: + """Get data from storage using kv_batch_get.""" + if keys is None: + keys = self.test_keys + tq.kv_batch_get(keys=keys, partition_id=partition_id) + + +class TQThroughputTester: + """Main throughput tester for TransferQueue backends.""" + + def __init__( + self, + backend: str, + backend_config: dict[str, Any], + device: str, + global_batch_size: int, + field_num: int, + seq_len: int, + num_global_batch: int, + head_node_ip: str, + worker_node_ip: str | None = None, + output_csv: str | None = None, + ): + """Initialize the throughput tester. 
+ + Args: + backend: Backend type ("SimpleStorage", "Yuanrong", "MooncakeStore") + backend_config: Backend configuration dictionary + device: Device type ("cpu", "npu", "gpu") + global_batch_size: Global batch size + field_num: Number of fields + seq_len: Sequence length + num_global_batch: Number of global batches + head_node_ip: Head node IP address + worker_node_ip: Worker node IP address (required for Yuanrong inter_node) + output_csv: Path to output CSV file (optional) + """ + self.backend = backend + self.backend_config = backend_config + self.device = device + self.global_batch_size = global_batch_size + self.field_num = field_num + self.seq_len = seq_len + self.num_global_batch = num_global_batch + self.head_node_ip = head_node_ip + self.worker_node_ip = worker_node_ip + self.output_csv = output_csv + + # Get client_placement from Yuanrong config, default to inter_node + self.client_placement = ( + self.backend_config.get("client_placement", "inter_node") if self.backend == "Yuanrong" else "intra_node" + ) + + # Validate arguments + self._validate_args() + + # Prepare full config for tq.init() + self.base_config, self.zmq_info = self._prepare_configs() + + # Initialize the test infrastructure + self._initialize_data_system() + self._initialize_clients() + + def _validate_args(self) -> None: + """Validate input arguments.""" + # Check worker_node_ip for Yuanrong inter_node + if self.backend == "Yuanrong" and self.client_placement == "inter_node" and self.worker_node_ip is None: + raise ValueError("worker_node_ip is required for Yuanrong with client_placement=inter_node") + + def _prepare_configs(self) -> tuple[dict[str, Any], Any]: + """Prepare the base config and storage units. 
+ + Returns: + Tuple of (base_config, zmq_info) + """ + total_storage_size = self.global_batch_size * self.num_global_batch + + config = { + "controller": { + "sampler": "SequentialSampler", + "polling_mode": False, + }, + "backend": { + "storage_backend": self.backend, + }, + } + + # Set client_name based on backend + if self.backend == "Yuanrong": + self.backend_config["client_name"] = "YuanrongStorageClient" + elif self.backend == "MooncakeStore": + self.backend_config["client_name"] = "MooncakeStoreClient" + + # Add backend-specific config + if self.backend == "SimpleStorage": + config["backend"]["SimpleStorage"] = { + "total_storage_size": total_storage_size, + "num_data_storage_units": self.backend_config.get("num_data_storage_units", 1), + } + elif self.backend == "Yuanrong": + config["backend"]["Yuanrong"] = self.backend_config.copy() + # Remove client_placement from the backend config passed to tq + if "client_placement" in config["backend"]["Yuanrong"]: + del config["backend"]["Yuanrong"]["client_placement"] + elif self.backend == "MooncakeStore": + config["backend"]["MooncakeStore"] = self.backend_config.copy() + + return config, None + + def _initialize_data_system(self) -> None: + """Initialize controller and storage units if needed.""" + # For SimpleStorage, we need to manually create storage units with placement + if self.backend == "SimpleStorage": + self._initialize_storage_units() + + def _initialize_storage_units(self) -> None: + """Initialize SimpleStorageUnits for SimpleStorage backend.""" + num_data_storage_units = self.backend_config.get("num_data_storage_units", 1) + total_storage_size = self.global_batch_size * self.num_global_batch + + self.data_system_storage_units = {} + + storage_placement_group = get_placement_group(num_data_storage_units, num_cpus_per_actor=1) + for storage_unit_rank in range(num_data_storage_units): + storage_node = SimpleStorageUnit.options( + placement_group=storage_placement_group, + 
placement_group_bundle_index=storage_unit_rank, + ).remote(storage_unit_size=NUM_TEST_ITERATIONS * math.ceil(total_storage_size / num_data_storage_units)) + self.data_system_storage_units[storage_unit_rank] = storage_node + logger.info(f"StorageUnit #0 ~ #{num_data_storage_units - 1} has been created.") + + self.zmq_info = process_zmq_server_info(self.data_system_storage_units) + + def _initialize_clients(self) -> None: + """Initialize writer and reader TQClientActors.""" + # Determine node placement + if self.client_placement == "intra_node": + writer_node = reader_node = self.head_node_ip + else: + writer_node = self.head_node_ip + reader_node = self.worker_node_ip + + logger.info(f"Writer is on {writer_node}, Reader is on {reader_node}") + + # Prepare base options + writer_options = { + "resources": {f"node:{writer_node}": 0.001}, + } + reader_options = { + "resources": {f"node:{reader_node}": 0.001}, + } + + # Add device-specific options + if self.device == "gpu": + writer_options["num_gpus"] = 1 + reader_options["num_gpus"] = 1 + elif self.device == "npu": + writer_options["resources"]["NPU"] = 1 + reader_options["resources"]["NPU"] = 1 + + # Create writer and reader actors + self.writer = TQClientActor.options(**writer_options).remote(self.base_config) + self.reader = TQClientActor.options(**reader_options).remote(self.base_config) + + # Initialize transfer_queue + logger.info(f"Using {self.backend} as storage backend.") + + w = self.writer.initialize.remote(self.zmq_info) + r = self.reader.initialize.remote(self.zmq_info) + ray.get([w, r]) + + def run_throughput_test(self) -> dict[str, Any]: + """Run the throughput test and print results. 
+ + Returns: + Dictionary with test results + """ + logger.info("Creating large batch for throughput test...") + start_create_data = time.perf_counter() + data_fields, total_data_size_gb = ray.get( + self.writer.create_test_case.remote( + batch_size=self.global_batch_size, + seq_length=self.seq_len, + field_num=self.field_num, + device=self.device, + ) + ) + end_create_data = time.perf_counter() + logger.info(f"Data creation time: {end_create_data - start_create_data:.8f}s") + + partition_id = "train_0" + + # PUT operation using kv_batch_put + logger.info("Starting PUT operation (kv_batch_put)...") + start_put = time.perf_counter() + ray.get(self.writer.put.remote(partition_id=partition_id)) + end_put = time.perf_counter() + put_time = end_put - start_put + put_gbit_per_sec = (total_data_size_gb * 8) / put_time + put_gbyte_per_sec = total_data_size_gb / put_time + logger.info(f"put cost time: {put_time:.8f}s") + logger.info(f"PUT Throughput: {put_gbit_per_sec:.8f} Gb/s ({put_gbyte_per_sec:.8f} GB/s)") + + time.sleep(2) + + # LIST_KEYS operation using kv_list + logger.info("Starting LIST_KEYS operation (kv_list)...") + start_list = time.perf_counter() + keys = ray.get(self.reader.list_keys.remote(partition_id=partition_id)) + end_list = time.perf_counter() + logger.info(f"list_keys cost time: {end_list - start_list:.8f}s") + logger.info(f"Found {len(keys)} keys") + + time.sleep(2) + + # GET_DATA operation using kv_batch_get + logger.info("Starting GET_DATA operation (kv_batch_get)...") + start_get_data = time.perf_counter() + ray.get(self.reader.get_data.remote(partition_id=partition_id, keys=keys)) + end_get_data = time.perf_counter() + get_time = end_get_data - start_get_data + get_gbit_per_sec = (total_data_size_gb * 8) / get_time + get_gbyte_per_sec = total_data_size_gb / get_time + + logger.info(f"get_data cost time: {get_time:.8f}s") + logger.info(f"GET Throughput: {get_gbit_per_sec:.8f} Gb/s ({get_gbyte_per_sec:.8f} GB/s)") + + # Print summary + 
total_gbit_per_sec = (total_data_size_gb * 16) / (put_time + get_time) + total_gbyte_per_sec = (total_data_size_gb * 2) / (put_time + get_time) + + logger.info("=" * 60) + logger.info("THROUGHPUT TEST SUMMARY") + logger.info("=" * 60) + logger.info(f"Backend: {self.backend}") + logger.info(f"Client Placement: {self.client_placement}") + logger.info(f"Device: {self.device}") + logger.info(f"Total Data Size: {total_data_size_gb:.6f} GB") + logger.info(f"PUT Time: {put_time:.8f}s") + logger.info(f"GET Time: {get_time:.8f}s") + logger.info(f"PUT Throughput: {put_gbit_per_sec:.8f} Gb/s ({put_gbyte_per_sec:.8f} GB/s)") + logger.info(f"GET Throughput: {get_gbit_per_sec:.8f} Gb/s ({get_gbyte_per_sec:.8f} GB/s)") + logger.info(f"Total Throughput: {total_gbit_per_sec:.8f} Gb/s ({total_gbyte_per_sec:.8f} GB/s)") + logger.info("=" * 60) + + # Return results + return { + "backend": self.backend, + "client_placement": self.client_placement, + "device": self.device, + "total_data_size_gb": total_data_size_gb, + "put_time": put_time, + "get_time": get_time, + "put_gbit_per_sec": put_gbit_per_sec, + "put_gbyte_per_sec": put_gbyte_per_sec, + "get_gbit_per_sec": get_gbit_per_sec, + "get_gbyte_per_sec": get_gbyte_per_sec, + "total_gbit_per_sec": total_gbit_per_sec, + "total_gbyte_per_sec": total_gbyte_per_sec, + } + + +def load_backend_config(config_path: str | None, backend: str) -> dict[str, Any]: + """Load backend config from YAML file or use defaults. 
+ + Args: + config_path: Path to YAML config file (optional) + backend: Backend type for default config + + Returns: + Backend configuration dictionary + """ + if config_path is not None: + config = OmegaConf.load(config_path) + return OmegaConf.to_container(config, resolve=True) + + # Default configs + if backend == "SimpleStorage": + return {"num_data_storage_units": 1} + elif backend == "Yuanrong": + return { + "host": "127.0.0.1", + "port": 31501, + "enable_yr_npu_transport": False, + "client_placement": "inter_node", + } + elif backend == "MooncakeStore": + return { + "local_hostname": "127.0.0.1", + "metadata_server": "127.0.0.1:8080", + "master_server_address": "127.0.0.1:8081", + } + else: + return {} + + +def write_results_to_csv(results: list[dict[str, Any]], output_path: str) -> None: + """Write test results to CSV file. + + Args: + results: List of result dictionaries + output_path: Path to output CSV file + """ + if not results: + return + + fieldnames = list(results[0].keys()) + + with open(output_path, "w", newline="") as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + writer.writeheader() + for result in results: + writer.writerow(result) + + logger.info(f"Results written to {output_path}") + + +def main() -> None: + """Main entry point for the perftest script.""" + parser = argparse.ArgumentParser(description="TransferQueue Throughput Test") + parser.add_argument( + "--backend", + type=str, + default="SimpleStorage", + choices=["SimpleStorage", "Yuanrong", "MooncakeStore"], + help="Backend type to test (default: SimpleStorage)", + ) + parser.add_argument( + "--backend_config", + type=str, + default=None, + help="Path to backend config YAML file (optional)", + ) + parser.add_argument( + "--device", + type=str, + default="cpu", + choices=["cpu", "npu", "gpu"], + help="Device to use (default: cpu)", + ) + parser.add_argument( + "--global_batch_size", + type=int, + default=1024, + help="Global batch size (default: 1024)", + ) + 
parser.add_argument( + "--field_num", + type=int, + default=10, + help="Number of fields (default: 10)", + ) + parser.add_argument( + "--seq_len", + type=int, + default=8192, + help="Sequence length (default: 8192)", + ) + parser.add_argument( + "--num_global_batch", + type=int, + default=1, + help="Number of global batches (default: 1)", + ) + parser.add_argument( + "--head_node_ip", + type=str, + required=True, + help="Head node IP address", + ) + parser.add_argument( + "--worker_node_ip", + type=str, + default=None, + help="Worker node IP address (required for Yuanrong inter_node)", + ) + parser.add_argument( + "--ray_address", + type=str, + default="auto", + help="Ray cluster address (default: auto)", + ) + parser.add_argument( + "--output_csv", + type=str, + default=None, + help="Path to output CSV file (optional)", + ) + + args = parser.parse_args() + + # Load backend config + backend_config = load_backend_config(args.backend_config, args.backend) + + # Initialize Ray + logger.info(f"Connecting to Ray cluster at {args.ray_address}") + ray.init(address=args.ray_address) + + # Create and run tester + tester = TQThroughputTester( + backend=args.backend, + backend_config=backend_config, + device=args.device, + global_batch_size=args.global_batch_size, + field_num=args.field_num, + seq_len=args.seq_len, + num_global_batch=args.num_global_batch, + head_node_ip=args.head_node_ip, + worker_node_ip=args.worker_node_ip, + output_csv=args.output_csv, + ) + + # Run test multiple times for consistent results using a for loop + all_results = [] + for i in range(NUM_TEST_ITERATIONS): + logger.info("-" * 60) + logger.info(f"Iteration {i + 1}/{NUM_TEST_ITERATIONS}") + logger.info("-" * 60) + result = tester.run_throughput_test() + all_results.append(result) + + # Write to CSV if output path is specified + if args.output_csv: + write_results_to_csv(all_results, args.output_csv) + + logger.info("Throughput test completed successfully!") + + +if __name__ == "__main__": + main() 
diff --git a/scripts/perftest.py b/scripts/perftest.py deleted file mode 100644 index db7d9bf..0000000 --- a/scripts/perftest.py +++ /dev/null @@ -1,566 +0,0 @@ -# Copyright 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2025 The TransferQueue Team -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import logging -import math -import random -import sys -import time -from pathlib import Path -from typing import Any - -import ray -import torch -from omegaconf import OmegaConf -from tensordict import TensorDict -from tensordict.tensorclass import NonTensorData - -parent_dir = Path(__file__).resolve().parent.parent -sys.path.append(str(parent_dir)) - -from transfer_queue.client import TransferQueueClient # noqa: E402 -from transfer_queue.controller import TransferQueueController # noqa: E402 -from transfer_queue.storage.simple_backend import SimpleStorageUnit # noqa: E402 -from transfer_queue.utils.common import get_placement_group # noqa: E402 -from transfer_queue.utils.zmq_utils import process_zmq_server_info # noqa: E402 - -logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") -logger = logging.getLogger(__name__) - - -def create_complex_test_case( - batch_size: int | None = None, - seq_length: int | None = None, - field_num: int | None = None, - device: str = "cpu", -) -> tuple[TensorDict, float]: - """Create a complex test case with tensor and non-tensor fields. 
- - Args: - batch_size: Batch size for the test case - seq_length: Sequence length for tensor fields - field_num: Number of fields to create - device: Device to create tensors on ("cpu", "npu", or "gpu") - - Returns: - Tuple of (TensorDict, total_size_gb) - """ - tensor_field_size_bytes = batch_size * seq_length * 4 - tensor_field_size_gb = tensor_field_size_bytes / (1024**3) - - num_tensor_fields = (field_num + 1) // 2 - num_nontensor_fields = field_num // 2 - - total_tensor_size_gb = tensor_field_size_gb * num_tensor_fields - total_nontensor_size_gb = (batch_size * 1024 / (1024**3)) * num_nontensor_fields - total_size_gb = total_tensor_size_gb + total_nontensor_size_gb - - logger.info(f"Total data size: {total_size_gb:.6f} GB") - - # Determine torch device - torch_device = None - if device == "npu": - torch_device = "npu:0" - elif device == "gpu": - torch_device = "cuda:0" - - fields = {} - for i in range(field_num): - field_name = f"field_{i}" - - if i % 2 == 0: - # Tensor field - tensor_data = torch.randn(batch_size, seq_length, dtype=torch.float32, device=torch_device) - fields[field_name] = tensor_data - else: - # NonTensorData field - str_length = 1024 - non_tensor_data = [ - "".join( - random.choices( - "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789", - k=str_length, - ) - ) - for _ in range(batch_size) - ] - fields[field_name] = NonTensorData(data=non_tensor_data, batch_size=(batch_size,), device=None) - - batch_size_tuple = (batch_size,) - prompt_batch = TensorDict( - fields, - batch_size=batch_size_tuple, - ) - - return prompt_batch, total_size_gb - - -@ray.remote -class TQClientActor: - """Ray actor that holds a TransferQueueClient.""" - - def __init__(self, client_id: str, controller_info: Any): - self.client = TransferQueueClient( - client_id=client_id, - controller_info=controller_info, - ) - self.prompt_meta = None - self.test_data = None - self.total_data_size_gb = 0.0 - - def initialize_storage_manager(self, manager_type: str, 
config: dict[str, Any]) -> None: - """Initialize the storage manager with given config.""" - self.client.initialize_storage_manager(manager_type=manager_type, config=config) - - def create_complex_test_case( - self, - batch_size: int | None = None, - seq_length: int | None = None, - field_num: int | None = None, - device: str = "cpu", - ) -> tuple[list[str], float]: - """Create test case on the actor.""" - self.test_data, self.total_data_size_gb = create_complex_test_case(batch_size, seq_length, field_num, device) - return list(self.test_data.keys()), self.total_data_size_gb - - def put(self, partition_id: str) -> None: - """Put data to storage.""" - self.client.put(data=self.test_data, partition_id=partition_id) - - def get_meta( - self, - data_fields: list[str], - batch_size: int, - partition_id: str, - task_name: str | None = None, - sampling_config: dict[str, Any] | None = None, - ) -> Any: - """Get metadata from controller.""" - self.prompt_meta = self.client.get_meta( - data_fields=data_fields, - batch_size=batch_size, - partition_id=partition_id, - task_name=task_name, - sampling_config=sampling_config, - ) - return self.prompt_meta - - def get_data(self) -> None: - """Get data from storage using cached metadata.""" - self.client.get_data(self.prompt_meta) - - -class TQThroughputTester: - """Main throughput tester for TransferQueue backends.""" - - def __init__( - self, - backend: str, - client_placement: str, - backend_config: dict[str, Any], - device: str, - global_batch_size: int, - field_num: int, - seq_len: int, - num_global_batch: int, - head_node_ip: str, - worker_node_ip: str | None = None, - ): - """Initialize the throughput tester. 
- - Args: - backend: Backend type ("default", "yuanrong", "mooncake") - client_placement: Client placement mode ("intra_node" or "inter_node") - backend_config: Backend configuration dictionary - device: Device type ("cpu", "npu", "gpu") - global_batch_size: Global batch size - field_num: Number of fields - seq_len: Sequence length - num_global_batch: Number of global batches - head_node_ip: Head node IP address - worker_node_ip: Worker node IP address (required for inter_node) - """ - self.backend = backend - self.client_placement = client_placement - self.backend_config = backend_config - self.device = device - self.global_batch_size = global_batch_size - self.field_num = field_num - self.seq_len = seq_len - self.num_global_batch = num_global_batch - self.head_node_ip = head_node_ip - self.worker_node_ip = worker_node_ip - - # Validate arguments - self._validate_args() - - # Determine manager type and prepare configs - self.manager_type = self._get_manager_type() - self.writer_config, self.reader_config = self._prepare_backend_configs() - - # Initialize the test infrastructure - self._initialize_data_system() - self._initialize_clients() - - def _validate_args(self) -> None: - """Validate input arguments.""" - if self.client_placement == "inter_node" and self.worker_node_ip is None: - raise ValueError("worker_node_ip is required for inter_node client placement") - if self.backend == "default": - storage_unit_placement = self.backend_config.get("storage_unit_placement", "normal") - if storage_unit_placement == "remote" and self.worker_node_ip is None: - raise ValueError("worker_node_ip is required for remote storage_unit_placement") - - def _get_manager_type(self) -> str: - """Get the storage manager type based on backend.""" - if self.backend == "default": - return "AsyncSimpleStorageManager" - elif self.backend == "yuanrong": - return "YuanrongStorageManager" - elif self.backend == "mooncake": - return "MooncakeStorageManager" - else: - raise 
ValueError(f"Unknown backend: {self.backend}") - - def _prepare_backend_configs(self) -> tuple[dict[str, Any], dict[str, Any]]: - """Prepare writer and reader backend configs. - - Returns: - Tuple of (writer_config, reader_config) - """ - # Set client_name based on backend - base_config = self.backend_config.copy() - if self.backend == "yuanrong": - base_config["client_name"] = "YuanrongStorageClient" - elif self.backend == "mooncake": - base_config["client_name"] = "MooncakeStoreClient" - - writer_config = base_config.copy() - reader_config = base_config.copy() - - if self.client_placement == "inter_node": - if self.backend == "yuanrong": - writer_config["host"] = self.head_node_ip - reader_config["host"] = self.worker_node_ip - elif self.backend == "mooncake": - writer_config["local_hostname"] = self.head_node_ip - reader_config["local_hostname"] = self.worker_node_ip - - return writer_config, reader_config - - def _initialize_data_system(self) -> None: - """Initialize controller and storage units if needed.""" - # Initialize controller - self.data_system_controller = TransferQueueController.remote() - logger.info("TransferQueueController has been created.") - self.data_system_controller_info = process_zmq_server_info(self.data_system_controller) - - # Initialize storage units for default backend - if self.backend == "default": - self._initialize_storage_units() - - def _initialize_storage_units(self) -> None: - """Initialize SimpleStorageUnits for default backend.""" - num_data_storage_units = self.backend_config.get("num_data_storage_units", 8) - storage_unit_placement = self.backend_config.get("storage_unit_placement", "normal") - total_storage_size = self.global_batch_size * self.num_global_batch - - self.data_system_storage_units = {} - - if storage_unit_placement == "remote": - # Remote mode: create all storage units on worker node - for storage_unit_rank in range(num_data_storage_units): - storage_node = SimpleStorageUnit.options( - num_cpus=1, - 
resources={f"node:{self.worker_node_ip}": 0.001}, - ).remote(storage_unit_size=3 * math.ceil(total_storage_size / num_data_storage_units)) - self.data_system_storage_units[storage_unit_rank] = storage_node - logger.info( - f"StorageUnit #0 ~ #{num_data_storage_units - 1} has been created on worker node {self.worker_node_ip}." - ) - else: - # Normal mode: create storage units using placement group - storage_placement_group = get_placement_group(num_data_storage_units, num_cpus_per_actor=1) - for storage_unit_rank in range(num_data_storage_units): - storage_node = SimpleStorageUnit.options( - placement_group=storage_placement_group, - placement_group_bundle_index=storage_unit_rank, - ).remote(storage_unit_size=3 * math.ceil(total_storage_size / num_data_storage_units)) - self.data_system_storage_units[storage_unit_rank] = storage_node - logger.info(f"StorageUnit #0 ~ #{num_data_storage_units - 1} has been created.") - - self.data_system_storage_unit_infos = process_zmq_server_info(self.data_system_storage_units) - # Add storage unit infos to backend configs - self.writer_config["zmq_info"] = self.data_system_storage_unit_infos - self.reader_config["zmq_info"] = self.data_system_storage_unit_infos - - def _initialize_clients(self) -> None: - """Initialize writer and reader TQClientActors.""" - # Determine node placement - if self.client_placement == "intra_node": - writer_node = reader_node = self.head_node_ip - else: - writer_node = self.head_node_ip - reader_node = self.worker_node_ip - - logger.info(f"Writer is on {writer_node}, Reader is on {reader_node}") - - # Prepare base options - writer_options = { - "resources": {f"node:{writer_node}": 0.001}, - } - reader_options = { - "resources": {f"node:{reader_node}": 0.001}, - } - - # Add device-specific options - if self.device == "gpu": - writer_options["num_gpus"] = 1 - reader_options["num_gpus"] = 1 - elif self.device == "npu": - writer_options["resources"]["NPU"] = 1 - reader_options["resources"]["NPU"] = 1 - - # 
Create writer and reader actors - self.writer = TQClientActor.options(**writer_options).remote("writer", self.data_system_controller_info) - self.reader = TQClientActor.options(**reader_options).remote("reader", self.data_system_controller_info) - - # Initialize storage managers - logger.info(f"Using {self.manager_type} as storage backend.") - - w = self.writer.initialize_storage_manager.remote(manager_type=self.manager_type, config=self.writer_config) - r = self.reader.initialize_storage_manager.remote(manager_type=self.manager_type, config=self.reader_config) - ray.get([w, r]) - - def run_throughput_test(self) -> None: - """Run the throughput test and print results.""" - logger.info("Creating large batch for throughput test...") - start_create_data = time.time() - data_fields, total_data_size_gb = ray.get( - self.writer.create_complex_test_case.remote( - batch_size=self.global_batch_size, - seq_length=self.seq_len, - field_num=self.field_num, - device=self.device, - ) - ) - end_create_data = time.time() - logger.info(f"Data creation time: {end_create_data - start_create_data:.8f}s") - - # PUT operation - logger.info("Starting PUT operation...") - start_put = time.time() - ray.get(self.writer.put.remote(partition_id="train_0")) - end_put = time.time() - put_time = end_put - start_put - put_throughput_gbps = (total_data_size_gb * 8) / put_time - put_throughput_gbs = total_data_size_gb / put_time - logger.info(f"put cost time: {put_time:.8f}s") - logger.info(f"PUT Throughput: {put_throughput_gbps:.8f} Gb/s ({put_throughput_gbs:.8f} GB/s)") - - time.sleep(2) - - # GET_META operation - logger.info("Starting GET_META operation...") - start_get_meta = time.time() - ray.wait( - [ - self.reader.get_meta.remote( - data_fields=list(data_fields), - batch_size=self.global_batch_size, - partition_id="train_0", - task_name="generate_sequences", - ) - ] - ) - end_get_meta = time.time() - logger.info(f"get_meta cost time: {end_get_meta - start_get_meta:.8f}s") - - time.sleep(2) - 
- # GET_DATA operation - logger.info("Starting GET_DATA operation...") - start_get_data = time.time() - ray.get(self.reader.get_data.remote()) - end_get_data = time.time() - get_time = end_get_data - start_get_data - get_throughput_gbps = (total_data_size_gb * 8) / get_time - get_throughput_gbs = total_data_size_gb / get_time - - logger.info(f"get_data cost time: {get_time:.8f}s") - logger.info(f"GET Throughput: {get_throughput_gbps:.8f} Gb/s ({get_throughput_gbs:.8f} GB/s)") - - # Print summary - total_throughput_gbps = (total_data_size_gb * 16) / (put_time + get_time) - total_throughput_gbs = (total_data_size_gb * 2) / (put_time + get_time) - - logger.info("=" * 60) - logger.info("THROUGHPUT TEST SUMMARY") - logger.info("=" * 60) - logger.info(f"Backend: {self.backend}") - logger.info(f"Client Placement: {self.client_placement}") - logger.info(f"Device: {self.device}") - logger.info(f"Total Data Size: {total_data_size_gb:.6f} GB") - logger.info(f"PUT Time: {put_time:.8f}s") - logger.info(f"GET Time: {get_time:.8f}s") - logger.info(f"PUT Throughput: {put_throughput_gbps:.8f} Gb/s ({put_throughput_gbs:.8f} GB/s)") - logger.info(f"GET Throughput: {get_throughput_gbps:.8f} Gb/s ({get_throughput_gbs:.8f} GB/s)") - logger.info(f"Total Throughput: {total_throughput_gbps:.8f} Gb/s ({total_throughput_gbs:.8f} GB/s)") - logger.info("=" * 60) - - -def load_backend_config(config_path: str | None, backend: str) -> dict[str, Any]: - """Load backend config from YAML file or use defaults. 
- - Args: - config_path: Path to YAML config file (optional) - backend: Backend type for default config - - Returns: - Backend configuration dictionary - """ - if config_path is not None: - config = OmegaConf.load(config_path) - return OmegaConf.to_container(config, resolve=True) - - # Default configs - if backend == "default": - return {"num_data_storage_units": 1, "storage_unit_placement": "normal"} - elif backend == "yuanrong": - return { - "host": "127.0.0.1", - "port": 31501, - "enable_yr_npu_transport": False, - } - elif backend == "mooncake": - return { - "local_hostname": "127.0.0.1", - "metadata_server": "127.0.0.1:8080", - "master_server_address": "127.0.0.1:8081", - } - else: - return {} - - -def main() -> None: - """Main entry point for the perftest script.""" - parser = argparse.ArgumentParser(description="TransferQueue Throughput Test") - parser.add_argument( - "--backend", - type=str, - default="default", - choices=["default", "yuanrong", "mooncake"], - help="Backend type to test (default: default)", - ) - parser.add_argument( - "--client_placement", - type=str, - default="intra_node", - choices=["intra_node", "inter_node"], - help="Client placement mode (default: intra_node)", - ) - parser.add_argument( - "--backend_config", - type=str, - default=None, - help="Path to backend config YAML file (optional)", - ) - parser.add_argument( - "--device", - type=str, - default="cpu", - choices=["cpu", "npu", "gpu"], - help="Device to use (default: cpu)", - ) - parser.add_argument( - "--global_batch_size", - type=int, - default=1024, - help="Global batch size (default: 1024)", - ) - parser.add_argument( - "--field_num", - type=int, - default=10, - help="Number of fields (default: 10)", - ) - parser.add_argument( - "--seq_len", - type=int, - default=8192, - help="Sequence length (default: 8192)", - ) - parser.add_argument( - "--num_global_batch", - type=int, - default=1, - help="Number of global batches (default: 1)", - ) - parser.add_argument( - 
"--head_node_ip", - type=str, - required=True, - help="Head node IP address", - ) - parser.add_argument( - "--worker_node_ip", - type=str, - default=None, - help="Worker node IP address (required for inter_node)", - ) - parser.add_argument( - "--ray_address", - type=str, - default="auto", - help="Ray cluster address (default: auto)", - ) - - args = parser.parse_args() - - # Load backend config - backend_config = load_backend_config(args.backend_config, args.backend) - - # Initialize Ray - logger.info(f"Connecting to Ray cluster at {args.ray_address}") - ray.init(address=args.ray_address) - - # Create and run tester - tester = TQThroughputTester( - backend=args.backend, - client_placement=args.client_placement, - backend_config=backend_config, - device=args.device, - global_batch_size=args.global_batch_size, - field_num=args.field_num, - seq_len=args.seq_len, - num_global_batch=args.num_global_batch, - head_node_ip=args.head_node_ip, - worker_node_ip=args.worker_node_ip, - ) - - # Run test multiple times for consistent results - print("-" * 60) - tester.run_throughput_test() - print("-" * 60) - tester.run_throughput_test() - print("-" * 60) - tester.run_throughput_test() - - logger.info("Throughput test completed successfully!") - - -if __name__ == "__main__": - main() From a9f70bd26aaf1f9e9af00d748a9adbf8a8375c53 Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Mon, 23 Mar 2026 15:28:56 +0800 Subject: [PATCH 05/29] reduce num_cpus for ci Signed-off-by: tianyi-ge --- .github/workflows/perftest.yml | 3 ++- scripts/performance_test/perftest.py | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/perftest.yml b/.github/workflows/perftest.yml index 9356c02..a4fb231 100644 --- a/.github/workflows/perftest.yml +++ b/.github/workflows/perftest.yml @@ -30,7 +30,7 @@ jobs: run: | python -m pip install --upgrade pip pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu - pip install -e ".[test,build,yuanrong]" + pip 
install -e . - name: Start Ray cluster run: | # Get the host IP @@ -52,6 +52,7 @@ jobs: --field_num=4 \ --seq_len=1024 \ --head_node_ip=$HOST_IP + --output_csv=results.csv - name: Stop Ray cluster run: | ray stop diff --git a/scripts/performance_test/perftest.py b/scripts/performance_test/perftest.py index 3b7edb3..85b87d3 100644 --- a/scripts/performance_test/perftest.py +++ b/scripts/performance_test/perftest.py @@ -253,7 +253,7 @@ def _initialize_storage_units(self) -> None: self.data_system_storage_units = {} - storage_placement_group = get_placement_group(num_data_storage_units, num_cpus_per_actor=1) + storage_placement_group = get_placement_group(num_data_storage_units, num_cpus_per_actor=0.001) for storage_unit_rank in range(num_data_storage_units): storage_node = SimpleStorageUnit.options( placement_group=storage_placement_group, @@ -277,9 +277,11 @@ def _initialize_clients(self) -> None: # Prepare base options writer_options = { + "num_cpus": 0.001, "resources": {f"node:{writer_node}": 0.001}, } reader_options = { + "num_cpus": 0.001, "resources": {f"node:{reader_node}": 0.001}, } From c43519c8e47c8d61de1f2fdbeddd5c858f998f64 Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Mon, 23 Mar 2026 15:29:45 +0800 Subject: [PATCH 06/29] reduce perftest ci timeout to 10 min Signed-off-by: tianyi-ge --- .github/workflows/perftest.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/perftest.yml b/.github/workflows/perftest.yml index a4fb231..a39121e 100644 --- a/.github/workflows/perftest.yml +++ b/.github/workflows/perftest.yml @@ -14,7 +14,7 @@ on: jobs: perftest: runs-on: ubuntu-latest - timeout-minutes: 30 + timeout-minutes: 10 strategy: fail-fast: false matrix: @@ -51,7 +51,7 @@ jobs: --global_batch_size=128 \ --field_num=4 \ --seq_len=1024 \ - --head_node_ip=$HOST_IP + --head_node_ip=$HOST_IP \ --output_csv=results.csv - name: Stop Ray cluster run: | From c3f69a821270013bdb292d84dcc7a761c7accbea Mon Sep 17 00:00:00 2001 
From: tianyi-ge Date: Mon, 23 Mar 2026 15:44:10 +0800 Subject: [PATCH 07/29] fix ci Signed-off-by: tianyi-ge --- .github/workflows/perftest.yml | 1 + scripts/performance_test/perftest.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/perftest.yml b/.github/workflows/perftest.yml index a39121e..2a72428 100644 --- a/.github/workflows/perftest.yml +++ b/.github/workflows/perftest.yml @@ -38,6 +38,7 @@ jobs: echo "Host IP: $HOST_IP" # Start Ray with node resource ray start --head --resources="{\"node:$HOST_IP\":1}" + ray status - name: Run SimpleStorage performance test run: | # Get the host IP diff --git a/scripts/performance_test/perftest.py b/scripts/performance_test/perftest.py index 85b87d3..30a0ad2 100644 --- a/scripts/performance_test/perftest.py +++ b/scripts/performance_test/perftest.py @@ -253,7 +253,7 @@ def _initialize_storage_units(self) -> None: self.data_system_storage_units = {} - storage_placement_group = get_placement_group(num_data_storage_units, num_cpus_per_actor=0.001) + storage_placement_group = get_placement_group(num_data_storage_units, num_cpus_per_actor=1) for storage_unit_rank in range(num_data_storage_units): storage_node = SimpleStorageUnit.options( placement_group=storage_placement_group, From a641e95cc38fb5b2755b1bc8e3d989f1e0b5480e Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Mon, 23 Mar 2026 22:44:57 +0800 Subject: [PATCH 08/29] 1. use transfer_queue/config.yaml instead of new configs 2. 
add num_test_iterations as a new option Signed-off-by: tianyi-ge --- .github/workflows/perftest.yml | 2 +- scripts/performance_test/README_PERFTEST.md | 60 ++--- .../configs/mooncake_store.yaml | 4 - .../configs/simple_storage.yaml | 2 - .../performance_test/configs/yuanrong.yaml | 5 - scripts/performance_test/perftest.py | 234 +++++------------- transfer_queue/config.yaml | 8 +- 7 files changed, 85 insertions(+), 230 deletions(-) delete mode 100644 scripts/performance_test/configs/mooncake_store.yaml delete mode 100644 scripts/performance_test/configs/simple_storage.yaml delete mode 100644 scripts/performance_test/configs/yuanrong.yaml diff --git a/.github/workflows/perftest.yml b/.github/workflows/perftest.yml index 2a72428..30097cc 100644 --- a/.github/workflows/perftest.yml +++ b/.github/workflows/perftest.yml @@ -47,7 +47,7 @@ jobs: # Run the perftest with small batch size for quick test cd scripts/performance_test python perftest.py \ - --backend=SimpleStorage \ + --backend_config=../../transfer_queue/config.yaml \ --device=cpu \ --global_batch_size=128 \ --field_num=4 \ diff --git a/scripts/performance_test/README_PERFTEST.md b/scripts/performance_test/README_PERFTEST.md index 0a82ce6..9b979ae 100644 --- a/scripts/performance_test/README_PERFTEST.md +++ b/scripts/performance_test/README_PERFTEST.md @@ -10,7 +10,7 @@ This script runs throughput tests for TransferQueue with different backends. ray start --head --resources='{"node:192.168.0.1":1}' # On worker node - ray start --address=192.168.0.1 --resources='{"node:192.168.0.2":1}' + ray start --address=192.168.0.1:6379 --resources='{"node:192.168.0.2":1}' ``` 2. Start the backend service (Yuanrong, MooncakeStore, etc.) if testing non-SimpleStorage backends. @@ -19,13 +19,11 @@ This script runs throughput tests for TransferQueue with different backends. 
```bash python perftest.py \ - --backend=[SimpleStorage|Yuanrong|MooncakeStore] \ - --backend_config=xxx.yaml \ + --backend_config=../../transfer_queue/config.yaml \ --device=[cpu|npu|gpu] \ --global_batch_size=1024 \ --field_num=10 \ --seq_len=8192 \ - --num_global_batch=1 \ --head_node_ip=192.168.0.1 \ --worker_node_ip=192.168.0.2 ``` @@ -34,77 +32,53 @@ python perftest.py \ | Argument | Description | Default | |----------|-------------|---------| -| `--backend` | Backend type: SimpleStorage, Yuanrong, MooncakeStore | SimpleStorage | -| `--backend_config` | Path to YAML config file (optional) | None | +| `--backend_config` | Path to backend config YAML file (required) | - | | `--device` | Device: cpu, npu, gpu | cpu | | `--global_batch_size` | Global batch size | 1024 | | `--field_num` | Number of fields | 10 | | `--seq_len` | Sequence length | 8192 | -| `--num_global_batch` | Number of global batches | 1 | +| `--num_test_iterations` | Number of test iterations | 3 | | `--head_node_ip` | Head node IP (required) | - | -| `--worker_node_ip` | Worker node IP (required for Yuanrong inter_node) | None | +| `--worker_node_ip` | Worker node IP (required for Yuanrong) | None | | `--ray_address` | Ray cluster address | auto | | `--output_csv` | Path to output CSV file (optional) | None | ## Backend Configuration -Sample config files are in `configs/`: - -- **simple_storage.yaml**: SimpleStorage backend config - ```yaml - num_data_storage_units: 1 - ``` - -- **yuanrong.yaml**: Yuanrong backend config - ```yaml - host: 127.0.0.1 - port: 31501 - enable_yr_npu_transport: false - client_placement: inter_node # or "intra_node" - ``` - -- **mooncake_store.yaml**: MooncakeStore backend config - ```yaml - local_hostname: 127.0.0.1 - metadata_server: 127.0.0.1:8080 - master_server_address: 127.0.0.1:8081 - ``` +The script reads the backend configuration directly from the provided `--backend_config` YAML file. 
The backend type is determined by `backend.storage_backend` in the config file. For device support of each backend, - `SimpleStorage` backend supports `cpu` - `Yuanrong` supports `cpu` and `npu` - `MooncakeStore` supports `cpu` and `gpu` -## Yuanrong Client Placement +## Yuanrong Backend -For Yuanrong backend, since `put` is always local-first, we need to start client actors on different nodes to test cross-node transfer. The client placement is configured in the YAML file: -- `client_placement: intra_node`: Both writer and reader run on head node -- `client_placement: inter_node`: Writer runs on head node, reader runs on worker node (default) +For Yuanrong backend, writer runs on head node and reader runs on worker node. ## Examples -### SimpleStorage backend +### SimpleStorage/Mooncake backend ```bash -python perftest.py --backend=SimpleStorage \ +python perftest.py --backend_config=../../transfer_queue/config.yaml \ --head_node_ip=192.168.0.1 ``` ### Yuanrong backend ```bash -python perftest.py --backend=Yuanrong \ - --backend_config=configs/yuanrong.yaml \ +python perftest.py --backend_config=../../transfer_queue/config.yaml \ --head_node_ip=192.168.0.1 --worker_node_ip=192.168.0.2 ``` ### NPU device test ```bash -python perftest.py --backend=Yuanrong --device=npu \ - --head_node_ip=192.168.0.1 +python perftest.py --backend_config=../../transfer_queue/config.yaml --device=npu \ + --head_node_ip=192.168.0.1 --worker_node_ip=192.168.0.2 ``` ### Output to CSV ```bash -python perftest.py --backend=SimpleStorage \ +python perftest.py --backend_config=../../transfer_queue/config.yaml \ --head_node_ip=192.168.0.1 --output_csv=results.csv ``` @@ -122,16 +96,12 @@ Throughput is shown in both Gb/s (gigabits per second) and GB/s (gigabytes per s When using `--output_csv`, the test writes results to a CSV file with the following columns: - backend -- client_placement - device - total_data_size_gb - put_time - get_time - put_gbit_per_sec -- put_gbyte_per_sec - 
get_gbit_per_sec -- get_gbyte_per_sec - total_gbit_per_sec -- total_gbyte_per_sec -The test runs 3 iterations and saves all 3 results to the CSV. +The test runs `--num_test_iterations` iterations (default: 3) and saves all results to the CSV. diff --git a/scripts/performance_test/configs/mooncake_store.yaml b/scripts/performance_test/configs/mooncake_store.yaml deleted file mode 100644 index 320801f..0000000 --- a/scripts/performance_test/configs/mooncake_store.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Mooncake backend configuration -local_hostname: 127.0.0.1 -metadata_server: 127.0.0.1:8080 -master_server_address: 127.0.0.1:8081 diff --git a/scripts/performance_test/configs/simple_storage.yaml b/scripts/performance_test/configs/simple_storage.yaml deleted file mode 100644 index 2eb397c..0000000 --- a/scripts/performance_test/configs/simple_storage.yaml +++ /dev/null @@ -1,2 +0,0 @@ -# TransferQueue (default) backend configuration -num_data_storage_units: 1 diff --git a/scripts/performance_test/configs/yuanrong.yaml b/scripts/performance_test/configs/yuanrong.yaml deleted file mode 100644 index b4c52e5..0000000 --- a/scripts/performance_test/configs/yuanrong.yaml +++ /dev/null @@ -1,5 +0,0 @@ -# Yuanrong backend configuration -host: 127.0.0.1 -port: 31501 -enable_yr_npu_transport: false -client_placement: inter_node diff --git a/scripts/performance_test/perftest.py b/scripts/performance_test/perftest.py index 30a0ad2..249af0b 100644 --- a/scripts/performance_test/perftest.py +++ b/scripts/performance_test/perftest.py @@ -17,7 +17,6 @@ import argparse import csv import logging -import math import sys import time from pathlib import Path @@ -32,16 +31,10 @@ sys.path.append(str(parent_dir)) import transfer_queue as tq # noqa: E402 -from transfer_queue.storage.simple_backend import SimpleStorageUnit # noqa: E402 -from transfer_queue.utils.common import get_placement_group # noqa: E402 -from transfer_queue.utils.zmq_utils import process_zmq_server_info # noqa: E402 
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") logger = logging.getLogger(__name__) -# Constants -NUM_TEST_ITERATIONS = 3 - def create_test_case( batch_size: int | None = None, @@ -93,19 +86,15 @@ def create_test_case( class TQClientActor: """Ray actor that uses tq.init(config) to initialize.""" - def __init__(self, base_config: dict[str, Any]): - self.base_config = base_config + def __init__(self, config: dict[str, Any]): + self.config = config self.test_data = None self.total_data_size_gb = 0.0 self.test_keys = None - def initialize(self, zmq_info: Any = None) -> None: + def initialize(self) -> None: """Initialize transfer_queue with the config.""" - config = OmegaConf.create(self.base_config, flags={"allow_objects": True}) - if zmq_info is not None and self.base_config["backend"]["storage_backend"] == "SimpleStorage": - # Use dict-style assignment to avoid OmegaConf validation - config["backend"]["SimpleStorage"]["zmq_info"] = zmq_info - tq.init(config) + tq.init(OmegaConf.create(self.config)) def create_test_case( self, @@ -137,19 +126,22 @@ def get_data(self, partition_id: str, keys: list[str] | None = None) -> None: keys = self.test_keys tq.kv_batch_get(keys=keys, partition_id=partition_id) + def close(self) -> None: + """Close transfer_queue.""" + tq.close() + class TQThroughputTester: """Main throughput tester for TransferQueue backends.""" def __init__( self, - backend: str, - backend_config: dict[str, Any], + backend_config_path: str, device: str, global_batch_size: int, field_num: int, seq_len: int, - num_global_batch: int, + num_test_iterations: int, head_node_ip: str, worker_node_ip: str | None = None, output_csv: str | None = None, @@ -157,121 +149,71 @@ def __init__( """Initialize the throughput tester. 
Args: - backend: Backend type ("SimpleStorage", "Yuanrong", "MooncakeStore") - backend_config: Backend configuration dictionary + backend_config_path: Path to backend config YAML file device: Device type ("cpu", "npu", "gpu") global_batch_size: Global batch size field_num: Number of fields seq_len: Sequence length - num_global_batch: Number of global batches + num_test_iterations: Number of test iterations head_node_ip: Head node IP address - worker_node_ip: Worker node IP address (required for Yuanrong inter_node) + worker_node_ip: Worker node IP address (required for Yuanrong) output_csv: Path to output CSV file (optional) """ - self.backend = backend - self.backend_config = backend_config + self.backend_config_path = backend_config_path self.device = device self.global_batch_size = global_batch_size self.field_num = field_num self.seq_len = seq_len - self.num_global_batch = num_global_batch + self.num_test_iterations = num_test_iterations self.head_node_ip = head_node_ip self.worker_node_ip = worker_node_ip self.output_csv = output_csv - # Get client_placement from Yuanrong config, default to inter_node - self.client_placement = ( - self.backend_config.get("client_placement", "inter_node") if self.backend == "Yuanrong" else "intra_node" - ) + # Prepare full config for tq.init() + self.full_config = self._prepare_config() + + # Get backend from config + self.backend = self.full_config["backend"]["storage_backend"] + + # For Yuanrong, always use inter_node + self.use_inter_node = self.backend == "Yuanrong" # Validate arguments self._validate_args() - # Prepare full config for tq.init() - self.base_config, self.zmq_info = self._prepare_configs() - - # Initialize the test infrastructure - self._initialize_data_system() + # Initialize clients self._initialize_clients() def _validate_args(self) -> None: """Validate input arguments.""" - # Check worker_node_ip for Yuanrong inter_node - if self.backend == "Yuanrong" and self.client_placement == "inter_node" and 
self.worker_node_ip is None: - raise ValueError("worker_node_ip is required for Yuanrong with client_placement=inter_node") + # Check worker_node_ip for Yuanrong + if self.use_inter_node and self.worker_node_ip is None: + raise ValueError("worker_node_ip is required for Yuanrong backend") - def _prepare_configs(self) -> tuple[dict[str, Any], Any]: - """Prepare the base config and storage units. + def _prepare_config(self) -> dict[str, Any]: + """Prepare the config by directly reading the backend_config file. Returns: - Tuple of (base_config, zmq_info) + Configuration dictionary """ - total_storage_size = self.global_batch_size * self.num_global_batch - - config = { - "controller": { - "sampler": "SequentialSampler", - "polling_mode": False, - }, - "backend": { - "storage_backend": self.backend, - }, - } + # Directly read the backend_config file, no merging with default + config = OmegaConf.load(self.backend_config_path) - # Set client_name based on backend - if self.backend == "Yuanrong": - self.backend_config["client_name"] = "YuanrongStorageClient" - elif self.backend == "MooncakeStore": - self.backend_config["client_name"] = "MooncakeStoreClient" - - # Add backend-specific config - if self.backend == "SimpleStorage": - config["backend"]["SimpleStorage"] = { - "total_storage_size": total_storage_size, - "num_data_storage_units": self.backend_config.get("num_data_storage_units", 1), - } - elif self.backend == "Yuanrong": - config["backend"]["Yuanrong"] = self.backend_config.copy() - # Remove client_placement from the backend config passed to tq - if "client_placement" in config["backend"]["Yuanrong"]: - del config["backend"]["Yuanrong"]["client_placement"] - elif self.backend == "MooncakeStore": - config["backend"]["MooncakeStore"] = self.backend_config.copy() - - return config, None - - def _initialize_data_system(self) -> None: - """Initialize controller and storage units if needed.""" - # For SimpleStorage, we need to manually create storage units with 
placement - if self.backend == "SimpleStorage": - self._initialize_storage_units() - - def _initialize_storage_units(self) -> None: - """Initialize SimpleStorageUnits for SimpleStorage backend.""" - num_data_storage_units = self.backend_config.get("num_data_storage_units", 1) - total_storage_size = self.global_batch_size * self.num_global_batch - - self.data_system_storage_units = {} - - storage_placement_group = get_placement_group(num_data_storage_units, num_cpus_per_actor=1) - for storage_unit_rank in range(num_data_storage_units): - storage_node = SimpleStorageUnit.options( - placement_group=storage_placement_group, - placement_group_bundle_index=storage_unit_rank, - ).remote(storage_unit_size=NUM_TEST_ITERATIONS * math.ceil(total_storage_size / num_data_storage_units)) - self.data_system_storage_units[storage_unit_rank] = storage_node - logger.info(f"StorageUnit #0 ~ #{num_data_storage_units - 1} has been created.") - - self.zmq_info = process_zmq_server_info(self.data_system_storage_units) + # If backend.storage_backend is SimpleStorage, override total_storage_size + total_storage_size = self.global_batch_size * self.num_test_iterations + if config.backend.storage_backend == "SimpleStorage": + config.backend.SimpleStorage.total_storage_size = total_storage_size + + return OmegaConf.to_container(config, resolve=True) def _initialize_clients(self) -> None: """Initialize writer and reader TQClientActors.""" # Determine node placement - if self.client_placement == "intra_node": - writer_node = reader_node = self.head_node_ip - else: + if self.use_inter_node: writer_node = self.head_node_ip reader_node = self.worker_node_ip + else: + writer_node = reader_node = self.head_node_ip logger.info(f"Writer is on {writer_node}, Reader is on {reader_node}") @@ -294,14 +236,14 @@ def _initialize_clients(self) -> None: reader_options["resources"]["NPU"] = 1 # Create writer and reader actors - self.writer = TQClientActor.options(**writer_options).remote(self.base_config) - 
self.reader = TQClientActor.options(**reader_options).remote(self.base_config) + self.writer = TQClientActor.options(**writer_options).remote(self.full_config) + self.reader = TQClientActor.options(**reader_options).remote(self.full_config) # Initialize transfer_queue logger.info(f"Using {self.backend} as storage backend.") - w = self.writer.initialize.remote(self.zmq_info) - r = self.reader.initialize.remote(self.zmq_info) + w = self.writer.initialize.remote() + r = self.reader.initialize.remote() ray.get([w, r]) def run_throughput_test(self) -> dict[str, Any]: @@ -333,18 +275,12 @@ def run_throughput_test(self) -> dict[str, Any]: put_time = end_put - start_put put_gbit_per_sec = (total_data_size_gb * 8) / put_time put_gbyte_per_sec = total_data_size_gb / put_time - logger.info(f"put cost time: {put_time:.8f}s") - logger.info(f"PUT Throughput: {put_gbit_per_sec:.8f} Gb/s ({put_gbyte_per_sec:.8f} GB/s)") time.sleep(2) # LIST_KEYS operation using kv_list logger.info("Starting LIST_KEYS operation (kv_list)...") - start_list = time.perf_counter() keys = ray.get(self.reader.list_keys.remote(partition_id=partition_id)) - end_list = time.perf_counter() - logger.info(f"list_keys cost time: {end_list - start_list:.8f}s") - logger.info(f"Found {len(keys)} keys") time.sleep(2) @@ -357,9 +293,6 @@ def run_throughput_test(self) -> dict[str, Any]: get_gbit_per_sec = (total_data_size_gb * 8) / get_time get_gbyte_per_sec = total_data_size_gb / get_time - logger.info(f"get_data cost time: {get_time:.8f}s") - logger.info(f"GET Throughput: {get_gbit_per_sec:.8f} Gb/s ({get_gbyte_per_sec:.8f} GB/s)") - # Print summary total_gbit_per_sec = (total_data_size_gb * 16) / (put_time + get_time) total_gbyte_per_sec = (total_data_size_gb * 2) / (put_time + get_time) @@ -368,7 +301,6 @@ def run_throughput_test(self) -> dict[str, Any]: logger.info("THROUGHPUT TEST SUMMARY") logger.info("=" * 60) logger.info(f"Backend: {self.backend}") - logger.info(f"Client Placement: {self.client_placement}") 
logger.info(f"Device: {self.device}") logger.info(f"Total Data Size: {total_data_size_gb:.6f} GB") logger.info(f"PUT Time: {put_time:.8f}s") @@ -378,55 +310,21 @@ def run_throughput_test(self) -> dict[str, Any]: logger.info(f"Total Throughput: {total_gbit_per_sec:.8f} Gb/s ({total_gbyte_per_sec:.8f} GB/s)") logger.info("=" * 60) - # Return results + # Return results (only Gb/s for CSV, not GB/s) return { "backend": self.backend, - "client_placement": self.client_placement, "device": self.device, "total_data_size_gb": total_data_size_gb, "put_time": put_time, "get_time": get_time, "put_gbit_per_sec": put_gbit_per_sec, - "put_gbyte_per_sec": put_gbyte_per_sec, "get_gbit_per_sec": get_gbit_per_sec, - "get_gbyte_per_sec": get_gbyte_per_sec, "total_gbit_per_sec": total_gbit_per_sec, - "total_gbyte_per_sec": total_gbyte_per_sec, } - -def load_backend_config(config_path: str | None, backend: str) -> dict[str, Any]: - """Load backend config from YAML file or use defaults. - - Args: - config_path: Path to YAML config file (optional) - backend: Backend type for default config - - Returns: - Backend configuration dictionary - """ - if config_path is not None: - config = OmegaConf.load(config_path) - return OmegaConf.to_container(config, resolve=True) - - # Default configs - if backend == "SimpleStorage": - return {"num_data_storage_units": 1} - elif backend == "Yuanrong": - return { - "host": "127.0.0.1", - "port": 31501, - "enable_yr_npu_transport": False, - "client_placement": "inter_node", - } - elif backend == "MooncakeStore": - return { - "local_hostname": "127.0.0.1", - "metadata_server": "127.0.0.1:8080", - "master_server_address": "127.0.0.1:8081", - } - else: - return {} + def close(self) -> None: + """Close the transfer_queue clients.""" + ray.get([self.writer.close.remote(), self.reader.close.remote()]) def write_results_to_csv(results: list[dict[str, Any]], output_path: str) -> None: @@ -453,18 +351,11 @@ def write_results_to_csv(results: list[dict[str, Any]], 
output_path: str) -> Non def main() -> None: """Main entry point for the perftest script.""" parser = argparse.ArgumentParser(description="TransferQueue Throughput Test") - parser.add_argument( - "--backend", - type=str, - default="SimpleStorage", - choices=["SimpleStorage", "Yuanrong", "MooncakeStore"], - help="Backend type to test (default: SimpleStorage)", - ) parser.add_argument( "--backend_config", type=str, - default=None, - help="Path to backend config YAML file (optional)", + required=True, + help="Path to backend config YAML file", ) parser.add_argument( "--device", @@ -492,10 +383,10 @@ def main() -> None: help="Sequence length (default: 8192)", ) parser.add_argument( - "--num_global_batch", + "--num_test_iterations", type=int, - default=1, - help="Number of global batches (default: 1)", + default=3, + help="Number of test iterations (default: 3)", ) parser.add_argument( "--head_node_ip", @@ -507,7 +398,7 @@ def main() -> None: "--worker_node_ip", type=str, default=None, - help="Worker node IP address (required for Yuanrong inter_node)", + help="Worker node IP address (required for Yuanrong)", ) parser.add_argument( "--ray_address", @@ -524,22 +415,18 @@ def main() -> None: args = parser.parse_args() - # Load backend config - backend_config = load_backend_config(args.backend_config, args.backend) - # Initialize Ray logger.info(f"Connecting to Ray cluster at {args.ray_address}") ray.init(address=args.ray_address) # Create and run tester tester = TQThroughputTester( - backend=args.backend, - backend_config=backend_config, + backend_config_path=args.backend_config, device=args.device, global_batch_size=args.global_batch_size, field_num=args.field_num, seq_len=args.seq_len, - num_global_batch=args.num_global_batch, + num_test_iterations=args.num_test_iterations, head_node_ip=args.head_node_ip, worker_node_ip=args.worker_node_ip, output_csv=args.output_csv, @@ -547,9 +434,9 @@ def main() -> None: # Run test multiple times for consistent results using a for 
loop all_results = [] - for i in range(NUM_TEST_ITERATIONS): + for i in range(args.num_test_iterations): logger.info("-" * 60) - logger.info(f"Iteration {i + 1}/{NUM_TEST_ITERATIONS}") + logger.info(f"Iteration {i + 1}/{args.num_test_iterations}") logger.info("-" * 60) result = tester.run_throughput_test() all_results.append(result) @@ -558,6 +445,9 @@ def main() -> None: if args.output_csv: write_results_to_csv(all_results, args.output_csv) + # Close transfer_queue + tester.close() + logger.info("Throughput test completed successfully!") diff --git a/transfer_queue/config.yaml b/transfer_queue/config.yaml index 98819ed..0a8ccef 100644 --- a/transfer_queue/config.yaml +++ b/transfer_queue/config.yaml @@ -47,4 +47,10 @@ backend: RayStore: # For Yuanrong: - # TODO \ No newline at end of file + Yuanrong: + # IP of local yuanrong datasystem worker + host: 127.0.0.1 + # Port of local yuanrong datasystem worker + port: 31501 + # If enable npu transport + enable_yr_npu_transport: false From eb9112b95a0f15e278e1c1f1cd012cb533256c01 Mon Sep 17 00:00:00 2001 From: 0oshowero0 Date: Tue, 24 Mar 2026 20:36:13 +0800 Subject: [PATCH 09/29] squash all commits Signed-off-by: 0oshowero0 # Conflicts: # scripts/performance_test/README_PERFTEST.md # scripts/performance_test/perftest.py --- scripts/performance_test/README_PERFTEST.md | 29 +- scripts/performance_test/draw_figure.py | 140 +++++++ scripts/performance_test/perftest.py | 127 ++++-- scripts/performance_test/perftest_config.yaml | 56 +++ .../performance_test/ray_perftest_baseline.py | 375 ++++++++++++++++++ scripts/performance_test/run_perf_test.sh | 81 ++++ 6 files changed, 764 insertions(+), 44 deletions(-) create mode 100644 scripts/performance_test/draw_figure.py create mode 100644 scripts/performance_test/perftest_config.yaml create mode 100644 scripts/performance_test/ray_perftest_baseline.py create mode 100755 scripts/performance_test/run_perf_test.sh diff --git a/scripts/performance_test/README_PERFTEST.md 
b/scripts/performance_test/README_PERFTEST.md index 9b979ae..cf62efa 100644 --- a/scripts/performance_test/README_PERFTEST.md +++ b/scripts/performance_test/README_PERFTEST.md @@ -32,16 +32,15 @@ python perftest.py \ | Argument | Description | Default | |----------|-------------|---------| -| `--backend_config` | Path to backend config YAML file (required) | - | -| `--device` | Device: cpu, npu, gpu | cpu | -| `--global_batch_size` | Global batch size | 1024 | -| `--field_num` | Number of fields | 10 | -| `--seq_len` | Sequence length | 8192 | -| `--num_test_iterations` | Number of test iterations | 3 | -| `--head_node_ip` | Head node IP (required) | - | -| `--worker_node_ip` | Worker node IP (required for Yuanrong) | None | -| `--ray_address` | Ray cluster address | auto | -| `--output_csv` | Path to output CSV file (optional) | None | +| `--backend_config` | Path to backend config YAML file (required) | - | +| `--device` | Device: cpu, npu, gpu | cpu | +| `--global_batch_size` | Global batch size | 1024 | +| `--field_num` | Number of fields | 10 | +| `--seq_len` | Sequence length | 8192 | +| `--num_test_iterations` | Number of test iterations | 4 | +| `--head_node_ip` | Head node IP (required) | - | +| `--worker_node_ip` | Worker node IP (required for Yuanrong) | None | +| `--output_csv` | Path to output CSV file (optional) | None | ## Backend Configuration @@ -60,25 +59,25 @@ For Yuanrong backend, writer runs on head node and reader runs on worker node. 
### SimpleStorage/Mooncake backend ```bash -python perftest.py --backend_config=../../transfer_queue/config.yaml \ +python perftest.py --backend_config=perftest_config.yaml \ --head_node_ip=192.168.0.1 ``` ### Yuanrong backend ```bash -python perftest.py --backend_config=../../transfer_queue/config.yaml \ +python perftest.py --backend_config=perftest_config.yaml \ --head_node_ip=192.168.0.1 --worker_node_ip=192.168.0.2 ``` ### NPU device test ```bash -python perftest.py --backend_config=../../transfer_queue/config.yaml --device=npu \ +python perftest.py --backend_config=perftest_config.yaml --device=npu \ --head_node_ip=192.168.0.1 --worker_node_ip=192.168.0.2 ``` ### Output to CSV ```bash -python perftest.py --backend_config=../../transfer_queue/config.yaml \ +python perftest.py --backend_config=perftest_config.yaml \ --head_node_ip=192.168.0.1 --output_csv=results.csv ``` @@ -104,4 +103,4 @@ When using `--output_csv`, the test writes results to a CSV file with the follow - get_gbit_per_sec - total_gbit_per_sec -The test runs `--num_test_iterations` iterations (default: 3) and saves all results to the CSV. +The test runs `--num_test_iterations` iterations (default: 4) and saves all results to the CSV. diff --git a/scripts/performance_test/draw_figure.py b/scripts/performance_test/draw_figure.py new file mode 100644 index 0000000..1d96a65 --- /dev/null +++ b/scripts/performance_test/draw_figure.py @@ -0,0 +1,140 @@ +from pathlib import Path + +import matplotlib.pyplot as plt +import pandas as pd +import seaborn as sns + +results_dir = Path(__file__).resolve().parent / "results" +csv_files = list(results_dir.glob("*.csv")) + +if not csv_files: + raise FileNotFoundError(f"No CSV files found in {results_dir}") + +size_order = ["Small", "Medium", "Large"] + +# Filename -> display name mapping for backends. +# All normalization lives here so the shell script keeps simple lowercase names. 
+BACKEND_DISPLAY_NAMES = { + "simplestorage": "SimpleStorage", + "yuanrong": "Yuanrong", + "mooncakestore": "MooncakeStore", + "ray_baseline": "Ray", +} + + +def format_size(size_gb: float) -> str: + """Format a data size in GB to a human-readable string with appropriate unit.""" + if size_gb >= 1.0: + return f"{size_gb:.2f} GB" + size_mb = size_gb * 1024 + if size_mb >= 1.0: + return f"{size_mb:.2f} MB" + size_kb = size_mb * 1024 + return f"{size_kb:.2f} KB" + + +dfs = [] +for csv_file in csv_files: + df = pd.read_csv(csv_file) + # Parse size label and backend from filename: {backend}_{size_label}.csv + # Size label is always the last _-separated segment (lowercase). + # Backend is everything before the last underscore. + # e.g. "simplestorage_small.csv" -> backend_key="simplestorage", size_label="Small" + # e.g. "ray_baseline_small.csv" -> backend_key="ray_baseline", size_label="Small" + stem = csv_file.stem + parts = stem.rsplit("_", 1) + if len(parts) != 2: + print(f"Warning: skipping {csv_file.name}, unexpected filename format") + continue + raw_backend, raw_size = parts + size_label = raw_size.capitalize() + if size_label not in size_order: + print(f"Warning: skipping {csv_file.name}, unrecognized size label '{raw_size}'") + continue + df["backend_parsed"] = BACKEND_DISPLAY_NAMES.get(raw_backend, raw_backend) + df["size_label"] = size_label + dfs.append(df) + +df = pd.concat(dfs, ignore_index=True) + +existing_sizes = [s for s in size_order if s in df["size_label"].unique()] + +# Build composite X-axis label: "SizeLabel\n" +size_to_gb = df.groupby("size_label")["total_data_size_gb"].first().to_dict() + + +def make_xlabel(size_label: str) -> str: + return f"{size_label}\n{format_size(size_to_gb.get(size_label, 0))}" + + +df["X_label"] = df["size_label"].apply(make_xlabel) + +# Make X_label categorical with the correct ordering +df["X_label"] = pd.Categorical( + df["X_label"], + categories=[make_xlabel(s) for s in existing_sizes], + ordered=True, +) + 
+df["Bandwidth"] = df["total_gbit_per_sec"] +df["Scenario"] = df["backend_parsed"] + +# ========== Plotting ========== +sns.set_theme(style="white", palette="husl") + +fig, ax = plt.subplots(figsize=(12, 7)) + +palette = sns.color_palette("Set2", n_colors=df["Scenario"].nunique()) +barplot = sns.barplot(data=df, x="X_label", y="Bandwidth", hue="Scenario", ax=ax, alpha=0.8, palette=palette) + +# Legend: match old style — at the top center, horizontal, with frame +handles, labels = ax.get_legend_handles_labels() +# Move legend above the plot +ax.get_legend().remove() +fig.legend( + handles, + labels, + bbox_to_anchor=(0.5, 1.0), + loc="upper center", + ncol=len(handles), + title="", + frameon=True, + fancybox=True, + shadow=True, + fontsize=13, +) + +# Annotations on bars +for p in ax.patches: + height = p.get_height() + if height > 0: + ax.annotate( + f"{height:.3f}", + (p.get_x() + p.get_width() / 2.0, height), + ha="center", + va="bottom", + fontsize=11, + rotation=0, + ) + +# Axis formatting +ax.set_title("Performance Comparison (Total Throughput)", fontsize=16, fontweight="bold") +ax.set_xlabel("") +ax.set_ylabel("Bandwidth (Gbps)", fontsize=16) + +# Adjust y range to leave room for annotations +y_max = df["Bandwidth"].max() * 1.15 +ax.set_ylim(0, y_max) + +ax.grid(True, alpha=0.3) +ax.tick_params(axis="x", labelsize=14) +ax.tick_params(axis="y", labelsize=13) + +# Unified x-label at the bottom +fig.text(0.5, 0.02, "Data Volume", ha="center", fontsize=20) + +plt.tight_layout(rect=[0, 0.04, 1, 0.95]) # room for legend + x-label +plt.savefig(results_dir / "performance_comparison.pdf", dpi=300, bbox_inches="tight") +plt.show() + +print("Performance comparison plot generated and saved as 'performance_comparison.pdf'") diff --git a/scripts/performance_test/perftest.py b/scripts/performance_test/perftest.py index 249af0b..0541296 100644 --- a/scripts/performance_test/perftest.py +++ b/scripts/performance_test/perftest.py @@ -25,7 +25,7 @@ import ray import torch from 
omegaconf import OmegaConf -from tensordict import TensorDict +from tensordict import NonTensorStack, TensorDict parent_dir = Path(__file__).resolve().parent.parent.parent sys.path.append(str(parent_dir)) @@ -42,21 +42,54 @@ def create_test_case( field_num: int | None = None, device: str = "cpu", ) -> tuple[TensorDict, float]: - """Create a test case with tensor fields only. + """Create a test case with complex data formats. + + Creates TensorDict with: + - Regular tensors: (batch_size, seq_length) shape, each element is float32 + - Nested Tensors: variable-length sequences, each batch element has length + uniformly sampled from [1, seq_length] + - NonTensorStack wrapped strings: each string size ~= seq_length * 4 bytes + (to match memory footprint of one tensor element) Args: batch_size: Batch size for the test case - seq_length: Sequence length for tensor fields - field_num: Number of fields to create + seq_length: Maximum sequence length (used for regular tensors and + as upper bound for nested tensor sampling) + field_num: Total number of fields to create (distributed across types) device: Device to create tensors on ("cpu", "npu", or "gpu") Returns: Tuple of (TensorDict, total_size_gb) """ - tensor_field_size_bytes = batch_size * seq_length * 4 - tensor_field_size_gb = tensor_field_size_bytes / (1024**3) - - total_size_gb = tensor_field_size_gb * field_num + bytes_per_element = 4 # float32 + + # Calculate field distribution (1/3 each type, last fields may be regular) + num_regular_fields = (field_num + 2) // 3 + num_nested_fields = (field_num + 2) // 3 + num_nontensor_fields = field_num - num_regular_fields - num_nested_fields + + # Each regular tensor field: batch_size * seq_length * 4 bytes + regular_field_size_bytes = batch_size * seq_length * bytes_per_element + regular_field_size_gb = regular_field_size_bytes / (1024**3) + + # Nested tensor field: average length = (1 + seq_length) / 2, + # so avg size = batch_size * (1 + seq_length) / 2 * 4 bytes + 
avg_nested_length = (1 + seq_length) / 2 + nested_field_size_bytes = int(batch_size * avg_nested_length * bytes_per_element) + nested_field_size_gb = nested_field_size_bytes / (1024**3) + + # NonTensorStack string field: each string ~= seq_length * 4 bytes to match one tensor element + # Total for field: batch_size strings * seq_length * 4 bytes each + string_size_per_elem = seq_length * bytes_per_element + nontensor_field_size_bytes = batch_size * string_size_per_elem + nontensor_field_size_gb = nontensor_field_size_bytes / (1024**3) + + # Total size = sum of all field types + total_size_gb = ( + regular_field_size_gb * num_regular_fields + + nested_field_size_gb * num_nested_fields + + nontensor_field_size_gb * num_nontensor_fields + ) logger.info(f"Total data size: {total_size_gb:.6f} GB") @@ -67,17 +100,48 @@ def create_test_case( elif device == "gpu": torch_device = "cuda:0" - fields = {} - for i in range(field_num): - field_name = f"field_{i}" - tensor_data = torch.randn(batch_size, seq_length, dtype=torch.float32, device=torch_device) - fields[field_name] = tensor_data + # Set seeds for reproducibility (within this process) + # Sample lengths for all nested fields at once + nested_lengths = [ + torch.randint(1, seq_length + 1, (batch_size,), generator=torch.Generator().manual_seed(42 + i)) + for i in range(num_nested_fields) + ] batch_size_tuple = (batch_size,) - prompt_batch = TensorDict( - fields, - batch_size=batch_size_tuple, - ) + + prompt_batch = TensorDict(batch_size=batch_size_tuple) + + # 1. Regular tensor fields + for i in range(num_regular_fields): + field_name = f"field_{i}" + tensor_data = torch.randn(batch_size, seq_length, dtype=torch.float32, device=torch_device) + prompt_batch.set(field_name, tensor_data) + + # 2. 
Nested Tensor fields (variable-length sequences) + for i in range(num_nested_fields): + field_name = f"nested_field_{i}" + actual_lengths = nested_lengths[i] + + # Create nested tensor from variable-length sequences + nested_list = [] + for j in range(batch_size): + length = actual_lengths[j].item() + # Create sequence data: arange for each element (representing sequence indices) + seq_data = torch.arange(length, dtype=torch.float32, device=torch_device) + nested_list.append(seq_data) + + nested_tensor = torch.nested.as_nested_tensor(nested_list, layout=torch.jagged) + prompt_batch.set(field_name, nested_tensor) + + # 3. NonTensorStack wrapped strings + # Each string ~= seq_length * 4 bytes to match one tensor element's memory footprint + string_char_count = seq_length * bytes_per_element # 4 bytes per char (unicode) + string_template = "x" * string_char_count + + for i in range(num_nontensor_fields): + field_name = f"nontensor_field_{i}" + string_data = [string_template for _ in range(batch_size)] + prompt_batch.set(field_name, NonTensorStack.from_list(string_data)) return prompt_batch, total_size_gb @@ -143,6 +207,7 @@ def __init__( seq_len: int, num_test_iterations: int, head_node_ip: str, + backend: str | None = None, worker_node_ip: str | None = None, output_csv: str | None = None, ): @@ -150,6 +215,7 @@ def __init__( Args: backend_config_path: Path to backend config YAML file + backend: Override storage_backend in config (e.g. 
"SimpleStorage") device: Device type ("cpu", "npu", "gpu") global_batch_size: Global batch size field_num: Number of fields @@ -160,6 +226,7 @@ def __init__( output_csv: Path to output CSV file (optional) """ self.backend_config_path = backend_config_path + self.backend_override = backend self.device = device self.global_batch_size = global_batch_size self.field_num = field_num @@ -199,6 +266,11 @@ def _prepare_config(self) -> dict[str, Any]: # Directly read the backend_config file, no merging with default config = OmegaConf.load(self.backend_config_path) + # Override storage_backend if specified via CLI + if self.backend_override is not None: + config.backend.storage_backend = self.backend_override + logger.info(f"Overriding storage_backend to: {self.backend_override}") + # If backend.storage_backend is SimpleStorage, override total_storage_size total_storage_size = self.global_batch_size * self.num_test_iterations if config.backend.storage_backend == "SimpleStorage": @@ -357,6 +429,12 @@ def main() -> None: required=True, help="Path to backend config YAML file", ) + parser.add_argument( + "--backend", + type=str, + default=None, + help="Override storage_backend in config (e.g. 
SimpleStorage, Yuanrong, MooncakeStore)", + ) parser.add_argument( "--device", type=str, @@ -385,8 +463,8 @@ def main() -> None: parser.add_argument( "--num_test_iterations", type=int, - default=3, - help="Number of test iterations (default: 3)", + default=4, + help="Number of test iterations (default: 4)", ) parser.add_argument( "--head_node_ip", @@ -400,12 +478,6 @@ def main() -> None: default=None, help="Worker node IP address (required for Yuanrong)", ) - parser.add_argument( - "--ray_address", - type=str, - default="auto", - help="Ray cluster address (default: auto)", - ) parser.add_argument( "--output_csv", type=str, @@ -415,10 +487,6 @@ def main() -> None: args = parser.parse_args() - # Initialize Ray - logger.info(f"Connecting to Ray cluster at {args.ray_address}") - ray.init(address=args.ray_address) - # Create and run tester tester = TQThroughputTester( backend_config_path=args.backend_config, @@ -428,6 +496,7 @@ def main() -> None: seq_len=args.seq_len, num_test_iterations=args.num_test_iterations, head_node_ip=args.head_node_ip, + backend=args.backend, worker_node_ip=args.worker_node_ip, output_csv=args.output_csv, ) diff --git a/scripts/performance_test/perftest_config.yaml b/scripts/performance_test/perftest_config.yaml new file mode 100644 index 0000000..88dca45 --- /dev/null +++ b/scripts/performance_test/perftest_config.yaml @@ -0,0 +1,56 @@ +# This is the default configuration of TransferQueue. Users may modify the default value +# and use transfer_queue.init(conf) to overwrite the config entries. + +controller: + # User-defined sampler. User can pass sampler instance to overwrite this string config. + sampler: SequentialSampler + # Whether return an empty BatchMeta to prevent request blocking when no enough data is available + polling_mode: False + # ZMQ Server IP & Ports (automatically generated during init) + zmq_info: null + + +backend: + # Pluggable storage/transport backend of TransferQueue. 
Choose from: + # SimpleStorage, Yuanrong, MooncakeStore, ... + storage_backend: SimpleStorage + + # For SimpleStorage: + SimpleStorage: + # Total number of samples + total_storage_size: 100000 + # Number of distributed storage units for SimpleStorage backend + num_data_storage_units: 16 + # ZMQ Server IP & Ports (automatically generated during init) + zmq_info: null + + # For MooncakeStore: + MooncakeStore: + # Whether to let TQ automatically init metadata_server. + auto_init: true + # Address of the HTTP metadata server + metadata_server: localhost:50050 + # Address of master server + master_server_address: localhost:50051 + # Address of local host. Set to "" to use Ray IP as local host address + local_hostname: "" + # Protocol for transmission. Choose from: tcp, rdma. (default: tcp) + protocol: tcp + # Memory segment size in bytes for mounting (default: 4GB) + global_segment_size: 4294967296 + # Local buffer size in bytes (default: 1GB) + local_buffer_size: 1073741824 + # Network device name. Set to "" to let Mooncake auto-pick devices + device_name: "" + + # For RayStore: + RayStore: + + # For Yuanrong: + Yuanrong: + # IP of local yuanrong datasystem worker + host: 127.0.0.1 + # Port of local yuanrong datasystem worker + port: 31501 + # Whether to enable npu transport + enable_yr_npu_transport: false diff --git a/scripts/performance_test/ray_perftest_baseline.py b/scripts/performance_test/ray_perftest_baseline.py new file mode 100644 index 0000000..6951713 --- /dev/null +++ b/scripts/performance_test/ray_perftest_baseline.py @@ -0,0 +1,375 @@ +#!/usr/bin/env python3 +# Copyright 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2025 The TransferQueue Team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import csv +import logging +import sys +import time +from pathlib import Path +from typing import Any + +import ray +import torch +from tensordict import NonTensorStack, TensorDict + +parent_dir = Path(__file__).resolve().parent.parent.parent +sys.path.append(str(parent_dir)) + +logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") +logger = logging.getLogger(__name__) + + +def create_test_case( + batch_size: int | None = None, + seq_length: int | None = None, + field_num: int | None = None, + device: str = "cpu", +) -> tuple[TensorDict, float]: + """Create a test case with complex data formats. 
+ + Creates TensorDict with: + - Regular tensors: (batch_size, seq_length) shape, each element is float32 + - Nested Tensors: variable-length sequences, each batch element has length + uniformly sampled from [1, seq_length] + - NonTensorStack wrapped strings: each string size ~= seq_length * 4 bytes + (to match memory footprint of one tensor element) + + Args: + batch_size: Batch size for the test case + seq_length: Maximum sequence length (used for regular tensors and + as upper bound for nested tensor sampling) + field_num: Total number of fields to create (distributed across types) + device: Device to create tensors on ("cpu", "npu", or "gpu") + + Returns: + Tuple of (TensorDict, total_size_gb) + """ + bytes_per_element = 4 # float32 + + # Calculate field distribution (1/3 each type, last fields may be regular) + num_regular_fields = (field_num + 2) // 3 + num_nested_fields = (field_num + 2) // 3 + num_nontensor_fields = field_num - num_regular_fields - num_nested_fields + + # Each regular tensor field: batch_size * seq_length * 4 bytes + regular_field_size_bytes = batch_size * seq_length * bytes_per_element + regular_field_size_gb = regular_field_size_bytes / (1024**3) + + # Nested tensor field: average length = (1 + seq_length) / 2, + # so avg size = batch_size * (1 + seq_length) / 2 * 4 bytes + avg_nested_length = (1 + seq_length) / 2 + nested_field_size_bytes = int(batch_size * avg_nested_length * bytes_per_element) + nested_field_size_gb = nested_field_size_bytes / (1024**3) + + # NonTensorStack string field: each string ~= seq_length * 4 bytes to match one tensor element + # Total for field: batch_size strings * seq_length * 4 bytes each + string_size_per_elem = seq_length * bytes_per_element + nontensor_field_size_bytes = batch_size * string_size_per_elem + nontensor_field_size_gb = nontensor_field_size_bytes / (1024**3) + + # Total size = sum of all field types + total_size_gb = ( + regular_field_size_gb * num_regular_fields + + nested_field_size_gb * 
num_nested_fields + + nontensor_field_size_gb * num_nontensor_fields + ) + + logger.info(f"Total data size: {total_size_gb:.6f} GB") + + # Determine torch device + torch_device = None + if device == "npu": + torch_device = "npu:0" + elif device == "gpu": + torch_device = "cuda:0" + + # Set seeds for reproducibility (within this process) + # Sample lengths for all nested fields at once + nested_lengths = [ + torch.randint(1, seq_length + 1, (batch_size,), generator=torch.Generator().manual_seed(42 + i)) + for i in range(num_nested_fields) + ] + + batch_size_tuple = (batch_size,) + + prompt_batch = TensorDict(batch_size=batch_size_tuple) + + # 1. Regular tensor fields + for i in range(num_regular_fields): + field_name = f"field_{i}" + tensor_data = torch.randn(batch_size, seq_length, dtype=torch.float32, device=torch_device) + prompt_batch.set(field_name, tensor_data) + + # 2. Nested Tensor fields (variable-length sequences) + for i in range(num_nested_fields): + field_name = f"nested_field_{i}" + actual_lengths = nested_lengths[i] + + # Create nested tensor from variable-length sequences + nested_list = [] + for j in range(batch_size): + length = actual_lengths[j].item() + # Create sequence data: arange for each element (representing sequence indices) + seq_data = torch.arange(length, dtype=torch.float32, device=torch_device) + nested_list.append(seq_data) + + nested_tensor = torch.nested.as_nested_tensor(nested_list, layout=torch.jagged) + prompt_batch.set(field_name, nested_tensor) + + # 3. 
NonTensorStack wrapped strings + # Each string ~= seq_length * 4 bytes to match one tensor element's memory footprint + string_char_count = seq_length * bytes_per_element # 4 bytes per char (unicode) + string_template = "x" * string_char_count + + for i in range(num_nontensor_fields): + field_name = f"nontensor_field_{i}" + string_data = [string_template for _ in range(batch_size)] + prompt_batch.set(field_name, NonTensorStack.from_list(string_data)) + + return prompt_batch, total_size_gb + + +@ray.remote +class RemoteDataStore: + """Ray remote actor that stores and retrieves data directly (without ray.put).""" + + def __init__(self): + self.stored_data = None + + def put_data(self, data: TensorDict) -> None: + self.stored_data = data + + def get_data(self) -> TensorDict: + return self.stored_data + + def clear_data(self) -> None: + self.stored_data = None + + +class RayBaselineTester: + """Ray baseline throughput tester - measures raw Ray data transfer performance.""" + + def __init__( + self, + global_batch_size: int, + field_num: int, + seq_len: int, + num_test_iterations: int, + head_node_ip: str, + worker_node_ip: str | None = None, + output_csv: str | None = None, + ): + """Initialize the Ray baseline tester. 
+ + Args: + global_batch_size: Global batch size + field_num: Number of fields + seq_len: Sequence length + num_test_iterations: Number of test iterations + head_node_ip: Head node IP address + worker_node_ip: Worker node IP address + output_csv: Path to output CSV file (optional) + """ + self.global_batch_size = global_batch_size + self.field_num = field_num + self.seq_len = seq_len + self.num_test_iterations = num_test_iterations + self.head_node_ip = head_node_ip + self.worker_node_ip = worker_node_ip + self.output_csv = output_csv + + # Initialize remote store on worker node + self._initialize_remote_store() + + def _initialize_remote_store(self) -> None: + """Initialize the RemoteDataStore actor on worker node.""" + writer_node = self.head_node_ip + reader_node = self.worker_node_ip if self.worker_node_ip else self.head_node_ip + + logger.info(f"Writer is on {writer_node}, Reader is on {reader_node}") + + self.remote_store = RemoteDataStore.options( + num_cpus=0.001, + resources={f"node:{reader_node}": 0.001}, + ).remote() + + logger.info(f"RemoteDataStore created on {reader_node}") + + def run_throughput_test(self) -> dict[str, Any]: + """Run the throughput test and print results. 
+ + Returns: + Dictionary with test results + """ + # Create test data + logger.info("Creating large batch for throughput test...") + start_create_data = time.perf_counter() + test_data, total_data_size_gb = create_test_case( + batch_size=self.global_batch_size, + seq_length=self.seq_len, + field_num=self.field_num, + device="cpu", + ) + end_create_data = time.perf_counter() + logger.info(f"Data creation time: {end_create_data - start_create_data:.8f}s") + + # PUT operation - pass data directly to remote actor + logger.info("Starting PUT operation...") + start_put = time.perf_counter() + ray.get(self.remote_store.put_data.remote(test_data)) + end_put = time.perf_counter() + put_time = end_put - start_put + put_gbit_per_sec = (total_data_size_gb * 8) / put_time + + time.sleep(2) + + # GET operation - retrieve data from remote actor + logger.info("Starting GET operation...") + start_get = time.perf_counter() + _ = ray.get(self.remote_store.get_data.remote()) + end_get = time.perf_counter() + get_time = end_get - start_get + get_gbit_per_sec = (total_data_size_gb * 8) / get_time + + # Clear data + ray.get(self.remote_store.clear_data.remote()) + + # Calculate total throughput + total_gbit_per_sec = (total_data_size_gb * 16) / (put_time + get_time) + + # Print summary + logger.info("=" * 60) + logger.info("RAY BASELINE THROUGHPUT TEST SUMMARY") + logger.info("=" * 60) + logger.info(f"Total Data Size: {total_data_size_gb:.6f} GB") + logger.info(f"PUT Time: {put_time:.8f}s") + logger.info(f"GET Time: {get_time:.8f}s") + logger.info(f"PUT Throughput: {put_gbit_per_sec:.8f} Gb/s") + logger.info(f"GET Throughput: {get_gbit_per_sec:.8f} Gb/s") + logger.info(f"Total Throughput (round-trip): {total_gbit_per_sec:.8f} Gb/s") + logger.info("=" * 60) + + return { + "backend": "RayBaseline", + "device": "cpu", + "total_data_size_gb": total_data_size_gb, + "put_time": put_time, + "get_time": get_time, + "put_gbit_per_sec": put_gbit_per_sec, + "get_gbit_per_sec": get_gbit_per_sec, + 
"total_gbit_per_sec": total_gbit_per_sec, + } + + +def write_results_to_csv(results: list[dict[str, Any]], output_path: str) -> None: + """Write test results to CSV file. + + Args: + results: List of result dictionaries + output_path: Path to output CSV file + """ + if not results: + return + + fieldnames = list(results[0].keys()) + + with open(output_path, "w", newline="") as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + writer.writeheader() + for result in results: + writer.writerow(result) + + logger.info(f"Results written to {output_path}") + + +def main() -> None: + """Main entry point for the Ray baseline perftest script.""" + parser = argparse.ArgumentParser(description="Ray Baseline Throughput Test") + parser.add_argument( + "--global_batch_size", + type=int, + default=1024, + help="Global batch size (default: 1024)", + ) + parser.add_argument( + "--field_num", + type=int, + default=10, + help="Number of fields (default: 10)", + ) + parser.add_argument( + "--seq_len", + type=int, + default=8192, + help="Sequence length (default: 8192)", + ) + parser.add_argument( + "--num_test_iterations", + type=int, + default=3, + help="Number of test iterations (default: 3)", + ) + parser.add_argument( + "--head_node_ip", + type=str, + required=True, + help="Head node IP address", + ) + parser.add_argument( + "--worker_node_ip", + type=str, + default=None, + help="Worker node IP address (optional)", + ) + parser.add_argument( + "--output_csv", + type=str, + default=None, + help="Path to output CSV file (optional)", + ) + + args = parser.parse_args() + + # Create and run tester + tester = RayBaselineTester( + global_batch_size=args.global_batch_size, + field_num=args.field_num, + seq_len=args.seq_len, + num_test_iterations=args.num_test_iterations, + head_node_ip=args.head_node_ip, + worker_node_ip=args.worker_node_ip, + output_csv=args.output_csv, + ) + + # Run test multiple times + all_results = [] + for i in range(args.num_test_iterations): + 
logger.info("-" * 60) + logger.info(f"Iteration {i + 1}/{args.num_test_iterations}") + logger.info("-" * 60) + result = tester.run_throughput_test() + all_results.append(result) + + # Write to CSV if output path is specified + if args.output_csv: + write_results_to_csv(all_results, args.output_csv) + + logger.info("Ray baseline throughput test completed successfully!") + + +if __name__ == "__main__": + main() diff --git a/scripts/performance_test/run_perf_test.sh b/scripts/performance_test/run_perf_test.sh new file mode 100755 index 0000000..58a6930 --- /dev/null +++ b/scripts/performance_test/run_perf_test.sh @@ -0,0 +1,81 @@ +#!/bin/bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +RESULTS_DIR="${SCRIPT_DIR}/results" +PERFTEST_PY="${SCRIPT_DIR}/perftest.py" +RAY_PERFTEST_PY="${SCRIPT_DIR}/ray_perftest_baseline.py" +CONFIG_YAML="${SCRIPT_DIR}/perftest_config.yaml" + +mkdir -p "${RESULTS_DIR}" + +# ========== User Configuration ========== +# Modify these based on your environment +HEAD_NODE_IP="${HEAD_NODE_IP:-127.0.0.1}" +WORKER_NODE_IP="${WORKER_NODE_IP:-127.0.0.1}" +DEVICE="${DEVICE:-cpu}" +NUM_TEST_ITERATIONS="${NUM_TEST_ITERATIONS:-4}" +# ======================================== + +# Backends to test (passed via --backend to perftest.py) +BACKENDS=("SimpleStorage" "Yuanrong" "MooncakeStore") + +# Test settings: global_batch_size, field_num, seq_len, name +declare -a SETTINGS=( + "128,3,1024,Small" + "1024,9,8192,Medium" + "4096,21,128000,Large" +) + +# ---- TransferQueue perftest ---- +for backend in "${BACKENDS[@]}"; do + echo "==========================================" + echo "Testing backend: ${backend}" + echo "==========================================" + + for setting in "${SETTINGS[@]}"; do + IFS=',' read -r batch_size field_num seq_len name <<< "$setting" + output_csv="${RESULTS_DIR}/${backend,,}_${name,,}.csv" + + echo " Setting: ${name} (batch=${batch_size}, fields=${field_num}, seq=${seq_len})" + + if [[ 
"$backend" == "Yuanrong" ]]; then + python "${PERFTEST_PY}" --backend_config="${CONFIG_YAML}" --backend="${backend}" \ + --device="${DEVICE}" \ + --global_batch_size="${batch_size}" --field_num="${field_num}" --seq_len="${seq_len}" \ + --num_test_iterations="${NUM_TEST_ITERATIONS}" \ + --head_node_ip="${HEAD_NODE_IP}" --worker_node_ip="${WORKER_NODE_IP}" \ + --output_csv="${output_csv}" + else + python "${PERFTEST_PY}" --backend_config="${CONFIG_YAML}" --backend="${backend}" \ + --device="${DEVICE}" \ + --global_batch_size="${batch_size}" --field_num="${field_num}" --seq_len="${seq_len}" \ + --num_test_iterations="${NUM_TEST_ITERATIONS}" \ + --head_node_ip="${HEAD_NODE_IP}" \ + --output_csv="${output_csv}" + fi + done +done + +# ---- Ray baseline ---- +echo "==========================================" +echo "Testing backend: Ray (baseline)" +echo "==========================================" +for setting in "${SETTINGS[@]}"; do + IFS=',' read -r batch_size field_num seq_len name <<< "$setting" + output_csv="${RESULTS_DIR}/ray_baseline_${name,,}.csv" + + echo " Setting: ${name} (batch=${batch_size}, fields=${field_num}, seq=${seq_len})" + + python "${RAY_PERFTEST_PY}" \ + --global_batch_size="${batch_size}" --field_num="${field_num}" --seq_len="${seq_len}" \ + --num_test_iterations="${NUM_TEST_ITERATIONS}" \ + --head_node_ip="${HEAD_NODE_IP}" --worker_node_ip="${WORKER_NODE_IP}" \ + --output_csv="${output_csv}" +done + +# ---- Draw figures ---- +python "${SCRIPT_DIR}/draw_figure.py" + +echo "" +echo "All tests completed!" 
From fa5b131ca4798a91b1c19ed95dc495b65dc3deba Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Wed, 25 Mar 2026 10:09:20 +0800 Subject: [PATCH 10/29] add license to draw_figure.py Signed-off-by: tianyi-ge --- scripts/performance_test/draw_figure.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/scripts/performance_test/draw_figure.py b/scripts/performance_test/draw_figure.py index 1d96a65..bb7910e 100644 --- a/scripts/performance_test/draw_figure.py +++ b/scripts/performance_test/draw_figure.py @@ -1,3 +1,19 @@ +#!/usr/bin/env python3 +# Copyright 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2025 The TransferQueue Team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from pathlib import Path import matplotlib.pyplot as plt From 537b7c6ef1cece453a4c0557ab4150f721eb3f74 Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Wed, 25 Mar 2026 10:13:46 +0800 Subject: [PATCH 11/29] simplify run_perf_test.sh Signed-off-by: tianyi-ge --- scripts/performance_test/run_perf_test.sh | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/scripts/performance_test/run_perf_test.sh b/scripts/performance_test/run_perf_test.sh index 58a6930..50b2863 100755 --- a/scripts/performance_test/run_perf_test.sh +++ b/scripts/performance_test/run_perf_test.sh @@ -39,21 +39,12 @@ for backend in "${BACKENDS[@]}"; do echo " Setting: ${name} (batch=${batch_size}, fields=${field_num}, seq=${seq_len})" - if [[ "$backend" == "Yuanrong" ]]; then - python "${PERFTEST_PY}" --backend_config="${CONFIG_YAML}" --backend="${backend}" \ - --device="${DEVICE}" \ - --global_batch_size="${batch_size}" --field_num="${field_num}" --seq_len="${seq_len}" \ - --num_test_iterations="${NUM_TEST_ITERATIONS}" \ - --head_node_ip="${HEAD_NODE_IP}" --worker_node_ip="${WORKER_NODE_IP}" \ - --output_csv="${output_csv}" - else - python "${PERFTEST_PY}" --backend_config="${CONFIG_YAML}" --backend="${backend}" \ - --device="${DEVICE}" \ - --global_batch_size="${batch_size}" --field_num="${field_num}" --seq_len="${seq_len}" \ - --num_test_iterations="${NUM_TEST_ITERATIONS}" \ - --head_node_ip="${HEAD_NODE_IP}" \ - --output_csv="${output_csv}" - fi + python "${PERFTEST_PY}" --backend_config="${CONFIG_YAML}" --backend="${backend}" \ + --device="${DEVICE}" \ + --global_batch_size="${batch_size}" --field_num="${field_num}" --seq_len="${seq_len}" \ + --num_test_iterations="${NUM_TEST_ITERATIONS}" \ + --head_node_ip="${HEAD_NODE_IP}" --worker_node_ip="${WORKER_NODE_IP}" \ + --output_csv="${output_csv}" done done From 8a17c1951317f886995a9d58e26eed007fb17aa9 Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Wed, 25 Mar 2026 14:39:57 +0800 Subject: [PATCH 12/29] change 
client host for yuanrong Signed-off-by: tianyi-ge --- scripts/performance_test/perftest.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/scripts/performance_test/perftest.py b/scripts/performance_test/perftest.py index 0541296..53614bf 100644 --- a/scripts/performance_test/perftest.py +++ b/scripts/performance_test/perftest.py @@ -307,9 +307,24 @@ def _initialize_clients(self) -> None: writer_options["resources"]["NPU"] = 1 reader_options["resources"]["NPU"] = 1 + # Prepare configs for writer and reader + # For Yuanrong backend, set different hosts for writer and reader + if self.backend == "Yuanrong": + import copy + + writer_config = copy.deepcopy(self.full_config) + reader_config = copy.deepcopy(self.full_config) + writer_config["backend"]["Yuanrong"]["host"] = self.head_node_ip + reader_config["backend"]["Yuanrong"]["host"] = self.worker_node_ip + logger.info(f"Writer Yuanrong host: {self.head_node_ip}") + logger.info(f"Reader Yuanrong host: {self.worker_node_ip}") + else: + writer_config = self.full_config + reader_config = self.full_config + # Create writer and reader actors - self.writer = TQClientActor.options(**writer_options).remote(self.full_config) - self.reader = TQClientActor.options(**reader_options).remote(self.full_config) + self.writer = TQClientActor.options(**writer_options).remote(writer_config) + self.reader = TQClientActor.options(**reader_options).remote(reader_config) # Initialize transfer_queue logger.info(f"Using {self.backend} as storage backend.") From 60cdcaa24e639783441314e8c2ebdbab09f9b9a2 Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Wed, 25 Mar 2026 15:09:57 +0800 Subject: [PATCH 13/29] use d2h and h2d instead of d2d Signed-off-by: tianyi-ge --- tests/test_yuanrong_storage_client_e2e.py | 6 +++--- transfer_queue/storage/clients/yuanrong_client.py | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/test_yuanrong_storage_client_e2e.py 
b/tests/test_yuanrong_storage_client_e2e.py index 519335f..2a79ec9 100644 --- a/tests/test_yuanrong_storage_client_e2e.py +++ b/tests/test_yuanrong_storage_client_e2e.py @@ -38,18 +38,18 @@ def __init__(self, host, port, device_id): def init(self): pass - def dev_mset(self, keys, values): + def mset_d2h(self, keys, values): for k, v in zip(keys, values, strict=True): assert v.device.type == "npu" self.storage[k] = v - def dev_mget(self, keys, out_tensors): + def mget_h2d(self, keys, out_tensors): for i, k in enumerate(keys): # Note: If key is missing, tensor remains unchanged (mock limitation) if k in self.storage: out_tensors[i].copy_(self.storage[k]) - def dev_delete(self, keys): + def delete(self, keys): for k in keys: self.storage.pop(k, None) diff --git a/transfer_queue/storage/clients/yuanrong_client.py b/transfer_queue/storage/clients/yuanrong_client.py index 41219c2..80f5bb7 100644 --- a/transfer_queue/storage/clients/yuanrong_client.py +++ b/transfer_queue/storage/clients/yuanrong_client.py @@ -123,12 +123,12 @@ def put(self, keys: list[str], values: list[Any]): for i in range(0, len(keys), self.KEYS_LIMIT): batch_keys = keys[i : i + self.KEYS_LIMIT] batch_values = values[i : i + self.KEYS_LIMIT] - # _npu_ds_client.dev_mset doesn't support to overwrite + # mset_d2h cannot overwrite existing keys try: - self._ds_client.dev_delete(batch_keys) + self._ds_client.delete(batch_keys) except Exception: pass - self._ds_client.dev_mset(batch_keys, batch_values) + self._ds_client.mset_d2h(batch_keys, batch_values) def supports_get(self, strategy_tag: str) -> bool: """Matches 'DsTensorClient' Strategy tag.""" @@ -147,8 +147,8 @@ def get(self, keys: list[str], **kwargs) -> list[Optional[Any]]: batch_dtypes = dtypes[i : i + self.KEYS_LIMIT] batch_values = self._create_empty_npu_tensorlist(batch_shapes, batch_dtypes) - self._ds_client.dev_mget(batch_keys, batch_values) - # Todo(dpj): consider checking and logging keys that fail during dev_mget + 
self._ds_client.mget_h2d(batch_keys, batch_values) + # Todo(dpj): consider checking and logging keys that fail during mget_h2d results.extend(batch_values) return results @@ -161,7 +161,7 @@ def clear(self, keys: list[str]): for i in range(0, len(keys), self.KEYS_LIMIT): batch = keys[i : i + self.KEYS_LIMIT] # Todo(dpj): Test call clear when no (key,value) put in ds - self._ds_client.dev_delete(batch) + self._ds_client.delete(batch) def _create_empty_npu_tensorlist(self, shapes, dtypes): """ From cdccf6df0345eb3971d8e3d4bf5b67e1fdc4759f Mon Sep 17 00:00:00 2001 From: 0oshowero0 Date: Wed, 25 Mar 2026 15:48:42 +0800 Subject: [PATCH 14/29] fix nested tensor for NPU Signed-off-by: 0oshowero0 --- scripts/performance_test/README_PERFTEST.md | 72 +++++++++++++++++-- scripts/performance_test/perftest.py | 57 ++++++++------- .../performance_test/ray_perftest_baseline.py | 57 ++++++++------- 3 files changed, 132 insertions(+), 54 deletions(-) diff --git a/scripts/performance_test/README_PERFTEST.md b/scripts/performance_test/README_PERFTEST.md index cf62efa..e05f9a8 100644 --- a/scripts/performance_test/README_PERFTEST.md +++ b/scripts/performance_test/README_PERFTEST.md @@ -51,33 +51,93 @@ For device support of each backend, - `Yuanrong` supports `cpu` and `npu` - `MooncakeStore` supports `cpu` and `gpu` +## Test Data Format + +The test case creates TensorDict with three types of fields: + +1. **Regular tensors**: Shape `(batch_size, seq_length)`, float32 +2. **Nested tensors** (non-NPU devices): Variable-length sequences with lengths forming an arithmetic progression from 1 to `seq_length`. For a batch of size N, element j has length `1 + j * (seq_length - 1) / (N - 1)`. This gives an average nested length of approximately `seq_length / 2`, making the nested column size roughly half of a regular tensor column. +3. **NonTensorStack strings**: Each string is `seq_length * 4` bytes to match the memory footprint of one tensor element. 
+ +### NPU Fallback + +NPU does not support nested tensors. When running with `--device=npu`, the nested tensor fields are replaced with regular tensors of shape `(batch_size, seq_length // 2)` to maintain comparable total data size while avoiding nested tensor operations. + ## Yuanrong Backend For Yuanrong backend, writer runs on head node and reader runs on worker node. +## Running Full Test Suite + +The `run_perf_test.sh` script automates the full performance test suite: + +```bash +cd scripts/performance_test +./run_perf_test.sh +``` + +### Configuration + +Configure the test environment via environment variables: + +| Variable | Description | Default | +|----------|-------------|---------| +| `HEAD_NODE_IP` | Head node IP address | 127.0.0.1 | +| `WORKER_NODE_IP` | Worker node IP address | 127.0.0.1 | +| `DEVICE` | Device type (cpu, npu, gpu) | cpu | +| `NUM_TEST_ITERATIONS` | Number of iterations per test | 4 | + +Example: +```bash +HEAD_NODE_IP=192.168.0.1 WORKER_NODE_IP=192.168.0.2 DEVICE=npu ./run_perf_test.sh +``` + +### Test Matrix + +The script tests all combinations of: +- **Backends**: SimpleStorage, Yuanrong, MooncakeStore, Ray (baseline) +- **Data sizes**: Small (batch=128, fields=3, seq=1024), Medium (batch=1024, fields=9, seq=8192), Large (batch=4096, fields=21, seq=128000) + +### Output + +- CSV results are saved to `results/{backend}_{size}.csv` (e.g., `results/simplestorage_small.csv`) +- A performance comparison chart is generated as `results/performance_comparison.pdf` + +### draw_figure.py + +After running the tests, `draw_figure.py` reads all CSV files from the `results/` directory and generates a bar chart comparing total throughput (Gbps) across backends and data sizes. 
+ ## Examples -### SimpleStorage/Mooncake backend +Individual test examples using `perftest.py`: + +### SimpleStorage backend ```bash -python perftest.py --backend_config=perftest_config.yaml \ +python perftest.py --backend_config=perftest_config.yaml --backend=SimpleStorage \ --head_node_ip=192.168.0.1 ``` ### Yuanrong backend ```bash -python perftest.py --backend_config=perftest_config.yaml \ +python perftest.py --backend_config=perftest_config.yaml --backend=Yuanrong \ --head_node_ip=192.168.0.1 --worker_node_ip=192.168.0.2 ``` -### NPU device test +### MooncakeStore backend +```bash +python perftest.py --backend_config=perftest_config.yaml --backend=MooncakeStore \ + --head_node_ip=192.168.0.1 +``` + +### NPU device test (Yuanrong backend) ```bash -python perftest.py --backend_config=perftest_config.yaml --device=npu \ +python perftest.py --backend_config=perftest_config.yaml --backend=Yuanrong --device=npu \ --head_node_ip=192.168.0.1 --worker_node_ip=192.168.0.2 ``` ### Output to CSV ```bash -python perftest.py --backend_config=perftest_config.yaml \ +python perftest.py --backend_config=perftest_config.yaml --backend=SimpleStorage \ --head_node_ip=192.168.0.1 --output_csv=results.csv ``` diff --git a/scripts/performance_test/perftest.py b/scripts/performance_test/perftest.py index 53614bf..46e38a8 100644 --- a/scripts/performance_test/perftest.py +++ b/scripts/performance_test/perftest.py @@ -46,15 +46,16 @@ def create_test_case( Creates TensorDict with: - Regular tensors: (batch_size, seq_length) shape, each element is float32 - - Nested Tensors: variable-length sequences, each batch element has length - uniformly sampled from [1, seq_length] + - Nested Tensors (non-NPU): variable-length sequences with lengths forming an + arithmetic progression from 1 to seq_length (average length ≈ seq_length/2) + - Nested Tensors (NPU): regular tensors of shape (batch_size, seq_length//2) - NonTensorStack wrapped strings: each string size ~= seq_length * 4 bytes (to match 
memory footprint of one tensor element) Args: batch_size: Batch size for the test case seq_length: Maximum sequence length (used for regular tensors and - as upper bound for nested tensor sampling) + as upper bound for nested tensor lengths) field_num: Total number of fields to create (distributed across types) device: Device to create tensors on ("cpu", "npu", or "gpu") @@ -72,10 +73,15 @@ def create_test_case( regular_field_size_bytes = batch_size * seq_length * bytes_per_element regular_field_size_gb = regular_field_size_bytes / (1024**3) - # Nested tensor field: average length = (1 + seq_length) / 2, + # Nested tensor field: average length = (1 + seq_length) / 2 (arithmetic progression), # so avg size = batch_size * (1 + seq_length) / 2 * 4 bytes - avg_nested_length = (1 + seq_length) / 2 - nested_field_size_bytes = int(batch_size * avg_nested_length * bytes_per_element) + # For NPU, nested fields become regular tensors of seq_length // 2 + if device == "npu": + avg_nested_length = seq_length // 2 + nested_field_size_bytes = int(batch_size * avg_nested_length * bytes_per_element) + else: + avg_nested_length = (1 + seq_length) / 2 + nested_field_size_bytes = int(batch_size * avg_nested_length * bytes_per_element) nested_field_size_gb = nested_field_size_bytes / (1024**3) # NonTensorStack string field: each string ~= seq_length * 4 bytes to match one tensor element @@ -101,11 +107,8 @@ def create_test_case( torch_device = "cuda:0" # Set seeds for reproducibility (within this process) - # Sample lengths for all nested fields at once - nested_lengths = [ - torch.randint(1, seq_length + 1, (batch_size,), generator=torch.Generator().manual_seed(42 + i)) - for i in range(num_nested_fields) - ] + # For non-NPU: arithmetic progression lengths from 1 to seq_length for each nested field + # For NPU: nested fields become regular tensors of seq_length // 2 batch_size_tuple = (batch_size,) @@ -117,21 +120,27 @@ def create_test_case( tensor_data = torch.randn(batch_size, 
seq_length, dtype=torch.float32, device=torch_device) prompt_batch.set(field_name, tensor_data) - # 2. Nested Tensor fields (variable-length sequences) + # 2. Nested Tensor fields (variable-length sequences) or regular tensors for NPU for i in range(num_nested_fields): field_name = f"nested_field_{i}" - actual_lengths = nested_lengths[i] - - # Create nested tensor from variable-length sequences - nested_list = [] - for j in range(batch_size): - length = actual_lengths[j].item() - # Create sequence data: arange for each element (representing sequence indices) - seq_data = torch.arange(length, dtype=torch.float32, device=torch_device) - nested_list.append(seq_data) - - nested_tensor = torch.nested.as_nested_tensor(nested_list, layout=torch.jagged) - prompt_batch.set(field_name, nested_tensor) + + if device == "npu": + # For NPU: create a regular tensor of seq_length // 2 + tensor_data = torch.randn(batch_size, seq_length // 2, dtype=torch.float32, device=torch_device) + prompt_batch.set(field_name, tensor_data) + else: + # For non-NPU: create nested tensor with arithmetic progression lengths + # Lengths go from 1 to seq_length in equal increments + step = (seq_length - 1) / (batch_size - 1) if batch_size > 1 else 0 + nested_list = [] + for j in range(batch_size): + length = int(round(1 + j * step)) + length = max(1, min(length, seq_length)) # Clamp to [1, seq_length] + seq_data = torch.arange(length, dtype=torch.float32, device=torch_device) + nested_list.append(seq_data) + + nested_tensor = torch.nested.as_nested_tensor(nested_list, layout=torch.jagged) + prompt_batch.set(field_name, nested_tensor) # 3. 
NonTensorStack wrapped strings # Each string ~= seq_length * 4 bytes to match one tensor element's memory footprint diff --git a/scripts/performance_test/ray_perftest_baseline.py b/scripts/performance_test/ray_perftest_baseline.py index 6951713..30eb9ad 100644 --- a/scripts/performance_test/ray_perftest_baseline.py +++ b/scripts/performance_test/ray_perftest_baseline.py @@ -43,15 +43,16 @@ def create_test_case( Creates TensorDict with: - Regular tensors: (batch_size, seq_length) shape, each element is float32 - - Nested Tensors: variable-length sequences, each batch element has length - uniformly sampled from [1, seq_length] + - Nested Tensors (non-NPU): variable-length sequences with lengths forming an + arithmetic progression from 1 to seq_length (average length ≈ seq_length/2) + - Nested Tensors (NPU): regular tensors of shape (batch_size, seq_length//2) - NonTensorStack wrapped strings: each string size ~= seq_length * 4 bytes (to match memory footprint of one tensor element) Args: batch_size: Batch size for the test case seq_length: Maximum sequence length (used for regular tensors and - as upper bound for nested tensor sampling) + as upper bound for nested tensor lengths) field_num: Total number of fields to create (distributed across types) device: Device to create tensors on ("cpu", "npu", or "gpu") @@ -69,10 +70,15 @@ def create_test_case( regular_field_size_bytes = batch_size * seq_length * bytes_per_element regular_field_size_gb = regular_field_size_bytes / (1024**3) - # Nested tensor field: average length = (1 + seq_length) / 2, + # Nested tensor field: average length = (1 + seq_length) / 2 (arithmetic progression), # so avg size = batch_size * (1 + seq_length) / 2 * 4 bytes - avg_nested_length = (1 + seq_length) / 2 - nested_field_size_bytes = int(batch_size * avg_nested_length * bytes_per_element) + # For NPU, nested fields become regular tensors of seq_length // 2 + if device == "npu": + avg_nested_length = seq_length // 2 + nested_field_size_bytes = 
int(batch_size * avg_nested_length * bytes_per_element) + else: + avg_nested_length = (1 + seq_length) / 2 + nested_field_size_bytes = int(batch_size * avg_nested_length * bytes_per_element) nested_field_size_gb = nested_field_size_bytes / (1024**3) # NonTensorStack string field: each string ~= seq_length * 4 bytes to match one tensor element @@ -98,11 +104,8 @@ def create_test_case( torch_device = "cuda:0" # Set seeds for reproducibility (within this process) - # Sample lengths for all nested fields at once - nested_lengths = [ - torch.randint(1, seq_length + 1, (batch_size,), generator=torch.Generator().manual_seed(42 + i)) - for i in range(num_nested_fields) - ] + # For non-NPU: arithmetic progression lengths from 1 to seq_length for each nested field + # For NPU: nested fields become regular tensors of seq_length // 2 batch_size_tuple = (batch_size,) @@ -114,21 +117,27 @@ def create_test_case( tensor_data = torch.randn(batch_size, seq_length, dtype=torch.float32, device=torch_device) prompt_batch.set(field_name, tensor_data) - # 2. Nested Tensor fields (variable-length sequences) + # 2. 
Nested Tensor fields (variable-length sequences) or regular tensors for NPU for i in range(num_nested_fields): field_name = f"nested_field_{i}" - actual_lengths = nested_lengths[i] - - # Create nested tensor from variable-length sequences - nested_list = [] - for j in range(batch_size): - length = actual_lengths[j].item() - # Create sequence data: arange for each element (representing sequence indices) - seq_data = torch.arange(length, dtype=torch.float32, device=torch_device) - nested_list.append(seq_data) - - nested_tensor = torch.nested.as_nested_tensor(nested_list, layout=torch.jagged) - prompt_batch.set(field_name, nested_tensor) + + if device == "npu": + # For NPU: create a regular tensor of seq_length // 2 + tensor_data = torch.randn(batch_size, seq_length // 2, dtype=torch.float32, device=torch_device) + prompt_batch.set(field_name, tensor_data) + else: + # For non-NPU: create nested tensor with arithmetic progression lengths + # Lengths go from 1 to seq_length in equal increments + step = (seq_length - 1) / (batch_size - 1) if batch_size > 1 else 0 + nested_list = [] + for j in range(batch_size): + length = int(round(1 + j * step)) + length = max(1, min(length, seq_length)) # Clamp to [1, seq_length] + seq_data = torch.arange(length, dtype=torch.float32, device=torch_device) + nested_list.append(seq_data) + + nested_tensor = torch.nested.as_nested_tensor(nested_list, layout=torch.jagged) + prompt_batch.set(field_name, nested_tensor) # 3. NonTensorStack wrapped strings # Each string ~= seq_length * 4 bytes to match one tensor element's memory footprint From d043cf96501523e61c518c6cfd6e67da756172aa Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Wed, 25 Mar 2026 17:26:14 +0800 Subject: [PATCH 15/29] 1. 
delete old samples Signed-off-by: tianyi-ge --- scripts/performance_test/perftest.py | 17 +++++++++++++++++ scripts/performance_test/run_perf_test.sh | 2 ++ 2 files changed, 19 insertions(+) diff --git a/scripts/performance_test/perftest.py b/scripts/performance_test/perftest.py index 46e38a8..9d2d06f 100644 --- a/scripts/performance_test/perftest.py +++ b/scripts/performance_test/perftest.py @@ -199,6 +199,12 @@ def get_data(self, partition_id: str, keys: list[str] | None = None) -> None: keys = self.test_keys tq.kv_batch_get(keys=keys, partition_id=partition_id) + def delete(self, partition_id: str, keys: list[str] | None = None) -> None: + """Delete data from storage using kv_batch_delete.""" + if keys is None: + keys = self.test_keys + tq.kv_batch_delete(keys=keys, partition_id=partition_id) + def close(self) -> None: """Close transfer_queue.""" tq.close() @@ -358,6 +364,7 @@ def run_throughput_test(self) -> dict[str, Any]: device=self.device, ) ) + logger.info(f"Total Data Size: {total_data_size_gb:.6f} GB") end_create_data = time.perf_counter() logger.info(f"Data creation time: {end_create_data - start_create_data:.8f}s") @@ -389,6 +396,16 @@ def run_throughput_test(self) -> dict[str, Any]: get_gbit_per_sec = (total_data_size_gb * 8) / get_time get_gbyte_per_sec = total_data_size_gb / get_time + time.sleep(2) + + # DELETE operation using kv_batch_delete + logger.info("Starting DELETE operation (kv_batch_delete)...") + start_delete = time.perf_counter() + ray.get(self.writer.delete.remote(partition_id=partition_id, keys=keys)) + end_delete = time.perf_counter() + delete_time = end_delete - start_delete + logger.info(f"DELETE Time: {delete_time:.8f}s") + # Print summary total_gbit_per_sec = (total_data_size_gb * 16) / (put_time + get_time) total_gbyte_per_sec = (total_data_size_gb * 2) / (put_time + get_time) diff --git a/scripts/performance_test/run_perf_test.sh b/scripts/performance_test/run_perf_test.sh index 50b2863..630869e 100755 --- 
a/scripts/performance_test/run_perf_test.sh +++ b/scripts/performance_test/run_perf_test.sh @@ -45,6 +45,8 @@ for backend in "${BACKENDS[@]}"; do --num_test_iterations="${NUM_TEST_ITERATIONS}" \ --head_node_ip="${HEAD_NODE_IP}" --worker_node_ip="${WORKER_NODE_IP}" \ --output_csv="${output_csv}" + + sleep 10 done done From dc51c265e47b4b93a038d4b6ca693f44f9a8177c Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Wed, 25 Mar 2026 17:31:22 +0800 Subject: [PATCH 16/29] kv_batch_delete -> kv_clear Signed-off-by: tianyi-ge --- scripts/performance_test/perftest.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/performance_test/perftest.py b/scripts/performance_test/perftest.py index 9d2d06f..7b2f900 100644 --- a/scripts/performance_test/perftest.py +++ b/scripts/performance_test/perftest.py @@ -200,10 +200,10 @@ def get_data(self, partition_id: str, keys: list[str] | None = None) -> None: tq.kv_batch_get(keys=keys, partition_id=partition_id) def delete(self, partition_id: str, keys: list[str] | None = None) -> None: - """Delete data from storage using kv_batch_delete.""" + """Delete data from storage using kv_clear.""" if keys is None: keys = self.test_keys - tq.kv_batch_delete(keys=keys, partition_id=partition_id) + tq.kv_clear(keys=keys, partition_id=partition_id) def close(self) -> None: """Close transfer_queue.""" @@ -398,8 +398,8 @@ def run_throughput_test(self) -> dict[str, Any]: time.sleep(2) - # DELETE operation using kv_batch_delete - logger.info("Starting DELETE operation (kv_batch_delete)...") + # DELETE operation using kv_clear + logger.info("Starting DELETE operation (kv_clear)...") start_delete = time.perf_counter() ray.get(self.writer.delete.remote(partition_id=partition_id, keys=keys)) end_delete = time.perf_counter() From 8d621d057490030815a3728cdc95c64abedcef48 Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Wed, 25 Mar 2026 17:54:50 +0800 Subject: [PATCH 17/29] clean test data Signed-off-by: tianyi-ge --- 
scripts/performance_test/perftest.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/performance_test/perftest.py b/scripts/performance_test/perftest.py index 7b2f900..55283e1 100644 --- a/scripts/performance_test/perftest.py +++ b/scripts/performance_test/perftest.py @@ -204,6 +204,7 @@ def delete(self, partition_id: str, keys: list[str] | None = None) -> None: if keys is None: keys = self.test_keys tq.kv_clear(keys=keys, partition_id=partition_id) + self.test_data = None def close(self) -> None: """Close transfer_queue.""" tq.close() @@ -550,6 +551,7 @@ def main() -> None: logger.info("-" * 60) result = tester.run_throughput_test() all_results.append(result) + time.sleep(10) # Write to CSV if output path is specified if args.output_csv: From 590af4565ff2190e4c0914e8e887bc1a1aa6e31b Mon Sep 17 00:00:00 2001 From: 0oshowero0 Date: Thu, 26 Mar 2026 14:05:17 +0800 Subject: [PATCH 18/29] update test scenario and optimize data gen speed Signed-off-by: 0oshowero0 # Conflicts: # scripts/performance_test/perftest.py --- scripts/performance_test/perftest.py | 71 ++++++++++--------- .../performance_test/ray_perftest_baseline.py | 66 ++++++++--------- scripts/performance_test/run_perf_test.sh | 8 +-- 3 files changed, 73 insertions(+), 72 deletions(-) diff --git a/scripts/performance_test/perftest.py b/scripts/performance_test/perftest.py index 55283e1..dfe78a2 100644 --- a/scripts/performance_test/perftest.py +++ b/scripts/performance_test/perftest.py @@ -17,6 +17,7 @@ import argparse import csv import logging +import os import sys import time from pathlib import Path @@ -121,6 +122,11 @@ def create_test_case( prompt_batch.set(field_name, tensor_data) # 2. 
Nested Tensor fields (variable-length sequences) or regular tensors for NPU + if device != "npu": + step = (seq_length - 1) / (batch_size - 1) if batch_size > 1 else 0 + lengths = [max(1, min(int(round(1 + j * step)), seq_length)) for j in range(batch_size)] + total_elements = sum(lengths) + for i in range(num_nested_fields): field_name = f"nested_field_{i}" @@ -129,27 +135,20 @@ def create_test_case( tensor_data = torch.randn(batch_size, seq_length // 2, dtype=torch.float32, device=torch_device) prompt_batch.set(field_name, tensor_data) else: - # For non-NPU: create nested tensor with arithmetic progression lengths - # Lengths go from 1 to seq_length in equal increments - step = (seq_length - 1) / (batch_size - 1) if batch_size > 1 else 0 - nested_list = [] - for j in range(batch_size): - length = int(round(1 + j * step)) - length = max(1, min(length, seq_length)) # Clamp to [1, seq_length] - seq_data = torch.arange(length, dtype=torch.float32, device=torch_device) - nested_list.append(seq_data) - - nested_tensor = torch.nested.as_nested_tensor(nested_list, layout=torch.jagged) + flat_data = torch.randn(total_elements, dtype=torch.float32, device=torch_device) + nested_tuple = torch.split(flat_data, lengths) + nested_tensor = torch.nested.as_nested_tensor(nested_tuple, layout=torch.jagged) prompt_batch.set(field_name, nested_tensor) # 3. 
NonTensorStack wrapped strings # Each string ~= seq_length * 4 bytes to match one tensor element's memory footprint string_char_count = seq_length * bytes_per_element # 4 bytes per char (unicode) - string_template = "x" * string_char_count for i in range(num_nontensor_fields): field_name = f"nontensor_field_{i}" - string_data = [string_template for _ in range(batch_size)] + bytes_needed = string_char_count // 2 + string_data = [os.urandom(bytes_needed).hex() for _ in range(batch_size)] + prompt_batch.set(field_name, NonTensorStack.from_list(string_data)) return prompt_batch, total_size_gb @@ -349,25 +348,27 @@ def _initialize_clients(self) -> None: r = self.reader.initialize.remote() ray.get([w, r]) - def run_throughput_test(self) -> dict[str, Any]: + def run_throughput_test(self, skip_dataset_create=False) -> dict[str, Any]: """Run the throughput test and print results. Returns: Dictionary with test results """ - logger.info("Creating large batch for throughput test...") - start_create_data = time.perf_counter() - data_fields, total_data_size_gb = ray.get( - self.writer.create_test_case.remote( - batch_size=self.global_batch_size, - seq_length=self.seq_len, - field_num=self.field_num, - device=self.device, + # Create test data + if not skip_dataset_create: + logger.info("Creating large batch for throughput test...") + start_create_data = time.perf_counter() + data_fields, self.total_data_size_gb = ray.get( + self.writer.create_test_case.remote( + batch_size=self.global_batch_size, + seq_length=self.seq_len, + field_num=self.field_num, + device=self.device, + ) ) - ) - logger.info(f"Total Data Size: {total_data_size_gb:.6f} GB") - end_create_data = time.perf_counter() - logger.info(f"Data creation time: {end_create_data - start_create_data:.8f}s") + end_create_data = time.perf_counter() + logger.info(f"Total Data Size: {self.total_data_size_gb:.6f} GB") + logger.info(f"Data creation time: {end_create_data - start_create_data:.8f}s") partition_id = "train_0" @@ 
-377,8 +378,8 @@ def run_throughput_test(self) -> dict[str, Any]: ray.get(self.writer.put.remote(partition_id=partition_id)) end_put = time.perf_counter() put_time = end_put - start_put - put_gbit_per_sec = (total_data_size_gb * 8) / put_time - put_gbyte_per_sec = total_data_size_gb / put_time + put_gbit_per_sec = (self.total_data_size_gb * 8) / put_time + put_gbyte_per_sec = self.total_data_size_gb / put_time time.sleep(2) @@ -394,8 +395,8 @@ def run_throughput_test(self) -> dict[str, Any]: ray.get(self.reader.get_data.remote(partition_id=partition_id, keys=keys)) end_get_data = time.perf_counter() get_time = end_get_data - start_get_data - get_gbit_per_sec = (total_data_size_gb * 8) / get_time - get_gbyte_per_sec = total_data_size_gb / get_time + get_gbit_per_sec = (self.total_data_size_gb * 8) / get_time + get_gbyte_per_sec = self.total_data_size_gb / get_time time.sleep(2) @@ -408,15 +409,15 @@ def run_throughput_test(self) -> dict[str, Any]: logger.info(f"DELETE Time: {delete_time:.8f}s") # Print summary - total_gbit_per_sec = (total_data_size_gb * 16) / (put_time + get_time) - total_gbyte_per_sec = (total_data_size_gb * 2) / (put_time + get_time) + total_gbit_per_sec = (self.total_data_size_gb * 16) / (put_time + get_time) + total_gbyte_per_sec = (self.total_data_size_gb * 2) / (put_time + get_time) logger.info("=" * 60) logger.info("THROUGHPUT TEST SUMMARY") logger.info("=" * 60) logger.info(f"Backend: {self.backend}") logger.info(f"Device: {self.device}") - logger.info(f"Total Data Size: {total_data_size_gb:.6f} GB") + logger.info(f"Total Data Size: {self.total_data_size_gb:.6f} GB") logger.info(f"PUT Time: {put_time:.8f}s") logger.info(f"GET Time: {get_time:.8f}s") logger.info(f"PUT Throughput: {put_gbit_per_sec:.8f} Gb/s ({put_gbyte_per_sec:.8f} GB/s)") @@ -428,7 +429,7 @@ def run_throughput_test(self) -> dict[str, Any]: return { "backend": self.backend, "device": self.device, - "total_data_size_gb": total_data_size_gb, + "total_data_size_gb": 
self.total_data_size_gb, "put_time": put_time, "get_time": get_time, "put_gbit_per_sec": put_gbit_per_sec, @@ -549,7 +550,7 @@ def main() -> None: logger.info("-" * 60) logger.info(f"Iteration {i + 1}/{args.num_test_iterations}") logger.info("-" * 60) - result = tester.run_throughput_test() + result = tester.run_throughput_test(skip_dataset_create=(i != 0)) all_results.append(result) time.sleep(10) diff --git a/scripts/performance_test/ray_perftest_baseline.py b/scripts/performance_test/ray_perftest_baseline.py index 30eb9ad..e59b175 100644 --- a/scripts/performance_test/ray_perftest_baseline.py +++ b/scripts/performance_test/ray_perftest_baseline.py @@ -17,6 +17,7 @@ import argparse import csv import logging +import os import sys import time from pathlib import Path @@ -118,6 +119,11 @@ def create_test_case( prompt_batch.set(field_name, tensor_data) # 2. Nested Tensor fields (variable-length sequences) or regular tensors for NPU + if device != "npu": + step = (seq_length - 1) / (batch_size - 1) if batch_size > 1 else 0 + lengths = [max(1, min(int(round(1 + j * step)), seq_length)) for j in range(batch_size)] + total_elements = sum(lengths) + for i in range(num_nested_fields): field_name = f"nested_field_{i}" @@ -126,27 +132,20 @@ def create_test_case( tensor_data = torch.randn(batch_size, seq_length // 2, dtype=torch.float32, device=torch_device) prompt_batch.set(field_name, tensor_data) else: - # For non-NPU: create nested tensor with arithmetic progression lengths - # Lengths go from 1 to seq_length in equal increments - step = (seq_length - 1) / (batch_size - 1) if batch_size > 1 else 0 - nested_list = [] - for j in range(batch_size): - length = int(round(1 + j * step)) - length = max(1, min(length, seq_length)) # Clamp to [1, seq_length] - seq_data = torch.arange(length, dtype=torch.float32, device=torch_device) - nested_list.append(seq_data) - - nested_tensor = torch.nested.as_nested_tensor(nested_list, layout=torch.jagged) + flat_data = 
torch.randn(total_elements, dtype=torch.float32, device=torch_device) + nested_tuple = torch.split(flat_data, lengths) + nested_tensor = torch.nested.as_nested_tensor(nested_tuple, layout=torch.jagged) prompt_batch.set(field_name, nested_tensor) # 3. NonTensorStack wrapped strings # Each string ~= seq_length * 4 bytes to match one tensor element's memory footprint string_char_count = seq_length * bytes_per_element # 4 bytes per char (unicode) - string_template = "x" * string_char_count for i in range(num_nontensor_fields): field_name = f"nontensor_field_{i}" - string_data = [string_template for _ in range(batch_size)] + bytes_needed = string_char_count // 2 + string_data = [os.urandom(bytes_needed).hex() for _ in range(batch_size)] + prompt_batch.set(field_name, NonTensorStack.from_list(string_data)) return prompt_batch, total_size_gb @@ -218,31 +217,32 @@ def _initialize_remote_store(self) -> None: logger.info(f"RemoteDataStore created on {reader_node}") - def run_throughput_test(self) -> dict[str, Any]: + def run_throughput_test(self, skip_dataset_create=False) -> dict[str, Any]: """Run the throughput test and print results. 
Returns: Dictionary with test results """ # Create test data - logger.info("Creating large batch for throughput test...") - start_create_data = time.perf_counter() - test_data, total_data_size_gb = create_test_case( - batch_size=self.global_batch_size, - seq_length=self.seq_len, - field_num=self.field_num, - device="cpu", - ) - end_create_data = time.perf_counter() - logger.info(f"Data creation time: {end_create_data - start_create_data:.8f}s") + if not skip_dataset_create: + logger.info("Creating large batch for throughput test...") + start_create_data = time.perf_counter() + self.test_data, self.total_data_size_gb = create_test_case( + batch_size=self.global_batch_size, + seq_length=self.seq_len, + field_num=self.field_num, + device="cpu", + ) + end_create_data = time.perf_counter() + logger.info(f"Data creation time: {end_create_data - start_create_data:.8f}s") # PUT operation - pass data directly to remote actor logger.info("Starting PUT operation...") start_put = time.perf_counter() - ray.get(self.remote_store.put_data.remote(test_data)) + ray.get(self.remote_store.put_data.remote(self.test_data)) end_put = time.perf_counter() put_time = end_put - start_put - put_gbit_per_sec = (total_data_size_gb * 8) / put_time + put_gbit_per_sec = (self.total_data_size_gb * 8) / put_time time.sleep(2) @@ -252,19 +252,19 @@ def run_throughput_test(self) -> dict[str, Any]: _ = ray.get(self.remote_store.get_data.remote()) end_get = time.perf_counter() get_time = end_get - start_get - get_gbit_per_sec = (total_data_size_gb * 8) / get_time + get_gbit_per_sec = (self.total_data_size_gb * 8) / get_time # Clear data ray.get(self.remote_store.clear_data.remote()) # Calculate total throughput - total_gbit_per_sec = (total_data_size_gb * 16) / (put_time + get_time) + total_gbit_per_sec = (self.total_data_size_gb * 16) / (put_time + get_time) # Print summary logger.info("=" * 60) logger.info("RAY BASELINE THROUGHPUT TEST SUMMARY") logger.info("=" * 60) - logger.info(f"Total Data Size: 
{total_data_size_gb:.6f} GB") + logger.info(f"Total Data Size: {self.total_data_size_gb:.6f} GB") logger.info(f"PUT Time: {put_time:.8f}s") logger.info(f"GET Time: {get_time:.8f}s") logger.info(f"PUT Throughput: {put_gbit_per_sec:.8f} Gb/s") @@ -275,7 +275,7 @@ def run_throughput_test(self) -> dict[str, Any]: return { "backend": "RayBaseline", "device": "cpu", - "total_data_size_gb": total_data_size_gb, + "total_data_size_gb": self.total_data_size_gb, "put_time": put_time, "get_time": get_time, "put_gbit_per_sec": put_gbit_per_sec, @@ -329,8 +329,8 @@ def main() -> None: parser.add_argument( "--num_test_iterations", type=int, - default=3, - help="Number of test iterations (default: 3)", + default=4, + help="Number of test iterations (default: 4)", ) parser.add_argument( "--head_node_ip", @@ -370,7 +370,7 @@ def main() -> None: logger.info("-" * 60) logger.info(f"Iteration {i + 1}/{args.num_test_iterations}") logger.info("-" * 60) - result = tester.run_throughput_test() + result = tester.run_throughput_test(skip_dataset_create=(i != 0)) all_results.append(result) # Write to CSV if output path is specified diff --git a/scripts/performance_test/run_perf_test.sh b/scripts/performance_test/run_perf_test.sh index 630869e..b9c7a1f 100755 --- a/scripts/performance_test/run_perf_test.sh +++ b/scripts/performance_test/run_perf_test.sh @@ -22,9 +22,9 @@ BACKENDS=("SimpleStorage" "Yuanrong" "MooncakeStore") # Test settings: global_batch_size, field_num, seq_len, name declare -a SETTINGS=( - "128,3,1024,Small" - "1024,9,8192,Medium" - "4096,21,128000,Large" + "1024,9,8192,Small" + "4096,15,32768,Medium" + "8192,21,128000,Large" ) # ---- TransferQueue perftest ---- @@ -71,4 +71,4 @@ done python "${SCRIPT_DIR}/draw_figure.py" echo "" -echo "All tests completed!" +echo "All tests completed!" 
\ No newline at end of file From 5446dfe154110cf3e846ede4e0db03ad9da5b1e3 Mon Sep 17 00:00:00 2001 From: 0oshowero0 Date: Thu, 26 Mar 2026 14:10:17 +0800 Subject: [PATCH 19/29] update readme Signed-off-by: 0oshowero0 --- scripts/performance_test/README_PERFTEST.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/performance_test/README_PERFTEST.md b/scripts/performance_test/README_PERFTEST.md index e05f9a8..d51d42b 100644 --- a/scripts/performance_test/README_PERFTEST.md +++ b/scripts/performance_test/README_PERFTEST.md @@ -96,7 +96,7 @@ HEAD_NODE_IP=192.168.0.1 WORKER_NODE_IP=192.168.0.2 DEVICE=npu ./run_perf_test.s The script tests all combinations of: - **Backends**: SimpleStorage, Yuanrong, MooncakeStore, Ray (baseline) -- **Data sizes**: Small (batch=128, fields=3, seq=1024), Medium (batch=1024, fields=9, seq=8192), Large (batch=4096, fields=21, seq=128000) +- **Data sizes**: Small (batch=1024, fields=9, seq=8192), Medium (batch=4096, fields=15, seq=32768), Large (batch=8192, fields=21, seq=128000) ### Output From b918ec543e8cfb2b19b2babf580f53934cd000d2 Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Thu, 26 Mar 2026 14:41:47 +0800 Subject: [PATCH 20/29] do not remove test data since it's being reused Signed-off-by: tianyi-ge --- scripts/performance_test/perftest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/performance_test/perftest.py b/scripts/performance_test/perftest.py index dfe78a2..ce395c1 100644 --- a/scripts/performance_test/perftest.py +++ b/scripts/performance_test/perftest.py @@ -203,7 +203,6 @@ def delete(self, partition_id: str, keys: list[str] | None = None) -> None: if keys is None: keys = self.test_keys tq.kv_clear(keys=keys, partition_id=partition_id) - self.test_data = None def close(self) -> None: """Close transfer_queue.""" From ca530afbb5c15713f7a0aac46702254eb36e5c50 Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Thu, 26 Mar 2026 15:14:10 +0800 Subject: [PATCH 21/29] update readme for perftest 
Signed-off-by: tianyi-ge --- scripts/performance_test/README_PERFTEST.md | 120 +++++++++++--------- 1 file changed, 65 insertions(+), 55 deletions(-) diff --git a/scripts/performance_test/README_PERFTEST.md b/scripts/performance_test/README_PERFTEST.md index d51d42b..e9e9e02 100644 --- a/scripts/performance_test/README_PERFTEST.md +++ b/scripts/performance_test/README_PERFTEST.md @@ -8,7 +8,6 @@ This script runs throughput tests for TransferQueue with different backends. ```bash # On head node ray start --head --resources='{"node:192.168.0.1":1}' - # On worker node ray start --address=192.168.0.1:6379 --resources='{"node:192.168.0.2":1}' ``` @@ -19,10 +18,11 @@ This script runs throughput tests for TransferQueue with different backends. ```bash python perftest.py \ - --backend_config=../../transfer_queue/config.yaml \ - --device=[cpu|npu|gpu] \ + --backend_config=perftest_config.yaml \ + --backend=SimpleStorage \ + --device=cpu \ --global_batch_size=1024 \ - --field_num=10 \ + --field_num=9 \ --seq_len=8192 \ --head_node_ip=192.168.0.1 \ --worker_node_ip=192.168.0.2 @@ -30,46 +30,56 @@ python perftest.py \ ## Arguments -| Argument | Description | Default | -|----------|-------------|---------| -| `--backend_config` | Path to backend config YAML file (required) | - | -| `--device` | Device: cpu, npu, gpu | cpu | -| `--global_batch_size` | Global batch size | 1024 | -| `--field_num` | Number of fields | 10 | -| `--seq_len` | Sequence length | 8192 | -| `--num_test_iterations` | Number of test iterations | 4 | -| `--head_node_ip` | Head node IP (required) | - | -| `--worker_node_ip` | Worker node IP (required for Yuanrong) | None | -| `--output_csv` | Path to output CSV file (optional) | None | +| Argument | Description | Default | Required | +|----------|-------------|---------|----------| +| `--backend_config` | Path to backend config YAML file | - | Yes | +| `--backend` | Override `storage_backend` in config (`SimpleStorage`, `Yuanrong`, `MooncakeStore`) | None | 
No | +| `--device` | Device: `cpu`, `npu`, `gpu` | `cpu` | No | +| `--global_batch_size` | Global batch size | 1024 | No | +| `--field_num` | Number of fields in the TensorDict | 10 | No | +| `--seq_len` | Sequence length | 8192 | No | +| `--num_test_iterations` | Number of test iterations | 4 | No | +| `--head_node_ip` | Head node IP address | - | Yes | +| `--worker_node_ip` | Worker node IP address (required for Yuanrong) | None | No | +| `--output_csv` | Path to output CSV file | None | No | ## Backend Configuration -The script reads the backend configuration directly from the provided `--backend_config` YAML file. The backend type is determined by `backend.storage_backend` in the config file. +The script reads the backend configuration directly from the provided `--backend_config` YAML file. The backend type is determined by `backend.storage_backend` in the config file. When `--backend` is specified, it overrides the value in the config. -For device support of each backend, -- `SimpleStorage` backend supports `cpu` -- `Yuanrong` supports `cpu` and `npu` -- `MooncakeStore` supports `cpu` and `gpu` +For device support of each backend: +- `SimpleStorage`: `cpu` +- `Yuanrong`: `cpu`, `npu` +- `MooncakeStore`: `cpu`, `gpu` ## Test Data Format -The test case creates TensorDict with three types of fields: +The test case creates a `TensorDict` with three types of fields to simulate real training batches: + +1. **Regular tensors**: Shape `(batch_size, seq_length)`, float32. +2. **Nested tensors** (non-NPU devices): Variable-length ragged sequences with lengths forming an arithmetic progression from 1 to `seq_length`. Average length ≈ `seq_length / 2`, so each nested field is roughly half the size of a regular field. +3. **NonTensorStack strings**: Each string is `seq_length × 4` bytes, matching the memory footprint of one tensor element. + +Fields are distributed evenly across the three types (rounded up). 
For NPU devices, nested tensors fall back to regular tensors of shape `(batch_size, seq_length // 2)`. + +## Test Flow -1. **Regular tensors**: Shape `(batch_size, seq_length)`, float32 -2. **Nested tensors** (non-NPU devices): Variable-length sequences with lengths forming an arithmetic progression from 1 to `seq_length`. For a batch of size N, element j has length `1 + j * (seq_length - 1) / (N - 1)`. This gives an average nested length of approximately `seq_length / 2`, making the nested column size roughly half of a regular tensor column. -3. **NonTensorStack strings**: Each string is `seq_length * 4` bytes to match the memory footprint of one tensor element. +Each iteration performs a PUT → LIST → GET → DELETE cycle via TransferQueue's KV API: -### NPU Fallback +1. **PUT** (`kv_batch_put`): Writer sends the TensorDict to storage. +2. **LIST** (`kv_list`): Reader queries available keys in the partition. +3. **GET** (`kv_batch_get`): Reader fetches data for those keys. +4. **DELETE** (`kv_clear`): Writer removes the written data. -NPU does not support nested tensors. When running with `--device=npu`, the nested tensor fields are replaced with regular tensors of shape `(batch_size, seq_length // 2)` to maintain comparable total data size while avoiding nested tensor operations. +The test runs `--num_test_iterations` iterations. Data creation only happens in the first iteration; subsequent iterations reuse the same TensorDict to isolate transfer overhead. ## Yuanrong Backend -For Yuanrong backend, writer runs on head node and reader runs on worker node. +For Yuanrong backend, writer runs on the head node and reader runs on the worker node. `--worker_node_ip` is required. 
## Running Full Test Suite -The `run_perf_test.sh` script automates the full performance test suite: +The `run_perf_test.sh` script automates the full test suite across all backends and data sizes, then generates a comparison chart: ```bash cd scripts/performance_test @@ -78,14 +88,14 @@ cd scripts/performance_test ### Configuration -Configure the test environment via environment variables: +Configure via environment variables: | Variable | Description | Default | |----------|-------------|---------| -| `HEAD_NODE_IP` | Head node IP address | 127.0.0.1 | -| `WORKER_NODE_IP` | Worker node IP address | 127.0.0.1 | -| `DEVICE` | Device type (cpu, npu, gpu) | cpu | -| `NUM_TEST_ITERATIONS` | Number of iterations per test | 4 | +| `HEAD_NODE_IP` | Head node IP address | `127.0.0.1` | +| `WORKER_NODE_IP` | Worker node IP address | `127.0.0.1` | +| `DEVICE` | Device type (`cpu`, `npu`, `gpu`) | `cpu` | +| `NUM_TEST_ITERATIONS` | Number of iterations per test | `4` | Example: ```bash @@ -94,30 +104,31 @@ HEAD_NODE_IP=192.168.0.1 WORKER_NODE_IP=192.168.0.2 DEVICE=npu ./run_perf_test.s ### Test Matrix -The script tests all combinations of: - **Backends**: SimpleStorage, Yuanrong, MooncakeStore, Ray (baseline) - **Data sizes**: Small (batch=1024, fields=9, seq=8192), Medium (batch=4096, fields=15, seq=32768), Large (batch=8192, fields=21, seq=128000) ### Output -- CSV results are saved to `results/{backend}_{size}.csv` (e.g., `results/simplestorage_small.csv`) -- A performance comparison chart is generated as `results/performance_comparison.pdf` +- CSV results: `results/{backend}_{size}.csv` (e.g., `results/simplestorage_small.csv`, `results/ray_baseline_medium.csv`) +- Performance chart: `results/performance_comparison.pdf` + +### Ray Baseline + +`ray_perftest_baseline.py` measures raw Ray inter-node transfer throughput without TransferQueue, serving as a baseline. It passes a TensorDict directly to a remote Ray actor (via `ray.get`), using the same test data format. 
It is automatically included in `run_perf_test.sh`. ### draw_figure.py -After running the tests, `draw_figure.py` reads all CSV files from the `results/` directory and generates a bar chart comparing total throughput (Gbps) across backends and data sizes. +After running the tests, `draw_figure.py` reads all CSV files from `results/` and generates a grouped bar chart comparing total throughput (Gbps) across backends and data sizes. ## Examples -Individual test examples using `perftest.py`: - ### SimpleStorage backend ```bash python perftest.py --backend_config=perftest_config.yaml --backend=SimpleStorage \ --head_node_ip=192.168.0.1 ``` -### Yuanrong backend +### Yuanrong backend (inter-node) ```bash python perftest.py --backend_config=perftest_config.yaml --backend=Yuanrong \ --head_node_ip=192.168.0.1 --worker_node_ip=192.168.0.2 @@ -129,7 +140,7 @@ python perftest.py --backend_config=perftest_config.yaml --backend=MooncakeStore --head_node_ip=192.168.0.1 ``` -### NPU device test (Yuanrong backend) +### NPU device test (Yuanrong) ```bash python perftest.py --backend_config=perftest_config.yaml --backend=Yuanrong --device=npu \ --head_node_ip=192.168.0.1 --worker_node_ip=192.168.0.2 @@ -141,7 +152,7 @@ python perftest.py --backend_config=perftest_config.yaml --backend=SimpleStorage --head_node_ip=192.168.0.1 --output_csv=results.csv ``` -## Output +## Output Format The test prints: - Total data size @@ -151,16 +162,15 @@ The test prints: Throughput is shown in both Gb/s (gigabits per second) and GB/s (gigabytes per second). -### CSV Output - -When using `--output_csv`, the test writes results to a CSV file with the following columns: -- backend -- device -- total_data_size_gb -- put_time -- get_time -- put_gbit_per_sec -- get_gbit_per_sec -- total_gbit_per_sec - -The test runs `--num_test_iterations` iterations (default: 4) and saves all results to the CSV. 
+### CSV Columns + +| Column | Description | +|--------|-------------| +| `backend` | Backend name | +| `device` | Device type | +| `total_data_size_gb` | Data size in GB | +| `put_time` | PUT duration (seconds) | +| `get_time` | GET duration (seconds) | +| `put_gbit_per_sec` | PUT throughput (Gbps) | +| `get_gbit_per_sec` | GET throughput (Gbps) | +| `total_gbit_per_sec` | Round-trip throughput (Gbps) | From dbd830f9a4e7d83b841a59ceaac7d9376ef99a96 Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Thu, 26 Mar 2026 16:42:22 +0800 Subject: [PATCH 22/29] 1. fix bar order in draw_figure.py 2. remove delete time stats Signed-off-by: tianyi-ge --- scripts/performance_test/draw_figure.py | 9 ++++++++- scripts/performance_test/perftest.py | 4 ---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/scripts/performance_test/draw_figure.py b/scripts/performance_test/draw_figure.py index bb7910e..b96219a 100644 --- a/scripts/performance_test/draw_figure.py +++ b/scripts/performance_test/draw_figure.py @@ -95,12 +95,19 @@ def make_xlabel(size_label: str) -> str: df["Bandwidth"] = df["total_gbit_per_sec"] df["Scenario"] = df["backend_parsed"] +# Set backend display order +backend_order = ["Ray", "SimpleStorage", "Yuanrong", "MooncakeStore"] + +df["Scenario"] = pd.Categorical(df["Scenario"], categories=backend_order, ordered=True) + # ========== Plotting ========== sns.set_theme(style="white", palette="husl") fig, ax = plt.subplots(figsize=(12, 7)) -palette = sns.color_palette("Set2", n_colors=df["Scenario"].nunique()) +# Use the backend order to ensure consistent coloring +existing_backends = df["Scenario"].unique() +palette = sns.color_palette("Set2", n_colors=len(existing_backends)) barplot = sns.barplot(data=df, x="X_label", y="Bandwidth", hue="Scenario", ax=ax, alpha=0.8, palette=palette) # Legend: match old style — at the top center, horizontal, with frame diff --git a/scripts/performance_test/perftest.py b/scripts/performance_test/perftest.py index 
ce395c1..c5c174e 100644 --- a/scripts/performance_test/perftest.py +++ b/scripts/performance_test/perftest.py @@ -401,11 +401,7 @@ def run_throughput_test(self, skip_dataset_create=False) -> dict[str, Any]: # DELETE operation using kv_clear logger.info("Starting DELETE operation (kv_clear)...") - start_delete = time.perf_counter() ray.get(self.writer.delete.remote(partition_id=partition_id, keys=keys)) - end_delete = time.perf_counter() - delete_time = end_delete - start_delete - logger.info(f"DELETE Time: {delete_time:.8f}s") # Print summary total_gbit_per_sec = (self.total_data_size_gb * 16) / (put_time + get_time) From eb380c20a1771ddef8aef395e1c2dcd2e7204f5e Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Fri, 27 Mar 2026 21:10:50 +0800 Subject: [PATCH 23/29] fix incorrect init yr client from controller; otherwise all yr clients will connect to the head node Signed-off-by: tianyi-ge --- scripts/performance_test/perftest_config.yaml | 2 +- transfer_queue/interface.py | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/scripts/performance_test/perftest_config.yaml b/scripts/performance_test/perftest_config.yaml index 88dca45..a1b1f9a 100644 --- a/scripts/performance_test/perftest_config.yaml +++ b/scripts/performance_test/perftest_config.yaml @@ -53,4 +53,4 @@ backend: # Port of local yuanrong datasystem worker port: 31501 # If enable npu transport - enable_yr_npu_transport: false + enable_yr_npu_transport: true diff --git a/transfer_queue/interface.py b/transfer_queue/interface.py index d0fd2f7..92b0f5e 100644 --- a/transfer_queue/interface.py +++ b/transfer_queue/interface.py @@ -233,7 +233,7 @@ def init(conf: Optional[DictConfig] = None) -> None: >>> metadata = tq.get_meta(...) 
>>> data = tq.get_data(metadata) """ - if _init_from_existing(): + if conf is None and _init_from_existing(): return # First-time initialize TransferQueue @@ -271,8 +271,10 @@ def init(conf: Optional[DictConfig] = None) -> None: logger.info("TransferQueueController has been created.") except ValueError: logger.info("Some other rank has initialized TransferQueueController. Try to connect to existing controller.") - _init_from_existing() - return + if conf is None: + _init_from_existing() + return + _TRANSFER_QUEUE_CONTROLLER = ray.get_actor("TransferQueueController") controller_zmq_info = process_zmq_server_info(_TRANSFER_QUEUE_CONTROLLER) final_conf.controller.zmq_info = controller_zmq_info From b68d267160715101ba201a8c52812d4139dd4e47 Mon Sep 17 00:00:00 2001 From: 0oshowero0 Date: Fri, 27 Mar 2026 16:26:14 +0800 Subject: [PATCH 24/29] add simple case Signed-off-by: 0oshowero0 --- scripts/performance_test/README_PERFTEST.md | 8 ++ scripts/performance_test/perftest.py | 80 ++++++++++++++++++- .../performance_test/ray_perftest_baseline.py | 79 ++++++++++++++++-- scripts/performance_test/run_perf_test.sh | 14 +++- 4 files changed, 169 insertions(+), 12 deletions(-) diff --git a/scripts/performance_test/README_PERFTEST.md b/scripts/performance_test/README_PERFTEST.md index e9e9e02..b1d0f6a 100644 --- a/scripts/performance_test/README_PERFTEST.md +++ b/scripts/performance_test/README_PERFTEST.md @@ -96,9 +96,17 @@ Configure via environment variables: | `WORKER_NODE_IP` | Worker node IP address | `127.0.0.1` | | `DEVICE` | Device type (`cpu`, `npu`, `gpu`) | `cpu` | | `NUM_TEST_ITERATIONS` | Number of iterations per test | `4` | +| `USE_COMPLEX_CASE` | Run with complex test case (nested + nontensor fields) | `false` | Example: ```bash +# Simple case (default, regular tensors only) +./run_perf_test.sh + +# Complex case (nested tensors + nontensor strings) +USE_COMPLEX_CASE=true ./run_perf_test.sh + +# With specific node IPs & use NPU HEAD_NODE_IP=192.168.0.1 
WORKER_NODE_IP=192.168.0.2 DEVICE=npu ./run_perf_test.sh ``` diff --git a/scripts/performance_test/perftest.py b/scripts/performance_test/perftest.py index c5c174e..5e94f36 100644 --- a/scripts/performance_test/perftest.py +++ b/scripts/performance_test/perftest.py @@ -42,6 +42,62 @@ def create_test_case( seq_length: int | None = None, field_num: int | None = None, device: str = "cpu", +) -> tuple[TensorDict, float]: + """Create a test case with tensor data formats. + + Creates TensorDict with: + - Regular tensors: (batch_size, seq_length) shape, each element is float32 + + Args: + batch_size: Batch size for the test case + seq_length: Maximum sequence length (used for regular tensors and + as upper bound for nested tensor lengths) + field_num: Total number of fields to create (distributed across types) + device: Device to create tensors on ("cpu", "npu", or "gpu") + + Returns: + Tuple of (TensorDict, total_size_gb) + """ + bytes_per_element = 4 # float32 + + # Each regular tensor field: batch_size * seq_length * 4 bytes + regular_field_size_bytes = batch_size * seq_length * bytes_per_element + regular_field_size_gb = regular_field_size_bytes / (1024**3) + + # Total size = sum of all field types + total_size_gb = regular_field_size_gb * field_num + + logger.info(f"Total data size: {total_size_gb:.6f} GB") + + # Determine torch device + torch_device = None + if device == "npu": + torch_device = "npu:0" + elif device == "gpu": + torch_device = "cuda:0" + + # Set seeds for reproducibility (within this process) + # For non-NPU: arithmetic progression lengths from 1 to seq_length for each nested field + # For NPU: nested fields become regular tensors of seq_length // 2 + + batch_size_tuple = (batch_size,) + + prompt_batch = TensorDict(batch_size=batch_size_tuple) + + # 1. 
Regular tensor fields + for i in range(field_num): + field_name = f"field_{i}" + tensor_data = torch.randn(batch_size, seq_length, dtype=torch.float32, device=torch_device) + prompt_batch.set(field_name, tensor_data) + + return prompt_batch, total_size_gb + + +def create_complex_test_case( + batch_size: int | None = None, + seq_length: int | None = None, + field_num: int | None = None, + device: str = "cpu", ) -> tuple[TensorDict, float]: """Create a test case with complex data formats. @@ -158,8 +214,9 @@ def create_test_case( class TQClientActor: """Ray actor that uses tq.init(config) to initialize.""" - def __init__(self, config: dict[str, Any]): + def __init__(self, config: dict[str, Any], use_complex_case: bool = False): self.config = config + self.use_complex_case = use_complex_case self.test_data = None self.total_data_size_gb = 0.0 self.test_keys = None @@ -176,7 +233,12 @@ def create_test_case( device: str = "cpu", ) -> tuple[list[str], float]: """Create test case on the actor.""" - self.test_data, self.total_data_size_gb = create_test_case(batch_size, seq_length, field_num, device) + if self.use_complex_case: + self.test_data, self.total_data_size_gb = create_complex_test_case( + batch_size, seq_length, field_num, device + ) + else: + self.test_data, self.total_data_size_gb = create_test_case(batch_size, seq_length, field_num, device) # Create keys for each sample in the batch self.test_keys = [f"test_key_{i}" for i in range(batch_size)] return list(self.test_data.keys()), self.total_data_size_gb @@ -224,6 +286,7 @@ def __init__( backend: str | None = None, worker_node_ip: str | None = None, output_csv: str | None = None, + use_complex_case: bool = False, ): """Initialize the throughput tester. 
@@ -238,6 +301,7 @@ def __init__( head_node_ip: Head node IP address worker_node_ip: Worker node IP address (required for Yuanrong) output_csv: Path to output CSV file (optional) + use_complex_case: Whether to use complex test case (nested + nontensor fields) """ self.backend_config_path = backend_config_path self.backend_override = backend @@ -249,6 +313,7 @@ def __init__( self.head_node_ip = head_node_ip self.worker_node_ip = worker_node_ip self.output_csv = output_csv + self.use_complex_case = use_complex_case # Prepare full config for tq.init() self.full_config = self._prepare_config() @@ -337,8 +402,8 @@ def _initialize_clients(self) -> None: reader_config = self.full_config # Create writer and reader actors - self.writer = TQClientActor.options(**writer_options).remote(writer_config) - self.reader = TQClientActor.options(**reader_options).remote(reader_config) + self.writer = TQClientActor.options(**writer_options).remote(writer_config, self.use_complex_case) + self.reader = TQClientActor.options(**reader_options).remote(reader_config, self.use_complex_case) # Initialize transfer_queue logger.info(f"Using {self.backend} as storage backend.") @@ -522,6 +587,12 @@ def main() -> None: default=None, help="Path to output CSV file (optional)", ) + parser.add_argument( + "--use_complex_case", + action="store_true", + default=False, + help="Use complex test case with nested tensors and nontensor fields (default: False, simple case)", + ) args = parser.parse_args() @@ -537,6 +608,7 @@ def main() -> None: backend=args.backend, worker_node_ip=args.worker_node_ip, output_csv=args.output_csv, + use_complex_case=args.use_complex_case, ) # Run test multiple times for consistent results using a for loop diff --git a/scripts/performance_test/ray_perftest_baseline.py b/scripts/performance_test/ray_perftest_baseline.py index e59b175..fe40788 100644 --- a/scripts/performance_test/ray_perftest_baseline.py +++ b/scripts/performance_test/ray_perftest_baseline.py @@ -39,6 +39,55 @@ 
def create_test_case( seq_length: int | None = None, field_num: int | None = None, device: str = "cpu", +) -> tuple[TensorDict, float]: + """Create a test case with only regular tensors. + + Creates TensorDict with: + - Regular tensors: (batch_size, seq_length) shape, each element is float32 + + Args: + batch_size: Batch size for the test case + seq_length: Maximum sequence length + field_num: Total number of fields to create + device: Device to create tensors on ("cpu", "npu", or "gpu") + + Returns: + Tuple of (TensorDict, total_size_gb) + """ + bytes_per_element = 4 # float32 + + # Each regular tensor field: batch_size * seq_length * 4 bytes + regular_field_size_bytes = batch_size * seq_length * bytes_per_element + regular_field_size_gb = regular_field_size_bytes / (1024**3) + + total_size_gb = regular_field_size_gb * field_num + + logger.info(f"Total data size: {total_size_gb:.6f} GB") + + # Determine torch device + torch_device = None + if device == "npu": + torch_device = "npu:0" + elif device == "gpu": + torch_device = "cuda:0" + + batch_size_tuple = (batch_size,) + + prompt_batch = TensorDict(batch_size=batch_size_tuple) + + for i in range(field_num): + field_name = f"field_{i}" + tensor_data = torch.randn(batch_size, seq_length, dtype=torch.float32, device=torch_device) + prompt_batch.set(field_name, tensor_data) + + return prompt_batch, total_size_gb + + +def create_complex_test_case( + batch_size: int | None = None, + seq_length: int | None = None, + field_num: int | None = None, + device: str = "cpu", ) -> tuple[TensorDict, float]: """Create a test case with complex data formats. @@ -180,6 +229,7 @@ def __init__( head_node_ip: str, worker_node_ip: str | None = None, output_csv: str | None = None, + use_complex_case: bool = False, ): """Initialize the Ray baseline tester. 
@@ -191,6 +241,7 @@ def __init__( head_node_ip: Head node IP address worker_node_ip: Worker node IP address output_csv: Path to output CSV file (optional) + use_complex_case: Whether to use complex test case (nested + nontensor fields) """ self.global_batch_size = global_batch_size self.field_num = field_num @@ -199,6 +250,7 @@ def __init__( self.head_node_ip = head_node_ip self.worker_node_ip = worker_node_ip self.output_csv = output_csv + self.use_complex_case = use_complex_case # Initialize remote store on worker node self._initialize_remote_store() @@ -227,12 +279,20 @@ def run_throughput_test(self, skip_dataset_create=False) -> dict[str, Any]: if not skip_dataset_create: logger.info("Creating large batch for throughput test...") start_create_data = time.perf_counter() - self.test_data, self.total_data_size_gb = create_test_case( - batch_size=self.global_batch_size, - seq_length=self.seq_len, - field_num=self.field_num, - device="cpu", - ) + if self.use_complex_case: + self.test_data, self.total_data_size_gb = create_complex_test_case( + batch_size=self.global_batch_size, + seq_length=self.seq_len, + field_num=self.field_num, + device="cpu", + ) + else: + self.test_data, self.total_data_size_gb = create_test_case( + batch_size=self.global_batch_size, + seq_length=self.seq_len, + field_num=self.field_num, + device="cpu", + ) end_create_data = time.perf_counter() logger.info(f"Data creation time: {end_create_data - start_create_data:.8f}s") @@ -350,6 +410,12 @@ def main() -> None: default=None, help="Path to output CSV file (optional)", ) + parser.add_argument( + "--use_complex_case", + action="store_true", + default=False, + help="Use complex test case with nested tensors and nontensor fields (default: False, simple case)", + ) args = parser.parse_args() @@ -362,6 +428,7 @@ def main() -> None: head_node_ip=args.head_node_ip, worker_node_ip=args.worker_node_ip, output_csv=args.output_csv, + use_complex_case=args.use_complex_case, ) # Run test multiple times diff 
--git a/scripts/performance_test/run_perf_test.sh b/scripts/performance_test/run_perf_test.sh index b9c7a1f..0531717 100755 --- a/scripts/performance_test/run_perf_test.sh +++ b/scripts/performance_test/run_perf_test.sh @@ -15,6 +15,7 @@ HEAD_NODE_IP="${HEAD_NODE_IP:-127.0.0.1}" WORKER_NODE_IP="${WORKER_NODE_IP:-127.0.0.1}" DEVICE="${DEVICE:-cpu}" NUM_TEST_ITERATIONS="${NUM_TEST_ITERATIONS:-4}" +USE_COMPLEX_CASE="${USE_COMPLEX_CASE:-false}" # ======================================== # Backends to test (passed via --backend to perftest.py) @@ -27,6 +28,13 @@ declare -a SETTINGS=( "8192,21,128000,Large" ) +# Complex case flag +if [[ "${USE_COMPLEX_CASE}" == "true" ]]; then + COMPLEX_FLAG="--use_complex_case" +else + COMPLEX_FLAG="" +fi + # ---- TransferQueue perftest ---- for backend in "${BACKENDS[@]}"; do echo "==========================================" @@ -44,7 +52,8 @@ for backend in "${BACKENDS[@]}"; do --global_batch_size="${batch_size}" --field_num="${field_num}" --seq_len="${seq_len}" \ --num_test_iterations="${NUM_TEST_ITERATIONS}" \ --head_node_ip="${HEAD_NODE_IP}" --worker_node_ip="${WORKER_NODE_IP}" \ - --output_csv="${output_csv}" + --output_csv="${output_csv}" \ + ${COMPLEX_FLAG} sleep 10 done @@ -64,7 +73,8 @@ for setting in "${SETTINGS[@]}"; do --global_batch_size="${batch_size}" --field_num="${field_num}" --seq_len="${seq_len}" \ --num_test_iterations="${NUM_TEST_ITERATIONS}" \ --head_node_ip="${HEAD_NODE_IP}" --worker_node_ip="${WORKER_NODE_IP}" \ - --output_csv="${output_csv}" + --output_csv="${output_csv}" \ + ${COMPLEX_FLAG} done # ---- Draw figures ---- From a020706322dd11031972e543f942686e262194ca Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Sat, 28 Mar 2026 14:07:42 +0800 Subject: [PATCH 25/29] remove host config for yuanrong; auto-detect instead Signed-off-by: tianyi-ge --- .../openyuanrong_datasystem.md | 16 ++- scripts/performance_test/perftest.py | 16 +-- scripts/performance_test/perftest_config.yaml | 2 - transfer_queue/config.yaml | 2 
- .../storage/clients/yuanrong_client.py | 32 +++++- .../storage/managers/yuanrong_manager.py | 4 +- transfer_queue/utils/common.py | 99 +++++++++++++++++++ 7 files changed, 139 insertions(+), 32 deletions(-) diff --git a/docs/storage_backends/openyuanrong_datasystem.md b/docs/storage_backends/openyuanrong_datasystem.md index 084bf55..e25c2f9 100644 --- a/docs/storage_backends/openyuanrong_datasystem.md +++ b/docs/storage_backends/openyuanrong_datasystem.md @@ -132,11 +132,11 @@ from transfer_queue import ( TransferQueueController, process_zmq_server_info, ) -# host, port, manager_type and client_name are the config for booting the datasystem. +# port, manager_type and client_name are the config for booting the datasystem. +# host will be auto-detected by checking local IP addresses. config_str = """ manager_type: YuanrongStorageManager client_name: YuanrongStorageClient - host: 127.0.0.1 port: 31501 """ dict_conf = OmegaConf.create(config_str, flags={"allow_objects": True}) @@ -360,26 +360,22 @@ def main(): config_str = """ manager_type: YuanrongStorageManager client_name: YuanrongStorageClient - host: 10.170.27.24 port: 31501 """ dict_conf = OmegaConf.create(config_str, flags={"allow_objects": True}) # It is important to pay attention to the controller's lifecycle. 
controller, dict_conf.controller_info = initialize_controller() - - conf_writer = dict_conf.copy() - conf_writer.host = HEAD_NODE_IP - conf_reader = dict_conf.copy() - conf_reader.host = WORKER_NODE_IP + + # Note: host is auto-detected on each node, no need to configure explicitly data = TensorDict({ "prompt": torch.ones(3, 512), "big_tensor": torch.randn(3,1024,1024)}, batch_size=[3]) # you could assign npu or gpu devices by 'resources' # resources={f"node:{HEAD_NODE_IP}": 0.001} could Force the actor to run on HEAD_NODE writer = TransferQueueClientActor.options( resources={f"node:{HEAD_NODE_IP}": 0.001}, - ).remote(conf_writer, "train") + ).remote(dict_conf, "train") reader = TransferQueueClientActor.options( resources={f"node:{WORKER_NODE_IP}": 0.001} - ).remote(conf_reader, "rollout") + ).remote(dict_conf, "rollout") ray.get(writer.put.remote(data=data, partition_id="train_0")) diff --git a/scripts/performance_test/perftest.py b/scripts/performance_test/perftest.py index 5e94f36..95eb45b 100644 --- a/scripts/performance_test/perftest.py +++ b/scripts/performance_test/perftest.py @@ -387,19 +387,9 @@ def _initialize_clients(self) -> None: reader_options["resources"]["NPU"] = 1 # Prepare configs for writer and reader - # For Yuanrong backend, set different hosts for writer and reader - if self.backend == "Yuanrong": - import copy - - writer_config = copy.deepcopy(self.full_config) - reader_config = copy.deepcopy(self.full_config) - writer_config["backend"]["Yuanrong"]["host"] = self.head_node_ip - reader_config["backend"]["Yuanrong"]["host"] = self.worker_node_ip - logger.info(f"Writer Yuanrong host: {self.head_node_ip}") - logger.info(f"Reader Yuanrong host: {self.worker_node_ip}") - else: - writer_config = self.full_config - reader_config = self.full_config + # Host is auto-detected on each node for Yuanrong backend + writer_config = self.full_config + reader_config = self.full_config # Create writer and reader actors self.writer = 
TQClientActor.options(**writer_options).remote(writer_config, self.use_complex_case) diff --git a/scripts/performance_test/perftest_config.yaml b/scripts/performance_test/perftest_config.yaml index a1b1f9a..b96ea33 100644 --- a/scripts/performance_test/perftest_config.yaml +++ b/scripts/performance_test/perftest_config.yaml @@ -48,8 +48,6 @@ backend: # For Yuanrong: Yuanrong: - # IP of local yuanrong datasystem worker - host: 127.0.0.1 # Port of local yuanrong datasystem worker port: 31501 # If enable npu transport diff --git a/transfer_queue/config.yaml b/transfer_queue/config.yaml index 0a8ccef..433c026 100644 --- a/transfer_queue/config.yaml +++ b/transfer_queue/config.yaml @@ -48,8 +48,6 @@ backend: # For Yuanrong: Yuanrong: - # IP of local yuanrong datasystem worker - host: 127.0.0.1 # Port of local yuanrong datasystem worker port: 31501 # If enable npu transport diff --git a/transfer_queue/storage/clients/yuanrong_client.py b/transfer_queue/storage/clients/yuanrong_client.py index 80f5bb7..5a32217 100644 --- a/transfer_queue/storage/clients/yuanrong_client.py +++ b/transfer_queue/storage/clients/yuanrong_client.py @@ -25,6 +25,7 @@ from transfer_queue.storage.clients.base import TransferQueueStorageKVClient from transfer_queue.storage.clients.factory import StorageClientFactory +from transfer_queue.utils.common import find_reachable_host from transfer_queue.utils.serial_utils import _decoder, _encoder logger = logging.getLogger(__name__) @@ -83,9 +84,20 @@ class NPUTensorKVClientAdapter(StorageStrategy): KEYS_LIMIT: int = 10_000 def __init__(self, config: dict): - host = config.get("host") port = config.get("port") + if port is None or not isinstance(port, int): + raise ValueError("Missing or invalid 'port' in config") + + logger.info(f"Auto-detecting reachable host for Yuanrong port {port}...") + host = find_reachable_host(port) + if host is None: + raise ValueError( + f"Could not find any reachable host for Yuanrong port {port}. 
" + "Please ensure yuanrong datasystem is running." + ) + logger.info(f"Using auto-detected host: {host}") + self.device_id = torch.npu.current_device() torch.npu.set_device(self.device_id) @@ -199,9 +211,20 @@ class GeneralKVClientAdapter(StorageStrategy): DS_MAX_WORKERS: int = 16 def __init__(self, config: dict): - host = config.get("host") port = config.get("port") + if port is None or not isinstance(port, int): + raise ValueError("Missing or invalid 'port' in config") + + logger.info(f"Auto-detecting reachable host for Yuanrong port {port}...") + host = find_reachable_host(port) + if host is None: + raise ValueError( + f"Could not find any reachable host for Yuanrong port {port}. " + "Please ensure yuanrong datasystem is running." + ) + logger.info(f"Using auto-detected host: {host}") + self._ds_client = datasystem.KVClient(host, port) self._ds_client.init() logger.info("YuanrongStorageClient: Create KVClient to connect with yuanrong-datasystem backend!") @@ -357,6 +380,11 @@ def __init__(self, config: dict[str, Any]): if not YUANRONG_DATASYSTEM_IMPORTED: raise ImportError("YuanRong DataSystem not installed.") + port = config.get("port") + + if port is None or not isinstance(port, int): + raise ValueError("Missing or invalid 'port' in config") + super().__init__(config) # Storage strategies are prioritized in ascending order of list element index. 
diff --git a/transfer_queue/storage/managers/yuanrong_manager.py b/transfer_queue/storage/managers/yuanrong_manager.py index 54ac094..d527040 100644 --- a/transfer_queue/storage/managers/yuanrong_manager.py +++ b/transfer_queue/storage/managers/yuanrong_manager.py @@ -36,14 +36,12 @@ class YuanrongStorageManager(KVStorageManager): """Storage manager for Yuanrong backend.""" def __init__(self, controller_info: ZMQServerInfo, config: dict[str, Any]): - host = config.get("host", None) port = config.get("port", None) client_name = config.get("client_name", None) - if host is None or not isinstance(host, str): - raise ValueError("Missing or invalid 'host' in config") if port is None or not isinstance(port, int): raise ValueError("Missing or invalid 'port' in config") + if client_name is None: logger.info("Missing 'client_name' in config, using default value('YuanrongStorageClient')") config["client_name"] = "YuanrongStorageClient" diff --git a/transfer_queue/utils/common.py b/transfer_queue/utils/common.py index a9d2b93..08a137f 100644 --- a/transfer_queue/utils/common.py +++ b/transfer_queue/utils/common.py @@ -15,6 +15,7 @@ import logging import os +import socket from contextlib import contextmanager from typing import Optional @@ -98,3 +99,101 @@ def get_env_bool(env_key: str, default: bool = False) -> bool: true_values = {"true", "1", "yes", "y", "on"} return env_value_lower in true_values + + +def get_local_ip_addresses() -> list[str]: + """Get all local IP addresses including 127.0.0.1. + + Returns: + List of local IP addresses, with 127.0.0.1 first. 
+ """ + ips = ["127.0.0.1"] + + try: + hostname = socket.gethostname() + # Add hostname resolution + try: + host_ip = socket.gethostbyname(hostname) + if host_ip not in ips: + ips.append(host_ip) + except socket.gaierror: + pass + + # Get all network interfaces + import netifaces + + for interface in netifaces.interfaces(): + try: + addrs = netifaces.ifaddresses(interface) + if netifaces.AF_INET in addrs: + for addr_info in addrs[netifaces.AF_INET]: + ip = addr_info.get("addr") + if ip and ip not in ips: + ips.append(ip) + except (ValueError, KeyError): + continue + except ImportError: + # Fallback if netifaces is not available + try: + # Try to get IP by connecting to an external address + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + try: + # Doesn't need to be reachable + s.connect(("8.8.8.8", 80)) + ip = s.getsockname()[0] + if ip not in ips: + ips.append(ip) + except Exception: + pass + finally: + s.close() + except Exception: + pass + + return ips + + +def check_port_connectivity(host: str, port: int, timeout: float = 2.0) -> bool: + """Check if a TCP port is reachable on the given host. + + Args: + host: Host IP address to check + port: Port number to check + timeout: Connection timeout in seconds + + Returns: + True if the port is reachable, False otherwise + """ + try: + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.settimeout(timeout) + result = sock.connect_ex((host, port)) + sock.close() + return result == 0 + except Exception: + return False + + +def find_reachable_host(port: int, timeout: float = 1.0) -> Optional[str]: + """Find a reachable local host IP address for the given port. + + Tries all local IP addresses in order and returns the first one + that has the given port open. + + Args: + port: Port number to check + timeout: Connection timeout in seconds per check + + Returns: + The first reachable host IP address, or None if none found. 
+ """ + local_ips = get_local_ip_addresses() + logger.info(f"Checking port {port} on local IPs: {local_ips}") + + for ip in local_ips: + if check_port_connectivity(ip, port, timeout): + logger.info(f"Found reachable host: {ip}:{port}") + return ip + + logger.warning(f"No reachable host found for port {port}") + return None From 28313dd6711cbdf58e89b50eaabde4a547d01537 Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Sat, 28 Mar 2026 16:24:27 +0800 Subject: [PATCH 26/29] 1. move find reachable ip to yuanrong client 2. modify default mooncake store perftest config Signed-off-by: tianyi-ge --- scripts/performance_test/perftest_config.yaml | 12 +-- tests/test_yuanrong_client_zero_copy.py | 1 + tests/test_yuanrong_storage_client_e2e.py | 7 ++ transfer_queue/interface.py | 7 +- .../storage/clients/yuanrong_client.py | 101 +++++++++++++++++- transfer_queue/utils/common.py | 99 ----------------- 6 files changed, 116 insertions(+), 111 deletions(-) diff --git a/scripts/performance_test/perftest_config.yaml b/scripts/performance_test/perftest_config.yaml index b96ea33..39538bc 100644 --- a/scripts/performance_test/perftest_config.yaml +++ b/scripts/performance_test/perftest_config.yaml @@ -34,12 +34,12 @@ backend: master_server_address: localhost:50051 # Address of local host. Set to "" to use Ray IP as local host address local_hostname: "" - # Protocol for transmission. Choose from: tcp, rdma. (default: tcp) - protocol: tcp - # Memory segment size in bytes for mounting (default: 4GB) - global_segment_size: 4294967296 - # Local buffer size in bytes (default: 1GB) - local_buffer_size: 1073741824 + # Protocol for transmission. Choose from: tcp, rdma. (default: rdma) + protocol: rdma + # Memory segment size in bytes for mounting + global_segment_size: 86294967296 + # Local buffer size in bytes + local_buffer_size: 86294967296 # Network device name. 
Set to "" to let Mooncake to auto-picks devices device_name: "" diff --git a/tests/test_yuanrong_client_zero_copy.py b/tests/test_yuanrong_client_zero_copy.py index b93fd32..423b1c7 100644 --- a/tests/test_yuanrong_client_zero_copy.py +++ b/tests/test_yuanrong_client_zero_copy.py @@ -47,6 +47,7 @@ def mock_kv_client(self, mocker): mocker.patch("yr.datasystem.KVClient", return_value=mock_client) mocker.patch("yr.datasystem.DsTensorClient") + mocker.patch("transfer_queue.storage.clients.yuanrong_client.find_reachable_host", return_value="127.0.0.1") return mock_client diff --git a/tests/test_yuanrong_storage_client_e2e.py b/tests/test_yuanrong_storage_client_e2e.py index 2a79ec9..3cb1f99 100644 --- a/tests/test_yuanrong_storage_client_e2e.py +++ b/tests/test_yuanrong_storage_client_e2e.py @@ -108,10 +108,17 @@ def mock_yr_datasystem(): # - sys.modules: Redirects 'import yr' to our mocks # - YUANRONG_DATASYSTEM_IMPORTED: Forces the existence check to True so initialize the client successfully # - datasystem: Direct attribute patch for the module + # - find_reachable_host: Mock host detection to avoid real network checks + def mock_find_reachable_host(port, timeout=1.0): + return "127.0.0.1" + with ( mock.patch.dict("sys.modules", {"yr": yr_mock, "yr.datasystem": ds_mock}), mock.patch("transfer_queue.storage.clients.yuanrong_client.YUANRONG_DATASYSTEM_IMPORTED", True, create=True), mock.patch("transfer_queue.storage.clients.yuanrong_client.datasystem", ds_mock), + mock.patch( + "transfer_queue.storage.clients.yuanrong_client.find_reachable_host", side_effect=mock_find_reachable_host + ), ): yield diff --git a/transfer_queue/interface.py b/transfer_queue/interface.py index 92b0f5e..f54c3bf 100644 --- a/transfer_queue/interface.py +++ b/transfer_queue/interface.py @@ -233,7 +233,7 @@ def init(conf: Optional[DictConfig] = None) -> None: >>> metadata = tq.get_meta(...) 
>>> data = tq.get_data(metadata) """ - if conf is None and _init_from_existing(): + if _init_from_existing(): return # First-time initialize TransferQueue @@ -271,10 +271,7 @@ def init(conf: Optional[DictConfig] = None) -> None: logger.info("TransferQueueController has been created.") except ValueError: logger.info("Some other rank has initialized TransferQueueController. Try to connect to existing controller.") - if conf is None: - _init_from_existing() - return - _TRANSFER_QUEUE_CONTROLLER = ray.get_actor("TransferQueueController") + _init_from_existing() controller_zmq_info = process_zmq_server_info(_TRANSFER_QUEUE_CONTROLLER) final_conf.controller.zmq_info = controller_zmq_info diff --git a/transfer_queue/storage/clients/yuanrong_client.py b/transfer_queue/storage/clients/yuanrong_client.py index 5a32217..77a981e 100644 --- a/transfer_queue/storage/clients/yuanrong_client.py +++ b/transfer_queue/storage/clients/yuanrong_client.py @@ -15,6 +15,7 @@ import logging import os +import socket import struct from abc import ABC, abstractmethod from concurrent.futures import ThreadPoolExecutor @@ -25,12 +26,110 @@ from transfer_queue.storage.clients.base import TransferQueueStorageKVClient from transfer_queue.storage.clients.factory import StorageClientFactory -from transfer_queue.utils.common import find_reachable_host from transfer_queue.utils.serial_utils import _decoder, _encoder logger = logging.getLogger(__name__) logger.setLevel(os.getenv("TQ_LOGGING_LEVEL", logging.WARNING)) + +def get_local_ip_addresses() -> list[str]: + """Get all local IP addresses including 127.0.0.1. + + Returns: + List of local IP addresses, with 127.0.0.1 first. 
+ """ + ips = ["127.0.0.1"] + + try: + hostname = socket.gethostname() + # Add hostname resolution + try: + host_ip = socket.gethostbyname(hostname) + if host_ip not in ips: + ips.append(host_ip) + except socket.gaierror: + pass + + # Get all network interfaces + import netifaces + + for interface in netifaces.interfaces(): + try: + addrs = netifaces.ifaddresses(interface) + if netifaces.AF_INET in addrs: + for addr_info in addrs[netifaces.AF_INET]: + ip = addr_info.get("addr") + if ip and ip not in ips: + ips.append(ip) + except (ValueError, KeyError): + continue + except ImportError: + # Fallback if netifaces is not available + try: + # Try to get IP by connecting to an external address + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + try: + # Doesn't need to be reachable + s.connect(("8.8.8.8", 80)) + ip = s.getsockname()[0] + if ip not in ips: + ips.append(ip) + except Exception: + pass + finally: + s.close() + except Exception: + pass + + return ips + + +def check_port_connectivity(host: str, port: int, timeout: float = 2.0) -> bool: + """Check if a TCP port is reachable on the given host. + + Args: + host: Host IP address to check + port: Port number to check + timeout: Connection timeout in seconds + + Returns: + True if the port is reachable, False otherwise + """ + try: + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.settimeout(timeout) + result = sock.connect_ex((host, port)) + sock.close() + return result == 0 + except Exception: + return False + + +def find_reachable_host(port: int, timeout: float = 1.0) -> Optional[str]: + """Find a reachable local host IP address for the given port. + + Tries all local IP addresses in order and returns the first one + that has the given port open. + + Args: + port: Port number to check + timeout: Connection timeout in seconds per check + + Returns: + The first reachable host IP address, or None if none found. 
+ """ + local_ips = get_local_ip_addresses() + logger.info(f"Checking port {port} on local IPs: {local_ips}") + + for ip in local_ips: + if check_port_connectivity(ip, port, timeout): + logger.info(f"Found reachable host: {ip}:{port}") + return ip + + logger.warning(f"No reachable host found for port {port}") + return None + + YUANRONG_DATASYSTEM_IMPORTED: bool = True try: diff --git a/transfer_queue/utils/common.py b/transfer_queue/utils/common.py index 08a137f..a9d2b93 100644 --- a/transfer_queue/utils/common.py +++ b/transfer_queue/utils/common.py @@ -15,7 +15,6 @@ import logging import os -import socket from contextlib import contextmanager from typing import Optional @@ -99,101 +98,3 @@ def get_env_bool(env_key: str, default: bool = False) -> bool: true_values = {"true", "1", "yes", "y", "on"} return env_value_lower in true_values - - -def get_local_ip_addresses() -> list[str]: - """Get all local IP addresses including 127.0.0.1. - - Returns: - List of local IP addresses, with 127.0.0.1 first. 
- """ - ips = ["127.0.0.1"] - - try: - hostname = socket.gethostname() - # Add hostname resolution - try: - host_ip = socket.gethostbyname(hostname) - if host_ip not in ips: - ips.append(host_ip) - except socket.gaierror: - pass - - # Get all network interfaces - import netifaces - - for interface in netifaces.interfaces(): - try: - addrs = netifaces.ifaddresses(interface) - if netifaces.AF_INET in addrs: - for addr_info in addrs[netifaces.AF_INET]: - ip = addr_info.get("addr") - if ip and ip not in ips: - ips.append(ip) - except (ValueError, KeyError): - continue - except ImportError: - # Fallback if netifaces is not available - try: - # Try to get IP by connecting to an external address - s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) - try: - # Doesn't need to be reachable - s.connect(("8.8.8.8", 80)) - ip = s.getsockname()[0] - if ip not in ips: - ips.append(ip) - except Exception: - pass - finally: - s.close() - except Exception: - pass - - return ips - - -def check_port_connectivity(host: str, port: int, timeout: float = 2.0) -> bool: - """Check if a TCP port is reachable on the given host. - - Args: - host: Host IP address to check - port: Port number to check - timeout: Connection timeout in seconds - - Returns: - True if the port is reachable, False otherwise - """ - try: - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - sock.settimeout(timeout) - result = sock.connect_ex((host, port)) - sock.close() - return result == 0 - except Exception: - return False - - -def find_reachable_host(port: int, timeout: float = 1.0) -> Optional[str]: - """Find a reachable local host IP address for the given port. - - Tries all local IP addresses in order and returns the first one - that has the given port open. - - Args: - port: Port number to check - timeout: Connection timeout in seconds per check - - Returns: - The first reachable host IP address, or None if none found. 
- """ - local_ips = get_local_ip_addresses() - logger.info(f"Checking port {port} on local IPs: {local_ips}") - - for ip in local_ips: - if check_port_connectivity(ip, port, timeout): - logger.info(f"Found reachable host: {ip}:{port}") - return ip - - logger.warning(f"No reachable host found for port {port}") - return None From f278f8e22a39abfc9f44e7fb11b7e90d40301daf Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Sat, 28 Mar 2026 17:01:12 +0800 Subject: [PATCH 27/29] fix comments Signed-off-by: tianyi-ge --- pyproject.toml | 5 ----- transfer_queue/interface.py | 1 + 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a3824cb..1fba227 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -120,11 +120,6 @@ yuanrong = [ mooncake = [ "mooncake-transfer-engine" ] -perftest = [ - "matplotlib", - "seaborn", - "pandas" -] # If you need to mimic `package_dir={'': '.'}`: [tool.setuptools.package-dir] diff --git a/transfer_queue/interface.py b/transfer_queue/interface.py index f54c3bf..d0fd2f7 100644 --- a/transfer_queue/interface.py +++ b/transfer_queue/interface.py @@ -272,6 +272,7 @@ def init(conf: Optional[DictConfig] = None) -> None: except ValueError: logger.info("Some other rank has initialized TransferQueueController. 
Try to connect to existing controller.") _init_from_existing() + return controller_zmq_info = process_zmq_server_info(_TRANSFER_QUEUE_CONTROLLER) final_conf.controller.zmq_info = controller_zmq_info From 47977050feefa9e5e2276d4d1b23b6c8fb4df184 Mon Sep 17 00:00:00 2001 From: 0oshowero0 Date: Sat, 28 Mar 2026 16:54:48 +0800 Subject: [PATCH 28/29] fix figure drawing Signed-off-by: 0oshowero0 --- scripts/performance_test/draw_figure.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/scripts/performance_test/draw_figure.py b/scripts/performance_test/draw_figure.py index b96219a..400c7b9 100644 --- a/scripts/performance_test/draw_figure.py +++ b/scripts/performance_test/draw_figure.py @@ -95,8 +95,14 @@ def make_xlabel(size_label: str) -> str: df["Bandwidth"] = df["total_gbit_per_sec"] df["Scenario"] = df["backend_parsed"] -# Set backend display order -backend_order = ["Ray", "SimpleStorage", "Yuanrong", "MooncakeStore"] +# Set backend display order: only include backends that actually exist in the data +preferred_backend_order = ["Ray", "SimpleStorage", "Yuanrong", "MooncakeStore"] + +# Get actual backends present in the data, maintaining preferred order +actual_backends = df["Scenario"].unique().tolist() +backend_order = [b for b in preferred_backend_order if b in actual_backends] +# Add any unknown backends at the end (shouldn't happen normally) +backend_order += [b for b in actual_backends if b not in preferred_backend_order] df["Scenario"] = pd.Categorical(df["Scenario"], categories=backend_order, ordered=True) @@ -105,9 +111,9 @@ def make_xlabel(size_label: str) -> str: fig, ax = plt.subplots(figsize=(12, 7)) -# Use the backend order to ensure consistent coloring -existing_backends = df["Scenario"].unique() -palette = sns.color_palette("Set2", n_colors=len(existing_backends)) +# Use Set2 palette to generate colors for all backends +# Set2 has 8 colors, which should be enough for typical use cases +palette = 
sns.color_palette("Set2", n_colors=len(backend_order)) barplot = sns.barplot(data=df, x="X_label", y="Bandwidth", hue="Scenario", ax=ax, alpha=0.8, palette=palette) # Legend: match old style — at the top center, horizontal, with frame From 49d113909535e19e37b329422ff1274d5c6720b8 Mon Sep 17 00:00:00 2001 From: tianyi-ge Date: Sat, 28 Mar 2026 17:38:43 +0800 Subject: [PATCH 29/29] update large test config Signed-off-by: tianyi-ge --- scripts/performance_test/README_PERFTEST.md | 2 +- scripts/performance_test/run_perf_test.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/performance_test/README_PERFTEST.md b/scripts/performance_test/README_PERFTEST.md index b1d0f6a..1b5ddc6 100644 --- a/scripts/performance_test/README_PERFTEST.md +++ b/scripts/performance_test/README_PERFTEST.md @@ -113,7 +113,7 @@ HEAD_NODE_IP=192.168.0.1 WORKER_NODE_IP=192.168.0.2 DEVICE=npu ./run_perf_test.s ### Test Matrix - **Backends**: SimpleStorage, Yuanrong, MooncakeStore, Ray (baseline) -- **Data sizes**: Small (batch=1024, fields=9, seq=8192), Medium (batch=4096, fields=15, seq=32768), Large (batch=8192, fields=21, seq=128000) +- **Data sizes**: Small (batch=1024, fields=9, seq=8192), Medium (batch=4096, fields=15, seq=32768), Large (batch=8192, fields=18, seq=100000) ### Output diff --git a/scripts/performance_test/run_perf_test.sh b/scripts/performance_test/run_perf_test.sh index 0531717..19aa478 100755 --- a/scripts/performance_test/run_perf_test.sh +++ b/scripts/performance_test/run_perf_test.sh @@ -25,7 +25,7 @@ BACKENDS=("SimpleStorage" "Yuanrong" "MooncakeStore") declare -a SETTINGS=( "1024,9,8192,Small" "4096,15,32768,Medium" - "8192,21,128000,Large" + "8192,18,100000,Large" ) # Complex case flag