diff --git a/infra/aws_neuron/Dockerfile.neuron b/infra/aws_neuron/Dockerfile.neuron index d9bc8558..c68c5cc6 100644 --- a/infra/aws_neuron/Dockerfile.neuron +++ b/infra/aws_neuron/Dockerfile.neuron @@ -1,23 +1,25 @@ -# Is an mirror of -# 763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-inference-neuronx:2.1.2-transformers4.43.2-neuronx-py310-sdk2.20.0-ubuntu20.04 +# Recommended: Use the AWS Deep Learning AMI Neuron (Ubuntu 24.04) directly +# instead of building a custom Docker image. See README.md for instructions. +# +# If you must use Docker, this Dockerfile provides a starting point. +# Note: The Neuron runtime must be available on the host (--device=/dev/neuron0). + +# Base image with Neuron SDK pre-installed +# Mirror of HuggingFace Neuron inference image FROM michaelf34/aws-neuron-base-img:0.0.25-inference AS base WORKDIR /app COPY ./infra/aws_neuron/requirements_no_gpu.txt requirements_no_gpu.txt -RUN pip3 install -r requirements_no_gpu.txt +RUN pip3 install --no-cache-dir -r requirements_no_gpu.txt RUN pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com -# req -# RUN pip3 install --no-deps --upgrade optimum[neuronx]==1.20.0 -RUN pip3 install --no-deps sentence_transformers==3.3.1 -# libneuronxla-2.0.5347.0 ml-dtypes-0.2.0 neuronx-cc-2.15.143.0+e39249ad setuptools-69.5.1 torch-neuronx-2.1.2.2.3.2 torch-xla-2.1.5 transformers-neuronx-0.12.313 -RUN pip3 install --upgrade neuronx-cc==2.15.* torch-neuronx torchvision transformers-neuronx libneuronxla protobuf optimum-neuron==0.0.20 +# Tested with: optimum-neuron 0.4.4, optimum 2.0.0, neuronx-cc 2.21, torch-neuronx 2.8 +RUN pip3 install --no-cache-dir --no-deps sentence_transformers && \ + pip3 install --no-cache-dir --upgrade neuronx-cc torch-neuronx torchvision libneuronxla protobuf optimum-neuron optimum -# base is also checkpointed to -# docker pull michaelf34/aws-neuron-base-img:neuroncc2-15--optimum-1-17--transformers-4-36 FROM base AS infinity_latest COPY 
./libs/infinity_emb . RUN pip3 install -e . ENV INFINITY_BATCH_SIZE=8 ENV INFINITY_ENGINE=neuron -ENTRYPOINT [ "infinity_emb" ] \ No newline at end of file +ENTRYPOINT [ "infinity_emb" ] diff --git a/infra/aws_neuron/README.md b/infra/aws_neuron/README.md index 1f157be6..e86aff20 100644 --- a/infra/aws_neuron/README.md +++ b/infra/aws_neuron/README.md @@ -1,33 +1,146 @@ -# Launch an EC2 Instance on AWS: +# Running Infinity on AWS Inferentia / Trainium -### Start a EC2 Instance with Huggingface AMI (free AMI image with Neuron Tools/Docker installed) -- https://aws.amazon.com/marketplace/pp/prodview-gr3e6yiscria2 -- View Purchase Options -> Configure -- Use `64-Bit AMI`, `20241115 (Nov 18, 2024)` -- Region, e.g. `us-west-2` -- Set Instance type `inf2.xlarge` (has two neuron accelerators) -- Login with username `ubuntu` (using your standard EC2 setup e.g. `ssh ubuntu@ec2-14-11-13-12.us-west-2.compute.amazonaws.com`) +## Recommended: Use the HuggingFace Neuron AMI (no Docker) + +The simplest approach is to run Infinity directly on an EC2 instance with the +HuggingFace Neuron AMI, which comes with `optimum-neuron`, `optimum`, `transformers`, +and `sentence-transformers` pre-installed with compatible Neuron SDK versions. + +### 1. Launch an EC2 Instance + +- Use the **HuggingFace Neuron AMI** (`huggingface-neuron-*`) from the AWS Marketplace + - This AMI ships optimum-neuron 0.4.4, neuronx-cc 2.21, Python 3.10 — all compatible + - Search for `huggingface-neuron` in the EC2 AMI catalog +- Instance type: **inf2.xlarge** (2 NeuronCores, 32 GB), **trn2.3xlarge** (4 NeuronCores, 128 GB), or larger +- Disk: The AMI defaults to 512 GB + +### 2. 
Install Infinity + +```bash +# SSH into the instance +ssh ubuntu@<instance-public-dns> + +# Activate the pre-installed PyTorch environment +source /opt/aws_neuronx_venv_pytorch_2_8/bin/activate + +# Clone and install Infinity from source (don't overwrite Neuron packages) +git clone https://github.com/michaelfeil/infinity.git ~/infinity +cd ~/infinity/libs/infinity_emb +pip install --no-deps . + +# Install remaining runtime dependencies (most are already present on the HF AMI) +pip install uvicorn fastapi orjson typer httptools pydantic posthog \ + prometheus-fastapi-instrumentator hf_transfer rich +``` + +### 3. Run Infinity with Neuron engine + +```bash +# Single core (uses one NeuronCore) +infinity_emb v2 --engine neuron --model-id BAAI/bge-small-en-v1.5 --batch-size 4 +``` + +The first run will compile the model for Neuron (~100 seconds). Subsequent runs use the cached compilation. + +### 4. Scale across all NeuronCores (data parallelism) + +The Neuron runtime is limited to one model per process. To use all NeuronCores, +run one server process per core, each pinned to a different core: + +```bash +# inf2.xlarge has 2 NeuronCores (cores 0 and 1) +NEURON_RT_VISIBLE_CORES=0 infinity_emb v2 --engine neuron --model-id BAAI/bge-small-en-v1.5 --batch-size 4 --port 7997 & +NEURON_RT_VISIBLE_CORES=1 infinity_emb v2 --engine neuron --model-id BAAI/bge-small-en-v1.5 --batch-size 4 --port 7998 & + +# trn2.3xlarge has 4 NeuronCores (cores 0-3) +NEURON_RT_VISIBLE_CORES=0 infinity_emb v2 --engine neuron --model-id BAAI/bge-small-en-v1.5 --batch-size 4 --port 7997 & +NEURON_RT_VISIBLE_CORES=1 infinity_emb v2 --engine neuron --model-id BAAI/bge-small-en-v1.5 --batch-size 4 --port 7998 & +NEURON_RT_VISIBLE_CORES=2 infinity_emb v2 --engine neuron --model-id BAAI/bge-small-en-v1.5 --batch-size 4 --port 7999 & +NEURON_RT_VISIBLE_CORES=3 infinity_emb v2 --engine neuron --model-id BAAI/bge-small-en-v1.5 --batch-size 4 --port 8000 & +``` + +Then use a load balancer (nginx, HAProxy, etc.) 
to distribute requests across +ports. Throughput scales with added cores: inf2 reached ~2x with 2 cores (216 → 427 emb/s); see the table below for measured trn2 scaling. + +### 5. Test it + +```bash +curl http://localhost:7997/embeddings \ + -H "Content-Type: application/json" \ + -d '{"input": ["Hello world", "How are you?"], "model": "BAAI/bge-small-en-v1.5"}' +``` + +## Performance (bge-small-en-v1.5, batch_size=4) + +### Latency (serial requests, P50) + +| Workload | g5.xlarge (GPU) | inf2.xlarge (1 core) | trn2.3xlarge (1 core) | +|----------|----------------|---------------------|----------------------| +| 1 short sentence | 14.2ms | 25.0ms | 19.0ms | +| 4 short sentences | 16.0ms | 25.6ms | 19.5ms | +| 4 long sentences | 16.2ms | 26.0ms | 20.3ms | + +### Throughput (concurrent requests, data parallelism) + +| Instance | Cores | Peak emb/s | Concurrency | +|----------|-------|-----------|-------------| +| g5.xlarge (GPU) | 1 GPU | 536 | 8 | +| inf2.xlarge | 1 core | 216 | 4 | +| inf2.xlarge | 2 cores | 427 | 4 | +| trn2.3xlarge | 1 core | 348 | 4 | +| trn2.3xlarge | 4 cores | 753 | 4 | + +**Notes:** +- g5.xlarge uses `--engine torch`; inf2/trn2 use `--engine neuron` +- Neuron latency is constant regardless of batch content (padded to compiled batch size) +- trn2 has ~30% lower latency per core than inf2 (19ms vs 25ms) +- Throughput scales with data parallelism (1 process per core); inf2 scaled near-linearly (2 cores ≈ 2x) +- Compilation time: ~60-100 seconds on first run (cached after that) + +Tested on HuggingFace Neuron AMI (optimum-neuron 0.4.4, neuronx-cc 2.21, SDK 2.27) +and Deep Learning AMI Neuron Ubuntu 22.04 (SDK 2.28) for trn2. 
+ +## Tested Stack + +| Package | Version | +|---------|---------| +| optimum-neuron | 0.4.4 | +| optimum | 2.0.0 | +| neuronx-cc | 2.21.33363 | +| torch-neuronx | 2.8.0.2.10 | +| torch | 2.8.0 | +| transformers | 4.57.3 | +| Python | 3.10.12 | + +## Alternative: Docker + +### Build from source -### Optional: build docker image from scratch ```bash git clone https://github.com/michaelfeil/infinity cd infinity -docker buildx build -t michaelf34/infinity:0.0.x-neuron -f ./infra/aws_neuron/Dockerfile.neuron +docker buildx build -t infinity-neuron -f ./infra/aws_neuron/Dockerfile.neuron . ``` -### Run the image on EC2 +### Run on EC2 ```bash -docker run -it --rm --device=/dev/neuron0 michaelf34/infinity:0.0.71-neuron v2 --model-id BAAI/bge-small-en-v1.5 --batch-size 8 --log-level debug +docker run -it --rm --device=/dev/neuron0 infinity-neuron \ + v2 --model-id BAAI/bge-small-en-v1.5 --batch-size 8 ``` -### Run task on ECS (Work in progress) +**Note:** The host must have the Neuron driver installed. The Docker approach is less tested than the direct AMI approach above. + +## Limitations -1. Create a AWS ECS Cluster with EC2: -- Amazon Machine Image (AMI): Amazon Linux 2 - *Neuron* -- inf2.xlarge as machine type. +- The `--engine neuron` flag currently supports **text embeddings only** (no reranking or classification) +- The Neuron engine requires a **constant batch size** (requests are padded automatically) +- Models are compiled on first use; compilation can take 60-120 seconds + +## ECS Deployment + +See the ECS task definition example below for container orchestration: -2. 
Create a Task: ```json { "family": "ecs-infinity-neuron", @@ -45,10 +158,7 @@ docker run -it --rm --device=/dev/neuron0 michaelf34/infinity:0.0.71-neuron v2 - "executionRoleArn": "${YOUR_EXECUTION_ROLE}", "containerDefinitions": [ { - "entryPoint": [ - "infinity_emb", - "v2" - ], + "entryPoint": ["infinity_emb", "v2"], "portMappings": [ { "hostPort": 7997, @@ -61,41 +171,19 @@ docker run -it --rm --device=/dev/neuron0 michaelf34/infinity:0.0.71-neuron v2 - { "containerPath": "/dev/neuron0", "hostPath": "/dev/neuron0", - "permissions": [ - "read", - "write" - ] + "permissions": ["read", "write"] } ], "capabilities": { - "add": [ - "IPC_LOCK" - ] + "add": ["IPC_LOCK"] } }, "cpu": 0, "memoryReservation": 1000, - "image": "michaelf34/infinity:0.0.71-neuron", + "image": "infinity-neuron:latest", "essential": true, "name": "infinity-neuron" } ] } ``` - -You can also add logging: -``` - // same indent as "linuxParameters" - "logConfiguration": { - "logDriver": "awslogs", - "options": { - "awslogs-group": "/ecs/ecs-infinity-neuron", - "mode": "non-blocking", - "awslogs-create-group": "true", - "max-buffer-size": "25m", - "awslogs-region": "us-west-2", // set correct location. 
- "awslogs-stream-prefix": "ecs" - }, - "secretOptions": [] - } -``` \ No newline at end of file diff --git a/libs/infinity_emb/infinity_emb/transformer/acceleration.py b/libs/infinity_emb/infinity_emb/transformer/acceleration.py index 1d7b7c7f..90964a4e 100644 --- a/libs/infinity_emb/infinity_emb/transformer/acceleration.py +++ b/libs/infinity_emb/infinity_emb/transformer/acceleration.py @@ -8,10 +8,19 @@ from infinity_emb.primitives import Device if CHECK_OPTIMUM.is_available: - from optimum.bettertransformer import ( # type: ignore[import-untyped] - BetterTransformer, - BetterTransformerManager, - ) + try: + from optimum.bettertransformer import ( # type: ignore[import-untyped] + BetterTransformer, + BetterTransformerManager, + ) + except (ImportError, ModuleNotFoundError): + # optimum.bettertransformer was removed in optimum >= 2.0 + CHECK_OPTIMUM.mark_dirty( + ImportError( + "optimum.bettertransformer is not available in this version of optimum. " + "BetterTransformer support requires optimum < 2.0." 
+ ) + ) if CHECK_TORCH.is_available: import torch @@ -37,6 +46,9 @@ def check_if_bettertransformer_possible(engine_args: "EngineArgs") -> bool: if not engine_args.bettertransformer: return False + if "BetterTransformerManager" not in globals(): + return False + config = AutoConfig.from_pretrained( pretrained_model_name_or_path=engine_args.model_name_or_path, revision=engine_args.revision, diff --git a/libs/infinity_emb/infinity_emb/transformer/embedder/neuron.py b/libs/infinity_emb/infinity_emb/transformer/embedder/neuron.py index 433bd67c..4b9f8411 100644 --- a/libs/infinity_emb/infinity_emb/transformer/embedder/neuron.py +++ b/libs/infinity_emb/infinity_emb/transformer/embedder/neuron.py @@ -2,10 +2,7 @@ # Copyright (c) 2023-now michaelfeil import copy -import json -import subprocess -from typing import Union -from functools import cache +import os import numpy as np from infinity_emb._optional_imports import CHECK_OPTIMUM_NEURON, CHECK_TORCH @@ -30,22 +27,6 @@ ] -@cache -def get_nc_count() -> Union[int, None]: - """Returns the number of neuron cores on the current instance.""" - try: - cmd = "neuron-ls --json-output" - result = subprocess.run(cmd, shell=True, capture_output=True) - print("inferring nc_count from `neuron-ls`") - print(result.stdout.decode("utf-8")) - json_output = json.loads(result.stdout) - count = sum([x["nc_count"] for x in json_output]) - print(f"nc_count={count}") - return count - except Exception: - return None - - def pad_up_to_size(desired_max_bs: int, input_ids: "torch.Tensor") -> "torch.Tensor": """input_ids a 2D array with batch_size on dim=0 @@ -97,7 +78,13 @@ def __init__(self, *, engine_args: EngineArgs): ) self._infinity_tokenizer = copy.deepcopy(self.tokenizer) - compiler_args = {"num_cores": get_nc_count(), "auto_cast_type": "fp16"} + # Default to 1 NeuronCore (data parallelism). For large models that + # require tensor parallelism across multiple cores, set the + # NEURON_NUM_CORES environment variable. 
For data-parallel scaling, + # run separate server processes pinned to individual cores via + # NEURON_RT_VISIBLE_CORES (see infra/aws_neuron/README.md). + num_cores = int(os.environ.get("NEURON_NUM_CORES", "1")) + compiler_args = {"num_cores": num_cores, "auto_cast_type": "fp16"} input_shapes = { "batch_size": engine_args.batch_size, "sequence_length": ( @@ -124,7 +111,6 @@ def encode_pre(self, sentences: list[str]) -> dict[str, "torch.Tensor"]: padding=True, truncation="longest_first", return_tensors="pt", - return_token_type_ids=False, ) return input_dict