24 changes: 13 additions & 11 deletions infra/aws_neuron/Dockerfile.neuron
Original file line number Diff line number Diff line change
@@ -1,23 +1,25 @@
# Recommended: Use the AWS Deep Learning AMI Neuron (Ubuntu 24.04) directly
# instead of building a custom Docker image. See README.md for instructions.
#
# If you must use Docker, this Dockerfile provides a starting point.
# Note: The Neuron runtime must be available on the host (--device=/dev/neuron0).

# Base image: mirror of the HuggingFace Neuron inference image (Neuron SDK pre-installed)
FROM michaelf34/aws-neuron-base-img:0.0.25-inference AS base

WORKDIR /app

COPY ./infra/aws_neuron/requirements_no_gpu.txt requirements_no_gpu.txt
RUN pip3 install --no-cache-dir -r requirements_no_gpu.txt
RUN pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
# Tested with: optimum-neuron 0.4.4, optimum 2.0.0, neuronx-cc 2.21, torch-neuronx 2.8
RUN pip3 install --no-cache-dir --no-deps sentence_transformers && \
    pip3 install --no-cache-dir --upgrade neuronx-cc torch-neuronx torchvision libneuronxla protobuf optimum-neuron optimum

FROM base AS infinity_latest
COPY ./libs/infinity_emb .
RUN pip3 install -e .
ENV INFINITY_BATCH_SIZE=8
ENV INFINITY_ENGINE=neuron
ENTRYPOINT [ "infinity_emb" ]
180 changes: 134 additions & 46 deletions infra/aws_neuron/README.md
@@ -1,33 +1,146 @@
# Running Infinity on AWS Inferentia / Trainium

## Recommended: Use the HuggingFace Neuron AMI (no Docker)

The simplest approach is to run Infinity directly on an EC2 instance with the
HuggingFace Neuron AMI, which comes with `optimum-neuron`, `optimum`, `transformers`,
and `sentence-transformers` pre-installed with compatible Neuron SDK versions.

### 1. Launch an EC2 Instance

- Use the **HuggingFace Neuron AMI** (`huggingface-neuron-*`) from the AWS Marketplace
- This AMI ships optimum-neuron 0.4.4, neuronx-cc 2.21, and Python 3.10, all mutually compatible
- Search for `huggingface-neuron` in the EC2 AMI catalog
- Instance type: **inf2.xlarge** (2 NeuronCores, 32 GB), **trn2.3xlarge** (4 NeuronCores, 128 GB), or larger
- Disk: The AMI defaults to 512 GB

### 2. Install Infinity

```bash
# SSH into the instance
ssh ubuntu@<your-instance-ip>

# Activate the pre-installed PyTorch environment
source /opt/aws_neuronx_venv_pytorch_2_8/bin/activate

# Clone and install Infinity from source (don't overwrite Neuron packages)
git clone https://github.com/michaelfeil/infinity.git ~/infinity
cd ~/infinity/libs/infinity_emb
pip install --no-deps .

# Install remaining runtime dependencies (most are already present on the HF AMI)
pip install uvicorn fastapi orjson typer httptools pydantic posthog \
    prometheus-fastapi-instrumentator hf_transfer rich
```

### 3. Run Infinity with Neuron engine

```bash
# Single core (uses one NeuronCore)
infinity_emb v2 --engine neuron --model-id BAAI/bge-small-en-v1.5 --batch-size 4
```

The first run will compile the model for Neuron (~100 seconds). Subsequent runs use the cached compilation.

### 4. Scale across all NeuronCores (data parallelism)

The Neuron runtime is limited to one model per process. To use all NeuronCores,
run one server process per core, each pinned to a different core:

```bash
# inf2.xlarge has 2 NeuronCores (cores 0 and 1)
NEURON_RT_VISIBLE_CORES=0 infinity_emb v2 --engine neuron --model-id BAAI/bge-small-en-v1.5 --batch-size 4 --port 7997 &
NEURON_RT_VISIBLE_CORES=1 infinity_emb v2 --engine neuron --model-id BAAI/bge-small-en-v1.5 --batch-size 4 --port 7998 &

# trn2.3xlarge has 4 NeuronCores (cores 0-3)
NEURON_RT_VISIBLE_CORES=0 infinity_emb v2 --engine neuron --model-id BAAI/bge-small-en-v1.5 --batch-size 4 --port 7997 &
NEURON_RT_VISIBLE_CORES=1 infinity_emb v2 --engine neuron --model-id BAAI/bge-small-en-v1.5 --batch-size 4 --port 7998 &
NEURON_RT_VISIBLE_CORES=2 infinity_emb v2 --engine neuron --model-id BAAI/bge-small-en-v1.5 --batch-size 4 --port 7999 &
NEURON_RT_VISIBLE_CORES=3 infinity_emb v2 --engine neuron --model-id BAAI/bge-small-en-v1.5 --batch-size 4 --port 8000 &
```

Then use a load balancer (nginx, HAProxy, etc.) to distribute requests across the
ports. Throughput scales with the number of processes, roughly 2x with 2 cores on
inf2 (see the benchmark tables below).
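As a sketch, the load-balancing step could use an nginx reverse proxy like the following. The ports match the trn2 example above; this config is not part of the repo, so adjust names and ports to your setup:

```nginx
upstream infinity_neuron {
    # One entry per NeuronCore-pinned Infinity process (ports from above).
    server 127.0.0.1:7997;
    server 127.0.0.1:7998;
    server 127.0.0.1:7999;
    server 127.0.0.1:8000;
}

server {
    listen 80;
    location / {
        # nginx round-robins across the upstream servers by default.
        proxy_pass http://infinity_neuron;
    }
}
```

Round-robin is a reasonable default here because each per-core process has identical capacity.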

### 5. Test it

```bash
curl http://localhost:7997/embeddings \
  -H "Content-Type: application/json" \
  -d '{"input": ["Hello world", "How are you?"], "model": "BAAI/bge-small-en-v1.5"}'
```
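The same request can be built from Python with only the standard library. A sketch, assuming a server on port 7997 as started above:

```python
import json
import urllib.request

# Build the same request the curl example above sends.
payload = {
    "input": ["Hello world", "How are you?"],
    "model": "BAAI/bge-small-en-v1.5",
}
req = urllib.request.Request(
    "http://localhost:7997/embeddings",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)

# With a server running, uncomment to send; the response carries one
# "embedding" vector per input sentence:
# with urllib.request.urlopen(req) as resp:
#     data = json.load(resp)
#     print(len(data["data"]))
```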

## Performance (bge-small-en-v1.5, batch_size=4)

### Latency (serial requests, P50)

| Workload | g5.xlarge (GPU) | inf2.xlarge (1 core) | trn2.3xlarge (1 core) |
|----------|----------------|---------------------|----------------------|
| 1 short sentence | 14.2ms | 25.0ms | 19.0ms |
| 4 short sentences | 16.0ms | 25.6ms | 19.5ms |
| 4 long sentences | 16.2ms | 26.0ms | 20.3ms |

### Throughput (concurrent requests, data parallelism)

| Instance | Cores | Peak emb/s | Concurrency |
|----------|-------|-----------|-------------|
| g5.xlarge (GPU) | 1 GPU | 536 | 8 |
| inf2.xlarge | 1 core | 216 | 4 |
| inf2.xlarge | 2 cores | 427 | 4 |
| trn2.3xlarge | 1 core | 348 | 4 |
| trn2.3xlarge | 4 cores | 753 | 4 |

**Notes:**
- g5.xlarge uses `--engine torch`; inf2/trn2 use `--engine neuron`
- Neuron latency is constant regardless of batch content (padded to compiled batch size)
- trn2 has ~25% lower per-core latency than inf2 (19ms vs 25ms)
- Throughput scales with data parallelism (one process per core): near-linear on inf2 (216 → 427 emb/s with 2 cores), sublinear on trn2 at 4 cores (348 → 753 emb/s)
- Compilation time: ~60-100 seconds on first run (cached after that)

Tested on HuggingFace Neuron AMI (optimum-neuron 0.4.4, neuronx-cc 2.21, SDK 2.27)
and Deep Learning AMI Neuron Ubuntu 22.04 (SDK 2.28) for trn2.

## Tested Stack

| Package | Version |
|---------|---------|
| optimum-neuron | 0.4.4 |
| optimum | 2.0.0 |
| neuronx-cc | 2.21.33363 |
| torch-neuronx | 2.8.0.2.10 |
| torch | 2.8.0 |
| transformers | 4.57.3 |
| Python | 3.10.12 |

## Alternative: Docker

### Build from source

```bash
git clone https://github.com/michaelfeil/infinity
cd infinity
docker buildx build -t infinity-neuron -f ./infra/aws_neuron/Dockerfile.neuron .
```

### Run on EC2

```bash
docker run -it --rm --device=/dev/neuron0 infinity-neuron \
  v2 --model-id BAAI/bge-small-en-v1.5 --batch-size 8
```

**Note:** The host must have the Neuron driver installed. The Docker approach is less tested than the direct AMI approach above.

## Limitations

- The `--engine neuron` flag currently supports **text embeddings only** (no reranking or classification)
- The Neuron engine requires a **constant batch size** (requests are padded automatically)
- Models are compiled on first use; compilation can take 60-120 seconds
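The constant-batch-size behavior can be illustrated with a small sketch (plain Python, not Infinity's actual implementation, which pads torch tensors): requests smaller than the compiled batch size are padded up, so every forward pass costs the same as a full batch.

```python
def pad_batch(rows: list, compiled_batch_size: int) -> list:
    """Pad a batch by repeating its last row until it reaches the
    batch size the Neuron model was compiled for (illustrative only)."""
    missing = compiled_batch_size - len(rows)
    return rows + [rows[-1]] * max(missing, 0)

# A 2-item request on a model compiled with batch_size=4 still runs 4 rows.
batch = pad_batch([[101, 2023], [101, 2003]], 4)
assert len(batch) == 4
```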

## ECS Deployment

See the ECS task definition example below for container orchestration:

```json
{
"family": "ecs-infinity-neuron",
@@ -45,10 +158,7 @@
"executionRoleArn": "${YOUR_EXECUTION_ROLE}",
"containerDefinitions": [
{
"entryPoint": [
"infinity_emb",
"v2"
],
"entryPoint": ["infinity_emb", "v2"],
"portMappings": [
{
"hostPort": 7997,
@@ -61,41 +171,19 @@
{
"containerPath": "/dev/neuron0",
"hostPath": "/dev/neuron0",
"permissions": [
"read",
"write"
]
"permissions": ["read", "write"]
}
],
"capabilities": {
"add": [
"IPC_LOCK"
]
"add": ["IPC_LOCK"]
}
},
"cpu": 0,
"memoryReservation": 1000,
"image": "michaelf34/infinity:0.0.71-neuron",
"image": "infinity-neuron:latest",
"essential": true,
"name": "infinity-neuron"
}
]
}
```

You can also add logging:
```
// same indent as "linuxParameters"
"logConfiguration": {
"logDriver": "awslogs",
"options": {
"awslogs-group": "/ecs/ecs-infinity-neuron",
"mode": "non-blocking",
"awslogs-create-group": "true",
"max-buffer-size": "25m",
"awslogs-region": "us-west-2", // set correct location.
"awslogs-stream-prefix": "ecs"
},
"secretOptions": []
}
```
20 changes: 16 additions & 4 deletions libs/infinity_emb/infinity_emb/transformer/acceleration.py
@@ -8,10 +8,19 @@
from infinity_emb.primitives import Device

if CHECK_OPTIMUM.is_available:
    try:
        from optimum.bettertransformer import (  # type: ignore[import-untyped]
            BetterTransformer,
            BetterTransformerManager,
        )
    except (ImportError, ModuleNotFoundError):
        # optimum.bettertransformer was removed in optimum >= 2.0
        CHECK_OPTIMUM.mark_dirty(
            ImportError(
                "optimum.bettertransformer is not available in this version of optimum. "
                "BetterTransformer support requires optimum < 2.0."
            )
        )

if CHECK_TORCH.is_available:
import torch
@@ -37,6 +46,9 @@ def check_if_bettertransformer_possible(engine_args: "EngineArgs") -> bool:
    if not engine_args.bettertransformer:
        return False

    if "BetterTransformerManager" not in globals():
        return False

    config = AutoConfig.from_pretrained(
        pretrained_model_name_or_path=engine_args.model_name_or_path,
        revision=engine_args.revision,
30 changes: 8 additions & 22 deletions libs/infinity_emb/infinity_emb/transformer/embedder/neuron.py
@@ -2,10 +2,7 @@
# Copyright (c) 2023-now michaelfeil

import copy
import os

import numpy as np

from infinity_emb._optional_imports import CHECK_OPTIMUM_NEURON, CHECK_TORCH
@@ -30,22 +27,6 @@
]


def pad_up_to_size(desired_max_bs: int, input_ids: "torch.Tensor") -> "torch.Tensor":
    """input_ids: a 2D tensor with batch_size on dim=0

@@ -97,7 +78,13 @@ def __init__(self, *, engine_args: EngineArgs):
        )
        self._infinity_tokenizer = copy.deepcopy(self.tokenizer)

        # Default to 1 NeuronCore (data parallelism). For large models that
        # require tensor parallelism across multiple cores, set the
        # NEURON_NUM_CORES environment variable. For data-parallel scaling,
        # run separate server processes pinned to individual cores via
        # NEURON_RT_VISIBLE_CORES (see infra/aws_neuron/README.md).
        num_cores = int(os.environ.get("NEURON_NUM_CORES", "1"))
        compiler_args = {"num_cores": num_cores, "auto_cast_type": "fp16"}
        input_shapes = {
            "batch_size": engine_args.batch_size,
            "sequence_length": (
@@ -124,7 +111,6 @@ def encode_pre(self, sentences: list[str]) -> dict[str, "torch.Tensor"]:
            padding=True,
            truncation="longest_first",
            return_tensors="pt",
        )
        return input_dict
