# docker_mlx_cpp — The NVIDIA Container Toolkit for Mac
#
# Architecture:
#   Container → mlx-gateway:8080 → MLX Daemon (host:12435) → Metal GPU
#                                → DMR (fallback for GGUF/llama.cpp)
#
# Prerequisites:
#   1. MLX Daemon running on host: mlx-cpp serve
#   2. Docker Desktop 4.62+ with Model Runner enabled (optional, for DMR fallback)
#
# Usage:
#   docker compose up -d                           # Start gateway
#   docker compose --profile examples up           # Run example app
#   docker compose --profile test up curl-test     # Quick curl test
#   docker compose --profile train up trainer      # LoRA training example
#   docker compose --profile gpu-test up gpu-test  # Test ALL 100+ GPU ops
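#
# Quick sanity check once the gateway is up (a hedged sketch: /health is the
# route the gateway's own healthcheck hits below; the /v1/chat/completions
# path is an assumption based on the OpenAI-compatible base URL the example
# app uses):
#
#   curl -s http://localhost:8080/health
#   curl -s http://localhost:8080/v1/chat/completions \
#     -H 'Content-Type: application/json' \
#     -d '{"model": "mlx-community/SmolLM2-360M-Instruct-4bit",
#          "messages": [{"role": "user", "content": "Hello"}]}'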
services:
  # ============================================================
  # MLX Gateway — unified proxy to Metal GPU compute
  # Routes: inference, training, image gen, audio, embeddings
  # ============================================================
  mlx-gateway:
    build:
      context: ./gateway
      dockerfile: Dockerfile
    container_name: mlx-gateway
    ports:
      - "8080:8080"
    environment:
      - MLX_DAEMON_URL=http://host.docker.internal:12435
      - DMR_UPSTREAM=http://model-runner.docker.internal
      - GATEWAY_PORT=8080
      - LOG_LEVEL=info
      - RATE_LIMIT_RPM=120
    healthcheck:
      test: ["CMD", "curl", "-sf", "http://localhost:8080/health"]
      interval: 10s
      timeout: 5s
      retries: 3
    restart: unless-stopped
    networks:
      - mlx-net
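
  # Any container joined to mlx-net can reach the gateway by service name.
  # For example (a sketch; it reuses the /health route from the healthcheck
  # above and the mlx-network name defined at the bottom of this file):
  #
  #   docker run --rm --network mlx-network curlimages/curl:8.5.0 \
  #     -s http://mlx-gateway:8080/health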
  # ============================================================
  # Example: Python app using GPU inference via OpenAI SDK
  # ============================================================
  example-app:
    build:
      context: ./examples/python-client
      dockerfile: Dockerfile
    container_name: mlx-example-app
    environment:
      - OPENAI_BASE_URL=http://mlx-gateway:8080/v1
      - OPENAI_API_KEY=not-needed
      - OPENAI_MODEL=mlx-community/SmolLM2-360M-Instruct-4bit
    depends_on:
      mlx-gateway:
        condition: service_healthy
    networks:
      - mlx-net
    profiles:
      - examples
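
  # A minimal sketch of what a client such as examples/python-client could
  # look like (assumes the standard openai Python SDK, which reads
  # OPENAI_BASE_URL and OPENAI_API_KEY from the environment; this is not the
  # repo's actual client code):
  #
  #   import os
  #   from openai import OpenAI
  #   client = OpenAI()
  #   resp = client.chat.completions.create(
  #       model=os.environ["OPENAI_MODEL"],
  #       messages=[{"role": "user", "content": "Hello from a container!"}],
  #   )
  #   print(resp.choices[0].message.content)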
  # ============================================================
  # Example: LoRA training from a container
  # ============================================================
  trainer:
    image: python:3.12-slim
    container_name: mlx-trainer
    environment:
      - MLX_URL=http://mlx-gateway:8080
    # python:3.12-slim ships without httpx, so install it before running the
    # inline script; the script reads the gateway URL from MLX_URL.
    command:
      - sh
      - -c
      - |
        pip install --quiet httpx
        python3 - <<'PY'
        import os
        import httpx
        url = os.environ['MLX_URL'] + '/train/lora'
        data = {'model': 'mlx-community/SmolLM2-360M-Instruct-4bit',
                'dataset': '/data/train.jsonl', 'epochs': 1}
        resp = httpx.post(url, json=data)
        print('Training started:', resp.json())
        PY
    depends_on:
      mlx-gateway:
        condition: service_healthy
    networks:
      - mlx-net
    profiles:
      - train
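
  # The same training request can be issued ad hoc from the host. This mirrors
  # the payload the trainer sends; the /train/lora route and fields come from
  # the inline script above, not from separate API documentation:
  #
  #   curl -s http://localhost:8080/train/lora \
  #     -H 'Content-Type: application/json' \
  #     -d '{"model": "mlx-community/SmolLM2-360M-Instruct-4bit",
  #          "dataset": "/data/train.jsonl", "epochs": 1}'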
  # ============================================================
  # GPU Test: validates ALL 100+ Metal GPU operations
  # Proves any container can access the full MLX compute stack
  # ============================================================
  gpu-test:
    build:
      context: ./examples/gpu-test
      dockerfile: Dockerfile
    container_name: mlx-gpu-test
    environment:
      - MLX_URL=http://mlx-gateway:8080
    depends_on:
      mlx-gateway:
        condition: service_healthy
    networks:
      - mlx-net
    profiles:
      - gpu-test
  # ============================================================
  # Test: curl container for ad-hoc testing
  # ============================================================
  curl-test:
    image: curlimages/curl:8.5.0
    container_name: mlx-curl-test
    entrypoint: ["sh", "-c", "while true; do sleep 3600; done"]
    networks:
      - mlx-net
    profiles:
      - test
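
  # Example ad-hoc check from inside this container (a sketch; /health is the
  # same route the gateway's healthcheck uses):
  #
  #   docker compose --profile test exec curl-test \
  #     curl -s http://mlx-gateway:8080/health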
networks:
  mlx-net:
    driver: bridge
    name: mlx-network