# docker_mlx_cpp — The NVIDIA Container Toolkit for Mac
#
# Architecture:
#   Container → mlx-gateway:8080 → MLX Daemon (host:12435) → Metal GPU
#                                → DMR (fallback for GGUF/llama.cpp)
#
# Prerequisites:
#   1. MLX Daemon running on host: mlx-cpp serve
#   2. Docker Desktop 4.62+ with Model Runner enabled (optional, for DMR fallback)
#
# Usage:
#   docker compose up -d                           # Start gateway
#   docker compose --profile examples up           # Run example app
#   docker compose --profile test up curl-test     # Quick curl test
#   docker compose --profile train up trainer      # LoRA training example
#   docker compose --profile gpu-test up gpu-test  # Test ALL 100+ GPU ops
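#
# Quick sanity check once the gateway is up (a hedged sketch: /health is the
# route the gateway's own healthcheck hits below; the /v1/chat/completions
# path is an assumption based on the OpenAI-compatible base URL the example
# app uses):
#
#   curl -s http://localhost:8080/health
#   curl -s http://localhost:8080/v1/chat/completions \
#     -H 'Content-Type: application/json' \
#     -d '{"model": "mlx-community/SmolLM2-360M-Instruct-4bit",
#          "messages": [{"role": "user", "content": "Hello"}]}'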
services:
  # ============================================================
  # MLX Gateway — unified proxy to Metal GPU compute
  # Routes: inference, training, image gen, audio, embeddings
  # ============================================================
  mlx-gateway:
    build:
      context: ./gateway
      dockerfile: Dockerfile
    container_name: mlx-gateway
    ports:
      - "8080:8080"
    environment:
      - MLX_DAEMON_URL=http://host.docker.internal:12435
      - DMR_UPSTREAM=http://model-runner.docker.internal
      - GATEWAY_PORT=8080
      - LOG_LEVEL=info
      - RATE_LIMIT_RPM=120
    healthcheck:
      test: ["CMD", "curl", "-sf", "http://localhost:8080/health"]
      interval: 10s
      timeout: 5s
      retries: 3
    restart: unless-stopped
    networks:
      - mlx-net
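
  # Any container joined to mlx-net can reach the gateway by service name.
  # For example (a sketch; it reuses the /health route from the healthcheck
  # above and the mlx-network name defined at the bottom of this file):
  #
  #   docker run --rm --network mlx-network curlimages/curl:8.5.0 \
  #     -s http://mlx-gateway:8080/health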
  # ============================================================
  # Example: Python app using GPU inference via OpenAI SDK
  # ============================================================
  example-app:
    build:
      context: ./examples/python-client
      dockerfile: Dockerfile
    container_name: mlx-example-app
    environment:
      - OPENAI_BASE_URL=http://mlx-gateway:8080/v1
      - OPENAI_API_KEY=not-needed
      - OPENAI_MODEL=mlx-community/SmolLM2-360M-Instruct-4bit
    depends_on:
      mlx-gateway:
        condition: service_healthy
    networks:
      - mlx-net
    profiles:
      - examples
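
  # A minimal sketch of what a client such as examples/python-client could
  # look like (assumes the standard openai Python SDK, which reads
  # OPENAI_BASE_URL and OPENAI_API_KEY from the environment; this is not the
  # repo's actual client code):
  #
  #   import os
  #   from openai import OpenAI
  #   client = OpenAI()
  #   resp = client.chat.completions.create(
  #       model=os.environ["OPENAI_MODEL"],
  #       messages=[{"role": "user", "content": "Hello from a container!"}],
  #   )
  #   print(resp.choices[0].message.content)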
  # ============================================================
  # Example: LoRA training from a container
  # ============================================================
  trainer:
    image: python:3.12-slim
    container_name: mlx-trainer
    environment:
      - MLX_URL=http://mlx-gateway:8080
    # python:3.12-slim ships without httpx, so install it before running the
    # inline script; the script reads the gateway URL from MLX_URL.
    command:
      - sh
      - -c
      - |
        pip install --quiet httpx
        python3 - <<'PY'
        import os
        import httpx
        url = os.environ['MLX_URL'] + '/train/lora'
        data = {'model': 'mlx-community/SmolLM2-360M-Instruct-4bit',
                'dataset': '/data/train.jsonl', 'epochs': 1}
        resp = httpx.post(url, json=data)
        print('Training started:', resp.json())
        PY
    depends_on:
      mlx-gateway:
        condition: service_healthy
    networks:
      - mlx-net
    profiles:
      - train
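
  # The same training request can be issued ad hoc from the host. This mirrors
  # the payload the trainer sends; the /train/lora route and fields come from
  # the inline script above, not from separate API documentation:
  #
  #   curl -s http://localhost:8080/train/lora \
  #     -H 'Content-Type: application/json' \
  #     -d '{"model": "mlx-community/SmolLM2-360M-Instruct-4bit",
  #          "dataset": "/data/train.jsonl", "epochs": 1}'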
  # ============================================================
  # GPU Test: validates ALL 100+ Metal GPU operations
  # Proves any container can access the full MLX compute stack
  # ============================================================
  gpu-test:
    build:
      context: ./examples/gpu-test
      dockerfile: Dockerfile
    container_name: mlx-gpu-test
    environment:
      - MLX_URL=http://mlx-gateway:8080
    depends_on:
      mlx-gateway:
        condition: service_healthy
    networks:
      - mlx-net
    profiles:
      - gpu-test
  # ============================================================
  # Test: curl container for ad-hoc testing
  # ============================================================
  curl-test:
    image: curlimages/curl:8.5.0
    container_name: mlx-curl-test
    entrypoint: ["sh", "-c", "while true; do sleep 3600; done"]
    networks:
      - mlx-net
    profiles:
      - test
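
  # Example ad-hoc check from inside this container (a sketch; /health is the
  # same route the gateway's healthcheck uses):
  #
  #   docker compose --profile test exec curl-test \
  #     curl -s http://mlx-gateway:8080/health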
networks:
  mlx-net:
    driver: bridge
    name: mlx-network