-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconfig.example.yaml
More file actions
32 lines (24 loc) · 1.05 KB
/
config.example.yaml
File metadata and controls
32 lines (24 loc) · 1.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
---
# Model Manager configuration
#
# Copy to config.local.yaml and edit to override defaults.
# Values shown here are the defaults from config.py.

# Ollama connection
OLLAMA_BASE_URL: "http://localhost:11434"

# VRAM management
VRAM_SAFETY_MARGIN_MB: 1024  # Reserve 1 GB of VRAM headroom
VRAM_ESTIMATION_MULTIPLIER: 1.3  # Multiply model size by this for initial VRAM estimate

# Model management
MODEL_KEEP_ALIVE: 300  # Seconds to keep models loaded after last use
MAX_CONCURRENT_PER_MODEL: 20  # Max concurrent requests per loaded model

# Scheduler
SCHEDULER_STRATEGY: "demand_based"  # Options: greedy, priority_first, demand_based, balanced
SCHEDULER_LOOP_INTERVAL: 0.1  # Seconds between scheduler ticks

# Resource monitoring
MONITOR_POLL_INTERVAL: 2  # Seconds between GPU stat polls

# Queue
QUEUE_MAX_SIZE: 1000  # Maximum number of queued jobs

# HTTP API
http_port: 5000  # Port for the Flask HTTP API

# Storage (auto-detected for your platform, uncomment to override)
# data_dir:
# cache_dir: