From 31945da03be7beebd924e29af0ac298c15146d24 Mon Sep 17 00:00:00 2001 From: j316chuck Date: Mon, 18 May 2026 02:15:29 +0000 Subject: [PATCH] =?UTF-8?q?[tinker]=20bump=20async=20db=20pool=20default?= =?UTF-8?q?=205+10=E2=86=9220+40=20for=20concurrent=20multi-LoRA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Concurrent multi-LoRA async clients (each holding several in-flight requests during rollouts and forward_backward) saturate the SQLAlchemy connection pool at the default (pool_size=5, max_overflow=10 = 15 total). Observed at 4 concurrent clients × mbs=32: pool exhausts → HTTP 500s → client-side 'Sampling session not found' cascading failures. This change raises defaults to pool_size=20, max_overflow=40 (60 total) and makes them env-var-tunable via SKYRL_DB_POOL_SIZE / SKYRL_DB_MAX_OVERFLOW. Empirically with these defaults, 4 concurrent async clients at mbs=32 run with zero QueuePool errors. --- skyrl/tinker/api.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/skyrl/tinker/api.py b/skyrl/tinker/api.py index c565a8c351..0c5d6f2b3c 100644 --- a/skyrl/tinker/api.py +++ b/skyrl/tinker/api.py @@ -108,7 +108,19 @@ async def lifespan(app: FastAPI): """Lifespan event handler for startup and shutdown.""" db_url = get_async_database_url(app.state.engine_config.database_url) - app.state.db_engine = create_async_engine(db_url, echo=False) + # Bump pool above SQLAlchemy defaults (5 + 10 = 15) so concurrent multi-LoRA + # async clients (each holding several in-flight requests during rollouts + + # forward_backward) don't saturate the connection pool. Observed at 4 + # concurrent async clients × mbs=32: pool exhausts → HTTP 500s → + # client-side "Sampling session not found" cascading failures. With + # pool_size=20, max_overflow=40 (60 total) we measured 4 concurrent async + # clients running cleanly with 0 QueuePool errors. + app.state.db_engine = create_async_engine( + db_url, + echo=False, + pool_size=int(os.environ.get("SKYRL_DB_POOL_SIZE", "20")), + max_overflow=int(os.environ.get("SKYRL_DB_MAX_OVERFLOW", "40")), + ) enable_sqlite_wal(app.state.db_engine.sync_engine) async with app.state.db_engine.begin() as conn: