From 192a0d6fd334e3ae38ef3e902a583755040956d2 Mon Sep 17 00:00:00 2001 From: Vasyl Vdovychenko Date: Fri, 22 May 2026 18:36:36 -0400 Subject: [PATCH] =?UTF-8?q?fix(pdf-quality):=20bump=20CLEANUP=5FTIMEOUT=20?= =?UTF-8?q?default=20900s=20=E2=86=92=201500s?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prod re-run: 16k-word chapter cleaned in 9m39s (fit 900s), but the 19.8k chapter (a tech book's biggest) timed out at exactly 900s. 1500s covers the realistic upper bound; outliers still tunable via env. Co-Authored-By: Claude Opus 4.7 (1M context) --- .env.example | 4 ++-- infra/scripts/quality-poll.sh | 8 +++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/.env.example b/.env.example index 673c5215..6f1ed77c 100644 --- a/.env.example +++ b/.env.example @@ -43,7 +43,7 @@ ADMIN_ALERT_EMAIL= # PDF content cleanup (quality-poll.sh Phase 3 — feat-0007). # When true, chapters scoring below the threshold get an LLM cleanup pass # via Claude CLI. Off by default — leaves the poller at structure-only. -# CLEANUP_TIMEOUT: per-chapter Claude budget; large chapters need 10-15 min. +# CLEANUP_TIMEOUT: per-chapter Claude budget; 20k-word chapters take ~15-20 min. CONTENT_CLEANUP_ENABLED=false CONTENT_QUALITY_THRESHOLD=60 -CLEANUP_TIMEOUT=900 +CLEANUP_TIMEOUT=1500 diff --git a/infra/scripts/quality-poll.sh b/infra/scripts/quality-poll.sh index 9af1a8a9..63bc7e04 100755 --- a/infra/scripts/quality-poll.sh +++ b/infra/scripts/quality-poll.sh @@ -16,9 +16,11 @@ API_BASE="http://localhost:8080" # get an LLM cleanup pass; output is verified by pdf-cleanup-gate.py. CONTENT_CLEANUP_ENABLED="${CONTENT_CLEANUP_ENABLED:-false}" CONTENT_QUALITY_THRESHOLD="${CONTENT_QUALITY_THRESHOLD:-60}" -# Per-chapter Claude timeout. Large chapters (15-20k words) need well over the -# old 300s — rewriting that much HTML takes 10-15 min. Env-tunable. -CLEANUP_TIMEOUT="${CLEANUP_TIMEOUT:-900}" +# Per-chapter Claude timeout. 
16k-word chapters take ~10 min; 20k+ chapters +# (which the longest tech books have) need ~15-20 min — 900s was tight and +# one timed out in prod testing. Env-tunable for the rare outliers; 1500s +# default covers the realistic upper bound. +CLEANUP_TIMEOUT="${CLEANUP_TIMEOUT:-1500}" DATASET_DIR="$REPO_DIR/data/pdf-cleanup-dataset" GATE_SCRIPT="$REPO_DIR/infra/scripts/pdf-cleanup-gate.py"