diff --git a/.env.example b/.env.example index 673c5215..6f1ed77c 100644 --- a/.env.example +++ b/.env.example @@ -43,7 +43,7 @@ ADMIN_ALERT_EMAIL= # PDF content cleanup (quality-poll.sh Phase 3 — feat-0007). # When true, chapters scoring below the threshold get an LLM cleanup pass # via Claude CLI. Off by default — leaves the poller at structure-only. -# CLEANUP_TIMEOUT: per-chapter Claude budget; large chapters need 10-15 min. +# CLEANUP_TIMEOUT: per-chapter Claude budget; 20k-word chapters take ~15-20 min. CONTENT_CLEANUP_ENABLED=false CONTENT_QUALITY_THRESHOLD=60 -CLEANUP_TIMEOUT=900 +CLEANUP_TIMEOUT=1500 diff --git a/infra/scripts/quality-poll.sh b/infra/scripts/quality-poll.sh index 9af1a8a9..63bc7e04 100755 --- a/infra/scripts/quality-poll.sh +++ b/infra/scripts/quality-poll.sh @@ -16,9 +16,11 @@ API_BASE="http://localhost:8080" # get an LLM cleanup pass; output is verified by pdf-cleanup-gate.py. CONTENT_CLEANUP_ENABLED="${CONTENT_CLEANUP_ENABLED:-false}" CONTENT_QUALITY_THRESHOLD="${CONTENT_QUALITY_THRESHOLD:-60}" -# Per-chapter Claude timeout. Large chapters (15-20k words) need well over the -# old 300s — rewriting that much HTML takes 10-15 min. Env-tunable. -CLEANUP_TIMEOUT="${CLEANUP_TIMEOUT:-900}" +# Per-chapter Claude timeout. 16k-word chapters take ~10 min; 20k+ chapters +# (the longest tech books have) need ~15-20 min — 900s was tight, dropped one +# in prod testing. Env-tunable for the rare outliers; 1500s default covers +# the realistic upper bound. +CLEANUP_TIMEOUT="${CLEANUP_TIMEOUT:-1500}" DATASET_DIR="$REPO_DIR/data/pdf-cleanup-dataset" GATE_SCRIPT="$REPO_DIR/infra/scripts/pdf-cleanup-gate.py"