From d9e9a98756fd7ae36f40743cd56afb33e15a43e1 Mon Sep 17 00:00:00 2001 From: Vasyl Vdovychenko Date: Fri, 22 May 2026 17:09:02 -0400 Subject: [PATCH] fix(pdf-quality): Phase 3 timeout + dataset dir permissions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit First prod test of Phase 3 surfaced two bugs: - Claude timeout was a hardcoded 300s — large chapters (15-20k words) need 10-15 min to rewrite, so every big chapter timed out and was skipped. Now CLEANUP_TIMEOUT env var, default 900s. - data/pdf-cleanup-dataset couldn't be created — data/ is root-owned and the poller runs as the deploy user. `make fix-permissions` now creates + chowns it (via the root alpine container, like the other caches); the poller degrades gracefully (skips pair-logging) if it's missing. Phase 1-2 + the Phase 3 mechanism itself verified working on the test run (chapter cleaned → gate accepted → HTML written back). Co-Authored-By: Claude Opus 4.7 (1M context) --- .env.example | 2 ++ Makefile | 10 +++------- infra/scripts/quality-poll.sh | 22 ++++++++++++++++------ 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/.env.example b/.env.example index 660df6df..673c5215 100644 --- a/.env.example +++ b/.env.example @@ -43,5 +43,7 @@ ADMIN_ALERT_EMAIL= # PDF content cleanup (quality-poll.sh Phase 3 — feat-0007). # When true, chapters scoring below the threshold get an LLM cleanup pass # via Claude CLI. Off by default — leaves the poller at structure-only. +# CLEANUP_TIMEOUT: per-chapter Claude budget in seconds; large chapters need 10-15 min. CONTENT_CLEANUP_ENABLED=false CONTENT_QUALITY_THRESHOLD=60 +CLEANUP_TIMEOUT=900 diff --git a/Makefile b/Makefile index 0be81b42..f6efe83f 100644 --- a/Makefile +++ b/Makefile @@ -39,13 +39,9 @@ status: # the host dir stays root-owned and the container (uid 1000) gets EACCES. fix-permissions: @echo "Fixing volume permissions..." 
- @mkdir -p data/textstack data/tts-cache data/explain-cache data/translate-cache - @docker run --rm \ - -v $$(pwd)/data/textstack:/data \ - -v $$(pwd)/data/tts-cache:/tts \ - -v $$(pwd)/data/explain-cache:/explain \ - -v $$(pwd)/data/translate-cache:/translate \ - alpine sh -c 'chown -R 1000:1000 /data /tts /explain /translate' + @docker run --rm -v $$(pwd)/data:/data alpine sh -c '\ + mkdir -p /data/textstack /data/tts-cache /data/explain-cache /data/translate-cache /data/pdf-cleanup-dataset && \ + chown -R 1000:1000 /data/textstack /data/tts-cache /data/explain-cache /data/translate-cache /data/pdf-cleanup-dataset' @echo "Done." deploy: fix-permissions diff --git a/infra/scripts/quality-poll.sh b/infra/scripts/quality-poll.sh index e411bebc..9af1a8a9 100755 --- a/infra/scripts/quality-poll.sh +++ b/infra/scripts/quality-poll.sh @@ -16,6 +16,9 @@ API_BASE="http://localhost:8080" # get an LLM cleanup pass; output is verified by pdf-cleanup-gate.py. CONTENT_CLEANUP_ENABLED="${CONTENT_CLEANUP_ENABLED:-false}" CONTENT_QUALITY_THRESHOLD="${CONTENT_QUALITY_THRESHOLD:-60}" +# Per-chapter Claude timeout, in seconds. Large chapters (15-20k words) need well over the +# old 300s — rewriting that much HTML takes 10-15 min. Env-tunable. +CLEANUP_TIMEOUT="${CLEANUP_TIMEOUT:-900}" DATASET_DIR="$REPO_DIR/data/pdf-cleanup-dataset" GATE_SCRIPT="$REPO_DIR/infra/scripts/pdf-cleanup-gate.py" @@ -178,7 +181,13 @@ run_content_cleanup() { return 0 fi - mkdir -p "$DATASET_DIR" + # Dataset dir lives under data/ (root-owned) — created + chowned to the + # poller's uid by `make fix-permissions`. Degrade gracefully if it is missing or not writable + # (mkdir -p succeeds on an existing dir, hence the -w test): cleanup still runs, only the (messy→clean) pair log is skipped. + if ! mkdir -p "$DATASET_DIR" 2>/dev/null || [[ ! -w "$DATASET_DIR" ]]; then + log "Phase 3: warning — $DATASET_DIR not writable, pairs will not be logged" + DATASET_DIR="" + fi local cleaned=0 rejected=0 skipped=0 for num in $flagged; do @@ -227,9 +236,9 @@ CLEANED_HTML_END PROMPT_TAIL } > "$tmp_prompt" - if ! 
timeout 300 claude -p --model claude-sonnet-4-6 --permission-mode default \ + if ! timeout "$CLEANUP_TIMEOUT" claude -p --model claude-sonnet-4-6 --permission-mode default \ < "$tmp_prompt" > "$tmp_out" 2>/dev/null; then - log "Phase 3: chapter $num — Claude CLI failed, skipped" + log "Phase 3: chapter $num — Claude CLI failed/timed out (${CLEANUP_TIMEOUT}s), skipped" skipped=$((skipped + 1)) rm -f "$tmp_orig" "$tmp_prompt" "$tmp_out" "$tmp_clean"; continue fi @@ -247,18 +256,19 @@ PROMPT_TAIL local gate_verdict gate_verdict=$(python3 "$GATE_SCRIPT" "$tmp_orig" "$tmp_clean" 2>/dev/null || echo "REJECT: gate error") - local pair_file="$DATASET_DIR/${target_id}-ch${num}-$(date +%s).json" + local pair_file="" + [ -n "$DATASET_DIR" ] && pair_file="$DATASET_DIR/${target_id}-ch${num}-$(date +%s).json" if [[ "$gate_verdict" == ACCEPT* ]]; then local esc_html esc_html=$(python3 -c "import sys,json; print(json.dumps(open(sys.argv[1],encoding='utf-8').read()))" "$tmp_clean") apply_content "$target_type" "$target_id" "$num" "$esc_html" cleaned=$((cleaned + 1)) log "Phase 3: chapter $num cleaned" - log_pair "$pair_file" "$target_id" "$num" true "$tmp_orig" "$tmp_clean" "$gate_verdict" + [ -n "$pair_file" ] && log_pair "$pair_file" "$target_id" "$num" true "$tmp_orig" "$tmp_clean" "$gate_verdict" else rejected=$((rejected + 1)) log "Phase 3: chapter $num rejected — $gate_verdict" - log_pair "$pair_file" "$target_id" "$num" false "$tmp_orig" "$tmp_clean" "$gate_verdict" + [ -n "$pair_file" ] && log_pair "$pair_file" "$target_id" "$num" false "$tmp_orig" "$tmp_clean" "$gate_verdict" fi rm -f "$tmp_orig" "$tmp_prompt" "$tmp_out" "$tmp_clean"