Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -43,5 +43,7 @@ ADMIN_ALERT_EMAIL=
# PDF content cleanup (quality-poll.sh Phase 3 — feat-0007).
# When true, chapters scoring below the threshold get an LLM cleanup pass
# via Claude CLI. Off by default — leaves the poller at structure-only.
# CLEANUP_TIMEOUT: per-chapter Claude budget in seconds; large chapters need 10-15 min.
CONTENT_CLEANUP_ENABLED=false
CONTENT_QUALITY_THRESHOLD=60
CLEANUP_TIMEOUT=900
10 changes: 3 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -39,13 +39,9 @@ status:
# the host dir stays root-owned and the container (uid 1000) gets EACCES.
fix-permissions:
@echo "Fixing volume permissions..."
@mkdir -p data/textstack data/tts-cache data/explain-cache data/translate-cache
@docker run --rm \
-v $$(pwd)/data/textstack:/data \
-v $$(pwd)/data/tts-cache:/tts \
-v $$(pwd)/data/explain-cache:/explain \
-v $$(pwd)/data/translate-cache:/translate \
alpine sh -c 'chown -R 1000:1000 /data /tts /explain /translate'
@docker run --rm -v $$(pwd)/data:/data alpine sh -c '\
mkdir -p /data/textstack /data/tts-cache /data/explain-cache /data/translate-cache /data/pdf-cleanup-dataset && \
chown -R 1000:1000 /data/textstack /data/tts-cache /data/explain-cache /data/translate-cache /data/pdf-cleanup-dataset'
@echo "Done."

deploy: fix-permissions
Expand Down
22 changes: 16 additions & 6 deletions infra/scripts/quality-poll.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ API_BASE="http://localhost:8080"
# get an LLM cleanup pass; output is verified by pdf-cleanup-gate.py.
CONTENT_CLEANUP_ENABLED="${CONTENT_CLEANUP_ENABLED:-false}"
CONTENT_QUALITY_THRESHOLD="${CONTENT_QUALITY_THRESHOLD:-60}"
# Per-chapter Claude timeout in seconds. Large chapters (15-20k words) need well
# over the old 300s limit — rewriting that much HTML takes 10-15 min. Env-tunable.
CLEANUP_TIMEOUT="${CLEANUP_TIMEOUT:-900}"
DATASET_DIR="$REPO_DIR/data/pdf-cleanup-dataset"
GATE_SCRIPT="$REPO_DIR/infra/scripts/pdf-cleanup-gate.py"

Expand Down Expand Up @@ -178,7 +181,13 @@ run_content_cleanup() {
return 0
fi

mkdir -p "$DATASET_DIR"
# Dataset dir lives under data/ (root-owned) — created + chowned to the
# poller's uid by `make fix-permissions`. Degrade gracefully if it cannot be
# created: cleanup still runs, only the (messy→clean) pair log is skipped.
if ! mkdir -p "$DATASET_DIR" 2>/dev/null; then
log "Phase 3: warning — $DATASET_DIR not writable, pairs will not be logged"
DATASET_DIR=""
fi
local cleaned=0 rejected=0 skipped=0

for num in $flagged; do
Expand Down Expand Up @@ -227,9 +236,9 @@ CLEANED_HTML_END
PROMPT_TAIL
} > "$tmp_prompt"

if ! timeout 300 claude -p --model claude-sonnet-4-6 --permission-mode default \
if ! timeout "$CLEANUP_TIMEOUT" claude -p --model claude-sonnet-4-6 --permission-mode default \
< "$tmp_prompt" > "$tmp_out" 2>/dev/null; then
log "Phase 3: chapter $num — Claude CLI failed, skipped"
log "Phase 3: chapter $num — Claude CLI failed/timed out (${CLEANUP_TIMEOUT}s), skipped"
skipped=$((skipped + 1))
rm -f "$tmp_orig" "$tmp_prompt" "$tmp_out" "$tmp_clean"; continue
fi
Expand All @@ -247,18 +256,19 @@ PROMPT_TAIL
local gate_verdict
gate_verdict=$(python3 "$GATE_SCRIPT" "$tmp_orig" "$tmp_clean" 2>/dev/null || echo "REJECT: gate error")

local pair_file="$DATASET_DIR/${target_id}-ch${num}-$(date +%s).json"
local pair_file=""
[ -n "$DATASET_DIR" ] && pair_file="$DATASET_DIR/${target_id}-ch${num}-$(date +%s).json"
if [[ "$gate_verdict" == ACCEPT* ]]; then
local esc_html
esc_html=$(python3 -c "import sys,json; print(json.dumps(open(sys.argv[1],encoding='utf-8').read()))" "$tmp_clean")
apply_content "$target_type" "$target_id" "$num" "$esc_html"
cleaned=$((cleaned + 1))
log "Phase 3: chapter $num cleaned"
log_pair "$pair_file" "$target_id" "$num" true "$tmp_orig" "$tmp_clean" "$gate_verdict"
[ -n "$pair_file" ] && log_pair "$pair_file" "$target_id" "$num" true "$tmp_orig" "$tmp_clean" "$gate_verdict"
else
rejected=$((rejected + 1))
log "Phase 3: chapter $num rejected — $gate_verdict"
log_pair "$pair_file" "$target_id" "$num" false "$tmp_orig" "$tmp_clean" "$gate_verdict"
[ -n "$pair_file" ] && log_pair "$pair_file" "$target_id" "$num" false "$tmp_orig" "$tmp_clean" "$gate_verdict"
fi

rm -f "$tmp_orig" "$tmp_prompt" "$tmp_out" "$tmp_clean"
Expand Down
Loading