From c909dc0166a326a107212390c623d63564410b52 Mon Sep 17 00:00:00 2001
From: Paul Pavlidis <paul@msl.ubc.ca>
Date: Mon, 25 May 2026 15:23:51 -0700
Subject: [PATCH] =?UTF-8?q?Add=20analysis/replicate.sh=20=E2=80=94=20singl?=
 =?UTF-8?q?e-command=20driver=20for=20the=20primary=20analysis?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Mirrors the revisions/replicate.sh layout (phases + DRY_RUN + PHASES filter)
but drives the original-submission pipeline under analysis/strains/ (00–04)
and analysis/cell_lines/ (00–07), via the shared analysis/_downloads/
download. OPENAI_API_KEY is honoured if already set; otherwise it is
resolved from the macOS Keychain, with ~/openai/access_key.txt as the last
fallback (inst/gpt.py already supports the file path). The 02 / 05 R
scripts that issue OpenAI Batch jobs already block on completion via
R/run_batches.R, so the bash driver does not need a poll loop of its own.

Smoke-tested under DRY_RUN=1.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 analysis/replicate.sh | 166 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 166 insertions(+)
 create mode 100755 analysis/replicate.sh

diff --git a/analysis/replicate.sh b/analysis/replicate.sh
new file mode 100755
index 0000000..452dfae
--- /dev/null
+++ b/analysis/replicate.sh
@@ -0,0 +1,166 @@
+#!/usr/bin/env bash
+# replicate.sh — single-command end-to-end replication of the *primary*
+# analysis (the one reported in the original submission). This is kept
+# separate from revisions/replicate.sh, which drives the revision-round
+# pipeline. Run from the repository root.
+#
+# Usage:
+#   ./analysis/replicate.sh                       # run everything
+#   DRY_RUN=1 ./analysis/replicate.sh             # print commands without running
+#   PHASES="downloads,strains" ./analysis/replicate.sh   # run a subset
+#
+# Phases (in execution order):
+#   keys        Verify the OpenAI API key is reachable (pre-set env var →
+#               Keychain → ~/openai/access_key.txt fallback used by inst/gpt.py)
+#   downloads   Shared ontology / jar downloads required by both tasks
+#               (analysis/_downloads/download.R)
+#   strains     Strain-task pipeline:
+#                 00 download Gemma annotations + build strain dictionary
+#                 01 regex baseline
+#                 02 GPT-4o annotation (OpenAI Batch — can take hours)
+#                 03 evaluate predictions vs curator
+#                 04 post-curation cleanup
+#   cells       Cell-line pipeline:
+#                 00 download Gemma annotations + embed cell-line terms
+#                 01 process cell-line dictionary
+#                 02 GPT-4o first-pass annotation (OpenAI Batch)
+#                 03 compare extractions to vector index
+#                 04 build RAG inputs
+#                 05 GPT-4o RAG resolution pass (OpenAI Batch)
+#                 06 evaluate predictions vs curator
+#                 07 post-curator evaluation
+#
+# Scripts 02 / 05 submit OpenAI Batch jobs. They block internally on
+# batch completion via R/run_batches.R, so the bash driver does not
+# need a poll loop of its own — the underlying R loop already handles
+# resume-on-restart. Each batch can take up to 24 hours; typical
+# turnaround is much faster.
+
+set -euo pipefail  # abort on command failure, unset variable, or failed pipeline stage
+shopt -s expand_aliases  # NOTE(review): no alias is defined in this script — confirm this is needed
+
+# ---------------------------------------------------------------------------
+# Locations
+# ---------------------------------------------------------------------------
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")"/.. && pwd)"  # parent of this script's directory
+cd "$REPO_ROOT"  # everything below runs with the repository root as working directory
+ANA="$REPO_ROOT/analysis"
+RSCRIPT="${RSCRIPT:-Rscript}"  # R launcher; override through the RSCRIPT env var
+
+PHASES="${PHASES:-keys,downloads,strains,cells}"  # comma-separated phases to run (default: all)
+
+# ---------------------------------------------------------------------------
+# run CMD: log CMD, then eval it (DRY_RUN=1: log only). CMD is re-parsed by eval.
+# ---------------------------------------------------------------------------
+run() {
+    if [[ "${DRY_RUN:-0}" == "1" ]]; then
+        printf "  [dry-run] %s\n" "$*"
+        return
+    fi
+    printf "  [run]     %s\n" "$*"
+    eval "$@"
+}
+
+# phase NAME CMD [ARG...]: run CMD when NAME is listed in the comma-separated
+# PHASES; whitespace in PHASES is ignored so "downloads, strains" also works.
+phase() {
+    local name="$1" bar="============================================================"
+    shift
+    if [[ ",${PHASES//[[:space:]]/}," != *",$name,"* ]]; then
+        printf "(skipping phase: %s)\n" "$name"
+        return
+    fi
+    printf '\n%s\nPHASE: %s\n%s\n' "$bar" "$name" "$bar"
+    "$@"
+}
+
+# ---------------------------------------------------------------------------
+# keychain_export VAR SERVICE...: export VAR from the first macOS Keychain
+# generic-password SERVICE holding a non-empty secret. A pre-set VAR wins and
+# empty SERVICE names are skipped. Returns 0 if VAR is set on exit, else 1.
+# ---------------------------------------------------------------------------
+keychain_export() {
+    local var="$1" entry val=""
+    shift
+    [[ -z "${!var:-}" ]] || return 0   # honour a manual override
+    for entry in "$@"; do
+        [[ -n "$entry" ]] || continue
+        # Inside an `if` condition, so a failing or absent `security` cannot trip set -e.
+        if val=$(security find-generic-password -s "$entry" -w 2>/dev/null) &&
+                [[ -n "$val" ]]; then
+            export "$var=$val"
+            return 0
+        fi
+    done
+    return 1
+}
+
+# ---------------------------------------------------------------------------
+# keys: ensure an OpenAI key is reachable. Order: a pre-set OPENAI_API_KEY,
+# then the macOS Keychain (exported into OPENAI_API_KEY), then a non-empty
+# ~/openai/access_key.txt, which inst/gpt.py reads itself. Exits 1 if none.
+# ---------------------------------------------------------------------------
+verify_keys() {
+    if [[ "${DRY_RUN:-0}" == "1" ]]; then
+        printf "  [dry-run] resolve OPENAI_API_KEY (Keychain / env / ~/openai/access_key.txt)\n"
+        return
+    fi
+    if keychain_export OPENAI_API_KEY "${OPENAI_KEYCHAIN_ENTRY:-}" \
+            "OPENAI_API_KEY" "openai" "OpenAI" "openai-api-key"; then
+        echo "  ok: OPENAI_API_KEY resolved"
+        return
+    fi
+    # Require a non-empty file: an empty one would only fail later, at the first API call.
+    if [[ -f "$HOME/openai/access_key.txt" && -s "$HOME/openai/access_key.txt" ]]; then
+        echo "  ok: ~/openai/access_key.txt present (inst/gpt.py will read it)"
+        return
+    fi
+    echo "ERROR: no OpenAI key found. Set OPENAI_API_KEY, add an entry to" >&2
+    echo "       the macOS Keychain (override service name via" >&2
+    echo "       OPENAI_KEYCHAIN_ENTRY=<name>), or place the key at" >&2
+    echo "       ~/openai/access_key.txt." >&2
+    exit 1
+}
+
+# ---------------------------------------------------------------------------
+# downloads: shared ontology + tool downloads (robot.jar, CLO, EFO, TGEMO)
+# ---------------------------------------------------------------------------
+run_downloads() {
+    run "$RSCRIPT $(printf '%q' "$ANA/_downloads/download.R")"  # %q: run() evals this string
+}
+
+# ---------------------------------------------------------------------------
+# strains: strain-task pipeline (sequential, scripts 00 through 04)
+# ---------------------------------------------------------------------------
+run_strains() {
+    local dir step
+    printf -v dir '%q' "$ANA/strains"  # run() evals its argument: keep spaces in the path intact
+    for step in 00.downloads 01.ask_regex 02.ask_gpt 03.evaluate 04.post_curation; do
+        run "$RSCRIPT $dir/${step}.R"
+    done
+}
+
+# ---------------------------------------------------------------------------
+# cells: cell-line pipeline (sequential, scripts 00 through 07)
+# ---------------------------------------------------------------------------
+run_cells() {
+    local dir step
+    # run() evals its argument, so shell-quote the directory once up front;
+    # otherwise a repository path containing spaces would be split into words.
+    printf -v dir '%q' "$ANA/cell_lines"
+    for step in 00.downloads 01.process_cell_lines 02.ask_gpt 03.compare_to_vect \
+                04.rag_inputs 05.rag_gpt 06.evaluate 07.post_curator_evaluation; do
+        run "$RSCRIPT $dir/${step}.R"
+    done
+}
+
+# ---------------------------------------------------------------------------
+# Run the requested phases
+# ---------------------------------------------------------------------------
+# Each entry is NAME:FUNCTION, listed in execution order; phase() decides from
+# PHASES whether FUNCTION runs or is reported as skipped.
+for spec in keys:verify_keys downloads:run_downloads strains:run_strains cells:run_cells; do
+    phase "${spec%%:*}" "${spec#*:}"
+done
+
+printf '\n%s\n' "All requested phases finished."