Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions docs/prds/ci-auto-retry.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# PRD: CI Auto Retry

## Context

Night Watch executor PRs can be marked ready for review after implementation finishes. If CI later fails, the PR should not have to wait for a reviewer or other human intervention before the executor resumes work on a fix.

## Goal

Automatically resume an existing executor PR when it is marked ready for review and CI has failed.

## Non-Goals

- Do not retry PRs whose CI is pending, passing, or unknown (skipped checks or no check data).
- Do not change reviewer, QA, resolver, or merger ownership.
- Do not start a new PRD while higher-priority resumable executor work is still waiting to be picked up.

## Contract

- The executor PR selector treats `nw:resumable` PRs as the highest-priority resume candidates.
- If no `nw:resumable` PR is eligible, the selector may instead select a non-draft executor PR labeled `nw:ready-review`, provided at least one of its CI checks has concluded with a failure.
- Ready-review PRs with pending, passing, skipped, or absent check data are ignored.
- PRs labeled `ready-to-merge` remain excluded from executor resume selection.

## Phases

### Phase 1: Selection Contract

- Add tests for failed-CI ready-review selection.
- Add tests proving pending, passing, and unknown CI are ignored.
- Add tests preserving `nw:resumable` priority over failed-CI ready-review PRs.

### Phase 2: Executor Implementation

- Extend executor resume discovery to request CI rollup data from GitHub.
- Detect failed CI from structured status check conclusions.
- Return the selected PR through the existing resume path so the executor reuses the current branch and PR.

### Phase 3: Verification

- Run focused helper/script smoke tests.
- Manually verify a ready-review executor PR with failed CI is resumed before new PRD pickup.
120 changes: 116 additions & 4 deletions packages/cli/src/__tests__/scripts/night-watch-helpers.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -124,15 +124,127 @@ exit 0
{ mode: 0o755 },
);

const result = runShell(`source "${helpersScript}"; find_executor_resume_pr "night-watch"`, tempDir, {
...process.env,
PATH: `${fakeBinDir}:${process.env.PATH}`,
});
const result = runShell(
`source "${helpersScript}"; find_executor_resume_pr "night-watch"`,
tempDir,
{
...process.env,
PATH: `${fakeBinDir}:${process.env.PATH}`,
},
);

expect(result.status).toBe(0);
expect(result.stdout.trim()).toBe('');
});

// Contract under test: when `gh pr list` reports a non-draft PR labeled
// `nw:ready-review` whose check data contains a FAILURE conclusion,
// find_executor_resume_pr prints that PR as JSON, annotated with
// `nightWatchResumeReason` and `failedCheckSummary`.
it('find_executor_resume_pr selects ready-review PRs with failed CI', () => {
const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'night-watch-helpers-resume-failed-ci-'));
const fakeBinDir = path.join(tempDir, 'bin');
fs.mkdirSync(fakeBinDir, { recursive: true });

// Stub `gh` executable: answers `gh pr list` with a canned single-PR payload
// and exits 0 for every other invocation.
// NOTE(review): the failing check is nested under a "contexts" array here,
// while the sibling tests in this file use a flat statusCheckRollup list —
// confirm the nested shape is intentional.
fs.writeFileSync(
path.join(fakeBinDir, 'gh'),
`#!/usr/bin/env bash
if [[ "$1" == "pr" && "$2" == "list" ]]; then
cat <<'EOF'
[{"number":44,"headRefName":"night-watch/failed-ci","url":"https://example.test/pull/44","title":"Failed CI","isDraft":false,"createdAt":"2026-04-20T12:00:00Z","labels":[{"name":"nw:ready-review"}],"statusCheckRollup":[{"contexts":[{"name":"test","status":"COMPLETED","conclusion":"FAILURE"}]}]}]
EOF
exit 0
fi
exit 0
`,
{ mode: 0o755 },
);

// The stub directory is placed first on PATH so that the sourced helper
// picks up the fake `gh` instead of a real one.
const result = runShell(
`source "${helpersScript}"; find_executor_resume_pr "night-watch"`,
tempDir,
{
...process.env,
PATH: `${fakeBinDir}:${process.env.PATH}`,
},
);

expect(result.status).toBe(0);
// stdout must be parseable JSON describing the selected PR.
const selectedPr = JSON.parse(result.stdout);
expect(selectedPr.number).toBe(44);
expect(selectedPr.nightWatchResumeReason).toBe('failed_ci');
// The summary is expected to mention the failing check by name.
expect(selectedPr.failedCheckSummary).toContain('test');
});

// Contract under test: find_executor_resume_pr prints nothing (and still
// exits 0) when no listed PR qualifies for a failed-CI resume.
it('find_executor_resume_pr ignores ready-review PRs with pending, passing, or unknown CI', () => {
const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'night-watch-helpers-resume-ci-state-'));
const fakeBinDir = path.join(tempDir, 'bin');
fs.mkdirSync(fakeBinDir, { recursive: true });

// Stub `gh` executable returning five `nw:ready-review` PRs, each of which
// must be rejected for a different reason:
//   #45 — check still IN_PROGRESS with a null conclusion (pending)
//   #46 — check COMPLETED with SUCCESS (passing)
//   #47 — empty statusCheckRollup (no check data)
//   #48 — failed check, but also labeled `ready-to-merge`
//   #49 — failed check, but the PR is a draft
fs.writeFileSync(
path.join(fakeBinDir, 'gh'),
`#!/usr/bin/env bash
if [[ "$1" == "pr" && "$2" == "list" ]]; then
cat <<'EOF'
[
{"number":45,"headRefName":"night-watch/pending-ci","url":"https://example.test/pull/45","title":"Pending CI","isDraft":false,"createdAt":"2026-04-20T12:00:00Z","labels":[{"name":"nw:ready-review"}],"statusCheckRollup":[{"name":"test","status":"IN_PROGRESS","conclusion":null}]},
{"number":46,"headRefName":"night-watch/passing-ci","url":"https://example.test/pull/46","title":"Passing CI","isDraft":false,"createdAt":"2026-04-20T12:01:00Z","labels":[{"name":"nw:ready-review"}],"statusCheckRollup":[{"name":"test","status":"COMPLETED","conclusion":"SUCCESS"}]},
{"number":47,"headRefName":"night-watch/unknown-ci","url":"https://example.test/pull/47","title":"Unknown CI","isDraft":false,"createdAt":"2026-04-20T12:02:00Z","labels":[{"name":"nw:ready-review"}],"statusCheckRollup":[]},
{"number":48,"headRefName":"night-watch/ready-to-merge","url":"https://example.test/pull/48","title":"Ready to merge","isDraft":false,"createdAt":"2026-04-20T12:03:00Z","labels":[{"name":"nw:ready-review"},{"name":"ready-to-merge"}],"statusCheckRollup":[{"name":"test","status":"COMPLETED","conclusion":"FAILURE"}]},
{"number":49,"headRefName":"night-watch/draft-failed-ci","url":"https://example.test/pull/49","title":"Draft failed CI","isDraft":true,"createdAt":"2026-04-20T12:04:00Z","labels":[{"name":"nw:ready-review"}],"statusCheckRollup":[{"name":"test","status":"COMPLETED","conclusion":"FAILURE"}]}
]
EOF
exit 0
fi
exit 0
`,
{ mode: 0o755 },
);

// The stub directory is placed first on PATH so that the sourced helper
// picks up the fake `gh` instead of a real one.
const result = runShell(
`source "${helpersScript}"; find_executor_resume_pr "night-watch"`,
tempDir,
{
...process.env,
PATH: `${fakeBinDir}:${process.env.PATH}`,
},
);

// "No candidate" is signalled by empty stdout, not by a non-zero exit code.
expect(result.status).toBe(0);
expect(result.stdout.trim()).toBe('');
});

// Contract under test: a PR labeled `nw:resumable` wins over a failed-CI
// `nw:ready-review` PR, even when the resumable PR is newer, is a draft, and
// has no check data.
it('find_executor_resume_pr keeps labeled resumable PRs ahead of failed-CI ready-review PRs', () => {
const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'night-watch-helpers-resume-priority-'));
const fakeBinDir = path.join(tempDir, 'bin');
fs.mkdirSync(fakeBinDir, { recursive: true });

// Stub `gh` executable returning two competing candidates:
//   #48 — older, non-draft, `nw:ready-review`, check concluded FAILURE
//   #49 — newer, draft, `nw:resumable`, empty statusCheckRollup
fs.writeFileSync(
path.join(fakeBinDir, 'gh'),
`#!/usr/bin/env bash
if [[ "$1" == "pr" && "$2" == "list" ]]; then
cat <<'EOF'
[
{"number":48,"headRefName":"night-watch/older-failed-ci","url":"https://example.test/pull/48","title":"Older failed CI","isDraft":false,"createdAt":"2026-04-20T12:00:00Z","labels":[{"name":"nw:ready-review"}],"statusCheckRollup":[{"name":"test","status":"COMPLETED","conclusion":"FAILURE"}]},
{"number":49,"headRefName":"night-watch/newer-resumable","url":"https://example.test/pull/49","title":"Newer resumable","isDraft":true,"createdAt":"2026-04-20T12:05:00Z","labels":[{"name":"nw:resumable"}],"statusCheckRollup":[]}
]
EOF
exit 0
fi
exit 0
`,
{ mode: 0o755 },
);

// The stub directory is placed first on PATH so that the sourced helper
// picks up the fake `gh` instead of a real one.
const result = runShell(
`source "${helpersScript}"; find_executor_resume_pr "night-watch"`,
tempDir,
{
...process.env,
PATH: `${fakeBinDir}:${process.env.PATH}`,
},
);

expect(result.status).toBe(0);
// The label-based candidate (#49) must be chosen, not the failed-CI one (#48).
expect(JSON.parse(result.stdout).number).toBe(49);
});

it('send_missing_fallback_configuration_warning includes configuration guidance', () => {
const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'night-watch-helpers-telegram-'));
const curlBin = path.join(tempDir, 'curl');
Expand Down
127 changes: 120 additions & 7 deletions scripts/night-watch-cron.sh
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,11 @@ EXECUTOR_PR_URL=""
EXECUTOR_PR_DRAFT=""
RESUME_FROM_EXISTING_PR=0
RESUME_BRANCH_NAME=""
RESUME_REASON=""
RESUME_FAILED_CI=0
RESUME_FAILED_CI_SUMMARY=""
RESUME_WITHOUT_PRD_FILE=0
SKIP_PRD_CLAIM=0

restore_issue_to_ready() {
local reason="${1:-Execution failed before implementation started.}"
Expand All @@ -171,14 +176,26 @@ if [ -z "${NW_TARGET_ISSUE:-}" ]; then
EXECUTOR_PR_NUMBER=$(printf '%s' "${EXECUTOR_PR_JSON}" | jq -r '.number // empty' 2>/dev/null || true)
EXECUTOR_PR_URL=$(printf '%s' "${EXECUTOR_PR_JSON}" | jq -r '.url // empty' 2>/dev/null || true)
EXECUTOR_PR_DRAFT=$(printf '%s' "${EXECUTOR_PR_JSON}" | jq -r '.isDraft // false' 2>/dev/null || true)
RESUME_REASON=$(printf '%s' "${EXECUTOR_PR_JSON}" | jq -r '.nightWatchResumeReason // "resumable"' 2>/dev/null || true)
RESUME_FAILED_CI_SUMMARY=$(printf '%s' "${EXECUTOR_PR_JSON}" | jq -r '.failedCheckSummary // empty' 2>/dev/null || true)
if [ "${RESUME_REASON}" = "failed_ci" ]; then
RESUME_FAILED_CI=1
fi
if [ -n "${RESUME_BRANCH_NAME}" ]; then
log "RESUME: Prioritizing resumable PR #${EXECUTOR_PR_NUMBER:-unknown} on ${RESUME_BRANCH_NAME}"
if [ "${RESUME_FAILED_CI}" = "1" ]; then
log "RESUME: Prioritizing failed-CI ready-review PR #${EXECUTOR_PR_NUMBER:-unknown} on ${RESUME_BRANCH_NAME}"
else
log "RESUME: Prioritizing resumable PR #${EXECUTOR_PR_NUMBER:-unknown} on ${RESUME_BRANCH_NAME}"
fi
else
RESUME_FROM_EXISTING_PR=0
EXECUTOR_PR_JSON=""
EXECUTOR_PR_NUMBER=""
EXECUTOR_PR_URL=""
EXECUTOR_PR_DRAFT=""
RESUME_REASON=""
RESUME_FAILED_CI=0
RESUME_FAILED_CI_SUMMARY=""
fi
fi
fi
Expand Down Expand Up @@ -262,7 +279,16 @@ if [ -z "${ISSUE_NUMBER}" ]; then
RESUME_PRD_NAME="${RESUME_BRANCH_NAME#*/}"
if [ -f "${PRD_DIR}/${RESUME_PRD_NAME}.md" ]; then
ELIGIBLE_PRD="${RESUME_PRD_NAME}.md"
log "RESUME: Using resumable filesystem PRD ${ELIGIBLE_PRD}"
if [ "${RESUME_FAILED_CI}" = "1" ]; then
log "RESUME: Using failed-CI filesystem PRD ${ELIGIBLE_PRD}"
else
log "RESUME: Using resumable filesystem PRD ${ELIGIBLE_PRD}"
fi
elif [ "${RESUME_FAILED_CI}" = "1" ]; then
ELIGIBLE_PRD="${RESUME_PRD_NAME}.md"
RESUME_WITHOUT_PRD_FILE=1
SKIP_PRD_CLAIM=1
log "RESUME: Failed-CI PR #${EXECUTOR_PR_NUMBER:-unknown} has no active PRD file in ${PRD_DIR}; resuming branch ${RESUME_BRANCH_NAME} for CI repair"
else
log "WARN: Resumable PR branch ${RESUME_BRANCH_NAME} has no matching PRD file in ${PRD_DIR}; falling back to normal selection"
RESUME_FROM_EXISTING_PR=0
Expand All @@ -271,6 +297,9 @@ if [ -z "${ISSUE_NUMBER}" ]; then
EXECUTOR_PR_URL=""
EXECUTOR_PR_DRAFT=""
RESUME_BRANCH_NAME=""
RESUME_REASON=""
RESUME_FAILED_CI=0
RESUME_FAILED_CI_SUMMARY=""
fi
fi
# Filesystem mode: scan PRD directory
Expand All @@ -283,8 +312,10 @@ if [ -z "${ISSUE_NUMBER}" ]; then
exit 0
fi
# Claim the PRD to prevent other runs from selecting it
claim_prd "${PRD_DIR}" "${ELIGIBLE_PRD}"
append_exit_trap "release_claim '${PRD_DIR}' '${ELIGIBLE_PRD}'"
if [ "${SKIP_PRD_CLAIM}" != "1" ]; then
claim_prd "${PRD_DIR}" "${ELIGIBLE_PRD}"
append_exit_trap "release_claim '${PRD_DIR}' '${ELIGIBLE_PRD}'"
fi
fi

PRD_NAME="${ELIGIBLE_PRD%.md}"
Expand Down Expand Up @@ -771,7 +802,42 @@ read_audit_triage_result() {
| sed -E 's/^[[:space:]]+//; s/[[:space:]]+$//'
}

if is_tiny_audit_board_issue; then
if [ "${RESUME_FAILED_CI}" = "1" ]; then
PROMPT_PR_CONTEXT="PR #${EXECUTOR_PR_NUMBER:-unknown}"
if [ -n "${EXECUTOR_PR_URL}" ]; then
PROMPT_PR_CONTEXT="${PROMPT_PR_CONTEXT} (${EXECUTOR_PR_URL})"
fi
PROMPT_ISSUE_CONTEXT="current PR"
if [ -n "${ISSUE_NUMBER}" ]; then
PROMPT_ISSUE_CONTEXT="current PR and issue #${ISSUE_NUMBER}"
fi
PROMPT_FAILED_CHECKS="${RESUME_FAILED_CI_SUMMARY:-GitHub reported explicit failing status checks for this PR.}"
PROMPT="Fix the failing CI for existing executor ${PROMPT_PR_CONTEXT}: ${ISSUE_TITLE_RAW:-${PRD_NAME}}

Known failing checks:
${PROMPT_FAILED_CHECKS}

## Setup
- You are already inside an isolated worktree at: ${WORKTREE_DIR}
- Current branch is already checked out: ${BRANCH_NAME}
- Do NOT run git checkout/switch in ${PROJECT_DIR}
- Do NOT create or remove worktrees; the cron script manages that
- Install dependencies if needed and work in the current worktree only

## CI Repair Workflow
1. Inspect the GitHub CI failure details for ${PROMPT_PR_CONTEXT}; use \`gh pr checks\`, \`gh run view --log\`, or the linked run logs as needed.
2. Reproduce the failing command locally when practical.
3. Fix only the failures that belong to the ${PROMPT_ISSUE_CONTEXT}. Do NOT start unrelated PRDs, issues, refactors, or new features.
4. Run the focused verification needed to prove the CI failure is fixed.
5. Commit and push the fix to the existing branch:
${PROGRESS_PUSH_CMD}

## PR Lifecycle
- The controller owns PR lifecycle and labels for this branch
- Do NOT create another PR and do NOT edit PR labels
- STOP immediately after the final push
- Do NOT process any other issues or PRDs — only ${PROMPT_ISSUE_CONTEXT}"
elif is_tiny_audit_board_issue; then
log "PROMPT: Using lean audit triage workflow for issue #${ISSUE_NUMBER}"
PROMPT="$(build_audit_triage_prompt)"
elif [ -n "${ISSUE_NUMBER}" ]; then
Expand Down Expand Up @@ -866,6 +932,9 @@ if [ "${MERGED_PR_COUNT}" -gt 0 ]; then
"${NW_CLI}" board move-issue "${ISSUE_NUMBER}" --column "Done" 2>>"${LOG_FILE}" || true
emit_result "success_already_merged" "prd=${ELIGIBLE_PRD}|branch=${BRANCH_NAME}"
exit 0
elif [ "${RESUME_FAILED_CI}" = "1" ]; then
emit_result "success_already_merged" "prd=${ELIGIBLE_PRD}|branch=${BRANCH_NAME}"
exit 0
elif finalize_prd_done "already merged on ${BRANCH_NAME}"; then
emit_result "success_already_merged" "prd=${ELIGIBLE_PRD}|branch=${BRANCH_NAME}"
exit 0
Expand Down Expand Up @@ -991,7 +1060,34 @@ while [ "${ATTEMPT}" -lt "${MAX_RETRIES}" ]; do
RESUME_PROGRESS_NOTE="No checkpoint was created because there were no local changes or branch commits."
fi
# Switch prompt to "continue" mode for the next attempt (fresh context)
if [ -n "${ISSUE_NUMBER}" ]; then
if [ "${RESUME_FAILED_CI}" = "1" ]; then
PROMPT="Continue fixing failing CI for existing executor PR #${EXECUTOR_PR_NUMBER:-unknown}.

The previous session ran out of context window. Progress has been committed on branch ${BRANCH_NAME}.

Known failing checks from the original retry:
${RESUME_FAILED_CI_SUMMARY:-GitHub reported explicit failing status checks for this PR.}

## Your task
1. Review the current state: check git log, local changes, and pushed commits on ${BRANCH_NAME}
2. Inspect the GitHub CI failure logs with \`gh pr checks\`, \`gh run view --log\`, or linked run logs as needed
3. Continue fixing only the CI failures that belong to this current PR${ISSUE_NUMBER:+ and issue #${ISSUE_NUMBER}}
4. Do NOT start unrelated PRDs, issues, refactors, or new features
5. Run focused verification, commit, and push the fix:
${PROGRESS_PUSH_CMD}

## Setup
- You are already inside an isolated worktree at: ${WORKTREE_DIR}
- Current branch is already checked out: ${BRANCH_NAME}
- Do NOT run git checkout/switch in ${PROJECT_DIR}
- Do NOT create or remove worktrees; the cron script manages that

## PR Lifecycle
- The controller owns PR lifecycle and labels for this branch
- Do NOT create another PR and do NOT edit PR labels
- STOP immediately after the final push
- Do NOT process any other issues or PRDs"
elif [ -n "${ISSUE_NUMBER}" ]; then
PROMPT="Continue implementing PRD (GitHub issue #${ISSUE_NUMBER}: ${ISSUE_TITLE_RAW}).

The previous session ran out of context window. ${RESUME_PROGRESS_NOTE}
Expand Down Expand Up @@ -1251,7 +1347,16 @@ if [ ${EXIT_CODE} -eq 0 ]; then
if [ "${OPEN_PR_COUNT}" -gt 0 ]; then
refresh_executor_pr_metadata
mark_executor_pr_ready_for_review || true
if [ -n "${ISSUE_NUMBER}" ]; then
if [ -n "${ISSUE_NUMBER}" ] && [ "${RESUME_FAILED_CI}" = "1" ]; then
PR_URL=$(gh pr list --state open --json headRefName,url \
--jq ".[] | select(.headRefName == \"${BRANCH_NAME}\") | .url" 2>/dev/null || true)
if [ -n "${PR_URL}" ]; then
"${NW_CLI}" board comment "${ISSUE_NUMBER}" --body "CI repair pushed: ${PR_URL} (via ${EFFECTIVE_PROVIDER_LABEL})" 2>>"${LOG_FILE}" || true
gh pr comment "${PR_URL}" --body "> 🤖 CI fix by ${EFFECTIVE_PROVIDER_LABEL}" 2>>"${LOG_FILE}" || true
fi
log "SUCCESS: Failed CI repaired and PR returned to ready review — ${PR_URL}"
emit_result "success_open_pr" "prd=${ELIGIBLE_PRD}|branch=${BRANCH_NAME}${PR_URL:+|pr_url=${PR_URL}}|reason=ci_repaired"
elif [ -n "${ISSUE_NUMBER}" ]; then
# Board mode: comment with PR URL, then close issue and move to Done
PR_URL=$(gh pr list --state open --json headRefName,url \
--jq ".[] | select(.headRefName == \"${BRANCH_NAME}\") | .url" 2>/dev/null || true)
Expand All @@ -1264,6 +1369,14 @@ if [ ${EXIT_CODE} -eq 0 ]; then
"${NW_CLI}" board move-issue "${ISSUE_NUMBER}" --column "Done" 2>>"${LOG_FILE}" || true
log "SUCCESS: PR opened and ready for review — ${PR_URL}"
emit_result "success_open_pr" "prd=${ELIGIBLE_PRD}|branch=${BRANCH_NAME}${PR_URL:+|pr_url=${PR_URL}}${EXECUTOR_PR_NUMBER:+|pr_number=${EXECUTOR_PR_NUMBER}}"
elif [ "${RESUME_FAILED_CI}" = "1" ]; then
NON_BOARD_PR_URL=$(gh pr list --state open --json headRefName,url \
--jq ".[] | select(.headRefName == \"${BRANCH_NAME}\") | .url" 2>/dev/null || true)
if [ -n "${NON_BOARD_PR_URL}" ]; then
gh pr comment "${NON_BOARD_PR_URL}" --body "> 🤖 CI fix by ${EFFECTIVE_PROVIDER_LABEL}" 2>>"${LOG_FILE}" || true
fi
log "SUCCESS: Failed CI repaired and PR returned to ready review — ${NON_BOARD_PR_URL}"
emit_result "success_open_pr" "prd=${ELIGIBLE_PRD}|branch=${BRANCH_NAME}${NON_BOARD_PR_URL:+|pr_url=${NON_BOARD_PR_URL}}|reason=ci_repaired"
elif finalize_prd_done "implemented, PR opened on ${BRANCH_NAME}"; then
# Non-board mode: post attribution comment to the PR
NON_BOARD_PR_URL=$(gh pr list --state open --json headRefName,url \
Expand Down
Loading
Loading