From 446be25d7f3e2465dc2fe9c9cc165dee0e86413a Mon Sep 17 00:00:00 2001 From: kj-podonos Date: Wed, 24 Jun 2026 14:08:35 +0900 Subject: [PATCH 1/3] fix(ci): make PyPI promote idempotent for already-published versions The Promote (PyPI) workflow failed (red + Slack page) on every prod-deploy dispatch once the newest SDK <= prod spec was already on PyPI: the build-time preflight saw HTTP 200 and exit-1'd. That is a benign, expected re-dispatch, not a failure. Move the PyPI existence check up into `resolve` (it already knows the tag): - 200 (already published) -> already_published=true -> build + pypi skip -> green no-op, no Slack, plus a job-summary "no-op" line. - 404 (absent) -> build + publish as before. - other (000/403/429/5xx) -> fail closed (abort), now hardened with curl --retry/--max-time so a transient blip doesn't false-red. Delete the build-time preflight: it only guarded a publish-between-resolve- and-build race that can't happen (concurrency serializes promotes; PyPI rejects duplicate uploads). Add a `notify-success` job that pings Slack only on an actual publish (silent on the no-op). Sync docs (PUBLISH.md, RUNBOOK.md). Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/workflows/promote-prod.yml | 76 +++++++++++++++++++++--------- RUNBOOK.md | 8 ++-- docs/PUBLISH.md | 22 +++++---- 3 files changed, 70 insertions(+), 36 deletions(-) diff --git a/.github/workflows/promote-prod.yml b/.github/workflows/promote-prod.yml index b9dc2e8..d2c96f8 100644 --- a/.github/workflows/promote-prod.yml +++ b/.github/workflows/promote-prod.yml @@ -13,8 +13,8 @@ name: Promote (PyPI) # human override / escape hatch and skips the ancestry check. # # Build-once-promote is really rebuild-from-the-release-tag: hatch-vcs stamps the clean -# vX.Y.Z from the tagged commit (byte-for-byte deterministic), then PyPI's preflight makes -# a double-publish to the immutable index impossible. 
dev/main + TestPyPI may lead this lane; +# vX.Y.Z from the tagged commit (byte-for-byte deterministic); resolve's idempotency check +# (already-on-PyPI ⇒ skip) + PyPI's immutable index make a double-publish impossible. dev/main + TestPyPI may lead this lane; # only a real prod deploy ever reaches PyPI. See docs/PUBLISH.md for the two-lane model. # # ┌─ PyPI Trusted Publisher PREREQ (one-time, manual PyPI settings change) ───────────────┐ @@ -54,6 +54,7 @@ jobs: outputs: proceed: ${{ steps.gate.outputs.proceed }} tag: ${{ steps.target.outputs.tag }} + already_published: ${{ steps.published.outputs.already_published }} steps: - name: Gate — App configured + authorized trigger id: gate @@ -233,9 +234,37 @@ jobs: echo "::notice::Per-sha pin → promoting ${TAG} (newest SDK whose spec ≤ prod ${S})." echo "tag=$TAG" >> "$GITHUB_OUTPUT" + - name: Skip if already on PyPI (idempotent no-op) + id: published + if: steps.gate.outputs.proceed == 'true' + env: + TAG: ${{ steps.target.outputs.tag }} + # The promote target is immutable + deterministic from its tag. If onepin== + # already exists, this is a benign re-dispatch (per-sha resolve re-picks the newest + # SDK ≤ prod, still published) → no-op SUCCESS, skip the whole build, don't page Slack. + # Fail CLOSED only on AMBIGUITY: 000/403/429/5xx = cannot confirm absent → abort. + run: | + set -euo pipefail + VERSION="${TAG#v}" + # --retry + --max-time so a transient pypi.org blip (5xx/429/connrefused) doesn't trip + # the fail-closed red. A persistent outage still aborts (set -e) — correct (can't confirm). + code="$(curl -s --retry 3 --retry-connrefused --retry-delay 2 --max-time 20 -o /dev/null -w '%{http_code}' "https://pypi.org/pypi/onepin/${VERSION}/json")" + echo "pypi.org returned HTTP $code for onepin==${VERSION}" + if [ "$code" = "404" ]; then + echo "Version ${VERSION} not yet on PyPI — proceeding to build + publish." 
+ echo "already_published=false" >> "$GITHUB_OUTPUT" + elif [ "$code" = "200" ]; then + echo "::notice::onepin==${VERSION} already on PyPI — idempotent no-op, skipping build + publish (immutable index)." + { echo "### PyPI promote — no-op"; echo "\`onepin==${VERSION}\` already published. Nothing to do."; } >> "$GITHUB_STEP_SUMMARY" + echo "already_published=true" >> "$GITHUB_OUTPUT" + else + echo "::error::PyPI check returned unexpected HTTP ${code} for onepin==${VERSION} — cannot confirm absence; aborting (fail closed; retry when PyPI is reachable)." + exit 1 + fi + build: needs: resolve - if: needs.resolve.outputs.proceed == 'true' + if: ${{ needs.resolve.outputs.proceed == 'true' && needs.resolve.outputs.already_published == 'false' }} runs-on: ubuntu-latest outputs: version: ${{ steps.ver.outputs.version }} @@ -275,26 +304,6 @@ jobs: [[ "$version" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]] \ || { echo "::error::built version '$version' is not a clean release X.Y.Z (got a .devN/local — not building from a release tag?)"; exit 1; } echo "version=$version" >> "$GITHUB_OUTPUT" - - name: Preflight — version not already on PyPI (immutable index) - env: - VERSION: ${{ steps.ver.outputs.version }} - # PyPI never lets you re-upload a version. If onepin== already exists, - # a publish would hard-fail mid-run; abort cleanly here instead. - # Fail CLOSED: ONLY a definitive 404 (confirmed absent) proceeds. - # 200 (exists) AND anything else (000/403/429/5xx = cannot confirm absent) abort. - run: | - set -euo pipefail - code="$(curl -s -o /dev/null -w '%{http_code}' "https://pypi.org/pypi/onepin/${VERSION}/json")" - echo "pypi.org returned HTTP $code for onepin==${VERSION}" - if [ "$code" = "404" ]; then - echo "Version ${VERSION} not yet on PyPI — proceeding." - elif [ "$code" = "200" ]; then - echo "::error::onepin==${VERSION} is already published on PyPI — refusing to re-promote (the index is immutable)." 
- exit 1 - else - echo "::error::PyPI preflight returned unexpected HTTP ${code} for onepin==${VERSION} — cannot confirm version is absent; aborting to fail closed (retry when PyPI is reachable)." - exit 1 - fi - uses: actions/upload-artifact@v7 with: { name: dist, path: dist/* } @@ -330,6 +339,27 @@ jobs: env: GH_TOKEN: ${{ github.token }} + notify-success: + needs: [resolve, build, pypi] + if: ${{ needs.pypi.result == 'success' }} # ONLY when pypi actually published — skipped on the no-op path + runs-on: ubuntu-latest + steps: + - name: Notify Slack + if: env.SLACK_WEBHOOK_URL != '' + uses: slackapi/slack-github-action@v3 + with: + webhook: ${{ secrets.SLACK_WEBHOOK_URL }} + webhook-type: incoming-webhook + payload: | + {"text":"✅ PyPI promote succeeded: onepin==${{ needs.build.outputs.version }} (${{ needs.resolve.outputs.tag }}) in ${{ github.repository }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"} + env: + SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} + - name: Skip Slack notification + if: env.SLACK_WEBHOOK_URL == '' + run: echo "SLACK_WEBHOOK_URL not configured; skipping success notification." + env: + SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} + notify-failure: if: failure() needs: [resolve, build, pypi] diff --git a/RUNBOOK.md b/RUNBOOK.md index 6b2fa54..7594a0c 100644 --- a/RUNBOOK.md +++ b/RUNBOOK.md @@ -25,7 +25,7 @@ twine unyank onepin==0.1.3 A bad tag reaches **TestPyPI** via `publish.yml` but does **not** reach customers until a prod promote (`promote-prod.yml`). Deleting the tag is viable any time before that promote runs. After the version is on **PyPI**, the tag delete no longer helps — use `twine yank` -instead (and PyPI's immutable index / the promote preflight already block a re-upload). +instead (and PyPI's immutable index / the promote `resolve` idempotency check already make a re-upload a no-op). 
```bash git tag -d v0.X.Y @@ -68,8 +68,8 @@ The package publishes from **two independent lanes** (full model + diagram: - **PyPI lane — `promote-prod.yml`** (customers, prod-gated). Fires on `repository_dispatch[api-spec-updated]` **only when `environment == 'prod'`** (the backend production deploy dispatch), or manually via `workflow_dispatch`. - Resolves the latest `vX.Y.Z` tag, builds a clean `X.Y.Z`, runs the immutable-index - preflight, then publishes to **PyPI** (OIDC trusted publishing + provenance). + Resolves the target `vX.Y.Z` tag (newest release whose spec ≤ prod, or the manual `tag` override), runs the PyPI idempotency check in `resolve` (already published ⇒ green no-op), + then builds a clean `X.Y.Z` and publishes to **PyPI** (OIDC trusted publishing + provenance). > **PyPI Trusted Publisher prereq:** the `onepin` PyPI project's Trusted Publisher must > point at workflow filename **`promote-prod.yml`** with environment **`pypi`** (it was @@ -107,7 +107,7 @@ Manual fallback / replay — **PyPI** (promote an existing release tag to custom ```bash # Used when the pipeline App was absent at the prod deploy, or to re-drive a promote. # Supplying -f tag= bypasses the per-sha ancestry resolver (human override). -# The preflight refuses a version that already exists on PyPI. +# resolve skips (no-op success) a version already on PyPI; only ambiguous PyPI responses abort. 
gh workflow run promote-prod.yml --ref main -f tag=vX.Y.Z ``` diff --git a/docs/PUBLISH.md b/docs/PUBLISH.md index dc80f18..8c610d9 100644 --- a/docs/PUBLISH.md +++ b/docs/PUBLISH.md @@ -11,7 +11,7 @@ must only ever receive a build that a real production deploy blessed.** | Cadence | **continuous** | **prod-gated** | | Trigger | push to `main` (SDK inputs) · release tag `vX.Y.Z` · `workflow_dispatch` | `repository_dispatch[api-spec-updated]` with `environment == 'prod'` · `workflow_dispatch` | | Version | `X.Y.Z.devN` on main · clean `X.Y.Z` on a tag (hatch-vcs) | clean `X.Y.Z` only (built from the release tag) | -| Gate | none (internal) | App-token guard **+** prod-environment trigger **+** PyPI preflight | +| Gate | none (internal) | App-token guard **+** prod-environment trigger **+** PyPI idempotency check | ## Version ownership @@ -62,7 +62,7 @@ must only ever receive a build that a real production deploy blessed.** │ no tag passes ⇒ ABORT │ │ build resolved tag (fetch-depth:0, hatch-vcs → clean X.Y.Z) │ │ assert ^[0-9]+\.[0-9]+\.[0-9]+$ (NO .devN / NO +local) │ - │ preflight: GET pypi.org/pypi/onepin//json → 200 ⇒ abort │ + │ idempotency check (resolve): 200 ⇒ skip no-op · 404 ⇒ publish · else abort │ │ │ │ │ ▼ │ │ ✦ PyPI ✦ (OIDC trusted publishing, environment: pypi, │ @@ -116,9 +116,9 @@ a manual settings change in the PyPI project — the workflow cannot self-config | 2 | **Prod trigger missing / GitHub App absent** → customer release silently skipped | The `onepin-pipeline-bot` App / `PIPELINE_APP_ID`+`PIPELINE_APP_PRIVATE_KEY` secrets aren't set yet at a real prod deploy | `promote-prod.yml`'s gate warn-skips (`::warning::`) instead of failing, and emits a visible notice. **Replay:** once the App exists, run `promote-prod.yml` via `workflow_dispatch` with the tag. (The backend's `notify-sdk-repos` dispatch carries the same guard.) 
| | 3 | **release-PR-merge gating** — "prod deployed but PyPI got nothing" | A regen `chore:` PR is invisible to release-please → no tag → nothing to promote | regen PRs are now **`feat:`** → release-please cuts a `vX.Y.Z` tag (`bump-minor-pre-major`). The promote iterates all `vX.Y.Z` tags newest-first; if none carry a valid `.spec-sha` ≤ S it aborts loudly (`::error::no released SDK matches`). | | 4 | **`.devN` non-monotonic / collides on TestPyPI** | Two builds of the same commit, or out-of-order history, produce a stale/duplicate `.devN` | hatch-vcs derives `.devN` from **git distance** (monotonic with history under `fetch-depth: 0`); TestPyPI publish uses `skip-existing: true` so a genuine re-run/retag never breaks the chain. TestPyPI is internal-only — a non-monotonic dev number never reaches customers. | -| 5 | **Double-publish to the immutable PyPI index** | A re-dispatch / re-run / rollback re-fires the promote for an already-published version | `promote-prod.yml` **preflight**: `GET https://pypi.org/pypi/onepin//json`; HTTP `200` ⇒ `::error::` abort before upload. `concurrency: { group: promote-prod, cancel-in-progress: false }` serializes promotes. PyPI itself is the final backstop (rejects re-uploads). | +| 5 | **Double-publish to the immutable PyPI index** | A re-dispatch / re-run / rollback re-fires the promote for an already-published version | `promote-prod.yml` **`resolve` idempotency check**: `GET https://pypi.org/pypi/onepin//json`; HTTP `200` ⇒ **skip `build` + `pypi` as an idempotent no-op** (green run, no Slack) — not an abort. `concurrency: { group: promote-prod, cancel-in-progress: false }` serializes promotes. PyPI itself is the final backstop (rejects re-uploads). 
| | 6 | **Wrong-version / wrong-sha promoted** to customers | Promote builds off a branch HEAD (a `.devN`), or promotes a tag whose API is ahead of the deployed spec | The build checks out `refs/tags/` (qualified — never a same-named branch) and **asserts a clean `^[0-9]+\.[0-9]+\.[0-9]+$`** (a `.devN`/local aborts). **Per-sha pinning** (shipped): on a prod dispatch carrying spec commit S, the resolver iterates tags newest-first and promotes the newest tag whose `.spec-sha` is an ancestor-or-equal of S in the spec repo (`compare` base...head → `ahead`/`identical` = safe); any tag ahead of prod is skipped. Any API error during classification aborts the whole resolve (fail closed) — the immutable index is never touched with an uncertain result. | -| 7 | **Rollback re-dispatch republishes/downgrades** | A non-forward dispatch (e.g. a rollback) reaches the receiver | The PyPI lane only acts on `environment == 'prod'` dispatches; the immutable-index preflight (row 5) blocks a re-publish of an existing version. (The backend additionally gates `notify-sdk-repos` on `github.event_name == 'push'` so a rollback `workflow_dispatch` doesn't re-dispatch.) | +| 7 | **Rollback re-dispatch republishes/downgrades** | A non-forward dispatch (e.g. a rollback) reaches the receiver | The PyPI lane only acts on `environment == 'prod'` dispatches; the `resolve` idempotency check (row 5) skips a re-publish of an existing version as a no-op. (The backend additionally gates `notify-sdk-repos` on `github.event_name == 'push'` so a rollback `workflow_dispatch` doesn't re-dispatch.) 
| | 8 | **`testpypi-smoke` flakes on a fresh `.devN`** — TestPyPI publish succeeds but the post-publish smoke install fails with `No matching distribution` | TestPyPI's `/simple/` index is eventually-consistent (Fastly CDN); the smoke job runs seconds after `test-pypi` uploads and **races the index** before the new version propagates | `testpypi-smoke` **retries** the install (10×30s, ~5 min ceiling, early-exit on success) with `--no-cache-dir` so pip never replays a cached negative index response. A genuinely uninstallable artifact still fails after the budget (the install never succeeds); the version flows via `env:` for script-injection safety. Note: `--no-cache-dir` covers pip's *client* cache, not Fastly *edge* caching — the time budget, not the flag, is what outlasts CDN lag. | ## Test plan (4 layers) @@ -143,16 +143,16 @@ a manual settings change in the PyPI project — the workflow cannot self-config - **PyPI lane gate** — `workflow_dispatch` `promote-prod.yml` **with no App secrets**: assert it `::warning::` warn-skips (no PyPI mutation). Dispatch a simulated `repository_dispatch` with `environment: dev`: assert it `::notice::` skips. -- **Preflight** — `workflow_dispatch` `promote-prod.yml` with `tag` = an **already-published** - version: assert the preflight returns `200` and the job aborts with `::error::` *before* the - `pypi` job. +- **Idempotency (no-op)** — `workflow_dispatch` `promote-prod.yml` with `tag` = an **already-published** + version: assert `resolve` logs `200`, sets `already_published=true`, and **`build` + `pypi` skip** + (green run, no Slack) — not an abort. - **actionlint** — `actionlint .github/workflows/*.yml` clean (CI-enforceable). ### 3. End-to-end (real release, gated on the App existing) - Merge a `feat:` regen PR → release-please opens a release PR → merge it → tag `vX.Y.Z` → `publish.yml` fires → TestPyPI gets the clean `X.Y.Z`. 
- Real backend **prod** deploy (`deploy-prod.yml`) → `repository_dispatch{environment:prod}` - → `promote-prod.yml` resolves the tag, builds clean `X.Y.Z`, preflight passes, **PyPI** + → `promote-prod.yml` resolves the tag (PyPI check: `404` ⇒ absent), builds clean `X.Y.Z`, **PyPI** publish + provenance attestation. Verify `pip install onepin==X.Y.Z` from pypi.org and `onepin --version`. - **Manual replay** path: `gh workflow run promote-prod.yml -f tag=vX.Y.Z` reaches PyPI @@ -162,11 +162,15 @@ a manual settings change in the PyPI project — the workflow cannot self-config - **Slack failure notifier** on both lanes (`notify-failure`, gated on `SLACK_WEBHOOK_URL`): fires on any build/publish failure with a direct run link. A clean warn-skip does **not** fire (skipped ≠ failed). +- **Slack success notifier** (PyPI lane, `notify-success`, `if: needs.pypi.result == 'success'`): + pings only on an actual publish — **silent on the idempotent no-op** (a skipped `pypi` is not a + success), so a redundant re-promote is a quiet green run. - **Build-provenance attestation** (PyPI lane, public-repo-gated): `actions/attest-build-provenance` + `gh attestation verify dist/*.whl --repo podonos/onepin-python` — a signed, verifiable record of exactly which commit/workflow produced the published bytes. - **Traceability log** — the promote logs the dispatched `client_payload.sha` + `spec_version` and the resolved tag, so a published version can always be traced back to the prod deploy that triggered it. -- **PyPI preflight log** — the `GET …/json` HTTP code is echoed every run (visible proof the +- **PyPI idempotency log** — `resolve` echoes the `GET …/json` HTTP code every run, and writes a + job-summary "no-op" line when it skips an already-published version (visible proof the immutable-index guard ran). 
From 79105bd0eeda6ab9ab34af8ed7e4cc40900a0f31 Mon Sep 17 00:00:00 2001 From: kj-podonos Date: Wed, 24 Jun 2026 14:31:11 +0900 Subject: [PATCH 2/3] fix(ci): make success Slack notification best-effort MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex review: if SLACK_WEBHOOK_URL is set but Slack rejects/times out, the new notify-success job failed AFTER pypi had already published — turning a real, immutable release into a red run with no Slack at all (notify-failure doesn't depend on notify-success, so it stays silent). Add continue-on-error to the Slack step so a notification hiccup can never red an already-published release. Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/workflows/promote-prod.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/promote-prod.yml b/.github/workflows/promote-prod.yml index d2c96f8..a59a95c 100644 --- a/.github/workflows/promote-prod.yml +++ b/.github/workflows/promote-prod.yml @@ -346,6 +346,7 @@ jobs: steps: - name: Notify Slack if: env.SLACK_WEBHOOK_URL != '' + continue-on-error: true # a Slack hiccup must never red an already-published release uses: slackapi/slack-github-action@v3 with: webhook: ${{ secrets.SLACK_WEBHOOK_URL }} From a05e6f2df7a676dcb328b9f4f8a6038afe714b9c Mon Sep 17 00:00:00 2001 From: kj-podonos Date: Wed, 24 Jun 2026 15:05:50 +0900 Subject: [PATCH 3/3] fix(ci): harden PyPI check redirects + correct PUBLISH diagram order code-reviewer pass (non-blocking): - curl now follows redirects (-L --max-redirs 3) so a benign pypi.org 301/302 resolves to the real 200/404 instead of false-reding via the else branch; fail-closed semantics preserved (a redirect loop / unexpected final status still aborts). - PUBLISH.md flow diagram: move the idempotency-check line above the build steps so it reflects that the check runs in `resolve`, before `build`. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/workflows/promote-prod.yml | 2 +- docs/PUBLISH.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/promote-prod.yml b/.github/workflows/promote-prod.yml index a59a95c..6182af9 100644 --- a/.github/workflows/promote-prod.yml +++ b/.github/workflows/promote-prod.yml @@ -248,7 +248,7 @@ jobs: VERSION="${TAG#v}" # --retry + --max-time so a transient pypi.org blip (5xx/429/connrefused) doesn't trip # the fail-closed red. A persistent outage still aborts (set -e) — correct (can't confirm). - code="$(curl -s --retry 3 --retry-connrefused --retry-delay 2 --max-time 20 -o /dev/null -w '%{http_code}' "https://pypi.org/pypi/onepin/${VERSION}/json")" + code="$(curl -s -L --max-redirs 3 --retry 3 --retry-connrefused --retry-delay 2 --max-time 20 -o /dev/null -w '%{http_code}' "https://pypi.org/pypi/onepin/${VERSION}/json")" echo "pypi.org returned HTTP $code for onepin==${VERSION}" if [ "$code" = "404" ]; then echo "Version ${VERSION} not yet on PyPI — proceeding to build + publish." diff --git a/docs/PUBLISH.md b/docs/PUBLISH.md index 8c610d9..f15b8e1 100644 --- a/docs/PUBLISH.md +++ b/docs/PUBLISH.md @@ -60,9 +60,9 @@ must only ever receive a build that a real production deploy blessed.** │ behind / diverged ⇒ skip (SDK ahead of prod) │ │ 404 ⇒ skip (SHA GC'd); other error ⇒ ABORT (fail closed) │ │ no tag passes ⇒ ABORT │ + │ idempotency check (resolve): 200 ⇒ skip no-op · 404 ⇒ publish · else abort │ │ build resolved tag (fetch-depth:0, hatch-vcs → clean X.Y.Z) │ │ assert ^[0-9]+\.[0-9]+\.[0-9]+$ (NO .devN / NO +local) │ - │ idempotency check (resolve): 200 ⇒ skip no-op · 404 ⇒ publish · else abort │ │ │ │ │ ▼ │ │ ✦ PyPI ✦ (OIDC trusted publishing, environment: pypi, │