From 830401aaf5bd76fcbc98915fe3a5caca3fff3a59 Mon Sep 17 00:00:00 2001
From: Tal Haim
Date: Thu, 18 Jun 2026 11:27:10 +0300
Subject: [PATCH] [CI] Wait for private on-prem deployment workflow to complete

The public release workflow previously fired a repository_dispatch and
exited immediately, so CE releases could succeed even when the private
deploy or Naipi sanity test failed. Poll the triggered workflow run and
fail the release job if deployment does not conclude successfully.

Co-authored-by: Cursor
---
 .cursor/rules/ce-onprem-public-dispatch.mdc   | 25 ++++++
 .../workflows/deploy_ce_onprem_public.yaml    | 77 ++++++++++++++++++-
 2 files changed, 101 insertions(+), 1 deletion(-)
 create mode 100644 .cursor/rules/ce-onprem-public-dispatch.mdc

diff --git a/.cursor/rules/ce-onprem-public-dispatch.mdc b/.cursor/rules/ce-onprem-public-dispatch.mdc
new file mode 100644
index 00000000..d183a594
--- /dev/null
+++ b/.cursor/rules/ce-onprem-public-dispatch.mdc
@@ -0,0 +1,25 @@
+---
+description: Maintain CE public-to-private on-prem deployment dispatch contract
+globs: .github/workflows/*.y*ml
+alwaysApply: false
+---
+
+# CE On-Prem Public Dispatch Contract
+
+- Scope: this rule applies when editing CE release and on-prem dispatch workflows.
+- Treat this path as a contract chain: `release.yml` -> `deploy_ce_onprem_public.yaml` -> `ce-deployment` `repository_dispatch` -> Jenkins `mlrunce_deploy_onprem_v2/dev`.
+
+## Contract Rules
+
+- Keep the dispatch `event_type` as `deploy-ce-onprem` unless both repos are updated together.
+- Keep payload keys stable (`version`, `system_id`, `run_naipi`, `source_repo`, `triggered_by`) or update the private workflow extractor in the same change.
+- Preserve release tag expectations (`mlrun-ce-`) and the chart-version handoff from `release.yml`.
+- Preserve repository resolution guard (`owner/repo`) for `DEPLOYMENT_REPO`.
+- Keep run tracking robust: dispatch-time correlation is required; if changing polling logic, guard against selecting an unrelated `repository_dispatch` run.
+
+## Review Checklist Before Merging
+
+- Validate that new/renamed payload keys are consumed in `ce-deployment/.github/workflows/deploy_ce_onprem.yaml`.
+- Validate `version` transformation remains consistent with chart release tags.
+- Validate that failures in the private workflow propagate to and fail the public workflow (do not silently pass).
+- Validate timeout and retry values still cover long Jenkins runs (up to ~6h).
diff --git a/.github/workflows/deploy_ce_onprem_public.yaml b/.github/workflows/deploy_ce_onprem_public.yaml
index 472f593a..889a76d1 100644
--- a/.github/workflows/deploy_ce_onprem_public.yaml
+++ b/.github/workflows/deploy_ce_onprem_public.yaml
@@ -23,6 +23,7 @@ jobs:
   trigger-deployment:
     name: Trigger Deployment in Private Repo
     runs-on: ubuntu-latest
+    timeout-minutes: 420 # 360-min Jenkins job + buffer; NOTE(review): GitHub-hosted runners reportedly cap a job at 6h (360 min) - confirm 420 is reachable here
     steps:
       - name: Resolve target repository
         id: repo-info
@@ -49,10 +50,14 @@ jobs:
           private-key: ${{ secrets.GH_APP_PRIVATE_KEY }}
           owner: ${{ steps.repo-info.outputs.owner }}
           repositories: ${{ steps.repo-info.outputs.repo }}
+          permission-contents: write
+          permission-actions: read
 
       - name: Send Repository Dispatch to Private Deployment Repo
+        id: dispatch
         run: |
           DEPLOYMENT_REPO="${{ steps.repo-info.outputs.full_name }}"
+          DISPATCH_TIME=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
           curl -X POST \
             -H "Accept: application/vnd.github+json" \
             -H "Authorization: token ${{ steps.app-token.outputs.token }}" \
@@ -70,5 +75,75 @@ jobs:
                   triggered_by: "${{ github.actor }}"
                 }
               }')"
-          
+
+          echo "dispatch_time=$DISPATCH_TIME" >> "$GITHUB_OUTPUT"
           echo "Deployment triggered in private repository"
+
+      - name: Wait for private deployment workflow
+        env:
+          APP_ID: ${{ secrets.GH_APP_ID }}
+          APP_PRIVATE_KEY: ${{ secrets.GH_APP_PRIVATE_KEY }}
+          INSTALLATION_ID: ${{ steps.app-token.outputs.installation-id }}
+          REPO: ${{ steps.repo-info.outputs.full_name }}
+          DISPATCH_TIME: ${{ steps.dispatch.outputs.dispatch_time }}
+        run: |
+          set -euo pipefail
+
+          # mint_token: print a fresh installation access token (actions:read only) on stdout.
+          # Reads APP_ID, APP_PRIVATE_KEY and INSTALLATION_ID; returns non-zero if no token is issued.
+          mint_token() {
+            local now iat exp header payload unsigned sig
+            now=$(date +%s); iat=$((now - 60)); exp=$((now + 540)) # iat backdated 60s, 10-min JWT window in total
+            b64() { openssl base64 -e -A | tr '+/' '-_' | tr -d '='; } # base64url, unpadded
+            header=$(printf '{"alg":"RS256","typ":"JWT"}' | b64)
+            payload=$(printf '{"iat":%d,"exp":%d,"iss":"%s"}' "$iat" "$exp" "$APP_ID" | b64)
+            unsigned="${header}.${payload}"
+            sig=$(printf '%s' "$unsigned" \
+              | openssl dgst -sha256 -sign <(printf '%s' "$APP_PRIVATE_KEY") -binary | b64)
+            curl -sf -X POST \
+              -H "Authorization: Bearer ${unsigned}.${sig}" \
+              -H "Accept: application/vnd.github+json" \
+              -d '{"permissions":{"actions":"read"}}' \
+              "https://api.github.com/app/installations/${INSTALLATION_ID}/access_tokens" \
+              | jq -er '.token' # -e: a missing token is an error, not the string "null"
+          }
+
+          TOKEN=$(mint_token)
+
+          # Find the run started by our dispatch. Runs show up with a delay and requests can fail
+          # transiently, so retry. Take the OLDEST run since DISPATCH_TIME so a later, unrelated dispatch cannot win.
+          RUN_ID=""
+          for ((attempt = 1; attempt <= 10; attempt++)); do
+            RUN_ID=$(curl -sf -H "Authorization: token $TOKEN" -H "Accept: application/vnd.github+json" \
+              "https://api.github.com/repos/${REPO}/actions/runs?event=repository_dispatch&created=>=${DISPATCH_TIME}&per_page=100" \
+              | jq -r '.workflow_runs | sort_by(.created_at) | first | .id // empty') || RUN_ID=""
+            [[ -n "$RUN_ID" ]] && break
+            echo "Waiting for workflow run to appear (attempt $attempt)..."
+            sleep 30
+          done
+
+          if [[ -z "$RUN_ID" ]]; then
+            echo "::error::Could not find a triggered workflow run after dispatch."
+            exit 1
+          fi
+          echo "Tracking run ${RUN_ID}"
+
+          # Poll until complete, re-minting the token each loop so it never expires.
+          FAILURES=0 # consecutive failed polls; one transient API error must not abort a multi-hour wait
+          while true; do
+            if TOKEN=$(mint_token) && RUN=$(curl -sf -H "Authorization: token $TOKEN" \
+              -H "Accept: application/vnd.github+json" "https://api.github.com/repos/${REPO}/actions/runs/${RUN_ID}"); then
+              FAILURES=0
+              STATUS=$(jq -r '.status' <<<"$RUN")
+              CONCLUSION=$(jq -r '.conclusion // empty' <<<"$RUN")
+              echo " status=$STATUS conclusion=${CONCLUSION:-pending}"
+              [[ "$STATUS" == "completed" ]] && break
+            elif ((++FAILURES >= 10)); then # roughly 10 minutes without a usable answer: give up loudly
+              echo "::error::Polling run ${RUN_ID} failed ${FAILURES} times in a row."
+              exit 1
+            fi
+            sleep 60
+          done
+
+          [[ "$CONCLUSION" == "success" ]] || { echo "::error::Private deployment workflow concluded: $CONCLUSION"; exit 1; }
+          echo "Deployment workflow succeeded."