diff --git a/selftest/risk_patterns_corpus.txt b/selftest/risk_patterns_corpus.txt new file mode 100644 index 0000000..b5e04a0 --- /dev/null +++ b/selftest/risk_patterns_corpus.txt @@ -0,0 +1,193 @@ +# Shared corpus for automerge risk-tier selftests (BB + GH). +# One entry per line: TYPE: path +# Types: +# RISKY — must match high-risk patterns in BOTH BB (bb-automerge.py) AND GH (claude-author-automerge.yml) +# RISKY_BB — must match in BB; skipped by the GH selftest (e.g. bitbucket-pipelines.yml — irrelevant to GH workflow) +# SAFE — must NOT match in either +# +# When you add a new pattern category to claude-author-automerge.yml OR +# bb-automerge.py's HIGH_RISK_PATTERNS, add a representative RISKY entry here. +# Both selftests (test_automerge_risk_patterns.sh + test_bb_automerge_risk_patterns.sh) +# read from this file, so drift between the two pattern sets is caught. + +# ── RISKY: auth — all variants the GH workflow detects ───────────────────── +RISKY: src/auth/login.py +RISKY: internal/auth/security.go +# oauth2 alternation — wxa-mcp-server#193/#197 gap (2026-05-24) +RISKY: internal/oauth2/server.go +RISKY: internal/oauth2/handler.go +RISKY: pkg/oauth2/token.go +# Auth-adjacent variants (sibling-gap audit, 2026-05-24). +# Pattern is path-segment-anchored, not filename-prefix. So +# `internal/signin/handler.go` matches (signin is a segment) but +# `controllers/sessions_controller.rb` does NOT (sessions is a +# filename prefix, not a segment). Per-repo Rails/Django/Express +# conventions belong in caller's risk-paths.yml. 
+RISKY: internal/signin/handler.go +RISKY: internal/signup/form.go +RISKY: src/logout/handler.go +# sessions (plural) — Go convention +RISKY: internal/sessions/store.go +RISKY: internal/jwt/sign.go +RISKY: services/mfa/verify.py +RISKY: app/totp/generate.go +RISKY: lib/webauthn/register.go +RISKY: internal/passkey/store.go + +# ── RISKY: Go entrypoints — wxa-mcp-server#193 gap ───────────────────────── +RISKY: main.go +RISKY: cmd/server/main.go +RISKY: cmd/wxa-mcp-server/main.go + +# ── RISKY: billing variants ──────────────────────────────────────────────── +RISKY: internal/subscription/manager.go +RISKY: app/subscriptions/cancel.rb +RISKY: src/checkout/flow.tsx +RISKY: api/checkout/session.go +RISKY: internal/refund/process.go +RISKY: services/refund/issuer.py +RISKY: api/refunds/handler.go +RISKY: internal/billing/invoice.go +RISKY: billing/invoices.py +RISKY: internal/payment/charge.go +RISKY: internal/pricing/calc.go + +# ── RISKY: database (migrations + .sql) ──────────────────────────────────── +RISKY: internal/db/migrations/00049_workflows.sql +RISKY: internal/db/migrations/00050_workflow_executions.sql +RISKY: migrations/031_cdn_operator.sql +RISKY: schema/users.sql +RISKY: src/db/schema.sql + +# ── RISKY: secrets / credentials / env ───────────────────────────────────── +RISKY: internal/secret/manager.go +RISKY: config/secret/keystore.go +RISKY: src/secrets/vault.go +RISKY: secrets/api-keys.json +RISKY: config/credentials.yml +RISKY: credentials.py +RISKY: .env +RISKY: .env.production +RISKY: src/keychain_helpers.py + +# ── RISKY: containers + compose ──────────────────────────────────────────── +RISKY: Dockerfile +RISKY: deploy/Dockerfile.api +RISKY: docker-compose.yml +RISKY: docker-compose.staging.yml + +# ── RISKY: CI/CD ─────────────────────────────────────────────────────────── +RISKY: .github/workflows/ci.yml +RISKY: .github/workflows/deploy.yml +RISKY: .github/workflows/lint.yml +RISKY: .github/risk-paths.yml +RISKY: .github/CODEOWNERS +# BB-only: 
GH workflow doesn't include bitbucket-pipelines pattern (irrelevant to GH repos) +RISKY_BB: bitbucket-pipelines.yml + +# ── RISKY: infra (IAM, IaC, systemd, nginx, deploy scripts) ──────────────── +RISKY: infra/iam/scanner-role.json +RISKY: infra/iam/wxa-vpn-api-policy.json +RISKY: infra/iam/admin-role.tf +RISKY: infra/terraform/main.tf +RISKY: infra/k8s/api-deployment.yaml +RISKY: infra/digitalocean/systemd/wxa.service +RISKY: infra/scanner-id/identity.json +RISKY: infra/nginx/honeypot.conf +RISKY: infra/nginx/site.conf +RISKY: infra/nginx-checkip-vhost.conf +RISKY: infra/wxa-vpn-api.service +RISKY: infra/wxa-workload.slice +RISKY: infra/wxa-gt-builder.timer +RISKY: infra/systemd/api.service +RISKY: infra/some.tf +RISKY: infra/deploy-netflow-cron.sh +RISKY: infra/setup-actions-runner.sh +RISKY: infra/deploy-systemd.sh +RISKY: terraform/main.tf +RISKY: terraform/prod/main.tf +RISKY: pulumi/index.ts +RISKY: k8s/deployment.yaml +RISKY: fly.toml +RISKY: deploy/prod.sh +RISKY: deploy/deploy.sh +RISKY: deploy.sh +RISKY: deploy-staging.yml +RISKY: scripts/deploy/run.sh + +# ── SAFE: must NOT match ─────────────────────────────────────────────────── +SAFE: src/wxa_vpn/api/routes.py +SAFE: src/components/Button.tsx +SAFE: src/pages/Dashboard.tsx +SAFE: src/utils/format.ts +SAFE: src/api/findingsSlice.ts +SAFE: docs/architecture.md +SAFE: docs/data-dictionary.md +SAFE: README.md +SAFE: tests/test_anything.py +SAFE: tests/unit/utils_test.go +SAFE: openapi/paths/findings.yaml +SAFE: internal/api/handlers/findings.go +SAFE: internal/workflows/engine.go +SAFE: package.json +SAFE: package-lock.json +SAFE: go.mod +SAFE: go.sum +SAFE: scripts/run_analysis.py +# Adjacent to main.go but a test file +SAFE: main_test.go +SAFE: internal/foo/main_test.go +# Doc file mentioning oauth2 — pattern needs trailing / or end +SAFE: internal/oauth2.md +# Starts with "main" but not the literal main.go +SAFE: cmd/server/mainview.go +SAFE: mainview.go +# oauth2 substring but not a path segment 
+SAFE: src/oauth2helper.go +SAFE: oauth2helper.go +# Intentional parity-not-coverage tradeoffs (Wave 4.4 — match GH workflow): +# Filenames like `auth.ts`, `login.go`, `billing.py`, `secret.py`, +# `secret_key.py` are intentionally SAFE (NOT matched globally because the +# `(/|$)` segment anchor doesn't match leaf filenames). BB repos that need +# filename coverage either add an explicit HIGH_RISK_PATTERNS entry (and a +# RISKY: line here) or rely on Claude Review / Codex Review as the gate. +# Stripe paths likewise — only the BB-only HIGH_RISK_PATTERNS would have +# caught `stripe/webhooks.ts`; dropped for GH parity. +SAFE: src/auth.ts +SAFE: api/login.go +SAFE: billing.py +SAFE: secret.py +SAFE: config/secret_key.py +SAFE: src/secrets.py +SAFE: config/secret_keys.yml +SAFE: config/secret.json +SAFE: src/secrets.ts +SAFE: secrets.yaml +SAFE: secret_manager.py +# Nested docker-compose paths — partial parity gap: +# - `deploy/local/docker-compose.yml` IS still gated (matches `^deploy/.*`). +# - `services/api/docker-compose.yaml` is NOT gated globally. Repos that +# nest compose files outside `deploy/`, `infra/`, etc. should add an +# explicit HIGH_RISK_PATTERNS entry + RISKY: line here. 
+RISKY: deploy/local/docker-compose.yml +SAFE: services/api/docker-compose.yaml +SAFE: stripe/webhooks.ts +SAFE: src/stripe/client.go +# Substrings that look auth/billing-ish but aren't path segments +SAFE: internal/sessionsutil.go +SAFE: pkg/jwtutil.go +SAFE: lib/passkeystore.go +SAFE: internal/totps.go +SAFE: docs/checkout-flow.md +SAFE: lib/subscriber.go +SAFE: internal/secretly.go +SAFE: tests/test_authorization_logic.py +SAFE: docs/signin-flow.md +# Cron files SHOULD be safe — wxa_vpn#250 / wxa_vpn#439 lesson +SAFE: infra/crontabs/wxa-scanner.crontab +SAFE: infra/crontabs/wxa-scanner-active.crontab +SAFE: infra/crontabs/wxa-scanner-slow.crontab +SAFE: infra/crontabs/README.md +SAFE: infra/aws-scanner-setup.md +SAFE: infra/crontab.example +SAFE: infra/README.md diff --git a/selftest/test_automerge_risk_patterns.sh b/selftest/test_automerge_risk_patterns.sh index 00fb531..62769ad 100755 --- a/selftest/test_automerge_risk_patterns.sh +++ b/selftest/test_automerge_risk_patterns.sh @@ -50,104 +50,23 @@ matches() { return 1 } -# Cases the regex MUST flag as risky (manual click-merge). -RISKY=( - "src/auth/login.py" - "internal/auth/security.go" # auth segment, Go layout - "internal/oauth2/server.go" # oauth2 alternation — wxa-mcp-server#193/#197 gap (2026-05-24) - "internal/oauth2/handler.go" - "pkg/oauth2/token.go" - "main.go" # Go entrypoint at root — wxa-mcp-server#193 gap - "cmd/server/main.go" # Go entrypoint under cmd/ - "cmd/wxa-mcp-server/main.go" - # Auth-adjacent variants (sibling-gap audit, 2026-05-24). - # NOTE: Pattern is path-segment-anchored, not filename-prefix. So - # `internal/signin/handler.go` matches (signin is a segment) but - # `controllers/sessions_controller.rb` does NOT (sessions is a - # filename prefix, not a segment). Per-repo Rails/Django/Express - # conventions belong in caller's risk-paths.yml. 
- "internal/signin/handler.go" - "internal/signup/form.go" - "src/logout/handler.go" - "internal/sessions/store.go" # sessions (plural) — Go convention - "internal/jwt/sign.go" - "services/mfa/verify.py" - "app/totp/generate.go" - "lib/webauthn/register.go" - "internal/passkey/store.go" - # Billing-adjacent variants - "internal/subscription/manager.go" - "app/subscriptions/cancel.rb" - "api/checkout/session.go" - "services/refund/issuer.py" - "api/refunds/handler.go" - # Secret (singular) variants - "internal/secret/manager.go" - "config/secret/keystore.go" - "secrets/api-keys.json" - ".env.production" - "src/keychain_helpers.py" - "credentials.py" - "migrations/031_cdn_operator.sql" - "src/db/schema.sql" - "billing/invoices.py" - "Dockerfile" - "docker-compose.yml" - ".github/workflows/deploy.yml" - ".github/risk-paths.yml" - ".github/CODEOWNERS" - "infra/iam/scanner-role.json" # IAM policy - "infra/iam/wxa-vpn-api-policy.json" - "infra/terraform/main.tf" # IaC under infra/ - "infra/digitalocean/systemd/wxa.service" - "infra/scanner-id/identity.json" - "infra/nginx/honeypot.conf" - "infra/nginx-checkip-vhost.conf" # top-level nginx config - "infra/wxa-vpn-api.service" # systemd unit - "infra/wxa-workload.slice" - "infra/wxa-gt-builder.timer" - "infra/some.tf" - "infra/deploy-netflow-cron.sh" # shell script - "infra/setup-actions-runner.sh" - "infra/deploy-systemd.sh" - "terraform/main.tf" - "pulumi/index.ts" - "k8s/deployment.yaml" - "fly.toml" - "deploy/prod.sh" - "deploy.sh" - "deploy-staging.yml" -) - -# Cases the regex MUST allow through to auto-merge (the historical false positives). 
-SAFE=(
-  "src/wxa_vpn/api/routes.py"
-  "tests/test_anything.py"
-  "docs/data-dictionary.md"
-  "main_test.go"  # adjacent to main.go but a test file
-  "internal/foo/main_test.go"
-  "internal/oauth2.md"  # doc file mentioning oauth2 — pattern needs trailing / or end
-  "cmd/server/mainview.go"  # starts with "main" but not the literal main.go
-  "src/oauth2helper.go"  # oauth2 substring but not a path segment
-  # Substrings that look auth/billing-ish but aren't path segments — must NOT over-block
-  "internal/sessionsutil.go"  # "sessionsutil" segment, not "sessions"
-  "pkg/jwtutil.go"  # "jwtutil" not "jwt"
-  "lib/passkeystore.go"  # "passkeystore" not "passkey"
-  "internal/totps.go"  # "totps" — neither "totp/" nor "totp$"
-  "docs/checkout-flow.md"  # "checkout-flow.md" not literal "checkout"
-  "lib/subscriber.go"  # "subscriber" not "subscription"
-  "internal/secretly.go"  # "secretly" not "secret"
-  "tests/test_authorization_logic.py"  # "test_authorization_logic.py" — not literal "auth"
-  "docs/signin-flow.md"  # doc with "signin-flow" substring
-  "scripts/run_analysis.py"
-  "infra/crontabs/wxa-scanner.crontab"  # cron schedule — wxa_vpn#439 case
-  "infra/crontabs/wxa-scanner-active.crontab"
-  "infra/crontabs/README.md"
-  "infra/crontabs/wxa-scanner-slow.crontab"
-  "infra/aws-scanner-setup.md"  # runbook docs
-  "infra/crontab.example"  # example config
-  "infra/README.md"
-)
-
+# Test cases sourced from shared corpus at selftest/risk_patterns_corpus.txt.
+# The corpus is single source of truth for BOTH the GH selftest (this file)
+# AND the BB selftest (test_bb_automerge_risk_patterns.sh), so drift between
+# claude-author-automerge.yml's regex and bb-automerge.py's HIGH_RISK_PATTERNS
+# is caught here.
+CORPUS="$(dirname "$0")/risk_patterns_corpus.txt"
+[ ! -f "$CORPUS" ] && { echo "FAIL: corpus not found at $CORPUS"; exit 2; }
+RISKY=()
+SAFE=()
+while IFS= read -r line || [ -n "$line" ]; do  # also keep a last line that has no trailing newline
+  case "$line" in
+    RISKY_BB:*) ;;  # BB-only entries skipped by GH selftest
+    RISKY:*)    RISKY+=("${line#RISKY: }") ;;
+    SAFE:*)     SAFE+=("${line#SAFE: }") ;;
+    "#"*|"")    ;;
+  esac
+done < "$CORPUS"
 
 failed=0
 
diff --git a/selftest/test_bb_automerge_risk_patterns.sh b/selftest/test_bb_automerge_risk_patterns.sh
new file mode 100755
index 0000000..a6859f0
--- /dev/null
+++ b/selftest/test_bb_automerge_risk_patterns.sh
@@ -0,0 +1,79 @@
+#!/usr/bin/env bash
+# Selftest for bb-automerge.py — exercises HIGH_RISK_PATTERNS against the
+# shared corpus at selftest/risk_patterns_corpus.txt. ANY drift between
+# bb-automerge.py and claude-author-automerge.yml is caught here OR by the
+# parallel test_automerge_risk_patterns.sh; matching corpora means matching
+# coverage.
+#
+# Run from the repo root:
+#   bash selftest/test_bb_automerge_risk_patterns.sh
+#
+# Resolution of bb-automerge.py:
+#   - Prefer $BB_AUTOMERGE_PY if set
+#   - Else $HOME/.claude/templates/ci-workflows/scripts/bb-automerge.py
+#   - Else fail
+#
+# Exit status: 0 when every corpus case passes, 2 when bb-automerge.py or
+#   the corpus file is missing, otherwise the non-zero status of the harness.
+set -euo pipefail
+
+SCRIPT="${BB_AUTOMERGE_PY:-$HOME/.claude/templates/ci-workflows/scripts/bb-automerge.py}"
+if [ ! -f "$SCRIPT" ]; then
+  echo "FAIL: bb-automerge.py not found at $SCRIPT"
+  echo "Hint: set BB_AUTOMERGE_PY env var to its location"
+  exit 2
+fi
+
+CORPUS="$(dirname "$0")/risk_patterns_corpus.txt"
+[ ! -f "$CORPUS" ] && { echo "FAIL: corpus not found at $CORPUS"; exit 2; }
+
+# One-shot Python harness — load the script via importlib, exercise find_high_risk
+HARNESS=$(cat <<'PYEOF'
+import importlib.util
+import os
+import sys
+
+spec = importlib.util.spec_from_file_location('bba', os.environ['SCRIPT'])
+mod = importlib.util.module_from_spec(spec)
+# Register in sys.modules BEFORE exec_module so @dataclass can resolve cls.__module__
+# (Python 3.14 stricter behavior — see CPython dataclasses.py line 814).
+sys.modules['bba'] = mod
+spec.loader.exec_module(mod)
+
+# Entry type -> (must find_high_risk report a hit?, failure tag, failure text).
+EXPECT = {
+    'RISKY_BB': (True, 'risky_bb->safe', 'not classified as high-risk by BB patterns'),
+    'RISKY': (True, 'risky->safe', 'not classified as high-risk'),
+    'SAFE': (False, 'safe->risky', 'incorrectly classified as high-risk'),
+}
+fail = 0
+checked = 0
+# The corpus holds non-ASCII comment characters, so decode it as UTF-8
+# explicitly instead of relying on the locale default.
+with open(os.environ['CORPUS'], encoding='utf-8') as corpus:
+    for lineno, raw in enumerate(corpus, 1):
+        line = raw.strip()
+        if not line or line.startswith('#'):
+            continue
+        kind, _, path = line.partition(':')
+        path = path.strip()
+        if kind not in EXPECT or not path:
+            # A mistyped entry must not silently drop a test case.
+            print(f"FAIL [corpus]: line {lineno}: unrecognised entry '{line}'")
+            fail = 1
+            continue
+        want_hit, tag, text = EXPECT[kind]
+        checked += 1
+        if bool(mod.find_high_risk([path])) != want_hit:
+            print(f"FAIL [{tag}]: '{path}' {text}")
+            fail = 1
+if not fail:
+    print(f"PASS — all {checked} cases")
+sys.exit(fail)
+PYEOF
+)
+EXIT=0
+# `|| EXIT=$?` records a failing status; under `set -e` a bare failing
+# command would abort the script before the status could be captured.
+SCRIPT="$SCRIPT" CORPUS="$CORPUS" python3 -c "$HARNESS" || EXIT=$?
+exit "$EXIT"