Skip to content
193 changes: 193 additions & 0 deletions selftest/risk_patterns_corpus.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
# Shared corpus for automerge risk-tier selftests (BB + GH).
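# One entry per line: <TYPE>: <path>
# Write exactly one space after the colon and nothing after the path: the
# selftests take the rest of the line as the path, so inline comments are
# not supported (put comments on their own `#` line).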
# Types:
# RISKY — must match high-risk patterns in BOTH BB (bb-automerge.py) AND GH (claude-author-automerge.yml)
# RISKY_BB — must match in BB; the GH selftest skips these entries and asserts nothing about them (e.g. bitbucket-pipelines.yml — irrelevant to GH workflow)
# SAFE — must NOT match in either
#
# When you add a new pattern category to claude-author-automerge.yml OR
# bb-automerge.py's HIGH_RISK_PATTERNS, add a representative RISKY entry here.
# Both selftests (test_automerge_risk_patterns.sh + test_bb_automerge_risk_patterns.sh)
# read from this file, so drift between the two pattern sets is caught.

# ── RISKY: auth — all variants the GH workflow detects ─────────────────────
RISKY: src/auth/login.py
RISKY: internal/auth/security.go
# oauth2 alternation — wxa-mcp-server#193/#197 gap (2026-05-24)
RISKY: internal/oauth2/server.go
RISKY: internal/oauth2/handler.go
RISKY: pkg/oauth2/token.go
# Auth-adjacent variants (sibling-gap audit, 2026-05-24).
# Pattern is path-segment-anchored, not filename-prefix. So
# `internal/signin/handler.go` matches (signin is a segment) but
# `controllers/sessions_controller.rb` does NOT (sessions is a
# filename prefix, not a segment). Per-repo Rails/Django/Express
# conventions belong in caller's risk-paths.yml.
RISKY: internal/signin/handler.go
RISKY: internal/signup/form.go
RISKY: src/logout/handler.go
# sessions (plural) — Go convention
RISKY: internal/sessions/store.go
RISKY: internal/jwt/sign.go
RISKY: services/mfa/verify.py
RISKY: app/totp/generate.go
RISKY: lib/webauthn/register.go
RISKY: internal/passkey/store.go

# ── RISKY: Go entrypoints — wxa-mcp-server#193 gap ─────────────────────────
RISKY: main.go
RISKY: cmd/server/main.go
RISKY: cmd/wxa-mcp-server/main.go

# ── RISKY: billing variants ────────────────────────────────────────────────
RISKY: internal/subscription/manager.go
RISKY: app/subscriptions/cancel.rb
RISKY: src/checkout/flow.tsx
RISKY: api/checkout/session.go
RISKY: internal/refund/process.go
RISKY: services/refund/issuer.py
RISKY: api/refunds/handler.go
RISKY: internal/billing/invoice.go
RISKY: billing/invoices.py
RISKY: internal/payment/charge.go
RISKY: internal/pricing/calc.go

# ── RISKY: database (migrations + .sql) ────────────────────────────────────
RISKY: internal/db/migrations/00049_workflows.sql
RISKY: internal/db/migrations/00050_workflow_executions.sql
RISKY: migrations/031_cdn_operator.sql
RISKY: schema/users.sql
RISKY: src/db/schema.sql

# ── RISKY: secrets / credentials / env ─────────────────────────────────────
RISKY: internal/secret/manager.go
RISKY: config/secret/keystore.go
RISKY: src/secrets/vault.go
RISKY: secrets/api-keys.json
RISKY: config/credentials.yml
RISKY: credentials.py
RISKY: .env
RISKY: .env.production
RISKY: src/keychain_helpers.py

# ── RISKY: containers + compose ────────────────────────────────────────────
RISKY: Dockerfile
RISKY: deploy/Dockerfile.api
RISKY: docker-compose.yml
RISKY: docker-compose.staging.yml

# ── RISKY: CI/CD ───────────────────────────────────────────────────────────
RISKY: .github/workflows/ci.yml
RISKY: .github/workflows/deploy.yml
RISKY: .github/workflows/lint.yml
RISKY: .github/risk-paths.yml
RISKY: .github/CODEOWNERS
# BB-only: GH workflow doesn't include bitbucket-pipelines pattern (irrelevant to GH repos)
RISKY_BB: bitbucket-pipelines.yml

# ── RISKY: infra (IAM, IaC, systemd, nginx, deploy scripts) ────────────────
RISKY: infra/iam/scanner-role.json
RISKY: infra/iam/wxa-vpn-api-policy.json
RISKY: infra/iam/admin-role.tf
RISKY: infra/terraform/main.tf
RISKY: infra/k8s/api-deployment.yaml
RISKY: infra/digitalocean/systemd/wxa.service
RISKY: infra/scanner-id/identity.json
RISKY: infra/nginx/honeypot.conf
RISKY: infra/nginx/site.conf
RISKY: infra/nginx-checkip-vhost.conf
RISKY: infra/wxa-vpn-api.service
RISKY: infra/wxa-workload.slice
RISKY: infra/wxa-gt-builder.timer
RISKY: infra/systemd/api.service
RISKY: infra/some.tf
RISKY: infra/deploy-netflow-cron.sh
RISKY: infra/setup-actions-runner.sh
RISKY: infra/deploy-systemd.sh
RISKY: terraform/main.tf
RISKY: terraform/prod/main.tf
RISKY: pulumi/index.ts
RISKY: k8s/deployment.yaml
RISKY: fly.toml
RISKY: deploy/prod.sh
RISKY: deploy/deploy.sh
RISKY: deploy.sh
RISKY: deploy-staging.yml
RISKY: scripts/deploy/run.sh

# ── SAFE: must NOT match ───────────────────────────────────────────────────
SAFE: src/wxa_vpn/api/routes.py
SAFE: src/components/Button.tsx
SAFE: src/pages/Dashboard.tsx
SAFE: src/utils/format.ts
SAFE: src/api/findingsSlice.ts
SAFE: docs/architecture.md
SAFE: docs/data-dictionary.md
SAFE: README.md
SAFE: tests/test_anything.py
SAFE: tests/unit/utils_test.go
SAFE: openapi/paths/findings.yaml
SAFE: internal/api/handlers/findings.go
SAFE: internal/workflows/engine.go
SAFE: package.json
SAFE: package-lock.json
SAFE: go.mod
SAFE: go.sum
SAFE: scripts/run_analysis.py
# Adjacent to main.go but a test file
SAFE: main_test.go
SAFE: internal/foo/main_test.go
# Doc file mentioning oauth2 — pattern needs trailing / or end
SAFE: internal/oauth2.md
# Starts with "main" but not the literal main.go
SAFE: cmd/server/mainview.go
SAFE: mainview.go
# oauth2 substring but not a path segment
SAFE: src/oauth2helper.go
SAFE: oauth2helper.go
# Intentional parity-not-coverage tradeoffs (Wave 4.4 — match GH workflow):
# Filenames like `auth.ts`, `login.go`, `billing.py`, `secret.py`,
# `secret_key.py` are intentionally SAFE (NOT matched globally because the
# `(/|$)` segment anchor doesn't match leaf filenames). BB repos that need
# filename coverage either add an explicit HIGH_RISK_PATTERNS entry (and a
# RISKY: line here) or rely on Claude Review / Codex Review as the gate.
# Stripe paths likewise — only the BB-only HIGH_RISK_PATTERNS would have
# caught `stripe/webhooks.ts`; dropped for GH parity.
SAFE: src/auth.ts
SAFE: api/login.go
SAFE: billing.py
SAFE: secret.py
SAFE: config/secret_key.py
SAFE: src/secrets.py
SAFE: config/secret_keys.yml
SAFE: config/secret.json
SAFE: src/secrets.ts
SAFE: secrets.yaml
SAFE: secret_manager.py
# Nested docker-compose paths — partial parity gap:
# - `deploy/local/docker-compose.yml` IS still gated (matches `^deploy/.*`).
# - `services/api/docker-compose.yaml` is NOT gated globally. Repos that
# nest compose files outside `deploy/`, `infra/`, etc. should add an
# explicit HIGH_RISK_PATTERNS entry + RISKY: line here.
RISKY: deploy/local/docker-compose.yml
SAFE: services/api/docker-compose.yaml
SAFE: stripe/webhooks.ts
SAFE: src/stripe/client.go
# Substrings that look auth/billing-ish but aren't path segments
SAFE: internal/sessionsutil.go
SAFE: pkg/jwtutil.go
SAFE: lib/passkeystore.go
SAFE: internal/totps.go
SAFE: docs/checkout-flow.md
SAFE: lib/subscriber.go
SAFE: internal/secretly.go
SAFE: tests/test_authorization_logic.py
SAFE: docs/signin-flow.md
# Cron files SHOULD be safe — wxa_vpn#250 / wxa_vpn#439 lesson
SAFE: infra/crontabs/wxa-scanner.crontab
SAFE: infra/crontabs/wxa-scanner-active.crontab
SAFE: infra/crontabs/wxa-scanner-slow.crontab
SAFE: infra/crontabs/README.md
SAFE: infra/aws-scanner-setup.md
SAFE: infra/crontab.example
SAFE: infra/README.md
115 changes: 17 additions & 98 deletions selftest/test_automerge_risk_patterns.sh
Original file line number Diff line number Diff line change
Expand Up @@ -50,104 +50,23 @@ matches() {
return 1
}

# Cases the regex MUST flag as risky (manual click-merge).
RISKY=(
"src/auth/login.py"
"internal/auth/security.go" # auth segment, Go layout
"internal/oauth2/server.go" # oauth2 alternation — wxa-mcp-server#193/#197 gap (2026-05-24)
"internal/oauth2/handler.go"
"pkg/oauth2/token.go"
"main.go" # Go entrypoint at root — wxa-mcp-server#193 gap
"cmd/server/main.go" # Go entrypoint under cmd/
"cmd/wxa-mcp-server/main.go"
# Auth-adjacent variants (sibling-gap audit, 2026-05-24).
# NOTE: Pattern is path-segment-anchored, not filename-prefix. So
# `internal/signin/handler.go` matches (signin is a segment) but
# `controllers/sessions_controller.rb` does NOT (sessions is a
# filename prefix, not a segment). Per-repo Rails/Django/Express
# conventions belong in caller's risk-paths.yml.
"internal/signin/handler.go"
"internal/signup/form.go"
"src/logout/handler.go"
"internal/sessions/store.go" # sessions (plural) — Go convention
"internal/jwt/sign.go"
"services/mfa/verify.py"
"app/totp/generate.go"
"lib/webauthn/register.go"
"internal/passkey/store.go"
# Billing-adjacent variants
"internal/subscription/manager.go"
"app/subscriptions/cancel.rb"
"api/checkout/session.go"
"services/refund/issuer.py"
"api/refunds/handler.go"
# Secret (singular) variants
"internal/secret/manager.go"
"config/secret/keystore.go"
"secrets/api-keys.json"
".env.production"
"src/keychain_helpers.py"
"credentials.py"
"migrations/031_cdn_operator.sql"
"src/db/schema.sql"
"billing/invoices.py"
"Dockerfile"
"docker-compose.yml"
".github/workflows/deploy.yml"
".github/risk-paths.yml"
".github/CODEOWNERS"
"infra/iam/scanner-role.json" # IAM policy
"infra/iam/wxa-vpn-api-policy.json"
"infra/terraform/main.tf" # IaC under infra/
"infra/digitalocean/systemd/wxa.service"
"infra/scanner-id/identity.json"
"infra/nginx/honeypot.conf"
"infra/nginx-checkip-vhost.conf" # top-level nginx config
"infra/wxa-vpn-api.service" # systemd unit
"infra/wxa-workload.slice"
"infra/wxa-gt-builder.timer"
"infra/some.tf"
"infra/deploy-netflow-cron.sh" # shell script
"infra/setup-actions-runner.sh"
"infra/deploy-systemd.sh"
"terraform/main.tf"
"pulumi/index.ts"
"k8s/deployment.yaml"
"fly.toml"
"deploy/prod.sh"
"deploy.sh"
"deploy-staging.yml"
)

# Cases the regex MUST allow through to auto-merge (the historical false positives).
SAFE=(
"src/wxa_vpn/api/routes.py"
"tests/test_anything.py"
"docs/data-dictionary.md"
"main_test.go" # adjacent to main.go but a test file
"internal/foo/main_test.go"
"internal/oauth2.md" # doc file mentioning oauth2 — pattern needs trailing / or end
"cmd/server/mainview.go" # starts with "main" but not the literal main.go
"src/oauth2helper.go" # oauth2 substring but not a path segment
# Substrings that look auth/billing-ish but aren't path segments — must NOT over-block
"internal/sessionsutil.go" # "sessionsutil" segment, not "sessions"
"pkg/jwtutil.go" # "jwtutil" not "jwt"
"lib/passkeystore.go" # "passkeystore" not "passkey"
"internal/totps.go" # "totps" — neither "totp/" nor "totp$"
"docs/checkout-flow.md" # "checkout-flow.md" not literal "checkout"
"lib/subscriber.go" # "subscriber" not "subscription"
"internal/secretly.go" # "secretly" not "secret"
"tests/test_authorization_logic.py" # "test_authorization_logic.py" — not literal "auth"
"docs/signin-flow.md" # doc with "signin-flow" substring
"scripts/run_analysis.py"
"infra/crontabs/wxa-scanner.crontab" # cron schedule — wxa_vpn#439 case
"infra/crontabs/wxa-scanner-active.crontab"
"infra/crontabs/README.md"
"infra/crontabs/wxa-scanner-slow.crontab"
"infra/aws-scanner-setup.md" # runbook docs
"infra/crontab.example" # example config
"infra/README.md"
)
# Test cases sourced from shared corpus at selftest/risk_patterns_corpus.txt.
# The corpus is single source of truth for BOTH the GH selftest (this file)
# AND the BB selftest (test_bb_automerge_risk_patterns.sh), so drift between
# claude-author-automerge.yml's regex and bb-automerge.py's HIGH_RISK_PATTERNS
# is caught here.
#
# Populates the RISKY and SAFE arrays. Exits 2 when the corpus is missing,
# contains an unrecognized entry, or yields no RISKY / no SAFE cases — a
# silently dropped test case would make this selftest pass vacuously.
CORPUS="$(dirname "$0")/risk_patterns_corpus.txt"
if [ ! -f "$CORPUS" ]; then
  echo "FAIL: corpus not found at $CORPUS"
  exit 2
fi
RISKY=()
SAFE=()
_corpus_lineno=0
# `|| [ -n "$line" ]` keeps a final entry that has no trailing newline
# (read returns non-zero at EOF but still fills $line).
while IFS= read -r line || [ -n "$line" ]; do
  _corpus_lineno=$((_corpus_lineno + 1))
  # Trim leading and trailing whitespace (including a CR from CRLF files) so
  # this parser accepts the same lines as the BB harness, which strips each line.
  line="${line#"${line%%[![:space:]]*}"}"
  line="${line%"${line##*[![:space:]]}"}"
  case "$line" in
    ""|"#"*) ;;                 # blank line or comment
    RISKY_BB:*) ;;              # BB-only entries skipped by GH selftest
    RISKY:*)
      _corpus_path="${line#RISKY:}"
      # Drop whatever whitespace follows the colon (zero, one or more spaces).
      RISKY+=("${_corpus_path#"${_corpus_path%%[![:space:]]*}"}")
      ;;
    SAFE:*)
      _corpus_path="${line#SAFE:}"
      SAFE+=("${_corpus_path#"${_corpus_path%%[![:space:]]*}"}")
      ;;
    *)
      # A typo in the type tag must not silently remove a test case.
      echo "FAIL: unrecognized corpus entry at $CORPUS:$_corpus_lineno: $line"
      exit 2
      ;;
  esac
done < "$CORPUS"
if [ "${#RISKY[@]}" -eq 0 ] || [ "${#SAFE[@]}" -eq 0 ]; then
  echo "FAIL: corpus at $CORPUS has no RISKY or no SAFE entries"
  exit 2
fi

failed=0

Expand Down
68 changes: 68 additions & 0 deletions selftest/test_bb_automerge_risk_patterns.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#!/usr/bin/env bash
# Selftest for bb-automerge.py — exercises HIGH_RISK_PATTERNS against the
# shared corpus at selftest/risk_patterns_corpus.txt. Drift between
# bb-automerge.py and claude-author-automerge.yml on any path listed in the
# corpus is caught here OR by the parallel test_automerge_risk_patterns.sh;
# both selftests read the same corpus, so passing both means matching
# coverage for those paths.
#
# Run from the repo root:
# bash selftest/test_bb_automerge_risk_patterns.sh
#
# Resolution of bb-automerge.py:
# - Prefer $BB_AUTOMERGE_PY if set
# - Else $HOME/.claude/templates/ci-workflows/scripts/bb-automerge.py
# - Else fail
set -euo pipefail

# Locate bb-automerge.py: an explicit (non-empty) BB_AUTOMERGE_PY wins,
# otherwise fall back to the default templates location under $HOME.
if [[ -n "${BB_AUTOMERGE_PY:-}" ]]; then
  SCRIPT="$BB_AUTOMERGE_PY"
else
  SCRIPT="$HOME/.claude/templates/ci-workflows/scripts/bb-automerge.py"
fi
[[ -f "$SCRIPT" ]] || {
  echo "FAIL: bb-automerge.py not found at $SCRIPT"
  echo "Hint: set BB_AUTOMERGE_PY env var to its location"
  exit 2
}

# The shared corpus sits in the same directory as this script.
selftest_dir="$(dirname "$0")"
CORPUS="$selftest_dir/risk_patterns_corpus.txt"
if [[ ! -f "$CORPUS" ]]; then
  echo "FAIL: corpus not found at $CORPUS"
  exit 2
fi

# One-shot Python harness — load the script via importlib, exercise find_high_risk.
# Reads two environment variables: SCRIPT (path to bb-automerge.py) and CORPUS
# (path to the shared corpus). Exit status: 0 = every entry classified as the
# corpus expects, 1 = at least one mismatch or malformed entry, 2 = the script
# could not be loaded as a module.
HARNESS=$(cat <<'PYEOF'
import importlib.util, os, sys

spec = importlib.util.spec_from_file_location('bba', os.environ['SCRIPT'])
if spec is None or spec.loader is None:
    print(f"FAIL: cannot load '{os.environ['SCRIPT']}' as a Python module")
    sys.exit(2)
mod = importlib.util.module_from_spec(spec)
# Register in sys.modules BEFORE exec_module so @dataclass can resolve cls.__module__
# (Python 3.14 stricter behavior — see CPython dataclasses.py line 814).
sys.modules['bba'] = mod
spec.loader.exec_module(mod)

# entry type -> (should find_high_risk report a hit, failure tag, failure text)
EXPECT = {
    'RISKY_BB': (True, 'risky_bb->safe', 'not classified as high-risk by BB patterns'),
    'RISKY': (True, 'risky->safe', 'not classified as high-risk'),
    'SAFE': (False, 'safe->risky', 'incorrectly classified as high-risk'),
}

fail = 0
with open(os.environ['CORPUS'], encoding='utf-8') as corpus:
    for lineno, raw in enumerate(corpus, 1):
        line = raw.strip()
        if not line or line.startswith('#'):
            continue
        # Split on the first colon only, so the path is whatever follows the
        # type tag regardless of how much whitespace comes after the colon.
        kind, sep, path = line.partition(':')
        path = path.strip()
        rule = EXPECT.get(kind) if sep else None
        if rule is None or not path:
            # A typo in the type tag must not silently remove a test case.
            print(f"FAIL [malformed]: corpus line {lineno}: '{line}'")
            fail = 1
            continue
        want_hit, tag, text = rule
        if bool(mod.find_high_risk([path])) != want_hit:
            print(f"FAIL [{tag}]: '{path}' {text}")
            fail = 1
sys.exit(fail)
PYEOF
)
# Run the harness and report the outcome.
# This script runs under `set -e`, so a bare failing python3 would abort the
# script before its status could be inspected; `|| EXIT=$?` captures the status
# and lets the reporting below run on failure as well as success.
EXIT=0
SCRIPT="$SCRIPT" CORPUS="$CORPUS" python3 -c "$HARNESS" || EXIT=$?
if [ "$EXIT" -eq 0 ]; then
  # grep -c exits 1 when it counts zero lines; `|| true` keeps `set -e` from
  # aborting silently in that case. Leading whitespace is allowed because the
  # harness strips each corpus line before classifying it.
  COUNT=$(grep -cE '^[[:space:]]*(RISKY(_BB)?|SAFE):' "$CORPUS" || true)
  if [ "${COUNT:-0}" -eq 0 ]; then
    # Zero cases means nothing was actually verified — do not report PASS.
    echo "FAIL: corpus at $CORPUS contains no test cases"
    exit 2
  fi
  echo "PASS — all $COUNT cases"
else
  echo "FAIL — harness exited with status $EXIT"
fi
exit "$EXIT"
Loading