diff --git a/tooling/edr-rehearsal/.gitignore b/tooling/edr-rehearsal/.gitignore new file mode 100644 index 00000000..e79355ea --- /dev/null +++ b/tooling/edr-rehearsal/.gitignore @@ -0,0 +1,2 @@ +results/* +!results/.gitkeep diff --git a/tooling/edr-rehearsal/README.md b/tooling/edr-rehearsal/README.md new file mode 100644 index 00000000..8dc8063e --- /dev/null +++ b/tooling/edr-rehearsal/README.md @@ -0,0 +1,31 @@ +# tooling/edr-rehearsal — push-button EDR (SentinelOne) rehearsal harness + +Rehearse the signed+notarized macOS runtime against Salesloft's EDR +(SentinelOne) on a throwaway EC2 Mac fixture, pick the surviving artifact by +evidence, and re-image. Authoring/operator tooling — distinct from the build +steps in `packaging/`. Ticket: **WEB-4805**. + +Start with **[RUNBOOK.md](RUNBOOK.md)**. + +| File | What | +|---|---| +| `RUNBOOK.md` | Operator-facing, step-by-step end-to-end procedure | +| `matrix.md` | 2 artifacts × 2 allowlist states × 5 stages, with a results column | +| `lib.sh` | Shared config + the AWS-profile guard + dry-run/confirm gate (sourced, not run) | +| `provision-fixture.sh` | Allocate a fresh dedicated host + instance per chip (`mac2.metal` arm64, `mac1.metal` intel), us-west-2, default profile | +| `install-s1.sh` | Install the SentinelOne agent on a fixture (site token via `S1_SITE_TOKEN`) | +| `run-rehearsal.sh` | Drive one artifact (`--artifact pyinstaller\|nuitka`) through install → onboard → all hook events → discovery daemon → `--clear` | +| `capture-telemetry.sh` | Collect S1 detections + Storyline + our logs, tagged `{artifact, allowlist, run-id}` | +| `teardown.sh` | Terminate the instance + release the dedicated host (the re-image) | +| `results/` | Per-cell evidence dirs (gitignored) | + +## Safety + +Every live-action script **defaults to dry-run** (prints the exact commands, +touches nothing) and requires `--execute` to do anything real. 
AWS-touching +scripts also warn about the **24-hour dedicated-host minimum** and require an +interactive `yes`. The **benchling AWS profile is hard-refused**. Nothing here +can block a developer's daily machine — fail-open is sacred. + +All scripts pass `shellcheck` and follow the `set -euo pipefail` / `HERE` / +heredoc conventions used in `packaging/scripts/`. diff --git a/tooling/edr-rehearsal/RUNBOOK.md b/tooling/edr-rehearsal/RUNBOOK.md new file mode 100644 index 00000000..f9c6e2d5 --- /dev/null +++ b/tooling/edr-rehearsal/RUNBOOK.md @@ -0,0 +1,197 @@ +# EDR (SentinelOne) rehearsal — operator runbook (WEB-4805) + +> One shot. We rehearse the signed+notarized macOS runtime against Salesloft's +> actual EDR (SentinelOne) on a throwaway EC2 Mac fixture **before** the +> customer ever sees it, pick the artifact that survives by evidence, and +> re-image so the rehearsal never pollutes the real fixtures. + +Ticket: https://linear.app/unboundsec/issue/WEB-4805 +Project: Non-Python MDM Rollout (Mac fleet) — customer Salesloft (~1,150-Mac +Jamf fleet). + +## What we are deciding + +Two signed artifacts ship from the same pipeline; we run BOTH end-to-end +against S1 and keep the one that stays clean: + +| Artifact | Source | pkg | +|---|---|---| +| PyInstaller (default) | WEB-4786 / WEB-4787 | `unbound-runtime-0.1.0.pkg` | +| Nuitka | WEB-4804 (PR #132) | `unbound-runtime-0.1.0-nuitka.pkg` (`-nuitka` suffix; `workflow_dispatch builder=nuitka`) | + +**Winner = clears S1 AND notarizes AND passes the bare-Mac universal2 gate** +(`packaging/scripts/lipo-gate.sh`). Pre-allowlisting with Mike (WEB-4784) may +settle it for either artifact even at `allowlist=none`. 
+ +## Coordinates (source of truth — do not re-derive) + +| Thing | Value | +|---|---| +| Apple Team ID (signer/cert allowlist) | `ZMA55FTA8W` ("Websentry Inc") | +| Released pkg | `https://unbound-release-artifacts.s3.us-west-2.amazonaws.com/macos/0.1.0/unbound-runtime-0.1.0.pkg` | +| onboard.sh | `https://unbound-release-artifacts.s3.us-west-2.amazonaws.com/macos/0.1.0/onboard.sh` | +| Install layout | `/opt/unbound/current/{unbound-hook,unbound-discovery}/` ; LaunchDaemon `ai.getunbound.discovery` | +| AWS | `us-west-2`, **default** profile only (NEVER the benchling profile) | + +## Open dependency — DO NOT BLOCK on it + +The S1 **tenant + site token** are pending the WEB-4805 sourcing decision +(Salesloft's EDR is confirmed SentinelOne; the tenant we rehearse against — +Salesloft-supplied vs an Unbound S1 trial — is the open question, tracked +against WEB-4784). The harness is fully parameterized: when the token + agent +pkg land, drop them into the env vars below — no script edits required. + +This runbook and all scripts can be exercised in **dry-run today** (they print +every command and touch nothing). + +## Allowlist strategy under test + +| State | S1 console configuration | +|---|---| +| `none` | No exclusions. Baseline. | +| `team-id` | Signer/cert exclusion on **ZMA55FTA8W**, scope **Suppress Alerts** (NOT Interop) **+** path exclusion `/opt/unbound/*` for the LaunchDaemon. | + +Set the console policy to the matching state **before** each run; the scripts +only tag captures with the state, they do not configure S1's console. + +--- + +## Prerequisites + +- `aws` CLI authenticated to the **default** profile (the org payer / dev + account — see `project_unbound_aws_org`). The harness hard-refuses any + profile name containing `benchling`. +- An SSH keypair registered in EC2 (`EC2_KEY_NAME`), plus a subnet + security + group that allows SSH from your egress IP. Export them so the scripts emit + concrete commands: + ``` + export EC2_KEY_NAME=... 
EC2_SUBNET_ID=subnet-... + export EC2_SECURITY_GROUP_ID=sg-... + ``` +- `shellcheck` (CI runs it; scripts are clean). +- The S1 agent pkg URL + site token + console API token (pending; see above): + ``` + export S1_SITE_TOKEN=... # registration token, never on argv + export S1_API_TOKEN=... # console read token (capture only) + export S1_CONSOLE_URL=https://<tenant>.sentinelone.net + ``` +- Rehearsal onboarding keys (a scoped, throwaway tenant — never a prod admin + key): + ``` + export ONBOARD_API_KEY=... ONBOARD_DISCOVERY_KEY=... + ``` + +> **Every live-action script defaults to DRY-RUN.** Run it once without +> `--execute` to read the exact commands, then add `--execute` when you mean +> it. AWS-touching scripts also print a cost warning and require typing `yes` +> (or `--yes`). Mac dedicated hosts bill a **24-hour minimum** per host. + +--- + +## Step 1 — Provision fresh fixtures (both chips) + +``` +./provision-fixture.sh --chip both # dry-run: prints every aws call +./provision-fixture.sh --chip both --execute # allocates host + instance per chip +``` + +- arm64 → `mac2.metal`, intel → `mac1.metal` (the Intel slice must be proven on + real x86_64 hardware, not just present in `lipo` output). +- The script resolves the newest Apple macOS AMI per chip, launches one + instance onto a dedicated host, waits for `instance-status-ok`, and tells you + to record `HOST_ID`/`INSTANCE_ID` into `results/fixtures-<chip>.env` (consumed + by `teardown.sh`). +- Note each fixture's public IP for the next steps. + +## Step 2 — Install the SentinelOne agent + +``` +S1_SITE_TOKEN=... ./install-s1.sh --host <fixture-ip> --pkg <s1-agent-pkg-url> # dry-run +S1_SITE_TOKEN=... ./install-s1.sh --host <fixture-ip> --pkg <s1-agent-pkg-url> --execute # installs +``` + +Repeat per fixture. Confirm in the S1 console that each fixture is registered +and online before rehearsing. The site token is read from env, never argv (it +would otherwise leak via `ps`).
+ +## Step 3 — Run the full matrix + +For each cell — **2 artifacts × 2 allowlist states** — set the S1 console +allowlist to the matching state, then drive the lifecycle. Use a stable +`--run-id` so the capture pairs with the run. + +``` +# allowlist = none +./run-rehearsal.sh --host <fixture-ip> --artifact pyinstaller --allowlist none --run-id r1 --execute +./run-rehearsal.sh --host <fixture-ip> --artifact nuitka --allowlist none --run-id r1 --execute +# allowlist = team-id (set ZMA55FTA8W suppress + /opt/unbound/* path excl. in S1 first) +./run-rehearsal.sh --host <fixture-ip> --artifact pyinstaller --allowlist team-id --run-id r1 --execute +./run-rehearsal.sh --host <fixture-ip> --artifact nuitka --allowlist team-id --run-id r1 --execute +``` + +Each run drives, in order: **pkg install → onboard.sh → all 5 hook events +(PreToolUse, PostToolUse, UserPromptSubmit, Stop, SessionStart) → discovery +daemon scheduled run → `--clear`**, and captures our own per-stage logs to +`results/<artifact>_<allowlist>_<run-id>/`. + +> A non-zero stage does **not** abort the run — it is logged and the matrix +> continues. We are measuring what S1 does, and fail-open is sacred: a hook +> that fails open is expected behavior, not a stop condition. + +**Re-image between cells that share a fixture.** The cleanest re-image is +terminate + release the host (Step 5) and re-provision (Step 1) — a fresh host +boots a clean AMI with no S1/runtime residue. At minimum, run the artifact's +`--clear` (the rehearsal does this as Stage 5) and confirm `/opt/unbound` is +gone before the next install. + +## Step 4 — Capture telemetry per cell + +``` +S1_API_TOKEN=... S1_CONSOLE_URL=... \ + ./capture-telemetry.sh --host <fixture-ip> --artifact pyinstaller --allowlist none --run-id r1 # dry-run +S1_API_TOKEN=... S1_CONSOLE_URL=... \ + ./capture-telemetry.sh --host <fixture-ip> --artifact pyinstaller --allowlist none --run-id r1 --execute # collects +``` + +Collects, into `results/<artifact>_<allowlist>_<run-id>/`: +- S1 console: agent record, threats/detections, activities (Storyline-adjacent) + scoped to the fixture.
For each threat id, also export the full Storyline + (process tree) from the console — see the note the script prints. +- Our-side logs pulled off the fixture (`/var/log/unbound/discovery*.log`) plus + the per-stage logs `run-rehearsal.sh` captured. +- `metadata.txt` provenance stamp (artifact, allowlist, run-id, host, team-id, + captured-at). + +Pass `--since <iso8601-utc>` to scope S1 queries to the rehearsal window. + +## Step 5 — Pick the winner, then teardown + +1. Fill `matrix.md` from the evidence dirs. +2. Apply the decision rule: **clears S1 (ideally even at `allowlist=none`, and + certainly at `team-id`) AND notarizes AND passes the bare-Mac lipo gate**. + 0.1.0 already notarizes; the lipo gate is `packaging/scripts/lipo-gate.sh` + over each artifact's `dist/`. +3. Release the fixtures so billing stops: + ``` + ./teardown.sh --chip both # dry-run + ./teardown.sh --chip both --execute # terminate instances + release hosts + ``` + `teardown.sh` reads ids from `results/fixtures-<chip>.env`, or pass + `--instance-id`/`--host-id`. If you lost the ids, the script prints the + `describe-instances` query that finds them by the `unbound:purpose` tag. + +> **Re-imaging is the teardown.** The rehearsal must not pollute the Stream V +> fixtures: terminate + release, and provision fresh for any further runs. + +--- + +## Safety invariants (do not weaken) + +- Nothing here can block a developer's daily machine. Every live action targets + a throwaway EC2 Mac fixture and is `--execute`-gated; the runtime fails open + by design. +- The benchling AWS profile is hard-refused (`lib.sh`). +- Secrets (S1 site/API tokens, onboarding keys) come from env, never argv, and + are never written to the results dir except as the literal name in echoed + commands. +- Captured results may contain endpoint/host data — `results/` is gitignored.
diff --git a/tooling/edr-rehearsal/capture-telemetry.sh b/tooling/edr-rehearsal/capture-telemetry.sh new file mode 100755 index 00000000..0a49cf57 --- /dev/null +++ b/tooling/edr-rehearsal/capture-telemetry.sh @@ -0,0 +1,147 @@ +#!/bin/bash +# Collect the evidence for one rehearsal cell (WEB-4805): S1 detections/threats +# + Storyline export from the S1 console API, plus our own install/hook/ +# discovery logs pulled off the fixture. Everything lands in a results dir +# tagged {artifact, allowlist-state, run-id} so matrix.md can be filled from +# files, not memory. +# +# The run-id is a PARAMETER (--run-id) so a capture is reproducible and matches +# the run-rehearsal.sh tag exactly. If omitted, it defaults to a UTC timestamp +# (fine for an interactive one-off; pass --run-id to pair with a specific run). +# +# DRY-RUN BY DEFAULT. --execute performs the S1 API queries + SSH log pulls. +# Reads only — capture never changes the fixture or the S1 tenant. +# +# Usage: +# S1_API_TOKEN=... capture-telemetry.sh --host --artifact pyinstaller|nuitka \ +# --allowlist none|team-id [--run-id ] [--since ] [--execute] +set -euo pipefail + +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=./lib.sh +source "$HERE/lib.sh" + +TARGET_HOST="" +ARTIFACT="" +ALLOWLIST="" +RUN_ID="" +SINCE="" +EXECUTE=0 +SSH_USER="${SSH_USER:-ec2-user}" + +# S1 console (mgmt) base URL + read API token. Pending WEB-4805 vendor decision. 
+S1_CONSOLE_URL="${S1_CONSOLE_URL:-}"
+
+# need_value FLAG [VALUE...]: abort with a readable message when a
+# value-taking flag is the last argument. Without this check `shift 2` fails
+# under `set -e` and the script exits 1 without printing anything.
+need_value() {
+  [[ $# -ge 2 ]] || die "$1 requires a value (see --help)"
+}
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --host) need_value "$@"; TARGET_HOST="$2"; shift 2 ;;
+    --artifact) need_value "$@"; ARTIFACT="$2"; shift 2 ;;
+    --allowlist) need_value "$@"; ALLOWLIST="$2"; shift 2 ;;
+    --run-id) need_value "$@"; RUN_ID="$2"; shift 2 ;;
+    --since) need_value "$@"; SINCE="$2"; shift 2 ;;
+    --execute) EXECUTE=1; shift ;;
+    -h|--help) grep '^#' "$0" | sed 's/^# \{0,1\}//'; exit 0 ;;
+    *) die "unknown argument: $1 (see --help)" ;;
+  esac
+done
+
+[[ -n "$TARGET_HOST" ]] || die "--host is required"
+case "$ARTIFACT" in
+  pyinstaller|nuitka) ;;
+  *) die "--artifact must be pyinstaller or nuitka" ;;
+esac
+case "$ALLOWLIST" in
+  none|team-id) ;;
+  *) die "--allowlist must be none or team-id" ;;
+esac
+# Default run-id is a timestamp; pass --run-id to pair with a specific
+# run-rehearsal.sh cell. (Default kept out of the inline command paths so the
+# value is stable for the whole invocation.)
+RUN_ID="${RUN_ID:-$(date -u +%Y%m%dT%H%M%SZ)}"
+
+TAG="${ARTIFACT}_${ALLOWLIST}_${RUN_ID}"
+OUT="$RESULTS_DIR/$TAG"
+
+# s1_get DESC PATH OUTFILE
+# Print the S1 console GET for PATH; on --execute also run it and write the
+# response body to $OUT/OUTFILE.
+# The API token is fed to curl as a header line on stdin (-H @-) so it never
+# appears on curl's argv, where `ps` could read it. printf is a shell builtin,
+# so the token is not on any process's command line.
+# A missing console URL or token is a hard stop; a failed query is only a
+# warning, so one bad endpoint does not abort the rest of the capture.
+s1_get() {
+  local desc="$1" path="$2" outfile="$3"
+  log "$desc"
+  emit_cmd "printf 'Authorization: ApiToken %s\\n' \"\$S1_API_TOKEN\" | curl -fsS -H @- '$S1_CONSOLE_URL$path' > $OUT/$outfile"
+  if [[ $EXECUTE -eq 1 ]]; then
+    require_tool curl
+    [[ -n "$S1_CONSOLE_URL" ]] || die "S1_CONSOLE_URL unset (base URL of the S1 management console)"
+    [[ -n "${S1_API_TOKEN:-}" ]] || die "S1_API_TOKEN unset (read token from the S1 console; do not pass on argv)"
+    printf 'Authorization: ApiToken %s\n' "$S1_API_TOKEN" \
+      | curl -fsS -H @- "$S1_CONSOLE_URL$path" > "$OUT/$outfile" \
+      || warn "S1 query failed: $desc (left $OUT/$outfile possibly empty)"
+  fi
+}
+
+# pull_log REMOTE_PATH LOCAL_NAME
+# Print the scp that copies REMOTE_PATH off the fixture; on --execute run it
+# into $OUT/LOCAL_NAME. A failed copy is only a warning because the log may
+# legitimately not exist for a given stage.
+pull_log() {
+  local remote="$1" local_name="$2"
+  log "pull $remote"
+  emit_cmd "scp $SSH_USER@$TARGET_HOST:$remote $OUT/$local_name"
+  if [[ $EXECUTE -eq 1 ]]; then
+    require_tool scp
+    scp -o StrictHostKeyChecking=accept-new "$SSH_USER@$TARGET_HOST:$remote" "$OUT/$local_name" 2>/dev/null \
+      || warn "could not pull $remote (may not exist for this stage — 
ok)"
+  fi
+}
+
+# main: capture one rehearsal cell's evidence into $OUT — the S1 console
+# queries, the fixture's discovery logs, then a provenance stamp. In dry-run
+# every step is only printed and $OUT is not created.
+main() {
+  section "Capture telemetry — $TAG"
+  log "Results dir: $OUT"
+  if [[ $EXECUTE -eq 1 ]]; then
+    mkdir -p "$OUT"
+  else
+    section "DRY RUN — no API/SSH calls. Re-run with --execute."
+  fi
+
+  section "1) SentinelOne console: threats + Storyline for this fixture"
+  # All three queries are filtered with computerName__contains=$FIXTURE_TAG,
+  # i.e. to endpoints whose computer name contains the fixture tag. They are
+  # NOT keyed on $TARGET_HOST.
+  # NOTE(review): nothing visible in this script sets the fixture's computer
+  # name to include $FIXTURE_TAG — confirm, or these queries return no rows.
+  s1_get "agent record for fixture" \
+    "/web/api/v2.1/agents?computerName__contains=${FIXTURE_TAG}" \
+    "s1_agents.json"
+  local since_q=""
+  if [[ -n "$SINCE" ]]; then
+    since_q="&createdAt__gte=${SINCE}"
+  fi
+  s1_get "threats/detections for fixture" \
+    "/web/api/v2.1/threats?computerName__contains=${FIXTURE_TAG}${since_q}" \
+    "s1_threats.json"
+  s1_get "activities (Storyline-adjacent events)" \
+    "/web/api/v2.1/activities?computerName__contains=${FIXTURE_TAG}${since_q}" \
+    "s1_activities.json"
+  log "NOTE: full Storyline (process-tree) export is per-threat — for each threat id in"
+  log " s1_threats.json, also fetch /web/api/v2.1/threats/<threat-id>/explore/* or export"
+  log " the Deep Visibility query from the console UI into $OUT/storyline/."
+
+  section "2) Our-side logs off the fixture"
+  pull_log "/var/log/unbound/discovery.log" "unbound-discovery.log"
+  pull_log "/var/log/unbound/discovery.err.log" "unbound-discovery.err.log"
+  # No copy of run-rehearsal.sh's per-stage logs is needed here: they are
+  # written under the same {artifact}_{allowlist}_{run-id} tag, which is $OUT
+  # itself. (The cp that used to sit here was guarded by
+  # "$RESULTS_DIR/$TAG" != "$OUT", which can never be true because OUT is
+  # defined as exactly that path.) Pass the same --run-id as the run so the
+  # two land in one directory.
+
+  section "3) Provenance stamp"
+  emit_cmd "write $OUT/metadata.txt (artifact, allowlist, run-id, host, team-id, captured-at)"
+  if [[ $EXECUTE -eq 1 ]]; then
+    {
+      printf 'artifact=%s\n' "$ARTIFACT"
+      printf 'allowlist=%s\n' "$ALLOWLIST"
+      printf 'run_id=%s\n' "$RUN_ID"
+      printf 'fixture_host=%s\n' "$TARGET_HOST"
+      printf 'team_id=%s\n' "$TEAM_ID"
+      printf 'release_version=%s\n' "$RELEASE_VERSION"
+      printf 'captured_at=%s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
+    } > "$OUT/metadata.txt"
+    log "wrote $OUT/metadata.txt"
+  fi
+
+  section "Done"
+  log "Fill the matching row in matrix.md from the files in $OUT."
+  if [[ $EXECUTE -eq 0 ]]; then
+    section "DRY RUN complete. Nothing was captured."
+  fi
+}
+
+main
diff --git a/tooling/edr-rehearsal/install-s1.sh b/tooling/edr-rehearsal/install-s1.sh
new file mode 100755
index 00000000..583c95c8
--- /dev/null
+++ b/tooling/edr-rehearsal/install-s1.sh
@@ -0,0 +1,108 @@
+#!/bin/bash
+# Install the SentinelOne (S1) agent on an EC2 Mac fixture for the EDR
+# rehearsal (WEB-4805). Parameterized: the S1 site token comes from the
+# S1_SITE_TOKEN env var (never argv — tokens leak via ps/cmdline) and the
+# signed agent pkg path/URL via --pkg.
+#
+# TODO(WEB-4805 / WEB-4784): The S1 tenant + site token are PENDING the vendor
+# sourcing decision (Salesloft's actual EDR is confirmed SentinelOne; the
+# tenant we rehearse against — Salesloft-supplied vs an Unbound trial — is the
+# open WEB-4805 question). This script is fully parameterized so dropping the
+# real token + pkg in later requires no edits.
+#
+# DRY-RUN BY DEFAULT: prints the install commands and exits 0. --execute runs
+# the install on the fixture over SSH. install-s1 never touches the local
+# machine — TARGET_HOST is the fixture.
+#
+# Usage:
+# S1_SITE_TOKEN=... 
install-s1.sh --host <fixture-ip> --pkg <agent-pkg-url> [--execute] [--yes]
+set -euo pipefail
+
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=./lib.sh
+source "$HERE/lib.sh"
+
+TARGET_HOST=""
+S1_PKG=""
+EXECUTE=0
+ASSUME_YES=0
+SSH_USER="${SSH_USER:-ec2-user}"
+
+# need_value FLAG [VALUE...]: abort with a readable message when a
+# value-taking flag is the last argument. Without this check `shift 2` fails
+# under `set -e` and the script exits 1 without printing anything.
+need_value() {
+  [[ $# -ge 2 ]] || die "$1 requires a value (see --help)"
+}
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --host) need_value "$@"; TARGET_HOST="$2"; shift 2 ;;
+    --pkg) need_value "$@"; S1_PKG="$2"; shift 2 ;;
+    --execute) EXECUTE=1; shift ;;
+    --yes) ASSUME_YES=1; shift ;;
+    -h|--help) grep '^#' "$0" | sed 's/^# \{0,1\}//'; exit 0 ;;
+    *) die "unknown argument: $1 (see --help)" ;;
+  esac
+done
+
+[[ -n "$TARGET_HOST" ]] || die "--host is required (the EC2 Mac fixture, never your laptop)"
+[[ -n "$S1_PKG" ]] || die "--pkg is required (the Salesloft/S1-supplied signed agent)"
+
+# The token is the one secret we refuse to default or print. Its absence is a
+# hard stop on --execute; in dry-run we only note that it must be set.
+if [[ -z "${S1_SITE_TOKEN:-}" ]]; then
+  if [[ $EXECUTE -eq 1 ]]; then
+    die "S1_SITE_TOKEN is unset. Source it from the S1 console; do NOT pass it on argv. (Pending WEB-4805 vendor decision.)"
+  fi
+  warn "S1_SITE_TOKEN unset — fine for dry-run, but --execute requires it."
+fi
+
+# remote DESC CMD...: print the ssh command, and on --execute run CMD on the
+# fixture. Any remote failure aborts the install.
+remote() {
+  local desc="$1"; shift
+  log "$desc"
+  emit_cmd "ssh $SSH_USER@$TARGET_HOST -- $*"
+  if [[ $EXECUTE -eq 1 ]]; then
+    require_tool ssh
+    ssh -o StrictHostKeyChecking=accept-new "$SSH_USER@$TARGET_HOST" -- "$@" \
+      || die "remote step failed: $desc"
+  fi
+}
+
+# push_site_token DESC: write the LOCAL $S1_SITE_TOKEN into the
+# registration-token file on the fixture.
+# The token is streamed over ssh's stdin. Expanding \$S1_SITE_TOKEN inside the
+# remote command (as this step used to) reads the FIXTURE's environment, where
+# the variable is not set (ssh does not forward the local environment by
+# default), so an empty token file was written.
+# Streaming also keeps the token off every argv, local and remote.
+push_site_token() {
+  local desc="$1"
+  local sink="sudo /bin/sh -c 'umask 077; cat > /tmp/com.sentinelone.registration-token'"
+  log "$desc"
+  emit_cmd "printf %s \"\$S1_SITE_TOKEN\" | ssh $SSH_USER@$TARGET_HOST -- $sink"
+  if [[ $EXECUTE -eq 1 ]]; then
+    require_tool ssh
+    printf '%s' "$S1_SITE_TOKEN" \
+      | ssh -o StrictHostKeyChecking=accept-new "$SSH_USER@$TARGET_HOST" -- "$sink" \
+      || die "remote step failed: $desc"
+  fi
+}
+
+# main: stage the agent pkg, drop the site token, install, verify, scrub.
+# In dry-run every step is only printed.
+main() {
+  section "SentinelOne agent install -> fixture $TARGET_HOST"
+  log "Site token: \$S1_SITE_TOKEN (env, not printed)"
+  log "Agent pkg : $S1_PKG"
+
+  if [[ $EXECUTE -eq 1 ]]; then
+    warn "EXECUTE MODE: this installs an EDR agent on $TARGET_HOST."
+    confirm_or_die "Install SentinelOne on fixture $TARGET_HOST? Type 'yes': "
+  else
+    section "DRY RUN — no SSH/install will run. Re-run with --execute."
+  fi
+
+  # S1 macOS install is a standard pkg install; the site token is supplied via
+  # the registration token file S1 reads at first boot (com.sentinelone.*).
+  # Exact mechanism is vendor-version-specific — confirm against the S1 console
+  # install instructions for the tenant chosen in WEB-4805.
+  remote "1) Stage the S1 agent pkg on the fixture" \
+    "curl -fSL -o /tmp/s1-agent.pkg '$S1_PKG'"
+
+  push_site_token "2) Drop the registration (site) token where the agent reads it"
+
+  remote "3) Install the agent" \
+    "sudo installer -pkg /tmp/s1-agent.pkg -target /"
+
+  remote "4) Verify the agent is registered + online" \
+    "sudo /usr/local/bin/sentinelctl management status || true"
+
+  remote "5) Scrub the staged token + pkg" \
+    "sudo rm -f /tmp/com.sentinelone.registration-token /tmp/s1-agent.pkg"
+
+  section "Allowlist note"
+  log "The ZMA55FTA8W signer exclusion + /opt/unbound/* path exclusion are configured"
+  log "in the S1 CONSOLE/policy, not on the endpoint. run-rehearsal.sh exercises BOTH"
+  log "allowlist states; toggle the console policy between runs and tag captures"
+  log "with --allowlist none|team-id accordingly (see matrix.md)."
+
+  if [[ $EXECUTE -eq 0 ]]; then
+    section "DRY RUN complete. Nothing was installed."
+  fi
+}
+
+main
diff --git a/tooling/edr-rehearsal/lib.sh b/tooling/edr-rehearsal/lib.sh
new file mode 100755
index 00000000..5cb53987
--- /dev/null
+++ b/tooling/edr-rehearsal/lib.sh
@@ -0,0 +1,98 @@
+#!/bin/bash
+# Shared config + helpers for the EDR (SentinelOne) rehearsal harness
+# (WEB-4805). Sourced by every script in this directory so the AWS-profile
+# guard, dry-run/confirm gate, and result-dir conventions are identical
+# everywhere. Not executable on its own.
+#
+# IMPORTANT (fail-open is sacred): nothing in this harness installs or runs
+# the unbound runtime against a developer's daily machine. 
Every live action
+# targets a throwaway EC2 Mac fixture and is gated behind --execute. There is
+# no code path here that can BLOCK dev work.
+
+# --- Coordinates (source of truth: WEB-4805 + packaging/README.md) -----------
+# These config vars are consumed by the scripts that SOURCE this file, not by
+# lib.sh itself; shellcheck can't see cross-file use, so silence SC2034 for the
+# config block below.
+# shellcheck disable=SC2034
+AWS_REGION="${AWS_REGION:-us-west-2}"
+AWS_AZ="${AWS_AZ:-us-west-2a}"
+# The DEFAULT profile only. NEVER benchling (that account is single-tenant for
+# the Benchling POV). Overridable for an operator's own non-default sandbox,
+# but the guard below hard-refuses anything matching /benchling/.
+AWS_PROFILE_NAME="${AWS_PROFILE_NAME:-default}"
+
+# Apple Team ID for the signer/cert allowlist under test ("Websentry Inc").
+TEAM_ID="ZMA55FTA8W"
+
+# Released, signed+notarized artifacts (runtime-v0.1.0).
+RELEASE_VERSION="${RELEASE_VERSION:-0.1.0}"
+ARTIFACT_BASE="https://unbound-release-artifacts.s3.us-west-2.amazonaws.com/macos/${RELEASE_VERSION}"
+PKG_URL="${ARTIFACT_BASE}/unbound-runtime-${RELEASE_VERSION}.pkg"
+ONBOARD_URL="${ARTIFACT_BASE}/onboard.sh"
+
+# On-disk install layout (packaging/README.md).
+INSTALL_PREFIX="/opt/unbound"
+HOOK_BIN="${INSTALL_PREFIX}/current/unbound-hook/unbound-hook"
+DISCOVERY_BIN="${INSTALL_PREFIX}/current/unbound-discovery/unbound-discovery"
+DAEMON_LABEL="ai.getunbound.discovery"
+
+# EC2 plumbing — placeholders an operator fills in for their VPC/subnet/SG/key.
+# Left as <...> so a dry-run reads clearly and an accidental --execute with
+# unset plumbing fails loudly at the aws boundary rather than launching into a
+# default VPC. The literal <...> defaults are deliberately not valid AWS
+# identifiers; export the real values before running with --execute.
+EC2_KEY_NAME="${EC2_KEY_NAME:-<key-name>}"
+EC2_SUBNET_ID="${EC2_SUBNET_ID:-<subnet-id>}"
+EC2_SECURITY_GROUP_ID="${EC2_SECURITY_GROUP_ID:-<security-group-id>}"
+EC2_MACOS_AMI_OWNER="${EC2_MACOS_AMI_OWNER:-amazon}"
+
+# Tagging so every rehearsal resource is findable + teardownable. 
+FIXTURE_TAG="${FIXTURE_TAG:-unbound-edr-rehearsal}" +TAG_KEY="unbound:purpose" +TAG_VALUE="web-4805-edr-rehearsal" + +# Results land beside the scripts by default. +RESULTS_DIR="${RESULTS_DIR:-$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/results}" + +# --- Output helpers ---------------------------------------------------------- +log() { printf ' %s\n' "$*"; } +warn() { printf 'WARNING: %s\n' "$*" >&2; } +die() { printf 'ERROR: %s\n' "$*" >&2; exit 1; } +section() { printf '\n=== %s ===\n' "$*"; } + +# emit_cmd: in dry-run, this is the ONLY way a "live" command surfaces — it is +# printed, never run. In --execute the caller runs the real aws/ssh command +# itself; emit_cmd is still used to echo it for the operator's audit log. +emit_cmd() { printf ' $ %s\n' "$*"; } + +# Common aws args every call shares: region + the (guarded) profile. +aws_common_args() { printf -- '--region %s --profile %s' "$AWS_REGION" "$AWS_PROFILE_NAME"; } + +# Hard refusal of the benchling profile (and any non-default unless the +# operator opted in via AWS_PROFILE_NAME). This is the one guard that must +# never be removed. +preflight_aws_profile_guard() { + case "$AWS_PROFILE_NAME" in + *benchling*) die "refusing to use a benchling AWS profile ('$AWS_PROFILE_NAME'). The rehearsal runs in the DEFAULT account only." ;; + esac + log "AWS profile: $AWS_PROFILE_NAME, region: $AWS_REGION (benchling profile is hard-refused)" +} + +# confirm_or_die: interactive gate for --execute paths. Honors --yes via the +# ASSUME_YES the caller sets. Reads from the terminal, not stdin, so it works +# even when stdin is a pipe. +confirm_or_die() { + local prompt="$1" reply="" + if [[ "${ASSUME_YES:-0}" -eq 1 ]]; then + log "(--yes given; skipping interactive confirmation)" + return 0 + fi + if [[ ! -t 0 ]] && [[ ! -r /dev/tty ]]; then + die "refusing to proceed without confirmation (no TTY). Re-run with --yes if you are certain." 
+ fi + printf '%s' "$prompt" > /dev/tty + read -r reply < /dev/tty + [[ "$reply" == "yes" ]] || die "not confirmed (got '$reply'); aborting." +} + +# require_tool: friendly failure if a CLI the --execute path needs is absent. +require_tool() { command -v "$1" >/dev/null 2>&1 || die "required tool not found: $1"; } diff --git a/tooling/edr-rehearsal/matrix.md b/tooling/edr-rehearsal/matrix.md new file mode 100644 index 00000000..995b72da --- /dev/null +++ b/tooling/edr-rehearsal/matrix.md @@ -0,0 +1,67 @@ +# EDR (SentinelOne) rehearsal test matrix — WEB-4805 + +2 artifacts × 2 allowlist states × 5 lifecycle stages. Fill the **Result** +column from the captured evidence (`capture-telemetry.sh` writes one results +dir per cell, tagged `{artifact}_{allowlist}_{run-id}`). A cell "passes" when +S1 raised **no blocking/quarantine action** AND our binary still ran +fail-open (the hook stage output must contain the vendored module's +`"suppressOutput": true`, exactly as `packaging/scripts/smoke-test.sh` +asserts — proof the binary executed, not that S1 merely stayed quiet because +nothing ran). + +Ticket: https://linear.app/unboundsec/issue/WEB-4805 + +## Allowlist states + +| State | S1 console config under test | +|---|---| +| `none` | No Unbound exclusions. Baseline — what S1 does to an un-allowlisted fleet. | +| `team-id` | Signer/cert exclusion on Team ID **ZMA55FTA8W** ("Websentry Inc"), scope **Suppress Alerts** (NOT broad Interop mode) **+** path exclusion `/opt/unbound/*` for the LaunchDaemon. 
| + +## Matrix + +### Artifact: PyInstaller (default; WEB-4786 / WEB-4787) + +| Allowlist | Stage | What S1 sees | Result (detections / verdict) | Evidence dir | +|---|---|---|---|---| +| none | install (pkg) | `installer -pkg` of signed/notarized pkg, postinstall pre-warm + LaunchDaemon bootstrap | | | +| none | onboard.sh | `unbound-hook setup` writes config, bootstraps daemon | | | +| none | hook events | 5 events (PreToolUse, PostToolUse, UserPromptSubmit, Stop, SessionStart) | | | +| none | discovery daemon | root LaunchDaemon scan, multi-user `/Users/*` iteration, MCP scans | | | +| none | --clear | binary clear + system sweep (bootout, rm) | | | +| team-id | install (pkg) | same, with ZMA55FTA8W suppress + `/opt/unbound/*` excl. | | | +| team-id | onboard.sh | | | | +| team-id | hook events | | | | +| team-id | discovery daemon | | | | +| team-id | --clear | | | | + +### Artifact: Nuitka (WEB-4804; merged via PR #132) + +| Allowlist | Stage | What S1 sees | Result (detections / verdict) | Evidence dir | +|---|---|---|---|---| +| none | install (pkg) | `-nuitka` pkg install | | | +| none | onboard.sh | | | | +| none | hook events | | | | +| none | discovery daemon | | | | +| none | --clear | | | | +| team-id | install (pkg) | | | | +| team-id | onboard.sh | | | | +| team-id | hook events | | | | +| team-id | discovery daemon | | | | +| team-id | --clear | | | | + +## Decision (fill after the runs) + +Winner is the artifact that **clears S1** (ideally even at `allowlist=none`, +and certainly at `allowlist=team-id`) **AND** notarizes **AND** passes the +bare-Mac universal2 gate (`packaging/scripts/lipo-gate.sh`). 
+ +| Criterion | PyInstaller | Nuitka | +|---|---|---| +| Clears S1 @ allowlist=none | | | +| Clears S1 @ allowlist=team-id | | | +| Notarizes (already true for shipped 0.1.0) | yes | | +| Passes bare-Mac lipo gate | | | +| **Winner** | | | + +**Chosen artifact:** _____ **Rationale:** _____ diff --git a/tooling/edr-rehearsal/provision-fixture.sh b/tooling/edr-rehearsal/provision-fixture.sh new file mode 100755 index 00000000..6b78aa12 --- /dev/null +++ b/tooling/edr-rehearsal/provision-fixture.sh @@ -0,0 +1,158 @@ +#!/bin/bash +# Provision a FRESH EC2 Mac fixture for the EDR (SentinelOne) rehearsal +# (WEB-4805). Mac instances require a dedicated host, so this allocates a +# dedicated host AND launches one instance on it, for one or both chip types: +# +# arm64 -> mac2.metal (Apple silicon; default macOS AMI is arm64) +# intel -> mac1.metal (x86_64; proves the universal2 Intel slice on real HW) +# +# us-west-2, the DEFAULT aws profile. NEVER the benchling profile (that account +# is single-tenant for the Benchling POV — see project_benchling_single_tenant). +# +# DRY-RUN BY DEFAULT: prints every aws command it WOULD run and exits 0 without +# touching AWS. Pass --execute to actually allocate. Even with --execute it +# prints a cost warning and waits for confirmation, because Mac dedicated hosts +# bill a NON-NEGOTIABLE 24-hour minimum per allocation. +# +# Usage: +# provision-fixture.sh [--chip arm64|intel|both] [--execute] [--yes] +# +# Nothing here installs SentinelOne or runs the rehearsal — see install-s1.sh +# and run-rehearsal.sh. Tear down with teardown.sh when finished. 
+set -euo pipefail + +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=./lib.sh +source "$HERE/lib.sh" + +CHIP="both" +EXECUTE=0 +ASSUME_YES=0 + +while [[ $# -gt 0 ]]; do + case "$1" in + --chip) CHIP="${2:-}"; shift 2 ;; + --execute) EXECUTE=1; shift ;; + --yes) ASSUME_YES=1; shift ;; + -h|--help) grep '^#' "$0" | sed 's/^# \{0,1\}//'; exit 0 ;; + *) die "unknown argument: $1 (see --help)" ;; + esac +done + +case "$CHIP" in + arm64|intel|both) ;; + *) die "--chip must be arm64, intel, or both (got: $CHIP)" ;; +esac + +# instance type + AMI-name pattern per chip. The AMI ID is resolved at +# provision time (it changes with every macOS point release) rather than +# pinned here — a stale baked AMI ID is a silent provisioning failure. +chip_instance_type() { + case "$1" in + arm64) echo "mac2.metal" ;; + intel) echo "mac1.metal" ;; + esac +} +chip_ami_pattern() { + # Canonical Apple-provided macOS AMIs (owner 100343932686 = "Amazon"). + case "$1" in + arm64) echo "amzn-ec2-macos-*-arm64" ;; + intel) echo "amzn-ec2-macos-*-x86_64" ;; + esac +} + +# Resolve the newest matching macOS AMI for a chip. In dry-run we only PRINT +# the query (no credentials assumed); --execute actually resolves it so the +# run-instances command is concrete. 
+resolve_ami_cmd() { + local pattern="$1" + printf 'aws ec2 describe-images %s \\\n' "$(aws_common_args)" + printf ' --owners %s \\\n' "$EC2_MACOS_AMI_OWNER" + printf ' --filters "Name=name,Values=%s" "Name=state,Values=available" \\\n' "$pattern" + printf ' --query "sort_by(Images,&CreationDate)[-1].ImageId" --output text\n' +} + +provision_one() { + local chip="$1" + local itype ami_pattern + itype="$(chip_instance_type "$chip")" + ami_pattern="$(chip_ami_pattern "$chip")" + + section "Fixture: $chip ($itype)" + + log "1) Allocate a dedicated host for $itype in $AWS_REGION/$AWS_AZ" + emit_cmd "$(cat <' + emit_cmd "aws ec2 describe-instances $(aws_common_args) --instance-ids --query 'Reservations[0].Instances[0].PublicIpAddress' --output text" + + log "Record HostId + InstanceId in $RESULTS_DIR/fixtures-$chip.env for teardown.sh:" + emit_cmd "printf 'CHIP=%s\\nHOST_ID=%s\\nINSTANCE_ID=%s\\n' $chip > $RESULTS_DIR/fixtures-$chip.env" +} + +main() { + preflight_aws_profile_guard + + if [[ $EXECUTE -eq 1 ]]; then + warn "EXECUTE MODE: this WILL allocate billable AWS resources." + warn "Mac dedicated hosts have a 24-HOUR minimum charge per host (~\$25-40/host/day)." + confirm_or_die "Allocate EC2 Mac fixture(s) for chip='$CHIP' in $AWS_REGION? Type 'yes' to proceed: " + else + section "DRY RUN — no AWS calls will be made. Re-run with --execute to provision." + fi + + case "$CHIP" in + arm64) provision_one arm64 ;; + intel) provision_one intel ;; + both) provision_one arm64; provision_one intel ;; + esac + + section "Next steps" + log " 1. install-s1.sh — install the SentinelOne agent on each fixture" + log " 2. run-rehearsal.sh --artifact pyinstaller|nuitka — drive the lifecycle" + log " 3. capture-telemetry.sh — collect detections + our logs" + log " 4. teardown.sh --execute — release the dedicated host(s) when done" + if [[ $EXECUTE -eq 0 ]]; then + section "DRY RUN complete. Nothing was provisioned." 
+ fi +} + +main diff --git a/tooling/edr-rehearsal/results/.gitkeep b/tooling/edr-rehearsal/results/.gitkeep new file mode 100644 index 00000000..5f2ba52f --- /dev/null +++ b/tooling/edr-rehearsal/results/.gitkeep @@ -0,0 +1,2 @@ +# Rehearsal results land here, tagged {artifact}_{allowlist}_{run-id}. +# Captures may contain endpoint hostnames / fixture data — do not commit them. diff --git a/tooling/edr-rehearsal/run-rehearsal.sh b/tooling/edr-rehearsal/run-rehearsal.sh new file mode 100755 index 00000000..d013eee2 --- /dev/null +++ b/tooling/edr-rehearsal/run-rehearsal.sh @@ -0,0 +1,172 @@ +#!/bin/bash +# Drive ONE signed+notarized runtime artifact through the full lifecycle on an +# EC2 Mac fixture (WEB-4805), so S1 sees exactly what the fleet will do: +# +# 1. pkg install (installer -pkg, via the released onboard.sh path) +# 2. onboard.sh (setup: writes config, bootstraps the daemon) +# 3. ALL hook events (PreToolUse, PostToolUse, UserPromptSubmit, +# Stop, SessionStart) for claude-code +# 4. discovery daemon run (kickstart the ai.getunbound.discovery daemon) +# 5. --clear (teardown via onboard.sh --clear) +# +# Both artifacts (pyinstaller default, nuitka -nuitka suffix) are installed via +# the SAME signed onboard.sh; --artifact only selects which pkg URL to fetch so +# the install path stays identical to production. +# +# Our own logs (install/hook/discovery/clear) are captured to a results dir, +# tagged {artifact, allowlist-state, run-id}, for capture-telemetry.sh to merge +# with the S1 side. +# +# DRY-RUN BY DEFAULT. --execute runs the lifecycle on the fixture over SSH. +# This NEVER runs against the local machine — --host is the fixture. 
#
# Usage:
#   run-rehearsal.sh --host <fixture-host> --artifact pyinstaller|nuitka \
#       --allowlist none|team-id [--run-id <id>] [--execute] [--yes]
set -euo pipefail

HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=./lib.sh
source "$HERE/lib.sh"

TARGET_HOST=""
ARTIFACT=""
ALLOWLIST=""
RUN_ID=""
EXECUTE=0
ASSUME_YES=0
SSH_USER="${SSH_USER:-ec2-user}"

# Onboarding keys: rehearsal-only, supplied via env so they never hit argv or
# the results dir. Required only on --execute.
ONBOARD_API_KEY="${ONBOARD_API_KEY:-}"
ONBOARD_DISCOVERY_KEY="${ONBOARD_DISCOVERY_KEY:-}"

# Die with a clear message when a value-taking flag is the last argument.
# Call as: require_flag_value "$@"   ($1 = the flag, $2 = its value).
# Without it, `shift 2` fails and `set -e` exits the script silently.
require_flag_value() {
  [[ $# -ge 2 ]] || die "$1 requires a value (see --help)"
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --host)      require_flag_value "$@"; TARGET_HOST="$2"; shift 2 ;;
    --artifact)  require_flag_value "$@"; ARTIFACT="$2";    shift 2 ;;
    --allowlist) require_flag_value "$@"; ALLOWLIST="$2";   shift 2 ;;
    --run-id)    require_flag_value "$@"; RUN_ID="$2";      shift 2 ;;
    --execute)   EXECUTE=1; shift ;;
    --yes)       ASSUME_YES=1; shift ;;
    -h|--help)
      # Print only the header comment (line 2 up to the first non-comment
      # line) instead of every '#' line in the script.
      awk 'NR == 1 { next } /^#/ { sub(/^# ?/, ""); print; next } { exit }' "$0"
      exit 0
      ;;
    *) die "unknown argument: $1 (see --help)" ;;
  esac
done

[[ -n "$TARGET_HOST" ]] || die "--host is required"
case "$ARTIFACT" in
  pyinstaller|nuitka) ;;
  *) die "--artifact must be pyinstaller or nuitka (got: '$ARTIFACT')" ;;
esac
case "$ALLOWLIST" in
  none|team-id) ;;
  *) die "--allowlist must be none or team-id (got: '$ALLOWLIST')" ;;
esac
RUN_ID="${RUN_ID:-$(date -u +%Y%m%dT%H%M%SZ)}"

# RUN_ID becomes a directory name under $RESULTS_DIR. Restrict it to a plain
# path component so a value such as "../x" cannot place captures (which may
# contain fixture hostnames) outside the gitignored results directory.
run_id_re='^[A-Za-z0-9][A-Za-z0-9._-]*$'
[[ "$RUN_ID" =~ $run_id_re ]] \
  || die "--run-id may only contain letters, digits, '.', '_' and '-' (got: '$RUN_ID')"

# Artifact -> pkg URL. The default (pyinstaller) is the canonical released pkg.
# Nuitka artifacts carry a -nuitka suffix (packaging/README.md "Cutting a
# release"). If a separate Nuitka pkg URL is published, override via
# NUITKA_PKG_URL.
#   $1 - artifact name: pyinstaller | nuitka
# Prints the URL on stdout; returns non-zero for any other artifact name.
artifact_pkg_url() {
  case "$1" in
    pyinstaller) echo "$PKG_URL" ;;
    nuitka)      echo "${NUITKA_PKG_URL:-${ARTIFACT_BASE}/unbound-runtime-${RELEASE_VERSION}-nuitka.pkg}" ;;
    *) printf 'artifact_pkg_url: unknown artifact: %s\n' "$1" >&2; return 1 ;;
  esac
}

TAG="${ARTIFACT}_${ALLOWLIST}_${RUN_ID}"
RUN_RESULTS="$RESULTS_DIR/$TAG"

# remote: echo the command; in --execute run it over SSH, tee'ing remote output
# into a per-stage local log under the run results dir.
# Run one lifecycle stage on the fixture.
#   $1   - stage name; also the basename of the local log file
#   $2.. - the remote command line
# Always logs and echoes the command. With EXECUTE=1 it runs the command over
# SSH and writes the remote stdout+stderr to $RUN_RESULTS/<stage>.log. A
# non-zero remote status is only warned about, so the remaining stages still
# run. ssh inherits this function's stdin, so a caller can stream data to the
# remote command by redirecting stdin on the call.
remote_stage() {
  local stage="$1"; shift
  log "[$stage] $*"
  emit_cmd "ssh $SSH_USER@$TARGET_HOST -- $*"
  if [[ $EXECUTE -eq 1 ]]; then
    require_tool ssh
    mkdir -p "$RUN_RESULTS"
    ssh -o StrictHostKeyChecking=accept-new "$SSH_USER@$TARGET_HOST" -- "$@" \
      > "$RUN_RESULTS/${stage}.log" 2>&1 \
      || warn "[$stage] returned non-zero — captured to ${stage}.log (NOT fatal: continue the matrix and record it)"
  fi
}

# A representative event payload per hook event. Identical shape to
# packaging/scripts/smoke-test.sh, so S1 sees a real vendored-module dispatch.
#   $1 - hook event name; prints the JSON payload (nothing for other names).
hook_payload() {
  case "$1" in
    PreToolUse)       echo '{"hook_event_name":"PreToolUse","tool_name":"Bash","tool_input":{"command":"ls -la"},"session_id":"edr-rehearsal"}' ;;
    PostToolUse)      echo '{"hook_event_name":"PostToolUse","tool_name":"Bash","tool_input":{"command":"ls -la"},"tool_response":{"stdout":"ok"},"session_id":"edr-rehearsal"}' ;;
    UserPromptSubmit) echo '{"hook_event_name":"UserPromptSubmit","prompt":"hello","session_id":"edr-rehearsal"}' ;;
    Stop)             echo '{"hook_event_name":"Stop","session_id":"edr-rehearsal"}' ;;
    SessionStart)     echo '{"hook_event_name":"SessionStart","session_id":"edr-rehearsal"}' ;;
  esac
}

# Fire every claude-code hook event once on the fixture, one stage (and one
# log file, hook_<event>.log) per event.
run_hook_events() {
  local ev payload
  for ev in PreToolUse PostToolUse UserPromptSubmit Stop SessionStart; do
    payload="$(hook_payload "$ev")"
    # The hook is invoked directly as the SSH login user ($SSH_USER) — no
    # sudo — which keeps it off root. The captured output lets the matrix
    # confirm the binary actually executed under S1.
    remote_stage "hook_${ev}" \
      "printf '%s' '$payload' | $HOOK_BIN hook claude-code $ev"
  done
}

# Drive the five lifecycle stages in order. Dry-run only prints the commands.
main() {
  section "EDR rehearsal — artifact=$ARTIFACT allowlist=$ALLOWLIST run=$RUN_ID"
  log "Fixture : $TARGET_HOST"
  log "pkg URL : $(artifact_pkg_url "$ARTIFACT")"
  log "Results : $RUN_RESULTS"

  if [[ $EXECUTE -eq 1 ]]; then
    [[ -n "$ONBOARD_API_KEY" && -n "$ONBOARD_DISCOVERY_KEY" ]] \
      || die "ONBOARD_API_KEY and ONBOARD_DISCOVERY_KEY env vars are required on --execute (rehearsal keys; never argv)"
    warn "EXECUTE MODE: installs + runs the runtime on fixture $TARGET_HOST."
    warn "Confirm the S1 console allowlist state is set to '$ALLOWLIST' BEFORE proceeding."
    confirm_or_die "Run $ARTIFACT through the full lifecycle on $TARGET_HOST (allowlist=$ALLOWLIST)? Type 'yes': "
    mkdir -p "$RUN_RESULTS"
  else
    section "DRY RUN — no SSH/install will run. Re-run with --execute."
  fi

  # 1) + 2) Install + onboard via the SIGNED released onboard.sh (production
  # path). onboard.sh downloads the pkg, verifies sha256 + Team ID, installs,
  # and runs `unbound-hook setup`. We point ARTIFACT_URL at the chosen pkg.
  #
  # The keys exist only in THIS process: ssh does not forward local
  # environment variables unless SendEnv/AcceptEnv are configured, so a bare
  # "$ONBOARD_API_KEY" in the remote command would expand to an empty string
  # on the fixture. They are therefore streamed over ssh stdin (two lines,
  # written by the printf builtin so they never appear in a local argv or in
  # the echoed command) and read into remote shell variables first.
  section "Stage 1+2: pkg install + onboard.sh"
  remote_stage "install_onboard" \
    "IFS= read -r ONBOARD_API_KEY && IFS= read -r ONBOARD_DISCOVERY_KEY && curl -fSL -o /tmp/onboard.sh '$ONBOARD_URL' && sudo ARTIFACT_URL='$(artifact_pkg_url "$ARTIFACT")' bash /tmp/onboard.sh --api-key \"\$ONBOARD_API_KEY\" --discovery-key \"\$ONBOARD_DISCOVERY_KEY\"" \
    < <(printf '%s\n%s\n' "$ONBOARD_API_KEY" "$ONBOARD_DISCOVERY_KEY")

  # 3) All hook events.
  section "Stage 3: all hook events (claude-code)"
  run_hook_events

  # 4) Discovery daemon — kickstart the installed LaunchDaemon so S1 sees the
  # scheduled scan behavior (multi-user /Users iteration, MCP scans) under a
  # root daemon, which is what S1 Storyline tends to flag (-> /opt/unbound/*
  # path exclusion).
  section "Stage 4: discovery daemon scheduled run"
  remote_stage "discovery_daemon" \
    "sudo launchctl kickstart -k system/$DAEMON_LABEL && sleep 60 && sudo tail -n 50 /var/log/unbound/discovery.log"

  # 5) --clear teardown via onboard.sh (binary clear + system sweep).
  section "Stage 5: --clear"
  remote_stage "clear" \
    "curl -fSL -o /tmp/onboard.sh '$ONBOARD_URL' && sudo bash /tmp/onboard.sh --clear"

  section "Done"
  log "Our-side logs: $RUN_RESULTS/*.log"
  log "Now run: capture-telemetry.sh --host $TARGET_HOST --artifact $ARTIFACT --allowlist $ALLOWLIST --run-id $RUN_ID"
  log "Then RE-IMAGE the fixture before the next allowlist/artifact cell (rehearsal must not pollute fixtures)."
  if [[ $EXECUTE -eq 0 ]]; then
    section "DRY RUN complete. Nothing was installed or run."
  fi
}

main
diff --git a/tooling/edr-rehearsal/teardown.sh b/tooling/edr-rehearsal/teardown.sh
new file mode 100755
index 00000000..afda93c1
--- /dev/null
+++ b/tooling/edr-rehearsal/teardown.sh
@@ -0,0 +1,116 @@
#!/bin/bash
# Tear down the EC2 Mac rehearsal fixture(s) (WEB-4805): terminate the
# instance and RELEASE the dedicated host so billing stops (subject to the
# unavoidable 24h host minimum). The rehearsal must not pollute the Stream V
# fixtures, so the canonical "re-image" is: terminate + release, then re-run
# provision-fixture.sh for any further runs (a fresh host = a clean macOS AMI,
# no S1/runtime residue).
#
# Reads HostId/InstanceId from $RESULTS_DIR/fixtures-<chip>.env (written by
# provision-fixture.sh) or from explicit flags.
#
# DRY-RUN BY DEFAULT: prints the teardown commands and exits 0. --execute
# performs the termination + release. There is NO path here that can affect a
# real developer machine.
#
# Usage:
#   teardown.sh [--chip arm64|intel|both] [--instance-id i-..] [--host-id h-..]
#       [--execute] [--yes]
set -euo pipefail

HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=./lib.sh
source "$HERE/lib.sh"

CHIP="both"
INSTANCE_ID=""
HOST_ID=""
EXECUTE=0
ASSUME_YES=0

# Die with a clear message when a value-taking flag is the last argument.
# Call as: require_flag_value "$@"   ($1 = the flag, $2 = its value).
# Without it, `shift 2` fails and `set -e` exits the script silently.
require_flag_value() {
  [[ $# -ge 2 ]] || die "$1 requires a value (see --help)"
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --chip)        require_flag_value "$@"; CHIP="$2";        shift 2 ;;
    --instance-id) require_flag_value "$@"; INSTANCE_ID="$2"; shift 2 ;;
    --host-id)     require_flag_value "$@"; HOST_ID="$2";     shift 2 ;;
    --execute)     EXECUTE=1; shift ;;
    --yes)         ASSUME_YES=1; shift ;;
    -h|--help)
      # Print only the header comment (line 2 up to the first non-comment
      # line) instead of every '#' line in the script.
      awk 'NR == 1 { next } /^#/ { sub(/^# ?/, ""); print; next } { exit }' "$0"
      exit 0
      ;;
    *) die "unknown argument: $1 (see --help)" ;;
  esac
done

case "$CHIP" in
  arm64|intel|both) ;;
  *) die "--chip must be arm64, intel, or both" ;;
esac

# run_cmd: echo always; on --execute, run it.
#   $* - the command line (evaluated with eval on --execute)
# Returns 0 in dry-run. On --execute it returns the command's exit status
# after warning about a failure, so the caller decides whether to continue.
run_cmd() {
  emit_cmd "$*"
  [[ $EXECUTE -eq 1 ]] || return 0
  require_tool aws || return 1
  local rc=0
  eval "$*" || rc=$?
  if [[ $rc -ne 0 ]]; then
    warn "command returned non-zero: $*"
  fi
  return "$rc"
}

# Tear down the fixture for one chip: terminate the instance, wait for it,
# release the dedicated host.
#   $1 - chip name (arm64 | intel); selects $RESULTS_DIR/fixtures-<chip>.env
# Ids come from --instance-id/--host-id when given, otherwise from the env
# file. A failing step is warned about and the remaining steps still run, but
# the env file is only deleted when every step succeeded.
teardown_one() {
  local chip="$1"
  local iid="$INSTANCE_ID" hid="$HOST_ID"
  local envfile="$RESULTS_DIR/fixtures-$chip.env"
  local failed=0

  # Prefer explicit flags; otherwise read the env file provision-fixture wrote.
  if [[ -z "$iid" || -z "$hid" ]] && [[ -f "$envfile" ]]; then
    log "reading ids from $envfile"
    # The file assigns CHIP/HOST_ID/INSTANCE_ID. Shadow them with locals so
    # the assignments stay inside this call: if they landed in the globals,
    # `--chip both` would hand the first chip's ids to the second chip, which
    # would then skip its own env file and delete it without tearing its
    # fixture down — leaving a billable host with no local record.
    # shellcheck disable=SC2034 # CHIP is shadowed only to contain the source
    local CHIP="" INSTANCE_ID="" HOST_ID=""
    # shellcheck disable=SC1090 # KEY=VALUE file written by provision-fixture.sh
    source "$envfile"
    iid="${iid:-${INSTANCE_ID:-}}"
    hid="${hid:-${HOST_ID:-}}"
  fi

  section "Teardown: $chip"
  if [[ -z "$iid" || -z "$hid" ]]; then
    warn "no instance/host id for $chip (no $envfile and no --instance-id/--host-id). Skipping."
    log "Find them manually:"
    emit_cmd "aws ec2 describe-instances $(aws_common_args) --filters 'Name=tag:$TAG_KEY,Values=$TAG_VALUE' --query 'Reservations[].Instances[].[InstanceId,Placement.HostId]' --output text"
    return 0
  fi

  log "instance: $iid host: $hid"
  log "1) Terminate the instance"
  run_cmd "aws ec2 terminate-instances $(aws_common_args) --instance-ids $iid" || failed=1
  log "2) Wait for termination (a host cannot be released while it has a running instance)"
  run_cmd "aws ec2 wait instance-terminated $(aws_common_args) --instance-ids $iid" || failed=1
  log "3) Release the dedicated host (stops billing beyond the 24h minimum)"
  run_cmd "aws ec2 release-hosts $(aws_common_args) --host-ids $hid" || failed=1

  if [[ $EXECUTE -eq 1 ]]; then
    if [[ $failed -eq 0 ]]; then
      rm -f "$envfile"
      log "removed $envfile"
    else
      # Keep the only local record of the host/instance ids until the
      # operator has confirmed the host is really gone.
      warn "teardown of $chip had failures — keeping $envfile; verify the host was released, then re-run teardown.sh"
    fi
  fi
}

# Guard the AWS profile, confirm on --execute, then tear down the selected
# chip(s).
main() {
  preflight_aws_profile_guard

  if [[ $EXECUTE -eq 1 ]]; then
    warn "EXECUTE MODE: this TERMINATES instances and RELEASES dedicated hosts."
    confirm_or_die "Tear down rehearsal fixture(s) for chip='$CHIP'? Type 'yes': "
  else
    section "DRY RUN — no AWS calls. Re-run with --execute to tear down."
  fi

  case "$CHIP" in
    arm64) teardown_one arm64 ;;
    intel) teardown_one intel ;;
    both)  teardown_one arm64; teardown_one intel ;;
  esac

  section "Re-image reminder"
  log "For another rehearsal cell, re-run provision-fixture.sh for a FRESH host."
  log "A fresh host boots a clean macOS AMI — no S1 agent, no runtime residue."
  if [[ $EXECUTE -eq 0 ]]; then
    section "DRY RUN complete. Nothing was torn down."
  fi
}

main