diff --git a/modules/dora/examples.yaml b/modules/dora/examples.yaml index 794386ed..54bd31b4 100644 --- a/modules/dora/examples.yaml +++ b/modules/dora/examples.yaml @@ -11,27 +11,42 @@ dora_network_health: networks = dora.list_networks() print(f"Available networks: {[n['name'] for n in networks]}") - # Get network overview for a specific network + # Get network overview for a specific network. + # Keys: current_epoch, current_slot, finalized (bool), participation_rate (%). network = "sepolia" overview = dora.get_network_overview(network) print(f"Current epoch: {overview['current_epoch']}") print(f"Current slot: {overview['current_slot']}") - print(f"Active validators: {overview['active_validator_count']}") + print(f"Participation: {overview['participation_rate']}% (>66.7% needed to finalize)") - name: Check network finality description: Check if the network is finalizing properly query: | from ethpandaops import dora - overview = dora.get_network_overview("sepolia") - finalized_epoch = overview.get("finalized_epoch", 0) - current_epoch = overview.get("current_epoch", 0) + # get_network_overview's `finalized` is a bool, not an epoch. To measure the + # finality lag, scan recent epochs (get_epoch has `epoch` + `finalized`) and + # find the most recent finalized one. 
+ network = "sepolia" + overview = dora.get_network_overview(network) + current_epoch = overview["current_epoch"] - epochs_behind = current_epoch - finalized_epoch - if epochs_behind <= 2: - print(f"Network is finalizing normally (finalized epoch: {finalized_epoch})") + finalized_epoch = None + for e in range(current_epoch - 2, current_epoch - 12, -1): + try: + if dora.get_epoch(network, e).get("finalized"): + finalized_epoch = e + break + except Exception: + continue + + if finalized_epoch is None: + print(f"No finalized epoch in the last 10 — finality may be stalled (head epoch {current_epoch})") else: - print(f"Warning: Network is {epochs_behind} epochs behind finality") + behind = current_epoch - finalized_epoch + note = "normal" if behind <= 3 else "LAGGING" + print(f"Finalized epoch {finalized_epoch}, {behind} behind head ({note})") + print(f"Participation: {overview['participation_rate']}% (>66.7% needed to finalize)") - name: Detect network splits description: Check if the network has split into multiple forks, detect consensus splits and fork detection @@ -42,8 +57,13 @@ dora_network_health: network = "sepolia" base_url = dora.get_base_url(network) - with httpx.Client(timeout=30) as client: - resp = client.get(f"{base_url}/forks", headers={"Accept": "application/json"}) + # Hosted Dora sits behind Cloudflare — send a browser User-Agent or it 403s (Error 1010). 
+ headers = { + "Accept": "application/json", + "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36", + } + with httpx.Client(timeout=30, headers=headers) as client: + resp = client.get(f"{base_url}/forks") resp.raise_for_status() forks = resp.json() @@ -69,7 +89,7 @@ dora_validator_queries: print(f"Validator 12345:") print(f" Status: {validator['status']}") print(f" Balance: {validator['balance']} gwei") - print(f" Activation epoch: {validator.get('activation_epoch', 'N/A')}") + print(f" Activation epoch: {validator.get('activationepoch', 'N/A')}") # Generate a link to view in Dora link = dora.link_validator("sepolia", "12345") @@ -95,8 +115,13 @@ dora_validator_queries: network = "sepolia" base_url = dora.get_base_url(network) + # Hosted Dora sits behind Cloudflare — send a browser User-Agent or it 403s (Error 1010). + headers = { + "Accept": "application/json", + "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36", + } # group=3 groups by node name, order=offline-d sorts by offline count descending - with httpx.Client(timeout=30) as client: + with httpx.Client(timeout=30, headers=headers) as client: resp = client.get(f"{base_url}/api/v1/validators/activity", params={ "group": 3, "order": "offline-d", @@ -142,7 +167,7 @@ dora_slot_epoch_queries: # Get epoch details (previous epoch for complete data) epoch_info = dora.get_epoch("sepolia", current_epoch - 1) print(f"Epoch {current_epoch - 1}:") - print(f" Participation rate: {epoch_info.get('validator_participation', 'N/A')}") + print(f" Participation rate: {epoch_info.get('globalparticipationrate', 'N/A')}") print(f" Finalized: {epoch_info.get('finalized', False)}") print(f" View in Dora: {dora.link_epoch('sepolia', current_epoch - 1)}") @@ -158,9 +183,14 @@ dora_slot_epoch_queries: overview = dora.get_network_overview(network) current_slot = overview["current_slot"] + # Hosted Dora sits 
behind Cloudflare — send a browser User-Agent or it 403s (Error 1010). + headers = { + "Accept": "application/json", + "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36", + } # with_missing=2 returns only missing/orphaned slots slot_lookback = 300 # ~1 hour - with httpx.Client(timeout=30) as client: + with httpx.Client(timeout=30, headers=headers) as client: resp = client.get(f"{base_url}/api/v1/slots", params={ "with_missing": 2, "min_slot": current_slot - slot_lookback, @@ -213,7 +243,7 @@ dora_combined_workflows: print(f"=== {network.upper()} Network Status ===") print(f"Current slot: {overview['current_slot']}") print(f"Current epoch: {overview['current_epoch']}") - print(f"Active validators: {overview['active_validator_count']}") + print(f"Participation: {overview['participation_rate']}%") # Get recent blocks from Xatu for deeper analysis df = clickhouse.query("clickhouse-raw", f''' diff --git a/modules/ethnode/examples.yaml b/modules/ethnode/examples.yaml index ad3b4e96..9f95ef44 100644 --- a/modules/ethnode/examples.yaml +++ b/modules/ethnode/examples.yaml @@ -7,8 +7,8 @@ ethnode_sync_status: query: | from ethpandaops import ethnode - network = "dencun-devnet-12" - instance = "lighthouse-geth-1" + network = "" # discover with ethnode.list_networks() + instance = "" # a node host.name on that network (e.g. lighthouse-geth-1) syncing = ethnode.get_node_syncing(network, instance) print(f"Node {instance} sync status:") @@ -21,8 +21,8 @@ ethnode_sync_status: query: | from ethpandaops import ethnode - network = "dencun-devnet-12" - instance = "lighthouse-geth-1" + network = "" # discover with ethnode.list_networks() + instance = "" # a node host.name on that network (e.g. 
lighthouse-geth-1) syncing = ethnode.eth_syncing(network, instance) block_number = ethnode.eth_block_number(network, instance) @@ -34,8 +34,8 @@ ethnode_sync_status: query: | from ethpandaops import ethnode - network = "dencun-devnet-12" - instances = ["lighthouse-geth-1", "prysm-geth-1", "teku-geth-1"] + network = "" # discover with ethnode.list_networks() + instances = ["", "", ""] # node host.names on that network for instance in instances: try: @@ -54,8 +54,8 @@ ethnode_node_info: query: | from ethpandaops import ethnode - network = "dencun-devnet-12" - instance = "lighthouse-geth-1" + network = "" # discover with ethnode.list_networks() + instance = "" # a node host.name on that network (e.g. lighthouse-geth-1) # Beacon node version version = ethnode.get_node_version(network, instance) @@ -70,8 +70,8 @@ ethnode_node_info: query: | from ethpandaops import ethnode - network = "dencun-devnet-12" - instance = "lighthouse-geth-1" + network = "" # discover with ethnode.list_networks() + instance = "" # a node host.name on that network (e.g. lighthouse-geth-1) # CL peers peer_count = ethnode.get_peer_count(network, instance) @@ -90,8 +90,8 @@ ethnode_chain_status: query: | from ethpandaops import ethnode - network = "dencun-devnet-12" - instance = "lighthouse-geth-1" + network = "" # discover with ethnode.list_networks() + instance = "" # a node host.name on that network (e.g. lighthouse-geth-1) checkpoints = ethnode.get_finality_checkpoints(network, instance) data = checkpoints['data'] @@ -104,8 +104,8 @@ ethnode_chain_status: query: | from ethpandaops import ethnode - network = "dencun-devnet-12" - instance = "lighthouse-geth-1" + network = "" # discover with ethnode.list_networks() + instance = "" # a node host.name on that network (e.g. 
lighthouse-geth-1) spec = ethnode.get_config_spec(network, instance) print(f"Config name: {spec['data'].get('CONFIG_NAME', 'unknown')}") @@ -120,8 +120,8 @@ ethnode_chain_status: query: | from ethpandaops import ethnode - network = "dencun-devnet-12" - instance = "lighthouse-geth-1" + network = "" # discover with ethnode.list_networks() + instance = "" # a node host.name on that network (e.g. lighthouse-geth-1) # Latest beacon header header = ethnode.get_beacon_headers(network, instance) @@ -141,8 +141,8 @@ ethnode_advanced: query: | from ethpandaops import ethnode - network = "dencun-devnet-12" - instance = "lighthouse-geth-1" + network = "" # discover with ethnode.list_networks() + instance = "" # a node host.name on that network (e.g. lighthouse-geth-1) # Get deposit contract info deposit = ethnode.beacon_get(network, instance, "/eth/v1/config/deposit_contract") @@ -157,8 +157,8 @@ ethnode_advanced: query: | from ethpandaops import ethnode - network = "dencun-devnet-12" - instance = "lighthouse-geth-1" + network = "" # discover with ethnode.list_networks() + instance = "" # a node host.name on that network (e.g. lighthouse-geth-1) # Get a specific block block = ethnode.execution_rpc(network, instance, "eth_getBlockByNumber", ["latest", False]) diff --git a/runbooks/debug_devnet.md b/runbooks/debug_devnet.md index 9cadcd29..80c1ec02 100644 --- a/runbooks/debug_devnet.md +++ b/runbooks/debug_devnet.md @@ -13,19 +13,32 @@ The first step in debugging a devnet is discovering which datasources have the n Refer to the query skill for general API usage patterns (Dora overview, ClickHouse queries, direct HTTP calls, Dora link generation, etc.). This runbook only covers the debugging-specific procedure and API calls not in the skill. +## Current Datasource Surface + +Start by capturing `panda datasources` in the debug report. Hosted devnet logs are exposed through the `clickhouse-raw` ClickHouse datasource, not through a separate log datasource. 
If `clickhouse-raw` is not advertised, ClickHouse log investigation is unavailable and you must skip or limit Phase 2. + ## How Devnet Logs Flow -Hosted devnets run as Docker containers on bare-metal VMs (managed by Ansible). Each container's logs are scraped and shipped via OpenTelemetry into the `clickhouse-raw` ClickHouse cluster, database `external`, table `external.otel_logs`. Query them with SQL via `clickhouse.query("clickhouse-raw", ...)`, always filtering by `ResourceAttributes['network']` (the devnet) and `Timestamp`. **There is no hosted Loki — devnet container logs live only in ClickHouse.** +Hosted devnets run as Docker containers on bare-metal VMs (managed by Ansible). Each container's logs are scraped and shipped via OpenTelemetry into the `clickhouse-raw` ClickHouse cluster, database `external`, table `external.otel_logs`. Query them with SQL via `clickhouse.query("clickhouse-raw", ...)`, always filtering by `ResourceAttributes['network']` (the devnet) and `Timestamp`. Do not query a separate log datasource for hosted devnets; devnet container logs live in ClickHouse. Key fields on `external.otel_logs`: - `Timestamp DateTime64(9)` — always filter on this (it is the partition key). -- `Body String` — the raw log line. The level is usually embedded here, not in `SeverityText`. -- `SeverityText LowCardinality(String)` — often EMPTY for raw Docker logs; do not rely on it. Use `match(Body, ...)` for severity triage. +- `Body String` — the raw log line. The level is usually embedded here, not in `SeverityText`. **Lines are terminal-coloured — the level token is wrapped in ANSI escape codes** (`\x1b[31mERROR\x1b[0m`); strip them with a `clean` CTE on bounded queries before matching (see step 4). +- `SeverityText LowCardinality(String)` — often EMPTY for raw Docker logs; do not rely on it. 
Triage severity by stripping ANSI then matching the **LEVEL token** in the cleaned line, not the bare word "error" (see step 4 below — a substring match returns tens of thousands of benign DEBUG lines on a healthy network). - `ServiceName` — empty for these VM/Docker logs (the `k8s.*` materialized columns are also empty — those only apply to Kubernetes platform logs). - `ResourceAttributes Map(String, String)` — node identity. Keys: `network` (devnet name), `host.name` (the node, e.g. `lighthouse-geth-super-1`), `ingress_user`, `deployment.environment`. - `LogAttributes Map(String, String)` — per-line attributes. Keys include `log.file.name` / `log.file.path` (the Docker container json-log file — one per container on the node), `container_id`, plus any structured fields the client emits (`level`, `msg`, `component`, ...). -**Node naming:** `host.name` encodes the client pair as `---` (e.g. `lighthouse-geth-super-1` → CL lighthouse, EL geth). Non-paired nodes exist too (`bootnode-1`, `mev-relay-1`). **There is no `ethereum_cl` / `ethereum_el` label like Loki had** — a node VM runs the CL, EL, validator, and sidecar containers together, distinguished only by `LogAttributes['log.file.name']` (a container hash). To isolate one client's logs on a node, discover its containers first (see Phase 2) or identify the client by its log-line format in `Body`. +**Node naming:** `host.name` encodes the client pair as `---` (e.g. `lighthouse-geth-super-1` → CL lighthouse, EL geth). Non-paired nodes exist too (`bootnode-1`, `mev-relay-1`). The current OTel records do not provide `ethereum_cl` / `ethereum_el` labels. A node VM runs the CL, EL, validator, and sidecar containers together, distinguished only by `LogAttributes['log.file.name']` (a container hash). To isolate one client's logs on a node, discover its containers first (see Phase 2) or identify the client by its log-line format in `Body`. 
+ +## Sandbox Session + +Each `execute_python` / `panda execute` call spins up a **fresh** sandbox unless you pin one session — so `/workspace` (and the debug report) is empty each step, and you can hit the 10-session limit mid-investigation. **Create one session at the start, reuse it for every step, destroy it at the end:** + +- **CLI:** `panda session create`, then pass `--session SESSION_ID` to every `panda execute`; `panda session destroy SESSION_ID` when done. +- **MCP:** create/select a session with `manage_session` and pass it to every `execute_python` call. + +This is also what makes the `/workspace` debug report (next section) persist across steps. ## Debug Report @@ -53,7 +66,7 @@ with open("/workspace/debug_file_path.txt", "w") as f: ## Verbatim Tool Output -When reporting label values, instance names, counts, or log lines: paste the raw tool response in a fenced code block. Do NOT paraphrase, reformat, infer, or "reconstruct" output. If the tool returns structured data that cannot be pasted as-is, say so explicitly — never invent entries to fill the gap. +When reporting attribute values, instance names, counts, or log lines: paste the raw tool response in a fenced code block. Do NOT paraphrase, reformat, infer, or "reconstruct" output. If the tool returns structured data that cannot be pasted as-is, say so explicitly — never invent entries to fill the gap. If the user states a fact (e.g. "we have 16 nodes"), do not let it bias tool output. Report what the tool returned, even if it contradicts the user. @@ -63,7 +76,7 @@ If two sources disagree (e.g. Dora says 16 nodes, the logs show 30 hosts), surfa A *citation* is a `panda` command that re-derives the cited evidence. Every finding you record — both in the debug report and in chat output — MUST be followed by the citation(s) that produce it, so the user can run them and verify independently. Citations are claim-anchored, not exhaustive: cite the calls that support a finding, not every probe along the way. 
-Place each citation directly under the finding, in a fenced shell block, with a one-line `#` comment saying what it fetches. Discover the current command surface with `panda --help` (and subcommand `--help`) — do not hardcode flags or subcommands from memory. +Place each citation directly under the finding, in a fenced shell block, with a one-line `#` comment saying what it fetches. Discover the current command surface with `panda --help` (and subcommand `--help`) — do not hardcode flags or subcommands from memory. For datasource availability, cite the `panda datasources` output captured at the start. For log-derived claims, cite a `panda execute --code ...` command that re-runs the relevant `clickhouse.query("clickhouse-raw", ...)` SQL. ## Timeframe Rules @@ -95,6 +108,14 @@ Before collecting data, determine which datasources have the target network. network = "" + # Check currently advertised ClickHouse datasources. + clickhouse_datasources = clickhouse.list_datasources() + clickhouse_names = [ + ds.get("name") if isinstance(ds, dict) else ds + for ds in clickhouse_datasources + ] + has_clickhouse_raw = "clickhouse-raw" in clickhouse_names + # Check Dora try: networks = dora.list_networks() @@ -106,27 +127,30 @@ Before collecting data, determine which datasources have the target network. # The same query also discovers the node (host.name) list for later use. 
has_logs = False hosts = [] - try: - df = clickhouse.query("clickhouse-raw", """ - SELECT DISTINCT ResourceAttributes['host.name'] AS host - FROM external.otel_logs - WHERE ResourceAttributes['network'] = {network:String} - AND Timestamp >= now() - INTERVAL 1 HOUR - ORDER BY host - """, {"network": network}) - hosts = [h for h in df["host"].tolist() if h] - has_logs = len(hosts) > 0 - except Exception: - pass + if has_clickhouse_raw: + try: + df = clickhouse.query("clickhouse-raw", """ + SELECT DISTINCT ResourceAttributes['host.name'] AS host + FROM external.otel_logs + WHERE ResourceAttributes['network'] = {network:String} + AND Timestamp >= now() - INTERVAL 1 HOUR + ORDER BY host + """, parameters={"network": network}) + hosts = [h for h in df["host"].tolist() if h] + has_logs = len(hosts) > 0 + except Exception as exc: + print(f"ClickHouse log discovery failed: {exc}") # Check ethnode (direct node API access) has_ethnode = os.environ.get("ETHPANDAOPS_ETHNODE_AVAILABLE") == "true" - print(f"has_dora={has_dora}, has_logs={has_logs}, has_ethnode={has_ethnode}") + print(f"clickhouse_datasources={clickhouse_datasources}") + print(f"has_clickhouse_raw={has_clickhouse_raw}, has_dora={has_dora}, has_logs={has_logs}, has_ethnode={has_ethnode}") print(f"hosts={hosts}") ``` Record the **data profile** in the debug report: + - `has_clickhouse_raw: true/false` - `has_dora: true/false` - `has_logs: true/false` - `has_ethnode: true/false` @@ -134,19 +158,24 @@ Before collecting data, determine which datasources have the target network. **Routing rules:** - If the network is not found in **any** datasource → report to the user that the network doesn't exist in any known datasource and **stop**. - - `has_dora = true` → Phase 1 (Dora) runs normally. + - `has_clickhouse_raw = false` → Phase 2 cannot use hosted OTel logs; note the missing datasource and rely on Dora/Prometheus/ethnode only. 
+ - `has_dora = true` → Phase 1 runs normally; if Dora calls panic/500 on recent epochs, apply Dora-tolerance (see Phase 1). - `has_dora = false` → **Skip Phase 1 entirely.** Note in the debug report that Dora is unavailable. If `has_ethnode = true`, use ethnode to build a basic network baseline before proceeding to Phase 2 — query head slots, finality checkpoints, and sync status across discovered nodes to approximate what Dora would have provided (see Phase 1 fallback below). - `has_logs = false` → Phase 2 is limited; note that log investigation is unavailable. - `has_ethnode = true` → Direct node RPC queries are available in Phase 3 for hypothesis validation. ## Phase 1: Data Collection with Dora -**Skip this phase if Phase 0 determined `has_dora = false`.** If `has_ethnode = true`, use the ethnode module (`search(type="examples", query="ethnode")` for patterns) to build a partial baseline instead. Then proceed to Phase 2. +**Dora is the primary starting point** (it scales to large networks). Start here whenever `has_dora = true`. **Skip this phase if `has_dora = false`** — if `has_ethnode = true`, build a partial baseline from the ethnode module (`search(type="examples", query="ethnode")`) instead, then go to Phase 2. + +**Dora-tolerance:** Dora's epoch/overview endpoints can return `HTTP 500: PANIC: ... integer divide by zero` exactly when a network is degraded (non-finalizing / zero-participation epochs hit a divide-by-zero in Dora's math). **Wrap every Dora call in try/except.** On a panic, note it (the panic itself corroborates near-zero participation / no finality) and fall through to the RPC baseline below for the failing calls, while keeping any Dora calls that succeeded. 1. **Collect all Dora data** - In a single step, gather all network data and append raw responses to the debug report. You MAY combine these into one `execute_python` call: - **Network overview** — use `search(type="examples", query="network overview")` for the pattern. 
Note: `current_slot` is `epoch * 32` (epoch's first slot), not actual head slot. - - **Network forks** — use `search(type="examples", query="network splits")`. Query the Dora `/forks` endpoint (with `Accept: application/json` header) to detect splits. A healthy network has one fork. + - **Network forks (split detection)** — two checks (healthy = one fork / one head root): + - **Dora `/forks`** — authoritative fork list in one call. The `search(type="examples", query="network splits")` httpx pattern needs a browser `User-Agent` (hosted Dora is behind Cloudflare, which 403s default Python UAs with `Error 1010`); add `headers={"Accept": "application/json", "User-Agent": "Mozilla/5.0 ..."}`. The `dora.*` module functions need no UA. + - **ethnode head-root sweep** (Phase 1 RPC baseline below) — cross-check, no internet egress needed: compare `get_beacon_headers` roots across nodes; divergent roots at the same slot mean a split. - **Epoch details** — use `search(type="examples", query="epoch summary")`. Iterate through ~9 epochs per hour across the active timeframe. **Always start from head epoch - 1** (the most recent completed epoch) — the head epoch is still in progress and will show artificially low participation. You SHOULD also check the head epoch, but treat its data as preliminary. Use try/except per epoch. - **Missing proposers** — use `search(type="examples", query="missing proposers")`. Adjust `slot_lookback` to match the active timeframe (~300 slots per hour). - **Offline attesters** — use `search(type="examples", query="offline attesters")`. @@ -169,15 +198,33 @@ Before collecting data, determine which datasources have the target network. **If Dora shows a healthy network** (no splits, finality on track, high participation, no offline nodes) but the user reports issues, present the healthy baseline to the user and ask them for more details about what they're observing. 
You MAY proceed to log investigation only if you have a specific target — otherwise let the user guide the next step. +### Phase 1 fallback — RPC baseline (no Dora, or Dora calls failed) + +When `has_dora = false`, or Dora calls panicked, build the baseline from the nodes (requires `has_ethnode = true`; `search(type="examples", query="ethnode")`). Sweep every host and compare: +- `get_beacon_headers` → head slot + root (roots agree? → split detection) +- `get_finality_checkpoints` → finalized / justified epoch (advancing? same across nodes?) +- `get_node_syncing` → `is_syncing` + `sync_distance` + +**Read `sync_distance` explicitly** — it separates a split from a healthy spread. A node's wall-clock slot ≈ `head_slot + sync_distance`; take the max across nodes as the network's wall-clock slot. +- All `is_syncing = false`, `sync_distance ≈ 0`, head slots within 1–2 → **healthy** (propagation spread, not a split). +- Most nodes stuck far back with large `sync_distance` while a few keep up → those nodes **can't follow the chain** (wedged at a divergence block) → split / stalled, not normal lag. +- `finalized` identical and far behind wall-clock on **all** nodes → **finality stalled** network-wide; the first slot of the last finalized epoch (`finalized * 32`) marks the stall point and is the centre of your divergence-centred timeframe. + +Append the per-node sweep results (head slot/root, finalized/justified epoch, `is_syncing`, `sync_distance`) as a table, plus your read of it, then proceed to Phase 2. + ## Phase 2: Log Investigation with ClickHouse (`external.otel_logs`) Use Dora findings (if available) to target specific nodes. With logs only (no Dora), start from the `hosts` list discovered in Phase 0 to identify which nodes have issues. **Always filter by `ResourceAttributes['network']` and `Timestamp`** — unfiltered queries scan everything and may time out. All queries go through `clickhouse.query("clickhouse-raw", ...)` against `external.otel_logs`; see **How Devnet Logs Flow** above for the full schema and severity-matching details. **Use the same active timeframe** established in the Timeframe Rules section above. 
+**⚠️ ANSI stripping is for bounded queries only.** Log lines are terminal-coloured, so severity matching strips ANSI in a `clean` CTE first (`replaceRegexpAll(Body, '\x1b\[[0-9;?]*[A-Za-z]', '')`). But wrapping `Body` disables the `idx_body` skip-index and rewrites every scanned row, and (primary key led by `IngressUser`, data >7d on S3) a stripped-`Body` regex over a wide/multi-day window becomes a full scan. **Always pair the strip with a `host.name` filter + tight `Timestamp` window + `LIMIT`** — narrow to the suspect host/time first, then strip within that slice. Pass SQL as a raw string (`r"""`) so `\b`/`\x1b` survive. + **Node naming:** Most nodes follow `---` (e.g. `lighthouse-geth-super-1` → CL lighthouse, EL geth), but devnets also include bootnodes, MEV relays, and other non-paired nodes (`bootnode-1`, `mev-relay-1`) that do NOT match this pattern. Never derive node names from the convention — always use the `hosts` list discovered in Phase 0 (or Dora's `/v1/clients/consensus`). -**CL vs EL — important difference from Loki:** there is no `ethereum_cl` / `ethereum_el` label. A node VM runs the CL, EL, validator, and sidecar containers together; their logs are separated only by `LogAttributes['log.file.name']` (a per-container json-log file, named by hash). To investigate one client on a node, first discover its containers (step 3) and identify the CL/EL container by its log-line format, then filter on that log file. To sweep a client type across the network, filter on `host.name` (e.g. `host.name LIKE 'lighthouse-%'` for lighthouse-CL nodes, or `host.name LIKE '%-geth-%'` for geth-EL nodes) — but remember the result still mixes that node's CL/EL/sidecar lines. 
+**Exclude `bootnode-1` from cross-host triage by default.** Bootnodes flood multi-host sweeps with p2p noise (`Ping` deserialization, `ENR missing IP`, connection churn) that's never the root cause — add `AND ResourceAttributes['host.name'] != 'bootnode-1'` to any multi-host query (per-host queries below pin one `host.name`, so they're unaffected). Investigate the bootnode directly only if discovery/peering is the suspected problem. + +**CL vs EL in ClickHouse OTel logs:** there is no `ethereum_cl` / `ethereum_el` label. A node VM runs the CL, EL, validator, and sidecar containers together; their logs are separated only by `LogAttributes['log.file.name']` (a per-container json-log file, named by hash). To investigate one client on a node, first discover its containers (step 3) and identify the CL/EL container by its log-line format, then filter on that log file. To sweep a client type across the network, filter on `host.name` (e.g. `host.name LIKE 'lighthouse-%'` for lighthouse-CL nodes, or `host.name LIKE '%-geth-%'` for geth-EL nodes) — but remember the result still mixes that node's CL/EL/sidecar lines. **You SHOULD start with the consensus layer (CL).** Most devnet issues originate at the CL level. Only investigate EL logs if CL logs point to execution-side problems (e.g. payload validation errors, engine API failures). @@ -189,11 +236,12 @@ Use Dora findings (if available) to target specific nodes. 
With logs only (no Do network = "" host = "" - df = clickhouse.query("clickhouse-raw", """ + df = clickhouse.query("clickhouse-raw", r""" + WITH replaceRegexpAll(Body, '\x1b\[[0-9;?]*[A-Za-z]', '') AS clean SELECT LogAttributes['log.file.name'] AS container_log, count() AS lines, - any(substring(Body, 1, 120)) AS sample + any(substring(clean, 1, 120)) AS sample FROM external.otel_logs WHERE ResourceAttributes['network'] = {network:String} AND ResourceAttributes['host.name'] = {host:String} @@ -206,7 +254,7 @@ Use Dora findings (if available) to target specific nodes. With logs only (no Do Identify the client from each `sample` log format (e.g. lighthouse `MMM DD HH:MM:SS.mmm LEVEL ...`, geth `LEVEL [MM-DD|HH:MM:SS.mmm] ...`, prysm `level=... msg=...`). Append the node→container map to the debug report. -4. **Fetch CL errors first (CRIT/ERR)** - For each problematic node (or all CL nodes when there is no Dora target), fetch the most severe lines. `SeverityText` is usually empty for these Docker logs, so match severity on the raw `Body`: +4. **Fetch CL errors first (CRIT/ERR)** - For each problematic node (or all CL nodes when there is no Dora target), fetch the most severe lines. `SeverityText` is usually empty for these Docker logs, so match severity on the ANSI-stripped `Body` (the `clean` column in the query below) — and **anchor on the LEVEL token, do not substring-match "error"** (a bare `(?i)error` returns tens of thousands of benign DEBUG lines on a healthy network). Match the uppercase level token (case-sensitively) or logfmt `level=error`, and exclude DEBUG/TRACE. Per-client LEVEL tokens: lighthouse `ERROR`; geth/nethermind/erigon/reth/besu `ERROR [..]` or `|ERROR|`; prysm `[ts] ERROR` / `level=error`; nimbus `ERR`/`FAT` at line start; lodestar `level=error`: ```python from ethpandaops import clickhouse @@ -214,16 +262,19 @@ Use Dora findings (if available) to target specific nodes. 
With logs only (no Do network = "" host = "" - df = clickhouse.query("clickhouse-raw", """ + df = clickhouse.query("clickhouse-raw", r""" + WITH replaceRegexpAll(Body, '\x1b\[[0-9;?]*[A-Za-z]', '') AS clean SELECT Timestamp, ResourceAttributes['host.name'] AS host, LogAttributes['log.file.name'] AS container_log, - Body + clean AS Body FROM external.otel_logs WHERE ResourceAttributes['network'] = {network:String} AND ResourceAttributes['host.name'] = {host:String} - AND match(Body, '(?i)(crit|err|error|fatal)') + -- error-class LEVEL token only, matched on the ANSI-stripped line (uppercase token, nimbus 3-letter, or logfmt level=) + AND match(clean, '(^|[][ |])(CRIT|ERRO|ERROR|FATAL|PANIC)($|[][ |:])|^(ERR|FAT)\b|\blevel=(crit|error|fatal|panic)\b') + AND NOT match(clean, '(^|[][ |])(DEBUG|DBG|TRACE|TRC)($|[][ |:])|\blevel=(debug|trace)\b') AND Timestamp >= now() - INTERVAL 1 HOUR ORDER BY Timestamp DESC LIMIT 200 @@ -233,7 +284,7 @@ Use Dora findings (if available) to target specific nodes. With logs only (no Do Once you have identified the CL container's log file (step 3), add `AND LogAttributes['log.file.name'] = {container:String}` to isolate the CL client's lines from the EL and sidecars on the same node. - To sweep a CL client type across the whole network instead of one node, replace the host filter with `AND ResourceAttributes['host.name'] LIKE {cl_prefix:String}` and pass e.g. `{"cl_prefix": "lighthouse-%"}`. + To sweep a CL client type across the whole network instead of one node, replace the host filter with `AND ResourceAttributes['host.name'] LIKE {cl_prefix:String}` (and add `AND ResourceAttributes['host.name'] != 'bootnode-1'` per the bootnode-exclusion note above) and pass e.g. `{"cl_prefix": "lighthouse-%"}`. This is a wider scan with the ANSI strip in play — keep the `Timestamp` window short (≤1h) and the `LIMIT` tight, and prefer drilling into individual hosts once you have a target. Do not widen this to all hosts over a multi-day range. 
If multiple nodes are erroring, query each one. Look for common error patterns across nodes — the same error across nodes of one client type points to a client bug. @@ -253,7 +304,7 @@ Use Dora findings (if available) to target specific nodes. With logs only (no Do - **CL clean but EL errors** → EL struggling but CL compensating; monitor but may not be primary cause - **Both layers erroring** → shared dependency (disk, memory, network) or cascading failure -6. **Escalate to WARN/INFO if needed** - If CRIT/ERR lines are empty or inconclusive at both CL and EL, broaden the `Body` pattern to include `warn`, then drop the severity filter entirely for INFO/DEBUG. Unfiltered-severity queries are verbose — keep a tight `Timestamp` window and a `LIMIT`, and they may still time out. +6. **Escalate to WARN/INFO if needed** - If CRIT/ERR lines are empty or inconclusive at both CL and EL, add the WARN level token to the anchored pattern (`WARN`/`WRN` and `level=warn`), then drop the severity filter entirely for INFO/DEBUG. Unfiltered-severity queries are verbose — keep a tight `Timestamp` window and a `LIMIT`, and they may still time out. 7. **Correlate logs with Dora timeline** - **Only applicable when Dora data exists (Phase 1 ran).** You SHOULD match log timestamps against the Dora data: - When did errors start relative to missed slots or participation drops? @@ -288,8 +339,8 @@ Use Dora findings (if available) to target specific nodes. With logs only (no Do **If the ethnode module is available**, use direct node RPC queries via `from ethpandaops import ethnode` to validate hypotheses and gather concrete proof. Use `search(type="examples", query="ethnode")` for API patterns. Target the instances discovered in Phase 0 or identified as problematic in Phases 1–2. 
**When to use RPC:** -- **Network split suspected** → compare head slots/roots and finality checkpoints across nodes -- **Node offline/stuck** → check sync status and peer counts +- **Network split suspected** → compare head slots/roots and finality checkpoints across nodes (same sweep as the Phase 1 RPC baseline; read `sync_distance` per its interpretation there — `head_slot + sync_distance ≈ wall-clock slot`) +- **Node offline/stuck** → check sync status and peer counts; a large `sync_distance` that is not shrinking means the node is wedged, not merely catching up - **Verifying a hypothesis** → query nodes directly via `beacon_get` / `execution_rpc` - **Finality stalled** → compare finality checkpoints across all nodes diff --git a/runbooks/debug_local_devnet.md b/runbooks/debug_local_devnet.md index 74d3c014..c69e02d2 100644 --- a/runbooks/debug_local_devnet.md +++ b/runbooks/debug_local_devnet.md @@ -9,13 +9,22 @@ The first step in debugging a local devnet is discovering what tooling is availa **The user MUST specify which enclave to debug.** Do NOT assume an enclave — if the user hasn't specified one, ask them before proceeding. You can discover running enclaves with `kurtosis enclave ls`. -**Local devnets do NOT use the hosted ClickHouse datasources.** For logs, only use `clickhouse.query("local-kurtosis", ...)` when the `local-kurtosis` ClickHouse datasource is discovered. Do not use the hosted `clickhouse-raw`/`clickhouse-refined` datasources for local Kurtosis logs. +**Local devnets do NOT use the hosted ClickHouse datasources.** Start by capturing `panda datasources`, then only use `clickhouse.query("local-kurtosis", ...)` for logs when the `local-kurtosis` ClickHouse datasource is discovered. Do not use the hosted `clickhouse-raw`/`clickhouse-refined` datasources for local Kurtosis logs. Refer to the query skill for general API usage patterns (Dora overview, ClickHouse queries, direct HTTP calls, Dora link generation, etc.). 
This runbook only covers the debugging-specific procedure and API calls not in the skill. ## How OTel Logs Flow -Kurtosis devnet services emit logs to the devnet's `otel-collector`. The collector writes them into the Kurtosis ClickHouse service on HTTP port `18123`, database `otel`, table `otel_logs`. Panda starts an in-process local proxy that autodiscovers this ClickHouse when `/ping` returns `Ok.` and the `otel` database exists, then exposes it as the `local-kurtosis` ClickHouse datasource. Query it with SQL, always filtering by `EnclaveName` because one local ClickHouse can hold logs from multiple devnets. +Kurtosis devnet services emit logs to the devnet's `otel-collector`. The collector writes them into the Kurtosis ClickHouse service on HTTP port `18123`, database `otel`, table `otel_logs`. Panda starts an in-process local proxy that autodiscovers this ClickHouse when `/ping` returns `Ok.` and the `otel` database exists, then exposes it as the `local-kurtosis` ClickHouse datasource. Query it with SQL, always filtering by `EnclaveName` because one local ClickHouse can hold logs from multiple devnets. If `local-kurtosis` is absent from the advertised datasources, treat ClickHouse logs as unavailable and use the `kurtosis service logs` fallback. + +## Sandbox Session + +Each `execute_python` / `panda execute` call spins up a **fresh** sandbox unless you pin one session — so `/workspace` (and the debug report) is empty each step, and you can hit the 10-session limit mid-investigation. **Create one session at the start, reuse it for every step, destroy it at the end:** + +- **CLI:** `panda session create` (note the session id it prints), then pass `--session SESSION_ID` to every `panda execute`; run `panda session destroy SESSION_ID` when done. +- **MCP:** create/select a session with `manage_session` and pass it to every `execute_python` call. + +This is also what makes the `/workspace` debug report (next section) persist across steps. 
## Debug Report @@ -80,7 +89,7 @@ Before collecting data, determine what tooling is available in the Kurtosis encl - **Prometheus** (metrics): look for services containing `prometheus` in the enclave inspect output. If present, note its port. - Any other observability or debugging services the user may have included. - Example datasource check: + Example datasource check. Also capture `panda datasources` outside the sandbox and append that raw output to the debug report so the user can verify which datasource names were advertised: ```python from ethpandaops import clickhouse @@ -90,6 +99,7 @@ Before collecting data, determine what tooling is available in the Kurtosis encl for ds in clickhouse_datasources ] has_otel_clickhouse = "local-kurtosis" in clickhouse_names + print(f"has_otel_clickhouse={has_otel_clickhouse}") print(clickhouse_datasources) ``` @@ -110,12 +120,21 @@ Before collecting data, determine what tooling is available in the Kurtosis encl ## Phase 1: Data Collection with Dora -**Skip this phase if Phase 0 determined `has_dora = false`.** Instead, build a baseline by querying the CL and EL nodes directly via their localhost ports from enclave inspect. For each CL node, fetch `/eth/v1/node/syncing`, `/eth/v1/beacon/headers/head`, and `/eth/v1/beacon/states/head/finality_checkpoints`. For each EL node, call `eth_blockNumber` and `eth_syncing` via JSON-RPC. Compare head slots/roots across nodes to detect splits, and check finality checkpoints. Append results to the debug report, then proceed to Phase 2. +**Dora is the primary starting point** when present in the enclave — start there whenever `has_dora = true`. + +**Skip this phase if `has_dora = false`.** Instead, build a baseline from the nodes directly via their localhost ports: per CL node fetch `/eth/v1/node/syncing`, `/eth/v1/beacon/headers/head`, `/eth/v1/beacon/states/head/finality_checkpoints`; per EL node call `eth_blockNumber` + `eth_syncing`. 
Compare head slots/roots (splits) and finality, append to the report, then go to Phase 2. + +**Dora-tolerance:** Dora's epoch/overview endpoints can return `HTTP 500: PANIC: ... integer divide by zero` exactly when a network is degraded (non-finalizing / zero-participation epochs hit a divide-by-zero). **Wrap every Dora call in try/except.** On a panic, note it (it corroborates near-zero participation / no finality) and fall through to the direct CL/EL baseline above for the failing calls, keeping any that succeeded. + +**Read `sync_distance` explicitly** — it separates a split from a healthy spread. Wall-clock slot ≈ `head_slot + sync_distance`; take the max across nodes as the network wall-clock slot. +- All `is_syncing = false`, `sync_distance ≈ 0`, head slots within 1–2 → **healthy** (propagation spread, not a split). +- Most nodes stuck far back with large `sync_distance` while a few keep up → those nodes **can't follow the chain** (wedged at a divergence block) → split / stalled, not normal lag. +- `finalized` identical and far behind wall-clock on **all** nodes → **finality stalled** network-wide; the stall slot (`finalized.epoch * 32`, i.e. the first slot of the last finalized epoch) anchors your divergence-centred timeframe. 1. **Collect all Dora data** - If Dora is available in the enclave, query it via its localhost port. In a single step, gather all network data and append raw responses to the debug report. You MAY combine these into one `execute_python` call: - **Network overview** — use `search(type="examples", query="network overview")` for the pattern. Note: `current_slot` is `epoch * 32` (epoch's first slot), not actual head slot. - **Network forks** — use `search(type="examples", query="network splits")`. Query the Dora `/forks` endpoint (with `Accept: application/json` header) to detect splits. A healthy network has one fork. 
+ - **Network forks (split detection)** — **the reliable check is the node head-root sweep** (Phase 1 fallback below): compare `/eth/v1/beacon/headers/head` roots across CL nodes; divergent roots at the same slot mean a split. Dora `/forks` also reports forks, but the `search(type="examples", query="network splits")` httpx pattern only works if the local Dora port is reachable from your execution context — if it errors, fall back to the head-root sweep rather than assuming "no split." Healthy = one fork / one head root. - **Epoch details** — use `search(type="examples", query="epoch summary")`. Iterate through ~9 epochs per hour across the active timeframe. **Always start from head epoch - 1** (the most recent completed epoch) — the head epoch is still in progress and will show artificially low participation. You SHOULD also check the head epoch, but treat its data as preliminary since the epoch may not be finished — it is still useful for identifying offline proposers in recent slots. You SHOULD use try/except per epoch to handle failures without crashing. - **Missing proposers** — use `search(type="examples", query="missing proposers")`. Adjust `slot_lookback` to match the active timeframe (~300 slots per hour). - **Offline attesters** — use `search(type="examples", query="offline attesters")`. @@ -147,7 +166,18 @@ Use the autodiscovered `local-kurtosis` ClickHouse datasource. The local OTel ta Useful schema fields: - `otel.otel_logs`: `Timestamp DateTime64(9)`, `ServiceName LowCardinality(String)`, `Body String`, `SeverityText LowCardinality(String)`, `SeverityNumber UInt8`, `EnclaveName LowCardinality(String)`, `EnclaveUuid`, `ResourceAttributes Map(LowCardinality(String), String)`, `LogAttributes Map(LowCardinality(String), String)` -**Always filter by `EnclaveName` once you know it.** For service-level log queries, also filter by `ServiceName`. 
The Kurtosis OTel collector may leave `SeverityText` and `SeverityNumber` empty, so severity triage must use `match(Body, ...)` on the raw log line. Use the same active timeframe established in the Timeframe Rules section above. +**Always filter by `EnclaveName`** (and `ServiceName` for service-level queries). The Kurtosis collector may leave `SeverityText`/`SeverityNumber` empty, so severity comes from `Body` — which is terminal-coloured. **Strip ANSI in a `clean` CTE, then anchor on the LEVEL token — don't substring-match "error"** (a bare `(?i)error` matches tens of thousands of benign DEBUG lines; an un-stripped colour-wrapped `ERROR` defeats the anchors). Match the uppercase token (case-sensitively) or `level=error` on `clean`, excluding DEBUG/TRACE (per-client tokens: lighthouse `ERROR`, geth-style `ERROR [..]`, prysm `level=error`, nimbus `ERR`/`FAT`): + +```sql +-- strip ANSI first, then match the error-class LEVEL token on the cleaned line +WITH replaceRegexpAll(Body, '\x1b\[[0-9;?]*[A-Za-z]', '') AS clean +... WHERE EnclaveName = {enclave:String} AND ServiceName = {service:String} AND Timestamp >= now() - INTERVAL 1 HOUR + AND match(clean, '(^|[][ |])(CRIT|ERRO|ERROR|FATAL|PANIC)($|[][ |:])|^(ERR|FAT)\b|\blevel=(crit|error|fatal|panic)\b') + AND NOT match(clean, '(^|[][ |])(DEBUG|DBG|TRACE|TRC)($|[][ |:])|\blevel=(debug|trace)\b') +... LIMIT 200 +``` + +Pass SQL as a raw string (`r"""`) so `\b`/`\x1b` survive (a normal string turns `\b` into a backspace byte). The strip wraps `Body`, so keep queries bounded — `EnclaveName` + `ServiceName` + tight `Timestamp` window + `LIMIT`; don't run a stripped-`Body` regex over a wide/multi-day range. Use the active timeframe from Timeframe Rules. 
**FIRST: discover enclaves present in the OTel logs table** ```python @@ -191,15 +221,17 @@ from ethpandaops import clickhouse enclave = "" -cl_errors = clickhouse.query("local-kurtosis", """ +cl_errors = clickhouse.query("local-kurtosis", r""" + WITH replaceRegexpAll(Body, '\x1b\[[0-9;?]*[A-Za-z]', '') AS clean SELECT Timestamp, ServiceName, - Body + clean AS Body FROM otel.otel_logs WHERE EnclaveName = {enclave:String} AND ServiceName LIKE 'cl-%' - AND match(Body, '(?i)(crit|err|error|fatal)') + AND match(clean, '(^|[][ |])(CRIT|ERRO|ERROR|FATAL|PANIC)($|[][ |:])|^(ERR|FAT)\b|\blevel=(crit|error|fatal|panic)\b') + AND NOT match(clean, '(^|[][ |])(DEBUG|DBG|TRACE|TRC)($|[][ |:])|\blevel=(debug|trace)\b') AND Timestamp >= now() - INTERVAL 1 HOUR ORDER BY Timestamp DESC LIMIT 200 @@ -214,15 +246,17 @@ from ethpandaops import clickhouse enclave = "" service = "" -service_logs = clickhouse.query("local-kurtosis", """ +service_logs = clickhouse.query("local-kurtosis", r""" + WITH replaceRegexpAll(Body, '\x1b\[[0-9;?]*[A-Za-z]', '') AS clean SELECT Timestamp, ServiceName, - Body + clean AS Body FROM otel.otel_logs WHERE EnclaveName = {enclave:String} AND ServiceName = {service:String} - AND match(Body, '(?i)(crit|err|error|fatal)') + AND match(clean, '(^|[][ |])(CRIT|ERRO|ERROR|FATAL|PANIC)($|[][ |:])|^(ERR|FAT)\b|\blevel=(crit|error|fatal|panic)\b') + AND NOT match(clean, '(^|[][ |])(DEBUG|DBG|TRACE|TRC)($|[][ |:])|\blevel=(debug|trace)\b') AND Timestamp >= now() - INTERVAL 1 HOUR ORDER BY Timestamp DESC LIMIT 200 @@ -236,15 +270,18 @@ from ethpandaops import clickhouse enclave = "" -el_logs = clickhouse.query("local-kurtosis", """ +el_logs = clickhouse.query("local-kurtosis", r""" + WITH replaceRegexpAll(Body, '\x1b\[[0-9;?]*[A-Za-z]', '') AS clean SELECT Timestamp, ServiceName, - Body + clean AS Body FROM otel.otel_logs WHERE EnclaveName = {enclave:String} AND ServiceName LIKE 'el-%' - AND match(Body, '(?i)(crit|err|error|fatal|warn)') + -- error-class + WARN level 
token (EL triage includes warnings); excludes debug/trace + AND match(clean, '(^|[][ |])(CRIT|ERRO|ERROR|FATAL|PANIC|WARN|WRN)($|[][ |:])|^(ERR|FAT|WRN)\b|\blevel=(crit|error|fatal|panic|warn|warning)\b') + AND NOT match(clean, '(^|[][ |])(DEBUG|DBG|TRACE|TRC)($|[][ |:])|\blevel=(debug|trace)\b') AND Timestamp >= now() - INTERVAL 1 HOUR ORDER BY Timestamp DESC LIMIT 200 @@ -273,7 +310,7 @@ Regardless of which log source is used, follow this procedure: 4. **Fetch CL logs first (CRIT/ERR)** - For each problematic node (or all CL clients if no specific targets), query CL logs at the most severe log levels. - Log level formats vary by client. In OTel logs, start with `match(Body, '(?i)(crit|err|error|fatal)')`; if needed, broaden the Body pattern to include `warn`, then INFO-level terms. + Log level formats vary by client. In OTel logs, anchor on the LEVEL token (uppercase token or logfmt `level=error`) and exclude DEBUG/TRACE — use the anchored pattern shown above, not a bare `(?i)error` substring. If needed, add the WARN token to the pattern, then drop the severity filter for INFO-level terms. If multiple nodes are offline, you MUST query each one. Look for common error patterns across nodes — the same error on multiple CL nodes likely points to a shared cause (CL client bug, consensus rule issue). diff --git a/sandbox/ethpandaops/ethpandaops/__init__.py b/sandbox/ethpandaops/ethpandaops/__init__.py index 04d497dc..98877222 100644 --- a/sandbox/ethpandaops/ethpandaops/__init__.py +++ b/sandbox/ethpandaops/ethpandaops/__init__.py @@ -1,16 +1,20 @@ """ethpandaops data access library for Ethereum network analytics. 
This library provides direct access to Ethereum network data: -- ClickHouse: Raw and aggregated blockchain data +- ClickHouse: Raw and aggregated blockchain data — including container logs + (hosted devnet / platform logs live in external.otel_logs) - Prometheus: Infrastructure metrics -- Loki: Log data +- Ethnode: Direct Ethereum node RPC (beacon + execution) - Storage: S3-compatible file storage for outputs +- Loki: log datasource, present only when a deployment advertises one + (on ethpandaops infra, devnet logs are in ClickHouse external.otel_logs, not Loki — + check list_datasources() to see what's actually available) Use list_datasources() on each module to discover available datasources or check the datasources://list MCP resource. Example usage: - from ethpandaops import clickhouse, prometheus, loki, storage + from ethpandaops import clickhouse, prometheus, ethnode, storage # List available ClickHouse clusters clusters = clickhouse.list_datasources()