From 89c4a6f1e37dc5f6a5413e1d8f20ff64c9047def Mon Sep 17 00:00:00 2001 From: Stefan Date: Sat, 6 Jun 2026 12:58:59 +0200 Subject: [PATCH 1/6] fix(cli): reduce avoidable errors in panda CLI investigations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hosted-devnet CLI sessions repeatedly hit avoidable errors: querying the disabled Loki module, guessing non-existent ethnode instance names (bare 502s), parsing null head-epoch fields, and using a bare `otel_logs` that only resolves on local clusters. Each failure left the agent with a generic or absent hint that did not point at the right discovery command. - clickhouse: auto-qualify bare otel_logs/otel_traces to the database that owns them per datasource (external on hosted, otel on local-kurtosis), so the same query works in both deployment modes. Already-qualified refs and lookalike identifiers are preserved. - cli: message-aware error hints — ClickHouse UNKNOWN_TABLE/UNKNOWN_DATABASE now points at `panda schema` and names the real otel_logs locations; a 502 hint explains an ethnode instance may not exist and how to discover names. - loki: redirect failed commands to the ClickHouse OTel log tables with a ready-to-run query example. - ethnode: document that instances cannot be enumerated and the client-pair convention is not guessable; show how to discover real names. - dora: document that head/in-progress epochs return null fields (use head_epoch-1) and that summary fields nest under the `data` key. --- pkg/cli/dora.go | 14 +++- pkg/cli/ethnode.go | 16 +++++ pkg/cli/loki_cmd.go | 49 ++++++++++++-- pkg/cli/server_helpers.go | 25 ++++++- pkg/server/operations_clickhouse.go | 51 ++++++++++++++ pkg/server/operations_clickhouse_test.go | 85 ++++++++++++++++++++++++ 6 files changed, 233 insertions(+), 7 deletions(-) create mode 100644 pkg/server/operations_clickhouse_test.go diff --git a/pkg/cli/dora.go b/pkg/cli/dora.go index 2f4f9c80..16f8f008 100644 --- a/pkg/cli/dora.go +++ b/pkg/cli/dora.go @@ -136,7 +136,19 @@ var doraSlotCmd = &cobra.Command{ var doraEpochCmd = &cobra.Command{ Use: "epoch ", Short: "Get epoch summary (always JSON)", - Args: cobra.ExactArgs(2), + Long: `Get the Dora summary for a single epoch (always JSON). + +The summary fields are nested under the top-level "data" key +(e.g. data.finalized, data.globalParticipationRate, data.attestationCount). + +Query a COMPLETED epoch. The head (in-progress) epoch — and any future epoch — +has no aggregated data yet, so its fields come back null/None. For a current +snapshot use head_epoch - 1 (a completed epoch); the head epoch also reports +artificially low participation because it is still being filled. + +Examples: + panda dora epoch hoodi 100`, + Args: cobra.ExactArgs(2), RunE: func(cmd *cobra.Command, args []string) error { response, err := runServerOperationRaw(cmd, "dora.get_epoch", map[string]any{ "network": args[0], diff --git a/pkg/cli/ethnode.go b/pkg/cli/ethnode.go index 25a18a1d..edc279a8 100644 --- a/pkg/cli/ethnode.go +++ b/pkg/cli/ethnode.go @@ -16,6 +16,18 @@ var ethnodeCmd = &cobra.Command{ Long: `Direct access to Ethereum beacon and execution node APIs. Nodes are identified by network and instance name (e.g., "lighthouse-geth-1"). +Discovering instance names: + The proxy relays to any / host on demand and holds NO + enumerable instance list — there is no "list nodes" command, and the + -- convention is NOT guessable (a plausible name like + "teku-geth-1" may simply not exist, which surfaces as an HTTP 502). Get the + real names from a source that knows what nodes exist: + • Dora: panda dora ... (the /v1/clients/consensus endpoint lists CL nodes) + • OTel logs: panda clickhouse query clickhouse-raw \ + "SELECT DISTINCT ResourceAttributes['host.name'] FROM otel_logs \ + WHERE ResourceAttributes['network'] = '' \ + AND Timestamp >= now() - INTERVAL 1 HOUR" + Examples: panda ethnode list-datasources panda ethnode syncing dencun-devnet-12 lighthouse-geth-1 @@ -309,6 +321,10 @@ var ethNodeBeaconGetCmd = &cobra.Command{ Long: `Make a GET request to any beacon API endpoint. The path should start with / (e.g., /eth/v1/node/identity). + must be a real node name — instances cannot be enumerated and the +client-pair convention is not guessable. An HTTP 502 usually means the instance +does not exist; see 'panda ethnode --help' for how to discover valid names. + Examples: panda ethnode beacon-get my-devnet lighthouse-geth-1 /eth/v1/node/identity panda ethnode beacon-get my-devnet lighthouse-geth-1 /eth/v1/config/deposit_contract`, diff --git a/pkg/cli/loki_cmd.go b/pkg/cli/loki_cmd.go index d5d0ea51..fae32528 100644 --- a/pkg/cli/loki_cmd.go +++ b/pkg/cli/loki_cmd.go @@ -4,11 +4,45 @@ import ( "encoding/json" "fmt" "strconv" + "strings" "time" "github.com/spf13/cobra" ) +// lokiRedirectSynopsis points users at the ClickHouse OTel log tables, which +// have replaced Loki for container logs. +const lokiRedirectSynopsis = `Loki is not available here. Container logs now live in ClickHouse — query them with 'panda clickhouse query': + + • Hosted (multi-VM) devnets/testnets → external.otel_logs (filter ResourceAttributes['network']) + • Local Kurtosis devnets → otel.otel_logs (filter EnclaveName) + +A bare 'otel_logs' is auto-resolved to the right database for the datasource. Example: + + panda clickhouse query clickhouse-raw "SELECT Timestamp, ResourceAttributes['host.name'] AS host, Body \ + FROM otel_logs \ + WHERE ResourceAttributes['network'] = '' \ + AND Timestamp >= now() - INTERVAL 1 HOUR \ + AND match(Body, '(?i)error') \ + ORDER BY Timestamp DESC LIMIT 100"` + +// redirectLokiError appends the ClickHouse redirect synopsis when a Loki +// command failed because the Loki module is unavailable on this server. +func redirectLokiError(err error) error { + if err == nil { + return nil + } + + msg := err.Error() + if !strings.Contains(msg, "not enabled") && + !strings.Contains(msg, "not available") && + !strings.Contains(msg, "HTTP 404") { + return err + } + + return fmt.Errorf("%w\n\n%s", err, lokiRedirectSynopsis) +} + var ( lokiLimit int lokiStart string @@ -23,6 +57,11 @@ var lokiCmd = &cobra.Command{ Short: "Query Loki logs", Long: `Query Loki for log data. +NOTE: Loki has been superseded by the ClickHouse OTel log tables for container +logs. If these commands report Loki is unavailable, query 'external.otel_logs' +(hosted) / 'otel.otel_logs' (local Kurtosis) via 'panda clickhouse query' +instead — the error output prints a ready-to-run example. + Examples: panda loki list-datasources panda loki query ethpandaops '{app="beacon-node"}' @@ -72,7 +111,7 @@ var lokiListDatasourcesCmd = &cobra.Command{ RunE: func(cmd *cobra.Command, _ []string) error { response, err := runServerOperation(cmd, "loki.list_datasources", map[string]any{}) if err != nil { - return err + return redirectLokiError(err) } return printDatasourceList(response) @@ -93,7 +132,7 @@ var lokiQueryCmd = &cobra.Command{ "direction": lokiDirection, }) if err != nil { - return err + return redirectLokiError(err) } if isJSON() { @@ -117,7 +156,7 @@ var lokiQueryInstantCmd = &cobra.Command{ "direction": lokiDirection, }) if err != nil { - return err + return redirectLokiError(err) } if isJSON() { @@ -139,7 +178,7 @@ var lokiLabelsCmd = &cobra.Command{ "end": lokiEnd, }) if err != nil { - return err + return redirectLokiError(err) } if isJSON() { @@ -162,7 +201,7 @@ var lokiLabelValuesCmd = &cobra.Command{ "end": lokiEnd, }) if err != nil { - return err + return redirectLokiError(err) } if isJSON() { diff --git a/pkg/cli/server_helpers.go b/pkg/cli/server_helpers.go index cb1de2d2..7aa619c5 100644 --- a/pkg/cli/server_helpers.go +++ b/pkg/cli/server_helpers.go @@ -454,17 +454,40 @@ func decodeAPIError(status int, data []byte) error { return fmt.Errorf("HTTP %d: %s", status, message) } -func serverErrorHint(status int, _ string) string { +func serverErrorHint(status int, message string) string { + // Message-specific hints take precedence over status-only hints: a ClickHouse + // "unknown table" error arrives as a generic HTTP error, so the status alone + // would mislead the user toward a missing-datasource hint. + if isUnknownTableError(message) { + return "ClickHouse could not find that table. List the real tables with 'panda schema '. " + + "Hosted devnet/testnet container logs live in 'external.otel_logs' (a bare 'otel_logs' is auto-resolved); " + + "local Kurtosis devnet logs live in 'otel.otel_logs'." + } + switch status { case http.StatusNotFound: return "the requested module, operation, datasource, or resource is not available on this server; check 'panda datasources' and 'panda resources list'" case http.StatusServiceUnavailable: return "the server is running but a required service (e.g. sandbox) is not available — check server logs with 'docker compose logs server'" + case http.StatusBadGateway: + return "the upstream node or datasource returned an error or was unreachable; if this is an ethnode call, the instance name may not exist — node instances cannot be enumerated, so discover real names from Dora ('/v1/clients/consensus') or the 'host.name' values in 'external.otel_logs'" default: return "" } } +// isUnknownTableError reports whether a server error message describes a +// ClickHouse missing-table/database condition (e.g. UNKNOWN_TABLE / Code: 60, +// or UNKNOWN_DATABASE / Code: 81). +func isUnknownTableError(message string) bool { + lower := strings.ToLower(message) + + return strings.Contains(lower, "unknown table") || + strings.Contains(lower, "unknown_table") || + strings.Contains(lower, "unknown database") || + strings.Contains(lower, "unknown_database") +} + func isConnectionRefused(err error) bool { if errors.Is(err, syscall.ECONNREFUSED) { return true diff --git a/pkg/server/operations_clickhouse.go b/pkg/server/operations_clickhouse.go index 12a957f6..24204e6c 100644 --- a/pkg/server/operations_clickhouse.go +++ b/pkg/server/operations_clickhouse.go @@ -4,12 +4,20 @@ import ( "fmt" "net/http" "net/url" + "regexp" "strings" "github.com/ethpandaops/panda/pkg/operations" "github.com/ethpandaops/panda/pkg/proxy/handlers" ) +// unqualifiedOtelTable matches a bare (database-unqualified) reference to the +// OpenTelemetry log/trace tables. The leading group captures the boundary char +// (or start-of-string) so qualified references like `external.otel_logs`, +// `otel.otel_logs`, or a column named `my_otel_logs` are left untouched — +// Go's RE2 has no lookbehind, so the boundary is captured and re-emitted. +var unqualifiedOtelTable = regexp.MustCompile(`(?i)(^|[^.\w])(otel_logs|otel_traces)\b`) + func (s *service) handleClickHouseOperation(operationID string, w http.ResponseWriter, r *http.Request) bool { switch operationID { case "clickhouse.list_datasources": @@ -63,6 +71,8 @@ func (s *service) handleClickHouseQuery(w http.ResponseWriter, r *http.Request) return } + sql = s.qualifyOtelTables(datasource, sql) + params := url.Values{"default_format": {"TabSeparatedWithNames"}} for key, value := range optionalMapArg(req.Args, "parameters") { params.Set("param_"+key, formatClickHouseParamValue(value)) @@ -93,6 +103,47 @@ func (s *service) handleClickHouseQuery(w http.ResponseWriter, r *http.Request) writePassthroughResponse(w, http.StatusOK, headers.Get("Content-Type"), body) } +// qualifyOtelTables makes a bare `otel_logs` / `otel_traces` reference resolve +// against the right database so the same query works in both deployment modes. +// The OTel log tables live in different databases per cluster: local Kurtosis +// devnets expose them in `otel` (the local-kurtosis default database, where a +// bare reference already resolves), while the hosted clickhouse-raw cluster +// keeps them in `external` (its default database is `default`, so a bare +// reference fails). We qualify unqualified references to the database that owns +// them for this datasource; already-qualified references are left untouched. +func (s *service) qualifyOtelTables(datasource, sql string) string { + return qualifyOtelTablesSQL(sql, s.otelDatabaseFor(datasource)) +} + +// qualifyOtelTablesSQL prefixes every unqualified otel_logs/otel_traces +// reference in sql with targetDB. Already-qualified references and lookalike +// identifiers (e.g. a column named my_otel_logs) are preserved. +func qualifyOtelTablesSQL(sql, targetDB string) string { + if targetDB == "" || !unqualifiedOtelTable.MatchString(sql) { + return sql + } + + return unqualifiedOtelTable.ReplaceAllString(sql, "${1}"+targetDB+".${2}") +} + +// otelDatabaseFor returns the database that owns the OTel log/trace tables for +// the given ClickHouse datasource. Local Kurtosis clusters (default database +// `otel`) own them in `otel`; everywhere else they live in `external`. +func (s *service) otelDatabaseFor(datasource string) string { + for _, info := range s.proxyService.ClickHouseDatasourceInfo() { + if info.Name == datasource { + if info.Metadata["database"] == "otel" { + return "otel" + } + + return "external" + } + } + + // Datasource not advertised (yet) — assume the hosted convention. + return "external" +} + func formatClickHouseParamValue(value any) string { switch v := value.(type) { case nil: diff --git a/pkg/server/operations_clickhouse_test.go b/pkg/server/operations_clickhouse_test.go new file mode 100644 index 00000000..4130e652 --- /dev/null +++ b/pkg/server/operations_clickhouse_test.go @@ -0,0 +1,85 @@ +package server + +import "testing" + +func TestQualifyOtelTablesSQL(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + sql string + targetDB string + want string + }{ + { + name: "bare otel_logs qualified to external", + sql: "SELECT Body FROM otel_logs WHERE Timestamp > now()", + targetDB: "external", + want: "SELECT Body FROM external.otel_logs WHERE Timestamp > now()", + }, + { + name: "bare otel_logs qualified to otel for local", + sql: "SELECT Body FROM otel_logs", + targetDB: "otel", + want: "SELECT Body FROM otel.otel_logs", + }, + { + name: "bare otel_traces qualified", + sql: "SELECT * FROM otel_traces", + targetDB: "external", + want: "SELECT * FROM external.otel_traces", + }, + { + name: "already-qualified external left untouched", + sql: "SELECT * FROM external.otel_logs", + targetDB: "external", + want: "SELECT * FROM external.otel_logs", + }, + { + name: "already-qualified internal left untouched", + sql: "SELECT * FROM internal.otel_logs", + targetDB: "external", + want: "SELECT * FROM internal.otel_logs", + }, + { + name: "already-qualified otel left untouched", + sql: "SELECT * FROM otel.otel_logs", + targetDB: "external", + want: "SELECT * FROM otel.otel_logs", + }, + { + name: "lookalike identifier preserved", + sql: "SELECT my_otel_logs FROM external.otel_logs", + targetDB: "external", + want: "SELECT my_otel_logs FROM external.otel_logs", + }, + { + name: "join across both tables qualified", + sql: "SELECT * FROM otel_logs l JOIN otel_traces t ON l.TraceId = t.TraceId", + targetDB: "external", + want: "SELECT * FROM external.otel_logs l JOIN external.otel_traces t ON l.TraceId = t.TraceId", + }, + { + name: "empty target db is a no-op", + sql: "SELECT * FROM otel_logs", + targetDB: "", + want: "SELECT * FROM otel_logs", + }, + { + name: "unrelated query untouched", + sql: "SELECT count() FROM fct_block_head", + targetDB: "external", + want: "SELECT count() FROM fct_block_head", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + if got := qualifyOtelTablesSQL(tt.sql, tt.targetDB); got != tt.want { + t.Fatalf("qualifyOtelTablesSQL(%q, %q) = %q, want %q", tt.sql, tt.targetDB, got, tt.want) + } + }) + } +} From ab4c01af22480f831596f55d10dab91a47520bc0 Mon Sep 17 00:00:00 2001 From: Stefan Date: Sat, 6 Jun 2026 13:02:08 +0200 Subject: [PATCH 2/6] feat(cli): add `panda ethnode list-networks` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ethnode.list_networks server operation and Python function already existed and worked; only the CLI wrapper was missing. Add it, mirroring list-datasources and reusing printListing. Note this lists NETWORKS, not node instances — instance names still cannot be enumerated (the proxy holds no enumerable instance list), so the help continues to point at Dora / otel_logs host.name for node discovery. --- pkg/cli/ethnode.go | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/pkg/cli/ethnode.go b/pkg/cli/ethnode.go index edc279a8..94ba8b66 100644 --- a/pkg/cli/ethnode.go +++ b/pkg/cli/ethnode.go @@ -30,6 +30,7 @@ Discovering instance names: Examples: panda ethnode list-datasources + panda ethnode list-networks panda ethnode syncing dencun-devnet-12 lighthouse-geth-1 panda ethnode peers dencun-devnet-12 lighthouse-geth-1 panda ethnode finality dencun-devnet-12 lighthouse-geth-1 @@ -41,6 +42,7 @@ func init() { ethnodeCmd.AddCommand( ethNodeListDatasourcesCmd, + ethNodeListNetworksCmd, ethNodeSyncingCmd, ethNodeVersionCmd, ethNodeHealthCmd, @@ -77,6 +79,24 @@ var ethNodeListDatasourcesCmd = &cobra.Command{ }, } +var ethNodeListNetworksCmd = &cobra.Command{ + Use: "list-networks", + Short: "List networks reachable for direct node access", + Long: `List the networks that support direct ethnode (beacon/execution) access. + +This lists NETWORKS, not node instances. Instance names cannot be enumerated — +see 'panda ethnode --help' for how to discover the node names within a network.`, + Args: cobra.NoArgs, + RunE: func(cmd *cobra.Command, _ []string) error { + response, err := runServerOperation(cmd, "ethnode.list_networks", map[string]any{}) + if err != nil { + return err + } + + return printListing(response, "networks", "No networks reachable for direct node access.") + }, +} + var ethNodeSyncingCmd = &cobra.Command{ Use: "syncing ", Short: "Get beacon node sync status", From 0ae74407bce93db85f33aaf647a020392ca1258d Mon Sep 17 00:00:00 2001 From: Stefan Date: Sat, 6 Jun 2026 13:08:04 +0200 Subject: [PATCH 3/6] fix(cli): correct dora epoch help field names and null claim Live verification against glamsterdam-devnet-5 showed the Dora epoch fields are lowercase (globalparticipationrate, attestationscount), not camelCase, and the head epoch reports partial/artificially-low values rather than strict null (null only at the boundary / for future epochs). Correct the help so it does not point users at field names that are always None. --- pkg/cli/dora.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pkg/cli/dora.go b/pkg/cli/dora.go index 16f8f008..d1234c21 100644 --- a/pkg/cli/dora.go +++ b/pkg/cli/dora.go @@ -138,13 +138,13 @@ var doraEpochCmd = &cobra.Command{ Short: "Get epoch summary (always JSON)", Long: `Get the Dora summary for a single epoch (always JSON). -The summary fields are nested under the top-level "data" key -(e.g. data.finalized, data.globalParticipationRate, data.attestationCount). +The summary fields are nested under the top-level "data" key and use lowercase +names (e.g. data.finalized, data.globalparticipationrate, data.attestationscount). -Query a COMPLETED epoch. The head (in-progress) epoch — and any future epoch — -has no aggregated data yet, so its fields come back null/None. For a current -snapshot use head_epoch - 1 (a completed epoch); the head epoch also reports -artificially low participation because it is still being filled. +Query a COMPLETED epoch. A future epoch returns no data, and the head +(in-progress) epoch reports partial, artificially low participation because it +is still being filled (and can be null right at the epoch boundary). For a +reliable snapshot use head_epoch - 1. Examples: panda dora epoch hoodi 100`, From 341d98fa5b7e9a22686ef7dd9fd2a6ad8781686f Mon Sep 17 00:00:00 2001 From: Stefan Date: Tue, 9 Jun 2026 15:24:28 +0200 Subject: [PATCH 4/6] review: address samcm's feedback on cli-error-ergonomics - Remove server-side otel table auto-qualification (qualifyOtelTables): fix this upstream in docs/discovery instead so the agent generates fully-qualified queries from the start - Move ClickHouse unknown-table error hint from the generic serverErrorHint helper to wrapClickHouseError in clickhouse_cmd.go, closer to the datasource that produces the error - Remove StatusBadGateway case with datasource-specific content from the generic helper; ethnode command Long description already covers instance-name discovery - Delete the now-orphaned qualifyOtelTablesSQL test file --- pkg/cli/clickhouse_cmd.go | 19 +++++- pkg/cli/server_helpers.go | 25 +------ pkg/server/operations_clickhouse.go | 51 -------------- pkg/server/operations_clickhouse_test.go | 85 ------------------------ 4 files changed, 19 insertions(+), 161 deletions(-) delete mode 100644 pkg/server/operations_clickhouse_test.go diff --git a/pkg/cli/clickhouse_cmd.go b/pkg/cli/clickhouse_cmd.go index 00539dcc..a4250770 100644 --- a/pkg/cli/clickhouse_cmd.go +++ b/pkg/cli/clickhouse_cmd.go @@ -4,6 +4,7 @@ import ( "bytes" "encoding/csv" "fmt" + "strings" "github.com/spf13/cobra" ) @@ -76,7 +77,7 @@ func runClickHouseOperation(cmd *cobra.Command, operationID, datasource, sql str "sql": sql, }) if err != nil { - return err + return wrapClickHouseError(err) } if raw { @@ -125,6 +126,22 @@ func printClickHouseJSON(data []byte, raw bool) error { }) } +func wrapClickHouseError(err error) error { + if err == nil { + return nil + } + + lower := strings.ToLower(err.Error()) + if strings.Contains(lower, "unknown table") || strings.Contains(lower, "unknown_table") || + strings.Contains(lower, "unknown database") || strings.Contains(lower, "unknown_database") { + return fmt.Errorf("%w\n\n hint: ClickHouse could not find that table. List real tables with 'panda schema '. "+ + "OTel log tables must be fully qualified: use 'external.otel_logs' for hosted devnets/testnets, "+ + "or 'otel.otel_logs' for local Kurtosis devnets.", err) + } + + return err +} + func parseClickHouseTSV(data []byte) ([]string, [][]string, error) { trimmed := bytes.TrimSpace(data) if len(trimmed) == 0 { diff --git a/pkg/cli/server_helpers.go b/pkg/cli/server_helpers.go index 7aa619c5..cb1de2d2 100644 --- a/pkg/cli/server_helpers.go +++ b/pkg/cli/server_helpers.go @@ -454,40 +454,17 @@ func decodeAPIError(status int, data []byte) error { return fmt.Errorf("HTTP %d: %s", status, message) } -func serverErrorHint(status int, message string) string { - // Message-specific hints take precedence over status-only hints: a ClickHouse - // "unknown table" error arrives as a generic HTTP error, so the status alone - // would mislead the user toward a missing-datasource hint. - if isUnknownTableError(message) { - return "ClickHouse could not find that table. List the real tables with 'panda schema '. " + - "Hosted devnet/testnet container logs live in 'external.otel_logs' (a bare 'otel_logs' is auto-resolved); " + - "local Kurtosis devnet logs live in 'otel.otel_logs'." - } - +func serverErrorHint(status int, _ string) string { switch status { case http.StatusNotFound: return "the requested module, operation, datasource, or resource is not available on this server; check 'panda datasources' and 'panda resources list'" case http.StatusServiceUnavailable: return "the server is running but a required service (e.g. sandbox) is not available — check server logs with 'docker compose logs server'" - case http.StatusBadGateway: - return "the upstream node or datasource returned an error or was unreachable; if this is an ethnode call, the instance name may not exist — node instances cannot be enumerated, so discover real names from Dora ('/v1/clients/consensus') or the 'host.name' values in 'external.otel_logs'" default: return "" } } -// isUnknownTableError reports whether a server error message describes a -// ClickHouse missing-table/database condition (e.g. UNKNOWN_TABLE / Code: 60, -// or UNKNOWN_DATABASE / Code: 81). -func isUnknownTableError(message string) bool { - lower := strings.ToLower(message) - - return strings.Contains(lower, "unknown table") || - strings.Contains(lower, "unknown_table") || - strings.Contains(lower, "unknown database") || - strings.Contains(lower, "unknown_database") -} - func isConnectionRefused(err error) bool { if errors.Is(err, syscall.ECONNREFUSED) { return true diff --git a/pkg/server/operations_clickhouse.go b/pkg/server/operations_clickhouse.go index 24204e6c..12a957f6 100644 --- a/pkg/server/operations_clickhouse.go +++ b/pkg/server/operations_clickhouse.go @@ -4,20 +4,12 @@ import ( "fmt" "net/http" "net/url" - "regexp" "strings" "github.com/ethpandaops/panda/pkg/operations" "github.com/ethpandaops/panda/pkg/proxy/handlers" ) -// unqualifiedOtelTable matches a bare (database-unqualified) reference to the -// OpenTelemetry log/trace tables. The leading group captures the boundary char -// (or start-of-string) so qualified references like `external.otel_logs`, -// `otel.otel_logs`, or a column named `my_otel_logs` are left untouched — -// Go's RE2 has no lookbehind, so the boundary is captured and re-emitted. -var unqualifiedOtelTable = regexp.MustCompile(`(?i)(^|[^.\w])(otel_logs|otel_traces)\b`) - func (s *service) handleClickHouseOperation(operationID string, w http.ResponseWriter, r *http.Request) bool { switch operationID { case "clickhouse.list_datasources": @@ -71,8 +63,6 @@ func (s *service) handleClickHouseQuery(w http.ResponseWriter, r *http.Request) return } - sql = s.qualifyOtelTables(datasource, sql) - params := url.Values{"default_format": {"TabSeparatedWithNames"}} for key, value := range optionalMapArg(req.Args, "parameters") { params.Set("param_"+key, formatClickHouseParamValue(value)) @@ -103,47 +93,6 @@ func (s *service) handleClickHouseQuery(w http.ResponseWriter, r *http.Request) writePassthroughResponse(w, http.StatusOK, headers.Get("Content-Type"), body) } -// qualifyOtelTables makes a bare `otel_logs` / `otel_traces` reference resolve -// against the right database so the same query works in both deployment modes. -// The OTel log tables live in different databases per cluster: local Kurtosis -// devnets expose them in `otel` (the local-kurtosis default database, where a -// bare reference already resolves), while the hosted clickhouse-raw cluster -// keeps them in `external` (its default database is `default`, so a bare -// reference fails). We qualify unqualified references to the database that owns -// them for this datasource; already-qualified references are left untouched. -func (s *service) qualifyOtelTables(datasource, sql string) string { - return qualifyOtelTablesSQL(sql, s.otelDatabaseFor(datasource)) -} - -// qualifyOtelTablesSQL prefixes every unqualified otel_logs/otel_traces -// reference in sql with targetDB. Already-qualified references and lookalike -// identifiers (e.g. a column named my_otel_logs) are preserved. -func qualifyOtelTablesSQL(sql, targetDB string) string { - if targetDB == "" || !unqualifiedOtelTable.MatchString(sql) { - return sql - } - - return unqualifiedOtelTable.ReplaceAllString(sql, "${1}"+targetDB+".${2}") -} - -// otelDatabaseFor returns the database that owns the OTel log/trace tables for -// the given ClickHouse datasource. Local Kurtosis clusters (default database -// `otel`) own them in `otel`; everywhere else they live in `external`. -func (s *service) otelDatabaseFor(datasource string) string { - for _, info := range s.proxyService.ClickHouseDatasourceInfo() { - if info.Name == datasource { - if info.Metadata["database"] == "otel" { - return "otel" - } - - return "external" - } - } - - // Datasource not advertised (yet) — assume the hosted convention. - return "external" -} - func formatClickHouseParamValue(value any) string { switch v := value.(type) { case nil: diff --git a/pkg/server/operations_clickhouse_test.go b/pkg/server/operations_clickhouse_test.go deleted file mode 100644 index 4130e652..00000000 --- a/pkg/server/operations_clickhouse_test.go +++ /dev/null @@ -1,85 +0,0 @@ -package server - -import "testing" - -func TestQualifyOtelTablesSQL(t *testing.T) { - t.Parallel() - - tests := []struct { - name string - sql string - targetDB string - want string - }{ - { - name: "bare otel_logs qualified to external", - sql: "SELECT Body FROM otel_logs WHERE Timestamp > now()", - targetDB: "external", - want: "SELECT Body FROM external.otel_logs WHERE Timestamp > now()", - }, - { - name: "bare otel_logs qualified to otel for local", - sql: "SELECT Body FROM otel_logs", - targetDB: "otel", - want: "SELECT Body FROM otel.otel_logs", - }, - { - name: "bare otel_traces qualified", - sql: "SELECT * FROM otel_traces", - targetDB: "external", - want: "SELECT * FROM external.otel_traces", - }, - { - name: "already-qualified external left untouched", - sql: "SELECT * FROM external.otel_logs", - targetDB: "external", - want: "SELECT * FROM external.otel_logs", - }, - { - name: "already-qualified internal left untouched", - sql: "SELECT * FROM internal.otel_logs", - targetDB: "external", - want: "SELECT * FROM internal.otel_logs", - }, - { - name: "already-qualified otel left untouched", - sql: "SELECT * FROM otel.otel_logs", - targetDB: "external", - want: "SELECT * FROM otel.otel_logs", - }, - { - name: "lookalike identifier preserved", - sql: "SELECT my_otel_logs FROM external.otel_logs", - targetDB: "external", - want: "SELECT my_otel_logs FROM external.otel_logs", - }, - { - name: "join across both tables qualified", - sql: "SELECT * FROM otel_logs l JOIN otel_traces t ON l.TraceId = t.TraceId", - targetDB: "external", - want: "SELECT * FROM external.otel_logs l JOIN external.otel_traces t ON l.TraceId = t.TraceId", - }, - { - name: "empty target db is a no-op", - sql: "SELECT * FROM otel_logs", - targetDB: "", - want: "SELECT * FROM otel_logs", - }, - { - name: "unrelated query untouched", - sql: "SELECT count() FROM fct_block_head", - targetDB: "external", - want: "SELECT count() FROM fct_block_head", - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - t.Parallel() - - if got := qualifyOtelTablesSQL(tt.sql, tt.targetDB); got != tt.want { - t.Fatalf("qualifyOtelTablesSQL(%q, %q) = %q, want %q", tt.sql, tt.targetDB, got, tt.want) - } - }) - } -} From 4b4184f094add2c0e2a89819580f0f2a4b3c5906 Mon Sep 17 00:00:00 2001 From: Stefan Date: Tue, 9 Jun 2026 15:42:12 +0200 Subject: [PATCH 5/6] fix(cli): keep otel docs qualified and stop double-hinting CH errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two follow-ups exposed by removing server-side otel table auto-qualification in the previous commit: 1. The ethnode help text and loki redirect synopsis still showed bare `otel_logs` against clickhouse-raw (a hosted cluster whose default db is `default`) and claimed it was "auto-resolved". With qualification gone those examples now fail, so qualify them to `external.otel_logs` and drop the stale auto-resolve claim. 2. wrapClickHouseError appended its hint on top of the error returned by decodeAPIError. A ClickHouse unknown-table/database error comes back as HTTP 404, so the generic status hint ("datasource ... not available") fired too — producing two hints, the first misleading. decodeAPIError now returns a structured *apiError (status + message); wrapClickHouseError matches it via errors.As and rebuilds the error with only the table-specific hint, while genuine route 404s (unknown datasource) keep the generic hint. Verified live against the local server. Adds a unit test covering both branches. --- pkg/cli/clickhouse_cmd.go | 34 ++++++++++++----- pkg/cli/clickhouse_cmd_test.go | 69 ++++++++++++++++++++++++++++++++++ pkg/cli/ethnode.go | 2 +- pkg/cli/loki_cmd.go | 4 +- pkg/cli/server_helpers.go | 24 +++++++++--- 5 files changed, 114 insertions(+), 19 deletions(-) create mode 100644 pkg/cli/clickhouse_cmd_test.go diff --git a/pkg/cli/clickhouse_cmd.go b/pkg/cli/clickhouse_cmd.go index a4250770..1f2b080b 100644 --- a/pkg/cli/clickhouse_cmd.go +++ b/pkg/cli/clickhouse_cmd.go @@ -3,6 +3,7 @@ package cli import ( "bytes" "encoding/csv" + "errors" "fmt" "strings" @@ -126,20 +127,33 @@ func printClickHouseJSON(data []byte, raw bool) error { }) } +// clickHouseUnknownTableHint sits next to the datasource (rather than in the +// generic error helper) and replaces the misleading generic 404 hint that a +// ClickHouse unknown-table/database error would otherwise surface. +const clickHouseUnknownTableHint = "ClickHouse could not find that table. List real tables with 'panda schema '. " + + "OTel log tables must be fully qualified: use 'external.otel_logs' for hosted devnets/testnets, " + + "or 'otel.otel_logs' for local Kurtosis devnets." + +// wrapClickHouseError swaps the generic status hint for a table-specific hint +// when ClickHouse reports an unknown table/database. ClickHouse returns these +// as HTTP 404, whose generic hint would wrongly suggest the datasource itself +// is missing. func wrapClickHouseError(err error) error { - if err == nil { - return nil + var apiErr *apiError + if !errors.As(err, &apiErr) || !isUnknownClickHouseTableError(apiErr.Message) { + return err } - lower := strings.ToLower(err.Error()) - if strings.Contains(lower, "unknown table") || strings.Contains(lower, "unknown_table") || - strings.Contains(lower, "unknown database") || strings.Contains(lower, "unknown_database") { - return fmt.Errorf("%w\n\n hint: ClickHouse could not find that table. List real tables with 'panda schema '. "+ - "OTel log tables must be fully qualified: use 'external.otel_logs' for hosted devnets/testnets, "+ - "or 'otel.otel_logs' for local Kurtosis devnets.", err) - } + return fmt.Errorf("HTTP %d: %s\n\n hint: %s", apiErr.Status, apiErr.Message, clickHouseUnknownTableHint) +} + +func isUnknownClickHouseTableError(message string) bool { + lower := strings.ToLower(message) - return err + return strings.Contains(lower, "unknown table") || + strings.Contains(lower, "unknown_table") || + strings.Contains(lower, "unknown database") || + strings.Contains(lower, "unknown_database") } func parseClickHouseTSV(data []byte) ([]string, [][]string, error) { diff --git a/pkg/cli/clickhouse_cmd_test.go b/pkg/cli/clickhouse_cmd_test.go new file mode 100644 index 00000000..8355008d --- /dev/null +++ b/pkg/cli/clickhouse_cmd_test.go @@ -0,0 +1,69 @@ +package cli + +import ( + "errors" + "strings" + "testing" +) + +func TestWrapClickHouseError(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + err error + wantHint bool + wantGeneric bool + }{ + { + name: "unknown table replaces generic 404 hint", + err: &apiError{ + Status: 404, + Message: "Code: 60. DB::Exception: Unknown table expression identifier 'otel_logs'. (UNKNOWN_TABLE)", + }, + wantHint: true, + wantGeneric: false, + }, + { + name: "unknown database gets the clickhouse hint", + err: &apiError{ + Status: 404, + Message: "Code: 81. DB::Exception: Database bogusdb does not exist. (UNKNOWN_DATABASE)", + }, + wantHint: true, + wantGeneric: false, + }, + { + name: "unknown datasource keeps the generic hint", + err: &apiError{ + Status: 404, + Message: `clickhouse datasource "nonexistent" not found`, + }, + wantHint: false, + wantGeneric: true, + }, + { + name: "non-apiError is returned untouched", + err: errors.New("connection refused"), + wantHint: false, + wantGeneric: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + got := wrapClickHouseError(tt.err).Error() + + if hasHint := strings.Contains(got, clickHouseUnknownTableHint); hasHint != tt.wantHint { + t.Errorf("clickhouse hint present = %v, want %v\nerror: %s", hasHint, tt.wantHint, got) + } + + genericHint := serverErrorHint(404, "") + if hasGeneric := strings.Contains(got, genericHint); hasGeneric != tt.wantGeneric { + t.Errorf("generic hint present = %v, want %v\nerror: %s", hasGeneric, tt.wantGeneric, got) + } + }) + } +} diff --git a/pkg/cli/ethnode.go b/pkg/cli/ethnode.go index 94ba8b66..bc63382a 100644 --- a/pkg/cli/ethnode.go +++ b/pkg/cli/ethnode.go @@ -24,7 +24,7 @@ Discovering instance names: real names from a source that knows what nodes exist: • Dora: panda dora ... (the /v1/clients/consensus endpoint lists CL nodes) • OTel logs: panda clickhouse query clickhouse-raw \ - "SELECT DISTINCT ResourceAttributes['host.name'] FROM otel_logs \ + "SELECT DISTINCT ResourceAttributes['host.name'] FROM external.otel_logs \ WHERE ResourceAttributes['network'] = '' \ AND Timestamp >= now() - INTERVAL 1 HOUR" diff --git a/pkg/cli/loki_cmd.go b/pkg/cli/loki_cmd.go index fae32528..665335bb 100644 --- a/pkg/cli/loki_cmd.go +++ b/pkg/cli/loki_cmd.go @@ -17,10 +17,10 @@ const lokiRedirectSynopsis = `Loki is not available here. Container logs now liv • Hosted (multi-VM) devnets/testnets → external.otel_logs (filter ResourceAttributes['network']) • Local Kurtosis devnets → otel.otel_logs (filter EnclaveName) -A bare 'otel_logs' is auto-resolved to the right database for the datasource. Example: +Qualify the table with its database (the example targets a hosted cluster). Example: panda clickhouse query clickhouse-raw "SELECT Timestamp, ResourceAttributes['host.name'] AS host, Body \ - FROM otel_logs \ + FROM external.otel_logs \ WHERE ResourceAttributes['network'] = '' \ AND Timestamp >= now() - INTERVAL 1 HOUR \ AND match(Body, '(?i)error') \ diff --git a/pkg/cli/server_helpers.go b/pkg/cli/server_helpers.go index cb1de2d2..a6c60cf9 100644 --- a/pkg/cli/server_helpers.go +++ b/pkg/cli/server_helpers.go @@ -432,6 +432,23 @@ func getBuildStatus(ctx context.Context, runID int64) (*serverapi.BuildStatusRes return &response, nil } +// apiError is a non-2xx response from the server. It exposes the status and +// message so a command can attach a datasource-specific hint in place of the +// generic status hint — e.g. a ClickHouse unknown-table error arrives as a 404 +// whose generic hint would wrongly point at a missing datasource. +type apiError struct { + Status int + Message string +} + +func (e *apiError) Error() string { + if hint := serverErrorHint(e.Status, e.Message); hint != "" { + return fmt.Sprintf("HTTP %d: %s\n\n hint: %s", e.Status, e.Message, hint) + } + + return fmt.Sprintf("HTTP %d: %s", e.Status, e.Message) +} + func decodeAPIError(status int, data []byte) error { var message string @@ -446,12 +463,7 @@ func decodeAPIError(status int, data []byte) error { message = strings.TrimSpace(string(data)) } - hint := serverErrorHint(status, message) - if hint != "" { - return fmt.Errorf("HTTP %d: %s\n\n hint: %s", status, message, hint) - } - - return fmt.Errorf("HTTP %d: %s", status, message) + return &apiError{Status: status, Message: message} } func serverErrorHint(status int, _ string) string { From fab276fcdfc5b209d57e7db4d8dc26f87baca8f0 Mon Sep 17 00:00:00 2001 From: Stefan Date: Tue, 9 Jun 2026 16:18:51 +0200 Subject: [PATCH 6/6] refactor(cli): make loki redirect replace the generic hint, like CH MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirror the ClickHouse unknown-table fix for the Loki redirect. A disabled Loki module returns HTTP 404, so redirectLokiError was stacking its synopsis on top of the generic "check 'panda datasources'" hint — redundant once we point straight at the replacement datasource. - redirectLokiError now matches the structured *apiError via errors.As and rebuilds the error with only the synopsis. Matching on status 404 / "not enabled" / "not available" is more precise than the old substring scan of the formatted string, and a genuine query failure against a live Loki (e.g. a 400 parse error) is no longer redirected. - Align the synopsis example's severity regex with the canonical runbook pattern (crit|err|error|fatal) and note that severity lives in Body. - Add a unit test covering redirect vs non-redirect branches. --- pkg/cli/loki_cmd.go | 37 ++++++++++++++++-------- pkg/cli/loki_cmd_test.go | 61 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+), 12 deletions(-) create mode 100644 pkg/cli/loki_cmd_test.go diff --git a/pkg/cli/loki_cmd.go b/pkg/cli/loki_cmd.go index 665335bb..10167c7a 100644 --- a/pkg/cli/loki_cmd.go +++ b/pkg/cli/loki_cmd.go @@ -2,7 +2,9 @@ package cli import ( "encoding/json" + "errors" "fmt" + "net/http" "strconv" "strings" "time" @@ -17,30 +19,41 @@ const lokiRedirectSynopsis = `Loki is not available here. Container logs now liv • Hosted (multi-VM) devnets/testnets → external.otel_logs (filter ResourceAttributes['network']) • Local Kurtosis devnets → otel.otel_logs (filter EnclaveName) -Qualify the table with its database (the example targets a hosted cluster). Example: +Qualify the table with its database — the example below targets a hosted cluster. +Severity lives in Body for these Docker logs (SeverityText is usually empty), so +match on Body. Example: panda clickhouse query clickhouse-raw "SELECT Timestamp, ResourceAttributes['host.name'] AS host, Body \ FROM external.otel_logs \ WHERE ResourceAttributes['network'] = '' \ AND Timestamp >= now() - INTERVAL 1 HOUR \ - AND match(Body, '(?i)error') \ + AND match(Body, '(?i)(crit|err|error|fatal)') \ ORDER BY Timestamp DESC LIMIT 100"` -// redirectLokiError appends the ClickHouse redirect synopsis when a Loki -// command failed because the Loki module is unavailable on this server. +// redirectLokiError swaps the generic status hint for the ClickHouse redirect +// synopsis when a Loki command failed because the Loki module is unavailable on +// this server. The module being disabled returns HTTP 404, whose generic hint +// ("check 'panda datasources'") is redundant once we point straight at the +// replacement datasource. func redirectLokiError(err error) error { - if err == nil { - return nil + var apiErr *apiError + if !errors.As(err, &apiErr) || !lokiUnavailable(apiErr) { + return err } - msg := err.Error() - if !strings.Contains(msg, "not enabled") && - !strings.Contains(msg, "not available") && - !strings.Contains(msg, "HTTP 404") { - return err + return fmt.Errorf("HTTP %d: %s\n\n%s", apiErr.Status, apiErr.Message, lokiRedirectSynopsis) +} + +// lokiUnavailable reports whether an API error means the Loki module/datasource +// is not served here (rather than a genuine query failure against a live Loki). +func lokiUnavailable(e *apiError) bool { + if e.Status == http.StatusNotFound { + return true } - return fmt.Errorf("%w\n\n%s", err, lokiRedirectSynopsis) + lower := strings.ToLower(e.Message) + + return strings.Contains(lower, "not enabled") || strings.Contains(lower, "not available") } var ( diff --git a/pkg/cli/loki_cmd_test.go b/pkg/cli/loki_cmd_test.go new file mode 100644 index 00000000..3b009b89 --- /dev/null +++ b/pkg/cli/loki_cmd_test.go @@ -0,0 +1,61 @@ +package cli + +import ( + "errors" + "strings" + "testing" +) + +func TestRedirectLokiError(t *testing.T) { + t.Parallel() + + genericHint := serverErrorHint(404, "") + + tests := []struct { + name string + err error + wantSynopsis bool + wantGeneric bool + }{ + { + name: "module not enabled (404) redirects and drops generic hint", + err: &apiError{Status: 404, Message: `module "loki" is not enabled`}, + wantSynopsis: true, + wantGeneric: false, + }, + { + name: "not available message redirects", + err: &apiError{Status: 503, Message: "loki datasource not available"}, + wantSynopsis: true, + wantGeneric: false, + }, + { + name: "genuine query failure against live loki is not redirected", + err: &apiError{Status: 400, Message: "parse error: unexpected token"}, + wantSynopsis: false, + wantGeneric: false, + }, + { + name: "non-apiError is returned untouched", + err: errors.New("connection refused"), + wantSynopsis: false, + wantGeneric: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + got := redirectLokiError(tt.err).Error() + + if hasSynopsis := strings.Contains(got, lokiRedirectSynopsis); hasSynopsis != tt.wantSynopsis { + t.Errorf("synopsis present = %v, want %v\nerror: %s", hasSynopsis, tt.wantSynopsis, got) + } + + if hasGeneric := strings.Contains(got, genericHint); hasGeneric != tt.wantGeneric { + t.Errorf("generic hint present = %v, want %v\nerror: %s", hasGeneric, tt.wantGeneric, got) + } + }) + } +}