From a905963a3236a09e3e6fab1aeeba0d735983bdf2 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 4 Jun 2026 21:34:55 +0900 Subject: [PATCH 1/2] =?UTF-8?q?fix(scripts):=20M5a=20=E2=80=94=20--host=20?= =?UTF-8?q?127.0.0.1=20for=20--local=20mode?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit E2E run of scripts/run-jepsen-m5-local.sh after PR #924 + #925 merge revealed a DNS-resolution failure during the workload's first setup!: DynamoDB :cognitect.anomalies/not-found: n5: nodename nor servname provided, or not known Root cause: the workload's open! resolves the DynamoDB client hostname via (or (:dynamo-host test) (name node)). When --local is set, :dynamo-host is nil (the cluster is local), so it falls through to (name node) where node is one of default-nodes ["n1" "n2" "n3" "n4" "n5"] — virtual labels, not real hostnames. DNS resolution fails. Fix: thread --host 127.0.0.1 through lein run. cli/common-cli-opts maps it to :host -> prepare-dynamo-opts copies to :dynamo-host -> make-ddb-client uses it as the endpoint hostname. All five nodes' clients now correctly dial the single-process loopback DynamoDB endpoint (DynamoDB port 63801; gRPC PROC_ADDR=127.0.0.1:50051). Verification: post-fix run no longer hits the n5 DNS failure. setup! now proceeds to verify-multi-group-routing! and create-all-tables!. Note: a separate issue surfaces after this fix — workers report ResourceNotFoundException for every txn, even though create-all-tables! reports success and ListRoutes shows the expected two-group catalog. The five parallel client setups may be racing on CreateTable such that the server reports the table as ACTIVE while the table-meta key is not actually durably visible to the subsequent invoke!. That's a separate investigation outside this fix. 
--- scripts/run-jepsen-m5-local.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/scripts/run-jepsen-m5-local.sh b/scripts/run-jepsen-m5-local.sh index 4afd2653..f10b8145 100755 --- a/scripts/run-jepsen-m5-local.sh +++ b/scripts/run-jepsen-m5-local.sh @@ -206,10 +206,19 @@ HOME="$(pwd)/tmp-home" LEIN_HOME="$(pwd)/.lein" \ --local \ --time-limit 30 \ --rate 5 \ + --host 127.0.0.1 \ --dynamo-port 63801 \ --list-routes-bin "$LIST_ROUTES_BIN" \ --grpc-host-port "$PROC_ADDR" \ || EXIT_CODE=$? +# --host 127.0.0.1 — without this the workload's open! resolves the +# DynamoDB client hostname from (name node) where node is one of +# default-nodes ["n1" "n2" "n3" "n4" "n5"]; these are virtual labels, +# not real hostnames, and DNS resolution fails with 'nodename nor +# servname provided'. --host overrides via cli/common-cli-opts' +# --host -> :host -> :dynamo-host -> make-ddb-client wiring. Required +# for the single-process two-group topology this script launches — +# every "node" client talks to the same loopback DynamoDB endpoint. EXIT_CODE=${EXIT_CODE:-0} From 83d44c069b72a2203d46449ca852ae518659494a Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 4 Jun 2026 22:03:42 +0900 Subject: [PATCH 2/2] =?UTF-8?q?fix(scripts):=20M5a=20=E2=80=94=20[,?= =?UTF-8?q?=20T1=5FKEY)=20coverage=20in=20--shardRanges=20(issue=20#930=20?= =?UTF-8?q?partial)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adding default range coverage for routing keys lexicographically smaller than T1_KEY. Without this, any table whose base64-encoded name sorts before 'amVwc2VuX2FwcGVuZF90MQ' (= base64('jepsen_append_t1')) returns 'no route for key' from ShardedCoordinator.dispatchTxn, which createTableWithRetry silently swallows as ACTIVE (issue #930 root cause #1). Topology consequence: the default range goes to group 1. Tables 1-2 share that group; tables 3-4 use group 2. NOTE — partial fix. 
E2E run after this commit STILL shows all workers reporting ResourceNotFoundException. The jepsen_append_t1..4 routing keys are inside the original [T1_KEY, +inf) coverage, so those should not benefit from this change — yet they're still failing. A second bug is present beyond the routing-coverage gap and needs separate investigation. See issue #930 for the next-step plan. --- scripts/run-jepsen-m5-local.sh | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/scripts/run-jepsen-m5-local.sh b/scripts/run-jepsen-m5-local.sh index f10b8145..c47f5835 100755 --- a/scripts/run-jepsen-m5-local.sh +++ b/scripts/run-jepsen-m5-local.sh @@ -98,11 +98,18 @@ for bin in "$ROUTE_KEY_BIN" "$LIST_ROUTES_BIN" "$BINARY"; do done T1_KEY="$("$ROUTE_KEY_BIN" jepsen_append_t1)" T3_KEY="$("$ROUTE_KEY_BIN" jepsen_append_t3)" -# Group 1: [T1_KEY, T3_KEY) — tables 1, 2 -# Group 2: [T3_KEY, +inf) — tables 3, 4 -# Keys outside [T1_KEY, +inf) fall through to the default group; this -# workload only writes table-route keys so that range is unused. -SHARD_RANGES="${T1_KEY}:${T3_KEY}=1,${T3_KEY}:=2" +# Issue #930 fix: --shardRanges must cover every routing key. Without +# a [, T1_KEY) range, any table whose base64-encoded name sorts +# before "amVwc2VuX2FwcGVuZF90MQ" (= base64("jepsen_append_t1")) +# returns "no route for key" from ShardedCoordinator.dispatchTxn, and +# createTableWithRetry silently swallows that as ACTIVE. +# +# Group 1: [, T3_KEY) — default + tables 1, 2 +# Group 2: [T3_KEY, +inf) — tables 3, 4 +# +# Note: assigning the default range to group 1 (not a third group) keeps +# the topology consistent with the 1-process-2-groups launch. +SHARD_RANGES=":${T3_KEY}=1,${T3_KEY}:=2" echo "[shard-ranges] $SHARD_RANGES" # ---- stop any previously managed cluster ----