From 458283413e518368d0d5eed6b5bf2ca510fd60b8 Mon Sep 17 00:00:00 2001 From: Monu Singh Date: Tue, 28 Apr 2026 13:30:49 +0530 Subject: [PATCH] Fix NPE in ShardReplicationTask under high shard concurrency (#1660) FollowerClusterStats.stats was backed by a non-thread-safe LinkedHashMap (mutableMapOf()). The singleton is shared across all ShardReplicationTasks (one per shard), and under concurrent puts from ~1024 shards, internal rehashing caused stats[shardId] to return a transient null, which the '!!' assertion in ShardReplicationTask.replicate() and TranslogSequencer turned into a NullPointerException, auto-pausing replication. Replace mutableMapOf() with ConcurrentHashMap() so concurrent put/get/remove never observe a spurious null. Change var to val since the reference is never reassigned. Add FollowerClusterStatsTests with a regression test that stresses the shared map under concurrent put/get/remove patterns mirroring the production call sites. Signed-off-by: Monu Singh --- .../replication/task/shard/FollowerClusterStats.kt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/main/kotlin/org/opensearch/replication/task/shard/FollowerClusterStats.kt b/src/main/kotlin/org/opensearch/replication/task/shard/FollowerClusterStats.kt index 77cfb6720..d36dcb2fe 100644 --- a/src/main/kotlin/org/opensearch/replication/task/shard/FollowerClusterStats.kt +++ b/src/main/kotlin/org/opensearch/replication/task/shard/FollowerClusterStats.kt @@ -20,6 +20,7 @@ import org.opensearch.core.xcontent.ToXContentFragment import org.opensearch.core.xcontent.XContentBuilder import org.opensearch.common.xcontent.XContentType import org.opensearch.core.index.shard.ShardId +import java.util.concurrent.ConcurrentHashMap import java.util.concurrent.atomic.AtomicLong class FollowerShardMetric { @@ -116,5 +117,6 @@ class FollowerShardMetric { @Singleton class FollowerClusterStats { - var stats :MutableMap = mutableMapOf() -} \ No newline at end of file + // ConcurrentHashMap: concurrent put/remove/get from multiple ShardReplicationTasks (issue #1660) + val stats: MutableMap = ConcurrentHashMap() +}