From e42b50e1e204488ea8c686c49f3b80e46bd69925 Mon Sep 17 00:00:00 2001
From: 3AceShowHand <jinl1037@hotmail.com>
Date: Thu, 25 Jun 2026 22:03:05 +0800
Subject: [PATCH 1/6] Revert "Remove more logs and cherry pick review (#26)"

This reverts commit 93b73f331dd5445624325af34cc441f71e8412a5.
---
 async_producer.go      | 188 ++++++++++++++++++++++++-----------------
 async_producer_test.go | 113 ++++++-------------------
 client.go              |   4 +-
 3 files changed, 139 insertions(+), 166 deletions(-)

diff --git a/async_producer.go b/async_producer.go
index bcd7382e7..c9d5f7d44 100644
--- a/async_producer.go
+++ b/async_producer.go
@@ -817,8 +817,10 @@ func (p *asyncProducer) backoffOrDone(retries int) bool {
 	if backoff <= 0 {
 		return !p.shuttingDown()
 	}
+	timer := time.NewTimer(backoff)
+	defer timer.Stop()
 	select {
-	case <-time.After(backoff):
+	case <-timer.C:
 		return true
 	case <-p.done:
 		return false
@@ -1242,7 +1244,7 @@ func (bp *brokerProducer) tryBuildFlushingBatch() bool {
 }
 
 func (bp *brokerProducer) shutdown() {
-	if !bp.closed.CompareAndSwap(false, true) {
+	if bp.closed.Swap(true) {
 		return
 	}
 	if bp.done != nil {
@@ -1474,6 +1476,10 @@ func (p *asyncProducer) retryBatch(topic string, partition int32, pSet *partitio
 	produceSet := newProduceSet(p)
 	produceSet.addPartitionSet(topic, partition, pSet)
 	bufferMessages, bufferBytes := int64(produceSet.bufferCount), int64(produceSet.bufferBytes)
+	var reservedMessages, reservedBytes int64
+	defer func() {
+		p.retryBufferQuota.release(reservedMessages, reservedBytes)
+	}()
 
 	muted := alreadyMuted
 	failBatch := func(err error) {
@@ -1496,7 +1502,7 @@ func (p *asyncProducer) retryBatch(topic string, partition int32, pSet *partitio
 		failBatch(ErrProducerRetryBufferOverflow)
 		return
 	}
-	defer p.retryBufferQuota.release(bufferMessages, bufferBytes)
+	reservedMessages, reservedBytes = bufferMessages, bufferBytes
 
 	// Honor Producer.Retry.Backoff between retry attempts (#2469). retryBatch
 	// dispatches the produceSet directly to the broker, bypassing partitionProducer.dispatch.
@@ -1526,11 +1532,23 @@ func (p *asyncProducer) retryBatch(topic string, partition int32, pSet *partitio
 	}
 }
 
-func (p *asyncProducer) failMutedSet(set *produceSet, err error) {
-	p.muter.unmute(set)
-	set.eachPartition(func(_ string, _ int32, pSet *partitionSet) {
-		p.returnErrors(pSet.msgs, err)
-	})
+type partitionBatchRetry struct {
+	topic     string
+	partition int32
+	pSet      *partitionSet
+}
+
+func (p *asyncProducer) failMutedBatch(batch partitionBatchRetry, err error) {
+	produceSet := newProduceSet(p)
+	produceSet.addPartitionSet(batch.topic, batch.partition, batch.pSet)
+	p.muter.unmute(produceSet)
+	p.returnErrors(batch.pSet.msgs, err)
+}
+
+func (p *asyncProducer) failMutedBatches(batches []partitionBatchRetry, err error) {
+	for _, batch := range batches {
+		p.failMutedBatch(batch, err)
+	}
 }
 
 func newRetryBufferQuota(conf *Config) messageQuota {
@@ -1559,88 +1577,99 @@ func producerMessageByteSizeVersion(conf *Config) int {
 	return 1
 }
 
-// retryBatchesAfterRefresh retries muted batches from one failed ProduceRequest
-// after a single metadata refresh, preserving that failed-request boundary.
-func (p *asyncProducer) retryBatchesAfterRefresh(batches *produceSet, retryErr error) {
+// retryBatchesAfterRefresh keeps metadata refresh out of brokerProducer.handleError.
+// Connection errors already hold partition mutes; doing the refresh in the
+// response loop can block all progress for that broker while the cluster is
+// unstable. Refresh once for the failed request so multi-partition batches do
+// not amplify controller-fail metadata traffic. Group retry partitions by their
+// refreshed leader broker, but hand them off from this retry worker so large
+// clusters do not create one temporary goroutine per target broker.
+//
+// This is intentionally not implemented as a loop over retryBatch. retryBatch
+// is the single-partition direct retry primitive; this path owns all muted
+// partitions from one failed ProduceRequest and must retry, fail, reserve
+// buffer, back off, and refresh metadata at that failed-request boundary.
+func (p *asyncProducer) retryBatchesAfterRefresh(topics []string, batches []partitionBatchRetry, retryErr error) {
 	if p.shuttingDown() {
-		p.failMutedSet(batches, ErrShuttingDown)
+		p.failMutedBatches(batches, ErrShuttingDown)
 		return
 	}
 
-	var (
-		maxRetryAttempt             int
-		bufferMessages, bufferBytes int64
-	)
-	retryable := newProduceSet(p)
-	batches.eachPartition(func(topic string, partition int32, pSet *partitionSet) {
-		if !pSet.shouldKeepMuted(p.conf.Producer.Retry.Max) {
-			failed := newProduceSet(p)
-			failed.addPartitionSet(topic, partition, pSet)
-			p.failMutedSet(failed, retryErr)
-			return
+	var retryable []partitionBatchRetry
+	maxRetryAttempt := 0
+	for _, batch := range batches {
+		if !batch.pSet.shouldKeepMuted(p.conf.Producer.Retry.Max) {
+			p.failMutedBatch(batch, retryErr)
+			continue
 		}
-		for _, msg := range pSet.msgs {
+		for _, msg := range batch.pSet.msgs {
 			msg.retries++
 			if msg.retries > maxRetryAttempt {
 				maxRetryAttempt = msg.retries
 			}
 		}
-		retryable.addPartitionSet(topic, partition, pSet)
-		bufferMessages += int64(len(pSet.msgs))
-		bufferBytes += int64(pSet.bufferBytes)
-	})
-	if retryable.empty() {
+		retryable = append(retryable, batch)
+	}
+	if len(retryable) == 0 {
 		return
 	}
 
-	// Reserve before waiting so pending direct retries still count against the
-	// producer-level retry buffer budget.
+	// Reserve before metadata refresh so refresh-blocked direct retries still
+	// count against the producer-level retry buffer budget.
+	var bufferMessages, bufferBytes int64
+	for _, batch := range retryable {
+		bufferMessages += int64(len(batch.pSet.msgs))
+		bufferBytes += int64(batch.pSet.bufferBytes)
+	}
 	if !p.retryBufferQuota.tryReserve(bufferMessages, bufferBytes) {
-		p.failMutedSet(retryable, ErrProducerRetryBufferOverflow)
+		p.failMutedBatches(retryable, ErrProducerRetryBufferOverflow)
 		return
 	}
 
-	if !p.backoffOrDone(maxRetryAttempt) {
+	if p.shuttingDown() {
 		p.retryBufferQuota.release(bufferMessages, bufferBytes)
-		p.failMutedSet(retryable, ErrShuttingDown)
+		p.failMutedBatches(retryable, ErrShuttingDown)
 		return
 	}
-	topics := make([]string, 0, len(retryable.msgs))
-	for topic := range retryable.msgs {
-		topics = append(topics, topic)
+	if len(topics) > 0 {
+		if err := p.client.RefreshMetadata(topics...); err != nil {
+			Logger.Printf("Failed refreshing metadata because of %v\n", err)
+		}
 	}
-	if err := p.client.RefreshMetadata(topics...); err != nil {
-		Logger.Printf("Failed refreshing metadata because of %v\n", err)
+	if p.shuttingDown() {
+		p.retryBufferQuota.release(bufferMessages, bufferBytes)
+		p.failMutedBatches(retryable, ErrShuttingDown)
+		return
+	}
+
+	if !p.backoffOrDone(maxRetryAttempt) {
+		p.retryBufferQuota.release(bufferMessages, bufferBytes)
+		p.failMutedBatches(retryable, ErrShuttingDown)
+		return
 	}
 
 	brokerSets := make(map[*Broker]*produceSet)
-	retryable.eachPartition(func(topic string, partition int32, pSet *partitionSet) {
-		leader, err := p.client.Leader(topic, partition)
+	for _, batch := range retryable {
+		if p.shuttingDown() {
+			batchMessages, batchBytes := int64(len(batch.pSet.msgs)), int64(batch.pSet.bufferBytes)
+			p.retryBufferQuota.release(batchMessages, batchBytes)
+			p.failMutedBatch(batch, ErrShuttingDown)
+			continue
+		}
+		leader, err := p.client.Leader(batch.topic, batch.partition)
 		if err != nil {
-			Logger.Printf("Failed retrying batch for %v-%d because of %v while looking up for new leader\n", topic, partition, err)
-			batchMessages, batchBytes := int64(len(pSet.msgs)), int64(pSet.bufferBytes)
+			Logger.Printf("Failed retrying batch for %v-%d because of %v while looking up for new leader\n", batch.topic, batch.partition, err)
+			batchMessages, batchBytes := int64(len(batch.pSet.msgs)), int64(batch.pSet.bufferBytes)
 			p.retryBufferQuota.release(batchMessages, batchBytes)
-			failed := newProduceSet(p)
-			failed.addPartitionSet(topic, partition, pSet)
-			p.failMutedSet(failed, retryErr)
-			return
+			p.failMutedBatch(batch, retryErr)
+			continue
 		}
 		set := brokerSets[leader]
 		if set == nil {
 			set = newProduceSet(p)
 			brokerSets[leader] = set
 		}
-		set.addPartitionSet(topic, partition, pSet)
-	})
-
-	// handoffRetryBatch also checks shutdown, but doing it here avoids creating
-	// or ref-counting brokerProducers just to reject the retry.
-	if p.shuttingDown() {
-		for _, set := range brokerSets {
-			p.retryBufferQuota.release(int64(set.bufferCount), int64(set.bufferBytes))
-			p.failMutedSet(set, ErrShuttingDown)
-		}
-		return
+		set.addPartitionSet(batch.topic, batch.partition, batch.pSet)
 	}
 
 	for leader, set := range brokerSets {
@@ -1649,7 +1678,10 @@ func (p *asyncProducer) retryBatchesAfterRefresh(batches *produceSet, retryErr e
 		p.unrefBrokerProducer(leader, bp)
 		p.retryBufferQuota.release(int64(set.bufferCount), int64(set.bufferBytes))
 		if !accepted {
-			p.failMutedSet(set, ErrShuttingDown)
+			p.muter.unmute(set)
+			set.eachPartition(func(_ string, _ int32, pSet *partitionSet) {
+				p.returnErrors(pSet.msgs, ErrShuttingDown)
+			})
 		}
 	}
 }
@@ -1657,16 +1689,10 @@ func (p *asyncProducer) retryBatchesAfterRefresh(batches *produceSet, retryErr e
 // handoffRetryBatch transfers ownership of a muted retry batch to the broker
 // bridge. On false, the caller still owns the mute and must fail the batch.
 func (p *asyncProducer) handoffRetryBatch(bp *brokerProducer, set *produceSet) bool {
-	// If shutdown is already visible, do not let select randomly choose a ready
-	// send on bp.output. The second select still handles shutdown racing with
-	// the handoff.
-	select {
-	case <-bp.done:
-		return false
-	case <-p.done:
+	if bp.closed.Load() {
 		return false
-	default:
 	}
+
 	select {
 	case bp.output <- set:
 		return true
@@ -1689,9 +1715,10 @@ func (bp *brokerProducer) handleError(sent *produceSet, err error) {
 		bp.parent.abandonBrokerConnection(bp.broker)
 		_ = bp.broker.Close()
 		bp.closing = err
-
-		var retrySet *produceSet
 		keepMuted := make(map[string]map[int32]struct{})
+		var retryTopics []string
+		retryTopicSeen := make(map[string]struct{})
+		var retryBatches []partitionBatchRetry
 		sent.eachPartition(func(topic string, partition int32, pSet *partitionSet) {
 			// Connection failures have no ProduceResponse to feed back through
 			// partitionProducer. Keep the sent batch muted and retry it asynchronously
@@ -1704,18 +1731,23 @@ func (bp *brokerProducer) handleError(sent *produceSet, err error) {
 			if !(bp.parent.conf.Producer.Idempotent || pSet.shouldKeepMuted(bp.parent.conf.Producer.Retry.Max)) {
 				bp.parent.returnErrors(pSet.msgs, err)
 			} else {
-				if retrySet == nil {
-					retrySet = newProduceSet(bp.parent)
-				}
 				if keepMuted[topic] == nil {
 					keepMuted[topic] = make(map[int32]struct{})
 				}
 				keepMuted[topic][partition] = struct{}{}
-				retrySet.addPartitionSet(topic, partition, pSet)
+				if _, ok := retryTopicSeen[topic]; !ok {
+					retryTopicSeen[topic] = struct{}{}
+					retryTopics = append(retryTopics, topic)
+				}
+				retryBatches = append(retryBatches, partitionBatchRetry{
+					topic:     topic,
+					partition: partition,
+					pSet:      pSet,
+				})
 			}
 		})
-		if retrySet != nil {
-			go bp.parent.retryBatchesAfterRefresh(retrySet, err)
+		if len(retryBatches) > 0 {
+			go bp.parent.retryBatchesAfterRefresh(retryTopics, retryBatches, err)
 		}
 		bp.accumulatingBatch.eachPartition(func(topic string, partition int32, pSet *partitionSet) {
 			bp.parent.retryMessages(pSet.msgs, err)
@@ -1959,7 +1991,7 @@ func (q *messageQuota) tryReserve(messages, bytes int64) bool {
 
 	nextMessages := q.messages + messages
 	nextBytes := q.bytes + bytes
-	if q.overflows(nextMessages, nextBytes) {
+	if q.wouldOverflow(nextMessages, nextBytes) {
 		return false
 	}
 	q.messages = nextMessages
@@ -1977,7 +2009,7 @@ func (q *messageQuota) addAndCheckOverflow(messages, bytes int64) bool {
 
 	q.messages += messages
 	q.bytes += bytes
-	return q.overflows(q.messages, q.bytes)
+	return q.wouldOverflow(q.messages, q.bytes)
 }
 
 func (q *messageQuota) release(messages, bytes int64) {
@@ -1996,7 +2028,7 @@ func (q *messageQuota) shouldTrack(messages, bytes int64) bool {
 	return (messages != 0 || bytes != 0) && (q.maxMessages > 0 || q.maxBytes > 0)
 }
 
-func (q *messageQuota) overflows(messages, bytes int64) bool {
+func (q *messageQuota) wouldOverflow(messages, bytes int64) bool {
 	return (q.maxMessages > 0 && messages >= q.maxMessages) ||
 		(q.maxBytes > 0 && bytes >= q.maxBytes)
 }
diff --git a/async_producer_test.go b/async_producer_test.go
index bbc68976d..b16e0bf61 100644
--- a/async_producer_test.go
+++ b/async_producer_test.go
@@ -1571,7 +1571,7 @@ func TestBrokerProducerShutdown(t *testing.T) {
 
 // TestBrokerProducerWaitForSpaceEmptyBufferRollover ensures forced rollovers with an empty buffer
 // do not deadlock waiting for responses when no partitions are muted.
-func TestBrokerProducerWaitForSpaceEmptyRollover(t *testing.T) {
+func TestBrokerProducerWaitForSpaceEmptyBufferRollover(t *testing.T) {
 	config := NewTestConfig()
 	parent := &asyncProducer{
 		conf:   config,
@@ -1707,9 +1707,9 @@ func holdFirstRetryAfterReserve(parent *asyncProducer, retryBP *brokerProducer,
 	return firstRetryReserved, release
 }
 
-// TestBrokerProducerWaitForSpaceExternalUnmute ensures waitForSpace does not
+// TestBrokerProducerWaitForSpaceRespectsExternalUnmute ensures waitForSpace does not
 // deadlock when partitions are muted by another producer and are unmuted elsewhere.
-func TestBrokerProducerWaitForSpaceExternalUnmute(t *testing.T) {
+func TestBrokerProducerWaitForSpaceRespectsExternalUnmute(t *testing.T) {
 	config := NewTestConfig()
 	txnMgr := &transactionManager{
 		producerID:      0,
@@ -1796,7 +1796,7 @@ func TestBrokerProducerFlushSkipsMutedPartitions(t *testing.T) {
 	}
 }
 
-func TestBrokerProducerRunExternalUnmute(t *testing.T) {
+func TestBrokerProducerRunFlushesAfterExternalUnmute(t *testing.T) {
 	config := NewTestConfig()
 	config.Producer.Flush.Messages = 1
 	parent := &asyncProducer{
@@ -1843,9 +1843,9 @@ func TestBrokerProducerRunExternalUnmute(t *testing.T) {
 	assertDoneWithin(t, done, 2*time.Second)
 }
 
-// TestBrokerProducerWaitForSpaceAllMuted verifies that waitForSpace unblocks
+// TestBrokerProducerWaitForSpaceAllPartitionsMuted verifies that waitForSpace unblocks
 // when all partitions in the accumulating batch are externally muted and later unmuted.
-func TestBrokerProducerWaitForSpaceAllMuted(t *testing.T) {
+func TestBrokerProducerWaitForSpaceAllPartitionsMuted(t *testing.T) {
 	config := NewTestConfig()
 	parent := &asyncProducer{
 		conf:   config,
@@ -1880,9 +1880,9 @@ func TestBrokerProducerWaitForSpaceAllMuted(t *testing.T) {
 	}
 }
 
-// TestPartitionMuterCloseWakesWait verifies that closing the muter wakes
+// TestPartitionMuterCloseWakesWaitUntilMuted verifies that closing the muter wakes
 // goroutines blocked in waitUntilMuted.
-func TestPartitionMuterCloseWakesWait(t *testing.T) {
+func TestPartitionMuterCloseWakesWaitUntilMuted(t *testing.T) {
 	config := NewTestConfig()
 	parent := &asyncProducer{
 		conf:   config,
@@ -2031,7 +2031,7 @@ func TestRetryBatchRespectsPartitionMuter(t *testing.T) {
 	}
 }
 
-func TestHandleErrorRetryKeepsMute(t *testing.T) {
+func TestHandleErrorNonIdempotentRetryKeepsPartitionMuted(t *testing.T) {
 	config := NewTestConfig()
 	config.Producer.Idempotent = false
 	config.Producer.Retry.Max = 1
@@ -2092,7 +2092,7 @@ func TestHandleErrorRetryKeepsMute(t *testing.T) {
 	}
 }
 
-func TestHandleErrorRetryFailsBatch(t *testing.T) {
+func TestHandleErrorNonIdempotentRetryFailsWholeBatch(t *testing.T) {
 	config := NewTestConfig()
 	config.Producer.Idempotent = false
 	config.Producer.Retry.Max = 1
@@ -2141,7 +2141,7 @@ func TestHandleErrorRetryFailsBatch(t *testing.T) {
 	parent.muter.unmute(contender)
 }
 
-func TestHandleErrorRetryAsyncRefresh(t *testing.T) {
+func TestHandleErrorNonIdempotentRetryDoesNotWaitForMetadataRefresh(t *testing.T) {
 	config := NewTestConfig()
 	config.Producer.Idempotent = false
 	config.Producer.Retry.Max = 1
@@ -2219,7 +2219,7 @@ func TestHandleErrorRetryAsyncRefresh(t *testing.T) {
 	require.Equal(t, retryPartitionSet, retrySet.msgs["topic"][0])
 }
 
-func TestHandleErrorRetryRefreshOnce(t *testing.T) {
+func TestHandleErrorNonIdempotentRetryRefreshesMetadataOncePerFailedRequest(t *testing.T) {
 	config := NewTestConfig()
 	config.Producer.Idempotent = false
 	config.Producer.Retry.Max = 1
@@ -2288,7 +2288,7 @@ func TestHandleErrorRetryRefreshOnce(t *testing.T) {
 	require.Same(t, retryPartitionSet1, retriedPartitions[int32(1)])
 }
 
-func TestHandleErrorRetryGroupsByLeader(t *testing.T) {
+func TestHandleErrorNonIdempotentRetryGroupsBatchesByLeader(t *testing.T) {
 	config := NewTestConfig()
 	config.Producer.Idempotent = false
 	config.Producer.Retry.Max = 1
@@ -2385,7 +2385,7 @@ func TestHandleErrorRetryGroupsByLeader(t *testing.T) {
 	assertNotDone(t, outputB, 50*time.Millisecond)
 }
 
-func TestHandleSuccessRetryUsesPartitionProducer(t *testing.T) {
+func TestHandleSuccessNonIdempotentRetryUsesPartitionProducer(t *testing.T) {
 	config := NewTestConfig()
 	config.Producer.Idempotent = false
 	config.Producer.Retry.Max = 1
@@ -2429,7 +2429,7 @@ func TestHandleSuccessRetryUsesPartitionProducer(t *testing.T) {
 	parent.muter.unmute(contender)
 }
 
-func TestRetryBatchShutdownReleasesMute(t *testing.T) {
+func TestRetryBatchReleasesMuteWhenHandoffAbortedByShutdown(t *testing.T) {
 	config := NewTestConfig()
 	config.Producer.Idempotent = false
 	config.Producer.Retry.Max = 1
@@ -2542,9 +2542,11 @@ func TestRetryUsesSharedBufferBudget(t *testing.T) {
 
 			retry := func(partition int32, pSet *partitionSet) {
 				if tt.afterRefresh {
-					retrySet := newProduceSet(parent)
-					retrySet.addPartitionSet("topic", partition, pSet)
-					parent.retryBatchesAfterRefresh(retrySet, ErrOutOfBrokers)
+					parent.retryBatchesAfterRefresh([]string{"topic"}, []partitionBatchRetry{{
+						topic:     "topic",
+						partition: partition,
+						pSet:      pSet,
+					}}, ErrOutOfBrokers)
 					return
 				}
 				parent.retryBatch("topic", partition, pSet, ErrNotEnoughReplicas, true)
@@ -2598,7 +2600,7 @@ func TestRetryUsesSharedBufferBudget(t *testing.T) {
 	}
 }
 
-func TestRetryBatchesAfterRefreshBrokerDone(t *testing.T) {
+func TestRetryBatchesAfterRefreshReleasesMuteWhenBrokerProducerDone(t *testing.T) {
 	config := NewTestConfig()
 	config.Producer.Idempotent = false
 	config.Producer.Retry.Max = 1
@@ -2635,9 +2637,11 @@ func TestRetryBatchesAfterRefreshBrokerDone(t *testing.T) {
 
 	done := make(chan struct{})
 	go func() {
-		retrySet := newProduceSet(parent)
-		retrySet.addPartitionSet("topic", 0, retryPartitionSet)
-		parent.retryBatchesAfterRefresh(retrySet, ErrOutOfBrokers)
+		parent.retryBatchesAfterRefresh([]string{"topic"}, []partitionBatchRetry{{
+			topic:     "topic",
+			partition: 0,
+			pSet:      retryPartitionSet,
+		}}, ErrOutOfBrokers)
 		close(done)
 	}()
 
@@ -2654,70 +2658,7 @@ func TestRetryBatchesAfterRefreshBrokerDone(t *testing.T) {
 	parent.muter.unmute(contender)
 }
 
-func TestRetryBatchesAfterRefreshShutdown(t *testing.T) {
-	config := NewTestConfig()
-	config.Producer.Idempotent = false
-	config.Producer.Retry.Max = 1
-	config.Producer.Retry.Backoff = 0
-	config.Producer.Return.Errors = true
-
-	parent := &asyncProducer{
-		conf:       config,
-		muter:      newPartitionMuter(),
-		brokers:    make(map[*Broker]*brokerProducer),
-		brokerRefs: make(map[*brokerProducer]int),
-		errors:     make(chan *ProducerError, 2),
-		done:       make(chan struct{}),
-		txnmgr:     &transactionManager{},
-	}
-	leader := &Broker{id: 1}
-	output := make(chan *produceSet, 1)
-	parent.brokers[leader] = &brokerProducer{
-		parent: parent,
-		broker: leader,
-		output: output,
-		input:  make(chan *ProducerMessage),
-		done:   make(chan struct{}),
-	}
-	shutdownStarted := false
-	parent.client = &stubLeaderClient{
-		cfg: config,
-		leaderFunc: func(string, int32) (*Broker, error) {
-			if !shutdownStarted {
-				close(parent.done)
-				shutdownStarted = true
-			}
-			return leader, nil
-		},
-	}
-
-	sent := newProduceSet(parent)
-	safeAddMessage(t, sent, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("retry-0")})
-	safeAddMessage(t, sent, &ProducerMessage{Topic: "topic", Partition: 1, Value: StringEncoder("retry-1")})
-	if !parent.muter.tryMute(sent) {
-		t.Fatal("expected sent batch to mute partitions")
-	}
-	parent.inFlight.Add(2)
-
-	parent.retryBatchesAfterRefresh(sent, ErrOutOfBrokers)
-
-	firstErr := assertDoneWithin(t, parent.errors, 2*time.Second)
-	secondErr := assertDoneWithin(t, parent.errors, 2*time.Second)
-	require.Equal(t, ErrShuttingDown, firstErr.Err)
-	require.Equal(t, ErrShuttingDown, secondErr.Err)
-	require.ElementsMatch(t, []int32{0, 1}, []int32{firstErr.Msg.Partition, secondErr.Msg.Partition})
-	assertNotDone(t, output, 50*time.Millisecond)
-
-	contender := newProduceSet(parent)
-	safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("next-0")})
-	safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 1, Value: StringEncoder("next-1")})
-	if !parent.muter.tryMute(contender) {
-		t.Fatal("expected partition mutes to be released after shutdown")
-	}
-	parent.muter.unmute(contender)
-}
-
-func TestHandleErrorRetryLeaderError(t *testing.T) {
+func TestHandleErrorNonIdempotentRetryReleasesMuteOnLeaderError(t *testing.T) {
 	config := NewTestConfig()
 	config.Producer.Idempotent = false
 	config.Producer.Retry.Max = 1
diff --git a/client.go b/client.go
index 6dbf0d009..71b57b15c 100644
--- a/client.go
+++ b/client.go
@@ -732,7 +732,7 @@ func (client *client) checkSeedBrokersHealth(brokers []*Broker) {
 
 	for _, broker := range brokers {
 		if err := broker.getSockError(); err != nil {
-			DebugLogger.Printf("client/seedbrokers close seed broker #%d at %s due to socket error: %v", broker.ID(), broker.Addr(), err)
+			Logger.Printf("client/seedbrokers close seed broker #%d at %s due to socket error: %v", broker.ID(), broker.Addr(), err)
 			safeAsyncClose(broker)
 		}
 	}
@@ -741,7 +741,7 @@ func (client *client) checkSeedBrokersHealth(brokers []*Broker) {
 func (client *client) checkBrokersHealth() {
 	for id, broker := range client.brokers {
 		if err := broker.getSockError(); err != nil {
-			DebugLogger.Printf("client/brokers close broker #%d at %s due to socket error: %v", broker.ID(), broker.Addr(), err)
+			Logger.Printf("client/brokers close broker #%d at %s due to socket error: %v", broker.ID(), broker.Addr(), err)
 			safeAsyncClose(broker)
 			delete(client.brokers, id)
 		}

From 07e58ada53b106c9307f78885fd7190daef49a14 Mon Sep 17 00:00:00 2001
From: 3AceShowHand <jinl1037@hotmail.com>
Date: Thu, 25 Jun 2026 22:03:05 +0800
Subject: [PATCH 2/6] Revert "refactor async producer (#25)"

This reverts commit 8162613c39d5b4de7b099edbf17332836e612a07.
---
 async_producer.go      | 368 ++++++++++++++-----------
 async_producer_test.go | 613 ++++++++++++++++++++++++-----------------
 produce_set_test.go    |  37 +--
 3 files changed, 582 insertions(+), 436 deletions(-)

diff --git a/async_producer.go b/async_producer.go
index c9d5f7d44..14c813ace 100644
--- a/async_producer.go
+++ b/async_producer.go
@@ -93,16 +93,17 @@ type asyncProducer struct {
 	errors                    chan *ProducerError
 	input, successes, retries chan *ProducerMessage
 	inFlight                  sync.WaitGroup
-
-	// done is closed before waiting on in-flight messages so retryBatch
+	// shutdownCh is closed before waiting on in-flight messages so retryBatch
 	// goroutines can stop waiting on broker handoff and release partition mutes.
-	done   chan struct{}
-	closed atomic.Bool
+	shutdownCh       chan struct{}
+	shutdownChClosed atomic.Bool
 
 	brokers    map[*Broker]*brokerProducer
 	brokerRefs map[*brokerProducer]int
 	brokerLock sync.Mutex
 
+	retryBuffer retryBufferQuota
+
 	txnmgr *transactionManager
 	txLock sync.Mutex
 
@@ -110,13 +111,17 @@ type asyncProducer struct {
 	// mirroring Kafka's RecordAccumulator.
 	muter *partitionMuter
 
-	// retryBufferQuota tracks producer-level retry buffer occupancy when
-	// Producer.Retry.MaxBufferLength or MaxBufferBytes is bounded.
-	retryBufferQuota messageQuota
-
 	metricsRegistry metrics.Registry
 }
 
+type retryBufferQuota struct {
+	// messages and bytes track producer-level retry buffer occupancy when
+	// Producer.Retry.MaxBufferLength or MaxBufferBytes is bounded.
+	messages int64
+	bytes    int64
+	mu       sync.Mutex
+}
+
 type partitionMuter struct {
 	mu             sync.Mutex
 	cond           *sync.Cond
@@ -307,19 +312,18 @@ func newAsyncProducer(client Client) (AsyncProducer, error) {
 	}
 
 	p := &asyncProducer{
-		client:           client,
-		conf:             client.Config(),
-		errors:           make(chan *ProducerError),
-		input:            make(chan *ProducerMessage),
-		successes:        make(chan *ProducerMessage),
-		retries:          make(chan *ProducerMessage),
-		done:             make(chan struct{}),
-		brokers:          make(map[*Broker]*brokerProducer),
-		brokerRefs:       make(map[*brokerProducer]int),
-		txnmgr:           txnmgr,
-		muter:            newPartitionMuter(),
-		retryBufferQuota: newRetryBufferQuota(client.Config()),
-		metricsRegistry:  newCleanupRegistry(client.Config().MetricRegistry),
+		client:          client,
+		conf:            client.Config(),
+		errors:          make(chan *ProducerError),
+		input:           make(chan *ProducerMessage),
+		successes:       make(chan *ProducerMessage),
+		retries:         make(chan *ProducerMessage),
+		shutdownCh:      make(chan struct{}),
+		brokers:         make(map[*Broker]*brokerProducer),
+		brokerRefs:      make(map[*brokerProducer]int),
+		txnmgr:          txnmgr,
+		muter:           newPartitionMuter(),
+		metricsRegistry: newCleanupRegistry(client.Config().MetricRegistry),
 	}
 
 	// launch our singleton dispatchers
@@ -793,19 +797,33 @@ func (p *asyncProducer) newPartitionProducer(topic string, partition int32) chan
 }
 
 func (pp *partitionProducer) backoff(retries int) {
-	time.Sleep(pp.parent.calcBackoff(retries))
+	pp.parent.backoff(retries)
 }
 
-func (p *asyncProducer) calcBackoff(retries int) time.Duration {
-	if backoffFunc := p.conf.Producer.Retry.BackoffFunc; backoffFunc != nil {
-		return backoffFunc(retries, p.conf.Producer.Retry.Max)
+func (p *asyncProducer) backoff(retries int) {
+	backoff := p.retryBackoff(retries)
+	if backoff > 0 {
+		time.Sleep(backoff)
 	}
-	return p.conf.Producer.Retry.Backoff
+}
+
+func (p *asyncProducer) retryBackoff(retries int) time.Duration {
+	var backoff time.Duration
+	if p.conf.Producer.Retry.BackoffFunc != nil {
+		maxRetries := p.conf.Producer.Retry.Max
+		backoff = p.conf.Producer.Retry.BackoffFunc(retries, maxRetries)
+	} else {
+		backoff = p.conf.Producer.Retry.Backoff
+	}
+	return backoff
 }
 
 func (p *asyncProducer) shuttingDown() bool {
+	if p.shutdownCh == nil {
+		return false
+	}
 	select {
-	case <-p.done:
+	case <-p.shutdownCh:
 		return true
 	default:
 		return false
@@ -813,16 +831,20 @@ func (p *asyncProducer) shuttingDown() bool {
 }
 
 func (p *asyncProducer) backoffOrDone(retries int) bool {
-	backoff := p.calcBackoff(retries)
+	backoff := p.retryBackoff(retries)
 	if backoff <= 0 {
 		return !p.shuttingDown()
 	}
 	timer := time.NewTimer(backoff)
 	defer timer.Stop()
+	if p.shutdownCh == nil {
+		<-timer.C
+		return true
+	}
 	select {
 	case <-timer.C:
 		return true
-	case <-p.done:
+	case <-p.shutdownCh:
 		return false
 	}
 }
@@ -1108,8 +1130,7 @@ type brokerProducer struct {
 	// output while the batch still owns its partition mute; if the brokerProducer
 	// is shutting down, that send may never be received and the mute would block
 	// later same-partition messages and producer shutdown.
-	done   chan struct{}
-	closed atomic.Bool
+	done chan struct{}
 
 	accumulatingBatch *produceSet
 	flushingBatch     *produceSet // batch that has been muted and is ready to send
@@ -1233,20 +1254,18 @@ func (bp *brokerProducer) tryBuildFlushingBatch() bool {
 	partial := bp.accumulatingBatch.takePartitions(func(topic string, partition int32) bool {
 		return bp.parent.muter.tryMutePartition(topic, partition)
 	})
-	if partial == nil {
-		return false
-	}
-	bp.flushingBatch = partial
-	if bp.accumulatingBatch.empty() {
-		bp.rollOver()
+	if partial != nil {
+		bp.flushingBatch = partial
+		if bp.accumulatingBatch.empty() {
+			bp.rollOver()
+		}
+		return true
 	}
-	return true
+
+	return false
 }
 
 func (bp *brokerProducer) shutdown() {
-	if bp.closed.Swap(true) {
-		return
-	}
 	if bp.done != nil {
 		close(bp.done)
 	}
@@ -1476,20 +1495,25 @@ func (p *asyncProducer) retryBatch(topic string, partition int32, pSet *partitio
 	produceSet := newProduceSet(p)
 	produceSet.addPartitionSet(topic, partition, pSet)
 	bufferMessages, bufferBytes := int64(produceSet.bufferCount), int64(produceSet.bufferBytes)
-	var reservedMessages, reservedBytes int64
-	defer func() {
-		p.retryBufferQuota.release(reservedMessages, reservedBytes)
-	}()
-
+	bufferReserved := false
+	releaseBuffer := func() {
+		if bufferReserved {
+			p.releaseBuffer(bufferMessages, bufferBytes)
+			bufferReserved = false
+		}
+	}
 	muted := alreadyMuted
 	failBatch := func(err error) {
+		releaseBuffer()
 		// Release the partition before reporting errors so a blocked Errors
 		// consumer cannot keep later same-partition messages muted.
 		if muted {
 			p.muter.unmute(produceSet)
 			muted = false
 		}
-		p.returnErrors(pSet.msgs, err)
+		for _, msg := range pSet.msgs {
+			p.returnError(msg, err)
+		}
 	}
 	for _, msg := range pSet.msgs {
 		if msg.retries >= p.conf.Producer.Retry.Max {
@@ -1498,11 +1522,11 @@ func (p *asyncProducer) retryBatch(topic string, partition int32, pSet *partitio
 		}
 		msg.retries++
 	}
-	if !p.retryBufferQuota.tryReserve(bufferMessages, bufferBytes) {
+	if !p.reserveBuffer(bufferMessages, bufferBytes) {
 		failBatch(ErrProducerRetryBufferOverflow)
 		return
 	}
-	reservedMessages, reservedBytes = bufferMessages, bufferBytes
+	bufferReserved = p.retryBufferBounded() && (bufferMessages != 0 || bufferBytes != 0)
 
 	// Honor Producer.Retry.Backoff between retry attempts (#2469). retryBatch
 	// dispatches the produceSet directly to the broker, bypassing partitionProducer.dispatch.
@@ -1527,9 +1551,11 @@ func (p *asyncProducer) retryBatch(topic string, partition int32, pSet *partitio
 	}
 	bp := p.getBrokerProducer(leader)
 	defer p.unrefBrokerProducer(leader, bp)
-	if !p.handoffRetryBatch(bp, produceSet) {
+	if !p.sendRetryBatch(bp, produceSet) {
 		failBatch(ErrShuttingDown)
+		return
 	}
+	releaseBuffer()
 }
 
 type partitionBatchRetry struct {
@@ -1542,7 +1568,9 @@ func (p *asyncProducer) failMutedBatch(batch partitionBatchRetry, err error) {
 	produceSet := newProduceSet(p)
 	produceSet.addPartitionSet(batch.topic, batch.partition, batch.pSet)
 	p.muter.unmute(produceSet)
-	p.returnErrors(batch.pSet.msgs, err)
+	for _, msg := range batch.pSet.msgs {
+		p.returnError(msg, err)
+	}
 }
 
 func (p *asyncProducer) failMutedBatches(batches []partitionBatchRetry, err error) {
@@ -1551,32 +1579,85 @@ func (p *asyncProducer) failMutedBatches(batches []partitionBatchRetry, err erro
 	}
 }
 
-func newRetryBufferQuota(conf *Config) messageQuota {
-	maxBufferLength, maxBufferBytes := retryBufferLimits(conf)
-	return newMessageQuota(maxBufferLength, maxBufferBytes)
-}
-
-func retryBufferLimits(conf *Config) (int64, int64) {
-	maxBufferLength := conf.Producer.Retry.MaxBufferLength
-	if maxBufferLength > 0 {
-		maxBufferLength = max(maxBufferLength, minFunctionalRetryBufferLength)
+func (p *asyncProducer) retryBufferLimits() (int64, int64) {
+	maxBufferLength := p.conf.Producer.Retry.MaxBufferLength
+	if 0 < maxBufferLength && maxBufferLength < minFunctionalRetryBufferLength {
+		maxBufferLength = minFunctionalRetryBufferLength
 	}
 
-	maxBufferBytes := conf.Producer.Retry.MaxBufferBytes
-	if maxBufferBytes > 0 {
-		maxBufferBytes = max(maxBufferBytes, minFunctionalRetryBufferBytes)
+	maxBufferBytes := p.conf.Producer.Retry.MaxBufferBytes
+	if 0 < maxBufferBytes && maxBufferBytes < minFunctionalRetryBufferBytes {
+		maxBufferBytes = minFunctionalRetryBufferBytes
 	}
 
 	return int64(maxBufferLength), maxBufferBytes
 }
 
-func producerMessageByteSizeVersion(conf *Config) int {
-	if conf.Version.IsAtLeast(V0_11_0_0) {
+func (p *asyncProducer) retryBufferBounded() bool {
+	maxBufferLength, maxBufferBytes := p.retryBufferLimits()
+	return maxBufferLength > 0 || maxBufferBytes > 0
+}
+
+func (p *asyncProducer) producerMessageByteSizeVersion() int {
+	if p.conf.Version.IsAtLeast(V0_11_0_0) {
 		return 2
 	}
 	return 1
 }
 
+func (p *asyncProducer) bufferWouldOverflow(messages, bytes int64) bool {
+	maxBufferLength, maxBufferBytes := p.retryBufferLimits()
+	return (maxBufferLength > 0 && messages >= maxBufferLength) ||
+		(maxBufferBytes > 0 && bytes >= maxBufferBytes)
+}
+
+func (p *asyncProducer) reserveBuffer(messages, bytes int64) bool {
+	if messages == 0 && bytes == 0 {
+		return true
+	}
+	if !p.retryBufferBounded() {
+		return true
+	}
+	p.retryBuffer.mu.Lock()
+	defer p.retryBuffer.mu.Unlock()
+
+	if p.bufferWouldOverflow(p.retryBuffer.messages+messages, p.retryBuffer.bytes+bytes) {
+		return false
+	}
+	p.retryBuffer.messages += messages
+	p.retryBuffer.bytes += bytes
+	return true
+}
+
+func (p *asyncProducer) addToBuffer(messages, bytes int64) bool {
+	if messages == 0 && bytes == 0 {
+		return false
+	}
+	if !p.retryBufferBounded() {
+		return false
+	}
+	p.retryBuffer.mu.Lock()
+	defer p.retryBuffer.mu.Unlock()
+
+	p.retryBuffer.messages += messages
+	p.retryBuffer.bytes += bytes
+	return p.bufferWouldOverflow(p.retryBuffer.messages, p.retryBuffer.bytes)
+}
+
+func (p *asyncProducer) releaseBuffer(messages, bytes int64) {
+	if messages == 0 && bytes == 0 {
+		return
+	}
+	if !p.retryBufferBounded() {
+		return
+	}
+	p.retryBuffer.mu.Lock()
+	defer p.retryBuffer.mu.Unlock()
+
+	p.retryBuffer.messages -= messages
+	p.retryBuffer.bytes -= bytes
+}
+
 // retryBatchesAfterRefresh keeps metadata refresh out of brokerProducer.handleError.
 // Connection errors already hold partition mutes; doing the refresh in the
 // response loop can block all progress for that broker while the cluster is
@@ -1621,13 +1702,13 @@ func (p *asyncProducer) retryBatchesAfterRefresh(topics []string, batches []part
 		bufferMessages += int64(len(batch.pSet.msgs))
 		bufferBytes += int64(batch.pSet.bufferBytes)
 	}
-	if !p.retryBufferQuota.tryReserve(bufferMessages, bufferBytes) {
+	if !p.reserveBuffer(bufferMessages, bufferBytes) {
 		p.failMutedBatches(retryable, ErrProducerRetryBufferOverflow)
 		return
 	}
 
 	if p.shuttingDown() {
-		p.retryBufferQuota.release(bufferMessages, bufferBytes)
+		p.releaseBuffer(bufferMessages, bufferBytes)
 		p.failMutedBatches(retryable, ErrShuttingDown)
 		return
 	}
@@ -1637,13 +1718,13 @@ func (p *asyncProducer) retryBatchesAfterRefresh(topics []string, batches []part
 		}
 	}
 	if p.shuttingDown() {
-		p.retryBufferQuota.release(bufferMessages, bufferBytes)
+		p.releaseBuffer(bufferMessages, bufferBytes)
 		p.failMutedBatches(retryable, ErrShuttingDown)
 		return
 	}
 
 	if !p.backoffOrDone(maxRetryAttempt) {
-		p.retryBufferQuota.release(bufferMessages, bufferBytes)
+		p.releaseBuffer(bufferMessages, bufferBytes)
 		p.failMutedBatches(retryable, ErrShuttingDown)
 		return
 	}
@@ -1652,15 +1733,15 @@ func (p *asyncProducer) retryBatchesAfterRefresh(topics []string, batches []part
 	for _, batch := range retryable {
 		if p.shuttingDown() {
 			batchMessages, batchBytes := int64(len(batch.pSet.msgs)), int64(batch.pSet.bufferBytes)
-			p.retryBufferQuota.release(batchMessages, batchBytes)
+			p.releaseBuffer(batchMessages, batchBytes)
 			p.failMutedBatch(batch, ErrShuttingDown)
 			continue
 		}
-		leader, err := p.client.Leader(batch.topic, batch.partition)
-		if err != nil {
-			Logger.Printf("Failed retrying batch for %v-%d because of %v while looking up for new leader\n", batch.topic, batch.partition, err)
+		leader, leaderErr := p.client.Leader(batch.topic, batch.partition)
+		if leaderErr != nil {
+			Logger.Printf("Failed retrying batch for %v-%d because of %v while looking up for new leader\n", batch.topic, batch.partition, leaderErr)
 			batchMessages, batchBytes := int64(len(batch.pSet.msgs)), int64(batch.pSet.bufferBytes)
-			p.retryBufferQuota.release(batchMessages, batchBytes)
+			p.releaseBuffer(batchMessages, batchBytes)
 			p.failMutedBatch(batch, retryErr)
 			continue
 		}
@@ -1673,32 +1754,55 @@ func (p *asyncProducer) retryBatchesAfterRefresh(topics []string, batches []part
 	}
 
 	for leader, set := range brokerSets {
-		bp := p.getBrokerProducer(leader)
-		accepted := p.handoffRetryBatch(bp, set)
-		p.unrefBrokerProducer(leader, bp)
-		p.retryBufferQuota.release(int64(set.bufferCount), int64(set.bufferBytes))
-		if !accepted {
-			p.muter.unmute(set)
-			set.eachPartition(func(_ string, _ int32, pSet *partitionSet) {
-				p.returnErrors(pSet.msgs, ErrShuttingDown)
-			})
-		}
+		p.handoffRetrySet(leader, set)
 	}
 }
 
-// handoffRetryBatch transfers ownership of a muted retry batch to the broker
-// bridge. On false, the caller still owns the mute and must fail the batch.
-func (p *asyncProducer) handoffRetryBatch(bp *brokerProducer, set *produceSet) bool {
-	if bp.closed.Load() {
-		return false
+func (p *asyncProducer) handoffRetrySet(leader *Broker, set *produceSet) {
+	bp := p.getBrokerProducer(leader)
+	accepted := p.sendRetryBatch(bp, set)
+	p.unrefBrokerProducer(leader, bp)
+	setMessages, setBytes := int64(set.bufferCount), int64(set.bufferBytes)
+	p.releaseBuffer(setMessages, setBytes)
+	if !accepted {
+		p.muter.unmute(set)
+		set.eachPartition(func(_ string, _ int32, pSet *partitionSet) {
+			for _, msg := range pSet.msgs {
+				p.returnError(msg, ErrShuttingDown)
+			}
+		})
 	}
+}
 
+// sendRetryBatch transfers ownership of a muted retry batch to the broker
+// bridge. A false result means no owner accepted the batch, so the caller must
+// release the mute and fail the messages instead of leaving a retry goroutine
+// blocked while it still owns the partition mute.
+func (p *asyncProducer) sendRetryBatch(bp *brokerProducer, set *produceSet) bool {
+	var done <-chan struct{}
+	if bp.done != nil {
+		done = bp.done
+	}
+	select {
+	case <-done:
+		return false
+	default:
+	}
+	select {
+	case bp.output <- set:
+		return true
+	default:
+	}
+	var shutdownCh <-chan struct{}
+	if p.shutdownCh != nil {
+		shutdownCh = p.shutdownCh
+	}
 	select {
 	case bp.output <- set:
 		return true
-	case <-bp.done:
+	case <-done:
 		return false
-	case <-p.done:
+	case <-shutdownCh:
 		return false
 	}
 }
@@ -1769,7 +1873,7 @@ func (bp *brokerProducer) handleError(sent *produceSet, err error) {
 // effectively a "bridge" between the flushers and the dispatcher in order to avoid deadlock
 // based on https://godoc.org/github.com/eapache/channels#InfiniteChannel
 func (p *asyncProducer) retryHandler() {
-	version := producerMessageByteSizeVersion(p.conf)
+	version := p.producerMessageByteSizeVersion()
 
 	var msg *ProducerMessage
 	buf := queue.New()
@@ -1782,7 +1886,7 @@ func (p *asyncProducer) retryHandler() {
 			case msg = <-p.retries:
 			case p.input <- buf.Peek().(*ProducerMessage):
 				msgToRemove := buf.Remove().(*ProducerMessage)
-				p.retryBufferQuota.release(1, int64(msgToRemove.ByteSize(version)))
+				p.releaseBuffer(1, int64(msgToRemove.ByteSize(version)))
 				continue
 			}
 		}
@@ -1792,7 +1896,7 @@ func (p *asyncProducer) retryHandler() {
 		}
 
 		buf.Add(msg)
-		bufferOverflow := p.retryBufferQuota.addAndCheckOverflow(1, int64(msg.ByteSize(version)))
+		bufferOverflow := p.addToBuffer(1, int64(msg.ByteSize(version)))
 
 		if !bufferOverflow {
 			continue
@@ -1803,10 +1907,10 @@ func (p *asyncProducer) retryHandler() {
 			select {
 			case p.input <- msgToHandle:
 				buf.Remove()
-				p.retryBufferQuota.release(1, int64(msgToHandle.ByteSize(version)))
+				p.releaseBuffer(1, int64(msgToHandle.ByteSize(version)))
 			default:
 				buf.Remove()
-				p.retryBufferQuota.release(1, int64(msgToHandle.ByteSize(version)))
+				p.releaseBuffer(1, int64(msgToHandle.ByteSize(version)))
 				p.returnError(msgToHandle, ErrProducerRetryBufferOverflow)
 			}
 		}
@@ -1817,8 +1921,8 @@ func (p *asyncProducer) retryHandler() {
 
 func (p *asyncProducer) shutdown() {
 	Logger.Println("Producer shutting down.")
-	if p.done != nil && p.closed.CompareAndSwap(false, true) {
-		close(p.done)
+	if p.shutdownCh != nil && p.shutdownChClosed.CompareAndSwap(false, true) {
+		close(p.shutdownCh)
 	}
 	p.inFlight.Add(1)
 	p.input <- &ProducerMessage{flags: shutdown}
@@ -1964,71 +2068,3 @@ func (p *asyncProducer) abandonBrokerConnection(broker *Broker) {
 
 	delete(p.brokers, broker)
 }
-
-// messageQuota tracks message and byte counts against optional maximums.
-type messageQuota struct {
-	mu          sync.Mutex
-	messages    int64
-	bytes       int64
-	maxMessages int64
-	maxBytes    int64
-}
-
-func newMessageQuota(maxMessages, maxBytes int64) messageQuota {
-	return messageQuota{
-		maxMessages: maxMessages,
-		maxBytes:    maxBytes,
-	}
-}
-
-func (q *messageQuota) tryReserve(messages, bytes int64) bool {
-	if !q.shouldTrack(messages, bytes) {
-		return true
-	}
-
-	q.mu.Lock()
-	defer q.mu.Unlock()
-
-	nextMessages := q.messages + messages
-	nextBytes := q.bytes + bytes
-	if q.wouldOverflow(nextMessages, nextBytes) {
-		return false
-	}
-	q.messages = nextMessages
-	q.bytes = nextBytes
-	return true
-}
-
-func (q *messageQuota) addAndCheckOverflow(messages, bytes int64) bool {
-	if !q.shouldTrack(messages, bytes) {
-		return false
-	}
-
-	q.mu.Lock()
-	defer q.mu.Unlock()
-
-	q.messages += messages
-	q.bytes += bytes
-	return q.wouldOverflow(q.messages, q.bytes)
-}
-
-func (q *messageQuota) release(messages, bytes int64) {
-	if !q.shouldTrack(messages, bytes) {
-		return
-	}
-
-	q.mu.Lock()
-	defer q.mu.Unlock()
-
-	q.messages -= messages
-	q.bytes -= bytes
-}
-
-func (q *messageQuota) shouldTrack(messages, bytes int64) bool {
-	return (messages != 0 || bytes != 0) && (q.maxMessages > 0 || q.maxBytes > 0)
-}
-
-func (q *messageQuota) wouldOverflow(messages, bytes int64) bool {
-	return (q.maxMessages > 0 && messages >= q.maxMessages) ||
-		(q.maxBytes > 0 && bytes >= q.maxBytes)
-}
diff --git a/async_producer_test.go b/async_producer_test.go
index b16e0bf61..a3c220f63 100644
--- a/async_producer_test.go
+++ b/async_producer_test.go
@@ -1562,8 +1562,6 @@ func TestBrokerProducerShutdown(t *testing.T) {
 	bp := producer.(*asyncProducer).getBrokerProducer(broker)
 	// Initiate the shutdown of all of them
 	producer.(*asyncProducer).unrefBrokerProducer(broker, bp)
-	assertDoneWithin(t, bp.done, time.Second)
-	require.NotPanics(t, bp.shutdown)
 
 	_ = producer.Close()
 	mockBroker.Close()
@@ -1637,16 +1635,37 @@ func assertDoneWithin[T any](t *testing.T, ch <-chan T, timeout time.Duration) T
 	}
 }
 
+func waitForProducerBuffer(t *testing.T, parent *asyncProducer, messages, bytes int64) {
+	t.Helper()
+	deadline := time.After(2 * time.Second)
+	ticker := time.NewTicker(time.Millisecond)
+	defer ticker.Stop()
+	for {
+		parent.retryBuffer.mu.Lock()
+		gotMessages := parent.retryBuffer.messages
+		gotBytes := parent.retryBuffer.bytes
+		parent.retryBuffer.mu.Unlock()
+		if (messages < 0 || gotMessages == messages) && (bytes < 0 || gotBytes == bytes) {
+			return
+		}
+		select {
+		case <-deadline:
+			t.Fatalf("timed out waiting for buffer budget messages=%d bytes=%d, got messages=%d bytes=%d",
+				messages, bytes, gotMessages, gotBytes)
+		case <-ticker.C:
+		}
+	}
+}
+
 func newBlockingRetryProducer(config *Config, errorBuffer int) (*asyncProducer, *brokerProducer) {
 	parent := &asyncProducer{
-		conf:             config,
-		muter:            newPartitionMuter(),
-		brokers:          make(map[*Broker]*brokerProducer),
-		brokerRefs:       make(map[*brokerProducer]int),
-		errors:           make(chan *ProducerError, errorBuffer),
-		done:             make(chan struct{}),
-		txnmgr:           &transactionManager{},
-		retryBufferQuota: newRetryBufferQuota(config),
+		conf:       config,
+		muter:      newPartitionMuter(),
+		brokers:    make(map[*Broker]*brokerProducer),
+		brokerRefs: make(map[*brokerProducer]int),
+		errors:     make(chan *ProducerError, errorBuffer),
+		shutdownCh: make(chan struct{}),
+		txnmgr:     &transactionManager{},
 	}
 	leader := &Broker{id: 1}
 	parent.client = &stubLeaderClient{leader: leader, cfg: config}
@@ -1661,52 +1680,6 @@ func newBlockingRetryProducer(config *Config, errorBuffer int) (*asyncProducer,
 	return parent, retryBP
 }
 
-// holdFirstRetryAfterReserve lets the test prove a second retry sees quota held by the first.
-func holdFirstRetryAfterReserve(parent *asyncProducer, retryBP *brokerProducer, afterRefresh bool) (<-chan struct{}, func()) {
-	firstRetryReserved := make(chan struct{}, 1)
-	blockFirstRetry := make(chan struct{}, 1)
-	releaseFirstRetry := make(chan struct{}, 1)
-	blockFirstRetry <- struct{}{}
-
-	hold := func() {
-		select {
-		case <-blockFirstRetry:
-			firstRetryReserved <- struct{}{}
-			<-releaseFirstRetry
-		default:
-		}
-	}
-	release := func() {
-		select {
-		case releaseFirstRetry <- struct{}{}:
-		default:
-		}
-	}
-
-	if afterRefresh {
-		parent.client = &stubLeaderClient{
-			leader: retryBP.broker,
-			cfg:    parent.conf,
-			refreshMetadata: func(...string) error {
-				hold()
-				return nil
-			},
-		}
-	} else {
-		parent.client = &stubLeaderClient{
-			cfg: parent.conf,
-			leaderFunc: func(_ string, partition int32) (*Broker, error) {
-				if partition == 0 {
-					hold()
-				}
-				return retryBP.broker, nil
-			},
-		}
-	}
-
-	return firstRetryReserved, release
-}
-
 // TestBrokerProducerWaitForSpaceRespectsExternalUnmute ensures waitForSpace does not
 // deadlock when partitions are muted by another producer and are unmuted elsewhere.
 func TestBrokerProducerWaitForSpaceRespectsExternalUnmute(t *testing.T) {
@@ -1987,12 +1960,11 @@ func TestRetryBatchRespectsPartitionMuter(t *testing.T) {
 	}
 
 	parent := &asyncProducer{
-		conf:             config,
-		muter:            newPartitionMuter(),
-		brokers:          make(map[*Broker]*brokerProducer),
-		brokerRefs:       make(map[*brokerProducer]int),
-		txnmgr:           txnMgr,
-		retryBufferQuota: newRetryBufferQuota(config),
+		conf:       config,
+		muter:      newPartitionMuter(),
+		brokers:    make(map[*Broker]*brokerProducer),
+		brokerRefs: make(map[*brokerProducer]int),
+		txnmgr:     txnMgr,
 	}
 	leader := &Broker{}
 	parent.client = &stubLeaderClient{leader: leader, cfg: config}
@@ -2023,6 +1995,7 @@ func TestRetryBatchRespectsPartitionMuter(t *testing.T) {
 	default:
 		t.Fatal("expected retry batch to be dispatched")
 	}
+	waitForProducerBuffer(t, parent, 0, 0)
 
 	contender := newProduceSet(parent)
 	safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("next")})
@@ -2149,28 +2122,18 @@ func TestHandleErrorNonIdempotentRetryDoesNotWaitForMetadataRefresh(t *testing.T
 	config.Producer.Retry.MaxBufferLength = minFunctionalRetryBufferLength
 
 	parent := &asyncProducer{
-		conf:             config,
-		muter:            newPartitionMuter(),
-		brokers:          make(map[*Broker]*brokerProducer),
-		brokerRefs:       make(map[*brokerProducer]int),
-		txnmgr:           &transactionManager{},
-		retryBufferQuota: newRetryBufferQuota(config),
+		conf:       config,
+		muter:      newPartitionMuter(),
+		brokers:    make(map[*Broker]*brokerProducer),
+		brokerRefs: make(map[*brokerProducer]int),
+		txnmgr:     &transactionManager{},
 	}
 	failedBroker := &Broker{id: 1}
 	retryLeader := &Broker{id: 2}
-	started := make(chan struct{}, 1)
-	release := make(chan struct{})
-	client := &stubLeaderClient{
-		leader: retryLeader,
-		cfg:    config,
-		refreshMetadata: func(...string) error {
-			select {
-			case started <- struct{}{}:
-			default:
-			}
-			<-release
-			return nil
-		},
+	client := &blockingRefreshClient{
+		stubLeaderClient: stubLeaderClient{leader: retryLeader, cfg: config},
+		started:          make(chan struct{}, 1),
+		release:          make(chan struct{}),
 	}
 	parent.client = client
 
@@ -2204,7 +2167,8 @@ func TestHandleErrorNonIdempotentRetryDoesNotWaitForMetadataRefresh(t *testing.T
 		close(done)
 	}()
 	assertDoneWithin(t, done, 2*time.Second)
-	assertDoneWithin(t, started, 2*time.Second)
+	assertDoneWithin(t, client.started, 2*time.Second)
+	waitForProducerBuffer(t, parent, 1, -1)
 
 	contender := newProduceSet(parent)
 	safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("next")})
@@ -2213,10 +2177,11 @@ func TestHandleErrorNonIdempotentRetryDoesNotWaitForMetadataRefresh(t *testing.T
 		t.Fatal("expected partition to remain muted while async metadata refresh is pending")
 	}
 
-	close(release)
+	close(client.release)
 	retrySet := assertDoneWithin(t, output, 2*time.Second)
 	defer parent.muter.unmute(retrySet)
 	require.Equal(t, retryPartitionSet, retrySet.msgs["topic"][0])
+	waitForProducerBuffer(t, parent, 0, 0)
 }
 
 func TestHandleErrorNonIdempotentRetryRefreshesMetadataOncePerFailedRequest(t *testing.T) {
@@ -2234,14 +2199,9 @@ func TestHandleErrorNonIdempotentRetryRefreshesMetadataOncePerFailedRequest(t *t
 	}
 	failedBroker := &Broker{id: 1}
 	retryLeader := &Broker{id: 2}
-	calls := make(chan []string, 1)
-	client := &stubLeaderClient{
-		leader: retryLeader,
-		cfg:    config,
-		refreshMetadata: func(topics ...string) error {
-			calls <- append([]string(nil), topics...)
-			return nil
-		},
+	client := &countingRefreshClient{
+		stubLeaderClient: stubLeaderClient{leader: retryLeader, cfg: config},
+		calls:            make(chan []string, 1),
 	}
 	parent.client = client
 
@@ -2274,10 +2234,10 @@ func TestHandleErrorNonIdempotentRetryRefreshesMetadataOncePerFailedRequest(t *t
 
 	bp.handleError(sent, ErrOutOfBrokers)
 
-	require.Equal(t, []string{"topic"}, assertDoneWithin(t, calls, 2*time.Second))
+	require.Equal(t, []string{"topic"}, assertDoneWithin(t, client.calls, 2*time.Second))
 	retrySet := assertDoneWithin(t, output, 2*time.Second)
 	defer parent.muter.unmute(retrySet)
-	assertNotDone(t, calls, 50*time.Millisecond)
+	assertNotDone(t, client.calls, 50*time.Millisecond)
 	assertNotDone(t, output, 50*time.Millisecond)
 
 	retriedPartitions := make(map[int32]*partitionSet)
@@ -2304,9 +2264,8 @@ func TestHandleErrorNonIdempotentRetryGroupsBatchesByLeader(t *testing.T) {
 	failedBroker := &Broker{id: 1}
 	leaderA := &Broker{id: 2}
 	leaderB := &Broker{id: 3}
-	calls := make(chan []string, 1)
-	client := &stubLeaderClient{
-		cfg: config,
+	client := &routingLeaderClient{
+		stubLeaderClient: stubLeaderClient{cfg: config},
 		leaders: map[string]map[int32]*Broker{
 			"topic-a": {
 				0: leaderA,
@@ -2317,10 +2276,7 @@ func TestHandleErrorNonIdempotentRetryGroupsBatchesByLeader(t *testing.T) {
 				1: leaderB,
 			},
 		},
-		refreshMetadata: func(topics ...string) error {
-			calls <- append([]string(nil), topics...)
-			return nil
-		},
+		calls: make(chan []string, 1),
 	}
 	parent.client = client
 
@@ -2360,7 +2316,7 @@ func TestHandleErrorNonIdempotentRetryGroupsBatchesByLeader(t *testing.T) {
 
 	bp.handleError(sent, ErrOutOfBrokers)
 
-	require.ElementsMatch(t, []string{"topic-a", "topic-b"}, assertDoneWithin(t, calls, 2*time.Second))
+	require.ElementsMatch(t, []string{"topic-a", "topic-b"}, assertDoneWithin(t, client.calls, 2*time.Second))
 	retrySetA := assertDoneWithin(t, outputA, 2*time.Second)
 	defer parent.muter.unmute(retrySetA)
 	retrySetB := assertDoneWithin(t, outputB, 2*time.Second)
@@ -2438,27 +2394,16 @@ func TestRetryBatchReleasesMuteWhenHandoffAbortedByShutdown(t *testing.T) {
 	config.Producer.Retry.MaxBufferLength = minFunctionalRetryBufferLength
 
 	parent := &asyncProducer{
-		conf:             config,
-		muter:            newPartitionMuter(),
-		brokers:          make(map[*Broker]*brokerProducer),
-		brokerRefs:       make(map[*brokerProducer]int),
-		errors:           make(chan *ProducerError, 1),
-		done:             make(chan struct{}),
-		txnmgr:           &transactionManager{},
-		retryBufferQuota: newRetryBufferQuota(config),
+		conf:       config,
+		muter:      newPartitionMuter(),
+		brokers:    make(map[*Broker]*brokerProducer),
+		brokerRefs: make(map[*brokerProducer]int),
+		errors:     make(chan *ProducerError, 1),
+		shutdownCh: make(chan struct{}),
+		txnmgr:     &transactionManager{},
 	}
 	leader := &Broker{id: 1}
-	leaderLookup := make(chan struct{}, 1)
-	parent.client = &stubLeaderClient{
-		cfg: config,
-		leaderFunc: func(string, int32) (*Broker, error) {
-			select {
-			case leaderLookup <- struct{}{}:
-			default:
-			}
-			return leader, nil
-		},
-	}
+	parent.client = &stubLeaderClient{leader: leader, cfg: config}
 	retryBP := &brokerProducer{
 		parent: parent,
 		broker: leader,
@@ -2481,15 +2426,16 @@ func TestRetryBatchReleasesMuteWhenHandoffAbortedByShutdown(t *testing.T) {
 		parent.retryBatch("topic", 0, retryPartitionSet, ErrOutOfBrokers, true)
 		close(done)
 	}()
-	assertDoneWithin(t, leaderLookup, 2*time.Second)
+	waitForProducerBuffer(t, parent, 1, -1)
 
-	if parent.closed.CompareAndSwap(false, true) {
-		close(parent.done)
+	if parent.shutdownChClosed.CompareAndSwap(false, true) {
+		close(parent.shutdownCh)
 	}
 
 	producerErr := assertDoneWithin(t, parent.errors, 2*time.Second)
 	require.Equal(t, ErrShuttingDown, producerErr.Err)
 	assertDoneWithin(t, done, 2*time.Second)
+	waitForProducerBuffer(t, parent, 0, 0)
 
 	contender := newProduceSet(parent)
 	safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("next")})
@@ -2499,105 +2445,114 @@ func TestRetryBatchReleasesMuteWhenHandoffAbortedByShutdown(t *testing.T) {
 	parent.muter.unmute(contender)
 }
 
-func TestRetryUsesSharedBufferBudget(t *testing.T) {
-	tests := []struct {
-		name         string
-		afterRefresh bool
-		bytesLimit   bool
-	}{
-		{name: "retryBatch length"},
-		{name: "retryBatch bytes", bytesLimit: true},
-		{name: "retryBatchesAfterRefresh length", afterRefresh: true},
-		{name: "retryBatchesAfterRefresh bytes", afterRefresh: true, bytesLimit: true},
+func TestRetryBatchUsesSharedBufferLengthBudget(t *testing.T) {
+	config := NewTestConfig()
+	config.Producer.Idempotent = true
+	config.Producer.Retry.Max = 1
+	config.Producer.Retry.Backoff = 0
+	config.Producer.Return.Errors = true
+	config.Producer.Retry.MaxBufferLength = minFunctionalRetryBufferLength + 1
+
+	parent, retryBP := newBlockingRetryProducer(config, minFunctionalRetryBufferLength+1)
+
+	first := newProduceSet(parent)
+	for i := 0; i < minFunctionalRetryBufferLength; i++ {
+		safeAddMessage(t, first, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("retry")})
+	}
+	firstPartitionSet := first.msgs["topic"][0]
+	if !parent.muter.tryMute(first) {
+		t.Fatal("expected first retry batch to mute partitions")
 	}
+	parent.inFlight.Add(minFunctionalRetryBufferLength)
 
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			config := NewTestConfig()
-			config.Producer.Idempotent = !tt.afterRefresh
-			config.Producer.Retry.Max = 1
-			config.Producer.Retry.Backoff = 0
-			config.Producer.Return.Errors = true
-
-			firstCount := 1
-			errorBuffer := 2
-			var firstMsg *ProducerMessage
-			secondMsg := &ProducerMessage{Topic: "topic", Partition: 1, Value: StringEncoder("overflow")}
-
-			if tt.bytesLimit {
-				firstMsg = &ProducerMessage{Topic: "topic", Partition: 0, Value: ByteEncoder(make([]byte, minFunctionalRetryBufferBytes))}
-				version := producerMessageByteSizeVersion(config)
-				firstBytes := int64(firstMsg.ByteSize(version))
-				secondBytes := int64(secondMsg.ByteSize(version))
-				config.Producer.Retry.MaxBufferBytes = firstBytes + secondBytes
-			} else {
-				config.Producer.Retry.MaxBufferLength = minFunctionalRetryBufferLength + 1
-				firstCount = minFunctionalRetryBufferLength
-				errorBuffer = minFunctionalRetryBufferLength + 1
-			}
+	firstDone := make(chan struct{})
+	go func() {
+		parent.retryBatch("topic", 0, firstPartitionSet, ErrNotEnoughReplicas, true)
+		close(firstDone)
+	}()
+	waitForProducerBuffer(t, parent, minFunctionalRetryBufferLength, -1)
 
-			parent, retryBP := newBlockingRetryProducer(config, errorBuffer)
-			firstRetryReserved, releaseFirstRetry := holdFirstRetryAfterReserve(parent, retryBP, tt.afterRefresh)
-			defer releaseFirstRetry()
-
-			retry := func(partition int32, pSet *partitionSet) {
-				if tt.afterRefresh {
-					parent.retryBatchesAfterRefresh([]string{"topic"}, []partitionBatchRetry{{
-						topic:     "topic",
-						partition: partition,
-						pSet:      pSet,
-					}}, ErrOutOfBrokers)
-					return
-				}
-				parent.retryBatch("topic", partition, pSet, ErrNotEnoughReplicas, true)
-			}
+	second := newProduceSet(parent)
+	safeAddMessage(t, second, &ProducerMessage{Topic: "topic", Partition: 1, Value: StringEncoder("overflow")})
+	secondPartitionSet := second.msgs["topic"][1]
+	if !parent.muter.tryMute(second) {
+		t.Fatal("expected second retry batch to mute partitions")
+	}
+	parent.inFlight.Add(1)
+	parent.retryBatch("topic", 1, secondPartitionSet, ErrNotEnoughReplicas, true)
 
-			first := newProduceSet(parent)
-			if tt.bytesLimit {
-				safeAddMessage(t, first, firstMsg)
-			} else {
-				for i := 0; i < minFunctionalRetryBufferLength; i++ {
-					safeAddMessage(t, first, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("retry")})
-				}
-			}
-			firstPartitionSet := first.msgs["topic"][0]
-			if !parent.muter.tryMute(first) {
-				t.Fatal("expected first retry batch to mute partitions")
-			}
-			parent.inFlight.Add(firstCount)
+	producerErr := assertDoneWithin(t, parent.errors, 2*time.Second)
+	require.Equal(t, ErrProducerRetryBufferOverflow, producerErr.Err)
+	require.Equal(t, int32(1), producerErr.Msg.Partition)
+	waitForProducerBuffer(t, parent, minFunctionalRetryBufferLength, -1)
 
-			firstDone := make(chan struct{})
-			go func() {
-				retry(0, firstPartitionSet)
-				close(firstDone)
-			}()
-			assertDoneWithin(t, firstRetryReserved, 2*time.Second)
+	contender := newProduceSet(parent)
+	safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 1, Value: StringEncoder("next")})
+	if !parent.muter.tryMute(contender) {
+		t.Fatal("expected overflowed retry batch to release partition mute")
+	}
+	parent.muter.unmute(contender)
 
-			second := newProduceSet(parent)
-			safeAddMessage(t, second, secondMsg)
-			secondPartitionSet := second.msgs["topic"][1]
-			if !parent.muter.tryMute(second) {
-				t.Fatal("expected second retry batch to mute partitions")
-			}
-			parent.inFlight.Add(1)
-			retry(1, secondPartitionSet)
+	close(retryBP.done)
+	assertDoneWithin(t, firstDone, 2*time.Second)
+	waitForProducerBuffer(t, parent, 0, 0)
+}
 
-			producerErr := assertDoneWithin(t, parent.errors, 2*time.Second)
-			require.Equal(t, ErrProducerRetryBufferOverflow, producerErr.Err)
-			require.Equal(t, int32(1), producerErr.Msg.Partition)
+func TestRetryBatchUsesSharedBufferBytesBudget(t *testing.T) {
+	config := NewTestConfig()
+	config.Producer.Idempotent = true
+	config.Producer.Retry.Max = 1
+	config.Producer.Retry.Backoff = 0
+	config.Producer.Return.Errors = true
 
-			contender := newProduceSet(parent)
-			safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 1, Value: StringEncoder("next")})
-			if !parent.muter.tryMute(contender) {
-				t.Fatal("expected overflowed retry batch to release partition mute")
-			}
-			parent.muter.unmute(contender)
+	parent, retryBP := newBlockingRetryProducer(config, 2)
 
-			close(retryBP.done)
-			releaseFirstRetry()
-			assertDoneWithin(t, firstDone, 2*time.Second)
-		})
+	firstMsg := &ProducerMessage{Topic: "topic", Partition: 0, Value: ByteEncoder(make([]byte, minFunctionalRetryBufferBytes))}
+	secondMsg := &ProducerMessage{Topic: "topic", Partition: 1, Value: StringEncoder("overflow")}
+	version := parent.producerMessageByteSizeVersion()
+	firstBytes := int64(firstMsg.ByteSize(version))
+	secondBytes := int64(secondMsg.ByteSize(version))
+	config.Producer.Retry.MaxBufferBytes = firstBytes + secondBytes
+
+	first := newProduceSet(parent)
+	safeAddMessage(t, first, firstMsg)
+	firstPartitionSet := first.msgs["topic"][0]
+	if !parent.muter.tryMute(first) {
+		t.Fatal("expected first retry batch to mute partitions")
+	}
+	parent.inFlight.Add(1)
+
+	firstDone := make(chan struct{})
+	go func() {
+		parent.retryBatch("topic", 0, firstPartitionSet, ErrNotEnoughReplicas, true)
+		close(firstDone)
+	}()
+	waitForProducerBuffer(t, parent, -1, firstBytes)
+
+	second := newProduceSet(parent)
+	safeAddMessage(t, second, secondMsg)
+	secondPartitionSet := second.msgs["topic"][1]
+	if !parent.muter.tryMute(second) {
+		t.Fatal("expected second retry batch to mute partitions")
 	}
+	parent.inFlight.Add(1)
+	parent.retryBatch("topic", 1, secondPartitionSet, ErrNotEnoughReplicas, true)
+
+	producerErr := assertDoneWithin(t, parent.errors, 2*time.Second)
+	require.Equal(t, ErrProducerRetryBufferOverflow, producerErr.Err)
+	require.Equal(t, int32(1), producerErr.Msg.Partition)
+	waitForProducerBuffer(t, parent, -1, firstBytes)
+
+	contender := newProduceSet(parent)
+	safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 1, Value: StringEncoder("next")})
+	if !parent.muter.tryMute(contender) {
+		t.Fatal("expected overflowed retry batch to release partition mute")
+	}
+	parent.muter.unmute(contender)
+
+	close(retryBP.done)
+	assertDoneWithin(t, firstDone, 2*time.Second)
+	waitForProducerBuffer(t, parent, 0, 0)
 }
 
 func TestRetryBatchesAfterRefreshReleasesMuteWhenBrokerProducerDone(t *testing.T) {
@@ -2613,7 +2568,7 @@ func TestRetryBatchesAfterRefreshReleasesMuteWhenBrokerProducerDone(t *testing.T
 		brokers:    make(map[*Broker]*brokerProducer),
 		brokerRefs: make(map[*brokerProducer]int),
 		errors:     make(chan *ProducerError, 1),
-		done:       make(chan struct{}),
+		shutdownCh: make(chan struct{}),
 		txnmgr:     &transactionManager{},
 	}
 	leader := &Broker{id: 1}
@@ -2658,6 +2613,128 @@ func TestRetryBatchesAfterRefreshReleasesMuteWhenBrokerProducerDone(t *testing.T
 	parent.muter.unmute(contender)
 }
 
+func TestRetryBatchesAfterRefreshUsesSharedBufferLengthBudget(t *testing.T) {
+	config := NewTestConfig()
+	config.Producer.Idempotent = false
+	config.Producer.Retry.Max = 1
+	config.Producer.Retry.Backoff = 0
+	config.Producer.Return.Errors = true
+	config.Producer.Retry.MaxBufferLength = minFunctionalRetryBufferLength + 1
+
+	parent, retryBP := newBlockingRetryProducer(config, minFunctionalRetryBufferLength+1)
+
+	first := newProduceSet(parent)
+	for i := 0; i < minFunctionalRetryBufferLength; i++ {
+		safeAddMessage(t, first, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("retry")})
+	}
+	if !parent.muter.tryMute(first) {
+		t.Fatal("expected first retry batch to mute partitions")
+	}
+	parent.inFlight.Add(minFunctionalRetryBufferLength)
+
+	firstDone := make(chan struct{})
+	go func() {
+		parent.retryBatchesAfterRefresh([]string{"topic"}, []partitionBatchRetry{{
+			topic:     "topic",
+			partition: 0,
+			pSet:      first.msgs["topic"][0],
+		}}, ErrOutOfBrokers)
+		close(firstDone)
+	}()
+	waitForProducerBuffer(t, parent, minFunctionalRetryBufferLength, -1)
+
+	second := newProduceSet(parent)
+	safeAddMessage(t, second, &ProducerMessage{Topic: "topic", Partition: 1, Value: StringEncoder("overflow")})
+	if !parent.muter.tryMute(second) {
+		t.Fatal("expected second retry batch to mute partitions")
+	}
+	parent.inFlight.Add(1)
+	parent.retryBatchesAfterRefresh([]string{"topic"}, []partitionBatchRetry{{
+		topic:     "topic",
+		partition: 1,
+		pSet:      second.msgs["topic"][1],
+	}}, ErrOutOfBrokers)
+
+	producerErr := assertDoneWithin(t, parent.errors, 2*time.Second)
+	require.Equal(t, ErrProducerRetryBufferOverflow, producerErr.Err)
+	require.Equal(t, int32(1), producerErr.Msg.Partition)
+	waitForProducerBuffer(t, parent, minFunctionalRetryBufferLength, -1)
+
+	contender := newProduceSet(parent)
+	safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 1, Value: StringEncoder("next")})
+	if !parent.muter.tryMute(contender) {
+		t.Fatal("expected overflowed retry batch to release partition mute")
+	}
+	parent.muter.unmute(contender)
+
+	close(retryBP.done)
+	assertDoneWithin(t, firstDone, 2*time.Second)
+	waitForProducerBuffer(t, parent, 0, 0)
+}
+
+func TestRetryBatchesAfterRefreshUsesSharedBufferBytesBudget(t *testing.T) {
+	config := NewTestConfig()
+	config.Producer.Idempotent = false
+	config.Producer.Retry.Max = 1
+	config.Producer.Retry.Backoff = 0
+	config.Producer.Return.Errors = true
+
+	parent, retryBP := newBlockingRetryProducer(config, 2)
+
+	firstMsg := &ProducerMessage{Topic: "topic", Partition: 0, Value: ByteEncoder(make([]byte, minFunctionalRetryBufferBytes))}
+	secondMsg := &ProducerMessage{Topic: "topic", Partition: 1, Value: StringEncoder("overflow")}
+	version := parent.producerMessageByteSizeVersion()
+	firstBytes := int64(firstMsg.ByteSize(version))
+	secondBytes := int64(secondMsg.ByteSize(version))
+	config.Producer.Retry.MaxBufferBytes = firstBytes + secondBytes
+
+	first := newProduceSet(parent)
+	safeAddMessage(t, first, firstMsg)
+	if !parent.muter.tryMute(first) {
+		t.Fatal("expected first retry batch to mute partitions")
+	}
+	parent.inFlight.Add(1)
+
+	firstDone := make(chan struct{})
+	go func() {
+		parent.retryBatchesAfterRefresh([]string{"topic"}, []partitionBatchRetry{{
+			topic:     "topic",
+			partition: 0,
+			pSet:      first.msgs["topic"][0],
+		}}, ErrOutOfBrokers)
+		close(firstDone)
+	}()
+	waitForProducerBuffer(t, parent, -1, firstBytes)
+
+	second := newProduceSet(parent)
+	safeAddMessage(t, second, secondMsg)
+	if !parent.muter.tryMute(second) {
+		t.Fatal("expected second retry batch to mute partitions")
+	}
+	parent.inFlight.Add(1)
+	parent.retryBatchesAfterRefresh([]string{"topic"}, []partitionBatchRetry{{
+		topic:     "topic",
+		partition: 1,
+		pSet:      second.msgs["topic"][1],
+	}}, ErrOutOfBrokers)
+
+	producerErr := assertDoneWithin(t, parent.errors, 2*time.Second)
+	require.Equal(t, ErrProducerRetryBufferOverflow, producerErr.Err)
+	require.Equal(t, int32(1), producerErr.Msg.Partition)
+	waitForProducerBuffer(t, parent, -1, firstBytes)
+
+	contender := newProduceSet(parent)
+	safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 1, Value: StringEncoder("next")})
+	if !parent.muter.tryMute(contender) {
+		t.Fatal("expected overflowed retry batch to release partition mute")
+	}
+	parent.muter.unmute(contender)
+
+	close(retryBP.done)
+	assertDoneWithin(t, firstDone, 2*time.Second)
+	waitForProducerBuffer(t, parent, 0, 0)
+}
+
 func TestHandleErrorNonIdempotentRetryReleasesMuteOnLeaderError(t *testing.T) {
 	config := NewTestConfig()
 	config.Producer.Idempotent = false
@@ -2673,7 +2750,10 @@ func TestHandleErrorNonIdempotentRetryReleasesMuteOnLeaderError(t *testing.T) {
 		errors:     make(chan *ProducerError, 1),
 		txnmgr:     &transactionManager{},
 	}
-	parent.client = &stubLeaderClient{cfg: config, leaderErr: ErrOutOfBrokers}
+	parent.client = &failingLeaderClient{
+		stubLeaderClient: stubLeaderClient{cfg: config},
+		err:              ErrOutOfBrokers,
+	}
 
 	bp := &brokerProducer{
 		parent:            parent,
@@ -2704,12 +2784,8 @@ func TestHandleErrorNonIdempotentRetryReleasesMuteOnLeaderError(t *testing.T) {
 }
 
 type stubLeaderClient struct {
-	cfg             *Config
-	leader          *Broker
-	leaderFunc      func(string, int32) (*Broker, error)
-	leaderErr       error
-	leaders         map[string]map[int32]*Broker
-	refreshMetadata func(...string) error
+	cfg    *Config
+	leader *Broker
 }
 
 func (c *stubLeaderClient) Config() *Config                            { return c.cfg }
@@ -2721,36 +2797,16 @@ func (c *stubLeaderClient) Topics() ([]string, error)                  { return
 func (c *stubLeaderClient) Partitions(string) ([]int32, error)         { return nil, nil }
 func (c *stubLeaderClient) WritablePartitions(string) ([]int32, error) { return nil, nil }
 func (c *stubLeaderClient) Leader(topic string, partitionID int32) (*Broker, error) {
-	if c.leaderFunc != nil {
-		return c.leaderFunc(topic, partitionID)
-	}
-	if c.leaderErr != nil {
-		return nil, c.leaderErr
-	}
-	if partitions := c.leaders[topic]; partitions != nil {
-		if leader := partitions[partitionID]; leader != nil {
-			return leader, nil
-		}
-	}
-	if c.leaders != nil {
-		return nil, ErrLeaderNotAvailable
-	}
 	return c.leader, nil
 }
-func (c *stubLeaderClient) LeaderAndEpoch(topic string, partition int32) (*Broker, int32, error) {
-	leader, err := c.Leader(topic, partition)
-	return leader, 0, err
-}
-func (c *stubLeaderClient) Replicas(string, int32) ([]int32, error)        { return nil, nil }
-func (c *stubLeaderClient) InSyncReplicas(string, int32) ([]int32, error)  { return nil, nil }
-func (c *stubLeaderClient) OfflineReplicas(string, int32) ([]int32, error) { return nil, nil }
-func (c *stubLeaderClient) RefreshBrokers([]string) error                  { return nil }
-func (c *stubLeaderClient) RefreshMetadata(topics ...string) error {
-	if c.refreshMetadata != nil {
-		return c.refreshMetadata(topics...)
-	}
-	return nil
+func (c *stubLeaderClient) LeaderAndEpoch(string, int32) (*Broker, int32, error) {
+	return c.leader, 0, nil
 }
+func (c *stubLeaderClient) Replicas(string, int32) ([]int32, error)          { return nil, nil }
+func (c *stubLeaderClient) InSyncReplicas(string, int32) ([]int32, error)    { return nil, nil }
+func (c *stubLeaderClient) OfflineReplicas(string, int32) ([]int32, error)   { return nil, nil }
+func (c *stubLeaderClient) RefreshBrokers([]string) error                    { return nil }
+func (c *stubLeaderClient) RefreshMetadata(...string) error                  { return nil }
 func (c *stubLeaderClient) GetOffset(string, int32, int64) (int64, error)    { return 0, nil }
 func (c *stubLeaderClient) Coordinator(string) (*Broker, error)              { return nil, nil }
 func (c *stubLeaderClient) RefreshCoordinator(string) error                  { return nil }
@@ -2762,6 +2818,69 @@ func (c *stubLeaderClient) PartitionNotReadable(string, int32) bool          { r
 func (c *stubLeaderClient) Close() error                                     { return nil }
 func (c *stubLeaderClient) Closed() bool                                     { return false }
 
+type failingLeaderClient struct {
+	stubLeaderClient
+	err error
+}
+
+func (c *failingLeaderClient) Leader(string, int32) (*Broker, error) {
+	return nil, c.err
+}
+
+func (c *failingLeaderClient) LeaderAndEpoch(string, int32) (*Broker, int32, error) {
+	return nil, 0, c.err
+}
+
+type blockingRefreshClient struct {
+	stubLeaderClient
+	started chan struct{}
+	release chan struct{}
+}
+
+func (c *blockingRefreshClient) RefreshMetadata(...string) error {
+	select {
+	case c.started <- struct{}{}:
+	default:
+	}
+	<-c.release
+	return nil
+}
+
+type countingRefreshClient struct {
+	stubLeaderClient
+	calls chan []string
+}
+
+func (c *countingRefreshClient) RefreshMetadata(topics ...string) error {
+	c.calls <- append([]string(nil), topics...)
+	return nil
+}
+
+type routingLeaderClient struct {
+	stubLeaderClient
+	leaders map[string]map[int32]*Broker
+	calls   chan []string
+}
+
+func (c *routingLeaderClient) Leader(topic string, partition int32) (*Broker, error) {
+	if partitions := c.leaders[topic]; partitions != nil {
+		if leader := partitions[partition]; leader != nil {
+			return leader, nil
+		}
+	}
+	return nil, ErrLeaderNotAvailable
+}
+
+func (c *routingLeaderClient) LeaderAndEpoch(topic string, partition int32) (*Broker, int32, error) {
+	leader, err := c.Leader(topic, partition)
+	return leader, 0, err
+}
+
+func (c *routingLeaderClient) RefreshMetadata(topics ...string) error {
+	c.calls <- append([]string(nil), topics...)
+	return nil
+}
+
 func testProducerInterceptor(
 	t *testing.T,
 	interceptors []ProducerInterceptor,
diff --git a/produce_set_test.go b/produce_set_test.go
index ea8086750..359bfd47b 100644
--- a/produce_set_test.go
+++ b/produce_set_test.go
@@ -26,29 +26,20 @@ func safeAddMessage(t *testing.T, ps *produceSet, msg *ProducerMessage) {
 }
 
 func TestShouldKeepMuted(t *testing.T) {
-	pSet := &partitionSet{}
-	if pSet.shouldKeepMuted(1) {
-		t.Error("empty partition set should not be retryable")
-	}
-
-	pSet = &partitionSet{
-		msgs: []*ProducerMessage{
-			{retries: 0},
-			{retries: 0},
-		},
-	}
-	if !pSet.shouldKeepMuted(1) {
-		t.Error("expected batch to be retryable when every message is below retry max")
-	}
-
-	pSet = &partitionSet{
-		msgs: []*ProducerMessage{
-			{retries: 0},
-			{retries: 1},
-		},
-	}
-	if pSet.shouldKeepMuted(1) {
-		t.Error("expected batch not to be retryable when any message has exhausted retries")
+	if (&partitionSet{}).shouldKeepMuted(1) {
+		t.Fatal("empty partition set should not be retryable")
+	}
+	if !(&partitionSet{msgs: []*ProducerMessage{
+		{retries: 0},
+		{retries: 0},
+	}}).shouldKeepMuted(1) {
+		t.Fatal("expected batch to be retryable when every message is below retry max")
+	}
+	if (&partitionSet{msgs: []*ProducerMessage{
+		{retries: 0},
+		{retries: 1},
+	}}).shouldKeepMuted(1) {
+		t.Fatal("expected batch not to be retryable when any message has exhausted retries")
 	}
 }
 

From 2031e626e168a14b367444700442b5b025669e16 Mon Sep 17 00:00:00 2001
From: 3AceShowHand <jinl1037@hotmail.com>
Date: Thu, 25 Jun 2026 22:03:05 +0800
Subject: [PATCH 3/6] Revert "Optimize logs (#24)"

This reverts commit dd8ea47b5117d8c751099378fc308bd332659f53.
---
 async_producer.go | 1 +
 client.go         | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/async_producer.go b/async_producer.go
index 14c813ace..d6f02bd45 100644
--- a/async_producer.go
+++ b/async_producer.go
@@ -1206,6 +1206,7 @@ func (bp *brokerProducer) run() {
 			}
 
 			if bp.accumulatingBatch.wouldOverflow(msg) {
+				DebugLogger.Printf("producer/broker/%d maximum request accumulated, waiting for space\n", bp.broker.ID())
 				if err := bp.waitForSpace(msg, false); err != nil {
 					bp.parent.retryMessage(msg, err)
 					continue
diff --git a/client.go b/client.go
index 71b57b15c..6641743ac 100644
--- a/client.go
+++ b/client.go
@@ -1103,10 +1103,12 @@ func (client *client) tryRefreshMetadata(topics []string, attemptsRemaining int,
 				return err
 			}
 			// else remove that broker and try again
+			Logger.Printf("client/metadata got error from broker %d while fetching metadata: %v\n", broker.ID(), err)
 			_ = broker.Close()
 			client.deregisterBroker(broker)
 		} else {
 			// some other error, remove that broker and try again
+			Logger.Printf("client/metadata got error from broker %d while fetching metadata: %v\n", broker.ID(), err)
 			brokerErrors = append(brokerErrors, err)
 			_ = broker.Close()
 			client.deregisterBroker(broker)

From cff8910c6faca23f7ea912a250f2899618ab292a Mon Sep 17 00:00:00 2001
From: 3AceShowHand <jinl1037@hotmail.com>
Date: Thu, 25 Jun 2026 22:03:05 +0800
Subject: [PATCH 4/6] Revert "add more code to enhance the retry (#23)"

This reverts commit 7d297873a58582cab761159daf43f7270d37974a.
---
 async_producer.go           | 313 +++------------------
 async_producer_test.go      | 527 +-----------------------------------
 config.go                   |  16 +-
 functional_producer_test.go |   4 +-
 produce_set.go              |   9 -
 5 files changed, 61 insertions(+), 808 deletions(-)

diff --git a/async_producer.go b/async_producer.go
index d6f02bd45..29bc3a114 100644
--- a/async_producer.go
+++ b/async_producer.go
@@ -14,7 +14,7 @@ import (
 	"github.com/rcrowley/go-metrics"
 )
 
-// ErrProducerRetryBufferOverflow is returned when producer retry buffering is full and OOM prevention needs to be applied.
+// ErrProducerRetryBufferOverflow is returned when the bridging retry buffer is full and OOM prevention needs to be applied.
 var ErrProducerRetryBufferOverflow = errors.New("retry buffer full: message discarded to prevent buffer overflow")
 
 const (
@@ -102,8 +102,6 @@ type asyncProducer struct {
 	brokerRefs map[*brokerProducer]int
 	brokerLock sync.Mutex
 
-	retryBuffer retryBufferQuota
-
 	txnmgr *transactionManager
 	txLock sync.Mutex
 
@@ -114,14 +112,6 @@ type asyncProducer struct {
 	metricsRegistry metrics.Registry
 }
 
-type retryBufferQuota struct {
-	// messages and bytes track producer-level retry buffer occupancy when
-	// Producer.Retry.MaxBufferLength or MaxBufferBytes is bounded.
-	messages int64
-	bytes    int64
-	mu       sync.Mutex
-}
-
 type partitionMuter struct {
 	mu             sync.Mutex
 	cond           *sync.Cond
@@ -801,13 +791,6 @@ func (pp *partitionProducer) backoff(retries int) {
 }
 
 func (p *asyncProducer) backoff(retries int) {
-	backoff := p.retryBackoff(retries)
-	if backoff > 0 {
-		time.Sleep(backoff)
-	}
-}
-
-func (p *asyncProducer) retryBackoff(retries int) time.Duration {
 	var backoff time.Duration
 	if p.conf.Producer.Retry.BackoffFunc != nil {
 		maxRetries := p.conf.Producer.Retry.Max
@@ -815,37 +798,8 @@ func (p *asyncProducer) retryBackoff(retries int) time.Duration {
 	} else {
 		backoff = p.conf.Producer.Retry.Backoff
 	}
-	return backoff
-}
-
-func (p *asyncProducer) shuttingDown() bool {
-	if p.shutdownCh == nil {
-		return false
-	}
-	select {
-	case <-p.shutdownCh:
-		return true
-	default:
-		return false
-	}
-}
-
-func (p *asyncProducer) backoffOrDone(retries int) bool {
-	backoff := p.retryBackoff(retries)
-	if backoff <= 0 {
-		return !p.shuttingDown()
-	}
-	timer := time.NewTimer(backoff)
-	defer timer.Stop()
-	if p.shutdownCh == nil {
-		<-timer.C
-		return true
-	}
-	select {
-	case <-timer.C:
-		return true
-	case <-p.shutdownCh:
-		return false
+	if backoff > 0 {
+		time.Sleep(backoff)
 	}
 }
 
@@ -1494,18 +1448,12 @@ func (bp *brokerProducer) handleSuccess(sent *produceSet, response *ProduceRespo
 func (p *asyncProducer) retryBatch(topic string, partition int32, pSet *partitionSet, retryErr error, alreadyMuted bool) {
 	Logger.Printf("Retrying batch for %v-%d because of %v\n", topic, partition, retryErr)
 	produceSet := newProduceSet(p)
-	produceSet.addPartitionSet(topic, partition, pSet)
-	bufferMessages, bufferBytes := int64(produceSet.bufferCount), int64(produceSet.bufferBytes)
-	bufferReserved := false
-	releaseBuffer := func() {
-		if bufferReserved {
-			p.releaseBuffer(bufferMessages, bufferBytes)
-			bufferReserved = false
-		}
-	}
+	produceSet.msgs[topic] = make(map[int32]*partitionSet)
+	produceSet.msgs[topic][partition] = pSet
+	produceSet.bufferBytes += pSet.bufferBytes
+	produceSet.bufferCount += len(pSet.msgs)
 	muted := alreadyMuted
 	failBatch := func(err error) {
-		releaseBuffer()
 		// Release the partition before reporting errors so a blocked Errors
 		// consumer cannot keep later same-partition messages muted.
 		if muted {
@@ -1523,17 +1471,12 @@ func (p *asyncProducer) retryBatch(topic string, partition int32, pSet *partitio
 		}
 		msg.retries++
 	}
-	if !p.reserveBuffer(bufferMessages, bufferBytes) {
-		failBatch(ErrProducerRetryBufferOverflow)
-		return
-	}
-	bufferReserved = p.retryBufferBounded() && (bufferMessages != 0 || bufferBytes != 0)
 
-	// Honor Producer.Retry.Backoff between retry attempts (#2469). retryBatch
-	// dispatches the produceSet directly to the broker, bypassing partitionProducer.dispatch.
-	if len(pSet.msgs) > 0 && !p.backoffOrDone(pSet.msgs[0].retries) {
-		failBatch(ErrShuttingDown)
-		return
+	// honor Producer.Retry.Backoff between retry attempts (#2469); the
+	// non-idempotent path gets this from partitionProducer.dispatch, but
+	// retryBatch dispatches the produceSet directly to the broker
+	if len(pSet.msgs) > 0 {
+		p.backoff(pSet.msgs[0].retries)
 	}
 
 	// it's expected that a metadata refresh has been requested prior to calling retryBatch
@@ -1556,7 +1499,6 @@ func (p *asyncProducer) retryBatch(topic string, partition int32, pSet *partitio
 		failBatch(ErrShuttingDown)
 		return
 	}
-	releaseBuffer()
 }
 
 type partitionBatchRetry struct {
@@ -1565,213 +1507,30 @@ type partitionBatchRetry struct {
 	pSet      *partitionSet
 }
 
-func (p *asyncProducer) failMutedBatch(batch partitionBatchRetry, err error) {
-	produceSet := newProduceSet(p)
-	produceSet.addPartitionSet(batch.topic, batch.partition, batch.pSet)
-	p.muter.unmute(produceSet)
-	for _, msg := range batch.pSet.msgs {
-		p.returnError(msg, err)
-	}
-}
-
-func (p *asyncProducer) failMutedBatches(batches []partitionBatchRetry, err error) {
-	for _, batch := range batches {
-		p.failMutedBatch(batch, err)
-	}
-}
-
-func (p *asyncProducer) retryBufferLimits() (int64, int64) {
-	maxBufferLength := p.conf.Producer.Retry.MaxBufferLength
-	if 0 < maxBufferLength && maxBufferLength < minFunctionalRetryBufferLength {
-		maxBufferLength = minFunctionalRetryBufferLength
-	}
-
-	maxBufferBytes := p.conf.Producer.Retry.MaxBufferBytes
-	if 0 < maxBufferBytes && maxBufferBytes < minFunctionalRetryBufferBytes {
-		maxBufferBytes = minFunctionalRetryBufferBytes
-	}
-
-	return int64(maxBufferLength), maxBufferBytes
-}
-
-func (p *asyncProducer) retryBufferBounded() bool {
-	maxBufferLength, maxBufferBytes := p.retryBufferLimits()
-	return maxBufferLength > 0 || maxBufferBytes > 0
-}
-
-func (p *asyncProducer) producerMessageByteSizeVersion() int {
-	if p.conf.Version.IsAtLeast(V0_11_0_0) {
-		return 2
-	}
-	return 1
-}
-
-func (p *asyncProducer) bufferWouldOverflow(messages, bytes int64) bool {
-	maxBufferLength, maxBufferBytes := p.retryBufferLimits()
-	return (maxBufferLength > 0 && messages >= maxBufferLength) ||
-		(maxBufferBytes > 0 && bytes >= maxBufferBytes)
-}
-
-func (p *asyncProducer) reserveBuffer(messages, bytes int64) bool {
-	if messages == 0 && bytes == 0 {
-		return true
-	}
-	if !p.retryBufferBounded() {
-		return true
-	}
-	p.retryBuffer.mu.Lock()
-	defer p.retryBuffer.mu.Unlock()
-
-	if p.bufferWouldOverflow(p.retryBuffer.messages+messages, p.retryBuffer.bytes+bytes) {
-		return false
-	}
-	p.retryBuffer.messages += messages
-	p.retryBuffer.bytes += bytes
-	return true
-}
-
-func (p *asyncProducer) addToBuffer(messages, bytes int64) bool {
-	if messages == 0 && bytes == 0 {
-		return false
-	}
-	if !p.retryBufferBounded() {
-		return false
-	}
-	p.retryBuffer.mu.Lock()
-	defer p.retryBuffer.mu.Unlock()
-
-	p.retryBuffer.messages += messages
-	p.retryBuffer.bytes += bytes
-	return p.bufferWouldOverflow(p.retryBuffer.messages, p.retryBuffer.bytes)
-}
-
-func (p *asyncProducer) releaseBuffer(messages, bytes int64) {
-	if messages == 0 && bytes == 0 {
-		return
-	}
-	if !p.retryBufferBounded() {
-		return
-	}
-	p.retryBuffer.mu.Lock()
-	defer p.retryBuffer.mu.Unlock()
-
-	p.retryBuffer.messages -= messages
-	p.retryBuffer.bytes -= bytes
-}
-
 // retryBatchesAfterRefresh keeps metadata refresh out of brokerProducer.handleError.
 // Connection errors already hold partition mutes; doing the refresh in the
 // response loop can block all progress for that broker while the cluster is
 // unstable. Refresh once for the failed request so multi-partition batches do
-// not amplify controller-fail metadata traffic. Group retry partitions by their
-// refreshed leader broker, but hand them off from this retry worker so large
-// clusters do not create one temporary goroutine per target broker.
-//
-// This is intentionally not implemented as a loop over retryBatch. retryBatch
-// is the single-partition direct retry primitive; this path owns all muted
-// partitions from one failed ProduceRequest and must retry, fail, reserve
-// buffer, back off, and refresh metadata at that failed-request boundary.
+// not amplify controller-fail metadata traffic, then retry partitions
+// independently so one blocked broker handoff cannot hold up the rest.
 func (p *asyncProducer) retryBatchesAfterRefresh(topics []string, batches []partitionBatchRetry, retryErr error) {
-	if p.shuttingDown() {
-		p.failMutedBatches(batches, ErrShuttingDown)
-		return
-	}
-
-	var retryable []partitionBatchRetry
-	maxRetryAttempt := 0
-	for _, batch := range batches {
-		if !batch.pSet.shouldKeepMuted(p.conf.Producer.Retry.Max) {
-			p.failMutedBatch(batch, retryErr)
-			continue
-		}
-		for _, msg := range batch.pSet.msgs {
-			msg.retries++
-			if msg.retries > maxRetryAttempt {
-				maxRetryAttempt = msg.retries
+	if p.shutdownCh != nil {
+		select {
+		case <-p.shutdownCh:
+			for _, batch := range batches {
+				go p.retryBatch(batch.topic, batch.partition, batch.pSet, retryErr, true)
 			}
+			return
+		default:
 		}
-		retryable = append(retryable, batch)
-	}
-	if len(retryable) == 0 {
-		return
-	}
-
-	// Reserve before metadata refresh so refresh-blocked direct retries still
-	// count against the producer-level retry buffer budget.
-	var bufferMessages, bufferBytes int64
-	for _, batch := range retryable {
-		bufferMessages += int64(len(batch.pSet.msgs))
-		bufferBytes += int64(batch.pSet.bufferBytes)
-	}
-	if !p.reserveBuffer(bufferMessages, bufferBytes) {
-		p.failMutedBatches(retryable, ErrProducerRetryBufferOverflow)
-		return
-	}
-
-	if p.shuttingDown() {
-		p.releaseBuffer(bufferMessages, bufferBytes)
-		p.failMutedBatches(retryable, ErrShuttingDown)
-		return
 	}
 	if len(topics) > 0 {
 		if err := p.client.RefreshMetadata(topics...); err != nil {
 			Logger.Printf("Failed refreshing metadata because of %v\n", err)
 		}
 	}
-	if p.shuttingDown() {
-		p.releaseBuffer(bufferMessages, bufferBytes)
-		p.failMutedBatches(retryable, ErrShuttingDown)
-		return
-	}
-
-	if !p.backoffOrDone(maxRetryAttempt) {
-		p.releaseBuffer(bufferMessages, bufferBytes)
-		p.failMutedBatches(retryable, ErrShuttingDown)
-		return
-	}
-
-	brokerSets := make(map[*Broker]*produceSet)
-	for _, batch := range retryable {
-		if p.shuttingDown() {
-			batchMessages, batchBytes := int64(len(batch.pSet.msgs)), int64(batch.pSet.bufferBytes)
-			p.releaseBuffer(batchMessages, batchBytes)
-			p.failMutedBatch(batch, ErrShuttingDown)
-			continue
-		}
-		leader, leaderErr := p.client.Leader(batch.topic, batch.partition)
-		if leaderErr != nil {
-			Logger.Printf("Failed retrying batch for %v-%d because of %v while looking up for new leader\n", batch.topic, batch.partition, leaderErr)
-			batchMessages, batchBytes := int64(len(batch.pSet.msgs)), int64(batch.pSet.bufferBytes)
-			p.releaseBuffer(batchMessages, batchBytes)
-			p.failMutedBatch(batch, retryErr)
-			continue
-		}
-		set := brokerSets[leader]
-		if set == nil {
-			set = newProduceSet(p)
-			brokerSets[leader] = set
-		}
-		set.addPartitionSet(batch.topic, batch.partition, batch.pSet)
-	}
-
-	for leader, set := range brokerSets {
-		p.handoffRetrySet(leader, set)
-	}
-}
-
-func (p *asyncProducer) handoffRetrySet(leader *Broker, set *produceSet) {
-	bp := p.getBrokerProducer(leader)
-	accepted := p.sendRetryBatch(bp, set)
-	p.unrefBrokerProducer(leader, bp)
-	setMessages, setBytes := int64(set.bufferCount), int64(set.bufferBytes)
-	p.releaseBuffer(setMessages, setBytes)
-	if !accepted {
-		p.muter.unmute(set)
-		set.eachPartition(func(_ string, _ int32, pSet *partitionSet) {
-			for _, msg := range pSet.msgs {
-				p.returnError(msg, ErrShuttingDown)
-			}
-		})
+	for _, batch := range batches {
+		go p.retryBatch(batch.topic, batch.partition, batch.pSet, retryErr, true)
 	}
 }
 
@@ -1874,8 +1633,22 @@ func (bp *brokerProducer) handleError(sent *produceSet, err error) {
 // effectively a "bridge" between the flushers and the dispatcher in order to avoid deadlock
 // based on https://godoc.org/github.com/eapache/channels#InfiniteChannel
 func (p *asyncProducer) retryHandler() {
-	version := p.producerMessageByteSizeVersion()
+	maxBufferLength := p.conf.Producer.Retry.MaxBufferLength
+	if 0 < maxBufferLength && maxBufferLength < minFunctionalRetryBufferLength {
+		maxBufferLength = minFunctionalRetryBufferLength
+	}
+
+	maxBufferBytes := p.conf.Producer.Retry.MaxBufferBytes
+	if 0 < maxBufferBytes && maxBufferBytes < minFunctionalRetryBufferBytes {
+		maxBufferBytes = minFunctionalRetryBufferBytes
+	}
+
+	version := 1
+	if p.conf.Version.IsAtLeast(V0_11_0_0) {
+		version = 2
+	}
 
+	var currentByteSize int64
 	var msg *ProducerMessage
 	buf := queue.New()
 
@@ -1887,7 +1660,7 @@ func (p *asyncProducer) retryHandler() {
 			case msg = <-p.retries:
 			case p.input <- buf.Peek().(*ProducerMessage):
 				msgToRemove := buf.Remove().(*ProducerMessage)
-				p.releaseBuffer(1, int64(msgToRemove.ByteSize(version)))
+				currentByteSize -= int64(msgToRemove.ByteSize(version))
 				continue
 			}
 		}
@@ -1897,9 +1670,9 @@ func (p *asyncProducer) retryHandler() {
 		}
 
 		buf.Add(msg)
-		bufferOverflow := p.addToBuffer(1, int64(msg.ByteSize(version)))
+		currentByteSize += int64(msg.ByteSize(version))
 
-		if !bufferOverflow {
+		if (maxBufferLength <= 0 || buf.Length() < maxBufferLength) && (maxBufferBytes <= 0 || currentByteSize < maxBufferBytes) {
 			continue
 		}
 
@@ -1908,10 +1681,10 @@ func (p *asyncProducer) retryHandler() {
 			select {
 			case p.input <- msgToHandle:
 				buf.Remove()
-				p.releaseBuffer(1, int64(msgToHandle.ByteSize(version)))
+				currentByteSize -= int64(msgToHandle.ByteSize(version))
 			default:
 				buf.Remove()
-				p.releaseBuffer(1, int64(msgToHandle.ByteSize(version)))
+				currentByteSize -= int64(msgToHandle.ByteSize(version))
 				p.returnError(msgToHandle, ErrProducerRetryBufferOverflow)
 			}
 		}
diff --git a/async_producer_test.go b/async_producer_test.go
index a3c220f63..4bb9cd87a 100644
--- a/async_producer_test.go
+++ b/async_producer_test.go
@@ -1635,51 +1635,6 @@ func assertDoneWithin[T any](t *testing.T, ch <-chan T, timeout time.Duration) T
 	}
 }
 
-func waitForProducerBuffer(t *testing.T, parent *asyncProducer, messages, bytes int64) {
-	t.Helper()
-	deadline := time.After(2 * time.Second)
-	ticker := time.NewTicker(time.Millisecond)
-	defer ticker.Stop()
-	for {
-		parent.retryBuffer.mu.Lock()
-		gotMessages := parent.retryBuffer.messages
-		gotBytes := parent.retryBuffer.bytes
-		parent.retryBuffer.mu.Unlock()
-		if (messages < 0 || gotMessages == messages) && (bytes < 0 || gotBytes == bytes) {
-			return
-		}
-		select {
-		case <-deadline:
-			t.Fatalf("timed out waiting for buffer budget messages=%d bytes=%d, got messages=%d bytes=%d",
-				messages, bytes, gotMessages, gotBytes)
-		case <-ticker.C:
-		}
-	}
-}
-
-func newBlockingRetryProducer(config *Config, errorBuffer int) (*asyncProducer, *brokerProducer) {
-	parent := &asyncProducer{
-		conf:       config,
-		muter:      newPartitionMuter(),
-		brokers:    make(map[*Broker]*brokerProducer),
-		brokerRefs: make(map[*brokerProducer]int),
-		errors:     make(chan *ProducerError, errorBuffer),
-		shutdownCh: make(chan struct{}),
-		txnmgr:     &transactionManager{},
-	}
-	leader := &Broker{id: 1}
-	parent.client = &stubLeaderClient{leader: leader, cfg: config}
-	retryBP := &brokerProducer{
-		parent: parent,
-		broker: leader,
-		output: make(chan *produceSet),
-		input:  make(chan *ProducerMessage),
-		done:   make(chan struct{}),
-	}
-	parent.brokers[leader] = retryBP
-	return parent, retryBP
-}
-
 // TestBrokerProducerWaitForSpaceRespectsExternalUnmute ensures waitForSpace does not
 // deadlock when partitions are muted by another producer and are unmuted elsewhere.
 func TestBrokerProducerWaitForSpaceRespectsExternalUnmute(t *testing.T) {
@@ -1952,7 +1907,6 @@ func TestBrokerProducerRollOverClearsTimer(t *testing.T) {
 func TestRetryBatchRespectsPartitionMuter(t *testing.T) {
 	config := NewTestConfig()
 	config.Producer.Idempotent = true
-	config.Producer.Retry.MaxBufferLength = minFunctionalRetryBufferLength
 	txnMgr := &transactionManager{
 		producerID:      0,
 		producerEpoch:   0,
@@ -1995,7 +1949,6 @@ func TestRetryBatchRespectsPartitionMuter(t *testing.T) {
 	default:
 		t.Fatal("expected retry batch to be dispatched")
 	}
-	waitForProducerBuffer(t, parent, 0, 0)
 
 	contender := newProduceSet(parent)
 	safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("next")})
@@ -2065,61 +2018,11 @@ func TestHandleErrorNonIdempotentRetryKeepsPartitionMuted(t *testing.T) {
 	}
 }
 
-func TestHandleErrorNonIdempotentRetryFailsWholeBatch(t *testing.T) {
-	config := NewTestConfig()
-	config.Producer.Idempotent = false
-	config.Producer.Retry.Max = 1
-	config.Producer.Return.Errors = true
-
-	parent := &asyncProducer{
-		conf:    config,
-		muter:   newPartitionMuter(),
-		errors:  make(chan *ProducerError, 2),
-		retries: make(chan *ProducerMessage, 2),
-		txnmgr:  &transactionManager{},
-	}
-
-	bp := &brokerProducer{
-		parent:            parent,
-		broker:            &Broker{id: 1},
-		accumulatingBatch: newProduceSet(parent),
-		currentRetries:    make(map[string]map[int32]error),
-	}
-
-	exhausted := &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("exhausted"), retries: 1}
-	retryable := &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("retryable")}
-	sent := newProduceSet(parent)
-	safeAddMessage(t, sent, exhausted)
-	safeAddMessage(t, sent, retryable)
-	if !parent.muter.tryMute(sent) {
-		t.Fatal("expected sent batch to mute partitions")
-	}
-	parent.inFlight.Add(2)
-
-	bp.handleError(sent, ErrOutOfBrokers)
-
-	firstErr := assertDoneWithin(t, parent.errors, 2*time.Second)
-	secondErr := assertDoneWithin(t, parent.errors, 2*time.Second)
-	require.Equal(t, exhausted, firstErr.Msg)
-	require.Equal(t, ErrOutOfBrokers, firstErr.Err)
-	require.Equal(t, retryable, secondErr.Msg)
-	require.Equal(t, ErrOutOfBrokers, secondErr.Err)
-	assertNotDone(t, parent.retries, 50*time.Millisecond)
-
-	contender := newProduceSet(parent)
-	safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("next")})
-	if !parent.muter.tryMute(contender) {
-		t.Fatal("expected partition mute to be released after whole-batch failure")
-	}
-	parent.muter.unmute(contender)
-}
-
 func TestHandleErrorNonIdempotentRetryDoesNotWaitForMetadataRefresh(t *testing.T) {
 	config := NewTestConfig()
 	config.Producer.Idempotent = false
 	config.Producer.Retry.Max = 1
 	config.Producer.Retry.Backoff = 0
-	config.Producer.Retry.MaxBufferLength = minFunctionalRetryBufferLength
 
 	parent := &asyncProducer{
 		conf:       config,
@@ -2168,7 +2071,6 @@ func TestHandleErrorNonIdempotentRetryDoesNotWaitForMetadataRefresh(t *testing.T
 	}()
 	assertDoneWithin(t, done, 2*time.Second)
 	assertDoneWithin(t, client.started, 2*time.Second)
-	waitForProducerBuffer(t, parent, 1, -1)
 
 	contender := newProduceSet(parent)
 	safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("next")})
@@ -2181,7 +2083,6 @@ func TestHandleErrorNonIdempotentRetryDoesNotWaitForMetadataRefresh(t *testing.T
 	retrySet := assertDoneWithin(t, output, 2*time.Second)
 	defer parent.muter.unmute(retrySet)
 	require.Equal(t, retryPartitionSet, retrySet.msgs["topic"][0])
-	waitForProducerBuffer(t, parent, 0, 0)
 }
 
 func TestHandleErrorNonIdempotentRetryRefreshesMetadataOncePerFailedRequest(t *testing.T) {
@@ -2235,112 +2136,23 @@ func TestHandleErrorNonIdempotentRetryRefreshesMetadataOncePerFailedRequest(t *t
 	bp.handleError(sent, ErrOutOfBrokers)
 
 	require.Equal(t, []string{"topic"}, assertDoneWithin(t, client.calls, 2*time.Second))
-	retrySet := assertDoneWithin(t, output, 2*time.Second)
-	defer parent.muter.unmute(retrySet)
+	retrySet0 := assertDoneWithin(t, output, 2*time.Second)
+	defer parent.muter.unmute(retrySet0)
+	retrySet1 := assertDoneWithin(t, output, 2*time.Second)
+	defer parent.muter.unmute(retrySet1)
 	assertNotDone(t, client.calls, 50*time.Millisecond)
-	assertNotDone(t, output, 50*time.Millisecond)
 
 	retriedPartitions := make(map[int32]*partitionSet)
-	retrySet.eachPartition(func(_ string, partition int32, pSet *partitionSet) {
+	retrySet0.eachPartition(func(_ string, partition int32, pSet *partitionSet) {
+		retriedPartitions[partition] = pSet
+	})
+	retrySet1.eachPartition(func(_ string, partition int32, pSet *partitionSet) {
 		retriedPartitions[partition] = pSet
 	})
 	require.Same(t, retryPartitionSet0, retriedPartitions[int32(0)])
 	require.Same(t, retryPartitionSet1, retriedPartitions[int32(1)])
 }
 
-func TestHandleErrorNonIdempotentRetryGroupsBatchesByLeader(t *testing.T) {
-	config := NewTestConfig()
-	config.Producer.Idempotent = false
-	config.Producer.Retry.Max = 1
-	config.Producer.Retry.Backoff = 0
-
-	parent := &asyncProducer{
-		conf:       config,
-		muter:      newPartitionMuter(),
-		brokers:    make(map[*Broker]*brokerProducer),
-		brokerRefs: make(map[*brokerProducer]int),
-		txnmgr:     &transactionManager{},
-	}
-	failedBroker := &Broker{id: 1}
-	leaderA := &Broker{id: 2}
-	leaderB := &Broker{id: 3}
-	client := &routingLeaderClient{
-		stubLeaderClient: stubLeaderClient{cfg: config},
-		leaders: map[string]map[int32]*Broker{
-			"topic-a": {
-				0: leaderA,
-				1: leaderA,
-			},
-			"topic-b": {
-				0: leaderB,
-				1: leaderB,
-			},
-		},
-		calls: make(chan []string, 1),
-	}
-	parent.client = client
-
-	outputA := make(chan *produceSet, 1)
-	outputB := make(chan *produceSet, 1)
-	retryBPA := &brokerProducer{
-		parent: parent,
-		broker: leaderA,
-		output: outputA,
-		input:  make(chan *ProducerMessage),
-	}
-	retryBPB := &brokerProducer{
-		parent: parent,
-		broker: leaderB,
-		output: outputB,
-		input:  make(chan *ProducerMessage),
-	}
-	parent.brokers[leaderA] = retryBPA
-	parent.brokers[leaderB] = retryBPB
-
-	bp := &brokerProducer{
-		parent:            parent,
-		broker:            failedBroker,
-		input:             make(chan *ProducerMessage),
-		accumulatingBatch: newProduceSet(parent),
-		currentRetries:    make(map[string]map[int32]error),
-	}
-
-	sent := newProduceSet(parent)
-	safeAddMessage(t, sent, &ProducerMessage{Topic: "topic-a", Partition: 0, Value: StringEncoder("a-0")})
-	safeAddMessage(t, sent, &ProducerMessage{Topic: "topic-a", Partition: 1, Value: StringEncoder("a-1")})
-	safeAddMessage(t, sent, &ProducerMessage{Topic: "topic-b", Partition: 0, Value: StringEncoder("b-0")})
-	safeAddMessage(t, sent, &ProducerMessage{Topic: "topic-b", Partition: 1, Value: StringEncoder("b-1")})
-	if !parent.muter.tryMute(sent) {
-		t.Fatal("expected sent batch to mute partitions")
-	}
-
-	bp.handleError(sent, ErrOutOfBrokers)
-
-	require.ElementsMatch(t, []string{"topic-a", "topic-b"}, assertDoneWithin(t, client.calls, 2*time.Second))
-	retrySetA := assertDoneWithin(t, outputA, 2*time.Second)
-	defer parent.muter.unmute(retrySetA)
-	retrySetB := assertDoneWithin(t, outputB, 2*time.Second)
-	defer parent.muter.unmute(retrySetB)
-
-	require.Len(t, retrySetA.msgs, 1)
-	require.Len(t, retrySetA.msgs["topic-a"], 2)
-	require.Len(t, retrySetB.msgs, 1)
-	require.Len(t, retrySetB.msgs["topic-b"], 2)
-	for _, partitions := range retrySetA.msgs {
-		for _, pSet := range partitions {
-			require.Equal(t, 1, pSet.msgs[0].retries)
-		}
-	}
-	for _, partitions := range retrySetB.msgs {
-		for _, pSet := range partitions {
-			require.Equal(t, 1, pSet.msgs[0].retries)
-		}
-	}
-
-	assertNotDone(t, outputA, 50*time.Millisecond)
-	assertNotDone(t, outputB, 50*time.Millisecond)
-}
-
 func TestHandleSuccessNonIdempotentRetryUsesPartitionProducer(t *testing.T) {
 	config := NewTestConfig()
 	config.Producer.Idempotent = false
@@ -2391,7 +2203,6 @@ func TestRetryBatchReleasesMuteWhenHandoffAbortedByShutdown(t *testing.T) {
 	config.Producer.Retry.Max = 1
 	config.Producer.Retry.Backoff = 0
 	config.Producer.Return.Errors = true
-	config.Producer.Retry.MaxBufferLength = minFunctionalRetryBufferLength
 
 	parent := &asyncProducer{
 		conf:       config,
@@ -2420,187 +2231,16 @@ func TestRetryBatchReleasesMuteWhenHandoffAbortedByShutdown(t *testing.T) {
 		t.Fatal("expected sent batch to mute partitions")
 	}
 	parent.inFlight.Add(1)
-
-	done := make(chan struct{})
-	go func() {
-		parent.retryBatch("topic", 0, retryPartitionSet, ErrOutOfBrokers, true)
-		close(done)
-	}()
-	waitForProducerBuffer(t, parent, 1, -1)
-
 	if parent.shutdownChClosed.CompareAndSwap(false, true) {
 		close(parent.shutdownCh)
 	}
 
-	producerErr := assertDoneWithin(t, parent.errors, 2*time.Second)
-	require.Equal(t, ErrShuttingDown, producerErr.Err)
-	assertDoneWithin(t, done, 2*time.Second)
-	waitForProducerBuffer(t, parent, 0, 0)
-
-	contender := newProduceSet(parent)
-	safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("next")})
-	if !parent.muter.tryMute(contender) {
-		t.Fatal("expected partition mute to be released after aborted retry handoff")
-	}
-	parent.muter.unmute(contender)
-}
-
-func TestRetryBatchUsesSharedBufferLengthBudget(t *testing.T) {
-	config := NewTestConfig()
-	config.Producer.Idempotent = true
-	config.Producer.Retry.Max = 1
-	config.Producer.Retry.Backoff = 0
-	config.Producer.Return.Errors = true
-	config.Producer.Retry.MaxBufferLength = minFunctionalRetryBufferLength + 1
-
-	parent, retryBP := newBlockingRetryProducer(config, minFunctionalRetryBufferLength+1)
-
-	first := newProduceSet(parent)
-	for i := 0; i < minFunctionalRetryBufferLength; i++ {
-		safeAddMessage(t, first, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("retry")})
-	}
-	firstPartitionSet := first.msgs["topic"][0]
-	if !parent.muter.tryMute(first) {
-		t.Fatal("expected first retry batch to mute partitions")
-	}
-	parent.inFlight.Add(minFunctionalRetryBufferLength)
-
-	firstDone := make(chan struct{})
-	go func() {
-		parent.retryBatch("topic", 0, firstPartitionSet, ErrNotEnoughReplicas, true)
-		close(firstDone)
-	}()
-	waitForProducerBuffer(t, parent, minFunctionalRetryBufferLength, -1)
-
-	second := newProduceSet(parent)
-	safeAddMessage(t, second, &ProducerMessage{Topic: "topic", Partition: 1, Value: StringEncoder("overflow")})
-	secondPartitionSet := second.msgs["topic"][1]
-	if !parent.muter.tryMute(second) {
-		t.Fatal("expected second retry batch to mute partitions")
-	}
-	parent.inFlight.Add(1)
-	parent.retryBatch("topic", 1, secondPartitionSet, ErrNotEnoughReplicas, true)
-
-	producerErr := assertDoneWithin(t, parent.errors, 2*time.Second)
-	require.Equal(t, ErrProducerRetryBufferOverflow, producerErr.Err)
-	require.Equal(t, int32(1), producerErr.Msg.Partition)
-	waitForProducerBuffer(t, parent, minFunctionalRetryBufferLength, -1)
-
-	contender := newProduceSet(parent)
-	safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 1, Value: StringEncoder("next")})
-	if !parent.muter.tryMute(contender) {
-		t.Fatal("expected overflowed retry batch to release partition mute")
-	}
-	parent.muter.unmute(contender)
-
-	close(retryBP.done)
-	assertDoneWithin(t, firstDone, 2*time.Second)
-	waitForProducerBuffer(t, parent, 0, 0)
-}
-
-func TestRetryBatchUsesSharedBufferBytesBudget(t *testing.T) {
-	config := NewTestConfig()
-	config.Producer.Idempotent = true
-	config.Producer.Retry.Max = 1
-	config.Producer.Retry.Backoff = 0
-	config.Producer.Return.Errors = true
-
-	parent, retryBP := newBlockingRetryProducer(config, 2)
-
-	firstMsg := &ProducerMessage{Topic: "topic", Partition: 0, Value: ByteEncoder(make([]byte, minFunctionalRetryBufferBytes))}
-	secondMsg := &ProducerMessage{Topic: "topic", Partition: 1, Value: StringEncoder("overflow")}
-	version := parent.producerMessageByteSizeVersion()
-	firstBytes := int64(firstMsg.ByteSize(version))
-	secondBytes := int64(secondMsg.ByteSize(version))
-	config.Producer.Retry.MaxBufferBytes = firstBytes + secondBytes
-
-	first := newProduceSet(parent)
-	safeAddMessage(t, first, firstMsg)
-	firstPartitionSet := first.msgs["topic"][0]
-	if !parent.muter.tryMute(first) {
-		t.Fatal("expected first retry batch to mute partitions")
-	}
-	parent.inFlight.Add(1)
-
-	firstDone := make(chan struct{})
-	go func() {
-		parent.retryBatch("topic", 0, firstPartitionSet, ErrNotEnoughReplicas, true)
-		close(firstDone)
-	}()
-	waitForProducerBuffer(t, parent, -1, firstBytes)
-
-	second := newProduceSet(parent)
-	safeAddMessage(t, second, secondMsg)
-	secondPartitionSet := second.msgs["topic"][1]
-	if !parent.muter.tryMute(second) {
-		t.Fatal("expected second retry batch to mute partitions")
-	}
-	parent.inFlight.Add(1)
-	parent.retryBatch("topic", 1, secondPartitionSet, ErrNotEnoughReplicas, true)
-
-	producerErr := assertDoneWithin(t, parent.errors, 2*time.Second)
-	require.Equal(t, ErrProducerRetryBufferOverflow, producerErr.Err)
-	require.Equal(t, int32(1), producerErr.Msg.Partition)
-	waitForProducerBuffer(t, parent, -1, firstBytes)
-
-	contender := newProduceSet(parent)
-	safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 1, Value: StringEncoder("next")})
-	if !parent.muter.tryMute(contender) {
-		t.Fatal("expected overflowed retry batch to release partition mute")
-	}
-	parent.muter.unmute(contender)
-
-	close(retryBP.done)
-	assertDoneWithin(t, firstDone, 2*time.Second)
-	waitForProducerBuffer(t, parent, 0, 0)
-}
-
-func TestRetryBatchesAfterRefreshReleasesMuteWhenBrokerProducerDone(t *testing.T) {
-	config := NewTestConfig()
-	config.Producer.Idempotent = false
-	config.Producer.Retry.Max = 1
-	config.Producer.Retry.Backoff = 0
-	config.Producer.Return.Errors = true
-
-	parent := &asyncProducer{
-		conf:       config,
-		muter:      newPartitionMuter(),
-		brokers:    make(map[*Broker]*brokerProducer),
-		brokerRefs: make(map[*brokerProducer]int),
-		errors:     make(chan *ProducerError, 1),
-		shutdownCh: make(chan struct{}),
-		txnmgr:     &transactionManager{},
-	}
-	leader := &Broker{id: 1}
-	parent.client = &stubLeaderClient{leader: leader, cfg: config}
-	retryBP := &brokerProducer{
-		parent: parent,
-		broker: leader,
-		output: make(chan *produceSet),
-		input:  make(chan *ProducerMessage),
-		done:   make(chan struct{}),
-	}
-	parent.brokers[leader] = retryBP
-
-	sent := newProduceSet(parent)
-	safeAddMessage(t, sent, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("retry")})
-	retryPartitionSet := sent.msgs["topic"][0]
-	if !parent.muter.tryMute(sent) {
-		t.Fatal("expected sent batch to mute partitions")
-	}
-	parent.inFlight.Add(1)
-
 	done := make(chan struct{})
 	go func() {
-		parent.retryBatchesAfterRefresh([]string{"topic"}, []partitionBatchRetry{{
-			topic:     "topic",
-			partition: 0,
-			pSet:      retryPartitionSet,
-		}}, ErrOutOfBrokers)
+		parent.retryBatch("topic", 0, retryPartitionSet, ErrOutOfBrokers, true)
 		close(done)
 	}()
 
-	close(retryBP.done)
 	producerErr := assertDoneWithin(t, parent.errors, 2*time.Second)
 	require.Equal(t, ErrShuttingDown, producerErr.Err)
 	assertDoneWithin(t, done, 2*time.Second)
@@ -2608,131 +2248,9 @@ func TestRetryBatchesAfterRefreshReleasesMuteWhenBrokerProducerDone(t *testing.T
 	contender := newProduceSet(parent)
 	safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("next")})
 	if !parent.muter.tryMute(contender) {
-		t.Fatal("expected partition mute to be released after brokerProducer done")
-	}
-	parent.muter.unmute(contender)
-}
-
-func TestRetryBatchesAfterRefreshUsesSharedBufferLengthBudget(t *testing.T) {
-	config := NewTestConfig()
-	config.Producer.Idempotent = false
-	config.Producer.Retry.Max = 1
-	config.Producer.Retry.Backoff = 0
-	config.Producer.Return.Errors = true
-	config.Producer.Retry.MaxBufferLength = minFunctionalRetryBufferLength + 1
-
-	parent, retryBP := newBlockingRetryProducer(config, minFunctionalRetryBufferLength+1)
-
-	first := newProduceSet(parent)
-	for i := 0; i < minFunctionalRetryBufferLength; i++ {
-		safeAddMessage(t, first, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("retry")})
-	}
-	if !parent.muter.tryMute(first) {
-		t.Fatal("expected first retry batch to mute partitions")
-	}
-	parent.inFlight.Add(minFunctionalRetryBufferLength)
-
-	firstDone := make(chan struct{})
-	go func() {
-		parent.retryBatchesAfterRefresh([]string{"topic"}, []partitionBatchRetry{{
-			topic:     "topic",
-			partition: 0,
-			pSet:      first.msgs["topic"][0],
-		}}, ErrOutOfBrokers)
-		close(firstDone)
-	}()
-	waitForProducerBuffer(t, parent, minFunctionalRetryBufferLength, -1)
-
-	second := newProduceSet(parent)
-	safeAddMessage(t, second, &ProducerMessage{Topic: "topic", Partition: 1, Value: StringEncoder("overflow")})
-	if !parent.muter.tryMute(second) {
-		t.Fatal("expected second retry batch to mute partitions")
-	}
-	parent.inFlight.Add(1)
-	parent.retryBatchesAfterRefresh([]string{"topic"}, []partitionBatchRetry{{
-		topic:     "topic",
-		partition: 1,
-		pSet:      second.msgs["topic"][1],
-	}}, ErrOutOfBrokers)
-
-	producerErr := assertDoneWithin(t, parent.errors, 2*time.Second)
-	require.Equal(t, ErrProducerRetryBufferOverflow, producerErr.Err)
-	require.Equal(t, int32(1), producerErr.Msg.Partition)
-	waitForProducerBuffer(t, parent, minFunctionalRetryBufferLength, -1)
-
-	contender := newProduceSet(parent)
-	safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 1, Value: StringEncoder("next")})
-	if !parent.muter.tryMute(contender) {
-		t.Fatal("expected overflowed retry batch to release partition mute")
-	}
-	parent.muter.unmute(contender)
-
-	close(retryBP.done)
-	assertDoneWithin(t, firstDone, 2*time.Second)
-	waitForProducerBuffer(t, parent, 0, 0)
-}
-
-func TestRetryBatchesAfterRefreshUsesSharedBufferBytesBudget(t *testing.T) {
-	config := NewTestConfig()
-	config.Producer.Idempotent = false
-	config.Producer.Retry.Max = 1
-	config.Producer.Retry.Backoff = 0
-	config.Producer.Return.Errors = true
-
-	parent, retryBP := newBlockingRetryProducer(config, 2)
-
-	firstMsg := &ProducerMessage{Topic: "topic", Partition: 0, Value: ByteEncoder(make([]byte, minFunctionalRetryBufferBytes))}
-	secondMsg := &ProducerMessage{Topic: "topic", Partition: 1, Value: StringEncoder("overflow")}
-	version := parent.producerMessageByteSizeVersion()
-	firstBytes := int64(firstMsg.ByteSize(version))
-	secondBytes := int64(secondMsg.ByteSize(version))
-	config.Producer.Retry.MaxBufferBytes = firstBytes + secondBytes
-
-	first := newProduceSet(parent)
-	safeAddMessage(t, first, firstMsg)
-	if !parent.muter.tryMute(first) {
-		t.Fatal("expected first retry batch to mute partitions")
-	}
-	parent.inFlight.Add(1)
-
-	firstDone := make(chan struct{})
-	go func() {
-		parent.retryBatchesAfterRefresh([]string{"topic"}, []partitionBatchRetry{{
-			topic:     "topic",
-			partition: 0,
-			pSet:      first.msgs["topic"][0],
-		}}, ErrOutOfBrokers)
-		close(firstDone)
-	}()
-	waitForProducerBuffer(t, parent, -1, firstBytes)
-
-	second := newProduceSet(parent)
-	safeAddMessage(t, second, secondMsg)
-	if !parent.muter.tryMute(second) {
-		t.Fatal("expected second retry batch to mute partitions")
-	}
-	parent.inFlight.Add(1)
-	parent.retryBatchesAfterRefresh([]string{"topic"}, []partitionBatchRetry{{
-		topic:     "topic",
-		partition: 1,
-		pSet:      second.msgs["topic"][1],
-	}}, ErrOutOfBrokers)
-
-	producerErr := assertDoneWithin(t, parent.errors, 2*time.Second)
-	require.Equal(t, ErrProducerRetryBufferOverflow, producerErr.Err)
-	require.Equal(t, int32(1), producerErr.Msg.Partition)
-	waitForProducerBuffer(t, parent, -1, firstBytes)
-
-	contender := newProduceSet(parent)
-	safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 1, Value: StringEncoder("next")})
-	if !parent.muter.tryMute(contender) {
-		t.Fatal("expected overflowed retry batch to release partition mute")
+		t.Fatal("expected partition mute to be released after aborted retry handoff")
 	}
 	parent.muter.unmute(contender)
-
-	close(retryBP.done)
-	assertDoneWithin(t, firstDone, 2*time.Second)
-	waitForProducerBuffer(t, parent, 0, 0)
 }
 
 func TestHandleErrorNonIdempotentRetryReleasesMuteOnLeaderError(t *testing.T) {
@@ -2856,31 +2374,6 @@ func (c *countingRefreshClient) RefreshMetadata(topics ...string) error {
 	return nil
 }
 
-type routingLeaderClient struct {
-	stubLeaderClient
-	leaders map[string]map[int32]*Broker
-	calls   chan []string
-}
-
-func (c *routingLeaderClient) Leader(topic string, partition int32) (*Broker, error) {
-	if partitions := c.leaders[topic]; partitions != nil {
-		if leader := partitions[partition]; leader != nil {
-			return leader, nil
-		}
-	}
-	return nil, ErrLeaderNotAvailable
-}
-
-func (c *routingLeaderClient) LeaderAndEpoch(topic string, partition int32) (*Broker, int32, error) {
-	leader, err := c.Leader(topic, partition)
-	return leader, 0, err
-}
-
-func (c *routingLeaderClient) RefreshMetadata(topics ...string) error {
-	c.calls <- append([]string(nil), topics...)
-	return nil
-}
-
 func testProducerInterceptor(
 	t *testing.T,
 	interceptors []ProducerInterceptor,
diff --git a/config.go b/config.go
index 9e837bbc8..8b87a6cd2 100644
--- a/config.go
+++ b/config.go
@@ -276,20 +276,16 @@ type Config struct {
 			// more sophisticated backoff strategies. This takes precedence over
 			// `Backoff` if set.
 			BackoffFunc func(retries, maxRetries int) time.Duration
-			// The maximum number of messages in producer retry buffering, including
-			// the bridging buffer between `input` and `retries` channels in
-			// AsyncProducer#retryHandler and direct retries waiting for metadata
-			// refresh, retry backoff, leader lookup, partition mute, or broker handoff.
-			// The limit is to prevent retry buffering from overflowing or causing OOM.
+			// The maximum length of the bridging buffer between `input` and `retries` channels
+			// in AsyncProducer#retryHandler.
+			// The limit is to prevent this buffer from overflowing or causing OOM.
 			// Defaults to 0 for unlimited.
 			// Any value between 0 and 4096 is pushed to 4096.
 			// A zero or negative value indicates unlimited.
 			MaxBufferLength int
-			// The maximum total byte size of messages in producer retry buffering,
-			// including the bridging buffer between `input` and `retries` channels
-			// in AsyncProducer#retryHandler and direct retries waiting for metadata
-			// refresh, retry backoff, leader lookup, partition mute, or broker handoff.
-			// This limit prevents retry buffering from consuming excessive memory.
+			// The maximum total byte size of messages in the bridging buffer between `input`
+			// and `retries` channels in AsyncProducer#retryHandler.
+			// This limit prevents the buffer from consuming excessive memory.
 			// Defaults to 0 for unlimited.
 			// Any value between 0 and 32 MB is pushed to 32 MB.
 			// A zero or negative value indicates unlimited.
diff --git a/functional_producer_test.go b/functional_producer_test.go
index bbb0e27dc..df8a5d0ca 100644
--- a/functional_producer_test.go
+++ b/functional_producer_test.go
@@ -430,8 +430,8 @@ func TestFuncTxnProduceAndCommitOffset(t *testing.T) {
 
 	handler.started.Add(4)
 	go func() {
-		consumeErr := cg.Consume(ctx, []string{"test.4"}, handler)
-		require.NoError(t, consumeErr)
+		err = cg.Consume(ctx, []string{"test.4"}, handler)
+		require.NoError(t, err)
 	}()
 
 	handler.started.Wait()
diff --git a/produce_set.go b/produce_set.go
index 0fbb7097a..121a272ac 100644
--- a/produce_set.go
+++ b/produce_set.go
@@ -50,15 +50,6 @@ func newProduceSetWithMeta(parent *asyncProducer, producerID int64, producerEpoc
 	}
 }
 
-func (ps *produceSet) addPartitionSet(topic string, partition int32, set *partitionSet) {
-	if ps.msgs[topic] == nil {
-		ps.msgs[topic] = make(map[int32]*partitionSet)
-	}
-	ps.msgs[topic][partition] = set
-	ps.bufferBytes += set.bufferBytes
-	ps.bufferCount += len(set.msgs)
-}
-
 func (ps *produceSet) add(msg *ProducerMessage) error {
 	var err error
 	var key, val []byte

From d171171ca92e9ca4a0ebb343e38fba4502446be8 Mon Sep 17 00:00:00 2001
From: 3AceShowHand <jinl1037@hotmail.com>
Date: Thu, 25 Jun 2026 22:03:05 +0800
Subject: [PATCH 5/6] Revert "Mute by retry batch (#22)"

This reverts commit 505e3f94e70bdac3532623c53b4baf29a21b61f8.
---
 async_producer.go      | 222 +++++++++----------------
 async_producer_test.go | 365 +++--------------------------------------
 produce_set.go         |  38 ++++-
 produce_set_test.go    |  18 --
 4 files changed, 128 insertions(+), 515 deletions(-)

diff --git a/async_producer.go b/async_producer.go
index 29bc3a114..99821e5b4 100644
--- a/async_producer.go
+++ b/async_producer.go
@@ -6,7 +6,6 @@ import (
 	"fmt"
 	"math"
 	"sync"
-	"sync/atomic"
 	"time"
 
 	"github.com/eapache/go-resiliency/breaker"
@@ -93,10 +92,6 @@ type asyncProducer struct {
 	errors                    chan *ProducerError
 	input, successes, retries chan *ProducerMessage
 	inFlight                  sync.WaitGroup
-	// shutdownCh is closed before waiting on in-flight messages so retryBatch
-	// goroutines can stop waiting on broker handoff and release partition mutes.
-	shutdownCh       chan struct{}
-	shutdownChClosed atomic.Bool
 
 	brokers    map[*Broker]*brokerProducer
 	brokerRefs map[*brokerProducer]int
@@ -189,6 +184,13 @@ func (m *partitionMuter) tryMutePartition(topic string, partition int32) bool {
 	return true
 }
 
+func (m *partitionMuter) isMutedPartition(topic string, partition int32) bool {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	return m.isMuted(topic, partition)
+}
+
 // waitUntilMuted blocks until all partitions in the set can be muted, then mutes them.
 // Returns false if the muter was closed before all partitions could be muted.
 func (m *partitionMuter) waitUntilMuted(set *produceSet) bool {
@@ -308,7 +310,6 @@ func newAsyncProducer(client Client) (AsyncProducer, error) {
 		input:           make(chan *ProducerMessage),
 		successes:       make(chan *ProducerMessage),
 		retries:         make(chan *ProducerMessage),
-		shutdownCh:      make(chan struct{}),
 		brokers:         make(map[*Broker]*brokerProducer),
 		brokerRefs:      make(map[*brokerProducer]int),
 		txnmgr:          txnmgr,
@@ -382,6 +383,11 @@ type ProducerMessage struct {
 	sequenceNumber int32
 	producerEpoch  int16
 	hasSequence    bool
+	// reusePartitionMute is a one-shot marker for non-idempotent retries. The
+	// first retried message from a failed, still-muted batch may reuse that
+	// existing in-flight reservation so later same-partition messages cannot
+	// slip in between the failed send and its retry.
+	reusePartitionMute bool
 }
 
 const producerMessageOverhead = 26 // the metadata overhead of CRC, flags, etc.
@@ -408,6 +414,7 @@ func (m *ProducerMessage) ByteSize(version int) int {
 func (m *ProducerMessage) clear() {
 	m.flags = 0
 	m.retries = 0
+	m.reusePartitionMute = false
 	m.sequenceNumber = 0
 	m.producerEpoch = 0
 	m.hasSequence = false
@@ -787,16 +794,12 @@ func (p *asyncProducer) newPartitionProducer(topic string, partition int32) chan
 }
 
 func (pp *partitionProducer) backoff(retries int) {
-	pp.parent.backoff(retries)
-}
-
-func (p *asyncProducer) backoff(retries int) {
 	var backoff time.Duration
-	if p.conf.Producer.Retry.BackoffFunc != nil {
-		maxRetries := p.conf.Producer.Retry.Max
-		backoff = p.conf.Producer.Retry.BackoffFunc(retries, maxRetries)
+	if pp.parent.conf.Producer.Retry.BackoffFunc != nil {
+		maxRetries := pp.parent.conf.Producer.Retry.Max
+		backoff = pp.parent.conf.Producer.Retry.BackoffFunc(retries, maxRetries)
 	} else {
-		backoff = p.conf.Producer.Retry.Backoff
+		backoff = pp.parent.conf.Producer.Retry.Backoff
 	}
 	if backoff > 0 {
 		time.Sleep(backoff)
@@ -966,7 +969,6 @@ func (p *asyncProducer) newBrokerProducer(broker *Broker) *brokerProducer {
 		input:             input,
 		output:            bridge,
 		responses:         responses,
-		done:              make(chan struct{}),
 		accumulatingBatch: newProduceSet(p),
 		currentRetries:    make(map[string]map[int32]error),
 	}
@@ -1080,11 +1082,6 @@ type brokerProducer struct {
 	output    chan<- *produceSet
 	responses <-chan *brokerProducerResponse
 	abandoned chan struct{}
-	// done is closed before output is closed. Direct retry handoff waits on
-	// output while the batch still owns its partition mute; if the brokerProducer
-	// is shutting down, that send may never be received and the mute would block
-	// later same-partition messages and producer shutdown.
-	done chan struct{}
 
 	accumulatingBatch *produceSet
 	flushingBatch     *produceSet // batch that has been muted and is ready to send
@@ -1217,13 +1214,28 @@ func (bp *brokerProducer) tryBuildFlushingBatch() bool {
 		return true
 	}
 
-	return false
+	reused := bp.accumulatingBatch.takePartitions(func(topic string, partition int32) bool {
+		partitions := bp.accumulatingBatch.msgs[topic]
+		if partitions == nil {
+			return false
+		}
+		set := partitions[partition]
+		return set.canReusePartitionMute() && bp.parent.muter.isMutedPartition(topic, partition)
+	})
+	if reused == nil {
+		return false
+	}
+	reused.eachPartition(func(_ string, _ int32, pSet *partitionSet) {
+		pSet.clearPartitionMuteReuse()
+	})
+	bp.flushingBatch = reused
+	if bp.accumulatingBatch.empty() {
+		bp.rollOver()
+	}
+	return true
 }
 
 func (bp *brokerProducer) shutdown() {
-	if bp.done != nil {
-		close(bp.done)
-	}
 	// flush any ready buffer
 	for bp.flushingBatch != nil {
 		select {
@@ -1381,7 +1393,7 @@ func (bp *brokerProducer) handleSuccess(sent *produceSet, response *ProduceRespo
 				bp.parent.returnErrors(pSet.msgs, block.Err)
 			} else {
 				retryTopics = append(retryTopics, topic)
-				if bp.parent.conf.Producer.Idempotent {
+				if bp.parent.conf.Producer.Idempotent || pSet.canRetry(bp.parent.conf.Producer.Retry.Max) {
 					if keepMuted[topic] == nil {
 						keepMuted[topic] = make(map[int32]struct{})
 					}
@@ -1424,9 +1436,7 @@ func (bp *brokerProducer) handleSuccess(sent *produceSet, response *ProduceRespo
 				if bp.parent.conf.Producer.Idempotent {
 					go bp.parent.retryBatch(topic, partition, pSet, block.Err, true)
 				} else {
-					// Non-idempotent response retries must go back through partitionProducer.
-					// That path advances the retry high watermark and refreshes the partition
-					// leader before sending the retry; retryBatch would bypass that state.
+					pSet.markPartitionMuteReusable()
 					bp.parent.retryMessages(pSet.msgs, block.Err)
 				}
 				// dropping the following messages has the side effect of incrementing their retry count
@@ -1452,119 +1462,40 @@ func (p *asyncProducer) retryBatch(topic string, partition int32, pSet *partitio
 	produceSet.msgs[topic][partition] = pSet
 	produceSet.bufferBytes += pSet.bufferBytes
 	produceSet.bufferCount += len(pSet.msgs)
-	muted := alreadyMuted
-	failBatch := func(err error) {
-		// Release the partition before reporting errors so a blocked Errors
-		// consumer cannot keep later same-partition messages muted.
-		if muted {
-			p.muter.unmute(produceSet)
-			muted = false
-		}
-		for _, msg := range pSet.msgs {
-			p.returnError(msg, err)
-		}
-	}
 	for _, msg := range pSet.msgs {
 		if msg.retries >= p.conf.Producer.Retry.Max {
-			failBatch(retryErr)
+			p.returnErrors(pSet.msgs, retryErr)
+			if alreadyMuted {
+				p.muter.unmute(produceSet)
+			}
 			return
 		}
 		msg.retries++
 	}
 
-	// honor Producer.Retry.Backoff between retry attempts (#2469); the
-	// non-idempotent path gets this from partitionProducer.dispatch, but
-	// retryBatch dispatches the produceSet directly to the broker
-	if len(pSet.msgs) > 0 {
-		p.backoff(pSet.msgs[0].retries)
-	}
-
 	// it's expected that a metadata refresh has been requested prior to calling retryBatch
 	leader, leaderErr := p.client.Leader(topic, partition)
 	if leaderErr != nil {
 		Logger.Printf("Failed retrying batch for %v-%d because of %v while looking up for new leader\n", topic, partition, leaderErr)
-		failBatch(retryErr)
+		for _, msg := range pSet.msgs {
+			p.returnError(msg, retryErr)
+		}
+		if alreadyMuted {
+			p.muter.unmute(produceSet)
+		}
 		return
 	}
 	if !alreadyMuted {
 		if !p.muter.waitUntilMuted(produceSet) {
-			failBatch(retryErr)
-			return
-		}
-		muted = true
-	}
-	bp := p.getBrokerProducer(leader)
-	defer p.unrefBrokerProducer(leader, bp)
-	if !p.sendRetryBatch(bp, produceSet) {
-		failBatch(ErrShuttingDown)
-		return
-	}
-}
-
-type partitionBatchRetry struct {
-	topic     string
-	partition int32
-	pSet      *partitionSet
-}
-
-// retryBatchesAfterRefresh keeps metadata refresh out of brokerProducer.handleError.
-// Connection errors already hold partition mutes; doing the refresh in the
-// response loop can block all progress for that broker while the cluster is
-// unstable. Refresh once for the failed request so multi-partition batches do
-// not amplify controller-fail metadata traffic, then retry partitions
-// independently so one blocked broker handoff cannot hold up the rest.
-func (p *asyncProducer) retryBatchesAfterRefresh(topics []string, batches []partitionBatchRetry, retryErr error) {
-	if p.shutdownCh != nil {
-		select {
-		case <-p.shutdownCh:
-			for _, batch := range batches {
-				go p.retryBatch(batch.topic, batch.partition, batch.pSet, retryErr, true)
+			for _, msg := range pSet.msgs {
+				p.returnError(msg, retryErr)
 			}
 			return
-		default:
-		}
-	}
-	if len(topics) > 0 {
-		if err := p.client.RefreshMetadata(topics...); err != nil {
-			Logger.Printf("Failed refreshing metadata because of %v\n", err)
 		}
 	}
-	for _, batch := range batches {
-		go p.retryBatch(batch.topic, batch.partition, batch.pSet, retryErr, true)
-	}
-}
-
-// sendRetryBatch transfers ownership of a muted retry batch to the broker
-// bridge. A false result means no owner accepted the batch, so the caller must
-// release the mute and fail the messages instead of leaving a retry goroutine
-// blocked while it still owns the partition mute.
-func (p *asyncProducer) sendRetryBatch(bp *brokerProducer, set *produceSet) bool {
-	var done <-chan struct{}
-	if bp.done != nil {
-		done = bp.done
-	}
-	select {
-	case <-done:
-		return false
-	default:
-	}
-	select {
-	case bp.output <- set:
-		return true
-	default:
-	}
-	var shutdownCh <-chan struct{}
-	if p.shutdownCh != nil {
-		shutdownCh = p.shutdownCh
-	}
-	select {
-	case bp.output <- set:
-		return true
-	case <-done:
-		return false
-	case <-shutdownCh:
-		return false
-	}
+	bp := p.getBrokerProducer(leader)
+	bp.output <- produceSet
+	p.unrefBrokerProducer(leader, bp)
 }
 
 func (bp *brokerProducer) handleError(sent *produceSet, err error) {
@@ -1579,40 +1510,41 @@ func (bp *brokerProducer) handleError(sent *produceSet, err error) {
 		bp.parent.abandonBrokerConnection(bp.broker)
 		_ = bp.broker.Close()
 		bp.closing = err
-		keepMuted := make(map[string]map[int32]struct{})
 		var retryTopics []string
 		retryTopicSeen := make(map[string]struct{})
-		var retryBatches []partitionBatchRetry
 		sent.eachPartition(func(topic string, partition int32, pSet *partitionSet) {
-			// Connection failures have no ProduceResponse to feed back through
-			// partitionProducer. Keep the sent batch muted and retry it asynchronously
-			// so later same-partition batches cannot pass it, without blocking the
-			// brokerProducer response loop (#1203).
+			if _, ok := retryTopicSeen[topic]; ok {
+				return
+			}
+			retryTopicSeen[topic] = struct{}{}
+			retryTopics = append(retryTopics, topic)
+		})
+		if bp.parent.conf.Producer.Idempotent && len(retryTopics) > 0 {
+			refreshErr := bp.parent.client.RefreshMetadata(retryTopics...)
+			if refreshErr != nil {
+				Logger.Printf("Failed refreshing metadata because of %v\n", refreshErr)
+			}
+		}
+		keepMuted := make(map[string]map[int32]struct{})
+		sent.eachPartition(func(topic string, partition int32, pSet *partitionSet) {
+			// keep partition marked as in-flight during retry (connection error)
 			if bp.currentRetries[topic] == nil {
 				bp.currentRetries[topic] = make(map[int32]error)
 			}
 			bp.currentRetries[topic][partition] = err
-			if !(bp.parent.conf.Producer.Idempotent || pSet.shouldKeepMuted(bp.parent.conf.Producer.Retry.Max)) {
-				bp.parent.returnErrors(pSet.msgs, err)
-			} else {
+			if bp.parent.conf.Producer.Idempotent || pSet.canRetry(bp.parent.conf.Producer.Retry.Max) {
 				if keepMuted[topic] == nil {
 					keepMuted[topic] = make(map[int32]struct{})
 				}
 				keepMuted[topic][partition] = struct{}{}
-				if _, ok := retryTopicSeen[topic]; !ok {
-					retryTopicSeen[topic] = struct{}{}
-					retryTopics = append(retryTopics, topic)
-				}
-				retryBatches = append(retryBatches, partitionBatchRetry{
-					topic:     topic,
-					partition: partition,
-					pSet:      pSet,
-				})
+			}
+			if bp.parent.conf.Producer.Idempotent {
+				go bp.parent.retryBatch(topic, partition, pSet, err, true)
+			} else {
+				pSet.markPartitionMuteReusable()
+				bp.parent.retryMessages(pSet.msgs, err)
 			}
 		})
-		if len(retryBatches) > 0 {
-			go bp.parent.retryBatchesAfterRefresh(retryTopics, retryBatches, err)
-		}
 		bp.accumulatingBatch.eachPartition(func(topic string, partition int32, pSet *partitionSet) {
 			bp.parent.retryMessages(pSet.msgs, err)
 		})
@@ -1694,10 +1626,6 @@ func (p *asyncProducer) retryHandler() {
 // utility functions
 
 func (p *asyncProducer) shutdown() {
-	Logger.Println("Producer shutting down.")
-	if p.shutdownCh != nil && p.shutdownChClosed.CompareAndSwap(false, true) {
-		close(p.shutdownCh)
-	}
 	p.inFlight.Add(1)
 	p.input <- &ProducerMessage{flags: shutdown}
 
diff --git a/async_producer_test.go b/async_producer_test.go
index 4bb9cd87a..ed5e6beeb 100644
--- a/async_producer_test.go
+++ b/async_producer_test.go
@@ -1957,105 +1957,37 @@ func TestRetryBatchRespectsPartitionMuter(t *testing.T) {
 	}
 }
 
-func TestHandleErrorNonIdempotentRetryKeepsPartitionMuted(t *testing.T) {
+func TestHandleSuccessNonIdempotentRetryKeepsPartitionMuted(t *testing.T) {
 	config := NewTestConfig()
 	config.Producer.Idempotent = false
 	config.Producer.Retry.Max = 1
 	config.Producer.Retry.Backoff = 0
 
-	parent := &asyncProducer{
-		conf:       config,
-		muter:      newPartitionMuter(),
-		brokers:    make(map[*Broker]*brokerProducer),
-		brokerRefs: make(map[*brokerProducer]int),
-		txnmgr:     &transactionManager{},
-	}
-	failedBroker := &Broker{id: 1}
-	retryLeader := &Broker{id: 2}
-	parent.client = &stubLeaderClient{leader: retryLeader, cfg: config}
-
-	output := make(chan *produceSet)
-	retryBP := &brokerProducer{
-		parent: parent,
-		broker: retryLeader,
-		output: output,
-		input:  make(chan *ProducerMessage),
-	}
-	parent.brokers[retryLeader] = retryBP
-
-	bp := &brokerProducer{
-		parent:            parent,
-		broker:            failedBroker,
-		input:             make(chan *ProducerMessage),
-		accumulatingBatch: newProduceSet(parent),
-		currentRetries:    make(map[string]map[int32]error),
-	}
-
-	sent := newProduceSet(parent)
-	safeAddMessage(t, sent, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("retry")})
-	retryPartitionSet := sent.msgs["topic"][0]
-	if !parent.muter.tryMute(sent) {
-		t.Fatal("expected sent batch to mute partitions")
-	}
-
-	done := make(chan struct{})
-	go func() {
-		bp.handleError(sent, ErrOutOfBrokers)
-		close(done)
-	}()
-	assertDoneWithin(t, done, 2*time.Second)
-
-	retrySet := assertDoneWithin(t, output, 2*time.Second)
-	defer parent.muter.unmute(retrySet)
-	require.Equal(t, retryPartitionSet, retrySet.msgs["topic"][0])
-	require.Equal(t, 1, retryPartitionSet.msgs[0].retries)
-
-	contender := newProduceSet(parent)
-	safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("next")})
-	if parent.muter.tryMute(contender) {
-		parent.muter.unmute(contender)
-		t.Fatal("expected partition to remain muted by the retrying batch")
+	txnMgr := &transactionManager{
+		producerID:      0,
+		producerEpoch:   0,
+		sequenceNumbers: make(map[string]int32),
 	}
-}
-
-func TestHandleErrorNonIdempotentRetryDoesNotWaitForMetadataRefresh(t *testing.T) {
-	config := NewTestConfig()
-	config.Producer.Idempotent = false
-	config.Producer.Retry.Max = 1
-	config.Producer.Retry.Backoff = 0
-
 	parent := &asyncProducer{
 		conf:       config,
 		muter:      newPartitionMuter(),
 		brokers:    make(map[*Broker]*brokerProducer),
 		brokerRefs: make(map[*brokerProducer]int),
-		txnmgr:     &transactionManager{},
-	}
-	failedBroker := &Broker{id: 1}
-	retryLeader := &Broker{id: 2}
-	client := &blockingRefreshClient{
-		stubLeaderClient: stubLeaderClient{leader: retryLeader, cfg: config},
-		started:          make(chan struct{}, 1),
-		release:          make(chan struct{}),
-	}
-	parent.client = client
-
-	output := make(chan *produceSet)
-	retryBP := &brokerProducer{
-		parent: parent,
-		broker: retryLeader,
-		output: output,
-		input:  make(chan *ProducerMessage),
+		retries:    make(chan *ProducerMessage, 1),
+		txnmgr:     txnMgr,
 	}
-	parent.brokers[retryLeader] = retryBP
+	leader := &Broker{id: 1}
+	parent.client = &stubLeaderClient{leader: leader, cfg: config}
 
 	bp := &brokerProducer{
 		parent:            parent,
-		broker:            failedBroker,
+		broker:            leader,
 		input:             make(chan *ProducerMessage),
 		accumulatingBatch: newProduceSet(parent),
 		currentRetries:    make(map[string]map[int32]error),
 	}
+	parent.brokers[leader] = bp
+	parent.brokerRefs[bp] = 1
 
 	sent := newProduceSet(parent)
 	safeAddMessage(t, sent, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("retry")})
@@ -2063,242 +1995,29 @@ func TestHandleErrorNonIdempotentRetryDoesNotWaitForMetadataRefresh(t *testing.T
 	if !parent.muter.tryMute(sent) {
 		t.Fatal("expected sent batch to mute partitions")
 	}
-
-	done := make(chan struct{})
-	go func() {
-		bp.handleError(sent, ErrOutOfBrokers)
-		close(done)
-	}()
-	assertDoneWithin(t, done, 2*time.Second)
-	assertDoneWithin(t, client.started, 2*time.Second)
-
-	contender := newProduceSet(parent)
-	safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("next")})
-	if parent.muter.tryMute(contender) {
-		parent.muter.unmute(contender)
-		t.Fatal("expected partition to remain muted while async metadata refresh is pending")
-	}
-
-	close(client.release)
-	retrySet := assertDoneWithin(t, output, 2*time.Second)
-	defer parent.muter.unmute(retrySet)
-	require.Equal(t, retryPartitionSet, retrySet.msgs["topic"][0])
-}
-
-func TestHandleErrorNonIdempotentRetryRefreshesMetadataOncePerFailedRequest(t *testing.T) {
-	config := NewTestConfig()
-	config.Producer.Idempotent = false
-	config.Producer.Retry.Max = 1
-	config.Producer.Retry.Backoff = 0
-
-	parent := &asyncProducer{
-		conf:       config,
-		muter:      newPartitionMuter(),
-		brokers:    make(map[*Broker]*brokerProducer),
-		brokerRefs: make(map[*brokerProducer]int),
-		txnmgr:     &transactionManager{},
-	}
-	failedBroker := &Broker{id: 1}
-	retryLeader := &Broker{id: 2}
-	client := &countingRefreshClient{
-		stubLeaderClient: stubLeaderClient{leader: retryLeader, cfg: config},
-		calls:            make(chan []string, 1),
-	}
-	parent.client = client
-
-	output := make(chan *produceSet, 2)
-	retryBP := &brokerProducer{
-		parent: parent,
-		broker: retryLeader,
-		output: output,
-		input:  make(chan *ProducerMessage),
-	}
-	parent.brokers[retryLeader] = retryBP
-	parent.brokerRefs[retryBP] = 1
-
-	bp := &brokerProducer{
-		parent:            parent,
-		broker:            failedBroker,
-		input:             make(chan *ProducerMessage),
-		accumulatingBatch: newProduceSet(parent),
-		currentRetries:    make(map[string]map[int32]error),
-	}
-
-	sent := newProduceSet(parent)
-	safeAddMessage(t, sent, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("retry-0")})
-	safeAddMessage(t, sent, &ProducerMessage{Topic: "topic", Partition: 1, Value: StringEncoder("retry-1")})
-	retryPartitionSet0 := sent.msgs["topic"][0]
-	retryPartitionSet1 := sent.msgs["topic"][1]
-	if !parent.muter.tryMute(sent) {
-		t.Fatal("expected sent batch to mute partitions")
-	}
-
-	bp.handleError(sent, ErrOutOfBrokers)
-
-	require.Equal(t, []string{"topic"}, assertDoneWithin(t, client.calls, 2*time.Second))
-	retrySet0 := assertDoneWithin(t, output, 2*time.Second)
-	defer parent.muter.unmute(retrySet0)
-	retrySet1 := assertDoneWithin(t, output, 2*time.Second)
-	defer parent.muter.unmute(retrySet1)
-	assertNotDone(t, client.calls, 50*time.Millisecond)
-
-	retriedPartitions := make(map[int32]*partitionSet)
-	retrySet0.eachPartition(func(_ string, partition int32, pSet *partitionSet) {
-		retriedPartitions[partition] = pSet
-	})
-	retrySet1.eachPartition(func(_ string, partition int32, pSet *partitionSet) {
-		retriedPartitions[partition] = pSet
-	})
-	require.Same(t, retryPartitionSet0, retriedPartitions[int32(0)])
-	require.Same(t, retryPartitionSet1, retriedPartitions[int32(1)])
-}
-
-func TestHandleSuccessNonIdempotentRetryUsesPartitionProducer(t *testing.T) {
-	config := NewTestConfig()
-	config.Producer.Idempotent = false
-	config.Producer.Retry.Max = 1
-	config.Producer.Retry.Backoff = 0
-
-	parent := &asyncProducer{
-		conf:    config,
-		muter:   newPartitionMuter(),
-		retries: make(chan *ProducerMessage, 1),
-		txnmgr:  &transactionManager{},
-	}
-
-	bp := &brokerProducer{
-		parent:            parent,
-		broker:            &Broker{id: 1},
-		input:             make(chan *ProducerMessage),
-		accumulatingBatch: newProduceSet(parent),
-		currentRetries:    make(map[string]map[int32]error),
-	}
-
-	sent := newProduceSet(parent)
-	msg := &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("retry")}
-	safeAddMessage(t, sent, msg)
-	if !parent.muter.tryMute(sent) {
-		t.Fatal("expected sent batch to mute partitions")
-	}
+	defer parent.muter.unmute(sent)
 
 	response := new(ProduceResponse)
 	response.AddTopicPartition("topic", 0, ErrNotLeaderForPartition)
 	bp.handleSuccess(sent, response)
 
 	retried := assertDoneWithin(t, parent.retries, 2*time.Second)
-	require.Equal(t, msg, retried)
-	require.Equal(t, 1, retried.retries)
-
-	contender := newProduceSet(parent)
-	safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("next")})
-	if !parent.muter.tryMute(contender) {
-		t.Fatal("expected response retry to release sent mute before partitionProducer retry")
-	}
-	parent.muter.unmute(contender)
-}
-
-func TestRetryBatchReleasesMuteWhenHandoffAbortedByShutdown(t *testing.T) {
-	config := NewTestConfig()
-	config.Producer.Idempotent = false
-	config.Producer.Retry.Max = 1
-	config.Producer.Retry.Backoff = 0
-	config.Producer.Return.Errors = true
-
-	parent := &asyncProducer{
-		conf:       config,
-		muter:      newPartitionMuter(),
-		brokers:    make(map[*Broker]*brokerProducer),
-		brokerRefs: make(map[*brokerProducer]int),
-		errors:     make(chan *ProducerError, 1),
-		shutdownCh: make(chan struct{}),
-		txnmgr:     &transactionManager{},
-	}
-	leader := &Broker{id: 1}
-	parent.client = &stubLeaderClient{leader: leader, cfg: config}
-	retryBP := &brokerProducer{
-		parent: parent,
-		broker: leader,
-		output: make(chan *produceSet),
-		input:  make(chan *ProducerMessage),
-		done:   make(chan struct{}),
-	}
-	parent.brokers[leader] = retryBP
-
-	sent := newProduceSet(parent)
-	safeAddMessage(t, sent, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("retry")})
-	retryPartitionSet := sent.msgs["topic"][0]
-	if !parent.muter.tryMute(sent) {
-		t.Fatal("expected sent batch to mute partitions")
-	}
-	parent.inFlight.Add(1)
-	if parent.shutdownChClosed.CompareAndSwap(false, true) {
-		close(parent.shutdownCh)
-	}
-
-	done := make(chan struct{})
-	go func() {
-		parent.retryBatch("topic", 0, retryPartitionSet, ErrOutOfBrokers, true)
-		close(done)
-	}()
-
-	producerErr := assertDoneWithin(t, parent.errors, 2*time.Second)
-	require.Equal(t, ErrShuttingDown, producerErr.Err)
-	assertDoneWithin(t, done, 2*time.Second)
+	require.Equal(t, retryPartitionSet.msgs[0], retried)
+	require.True(t, retried.reusePartitionMute)
 
 	contender := newProduceSet(parent)
 	safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("next")})
-	if !parent.muter.tryMute(contender) {
-		t.Fatal("expected partition mute to be released after aborted retry handoff")
-	}
-	parent.muter.unmute(contender)
-}
-
-func TestHandleErrorNonIdempotentRetryReleasesMuteOnLeaderError(t *testing.T) {
-	config := NewTestConfig()
-	config.Producer.Idempotent = false
-	config.Producer.Retry.Max = 1
-	config.Producer.Retry.Backoff = 0
-	config.Producer.Return.Errors = true
-
-	parent := &asyncProducer{
-		conf:       config,
-		muter:      newPartitionMuter(),
-		brokers:    make(map[*Broker]*brokerProducer),
-		brokerRefs: make(map[*brokerProducer]int),
-		errors:     make(chan *ProducerError, 1),
-		txnmgr:     &transactionManager{},
-	}
-	parent.client = &failingLeaderClient{
-		stubLeaderClient: stubLeaderClient{cfg: config},
-		err:              ErrOutOfBrokers,
-	}
-
-	bp := &brokerProducer{
-		parent:            parent,
-		broker:            &Broker{id: 1},
-		input:             make(chan *ProducerMessage),
-		accumulatingBatch: newProduceSet(parent),
-		currentRetries:    make(map[string]map[int32]error),
-	}
-
-	sent := newProduceSet(parent)
-	safeAddMessage(t, sent, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("retry")})
-	if !parent.muter.tryMute(sent) {
-		t.Fatal("expected sent batch to mute partitions")
+	if parent.muter.tryMute(contender) {
+		parent.muter.unmute(contender)
+		t.Fatal("expected partition to remain muted by the retrying batch")
 	}
-	parent.inFlight.Add(1)
-
-	bp.handleError(sent, ErrOutOfBrokers)
 
-	producerErr := assertDoneWithin(t, parent.errors, 2*time.Second)
-	require.Equal(t, ErrOutOfBrokers, producerErr.Err)
-
-	contender := newProduceSet(parent)
-	safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("next")})
-	if !parent.muter.tryMute(contender) {
-		t.Fatal("expected partition mute to be released after retry leader lookup failure")
+	safeAddMessage(t, bp.accumulatingBatch, retried)
+	if !bp.tryBuildFlushingBatch() {
+		t.Fatal("expected retry batch to reuse the existing partition mute")
 	}
-	parent.muter.unmute(contender)
+	require.Equal(t, retryPartitionSet.msgs[0], bp.flushingBatch.msgs["topic"][0].msgs[0])
+	require.False(t, retryPartitionSet.msgs[0].reusePartitionMute)
 }
 
 type stubLeaderClient struct {
@@ -2336,44 +2055,6 @@ func (c *stubLeaderClient) PartitionNotReadable(string, int32) bool          { r
 func (c *stubLeaderClient) Close() error                                     { return nil }
 func (c *stubLeaderClient) Closed() bool                                     { return false }
 
-type failingLeaderClient struct {
-	stubLeaderClient
-	err error
-}
-
-func (c *failingLeaderClient) Leader(string, int32) (*Broker, error) {
-	return nil, c.err
-}
-
-func (c *failingLeaderClient) LeaderAndEpoch(string, int32) (*Broker, int32, error) {
-	return nil, 0, c.err
-}
-
-type blockingRefreshClient struct {
-	stubLeaderClient
-	started chan struct{}
-	release chan struct{}
-}
-
-func (c *blockingRefreshClient) RefreshMetadata(...string) error {
-	select {
-	case c.started <- struct{}{}:
-	default:
-	}
-	<-c.release
-	return nil
-}
-
-type countingRefreshClient struct {
-	stubLeaderClient
-	calls chan []string
-}
-
-func (c *countingRefreshClient) RefreshMetadata(topics ...string) error {
-	c.calls <- append([]string(nil), topics...)
-	return nil
-}
-
 func testProducerInterceptor(
 	t *testing.T,
 	interceptors []ProducerInterceptor,
diff --git a/produce_set.go b/produce_set.go
index 121a272ac..a86235b9e 100644
--- a/produce_set.go
+++ b/produce_set.go
@@ -12,18 +12,40 @@ type partitionSet struct {
 	bufferBytes   int
 }
 
-// shouldKeepMuted matches retryBatch's whole-batch retry rule: if any message has
-// exhausted retries, the batch will be failed instead of retried.
-func (ps *partitionSet) shouldKeepMuted(maxRetries int) bool {
-	if len(ps.msgs) == 0 {
-		return false
+// markPartitionMuteReusable lets the first message carry the retry batch's
+// right to reuse the partition mute that is still held by its failed send.
+// Only the first message is marked because one held mute represents one
+// in-flight retry permission; marking every message could let split retry
+// fragments bypass each other under the same reservation.
+func (ps *partitionSet) markPartitionMuteReusable() {
+	if len(ps.msgs) > 0 {
+		ps.msgs[0].reusePartitionMute = true
+	}
+}
+
+// canReusePartitionMute reports whether this retry batch may be flushed while
+// its partition is already muted by the previous send attempt.
+func (ps *partitionSet) canReusePartitionMute() bool {
+	return len(ps.msgs) > 0 && ps.msgs[0].reusePartitionMute
+}
+
+// clearPartitionMuteReuse consumes the one-shot reuse marker once the retry
+// batch is selected for flushing.
+func (ps *partitionSet) clearPartitionMuteReuse() {
+	for _, msg := range ps.msgs {
+		msg.reusePartitionMute = false
 	}
+}
+
+// canRetry prevents holding a partition mute when every message in the set will
+// be returned as an error instead of being retried.
+func (ps *partitionSet) canRetry(maxRetries int) bool {
 	for _, msg := range ps.msgs {
-		if msg.retries >= maxRetries {
-			return false
+		if msg.retries < maxRetries {
+			return true
 		}
 	}
-	return true
+	return false
 }
 
 type produceSet struct {
diff --git a/produce_set_test.go b/produce_set_test.go
index 359bfd47b..9ab964949 100644
--- a/produce_set_test.go
+++ b/produce_set_test.go
@@ -25,24 +25,6 @@ func safeAddMessage(t *testing.T, ps *produceSet, msg *ProducerMessage) {
 	}
 }
 
-func TestShouldKeepMuted(t *testing.T) {
-	if (&partitionSet{}).shouldKeepMuted(1) {
-		t.Fatal("empty partition set should not be retryable")
-	}
-	if !(&partitionSet{msgs: []*ProducerMessage{
-		{retries: 0},
-		{retries: 0},
-	}}).shouldKeepMuted(1) {
-		t.Fatal("expected batch to be retryable when every message is below retry max")
-	}
-	if (&partitionSet{msgs: []*ProducerMessage{
-		{retries: 0},
-		{retries: 1},
-	}}).shouldKeepMuted(1) {
-		t.Fatal("expected batch not to be retryable when any message has exhausted retries")
-	}
-}
-
 func TestProduceSetInitial(t *testing.T) {
 	_, ps := makeProduceSet()
 

From 81e2f4bf80180b8d6647d78abbccec75a8bfd1f2 Mon Sep 17 00:00:00 2001
From: 3AceShowHand <jinl1037@hotmail.com>
Date: Thu, 25 Jun 2026 22:03:05 +0800
Subject: [PATCH 6/6] Revert "Out of order no idempotent  (#21)"

This reverts commit 4c5ff0cff596d32699f8e0c1a20d5b5caa5ab111.
---
 async_producer.go      |  51 ++-----------------
 async_producer_test.go | 110 -----------------------------------------
 produce_set.go         |  36 --------------
 3 files changed, 4 insertions(+), 193 deletions(-)

diff --git a/async_producer.go b/async_producer.go
index 99821e5b4..a9af20b1b 100644
--- a/async_producer.go
+++ b/async_producer.go
@@ -184,13 +184,6 @@ func (m *partitionMuter) tryMutePartition(topic string, partition int32) bool {
 	return true
 }
 
-func (m *partitionMuter) isMutedPartition(topic string, partition int32) bool {
-	m.mu.Lock()
-	defer m.mu.Unlock()
-
-	return m.isMuted(topic, partition)
-}
-
 // waitUntilMuted blocks until all partitions in the set can be muted, then mutes them.
 // Returns false if the muter was closed before all partitions could be muted.
 func (m *partitionMuter) waitUntilMuted(set *produceSet) bool {
@@ -383,11 +376,6 @@ type ProducerMessage struct {
 	sequenceNumber int32
 	producerEpoch  int16
 	hasSequence    bool
-	// reusePartitionMute is a one-shot marker for non-idempotent retries. The
-	// first retried message from a failed, still-muted batch may reuse that
-	// existing in-flight reservation so later same-partition messages cannot
-	// slip in between the failed send and its retry.
-	reusePartitionMute bool
 }
 
 const producerMessageOverhead = 26 // the metadata overhead of CRC, flags, etc.
@@ -414,7 +402,6 @@ func (m *ProducerMessage) ByteSize(version int) int {
 func (m *ProducerMessage) clear() {
 	m.flags = 0
 	m.retries = 0
-	m.reusePartitionMute = false
 	m.sequenceNumber = 0
 	m.producerEpoch = 0
 	m.hasSequence = false
@@ -1096,14 +1083,8 @@ func (bp *brokerProducer) run() {
 	var output chan<- *produceSet
 
 	for {
-		var unmuteCh <-chan struct{}
 		if bp.flushingBatch == nil && (bp.timerFired || bp.accumulatingBatch.readyToFlush()) {
 			bp.tryBuildFlushingBatch()
-			if bp.flushingBatch == nil {
-				if ch, blocked := bp.parent.muter.awaitUnmuteChan(bp.accumulatingBatch); blocked {
-					unmuteCh = ch
-				}
-			}
 		}
 
 		var timerChan <-chan time.Time
@@ -1188,7 +1169,6 @@ func (bp *brokerProducer) run() {
 			if ok {
 				bp.handleResponse(response)
 			}
-		case <-unmuteCh:
 		}
 	}
 }
@@ -1206,29 +1186,10 @@ func (bp *brokerProducer) tryBuildFlushingBatch() bool {
 	partial := bp.accumulatingBatch.takePartitions(func(topic string, partition int32) bool {
 		return bp.parent.muter.tryMutePartition(topic, partition)
 	})
-	if partial != nil {
-		bp.flushingBatch = partial
-		if bp.accumulatingBatch.empty() {
-			bp.rollOver()
-		}
-		return true
-	}
-
-	reused := bp.accumulatingBatch.takePartitions(func(topic string, partition int32) bool {
-		partitions := bp.accumulatingBatch.msgs[topic]
-		if partitions == nil {
-			return false
-		}
-		set := partitions[partition]
-		return set.canReusePartitionMute() && bp.parent.muter.isMutedPartition(topic, partition)
-	})
-	if reused == nil {
+	if partial == nil {
 		return false
 	}
-	reused.eachPartition(func(_ string, _ int32, pSet *partitionSet) {
-		pSet.clearPartitionMuteReuse()
-	})
-	bp.flushingBatch = reused
+	bp.flushingBatch = partial
 	if bp.accumulatingBatch.empty() {
 		bp.rollOver()
 	}
@@ -1393,7 +1354,7 @@ func (bp *brokerProducer) handleSuccess(sent *produceSet, response *ProduceRespo
 				bp.parent.returnErrors(pSet.msgs, block.Err)
 			} else {
 				retryTopics = append(retryTopics, topic)
-				if bp.parent.conf.Producer.Idempotent || pSet.canRetry(bp.parent.conf.Producer.Retry.Max) {
+				if bp.parent.conf.Producer.Idempotent {
 					if keepMuted[topic] == nil {
 						keepMuted[topic] = make(map[int32]struct{})
 					}
@@ -1436,7 +1397,6 @@ func (bp *brokerProducer) handleSuccess(sent *produceSet, response *ProduceRespo
 				if bp.parent.conf.Producer.Idempotent {
 					go bp.parent.retryBatch(topic, partition, pSet, block.Err, true)
 				} else {
-					pSet.markPartitionMuteReusable()
 					bp.parent.retryMessages(pSet.msgs, block.Err)
 				}
 				// dropping the following messages has the side effect of incrementing their retry count
@@ -1532,16 +1492,13 @@ func (bp *brokerProducer) handleError(sent *produceSet, err error) {
 				bp.currentRetries[topic] = make(map[int32]error)
 			}
 			bp.currentRetries[topic][partition] = err
-			if bp.parent.conf.Producer.Idempotent || pSet.canRetry(bp.parent.conf.Producer.Retry.Max) {
+			if bp.parent.conf.Producer.Idempotent {
 				if keepMuted[topic] == nil {
 					keepMuted[topic] = make(map[int32]struct{})
 				}
 				keepMuted[topic][partition] = struct{}{}
-			}
-			if bp.parent.conf.Producer.Idempotent {
 				go bp.parent.retryBatch(topic, partition, pSet, err, true)
 			} else {
-				pSet.markPartitionMuteReusable()
 				bp.parent.retryMessages(pSet.msgs, err)
 			}
 		})
diff --git a/async_producer_test.go b/async_producer_test.go
index ed5e6beeb..c32ac3ef5 100644
--- a/async_producer_test.go
+++ b/async_producer_test.go
@@ -1724,53 +1724,6 @@ func TestBrokerProducerFlushSkipsMutedPartitions(t *testing.T) {
 	}
 }
 
-func TestBrokerProducerRunFlushesAfterExternalUnmute(t *testing.T) {
-	config := NewTestConfig()
-	config.Producer.Flush.Messages = 1
-	parent := &asyncProducer{
-		conf:   config,
-		muter:  newPartitionMuter(),
-		txnmgr: &transactionManager{},
-	}
-
-	blockedSet := newProduceSet(parent)
-	safeAddMessage(t, blockedSet, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("held")})
-	if !parent.muter.tryMute(blockedSet) {
-		t.Fatal("expected to mute partition externally")
-	}
-
-	input := make(chan *ProducerMessage)
-	output := make(chan *produceSet, 1)
-	responses := make(chan *brokerProducerResponse)
-	bp := &brokerProducer{
-		parent:            parent,
-		broker:            &Broker{id: 1},
-		input:             input,
-		output:            output,
-		responses:         responses,
-		accumulatingBatch: newProduceSet(parent),
-		currentRetries:    make(map[string]map[int32]error),
-	}
-	retryMsg := &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("waiting")}
-	safeAddMessage(t, bp.accumulatingBatch, retryMsg)
-
-	done := make(chan struct{})
-	go func() {
-		bp.run()
-		close(done)
-	}()
-
-	assertNotDone(t, output, 50*time.Millisecond)
-	parent.muter.unmute(blockedSet)
-
-	flushed := assertDoneWithin(t, output, 2*time.Second)
-	require.Equal(t, retryMsg, flushed.msgs["topic"][0].msgs[0])
-
-	close(input)
-	close(responses)
-	assertDoneWithin(t, done, 2*time.Second)
-}
-
 // TestBrokerProducerWaitForSpaceAllPartitionsMuted verifies that waitForSpace unblocks
 // when all partitions in the accumulating batch are externally muted and later unmuted.
 func TestBrokerProducerWaitForSpaceAllPartitionsMuted(t *testing.T) {
@@ -1957,69 +1910,6 @@ func TestRetryBatchRespectsPartitionMuter(t *testing.T) {
 	}
 }
 
-func TestHandleSuccessNonIdempotentRetryKeepsPartitionMuted(t *testing.T) {
-	config := NewTestConfig()
-	config.Producer.Idempotent = false
-	config.Producer.Retry.Max = 1
-	config.Producer.Retry.Backoff = 0
-
-	txnMgr := &transactionManager{
-		producerID:      0,
-		producerEpoch:   0,
-		sequenceNumbers: make(map[string]int32),
-	}
-	parent := &asyncProducer{
-		conf:       config,
-		muter:      newPartitionMuter(),
-		brokers:    make(map[*Broker]*brokerProducer),
-		brokerRefs: make(map[*brokerProducer]int),
-		retries:    make(chan *ProducerMessage, 1),
-		txnmgr:     txnMgr,
-	}
-	leader := &Broker{id: 1}
-	parent.client = &stubLeaderClient{leader: leader, cfg: config}
-
-	bp := &brokerProducer{
-		parent:            parent,
-		broker:            leader,
-		input:             make(chan *ProducerMessage),
-		accumulatingBatch: newProduceSet(parent),
-		currentRetries:    make(map[string]map[int32]error),
-	}
-	parent.brokers[leader] = bp
-	parent.brokerRefs[bp] = 1
-
-	sent := newProduceSet(parent)
-	safeAddMessage(t, sent, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("retry")})
-	retryPartitionSet := sent.msgs["topic"][0]
-	if !parent.muter.tryMute(sent) {
-		t.Fatal("expected sent batch to mute partitions")
-	}
-	defer parent.muter.unmute(sent)
-
-	response := new(ProduceResponse)
-	response.AddTopicPartition("topic", 0, ErrNotLeaderForPartition)
-	bp.handleSuccess(sent, response)
-
-	retried := assertDoneWithin(t, parent.retries, 2*time.Second)
-	require.Equal(t, retryPartitionSet.msgs[0], retried)
-	require.True(t, retried.reusePartitionMute)
-
-	contender := newProduceSet(parent)
-	safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("next")})
-	if parent.muter.tryMute(contender) {
-		parent.muter.unmute(contender)
-		t.Fatal("expected partition to remain muted by the retrying batch")
-	}
-
-	safeAddMessage(t, bp.accumulatingBatch, retried)
-	if !bp.tryBuildFlushingBatch() {
-		t.Fatal("expected retry batch to reuse the existing partition mute")
-	}
-	require.Equal(t, retryPartitionSet.msgs[0], bp.flushingBatch.msgs["topic"][0].msgs[0])
-	require.False(t, retryPartitionSet.msgs[0].reusePartitionMute)
-}
-
 type stubLeaderClient struct {
 	cfg    *Config
 	leader *Broker
diff --git a/produce_set.go b/produce_set.go
index a86235b9e..380f7f5ab 100644
--- a/produce_set.go
+++ b/produce_set.go
@@ -12,42 +12,6 @@ type partitionSet struct {
 	bufferBytes   int
 }
 
-// markPartitionMuteReusable lets the first message carry the retry batch's
-// right to reuse the partition mute that is still held by its failed send.
-// Only the first message is marked because one held mute represents one
-// in-flight retry permission; marking every message could let split retry
-// fragments bypass each other under the same reservation.
-func (ps *partitionSet) markPartitionMuteReusable() {
-	if len(ps.msgs) > 0 {
-		ps.msgs[0].reusePartitionMute = true
-	}
-}
-
-// canReusePartitionMute reports whether this retry batch may be flushed while
-// its partition is already muted by the previous send attempt.
-func (ps *partitionSet) canReusePartitionMute() bool {
-	return len(ps.msgs) > 0 && ps.msgs[0].reusePartitionMute
-}
-
-// clearPartitionMuteReuse consumes the one-shot reuse marker once the retry
-// batch is selected for flushing.
-func (ps *partitionSet) clearPartitionMuteReuse() {
-	for _, msg := range ps.msgs {
-		msg.reusePartitionMute = false
-	}
-}
-
-// canRetry prevents holding a partition mute when every message in the set will
-// be returned as an error instead of being retried.
-func (ps *partitionSet) canRetry(maxRetries int) bool {
-	for _, msg := range ps.msgs {
-		if msg.retries < maxRetries {
-			return true
-		}
-	}
-	return false
-}
-
 type produceSet struct {
 	parent        *asyncProducer
 	msgs          map[string]map[int32]*partitionSet