From e42b50e1e204488ea8c686c49f3b80e46bd69925 Mon Sep 17 00:00:00 2001 From: 3AceShowHand Date: Thu, 25 Jun 2026 22:03:05 +0800 Subject: [PATCH 1/6] Revert "Remove more logs and cherry pick review (#26)" This reverts commit 93b73f331dd5445624325af34cc441f71e8412a5. --- async_producer.go | 188 ++++++++++++++++++++++++----------------- async_producer_test.go | 113 ++++++------------------- client.go | 4 +- 3 files changed, 139 insertions(+), 166 deletions(-) diff --git a/async_producer.go b/async_producer.go index bcd7382e7..c9d5f7d44 100644 --- a/async_producer.go +++ b/async_producer.go @@ -817,8 +817,10 @@ func (p *asyncProducer) backoffOrDone(retries int) bool { if backoff <= 0 { return !p.shuttingDown() } + timer := time.NewTimer(backoff) + defer timer.Stop() select { - case <-time.After(backoff): + case <-timer.C: return true case <-p.done: return false @@ -1242,7 +1244,7 @@ func (bp *brokerProducer) tryBuildFlushingBatch() bool { } func (bp *brokerProducer) shutdown() { - if !bp.closed.CompareAndSwap(false, true) { + if bp.closed.Swap(true) { return } if bp.done != nil { @@ -1474,6 +1476,10 @@ func (p *asyncProducer) retryBatch(topic string, partition int32, pSet *partitio produceSet := newProduceSet(p) produceSet.addPartitionSet(topic, partition, pSet) bufferMessages, bufferBytes := int64(produceSet.bufferCount), int64(produceSet.bufferBytes) + var reservedMessages, reservedBytes int64 + defer func() { + p.retryBufferQuota.release(reservedMessages, reservedBytes) + }() muted := alreadyMuted failBatch := func(err error) { @@ -1496,7 +1502,7 @@ func (p *asyncProducer) retryBatch(topic string, partition int32, pSet *partitio failBatch(ErrProducerRetryBufferOverflow) return } - defer p.retryBufferQuota.release(bufferMessages, bufferBytes) + reservedMessages, reservedBytes = bufferMessages, bufferBytes // Honor Producer.Retry.Backoff between retry attempts (#2469). retryBatch // dispatches the produceSet directly to the broker, bypassing partitionProducer.dispatch. @@ -1526,11 +1532,23 @@ func (p *asyncProducer) retryBatch(topic string, partition int32, pSet *partitio } } -func (p *asyncProducer) failMutedSet(set *produceSet, err error) { - p.muter.unmute(set) - set.eachPartition(func(_ string, _ int32, pSet *partitionSet) { - p.returnErrors(pSet.msgs, err) - }) +type partitionBatchRetry struct { + topic string + partition int32 + pSet *partitionSet +} + +func (p *asyncProducer) failMutedBatch(batch partitionBatchRetry, err error) { + produceSet := newProduceSet(p) + produceSet.addPartitionSet(batch.topic, batch.partition, batch.pSet) + p.muter.unmute(produceSet) + p.returnErrors(batch.pSet.msgs, err) +} + +func (p *asyncProducer) failMutedBatches(batches []partitionBatchRetry, err error) { + for _, batch := range batches { + p.failMutedBatch(batch, err) + } } func newRetryBufferQuota(conf *Config) messageQuota { @@ -1559,88 +1577,99 @@ func producerMessageByteSizeVersion(conf *Config) int { return 1 } -// retryBatchesAfterRefresh retries muted batches from one failed ProduceRequest -// after a single metadata refresh, preserving that failed-request boundary. -func (p *asyncProducer) retryBatchesAfterRefresh(batches *produceSet, retryErr error) { +// retryBatchesAfterRefresh keeps metadata refresh out of brokerProducer.handleError. +// Connection errors already hold partition mutes; doing the refresh in the +// response loop can block all progress for that broker while the cluster is +// unstable. Refresh once for the failed request so multi-partition batches do +// not amplify controller-fail metadata traffic. Group retry partitions by their +// refreshed leader broker, but hand them off from this retry worker so large +// clusters do not create one temporary goroutine per target broker. +// +// This is intentionally not implemented as a loop over retryBatch. retryBatch +// is the single-partition direct retry primitive; this path owns all muted +// partitions from one failed ProduceRequest and must retry, fail, reserve +// buffer, back off, and refresh metadata at that failed-request boundary. +func (p *asyncProducer) retryBatchesAfterRefresh(topics []string, batches []partitionBatchRetry, retryErr error) { if p.shuttingDown() { - p.failMutedSet(batches, ErrShuttingDown) + p.failMutedBatches(batches, ErrShuttingDown) return } - var ( - maxRetryAttempt int - bufferMessages, bufferBytes int64 - ) - retryable := newProduceSet(p) - batches.eachPartition(func(topic string, partition int32, pSet *partitionSet) { - if !pSet.shouldKeepMuted(p.conf.Producer.Retry.Max) { - failed := newProduceSet(p) - failed.addPartitionSet(topic, partition, pSet) - p.failMutedSet(failed, retryErr) - return + var retryable []partitionBatchRetry + maxRetryAttempt := 0 + for _, batch := range batches { + if !batch.pSet.shouldKeepMuted(p.conf.Producer.Retry.Max) { + p.failMutedBatch(batch, retryErr) + continue } - for _, msg := range pSet.msgs { + for _, msg := range batch.pSet.msgs { msg.retries++ if msg.retries > maxRetryAttempt { maxRetryAttempt = msg.retries } } - retryable.addPartitionSet(topic, partition, pSet) - bufferMessages += int64(len(pSet.msgs)) - bufferBytes += int64(pSet.bufferBytes) - }) - if retryable.empty() { + retryable = append(retryable, batch) + } + if len(retryable) == 0 { return } - // Reserve before waiting so pending direct retries still count against the - // producer-level retry buffer budget. + // Reserve before metadata refresh so refresh-blocked direct retries still + // count against the producer-level retry buffer budget. + var bufferMessages, bufferBytes int64 + for _, batch := range retryable { + bufferMessages += int64(len(batch.pSet.msgs)) + bufferBytes += int64(batch.pSet.bufferBytes) + } if !p.retryBufferQuota.tryReserve(bufferMessages, bufferBytes) { - p.failMutedSet(retryable, ErrProducerRetryBufferOverflow) + p.failMutedBatches(retryable, ErrProducerRetryBufferOverflow) return } - if !p.backoffOrDone(maxRetryAttempt) { + if p.shuttingDown() { p.retryBufferQuota.release(bufferMessages, bufferBytes) - p.failMutedSet(retryable, ErrShuttingDown) + p.failMutedBatches(retryable, ErrShuttingDown) return } - topics := make([]string, 0, len(retryable.msgs)) - for topic := range retryable.msgs { - topics = append(topics, topic) + if len(topics) > 0 { + if err := p.client.RefreshMetadata(topics...); err != nil { + Logger.Printf("Failed refreshing metadata because of %v\n", err) + } } - if err := p.client.RefreshMetadata(topics...); err != nil { - Logger.Printf("Failed refreshing metadata because of %v\n", err) + if p.shuttingDown() { + p.retryBufferQuota.release(bufferMessages, bufferBytes) + p.failMutedBatches(retryable, ErrShuttingDown) + return + } + + if !p.backoffOrDone(maxRetryAttempt) { + p.retryBufferQuota.release(bufferMessages, bufferBytes) + p.failMutedBatches(retryable, ErrShuttingDown) + return } brokerSets := make(map[*Broker]*produceSet) - retryable.eachPartition(func(topic string, partition int32, pSet *partitionSet) { - leader, err := p.client.Leader(topic, partition) + for _, batch := range retryable { + if p.shuttingDown() { + batchMessages, batchBytes := int64(len(batch.pSet.msgs)), int64(batch.pSet.bufferBytes) + p.retryBufferQuota.release(batchMessages, batchBytes) + p.failMutedBatch(batch, ErrShuttingDown) + continue + } + leader, err := p.client.Leader(batch.topic, batch.partition) if err != nil { - Logger.Printf("Failed retrying batch for %v-%d because of %v while looking up for new leader\n", topic, partition, err) - batchMessages, batchBytes := int64(len(pSet.msgs)), int64(pSet.bufferBytes) + Logger.Printf("Failed retrying batch for %v-%d because of %v while looking up for new leader\n", batch.topic, batch.partition, err) + batchMessages, batchBytes := int64(len(batch.pSet.msgs)), int64(batch.pSet.bufferBytes) p.retryBufferQuota.release(batchMessages, batchBytes) - failed := newProduceSet(p) - failed.addPartitionSet(topic, partition, pSet) - p.failMutedSet(failed, retryErr) - return + p.failMutedBatch(batch, retryErr) + continue } set := brokerSets[leader] if set == nil { set = newProduceSet(p) brokerSets[leader] = set } - set.addPartitionSet(topic, partition, pSet) - }) - - // handoffRetryBatch also checks shutdown, but doing it here avoids creating - // or ref-counting brokerProducers just to reject the retry. - if p.shuttingDown() { - for _, set := range brokerSets { - p.retryBufferQuota.release(int64(set.bufferCount), int64(set.bufferBytes)) - p.failMutedSet(set, ErrShuttingDown) - } - return + set.addPartitionSet(batch.topic, batch.partition, batch.pSet) } for leader, set := range brokerSets { @@ -1649,7 +1678,10 @@ func (p *asyncProducer) retryBatchesAfterRefresh(batches *produceSet, retryErr e p.unrefBrokerProducer(leader, bp) p.retryBufferQuota.release(int64(set.bufferCount), int64(set.bufferBytes)) if !accepted { - p.failMutedSet(set, ErrShuttingDown) + p.muter.unmute(set) + set.eachPartition(func(_ string, _ int32, pSet *partitionSet) { + p.returnErrors(pSet.msgs, ErrShuttingDown) + }) } } } @@ -1657,16 +1689,10 @@ func (p *asyncProducer) retryBatchesAfterRefresh(batches *produceSet, retryErr e // handoffRetryBatch transfers ownership of a muted retry batch to the broker // bridge. On false, the caller still owns the mute and must fail the batch. func (p *asyncProducer) handoffRetryBatch(bp *brokerProducer, set *produceSet) bool { - // If shutdown is already visible, do not let select randomly choose a ready - // send on bp.output. The second select still handles shutdown racing with - // the handoff. - select { - case <-bp.done: - return false - case <-p.done: + if bp.closed.Load() { return false - default: } + select { case bp.output <- set: return true @@ -1689,9 +1715,10 @@ func (bp *brokerProducer) handleError(sent *produceSet, err error) { bp.parent.abandonBrokerConnection(bp.broker) _ = bp.broker.Close() bp.closing = err - - var retrySet *produceSet keepMuted := make(map[string]map[int32]struct{}) + var retryTopics []string + retryTopicSeen := make(map[string]struct{}) + var retryBatches []partitionBatchRetry sent.eachPartition(func(topic string, partition int32, pSet *partitionSet) { // Connection failures have no ProduceResponse to feed back through // partitionProducer. Keep the sent batch muted and retry it asynchronously @@ -1704,18 +1731,23 @@ func (bp *brokerProducer) handleError(sent *produceSet, err error) { if !(bp.parent.conf.Producer.Idempotent || pSet.shouldKeepMuted(bp.parent.conf.Producer.Retry.Max)) { bp.parent.returnErrors(pSet.msgs, err) } else { - if retrySet == nil { - retrySet = newProduceSet(bp.parent) - } if keepMuted[topic] == nil { keepMuted[topic] = make(map[int32]struct{}) } keepMuted[topic][partition] = struct{}{} - retrySet.addPartitionSet(topic, partition, pSet) + if _, ok := retryTopicSeen[topic]; !ok { + retryTopicSeen[topic] = struct{}{} + retryTopics = append(retryTopics, topic) + } + retryBatches = append(retryBatches, partitionBatchRetry{ + topic: topic, + partition: partition, + pSet: pSet, + }) } }) - if retrySet != nil { - go bp.parent.retryBatchesAfterRefresh(retrySet, err) + if len(retryBatches) > 0 { + go bp.parent.retryBatchesAfterRefresh(retryTopics, retryBatches, err) } bp.accumulatingBatch.eachPartition(func(topic string, partition int32, pSet *partitionSet) { bp.parent.retryMessages(pSet.msgs, err) @@ -1959,7 +1991,7 @@ func (q *messageQuota) tryReserve(messages, bytes int64) bool { nextMessages := q.messages + messages nextBytes := q.bytes + bytes - if q.overflows(nextMessages, nextBytes) { + if q.wouldOverflow(nextMessages, nextBytes) { return false } q.messages = nextMessages @@ -1977,7 +2009,7 @@ func (q *messageQuota) addAndCheckOverflow(messages, bytes int64) bool { q.messages += messages q.bytes += bytes - return q.overflows(q.messages, q.bytes) + return q.wouldOverflow(q.messages, q.bytes) } func (q *messageQuota) release(messages, bytes int64) { @@ -1996,7 +2028,7 @@ func (q *messageQuota) shouldTrack(messages, bytes int64) bool { return (messages != 0 || bytes != 0) && (q.maxMessages > 0 || q.maxBytes > 0) } -func (q *messageQuota) overflows(messages, bytes int64) bool { +func (q *messageQuota) wouldOverflow(messages, bytes int64) bool { return (q.maxMessages > 0 && messages >= q.maxMessages) || (q.maxBytes > 0 && bytes >= q.maxBytes) } diff --git a/async_producer_test.go b/async_producer_test.go index bbc68976d..b16e0bf61 100644 --- a/async_producer_test.go +++ b/async_producer_test.go @@ -1571,7 +1571,7 @@ func TestBrokerProducerShutdown(t *testing.T) { // TestBrokerProducerWaitForSpaceEmptyBufferRollover ensures forced rollovers with an empty buffer // do not deadlock waiting for responses when no partitions are muted. -func TestBrokerProducerWaitForSpaceEmptyRollover(t *testing.T) { +func TestBrokerProducerWaitForSpaceEmptyBufferRollover(t *testing.T) { config := NewTestConfig() parent := &asyncProducer{ conf: config, @@ -1707,9 +1707,9 @@ func holdFirstRetryAfterReserve(parent *asyncProducer, retryBP *brokerProducer, return firstRetryReserved, release } -// TestBrokerProducerWaitForSpaceExternalUnmute ensures waitForSpace does not +// TestBrokerProducerWaitForSpaceRespectsExternalUnmute ensures waitForSpace does not // deadlock when partitions are muted by another producer and are unmuted elsewhere. -func TestBrokerProducerWaitForSpaceExternalUnmute(t *testing.T) { +func TestBrokerProducerWaitForSpaceRespectsExternalUnmute(t *testing.T) { config := NewTestConfig() txnMgr := &transactionManager{ producerID: 0, @@ -1796,7 +1796,7 @@ func TestBrokerProducerFlushSkipsMutedPartitions(t *testing.T) { } } -func TestBrokerProducerRunExternalUnmute(t *testing.T) { +func TestBrokerProducerRunFlushesAfterExternalUnmute(t *testing.T) { config := NewTestConfig() config.Producer.Flush.Messages = 1 parent := &asyncProducer{ @@ -1843,9 +1843,9 @@ func TestBrokerProducerRunExternalUnmute(t *testing.T) { assertDoneWithin(t, done, 2*time.Second) } -// TestBrokerProducerWaitForSpaceAllMuted verifies that waitForSpace unblocks +// TestBrokerProducerWaitForSpaceAllPartitionsMuted verifies that waitForSpace unblocks // when all partitions in the accumulating batch are externally muted and later unmuted. -func TestBrokerProducerWaitForSpaceAllMuted(t *testing.T) { +func TestBrokerProducerWaitForSpaceAllPartitionsMuted(t *testing.T) { config := NewTestConfig() parent := &asyncProducer{ conf: config, @@ -1880,9 +1880,9 @@ func TestBrokerProducerWaitForSpaceAllMuted(t *testing.T) { } } -// TestPartitionMuterCloseWakesWait verifies that closing the muter wakes +// TestPartitionMuterCloseWakesWaitUntilMuted verifies that closing the muter wakes // goroutines blocked in waitUntilMuted. -func TestPartitionMuterCloseWakesWait(t *testing.T) { +func TestPartitionMuterCloseWakesWaitUntilMuted(t *testing.T) { config := NewTestConfig() parent := &asyncProducer{ conf: config, @@ -2031,7 +2031,7 @@ func TestRetryBatchRespectsPartitionMuter(t *testing.T) { } } -func TestHandleErrorRetryKeepsMute(t *testing.T) { +func TestHandleErrorNonIdempotentRetryKeepsPartitionMuted(t *testing.T) { config := NewTestConfig() config.Producer.Idempotent = false config.Producer.Retry.Max = 1 @@ -2092,7 +2092,7 @@ func TestHandleErrorRetryKeepsMute(t *testing.T) { } } -func TestHandleErrorRetryFailsBatch(t *testing.T) { +func TestHandleErrorNonIdempotentRetryFailsWholeBatch(t *testing.T) { config := NewTestConfig() config.Producer.Idempotent = false config.Producer.Retry.Max = 1 @@ -2141,7 +2141,7 @@ func TestHandleErrorRetryFailsBatch(t *testing.T) { parent.muter.unmute(contender) } -func TestHandleErrorRetryAsyncRefresh(t *testing.T) { +func TestHandleErrorNonIdempotentRetryDoesNotWaitForMetadataRefresh(t *testing.T) { config := NewTestConfig() config.Producer.Idempotent = false config.Producer.Retry.Max = 1 @@ -2219,7 +2219,7 @@ func TestHandleErrorRetryAsyncRefresh(t *testing.T) { require.Equal(t, retryPartitionSet, retrySet.msgs["topic"][0]) } -func TestHandleErrorRetryRefreshOnce(t *testing.T) { +func TestHandleErrorNonIdempotentRetryRefreshesMetadataOncePerFailedRequest(t *testing.T) { config := NewTestConfig() config.Producer.Idempotent = false config.Producer.Retry.Max = 1 @@ -2288,7 +2288,7 @@ func TestHandleErrorRetryRefreshOnce(t *testing.T) { require.Same(t, retryPartitionSet1, retriedPartitions[int32(1)]) } -func TestHandleErrorRetryGroupsByLeader(t *testing.T) { +func TestHandleErrorNonIdempotentRetryGroupsBatchesByLeader(t *testing.T) { config := NewTestConfig() config.Producer.Idempotent = false config.Producer.Retry.Max = 1 @@ -2385,7 +2385,7 @@ func TestHandleErrorRetryGroupsByLeader(t *testing.T) { assertNotDone(t, outputB, 50*time.Millisecond) } -func TestHandleSuccessRetryUsesPartitionProducer(t *testing.T) { +func TestHandleSuccessNonIdempotentRetryUsesPartitionProducer(t *testing.T) { config := NewTestConfig() config.Producer.Idempotent = false config.Producer.Retry.Max = 1 @@ -2429,7 +2429,7 @@ func TestHandleSuccessRetryUsesPartitionProducer(t *testing.T) { parent.muter.unmute(contender) } -func TestRetryBatchShutdownReleasesMute(t *testing.T) { +func TestRetryBatchReleasesMuteWhenHandoffAbortedByShutdown(t *testing.T) { config := NewTestConfig() config.Producer.Idempotent = false config.Producer.Retry.Max = 1 @@ -2542,9 +2542,11 @@ func TestRetryUsesSharedBufferBudget(t *testing.T) { retry := func(partition int32, pSet *partitionSet) { if tt.afterRefresh { - retrySet := newProduceSet(parent) - retrySet.addPartitionSet("topic", partition, pSet) - parent.retryBatchesAfterRefresh(retrySet, ErrOutOfBrokers) + parent.retryBatchesAfterRefresh([]string{"topic"}, []partitionBatchRetry{{ + topic: "topic", + partition: partition, + pSet: pSet, + }}, ErrOutOfBrokers) return } parent.retryBatch("topic", partition, pSet, ErrNotEnoughReplicas, true) @@ -2598,7 +2600,7 @@ func TestRetryUsesSharedBufferBudget(t *testing.T) { } } -func TestRetryBatchesAfterRefreshBrokerDone(t *testing.T) { +func TestRetryBatchesAfterRefreshReleasesMuteWhenBrokerProducerDone(t *testing.T) { config := NewTestConfig() config.Producer.Idempotent = false config.Producer.Retry.Max = 1 @@ -2635,9 +2637,11 @@ func TestRetryBatchesAfterRefreshBrokerDone(t *testing.T) { done := make(chan struct{}) go func() { - retrySet := newProduceSet(parent) - retrySet.addPartitionSet("topic", 0, retryPartitionSet) - parent.retryBatchesAfterRefresh(retrySet, ErrOutOfBrokers) + parent.retryBatchesAfterRefresh([]string{"topic"}, []partitionBatchRetry{{ + topic: "topic", + partition: 0, + pSet: retryPartitionSet, + }}, ErrOutOfBrokers) close(done) }() @@ -2654,70 +2658,7 @@ func TestRetryBatchesAfterRefreshBrokerDone(t *testing.T) { parent.muter.unmute(contender) } -func TestRetryBatchesAfterRefreshShutdown(t *testing.T) { - config := NewTestConfig() - config.Producer.Idempotent = false - config.Producer.Retry.Max = 1 - config.Producer.Retry.Backoff = 0 - config.Producer.Return.Errors = true - - parent := &asyncProducer{ - conf: config, - muter: newPartitionMuter(), - brokers: make(map[*Broker]*brokerProducer), - brokerRefs: make(map[*brokerProducer]int), - errors: make(chan *ProducerError, 2), - done: make(chan struct{}), - txnmgr: &transactionManager{}, - } - leader := &Broker{id: 1} - output := make(chan *produceSet, 1) - parent.brokers[leader] = &brokerProducer{ - parent: parent, - broker: leader, - output: output, - input: make(chan *ProducerMessage), - done: make(chan struct{}), - } - shutdownStarted := false - parent.client = &stubLeaderClient{ - cfg: config, - leaderFunc: func(string, int32) (*Broker, error) { - if !shutdownStarted { - close(parent.done) - shutdownStarted = true - } - return leader, nil - }, - } - - sent := newProduceSet(parent) - safeAddMessage(t, sent, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("retry-0")}) - safeAddMessage(t, sent, &ProducerMessage{Topic: "topic", Partition: 1, Value: StringEncoder("retry-1")}) - if !parent.muter.tryMute(sent) { - t.Fatal("expected sent batch to mute partitions") - } - parent.inFlight.Add(2) - - parent.retryBatchesAfterRefresh(sent, ErrOutOfBrokers) - - firstErr := assertDoneWithin(t, parent.errors, 2*time.Second) - secondErr := assertDoneWithin(t, parent.errors, 2*time.Second) - require.Equal(t, ErrShuttingDown, firstErr.Err) - require.Equal(t, ErrShuttingDown, secondErr.Err) - require.ElementsMatch(t, []int32{0, 1}, []int32{firstErr.Msg.Partition, secondErr.Msg.Partition}) - assertNotDone(t, output, 50*time.Millisecond) - - contender := newProduceSet(parent) - safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("next-0")}) - safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 1, Value: StringEncoder("next-1")}) - if !parent.muter.tryMute(contender) { - t.Fatal("expected partition mutes to be released after shutdown") - } - parent.muter.unmute(contender) -} - -func TestHandleErrorRetryLeaderError(t *testing.T) { +func TestHandleErrorNonIdempotentRetryReleasesMuteOnLeaderError(t *testing.T) { config := NewTestConfig() config.Producer.Idempotent = false config.Producer.Retry.Max = 1 diff --git a/client.go b/client.go index 6dbf0d009..71b57b15c 100644 --- a/client.go +++ b/client.go @@ -732,7 +732,7 @@ func (client *client) checkSeedBrokersHealth(brokers []*Broker) { for _, broker := range brokers { if err := broker.getSockError(); err != nil { - DebugLogger.Printf("client/seedbrokers close seed broker #%d at %s due to socket error: %v", broker.ID(), broker.Addr(), err) + Logger.Printf("client/seedbrokers close seed broker #%d at %s due to socket error: %v", broker.ID(), broker.Addr(), err) safeAsyncClose(broker) } } @@ -741,7 +741,7 @@ func (client *client) checkSeedBrokersHealth(brokers []*Broker) { func (client *client) checkBrokersHealth() { for id, broker := range client.brokers { if err := broker.getSockError(); err != nil { - DebugLogger.Printf("client/brokers close broker #%d at %s due to socket error: %v", broker.ID(), broker.Addr(), err) + Logger.Printf("client/brokers close broker #%d at %s due to socket error: %v", broker.ID(), broker.Addr(), err) safeAsyncClose(broker) delete(client.brokers, id) } From 07e58ada53b106c9307f78885fd7190daef49a14 Mon Sep 17 00:00:00 2001 From: 3AceShowHand Date: Thu, 25 Jun 2026 22:03:05 +0800 Subject: [PATCH 2/6] Revert "refactor async producer (#25)" This reverts commit 8162613c39d5b4de7b099edbf17332836e612a07. --- async_producer.go | 368 ++++++++++++++----------- async_producer_test.go | 613 ++++++++++++++++++++++++----------------- produce_set_test.go | 37 +-- 3 files changed, 582 insertions(+), 436 deletions(-) diff --git a/async_producer.go b/async_producer.go index c9d5f7d44..14c813ace 100644 --- a/async_producer.go +++ b/async_producer.go @@ -93,16 +93,17 @@ type asyncProducer struct { errors chan *ProducerError input, successes, retries chan *ProducerMessage inFlight sync.WaitGroup - - // done is closed before waiting on in-flight messages so retryBatch + // shutdownCh is closed before waiting on in-flight messages so retryBatch // goroutines can stop waiting on broker handoff and release partition mutes. - done chan struct{} - closed atomic.Bool + shutdownCh chan struct{} + shutdownChClosed atomic.Bool brokers map[*Broker]*brokerProducer brokerRefs map[*brokerProducer]int brokerLock sync.Mutex + retryBuffer retryBufferQuota + txnmgr *transactionManager txLock sync.Mutex @@ -110,13 +111,17 @@ type asyncProducer struct { // mirroring Kafka's RecordAccumulator. muter *partitionMuter - // retryBufferQuota tracks producer-level retry buffer occupancy when - // Producer.Retry.MaxBufferLength or MaxBufferBytes is bounded. - retryBufferQuota messageQuota - metricsRegistry metrics.Registry } +type retryBufferQuota struct { + // messages and bytes track producer-level retry buffer occupancy when + // Producer.Retry.MaxBufferLength or MaxBufferBytes is bounded. + messages int64 + bytes int64 + mu sync.Mutex +} + type partitionMuter struct { mu sync.Mutex cond *sync.Cond @@ -307,19 +312,18 @@ func newAsyncProducer(client Client) (AsyncProducer, error) { } p := &asyncProducer{ - client: client, - conf: client.Config(), - errors: make(chan *ProducerError), - input: make(chan *ProducerMessage), - successes: make(chan *ProducerMessage), - retries: make(chan *ProducerMessage), - done: make(chan struct{}), - brokers: make(map[*Broker]*brokerProducer), - brokerRefs: make(map[*brokerProducer]int), - txnmgr: txnmgr, - muter: newPartitionMuter(), - retryBufferQuota: newRetryBufferQuota(client.Config()), - metricsRegistry: newCleanupRegistry(client.Config().MetricRegistry), + client: client, + conf: client.Config(), + errors: make(chan *ProducerError), + input: make(chan *ProducerMessage), + successes: make(chan *ProducerMessage), + retries: make(chan *ProducerMessage), + shutdownCh: make(chan struct{}), + brokers: make(map[*Broker]*brokerProducer), + brokerRefs: make(map[*brokerProducer]int), + txnmgr: txnmgr, + muter: newPartitionMuter(), + metricsRegistry: newCleanupRegistry(client.Config().MetricRegistry), } // launch our singleton dispatchers @@ -793,19 +797,33 @@ func (p *asyncProducer) newPartitionProducer(topic string, partition int32) chan } func (pp *partitionProducer) backoff(retries int) { - time.Sleep(pp.parent.calcBackoff(retries)) + pp.parent.backoff(retries) } -func (p *asyncProducer) calcBackoff(retries int) time.Duration { - if backoffFunc := p.conf.Producer.Retry.BackoffFunc; backoffFunc != nil { - return backoffFunc(retries, p.conf.Producer.Retry.Max) +func (p *asyncProducer) backoff(retries int) { + backoff := p.retryBackoff(retries) + if backoff > 0 { + time.Sleep(backoff) } - return p.conf.Producer.Retry.Backoff +} + +func (p *asyncProducer) retryBackoff(retries int) time.Duration { + var backoff time.Duration + if p.conf.Producer.Retry.BackoffFunc != nil { + maxRetries := p.conf.Producer.Retry.Max + backoff = p.conf.Producer.Retry.BackoffFunc(retries, maxRetries) + } else { + backoff = p.conf.Producer.Retry.Backoff + } + return backoff } func (p *asyncProducer) shuttingDown() bool { + if p.shutdownCh == nil { + return false + } select { - case <-p.done: + case <-p.shutdownCh: return true default: return false @@ -813,16 +831,20 @@ func (p *asyncProducer) shuttingDown() bool { } func (p *asyncProducer) backoffOrDone(retries int) bool { - backoff := p.calcBackoff(retries) + backoff := p.retryBackoff(retries) if backoff <= 0 { return !p.shuttingDown() } timer := time.NewTimer(backoff) defer timer.Stop() + if p.shutdownCh == nil { + <-timer.C + return true + } select { case <-timer.C: return true - case <-p.done: + case <-p.shutdownCh: return false } } @@ -1108,8 +1130,7 @@ type brokerProducer struct { // output while the batch still owns its partition mute; if the brokerProducer // is shutting down, that send may never be received and the mute would block // later same-partition messages and producer shutdown. - done chan struct{} - closed atomic.Bool + done chan struct{} accumulatingBatch *produceSet flushingBatch *produceSet // batch that has been muted and is ready to send @@ -1233,20 +1254,18 @@ func (bp *brokerProducer) tryBuildFlushingBatch() bool { partial := bp.accumulatingBatch.takePartitions(func(topic string, partition int32) bool { return bp.parent.muter.tryMutePartition(topic, partition) }) - if partial == nil { - return false - } - bp.flushingBatch = partial - if bp.accumulatingBatch.empty() { - bp.rollOver() + if partial != nil { + bp.flushingBatch = partial + if bp.accumulatingBatch.empty() { + bp.rollOver() + } + return true } - return true + + return false } func (bp *brokerProducer) shutdown() { - if bp.closed.Swap(true) { - return - } if bp.done != nil { close(bp.done) } @@ -1476,20 +1495,25 @@ func (p *asyncProducer) retryBatch(topic string, partition int32, pSet *partitio produceSet := newProduceSet(p) produceSet.addPartitionSet(topic, partition, pSet) bufferMessages, bufferBytes := int64(produceSet.bufferCount), int64(produceSet.bufferBytes) - var reservedMessages, reservedBytes int64 - defer func() { - p.retryBufferQuota.release(reservedMessages, reservedBytes) - }() - + bufferReserved := false + releaseBuffer := func() { + if bufferReserved { + p.releaseBuffer(bufferMessages, bufferBytes) + bufferReserved = false + } + } muted := alreadyMuted failBatch := func(err error) { + releaseBuffer() // Release the partition before reporting errors so a blocked Errors // consumer cannot keep later same-partition messages muted. if muted { p.muter.unmute(produceSet) muted = false } - p.returnErrors(pSet.msgs, err) + for _, msg := range pSet.msgs { + p.returnError(msg, err) + } } for _, msg := range pSet.msgs { if msg.retries >= p.conf.Producer.Retry.Max { @@ -1498,11 +1522,11 @@ func (p *asyncProducer) retryBatch(topic string, partition int32, pSet *partitio } msg.retries++ } - if !p.retryBufferQuota.tryReserve(bufferMessages, bufferBytes) { + if !p.reserveBuffer(bufferMessages, bufferBytes) { failBatch(ErrProducerRetryBufferOverflow) return } - reservedMessages, reservedBytes = bufferMessages, bufferBytes + bufferReserved = p.retryBufferBounded() && (bufferMessages != 0 || bufferBytes != 0) // Honor Producer.Retry.Backoff between retry attempts (#2469). retryBatch // dispatches the produceSet directly to the broker, bypassing partitionProducer.dispatch. @@ -1527,9 +1551,11 @@ func (p *asyncProducer) retryBatch(topic string, partition int32, pSet *partitio } bp := p.getBrokerProducer(leader) defer p.unrefBrokerProducer(leader, bp) - if !p.handoffRetryBatch(bp, produceSet) { + if !p.sendRetryBatch(bp, produceSet) { failBatch(ErrShuttingDown) + return } + releaseBuffer() } type partitionBatchRetry struct { @@ -1542,7 +1568,9 @@ func (p *asyncProducer) failMutedBatch(batch partitionBatchRetry, err error) { produceSet := newProduceSet(p) produceSet.addPartitionSet(batch.topic, batch.partition, batch.pSet) p.muter.unmute(produceSet) - p.returnErrors(batch.pSet.msgs, err) + for _, msg := range batch.pSet.msgs { + p.returnError(msg, err) + } } func (p *asyncProducer) failMutedBatches(batches []partitionBatchRetry, err error) { @@ -1551,32 +1579,85 @@ func (p *asyncProducer) failMutedBatches(batches []partitionBatchRetry, err erro } } -func newRetryBufferQuota(conf *Config) messageQuota { - maxBufferLength, maxBufferBytes := retryBufferLimits(conf) - return newMessageQuota(maxBufferLength, maxBufferBytes) -} - -func retryBufferLimits(conf *Config) (int64, int64) { - maxBufferLength := conf.Producer.Retry.MaxBufferLength - if maxBufferLength > 0 { - maxBufferLength = max(maxBufferLength, minFunctionalRetryBufferLength) +func (p *asyncProducer) retryBufferLimits() (int64, int64) { + maxBufferLength := p.conf.Producer.Retry.MaxBufferLength + if 0 < maxBufferLength && maxBufferLength < minFunctionalRetryBufferLength { + maxBufferLength = minFunctionalRetryBufferLength } - maxBufferBytes := conf.Producer.Retry.MaxBufferBytes - if maxBufferBytes > 0 { - maxBufferBytes = max(maxBufferBytes, minFunctionalRetryBufferBytes) + maxBufferBytes := p.conf.Producer.Retry.MaxBufferBytes + if 0 < maxBufferBytes && maxBufferBytes < minFunctionalRetryBufferBytes { + maxBufferBytes = minFunctionalRetryBufferBytes } return int64(maxBufferLength), maxBufferBytes } -func producerMessageByteSizeVersion(conf *Config) int { - if conf.Version.IsAtLeast(V0_11_0_0) { +func (p *asyncProducer) retryBufferBounded() bool { + maxBufferLength, maxBufferBytes := p.retryBufferLimits() + return maxBufferLength > 0 || maxBufferBytes > 0 +} + +func (p *asyncProducer) producerMessageByteSizeVersion() int { + if p.conf.Version.IsAtLeast(V0_11_0_0) { return 2 } return 1 } +func (p *asyncProducer) bufferWouldOverflow(messages, bytes int64) bool { + maxBufferLength, maxBufferBytes := p.retryBufferLimits() + return (maxBufferLength > 0 && messages >= maxBufferLength) || + (maxBufferBytes > 0 && bytes >= maxBufferBytes) +} + +func (p *asyncProducer) reserveBuffer(messages, bytes int64) bool { + if messages == 0 && bytes == 0 { + return true + } + if !p.retryBufferBounded() { + return true + } + p.retryBuffer.mu.Lock() + defer p.retryBuffer.mu.Unlock() + + if p.bufferWouldOverflow(p.retryBuffer.messages+messages, p.retryBuffer.bytes+bytes) { + return false + } + p.retryBuffer.messages += messages + p.retryBuffer.bytes += bytes + return true +} + +func (p *asyncProducer) addToBuffer(messages, bytes int64) bool { + if messages == 0 && bytes == 0 { + return false + } + if !p.retryBufferBounded() { + return false + } + p.retryBuffer.mu.Lock() + defer p.retryBuffer.mu.Unlock() + + p.retryBuffer.messages += messages + p.retryBuffer.bytes += bytes + return p.bufferWouldOverflow(p.retryBuffer.messages, p.retryBuffer.bytes) +} + +func (p *asyncProducer) releaseBuffer(messages, bytes int64) { + if messages == 0 && bytes == 0 { + return + } + if !p.retryBufferBounded() { + return + } + p.retryBuffer.mu.Lock() + defer p.retryBuffer.mu.Unlock() + + p.retryBuffer.messages -= messages + p.retryBuffer.bytes -= bytes +} + // retryBatchesAfterRefresh keeps metadata refresh out of brokerProducer.handleError. // Connection errors already hold partition mutes; doing the refresh in the // response loop can block all progress for that broker while the cluster is @@ -1621,13 +1702,13 @@ func (p *asyncProducer) retryBatchesAfterRefresh(topics []string, batches []part bufferMessages += int64(len(batch.pSet.msgs)) bufferBytes += int64(batch.pSet.bufferBytes) } - if !p.retryBufferQuota.tryReserve(bufferMessages, bufferBytes) { + if !p.reserveBuffer(bufferMessages, bufferBytes) { p.failMutedBatches(retryable, ErrProducerRetryBufferOverflow) return } if p.shuttingDown() { - p.retryBufferQuota.release(bufferMessages, bufferBytes) + p.releaseBuffer(bufferMessages, bufferBytes) p.failMutedBatches(retryable, ErrShuttingDown) return } @@ -1637,13 +1718,13 @@ func (p *asyncProducer) retryBatchesAfterRefresh(topics []string, batches []part } } if p.shuttingDown() { - p.retryBufferQuota.release(bufferMessages, bufferBytes) + p.releaseBuffer(bufferMessages, bufferBytes) p.failMutedBatches(retryable, ErrShuttingDown) return } if !p.backoffOrDone(maxRetryAttempt) { - p.retryBufferQuota.release(bufferMessages, bufferBytes) + p.releaseBuffer(bufferMessages, bufferBytes) p.failMutedBatches(retryable, ErrShuttingDown) return } @@ -1652,15 +1733,15 @@ func (p *asyncProducer) retryBatchesAfterRefresh(topics []string, batches []part for _, batch := range retryable { if p.shuttingDown() { batchMessages, batchBytes := int64(len(batch.pSet.msgs)), int64(batch.pSet.bufferBytes) - p.retryBufferQuota.release(batchMessages, batchBytes) + p.releaseBuffer(batchMessages, batchBytes) p.failMutedBatch(batch, ErrShuttingDown) continue } - leader, err := p.client.Leader(batch.topic, batch.partition) - if err != nil { - Logger.Printf("Failed retrying batch for %v-%d because of %v while looking up for new leader\n", batch.topic, batch.partition, err) + leader, leaderErr := p.client.Leader(batch.topic, batch.partition) + if leaderErr != nil { + Logger.Printf("Failed retrying batch for %v-%d because of %v while looking up for new leader\n", batch.topic, batch.partition, leaderErr) batchMessages, batchBytes := int64(len(batch.pSet.msgs)), int64(batch.pSet.bufferBytes) - p.retryBufferQuota.release(batchMessages, batchBytes) + p.releaseBuffer(batchMessages, batchBytes) p.failMutedBatch(batch, retryErr) continue } @@ -1673,32 +1754,55 @@ func (p *asyncProducer) retryBatchesAfterRefresh(topics []string, batches []part } for leader, set := range brokerSets { - bp := p.getBrokerProducer(leader) - accepted := p.handoffRetryBatch(bp, set) - p.unrefBrokerProducer(leader, bp) - p.retryBufferQuota.release(int64(set.bufferCount), int64(set.bufferBytes)) - if !accepted { - p.muter.unmute(set) - set.eachPartition(func(_ string, _ int32, pSet *partitionSet) { - p.returnErrors(pSet.msgs, ErrShuttingDown) - }) - } + p.handoffRetrySet(leader, set) } } -// handoffRetryBatch transfers ownership of a muted retry batch to the broker -// bridge. On false, the caller still owns the mute and must fail the batch. -func (p *asyncProducer) handoffRetryBatch(bp *brokerProducer, set *produceSet) bool { - if bp.closed.Load() { - return false +func (p *asyncProducer) handoffRetrySet(leader *Broker, set *produceSet) { + bp := p.getBrokerProducer(leader) + accepted := p.sendRetryBatch(bp, set) + p.unrefBrokerProducer(leader, bp) + setMessages, setBytes := int64(set.bufferCount), int64(set.bufferBytes) + p.releaseBuffer(setMessages, setBytes) + if !accepted { + p.muter.unmute(set) + set.eachPartition(func(_ string, _ int32, pSet *partitionSet) { + for _, msg := range pSet.msgs { + p.returnError(msg, ErrShuttingDown) + } + }) } +} +// sendRetryBatch transfers ownership of a muted retry batch to the broker +// bridge. A false result means no owner accepted the batch, so the caller must +// release the mute and fail the messages instead of leaving a retry goroutine +// blocked while it still owns the partition mute. +func (p *asyncProducer) sendRetryBatch(bp *brokerProducer, set *produceSet) bool { + var done <-chan struct{} + if bp.done != nil { + done = bp.done + } + select { + case <-done: + return false + default: + } + select { + case bp.output <- set: + return true + default: + } + var shutdownCh <-chan struct{} + if p.shutdownCh != nil { + shutdownCh = p.shutdownCh + } select { case bp.output <- set: return true - case <-bp.done: + case <-done: return false - case <-p.done: + case <-shutdownCh: return false } } @@ -1769,7 +1873,7 @@ func (bp *brokerProducer) handleError(sent *produceSet, err error) { // effectively a "bridge" between the flushers and the dispatcher in order to avoid deadlock // based on https://godoc.org/github.com/eapache/channels#InfiniteChannel func (p *asyncProducer) retryHandler() { - version := producerMessageByteSizeVersion(p.conf) + version := p.producerMessageByteSizeVersion() var msg *ProducerMessage buf := queue.New() @@ -1782,7 +1886,7 @@ func (p *asyncProducer) retryHandler() { case msg = <-p.retries: case p.input <- buf.Peek().(*ProducerMessage): msgToRemove := buf.Remove().(*ProducerMessage) - p.retryBufferQuota.release(1, int64(msgToRemove.ByteSize(version))) + p.releaseBuffer(1, int64(msgToRemove.ByteSize(version))) continue } } @@ -1792,7 +1896,7 @@ func (p *asyncProducer) retryHandler() { } buf.Add(msg) - bufferOverflow := p.retryBufferQuota.addAndCheckOverflow(1, int64(msg.ByteSize(version))) + bufferOverflow := p.addToBuffer(1, int64(msg.ByteSize(version))) if !bufferOverflow { continue @@ -1803,10 +1907,10 @@ func (p *asyncProducer) retryHandler() { select { case p.input <- msgToHandle: buf.Remove() - p.retryBufferQuota.release(1, int64(msgToHandle.ByteSize(version))) + p.releaseBuffer(1, int64(msgToHandle.ByteSize(version))) default: buf.Remove() - p.retryBufferQuota.release(1, int64(msgToHandle.ByteSize(version))) + p.releaseBuffer(1, int64(msgToHandle.ByteSize(version))) p.returnError(msgToHandle, ErrProducerRetryBufferOverflow) } } @@ -1817,8 +1921,8 @@ func (p *asyncProducer) retryHandler() { func (p *asyncProducer) shutdown() { Logger.Println("Producer shutting down.") - if p.done != nil && p.closed.CompareAndSwap(false, true) { - close(p.done) + if p.shutdownCh != nil && p.shutdownChClosed.CompareAndSwap(false, true) { + close(p.shutdownCh) } p.inFlight.Add(1) p.input <- &ProducerMessage{flags: shutdown} @@ -1964,71 +2068,3 @@ func (p *asyncProducer) abandonBrokerConnection(broker *Broker) { delete(p.brokers, broker) } - -// messageQuota tracks message and byte counts against optional maximums. -type messageQuota struct { - mu sync.Mutex - messages int64 - bytes int64 - maxMessages int64 - maxBytes int64 -} - -func newMessageQuota(maxMessages, maxBytes int64) messageQuota { - return messageQuota{ - maxMessages: maxMessages, - maxBytes: maxBytes, - } -} - -func (q *messageQuota) tryReserve(messages, bytes int64) bool { - if !q.shouldTrack(messages, bytes) { - return true - } - - q.mu.Lock() - defer q.mu.Unlock() - - nextMessages := q.messages + messages - nextBytes := q.bytes + bytes - if q.wouldOverflow(nextMessages, nextBytes) { - return false - } - q.messages = nextMessages - q.bytes = nextBytes - return true -} - -func (q *messageQuota) addAndCheckOverflow(messages, bytes int64) bool { - if !q.shouldTrack(messages, bytes) { - return false - } - - q.mu.Lock() - defer q.mu.Unlock() - - q.messages += messages - q.bytes += bytes - return q.wouldOverflow(q.messages, q.bytes) -} - -func (q *messageQuota) release(messages, bytes int64) { - if !q.shouldTrack(messages, bytes) { - return - } - - q.mu.Lock() - defer q.mu.Unlock() - - q.messages -= messages - q.bytes -= bytes -} - -func (q *messageQuota) shouldTrack(messages, bytes int64) bool { - return (messages != 0 || bytes != 0) && (q.maxMessages > 0 || q.maxBytes > 0) -} - -func (q *messageQuota) wouldOverflow(messages, bytes int64) bool { - return (q.maxMessages > 0 && messages >= q.maxMessages) || - (q.maxBytes > 0 && bytes >= q.maxBytes) -} diff --git a/async_producer_test.go b/async_producer_test.go index b16e0bf61..a3c220f63 100644 --- a/async_producer_test.go +++ b/async_producer_test.go @@ -1562,8 +1562,6 @@ func TestBrokerProducerShutdown(t *testing.T) { bp := producer.(*asyncProducer).getBrokerProducer(broker) // Initiate the shutdown of all of them producer.(*asyncProducer).unrefBrokerProducer(broker, bp) - assertDoneWithin(t, bp.done, time.Second) - require.NotPanics(t, bp.shutdown) _ = producer.Close() mockBroker.Close() @@ -1637,16 +1635,37 @@ func assertDoneWithin[T any](t *testing.T, ch <-chan T, timeout time.Duration) T } } +func waitForProducerBuffer(t *testing.T, parent *asyncProducer, messages, bytes int64) { + t.Helper() + deadline := time.After(2 * time.Second) + ticker := time.NewTicker(time.Millisecond) + defer ticker.Stop() + for { + parent.retryBuffer.mu.Lock() + gotMessages := parent.retryBuffer.messages + gotBytes := parent.retryBuffer.bytes + parent.retryBuffer.mu.Unlock() + if (messages < 0 || gotMessages == messages) && (bytes < 0 || gotBytes == bytes) { + return + } + select { + case <-deadline: + t.Fatalf("timed out waiting for buffer budget messages=%d bytes=%d, got messages=%d bytes=%d", + messages, bytes, gotMessages, gotBytes) + case <-ticker.C: + } + } +} + func newBlockingRetryProducer(config *Config, errorBuffer int) (*asyncProducer, *brokerProducer) { parent := &asyncProducer{ - conf: config, - muter: newPartitionMuter(), - brokers: make(map[*Broker]*brokerProducer), - brokerRefs: make(map[*brokerProducer]int), - errors: make(chan *ProducerError, errorBuffer), - done: make(chan struct{}), - txnmgr: &transactionManager{}, - retryBufferQuota: newRetryBufferQuota(config), + conf: config, + muter: newPartitionMuter(), + brokers: make(map[*Broker]*brokerProducer), + brokerRefs: make(map[*brokerProducer]int), + errors: make(chan *ProducerError, errorBuffer), + shutdownCh: make(chan struct{}), + txnmgr: &transactionManager{}, } leader := &Broker{id: 1} parent.client = &stubLeaderClient{leader: leader, cfg: config} @@ -1661,52 +1680,6 @@ func newBlockingRetryProducer(config *Config, errorBuffer int) (*asyncProducer, return parent, retryBP } -// holdFirstRetryAfterReserve lets the test prove a second retry sees quota held by the first. -func holdFirstRetryAfterReserve(parent *asyncProducer, retryBP *brokerProducer, afterRefresh bool) (<-chan struct{}, func()) { - firstRetryReserved := make(chan struct{}, 1) - blockFirstRetry := make(chan struct{}, 1) - releaseFirstRetry := make(chan struct{}, 1) - blockFirstRetry <- struct{}{} - - hold := func() { - select { - case <-blockFirstRetry: - firstRetryReserved <- struct{}{} - <-releaseFirstRetry - default: - } - } - release := func() { - select { - case releaseFirstRetry <- struct{}{}: - default: - } - } - - if afterRefresh { - parent.client = &stubLeaderClient{ - leader: retryBP.broker, - cfg: parent.conf, - refreshMetadata: func(...string) error { - hold() - return nil - }, - } - } else { - parent.client = &stubLeaderClient{ - cfg: parent.conf, - leaderFunc: func(_ string, partition int32) (*Broker, error) { - if partition == 0 { - hold() - } - return retryBP.broker, nil - }, - } - } - - return firstRetryReserved, release -} - // TestBrokerProducerWaitForSpaceRespectsExternalUnmute ensures waitForSpace does not // deadlock when partitions are muted by another producer and are unmuted elsewhere. func TestBrokerProducerWaitForSpaceRespectsExternalUnmute(t *testing.T) { @@ -1987,12 +1960,11 @@ func TestRetryBatchRespectsPartitionMuter(t *testing.T) { } parent := &asyncProducer{ - conf: config, - muter: newPartitionMuter(), - brokers: make(map[*Broker]*brokerProducer), - brokerRefs: make(map[*brokerProducer]int), - txnmgr: txnMgr, - retryBufferQuota: newRetryBufferQuota(config), + conf: config, + muter: newPartitionMuter(), + brokers: make(map[*Broker]*brokerProducer), + brokerRefs: make(map[*brokerProducer]int), + txnmgr: txnMgr, } leader := &Broker{} parent.client = &stubLeaderClient{leader: leader, cfg: config} @@ -2023,6 +1995,7 @@ func TestRetryBatchRespectsPartitionMuter(t *testing.T) { default: t.Fatal("expected retry batch to be dispatched") } + waitForProducerBuffer(t, parent, 0, 0) contender := newProduceSet(parent) safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("next")}) @@ -2149,28 +2122,18 @@ func TestHandleErrorNonIdempotentRetryDoesNotWaitForMetadataRefresh(t *testing.T config.Producer.Retry.MaxBufferLength = minFunctionalRetryBufferLength parent := &asyncProducer{ - conf: config, - muter: newPartitionMuter(), - brokers: make(map[*Broker]*brokerProducer), - brokerRefs: make(map[*brokerProducer]int), - txnmgr: &transactionManager{}, - retryBufferQuota: newRetryBufferQuota(config), + conf: config, + muter: newPartitionMuter(), + brokers: make(map[*Broker]*brokerProducer), + brokerRefs: make(map[*brokerProducer]int), + txnmgr: &transactionManager{}, } failedBroker := &Broker{id: 1} retryLeader := &Broker{id: 2} - started := make(chan struct{}, 1) - release := make(chan struct{}) - client := &stubLeaderClient{ - leader: retryLeader, - cfg: config, - refreshMetadata: func(...string) error { - select { - case started <- struct{}{}: - default: - } - <-release - return nil - }, + client := &blockingRefreshClient{ + stubLeaderClient: stubLeaderClient{leader: retryLeader, cfg: config}, + started: make(chan struct{}, 1), + release: make(chan struct{}), } parent.client = client @@ -2204,7 +2167,8 @@ func TestHandleErrorNonIdempotentRetryDoesNotWaitForMetadataRefresh(t *testing.T close(done) }() assertDoneWithin(t, done, 2*time.Second) - assertDoneWithin(t, started, 2*time.Second) + assertDoneWithin(t, client.started, 2*time.Second) + waitForProducerBuffer(t, parent, 1, -1) contender := newProduceSet(parent) safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("next")}) @@ -2213,10 +2177,11 @@ func TestHandleErrorNonIdempotentRetryDoesNotWaitForMetadataRefresh(t *testing.T t.Fatal("expected partition to remain muted while async metadata refresh is pending") } - close(release) + close(client.release) retrySet := assertDoneWithin(t, output, 2*time.Second) defer parent.muter.unmute(retrySet) require.Equal(t, retryPartitionSet, retrySet.msgs["topic"][0]) + waitForProducerBuffer(t, parent, 0, 0) } func TestHandleErrorNonIdempotentRetryRefreshesMetadataOncePerFailedRequest(t *testing.T) { @@ -2234,14 +2199,9 @@ func TestHandleErrorNonIdempotentRetryRefreshesMetadataOncePerFailedRequest(t *t } failedBroker := &Broker{id: 1} retryLeader := &Broker{id: 2} - calls := make(chan []string, 1) - client := &stubLeaderClient{ - leader: retryLeader, - cfg: config, - refreshMetadata: func(topics ...string) error { - calls <- append([]string(nil), topics...) - return nil - }, + client := &countingRefreshClient{ + stubLeaderClient: stubLeaderClient{leader: retryLeader, cfg: config}, + calls: make(chan []string, 1), } parent.client = client @@ -2274,10 +2234,10 @@ func TestHandleErrorNonIdempotentRetryRefreshesMetadataOncePerFailedRequest(t *t bp.handleError(sent, ErrOutOfBrokers) - require.Equal(t, []string{"topic"}, assertDoneWithin(t, calls, 2*time.Second)) + require.Equal(t, []string{"topic"}, assertDoneWithin(t, client.calls, 2*time.Second)) retrySet := assertDoneWithin(t, output, 2*time.Second) defer parent.muter.unmute(retrySet) - assertNotDone(t, calls, 50*time.Millisecond) + assertNotDone(t, client.calls, 50*time.Millisecond) assertNotDone(t, output, 50*time.Millisecond) retriedPartitions := make(map[int32]*partitionSet) @@ -2304,9 +2264,8 @@ func TestHandleErrorNonIdempotentRetryGroupsBatchesByLeader(t *testing.T) { failedBroker := &Broker{id: 1} leaderA := &Broker{id: 2} leaderB := &Broker{id: 3} - calls := make(chan []string, 1) - client := &stubLeaderClient{ - cfg: config, + client := &routingLeaderClient{ + stubLeaderClient: stubLeaderClient{cfg: config}, leaders: map[string]map[int32]*Broker{ "topic-a": { 0: leaderA, @@ -2317,10 +2276,7 @@ func TestHandleErrorNonIdempotentRetryGroupsBatchesByLeader(t *testing.T) { 1: leaderB, }, }, - refreshMetadata: func(topics ...string) error { - calls <- append([]string(nil), topics...) - return nil - }, + calls: make(chan []string, 1), } parent.client = client @@ -2360,7 +2316,7 @@ func TestHandleErrorNonIdempotentRetryGroupsBatchesByLeader(t *testing.T) { bp.handleError(sent, ErrOutOfBrokers) - require.ElementsMatch(t, []string{"topic-a", "topic-b"}, assertDoneWithin(t, calls, 2*time.Second)) + require.ElementsMatch(t, []string{"topic-a", "topic-b"}, assertDoneWithin(t, client.calls, 2*time.Second)) retrySetA := assertDoneWithin(t, outputA, 2*time.Second) defer parent.muter.unmute(retrySetA) retrySetB := assertDoneWithin(t, outputB, 2*time.Second) @@ -2438,27 +2394,16 @@ func TestRetryBatchReleasesMuteWhenHandoffAbortedByShutdown(t *testing.T) { config.Producer.Retry.MaxBufferLength = minFunctionalRetryBufferLength parent := &asyncProducer{ - conf: config, - muter: newPartitionMuter(), - brokers: make(map[*Broker]*brokerProducer), - brokerRefs: make(map[*brokerProducer]int), - errors: make(chan *ProducerError, 1), - done: make(chan struct{}), - txnmgr: &transactionManager{}, - retryBufferQuota: newRetryBufferQuota(config), + conf: config, + muter: newPartitionMuter(), + brokers: make(map[*Broker]*brokerProducer), + brokerRefs: make(map[*brokerProducer]int), + errors: make(chan *ProducerError, 1), + shutdownCh: make(chan struct{}), + txnmgr: &transactionManager{}, } leader := &Broker{id: 1} - leaderLookup := make(chan struct{}, 1) - parent.client = &stubLeaderClient{ - cfg: config, - leaderFunc: func(string, int32) (*Broker, error) { - select { - case leaderLookup <- struct{}{}: - default: - } - return leader, nil - }, - } + parent.client = &stubLeaderClient{leader: leader, cfg: config} retryBP := &brokerProducer{ parent: parent, broker: leader, @@ -2481,15 +2426,16 @@ func TestRetryBatchReleasesMuteWhenHandoffAbortedByShutdown(t *testing.T) { parent.retryBatch("topic", 0, retryPartitionSet, ErrOutOfBrokers, true) close(done) }() - assertDoneWithin(t, leaderLookup, 2*time.Second) + waitForProducerBuffer(t, parent, 1, -1) - if parent.closed.CompareAndSwap(false, true) { - close(parent.done) + if parent.shutdownChClosed.CompareAndSwap(false, true) { + close(parent.shutdownCh) } producerErr := assertDoneWithin(t, parent.errors, 2*time.Second) require.Equal(t, ErrShuttingDown, producerErr.Err) assertDoneWithin(t, done, 2*time.Second) + waitForProducerBuffer(t, parent, 0, 0) contender := newProduceSet(parent) safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("next")}) @@ -2499,105 +2445,114 @@ func TestRetryBatchReleasesMuteWhenHandoffAbortedByShutdown(t *testing.T) { parent.muter.unmute(contender) } -func TestRetryUsesSharedBufferBudget(t *testing.T) { - tests := []struct { - name string - afterRefresh bool - bytesLimit bool - }{ - {name: "retryBatch length"}, - {name: "retryBatch bytes", bytesLimit: true}, - {name: "retryBatchesAfterRefresh length", afterRefresh: true}, - {name: "retryBatchesAfterRefresh bytes", afterRefresh: true, bytesLimit: true}, +func TestRetryBatchUsesSharedBufferLengthBudget(t *testing.T) { + config := NewTestConfig() + config.Producer.Idempotent = true + config.Producer.Retry.Max = 1 + config.Producer.Retry.Backoff = 0 + config.Producer.Return.Errors = true + config.Producer.Retry.MaxBufferLength = minFunctionalRetryBufferLength + 1 + + parent, retryBP := newBlockingRetryProducer(config, minFunctionalRetryBufferLength+1) + + first := newProduceSet(parent) + for i := 0; i < minFunctionalRetryBufferLength; i++ { + safeAddMessage(t, first, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("retry")}) + } + firstPartitionSet := first.msgs["topic"][0] + if !parent.muter.tryMute(first) { + t.Fatal("expected first retry batch to mute partitions") } + parent.inFlight.Add(minFunctionalRetryBufferLength) - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - config := NewTestConfig() - config.Producer.Idempotent = !tt.afterRefresh - config.Producer.Retry.Max = 1 - config.Producer.Retry.Backoff = 0 - config.Producer.Return.Errors = true - - firstCount := 1 - errorBuffer := 2 - var firstMsg *ProducerMessage - secondMsg := &ProducerMessage{Topic: "topic", Partition: 1, Value: StringEncoder("overflow")} - - if tt.bytesLimit { - firstMsg = &ProducerMessage{Topic: "topic", Partition: 0, Value: ByteEncoder(make([]byte, minFunctionalRetryBufferBytes))} - version := producerMessageByteSizeVersion(config) - firstBytes := int64(firstMsg.ByteSize(version)) - secondBytes := int64(secondMsg.ByteSize(version)) - config.Producer.Retry.MaxBufferBytes = firstBytes + secondBytes - } else { - config.Producer.Retry.MaxBufferLength = minFunctionalRetryBufferLength + 1 - firstCount = minFunctionalRetryBufferLength - errorBuffer = minFunctionalRetryBufferLength + 1 - } + firstDone := make(chan struct{}) + go func() { + parent.retryBatch("topic", 0, firstPartitionSet, ErrNotEnoughReplicas, true) + close(firstDone) + }() + waitForProducerBuffer(t, parent, minFunctionalRetryBufferLength, -1) - parent, retryBP := newBlockingRetryProducer(config, errorBuffer) - firstRetryReserved, releaseFirstRetry := holdFirstRetryAfterReserve(parent, retryBP, tt.afterRefresh) - defer releaseFirstRetry() - - retry := func(partition int32, pSet *partitionSet) { - if tt.afterRefresh { - parent.retryBatchesAfterRefresh([]string{"topic"}, []partitionBatchRetry{{ - topic: "topic", - partition: partition, - pSet: pSet, - }}, ErrOutOfBrokers) - return - } - parent.retryBatch("topic", partition, pSet, ErrNotEnoughReplicas, true) - } + second := newProduceSet(parent) + safeAddMessage(t, second, &ProducerMessage{Topic: "topic", Partition: 1, Value: StringEncoder("overflow")}) + secondPartitionSet := second.msgs["topic"][1] + if !parent.muter.tryMute(second) { + t.Fatal("expected second retry batch to mute partitions") + } + parent.inFlight.Add(1) + parent.retryBatch("topic", 1, secondPartitionSet, ErrNotEnoughReplicas, true) - first := newProduceSet(parent) - if tt.bytesLimit { - safeAddMessage(t, first, firstMsg) - } else { - for i := 0; i < minFunctionalRetryBufferLength; i++ { - safeAddMessage(t, first, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("retry")}) - } - } - firstPartitionSet := first.msgs["topic"][0] - if !parent.muter.tryMute(first) { - t.Fatal("expected first retry batch to mute partitions") - } - parent.inFlight.Add(firstCount) + producerErr := assertDoneWithin(t, parent.errors, 2*time.Second) + require.Equal(t, ErrProducerRetryBufferOverflow, producerErr.Err) + require.Equal(t, int32(1), producerErr.Msg.Partition) + waitForProducerBuffer(t, parent, minFunctionalRetryBufferLength, -1) - firstDone := make(chan struct{}) - go func() { - retry(0, firstPartitionSet) - close(firstDone) - }() - assertDoneWithin(t, firstRetryReserved, 2*time.Second) + contender := newProduceSet(parent) + safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 1, Value: StringEncoder("next")}) + if !parent.muter.tryMute(contender) { + t.Fatal("expected overflowed retry batch to release partition mute") + } + parent.muter.unmute(contender) - second := newProduceSet(parent) - safeAddMessage(t, second, secondMsg) - secondPartitionSet := second.msgs["topic"][1] - if !parent.muter.tryMute(second) { - t.Fatal("expected second retry batch to mute partitions") - } - parent.inFlight.Add(1) - retry(1, secondPartitionSet) + close(retryBP.done) + assertDoneWithin(t, firstDone, 2*time.Second) + waitForProducerBuffer(t, parent, 0, 0) +} - producerErr := assertDoneWithin(t, parent.errors, 2*time.Second) - require.Equal(t, ErrProducerRetryBufferOverflow, producerErr.Err) - require.Equal(t, int32(1), producerErr.Msg.Partition) +func TestRetryBatchUsesSharedBufferBytesBudget(t *testing.T) { + config := NewTestConfig() + config.Producer.Idempotent = true + config.Producer.Retry.Max = 1 + config.Producer.Retry.Backoff = 0 + config.Producer.Return.Errors = true - contender := newProduceSet(parent) - safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 1, Value: StringEncoder("next")}) - if !parent.muter.tryMute(contender) { - t.Fatal("expected overflowed retry batch to release partition mute") - } - parent.muter.unmute(contender) + parent, retryBP := newBlockingRetryProducer(config, 2) - close(retryBP.done) - releaseFirstRetry() - assertDoneWithin(t, firstDone, 2*time.Second) - }) + firstMsg := &ProducerMessage{Topic: "topic", Partition: 0, Value: ByteEncoder(make([]byte, minFunctionalRetryBufferBytes))} + secondMsg := &ProducerMessage{Topic: "topic", Partition: 1, Value: StringEncoder("overflow")} + version := parent.producerMessageByteSizeVersion() + firstBytes := int64(firstMsg.ByteSize(version)) + secondBytes := int64(secondMsg.ByteSize(version)) + config.Producer.Retry.MaxBufferBytes = firstBytes + secondBytes + + first := newProduceSet(parent) + safeAddMessage(t, first, firstMsg) + firstPartitionSet := first.msgs["topic"][0] + if !parent.muter.tryMute(first) { + t.Fatal("expected first retry batch to mute partitions") + } + parent.inFlight.Add(1) + + firstDone := make(chan struct{}) + go func() { + parent.retryBatch("topic", 0, firstPartitionSet, ErrNotEnoughReplicas, true) + close(firstDone) + }() + waitForProducerBuffer(t, parent, -1, firstBytes) + + second := newProduceSet(parent) + safeAddMessage(t, second, secondMsg) + secondPartitionSet := second.msgs["topic"][1] + if !parent.muter.tryMute(second) { + t.Fatal("expected second retry batch to mute partitions") } + parent.inFlight.Add(1) + parent.retryBatch("topic", 1, secondPartitionSet, ErrNotEnoughReplicas, true) + + producerErr := assertDoneWithin(t, parent.errors, 2*time.Second) + require.Equal(t, ErrProducerRetryBufferOverflow, producerErr.Err) + require.Equal(t, int32(1), producerErr.Msg.Partition) + waitForProducerBuffer(t, parent, -1, firstBytes) + + contender := newProduceSet(parent) + safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 1, Value: StringEncoder("next")}) + if !parent.muter.tryMute(contender) { + t.Fatal("expected overflowed retry batch to release partition mute") + } + parent.muter.unmute(contender) + + close(retryBP.done) + assertDoneWithin(t, firstDone, 2*time.Second) + waitForProducerBuffer(t, parent, 0, 0) } func TestRetryBatchesAfterRefreshReleasesMuteWhenBrokerProducerDone(t *testing.T) { @@ -2613,7 +2568,7 @@ func TestRetryBatchesAfterRefreshReleasesMuteWhenBrokerProducerDone(t *testing.T brokers: make(map[*Broker]*brokerProducer), brokerRefs: make(map[*brokerProducer]int), errors: make(chan *ProducerError, 1), - done: make(chan struct{}), + shutdownCh: make(chan struct{}), txnmgr: &transactionManager{}, } leader := &Broker{id: 1} @@ -2658,6 +2613,128 @@ func TestRetryBatchesAfterRefreshReleasesMuteWhenBrokerProducerDone(t *testing.T parent.muter.unmute(contender) } +func TestRetryBatchesAfterRefreshUsesSharedBufferLengthBudget(t *testing.T) { + config := NewTestConfig() + config.Producer.Idempotent = false + config.Producer.Retry.Max = 1 + config.Producer.Retry.Backoff = 0 + config.Producer.Return.Errors = true + config.Producer.Retry.MaxBufferLength = minFunctionalRetryBufferLength + 1 + + parent, retryBP := newBlockingRetryProducer(config, minFunctionalRetryBufferLength+1) + + first := newProduceSet(parent) + for i := 0; i < minFunctionalRetryBufferLength; i++ { + safeAddMessage(t, first, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("retry")}) + } + if !parent.muter.tryMute(first) { + t.Fatal("expected first retry batch to mute partitions") + } + parent.inFlight.Add(minFunctionalRetryBufferLength) + + firstDone := make(chan struct{}) + go func() { + parent.retryBatchesAfterRefresh([]string{"topic"}, []partitionBatchRetry{{ + topic: "topic", + partition: 0, + pSet: first.msgs["topic"][0], + }}, ErrOutOfBrokers) + close(firstDone) + }() + waitForProducerBuffer(t, parent, minFunctionalRetryBufferLength, -1) + + second := newProduceSet(parent) + safeAddMessage(t, second, &ProducerMessage{Topic: "topic", Partition: 1, Value: StringEncoder("overflow")}) + if !parent.muter.tryMute(second) { + t.Fatal("expected second retry batch to mute partitions") + } + parent.inFlight.Add(1) + parent.retryBatchesAfterRefresh([]string{"topic"}, []partitionBatchRetry{{ + topic: "topic", + partition: 1, + pSet: second.msgs["topic"][1], + }}, ErrOutOfBrokers) + + producerErr := assertDoneWithin(t, parent.errors, 2*time.Second) + require.Equal(t, ErrProducerRetryBufferOverflow, producerErr.Err) + require.Equal(t, int32(1), producerErr.Msg.Partition) + waitForProducerBuffer(t, parent, minFunctionalRetryBufferLength, -1) + + contender := newProduceSet(parent) + safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 1, Value: StringEncoder("next")}) + if !parent.muter.tryMute(contender) { + t.Fatal("expected overflowed retry batch to release partition mute") + } + parent.muter.unmute(contender) + + close(retryBP.done) + assertDoneWithin(t, firstDone, 2*time.Second) + waitForProducerBuffer(t, parent, 0, 0) +} + +func TestRetryBatchesAfterRefreshUsesSharedBufferBytesBudget(t *testing.T) { + config := NewTestConfig() + config.Producer.Idempotent = false + config.Producer.Retry.Max = 1 + config.Producer.Retry.Backoff = 0 + config.Producer.Return.Errors = true + + parent, retryBP := newBlockingRetryProducer(config, 2) + + firstMsg := &ProducerMessage{Topic: "topic", Partition: 0, Value: ByteEncoder(make([]byte, minFunctionalRetryBufferBytes))} + secondMsg := &ProducerMessage{Topic: "topic", Partition: 1, Value: StringEncoder("overflow")} + version := parent.producerMessageByteSizeVersion() + firstBytes := int64(firstMsg.ByteSize(version)) + secondBytes := int64(secondMsg.ByteSize(version)) + config.Producer.Retry.MaxBufferBytes = firstBytes + secondBytes + + first := newProduceSet(parent) + safeAddMessage(t, first, firstMsg) + if !parent.muter.tryMute(first) { + t.Fatal("expected first retry batch to mute partitions") + } + parent.inFlight.Add(1) + + firstDone := make(chan struct{}) + go func() { + parent.retryBatchesAfterRefresh([]string{"topic"}, []partitionBatchRetry{{ + topic: "topic", + partition: 0, + pSet: first.msgs["topic"][0], + }}, ErrOutOfBrokers) + close(firstDone) + }() + waitForProducerBuffer(t, parent, -1, firstBytes) + + second := newProduceSet(parent) + safeAddMessage(t, second, secondMsg) + if !parent.muter.tryMute(second) { + t.Fatal("expected second retry batch to mute partitions") + } + parent.inFlight.Add(1) + parent.retryBatchesAfterRefresh([]string{"topic"}, []partitionBatchRetry{{ + topic: "topic", + partition: 1, + pSet: second.msgs["topic"][1], + }}, ErrOutOfBrokers) + + producerErr := assertDoneWithin(t, parent.errors, 2*time.Second) + require.Equal(t, ErrProducerRetryBufferOverflow, producerErr.Err) + require.Equal(t, int32(1), producerErr.Msg.Partition) + waitForProducerBuffer(t, parent, -1, firstBytes) + + contender := newProduceSet(parent) + safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 1, Value: StringEncoder("next")}) + if !parent.muter.tryMute(contender) { + t.Fatal("expected overflowed retry batch to release partition mute") + } + parent.muter.unmute(contender) + + close(retryBP.done) + assertDoneWithin(t, firstDone, 2*time.Second) + waitForProducerBuffer(t, parent, 0, 0) +} + func TestHandleErrorNonIdempotentRetryReleasesMuteOnLeaderError(t *testing.T) { config := NewTestConfig() config.Producer.Idempotent = false @@ -2673,7 +2750,10 @@ func TestHandleErrorNonIdempotentRetryReleasesMuteOnLeaderError(t *testing.T) { errors: make(chan *ProducerError, 1), txnmgr: &transactionManager{}, } - parent.client = &stubLeaderClient{cfg: config, leaderErr: ErrOutOfBrokers} + parent.client = &failingLeaderClient{ + stubLeaderClient: stubLeaderClient{cfg: config}, + err: ErrOutOfBrokers, + } bp := &brokerProducer{ parent: parent, @@ -2704,12 +2784,8 @@ func TestHandleErrorNonIdempotentRetryReleasesMuteOnLeaderError(t *testing.T) { } type stubLeaderClient struct { - cfg *Config - leader *Broker - leaderFunc func(string, int32) (*Broker, error) - leaderErr error - leaders map[string]map[int32]*Broker - refreshMetadata func(...string) error + cfg *Config + leader *Broker } func (c *stubLeaderClient) Config() *Config { return c.cfg } @@ -2721,36 +2797,16 @@ func (c *stubLeaderClient) Topics() ([]string, error) { return func (c *stubLeaderClient) Partitions(string) ([]int32, error) { return nil, nil } func (c *stubLeaderClient) WritablePartitions(string) ([]int32, error) { return nil, nil } func (c *stubLeaderClient) Leader(topic string, partitionID int32) (*Broker, error) { - if c.leaderFunc != nil { - return c.leaderFunc(topic, partitionID) - } - if c.leaderErr != nil { - return nil, c.leaderErr - } - if partitions := c.leaders[topic]; partitions != nil { - if leader := partitions[partitionID]; leader != nil { - return leader, nil - } - } - if c.leaders != nil { - return nil, ErrLeaderNotAvailable - } return c.leader, nil } -func (c *stubLeaderClient) LeaderAndEpoch(topic string, partition int32) (*Broker, int32, error) { - leader, err := c.Leader(topic, partition) - return leader, 0, err -} -func (c *stubLeaderClient) Replicas(string, int32) ([]int32, error) { return nil, nil } -func (c *stubLeaderClient) InSyncReplicas(string, int32) ([]int32, error) { return nil, nil } -func (c *stubLeaderClient) OfflineReplicas(string, int32) ([]int32, error) { return nil, nil } -func (c *stubLeaderClient) RefreshBrokers([]string) error { return nil } -func (c *stubLeaderClient) RefreshMetadata(topics ...string) error { - if c.refreshMetadata != nil { - return c.refreshMetadata(topics...) - } - return nil +func (c *stubLeaderClient) LeaderAndEpoch(string, int32) (*Broker, int32, error) { + return c.leader, 0, nil } +func (c *stubLeaderClient) Replicas(string, int32) ([]int32, error) { return nil, nil } +func (c *stubLeaderClient) InSyncReplicas(string, int32) ([]int32, error) { return nil, nil } +func (c *stubLeaderClient) OfflineReplicas(string, int32) ([]int32, error) { return nil, nil } +func (c *stubLeaderClient) RefreshBrokers([]string) error { return nil } +func (c *stubLeaderClient) RefreshMetadata(...string) error { return nil } func (c *stubLeaderClient) GetOffset(string, int32, int64) (int64, error) { return 0, nil } func (c *stubLeaderClient) Coordinator(string) (*Broker, error) { return nil, nil } func (c *stubLeaderClient) RefreshCoordinator(string) error { return nil } @@ -2762,6 +2818,69 @@ func (c *stubLeaderClient) PartitionNotReadable(string, int32) bool { r func (c *stubLeaderClient) Close() error { return nil } func (c *stubLeaderClient) Closed() bool { return false } +type failingLeaderClient struct { + stubLeaderClient + err error +} + +func (c *failingLeaderClient) Leader(string, int32) (*Broker, error) { + return nil, c.err +} + +func (c *failingLeaderClient) LeaderAndEpoch(string, int32) (*Broker, int32, error) { + return nil, 0, c.err +} + +type blockingRefreshClient struct { + stubLeaderClient + started chan struct{} + release chan struct{} +} + +func (c *blockingRefreshClient) RefreshMetadata(...string) error { + select { + case c.started <- struct{}{}: + default: + } + <-c.release + return nil +} + +type countingRefreshClient struct { + stubLeaderClient + calls chan []string +} + +func (c *countingRefreshClient) RefreshMetadata(topics ...string) error { + c.calls <- append([]string(nil), topics...) + return nil +} + +type routingLeaderClient struct { + stubLeaderClient + leaders map[string]map[int32]*Broker + calls chan []string +} + +func (c *routingLeaderClient) Leader(topic string, partition int32) (*Broker, error) { + if partitions := c.leaders[topic]; partitions != nil { + if leader := partitions[partition]; leader != nil { + return leader, nil + } + } + return nil, ErrLeaderNotAvailable +} + +func (c *routingLeaderClient) LeaderAndEpoch(topic string, partition int32) (*Broker, int32, error) { + leader, err := c.Leader(topic, partition) + return leader, 0, err +} + +func (c *routingLeaderClient) RefreshMetadata(topics ...string) error { + c.calls <- append([]string(nil), topics...) + return nil +} + func testProducerInterceptor( t *testing.T, interceptors []ProducerInterceptor, diff --git a/produce_set_test.go b/produce_set_test.go index ea8086750..359bfd47b 100644 --- a/produce_set_test.go +++ b/produce_set_test.go @@ -26,29 +26,20 @@ func safeAddMessage(t *testing.T, ps *produceSet, msg *ProducerMessage) { } func TestShouldKeepMuted(t *testing.T) { - pSet := &partitionSet{} - if pSet.shouldKeepMuted(1) { - t.Error("empty partition set should not be retryable") - } - - pSet = &partitionSet{ - msgs: []*ProducerMessage{ - {retries: 0}, - {retries: 0}, - }, - } - if !pSet.shouldKeepMuted(1) { - t.Error("expected batch to be retryable when every message is below retry max") - } - - pSet = &partitionSet{ - msgs: []*ProducerMessage{ - {retries: 0}, - {retries: 1}, - }, - } - if pSet.shouldKeepMuted(1) { - t.Error("expected batch not to be retryable when any message has exhausted retries") + if (&partitionSet{}).shouldKeepMuted(1) { + t.Fatal("empty partition set should not be retryable") + } + if !(&partitionSet{msgs: []*ProducerMessage{ + {retries: 0}, + {retries: 0}, + }}).shouldKeepMuted(1) { + t.Fatal("expected batch to be retryable when every message is below retry max") + } + if (&partitionSet{msgs: []*ProducerMessage{ + {retries: 0}, + {retries: 1}, + }}).shouldKeepMuted(1) { + t.Fatal("expected batch not to be retryable when any message has exhausted retries") } } From 2031e626e168a14b367444700442b5b025669e16 Mon Sep 17 00:00:00 2001 From: 3AceShowHand Date: Thu, 25 Jun 2026 22:03:05 +0800 Subject: [PATCH 3/6] Revert "Optimize logs (#24)" This reverts commit dd8ea47b5117d8c751099378fc308bd332659f53. --- async_producer.go | 1 + client.go | 2 ++ 2 files changed, 3 insertions(+) diff --git a/async_producer.go b/async_producer.go index 14c813ace..d6f02bd45 100644 --- a/async_producer.go +++ b/async_producer.go @@ -1206,6 +1206,7 @@ func (bp *brokerProducer) run() { } if bp.accumulatingBatch.wouldOverflow(msg) { + DebugLogger.Printf("producer/broker/%d maximum request accumulated, waiting for space\n", bp.broker.ID()) if err := bp.waitForSpace(msg, false); err != nil { bp.parent.retryMessage(msg, err) continue diff --git a/client.go b/client.go index 71b57b15c..6641743ac 100644 --- a/client.go +++ b/client.go @@ -1103,10 +1103,12 @@ func (client *client) tryRefreshMetadata(topics []string, attemptsRemaining int, return err } // else remove that broker and try again + Logger.Printf("client/metadata got error from broker %d while fetching metadata: %v\n", broker.ID(), err) _ = broker.Close() client.deregisterBroker(broker) } else { // some other error, remove that broker and try again + Logger.Printf("client/metadata got error from broker %d while fetching metadata: %v\n", broker.ID(), err) brokerErrors = append(brokerErrors, err) _ = broker.Close() client.deregisterBroker(broker) From cff8910c6faca23f7ea912a250f2899618ab292a Mon Sep 17 00:00:00 2001 From: 3AceShowHand Date: Thu, 25 Jun 2026 22:03:05 +0800 Subject: [PATCH 4/6] Revert "add more code to enhance the retry (#23)" This reverts commit 7d297873a58582cab761159daf43f7270d37974a. --- async_producer.go | 313 +++------------------ async_producer_test.go | 527 +----------------------------------- config.go | 16 +- functional_producer_test.go | 4 +- produce_set.go | 9 - 5 files changed, 61 insertions(+), 808 deletions(-) diff --git a/async_producer.go b/async_producer.go index d6f02bd45..29bc3a114 100644 --- a/async_producer.go +++ b/async_producer.go @@ -14,7 +14,7 @@ import ( "github.com/rcrowley/go-metrics" ) -// ErrProducerRetryBufferOverflow is returned when producer retry buffering is full and OOM prevention needs to be applied. +// ErrProducerRetryBufferOverflow is returned when the bridging retry buffer is full and OOM prevention needs to be applied. var ErrProducerRetryBufferOverflow = errors.New("retry buffer full: message discarded to prevent buffer overflow") const ( @@ -102,8 +102,6 @@ type asyncProducer struct { brokerRefs map[*brokerProducer]int brokerLock sync.Mutex - retryBuffer retryBufferQuota - txnmgr *transactionManager txLock sync.Mutex @@ -114,14 +112,6 @@ type asyncProducer struct { metricsRegistry metrics.Registry } -type retryBufferQuota struct { - // messages and bytes track producer-level retry buffer occupancy when - // Producer.Retry.MaxBufferLength or MaxBufferBytes is bounded. - messages int64 - bytes int64 - mu sync.Mutex -} - type partitionMuter struct { mu sync.Mutex cond *sync.Cond @@ -801,13 +791,6 @@ func (pp *partitionProducer) backoff(retries int) { } func (p *asyncProducer) backoff(retries int) { - backoff := p.retryBackoff(retries) - if backoff > 0 { - time.Sleep(backoff) - } -} - -func (p *asyncProducer) retryBackoff(retries int) time.Duration { var backoff time.Duration if p.conf.Producer.Retry.BackoffFunc != nil { maxRetries := p.conf.Producer.Retry.Max @@ -815,37 +798,8 @@ func (p *asyncProducer) retryBackoff(retries int) time.Duration { } else { backoff = p.conf.Producer.Retry.Backoff } - return backoff -} - -func (p *asyncProducer) shuttingDown() bool { - if p.shutdownCh == nil { - return false - } - select { - case <-p.shutdownCh: - return true - default: - return false - } -} - -func (p *asyncProducer) backoffOrDone(retries int) bool { - backoff := p.retryBackoff(retries) - if backoff <= 0 { - return !p.shuttingDown() - } - timer := time.NewTimer(backoff) - defer timer.Stop() - if p.shutdownCh == nil { - <-timer.C - return true - } - select { - case <-timer.C: - return true - case <-p.shutdownCh: - return false + if backoff > 0 { + time.Sleep(backoff) } } @@ -1494,18 +1448,12 @@ func (bp *brokerProducer) handleSuccess(sent *produceSet, response *ProduceRespo func (p *asyncProducer) retryBatch(topic string, partition int32, pSet *partitionSet, retryErr error, alreadyMuted bool) { Logger.Printf("Retrying batch for %v-%d because of %v\n", topic, partition, retryErr) produceSet := newProduceSet(p) - produceSet.addPartitionSet(topic, partition, pSet) - bufferMessages, bufferBytes := int64(produceSet.bufferCount), int64(produceSet.bufferBytes) - bufferReserved := false - releaseBuffer := func() { - if bufferReserved { - p.releaseBuffer(bufferMessages, bufferBytes) - bufferReserved = false - } - } + produceSet.msgs[topic] = make(map[int32]*partitionSet) + produceSet.msgs[topic][partition] = pSet + produceSet.bufferBytes += pSet.bufferBytes + produceSet.bufferCount += len(pSet.msgs) muted := alreadyMuted failBatch := func(err error) { - releaseBuffer() // Release the partition before reporting errors so a blocked Errors // consumer cannot keep later same-partition messages muted. if muted { @@ -1523,17 +1471,12 @@ func (p *asyncProducer) retryBatch(topic string, partition int32, pSet *partitio } msg.retries++ } - if !p.reserveBuffer(bufferMessages, bufferBytes) { - failBatch(ErrProducerRetryBufferOverflow) - return - } - bufferReserved = p.retryBufferBounded() && (bufferMessages != 0 || bufferBytes != 0) - // Honor Producer.Retry.Backoff between retry attempts (#2469). retryBatch - // dispatches the produceSet directly to the broker, bypassing partitionProducer.dispatch. - if len(pSet.msgs) > 0 && !p.backoffOrDone(pSet.msgs[0].retries) { - failBatch(ErrShuttingDown) - return + // honor Producer.Retry.Backoff between retry attempts (#2469); the + // non-idempotent path gets this from partitionProducer.dispatch, but + // retryBatch dispatches the produceSet directly to the broker + if len(pSet.msgs) > 0 { + p.backoff(pSet.msgs[0].retries) } // it's expected that a metadata refresh has been requested prior to calling retryBatch @@ -1556,7 +1499,6 @@ func (p *asyncProducer) retryBatch(topic string, partition int32, pSet *partitio failBatch(ErrShuttingDown) return } - releaseBuffer() } type partitionBatchRetry struct { @@ -1565,213 +1507,30 @@ type partitionBatchRetry struct { pSet *partitionSet } -func (p *asyncProducer) failMutedBatch(batch partitionBatchRetry, err error) { - produceSet := newProduceSet(p) - produceSet.addPartitionSet(batch.topic, batch.partition, batch.pSet) - p.muter.unmute(produceSet) - for _, msg := range batch.pSet.msgs { - p.returnError(msg, err) - } -} - -func (p *asyncProducer) failMutedBatches(batches []partitionBatchRetry, err error) { - for _, batch := range batches { - p.failMutedBatch(batch, err) - } -} - -func (p *asyncProducer) retryBufferLimits() (int64, int64) { - maxBufferLength := p.conf.Producer.Retry.MaxBufferLength - if 0 < maxBufferLength && maxBufferLength < minFunctionalRetryBufferLength { - maxBufferLength = minFunctionalRetryBufferLength - } - - maxBufferBytes := p.conf.Producer.Retry.MaxBufferBytes - if 0 < maxBufferBytes && maxBufferBytes < minFunctionalRetryBufferBytes { - maxBufferBytes = minFunctionalRetryBufferBytes - } - - return int64(maxBufferLength), maxBufferBytes -} - -func (p *asyncProducer) retryBufferBounded() bool { - maxBufferLength, maxBufferBytes := p.retryBufferLimits() - return maxBufferLength > 0 || maxBufferBytes > 0 -} - -func (p *asyncProducer) producerMessageByteSizeVersion() int { - if p.conf.Version.IsAtLeast(V0_11_0_0) { - return 2 - } - return 1 -} - -func (p *asyncProducer) bufferWouldOverflow(messages, bytes int64) bool { - maxBufferLength, maxBufferBytes := p.retryBufferLimits() - return (maxBufferLength > 0 && messages >= maxBufferLength) || - (maxBufferBytes > 0 && bytes >= maxBufferBytes) -} - -func (p *asyncProducer) reserveBuffer(messages, bytes int64) bool { - if messages == 0 && bytes == 0 { - return true - } - if !p.retryBufferBounded() { - return true - } - p.retryBuffer.mu.Lock() - defer p.retryBuffer.mu.Unlock() - - if p.bufferWouldOverflow(p.retryBuffer.messages+messages, p.retryBuffer.bytes+bytes) { - return false - } - p.retryBuffer.messages += messages - p.retryBuffer.bytes += bytes - return true -} - -func (p *asyncProducer) addToBuffer(messages, bytes int64) bool { - if messages == 0 && bytes == 0 { - return false - } - if !p.retryBufferBounded() { - return false - } - p.retryBuffer.mu.Lock() - defer p.retryBuffer.mu.Unlock() - - p.retryBuffer.messages += messages - p.retryBuffer.bytes += bytes - return p.bufferWouldOverflow(p.retryBuffer.messages, p.retryBuffer.bytes) -} - -func (p *asyncProducer) releaseBuffer(messages, bytes int64) { - if messages == 0 && bytes == 0 { - return - } - if !p.retryBufferBounded() { - return - } - p.retryBuffer.mu.Lock() - defer p.retryBuffer.mu.Unlock() - - p.retryBuffer.messages -= messages - p.retryBuffer.bytes -= bytes -} - // retryBatchesAfterRefresh keeps metadata refresh out of brokerProducer.handleError. // Connection errors already hold partition mutes; doing the refresh in the // response loop can block all progress for that broker while the cluster is // unstable. Refresh once for the failed request so multi-partition batches do -// not amplify controller-fail metadata traffic. Group retry partitions by their -// refreshed leader broker, but hand them off from this retry worker so large -// clusters do not create one temporary goroutine per target broker. -// -// This is intentionally not implemented as a loop over retryBatch. retryBatch -// is the single-partition direct retry primitive; this path owns all muted -// partitions from one failed ProduceRequest and must retry, fail, reserve -// buffer, back off, and refresh metadata at that failed-request boundary. +// not amplify controller-fail metadata traffic, then retry partitions +// independently so one blocked broker handoff cannot hold up the rest. func (p *asyncProducer) retryBatchesAfterRefresh(topics []string, batches []partitionBatchRetry, retryErr error) { - if p.shuttingDown() { - p.failMutedBatches(batches, ErrShuttingDown) - return - } - - var retryable []partitionBatchRetry - maxRetryAttempt := 0 - for _, batch := range batches { - if !batch.pSet.shouldKeepMuted(p.conf.Producer.Retry.Max) { - p.failMutedBatch(batch, retryErr) - continue - } - for _, msg := range batch.pSet.msgs { - msg.retries++ - if msg.retries > maxRetryAttempt { - maxRetryAttempt = msg.retries + if p.shutdownCh != nil { + select { + case <-p.shutdownCh: + for _, batch := range batches { + go p.retryBatch(batch.topic, batch.partition, batch.pSet, retryErr, true) } + return + default: } - retryable = append(retryable, batch) - } - if len(retryable) == 0 { - return - } - - // Reserve before metadata refresh so refresh-blocked direct retries still - // count against the producer-level retry buffer budget. - var bufferMessages, bufferBytes int64 - for _, batch := range retryable { - bufferMessages += int64(len(batch.pSet.msgs)) - bufferBytes += int64(batch.pSet.bufferBytes) - } - if !p.reserveBuffer(bufferMessages, bufferBytes) { - p.failMutedBatches(retryable, ErrProducerRetryBufferOverflow) - return - } - - if p.shuttingDown() { - p.releaseBuffer(bufferMessages, bufferBytes) - p.failMutedBatches(retryable, ErrShuttingDown) - return } if len(topics) > 0 { if err := p.client.RefreshMetadata(topics...); err != nil { Logger.Printf("Failed refreshing metadata because of %v\n", err) } } - if p.shuttingDown() { - p.releaseBuffer(bufferMessages, bufferBytes) - p.failMutedBatches(retryable, ErrShuttingDown) - return - } - - if !p.backoffOrDone(maxRetryAttempt) { - p.releaseBuffer(bufferMessages, bufferBytes) - p.failMutedBatches(retryable, ErrShuttingDown) - return - } - - brokerSets := make(map[*Broker]*produceSet) - for _, batch := range retryable { - if p.shuttingDown() { - batchMessages, batchBytes := int64(len(batch.pSet.msgs)), int64(batch.pSet.bufferBytes) - p.releaseBuffer(batchMessages, batchBytes) - p.failMutedBatch(batch, ErrShuttingDown) - continue - } - leader, leaderErr := p.client.Leader(batch.topic, batch.partition) - if leaderErr != nil { - Logger.Printf("Failed retrying batch for %v-%d because of %v while looking up for new leader\n", batch.topic, batch.partition, leaderErr) - batchMessages, batchBytes := int64(len(batch.pSet.msgs)), int64(batch.pSet.bufferBytes) - p.releaseBuffer(batchMessages, batchBytes) - p.failMutedBatch(batch, retryErr) - continue - } - set := brokerSets[leader] - if set == nil { - set = newProduceSet(p) - brokerSets[leader] = set - } - set.addPartitionSet(batch.topic, batch.partition, batch.pSet) - } - - for leader, set := range brokerSets { - p.handoffRetrySet(leader, set) - } -} - -func (p *asyncProducer) handoffRetrySet(leader *Broker, set *produceSet) { - bp := p.getBrokerProducer(leader) - accepted := p.sendRetryBatch(bp, set) - p.unrefBrokerProducer(leader, bp) - setMessages, setBytes := int64(set.bufferCount), int64(set.bufferBytes) - p.releaseBuffer(setMessages, setBytes) - if !accepted { - p.muter.unmute(set) - set.eachPartition(func(_ string, _ int32, pSet *partitionSet) { - for _, msg := range pSet.msgs { - p.returnError(msg, ErrShuttingDown) - } - }) + for _, batch := range batches { + go p.retryBatch(batch.topic, batch.partition, batch.pSet, retryErr, true) } } @@ -1874,8 +1633,22 @@ func (bp *brokerProducer) handleError(sent *produceSet, err error) { // effectively a "bridge" between the flushers and the dispatcher in order to avoid deadlock // based on https://godoc.org/github.com/eapache/channels#InfiniteChannel func (p *asyncProducer) retryHandler() { - version := p.producerMessageByteSizeVersion() + maxBufferLength := p.conf.Producer.Retry.MaxBufferLength + if 0 < maxBufferLength && maxBufferLength < minFunctionalRetryBufferLength { + maxBufferLength = minFunctionalRetryBufferLength + } + + maxBufferBytes := p.conf.Producer.Retry.MaxBufferBytes + if 0 < maxBufferBytes && maxBufferBytes < minFunctionalRetryBufferBytes { + maxBufferBytes = minFunctionalRetryBufferBytes + } + + version := 1 + if p.conf.Version.IsAtLeast(V0_11_0_0) { + version = 2 + } + var currentByteSize int64 var msg *ProducerMessage buf := queue.New() @@ -1887,7 +1660,7 @@ func (p *asyncProducer) retryHandler() { case msg = <-p.retries: case p.input <- buf.Peek().(*ProducerMessage): msgToRemove := buf.Remove().(*ProducerMessage) - p.releaseBuffer(1, int64(msgToRemove.ByteSize(version))) + currentByteSize -= int64(msgToRemove.ByteSize(version)) continue } } @@ -1897,9 +1670,9 @@ func (p *asyncProducer) retryHandler() { } buf.Add(msg) - bufferOverflow := p.addToBuffer(1, int64(msg.ByteSize(version))) + currentByteSize += int64(msg.ByteSize(version)) - if !bufferOverflow { + if (maxBufferLength <= 0 || buf.Length() < maxBufferLength) && (maxBufferBytes <= 0 || currentByteSize < maxBufferBytes) { continue } @@ -1908,10 +1681,10 @@ func (p *asyncProducer) retryHandler() { select { case p.input <- msgToHandle: buf.Remove() - p.releaseBuffer(1, int64(msgToHandle.ByteSize(version))) + currentByteSize -= int64(msgToHandle.ByteSize(version)) default: buf.Remove() - p.releaseBuffer(1, int64(msgToHandle.ByteSize(version))) + currentByteSize -= int64(msgToHandle.ByteSize(version)) p.returnError(msgToHandle, ErrProducerRetryBufferOverflow) } } diff --git a/async_producer_test.go b/async_producer_test.go index a3c220f63..4bb9cd87a 100644 --- a/async_producer_test.go +++ b/async_producer_test.go @@ -1635,51 +1635,6 @@ func assertDoneWithin[T any](t *testing.T, ch <-chan T, timeout time.Duration) T } } -func waitForProducerBuffer(t *testing.T, parent *asyncProducer, messages, bytes int64) { - t.Helper() - deadline := time.After(2 * time.Second) - ticker := time.NewTicker(time.Millisecond) - defer ticker.Stop() - for { - parent.retryBuffer.mu.Lock() - gotMessages := parent.retryBuffer.messages - gotBytes := parent.retryBuffer.bytes - parent.retryBuffer.mu.Unlock() - if (messages < 0 || gotMessages == messages) && (bytes < 0 || gotBytes == bytes) { - return - } - select { - case <-deadline: - t.Fatalf("timed out waiting for buffer budget messages=%d bytes=%d, got messages=%d bytes=%d", - messages, bytes, gotMessages, gotBytes) - case <-ticker.C: - } - } -} - -func newBlockingRetryProducer(config *Config, errorBuffer int) (*asyncProducer, *brokerProducer) { - parent := &asyncProducer{ - conf: config, - muter: newPartitionMuter(), - brokers: make(map[*Broker]*brokerProducer), - brokerRefs: make(map[*brokerProducer]int), - errors: make(chan *ProducerError, errorBuffer), - shutdownCh: make(chan struct{}), - txnmgr: &transactionManager{}, - } - leader := &Broker{id: 1} - parent.client = &stubLeaderClient{leader: leader, cfg: config} - retryBP := &brokerProducer{ - parent: parent, - broker: leader, - output: make(chan *produceSet), - input: make(chan *ProducerMessage), - done: make(chan struct{}), - } - parent.brokers[leader] = retryBP - return parent, retryBP -} - // TestBrokerProducerWaitForSpaceRespectsExternalUnmute ensures waitForSpace does not // deadlock when partitions are muted by another producer and are unmuted elsewhere. func TestBrokerProducerWaitForSpaceRespectsExternalUnmute(t *testing.T) { @@ -1952,7 +1907,6 @@ func TestBrokerProducerRollOverClearsTimer(t *testing.T) { func TestRetryBatchRespectsPartitionMuter(t *testing.T) { config := NewTestConfig() config.Producer.Idempotent = true - config.Producer.Retry.MaxBufferLength = minFunctionalRetryBufferLength txnMgr := &transactionManager{ producerID: 0, producerEpoch: 0, @@ -1995,7 +1949,6 @@ func TestRetryBatchRespectsPartitionMuter(t *testing.T) { default: t.Fatal("expected retry batch to be dispatched") } - waitForProducerBuffer(t, parent, 0, 0) contender := newProduceSet(parent) safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("next")}) @@ -2065,61 +2018,11 @@ func TestHandleErrorNonIdempotentRetryKeepsPartitionMuted(t *testing.T) { } } -func TestHandleErrorNonIdempotentRetryFailsWholeBatch(t *testing.T) { - config := NewTestConfig() - config.Producer.Idempotent = false - config.Producer.Retry.Max = 1 - config.Producer.Return.Errors = true - - parent := &asyncProducer{ - conf: config, - muter: newPartitionMuter(), - errors: make(chan *ProducerError, 2), - retries: make(chan *ProducerMessage, 2), - txnmgr: &transactionManager{}, - } - - bp := &brokerProducer{ - parent: parent, - broker: &Broker{id: 1}, - accumulatingBatch: newProduceSet(parent), - currentRetries: make(map[string]map[int32]error), - } - - exhausted := &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("exhausted"), retries: 1} - retryable := &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("retryable")} - sent := newProduceSet(parent) - safeAddMessage(t, sent, exhausted) - safeAddMessage(t, sent, retryable) - if !parent.muter.tryMute(sent) { - t.Fatal("expected sent batch to mute partitions") - } - parent.inFlight.Add(2) - - bp.handleError(sent, ErrOutOfBrokers) - - firstErr := assertDoneWithin(t, parent.errors, 2*time.Second) - secondErr := assertDoneWithin(t, parent.errors, 2*time.Second) - require.Equal(t, exhausted, firstErr.Msg) - require.Equal(t, ErrOutOfBrokers, firstErr.Err) - require.Equal(t, retryable, secondErr.Msg) - require.Equal(t, ErrOutOfBrokers, secondErr.Err) - assertNotDone(t, parent.retries, 50*time.Millisecond) - - contender := newProduceSet(parent) - safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("next")}) - if !parent.muter.tryMute(contender) { - t.Fatal("expected partition mute to be released after whole-batch failure") - } - parent.muter.unmute(contender) -} - func TestHandleErrorNonIdempotentRetryDoesNotWaitForMetadataRefresh(t *testing.T) { config := NewTestConfig() config.Producer.Idempotent = false config.Producer.Retry.Max = 1 config.Producer.Retry.Backoff = 0 - config.Producer.Retry.MaxBufferLength = minFunctionalRetryBufferLength parent := &asyncProducer{ conf: config, @@ -2168,7 +2071,6 @@ func TestHandleErrorNonIdempotentRetryDoesNotWaitForMetadataRefresh(t *testing.T }() assertDoneWithin(t, done, 2*time.Second) assertDoneWithin(t, client.started, 2*time.Second) - waitForProducerBuffer(t, parent, 1, -1) contender := newProduceSet(parent) safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("next")}) @@ -2181,7 +2083,6 @@ func TestHandleErrorNonIdempotentRetryDoesNotWaitForMetadataRefresh(t *testing.T retrySet := assertDoneWithin(t, output, 2*time.Second) defer parent.muter.unmute(retrySet) require.Equal(t, retryPartitionSet, retrySet.msgs["topic"][0]) - waitForProducerBuffer(t, parent, 0, 0) } func TestHandleErrorNonIdempotentRetryRefreshesMetadataOncePerFailedRequest(t *testing.T) { @@ -2235,112 +2136,23 @@ func TestHandleErrorNonIdempotentRetryRefreshesMetadataOncePerFailedRequest(t *t bp.handleError(sent, ErrOutOfBrokers) require.Equal(t, []string{"topic"}, assertDoneWithin(t, client.calls, 2*time.Second)) - retrySet := assertDoneWithin(t, output, 2*time.Second) - defer parent.muter.unmute(retrySet) + retrySet0 := assertDoneWithin(t, output, 2*time.Second) + defer parent.muter.unmute(retrySet0) + retrySet1 := assertDoneWithin(t, output, 2*time.Second) + defer parent.muter.unmute(retrySet1) assertNotDone(t, client.calls, 50*time.Millisecond) - assertNotDone(t, output, 50*time.Millisecond) retriedPartitions := make(map[int32]*partitionSet) - retrySet.eachPartition(func(_ string, partition int32, pSet *partitionSet) { + retrySet0.eachPartition(func(_ string, partition int32, pSet *partitionSet) { + retriedPartitions[partition] = pSet + }) + retrySet1.eachPartition(func(_ string, partition int32, pSet *partitionSet) { retriedPartitions[partition] = pSet }) require.Same(t, retryPartitionSet0, retriedPartitions[int32(0)]) require.Same(t, retryPartitionSet1, retriedPartitions[int32(1)]) } -func TestHandleErrorNonIdempotentRetryGroupsBatchesByLeader(t *testing.T) { - config := NewTestConfig() - config.Producer.Idempotent = false - config.Producer.Retry.Max = 1 - config.Producer.Retry.Backoff = 0 - - parent := &asyncProducer{ - conf: config, - muter: newPartitionMuter(), - brokers: make(map[*Broker]*brokerProducer), - brokerRefs: make(map[*brokerProducer]int), - txnmgr: &transactionManager{}, - } - failedBroker := &Broker{id: 1} - leaderA := &Broker{id: 2} - leaderB := &Broker{id: 3} - client := &routingLeaderClient{ - stubLeaderClient: stubLeaderClient{cfg: config}, - leaders: map[string]map[int32]*Broker{ - "topic-a": { - 0: leaderA, - 1: leaderA, - }, - "topic-b": { - 0: leaderB, - 1: leaderB, - }, - }, - calls: make(chan []string, 1), - } - parent.client = client - - outputA := make(chan *produceSet, 1) - outputB := make(chan *produceSet, 1) - retryBPA := &brokerProducer{ - parent: parent, - broker: leaderA, - output: outputA, - input: make(chan *ProducerMessage), - } - retryBPB := &brokerProducer{ - parent: parent, - broker: leaderB, - output: outputB, - input: make(chan *ProducerMessage), - } - parent.brokers[leaderA] = retryBPA - parent.brokers[leaderB] = retryBPB - - bp := &brokerProducer{ - parent: parent, - broker: failedBroker, - input: make(chan *ProducerMessage), - accumulatingBatch: newProduceSet(parent), - currentRetries: make(map[string]map[int32]error), - } - - sent := newProduceSet(parent) - safeAddMessage(t, sent, &ProducerMessage{Topic: "topic-a", Partition: 0, Value: StringEncoder("a-0")}) - safeAddMessage(t, sent, &ProducerMessage{Topic: "topic-a", Partition: 1, Value: StringEncoder("a-1")}) - safeAddMessage(t, sent, &ProducerMessage{Topic: "topic-b", Partition: 0, Value: StringEncoder("b-0")}) - safeAddMessage(t, sent, &ProducerMessage{Topic: "topic-b", Partition: 1, Value: StringEncoder("b-1")}) - if !parent.muter.tryMute(sent) { - t.Fatal("expected sent batch to mute partitions") - } - - bp.handleError(sent, ErrOutOfBrokers) - - require.ElementsMatch(t, []string{"topic-a", "topic-b"}, assertDoneWithin(t, client.calls, 2*time.Second)) - retrySetA := assertDoneWithin(t, outputA, 2*time.Second) - defer parent.muter.unmute(retrySetA) - retrySetB := assertDoneWithin(t, outputB, 2*time.Second) - defer parent.muter.unmute(retrySetB) - - require.Len(t, retrySetA.msgs, 1) - require.Len(t, retrySetA.msgs["topic-a"], 2) - require.Len(t, retrySetB.msgs, 1) - require.Len(t, retrySetB.msgs["topic-b"], 2) - for _, partitions := range retrySetA.msgs { - for _, pSet := range partitions { - require.Equal(t, 1, pSet.msgs[0].retries) - } - } - for _, partitions := range retrySetB.msgs { - for _, pSet := range partitions { - require.Equal(t, 1, pSet.msgs[0].retries) - } - } - - assertNotDone(t, outputA, 50*time.Millisecond) - assertNotDone(t, outputB, 50*time.Millisecond) -} - func TestHandleSuccessNonIdempotentRetryUsesPartitionProducer(t *testing.T) { config := NewTestConfig() config.Producer.Idempotent = false @@ -2391,7 +2203,6 @@ func TestRetryBatchReleasesMuteWhenHandoffAbortedByShutdown(t *testing.T) { config.Producer.Retry.Max = 1 config.Producer.Retry.Backoff = 0 config.Producer.Return.Errors = true - config.Producer.Retry.MaxBufferLength = minFunctionalRetryBufferLength parent := &asyncProducer{ conf: config, @@ -2420,187 +2231,16 @@ func TestRetryBatchReleasesMuteWhenHandoffAbortedByShutdown(t *testing.T) { t.Fatal("expected sent batch to mute partitions") } parent.inFlight.Add(1) - - done := make(chan struct{}) - go func() { - parent.retryBatch("topic", 0, retryPartitionSet, ErrOutOfBrokers, true) - close(done) - }() - waitForProducerBuffer(t, parent, 1, -1) - if parent.shutdownChClosed.CompareAndSwap(false, true) { close(parent.shutdownCh) } - producerErr := assertDoneWithin(t, parent.errors, 2*time.Second) - require.Equal(t, ErrShuttingDown, producerErr.Err) - assertDoneWithin(t, done, 2*time.Second) - waitForProducerBuffer(t, parent, 0, 0) - - contender := newProduceSet(parent) - safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("next")}) - if !parent.muter.tryMute(contender) { - t.Fatal("expected partition mute to be released after aborted retry handoff") - } - parent.muter.unmute(contender) -} - -func TestRetryBatchUsesSharedBufferLengthBudget(t *testing.T) { - config := NewTestConfig() - config.Producer.Idempotent = true - config.Producer.Retry.Max = 1 - config.Producer.Retry.Backoff = 0 - config.Producer.Return.Errors = true - config.Producer.Retry.MaxBufferLength = minFunctionalRetryBufferLength + 1 - - parent, retryBP := newBlockingRetryProducer(config, minFunctionalRetryBufferLength+1) - - first := newProduceSet(parent) - for i := 0; i < minFunctionalRetryBufferLength; i++ { - safeAddMessage(t, first, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("retry")}) - } - firstPartitionSet := first.msgs["topic"][0] - if !parent.muter.tryMute(first) { - t.Fatal("expected first retry batch to mute partitions") - } - parent.inFlight.Add(minFunctionalRetryBufferLength) - - firstDone := make(chan struct{}) - go func() { - parent.retryBatch("topic", 0, firstPartitionSet, ErrNotEnoughReplicas, true) - close(firstDone) - }() - waitForProducerBuffer(t, parent, minFunctionalRetryBufferLength, -1) - - second := newProduceSet(parent) - safeAddMessage(t, second, &ProducerMessage{Topic: "topic", Partition: 1, Value: StringEncoder("overflow")}) - secondPartitionSet := second.msgs["topic"][1] - if !parent.muter.tryMute(second) { - t.Fatal("expected second retry batch to mute partitions") - } - parent.inFlight.Add(1) - parent.retryBatch("topic", 1, secondPartitionSet, ErrNotEnoughReplicas, true) - - producerErr := assertDoneWithin(t, parent.errors, 2*time.Second) - require.Equal(t, ErrProducerRetryBufferOverflow, producerErr.Err) - require.Equal(t, int32(1), producerErr.Msg.Partition) - waitForProducerBuffer(t, parent, minFunctionalRetryBufferLength, -1) - - contender := newProduceSet(parent) - safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 1, Value: StringEncoder("next")}) - if !parent.muter.tryMute(contender) { - t.Fatal("expected overflowed retry batch to release partition mute") - } - parent.muter.unmute(contender) - - close(retryBP.done) - assertDoneWithin(t, firstDone, 2*time.Second) - waitForProducerBuffer(t, parent, 0, 0) -} - -func TestRetryBatchUsesSharedBufferBytesBudget(t *testing.T) { - config := NewTestConfig() - config.Producer.Idempotent = true - config.Producer.Retry.Max = 1 - config.Producer.Retry.Backoff = 0 - config.Producer.Return.Errors = true - - parent, retryBP := newBlockingRetryProducer(config, 2) - - firstMsg := &ProducerMessage{Topic: "topic", Partition: 0, Value: ByteEncoder(make([]byte, minFunctionalRetryBufferBytes))} - secondMsg := &ProducerMessage{Topic: "topic", Partition: 1, Value: StringEncoder("overflow")} - version := parent.producerMessageByteSizeVersion() - firstBytes := int64(firstMsg.ByteSize(version)) - secondBytes := int64(secondMsg.ByteSize(version)) - config.Producer.Retry.MaxBufferBytes = firstBytes + secondBytes - - first := newProduceSet(parent) - safeAddMessage(t, first, firstMsg) - firstPartitionSet := first.msgs["topic"][0] - if !parent.muter.tryMute(first) { - t.Fatal("expected first retry batch to mute partitions") - } - parent.inFlight.Add(1) - - firstDone := make(chan struct{}) - go func() { - parent.retryBatch("topic", 0, firstPartitionSet, ErrNotEnoughReplicas, true) - close(firstDone) - }() - waitForProducerBuffer(t, parent, -1, firstBytes) - - second := newProduceSet(parent) - safeAddMessage(t, second, secondMsg) - secondPartitionSet := second.msgs["topic"][1] - if !parent.muter.tryMute(second) { - t.Fatal("expected second retry batch to mute partitions") - } - parent.inFlight.Add(1) - parent.retryBatch("topic", 1, secondPartitionSet, ErrNotEnoughReplicas, true) - - producerErr := assertDoneWithin(t, parent.errors, 2*time.Second) - require.Equal(t, ErrProducerRetryBufferOverflow, producerErr.Err) - require.Equal(t, int32(1), producerErr.Msg.Partition) - waitForProducerBuffer(t, parent, -1, firstBytes) - - contender := newProduceSet(parent) - safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 1, Value: StringEncoder("next")}) - if !parent.muter.tryMute(contender) { - t.Fatal("expected overflowed retry batch to release partition mute") - } - parent.muter.unmute(contender) - - close(retryBP.done) - assertDoneWithin(t, firstDone, 2*time.Second) - waitForProducerBuffer(t, parent, 0, 0) -} - -func TestRetryBatchesAfterRefreshReleasesMuteWhenBrokerProducerDone(t *testing.T) { - config := NewTestConfig() - config.Producer.Idempotent = false - config.Producer.Retry.Max = 1 - config.Producer.Retry.Backoff = 0 - config.Producer.Return.Errors = true - - parent := &asyncProducer{ - conf: config, - muter: newPartitionMuter(), - brokers: make(map[*Broker]*brokerProducer), - brokerRefs: make(map[*brokerProducer]int), - errors: make(chan *ProducerError, 1), - shutdownCh: make(chan struct{}), - txnmgr: &transactionManager{}, - } - leader := &Broker{id: 1} - parent.client = &stubLeaderClient{leader: leader, cfg: config} - retryBP := &brokerProducer{ - parent: parent, - broker: leader, - output: make(chan *produceSet), - input: make(chan *ProducerMessage), - done: make(chan struct{}), - } - parent.brokers[leader] = retryBP - - sent := newProduceSet(parent) - safeAddMessage(t, sent, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("retry")}) - retryPartitionSet := sent.msgs["topic"][0] - if !parent.muter.tryMute(sent) { - t.Fatal("expected sent batch to mute partitions") - } - parent.inFlight.Add(1) - done := make(chan struct{}) go func() { - parent.retryBatchesAfterRefresh([]string{"topic"}, []partitionBatchRetry{{ - topic: "topic", - partition: 0, - pSet: retryPartitionSet, - }}, ErrOutOfBrokers) + parent.retryBatch("topic", 0, retryPartitionSet, ErrOutOfBrokers, true) close(done) }() - close(retryBP.done) producerErr := assertDoneWithin(t, parent.errors, 2*time.Second) require.Equal(t, ErrShuttingDown, producerErr.Err) assertDoneWithin(t, done, 2*time.Second) @@ -2608,131 +2248,9 @@ func TestRetryBatchesAfterRefreshReleasesMuteWhenBrokerProducerDone(t *testing.T contender := newProduceSet(parent) safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("next")}) if !parent.muter.tryMute(contender) { - t.Fatal("expected partition mute to be released after brokerProducer done") - } - parent.muter.unmute(contender) -} - -func TestRetryBatchesAfterRefreshUsesSharedBufferLengthBudget(t *testing.T) { - config := NewTestConfig() - config.Producer.Idempotent = false - config.Producer.Retry.Max = 1 - config.Producer.Retry.Backoff = 0 - config.Producer.Return.Errors = true - config.Producer.Retry.MaxBufferLength = minFunctionalRetryBufferLength + 1 - - parent, retryBP := newBlockingRetryProducer(config, minFunctionalRetryBufferLength+1) - - first := newProduceSet(parent) - for i := 0; i < minFunctionalRetryBufferLength; i++ { - safeAddMessage(t, first, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("retry")}) - } - if !parent.muter.tryMute(first) { - t.Fatal("expected first retry batch to mute partitions") - } - parent.inFlight.Add(minFunctionalRetryBufferLength) - - firstDone := make(chan struct{}) - go func() { - parent.retryBatchesAfterRefresh([]string{"topic"}, []partitionBatchRetry{{ - topic: "topic", - partition: 0, - pSet: first.msgs["topic"][0], - }}, ErrOutOfBrokers) - close(firstDone) - }() - waitForProducerBuffer(t, parent, minFunctionalRetryBufferLength, -1) - - second := newProduceSet(parent) - safeAddMessage(t, second, &ProducerMessage{Topic: "topic", Partition: 1, Value: StringEncoder("overflow")}) - if !parent.muter.tryMute(second) { - t.Fatal("expected second retry batch to mute partitions") - } - parent.inFlight.Add(1) - parent.retryBatchesAfterRefresh([]string{"topic"}, []partitionBatchRetry{{ - topic: "topic", - partition: 1, - pSet: second.msgs["topic"][1], - }}, ErrOutOfBrokers) - - producerErr := assertDoneWithin(t, parent.errors, 2*time.Second) - require.Equal(t, ErrProducerRetryBufferOverflow, producerErr.Err) - require.Equal(t, int32(1), producerErr.Msg.Partition) - waitForProducerBuffer(t, parent, minFunctionalRetryBufferLength, -1) - - contender := newProduceSet(parent) - safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 1, Value: StringEncoder("next")}) - if !parent.muter.tryMute(contender) { - t.Fatal("expected overflowed retry batch to release partition mute") - } - parent.muter.unmute(contender) - - close(retryBP.done) - assertDoneWithin(t, firstDone, 2*time.Second) - waitForProducerBuffer(t, parent, 0, 0) -} - -func TestRetryBatchesAfterRefreshUsesSharedBufferBytesBudget(t *testing.T) { - config := NewTestConfig() - config.Producer.Idempotent = false - config.Producer.Retry.Max = 1 - config.Producer.Retry.Backoff = 0 - config.Producer.Return.Errors = true - - parent, retryBP := newBlockingRetryProducer(config, 2) - - firstMsg := &ProducerMessage{Topic: "topic", Partition: 0, Value: ByteEncoder(make([]byte, minFunctionalRetryBufferBytes))} - secondMsg := &ProducerMessage{Topic: "topic", Partition: 1, Value: StringEncoder("overflow")} - version := parent.producerMessageByteSizeVersion() - firstBytes := int64(firstMsg.ByteSize(version)) - secondBytes := int64(secondMsg.ByteSize(version)) - config.Producer.Retry.MaxBufferBytes = firstBytes + secondBytes - - first := newProduceSet(parent) - safeAddMessage(t, first, firstMsg) - if !parent.muter.tryMute(first) { - t.Fatal("expected first retry batch to mute partitions") - } - parent.inFlight.Add(1) - - firstDone := make(chan struct{}) - go func() { - parent.retryBatchesAfterRefresh([]string{"topic"}, []partitionBatchRetry{{ - topic: "topic", - partition: 0, - pSet: first.msgs["topic"][0], - }}, ErrOutOfBrokers) - close(firstDone) - }() - waitForProducerBuffer(t, parent, -1, firstBytes) - - second := newProduceSet(parent) - safeAddMessage(t, second, secondMsg) - if !parent.muter.tryMute(second) { - t.Fatal("expected second retry batch to mute partitions") - } - parent.inFlight.Add(1) - parent.retryBatchesAfterRefresh([]string{"topic"}, []partitionBatchRetry{{ - topic: "topic", - partition: 1, - pSet: second.msgs["topic"][1], - }}, ErrOutOfBrokers) - - producerErr := assertDoneWithin(t, parent.errors, 2*time.Second) - require.Equal(t, ErrProducerRetryBufferOverflow, producerErr.Err) - require.Equal(t, int32(1), producerErr.Msg.Partition) - waitForProducerBuffer(t, parent, -1, firstBytes) - - contender := newProduceSet(parent) - safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 1, Value: StringEncoder("next")}) - if !parent.muter.tryMute(contender) { - t.Fatal("expected overflowed retry batch to release partition mute") + t.Fatal("expected partition mute to be released after aborted retry handoff") } parent.muter.unmute(contender) - - close(retryBP.done) - assertDoneWithin(t, firstDone, 2*time.Second) - waitForProducerBuffer(t, parent, 0, 0) } func TestHandleErrorNonIdempotentRetryReleasesMuteOnLeaderError(t *testing.T) { @@ -2856,31 +2374,6 @@ func (c *countingRefreshClient) RefreshMetadata(topics ...string) error { return nil } -type routingLeaderClient struct { - stubLeaderClient - leaders map[string]map[int32]*Broker - calls chan []string -} - -func (c *routingLeaderClient) Leader(topic string, partition int32) (*Broker, error) { - if partitions := c.leaders[topic]; partitions != nil { - if leader := partitions[partition]; leader != nil { - return leader, nil - } - } - return nil, ErrLeaderNotAvailable -} - -func (c *routingLeaderClient) LeaderAndEpoch(topic string, partition int32) (*Broker, int32, error) { - leader, err := c.Leader(topic, partition) - return leader, 0, err -} - -func (c *routingLeaderClient) RefreshMetadata(topics ...string) error { - c.calls <- append([]string(nil), topics...) - return nil -} - func testProducerInterceptor( t *testing.T, interceptors []ProducerInterceptor, diff --git a/config.go b/config.go index 9e837bbc8..8b87a6cd2 100644 --- a/config.go +++ b/config.go @@ -276,20 +276,16 @@ type Config struct { // more sophisticated backoff strategies. This takes precedence over // `Backoff` if set. BackoffFunc func(retries, maxRetries int) time.Duration - // The maximum number of messages in producer retry buffering, including - // the bridging buffer between `input` and `retries` channels in - // AsyncProducer#retryHandler and direct retries waiting for metadata - // refresh, retry backoff, leader lookup, partition mute, or broker handoff. - // The limit is to prevent retry buffering from overflowing or causing OOM. + // The maximum length of the bridging buffer between `input` and `retries` channels + // in AsyncProducer#retryHandler. + // The limit is to prevent this buffer from overflowing or causing OOM. // Defaults to 0 for unlimited. // Any value between 0 and 4096 is pushed to 4096. // A zero or negative value indicates unlimited. MaxBufferLength int - // The maximum total byte size of messages in producer retry buffering, - // including the bridging buffer between `input` and `retries` channels - // in AsyncProducer#retryHandler and direct retries waiting for metadata - // refresh, retry backoff, leader lookup, partition mute, or broker handoff. - // This limit prevents retry buffering from consuming excessive memory. + // The maximum total byte size of messages in the bridging buffer between `input` + // and `retries` channels in AsyncProducer#retryHandler. + // This limit prevents the buffer from consuming excessive memory. // Defaults to 0 for unlimited. // Any value between 0 and 32 MB is pushed to 32 MB. // A zero or negative value indicates unlimited. diff --git a/functional_producer_test.go b/functional_producer_test.go index bbb0e27dc..df8a5d0ca 100644 --- a/functional_producer_test.go +++ b/functional_producer_test.go @@ -430,8 +430,8 @@ func TestFuncTxnProduceAndCommitOffset(t *testing.T) { handler.started.Add(4) go func() { - consumeErr := cg.Consume(ctx, []string{"test.4"}, handler) - require.NoError(t, consumeErr) + err = cg.Consume(ctx, []string{"test.4"}, handler) + require.NoError(t, err) }() handler.started.Wait() diff --git a/produce_set.go b/produce_set.go index 0fbb7097a..121a272ac 100644 --- a/produce_set.go +++ b/produce_set.go @@ -50,15 +50,6 @@ func newProduceSetWithMeta(parent *asyncProducer, producerID int64, producerEpoc } } -func (ps *produceSet) addPartitionSet(topic string, partition int32, set *partitionSet) { - if ps.msgs[topic] == nil { - ps.msgs[topic] = make(map[int32]*partitionSet) - } - ps.msgs[topic][partition] = set - ps.bufferBytes += set.bufferBytes - ps.bufferCount += len(set.msgs) -} - func (ps *produceSet) add(msg *ProducerMessage) error { var err error var key, val []byte From d171171ca92e9ca4a0ebb343e38fba4502446be8 Mon Sep 17 00:00:00 2001 From: 3AceShowHand Date: Thu, 25 Jun 2026 22:03:05 +0800 Subject: [PATCH 5/6] Revert "Mute by retry batch (#22)" This reverts commit 505e3f94e70bdac3532623c53b4baf29a21b61f8. --- async_producer.go | 222 +++++++++---------------- async_producer_test.go | 365 +++-------------------------------------- produce_set.go | 38 ++++- produce_set_test.go | 18 -- 4 files changed, 128 insertions(+), 515 deletions(-) diff --git a/async_producer.go b/async_producer.go index 29bc3a114..99821e5b4 100644 --- a/async_producer.go +++ b/async_producer.go @@ -6,7 +6,6 @@ import ( "fmt" "math" "sync" - "sync/atomic" "time" "github.com/eapache/go-resiliency/breaker" @@ -93,10 +92,6 @@ type asyncProducer struct { errors chan *ProducerError input, successes, retries chan *ProducerMessage inFlight sync.WaitGroup - // shutdownCh is closed before waiting on in-flight messages so retryBatch - // goroutines can stop waiting on broker handoff and release partition mutes. - shutdownCh chan struct{} - shutdownChClosed atomic.Bool brokers map[*Broker]*brokerProducer brokerRefs map[*brokerProducer]int @@ -189,6 +184,13 @@ func (m *partitionMuter) tryMutePartition(topic string, partition int32) bool { return true } +func (m *partitionMuter) isMutedPartition(topic string, partition int32) bool { + m.mu.Lock() + defer m.mu.Unlock() + + return m.isMuted(topic, partition) +} + // waitUntilMuted blocks until all partitions in the set can be muted, then mutes them. // Returns false if the muter was closed before all partitions could be muted. func (m *partitionMuter) waitUntilMuted(set *produceSet) bool { @@ -308,7 +310,6 @@ func newAsyncProducer(client Client) (AsyncProducer, error) { input: make(chan *ProducerMessage), successes: make(chan *ProducerMessage), retries: make(chan *ProducerMessage), - shutdownCh: make(chan struct{}), brokers: make(map[*Broker]*brokerProducer), brokerRefs: make(map[*brokerProducer]int), txnmgr: txnmgr, @@ -382,6 +383,11 @@ type ProducerMessage struct { sequenceNumber int32 producerEpoch int16 hasSequence bool + // reusePartitionMute is a one-shot marker for non-idempotent retries. The + // first retried message from a failed, still-muted batch may reuse that + // existing in-flight reservation so later same-partition messages cannot + // slip in between the failed send and its retry. + reusePartitionMute bool } const producerMessageOverhead = 26 // the metadata overhead of CRC, flags, etc. @@ -408,6 +414,7 @@ func (m *ProducerMessage) ByteSize(version int) int { func (m *ProducerMessage) clear() { m.flags = 0 m.retries = 0 + m.reusePartitionMute = false m.sequenceNumber = 0 m.producerEpoch = 0 m.hasSequence = false @@ -787,16 +794,12 @@ func (p *asyncProducer) newPartitionProducer(topic string, partition int32) chan } func (pp *partitionProducer) backoff(retries int) { - pp.parent.backoff(retries) -} - -func (p *asyncProducer) backoff(retries int) { var backoff time.Duration - if p.conf.Producer.Retry.BackoffFunc != nil { - maxRetries := p.conf.Producer.Retry.Max - backoff = p.conf.Producer.Retry.BackoffFunc(retries, maxRetries) + if pp.parent.conf.Producer.Retry.BackoffFunc != nil { + maxRetries := pp.parent.conf.Producer.Retry.Max + backoff = pp.parent.conf.Producer.Retry.BackoffFunc(retries, maxRetries) } else { - backoff = p.conf.Producer.Retry.Backoff + backoff = pp.parent.conf.Producer.Retry.Backoff } if backoff > 0 { time.Sleep(backoff) @@ -966,7 +969,6 @@ func (p *asyncProducer) newBrokerProducer(broker *Broker) *brokerProducer { input: input, output: bridge, responses: responses, - done: make(chan struct{}), accumulatingBatch: newProduceSet(p), currentRetries: make(map[string]map[int32]error), } @@ -1080,11 +1082,6 @@ type brokerProducer struct { output chan<- *produceSet responses <-chan *brokerProducerResponse abandoned chan struct{} - // done is closed before output is closed. Direct retry handoff waits on - // output while the batch still owns its partition mute; if the brokerProducer - // is shutting down, that send may never be received and the mute would block - // later same-partition messages and producer shutdown. - done chan struct{} accumulatingBatch *produceSet flushingBatch *produceSet // batch that has been muted and is ready to send @@ -1217,13 +1214,28 @@ func (bp *brokerProducer) tryBuildFlushingBatch() bool { return true } - return false + reused := bp.accumulatingBatch.takePartitions(func(topic string, partition int32) bool { + partitions := bp.accumulatingBatch.msgs[topic] + if partitions == nil { + return false + } + set := partitions[partition] + return set.canReusePartitionMute() && bp.parent.muter.isMutedPartition(topic, partition) + }) + if reused == nil { + return false + } + reused.eachPartition(func(_ string, _ int32, pSet *partitionSet) { + pSet.clearPartitionMuteReuse() + }) + bp.flushingBatch = reused + if bp.accumulatingBatch.empty() { + bp.rollOver() + } + return true } func (bp *brokerProducer) shutdown() { - if bp.done != nil { - close(bp.done) - } // flush any ready buffer for bp.flushingBatch != nil { select { @@ -1381,7 +1393,7 @@ func (bp *brokerProducer) handleSuccess(sent *produceSet, response *ProduceRespo bp.parent.returnErrors(pSet.msgs, block.Err) } else { retryTopics = append(retryTopics, topic) - if bp.parent.conf.Producer.Idempotent { + if bp.parent.conf.Producer.Idempotent || pSet.canRetry(bp.parent.conf.Producer.Retry.Max) { if keepMuted[topic] == nil { keepMuted[topic] = make(map[int32]struct{}) } @@ -1424,9 +1436,7 @@ func (bp *brokerProducer) handleSuccess(sent *produceSet, response *ProduceRespo if bp.parent.conf.Producer.Idempotent { go bp.parent.retryBatch(topic, partition, pSet, block.Err, true) } else { - // Non-idempotent response retries must go back through partitionProducer. - // That path advances the retry high watermark and refreshes the partition - // leader before sending the retry; retryBatch would bypass that state. + pSet.markPartitionMuteReusable() bp.parent.retryMessages(pSet.msgs, block.Err) } // dropping the following messages has the side effect of incrementing their retry count @@ -1452,119 +1462,40 @@ func (p *asyncProducer) retryBatch(topic string, partition int32, pSet *partitio produceSet.msgs[topic][partition] = pSet produceSet.bufferBytes += pSet.bufferBytes produceSet.bufferCount += len(pSet.msgs) - muted := alreadyMuted - failBatch := func(err error) { - // Release the partition before reporting errors so a blocked Errors - // consumer cannot keep later same-partition messages muted. - if muted { - p.muter.unmute(produceSet) - muted = false - } - for _, msg := range pSet.msgs { - p.returnError(msg, err) - } - } for _, msg := range pSet.msgs { if msg.retries >= p.conf.Producer.Retry.Max { - failBatch(retryErr) + p.returnErrors(pSet.msgs, retryErr) + if alreadyMuted { + p.muter.unmute(produceSet) + } return } msg.retries++ } - // honor Producer.Retry.Backoff between retry attempts (#2469); the - // non-idempotent path gets this from partitionProducer.dispatch, but - // retryBatch dispatches the produceSet directly to the broker - if len(pSet.msgs) > 0 { - p.backoff(pSet.msgs[0].retries) - } - // it's expected that a metadata refresh has been requested prior to calling retryBatch leader, leaderErr := p.client.Leader(topic, partition) if leaderErr != nil { Logger.Printf("Failed retrying batch for %v-%d because of %v while looking up for new leader\n", topic, partition, leaderErr) - failBatch(retryErr) + for _, msg := range pSet.msgs { + p.returnError(msg, retryErr) + } + if alreadyMuted { + p.muter.unmute(produceSet) + } return } if !alreadyMuted { if !p.muter.waitUntilMuted(produceSet) { - failBatch(retryErr) - return - } - muted = true - } - bp := p.getBrokerProducer(leader) - defer p.unrefBrokerProducer(leader, bp) - if !p.sendRetryBatch(bp, produceSet) { - failBatch(ErrShuttingDown) - return - } -} - -type partitionBatchRetry struct { - topic string - partition int32 - pSet *partitionSet -} - -// retryBatchesAfterRefresh keeps metadata refresh out of brokerProducer.handleError. -// Connection errors already hold partition mutes; doing the refresh in the -// response loop can block all progress for that broker while the cluster is -// unstable. Refresh once for the failed request so multi-partition batches do -// not amplify controller-fail metadata traffic, then retry partitions -// independently so one blocked broker handoff cannot hold up the rest. -func (p *asyncProducer) retryBatchesAfterRefresh(topics []string, batches []partitionBatchRetry, retryErr error) { - if p.shutdownCh != nil { - select { - case <-p.shutdownCh: - for _, batch := range batches { - go p.retryBatch(batch.topic, batch.partition, batch.pSet, retryErr, true) + for _, msg := range pSet.msgs { + p.returnError(msg, retryErr) } return - default: - } - } - if len(topics) > 0 { - if err := p.client.RefreshMetadata(topics...); err != nil { - Logger.Printf("Failed refreshing metadata because of %v\n", err) } } - for _, batch := range batches { - go p.retryBatch(batch.topic, batch.partition, batch.pSet, retryErr, true) - } -} - -// sendRetryBatch transfers ownership of a muted retry batch to the broker -// bridge. A false result means no owner accepted the batch, so the caller must -// release the mute and fail the messages instead of leaving a retry goroutine -// blocked while it still owns the partition mute. -func (p *asyncProducer) sendRetryBatch(bp *brokerProducer, set *produceSet) bool { - var done <-chan struct{} - if bp.done != nil { - done = bp.done - } - select { - case <-done: - return false - default: - } - select { - case bp.output <- set: - return true - default: - } - var shutdownCh <-chan struct{} - if p.shutdownCh != nil { - shutdownCh = p.shutdownCh - } - select { - case bp.output <- set: - return true - case <-done: - return false - case <-shutdownCh: - return false - } + bp := p.getBrokerProducer(leader) + bp.output <- produceSet + p.unrefBrokerProducer(leader, bp) } func (bp *brokerProducer) handleError(sent *produceSet, err error) { @@ -1579,40 +1510,41 @@ func (bp *brokerProducer) handleError(sent *produceSet, err error) { bp.parent.abandonBrokerConnection(bp.broker) _ = bp.broker.Close() bp.closing = err - keepMuted := make(map[string]map[int32]struct{}) var retryTopics []string retryTopicSeen := make(map[string]struct{}) - var retryBatches []partitionBatchRetry sent.eachPartition(func(topic string, partition int32, pSet *partitionSet) { - // Connection failures have no ProduceResponse to feed back through - // partitionProducer. Keep the sent batch muted and retry it asynchronously - // so later same-partition batches cannot pass it, without blocking the - // brokerProducer response loop (#1203). + if _, ok := retryTopicSeen[topic]; ok { + return + } + retryTopicSeen[topic] = struct{}{} + retryTopics = append(retryTopics, topic) + }) + if bp.parent.conf.Producer.Idempotent && len(retryTopics) > 0 { + refreshErr := bp.parent.client.RefreshMetadata(retryTopics...) + if refreshErr != nil { + Logger.Printf("Failed refreshing metadata because of %v\n", refreshErr) + } + } + keepMuted := make(map[string]map[int32]struct{}) + sent.eachPartition(func(topic string, partition int32, pSet *partitionSet) { + // keep partition marked as in-flight during retry (connection error) if bp.currentRetries[topic] == nil { bp.currentRetries[topic] = make(map[int32]error) } bp.currentRetries[topic][partition] = err - if !(bp.parent.conf.Producer.Idempotent || pSet.shouldKeepMuted(bp.parent.conf.Producer.Retry.Max)) { - bp.parent.returnErrors(pSet.msgs, err) - } else { + if bp.parent.conf.Producer.Idempotent || pSet.canRetry(bp.parent.conf.Producer.Retry.Max) { if keepMuted[topic] == nil { keepMuted[topic] = make(map[int32]struct{}) } keepMuted[topic][partition] = struct{}{} - if _, ok := retryTopicSeen[topic]; !ok { - retryTopicSeen[topic] = struct{}{} - retryTopics = append(retryTopics, topic) - } - retryBatches = append(retryBatches, partitionBatchRetry{ - topic: topic, - partition: partition, - pSet: pSet, - }) + } + if bp.parent.conf.Producer.Idempotent { + go bp.parent.retryBatch(topic, partition, pSet, err, true) + } else { + pSet.markPartitionMuteReusable() + bp.parent.retryMessages(pSet.msgs, err) } }) - if len(retryBatches) > 0 { - go bp.parent.retryBatchesAfterRefresh(retryTopics, retryBatches, err) - } bp.accumulatingBatch.eachPartition(func(topic string, partition int32, pSet *partitionSet) { bp.parent.retryMessages(pSet.msgs, err) }) @@ -1694,10 +1626,6 @@ func (p *asyncProducer) retryHandler() { // utility functions func (p *asyncProducer) shutdown() { - Logger.Println("Producer shutting down.") - if p.shutdownCh != nil && p.shutdownChClosed.CompareAndSwap(false, true) { - close(p.shutdownCh) - } p.inFlight.Add(1) p.input <- &ProducerMessage{flags: shutdown} diff --git a/async_producer_test.go b/async_producer_test.go index 4bb9cd87a..ed5e6beeb 100644 --- a/async_producer_test.go +++ b/async_producer_test.go @@ -1957,105 +1957,37 @@ func TestRetryBatchRespectsPartitionMuter(t *testing.T) { } } -func TestHandleErrorNonIdempotentRetryKeepsPartitionMuted(t *testing.T) { +func TestHandleSuccessNonIdempotentRetryKeepsPartitionMuted(t *testing.T) { config := NewTestConfig() config.Producer.Idempotent = false config.Producer.Retry.Max = 1 config.Producer.Retry.Backoff = 0 - parent := &asyncProducer{ - conf: config, - muter: newPartitionMuter(), - brokers: make(map[*Broker]*brokerProducer), - brokerRefs: make(map[*brokerProducer]int), - txnmgr: &transactionManager{}, - } - failedBroker := &Broker{id: 1} - retryLeader := &Broker{id: 2} - parent.client = &stubLeaderClient{leader: retryLeader, cfg: config} - - output := make(chan *produceSet) - retryBP := &brokerProducer{ - parent: parent, - broker: retryLeader, - output: output, - input: make(chan *ProducerMessage), - } - parent.brokers[retryLeader] = retryBP - - bp := &brokerProducer{ - parent: parent, - broker: failedBroker, - input: make(chan *ProducerMessage), - accumulatingBatch: newProduceSet(parent), - currentRetries: make(map[string]map[int32]error), - } - - sent := newProduceSet(parent) - safeAddMessage(t, sent, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("retry")}) - retryPartitionSet := sent.msgs["topic"][0] - if !parent.muter.tryMute(sent) { - t.Fatal("expected sent batch to mute partitions") - } - - done := make(chan struct{}) - go func() { - bp.handleError(sent, ErrOutOfBrokers) - close(done) - }() - assertDoneWithin(t, done, 2*time.Second) - - retrySet := assertDoneWithin(t, output, 2*time.Second) - defer parent.muter.unmute(retrySet) - require.Equal(t, retryPartitionSet, retrySet.msgs["topic"][0]) - require.Equal(t, 1, retryPartitionSet.msgs[0].retries) - - contender := newProduceSet(parent) - safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("next")}) - if parent.muter.tryMute(contender) { - parent.muter.unmute(contender) - t.Fatal("expected partition to remain muted by the retrying batch") + txnMgr := &transactionManager{ + producerID: 0, + producerEpoch: 0, + sequenceNumbers: make(map[string]int32), } -} - -func TestHandleErrorNonIdempotentRetryDoesNotWaitForMetadataRefresh(t *testing.T) { - config := NewTestConfig() - config.Producer.Idempotent = false - config.Producer.Retry.Max = 1 - config.Producer.Retry.Backoff = 0 - parent := &asyncProducer{ conf: config, muter: newPartitionMuter(), brokers: make(map[*Broker]*brokerProducer), brokerRefs: make(map[*brokerProducer]int), - txnmgr: &transactionManager{}, - } - failedBroker := &Broker{id: 1} - retryLeader := &Broker{id: 2} - client := &blockingRefreshClient{ - stubLeaderClient: stubLeaderClient{leader: retryLeader, cfg: config}, - started: make(chan struct{}, 1), - release: make(chan struct{}), - } - parent.client = client - - output := make(chan *produceSet) - retryBP := &brokerProducer{ - parent: parent, - broker: retryLeader, - output: output, - input: make(chan *ProducerMessage), + retries: make(chan *ProducerMessage, 1), + txnmgr: txnMgr, } - parent.brokers[retryLeader] = retryBP + leader := &Broker{id: 1} + parent.client = &stubLeaderClient{leader: leader, cfg: config} bp := &brokerProducer{ parent: parent, - broker: failedBroker, + broker: leader, input: make(chan *ProducerMessage), accumulatingBatch: newProduceSet(parent), currentRetries: make(map[string]map[int32]error), } + parent.brokers[leader] = bp + parent.brokerRefs[bp] = 1 sent := newProduceSet(parent) safeAddMessage(t, sent, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("retry")}) @@ -2063,242 +1995,29 @@ func TestHandleErrorNonIdempotentRetryDoesNotWaitForMetadataRefresh(t *testing.T if !parent.muter.tryMute(sent) { t.Fatal("expected sent batch to mute partitions") } - - done := make(chan struct{}) - go func() { - bp.handleError(sent, ErrOutOfBrokers) - close(done) - }() - assertDoneWithin(t, done, 2*time.Second) - assertDoneWithin(t, client.started, 2*time.Second) - - contender := newProduceSet(parent) - safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("next")}) - if parent.muter.tryMute(contender) { - parent.muter.unmute(contender) - t.Fatal("expected partition to remain muted while async metadata refresh is pending") - } - - close(client.release) - retrySet := assertDoneWithin(t, output, 2*time.Second) - defer parent.muter.unmute(retrySet) - require.Equal(t, retryPartitionSet, retrySet.msgs["topic"][0]) -} - -func TestHandleErrorNonIdempotentRetryRefreshesMetadataOncePerFailedRequest(t *testing.T) { - config := NewTestConfig() - config.Producer.Idempotent = false - config.Producer.Retry.Max = 1 - config.Producer.Retry.Backoff = 0 - - parent := &asyncProducer{ - conf: config, - muter: newPartitionMuter(), - brokers: make(map[*Broker]*brokerProducer), - brokerRefs: make(map[*brokerProducer]int), - txnmgr: &transactionManager{}, - } - failedBroker := &Broker{id: 1} - retryLeader := &Broker{id: 2} - client := &countingRefreshClient{ - stubLeaderClient: stubLeaderClient{leader: retryLeader, cfg: config}, - calls: make(chan []string, 1), - } - parent.client = client - - output := make(chan *produceSet, 2) - retryBP := &brokerProducer{ - parent: parent, - broker: retryLeader, - output: output, - input: make(chan *ProducerMessage), - } - parent.brokers[retryLeader] = retryBP - parent.brokerRefs[retryBP] = 1 - - bp := &brokerProducer{ - parent: parent, - broker: failedBroker, - input: make(chan *ProducerMessage), - accumulatingBatch: newProduceSet(parent), - currentRetries: make(map[string]map[int32]error), - } - - sent := newProduceSet(parent) - safeAddMessage(t, sent, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("retry-0")}) - safeAddMessage(t, sent, &ProducerMessage{Topic: "topic", Partition: 1, Value: StringEncoder("retry-1")}) - retryPartitionSet0 := sent.msgs["topic"][0] - retryPartitionSet1 := sent.msgs["topic"][1] - if !parent.muter.tryMute(sent) { - t.Fatal("expected sent batch to mute partitions") - } - - bp.handleError(sent, ErrOutOfBrokers) - - require.Equal(t, []string{"topic"}, assertDoneWithin(t, client.calls, 2*time.Second)) - retrySet0 := assertDoneWithin(t, output, 2*time.Second) - defer parent.muter.unmute(retrySet0) - retrySet1 := assertDoneWithin(t, output, 2*time.Second) - defer parent.muter.unmute(retrySet1) - assertNotDone(t, client.calls, 50*time.Millisecond) - - retriedPartitions := make(map[int32]*partitionSet) - retrySet0.eachPartition(func(_ string, partition int32, pSet *partitionSet) { - retriedPartitions[partition] = pSet - }) - retrySet1.eachPartition(func(_ string, partition int32, pSet *partitionSet) { - retriedPartitions[partition] = pSet - }) - require.Same(t, retryPartitionSet0, retriedPartitions[int32(0)]) - require.Same(t, retryPartitionSet1, retriedPartitions[int32(1)]) -} - -func TestHandleSuccessNonIdempotentRetryUsesPartitionProducer(t *testing.T) { - config := NewTestConfig() - config.Producer.Idempotent = false - config.Producer.Retry.Max = 1 - config.Producer.Retry.Backoff = 0 - - parent := &asyncProducer{ - conf: config, - muter: newPartitionMuter(), - retries: make(chan *ProducerMessage, 1), - txnmgr: &transactionManager{}, - } - - bp := &brokerProducer{ - parent: parent, - broker: &Broker{id: 1}, - input: make(chan *ProducerMessage), - accumulatingBatch: newProduceSet(parent), - currentRetries: make(map[string]map[int32]error), - } - - sent := newProduceSet(parent) - msg := &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("retry")} - safeAddMessage(t, sent, msg) - if !parent.muter.tryMute(sent) { - t.Fatal("expected sent batch to mute partitions") - } + defer parent.muter.unmute(sent) response := new(ProduceResponse) response.AddTopicPartition("topic", 0, ErrNotLeaderForPartition) bp.handleSuccess(sent, response) retried := assertDoneWithin(t, parent.retries, 2*time.Second) - require.Equal(t, msg, retried) - require.Equal(t, 1, retried.retries) - - contender := newProduceSet(parent) - safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("next")}) - if !parent.muter.tryMute(contender) { - t.Fatal("expected response retry to release sent mute before partitionProducer retry") - } - parent.muter.unmute(contender) -} - -func TestRetryBatchReleasesMuteWhenHandoffAbortedByShutdown(t *testing.T) { - config := NewTestConfig() - config.Producer.Idempotent = false - config.Producer.Retry.Max = 1 - config.Producer.Retry.Backoff = 0 - config.Producer.Return.Errors = true - - parent := &asyncProducer{ - conf: config, - muter: newPartitionMuter(), - brokers: make(map[*Broker]*brokerProducer), - brokerRefs: make(map[*brokerProducer]int), - errors: make(chan *ProducerError, 1), - shutdownCh: make(chan struct{}), - txnmgr: &transactionManager{}, - } - leader := &Broker{id: 1} - parent.client = &stubLeaderClient{leader: leader, cfg: config} - retryBP := &brokerProducer{ - parent: parent, - broker: leader, - output: make(chan *produceSet), - input: make(chan *ProducerMessage), - done: make(chan struct{}), - } - parent.brokers[leader] = retryBP - - sent := newProduceSet(parent) - safeAddMessage(t, sent, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("retry")}) - retryPartitionSet := sent.msgs["topic"][0] - if !parent.muter.tryMute(sent) { - t.Fatal("expected sent batch to mute partitions") - } - parent.inFlight.Add(1) - if parent.shutdownChClosed.CompareAndSwap(false, true) { - close(parent.shutdownCh) - } - - done := make(chan struct{}) - go func() { - parent.retryBatch("topic", 0, retryPartitionSet, ErrOutOfBrokers, true) - close(done) - }() - - producerErr := assertDoneWithin(t, parent.errors, 2*time.Second) - require.Equal(t, ErrShuttingDown, producerErr.Err) - assertDoneWithin(t, done, 2*time.Second) + require.Equal(t, retryPartitionSet.msgs[0], retried) + require.True(t, retried.reusePartitionMute) contender := newProduceSet(parent) safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("next")}) - if !parent.muter.tryMute(contender) { - t.Fatal("expected partition mute to be released after aborted retry handoff") - } - parent.muter.unmute(contender) -} - -func TestHandleErrorNonIdempotentRetryReleasesMuteOnLeaderError(t *testing.T) { - config := NewTestConfig() - config.Producer.Idempotent = false - config.Producer.Retry.Max = 1 - config.Producer.Retry.Backoff = 0 - config.Producer.Return.Errors = true - - parent := &asyncProducer{ - conf: config, - muter: newPartitionMuter(), - brokers: make(map[*Broker]*brokerProducer), - brokerRefs: make(map[*brokerProducer]int), - errors: make(chan *ProducerError, 1), - txnmgr: &transactionManager{}, - } - parent.client = &failingLeaderClient{ - stubLeaderClient: stubLeaderClient{cfg: config}, - err: ErrOutOfBrokers, - } - - bp := &brokerProducer{ - parent: parent, - broker: &Broker{id: 1}, - input: make(chan *ProducerMessage), - accumulatingBatch: newProduceSet(parent), - currentRetries: make(map[string]map[int32]error), - } - - sent := newProduceSet(parent) - safeAddMessage(t, sent, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("retry")}) - if !parent.muter.tryMute(sent) { - t.Fatal("expected sent batch to mute partitions") + if parent.muter.tryMute(contender) { + parent.muter.unmute(contender) + t.Fatal("expected partition to remain muted by the retrying batch") } - parent.inFlight.Add(1) - - bp.handleError(sent, ErrOutOfBrokers) - producerErr := assertDoneWithin(t, parent.errors, 2*time.Second) - require.Equal(t, ErrOutOfBrokers, producerErr.Err) - - contender := newProduceSet(parent) - safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("next")}) - if !parent.muter.tryMute(contender) { - t.Fatal("expected partition mute to be released after retry leader lookup failure") + safeAddMessage(t, bp.accumulatingBatch, retried) + if !bp.tryBuildFlushingBatch() { + t.Fatal("expected retry batch to reuse the existing partition mute") } - parent.muter.unmute(contender) + require.Equal(t, retryPartitionSet.msgs[0], bp.flushingBatch.msgs["topic"][0].msgs[0]) + require.False(t, retryPartitionSet.msgs[0].reusePartitionMute) } type stubLeaderClient struct { @@ -2336,44 +2055,6 @@ func (c *stubLeaderClient) PartitionNotReadable(string, int32) bool { r func (c *stubLeaderClient) Close() error { return nil } func (c *stubLeaderClient) Closed() bool { return false } -type failingLeaderClient struct { - stubLeaderClient - err error -} - -func (c *failingLeaderClient) Leader(string, int32) (*Broker, error) { - return nil, c.err -} - -func (c *failingLeaderClient) LeaderAndEpoch(string, int32) (*Broker, int32, error) { - return nil, 0, c.err -} - -type blockingRefreshClient struct { - stubLeaderClient - started chan struct{} - release chan struct{} -} - -func (c *blockingRefreshClient) RefreshMetadata(...string) error { - select { - case c.started <- struct{}{}: - default: - } - <-c.release - return nil -} - -type countingRefreshClient struct { - stubLeaderClient - calls chan []string -} - -func (c *countingRefreshClient) RefreshMetadata(topics ...string) error { - c.calls <- append([]string(nil), topics...) - return nil -} - func testProducerInterceptor( t *testing.T, interceptors []ProducerInterceptor, diff --git a/produce_set.go b/produce_set.go index 121a272ac..a86235b9e 100644 --- a/produce_set.go +++ b/produce_set.go @@ -12,18 +12,40 @@ type partitionSet struct { bufferBytes int } -// shouldKeepMuted matches retryBatch's whole-batch retry rule: if any message has -// exhausted retries, the batch will be failed instead of retried. -func (ps *partitionSet) shouldKeepMuted(maxRetries int) bool { - if len(ps.msgs) == 0 { - return false +// markPartitionMuteReusable lets the first message carry the retry batch's +// right to reuse the partition mute that is still held by its failed send. +// Only the first message is marked because one held mute represents one +// in-flight retry permission; marking every message could let split retry +// fragments bypass each other under the same reservation. +func (ps *partitionSet) markPartitionMuteReusable() { + if len(ps.msgs) > 0 { + ps.msgs[0].reusePartitionMute = true + } +} + +// canReusePartitionMute reports whether this retry batch may be flushed while +// its partition is already muted by the previous send attempt. +func (ps *partitionSet) canReusePartitionMute() bool { + return len(ps.msgs) > 0 && ps.msgs[0].reusePartitionMute +} + +// clearPartitionMuteReuse consumes the one-shot reuse marker once the retry +// batch is selected for flushing. +func (ps *partitionSet) clearPartitionMuteReuse() { + for _, msg := range ps.msgs { + msg.reusePartitionMute = false } +} + +// canRetry prevents holding a partition mute when every message in the set will +// be returned as an error instead of being retried. +func (ps *partitionSet) canRetry(maxRetries int) bool { for _, msg := range ps.msgs { - if msg.retries >= maxRetries { - return false + if msg.retries < maxRetries { + return true } } - return true + return false } type produceSet struct { diff --git a/produce_set_test.go b/produce_set_test.go index 359bfd47b..9ab964949 100644 --- a/produce_set_test.go +++ b/produce_set_test.go @@ -25,24 +25,6 @@ func safeAddMessage(t *testing.T, ps *produceSet, msg *ProducerMessage) { } } -func TestShouldKeepMuted(t *testing.T) { - if (&partitionSet{}).shouldKeepMuted(1) { - t.Fatal("empty partition set should not be retryable") - } - if !(&partitionSet{msgs: []*ProducerMessage{ - {retries: 0}, - {retries: 0}, - }}).shouldKeepMuted(1) { - t.Fatal("expected batch to be retryable when every message is below retry max") - } - if (&partitionSet{msgs: []*ProducerMessage{ - {retries: 0}, - {retries: 1}, - }}).shouldKeepMuted(1) { - t.Fatal("expected batch not to be retryable when any message has exhausted retries") - } -} - func TestProduceSetInitial(t *testing.T) { _, ps := makeProduceSet() From 81e2f4bf80180b8d6647d78abbccec75a8bfd1f2 Mon Sep 17 00:00:00 2001 From: 3AceShowHand Date: Thu, 25 Jun 2026 22:03:05 +0800 Subject: [PATCH 6/6] Revert "Out of order no idempotent (#21)" This reverts commit 4c5ff0cff596d32699f8e0c1a20d5b5caa5ab111. --- async_producer.go | 51 ++----------------- async_producer_test.go | 110 ----------------------------------------- produce_set.go | 36 -------------- 3 files changed, 4 insertions(+), 193 deletions(-) diff --git a/async_producer.go b/async_producer.go index 99821e5b4..a9af20b1b 100644 --- a/async_producer.go +++ b/async_producer.go @@ -184,13 +184,6 @@ func (m *partitionMuter) tryMutePartition(topic string, partition int32) bool { return true } -func (m *partitionMuter) isMutedPartition(topic string, partition int32) bool { - m.mu.Lock() - defer m.mu.Unlock() - - return m.isMuted(topic, partition) -} - // waitUntilMuted blocks until all partitions in the set can be muted, then mutes them. // Returns false if the muter was closed before all partitions could be muted. func (m *partitionMuter) waitUntilMuted(set *produceSet) bool { @@ -383,11 +376,6 @@ type ProducerMessage struct { sequenceNumber int32 producerEpoch int16 hasSequence bool - // reusePartitionMute is a one-shot marker for non-idempotent retries. The - // first retried message from a failed, still-muted batch may reuse that - // existing in-flight reservation so later same-partition messages cannot - // slip in between the failed send and its retry. - reusePartitionMute bool } const producerMessageOverhead = 26 // the metadata overhead of CRC, flags, etc. @@ -414,7 +402,6 @@ func (m *ProducerMessage) ByteSize(version int) int { func (m *ProducerMessage) clear() { m.flags = 0 m.retries = 0 - m.reusePartitionMute = false m.sequenceNumber = 0 m.producerEpoch = 0 m.hasSequence = false @@ -1096,14 +1083,8 @@ func (bp *brokerProducer) run() { var output chan<- *produceSet for { - var unmuteCh <-chan struct{} if bp.flushingBatch == nil && (bp.timerFired || bp.accumulatingBatch.readyToFlush()) { bp.tryBuildFlushingBatch() - if bp.flushingBatch == nil { - if ch, blocked := bp.parent.muter.awaitUnmuteChan(bp.accumulatingBatch); blocked { - unmuteCh = ch - } - } } var timerChan <-chan time.Time @@ -1188,7 +1169,6 @@ func (bp *brokerProducer) run() { if ok { bp.handleResponse(response) } - case <-unmuteCh: } } } @@ -1206,29 +1186,10 @@ func (bp *brokerProducer) tryBuildFlushingBatch() bool { partial := bp.accumulatingBatch.takePartitions(func(topic string, partition int32) bool { return bp.parent.muter.tryMutePartition(topic, partition) }) - if partial != nil { - bp.flushingBatch = partial - if bp.accumulatingBatch.empty() { - bp.rollOver() - } - return true - } - - reused := bp.accumulatingBatch.takePartitions(func(topic string, partition int32) bool { - partitions := bp.accumulatingBatch.msgs[topic] - if partitions == nil { - return false - } - set := partitions[partition] - return set.canReusePartitionMute() && bp.parent.muter.isMutedPartition(topic, partition) - }) - if reused == nil { + if partial == nil { return false } - reused.eachPartition(func(_ string, _ int32, pSet *partitionSet) { - pSet.clearPartitionMuteReuse() - }) - bp.flushingBatch = reused + bp.flushingBatch = partial if bp.accumulatingBatch.empty() { bp.rollOver() } @@ -1393,7 +1354,7 @@ func (bp *brokerProducer) handleSuccess(sent *produceSet, response *ProduceRespo bp.parent.returnErrors(pSet.msgs, block.Err) } else { retryTopics = append(retryTopics, topic) - if bp.parent.conf.Producer.Idempotent || pSet.canRetry(bp.parent.conf.Producer.Retry.Max) { + if bp.parent.conf.Producer.Idempotent { if keepMuted[topic] == nil { keepMuted[topic] = make(map[int32]struct{}) } @@ -1436,7 +1397,6 @@ func (bp *brokerProducer) handleSuccess(sent *produceSet, response *ProduceRespo if bp.parent.conf.Producer.Idempotent { go bp.parent.retryBatch(topic, partition, pSet, block.Err, true) } else { - pSet.markPartitionMuteReusable() bp.parent.retryMessages(pSet.msgs, block.Err) } // dropping the following messages has the side effect of incrementing their retry count @@ -1532,16 +1492,13 @@ func (bp *brokerProducer) handleError(sent *produceSet, err error) { bp.currentRetries[topic] = make(map[int32]error) } bp.currentRetries[topic][partition] = err - if bp.parent.conf.Producer.Idempotent || pSet.canRetry(bp.parent.conf.Producer.Retry.Max) { + if bp.parent.conf.Producer.Idempotent { if keepMuted[topic] == nil { keepMuted[topic] = make(map[int32]struct{}) } keepMuted[topic][partition] = struct{}{} - } - if bp.parent.conf.Producer.Idempotent { go bp.parent.retryBatch(topic, partition, pSet, err, true) } else { - pSet.markPartitionMuteReusable() bp.parent.retryMessages(pSet.msgs, err) } }) diff --git a/async_producer_test.go b/async_producer_test.go index ed5e6beeb..c32ac3ef5 100644 --- a/async_producer_test.go +++ b/async_producer_test.go @@ -1724,53 +1724,6 @@ func TestBrokerProducerFlushSkipsMutedPartitions(t *testing.T) { } } -func TestBrokerProducerRunFlushesAfterExternalUnmute(t *testing.T) { - config := NewTestConfig() - config.Producer.Flush.Messages = 1 - parent := &asyncProducer{ - conf: config, - muter: newPartitionMuter(), - txnmgr: &transactionManager{}, - } - - blockedSet := newProduceSet(parent) - safeAddMessage(t, blockedSet, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("held")}) - if !parent.muter.tryMute(blockedSet) { - t.Fatal("expected to mute partition externally") - } - - input := make(chan *ProducerMessage) - output := make(chan *produceSet, 1) - responses := make(chan *brokerProducerResponse) - bp := &brokerProducer{ - parent: parent, - broker: &Broker{id: 1}, - input: input, - output: output, - responses: responses, - accumulatingBatch: newProduceSet(parent), - currentRetries: make(map[string]map[int32]error), - } - retryMsg := &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("waiting")} - safeAddMessage(t, bp.accumulatingBatch, retryMsg) - - done := make(chan struct{}) - go func() { - bp.run() - close(done) - }() - - assertNotDone(t, output, 50*time.Millisecond) - parent.muter.unmute(blockedSet) - - flushed := assertDoneWithin(t, output, 2*time.Second) - require.Equal(t, retryMsg, flushed.msgs["topic"][0].msgs[0]) - - close(input) - close(responses) - assertDoneWithin(t, done, 2*time.Second) -} - // TestBrokerProducerWaitForSpaceAllPartitionsMuted verifies that waitForSpace unblocks // when all partitions in the accumulating batch are externally muted and later unmuted. func TestBrokerProducerWaitForSpaceAllPartitionsMuted(t *testing.T) { @@ -1957,69 +1910,6 @@ func TestRetryBatchRespectsPartitionMuter(t *testing.T) { } } -func TestHandleSuccessNonIdempotentRetryKeepsPartitionMuted(t *testing.T) { - config := NewTestConfig() - config.Producer.Idempotent = false - config.Producer.Retry.Max = 1 - config.Producer.Retry.Backoff = 0 - - txnMgr := &transactionManager{ - producerID: 0, - producerEpoch: 0, - sequenceNumbers: make(map[string]int32), - } - parent := &asyncProducer{ - conf: config, - muter: newPartitionMuter(), - brokers: make(map[*Broker]*brokerProducer), - brokerRefs: make(map[*brokerProducer]int), - retries: make(chan *ProducerMessage, 1), - txnmgr: txnMgr, - } - leader := &Broker{id: 1} - parent.client = &stubLeaderClient{leader: leader, cfg: config} - - bp := &brokerProducer{ - parent: parent, - broker: leader, - input: make(chan *ProducerMessage), - accumulatingBatch: newProduceSet(parent), - currentRetries: make(map[string]map[int32]error), - } - parent.brokers[leader] = bp - parent.brokerRefs[bp] = 1 - - sent := newProduceSet(parent) - safeAddMessage(t, sent, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("retry")}) - retryPartitionSet := sent.msgs["topic"][0] - if !parent.muter.tryMute(sent) { - t.Fatal("expected sent batch to mute partitions") - } - defer parent.muter.unmute(sent) - - response := new(ProduceResponse) - response.AddTopicPartition("topic", 0, ErrNotLeaderForPartition) - bp.handleSuccess(sent, response) - - retried := assertDoneWithin(t, parent.retries, 2*time.Second) - require.Equal(t, retryPartitionSet.msgs[0], retried) - require.True(t, retried.reusePartitionMute) - - contender := newProduceSet(parent) - safeAddMessage(t, contender, &ProducerMessage{Topic: "topic", Partition: 0, Value: StringEncoder("next")}) - if parent.muter.tryMute(contender) { - parent.muter.unmute(contender) - t.Fatal("expected partition to remain muted by the retrying batch") - } - - safeAddMessage(t, bp.accumulatingBatch, retried) - if !bp.tryBuildFlushingBatch() { - t.Fatal("expected retry batch to reuse the existing partition mute") - } - require.Equal(t, retryPartitionSet.msgs[0], bp.flushingBatch.msgs["topic"][0].msgs[0]) - require.False(t, retryPartitionSet.msgs[0].reusePartitionMute) -} - type stubLeaderClient struct { cfg *Config leader *Broker diff --git a/produce_set.go b/produce_set.go index a86235b9e..380f7f5ab 100644 --- a/produce_set.go +++ b/produce_set.go @@ -12,42 +12,6 @@ type partitionSet struct { bufferBytes int } -// markPartitionMuteReusable lets the first message carry the retry batch's -// right to reuse the partition mute that is still held by its failed send. -// Only the first message is marked because one held mute represents one -// in-flight retry permission; marking every message could let split retry -// fragments bypass each other under the same reservation. -func (ps *partitionSet) markPartitionMuteReusable() { - if len(ps.msgs) > 0 { - ps.msgs[0].reusePartitionMute = true - } -} - -// canReusePartitionMute reports whether this retry batch may be flushed while -// its partition is already muted by the previous send attempt. -func (ps *partitionSet) canReusePartitionMute() bool { - return len(ps.msgs) > 0 && ps.msgs[0].reusePartitionMute -} - -// clearPartitionMuteReuse consumes the one-shot reuse marker once the retry -// batch is selected for flushing. -func (ps *partitionSet) clearPartitionMuteReuse() { - for _, msg := range ps.msgs { - msg.reusePartitionMute = false - } -} - -// canRetry prevents holding a partition mute when every message in the set will -// be returned as an error instead of being retried. -func (ps *partitionSet) canRetry(maxRetries int) bool { - for _, msg := range ps.msgs { - if msg.retries < maxRetries { - return true - } - } - return false -} - type produceSet struct { parent *asyncProducer msgs map[string]map[int32]*partitionSet