From 243c31f64b918ca20d47cc83f3a365ab41c84d1a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arnaud=20He=CC=81ritier?= <arnaud.heritier@docker.com>
Date: Thu, 21 May 2026 21:16:54 +0200
Subject: [PATCH] fix(#2861): release per-message render caches when streaming
 completes

messageModel held its rendered ANSI output (renderCache.result) and a
per-message IncrementalRenderer (inputPrefix + outputPrefix +
codeBlocksPrefix) for the lifetime of the session. With ~80-120 KB of
retained strings per assistant turn and no eviction, a long TUI session
grew its heap linearly until macOS's memorystatus subsystem killed
docker-agent as the largest compressed process.

Add Finalize() on messageModel that drops both caches when a message is
no longer the active streaming target, and call it on the previous view
whenever a new top-level entry is appended (and on every loaded view
except the last in LoadFromSession). Render() and renderAssistantMarkdown()
gracefully fall back to a transient renderer for finalized messages, so
output stays correct; the chat list's bounded LRU still memoizes their
rendered output.

Finalize() is gated on MessageTypeAssistant: only assistant views allocate
an IncrementalRenderer and accumulate large rendered ANSI strings during
streaming, so user messages, tool calls, and other types have nothing to
release. Setting finalized=true on those views would only have the side
effect of permanently disabling renderCache for selected/hovered states
(which bypass the parent's bounded LRU), forcing a fresh re-render every
animation tick.

addReasoningBlock() does not finalize the previous view: reasoning
blocks routinely interleave with an actively streaming assistant turn,
and finalizing the in-progress view would force every subsequent chunk
through the transient-renderer path. The next non-reasoning entry will
finalize via addMessage() when the streaming turn actually ends.

removeSpinner() no longer calls invalidateAllItems(): the spinner is
always at the tail, so wiping the LRU on every assistant turn was both
wasteful and immediately re-armed the leak by forcing a full re-render
of every previous message into a fresh renderCache. Spinner-driven
messages are never put in the LRU (shouldCacheMessage returns false),
so only the joined renderedLines slab needs to be reset.

Adds a regression test that streams 400 large assistant messages and
asserts both heap growth (<30 KB/message vs. the 161 KB/message
pre-fix slope) and the structural invariant that at most one view
retains render-cache state. The assertion uses a new exported
HasLiveRenderState() method on the message.Model interface instead of
cross-package reflection on unexported fields.

Fixes #2861

Assisted-By: docker-agent
---
 pkg/tui/components/message/message.go    |  89 +++++++++++-
 pkg/tui/components/messages/leak_test.go | 169 +++++++++++++++++++++++
 pkg/tui/components/messages/messages.go  |  51 ++++++-
 3 files changed, 307 insertions(+), 2 deletions(-)
 create mode 100644 pkg/tui/components/messages/leak_test.go

diff --git a/pkg/tui/components/message/message.go b/pkg/tui/components/message/message.go
index 96e50a50e..bd1f35237 100644
--- a/pkg/tui/components/message/message.go
+++ b/pkg/tui/components/message/message.go
@@ -27,6 +27,16 @@ type Model interface {
 	SetSelected(selected bool)
 	SetHovered(hovered bool)
 	CodeBlocks() []markdown.CodeBlock
+	// Finalize releases per-message render state that is only needed while the
+	// message is actively streaming. The message content and code-block metadata
+	// are preserved; calling View() afterwards still produces correct output
+	// without retaining a per-view render cache or IncrementalRenderer.
+	Finalize()
+	// HasLiveRenderState reports whether this view currently retains a
+	// populated renderCache or an IncrementalRenderer instance. Used by tests
+	// to assert that finalized views have actually released their per-message
+	// render state without reaching into unexported fields via reflection.
+	HasLiveRenderState() bool
 }
 
 // messageModel implements Model
@@ -59,6 +69,14 @@ type messageModel struct {
 	// streamed-in chunks only re-render the trailing block instead of the whole
 	// accumulated markdown each time.
 	mdRenderer *markdown.IncrementalRenderer
+
+	// finalized is set by Finalize() once the message is no longer the active
+	// streaming view. After it is set, Render() still produces correct output,
+	// but does not store anything in renderCache and does not retain an
+	// IncrementalRenderer between calls — both are pure caches whose memory
+	// dominates a long session, and they are not worth keeping for messages
+	// that are unlikely to be re-rendered hot.
+	finalized bool
 }
 
 // renderCache stores the most recent Render result keyed by the inputs that
@@ -99,6 +117,11 @@ func (mv *messageModel) Init() tea.Cmd {
 }
 
 func (mv *messageModel) SetMessage(msg *types.Message) {
+	// Un-finalize when the underlying message is changed (e.g. streaming
+	// resumes into this view). Finalize is meant for views that have
+	// permanently lost their actively-streaming status; mutating the message
+	// re-arms the per-message caches so subsequent renders are fast again.
+	mv.finalized = false
 	// If the new content is not an extension of the previous one (different
 	// message, or the message was edited), drop the IncrementalRenderer's
 	// cached prefix so its memory is released immediately rather than on the
@@ -177,7 +200,11 @@ func (mv *messageModel) Render(width int) string {
 	// MessageTypeAssistant placeholder) animate on every tick, so the result is
 	// not cacheable. Everything else is a pure function of the inputs tracked in
 	// renderCache below.
-	cacheable := !mv.isSpinnerDriven()
+	// Spinner-driven messages animate every tick and are not cacheable.
+	// Finalized messages skip writing into renderCache so the per-view
+	// retained ANSI string does not pile up across long sessions; the chat
+	// list's bounded LRU still memoizes their rendered output.
+	cacheable := !mv.isSpinnerDriven() && !mv.finalized
 	if cacheable {
 		c := &mv.renderCache
 		if c.valid &&
@@ -376,10 +403,22 @@ func (mv *messageModel) render(width int) string {
 // so each new chunk only re-parses the trailing region. The first render at a
 // given width is equivalent to a fresh full render.
 //
+// For finalized messages we use a transient renderer that is discarded after
+// each call. Finalized messages are no longer streamed, so the prefix-cache
+// inside an IncrementalRenderer is not earning its keep — keeping it resident
+// across the lifetime of every historical message in a session is the
+// dominant source of retained memory in long sessions. The parent message
+// list's bounded rendered-item LRU can still memoize finalized message output
+// without storing an additional per-view copy.
+//
 // It also returns the list of fenced code blocks emitted by the renderer so
 // that callers can map clicks on the per-block copy affordance back to the
 // underlying raw code.
 func (mv *messageModel) renderAssistantMarkdown(content string, width int) (string, []markdown.CodeBlock, error) {
+	if mv.finalized {
+		r := markdown.NewIncrementalRenderer(width)
+		return r.RenderWithCodeBlocks(content)
+	}
 	if mv.mdRenderer == nil {
 		mv.mdRenderer = markdown.NewIncrementalRenderer(width)
 	} else {
@@ -442,6 +481,54 @@ func (mv *messageModel) StopAnimation() {
 	}
 }
 
+// Finalize releases per-message render state that no longer needs to be kept
+// resident once the message is no longer the actively streaming view. This is
+// called by the parent message list when a new top-level message arrives, and
+// for every historical view loaded from a session.
+//
+// Finalize is a no-op for non-assistant message types: only assistant views
+// allocate an IncrementalRenderer and accumulate large rendered ANSI strings
+// during streaming, so user messages, tool calls, error/welcome banners and
+// the like have nothing to release. Setting `finalized = true` on those views
+// would only have the side-effect of permanently disabling renderCache for
+// selected/hovered states (which bypass the parent's bounded LRU), forcing a
+// fresh re-render on every animation tick. Restricting the disable to
+// assistant views keeps the leak fix scoped to the type that actually leaks.
+//
+// The retained payload of an assistant view is dominated by the renderCache
+// (a copy of the rendered ANSI string) and the IncrementalRenderer's internal
+// caches (last rendered prefix, glamour AST state). Both are pure render
+// state — they can be regenerated from mv.message on demand. We deliberately
+// leave mv.message, mv.codeBlocks and the spinner untouched so that View()
+// keeps returning correct output, click-targeting on code blocks still works,
+// and the spinner-driven types continue to animate.
+//
+// Finalize is idempotent and durable: subsequent renders do not re-populate
+// renderCache or store an IncrementalRenderer on the struct. This is
+// important because the parent message list invalidates its own LRU on
+// several events (spinner removal, theme change, window resize) and would
+// otherwise re-render every previously finalized view, putting the per-
+// message render state right back where it was.
+func (mv *messageModel) Finalize() {
+	if mv.message == nil || mv.message.Type != types.MessageTypeAssistant {
+		return
+	}
+	mv.renderCache = renderCache{}
+	if mv.mdRenderer != nil {
+		mv.mdRenderer.Reset()
+		mv.mdRenderer = nil
+	}
+	mv.finalized = true
+}
+
+// HasLiveRenderState reports whether this view still retains per-message
+// render state — either a populated renderCache or an IncrementalRenderer
+// instance. Used as a structural assertion in regression tests that verify
+// Finalize() actually released what it was supposed to release.
+func (mv *messageModel) HasLiveRenderState() bool {
+	return mv.renderCache.result != "" || mv.mdRenderer != nil
+}
+
 // SetSize sets the dimensions of the message view
 func (mv *messageModel) SetSize(width, height int) tea.Cmd {
 	if mv.width != width {
diff --git a/pkg/tui/components/messages/leak_test.go b/pkg/tui/components/messages/leak_test.go
new file mode 100644
index 000000000..a0a980979
--- /dev/null
+++ b/pkg/tui/components/messages/leak_test.go
@@ -0,0 +1,169 @@
+package messages
+
+import (
+	"fmt"
+	"runtime"
+	"strings"
+	"testing"
+
+	"github.com/docker/docker-agent/pkg/tui/components/message"
+	"github.com/docker/docker-agent/pkg/tui/core/layout"
+	"github.com/docker/docker-agent/pkg/tui/service"
+)
+
+// TestLongSessionDoesNotRetainPerMessageRenderState streams a long sequence
+// of large assistant messages through the same public API the TUI uses
+// (AddUserMessage / AddAssistantMessage / AppendToLastMessage), rendering the
+// active assistant view between chunks so messageModel.renderCache and
+// IncrementalRenderer state are exercised. It asserts two things:
+//
+//  1. Heap growth stays far below the pre-fix slope. Before the fix every
+//     assistant view kept its own renderCache (~70 KB rendered ANSI) and an
+//     IncrementalRenderer (cached prefix + rendered output) for a measured
+//     ~161 KB of retained state per message. After the fix, per-message render
+//     state is released as soon as the message stops being the active
+//     streaming view, so the remaining slope is mostly the raw session content.
+//
+//  2. The structural invariant: after every non-active assistant view has
+//     been finalized, only the single actively-streaming view (at most one)
+//     should be holding a non-empty renderCache or a live IncrementalRenderer.
+//
+// Run with -short to skip; the test is deterministic but takes a few seconds.
+func TestLongSessionDoesNotRetainPerMessageRenderState(t *testing.T) {
+	if testing.Short() {
+		t.Skip("skipping leak test in -short mode")
+	}
+
+	// `go test` may reset runtime.MemProfileRate when its own -memprofile flag
+	// is unset, which would override the value set in init().
+	runtime.MemProfileRate = 1
+
+	const (
+		warmupMessages  = 200
+		measureMessages = 200
+		chunkSize       = 256
+		messageBytes    = 8192
+		viewWidth       = 120
+		viewHeight      = 40
+
+		// Per-message heap budget for the retained slope. Pre-fix slope was
+		// ~161 KB/msg and did not flatten. The remaining post-fix slope is mostly
+		// the original message content and small per-view bookkeeping.
+		maxGrowthPerMessageBytes = 30 * 1024
+	)
+
+	sessionState := &service.SessionState{}
+	m := NewScrollableView(viewWidth, viewHeight, sessionState).(*model)
+	m.SetSize(viewWidth, viewHeight)
+
+	body := buildMarkdownBody(messageBytes)
+
+	heapInuse := func() uint64 {
+		runtime.GC()
+		runtime.GC() // second pass to settle finalizers
+		var ms runtime.MemStats
+		runtime.ReadMemStats(&ms)
+		return ms.HeapInuse
+	}
+
+	renderLastView := func() {
+		if len(m.views) == 0 {
+			return
+		}
+		_ = m.views[len(m.views)-1].View()
+	}
+
+	streamMessage := func(i int) {
+		m.AddUserMessage(fmt.Sprintf("user message %d", i))
+		m.AddAssistantMessage()
+		for off := 0; off < len(body); off += chunkSize {
+			end := min(off+chunkSize, len(body))
+			m.AppendToLastMessage("root", body[off:end])
+			if (off/chunkSize)%4 == 0 {
+				renderLastView()
+			}
+		}
+		renderLastView()
+	}
+
+	// Warmup: stream enough messages to saturate the chat-list LRU.
+	for i := range warmupMessages {
+		streamMessage(i)
+	}
+	afterWarmup := heapInuse()
+
+	// Measurement: stream another batch and observe the slope.
+	for i := range measureMessages {
+		streamMessage(warmupMessages + i)
+	}
+	afterMeasure := heapInuse()
+
+	growth := max(int64(afterMeasure)-int64(afterWarmup), 0)
+	perMessage := growth / measureMessages
+
+	t.Logf("warmup_heap=%d KB measure_heap=%d KB delta=%d KB perMsg=%d KB (budget=%d KB)",
+		afterWarmup/1024, afterMeasure/1024, growth/1024, perMessage/1024,
+		maxGrowthPerMessageBytes/1024)
+
+	live := countLiveRenderState(t, m.views)
+	t.Logf("views=%d live render-state holders=%d (LRU cap=%d, current=%d)",
+		len(m.views), live, renderedItemsCacheSize, m.renderedItems.Len())
+
+	if perMessage > maxGrowthPerMessageBytes {
+		t.Fatalf("heap growth %d B/message exceeds budget %d B/message — "+
+			"per-message render state appears to be retained after finalization",
+			perMessage, maxGrowthPerMessageBytes)
+	}
+
+	if live > 2 {
+		t.Fatalf("expected at most 2 messageModels to hold render-cache state "+
+			"(the active streaming view), got %d out of %d views",
+			live, len(m.views))
+	}
+}
+
+// countLiveRenderState walks the view list and reports how many message.Model
+// views still hold per-message render state — a populated renderCache or an
+// IncrementalRenderer. Uses the exported HasLiveRenderState() so the test
+// does not depend on reflection across package boundaries.
+func countLiveRenderState(t *testing.T, views []layout.Model) int {
+	t.Helper()
+	var live int
+	for _, v := range views {
+		mv, ok := v.(message.Model)
+		if !ok {
+			continue
+		}
+		if mv.HasLiveRenderState() {
+			live++
+		}
+	}
+	return live
+}
+
+// buildMarkdownBody returns a deterministic markdown blob of approximately
+// the requested size. The mix of prose paragraphs and a fenced code block
+// exercises both stable-boundary detection and code-block caching in the
+// IncrementalRenderer (the two structures called out in the leak hypothesis).
+func buildMarkdownBody(targetBytes int) string {
+	const paragraph = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. " +
+		"Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. " +
+		"Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris " +
+		"nisi ut aliquip ex ea commodo consequat.\n\n"
+	const codeBlock = "```go\n" +
+		"func example(x int) int {\n" +
+		"    // some representative code\n" +
+		"    return x * 2\n" +
+		"}\n" +
+		"```\n\n"
+
+	var b strings.Builder
+	b.Grow(targetBytes + len(paragraph))
+	for b.Len() < targetBytes {
+		b.WriteString(paragraph)
+		if b.Len()%3 == 0 {
+			b.WriteString(codeBlock)
+		}
+	}
+	return b.String()
+}
diff --git a/pkg/tui/components/messages/messages.go b/pkg/tui/components/messages/messages.go
index c5c5748f8..390ab50d3 100644
--- a/pkg/tui/components/messages/messages.go
+++ b/pkg/tui/components/messages/messages.go
@@ -1184,6 +1184,21 @@ func (m *model) invalidateAllItems() {
 	m.renderDirty = true
 }
 
+// finalizePreviousMessageView releases per-message render state on the most
+// recent message.Model view (if any) before a new top-level entry is
+// appended. The renderCache and IncrementalRenderer are pure caches — dropping
+// them prevents long sessions from accumulating O(N) retained render state
+// for messages that are no longer streaming, while View() lazily rebuilds
+// them if the message is ever re-rendered.
+func (m *model) finalizePreviousMessageView() {
+	if len(m.views) == 0 {
+		return
+	}
+	if mv, ok := m.views[len(m.views)-1].(message.Model); ok {
+		mv.Finalize()
+	}
+}
+
 // Message management methods
 func (m *model) AddUserMessage(content string) tea.Cmd {
 	return m.addMessage(types.User(content))
@@ -1226,6 +1241,7 @@ func (m *model) AddAssistantMessage() tea.Cmd {
 }
 
 func (m *model) AddCancelledMessage() tea.Cmd {
+	m.finalizePreviousMessageView()
 	msg := types.Cancelled()
 	m.messages = append(m.messages, msg)
 	view := m.createMessageView(msg)
@@ -1250,6 +1266,7 @@ func (m *model) addMessage(msg *types.Message) tea.Cmd {
 	m.clearSelection()
 	shouldAutoScroll := !m.userHasScrolled
 
+	m.finalizePreviousMessageView()
 	m.messages = append(m.messages, msg)
 	view := m.createMessageView(msg)
 	m.sessionState.SetPreviousMessage(msg)
@@ -1410,6 +1427,16 @@ func (m *model) LoadFromSession(sess *session.Session) tea.Cmd {
 		cmds = append(cmds, view.Init())
 	}
 
+	// Finalize all but the last message.Model view: historical assistant
+	// messages will only ever be re-rendered on demand, so there is no need
+	// to keep their renderCache or IncrementalRenderer state resident. The
+	// most recent view is left untouched in case streaming continues into it.
+	for i := range len(m.views) - 1 {
+		if mv, ok := m.views[i].(message.Model); ok {
+			mv.Finalize()
+		}
+	}
+
 	cmds = append(cmds, m.ScrollToBottom())
 	return tea.Batch(cmds...)
 }
@@ -1460,6 +1487,7 @@ func (m *model) AddOrUpdateToolCall(agentName string, toolCall tools.ToolCall, t
 	}
 
 	// Otherwise create a standalone tool call message
+	m.finalizePreviousMessageView()
 	msg := types.ToolCallMessage(agentName, toolCall, toolDef, status)
 	m.messages = append(m.messages, msg)
 	view := m.createToolCallView(msg)
@@ -1546,6 +1574,14 @@ func (m *model) AppendReasoning(agentName, content string) tea.Cmd {
 }
 
 // addReasoningBlock creates a new reasoning block message.
+//
+// Reasoning blocks routinely interleave with an actively streaming assistant
+// turn (the LLM emits a thought, then resumes content). Finalizing the
+// previous view here would drop the renderCache and IncrementalRenderer of
+// a message the user is still watching, and the very next chunk would have
+// to rebuild them from scratch via the transient renderer path. The next
+// non-reasoning entry (user message, tool call, error) will finalize via
+// addMessage when the streaming turn actually ends.
 func (m *model) addReasoningBlock(agentName, content string) tea.Cmd {
 	m.clearSelection()
 	shouldAutoScroll := !m.userHasScrolled
@@ -1670,7 +1706,20 @@ func (m *model) removeSpinner() {
 			m.views = m.views[:lastIdx]
 		}
 		m.messages = m.messages[:lastIdx]
-		m.invalidateAllItems()
+		// The spinner is always at the tail, so other indices are unchanged
+		// and their cached entries remain valid. Only the joined renderedLines
+		// references the now-removed spinner, so we drop it and force a rejoin
+		// on the next render. The LRU itself never held a spinner entry
+		// (shouldCacheMessage returns false for spinner-driven types).
+		// Avoiding invalidateAllItems here is essential for long sessions:
+		// it would otherwise wipe up to 500 cached renderings once per
+		// assistant turn, forcing every previous message to be re-parsed
+		// from markdown on the next render.
+		m.renderedLines = nil
+		m.lineOffsets = nil
+		m.totalHeight = 0
+		m.urlSpans.clear()
+		m.renderDirty = true
 	}
 }