From c6a9db0c0aa80b729b91daa924a7593bc0f841e2 Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Thu, 11 Jun 2026 18:47:56 +0200
Subject: [PATCH] fix(llm): snapshot vision encoder output before caching
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The per-image embedding cache stored the EValue returned by
vision_encoder.execute(), whose tensor aliases the method's reusable
output buffer. The next execute() overwrites that buffer, so in any
conversation with more than one image every cached entry silently
became the most recently encoded image — the model would describe the
first picture as the second one on re-prefilled turns. The audio path
already snapshots its encoder output for exactly this reason; do the
same for vision: copy the output bytes into the cache entry and serve
cache hits from a tensor over the owned bytes.

Authored with Claude.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../common/runner/encoders/vision_encoder.cpp   | 17 +++++++++++++----
 .../common/runner/encoders/vision_encoder.h     | 17 +++++++++++++++--
 2 files changed, 28 insertions(+), 6 deletions(-)
diff --git a/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp b/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp
index 59fee53e11..09fb459661 100644
--- a/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp
+++ b/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp
@@ -112,7 +112,7 @@ Result<EValue> VisionEncoder::encode(const MultimodalInput &input) {
 
   auto it = embedding_cache_.find(path);
   if (it != embedding_cache_.end()) {
-    return it->second;
+    return EValue(*it->second.tensor);
   }
 
   auto shape = ET_UNWRAP(getInputShape());
@@ -128,9 +128,18 @@ Result<EValue> VisionEncoder::encode(const MultimodalInput &input) {
       chw.data(), sizes, ::executorch::aten::ScalarType::Float);
 
   auto result = ET_UNWRAP(module_->execute(kVisionEncoderMethod, image_tensor));
-  auto embedding = result[0];
-  embedding_cache_.emplace(path, embedding);
-  return embedding;
+  auto out_tensor = result[0].toTensor();
+
+  CachedEmbedding cached;
+  cached.bytes.resize(out_tensor.nbytes());
+  std::memcpy(cached.bytes.data(), out_tensor.const_data_ptr(),
+              out_tensor.nbytes());
+  cached.sizes.assign(out_tensor.sizes().begin(), out_tensor.sizes().end());
+  cached.dtype = out_tensor.scalar_type();
+  auto [entry, inserted] = embedding_cache_.emplace(path, std::move(cached));
+  entry->second.tensor = ::executorch::extension::from_blob(
+      entry->second.bytes.data(), entry->second.sizes, entry->second.dtype);
+  return EValue(*entry->second.tensor);
 }
 
 } // namespace executorch::extension::llm
diff --git a/packages/react-native-executorch/common/runner/encoders/vision_encoder.h b/packages/react-native-executorch/common/runner/encoders/vision_encoder.h
index bb8a8421b9..54d43bb869 100644
--- a/packages/react-native-executorch/common/runner/encoders/vision_encoder.h
+++ b/packages/react-native-executorch/common/runner/encoders/vision_encoder.h
@@ -2,11 +2,14 @@
 #pragma once
 
 #include "iencoder.h"
+#include <cstdint>
 #include <executorch/extension/module/module.h>
+#include <executorch/extension/tensor/tensor.h>
 #include <executorch/runtime/core/evalue.h>
 #include <runner/multimodal_input.h>
 #include <string>
 #include <unordered_map>
+#include <vector>
 
 namespace executorch::extension::llm {
 
@@ -26,13 +29,23 @@ class VisionEncoder : public IEncoder {
     bool with_batch;
   };
 
+  // The method's output EValue aliases the runtime's reusable output buffer,
+  // which the NEXT vision_encoder.execute() overwrites — caching it directly
+  // silently turns earlier images into the most recently encoded one. Cache
+  // an owned byte snapshot instead and hand out a tensor over those bytes.
+  struct CachedEmbedding {
+    std::vector<uint8_t> bytes; // owned copy of the encoder output data
+    std::vector<::executorch::aten::SizesType> sizes; // output tensor sizes
+    ::executorch::aten::ScalarType dtype; // output tensor element type
+    ::executorch::extension::TensorPtr tensor; // tensor over `bytes`
+  };
+
   ::executorch::runtime::Result<ImageShape> getInputShape() const;
   std::vector<float> preprocessImage(const std::string &path,
                                      const ImageShape &targetShape) const;
 
   ::executorch::extension::Module *module_;
-  std::unordered_map<std::string, ::executorch::runtime::EValue>
-      embedding_cache_;
+  std::unordered_map<std::string, CachedEmbedding> embedding_cache_;
 };
 
 } // namespace executorch::extension::llm