From c6a9db0c0aa80b729b91daa924a7593bc0f841e2 Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Thu, 11 Jun 2026 18:47:56 +0200 Subject: [PATCH] fix(llm): snapshot vision encoder output before caching MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The per-image embedding cache stored the EValue returned by vision_encoder.execute(), whose tensor aliases the method's reusable output buffer. The next execute() overwrites that buffer, so in any conversation with more than one image every cached entry silently became the most recently encoded image — the model would describe the first picture as the second one on re-prefilled turns. The audio path already snapshots its encoder output for exactly this reason; do the same for vision: copy the output bytes into the cache entry and serve cache hits from a tensor over the owned bytes. Authored with Claude. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../common/runner/encoders/vision_encoder.cpp | 17 +++++++++++++---- .../common/runner/encoders/vision_encoder.h | 17 +++++++++++++++-- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp b/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp index 59fee53e11..09fb459661 100644 --- a/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp +++ b/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp @@ -112,7 +112,7 @@ Result VisionEncoder::encode(const MultimodalInput &input) { auto it = embedding_cache_.find(path); if (it != embedding_cache_.end()) { - return it->second; + return EValue(*it->second.tensor); } auto shape = ET_UNWRAP(getInputShape()); @@ -128,9 +128,18 @@ Result VisionEncoder::encode(const MultimodalInput &input) { chw.data(), sizes, ::executorch::aten::ScalarType::Float); auto result = ET_UNWRAP(module_->execute(kVisionEncoderMethod, image_tensor)); - auto embedding = result[0]; - embedding_cache_.emplace(path, embedding); - return embedding; + auto out_tensor = result[0].toTensor(); + + CachedEmbedding cached; + cached.bytes.resize(out_tensor.nbytes()); + std::memcpy(cached.bytes.data(), out_tensor.const_data_ptr(), + out_tensor.nbytes()); + cached.sizes.assign(out_tensor.sizes().begin(), out_tensor.sizes().end()); + cached.dtype = out_tensor.scalar_type(); + auto [entry, inserted] = embedding_cache_.emplace(path, std::move(cached)); + entry->second.tensor = ::executorch::extension::from_blob( + entry->second.bytes.data(), entry->second.sizes, entry->second.dtype); + return EValue(*entry->second.tensor); } } // namespace executorch::extension::llm diff --git a/packages/react-native-executorch/common/runner/encoders/vision_encoder.h b/packages/react-native-executorch/common/runner/encoders/vision_encoder.h index bb8a8421b9..54d43bb869 100644 --- a/packages/react-native-executorch/common/runner/encoders/vision_encoder.h +++ b/packages/react-native-executorch/common/runner/encoders/vision_encoder.h @@ -2,11 +2,14 @@ #pragma once #include "iencoder.h" +#include #include +#include #include #include #include #include +#include namespace executorch::extension::llm { @@ -26,13 +29,23 @@ class VisionEncoder : public IEncoder { bool with_batch; }; + // The method's output EValue aliases the runtime's reusable output buffer, + // which the NEXT vision_encoder.execute() overwrites — caching it directly + // silently turns earlier images into the most recently encoded one. Cache + // an owned byte snapshot instead and hand out a tensor over those bytes. + struct CachedEmbedding { + std::vector bytes; + std::vector<::executorch::aten::SizesType> sizes; + ::executorch::aten::ScalarType dtype; + ::executorch::extension::TensorPtr tensor; + }; + ::executorch::runtime::Result getInputShape() const; std::vector preprocessImage(const std::string &path, const ImageShape &targetShape) const; ::executorch::extension::Module *module_; - std::unordered_map - embedding_cache_; + std::unordered_map embedding_cache_; }; } // namespace executorch::extension::llm