diff --git a/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp b/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp index 59fee53e11..09fb459661 100644 --- a/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp +++ b/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp @@ -112,7 +112,7 @@ Result VisionEncoder::encode(const MultimodalInput &input) { auto it = embedding_cache_.find(path); if (it != embedding_cache_.end()) { - return it->second; + return EValue(*it->second.tensor); } auto shape = ET_UNWRAP(getInputShape()); @@ -128,9 +128,18 @@ Result VisionEncoder::encode(const MultimodalInput &input) { chw.data(), sizes, ::executorch::aten::ScalarType::Float); auto result = ET_UNWRAP(module_->execute(kVisionEncoderMethod, image_tensor)); - auto embedding = result[0]; - embedding_cache_.emplace(path, embedding); - return embedding; + auto out_tensor = result[0].toTensor(); + + CachedEmbedding cached; + cached.bytes.resize(out_tensor.nbytes()); + std::memcpy(cached.bytes.data(), out_tensor.const_data_ptr(), + out_tensor.nbytes()); + cached.sizes.assign(out_tensor.sizes().begin(), out_tensor.sizes().end()); + cached.dtype = out_tensor.scalar_type(); + auto [entry, inserted] = embedding_cache_.emplace(path, std::move(cached)); + entry->second.tensor = ::executorch::extension::from_blob( + entry->second.bytes.data(), entry->second.sizes, entry->second.dtype); + return EValue(*entry->second.tensor); } } // namespace executorch::extension::llm diff --git a/packages/react-native-executorch/common/runner/encoders/vision_encoder.h b/packages/react-native-executorch/common/runner/encoders/vision_encoder.h index bb8a8421b9..54d43bb869 100644 --- a/packages/react-native-executorch/common/runner/encoders/vision_encoder.h +++ b/packages/react-native-executorch/common/runner/encoders/vision_encoder.h @@ -2,11 +2,14 @@ #pragma once #include "iencoder.h" +#include #include +#include #include #include #include #include +#include namespace executorch::extension::llm { @@ -26,13 +29,23 @@ class VisionEncoder : public IEncoder { bool with_batch; }; + // The method's output EValue aliases the runtime's reusable output buffer, + // which the NEXT vision_encoder.execute() overwrites — caching it directly + // silently turns earlier images into the most recently encoded one. Cache + // an owned byte snapshot instead and hand out a tensor over those bytes. + struct CachedEmbedding { + std::vector bytes; + std::vector<::executorch::aten::SizesType> sizes; + ::executorch::aten::ScalarType dtype; + ::executorch::extension::TensorPtr tensor; + }; + ::executorch::runtime::Result getInputShape() const; std::vector preprocessImage(const std::string &path, const ImageShape &targetShape) const; ::executorch::extension::Module *module_; - std::unordered_map - embedding_cache_; + std::unordered_map embedding_cache_; }; } // namespace executorch::extension::llm