Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ Result<EValue> VisionEncoder::encode(const MultimodalInput &input) {

auto it = embedding_cache_.find(path);
if (it != embedding_cache_.end()) {
return it->second;
return EValue(*it->second.tensor);
}

auto shape = ET_UNWRAP(getInputShape());
Expand All @@ -128,9 +128,18 @@ Result<EValue> VisionEncoder::encode(const MultimodalInput &input) {
chw.data(), sizes, ::executorch::aten::ScalarType::Float);

auto result = ET_UNWRAP(module_->execute(kVisionEncoderMethod, image_tensor));
auto embedding = result[0];
embedding_cache_.emplace(path, embedding);
return embedding;
auto out_tensor = result[0].toTensor();

CachedEmbedding cached;
cached.bytes.resize(out_tensor.nbytes());
std::memcpy(cached.bytes.data(), out_tensor.const_data_ptr(),
out_tensor.nbytes());
cached.sizes.assign(out_tensor.sizes().begin(), out_tensor.sizes().end());
cached.dtype = out_tensor.scalar_type();
auto [entry, inserted] = embedding_cache_.emplace(path, std::move(cached));
entry->second.tensor = ::executorch::extension::from_blob(
entry->second.bytes.data(), entry->second.sizes, entry->second.dtype);
return EValue(*entry->second.tensor);
}

} // namespace executorch::extension::llm
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,14 @@
#pragma once

#include "iencoder.h"
#include <cstdint>
#include <executorch/extension/module/module.h>
#include <executorch/extension/tensor/tensor.h>
#include <executorch/runtime/core/evalue.h>
#include <runner/multimodal_input.h>
#include <string>
#include <unordered_map>
#include <vector>

namespace executorch::extension::llm {

Expand All @@ -26,13 +29,23 @@ class VisionEncoder : public IEncoder {
bool with_batch;
};

// The method's output EValue aliases the runtime's reusable output buffer,
// which the NEXT vision_encoder.execute() overwrites — caching it directly
// silently turns earlier images into the most recently encoded one. Cache
// an owned byte snapshot instead and hand out a tensor over those bytes.
//
// One CachedEmbedding is stored per image path in embedding_cache_; encode()
// fills bytes/sizes/dtype from the encoder output, inserts the entry, and
// only then builds `tensor` over the stored entry's bytes.
//
// NOTE(review): `tensor` is created from `bytes.data()`, so it presumably
// views that buffer rather than owning a copy — confirm against from_blob().
// If so, copying a CachedEmbedding (or resizing `bytes`) after `tensor` is
// set would leave `tensor` pointing at storage it does not track.
struct CachedEmbedding {
// Owned copy of the encoder output's raw data (out_tensor.nbytes() bytes).
std::vector<uint8_t> bytes;
// Dimensions copied from the encoder output tensor.
std::vector<::executorch::aten::SizesType> sizes;
// Element type copied from the encoder output tensor. Has no default
// initializer here; encode() assigns it before it is read.
::executorch::aten::ScalarType dtype;
// Tensor built by from_blob() from bytes/sizes/dtype; encode() returns
// EValue(*tensor) both on a fresh encode and on a cache hit.
::executorch::extension::TensorPtr tensor;
};

::executorch::runtime::Result<ImageShape> getInputShape() const;
std::vector<float> preprocessImage(const std::string &path,
const ImageShape &targetShape) const;

::executorch::extension::Module *module_;
std::unordered_map<std::string, ::executorch::runtime::EValue>
embedding_cache_;
std::unordered_map<std::string, CachedEmbedding> embedding_cache_;
};

} // namespace executorch::extension::llm
Loading