Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .eslintrc.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ const VALID_CATEGORIES = [
'Models - Image Embeddings',
'Models - Image Generation',
'Models - LLM',
'Models - LLM Multimodal',
'Models - Object Detection',
'Models - Instance Segmentation',
'Models - Pose Estimation',
Expand Down
7 changes: 5 additions & 2 deletions apps/llm/components/llmModels.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,11 @@ const llm = models.llm;
export type LLMModelSources = LLMProps['model'];

export const LLM_MODELS: ModelOption<LLMModelSources>[] = [
// Gemma4
{ label: 'Gemma4 E2B', value: llm.gemma4_e2b() },
// Gemma 4
{
label: 'Gemma 4 E2B',
value: llm.gemma4_e2b(),
},
// Llama 3.2
{
label: 'Llama 3.2 1B',
Expand Down
Binary file modified packages/react-native-executorch/android/libs/classes.jar
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,11 @@ namespace llm {

TextPrefiller::TextPrefiller(TextDecoderRunner *text_decoder_runner,
bool use_kv_cache, bool enable_parallel_prefill,
int64_t max_seq_len)
int64_t max_seq_len, int32_t prefill_chunk_size)
: text_decoder_runner_(text_decoder_runner), use_kv_cache_(use_kv_cache),
enable_parallel_prefill_(enable_parallel_prefill),
max_seq_len_(max_seq_len > 0 ? max_seq_len : 128) {}
max_seq_len_(max_seq_len > 0 ? max_seq_len : 128),
prefill_chunk_size_(prefill_chunk_size) {}

::executorch::runtime::Result<uint64_t>
TextPrefiller::prefill(std::vector<uint64_t> &prompt_tokens,
Expand All @@ -31,11 +32,10 @@ TextPrefiller::prefill(std::vector<uint64_t> &prompt_tokens,
ET_CHECK_OK_OR_RETURN_ERROR(text_decoder_runner_->load());
}

// Check if we need to chunk the prompt tokens
int32_t num_prompt_tokens = prompt_tokens.size();
const int32_t chunk_size = static_cast<int32_t>(max_seq_len_);
int32_t chunk_size =
prefill_chunk_size_ > 0 ? prefill_chunk_size_ : max_seq_len_;

// If prompt tokens exceed chunk_size, we need to chunk them
if (num_prompt_tokens > chunk_size) {
uint64_t cur_token = 0;
int num_tokens_to_process = 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,14 @@ namespace llm {

class TextPrefiller {
public:
// prefill_chunk_size: when > 0, a prompt longer than this many tokens is
// processed in chunks of this size (see prefill()). Set to the model's forward
// sequence-length cap for the MLX backend (its forward is exported with a
// sliding-window bound and one-shot prefill spikes Metal memory). Other
// backends (e.g. XNNPACK/CoreML) pass 0 → original behavior, where the chunk
// size falls back to max_seq_len.
TextPrefiller(TextDecoderRunner *text_decoder_runner, bool use_kv_cache,
bool enable_parallel_prefill, int64_t max_seq_len = 128);
bool enable_parallel_prefill, int64_t max_seq_len = 128,
int32_t prefill_chunk_size = 0);

virtual ~TextPrefiller() = default;
/**
Expand Down Expand Up @@ -70,6 +76,7 @@ class TextPrefiller {
bool use_kv_cache_;
bool enable_parallel_prefill_;
int64_t max_seq_len_;
int32_t prefill_chunk_size_;
};

} // namespace llm
Expand Down
29 changes: 22 additions & 7 deletions packages/react-native-executorch/common/runner/text_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,24 @@ Error TextRunner::load_subcomponents() {

Stats *stats_ptr = &stats_;

text_decoder_runner_ = std::make_unique<TextDecoderRunner>(
*module_, io_manager_.get(), config_);
text_decoder_runner_ =
std::make_unique<TextDecoderRunner>(*module_, io_manager_.get(), config_);

int32_t prefill_chunk_size = 0;
auto fwd_meta = module_->method_meta("forward");
if (fwd_meta.ok() && fwd_meta->uses_backend("MLXBackend")) {
auto input_meta = fwd_meta->input_tensor_meta(0);
if (input_meta.ok()) {
auto sizes = input_meta->sizes();
if (sizes.size() >= 2 && sizes[sizes.size() - 1] > 0) {
prefill_chunk_size = sizes[sizes.size() - 1];
}
}
}

text_prefiller_ = std::make_unique<TextPrefiller>(
text_decoder_runner_.get(), config_.enable_kv_cache,
config_.enable_dynamic_shape, config_.max_seq_len);
config_.enable_dynamic_shape, config_.max_seq_len, prefill_chunk_size);
text_token_generator_ = std::make_unique<TextTokenGenerator>(
tokenizer_.get(), text_decoder_runner_.get(), config_.enable_kv_cache,
std::move(eos_ids_), stats_ptr, config_);
Expand Down Expand Up @@ -83,10 +96,12 @@ Error TextRunner::generate_internal(
std::vector<uint64_t> prompt_tokens = encodeResult.get();
int num_prompt_tokens = prompt_tokens.size();

// For dynamic-shape PTEs (Gemma4 iter*), get_max_seq_len is the per-call
// decoder chunk size (e.g. 128) and the true generation budget lives in
// get_max_context_len. Static-shape PTEs set both equal, so this collapses
// to the old behavior. Mirrors multimodal_prefiller.cpp:96.
// For dynamic-shape PTEs (e.g. Gemma4 MLX/Vulkan), get_max_seq_len is the
// per-call decoder chunk size (e.g. the sliding window) and the real
// generation budget lives in get_max_context_len. Static-shape PTEs set both
// equal, so this collapses to the old behavior. Without this the budget is
// computed from the small chunk size, so max_new_tokens can resolve to ~0 and
// generation ends immediately after prefill.
const int32_t seq_cap = config_.enable_dynamic_shape
? config_.max_context_length
: config_.max_seq_len;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,12 @@ Pod::Spec.new do |s|

s.libraries = "z"
s.ios.vendored_frameworks = "third-party/ios/ExecutorchLib.xcframework"

# NOTE: mlx.metallib (the MLX GPU kernels) is bundled INSIDE
# ExecutorchLib.framework, colocated with the binary that contains the MLX
# code. MLX's runtime loader resolves the metallib relative to that binary
# (via dladdr), so it must live next to it in the framework — not at the app
# bundle root.
# Exclude the test files so that no gtest dependency is introduced.
# Do not include the headers from common/rnexecutorch/jsi/ as source files.
# Xcode/Cocoapods leaks them to other pods that an app also depends on, so if
Expand Down
36 changes: 33 additions & 3 deletions packages/react-native-executorch/src/constants/modelRegistry.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ import { RnExecutorchErrorCode } from '../errors/ErrorCodes';
* compile-time error.
* @category Utils
*/
export type Backend = 'xnnpack' | 'coreml' | 'vulkan' | 'qnn';
export type Backend = 'xnnpack' | 'coreml' | 'vulkan' | 'qnn' | 'mlx';

/**
* Options for a `models` accessor call.
Expand Down Expand Up @@ -78,7 +78,7 @@ type ConfigOf<V> = Extract<
>;
type BackendsOf<V> = Extract<keyof V, Backend>;

const BACKEND_ORDER: Backend[] = ['xnnpack', 'coreml', 'vulkan', 'qnn'];
const BACKEND_ORDER: Backend[] = ['xnnpack', 'coreml', 'mlx', 'vulkan', 'qnn'];

function firstBackend(variants: AnyVariantMap): Backend {
for (const b of BACKEND_ORDER) {
Expand Down Expand Up @@ -181,6 +181,33 @@ function tts<C extends TextToSpeechModelConfig>(c: C): () => C {
// Per-backend variant maps for models that ship more than one backend.
// ─────────────────────────────────────────────────────────────────────────────

// Per-backend variant map for Gemma 4 E2B (mlx, xnnpack, vulkan). Every entry
// uses the same `modelName`, tokenizer and tokenizer-config sources; only
// `modelSource` differs, pointing at the .pte file exported for that backend.
// Passed to `variant(...)` for the `gemma4_e2b` accessor together with
// `{ ios: 'mlx', android: 'vulkan' }` — presumably the per-platform default
// backend; confirm against `variant`'s definition.
const GEMMA4_E2B_VARIANTS = {
mlx: {
base: {
modelName: 'gemma4-e2b' as const,
modelSource: M.GEMMA4_E2B_MLX_MODEL,
tokenizerSource: M.GEMMA4_E2B_TOKENIZER,
tokenizerConfigSource: M.GEMMA4_E2B_TOKENIZER_CONFIG,
},
},
xnnpack: {
base: {
modelName: 'gemma4-e2b' as const,
modelSource: M.GEMMA4_E2B_XNNPACK_MODEL,
tokenizerSource: M.GEMMA4_E2B_TOKENIZER,
tokenizerConfigSource: M.GEMMA4_E2B_TOKENIZER_CONFIG,
},
},
vulkan: {
base: {
modelName: 'gemma4-e2b' as const,
modelSource: M.GEMMA4_E2B_VULKAN_MODEL,
tokenizerSource: M.GEMMA4_E2B_TOKENIZER,
tokenizerConfigSource: M.GEMMA4_E2B_TOKENIZER_CONFIG,
},
},
};

const EFFICIENTNET_V2_S_VARIANTS = {
xnnpack: {
base: {
Expand Down Expand Up @@ -496,7 +523,10 @@ export const models = {
M.LFM2_5_1_2B_INSTRUCT_QUANTIZED
),
bielik_v3_0_1_5b: pair(M.BIELIK_V3_0_1_5B, M.BIELIK_V3_0_1_5B_QUANTIZED),
gemma4_e2b: base(M.GEMMA4_E2B),
gemma4_e2b: variant(GEMMA4_E2B_VARIANTS, {
ios: 'mlx',
android: 'vulkan',
}),
// Multimodal LLMs — same hook/module as plain LLMs, listed here so users
// pick a model by capability ("LLM") rather than by modality.
lfm2_5_vl_1_6b: base(M.LFM2_5_VL_1_6B_QUANTIZED),
Expand Down
34 changes: 20 additions & 14 deletions packages/react-native-executorch/src/constants/modelUrls.ts
Original file line number Diff line number Diff line change
Expand Up @@ -124,34 +124,40 @@ export const QWEN3_0_6B_QUANTIZED = {
generationConfig: QWEN3_GENERATION_CONFIG,
} as const;

// GEMMA 4
const GEMMA4_E2B_XNNPACK = `${URL_PREFIX}-gemma-4/${PREVIOUS_VERSION_TAG}/e2b/xnnpack/gemma_4_e2b_xnnpack_8da4w.pte`;
const GEMMA4_E2B_VULKAN = `${URL_PREFIX}-gemma-4/${PREVIOUS_VERSION_TAG}/e2b/vulkan/gemma_4_e2b_vulkan_8da4w.pte`;
const GEMMA4_E2B_XNNPACK_MM = `${URL_PREFIX}-gemma-4-multimodal/${PREVIOUS_VERSION_TAG}/e2b/xnnpack/gemma_4_e2b_xnnpack_8da4w.pte`;
// GEMMA 4 — separate HF repo; tokenizer files live at the e2b root and are
// shared by all backend variants.
const GEMMA4_E2B_PREFIX = `${URL_PREFIX}-gemma-4/${PREVIOUS_VERSION_TAG}/e2b`;
export const GEMMA4_E2B_MLX_MODEL = `${GEMMA4_E2B_PREFIX}/mlx/gemma4_e2b_mlx_int4.pte`;
export const GEMMA4_E2B_XNNPACK_MODEL = `${GEMMA4_E2B_PREFIX}/xnnpack/gemma_4_e2b_xnnpack_8da4w.pte`;
export const GEMMA4_E2B_VULKAN_MODEL = `${GEMMA4_E2B_PREFIX}/vulkan/gemma_4_e2b_vulkan_8da4w.pte`;
export const GEMMA4_E2B_TOKENIZER = `${GEMMA4_E2B_PREFIX}/tokenizer.json`;
export const GEMMA4_E2B_TOKENIZER_CONFIG = `${GEMMA4_E2B_PREFIX}/tokenizer_config.json`;

const GEMMA4_E2B_MODEL =
Platform.OS === `android` ? GEMMA4_E2B_VULKAN_MODEL : GEMMA4_E2B_MLX_MODEL;

const GEMMA4_E2B_MLX_MM = `${URL_PREFIX}-gemma-4-multimodal/${PREVIOUS_VERSION_TAG}/e2b/mlx/gemma4_e2b_mlx_int4.pte`;
const GEMMA4_E2B_VULKAN_MM = `${URL_PREFIX}-gemma-4-multimodal/${PREVIOUS_VERSION_TAG}/e2b/vulkan/gemma_4_e2b_vulkan_8da4w.pte`;
const GEMMA4_TOKENIZER = `${URL_PREFIX}-gemma-4/${PREVIOUS_VERSION_TAG}/e2b/tokenizer.json`;
const GEMMA4_TOKENIZER_CONFIG = `${URL_PREFIX}-gemma-4/${PREVIOUS_VERSION_TAG}/e2b/tokenizer_config.json`;

/**
* @category Models - LLM
*/
export const GEMMA4_E2B = {
modelName: 'gemma4-e2b',
modelSource:
Platform.OS === `android` ? GEMMA4_E2B_VULKAN : GEMMA4_E2B_XNNPACK,
tokenizerSource: GEMMA4_TOKENIZER,
tokenizerConfigSource: GEMMA4_TOKENIZER_CONFIG,
modelSource: GEMMA4_E2B_MODEL,
tokenizerSource: GEMMA4_E2B_TOKENIZER,
tokenizerConfigSource: GEMMA4_E2B_TOKENIZER_CONFIG,
} as const;

/**
* @category Models - VLM
* @category Models - LLM Multimodal
*/
Comment thread
msluszniak marked this conversation as resolved.
export const GEMMA4_E2B_MM = {
modelName: 'gemma4-e2b-multimodal',
modelSource:
Platform.OS === `android` ? GEMMA4_E2B_VULKAN_MM : GEMMA4_E2B_XNNPACK_MM,
tokenizerSource: GEMMA4_TOKENIZER,
tokenizerConfigSource: GEMMA4_TOKENIZER_CONFIG,
Platform.OS === `android` ? GEMMA4_E2B_VULKAN_MM : GEMMA4_E2B_MLX_MM,
tokenizerSource: GEMMA4_E2B_TOKENIZER,
tokenizerConfigSource: GEMMA4_E2B_TOKENIZER_CONFIG,
capabilities: ['vision', 'audio'],
audioConfig: {
samplesPerBlock: 7680,
Expand Down
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
* LICENSE file in the root directory of this source tree.
*/

#import "ExecuTorchBackendOption.h"
#import "ExecuTorchBackendOptionsMap.h"
#import "ExecuTorchError.h"
#import "ExecuTorchLog.h"
#import "ExecuTorchModule.h"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
* LICENSE file in the root directory of this source tree.
*/

#import "ExecuTorchBackendOption.h"
#import "ExecuTorchBackendOptionsMap.h"
#import "ExecuTorchValue.h"

NS_ASSUME_NONNULL_BEGIN
Expand Down Expand Up @@ -198,6 +200,37 @@ NS_SWIFT_NAME(Module)
*/
- (BOOL)load:(NSError **)error;

/**
* Loads the module's program with per-delegate backend options.
*
* The receiver retains @c options for as long as the underlying program
* references it (lifetime tracked via ARC).
*
* @param options An `ExecuTorchBackendOptionsMap` containing per-delegate
* load-time configuration, built once via
* `[ExecuTorchBackendOptionsMap mapWithOptions:error:]`.
* @param verification The verification level to apply when loading the program.
* @param error A pointer to an NSError pointer that will be set if an error
* occurs.
* @return YES if the program was successfully loaded; otherwise, NO.
*/
- (BOOL)loadWithOptions:(ExecuTorchBackendOptionsMap *)options
verification:(ExecuTorchVerification)verification
error:(NSError **)error NS_REFINED_FOR_SWIFT;

/**
* Loads the module's program with per-delegate backend options using minimal
* verification.
*
* @param options An `ExecuTorchBackendOptionsMap` containing per-delegate
* load-time configuration.
* @param error A pointer to an NSError pointer that will be set if an error
* occurs.
* @return YES if the program was successfully loaded; otherwise, NO.
*/
- (BOOL)loadWithOptions:(ExecuTorchBackendOptionsMap *)options
error:(NSError **)error NS_REFINED_FOR_SWIFT;

/**
* Checks if the module is loaded.
*
Expand All @@ -215,6 +248,19 @@ NS_SWIFT_NAME(Module)
- (BOOL)loadMethod:(NSString *)methodName
error:(NSError **)error NS_SWIFT_NAME(load(_:));

/**
* Loads a specific method from the program with per-delegate backend options.
*
* @param methodName A string representing the name of the method to load.
* @param options An `ExecuTorchBackendOptionsMap` containing per-delegate
* load-time configuration.
* @param error A pointer to an NSError pointer that is set if an error occurs.
* @return YES if the method was successfully loaded; otherwise, NO.
*/
- (BOOL)loadMethod:(NSString *)methodName
options:(ExecuTorchBackendOptionsMap *)options
error:(NSError **)error NS_REFINED_FOR_SWIFT;

/**
* Checks if a specific method is loaded.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,10 @@ class BufferDataLoader final : public executorch::runtime::DataLoader {
ET_UNUSED const DataLoader::SegmentInfo &segment_info) const override {
size_t total_size;
bool overflow = c10::add_overflows(offset, size, &total_size);
ET_CHECK_OR_RETURN_ERROR(!overflow && total_size <= size_, InvalidArgument,
"offset %zu + size %zu > size_ %zu", offset, size,
size_);
ET_CHECK_OR_RETURN_ERROR(
!overflow && total_size <= size_, InvalidArgument,
"offset %zu + size %zu > size_ %zu, or overflow detected", offset, size,
size_);
return executorch::runtime::FreeableBuffer(data_ + offset, size,
/*free_fn=*/nullptr);
}
Expand Down
Loading
Loading