software-mansion · mkopcins · Jun 11, 2026 · Jun 3, 2026 · Jun 9, 2026 · Jun 9, 2026
diff --git a/.eslintrc.js b/.eslintrc.js
@@ -8,6 +8,7 @@ const VALID_CATEGORIES = [
   'Models - Image Embeddings',
   'Models - Image Generation',
   'Models - LLM',
+  'Models - LLM Multimodal',
   'Models - Object Detection',
   'Models - Instance Segmentation',
   'Models - Pose Estimation',

diff --git a/apps/llm/components/llmModels.ts b/apps/llm/components/llmModels.ts
@@ -10,8 +10,11 @@ const llm = models.llm;
 export type LLMModelSources = LLMProps['model'];
 
 export const LLM_MODELS: ModelOption<LLMModelSources>[] = [
-  // Gemma4
-  { label: 'Gemma4 E2B', value: llm.gemma4_e2b() },
+  // Gemma 4
+  {
+    label: 'Gemma 4 E2B',
+    value: llm.gemma4_e2b(),
+  },
   // Llama 3.2
   {
     label: 'Llama 3.2 1B',

diff --git a/packages/react-native-executorch/android/libs/classes.jar b/packages/react-native-executorch/android/libs/classes.jar
diff --git a/packages/react-native-executorch/common/runner/text_prefiller.cpp b/packages/react-native-executorch/common/runner/text_prefiller.cpp
@@ -18,10 +18,11 @@ namespace llm {
 
 TextPrefiller::TextPrefiller(TextDecoderRunner *text_decoder_runner,
                              bool use_kv_cache, bool enable_parallel_prefill,
-                             int64_t max_seq_len)
+                             int64_t max_seq_len, int32_t prefill_chunk_size)
     : text_decoder_runner_(text_decoder_runner), use_kv_cache_(use_kv_cache),
       enable_parallel_prefill_(enable_parallel_prefill),
-      max_seq_len_(max_seq_len > 0 ? max_seq_len : 128) {}
+      max_seq_len_(max_seq_len > 0 ? max_seq_len : 128),
+      prefill_chunk_size_(prefill_chunk_size) {}
 
 ::executorch::runtime::Result<uint64_t>
 TextPrefiller::prefill(std::vector<uint64_t> &prompt_tokens,
@@ -31,11 +32,10 @@ TextPrefiller::prefill(std::vector<uint64_t> &prompt_tokens,
     ET_CHECK_OK_OR_RETURN_ERROR(text_decoder_runner_->load());
   }
 
-  // Check if we need to chunk the prompt tokens
   int32_t num_prompt_tokens = prompt_tokens.size();
-  const int32_t chunk_size = static_cast<int32_t>(max_seq_len_);
+  int32_t chunk_size =
+      prefill_chunk_size_ > 0 ? prefill_chunk_size_ : max_seq_len_;
 
-  // If prompt tokens exceed chunk_size, we need to chunk them
   if (num_prompt_tokens > chunk_size) {
     uint64_t cur_token = 0;
     int num_tokens_to_process = 0;

diff --git a/packages/react-native-executorch/common/runner/text_prefiller.h b/packages/react-native-executorch/common/runner/text_prefiller.h
@@ -19,8 +19,14 @@ namespace llm {
 
 class TextPrefiller {
 public:
+  // prefill_chunk_size: when > 0, prompts longer than this are prefilled in
+  // chunks of at most this size (see prefill()). Set to the model's forward
+  // sequence-length cap for the MLX backend (its forward is exported with a
+  // sliding-window bound and one-shot prefill spikes Metal memory). Other
+  // backends (XNNPACK/CoreML) pass 0, which falls back to max_seq_len chunks.
   TextPrefiller(TextDecoderRunner *text_decoder_runner, bool use_kv_cache,
-                bool enable_parallel_prefill, int64_t max_seq_len = 128);
+                bool enable_parallel_prefill, int64_t max_seq_len = 128,
+                int32_t prefill_chunk_size = 0);
 
   virtual ~TextPrefiller() = default;
   /**
@@ -70,6 +76,7 @@ class TextPrefiller {
   bool use_kv_cache_;
   bool enable_parallel_prefill_;
   int64_t max_seq_len_;
+  int32_t prefill_chunk_size_;
 };
 
 } // namespace llm

diff --git a/packages/react-native-executorch/common/runner/text_runner.cpp b/packages/react-native-executorch/common/runner/text_runner.cpp
@@ -26,11 +26,24 @@ Error TextRunner::load_subcomponents() {
 
   Stats *stats_ptr = &stats_;
 
-  text_decoder_runner_ = std::make_unique<TextDecoderRunner>(
-      *module_, io_manager_.get(), config_);
+  text_decoder_runner_ =
+      std::make_unique<TextDecoderRunner>(*module_, io_manager_.get(), config_);
+
+  int32_t prefill_chunk_size = 0;
+  auto fwd_meta = module_->method_meta("forward");
+  if (fwd_meta.ok() && fwd_meta->uses_backend("MLXBackend")) {
+    auto input_meta = fwd_meta->input_tensor_meta(0);
+    if (input_meta.ok()) {
+      auto sizes = input_meta->sizes();
+      if (sizes.size() >= 2 && sizes[sizes.size() - 1] > 0) {
+        prefill_chunk_size = sizes[sizes.size() - 1];
+      }
+    }
+  }
+
   text_prefiller_ = std::make_unique<TextPrefiller>(
       text_decoder_runner_.get(), config_.enable_kv_cache,
-      config_.enable_dynamic_shape, config_.max_seq_len);
+      config_.enable_dynamic_shape, config_.max_seq_len, prefill_chunk_size);
   text_token_generator_ = std::make_unique<TextTokenGenerator>(
       tokenizer_.get(), text_decoder_runner_.get(), config_.enable_kv_cache,
       std::move(eos_ids_), stats_ptr, config_);
@@ -83,10 +96,12 @@ Error TextRunner::generate_internal(
   std::vector<uint64_t> prompt_tokens = encodeResult.get();
   int num_prompt_tokens = prompt_tokens.size();
 
-  // For dynamic-shape PTEs (Gemma4 iter*), get_max_seq_len is the per-call
-  // decoder chunk size (e.g. 128) and the true generation budget lives in
-  // get_max_context_len. Static-shape PTEs set both equal, so this collapses
-  // to the old behavior. Mirrors multimodal_prefiller.cpp:96.
+  // For dynamic-shape PTEs (e.g. Gemma4 MLX/Vulkan), get_max_seq_len is the
+  // per-call decoder chunk size (e.g. the sliding window) and the real
+  // generation budget lives in get_max_context_len. Static-shape PTEs set both
+  // equal, so this collapses to the old behavior. Without this the budget is
+  // computed from the small chunk size, so max_new_tokens can resolve to ~0 and
+  // generation ends immediately after prefill.
   const int32_t seq_cap = config_.enable_dynamic_shape
                               ? config_.max_context_length
                               : config_.max_seq_len;

diff --git a/packages/react-native-executorch/react-native-executorch.podspec b/packages/react-native-executorch/react-native-executorch.podspec
@@ -62,6 +62,12 @@ Pod::Spec.new do |s|
 
   s.libraries = "z"
   s.ios.vendored_frameworks = "third-party/ios/ExecutorchLib.xcframework"
+
+  # NOTE: mlx.metallib (the MLX GPU kernels) is bundled INSIDE
+  # ExecutorchLib.framework, colocated with the binary that contains the MLX
+  # code. MLX's runtime loader resolves the metallib relative to that binary
+  # (via dladdr), so it must live next to it in the framework — not at the app
+  # bundle root.
   # Exclude file with tests to not introduce gtest dependency.
   # Do not include the headers from common/rnexecutorch/jsi/ as source files.
   # Xcode/Cocoapods leaks them to other pods that an app also depends on, so if

diff --git a/packages/react-native-executorch/src/constants/modelRegistry.ts b/packages/react-native-executorch/src/constants/modelRegistry.ts
@@ -38,7 +38,7 @@ import { RnExecutorchErrorCode } from '../errors/ErrorCodes';
  * compile-time error.
  * @category Utils
  */
-export type Backend = 'xnnpack' | 'coreml' | 'vulkan' | 'qnn';
+export type Backend = 'xnnpack' | 'coreml' | 'vulkan' | 'qnn' | 'mlx';
 
 /**
  * Options for a `models` accessor call.
@@ -78,7 +78,7 @@ type ConfigOf<V> = Extract<
 >;
 type BackendsOf<V> = Extract<keyof V, Backend>;
 
-const BACKEND_ORDER: Backend[] = ['xnnpack', 'coreml', 'vulkan', 'qnn'];
+const BACKEND_ORDER: Backend[] = ['xnnpack', 'coreml', 'mlx', 'vulkan', 'qnn'];
 
 function firstBackend(variants: AnyVariantMap): Backend {
   for (const b of BACKEND_ORDER) {
@@ -181,6 +181,33 @@ function tts<C extends TextToSpeechModelConfig>(c: C): () => C {
 // Per-backend variant maps for models that ship more than one backend.
 // ─────────────────────────────────────────────────────────────────────────────
 
+const GEMMA4_E2B_VARIANTS = {
+  mlx: {
+    base: {
+      modelName: 'gemma4-e2b' as const,
+      modelSource: M.GEMMA4_E2B_MLX_MODEL,
+      tokenizerSource: M.GEMMA4_E2B_TOKENIZER,
+      tokenizerConfigSource: M.GEMMA4_E2B_TOKENIZER_CONFIG,
+    },
+  },
+  xnnpack: {
+    base: {
+      modelName: 'gemma4-e2b' as const,
+      modelSource: M.GEMMA4_E2B_XNNPACK_MODEL,
+      tokenizerSource: M.GEMMA4_E2B_TOKENIZER,
+      tokenizerConfigSource: M.GEMMA4_E2B_TOKENIZER_CONFIG,
+    },
+  },
+  vulkan: {
+    base: {
+      modelName: 'gemma4-e2b' as const,
+      modelSource: M.GEMMA4_E2B_VULKAN_MODEL,
+      tokenizerSource: M.GEMMA4_E2B_TOKENIZER,
+      tokenizerConfigSource: M.GEMMA4_E2B_TOKENIZER_CONFIG,
+    },
+  },
+};
+
 const EFFICIENTNET_V2_S_VARIANTS = {
   xnnpack: {
     base: {
@@ -496,7 +523,10 @@ export const models = {
       M.LFM2_5_1_2B_INSTRUCT_QUANTIZED
     ),
     bielik_v3_0_1_5b: pair(M.BIELIK_V3_0_1_5B, M.BIELIK_V3_0_1_5B_QUANTIZED),
-    gemma4_e2b: base(M.GEMMA4_E2B),
+    gemma4_e2b: variant(GEMMA4_E2B_VARIANTS, {
+      ios: 'mlx',
+      android: 'vulkan',
+    }),
     // Multimodal LLMs — same hook/module as plain LLMs, listed here so users
     // pick a model by capability ("LLM") rather than by modality.
     lfm2_5_vl_1_6b: base(M.LFM2_5_VL_1_6B_QUANTIZED),

diff --git a/packages/react-native-executorch/src/constants/modelUrls.ts b/packages/react-native-executorch/src/constants/modelUrls.ts
@@ -124,34 +124,40 @@ export const QWEN3_0_6B_QUANTIZED = {
   generationConfig: QWEN3_GENERATION_CONFIG,
 } as const;
 
-// GEMMA 4
-const GEMMA4_E2B_XNNPACK = `${URL_PREFIX}-gemma-4/${PREVIOUS_VERSION_TAG}/e2b/xnnpack/gemma_4_e2b_xnnpack_8da4w.pte`;
-const GEMMA4_E2B_VULKAN = `${URL_PREFIX}-gemma-4/${PREVIOUS_VERSION_TAG}/e2b/vulkan/gemma_4_e2b_vulkan_8da4w.pte`;
-const GEMMA4_E2B_XNNPACK_MM = `${URL_PREFIX}-gemma-4-multimodal/${PREVIOUS_VERSION_TAG}/e2b/xnnpack/gemma_4_e2b_xnnpack_8da4w.pte`;
+// GEMMA 4 — separate HF repo; tokenizer files live at the e2b root and are
+// shared by all backend variants.
+const GEMMA4_E2B_PREFIX = `${URL_PREFIX}-gemma-4/${PREVIOUS_VERSION_TAG}/e2b`;
+export const GEMMA4_E2B_MLX_MODEL = `${GEMMA4_E2B_PREFIX}/mlx/gemma4_e2b_mlx_int4.pte`;
+export const GEMMA4_E2B_XNNPACK_MODEL = `${GEMMA4_E2B_PREFIX}/xnnpack/gemma_4_e2b_xnnpack_8da4w.pte`;
+export const GEMMA4_E2B_VULKAN_MODEL = `${GEMMA4_E2B_PREFIX}/vulkan/gemma_4_e2b_vulkan_8da4w.pte`;
+export const GEMMA4_E2B_TOKENIZER = `${GEMMA4_E2B_PREFIX}/tokenizer.json`;
+export const GEMMA4_E2B_TOKENIZER_CONFIG = `${GEMMA4_E2B_PREFIX}/tokenizer_config.json`;
+
+const GEMMA4_E2B_MODEL =
+  Platform.OS === `android` ? GEMMA4_E2B_VULKAN_MODEL : GEMMA4_E2B_MLX_MODEL;
+
+const GEMMA4_E2B_MLX_MM = `${URL_PREFIX}-gemma-4-multimodal/${PREVIOUS_VERSION_TAG}/e2b/mlx/gemma4_e2b_mlx_int4.pte`;
 const GEMMA4_E2B_VULKAN_MM = `${URL_PREFIX}-gemma-4-multimodal/${PREVIOUS_VERSION_TAG}/e2b/vulkan/gemma_4_e2b_vulkan_8da4w.pte`;
-const GEMMA4_TOKENIZER = `${URL_PREFIX}-gemma-4/${PREVIOUS_VERSION_TAG}/e2b/tokenizer.json`;
-const GEMMA4_TOKENIZER_CONFIG = `${URL_PREFIX}-gemma-4/${PREVIOUS_VERSION_TAG}/e2b/tokenizer_config.json`;
 
 /**
  * @category Models - LLM
  */
 export const GEMMA4_E2B = {
   modelName: 'gemma4-e2b',
-  modelSource:
-    Platform.OS === `android` ? GEMMA4_E2B_VULKAN : GEMMA4_E2B_XNNPACK,
-  tokenizerSource: GEMMA4_TOKENIZER,
-  tokenizerConfigSource: GEMMA4_TOKENIZER_CONFIG,
+  modelSource: GEMMA4_E2B_MODEL,
+  tokenizerSource: GEMMA4_E2B_TOKENIZER,
+  tokenizerConfigSource: GEMMA4_E2B_TOKENIZER_CONFIG,
 } as const;
 
 /**
- * @category Models - VLM
+ * @category Models - LLM Multimodal
  */
 export const GEMMA4_E2B_MM = {
   modelName: 'gemma4-e2b-multimodal',
   modelSource:
-    Platform.OS === `android` ? GEMMA4_E2B_VULKAN_MM : GEMMA4_E2B_XNNPACK_MM,
-  tokenizerSource: GEMMA4_TOKENIZER,
-  tokenizerConfigSource: GEMMA4_TOKENIZER_CONFIG,
+    Platform.OS === `android` ? GEMMA4_E2B_VULKAN_MM : GEMMA4_E2B_MLX_MM,
+  tokenizerSource: GEMMA4_E2B_TOKENIZER,
+  tokenizerConfigSource: GEMMA4_E2B_TOKENIZER_CONFIG,
   capabilities: ['vision', 'audio'],
   audioConfig: {
     samplesPerBlock: 7680,

diff --git a/...es/react-native-executorch/third-party/android/libs/executorch/arm64-v8a/libexecutorch.so b/...es/react-native-executorch/third-party/android/libs/executorch/arm64-v8a/libexecutorch.so
diff --git a/packages/react-native-executorch/third-party/android/libs/executorch/x86_64/libexecutorch.so b/packages/react-native-executorch/third-party/android/libs/executorch/x86_64/libexecutorch.so
diff --git a/packages/react-native-executorch/third-party/include/executorch/ExecuTorch.h b/packages/react-native-executorch/third-party/include/executorch/ExecuTorch.h
@@ -6,6 +6,8 @@
  * LICENSE file in the root directory of this source tree.
  */
 
+#import "ExecuTorchBackendOption.h"
+#import "ExecuTorchBackendOptionsMap.h"
 #import "ExecuTorchError.h"
 #import "ExecuTorchLog.h"
 #import "ExecuTorchModule.h"

diff --git a/packages/react-native-executorch/third-party/include/executorch/ExecuTorchModule.h b/packages/react-native-executorch/third-party/include/executorch/ExecuTorchModule.h
@@ -6,6 +6,8 @@
  * LICENSE file in the root directory of this source tree.
  */
 
+#import "ExecuTorchBackendOption.h"
+#import "ExecuTorchBackendOptionsMap.h"
 #import "ExecuTorchValue.h"
 
 NS_ASSUME_NONNULL_BEGIN
@@ -198,6 +200,37 @@ NS_SWIFT_NAME(Module)
  */
 - (BOOL)load:(NSError **)error;
 
+/**
+ * Loads the module's program with per-delegate backend options.
+ *
+ * The receiver retains @c options for as long as the underlying program
+ * references it (lifetime tracked via ARC).
+ *
+ * @param options An `ExecuTorchBackendOptionsMap` containing per-delegate
+ *        load-time configuration, built once via
+ *        `[ExecuTorchBackendOptionsMap mapWithOptions:error:]`.
+ * @param verification The verification level to apply when loading the program.
+ * @param error A pointer to an NSError pointer that will be set if an error
+ * occurs.
+ * @return YES if the program was successfully loaded; otherwise, NO.
+ */
+- (BOOL)loadWithOptions:(ExecuTorchBackendOptionsMap *)options
+           verification:(ExecuTorchVerification)verification
+                  error:(NSError **)error NS_REFINED_FOR_SWIFT;
+
+/**
+ * Loads the module's program with per-delegate backend options using minimal
+ * verification.
+ *
+ * @param options An `ExecuTorchBackendOptionsMap` containing per-delegate
+ *        load-time configuration.
+ * @param error A pointer to an NSError pointer that will be set if an error
+ * occurs.
+ * @return YES if the program was successfully loaded; otherwise, NO.
+ */
+- (BOOL)loadWithOptions:(ExecuTorchBackendOptionsMap *)options
+                  error:(NSError **)error NS_REFINED_FOR_SWIFT;
+
 /**
  * Checks if the module is loaded.
  *
@@ -215,6 +248,19 @@ NS_SWIFT_NAME(Module)
 - (BOOL)loadMethod:(NSString *)methodName
              error:(NSError **)error NS_SWIFT_NAME(load(_:));
 
+/**
+ * Loads a specific method from the program with per-delegate backend options.
+ *
+ * @param methodName A string representing the name of the method to load.
+ * @param options An `ExecuTorchBackendOptionsMap` containing per-delegate
+ *        load-time configuration.
+ * @param error A pointer to an NSError pointer that is set if an error occurs.
+ * @return YES if the method was successfully loaded; otherwise, NO.
+ */
+- (BOOL)loadMethod:(NSString *)methodName
+           options:(ExecuTorchBackendOptionsMap *)options
+             error:(NSError **)error NS_REFINED_FOR_SWIFT;
+
 /**
  * Checks if a specific method is loaded.
  *

diff --git a/...tive-executorch/third-party/include/executorch/extension/data_loader/buffer_data_loader.h b/...tive-executorch/third-party/include/executorch/extension/data_loader/buffer_data_loader.h
@@ -36,9 +36,10 @@ class BufferDataLoader final : public executorch::runtime::DataLoader {
        ET_UNUSED const DataLoader::SegmentInfo &segment_info) const override {
     size_t total_size;
     bool overflow = c10::add_overflows(offset, size, &total_size);
-    ET_CHECK_OR_RETURN_ERROR(!overflow && total_size <= size_, InvalidArgument,
-                             "offset %zu + size %zu > size_ %zu", offset, size,
-                             size_);
+    ET_CHECK_OR_RETURN_ERROR(
+        !overflow && total_size <= size_, InvalidArgument,
+        "offset %zu + size %zu > size_ %zu, or overflow detected", offset, size,
+        size_);
     return executorch::runtime::FreeableBuffer(data_ + offset, size,
                                                /*free_fn=*/nullptr);
   }