diff --git a/.eslintrc.js b/.eslintrc.js
index b09ff37d23..fd7eca7393 100644
--- a/.eslintrc.js
+++ b/.eslintrc.js
@@ -8,6 +8,7 @@ const VALID_CATEGORIES = [
   'Models - Image Embeddings',
   'Models - Image Generation',
   'Models - LLM',
+  'Models - LLM Multimodal',
   'Models - Object Detection',
   'Models - Instance Segmentation',
   'Models - Pose Estimation',
diff --git a/apps/llm/app/multimodal_llm/index.tsx b/apps/llm/app/multimodal_llm/index.tsx
index 0de5004849..003c4fcb7d 100644
--- a/apps/llm/app/multimodal_llm/index.tsx
+++ b/apps/llm/app/multimodal_llm/index.tsx
@@ -12,6 +12,11 @@ import {
   View,
 } from 'react-native';
 import { launchImageLibrary } from 'react-native-image-picker';
+import {
+  AudioManager,
+  AudioRecorder,
+  AudioContext,
+} from 'react-native-audio-api';
 import { useIsFocused } from '@react-navigation/native';
 import { useSafeAreaInsets } from 'react-native-safe-area-context';
 import { models, useLLM } from 'react-native-executorch';
@@ -23,12 +28,14 @@ import Spinner from '../../components/Spinner';
 import { GeneratingContext } from '../../context';
 import SuggestedPrompts from '../../components/SuggestedPrompts';
 import ErrorBanner from '../../components/ErrorBanner';
+import AudioWaveform from '../../components/AudioWaveform';
 
 const SUGGESTED_PROMPTS = [
   "What's in this image?",
   'Describe this scene in detail',
   'What objects can you see?',
   'What text appears in this image?',
+  'Transcribe the audio',
 ];
 import { useLLMStats } from '../../hooks/useLLMStats';
 import { StatsBar } from '../../components/StatsBar';
@@ -46,12 +53,18 @@ function MultimodalLLMScreen() {
   const textInputRef = useRef<TextInput>(null);
   const { setGlobalGenerating } = useContext(GeneratingContext);
 
-  // Added error state
-  const [error, setError] = useState<string | null>(null);
+  const [audioBuffer, setAudioBuffer] = useState<Float32Array | null>(null);
+  const [audioLabel, setAudioLabel] = useState<string | null>(null);
+  const [audioUrl, setAudioUrl] = useState('');
+  const [isFetchingAudio, setIsFetchingAudio] = useState(false);
+  const [isRecording, setIsRecording] = useState(false);
+  const [hasMicPermission, setHasMicPermission] = useState(false);
+  const recorder = useRef(new AudioRecorder());
+  const recordChunks = useRef<Float32Array[]>([]);
 
-  const vlm = useLLM({
-    model: models.llm.lfm2_5_vl_1_6b(),
-  });
+  const [error, setError] = useState<string | null>(null);
+  const model = models.llm.gemma4_e2b_multimodal();
+  const vlm = useLLM({ model: model });
   const tokenCount = vlm.isReady ? vlm.getGeneratedTokenCount() : 0;
   const { stats, onMessageSend } = useLLMStats(
     vlm.response,
@@ -68,6 +81,95 @@ function MultimodalLLMScreen() {
     if (vlm.error) setError(String(vlm.error));
   }, [vlm.error]);
 
+  // Mount-only audio setup. NOTE(review): cleanup reads the mount-time `vlm`.
+  useEffect(() => {
+    AudioManager.setAudioSessionOptions({
+      iosCategory: 'playAndRecord',
+      iosMode: 'spokenAudio',
+      iosOptions: ['allowBluetoothHFP', 'defaultToSpeaker'],
+    });
+    (async () => {
+      const status = await AudioManager.requestRecordingPermissions();
+      setHasMicPermission(status === 'Granted');
+    })();
+    return () => {
+      if (vlm.isGenerating) vlm.interrupt();
+      // eslint-disable-next-line react-hooks/exhaustive-deps
+      recorder.current.stop();
+      AudioManager.setAudioSessionActivity(false);
+    };
+    // eslint-disable-next-line react-hooks/exhaustive-deps
+  }, []);
+
+  // Decode `audioUrl` in a 16 kHz context and keep channel 0 as pending audio.
+  const loadAudioFromUrl = async () => {
+    const source = audioUrl.trim();
+    if (!source) return;
+    setIsFetchingAudio(true);
+    try {
+      const context = new AudioContext({ sampleRate: 16000 });
+      const samples = (await context.decodeAudioData(source)).getChannelData(0);
+      const fileName = source.split('/').pop() || 'audio';
+      setAudioBuffer(samples);
+      setAudioLabel(`${fileName} · ${(samples.length / 16000).toFixed(1)}s`);
+    } catch (e) {
+      setError(e instanceof Error ? e.message : String(e));
+    } finally {
+      setIsFetchingAudio(false);
+    }
+  };
+
+  // Begin capturing 16 kHz mono microphone audio into `recordChunks`.
+  const startRecording = async () => {
+    if (!hasMicPermission) {
+      setError('Microphone permission denied. Please enable it in Settings.');
+      return;
+    }
+    recordChunks.current = [];
+    const sampleRate = 16000;
+    recorder.current.onAudioReady(
+      { sampleRate, bufferLength: 0.1 * sampleRate, channelCount: 1 },
+      ({ buffer }) => {
+        recordChunks.current.push(new Float32Array(buffer.getChannelData(0)));
+      }
+    );
+    try {
+      if (!(await AudioManager.setAudioSessionActivity(true))) {
+        setError('Cannot start audio session');
+        return;
+      }
+      const outcome = recorder.current.start();
+      if (outcome.status === 'error') {
+        setError(`Recording problems: ${outcome.message}`);
+        return;
+      }
+      setIsRecording(true);
+    } catch (e) {
+      setError(e instanceof Error ? e.message : String(e));
+    }
+  };
+
+  // Stop the recorder and join the captured chunks into one pending clip.
+  const stopRecording = () => {
+    recorder.current.stop();
+    setIsRecording(false);
+    const size = recordChunks.current.reduce((sum, c) => sum + c.length, 0);
+    if (size === 0) return;
+    const merged = new Float32Array(size);
+    recordChunks.current.reduce((offset, chunk) => {
+      merged.set(chunk, offset);
+      return offset + chunk.length;
+    }, 0);
+    recordChunks.current = [];
+    setAudioBuffer(merged);
+    setAudioLabel(`Recording · ${(merged.length / 16000).toFixed(1)}s`);
+  };
+
+  const clearAudio = () => {
+    setAudioBuffer(null); // drop the pending samples
+    setAudioLabel(null); // and the label shown in the attachment strip
+  };
+
   const pickImage = async () => {
     try {
       const result = await launchImageLibrary({ mediaType: 'photo' });
@@ -81,19 +183,27 @@ function MultimodalLLMScreen() {
   };
 
   const sendMessage = async () => {
-    if (!userInput.trim() || vlm.isGenerating) return;
+    if (!(imageUri || audioBuffer || userInput.trim()) || vlm.isGenerating)
+      return;
     onMessageSend();
     const text = userInput.trim();
     setUserInput('');
     textInputRef.current?.clear();
     Keyboard.dismiss();
     const currentImageUri = imageUri;
+    const currentAudio = audioBuffer;
     setImageUri(null);
+    setAudioBuffer(null);
+    setAudioLabel(null);
     try {
-      await vlm.sendMessage(
-        text,
-        currentImageUri ? { imagePath: currentImageUri } : undefined
-      );
+      const media =
+        currentImageUri || currentAudio
+          ? {
+              ...(currentImageUri ? { imagePath: currentImageUri } : {}),
+              ...(currentAudio ? { audioBuffer: currentAudio } : {}),
+            }
+          : undefined;
+      await vlm.sendMessage(text, media);
     } catch (e) {
       // Updated to set UI error instead of just console.error
       setError(e instanceof Error ? e.message : String(e));
@@ -135,7 +245,9 @@ function MultimodalLLMScreen() {
             <View style={styles.helloMessageContainer}>
               <Text style={styles.helloText}>Hello! 👋</Text>
               <Text style={styles.bottomHelloText}>
-                Pick an image and ask me anything about it.
+                {model.capabilities.find((c) => c === 'audio')
+                  ? 'Say hi, or pick an image, and ask me anything about it.'
+                  : 'Pick an image and ask me anything about it.'}
               </Text>
               <SuggestedPrompts
                 prompts={SUGGESTED_PROMPTS}
@@ -159,6 +271,48 @@ function MultimodalLLMScreen() {
             </TouchableOpacity>
           )}
 
+          {/* Audio URL input */}
+          <View style={styles.audioUrlRow}>
+            <TextInput
+              placeholder="Audio URL (mp3/wav/…)"
+              placeholderTextColor="#C1C6E5"
+              style={styles.audioUrlInput}
+              value={audioUrl}
+              onChangeText={setAudioUrl}
+              autoCapitalize="none"
+              autoCorrect={false}
+            />
+            <TouchableOpacity
+              style={[
+                styles.audioUrlButton,
+                (!audioUrl.trim() || isFetchingAudio || vlm.isGenerating) &&
+                  styles.disabled,
+              ]}
+              onPress={loadAudioFromUrl}
+              disabled={!audioUrl.trim() || isFetchingAudio || vlm.isGenerating}
+            >
+              <Text style={styles.audioUrlButtonText}>
+                {isFetchingAudio ? '…' : 'Load'}
+              </Text>
+            </TouchableOpacity>
+          </View>
+
+          {/* Audio attachment strip */}
+          {audioLabel && (
+            <View style={styles.audioAttachmentContainer}>
+              <View style={styles.audioAttachmentRow}>
+                <Text style={styles.audioAttachmentText}>🎵 {audioLabel}</Text>
+                <TouchableOpacity onPress={clearAudio}>
+                  <Text style={styles.audioAttachmentClear}>✕</Text>
+                </TouchableOpacity>
+              </View>
+              <AudioWaveform
+                buffer={audioBuffer}
+                style={styles.audioWaveform}
+              />
+            </View>
+          )}
+
           <StatsBar stats={stats} />
           <View
             style={[
@@ -178,6 +332,17 @@ function MultimodalLLMScreen() {
               <Text style={styles.imageButtonText}>📷</Text>
             </TouchableOpacity>
 
+            {/* Mic record / stop button */}
+            <TouchableOpacity
+              style={styles.imageButton}
+              onPress={isRecording ? stopRecording : startRecording}
+              disabled={vlm.isGenerating}
+            >
+              <Text style={styles.imageButtonText}>
+                {isRecording ? '⏹️' : '🎤'}
+              </Text>
+            </TouchableOpacity>
+
             <TextInput
               autoCorrect={false}
               ref={textInputRef}
@@ -198,14 +363,15 @@ function MultimodalLLMScreen() {
               onChangeText={setUserInput}
             />
 
-            {userInput.trim() && !vlm.isGenerating && (
-              <TouchableOpacity
-                style={styles.sendChatTouchable}
-                onPress={sendMessage}
-              >
-                <SendIcon height={24} width={24} padding={4} margin={8} />
-              </TouchableOpacity>
-            )}
+            {(imageUri || audioBuffer || userInput.trim()) &&
+              !vlm.isGenerating && (
+                <TouchableOpacity
+                  style={styles.sendChatTouchable}
+                  onPress={sendMessage}
+                >
+                  <SendIcon height={24} width={24} padding={4} margin={8} />
+                </TouchableOpacity>
+              )}
             {vlm.isGenerating && (
               <TouchableOpacity
                 style={styles.sendChatTouchable}
@@ -319,6 +485,71 @@ const styles = StyleSheet.create({
     fontFamily: 'regular',
     color: ColorPalette.blueDark,
   },
+  audioAttachmentContainer: {
+    flexDirection: 'column',
+    paddingHorizontal: 16,
+    paddingVertical: 8,
+    marginHorizontal: 16,
+    marginBottom: 4,
+    borderRadius: 8,
+    borderWidth: 1,
+    borderColor: ColorPalette.blueLight,
+    backgroundColor: '#fafbff',
+  },
+  audioAttachmentRow: {
+    flexDirection: 'row',
+    alignItems: 'center',
+    justifyContent: 'space-between',
+  },
+  audioAttachmentText: {
+    fontSize: 13,
+    fontFamily: 'regular',
+    color: ColorPalette.blueDark,
+  },
+  audioAttachmentClear: {
+    fontSize: 16,
+    color: ColorPalette.blueDark,
+    paddingHorizontal: 8,
+  },
+  audioWaveform: {
+    marginTop: 6,
+    minWidth: 0,
+  },
+  audioUrlRow: {
+    flexDirection: 'row',
+    alignItems: 'center',
+    marginHorizontal: 16,
+    marginBottom: 4,
+  },
+  audioUrlInput: {
+    flex: 1,
+    padding: 10,
+    borderTopLeftRadius: 8,
+    borderBottomLeftRadius: 8,
+    borderWidth: 1,
+    borderColor: ColorPalette.blueLight,
+    borderRightWidth: 0,
+    fontFamily: 'regular',
+    fontSize: 13,
+    color: ColorPalette.primary,
+  },
+  audioUrlButton: {
+    paddingVertical: 10,
+    paddingHorizontal: 16,
+    backgroundColor: ColorPalette.strongPrimary,
+    borderTopRightRadius: 8,
+    borderBottomRightRadius: 8,
+    justifyContent: 'center',
+    alignItems: 'center',
+  },
+  audioUrlButtonText: {
+    color: '#fff',
+    fontFamily: 'medium',
+    fontSize: 13,
+  },
+  disabled: {
+    opacity: 0.5,
+  },
   bottomContainer: {
     height: 100,
     width: '100%',
diff --git a/apps/llm/components/AudioWaveform.tsx b/apps/llm/components/AudioWaveform.tsx
new file mode 100644
index 0000000000..cac4035614
--- /dev/null
+++ b/apps/llm/components/AudioWaveform.tsx
@@ -0,0 +1,71 @@
+import { useMemo } from 'react';
+import { StyleProp, StyleSheet, View, ViewStyle } from 'react-native';
+import ColorPalette from '../colors';
+
+interface AudioWaveformProps {
+  /** Samples to visualise; nothing is rendered for null, undefined or empty. */
+  buffer: Float32Array | null | undefined;
+  /** Optional styles applied to the bar container after the defaults. */
+  style?: StyleProp<ViewStyle>;
+}
+
+/** Number of bars drawn, i.e. how many windows the buffer is split into. */
+const NUM_BARS = 32;
+
+/**
+ * Compact bar-chart preview of an audio buffer.
+ *
+ * The buffer is divided into `NUM_BARS` consecutive windows. Each bar's height
+ * reflects the peak absolute sample of its window, scaled so that the loudest
+ * window fills the full height.
+ */
+export default function AudioWaveform({ buffer, style }: AudioWaveformProps) {
+  const bars = useMemo(() => {
+    if (!buffer || buffer.length === 0) return null;
+    const total = buffer.length;
+    const peaks: number[] = [];
+    let max = 0;
+    for (let i = 0; i < NUM_BARS; i++) {
+      // Proportional bounds make the windows tile the whole buffer, so trailing
+      // samples are not dropped when the length is not a multiple of NUM_BARS.
+      const start = Math.floor((i * total) / NUM_BARS);
+      const end = Math.floor(((i + 1) * total) / NUM_BARS);
+      let peak = 0;
+      for (let j = start; j < end; j++) {
+        const v = Math.abs(buffer[j] ?? 0);
+        if (v > peak) peak = v;
+      }
+      peaks.push(peak);
+      if (peak > max) max = peak;
+    }
+    // A fully silent buffer keeps its zero peaks instead of dividing by zero.
+    return max > 0 ? peaks.map((p) => p / max) : peaks;
+  }, [buffer]);
+
+  if (!bars) return null;
+
+  // Bar heights range from 2 (silent window) to 16 (the loudest window).
+  return (
+    <View style={[styles.container, style]}>
+      {bars.map((amp, i) => (
+        <View key={i} style={[styles.bar, { height: 2 + amp * 14 }]} />
+      ))}
+    </View>
+  );
+}
+
+const styles = StyleSheet.create({
+  container: {
+    flexDirection: 'row',
+    alignItems: 'center',
+    height: 16,
+    minWidth: 160,
+    gap: 2,
+  },
+  bar: {
+    flex: 1,
+    borderRadius: 1,
+    backgroundColor: ColorPalette.blueDark,
+    opacity: 0.35,
+  },
+});
diff --git a/apps/llm/components/MessageItem.tsx b/apps/llm/components/MessageItem.tsx
index 2c44714ac0..cda8609885 100644
--- a/apps/llm/components/MessageItem.tsx
+++ b/apps/llm/components/MessageItem.tsx
@@ -11,6 +11,7 @@ import MarkdownComponent from './MarkdownComponent';
 import LlamaIcon from '../assets/icons/llama_icon.svg';
 import ColorPalette from '../colors';
 import { Message } from 'react-native-executorch';
+import AudioWaveform from './AudioWaveform';
 
 interface MessageItemProps {
   message: Message;
@@ -43,6 +44,12 @@ const MessageItem = memo(({ message, deleteMessage }: MessageItemProps) => {
             resizeMode="contain"
           />
         )}
+        {message.audioWaveform && (
+          <AudioWaveform
+            buffer={message.audioWaveform}
+            style={styles.userMessageWaveform}
+          />
+        )}
         <MarkdownComponent text={message.content} />
       </View>
     </View>
@@ -103,6 +110,9 @@ const styles = StyleSheet.create({
     borderRadius: 6,
     marginBottom: 6,
   },
+  userMessageWaveform: {
+    marginBottom: 6,
+  },
   aiMessageIconContainer: {
     backgroundColor: ColorPalette.seaBlueLight,
     height: 32,
diff --git a/apps/llm/components/llmModels.ts b/apps/llm/components/llmModels.ts
index 1d80d7a395..58b8c01d74 100644
--- a/apps/llm/components/llmModels.ts
+++ b/apps/llm/components/llmModels.ts
@@ -10,6 +10,11 @@ const llm = models.llm;
 export type LLMModelSources = LLMProps['model'];
 
 export const LLM_MODELS: ModelOption<LLMModelSources>[] = [
+  // Gemma 4
+  {
+    label: 'Gemma 4 E2B',
+    value: llm.gemma4_e2b(),
+  },
   // Llama 3.2
   {
     label: 'Llama 3.2 1B',
diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx
index 5f88f3764a..909c2da57f 100644
--- a/apps/speech/screens/SpeechToTextScreen.tsx
+++ b/apps/speech/screens/SpeechToTextScreen.tsx
@@ -22,12 +22,24 @@ import { ModelPicker, ModelOption } from '../components/ModelPicker';
 const speechToText = models.speech_to_text;
 const vad = models.vad;
 
+const isSimulator = DeviceInfo.isEmulatorSync();
+const backend = Platform.OS === 'ios' && !isSimulator ? 'coreml' : 'xnnpack';
+
 type STTModelSources = SpeechToTextProps['model'];
 
 const MODELS: ModelOption<STTModelSources>[] = [
-  { label: 'Whisper Tiny EN', value: speechToText.whisper_tiny_en() },
-  { label: 'Whisper Base EN', value: speechToText.whisper_base_en() },
-  { label: 'Whisper Small EN', value: speechToText.whisper_small_en() },
+  {
+    label: 'Whisper Tiny EN',
+    value: speechToText.whisper_tiny_en({ backend }),
+  },
+  {
+    label: 'Whisper Base EN',
+    value: speechToText.whisper_base_en({ backend }),
+  },
+  {
+    label: 'Whisper Small EN',
+    value: speechToText.whisper_small_en({ backend }),
+  },
 ];
 import FontAwesome from '@expo/vector-icons/FontAwesome';
 import {
@@ -42,12 +54,10 @@ import DeviceInfo from 'react-native-device-info';
 import { VerboseTranscription } from '../components/VerboseTranscription';
 import ErrorBanner from '../components/ErrorBanner';
 
-const isSimulator = DeviceInfo.isEmulatorSync();
-
 export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => {
   const [selectedModel, setSelectedModel] = useState<STTModelSources>(
-    Platform.OS === 'ios'
-      ? speechToText.whisper_base_en()
+    Platform.OS === 'ios' && !isSimulator
+      ? speechToText.whisper_base_en({ backend })
       : speechToText.whisper_tiny_en()
   );
 
diff --git a/docs/docs/03-hooks/01-natural-language-processing/useLLM.md b/docs/docs/03-hooks/01-natural-language-processing/useLLM.md
index 7b1cb25158..29b1be4d72 100644
--- a/docs/docs/03-hooks/01-natural-language-processing/useLLM.md
+++ b/docs/docs/03-hooks/01-natural-language-processing/useLLM.md
@@ -56,7 +56,7 @@ The code snippet above fetches the model from the specified URL, loads it into m
 
 `useLLM` takes [`LLMProps`](../../06-api-reference/interfaces/LLMProps.md) that consists of:
 
-- [model source](../../06-api-reference/interfaces/LLMProps.md#modelsource), [tokenizer source](../../06-api-reference/interfaces/LLMProps.md#tokenizersource), and [tokenizer config source](../../06-api-reference/interfaces/LLMProps.md#tokenizerconfigsource).
+- A [`model`](../../06-api-reference/interfaces/LLMModel.md) object that bundles the model source, tokenizer source, and tokenizer config source.
 - An optional flag [`preventLoad`](../../06-api-reference/interfaces/SpeechToTextProps.md#preventload) which prevents auto-loading of the model.
 
 You need more details? Check the following resources:
@@ -494,13 +494,13 @@ Depending on selected model and the user's device generation speed can be above
 
 ## Vision-Language Models (VLM)
 
-Some models support multimodal input — text and images together. To use them, pass a `capabilities` array when loading the model.
+Some models support multimodal input — text, images and/or audio together. To use them, pass a `capabilities` array when loading the model.
 
 ### Loading a VLM
 
 ```tsx
 import { models, useLLM } from 'react-native-executorch';
-const llm = useLLM({ model: models.llm.lfm2_5_vl_1_6b() });
+const llm = useLLM({ model: models.llm.gemma4_e2b_multimodal() });
 ```
 
 The `capabilities` field is already set on the model constant. You can also construct the model object explicitly:
@@ -511,22 +511,26 @@ const llm = useLLM({
     modelSource: '...',
     tokenizerSource: '...',
     tokenizerConfigSource: '...',
-    capabilities: ['vision'],
+    capabilities: ['vision', 'audio'],
   },
 });
 ```
 
 Passing `capabilities` unlocks the typed `media` argument on `sendMessage`.
 
-### Sending a message with an image
+### Sending a message with an image or audio recording
 
 ```tsx
-const llm = useLLM({ model: models.llm.lfm2_5_vl_1_6b() });
+const llm = useLLM({ model: models.llm.gemma4_e2b_multimodal() });
 
 const send = () => {
   llm.sendMessage('What is in this image?', {
     imagePath: '/path/to/image.jpg',
   });
+  // or
+  llm.sendMessage('What can you hear?', {
+    audioBuffer: audioRecording,
+  });
 };
 
 return (
@@ -538,6 +542,7 @@ return (
 ```
 
 The `imagePath` should be a local file path on the device.
+The `audioBuffer` should be a `Float32Array` holding the audio waveform sampled at 16 kHz.
 
 ### Functional generation with images
 
diff --git a/docs/docs/04-typescript-api/01-natural-language-processing/LLMModule.md b/docs/docs/04-typescript-api/01-natural-language-processing/LLMModule.md
index 967625160c..48b3395f87 100644
--- a/docs/docs/04-typescript-api/01-natural-language-processing/LLMModule.md
+++ b/docs/docs/04-typescript-api/01-natural-language-processing/LLMModule.md
@@ -118,12 +118,12 @@ Model presets expose an optional `generationConfig` that `LLMModule.fromModelNam
 
 ## Vision-Language Models (VLM)
 
-Some models support multimodal input — text and images together. To use them, pass `capabilities` in the model object when calling [`fromModelName`](../../06-api-reference/classes/LLMModule.md#frommodelname):
+Some models support multimodal input — text, images and/or audio together. To use them, pass `capabilities` in the model object when calling [`fromModelName`](../../06-api-reference/classes/LLMModule.md#frommodelname):
 
 ```typescript
 import { models, LLMModule } from 'react-native-executorch';
 const llm = await LLMModule.fromModelName(
-  models.llm.lfm2_5_vl_1_6b(),
+  models.llm.gemma4_e2b_multimodal(),
   undefined,
   (token) => console.log(token)
 );
@@ -133,20 +133,24 @@ The `capabilities` field is already set on the model constant. You can also cons
 
 ```typescript
 const llm = await LLMModule.fromModelName({
-  modelName: 'lfm2.5-vl-1.6b-quantized',
+  modelName: 'gemma4-e2b-multimodal',
   modelSource: require('./path/to/model.pte'),
   tokenizerSource: require('./path/to/tokenizer.json'),
   tokenizerConfigSource: require('./path/to/tokenizer_config.json'),
-  capabilities: ['vision'],
+  capabilities: ['vision', 'audio'],
 });
 ```
 
-Once loaded, pass `imagePath` to [`sendMessage`](../../06-api-reference/classes/LLMModule.md#sendmessage):
+Once loaded, pass `imagePath` or `audioBuffer` to [`sendMessage`](../../06-api-reference/classes/LLMModule.md#sendmessage):
 
 ```typescript
 const response = await llm.sendMessage('What is in this image?', {
   imagePath: '/path/to/image.jpg',
 });
+// or
+const response = await llm.sendMessage('What can you hear?', {
+  audioBuffer: audioRecording, // Float32Array waveform sampled at 16 kHz
+});
 ```
 
 Or use [`generate`](../../06-api-reference/classes/LLMModule.md#generate) with `mediaPath` on the message:
@@ -159,7 +163,14 @@ const chat: Message[] = [
     mediaPath: '/path/to/image.jpg',
   },
 ];
-
+// or
+const chat: Message[] = [
+  {
+    role: 'user',
+    content: 'Transcribe the recording.',
+    audioWaveform: audioRecording,
+  },
+];
 const response = await llm.generate(chat);
 ```
 
diff --git a/docs/versioned_docs/version-0.9.x/03-hooks/01-natural-language-processing/useLLM.md b/docs/versioned_docs/version-0.9.x/03-hooks/01-natural-language-processing/useLLM.md
new file mode 100644
index 0000000000..f19920a486
--- /dev/null
+++ b/docs/versioned_docs/version-0.9.x/03-hooks/01-natural-language-processing/useLLM.md
@@ -0,0 +1,580 @@
+---
+title: useLLM
+keywords:
+  [
+    react native,
+    react native ai,
+    react native llm,
+    react native qwen,
+    react native llama,
+    react native executorch,
+    executorch,
+    pytorch,
+    on-device ai,
+    mobile ai,
+    llama 3,
+    qwen,
+    text generation,
+    tool calling,
+    function calling,
+  ]
+description: "Learn how to use LLMs in your React Native applications with React Native ExecuTorch's useLLM hook."
+---
+
+React Native ExecuTorch supports a variety of LLMs (check out our [HuggingFace repository](https://huggingface.co/software-mansion) for models already converted to ExecuTorch format), including Llama 3.2. Before getting started, you’ll need to obtain the `.pte` binary (a serialized model) along with the tokenizer and tokenizer config JSON files. There are various ways to accomplish this:
+
+:::info
+It is recommended to use models provided by us, which are available at our [HuggingFace repository](https://huggingface.co/collections/software-mansion/llm). You can also use [constants](../../06-api-reference/index.md#models---llm) shipped with our library.
+
+Alternatively, follow the official [tutorial](https://docs.pytorch.org/executorch/stable/llm/export-llm.html) made by ExecuTorch team to export an arbitrary LLM model.
+:::
+
+:::warning
+Lower-end devices might not be able to fit LLMs into memory. We recommend using quantized models to reduce the memory footprint.
+:::
+
+## API Reference
+
+- For detailed API Reference for `useLLM` see: [`useLLM` API Reference](../../06-api-reference/functions/useLLM.md).
+- For all LLM models available out-of-the-box in React Native ExecuTorch see: [LLM Models](../../06-api-reference/index.md#models---llm).
+- For useful LLM utility functionalities please refer to the following link: [LLM Utility Functionalities](../../06-api-reference/index.md#utilities---llm).
+
+## Initializing
+
+In order to load a model into the app, you need to run the following code:
+
+```typescript
+import { models, useLLM } from 'react-native-executorch';
+const llm = useLLM({ model: models.llm.lfm2_5_1_2b_instruct() });
+```
+
+<br/>
+
+The code snippet above fetches the model from the specified URL, loads it into memory, and returns an object with various functions and properties for controlling the model. You can monitor the loading progress by checking the [`llm.downloadProgress`](../../06-api-reference/interfaces/LLMType.md#downloadprogress) and [`llm.isReady`](../../06-api-reference/interfaces/LLMType.md#isready) properties, and if anything goes wrong, the [`llm.error`](../../06-api-reference/interfaces/LLMType.md#error) property will contain the error message.
+
+### Arguments
+
+`useLLM` takes [`LLMProps`](../../06-api-reference/interfaces/LLMProps.md) that consists of:
+
+- [model source](../../06-api-reference/interfaces/LLMProps.md#modelsource), [tokenizer source](../../06-api-reference/interfaces/LLMProps.md#tokenizersource), and [tokenizer config source](../../06-api-reference/interfaces/LLMProps.md#tokenizerconfigsource).
+- An optional flag [`preventLoad`](../../06-api-reference/interfaces/LLMProps.md#preventload) which prevents auto-loading of the model.
+
+You need more details? Check the following resources:
+
+- For detailed information about `useLLM` arguments check this section: [`useLLM` arguments](../../06-api-reference/functions/useLLM.md#parameters).
+- For more information on loading resources, take a look at [loading models](../../01-fundamentals/02-loading-models.md) page.
+- For available LLM models please check out the following list: [LLM Models](../../06-api-reference/index.md#models---llm).
+
+### Returns
+
+`useLLM` returns [`LLMType`](../../06-api-reference/interfaces/LLMType.md) which provides:
+
+- State properties: [`response`](../../06-api-reference/interfaces/LLMType.md#response), [`token`](../../06-api-reference/interfaces/LLMType.md#token), [`isReady`](../../06-api-reference/interfaces/LLMType.md#isready), [`isGenerating`](../../06-api-reference/interfaces/LLMType.md#isgenerating), [`downloadProgress`](../../06-api-reference/interfaces/LLMType.md#downloadprogress), [`error`](../../06-api-reference/interfaces/LLMType.md#error), [`messageHistory`](../../06-api-reference/interfaces/LLMType.md#messagehistory)
+- Generation methods: [`generate`](../../06-api-reference/interfaces/LLMType.md#generate), [`sendMessage`](../../06-api-reference/interfaces/LLMType.md#sendmessage), [`interrupt`](../../06-api-reference/interfaces/LLMType.md#interrupt)
+- Configuration: [`configure`](../../06-api-reference/interfaces/LLMType.md#configure), [`deleteMessage`](../../06-api-reference/interfaces/LLMType.md#deletemessage)
+- Token counting: [`getGeneratedTokenCount`](../../06-api-reference/interfaces/LLMType.md#getgeneratedtokencount), [`getPromptTokenCount`](../../06-api-reference/interfaces/LLMType.md#getprompttokencount), [`getTotalTokenCount`](../../06-api-reference/interfaces/LLMType.md#gettotaltokencount)
+
+For complete details, see the [LLMType API Reference](../../06-api-reference/interfaces/LLMType.md).
+
+## Functional vs managed
+
+You can use the functions returned from this hook in two ways:
+
+1. Functional/pure - we will not keep any state for you. You'll need to keep conversation history and handle function calling yourself. Use [`generate`](../../06-api-reference/interfaces/LLMType.md#generate) and [`response`](../../06-api-reference/interfaces/LLMType.md#response). Note that you don't need to run [`configure`](../../06-api-reference/interfaces/LLMType.md#configure) to use those. Furthermore, [`chatConfig`](../../06-api-reference/interfaces/LLMConfig.md#chatconfig) and [`toolsConfig`](../../06-api-reference/interfaces/LLMConfig.md#toolsconfig) will not have any effect on those functions.
+
+2. Managed/stateful - we will manage conversation state. Tool calls will be parsed and called automatically after passing appropriate callbacks. See more at [managed LLM chat](#managed-llm-chat).
+
+## Functional way
+
+### Simple generation
+
+To perform chat completion you can use the [`generate`](../../06-api-reference/interfaces/LLMType.md#generate) function. The [`response`](../../06-api-reference/interfaces/LLMType.md#response) value is updated with each token as it's generated, and the function returns a promise that resolves to the complete response when generation finishes.
+
+```tsx
+const llm = useLLM({ model: models.llm.lfm2_5_1_2b_instruct() });
+
+const handleGenerate = async () => {
+  const chat: Message[] = [
+    { role: 'system', content: 'You are a helpful assistant' },
+    { role: 'user', content: 'Hi!' },
+    { role: 'assistant', content: 'Hi!, how can I help you?' },
+    { role: 'user', content: 'What is the meaning of life?' },
+  ];
+
+  // Chat completion - returns the generated response
+  const response = await llm.generate(chat);
+  console.log('Complete response:', response);
+};
+
+return (
+  <View>
+    <Button onPress={handleGenerate} title="Generate!" />
+    <Text>{llm.response}</Text>
+  </View>
+);
+```
+
+### Interrupting the model
+
+Sometimes, you might want to stop the model while it’s generating. To do this, you can use [`interrupt`](../../06-api-reference/interfaces/LLMType.md#interrupt), which will halt the model and update the response one last time.
+
+There are also cases when you need to check if tokens are being generated, such as to conditionally render a stop button. We’ve made this easy with the [`isGenerating`](../../06-api-reference/interfaces/LLMType.md#isgenerating) property.
+
+:::warning
+If you try to unmount the component using this hook while generation is still in progress, it will result in a crash.
+You'll need to interrupt the model first and wait until [`isGenerating`](../../06-api-reference/interfaces/LLMType.md#isgenerating) is set to false.
+:::
+
+### Reasoning
+
+Some models ship with a built-in "reasoning" or "thinking" mode, but this is model-specific, not a feature of our library. If the model you're using supports disabling reasoning, follow the instructions provided by the model authors. For example, Qwen 3 lets you disable reasoning by adding the `/no_think` suffix to your prompts - [source](https://qwenlm.github.io/blog/qwen3/#advanced-usages).
+
+### Tool calling
+
+Sometimes the text processing capabilities of LLMs are not enough. That's when you may want to introduce tool calling (also called function calling). It allows the model to use external tools to perform its tasks. A tool may be any arbitrary function that you want your model to run. It may retrieve some data from a third-party API, perform an action inside the app such as pressing buttons or filling in forms, or use system APIs to interact with your phone (turning on the flashlight, adding events to your calendar, changing the volume, etc.).
+
+```tsx
+const TOOL_DEFINITIONS: LLMTool[] = [
+  {
+    name: 'get_weather',
+    description: 'Get/check weather in given location.',
+    parameters: {
+      type: 'dict',
+      properties: {
+        location: {
+          type: 'string',
+          description: 'Location where user wants to check weather',
+        },
+      },
+      required: ['location'],
+    },
+  },
+];
+
+const llm = useLLM({ model: models.llm.hammer2_1_1_5b() });
+
+const handleGenerate = () => {
+  const chat: Message[] = [
+    {
+      role: 'system',
+      content: `You are a helpful assistant. Current time and date: ${new Date().toString()}`,
+    },
+    {
+      role: 'user',
+      content: `Hi, what's the weather like in Cracow right now?`,
+    },
+  ];
+
+  // Chat completion
+  llm.generate(chat, TOOL_DEFINITIONS);
+};
+
+useEffect(() => {
+  // Parse response and call tools accordingly
+  // ...
+}, [llm.response]);
+
+return (
+  <View>
+    <Button onPress={handleGenerate} title="Generate!" />
+    <Text>{llm.response}</Text>
+  </View>
+);
+```
+
+## Managed LLM Chat
+
+### Configuring the model
+
+To configure the model (e.g. change the system prompt, load initial conversation history, manage tool calling or set generation settings) you can use the
+[`configure`](../../06-api-reference/classes/LLMModule.md#configure) method. [**`chatConfig`**](../../06-api-reference/interfaces/LLMConfig.md#chatconfig) and [**`toolsConfig`**](../../06-api-reference/interfaces/LLMConfig.md#toolsconfig) are only applied to managed chats, i.e. when using [`sendMessage`](../../06-api-reference/classes/LLMModule.md#sendmessage) (see: [Functional vs managed](../../03-hooks/01-natural-language-processing/useLLM.md#functional-vs-managed)). It accepts an object with the following fields:
+
+- [`chatConfig`](../../06-api-reference/interfaces/LLMConfig.md#chatconfig) - Object configuring chat management that contains:
+  - [`systemPrompt`](../../06-api-reference/interfaces/ChatConfig.md#systemprompt) - Often used to tell the model what is its purpose, for example - "Be a helpful translator".
+
+  - [`initialMessageHistory`](../../06-api-reference/interfaces/ChatConfig.md#initialmessagehistory) - Object that represents the conversation history. This can be used to provide initial context to the model.
+
+  - [`contextStrategy`](../../06-api-reference/interfaces/ChatConfig.md#contextstrategy) - Object implementing [`ContextStrategy`](../../06-api-reference/interfaces/ContextStrategy.md) interface used to manage conversation context, including trimming history if necessary. Custom strategies can be implemented or one of the built-in options can be used (e.g. [`NoopContextStrategy`](../../06-api-reference/classes/NoopContextStrategy.md), [`MessageCountContextStrategy`](../../06-api-reference/classes/MessageCountContextStrategy.md) or the default [`SlidingWindowContextStrategy`](../../06-api-reference/classes/SlidingWindowContextStrategy.md)).
+
+- [`toolsConfig`](../../06-api-reference/interfaces/LLMConfig.md#toolsconfig) - Object configuring options for enabling and managing tool use. **It will only have effect if your model's chat template supports it**. Contains the following properties:
+  - [`tools`](../../06-api-reference/interfaces/ToolsConfig.md#tools) - List of objects defining tools.
+
+  - [`executeToolCallback`](../../06-api-reference/interfaces/ToolsConfig.md#executetoolcallback) - Function that accepts [`ToolCall`](../../06-api-reference/interfaces/ToolCall.md), executes tool and returns the string to model.
+
+  - [`displayToolCalls`](../../06-api-reference/interfaces/ToolsConfig.md#displaytoolcalls) - If set to `true`, JSON tool calls will be displayed in chat. If `false`, only answers will be displayed.
+
+- [`generationConfig`](../../06-api-reference/interfaces/LLMConfig.md#generationconfig) - Object configuring generation settings with following properties:
+  - [`outputTokenBatchSize`](../../06-api-reference/interfaces/GenerationConfig.md#outputtokenbatchsize) - Soft upper limit on the number of tokens in each token batch (in certain cases there can be more tokens in a given batch, e.g. when the batch would end with a special emoji join character).
+
+  - [`batchTimeInterval`](../../06-api-reference/interfaces/GenerationConfig.md#batchtimeinterval) - Upper limit on the time interval between consecutive token batches.
+
+  - [`temperature`](../../06-api-reference/interfaces/GenerationConfig.md#temperature) - Scales output logits by the inverse of temperature. Controls the randomness / creativity of text generation.
+
+  - [`topP`](../../06-api-reference/interfaces/GenerationConfig.md#topp) - Only samples from the smallest set of tokens whose cumulative probability exceeds topP. Range `[0, 1]`. Values of `0` or `1` disable top-p filtering.
+
+  - [`minP`](../../06-api-reference/interfaces/GenerationConfig.md#minp) - Minimum-probability threshold applied after softmax: tokens whose probability is below `minP * max_prob` are excluded from sampling. Range `[0, 1]`. Default `0` disables the filter. Stacks with `topP` when both are set.
+
+  - [`repetitionPenalty`](../../06-api-reference/interfaces/GenerationConfig.md#repetitionpenalty) - Multiplicative penalty applied to logits of tokens that already appeared in the prompt or the generated text. Values greater than `1` discourage repetition; default `1` disables the penalty.
+
+:::info[Built-in models ship with sampling defaults]
+Model presets expose an optional [`generationConfig`](../../06-api-reference/interfaces/LLMProps.md) on the `model` prop. Whenever the upstream model card publishes recommended values (currently Qwen3 and LFM2-VL) the preset carries them and `useLLM` applies them automatically before `isReady` flips — you don't need to call `configure` just to get sensible defaults. Any fields you then pass to `configure` still override on a per-field basis.
+:::
+
+### Model configuration example
+
+```tsx
+import { useEffect } from 'react';
+import {
+  models,
+  MessageCountContextStrategy,
+  DEFAULT_SYSTEM_PROMPT,
+  ToolCall,
+  useLLM,
+} from 'react-native-executorch';
+const TOOL_DEFINITIONS: LLMTool[] = [
+  {
+    name: 'get_weather',
+    description: 'Get/check weather in given location.',
+    parameters: {
+      type: 'dict',
+      properties: {
+        location: {
+          type: 'string',
+          description: 'Location where user wants to check weather',
+        },
+      },
+      required: ['location'],
+    },
+  },
+];
+
+const getWeather = async (_call: ToolCall) => {
+  return 'The weather is great!';
+};
+
+const executeTool: (call: ToolCall) => Promise<string | null> = async (
+  call
+) => {
+  switch (call.toolName) {
+    case 'get_weather':
+      return await getWeather(call);
+    default:
+      console.error(`Wrong function! We don't handle it!`);
+      return null;
+  }
+};
+
+const llm = useLLM({ model: models.llm.lfm2_5_1_2b_instruct() });
+
+const { configure } = llm;
+useEffect(() => {
+  configure({
+    chatConfig: {
+      systemPrompt: `${DEFAULT_SYSTEM_PROMPT} Current time and date: ${new Date().toString()}`,
+      initialMessageHistory: [
+        {
+          role: 'user',
+          content: 'What is the current time and date?',
+        },
+      ],
+      contextStrategy: new MessageCountContextStrategy(6),
+    },
+    toolsConfig: {
+      tools: TOOL_DEFINITIONS,
+      executeToolCallback: executeTool,
+      displayToolCalls: true,
+    },
+    generationConfig: {
+      outputTokenBatchSize: 15,
+      batchTimeInterval: 100,
+      temperature: 0.7,
+      topP: 0.9,
+      minP: 0.05,
+      repetitionPenalty: 1.05,
+    },
+  });
+}, [configure]);
+```
+
+### Sending a message
+
+In order to send a message to the model, one can use the following code:
+
+```tsx
+const llm = useLLM({ model: models.llm.lfm2_5_1_2b_instruct() });
+
+const send = () => {
+  const message = 'Hi, who are you?';
+  llm.sendMessage(message);
+};
+
+return <Button onPress={send} title="Generate!" />;
+```
+
+### Accessing conversation history
+
+Behind the scenes, tokens are generated one by one, and the [`response`](../../06-api-reference/interfaces/LLMType.md#response) property is updated with each token as it’s created.
+If you want to get the entire conversation, you can use the [`messageHistory`](../../06-api-reference/interfaces/LLMType.md#messagehistory) field:
+
+```tsx
+return (
+  <View>
+    {llm.messageHistory.map((message) => (
+      <Text>{message.content}</Text>
+    ))}
+  </View>
+);
+```
+
+### Tool calling example
+
+```tsx
+const TOOL_DEFINITIONS: LLMTool[] = [
+  {
+    name: 'get_weather',
+    description: 'Get/check weather in given location.',
+    parameters: {
+      type: 'dict',
+      properties: {
+        location: {
+          type: 'string',
+          description: 'Location where user wants to check weather',
+        },
+      },
+      required: ['location'],
+    },
+  },
+];
+
+const llm = useLLM({ model: models.llm.hammer2_1_1_5b() });
+
+useEffect(() => {
+  llm.configure({
+    chatConfig: {
+      systemPrompt: `You are helpful assistant. Current time and date: ${new Date().toString()}`,
+    },
+    toolsConfig: {
+      tools: TOOL_DEFINITIONS,
+      executeToolCallback: async (call) => {
+        if (call.toolName === 'get_weather') {
+          console.log('Checking weather!');
+          // perform call to weather API
+          // ...
+          const mockResults = 'Weather is great!';
+          return mockResults;
+        }
+        return null;
+      },
+      displayToolCalls: true,
+    },
+  });
+}, []);
+
+const send = () => {
+  const message = `Hi, what's the weather like in Cracow right now?`;
+  llm.sendMessage(message);
+};
+
+return (
+  <View>
+    <Button onPress={send} title="Generate!" />
+    <Text>{llm.response}</Text>
+  </View>
+);
+```
+
+### Structured output example
+
+```tsx
+import { Schema } from 'jsonschema';
+
+const responseSchema: Schema = {
+  properties: {
+    username: {
+      type: 'string',
+      description: 'Name of user, that is asking a question.',
+    },
+    question: {
+      type: 'string',
+      description: 'Question that user asks.',
+    },
+    bid: {
+      type: 'number',
+      description: 'Amount of money, that user offers.',
+    },
+    currency: {
+      type: 'string',
+      description: 'Currency of offer.',
+    },
+  },
+  required: ['username', 'bid'],
+  type: 'object',
+};
+
+// alternatively use Zod
+import * as z from 'zod/v4';
+const responseSchemaWithZod = z.object({
+  username: z
+    .string()
+    .meta({ description: 'Name of user, that is asking a question.' }),
+  question: z.optional(
+    z.string().meta({ description: 'Question that user asks.' })
+  ),
+  bid: z.number().meta({ description: 'Amount of money, that user offers.' }),
+  currency: z.optional(z.string().meta({ description: 'Currency of offer.' })),
+});
+
+const llm = useLLM({ model: models.llm.qwen3_4b() });
+
+useEffect(() => {
+  const formattingInstructions = getStructuredOutputPrompt(responseSchema);
+  // alternatively pass schema defined with Zod
+  //  const formattingInstructions = getStructuredOutputPrompt(responseSchemaWithZod);
+
+  // Some extra prompting to improve quality of response.
+  const prompt = `Your goal is to parse user's messages and return them in JSON format. Don't respond to user. Simply return JSON with user's question parsed. \n${formattingInstructions}\n /no_think`;
+
+  llm.configure({
+    chatConfig: {
+      systemPrompt: prompt,
+    },
+  });
+}, []);
+
+useEffect(() => {
+  const lastMessage = llm.messageHistory.at(-1);
+  if (!llm.isGenerating && lastMessage?.role === 'assistant') {
+    try {
+      const formattedOutput = fixAndValidateStructuredOutput(
+        lastMessage.content,
+        responseSchema
+      );
+      // Zod will allow you to correctly type output
+      const formattedOutputWithZod = fixAndValidateStructuredOutput(
+        lastMessage.content,
+        responseSchemaWithZod
+      );
+      console.log('Formatted output:', formattedOutput, formattedOutputWithZod);
+    } catch (e) {
+      console.log(
+        "Error parsing output and/or output doesn't match required schema!",
+        e
+      );
+    }
+  }
+}, [llm.messageHistory, llm.isGenerating]);
+
+const send = () => {
+  const message = `I'm John. Is this product damaged? I can give you $100 for this.`;
+  llm.sendMessage(message);
+};
+
+return (
+  <View>
+    <Button onPress={send} title="Generate!" />
+    <Text>{llm.response}</Text>
+  </View>
+);
+```
+
+The response should include JSON:
+
+```json
+{
+  "username": "John",
+  "question": "Is this product damaged?",
+  "bid": 100,
+  "currency": "USD"
+}
+```
+
+## Token Batching
+
+Depending on the selected model and the user's device, generation speed can exceed 60 tokens per second. If the [`tokenCallback`](../../06-api-reference/classes/LLMModule.md#tokencallback) from [`LLMModule`](../../06-api-reference/classes/LLMModule.md), which is used under the hood, triggers rerenders and is invoked on every single token, it can significantly decrease the app's performance. To alleviate this, we've implemented token batching. To configure it, call the [`configure`](../../06-api-reference/interfaces/LLMType.md#configure) method and pass [`generationConfig`](../../06-api-reference/interfaces/LLMConfig.md#generationconfig); the available options are described in [Configuring the model](../../03-hooks/01-natural-language-processing/useLLM.md#configuring-the-model). The `outputTokenBatchSize` and `batchTimeInterval` options set the size of the batch before tokens are emitted and the maximum time interval between consecutive batches, respectively. A batch is emitted as soon as either `batchTimeInterval` has elapsed since the last batch or `outputTokenBatchSize` tokens have been generated. This allows for smooth output even if the model lags during generation. The defaults are 10 tokens and an 80 ms time interval (~12 batches per second).
+
+## Vision-Language Models (VLM)
+
+Some models support multimodal input — text, images and/or audio together. To use them, pass a `capabilities` array when loading the model.
+
+### Loading a VLM
+
+```tsx
+import { models, useLLM } from 'react-native-executorch';
+const llm = useLLM({ model: models.llm.gemma4_e2b_multimodal() });
+```
+
+The `capabilities` field is already set on the model constant. You can also construct the model object explicitly:
+
+```tsx
+const llm = useLLM({
+  model: {
+    modelSource: '...',
+    tokenizerSource: '...',
+    tokenizerConfigSource: '...',
+    capabilities: ['vision', 'audio'],
+  },
+});
+```
+
+Passing `capabilities` unlocks the typed `media` argument on `sendMessage`.
+
+### Sending a message with an image or audio recording
+
+```tsx
+const llm = useLLM({ model: models.llm.gemma4_e2b_multimodal() });
+
+const send = () => {
+  llm.sendMessage('What is in this image?', {
+    imagePath: '/path/to/image.jpg',
+  });
+  // or
+  llm.sendMessage('What can you hear?', {
+    audioBuffer: audioRecording,
+  });
+};
+
+return (
+  <View>
+    <Button onPress={send} title="Send!" />
+    <Text>{llm.response}</Text>
+  </View>
+);
+```
+
+The `imagePath` should be a local file path on the device.
+The `audioBuffer` should be a `Float32Array` containing a 16 kHz waveform.
+
+### Functional generation with images
+
+You can also use `generate` directly by setting `mediaPath` on user messages:
+
+```tsx
+const llm = useLLM({ model: models.llm.lfm2_5_vl_1_6b() });
+
+const handleGenerate = async () => {
+  const chat: Message[] = [
+    {
+      role: 'user',
+      content: 'Describe this image.',
+      mediaPath: '/path/to/image.jpg',
+    },
+  ];
+
+  const response = await llm.generate(chat);
+  console.log(response);
+};
+```
+
+## Available models
+
+| Model Family                                                                               |            Sizes             | Quantized | Capabilities |
+| ------------------------------------------------------------------------------------------ | :--------------------------: | :-------: | :----------: |
+| [Hammer 2.1](https://huggingface.co/software-mansion/react-native-executorch-hammer-2.1)   |        0.5B, 1.5B, 3B        |    ✅     |      -       |
+| [Qwen 2.5](https://huggingface.co/software-mansion/react-native-executorch-qwen-2.5)       |        0.5B, 1.5B, 3B        |    ✅     |      -       |
+| [Qwen 3](https://huggingface.co/software-mansion/react-native-executorch-qwen-3)           |        0.6B, 1.7B, 4B        |    ✅     |      -       |
+| [Qwen 3.5](https://huggingface.co/software-mansion/react-native-executorch-qwen-3.5)       |           0.8B, 2B           |    ✅     |      -       |
+| [Phi 4 Mini](https://huggingface.co/software-mansion/react-native-executorch-phi-4-mini)   |              4B              |    ✅     |      -       |
+| [SmolLM 2](https://huggingface.co/software-mansion/react-native-executorch-smolLm-2)       |       135M, 360M, 1.7B       |    ✅     |      -       |
+| [LLaMA 3.2](https://huggingface.co/software-mansion/react-native-executorch-llama-3.2)     |            1B, 3B            |    ✅     |      -       |
+| [Bielik v3.0](https://huggingface.co/software-mansion/react-native-executorch-bielik-v3.0) |             1.5B             |    ✅     |      -       |
+| [LFM2.5](https://huggingface.co/software-mansion/react-native-executorch-lfm-2.5)          | 350M, 450M-VL, 1.2B, 1.6B-VL |    ✅     |    vision    |
diff --git a/docs/versioned_docs/version-0.9.x/04-typescript-api/01-natural-language-processing/LLMModule.md b/docs/versioned_docs/version-0.9.x/04-typescript-api/01-natural-language-processing/LLMModule.md
new file mode 100644
index 0000000000..48b3395f87
--- /dev/null
+++ b/docs/versioned_docs/version-0.9.x/04-typescript-api/01-natural-language-processing/LLMModule.md
@@ -0,0 +1,199 @@
+---
+title: LLMModule
+---
+
+TypeScript API implementation of the [useLLM](../../03-hooks/01-natural-language-processing/useLLM.md) hook.
+
+## API Reference
+
+- For detailed API Reference for `LLMModule` see: [`LLMModule` API Reference](../../06-api-reference/classes/LLMModule.md).
+- For all LLM models available out-of-the-box in React Native ExecuTorch see: [LLM Models](../../06-api-reference/index.md#models---llm).
+- For useful LLM utility functionalities please refer to the following link: [LLM Utility Functionalities](../../06-api-reference/index.md#utilities---llm).
+
+## High Level Overview
+
+```typescript
+import { models, LLMModule } from 'react-native-executorch';
+// Creating an instance and loading the model
+const llm = await LLMModule.fromModelName(
+  models.llm.lfm2_5_1_2b_instruct(),
+  (progress) => console.log(progress),
+  (token) => console.log(token),
+  (messages) => console.log(messages)
+);
+
+// Running the model - returns the generated response
+const response = await llm.sendMessage('Hello, World!');
+console.log('Response:', response);
+
+// Interrupting the model (to actually interrupt the generation it has to be called when sendMessage or generate is running)
+llm.interrupt();
+
+// Deleting the model from memory
+llm.delete();
+```
+
+### Methods
+
+All methods of `LLMModule` are explained in detail here: [LLMModule API Reference](../../06-api-reference/classes/LLMModule.md).
+
+## Loading the model
+
+Use the static [`fromModelName`](../../06-api-reference/classes/LLMModule.md#frommodelname) factory method:
+
+```typescript
+const llm = await LLMModule.fromModelName(
+  models.llm.lfm2_5_1_2b_instruct(), // model config constant
+  onDownloadProgress, // optional, progress 0–1
+  tokenCallback, // optional, called on every token
+  messageHistoryCallback // optional, called when generation finishes
+);
+```
+
+The model config object contains `modelSource`, `tokenizerSource`, `tokenizerConfigSource`, and optional `capabilities`. Pass one of the built-in constants (e.g. `LFM2_5_1_2B_INSTRUCT`) or construct it manually.
+
+This method returns a promise resolving to an `LLMModule` instance.
+
+For more information on loading resources, take a look at [loading models](../../01-fundamentals/02-loading-models.md) page.
+
+## Listening for download progress
+
+To subscribe to the download progress event, you can pass the `onDownloadProgress` callback as the second argument to [`fromModelName`](../../06-api-reference/classes/LLMModule.md#frommodelname). This function is called whenever the download progress changes.
+
+## Running the model
+
+To run the model, you can use the [`generate`](../../06-api-reference/classes/LLMModule.md#generate) method. It allows you to pass chat messages and receive a completion from the model. It doesn't provide any message history management.
+
+Alternatively in managed chat (see: [Functional vs managed](../../03-hooks/01-natural-language-processing/useLLM.md#functional-vs-managed)), you can use the [`sendMessage`](../../06-api-reference/classes/LLMModule.md#sendmessage) method. It accepts the user message and returns a promise that resolves to the generated response. Additionally, it will call [`messageHistoryCallback`](../../06-api-reference/classes/LLMModule.md#messagehistorycallback) with the updated message history containing both user message and model response.
+
+If you need raw model access without any wrappers, you can use [`forward`](../../06-api-reference/classes/LLMModule.md#forward). It provides direct access to the model, so the input string is passed straight into the model and the generated response is returned. It may be useful when working with models that aren't fine-tuned for chat completions. If you're not sure what the implications of that are (e.g. that you have to include special model tokens), you're better off with [`sendMessage`](../../06-api-reference/classes/LLMModule.md#sendmessage).
+
+## Listening for generated tokens
+
+To subscribe to the token generation event, you can pass [`tokenCallback`](../../06-api-reference/classes/LLMModule.md#tokencallback) or [`messageHistoryCallback`](../../06-api-reference/classes/LLMModule.md#messagehistorycallback) functions to [`fromModelName`](../../06-api-reference/classes/LLMModule.md#frommodelname) (or [`fromCustomModel`](../../06-api-reference/classes/LLMModule.md#fromcustommodel)). [`tokenCallback`](../../06-api-reference/classes/LLMModule.md#tokencallback) is called on every token and contains only the most recent token. [`messageHistoryCallback`](../../06-api-reference/classes/LLMModule.md#messagehistorycallback) is called whenever the model finishes generation and contains the whole message history, including the user's and the model's last messages.
+
+## Interrupting the model
+
+In order to interrupt the model, you can use the [`interrupt`](../../06-api-reference/classes/LLMModule.md#interrupt) method.
+
+## Token Batching
+
+Depending on the selected model and the user's device, generation speed can be above 60 tokens per second. If the [`tokenCallback`](../../06-api-reference/classes/LLMModule.md#tokencallback) triggers rerenders and is invoked on every single token, it can significantly decrease the app's performance. To alleviate this and help improve performance we've implemented token batching. To configure this you need to call the [`configure`](../../06-api-reference/classes/LLMModule.md#configure) method and pass [`generationConfig`](../../06-api-reference/interfaces/LLMConfig.md#generationconfig). The next section lists what you can tweak with this config.
+
+## Configuring the model
+
+To configure the model (e.g. change the system prompt, load an initial conversation history, manage tool calling or set generation settings) you can use
+the [`configure`](../../06-api-reference/classes/LLMModule.md#configure) method. [**`chatConfig`**](../../06-api-reference/interfaces/LLMConfig.md#chatconfig) and [**`toolsConfig`**](../../06-api-reference/interfaces/LLMConfig.md#toolsconfig) are only applied to managed chats, i.e. when using [`sendMessage`](../../06-api-reference/classes/LLMModule.md#sendmessage) (see: [Functional vs managed](../../03-hooks/01-natural-language-processing/useLLM.md#functional-vs-managed)). `configure` accepts an object with the following fields:
+
+- [`chatConfig`](../../06-api-reference/interfaces/LLMConfig.md#chatconfig) - Object configuring chat management that contains:
+  - [`systemPrompt`](../../06-api-reference/interfaces/ChatConfig.md#systemprompt) - Often used to tell the model what is its purpose, for example - "Be a helpful translator".
+
+  - [`initialMessageHistory`](../../06-api-reference/interfaces/ChatConfig.md#initialmessagehistory) - Object that represents the conversation history. This can be used to provide initial context to the model.
+
+  - [`contextStrategy`](../../06-api-reference/interfaces/ChatConfig.md#contextstrategy) - Object implementing [`ContextStrategy`](../../06-api-reference/interfaces/ContextStrategy.md) interface used to manage conversation context, including trimming history if necessary. Custom strategies can be implemented or one of the built-in options can be used (e.g. [`NoopContextStrategy`](../../06-api-reference/classes/NoopContextStrategy.md), [`MessageCountContextStrategy`](../../06-api-reference/classes/MessageCountContextStrategy.md) or the default [`SlidingWindowContextStrategy`](../../06-api-reference/classes/SlidingWindowContextStrategy.md)).
+
+- [`toolsConfig`](../../06-api-reference/interfaces/ToolsConfig.md) - Object configuring options for enabling and managing tool use. **It will only have an effect if your model's chat template supports it**. Contains the following properties:
+  - [`tools`](../../06-api-reference/interfaces/ToolsConfig.md#tools) - List of objects defining tools.
+
+  - [`executeToolCallback`](../../06-api-reference/interfaces/ToolsConfig.md#executetoolcallback) - Function that accepts [`ToolCall`](../../06-api-reference/interfaces/ToolCall.md), executes tool and returns the string to model.
+
+  - [`displayToolCalls`](../../06-api-reference/interfaces/ToolsConfig.md#displaytoolcalls) - If set to `true`, JSON tool calls will be displayed in chat. If `false`, only answers will be displayed.
+
+- [`generationConfig`](../../06-api-reference/interfaces/LLMConfig.md#generationconfig) - Object configuring generation settings with following properties:
+  - [`outputTokenBatchSize`](../../06-api-reference/interfaces/GenerationConfig.md#outputtokenbatchsize) - Soft upper limit on the number of tokens in each token batch (in certain cases there can be more tokens in a given batch, e.g. when the batch would end with a special emoji join character).
+
+  - [`batchTimeInterval`](../../06-api-reference/interfaces/GenerationConfig.md#batchtimeinterval) - Upper limit on the time interval between consecutive token batches.
+
+  - [`temperature`](../../06-api-reference/interfaces/GenerationConfig.md#temperature) - Scales output logits by the inverse of temperature. Controls the randomness / creativity of text generation.
+
+  - [`topP`](../../06-api-reference/interfaces/GenerationConfig.md#topp) - Only samples from the smallest set of tokens whose cumulative probability exceeds topP. Range `[0, 1]`. Values of `0` or `1` disable top-p filtering.
+
+  - [`minP`](../../06-api-reference/interfaces/GenerationConfig.md#minp) - Minimum-probability threshold applied after softmax: tokens whose probability is below `minP * max_prob` are excluded from sampling. Range `[0, 1]`. Default `0` disables the filter. Stacks with `topP` when both are set.
+
+  - [`repetitionPenalty`](../../06-api-reference/interfaces/GenerationConfig.md#repetitionpenalty) - Multiplicative penalty applied to logits of tokens that already appeared in the prompt or the generated text. Values greater than `1` discourage repetition; default `1` disables the penalty.
+
+:::info[Built-in models ship with sampling defaults]
+Model presets expose an optional `generationConfig` that `LLMModule.fromModelName` applies automatically when available — for Qwen3 and LFM2-VL this means the model-card recommended sampling settings are in effect without any explicit `configure` call. Any fields you pass to `configure` still override on a per-field basis.
+:::
+
+## Vision-Language Models (VLM)
+
+Some models support multimodal input — text, images and/or audio together. To use them, pass `capabilities` in the model object when calling [`fromModelName`](../../06-api-reference/classes/LLMModule.md#frommodelname):
+
+```typescript
+import { models, LLMModule } from 'react-native-executorch';
+const llm = await LLMModule.fromModelName(
+  models.llm.gemma4_e2b_multimodal(),
+  undefined,
+  (token) => console.log(token)
+);
+```
+
+The `capabilities` field is already set on the model constant. You can also construct the model object explicitly:
+
+```typescript
+const llm = await LLMModule.fromModelName({
+  modelName: 'gemma4-e2b-multimodal',
+  modelSource: require('./path/to/model.pte'),
+  tokenizerSource: require('./path/to/tokenizer.json'),
+  tokenizerConfigSource: require('./path/to/tokenizer_config.json'),
+  capabilities: ['vision', 'audio'],
+});
+```
+
+Once loaded, pass `imagePath` or `audioBuffer` to [`sendMessage`](../../06-api-reference/classes/LLMModule.md#sendmessage):
+
+```typescript
+const response = await llm.sendMessage('What is in this image?', {
+  imagePath: '/path/to/image.jpg',
+});
+// or
+const response = await llm.sendMessage('What can you hear?', {
+  audioBuffer: audioRecording, //expected as waveform 16kHz
+});
+```
+
+Or use [`generate`](../../06-api-reference/classes/LLMModule.md#generate) with `mediaPath` (for an image) or `audioWaveform` (for audio) on the message:
+
+```typescript
+const chat: Message[] = [
+  {
+    role: 'user',
+    content: 'Describe this image.',
+    mediaPath: '/path/to/image.jpg',
+  },
+];
+// or
+const chat: Message[] = [
+  {
+    role: 'user',
+    content: 'Transcribe the recording.',
+    audioWaveform: audioRecording,
+  },
+];
+const response = await llm.generate(chat);
+```
+
+## Using a custom model
+
+Use [`fromCustomModel`](../../06-api-reference/classes/LLMModule.md#fromcustommodel) to load your own exported LLM instead of a built-in preset:
+
+```typescript
+import { LLMModule } from 'react-native-executorch';
+const llm = await LLMModule.fromCustomModel(
+  'https://example.com/model.pte',
+  'https://example.com/tokenizer.json',
+  'https://example.com/tokenizer_config.json',
+  (progress) => console.log(progress),
+  (token) => console.log(token),
+  (messages) => console.log(messages)
+);
+```
+
+### Required model contract
+
+The `.pte` model binary must be exported following the [ExecuTorch LLM export process](https://docs.pytorch.org/executorch/1.1/llm/export-llm.html). The native runner expects the standard ExecuTorch text-generation interface — KV-cache management, prefill/decode phases, and logit sampling are all handled by the runtime.
+
+## Deleting the model from memory
+
+To delete the model from memory, you can use the [`delete`](../../06-api-reference/classes/LLMModule.md#delete) method.
diff --git a/packages/bare-resource-fetcher/package.json b/packages/bare-resource-fetcher/package.json
index f6e261c9bc..fab8a23943 100644
--- a/packages/bare-resource-fetcher/package.json
+++ b/packages/bare-resource-fetcher/package.json
@@ -1,6 +1,6 @@
 {
   "name": "react-native-executorch-bare-resource-fetcher",
-  "version": "0.9.0",
+  "version": "0.9.1",
   "description": "Bare React Native resource fetcher for react-native-executorch",
   "main": "lib/index.js",
   "types": "lib/index.d.ts",
diff --git a/packages/expo-resource-fetcher/package.json b/packages/expo-resource-fetcher/package.json
index 7e661e2f02..80c654a507 100644
--- a/packages/expo-resource-fetcher/package.json
+++ b/packages/expo-resource-fetcher/package.json
@@ -1,6 +1,6 @@
 {
   "name": "react-native-executorch-expo-resource-fetcher",
-  "version": "0.9.0",
+  "version": "0.9.1",
   "description": "Expo resource fetcher for react-native-executorch",
   "main": "lib/index.js",
   "types": "lib/index.d.ts",
diff --git a/packages/react-native-executorch/android/libs/classes.jar b/packages/react-native-executorch/android/libs/classes.jar
index be5ec2ee7f..81eee28f71 100644
Binary files a/packages/react-native-executorch/android/libs/classes.jar and b/packages/react-native-executorch/android/libs/classes.jar differ
diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
index f94ef918ac..e4209b2f79 100644
--- a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
+++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
@@ -4,6 +4,7 @@
 #include <cstdint>
 #include <set>
 #include <span>
+#include <string>
 #include <type_traits>
 #include <unordered_map>
 #include <variant>
@@ -17,6 +18,7 @@
 
 #include <rnexecutorch/metaprogramming/TypeConcepts.h>
 #include <rnexecutorch/models/instance_segmentation/Types.h>
+#include <rnexecutorch/models/llm/Types.h>
 #include <rnexecutorch/models/object_detection/Constants.h>
 #include <rnexecutorch/models/object_detection/Types.h>
 #include <rnexecutorch/models/ocr/Types.h>
@@ -223,6 +225,22 @@ inline std::vector<float> getValue<std::vector<float>>(const jsi::Value &val,
   return getArrayAsVector<float>(val, runtime);
 }
 
+template <> // JS array -> vector of owned float vectors, one per element
+inline std::vector<std::vector<float>>
+getValue<std::vector<std::vector<float>>>(const jsi::Value &val,
+                                          jsi::Runtime &runtime) {
+  jsi::Array array = val.asObject(runtime).asArray(runtime);
+  const size_t length = array.size(runtime);
+  std::vector<std::vector<float>> result;
+  result.reserve(length); // one inner vector per array element
+  for (size_t i = 0; i < length; ++i) { // read each element as a float span
+    jsi::Value element = array.getValueAtIndex(runtime, i);
+    auto span = getTypedArrayAsSpan<float>(element, runtime);
+    result.emplace_back(span.begin(), span.end()); // copy span contents
+  }
+  return result;
+}
+
 template <>
 inline std::vector<int64_t>
 getValue<std::vector<int64_t>>(const jsi::Value &val, jsi::Runtime &runtime) {
@@ -302,6 +320,31 @@ getValue<std::span<uint64_t>>(const jsi::Value &val, jsi::Runtime &runtime) {
   return getTypedArrayAsSpan<uint64_t>(val, runtime);
 }
 
+template <> // JS object -> MultimodalInputs; each modality is optional
+inline models::llm::MultimodalInputs
+getValue<models::llm::MultimodalInputs>(const jsi::Value &val,
+                                        jsi::Runtime &runtime) {
+  models::llm::MultimodalInputs multimodalInputs;
+  jsi::Object obj = val.asObject(runtime);
+
+  jsi::Value v = obj.getProperty(runtime, "imageToken");
+  if (!v.isUndefined() && !v.isNull()) { // images set only with a token
+    auto &images = multimodalInputs.images.emplace();
+    images.token = getValue<std::string>(v, runtime);
+    v = obj.getProperty(runtime, "imagePaths"); // read whenever token is set
+    images.paths = getValue<std::vector<std::string>>(v, runtime);
+  }
+  v = obj.getProperty(runtime, "audioToken");
+  if (!v.isUndefined() && !v.isNull()) { // audios set only with a token
+    auto &audios = multimodalInputs.audios.emplace();
+    audios.token = getValue<std::string>(v, runtime);
+    v = obj.getProperty(runtime, "audioWaveforms"); // read whenever token set
+    audios.waveforms = getValue<std::vector<std::vector<float>>>(v, runtime);
+  }
+
+  return multimodalInputs; // an empty optional means no token was given
+}
+
 // Conversion from C++ types to jsi --------------------------------------------
 
 // Implementation functions might return any type, but in a promise we can only
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
index 7e0fa4b26e..924bba9f99 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
@@ -1,11 +1,12 @@
 #include "LLM.h"
+#include "rnexecutorch/models/llm/Types.h"
 
 #include <executorch/extension/tensor/tensor.h>
 #include <filesystem>
 #include <map>
 #include <rnexecutorch/Error.h>
-#include <rnexecutorch/Log.h>
 #include <rnexecutorch/threads/GlobalThreadPool.h>
+#include <runner/encoders/audio_encoder.h>
 #include <runner/encoders/vision_encoder.h>
 #include <runner/multimodal_runner.h>
 #include <runner/text_runner.h>
@@ -21,7 +22,6 @@ LLM::LLM(const std::string &modelSource, const std::string &tokenizerSource,
          std::vector<std::string> capabilities,
          std::shared_ptr<react::CallInvoker> callInvoker)
     : BaseModel(modelSource, callInvoker, Module::LoadMode::Mmap) {
-
   if (capabilities.empty()) {
     runner_ =
         std::make_unique<llm::TextRunner>(std::move(module_), tokenizerSource);
@@ -31,6 +31,9 @@ LLM::LLM(const std::string &modelSource, const std::string &tokenizerSource,
       if (cap == "vision") {
         encoders[llm::MultimodalType::Image] =
             std::make_unique<llm::VisionEncoder>(*module_);
+      } else if (cap == "audio") {
+        encoders[llm::MultimodalType::Audio] =
+            std::make_unique<llm::AudioEncoder>(*module_);
       }
     }
     runner_ = std::make_unique<llm::MultimodalRunner>(
@@ -75,62 +78,73 @@ std::string LLM::generate(std::string input,
 }
 
 std::string LLM::generateMultimodal(std::string prompt,
-                                    std::vector<std::string> imagePaths,
-                                    std::string imageToken,
-                                    std::shared_ptr<jsi::Function> callback) {
+                                    std::shared_ptr<jsi::Function> callback,
+                                    MultimodalInputs mutlimodalInputs) {
   if (!runner_ || !runner_->is_loaded()) {
     throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded,
                             "Runner is not loaded");
   }
   if (!runner_->is_multimodal()) {
-    throw RnExecutorchError(
-        RnExecutorchErrorCode::InvalidUserInput,
-        "This model does not support multimodal input. Use generate(prompt, "
-        "callback) for text-only generation.");
+    throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput,
+                            "This model does not support multimodal input.");
   }
-  if (imageToken.empty()) {
+  if (!mutlimodalInputs.images.has_value() &&
+      !mutlimodalInputs.audios.has_value()) {
     throw RnExecutorchError(
         RnExecutorchErrorCode::InvalidUserInput,
-        "imageToken must not be empty. Pass the model's image token (e.g. "
-        "from tokenizer_config.json).");
+        "At least one of imageToken/audioToken must be non-empty");
   }
 
-  const size_t kImageTokenLen = imageToken.size();
-
+  // Scan the prompt once, splitting at the earliest placeholder at each step
+  // so that image/audio placeholders can be freely interleaved in the prompt.
   std::vector<llm::MultimodalInput> inputs;
-  size_t imageIdx = 0;
-  size_t searchPos = 0;
-
-  while (true) {
-    size_t found = prompt.find(imageToken, searchPos);
-    if (found == std::string::npos) {
-      if (searchPos < prompt.size()) {
-        inputs.push_back(llm::make_text_input(prompt.substr(searchPos)));
-      }
+  size_t imageIdx = 0, audioIdx = 0, pos = 0;
+  while (pos < prompt.size()) {
+    size_t imgAt = mutlimodalInputs.images.has_value()
+                       ? prompt.find(mutlimodalInputs.images.value().token, pos)
+                       : std::string::npos;
+    size_t audAt = mutlimodalInputs.audios.has_value()
+                       ? prompt.find(mutlimodalInputs.audios.value().token, pos)
+                       : std::string::npos;
+    if (imgAt == std::string::npos && audAt == std::string::npos) {
+      inputs.push_back(llm::make_text_input(prompt.substr(pos)));
       break;
     }
-    // Text segment before this placeholder
-    if (found > searchPos) {
-      inputs.push_back(
-          llm::make_text_input(prompt.substr(searchPos, found - searchPos)));
+    const bool imageFirst = imgAt != std::string::npos &&
+                            (audAt == std::string::npos || imgAt < audAt);
+    size_t at = imageFirst ? imgAt : audAt;
+    if (at > pos) {
+      inputs.push_back(llm::make_text_input(prompt.substr(pos, at - pos)));
     }
-    // Image at this position
-    if (imageIdx >= imagePaths.size()) {
-      throw RnExecutorchError(
-          RnExecutorchErrorCode::InvalidUserInput,
-          "More '" + imageToken +
-              "' placeholders in prompt than image paths provided");
+    if (imageFirst) {
+      auto &images = mutlimodalInputs.images.value();
+      if (imageIdx >= images.paths.size()) {
+        throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput,
+                                "More '" + images.token +
+                                    "' placeholders than image paths");
+      }
+      inputs.push_back(llm::make_image_input(images.paths[imageIdx++]));
+      pos = at + images.token.size();
+    } else {
+      auto &audios = mutlimodalInputs.audios.value();
+      if (audioIdx >= audios.waveforms.size()) {
+        throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput,
+                                "More '" + audios.token +
+                                    "' placeholders than audio waveforms");
+      }
+      inputs.push_back(
+          llm::make_audio_input(std::move(audios.waveforms[audioIdx++])));
+      pos = at + audios.token.size();
     }
-    inputs.push_back(llm::make_image_input(imagePaths[imageIdx++]));
-    searchPos = found + kImageTokenLen;
   }
-
-  if (imageIdx < imagePaths.size()) {
-    throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput,
-                            "More image paths provided than '" + imageToken +
-                                "' placeholders in prompt");
+  if ((mutlimodalInputs.images.has_value() &&
+       imageIdx < mutlimodalInputs.images.value().paths.size()) ||
+      (mutlimodalInputs.audios.has_value() &&
+       audioIdx < mutlimodalInputs.audios.value().waveforms.size())) {
+    throw RnExecutorchError(
+        RnExecutorchErrorCode::InvalidUserInput,
+        "More image/audio paths provided than placeholders in prompt");
   }
-
   if (inputs.empty()) {
     throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput,
                             "No inputs to generate from");
@@ -150,7 +164,6 @@ std::string LLM::generateMultimodal(std::string prompt,
   if (error != Error::Ok) {
     throw RnExecutorchError(error, "Failed to generate multimodal response");
   }
-
   return output;
 }
 
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h
index 222b5bc62f..4b7087351b 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h
@@ -7,6 +7,7 @@
 #include <ReactCommon/CallInvoker.h>
 #include <jsi/jsi.h>
 #include <rnexecutorch/models/BaseModel.h>
+#include <rnexecutorch/models/llm/Types.h>
 #include <runner/base_llm_runner.h>
 
 namespace rnexecutorch {
@@ -22,10 +23,10 @@ class LLM : public BaseModel {
 
   std::string generate(std::string prompt,
                        std::shared_ptr<jsi::Function> callback);
+
   std::string generateMultimodal(std::string prompt,
-                                 std::vector<std::string> imagePaths,
-                                 std::string imageToken,
-                                 std::shared_ptr<jsi::Function> callback);
+                                 std::shared_ptr<jsi::Function> callback,
+                                 MultimodalInputs mutlimodalInputs = {});
 
   void interrupt();
   void reset();
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/Types.h b/packages/react-native-executorch/common/rnexecutorch/models/llm/Types.h
new file mode 100644
index 0000000000..921d4fa8f4
--- /dev/null
+++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/Types.h
@@ -0,0 +1,23 @@
+#pragma once
+
+#include <optional>
+#include <string>
+#include <vector>
+
+namespace rnexecutorch::models::llm {
+struct ImageInputs { // image files plus the prompt placeholder they fill
+  std::vector<std::string> paths; // consumed in order, one per placeholder
+  std::string token;              // placeholder text searched in the prompt
+};
+
+struct AudioInputs { // audio waveforms plus the placeholder they fill
+  std::vector<std::vector<float>> waveforms; // consumed in order
+  std::string token; // placeholder text searched in the prompt
+};
+
+struct MultimodalInputs { // optional per-modality inputs for one prompt
+  std::optional<ImageInputs> images; // empty when no image token was given
+  std::optional<AudioInputs> audios; // empty when no audio token was given
+};
+
+} // namespace rnexecutorch::models::llm
diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt
index 1f34b3a18e..5f9d7287a5 100644
--- a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt
+++ b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt
@@ -293,6 +293,7 @@ add_rn_test(LLMTests integration/LLMTest.cpp
         ${COMMON_DIR}/runner/sampler.cpp
         ${COMMON_DIR}/runner/arange_util.cpp
         ${COMMON_DIR}/runner/encoders/vision_encoder.cpp
+        ${COMMON_DIR}/runner/encoders/audio_encoder.cpp
         ${IMAGE_UTILS_SOURCES}
     LIBS tokenizers_deps opencv_deps
 )
diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp
index ae0a11e777..4b34f4248e 100644
--- a/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp
@@ -1,11 +1,15 @@
 #include "BaseModelTests.h"
+#include "utils/TestUtils.h"
 #include <gtest/gtest.h>
 #include <memory>
 #include <string>
+#include <utility>
+#include <vector>
 
 #include <ReactCommon/CallInvoker.h>
 #include <rnexecutorch/Error.h>
 #include <rnexecutorch/models/llm/LLM.h>
+#include <runner/encoders/audio_encoder.h>
 #include <runner/encoders/vision_encoder.h>
 
 using namespace rnexecutorch;
@@ -30,6 +34,12 @@ std::string formatChatML(const std::string &systemPrompt,
          "<|im_start|>assistant\n";
 }
 
+// Wraps userMessage in a single Gemma user turn and opens the model turn.
+std::string formatGemma(const std::string &userMessage) {
+  return "<start_of_turn>user\n" + userMessage + "<end_of_turn>\n" +
+         "<start_of_turn>model\n";
+}
+
 // ============================================================================
 // Common tests via typed test suite
 // ============================================================================
@@ -227,6 +237,18 @@ TEST(VisionEncoderTest, LoadFailsWithClearErrorWhenMethodMissing) {
   EXPECT_THROW(encoder->load(), rnexecutorch::RnExecutorchError);
 }
 
+TEST(AudioEncoderTest, LoadFailsWithClearErrorWhenMethodMissing) {
+  // smolLm2_135M_8da4w.pte has no audio_encoder method
+  auto module = std::make_unique<::executorch::extension::Module>(
+      "smolLm2_135M_8da4w.pte",
+      ::executorch::extension::Module::LoadMode::File);
+
+  auto encoder =
+      std::make_unique<executorch::extension::llm::AudioEncoder>(*module);
+
+  EXPECT_THROW(encoder->load(), rnexecutorch::RnExecutorchError);
+}
+
 // ============================================================================
 // VLM-specific tests
 // ============================================================================
@@ -243,7 +265,11 @@ TEST_F(LLMTest, TextModelIsNotMultimodal) {
 
 TEST_F(LLMTest, GenerateMultimodalOnTextModelThrows) {
   LLM model(kValidModelPath, kValidTokenizerPath, {}, mockInvoker_);
-  EXPECT_THROW(model.generateMultimodal("hello", {}, "<image>", nullptr),
+  // A text-only runner reports is_multimodal() == false, so any multimodal
+  // call must be rejected before the inputs are even inspected.
+  MultimodalInputs inputs{.images =
+                              ImageInputs{.paths = {}, .token = "<image>"}};
+  EXPECT_THROW(model.generateMultimodal("hello", nullptr, std::move(inputs)),
                RnExecutorchError);
 }
 
@@ -270,22 +296,120 @@ std::shared_ptr<facebook::react::CallInvoker> VLMTest::invoker_;
 std::unique_ptr<LLM> VLMTest::model_;
 
 TEST_F(VLMTest, GenerateMultimodalEmptyImageTokenThrows) {
-  EXPECT_THROW(
-      model_->generateMultimodal("hello", {kTestImagePath}, "", nullptr),
-      RnExecutorchError);
+  MultimodalInputs inputs{
+      .images = ImageInputs{.paths = {kTestImagePath}, .token = ""}};
+  EXPECT_THROW(model_->generateMultimodal("hello", nullptr, std::move(inputs)),
+               RnExecutorchError);
 }
 
 TEST_F(VLMTest, GenerateMultimodalMorePlaceholdersThanImagePaths) {
   std::string prompt = std::string(kVlmImageToken) + " and " + kVlmImageToken;
-  EXPECT_THROW(model_->generateMultimodal(prompt, {kTestImagePath},
-                                          kVlmImageToken, nullptr),
+  MultimodalInputs inputs{.images = ImageInputs{.paths = {kTestImagePath},
+                                                .token = kVlmImageToken}};
+  EXPECT_THROW(model_->generateMultimodal(prompt, nullptr, std::move(inputs)),
                RnExecutorchError);
 }
 
 TEST_F(VLMTest, GenerateMultimodalMoreImagePathsThanPlaceholders) {
   std::string prompt = std::string(kVlmImageToken) + " describe";
-  EXPECT_THROW(model_->generateMultimodal(prompt,
-                                          {kTestImagePath, kTestImagePath},
-                                          kVlmImageToken, nullptr),
+  MultimodalInputs inputs{
+      .images = ImageInputs{.paths = {kTestImagePath, kTestImagePath},
+                            .token = kVlmImageToken}};
+  EXPECT_THROW(model_->generateMultimodal(prompt, nullptr, std::move(inputs)),
+               RnExecutorchError);
+}
+
+// ============================================================================
+// Audio (Gemma 4) multimodal tests
+// ============================================================================
+constexpr auto kGemmaModelPath = "gemma4_e2b_mm_xnnpack.pte";
+constexpr auto kGemmaTokenizerPath = "gemma_tokenizer.json";
+constexpr auto kGemmaAudioToken = "<audio_soft_token>";
+constexpr auto kTestAudioPath = "test_audio_float.raw";
+
+// Fixture that loads the audio-capable Gemma model once for all audio tests.
+class GemmaAudioTest : public ::testing::Test {
+protected:
+  static void SetUpTestSuite() {
+    invoker_ = createMockCallInvoker();
+    model_ = std::make_unique<LLM>(kGemmaModelPath, kGemmaTokenizerPath,
+                                   std::vector<std::string>{"vision", "audio"},
+                                   invoker_);
+  }
+
+  static void TearDownTestSuite() {
+    model_.reset();
+    invoker_.reset();
+  }
+
+  static std::vector<float> loadAudio(size_t maxSamples = 32000) {
+    auto wav = test_utils::loadAudioFromFile(kTestAudioPath);
+    if (wav.size() > maxSamples) {
+      wav.resize(maxSamples);
+    }
+    return wav;
+  }
+
+  static std::shared_ptr<facebook::react::CallInvoker> invoker_;
+  static std::unique_ptr<LLM> model_;
+};
+
+std::shared_ptr<facebook::react::CallInvoker> GemmaAudioTest::invoker_;
+std::unique_ptr<LLM> GemmaAudioTest::model_;
+
+TEST_F(GemmaAudioTest, GenerateMultimodalNoInputsThrows) {
+  EXPECT_THROW(model_->generateMultimodal("hello", nullptr, {}),
+               RnExecutorchError);
+}
+
+TEST_F(GemmaAudioTest, GenerateMultimodalEmptyAudioTokenThrows) {
+  MultimodalInputs inputs{
+      .audios = AudioInputs{.waveforms = {loadAudio()}, .token = ""}};
+  EXPECT_THROW(model_->generateMultimodal("hello", nullptr, std::move(inputs)),
+               RnExecutorchError);
+}
+
+TEST_F(GemmaAudioTest, GenerateMultimodalMorePlaceholdersThanWaveformsThrows) {
+  std::string prompt =
+      std::string(kGemmaAudioToken) + " and " + kGemmaAudioToken;
+  MultimodalInputs inputs{.audios = AudioInputs{.waveforms = {loadAudio()},
+                                                .token = kGemmaAudioToken}};
+  EXPECT_THROW(model_->generateMultimodal(prompt, nullptr, std::move(inputs)),
                RnExecutorchError);
 }
+
+TEST_F(GemmaAudioTest, GenerateMultimodalMoreWaveformsThanPlaceholdersThrows) {
+  std::string prompt = std::string(kGemmaAudioToken) + " describe";
+  MultimodalInputs inputs{
+      .audios = AudioInputs{.waveforms = {loadAudio(), loadAudio()},
+                            .token = kGemmaAudioToken}};
+  EXPECT_THROW(model_->generateMultimodal(prompt, nullptr, std::move(inputs)),
+               RnExecutorchError);
+}
+
+TEST_F(GemmaAudioTest, GenerateMultimodalAudioProducesOutput) {
+  std::vector<float> wav = loadAudio();
+  ASSERT_FALSE(wav.empty())
+      << "test_audio_float.raw missing on device - check run_tests.sh assets";
+
+  std::string prompt =
+      formatGemma(std::string(kGemmaAudioToken) + " Transcribe the audio.");
+  MultimodalInputs inputs{.audios = AudioInputs{.waveforms = {std::move(wav)},
+                                                .token = kGemmaAudioToken}};
+  std::string output =
+      model_->generateMultimodal(prompt, nullptr, std::move(inputs));
+
+  EXPECT_FALSE(output.empty());
+  EXPECT_GT(model_->getGeneratedTokenCount(), 0);
+}
+
+TEST_F(GemmaAudioTest, GenerateMultimodalInterleavedTextAndAudio) {
+  std::vector<float> wav = loadAudio();
+  ASSERT_FALSE(wav.empty()) << "test_audio_float.raw missing on device";
+  std::string prompt = formatGemma("Listen: " + std::string(kGemmaAudioToken) +
+                                   " then summarise it.");
+  MultimodalInputs inputs{.audios = AudioInputs{.waveforms = {std::move(wav)},
+                                                .token = kGemmaAudioToken}};
+  EXPECT_FALSE(
+      model_->generateMultimodal(prompt, nullptr, std::move(inputs)).empty());
+}
diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh b/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh
index 9fbbaade13..5896d16a3e 100755
--- a/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh
+++ b/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh
@@ -17,6 +17,7 @@ MODELS_DIR="$SCRIPT_DIR/integration/assets/models"
 TEST_EXECUTABLES=(
   "NumericalTests"
   "RunnerTests"
+  "SamplerTests"
   "LogTests"
   "FileUtilsTest"
   "ImageProcessingTest"
@@ -81,6 +82,8 @@ MODELS=(
   "lfm2_5_vl_quantized_xnnpack_v2.pte|https://huggingface.co/software-mansion/react-native-executorch-lfm2.5-VL-1.6B/resolve/main/quantized/lfm2_5_vl_1_6b_8da4w_xnnpack.pte"
   "lfm2_vl_tokenizer.json|https://huggingface.co/software-mansion/react-native-executorch-lfm2.5-VL-1.6B/resolve/main/tokenizer.json"
   "lfm2_vl_tokenizer_config.json|https://huggingface.co/software-mansion/react-native-executorch-lfm2.5-VL-1.6B/resolve/main/tokenizer_config.json"
+  "gemma4_e2b_mm_xnnpack.pte|https://huggingface.co/software-mansion/react-native-executorch-gemma-4-multimodal/resolve/v0.9.0/e2b/xnnpack/gemma_4_e2b_xnnpack_8da4w.pte"
+  "gemma_tokenizer.json|https://huggingface.co/software-mansion/react-native-executorch-gemma-4/resolve/v0.9.0/e2b/tokenizer.json"
   "yolo26n-seg.pte|https://huggingface.co/software-mansion/react-native-executorch-yolo26-seg/resolve/v0.8.0/yolo26n-seg/xnnpack/yolo26n-seg.pte"
   "segmentation_image.jpg|https://upload.wikimedia.org/wikipedia/commons/thumb/8/85/Collage_audi.jpg/1280px-Collage_audi.jpg"
   "yolo26n-pose.pte|https://huggingface.co/software-mansion/react-native-executorch-yolo26-pose/resolve/v0.9.0/xnnpack/yolo26_pose_n_xnnpack_fp32.pte"
@@ -207,7 +210,7 @@ models_for_test() {
   TokenizerModuleTests) echo "tokenizer.json" ;;
   SpeechToTextTests) echo "whisper_tiny_en_xnnpack.pte whisper_tokenizer.json fsmn-vad_xnnpack.pte" ;;
   TextToSpeechTests) echo "kokoro_duration_predictor.pte kokoro_synthesizer.pte kokoro_af_heart.bin kokoro_us_lexicon.json kokoro_en_tagger.json kokoro_us_phonemizer.pte" ;;
-  LLMTests) echo "smolLm2_135M_8da4w.pte smollm_tokenizer.json lfm2_5_vl_quantized_xnnpack_v2.pte lfm2_vl_tokenizer.json lfm2_vl_tokenizer_config.json test_image.jpg" ;;
+  LLMTests) echo "smolLm2_135M_8da4w.pte smollm_tokenizer.json lfm2_5_vl_quantized_xnnpack_v2.pte lfm2_vl_tokenizer.json lfm2_vl_tokenizer_config.json test_image.jpg gemma4_e2b_mm_xnnpack.pte gemma_tokenizer.json" ;;
   TextToImageTests) echo "t2i_tokenizer.json t2i_encoder.pte t2i_unet.pte t2i_decoder.pte" ;;
   InstanceSegmentationTests) echo "yolo26n-seg.pte segmentation_image.jpg" ;;
   PoseEstimationTests) echo "yolo26n-pose.pte" ;;
diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/unit/SamplerTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/unit/SamplerTest.cpp
index 4295f16232..bf7a1d02d6 100644
--- a/packages/react-native-executorch/common/rnexecutorch/tests/unit/SamplerTest.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/tests/unit/SamplerTest.cpp
@@ -7,6 +7,7 @@
  */
 
 #include <gtest/gtest.h>
+#include <runner/irunner.h>
 #include <runner/sampler.h>
 #include <vector>
 
@@ -26,27 +27,39 @@ std::vector<int> sampleMany(Sampler &s, std::vector<T> logits,
 
 // 1. Repetition penalty on positive logit: token 0 should be sampled less.
 TEST(SamplerTest, RepetitionPenaltyReducesPositiveLogit) {
-  Sampler s(2, 1.0f, 1.0f, 0, 0.0f, 1.3f);
+  Sampler s(2, {.temperature = 1.0f, .topp = 1.0f, .repetition_penalty = 1.3f});
   std::vector<float> logits = {1.0f, 1.0f};
   std::vector<uint64_t> recent = {0};
   auto counts = sampleMany(s, logits, recent, 2000);
   EXPECT_LT(counts[0], 1200);
 }
 
-// 2. Repetition penalty on negative logit: penalised token should appear even
-// less.
+// 2. Repetition penalty on negative logit: multiplying a negative logit by the
+// penalty makes it more negative, so the penalised token is sampled strictly
+// less often than without the penalty. Compare against an unpenalised baseline
+// rather than a fixed threshold: with penalty 1.5 the penalised logit is
+// -1.0 * 1.5 = -1.5, giving P(token 1) = e^-1.5 / (1 + e^-1.5) ≈ 0.18 (~365 of
+// 2000) versus the baseline e^-1 / (1 + e^-1) ≈ 0.27 (~538). A static "< 200"
+// bound would be mathematically unreachable at this penalty.
 TEST(SamplerTest, RepetitionPenaltyMultipliesNegativeLogit) {
-  Sampler s(2, 1.0f, 1.0f, 0, 0.0f, 1.5f);
-  std::vector<float> logits = {0.0f, -1.0f};
+  Sampler baseline(
+      2, {.temperature = 1.0f, .topp = 1.0f, .repetition_penalty = 1.0f});
+  Sampler penalised(
+      2, {.temperature = 1.0f, .topp = 1.0f, .repetition_penalty = 1.5f});
+  std::vector<float> logits_b = {0.0f, -1.0f};
+  std::vector<float> logits_p = {0.0f, -1.0f};
   std::vector<uint64_t> recent = {1};
-  auto counts = sampleMany(s, logits, recent, 2000);
-  EXPECT_LT(counts[1], 200);
+  auto cb = sampleMany(baseline, logits_b, recent, 2000);
+  auto cp = sampleMany(penalised, logits_p, recent, 2000);
+  EXPECT_LT(cp[1], cb[1]);
 }
 
 // 3. No recent tokens — penalty has no effect.
 TEST(SamplerTest, RepetitionPenaltyNoRecentTokensHasNoEffect) {
-  Sampler baseline(2, 1.0f, 1.0f, 0, 0.0f, 1.0f);
-  Sampler penalised(2, 1.0f, 1.0f, 0, 0.0f, 2.0f);
+  Sampler baseline(
+      2, {.temperature = 1.0f, .topp = 1.0f, .repetition_penalty = 1.0f});
+  Sampler penalised(
+      2, {.temperature = 1.0f, .topp = 1.0f, .repetition_penalty = 2.0f});
   std::vector<float> logits_b = {1.0f, 1.0f};
   std::vector<float> logits_p = {1.0f, 1.0f};
   std::vector<uint64_t> recent = {};
@@ -57,7 +70,7 @@ TEST(SamplerTest, RepetitionPenaltyNoRecentTokensHasNoEffect) {
 
 // 4. Min-p truncation: token with very low probability is excluded.
 TEST(SamplerTest, MinPFiltersTailTokens) {
-  Sampler s(3, 1.0f, 1.0f, 0, 0.1f, 1.0f);
+  Sampler s(3, {.temperature = 1.0f, .topp = 1.0f, .min_p = 0.1f});
   std::vector<float> logits = {5.0f, -5.0f, -5.0f};
   std::vector<uint64_t> recent = {};
   auto counts = sampleMany(s, logits, recent, 1000);
@@ -68,7 +81,7 @@ TEST(SamplerTest, MinPFiltersTailTokens) {
 
 // 5. Min-p = 0 disables filtering.
 TEST(SamplerTest, MinPZeroDisablesFiltering) {
-  Sampler s(3, 0.0f, 1.0f, 0, 0.0f, 1.0f);
+  Sampler s(3, {.temperature = 0.0f, .topp = 1.0f});
   std::vector<float> logits = {1.0f, -1000.0f, -1000.0f};
   std::vector<uint64_t> recent = {};
   EXPECT_EQ(s.sample(logits.data(), recent), 0);
@@ -76,7 +89,7 @@ TEST(SamplerTest, MinPZeroDisablesFiltering) {
 
 // 6. Min-p + top-p stacked.
 TEST(SamplerTest, MinPAndToppStack) {
-  Sampler s(4, 1.0f, 0.5f, 0, 0.2f, 1.0f);
+  Sampler s(4, {.temperature = 1.0f, .topp = 0.5f, .min_p = 0.2f});
   std::vector<float> logits = {5.0f, 2.0f, -2.0f, -5.0f};
   std::vector<uint64_t> recent = {};
   auto counts = sampleMany(s, logits, recent, 2000);
diff --git a/packages/react-native-executorch/common/runner/base_llm_runner.cpp b/packages/react-native-executorch/common/runner/base_llm_runner.cpp
index a021040807..7229d64f20 100644
--- a/packages/react-native-executorch/common/runner/base_llm_runner.cpp
+++ b/packages/react-native-executorch/common/runner/base_llm_runner.cpp
@@ -56,11 +56,16 @@ Error BaseLLMRunner::load() {
             ? static_cast<int32_t>(metadata_.at(kMaxContextLen))
             : static_cast<int32_t>(metadata_.at(kMaxSeqLen));
   }
-  if (config_.max_new_tokens < 0)
-    config_.max_new_tokens =
-        std::min(config_.max_seq_len, config_.max_context_length);
   config_.enable_dynamic_shape =
       static_cast<bool>(metadata_.at(kEnableDynamicShape));
+  if (config_.max_new_tokens < 0) {
+    // For dynamic-shape PTEs, max_seq_len is the per-call decoder chunk
+    // size, not the generation budget — use max_context_length instead.
+    const int32_t seq_cap = config_.enable_dynamic_shape
+                                ? config_.max_context_length
+                                : config_.max_seq_len;
+    config_.max_new_tokens = std::min(seq_cap, config_.max_context_length);
+  }
   config_.enable_kv_cache = static_cast<bool>(metadata_.at(kUseKVCache));
 
   eos_ids_ = std::make_unique<std::unordered_set<uint64_t>>();
@@ -149,6 +154,8 @@ void BaseLLMRunner::set_repetition_penalty(float repetition_penalty) noexcept {
   config_.repetition_penalty = repetition_penalty;
 }
 
+void BaseLLMRunner::set_topk(int32_t topk) noexcept { config_.topk = topk; }
+
 void BaseLLMRunner::set_count_interval(size_t count_interval) {
   config_.output_token_batch_size = count_interval;
 }
diff --git a/packages/react-native-executorch/common/runner/base_llm_runner.h b/packages/react-native-executorch/common/runner/base_llm_runner.h
index 9710f5ae70..82de49bea3 100644
--- a/packages/react-native-executorch/common/runner/base_llm_runner.h
+++ b/packages/react-native-executorch/common/runner/base_llm_runner.h
@@ -55,6 +55,7 @@ class BaseLLMRunner {
   void set_topp(float topp) noexcept;
   void set_min_p(float min_p) noexcept;
   void set_repetition_penalty(float repetition_penalty) noexcept;
+  void set_topk(int32_t topk) noexcept;
   void set_count_interval(size_t count_interval);
   void set_time_interval(size_t time_interval);
 
diff --git a/packages/react-native-executorch/common/runner/constants.h b/packages/react-native-executorch/common/runner/constants.h
index f1fee23471..368371688a 100644
--- a/packages/react-native-executorch/common/runner/constants.h
+++ b/packages/react-native-executorch/common/runner/constants.h
@@ -23,8 +23,22 @@ inline constexpr auto kVisionEncoderMethod = "vision_encoder";
 inline constexpr auto kAudioEncoderMethod = "audio_encoder";
 inline constexpr auto kTokenEmbeddingMethod = "token_embedding";
 inline constexpr auto kTextModelMethod = "text_decoder";
-
 inline constexpr auto numOfAddedBoSTokens = 0;
 inline constexpr auto numOfAddedEoSTokens = 0;
 
+// Gemma4
+// PLE models only: token id that marks image placeholder slots in input_ids.
+// token_embedding run on this id produces the per-layer PLE signal for image
+// positions; the inputs_embeds output for those positions is discarded (the
+// vision encoder output replaces it).
+inline constexpr auto kImagePlaceholderId = "image_placeholder_id";
+// True iff the model exposes a per-layer-embedding (PLE) signal alongside
+// inputs_embeds (Gemma4-style). When true, `token_embedding.execute()`
+// returns the tuple (inputs_embeds, ple_tok) and the runner must thread
+// ple_tok into text_decoder; when false (or absent), token_embedding returns
+// inputs_embeds alone. Text-only PTEs that ship a single `forward` method
+// omit this key entirely — it is meaningful only for multimodal PTEs that
+// expose a separate `token_embedding` method.
+inline constexpr auto kHasPLE = "has_ple";
+
 } // namespace executorch::extension::llm
diff --git a/packages/react-native-executorch/common/runner/encoders/audio_encoder.cpp b/packages/react-native-executorch/common/runner/encoders/audio_encoder.cpp
new file mode 100644
index 0000000000..36227dc966
--- /dev/null
+++ b/packages/react-native-executorch/common/runner/encoders/audio_encoder.cpp
@@ -0,0 +1,111 @@
+// common/runner/encoders/audio_encoder.cpp
+#include "audio_encoder.h"
+
+#include <rnexecutorch/Error.h>
+#include <runner/constants.h>
+
+#include <executorch/extension/tensor/tensor.h>
+
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <string>
+#include <vector>
+
+namespace executorch::extension::llm {
+
+using ::executorch::aten::SizesType;
+using ::executorch::runtime::Error;
+using ::executorch::runtime::EValue;
+using ::executorch::runtime::Result;
+
+namespace {
+constexpr int32_t kSamplingRate = 16000; // Hz; integer literal, not 16e3
+constexpr int32_t kMaxLengthSeconds = 30;
+constexpr int32_t kSamplesPerBlock = 7680;
+constexpr int64_t kAudioBlockKMin = 1;
+constexpr int64_t kAudioBlockKMax = // 16000 * 30 / 7680 truncates to 62
+    kSamplingRate * kMaxLengthSeconds / kSamplesPerBlock;
+} // namespace
+
+AudioEncoder::AudioEncoder(::executorch::extension::Module &module)
+    : module_(&module) {}
+
+Error AudioEncoder::load() {
+  if (is_loaded()) {
+    return Error::Ok;
+  }
+  auto method_names_result = module_->method_names();
+  if (!method_names_result.ok()) {
+    return method_names_result.error();
+  }
+  if (method_names_result->count(kAudioEncoderMethod) == 0) {
+    throw rnexecutorch::RnExecutorchError(
+        rnexecutorch::RnExecutorchErrorCode::InvalidConfig,
+        "Model does not support audio: 'audio_encoder' method not found. "
+        "Check that the .pte file matches the declared capabilities.");
+  }
+  return module_->load_method(kAudioEncoderMethod);
+}
+
+bool AudioEncoder::is_loaded() const noexcept {
+  return module_->is_method_loaded(kAudioEncoderMethod);
+}
+
+int32_t AudioEncoder::encoderTokenCount() const noexcept {
+  return last_token_count_;
+}
+
+Result<EValue> AudioEncoder::encode(const MultimodalInput &input) {
+  if (!is_loaded()) {
+    return Error::InvalidState;
+  }
+  if (!input.is_audio()) {
+    return Error::InvalidArgument;
+  }
+
+  const auto &wav = input.get_audio();
+  ET_CHECK_OR_RETURN_ERROR(!wav.samples.empty(), InvalidArgument,
+                           "AudioEncoder: empty waveform");
+
+  const int64_t n_valid = static_cast<int64_t>(wav.samples.size());
+  const int64_t k_blocks = (n_valid + kSamplesPerBlock - 1) / kSamplesPerBlock;
+  ET_CHECK_OR_RETURN_ERROR(
+      k_blocks >= kAudioBlockKMin && k_blocks <= kAudioBlockKMax,
+      InvalidArgument,
+      "AudioEncoder: waveform of %lld samples needs k_blocks=%lld.",
+      static_cast<long long>(n_valid), static_cast<long long>(k_blocks));
+  const int64_t n_padded = k_blocks * kSamplesPerBlock;
+
+  // padded_wav_ is a member, so the zero-padded copy stays alive while the
+  // from_blob tensor below borrows it without copying. The current export is
+  //   forward(waveform[1, 7680*k] fp32, num_blocks: int64 scalar)
+  // and input 1 must be rank-0: a 2-d mask trips "Attempted to change tensor
+  // rank: old=0, new=2". NOTE(review): the scalar stored below is n_valid (a
+  // sample count), not k_blocks -- confirm which one the PTE expects.
+  padded_wav_.assign(static_cast<size_t>(n_padded), 0.0f);
+  std::memcpy(padded_wav_.data(), wav.samples.data(),
+              static_cast<size_t>(n_valid) * sizeof(float));
+
+  valid_samples_scalar_ = n_valid;
+
+  auto wav_tensor = ::executorch::extension::from_blob(
+      padded_wav_.data(), {1, static_cast<SizesType>(n_padded)},
+      ::executorch::aten::ScalarType::Float);
+
+  auto num_blocks_tensor = ::executorch::extension::from_blob(
+      &valid_samples_scalar_, {}, ::executorch::aten::ScalarType::Long);
+
+  std::vector<EValue> args = {EValue(*wav_tensor), EValue(*num_blocks_tensor)};
+  auto exec_result = ET_UNWRAP(module_->execute(kAudioEncoderMethod, args));
+  ET_CHECK_OR_RETURN_ERROR(!exec_result.empty(), InvalidState,
+                           "audio_encoder returned no outputs");
+  auto audio_tensor = exec_result[0].toTensor();
+  ET_CHECK_OR_RETURN_ERROR(audio_tensor.dim() == 3, InvalidState,
+                           "audio_encoder output rank=%zd, expected 3",
+                           audio_tensor.dim());
+  last_token_count_ = static_cast<int32_t>(audio_tensor.size(1));
+  return exec_result[0];
+}
+
+} // namespace executorch::extension::llm
diff --git a/packages/react-native-executorch/common/runner/encoders/audio_encoder.h b/packages/react-native-executorch/common/runner/encoders/audio_encoder.h
new file mode 100644
index 0000000000..9723e4fbd7
--- /dev/null
+++ b/packages/react-native-executorch/common/runner/encoders/audio_encoder.h
@@ -0,0 +1,40 @@
+// common/runner/encoders/audio_encoder.h
+#pragma once
+
+#include "iencoder.h"
+#include <executorch/extension/module/module.h>
+#include <executorch/runtime/core/evalue.h>
+#include <runner/multimodal_input.h>
+
+#include <cstdint>
+#include <vector>
+
+namespace executorch::extension::llm {
+
+// Runs the Gemma4 `audio_encoder` PTE method.
+//
+// Contract mirrors SpeechToText (Whisper): JS hands in fp32 mono 16 kHz PCM
+// via `MultimodalInput::get_audio()`; the PTE owns the log-mel frontend.
+// encode() zero-pads to a whole number of 7680-sample blocks and executes
+// with a `[1, N_padded]` Float tensor plus an int64 scalar. Resampling and
+// WAV/MP3 decoding are the caller's responsibility (react-native-audio-api).
+class AudioEncoder : public IEncoder {
+public:
+  explicit AudioEncoder(::executorch::extension::Module &module);
+
+  ::executorch::runtime::Error load() override;
+  bool is_loaded() const noexcept override;
+  ::executorch::runtime::Result<::executorch::runtime::EValue>
+  encode(const MultimodalInput &input) override;
+  // Number of audio embedding tokens produced per encode() call. 0 until first
+  // encode, since Gemma4's audio_encoder has a dynamic T dim.
+  int32_t encoderTokenCount() const noexcept override;
+
+private:
+  ::executorch::extension::Module *module_;
+  int32_t last_token_count_ = 0;
+  std::vector<float> padded_wav_;
+  int64_t valid_samples_scalar_ = 0;
+};
+
+} // namespace executorch::extension::llm
diff --git a/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp b/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp
index de3e196c1f..59fee53e11 100644
--- a/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp
+++ b/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp
@@ -2,7 +2,6 @@
 #include "vision_encoder.h"
 
 #include <rnexecutorch/Error.h>
-#include <rnexecutorch/Log.h>
 #include <rnexecutorch/data_processing/ImageProcessing.h>
 #include <runner/constants.h>
 
diff --git a/packages/react-native-executorch/common/runner/irunner.h b/packages/react-native-executorch/common/runner/irunner.h
index 54b14c354f..4e5b14444a 100644
--- a/packages/react-native-executorch/common/runner/irunner.h
+++ b/packages/react-native-executorch/common/runner/irunner.h
@@ -73,6 +73,11 @@ struct GenerationConfig {
   size_t output_token_batch_size = 10;
   size_t batch_time_interval_ms = 120;
 
+  // Top-k sampling – keep only the k highest-logit tokens before softmax.
+  // 0 (default) disables top-k filtering. Stacks with topp: temperature ->
+  // top-k -> top-p -> softmax -> multinomial.
+  int32_t topk = 0;
+
   // Enable dynamic input shapes (if implemented) or not
   // Impacts the prefill phase and causes TextPrefiller to pass all the tokens
   // at once if set to true.
diff --git a/packages/react-native-executorch/common/runner/multimodal_decoder_runner.h b/packages/react-native-executorch/common/runner/multimodal_decoder_runner.h
index 071b193539..8d83c1fa64 100644
--- a/packages/react-native-executorch/common/runner/multimodal_decoder_runner.h
+++ b/packages/react-native-executorch/common/runner/multimodal_decoder_runner.h
@@ -14,19 +14,50 @@
 #include "text_decoder_runner.h"
 
 namespace executorch::extension::llm {
+// Supports two PTE contracts, selected per-call from the kHasPLE metadata
+// key (mirrors how kEnableDynamicShape etc. are read — queried on demand,
+// not cached in a member). Callers that need it multiple times in a hot
+// path should snapshot into a local.
+//
+//  * Legacy (has_ple == false):
+//      token_embedding(ids) -> inputs_embeds
+//      text_decoder(inputs_embeds, input_pos)
+//
+//  * Gemma-style PLE (has_ple == true):
+//      token_embedding(ids) -> (inputs_embeds, ple_tok)
+//      text_decoder(inputs_embeds, ple_tok, input_pos)
+//    ple_tok carries Gemma4's per-layer PLE signal keyed on input_ids. It's
+//    computed once in token_embedding and threaded through every decoder call
+//    so PLE fires at every position (including multimodal placeholder slots).
 class MultimodalDecoderRunner : public TextDecoderRunner {
 public:
   explicit MultimodalDecoderRunner(Module &module, IOManager *io_manager,
                                    const GenerationConfig &config)
       : TextDecoderRunner(module, io_manager, config) {}
 
+  bool has_ple() const {
+    auto r = module_->get(kHasPLE);
+    if (r.error() != ::executorch::runtime::Error::Ok) {
+      return false;
+    }
+    return r->toScalar().to<bool>();
+  }
+
   inline ::executorch::runtime::Result<::executorch::aten::Tensor>
   step(TensorPtr &tokens, int64_t start_pos) override {
     auto embed_result = module_->execute(kTokenEmbeddingMethod, tokens);
     if (!embed_result.ok()) {
       return embed_result.error();
     }
-    return decode((*embed_result)[0], start_pos);
+    auto &embed_outputs = *embed_result;
+    if (has_ple()) {
+      ET_CHECK_MSG(embed_outputs.size() == 2,
+                   "Expected 2 outputs (inputs_embeds, ple_tok) from "
+                   "token_embedding, got %zu",
+                   embed_outputs.size());
+      return decode(embed_outputs[0], embed_outputs[1], start_pos);
+    }
+    return decode(embed_outputs[0], start_pos);
   }
 
   inline ::executorch::runtime::Result<::executorch::aten::Tensor>
@@ -46,6 +77,24 @@ class MultimodalDecoderRunner : public TextDecoderRunner {
     return outputs[0].toTensor();
   }
 
+  inline ::executorch::runtime::Result<::executorch::aten::Tensor>
+  decode(const ::executorch::runtime::EValue &embeddings,
+         const ::executorch::runtime::EValue &ple_tok, int64_t start_pos) {
+    auto start_pos_tensor = ::executorch::extension::from_blob(
+        &start_pos, {1}, ::executorch::aten::ScalarType::Long);
+    auto outputs_result = module_->execute(
+        kTextModelMethod, {embeddings, ple_tok, start_pos_tensor});
+    if (!outputs_result.ok()) {
+      return outputs_result.error();
+    }
+    auto &outputs = *outputs_result;
+    ET_CHECK_MSG(outputs.size() == 1,
+                 "Expected 1 output from text_decoder, got %zu",
+                 outputs.size());
+    ET_CHECK_MSG(outputs[0].isTensor(), "text_decoder output is not a tensor");
+    return outputs[0].toTensor();
+  }
+
   inline ::executorch::runtime::Error load() override {
     if (is_method_loaded()) {
       return ::executorch::runtime::Error::Ok;
diff --git a/packages/react-native-executorch/common/runner/multimodal_input.h b/packages/react-native-executorch/common/runner/multimodal_input.h
index 6b7de35014..b49da0561f 100644
--- a/packages/react-native-executorch/common/runner/multimodal_input.h
+++ b/packages/react-native-executorch/common/runner/multimodal_input.h
@@ -20,6 +20,10 @@ struct ImagePath {
   std::string path;
 };
 
+struct AudioWaveform {
+  std::vector<float> samples;
+};
+
 class MultimodalInput {
 public:
   explicit MultimodalInput(std::string text) : data_(std::move(text)) {}
@@ -27,6 +31,7 @@ class MultimodalInput {
       : data_(std::move(tokens)) {}
   explicit MultimodalInput(ImagePath image_path)
       : data_(std::move(image_path)) {}
+  explicit MultimodalInput(AudioWaveform audio) : data_(std::move(audio)) {}
 
   MultimodalInput(const MultimodalInput &) = default;
   MultimodalInput &operator=(const MultimodalInput &) = default;
@@ -42,6 +47,9 @@ class MultimodalInput {
   bool is_image() const noexcept {
     return std::holds_alternative<ImagePath>(data_);
   }
+  bool is_audio() const noexcept {
+    return std::holds_alternative<AudioWaveform>(data_);
+  }
 
   const std::string &get_text() const & { return std::get<std::string>(data_); }
   const std::vector<uint64_t> &get_tokens() const & {
@@ -50,9 +58,13 @@ class MultimodalInput {
   const std::string &get_image_path() const & {
     return std::get<ImagePath>(data_).path;
   }
+  const AudioWaveform &get_audio() const & {
+    return std::get<AudioWaveform>(data_);
+  }
 
 private:
-  std::variant<std::string, std::vector<uint64_t>, ImagePath> data_;
+  std::variant<std::string, std::vector<uint64_t>, ImagePath, AudioWaveform>
+      data_;
 };
 
 inline MultimodalInput make_text_input(const std::string &text) noexcept {
@@ -64,5 +76,8 @@ inline MultimodalInput make_text_input(std::string &&text) noexcept {
 inline MultimodalInput make_image_input(std::string path) noexcept {
   return MultimodalInput(ImagePath{std::move(path)});
 }
+inline MultimodalInput make_audio_input(std::vector<float> samples) noexcept {
+  return MultimodalInput(AudioWaveform{std::move(samples)});
+}
 
 } // namespace executorch::extension::llm
diff --git a/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp b/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp
index 83a1a7f79c..8b04dc39bf 100644
--- a/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp
+++ b/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp
@@ -13,6 +13,9 @@
 #include "constants.h"
 #include "util.h"
 #include <algorithm>
+#include <cstring>
+#include <rnexecutorch/Log.h>
+#include <string>
 
 namespace executorch::extension::llm {
 
@@ -23,91 +26,390 @@ using ::executorch::runtime::Result;
 
 MultimodalPrefiller::MultimodalPrefiller(
     Module &module, MultimodalDecoderRunner &decoder_runner,
-    tokenizers::HFTokenizer &tokenizer, IEncoder *image_encoder)
+    tokenizers::HFTokenizer &tokenizer,
+    std::unordered_map<std::string, int64_t> metadata, IEncoder *image_encoder,
+    IEncoder *audio_encoder)
     : module_(&module), decoder_runner_(&decoder_runner),
-      tokenizer_(&tokenizer), image_encoder_(image_encoder) {}
+      tokenizer_(&tokenizer), metadata_(std::move(metadata)), // sink: no copy
+      image_encoder_(image_encoder), audio_encoder_(audio_encoder) {}
 
-Result<uint64_t> MultimodalPrefiller::prefill(const MultimodalInput &input,
-                                              int64_t &start_pos) {
-  EValue encoder_output;
-  std::vector<int64_t> padded_tokens_storage;
-  TensorPtr sliced_embed_storage;
+int64_t MultimodalPrefiller::get_max_seq_len() const {
+  // Use the value the module reports; otherwise read the runner metadata.
+  if (auto exported = module_->get(kMaxSeqLen); exported.ok()) {
+    return exported->toScalar().to<int64_t>();
+  }
+  return metadata_.at(kMaxSeqLen);
+}
+
+int64_t MultimodalPrefiller::get_max_context_len() const {
+  if (auto exported = module_->get(kMaxContextLen); exported.ok()) {
+    return exported->toScalar().to<int64_t>();
+  }
+  const int64_t ctx = metadata_.at(kMaxContextLen); // metadata fallback
+  return ctx != 0 ? ctx : get_max_seq_len(); // `a || b` would yield a 0/1 bool
+}
+
+bool MultimodalPrefiller::get_enable_dynamic_shape() const {
+  // The module's flag wins; metadata keeps it as int64 (nonzero means true).
+  if (auto exported = module_->get(kEnableDynamicShape); exported.ok()) {
+    return exported->toScalar().to<bool>();
+  }
+  return metadata_.at(kEnableDynamicShape) != 0;
+}
 
+[[nodiscard]] auto MultimodalPrefiller::processMultimodalInput(
+    const MultimodalInput &input, std::vector<int64_t> &ids,
+    std::vector<Types::ImageSlot> &image_slots,
+    std::vector<Types::AudioSlot> &audio_slots) {
   if (input.is_image()) {
     ET_CHECK_OR_RETURN_ERROR(image_encoder_ != nullptr, InvalidState,
                              "No image encoder registered");
-    auto encode_result = image_encoder_->encode(input);
-    ET_CHECK_OK_OR_RETURN_ERROR(encode_result.error(), "Image encoding failed");
-    encoder_output = *encode_result;
-
-  } else if (input.is_text() || input.is_tokens()) {
-    std::vector<uint64_t> tokens;
-    if (input.is_text()) {
-      auto encode_result = tokenizer_->encode(input.get_text());
-      if (!encode_result.ok()) {
-        ET_LOG(Error, "Tokenizer encode error %d",
-               static_cast<uint32_t>(encode_result.error()));
-        return Error::InvalidArgument;
-      }
-      tokens = std::move(*encode_result);
-    } else {
-      tokens = input.get_tokens();
+    const int32_t num_visual = image_encoder_->encoderTokenCount();
+    ET_CHECK_OR_RETURN_ERROR(num_visual > 0, InvalidState,
+                             "Image encoder reports 0 visual tokens");
+    image_slots.push_back(Types::ImageSlot{&input,
+                                           static_cast<int64_t>(ids.size()),
+                                           static_cast<int64_t>(num_visual)});
+    ids.insert(ids.end(), static_cast<size_t>(num_visual), 0); // placeholders
+  } else if (input.is_audio()) {
+    ET_CHECK_OR_RETURN_ERROR(audio_encoder_ != nullptr, InvalidState,
+                             "No audio encoder registered");
+    auto enc = audio_encoder_->encode(input);
+    ET_CHECK_OK_OR_RETURN_ERROR(enc.error(), "Audio encoding failed");
+    // Copy the encoder output into an owned byte buffer right away: prefill
+    // reads the slot only after further module_->execute() calls, which may
+    // invalidate the encoder's output tensor. num_audio and audio_hidden
+    // are read from the tensor directly rather than from
+    // encoderTokenCount(), so they always reflect THIS encode call.
+    auto audio_tensor = enc->toTensor();
+    ET_CHECK_OR_RETURN_ERROR(audio_tensor.dim() == 3, InvalidState,
+                             "audio_encoder output rank=%zd, expected 3",
+                             audio_tensor.dim());
+    const int64_t num_audio = static_cast<int64_t>(audio_tensor.size(1));
+    const int64_t audio_hidden = static_cast<int64_t>(audio_tensor.size(2));
+    ET_CHECK_OR_RETURN_ERROR(num_audio > 0, InvalidState,
+                             "Audio encoder produced 0 tokens");
+    std::vector<uint8_t> bytes(audio_tensor.nbytes());
+    std::memcpy(bytes.data(), audio_tensor.const_data_ptr(),
+                audio_tensor.nbytes());
+    audio_slots.push_back(Types::AudioSlot{
+        std::move(bytes), audio_tensor.scalar_type(),
+        static_cast<int64_t>(ids.size()), num_audio, audio_hidden});
+    ids.insert(ids.end(), static_cast<size_t>(num_audio), 0); // placeholders
+  } else if (input.is_text()) {
+    auto encode_result = tokenizer_->encode(input.get_text());
+    if (!encode_result.ok()) {
+      ET_LOG(Error, "Tokenizer encode error %d",
+             static_cast<uint32_t>(encode_result.error()));
+      return Error::InvalidArgument;
     }
+    std::vector<uint64_t> tokens = std::move(*encode_result);
+    for (auto t : tokens) {
+      ids.push_back(static_cast<int64_t>(t));
+    }
+  } else if (input.is_tokens()) {
+    const std::vector<uint64_t> &tokens = input.get_tokens(); // no copy
+    for (auto t : tokens) {
+      ids.push_back(static_cast<int64_t>(t));
+    }
+  } else {
+    ET_LOG(Error, "Unsupported MultimodalInput type");
+    return Error::NotSupported;
+  }
+  return ::executorch::runtime::Error::Ok;
+}
+
+[[nodiscard]] auto MultimodalPrefiller::encodeAudio(
+    const Types::AudioSlot &slot, const auto hidden,
+    std::vector<uint8_t> &embeds_buf, const size_t embeds_elem_size,
+    const ::executorch::aten::ScalarType &embeds_dtype) { // fills one slot
+  ET_CHECK_OR_RETURN_ERROR(
+      slot.audio_hidden == static_cast<int64_t>(hidden), InvalidState,
+      "audio encoder hidden %lld != text_embed hidden %lld",
+      static_cast<long long>(slot.audio_hidden),
+      static_cast<long long>(hidden));
 
-    const auto actual_seq_len = static_cast<SizesType>(tokens.size());
+  const auto audio_dtype = slot.dtype;
+  const size_t audio_elems =
+      static_cast<size_t>(slot.num_audio) * static_cast<size_t>(hidden);
+  const size_t audio_elem_size =
+      audio_elems > 0 ? slot.bytes.size() / audio_elems : 0; // bytes per elem
+  ET_CHECK_OR_RETURN_ERROR(
+      audio_elem_size > 0 && audio_elem_size * audio_elems == slot.bytes.size(),
+      InvalidState,
+      "audio slot bytes %zu inconsistent with num_audio=%lld hidden=%lld",
+      slot.bytes.size(), static_cast<long long>(slot.num_audio),
+      static_cast<long long>(hidden));
 
-    // The token_embedding PTE has a fixed MAX_SEQ_LEN input buffer.
-    // Pad with zeros, run embedding, then slice output back to actual length.
-    int64_t max_seq_len = actual_seq_len; // fallback: no padding needed
-    auto max_seq_len_result = module_->get(kMaxSeqLen);
-    if (max_seq_len_result.error() == Error::Ok) {
-      max_seq_len = max_seq_len_result->toScalar().to<int64_t>();
+  uint8_t *dst = embeds_buf.data() + static_cast<size_t>(slot.slot_start) *
+                                         static_cast<size_t>(hidden) *
+                                         embeds_elem_size; // slot byte offset
+
+  if (audio_dtype == embeds_dtype) { // same dtype: plain byte copy
+    std::memcpy(dst, slot.bytes.data(), audio_elems * embeds_elem_size);
+  } else if (audio_dtype == ::executorch::aten::ScalarType::Float &&
+             embeds_dtype == ::executorch::aten::ScalarType::Half) { // narrow
+    const float *src = reinterpret_cast<const float *>(slot.bytes.data());
+    auto *dst_h = reinterpret_cast<::executorch::aten::Half *>(dst);
+    for (size_t i = 0; i < audio_elems; ++i) {
+      dst_h[i] = ::executorch::aten::Half(src[i]);
+    }
+  } else if (audio_dtype == ::executorch::aten::ScalarType::Half &&
+             embeds_dtype == ::executorch::aten::ScalarType::Float) { // widen
+    const auto *src =
+        reinterpret_cast<const ::executorch::aten::Half *>(slot.bytes.data());
+    auto *dst_f = reinterpret_cast<float *>(dst);
+    for (size_t i = 0; i < audio_elems; ++i) {
+      dst_f[i] = static_cast<float>(src[i]);
     }
+  } else { // any other dtype pair is rejected
+    ET_CHECK_OR_RETURN_ERROR(
+        false, InvalidState,
+        "unsupported audio/text dtype pair: audio=%hhd text=%hhd",
+        static_cast<int8_t>(audio_dtype), static_cast<int8_t>(embeds_dtype));
+  }
+  return ::executorch::runtime::Error::Ok;
+}
 
-    padded_tokens_storage.assign(max_seq_len, 0);
-    std::ranges::copy(tokens, padded_tokens_storage.begin());
+[[nodiscard]] auto MultimodalPrefiller::encodeImages( // fills one image slot
+    const Types::ImageSlot &slot, const auto hidden,
+    std::vector<uint8_t> &embeds_buf, const size_t embeds_elem_size,
+    const ::executorch::aten::ScalarType &embeds_dtype) {
+  auto encode_result = image_encoder_->encode(*slot.input);
+  ET_CHECK_OK_OR_RETURN_ERROR(encode_result.error(), "Image encoding failed");
+  auto vision_tensor = encode_result->toTensor();
+  const auto vision_dtype = vision_tensor.scalar_type();
 
-    auto text_tensor = ::executorch::extension::from_blob(
-        padded_tokens_storage.data(), {1, static_cast<SizesType>(max_seq_len)},
-        ::executorch::aten::ScalarType::Long);
+  const size_t hidden_sz = static_cast<size_t>(hidden);
+  const size_t visual_elems = static_cast<size_t>(slot.num_visual) * hidden_sz;
+  ET_CHECK_OR_RETURN_ERROR(
+      static_cast<size_t>(vision_tensor.numel()) >= visual_elems, InvalidState,
+      "vision encoder output is smaller than its reserved slot"); // OOB guard
+  uint8_t *dst = embeds_buf.data() + static_cast<size_t>(slot.slot_start) *
+                                         hidden_sz * embeds_elem_size;
+  if (vision_dtype == embeds_dtype) { // same dtype: plain byte copy
+    std::memcpy(dst, vision_tensor.const_data_ptr(),
+                visual_elems * embeds_elem_size);
+  } else if (vision_dtype == ::executorch::aten::ScalarType::Float &&
+             embeds_dtype == ::executorch::aten::ScalarType::Half) {
+    const float *src = vision_tensor.const_data_ptr<float>();
+    auto *dst_h = reinterpret_cast<::executorch::aten::Half *>(dst);
+    for (size_t i = 0; i < visual_elems; ++i) {
+      dst_h[i] = ::executorch::aten::Half(src[i]);
+    }
+  } else if (vision_dtype == ::executorch::aten::ScalarType::Half &&
+             embeds_dtype == ::executorch::aten::ScalarType::Float) {
+    const auto *src = vision_tensor.const_data_ptr<::executorch::aten::Half>();
+    auto *dst_f = reinterpret_cast<float *>(dst);
+    for (size_t i = 0; i < visual_elems; ++i) {
+      dst_f[i] = static_cast<float>(src[i]);
+    }
+  } else {
+    ET_CHECK_OR_RETURN_ERROR(
+        false, InvalidState,
+        "unsupported vision/text dtype pair: vision=%hhd text=%hhd",
+        static_cast<int8_t>(vision_dtype), static_cast<int8_t>(embeds_dtype));
+  }
+  return ::executorch::runtime::Error::Ok;
+}
 
-    auto embed_result = module_->execute(kTokenEmbeddingMethod, text_tensor);
-    ET_CHECK_OK_OR_RETURN_ERROR(embed_result.error());
+[[nodiscard]] auto
+MultimodalPrefiller::initializePLE(auto &embed_outputs, auto total_len,
+                                   Types::PLEEmbeddings &ple_embeddings) {
+  auto full_ple_tok = embed_outputs[1].toTensor(); // second embedding output
+  ple_embeddings.num_layers = static_cast<SizesType>(full_ple_tok.size(2));
+  ple_embeddings.ple_dim = static_cast<SizesType>(full_ple_tok.size(3));
+  ple_embeddings.ple_tok_dtype = full_ple_tok.scalar_type();
+  const size_t total_numel = static_cast<size_t>(full_ple_tok.numel());
+  ET_CHECK_OR_RETURN_ERROR(total_numel > 0, InvalidState,
+                           "ple_tok has zero elements");
+  ple_embeddings.ple_elem_size = full_ple_tok.nbytes() / total_numel;
+  const size_t prefix_bytes = static_cast<size_t>(total_len) *
+                              static_cast<size_t>(ple_embeddings.num_layers) *
+                              static_cast<size_t>(ple_embeddings.ple_dim) *
+                              ple_embeddings.ple_elem_size;
+  ET_CHECK_OR_RETURN_ERROR(prefix_bytes <= full_ple_tok.nbytes(), InvalidState,
+                           "ple_tok is shorter than the prefill prefix");
+  const auto *src = static_cast<const uint8_t *>(full_ple_tok.const_data_ptr());
+  ple_embeddings.ple_tok_buf.assign(src, src + prefix_bytes); // owned copy
+  return ::executorch::runtime::Error::Ok;
+}
 
-    auto full_embed = (*embed_result)[0].toTensor();
-    const auto embed_dim = static_cast<SizesType>(full_embed.size(2));
-    sliced_embed_storage = ::executorch::extension::from_blob(
-        full_embed.mutable_data_ptr(), {1, actual_seq_len, embed_dim},
-        ::executorch::aten::ScalarType::Float);
-    encoder_output = EValue(*sliced_embed_storage);
+[[nodiscard]] auto MultimodalPrefiller::prefillChunk(
+    std::vector<EValue> &last_outs, std::vector<uint8_t> &embeds_buf,
+    auto chunk_start, auto chunk_len, auto hidden, auto embeds_elem_size,
+    auto embeds_dtype, Types::PLEEmbeddings &ple_embeddings,
+    std::vector<int64_t> &cache_positions) { // runs one decoder call
+  uint8_t *embeds_chunk_ptr =
+      embeds_buf.data() + static_cast<size_t>(chunk_start) *
+                              static_cast<size_t>(hidden) * embeds_elem_size;
+  auto embeds_chunk = ::executorch::extension::from_blob(
+      embeds_chunk_ptr, {1, static_cast<SizesType>(chunk_len), hidden},
+      embeds_dtype); // tensor over the chunk's bytes inside embeds_buf
 
-  } else {
-    ET_LOG(Error, "Unsupported MultimodalInput type");
-    return Error::NotSupported;
+  TensorPtr ple_chunk; // only set when the decoder uses PLE
+  if (decoder_runner_->has_ple()) {
+    uint8_t *ple_chunk_ptr =
+        ple_embeddings.ple_tok_buf.data() +
+        static_cast<size_t>(chunk_start) *
+            static_cast<size_t>(ple_embeddings.num_layers) *
+            static_cast<size_t>(ple_embeddings.ple_dim) *
+            ple_embeddings.ple_elem_size;
+    ple_chunk = ::executorch::extension::from_blob(
+        ple_chunk_ptr,
+        {1, static_cast<SizesType>(chunk_len), ple_embeddings.num_layers,
+         ple_embeddings.ple_dim},
+        ple_embeddings.ple_tok_dtype);
   }
 
-  // Run text_decoder for prefill.
-  int64_t seq_len = encoder_output.toTensor().size(1);
-  if (seq_len == 0) {
-    ET_LOG(Error, "Encoder returned empty output");
-    return Error::InvalidState;
+  auto pos_chunk = ::executorch::extension::from_blob(
+      cache_positions.data() + chunk_start, {static_cast<SizesType>(chunk_len)},
+      ::executorch::aten::ScalarType::Long);
+
+  auto res = decoder_runner_->has_ple() // PLE decoders take a third input
+                 ? module_->execute(kTextModelMethod,
+                                    {EValue(*embeds_chunk), EValue(*ple_chunk),
+                                     EValue(*pos_chunk)})
+                 : module_->execute(kTextModelMethod, {EValue(*embeds_chunk),
+                                                       EValue(*pos_chunk)});
+  ET_CHECK_OK_OR_RETURN_ERROR(res.error());
+  last_outs = std::move(*res); // overwritten on every chunk
+  return ::executorch::runtime::Error::Ok;
+}
+
+Result<uint64_t>
+MultimodalPrefiller::prefill(const std::vector<MultimodalInput> &inputs,
+                             int64_t &start_pos) {
+  const bool has_ple = decoder_runner_->has_ple();
+
+  ET_CHECK_OR_RETURN_ERROR(!inputs.empty(), InvalidArgument,
+                           "prefill: empty input list");
+
+  // ------------------------------------------------------------
+  //   * get_max_seq_len     — text_decoder S cap: max prefill chunk length
+  //     (<= get_max_context_len).
+  //   * get_max_context_len — total KV budget: caps the context length of a
+  //     multi-turn conversation.
+  // ------------------------------------------------------------
+  int64_t max_seq_len = get_max_seq_len();
+  int64_t max_context_len = get_max_context_len();
+  bool enable_dynamic_shape = get_enable_dynamic_shape();
+  const int64_t prefill_total_cap =
+      enable_dynamic_shape ? max_context_len : max_seq_len;
+  const int64_t decoder_chunk_size = max_seq_len;
+
+  std::vector<int64_t> ids;
+  ids.reserve(static_cast<size_t>(prefill_total_cap));
+  std::vector<Types::ImageSlot> image_slots;
+  std::vector<Types::AudioSlot> audio_slots;
+
+  for (const auto &input : inputs) { // Pass 1: build ids and record slots
+    auto res = processMultimodalInput(input, ids, image_slots, audio_slots);
+    if (res != ::executorch::runtime::Error::Ok) {
+      return res;
+    }
+  }
+
+  const int64_t total_len = static_cast<int64_t>(ids.size());
+  ET_CHECK_OR_RETURN_ERROR(total_len > 0, InvalidArgument,
+                           "prefill produced zero tokens");
+
+  ET_CHECK_OR_RETURN_ERROR(total_len <= prefill_total_cap, InvalidArgument,
+                           "Prefill length %lld exceeds %s (%lld)",
+                           static_cast<long long>(total_len),
+                           enable_dynamic_shape ? "get_max_context_len"
+                                                : "get_max_seq_len",
+                           static_cast<long long>(prefill_total_cap));
+  if (!enable_dynamic_shape) {
+    ids.resize(static_cast<size_t>(max_seq_len), 0); // zero-pad to max_seq_len
+  }
+
+  // ------------------------------------------------------------
+  // Single token_embedding call over the fused id buffer.
+  // ------------------------------------------------------------
+  const int64_t tok_buf_len = static_cast<int64_t>(ids.size());
+  auto token_tensor = ::executorch::extension::from_blob(
+      ids.data(), {1, static_cast<SizesType>(tok_buf_len)},
+      ::executorch::aten::ScalarType::Long);
+
+  auto embed_result = module_->execute(kTokenEmbeddingMethod, token_tensor);
+  ET_CHECK_OK_OR_RETURN_ERROR(embed_result.error());
+  auto &embed_outputs = *embed_result;
+
+  auto full_embed = embed_outputs[0].toTensor();
+  const auto hidden = static_cast<SizesType>(full_embed.size(2));
+
+  // Own the embeds for the live prefix (the first total_len positions):
+  // later vision_encoder.execute calls may reuse this output buffer.
+  const ::executorch::aten::ScalarType embeds_dtype = full_embed.scalar_type();
+  const size_t embeds_total_numel = static_cast<size_t>(full_embed.numel());
+  ET_CHECK_OR_RETURN_ERROR(embeds_total_numel > 0, InvalidState,
+                           "token_embedding returned zero elements");
+  const size_t embeds_elem_size = full_embed.nbytes() / embeds_total_numel;
+  const size_t embeds_prefix_bytes = static_cast<size_t>(total_len) *
+                                     static_cast<size_t>(hidden) *
+                                     embeds_elem_size;
+  std::vector<uint8_t> embeds_buf(embeds_prefix_bytes);
+  std::memcpy(embeds_buf.data(), full_embed.mutable_data_ptr(),
+              embeds_prefix_bytes);
+
+  // ------------------------------------------------------------
+  // Pass 2: encode images and splice their outputs into embeds_buf.
+  // ------------------------------------------------------------
+  for (const auto &slot : image_slots) {
+    auto res =
+        encodeImages(slot, hidden, embeds_buf, embeds_elem_size, embeds_dtype);
+    if (res != ::executorch::runtime::Error::Ok) {
+      return res;
+    }
   }
 
-  std::vector<int64_t> cache_positions;
-  auto cache_pos_result = populate_start_pos_or_cache_position(
-      module_, start_pos, cache_positions, seq_len, kTextModelMethod);
-  ET_CHECK_OK_OR_RETURN_ERROR(cache_pos_result.error());
+  // ------------------------------------------------------------
+  // Pass 2b: splice encoded audio tokens into embeds_buf. Reads from the
+  // byte snapshot taken at encode time so post-encode execute() calls can't
+  // invalidate slot state. Same dtype-conversion matrix as vision.
+  // ------------------------------------------------------------
+  for (auto &slot : audio_slots) {
+    auto res =
+        encodeAudio(slot, hidden, embeds_buf, embeds_elem_size, embeds_dtype);
+    if (res != ::executorch::runtime::Error::Ok) {
+      return res;
+    }
+  }
 
-  auto prefill_result =
-      module_->execute(kTextModelMethod, {encoder_output, *cache_pos_result});
-  ET_CHECK_OK_OR_RETURN_ERROR(prefill_result.error());
+  Types::PLEEmbeddings ple_embeddings;
+  if (has_ple) {
+    auto res = initializePLE(embed_outputs, total_len, ple_embeddings);
+    if (res != ::executorch::runtime::Error::Ok) {
+      return res;
+    }
+  }
 
-  auto &prefill_outputs = *prefill_result;
-  ET_CHECK_OR_RETURN_ERROR(!prefill_outputs.empty(), InvalidState,
+  std::vector<EValue> last_outs;
+  const int64_t chunk_cap =
+      decoder_chunk_size > 0 ? decoder_chunk_size : total_len;
+  std::vector<int64_t> cache_positions(static_cast<size_t>(total_len));
+  for (int64_t i = 0; i < total_len; ++i) {
+    cache_positions[static_cast<size_t>(i)] = start_pos + i; // absolute pos
+  }
+  const int64_t num_chunks = (total_len + chunk_cap - 1) / chunk_cap; // ceil
+  for (int64_t ci = 0; ci < num_chunks; ++ci) {
+    const int64_t chunk_start = ci * chunk_cap;
+    const int64_t chunk_end = std::min(chunk_start + chunk_cap, total_len);
+    const int64_t chunk_len = chunk_end - chunk_start;
+    auto res = prefillChunk(last_outs, embeds_buf, chunk_start, chunk_len,
+                            hidden, embeds_elem_size, embeds_dtype,
+                            ple_embeddings, cache_positions);
+    if (res != ::executorch::runtime::Error::Ok) {
+      return res;
+    }
+  }
+
+  ET_CHECK_OR_RETURN_ERROR(!last_outs.empty(), InvalidState,
                            "text_decoder returned no outputs during prefill");
 
-  auto logits = prefill_outputs[0].toTensor();
-  start_pos += seq_len;
+  auto logits = last_outs[0].toTensor(); // outputs of the final chunk
+  start_pos += total_len; // advance past everything prefilled
 
   return static_cast<uint64_t>(decoder_runner_->logits_to_token(logits));
 }
@@ -127,6 +429,9 @@ Error MultimodalPrefiller::load() {
   if (methods.find(kVisionEncoderMethod) != methods.end()) {
     ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kVisionEncoderMethod));
   }
+  if (methods.find(kAudioEncoderMethod) != methods.end()) {
+    ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kAudioEncoderMethod));
+  }
   return Error::Ok;
 }
 
@@ -140,8 +445,13 @@ bool MultimodalPrefiller::is_method_loaded() {
     return false;
   }
   const auto &methods = *methods_res;
-  if (methods.find(kVisionEncoderMethod) != methods.end()) {
-    return module_->is_method_loaded(kVisionEncoderMethod);
+  if (methods.find(kVisionEncoderMethod) != methods.end() &&
+      !module_->is_method_loaded(kVisionEncoderMethod)) {
+    return false;
+  }
+  if (methods.find(kAudioEncoderMethod) != methods.end() &&
+      !module_->is_method_loaded(kAudioEncoderMethod)) {
+    return false;
   }
   return true;
 }
diff --git a/packages/react-native-executorch/common/runner/multimodal_prefiller.h b/packages/react-native-executorch/common/runner/multimodal_prefiller.h
index d9b5a9bf5c..05037d88c8 100644
--- a/packages/react-native-executorch/common/runner/multimodal_prefiller.h
+++ b/packages/react-native-executorch/common/runner/multimodal_prefiller.h
@@ -18,26 +18,77 @@
 
 namespace executorch::extension::llm {
 
+namespace Types {
+struct ImageSlot {
+  const MultimodalInput *input; // non-owning, valid for duration of call
+  int64_t slot_start; // index of the slot's first placeholder id
+  int64_t num_visual; // number of placeholder ids reserved for the image
+};
+
+struct AudioSlot {
+  std::vector<uint8_t> bytes; // owned copy of the audio encoder output data
+  ::executorch::aten::ScalarType dtype; // element type stored in `bytes`
+  int64_t slot_start; // index of the slot's first placeholder id
+  int64_t num_audio; // size(1) of the encoder output
+  int64_t audio_hidden; // size(2) of the encoder output
+};
+
+struct PLEEmbeddings {
+  std::vector<uint8_t> ple_tok_buf; // owned copy of the ple_tok prefix
+  aten::SizesType num_layers = 0; // ple_tok size(2)
+  aten::SizesType ple_dim = 0; // ple_tok size(3)
+  size_t ple_elem_size = 0; // bytes per ple_tok element
+  ::executorch::aten::ScalarType ple_tok_dtype =
+      ::executorch::aten::ScalarType::Half;
+};
+} // namespace Types
+
 class MultimodalPrefiller {
 public:
-  explicit MultimodalPrefiller(Module &module,
-                               MultimodalDecoderRunner &decoder_runner,
-                               tokenizers::HFTokenizer &tokenizer,
-                               IEncoder *image_encoder = nullptr);
+  explicit MultimodalPrefiller(
+      Module &module, MultimodalDecoderRunner &decoder_runner,
+      tokenizers::HFTokenizer &tokenizer,
+      std::unordered_map<std::string, int64_t> metadata, // fallback limits
+      IEncoder *image_encoder = nullptr, IEncoder *audio_encoder = nullptr);
 
   // Prefill one input segment. Updates start_pos in-place.
   // Returns the first predicted token after this segment.
-  ::executorch::runtime::Result<uint64_t> prefill(const MultimodalInput &input,
-                                                  int64_t &start_pos);
+  ::executorch::runtime::Result<uint64_t> // takes every segment in one call
+  prefill(const std::vector<MultimodalInput> &inputs, int64_t &start_pos);
 
+  auto processMultimodalInput(const MultimodalInput &input,
+                              std::vector<int64_t> &ids, // appended to
+                              std::vector<Types::ImageSlot> &image_slots,
+                              std::vector<Types::AudioSlot> &audio_slots);
   ::executorch::runtime::Error load();
   bool is_method_loaded();
+  int64_t get_max_seq_len() const; // module value, else metadata
+  int64_t get_max_context_len() const; // module value, else metadata
+  bool get_enable_dynamic_shape() const; // module value, else metadata
 
 private:
+  auto encodeImages(const Types::ImageSlot &slot, const auto hidden,
+                    std::vector<uint8_t> &embeds_buf, // written in place
+                    const size_t embeds_elem_size,
+                    const ::executorch::aten::ScalarType &embeds_dtype);
+  auto encodeAudio(const Types::AudioSlot &slot, const auto hidden,
+                   std::vector<uint8_t> &embeds_buf, // written in place
+                   const size_t embeds_elem_size,
+                   const ::executorch::aten::ScalarType &embeds_dtype);
+  auto prefillChunk(std::vector<::executorch::runtime::EValue> &last_outs,
+                    std::vector<uint8_t> &embeds_buf, auto chunk_start,
+                    auto chunk_len, auto hidden, auto embeds_elem_size,
+                    auto embeds_dtype, Types::PLEEmbeddings &ple_embeddings,
+                    std::vector<int64_t> &cache_positions);
+  auto initializePLE(auto &embed_outputs, auto total_len,
+                     Types::PLEEmbeddings &ple_embeddings); // output param
+
   Module *module_;
   MultimodalDecoderRunner *decoder_runner_;
   tokenizers::HFTokenizer *tokenizer_;
+  std::unordered_map<std::string, int64_t> metadata_; // read by the getters
   IEncoder *image_encoder_;
+  IEncoder *audio_encoder_; // may be null; checked before audio is encoded
 };
 
 } // namespace executorch::extension::llm
diff --git a/packages/react-native-executorch/common/runner/multimodal_runner.cpp b/packages/react-native-executorch/common/runner/multimodal_runner.cpp
index 767fef9f38..084a7ef191 100644
--- a/packages/react-native-executorch/common/runner/multimodal_runner.cpp
+++ b/packages/react-native-executorch/common/runner/multimodal_runner.cpp
@@ -3,7 +3,6 @@
 #include "constants.h"
 #include "util.h"
 #include <rnexecutorch/Error.h>
-#include <rnexecutorch/Log.h>
 
 namespace executorch::extension::llm {
 
@@ -54,8 +53,14 @@ Error MultimodalRunner::load_subcomponents() {
   if (enc_it != encoders_.end()) {
     image_encoder = enc_it->second.get();
   }
+  IEncoder *audio_encoder = nullptr;
+  auto aud_it = encoders_.find(MultimodalType::Audio);
+  if (aud_it != encoders_.end()) {
+    audio_encoder = aud_it->second.get();
+  }
   mm_prefiller_ = std::make_unique<MultimodalPrefiller>(
-      *module_, *mm_decoder_runner_, *tokenizer_, image_encoder);
+      *module_, *mm_decoder_runner_, *tokenizer_, metadata_, image_encoder,
+      audio_encoder);
   mm_token_generator_ = std::make_unique<TextTokenGenerator>(
       tokenizer_.get(), mm_decoder_runner_.get(), /*use_kv_cache=*/true,
       std::move(eos_ids_), stats_ptr, config_);
@@ -78,22 +83,24 @@ Error MultimodalRunner::generate_internal(
   }
 
   stats_.inference_start_ms = time_in_ms();
-
-  uint64_t prefill_next_token = 0;
-  for (const auto &input : inputs) {
-    auto prefill_result = mm_prefiller_->prefill(input, pos_);
-    if (!prefill_result.ok())
-      return prefill_result.error();
-    prefill_next_token = prefill_result.get();
-  }
+  auto prefill_result = mm_prefiller_->prefill(inputs, pos_);
+  if (!prefill_result.ok())
+    return prefill_result.error();
+  uint64_t prefill_next_token = prefill_result.get();
 
   stats_.first_token_ms = time_in_ms();
   stats_.prompt_eval_end_ms = time_in_ms();
   stats_.num_prompt_tokens = pos_;
 
+  // For dynamic-shape PTEs (Gemma4 iter*), get_max_seq_len is the per-call
+  // decoder chunk size (e.g. 128) and the true generation budget lives in
+  // get_max_context_len. Mirrors text_runner.cpp:95-97.
+  const int32_t seq_cap = config_.enable_dynamic_shape
+                              ? config_.max_context_length
+                              : config_.max_seq_len;
   int32_t resolved_max_new = resolve_max_new_tokens(
-      static_cast<int32_t>(pos_), config_.max_seq_len,
-      config_.max_context_length, config_.max_new_tokens);
+      static_cast<int32_t>(pos_), seq_cap, config_.max_context_length,
+      config_.max_new_tokens);
 
   std::vector<uint64_t> seed_tokens = {prefill_next_token};
   auto wrapped_callback = [&](const std::string &piece) {
diff --git a/packages/react-native-executorch/common/runner/multimodal_runner.h b/packages/react-native-executorch/common/runner/multimodal_runner.h
index d24e0b40c2..c6180c54f0 100644
--- a/packages/react-native-executorch/common/runner/multimodal_runner.h
+++ b/packages/react-native-executorch/common/runner/multimodal_runner.h
@@ -10,7 +10,7 @@
 
 namespace executorch::extension::llm {
 
-enum class MultimodalType { Image };
+enum class MultimodalType { Image, Audio };
 
 class MultimodalRunner : public BaseLLMRunner {
 public:
diff --git a/packages/react-native-executorch/common/runner/sampler.cpp b/packages/react-native-executorch/common/runner/sampler.cpp
index 26c75d4dd5..250d6a83ef 100644
--- a/packages/react-native-executorch/common/runner/sampler.cpp
+++ b/packages/react-native-executorch/common/runner/sampler.cpp
@@ -35,6 +35,7 @@
 #include "sampler.h"
 #include <algorithm>
 #include <ctime>
+#include <limits>
 #include <vector>
 
 namespace executorch {
@@ -46,7 +47,7 @@ template <typename T> int32_t Sampler::sample_argmax(T *probabilities) {
   // return the index that has the highest probability
   int max_i = 0;
   T max_p = probabilities[0];
-  for (int i = 1; i < vocab_size_; i++) {
+  for (size_t i = 1; i < vocab_size_; i++) {
     if (probabilities[i] > max_p) {
       max_i = i;
       max_p = probabilities[i];
@@ -60,7 +61,7 @@ int32_t Sampler::sample_mult(T *probabilities, float coin) {
   // sample index from probabilities (they must sum to 1!)
   // coin is a random number in [0, 1), usually from random_f32()
   T cdf = 0.0;
-  for (int i = 0; i < vocab_size_; i++) {
+  for (size_t i = 0; i < vocab_size_; i++) {
     cdf += probabilities[i];
     if (coin < cdf) {
       return i;
@@ -84,7 +85,7 @@ int32_t Sampler::sample_topp(T *probabilities, float coin) {
       std::make_unique<ProbIndex<T>[]>(vocab_size_);
 
   const float cutoff = (1.0f - topp_) / (n - 1);
-  for (int i = 0; i < n; i++) {
+  for (size_t i = 0; i < n; i++) {
     if (probabilities[i] >= cutoff) {
       probindex[n0].index = i;
       probindex[n0].prob = probabilities[i];
@@ -92,61 +93,138 @@ int32_t Sampler::sample_topp(T *probabilities, float coin) {
     }
   }
 
-  auto compare = [](const ProbIndex<T> &a, const ProbIndex<T> &b) {
-    return a.prob > b.prob;
-  };
-  std::sort(probindex.get(), probindex.get() + n0, compare);
+  std::sort(probindex.get(), probindex.get() + n0,
+            [](const ProbIndex<T> &a, const ProbIndex<T> &b) {
+              return a.prob > b.prob;
+            });
 
   // truncate the list where cumulative probability exceeds topp
   T cumulative_prob = 0;
-  int last_idx = n0 - 1; // in case of rounding errors consider all elements
-  for (int i = 0; i < n0; i++) {
+  int last_idx = n0 - 1; // fallback: keep every candidate (rounding errors)
+  for (int i = 0; i < n0; i++) { // signed index, matching int last_idx
     cumulative_prob += probindex[i].prob;
-    if (cumulative_prob > topp_) {
+    if (static_cast<float>(cumulative_prob) > topp_) {
       last_idx = i;
-      break; // we've exceeded topp by including last_idx
+      break; // including last_idx pushed the mass past topp
     }
   }
 
   // sample from the truncated list
-  const T &r = coin * cumulative_prob;
+  float r = coin * static_cast<float>(cumulative_prob);
   T cdf = 0;
-  for (int i = 0; i <= last_idx; i++) {
+  for (int i = 0; i <= last_idx; i++) { // size_t would wrap last_idx == -1
     cdf += probindex[i].prob;
-    if (r < cdf) {
+    if (r < static_cast<float>(cdf)) {
       return probindex[i].index;
     }
   }
-  return probindex[last_idx].index; // in case of rounding errors
+  return probindex[last_idx].index; // rounding left r >= cdf: take the last
 }
 
-Sampler::Sampler(int32_t vocab_size, float temperature, float topp,
-                 unsigned long long rng_seed, float min_p,
-                 float repetition_penalty)
+// Mask logits ranked outside the top-k to the lowest finite T (not a literal
+// -inf). Ties at the k-th boundary are kept (as HuggingFace TopKLogitsWarper).
+template <typename T> void Sampler::mask_topk(T *logits) {
+  if (topk_ <= 0 || topk_ >= vocab_size_) {
+    return;
+  }
+  // Select the topk_-th largest logit as the threshold with nth_element on a
+  // copy (O(n) average). A lambda avoids depending on <functional>.
+  std::vector<T> scratch(logits, logits + vocab_size_);
+  std::nth_element(scratch.begin(), scratch.begin() + (topk_ - 1),
+                   scratch.end(), [](T a, T b) { return a > b; });
+  const T threshold = scratch[topk_ - 1];
+  constexpr T masked = std::numeric_limits<T>::lowest();
+  for (int32_t i = 0; i < vocab_size_; i++) {
+    if (logits[i] < threshold) {
+      logits[i] = masked;
+    }
+  }
+}
+
+// Mask logits outside the top-p nucleus (by softmax probability) to the lowest
+// finite T. Keeps the token that crosses the threshold (HuggingFace convention).
+template <typename T> void Sampler::mask_topp(T *logits) {
+  if (topp_ <= 0.0f || topp_ >= 1.0f) {
+    return;
+  }
+  // Softmax into a scratch probindex[] (logits stay untouched until the end).
+  T max_val = logits[0];
+  for (int32_t i = 1; i < vocab_size_; i++) {
+    if (logits[i] > max_val) {
+      max_val = logits[i];
+    }
+  }
+  std::unique_ptr<ProbIndex<T>[]> probindex =
+      std::make_unique<ProbIndex<T>[]>(vocab_size_);
+  T sum = 0;
+  for (int32_t i = 0; i < vocab_size_; i++) {
+    T e = static_cast<T>(std::exp(static_cast<float>(logits[i] - max_val)));
+    probindex[i].prob = e;
+    probindex[i].index = i;
+    sum += e;
+  }
+  if (sum <= T(0)) { // degenerate distribution: leave logits unmasked
+    return;
+  }
+  for (int32_t i = 0; i < vocab_size_; i++) {
+    probindex[i].prob /= sum;
+  }
+  std::sort(probindex.get(), probindex.get() + vocab_size_,
+            [](const ProbIndex<T> &a, const ProbIndex<T> &b) {
+              return a.prob > b.prob;
+            });
+
+  // Find the smallest prefix whose cumulative probability >= topp_.
+  T cumulative = 0;
+  int32_t last_idx = vocab_size_ - 1; // fallback: keep everything
+  for (int32_t i = 0; i < vocab_size_; i++) {
+    cumulative += probindex[i].prob;
+    if (static_cast<float>(cumulative) >= topp_) {
+      last_idx = i;
+      break;
+    }
+  }
+  // Mark kept indices, then mask the rest.
+  std::vector<bool> keep(vocab_size_, false);
+  for (int32_t i = 0; i <= last_idx; i++) {
+    keep[probindex[i].index] = true;
+  }
+  constexpr T masked = std::numeric_limits<T>::lowest();
+  for (int32_t i = 0; i < vocab_size_; i++) {
+    if (!keep[i]) {
+      logits[i] = masked;
+    }
+  }
+}
+
+Sampler::Sampler(int32_t vocab_size, GenerationConfig config,
+                 unsigned long long rng_seed)
     : vocab_size_(vocab_size),
-      inv_temperature_((temperature != 0.0f) ? (1.0f / temperature) : 0.0f),
-      topp_(topp), min_p_(min_p), repetition_penalty_(repetition_penalty),
+      inv_temperature_(
+          (config.temperature != 0.0f) ? (1.0f / config.temperature) : 0.0f),
+      topp_(config.topp), min_p_(config.min_p),
+      repetition_penalty_(config.repetition_penalty), topk_(config.topk),
       rng_state_(rng_seed) {}
 
-Sampler::Sampler(int vocab_size, float temperature, float topp)
-    : Sampler(vocab_size, temperature, topp, std::time(nullptr), 0.0f, 1.0f) {}
+Sampler::Sampler(int32_t vocab_size, GenerationConfig config)
+    : Sampler(vocab_size, config, std::time(nullptr)) {}
 
 template <typename T> static void softmax(T *x, int size) {
   // find max value (for numerical stability)
   T max_val = x[0];
-  for (int i = 1; i < size; i++) {
+  for (size_t i = 1; i < size; i++) {
     if (x[i] > max_val) {
       max_val = x[i];
     }
   }
   // exp and sum
   T sum = 0;
-  for (int i = 0; i < size; i++) {
+  for (size_t i = 0; i < size; i++) {
     x[i] = expf(x[i] - max_val);
     sum += x[i];
   }
   // normalize
-  for (int i = 0; i < size; i++) {
+  for (size_t i = 0; i < size; i++) {
     x[i] /= sum;
   }
 }
@@ -175,20 +253,18 @@ int32_t Sampler::sample(T *logits, const std::vector<uint64_t> &recent_tokens) {
     apply_repetition_penalty(logits, vocab_size_, recent_tokens);
     // 2. apply the temperature to the logits
     apply_temperature(logits, vocab_size_);
-    // 3. apply softmax to the logits to get the probabilities for next token
+    // 3. mask out logits outside top-k by rank (pre-softmax, becomes 0 mass)
+    mask_topk(logits);
+    // 4. mask out logits outside the top-p nucleus (pre-softmax)
+    mask_topp(logits);
+    // 5. apply softmax to the logits to get the probabilities for next token
     softmax(logits, vocab_size_);
-    // 4. apply min_p truncation
+    // 6. apply min_p truncation
     apply_min_p(logits, vocab_size_);
     // flip a (float) coin (this is our source of entropy for sampling)
     float coin = random_f32(&rng_state_);
-    // 5. we sample from this distribution to get the next token
-    if (topp_ <= 0 || topp_ >= 1) {
-      // simply sample from the predicted probability distribution
-      next = sample_mult(logits, coin);
-    } else {
-      // top-p (nucleus) sampling, clamping the least likely tokens to zero
-      next = sample_topp(logits, coin);
-    }
+    // 7. we sample from this distribution to get the next token
+    next = sample_mult(logits, coin);
   }
   return next;
 }
diff --git a/packages/react-native-executorch/common/runner/sampler.h b/packages/react-native-executorch/common/runner/sampler.h
index 16811297ef..6af1a5a487 100644
--- a/packages/react-native-executorch/common/runner/sampler.h
+++ b/packages/react-native-executorch/common/runner/sampler.h
@@ -8,6 +8,7 @@
 
 #pragma once
 
+#include "runner/irunner.h"
 #include <algorithm>
 #include <cctype>
 #include <cmath>
@@ -28,6 +29,7 @@ namespace executorch {
 namespace extension {
 namespace llm {
 // A simple llama2 sampler.
+struct GenerationConfig;
 
 inline constexpr auto kTopp = 0.9f;
 
@@ -38,11 +40,13 @@ template <typename T> struct ProbIndex {
 
 class Sampler {
 public:
-  Sampler(int32_t vocab_size, float temperature, float topp,
-          unsigned long long rng_seed, float min_p = 0.0f,
-          float repetition_penalty = 1.0f);
-
-  Sampler(int32_t vocab_size, float temperature, float topp);
+  // topk <= 0 disables top-k filtering. topp <= 0 || topp >= 1 disables top-p.
+  // Pipeline when temperature != 0: repetition penalty -> temperature -> top-k
+  // mask -> top-p mask -> softmax -> min-p -> multinomial. Note: topk == 1
+  // collapses to greedy; pass topk = 0 to keep full-vocab temperature sampling.
+  Sampler(int32_t vocab_size, GenerationConfig config,
+          unsigned long long rng_seed);
+  Sampler(int32_t vocab_size, GenerationConfig config);
 
   template <typename T> int32_t sample(T *logits);
 
@@ -53,6 +57,9 @@ class Sampler {
   template <typename T> int32_t sample_topp(T *probabilities, float coin);
   template <typename T> int32_t sample_mult(T *probabilities, float coin);
   template <typename T> int32_t sample_argmax(T *probabilities);
+  // In-place logit warpers: set excluded indices to -inf.
+  template <typename T> void mask_topk(T *logits);
+  template <typename T> void mask_topp(T *logits);
 
   template <typename T>
   inline void apply_temperature(T *logits, int32_t vocab_size) {
@@ -110,6 +117,7 @@ class Sampler {
   float topp_;
   float min_p_;
   float repetition_penalty_;
+  int32_t topk_;
   unsigned long long rng_state_;
 };
 
diff --git a/packages/react-native-executorch/common/runner/text_decoder_runner.cpp b/packages/react-native-executorch/common/runner/text_decoder_runner.cpp
index e67d3e41fb..77770e3418 100644
--- a/packages/react-native-executorch/common/runner/text_decoder_runner.cpp
+++ b/packages/react-native-executorch/common/runner/text_decoder_runner.cpp
@@ -31,7 +31,6 @@ TextDecoderRunner::TextDecoderRunner(Module &module, IOManager *io_manager,
 // outer loop (call site) is responsible for managing state.
 ::executorch::runtime::Result<executorch::aten::Tensor>
 TextDecoderRunner::step(TensorPtr &tokens, int64_t start_pos) {
-  // ET_LOG(Info, "Input token %" PRIu64, input_token);
   auto method_meta_result = module_->method_meta("forward");
   if (!method_meta_result.ok()) {
     return method_meta_result.error();
@@ -102,9 +101,7 @@ int32_t TextDecoderRunner::logits_to_token(
           auto num_tokens = logits_tensor.size(1);
           logits += (num_tokens - 1) * vocab_size;
         }
-        Sampler sampler(vocab_size, config_.temperature, config_.topp,
-                        static_cast<unsigned long long>(std::time(nullptr)),
-                        config_.min_p, config_.repetition_penalty);
+        Sampler sampler(vocab_size, config_);
         result = sampler.sample(logits, recent_tokens);
       });
   return result;
diff --git a/packages/react-native-executorch/common/runner/text_decoder_runner.h b/packages/react-native-executorch/common/runner/text_decoder_runner.h
index bffc254bd6..d3aa229cd0 100644
--- a/packages/react-native-executorch/common/runner/text_decoder_runner.h
+++ b/packages/react-native-executorch/common/runner/text_decoder_runner.h
@@ -10,6 +10,7 @@
 
 #pragma once
 
+#include "constants.h"
 #include "io_manager.h"
 #include "sampler.h"
 
@@ -40,8 +41,8 @@ class TextDecoderRunner {
   step(TensorPtr &input, int64_t start_pos);
 
   /**
-   * Load the Module for text decode purpose.
-   * @return The error code.
+   * Load the Module for text decode purpose. Loads the dynamic-shape `forward`
+   * method used for both prefill and decode.
    */
   virtual ::executorch::runtime::Error load() {
     return module_->load_method("forward");
diff --git a/packages/react-native-executorch/common/runner/text_prefiller.cpp b/packages/react-native-executorch/common/runner/text_prefiller.cpp
index dc961158b7..370ca5c7f4 100644
--- a/packages/react-native-executorch/common/runner/text_prefiller.cpp
+++ b/packages/react-native-executorch/common/runner/text_prefiller.cpp
@@ -18,10 +18,11 @@ namespace llm {
 
 TextPrefiller::TextPrefiller(TextDecoderRunner *text_decoder_runner,
                              bool use_kv_cache, bool enable_parallel_prefill,
-                             int64_t max_seq_len)
+                             int64_t max_seq_len, int32_t prefill_chunk_size)
     : text_decoder_runner_(text_decoder_runner), use_kv_cache_(use_kv_cache),
       enable_parallel_prefill_(enable_parallel_prefill),
-      max_seq_len_(max_seq_len > 0 ? max_seq_len : 128) {}
+      max_seq_len_(max_seq_len > 0 ? max_seq_len : 128),
+      prefill_chunk_size_(prefill_chunk_size) {}
 
 ::executorch::runtime::Result<uint64_t>
 TextPrefiller::prefill(std::vector<uint64_t> &prompt_tokens,
@@ -31,17 +32,17 @@ TextPrefiller::prefill(std::vector<uint64_t> &prompt_tokens,
     ET_CHECK_OK_OR_RETURN_ERROR(text_decoder_runner_->load());
   }
 
-  // Check if we need to chunk the prompt tokens
   int32_t num_prompt_tokens = prompt_tokens.size();
+  int32_t chunk_size =
+      prefill_chunk_size_ > 0 ? prefill_chunk_size_ : max_seq_len_;
 
-  // If prompt tokens exceed max_seq_len_, we need to chunk them
-  if (num_prompt_tokens > max_seq_len_) {
+  if (num_prompt_tokens > chunk_size) {
     uint64_t cur_token = 0;
     int num_tokens_to_process = 0;
 
     while (num_tokens_to_process < num_prompt_tokens) {
-      auto num_tokens_to_prefill_with = std::min<int>(
-          num_prompt_tokens - num_tokens_to_process, max_seq_len_);
+      auto num_tokens_to_prefill_with =
+          std::min<int>(num_prompt_tokens - num_tokens_to_process, chunk_size);
 
       std::vector<uint64_t> prompt_tokens_to_process(
           num_tokens_to_prefill_with);
@@ -75,7 +76,6 @@ TextPrefiller::prefill_chunk(std::vector<uint64_t> &prompt_tokens,
   // store the token
   uint64_t cur_token;
   if (enable_parallel_prefill_ || !use_kv_cache_) {
-    // initialize tensor wrappers
     auto tokens = from_blob(prompt_tokens.data(), {1, num_prompt_tokens},
                             executorch::aten::ScalarType::Long);
 
diff --git a/packages/react-native-executorch/common/runner/text_prefiller.h b/packages/react-native-executorch/common/runner/text_prefiller.h
index 7929fe9c7f..b8cdb1b98c 100644
--- a/packages/react-native-executorch/common/runner/text_prefiller.h
+++ b/packages/react-native-executorch/common/runner/text_prefiller.h
@@ -19,8 +19,14 @@ namespace llm {
 
 class TextPrefiller {
 public:
+  // prefill_chunk_size: when > 0, prompts longer than this are prefilled in
+  // chunks of this size (see prefill()). Set to the model's forward
+  // sequence-length cap for the MLX backend (its forward is exported with a
+  // sliding-window bound and one-shot prefill spikes Metal memory). Other
+  // backends (XNNPACK/CoreML) pass 0 → chunk at max_seq_len, as before.
   TextPrefiller(TextDecoderRunner *text_decoder_runner, bool use_kv_cache,
-                bool enable_parallel_prefill, int64_t max_seq_len = 128);
+                bool enable_parallel_prefill, int64_t max_seq_len = 128,
+                int32_t prefill_chunk_size = 0);
 
   virtual ~TextPrefiller() = default;
   /**
@@ -70,6 +76,7 @@ class TextPrefiller {
   bool use_kv_cache_;
   bool enable_parallel_prefill_;
   int64_t max_seq_len_;
+  int32_t prefill_chunk_size_;
 };
 
 } // namespace llm
diff --git a/packages/react-native-executorch/common/runner/text_runner.cpp b/packages/react-native-executorch/common/runner/text_runner.cpp
index 5a75e00b4a..bbcc0d8981 100644
--- a/packages/react-native-executorch/common/runner/text_runner.cpp
+++ b/packages/react-native-executorch/common/runner/text_runner.cpp
@@ -26,11 +26,24 @@ Error TextRunner::load_subcomponents() {
 
   Stats *stats_ptr = &stats_;
 
-  text_decoder_runner_ = std::make_unique<TextDecoderRunner>(
-      *module_, io_manager_.get(), config_);
+  text_decoder_runner_ =
+      std::make_unique<TextDecoderRunner>(*module_, io_manager_.get(), config_);
+
+  int32_t prefill_chunk_size = 0;
+  auto fwd_meta = module_->method_meta("forward");
+  if (fwd_meta.ok() && fwd_meta->uses_backend("MLXBackend")) {
+    auto input_meta = fwd_meta->input_tensor_meta(0);
+    if (input_meta.ok()) {
+      auto sizes = input_meta->sizes();
+      if (sizes.size() >= 2 && sizes[sizes.size() - 1] > 0) {
+        prefill_chunk_size = sizes[sizes.size() - 1];
+      }
+    }
+  }
+
   text_prefiller_ = std::make_unique<TextPrefiller>(
       text_decoder_runner_.get(), config_.enable_kv_cache,
-      config_.enable_dynamic_shape, config_.max_seq_len);
+      config_.enable_dynamic_shape, config_.max_seq_len, prefill_chunk_size);
   text_token_generator_ = std::make_unique<TextTokenGenerator>(
       tokenizer_.get(), text_decoder_runner_.get(), config_.enable_kv_cache,
       std::move(eos_ids_), stats_ptr, config_);
@@ -65,6 +78,10 @@ Error TextRunner::generate_internal(
 
   stats_.inference_start_ms = time_in_ms();
 
+  // Multi-turn: JS re-renders the full chat history each call, so reset KV
+  // position to 0 and re-prefill from scratch.
+  pos_ = 0;
+
   int64_t context_len_left =
       static_cast<int64_t>(config_.max_context_length) - pos_;
 
@@ -79,16 +96,25 @@ Error TextRunner::generate_internal(
   std::vector<uint64_t> prompt_tokens = encodeResult.get();
   int num_prompt_tokens = prompt_tokens.size();
 
+  // For dynamic-shape PTEs (e.g. Gemma4 MLX/Vulkan), get_max_seq_len is the
+  // per-call decoder chunk size (e.g. the sliding window) and the real
+  // generation budget lives in get_max_context_len. Static-shape PTEs set both
+  // equal, so this collapses to the old behavior. Without this the budget is
+  // computed from the small chunk size, so max_new_tokens can resolve to ~0 and
+  // generation ends immediately after prefill.
+  const int32_t seq_cap = config_.enable_dynamic_shape
+                              ? config_.max_context_length
+                              : config_.max_seq_len;
+
   ET_CHECK_OR_RETURN_ERROR(num_prompt_tokens >= 1, InvalidArgument,
                            "Expected at least 1 prompt token");
-  ET_CHECK_OR_RETURN_ERROR(num_prompt_tokens < config_.max_seq_len,
-                           InvalidArgument,
-                           "num_prompt_tokens %d >= max_seq_len %" PRId32,
-                           num_prompt_tokens, config_.max_seq_len);
+  ET_CHECK_OR_RETURN_ERROR(num_prompt_tokens < seq_cap, InvalidArgument,
+                           "num_prompt_tokens %d >= seq cap %" PRId32,
+                           num_prompt_tokens, seq_cap);
 
   int32_t max_new_tokens = resolve_max_new_tokens(
-      num_prompt_tokens, config_.max_seq_len,
-      static_cast<int32_t>(context_len_left), config_.max_new_tokens);
+      num_prompt_tokens, seq_cap, static_cast<int32_t>(context_len_left),
+      config_.max_new_tokens);
 
   ET_CHECK_OR_RETURN_ERROR(max_new_tokens > 0, InvalidArgument,
                            "Max new tokens %d is <= 0", max_new_tokens);
diff --git a/packages/react-native-executorch/common/runner/text_token_generator.h b/packages/react-native-executorch/common/runner/text_token_generator.h
index 7ecf6177a9..241758e619 100644
--- a/packages/react-native-executorch/common/runner/text_token_generator.h
+++ b/packages/react-native-executorch/common/runner/text_token_generator.h
@@ -100,8 +100,8 @@ class TextTokenGenerator {
       prev_token = cur_token;
 
       stats_->on_sampling_begin();
-      cur_token =
-          text_decoder_runner_->logits_to_token(logits_tensor, generated_tokens);
+      cur_token = text_decoder_runner_->logits_to_token(logits_tensor,
+                                                        generated_tokens);
       stats_->on_sampling_end();
 
       pos++;
@@ -152,7 +152,6 @@ class TextTokenGenerator {
       if (should_stop_) {
         break;
       }
-
       // data-dependent terminating condition: we have n_eos_ number of EOS
       if (eos_ids_->find(cur_token) != eos_ids_->end()) {
         printf("\n");
diff --git a/packages/react-native-executorch/common/runner/util.h b/packages/react-native-executorch/common/runner/util.h
index 640b96319f..b1e707034b 100644
--- a/packages/react-native-executorch/common/runner/util.h
+++ b/packages/react-native-executorch/common/runner/util.h
@@ -8,7 +8,6 @@
 
 #pragma once
 #include "constants.h"
-#include "text_prefiller.h"
 #include <cctype>
 #include <executorch/extension/module/module.h>
 #include <executorch/extension/tensor/tensor.h>
diff --git a/packages/react-native-executorch/package.json b/packages/react-native-executorch/package.json
index b99726a2e8..cfbf1361a8 100644
--- a/packages/react-native-executorch/package.json
+++ b/packages/react-native-executorch/package.json
@@ -1,6 +1,6 @@
 {
   "name": "react-native-executorch",
-  "version": "0.9.0",
+  "version": "0.9.1",
   "description": "An easy way to run AI models in React Native with ExecuTorch",
   "source": "./src/index.ts",
   "main": "./lib/module/index.js",
diff --git a/packages/react-native-executorch/react-native-executorch.podspec b/packages/react-native-executorch/react-native-executorch.podspec
index 849759243f..daf0da27c7 100644
--- a/packages/react-native-executorch/react-native-executorch.podspec
+++ b/packages/react-native-executorch/react-native-executorch.podspec
@@ -62,6 +62,12 @@ Pod::Spec.new do |s|
 
   s.libraries = "z"
   s.ios.vendored_frameworks = "third-party/ios/ExecutorchLib.xcframework"
+
+  # NOTE: mlx.metallib (the MLX GPU kernels) is bundled INSIDE
+  # ExecutorchLib.framework, colocated with the binary that contains the MLX
+  # code. MLX's runtime loader resolves the metallib relative to that binary
+  # (via dladdr), so it must live next to it in the framework — not at the app
+  # bundle root.
   # Exclude file with tests to not introduce gtest dependency.
   # Do not include the headers from common/rnexecutorch/jsi/ as source files.
   # Xcode/Cocoapods leaks them to other pods that an app also depends on, so if
diff --git a/packages/react-native-executorch/src/constants/llmDefaults.ts b/packages/react-native-executorch/src/constants/llmDefaults.ts
index a27a2f7a4f..77a60fe311 100644
--- a/packages/react-native-executorch/src/constants/llmDefaults.ts
+++ b/packages/react-native-executorch/src/constants/llmDefaults.ts
@@ -6,7 +6,7 @@ import { SlidingWindowContextStrategy } from '../utils/llms/context_strategy';
  * @category Utilities - LLM
  */
 export const DEFAULT_SYSTEM_PROMPT =
-  "You are a knowledgeable, efficient, and direct AI assistant. Provide concise answers, focusing on the key information needed. Offer suggestions tactfully when appropriate to improve outcomes. Engage in productive collaboration with the user. Don't return too much text.";
+  "You are a knowledgeable, efficient, and direct AI assistant. Provide concise answers, focusing on the key information needed. Offer suggestions tactfully when appropriate to improve outcomes. Engage in productive collaboration with the user. Don't return too much text. If provided with audio samples, treat them with the utmost importance.";
 
 /**
  * Generates a default structured output prompt based on the provided JSON schema.
diff --git a/packages/react-native-executorch/src/constants/modelRegistry.ts b/packages/react-native-executorch/src/constants/modelRegistry.ts
index 9c9da9c420..c3cda78498 100644
--- a/packages/react-native-executorch/src/constants/modelRegistry.ts
+++ b/packages/react-native-executorch/src/constants/modelRegistry.ts
@@ -38,7 +38,7 @@ import { RnExecutorchErrorCode } from '../errors/ErrorCodes';
  * compile-time error.
  * @category Utils
  */
-export type Backend = 'xnnpack' | 'coreml' | 'vulkan' | 'qnn';
+export type Backend = 'xnnpack' | 'coreml' | 'vulkan' | 'qnn' | 'mlx';
 
 /**
  * Options for a `models` accessor call.
@@ -78,7 +78,7 @@ type ConfigOf<V> = Extract<
 >;
 type BackendsOf<V> = Extract<keyof V, Backend>;
 
-const BACKEND_ORDER: Backend[] = ['xnnpack', 'coreml', 'vulkan', 'qnn'];
+const BACKEND_ORDER: Backend[] = ['xnnpack', 'coreml', 'mlx', 'vulkan', 'qnn'];
 
 function firstBackend(variants: AnyVariantMap): Backend {
   for (const b of BACKEND_ORDER) {
@@ -181,6 +181,33 @@ function tts<C extends TextToSpeechModelConfig>(c: C): () => C {
 // Per-backend variant maps for models that ship more than one backend.
 // ─────────────────────────────────────────────────────────────────────────────
 
+const GEMMA4_E2B_VARIANTS = {
+  mlx: {
+    base: {
+      modelName: 'gemma4-e2b' as const,
+      modelSource: M.GEMMA4_E2B_MLX_MODEL,
+      tokenizerSource: M.GEMMA4_E2B_TOKENIZER,
+      tokenizerConfigSource: M.GEMMA4_E2B_TOKENIZER_CONFIG,
+    },
+  },
+  xnnpack: {
+    base: {
+      modelName: 'gemma4-e2b' as const,
+      modelSource: M.GEMMA4_E2B_XNNPACK_MODEL,
+      tokenizerSource: M.GEMMA4_E2B_TOKENIZER,
+      tokenizerConfigSource: M.GEMMA4_E2B_TOKENIZER_CONFIG,
+    },
+  },
+  vulkan: {
+    base: {
+      modelName: 'gemma4-e2b' as const,
+      modelSource: M.GEMMA4_E2B_VULKAN_MODEL,
+      tokenizerSource: M.GEMMA4_E2B_TOKENIZER,
+      tokenizerConfigSource: M.GEMMA4_E2B_TOKENIZER_CONFIG,
+    },
+  },
+};
+
 const EFFICIENTNET_V2_S_VARIANTS = {
   xnnpack: {
     base: {
@@ -496,10 +523,15 @@ export const models = {
       M.LFM2_5_1_2B_INSTRUCT_QUANTIZED
     ),
     bielik_v3_0_1_5b: pair(M.BIELIK_V3_0_1_5B, M.BIELIK_V3_0_1_5B_QUANTIZED),
+    gemma4_e2b: variant(GEMMA4_E2B_VARIANTS, {
+      ios: 'mlx',
+      android: 'vulkan',
+    }),
     // Multimodal LLMs — same hook/module as plain LLMs, listed here so users
     // pick a model by capability ("LLM") rather than by modality.
     lfm2_5_vl_1_6b: base(M.LFM2_5_VL_1_6B_QUANTIZED),
     lfm2_5_vl_450m: base(M.LFM2_5_VL_450M_QUANTIZED),
+    gemma4_e2b_multimodal: base(M.GEMMA4_E2B_MM),
   },
   classification: {
     efficientnet_v2_s: variant(EFFICIENTNET_V2_S_VARIANTS),
diff --git a/packages/react-native-executorch/src/constants/modelUrls.ts b/packages/react-native-executorch/src/constants/modelUrls.ts
index 17c523f881..3f26537acc 100644
--- a/packages/react-native-executorch/src/constants/modelUrls.ts
+++ b/packages/react-native-executorch/src/constants/modelUrls.ts
@@ -125,6 +125,47 @@ export const QWEN3_0_6B_QUANTIZED = {
   generationConfig: QWEN3_GENERATION_CONFIG,
 } as const;
 
+// GEMMA 4 — separate HF repo; tokenizer files live at the e2b root and are
+// shared by all backend variants.
+const GEMMA4_E2B_PREFIX = `${URL_PREFIX}-gemma-4/${VERSION_TAG}/e2b`;
+export const GEMMA4_E2B_MLX_MODEL = `${GEMMA4_E2B_PREFIX}/mlx/gemma4_e2b_mlx_int4.pte`;
+export const GEMMA4_E2B_XNNPACK_MODEL = `${GEMMA4_E2B_PREFIX}/xnnpack/gemma_4_e2b_xnnpack_8da4w.pte`;
+export const GEMMA4_E2B_VULKAN_MODEL = `${GEMMA4_E2B_PREFIX}/vulkan/gemma_4_e2b_vulkan_8da4w.pte`;
+export const GEMMA4_E2B_TOKENIZER = `${GEMMA4_E2B_PREFIX}/tokenizer.json`;
+export const GEMMA4_E2B_TOKENIZER_CONFIG = `${GEMMA4_E2B_PREFIX}/tokenizer_config.json`;
+
+const GEMMA4_E2B_MODEL =
+  Platform.OS === `android` ? GEMMA4_E2B_VULKAN_MODEL : GEMMA4_E2B_MLX_MODEL;
+
+const GEMMA4_E2B_MLX_MM = `${URL_PREFIX}-gemma-4-multimodal/${VERSION_TAG}/e2b/mlx/gemma4_e2b_mlx_int4.pte`;
+const GEMMA4_E2B_VULKAN_MM = `${URL_PREFIX}-gemma-4-multimodal/${VERSION_TAG}/e2b/vulkan/gemma_4_e2b_vulkan_8da4w.pte`;
+
+/**
+ * @category Models - LLM
+ */
+export const GEMMA4_E2B = {
+  modelName: 'gemma4-e2b',
+  modelSource: GEMMA4_E2B_MODEL,
+  tokenizerSource: GEMMA4_E2B_TOKENIZER,
+  tokenizerConfigSource: GEMMA4_E2B_TOKENIZER_CONFIG,
+} as const;
+
+/**
+ * @category Models - LLM Multimodal
+ */
+export const GEMMA4_E2B_MM = {
+  modelName: 'gemma4-e2b-multimodal',
+  modelSource:
+    Platform.OS === `android` ? GEMMA4_E2B_VULKAN_MM : GEMMA4_E2B_MLX_MM,
+  tokenizerSource: GEMMA4_E2B_TOKENIZER,
+  tokenizerConfigSource: GEMMA4_E2B_TOKENIZER_CONFIG,
+  capabilities: ['vision', 'audio'],
+  audioConfig: {
+    samplesPerBlock: 7680,
+    tokensPerBlock: 12,
+  },
+} as const;
+
 /**
  * @category Models - LLM
  */
@@ -816,27 +857,27 @@ export const STYLE_TRANSFER_UDNIE_QUANTIZED = {
 // S2T
 export const WHISPER_TINY_EN_TOKENIZER = `${URL_PREFIX}-whisper-tiny.en/${VERSION_TAG}/tokenizer.json`;
 export const WHISPER_TINY_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-tiny.en/${VERSION_TAG}/xnnpack/whisper_tiny_en_xnnpack_fp32.pte`;
-export const WHISPER_TINY_EN_MODEL_COREML = `${URL_PREFIX}-whisper-tiny.en/${VERSION_TAG}/coreml/whisper_tiny_en_coreml_fp32.pte`;
+export const WHISPER_TINY_EN_MODEL_COREML = `${URL_PREFIX}-whisper-tiny.en/${VERSION_TAG}/coreml/whisper_tiny_en_coreml_fp16.pte`;
 
 export const WHISPER_BASE_EN_TOKENIZER = `${URL_PREFIX}-whisper-base.en/${VERSION_TAG}/tokenizer.json`;
 export const WHISPER_BASE_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-base.en/${VERSION_TAG}/xnnpack/whisper_base_en_xnnpack_fp32.pte`;
-export const WHISPER_BASE_EN_MODEL_COREML = `${URL_PREFIX}-whisper-base.en/${VERSION_TAG}/coreml/whisper_base_en_coreml_fp32.pte`;
+export const WHISPER_BASE_EN_MODEL_COREML = `${URL_PREFIX}-whisper-base.en/${VERSION_TAG}/coreml/whisper_base_en_coreml_fp16.pte`;
 
 export const WHISPER_SMALL_EN_TOKENIZER = `${URL_PREFIX}-whisper-small.en/${VERSION_TAG}/tokenizer.json`;
 export const WHISPER_SMALL_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-small.en/${VERSION_TAG}/xnnpack/whisper_small_en_xnnpack_fp32.pte`;
-export const WHISPER_SMALL_EN_MODEL_COREML = `${URL_PREFIX}-whisper-small.en/${VERSION_TAG}/coreml/whisper_small_en_coreml_fp32.pte`;
+export const WHISPER_SMALL_EN_MODEL_COREML = `${URL_PREFIX}-whisper-small.en/${VERSION_TAG}/coreml/whisper_small_en_coreml_fp16.pte`;
 
 export const WHISPER_TINY_TOKENIZER = `${URL_PREFIX}-whisper-tiny/${VERSION_TAG}/tokenizer.json`;
 export const WHISPER_TINY_MODEL_XNNPACK = `${URL_PREFIX}-whisper-tiny/${VERSION_TAG}/xnnpack/whisper_tiny_xnnpack_fp32.pte`;
-export const WHISPER_TINY_MODEL_COREML = `${URL_PREFIX}-whisper-tiny/${VERSION_TAG}/coreml/whisper_tiny_coreml_fp32.pte`;
+export const WHISPER_TINY_MODEL_COREML = `${URL_PREFIX}-whisper-tiny/${VERSION_TAG}/coreml/whisper_tiny_coreml_fp16.pte`;
 
 export const WHISPER_BASE_TOKENIZER = `${URL_PREFIX}-whisper-base/${VERSION_TAG}/tokenizer.json`;
 export const WHISPER_BASE_MODEL_XNNPACK = `${URL_PREFIX}-whisper-base/${VERSION_TAG}/xnnpack/whisper_base_xnnpack_fp32.pte`;
-export const WHISPER_BASE_MODEL_COREML = `${URL_PREFIX}-whisper-base/${VERSION_TAG}/coreml/whisper_base_coreml_fp32.pte`;
+export const WHISPER_BASE_MODEL_COREML = `${URL_PREFIX}-whisper-base/${VERSION_TAG}/coreml/whisper_base_coreml_fp16.pte`;
 
 export const WHISPER_SMALL_TOKENIZER = `${URL_PREFIX}-whisper-small/${VERSION_TAG}/tokenizer.json`;
 export const WHISPER_SMALL_MODEL_XNNPACK = `${URL_PREFIX}-whisper-small/${VERSION_TAG}/xnnpack/whisper_small_xnnpack_fp32.pte`;
-export const WHISPER_SMALL_MODEL_COREML = `${URL_PREFIX}-whisper-small/${VERSION_TAG}/coreml/whisper_small_coreml_fp32.pte`;
+export const WHISPER_SMALL_MODEL_COREML = `${URL_PREFIX}-whisper-small/${VERSION_TAG}/coreml/whisper_small_coreml_fp16.pte`;
 
 /**
  * @category Models - Speech To Text
diff --git a/packages/react-native-executorch/src/controllers/LLMController.ts b/packages/react-native-executorch/src/controllers/LLMController.ts
index bceca47a56..4385ca909b 100644
--- a/packages/react-native-executorch/src/controllers/LLMController.ts
+++ b/packages/react-native-executorch/src/controllers/LLMController.ts
@@ -1,11 +1,11 @@
-import { ResourceSource } from '../types/common';
 import { ResourceFetcher } from '../utils/ResourceFetcher';
 import { Template } from '@huggingface/jinja';
 import { DEFAULT_CHAT_CONFIG } from '../constants/llmDefaults';
 import {
+  AudioConfig,
   ChatConfig,
   GenerationConfig,
-  LLMCapability,
+  LLMModel,
   LLMTool,
   Message,
   SPECIAL_TOKENS,
@@ -30,6 +30,7 @@ export class LLMController {
   private messageHistoryCallback: (messageHistory: Message[]) => void;
   private isReadyCallback: (isReady: boolean) => void;
   private isGeneratingCallback: (isGenerating: boolean) => void;
+  private audioConfig: AudioConfig | undefined;
 
   constructor({
     tokenCallback,
@@ -72,18 +73,10 @@ export class LLMController {
   }
 
   public async load({
-    modelSource,
-    tokenizerSource,
-    tokenizerConfigSource,
-    capabilities,
-    defaultGenerationConfig,
+    model,
     onDownloadProgressCallback,
   }: {
-    modelSource: ResourceSource;
-    tokenizerSource: ResourceSource;
-    tokenizerConfigSource: ResourceSource;
-    capabilities?: readonly LLMCapability[];
-    defaultGenerationConfig?: GenerationConfig;
+    model: LLMModel;
     onDownloadProgressCallback?: (downloadProgress: number) => void;
   }) {
     // reset inner state when loading new model
@@ -94,13 +87,13 @@ export class LLMController {
     try {
       const tokenizersPromise = ResourceFetcher.fetch(
         undefined,
-        tokenizerSource,
-        tokenizerConfigSource
+        model.tokenizerSource,
+        model.tokenizerConfigSource
       );
 
       const modelPromise = ResourceFetcher.fetch(
         onDownloadProgressCallback,
-        modelSource
+        model.modelSource
       );
 
       const [tokenizersResults, modelResult] = await Promise.all([
@@ -124,16 +117,18 @@ export class LLMController {
         this.nativeModule.unload();
       }
 
+      this.audioConfig = model.audioConfig;
+
       this.nativeModule = await global.loadLLM(
         modelPath,
         tokenizerPath,
-        capabilities ?? []
+        model.capabilities ?? []
       );
-      if (defaultGenerationConfig) {
+      if (model.generationConfig) {
         // Apply model-specific recommended sampling defaults before flipping
         // isReady so callers that react to it see the right config on first
         // send. User-provided `configure()` calls still override these.
-        this.applyGenerationConfig(defaultGenerationConfig);
+        this.applyGenerationConfig(model.generationConfig);
       }
       this.isReadyCallback(true);
       this.onToken = (data: string) => {
@@ -236,6 +231,17 @@ export class LLMController {
     return token;
   }
 
+  private getAudioToken(): string {
+    const token = this.tokenizerConfig.audio_token;
+    if (!token) {
+      throw new RnExecutorchError(
+        RnExecutorchErrorCode.InvalidConfig,
+        "Tokenizer config is missing 'audio_token'. Audio-capable models require tokenizerConfigSource with an 'audio_token' field."
+      );
+    }
+    return token;
+  }
+
   private filterSpecialTokens(text: string): string {
     let filtered = text;
     if (
@@ -244,6 +250,12 @@ export class LLMController {
     ) {
       filtered = filtered.replaceAll(this.tokenizerConfig.eos_token, '');
     }
+    if (
+      SPECIAL_TOKENS.EOT_TOKEN in this.tokenizerConfig &&
+      this.tokenizerConfig.eot_token
+    ) {
+      filtered = filtered.replaceAll(this.tokenizerConfig.eot_token, '');
+    }
     if (
       SPECIAL_TOKENS.PAD_TOKEN in this.tokenizerConfig &&
       this.tokenizerConfig.pad_token
@@ -269,25 +281,37 @@ export class LLMController {
     this.isGeneratingCallback(false);
   }
 
-  public async forward(input: string, imagePaths?: string[]): Promise<string> {
+  public async forward(
+    input: string,
+    imagePaths?: string[],
+    audioWaveforms?: Float32Array[]
+  ): Promise<string> {
     if (!this._isReady) {
       throw new RnExecutorchError(RnExecutorchErrorCode.ModuleNotLoaded);
     }
     if (this._isGenerating) {
       throw new RnExecutorchError(RnExecutorchErrorCode.ModelGenerating);
     }
+    const hasImages = !!imagePaths && imagePaths.length > 0;
+    const hasAudio = !!audioWaveforms && audioWaveforms.length > 0;
     try {
       this.isGeneratingCallback(true);
       this.nativeModule.reset();
-      const response =
-        imagePaths && imagePaths.length > 0
-          ? await this.nativeModule.generateMultimodal(
-              input,
-              imagePaths.map(normalizeImagePath),
-              this.getImageToken(),
-              this.onToken
-            )
-          : await this.nativeModule.generate(input, this.onToken);
+      let response: string;
+      if (hasImages || hasAudio) {
+        response = await this.nativeModule.generateMultimodal(
+          input,
+          this.onToken,
+          {
+            imagePaths: hasImages ? imagePaths!.map(normalizeImagePath) : null,
+            imageToken: hasImages ? this.getImageToken() : null,
+            audioWaveforms: hasAudio ? audioWaveforms! : null,
+            audioToken: hasAudio ? this.getAudioToken() : null,
+          }
+        );
+      } else {
+        response = await this.nativeModule.generate(input, this.onToken);
+      }
       return this.filterSpecialTokens(response);
     } catch (e) {
       throw parseUnknownError(e);
@@ -355,7 +379,9 @@ export class LLMController {
     const imagePaths = messages
       .filter((m) => m.mediaPath)
       .map((m) => m.mediaPath!);
-
+    const audioWaveforms = messages
+      .filter((m) => m.audioWaveform)
+      .map((m) => m.audioWaveform!);
     const renderedChat: string = this.applyChatTemplate(
       messages,
       this.tokenizerConfig,
@@ -365,19 +391,22 @@ export class LLMController {
 
     return await this.forward(
       renderedChat,
-      imagePaths.length > 0 ? imagePaths : undefined
+      imagePaths.length > 0 ? imagePaths : undefined,
+      audioWaveforms.length > 0 ? audioWaveforms : undefined
     );
   }
 
   public async sendMessage(
     message: string,
-    media?: { imagePath?: string }
+    media?: { imagePath?: string; audioBuffer?: Float32Array }
   ): Promise<string> {
     const mediaPath = media?.imagePath;
+    const audioBuffer = media?.audioBuffer;
     const newMessage: Message = {
       content: message,
       role: 'user',
       ...(mediaPath ? { mediaPath } : {}),
+      ...(audioBuffer ? { audioWaveform: audioBuffer } : {}),
     };
     const updatedHistory = [...this._messageHistory, newMessage];
     this.messageHistoryCallback(updatedHistory);
@@ -392,7 +421,31 @@ export class LLMController {
       );
       const textTokens = this.nativeModule.countTextTokens(rendered);
       const imageCount = messages.filter((m) => m.mediaPath).length;
-      return textTokens + imageCount * (visualTokenCount - 1);
+      // Audio soft-token expansion: audio_encoder pads samples to
+      // multiples of this.audioConfig.samplesPerBlock (7680 @ 16 kHz) and emits
+      // this.audioConfig.tokensPerBlock (~12) soft tokens per padded block. The
+      // rendered template only contributes 1 token for the audio placeholder,
+      // so add (expansion - 1) per audio message to match prefill consumption.
+      const audioConfig = this.audioConfig;
+      const audioTokenExpansion = messages.reduce((acc, m) => {
+        if (!m.audioWaveform) return acc;
+        // audioConfig is optional on the model; fail with an actionable
+        // error instead of a TypeError from dereferencing undefined.
+        if (!audioConfig) {
+          throw new RnExecutorchError(
+            RnExecutorchErrorCode.InvalidConfig,
+            "Model is missing 'audioConfig'. Audio messages require audioConfig with 'samplesPerBlock' and 'tokensPerBlock'."
+          );
+        }
+        const kBlocks = Math.max(
+          1,
+          Math.ceil(m.audioWaveform.length / audioConfig.samplesPerBlock)
+        );
+        return acc + (audioConfig.tokensPerBlock * kBlocks - 1);
+      }, 0);
+      return (
+        textTokens + imageCount * (visualTokenCount - 1) + audioTokenExpansion
+      );
     };
     const maxContextLength = this.nativeModule.getMaxContextLength();
     const messageHistoryWithPrompt =
@@ -497,12 +541,17 @@ function normalizeImagePath(path: string): string {
  * @returns Messages with image-bearing turns rewritten to structured content.
  */
 function messagesForChatTemplate(messages: Message[]): any[] {
-  return messages.map((m) =>
-    m.mediaPath && typeof m.content === 'string'
-      ? {
-          ...m,
-          content: [{ type: 'image' }, { type: 'text', text: m.content }],
-        }
-      : m
-  );
+  return messages.map((m) => {
+    if (typeof m.content !== 'string') return m;
+    const hasImage = !!m.mediaPath;
+    const hasAudio = !!m.audioWaveform;
+    if (!hasImage && !hasAudio) return m;
+    const parts: any[] = [];
+    if (hasImage) parts.push({ type: 'image' });
+    if (hasAudio) parts.push({ type: 'audio' });
+    parts.push({ type: 'text', text: m.content });
+    // Drop the Float32Array on the clone only — passing it into the Jinja
+    // template engine slows render past 3s. Don't mutate `m` itself.
+    return { ...m, content: parts, audioWaveform: undefined };
+  });
 }
diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts
index 027e237997..a8daef8d91 100644
--- a/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts
+++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts
@@ -58,11 +58,7 @@ export function useLLM({
     (async () => {
       try {
         await controllerInstance.load({
-          modelSource: model.modelSource,
-          tokenizerSource: model.tokenizerSource,
-          tokenizerConfigSource: model.tokenizerConfigSource!,
-          capabilities: model.capabilities,
-          defaultGenerationConfig: model.generationConfig,
+          model: model,
           onDownloadProgressCallback: setDownloadProgress,
         });
       } catch (e) {
@@ -106,7 +102,10 @@ export function useLLM({
   );
 
   const sendMessage = useCallback(
-    (message: string, media?: { imagePath?: string }) => {
+    (
+      message: string,
+      media?: { imagePath?: string; audioBuffer?: Float32Array }
+    ) => {
       setResponse('');
       return controllerInstance.sendMessage(message, media);
     },
diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/LLMModule.ts b/packages/react-native-executorch/src/modules/natural_language_processing/LLMModule.ts
index bdb5ada699..be6ecb229b 100644
--- a/packages/react-native-executorch/src/modules/natural_language_processing/LLMModule.ts
+++ b/packages/react-native-executorch/src/modules/natural_language_processing/LLMModule.ts
@@ -3,6 +3,7 @@ import { Logger } from '../../common/Logger';
 import { parseUnknownError } from '../../errors/errorUtils';
 import { ResourceSource } from '../../types/common';
 import {
+  AudioConfig,
   LLMCapability,
   LLMConfig,
   LLMModelName,
@@ -51,6 +52,7 @@ export class LLMModule {
       tokenizerSource: ResourceSource;
       tokenizerConfigSource: ResourceSource;
       capabilities?: readonly LLMCapability[];
+      audioConfig?: AudioConfig;
     },
     onDownloadProgress: (progress: number) => void = () => {},
     tokenCallback?: (token: string) => void,
@@ -59,10 +61,14 @@ export class LLMModule {
     const instance = new LLMModule({ tokenCallback, messageHistoryCallback });
     try {
       await instance.controller.load({
-        modelSource: namedSources.modelSource,
-        tokenizerSource: namedSources.tokenizerSource,
-        tokenizerConfigSource: namedSources.tokenizerConfigSource,
-        capabilities: namedSources.capabilities,
+        model: {
+          modelName: namedSources.modelName,
+          modelSource: namedSources.modelSource,
+          tokenizerSource: namedSources.tokenizerSource,
+          tokenizerConfigSource: namedSources.tokenizerConfigSource,
+          capabilities: namedSources.capabilities,
+          audioConfig: namedSources.audioConfig,
+        },
         onDownloadProgressCallback: onDownloadProgress,
       });
       return instance;
@@ -140,10 +146,15 @@ export class LLMModule {
    * If you want a simple chat with model the consider using `sendMessage`
    * @param input - Raw input string containing the prompt and conversation history.
    * @param imagePaths - Optional array of local image paths for multimodal inference. Each entry may be either `file:///absolute/path` or `/absolute/path` — the controller normalizes the path before passing it to native code.
+   * @param audioWaveforms - Optional array of 16kHz waveforms of audio recordings for multimodal inference.
    * @returns The generated response as a string.
    */
-  async forward(input: string, imagePaths?: string[]): Promise<string> {
-    return await this.controller.forward(input, imagePaths);
+  async forward(
+    input: string,
+    imagePaths?: string[],
+    audioWaveforms?: Float32Array[]
+  ): Promise<string> {
+    return await this.controller.forward(input, imagePaths, audioWaveforms);
   }
 
   /**
@@ -162,12 +173,12 @@ export class LLMModule {
    * After model responds it will call `messageHistoryCallback()` containing both user message and model response.
    * It also returns them.
    * @param message - The message string to send.
-   * @param media - Optional media object containing a local image path for multimodal models.
+   * @param media - Optional media object containing a local image path or 16kHz waveform of an audio recording for multimodal models.
    * @returns - Updated message history including the new user message and model response.
    */
   async sendMessage(
     message: string,
-    media?: { imagePath?: string }
+    media?: { imagePath?: string; audioBuffer?: Float32Array }
   ): Promise<Message[]> {
     await this.controller.sendMessage(message, media);
     return this.controller.messageHistory;
diff --git a/packages/react-native-executorch/src/types/llm.ts b/packages/react-native-executorch/src/types/llm.ts
index 6254775c15..1d8da7bd70 100644
--- a/packages/react-native-executorch/src/types/llm.ts
+++ b/packages/react-native-executorch/src/types/llm.ts
@@ -5,20 +5,23 @@ import { ResourceSource } from './common';
  * Capabilities a multimodal LLM can have.
  * @category Types
  */
-export type LLMCapability = 'vision';
+export type LLMCapability = 'vision' | 'audio';
 
 /**
  * Derives the media argument shape for `sendMessage` from a capabilities tuple.
  * @category Types
  */
 export type MediaArg<C extends readonly LLMCapability[]> =
-  'vision' extends C[number] ? { imagePath?: string } : object;
+  ('vision' extends C[number] ? { imagePath?: string } : object) &
+    ('audio' extends C[number] ? { audioBuffer?: Float32Array } : object);
 
 /**
  * Union of all built-in LLM model names.
  * @category Types
  */
 export type LLMModelName =
+  | 'gemma4-e2b'
+  | 'gemma4-e2b-multimodal'
   | 'llama-3.2-3b'
   | 'llama-3.2-3b-qlora'
   | 'llama-3.2-3b-spinquant'
@@ -62,43 +65,63 @@ export type LLMModelName =
   | 'bielik-v3.0-1.5b'
   | 'bielik-v3.0-1.5b-quantized';
 
+/**
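+ * Audio soft-token expansion constants for audio_encoder: input is padded to multiples of `samplesPerBlock` samples and each padded block yields `tokensPerBlock` soft tokens.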
+ * @category Types
+ */
+export interface AudioConfig {
+  samplesPerBlock: number;
+  tokensPerBlock: number;
+}
+
+/**
+ * Properties defining LLMModel.
+ * @category Types
+ */
+export interface LLMModel {
+  /**
+   * The built-in model name (e.g. `'llama-3.2-3b'`). Used for telemetry and hook reload triggers.
+   * Pass one of the pre-built LLM constants (e.g. `LLAMA3_2_3B`) to populate all required fields.
+   */
+  modelName: LLMModelName;
+  /**
+   * `ResourceSource` that specifies the location of the model binary.
+   */
+  modelSource: ResourceSource;
+  /**
+   * `ResourceSource` pointing to the JSON file which contains the tokenizer.
+   */
+  tokenizerSource: ResourceSource;
+  /**
+   * `ResourceSource` pointing to the JSON file which contains the tokenizer config.
+   */
+  tokenizerConfigSource: ResourceSource;
+  /**
+   * Optional list of modality capabilities the model supports.
+   * Determines the type of the `media` argument in `sendMessage`.
+   * Example: `['vision']` enables `sendMessage(text, { imagePath })`.
+   */
+  capabilities?: readonly LLMCapability[];
+  /**
+   * Recommended default generation settings, typically copied from the
+   * upstream `generation_config.json` or the model card. Applied automatically
+   * after the native module loads and before any user `configure()` call,
+   * so callers only need to override the values they want to change.
+   */
+  generationConfig?: GenerationConfig;
+  /**
+   * Defines config for audio input modality for multimodal LLMs.
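+   * `capabilities` must include 'audio'. The controller reads it when counting tokens for messages that carry audio, so audio-capable models should set it.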
+   */
+  audioConfig?: AudioConfig;
+}
+
 /**
  * Properties for initializing and configuring a Large Language Model (LLM) instance.
  * @category Types
  */
 export interface LLMProps {
-  model: {
-    /**
-     * The built-in model name (e.g. `'llama-3.2-3b'`). Used for telemetry and hook reload triggers.
-     * Pass one of the pre-built LLM constants (e.g. `LLAMA3_2_3B`) to populate all required fields.
-     */
-    modelName: LLMModelName;
-    /**
-     * `ResourceSource` that specifies the location of the model binary.
-     */
-    modelSource: ResourceSource;
-    /**
-     * `ResourceSource` pointing to the JSON file which contains the tokenizer.
-     */
-    tokenizerSource: ResourceSource;
-    /**
-     * `ResourceSource` pointing to the JSON file which contains the tokenizer config.
-     */
-    tokenizerConfigSource: ResourceSource;
-    /**
-     * Optional list of modality capabilities the model supports.
-     * Determines the type of the `media` argument in `sendMessage`.
-     * Example: `['vision']` enables `sendMessage(text, { imagePath })`.
-     */
-    capabilities?: readonly LLMCapability[];
-    /**
-     * Recommended default generation settings, typically copied from the
-     * upstream `generation_config.json` or the model card. Applied automatically
-     * after the native module loads and before any user `configure()` call,
-     * so callers only need to override the values they want to change.
-     */
-    generationConfig?: GenerationConfig;
-  };
+  model: LLMModel;
   /**
    * Boolean that can prevent automatic model loading (and downloading the data if you load it for the first time) after running the hook.
    */
@@ -289,6 +312,12 @@ export interface Message {
    * controller normalizes the path before passing it to native code.
    */
   mediaPath?: string;
+  /**
+   * Optional fp32 mono 16 kHz PCM buffer. Only valid on `user` messages for
+   * models with the `'audio'` capability. The controller forwards it to the
+   * native `generateMultimodal` path.
+   */
+  audioWaveform?: Float32Array;
 }
 
 /**
@@ -386,6 +415,7 @@ export interface ContextStrategy {
 export const SPECIAL_TOKENS = {
   BOS_TOKEN: 'bos_token',
   EOS_TOKEN: 'eos_token',
+  EOT_TOKEN: 'eot_token',
   UNK_TOKEN: 'unk_token',
   SEP_TOKEN: 'sep_token',
   PAD_TOKEN: 'pad_token',
diff --git a/packages/react-native-executorch/third-party/android/libs/executorch/arm64-v8a/libexecutorch.so b/packages/react-native-executorch/third-party/android/libs/executorch/arm64-v8a/libexecutorch.so
index 8c65aa5d85..36cb0919c2 100644
Binary files a/packages/react-native-executorch/third-party/android/libs/executorch/arm64-v8a/libexecutorch.so and b/packages/react-native-executorch/third-party/android/libs/executorch/arm64-v8a/libexecutorch.so differ
diff --git a/packages/react-native-executorch/third-party/android/libs/executorch/x86_64/libexecutorch.so b/packages/react-native-executorch/third-party/android/libs/executorch/x86_64/libexecutorch.so
index a56a5d20ac..7d39ee85a3 100644
Binary files a/packages/react-native-executorch/third-party/android/libs/executorch/x86_64/libexecutorch.so and b/packages/react-native-executorch/third-party/android/libs/executorch/x86_64/libexecutorch.so differ
diff --git a/packages/react-native-executorch/third-party/include/executorch/ExecuTorch.h b/packages/react-native-executorch/third-party/include/executorch/ExecuTorch.h
index 3a12a5ddba..d0ad6c2840 100644
--- a/packages/react-native-executorch/third-party/include/executorch/ExecuTorch.h
+++ b/packages/react-native-executorch/third-party/include/executorch/ExecuTorch.h
@@ -6,6 +6,8 @@
  * LICENSE file in the root directory of this source tree.
  */
 
+#import "ExecuTorchBackendOption.h"
+#import "ExecuTorchBackendOptionsMap.h"
 #import "ExecuTorchError.h"
 #import "ExecuTorchLog.h"
 #import "ExecuTorchModule.h"
diff --git a/packages/react-native-executorch/third-party/include/executorch/ExecuTorchModule.h b/packages/react-native-executorch/third-party/include/executorch/ExecuTorchModule.h
index 823e5cf5cb..51b8abfa68 100644
--- a/packages/react-native-executorch/third-party/include/executorch/ExecuTorchModule.h
+++ b/packages/react-native-executorch/third-party/include/executorch/ExecuTorchModule.h
@@ -6,6 +6,8 @@
  * LICENSE file in the root directory of this source tree.
  */
 
+#import "ExecuTorchBackendOption.h"
+#import "ExecuTorchBackendOptionsMap.h"
 #import "ExecuTorchValue.h"
 
 NS_ASSUME_NONNULL_BEGIN
@@ -198,6 +200,37 @@ NS_SWIFT_NAME(Module)
  */
 - (BOOL)load:(NSError **)error;
 
+/**
+ * Loads the module's program with per-delegate backend options.
+ *
+ * The receiver retains @c options for as long as the underlying program
+ * references it (lifetime tracked via ARC).
+ *
+ * @param options A `ExecuTorchBackendOptionsMap` containing per-delegate
+ *        load-time configuration, built once via
+ *        `[ExecuTorchBackendOptionsMap mapWithOptions:error:]`.
+ * @param verification The verification level to apply when loading the program.
+ * @param error A pointer to an NSError pointer that will be set if an error
+ * occurs.
+ * @return YES if the program was successfully loaded; otherwise, NO.
+ */
+- (BOOL)loadWithOptions:(ExecuTorchBackendOptionsMap *)options
+           verification:(ExecuTorchVerification)verification
+                  error:(NSError **)error NS_REFINED_FOR_SWIFT;
+
+/**
+ * Loads the module's program with per-delegate backend options using minimal
+ * verification.
+ *
+ * @param options A `ExecuTorchBackendOptionsMap` containing per-delegate
+ *        load-time configuration.
+ * @param error A pointer to an NSError pointer that will be set if an error
+ * occurs.
+ * @return YES if the program was successfully loaded; otherwise, NO.
+ */
+- (BOOL)loadWithOptions:(ExecuTorchBackendOptionsMap *)options
+                  error:(NSError **)error NS_REFINED_FOR_SWIFT;
+
 /**
  * Checks if the module is loaded.
  *
@@ -215,6 +248,19 @@ NS_SWIFT_NAME(Module)
 - (BOOL)loadMethod:(NSString *)methodName
              error:(NSError **)error NS_SWIFT_NAME(load(_:));
 
+/**
+ * Loads a specific method from the program with per-delegate backend options.
+ *
+ * @param methodName A string representing the name of the method to load.
+ * @param options A `ExecuTorchBackendOptionsMap` containing per-delegate
+ *        load-time configuration.
+ * @param error A pointer to an NSError pointer that is set if an error occurs.
+ * @return YES if the method was successfully loaded; otherwise, NO.
+ */
+- (BOOL)loadMethod:(NSString *)methodName
+           options:(ExecuTorchBackendOptionsMap *)options
+             error:(NSError **)error NS_REFINED_FOR_SWIFT;
+
 /**
  * Checks if a specific method is loaded.
  *
diff --git a/packages/react-native-executorch/third-party/include/executorch/extension/data_loader/buffer_data_loader.h b/packages/react-native-executorch/third-party/include/executorch/extension/data_loader/buffer_data_loader.h
index 38322aff43..b3744d0970 100644
--- a/packages/react-native-executorch/third-party/include/executorch/extension/data_loader/buffer_data_loader.h
+++ b/packages/react-native-executorch/third-party/include/executorch/extension/data_loader/buffer_data_loader.h
@@ -36,9 +36,10 @@ class BufferDataLoader final : public executorch::runtime::DataLoader {
        ET_UNUSED const DataLoader::SegmentInfo &segment_info) const override {
     size_t total_size;
     bool overflow = c10::add_overflows(offset, size, &total_size);
-    ET_CHECK_OR_RETURN_ERROR(!overflow && total_size <= size_, InvalidArgument,
-                             "offset %zu + size %zu > size_ %zu", offset, size,
-                             size_);
+    ET_CHECK_OR_RETURN_ERROR(
+        !overflow && total_size <= size_, InvalidArgument,
+        "offset %zu + size %zu > size_ %zu, or overflow detected", offset, size,
+        size_);
     return executorch::runtime::FreeableBuffer(data_ + offset, size,
                                                /*free_fn=*/nullptr);
   }
diff --git a/packages/react-native-executorch/third-party/include/executorch/extension/data_loader/mman.h b/packages/react-native-executorch/third-party/include/executorch/extension/data_loader/mman.h
index 788560c168..c35fd07f46 100644
--- a/packages/react-native-executorch/third-party/include/executorch/extension/data_loader/mman.h
+++ b/packages/react-native-executorch/third-party/include/executorch/extension/data_loader/mman.h
@@ -17,6 +17,7 @@
 
 #ifndef _WIN32
 
+#include <fcntl.h>
 #include <sys/mman.h>
 #include <unistd.h>
 
@@ -41,6 +42,34 @@ ET_INLINE off_t get_mmap_offset(size_t offset) {
   return static_cast<off_t>(offset);
 }
 
+/**
+ * Hint the kernel to prefetch pages eagerly and to optimize for sequential
+ * reads. Intended to reduce page-fault stutter during model initialization
+ * when the caller does not want to mlock the pages into RAM.
+ */
+ET_INLINE void madvise_pages_willneed_sequential(void *addr, size_t len) {
+  ::madvise(addr, len, MADV_WILLNEED);
+  ::madvise(addr, len, MADV_SEQUENTIAL);
+}
+
+/**
+ * On Apple platforms, schedule kernel read-ahead on the file descriptor itself
+ * via fcntl(F_RDADVISE). This is more aggressive than madvise for cold starts:
+ * it brings pages into the unified buffer cache so first-touch faults are
+ * serviced from RAM instead of storage. No-op on non-Apple POSIX platforms.
+ */
+ET_INLINE void fcntl_rdadvise_apple(int fd, size_t file_size) {
+#if defined(__APPLE__)
+  // ra_count is an int; clamp so files >= 2 GiB don't wrap to a bogus count.
+  const size_t count = file_size < 0x7fffffffu ? file_size : 0x7fffffffu;
+  struct radvisory advice = {/*ra_offset=*/0, static_cast<int>(count)};
+  ::fcntl(fd, F_RDADVISE, &advice);
+#else
+  (void)fd;
+  (void)file_size;
+#endif
+}
+
 #else
 
 #define NOMINMAX
@@ -78,4 +107,21 @@ ET_INLINE uint64_t get_mmap_offset(size_t offset) {
   return static_cast<uint64_t>(offset);
 }
 
+/**
+ * No-op on Windows: there is no direct equivalent to madvise(MADV_WILLNEED |
+ * MADV_SEQUENTIAL) and the existing mman_windows shim does not implement one.
+ */
+ET_INLINE void madvise_pages_willneed_sequential(void *addr, size_t len) {
+  (void)addr;
+  (void)len;
+}
+
+/**
+ * No-op on Windows: F_RDADVISE is an Apple-specific fcntl command.
+ */
+ET_INLINE void fcntl_rdadvise_apple(int fd, size_t file_size) {
+  (void)fd;
+  (void)file_size;
+}
+
 #endif
diff --git a/packages/react-native-executorch/third-party/include/executorch/extension/data_loader/mmap_data_loader.h b/packages/react-native-executorch/third-party/include/executorch/extension/data_loader/mmap_data_loader.h
index cbc2ced2d9..1b648ce35f 100644
--- a/packages/react-native-executorch/third-party/include/executorch/extension/data_loader/mmap_data_loader.h
+++ b/packages/react-native-executorch/third-party/include/executorch/extension/data_loader/mmap_data_loader.h
@@ -38,6 +38,10 @@ class MmapDataLoader final : public executorch::runtime::DataLoader {
     UseMlock,
     /// Call `mlock()` on loaded pages, ignoring errors if it fails.
     UseMlockIgnoreErrors,
+    /// Use madvise(MADV_WILLNEED | MADV_SEQUENTIAL) instead of mlock.
+    /// Tells the kernel to prefetch pages eagerly and optimize for
+    /// sequential reads, without pinning them in RAM.
+    UseMadvise,
   };
 
   /**
diff --git a/packages/react-native-executorch/third-party/include/executorch/extension/data_loader/shared_ptr_data_loader.h b/packages/react-native-executorch/third-party/include/executorch/extension/data_loader/shared_ptr_data_loader.h
index dc0e179187..e9bb26322f 100644
--- a/packages/react-native-executorch/third-party/include/executorch/extension/data_loader/shared_ptr_data_loader.h
+++ b/packages/react-native-executorch/third-party/include/executorch/extension/data_loader/shared_ptr_data_loader.h
@@ -8,6 +8,7 @@
 
 #pragma once
 
+#include <c10/util/safe_numerics.h>
 #include <executorch/runtime/core/data_loader.h>
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/core/result.h>
@@ -32,9 +33,12 @@ class SharedPtrDataLoader final : public executorch::runtime::DataLoader {
   executorch::runtime::Result<executorch::runtime::FreeableBuffer>
   load(size_t offset, size_t size,
        ET_UNUSED const DataLoader::SegmentInfo &segment_info) const override {
-    ET_CHECK_OR_RETURN_ERROR(offset + size <= size_, InvalidArgument,
-                             "offset %zu + size %zu > size_ %zu", offset, size,
-                             size_);
+    size_t total_size;
+    bool overflow = c10::add_overflows(offset, size, &total_size);
+    ET_CHECK_OR_RETURN_ERROR(
+        !overflow && total_size <= size_, InvalidArgument,
+        "offset %zu + size %zu > size_ %zu, or overflow detected", offset, size,
+        size_);
     return executorch::runtime::FreeableBuffer(
         static_cast<uint8_t *>(data_.get()) + offset, size,
         /*free_fn=*/nullptr);
diff --git a/packages/react-native-executorch/third-party/include/executorch/extension/module/module.h b/packages/react-native-executorch/third-party/include/executorch/extension/module/module.h
index 278f996ed2..9dadc846ee 100644
--- a/packages/react-native-executorch/third-party/include/executorch/extension/module/module.h
+++ b/packages/react-native-executorch/third-party/include/executorch/extension/module/module.h
@@ -14,6 +14,8 @@
 #include <unordered_set>
 #include <vector>
 
+#include <executorch/runtime/backend/backend_options_map.h>
+#include <executorch/runtime/backend/options.h>
 #include <executorch/runtime/executor/program.h>
 
 #ifdef USE_ATEN_LIB
@@ -25,6 +27,7 @@
 namespace executorch {
 namespace extension {
 
+using ET_RUNTIME_NAMESPACE::Kernel;
 using ET_RUNTIME_NAMESPACE::Method;
 using ET_RUNTIME_NAMESPACE::MethodMeta;
 using ET_RUNTIME_NAMESPACE::NamedDataMap;
@@ -51,6 +54,8 @@ class Module {
     MmapUseMlock,
     /// Use memory locking and ignore errors.
     MmapUseMlockIgnoreErrors,
+    /// Use mmap with madvise(MADV_WILLNEED | MADV_SEQUENTIAL) hints.
+    MmapUseMadvise,
   };
 
   /**
@@ -182,9 +187,18 @@ class Module {
   /**
    * Loads the program with per-delegate runtime options.
    *
-   * @param[in] backend_options A LoadBackendOptionsMap containing per-delegate
-   * load-time configuration options. The caller must ensure this object
-   * outlives any methods loaded with these options.
+   * The Module deep-copies `backend_options` into internal storage, so the
+   * caller may release the input (and any backing BackendOption arrays its
+   * Spans referenced) immediately after this call returns. Future lazy
+   * `load_method` calls (e.g. triggered by `forward`) consume the
+   * Module-owned copy.
+   *
+   * Transactional: on failure, the previously-installed backend options
+   * (if any) are left in place; the input is not committed.
+   *
+   * @param[in] backend_options A LoadBackendOptionsMap containing
+   * per-delegate load-time configuration options. Deep-copied into the
+   * Module on success; not retained on failure.
    * @param[in] verification The type of verification to do before returning
    * success.
    *
@@ -195,6 +209,21 @@ class Module {
        const Program::Verification verification =
            Program::Verification::Minimal);
 
+  /**
+   * Returns the deep-copied LoadBackendOptionsMap most recently installed
+   * via `load(LoadBackendOptionsMap, ...)`. The returned reference is owned
+   * by the Module and remains valid until the next call to
+   * `load(LoadBackendOptionsMap, ...)` or until the Module is destroyed.
+   *
+   * If `load(LoadBackendOptionsMap, ...)` has never been called, returns a
+   * default-constructed (empty, `size() == 0`) map.
+   *
+   * @returns Const reference to the Module-owned LoadBackendOptionsMap.
+   */
+  inline const LoadBackendOptionsMap &backend_options() const {
+    return backend_options_map_;
+  }
+
   /**
    * Checks if the program is loaded.
    *
@@ -246,7 +275,8 @@ class Module {
   load_method(const std::string &method_name,
               runtime::HierarchicalAllocator *planned_memory = nullptr,
               torch::executor::EventTracer *event_tracer = nullptr,
-              const LoadBackendOptionsMap *backend_options = nullptr);
+              const LoadBackendOptionsMap *backend_options = nullptr,
+              std::vector<Kernel> kernel_registry = {});
 
   ET_DEPRECATED ET_NODISCARD runtime::Error inline load_method(
       const std::string &method_name,
@@ -294,9 +324,10 @@ class Module {
   ET_NODISCARD inline runtime::Error
   load_forward(runtime::HierarchicalAllocator *planned_memory = nullptr,
                torch::executor::EventTracer *event_tracer = nullptr,
-               const LoadBackendOptionsMap *backend_options = nullptr) {
-    return load_method("forward", planned_memory, event_tracer,
-                       backend_options);
+               const LoadBackendOptionsMap *backend_options = nullptr,
+               std::vector<Kernel> kernel_registry = {}) {
+    return load_method("forward", planned_memory, event_tracer, backend_options,
+                       std::move(kernel_registry));
   }
 
   ET_DEPRECATED ET_NODISCARD inline runtime::Error
@@ -678,6 +709,7 @@ class Module {
     std::unique_ptr<PlannedMemory> planned_memory;
     std::unique_ptr<runtime::MemoryManager> memory_manager;
     std::unique_ptr<Method> method;
+    std::vector<Kernel> kernel_registry;
   };
 
   std::string file_path_;
@@ -693,7 +725,14 @@ class Module {
   std::unique_ptr<NamedDataMap> merged_data_map_;
   std::vector<std::vector<uint8_t>> shared_arenas_;
   ET_DEPRECATED std::vector<uint8_t> debug_buffer_;
-  const LoadBackendOptionsMap *backend_options_ = nullptr;
+  // Module-owned deep-copy of the backend options most recently installed
+  // via load(LoadBackendOptionsMap, ...). `backend_options_storage_` owns
+  // the per-backend BackendOption arrays; `backend_options_map_` is a
+  // LoadBackendOptionsMap whose Spans reference those owned arrays. An
+  // empty map (`size() == 0`) is observationally indistinguishable from
+  // "never set" by downstream consumers, so we don't track that bit.
+  std::vector<std::vector<runtime::BackendOption>> backend_options_storage_;
+  LoadBackendOptionsMap backend_options_map_;
   bool share_memory_arenas_;
 
   ET_NODISCARD runtime::Error
diff --git a/packages/react-native-executorch/third-party/include/executorch/extension/tensor/tensor_ptr.h b/packages/react-native-executorch/third-party/include/executorch/extension/tensor/tensor_ptr.h
index 3dd4e890d3..15321ae608 100644
--- a/packages/react-native-executorch/third-party/include/executorch/extension/tensor/tensor_ptr.h
+++ b/packages/react-native-executorch/third-party/include/executorch/extension/tensor/tensor_ptr.h
@@ -14,6 +14,7 @@
 #include <vector>
 
 #include <c10/macros/Macros.h>
+#include <c10/util/safe_numerics.h>
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/core/exec_aten/exec_aten.h>
 #include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
@@ -105,13 +106,21 @@ make_tensor_ptr(std::vector<executorch::aten::SizesType> sizes,
                 executorch::aten::ScalarType type = deduced_type,
                 executorch::aten::TensorShapeDynamism dynamism =
                     executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND) {
-  ET_CHECK_MSG(data.size() ==
-                   executorch::aten::compute_numel(sizes.data(), sizes.size()),
+  auto numel_result = executorch::aten::safe_numel(sizes.data(), sizes.size());
+  ET_CHECK_MSG(numel_result.ok(), "safe_numel failed: %d",
+               static_cast<int>(numel_result.error()));
+  ET_CHECK_MSG(data.size() == static_cast<size_t>(numel_result.get()),
                "Data size does not match tensor size.");
   if (type != deduced_type) {
     ET_CHECK_MSG(runtime::canCast(deduced_type, type),
                  "Cannot cast deduced type to specified type.");
-    std::vector<uint8_t> casted_data(data.size() * aten::elementSize(type));
+    size_t casted_bytes = 0;
+    ET_CHECK_MSG(!c10::mul_overflows(
+                     data.size(), static_cast<size_t>(aten::elementSize(type)),
+                     &casted_bytes),
+                 "casted_data size overflow: %zu elements * %zu bytes/element",
+                 data.size(), static_cast<size_t>(aten::elementSize(type)));
+    std::vector<uint8_t> casted_data(casted_bytes);
 
     // Create a minimal context for error handling in ET_SWITCH
     struct {
@@ -327,8 +336,11 @@ make_tensor_ptr(const executorch::aten::Tensor &tensor,
   const auto same_rank = sizes.size() == static_cast<size_t>(tensor.dim());
   const auto same_shape = same_rank && std::equal(sizes.begin(), sizes.end(),
                                                   tensor.sizes().begin());
-  const auto element_count =
-      executorch::aten::compute_numel(sizes.data(), sizes.size());
+  auto element_count_result =
+      executorch::aten::safe_numel(sizes.data(), sizes.size());
+  ET_CHECK_MSG(element_count_result.ok(), "safe_numel failed: %d",
+               static_cast<int>(element_count_result.error()));
+  const auto element_count = element_count_result.get();
   const auto parent_element_count = tensor.numel();
   ET_CHECK_MSG(
       element_count <= parent_element_count,
diff --git a/packages/react-native-executorch/third-party/include/executorch/kernels/optimized/Functions.h b/packages/react-native-executorch/third-party/include/executorch/kernels/optimized/Functions.h
index 1b9b4a44b3..cd934e7997 100644
--- a/packages/react-native-executorch/third-party/include/executorch/kernels/optimized/Functions.h
+++ b/packages/react-native-executorch/third-party/include/executorch/kernels/optimized/Functions.h
@@ -91,6 +91,12 @@ TORCH_API inline torch::executor::Tensor & gelu_outf(torch::executor::KernelRunt
 }
 
 
+// aten::grid_sampler_2d.out(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners, *, Tensor(a!) out) -> Tensor(a!)
+TORCH_API inline torch::executor::Tensor & grid_sampler_2d_outf(torch::executor::KernelRuntimeContext & context, const torch::executor::Tensor & input, const torch::executor::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, torch::executor::Tensor & out) {
+    return ::torch::executor::native::opt_grid_sampler_2d_out(context, input, grid, interpolation_mode, padding_mode, align_corners, out);
+}
+
+
 // aten::le.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
 TORCH_API inline torch::executor::Tensor & le_outf(torch::executor::KernelRuntimeContext & context, const torch::executor::Tensor & self, const torch::executor::Scalar & other, torch::executor::Tensor & out) {
     return ::torch::executor::native::opt_le_scalar_out(context, self, other, out);
@@ -139,6 +145,12 @@ TORCH_API inline torch::executor::Tensor & sub_outf(torch::executor::KernelRunti
 }
 
 
+// aten::sum.IntList_out(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+TORCH_API inline torch::executor::Tensor & sum_outf(torch::executor::KernelRuntimeContext & context, const torch::executor::Tensor & self, torch::executor::optional<torch::executor::ArrayRef<int64_t>> dim, bool keepdim, torch::executor::optional<torch::executor::ScalarType> dtype, torch::executor::Tensor & out) {
+    return ::torch::executor::native::opt_sum_dim_out(context, self, dim, keepdim, dtype, out);
+}
+
+
 // aten::sub.Scalar_out(Tensor self, Scalar other, Scalar alpha=1, *, Tensor(a!) out) -> Tensor(a!)
 TORCH_API inline torch::executor::Tensor & sub_outf(torch::executor::KernelRuntimeContext & context, const torch::executor::Tensor & self, const torch::executor::Scalar & other, const torch::executor::Scalar & alpha, torch::executor::Tensor & out) {
     return ::torch::executor::native::opt_sub_scalar_out(context, self, other, alpha, out);
diff --git a/packages/react-native-executorch/third-party/include/executorch/kernels/optimized/NativeFunctions.h b/packages/react-native-executorch/third-party/include/executorch/kernels/optimized/NativeFunctions.h
index 3e83af3688..6dff2dae23 100644
--- a/packages/react-native-executorch/third-party/include/executorch/kernels/optimized/NativeFunctions.h
+++ b/packages/react-native-executorch/third-party/include/executorch/kernels/optimized/NativeFunctions.h
@@ -42,6 +42,8 @@ torch::executor::Tensor & opt_exp_out(const torch::executor::Tensor & self, torc
 torch::executor::Tensor & opt_exp_out(torch::executor::KernelRuntimeContext & context, const torch::executor::Tensor & self, torch::executor::Tensor & out);
 torch::executor::Tensor & opt_gelu_out(const torch::executor::Tensor & self, torch::executor::string_view approximate, torch::executor::Tensor & out);
 torch::executor::Tensor & opt_gelu_out(torch::executor::KernelRuntimeContext & context, const torch::executor::Tensor & self, torch::executor::string_view approximate, torch::executor::Tensor & out);
+torch::executor::Tensor & opt_grid_sampler_2d_out(const torch::executor::Tensor & input, const torch::executor::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, torch::executor::Tensor & out);
+torch::executor::Tensor & opt_grid_sampler_2d_out(torch::executor::KernelRuntimeContext & context, const torch::executor::Tensor & input, const torch::executor::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, torch::executor::Tensor & out);
 torch::executor::Tensor & opt_le_scalar_out(const torch::executor::Tensor & self, const torch::executor::Scalar & other, torch::executor::Tensor & out);
 torch::executor::Tensor & opt_le_scalar_out(torch::executor::KernelRuntimeContext & context, const torch::executor::Tensor & self, const torch::executor::Scalar & other, torch::executor::Tensor & out);
 torch::executor::Tensor & opt_le_tensor_out(const torch::executor::Tensor & self, const torch::executor::Tensor & other, torch::executor::Tensor & out);
@@ -58,6 +60,8 @@ ::std::tuple<torch::executor::Tensor &,torch::executor::Tensor &,torch::executor
 ::std::tuple<torch::executor::Tensor &,torch::executor::Tensor &,torch::executor::Tensor &> opt_native_layer_norm_out(torch::executor::KernelRuntimeContext & context, const torch::executor::Tensor & input, torch::executor::ArrayRef<int64_t> normalized_shape, const torch::executor::optional<torch::executor::Tensor> & weight, const torch::executor::optional<torch::executor::Tensor> & bias, double eps, torch::executor::Tensor & out0, torch::executor::Tensor & out1, torch::executor::Tensor & out2);
 torch::executor::Tensor & opt_sub_out(const torch::executor::Tensor & self, const torch::executor::Tensor & other, const torch::executor::Scalar & alpha, torch::executor::Tensor & out);
 torch::executor::Tensor & opt_sub_out(torch::executor::KernelRuntimeContext & context, const torch::executor::Tensor & self, const torch::executor::Tensor & other, const torch::executor::Scalar & alpha, torch::executor::Tensor & out);
+torch::executor::Tensor & opt_sum_dim_out(const torch::executor::Tensor & self, torch::executor::optional<torch::executor::ArrayRef<int64_t>> dim, bool keepdim, torch::executor::optional<torch::executor::ScalarType> dtype, torch::executor::Tensor & out);
+torch::executor::Tensor & opt_sum_dim_out(torch::executor::KernelRuntimeContext & context, const torch::executor::Tensor & self, torch::executor::optional<torch::executor::ArrayRef<int64_t>> dim, bool keepdim, torch::executor::optional<torch::executor::ScalarType> dtype, torch::executor::Tensor & out);
 torch::executor::Tensor & opt_sub_scalar_out(const torch::executor::Tensor & self, const torch::executor::Scalar & other, const torch::executor::Scalar & alpha, torch::executor::Tensor & out);
 torch::executor::Tensor & opt_sub_scalar_out(torch::executor::KernelRuntimeContext & context, const torch::executor::Tensor & self, const torch::executor::Scalar & other, const torch::executor::Scalar & alpha, torch::executor::Tensor & out);
 torch::executor::Tensor & opt_where_out(const torch::executor::Tensor & condition, const torch::executor::Tensor & self, const torch::executor::Tensor & other, torch::executor::Tensor & out);
diff --git a/packages/react-native-executorch/third-party/include/executorch/kernels/portable/Functions.h b/packages/react-native-executorch/third-party/include/executorch/kernels/portable/Functions.h
index b1911d7ae2..c96f3daf4b 100644
--- a/packages/react-native-executorch/third-party/include/executorch/kernels/portable/Functions.h
+++ b/packages/react-native-executorch/third-party/include/executorch/kernels/portable/Functions.h
@@ -25,12 +25,24 @@ namespace executor {
 
 namespace aten {
 
+// aten::_adaptive_avg_pool2d.out(Tensor self, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!)
+TORCH_API inline torch::executor::Tensor & _adaptive_avg_pool2d_outf(torch::executor::KernelRuntimeContext & context, const torch::executor::Tensor & self, torch::executor::ArrayRef<int64_t> output_size, torch::executor::Tensor & out) {
+    return ::torch::executor::native::_adaptive_avg_pool2d_out(context, self, output_size, out);
+}
+
+
 // aten::_cdist_forward.out(Tensor x1, Tensor x2, float p, int? compute_mode, *, Tensor(a!) out) -> Tensor(a!)
 TORCH_API inline torch::executor::Tensor & _cdist_forward_outf(torch::executor::KernelRuntimeContext & context, const torch::executor::Tensor & x1, const torch::executor::Tensor & x2, double p, torch::executor::optional<int64_t> compute_mode, torch::executor::Tensor & out) {
     return ::torch::executor::native::_cdist_forward_out(context, x1, x2, p, compute_mode, out);
 }
 
 
+// aten::_conj_physical.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+TORCH_API inline torch::executor::Tensor & _conj_physical_outf(torch::executor::KernelRuntimeContext & context, const torch::executor::Tensor & self, torch::executor::Tensor & out) {
+    return ::torch::executor::native::_conj_physical_out(context, self, out);
+}
+
+
 // aten::_log_softmax.out(Tensor self, int dim, bool half_to_float, *, Tensor(a!) out) -> Tensor(a!)
 TORCH_API inline torch::executor::Tensor & _log_softmax_outf(torch::executor::KernelRuntimeContext & context, const torch::executor::Tensor & self, int64_t dim, bool half_to_float, torch::executor::Tensor & out) {
     return ::torch::executor::native::log_softmax_out(context, self, dim, half_to_float, out);
@@ -1201,6 +1213,12 @@ TORCH_API inline torch::executor::Tensor & var_outf(torch::executor::KernelRunti
 }
 
 
+// aten::var_mean.correction_out(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+TORCH_API inline ::std::tuple<torch::executor::Tensor &,torch::executor::Tensor &> var_mean_outf(torch::executor::KernelRuntimeContext & context, const torch::executor::Tensor & self, torch::executor::optional<torch::executor::ArrayRef<int64_t>> dim, const torch::executor::optional<torch::executor::Scalar> & correction, bool keepdim, torch::executor::Tensor & out0, torch::executor::Tensor & out1) {
+    return ::torch::executor::native::var_mean_correction_out(context, self, dim, correction, keepdim, out0, out1);
+}
+
+
 // aten::var.out(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
 TORCH_API inline torch::executor::Tensor & var_outf(torch::executor::KernelRuntimeContext & context, const torch::executor::Tensor & self, torch::executor::optional<torch::executor::ArrayRef<int64_t>> dim, bool unbiased, bool keepdim, torch::executor::Tensor & out) {
     return ::torch::executor::native::var_out(context, self, dim, unbiased, keepdim, out);
diff --git a/packages/react-native-executorch/third-party/include/executorch/kernels/portable/NativeFunctions.h b/packages/react-native-executorch/third-party/include/executorch/kernels/portable/NativeFunctions.h
index c36896b7a1..e2a6cc3f52 100644
--- a/packages/react-native-executorch/third-party/include/executorch/kernels/portable/NativeFunctions.h
+++ b/packages/react-native-executorch/third-party/include/executorch/kernels/portable/NativeFunctions.h
@@ -20,8 +20,12 @@
 namespace torch {
 namespace executor {
 namespace native {
+torch::executor::Tensor & _adaptive_avg_pool2d_out(const torch::executor::Tensor & self, torch::executor::ArrayRef<int64_t> output_size, torch::executor::Tensor & out);
+torch::executor::Tensor & _adaptive_avg_pool2d_out(torch::executor::KernelRuntimeContext & context, const torch::executor::Tensor & self, torch::executor::ArrayRef<int64_t> output_size, torch::executor::Tensor & out);
 torch::executor::Tensor & _cdist_forward_out(const torch::executor::Tensor & x1, const torch::executor::Tensor & x2, double p, torch::executor::optional<int64_t> compute_mode, torch::executor::Tensor & out);
 torch::executor::Tensor & _cdist_forward_out(torch::executor::KernelRuntimeContext & context, const torch::executor::Tensor & x1, const torch::executor::Tensor & x2, double p, torch::executor::optional<int64_t> compute_mode, torch::executor::Tensor & out);
+torch::executor::Tensor & _conj_physical_out(const torch::executor::Tensor & self, torch::executor::Tensor & out);
+torch::executor::Tensor & _conj_physical_out(torch::executor::KernelRuntimeContext & context, const torch::executor::Tensor & self, torch::executor::Tensor & out);
 torch::executor::Tensor & log_softmax_out(const torch::executor::Tensor & self, int64_t dim, bool half_to_float, torch::executor::Tensor & out);
 torch::executor::Tensor & log_softmax_out(torch::executor::KernelRuntimeContext & context, const torch::executor::Tensor & self, int64_t dim, bool half_to_float, torch::executor::Tensor & out);
 ::std::tuple<torch::executor::Tensor &,torch::executor::Tensor &,torch::executor::Tensor &> _native_batch_norm_legit_out(const torch::executor::Tensor & input, const torch::executor::optional<torch::executor::Tensor> & weight, const torch::executor::optional<torch::executor::Tensor> & bias, torch::executor::Tensor & running_mean, torch::executor::Tensor & running_var, bool training, double momentum, double eps, torch::executor::Tensor & out, torch::executor::Tensor & save_mean, torch::executor::Tensor & save_invstd);
@@ -412,6 +416,8 @@ torch::executor::Tensor & upsample_nearest2d_vec_out(const torch::executor::Tens
 torch::executor::Tensor & upsample_nearest2d_vec_out(torch::executor::KernelRuntimeContext & context, const torch::executor::Tensor & input, torch::executor::optional<torch::executor::ArrayRef<int64_t>> output_size, torch::executor::optional<torch::executor::ArrayRef<double>> scale_factors, torch::executor::Tensor & out);
 torch::executor::Tensor & var_correction_out(const torch::executor::Tensor & self, torch::executor::optional<torch::executor::ArrayRef<int64_t>> dim, const torch::executor::optional<torch::executor::Scalar> & correction, bool keepdim, torch::executor::Tensor & out);
 torch::executor::Tensor & var_correction_out(torch::executor::KernelRuntimeContext & context, const torch::executor::Tensor & self, torch::executor::optional<torch::executor::ArrayRef<int64_t>> dim, const torch::executor::optional<torch::executor::Scalar> & correction, bool keepdim, torch::executor::Tensor & out);
+::std::tuple<torch::executor::Tensor &,torch::executor::Tensor &> var_mean_correction_out(const torch::executor::Tensor & self, torch::executor::optional<torch::executor::ArrayRef<int64_t>> dim, const torch::executor::optional<torch::executor::Scalar> & correction, bool keepdim, torch::executor::Tensor & out0, torch::executor::Tensor & out1);
+::std::tuple<torch::executor::Tensor &,torch::executor::Tensor &> var_mean_correction_out(torch::executor::KernelRuntimeContext & context, const torch::executor::Tensor & self, torch::executor::optional<torch::executor::ArrayRef<int64_t>> dim, const torch::executor::optional<torch::executor::Scalar> & correction, bool keepdim, torch::executor::Tensor & out0, torch::executor::Tensor & out1);
 torch::executor::Tensor & var_out(const torch::executor::Tensor & self, torch::executor::optional<torch::executor::ArrayRef<int64_t>> dim, bool unbiased, bool keepdim, torch::executor::Tensor & out);
 torch::executor::Tensor & var_out(torch::executor::KernelRuntimeContext & context, const torch::executor::Tensor & self, torch::executor::optional<torch::executor::ArrayRef<int64_t>> dim, bool unbiased, bool keepdim, torch::executor::Tensor & out);
 torch::executor::Tensor & view_as_real_copy_out(const torch::executor::Tensor & self, torch::executor::Tensor & out);
diff --git a/packages/react-native-executorch/third-party/include/executorch/runtime/backend/backend_options_map.h b/packages/react-native-executorch/third-party/include/executorch/runtime/backend/backend_options_map.h
index 8ae9543c29..5acf6cca2d 100644
--- a/packages/react-native-executorch/third-party/include/executorch/runtime/backend/backend_options_map.h
+++ b/packages/react-native-executorch/third-party/include/executorch/runtime/backend/backend_options_map.h
@@ -11,6 +11,7 @@
 #include <executorch/runtime/backend/options.h>
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/core/span.h>
+#include <executorch/runtime/platform/assert.h>
 
 #include <cstring>
 
@@ -168,6 +169,42 @@ class LoadBackendOptionsMap final {
    */
   size_t size() const { return size_; }
 
+  /**
+   * Non-owning view of a single (backend_id, options) entry, returned by
+   * entry_at(). The pointer / span are valid until the map is mutated or
+   * destroyed.
+   */
+  struct EntryView {
+    const char *backend_id = nullptr;
+    Span<const BackendOption> options;
+  };
+
+  /**
+   * Returns the (backend_id, options) entry at the given index for
+   * enumeration over the map's contents.
+   *
+   * @param index The entry index. Must be < size(); behavior is undefined
+   *     otherwise. Use this together with size() to walk every entry.
+   * @return EntryView referencing the entry's backend_id and options. The
+   *     view is valid until the next mutation of, or destruction of, this
+   *     map.
+   *
+   * Example:
+   * @code
+   *   for (size_t i = 0; i < map.size(); ++i) {
+   *     const auto entry = map.entry_at(i);
+   *     // use entry.backend_id and entry.options ...
+   *   }
+   * @endcode
+   */
+  EntryView entry_at(size_t index) const {
+    ET_DCHECK_MSG(index < size_, "entry_at index %zu out of bounds (size=%zu)",
+                  index, size_);
+    return EntryView{entries_[index].backend_id,
+                     Span<const BackendOption>(entries_[index].options.data(),
+                                               entries_[index].options.size())};
+  }
+
 private:
   static constexpr size_t kMaxBackends = 8;
   static constexpr size_t kMaxBackendIdLength = 64;
diff --git a/packages/react-native-executorch/third-party/include/executorch/runtime/core/array_ref.h b/packages/react-native-executorch/third-party/include/executorch/runtime/core/array_ref.h
index 01dd581663..867520137f 100644
--- a/packages/react-native-executorch/third-party/include/executorch/runtime/core/array_ref.h
+++ b/packages/react-native-executorch/third-party/include/executorch/runtime/core/array_ref.h
@@ -30,6 +30,7 @@
 #include <cstdint>
 
 #include <c10/util/irange.h>
+#include <c10/util/safe_numerics.h>
 #include <executorch/runtime/platform/assert.h>
 
 namespace executorch {
@@ -146,7 +147,8 @@ template <typename T> class ArrayRef final {
   /// slice(n, m) - Take M elements of the array starting at element N
   ArrayRef<T> slice(size_t N, size_t M) const {
     // cant slice longer then the array
-    ET_CHECK(N + M <= size());
+    size_t end = 0;
+    ET_CHECK(!c10::add_overflows(N, M, &end) && end <= size());
     return ArrayRef<T>(data() + N, M);
   }
 
diff --git a/packages/react-native-executorch/third-party/include/executorch/runtime/core/error.h b/packages/react-native-executorch/third-party/include/executorch/runtime/core/error.h
index 6500a6f907..c107dd43db 100644
--- a/packages/react-native-executorch/third-party/include/executorch/runtime/core/error.h
+++ b/packages/react-native-executorch/third-party/include/executorch/runtime/core/error.h
@@ -152,6 +152,7 @@ constexpr const char *to_string(const Error error) {
   case Error::RegistrationAlreadyRegistered:
     return "Error::RegistrationAlreadyRegistered";
   }
+  return "Error::Unknown";
 }
 
 } // namespace runtime
diff --git a/packages/react-native-executorch/third-party/include/executorch/runtime/core/evalue.h b/packages/react-native-executorch/third-party/include/executorch/runtime/core/evalue.h
index b3b0689ba7..2e8bb5389e 100644
--- a/packages/react-native-executorch/third-party/include/executorch/runtime/core/evalue.h
+++ b/packages/react-native-executorch/third-party/include/executorch/runtime/core/evalue.h
@@ -8,6 +8,7 @@
 
 #pragma once
 #include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/result.h>
 #include <executorch/runtime/core/tag.h>
 #include <executorch/runtime/platform/assert.h>
 
@@ -67,6 +68,29 @@ template <typename T> class BoxedEvalueList {
    */
   executorch::aten::ArrayRef<T> get() const;
 
+  /**
+   * Result-returning counterpart of get(). Validates each wrapped EValue's
+   * tag before materializing; returns Error::InvalidType if any element's
+   * tag does not match T and Error::InvalidState if any element pointer is
+   * null. Use this when materializing lists from untrusted .pte data so that
+   * a malformed program cannot force a process abort inside to<T>() /
+   * ET_CHECK.
+   */
+  Result<executorch::aten::ArrayRef<T>> tryGet() const;
+
+  /**
+   * Destroys the unwrapped elements without re-dereferencing wrapped_vals_.
+   * This is safe to call during EValue destruction because it does not
+   * dereference wrapped_vals_, which may point to EValues mutated by
+   * MoveCall instructions.
+   */
+  void destroy_elements() {
+    for (typename executorch::aten::ArrayRef<T>::size_type i = 0;
+         i < wrapped_vals_.size(); i++) {
+      unwrapped_vals_[i].~T();
+    }
+  }
+
 private:
   static EValue **checkWrappedVals(EValue **wrapped_vals, int size) {
     ET_CHECK_MSG(wrapped_vals != nullptr, "wrapped_vals cannot be null");
@@ -89,6 +113,10 @@ template <>
 executorch::aten::ArrayRef<std::optional<executorch::aten::Tensor>>
 BoxedEvalueList<std::optional<executorch::aten::Tensor>>::get() const;
 
+template <>
+Result<executorch::aten::ArrayRef<std::optional<executorch::aten::Tensor>>>
+BoxedEvalueList<std::optional<executorch::aten::Tensor>>::tryGet() const;
+
 // Aggregate typing system similar to IValue only slimmed down with less
 // functionality, no dependencies on atomic, and fewer supported types to better
 // suit embedded systems (ie no intrusive ptr)
@@ -165,6 +193,13 @@ struct EValue {
     return payload.copyable_union.as_int;
   }
 
+  Result<int64_t> tryToInt() const {
+    if (!isInt()) {
+      return Error::InvalidType;
+    }
+    return payload.copyable_union.as_int;
+  }
+
   /****** Double Type ******/
   /*implicit*/ EValue(double d) : tag(Tag::Double) {
     payload.copyable_union.as_double = d;
@@ -177,6 +212,13 @@ struct EValue {
     return payload.copyable_union.as_double;
   }
 
+  Result<double> tryToDouble() const {
+    if (!isDouble()) {
+      return Error::InvalidType;
+    }
+    return payload.copyable_union.as_double;
+  }
+
   /****** Bool Type ******/
   /*implicit*/ EValue(bool b) : tag(Tag::Bool) {
     payload.copyable_union.as_bool = b;
@@ -189,6 +231,13 @@ struct EValue {
     return payload.copyable_union.as_bool;
   }
 
+  Result<bool> tryToBool() const {
+    if (!isBool()) {
+      return Error::InvalidType;
+    }
+    return payload.copyable_union.as_bool;
+  }
+
   /****** Scalar Type ******/
   /// Construct an EValue using the implicit value of a Scalar.
   /*implicit*/ EValue(executorch::aten::Scalar s) {
@@ -224,6 +273,19 @@ struct EValue {
     }
   }
 
+  Result<executorch::aten::Scalar> tryToScalar() const {
+    if (isDouble()) {
+      return executorch::aten::Scalar(payload.copyable_union.as_double);
+    }
+    if (isInt()) {
+      return executorch::aten::Scalar(payload.copyable_union.as_int);
+    }
+    if (isBool()) {
+      return executorch::aten::Scalar(payload.copyable_union.as_bool);
+    }
+    return Error::InvalidType;
+  }
+
   /****** Tensor Type ******/
   /*implicit*/ EValue(executorch::aten::Tensor t) : tag(Tag::Tensor) {
     // When built in aten mode, at::Tensor has a non trivial constructor
@@ -270,6 +332,16 @@ struct EValue {
     return payload.as_tensor;
   }
 
+  // Returns a copy of the Tensor handle (one intrusive_ptr refcount bump in
+  // ATen mode; free in lean mode). Unlike toTensor()'s const& / & overloads,
+  // tryToTensor() cannot return a reference — Result<T> wraps by value.
+  Result<executorch::aten::Tensor> tryToTensor() const {
+    if (!isTensor()) {
+      return Error::InvalidType;
+    }
+    return payload.as_tensor;
+  }
+
   /****** String Type ******/
   /*implicit*/ EValue(executorch::aten::ArrayRef<char> *s) : tag(Tag::String) {
     ET_CHECK_MSG(s != nullptr, "ArrayRef<char> pointer cannot be null");
@@ -286,6 +358,17 @@ struct EValue {
                             payload.copyable_union.as_string_ptr->size());
   }
 
+  Result<std::string_view> tryToString() const {
+    if (!isString()) {
+      return Error::InvalidType;
+    }
+    if (payload.copyable_union.as_string_ptr == nullptr) {
+      return Error::InvalidState;
+    }
+    return std::string_view(payload.copyable_union.as_string_ptr->data(),
+                            payload.copyable_union.as_string_ptr->size());
+  }
+
   /****** Int List Type ******/
   /*implicit*/ EValue(BoxedEvalueList<int64_t> *i) : tag(Tag::ListInt) {
     ET_CHECK_MSG(i != nullptr,
@@ -302,6 +385,16 @@ struct EValue {
     return (payload.copyable_union.as_int_list_ptr)->get();
   }
 
+  Result<executorch::aten::ArrayRef<int64_t>> tryToIntList() const {
+    if (!isIntList()) {
+      return Error::InvalidType;
+    }
+    if (payload.copyable_union.as_int_list_ptr == nullptr) {
+      return Error::InvalidState;
+    }
+    return (payload.copyable_union.as_int_list_ptr)->tryGet();
+  }
+
   /****** Bool List Type ******/
   /*implicit*/ EValue(executorch::aten::ArrayRef<bool> *b)
       : tag(Tag::ListBool) {
@@ -318,6 +411,16 @@ struct EValue {
     return *(payload.copyable_union.as_bool_list_ptr);
   }
 
+  Result<executorch::aten::ArrayRef<bool>> tryToBoolList() const {
+    if (!isBoolList()) {
+      return Error::InvalidType;
+    }
+    if (payload.copyable_union.as_bool_list_ptr == nullptr) {
+      return Error::InvalidState;
+    }
+    return *(payload.copyable_union.as_bool_list_ptr);
+  }
+
   /****** Double List Type ******/
   /*implicit*/ EValue(executorch::aten::ArrayRef<double> *d)
       : tag(Tag::ListDouble) {
@@ -334,6 +437,16 @@ struct EValue {
     return *(payload.copyable_union.as_double_list_ptr);
   }
 
+  Result<executorch::aten::ArrayRef<double>> tryToDoubleList() const {
+    if (!isDoubleList()) {
+      return Error::InvalidType;
+    }
+    if (payload.copyable_union.as_double_list_ptr == nullptr) {
+      return Error::InvalidState;
+    }
+    return *(payload.copyable_union.as_double_list_ptr);
+  }
+
   /****** Tensor List Type ******/
   /*implicit*/ EValue(BoxedEvalueList<executorch::aten::Tensor> *t)
       : tag(Tag::ListTensor) {
@@ -351,6 +464,17 @@ struct EValue {
     return payload.copyable_union.as_tensor_list_ptr->get();
   }
 
+  Result<executorch::aten::ArrayRef<executorch::aten::Tensor>>
+  tryToTensorList() const {
+    if (!isTensorList()) {
+      return Error::InvalidType;
+    }
+    if (payload.copyable_union.as_tensor_list_ptr == nullptr) {
+      return Error::InvalidState;
+    }
+    return payload.copyable_union.as_tensor_list_ptr->tryGet();
+  }
+
   /****** List Optional Tensor Type ******/
   /*implicit*/ EValue(
       BoxedEvalueList<std::optional<executorch::aten::Tensor>> *t)
@@ -371,6 +495,17 @@ struct EValue {
     return payload.copyable_union.as_list_optional_tensor_ptr->get();
   }
 
+  Result<executorch::aten::ArrayRef<std::optional<executorch::aten::Tensor>>>
+  tryToListOptionalTensor() const {
+    if (!isListOptionalTensor()) {
+      return Error::InvalidType;
+    }
+    if (payload.copyable_union.as_list_optional_tensor_ptr == nullptr) {
+      return Error::InvalidState;
+    }
+    return payload.copyable_union.as_list_optional_tensor_ptr->tryGet();
+  }
+
   /****** ScalarType Type ******/
   executorch::aten::ScalarType toScalarType() const {
     ET_CHECK_MSG(isInt(), "EValue is not a ScalarType.");
@@ -378,6 +513,14 @@ struct EValue {
         payload.copyable_union.as_int);
   }
 
+  Result<executorch::aten::ScalarType> tryToScalarType() const {
+    if (!isInt()) {
+      return Error::InvalidType;
+    }
+    return static_cast<executorch::aten::ScalarType>(
+        payload.copyable_union.as_int);
+  }
+
   /****** MemoryFormat Type ******/
   executorch::aten::MemoryFormat toMemoryFormat() const {
     ET_CHECK_MSG(isInt(), "EValue is not a MemoryFormat.");
@@ -385,12 +528,27 @@ struct EValue {
         payload.copyable_union.as_int);
   }
 
+  Result<executorch::aten::MemoryFormat> tryToMemoryFormat() const {
+    if (!isInt()) {
+      return Error::InvalidType;
+    }
+    return static_cast<executorch::aten::MemoryFormat>(
+        payload.copyable_union.as_int);
+  }
+
   /****** Layout Type ******/
   executorch::aten::Layout toLayout() const {
     ET_CHECK_MSG(isInt(), "EValue is not a Layout.");
     return static_cast<executorch::aten::Layout>(payload.copyable_union.as_int);
   }
 
+  Result<executorch::aten::Layout> tryToLayout() const {
+    if (!isInt()) {
+      return Error::InvalidType;
+    }
+    return static_cast<executorch::aten::Layout>(payload.copyable_union.as_int);
+  }
+
   /****** Device Type ******/
   executorch::aten::Device toDevice() const {
     ET_CHECK_MSG(isInt(), "EValue is not a Device.");
@@ -399,12 +557,29 @@ struct EValue {
                                     -1);
   }
 
+  Result<executorch::aten::Device> tryToDevice() const {
+    if (!isInt()) {
+      return Error::InvalidType;
+    }
+    return executorch::aten::Device(static_cast<executorch::aten::DeviceType>(
+                                        payload.copyable_union.as_int),
+                                    -1);
+  }
+
   template <typename T> T to() &&;
   template <typename T>
   typename internal::evalue_to_const_ref_overload_return<T>::type to() const &;
   template <typename T>
   typename internal::evalue_to_ref_overload_return<T>::type to() &;
 
+  /**
+   * Result-returning equivalent of `to<T>()`. Tag mismatch returns
+   * `Error::InvalidType`; a null list/string payload returns
+   * `Error::InvalidState`. Specializations are defined below via
+   * `EVALUE_DEFINE_TRY_TO`.
+   */
+  template <typename T> Result<T> tryTo() const;
+
   /**
    * Converts the EValue to an optional object that can represent both T and
    * an uninitialized state.
@@ -416,6 +591,22 @@ struct EValue {
     return this->to<T>();
   }
 
+  /**
+   * Result-returning equivalent of `toOptional<T>()`. None maps to an empty
+   * optional; otherwise any `tryTo<T>()` error is propagated unchanged
+   * (`Error::InvalidType`, or `Error::InvalidState` for a null payload).
+   */
+  template <typename T> inline Result<std::optional<T>> tryToOptional() const {
+    if (this->isNone()) {
+      return std::optional<T>(std::nullopt);
+    }
+    auto r = this->tryTo<T>();
+    if (!r.ok()) {
+      return r.error();
+    }
+    return std::optional<T>(std::move(r.get()));
+  }
+
 private:
   // Pre cond: the payload value has had its destructor called
   void clearToNone() noexcept {
@@ -446,17 +637,10 @@ struct EValue {
       payload.as_tensor.~Tensor();
     } else if (isTensorList() &&
                payload.copyable_union.as_tensor_list_ptr != nullptr) {
-      // for (auto& tensor : toTensorList()) {
-      for (auto &tensor : payload.copyable_union.as_tensor_list_ptr->get()) {
-        tensor.~Tensor();
-      }
+      payload.copyable_union.as_tensor_list_ptr->destroy_elements();
     } else if (isListOptionalTensor() &&
                payload.copyable_union.as_list_optional_tensor_ptr != nullptr) {
-      // for (auto& optional_tensor : toListOptionalTensor()) {
-      for (auto &optional_tensor :
-           payload.copyable_union.as_list_optional_tensor_ptr->get()) {
-        optional_tensor.~optional();
-      }
+      payload.copyable_union.as_list_optional_tensor_ptr->destroy_elements();
     }
   }
 
@@ -532,6 +716,53 @@ EVALUE_DEFINE_TO(
     toListOptionalTensor)
 #undef EVALUE_DEFINE_TO
 
+#define EVALUE_DEFINE_TRY_TO(T, method_name)                                   \
+  template <> inline Result<T> EValue::tryTo<T>() const {                      \
+    return this->method_name();                                                \
+  }
+
+EVALUE_DEFINE_TRY_TO(executorch::aten::Scalar, tryToScalar)
+EVALUE_DEFINE_TRY_TO(int64_t, tryToInt)
+EVALUE_DEFINE_TRY_TO(bool, tryToBool)
+EVALUE_DEFINE_TRY_TO(double, tryToDouble)
+EVALUE_DEFINE_TRY_TO(std::string_view, tryToString)
+EVALUE_DEFINE_TRY_TO(executorch::aten::ScalarType, tryToScalarType)
+EVALUE_DEFINE_TRY_TO(executorch::aten::MemoryFormat, tryToMemoryFormat)
+EVALUE_DEFINE_TRY_TO(executorch::aten::Layout, tryToLayout)
+EVALUE_DEFINE_TRY_TO(executorch::aten::Device, tryToDevice)
+// Tensor and Optional Tensor
+EVALUE_DEFINE_TRY_TO(executorch::aten::Tensor, tryToTensor)
+EVALUE_DEFINE_TRY_TO(std::optional<executorch::aten::Tensor>,
+                     tryToOptional<executorch::aten::Tensor>)
+
+// IntList and Optional IntList
+EVALUE_DEFINE_TRY_TO(executorch::aten::ArrayRef<int64_t>, tryToIntList)
+EVALUE_DEFINE_TRY_TO(std::optional<executorch::aten::ArrayRef<int64_t>>,
+                     tryToOptional<executorch::aten::ArrayRef<int64_t>>)
+
+// DoubleList and Optional DoubleList
+EVALUE_DEFINE_TRY_TO(executorch::aten::ArrayRef<double>, tryToDoubleList)
+EVALUE_DEFINE_TRY_TO(std::optional<executorch::aten::ArrayRef<double>>,
+                     tryToOptional<executorch::aten::ArrayRef<double>>)
+
+// BoolList and Optional BoolList
+EVALUE_DEFINE_TRY_TO(executorch::aten::ArrayRef<bool>, tryToBoolList)
+EVALUE_DEFINE_TRY_TO(std::optional<executorch::aten::ArrayRef<bool>>,
+                     tryToOptional<executorch::aten::ArrayRef<bool>>)
+
+// TensorList and Optional TensorList
+EVALUE_DEFINE_TRY_TO(executorch::aten::ArrayRef<executorch::aten::Tensor>,
+                     tryToTensorList)
+EVALUE_DEFINE_TRY_TO(
+    std::optional<executorch::aten::ArrayRef<executorch::aten::Tensor>>,
+    tryToOptional<executorch::aten::ArrayRef<executorch::aten::Tensor>>)
+
+// List of Optional Tensor
+EVALUE_DEFINE_TRY_TO(
+    executorch::aten::ArrayRef<std::optional<executorch::aten::Tensor>>,
+    tryToListOptionalTensor)
+#undef EVALUE_DEFINE_TRY_TO
+
 template <typename T>
 executorch::aten::ArrayRef<T> BoxedEvalueList<T>::get() const {
   for (typename executorch::aten::ArrayRef<T>::size_type i = 0;
@@ -542,6 +773,22 @@ executorch::aten::ArrayRef<T> BoxedEvalueList<T>::get() const {
   return executorch::aten::ArrayRef<T>{unwrapped_vals_, wrapped_vals_.size()};
 }
 
+template <typename T>
+Result<executorch::aten::ArrayRef<T>> BoxedEvalueList<T>::tryGet() const {
+  for (typename executorch::aten::ArrayRef<T>::size_type i = 0;
+       i < wrapped_vals_.size(); i++) {
+    if (wrapped_vals_[i] == nullptr) {
+      return Error::InvalidState;
+    }
+    auto r = wrapped_vals_[i]->template tryTo<T>();
+    if (!r.ok()) {
+      return r.error();
+    }
+    unwrapped_vals_[i] = std::move(r.get());
+  }
+  return executorch::aten::ArrayRef<T>{unwrapped_vals_, wrapped_vals_.size()};
+}
+
 } // namespace runtime
 } // namespace executorch
 
diff --git a/packages/react-native-executorch/third-party/include/executorch/runtime/core/exec_aten/exec_aten.h b/packages/react-native-executorch/third-party/include/executorch/runtime/core/exec_aten/exec_aten.h
index b4a1b2721f..db5096a46a 100644
--- a/packages/react-native-executorch/third-party/include/executorch/runtime/core/exec_aten/exec_aten.h
+++ b/packages/react-native-executorch/third-party/include/executorch/runtime/core/exec_aten/exec_aten.h
@@ -8,7 +8,10 @@
 
 #pragma once
 
+#include <executorch/runtime/core/error.h>                 // @manual
+#include <executorch/runtime/core/result.h>                // @manual
 #include <executorch/runtime/core/tensor_shape_dynamism.h> // @manual
+#include <executorch/runtime/platform/assert.h>            // @manual
 #include <executorch/runtime/platform/compiler.h>
 #ifdef USE_ATEN_LIB
 #include <ATen/Tensor.h> // @manual
@@ -28,6 +31,7 @@
 #include <c10/util/quint2x4.h>      // @manual
 #include <c10/util/quint4x2.h>      // @manual
 #include <c10/util/quint8.h>        // @manual
+#include <c10/util/safe_numerics.h> // @manual
 #include <c10/util/string_view.h>   // @manual
 #include <torch/torch.h>
 #else                                                            // use executor
@@ -107,6 +111,25 @@ inline ssize_t compute_numel(const SizesType *sizes, ssize_t dim) {
       c10::multiply_integers(c10::ArrayRef<SizesType>(sizes, dim)));
 }
 
+inline ::executorch::runtime::Result<ssize_t> safe_numel(const SizesType *sizes,
+                                                         ssize_t dim) {
+  ET_CHECK_OR_RETURN_ERROR(dim == 0 || sizes != nullptr, InvalidArgument,
+                           "Sizes must be provided for non-scalar tensors");
+  ssize_t numel = 1;
+  for (ssize_t i = 0; i < dim; i++) {
+    ET_CHECK_OR_RETURN_ERROR(
+        sizes[i] >= 0, InvalidArgument,
+        "Size must be non-negative, got %zd at dimension %zd",
+        static_cast<ssize_t>(sizes[i]), i);
+    ssize_t next_numel;
+    ET_CHECK_OR_RETURN_ERROR(
+        !c10::mul_overflows(numel, static_cast<ssize_t>(sizes[i]), &next_numel),
+        InvalidArgument, "Overflow computing numel at dimension %zd", i);
+    numel = next_numel;
+  }
+  return numel;
+}
+
 #undef ET_PRI_TENSOR_SIZE
 #define ET_PRI_TENSOR_SIZE PRId64
 
@@ -153,6 +176,7 @@ using OptionalArrayRef =
 using OptionalIntArrayRef = OptionalArrayRef<int64_t>;
 
 using torch::executor::compute_numel;
+using torch::executor::safe_numel;
 
 #endif // Use ExecuTorch types
 
diff --git a/packages/react-native-executorch/third-party/include/executorch/runtime/core/hierarchical_allocator.h b/packages/react-native-executorch/third-party/include/executorch/runtime/core/hierarchical_allocator.h
index 9e2335501b..e061c9d182 100644
--- a/packages/react-native-executorch/third-party/include/executorch/runtime/core/hierarchical_allocator.h
+++ b/packages/react-native-executorch/third-party/include/executorch/runtime/core/hierarchical_allocator.h
@@ -9,6 +9,7 @@
 #pragma once
 
 #include <c10/util/irange.h>
+#include <c10/util/safe_numerics.h>
 
 #include <executorch/runtime/core/memory_allocator.h>
 #include <executorch/runtime/core/result.h>
@@ -56,17 +57,19 @@ class HierarchicalAllocator final {
                                                  size_t offset_bytes,
                                                  size_t size_bytes) {
     // Check for integer overflow in offset_bytes + size_bytes.
-    ET_CHECK_OR_RETURN_ERROR(size_bytes <= SIZE_MAX - offset_bytes,
-                             InvalidArgument,
-                             "Integer overflow in offset_bytes (%" ET_PRIsize_t
-                             ") + size_bytes (%" ET_PRIsize_t ")",
-                             offset_bytes, size_bytes);
+    size_t end_bytes = 0;
+    ET_CHECK_OR_RETURN_ERROR(
+        !c10::add_overflows(offset_bytes, size_bytes, &end_bytes),
+        InvalidArgument,
+        "Integer overflow in offset_bytes (%" ET_PRIsize_t
+        ") + size_bytes (%" ET_PRIsize_t ")",
+        offset_bytes, size_bytes);
     ET_CHECK_OR_RETURN_ERROR(memory_id < buffers_.size(), InvalidArgument,
                              "id %" PRIu32 " >= %" ET_PRIsize_t, memory_id,
                              buffers_.size());
     Span<uint8_t> buffer = buffers_[memory_id];
     ET_CHECK_OR_RETURN_ERROR(
-        offset_bytes + size_bytes <= buffer.size(), MemoryAllocationFailed,
+        end_bytes <= buffer.size(), MemoryAllocationFailed,
         "offset_bytes (%" ET_PRIsize_t ") + size_bytes (%" ET_PRIsize_t
         ") >= allocator size (%" ET_PRIsize_t ") "
         "for memory_id %" PRIu32,
diff --git a/packages/react-native-executorch/third-party/include/executorch/runtime/core/portable_type/device.h b/packages/react-native-executorch/third-party/include/executorch/runtime/core/portable_type/device.h
index c6a42c55f0..32c53e9570 100644
--- a/packages/react-native-executorch/third-party/include/executorch/runtime/core/portable_type/device.h
+++ b/packages/react-native-executorch/third-party/include/executorch/runtime/core/portable_type/device.h
@@ -26,7 +26,6 @@ enum class DeviceType : int8_t {
 constexpr size_t kNumDeviceTypes = 2;
 
 /// An index representing a specific device; e.g. GPU 0 vs GPU 1.
-/// -1 means the default/unspecified device for that type.
 using DeviceIndex = int8_t;
 
 /**
@@ -41,7 +40,7 @@ struct Device final {
 
   /// Constructs a new `Device` from a `DeviceType` and an optional device
   /// index.
-  /* implicit */ Device(DeviceType type, DeviceIndex index = -1)
+  /* implicit */ Device(DeviceType type, DeviceIndex index = 0)
       : type_(type), index_(index) {}
 
   /// Returns the type of device the tensor data resides on.
@@ -50,7 +49,7 @@ struct Device final {
   /// Returns true if the device is of CPU type.
   bool is_cpu() const noexcept { return type_ == DeviceType::CPU; }
 
-  /// Returns the device index, or -1 if default/unspecified.
+  /// Returns the device index.
   DeviceIndex index() const noexcept { return index_; }
 
   bool operator==(const Device &other) const noexcept {
@@ -63,7 +62,7 @@ struct Device final {
 
 private:
   DeviceType type_;
-  DeviceIndex index_ = -1;
+  DeviceIndex index_ = 0;
 };
 
 } // namespace etensor
diff --git a/packages/react-native-executorch/third-party/include/executorch/runtime/core/portable_type/tensor_impl.h b/packages/react-native-executorch/third-party/include/executorch/runtime/core/portable_type/tensor_impl.h
index 4f79070626..c841aef0c8 100644
--- a/packages/react-native-executorch/third-party/include/executorch/runtime/core/portable_type/tensor_impl.h
+++ b/packages/react-native-executorch/third-party/include/executorch/runtime/core/portable_type/tensor_impl.h
@@ -10,8 +10,11 @@
 
 #include <executorch/runtime/core/array_ref.h>
 #include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/portable_type/device.h>
 #include <executorch/runtime/core/portable_type/scalar_type.h>
+#include <executorch/runtime/core/result.h>
 #include <executorch/runtime/core/tensor_shape_dynamism.h>
+#include <executorch/runtime/platform/compiler.h>
 
 // Forward declaration of a helper that provides access to internal resizing
 // methods of TensorImpl. Real definition is in
@@ -99,11 +102,15 @@ class TensorImpl {
    * @param strides Strides of the tensor at each dimension. Must contain `dim`
    *     entries.
    * @param dynamism The mutability of the shape of the tensor.
+   * @param device_type The type of device where tensor data resides.
+   * @param device_index The device index for multi-device scenarios.
    */
   TensorImpl(ScalarType type, ssize_t dim, SizesType *sizes,
              void *data = nullptr, DimOrderType *dim_order = nullptr,
              StridesType *strides = nullptr,
-             TensorShapeDynamism dynamism = TensorShapeDynamism::STATIC);
+             TensorShapeDynamism dynamism = TensorShapeDynamism::STATIC,
+             DeviceType device_type = DeviceType::CPU,
+             DeviceIndex device_index = 0);
 
   /**
    * Returns the size of the tensor in bytes.
@@ -161,6 +168,15 @@ class TensorImpl {
   /// Returns the mutability of the shape of the tensor.
   TensorShapeDynamism shape_dynamism() const { return shape_dynamism_; }
 
+  /// Returns the device where tensor data resides.
+  Device device() const { return device_; }
+
+  /// Returns the type of device where tensor data resides.
+  DeviceType device_type() const { return device_.type(); }
+
+  /// Returns the device index, or 0 if default/unspecified.
+  DeviceIndex device_index() const { return device_.index(); }
+
   /// Returns a pointer of type T to the constant underlying data blob.
   template <typename T> inline const T *data() const {
     return static_cast<const T *>(data());
@@ -238,6 +254,9 @@ class TensorImpl {
 
   /// Specifies the mutability of the shape of the tensor.
   const TensorShapeDynamism shape_dynamism_;
+
+  /// Device where tensor data resides (CPU, CUDA, etc.)
+  Device device_;
 };
 
 /**
@@ -247,6 +266,16 @@ ssize_t compute_numel(
     const ::executorch::runtime::etensor::TensorImpl::SizesType *sizes,
     ssize_t dim);
 
+/**
+ * Compute the number of elements based on the sizes of a tensor.
+ * Returns Error::InvalidArgument if any intermediate multiplication would
+ * overflow ssize_t, or if a size is negative. Prefer this over compute_numel()
+ * for paths that can propagate an Error upward.
+ */
+::executorch::runtime::Result<ssize_t>
+safe_numel(const ::executorch::runtime::etensor::TensorImpl::SizesType *sizes,
+           ssize_t dim);
+
 /// Appropriate format specifier for the result of calling
 /// size(). Must be used instead of using zd directly to support ATen
 /// mode.
@@ -276,6 +305,7 @@ namespace executor {
 // TODO(T197294990): Remove these deprecated aliases once all users have moved
 // to the new `::executorch` namespaces.
 using ::executorch::runtime::etensor::compute_numel;
+using ::executorch::runtime::etensor::safe_numel;
 using ::executorch::runtime::etensor::TensorImpl;
 } // namespace executor
 } // namespace torch
diff --git a/packages/react-native-executorch/third-party/include/executorch/runtime/executor/method.h b/packages/react-native-executorch/third-party/include/executorch/runtime/executor/method.h
index aecf8c8fa0..dd66951ca5 100644
--- a/packages/react-native-executorch/third-party/include/executorch/runtime/executor/method.h
+++ b/packages/react-native-executorch/third-party/include/executorch/runtime/executor/method.h
@@ -23,6 +23,7 @@
 #include <executorch/runtime/executor/memory_manager.h>
 #include <executorch/runtime/executor/merged_data_map.h>
 #include <executorch/runtime/executor/method_meta.h>
+#include <executorch/runtime/kernel/operator_registry.h>
 #include <executorch/runtime/platform/compiler.h>
 
 // Forward declare flatbuffer types. This is a public header and must not
@@ -77,7 +78,7 @@ class Method final {
         merged_data_map_(std::move(rhs.merged_data_map_)),
         external_constants_(rhs.external_constants_),
         n_external_constants_(rhs.n_external_constants_),
-        init_state_(rhs.init_state_) {
+        kernel_registry_(rhs.kernel_registry_), init_state_(rhs.init_state_) {
     // Required: clear out fields that the dtor looks at, so that we don't free
     // anything twice.
     rhs.n_value_ = 0;
@@ -323,13 +324,15 @@ class Method final {
   };
 
   Method(const Program *program, MemoryManager *memory_manager,
-         EventTracer *event_tracer, MemoryAllocator *temp_allocator)
+         EventTracer *event_tracer, MemoryAllocator *temp_allocator,
+         Span<const Kernel> kernel_registry = {})
       : step_state_(), program_(program), memory_manager_(memory_manager),
         temp_allocator_(temp_allocator), serialization_plan_(nullptr),
         event_tracer_(event_tracer), n_value_(0), values_(nullptr),
         input_set_(nullptr), n_delegate_(0), delegates_(nullptr), n_chains_(0),
         chains_(nullptr), merged_data_map_(nullptr),
         external_constants_(nullptr), n_external_constants_(0),
+        kernel_registry_(kernel_registry),
         init_state_(InitializationState::Uninitialized) {}
 
   /// Static factory used by Program.
@@ -337,7 +340,8 @@ class Method final {
   load(executorch_flatbuffer::ExecutionPlan *s_plan, const Program *program,
        MemoryManager *memory_manager, EventTracer *event_tracer,
        const NamedDataMap *named_data_map,
-       const LoadBackendOptionsMap *backend_options = nullptr);
+       const LoadBackendOptionsMap *backend_options = nullptr,
+       Span<const Kernel> kernel_registry = {});
 
   /**
    * Initialize the method from its serialized representation.
@@ -382,6 +386,8 @@ class Method final {
   NamedData *external_constants_;
   size_t n_external_constants_ = 0;
 
+  Span<const Kernel> kernel_registry_;
+
   InitializationState init_state_;
 
   /**
diff --git a/packages/react-native-executorch/third-party/include/executorch/runtime/executor/method_meta.h b/packages/react-native-executorch/third-party/include/executorch/runtime/executor/method_meta.h
index 3d387c695a..2cacb49506 100644
--- a/packages/react-native-executorch/third-party/include/executorch/runtime/executor/method_meta.h
+++ b/packages/react-native-executorch/third-party/include/executorch/runtime/executor/method_meta.h
@@ -9,6 +9,7 @@
 #pragma once
 
 #include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/portable_type/device.h>
 #include <executorch/runtime/core/result.h>
 #include <executorch/runtime/core/span.h>
 #include <executorch/runtime/core/tag.h>
@@ -230,6 +231,19 @@ class MethodMeta final {
    */
   Result<int64_t> memory_planned_buffer_size(size_t index) const;
 
+  /**
+   * Get the device placement for the specified memory-planned buffer.
+   *
+   * For CPU-only programs (no non_const_buffer_device in the PTE), all buffers
+   * default to Device{CPU, 0}. For programs with device annotations, returns
+   * the device type and index that the buffer should be allocated on.
+   *
+   * @param[in] index The index of the buffer to look up (0-based, same
+   *     indexing as memory_planned_buffer_size()).
+   * @returns The Device on success, or an error on failure.
+   */
+  Result<etensor::Device> memory_planned_buffer_device(size_t index) const;
+
   /**
    * Check to see if a backend is used in this method.
    *
diff --git a/packages/react-native-executorch/third-party/include/executorch/runtime/executor/platform_memory_allocator.h b/packages/react-native-executorch/third-party/include/executorch/runtime/executor/platform_memory_allocator.h
index c80d274257..181181b923 100644
--- a/packages/react-native-executorch/third-party/include/executorch/runtime/executor/platform_memory_allocator.h
+++ b/packages/react-native-executorch/third-party/include/executorch/runtime/executor/platform_memory_allocator.h
@@ -12,6 +12,7 @@
 #include <cstdint>
 #include <stdio.h>
 
+#include <c10/util/safe_numerics.h>
 #include <executorch/runtime/core/memory_allocator.h>
 #include <executorch/runtime/platform/log.h>
 #include <executorch/runtime/platform/platform.h>
@@ -46,8 +47,17 @@ class PlatformMemoryAllocator final : public MemoryAllocator {
       return nullptr;
     }
 
-    // Allocate enough memory for the node, the data and the alignment bump.
-    size_t alloc_size = sizeof(AllocationNode) + size + alignment;
+    // Check for overflow before computing total allocation size.
+    // Allocate enough for the node, data, and alignment bump (at most
+    // alignment - 1 extra bytes to align the data pointer).
+    size_t alloc_size = 0;
+    if (c10::add_overflows(sizeof(AllocationNode), size, &alloc_size) ||
+        c10::add_overflows(alloc_size, alignment - 1, &alloc_size)) {
+      ET_LOG(Error, "Allocation size overflow: size %zu, alignment %zu", size,
+             alignment);
+      return nullptr;
+    }
+
     void *node_memory = runtime::pal_allocate(alloc_size);
 
     // If allocation failed, log message and return nullptr.
diff --git a/packages/react-native-executorch/third-party/include/executorch/runtime/executor/program.h b/packages/react-native-executorch/third-party/include/executorch/runtime/executor/program.h
index 59ae346f36..01dbc1fdfa 100644
--- a/packages/react-native-executorch/third-party/include/executorch/runtime/executor/program.h
+++ b/packages/react-native-executorch/third-party/include/executorch/runtime/executor/program.h
@@ -21,6 +21,7 @@
 #include <executorch/runtime/executor/method.h>
 #include <executorch/runtime/executor/method_meta.h>
 #include <executorch/runtime/executor/pte_data_map.h>
+#include <executorch/runtime/kernel/operator_registry.h>
 #include <executorch/runtime/platform/compiler.h>
 
 // Forward declare flatbuffer types. This is a public header and must not
@@ -148,7 +149,8 @@ class Program final {
   load_method(const char *method_name, MemoryManager *memory_manager,
               EventTracer *event_tracer = nullptr,
               const NamedDataMap *named_data_map = nullptr,
-              const LoadBackendOptionsMap *backend_options = nullptr) const;
+              const LoadBackendOptionsMap *backend_options = nullptr,
+              Span<const Kernel> kernel_registry = {}) const;
 
   /**
    * Gathers metadata for the named method.
diff --git a/packages/react-native-executorch/third-party/include/executorch/runtime/executor/tensor_parser.h b/packages/react-native-executorch/third-party/include/executorch/runtime/executor/tensor_parser.h
index e1fc971d36..9d18e7a351 100644
--- a/packages/react-native-executorch/third-party/include/executorch/runtime/executor/tensor_parser.h
+++ b/packages/react-native-executorch/third-party/include/executorch/runtime/executor/tensor_parser.h
@@ -92,8 +92,12 @@ parseListOptionalType(const flatbuffers::Vector<int32_t> *value_indices,
       ET_CHECK_OR_RETURN_ERROR(
           index >= 0 && static_cast<size_t>(index) < values_len, InvalidProgram,
           "Invalid value index %" PRId32 " for ListOptional", index);
+      auto optional_result = values[index].tryToOptional<T>();
+      if (!optional_result.ok()) {
+        return optional_result.error();
+      }
       new (&optional_tensor_list[output_idx])
-          std::optional<T>(values[index].toOptional<T>());
+          std::optional<T>(std::move(optional_result.get()));
       evalp_list[output_idx] = &values[static_cast<size_t>(index)];
     }
     output_idx++;
diff --git a/packages/react-native-executorch/third-party/include/executorch/runtime/kernel/operator_registry.h b/packages/react-native-executorch/third-party/include/executorch/runtime/kernel/operator_registry.h
index ddb648d863..bc6c6729d5 100644
--- a/packages/react-native-executorch/third-party/include/executorch/runtime/kernel/operator_registry.h
+++ b/packages/react-native-executorch/third-party/include/executorch/runtime/kernel/operator_registry.h
@@ -216,6 +216,15 @@ ::executorch::runtime::Result<OpFunction>
 get_op_function_from_registry(const char *name,
                               Span<const TensorMeta> meta_list = {});
 
+/**
+ * Returns the operator with a given name and TensorMeta list from the provided
+ * kernel list instead of the global registry.
+ */
+::executorch::runtime::Result<OpFunction>
+get_op_function_from_registry(const char *name,
+                              Span<const TensorMeta> meta_list,
+                              Span<const Kernel> kernel_list);
+
 /**
  * Returns all registered kernels.
  */
diff --git a/packages/react-native-executorch/third-party/ios/ExecutorchLib.xcframework/ios-arm64-simulator/ExecutorchLib.framework/ExecutorchLib b/packages/react-native-executorch/third-party/ios/ExecutorchLib.xcframework/ios-arm64-simulator/ExecutorchLib.framework/ExecutorchLib
index f74ed53c6a..d081c6501b 100755
Binary files a/packages/react-native-executorch/third-party/ios/ExecutorchLib.xcframework/ios-arm64-simulator/ExecutorchLib.framework/ExecutorchLib and b/packages/react-native-executorch/third-party/ios/ExecutorchLib.xcframework/ios-arm64-simulator/ExecutorchLib.framework/ExecutorchLib differ
diff --git a/packages/react-native-executorch/third-party/ios/ExecutorchLib.xcframework/ios-arm64-simulator/ExecutorchLib.framework/Info.plist b/packages/react-native-executorch/third-party/ios/ExecutorchLib.xcframework/ios-arm64-simulator/ExecutorchLib.framework/Info.plist
index b2b2aa2478..4b7da1be30 100644
Binary files a/packages/react-native-executorch/third-party/ios/ExecutorchLib.xcframework/ios-arm64-simulator/ExecutorchLib.framework/Info.plist and b/packages/react-native-executorch/third-party/ios/ExecutorchLib.xcframework/ios-arm64-simulator/ExecutorchLib.framework/Info.plist differ
diff --git a/packages/react-native-executorch/third-party/ios/ExecutorchLib.xcframework/ios-arm64-simulator/ExecutorchLib.framework/mlx.metallib b/packages/react-native-executorch/third-party/ios/ExecutorchLib.xcframework/ios-arm64-simulator/ExecutorchLib.framework/mlx.metallib
new file mode 100644
index 0000000000..104cc44a31
Binary files /dev/null and b/packages/react-native-executorch/third-party/ios/ExecutorchLib.xcframework/ios-arm64-simulator/ExecutorchLib.framework/mlx.metallib differ
diff --git a/packages/react-native-executorch/third-party/ios/ExecutorchLib.xcframework/ios-arm64/ExecutorchLib.framework/ExecutorchLib b/packages/react-native-executorch/third-party/ios/ExecutorchLib.xcframework/ios-arm64/ExecutorchLib.framework/ExecutorchLib
index 61193b77ef..160890caab 100755
Binary files a/packages/react-native-executorch/third-party/ios/ExecutorchLib.xcframework/ios-arm64/ExecutorchLib.framework/ExecutorchLib and b/packages/react-native-executorch/third-party/ios/ExecutorchLib.xcframework/ios-arm64/ExecutorchLib.framework/ExecutorchLib differ
diff --git a/packages/react-native-executorch/third-party/ios/ExecutorchLib.xcframework/ios-arm64/ExecutorchLib.framework/Info.plist b/packages/react-native-executorch/third-party/ios/ExecutorchLib.xcframework/ios-arm64/ExecutorchLib.framework/Info.plist
index a6f2d4a5dc..38f26327ea 100644
Binary files a/packages/react-native-executorch/third-party/ios/ExecutorchLib.xcframework/ios-arm64/ExecutorchLib.framework/Info.plist and b/packages/react-native-executorch/third-party/ios/ExecutorchLib.xcframework/ios-arm64/ExecutorchLib.framework/Info.plist differ
diff --git a/packages/react-native-executorch/third-party/ios/ExecutorchLib.xcframework/ios-arm64/ExecutorchLib.framework/mlx.metallib b/packages/react-native-executorch/third-party/ios/ExecutorchLib.xcframework/ios-arm64/ExecutorchLib.framework/mlx.metallib
new file mode 100644
index 0000000000..104cc44a31
Binary files /dev/null and b/packages/react-native-executorch/third-party/ios/ExecutorchLib.xcframework/ios-arm64/ExecutorchLib.framework/mlx.metallib differ
diff --git a/packages/react-native-executorch/third-party/ios/ExecutorchLib/ExecutorchLib.xcodeproj/project.pbxproj b/packages/react-native-executorch/third-party/ios/ExecutorchLib/ExecutorchLib.xcodeproj/project.pbxproj
index a4d139dcaa..c51f7098ea 100644
--- a/packages/react-native-executorch/third-party/ios/ExecutorchLib/ExecutorchLib.xcodeproj/project.pbxproj
+++ b/packages/react-native-executorch/third-party/ios/ExecutorchLib/ExecutorchLib.xcodeproj/project.pbxproj
@@ -11,6 +11,7 @@
 		0E4A7F482D67549100D8DCBA /* MetalPerformanceShaders.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 0E4A7F452D67549100D8DCBA /* MetalPerformanceShaders.framework */; };
 		0E4A7F492D67549100D8DCBA /* MetalPerformanceShadersGraph.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 0E4A7F462D67549100D8DCBA /* MetalPerformanceShadersGraph.framework */; };
 		5576B4B92CEF970E005027B7 /* ETModel.mm in Sources */ = {isa = PBXBuildFile; fileRef = 5576B4B82CEF970C005027B7 /* ETModel.mm */; };
+		A1MLX0001RESBUILD0001 /* mlx.metallib in Resources */ = {isa = PBXBuildFile; fileRef = A1MLX0001FILEREF0001 /* mlx.metallib */; };
 		55EA2C572CB90E7D004315B3 /* Accelerate.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 55EA2C562CB90E7D004315B3 /* Accelerate.framework */; };
 		55EA2C592CB90E80004315B3 /* CoreML.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 55EA2C582CB90E80004315B3 /* CoreML.framework */; };
 		55EA2C5B2CB90E85004315B3 /* libsqlite3.tbd in Frameworks */ = {isa = PBXBuildFile; fileRef = 55EA2C5A2CB90E85004315B3 /* libsqlite3.tbd */; };
@@ -22,6 +23,7 @@
 		0E4A7F462D67549100D8DCBA /* MetalPerformanceShadersGraph.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = MetalPerformanceShadersGraph.framework; path = System/Library/Frameworks/MetalPerformanceShadersGraph.framework; sourceTree = SDKROOT; };
 		5576B4B62CEF9705005027B7 /* ETModel.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = ETModel.h; sourceTree = "<group>"; };
 		5576B4B82CEF970C005027B7 /* ETModel.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = ETModel.mm; sourceTree = "<group>"; };
+		A1MLX0001FILEREF0001 /* mlx.metallib */ = {isa = PBXFileReference; lastKnownFileType = archive.metal-library; path = mlx.metallib; sourceTree = "<group>"; };
 		55EA2C1C2CB90C22004315B3 /* ExecutorchLib.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = ExecutorchLib.framework; sourceTree = BUILT_PRODUCTS_DIR; };
 		55EA2C562CB90E7D004315B3 /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; };
 		55EA2C582CB90E80004315B3 /* CoreML.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = CoreML.framework; path = System/Library/Frameworks/CoreML.framework; sourceTree = SDKROOT; };
@@ -66,6 +68,7 @@
 			isa = PBXGroup;
 			children = (
 				55EA2C352CB90C7A004315B3 /* Exported */,
+				A1MLX0001FILEREF0001 /* mlx.metallib */,
 			);
 			path = ExecutorchLib;
 			sourceTree = "<group>";
@@ -153,6 +156,7 @@
 			isa = PBXResourcesBuildPhase;
 			buildActionMask = 2147483647;
 			files = (
+				A1MLX0001RESBUILD0001 /* mlx.metallib in Resources */,
 			);
 			runOnlyForDeploymentPostprocessing = 0;
 		};
@@ -343,6 +347,8 @@
 					"-force_load",
 					"$(PROJECT_DIR)/../../../third-party/ios/libs/executorch/libbackend_mps_ios.a",
 					"-force_load",
+					"$(PROJECT_DIR)/../../../third-party/ios/libs/executorch/libbackend_mlx_ios.a",
+					"-force_load",
 					"$(PROJECT_DIR)/../../../third-party/ios/libs/executorch/libexecutorch_ios.a",
 					"-force_load",
 					"$(PROJECT_DIR)/../../../third-party/ios/libs/executorch/libexecutorch_llm_ios.a",
@@ -435,6 +441,8 @@
 					"-force_load",
 					"$(PROJECT_DIR)/../../../third-party/ios/libs/executorch/libbackend_mps_ios.a",
 					"-force_load",
+					"$(PROJECT_DIR)/../../../third-party/ios/libs/executorch/libbackend_mlx_ios.a",
+					"-force_load",
 					"$(PROJECT_DIR)/../../../third-party/ios/libs/executorch/libexecutorch_ios.a",
 					"-force_load",
 					"$(PROJECT_DIR)/../../../third-party/ios/libs/executorch/libexecutorch_llm_ios.a",
diff --git a/packages/react-native-executorch/third-party/ios/ExecutorchLib/ExecutorchLib/mlx.metallib b/packages/react-native-executorch/third-party/ios/ExecutorchLib/ExecutorchLib/mlx.metallib
new file mode 100644
index 0000000000..104cc44a31
Binary files /dev/null and b/packages/react-native-executorch/third-party/ios/ExecutorchLib/ExecutorchLib/mlx.metallib differ
diff --git a/packages/react-native-executorch/third-party/ios/libs/executorch/libbackend_coreml_ios.a b/packages/react-native-executorch/third-party/ios/libs/executorch/libbackend_coreml_ios.a
index 4e2a80a220..796b9d6796 100644
Binary files a/packages/react-native-executorch/third-party/ios/libs/executorch/libbackend_coreml_ios.a and b/packages/react-native-executorch/third-party/ios/libs/executorch/libbackend_coreml_ios.a differ
diff --git a/packages/react-native-executorch/third-party/ios/libs/executorch/libbackend_coreml_simulator.a b/packages/react-native-executorch/third-party/ios/libs/executorch/libbackend_coreml_simulator.a
index c5b16beb50..8f113a2954 100644
Binary files a/packages/react-native-executorch/third-party/ios/libs/executorch/libbackend_coreml_simulator.a and b/packages/react-native-executorch/third-party/ios/libs/executorch/libbackend_coreml_simulator.a differ
diff --git a/packages/react-native-executorch/third-party/ios/libs/executorch/libbackend_mlx_ios.a b/packages/react-native-executorch/third-party/ios/libs/executorch/libbackend_mlx_ios.a
new file mode 100644
index 0000000000..d34e650af7
Binary files /dev/null and b/packages/react-native-executorch/third-party/ios/libs/executorch/libbackend_mlx_ios.a differ
diff --git a/packages/react-native-executorch/third-party/ios/libs/executorch/libbackend_mlx_simulator.a b/packages/react-native-executorch/third-party/ios/libs/executorch/libbackend_mlx_simulator.a
new file mode 100644
index 0000000000..59999e734d
Binary files /dev/null and b/packages/react-native-executorch/third-party/ios/libs/executorch/libbackend_mlx_simulator.a differ
diff --git a/packages/react-native-executorch/third-party/ios/libs/executorch/libbackend_mps_ios.a b/packages/react-native-executorch/third-party/ios/libs/executorch/libbackend_mps_ios.a
index ce63bfd904..a8bf9728a9 100644
Binary files a/packages/react-native-executorch/third-party/ios/libs/executorch/libbackend_mps_ios.a and b/packages/react-native-executorch/third-party/ios/libs/executorch/libbackend_mps_ios.a differ
diff --git a/packages/react-native-executorch/third-party/ios/libs/executorch/libbackend_mps_simulator.a b/packages/react-native-executorch/third-party/ios/libs/executorch/libbackend_mps_simulator.a
index 3679faaa46..5cce760012 100644
Binary files a/packages/react-native-executorch/third-party/ios/libs/executorch/libbackend_mps_simulator.a and b/packages/react-native-executorch/third-party/ios/libs/executorch/libbackend_mps_simulator.a differ
diff --git a/packages/react-native-executorch/third-party/ios/libs/executorch/libbackend_xnnpack_ios.a b/packages/react-native-executorch/third-party/ios/libs/executorch/libbackend_xnnpack_ios.a
index 4049c0a640..2093739b1a 100644
Binary files a/packages/react-native-executorch/third-party/ios/libs/executorch/libbackend_xnnpack_ios.a and b/packages/react-native-executorch/third-party/ios/libs/executorch/libbackend_xnnpack_ios.a differ
diff --git a/packages/react-native-executorch/third-party/ios/libs/executorch/libbackend_xnnpack_simulator.a b/packages/react-native-executorch/third-party/ios/libs/executorch/libbackend_xnnpack_simulator.a
index bd145d474f..360569eb71 100644
Binary files a/packages/react-native-executorch/third-party/ios/libs/executorch/libbackend_xnnpack_simulator.a and b/packages/react-native-executorch/third-party/ios/libs/executorch/libbackend_xnnpack_simulator.a differ
diff --git a/packages/react-native-executorch/third-party/ios/libs/executorch/libexecutorch_ios.a b/packages/react-native-executorch/third-party/ios/libs/executorch/libexecutorch_ios.a
index 221409be07..ebe540bbbf 100644
Binary files a/packages/react-native-executorch/third-party/ios/libs/executorch/libexecutorch_ios.a and b/packages/react-native-executorch/third-party/ios/libs/executorch/libexecutorch_ios.a differ
diff --git a/packages/react-native-executorch/third-party/ios/libs/executorch/libexecutorch_llm_ios.a b/packages/react-native-executorch/third-party/ios/libs/executorch/libexecutorch_llm_ios.a
index 40b74e9c7d..a312c92a1e 100644
Binary files a/packages/react-native-executorch/third-party/ios/libs/executorch/libexecutorch_llm_ios.a and b/packages/react-native-executorch/third-party/ios/libs/executorch/libexecutorch_llm_ios.a differ
diff --git a/packages/react-native-executorch/third-party/ios/libs/executorch/libexecutorch_llm_simulator.a b/packages/react-native-executorch/third-party/ios/libs/executorch/libexecutorch_llm_simulator.a
index bec9105acb..8435e5b35a 100644
Binary files a/packages/react-native-executorch/third-party/ios/libs/executorch/libexecutorch_llm_simulator.a and b/packages/react-native-executorch/third-party/ios/libs/executorch/libexecutorch_llm_simulator.a differ
diff --git a/packages/react-native-executorch/third-party/ios/libs/executorch/libexecutorch_simulator.a b/packages/react-native-executorch/third-party/ios/libs/executorch/libexecutorch_simulator.a
index 99449364f2..e5a58d122d 100644
Binary files a/packages/react-native-executorch/third-party/ios/libs/executorch/libexecutorch_simulator.a and b/packages/react-native-executorch/third-party/ios/libs/executorch/libexecutorch_simulator.a differ
diff --git a/packages/react-native-executorch/third-party/ios/libs/executorch/libkernels_llm_ios.a b/packages/react-native-executorch/third-party/ios/libs/executorch/libkernels_llm_ios.a
index 5217a54902..ce8352f8d7 100644
Binary files a/packages/react-native-executorch/third-party/ios/libs/executorch/libkernels_llm_ios.a and b/packages/react-native-executorch/third-party/ios/libs/executorch/libkernels_llm_ios.a differ
diff --git a/packages/react-native-executorch/third-party/ios/libs/executorch/libkernels_llm_simulator.a b/packages/react-native-executorch/third-party/ios/libs/executorch/libkernels_llm_simulator.a
index 9add6b355c..c8a6cd42a4 100644
Binary files a/packages/react-native-executorch/third-party/ios/libs/executorch/libkernels_llm_simulator.a and b/packages/react-native-executorch/third-party/ios/libs/executorch/libkernels_llm_simulator.a differ
diff --git a/packages/react-native-executorch/third-party/ios/libs/executorch/libkernels_optimized_ios.a b/packages/react-native-executorch/third-party/ios/libs/executorch/libkernels_optimized_ios.a
index 9cc9a42bd2..7b530c9e28 100644
Binary files a/packages/react-native-executorch/third-party/ios/libs/executorch/libkernels_optimized_ios.a and b/packages/react-native-executorch/third-party/ios/libs/executorch/libkernels_optimized_ios.a differ
diff --git a/packages/react-native-executorch/third-party/ios/libs/executorch/libkernels_optimized_simulator.a b/packages/react-native-executorch/third-party/ios/libs/executorch/libkernels_optimized_simulator.a
index cb06a86712..666ab17b07 100644
Binary files a/packages/react-native-executorch/third-party/ios/libs/executorch/libkernels_optimized_simulator.a and b/packages/react-native-executorch/third-party/ios/libs/executorch/libkernels_optimized_simulator.a differ
diff --git a/packages/react-native-executorch/third-party/ios/libs/executorch/libkernels_quantized_ios.a b/packages/react-native-executorch/third-party/ios/libs/executorch/libkernels_quantized_ios.a
index d54805247e..2aa82a12d6 100644
Binary files a/packages/react-native-executorch/third-party/ios/libs/executorch/libkernels_quantized_ios.a and b/packages/react-native-executorch/third-party/ios/libs/executorch/libkernels_quantized_ios.a differ
diff --git a/packages/react-native-executorch/third-party/ios/libs/executorch/libkernels_quantized_simulator.a b/packages/react-native-executorch/third-party/ios/libs/executorch/libkernels_quantized_simulator.a
index 41f06f64fb..567ca5555f 100644
Binary files a/packages/react-native-executorch/third-party/ios/libs/executorch/libkernels_quantized_simulator.a and b/packages/react-native-executorch/third-party/ios/libs/executorch/libkernels_quantized_simulator.a differ
diff --git a/packages/react-native-executorch/third-party/ios/libs/executorch/libkernels_torchao_ios.a b/packages/react-native-executorch/third-party/ios/libs/executorch/libkernels_torchao_ios.a
index 19db3e80f4..b0d3385e72 100644
Binary files a/packages/react-native-executorch/third-party/ios/libs/executorch/libkernels_torchao_ios.a and b/packages/react-native-executorch/third-party/ios/libs/executorch/libkernels_torchao_ios.a differ
diff --git a/packages/react-native-executorch/third-party/ios/libs/executorch/libkernels_torchao_simulator.a b/packages/react-native-executorch/third-party/ios/libs/executorch/libkernels_torchao_simulator.a
index af26c633e1..662a2ac890 100644
Binary files a/packages/react-native-executorch/third-party/ios/libs/executorch/libkernels_torchao_simulator.a and b/packages/react-native-executorch/third-party/ios/libs/executorch/libkernels_torchao_simulator.a differ
diff --git a/packages/react-native-executorch/third-party/ios/libs/executorch/libthreadpool_ios.a b/packages/react-native-executorch/third-party/ios/libs/executorch/libthreadpool_ios.a
index 8c14cf924b..4d59154358 100644
Binary files a/packages/react-native-executorch/third-party/ios/libs/executorch/libthreadpool_ios.a and b/packages/react-native-executorch/third-party/ios/libs/executorch/libthreadpool_ios.a differ
diff --git a/packages/react-native-executorch/third-party/ios/libs/executorch/libthreadpool_simulator.a b/packages/react-native-executorch/third-party/ios/libs/executorch/libthreadpool_simulator.a
index d6deeb5a00..1ec1dbe74e 100644
Binary files a/packages/react-native-executorch/third-party/ios/libs/executorch/libthreadpool_simulator.a and b/packages/react-native-executorch/third-party/ios/libs/executorch/libthreadpool_simulator.a differ
diff --git a/packages/react-native-executorch/third-party/ios/libs/executorch/mlx.metallib b/packages/react-native-executorch/third-party/ios/libs/executorch/mlx.metallib
new file mode 100644
index 0000000000..104cc44a31
Binary files /dev/null and b/packages/react-native-executorch/third-party/ios/libs/executorch/mlx.metallib differ