software-mansion · mkopcins · Jun 11, 2026 · Apr 24, 2026 · Apr 30, 2026 · May 22, 2026
diff --git a/apps/llm/app/multimodal_llm/index.tsx b/apps/llm/app/multimodal_llm/index.tsx
@@ -12,6 +12,11 @@ import {
   View,
 } from 'react-native';
 import { launchImageLibrary } from 'react-native-image-picker';
+import {
+  AudioManager,
+  AudioRecorder,
+  AudioContext,
+} from 'react-native-audio-api';
 import { useIsFocused } from '@react-navigation/native';
 import { useSafeAreaInsets } from 'react-native-safe-area-context';
 import { models, useLLM } from 'react-native-executorch';
@@ -23,12 +28,14 @@ import Spinner from '../../components/Spinner';
 import { GeneratingContext } from '../../context';
 import SuggestedPrompts from '../../components/SuggestedPrompts';
 import ErrorBanner from '../../components/ErrorBanner';
+import AudioWaveform from '../../components/AudioWaveform';
 
 const SUGGESTED_PROMPTS = [
   "What's in this image?",
   'Describe this scene in detail',
   'What objects can you see?',
   'What text appears in this image?',
+  'Transcribe the audio',
 ];
 import { useLLMStats } from '../../hooks/useLLMStats';
 import { StatsBar } from '../../components/StatsBar';
@@ -46,12 +53,18 @@ function MultimodalLLMScreen() {
   const textInputRef = useRef<TextInput>(null);
   const { setGlobalGenerating } = useContext(GeneratingContext);
 
-  // Added error state
-  const [error, setError] = useState<string | null>(null);
+  const [audioBuffer, setAudioBuffer] = useState<Float32Array | null>(null);
+  const [audioLabel, setAudioLabel] = useState<string | null>(null);
+  const [audioUrl, setAudioUrl] = useState('');
+  const [isFetchingAudio, setIsFetchingAudio] = useState(false);
+  const [isRecording, setIsRecording] = useState(false);
+  const [hasMicPermission, setHasMicPermission] = useState(false);
+  const recorder = useRef(new AudioRecorder());
+  const recordChunks = useRef<Float32Array[]>([]);
 
-  const vlm = useLLM({
-    model: models.llm.lfm2_5_vl_1_6b(),
-  });
+  const [error, setError] = useState<string | null>(null);
+  const model = models.llm.gemma4_e2b_multimodal();
+  const vlm = useLLM({ model: model });
   const tokenCount = vlm.isReady ? vlm.getGeneratedTokenCount() : 0;
   const { stats, onMessageSend } = useLLMStats(
     vlm.response,
@@ -68,6 +81,95 @@ function MultimodalLLMScreen() {
     if (vlm.error) setError(String(vlm.error));
   }, [vlm.error]);
 
+  // Tracks the `vlm` value from the latest render. The setup effect below runs
+  // only on mount, so reading `vlm` directly inside its cleanup would see the
+  // first render's snapshot instead of the current generation state.
+  const latestVlm = useRef(vlm);
+  useEffect(() => {
+    latestVlm.current = vlm;
+  });
+
+  // One-time audio setup: configures the audio session and requests microphone
+  // access on mount; on unmount it interrupts a running generation, stops the
+  // recorder and deactivates the audio session.
+  useEffect(() => {
+    // Captured once so the cleanup stops the same recorder instance.
+    const activeRecorder = recorder.current;
+
+    AudioManager.setAudioSessionOptions({
+      iosCategory: 'playAndRecord',
+      iosMode: 'spokenAudio',
+      iosOptions: ['allowBluetoothHFP', 'defaultToSpeaker'],
+    });
+    AudioManager.requestRecordingPermissions()
+      .then((status) => setHasMicPermission(status === 'Granted'))
+      // A failed request counts as "not granted" instead of surfacing as an
+      // unhandled promise rejection.
+      .catch(() => setHasMicPermission(false));
+
+    return () => {
+      const current = latestVlm.current;
+      if (current.isGenerating) current.interrupt();
+      activeRecorder.stop();
+      AudioManager.setAudioSessionActivity(false);
+    };
+  }, []);
+
+  /**
+   * Decodes the audio referenced by the `audioUrl` state and stores the first
+   * channel as the pending audio attachment, together with a display label.
+   *
+   * Returns early when the trimmed URL is empty. Any failure is written to the
+   * `error` state, and the fetching flag is cleared on every path.
+   */
+  const loadAudioFromUrl = async () => {
+    const url = audioUrl.trim();
+    if (!url) return;
+    const sampleRate = 16000;
+    let ctx: AudioContext | null = null;
+    setIsFetchingAudio(true);
+    try {
+      ctx = new AudioContext({ sampleRate });
+      const decoded = await ctx.decodeAudioData(url);
+      const pcm = decoded.getChannelData(0);
+      // Last path segment of the URL; falls back to a generic name when empty.
+      const name = url.split('/').pop() || 'audio';
+      setAudioBuffer(pcm);
+      setAudioLabel(`${name} · ${(pcm.length / sampleRate).toFixed(1)}s`);
+    } catch (e) {
+      setError(e instanceof Error ? e.message : String(e));
+    } finally {
+      setIsFetchingAudio(false);
+      // The context is only used for decoding, so close it here; otherwise
+      // every load would leave another open context behind.
+      try {
+        await ctx?.close();
+      } catch {
+        // Best-effort teardown: a failed close has nothing actionable for the
+        // user and must not mask the decode result.
+      }
+    }
+  };
+
+  /**
+   * Starts capturing microphone audio into `recordChunks`.
+   *
+   * Previously collected chunks are discarded. If microphone permission is
+   * missing, the audio session cannot be activated, or the recorder reports an
+   * error, the problem is written to the `error` state and `isRecording` is
+   * left unchanged.
+   */
+  const startRecording = async () => {
+    try {
+      let granted = hasMicPermission;
+      if (!granted) {
+        // The cached flag only reflects the request made on mount; ask again
+        // so a permission granted since then is picked up.
+        const status = await AudioManager.requestRecordingPermissions();
+        granted = status === 'Granted';
+        setHasMicPermission(granted);
+      }
+      if (!granted) {
+        setError('Microphone permission denied. Please enable it in Settings.');
+        return;
+      }
+
+      recordChunks.current = [];
+      const sampleRate = 16000;
+      recorder.current.onAudioReady(
+        { sampleRate, bufferLength: 0.1 * sampleRate, channelCount: 1 },
+        ({ buffer }) => {
+          // Store a copy of the channel data, not the array handed out by the
+          // recorder (presumably the recorder may reuse it — TODO confirm).
+          recordChunks.current.push(
+            new Float32Array(buffer.getChannelData(0))
+          );
+        }
+      );
+
+      const ok = await AudioManager.setAudioSessionActivity(true);
+      if (!ok) {
+        setError('Cannot start audio session');
+        return;
+      }
+      const result = recorder.current.start();
+      if (result.status === 'error') {
+        setError(`Recording problems: ${result.message}`);
+        return;
+      }
+      setIsRecording(true);
+    } catch (e) {
+      setError(e instanceof Error ? e.message : String(e));
+    }
+  };
+
+  /**
+   * Stops the recorder and turns the collected chunks into one attachment.
+   *
+   * When no samples were captured nothing is attached. Otherwise the chunks
+   * are concatenated in order, the chunk list is emptied, and the audio buffer
+   * and its label are stored in state.
+   */
+  const stopRecording = () => {
+    recorder.current.stop();
+    setIsRecording(false);
+
+    const chunks = recordChunks.current;
+    let sampleCount = 0;
+    for (const chunk of chunks) sampleCount += chunk.length;
+    if (sampleCount === 0) return;
+
+    // Copy every chunk into its slot of a single contiguous array.
+    const merged = new Float32Array(sampleCount);
+    chunks.reduce((offset, chunk) => {
+      merged.set(chunk, offset);
+      return offset + chunk.length;
+    }, 0);
+
+    recordChunks.current = [];
+    setAudioBuffer(merged);
+    setAudioLabel(`Recording · ${(merged.length / 16000).toFixed(1)}s`);
+  };
+
+  /** Drops the pending audio attachment by resetting its label and buffer. */
+  const clearAudio = () => {
+    setAudioLabel(null);
+    setAudioBuffer(null);
+  };
+
   const pickImage = async () => {
     try {
       const result = await launchImageLibrary({ mediaType: 'photo' });
@@ -81,19 +183,27 @@ function MultimodalLLMScreen() {
   };
 
   const sendMessage = async () => {
-    if (!userInput.trim() || vlm.isGenerating) return;
+    if (!(imageUri || audioBuffer || userInput.trim()) || vlm.isGenerating)
+      return;
     onMessageSend();
     const text = userInput.trim();
     setUserInput('');
     textInputRef.current?.clear();
     Keyboard.dismiss();
     const currentImageUri = imageUri;
+    const currentAudio = audioBuffer;
     setImageUri(null);
+    setAudioBuffer(null);
+    setAudioLabel(null);
     try {
-      await vlm.sendMessage(
-        text,
-        currentImageUri ? { imagePath: currentImageUri } : undefined
-      );
+      const media =
+        currentImageUri || currentAudio
+          ? {
+              ...(currentImageUri ? { imagePath: currentImageUri } : {}),
+              ...(currentAudio ? { audioBuffer: currentAudio } : {}),
+            }
+          : undefined;
+      await vlm.sendMessage(text, media);
     } catch (e) {
       // Updated to set UI error instead of just console.error
       setError(e instanceof Error ? e.message : String(e));
@@ -135,7 +245,9 @@ function MultimodalLLMScreen() {
             <View style={styles.helloMessageContainer}>
               <Text style={styles.helloText}>Hello! 👋</Text>
               <Text style={styles.bottomHelloText}>
-                Pick an image and ask me anything about it.
+                {model.capabilities.find((c) => c === 'audio')
+                  ? 'Say hi, or pick an image, and ask me anything about it.'
+                  : 'Pick an image and ask me anything about it.'}
               </Text>
               <SuggestedPrompts
                 prompts={SUGGESTED_PROMPTS}
@@ -159,6 +271,48 @@ function MultimodalLLMScreen() {
             </TouchableOpacity>
           )}
 
+          {/* Audio URL input */}
+          <View style={styles.audioUrlRow}>
+            <TextInput
+              placeholder="Audio URL (mp3/wav/…)"
+              placeholderTextColor="#C1C6E5"
+              style={styles.audioUrlInput}
+              value={audioUrl}
+              onChangeText={setAudioUrl}
+              autoCapitalize="none"
+              autoCorrect={false}
+            />
+            <TouchableOpacity
+              style={[
+                styles.audioUrlButton,
+                (!audioUrl.trim() || isFetchingAudio || vlm.isGenerating) &&
+                  styles.disabled,
+              ]}
+              onPress={loadAudioFromUrl}
+              disabled={!audioUrl.trim() || isFetchingAudio || vlm.isGenerating}
+            >
+              <Text style={styles.audioUrlButtonText}>
+                {isFetchingAudio ? '…' : 'Load'}
+              </Text>
+            </TouchableOpacity>
+          </View>
+
+          {/* Audio attachment strip */}
+          {audioLabel && (
+            <View style={styles.audioAttachmentContainer}>
+              <View style={styles.audioAttachmentRow}>
+                <Text style={styles.audioAttachmentText}>🎵 {audioLabel}</Text>
+                <TouchableOpacity onPress={clearAudio}>
+                  <Text style={styles.audioAttachmentClear}>✕</Text>
+                </TouchableOpacity>
+              </View>
+              <AudioWaveform
+                buffer={audioBuffer}
+                style={styles.audioWaveform}
+              />
+            </View>
+          )}
+
           <StatsBar stats={stats} />
           <View
             style={[
@@ -178,6 +332,17 @@ function MultimodalLLMScreen() {
               <Text style={styles.imageButtonText}>📷</Text>
             </TouchableOpacity>
 
+            {/* Mic record / stop button */}
+            <TouchableOpacity
+              style={styles.imageButton}
+              onPress={isRecording ? stopRecording : startRecording}
+              disabled={vlm.isGenerating}
+            >
+              <Text style={styles.imageButtonText}>
+                {isRecording ? '⏹️' : '🎤'}
+              </Text>
+            </TouchableOpacity>
+
             <TextInput
               autoCorrect={false}
               ref={textInputRef}
@@ -198,14 +363,15 @@ function MultimodalLLMScreen() {
               onChangeText={setUserInput}
             />
 
-            {userInput.trim() && !vlm.isGenerating && (
-              <TouchableOpacity
-                style={styles.sendChatTouchable}
-                onPress={sendMessage}
-              >
-                <SendIcon height={24} width={24} padding={4} margin={8} />
-              </TouchableOpacity>
-            )}
+            {(imageUri || audioBuffer || userInput.trim()) &&
+              !vlm.isGenerating && (
+                <TouchableOpacity
+                  style={styles.sendChatTouchable}
+                  onPress={sendMessage}
+                >
+                  <SendIcon height={24} width={24} padding={4} margin={8} />
+                </TouchableOpacity>
+              )}
             {vlm.isGenerating && (
               <TouchableOpacity
                 style={styles.sendChatTouchable}
@@ -319,6 +485,71 @@ const styles = StyleSheet.create({
     fontFamily: 'regular',
     color: ColorPalette.blueDark,
   },
+  audioAttachmentContainer: {
+    flexDirection: 'column',
+    paddingHorizontal: 16,
+    paddingVertical: 8,
+    marginHorizontal: 16,
+    marginBottom: 4,
+    borderRadius: 8,
+    borderWidth: 1,
+    borderColor: ColorPalette.blueLight,
+    backgroundColor: '#fafbff',
+  },
+  audioAttachmentRow: {
+    flexDirection: 'row',
+    alignItems: 'center',
+    justifyContent: 'space-between',
+  },
+  audioAttachmentText: {
+    fontSize: 13,
+    fontFamily: 'regular',
+    color: ColorPalette.blueDark,
+  },
+  audioAttachmentClear: {
+    fontSize: 16,
+    color: ColorPalette.blueDark,
+    paddingHorizontal: 8,
+  },
+  audioWaveform: {
+    marginTop: 6,
+    minWidth: 0,
+  },
+  audioUrlRow: {
+    flexDirection: 'row',
+    alignItems: 'center',
+    marginHorizontal: 16,
+    marginBottom: 4,
+  },
+  audioUrlInput: {
+    flex: 1,
+    padding: 10,
+    borderTopLeftRadius: 8,
+    borderBottomLeftRadius: 8,
+    borderWidth: 1,
+    borderColor: ColorPalette.blueLight,
+    borderRightWidth: 0,
+    fontFamily: 'regular',
+    fontSize: 13,
+    color: ColorPalette.primary,
+  },
+  audioUrlButton: {
+    paddingVertical: 10,
+    paddingHorizontal: 16,
+    backgroundColor: ColorPalette.strongPrimary,
+    borderTopRightRadius: 8,
+    borderBottomRightRadius: 8,
+    justifyContent: 'center',
+    alignItems: 'center',
+  },
+  audioUrlButtonText: {
+    color: '#fff',
+    fontFamily: 'medium',
+    fontSize: 13,
+  },
+  disabled: {
+    opacity: 0.5,
+  },
   bottomContainer: {
     height: 100,
     width: '100%',