Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
269 changes: 250 additions & 19 deletions apps/llm/app/multimodal_llm/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@ import {
View,
} from 'react-native';
import { launchImageLibrary } from 'react-native-image-picker';
import {
AudioManager,
AudioRecorder,
AudioContext,
} from 'react-native-audio-api';
import { useIsFocused } from '@react-navigation/native';
import { useSafeAreaInsets } from 'react-native-safe-area-context';
import { models, useLLM } from 'react-native-executorch';
Expand All @@ -23,12 +28,14 @@ import Spinner from '../../components/Spinner';
import { GeneratingContext } from '../../context';
import SuggestedPrompts from '../../components/SuggestedPrompts';
import ErrorBanner from '../../components/ErrorBanner';
import AudioWaveform from '../../components/AudioWaveform';

const SUGGESTED_PROMPTS = [
"What's in this image?",
'Describe this scene in detail',
'What objects can you see?',
'What text appears in this image?',
Comment thread
mkopcins marked this conversation as resolved.
'Transcribe the audio',
];
import { useLLMStats } from '../../hooks/useLLMStats';
import { StatsBar } from '../../components/StatsBar';
Expand All @@ -46,12 +53,18 @@ function MultimodalLLMScreen() {
const textInputRef = useRef<TextInput>(null);
const { setGlobalGenerating } = useContext(GeneratingContext);

// Added error state
const [error, setError] = useState<string | null>(null);
const [audioBuffer, setAudioBuffer] = useState<Float32Array | null>(null);
const [audioLabel, setAudioLabel] = useState<string | null>(null);
const [audioUrl, setAudioUrl] = useState('');
const [isFetchingAudio, setIsFetchingAudio] = useState(false);
const [isRecording, setIsRecording] = useState(false);
const [hasMicPermission, setHasMicPermission] = useState(false);
Comment thread
mkopcins marked this conversation as resolved.
const recorder = useRef(new AudioRecorder());
const recordChunks = useRef<Float32Array[]>([]);

const vlm = useLLM({
model: models.llm.lfm2_5_vl_1_6b(),
});
const [error, setError] = useState<string | null>(null);
const model = models.llm.gemma4_e2b_multimodal();
const vlm = useLLM({ model: model });
const tokenCount = vlm.isReady ? vlm.getGeneratedTokenCount() : 0;
const { stats, onMessageSend } = useLLMStats(
vlm.response,
Expand All @@ -68,6 +81,95 @@ function MultimodalLLMScreen() {
if (vlm.error) setError(String(vlm.error));
}, [vlm.error]);

// Track the most recent `vlm` value. The mount-only effect below runs its
// cleanup with whatever it closed over on the first render, so reading
// `vlm.isGenerating` there directly would see a stale snapshot.
// NOTE(review): assumes useLLM returns a fresh object per render — confirm.
const latestVlm = useRef(vlm);
useEffect(() => {
  latestVlm.current = vlm;
});

/**
 * One-time audio setup: configures the audio session, asks for microphone
 * permission, and on unmount interrupts any in-flight generation, stops the
 * recorder, and deactivates the audio session.
 */
useEffect(() => {
  // Guards against calling state setters after the screen has unmounted.
  let isMounted = true;

  AudioManager.setAudioSessionOptions({
    iosCategory: 'playAndRecord',
    iosMode: 'spokenAudio',
    iosOptions: ['allowBluetoothHFP', 'defaultToSpeaker'],
  });

  (async () => {
    try {
      const status = await AudioManager.requestRecordingPermissions();
      if (isMounted) setHasMicPermission(status === 'Granted');
    } catch (e) {
      // Surface the failure instead of leaving an unhandled rejection.
      if (isMounted) setError(e instanceof Error ? e.message : String(e));
    }
  })();

  return () => {
    isMounted = false;
    const current = latestVlm.current;
    if (current.isGenerating) current.interrupt();
    // eslint-disable-next-line react-hooks/exhaustive-deps
    recorder.current.stop();
    AudioManager.setAudioSessionActivity(false);
  };
}, []);

/**
 * Decodes the audio referenced by the "Audio URL" field and attaches the
 * first channel as the pending audio buffer, with a label showing the file
 * name and duration. Does nothing when the field is blank. Decode failures
 * are reported through the error banner state.
 */
const loadAudioFromUrl = async () => {
  const url = audioUrl.trim();
  if (!url) return;
  setIsFetchingAudio(true);
  let ctx: AudioContext | undefined;
  try {
    ctx = new AudioContext({ sampleRate: 16000 });
    const decoded = await ctx.decodeAudioData(url);
    // Copy the samples (as the recorder callback does) so the attachment
    // does not depend on memory tied to the context that is closed below.
    const pcm = new Float32Array(decoded.getChannelData(0));
    if (pcm.length === 0) {
      throw new Error('Decoded audio is empty');
    }
    const name = url.split('/').pop() || 'audio';
    setAudioBuffer(pcm);
    // Duration assumes the decoded data is at the 16 kHz rate requested above.
    setAudioLabel(`${name} · ${(pcm.length / 16000).toFixed(1)}s`);
  } catch (e) {
    setError(e instanceof Error ? e.message : String(e));
  } finally {
    setIsFetchingAudio(false);
    // Release the decoder context; previously one was leaked per load.
    // Best-effort: a failure to close must not mask the decode result.
    void ctx?.close().catch(() => undefined);
  }
};

const startRecording = async () => {
if (!hasMicPermission) {

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Mic permission is requested once on mount and this is a dead-end on denial: the button isn't disabled when !hasMicPermission, so tapping it only sets 'enable it in Settings' with no way to act, and there's no re-check or Linking.openSettings(). Re-request inside startRecording (await requestRecordingPermissions() and update state), offer Linking.openSettings() when Denied, and/or disable the button when permission is known-denied.

setError('Microphone permission denied. Please enable it in Settings.');
return;
}
recordChunks.current = [];
const sampleRate = 16000;
recorder.current.onAudioReady(
{ sampleRate, bufferLength: 0.1 * sampleRate, channelCount: 1 },
({ buffer }) => {
recordChunks.current.push(new Float32Array(buffer.getChannelData(0)));
}
);
try {
const ok = await AudioManager.setAudioSessionActivity(true);
if (!ok) {
setError('Cannot start audio session');
return;
}
const result = recorder.current.start();
if (result.status === 'error') {
setError(`Recording problems: ${result.message}`);
return;
}
setIsRecording(true);

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Two recording-robustness issues: (1) this result.status === 'error' branch is effectively dead — with no file output, AudioRecorder.start() always returns { status: 'success' }, so a real native start failure still flips the UI to 'recording' and yields empty audio; register recorder.current.onError(...) instead. (2) onAudioReady pushes a Float32Array per ~0.1s with no cap, so a long recording grows memory unbounded and hands a huge buffer to sendMessage despite the model's ~30s window — enforce a max-duration stop (and reject oversized decoded buffers in loadAudioFromUrl).

} catch (e) {
setError(e instanceof Error ? e.message : String(e));
}
};

/**
 * Stops the recorder, clears the recording flag, and joins the captured
 * chunks into a single Float32Array that becomes the pending audio
 * attachment. When no samples were captured, nothing is attached.
 */
const stopRecording = () => {
  recorder.current.stop();
  setIsRecording(false);

  const chunks = recordChunks.current;
  let sampleCount = 0;
  for (const chunk of chunks) {
    sampleCount += chunk.length;
  }
  if (sampleCount === 0) return;

  // Lay the chunks end to end, in capture order, into one buffer.
  const merged = new Float32Array(sampleCount);
  chunks.reduce((offset, chunk) => {
    merged.set(chunk, offset);
    return offset + chunk.length;
  }, 0);

  recordChunks.current = [];
  setAudioBuffer(merged);
  setAudioLabel(`Recording · ${(merged.length / 16000).toFixed(1)}s`);
};

/** Discards the pending audio attachment: resets both its label and its samples. */
const clearAudio = () => {
  setAudioLabel(null);
  setAudioBuffer(null);
};

const pickImage = async () => {
try {
const result = await launchImageLibrary({ mediaType: 'photo' });
Expand All @@ -81,19 +183,27 @@ function MultimodalLLMScreen() {
};

const sendMessage = async () => {
if (!userInput.trim() || vlm.isGenerating) return;
if (!(imageUri || audioBuffer || userInput.trim()) || vlm.isGenerating)
return;
onMessageSend();
const text = userInput.trim();
setUserInput('');
textInputRef.current?.clear();
Keyboard.dismiss();
const currentImageUri = imageUri;
const currentAudio = audioBuffer;
setImageUri(null);
setAudioBuffer(null);
setAudioLabel(null);
try {
await vlm.sendMessage(
text,
currentImageUri ? { imagePath: currentImageUri } : undefined
);
const media =
currentImageUri || currentAudio
? {
...(currentImageUri ? { imagePath: currentImageUri } : {}),
...(currentAudio ? { audioBuffer: currentAudio } : {}),
}
: undefined;
await vlm.sendMessage(text, media);
} catch (e) {
// Updated to set UI error instead of just console.error
setError(e instanceof Error ? e.message : String(e));
Expand Down Expand Up @@ -135,7 +245,9 @@ function MultimodalLLMScreen() {
<View style={styles.helloMessageContainer}>
<Text style={styles.helloText}>Hello! 👋</Text>
<Text style={styles.bottomHelloText}>
Pick an image and ask me anything about it.
{model.capabilities.find((c) => c === 'audio')
? 'Say hi, or pick an image, and ask me anything about it.'
: 'Pick an image and ask me anything about it.'}
</Text>
<SuggestedPrompts
prompts={SUGGESTED_PROMPTS}
Expand All @@ -159,6 +271,48 @@ function MultimodalLLMScreen() {
</TouchableOpacity>
)}

{/* Audio URL input */}
<View style={styles.audioUrlRow}>
<TextInput
placeholder="Audio URL (mp3/wav/…)"
placeholderTextColor="#C1C6E5"
style={styles.audioUrlInput}
value={audioUrl}
onChangeText={setAudioUrl}
autoCapitalize="none"
autoCorrect={false}
/>
<TouchableOpacity
style={[
styles.audioUrlButton,
(!audioUrl.trim() || isFetchingAudio || vlm.isGenerating) &&
styles.disabled,
]}
onPress={loadAudioFromUrl}
disabled={!audioUrl.trim() || isFetchingAudio || vlm.isGenerating}
>
<Text style={styles.audioUrlButtonText}>
{isFetchingAudio ? '…' : 'Load'}
</Text>
</TouchableOpacity>
</View>

{/* Audio attachment strip */}
{audioLabel && (
<View style={styles.audioAttachmentContainer}>
<View style={styles.audioAttachmentRow}>
<Text style={styles.audioAttachmentText}>🎵 {audioLabel}</Text>
<TouchableOpacity onPress={clearAudio}>
<Text style={styles.audioAttachmentClear}>✕</Text>
</TouchableOpacity>
</View>
<AudioWaveform
buffer={audioBuffer}
style={styles.audioWaveform}
/>
</View>
)}

<StatsBar stats={stats} />
<View
style={[
Expand All @@ -178,6 +332,17 @@ function MultimodalLLMScreen() {
<Text style={styles.imageButtonText}>📷</Text>
</TouchableOpacity>

{/* Mic record / stop button */}
<TouchableOpacity
style={styles.imageButton}
onPress={isRecording ? stopRecording : startRecording}
disabled={vlm.isGenerating}
>
<Text style={styles.imageButtonText}>
{isRecording ? '⏹️' : '🎤'}
</Text>
</TouchableOpacity>

<TextInput
autoCorrect={false}
ref={textInputRef}
Expand All @@ -198,14 +363,15 @@ function MultimodalLLMScreen() {
onChangeText={setUserInput}
/>

{userInput.trim() && !vlm.isGenerating && (
<TouchableOpacity
style={styles.sendChatTouchable}
onPress={sendMessage}
>
<SendIcon height={24} width={24} padding={4} margin={8} />
</TouchableOpacity>
)}
{(imageUri || audioBuffer || userInput.trim()) &&
!vlm.isGenerating && (
<TouchableOpacity
style={styles.sendChatTouchable}
onPress={sendMessage}
>
<SendIcon height={24} width={24} padding={4} margin={8} />
</TouchableOpacity>
)}
{vlm.isGenerating && (
<TouchableOpacity
style={styles.sendChatTouchable}
Expand Down Expand Up @@ -319,6 +485,71 @@ const styles = StyleSheet.create({
fontFamily: 'regular',
color: ColorPalette.blueDark,
},
audioAttachmentContainer: {
flexDirection: 'column',
paddingHorizontal: 16,
paddingVertical: 8,
marginHorizontal: 16,
marginBottom: 4,
borderRadius: 8,
borderWidth: 1,
borderColor: ColorPalette.blueLight,
backgroundColor: '#fafbff',
},
audioAttachmentRow: {
flexDirection: 'row',
alignItems: 'center',
justifyContent: 'space-between',
},
audioAttachmentText: {
fontSize: 13,
fontFamily: 'regular',
color: ColorPalette.blueDark,
},
audioAttachmentClear: {
fontSize: 16,
color: ColorPalette.blueDark,
paddingHorizontal: 8,
},
audioWaveform: {
marginTop: 6,
minWidth: 0,
},
audioUrlRow: {
flexDirection: 'row',
alignItems: 'center',
marginHorizontal: 16,
marginBottom: 4,
},
audioUrlInput: {
flex: 1,
padding: 10,
borderTopLeftRadius: 8,
borderBottomLeftRadius: 8,
borderWidth: 1,
borderColor: ColorPalette.blueLight,
borderRightWidth: 0,
fontFamily: 'regular',
fontSize: 13,
color: ColorPalette.primary,
},
audioUrlButton: {
paddingVertical: 10,
paddingHorizontal: 16,
backgroundColor: ColorPalette.strongPrimary,
borderTopRightRadius: 8,
borderBottomRightRadius: 8,
justifyContent: 'center',
alignItems: 'center',
},
audioUrlButtonText: {
color: '#fff',
fontFamily: 'medium',
fontSize: 13,
},
disabled: {
opacity: 0.5,
},
bottomContainer: {
height: 100,
width: '100%',
Expand Down
Loading
Loading