Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .eslintrc.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ const VALID_CATEGORIES = [
'Models - Image Embeddings',
'Models - Image Generation',
'Models - LLM',
'Models - LLM Multimodal',
'Models - Object Detection',
'Models - Instance Segmentation',
'Models - Pose Estimation',
Expand Down
269 changes: 250 additions & 19 deletions apps/llm/app/multimodal_llm/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@ import {
View,
} from 'react-native';
import { launchImageLibrary } from 'react-native-image-picker';
import {
AudioManager,
AudioRecorder,
AudioContext,
} from 'react-native-audio-api';
import { useIsFocused } from '@react-navigation/native';
import { useSafeAreaInsets } from 'react-native-safe-area-context';
import { models, useLLM } from 'react-native-executorch';
Expand All @@ -23,12 +28,14 @@ import Spinner from '../../components/Spinner';
import { GeneratingContext } from '../../context';
import SuggestedPrompts from '../../components/SuggestedPrompts';
import ErrorBanner from '../../components/ErrorBanner';
import AudioWaveform from '../../components/AudioWaveform';

// Starter prompts handed to <SuggestedPrompts> on the empty-chat greeting.
// Most entries target an attached image; the last one targets an attached
// audio clip.
const SUGGESTED_PROMPTS = [
"What's in this image?",
'Describe this scene in detail',
'What objects can you see?',
'What text appears in this image?',
'Transcribe the audio',
];
import { useLLMStats } from '../../hooks/useLLMStats';
import { StatsBar } from '../../components/StatsBar';
Expand All @@ -46,12 +53,18 @@ function MultimodalLLMScreen() {
const textInputRef = useRef<TextInput>(null);
const { setGlobalGenerating } = useContext(GeneratingContext);

// Added error state
const [error, setError] = useState<string | null>(null);
const [audioBuffer, setAudioBuffer] = useState<Float32Array | null>(null);
const [audioLabel, setAudioLabel] = useState<string | null>(null);
const [audioUrl, setAudioUrl] = useState('');
const [isFetchingAudio, setIsFetchingAudio] = useState(false);
const [isRecording, setIsRecording] = useState(false);
const [hasMicPermission, setHasMicPermission] = useState(false);
const recorder = useRef(new AudioRecorder());
const recordChunks = useRef<Float32Array[]>([]);

const vlm = useLLM({
model: models.llm.lfm2_5_vl_1_6b(),
});
const [error, setError] = useState<string | null>(null);
const model = models.llm.gemma4_e2b_multimodal();
const vlm = useLLM({ model: model });
const tokenCount = vlm.isReady ? vlm.getGeneratedTokenCount() : 0;
const { stats, onMessageSend } = useLLMStats(
vlm.response,
Expand All @@ -68,6 +81,95 @@ function MultimodalLLMScreen() {
if (vlm.error) setError(String(vlm.error));
}, [vlm.error]);

// Tracks the most recent `vlm` object. The mount-only effect below runs its
// cleanup with whatever it closed over on the first render, so reading
// `vlm.isGenerating` directly there would always see the initial value.
const latestVlm = useRef(vlm);
useEffect(() => {
  latestVlm.current = vlm;
});

// One-time audio setup: configures the iOS audio session, asks for microphone
// permission, and on unmount interrupts any in-flight generation, stops the
// recorder and deactivates the audio session.
useEffect(() => {
  AudioManager.setAudioSessionOptions({
    iosCategory: 'playAndRecord',
    iosMode: 'spokenAudio',
    iosOptions: ['allowBluetoothHFP', 'defaultToSpeaker'],
  });

  (async () => {
    try {
      const status = await AudioManager.requestRecordingPermissions();
      setHasMicPermission(status === 'Granted');
    } catch (e) {
      // Surface the failure instead of leaving an unhandled promise rejection.
      setError(e instanceof Error ? e.message : String(e));
    }
  })();

  // Capture the recorder instance now so the cleanup does not dereference the
  // ref at unmount time.
  const activeRecorder = recorder.current;

  return () => {
    const current = latestVlm.current;
    if (current.isGenerating) current.interrupt();
    activeRecorder.stop();
    AudioManager.setAudioSessionActivity(false);
  };
}, []);

/**
 * Decodes the audio referenced by the `audioUrl` state and stages its first
 * channel as the pending audio attachment.
 *
 * Does nothing when the trimmed URL is empty. While decoding is in progress
 * `isFetchingAudio` is true; decode failures are reported through `setError`.
 * The label shows the last path segment of the URL and a duration computed as
 * sample count / 16000 (assumes the decoder honours the context's 16 kHz
 * sample rate — TODO confirm).
 */
const loadAudioFromUrl = async () => {
  const url = audioUrl.trim();
  if (!url) return;

  const sampleRate = 16000;
  let ctx: AudioContext | null = null;
  setIsFetchingAudio(true);
  try {
    ctx = new AudioContext({ sampleRate });
    const decoded = await ctx.decodeAudioData(url);
    // Copy the samples so the attachment does not alias memory tied to the
    // decoded buffer once the context is closed below.
    const pcm = new Float32Array(decoded.getChannelData(0));
    const name = url.split('/').pop() || 'audio';
    setAudioBuffer(pcm);
    setAudioLabel(`${name} · ${(pcm.length / sampleRate).toFixed(1)}s`);
  } catch (e) {
    setError(e instanceof Error ? e.message : String(e));
  } finally {
    setIsFetchingAudio(false);
    // A fresh context is created for every load; close it so repeated loads
    // do not accumulate open audio contexts.
    try {
      await ctx?.close();
    } catch {
      // Best-effort cleanup: a failed close must not mask the decode result.
    }
  }
};

/**
 * Starts a microphone capture session.
 *
 * Bails out with a user-facing error when microphone permission has not been
 * granted. Otherwise it resets the chunk list, registers a callback that
 * stores a copy of channel 0 of every delivered buffer, activates the audio
 * session and starts the recorder. `isRecording` is set to true only when the
 * recorder reports a successful start; every failure path ends in `setError`.
 */
const startRecording = async () => {
  if (!hasMicPermission) {
    setError('Microphone permission denied. Please enable it in Settings.');
    return;
  }

  const sampleRate = 16000;
  recordChunks.current = [];
  recorder.current.onAudioReady(
    { sampleRate, bufferLength: 0.1 * sampleRate, channelCount: 1 },
    (event) => {
      const samples = event.buffer.getChannelData(0);
      // Copy the samples rather than keeping a reference to the delivered array.
      recordChunks.current.push(new Float32Array(samples));
    }
  );

  try {
    const sessionActive = await AudioManager.setAudioSessionActivity(true);
    if (!sessionActive) {
      setError('Cannot start audio session');
      return;
    }

    const startResult = recorder.current.start();
    if (startResult.status !== 'error') {
      setIsRecording(true);
    } else {
      setError(`Recording problems: ${startResult.message}`);
    }
  } catch (err) {
    setError(err instanceof Error ? err.message : String(err));
  }
};

/**
 * Stops the recorder and turns the captured chunks into the pending audio
 * attachment.
 *
 * All chunks are concatenated, in arrival order, into a single Float32Array.
 * When nothing was captured the function returns after clearing
 * `isRecording`, leaving any existing attachment untouched. The label's
 * duration is the sample count divided by 16000.
 */
const stopRecording = () => {
  recorder.current.stop();
  setIsRecording(false);

  const chunks = recordChunks.current;
  let sampleCount = 0;
  for (const chunk of chunks) {
    sampleCount += chunk.length;
  }
  if (sampleCount === 0) return;

  const merged = new Float32Array(sampleCount);
  chunks.reduce((offset, chunk) => {
    merged.set(chunk, offset);
    return offset + chunk.length;
  }, 0);

  recordChunks.current = [];
  setAudioBuffer(merged);
  setAudioLabel(`Recording · ${(merged.length / 16000).toFixed(1)}s`);
};

/** Discards the pending audio attachment: both its label and its samples. */
const clearAudio = () => {
  setAudioLabel(null);
  setAudioBuffer(null);
};

const pickImage = async () => {
try {
const result = await launchImageLibrary({ mediaType: 'photo' });
Expand All @@ -81,19 +183,27 @@ function MultimodalLLMScreen() {
};

const sendMessage = async () => {
if (!userInput.trim() || vlm.isGenerating) return;
if (!(imageUri || audioBuffer || userInput.trim()) || vlm.isGenerating)
return;
onMessageSend();
const text = userInput.trim();
setUserInput('');
textInputRef.current?.clear();
Keyboard.dismiss();
const currentImageUri = imageUri;
const currentAudio = audioBuffer;
setImageUri(null);
setAudioBuffer(null);
setAudioLabel(null);
try {
await vlm.sendMessage(
text,
currentImageUri ? { imagePath: currentImageUri } : undefined
);
const media =
currentImageUri || currentAudio
? {
...(currentImageUri ? { imagePath: currentImageUri } : {}),
...(currentAudio ? { audioBuffer: currentAudio } : {}),
}
: undefined;
await vlm.sendMessage(text, media);
} catch (e) {
// Updated to set UI error instead of just console.error
setError(e instanceof Error ? e.message : String(e));
Expand Down Expand Up @@ -135,7 +245,9 @@ function MultimodalLLMScreen() {
<View style={styles.helloMessageContainer}>
<Text style={styles.helloText}>Hello! 👋</Text>
<Text style={styles.bottomHelloText}>
Pick an image and ask me anything about it.
{model.capabilities.find((c) => c === 'audio')
? 'Say hi, or pick an image, and ask me anything about it.'
: 'Pick an image and ask me anything about it.'}
</Text>
<SuggestedPrompts
prompts={SUGGESTED_PROMPTS}
Expand All @@ -159,6 +271,48 @@ function MultimodalLLMScreen() {
</TouchableOpacity>
)}

{/* Audio URL input */}
<View style={styles.audioUrlRow}>
<TextInput
placeholder="Audio URL (mp3/wav/…)"
placeholderTextColor="#C1C6E5"
style={styles.audioUrlInput}
value={audioUrl}
onChangeText={setAudioUrl}
autoCapitalize="none"
autoCorrect={false}
/>
<TouchableOpacity
style={[
styles.audioUrlButton,
(!audioUrl.trim() || isFetchingAudio || vlm.isGenerating) &&
styles.disabled,
]}
onPress={loadAudioFromUrl}
disabled={!audioUrl.trim() || isFetchingAudio || vlm.isGenerating}
>
<Text style={styles.audioUrlButtonText}>
{isFetchingAudio ? '…' : 'Load'}
</Text>
</TouchableOpacity>
</View>

{/* Audio attachment strip */}
{audioLabel && (
<View style={styles.audioAttachmentContainer}>
<View style={styles.audioAttachmentRow}>
<Text style={styles.audioAttachmentText}>🎵 {audioLabel}</Text>
<TouchableOpacity onPress={clearAudio}>
<Text style={styles.audioAttachmentClear}>✕</Text>
</TouchableOpacity>
</View>
<AudioWaveform
buffer={audioBuffer}
style={styles.audioWaveform}
/>
</View>
)}

<StatsBar stats={stats} />
<View
style={[
Expand All @@ -178,6 +332,17 @@ function MultimodalLLMScreen() {
<Text style={styles.imageButtonText}>📷</Text>
</TouchableOpacity>

{/* Mic record / stop button */}
<TouchableOpacity
style={styles.imageButton}
onPress={isRecording ? stopRecording : startRecording}
disabled={vlm.isGenerating}
>
<Text style={styles.imageButtonText}>
{isRecording ? '⏹️' : '🎤'}
</Text>
</TouchableOpacity>

<TextInput
autoCorrect={false}
ref={textInputRef}
Expand All @@ -198,14 +363,15 @@ function MultimodalLLMScreen() {
onChangeText={setUserInput}
/>

{userInput.trim() && !vlm.isGenerating && (
<TouchableOpacity
style={styles.sendChatTouchable}
onPress={sendMessage}
>
<SendIcon height={24} width={24} padding={4} margin={8} />
</TouchableOpacity>
)}
{(imageUri || audioBuffer || userInput.trim()) &&
!vlm.isGenerating && (
<TouchableOpacity
style={styles.sendChatTouchable}
onPress={sendMessage}
>
<SendIcon height={24} width={24} padding={4} margin={8} />
</TouchableOpacity>
)}
{vlm.isGenerating && (
<TouchableOpacity
style={styles.sendChatTouchable}
Expand Down Expand Up @@ -319,6 +485,71 @@ const styles = StyleSheet.create({
fontFamily: 'regular',
color: ColorPalette.blueDark,
},
audioAttachmentContainer: {
flexDirection: 'column',
paddingHorizontal: 16,
paddingVertical: 8,
marginHorizontal: 16,
marginBottom: 4,
borderRadius: 8,
borderWidth: 1,
borderColor: ColorPalette.blueLight,
backgroundColor: '#fafbff',
},
audioAttachmentRow: {
flexDirection: 'row',
alignItems: 'center',
justifyContent: 'space-between',
},
audioAttachmentText: {
fontSize: 13,
fontFamily: 'regular',
color: ColorPalette.blueDark,
},
audioAttachmentClear: {
fontSize: 16,
color: ColorPalette.blueDark,
paddingHorizontal: 8,
},
audioWaveform: {
marginTop: 6,
minWidth: 0,
},
audioUrlRow: {
flexDirection: 'row',
alignItems: 'center',
marginHorizontal: 16,
marginBottom: 4,
},
audioUrlInput: {
flex: 1,
padding: 10,
borderTopLeftRadius: 8,
borderBottomLeftRadius: 8,
borderWidth: 1,
borderColor: ColorPalette.blueLight,
borderRightWidth: 0,
fontFamily: 'regular',
fontSize: 13,
color: ColorPalette.primary,
},
audioUrlButton: {
paddingVertical: 10,
paddingHorizontal: 16,
backgroundColor: ColorPalette.strongPrimary,
borderTopRightRadius: 8,
borderBottomRightRadius: 8,
justifyContent: 'center',
alignItems: 'center',
},
audioUrlButtonText: {
color: '#fff',
fontFamily: 'medium',
fontSize: 13,
},
disabled: {
opacity: 0.5,
},
bottomContainer: {
height: 100,
width: '100%',
Expand Down
Loading
Loading