diff --git a/.eslintrc.js b/.eslintrc.js index b09ff37d23..fd7eca7393 100644 --- a/.eslintrc.js +++ b/.eslintrc.js @@ -8,6 +8,7 @@ const VALID_CATEGORIES = [ 'Models - Image Embeddings', 'Models - Image Generation', 'Models - LLM', + 'Models - LLM Multimodal', 'Models - Object Detection', 'Models - Instance Segmentation', 'Models - Pose Estimation', diff --git a/apps/llm/app/multimodal_llm/index.tsx b/apps/llm/app/multimodal_llm/index.tsx index 0de5004849..003c4fcb7d 100644 --- a/apps/llm/app/multimodal_llm/index.tsx +++ b/apps/llm/app/multimodal_llm/index.tsx @@ -12,6 +12,11 @@ import { View, } from 'react-native'; import { launchImageLibrary } from 'react-native-image-picker'; +import { + AudioManager, + AudioRecorder, + AudioContext, +} from 'react-native-audio-api'; import { useIsFocused } from '@react-navigation/native'; import { useSafeAreaInsets } from 'react-native-safe-area-context'; import { models, useLLM } from 'react-native-executorch'; @@ -23,12 +28,14 @@ import Spinner from '../../components/Spinner'; import { GeneratingContext } from '../../context'; import SuggestedPrompts from '../../components/SuggestedPrompts'; import ErrorBanner from '../../components/ErrorBanner'; +import AudioWaveform from '../../components/AudioWaveform'; const SUGGESTED_PROMPTS = [ "What's in this image?", 'Describe this scene in detail', 'What objects can you see?', 'What text appears in this image?', + 'Transcribe the audio', ]; import { useLLMStats } from '../../hooks/useLLMStats'; import { StatsBar } from '../../components/StatsBar'; @@ -46,12 +53,18 @@ function MultimodalLLMScreen() { const textInputRef = useRef(null); const { setGlobalGenerating } = useContext(GeneratingContext); - // Added error state - const [error, setError] = useState(null); + const [audioBuffer, setAudioBuffer] = useState(null); + const [audioLabel, setAudioLabel] = useState(null); + const [audioUrl, setAudioUrl] = useState(''); + const [isFetchingAudio, setIsFetchingAudio] = useState(false); + 
const [isRecording, setIsRecording] = useState(false); + const [hasMicPermission, setHasMicPermission] = useState(false); + const recorder = useRef(new AudioRecorder()); + const recordChunks = useRef([]); - const vlm = useLLM({ - model: models.llm.lfm2_5_vl_1_6b(), - }); + const [error, setError] = useState(null); + const model = models.llm.gemma4_e2b_multimodal(); + const vlm = useLLM({ model: model }); const tokenCount = vlm.isReady ? vlm.getGeneratedTokenCount() : 0; const { stats, onMessageSend } = useLLMStats( vlm.response, @@ -68,6 +81,95 @@ function MultimodalLLMScreen() { if (vlm.error) setError(String(vlm.error)); }, [vlm.error]); + useEffect(() => { + AudioManager.setAudioSessionOptions({ + iosCategory: 'playAndRecord', + iosMode: 'spokenAudio', + iosOptions: ['allowBluetoothHFP', 'defaultToSpeaker'], + }); + (async () => { + const status = await AudioManager.requestRecordingPermissions(); + setHasMicPermission(status === 'Granted'); + })(); + + return () => { + if (vlm.isGenerating) vlm.interrupt(); + // eslint-disable-next-line react-hooks/exhaustive-deps + recorder.current.stop(); + AudioManager.setAudioSessionActivity(false); + }; + // eslint-disable-next-line react-hooks/exhaustive-deps + }, []); + + const loadAudioFromUrl = async () => { + const url = audioUrl.trim(); + if (!url) return; + setIsFetchingAudio(true); + try { + const ctx = new AudioContext({ sampleRate: 16000 }); + const decoded = await ctx.decodeAudioData(url); + const pcm = decoded.getChannelData(0); + const name = url.split('/').pop() || 'audio'; + setAudioBuffer(pcm); + setAudioLabel(`${name} Β· ${(pcm.length / 16000).toFixed(1)}s`); + } catch (e) { + setError(e instanceof Error ? e.message : String(e)); + } finally { + setIsFetchingAudio(false); + } + }; + + const startRecording = async () => { + if (!hasMicPermission) { + setError('Microphone permission denied. 
Please enable it in Settings.'); + return; + } + recordChunks.current = []; + const sampleRate = 16000; + recorder.current.onAudioReady( + { sampleRate, bufferLength: 0.1 * sampleRate, channelCount: 1 }, + ({ buffer }) => { + recordChunks.current.push(new Float32Array(buffer.getChannelData(0))); + } + ); + try { + const ok = await AudioManager.setAudioSessionActivity(true); + if (!ok) { + setError('Cannot start audio session'); + return; + } + const result = recorder.current.start(); + if (result.status === 'error') { + setError(`Recording problems: ${result.message}`); + return; + } + setIsRecording(true); + } catch (e) { + setError(e instanceof Error ? e.message : String(e)); + } + }; + + const stopRecording = () => { + recorder.current.stop(); + setIsRecording(false); + const total = recordChunks.current.reduce((n, c) => n + c.length, 0); + if (total === 0) return; + const pcm = new Float32Array(total); + let off = 0; + for (const c of recordChunks.current) { + pcm.set(c, off); + off += c.length; + } + recordChunks.current = []; + setAudioBuffer(pcm); + setAudioLabel(`Recording Β· ${(pcm.length / 16000).toFixed(1)}s`); + }; + + const clearAudio = () => { + setAudioBuffer(null); + setAudioLabel(null); + }; + const pickImage = async () => { try { const result = await launchImageLibrary({ mediaType: 'photo' }); @@ -81,19 +183,27 @@ function MultimodalLLMScreen() { }; const sendMessage = async () => { - if (!userInput.trim() || vlm.isGenerating) return; + if (!(imageUri || audioBuffer || userInput.trim()) || vlm.isGenerating) + return; onMessageSend(); const text = userInput.trim(); setUserInput(''); textInputRef.current?.clear(); Keyboard.dismiss(); const currentImageUri = imageUri; + const currentAudio = audioBuffer; setImageUri(null); + setAudioBuffer(null); + setAudioLabel(null); try { - await vlm.sendMessage( - text, - currentImageUri ? { imagePath: currentImageUri } : undefined - ); + const media = + currentImageUri || currentAudio + ? 
{ + ...(currentImageUri ? { imagePath: currentImageUri } : {}), + ...(currentAudio ? { audioBuffer: currentAudio } : {}), + } + : undefined; + await vlm.sendMessage(text, media); } catch (e) { // Updated to set UI error instead of just console.error setError(e instanceof Error ? e.message : String(e)); @@ -135,7 +245,9 @@ function MultimodalLLMScreen() { Hello! πŸ‘‹ - Pick an image and ask me anything about it. + {model.capabilities.find((c) => c === 'audio') + ? 'Say hi, or pick an image, and ask me anything about it.' + : 'Pick an image and ask me anything about it.'} )} + {/* Audio URL input */} + + + + + {isFetchingAudio ? '…' : 'Load'} + + + + + {/* Audio attachment strip */} + {audioLabel && ( + + + 🎡 {audioLabel} + + βœ• + + + + + )} + πŸ“· + {/* Mic record / stop button */} + + + {isRecording ? '⏹️' : '🎀'} + + + - {userInput.trim() && !vlm.isGenerating && ( - - - - )} + {(imageUri || audioBuffer || userInput.trim()) && + !vlm.isGenerating && ( + + + + )} {vlm.isGenerating && ( ; +} + +const NUM_BARS = 32; + +export default function AudioWaveform({ buffer, style }: AudioWaveformProps) { + const bars = useMemo(() => { + if (!buffer || buffer.length === 0) return null; + const chunkSize = Math.max(1, Math.floor(buffer.length / NUM_BARS)); + const peaks: number[] = []; + let max = 0; + for (let i = 0; i < NUM_BARS; i++) { + const start = i * chunkSize; + const end = Math.min(start + chunkSize, buffer.length); + let peak = 0; + for (let j = start; j < end; j++) { + const v = Math.abs(buffer[j] ?? 0); + if (v > peak) peak = v; + } + peaks.push(peak); + if (peak > max) max = peak; + } + return max > 0 ? 
peaks.map((p) => p / max) : peaks; + }, [buffer]); + + if (!bars) return null; + + return ( + + {bars.map((amp, i) => ( + + ))} + + ); +} + +const styles = StyleSheet.create({ + container: { + flexDirection: 'row', + alignItems: 'center', + height: 16, + minWidth: 160, + gap: 2, + }, + bar: { + flex: 1, + borderRadius: 1, + backgroundColor: ColorPalette.blueDark, + opacity: 0.35, + }, +}); diff --git a/apps/llm/components/MessageItem.tsx b/apps/llm/components/MessageItem.tsx index 2c44714ac0..cda8609885 100644 --- a/apps/llm/components/MessageItem.tsx +++ b/apps/llm/components/MessageItem.tsx @@ -11,6 +11,7 @@ import MarkdownComponent from './MarkdownComponent'; import LlamaIcon from '../assets/icons/llama_icon.svg'; import ColorPalette from '../colors'; import { Message } from 'react-native-executorch'; +import AudioWaveform from './AudioWaveform'; interface MessageItemProps { message: Message; @@ -43,6 +44,12 @@ const MessageItem = memo(({ message, deleteMessage }: MessageItemProps) => { resizeMode="contain" /> )} + {message.audioWaveform && ( + + )} @@ -103,6 +110,9 @@ const styles = StyleSheet.create({ borderRadius: 6, marginBottom: 6, }, + userMessageWaveform: { + marginBottom: 6, + }, aiMessageIconContainer: { backgroundColor: ColorPalette.seaBlueLight, height: 32, diff --git a/apps/llm/components/llmModels.ts b/apps/llm/components/llmModels.ts index 1d80d7a395..58b8c01d74 100644 --- a/apps/llm/components/llmModels.ts +++ b/apps/llm/components/llmModels.ts @@ -10,6 +10,11 @@ const llm = models.llm; export type LLMModelSources = LLMProps['model']; export const LLM_MODELS: ModelOption[] = [ + //Gemma 4 + { + label: 'Gemma 4 E2B', + value: llm.gemma4_e2b(), + }, // Llama 3.2 { label: 'Llama 3.2 1B', diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx index 5f88f3764a..909c2da57f 100644 --- a/apps/speech/screens/SpeechToTextScreen.tsx +++ b/apps/speech/screens/SpeechToTextScreen.tsx @@ -22,12 +22,24 @@ import { 
ModelPicker, ModelOption } from '../components/ModelPicker'; const speechToText = models.speech_to_text; const vad = models.vad; +const isSimulator = DeviceInfo.isEmulatorSync(); +const backend = Platform.OS === 'ios' && !isSimulator ? 'coreml' : 'xnnpack'; + type STTModelSources = SpeechToTextProps['model']; const MODELS: ModelOption[] = [ - { label: 'Whisper Tiny EN', value: speechToText.whisper_tiny_en() }, - { label: 'Whisper Base EN', value: speechToText.whisper_base_en() }, - { label: 'Whisper Small EN', value: speechToText.whisper_small_en() }, + { + label: 'Whisper Tiny EN', + value: speechToText.whisper_tiny_en({ backend }), + }, + { + label: 'Whisper Base EN', + value: speechToText.whisper_base_en({ backend }), + }, + { + label: 'Whisper Small EN', + value: speechToText.whisper_small_en({ backend }), + }, ]; import FontAwesome from '@expo/vector-icons/FontAwesome'; import { @@ -42,12 +54,10 @@ import DeviceInfo from 'react-native-device-info'; import { VerboseTranscription } from '../components/VerboseTranscription'; import ErrorBanner from '../components/ErrorBanner'; -const isSimulator = DeviceInfo.isEmulatorSync(); - export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { const [selectedModel, setSelectedModel] = useState( - Platform.OS === 'ios' - ? speechToText.whisper_base_en() + Platform.OS === 'ios' && !isSimulator + ? 
speechToText.whisper_base_en({ backend }) : speechToText.whisper_tiny_en() ); diff --git a/docs/docs/03-hooks/01-natural-language-processing/useLLM.md b/docs/docs/03-hooks/01-natural-language-processing/useLLM.md index 7b1cb25158..29b1be4d72 100644 --- a/docs/docs/03-hooks/01-natural-language-processing/useLLM.md +++ b/docs/docs/03-hooks/01-natural-language-processing/useLLM.md @@ -56,7 +56,7 @@ The code snippet above fetches the model from the specified URL, loads it into m `useLLM` takes [`LLMProps`](../../06-api-reference/interfaces/LLMProps.md) that consists of: -- [model source](../../06-api-reference/interfaces/LLMProps.md#modelsource), [tokenizer source](../../06-api-reference/interfaces/LLMProps.md#tokenizersource), and [tokenizer config source](../../06-api-reference/interfaces/LLMProps.md#tokenizerconfigsource). +- [model](../../06-api-reference/interfaces/LLMModel.md). - An optional flag [`preventLoad`](../../06-api-reference/interfaces/SpeechToTextProps.md#preventload) which prevents auto-loading of the model. You need more details? Check the following resources: @@ -494,13 +494,13 @@ Depending on selected model and the user's device generation speed can be above ## Vision-Language Models (VLM) -Some models support multimodal input β€” text and images together. To use them, pass a `capabilities` array when loading the model. +Some models support multimodal input β€” text, images and/or audio together. To use them, pass a `capabilities` array when loading the model. ### Loading a VLM ```tsx import { models, useLLM } from 'react-native-executorch'; -const llm = useLLM({ model: models.llm.lfm2_5_vl_1_6b() }); +const llm = useLLM({ model: models.llm.gemma4_e2b_multimodal() }); ``` The `capabilities` field is already set on the model constant. 
You can also construct the model object explicitly: @@ -511,22 +511,26 @@ const llm = useLLM({ modelSource: '...', tokenizerSource: '...', tokenizerConfigSource: '...', - capabilities: ['vision'], + capabilities: ['vision', 'audio'], }, }); ``` Passing `capabilities` unlocks the typed `media` argument on `sendMessage`. -### Sending a message with an image +### Sending a message with an image or audio recording ```tsx -const llm = useLLM({ model: models.llm.lfm2_5_vl_1_6b() }); +const llm = useLLM({ model: models.llm.gemma4_e2b_multimodal() }); const send = () => { llm.sendMessage('What is in this image?', { imagePath: '/path/to/image.jpg', }); + // or + llm.sendMessage('What can you hear?', { + audioBuffer: audioRecording, + }); }; return ( @@ -538,6 +542,7 @@ return ( ``` The `imagePath` should be a local file path on the device. +The `audioBuffer` should be a `Float32Array` with 16kHz waveform. ### Functional generation with images diff --git a/docs/docs/04-typescript-api/01-natural-language-processing/LLMModule.md b/docs/docs/04-typescript-api/01-natural-language-processing/LLMModule.md index 967625160c..48b3395f87 100644 --- a/docs/docs/04-typescript-api/01-natural-language-processing/LLMModule.md +++ b/docs/docs/04-typescript-api/01-natural-language-processing/LLMModule.md @@ -118,12 +118,12 @@ Model presets expose an optional `generationConfig` that `LLMModule.fromModelNam ## Vision-Language Models (VLM) -Some models support multimodal input β€” text and images together. To use them, pass `capabilities` in the model object when calling [`fromModelName`](../../06-api-reference/classes/LLMModule.md#frommodelname): +Some models support multimodal input β€” text, images and/or audio together. 
To use them, pass `capabilities` in the model object when calling [`fromModelName`](../../06-api-reference/classes/LLMModule.md#frommodelname): ```typescript import { models, LLMModule } from 'react-native-executorch'; const llm = await LLMModule.fromModelName( - models.llm.lfm2_5_vl_1_6b(), + models.llm.gemma4_e2b_multimodal(), undefined, (token) => console.log(token) ); @@ -133,20 +133,24 @@ The `capabilities` field is already set on the model constant. You can also cons ```typescript const llm = await LLMModule.fromModelName({ - modelName: 'lfm2.5-vl-1.6b-quantized', + modelName: 'gemma4-e2b-multimodal', modelSource: require('./path/to/model.pte'), tokenizerSource: require('./path/to/tokenizer.json'), tokenizerConfigSource: require('./path/to/tokenizer_config.json'), - capabilities: ['vision'], + capabilities: ['vision', 'audio'], }); ``` -Once loaded, pass `imagePath` to [`sendMessage`](../../06-api-reference/classes/LLMModule.md#sendmessage): +Once loaded, pass `imagePath` or `audioBuffer` to [`sendMessage`](../../06-api-reference/classes/LLMModule.md#sendmessage): ```typescript const response = await llm.sendMessage('What is in this image?', { imagePath: '/path/to/image.jpg', }); +// or +const response = await llm.sendMessage('What can you hear?', { + audioBuffer: audioRecording, //expected as waveform 16kHz +}); ``` Or use [`generate`](../../06-api-reference/classes/LLMModule.md#generate) with `mediaPath` on the message: @@ -159,7 +163,14 @@ const chat: Message[] = [ mediaPath: '/path/to/image.jpg', }, ]; - +// or +const chat: Message[] = [ + { + role: 'user', + content: 'Transcribe the recording.', + audioWaveform: audioRecording, + }, +]; const response = await llm.generate(chat); ``` diff --git a/docs/versioned_docs/version-0.9.x/03-hooks/01-natural-language-processing/useLLM.md b/docs/versioned_docs/version-0.9.x/03-hooks/01-natural-language-processing/useLLM.md new file mode 100644 index 0000000000..f19920a486 --- /dev/null +++ 
b/docs/versioned_docs/version-0.9.x/03-hooks/01-natural-language-processing/useLLM.md @@ -0,0 +1,580 @@ +--- +title: useLLM +keywords: + [ + react native, + react native ai, + react native llm, + react native qwen, + react native llama, + react native executorch, + executorch, + pytorch, + on-device ai, + mobile ai, + llama 3, + qwen, + text generation, + tool calling, + function calling, + ] +description: "Learn how to use LLMs in your React Native applications with React Native ExecuTorch's useLLM hook." +--- + +React Native ExecuTorch supports a variety of LLMs (check out our [HuggingFace repository](https://huggingface.co/software-mansion) for models already converted to ExecuTorch format) including Llama 3.2. Before getting started, you’ll need to obtain the .pte binary—a serialized model, the tokenizer and tokenizer config JSON files. There are various ways to accomplish this: + +:::info +It is recommended to use models provided by us, which are available at our [HuggingFace repository](https://huggingface.co/collections/software-mansion/llm). You can also use [constants](../../06-api-reference/index.md#models---llm) shipped with our library. + +Alternatively, follow the official [tutorial](https://docs.pytorch.org/executorch/stable/llm/export-llm.html) made by the ExecuTorch team to export an arbitrary LLM model. +::: + +:::warning +Lower-end devices might not be able to fit LLMs into memory. We recommend using quantized models to reduce the memory footprint. +::: + +## API Reference + +- For detailed API Reference for `useLLM` see: [`useLLM` API Reference](../../06-api-reference/functions/useLLM.md). +- For all LLM models available out-of-the-box in React Native ExecuTorch see: [LLM Models](../../06-api-reference/index.md#models---llm). +- For useful LLM utility functionalities please refer to the following link: [LLM Utility Functionalities](../../06-api-reference/index.md#utilities---llm). 
+ +## Initializing + +In order to load a model into the app, you need to run the following code: + +```typescript +import { models, useLLM } from 'react-native-executorch'; +const llm = useLLM({ model: models.llm.lfm2_5_1_2b_instruct() }); +``` + +
+ +The code snippet above fetches the model from the specified URL, loads it into memory, and returns an object with various functions and properties for controlling the model. You can monitor the loading progress by checking the [`llm.downloadProgress`](../../06-api-reference/interfaces/LLMType.md#downloadprogress) and [`llm.isReady`](../../06-api-reference/interfaces/LLMType.md#isready) properties, and if anything goes wrong, the [`llm.error`](../../06-api-reference/interfaces/LLMType.md#error) property will contain the error message. + +### Arguments + +`useLLM` takes [`LLMProps`](../../06-api-reference/interfaces/LLMProps.md) that consists of: + +- [model source](../../06-api-reference/interfaces/LLMProps.md#modelsource), [tokenizer source](../../06-api-reference/interfaces/LLMProps.md#tokenizersource), and [tokenizer config source](../../06-api-reference/interfaces/LLMProps.md#tokenizerconfigsource). +- An optional flag [`preventLoad`](../../06-api-reference/interfaces/SpeechToTextProps.md#preventload) which prevents auto-loading of the model. + +Need more details? Check the following resources: + +- For detailed information about `useLLM` arguments check this section: [`useLLM` arguments](../../06-api-reference/functions/useLLM.md#parameters). +- For more information on loading resources, take a look at [loading models](../../01-fundamentals/02-loading-models.md) page. +- For available LLM models please check out the following list: [LLM Models](../../06-api-reference/index.md#models---llm). 
+ +### Returns + +`useLLM` returns [`LLMType`](../../06-api-reference/interfaces/LLMType.md) which provides: + +- State properties: [`response`](../../06-api-reference/interfaces/LLMType.md#response), [`token`](../../06-api-reference/interfaces/LLMType.md#token), [`isReady`](../../06-api-reference/interfaces/LLMType.md#isready), [`isGenerating`](../../06-api-reference/interfaces/LLMType.md#isgenerating), [`downloadProgress`](../../06-api-reference/interfaces/LLMType.md#downloadprogress), [`error`](../../06-api-reference/interfaces/LLMType.md#error), [`messageHistory`](../../06-api-reference/interfaces/LLMType.md#messagehistory) +- Generation methods: [`generate`](../../06-api-reference/interfaces/LLMType.md#generate), [`sendMessage`](../../06-api-reference/interfaces/LLMType.md#sendmessage), [`interrupt`](../../06-api-reference/interfaces/LLMType.md#interrupt) +- Configuration: [`configure`](../../06-api-reference/interfaces/LLMType.md#configure), [`deleteMessage`](../../06-api-reference/interfaces/LLMType.md#deletemessage) +- Token counting: [`getGeneratedTokenCount`](../../06-api-reference/interfaces/LLMType.md#getgeneratedtokencount), [`getPromptTokenCount`](../../06-api-reference/interfaces/LLMType.md#getprompttokencount), [`getTotalTokenCount`](../../06-api-reference/interfaces/LLMType.md#gettotaltokencount) + +For complete details, see the [LLMType API Reference](../../06-api-reference/interfaces/LLMType.md). + +## Functional vs managed + +You can use the functions returned from this hook in two ways: + +1. Functional/pure - we will not keep any state for you. You'll need to keep conversation history and handle function calling yourself. Use [`generate`](../../06-api-reference/interfaces/LLMType.md#generate) and [`response`](../../06-api-reference/interfaces/LLMType.md#response). Note that you don't need to run [`configure`](../../06-api-reference/interfaces/LLMType.md#configure) to use those. 
Furthermore, [`chatConfig`](../../06-api-reference/interfaces/LLMConfig.md#chatconfig) and [`toolsConfig`](../../06-api-reference/interfaces/LLMConfig.md#toolsconfig) will not have any effect on those functions. + +2. Managed/stateful - we will manage conversation state. Tool calls will be parsed and called automatically after passing appropriate callbacks. See more at [managed LLM chat](#managed-llm-chat). + +## Functional way + +### Simple generation + +To perform chat completion you can use the [`generate`](../../06-api-reference/interfaces/LLMType.md#generate) function. The [`response`](../../06-api-reference/interfaces/LLMType.md#response) value is updated with each token as it's generated, and the function returns a promise that resolves to the complete response when generation finishes. + +```tsx +const llm = useLLM({ model: models.llm.lfm2_5_1_2b_instruct() }); + +const handleGenerate = async () => { + const chat: Message[] = [ + { role: 'system', content: 'You are a helpful assistant' }, + { role: 'user', content: 'Hi!' }, + { role: 'assistant', content: 'Hi!, how can I help you?' }, + { role: 'user', content: 'What is the meaning of life?' }, + ]; + + // Chat completion - returns the generated response + const response = await llm.generate(chat); + console.log('Complete response:', response); +}; + +return ( + +