|
1 | 1 | # SPDX-License-Identifier: Apache-2.0 |
2 | 2 | # PATCHED: Single-pass audio generation (no streaming/chunking) |
3 | 3 | # + Voice caching support (voice_id + voice_url) |
4 | | -# + SFT mode (system_prompt) vs Voice cloning mode (mutually exclusive) |
| 4 | +# + system_prompt + voice cloning work together (style + voice identity) |
5 | 5 | print(">>> PATCHED serving_audio.py LOADED <<<", flush=True) |
6 | 6 | import base64 |
7 | 7 | import hashlib |
@@ -424,47 +424,38 @@ def prepare_messages( |
424 | 424 | """ |
425 | 425 | Build the message list for TTS generation. |
426 | 426 |
|
427 | | - The model was trained with TWO MUTUALLY EXCLUSIVE modes: |
428 | | - 1. SFT mode: Custom system_prompt (with scene descriptions) WITHOUT reference audio |
429 | | - 2. Voice cloning mode: Reference audio WITH default system prompt |
| 427 | + Supports BOTH system_prompt AND voice cloning together: |
| 428 | + - system_prompt: Controls style/emotion via scene descriptions |
| 429 | + - reference_audio: Controls voice identity via in-context learning |
430 | 430 |
|
431 | | - If system_prompt is provided, we use SFT mode (no voice cloning). |
432 | | - If no system_prompt, we use voice cloning mode (with reference audio). |
| 431 | + This matches the original non-vLLM POC behavior. |
433 | 432 | """ |
434 | 433 | # Get preset fallbacks |
435 | 434 | preset_audio, preset_text, preset_prompt = self.tts_voice_raw( |
436 | 435 | request.voice, self.voice_presets_dir, voice_presets |
437 | 436 | ) |
438 | 437 |
|
439 | | - # Check if user provided a custom system_prompt |
440 | | - custom_system_prompt = getattr(request, 'system_prompt', None) |
441 | | - |
442 | | - # SFT mode: Custom system_prompt provided - use it WITHOUT reference audio |
443 | | - # This enables emotional/style control via scene descriptions |
444 | | - if custom_system_prompt: |
445 | | - logger.info("Using SFT mode with custom system_prompt (no reference audio)") |
446 | | - messages: list[ChatCompletionMessageParam] = [ |
447 | | - {"role": "system", "content": custom_system_prompt} |
448 | | - ] |
449 | | - messages.append({"role": "user", "content": request.input}) |
450 | | - return messages |
451 | | - |
452 | | - # Voice cloning mode: No custom system_prompt - use reference audio |
453 | | - # Use preset's system_prompt or default, and include reference audio for voice cloning |
454 | | - system_prompt = preset_prompt or TTS_SYSTEM_PROMPT |
| 438 | + # Resolve system prompt: request > preset > default |
| 439 | + system_prompt = getattr(request, 'system_prompt', None) or preset_prompt or TTS_SYSTEM_PROMPT |
| 440 | + |
| 441 | + # Resolve voice reference for cloning |
455 | 442 | reference_audio, reference_text = self._resolve_voice_reference( |
456 | 443 | request, preset_audio, preset_text |
457 | 444 | ) |
458 | 445 |
|
459 | | - logger.info("Using voice cloning mode with reference audio") |
| 446 | + # Build messages: system prompt + voice clone context + user text |
460 | 447 | messages: list[ChatCompletionMessageParam] = [ |
461 | 448 | {"role": "system", "content": system_prompt} |
462 | 449 | ] |
463 | 450 |
|
| 451 | + # Add voice cloning context if reference audio available |
464 | 452 | if reference_audio: |
| 453 | + logger.info("Adding voice cloning context (reference audio)") |
465 | 454 | messages.extend(self._build_voice_clone_messages(reference_audio, reference_text)) |
466 | 455 |
|
467 | 456 | messages.append({"role": "user", "content": request.input}) |
| 457 | + |
| 458 | + logger.info(f"Prepared {len(messages)} messages (system_prompt: {len(system_prompt)} chars, has_reference: {bool(reference_audio)})") |
468 | 459 | return messages |
469 | 460 |
|
470 | 461 | def tts_voice_raw(self, voice: str, voice_presets_dir: str, |
|
0 commit comments