Skip to content

Commit 528de0a

Browse files
bizyb and claude committed
[HIGGS AUDIO] Fix: system_prompt + voice cloning work together
The previous commit wrongly made system_prompt and voice cloning mutually exclusive. The original non-vLLM POC supports BOTH:

- system_prompt: Controls style/emotion via scene descriptions
- reference_audio: Controls voice identity via in-context learning

One character voice + any emotion via scene descriptions.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 4b516ac commit 528de0a

1 file changed

Lines changed: 14 additions & 23 deletions

File tree

higgs-audio-poc/vllm-v2/serving_audio_patched.py

Lines changed: 14 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
# SPDX-License-Identifier: Apache-2.0
22
# PATCHED: Single-pass audio generation (no streaming/chunking)
33
# + Voice caching support (voice_id + voice_url)
4-
# + SFT mode (system_prompt) vs Voice cloning mode (mutually exclusive)
4+
# + system_prompt + voice cloning work together (style + voice identity)
55
print(">>> PATCHED serving_audio.py LOADED <<<", flush=True)
66
import base64
77
import hashlib
@@ -424,47 +424,38 @@ def prepare_messages(
424424
"""
425425
Build the message list for TTS generation.
426426
427-
The model was trained with TWO MUTUALLY EXCLUSIVE modes:
428-
1. SFT mode: Custom system_prompt (with scene descriptions) WITHOUT reference audio
429-
2. Voice cloning mode: Reference audio WITH default system prompt
427+
Supports BOTH system_prompt AND voice cloning together:
428+
- system_prompt: Controls style/emotion via scene descriptions
429+
- reference_audio: Controls voice identity via in-context learning
430430
431-
If system_prompt is provided, we use SFT mode (no voice cloning).
432-
If no system_prompt, we use voice cloning mode (with reference audio).
431+
This matches the original non-vLLM POC behavior.
433432
"""
434433
# Get preset fallbacks
435434
preset_audio, preset_text, preset_prompt = self.tts_voice_raw(
436435
request.voice, self.voice_presets_dir, voice_presets
437436
)
438437

439-
# Check if user provided a custom system_prompt
440-
custom_system_prompt = getattr(request, 'system_prompt', None)
441-
442-
# SFT mode: Custom system_prompt provided - use it WITHOUT reference audio
443-
# This enables emotional/style control via scene descriptions
444-
if custom_system_prompt:
445-
logger.info("Using SFT mode with custom system_prompt (no reference audio)")
446-
messages: list[ChatCompletionMessageParam] = [
447-
{"role": "system", "content": custom_system_prompt}
448-
]
449-
messages.append({"role": "user", "content": request.input})
450-
return messages
451-
452-
# Voice cloning mode: No custom system_prompt - use reference audio
453-
# Use preset's system_prompt or default, and include reference audio for voice cloning
454-
system_prompt = preset_prompt or TTS_SYSTEM_PROMPT
438+
# Resolve system prompt: request > preset > default
439+
system_prompt = getattr(request, 'system_prompt', None) or preset_prompt or TTS_SYSTEM_PROMPT
440+
441+
# Resolve voice reference for cloning
455442
reference_audio, reference_text = self._resolve_voice_reference(
456443
request, preset_audio, preset_text
457444
)
458445

459-
logger.info("Using voice cloning mode with reference audio")
446+
# Build messages: system prompt + voice clone context + user text
460447
messages: list[ChatCompletionMessageParam] = [
461448
{"role": "system", "content": system_prompt}
462449
]
463450

451+
# Add voice cloning context if reference audio available
464452
if reference_audio:
453+
logger.info("Adding voice cloning context (reference audio)")
465454
messages.extend(self._build_voice_clone_messages(reference_audio, reference_text))
466455

467456
messages.append({"role": "user", "content": request.input})
457+
458+
logger.info(f"Prepared {len(messages)} messages (system_prompt: {len(system_prompt)} chars, has_reference: {bool(reference_audio)})")
468459
return messages
469460

470461
def tts_voice_raw(self, voice: str, voice_presets_dir: str,

0 commit comments

Comments
 (0)