Skip to content

Commit 98185d6

Browse files
authored
feat(model/cosyvoice): support hot_fix params and max_prompt_audio_length (#191)
1 parent 77bc7d0 commit 98185d6

7 files changed

Lines changed: 108 additions & 0 deletions

File tree

src/main/java/com/alibaba/dashscope/aigc/multimodalconversation/MultiModalConversationParam.java

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -182,6 +182,10 @@ public JsonObject getInput() {
182182
jsonObject.addProperty(ApiKeywords.VOICE, voice.getValue());
183183
}
184184

185+
if (parameters != null && !parameters.isEmpty() && parameters.containsKey(ApiKeywords.VOICE)) {
186+
jsonObject.addProperty(ApiKeywords.VOICE, (String) parameters.get(ApiKeywords.VOICE));
187+
}
188+
185189
if (languageType != null) {
186190
jsonObject.addProperty(ApiKeywords.LANGUAGE_TYPE, languageType);
187191
}

src/main/java/com/alibaba/dashscope/audio/tts/SpeechSynthesisApiKeywords.java

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,9 @@ public class SpeechSynthesisApiKeywords {
66
public static final String TEXT_TYPE = "text_type";
77

88
public static final String FORMAT = "format";
9+
910
public static final String BIT_RATE = "bit_rate";
11+
1012
public static final String VOICE = "voice";
1113

1214
public static final String SAMPLE_RATE = "sample_rate";
@@ -21,6 +23,12 @@ public class SpeechSynthesisApiKeywords {
2123

2224
public static final String PHONEME_TIMESTAMP = "phoneme_timestamp_enabled";
2325

26+
public static final String HOT_FIX = "hot_fix";
27+
28+
public static final String PRONUNCIATION = "pronunciation";
29+
30+
public static final String REPLACE = "replace";
31+
2432
public static final String SENTENCE = "sentence";
2533

2634
public static final String WORDS = "words";
src/main/java/com/alibaba/dashscope/audio/ttsv2/ParamHotFix.java

Lines changed: 62 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,62 @@
1+
// Copyright (c) Alibaba, Inc. and its affiliates.
2+
3+
package com.alibaba.dashscope.audio.ttsv2;
4+
5+
import java.util.ArrayList;
6+
import java.util.HashMap;
7+
import java.util.List;
8+
import lombok.AllArgsConstructor;
9+
import lombok.Data;
10+
11+
/** Hot fix configuration for speech synthesis, including pronunciation and replace rules. */
12+
@Data
13+
public class ParamHotFix {
14+
15+
/** Pronunciation rules to customize specific words. */
16+
private List<PronunciationItem> pronunciation;
17+
18+
/** Replace rules to replace specific words with others. */
19+
private List<ReplaceItem> replace;
20+
21+
public ArrayList<Object> getPronunciation() {
22+
if (pronunciation == null || pronunciation.isEmpty()) {
23+
return null;
24+
}
25+
ArrayList<Object> pronunciationList = new ArrayList<>();
26+
for (PronunciationItem item : pronunciation) {
27+
HashMap<String, String> pronunciationItem = new HashMap<>();
28+
pronunciationItem.put(item.getText(), item.getPinyin());
29+
pronunciationList.add(pronunciationItem);
30+
}
31+
32+
return pronunciationList;
33+
}
34+
35+
public ArrayList<Object> getReplace() {
36+
if (replace == null || replace.isEmpty()) {
37+
return null;
38+
}
39+
ArrayList<Object> replaceList = new ArrayList<>();
40+
for (ReplaceItem item : replace) {
41+
HashMap<String, String> replaceItem = new HashMap<>();
42+
replaceItem.put(item.getText(), item.getReplacement());
43+
replaceList.add(replaceItem);
44+
}
45+
46+
return replaceList;
47+
}
48+
49+
@Data
50+
@AllArgsConstructor
51+
public static class PronunciationItem {
52+
private String text;
53+
private String pinyin;
54+
}
55+
56+
@Data
57+
@AllArgsConstructor
58+
public static class ReplaceItem {
59+
private String text;
60+
private String replacement;
61+
}
62+
}

src/main/java/com/alibaba/dashscope/audio/ttsv2/SpeechSynthesisParam.java

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -55,6 +55,8 @@ public class SpeechSynthesisParam extends FullDuplexServiceParam {
5555
@Builder.Default private List<String> languageHints = null;
5656
/** synthesis style */
5757
@Builder.Default private int style = 0;
58+
/** Hot fix configuration for pronunciation and replace rules. */
59+
@Builder.Default private ParamHotFix hotFix = null;
5860

5961
@Override
6062
public Map<String, Object> getParameters() {
@@ -83,6 +85,20 @@ public Map<String, Object> getParameters() {
8385
if (getStyle() != 0) {
8486
params.put(SpeechSynthesisApiKeywords.STYLE, getStyle());
8587
}
88+
// Add hot fix parameters if present
89+
if (getHotFix() != null) {
90+
Map<String, Object> hotFixParams = new HashMap<>();
91+
if (getHotFix().getPronunciation() != null && !getHotFix().getPronunciation().isEmpty()) {
92+
hotFixParams.put(SpeechSynthesisApiKeywords.PRONUNCIATION, getHotFix().getPronunciation());
93+
}
94+
if (getHotFix().getReplace() != null && !getHotFix().getReplace().isEmpty()) {
95+
hotFixParams.put(SpeechSynthesisApiKeywords.REPLACE, getHotFix().getReplace());
96+
}
97+
if (!hotFixParams.isEmpty()) {
98+
params.put(SpeechSynthesisApiKeywords.HOT_FIX, hotFixParams);
99+
}
100+
}
101+
86102
params.putAll(parameters);
87103
return params;
88104
}

src/main/java/com/alibaba/dashscope/audio/ttsv2/enrollment/VoiceEnrollmentParam.java

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,8 @@ public class VoiceEnrollmentParam extends HalfDuplexServiceParam {
2525

2626
private int pageIndex;
2727
private int pageSize;
28+
/** Maximum length of prompt audio in seconds. */
29+
private float maxPromptAudioLength = 10.0f;
2830

2931
protected VoiceEnrollmentParam(HalfDuplexServiceParamBuilder<?, ?> b) {
3032
super(b);
@@ -50,6 +52,9 @@ public JsonObject getInput() {
5052
if (languageHints != null) {
5153
input.add("language_hints", JsonUtils.toJsonArray(languageHints));
5254
}
55+
if (maxPromptAudioLength > 0) {
56+
input.addProperty("max_prompt_audio_length", maxPromptAudioLength);
57+
}
5358
break;
5459
case LIST:
5560
input.addProperty(ApiKeywords.ACTION, operationType.getValue());

src/main/java/com/alibaba/dashscope/audio/ttsv2/enrollment/VoiceEnrollmentService.java

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -144,6 +144,7 @@ public Voice createVoice(
144144
.languageHints(customParam.getLanguageHints())
145145
.headers(customParam.getHeaders())
146146
.resources(customParam.getResources())
147+
.maxPromptAudioLength(customParam.getMaxPromptAudioLength())
147148
.parameters(customParam.getParameters())
148149
.workspace(customParam.getWorkspace())
149150
.build();

src/test/java/com/alibaba/dashscope/TestTtsV2SpeechSynthesizer.java

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55
import static org.junit.Assert.assertEquals;
66

77
import com.alibaba.dashscope.audio.tts.SpeechSynthesisResult;
8+
import com.alibaba.dashscope.audio.ttsv2.ParamHotFix;
89
import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesisAudioFormat;
910
import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesisParam;
1011
import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesizer;
@@ -129,6 +130,16 @@ public void testStreamingCall() {
129130

130131
// 获取 URL
131132
String url = mockServer.url("/binary").toString();
133+
ParamHotFix hotFix = new ParamHotFix();
134+
ArrayList<ParamHotFix.PronunciationItem> pronunciations = new ArrayList<>();
135+
pronunciations.add(new ParamHotFix.PronunciationItem("今天", "jin1 tian1"));
136+
pronunciations.add(new ParamHotFix.PronunciationItem("草地", "cao3 di4"));
137+
hotFix.setPronunciation(pronunciations);
138+
139+
ArrayList<ParamHotFix.ReplaceItem> replaces = new ArrayList<>();
140+
replaces.add(new ParamHotFix.ReplaceItem("草地", "草弟"));
141+
replaces.add(new ParamHotFix.ReplaceItem("惠州", "汇州"));
142+
hotFix.setReplace(replaces);
132143

133144
// 在真实世界中,你会在这里做 HTTP 请求,并得到响应
134145
System.out.println("Mock Server is running at: " + url);
@@ -138,6 +149,7 @@ public void testStreamingCall() {
138149
.model("cosyvoice-v1")
139150
.voice("longxiaochun")
140151
.format(SpeechSynthesisAudioFormat.MP3_16000HZ_MONO_128KBPS)
152+
.hotFix(hotFix)
141153
.build();
142154
SpeechSynthesizer synthesizer = new SpeechSynthesizer(param, callback);
143155
synthesizer.setStartedTimeout(1000);

0 commit comments

Comments
 (0)