diff --git a/Sources/FluidAudio/TTS/Kokoro/Pipeline/Synthesize/KokoroSynthesizer.swift b/Sources/FluidAudio/TTS/Kokoro/Pipeline/Synthesize/KokoroSynthesizer.swift index 5ca3f5bf5..40a7f2071 100644 --- a/Sources/FluidAudio/TTS/Kokoro/Pipeline/Synthesize/KokoroSynthesizer.swift +++ b/Sources/FluidAudio/TTS/Kokoro/Pipeline/Synthesize/KokoroSynthesizer.swift @@ -304,7 +304,22 @@ public struct KokoroSynthesizer { zeroFill: true ) + // Source noise for newer Kokoro models + let maxSeconds = variant.maxDurationSeconds + let noiseLength = TtsConstants.audioSampleRate * maxSeconds + let sourceNoise = try await multiArrayPool.rent( + shape: [1, noiseLength, 9], + dataType: .float16, + zeroFill: false + ) + let noisePointer = sourceNoise.dataPointer.bindMemory(to: UInt16.self, capacity: noiseLength * 9) + for i in 0..<(noiseLength * 9) { + let randomValue = Float.random(in: -1...1) + noisePointer[i] = Float16(randomValue).bitPattern + } + func recycleModelArrays() async { + await multiArrayPool.recycle(sourceNoise, zeroFill: false) await multiArrayPool.recycle(phasesArray, zeroFill: true) await multiArrayPool.recycle(attentionMask, zeroFill: false) await multiArrayPool.recycle(inputArray, zeroFill: false) @@ -338,6 +353,7 @@ public struct KokoroSynthesizer { "attention_mask": attentionMask, "ref_s": refStyle, "random_phases": phasesArray, + "source_noise": sourceNoise, ]) let predictionStart = Date() diff --git a/Sources/FluidAudio/TTS/TtsModels.swift b/Sources/FluidAudio/TTS/TtsModels.swift index b4d4b2996..5fe33c8bd 100644 --- a/Sources/FluidAudio/TTS/TtsModels.swift +++ b/Sources/FluidAudio/TTS/TtsModels.swift @@ -152,11 +152,25 @@ public struct TtsModels: Sendable { randomPhases[index] = NSNumber(value: Float(0)) } + // Source noise for newer Kokoro models + let maxSeconds = variant.maxDurationSeconds + let noiseLength = TtsConstants.audioSampleRate * maxSeconds + let sourceNoise = try MLMultiArray( + shape: [1, NSNumber(value: noiseLength), 9], + dataType: .float16 + ) + let noisePointer = sourceNoise.dataPointer.bindMemory(to: UInt16.self, capacity: noiseLength * 9) + for i in 0..<(noiseLength * 9) { + let randomValue = Float.random(in: -1...1) + noisePointer[i] = Float16(randomValue).bitPattern + } + let features = try MLDictionaryFeatureProvider(dictionary: [ "input_ids": inputIds, "attention_mask": attentionMask, "ref_s": refStyle, "random_phases": randomPhases, + "source_noise": sourceNoise, ]) let options: MLPredictionOptions = optimizedPredictionOptions()