diff --git a/Documentation/TTS/Qwen3TTS.md b/Documentation/TTS/Qwen3TTS.md new file mode 100644 index 000000000..ee70c5ee0 --- /dev/null +++ b/Documentation/TTS/Qwen3TTS.md @@ -0,0 +1,104 @@ +# Qwen3-TTS: Multilingual Text-to-Speech (Beta) + +## Overview + +Qwen3-TTS is an LLM-based multilingual TTS backend built on the Qwen3 language model. It supports 10 languages including English and Chinese, producing natural speech at 24 kHz via a 4-stage CoreML pipeline. + +> **Beta.** Qwen3-TTS is in early beta. It does not yet include a built-in text tokenizer — input must be pre-tokenized externally (e.g., via the Python `qwen-tts` package). If you run into issues or have feedback, please open an issue. We'd love help testing across languages and hardware configs. + +## Quick Start + +### CLI + +```bash +# English +swift run fluidaudiocli tts --backend qwen3 \ + "Hello world, this is a test of the text to speech system." \ + --output hello.wav + +# Chinese +swift run fluidaudiocli tts --backend qwen3 \ + "你好世界,这是一个文字转语音系统的测试。" \ + --output chinese.wav +``` + +Models are auto-downloaded from HuggingFace on first run. + +### Swift + +```swift +import FluidAudio + +let manager = Qwen3TtsManager() +try await manager.initialize() + +// Token IDs must be generated externally (e.g., via Python qwen-tts processor) +let tokenIds = [9707, 1879, 11, 419, 374, 264, 1273, 315, 279, 1467, 4686, 1331, 39586, 1849, 13] +let result = try await manager.synthesizeDetailed(text: "Hello world", tokenIds: tokenIds) + +let outputURL = URL(fileURLWithPath: "/tmp/qwen3_output.wav") +try result.audio.write(to: outputURL) +``` + +## Pipeline + +``` +text tokens ──► Prefill ──► LM Decode Loop ──► Audio Decoder ──► WAV + │ │ + │ ┌────┴────┐ + │ │ CB0 │ (greedy with repetition penalty) + │ │ CB1-15 │ (code predictor, temperature sampling) + │ └─────────┘ + │ + role_ids + text_ids + speaker_embed + TTS special tokens +``` + +### Stages + +| Stage | Model | Description | +|-------|-------|-------------| +| 1. 
Prefill | `qwen3_tts_lm_prefill_v9` | Encodes text context → initial logits, KV cache, past hidden state | +| 2. LM Decode | `qwen3_tts_lm_decode_v10` | Autoregressive loop generating CB0 tokens (main codebook) | +| 3. Code Predictor | `qwen3_tts_cp_prefill` + `qwen3_tts_cp_decode` | Generates CB1-15 from past hidden + CB0 per step | +| 4. Audio Decoder | `qwen3_tts_decoder_10s` | Converts 16-layer codebook frames to 24 kHz waveform | + +## Files + +| File | Role | +|------|------| +| `Qwen3TtsManager.swift` | Public API — `initialize()`, `synthesize()` | +| `Qwen3TtsSynthesizer.swift` | Core inference pipeline — prefill, decode loop, code predictor, audio decoder | +| `Qwen3TtsModelStore.swift` | Loads and stores 6 CoreML models + embeddings from `.npy` files | +| `Qwen3TtsConstants.swift` | Model dimensions, special token IDs, sampling parameters | +| `Qwen3TtsResourceDownloader.swift` | Auto-downloads models from HuggingFace | + +## Sampling + +CB0 (main language model) uses greedy decoding with logit processors: +- Repetition penalty (1.05) on all previously generated CB0 tokens +- Token suppression: tokens 2048-3071 masked except EOS (2150) +- `min_new_tokens`: EOS suppressed for first 2 steps + +CB1-15 (code predictor) uses temperature sampling: +- Temperature: 0.9 +- Top-K: 50 +- Greedy code prediction produces silent/broken audio; temperature sampling is required. + +## Languages + +Qwen3-TTS supports 10 languages: Chinese, English, Japanese, Korean, German, French, Russian, Portuguese, Spanish, Italian. + +Language IDs are embedded via the codec embedding table during prefill (e.g., English = 2050, Chinese = 2055). + +## Limitations + +- **No built-in tokenizer.** Text must be pre-tokenized using the Qwen3 tokenizer externally. The CLI currently supports two hardcoded test sentences. + +- **Max 128 text tokens.** Longer inputs are truncated. +- **Max 125 codec frames.** Generates up to ~10 seconds of audio per call. 
+- **CPU+GPU compute.** Models run on `cpuAndGPU` compute units (no ANE optimization yet). + +## Model Source + +Models are hosted at [alexwengg/qwen3-tts-coreml](https://huggingface.co/alexwengg/qwen3-tts-coreml) on HuggingFace. + +Based on [Qwen/Qwen3-TTS-12Hz-0.6B-Base](https://huggingface.co/Qwen/Qwen3-TTS-12Hz-0.6B-Base). diff --git a/Sources/FluidAudio/DownloadUtils.swift b/Sources/FluidAudio/DownloadUtils.swift index 5191aae21..883474383 100644 --- a/Sources/FluidAudio/DownloadUtils.swift +++ b/Sources/FluidAudio/DownloadUtils.swift @@ -329,6 +329,7 @@ public class DownloadUtils { shouldInclude = patterns.isEmpty || patterns.contains { itemPath.hasPrefix($0) } || itemPath.hasSuffix(".json") || itemPath.hasSuffix(".txt") + || itemPath.hasSuffix(".npy") || itemPath.hasSuffix(".bin") } if shouldInclude { let fileSize = item["size"] as? Int ?? -1 diff --git a/Sources/FluidAudio/ModelNames.swift b/Sources/FluidAudio/ModelNames.swift index 05160cbf7..0859e6170 100644 --- a/Sources/FluidAudio/ModelNames.swift +++ b/Sources/FluidAudio/ModelNames.swift @@ -17,6 +17,7 @@ public enum Repo: String, CaseIterable { case pocketTts = "FluidInference/pocket-tts-coreml" case qwen3Asr = "FluidInference/qwen3-asr-0.6b-coreml/f32" case qwen3AsrInt8 = "FluidInference/qwen3-asr-0.6b-coreml/int8" + case qwen3Tts = "alexwengg/qwen3-tts-coreml" /// Repository slug (without owner) public var name: String { @@ -51,6 +52,8 @@ public enum Repo: String, CaseIterable { return "qwen3-asr-0.6b-coreml/f32" case .qwen3AsrInt8: return "qwen3-asr-0.6b-coreml/int8" + case .qwen3Tts: + return "qwen3-tts-coreml" } } @@ -69,6 +72,8 @@ public enum Repo: String, CaseIterable { return "FluidInference/ls-eend-coreml" case .qwen3Asr, .qwen3AsrInt8: return "FluidInference/qwen3-asr-0.6b-coreml" + case .qwen3Tts: + return "alexwengg/qwen3-tts-coreml" default: return "FluidInference/\(name)" } @@ -109,6 +114,8 @@ public enum Repo: String, CaseIterable { return "ls-eend" case .pocketTts: return "pocket-tts" 
+ case .qwen3Tts: + return "qwen3-tts" default: return name } @@ -423,6 +430,35 @@ public enum ModelNames { ] } + /// Qwen3-TTS model names (LLM-based multilingual TTS) + public enum Qwen3TTS { + public static let textProjector = "TextProjector" + public static let codeEmbedder = "CodeEmbedder" + public static let multiCodeEmbedder = "MultiCodeEmbedder" + public static let codeDecoder = "CodeDecoder" + public static let multiCodeDecoder = "MultiCodeDecoder" + public static let speechDecoder = "SpeechDecoder" + + public static let textProjectorFile = textProjector + ".mlmodelc" + public static let codeEmbedderFile = codeEmbedder + ".mlmodelc" + public static let multiCodeEmbedderFile = multiCodeEmbedder + ".mlmodelc" + public static let codeDecoderFile = codeDecoder + ".mlmodelc" + public static let multiCodeDecoderFile = multiCodeDecoder + ".mlmodelc" + public static let speechDecoderFile = speechDecoder + ".mlmodelc" + + /// Optional speaker embedding file. + public static let speakerEmbeddingFile = "speaker_embedding_official.npy" + + public static let requiredModels: Set = [ + textProjectorFile, + codeEmbedderFile, + multiCodeEmbedderFile, + codeDecoderFile, + multiCodeDecoderFile, + speechDecoderFile, + ] + } + /// Multilingual G2P (CharsiuG2P ByT5) model names public enum MultilingualG2P { public static let encoder = "MultilingualG2PEncoder" @@ -540,6 +576,8 @@ public enum ModelNames { return ModelNames.LSEEND.requiredModels case .qwen3Asr, .qwen3AsrInt8: return ModelNames.Qwen3ASR.requiredModelsFull + case .qwen3Tts: + return ModelNames.Qwen3TTS.requiredModels } } } diff --git a/Sources/FluidAudio/TTS/Kokoro/Pipeline/Synthesize/KokoroSynthesizer.swift b/Sources/FluidAudio/TTS/Kokoro/Pipeline/Synthesize/KokoroSynthesizer.swift index 5ca3f5bf5..40a7f2071 100644 --- a/Sources/FluidAudio/TTS/Kokoro/Pipeline/Synthesize/KokoroSynthesizer.swift +++ b/Sources/FluidAudio/TTS/Kokoro/Pipeline/Synthesize/KokoroSynthesizer.swift @@ -304,7 +304,22 @@ public struct 
KokoroSynthesizer { zeroFill: true ) + // Source noise for newer Kokoro models + let maxSeconds = variant.maxDurationSeconds + let noiseLength = TtsConstants.audioSampleRate * maxSeconds + let sourceNoise = try await multiArrayPool.rent( + shape: [1, noiseLength, 9], + dataType: .float16, + zeroFill: false + ) + let noisePointer = sourceNoise.dataPointer.bindMemory(to: UInt16.self, capacity: noiseLength * 9) + for i in 0..<(noiseLength * 9) { + let randomValue = Float.random(in: -1...1) + noisePointer[i] = Float16(randomValue).bitPattern + } + func recycleModelArrays() async { + await multiArrayPool.recycle(sourceNoise, zeroFill: false) await multiArrayPool.recycle(phasesArray, zeroFill: true) await multiArrayPool.recycle(attentionMask, zeroFill: false) await multiArrayPool.recycle(inputArray, zeroFill: false) @@ -338,6 +353,7 @@ public struct KokoroSynthesizer { "attention_mask": attentionMask, "ref_s": refStyle, "random_phases": phasesArray, + "source_noise": sourceNoise, ]) let predictionStart = Date() diff --git a/Sources/FluidAudio/TTS/Qwen3TTS/Qwen3TtsConstants.swift b/Sources/FluidAudio/TTS/Qwen3TTS/Qwen3TtsConstants.swift new file mode 100644 index 000000000..b1ac9db14 --- /dev/null +++ b/Sources/FluidAudio/TTS/Qwen3TTS/Qwen3TtsConstants.swift @@ -0,0 +1,90 @@ +import Foundation + +/// Constants for the Qwen3-TTS 6-model CoreML pipeline. +public enum Qwen3TtsConstants { + + // MARK: - Audio + + public static let audioSampleRate: Int = 24_000 + + /// Audio samples per codec frame (80ms at 24kHz). + public static let samplesPerFrame: Int = 1_920 + + // MARK: - Model dimensions + + public static let hiddenSize: Int = 1024 + public static let numCodebooks: Int = 16 + public static let codecVocabSize: Int = 2048 + + // MARK: - CodeDecoder KV cache + + /// Fixed KV cache sequence length for CodeDecoder. 
+ /// key_cache / value_cache shape: [1, 28672, 1, 256] float16 + public static let cdKvLen: Int = 256 + + /// Consolidated KV dimension for CodeDecoder (28 layers). + public static let cdKvDim: Int = 28_672 + + // MARK: - MultiCodeDecoder KV cache + + /// Fixed KV cache sequence length for MultiCodeDecoder. + /// key_cache / value_cache shape: [1, 5120, 1, 16] float16 + public static let mcdKvLen: Int = 16 + + /// Consolidated KV dimension for MultiCodeDecoder (5 layers). + public static let mcdKvDim: Int = 5_120 + + // MARK: - Codec special token IDs + + public static let codecPadId: Int = 2148 + public static let codecBosId: Int = 2149 + public static let codecEosId: Int = 2150 + public static let codecThinkId: Int = 2154 + public static let codecNoThinkId: Int = 2155 + public static let codecThinkBosId: Int = 2156 + public static let codecThinkEosId: Int = 2157 + + // MARK: - Language IDs + + public static let languageIds: [String: Int] = [ + "english": 2050, + "chinese": 2055, + "german": 2053, + "italian": 2070, + "portuguese": 2071, + "spanish": 2054, + "japanese": 2058, + "korean": 2064, + "french": 2061, + "russian": 2069, + ] + + // MARK: - TTS special token IDs + + public static let ttsPadTokenId: Int = 151_671 + public static let ttsBosTokenId: Int = 151_672 + public static let ttsEosTokenId: Int = 151_673 + + // MARK: - Role prefix tokens + + /// [im_start, assistant, newline] + public static let rolePrefixTokens: [Int] = [151_644, 77_091, 198] + + // MARK: - Generation parameters + + public static let maxCodecTokens: Int = 125 + public static let temperature: Float = 0.9 + public static let topK: Int = 50 + public static let repetitionPenalty: Float = 1.05 + public static let minNewTokens: Int = 2 + + // MARK: - SpeechDecoder + + /// Fixed input time dimension for SpeechDecoder: [1, 16, 125]. 
+ public static let speechDecoderFrames: Int = 125 + + // MARK: - Defaults + + public static let defaultVoice: String = "default" + public static let defaultLanguage: String = "english" +} diff --git a/Sources/FluidAudio/TTS/Qwen3TTS/Qwen3TtsManager.swift b/Sources/FluidAudio/TTS/Qwen3TTS/Qwen3TtsManager.swift new file mode 100644 index 000000000..e99736d34 --- /dev/null +++ b/Sources/FluidAudio/TTS/Qwen3TTS/Qwen3TtsManager.swift @@ -0,0 +1,132 @@ +import Foundation +import OSLog + +/// Manages text-to-speech synthesis using Qwen3-TTS CoreML models. +/// +/// - Important: **Beta.** Qwen3-TTS does not yet include a built-in text tokenizer. +/// Input must be pre-tokenized externally (e.g., via the Python `qwen-tts` package). +/// +/// Qwen3-TTS is a large language model-based TTS system that supports +/// multiple languages including English and Chinese. It uses a 4-stage +/// pipeline: prefill → LM decode → code predictor → audio decoder. +/// +/// Example usage: +/// ```swift +/// let manager = Qwen3TtsManager() +/// try await manager.loadFromDirectory(modelDirectory) +/// let audioData = try await manager.synthesize(text: "Hello world", tokenIds: [...]) +/// ``` +public actor Qwen3TtsManager { + + private let logger = AppLogger(category: "Qwen3TtsManager") + private let modelStore: Qwen3TtsModelStore + private var isInitialized = false + + /// Creates a new Qwen3-TTS manager. + public init() { + self.modelStore = Qwen3TtsModelStore() + } + + public var isAvailable: Bool { + isInitialized + } + + /// Download models from HuggingFace and initialize. + public func initialize() async throws { + try await modelStore.loadIfNeeded() + isInitialized = true + logger.notice("Qwen3TtsManager initialized (auto-download)") + } + + /// Load models from a local directory. + /// + /// - Parameter directory: Path to directory containing CoreML model bundles. 
+ public func loadFromDirectory(_ directory: URL) async throws { + try await modelStore.loadFromDirectory(directory) + isInitialized = true + logger.notice("Qwen3TtsManager initialized from \(directory.lastPathComponent)") + } + + /// Synthesize text to WAV audio data. + /// + /// - Parameters: + /// - text: The text to synthesize (for logging purposes). + /// - tokenIds: Pre-tokenized text IDs from Qwen3 tokenizer. + /// - useSpeaker: Whether to use speaker embedding (default: true). + /// - language: Language for synthesis (default: "english"). + /// - Returns: WAV audio data at 24kHz. + public func synthesize( + text: String, + tokenIds: [Int], + useSpeaker: Bool = true, + language: String = Qwen3TtsConstants.defaultLanguage + ) async throws -> Data { + guard isInitialized else { + throw TTSError.modelNotFound("Qwen3-TTS models not initialized") + } + + return try await Qwen3TtsSynthesizer.withModelStore(modelStore) { + let result = try await Qwen3TtsSynthesizer.synthesize( + text: text, + tokenIds: tokenIds, + useSpeaker: useSpeaker, + language: language + ) + return result.audio + } + } + + /// Synthesize text and return detailed results. + public func synthesizeDetailed( + text: String, + tokenIds: [Int], + useSpeaker: Bool = true, + language: String = Qwen3TtsConstants.defaultLanguage + ) async throws -> Qwen3TtsSynthesizer.SynthesisResult { + guard isInitialized else { + throw TTSError.modelNotFound("Qwen3-TTS models not initialized") + } + + return try await Qwen3TtsSynthesizer.withModelStore(modelStore) { + try await Qwen3TtsSynthesizer.synthesize( + text: text, + tokenIds: tokenIds, + useSpeaker: useSpeaker, + language: language + ) + } + } + + /// Synthesize text and write the result directly to a file. 
+ public func synthesizeToFile( + text: String, + tokenIds: [Int], + outputURL: URL, + useSpeaker: Bool = true, + language: String = Qwen3TtsConstants.defaultLanguage + ) async throws { + if FileManager.default.fileExists(atPath: outputURL.path) { + try FileManager.default.removeItem(at: outputURL) + } + + let audioData = try await synthesize( + text: text, + tokenIds: tokenIds, + useSpeaker: useSpeaker, + language: language + ) + + try audioData.write(to: outputURL) + logger.notice("Saved synthesized audio to: \(outputURL.lastPathComponent)") + } + + /// Get the underlying model store for advanced usage. + public func getModelStore() -> Qwen3TtsModelStore { + modelStore + } + + public func cleanup() async { + await modelStore.reset() + isInitialized = false + } +} diff --git a/Sources/FluidAudio/TTS/Qwen3TTS/Qwen3TtsModelStore.swift b/Sources/FluidAudio/TTS/Qwen3TTS/Qwen3TtsModelStore.swift new file mode 100644 index 000000000..21d999b89 --- /dev/null +++ b/Sources/FluidAudio/TTS/Qwen3TTS/Qwen3TtsModelStore.swift @@ -0,0 +1,225 @@ +@preconcurrency import CoreML +import Foundation +import OSLog + +/// Actor-based store for the 6 Qwen3-TTS CoreML models. +/// +/// Models: +/// - TextProjector — text token → embedding +/// - CodeEmbedder — codec token → embedding +/// - MultiCodeEmbedder — linearized codebook token → embedding +/// - CodeDecoder — 28-layer transformer with KV cache (generates CB0) +/// - MultiCodeDecoder — 5-layer transformer with KV cache (generates CB1-CB15) +/// - SpeechDecoder — codec frames → audio waveform +public actor Qwen3TtsModelStore { + + private let logger = AppLogger(category: "Qwen3TtsModelStore") + + private var textProjectorModel: MLModel? + private var codeEmbedderModel: MLModel? + private var multiCodeEmbedderModel: MLModel? + private var codeDecoderModel: MLModel? + private var multiCodeDecoderModel: MLModel? + private var speechDecoderModel: MLModel? + private var speakerEmbedding: [Float]? + private var repoDirectory: URL? 
+ + public init() {} + + /// Download models from HuggingFace and load them. + public func loadIfNeeded() async throws { + guard textProjectorModel == nil else { return } + + let repoDir = try await Qwen3TtsResourceDownloader.ensureModels() + try await loadFromDirectory(repoDir) + } + + /// Load all CoreML models from a local directory. + public func loadFromDirectory(_ directory: URL) async throws { + guard textProjectorModel == nil else { return } + + self.repoDirectory = directory + + logger.info("Loading Qwen3-TTS CoreML models from \(directory.path)...") + + // Embedding models and SpeechDecoder use CPU+GPU (float32) + let f32Config = MLModelConfiguration() + f32Config.computeUnits = .cpuAndGPU + + // CodeDecoder also uses CPU+GPU to prevent inf/NaN from ANE float16 overflow + let allConfig = MLModelConfiguration() + allConfig.computeUnits = .cpuAndGPU + + let loadStart = Date() + + textProjectorModel = try loadModel( + at: directory.appendingPathComponent(ModelNames.Qwen3TTS.textProjectorFile), + config: f32Config, name: "TextProjector") + codeEmbedderModel = try loadModel( + at: directory.appendingPathComponent(ModelNames.Qwen3TTS.codeEmbedderFile), + config: f32Config, name: "CodeEmbedder") + multiCodeEmbedderModel = try loadModel( + at: directory.appendingPathComponent(ModelNames.Qwen3TTS.multiCodeEmbedderFile), + config: f32Config, name: "MultiCodeEmbedder") + codeDecoderModel = try loadModel( + at: directory.appendingPathComponent(ModelNames.Qwen3TTS.codeDecoderFile), + config: allConfig, name: "CodeDecoder") + // MultiCodeDecoder MUST use CPU_ONLY (all other configs produce NaN) + let mcdConfig = MLModelConfiguration() + mcdConfig.computeUnits = .cpuOnly + multiCodeDecoderModel = try loadModel( + at: directory.appendingPathComponent(ModelNames.Qwen3TTS.multiCodeDecoderFile), + config: mcdConfig, name: "MultiCodeDecoder") + speechDecoderModel = try loadModel( + at: directory.appendingPathComponent(ModelNames.Qwen3TTS.speechDecoderFile), + config: 
f32Config, name: "SpeechDecoder") + + // Load optional speaker embedding + let speakerURL = directory.appendingPathComponent( + ModelNames.Qwen3TTS.speakerEmbeddingFile) + if FileManager.default.fileExists(atPath: speakerURL.path) { + speakerEmbedding = try loadNumpyFloatArray(from: speakerURL) + logger.info("Loaded speaker embedding (\(speakerEmbedding!.count) floats)") + } + + let elapsed = Date().timeIntervalSince(loadStart) + logger.info("All Qwen3-TTS models loaded in \(String(format: "%.2f", elapsed))s") + } + + // MARK: - Accessors + + public func textProjector() throws -> MLModel { + guard let model = textProjectorModel else { + throw TTSError.modelNotFound("TextProjector model not loaded") + } + return model + } + + public func codeEmbedder() throws -> MLModel { + guard let model = codeEmbedderModel else { + throw TTSError.modelNotFound("CodeEmbedder model not loaded") + } + return model + } + + public func multiCodeEmbedder() throws -> MLModel { + guard let model = multiCodeEmbedderModel else { + throw TTSError.modelNotFound("MultiCodeEmbedder model not loaded") + } + return model + } + + public func codeDecoder() throws -> MLModel { + guard let model = codeDecoderModel else { + throw TTSError.modelNotFound("CodeDecoder model not loaded") + } + return model + } + + public func multiCodeDecoder() throws -> MLModel { + guard let model = multiCodeDecoderModel else { + throw TTSError.modelNotFound("MultiCodeDecoder model not loaded") + } + return model + } + + public func speechDecoder() throws -> MLModel { + guard let model = speechDecoderModel else { + throw TTSError.modelNotFound("SpeechDecoder model not loaded") + } + return model + } + + public func speaker() -> [Float]? 
{ + speakerEmbedding + } + + public func repoDir() throws -> URL { + guard let dir = repoDirectory else { + throw TTSError.modelNotFound("Qwen3-TTS repository not loaded") + } + return dir + } + + public var isLoaded: Bool { + textProjectorModel != nil && codeEmbedderModel != nil + && multiCodeEmbedderModel != nil && codeDecoderModel != nil + && multiCodeDecoderModel != nil && speechDecoderModel != nil + } + + public func reset() { + textProjectorModel = nil + codeEmbedderModel = nil + multiCodeEmbedderModel = nil + codeDecoderModel = nil + multiCodeDecoderModel = nil + speechDecoderModel = nil + speakerEmbedding = nil + repoDirectory = nil + } + + // MARK: - Private Helpers + + private func loadModel( + at url: URL, + config: MLModelConfiguration, + name: String + ) throws -> MLModel { + let ext = url.pathExtension + + if ext == "mlpackage" { + logger.info("Compiling \(name) model...") + let compiledURL = try MLModel.compileModel(at: url) + let model = try MLModel(contentsOf: compiledURL, configuration: config) + logger.info("Loaded \(name) model (compiled)") + return model + } + + let model = try MLModel(contentsOf: url, configuration: config) + logger.info("Loaded \(name) model") + return model + } + + /// Load a numpy .npy file containing float32 array. 
+ private func loadNumpyFloatArray(from url: URL) throws -> [Float] { + let data = try Data(contentsOf: url) + + guard data.count >= 12 else { + throw TTSError.processingFailed("Invalid NPY file: too small") + } + + let magic = data.prefix(6) + guard magic == Data([0x93, 0x4E, 0x55, 0x4D, 0x50, 0x59]) else { + throw TTSError.processingFailed("Invalid NPY magic number") + } + + let majorVersion = data[6] + + let headerLen: Int + let headerOffset: Int + if majorVersion == 1 { + headerLen = Int(data[8]) | (Int(data[9]) << 8) + headerOffset = 10 + } else { + headerLen = + Int(data[8]) | (Int(data[9]) << 8) | (Int(data[10]) << 16) + | (Int(data[11]) << 24) + headerOffset = 12 + } + + let dataOffset = headerOffset + headerLen + + let floatData = data.dropFirst(dataOffset) + let count = floatData.count / 4 + var result = [Float](repeating: 0, count: count) + + floatData.withUnsafeBytes { buffer in + let floatBuffer = buffer.bindMemory(to: Float.self) + for i in 0.. URL { + let cacheDirectory = try cacheDirectory() + let modelsDirectory = cacheDirectory.appendingPathComponent("Models") + + let repoDir = modelsDirectory.appendingPathComponent(Repo.qwen3Tts.folderName) + + // Check that all required files exist + let requiredModels = ModelNames.Qwen3TTS.requiredModels + let allPresent = requiredModels.allSatisfy { model in + FileManager.default.fileExists( + atPath: repoDir.appendingPathComponent(model).path) + } + + if !allPresent { + logger.info("Downloading Qwen3-TTS models from HuggingFace...") + try await DownloadUtils.downloadRepo(.qwen3Tts, to: modelsDirectory) + } else { + logger.info("Qwen3-TTS models found in cache") + } + + return repoDir + } + + // MARK: - Private + + private static func cacheDirectory() throws -> URL { + let baseDirectory: URL + #if os(macOS) + baseDirectory = FileManager.default.homeDirectoryForCurrentUser + .appendingPathComponent(".cache") + #else + guard + let first = FileManager.default.urls( + for: .cachesDirectory, in: .userDomainMask + 
).first + else { + throw TTSError.processingFailed("Failed to locate caches directory") + } + baseDirectory = first + #endif + + let cacheDirectory = baseDirectory.appendingPathComponent("fluidaudio") + if !FileManager.default.fileExists(atPath: cacheDirectory.path) { + try FileManager.default.createDirectory( + at: cacheDirectory, withIntermediateDirectories: true) + } + return cacheDirectory + } +} diff --git a/Sources/FluidAudio/TTS/Qwen3TTS/Qwen3TtsSynthesizer.swift b/Sources/FluidAudio/TTS/Qwen3TTS/Qwen3TtsSynthesizer.swift new file mode 100644 index 000000000..5adad8fcc --- /dev/null +++ b/Sources/FluidAudio/TTS/Qwen3TTS/Qwen3TtsSynthesizer.swift @@ -0,0 +1,850 @@ +@preconcurrency import CoreML +import Foundation +import OSLog + +/// Qwen3-TTS 6-model CoreML synthesizer. +/// +/// Pipeline (Argmax-style, matching `inference.py`): +/// 1. Build prefill embeddings: TextProjector(text) + CodeEmbedder(codec) per position +/// 2. CodeDecoder prefill: feed each embedding one at a time with KV cache +/// 3. Autoregressive decode loop: +/// a. MultiCodeDecoder: hidden_states + CB0 → CB1-CB15 +/// b. Sum all 16 codec embeddings + tts_pad → CodeDecoder step → next CB0 +/// 4. SpeechDecoder: all codec frames → audio waveform +public struct Qwen3TtsSynthesizer { + + static let logger = AppLogger(category: "Qwen3TtsSynthesizer") + + private enum Context { + @TaskLocal static var modelStore: Qwen3TtsModelStore? + } + + static func withModelStore( + _ store: Qwen3TtsModelStore, + operation: () async throws -> T + ) async rethrows -> T { + try await Context.$modelStore.withValue(store) { + try await operation() + } + } + + static func currentModelStore() throws -> Qwen3TtsModelStore { + guard let store = Context.modelStore else { + throw TTSError.processingFailed( + "Qwen3TtsSynthesizer requires a model store context.") + } + return store + } + + // MARK: - Public Types + + /// Result of a Qwen3-TTS synthesis operation. 
+ public struct SynthesisResult: Sendable { + /// WAV audio data (24kHz). + public let audio: Data + /// Raw Float32 audio samples. + public let samples: [Float] + /// Number of codec tokens generated. + public let tokenCount: Int + } + + // MARK: - Public API + + /// Synthesize audio from text. + /// + /// - Parameters: + /// - text: The text to synthesize. + /// - tokenIds: Pre-tokenized text IDs. + /// - useSpeaker: Whether to use speaker embedding (default: true). + /// - language: Language for synthesis (default: "english"). + /// - Returns: A synthesis result containing WAV audio data. + public static func synthesize( + text: String, + tokenIds: [Int]? = nil, + useSpeaker: Bool = true, + language: String = Qwen3TtsConstants.defaultLanguage + ) async throws -> SynthesisResult { + let store = try currentModelStore() + + logger.info("Qwen3-TTS synthesizing: '\(text)'") + + guard let textTokens = tokenIds else { + throw TTSError.processingFailed( + "Qwen3-TTS requires pre-tokenized input. Please provide tokenIds.") + } + + // 1. Build prefill embeddings + let prefillStart = Date() + let prefillEmbeds = try await buildPrefillEmbeddings( + textTokens: textTokens, + useSpeaker: useSpeaker, + language: language, + store: store + ) + let prefillBuildTime = Date().timeIntervalSince(prefillStart) + logger.info("Built \(prefillEmbeds.count) prefill embeddings in \(String(format: "%.2f", prefillBuildTime))s") + + // 2. CodeDecoder prefill + let cdPrefillStart = Date() + var cdState = CodeDecoderKVState() + var lastOutput: CodeDecoderOutput! + + for emb in prefillEmbeds { + lastOutput = try await runCodeDecoderStep( + inputEmbeds: emb, state: &cdState, store: store) + } + let cdPrefillTime = Date().timeIntervalSince(cdPrefillStart) + logger.info( + "CodeDecoder prefill: \(prefillEmbeds.count) positions in \(String(format: "%.2f", cdPrefillTime))s" + ) + + // 3. 
Sample first CB0 from prefill logits + var logits = extractFloatArray(from: lastOutput.logits) + + suppressControlTokens(&logits) + suppressEos(&logits) // min_new_tokens: suppress EOS for step 0 + let firstCb0 = sampleTopK(logits: &logits) + var generatedCb0s: [Int] = [firstCb0] + + logger.info("First CB0: \(firstCb0)") + + // 4. Autoregressive decode loop + let decodeStart = Date() + var allFrames: [[Int]] = [] + var currentCb0 = firstCb0 + var currentHidden = lastOutput.hiddenStates + + // Cache tts_pad embedding for decode loop + let textProjector = try await store.textProjector() + let ttsPadEmbed = try runTextProjector(textProjector, tokenId: Qwen3TtsConstants.ttsPadTokenId) + let codeEmbedder = try await store.codeEmbedder() + let multiCodeEmbedder = try await store.multiCodeEmbedder() + + // PERFORMANCE: No KV cache template needed - each frame will create fresh arrays + // The first frame will call getModelStridedKVCaches(), subsequent frames will + // reuse the model's output arrays from the previous frame's final position. + var mcdKeyTemplate: MLMultiArray? = nil + var mcdValTemplate: MLMultiArray? = nil + + for step in 0..