From b2cbf16cbd5d4a903007ddc42301a76eb5e41d23 Mon Sep 17 00:00:00 2001 From: hanweng9 Date: Thu, 6 Nov 2025 01:25:59 -0500 Subject: [PATCH] tried to use v2 and v3 from scratch --- README.md | 115 +++++++++++++++--- examples/cpp/parakeet_cli.cpp | 30 +++-- include/eddy/core/model_configs.hpp | 2 +- src/models/parakeet-v2/parakeet_openvino.cpp | 6 + .../parakeet-v2/parakeet_preprocessor.cpp | 25 +++- 5 files changed, 147 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index 583dc1c..c300125 100644 --- a/README.md +++ b/README.md @@ -19,9 +19,64 @@ For Apple platforms (macOS/iOS), use [FluidAudio](https://github.com/FluidInfere ## Building +### Prerequisites + +#### 1. Install OpenVINO 2025.x + +**Windows:** +Download and install from [OpenVINO Toolkit Downloads](https://www.intel.com/content/www/us/en/developer/tools/openvino-toolkit/download.html) + +Default location: `C:\Program Files (x86)\Intel\openvino_2025.0.0\` + +**Add OpenVINO to PATH** (required for runtime): +```bash +# Add to your system PATH or run before using executables: +set PATH=%PATH%;C:\Program Files (x86)\Intel\openvino_2025.0.0\runtime\bin\intel64\Release +``` + +**Linux:** +```bash +# Download and install from intel.com/openvino or use APT +wget https://storage.openvinotoolkit.org/repositories/openvino/packages/2025.0/linux/l_openvino_toolkit_ubuntu22_2025.0.0.tar.gz +tar -xvzf l_openvino_toolkit_ubuntu22_2025.0.0.tar.gz +cd l_openvino_toolkit_ubuntu22_2025.0.0 +sudo ./install_openvino_dependencies.sh +source setupvars.sh +``` + +#### 2. Install Build Tools + +**Windows:** +- [CMake 3.16+](https://cmake.org/download/) +- [Visual Studio 2019/2022](https://visualstudio.microsoft.com/) with C++ Desktop Development workload +- Git (for vcpkg) + +**Linux:** +```bash +sudo apt install cmake build-essential git +``` + +#### 3. 
Install vcpkg + +**Windows:** +```bash +git clone https://github.com/microsoft/vcpkg.git C:\vcpkg +cd C:\vcpkg +.\bootstrap-vcpkg.bat +``` + +**Linux/macOS:** +```bash +git clone https://github.com/microsoft/vcpkg.git ~/vcpkg +cd ~/vcpkg +./bootstrap-vcpkg.sh +``` + +### Build with vcpkg + ```bash # Configure with vcpkg toolchain -cmake -S . -B build -DCMAKE_TOOLCHAIN_FILE=[path-to-vcpkg]/scripts/buildsystems/vcpkg.cmake +cmake -S . -B build -DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake # Or specify OpenVINO manually if not using vcpkg cmake -S . -B build -DOpenVINO_DIR=/opt/intel/openvino/runtime/cmake @@ -32,9 +87,30 @@ cmake --build build --config Release The build produces: - **Static library**: `eddy` (linkable C++ library) -- **CLI tools**: `parakeet_cli.exe`, `whisper_example.exe` (examples) +- **CLI tools**: `parakeet_cli.exe`, `hf_fetch_models.exe` (examples) - **Benchmarks**: `benchmark_librispeech.exe`, `benchmark_fleurs.exe` +### Quick Start + +**Download models** (first time only): +```bash +# Windows (with OpenVINO in PATH) +build\examples\cpp\Release\hf_fetch_models.exe --model parakeet-v2 + +# Linux +build/examples/cpp/hf_fetch_models --model parakeet-v2 +``` + +**Test transcription:** +```bash +# Create test audio or use your own 16kHz WAV file +build\examples\cpp\Release\parakeet_cli.exe audio.wav --model parakeet-v2 --device CPU +``` + +Models are downloaded automatically on first inference if they have not been fetched manually. They are cached in: +- **Windows**: `%LOCALAPPDATA%\eddy\models\parakeet-v2\files` +- **Linux**: `~/.cache/eddy/models/parakeet-v2/files` + ### Optional: Whisper Support Whisper requires OpenVINO GenAI (not included by default): @@ -47,21 +123,21 @@ cmake -S . 
-B build -DEDDY_ENABLE_WHISPER=ON -DOpenVINOGenAI_DIR=" [options]\n\n"; std::cout << "Options:\n"; + std::cout << " --model Model version (default: parakeet-v2)\n"; + std::cout << " Options: parakeet-v2, parakeet-v3\n"; std::cout << " --device OpenVINO device (default: CPU)\n"; std::cout << " Options: CPU, AUTO\n"; std::cout << " --help Show this help message\n\n"; @@ -28,7 +30,7 @@ void print_usage(const char* program_name) { std::cout << " - Models will be loaded from cache or models/parakeet/\n\n"; std::cout << "Example:\n"; std::cout << " " << program_name << " test.wav\n"; - std::cout << " " << program_name << " test.wav --device AUTO\n"; + std::cout << " " << program_name << " test.wav --model parakeet-v3 --device CPU\n"; } int main(int argc, char* argv[]) { @@ -44,6 +46,7 @@ int main(int argc, char* argv[]) { std::string audio_file; std::string device = "CPU"; + std::string model_name = "parakeet-v2"; for (int i = 1; i < argc; i++) { std::string arg = argv[i]; @@ -51,6 +54,16 @@ int main(int argc, char* argv[]) { if (arg == "--help" || arg == "-h") { print_usage(argv[0]); return 0; + } else if (arg == "--model") { + if (i + 1 >= argc) { + std::cerr << "Error: --model requires an argument\n"; + return 1; + } + model_name = argv[++i]; + if (model_name != "parakeet-v2" && model_name != "parakeet-v3") { + std::cerr << "Error: Invalid model. Use 'parakeet-v2' or 'parakeet-v3'\n"; + return 1; + } } else if (arg == "--device") { if (i + 1 >= argc) { std::cerr << "Error: --device requires an argument\n"; @@ -69,7 +82,7 @@ int main(int argc, char* argv[]) { return 1; } - std::cout << "=== Parakeet TDT v2 Transcription CLI ===\n\n"; + std::cout << "=== Parakeet TDT Transcription CLI (" << model_name << ") ===\n\n"; try { // Load audio file @@ -85,7 +98,7 @@ int main(int argc, char* argv[]) { std::cout << "Initializing OpenVINO backend (" << device << ") ... 
"; std::cout.flush(); // Set compiled model cache to the per-model cache dir - auto compiled_cache_dir = eddy::get_model_dir("parakeet-v2").string(); + auto compiled_cache_dir = eddy::get_model_dir(model_name).string(); eddy::OpenVINOOptions ov_opts; ov_opts.device = device; ov_opts.cache_dir = compiled_cache_dir; @@ -93,7 +106,7 @@ int main(int argc, char* argv[]) { std::cout << "[OK]\n"; // Determine model directory: ensure cache has required files (centralized helper) - auto cache_model_dir = eddy::get_model_assets_dir("parakeet-v2"); + auto cache_model_dir = eddy::get_model_assets_dir(model_name); std::filesystem::path model_dir; std::string fetch_err; if (!eddy::parakeet::check_models_available(cache_model_dir, &fetch_err)) { @@ -112,7 +125,7 @@ int main(int argc, char* argv[]) { } else { // Fallback: legacy Windows path (%LOCALAPPDATA%\eddy\cache\models\\files) #if defined(_WIN32) - auto legacy_dir = eddy::get_app_data_dir() / "cache" / "models" / "parakeet-v2" / "files"; + auto legacy_dir = eddy::get_app_data_dir() / "cache" / "models" / model_name / "files"; if (exists_nonempty(legacy_dir / "parakeet_encoder.xml")) { model_dir = legacy_dir; std::cout << "Using legacy cached models at: " << legacy_dir.string() << "\n\n"; @@ -134,10 +147,11 @@ int main(int argc, char* argv[]) { .tokenizer_json = (model_dir / "parakeet_vocab.json").string() }; - // Configure runtime (v2 uses blank_token_id=1024) + // Configure runtime (v2 uses blank_token_id=1024, v3 uses blank_token_id=8192) + int blank_token_id = (model_name == "parakeet-v3") ? 8192 : 1024; eddy::parakeet::RuntimeConfig cfg{ .device = device, - .blank_token_id = 1024, + .blank_token_id = blank_token_id, .duration_bins = {0, 1, 2, 3, 4} }; @@ -228,7 +242,7 @@ int main(int argc, char* argv[]) { std::cerr << "\n[ERROR] " << e.what() << "\n\n"; std::cerr << "Troubleshooting:\n"; std::cerr << " 1. Ensure audio file is 16kHz WAV format\n"; - std::cerr << " 2. 
Check models are in: " << eddy::get_model_assets_dir("parakeet-v2").string() << "\n"; + std::cerr << " 2. Check models are in: " << eddy::get_model_assets_dir(model_name).string() << "\n"; std::cerr << " or in: models/parakeet/\n"; std::cerr << " 3. Verify OpenVINO runtime is properly installed\n"; std::cerr << " 4. Try --device CPU if AUTO fails\n"; diff --git a/include/eddy/core/model_configs.hpp b/include/eddy/core/model_configs.hpp index 8902267..129cba6 100644 --- a/include/eddy/core/model_configs.hpp +++ b/include/eddy/core/model_configs.hpp @@ -36,7 +36,7 @@ namespace model_configs { }; inline const ModelConfig PARAKEET_V3 = { - .repo_id = "FluidInference/parakeet-tdt-0.6b-v3-ov", + .repo_id = "FluidInference/parakeet-tdt-1.1b-v3-ov", .required_files = PARAKEET_STANDARD_FILES, .cache_subdir = "parakeet-v3" }; diff --git a/src/models/parakeet-v2/parakeet_openvino.cpp b/src/models/parakeet-v2/parakeet_openvino.cpp index 4cc1add..aed36cb 100644 --- a/src/models/parakeet-v2/parakeet_openvino.cpp +++ b/src/models/parakeet-v2/parakeet_openvino.cpp @@ -392,6 +392,12 @@ void OpenVINOParakeet::ensure_compiled_model() const { const auto mel_shape = impl_->encoder_ports.mel_in.value().get_shape(); impl_->encoder_expected_frames = mel_shape.empty() ? 
0U : mel_shape.back(); + // Fallback for dynamic shapes: Use 1250 frames (10 seconds at 125 frames/sec mel rate) + // This enables chunking for models with dynamic encoder inputs + if (impl_->encoder_expected_frames == 0) { + impl_->encoder_expected_frames = 1250; + } + // ======================================== // Determine encoder output indices and hidden size // ======================================== diff --git a/src/models/parakeet-v2/parakeet_preprocessor.cpp b/src/models/parakeet-v2/parakeet_preprocessor.cpp index 2754357..7a0debf 100644 --- a/src/models/parakeet-v2/parakeet_preprocessor.cpp +++ b/src/models/parakeet-v2/parakeet_preprocessor.cpp @@ -44,6 +44,17 @@ MelFeatures run_preprocessor(ParakeetImpl& impl, const AudioSegment& segment) { throw std::invalid_argument("Parakeet OpenVINO pipeline expects 16 kHz audio samples"); } + // Workaround for v3 preprocessor bug: round sample count up to the next multiple of 1000 + // v3 fails on specific odd-length audio (e.g., 240,135 samples) + AudioSegment working_segment = segment; + const size_t original_size = segment.pcm.size(); + const size_t round_to = 1000; + const size_t rounded_size = ((original_size + round_to - 1) / round_to) * round_to; + + if (rounded_size != original_size) { + working_segment.pcm.resize(rounded_size, 0.0F); // Pad with zeros + } + // Query preprocessor window size from compiled model shape. // The Parakeet ONNX model always has static shape [1, 160000], but OpenVINO // may compile it with dynamic shapes for optimization (especially on CPU). 
@@ -65,6 +76,12 @@ MelFeatures run_preprocessor(ParakeetImpl& impl, const AudioSegment& segment) { } } + // Enforce maximum window size for dynamic models + // v3 preprocessor has dynamic shape - use same 10s windows as v2 + if (window_samples == 0 && working_segment.pcm.size() > 160000) { + window_samples = 160000; // 10 seconds at 16kHz (matches v2) + } + // Query length input type once (fixed at model export) const auto len_et = impl.preproc_model.input(1).get_element_type(); const bool use_i64 = (len_et == ov::element::i64); @@ -110,8 +127,8 @@ MelFeatures run_preprocessor(ParakeetImpl& impl, const AudioSegment& segment) { // ======================================== // Single-shot path: Dynamically compiled model OR short audio that fits in one window // ======================================== - if (window_samples == 0 || segment.pcm.size() <= window_samples) { - auto [mel_tensor, length_tensor] = run_window(segment.pcm.data(), segment.pcm.size()); + if (window_samples == 0 || working_segment.pcm.size() <= window_samples) { + auto [mel_tensor, length_tensor] = run_window(working_segment.pcm.data(), working_segment.pcm.size()); const int64_t valid_frames = read_length_scalar(length_tensor); if (valid_frames <= 0) { @@ -148,7 +165,7 @@ MelFeatures run_preprocessor(ParakeetImpl& impl, const AudioSegment& segment) { // ======================================== } else { constexpr size_t kMelBins = 128; - const size_t total_samples = segment.pcm.size(); + const size_t total_samples = working_segment.pcm.size(); std::vector> mel_bins(kMelBins); size_t offset = 0; @@ -158,7 +175,7 @@ MelFeatures run_preprocessor(ParakeetImpl& impl, const AudioSegment& segment) { const size_t remaining = total_samples - offset; const size_t this_count = std::min(window_samples, remaining); - auto [mel_tensor, length_tensor] = run_window(segment.pcm.data() + offset, this_count); + auto [mel_tensor, length_tensor] = run_window(working_segment.pcm.data() + offset, this_count); const 
int64_t vframes = read_length_scalar(length_tensor); if (vframes <= 0) {