From b2cbf16cbd5d4a903007ddc42301a76eb5e41d23 Mon Sep 17 00:00:00 2001
From: hanweng9 <hanweng9@gmail.com>
Date: Thu, 6 Nov 2025 01:25:59 -0500
Subject: [PATCH] tried to use v2 and v3 from scratch

---
 README.md                                     | 115 +++++++++++++++---
 examples/cpp/parakeet_cli.cpp                 |  30 +++--
 include/eddy/core/model_configs.hpp           |   2 +-
 src/models/parakeet-v2/parakeet_openvino.cpp  |   6 +
 .../parakeet-v2/parakeet_preprocessor.cpp     |  25 +++-
 5 files changed, 147 insertions(+), 31 deletions(-)

diff --git a/README.md b/README.md
index 583dc1c..c300125 100644
--- a/README.md
+++ b/README.md
@@ -19,9 +19,64 @@ For Apple platforms (macOS/iOS), use [FluidAudio](https://github.com/FluidInfere
 
 ## Building
 
+### Prerequisites
+
+#### 1. Install OpenVINO 2025.x
+
+**Windows:**
+Download and install from [OpenVINO Toolkit Downloads](https://www.intel.com/content/www/us/en/developer/tools/openvino-toolkit/download.html)
+
+Default location: `C:\Program Files (x86)\Intel\openvino_2025.0.0\`
+
+**Add OpenVINO to PATH** (required for runtime):
+```bat
+# Add to your system PATH or run before using executables:
+set PATH=%PATH%;C:\Program Files (x86)\Intel\openvino_2025.0.0\runtime\bin\intel64\Release
+```
+
+**Linux:**
+```bash
+# Download and install from intel.com/openvino or use APT
+wget https://storage.openvinotoolkit.org/repositories/openvino/packages/2025.0/linux/l_openvino_toolkit_ubuntu22_2025.0.0.tar.gz
+tar -xvzf l_openvino_toolkit_ubuntu22_2025.0.0.tar.gz
+cd l_openvino_toolkit_ubuntu22_2025.0.0
+sudo -E ./install_dependencies/install_openvino_dependencies.sh
+source setupvars.sh
+```
+
+#### 2. Install Build Tools
+
+**Windows:**
+- [CMake 3.16+](https://cmake.org/download/)
+- [Visual Studio 2019/2022](https://visualstudio.microsoft.com/) with C++ Desktop Development workload
+- Git (for vcpkg)
+
+**Linux:**
+```bash
+sudo apt install cmake build-essential git
+```
+
+#### 3. Install vcpkg
+
+**Windows:**
+```bat
+git clone https://github.com/microsoft/vcpkg.git C:\vcpkg
+cd C:\vcpkg
+.\bootstrap-vcpkg.bat
+```
+
+**Linux/macOS:**
+```bash
+git clone https://github.com/microsoft/vcpkg.git ~/vcpkg
+cd ~/vcpkg
+./bootstrap-vcpkg.sh
+```
+
+### Build with vcpkg
+
 ```bash
 # Configure with vcpkg toolchain
-cmake -S . -B build -DCMAKE_TOOLCHAIN_FILE=[path-to-vcpkg]/scripts/buildsystems/vcpkg.cmake
+cmake -S . -B build -DCMAKE_TOOLCHAIN_FILE=[path-to-vcpkg]/scripts/buildsystems/vcpkg.cmake  # e.g. C:/vcpkg or ~/vcpkg
 
 # Or specify OpenVINO manually if not using vcpkg
 cmake -S . -B build -DOpenVINO_DIR=/opt/intel/openvino/runtime/cmake
@@ -32,9 +87,30 @@ cmake --build build --config Release
 
 The build produces:
 - **Static library**: `eddy` (linkable C++ library)
-- **CLI tools**: `parakeet_cli.exe`, `whisper_example.exe` (examples)
+- **CLI tools**: `parakeet_cli.exe`, `hf_fetch_models.exe` (examples)
 - **Benchmarks**: `benchmark_librispeech.exe`, `benchmark_fleurs.exe`
 
+### Quick Start
+
+**Download models** (first time only):
+```bash
+# Windows (with OpenVINO in PATH)
+build\examples\cpp\Release\hf_fetch_models.exe --model parakeet-v2
+
+# Linux
+build/examples/cpp/hf_fetch_models --model parakeet-v2
+```
+
+**Test transcription:**
+```bash
+# Use your own 16 kHz WAV file (Windows path shown; on Linux: build/examples/cpp/parakeet_cli)
+build\examples\cpp\Release\parakeet_cli.exe audio.wav --model parakeet-v2 --device CPU
+```
+
+Models auto-download on first inference if not manually fetched. Cached in:
+- **Windows**: `%LOCALAPPDATA%\eddy\models\parakeet-v2\files`
+- **Linux**: `~/.cache/eddy/models/parakeet-v2/files`
+
 ### Optional: Whisper Support
 
 Whisper requires OpenVINO GenAI (not included by default):
@@ -47,21 +123,21 @@ cmake -S . -B build -DEDDY_ENABLE_WHISPER=ON -DOpenVINOGenAI_DIR="<path-to-genai
 
 ### Basic Transcription
 
-**Parakeet V2** (English only):
+**Parakeet V2** (English only, 600MB):
 ```bash
+# NPU (Intel Core Ultra)
 build/examples/cpp/Release/parakeet_cli.exe audio.wav --model parakeet-v2 --device NPU
+
+# CPU (any x86_64)
+build/examples/cpp/Release/parakeet_cli.exe audio.wav --model parakeet-v2 --device CPU
 ```
 
-**Parakeet V3** (Multilingual - 24 languages):
+**Parakeet V3** (Multilingual - 24 languages, 1.1GB):
 ```bash
-# English (default)
-build/examples/cpp/Release/parakeet_cli.exe audio.wav --model parakeet-v3 --device NPU
-
-# Spanish
-build/examples/cpp/Release/parakeet_cli.exe audio_es.wav --model parakeet-v3 --language es --device NPU
+# English
+build/examples/cpp/Release/parakeet_cli.exe audio.wav --model parakeet-v3 --device CPU
 
-# French
-build/examples/cpp/Release/parakeet_cli.exe audio_fr.wav --model parakeet-v3 --language fr --device NPU
+# Note: V3 has so far been tested only with English; multilingual support is coming soon.
 ```
 
 **Whisper** (if built with `EDDY_ENABLE_WHISPER=ON`):
@@ -71,15 +147,18 @@ build/examples/cpp/Release/whisper_example.exe path/to/whisper-model audio.wav N
 
 ### Device Selection
 
-```bash
-# NPU (best performance on Intel Core Ultra)
---device NPU
+| Device | Best For | Performance |
+|--------|----------|-------------|
+| **NPU** | Intel Core Ultra (Meteor Lake+) | 38-41× RTFx |
+| **CPU** | Any x86_64 processor | 8× RTFx |
+| **AUTO** | Let OpenVINO choose | Varies |
 
-# CPU (fallback)
---device CPU
-```
+**Audio Requirements:**
+- Format: WAV (mono or stereo)
+- Sample Rate: 16kHz
+- Bit Depth: 16-bit PCM
 
-Models auto-download from HuggingFace on first run. See [C++ API documentation](docs/CPP_API.md) for library integration.
+See [C++ API documentation](docs/CPP_API.md) for library integration.
 
 ## Models & Performance
 
diff --git a/examples/cpp/parakeet_cli.cpp b/examples/cpp/parakeet_cli.cpp
index 5d525d9..70cd10e 100644
--- a/examples/cpp/parakeet_cli.cpp
+++ b/examples/cpp/parakeet_cli.cpp
@@ -20,6 +20,8 @@
 void print_usage(const char* program_name) {
     std::cout << "Usage: " << program_name << " <audio.wav> [options]\n\n";
     std::cout << "Options:\n";
+    std::cout << "  --model <model>      Model version (default: parakeet-v2)\n";
+    std::cout << "                       Options: parakeet-v2, parakeet-v3\n";
     std::cout << "  --device <device>    OpenVINO device (default: CPU)\n";
     std::cout << "                       Options: CPU, AUTO\n";
     std::cout << "  --help              Show this help message\n\n";
@@ -28,7 +30,7 @@ void print_usage(const char* program_name) {
     std::cout << "  - Models will be loaded from cache or models/parakeet/\n\n";
     std::cout << "Example:\n";
     std::cout << "  " << program_name << " test.wav\n";
-    std::cout << "  " << program_name << " test.wav --device AUTO\n";
+    std::cout << "  " << program_name << " test.wav --model parakeet-v3 --device CPU\n";
 }
 
 int main(int argc, char* argv[]) {
@@ -44,6 +46,7 @@ int main(int argc, char* argv[]) {
 
     std::string audio_file;
     std::string device = "CPU";
+    std::string model_name = "parakeet-v2";
 
     for (int i = 1; i < argc; i++) {
         std::string arg = argv[i];
@@ -51,6 +54,16 @@ int main(int argc, char* argv[]) {
         if (arg == "--help" || arg == "-h") {
             print_usage(argv[0]);
             return 0;
+        } else if (arg == "--model") {
+            if (i + 1 >= argc) {
+                std::cerr << "Error: --model requires an argument\n";
+                return 1;
+            }
+            model_name = argv[++i];
+            if (model_name != "parakeet-v2" && model_name != "parakeet-v3") {
+                std::cerr << "Error: Invalid model. Use 'parakeet-v2' or 'parakeet-v3'\n";
+                return 1;
+            }
         } else if (arg == "--device") {
             if (i + 1 >= argc) {
                 std::cerr << "Error: --device requires an argument\n";
@@ -69,7 +82,7 @@ int main(int argc, char* argv[]) {
         return 1;
     }
 
-    std::cout << "=== Parakeet TDT v2 Transcription CLI ===\n\n";
+    std::cout << "=== Parakeet TDT Transcription CLI (" << model_name << ") ===\n\n";
 
     try {
         // Load audio file
@@ -85,7 +98,7 @@ int main(int argc, char* argv[]) {
         std::cout << "Initializing OpenVINO backend (" << device << ") ... ";
         std::cout.flush();
         // Set compiled model cache to the per-model cache dir
-        auto compiled_cache_dir = eddy::get_model_dir("parakeet-v2").string();
+        auto compiled_cache_dir = eddy::get_model_dir(model_name).string();
         eddy::OpenVINOOptions ov_opts;
         ov_opts.device = device;
         ov_opts.cache_dir = compiled_cache_dir;
@@ -93,7 +106,7 @@ int main(int argc, char* argv[]) {
         std::cout << "[OK]\n";
 
         // Determine model directory: ensure cache has required files (centralized helper)
-        auto cache_model_dir = eddy::get_model_assets_dir("parakeet-v2");
+        auto cache_model_dir = eddy::get_model_assets_dir(model_name);
         std::filesystem::path model_dir;
         std::string fetch_err;
         if (!eddy::parakeet::check_models_available(cache_model_dir, &fetch_err)) {
@@ -112,7 +125,7 @@ int main(int argc, char* argv[]) {
         } else {
             // Fallback: legacy Windows path (%LOCALAPPDATA%\eddy\cache\models\<name>\files)
 #if defined(_WIN32)
-            auto legacy_dir = eddy::get_app_data_dir() / "cache" / "models" / "parakeet-v2" / "files";
+            auto legacy_dir = eddy::get_app_data_dir() / "cache" / "models" / model_name / "files";
             if (exists_nonempty(legacy_dir / "parakeet_encoder.xml")) {
                 model_dir = legacy_dir;
                 std::cout << "Using legacy cached models at: " << legacy_dir.string() << "\n\n";
@@ -134,10 +147,11 @@ int main(int argc, char* argv[]) {
             .tokenizer_json = (model_dir / "parakeet_vocab.json").string()
         };
 
-        // Configure runtime (v2 uses blank_token_id=1024)
+        // Configure runtime (v2 uses blank_token_id=1024, v3 uses blank_token_id=8192)
+        int blank_token_id = (model_name == "parakeet-v3") ? 8192 : 1024;
         eddy::parakeet::RuntimeConfig cfg{
             .device = device,
-            .blank_token_id = 1024,
+            .blank_token_id = blank_token_id,
             .duration_bins = {0, 1, 2, 3, 4}
         };
 
@@ -228,7 +242,7 @@ int main(int argc, char* argv[]) {
         std::cerr << "\n[ERROR] " << e.what() << "\n\n";
         std::cerr << "Troubleshooting:\n";
         std::cerr << "  1. Ensure audio file is 16kHz WAV format\n";
-        std::cerr << "  2. Check models are in: " << eddy::get_model_assets_dir("parakeet-v2").string() << "\n";
+        std::cerr << "  2. Check models are in: " << eddy::get_model_assets_dir(model_name).string() << "\n";
         std::cerr << "     or in: models/parakeet/\n";
         std::cerr << "  3. Verify OpenVINO runtime is properly installed\n";
         std::cerr << "  4. Try --device CPU if AUTO fails\n";
diff --git a/include/eddy/core/model_configs.hpp b/include/eddy/core/model_configs.hpp
index 8902267..129cba6 100644
--- a/include/eddy/core/model_configs.hpp
+++ b/include/eddy/core/model_configs.hpp
@@ -36,7 +36,7 @@ namespace model_configs {
     };
 
     inline const ModelConfig PARAKEET_V3 = {
-        .repo_id = "FluidInference/parakeet-tdt-0.6b-v3-ov",
+        .repo_id = "FluidInference/parakeet-tdt-1.1b-v3-ov",
         .required_files = PARAKEET_STANDARD_FILES,
         .cache_subdir = "parakeet-v3"
     };
diff --git a/src/models/parakeet-v2/parakeet_openvino.cpp b/src/models/parakeet-v2/parakeet_openvino.cpp
index 4cc1add..aed36cb 100644
--- a/src/models/parakeet-v2/parakeet_openvino.cpp
+++ b/src/models/parakeet-v2/parakeet_openvino.cpp
@@ -392,6 +392,12 @@ void OpenVINOParakeet::ensure_compiled_model() const {
     const auto mel_shape = impl_->encoder_ports.mel_in.value().get_shape();
     impl_->encoder_expected_frames = mel_shape.empty() ? 0U : mel_shape.back();
 
+    // Fallback for dynamic shapes: Use 1250 frames (10 seconds at 125 frames/sec mel rate)
+    // This enables chunking for models with dynamic encoder inputs
+    if (impl_->encoder_expected_frames == 0) {
+        impl_->encoder_expected_frames = 1250;
+    }
+
     // ========================================
     // Determine encoder output indices and hidden size
     // ========================================
diff --git a/src/models/parakeet-v2/parakeet_preprocessor.cpp b/src/models/parakeet-v2/parakeet_preprocessor.cpp
index 2754357..7a0debf 100644
--- a/src/models/parakeet-v2/parakeet_preprocessor.cpp
+++ b/src/models/parakeet-v2/parakeet_preprocessor.cpp
@@ -44,6 +44,17 @@ MelFeatures run_preprocessor(ParakeetImpl& impl, const AudioSegment& segment) {
     throw std::invalid_argument("Parakeet OpenVINO pipeline expects 16 kHz audio samples");
   }
 
+  // Workaround for v3 preprocessor bug: round sample count up to the next multiple of 1000
+  // v3 fails on specific odd-length audio (e.g., 240,135 samples)
+  AudioSegment working_segment = segment;
+  const size_t original_size = segment.pcm.size();
+  const size_t round_to = 1000;
+  const size_t rounded_size = ((original_size + round_to - 1) / round_to) * round_to;
+
+  if (rounded_size != original_size) {
+    working_segment.pcm.resize(rounded_size, 0.0F);  // Pad with zeros
+  }
+
   // Query preprocessor window size from compiled model shape.
   // The Parakeet ONNX model always has static shape [1, 160000], but OpenVINO
   // may compile it with dynamic shapes for optimization (especially on CPU).
@@ -65,6 +76,12 @@ MelFeatures run_preprocessor(ParakeetImpl& impl, const AudioSegment& segment) {
     }
   }
 
+  // Enforce maximum window size for dynamic models
+  // v3 preprocessor has dynamic shape - use same 10s windows as v2
+  if (window_samples == 0 && working_segment.pcm.size() > 160000) {
+    window_samples = 160000;  // 10 seconds at 16kHz (matches v2)
+  }
+
   // Query length input type once (fixed at model export)
   const auto len_et = impl.preproc_model.input(1).get_element_type();
   const bool use_i64 = (len_et == ov::element::i64);
@@ -110,8 +127,8 @@ MelFeatures run_preprocessor(ParakeetImpl& impl, const AudioSegment& segment) {
   // ========================================
   // Single-shot path: Dynamically compiled model OR short audio that fits in one window
   // ========================================
-  if (window_samples == 0 || segment.pcm.size() <= window_samples) {
-    auto [mel_tensor, length_tensor] = run_window(segment.pcm.data(), segment.pcm.size());
+  if (window_samples == 0 || working_segment.pcm.size() <= window_samples) {
+    auto [mel_tensor, length_tensor] = run_window(working_segment.pcm.data(), working_segment.pcm.size());
 
     const int64_t valid_frames = read_length_scalar(length_tensor);
     if (valid_frames <= 0) {
@@ -148,7 +165,7 @@ MelFeatures run_preprocessor(ParakeetImpl& impl, const AudioSegment& segment) {
   // ========================================
   } else {
     constexpr size_t kMelBins = 128;
-    const size_t total_samples = segment.pcm.size();
+    const size_t total_samples = working_segment.pcm.size();
 
     std::vector<std::vector<float>> mel_bins(kMelBins);
     size_t offset = 0;
@@ -158,7 +175,7 @@ MelFeatures run_preprocessor(ParakeetImpl& impl, const AudioSegment& segment) {
       const size_t remaining = total_samples - offset;
       const size_t this_count = std::min(window_samples, remaining);
 
-      auto [mel_tensor, length_tensor] = run_window(segment.pcm.data() + offset, this_count);
+      auto [mel_tensor, length_tensor] = run_window(working_segment.pcm.data() + offset, this_count);
       const int64_t vframes = read_length_scalar(length_tensor);
 
       if (vframes <= 0) {