From d7b1c387ffd5a6115bb1c76ce4f8e1bbea5b05e6 Mon Sep 17 00:00:00 2001
From: Qiong Wu <qiowu@microsoft.com>
Date: Fri, 29 May 2026 18:15:41 +0800
Subject: [PATCH 1/3] fix: skip optimize_onnx for pre-quantized models

ORT Level 2 optimization fuses QDQ patterns (e.g. DQ->Conv->Q into
QLinearConv), breaking QNN/DML EP compatibility. Pre-quantized models
passed to build were still being optimized even though the log said
'Skipping optimize', causing a ~30x latency regression (500+ms vs
17ms) due to CPU fallback.

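Now build_onnx_model truly skips the optimize stage for pre-quantized
models, preserving the original QDQ graph structure. Because the
optimize/analyze loop is no longer invoked on this path, analyze is
skipped as well: the analyze iteration and unsupported-node counts are
set to 0 and the analyze details to an empty dict.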
---
 src/winml/modelkit/build/onnx.py | 15 ++++-----------
 tests/unit/build/test_onnx.py    | 10 +++++-----
 2 files changed, 9 insertions(+), 16 deletions(-)

diff --git a/src/winml/modelkit/build/onnx.py b/src/winml/modelkit/build/onnx.py
index 2e7424e99..97ef7d22a 100644
--- a/src/winml/modelkit/build/onnx.py
+++ b/src/winml/modelkit/build/onnx.py
@@ -160,17 +160,10 @@ def build_onnx_model(
             "Skipping optimize + quantize, running analyze-only."
         )
         stages_skipped.append("optimize")
-        # Optimize+analyze only, no autoconf re-optimization
-        current_path, _, analyze_iters, analyze_unsupported, analyze_details = (
-            run_optimize_analyze_loop(
-                model_path=current_path,
-                optimized_path=optimized_path,
-                config=config,
-                ep=ep,
-                device=device,
-                **onnx_kwargs,
-            )
-        )
+        # Skip optimize entirely for pre-quantized models. ORT Level 2
+        # optimization fuses QDQ patterns (e.g. DQ→Conv→Q → QLinearConv),
+        # which breaks QNN/DML EP compatibility and causes CPU fallback.
+        analyze_iters, analyze_unsupported, analyze_details = 0, 0, {}
     else:
         logger.info("Optimizing ONNX model...")
         current_path, opt_elapsed, analyze_iters, analyze_unsupported, analyze_details = (
diff --git a/tests/unit/build/test_onnx.py b/tests/unit/build/test_onnx.py
index 1c1322907..4302b6851 100644
--- a/tests/unit/build/test_onnx.py
+++ b/tests/unit/build/test_onnx.py
@@ -379,7 +379,7 @@ def test_pre_quantized_skips_optimize_and_quantize(
         assert "quantize" in result.stages_skipped
         assert "optimize" not in result.stages_completed
         assert "quantize" not in result.stages_completed
-        mock_onnx_pipeline["optimize"].assert_called_once()
+        mock_onnx_pipeline["optimize"].assert_not_called()
         mock_onnx_pipeline["quantize"].assert_not_called()
 
     def test_pre_quantized_still_compiles(
@@ -400,7 +400,7 @@ def test_pre_quantized_still_compiles(
     def test_pre_quantized_runs_analyze_only(
         self, tmp_path: Path, fake_onnx: Path, sample_onnx_config, mock_onnx_pipeline
     ) -> None:
-        """Pre-quantized path runs optimize but skips autoconf (no analyze)."""
+        """Pre-quantized path skips optimize entirely (no analyze either)."""
         mock_onnx_pipeline["is_quantized_onnx"].return_value = True
 
         output_dir = tmp_path / "output"
@@ -409,9 +409,9 @@ def test_pre_quantized_runs_analyze_only(
             config=sample_onnx_config,
             output_dir=output_dir,
         )
-        # max_optim_iterations=0 means no analyze loop runs
+        # Pre-quantized models skip both optimize and analyze to preserve QDQ structure
         mock_onnx_pipeline["analyze"].assert_not_called()
-        mock_onnx_pipeline["optimize"].assert_called_once()
+        mock_onnx_pipeline["optimize"].assert_not_called()
 
     def test_skip_optimize_kwarg(
         self, tmp_path: Path, fake_onnx: Path, sample_onnx_config, mock_onnx_pipeline
@@ -428,7 +428,7 @@ def test_skip_optimize_kwarg(
         )
         assert "optimize" in result.stages_skipped
         assert "quantize" in result.stages_skipped
-        mock_onnx_pipeline["optimize"].assert_called_once()
+        mock_onnx_pipeline["optimize"].assert_not_called()
         mock_onnx_pipeline["quantize"].assert_not_called()
 
 

From 785266981ffb5ee4ecc05f62101d705427f35ddc Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <github-actions[bot]@users.noreply.github.com>
Date: Tue, 16 Jun 2026 15:03:20 +0800
Subject: [PATCH 2/3] fix: apply optimize skip to hf.py and fix build -m
 model.onnx without -c

Address reviewer feedback:
- build/hf.py: skip optimize_onnx for pre-quantized models (same fix
  as build/onnx.py, prevents QDQ-to-QLinear fusion that breaks EP compat)
- commands/build.py: detect ONNX file input and route to
  generate_onnx_build_config by passing onnx_path (plus device and ep)
  to generate_build_config, fixing the 'not a valid JSON file' error
  when running `winml build -m model.onnx` without -c
- Update test_hf.py assertions to match new skip behavior
---
 src/winml/modelkit/build/hf.py       | 15 ++++-----------
 src/winml/modelkit/commands/build.py | 18 +++++++++++++-----
 tests/unit/build/test_hf.py          | 10 +++++-----
 3 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/src/winml/modelkit/build/hf.py b/src/winml/modelkit/build/hf.py
index 78ef8576f..3440f2683 100644
--- a/src/winml/modelkit/build/hf.py
+++ b/src/winml/modelkit/build/hf.py
@@ -255,17 +255,10 @@ def _name(base: str) -> str:
             "Skipping optimize + quantize, running analyze-only."
         )
         stages_skipped.append("optimize")
-        # Optimize+analyze only, no autoconf re-optimization
-        current_path, _, analyze_iterations, analyze_unsupported_nodes, analyze_details = (
-            run_optimize_analyze_loop(
-                model_path=current_path,
-                optimized_path=optimized_path,
-                config=config,
-                ep=ep,
-                device=device,
-                **onnx_kwargs,
-            )
-        )
+        # Skip optimize entirely for pre-quantized models. ORT Level 2
+        # optimization fuses QDQ patterns (e.g. DQ→Conv→Q → QLinearConv),
+        # which breaks QNN/DML EP compatibility and causes CPU fallback.
+        analyze_iterations, analyze_unsupported_nodes, analyze_details = 0, 0, {}
     else:
         logger.info("Optimizing ONNX model...")
         (
diff --git a/src/winml/modelkit/commands/build.py b/src/winml/modelkit/commands/build.py
index db0a65677..29d4f58c8 100644
--- a/src/winml/modelkit/commands/build.py
+++ b/src/winml/modelkit/commands/build.py
@@ -588,11 +588,19 @@ def build(
                 raise click.UsageError("-m/--model is required when -c is not provided.")
             from ..config import generate_build_config
 
-            config_or_configs = generate_build_config(
-                model_id,
-                trust_remote_code=trust_remote_code,
-                device=device,
-            )
+            # Detect ONNX file input and route to generate_onnx_build_config
+            if cli_utils.is_onnx_file_path(model_id):
+                config_or_configs = generate_build_config(
+                    onnx_path=model_id,
+                    device=device,
+                    ep=ep,
+                )
+            else:
+                config_or_configs = generate_build_config(
+                    model_id,
+                    trust_remote_code=trust_remote_code,
+                    device=device,
+                )
             if not quant:
                 config_or_configs.quant = None
             # Auto-generated configs: compile disabled by default unless
diff --git a/tests/unit/build/test_hf.py b/tests/unit/build/test_hf.py
index 38203af44..ea03d8968 100644
--- a/tests/unit/build/test_hf.py
+++ b/tests/unit/build/test_hf.py
@@ -811,7 +811,7 @@ def test_post_export_qdq_skips_optimize_and_quantize(
         assert "quantize" in result.stages_skipped
         assert "optimize" not in result.stages_completed
         assert "quantize" not in result.stages_completed
-        mock_pipeline["optimize"].assert_called_once()
+        mock_pipeline["optimize"].assert_not_called()
         mock_pipeline["quantize"].assert_not_called()
 
     def test_post_export_qdq_still_exports(
@@ -847,7 +847,7 @@ def test_post_export_qdq_still_compiles(
     def test_post_export_qdq_runs_analyze_only(
         self, tmp_path: Path, sample_config, mock_pipeline
     ) -> None:
-        """Pre-quantized path runs optimize but skips autoconf (no analyze)."""
+        """Pre-quantized path skips optimize entirely (no analyze either)."""
         mock_pipeline["is_quantized_onnx"].return_value = True
 
         output_dir = tmp_path / "output"
@@ -856,9 +856,9 @@ def test_post_export_qdq_runs_analyze_only(
             output_dir=output_dir,
             pytorch_model=mock_pipeline["model"],
         )
-        # max_optim_iterations=0 means no analyze loop runs
+        # Pre-quantized models skip both optimize and analyze to preserve QDQ structure
         mock_pipeline["analyze"].assert_not_called()
-        mock_pipeline["optimize"].assert_called_once()
+        mock_pipeline["optimize"].assert_not_called()
 
     def test_skip_optimize_kwarg(self, tmp_path: Path, sample_config, mock_pipeline) -> None:
         """skip_optimize=True forces optimize+quantize skip."""
@@ -873,7 +873,7 @@ def test_skip_optimize_kwarg(self, tmp_path: Path, sample_config, mock_pipeline)
         )
         assert "optimize" in result.stages_skipped
         assert "quantize" in result.stages_skipped
-        mock_pipeline["optimize"].assert_called_once()
+        mock_pipeline["optimize"].assert_not_called()
         mock_pipeline["quantize"].assert_not_called()
 
 

From bb0c2fda0125b3cd4563ced70a1cc4aac755c043 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <github-actions[bot]@users.noreply.github.com>
Date: Tue, 16 Jun 2026 15:28:14 +0800
Subject: [PATCH 3/3] fix: add type annotations to satisfy mypy var-annotated

---
 src/winml/modelkit/build/hf.py   | 3 ++-
 src/winml/modelkit/build/onnx.py | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/winml/modelkit/build/hf.py b/src/winml/modelkit/build/hf.py
index 3440f2683..60ee0dabc 100644
--- a/src/winml/modelkit/build/hf.py
+++ b/src/winml/modelkit/build/hf.py
@@ -258,7 +258,8 @@ def _name(base: str) -> str:
         # Skip optimize entirely for pre-quantized models. ORT Level 2
         # optimization fuses QDQ patterns (e.g. DQ→Conv→Q → QLinearConv),
         # which breaks QNN/DML EP compatibility and causes CPU fallback.
-        analyze_iterations, analyze_unsupported_nodes, analyze_details = 0, 0, {}
+        analyze_iterations, analyze_unsupported_nodes = 0, 0
+        analyze_details: dict[str, Any] = {}
     else:
         logger.info("Optimizing ONNX model...")
         (
diff --git a/src/winml/modelkit/build/onnx.py b/src/winml/modelkit/build/onnx.py
index 97ef7d22a..7b73bfe6d 100644
--- a/src/winml/modelkit/build/onnx.py
+++ b/src/winml/modelkit/build/onnx.py
@@ -163,7 +163,8 @@ def build_onnx_model(
         # Skip optimize entirely for pre-quantized models. ORT Level 2
         # optimization fuses QDQ patterns (e.g. DQ→Conv→Q → QLinearConv),
         # which breaks QNN/DML EP compatibility and causes CPU fallback.
-        analyze_iters, analyze_unsupported, analyze_details = 0, 0, {}
+        analyze_iters, analyze_unsupported = 0, 0
+        analyze_details: dict[str, Any] = {}
     else:
         logger.info("Optimizing ONNX model...")
         current_path, opt_elapsed, analyze_iters, analyze_unsupported, analyze_details = (