From d7b1c387ffd5a6115bb1c76ce4f8e1bbea5b05e6 Mon Sep 17 00:00:00 2001 From: Qiong Wu Date: Fri, 29 May 2026 18:15:41 +0800 Subject: [PATCH 1/3] fix: skip optimize_onnx for pre-quantized models ORT Level 2 optimization fuses QDQ patterns (e.g. DQ->Conv->Q into QLinearConv), breaking QNN/DML EP compatibility. Pre-quantized models passed to build were being optimized despite the log saying 'skipping optimize', causing ~30x latency regression (500+ms vs 17ms) due to CPU fallback. Now build_onnx_model truly skips the optimize stage for pre-quantized models, preserving the original QDQ graph structure. --- src/winml/modelkit/build/onnx.py | 15 ++++----------- tests/unit/build/test_onnx.py | 10 +++++----- 2 files changed, 9 insertions(+), 16 deletions(-) diff --git a/src/winml/modelkit/build/onnx.py b/src/winml/modelkit/build/onnx.py index 2e7424e99..97ef7d22a 100644 --- a/src/winml/modelkit/build/onnx.py +++ b/src/winml/modelkit/build/onnx.py @@ -160,17 +160,10 @@ def build_onnx_model( "Skipping optimize + quantize, running analyze-only." ) stages_skipped.append("optimize") - # Optimize+analyze only, no autoconf re-optimization - current_path, _, analyze_iters, analyze_unsupported, analyze_details = ( - run_optimize_analyze_loop( - model_path=current_path, - optimized_path=optimized_path, - config=config, - ep=ep, - device=device, - **onnx_kwargs, - ) - ) + # Skip optimize entirely for pre-quantized models. ORT Level 2 + # optimization fuses QDQ patterns (e.g. DQ→Conv→Q → QLinearConv), + # which breaks QNN/DML EP compatibility and causes CPU fallback. + analyze_iters, analyze_unsupported, analyze_details = 0, 0, {} else: logger.info("Optimizing ONNX model...") current_path, opt_elapsed, analyze_iters, analyze_unsupported, analyze_details = ( diff --git a/tests/unit/build/test_onnx.py b/tests/unit/build/test_onnx.py index 1c1322907..4302b6851 100644 --- a/tests/unit/build/test_onnx.py +++ b/tests/unit/build/test_onnx.py @@ -379,7 +379,7 @@ def test_pre_quantized_skips_optimize_and_quantize( assert "quantize" in result.stages_skipped assert "optimize" not in result.stages_completed assert "quantize" not in result.stages_completed - mock_onnx_pipeline["optimize"].assert_called_once() + mock_onnx_pipeline["optimize"].assert_not_called() mock_onnx_pipeline["quantize"].assert_not_called() def test_pre_quantized_still_compiles( @@ -400,7 +400,7 @@ def test_pre_quantized_still_compiles( def test_pre_quantized_runs_analyze_only( self, tmp_path: Path, fake_onnx: Path, sample_onnx_config, mock_onnx_pipeline ) -> None: - """Pre-quantized path runs optimize but skips autoconf (no analyze).""" + """Pre-quantized path skips optimize entirely (no analyze either).""" mock_onnx_pipeline["is_quantized_onnx"].return_value = True output_dir = tmp_path / "output" @@ -409,9 +409,9 @@ def test_pre_quantized_runs_analyze_only( config=sample_onnx_config, output_dir=output_dir, ) - # max_optim_iterations=0 means no analyze loop runs + # Pre-quantized models skip both optimize and analyze to preserve QDQ structure mock_onnx_pipeline["analyze"].assert_not_called() - mock_onnx_pipeline["optimize"].assert_called_once() + mock_onnx_pipeline["optimize"].assert_not_called() def test_skip_optimize_kwarg( self, tmp_path: Path, fake_onnx: Path, sample_onnx_config, mock_onnx_pipeline @@ -428,7 +428,7 @@ def test_skip_optimize_kwarg( ) assert "optimize" in result.stages_skipped assert "quantize" in result.stages_skipped - mock_onnx_pipeline["optimize"].assert_called_once() + mock_onnx_pipeline["optimize"].assert_not_called() mock_onnx_pipeline["quantize"].assert_not_called() From 785266981ffb5ee4ecc05f62101d705427f35ddc Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 16 Jun 2026 15:03:20 +0800 Subject: [PATCH 2/3] fix: apply optimize skip to hf.py and fix build -m model.onnx without -c Address reviewer feedback: - build/hf.py: skip optimize_onnx for pre-quantized models (same fix as build/onnx.py, prevents QDQ-to-QLinear fusion that breaks EP compat) - commands/build.py: detect ONNX file input and route to generate_onnx_build_config via onnx_path param, fixing 'not a valid JSON file' error when running winml build -m model.onnx without -c - Update test_hf.py assertions to match new skip behavior --- src/winml/modelkit/build/hf.py | 15 ++++----------- src/winml/modelkit/commands/build.py | 18 +++++++++++++----- tests/unit/build/test_hf.py | 10 +++++----- 3 files changed, 22 insertions(+), 21 deletions(-) diff --git a/src/winml/modelkit/build/hf.py b/src/winml/modelkit/build/hf.py index 78ef8576f..3440f2683 100644 --- a/src/winml/modelkit/build/hf.py +++ b/src/winml/modelkit/build/hf.py @@ -255,17 +255,10 @@ def _name(base: str) -> str: "Skipping optimize + quantize, running analyze-only." ) stages_skipped.append("optimize") - # Optimize+analyze only, no autoconf re-optimization - current_path, _, analyze_iterations, analyze_unsupported_nodes, analyze_details = ( - run_optimize_analyze_loop( - model_path=current_path, - optimized_path=optimized_path, - config=config, - ep=ep, - device=device, - **onnx_kwargs, - ) - ) + # Skip optimize entirely for pre-quantized models. ORT Level 2 + # optimization fuses QDQ patterns (e.g. DQ→Conv→Q → QLinearConv), + # which breaks QNN/DML EP compatibility and causes CPU fallback. + analyze_iterations, analyze_unsupported_nodes, analyze_details = 0, 0, {} else: logger.info("Optimizing ONNX model...") ( diff --git a/src/winml/modelkit/commands/build.py b/src/winml/modelkit/commands/build.py index db0a65677..29d4f58c8 100644 --- a/src/winml/modelkit/commands/build.py +++ b/src/winml/modelkit/commands/build.py @@ -588,11 +588,19 @@ def build( raise click.UsageError("-m/--model is required when -c is not provided.") from ..config import generate_build_config - config_or_configs = generate_build_config( - model_id, - trust_remote_code=trust_remote_code, - device=device, - ) + # Detect ONNX file input and route to generate_onnx_build_config + if cli_utils.is_onnx_file_path(model_id): + config_or_configs = generate_build_config( + onnx_path=model_id, + device=device, + ep=ep, + ) + else: + config_or_configs = generate_build_config( + model_id, + trust_remote_code=trust_remote_code, + device=device, + ) if not quant: config_or_configs.quant = None # Auto-generated configs: compile disabled by default unless diff --git a/tests/unit/build/test_hf.py b/tests/unit/build/test_hf.py index 38203af44..ea03d8968 100644 --- a/tests/unit/build/test_hf.py +++ b/tests/unit/build/test_hf.py @@ -811,7 +811,7 @@ def test_post_export_qdq_skips_optimize_and_quantize( assert "quantize" in result.stages_skipped assert "optimize" not in result.stages_completed assert "quantize" not in result.stages_completed - mock_pipeline["optimize"].assert_called_once() + mock_pipeline["optimize"].assert_not_called() mock_pipeline["quantize"].assert_not_called() def test_post_export_qdq_still_exports( @@ -847,7 +847,7 @@ def test_post_export_qdq_still_compiles( def test_post_export_qdq_runs_analyze_only( self, tmp_path: Path, sample_config, mock_pipeline ) -> None: - """Pre-quantized path runs optimize but skips autoconf (no analyze).""" + """Pre-quantized path skips optimize entirely (no analyze either).""" mock_pipeline["is_quantized_onnx"].return_value = True output_dir = tmp_path / "output" @@ -856,9 +856,9 @@ def test_post_export_qdq_runs_analyze_only( output_dir=output_dir, pytorch_model=mock_pipeline["model"], ) - # max_optim_iterations=0 means no analyze loop runs + # Pre-quantized models skip both optimize and analyze to preserve QDQ structure mock_pipeline["analyze"].assert_not_called() - mock_pipeline["optimize"].assert_called_once() + mock_pipeline["optimize"].assert_not_called() def test_skip_optimize_kwarg(self, tmp_path: Path, sample_config, mock_pipeline) -> None: """skip_optimize=True forces optimize+quantize skip.""" @@ -873,7 +873,7 @@ def test_skip_optimize_kwarg(self, tmp_path: Path, sample_config, mock_pipeline) ) assert "optimize" in result.stages_skipped assert "quantize" in result.stages_skipped - mock_pipeline["optimize"].assert_called_once() + mock_pipeline["optimize"].assert_not_called() mock_pipeline["quantize"].assert_not_called() From bb0c2fda0125b3cd4563ced70a1cc4aac755c043 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 16 Jun 2026 15:28:14 +0800 Subject: [PATCH 3/3] fix: add type annotations to satisfy mypy var-annotated --- src/winml/modelkit/build/hf.py | 3 ++- src/winml/modelkit/build/onnx.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/winml/modelkit/build/hf.py b/src/winml/modelkit/build/hf.py index 3440f2683..60ee0dabc 100644 --- a/src/winml/modelkit/build/hf.py +++ b/src/winml/modelkit/build/hf.py @@ -258,7 +258,8 @@ def _name(base: str) -> str: # Skip optimize entirely for pre-quantized models. ORT Level 2 # optimization fuses QDQ patterns (e.g. DQ→Conv→Q → QLinearConv), # which breaks QNN/DML EP compatibility and causes CPU fallback. - analyze_iterations, analyze_unsupported_nodes, analyze_details = 0, 0, {} + analyze_iterations, analyze_unsupported_nodes = 0, 0 + analyze_details: dict[str, Any] = {} else: logger.info("Optimizing ONNX model...") ( diff --git a/src/winml/modelkit/build/onnx.py b/src/winml/modelkit/build/onnx.py index 97ef7d22a..7b73bfe6d 100644 --- a/src/winml/modelkit/build/onnx.py +++ b/src/winml/modelkit/build/onnx.py @@ -163,7 +163,8 @@ def build_onnx_model( # Skip optimize entirely for pre-quantized models. ORT Level 2 # optimization fuses QDQ patterns (e.g. DQ→Conv→Q → QLinearConv), # which breaks QNN/DML EP compatibility and causes CPU fallback. - analyze_iters, analyze_unsupported, analyze_details = 0, 0, {} + analyze_iters, analyze_unsupported = 0, 0 + analyze_details: dict[str, Any] = {} else: logger.info("Optimizing ONNX model...") current_path, opt_elapsed, analyze_iters, analyze_unsupported, analyze_details = (