diff --git a/src/winml/modelkit/build/hf.py b/src/winml/modelkit/build/hf.py index 78ef8576f..60ee0dabc 100644 --- a/src/winml/modelkit/build/hf.py +++ b/src/winml/modelkit/build/hf.py @@ -255,17 +255,11 @@ def _name(base: str) -> str: "Skipping optimize + quantize, running analyze-only." ) stages_skipped.append("optimize") - # Optimize+analyze only, no autoconf re-optimization - current_path, _, analyze_iterations, analyze_unsupported_nodes, analyze_details = ( - run_optimize_analyze_loop( - model_path=current_path, - optimized_path=optimized_path, - config=config, - ep=ep, - device=device, - **onnx_kwargs, - ) - ) + # Skip optimize entirely for pre-quantized models. ORT Level 2 + # optimization fuses QDQ patterns (e.g. DQ→Conv→Q → QLinearConv), + # which breaks QNN/DML EP compatibility and causes CPU fallback. + analyze_iterations, analyze_unsupported_nodes = 0, 0 + analyze_details: dict[str, Any] = {} else: logger.info("Optimizing ONNX model...") ( diff --git a/src/winml/modelkit/build/onnx.py b/src/winml/modelkit/build/onnx.py index 2e7424e99..7b73bfe6d 100644 --- a/src/winml/modelkit/build/onnx.py +++ b/src/winml/modelkit/build/onnx.py @@ -160,17 +160,11 @@ def build_onnx_model( "Skipping optimize + quantize, running analyze-only." ) stages_skipped.append("optimize") - # Optimize+analyze only, no autoconf re-optimization - current_path, _, analyze_iters, analyze_unsupported, analyze_details = ( - run_optimize_analyze_loop( - model_path=current_path, - optimized_path=optimized_path, - config=config, - ep=ep, - device=device, - **onnx_kwargs, - ) - ) + # Skip optimize entirely for pre-quantized models. ORT Level 2 + # optimization fuses QDQ patterns (e.g. DQ→Conv→Q → QLinearConv), + # which breaks QNN/DML EP compatibility and causes CPU fallback. + analyze_iters, analyze_unsupported = 0, 0 + analyze_details: dict[str, Any] = {} else: logger.info("Optimizing ONNX model...") current_path, opt_elapsed, analyze_iters, analyze_unsupported, analyze_details = ( diff --git a/src/winml/modelkit/commands/build.py b/src/winml/modelkit/commands/build.py index db0a65677..29d4f58c8 100644 --- a/src/winml/modelkit/commands/build.py +++ b/src/winml/modelkit/commands/build.py @@ -588,11 +588,19 @@ def build( raise click.UsageError("-m/--model is required when -c is not provided.") from ..config import generate_build_config - config_or_configs = generate_build_config( - model_id, - trust_remote_code=trust_remote_code, - device=device, - ) + # Detect ONNX file input and route to generate_onnx_build_config + if cli_utils.is_onnx_file_path(model_id): + config_or_configs = generate_build_config( + onnx_path=model_id, + device=device, + ep=ep, + ) + else: + config_or_configs = generate_build_config( + model_id, + trust_remote_code=trust_remote_code, + device=device, + ) if not quant: config_or_configs.quant = None # Auto-generated configs: compile disabled by default unless diff --git a/tests/unit/build/test_hf.py b/tests/unit/build/test_hf.py index 38203af44..ea03d8968 100644 --- a/tests/unit/build/test_hf.py +++ b/tests/unit/build/test_hf.py @@ -811,7 +811,7 @@ def test_post_export_qdq_skips_optimize_and_quantize( assert "quantize" in result.stages_skipped assert "optimize" not in result.stages_completed assert "quantize" not in result.stages_completed - mock_pipeline["optimize"].assert_called_once() + mock_pipeline["optimize"].assert_not_called() mock_pipeline["quantize"].assert_not_called() def test_post_export_qdq_still_exports( @@ -847,7 +847,7 @@ def test_post_export_qdq_still_compiles( def test_post_export_qdq_runs_analyze_only( self, tmp_path: Path, sample_config, mock_pipeline ) -> None: - """Pre-quantized path runs optimize but skips autoconf (no analyze).""" + """Pre-quantized path skips optimize entirely (no analyze either).""" mock_pipeline["is_quantized_onnx"].return_value = True output_dir = tmp_path / "output" @@ -856,9 +856,9 @@ def test_post_export_qdq_runs_analyze_only( output_dir=output_dir, pytorch_model=mock_pipeline["model"], ) - # max_optim_iterations=0 means no analyze loop runs + # Pre-quantized models skip both optimize and analyze to preserve QDQ structure mock_pipeline["analyze"].assert_not_called() - mock_pipeline["optimize"].assert_called_once() + mock_pipeline["optimize"].assert_not_called() def test_skip_optimize_kwarg(self, tmp_path: Path, sample_config, mock_pipeline) -> None: """skip_optimize=True forces optimize+quantize skip.""" @@ -873,7 +873,7 @@ def test_skip_optimize_kwarg(self, tmp_path: Path, sample_config, mock_pipeline) ) assert "optimize" in result.stages_skipped assert "quantize" in result.stages_skipped - mock_pipeline["optimize"].assert_called_once() + mock_pipeline["optimize"].assert_not_called() mock_pipeline["quantize"].assert_not_called() diff --git a/tests/unit/build/test_onnx.py b/tests/unit/build/test_onnx.py index 1c1322907..4302b6851 100644 --- a/tests/unit/build/test_onnx.py +++ b/tests/unit/build/test_onnx.py @@ -379,7 +379,7 @@ def test_pre_quantized_skips_optimize_and_quantize( assert "quantize" in result.stages_skipped assert "optimize" not in result.stages_completed assert "quantize" not in result.stages_completed - mock_onnx_pipeline["optimize"].assert_called_once() + mock_onnx_pipeline["optimize"].assert_not_called() mock_onnx_pipeline["quantize"].assert_not_called() def test_pre_quantized_still_compiles( @@ -400,7 +400,7 @@ def test_pre_quantized_still_compiles( def test_pre_quantized_runs_analyze_only( self, tmp_path: Path, fake_onnx: Path, sample_onnx_config, mock_onnx_pipeline ) -> None: - """Pre-quantized path runs optimize but skips autoconf (no analyze).""" + """Pre-quantized path skips optimize entirely (no analyze either).""" mock_onnx_pipeline["is_quantized_onnx"].return_value = True output_dir = tmp_path / "output" @@ -409,9 +409,9 @@ def test_pre_quantized_runs_analyze_only( config=sample_onnx_config, output_dir=output_dir, ) - # max_optim_iterations=0 means no analyze loop runs + # Pre-quantized models skip both optimize and analyze to preserve QDQ structure mock_onnx_pipeline["analyze"].assert_not_called() - mock_onnx_pipeline["optimize"].assert_called_once() + mock_onnx_pipeline["optimize"].assert_not_called() def test_skip_optimize_kwarg( self, tmp_path: Path, fake_onnx: Path, sample_onnx_config, mock_onnx_pipeline @@ -428,7 +428,7 @@ def test_skip_optimize_kwarg( ) assert "optimize" in result.stages_skipped assert "quantize" in result.stages_skipped - mock_onnx_pipeline["optimize"].assert_called_once() + mock_onnx_pipeline["optimize"].assert_not_called() mock_onnx_pipeline["quantize"].assert_not_called()