microsoft · DingmaomaoBJTU · May 29, 2026 · Jun 16, 2026 · Jun 16, 2026 · xieofxie
@@ -255,17 +255,11 @@ def _name(base: str) -> str:
             "Skipping optimize + quantize, running analyze-only."
         )
         stages_skipped.append("optimize")
-        # Optimize+analyze only, no autoconf re-optimization
-        current_path, _, analyze_iterations, analyze_unsupported_nodes, analyze_details = (
-            run_optimize_analyze_loop(
-                model_path=current_path,
-                optimized_path=optimized_path,
-                config=config,
-                ep=ep,
-                device=device,
-                **onnx_kwargs,
-            )
-        )
+        # Skip optimize and analyze for pre-quantized models. ORT Level 2
+        # optimization fuses QDQ patterns (e.g. DQ→Conv→Q → QLinearConv),
+        # which breaks QNN/DML EP compatibility and causes CPU fallback.
+        analyze_iterations, analyze_unsupported_nodes = 0, 0
+        analyze_details: dict[str, Any] = {}
     else:
         logger.info("Optimizing ONNX model...")
         (

@@ -160,17 +160,11 @@ def build_onnx_model(
             "Skipping optimize + quantize, running analyze-only."
         )
         stages_skipped.append("optimize")
-        # Optimize+analyze only, no autoconf re-optimization
-        current_path, _, analyze_iters, analyze_unsupported, analyze_details = (
-            run_optimize_analyze_loop(
-                model_path=current_path,
-                optimized_path=optimized_path,
-                config=config,
-                ep=ep,
-                device=device,
-                **onnx_kwargs,
-            )
-        )
+        # Skip optimize and analyze for pre-quantized models. ORT Level 2
+        # optimization fuses QDQ patterns (e.g. DQ→Conv→Q → QLinearConv),
+        # which breaks QNN/DML EP compatibility and causes CPU fallback.
+        analyze_iters, analyze_unsupported = 0, 0
+        analyze_details: dict[str, Any] = {}
     else:
         logger.info("Optimizing ONNX model...")
         current_path, opt_elapsed, analyze_iters, analyze_unsupported, analyze_details = (

@@ -588,11 +588,19 @@ def build(
                 raise click.UsageError("-m/--model is required when -c is not provided.")
             from ..config import generate_build_config
 
-            config_or_configs = generate_build_config(
-                model_id,
-                trust_remote_code=trust_remote_code,
-                device=device,
-            )
+            # ONNX file input: pass it to generate_build_config as onnx_path (plus ep)
+            if cli_utils.is_onnx_file_path(model_id):
+                config_or_configs = generate_build_config(
+                    onnx_path=model_id,
+                    device=device,
+                    ep=ep,
+                )
+            else:
+                config_or_configs = generate_build_config(
+                    model_id,
+                    trust_remote_code=trust_remote_code,
+                    device=device,
+                )
             if not quant:
                 config_or_configs.quant = None
             # Auto-generated configs: compile disabled by default unless

@@ -811,7 +811,7 @@ def test_post_export_qdq_skips_optimize_and_quantize(
         assert "quantize" in result.stages_skipped
         assert "optimize" not in result.stages_completed
         assert "quantize" not in result.stages_completed
-        mock_pipeline["optimize"].assert_called_once()
+        mock_pipeline["optimize"].assert_not_called()
         mock_pipeline["quantize"].assert_not_called()
 
     def test_post_export_qdq_still_exports(
@@ -847,7 +847,7 @@ def test_post_export_qdq_still_compiles(
     def test_post_export_qdq_runs_analyze_only(
         self, tmp_path: Path, sample_config, mock_pipeline
     ) -> None:
-        """Pre-quantized path runs optimize but skips autoconf (no analyze)."""
+        """Pre-quantized path skips optimize entirely (no analyze either)."""
         mock_pipeline["is_quantized_onnx"].return_value = True
 
         output_dir = tmp_path / "output"
@@ -856,9 +856,9 @@ def test_post_export_qdq_runs_analyze_only(
             output_dir=output_dir,
             pytorch_model=mock_pipeline["model"],
         )
-        # max_optim_iterations=0 means no analyze loop runs
+        # Pre-quantized models skip both optimize and analyze to preserve QDQ structure
         mock_pipeline["analyze"].assert_not_called()
-        mock_pipeline["optimize"].assert_called_once()
+        mock_pipeline["optimize"].assert_not_called()
 
     def test_skip_optimize_kwarg(self, tmp_path: Path, sample_config, mock_pipeline) -> None:
         """skip_optimize=True forces optimize+quantize skip."""
@@ -873,7 +873,7 @@ def test_skip_optimize_kwarg(self, tmp_path: Path, sample_config, mock_pipeline)
         )
         assert "optimize" in result.stages_skipped
         assert "quantize" in result.stages_skipped
-        mock_pipeline["optimize"].assert_called_once()
+        mock_pipeline["optimize"].assert_not_called()
         mock_pipeline["quantize"].assert_not_called()
 
 

@@ -379,7 +379,7 @@ def test_pre_quantized_skips_optimize_and_quantize(
         assert "quantize" in result.stages_skipped
         assert "optimize" not in result.stages_completed
         assert "quantize" not in result.stages_completed
-        mock_onnx_pipeline["optimize"].assert_called_once()
+        mock_onnx_pipeline["optimize"].assert_not_called()
         mock_onnx_pipeline["quantize"].assert_not_called()
 
     def test_pre_quantized_still_compiles(
@@ -400,7 +400,7 @@ def test_pre_quantized_still_compiles(
     def test_pre_quantized_runs_analyze_only(
         self, tmp_path: Path, fake_onnx: Path, sample_onnx_config, mock_onnx_pipeline
     ) -> None:
-        """Pre-quantized path runs optimize but skips autoconf (no analyze)."""
+        """Pre-quantized path skips optimize entirely (no analyze either)."""
         mock_onnx_pipeline["is_quantized_onnx"].return_value = True
 
         output_dir = tmp_path / "output"
@@ -409,9 +409,9 @@ def test_pre_quantized_runs_analyze_only(
             config=sample_onnx_config,
             output_dir=output_dir,
         )
-        # max_optim_iterations=0 means no analyze loop runs
+        # Pre-quantized models skip both optimize and analyze to preserve QDQ structure
         mock_onnx_pipeline["analyze"].assert_not_called()
-        mock_onnx_pipeline["optimize"].assert_called_once()
+        mock_onnx_pipeline["optimize"].assert_not_called()
 
     def test_skip_optimize_kwarg(
         self, tmp_path: Path, fake_onnx: Path, sample_onnx_config, mock_onnx_pipeline
@@ -428,7 +428,7 @@ def test_skip_optimize_kwarg(
         )
         assert "optimize" in result.stages_skipped
         assert "quantize" in result.stages_skipped
-        mock_onnx_pipeline["optimize"].assert_called_once()
+        mock_onnx_pipeline["optimize"].assert_not_called()
         mock_onnx_pipeline["quantize"].assert_not_called()