Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 5 additions & 11 deletions src/winml/modelkit/build/hf.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,17 +255,11 @@ def _name(base: str) -> str:
"Skipping optimize + quantize, running analyze-only."
)
stages_skipped.append("optimize")
# Optimize+analyze only, no autoconf re-optimization
current_path, _, analyze_iterations, analyze_unsupported_nodes, analyze_details = (
run_optimize_analyze_loop(
model_path=current_path,
optimized_path=optimized_path,
config=config,
ep=ep,
device=device,
**onnx_kwargs,
)
)
# Skip optimize entirely for pre-quantized models. ORT Level 2
# optimization fuses QDQ patterns (e.g. DQ→Conv→Q → QLinearConv),
# which breaks QNN/DML EP compatibility and causes CPU fallback.
analyze_iterations, analyze_unsupported_nodes = 0, 0
analyze_details: dict[str, Any] = {}
else:
logger.info("Optimizing ONNX model...")
(
Expand Down
16 changes: 5 additions & 11 deletions src/winml/modelkit/build/onnx.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,17 +160,11 @@ def build_onnx_model(
"Skipping optimize + quantize, running analyze-only."

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Stale/misleading log message. This still logs "...running analyze-only," but after this change the analyze loop is gone — analyze_iters/analyze_unsupported are hardcoded to 0 and no analysis runs. The message now actively misdescribes the behavior (and ironically the PR description claims to fix misleading logs, yet no log line is touched in the diff). Suggest updating to something like "Pre-quantized model: skipping optimize + quantize to preserve QDQ structure." Same applies to the matching line in hf.py.

)
Comment thread
DingmaomaoBJTU marked this conversation as resolved.
stages_skipped.append("optimize")
# Optimize+analyze only, no autoconf re-optimization
current_path, _, analyze_iters, analyze_unsupported, analyze_details = (
run_optimize_analyze_loop(
model_path=current_path,
optimized_path=optimized_path,
config=config,
ep=ep,
device=device,
**onnx_kwargs,
)
)
# Skip optimize entirely for pre-quantized models. ORT Level 2
# optimization fuses QDQ patterns (e.g. DQ→Conv→Q → QLinearConv),

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unresolved technical premise. This comment says ORT Level 2 fuses QDQ→QLinear (so optimize must be skipped), but the PR description claims the optimizer is safe and does not fuse QDQ. These can't both be true, and the PR's correctness hinges on which is.

Note run_optimize_analyze_loop calls optimize_onnx(model, output, **config.optim) unconditionally (common.py:83), so whether fusion happens depends on the graph-optimization level config.optim applies to a QDQ model. Please confirm empirically — does optimize actually emit QLinearConv/QLinearMatMul on a sample QDQ model? — and state the verified reason here.

# which breaks QNN/DML EP compatibility and causes CPU fallback.
analyze_iters, analyze_unsupported = 0, 0
analyze_details: dict[str, Any] = {}
Comment on lines +163 to +167

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The PR description contradicts this code. The title/body say pre-quantized models now run the full optimize+autoconf+analyze loop and only skip quantize, and that "our custom optimizer is safe for QDQ models." This code does the opposite — it removes run_optimize_analyze_loop and skips optimize entirely (the updated tests flip optimize.assert_called_once()assert_not_called()).

None of the "Changes" bullets match the diff: there is no is_qdq variable (still is_pre_quantized), no log message is changed, and skip_optimize now skips optimize rather than "restoring original semantics — runs optimize_onnx." Please rewrite the description to match the implementation before merge so the commit history / bisect aren't misleading.

else:
logger.info("Optimizing ONNX model...")
current_path, opt_elapsed, analyze_iters, analyze_unsupported, analyze_details = (
Expand Down
18 changes: 13 additions & 5 deletions src/winml/modelkit/commands/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -588,11 +588,19 @@ def build(
raise click.UsageError("-m/--model is required when -c is not provided.")
from ..config import generate_build_config

config_or_configs = generate_build_config(
model_id,
trust_remote_code=trust_remote_code,
device=device,
)
# Detect ONNX file input and route to generate_onnx_build_config
if cli_utils.is_onnx_file_path(model_id):
config_or_configs = generate_build_config(
onnx_path=model_id,
device=device,
ep=ep,
)
else:
config_or_configs = generate_build_config(
model_id,
trust_remote_code=trust_remote_code,
device=device,
)
if not quant:
config_or_configs.quant = None
# Auto-generated configs: compile disabled by default unless
Expand Down
10 changes: 5 additions & 5 deletions tests/unit/build/test_hf.py
Original file line number Diff line number Diff line change
Expand Up @@ -811,7 +811,7 @@ def test_post_export_qdq_skips_optimize_and_quantize(
assert "quantize" in result.stages_skipped
assert "optimize" not in result.stages_completed
assert "quantize" not in result.stages_completed
mock_pipeline["optimize"].assert_called_once()
mock_pipeline["optimize"].assert_not_called()
mock_pipeline["quantize"].assert_not_called()

def test_post_export_qdq_still_exports(
Expand Down Expand Up @@ -847,7 +847,7 @@ def test_post_export_qdq_still_compiles(
def test_post_export_qdq_runs_analyze_only(
self, tmp_path: Path, sample_config, mock_pipeline
) -> None:
"""Pre-quantized path runs optimize but skips autoconf (no analyze)."""
"""Pre-quantized path skips optimize entirely (no analyze either)."""
mock_pipeline["is_quantized_onnx"].return_value = True

output_dir = tmp_path / "output"
Expand All @@ -856,9 +856,9 @@ def test_post_export_qdq_runs_analyze_only(
output_dir=output_dir,
pytorch_model=mock_pipeline["model"],
)
# max_optim_iterations=0 means no analyze loop runs
# Pre-quantized models skip both optimize and analyze to preserve QDQ structure
mock_pipeline["analyze"].assert_not_called()
mock_pipeline["optimize"].assert_called_once()
mock_pipeline["optimize"].assert_not_called()

def test_skip_optimize_kwarg(self, tmp_path: Path, sample_config, mock_pipeline) -> None:
"""skip_optimize=True forces optimize+quantize skip."""
Expand All @@ -873,7 +873,7 @@ def test_skip_optimize_kwarg(self, tmp_path: Path, sample_config, mock_pipeline)
)
assert "optimize" in result.stages_skipped
assert "quantize" in result.stages_skipped
mock_pipeline["optimize"].assert_called_once()
mock_pipeline["optimize"].assert_not_called()
mock_pipeline["quantize"].assert_not_called()


Expand Down
10 changes: 5 additions & 5 deletions tests/unit/build/test_onnx.py
Original file line number Diff line number Diff line change
Expand Up @@ -379,7 +379,7 @@ def test_pre_quantized_skips_optimize_and_quantize(
assert "quantize" in result.stages_skipped
assert "optimize" not in result.stages_completed
assert "quantize" not in result.stages_completed
mock_onnx_pipeline["optimize"].assert_called_once()
mock_onnx_pipeline["optimize"].assert_not_called()
mock_onnx_pipeline["quantize"].assert_not_called()

def test_pre_quantized_still_compiles(
Expand All @@ -400,7 +400,7 @@ def test_pre_quantized_still_compiles(
def test_pre_quantized_runs_analyze_only(
self, tmp_path: Path, fake_onnx: Path, sample_onnx_config, mock_onnx_pipeline
) -> None:
"""Pre-quantized path runs optimize but skips autoconf (no analyze)."""
"""Pre-quantized path skips optimize entirely (no analyze either)."""
mock_onnx_pipeline["is_quantized_onnx"].return_value = True

output_dir = tmp_path / "output"
Expand All @@ -409,9 +409,9 @@ def test_pre_quantized_runs_analyze_only(
config=sample_onnx_config,
output_dir=output_dir,
)
# max_optim_iterations=0 means no analyze loop runs
# Pre-quantized models skip both optimize and analyze to preserve QDQ structure
mock_onnx_pipeline["analyze"].assert_not_called()
mock_onnx_pipeline["optimize"].assert_called_once()
mock_onnx_pipeline["optimize"].assert_not_called()

def test_skip_optimize_kwarg(
self, tmp_path: Path, fake_onnx: Path, sample_onnx_config, mock_onnx_pipeline
Expand All @@ -428,7 +428,7 @@ def test_skip_optimize_kwarg(
)
assert "optimize" in result.stages_skipped
assert "quantize" in result.stages_skipped
mock_onnx_pipeline["optimize"].assert_called_once()
mock_onnx_pipeline["optimize"].assert_not_called()
mock_onnx_pipeline["quantize"].assert_not_called()


Expand Down
Loading