From 46bb3d34603c2c50f1d69dfa988fd289602a6a96 Mon Sep 17 00:00:00 2001
From: hualxie <hualxie@microsoft.com>
Date: Thu, 18 Jun 2026 17:12:00 +0800
Subject: [PATCH 1/2] docs(perf): add --module usage example to module docstring

---
 src/winml/modelkit/commands/perf.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/winml/modelkit/commands/perf.py b/src/winml/modelkit/commands/perf.py
index 3753c3cd..3c6a9868 100644
--- a/src/winml/modelkit/commands/perf.py
+++ b/src/winml/modelkit/commands/perf.py
@@ -9,6 +9,7 @@
 Usage:
     winml perf -m microsoft/resnet-50
     winml perf -m microsoft/resnet-50 --device npu --iterations 100
+    winml perf -m microsoft/resnet-50 --module ResNetConvLayer
     winml perf -m bert-base-uncased --task text-classification
 """
 

From ba8d077c7771440bdfbf5380cee3c956666b5111 Mon Sep 17 00:00:00 2001
From: hualxie <hualxie@microsoft.com>
Date: Thu, 18 Jun 2026 17:16:09 +0800
Subject: [PATCH 2/2] fix(perf): render live HW chart for --monitor in --module
 mode (#654)

In --module mode, --monitor created an HWMonitor and collected metrics
into the JSON report but never drove the live utilization chart, so the
flag appeared to do nothing (the --monitor help promises a live chart).

Route the per-module monitored loop through the shared _run_monitored_loop
helper that single-model mode already uses, so each module instance renders
the same live chart while still persisting hw_monitor metrics to JSON.
---
 src/winml/modelkit/commands/perf.py     |  15 +++-
 tests/unit/commands/test_perf_module.py | 102 ++++++++++++++++++++++++
 2 files changed, 115 insertions(+), 2 deletions(-)

diff --git a/src/winml/modelkit/commands/perf.py b/src/winml/modelkit/commands/perf.py
index 3c6a9868..dd4731b6 100644
--- a/src/winml/modelkit/commands/perf.py
+++ b/src/winml/modelkit/commands/perf.py
@@ -876,9 +876,20 @@ def _perf_modules(
                         )
 
                 if hw_ctx:
+                    # Drive the same live chart single-model mode uses so
+                    # --monitor renders a per-module HW utilization chart
+                    # instead of silently dumping metrics to JSON (issue #654).
                     with session.perf(warmup=warmup) as stats, hw_ctx as hw:
-                        for _ in range(total_iters):
-                            session.run(inputs)
+                        _run_monitored_loop(
+                            session,
+                            inputs,
+                            stats,
+                            hw,
+                            total_iterations=total_iters,
+                            warmup=warmup,
+                            model_id=label,
+                            device=resolved_device,
+                        )
                         hw_metrics = hw.to_dict()
                 else:
                     with session.perf(warmup=warmup) as stats:
diff --git a/tests/unit/commands/test_perf_module.py b/tests/unit/commands/test_perf_module.py
index a5b49f83..d97d82e6 100644
--- a/tests/unit/commands/test_perf_module.py
+++ b/tests/unit/commands/test_perf_module.py
@@ -316,6 +316,108 @@ def test_running_model_path_in_module_result(self, tmp_path: Path) -> None:
         assert instance["running_model_path"] == str(running_model_path)
 
 
+class TestPerfModuleMonitor:
+    """--monitor must drive the live HW utilization chart in --module mode.
+
+    Regression guard for #654: previously the module path created an
+    HWMonitor and dumped metrics to JSON but never rendered the live chart
+    (via _run_monitored_loop), so --monitor appeared to do nothing.
+    """
+
+    def test_monitor_drives_live_chart_per_module(self, tmp_path: Path) -> None:
+        fake_cfg = MagicMock()
+        fake_cfg.loader.model_type = "bert"
+        fake_cfg.loader.module_path = "encoder.layer.0"
+
+        fake_build_result = MagicMock()
+        fake_build_result.final_onnx_path = tmp_path / "model.onnx"
+
+        fake_stats = MagicMock()
+        for attr in ("mean_ms", "p50_ms", "p90_ms", "p95_ms", "p99_ms", "min_ms", "max_ms"):
+            setattr(fake_stats, attr, 1.0)
+        fake_stats.samples_ms = [1.0, 1.0]
+
+        fake_session = MagicMock()
+        fake_session.perf.return_value.__enter__.return_value = fake_stats
+        fake_session.running_model_path = tmp_path / "model_cpu_ctx.onnx"
+
+        fake_loader_cfg = MagicMock()
+        fake_loader_cfg.task = "fill-mask"
+
+        # HWMonitor instance: context-managed, with a JSON-serializable to_dict().
+        fake_hw = MagicMock()
+        fake_hw.__enter__.return_value = fake_hw
+        fake_hw.to_dict.return_value = {"monitor": "HWMonitor", "device_kind": None}
+        fake_hw_cls = MagicMock()
+        fake_hw_cls.is_available.return_value = True
+        fake_hw_cls.return_value = fake_hw
+
+        out_path = tmp_path / "out.json"
+
+        with (
+            patch(
+                "winml.modelkit.sysinfo.resolve_device",
+                return_value=("cpu", ["cpu"]),
+            ),
+            patch(
+                "winml.modelkit.config.generate_hf_build_config",
+                return_value=[fake_cfg],
+            ),
+            patch(
+                "winml.modelkit.loader.resolve_loader_config",
+                return_value=(fake_loader_cfg, MagicMock(), MagicMock(), MagicMock()),
+            ),
+            patch(
+                "winml.modelkit.commands.build._instantiate_parent_model",
+                return_value=MagicMock(),
+            ),
+            patch(
+                "winml.modelkit.build.build_hf_model",
+                return_value=fake_build_result,
+            ),
+            patch(
+                "winml.modelkit.session.WinMLSession",
+                return_value=fake_session,
+            ),
+            patch(
+                "winml.modelkit.commands.perf.generate_random_inputs",
+                return_value={},
+            ),
+            patch(
+                "winml.modelkit.session.monitor.hw_monitor.HWMonitor",
+                fake_hw_cls,
+            ),
+            patch(
+                "winml.modelkit.commands.perf._run_monitored_loop",
+            ) as mock_loop,
+        ):
+            runner = CliRunner()
+            result = runner.invoke(
+                main,
+                [
+                    "perf",
+                    "-m",
+                    "fake/model",
+                    "--module",
+                    "BertLayer",
+                    "--monitor",
+                    "--iterations",
+                    "1",
+                    "--warmup",
+                    "0",
+                    "-o",
+                    str(out_path),
+                ],
+            )
+
+        assert result.exit_code == 0, result.output
+        # The live-chart loop must be driven once for the single module instance.
+        mock_loop.assert_called_once()
+        # And the collected HW metrics still land in the JSON report.
+        report = json.loads(out_path.read_text(encoding="utf-8"))
+        assert report["instances"][0]["hw_monitor"]["monitor"] == "HWMonitor"
+
+
 class TestPerfModuleQuantCompileToggles:
     """--no-quantize and --compile/--no-compile clear cfg.quant / cfg.compile
     independently in the per-module build (mirrors the single-model path)."""