From 59ecb6a24f2016fc85fe1594136f9181b326fc2d Mon Sep 17 00:00:00 2001 From: Chen Li Date: Sun, 14 Jun 2026 19:43:45 -0700 Subject: [PATCH 1/2] [LLDB][AMDGPU] Refine core comparison framework --- .../amd/TestAmdGpuCoreFileComparison.py | 221 +++++++++++++++++- .../gpu/comparison/framework/comparator.py | 91 +++++++- .../framework/debugger_interface.py | 5 +- .../gpu/comparison/framework/gdb_driver.py | 34 ++- .../gpu/comparison/framework/lldb_driver.py | 65 +++++- 5 files changed, 400 insertions(+), 16 deletions(-) diff --git a/lldb/test/API/gpu/comparison/amd/TestAmdGpuCoreFileComparison.py b/lldb/test/API/gpu/comparison/amd/TestAmdGpuCoreFileComparison.py index 9ae4f38d8cd5d..398a688181777 100644 --- a/lldb/test/API/gpu/comparison/amd/TestAmdGpuCoreFileComparison.py +++ b/lldb/test/API/gpu/comparison/amd/TestAmdGpuCoreFileComparison.py @@ -13,6 +13,11 @@ test_thread_count__ test_registers__ test_local_variables__ + test_modules__ (combined module list) + test_backtrace__ (faulting GPU wave backtrace) + + Each comparison logs the data gathered from both debuggers (module lists, + backtraces) and any differences via self.trace(), visible in the dotest log. ARCHITECTURAL DIFFERENCE: - LLDB: Creates TWO targets (CPU + GPU). Must use `target select` to switch @@ -43,6 +48,7 @@ sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "..")) from framework.comparator import ResultComparator +from framework.debugger_interface import DebuggerResult from framework.gdb_driver import GdbDriver from framework.lldb_driver import LldbDriver @@ -54,10 +60,14 @@ def _get_default_core_dir(): def _get_rocgdb_path(): - """Get ROCgdb path by looking in PATH. - TODO: make this configurable via lit configuration. + """Get the ROCgdb binary. + + Prefer GPU_COMPARISON_ROCGDB (the eval script points this at the + platform010 ROCm-7.0 rocgdb, which can open ROCm7 GPU cores; the system + /usr/bin/rocgdb fails to initialize the ROCm debug API on them). Fall + back to PATH. 
""" - return shutil.which("rocgdb") + return os.environ.get("GPU_COMPARISON_ROCGDB") or shutil.which("rocgdb") def _get_core_files(): @@ -89,7 +99,7 @@ class TestAmdGpuCoreFileComparison(TestBase): # Per-core-file setup / teardown helpers # ------------------------------------------------------------------ - def _load_core(self, core_path): + def _load_core(self, core_path, auto_load_debuginfo=False): """Load a core file in both debuggers and return (gdb_driver, lldb_driver, comparator).""" rocgdb_path = _get_rocgdb_path() if not rocgdb_path: @@ -119,8 +129,18 @@ def _load_core(self, core_path): pc_tolerance=0, ) - gdb_driver.load_core(core_path) - lldb_driver.load_core(core_path) + # Module and backtrace comparisons request auto-load-debuginfo; the + # other comparisons keep the cheaper plain core load. + gdb_result = gdb_driver.load_core( + core_path, auto_load_debuginfo=auto_load_debuginfo + ) + lldb_result = lldb_driver.load_core( + core_path, auto_load_debuginfo=auto_load_debuginfo + ) + for label, result in (("ROCgdb", gdb_result), ("ROCLLDB", lldb_result)): + auto_load_output = result.extra_data.get("auto_load_output") + if auto_load_output: + self.trace(f"{label} auto-load-debuginfo:\n{auto_load_output}") # Store for cleanup in tearDown self._active_gdb_driver = gdb_driver @@ -379,6 +399,169 @@ def _run_local_variables_comparison(self, core_path): ), ) + def _run_module_comparison(self, core_path): + """Compare combined module lists for a core file. + + ROCgdb reports modules as one flat objfile list. ROCLLDB keeps CPU and + GPU modules on separate targets, so gather both targets into one result + before comparing. This avoids depending on debugger-specific host/device + classification. 
+ """ + gdb_driver, lldb_driver, comparator = self._load_core( + core_path, auto_load_debuginfo=True + ) + + gdb_result = gdb_driver.get_modules() + self.assertTrue( + gdb_result.success, + f"GDB failed to list modules: {gdb_result.error_message}", + ) + + lldb_result = self._get_lldb_combined_modules(lldb_driver) + self.assertTrue( + lldb_result.success, + f"LLDB failed to list modules: {lldb_result.error_message}", + ) + + comparison = comparator.compare_modules(gdb_result, lldb_result) + + def fmt(mod): + return f"{mod.name} uuid={mod.uuid or '?'}" + + self.trace("\n=== Module comparison ===") + self.trace(f"GDB modules: {len(gdb_result.modules)}") + self.trace(f"LLDB modules: {len(lldb_result.modules)}") + for mod in gdb_result.modules: + self.trace(f" GDB: {fmt(mod)}") + for mod in lldb_result.modules: + self.trace(f" LLDB: {fmt(mod)}") + + if not gdb_result.modules and not lldb_result.modules: + self.skipTest( + "No modules reported by either debugger " + "(executable/debug info unavailable for this core?)" + ) + + differences = comparison.differences + gdb_only_modules = comparison.gdb_only.get("modules", []) + lldb_only_modules = comparison.lldb_only.get("modules", []) + + # These keys have already been normalized by the comparator, so one-sided + # entries here are actionable comparison differences. 
+ if gdb_only_modules: + self.trace( + f" GDB-only normalized module keys ({len(gdb_only_modules)}): " + + ", ".join(str(m) for m in gdb_only_modules[:10]) + ) + if lldb_only_modules: + self.trace( + f" LLDB-only normalized module keys ({len(lldb_only_modules)}): " + + ", ".join(str(m) for m in lldb_only_modules[:10]) + ) + + failure_lines = [] + if differences: + failure_lines.append(f"Module differences: {len(differences)}; first 10:") + for diff in differences[:10]: + self.trace(f" {diff.description}") + failure_lines.append(f" {diff.description}") + + if failure_lines: + self.fail("Module comparison failed:\n" + "\n".join(failure_lines)) + + def _get_lldb_combined_modules(self, lldb_driver): + modules = [] + errors = [] + + for target_name, select in ( + ("CPU", lldb_driver.select_cpu), + ("GPU", lldb_driver.select_gpu), + ): + select_result = select() + if not select_result.success: + self.trace( + f"\nLLDB {target_name} target modules: skipped " + f"({select_result.error_message})" + ) + continue + + result = lldb_driver.get_modules() + if not result.success: + errors.append( + f"LLDB failed to list {target_name} target modules: " + f"{result.error_message}" + ) + continue + + self.trace(f"\nLLDB {target_name} target modules: {len(result.modules)}") + modules.extend(result.modules) + + if errors: + return DebuggerResult(success=False, error_message="\n".join(errors)) + return DebuggerResult(success=True, modules=modules) + + def _run_backtrace_comparison(self, core_path): + """Compare the faulting GPU wave's backtrace between debuggers. + + Both debuggers select the faulting wave by default when loading a + core; rely on that selection. IMPORTANT: Do NOT call + get_all_threads() first as it changes GDB's selected thread! + + PCs must match unconditionally; function names are compared only + for frames both debuggers symbolized (production cores are often + unsymbolized unless debug info was loaded). 
+ """ + gdb_driver, lldb_driver, comparator = self._load_core( + core_path, auto_load_debuginfo=True + ) + + gdb_result = gdb_driver.get_backtrace() + self.assertTrue( + gdb_result.success, + f"GDB failed to get backtrace: {gdb_result.error_message}", + ) + + select_result = lldb_driver.select_gpu() + if not select_result.success: + self.skipTest(select_result.error_message) + if lldb_driver.get_thread_count() == 0: + self.skipTest("No GPU threads in LLDB") + + lldb_result = lldb_driver.get_backtrace() + self.assertTrue( + lldb_result.success, + f"LLDB failed to get backtrace: {lldb_result.error_message}", + ) + + def fmt(frame): + location = f" at {frame.file}:{frame.line}" if frame.file else "" + return f"#{frame.index} {hex(frame.pc)} {frame.function}{location}" + + self.trace("\n=== GPU backtrace comparison (faulting wave) ===") + self.trace(f"GDB frames: {len(gdb_result.backtrace)}") + self.trace(f"LLDB frames: {len(lldb_result.backtrace)}") + for frame in gdb_result.backtrace[:10]: + self.trace(f" GDB: {fmt(frame)}") + for frame in lldb_result.backtrace[:10]: + self.trace(f" LLDB: {fmt(frame)}") + + comparison = comparator.compare_backtrace(gdb_result, lldb_result) + + # Empty on both sides is no coverage, not parity (see module test). 
+ if not gdb_result.backtrace and not lldb_result.backtrace: + self.skipTest("Neither debugger produced any backtrace frames") + + if comparison.differences: + failure_lines = [ + f"Backtrace differences: {len(comparison.differences)}; first 10:" + ] + for diff in comparison.differences[:10]: + self.trace(f" {diff.description}") + failure_lines.append(f" {diff.description}") + self.fail( + "GPU backtrace comparison failed:\n" + "\n".join(failure_lines) + ) + # ------------------------------------------------------------------ # Placeholder when no core files are available # ------------------------------------------------------------------ @@ -436,6 +619,22 @@ def test(self): test.__doc__ = f"Local variables comparison for {os.path.basename(cp)}" return test + def make_modules_test(cp=core_path): + @skipUnlessArch("x86_64") + @skipUnlessPlatform(["linux"]) + def test(self): + self._run_module_comparison(cp) + test.__doc__ = f"Module comparison for {os.path.basename(cp)}" + return test + + def make_backtrace_test(cp=core_path): + @skipUnlessArch("x86_64") + @skipUnlessPlatform(["linux"]) + def test(self): + self._run_backtrace_comparison(cp) + test.__doc__ = f"GPU backtrace comparison for {os.path.basename(cp)}" + return test + setattr( TestAmdGpuCoreFileComparison, f"test_thread_count__{basename}", @@ -451,6 +650,16 @@ def test(self): f"test_local_variables__{basename}", make_local_variables_test(), ) + setattr( + TestAmdGpuCoreFileComparison, + f"test_modules__{basename}", + make_modules_test(), + ) + setattr( + TestAmdGpuCoreFileComparison, + f"test_backtrace__{basename}", + make_backtrace_test(), + ) _add_core_file_tests() diff --git a/lldb/test/API/gpu/comparison/framework/comparator.py b/lldb/test/API/gpu/comparison/framework/comparator.py index 9426cbd08dc4c..c8ad889a7bd4a 100644 --- a/lldb/test/API/gpu/comparison/framework/comparator.py +++ b/lldb/test/API/gpu/comparison/framework/comparator.py @@ -2,7 +2,10 @@ Result comparator for comparing GDB and LLDB 
debugging outputs. """ +from collections import Counter from dataclasses import dataclass, field +import os +import re from typing import List, Dict, Any, Optional, Set, Tuple from .debugger_interface import ( DebuggerResult, @@ -321,26 +324,102 @@ def compare_modules( """Compare loaded modules from GDB and LLDB.""" result = ComparisonResult() - gdb_modules = {m.name: m for m in gdb_result.modules} - lldb_modules = {m.name: m for m in lldb_result.modules} + gdb_modules = Counter( + key + for m in gdb_result.modules + if (key := self._normalize_module_key(m)) is not None + ) + lldb_modules = Counter( + key + for m in lldb_result.modules + if (key := self._normalize_module_key(m)) is not None + ) all_names = set(gdb_modules.keys()) | set(lldb_modules.keys()) for name in sorted(all_names): - gdb_mod = gdb_modules.get(name) - lldb_mod = lldb_modules.get(name) + gdb_count = gdb_modules.get(name, 0) + lldb_count = lldb_modules.get(name, 0) - if gdb_mod is None: + if gdb_count == 0: result.add_lldb_only("modules", name) + result.add_difference( + "modules", + name, + 0, + lldb_count, + f"Module '{name}' only in LLDB", + ) continue - if lldb_mod is None: + if lldb_count == 0: result.add_gdb_only("modules", name) + result.add_difference( + "modules", + name, + gdb_count, + 0, + f"Module '{name}' only in GDB", + ) continue + if gdb_count != lldb_count: + result.add_difference( + "modules", + name, + gdb_count, + lldb_count, + f"Module '{name}' count differs: GDB={gdb_count}, LLDB={lldb_count}", + ) + result.summary = result.get_summary() return result + def _normalize_module_key(self, module: ModuleInfo) -> str | None: + """Normalize debugger-specific module names into comparable keys.""" + raw_path = module.path or "" + name = module.name or os.path.basename(raw_path) or "" + + # GDB exposes .gnu_debugdata as a synthetic objfile. It is debug info + # for the following module, not a separately loaded module. 
+ if raw_path.startswith(".gnu_debugdata for "): + return None + + if name == "[vdso]" or raw_path.startswith("system-supplied DSO"): + return "vdso" + + # ROCgdb memory code objects look like: + # 4377#offset=0x7c...&size=43392 + # memory://4377#offset=0x7c...&size=43392 + offset_match = re.search( + r"#offset=(0x[0-9a-fA-F]+|\d+)&size=(0x[0-9a-fA-F]+|\d+)", + raw_path or name, + ) + if offset_match and ( + (raw_path or name).startswith("memory://") + or (name.split("#", 1)[0].isdigit()) + ): + start = int(offset_match.group(1), 0) + size = int(offset_match.group(2), 0) + return f"memory:{start:x}-{start + size:x}" + + # ROCLLDB memory code objects look like: + # amd_memory_kernel[0x7c..., 0x7c...) + kernel_match = re.search( + r"amd_memory_kernel\[(0x[0-9a-fA-F]+),\s*(0x[0-9a-fA-F]+)\)", + name, + ) + if kernel_match: + start = int(kernel_match.group(1), 16) + end = int(kernel_match.group(2), 16) + return f"memory:{start:x}-{end:x}" + + # File-backed GPU code objects may have #offset/#size suffixes on one + # side. Keep the backing file basename as the module identity. 
+ path = (raw_path or name).removeprefix("file://") + path = path.split("#offset=", 1)[0] + return os.path.basename(path) or name + def _normalize_function_name(self, name: str) -> str: """Normalize function name for comparison.""" if not name: diff --git a/lldb/test/API/gpu/comparison/framework/debugger_interface.py b/lldb/test/API/gpu/comparison/framework/debugger_interface.py index ba885ce93ab5a..7036710bafdd2 100644 --- a/lldb/test/API/gpu/comparison/framework/debugger_interface.py +++ b/lldb/test/API/gpu/comparison/framework/debugger_interface.py @@ -134,7 +134,10 @@ class DebuggerInterface(ABC): @abstractmethod def load_core( - self, core_path: str, executable_path: Optional[str] = None + self, + core_path: str, + executable_path: Optional[str] = None, + auto_load_debuginfo: bool = False, ) -> DebuggerResult: """Load a core file for debugging.""" pass diff --git a/lldb/test/API/gpu/comparison/framework/gdb_driver.py b/lldb/test/API/gpu/comparison/framework/gdb_driver.py index 7df9ee9de2d1e..a34f328405609 100644 --- a/lldb/test/API/gpu/comparison/framework/gdb_driver.py +++ b/lldb/test/API/gpu/comparison/framework/gdb_driver.py @@ -270,11 +270,17 @@ def _send_command(self, command: str, timeout: float = 60.0) -> str: return self._wait_for_prompt(timeout) def load_core( - self, core_path: str, executable_path: Optional[str] = None + self, + core_path: str, + executable_path: Optional[str] = None, + auto_load_debuginfo: bool = False, ) -> DebuggerResult: """Load a core file into the persistent GDB process.""" self._start_gdb() + if auto_load_debuginfo: + core_path = os.path.realpath(core_path) + self._core_path = core_path self._executable_path = executable_path @@ -285,6 +291,9 @@ def load_core( # Load the core file self._send_command(f"core-file {core_path}") self._core_loaded = True + auto_load_output = "" + if auto_load_debuginfo: + auto_load_output = self._auto_load_debuginfo() # Get thread info to verify core loaded script = """ @@ -309,8 +318,29 @@ def 
load_core( success=data.get("success", False), error_message=data.get("error", ""), raw_output=data.get("raw_output", ""), - extra_data={"thread_count": data.get("thread_count", 0)}, + extra_data={ + "thread_count": data.get("thread_count", 0), + "auto_load_output": auto_load_output, + }, + ) + + def _auto_load_debuginfo(self) -> str: + fbcode_path = os.environ.get("GPU_COMPARISON_FBCODE_PATH") + if not fbcode_path: + return "GPU_COMPARISON_FBCODE_PATH is not set; skipped auto-load-debuginfo\n" + + fbload_path = os.path.join(fbcode_path, "gdb", "scripts", "fbload.py") + output = [] + output.append(self._send_command(f"source {fbload_path}")) + output.append(self._send_command("fbload auto_debuginfo")) + output.append( + self._send_command( + "python import os; os.environ['LD_LIBRARY_PATH'] = " + "os.environ.get('LLDB_SYMBOL_STORAGE_LD_LIBRARY_PATH', '')" + ) ) + output.append(self._send_command("auto-load-debuginfo", timeout=600.0)) + return "".join(output) def get_all_threads(self) -> DebuggerResult: """Get list of all threads (CPU + GPU in flat view). diff --git a/lldb/test/API/gpu/comparison/framework/lldb_driver.py b/lldb/test/API/gpu/comparison/framework/lldb_driver.py index db82a345de132..f69a8ed4ae25e 100644 --- a/lldb/test/API/gpu/comparison/framework/lldb_driver.py +++ b/lldb/test/API/gpu/comparison/framework/lldb_driver.py @@ -6,6 +6,8 @@ direct access to LLDB's SB API. 
""" +import os + import lldb from typing import List, Optional, Dict, Any @@ -128,9 +130,15 @@ def get_thread_count(self) -> int: return self._process.GetNumThreads() def load_core( - self, core_path: str, executable_path: Optional[str] = None + self, + core_path: str, + executable_path: Optional[str] = None, + auto_load_debuginfo: bool = False, ) -> DebuggerResult: """Load a core file using the in-process LLDB API.""" + if auto_load_debuginfo: + return self._load_core_auto_debuginfo(core_path) + self._core_path = core_path try: @@ -185,6 +193,61 @@ def load_core( except Exception as e: return DebuggerResult(success=False, error_message=str(e)) + def _load_core_auto_debuginfo(self, core_path: str) -> DebuggerResult: + self._core_path = os.path.realpath(core_path) + output = [] + + fbcode_path = os.environ.get("GPU_COMPARISON_FBCODE_PATH") + if fbcode_path: + result = self.execute_command( + f"script import sys; sys.path.insert(0, {fbcode_path!r})" + ) + output.append(result.raw_output or result.error_message) + + result = self.execute_command( + "script import os; os.environ['LD_LIBRARY_PATH'] = " + "os.environ.get('LLDB_SYMBOL_STORAGE_LD_LIBRARY_PATH', '')" + ) + output.append(result.raw_output or result.error_message) + + result = self.execute_command("command script import fblldb") + output.append(result.raw_output or result.error_message) + + result = self.execute_command(f"auto-load-debuginfo {self._core_path}") + output.append(result.raw_output or result.error_message) + if not result.success: + return DebuggerResult( + success=False, + error_message=result.error_message, + raw_output=result.raw_output, + extra_data={"auto_load_output": "\n".join(output)}, + ) + + self._target = self._debugger.GetSelectedTarget() + if not self._target or not self._target.IsValid(): + return DebuggerResult( + success=False, + error_message="auto-load-debuginfo did not create a valid target", + extra_data={"auto_load_output": "\n".join(output)}, + ) + + self._process = 
self._target.GetProcess() + if not self._process or not self._process.IsValid(): + return DebuggerResult( + success=False, + error_message="auto-load-debuginfo did not create a valid process", + extra_data={"auto_load_output": "\n".join(output)}, + ) + + return DebuggerResult( + success=True, + extra_data={ + "thread_count": self._process.GetNumThreads(), + "target_triple": self._target.GetTriple(), + "auto_load_output": "\n".join(output), + }, + ) + def get_all_threads(self) -> DebuggerResult: """Get list of all threads from all targets (CPU + GPU). From 04dfd54d350a5326a7bfdee0d6e6b10da4c6dc98 Mon Sep 17 00:00:00 2001 From: Chen Li Date: Wed, 17 Jun 2026 17:23:51 -0700 Subject: [PATCH 2/2] [LLDB][AMDGPU] Refine GPU core comparison checks --- .../amd/TestAmdGpuCoreFileComparison.py | 165 +++++++++++------- .../gpu/comparison/framework/comparator.py | 89 +++++----- .../gpu/comparison/framework/gdb_driver.py | 77 ++++++++ .../gpu/comparison/framework/lldb_driver.py | 87 ++++++++- 4 files changed, 306 insertions(+), 112 deletions(-) diff --git a/lldb/test/API/gpu/comparison/amd/TestAmdGpuCoreFileComparison.py b/lldb/test/API/gpu/comparison/amd/TestAmdGpuCoreFileComparison.py index 398a688181777..fd5a60f116984 100644 --- a/lldb/test/API/gpu/comparison/amd/TestAmdGpuCoreFileComparison.py +++ b/lldb/test/API/gpu/comparison/amd/TestAmdGpuCoreFileComparison.py @@ -161,6 +161,61 @@ def tearDown(self): # Comparison helpers # ------------------------------------------------------------------ + def _select_lldb_gpu_thread_matching_rocgdb(self, gdb_driver, lldb_driver): + """Select LLDB's GPU wave that matches ROCgDB's selected AMDGPU wave.""" + lldb_select_result = lldb_driver.select_gpu() + if not lldb_select_result.success: + self.skipTest(lldb_select_result.error_message) + if lldb_driver.get_thread_count() == 0: + self.skipTest("No GPU threads in LLDB") + + gdb_selected = gdb_driver.get_selected_thread() + if not gdb_selected.success: + self.trace( + "Could not query 
ROCgDB selected thread; keeping LLDB default " + f"GPU selection: {gdb_selected.error_message}" + ) + return None + + selected = gdb_selected.extra_data + self.trace( + "ROCgDB selected thread: " + f"gdb_thread={selected.get('id')} " + f"arch={selected.get('architecture')} " + f"wave={selected.get('amdgpu_wave_id')} " + f"lane={selected.get('amdgpu_lane_id')} " + f"pc={hex(selected.get('pc') or 0)} " + f"function={selected.get('function')}" + ) + if selected.get("selected_line"): + self.trace(f"ROCgDB selected thread line: {selected['selected_line']}") + + wave_id = selected.get("amdgpu_wave_id") + if wave_id is None: + self.trace( + "ROCgDB selected thread did not expose an AMDGPU wave id; " + "keeping LLDB default GPU selection" + ) + return None + + lldb_thread = lldb_driver.select_thread(wave_id) + if not lldb_thread.success: + self.fail( + "LLDB failed to select the ROCGDB-selected AMDGPU wave " + f"{wave_id}: {lldb_thread.error_message}" + ) + + info = lldb_thread.extra_data + self.trace( + "Selected LLDB GPU thread to match ROCGDB: " + f"wave={wave_id} " + f"lldb_tid={info.get('selected_thread')} " + f"index={info.get('selected_index_id')} " + f"name={info.get('selected_name')} " + f"target={info.get('target_triple')}" + ) + return wave_id + def _compare_variable_sets(self, comparator, gdb_vars, lldb_vars): """Compare variable sets between GDB and LLDB. 
@@ -244,12 +299,7 @@ def _run_register_comparison(self, core_path): gdb_driver, lldb_driver, comparator = self._load_core(core_path) gdb_result = gdb_driver.get_registers() - - lldb_select_result = lldb_driver.select_gpu() - if not lldb_select_result.success: - self.skipTest(lldb_select_result.error_message) - if lldb_driver.get_thread_count() == 0: - self.skipTest("No GPU threads in LLDB") + self._select_lldb_gpu_thread_matching_rocgdb(gdb_driver, lldb_driver) lldb_result = lldb_driver.get_registers() self.assertTrue( @@ -305,22 +355,17 @@ def _run_register_comparison(self, core_path): def _run_local_variables_comparison(self, core_path): """Compare GPU local variables between debuggers for a core file. - Both debuggers select the crashing thread by default when loading a core. - We rely on this default selection rather than searching for threads, - which would change GDB's selected thread state. + ROCgDB and ROCLLDB use different default faulting-wave selection + policies. Query ROCgDB's selected AMDGPU wave id, then select the + matching ROCLLDB GPU thread before reading LLDB locals. """ gdb_driver, lldb_driver, comparator = self._load_core(core_path) - lldb_select_result = lldb_driver.select_gpu() - if not lldb_select_result.success: - self.skipTest(lldb_select_result.error_message) - if lldb_driver.get_thread_count() == 0: - self.skipTest("No GPU threads in LLDB") - # Get local variables from GDB using the default selected thread. # IMPORTANT: Do NOT call get_all_threads() here as it changes GDB's # selected thread! gdb_vars = gdb_driver.get_local_variables() + self._select_lldb_gpu_thread_matching_rocgdb(gdb_driver, lldb_driver) # Get local variables from LLDB through the LLDB adapter only. 
lldb_vars = lldb_driver.get_local_variables() @@ -417,26 +462,47 @@ def _run_module_comparison(self, core_path): f"GDB failed to list modules: {gdb_result.error_message}", ) - lldb_result = self._get_lldb_combined_modules(lldb_driver) + lldb_result = lldb_driver.get_combined_modules() self.assertTrue( lldb_result.success, f"LLDB failed to list modules: {lldb_result.error_message}", ) comparison = comparator.compare_modules(gdb_result, lldb_result) + gdb_normalized_modules = comparator.get_normalized_module_counts(gdb_result) + lldb_normalized_modules = comparator.get_normalized_module_counts(lldb_result) def fmt(mod): return f"{mod.name} uuid={mod.uuid or '?'}" self.trace("\n=== Module comparison ===") - self.trace(f"GDB modules: {len(gdb_result.modules)}") - self.trace(f"LLDB modules: {len(lldb_result.modules)}") + self.trace(f"GDB normalized modules: {len(gdb_normalized_modules)}") + self.trace(f"LLDB normalized modules: {len(lldb_normalized_modules)}") + for target_info in lldb_result.extra_data.get("targets", []): + target_name = target_info["name"] + if target_info.get("skipped"): + self.trace( + f"\nLLDB {target_name} target modules: skipped " + f"({target_info.get('error', '')})" + ) + continue + + target_modules = DebuggerResult( + success=True, modules=target_info.get("modules", []) + ) + normalized_modules = comparator.get_normalized_module_counts( + target_modules + ) + self.trace( + f"\nLLDB {target_name} target normalized modules: " + f"{len(normalized_modules)}" + ) for mod in gdb_result.modules: self.trace(f" GDB: {fmt(mod)}") for mod in lldb_result.modules: self.trace(f" LLDB: {fmt(mod)}") - if not gdb_result.modules and not lldb_result.modules: + if not gdb_normalized_modules and not lldb_normalized_modules: self.skipTest( "No modules reported by either debugger " "(executable/debug info unavailable for this core?)" @@ -446,16 +512,19 @@ def fmt(mod): gdb_only_modules = comparison.gdb_only.get("modules", []) lldb_only_modules = 
comparison.lldb_only.get("modules", []) - # These keys have already been normalized by the comparator, so one-sided - # entries here are actionable comparison differences. + # These keys have already been normalized by the comparator. GDB-only + # entries are failures; LLDB-only entries are expected when LLDB sees + # extra placeholders or file-backed GPU code objects. if gdb_only_modules: self.trace( - f" GDB-only normalized module keys ({len(gdb_only_modules)}): " + f" GDB-only normalized module keys missing from LLDB " + f"({len(gdb_only_modules)}): " + ", ".join(str(m) for m in gdb_only_modules[:10]) ) if lldb_only_modules: self.trace( - f" LLDB-only normalized module keys ({len(lldb_only_modules)}): " + f" LLDB-extra normalized module keys allowed " + f"({len(lldb_only_modules)}): " + ", ".join(str(m) for m in lldb_only_modules[:10]) ) @@ -469,47 +538,16 @@ def fmt(mod): if failure_lines: self.fail("Module comparison failed:\n" + "\n".join(failure_lines)) - def _get_lldb_combined_modules(self, lldb_driver): - modules = [] - errors = [] - - for target_name, select in ( - ("CPU", lldb_driver.select_cpu), - ("GPU", lldb_driver.select_gpu), - ): - select_result = select() - if not select_result.success: - self.trace( - f"\nLLDB {target_name} target modules: skipped " - f"({select_result.error_message})" - ) - continue - - result = lldb_driver.get_modules() - if not result.success: - errors.append( - f"LLDB failed to list {target_name} target modules: " - f"{result.error_message}" - ) - continue - - self.trace(f"\nLLDB {target_name} target modules: {len(result.modules)}") - modules.extend(result.modules) - - if errors: - return DebuggerResult(success=False, error_message="\n".join(errors)) - return DebuggerResult(success=True, modules=modules) - def _run_backtrace_comparison(self, core_path): """Compare the faulting GPU wave's backtrace between debuggers. - Both debuggers select the faulting wave by default when loading a - core; rely on that selection. 
IMPORTANT: Do NOT call - get_all_threads() first as it changes GDB's selected thread! + ROCgDB and ROCLLDB use different default faulting-wave selection + policies. Query ROCgDB's selected AMDGPU wave id, then select the + matching ROCLLDB GPU thread before collecting the LLDB backtrace. - PCs must match unconditionally; function names are compared only - for frames both debuggers symbolized (production cores are often - unsymbolized unless debug info was loaded). + PCs and depth must match. Function names fail only when ROCgDB has a + real symbol and ROCLLDB reports it as unknown; extra ROCLLDB + symbolication and demangler spelling differences are diagnostics. """ gdb_driver, lldb_driver, comparator = self._load_core( core_path, auto_load_debuginfo=True @@ -520,12 +558,7 @@ def _run_backtrace_comparison(self, core_path): gdb_result.success, f"GDB failed to get backtrace: {gdb_result.error_message}", ) - - select_result = lldb_driver.select_gpu() - if not select_result.success: - self.skipTest(select_result.error_message) - if lldb_driver.get_thread_count() == 0: - self.skipTest("No GPU threads in LLDB") + self._select_lldb_gpu_thread_matching_rocgdb(gdb_driver, lldb_driver) lldb_result = lldb_driver.get_backtrace() self.assertTrue( diff --git a/lldb/test/API/gpu/comparison/framework/comparator.py b/lldb/test/API/gpu/comparison/framework/comparator.py index c8ad889a7bd4a..684f4bb6d2659 100644 --- a/lldb/test/API/gpu/comparison/framework/comparator.py +++ b/lldb/test/API/gpu/comparison/framework/comparator.py @@ -2,17 +2,19 @@ Result comparator for comparing GDB and LLDB debugging outputs. 
""" -from collections import Counter -from dataclasses import dataclass, field import os import re -from typing import List, Dict, Any, Optional, Set, Tuple +from collections import Counter +from dataclasses import dataclass, field +from typing import Any +from urllib.parse import unquote + from .debugger_interface import ( DebuggerResult, - ThreadInfo, FrameInfo, - VariableValue, ModuleInfo, + ThreadInfo, + VariableValue, ) @@ -32,9 +34,9 @@ class ComparisonResult: """Result of comparing GDB and LLDB outputs.""" is_equivalent: bool = True - differences: List[ComparisonDifference] = field(default_factory=list) - gdb_only: Dict[str, List[Any]] = field(default_factory=dict) - lldb_only: Dict[str, List[Any]] = field(default_factory=dict) + differences: list[ComparisonDifference] = field(default_factory=list) + gdb_only: dict[str, list[Any]] = field(default_factory=dict) + lldb_only: dict[str, list[Any]] = field(default_factory=dict) summary: str = "" def add_difference( @@ -218,13 +220,17 @@ def compare_backtrace( gdb_func = gdb_frame.function lldb_func = lldb_frame.function - if gdb_func != lldb_func: + # Demangler spelling can differ between debuggers, so a name mismatch fails only when GDB has a real symbol and LLDB reports none. + if gdb_func != lldb_func and ( + not self._is_unknown_function(gdb_func) + and self._is_unknown_function(lldb_func) + ): result.add_difference( f"frame[{i}]", "function", gdb_func, lldb_func, - f"Frame {i} function differs: GDB='{gdb_func}', LLDB='{lldb_func}'", + f"Frame {i} function missing in LLDB: GDB='{gdb_func}', LLDB='{lldb_func}'", ) result.summary = result.get_summary() @@ -234,7 +240,7 @@ def compare_registers( self, gdb_result: DebuggerResult, lldb_result: DebuggerResult, - register_names: Optional[List[str]] = None, + register_names: list[str] | None = None, ) -> ComparisonResult: """Compare register values from GDB and LLDB.""" result = ComparisonResult() @@ -321,19 +327,18 @@ def compare_modules( self, gdb_result: DebuggerResult,
lldb_result: DebuggerResult ) -> ComparisonResult: - """Compare loaded modules from GDB and LLDB.""" + """Compare loaded modules from GDB and LLDB. + + For production GPU cores, LLDB often reports extra placeholder or + file-backed modules that ROCgDB omits. Treat LLDB as successful when it + contains every normalized ROCGDB module key. LLDB-only keys and + duplicate count differences are useful diagnostics, but not parity + failures. + """ result = ComparisonResult() - gdb_modules = Counter( - key - for m in gdb_result.modules - if (key := self._normalize_module_key(m)) is not None - ) - lldb_modules = Counter( - key - for m in lldb_result.modules - if (key := self._normalize_module_key(m)) is not None - ) + gdb_modules = self.get_normalized_module_counts(gdb_result) + lldb_modules = self.get_normalized_module_counts(lldb_result) all_names = set(gdb_modules.keys()) | set(lldb_modules.keys()) @@ -343,13 +348,6 @@ def compare_modules( if gdb_count == 0: result.add_lldb_only("modules", name) - result.add_difference( - "modules", - name, - 0, - lldb_count, - f"Module '{name}' only in LLDB", - ) continue if lldb_count == 0: @@ -363,28 +361,24 @@ def compare_modules( ) continue - if gdb_count != lldb_count: - result.add_difference( - "modules", - name, - gdb_count, - lldb_count, - f"Module '{name}' count differs: GDB={gdb_count}, LLDB={lldb_count}", - ) - result.summary = result.get_summary() return result + def get_normalized_module_counts( + self, debugger_result: DebuggerResult + ) -> Counter[str]: + """Return normalized module keys and their observed counts.""" + return Counter( + key + for module in debugger_result.modules + if (key := self._normalize_module_key(module)) is not None + ) + def _normalize_module_key(self, module: ModuleInfo) -> str | None: """Normalize debugger-specific module names into comparable keys.""" raw_path = module.path or "" name = module.name or os.path.basename(raw_path) or "" - # GDB exposes .gnu_debugdata as a synthetic objfile. 
It is debug info - # for the following module, not a separately loaded module. - if raw_path.startswith(".gnu_debugdata for "): - return None - if name == "[vdso]" or raw_path.startswith("system-supplied DSO"): return "vdso" @@ -416,7 +410,7 @@ def _normalize_module_key(self, module: ModuleInfo) -> str | None: # File-backed GPU code objects may have #offset/#size suffixes on one # side. Keep the backing file basename as the module identity. - path = (raw_path or name).removeprefix("file://") + path = unquote(raw_path or name).removeprefix("file://") path = path.split("#offset=", 1)[0] return os.path.basename(path) or name @@ -434,6 +428,13 @@ def _normalize_function_name(self, name: str) -> str: return name + def _is_unknown_function(self, name: str) -> bool: + """Return whether a function name represents missing symbolication.""" + if not name: + return True + normalized = name.strip() + return normalized in ("", "??", "?? ()") + def normalize_pointer_value(self, value): """Normalize pointer value format for comparison. diff --git a/lldb/test/API/gpu/comparison/framework/gdb_driver.py b/lldb/test/API/gpu/comparison/framework/gdb_driver.py index a34f328405609..dd00b8b8720d3 100644 --- a/lldb/test/API/gpu/comparison/framework/gdb_driver.py +++ b/lldb/test/API/gpu/comparison/framework/gdb_driver.py @@ -431,6 +431,83 @@ def select_thread(self, thread_id: int) -> DebuggerResult: success=data.get("success", False), error_message=data.get("error", "") ) + def get_selected_thread(self) -> DebuggerResult: + """Return ROCgdb's current selected thread without changing selection. + + ROCgdb's flat GPU thread number does not match LLDB's GPU thread id. + For AMDGPU waves, the useful cross-debugger key is the wave id embedded + in names like: AMDGPU Lane 3:5:1:8192/0 (...). 
+ """ + script = r""" +import gdb +import json +import re + +result = {"success": True, "error": ""} + +try: + thread = gdb.selected_thread() + frame = gdb.selected_frame() + current_thread_output = "" + try: + current_thread_output = gdb.execute("thread", to_string=True) + except: + pass + + thread_name = thread.name or "" + selected_line = current_thread_output.strip() + thread_text = " ".join([thread_name, selected_line]) + wave_match = re.search( + r"AMDGPU\s+Lane\s+(?:\d+:){3}(\d+)/(\d+)", + thread_text, + ) + if not wave_match: + info_threads = gdb.execute("info threads", to_string=True) + for line in info_threads.splitlines(): + if line.lstrip().startswith("*"): + selected_line = line.strip() + break + wave_match = re.search( + r"AMDGPU\s+Lane\s+(?:\d+:){3}(\d+)/(\d+)", + selected_line, + ) + + result.update({ + "id": thread.global_num, + "name": thread_name, + "selected_line": selected_line, + "pc": frame.pc(), + "function": frame.name() or "", + "architecture": frame.architecture().name(), + }) + + if wave_match: + result["amdgpu_wave_id"] = int(wave_match.group(1)) + result["amdgpu_lane_id"] = int(wave_match.group(2)) + +except Exception as e: + result["success"] = False + result["error"] = str(e) + +print("RESULT_JSON:" + json.dumps(result)) +""" + data = self._run_python_script(script) + + return DebuggerResult( + success=data.get("success", False), + error_message=data.get("error", ""), + extra_data={ + "id": data.get("id"), + "name": data.get("name"), + "selected_line": data.get("selected_line"), + "pc": data.get("pc"), + "function": data.get("function"), + "architecture": data.get("architecture"), + "amdgpu_wave_id": data.get("amdgpu_wave_id"), + "amdgpu_lane_id": data.get("amdgpu_lane_id"), + }, + ) + def get_backtrace(self, thread_id: Optional[int] = None) -> DebuggerResult: """Get backtrace for current or specified thread.""" thread_select = f'gdb.execute("thread {thread_id}")' if thread_id else "" diff --git 
a/lldb/test/API/gpu/comparison/framework/lldb_driver.py b/lldb/test/API/gpu/comparison/framework/lldb_driver.py index f69a8ed4ae25e..95874698b2cd4 100644 --- a/lldb/test/API/gpu/comparison/framework/lldb_driver.py +++ b/lldb/test/API/gpu/comparison/framework/lldb_driver.py @@ -325,7 +325,27 @@ def select_thread(self, thread_id: int) -> DebuggerResult: if not self._process or not self._process.IsValid(): return DebuggerResult(success=False, error_message="No valid process") - # Search in all targets + # Prefer the currently selected target/process. The comparison test + # selects the GPU target first, and ROCgdb's AMDGPU wave id maps to + # LLDB's GPU thread id. Searching CPU targets first can accidentally + # match an unrelated host thread with the same numeric id. + for i in range(self._process.GetNumThreads()): + thread = self._process.GetThreadAtIndex(i) + if thread.GetThreadID() == thread_id: + self._process.SetSelectedThread(thread) + return DebuggerResult( + success=True, + extra_data={ + "selected_thread": thread.GetThreadID(), + "selected_index_id": thread.GetIndexID(), + "selected_name": thread.GetName(), + "target_triple": self._target.GetTriple() + if self._target and self._target.IsValid() + else None, + }, + ) + + # Fallback for callers that have not selected the target first. 
for target_idx in range(self._debugger.GetNumTargets()): target = self._debugger.GetTargetAtIndex(target_idx) process = target.GetProcess() @@ -343,7 +363,12 @@ def select_thread(self, thread_id: int) -> DebuggerResult: self._process = process return DebuggerResult( success=True, - extra_data={"selected_thread": thread.GetThreadID()}, + extra_data={ + "selected_thread": thread.GetThreadID(), + "selected_index_id": thread.GetIndexID(), + "selected_name": thread.GetName(), + "target_triple": target.GetTriple(), + }, ) return DebuggerResult( @@ -582,6 +607,64 @@ def get_modules(self) -> DebuggerResult: except Exception as e: return DebuggerResult(success=False, error_message=str(e)) + def get_combined_modules(self) -> DebuggerResult: + """Get modules from all LLDB targets as one flat module list.""" + modules = [] + errors = [] + targets = [] + + for target_name, select in ( + ("CPU", self.select_cpu), + ("GPU", self.select_gpu), + ): + select_result = select() + if not select_result.success: + targets.append( + { + "name": target_name, + "skipped": True, + "error": select_result.error_message, + } + ) + continue + + result = self.get_modules() + if not result.success: + errors.append( + f"LLDB failed to list {target_name} target modules: " + f"{result.error_message}" + ) + targets.append( + { + "name": target_name, + "error": result.error_message, + "modules": [], + } + ) + continue + + modules.extend(result.modules) + targets.append( + { + "name": target_name, + "modules": result.modules, + } + ) + + if errors: + return DebuggerResult( + success=False, + error_message="\n".join(errors), + modules=modules, + extra_data={"targets": targets}, + ) + + return DebuggerResult( + success=True, + modules=modules, + extra_data={"targets": targets}, + ) + def select_frame(self, frame_index: int) -> DebuggerResult: """Select a frame by index.""" try: