From d473d6ae96d3ac1422b39b0a05f1bcb17f5d9a3b Mon Sep 17 00:00:00 2001 From: chun-wan Date: Thu, 2 Apr 2026 07:31:08 +0800 Subject: [PATCH] rocclr: Add optional hang recovery for SDMA D2H hang (all OFF by default) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add 4-layer hang recovery mechanism controlled by HIP_HANG_RECOVERY_ENABLE (default=0, disabled). When disabled, zero behavioral change from stock develop — all new code paths are gated by the master switch. When HIP_HANG_RECOVERY_ENABLE=1: L1 - Signal timeout abort (HIP_MAX_SIGNAL_WAIT, default 60s): WaitForSignal's existing 4-sec loop is extended with a configurable max wait. On timeout, hsa_signal_silent_store_relaxed(signal, 0) is used to force-complete the signal (bypassing roctracer interception) and the thread resumes. An 'aborted' flag propagates to CpuWaitForSignal. L2 - Permanent SDMA bypass: After first signal abort, SdmaHealthTracker::ForcePermanentBypass() is called. KernelBlitManager::copyBuffer then forces shader blit path for all subsequent copies, preventing further submissions to the faulted SDMA engine. L3 - callbackQueue abort suppression: When hang recovery is active, callbackQueue checks IsInHangRecovery() and suppresses abort(), logging the error instead. This prevents the process from being killed by GPU queue errors during recovery. L4 - SIGABRT handler: hangRecoveryAbortHandler intercepts SIGABRT from ROCr VM fault handler. When recovery is active, it re-registers itself (defeating abort's handler reset) and freezes the caller thread with pause(). The process survives even if ROCr calls abort(). Optional debug logging via HIP_DEBUG_LOG env var (rocdebuglog.hpp). WaitActiveStreams cascade detection warns at 10K/100K idle iterations. Background: Multi-process VRAM oversubscription can trigger KFD BO eviction during in-flight SDMA D2H copies, causing HSA signals to never complete. Without recovery, all threads hang permanently. Env vars: HIP_HANG_RECOVERY_ENABLE=0|1 (master switch, default 0) HIP_MAX_SIGNAL_WAIT=N (seconds, default 60, 0=infinite) HIP_DEBUG_LOG=0|1|path (optional logging, default off) Tested with KFD eviction reproducer: 120s stress test with HANG_HOGS=6, HIP_MAX_SIGNAL_WAIT=4: 26 hang recoveries, 0 permanent hang, 0 coredump, process survives to EXIT=0. Co-authored-by: Clement Lin Made-with: Cursor --- hipamd/src/hip_device.cpp | 20 +++++- hipamd/src/hip_memory.cpp | 6 ++ hipamd/src/hip_module.cpp | 9 +++ hipamd/src/hip_stream.cpp | 3 + rocclr/device/rocm/rocblit.cpp | 5 +- rocclr/device/rocm/rocdebuglog.hpp | 104 +++++++++++++++++++++++++++++ rocclr/device/rocm/rocdevice.cpp | 78 +++++++++++++++++++++- rocclr/device/rocm/rocdevice.hpp | 30 +++++++++ rocclr/device/rocm/rocvirtual.cpp | 11 ++- rocclr/device/rocm/rocvirtual.hpp | 52 +++++++++++++-- rocclr/utils/flags.hpp | 6 ++ 11 files changed, 313 insertions(+), 11 deletions(-) create mode 100644 rocclr/device/rocm/rocdebuglog.hpp diff --git a/hipamd/src/hip_device.cpp b/hipamd/src/hip_device.cpp index dbfd6978d8..921408de4f 100644 --- a/hipamd/src/hip_device.cpp +++ b/hipamd/src/hip_device.cpp @@ -11,6 +11,7 @@ #include "hip_internal.hpp" #include "hip_mempool_impl.hpp" #include "hip_platform.hpp" +#include "device/rocm/rocdebuglog.hpp" #undef hipGetDeviceProperties #undef hipDeviceProp_t @@ -195,19 +196,32 @@ void Device::WaitActiveStreams(hip::Stream* blocking_stream, bool wait_null_stre } } + if (HIP_HANG_RECOVERY_ENABLE) { + thread_local uint64_t idle_call_count = 0; + if (eventWaitList.empty() && !submitMarker) { + idle_call_count++; + if (idle_call_count == 10000 || idle_call_count == 100000) { + HIP_DLOG("[HIP-DEBUG] WaitActiveStreams WARNING: possible cascade hang, " + "idle_calls=%lu, blocking_stream=%p\n", + idle_call_count, (void*)blocking_stream); + LogPrintfWarning("[HIP-HANG] WaitActiveStreams spinning for %lu iterations", + idle_call_count); + } + } else { + idle_call_count = 0; + } + } + if (!eventWaitList.empty() || submitMarker) { auto* marker = new amd::Marker(*blocking_stream, kMarkerDisableFlush, eventWaitList); marker->enqueue(); marker->release(); } - // Release all active commands; safe after the marker was enqueued for (const auto& cmd : eventWaitList) { cmd->release(); } - // Release active queue references now that the marker has been fully enqueued - // and no longer needs to access the queues via eventWaitList commands for (const auto& q : activeQueues) { q->release(); } diff --git a/hipamd/src/hip_memory.cpp b/hipamd/src/hip_memory.cpp index 3e5da69f48..3ee51e5832 100644 --- a/hipamd/src/hip_memory.cpp +++ b/hipamd/src/hip_memory.cpp @@ -765,6 +765,9 @@ hipError_t hipFree(void* ptr) { hipError_t hipMemcpy_common(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream = nullptr) { + if (HIP_HANG_RECOVERY_ENABLE && amd::Device::IsGPUInError()) { + return hipErrorLaunchFailure; + } CHECK_STREAM_CAPTURING(); hip::Stream* hip_stream = nullptr; @@ -1389,6 +1392,9 @@ hipError_t hipHostAlloc(void** ptr, size_t sizeBytes, unsigned int flags) { hipError_t hipMemcpyAsync_common(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream) { + if (HIP_HANG_RECOVERY_ENABLE && amd::Device::IsGPUInError()) { + return hipErrorLaunchFailure; + } STREAM_CAPTURE(hipMemcpyAsync, stream, dst, src, sizeBytes, kind); if (static_cast(kind) > hipMemcpyDefault && kind != hipMemcpyDeviceToDeviceNoCU) { diff --git a/hipamd/src/hip_module.cpp b/hipamd/src/hip_module.cpp index 2df1d4a6b4..497346d801 100644 --- a/hipamd/src/hip_module.cpp +++ b/hipamd/src/hip_module.cpp @@ -415,6 +415,9 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, amd::LaunchParams& launch_par uint32_t params = 0, uint32_t gridId = 0, uint32_t numGrids = 0, uint64_t prevGridSum = 0, uint64_t allGridSum = 0, uint32_t firstDevice = 0) { + if (HIP_HANG_RECOVERY_ENABLE && amd::Device::IsGPUInError()) { + return hipErrorLaunchFailure; + } int deviceId = hip::Stream::DeviceId(hStream); // Ensure the stream's device matches the current device, @@ -489,10 +492,16 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, amd::LaunchParams& launch_par } if (command->status() == CL_INVALID_OPERATION) { + if (HIP_HANG_RECOVERY_ENABLE && amd::Device::IsGPUInError()) { + return hipErrorLaunchFailure; + } command->release(); return hipErrorIllegalState; } + if (HIP_HANG_RECOVERY_ENABLE && amd::Device::IsGPUInError()) { + return hipErrorLaunchFailure; + } command->release(); return hipSuccess; diff --git a/hipamd/src/hip_stream.cpp b/hipamd/src/hip_stream.cpp index 99c5db0893..de8c460395 100644 --- a/hipamd/src/hip_stream.cpp +++ b/hipamd/src/hip_stream.cpp @@ -356,6 +356,9 @@ hipError_t hipStreamGetId(hipStream_t stream, unsigned long long* streamId) { // ================================================================================================ hipError_t hipStreamSynchronize_common(hipStream_t stream) { + if (HIP_HANG_RECOVERY_ENABLE && amd::Device::IsGPUInError()) { + return hipSuccess; + } getStreamPerThread(stream); if (stream == nullptr) { diff --git a/rocclr/device/rocm/rocblit.cpp b/rocclr/device/rocm/rocblit.cpp index ff3125d7b1..6afe5b4ece 100644 --- a/rocclr/device/rocm/rocblit.cpp +++ b/rocclr/device/rocm/rocblit.cpp @@ -2729,8 +2729,11 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds bool nonP2PIpcOrDirectAccess = !isP2pOrIpc && neitherMemoryIsHostDirectAccess && !isSdmaPreference; + bool sdmaPermanentBypass = HIP_HANG_RECOVERY_ENABLE && + const_cast(dev()).sdmaTracker().IsPermanentBypass(); const bool useShaderCopyPath = hwlCopyDisabled || smallSizeWithNonSdmaPreference || - nonP2PIpcOrDirectAccess || isBlitPreference; + nonP2PIpcOrDirectAccess || isBlitPreference || + sdmaPermanentBypass; if (!useShaderCopyPath) { if (amd::IS_HIP) { diff --git a/rocclr/device/rocm/rocdebuglog.hpp b/rocclr/device/rocm/rocdebuglog.hpp new file mode 100644 index 0000000000..2d07730f8f --- /dev/null +++ b/rocclr/device/rocm/rocdebuglog.hpp @@ -0,0 +1,104 @@ +/* Copyright (c) 2025 Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace hip_debug { + +inline int& logEnabled() { + static int e = -1; + return e; +} + +inline FILE*& logFile() { + static FILE* f = nullptr; + return f; +} + +inline pthread_once_t& onceCtrl() { + static pthread_once_t o = PTHREAD_ONCE_INIT; + return o; +} + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wformat-truncation" +inline void initLog() { + const char* env = getenv("HIP_DEBUG_LOG"); + if (!env || env[0] == '\0' || (env[0] == '0' && env[1] == '\0')) { + logEnabled() = 0; + return; + } + + char path[512]; + if (strcmp(env, "1") == 0) { + snprintf(path, sizeof(path), "/tmp/hip_debug_%d.log", getpid()); + } else { + snprintf(path, sizeof(path), "%.*s", (int)(sizeof(path) - 1), env); + char* pct = strstr(path, "%d"); + if (pct) { + char tmp[512]; + *pct = '\0'; + snprintf(tmp, sizeof(tmp), "%s%d%s", path, getpid(), pct + 2); + snprintf(path, sizeof(path), "%s", tmp); + } + } + + logFile() = fopen(path, "a"); + if (logFile()) { + logEnabled() = 1; + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + fprintf(logFile(), "[%ld.%06ld] === HIP Debug Log opened (pid=%d) ===\n", + (long)ts.tv_sec, ts.tv_nsec / 1000, getpid()); + fflush(logFile()); + } else { + logEnabled() = 0; + } +} +#pragma GCC diagnostic pop + +inline void dlog(const char* fmt, ...) __attribute__((format(printf, 1, 2))); +inline void dlog(const char* fmt, ...) { + pthread_once(&onceCtrl(), initLog); + if (logEnabled() <= 0) return; + FILE* f = logFile(); + if (!f) return; + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + fprintf(f, "[%ld.%06ld] ", (long)ts.tv_sec, ts.tv_nsec / 1000); + va_list ap; + va_start(ap, fmt); + vfprintf(f, fmt, ap); + va_end(ap); + fflush(f); +} + +} // namespace hip_debug + +#define HIP_DLOG(fmt, ...) hip_debug::dlog(fmt, ##__VA_ARGS__) diff --git a/rocclr/device/rocm/rocdevice.cpp b/rocclr/device/rocm/rocdevice.cpp index e51e602de4..08db390f29 100644 --- a/rocclr/device/rocm/rocdevice.cpp +++ b/rocclr/device/rocm/rocdevice.cpp @@ -44,6 +44,9 @@ #include #include #include +#include +#include +#include #define OPENCL_VERSION_STR XSTR(OPENCL_MAJOR) "." XSTR(OPENCL_MINOR) #define OPENCL_C_VERSION_STR XSTR(OPENCL_C_MAJOR) "." XSTR(OPENCL_C_MINOR) @@ -72,6 +75,55 @@ std::vector roc::Device::cpu_agents_; address Device::mg_sync_ = nullptr; +std::atomic Device::g_hang_recovery_active_{false}; + +static struct sigaction g_old_sigabrt_action; +static std::atomic g_abort_handler_installed{false}; + +static void hangRecoveryAbortHandler(int sig, siginfo_t* info, void* ctx) { + if (Device::g_hang_recovery_active_.load(std::memory_order_acquire)) { + char msg[128]; + int len = snprintf(msg, sizeof(msg), + "[HIP-RECOVERY] SIGABRT intercepted — freezing caller thread (tid=%d)\n", + (int)syscall(SYS_gettid)); + if (len > 0) write(STDERR_FILENO, msg, len); + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = hangRecoveryAbortHandler; + sa.sa_flags = SA_SIGINFO | SA_RESTART; + sigemptyset(&sa.sa_mask); + sigaction(SIGABRT, &sa, nullptr); + sigset_t unblock; + sigemptyset(&unblock); + sigaddset(&unblock, SIGABRT); + sigprocmask(SIG_UNBLOCK, &unblock, nullptr); + while (1) pause(); + __builtin_unreachable(); + } + if (g_old_sigabrt_action.sa_flags & SA_SIGINFO) { + if (g_old_sigabrt_action.sa_sigaction) { + g_old_sigabrt_action.sa_sigaction(sig, info, ctx); + } + } else { + if (g_old_sigabrt_action.sa_handler == SIG_DFL) { + signal(SIGABRT, SIG_DFL); + raise(SIGABRT); + } else if (g_old_sigabrt_action.sa_handler != SIG_IGN) { + g_old_sigabrt_action.sa_handler(sig); + } + } +} + +void Device::InstallAbortHandler() { + if (g_abort_handler_installed.exchange(true, std::memory_order_acq_rel)) return; + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = hangRecoveryAbortHandler; + sa.sa_flags = SA_SIGINFO | SA_RESTART; + sigemptyset(&sa.sa_mask); + sigaction(SIGABRT, &sa, &g_old_sigabrt_action); +} + bool NullDevice::create(const amd::Isa& isa) { if (!isa.runtimeRocSupported()) { LogPrintfError("Offline HSA device %s is not supported", isa.targetId()); @@ -3522,6 +3574,19 @@ hsa_status_t Device::BackendErrorCallBackHandler(const hsa_amd_event_t* event, v } gpu_error_ = gpu_error; + + if (HIP_HANG_RECOVERY_ENABLE) { + HIP_DLOG("[HIP-RECOVERY] GPU event type %d — activating recovery\n", + event->event_type); + for (auto* dev : amd::Device::devices()) { + auto* rocDev = static_cast(dev); + if (rocDev) { + rocDev->ActivateHangRecovery(); + rocDev->sdmaTracker().ForcePermanentBypass(); + } + } + } + return HSA_STATUS_SUCCESS; } @@ -3879,13 +3944,24 @@ cl_int ConvertHSAErrorIntoCLError(hsa_status_t hsa_status) { void callbackQueue(hsa_status_t status, hsa_queue_t* queue, void* data) { if (status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK) { Device* dev = reinterpret_cast(data); + + if (HIP_HANG_RECOVERY_ENABLE && dev->IsInHangRecovery()) { + const char* errorMsg = 0; + Hsa::status_string(status, &errorMsg); + ClPrint(amd::LOG_NONE, amd::LOG_ALWAYS, + "[HIP-RECOVERY] Queue %p error suppressed (hang recovery active): %s code: 0x%x", + queue->base_address, errorMsg, status); + HIP_DLOG("[HIP-DEBUG] callbackQueue: suppressed abort for queue=%p, status=0x%x\n", + queue->base_address, status); + return; + } + for (auto it : dev->vgpus()) { roc::VirtualGPU* vgpu = reinterpret_cast(it); if (vgpu->gpu_queue() == queue) { vgpu->AnalyzeAqlQueue(); } } - // Abort on device exceptions. const char* errorMsg = 0; Hsa::status_string(status, &errorMsg); if (status == HSA_STATUS_ERROR_OUT_OF_RESOURCES) { diff --git a/rocclr/device/rocm/rocdevice.hpp b/rocclr/device/rocm/rocdevice.hpp index 96ac7c2174..e93a6e13a5 100644 --- a/rocclr/device/rocm/rocdevice.hpp +++ b/rocclr/device/rocm/rocdevice.hpp @@ -753,6 +753,9 @@ class Device : public NullDevice { }; mutable SdmaEngineAllocator sdma_engine_allocator_; + SdmaHealthTracker sdma_tracker_; + std::atomic hang_recovery_mode_{false}; + //! Code object to kernel info map (used in the crash dump analysis) mutable std::map kernel_map_; @@ -762,6 +765,33 @@ class Device : public NullDevice { public: std::atomic numOfVgpus_; //!< Virtual gpu unique index + struct SdmaHealthTracker { + std::atomic permanent_bypass_{false}; + + void ForcePermanentBypass() { + permanent_bypass_.store(true, std::memory_order_release); + HIP_DLOG("[HIP-DEBUG] SdmaHealthTracker: PERMANENT SDMA bypass activated\n"); + } + + bool IsPermanentBypass() const { + return permanent_bypass_.load(std::memory_order_acquire); + } + }; + + SdmaHealthTracker& sdmaTracker() { return sdma_tracker_; } + + void ActivateHangRecovery() { + hang_recovery_mode_.store(true, std::memory_order_release); + g_hang_recovery_active_.store(true, std::memory_order_release); + InstallAbortHandler(); + } + bool IsInHangRecovery() const { + return hang_recovery_mode_.load(std::memory_order_acquire); + } + + static std::atomic g_hang_recovery_active_; + static void InstallAbortHandler(); + //! Returns the valid SDMA engine bitmask for the given operation type. uint32_t GetSdmaValidMask(HwQueueEngine engine_type) const { return (engine_type == HwQueueEngine::SdmaD2H) ? maxSdmaReadMask_ : maxSdmaWriteMask_; diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp index 622dc7ed8c..de81cf43b4 100644 --- a/rocclr/device/rocm/rocvirtual.cpp +++ b/rocclr/device/rocm/rocvirtual.cpp @@ -662,10 +662,19 @@ bool VirtualGPU::HwQueueTracker::CpuWaitForSignal(ProfilingSignal* signal) { if (Hsa::signal_load_relaxed(signal->signal_) > 0) { ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Host wait on completion_signal=0x%zx", signal->signal_.handle); - if (!WaitForSignal(signal->signal_, gpu_.ActiveWait())) { + bool aborted = false; + if (!WaitForSignal(signal->signal_, gpu_.ActiveWait(), false, &aborted)) { LogPrintfError("Failed signal [0x%lx] wait", signal->signal_); return false; } + if (HIP_HANG_RECOVERY_ENABLE && aborted) { + auto& dev = const_cast(gpu_.dev()); + dev.ActivateHangRecovery(); + dev.sdmaTracker().ForcePermanentBypass(); + LogPrintfWarning("[HIP-RECOVERY] Signal 0x%lx aborted — " + "hang recovery activated, SDMA permanently bypassed", + signal->signal_.handle); + } } // Process this signal's timing before signal reuse diff --git a/rocclr/device/rocm/rocvirtual.hpp b/rocclr/device/rocm/rocvirtual.hpp index 2bca836a85..9a65d8c937 100644 --- a/rocclr/device/rocm/rocvirtual.hpp +++ b/rocclr/device/rocm/rocvirtual.hpp @@ -13,9 +13,12 @@ #include "utils/util.hpp" #include "rocprintf.hpp" #include "rocsched.hpp" +#include "rocdebuglog.hpp" #include "device/device.hpp" #include "os/os.hpp" #include +#include +#include #include #include #include @@ -36,15 +39,23 @@ constexpr static uint64_t kUnlimitedWait = std::numeric_limits::max(); constexpr static uint64_t kTimeout4Secs = 4 * M; -inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false, bool yield = false) { +inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false, bool yield = false, + bool* out_aborted = nullptr) { + if (out_aborted) *out_aborted = false; + hsa_wait_state_t wait_state = HSA_WAIT_STATE_BLOCKED; if (active_wait) { wait_state = HSA_WAIT_STATE_ACTIVE; } if (Hsa::signal_load_relaxed(signal) > 0) { - // When it is blocked wait, we wait in active state for 100 us before proceeding to wait in - // blocked state indefinitely. + auto wait_start = std::chrono::steady_clock::now(); + + if (HIP_HANG_RECOVERY_ENABLE) { + HIP_DLOG("[HIP-DEBUG] WaitForSignal ENTER: signal=0x%lx, active=%d, tid=%d\n", + signal.handle, active_wait ? 1 : 0, (int)syscall(SYS_gettid)); + } + if (!active_wait) { ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Host active wait for Signal = (0x%lx) for %d ns", signal.handle, kTimeout100us); @@ -60,8 +71,9 @@ inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false, bool yi } } - // This is unlimited wait, but we wait for 4 secs and check if the device is - // unstable, if so we return, otherwise we continue to wait in the while loop. + const long max_wait_ms = HIP_HANG_RECOVERY_ENABLE + ? static_cast(HIP_MAX_SIGNAL_WAIT) * 1000L : 0; + while (Hsa::signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne, kTimeout4Secs, wait_state) != 0) { if (HIP_SKIP_ABORT_ON_GPU_ERROR && amd::Device::IsGPUInError()) { @@ -71,10 +83,40 @@ inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false, bool yi signal.handle, kTimeout4Secs); return true; } + + if (HIP_HANG_RECOVERY_ENABLE) { + auto elapsed = std::chrono::duration_cast( + std::chrono::steady_clock::now() - wait_start).count(); + HIP_DLOG("[HIP-DEBUG] WaitForSignal STALL: signal=0x%lx, " + "elapsed=%ldms, tid=%d\n", + signal.handle, (long)elapsed, (int)syscall(SYS_gettid)); + + if (max_wait_ms > 0 && elapsed >= max_wait_ms) { + HIP_DLOG("[HIP-DEBUG] WaitForSignal TIMEOUT: signal=0x%lx HUNG for %ldms " + "(limit=%lds), forcing recovery. tid=%d\n", + signal.handle, (long)elapsed, (long)HIP_MAX_SIGNAL_WAIT, + (int)syscall(SYS_gettid)); + LogPrintfWarning("[HIP-HANG] Signal 0x%lx hung for %ld ms, forcing recovery", + signal.handle, (long)elapsed); + Hsa::signal_silent_store_relaxed(signal, 0); + if (out_aborted) *out_aborted = true; + return true; + } + } + if (yield && wait_state == HSA_WAIT_STATE_ACTIVE) { amd::Os::yield(); } } + + if (HIP_HANG_RECOVERY_ENABLE) { + auto elapsed = std::chrono::duration_cast( + std::chrono::steady_clock::now() - wait_start).count(); + if (elapsed > 100) { + HIP_DLOG("[HIP-DEBUG] WaitForSignal DONE: signal=0x%lx, elapsed=%ldms, tid=%d\n", + signal.handle, (long)elapsed, (int)syscall(SYS_gettid)); + } + } } return true; diff --git a/rocclr/utils/flags.hpp b/rocclr/utils/flags.hpp index ea815f6e75..40f378c496 100644 --- a/rocclr/utils/flags.hpp +++ b/rocclr/utils/flags.hpp @@ -260,6 +260,12 @@ release(bool, DEBUG_HIP_IGNORE_STREAM_PRIORITY, false, \ "Ignore priority streams") \ release(uint, HIP_SKIP_ABORT_ON_GPU_ERROR, true, \ "Set this to true, to avoid host side abort for GPU errors") \ +release(uint, HIP_HANG_RECOVERY_ENABLE, 0, \ + "Enable hang recovery features: signal timeout abort, SDMA bypass," \ + " SIGABRT handler. 0=off (default), 1=on") \ +release(uint, HIP_MAX_SIGNAL_WAIT, 60, \ + "Max seconds to wait for signal before abort when hang recovery" \ + " enabled. 0=infinite") \ release(bool, HIP_FORCE_SPIRV_CODEOBJECT, false, \ "Force use of SPIRV instead of device specific code object.") \ release(uint, DEBUG_CLR_BATCH_CPU_SYNC_SIZE, 8, \