From be35901bb51d7b326a3926d06e07b0ed7017b34f Mon Sep 17 00:00:00 2001 From: Ben Ashbaugh Date: Sun, 1 Mar 2026 09:06:38 -0800 Subject: [PATCH 01/12] initial changes and refactoring --- intercept/src/controls.h | 1 + intercept/src/dispatch.cpp | 69 +++++++++++++------------------------- intercept/src/intercept.h | 45 +++++++------------------ 3 files changed, 36 insertions(+), 79 deletions(-) diff --git a/intercept/src/controls.h b/intercept/src/controls.h index 8a66ba03..02da6a35 100644 --- a/intercept/src/controls.h +++ b/intercept/src/controls.h @@ -14,6 +14,7 @@ CLI_CONTROL_SEPARATOR( Tracing Controls: ) CLI_CONTROL( bool, BetaExtensionIntercepting, true, "If set to a nonzero value, the Intercept Layer for OpenCL Applications will intercept extension APIs for beta extensions that are subject to change. If an application uses beta extensions and does not function correctly with the Intercept Layer for OpenCL Applications, setting this control to zero may allow the application to function correctly, albeit without the ability to debug and analyze the beta extension APIs." ) +CLI_CONTROL( bool, MultiThreadedProcessing, true, "TODO" ) CLI_CONTROL_SEPARATOR( Logging Controls: ) CLI_CONTROL( bool, SuppressLogging, false, "If set to a nonzero value, suppresses all logging output from the Intercept Layer for OpenCL Applications. This is particularly useful for tools that only want report data." ) diff --git a/intercept/src/dispatch.cpp b/intercept/src/dispatch.cpp index 4c45515b..6771e597 100644 --- a/intercept/src/dispatch.cpp +++ b/intercept/src/dispatch.cpp @@ -643,8 +643,7 @@ CL_API_ENTRY cl_int CL_API_CALL CLIRN(clReleaseContext)( ADD_OBJECT_RELEASE( context ); --ref_count; CALL_LOGGING_EXIT( retVal, "[ ref count = %d ]", ref_count ); - DEVICE_PERFORMANCE_TIMING_CHECK_CONDITIONAL( ref_count == 0 ); - FLUSH_CHROME_TRACE_BUFFERING_CONDITIONAL( ref_count == 0 ); + PROCESS_DATA_AND_FLUSH_CONDITIONAL( ref_count == 0 ); #if 0 pIntercept->report(); @@ -3042,8 +3041,7 @@ CL_API_ENTRY cl_int CL_API_CALL CLIRN(clWaitForEvents)( HOST_PERFORMANCE_TIMING_END(); CHECK_ERROR( retVal ); CALL_LOGGING_EXIT( retVal ); - DEVICE_PERFORMANCE_TIMING_CHECK(); - FLUSH_CHROME_TRACE_BUFFERING(); + PROCESS_DATA_AND_FLUSH(); return retVal; } @@ -3342,8 +3340,7 @@ CL_API_ENTRY cl_int CL_API_CALL CLIRN(clFinish)( HOST_PERFORMANCE_TIMING_END(); CHECK_ERROR( retVal ); CALL_LOGGING_EXIT( retVal ); - DEVICE_PERFORMANCE_TIMING_CHECK(); - FLUSH_CHROME_TRACE_BUFFERING(); + PROCESS_DATA_AND_FLUSH(); return retVal; } @@ -3428,8 +3425,7 @@ CL_API_ENTRY cl_int CL_API_CALL CLIRN(clEnqueueReadBuffer)( CHECK_ERROR( retVal ); ADD_OBJECT_ALLOCATION_EVENT( retVal, event ); CALL_LOGGING_EXIT_EVENT_WITH_TAG( retVal, event ); - DEVICE_PERFORMANCE_TIMING_CHECK_CONDITIONAL( blocking_read ); - FLUSH_CHROME_TRACE_BUFFERING_CONDITIONAL( blocking_read ); + PROCESS_DATA_AND_FLUSH_CONDITIONAL( blocking_read ); ADD_EVENT( event ? event[0] : NULL ); } @@ -3527,8 +3523,7 @@ CL_API_ENTRY cl_int CL_API_CALL CLIRN(clEnqueueReadBufferRect)( CHECK_ERROR( retVal ); ADD_OBJECT_ALLOCATION_EVENT( retVal, event ); CALL_LOGGING_EXIT_EVENT_WITH_TAG( retVal, event ); - DEVICE_PERFORMANCE_TIMING_CHECK_CONDITIONAL( blocking_read ); - FLUSH_CHROME_TRACE_BUFFERING_CONDITIONAL( blocking_read ); + PROCESS_DATA_AND_FLUSH_CONDITIONAL( blocking_read ); ADD_EVENT( event ? event[0] : NULL ); } @@ -3618,8 +3613,7 @@ CL_API_ENTRY cl_int CL_API_CALL CLIRN(clEnqueueWriteBuffer)( CHECK_ERROR( retVal ); ADD_OBJECT_ALLOCATION_EVENT( retVal, event ); CALL_LOGGING_EXIT_EVENT_WITH_TAG( retVal, event ); - DEVICE_PERFORMANCE_TIMING_CHECK_CONDITIONAL( blocking_write ); - FLUSH_CHROME_TRACE_BUFFERING_CONDITIONAL( blocking_write ); + PROCESS_DATA_AND_FLUSH_CONDITIONAL( blocking_write ); ADD_EVENT( event ? event[0] : NULL ); } @@ -3717,8 +3711,7 @@ CL_API_ENTRY cl_int CL_API_CALL CLIRN(clEnqueueWriteBufferRect)( CHECK_ERROR( retVal ); ADD_OBJECT_ALLOCATION_EVENT( retVal, event ); CALL_LOGGING_EXIT_EVENT_WITH_TAG( retVal, event ); - DEVICE_PERFORMANCE_TIMING_CHECK_CONDITIONAL( blocking_write ); - FLUSH_CHROME_TRACE_BUFFERING_CONDITIONAL( blocking_write ); + PROCESS_DATA_AND_FLUSH_CONDITIONAL( blocking_write ); ADD_EVENT( event ? event[0] : NULL ); } @@ -4075,8 +4068,7 @@ CL_API_ENTRY cl_int CL_API_CALL CLIRN(clEnqueueReadImage)( CHECK_ERROR( retVal ); ADD_OBJECT_ALLOCATION_EVENT( retVal, event ); CALL_LOGGING_EXIT_EVENT_WITH_TAG( retVal, event ); - DEVICE_PERFORMANCE_TIMING_CHECK_CONDITIONAL( blocking_read ); - FLUSH_CHROME_TRACE_BUFFERING_CONDITIONAL( blocking_read ); + PROCESS_DATA_AND_FLUSH_CONDITIONAL( blocking_read ); ADD_EVENT( event ? event[0] : NULL ); } @@ -4170,8 +4162,7 @@ CL_API_ENTRY cl_int CL_API_CALL CLIRN(clEnqueueWriteImage)( CHECK_ERROR( retVal ); ADD_OBJECT_ALLOCATION_EVENT( retVal, event ); CALL_LOGGING_EXIT_EVENT_WITH_TAG( retVal, event ); - DEVICE_PERFORMANCE_TIMING_CHECK_CONDITIONAL( blocking_write ); - FLUSH_CHROME_TRACE_BUFFERING_CONDITIONAL( blocking_write ); + PROCESS_DATA_AND_FLUSH_CONDITIONAL( blocking_write ); ADD_EVENT( event ? event[0] : NULL ); } @@ -4554,8 +4545,7 @@ CL_API_ENTRY void* CL_API_CALL CLIRN(clEnqueueMapBuffer)( "[ map count = %d ] returned %p", map_count, retVal ); - DEVICE_PERFORMANCE_TIMING_CHECK_CONDITIONAL( blocking_map ); - FLUSH_CHROME_TRACE_BUFFERING_CONDITIONAL( blocking_map ); + PROCESS_DATA_AND_FLUSH_CONDITIONAL( blocking_map ); ADD_EVENT( event ? event[0] : NULL ); } @@ -4678,8 +4668,7 @@ CL_API_ENTRY void* CL_API_CALL CLIRN(clEnqueueMapImage)( "[ map count = %d ] returned %p", map_count, retVal ); - DEVICE_PERFORMANCE_TIMING_CHECK_CONDITIONAL( blocking_map ); - FLUSH_CHROME_TRACE_BUFFERING_CONDITIONAL( blocking_map ); + PROCESS_DATA_AND_FLUSH_CONDITIONAL( blocking_map ); ADD_EVENT( event ? event[0] : NULL ); } @@ -5190,8 +5179,7 @@ CL_API_ENTRY cl_int CL_API_CALL CLIRN(clEnqueueWaitForEvents)( HOST_PERFORMANCE_TIMING_END(); CHECK_ERROR( retVal ); CALL_LOGGING_EXIT( retVal ); - DEVICE_PERFORMANCE_TIMING_CHECK(); - FLUSH_CHROME_TRACE_BUFFERING(); + PROCESS_DATA_AND_FLUSH(); } FINISH_OR_FLUSH_AFTER_ENQUEUE( command_queue ); @@ -6306,8 +6294,7 @@ CL_API_ENTRY cl_int CL_API_CALL CLIRN(clEnqueueReleaseGLObjects)( CHECK_ERROR( retVal ); ADD_OBJECT_ALLOCATION_EVENT( retVal, event ); CALL_LOGGING_EXIT_EVENT( retVal, event ); - DEVICE_PERFORMANCE_TIMING_CHECK(); - FLUSH_CHROME_TRACE_BUFFERING(); + PROCESS_DATA_AND_FLUSH(); ADD_EVENT( event ? event[0] : NULL ); } @@ -6520,8 +6507,7 @@ CL_API_ENTRY cl_int CL_API_CALL CLIRN(clEnqueueSVMMemcpy) ( CHECK_ERROR( retVal ); ADD_OBJECT_ALLOCATION_EVENT( retVal, event ); CALL_LOGGING_EXIT_EVENT_WITH_TAG( retVal, event ); - DEVICE_PERFORMANCE_TIMING_CHECK_CONDITIONAL( blocking_copy ); - FLUSH_CHROME_TRACE_BUFFERING_CONDITIONAL( blocking_copy ); + PROCESS_DATA_AND_FLUSH_CONDITIONAL( blocking_copy ); ADD_EVENT( event ? event[0] : NULL ); } @@ -6659,8 +6645,7 @@ CL_API_ENTRY cl_int CL_API_CALL CLIRN(clEnqueueSVMMap) ( ADD_MAP_POINTER( svm_ptr, map_flags, size ); ADD_OBJECT_ALLOCATION_EVENT( retVal, event ); CALL_LOGGING_EXIT_EVENT_WITH_TAG( retVal, event ); - DEVICE_PERFORMANCE_TIMING_CHECK_CONDITIONAL( blocking_map ); - FLUSH_CHROME_TRACE_BUFFERING_CONDITIONAL( blocking_map ); + PROCESS_DATA_AND_FLUSH_CONDITIONAL( blocking_map ); ADD_EVENT( event ? event[0] : NULL ); } @@ -7637,8 +7622,7 @@ CL_API_ENTRY cl_int CL_API_CALL clEnqueueReleaseExternalMemObjectsKHR( CHECK_ERROR( retVal ); ADD_OBJECT_ALLOCATION_EVENT( retVal, event ); CALL_LOGGING_EXIT_EVENT( retVal, event ); - DEVICE_PERFORMANCE_TIMING_CHECK(); - FLUSH_CHROME_TRACE_BUFFERING(); + PROCESS_DATA_AND_FLUSH(); ADD_EVENT( event ? event[0] : NULL ); } @@ -8026,8 +8010,7 @@ CL_API_ENTRY cl_int CL_API_CALL clEnqueueReleaseD3D10ObjectsKHR( CHECK_ERROR( retVal ); ADD_OBJECT_ALLOCATION_EVENT( retVal, event ); CALL_LOGGING_EXIT_EVENT( retVal, event ); - DEVICE_PERFORMANCE_TIMING_CHECK(); - FLUSH_CHROME_TRACE_BUFFERING(); + PROCESS_DATA_AND_FLUSH(); ADD_EVENT( event ? event[0] : NULL ); } @@ -8331,8 +8314,7 @@ CL_API_ENTRY cl_int CL_API_CALL clEnqueueReleaseD3D11ObjectsKHR( CHECK_ERROR( retVal ); ADD_OBJECT_ALLOCATION_EVENT( retVal, event ); CALL_LOGGING_EXIT_EVENT( retVal, event ); - DEVICE_PERFORMANCE_TIMING_CHECK(); - FLUSH_CHROME_TRACE_BUFFERING(); + PROCESS_DATA_AND_FLUSH(); ADD_EVENT( event ? event[0] : NULL ); } @@ -8548,8 +8530,7 @@ CL_API_ENTRY cl_int CL_API_CALL clEnqueueReleaseDX9MediaSurfacesKHR( CHECK_ERROR( retVal ); ADD_OBJECT_ALLOCATION_EVENT( retVal, event ); CALL_LOGGING_EXIT_EVENT( retVal, event ); - DEVICE_PERFORMANCE_TIMING_CHECK(); - FLUSH_CHROME_TRACE_BUFFERING(); + PROCESS_DATA_AND_FLUSH(); ADD_EVENT( event ? event[0] : NULL ); } @@ -8765,8 +8746,7 @@ CL_API_ENTRY cl_int CL_API_CALL clEnqueueReleaseDX9ObjectsINTEL( CHECK_ERROR( retVal ); ADD_OBJECT_ALLOCATION_EVENT( retVal, event ); CALL_LOGGING_EXIT_EVENT( retVal, event ); - DEVICE_PERFORMANCE_TIMING_CHECK(); - FLUSH_CHROME_TRACE_BUFFERING(); + PROCESS_DATA_AND_FLUSH(); ADD_EVENT( event ? event[0] : NULL ); } @@ -9733,8 +9713,7 @@ CL_API_ENTRY cl_int CL_API_CALL clEnqueueReleaseVA_APIMediaSurfacesINTEL( CHECK_ERROR( retVal ); ADD_OBJECT_ALLOCATION_EVENT( retVal, event ); CALL_LOGGING_EXIT_EVENT( retVal, event ); - DEVICE_PERFORMANCE_TIMING_CHECK(); - FLUSH_CHROME_TRACE_BUFFERING(); + PROCESS_DATA_AND_FLUSH(); ADD_EVENT( event ? event[0] : NULL ); } @@ -10269,8 +10248,7 @@ clMemBlockingFreeINTEL( CHECK_ERROR( retVal ); ADD_POINTER_FREE( ptr ); CALL_LOGGING_EXIT( retVal ); - DEVICE_PERFORMANCE_TIMING_CHECK(); - FLUSH_CHROME_TRACE_BUFFERING(); + PROCESS_DATA_AND_FLUSH(); return retVal; } @@ -10565,8 +10543,7 @@ CL_API_ENTRY cl_int CL_API_CALL clEnqueueMemcpyINTEL( CHECK_ERROR( retVal ); ADD_OBJECT_ALLOCATION_EVENT( retVal, event ); CALL_LOGGING_EXIT_EVENT_WITH_TAG( retVal, event ); - DEVICE_PERFORMANCE_TIMING_CHECK_CONDITIONAL( blocking ); - FLUSH_CHROME_TRACE_BUFFERING_CONDITIONAL( blocking ); + PROCESS_DATA_AND_FLUSH_CONDITIONAL( blocking ); ADD_EVENT( event ? event[0] : NULL ); } diff --git a/intercept/src/intercept.h b/intercept/src/intercept.h index a54f1160..08f29665 100644 --- a/intercept/src/intercept.h +++ b/intercept/src/intercept.h @@ -3487,7 +3487,14 @@ inline bool CLIntercept::checkDevicePerformanceTimingEnqueueLimits( } \ } -#define DEVICE_PERFORMANCE_TIMING_CHECK() \ +/////////////////////////////////////////////////////////////////////////////// +// +inline void CLIntercept::flushChromeTraceBuffering() +{ + m_ChromeTrace.flush(); +} + +#define PROCESS_DATA_AND_FLUSH() \ if( pIntercept->config().DevicePerformanceTiming || \ pIntercept->config().ITTPerformanceTiming || \ pIntercept->config().ChromePerformanceTiming || \ @@ -3497,29 +3504,7 @@ inline bool CLIntercept::checkDevicePerformanceTimingEnqueueLimits( TOOL_OVERHEAD_TIMING_START(); \ pIntercept->checkTimingEvents(); \ TOOL_OVERHEAD_TIMING_END( "(device timing overhead)" ); \ - } - -#define DEVICE_PERFORMANCE_TIMING_CHECK_CONDITIONAL( _condition ) \ - if( ( _condition ) && \ - ( pIntercept->config().DevicePerformanceTiming || \ - pIntercept->config().ITTPerformanceTiming || \ - pIntercept->config().ChromePerformanceTiming || \ - pIntercept->config().DevicePerfCounterEventBasedSampling || \ - pIntercept->config().DevicePerfCounterTimeBasedSampling ) ) \ - { \ - TOOL_OVERHEAD_TIMING_START(); \ - pIntercept->checkTimingEvents(); \ - TOOL_OVERHEAD_TIMING_END( "(device timing overhead)" ); \ - } - -/////////////////////////////////////////////////////////////////////////////// -// -inline void CLIntercept::flushChromeTraceBuffering() -{ - m_ChromeTrace.flush(); -} - -#define FLUSH_CHROME_TRACE_BUFFERING() \ + } \ if( pIntercept->config().ChromeTraceBufferSize && \ pIntercept->config().ChromeTraceBufferingBlockingCallFlush && \ ( pIntercept->config().ChromeCallLogging || \ @@ -3530,16 +3515,10 @@ inline void CLIntercept::flushChromeTraceBuffering() TOOL_OVERHEAD_TIMING_END( "(chrome trace flush overhead)" ); \ } -#define FLUSH_CHROME_TRACE_BUFFERING_CONDITIONAL( _condition ) \ - if( ( _condition ) && \ - pIntercept->config().ChromeTraceBufferSize && \ - pIntercept->config().ChromeTraceBufferingBlockingCallFlush && \ - ( pIntercept->config().ChromeCallLogging || \ - pIntercept->config().ChromePerformanceTiming ) ) \ +#define PROCESS_DATA_AND_FLUSH_CONDITIONAL( _condition ) \ + if( _condition ) \ { \ - TOOL_OVERHEAD_TIMING_START(); \ - pIntercept->flushChromeTraceBuffering(); \ - TOOL_OVERHEAD_TIMING_END( "(chrome trace flush overhead)" ); \ + PROCESS_DATA_AND_FLUSH(); \ } /////////////////////////////////////////////////////////////////////////////// From c8ae39508fe4eb7f59c3468c9542c7385ec948f6 Mon Sep 17 00:00:00 2001 From: Ben Ashbaugh Date: Sun, 8 Mar 2026 21:00:12 -0700 Subject: [PATCH 02/12] add a separate mutex for managing the event list --- intercept/mdapi/intercept_mdapi.cpp | 105 ++++++++++++++-------------- intercept/src/eventlist.h | 71 +++++++++++++++++++ intercept/src/intercept.cpp | 102 +++++++++++++++++++++++---- intercept/src/intercept.h | 38 +++++----- 4 files changed, 230 insertions(+), 86 deletions(-) create mode 100644 intercept/src/eventlist.h diff --git a/intercept/mdapi/intercept_mdapi.cpp b/intercept/mdapi/intercept_mdapi.cpp index 8a716453..2d5fcd39 100644 --- a/intercept/mdapi/intercept_mdapi.cpp +++ b/intercept/mdapi/intercept_mdapi.cpp @@ -424,68 +424,65 @@ void CLIntercept::getMDAPICountersFromEvent( { const size_t reportSize = m_pMDHelper->GetQueryReportSize(); - char* pReport = new char[ reportSize ]; - if( pReport ) + std::vector report(reportSize); + + size_t outputSize = 0; + cl_int errorCode = dispatch().clGetEventProfilingInfo( + event, + CL_PROFILING_COMMAND_PERFCOUNTERS_INTEL, + reportSize, + report.data(), + &outputSize ); + + std::lock_guard lock(m_Mutex); + + if( errorCode == CL_SUCCESS ) { - size_t outputSize = 0; - cl_int errorCode = dispatch().clGetEventProfilingInfo( - event, - CL_PROFILING_COMMAND_PERFCOUNTERS_INTEL, - reportSize, - pReport, - &outputSize ); + // Check: The size of the queried report should be the expected size. + CLI_ASSERT( outputSize == reportSize ); - if( errorCode == CL_SUCCESS ) - { - // Check: The size of the queried report should be the expected size. - CLI_ASSERT( outputSize == reportSize ); + std::vector results; + std::vector maxValues; + std::vector ioInfoValues; // unused - std::vector results; - std::vector maxValues; - std::vector ioInfoValues; // unused + uint32_t numResults = m_pMDHelper->GetMetricsFromReports( + 1, + report.data(), + results, + maxValues ); - uint32_t numResults = m_pMDHelper->GetMetricsFromReports( - 1, - pReport, + if( numResults ) + { + m_pMDHelper->PrintMetricValues( + m_MetricDump, + name, + numResults, results, - maxValues ); - - if( numResults ) - { - m_pMDHelper->PrintMetricValues( - m_MetricDump, - name, - numResults, - results, - maxValues, - ioInfoValues ); - m_pMDHelper->AggregateMetrics( - m_MetricAggregations, - name, - results ); - } + maxValues, + ioInfoValues ); + m_pMDHelper->AggregateMetrics( + m_MetricAggregations, + name, + results ); } - else + } + else + { + // Currently, MDAPI data is only included for kernels, so only + // report an errors for kernel events. + cl_command_type type = 0; + dispatch().clGetEventInfo( + event, + CL_EVENT_COMMAND_TYPE, + sizeof(type), + &type, + NULL ); + if( type == CL_COMMAND_NDRANGE_KERNEL ) { - // Currently, MDAPI data is only included for kernels, so only - // report an errors for kernel events. - cl_command_type type = 0; - dispatch().clGetEventInfo( - event, - CL_EVENT_COMMAND_TYPE, - sizeof(type), - &type, - NULL ); - if( type == CL_COMMAND_NDRANGE_KERNEL ) - { - logf("Couldn't get MDAPI data for kernel! clGetEventProfilingInfo returned '%s' (%08X)!\n", - enumName().name(errorCode).c_str(), - errorCode ); - } + logf("Couldn't get MDAPI data for kernel! clGetEventProfilingInfo returned '%s' (%08X)!\n", + enumName().name(errorCode).c_str(), + errorCode ); } - - delete [] pReport; - pReport = NULL; } } } diff --git a/intercept/src/eventlist.h b/intercept/src/eventlist.h new file mode 100644 index 00000000..f99c2476 --- /dev/null +++ b/intercept/src/eventlist.h @@ -0,0 +1,71 @@ +/* +// Copyright (c) 2026 Intel Corporation +// +// SPDX-License-Identifier: MIT +*/ + +#pragma once + +#include +#include +#include +#include + +#include "common.h" + +class CEventList +{ +public: +// !!! TODO: figure out what to do here! +#if defined(CLINTERCEPT_HIGH_RESOLUTON_CLOCK) + using clock = std::chrono::high_resolution_clock; +#else + using clock = std::chrono::steady_clock; +#endif + + struct Node + { + cl_device_id Device; + unsigned int QueueNumber; + std::string Name; + uint64_t EnqueueCounter; + clock::time_point QueuedTime; + bool UseProfilingDelta; + int64_t ProfilingDeltaNS; + cl_event Event; + }; + + using const_iterator = std::list::const_iterator; + + CEventList() = default; + ~CEventList() = default; + CEventList( const CEventList& ) = delete; + CEventList& operator=( const CEventList& ) = delete; + + void addNode( Node&& node ) + { + std::lock_guard lock(m_Mutex); + m_EventList.push_back( std::move(node) ); + } + + void erase( const_iterator iterator ) + { + std::lock_guard lock(m_Mutex); + m_EventList.erase( iterator ); + } + + const_iterator begin() const + { + return m_EventList.begin(); + } + + const_iterator end() const + { + return m_EventList.end(); + } + +private: + std::mutex m_Mutex; + + std::list m_EventList; +}; diff --git a/intercept/src/intercept.cpp b/intercept/src/intercept.cpp index 82b9a144..7c11df60 100644 --- a/intercept/src/intercept.cpp +++ b/intercept/src/intercept.cpp @@ -182,6 +182,14 @@ CLIntercept::~CLIntercept() stopAubCapture( NULL ); report(); + if( m_Config.MultiThreadedProcessing ) + { + fprintf(stderr, "Shutting down processing thread...\n"); + m_ProcessingDone = true; + m_ProcessingCondition.notify_all(); + m_ProcessingThread.join(); + } + std::lock_guard lock(m_Mutex); log( "CLIntercept is shutting down...\n" ); @@ -648,11 +656,74 @@ bool CLIntercept::init() m_ChromeTrace.addStartTimeMetadata( usStartTime ); } + if( m_Config.MultiThreadedProcessing ) + { + fprintf(stderr, "Starting processing thread...\n"); + m_ProcessingDone = false; + m_ProcessingThread = std::thread( CLIntercept::processingThreadFunc, this ); + } + log( "... loading complete.\n" ); return true; } +/////////////////////////////////////////////////////////////////////////////// +// +void CLIntercept::processingThreadFunc( CLIntercept* pIntercept ) +{ + while( !pIntercept->m_ProcessingDone ) + { + std::unique_lock lock( pIntercept->m_ProcessingMutex ); + pIntercept->m_ProcessingCondition.wait( lock ); + + //fprintf(stderr, "Processing thread woke up...\n" ); + + pIntercept->processData(); + + //fprintf(stderr, "Processing done.\n" ); + } +} + +/////////////////////////////////////////////////////////////////////////////// +// +void CLIntercept::notifyProcessingThread() +{ + std::lock_guard processingLock(m_ProcessingMutex); + std::lock_guard lock(m_Mutex); + + m_ProcessingCondition.notify_one(); +} + +/////////////////////////////////////////////////////////////////////////////// +// +void CLIntercept::processData() +{ + CLIntercept* pIntercept = this; + + GET_ENQUEUE_COUNTER(); + + if( pIntercept->config().DevicePerformanceTiming || + pIntercept->config().ITTPerformanceTiming || + pIntercept->config().ChromePerformanceTiming || + pIntercept->config().DevicePerfCounterEventBasedSampling || + pIntercept->config().DevicePerfCounterTimeBasedSampling ) + { + TOOL_OVERHEAD_TIMING_START(); + pIntercept->checkTimingEvents(); + TOOL_OVERHEAD_TIMING_END( "(device timing overhead)" ); + } + if( pIntercept->config().ChromeTraceBufferSize && + pIntercept->config().ChromeTraceBufferingBlockingCallFlush && + ( pIntercept->config().ChromeCallLogging || + pIntercept->config().ChromePerformanceTiming ) ) + { + TOOL_OVERHEAD_TIMING_START(); + pIntercept->flushChromeTraceBuffering(); + TOOL_OVERHEAD_TIMING_END( "(chrome trace flush overhead)" ); + } +} + /////////////////////////////////////////////////////////////////////////////// // void CLIntercept::report() @@ -1336,6 +1407,7 @@ void CLIntercept::cachePlatformInfo() void CLIntercept::cacheDeviceInfo( cl_device_id device ) { + // TODO: Can we be smarter here and return the found device info? if( device && m_DeviceInfoMap.find(device) == m_DeviceInfoMap.end() ) { SDeviceInfo& deviceInfo = m_DeviceInfoMap[device]; @@ -6517,8 +6589,6 @@ void CLIntercept::addTimingEvent( const cl_command_queue queue, cl_event event ) { - std::lock_guard lock(m_Mutex); - if( event == NULL ) { logf( "Unexpectedly got a NULL timing event for %s, check for OpenCL errors!\n", @@ -6526,9 +6596,7 @@ void CLIntercept::addTimingEvent( return; } - m_EventList.emplace_back(); - - SEventListNode& node = m_EventList.back(); + CEventList::Node node; cl_device_id device = NULL; dispatch().clGetCommandQueueInfo( @@ -6538,10 +6606,6 @@ void CLIntercept::addTimingEvent( &device, NULL ); - // Cache the device info if it's not cached already, since we'll print - // the device name and other device properties as part of the report. - cacheDeviceInfo( device ); - dispatch().clRetainEvent( event ); node.Device = device; @@ -6555,6 +6619,12 @@ void CLIntercept::addTimingEvent( if( device ) { + std::lock_guard lock(m_Mutex); + + // Cache the device info if it's not cached already, since we'll print + // the device name and other device properties as part of the report. + cacheDeviceInfo( device ); + const SDeviceInfo& deviceInfo = m_DeviceInfoMap[device]; // Note: Even though ideally the intercept timer and the host timer should advance @@ -6600,16 +6670,16 @@ void CLIntercept::addTimingEvent( // hostTimeNS ); } } + + m_EventList.addNode(std::move(node)); } /////////////////////////////////////////////////////////////////////////////// // void CLIntercept::checkTimingEvents() { - std::lock_guard lock(m_Mutex); - - CEventList::iterator current = m_EventList.begin(); - CEventList::iterator next; + CEventList::const_iterator current = m_EventList.begin(); + CEventList::const_iterator next; while( current != m_EventList.end() ) { @@ -6619,7 +6689,7 @@ void CLIntercept::checkTimingEvents() next = current; ++next; - const SEventListNode& node = *current; + const CEventList::Node& node = *current; errorCode = dispatch().clGetEventInfo( node.Event, @@ -6668,6 +6738,8 @@ void CLIntercept::checkTimingEvents() NULL ); if( errorCode == CL_SUCCESS ) { + std::lock_guard lock(m_Mutex); + cl_ulong delta = commandEnd - commandStart; SDeviceTimingStats& deviceTimingStats = m_DeviceTimingStatsMap[node.Device][node.Name]; @@ -6776,6 +6848,8 @@ void CLIntercept::checkTimingEvents() break; case CL_INVALID_EVENT: { + std::lock_guard lock(m_Mutex); + // This is unexpected. We retained the event when we // added it to the list. Remove the event from the // list. diff --git a/intercept/src/intercept.h b/intercept/src/intercept.h index 08f29665..3870eccd 100644 --- a/intercept/src/intercept.h +++ b/intercept/src/intercept.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -16,6 +17,7 @@ #include #include #include +#include #include #include @@ -25,6 +27,7 @@ #include "chrometracer.h" #include "cmdbufrecorder.h" #include "enummap.h" +#include "eventlist.h" #include "dispatch.h" #include "objtracker.h" @@ -61,6 +64,9 @@ class CLIntercept static bool Create( void* pGlobalData, CLIntercept*& pIntercept ); static void Delete( CLIntercept*& pIntercept ); + void notifyProcessingThread(); + void processData(); + void report(); void callLoggingEnter( @@ -1082,6 +1088,7 @@ class CLIntercept std::ofstream m_InterceptLog; CChromeTracer m_ChromeTrace; + CEventList m_EventList; mutable char m_StringBuffer[CLI_STRING_BUFFER_SIZE]; @@ -1101,6 +1108,15 @@ class CLIntercept unsigned int m_ProgramNumber; + // Multi-threaded processing and flushing: + + std::thread m_ProcessingThread; + std::mutex m_ProcessingMutex; + std::condition_variable m_ProcessingCondition; + std::atomic m_ProcessingDone; + + static void processingThreadFunc( CLIntercept* pIntercept ); + // This defines a mapping between a sub-device handle and information // about the sub-device. @@ -1251,9 +1267,6 @@ class CLIntercept cl_event Event; }; - typedef std::list< SEventListNode > CEventList; - CEventList m_EventList; - #if defined(USE_MDAPI) MetricsDiscovery::MDHelper* m_pMDHelper; MetricsDiscovery::CMetricAggregations m_MetricAggregations; @@ -3495,24 +3508,13 @@ inline void CLIntercept::flushChromeTraceBuffering() } #define PROCESS_DATA_AND_FLUSH() \ - if( pIntercept->config().DevicePerformanceTiming || \ - pIntercept->config().ITTPerformanceTiming || \ - pIntercept->config().ChromePerformanceTiming || \ - pIntercept->config().DevicePerfCounterEventBasedSampling || \ - pIntercept->config().DevicePerfCounterTimeBasedSampling ) \ + if( pIntercept->config().MultiThreadedProcessing ) \ { \ - TOOL_OVERHEAD_TIMING_START(); \ - pIntercept->checkTimingEvents(); \ - TOOL_OVERHEAD_TIMING_END( "(device timing overhead)" ); \ + pIntercept->notifyProcessingThread(); \ } \ - if( pIntercept->config().ChromeTraceBufferSize && \ - pIntercept->config().ChromeTraceBufferingBlockingCallFlush && \ - ( pIntercept->config().ChromeCallLogging || \ - pIntercept->config().ChromePerformanceTiming ) ) \ + else \ { \ - TOOL_OVERHEAD_TIMING_START(); \ - pIntercept->flushChromeTraceBuffering(); \ - TOOL_OVERHEAD_TIMING_END( "(chrome trace flush overhead)" ); \ + pIntercept->processData(); \ } #define PROCESS_DATA_AND_FLUSH_CONDITIONAL( _condition ) \ From 6539abd1aff6790c8ad59d4d503576ad5600b6a4 Mon Sep 17 00:00:00 2001 From: Ben Ashbaugh Date: Sun, 8 Mar 2026 22:40:20 -0700 Subject: [PATCH 03/12] multi-threaded processing is working --- intercept/src/intercept.cpp | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/intercept/src/intercept.cpp b/intercept/src/intercept.cpp index 7c11df60..31994570 100644 --- a/intercept/src/intercept.cpp +++ b/intercept/src/intercept.cpp @@ -179,17 +179,23 @@ CLIntercept::CLIntercept( void* pGlobalData ) // CLIntercept::~CLIntercept() { - stopAubCapture( NULL ); - report(); - if( m_Config.MultiThreadedProcessing ) { - fprintf(stderr, "Shutting down processing thread...\n"); - m_ProcessingDone = true; + { + std::lock_guard lock(m_Mutex); + log( "Waiting for processing thread...\n" ); + } + { + std::lock_guard lock(m_ProcessingMutex); + m_ProcessingDone = true; + } m_ProcessingCondition.notify_all(); m_ProcessingThread.join(); } + stopAubCapture( NULL ); + report(); + std::lock_guard lock(m_Mutex); log( "CLIntercept is shutting down...\n" ); @@ -658,9 +664,9 @@ bool CLIntercept::init() if( m_Config.MultiThreadedProcessing ) { - fprintf(stderr, "Starting processing thread...\n"); m_ProcessingDone = false; m_ProcessingThread = std::thread( CLIntercept::processingThreadFunc, this ); + log( "Processing Thread Started!\n" ); } log( "... loading complete.\n" ); @@ -689,9 +695,6 @@ void CLIntercept::processingThreadFunc( CLIntercept* pIntercept ) // void CLIntercept::notifyProcessingThread() { - std::lock_guard processingLock(m_ProcessingMutex); - std::lock_guard lock(m_Mutex); - m_ProcessingCondition.notify_one(); } From f43206e135367a06814c5421e41f3dfee95cb242 Mon Sep 17 00:00:00 2001 From: Ben Ashbaugh Date: Sun, 8 Mar 2026 23:38:17 -0700 Subject: [PATCH 04/12] add a cliloader control for min and max enqueue --- cliloader/cliloader.cpp | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/cliloader/cliloader.cpp b/cliloader/cliloader.cpp index c85c8a87..006d6ba9 100644 --- a/cliloader/cliloader.cpp +++ b/cliloader/cliloader.cpp @@ -517,6 +517,24 @@ static bool parseArguments(int argc, char *argv[]) { checkSetEnv("CLI_HostPerformanceTiming", "1"); } + else if( !strcmp(argv[i], "-min") || !strcmp(argv[i], "--min-enqueue") ) + { + ++i; + if( i < argc ) + { + checkSetEnv("CLI_HostPerformanceTimingMinEnqueue", argv[i]); + checkSetEnv("CLI_DevicePerformanceTimingMinEnqueue", argv[i]); + } + } + else if( !strcmp(argv[i], "-max") || !strcmp(argv[i], "--max-enqueue") ) + { + ++i; + if( i < argc ) + { + checkSetEnv("CLI_HostPerformanceTimingMaxEnqueue", argv[i]); + checkSetEnv("CLI_DevicePerformanceTimingMaxEnqueue", argv[i]); + } + } else if( !strcmp(argv[i], "-l") || !strcmp(argv[i], "--leak-checking") ) { checkSetEnv("CLI_LeakChecking", "1"); @@ -639,6 +657,8 @@ static bool parseArguments(int argc, char *argv[]) " --mdapi-group Choose MDAPI Metrics to Collect (Intel GPU Only)\n" " --mdapi-device Choose MDAPI Device for Metrics (Intel GPU Only)\n" " --host-timing [-h] Report Host API Execution Time\n" + " --min-enqueue [-min] Minimum Enqueue for Timing and Chrome Tracing\n" + " --max-enqueue [-max] Maximum Enqueue for Timing and Chrome Tracing\n" " --capture-enqueue Capture the Specified Kernel Enqueue\n" " --capture-kernel Capture the Specified Kernel Name\n" " --leak-checking [-l] Track and Report OpenCL Leaks\n" From 3f9d8ee405923d28f7b9382e9c0b52dfdd903c07 Mon Sep 17 00:00:00 2001 From: Ben Ashbaugh Date: Sun, 8 Mar 2026 23:38:32 -0700 Subject: [PATCH 05/12] improve output when waiting for the processing thread to complete --- intercept/src/eventlist.h | 6 ++++++ intercept/src/intercept.cpp | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/intercept/src/eventlist.h b/intercept/src/eventlist.h index f99c2476..1e50a335 100644 --- a/intercept/src/eventlist.h +++ b/intercept/src/eventlist.h @@ -64,6 +64,12 @@ class CEventList return m_EventList.end(); } + size_t size() + { + std::lock_guard lock(m_Mutex); + return m_EventList.size(); + } + private: std::mutex m_Mutex; diff --git a/intercept/src/intercept.cpp b/intercept/src/intercept.cpp index 31994570..9b85c684 100644 --- a/intercept/src/intercept.cpp +++ b/intercept/src/intercept.cpp @@ -183,7 +183,7 @@ CLIntercept::~CLIntercept() { { std::lock_guard lock(m_Mutex); - log( "Waiting for processing thread...\n" ); + logf( "CLIntercept is waiting for the processing thread to complete (%zu elements)...\n", m_EventList.size() ); } { std::lock_guard lock(m_ProcessingMutex); From f3d9b25eb6bef3f71d93ada6a8391bbb615b1fb6 Mon Sep 17 00:00:00 2001 From: Ben Ashbaugh Date: Thu, 12 Mar 2026 22:26:40 -0700 Subject: [PATCH 06/12] stop processing events during shutdown --- intercept/src/intercept.cpp | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/intercept/src/intercept.cpp b/intercept/src/intercept.cpp index 9b85c684..4acc1d32 100644 --- a/intercept/src/intercept.cpp +++ b/intercept/src/intercept.cpp @@ -179,16 +179,16 @@ CLIntercept::CLIntercept( void* pGlobalData ) // CLIntercept::~CLIntercept() { + if( m_EventList.size() > 0 ) + { + std::lock_guard lock(m_Mutex); + + logf( "CLIntercept is shutting down, but %zu events are unprocessed!\n", + m_EventList.size() ); + } if( m_Config.MultiThreadedProcessing ) { - { - std::lock_guard lock(m_Mutex); - logf( "CLIntercept is waiting for the processing thread to complete (%zu elements)...\n", m_EventList.size() ); - } - { - std::lock_guard lock(m_ProcessingMutex); - m_ProcessingDone = true; - } + m_ProcessingDone.store(true); m_ProcessingCondition.notify_all(); m_ProcessingThread.join(); } @@ -201,7 +201,7 @@ CLIntercept::~CLIntercept() log( "CLIntercept is shutting down...\n" ); // Set the dispatch to the dummy dispatch. The destructor is called - // as the process is terminating. We don't know when each DLL gets + // as the process is terminating. We don't know when drivers have been // unloaded, so it's not safe to call into any OpenCL functions in // our destructor. Setting to the dummy dispatch ensures that no // OpenCL functions get called. Note that this means we do potentially @@ -664,7 +664,7 @@ bool CLIntercept::init() if( m_Config.MultiThreadedProcessing ) { - m_ProcessingDone = false; + m_ProcessingDone.store(false); m_ProcessingThread = std::thread( CLIntercept::processingThreadFunc, this ); log( "Processing Thread Started!\n" ); } @@ -678,7 +678,7 @@ bool CLIntercept::init() // void CLIntercept::processingThreadFunc( CLIntercept* pIntercept ) { - while( !pIntercept->m_ProcessingDone ) + while( pIntercept->m_ProcessingDone.load() == false ) { std::unique_lock lock( pIntercept->m_ProcessingMutex ); pIntercept->m_ProcessingCondition.wait( lock ); @@ -6686,6 +6686,15 @@ void CLIntercept::checkTimingEvents() while( current != m_EventList.end() ) { + if( config().MultiThreadedProcessing && + m_ProcessingDone.load() == true ) + { + // If we are shutting down, drivers may have been unloaded, so it is + // no longer safe to make calls to check events and process data. + // There is not much else we can do, so just stop processing. + break; + } + cl_int errorCode = CL_SUCCESS; cl_int eventStatus = 0; From 20d80ce2beccab494788f72e4a942981d229b6fb Mon Sep 17 00:00:00 2001 From: Ben Ashbaugh Date: Wed, 18 Mar 2026 17:47:21 -0700 Subject: [PATCH 07/12] enable thread sanitizer build option --- CMakeLists.txt | 1 + intercept/CMakeLists.txt | 38 ++++++++++++++++++++++++++++++++++---- 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8f9e2039..ca7263c0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,6 +39,7 @@ option(ENABLE_CLIPROF "Enable cliprof Support and Build the Executable") option(ENABLE_ITT "Enable ITT (Instrumentation Tracing Technology) API Support") option(ENABLE_MDAPI "Enable MDAPI Support" ON) option(ENABLE_HIGH_RESOLUTION_CLOCK "Use the high_resolution_clock for timing instead of the steady_clock") +option(ENABLE_TSAN "Enable Thread Sanitizer") if(WIN32) option(ENABLE_CLICONFIG "Build the cliconfig Configuration Utility" ON) endif() diff --git a/intercept/CMakeLists.txt b/intercept/CMakeLists.txt index 11e81e7e..25df81c5 100644 --- a/intercept/CMakeLists.txt +++ b/intercept/CMakeLists.txt @@ -104,7 +104,6 @@ configure_file(src/git_version.rc.in "${CMAKE_CURRENT_BINARY_DIR}/git_version.rc # When set, this is used for guessing the location of the real OpenCL ICD loader. if(CMAKE_LIBRARY_ARCHITECTURE) - message(STATUS "CMAKE_LIBRARY_ARCHITECTURE is: ${CMAKE_LIBRARY_ARCHITECTURE}") target_compile_definitions(OpenCL PRIVATE CLINTERCEPT_LIBRARY_ARCHITECTURE="${CMAKE_LIBRARY_ARCHITECTURE}") endif() @@ -134,20 +133,45 @@ if(WIN32) elseif(${CMAKE_SYSTEM_NAME} STREQUAL "Linux" OR ${CMAKE_SYSTEM_NAME} STREQUAL "Android") # conditionally enabled on Linux and Android + + # If using the lld linker, we need to set linker emulation flags + if(NOT DEFINED LLD_EMULATION_FLAGS) + get_filename_component(LINKER_EXECUTABLE ${CMAKE_LINKER} NAME) + if(LINKER_EXECUTABLE MATCHES "lld" OR + ${CMAKE_SYSTEM_NAME} STREQUAL "FreeBSD") # default on FreeBSD? + if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "i386" OR + ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "i686") + set(LLD_EMULATION_FLAGS "-melf_i386") + elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64" OR + ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "amd64") + set(LLD_EMULATION_FLAGS "-melf_x86_64") + elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "aarch64") + set(LLD_EMULATION_FLAGS "-maarch64elf") + elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "riscv64") + set(LLD_EMULATION_FLAGS "-melf64lriscv") + elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "arm" OR + ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "armv7") + set(LLD_EMULATION_FLAGS "-marmelf") + else() + message(STATUS "Unknown system processor ${CMAKE_SYSTEM_PROCESSOR} for lld target emulation flags!") + endif() + endif() + endif() + if(ENABLE_KERNEL_OVERRIDES) target_compile_definitions(OpenCL PRIVATE USE_KERNEL_OVERRIDES) set(CLINTERCEPT_KERNELS_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/kernels) add_custom_command(OUTPUT ${CLINTERCEPT_KERNELS_OUTPUT_DIRECTORY}/precompiled_kernels.o COMMAND ${CMAKE_COMMAND} -E make_directory ${CLINTERCEPT_KERNELS_OUTPUT_DIRECTORY} - COMMAND ${CMAKE_COMMAND} -E chdir ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_LINKER} -r -b binary -z noexecstack + COMMAND ${CMAKE_COMMAND} -E chdir ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_LINKER} -r -b binary ${LLD_EMULATION_FLAGS} -z noexecstack kernels/precompiled_kernels.cl -o ${CLINTERCEPT_KERNELS_OUTPUT_DIRECTORY}/precompiled_kernels.o DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/kernels/precompiled_kernels.cl ) add_custom_command(OUTPUT ${CLINTERCEPT_KERNELS_OUTPUT_DIRECTORY}/builtin_kernels.o COMMAND ${CMAKE_COMMAND} -E make_directory ${CLINTERCEPT_KERNELS_OUTPUT_DIRECTORY} - COMMAND ${CMAKE_COMMAND} -E chdir ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_LINKER} -r -b binary -z noexecstack + COMMAND ${CMAKE_COMMAND} -E chdir ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_LINKER} -r -b binary ${LLD_EMULATION_FLAGS} -z noexecstack kernels/builtin_kernels.cl -o ${CLINTERCEPT_KERNELS_OUTPUT_DIRECTORY}/builtin_kernels.o DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/kernels/builtin_kernels.cl @@ -163,7 +187,7 @@ elseif(${CMAKE_SYSTEM_NAME} STREQUAL "Linux" OR set(CLINTERCEPT_SCRIPTS_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/scripts) add_custom_command(OUTPUT ${CLINTERCEPT_SCRIPTS_OUTPUT_DIRECTORY}/run_py.o COMMAND ${CMAKE_COMMAND} -E make_directory ${CLINTERCEPT_SCRIPTS_OUTPUT_DIRECTORY} - COMMAND ${CMAKE_COMMAND} -E chdir ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_LINKER} -r -b binary -z noexecstack + COMMAND ${CMAKE_COMMAND} -E chdir ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_LINKER} -r -b binary ${LLD_EMULATION_FLAGS} -z noexecstack scripts/run.py -o ${CLINTERCEPT_SCRIPTS_OUTPUT_DIRECTORY}/run_py.o DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/scripts/run.py @@ -250,6 +274,12 @@ if(ENABLE_ITT) target_link_libraries(OpenCL ${VTUNE_ITTNOTIFY_LIB} ${CMAKE_THREAD_LIBS_INIT}) endif() +# Thread Sanitizer Support (optional) +if(ENABLE_TSAN) + target_compile_options(OpenCL PRIVATE -fsanitize=thread) + target_link_options(OpenCL PRIVATE -fsanitize=thread) +endif() + if(WIN32) foreach(OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES}) string(TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG_UPPER) From 86f7cf1498fa329944107545f04d35c3b48cce11 Mon Sep 17 00:00:00 2001 From: Ben Ashbaugh Date: Mon, 16 Mar 2026 16:21:33 -0700 Subject: [PATCH 08/12] log the kernel handle in more places for consistency (#460) --- intercept/src/dispatch.cpp | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/intercept/src/dispatch.cpp b/intercept/src/dispatch.cpp index 6771e597..fb5291f8 100644 --- a/intercept/src/dispatch.cpp +++ b/intercept/src/dispatch.cpp @@ -2900,7 +2900,10 @@ CL_API_ENTRY cl_int CL_API_CALL CLIRN(clGetKernelInfo)( if( pIntercept && pIntercept->dispatch().clGetKernelInfo ) { GET_ENQUEUE_COUNTER(); - CALL_LOGGING_ENTER_KERNEL( kernel, "param_name = %s (%X)", + CALL_LOGGING_ENTER_KERNEL( + kernel, + "kernel = %p, param_name = %s (%X)", + kernel, pIntercept->enumName().name( param_name ).c_str(), param_name ); HOST_PERFORMANCE_TIMING_START(); @@ -2938,7 +2941,10 @@ CL_API_ENTRY cl_int CL_API_CALL CLIRN(clGetKernelArgInfo)( if( pIntercept && pIntercept->dispatch().clGetKernelArgInfo ) { GET_ENQUEUE_COUNTER(); - CALL_LOGGING_ENTER_KERNEL( kernel, "param_name = %s (%X)", + CALL_LOGGING_ENTER_KERNEL( + kernel, + "kernel = %p, param_name = %s (%X)", + kernel, pIntercept->enumName().name( param_name ).c_str(), param_name ); HOST_PERFORMANCE_TIMING_START(); @@ -2985,7 +2991,10 @@ CL_API_ENTRY cl_int CL_API_CALL CLIRN(clGetKernelWorkGroupInfo)( &device, deviceInfo ); } - CALL_LOGGING_ENTER_KERNEL( kernel, "device = %s, param_name = %s (%X)", + CALL_LOGGING_ENTER_KERNEL( + kernel, + "kernel = %p, device = %s, param_name = %s (%X)", + kernel, deviceInfo.c_str(), pIntercept->enumName().name( param_name ).c_str(), param_name ); @@ -6768,7 +6777,10 @@ CL_API_ENTRY cl_int CL_API_CALL CLIRN(clSetKernelExecInfo) ( if( pIntercept && pIntercept->dispatch().clSetKernelExecInfo ) { GET_ENQUEUE_COUNTER(); - CALL_LOGGING_ENTER_KERNEL( kernel, "param_name = %s (%08X)", + CALL_LOGGING_ENTER_KERNEL( + kernel, + "kernel = %p, param_name = %s (%08X)", + kernel, pIntercept->enumName().name( param_name ).c_str(), param_name ); HOST_PERFORMANCE_TIMING_START(); @@ -7323,7 +7335,8 @@ CL_API_ENTRY cl_kernel CL_API_CALL CLIRN(clCloneKernel) ( if( pIntercept && pIntercept->dispatch().clCloneKernel ) { GET_ENQUEUE_COUNTER(); - CALL_LOGGING_ENTER(); + CALL_LOGGING_ENTER( "source_kernel = %p", + source_kernel ); CHECK_ERROR_INIT( errcode_ret ); HOST_PERFORMANCE_TIMING_START(); @@ -7374,7 +7387,10 @@ CL_API_ENTRY cl_int CL_API_CALL CLIRN(clGetKernelSubGroupInfo) ( &device, deviceInfo ); } - CALL_LOGGING_ENTER_KERNEL( kernel, "device = %s, param_name = %s (%08X)", + CALL_LOGGING_ENTER_KERNEL( + kernel, + "kernel = %p, device = %s, param_name = %s (%08X)", + kernel, deviceInfo.c_str(), pIntercept->enumName().name( param_name ).c_str(), param_name ); @@ -7431,7 +7447,10 @@ CL_API_ENTRY cl_int CL_API_CALL clGetKernelSubGroupInfoKHR( &device, deviceInfo ); } - CALL_LOGGING_ENTER_KERNEL( kernel, "device = %s, param_name = %s (%08X)", + CALL_LOGGING_ENTER_KERNEL( + kernel, + "kernel = %p, device = %s, param_name = %s (%08X)", + kernel, deviceInfo.c_str(), pIntercept->enumName().name( param_name ).c_str(), param_name ); From 20f4a46aa1f074f4095fdda2bb9cf0bce5e705d3 Mon Sep 17 00:00:00 2001 From: Ben Ashbaugh Date: Sun, 22 Mar 2026 22:30:23 -0700 Subject: [PATCH 09/12] update documentation --- cliloader/printcontrols.h | 2 +- docs/FAQ.md | 17 +++++++++++++++++ docs/controls.md | 6 +++++- intercept/src/controls.h | 4 ++-- 4 files changed, 25 insertions(+), 4 deletions(-) diff --git a/cliloader/printcontrols.h b/cliloader/printcontrols.h index 12b08058..cfec7cf5 100644 --- a/cliloader/printcontrols.h +++ b/cliloader/printcontrols.h @@ -37,7 +37,7 @@ struct cliControl static const std::vector controls = { { true, "Startup Controls:", "", ""}, - { false, "BreakOnLoad", "bool", "If set to a nonzero value, the Intercept Layer for OpenCL Applications will break into the debugger when the DLL is loaded." }, + { false, "BreakOnLoad", "bool", "If set to a nonzero value, the Intercept Layer for OpenCL Applications will break into the debugger when it is loaded." }, { false, "std::string", "OpenCLFileName", "Used to control the DLL or Shared Library that the Intercept Layer for OpenCL Applications loads to make real OpenCL calls. If present, only this file name is loaded. If omitted, the Intercept Layer for OpenCL Applications will search a default set of real OpenCL file names." }, #include "src/controls.h" }; diff --git a/docs/FAQ.md b/docs/FAQ.md index b8081cd5..83c71305 100644 --- a/docs/FAQ.md +++ b/docs/FAQ.md @@ -179,6 +179,23 @@ This causes each instance of the application to dump to a unique directory. Some other setups may choose set a unique `DumpDir` for each process instance via environment variables, although setting `AppendPid` is usually sufficient and a much simpler solution. +## What does "CLIntercept is shutting down, but N events are unprocessed" mean? + +This is a warning about missing device performance timing information. +Device performance timing works by attaching events to device commands when the commands are enqueued. +The events are then added to a device performance timing list for eventual processing, typically when the application synchronizes with the host. +If the application shuts down while events are still on the device performance timing list, the remaining events will not be processed. +Note that the OpenCL Intercept Layer cannot safely process events during shutdown because other OpenCL libraries or drivers may have been unloaded already. + +Events will typically be unprocessed for one of two reasons: + +* If an application enqueues a non-blocking command without making any other calls that cause the device performance timing list to be processed, then the event may be unprocessed when the application terminates. +This may especially happen if the application does not flush the queue after enqueueing the non-blocking command. +Setting `FlushAfterEnqueue` or `FinishAfterEnqueue` will usually enable these events to be processed, however these controls can have a large impact on performance and should be used with caution. +* To reduce overhead, the OpenCL Intercept Layer typically processes the device performance timing list on a separate thread. +If the application enqueues a lot of commands, then the device performance timing list may take a lot of time to process, and the separate thread may still be processing events when the application terminates. +Disabling `MultiThreadedProcessing` will cause these events to be processed on the main application thread instead, which increases overhead, but will usually enable the events to be processed prior to application exit. + ## How do I submit a bug? Please file a GitHub issue to report a bug. diff --git a/docs/controls.md b/docs/controls.md index c8a53f69..0b62e6eb 100644 --- a/docs/controls.md +++ b/docs/controls.md @@ -129,12 +129,16 @@ The old name may still be used for backwards compatibility, but switching to the If set to a nonzero value, the Intercept Layer for OpenCL Applications will break into the debugger when it is loaded. -### Tracing Controls +### Basic Controls ##### `BetaExtensionIntercepting` (bool) If set to a nonzero value, the Intercept Layer for OpenCL Applications will intercept extension APIs for beta extensions that are subject to change. If an application uses beta extensions and does not function correctly with the Intercept Layer for OpenCL Applications, setting this control to zero may allow the application to function correctly, albeit without the ability to debug and analyze the beta extension APIs. +##### `MultiThreadedProcessing` (bool) + +If set to a nonzero value, the Intercept Layer for OpenCL Applications will process device performance timing and flush chrome tracing buffers on a separate thread to reduce overhead. Setting this control to zero will process device performance timing and flush chrome tracing buffers on the main application thread instead. + ### Logging Controls ##### `SuppressLogging` (bool) diff --git a/intercept/src/controls.h b/intercept/src/controls.h index 02da6a35..12a3e1ad 100644 --- a/intercept/src/controls.h +++ b/intercept/src/controls.h @@ -12,9 +12,9 @@ #define CLI_CONTROL_SEPARATOR( _name ) #endif -CLI_CONTROL_SEPARATOR( Tracing Controls: ) +CLI_CONTROL_SEPARATOR( Basic Controls: ) CLI_CONTROL( bool, BetaExtensionIntercepting, true, "If set to a nonzero value, the Intercept Layer for OpenCL Applications will intercept extension APIs for beta extensions that are subject to change. If an application uses beta extensions and does not function correctly with the Intercept Layer for OpenCL Applications, setting this control to zero may allow the application to function correctly, albeit without the ability to debug and analyze the beta extension APIs." ) -CLI_CONTROL( bool, MultiThreadedProcessing, true, "TODO" ) +CLI_CONTROL( bool, MultiThreadedProcessing, true, "If set to a nonzero value, the Intercept Layer for OpenCL Applications will process device performance timing and flush chrome tracing buffers on a separate thread to reduce overhead. Setting this control to zero will process device performance timing and flush chrome tracing buffers on the main application thread instead." ) CLI_CONTROL_SEPARATOR( Logging Controls: ) CLI_CONTROL( bool, SuppressLogging, false, "If set to a nonzero value, suppresses all logging output from the Intercept Layer for OpenCL Applications. This is particularly useful for tools that only want report data." ) From 000c961731a1e89f9e2cb6fd58765df2591c3f47 Mon Sep 17 00:00:00 2001 From: Ben Ashbaugh Date: Sun, 22 Mar 2026 22:44:03 -0700 Subject: [PATCH 10/12] add a cliloader control to disable multi-threading --- cliloader/cliloader.cpp | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/cliloader/cliloader.cpp b/cliloader/cliloader.cpp index 006d6ba9..2b5a71de 100644 --- a/cliloader/cliloader.cpp +++ b/cliloader/cliloader.cpp @@ -401,6 +401,10 @@ static bool parseArguments(int argc, char *argv[]) set_LD_LIBRARY_PATH = false; } #endif + else if ( !strcmp(argv[i], "-nt") || !strcmp(argv[i], "--no-threads") ) + { + checkSetEnv("CLI_MultiThreadedProcessing", "0"); + } else if( !strcmp(argv[i], "-q") || !strcmp(argv[i], "--quiet") ) { checkSetEnv("CLI_SuppressLogging", "1"); @@ -517,7 +521,7 @@ static bool parseArguments(int argc, char *argv[]) { checkSetEnv("CLI_HostPerformanceTiming", "1"); } - else if( !strcmp(argv[i], "-min") || !strcmp(argv[i], "--min-enqueue") ) + else if( !strcmp(argv[i], "--min-enqueue") ) { ++i; if( i < argc ) @@ -526,7 +530,7 @@ static bool parseArguments(int argc, char *argv[]) checkSetEnv("CLI_DevicePerformanceTimingMinEnqueue", argv[i]); } } - else if( !strcmp(argv[i], "-max") || !strcmp(argv[i], "--max-enqueue") ) + else if( !strcmp(argv[i], "--max-enqueue") ) { ++i; if( i < argc ) @@ -628,11 +632,12 @@ static bool parseArguments(int argc, char *argv[]) " --metrics Print All MDAPI Metrics and Exit\n" " --mdapi-devices Print All MDAPI Devices and Exit\n" #if defined(_WIN32) - " --no-DLL-load Do not load the Intercept DLL into the child process\n" + " --no-DLL-load Do Not Load the Intercept DLL Into The Child Process\n" #else // not Windows - " --no-LD_PRELOAD Do not set LD_PRELOAD\n" - " --no-LD_LIBRARY_PATH Do not set LD_LIBRARY_PATH\n" + " --no-LD_PRELOAD Do Not Set LD_PRELOAD\n" + " --no-LD_LIBRARY_PATH Do Not Set LD_LIBRARY_PATH\n" #endif + " --no-threads [-nt] Do Not Create Additional Processing Threads\n" "\n" " --quiet [-q] Disable Logging\n" " --call-logging [-c] Trace Host API Calls\n" @@ -657,8 +662,8 @@ static bool parseArguments(int argc, char *argv[]) " --mdapi-group Choose MDAPI Metrics to Collect (Intel GPU Only)\n" " --mdapi-device Choose MDAPI Device for Metrics (Intel GPU Only)\n" " --host-timing [-h] Report Host API Execution Time\n" - " --min-enqueue [-min] Minimum Enqueue for Timing and Chrome Tracing\n" - " --max-enqueue [-max] Maximum Enqueue for Timing and Chrome Tracing\n" + " --min-enqueue Minimum Enqueue for Timing and Chrome Tracing\n" + " --max-enqueue Maximum Enqueue for Timing and Chrome Tracing\n" " --capture-enqueue Capture the Specified Kernel Enqueue\n" " --capture-kernel Capture the Specified Kernel Name\n" " --leak-checking [-l] Track and Report OpenCL Leaks\n" From cd936c8928b4b647b96925573a509190a906d885 Mon Sep 17 00:00:00 2001 From: Ben Ashbaugh Date: Thu, 26 Mar 2026 08:36:20 -0700 Subject: [PATCH 11/12] change the way the processing thread is notified We need to hold a lock when we're setting the done flag, and we also need to hold a lock when we are checking the done flag in the processing thread. --- intercept/src/common.h | 13 +++++++++++++ intercept/src/eventlist.h | 8 +------- intercept/src/intercept.cpp | 38 +++++++++++++++++++++++++++---------- intercept/src/intercept.h | 33 +++++++++++++------------------- 4 files changed, 55 insertions(+), 37 deletions(-) diff --git a/intercept/src/common.h b/intercept/src/common.h index 0deeb75c..9ccd5454 100644 --- a/intercept/src/common.h +++ b/intercept/src/common.h @@ -83,3 +83,16 @@ #endif #define CLI_STRING_BUFFER_SIZE (16 * 1024) + +#include + +namespace +{ + +#if defined(CLINTERCEPT_HIGH_RESOLUTON_CLOCK) + using clock = std::chrono::high_resolution_clock; +#else + using clock = std::chrono::steady_clock; +#endif + +}; diff --git a/intercept/src/eventlist.h b/intercept/src/eventlist.h index 1e50a335..72551f75 100644 --- a/intercept/src/eventlist.h +++ b/intercept/src/eventlist.h @@ -6,7 +6,6 @@ #pragma once -#include #include #include #include @@ -16,12 +15,7 @@ class CEventList { public: -// !!! TODO: figure out what to do here! -#if defined(CLINTERCEPT_HIGH_RESOLUTON_CLOCK) - using clock = std::chrono::high_resolution_clock; -#else - using clock = std::chrono::steady_clock; -#endif + std::mutex CheckMutex; struct Node { diff --git a/intercept/src/intercept.cpp b/intercept/src/intercept.cpp index 4acc1d32..60dd7d74 100644 --- a/intercept/src/intercept.cpp +++ b/intercept/src/intercept.cpp @@ -188,8 +188,16 @@ CLIntercept::~CLIntercept() } if( m_Config.MultiThreadedProcessing ) { - m_ProcessingDone.store(true); - m_ProcessingCondition.notify_all(); + // Note: We need to hold the processing condition lock while setting the + // done flag. This avoids a deadlock if this thread sets the done flag + // and notifies the processing thread between the time the processing + // thread checked the done flag and before it is waiting for + // notification. + { + std::lock_guard lock(m_ProcessingConditionMutex); + m_ProcessingDone.store(true); + } + m_ProcessingConditionVariable.notify_all(); m_ProcessingThread.join(); } @@ -678,16 +686,24 @@ bool CLIntercept::init() // void CLIntercept::processingThreadFunc( CLIntercept* pIntercept ) { - while( pIntercept->m_ProcessingDone.load() == false ) + while( true ) { - std::unique_lock lock( pIntercept->m_ProcessingMutex ); - pIntercept->m_ProcessingCondition.wait( lock ); - - //fprintf(stderr, "Processing thread woke up...\n" ); + // Note: We need to check whether processing is done while holding the + // processing condition lock. This avoids a deadlock in the case where + // this processing thread checks the done flag and gets interrupted, + // then the main thread sets the done flag and notifies this processing + // thread, then this processing thread waits on the notification that + // will never come. + { + std::unique_lock lock( pIntercept->m_ProcessingConditionMutex ); + if (pIntercept->m_ProcessingDone.load()) + { + break; + } + pIntercept->m_ProcessingConditionVariable.wait( lock ); + } pIntercept->processData(); - - //fprintf(stderr, "Processing done.\n" ); } } @@ -695,7 +711,7 @@ void CLIntercept::processingThreadFunc( CLIntercept* pIntercept ) // void CLIntercept::notifyProcessingThread() { - m_ProcessingCondition.notify_one(); + m_ProcessingConditionVariable.notify_one(); } /////////////////////////////////////////////////////////////////////////////// @@ -6681,6 +6697,8 @@ void CLIntercept::addTimingEvent( // void CLIntercept::checkTimingEvents() { + std::lock_guard lock(m_EventList.CheckMutex); + CEventList::const_iterator current = m_EventList.begin(); CEventList::const_iterator next; diff --git a/intercept/src/intercept.h b/intercept/src/intercept.h index 3870eccd..3ee850d4 100644 --- a/intercept/src/intercept.h +++ b/intercept/src/intercept.h @@ -6,7 +6,6 @@ #pragma once #include -#include #include #include #include @@ -55,12 +54,6 @@ class CLIntercept struct SConfig; public: -#if defined(CLINTERCEPT_HIGH_RESOLUTON_CLOCK) - using clock = std::chrono::high_resolution_clock; -#else - using clock = std::chrono::steady_clock; -#endif - static bool Create( void* pGlobalData, CLIntercept*& pIntercept ); static void Delete( CLIntercept*& pIntercept ); @@ -1111,8 +1104,8 @@ class CLIntercept // Multi-threaded processing and flushing: std::thread m_ProcessingThread; - std::mutex m_ProcessingMutex; - std::condition_variable m_ProcessingCondition; + std::mutex m_ProcessingConditionMutex; + std::condition_variable m_ProcessingConditionVariable; std::atomic m_ProcessingDone; static void processingThreadFunc( CLIntercept* pIntercept ); @@ -2114,10 +2107,10 @@ inline CObjectTracker& CLIntercept::objectTracker() /////////////////////////////////////////////////////////////////////////////// // #define BUILD_LOGGING_INIT() \ - CLIntercept::clock::time_point buildTimeStart; \ + clock::time_point buildTimeStart; \ if( pIntercept->config().BuildLogging ) \ { \ - buildTimeStart = CLIntercept::clock::now(); \ + buildTimeStart = clock::now(); \ } #define BUILD_LOGGING( program, num_devices, device_list ) \ @@ -3280,7 +3273,7 @@ inline bool CLIntercept::checkHostPerformanceTimingEnqueueLimits( } #define HOST_PERFORMANCE_TIMING_START() \ - CLIntercept::clock::time_point cpuStart, cpuEnd; \ + clock::time_point cpuStart, cpuEnd; \ bool doHostPerformanceTiming = \ ( pIntercept->config().ChromeCallLogging || \ pIntercept->config().HostPerformanceTiming ) && \ @@ -3288,13 +3281,13 @@ inline bool CLIntercept::checkHostPerformanceTimingEnqueueLimits( pIntercept->checkConditionalTiming(); \ if( doHostPerformanceTiming ) \ { \ - cpuStart = CLIntercept::clock::now(); \ + cpuStart = clock::now(); \ } #define HOST_PERFORMANCE_TIMING_END() \ if( doHostPerformanceTiming ) \ { \ - cpuEnd = CLIntercept::clock::now(); \ + cpuEnd = clock::now(); \ if( pIntercept->config().HostPerformanceTiming ) \ { \ pIntercept->updateHostTimingStats( \ @@ -3308,7 +3301,7 @@ inline bool CLIntercept::checkHostPerformanceTimingEnqueueLimits( #define HOST_PERFORMANCE_TIMING_END_WITH_TAG() \ if( doHostPerformanceTiming ) \ { \ - cpuEnd = CLIntercept::clock::now(); \ + cpuEnd = clock::now(); \ if( pIntercept->config().HostPerformanceTiming ) \ { \ pIntercept->updateHostTimingStats( \ @@ -3320,7 +3313,7 @@ inline bool CLIntercept::checkHostPerformanceTimingEnqueueLimits( } #define TOOL_OVERHEAD_TIMING_START() \ - CLIntercept::clock::time_point toolStart, toolEnd; \ + clock::time_point toolStart, toolEnd; \ bool doToolOverheadTiming = \ pIntercept->config().ToolOverheadTiming && \ ( pIntercept->config().ChromeCallLogging || \ @@ -3329,13 +3322,13 @@ inline bool CLIntercept::checkHostPerformanceTimingEnqueueLimits( pIntercept->checkConditionalTiming(); \ if( doToolOverheadTiming ) \ { \ - toolStart = CLIntercept::clock::now(); \ + toolStart = clock::now(); \ } #define TOOL_OVERHEAD_TIMING_END( _tag ) \ if( doToolOverheadTiming ) \ { \ - toolEnd = CLIntercept::clock::now(); \ + toolEnd = clock::now(); \ if( pIntercept->config().HostPerformanceTiming ) \ { \ pIntercept->updateHostTimingStats( \ @@ -3410,7 +3403,7 @@ inline bool CLIntercept::checkDevicePerformanceTimingEnqueueLimits( } #define DEVICE_PERFORMANCE_TIMING_START( pEvent ) \ - CLIntercept::clock::time_point queuedTime; \ + clock::time_point queuedTime; \ cl_event local_event = NULL; \ bool isLocalEvent = false; \ bool doDevicePerformanceTiming = \ @@ -3422,7 +3415,7 @@ inline bool CLIntercept::checkDevicePerformanceTimingEnqueueLimits( pIntercept->checkConditionalTiming(); \ if( doDevicePerformanceTiming ) \ { \ - queuedTime = CLIntercept::clock::now(); \ + queuedTime = clock::now(); \ if( pEvent == NULL ) \ { \ pEvent = &local_event; \ From d4a16a935c8304b91953692d822c2b7473f043ef Mon Sep 17 00:00:00 2001 From: Ben Ashbaugh Date: Wed, 1 Apr 2026 22:53:05 -0700 Subject: [PATCH 12/12] name threads on chrome traces --- intercept/src/chrometracer.h | 4 +++- intercept/src/intercept.cpp | 14 +++++++++----- intercept/src/intercept.h | 14 ++++++++------ 3 files changed, 20 insertions(+), 12 deletions(-) diff --git a/intercept/src/chrometracer.h b/intercept/src/chrometracer.h index 36d0d48a..26417929 100644 --- a/intercept/src/chrometracer.h +++ b/intercept/src/chrometracer.h @@ -56,6 +56,7 @@ class CChromeTracer } void addThreadMetadata( + const std::string& threadName, uint64_t threadId, uint32_t threadNumber ) { @@ -63,7 +64,8 @@ class CChromeTracer m_TraceFile << "{\"ph\":\"M\",\"name\":\"thread_name\",\"pid\":" << m_ProcessId << ",\"tid\":" << threadId - << ",\"args\":{\"name\":\"Host Thread " << threadId + << ",\"args\":{\"name\":\"" << threadName << " " + //<< threadId << "\"}},\n"; m_TraceFile << "{\"ph\":\"M\",\"name\":\"thread_sort_index\",\"pid\":" << m_ProcessId diff --git a/intercept/src/intercept.cpp b/intercept/src/intercept.cpp index 60dd7d74..a8259a22 100644 --- a/intercept/src/intercept.cpp +++ b/intercept/src/intercept.cpp @@ -686,13 +686,17 @@ bool CLIntercept::init() // void CLIntercept::processingThreadFunc( CLIntercept* pIntercept ) { + // This names the processing thread. + uint64_t threadId = pIntercept->OS().GetThreadID(); + pIntercept->getThreadNumber( "Host Processing Thread", threadId ); + while( true ) { // Note: We need to check whether processing is done while holding the // processing condition lock. This avoids a deadlock in the case where - // this processing thread checks the done flag and gets interrupted, - // then the main thread sets the done flag and notifies this processing - // thread, then this processing thread waits on the notification that + // the processing thread checks the done flag and gets interrupted, then + // the main thread sets the done flag and notifies this processing + // thread, then the processing thread waits on the notification that // will never come. { std::unique_lock lock( pIntercept->m_ProcessingConditionMutex ); @@ -1157,7 +1161,7 @@ void CLIntercept::getCallLoggingPrefix( } if( m_Config.CallLoggingThreadNumber ) { - unsigned int threadNumber = getThreadNumber( threadId ); + uint32_t threadNumber = getThreadNumber( "Host Thread", threadId ); ss << "TNum = "; ss << threadNumber; ss << " "; @@ -14616,7 +14620,7 @@ void CLIntercept::chromeCallLoggingExit( uint64_t threadId = OS().GetThreadID(); // This will name the thread if it is not named already. - getThreadNumber( threadId ); + getThreadNumber( "Host Thread", threadId ); using ns = std::chrono::nanoseconds; uint64_t nsStart = diff --git a/intercept/src/intercept.h b/intercept/src/intercept.h index 3ee850d4..b72e54ca 100644 --- a/intercept/src/intercept.h +++ b/intercept/src/intercept.h @@ -901,7 +901,7 @@ class CLIntercept const size_t* gws, const size_t* lws); - unsigned int getThreadNumber( uint64_t threadId ); + uint32_t getThreadNumber( const char* threadName, uint64_t threadId ); void saveProgramNumber( const cl_program program ); unsigned int getProgramNumber() const; @@ -1091,7 +1091,7 @@ class CLIntercept clock::time_point m_StartTime; - typedef std::map< uint64_t, unsigned int> CThreadNumberMap; + typedef std::map< uint64_t, uint32_t > CThreadNumberMap; CThreadNumberMap m_ThreadNumberMap; typedef std::map< cl_device_id, std::vector > CSubDeviceCacheMap; @@ -3671,10 +3671,12 @@ inline std::string CLIntercept::getShortKernelNameWithHash( /////////////////////////////////////////////////////////////////////////////// // -inline unsigned int CLIntercept::getThreadNumber( uint64_t threadId ) +inline uint32_t CLIntercept::getThreadNumber( + const char* threadName, + uint64_t threadId ) { CThreadNumberMap::const_iterator iter = m_ThreadNumberMap.find( threadId ); - unsigned int threadNumber = 0; + uint32_t threadNumber = 0; if( iter != m_ThreadNumberMap.end() ) { @@ -3682,12 +3684,12 @@ inline unsigned int CLIntercept::getThreadNumber( uint64_t threadId ) } else { - threadNumber = (unsigned int)m_ThreadNumberMap.size(); + threadNumber = static_cast(m_ThreadNumberMap.size()); m_ThreadNumberMap[ threadId ] = threadNumber; if( m_Config.ChromeCallLogging ) { - m_ChromeTrace.addThreadMetadata( threadId, threadNumber ); + m_ChromeTrace.addThreadMetadata( threadName, threadId, threadNumber ); } }