diff --git a/CMakeLists.txt b/CMakeLists.txt index 8f9e2039..ca7263c0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,6 +39,7 @@ option(ENABLE_CLIPROF "Enable cliprof Support and Build the Executable") option(ENABLE_ITT "Enable ITT (Instrumentation Tracing Technology) API Support") option(ENABLE_MDAPI "Enable MDAPI Support" ON) option(ENABLE_HIGH_RESOLUTION_CLOCK "Use the high_resolution_clock for timing instead of the steady_clock") +option(ENABLE_TSAN "Enable Thread Sanitizer") if(WIN32) option(ENABLE_CLICONFIG "Build the cliconfig Configuration Utility" ON) endif() diff --git a/cliloader/cliloader.cpp b/cliloader/cliloader.cpp index c85c8a87..2b5a71de 100644 --- a/cliloader/cliloader.cpp +++ b/cliloader/cliloader.cpp @@ -401,6 +401,10 @@ static bool parseArguments(int argc, char *argv[]) set_LD_LIBRARY_PATH = false; } #endif + else if ( !strcmp(argv[i], "-nt") || !strcmp(argv[i], "--no-threads") ) + { + checkSetEnv("CLI_MultiThreadedProcessing", "0"); + } else if( !strcmp(argv[i], "-q") || !strcmp(argv[i], "--quiet") ) { checkSetEnv("CLI_SuppressLogging", "1"); @@ -517,6 +521,24 @@ static bool parseArguments(int argc, char *argv[]) { checkSetEnv("CLI_HostPerformanceTiming", "1"); } + else if( !strcmp(argv[i], "--min-enqueue") ) + { + ++i; + if( i < argc ) + { + checkSetEnv("CLI_HostPerformanceTimingMinEnqueue", argv[i]); + checkSetEnv("CLI_DevicePerformanceTimingMinEnqueue", argv[i]); + } + } + else if( !strcmp(argv[i], "--max-enqueue") ) + { + ++i; + if( i < argc ) + { + checkSetEnv("CLI_HostPerformanceTimingMaxEnqueue", argv[i]); + checkSetEnv("CLI_DevicePerformanceTimingMaxEnqueue", argv[i]); + } + } else if( !strcmp(argv[i], "-l") || !strcmp(argv[i], "--leak-checking") ) { checkSetEnv("CLI_LeakChecking", "1"); @@ -610,11 +632,12 @@ static bool parseArguments(int argc, char *argv[]) " --metrics Print All MDAPI Metrics and Exit\n" " --mdapi-devices Print All MDAPI Devices and Exit\n" #if defined(_WIN32) - " --no-DLL-load Do not load the Intercept DLL into the child process\n" + " --no-DLL-load Do Not Load the Intercept DLL Into The Child Process\n" #else // not Windows - " --no-LD_PRELOAD Do not set LD_PRELOAD\n" - " --no-LD_LIBRARY_PATH Do not set LD_LIBRARY_PATH\n" + " --no-LD_PRELOAD Do Not Set LD_PRELOAD\n" + " --no-LD_LIBRARY_PATH Do Not Set LD_LIBRARY_PATH\n" #endif + " --no-threads [-nt] Do Not Create Additional Processing Threads\n" "\n" " --quiet [-q] Disable Logging\n" " --call-logging [-c] Trace Host API Calls\n" @@ -639,6 +662,8 @@ static bool parseArguments(int argc, char *argv[]) " --mdapi-group Choose MDAPI Metrics to Collect (Intel GPU Only)\n" " --mdapi-device Choose MDAPI Device for Metrics (Intel GPU Only)\n" " --host-timing [-h] Report Host API Execution Time\n" + " --min-enqueue Minimum Enqueue for Timing and Chrome Tracing\n" + " --max-enqueue Maximum Enqueue for Timing and Chrome Tracing\n" " --capture-enqueue Capture the Specified Kernel Enqueue\n" " --capture-kernel Capture the Specified Kernel Name\n" " --leak-checking [-l] Track and Report OpenCL Leaks\n" diff --git a/cliloader/printcontrols.h b/cliloader/printcontrols.h index 12b08058..cfec7cf5 100644 --- a/cliloader/printcontrols.h +++ b/cliloader/printcontrols.h @@ -37,7 +37,7 @@ struct cliControl static const std::vector controls = { { true, "Startup Controls:", "", ""}, - { false, "BreakOnLoad", "bool", "If set to a nonzero value, the Intercept Layer for OpenCL Applications will break into the debugger when the DLL is loaded." }, + { false, "BreakOnLoad", "bool", "If set to a nonzero value, the Intercept Layer for OpenCL Applications will break into the debugger when it is loaded." }, { false, "std::string", "OpenCLFileName", "Used to control the DLL or Shared Library that the Intercept Layer for OpenCL Applications loads to make real OpenCL calls. If present, only this file name is loaded. If omitted, the Intercept Layer for OpenCL Applications will search a default set of real OpenCL file names." }, #include "src/controls.h" }; diff --git a/docs/FAQ.md b/docs/FAQ.md index b8081cd5..83c71305 100644 --- a/docs/FAQ.md +++ b/docs/FAQ.md @@ -179,6 +179,23 @@ This causes each instance of the application to dump to a unique directory. Some other setups may choose set a unique `DumpDir` for each process instance via environment variables, although setting `AppendPid` is usually sufficient and a much simpler solution. +## What does "CLIntercept is shutting down, but N events are unprocessed" mean? + +This is a warning about missing device performance timing information. +Device performance timing works by attaching events to device commands when the commands are enqueued. +The events are then added to a device performance timing list for eventual processing, typically when the application synchronizes with the host. +If the application shuts down while events are still on the device performance timing list, the remaining events will not be processed. +Note that the OpenCL Intercept Layer cannot safely process events during shutdown because other OpenCL libraries or drivers may have been unloaded already. + +Events will typically be unprocessed for one of two reasons: + +* If an application enqueues a non-blocking command without making any other calls that cause the device performance timing list to be processed, then the event may be unprocessed when the application terminates. +This may especially happen if the application does not flush the queue after enqueueing the non-blocking command. +Setting `FlushAfterEnqueue` or `FinishAfterEnqueue` will usually enable these events to be processed, however these controls can have a large impact on performance and should be used with caution. +* To reduce overhead, the OpenCL Intercept Layer typically processes the device performance timing list on a separate thread. +If the application enqueues a lot of commands, then the device performance timing list may take a lot of time to process, and the separate thread may still be processing events when the application terminates. +Disabling `MultiThreadedProcessing` will cause these events to be processed on the main application thread instead, which increases overhead, but will usually enable the events to be processed prior to application exit. + ## How do I submit a bug? Please file a GitHub issue to report a bug. diff --git a/docs/controls.md b/docs/controls.md index c8a53f69..0b62e6eb 100644 --- a/docs/controls.md +++ b/docs/controls.md @@ -129,12 +129,16 @@ The old name may still be used for backwards compatibility, but switching to the If set to a nonzero value, the Intercept Layer for OpenCL Applications will break into the debugger when it is loaded. -### Tracing Controls +### Basic Controls ##### `BetaExtensionIntercepting` (bool) If set to a nonzero value, the Intercept Layer for OpenCL Applications will intercept extension APIs for beta extensions that are subject to change. If an application uses beta extensions and does not function correctly with the Intercept Layer for OpenCL Applications, setting this control to zero may allow the application to function correctly, albeit without the ability to debug and analyze the beta extension APIs. +##### `MultiThreadedProcessing` (bool) + +If set to a nonzero value, the Intercept Layer for OpenCL Applications will process device performance timing and flush chrome tracing buffers on a separate thread to reduce overhead. Setting this control to zero will process device performance timing and flush chrome tracing buffers on the main application thread instead. + ### Logging Controls ##### `SuppressLogging` (bool) diff --git a/intercept/CMakeLists.txt b/intercept/CMakeLists.txt index 11e81e7e..25df81c5 100644 --- a/intercept/CMakeLists.txt +++ b/intercept/CMakeLists.txt @@ -104,7 +104,6 @@ configure_file(src/git_version.rc.in "${CMAKE_CURRENT_BINARY_DIR}/git_version.rc # When set, this is used for guessing the location of the real OpenCL ICD loader. if(CMAKE_LIBRARY_ARCHITECTURE) - message(STATUS "CMAKE_LIBRARY_ARCHITECTURE is: ${CMAKE_LIBRARY_ARCHITECTURE}") target_compile_definitions(OpenCL PRIVATE CLINTERCEPT_LIBRARY_ARCHITECTURE="${CMAKE_LIBRARY_ARCHITECTURE}") endif() @@ -134,20 +133,45 @@ if(WIN32) elseif(${CMAKE_SYSTEM_NAME} STREQUAL "Linux" OR ${CMAKE_SYSTEM_NAME} STREQUAL "Android") # conditionally enabled on Linux and Android + + # If using the lld linker, we need to set linker emulation flags + if(NOT DEFINED LLD_EMULATION_FLAGS) + get_filename_component(LINKER_EXECUTABLE ${CMAKE_LINKER} NAME) + if(LINKER_EXECUTABLE MATCHES "lld" OR + ${CMAKE_SYSTEM_NAME} STREQUAL "FreeBSD") # default on FreeBSD? + if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "i386" OR + ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "i686") + set(LLD_EMULATION_FLAGS "-melf_i386") + elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64" OR + ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "amd64") + set(LLD_EMULATION_FLAGS "-melf_x86_64") + elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "aarch64") + set(LLD_EMULATION_FLAGS "-maarch64elf") + elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "riscv64") + set(LLD_EMULATION_FLAGS "-melf64lriscv") + elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "arm" OR + ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "armv7") + set(LLD_EMULATION_FLAGS "-marmelf") + else() + message(STATUS "Unknown system processor ${CMAKE_SYSTEM_PROCESSOR} for lld target emulation flags!") + endif() + endif() + endif() + if(ENABLE_KERNEL_OVERRIDES) target_compile_definitions(OpenCL PRIVATE USE_KERNEL_OVERRIDES) set(CLINTERCEPT_KERNELS_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/kernels) add_custom_command(OUTPUT ${CLINTERCEPT_KERNELS_OUTPUT_DIRECTORY}/precompiled_kernels.o COMMAND ${CMAKE_COMMAND} -E make_directory ${CLINTERCEPT_KERNELS_OUTPUT_DIRECTORY} - COMMAND ${CMAKE_COMMAND} -E chdir ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_LINKER} -r -b binary -z noexecstack + COMMAND ${CMAKE_COMMAND} -E chdir ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_LINKER} -r -b binary ${LLD_EMULATION_FLAGS} -z noexecstack kernels/precompiled_kernels.cl -o ${CLINTERCEPT_KERNELS_OUTPUT_DIRECTORY}/precompiled_kernels.o DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/kernels/precompiled_kernels.cl ) add_custom_command(OUTPUT ${CLINTERCEPT_KERNELS_OUTPUT_DIRECTORY}/builtin_kernels.o COMMAND ${CMAKE_COMMAND} -E make_directory ${CLINTERCEPT_KERNELS_OUTPUT_DIRECTORY} - COMMAND ${CMAKE_COMMAND} -E chdir ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_LINKER} -r -b binary -z noexecstack + COMMAND ${CMAKE_COMMAND} -E chdir ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_LINKER} -r -b binary ${LLD_EMULATION_FLAGS} -z noexecstack kernels/builtin_kernels.cl -o ${CLINTERCEPT_KERNELS_OUTPUT_DIRECTORY}/builtin_kernels.o DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/kernels/builtin_kernels.cl @@ -163,7 +187,7 @@ elseif(${CMAKE_SYSTEM_NAME} STREQUAL "Linux" OR set(CLINTERCEPT_SCRIPTS_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/scripts) add_custom_command(OUTPUT ${CLINTERCEPT_SCRIPTS_OUTPUT_DIRECTORY}/run_py.o COMMAND ${CMAKE_COMMAND} -E make_directory ${CLINTERCEPT_SCRIPTS_OUTPUT_DIRECTORY} - COMMAND ${CMAKE_COMMAND} -E chdir ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_LINKER} -r -b binary -z noexecstack + COMMAND ${CMAKE_COMMAND} -E chdir ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_LINKER} -r -b binary ${LLD_EMULATION_FLAGS} -z noexecstack scripts/run.py -o ${CLINTERCEPT_SCRIPTS_OUTPUT_DIRECTORY}/run_py.o DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/scripts/run.py @@ -250,6 +274,12 @@ if(ENABLE_ITT) target_link_libraries(OpenCL ${VTUNE_ITTNOTIFY_LIB} ${CMAKE_THREAD_LIBS_INIT}) endif() +# Thread Sanitizer Support (optional) +if(ENABLE_TSAN) + target_compile_options(OpenCL PRIVATE -fsanitize=thread) + target_link_options(OpenCL PRIVATE -fsanitize=thread) +endif() + if(WIN32) foreach(OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES}) string(TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG_UPPER) diff --git a/intercept/mdapi/intercept_mdapi.cpp b/intercept/mdapi/intercept_mdapi.cpp index 8a716453..2d5fcd39 100644 --- a/intercept/mdapi/intercept_mdapi.cpp +++ b/intercept/mdapi/intercept_mdapi.cpp @@ -424,68 +424,65 @@ void CLIntercept::getMDAPICountersFromEvent( { const size_t reportSize = m_pMDHelper->GetQueryReportSize(); - char* pReport = new char[ reportSize ]; - if( pReport ) + std::vector report(reportSize); + + size_t outputSize = 0; + cl_int errorCode = dispatch().clGetEventProfilingInfo( + event, + CL_PROFILING_COMMAND_PERFCOUNTERS_INTEL, + reportSize, + report.data(), + &outputSize ); + + std::lock_guard lock(m_Mutex); + + if( errorCode == CL_SUCCESS ) { - size_t outputSize = 0; - cl_int errorCode = dispatch().clGetEventProfilingInfo( - event, - CL_PROFILING_COMMAND_PERFCOUNTERS_INTEL, - reportSize, - pReport, - &outputSize ); + // Check: The size of the queried report should be the expected size. + CLI_ASSERT( outputSize == reportSize ); - if( errorCode == CL_SUCCESS ) - { - // Check: The size of the queried report should be the expected size. - CLI_ASSERT( outputSize == reportSize ); + std::vector results; + std::vector maxValues; + std::vector ioInfoValues; // unused - std::vector results; - std::vector maxValues; - std::vector ioInfoValues; // unused + uint32_t numResults = m_pMDHelper->GetMetricsFromReports( + 1, + report.data(), + results, + maxValues ); - uint32_t numResults = m_pMDHelper->GetMetricsFromReports( - 1, - pReport, + if( numResults ) + { + m_pMDHelper->PrintMetricValues( + m_MetricDump, + name, + numResults, results, - maxValues ); - - if( numResults ) - { - m_pMDHelper->PrintMetricValues( - m_MetricDump, - name, - numResults, - results, - maxValues, - ioInfoValues ); - m_pMDHelper->AggregateMetrics( - m_MetricAggregations, - name, - results ); - } + maxValues, + ioInfoValues ); + m_pMDHelper->AggregateMetrics( + m_MetricAggregations, + name, + results ); } - else + } + else + { + // Currently, MDAPI data is only included for kernels, so only + // report an errors for kernel events. + cl_command_type type = 0; + dispatch().clGetEventInfo( + event, + CL_EVENT_COMMAND_TYPE, + sizeof(type), + &type, + NULL ); + if( type == CL_COMMAND_NDRANGE_KERNEL ) { - // Currently, MDAPI data is only included for kernels, so only - // report an errors for kernel events. - cl_command_type type = 0; - dispatch().clGetEventInfo( - event, - CL_EVENT_COMMAND_TYPE, - sizeof(type), - &type, - NULL ); - if( type == CL_COMMAND_NDRANGE_KERNEL ) - { - logf("Couldn't get MDAPI data for kernel! clGetEventProfilingInfo returned '%s' (%08X)!\n", - enumName().name(errorCode).c_str(), - errorCode ); - } + logf("Couldn't get MDAPI data for kernel! clGetEventProfilingInfo returned '%s' (%08X)!\n", + enumName().name(errorCode).c_str(), + errorCode ); } - - delete [] pReport; - pReport = NULL; } } } diff --git a/intercept/src/chrometracer.h b/intercept/src/chrometracer.h index 36d0d48a..26417929 100644 --- a/intercept/src/chrometracer.h +++ b/intercept/src/chrometracer.h @@ -56,6 +56,7 @@ class CChromeTracer } void addThreadMetadata( + const std::string& threadName, uint64_t threadId, uint32_t threadNumber ) { @@ -63,7 +64,8 @@ class CChromeTracer m_TraceFile << "{\"ph\":\"M\",\"name\":\"thread_name\",\"pid\":" << m_ProcessId << ",\"tid\":" << threadId - << ",\"args\":{\"name\":\"Host Thread " << threadId + << ",\"args\":{\"name\":\"" << threadName << " " + //<< threadId << "\"}},\n"; m_TraceFile << "{\"ph\":\"M\",\"name\":\"thread_sort_index\",\"pid\":" << m_ProcessId diff --git a/intercept/src/common.h b/intercept/src/common.h index 0deeb75c..9ccd5454 100644 --- a/intercept/src/common.h +++ b/intercept/src/common.h @@ -83,3 +83,16 @@ #endif #define CLI_STRING_BUFFER_SIZE (16 * 1024) + +#include + +namespace +{ + +#if defined(CLINTERCEPT_HIGH_RESOLUTON_CLOCK) + using clock = std::chrono::high_resolution_clock; +#else + using clock = std::chrono::steady_clock; +#endif + +}; diff --git a/intercept/src/controls.h b/intercept/src/controls.h index 8a66ba03..12a3e1ad 100644 --- a/intercept/src/controls.h +++ b/intercept/src/controls.h @@ -12,8 +12,9 @@ #define CLI_CONTROL_SEPARATOR( _name ) #endif -CLI_CONTROL_SEPARATOR( Tracing Controls: ) +CLI_CONTROL_SEPARATOR( Basic Controls: ) CLI_CONTROL( bool, BetaExtensionIntercepting, true, "If set to a nonzero value, the Intercept Layer for OpenCL Applications will intercept extension APIs for beta extensions that are subject to change. If an application uses beta extensions and does not function correctly with the Intercept Layer for OpenCL Applications, setting this control to zero may allow the application to function correctly, albeit without the ability to debug and analyze the beta extension APIs." ) +CLI_CONTROL( bool, MultiThreadedProcessing, true, "If set to a nonzero value, the Intercept Layer for OpenCL Applications will process device performance timing and flush chrome tracing buffers on a separate thread to reduce overhead. Setting this control to zero will process device performance timing and flush chrome tracing buffers on the main application thread instead." ) CLI_CONTROL_SEPARATOR( Logging Controls: ) CLI_CONTROL( bool, SuppressLogging, false, "If set to a nonzero value, suppresses all logging output from the Intercept Layer for OpenCL Applications. This is particularly useful for tools that only want report data." ) diff --git a/intercept/src/dispatch.cpp b/intercept/src/dispatch.cpp index 4c45515b..fb5291f8 100644 --- a/intercept/src/dispatch.cpp +++ b/intercept/src/dispatch.cpp @@ -643,8 +643,7 @@ CL_API_ENTRY cl_int CL_API_CALL CLIRN(clReleaseContext)( ADD_OBJECT_RELEASE( context ); --ref_count; CALL_LOGGING_EXIT( retVal, "[ ref count = %d ]", ref_count ); - DEVICE_PERFORMANCE_TIMING_CHECK_CONDITIONAL( ref_count == 0 ); - FLUSH_CHROME_TRACE_BUFFERING_CONDITIONAL( ref_count == 0 ); + PROCESS_DATA_AND_FLUSH_CONDITIONAL( ref_count == 0 ); #if 0 pIntercept->report(); @@ -2901,7 +2900,10 @@ CL_API_ENTRY cl_int CL_API_CALL CLIRN(clGetKernelInfo)( if( pIntercept && pIntercept->dispatch().clGetKernelInfo ) { GET_ENQUEUE_COUNTER(); - CALL_LOGGING_ENTER_KERNEL( kernel, "param_name = %s (%X)", + CALL_LOGGING_ENTER_KERNEL( + kernel, + "kernel = %p, param_name = %s (%X)", + kernel, pIntercept->enumName().name( param_name ).c_str(), param_name ); HOST_PERFORMANCE_TIMING_START(); @@ -2939,7 +2941,10 @@ CL_API_ENTRY cl_int CL_API_CALL CLIRN(clGetKernelArgInfo)( if( pIntercept && pIntercept->dispatch().clGetKernelArgInfo ) { GET_ENQUEUE_COUNTER(); - CALL_LOGGING_ENTER_KERNEL( kernel, "param_name = %s (%X)", + CALL_LOGGING_ENTER_KERNEL( + kernel, + "kernel = %p, param_name = %s (%X)", + kernel, pIntercept->enumName().name( param_name ).c_str(), param_name ); HOST_PERFORMANCE_TIMING_START(); @@ -2986,7 +2991,10 @@ CL_API_ENTRY cl_int CL_API_CALL CLIRN(clGetKernelWorkGroupInfo)( &device, deviceInfo ); } - CALL_LOGGING_ENTER_KERNEL( kernel, "device = %s, param_name = %s (%X)", + CALL_LOGGING_ENTER_KERNEL( + kernel, + "kernel = %p, device = %s, param_name = %s (%X)", + kernel, deviceInfo.c_str(), pIntercept->enumName().name( param_name ).c_str(), param_name ); @@ -3042,8 +3050,7 @@ CL_API_ENTRY cl_int CL_API_CALL CLIRN(clWaitForEvents)( HOST_PERFORMANCE_TIMING_END(); CHECK_ERROR( retVal ); CALL_LOGGING_EXIT( retVal ); - DEVICE_PERFORMANCE_TIMING_CHECK(); - FLUSH_CHROME_TRACE_BUFFERING(); + PROCESS_DATA_AND_FLUSH(); return retVal; } @@ -3342,8 +3349,7 @@ CL_API_ENTRY cl_int CL_API_CALL CLIRN(clFinish)( HOST_PERFORMANCE_TIMING_END(); CHECK_ERROR( retVal ); CALL_LOGGING_EXIT( retVal ); - DEVICE_PERFORMANCE_TIMING_CHECK(); - FLUSH_CHROME_TRACE_BUFFERING(); + PROCESS_DATA_AND_FLUSH(); return retVal; } @@ -3428,8 +3434,7 @@ CL_API_ENTRY cl_int CL_API_CALL CLIRN(clEnqueueReadBuffer)( CHECK_ERROR( retVal ); ADD_OBJECT_ALLOCATION_EVENT( retVal, event ); CALL_LOGGING_EXIT_EVENT_WITH_TAG( retVal, event ); - DEVICE_PERFORMANCE_TIMING_CHECK_CONDITIONAL( blocking_read ); - FLUSH_CHROME_TRACE_BUFFERING_CONDITIONAL( blocking_read ); + PROCESS_DATA_AND_FLUSH_CONDITIONAL( blocking_read ); ADD_EVENT( event ? event[0] : NULL ); } @@ -3527,8 +3532,7 @@ CL_API_ENTRY cl_int CL_API_CALL CLIRN(clEnqueueReadBufferRect)( CHECK_ERROR( retVal ); ADD_OBJECT_ALLOCATION_EVENT( retVal, event ); CALL_LOGGING_EXIT_EVENT_WITH_TAG( retVal, event ); - DEVICE_PERFORMANCE_TIMING_CHECK_CONDITIONAL( blocking_read ); - FLUSH_CHROME_TRACE_BUFFERING_CONDITIONAL( blocking_read ); + PROCESS_DATA_AND_FLUSH_CONDITIONAL( blocking_read ); ADD_EVENT( event ? event[0] : NULL ); } @@ -3618,8 +3622,7 @@ CL_API_ENTRY cl_int CL_API_CALL CLIRN(clEnqueueWriteBuffer)( CHECK_ERROR( retVal ); ADD_OBJECT_ALLOCATION_EVENT( retVal, event ); CALL_LOGGING_EXIT_EVENT_WITH_TAG( retVal, event ); - DEVICE_PERFORMANCE_TIMING_CHECK_CONDITIONAL( blocking_write ); - FLUSH_CHROME_TRACE_BUFFERING_CONDITIONAL( blocking_write ); + PROCESS_DATA_AND_FLUSH_CONDITIONAL( blocking_write ); ADD_EVENT( event ? event[0] : NULL ); } @@ -3717,8 +3720,7 @@ CL_API_ENTRY cl_int CL_API_CALL CLIRN(clEnqueueWriteBufferRect)( CHECK_ERROR( retVal ); ADD_OBJECT_ALLOCATION_EVENT( retVal, event ); CALL_LOGGING_EXIT_EVENT_WITH_TAG( retVal, event ); - DEVICE_PERFORMANCE_TIMING_CHECK_CONDITIONAL( blocking_write ); - FLUSH_CHROME_TRACE_BUFFERING_CONDITIONAL( blocking_write ); + PROCESS_DATA_AND_FLUSH_CONDITIONAL( blocking_write ); ADD_EVENT( event ? event[0] : NULL ); } @@ -4075,8 +4077,7 @@ CL_API_ENTRY cl_int CL_API_CALL CLIRN(clEnqueueReadImage)( CHECK_ERROR( retVal ); ADD_OBJECT_ALLOCATION_EVENT( retVal, event ); CALL_LOGGING_EXIT_EVENT_WITH_TAG( retVal, event ); - DEVICE_PERFORMANCE_TIMING_CHECK_CONDITIONAL( blocking_read ); - FLUSH_CHROME_TRACE_BUFFERING_CONDITIONAL( blocking_read ); + PROCESS_DATA_AND_FLUSH_CONDITIONAL( blocking_read ); ADD_EVENT( event ? event[0] : NULL ); } @@ -4170,8 +4171,7 @@ CL_API_ENTRY cl_int CL_API_CALL CLIRN(clEnqueueWriteImage)( CHECK_ERROR( retVal ); ADD_OBJECT_ALLOCATION_EVENT( retVal, event ); CALL_LOGGING_EXIT_EVENT_WITH_TAG( retVal, event ); - DEVICE_PERFORMANCE_TIMING_CHECK_CONDITIONAL( blocking_write ); - FLUSH_CHROME_TRACE_BUFFERING_CONDITIONAL( blocking_write ); + PROCESS_DATA_AND_FLUSH_CONDITIONAL( blocking_write ); ADD_EVENT( event ? event[0] : NULL ); } @@ -4554,8 +4554,7 @@ CL_API_ENTRY void* CL_API_CALL CLIRN(clEnqueueMapBuffer)( "[ map count = %d ] returned %p", map_count, retVal ); - DEVICE_PERFORMANCE_TIMING_CHECK_CONDITIONAL( blocking_map ); - FLUSH_CHROME_TRACE_BUFFERING_CONDITIONAL( blocking_map ); + PROCESS_DATA_AND_FLUSH_CONDITIONAL( blocking_map ); ADD_EVENT( event ? event[0] : NULL ); } @@ -4678,8 +4677,7 @@ CL_API_ENTRY void* CL_API_CALL CLIRN(clEnqueueMapImage)( "[ map count = %d ] returned %p", map_count, retVal ); - DEVICE_PERFORMANCE_TIMING_CHECK_CONDITIONAL( blocking_map ); - FLUSH_CHROME_TRACE_BUFFERING_CONDITIONAL( blocking_map ); + PROCESS_DATA_AND_FLUSH_CONDITIONAL( blocking_map ); ADD_EVENT( event ? event[0] : NULL ); } @@ -5190,8 +5188,7 @@ CL_API_ENTRY cl_int CL_API_CALL CLIRN(clEnqueueWaitForEvents)( HOST_PERFORMANCE_TIMING_END(); CHECK_ERROR( retVal ); CALL_LOGGING_EXIT( retVal ); - DEVICE_PERFORMANCE_TIMING_CHECK(); - FLUSH_CHROME_TRACE_BUFFERING(); + PROCESS_DATA_AND_FLUSH(); } FINISH_OR_FLUSH_AFTER_ENQUEUE( command_queue ); @@ -6306,8 +6303,7 @@ CL_API_ENTRY cl_int CL_API_CALL CLIRN(clEnqueueReleaseGLObjects)( CHECK_ERROR( retVal ); ADD_OBJECT_ALLOCATION_EVENT( retVal, event ); CALL_LOGGING_EXIT_EVENT( retVal, event ); - DEVICE_PERFORMANCE_TIMING_CHECK(); - FLUSH_CHROME_TRACE_BUFFERING(); + PROCESS_DATA_AND_FLUSH(); ADD_EVENT( event ? event[0] : NULL ); } @@ -6520,8 +6516,7 @@ CL_API_ENTRY cl_int CL_API_CALL CLIRN(clEnqueueSVMMemcpy) ( CHECK_ERROR( retVal ); ADD_OBJECT_ALLOCATION_EVENT( retVal, event ); CALL_LOGGING_EXIT_EVENT_WITH_TAG( retVal, event ); - DEVICE_PERFORMANCE_TIMING_CHECK_CONDITIONAL( blocking_copy ); - FLUSH_CHROME_TRACE_BUFFERING_CONDITIONAL( blocking_copy ); + PROCESS_DATA_AND_FLUSH_CONDITIONAL( blocking_copy ); ADD_EVENT( event ? event[0] : NULL ); } @@ -6659,8 +6654,7 @@ CL_API_ENTRY cl_int CL_API_CALL CLIRN(clEnqueueSVMMap) ( ADD_MAP_POINTER( svm_ptr, map_flags, size ); ADD_OBJECT_ALLOCATION_EVENT( retVal, event ); CALL_LOGGING_EXIT_EVENT_WITH_TAG( retVal, event ); - DEVICE_PERFORMANCE_TIMING_CHECK_CONDITIONAL( blocking_map ); - FLUSH_CHROME_TRACE_BUFFERING_CONDITIONAL( blocking_map ); + PROCESS_DATA_AND_FLUSH_CONDITIONAL( blocking_map ); ADD_EVENT( event ? event[0] : NULL ); } @@ -6783,7 +6777,10 @@ CL_API_ENTRY cl_int CL_API_CALL CLIRN(clSetKernelExecInfo) ( if( pIntercept && pIntercept->dispatch().clSetKernelExecInfo ) { GET_ENQUEUE_COUNTER(); - CALL_LOGGING_ENTER_KERNEL( kernel, "param_name = %s (%08X)", + CALL_LOGGING_ENTER_KERNEL( + kernel, + "kernel = %p, param_name = %s (%08X)", + kernel, pIntercept->enumName().name( param_name ).c_str(), param_name ); HOST_PERFORMANCE_TIMING_START(); @@ -7338,7 +7335,8 @@ CL_API_ENTRY cl_kernel CL_API_CALL CLIRN(clCloneKernel) ( if( pIntercept && pIntercept->dispatch().clCloneKernel ) { GET_ENQUEUE_COUNTER(); - CALL_LOGGING_ENTER(); + CALL_LOGGING_ENTER( "source_kernel = %p", + source_kernel ); CHECK_ERROR_INIT( errcode_ret ); HOST_PERFORMANCE_TIMING_START(); @@ -7389,7 +7387,10 @@ CL_API_ENTRY cl_int CL_API_CALL CLIRN(clGetKernelSubGroupInfo) ( &device, deviceInfo ); } - CALL_LOGGING_ENTER_KERNEL( kernel, "device = %s, param_name = %s (%08X)", + CALL_LOGGING_ENTER_KERNEL( + kernel, + "kernel = %p, device = %s, param_name = %s (%08X)", + kernel, deviceInfo.c_str(), pIntercept->enumName().name( param_name ).c_str(), param_name ); @@ -7446,7 +7447,10 @@ CL_API_ENTRY cl_int CL_API_CALL clGetKernelSubGroupInfoKHR( &device, deviceInfo ); } - CALL_LOGGING_ENTER_KERNEL( kernel, "device = %s, param_name = %s (%08X)", + CALL_LOGGING_ENTER_KERNEL( + kernel, + "kernel = %p, device = %s, param_name = %s (%08X)", + kernel, deviceInfo.c_str(), pIntercept->enumName().name( param_name ).c_str(), param_name ); @@ -7637,8 +7641,7 @@ CL_API_ENTRY cl_int CL_API_CALL clEnqueueReleaseExternalMemObjectsKHR( CHECK_ERROR( retVal ); ADD_OBJECT_ALLOCATION_EVENT( retVal, event ); CALL_LOGGING_EXIT_EVENT( retVal, event ); - DEVICE_PERFORMANCE_TIMING_CHECK(); - FLUSH_CHROME_TRACE_BUFFERING(); + PROCESS_DATA_AND_FLUSH(); ADD_EVENT( event ? event[0] : NULL ); } @@ -8026,8 +8029,7 @@ CL_API_ENTRY cl_int CL_API_CALL clEnqueueReleaseD3D10ObjectsKHR( CHECK_ERROR( retVal ); ADD_OBJECT_ALLOCATION_EVENT( retVal, event ); CALL_LOGGING_EXIT_EVENT( retVal, event ); - DEVICE_PERFORMANCE_TIMING_CHECK(); - FLUSH_CHROME_TRACE_BUFFERING(); + PROCESS_DATA_AND_FLUSH(); ADD_EVENT( event ? event[0] : NULL ); } @@ -8331,8 +8333,7 @@ CL_API_ENTRY cl_int CL_API_CALL clEnqueueReleaseD3D11ObjectsKHR( CHECK_ERROR( retVal ); ADD_OBJECT_ALLOCATION_EVENT( retVal, event ); CALL_LOGGING_EXIT_EVENT( retVal, event ); - DEVICE_PERFORMANCE_TIMING_CHECK(); - FLUSH_CHROME_TRACE_BUFFERING(); + PROCESS_DATA_AND_FLUSH(); ADD_EVENT( event ? event[0] : NULL ); } @@ -8548,8 +8549,7 @@ CL_API_ENTRY cl_int CL_API_CALL clEnqueueReleaseDX9MediaSurfacesKHR( CHECK_ERROR( retVal ); ADD_OBJECT_ALLOCATION_EVENT( retVal, event ); CALL_LOGGING_EXIT_EVENT( retVal, event ); - DEVICE_PERFORMANCE_TIMING_CHECK(); - FLUSH_CHROME_TRACE_BUFFERING(); + PROCESS_DATA_AND_FLUSH(); ADD_EVENT( event ? event[0] : NULL ); } @@ -8765,8 +8765,7 @@ CL_API_ENTRY cl_int CL_API_CALL clEnqueueReleaseDX9ObjectsINTEL( CHECK_ERROR( retVal ); ADD_OBJECT_ALLOCATION_EVENT( retVal, event ); CALL_LOGGING_EXIT_EVENT( retVal, event ); - DEVICE_PERFORMANCE_TIMING_CHECK(); - FLUSH_CHROME_TRACE_BUFFERING(); + PROCESS_DATA_AND_FLUSH(); ADD_EVENT( event ? event[0] : NULL ); } @@ -9733,8 +9732,7 @@ CL_API_ENTRY cl_int CL_API_CALL clEnqueueReleaseVA_APIMediaSurfacesINTEL( CHECK_ERROR( retVal ); ADD_OBJECT_ALLOCATION_EVENT( retVal, event ); CALL_LOGGING_EXIT_EVENT( retVal, event ); - DEVICE_PERFORMANCE_TIMING_CHECK(); - FLUSH_CHROME_TRACE_BUFFERING(); + PROCESS_DATA_AND_FLUSH(); ADD_EVENT( event ? event[0] : NULL ); } @@ -10269,8 +10267,7 @@ clMemBlockingFreeINTEL( CHECK_ERROR( retVal ); ADD_POINTER_FREE( ptr ); CALL_LOGGING_EXIT( retVal ); - DEVICE_PERFORMANCE_TIMING_CHECK(); - FLUSH_CHROME_TRACE_BUFFERING(); + PROCESS_DATA_AND_FLUSH(); return retVal; } @@ -10565,8 +10562,7 @@ CL_API_ENTRY cl_int CL_API_CALL clEnqueueMemcpyINTEL( CHECK_ERROR( retVal ); ADD_OBJECT_ALLOCATION_EVENT( retVal, event ); CALL_LOGGING_EXIT_EVENT_WITH_TAG( retVal, event ); - DEVICE_PERFORMANCE_TIMING_CHECK_CONDITIONAL( blocking ); - FLUSH_CHROME_TRACE_BUFFERING_CONDITIONAL( blocking ); + PROCESS_DATA_AND_FLUSH_CONDITIONAL( blocking ); ADD_EVENT( event ? event[0] : NULL ); } diff --git a/intercept/src/eventlist.h b/intercept/src/eventlist.h new file mode 100644 index 00000000..72551f75 --- /dev/null +++ b/intercept/src/eventlist.h @@ -0,0 +1,71 @@ +/* +// Copyright (c) 2026 Intel Corporation +// +// SPDX-License-Identifier: MIT +*/ + +#pragma once + +#include +#include +#include + +#include "common.h" + +class CEventList +{ +public: + std::mutex CheckMutex; + + struct Node + { + cl_device_id Device; + unsigned int QueueNumber; + std::string Name; + uint64_t EnqueueCounter; + clock::time_point QueuedTime; + bool UseProfilingDelta; + int64_t ProfilingDeltaNS; + cl_event Event; + }; + + using const_iterator = std::list::const_iterator; + + CEventList() = default; + ~CEventList() = default; + CEventList( const CEventList& ) = delete; + CEventList& operator=( const CEventList& ) = delete; + + void addNode( Node&& node ) + { + std::lock_guard lock(m_Mutex); + m_EventList.push_back( std::move(node) ); + } + + void erase( const_iterator iterator ) + { + std::lock_guard lock(m_Mutex); + m_EventList.erase( iterator ); + } + + const_iterator begin() const + { + return m_EventList.begin(); + } + + const_iterator end() const + { + return m_EventList.end(); + } + + size_t size() + { + std::lock_guard lock(m_Mutex); + return m_EventList.size(); + } + +private: + std::mutex m_Mutex; + + std::list m_EventList; +}; diff --git a/intercept/src/intercept.cpp b/intercept/src/intercept.cpp index 82b9a144..a8259a22 100644 --- a/intercept/src/intercept.cpp +++ b/intercept/src/intercept.cpp @@ -179,6 +179,28 @@ CLIntercept::CLIntercept( void* pGlobalData ) // CLIntercept::~CLIntercept() { + if( m_EventList.size() > 0 ) + { + std::lock_guard lock(m_Mutex); + + logf( "CLIntercept is shutting down, but %zu events are unprocessed!\n", + m_EventList.size() ); + } + if( m_Config.MultiThreadedProcessing ) + { + // Note: We need to hold the processing condition lock while setting the + // done flag. This avoids a deadlock if this thread sets the done flag + // and notifies the processing thread between the time the processing + // thread checked the done flag and before it is waiting for + // notification. + { + std::lock_guard lock(m_ProcessingConditionMutex); + m_ProcessingDone.store(true); + } + m_ProcessingConditionVariable.notify_all(); + m_ProcessingThread.join(); + } + stopAubCapture( NULL ); report(); @@ -187,7 +209,7 @@ CLIntercept::~CLIntercept() log( "CLIntercept is shutting down...\n" ); // Set the dispatch to the dummy dispatch. The destructor is called - // as the process is terminating. We don't know when each DLL gets + // as the process is terminating. We don't know when drivers have been // unloaded, so it's not safe to call into any OpenCL functions in // our destructor. Setting to the dummy dispatch ensures that no // OpenCL functions get called. Note that this means we do potentially @@ -648,11 +670,83 @@ bool CLIntercept::init() m_ChromeTrace.addStartTimeMetadata( usStartTime ); } + if( m_Config.MultiThreadedProcessing ) + { + m_ProcessingDone.store(false); + m_ProcessingThread = std::thread( CLIntercept::processingThreadFunc, this ); + log( "Processing Thread Started!\n" ); + } + log( "... loading complete.\n" ); return true; } +/////////////////////////////////////////////////////////////////////////////// +// +void CLIntercept::processingThreadFunc( CLIntercept* pIntercept ) +{ + // This names the processing thread. + uint64_t threadId = pIntercept->OS().GetThreadID(); + pIntercept->getThreadNumber( "Host Processing Thread", threadId ); + + while( true ) + { + // Note: We need to check whether processing is done while holding the + // processing condition lock. This avoids a deadlock in the case where + // the processing thread checks the done flag and gets interrupted, then + // the main thread sets the done flag and notifies this processing + // thread, then the processing thread waits on the notification that + // will never come. + { + std::unique_lock lock( pIntercept->m_ProcessingConditionMutex ); + if (pIntercept->m_ProcessingDone.load()) + { + break; + } + pIntercept->m_ProcessingConditionVariable.wait( lock ); + } + + pIntercept->processData(); + } +} + +/////////////////////////////////////////////////////////////////////////////// +// +void CLIntercept::notifyProcessingThread() +{ + m_ProcessingConditionVariable.notify_one(); +} + +/////////////////////////////////////////////////////////////////////////////// +// +void CLIntercept::processData() +{ + CLIntercept* pIntercept = this; + + GET_ENQUEUE_COUNTER(); + + if( pIntercept->config().DevicePerformanceTiming || + pIntercept->config().ITTPerformanceTiming || + pIntercept->config().ChromePerformanceTiming || + pIntercept->config().DevicePerfCounterEventBasedSampling || + pIntercept->config().DevicePerfCounterTimeBasedSampling ) + { + TOOL_OVERHEAD_TIMING_START(); + pIntercept->checkTimingEvents(); + TOOL_OVERHEAD_TIMING_END( "(device timing overhead)" ); + } + if( pIntercept->config().ChromeTraceBufferSize && + pIntercept->config().ChromeTraceBufferingBlockingCallFlush && + ( pIntercept->config().ChromeCallLogging || + pIntercept->config().ChromePerformanceTiming ) ) + { + TOOL_OVERHEAD_TIMING_START(); + pIntercept->flushChromeTraceBuffering(); + TOOL_OVERHEAD_TIMING_END( "(chrome trace flush overhead)" ); + } +} + /////////////////////////////////////////////////////////////////////////////// // void CLIntercept::report() @@ -1067,7 +1161,7 @@ void CLIntercept::getCallLoggingPrefix( } if( m_Config.CallLoggingThreadNumber ) { - unsigned int threadNumber = getThreadNumber( threadId ); + uint32_t threadNumber = getThreadNumber( "Host Thread", threadId ); ss << "TNum = "; ss << threadNumber; ss << " "; @@ -1336,6 +1430,7 @@ void CLIntercept::cachePlatformInfo() void CLIntercept::cacheDeviceInfo( cl_device_id device ) { + // TODO: Can we be smarter here and return the found device info? if( device && m_DeviceInfoMap.find(device) == m_DeviceInfoMap.end() ) { SDeviceInfo& deviceInfo = m_DeviceInfoMap[device]; @@ -6517,8 +6612,6 @@ void CLIntercept::addTimingEvent( const cl_command_queue queue, cl_event event ) { - std::lock_guard lock(m_Mutex); - if( event == NULL ) { logf( "Unexpectedly got a NULL timing event for %s, check for OpenCL errors!\n", @@ -6526,9 +6619,7 @@ void CLIntercept::addTimingEvent( return; } - m_EventList.emplace_back(); - - SEventListNode& node = m_EventList.back(); + CEventList::Node node; cl_device_id device = NULL; dispatch().clGetCommandQueueInfo( @@ -6538,10 +6629,6 @@ void CLIntercept::addTimingEvent( &device, NULL ); - // Cache the device info if it's not cached already, since we'll print - // the device name and other device properties as part of the report. - cacheDeviceInfo( device ); - dispatch().clRetainEvent( event ); node.Device = device; @@ -6555,6 +6642,12 @@ void CLIntercept::addTimingEvent( if( device ) { + std::lock_guard lock(m_Mutex); + + // Cache the device info if it's not cached already, since we'll print + // the device name and other device properties as part of the report. + cacheDeviceInfo( device ); + const SDeviceInfo& deviceInfo = m_DeviceInfoMap[device]; // Note: Even though ideally the intercept timer and the host timer should advance @@ -6600,26 +6693,37 @@ void CLIntercept::addTimingEvent( // hostTimeNS ); } } + + m_EventList.addNode(std::move(node)); } /////////////////////////////////////////////////////////////////////////////// // void CLIntercept::checkTimingEvents() { - std::lock_guard lock(m_Mutex); + std::lock_guard lock(m_EventList.CheckMutex); - CEventList::iterator current = m_EventList.begin(); - CEventList::iterator next; + CEventList::const_iterator current = m_EventList.begin(); + CEventList::const_iterator next; while( current != m_EventList.end() ) { + if( config().MultiThreadedProcessing && + m_ProcessingDone.load() == true ) + { + // If we are shutting down, drivers may have been unloaded, so it is + // no longer safe to make calls to check events and process data. + // There is not much else we can do, so just stop processing. + break; + } + cl_int errorCode = CL_SUCCESS; cl_int eventStatus = 0; next = current; ++next; - const SEventListNode& node = *current; + const CEventList::Node& node = *current; errorCode = dispatch().clGetEventInfo( node.Event, @@ -6668,6 +6772,8 @@ void CLIntercept::checkTimingEvents() NULL ); if( errorCode == CL_SUCCESS ) { + std::lock_guard lock(m_Mutex); + cl_ulong delta = commandEnd - commandStart; SDeviceTimingStats& deviceTimingStats = m_DeviceTimingStatsMap[node.Device][node.Name]; @@ -6776,6 +6882,8 @@ void CLIntercept::checkTimingEvents() break; case CL_INVALID_EVENT: { + std::lock_guard lock(m_Mutex); + // This is unexpected. We retained the event when we // added it to the list. Remove the event from the // list. @@ -14512,7 +14620,7 @@ void CLIntercept::chromeCallLoggingExit( uint64_t threadId = OS().GetThreadID(); // This will name the thread if it is not named already. - getThreadNumber( threadId ); + getThreadNumber( "Host Thread", threadId ); using ns = std::chrono::nanoseconds; uint64_t nsStart = diff --git a/intercept/src/intercept.h b/intercept/src/intercept.h index a54f1160..b72e54ca 100644 --- a/intercept/src/intercept.h +++ b/intercept/src/intercept.h @@ -6,8 +6,8 @@ #pragma once #include -#include #include +#include #include #include #include @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -25,6 +26,7 @@ #include "chrometracer.h" #include "cmdbufrecorder.h" #include "enummap.h" +#include "eventlist.h" #include "dispatch.h" #include "objtracker.h" @@ -52,15 +54,12 @@ class CLIntercept struct SConfig; public: -#if defined(CLINTERCEPT_HIGH_RESOLUTON_CLOCK) - using clock = std::chrono::high_resolution_clock; -#else - using clock = std::chrono::steady_clock; -#endif - static bool Create( void* pGlobalData, CLIntercept*& pIntercept ); static void Delete( CLIntercept*& pIntercept ); + void notifyProcessingThread(); + void processData(); + void report(); void callLoggingEnter( @@ -902,7 +901,7 @@ class CLIntercept const size_t* gws, const size_t* lws); - unsigned int getThreadNumber( uint64_t threadId ); + uint32_t getThreadNumber( const char* threadName, uint64_t threadId ); void saveProgramNumber( const cl_program program ); unsigned int getProgramNumber() const; @@ -1082,6 +1081,7 @@ class CLIntercept std::ofstream m_InterceptLog; CChromeTracer m_ChromeTrace; + CEventList m_EventList; mutable char m_StringBuffer[CLI_STRING_BUFFER_SIZE]; @@ -1091,7 +1091,7 @@ class CLIntercept clock::time_point m_StartTime; - typedef std::map< uint64_t, unsigned int> CThreadNumberMap; + typedef std::map< uint64_t, uint32_t > CThreadNumberMap; CThreadNumberMap m_ThreadNumberMap; typedef std::map< cl_device_id, std::vector > CSubDeviceCacheMap; @@ -1101,6 +1101,15 @@ class CLIntercept unsigned int m_ProgramNumber; + // Multi-threaded processing and flushing: + + std::thread m_ProcessingThread; + std::mutex m_ProcessingConditionMutex; + std::condition_variable m_ProcessingConditionVariable; + std::atomic m_ProcessingDone; + + static void processingThreadFunc( CLIntercept* pIntercept ); + // This defines a mapping between a sub-device handle and information // about the sub-device. @@ -1251,9 +1260,6 @@ class CLIntercept cl_event Event; }; - typedef std::list< SEventListNode > CEventList; - CEventList m_EventList; - #if defined(USE_MDAPI) MetricsDiscovery::MDHelper* m_pMDHelper; MetricsDiscovery::CMetricAggregations m_MetricAggregations; @@ -2101,10 +2107,10 @@ inline CObjectTracker& CLIntercept::objectTracker() /////////////////////////////////////////////////////////////////////////////// // #define BUILD_LOGGING_INIT() \ - CLIntercept::clock::time_point buildTimeStart; \ + clock::time_point buildTimeStart; \ if( pIntercept->config().BuildLogging ) \ { \ - buildTimeStart = CLIntercept::clock::now(); \ + buildTimeStart = clock::now(); \ } #define BUILD_LOGGING( program, num_devices, device_list ) \ @@ -3267,7 +3273,7 @@ inline bool CLIntercept::checkHostPerformanceTimingEnqueueLimits( } #define HOST_PERFORMANCE_TIMING_START() \ - CLIntercept::clock::time_point cpuStart, cpuEnd; \ + clock::time_point cpuStart, cpuEnd; \ bool doHostPerformanceTiming = \ ( pIntercept->config().ChromeCallLogging || \ pIntercept->config().HostPerformanceTiming ) && \ @@ -3275,13 +3281,13 @@ inline bool CLIntercept::checkHostPerformanceTimingEnqueueLimits( pIntercept->checkConditionalTiming(); \ if( doHostPerformanceTiming ) \ { \ - cpuStart = CLIntercept::clock::now(); \ + cpuStart = clock::now(); \ } #define HOST_PERFORMANCE_TIMING_END() \ if( doHostPerformanceTiming ) \ { \ - cpuEnd = CLIntercept::clock::now(); \ + cpuEnd = clock::now(); \ if( pIntercept->config().HostPerformanceTiming ) \ { \ pIntercept->updateHostTimingStats( \ @@ -3295,7 +3301,7 @@ inline bool CLIntercept::checkHostPerformanceTimingEnqueueLimits( #define HOST_PERFORMANCE_TIMING_END_WITH_TAG() \ if( doHostPerformanceTiming ) \ { \ - cpuEnd = CLIntercept::clock::now(); \ + cpuEnd = clock::now(); \ if( pIntercept->config().HostPerformanceTiming ) \ { \ pIntercept->updateHostTimingStats( \ @@ -3307,7 +3313,7 @@ inline bool CLIntercept::checkHostPerformanceTimingEnqueueLimits( } #define TOOL_OVERHEAD_TIMING_START() \ - CLIntercept::clock::time_point toolStart, toolEnd; \ + clock::time_point toolStart, toolEnd; \ bool doToolOverheadTiming = \ pIntercept->config().ToolOverheadTiming && \ ( pIntercept->config().ChromeCallLogging || \ @@ -3316,13 +3322,13 @@ inline bool CLIntercept::checkHostPerformanceTimingEnqueueLimits( pIntercept->checkConditionalTiming(); \ if( doToolOverheadTiming ) \ { \ - toolStart = CLIntercept::clock::now(); \ + toolStart = clock::now(); \ } #define TOOL_OVERHEAD_TIMING_END( _tag ) \ if( doToolOverheadTiming ) \ { \ - toolEnd = CLIntercept::clock::now(); \ + toolEnd = clock::now(); \ if( pIntercept->config().HostPerformanceTiming ) \ { \ pIntercept->updateHostTimingStats( \ @@ -3397,7 +3403,7 @@ inline bool CLIntercept::checkDevicePerformanceTimingEnqueueLimits( } #define DEVICE_PERFORMANCE_TIMING_START( pEvent ) \ - CLIntercept::clock::time_point queuedTime; \ + clock::time_point queuedTime; \ cl_event local_event = NULL; \ bool isLocalEvent = false; \ bool doDevicePerformanceTiming = \ @@ -3409,7 +3415,7 @@ inline bool CLIntercept::checkDevicePerformanceTimingEnqueueLimits( pIntercept->checkConditionalTiming(); \ if( doDevicePerformanceTiming ) \ { \ - queuedTime = CLIntercept::clock::now(); \ + queuedTime = clock::now(); \ if( pEvent == NULL ) \ { \ pEvent = &local_event; \ @@ -3487,31 +3493,6 @@ inline bool CLIntercept::checkDevicePerformanceTimingEnqueueLimits( } \ } -#define DEVICE_PERFORMANCE_TIMING_CHECK() \ - if( pIntercept->config().DevicePerformanceTiming || \ - pIntercept->config().ITTPerformanceTiming || \ - pIntercept->config().ChromePerformanceTiming || \ - pIntercept->config().DevicePerfCounterEventBasedSampling || \ - pIntercept->config().DevicePerfCounterTimeBasedSampling ) \ - { \ - TOOL_OVERHEAD_TIMING_START(); \ - pIntercept->checkTimingEvents(); \ - TOOL_OVERHEAD_TIMING_END( "(device timing overhead)" ); \ - } - -#define DEVICE_PERFORMANCE_TIMING_CHECK_CONDITIONAL( _condition ) \ - if( ( _condition ) && \ - ( pIntercept->config().DevicePerformanceTiming || \ - pIntercept->config().ITTPerformanceTiming || \ - pIntercept->config().ChromePerformanceTiming || \ - pIntercept->config().DevicePerfCounterEventBasedSampling || \ - pIntercept->config().DevicePerfCounterTimeBasedSampling ) ) \ - { \ - TOOL_OVERHEAD_TIMING_START(); \ - pIntercept->checkTimingEvents(); \ - TOOL_OVERHEAD_TIMING_END( "(device timing overhead)" ); \ - } - /////////////////////////////////////////////////////////////////////////////// // inline void CLIntercept::flushChromeTraceBuffering() @@ -3519,27 +3500,20 @@ inline void CLIntercept::flushChromeTraceBuffering() m_ChromeTrace.flush(); } -#define FLUSH_CHROME_TRACE_BUFFERING() \ - if( pIntercept->config().ChromeTraceBufferSize && \ - pIntercept->config().ChromeTraceBufferingBlockingCallFlush && \ - ( pIntercept->config().ChromeCallLogging || \ - pIntercept->config().ChromePerformanceTiming ) ) \ +#define PROCESS_DATA_AND_FLUSH() \ + if( pIntercept->config().MultiThreadedProcessing ) \ { \ - TOOL_OVERHEAD_TIMING_START(); \ - pIntercept->flushChromeTraceBuffering(); \ - TOOL_OVERHEAD_TIMING_END( "(chrome trace flush overhead)" ); \ + pIntercept->notifyProcessingThread(); \ + } \ + else \ + { \ + pIntercept->processData(); \ } -#define FLUSH_CHROME_TRACE_BUFFERING_CONDITIONAL( _condition ) \ - if( ( _condition ) && \ - pIntercept->config().ChromeTraceBufferSize && \ - pIntercept->config().ChromeTraceBufferingBlockingCallFlush && \ - ( pIntercept->config().ChromeCallLogging || \ - pIntercept->config().ChromePerformanceTiming ) ) \ +#define PROCESS_DATA_AND_FLUSH_CONDITIONAL( _condition ) \ + if( _condition ) \ { \ - TOOL_OVERHEAD_TIMING_START(); \ - pIntercept->flushChromeTraceBuffering(); \ - TOOL_OVERHEAD_TIMING_END( "(chrome trace flush overhead)" ); \ + PROCESS_DATA_AND_FLUSH(); \ } /////////////////////////////////////////////////////////////////////////////// @@ -3697,10 +3671,12 @@ inline std::string CLIntercept::getShortKernelNameWithHash( /////////////////////////////////////////////////////////////////////////////// // -inline unsigned int CLIntercept::getThreadNumber( uint64_t threadId ) +inline uint32_t CLIntercept::getThreadNumber( + const char* threadName, + uint64_t threadId ) { CThreadNumberMap::const_iterator iter = m_ThreadNumberMap.find( threadId ); - unsigned int threadNumber = 0; + uint32_t threadNumber = 0; if( iter != m_ThreadNumberMap.end() ) { @@ -3708,12 +3684,12 @@ inline unsigned int CLIntercept::getThreadNumber( uint64_t threadId ) } else { - threadNumber = (unsigned int)m_ThreadNumberMap.size(); + threadNumber = static_cast(m_ThreadNumberMap.size()); m_ThreadNumberMap[ threadId ] = threadNumber; if( m_Config.ChromeCallLogging ) { - m_ChromeTrace.addThreadMetadata( threadId, threadNumber ); + m_ChromeTrace.addThreadMetadata( threadName, threadId, threadNumber ); } }