Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ option(ENABLE_CLIPROF "Enable cliprof Support and Build the Executable")
option(ENABLE_ITT "Enable ITT (Instrumentation Tracing Technology) API Support")
option(ENABLE_MDAPI "Enable MDAPI Support" ON)
option(ENABLE_HIGH_RESOLUTION_CLOCK "Use the high_resolution_clock for timing instead of the steady_clock")
option(ENABLE_TSAN "Enable Thread Sanitizer")
if(WIN32)
option(ENABLE_CLICONFIG "Build the cliconfig Configuration Utility" ON)
endif()
Expand Down
31 changes: 28 additions & 3 deletions cliloader/cliloader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,10 @@ static bool parseArguments(int argc, char *argv[])
set_LD_LIBRARY_PATH = false;
}
#endif
else if ( !strcmp(argv[i], "-nt") || !strcmp(argv[i], "--no-threads") )
{
checkSetEnv("CLI_MultiThreadedProcessing", "0");
}
else if( !strcmp(argv[i], "-q") || !strcmp(argv[i], "--quiet") )
{
checkSetEnv("CLI_SuppressLogging", "1");
Expand Down Expand Up @@ -517,6 +521,24 @@ static bool parseArguments(int argc, char *argv[])
{
checkSetEnv("CLI_HostPerformanceTiming", "1");
}
else if( !strcmp(argv[i], "--min-enqueue") )
{
++i;
if( i < argc )
{
checkSetEnv("CLI_HostPerformanceTimingMinEnqueue", argv[i]);
checkSetEnv("CLI_DevicePerformanceTimingMinEnqueue", argv[i]);
}
}
else if( !strcmp(argv[i], "--max-enqueue") )
{
++i;
if( i < argc )
{
checkSetEnv("CLI_HostPerformanceTimingMaxEnqueue", argv[i]);
checkSetEnv("CLI_DevicePerformanceTimingMaxEnqueue", argv[i]);
}
}
else if( !strcmp(argv[i], "-l") || !strcmp(argv[i], "--leak-checking") )
{
checkSetEnv("CLI_LeakChecking", "1");
Expand Down Expand Up @@ -610,11 +632,12 @@ static bool parseArguments(int argc, char *argv[])
" --metrics Print All MDAPI Metrics and Exit\n"
" --mdapi-devices Print All MDAPI Devices and Exit\n"
#if defined(_WIN32)
" --no-DLL-load Do not load the Intercept DLL into the child process\n"
" --no-DLL-load Do Not Load the Intercept DLL Into The Child Process\n"
#else // not Windows
" --no-LD_PRELOAD Do not set LD_PRELOAD\n"
" --no-LD_LIBRARY_PATH Do not set LD_LIBRARY_PATH\n"
" --no-LD_PRELOAD Do Not Set LD_PRELOAD\n"
" --no-LD_LIBRARY_PATH Do Not Set LD_LIBRARY_PATH\n"
#endif
" --no-threads [-nt] Do Not Create Additional Processing Threads\n"
"\n"
" --quiet [-q] Disable Logging\n"
" --call-logging [-c] Trace Host API Calls\n"
Expand All @@ -639,6 +662,8 @@ static bool parseArguments(int argc, char *argv[])
" --mdapi-group <NAME> Choose MDAPI Metrics to Collect (Intel GPU Only)\n"
" --mdapi-device <INDEX> Choose MDAPI Device for Metrics (Intel GPU Only)\n"
" --host-timing [-h] Report Host API Execution Time\n"
" --min-enqueue <NUMBER> Minimum Enqueue for Timing and Chrome Tracing\n"
" --max-enqueue <NUMBER> Maximum Enqueue for Timing and Chrome Tracing\n"
" --capture-enqueue <NUMBER> Capture the Specified Kernel Enqueue\n"
" --capture-kernel <NAME> Capture the Specified Kernel Name\n"
" --leak-checking [-l] Track and Report OpenCL Leaks\n"
Expand Down
2 changes: 1 addition & 1 deletion cliloader/printcontrols.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ struct cliControl
// Table of cliControl entries: a separator row and hand-written startup
// controls, followed by whatever entries src/controls.h expands to when it is
// included inside this initializer list.
// NOTE(review): the meaning of each field depends on the cliControl struct,
// which is declared outside this view — confirm the field order there.
static const std::vector<cliControl> controls =
{
{ true, "Startup Controls:", "", ""},
{ false, "BreakOnLoad", "bool", "If set to a nonzero value, the Intercept Layer for OpenCL Applications will break into the debugger when the DLL is loaded." },
{ false, "BreakOnLoad", "bool", "If set to a nonzero value, the Intercept Layer for OpenCL Applications will break into the debugger when it is loaded." },
// NOTE(review): this entry lists "std::string" before "OpenCLFileName", while
// the BreakOnLoad entry lists the control name before its type ("bool").
// The two middle fields look swapped here — confirm against cliControl.
{ false, "std::string", "OpenCLFileName", "Used to control the DLL or Shared Library that the Intercept Layer for OpenCL Applications loads to make real OpenCL calls. If present, only this file name is loaded. If omitted, the Intercept Layer for OpenCL Applications will search a default set of real OpenCL file names." },
// The remaining entries come from the shared controls list.
#include "src/controls.h"
};
Expand Down
17 changes: 17 additions & 0 deletions docs/FAQ.md
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,23 @@ This causes each instance of the application to dump to a unique directory.

Some other setups may choose to set a unique `DumpDir` for each process instance via environment variables, although setting `AppendPid` is usually sufficient and a much simpler solution.

## What does "CLIntercept is shutting down, but N events are unprocessed" mean?

This is a warning about missing device performance timing information.
Device performance timing works by attaching events to device commands when the commands are enqueued.
The events are then added to a device performance timing list for eventual processing, typically when the application synchronizes with the host.
If the application shuts down while events are still on the device performance timing list, the remaining events will not be processed.
Note that the OpenCL Intercept Layer cannot safely process events during shutdown because other OpenCL libraries or drivers may have been unloaded already.

Events will typically be unprocessed for one of two reasons:

* If an application enqueues a non-blocking command without making any other calls that cause the device performance timing list to be processed, then the event may be unprocessed when the application terminates.
This may especially happen if the application does not flush the queue after enqueueing the non-blocking command.
Setting `FlushAfterEnqueue` or `FinishAfterEnqueue` will usually enable these events to be processed; however, these controls can have a large impact on performance and should be used with caution.
* To reduce overhead, the OpenCL Intercept Layer typically processes the device performance timing list on a separate thread.
If the application enqueues a lot of commands, then the device performance timing list may take a lot of time to process, and the separate thread may still be processing events when the application terminates.
Disabling `MultiThreadedProcessing` will cause these events to be processed on the main application thread instead, which increases overhead, but will usually enable the events to be processed prior to application exit.

## How do I submit a bug?

Please file a GitHub issue to report a bug.
Expand Down
6 changes: 5 additions & 1 deletion docs/controls.md
Original file line number Diff line number Diff line change
Expand Up @@ -129,12 +129,16 @@ The old name may still be used for backwards compatibility, but switching to the

If set to a nonzero value, the Intercept Layer for OpenCL Applications will break into the debugger when it is loaded.

### Tracing Controls
### Basic Controls

##### `BetaExtensionIntercepting` (bool)

If set to a nonzero value, the Intercept Layer for OpenCL Applications will intercept extension APIs for beta extensions that are subject to change. If an application uses beta extensions and does not function correctly with the Intercept Layer for OpenCL Applications, setting this control to zero may allow the application to function correctly, albeit without the ability to debug and analyze the beta extension APIs.

##### `MultiThreadedProcessing` (bool)

If set to a nonzero value, the Intercept Layer for OpenCL Applications will process device performance timing and flush chrome tracing buffers on a separate thread to reduce overhead. Setting this control to zero will process device performance timing and flush chrome tracing buffers on the main application thread instead.

### Logging Controls

##### `SuppressLogging` (bool)
Expand Down
38 changes: 34 additions & 4 deletions intercept/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,6 @@ configure_file(src/git_version.rc.in "${CMAKE_CURRENT_BINARY_DIR}/git_version.rc

# When set, this is used for guessing the location of the real OpenCL ICD loader.
if(CMAKE_LIBRARY_ARCHITECTURE)
message(STATUS "CMAKE_LIBRARY_ARCHITECTURE is: ${CMAKE_LIBRARY_ARCHITECTURE}")
target_compile_definitions(OpenCL PRIVATE CLINTERCEPT_LIBRARY_ARCHITECTURE="${CMAKE_LIBRARY_ARCHITECTURE}")
endif()

Expand Down Expand Up @@ -134,20 +133,45 @@ if(WIN32)
elseif(${CMAKE_SYSTEM_NAME} STREQUAL "Linux" OR
${CMAKE_SYSTEM_NAME} STREQUAL "Android")
# conditionally enabled on Linux and Android

# If using the lld linker, we need to set linker emulation flags
if(NOT DEFINED LLD_EMULATION_FLAGS)
get_filename_component(LINKER_EXECUTABLE ${CMAKE_LINKER} NAME)
if(LINKER_EXECUTABLE MATCHES "lld" OR
${CMAKE_SYSTEM_NAME} STREQUAL "FreeBSD") # default on FreeBSD?
if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "i386" OR
${CMAKE_SYSTEM_PROCESSOR} STREQUAL "i686")
set(LLD_EMULATION_FLAGS "-melf_i386")
elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64" OR
${CMAKE_SYSTEM_PROCESSOR} STREQUAL "amd64")
set(LLD_EMULATION_FLAGS "-melf_x86_64")
elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "aarch64")
set(LLD_EMULATION_FLAGS "-maarch64elf")
elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "riscv64")
set(LLD_EMULATION_FLAGS "-melf64lriscv")
elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "arm" OR
${CMAKE_SYSTEM_PROCESSOR} STREQUAL "armv7")
set(LLD_EMULATION_FLAGS "-marmelf")
else()
message(STATUS "Unknown system processor ${CMAKE_SYSTEM_PROCESSOR} for lld target emulation flags!")
endif()
endif()
endif()

if(ENABLE_KERNEL_OVERRIDES)
target_compile_definitions(OpenCL PRIVATE USE_KERNEL_OVERRIDES)

set(CLINTERCEPT_KERNELS_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/kernels)
add_custom_command(OUTPUT ${CLINTERCEPT_KERNELS_OUTPUT_DIRECTORY}/precompiled_kernels.o
COMMAND ${CMAKE_COMMAND} -E make_directory ${CLINTERCEPT_KERNELS_OUTPUT_DIRECTORY}
COMMAND ${CMAKE_COMMAND} -E chdir ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_LINKER} -r -b binary -z noexecstack
COMMAND ${CMAKE_COMMAND} -E chdir ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_LINKER} -r -b binary ${LLD_EMULATION_FLAGS} -z noexecstack
kernels/precompiled_kernels.cl
-o ${CLINTERCEPT_KERNELS_OUTPUT_DIRECTORY}/precompiled_kernels.o
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/kernels/precompiled_kernels.cl
)
add_custom_command(OUTPUT ${CLINTERCEPT_KERNELS_OUTPUT_DIRECTORY}/builtin_kernels.o
COMMAND ${CMAKE_COMMAND} -E make_directory ${CLINTERCEPT_KERNELS_OUTPUT_DIRECTORY}
COMMAND ${CMAKE_COMMAND} -E chdir ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_LINKER} -r -b binary -z noexecstack
COMMAND ${CMAKE_COMMAND} -E chdir ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_LINKER} -r -b binary ${LLD_EMULATION_FLAGS} -z noexecstack
kernels/builtin_kernels.cl
-o ${CLINTERCEPT_KERNELS_OUTPUT_DIRECTORY}/builtin_kernels.o
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/kernels/builtin_kernels.cl
Expand All @@ -163,7 +187,7 @@ elseif(${CMAKE_SYSTEM_NAME} STREQUAL "Linux" OR
set(CLINTERCEPT_SCRIPTS_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/scripts)
add_custom_command(OUTPUT ${CLINTERCEPT_SCRIPTS_OUTPUT_DIRECTORY}/run_py.o
COMMAND ${CMAKE_COMMAND} -E make_directory ${CLINTERCEPT_SCRIPTS_OUTPUT_DIRECTORY}
COMMAND ${CMAKE_COMMAND} -E chdir ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_LINKER} -r -b binary -z noexecstack
COMMAND ${CMAKE_COMMAND} -E chdir ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_LINKER} -r -b binary ${LLD_EMULATION_FLAGS} -z noexecstack
scripts/run.py
-o ${CLINTERCEPT_SCRIPTS_OUTPUT_DIRECTORY}/run_py.o
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/scripts/run.py
Expand Down Expand Up @@ -250,6 +274,12 @@ if(ENABLE_ITT)
target_link_libraries(OpenCL ${VTUNE_ITTNOTIFY_LIB} ${CMAKE_THREAD_LIBS_INIT})
endif()

# Thread Sanitizer Support (optional)
if(ENABLE_TSAN)
target_compile_options(OpenCL PRIVATE -fsanitize=thread)
target_link_options(OpenCL PRIVATE -fsanitize=thread)
endif()

if(WIN32)
foreach(OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES})
string(TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG_UPPER)
Expand Down
105 changes: 51 additions & 54 deletions intercept/mdapi/intercept_mdapi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -424,68 +424,65 @@ void CLIntercept::getMDAPICountersFromEvent(
{
const size_t reportSize = m_pMDHelper->GetQueryReportSize();

char* pReport = new char[ reportSize ];
if( pReport )
std::vector<char> report(reportSize);

size_t outputSize = 0;
cl_int errorCode = dispatch().clGetEventProfilingInfo(
event,
CL_PROFILING_COMMAND_PERFCOUNTERS_INTEL,
reportSize,
report.data(),
&outputSize );

std::lock_guard<std::mutex> lock(m_Mutex);

if( errorCode == CL_SUCCESS )
{
size_t outputSize = 0;
cl_int errorCode = dispatch().clGetEventProfilingInfo(
event,
CL_PROFILING_COMMAND_PERFCOUNTERS_INTEL,
reportSize,
pReport,
&outputSize );
// Check: The size of the queried report should be the expected size.
CLI_ASSERT( outputSize == reportSize );

if( errorCode == CL_SUCCESS )
{
// Check: The size of the queried report should be the expected size.
CLI_ASSERT( outputSize == reportSize );
std::vector<MetricsDiscovery::TTypedValue_1_0> results;
std::vector<MetricsDiscovery::TTypedValue_1_0> maxValues;
std::vector<MetricsDiscovery::TTypedValue_1_0> ioInfoValues; // unused

std::vector<MetricsDiscovery::TTypedValue_1_0> results;
std::vector<MetricsDiscovery::TTypedValue_1_0> maxValues;
std::vector<MetricsDiscovery::TTypedValue_1_0> ioInfoValues; // unused
uint32_t numResults = m_pMDHelper->GetMetricsFromReports(
1,
report.data(),
results,
maxValues );

uint32_t numResults = m_pMDHelper->GetMetricsFromReports(
1,
pReport,
if( numResults )
{
m_pMDHelper->PrintMetricValues(
m_MetricDump,
name,
numResults,
results,
maxValues );

if( numResults )
{
m_pMDHelper->PrintMetricValues(
m_MetricDump,
name,
numResults,
results,
maxValues,
ioInfoValues );
m_pMDHelper->AggregateMetrics(
m_MetricAggregations,
name,
results );
}
maxValues,
ioInfoValues );
m_pMDHelper->AggregateMetrics(
m_MetricAggregations,
name,
results );
}
else
}
else
{
// Currently, MDAPI data is only included for kernels, so only
// report errors for kernel events.
cl_command_type type = 0;
dispatch().clGetEventInfo(
event,
CL_EVENT_COMMAND_TYPE,
sizeof(type),
&type,
NULL );
if( type == CL_COMMAND_NDRANGE_KERNEL )
{
// Currently, MDAPI data is only included for kernels, so only
// report an errors for kernel events.
cl_command_type type = 0;
dispatch().clGetEventInfo(
event,
CL_EVENT_COMMAND_TYPE,
sizeof(type),
&type,
NULL );
if( type == CL_COMMAND_NDRANGE_KERNEL )
{
logf("Couldn't get MDAPI data for kernel! clGetEventProfilingInfo returned '%s' (%08X)!\n",
enumName().name(errorCode).c_str(),
errorCode );
}
logf("Couldn't get MDAPI data for kernel! clGetEventProfilingInfo returned '%s' (%08X)!\n",
enumName().name(errorCode).c_str(),
errorCode );
}

delete [] pReport;
pReport = NULL;
}
}
}
Expand Down
4 changes: 3 additions & 1 deletion intercept/src/chrometracer.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,14 +56,16 @@ class CChromeTracer
}

void addThreadMetadata(
const std::string& threadName,
uint64_t threadId,
uint32_t threadNumber )
{
std::lock_guard<std::mutex> lock(m_Mutex);
m_TraceFile
<< "{\"ph\":\"M\",\"name\":\"thread_name\",\"pid\":" << m_ProcessId
<< ",\"tid\":" << threadId
<< ",\"args\":{\"name\":\"Host Thread " << threadId
<< ",\"args\":{\"name\":\"" << threadName << " "
//<< threadId
<< "\"}},\n";
m_TraceFile
<< "{\"ph\":\"M\",\"name\":\"thread_sort_index\",\"pid\":" << m_ProcessId
Expand Down
13 changes: 13 additions & 0 deletions intercept/src/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,3 +83,16 @@
#endif

#define CLI_STRING_BUFFER_SIZE (16 * 1024)

#include <chrono>

// Selects the std::chrono clock type that the name `clock` refers to:
// high_resolution_clock when the guard macro below is defined, steady_clock
// otherwise.
// NOTE(review): this unnamed namespace lives in a header, so every translation
// unit that includes it gets its own internal-linkage `clock` alias; the name
// may also collide with the C library's ::clock() where <ctime>/<time.h> is
// visible — confirm this is intended.
namespace
{

// NOTE(review): "RESOLUTON" looks like a misspelling of "RESOLUTION". Confirm
// the build system defines exactly this macro name; if it defines
// CLINTERCEPT_HIGH_RESOLUTION_CLOCK instead, this branch is never taken.
#if defined(CLINTERCEPT_HIGH_RESOLUTON_CLOCK)
using clock = std::chrono::high_resolution_clock;
#else
using clock = std::chrono::steady_clock;
#endif

};
3 changes: 2 additions & 1 deletion intercept/src/controls.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@
#define CLI_CONTROL_SEPARATOR( _name )
#endif

CLI_CONTROL_SEPARATOR( Tracing Controls: )
CLI_CONTROL_SEPARATOR( Basic Controls: )
CLI_CONTROL( bool, BetaExtensionIntercepting, true, "If set to a nonzero value, the Intercept Layer for OpenCL Applications will intercept extension APIs for beta extensions that are subject to change. If an application uses beta extensions and does not function correctly with the Intercept Layer for OpenCL Applications, setting this control to zero may allow the application to function correctly, albeit without the ability to debug and analyze the beta extension APIs." )
CLI_CONTROL( bool, MultiThreadedProcessing, true, "If set to a nonzero value, the Intercept Layer for OpenCL Applications will process device performance timing and flush chrome tracing buffers on a separate thread to reduce overhead. Setting this control to zero will process device performance timing and flush chrome tracing buffers on the main application thread instead." )

CLI_CONTROL_SEPARATOR( Logging Controls: )
CLI_CONTROL( bool, SuppressLogging, false, "If set to a nonzero value, suppresses all logging output from the Intercept Layer for OpenCL Applications. This is particularly useful for tools that only want report data." )
Expand Down
Loading
Loading