Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion libkineto/src/init.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ void libkineto_init(bool cpuOnly, bool logOnError) {
std::string errPrefixMsg(
"Fail to enable Kineto Profiler on XPU due to error code: ");
errPrefixMsg = errPrefixMsg + std::to_string(returnCode);
#if PTI_VERSION_MAJOR > 0 || PTI_VERSION_MINOR > 9
#if PTI_VERSION_AT_LEAST(0, 10)
std::string errMsg(ptiResultTypeToString(returnCode));
throw std::runtime_error(
errPrefixMsg + std::string(". The detailed error message is: ") +
Expand Down
4 changes: 2 additions & 2 deletions libkineto/src/plugin/xpupti/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ if(TARGET Pti::pti_view)
foreach(ver_major RANGE 0 1)
foreach(ver_minor RANGE 0 19)
list(APPEND pti_view_names pti_view-${ver_major}-${ver_minor})
endforeach()
endforeach()
endforeach()
endforeach()

get_target_property(PTI_INCLUDE_DIR Pti::pti_view INTERFACE_INCLUDE_DIRECTORIES)
find_library(PTI_VIEW_LIBRARY NAMES ${pti_view_names} PATHS "${PTI_INCLUDE_DIR}/../lib")
Expand Down
1 change: 0 additions & 1 deletion libkineto/src/plugin/xpupti/FindSYCLToolkit.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,5 @@ if((NOT SYCL_INCLUDE_DIR) OR (NOT SYCL_LIBRARY_DIR) OR (NOT SYCL_LIBRARY))
endif()

message(DEBUG "The SYCL compiler is ${SYCL_COMPILER}")
message(DEBUG "The SYCL Flags are ${SYCL_FLAGS}")

set(SYCLTOOLKIT_FOUND True)
190 changes: 90 additions & 100 deletions libkineto/src/plugin/xpupti/XpuptiActivityApi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,9 @@

#include "XpuptiActivityApi.h"

#include <assert.h>
#include <algorithm>
#include <chrono>
#include <mutex>
#include <thread>

using namespace std::chrono;
#include <vector>

namespace KINETO_NAMESPACE {

Expand Down Expand Up @@ -58,8 +55,10 @@ void XpuptiActivityApi::popCorrelationID(CorrelationFlowType type) {
#endif
}

static bool
nextActivityRecord(uint8_t* buffer, size_t valid_size, Pti_Activity*& record) {
static bool nextActivityRecord(
uint8_t* buffer,
size_t valid_size,
pti_view_record_base*& record) {
#ifdef HAS_XPUPTI
pti_result status = ptiViewGetNextRecord(buffer, valid_size, &record);
if (status != pti_result::PTI_SUCCESS) {
Expand All @@ -69,10 +68,6 @@ nextActivityRecord(uint8_t* buffer, size_t valid_size, Pti_Activity*& record) {
return record != nullptr;
}

void XpuptiActivityApi::setMaxBufferSize(int size) {
maxGpuBufferCount_ = 1 + size / kBufSize;
}

void XpuptiActivityApi::bufferRequestedTrampoline(
uint8_t** buffer,
size_t* size) {
Expand All @@ -81,9 +76,6 @@ void XpuptiActivityApi::bufferRequestedTrampoline(

void XpuptiActivityApi::bufferRequested(uint8_t** buffer, size_t* size) {
std::lock_guard<std::mutex> guard(mutex_);
if (allocatedGpuTraceBuffers_.size() >= (size_t)maxGpuBufferCount_) {
Comment thread
moksiuc marked this conversation as resolved.
stopCollection = true;
}

auto buf = std::make_unique<XpuptiActivityBuffer>(kBufSize);
*buffer = buf->data();
Expand All @@ -101,7 +93,7 @@ std::unique_ptr<XpuptiActivityBufferMap> XpuptiActivityApi::activityBuffers() {
}

#ifdef HAS_XPUPTI
time_point<system_clock> t1;
std::chrono::time_point<std::chrono::system_clock> t1;
XPUPTI_CALL(ptiFlushAllViews());
#endif

Expand All @@ -113,10 +105,10 @@ std::unique_ptr<XpuptiActivityBufferMap> XpuptiActivityApi::activityBuffers() {
int XpuptiActivityApi::processActivitiesForBuffer(
uint8_t* buf,
size_t validSize,
std::function<void(const Pti_Activity*)> handler) {
std::function<void(const pti_view_record_base*)> handler) {
int count = 0;
if (buf && validSize) {
Pti_Activity* record{nullptr};
pti_view_record_base* record{nullptr};
while (nextActivityRecord(buf, validSize, record)) {
handler(record);
++count;
Expand All @@ -128,7 +120,7 @@ int XpuptiActivityApi::processActivitiesForBuffer(

const std::pair<int, int> XpuptiActivityApi::processActivities(
XpuptiActivityBufferMap& buffers,
std::function<void(const Pti_Activity*)> handler) {
std::function<void(const pti_view_record_base*)> handler) {
std::pair<int, int> res{0, 0};
#ifdef HAS_XPUPTI
for (auto& pair : buffers) {
Expand Down Expand Up @@ -184,40 +176,29 @@ void XpuptiActivityApi::bufferCompleted(
}
#endif

#if PTI_VERSION_MAJOR > 0 || PTI_VERSION_MINOR > 10
#if PTI_VERSION_AT_LEAST(0, 11)
static void enableSpecifcRuntimeAPIsTracing() {
XPUPTI_CALL(ptiViewEnableRuntimeApi(
1, pti_api_group_id::PTI_API_GROUP_SYCL, urEnqueueUSMFill_id));
XPUPTI_CALL(ptiViewEnableRuntimeApi(
1, pti_api_group_id::PTI_API_GROUP_SYCL, urEnqueueUSMFill2D_id));
XPUPTI_CALL(ptiViewEnableRuntimeApi(
1, pti_api_group_id::PTI_API_GROUP_SYCL, urEnqueueUSMMemcpy_id));
XPUPTI_CALL(ptiViewEnableRuntimeApi(
1, pti_api_group_id::PTI_API_GROUP_SYCL, urEnqueueUSMMemcpy2D_id));
XPUPTI_CALL(ptiViewEnableRuntimeApi(
1, pti_api_group_id::PTI_API_GROUP_SYCL, urEnqueueKernelLaunch_id));
XPUPTI_CALL(ptiViewEnableRuntimeApi(
1,
pti_api_group_id::PTI_API_GROUP_SYCL,
urEnqueueKernelLaunchCustomExp_id));
XPUPTI_CALL(ptiViewEnableRuntimeApi(
1,
pti_api_group_id::PTI_API_GROUP_SYCL,
urEnqueueCooperativeKernelLaunchExp_id));
XPUPTI_CALL(ptiViewEnableRuntimeApi(
1, pti_api_group_id::PTI_API_GROUP_SYCL, urEnqueueMemBufferFill_id));
XPUPTI_CALL(ptiViewEnableRuntimeApi(
1, pti_api_group_id::PTI_API_GROUP_SYCL, urEnqueueMemBufferRead_id));
XPUPTI_CALL(ptiViewEnableRuntimeApi(
1, pti_api_group_id::PTI_API_GROUP_SYCL, urEnqueueMemBufferWrite_id));
XPUPTI_CALL(ptiViewEnableRuntimeApi(
1, pti_api_group_id::PTI_API_GROUP_SYCL, urEnqueueMemBufferCopy_id));
XPUPTI_CALL(ptiViewEnableRuntimeApi(
1, pti_api_group_id::PTI_API_GROUP_SYCL, urUSMHostAlloc_id));
XPUPTI_CALL(ptiViewEnableRuntimeApi(
1, pti_api_group_id::PTI_API_GROUP_SYCL, urUSMSharedAlloc_id));
XPUPTI_CALL(ptiViewEnableRuntimeApi(
1, pti_api_group_id::PTI_API_GROUP_SYCL, urUSMDeviceAlloc_id));
constexpr const std::array<pti_api_id_runtime_sycl, 14>
specifcRuntimeAPIsTracing = {
urEnqueueUSMFill_id,
urEnqueueUSMFill2D_id,
urEnqueueUSMMemcpy_id,
urEnqueueUSMMemcpy2D_id,
urEnqueueKernelLaunch_id,
urEnqueueKernelLaunchCustomExp_id,
urEnqueueCooperativeKernelLaunchExp_id,
urEnqueueMemBufferFill_id,
urEnqueueMemBufferRead_id,
urEnqueueMemBufferWrite_id,
urEnqueueMemBufferCopy_id,
urUSMHostAlloc_id,
urUSMSharedAlloc_id,
urUSMDeviceAlloc_id};

for (auto tracing_id : specifcRuntimeAPIsTracing) {
XPUPTI_CALL(ptiViewEnableRuntimeApi(
1, pti_api_group_id::PTI_API_GROUP_SYCL, tracing_id));
}
}
#endif

Expand All @@ -229,68 +210,77 @@ void XpuptiActivityApi::enableXpuptiActivities(

externalCorrelationEnabled_ = false;
for (const auto& activity : selected_activities) {
if (activity == ActivityType::GPU_MEMCPY) {
XPUPTI_CALL(ptiViewEnable(PTI_VIEW_DEVICE_GPU_MEM_COPY));
}
if (activity == ActivityType::GPU_MEMSET) {
XPUPTI_CALL(ptiViewEnable(PTI_VIEW_DEVICE_GPU_MEM_FILL));
}
if (activity == ActivityType::CONCURRENT_KERNEL) {
XPUPTI_CALL(ptiViewEnable(PTI_VIEW_DEVICE_GPU_KERNEL));
}
if (activity == ActivityType::EXTERNAL_CORRELATION) {
XPUPTI_CALL(ptiViewEnable(PTI_VIEW_EXTERNAL_CORRELATION));
externalCorrelationEnabled_ = true;
}
if (activity == ActivityType::XPU_RUNTIME) {
#if PTI_VERSION_MAJOR > 0 || PTI_VERSION_MINOR > 11
XPUPTI_CALL(ptiViewEnable(PTI_VIEW_RUNTIME_API));
XPUPTI_CALL(ptiViewEnableRuntimeApiClass(
1, PTI_API_CLASS_GPU_OPERATION_CORE, PTI_API_GROUP_ALL));
#elif PTI_VERSION_MAJOR == 0 && PTI_VERSION_MINOR == 11
XPUPTI_CALL(ptiViewEnable(PTI_VIEW_RUNTIME_API));
enableSpecifcRuntimeAPIsTracing();
switch (activity) {
case ActivityType::GPU_MEMCPY:
XPUPTI_CALL(ptiViewEnable(PTI_VIEW_DEVICE_GPU_MEM_COPY));
break;

case ActivityType::GPU_MEMSET:
XPUPTI_CALL(ptiViewEnable(PTI_VIEW_DEVICE_GPU_MEM_FILL));
break;

case ActivityType::CONCURRENT_KERNEL:
XPUPTI_CALL(ptiViewEnable(PTI_VIEW_DEVICE_GPU_KERNEL));
break;

case ActivityType::EXTERNAL_CORRELATION:
XPUPTI_CALL(ptiViewEnable(PTI_VIEW_EXTERNAL_CORRELATION));
externalCorrelationEnabled_ = true;
break;

case ActivityType::XPU_RUNTIME:
#if PTI_VERSION_AT_LEAST(0, 12)
XPUPTI_CALL(ptiViewEnable(PTI_VIEW_RUNTIME_API));
XPUPTI_CALL(ptiViewEnableRuntimeApiClass(
1, PTI_API_CLASS_GPU_OPERATION_CORE, PTI_API_GROUP_ALL));
#elif PTI_VERSION_AT_LEAST(0, 11)
XPUPTI_CALL(ptiViewEnable(PTI_VIEW_RUNTIME_API));
enableSpecifcRuntimeAPIsTracing();
#else
XPUPTI_CALL(ptiViewEnable(PTI_VIEW_SYCL_RUNTIME_CALLS));
XPUPTI_CALL(ptiViewEnable(PTI_VIEW_SYCL_RUNTIME_CALLS));
#endif
}
if (activity == ActivityType::OVERHEAD) {
XPUPTI_CALL(ptiViewEnable(PTI_VIEW_COLLECTION_OVERHEAD));
break;

case ActivityType::OVERHEAD:
XPUPTI_CALL(ptiViewEnable(PTI_VIEW_COLLECTION_OVERHEAD));
break;
}
}

tracingEnabled_ = 1;
#endif

stopCollection = false;
}

void XpuptiActivityApi::disablePtiActivities(
const std::set<ActivityType>& selected_activities) {
#ifdef HAS_XPUPTI
for (const auto& activity : selected_activities) {
if (activity == ActivityType::GPU_MEMCPY) {
XPUPTI_CALL(ptiViewDisable(PTI_VIEW_DEVICE_GPU_MEM_COPY));
}
if (activity == ActivityType::GPU_MEMSET) {
XPUPTI_CALL(ptiViewDisable(PTI_VIEW_DEVICE_GPU_MEM_FILL));
}
if (activity == ActivityType::CONCURRENT_KERNEL) {
XPUPTI_CALL(ptiViewDisable(PTI_VIEW_DEVICE_GPU_KERNEL));
}
if (activity == ActivityType::EXTERNAL_CORRELATION) {
XPUPTI_CALL(ptiViewDisable(PTI_VIEW_EXTERNAL_CORRELATION));
}
if (activity == ActivityType::XPU_RUNTIME) {
#if PTI_VERSION_MAJOR > 0 || PTI_VERSION_MINOR > 10
XPUPTI_CALL(ptiViewDisable(PTI_VIEW_RUNTIME_API));
switch (activity) {
case ActivityType::GPU_MEMCPY:
XPUPTI_CALL(ptiViewDisable(PTI_VIEW_DEVICE_GPU_MEM_COPY));
break;

case ActivityType::GPU_MEMSET:
XPUPTI_CALL(ptiViewDisable(PTI_VIEW_DEVICE_GPU_MEM_FILL));
break;

case ActivityType::CONCURRENT_KERNEL:
XPUPTI_CALL(ptiViewDisable(PTI_VIEW_DEVICE_GPU_KERNEL));
break;

case ActivityType::EXTERNAL_CORRELATION:
XPUPTI_CALL(ptiViewDisable(PTI_VIEW_EXTERNAL_CORRELATION));
break;

case ActivityType::XPU_RUNTIME:
#if PTI_VERSION_AT_LEAST(0, 11)
XPUPTI_CALL(ptiViewDisable(PTI_VIEW_RUNTIME_API));
#else

XPUPTI_CALL(ptiViewDisable(PTI_VIEW_SYCL_RUNTIME_CALLS));
XPUPTI_CALL(ptiViewDisable(PTI_VIEW_SYCL_RUNTIME_CALLS));
#endif
}
if (activity == ActivityType::OVERHEAD) {
XPUPTI_CALL(ptiViewDisable(PTI_VIEW_COLLECTION_OVERHEAD));
break;

case ActivityType::OVERHEAD:
XPUPTI_CALL(ptiViewDisable(PTI_VIEW_COLLECTION_OVERHEAD));
break;
}
}
externalCorrelationEnabled_ = false;
Expand Down
23 changes: 9 additions & 14 deletions libkineto/src/plugin/xpupti/XpuptiActivityApi.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,19 @@
#include "XpuptiActivityBuffer.h"
#include "XpuptiProfilerMacros.h"

#include <atomic>
#include "ActivityType.h"
#include "Config.h"

#include <pti/pti_view.h>

#include <functional>
#include <memory>
#include <mutex>
#include <optional>
#include <set>

namespace KINETO_NAMESPACE {

using Pti_Activity = pti_view_record_base;

class XpuptiActivityApi {
public:
enum CorrelationFlowType { Default, User };
Expand All @@ -45,27 +49,18 @@ class XpuptiActivityApi {

virtual const std::pair<int, int> processActivities(
XpuptiActivityBufferMap&,
std::function<void(const Pti_Activity*)> handler);

void setMaxBufferSize(int size);
// void setDeviceBufferSize(size_t size);
// void setDeviceBufferPoolLimit(size_t limit);

std::atomic_bool stopCollection{false};
int64_t flushOverhead{0};
std::function<void(const pti_view_record_base*)> handler);

private:
int maxGpuBufferCount_{0};
XpuptiActivityBufferMap allocatedGpuTraceBuffers_;
std::unique_ptr<XpuptiActivityBufferMap> readyGpuTraceBuffers_;
std::mutex mutex_;
std::atomic<uint32_t> tracingEnabled_{0};
bool externalCorrelationEnabled_{false};

int processActivitiesForBuffer(
uint8_t* buf,
size_t validSize,
std::function<void(const Pti_Activity*)> handler);
std::function<void(const pti_view_record_base*)> handler);
static void bufferRequestedTrampoline(uint8_t** buffer, size_t* size);
static void
bufferCompletedTrampoline(uint8_t* buffer, size_t size, size_t validSize);
Expand Down
15 changes: 11 additions & 4 deletions libkineto/src/plugin/xpupti/XpuptiActivityBuffer.h
Original file line number Diff line number Diff line change
@@ -1,9 +1,16 @@
#pragma once
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include "XpuptiProfilerMacros.h"
#pragma once

#include <assert.h>
#include <stdlib.h>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <map>
#include <memory>
#include <vector>
Expand Down
Loading