From 2aa50f6437790b1486a7f6ac322a70ada8fa75c5 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 27 May 2026 08:51:15 +0200 Subject: [PATCH 01/10] Bump JLLs. --- CUDACore/Project.toml | 2 +- lib/cublas/Project.toml | 2 +- lib/cufft/Project.toml | 2 +- lib/cupti/Project.toml | 2 +- lib/curand/Project.toml | 2 +- lib/cusolver/Project.toml | 2 +- lib/cusparse/Project.toml | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/CUDACore/Project.toml b/CUDACore/Project.toml index 820b3cf86e..85fb03c1b2 100644 --- a/CUDACore/Project.toml +++ b/CUDACore/Project.toml @@ -48,7 +48,7 @@ CUDA = "6" CUDA_Compiler_jll = "0.3, 0.4" CUDA_Driver_jll = "13" CUDA_Runtime_Discovery = "2" -CUDA_Runtime_jll = "0.22" +CUDA_Runtime_jll = "0.23" ChainRulesCore = "1" EnzymeCore = "0.8.2" ExprTools = "0.1" diff --git a/lib/cublas/Project.toml b/lib/cublas/Project.toml index 2fcdbae5f8..40cee2cf8d 100644 --- a/lib/cublas/Project.toml +++ b/lib/cublas/Project.toml @@ -30,7 +30,7 @@ BFloat16s = "0.5, 0.6" CEnum = "0.2, 0.3, 0.4, 0.5" CUDACore = "=6.1.0" CUDA_Runtime_Discovery = "2" -CUDA_Runtime_jll = "0.22" +CUDA_Runtime_jll = "0.23" EnzymeCore = "0.8.2" GPUArrays = "11.4.1" GPUToolbox = "0.3, 1" diff --git a/lib/cufft/Project.toml b/lib/cufft/Project.toml index d7e3bb27cc..fb6f726460 100644 --- a/lib/cufft/Project.toml +++ b/lib/cufft/Project.toml @@ -21,7 +21,7 @@ AbstractFFTs = "0.5, 1.0" CEnum = "0.2, 0.3, 0.4, 0.5" CUDACore = "=6.1.0" CUDA_Runtime_Discovery = "2" -CUDA_Runtime_jll = "0.22" +CUDA_Runtime_jll = "0.23" GPUToolbox = "0.3, 1" LinearAlgebra = "1" Reexport = "1.0" diff --git a/lib/cupti/Project.toml b/lib/cupti/Project.toml index 8d86aaaa11..a11d578acf 100644 --- a/lib/cupti/Project.toml +++ b/lib/cupti/Project.toml @@ -14,6 +14,6 @@ GPUToolbox = "096a3bc2-3ced-46d0-87f4-dd12716f4bfc" CEnum = "0.2, 0.3, 0.4, 0.5" CUDACore = "=6.1.0" CUDA_Runtime_Discovery = "2" -CUDA_Runtime_jll = "0.22" +CUDA_Runtime_jll = "0.23" GPUToolbox = "1.1" julia = "1.10" diff --git a/lib/curand/Project.toml b/lib/curand/Project.toml index 5177bcb3c2..48d41f8016 100644 --- a/lib/curand/Project.toml +++ b/lib/curand/Project.toml @@ -20,7 +20,7 @@ RandomNumbers = "e6cf234a-135c-5ec9-84dd-332b85af5143" CEnum = "0.2, 0.3, 0.4, 0.5" CUDACore = "=6.1.0" CUDA_Runtime_Discovery = "2" -CUDA_Runtime_jll = "0.22" +CUDA_Runtime_jll = "0.23" GPUToolbox = "0.3, 1" Random = "1" Random123 = "1.2" diff --git a/lib/cusolver/Project.toml b/lib/cusolver/Project.toml index 9df0ef4147..3aa905b5c1 100644 --- a/lib/cusolver/Project.toml +++ b/lib/cusolver/Project.toml @@ -25,7 +25,7 @@ cuSPARSE = {path = "../cusparse"} CEnum = "0.2, 0.3, 0.4, 0.5" CUDACore = "=6.1.0" CUDA_Runtime_Discovery = "2" -CUDA_Runtime_jll = "0.22" +CUDA_Runtime_jll = "0.23" GPUToolbox = "0.3, 1" LinearAlgebra = "1" SparseArrays = "1" diff --git a/lib/cusparse/Project.toml b/lib/cusparse/Project.toml index 5f5125e9b5..edc866e4c0 100644 --- a/lib/cusparse/Project.toml +++ b/lib/cusparse/Project.toml @@ -29,7 +29,7 @@ Adapt = "4.4" CEnum = "0.2, 0.3, 0.4, 0.5" CUDACore = "=6.1.0" CUDA_Runtime_Discovery = "2" -CUDA_Runtime_jll = "0.22" +CUDA_Runtime_jll = "0.23" GPUArrays = "11.4.1" GPUToolbox = "0.3, 1" KernelAbstractions = "0.9.38" From 7f47c1d2cc4ea5cd15edca6aa8def3c7beec5069 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 27 May 2026 08:52:52 +0200 Subject: [PATCH 02/10] Update headers. --- CUDACore/lib/cudadrv/libcuda.jl | 208 +++++++- lib/cublas/src/libcublasLt.jl | 35 +- lib/cupti/src/libcupti.jl | 863 +++++++++++++++++++++----------- lib/cusolver/src/libcusolver.jl | 86 +++- lib/cusparse/src/libcusparse.jl | 68 +++ lib/cutensor/src/libcutensor.jl | 2 +- lib/nvml/src/libnvml.jl | 191 ++++++- res/wrap/Manifest.toml | 36 +- res/wrap/custatevec.toml | 3 - 9 files changed, 1146 insertions(+), 346 deletions(-) diff --git a/CUDACore/lib/cudadrv/libcuda.jl b/CUDACore/lib/cudadrv/libcuda.jl index 8d1c56df79..4055feb378 100644 --- a/CUDACore/lib/cudadrv/libcuda.jl +++ b/CUDACore/lib/cudadrv/libcuda.jl @@ -146,6 +146,7 @@ const CUdevice = CUdevice_v1 CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION = 915 CUDA_ERROR_KEY_ROTATION = 916 CUDA_ERROR_STREAM_DETACHED = 917 + CUDA_ERROR_GRAPH_RECAPTURE_FAILURE = 918 CUDA_ERROR_UNKNOWN = 999 end @@ -1243,6 +1244,7 @@ end CU_GRAPH_NODE_TYPE_MEM_FREE = 11 CU_GRAPH_NODE_TYPE_BATCH_MEM_OP = 12 CU_GRAPH_NODE_TYPE_CONDITIONAL = 13 + CU_GRAPH_NODE_TYPE_RESERVED_16 = 16 end const CUgraphNodeType = CUgraphNodeType_enum @@ -1268,6 +1270,7 @@ function Base.getproperty(x::Ptr{CUgraphNodeParams_st}, f::Symbol) f === :free && return Ptr{CUDA_MEM_FREE_NODE_PARAMS}(x + 16) f === :memOp && return Ptr{CUDA_BATCH_MEM_OP_NODE_PARAMS_v2}(x + 16) f === :conditional && return Ptr{CUDA_CONDITIONAL_NODE_PARAMS}(x + 16) + f === :asBytes && return Ptr{NTuple{232,Cchar}}(x + 16) f === :reserved2 && return Ptr{Clonglong}(x + 248) return getfield(x, f) end @@ -1286,7 +1289,7 @@ end function Base.propertynames(x::CUgraphNodeParams_st, private::Bool=false) return (:type, :reserved0, :reserved1, :kernel, :memcpy, :memset, :host, :graph, :eventWait, :eventRecord, :extSemSignal, :extSemWait, :alloc, :free, :memOp, - :conditional, :reserved2, if private + :conditional, :asBytes, :reserved2, if private fieldnames(typeof(x)) else () @@ -1931,7 +1934,13 @@ const CUfilter_mode = CUfilter_mode_enum CU_DEVICE_ATTRIBUTE_HOST_ALLOC_DMA_BUF_SUPPORTED = 146 CU_DEVICE_ATTRIBUTE_ONLY_PARTIAL_HOST_NATIVE_ATOMIC_SUPPORTED = 147 CU_DEVICE_ATTRIBUTE_ATOMIC_REDUCTION_SUPPORTED = 148 - CU_DEVICE_ATTRIBUTE_MAX = 149 + CU_DEVICE_ATTRIBUTE_D3D12_CIG_STREAMS_SUPPORTED = 151 + CU_DEVICE_ATTRIBUTE_DMA_BUF_MMAP_SUPPORTED = 152 + CU_DEVICE_ATTRIBUTE_LOGICAL_ENDPOINT_UNICAST_SUPPORTED = 153 + CU_DEVICE_ATTRIBUTE_LOGICAL_ENDPOINT_MULTICAST_SUPPORTED = 154 + CU_DEVICE_ATTRIBUTE_LOGICAL_ENDPOINT_COUNTED_OPS_SUPPORTED = 155 + CU_DEVICE_ATTRIBUTE_LOGICAL_ENDPOINT_UNICAST_ACCESS_ON_OWNER_DEVICE_SUPPORTED = 156 + CU_DEVICE_ATTRIBUTE_MAX = 157 end const CUdevice_attribute = CUdevice_attribute_enum @@ -1996,7 +2005,8 @@ const CUpointer_attribute = CUpointer_attribute_enum CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH = 13 CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED = 14 CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE = 15 - CU_FUNC_ATTRIBUTE_MAX = 16 + CU_FUNC_ATTRIBUTE_DEVICE_NODE_UPDATE_SUPPORTED = 16 + CU_FUNC_ATTRIBUTE_MAX = 17 end const CUfunction_attribute = CUfunction_attribute_enum @@ -3940,8 +3950,7 @@ const CUcheckpointGpuPair = CUcheckpointGpuPair_st struct CUcheckpointRestoreArgs_st gpuPairs::Ptr{CUcheckpointGpuPair} gpuPairsCount::Cuint - reserved::NTuple{44,Cchar} - reserved1::cuuint64_t + reserved::NTuple{52,Cchar} end const CUcheckpointRestoreArgs = CUcheckpointRestoreArgs_st @@ -4905,6 +4914,154 @@ end option::CUmulticastGranularity_flags)::CUresult end +const CUlogicalEndpointId = cuuint32_t + +@cenum CUlogicalEndpointIpcHandleType_enum::UInt32 begin + CU_LOGICAL_ENDPOINT_IPC_HANDLE_TYPE_NONE = 0 + CU_LOGICAL_ENDPOINT_IPC_HANDLE_TYPE_FABRIC = 1 +end + +const CUlogicalEndpointIpcHandleType = CUlogicalEndpointIpcHandleType_enum + +struct CUlogicalEndpointFabricHandle_st + data::NTuple{64,Cuchar} +end + +const CUlogicalEndpointFabricHandle = CUlogicalEndpointFabricHandle_st + +@cenum CUlogicalEndpointType_enum::UInt32 begin + CU_LOGICAL_ENDPOINT_TYPE_INVALID = 0 + CU_LOGICAL_ENDPOINT_TYPE_UNICAST = 1 + CU_LOGICAL_ENDPOINT_TYPE_MULTICAST = 2 +end + +const CUlogicalEndpointType = CUlogicalEndpointType_enum + +@cenum CUlogicalEndpointFlag_enum::UInt32 begin + CU_LOGICAL_ENDPOINT_FLAG_NONE = 0 + CU_LOGICAL_ENDPOINT_FLAG_COUNTED_OPS = 1 +end + +const CUlogicalEndpointFlag = CUlogicalEndpointFlag_enum + +struct CUlogicalEndpointProp_struct + data::NTuple{24,UInt8} +end + +function Base.getproperty(x::Ptr{CUlogicalEndpointProp_struct}, f::Symbol) + f === :type && return Ptr{CUlogicalEndpointType}(x + 0) + f === :unicast && return Ptr{Cvoid}(x + 4) + f === :multicast && return Ptr{Cvoid}(x + 4) + f === :size && return Ptr{Culonglong}(x + 8) + f === :ipcHandleTypes && return Ptr{Cuint}(x + 16) + f === :flags && return Ptr{Cuint}(x + 20) + return getfield(x, f) +end + +function Base.getproperty(x::CUlogicalEndpointProp_struct, f::Symbol) + r = Ref{CUlogicalEndpointProp_struct}(x) + ptr = Base.unsafe_convert(Ptr{CUlogicalEndpointProp_struct}, r) + fptr = getproperty(ptr, f) + GC.@preserve r unsafe_load(fptr) +end + +function Base.setproperty!(x::Ptr{CUlogicalEndpointProp_struct}, f::Symbol, v) + return unsafe_store!(getproperty(x, f), v) +end + +function Base.propertynames(x::CUlogicalEndpointProp_struct, private::Bool=false) + return (:type, :unicast, :multicast, :size, :ipcHandleTypes, :flags, + if private + fieldnames(typeof(x)) + else + () + end...) +end + +const CUlogicalEndpointProp = CUlogicalEndpointProp_struct + +@checked function cuLogicalEndpointIdReserve(baseLeId, count) + initialize_context() + @gcsafe_ccall libcuda.cuLogicalEndpointIdReserve(baseLeId::Ptr{CUlogicalEndpointId}, + count::cuuint32_t)::CUresult +end + +@checked function cuLogicalEndpointIdRelease(baseLeId, count) + initialize_context() + @gcsafe_ccall libcuda.cuLogicalEndpointIdRelease(baseLeId::CUlogicalEndpointId, + count::cuuint32_t)::CUresult +end + +@checked function cuLogicalEndpointCreate(leId, prop) + initialize_context() + @gcsafe_ccall libcuda.cuLogicalEndpointCreate(leId::CUlogicalEndpointId, + prop::Ptr{CUlogicalEndpointProp})::CUresult +end + +@checked function cuLogicalEndpointAddDevice(leId, dev) + initialize_context() + @gcsafe_ccall libcuda.cuLogicalEndpointAddDevice(leId::CUlogicalEndpointId, + dev::CUdevice)::CUresult +end + +@checked function cuLogicalEndpointDestroy(leId) + initialize_context() + @gcsafe_ccall libcuda.cuLogicalEndpointDestroy(leId::CUlogicalEndpointId)::CUresult +end + +@checked function cuLogicalEndpointBindAddr(leId, dev, offset, ptr, size, flags) + initialize_context() + @gcsafe_ccall libcuda.cuLogicalEndpointBindAddr(leId::CUlogicalEndpointId, + dev::CUdevice, offset::cuuint64_t, + ptr::Ptr{Cvoid}, size::cuuint64_t, + flags::Culonglong)::CUresult +end + +@checked function cuLogicalEndpointBindMem(leId, dev, offset, memHandle, memOffset, size, + flags) + initialize_context() + @gcsafe_ccall libcuda.cuLogicalEndpointBindMem(leId::CUlogicalEndpointId, dev::CUdevice, + offset::cuuint64_t, + memHandle::CUmemGenericAllocationHandle, + memOffset::cuuint64_t, size::cuuint64_t, + flags::Culonglong)::CUresult +end + +@checked function cuLogicalEndpointUnbind(leId, dev, offset, size) + initialize_context() + @gcsafe_ccall libcuda.cuLogicalEndpointUnbind(leId::CUlogicalEndpointId, dev::CUdevice, + offset::cuuint64_t, + size::cuuint64_t)::CUresult +end + +@checked function cuLogicalEndpointExport(handle, leId, handleType) + initialize_context() + @gcsafe_ccall libcuda.cuLogicalEndpointExport(handle::Ptr{Cvoid}, + leId::CUlogicalEndpointId, + handleType::CUlogicalEndpointIpcHandleType)::CUresult +end + +@checked function cuLogicalEndpointImport(leId, handle, handleType) + initialize_context() + @gcsafe_ccall libcuda.cuLogicalEndpointImport(leId::CUlogicalEndpointId, + handle::Ptr{Cvoid}, + handleType::CUlogicalEndpointIpcHandleType)::CUresult +end + +@checked function cuLogicalEndpointGetLimits(bindAlignment, maxSize, prop) + initialize_context() + @gcsafe_ccall libcuda.cuLogicalEndpointGetLimits(bindAlignment::Ptr{cuuint64_t}, + maxSize::Ptr{cuuint64_t}, + prop::Ptr{CUlogicalEndpointProp})::CUresult +end + +@checked function cuLogicalEndpointQuery(leId, count, queryStatus) + initialize_context() + @gcsafe_ccall libcuda.cuLogicalEndpointQuery(leId::CUlogicalEndpointId, + count::cuuint32_t, + queryStatus::Ptr{Cint})::CUresult +end + @checked function cuPointerGetAttribute(data, attribute, ptr) initialize_context() @gcsafe_ccall libcuda.cuPointerGetAttribute(data::Ptr{Cvoid}, @@ -5048,6 +5205,27 @@ end userData::Ptr{Cvoid}, flags::Cuint)::CUresult end +@cenum CUgraphRecaptureStatus_enum::UInt32 begin + CU_GRAPH_RECAPTURE_ELIGIBLE_FOR_UPDATE = 0 + CU_GRAPH_RECAPTURE_INELIGIBLE_FOR_UPDATE = 1 + CU_GRAPH_RECAPTURE_ERROR = 2 +end + +const CUgraphRecaptureStatus = CUgraphRecaptureStatus_enum + +# typedef CUresult ( CUDA_CB * CUgraphRecaptureCallback ) ( void * data , CUgraphNode node , const CUgraphNodeParams * originalParams , const CUgraphNodeParams * recaptureParams , CUgraphRecaptureStatus status ) +const CUgraphRecaptureCallback = Ptr{Cvoid} + +@checked function cuStreamBeginRecaptureToGraph(hStream, mode, hGraph, callbackFunc, + userData) + initialize_context() + @gcsafe_ccall libcuda.cuStreamBeginRecaptureToGraph(hStream::CUstream, + mode::CUstreamCaptureMode, + hGraph::CUgraph, + callbackFunc::CUgraphRecaptureCallback, + userData::Ptr{Cvoid})::CUresult +end + @checked function cuStreamBeginCaptureToGraph(hStream, hGraph, dependencies, dependencyData, numDependencies, mode) initialize_context() @@ -5279,13 +5457,6 @@ end kernelParams::Ptr{Ptr{Cvoid}})::CUresult end -@checked function cuLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags) - initialize_context() - @gcsafe_ccall libcuda.cuLaunchCooperativeKernelMultiDevice(launchParamsList::Ptr{CUDA_LAUNCH_PARAMS}, - numDevices::Cuint, - flags::Cuint)::CUresult -end - @checked function cuLaunchHostFunc(hStream, fn, userData) initialize_context() @gcsafe_ccall libcuda.cuLaunchHostFunc(hStream::CUstream, fn::CUhostFn, @@ -5350,6 +5521,13 @@ end grid_height::Cint, hStream::CUstream)::CUresult end +@checked function cuLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags) + initialize_context() + @gcsafe_ccall libcuda.cuLaunchCooperativeKernelMultiDevice(launchParamsList::Ptr{CUDA_LAUNCH_PARAMS}, + numDevices::Cuint, + flags::Cuint)::CUresult +end + @checked function cuParamSetTexRef(hfunc, texunit, hTexRef) initialize_context() @gcsafe_ccall libcuda.cuParamSetTexRef(hfunc::CUfunction, texunit::Cint, @@ -6312,7 +6490,7 @@ end const CUcoredumpSettings = CUcoredumpSettings_enum -@cenum CUCoredumpGenerationFlags::UInt32 begin +@cenum CUCoredumpGenerationFlags::Int32 begin CU_COREDUMP_DEFAULT_FLAGS = 0 CU_COREDUMP_SKIP_NONRELOCATED_ELF_IMAGES = 1 CU_COREDUMP_SKIP_GLOBAL_MEMORY = 2 @@ -6321,6 +6499,9 @@ const CUcoredumpSettings = CUcoredumpSettings_enum CU_COREDUMP_SKIP_ABORT = 16 CU_COREDUMP_SKIP_CONSTBANK_MEMORY = 32 CU_COREDUMP_GZIP_COMPRESS = 64 + CU_COREDUMP_FAULTED_CONTEXTS_ONLY = 128 + CU_COREDUMP_NO_ERRBAR_AT_EXIT = 1073741824 + CU_COREDUMP_LOG_ONLY = -2147483648 CU_COREDUMP_LIGHTWEIGHT_FLAGS = 47 end @@ -6394,6 +6575,7 @@ mutable struct CUdevResourceDesc_st end const CUdevResourceDesc = Ptr{CUdevResourceDesc_st} @cenum CUgreenCtxCreate_flags::UInt32 begin + CU_GREEN_CTX_NONE = 0 CU_GREEN_CTX_DEFAULT_STREAM = 1 end diff --git a/lib/cublas/src/libcublasLt.jl b/lib/cublas/src/libcublasLt.jl index 4094554ad6..493f1d5405 100644 --- a/lib/cublas/src/libcublasLt.jl +++ b/lib/cublas/src/libcublasLt.jl @@ -1318,6 +1318,25 @@ end returnAlgoCount::Ptr{Cint})::cublasStatus_t end +@checked function cublasLtMatmulAlgoGetHeuristicForStream(lightHandle, operationDesc, Adesc, + Bdesc, Cdesc, Ddesc, preference, + requestedAlgoCount, + heuristicResultsArray, + returnAlgoCount, stream) + initialize_context() + @gcsafe_ccall libcublasLt.cublasLtMatmulAlgoGetHeuristicForStream(lightHandle::cublasLtHandle_t, + operationDesc::cublasLtMatmulDesc_t, + Adesc::cublasLtMatrixLayout_t, + Bdesc::cublasLtMatrixLayout_t, + Cdesc::cublasLtMatrixLayout_t, + Ddesc::cublasLtMatrixLayout_t, + preference::cublasLtMatmulPreference_t, + requestedAlgoCount::Cint, + heuristicResultsArray::Ptr{cublasLtMatmulHeuristicResult_t}, + returnAlgoCount::Ptr{Cint}, + stream::cudaStream_t)::cublasStatus_t +end + @checked function cublasLtMatmulAlgoGetIds(lightHandle, computeType, scaleType, Atype, Btype, Ctype, Dtype, requestedAlgoCount, algoIdsArray, returnAlgoCount) @@ -1360,6 +1379,20 @@ end result::Ptr{cublasLtMatmulHeuristicResult_t})::cublasStatus_t end +@checked function cublasLtMatmulAlgoCheckForStream(lightHandle, operationDesc, Adesc, Bdesc, + Cdesc, Ddesc, algo, result, stream) + initialize_context() + @gcsafe_ccall libcublasLt.cublasLtMatmulAlgoCheckForStream(lightHandle::cublasLtHandle_t, + operationDesc::cublasLtMatmulDesc_t, + Adesc::cublasLtMatrixLayout_t, + Bdesc::cublasLtMatrixLayout_t, + Cdesc::cublasLtMatrixLayout_t, + Ddesc::cublasLtMatrixLayout_t, + algo::Ptr{cublasLtMatmulAlgo_t}, + result::Ptr{cublasLtMatmulHeuristicResult_t}, + stream::cudaStream_t)::cublasStatus_t +end + @cenum cublasLtMatmulAlgoCapAttributes_t::UInt32 begin CUBLASLT_ALGO_CAP_SPLITK_SUPPORT = 0 CUBLASLT_ALGO_CAP_REDUCTION_SCHEME_MASK = 1 @@ -1452,7 +1485,7 @@ end @gcsafe_ccall libcublasLt.cublasLtLoggerSetMask(mask::Cint)::cublasStatus_t end -# no prototype is found for this function at cublasLt.h:2795:29, please use with caution +# no prototype is found for this function at cublasLt.h:2869:29, please use with caution @checked function cublasLtLoggerForceDisable() initialize_context() @gcsafe_ccall libcublasLt.cublasLtLoggerForceDisable()::cublasStatus_t diff --git a/lib/cupti/src/libcupti.jl b/lib/cupti/src/libcupti.jl index 4ff3cd6f3b..c5895684f6 100644 --- a/lib/cupti/src/libcupti.jl +++ b/lib/cupti/src/libcupti.jl @@ -214,27 +214,27 @@ struct CUpti_CallbackData correlationId::UInt32 end -struct var"##Ctag#425" +struct var"##Ctag#419" data::NTuple{8,UInt8} end -function Base.getproperty(x::Ptr{var"##Ctag#425"}, f::Symbol) +function Base.getproperty(x::Ptr{var"##Ctag#419"}, f::Symbol) f === :stream && return Ptr{CUstream}(x + 0) return getfield(x, f) end -function Base.getproperty(x::var"##Ctag#425", f::Symbol) - r = Ref{var"##Ctag#425"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#425"}, r) +function Base.getproperty(x::var"##Ctag#419", f::Symbol) + r = Ref{var"##Ctag#419"}(x) + ptr = Base.unsafe_convert(Ptr{var"##Ctag#419"}, r) fptr = getproperty(ptr, f) GC.@preserve r unsafe_load(fptr) end -function Base.setproperty!(x::Ptr{var"##Ctag#425"}, f::Symbol, v) +function Base.setproperty!(x::Ptr{var"##Ctag#419"}, f::Symbol, v) return unsafe_store!(getproperty(x, f), v) end -function Base.propertynames(x::var"##Ctag#425", private::Bool=false) +function Base.propertynames(x::var"##Ctag#419", private::Bool=false) return (:stream, if private fieldnames(typeof(x)) else @@ -248,7 +248,7 @@ end function Base.getproperty(x::Ptr{CUpti_ResourceData}, f::Symbol) f === :context && return Ptr{CUcontext}(x + 0) - f === :resourceHandle && return Ptr{var"##Ctag#425"}(x + 8) + f === :resourceHandle && return Ptr{var"##Ctag#419"}(x + 8) f === :resourceDescriptor && return Ptr{Ptr{Cvoid}}(x + 16) return getfield(x, f) end @@ -1036,8 +1036,8 @@ struct CUpti_ActivityObjectKindId end function Base.getproperty(x::Ptr{CUpti_ActivityObjectKindId}, f::Symbol) - f === :pt && return Ptr{var"##Ctag#441"}(x + 0) - f === :dcs && return Ptr{var"##Ctag#442"}(x + 0) + f === :pt && return Ptr{var"##Ctag#437"}(x + 0) + f === :dcs && return Ptr{var"##Ctag#438"}(x + 0) return getfield(x, f) end @@ -1956,28 +1956,36 @@ end CUPTI_FUNC_SHMEM_LIMIT_FORCE_INT = 2147483647 end -struct var"##Ctag#443" +@cenum CUpti_FuncExecutionModel::UInt32 begin + CUPTI_FUNC_EXECUTION_MODEL_UNKNOWN = 0 + CUPTI_FUNC_EXECUTION_MODEL_SIMT = 1 + CUPTI_FUNC_EXECUTION_MODEL_TILE = 2 + CUPTI_FUNC_EXECUTION_MODEL_SIZE = 3 + CUPTI_FUNC_EXECUTION_MODEL_FORCE_INT = 2147483647 +end + +struct var"##Ctag#424" data::NTuple{1,UInt8} end -function Base.getproperty(x::Ptr{var"##Ctag#443"}, f::Symbol) +function Base.getproperty(x::Ptr{var"##Ctag#424"}, f::Symbol) f === :both && return Ptr{UInt8}(x + 0) - f === :config && return Ptr{var"##Ctag#444"}(x + 0) + f === :config && return Ptr{var"##Ctag#425"}(x + 0) return getfield(x, f) end -function Base.getproperty(x::var"##Ctag#443", f::Symbol) - r = Ref{var"##Ctag#443"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#443"}, r) +function Base.getproperty(x::var"##Ctag#424", f::Symbol) + r = Ref{var"##Ctag#424"}(x) + ptr = Base.unsafe_convert(Ptr{var"##Ctag#424"}, r) fptr = getproperty(ptr, f) GC.@preserve r unsafe_load(fptr) end -function Base.setproperty!(x::Ptr{var"##Ctag#443"}, f::Symbol, v) +function Base.setproperty!(x::Ptr{var"##Ctag#424"}, f::Symbol, v) return unsafe_store!(getproperty(x, f), v) end -function Base.propertynames(x::var"##Ctag#443", private::Bool=false) +function Base.propertynames(x::var"##Ctag#424", private::Bool=false) return (:both, :config, if private fieldnames(typeof(x)) else @@ -1985,13 +1993,13 @@ function Base.propertynames(x::var"##Ctag#443", private::Bool=false) end...) end -struct CUpti_ActivityKernel11 - data::NTuple{224,UInt8} +struct CUpti_ActivityKernel12 + data::NTuple{232,UInt8} end -function Base.getproperty(x::Ptr{CUpti_ActivityKernel11}, f::Symbol) +function Base.getproperty(x::Ptr{CUpti_ActivityKernel12}, f::Symbol) f === :kind && return Ptr{CUpti_ActivityKind}(x + 0) - f === :cacheConfig && return Ptr{var"##Ctag#443"}(x + 4) + f === :cacheConfig && return Ptr{var"##Ctag#424"}(x + 4) f === :sharedMemoryConfig && return Ptr{UInt8}(x + 5) f === :registersPerThread && return Ptr{UInt16}(x + 6) f === :partitionedGlobalCacheRequested && @@ -2042,21 +2050,23 @@ function Base.getproperty(x::Ptr{CUpti_ActivityKernel11}, f::Symbol) f === :padding3 && return Ptr{NTuple{7,UInt8}}(x + 209) f === :priority && return Ptr{Int32}(x + 216) f === :padding4 && return Ptr{UInt32}(x + 220) + f === :executionModel && return Ptr{UInt32}(x + 224) + f === :padding5 && return Ptr{UInt32}(x + 228) return getfield(x, f) end -function Base.getproperty(x::CUpti_ActivityKernel11, f::Symbol) - r = Ref{CUpti_ActivityKernel11}(x) - ptr = Base.unsafe_convert(Ptr{CUpti_ActivityKernel11}, r) +function Base.getproperty(x::CUpti_ActivityKernel12, f::Symbol) + r = Ref{CUpti_ActivityKernel12}(x) + ptr = Base.unsafe_convert(Ptr{CUpti_ActivityKernel12}, r) fptr = getproperty(ptr, f) GC.@preserve r unsafe_load(fptr) end -function Base.setproperty!(x::Ptr{CUpti_ActivityKernel11}, f::Symbol, v) +function Base.setproperty!(x::Ptr{CUpti_ActivityKernel12}, f::Symbol, v) return unsafe_store!(getproperty(x, f), v) end -function Base.propertynames(x::CUpti_ActivityKernel11, private::Bool=false) +function Base.propertynames(x::CUpti_ActivityKernel12, private::Bool=false) return (:kind, :cacheConfig, :sharedMemoryConfig, :registersPerThread, :partitionedGlobalCacheRequested, :partitionedGlobalCacheExecuted, :start, :_end, :completed, :deviceId, :contextId, :streamId, :gridX, :gridY, :gridZ, @@ -2067,7 +2077,7 @@ function Base.propertynames(x::CUpti_ActivityKernel11, private::Bool=false) :shmemLimitConfig, :graphId, :pAccessPolicyWindow, :channelID, :channelType, :clusterX, :clusterY, :clusterZ, :clusterSchedulingPolicy, :localMemoryTotal_v2, :maxPotentialClusterSize, :maxActiveClusters, :isDeviceLaunched, :padding3, - :priority, :padding4, if private + :priority, :padding4, :executionModel, :padding5, if private fieldnames(typeof(x)) else () @@ -2121,7 +2131,8 @@ end KERNEL_FIELD_MAX_ACTIVE_CLUSTERS = 43 KERNEL_FIELD_IS_DEVICE_LAUNCHED = 44 KERNEL_FIELD_LAUNCH_PRIORITY = 45 - KERNEL_FIELD_MAX = 46 + KERNEL_FIELD_EXECUTION_MODEL = 46 + KERNEL_FIELD_MAX = 47 end struct var"##Ctag#351" @@ -2660,28 +2671,28 @@ end DEVICE_FIELD_MAX = 38 end -struct var"##Ctag#411" +struct var"##Ctag#405" data::NTuple{4,UInt8} end -function Base.getproperty(x::Ptr{var"##Ctag#411"}, f::Symbol) +function Base.getproperty(x::Ptr{var"##Ctag#405"}, f::Symbol) f === :cu && return Ptr{CUdevice_attribute}(x + 0) f === :cupti && return Ptr{CUpti_DeviceAttribute}(x + 0) return getfield(x, f) end -function Base.getproperty(x::var"##Ctag#411", f::Symbol) - r = Ref{var"##Ctag#411"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#411"}, r) +function Base.getproperty(x::var"##Ctag#405", f::Symbol) + r = Ref{var"##Ctag#405"}(x) + ptr = Base.unsafe_convert(Ptr{var"##Ctag#405"}, r) fptr = getproperty(ptr, f) GC.@preserve r unsafe_load(fptr) end -function Base.setproperty!(x::Ptr{var"##Ctag#411"}, f::Symbol, v) +function Base.setproperty!(x::Ptr{var"##Ctag#405"}, f::Symbol, v) return unsafe_store!(getproperty(x, f), v) end -function Base.propertynames(x::var"##Ctag#411", private::Bool=false) +function Base.propertynames(x::var"##Ctag#405", private::Bool=false) return (:cu, :cupti, if private fieldnames(typeof(x)) else @@ -2689,11 +2700,11 @@ function Base.propertynames(x::var"##Ctag#411", private::Bool=false) end...) end -struct var"##Ctag#412" +struct var"##Ctag#406" data::NTuple{8,UInt8} end -function Base.getproperty(x::Ptr{var"##Ctag#412"}, f::Symbol) +function Base.getproperty(x::Ptr{var"##Ctag#406"}, f::Symbol) f === :vDouble && return Ptr{Cdouble}(x + 0) f === :vUint32 && return Ptr{UInt32}(x + 0) f === :vUint64 && return Ptr{UInt64}(x + 0) @@ -2702,18 +2713,18 @@ function Base.getproperty(x::Ptr{var"##Ctag#412"}, f::Symbol) return getfield(x, f) end -function Base.getproperty(x::var"##Ctag#412", f::Symbol) - r = Ref{var"##Ctag#412"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#412"}, r) +function Base.getproperty(x::var"##Ctag#406", f::Symbol) + r = Ref{var"##Ctag#406"}(x) + ptr = Base.unsafe_convert(Ptr{var"##Ctag#406"}, r) fptr = getproperty(ptr, f) GC.@preserve r unsafe_load(fptr) end -function Base.setproperty!(x::Ptr{var"##Ctag#412"}, f::Symbol, v) +function Base.setproperty!(x::Ptr{var"##Ctag#406"}, f::Symbol, v) return unsafe_store!(getproperty(x, f), v) end -function Base.propertynames(x::var"##Ctag#412", private::Bool=false) +function Base.propertynames(x::var"##Ctag#406", private::Bool=false) return (:vDouble, :vUint32, :vUint64, :vInt32, :vInt64, if private fieldnames(typeof(x)) else @@ -2729,8 +2740,8 @@ function Base.getproperty(x::Ptr{CUpti_ActivityDeviceAttribute}, f::Symbol) f === :kind && return Ptr{CUpti_ActivityKind}(x + 0) f === :flags && return Ptr{CUpti_ActivityFlag}(x + 4) f === :deviceId && return Ptr{UInt32}(x + 8) - f === :attribute && return Ptr{var"##Ctag#411"}(x + 12) - f === :value && return Ptr{var"##Ctag#412"}(x + 16) + f === :attribute && return Ptr{var"##Ctag#405"}(x + 12) + f === :value && return Ptr{var"##Ctag#406"}(x + 16) return getfield(x, f) end @@ -2995,15 +3006,36 @@ end OVERHEAD_FIELD_MAX = 8 end +struct CUpti_ActivityEnvironmentSpeed + smClock::UInt32 + memoryClock::UInt32 + pcieLinkGen::UInt32 + pcieLinkWidth::UInt32 + clocksThrottleReasons::CUpti_EnvironmentClocksThrottleReason +end + +struct CUpti_ActivityEnvironmentTemperature + gpuTemperature::UInt32 +end + +struct CUpti_ActivityEnvironmentPower + power::UInt32 + powerLimit::UInt32 +end + +struct CUpti_ActivityEnvironmentCooling + fanSpeed::UInt32 +end + struct var"##Ctag#400" data::NTuple{20,UInt8} end function Base.getproperty(x::Ptr{var"##Ctag#400"}, f::Symbol) - f === :speed && return Ptr{var"##Ctag#401"}(x + 0) - f === :temperature && return Ptr{var"##Ctag#402"}(x + 0) - f === :power && return Ptr{var"##Ctag#403"}(x + 0) - f === :cooling && return Ptr{var"##Ctag#404"}(x + 0) + f === :speed && return Ptr{CUpti_ActivityEnvironmentSpeed}(x + 0) + f === :temperature && return Ptr{CUpti_ActivityEnvironmentTemperature}(x + 0) + f === :power && return Ptr{CUpti_ActivityEnvironmentPower}(x + 0) + f === :cooling && return Ptr{CUpti_ActivityEnvironmentCooling}(x + 0) return getfield(x, f) end @@ -3059,6 +3091,15 @@ function Base.propertynames(x::CUpti_ActivityEnvironment, private::Bool=false) end...) end +@cenum CUpti_ActivityEnvironmentFieldIds::UInt32 begin + ENVIRONMENT_FIELD_KIND = 0 + ENVIRONMENT_FIELD_DEVICE_ID = 1 + ENVIRONMENT_FIELD_TIMESTAMP = 2 + ENVIRONMENT_FIELD_ENVIRONMENT_KIND = 3 + ENVIRONMENT_FIELD_ENVIRONMENT_KIND_DATA = 4 + ENVIRONMENT_FIELD_MAX = 5 +end + struct CUpti_ActivityInstructionExecution data::NTuple{48,UInt8} end @@ -3682,6 +3723,38 @@ function Base.propertynames(x::CUpti_ActivityOpenAccData, private::Bool=false) end...) end +@cenum CUpti_ActivityOpenAccDataFieldIds::UInt32 begin + OPENACC_DATA_FIELD_KIND = 0 + OPENACC_DATA_FIELD_EVENT_KIND = 1 + OPENACC_DATA_FIELD_PARENT_CONSTRUCT = 2 + OPENACC_DATA_FIELD_VERSION = 3 + OPENACC_DATA_FIELD_IMPLICIT = 4 + OPENACC_DATA_FIELD_DEVICE_TYPE = 5 + OPENACC_DATA_FIELD_DEVICE_NUMBER = 6 + OPENACC_DATA_FIELD_THREAD_ID = 7 + OPENACC_DATA_FIELD_ASYNC = 8 + OPENACC_DATA_FIELD_ASYNC_MAP = 9 + OPENACC_DATA_FIELD_LINE_NO = 10 + OPENACC_DATA_FIELD_END_LINE_NO = 11 + OPENACC_DATA_FIELD_FUNC_LINE_NO = 12 + OPENACC_DATA_FIELD_FUNC_END_LINE_NO = 13 + OPENACC_DATA_FIELD_START = 14 + OPENACC_DATA_FIELD_END = 15 + OPENACC_DATA_FIELD_CU_DEVICE_ID = 16 + OPENACC_DATA_FIELD_CU_CONTEXT_ID = 17 + OPENACC_DATA_FIELD_CU_STREAM_ID = 18 + OPENACC_DATA_FIELD_CU_PROCESS_ID = 19 + OPENACC_DATA_FIELD_CU_THREAD_ID = 20 + OPENACC_DATA_FIELD_EXTERNAL_ID = 21 + OPENACC_DATA_FIELD_SRC_FILE = 22 + OPENACC_DATA_FIELD_FUNC_NAME = 23 + OPENACC_DATA_FIELD_BYTES = 24 + OPENACC_DATA_FIELD_HOST_PTR = 25 + OPENACC_DATA_FIELD_DEVICE_PTR = 26 + OPENACC_DATA_FIELD_VAR_NAME = 27 + OPENACC_DATA_FIELD_MAX = 28 +end + struct CUpti_ActivityOpenAccLaunch data::NTuple{152,UInt8} end @@ -3741,6 +3814,38 @@ function Base.propertynames(x::CUpti_ActivityOpenAccLaunch, private::Bool=false) end...) end +@cenum CUpti_ActivityOpenAccLaunchFieldIds::UInt32 begin + OPENACC_LAUNCH_FIELD_KIND = 0 + OPENACC_LAUNCH_FIELD_EVENT_KIND = 1 + OPENACC_LAUNCH_FIELD_PARENT_CONSTRUCT = 2 + OPENACC_LAUNCH_FIELD_VERSION = 3 + OPENACC_LAUNCH_FIELD_IMPLICIT = 4 + OPENACC_LAUNCH_FIELD_DEVICE_TYPE = 5 + OPENACC_LAUNCH_FIELD_DEVICE_NUMBER = 6 + OPENACC_LAUNCH_FIELD_THREAD_ID = 7 + OPENACC_LAUNCH_FIELD_ASYNC = 8 + OPENACC_LAUNCH_FIELD_ASYNC_MAP = 9 + OPENACC_LAUNCH_FIELD_LINE_NO = 10 + OPENACC_LAUNCH_FIELD_END_LINE_NO = 11 + OPENACC_LAUNCH_FIELD_FUNC_LINE_NO = 12 + OPENACC_LAUNCH_FIELD_FUNC_END_LINE_NO = 13 + OPENACC_LAUNCH_FIELD_START = 14 + OPENACC_LAUNCH_FIELD_END = 15 + OPENACC_LAUNCH_FIELD_CU_DEVICE_ID = 16 + OPENACC_LAUNCH_FIELD_CU_CONTEXT_ID = 17 + OPENACC_LAUNCH_FIELD_CU_STREAM_ID = 18 + OPENACC_LAUNCH_FIELD_CU_PROCESS_ID = 19 + OPENACC_LAUNCH_FIELD_CU_THREAD_ID = 20 + OPENACC_LAUNCH_FIELD_EXTERNAL_ID = 21 + OPENACC_LAUNCH_FIELD_SRC_FILE = 22 + OPENACC_LAUNCH_FIELD_FUNC_NAME = 23 + OPENACC_LAUNCH_FIELD_NUM_GANGS = 24 + OPENACC_LAUNCH_FIELD_NUM_WORKERS = 25 + OPENACC_LAUNCH_FIELD_VECTOR_LENGTH = 26 + OPENACC_LAUNCH_FIELD_KERNEL_NAME = 27 + OPENACC_LAUNCH_FIELD_MAX = 28 +end + struct CUpti_ActivityOpenAccOther data::NTuple{120,UInt8} end @@ -3796,6 +3901,34 @@ function Base.propertynames(x::CUpti_ActivityOpenAccOther, private::Bool=false) end...) end +@cenum CUpti_ActivityOpenAccOtherFieldIds::UInt32 begin + OPENACC_OTHER_FIELD_KIND = 0 + OPENACC_OTHER_FIELD_EVENT_KIND = 1 + OPENACC_OTHER_FIELD_PARENT_CONSTRUCT = 2 + OPENACC_OTHER_FIELD_VERSION = 3 + OPENACC_OTHER_FIELD_IMPLICIT = 4 + OPENACC_OTHER_FIELD_DEVICE_TYPE = 5 + OPENACC_OTHER_FIELD_DEVICE_NUMBER = 6 + OPENACC_OTHER_FIELD_THREAD_ID = 7 + OPENACC_OTHER_FIELD_ASYNC = 8 + OPENACC_OTHER_FIELD_ASYNC_MAP = 9 + OPENACC_OTHER_FIELD_LINE_NO = 10 + OPENACC_OTHER_FIELD_END_LINE_NO = 11 + OPENACC_OTHER_FIELD_FUNC_LINE_NO = 12 + OPENACC_OTHER_FIELD_FUNC_END_LINE_NO = 13 + OPENACC_OTHER_FIELD_START = 14 + OPENACC_OTHER_FIELD_END = 15 + OPENACC_OTHER_FIELD_CU_DEVICE_ID = 16 + OPENACC_OTHER_FIELD_CU_CONTEXT_ID = 17 + OPENACC_OTHER_FIELD_CU_STREAM_ID = 18 + OPENACC_OTHER_FIELD_CU_PROCESS_ID = 19 + OPENACC_OTHER_FIELD_CU_THREAD_ID = 20 + OPENACC_OTHER_FIELD_EXTERNAL_ID = 21 + OPENACC_OTHER_FIELD_SRC_FILE = 22 + OPENACC_OTHER_FIELD_FUNC_NAME = 23 + OPENACC_OTHER_FIELD_MAX = 24 +end + struct CUpti_ActivityOpenMp data::NTuple{40,UInt8} end @@ -3832,6 +3965,16 @@ function Base.propertynames(x::CUpti_ActivityOpenMp, private::Bool=false) end...) end +@cenum CUpti_ActivityOpenMpFieldIds::UInt32 begin + OPENMP_FIELD_KIND = 0 + OPENMP_FIELD_EVENT_KIND = 1 + OPENMP_FIELD_START = 2 + OPENMP_FIELD_END = 3 + OPENMP_FIELD_CU_THREAD_ID = 4 + OPENMP_FIELD_CU_PROCESS_ID = 5 + OPENMP_FIELD_MAX = 6 +end + @cenum CUpti_ExternalCorrelationKind::UInt32 begin CUPTI_EXTERNAL_CORRELATION_KIND_INVALID = 0 CUPTI_EXTERNAL_CORRELATION_KIND_UNKNOWN = 1 @@ -3891,28 +4034,33 @@ end CUPTI_DEV_TYPE_FORCE_INT = 2147483647 end -struct var"##Ctag#445" +struct CUpti_ActivityNvLinkNpu + index::UInt32 + domainId::UInt32 +end + +struct var"##Ctag#441" data::NTuple{16,UInt8} end -function Base.getproperty(x::Ptr{var"##Ctag#445"}, f::Symbol) +function Base.getproperty(x::Ptr{var"##Ctag#441"}, f::Symbol) f === :uuidDev && return Ptr{CUuuid}(x + 0) - f === :npu && return Ptr{var"##Ctag#446"}(x + 0) + f === :npu && return Ptr{CUpti_ActivityNvLinkNpu}(x + 0) return getfield(x, f) end -function Base.getproperty(x::var"##Ctag#445", f::Symbol) - r = Ref{var"##Ctag#445"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#445"}, r) +function Base.getproperty(x::var"##Ctag#441", f::Symbol) + r = Ref{var"##Ctag#441"}(x) + ptr = Base.unsafe_convert(Ptr{var"##Ctag#441"}, r) fptr = getproperty(ptr, f) GC.@preserve r unsafe_load(fptr) end -function Base.setproperty!(x::Ptr{var"##Ctag#445"}, f::Symbol, v) +function Base.setproperty!(x::Ptr{var"##Ctag#441"}, f::Symbol, v) return unsafe_store!(getproperty(x, f), v) end -function Base.propertynames(x::var"##Ctag#445", private::Bool=false) +function Base.propertynames(x::var"##Ctag#441", private::Bool=false) return (:uuidDev, :npu, if private fieldnames(typeof(x)) else @@ -3920,28 +4068,28 @@ function Base.propertynames(x::var"##Ctag#445", private::Bool=false) end...) end -struct var"##Ctag#447" +struct var"##Ctag#442" data::NTuple{16,UInt8} end -function Base.getproperty(x::Ptr{var"##Ctag#447"}, f::Symbol) +function Base.getproperty(x::Ptr{var"##Ctag#442"}, f::Symbol) f === :uuidDev && return Ptr{CUuuid}(x + 0) - f === :npu && return Ptr{var"##Ctag#448"}(x + 0) + f === :npu && return Ptr{CUpti_ActivityNvLinkNpu}(x + 0) return getfield(x, f) end -function Base.getproperty(x::var"##Ctag#447", f::Symbol) - r = Ref{var"##Ctag#447"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#447"}, r) +function Base.getproperty(x::var"##Ctag#442", f::Symbol) + r = Ref{var"##Ctag#442"}(x) + ptr = Base.unsafe_convert(Ptr{var"##Ctag#442"}, r) fptr = getproperty(ptr, f) GC.@preserve r unsafe_load(fptr) end -function Base.setproperty!(x::Ptr{var"##Ctag#447"}, f::Symbol, v) +function Base.setproperty!(x::Ptr{var"##Ctag#442"}, f::Symbol, v) return unsafe_store!(getproperty(x, f), v) end -function Base.propertynames(x::var"##Ctag#447", private::Bool=false) +function Base.propertynames(x::var"##Ctag#442", private::Bool=false) return (:uuidDev, :npu, if private fieldnames(typeof(x)) else @@ -3958,8 +4106,8 @@ function Base.getproperty(x::Ptr{CUpti_ActivityNvLink5}, f::Symbol) f === :nvlinkVersion && return Ptr{UInt32}(x + 4) f === :typeDev0 && return Ptr{CUpti_DevType}(x + 8) f === :typeDev1 && return Ptr{CUpti_DevType}(x + 12) - f === :idDev0 && return Ptr{var"##Ctag#445"}(x + 16) - f === :idDev1 && return Ptr{var"##Ctag#447"}(x + 32) + f === :idDev0 && return Ptr{var"##Ctag#441"}(x + 16) + f === :idDev1 && return Ptr{var"##Ctag#442"}(x + 32) f === :flag && return Ptr{UInt32}(x + 48) f === :physicalNvLinkCount && return Ptr{UInt32}(x + 52) f === :portDev0 && return Ptr{Ptr{UInt32}}(x + 56) @@ -3991,34 +4139,62 @@ function Base.propertynames(x::CUpti_ActivityNvLink5, private::Bool=false) end...) end +@cenum CUpti_ActivityNvLinkFieldIds::UInt32 begin + NVLINK_FIELD_KIND = 0 + NVLINK_FIELD_NVLINK_VERSION = 1 + NVLINK_FIELD_TYPE_DEV0 = 2 + NVLINK_FIELD_TYPE_DEV1 = 3 + NVLINK_FIELD_ID_DEV0 = 4 + NVLINK_FIELD_ID_DEV1 = 5 + NVLINK_FIELD_FLAG = 6 + NVLINK_FIELD_PHYSICAL_NVLINK_COUNT = 7 + NVLINK_FIELD_PORT_DEV0 = 8 + NVLINK_FIELD_PORT_DEV1 = 9 + NVLINK_FIELD_BANDWIDTH = 10 + NVLINK_FIELD_NVSWITCH_CONNECTED = 11 + NVLINK_FIELD_MAX = 12 +end + @cenum CUpti_PcieDeviceType::UInt32 begin CUPTI_PCIE_DEVICE_TYPE_GPU = 0 CUPTI_PCIE_DEVICE_TYPE_BRIDGE = 1 CUPTI_PCIE_DEVICE_TYPE_FORCE_INT = 2147483647 end -struct var"##Ctag#405" +struct CUpti_ActivityPcieGpuAttr + uuidDev::CUuuid + peerDev::NTuple{32,CUdevice} +end + +struct CUpti_ActivityPcieBridgeAttr + secondaryBus::UInt16 + deviceId::UInt16 + vendorId::UInt16 + pad0::UInt16 +end + +struct var"##Ctag#401" data::NTuple{4,UInt8} end -function Base.getproperty(x::Ptr{var"##Ctag#405"}, f::Symbol) +function Base.getproperty(x::Ptr{var"##Ctag#401"}, f::Symbol) f === :devId && return Ptr{CUdevice}(x + 0) f === :bridgeId && return Ptr{UInt32}(x + 0) return getfield(x, f) end -function Base.getproperty(x::var"##Ctag#405", f::Symbol) - r = Ref{var"##Ctag#405"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#405"}, r) +function Base.getproperty(x::var"##Ctag#401", f::Symbol) + r = Ref{var"##Ctag#401"}(x) + ptr = Base.unsafe_convert(Ptr{var"##Ctag#401"}, r) fptr = getproperty(ptr, f) GC.@preserve r unsafe_load(fptr) end -function Base.setproperty!(x::Ptr{var"##Ctag#405"}, f::Symbol, v) +function Base.setproperty!(x::Ptr{var"##Ctag#401"}, f::Symbol, v) return unsafe_store!(getproperty(x, f), v) end -function Base.propertynames(x::var"##Ctag#405", private::Bool=false) +function Base.propertynames(x::var"##Ctag#401", private::Bool=false) return (:devId, :bridgeId, if private fieldnames(typeof(x)) else @@ -4026,28 +4202,28 @@ function Base.propertynames(x::var"##Ctag#405", private::Bool=false) end...) end -struct var"##Ctag#406" +struct var"##Ctag#402" data::NTuple{144,UInt8} end -function Base.getproperty(x::Ptr{var"##Ctag#406"}, f::Symbol) - f === :gpuAttr && return Ptr{var"##Ctag#407"}(x + 0) - f === :bridgeAttr && return Ptr{var"##Ctag#408"}(x + 0) +function Base.getproperty(x::Ptr{var"##Ctag#402"}, f::Symbol) + f === :gpuAttr && return Ptr{CUpti_ActivityPcieGpuAttr}(x + 0) + f === :bridgeAttr && return Ptr{CUpti_ActivityPcieBridgeAttr}(x + 0) return getfield(x, f) end -function Base.getproperty(x::var"##Ctag#406", f::Symbol) - r = Ref{var"##Ctag#406"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#406"}, r) +function Base.getproperty(x::var"##Ctag#402", f::Symbol) + r = Ref{var"##Ctag#402"}(x) + ptr = Base.unsafe_convert(Ptr{var"##Ctag#402"}, r) fptr = getproperty(ptr, f) GC.@preserve r unsafe_load(fptr) end -function Base.setproperty!(x::Ptr{var"##Ctag#406"}, f::Symbol, v) +function Base.setproperty!(x::Ptr{var"##Ctag#402"}, f::Symbol, v) return unsafe_store!(getproperty(x, f), v) end -function Base.propertynames(x::var"##Ctag#406", private::Bool=false) +function Base.propertynames(x::var"##Ctag#402", private::Bool=false) return (:gpuAttr, :bridgeAttr, if private fieldnames(typeof(x)) else @@ -4062,13 +4238,13 @@ end function Base.getproperty(x::Ptr{CUpti_ActivityPcie}, f::Symbol) f === :kind && return Ptr{CUpti_ActivityKind}(x + 0) f === :type && return Ptr{CUpti_PcieDeviceType}(x + 4) - f === :id && return Ptr{var"##Ctag#405"}(x + 8) + f === :id && return Ptr{var"##Ctag#401"}(x + 8) f === :domain && return Ptr{UInt32}(x + 12) f === :pcieGeneration && return Ptr{UInt16}(x + 16) f === :linkRate && return Ptr{UInt16}(x + 18) f === :linkWidth && return Ptr{UInt16}(x + 20) f === :upstreamBus && return Ptr{UInt16}(x + 22) - f === :attr && return Ptr{var"##Ctag#406"}(x + 24) + f === :attr && return Ptr{var"##Ctag#402"}(x + 24) return getfield(x, f) end @@ -4092,6 +4268,19 @@ function Base.propertynames(x::CUpti_ActivityPcie, private::Bool=false) end...) end +@cenum CUpti_ActivityPcieFieldIds::UInt32 begin + PCIE_FIELD_KIND = 0 + PCIE_FIELD_TYPE = 1 + PCIE_FIELD_ID = 2 + PCIE_FIELD_DOMAIN = 3 + PCIE_FIELD_PCIE_GENERATION = 4 + PCIE_FIELD_LINK_RATE = 5 + PCIE_FIELD_LINK_WIDTH = 6 + PCIE_FIELD_UPSTREAM_BUS = 7 + PCIE_FIELD_ATTR = 8 + PCIE_FIELD_MAX = 9 +end + @cenum CUpti_PcieGen::UInt32 begin CUPTI_PCIE_GEN_GEN1 = 1 CUPTI_PCIE_GEN_GEN2 = 2 @@ -4541,11 +4730,11 @@ end HOST_LAUNCH_FIELD_MAX = 9 end -struct CUpti_ActivityGreenContext - data::NTuple{152,UInt8} +struct CUpti_ActivityGreenContext2 + data::NTuple{168,UInt8} end -function Base.getproperty(x::Ptr{CUpti_ActivityGreenContext}, f::Symbol) +function Base.getproperty(x::Ptr{CUpti_ActivityGreenContext2}, f::Symbol) f === :kind && return Ptr{CUpti_ActivityKind}(x + 0) f === :contextId && return Ptr{UInt32}(x + 4) f === :parentContextId && return Ptr{UInt32}(x + 8) @@ -4555,23 +4744,28 @@ function Base.getproperty(x::Ptr{CUpti_ActivityGreenContext}, f::Symbol) f === :logicalTpcMaskSize && return Ptr{UInt8}(x + 22) f === :padding && return Ptr{UInt8}(x + 23) f === :logicalTpcMask && return Ptr{NTuple{32,UInt32}}(x + 24) + f === :workqueueResourceId && return Ptr{UInt64}(x + 152) + f === :workqueueConcurrencyLimit && return Ptr{UInt32}(x + 160) + f === :workqueueSharingScope && return Ptr{UInt32}(x + 164) return getfield(x, f) end -function Base.getproperty(x::CUpti_ActivityGreenContext, f::Symbol) - r = Ref{CUpti_ActivityGreenContext}(x) - ptr = Base.unsafe_convert(Ptr{CUpti_ActivityGreenContext}, r) +function Base.getproperty(x::CUpti_ActivityGreenContext2, f::Symbol) + r = Ref{CUpti_ActivityGreenContext2}(x) + ptr = Base.unsafe_convert(Ptr{CUpti_ActivityGreenContext2}, r) fptr = getproperty(ptr, f) GC.@preserve r unsafe_load(fptr) end -function Base.setproperty!(x::Ptr{CUpti_ActivityGreenContext}, f::Symbol, v) +function Base.setproperty!(x::Ptr{CUpti_ActivityGreenContext2}, f::Symbol, v) return unsafe_store!(getproperty(x, f), v) end -function Base.propertynames(x::CUpti_ActivityGreenContext, private::Bool=false) +function Base.propertynames(x::CUpti_ActivityGreenContext2, private::Bool=false) return (:kind, :contextId, :parentContextId, :deviceId, :numTpcs, :numMultiprocessors, - :logicalTpcMaskSize, :padding, :logicalTpcMask, if private + :logicalTpcMaskSize, :padding, :logicalTpcMask, :workqueueResourceId, + :workqueueConcurrencyLimit, :workqueueSharingScope, + if private fieldnames(typeof(x)) else () @@ -4587,7 +4781,10 @@ end GREEN_CONTEXT_FIELD_NUM_TPCS = 5 GREEN_CONTEXT_FIELD_LOGICAL_TPC_MASK_SIZE = 6 GREEN_CONTEXT_FIELD_LOGICAL_TPC_MASK = 7 - GREEN_CONTEXT_FIELD_MAX = 8 + GREEN_CONTEXT_FIELD_WORKQUEUE_RESOURCE_ID = 8 + GREEN_CONTEXT_FIELD_WORKQUEUE_CONCURRENCY_LIMIT = 9 + GREEN_CONTEXT_FIELD_WORKQUEUE_SHARING_SCOPE = 10 + GREEN_CONTEXT_FIELD_MAX = 11 end @cenum CUpti_ActivityAttribute::UInt32 begin @@ -4611,10 +4808,10 @@ end CUPTI_ACTIVITY_ATTR_ENABLE_CUDA_EVENT_DEVICE_TIMESTAMPS = 17 CUPTI_ACTIVITY_ATTR_ENABLE_KERNEL_LAUNCH_ATTRIBUTES = 18 CUPTI_ACTIVITY_ATTR_ENABLE_DEVICE_GRAPH_TRACE = 19 - CUPTI_ACTIVITY_ATTR_ENABLE_MULTI_SUBSCRIBER_GRAPH_LEVEL_TRACE = 20 + CUPTI_ACTIVITY_ATTR_ENABLE_MULTI_SUBSCRIBER_GRAPH_TRACE = 20 CUPTI_ACTIVITY_ATTR_THREAD_ID_TYPE = 21 CUPTI_ACTIVITY_ATTR_TIMESTAMP_CALLBACK = 22 - CUPTI_ACTIVITY_ATTR_CIG_MODE = 23 + CUPTI_ACTIVITY_ATTR_ENABLE_CIG_MODE = 23 CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_FORCE_INT = 2147483647 end @@ -4630,6 +4827,12 @@ end @gcsafe_ccall libcupti.cuptiGetTimestamp(timestamp::Ptr{UInt64})::CUptiResult end +@checked function cuptiGetTimestamp_v2(subscriber, timestamp) + initialize_context() + @gcsafe_ccall libcupti.cuptiGetTimestamp_v2(subscriber::CUpti_SubscriberHandle, + timestamp::Ptr{UInt64})::CUptiResult +end + @checked function cuptiGetContextId(context, contextId) initialize_context() @gcsafe_ccall libcupti.cuptiGetContextId(context::CUcontext, @@ -4748,6 +4951,15 @@ end dropped::Ptr{Csize_t})::CUptiResult end +@checked function cuptiActivityGetNumDroppedRecords_v2(subscriber, context, streamId, + dropped) + initialize_context() + @gcsafe_ccall libcupti.cuptiActivityGetNumDroppedRecords_v2(subscriber::CUpti_SubscriberHandle, + context::CUcontext, + streamId::UInt32, + dropped::Ptr{Csize_t})::CUptiResult +end + @checked function cuptiActivityGetNextRecord(buffer, validBufferSizeBytes, record) initialize_context() @gcsafe_ccall libcupti.cuptiActivityGetNextRecord(buffer::Ptr{UInt8}, @@ -4755,6 +4967,15 @@ end record::Ptr{Ptr{CUpti_Activity}})::CUptiResult end +@checked function cuptiActivityGetNextRecord_v2(subscriber, buffer, validBufferSizeBytes, + record) + initialize_context() + @gcsafe_ccall libcupti.cuptiActivityGetNextRecord_v2(subscriber::CUpti_SubscriberHandle, + buffer::Ptr{UInt8}, + validBufferSizeBytes::Csize_t, + record::Ptr{Ptr{CUpti_Activity}})::CUptiResult +end + # typedef void ( CUPTIAPI * CUpti_BuffersCallbackRequestFunc ) ( uint8_t * * buffer , size_t * size , size_t * maxNumRecords ) const CUpti_BuffersCallbackRequestFunc = Ptr{Cvoid} @@ -4908,6 +5129,20 @@ end lastId::Ptr{UInt64})::CUptiResult end +@checked function cuptiActivityPushExternalCorrelationId_v2(subscriber, kind, id) + initialize_context() + @gcsafe_ccall libcupti.cuptiActivityPushExternalCorrelationId_v2(subscriber::CUpti_SubscriberHandle, + kind::CUpti_ExternalCorrelationKind, + id::UInt64)::CUptiResult +end + +@checked function cuptiActivityPopExternalCorrelationId_v2(subscriber, kind, lastId) + initialize_context() + @gcsafe_ccall libcupti.cuptiActivityPopExternalCorrelationId_v2(subscriber::CUpti_SubscriberHandle, + kind::CUpti_ExternalCorrelationKind, + lastId::Ptr{UInt64})::CUptiResult +end + @checked function cuptiActivityEnableLatencyTimestamps(enable) initialize_context() @gcsafe_ccall libcupti.cuptiActivityEnableLatencyTimestamps(enable::UInt8)::CUptiResult @@ -4942,12 +5177,26 @@ end enable::UInt8)::CUptiResult end +@checked function cuptiActivityEnableDriverApi_v2(subscriber, cbid, enable) + initialize_context() + @gcsafe_ccall libcupti.cuptiActivityEnableDriverApi_v2(subscriber::CUpti_SubscriberHandle, + cbid::CUpti_CallbackId, + enable::UInt8)::CUptiResult +end + @checked function cuptiActivityEnableRuntimeApi(cbid, enable) initialize_context() @gcsafe_ccall libcupti.cuptiActivityEnableRuntimeApi(cbid::CUpti_CallbackId, enable::UInt8)::CUptiResult end +@checked function cuptiActivityEnableRuntimeApi_v2(subscriber, cbid, enable) + initialize_context() + @gcsafe_ccall libcupti.cuptiActivityEnableRuntimeApi_v2(subscriber::CUpti_SubscriberHandle, + cbid::CUpti_CallbackId, + enable::UInt8)::CUptiResult +end + @checked function cuptiActivityEnableHWTrace(enable) initialize_context() @gcsafe_ccall libcupti.cuptiActivityEnableHWTrace(enable::UInt8)::CUptiResult @@ -5429,28 +5678,28 @@ function Base.propertynames(x::CUpti_ActivityKernel, private::Bool=false) end...) end -struct var"##Ctag#409" +struct var"##Ctag#403" data::NTuple{1,UInt8} end -function Base.getproperty(x::Ptr{var"##Ctag#409"}, f::Symbol) +function Base.getproperty(x::Ptr{var"##Ctag#403"}, f::Symbol) f === :both && return Ptr{UInt8}(x + 0) - f === :config && return Ptr{var"##Ctag#410"}(x + 0) + f === :config && return Ptr{var"##Ctag#404"}(x + 0) return getfield(x, f) end -function Base.getproperty(x::var"##Ctag#409", f::Symbol) - r = Ref{var"##Ctag#409"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#409"}, r) +function Base.getproperty(x::var"##Ctag#403", f::Symbol) + r = Ref{var"##Ctag#403"}(x) + ptr = Base.unsafe_convert(Ptr{var"##Ctag#403"}, r) fptr = getproperty(ptr, f) GC.@preserve r unsafe_load(fptr) end -function Base.setproperty!(x::Ptr{var"##Ctag#409"}, f::Symbol, v) +function Base.setproperty!(x::Ptr{var"##Ctag#403"}, f::Symbol, v) return unsafe_store!(getproperty(x, f), v) end -function Base.propertynames(x::var"##Ctag#409", private::Bool=false) +function Base.propertynames(x::var"##Ctag#403", private::Bool=false) return (:both, :config, if private fieldnames(typeof(x)) else @@ -5464,7 +5713,7 @@ end function Base.getproperty(x::Ptr{CUpti_ActivityKernel2}, f::Symbol) f === :kind && return Ptr{CUpti_ActivityKind}(x + 0) - f === :cacheConfig && return Ptr{var"##Ctag#409"}(x + 4) + f === :cacheConfig && return Ptr{var"##Ctag#403"}(x + 4) f === :sharedMemoryConfig && return Ptr{UInt8}(x + 5) f === :registersPerThread && return Ptr{UInt16}(x + 6) f === :start && return Ptr{UInt64}(x + 8) @@ -6008,28 +6257,28 @@ function Base.propertynames(x::CUpti_ActivityKernel7, private::Bool=false) end...) end -struct var"##Ctag#423" +struct var"##Ctag#417" data::NTuple{1,UInt8} end -function Base.getproperty(x::Ptr{var"##Ctag#423"}, f::Symbol) +function Base.getproperty(x::Ptr{var"##Ctag#417"}, f::Symbol) f === :both && return Ptr{UInt8}(x + 0) - f === :config && return Ptr{var"##Ctag#424"}(x + 0) + f === :config && return Ptr{var"##Ctag#418"}(x + 0) return getfield(x, f) end -function Base.getproperty(x::var"##Ctag#423", f::Symbol) - r = Ref{var"##Ctag#423"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#423"}, r) +function Base.getproperty(x::var"##Ctag#417", f::Symbol) + r = Ref{var"##Ctag#417"}(x) + ptr = Base.unsafe_convert(Ptr{var"##Ctag#417"}, r) fptr = getproperty(ptr, f) GC.@preserve r unsafe_load(fptr) end -function Base.setproperty!(x::Ptr{var"##Ctag#423"}, f::Symbol, v) +function Base.setproperty!(x::Ptr{var"##Ctag#417"}, f::Symbol, v) return unsafe_store!(getproperty(x, f), v) end -function Base.propertynames(x::var"##Ctag#423", private::Bool=false) +function Base.propertynames(x::var"##Ctag#417", private::Bool=false) return (:both, :config, if private fieldnames(typeof(x)) else @@ -6043,7 +6292,7 @@ end function Base.getproperty(x::Ptr{CUpti_ActivityKernel8}, f::Symbol) f === :kind && return Ptr{CUpti_ActivityKind}(x + 0) - f === :cacheConfig && return Ptr{var"##Ctag#423"}(x + 4) + f === :cacheConfig && return Ptr{var"##Ctag#417"}(x + 4) f === :sharedMemoryConfig && return Ptr{UInt8}(x + 5) f === :registersPerThread && return Ptr{UInt16}(x + 6) f === :partitionedGlobalCacheRequested && @@ -6348,6 +6597,124 @@ function Base.propertynames(x::CUpti_ActivityKernel10, private::Bool=false) end...) end +struct var"##Ctag#439" + data::NTuple{1,UInt8} +end + +function Base.getproperty(x::Ptr{var"##Ctag#439"}, f::Symbol) + f === :both && return Ptr{UInt8}(x + 0) + f === :config && return Ptr{var"##Ctag#440"}(x + 0) + return getfield(x, f) +end + +function Base.getproperty(x::var"##Ctag#439", f::Symbol) + r = Ref{var"##Ctag#439"}(x) + ptr = Base.unsafe_convert(Ptr{var"##Ctag#439"}, r) + fptr = getproperty(ptr, f) + GC.@preserve r unsafe_load(fptr) +end + +function Base.setproperty!(x::Ptr{var"##Ctag#439"}, f::Symbol, v) + return unsafe_store!(getproperty(x, f), v) +end + +function Base.propertynames(x::var"##Ctag#439", private::Bool=false) + return (:both, :config, if private + fieldnames(typeof(x)) + else + () + end...) +end + +struct CUpti_ActivityKernel11 + data::NTuple{224,UInt8} +end + +function Base.getproperty(x::Ptr{CUpti_ActivityKernel11}, f::Symbol) + f === :kind && return Ptr{CUpti_ActivityKind}(x + 0) + f === :cacheConfig && return Ptr{var"##Ctag#439"}(x + 4) + f === :sharedMemoryConfig && return Ptr{UInt8}(x + 5) + f === :registersPerThread && return Ptr{UInt16}(x + 6) + f === :partitionedGlobalCacheRequested && + return Ptr{CUpti_ActivityPartitionedGlobalCacheConfig}(x + 8) + f === :partitionedGlobalCacheExecuted && + return Ptr{CUpti_ActivityPartitionedGlobalCacheConfig}(x + 12) + f === :start && return Ptr{UInt64}(x + 16) + f === :_end && return Ptr{UInt64}(x + 24) + f === :completed && return Ptr{UInt64}(x + 32) + f === :deviceId && return Ptr{UInt32}(x + 40) + f === :contextId && return Ptr{UInt32}(x + 44) + f === :streamId && return Ptr{UInt32}(x + 48) + f === :gridX && return Ptr{Int32}(x + 52) + f === :gridY && return Ptr{Int32}(x + 56) + f === :gridZ && return Ptr{Int32}(x + 60) + f === :blockX && return Ptr{Int32}(x + 64) + f === :blockY && return Ptr{Int32}(x + 68) + f === :blockZ && return Ptr{Int32}(x + 72) + f === :staticSharedMemory && return Ptr{Int32}(x + 76) + f === :dynamicSharedMemory && return Ptr{Int32}(x + 80) + f === :localMemoryPerThread && return Ptr{UInt32}(x + 84) + f === :localMemoryTotal && return Ptr{UInt32}(x + 88) + f === :correlationId && return Ptr{UInt32}(x + 92) + f === :gridId && return Ptr{Int64}(x + 96) + f === :name && return Ptr{Cstring}(x + 104) + f === :reserved0 && return Ptr{Ptr{Cvoid}}(x + 112) + f === :queued && return Ptr{UInt64}(x + 120) + f === :submitted && return Ptr{UInt64}(x + 128) + f === :launchType && return Ptr{UInt8}(x + 136) + f === :isSharedMemoryCarveoutRequested && return Ptr{UInt8}(x + 137) + f === :sharedMemoryCarveoutRequested && return Ptr{UInt8}(x + 138) + f === :padding && return Ptr{UInt8}(x + 139) + f === :sharedMemoryExecuted && return Ptr{UInt32}(x + 140) + f === :graphNodeId && return Ptr{UInt64}(x + 144) + f === :shmemLimitConfig && return Ptr{CUpti_FuncShmemLimitConfig}(x + 152) + f === :graphId && return Ptr{UInt32}(x + 156) + f === :pAccessPolicyWindow && return Ptr{Ptr{CUaccessPolicyWindow}}(x + 160) + f === :channelID && return Ptr{UInt32}(x + 168) + f === :channelType && return Ptr{CUpti_ChannelType}(x + 172) + f === :clusterX && return Ptr{UInt32}(x + 176) + f === :clusterY && return Ptr{UInt32}(x + 180) + f === :clusterZ && return Ptr{UInt32}(x + 184) + f === :clusterSchedulingPolicy && return Ptr{UInt32}(x + 188) + f === :localMemoryTotal_v2 && return Ptr{UInt64}(x + 192) + f === :maxPotentialClusterSize && return Ptr{UInt32}(x + 200) + f === :maxActiveClusters && return Ptr{UInt32}(x + 204) + f === :isDeviceLaunched && return Ptr{UInt8}(x + 208) + f === :padding3 && return Ptr{NTuple{7,UInt8}}(x + 209) + f === :priority && return Ptr{Int32}(x + 216) + f === :padding4 && return Ptr{UInt32}(x + 220) + return getfield(x, f) +end + +function Base.getproperty(x::CUpti_ActivityKernel11, f::Symbol) + r = Ref{CUpti_ActivityKernel11}(x) + ptr = Base.unsafe_convert(Ptr{CUpti_ActivityKernel11}, r) + fptr = getproperty(ptr, f) + GC.@preserve r unsafe_load(fptr) +end + +function Base.setproperty!(x::Ptr{CUpti_ActivityKernel11}, f::Symbol, v) + return unsafe_store!(getproperty(x, f), v) +end + +function Base.propertynames(x::CUpti_ActivityKernel11, private::Bool=false) + return (:kind, :cacheConfig, :sharedMemoryConfig, :registersPerThread, + :partitionedGlobalCacheRequested, :partitionedGlobalCacheExecuted, :start, + :_end, :completed, :deviceId, :contextId, :streamId, :gridX, :gridY, :gridZ, + :blockX, :blockY, :blockZ, :staticSharedMemory, :dynamicSharedMemory, + :localMemoryPerThread, :localMemoryTotal, :correlationId, :gridId, :name, + :reserved0, :queued, :submitted, :launchType, :isSharedMemoryCarveoutRequested, + :sharedMemoryCarveoutRequested, :padding, :sharedMemoryExecuted, :graphNodeId, + :shmemLimitConfig, :graphId, :pAccessPolicyWindow, :channelID, :channelType, + :clusterX, :clusterY, :clusterZ, :clusterSchedulingPolicy, :localMemoryTotal_v2, + :maxPotentialClusterSize, :maxActiveClusters, :isDeviceLaunched, :padding3, + :priority, :padding4, if private + fieldnames(typeof(x)) + else + () + end...) +end + struct CUpti_ActivityMemcpy data::NTuple{64,UInt8} end @@ -8067,6 +8434,43 @@ function Base.propertynames(x::CUpti_ActivityMarkerData, private::Bool=false) end...) end +struct CUpti_ActivityGreenContext + data::NTuple{152,UInt8} +end + +function Base.getproperty(x::Ptr{CUpti_ActivityGreenContext}, f::Symbol) + f === :kind && return Ptr{CUpti_ActivityKind}(x + 0) + f === :contextId && return Ptr{UInt32}(x + 4) + f === :parentContextId && return Ptr{UInt32}(x + 8) + f === :deviceId && return Ptr{UInt32}(x + 12) + f === :numTpcs && return Ptr{UInt32}(x + 16) + f === :numMultiprocessors && return Ptr{UInt16}(x + 20) + f === :logicalTpcMaskSize && return Ptr{UInt8}(x + 22) + f === :padding && return Ptr{UInt8}(x + 23) + f === :logicalTpcMask && return Ptr{NTuple{32,UInt32}}(x + 24) + return getfield(x, f) +end + +function Base.getproperty(x::CUpti_ActivityGreenContext, f::Symbol) + r = Ref{CUpti_ActivityGreenContext}(x) + ptr = Base.unsafe_convert(Ptr{CUpti_ActivityGreenContext}, r) + fptr = getproperty(ptr, f) + GC.@preserve r unsafe_load(fptr) +end + +function Base.setproperty!(x::Ptr{CUpti_ActivityGreenContext}, f::Symbol, v) + return unsafe_store!(getproperty(x, f), v) +end + +function Base.propertynames(x::CUpti_ActivityGreenContext, private::Bool=false) + return (:kind, :contextId, :parentContextId, :deviceId, :numTpcs, :numMultiprocessors, + :logicalTpcMaskSize, :padding, :logicalTpcMask, if private + fieldnames(typeof(x)) + else + () + end...) +end + @cenum CUpti_driver_api_trace_cbid_enum::UInt32 begin CUPTI_DRIVER_TRACE_CBID_INVALID = 0 CUPTI_DRIVER_TRACE_CBID_cuInit = 1 @@ -8914,7 +9318,9 @@ end CUPTI_DRIVER_TRACE_CBID_cuCoredumpRegisterCompleteCallback = 843 CUPTI_DRIVER_TRACE_CBID_cuCoredumpDeregisterStartCallback = 844 CUPTI_DRIVER_TRACE_CBID_cuCoredumpDeregisterCompleteCallback = 845 - CUPTI_DRIVER_TRACE_CBID_SIZE = 846 + CUPTI_DRIVER_TRACE_CBID_cuStreamBeginRecaptureToGraph = 846 + CUPTI_DRIVER_TRACE_CBID_cuStreamBeginRecaptureToGraph_ptsz = 847 + CUPTI_DRIVER_TRACE_CBID_SIZE = 848 CUPTI_DRIVER_TRACE_CBID_FORCE_INT = 2147483647 end @@ -9715,7 +10121,7 @@ struct CUpti_Profiler_GetCounterAvailability_Params ctx::CUcontext counterAvailabilityImageSize::Csize_t pCounterAvailabilityImage::Ptr{UInt8} - bAllowDeviceLevelCounters::Cint + bAllowDeviceLevelCounters::UInt8 end @cenum CUpti_Profiler_Support_Level::UInt32 begin @@ -9723,6 +10129,7 @@ end CUPTI_PROFILER_CONFIGURATION_UNSUPPORTED = 1 CUPTI_PROFILER_CONFIGURATION_DISABLED = 2 CUPTI_PROFILER_CONFIGURATION_SUPPORTED = 3 + CUPTI_PROFILER_CONFIGURATION_LIMITED_SUPPORT = 4 end @cenum CUpti_Profiler_API::UInt32 begin @@ -10184,79 +10591,13 @@ function Base.setproperty!(x::Ptr{var"##Ctag#399"}, f::Symbol, v) return unsafe_store!(getproperty(x, f), v) end -struct var"##Ctag#401" - smClock::UInt32 - memoryClock::UInt32 - pcieLinkGen::UInt32 - pcieLinkWidth::UInt32 - clocksThrottleReasons::CUpti_EnvironmentClocksThrottleReason -end -function Base.getproperty(x::Ptr{var"##Ctag#401"}, f::Symbol) - f === :smClock && return Ptr{UInt32}(x + 0) - f === :memoryClock && return Ptr{UInt32}(x + 4) - f === :pcieLinkGen && return Ptr{UInt32}(x + 8) - f === :pcieLinkWidth && return Ptr{UInt32}(x + 12) - f === :clocksThrottleReasons && - return Ptr{CUpti_EnvironmentClocksThrottleReason}(x + 16) - return getfield(x, f) -end - -function Base.getproperty(x::var"##Ctag#401", f::Symbol) - r = Ref{var"##Ctag#401"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#401"}, r) - fptr = getproperty(ptr, f) - GC.@preserve r unsafe_load(fptr) -end - -function Base.setproperty!(x::Ptr{var"##Ctag#401"}, f::Symbol, v) - return unsafe_store!(getproperty(x, f), v) -end - -struct var"##Ctag#402" - gpuTemperature::UInt32 -end -function Base.getproperty(x::Ptr{var"##Ctag#402"}, f::Symbol) - f === :gpuTemperature && return Ptr{UInt32}(x + 0) - return getfield(x, f) -end - -function Base.getproperty(x::var"##Ctag#402", f::Symbol) - r = Ref{var"##Ctag#402"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#402"}, r) - fptr = getproperty(ptr, f) - GC.@preserve r unsafe_load(fptr) -end - -function Base.setproperty!(x::Ptr{var"##Ctag#402"}, f::Symbol, v) - return unsafe_store!(getproperty(x, f), v) -end - -struct var"##Ctag#403" - power::UInt32 - powerLimit::UInt32 -end -function Base.getproperty(x::Ptr{var"##Ctag#403"}, f::Symbol) - f === :power && return Ptr{UInt32}(x + 0) - f === :powerLimit && return Ptr{UInt32}(x + 4) - return getfield(x, f) -end - -function Base.getproperty(x::var"##Ctag#403", f::Symbol) - r = Ref{var"##Ctag#403"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#403"}, r) - fptr = getproperty(ptr, f) - GC.@preserve r unsafe_load(fptr) -end - -function Base.setproperty!(x::Ptr{var"##Ctag#403"}, f::Symbol, v) - return unsafe_store!(getproperty(x, f), v) -end - struct var"##Ctag#404" - fanSpeed::UInt32 + requested::UInt8 + executed::UInt8 end function Base.getproperty(x::Ptr{var"##Ctag#404"}, f::Symbol) - f === :fanSpeed && return Ptr{UInt32}(x + 0) + f === :requested && return (Ptr{UInt8}(x + 0), 0, 4) + f === :executed && return (Ptr{UInt8}(x + 0), 4, 4) return getfield(x, f) end @@ -10271,198 +10612,110 @@ function Base.setproperty!(x::Ptr{var"##Ctag#404"}, f::Symbol, v) return unsafe_store!(getproperty(x, f), v) end -struct var"##Ctag#407" - uuidDev::CUuuid - peerDev::NTuple{32,CUdevice} -end -function Base.getproperty(x::Ptr{var"##Ctag#407"}, f::Symbol) - f === :uuidDev && return Ptr{CUuuid}(x + 0) - f === :peerDev && return Ptr{NTuple{32,CUdevice}}(x + 16) - return getfield(x, f) -end - -function Base.getproperty(x::var"##Ctag#407", f::Symbol) - r = Ref{var"##Ctag#407"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#407"}, r) - fptr = getproperty(ptr, f) - GC.@preserve r unsafe_load(fptr) -end - -function Base.setproperty!(x::Ptr{var"##Ctag#407"}, f::Symbol, v) - return unsafe_store!(getproperty(x, f), v) -end - -struct var"##Ctag#408" - secondaryBus::UInt16 - deviceId::UInt16 - vendorId::UInt16 - pad0::UInt16 -end -function Base.getproperty(x::Ptr{var"##Ctag#408"}, f::Symbol) - f === :secondaryBus && return Ptr{UInt16}(x + 0) - f === :deviceId && return Ptr{UInt16}(x + 2) - f === :vendorId && return Ptr{UInt16}(x + 4) - f === :pad0 && return Ptr{UInt16}(x + 6) - return getfield(x, f) -end - -function Base.getproperty(x::var"##Ctag#408", f::Symbol) - r = Ref{var"##Ctag#408"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#408"}, r) - fptr = getproperty(ptr, f) - GC.@preserve r unsafe_load(fptr) -end - -function Base.setproperty!(x::Ptr{var"##Ctag#408"}, f::Symbol, v) - return unsafe_store!(getproperty(x, f), v) -end - -struct var"##Ctag#410" +struct var"##Ctag#418" requested::UInt8 executed::UInt8 end -function Base.getproperty(x::Ptr{var"##Ctag#410"}, f::Symbol) +function Base.getproperty(x::Ptr{var"##Ctag#418"}, f::Symbol) f === :requested && return (Ptr{UInt8}(x + 0), 0, 4) f === :executed && return (Ptr{UInt8}(x + 0), 4, 4) return getfield(x, f) end -function Base.getproperty(x::var"##Ctag#410", f::Symbol) - r = Ref{var"##Ctag#410"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#410"}, r) +function Base.getproperty(x::var"##Ctag#418", f::Symbol) + r = Ref{var"##Ctag#418"}(x) + ptr = Base.unsafe_convert(Ptr{var"##Ctag#418"}, r) fptr = getproperty(ptr, f) GC.@preserve r unsafe_load(fptr) end -function Base.setproperty!(x::Ptr{var"##Ctag#410"}, f::Symbol, v) +function Base.setproperty!(x::Ptr{var"##Ctag#418"}, f::Symbol, v) return unsafe_store!(getproperty(x, f), v) end -struct var"##Ctag#424" +struct var"##Ctag#425" requested::UInt8 executed::UInt8 end -function Base.getproperty(x::Ptr{var"##Ctag#424"}, f::Symbol) +function Base.getproperty(x::Ptr{var"##Ctag#425"}, f::Symbol) f === :requested && return (Ptr{UInt8}(x + 0), 0, 4) f === :executed && return (Ptr{UInt8}(x + 0), 4, 4) return getfield(x, f) end -function Base.getproperty(x::var"##Ctag#424", f::Symbol) - r = Ref{var"##Ctag#424"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#424"}, r) +function Base.getproperty(x::var"##Ctag#425", f::Symbol) + r = Ref{var"##Ctag#425"}(x) + ptr = Base.unsafe_convert(Ptr{var"##Ctag#425"}, r) fptr = getproperty(ptr, f) GC.@preserve r unsafe_load(fptr) end -function Base.setproperty!(x::Ptr{var"##Ctag#424"}, f::Symbol, v) +function Base.setproperty!(x::Ptr{var"##Ctag#425"}, f::Symbol, v) return unsafe_store!(getproperty(x, f), v) end -struct var"##Ctag#441" +struct var"##Ctag#437" processId::UInt32 threadId::UInt32 end -function Base.getproperty(x::Ptr{var"##Ctag#441"}, f::Symbol) +function Base.getproperty(x::Ptr{var"##Ctag#437"}, f::Symbol) f === :processId && return Ptr{UInt32}(x + 0) f === :threadId && return Ptr{UInt32}(x + 4) return getfield(x, f) end -function Base.getproperty(x::var"##Ctag#441", f::Symbol) - r = Ref{var"##Ctag#441"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#441"}, r) +function Base.getproperty(x::var"##Ctag#437", f::Symbol) + r = Ref{var"##Ctag#437"}(x) + ptr = Base.unsafe_convert(Ptr{var"##Ctag#437"}, r) fptr = getproperty(ptr, f) GC.@preserve r unsafe_load(fptr) end -function Base.setproperty!(x::Ptr{var"##Ctag#441"}, f::Symbol, v) +function Base.setproperty!(x::Ptr{var"##Ctag#437"}, f::Symbol, v) return unsafe_store!(getproperty(x, f), v) end -struct var"##Ctag#442" +struct var"##Ctag#438" deviceId::UInt32 contextId::UInt32 streamId::UInt32 end -function Base.getproperty(x::Ptr{var"##Ctag#442"}, f::Symbol) +function Base.getproperty(x::Ptr{var"##Ctag#438"}, f::Symbol) f === :deviceId && return Ptr{UInt32}(x + 0) f === :contextId && return Ptr{UInt32}(x + 4) f === :streamId && return Ptr{UInt32}(x + 8) return getfield(x, f) end -function Base.getproperty(x::var"##Ctag#442", f::Symbol) - r = Ref{var"##Ctag#442"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#442"}, r) +function Base.getproperty(x::var"##Ctag#438", f::Symbol) + r = Ref{var"##Ctag#438"}(x) + ptr = Base.unsafe_convert(Ptr{var"##Ctag#438"}, r) fptr = getproperty(ptr, f) GC.@preserve r unsafe_load(fptr) end -function Base.setproperty!(x::Ptr{var"##Ctag#442"}, f::Symbol, v) +function Base.setproperty!(x::Ptr{var"##Ctag#438"}, f::Symbol, v) return unsafe_store!(getproperty(x, f), v) end -struct var"##Ctag#444" +struct var"##Ctag#440" requested::UInt8 executed::UInt8 end -function Base.getproperty(x::Ptr{var"##Ctag#444"}, f::Symbol) +function Base.getproperty(x::Ptr{var"##Ctag#440"}, f::Symbol) f === :requested && return (Ptr{UInt8}(x + 0), 0, 4) f === :executed && return (Ptr{UInt8}(x + 0), 4, 4) return getfield(x, f) end -function Base.getproperty(x::var"##Ctag#444", f::Symbol) - r = Ref{var"##Ctag#444"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#444"}, r) - fptr = getproperty(ptr, f) - GC.@preserve r unsafe_load(fptr) -end - -function Base.setproperty!(x::Ptr{var"##Ctag#444"}, f::Symbol, v) - return unsafe_store!(getproperty(x, f), v) -end - -struct var"##Ctag#446" - index::UInt32 - domainId::UInt32 -end -function Base.getproperty(x::Ptr{var"##Ctag#446"}, f::Symbol) - f === :index && return Ptr{UInt32}(x + 0) - f === :domainId && return Ptr{UInt32}(x + 4) - return getfield(x, f) -end - -function Base.getproperty(x::var"##Ctag#446", f::Symbol) - r = Ref{var"##Ctag#446"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#446"}, r) - fptr = getproperty(ptr, f) - GC.@preserve r unsafe_load(fptr) -end - -function Base.setproperty!(x::Ptr{var"##Ctag#446"}, f::Symbol, v) - return unsafe_store!(getproperty(x, f), v) -end - -struct var"##Ctag#448" - index::UInt32 - domainId::UInt32 -end -function Base.getproperty(x::Ptr{var"##Ctag#448"}, f::Symbol) - f === :index && return Ptr{UInt32}(x + 0) - f === :domainId && return Ptr{UInt32}(x + 4) - return getfield(x, f) -end - -function Base.getproperty(x::var"##Ctag#448", f::Symbol) - r = Ref{var"##Ctag#448"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#448"}, r) +function Base.getproperty(x::var"##Ctag#440", f::Symbol) + r = Ref{var"##Ctag#440"}(x) + ptr = Base.unsafe_convert(Ptr{var"##Ctag#440"}, r) fptr = getproperty(ptr, f) GC.@preserve r unsafe_load(fptr) end -function Base.setproperty!(x::Ptr{var"##Ctag#448"}, f::Symbol, v) +function Base.setproperty!(x::Ptr{var"##Ctag#440"}, f::Symbol, v) return unsafe_store!(getproperty(x, f), v) end diff --git a/lib/cusolver/src/libcusolver.jl b/lib/cusolver/src/libcusolver.jl index 063a51588a..e5d25d6020 100644 --- a/lib/cusolver/src/libcusolver.jl +++ b/lib/cusolver/src/libcusolver.jl @@ -54,6 +54,7 @@ const cusolverDnParams_t = Ptr{cusolverDnParams} CUSOLVERDN_GETRF = 0 CUSOLVERDN_POTRF = 1 CUSOLVERDN_SYEVBATCHED = 2 + CUSOLVERDN_GEQRF = 3 end const cusolver_int_t = Cint @@ -103,6 +104,12 @@ end CUSOLVER_EIG_RANGE_V = 1003 end +@cenum cusolverEigComp_t::UInt32 begin + CUSOLVER_EIG_COMP_N = 10 + CUSOLVER_EIG_COMP_I = 11 + CUSOLVER_EIG_COMP_V = 12 +end + @cenum cusolverNorm_t::UInt32 begin CUSOLVER_INF_NORM = 104 CUSOLVER_MAX_NORM = 105 @@ -4630,6 +4637,43 @@ end info::CuPtr{Cint})::cusolverStatus_t end +@checked function cusolverDnXstedc_bufferSize(handle, params, compz, n, dataTypeDE, D, E, + dataTypeZ, Z, ldz, computeType, + workspaceInBytesOnDevice, + workspaceInBytesOnHost) + initialize_context() + @gcsafe_ccall libcusolver.cusolverDnXstedc_bufferSize(handle::cusolverDnHandle_t, + params::cusolverDnParams_t, + compz::cusolverEigComp_t, + n::Int64, + dataTypeDE::cudaDataType, + D::Ptr{Cvoid}, E::Ptr{Cvoid}, + dataTypeZ::cudaDataType, + Z::Ptr{Cvoid}, ldz::Int64, + computeType::cudaDataType, + workspaceInBytesOnDevice::Ptr{Csize_t}, + workspaceInBytesOnHost::Ptr{Csize_t})::cusolverStatus_t +end + +@checked function cusolverDnXstedc(handle, params, compz, n, dataTypeDE, D, E, dataTypeZ, Z, + ldz, computeType, bufferOnDevice, + workspaceInBytesOnDevice, bufferOnHost, + workspaceInBytesOnHost, info) + initialize_context() + @gcsafe_ccall libcusolver.cusolverDnXstedc(handle::cusolverDnHandle_t, + params::cusolverDnParams_t, + compz::cusolverEigComp_t, n::Int64, + dataTypeDE::cudaDataType, D::Ptr{Cvoid}, + E::Ptr{Cvoid}, dataTypeZ::cudaDataType, + Z::Ptr{Cvoid}, ldz::Int64, + computeType::cudaDataType, + bufferOnDevice::Ptr{Cvoid}, + workspaceInBytesOnDevice::Csize_t, + bufferOnHost::Ptr{Cvoid}, + workspaceInBytesOnHost::Csize_t, + info::Ptr{Cint})::cusolverStatus_t +end + @checked function cusolverDnXsyevBatched_bufferSize(handle, params, jobz, uplo, n, dataTypeA, A, lda, dataTypeW, W, computeType, workspaceInBytesOnDevice, @@ -4974,12 +5018,52 @@ end @gcsafe_ccall libcusolver.cusolverDnLoggerSetMask(mask::Cint)::cusolverStatus_t end -# no prototype is found for this function at cusolverDn.h:4897:32, please use with caution +# no prototype is found for this function at cusolverDn.h:4932:32, please use with caution @checked function cusolverDnLoggerForceDisable() initialize_context() @gcsafe_ccall libcusolver.cusolverDnLoggerForceDisable()::cusolverStatus_t end +@checked function cusolverDnXpolar_bufferSize(handle, params, uplo, M, N, dataTypeA, A, lda, + dataTypeH, H, ldh, computeType, + workspaceInBytesOnDevice, + workspaceInBytesOnHost) + initialize_context() + @gcsafe_ccall libcusolver.cusolverDnXpolar_bufferSize(handle::cusolverDnHandle_t, + params::cusolverDnParams_t, + uplo::cublasFillMode_t, M::Int64, + N::Int64, dataTypeA::cudaDataType, + A::Ptr{Cvoid}, lda::Int64, + dataTypeH::cudaDataType, + H::Ptr{Cvoid}, ldh::Int64, + computeType::cudaDataType, + workspaceInBytesOnDevice::Ptr{Csize_t}, + workspaceInBytesOnHost::Ptr{Csize_t})::cusolverStatus_t +end + +@checked function cusolverDnXpolar(handle, params, uplo, M, N, dataTypeA, A, lda, dataTypeH, + H, ldh, computeType, bufferOnDevice, + workspaceInBytesOnDevice, bufferOnHost, + workspaceInBytesOnHost, d_res_nrm, d_A_nrmF, d_rcond, + d_info) + initialize_context() + @gcsafe_ccall libcusolver.cusolverDnXpolar(handle::cusolverDnHandle_t, + params::cusolverDnParams_t, + uplo::cublasFillMode_t, M::Int64, N::Int64, + dataTypeA::cudaDataType, A::Ptr{Cvoid}, + lda::Int64, dataTypeH::cudaDataType, + H::Ptr{Cvoid}, ldh::Int64, + computeType::cudaDataType, + bufferOnDevice::Ptr{Cvoid}, + workspaceInBytesOnDevice::Csize_t, + bufferOnHost::Ptr{Cvoid}, + workspaceInBytesOnHost::Csize_t, + d_res_nrm::Ptr{Cdouble}, + d_A_nrmF::Ptr{Cdouble}, + d_rcond::Ptr{Cdouble}, + d_info::Ptr{Cint})::cusolverStatus_t +end + mutable struct cusolverSpContext end const cusolverSpHandle_t = Ptr{cusolverSpContext} diff --git a/lib/cusparse/src/libcusparse.jl b/lib/cusparse/src/libcusparse.jl index fe205ce761..9957dc5d1c 100644 --- a/lib/cusparse/src/libcusparse.jl +++ b/lib/cusparse/src/libcusparse.jl @@ -6222,6 +6222,74 @@ end spgemmDescr::cusparseSpGEMMDescr_t)::cusparseStatus_t end +@cenum cusparseSpGEAMAlg_t::UInt32 begin + CUSPARSE_SPGEAM_ALG_DEFAULT = 0 + CUSPARSE_SPGEAM_ALG1 = 1 +end + +mutable struct cusparseSpGEAMDescr end + +const cusparseSpGEAMDescr_t = Ptr{cusparseSpGEAMDescr} + +@checked function cusparseSpGEAM_createDescr(descr) + initialize_context() + @gcsafe_ccall libcusparse.cusparseSpGEAM_createDescr(descr::Ptr{cusparseSpGEAMDescr_t})::cusparseStatus_t +end + +@checked function cusparseSpGEAM_destroyDescr(descr) + initialize_context() + @gcsafe_ccall libcusparse.cusparseSpGEAM_destroyDescr(descr::cusparseSpGEAMDescr_t)::cusparseStatus_t +end + +@checked function cusparseSpGEAM_bufferSize(handle, opA, opB, alpha, matA, beta, matB, matC, + computeType, alg, spgeamDescr, bufferSize) + initialize_context() + @gcsafe_ccall libcusparse.cusparseSpGEAM_bufferSize(handle::cusparseHandle_t, + opA::cusparseOperation_t, + opB::cusparseOperation_t, + alpha::Ptr{Cvoid}, + matA::cusparseSpMatDescr_t, + beta::Ptr{Cvoid}, + matB::cusparseSpMatDescr_t, + matC::cusparseSpMatDescr_t, + computeType::cudaDataType, + alg::cusparseSpGEAMAlg_t, + spgeamDescr::cusparseSpGEAMDescr_t, + bufferSize::Ptr{Csize_t})::cusparseStatus_t +end + +@checked function cusparseSpGEAM_nnz(handle, opA, opB, alpha, matA, beta, matB, matC, + computeType, alg, spgeamDescr, externalBuffer) + initialize_context() + @gcsafe_ccall libcusparse.cusparseSpGEAM_nnz(handle::cusparseHandle_t, + opA::cusparseOperation_t, + opB::cusparseOperation_t, + alpha::Ptr{Cvoid}, + matA::cusparseSpMatDescr_t, + beta::Ptr{Cvoid}, + matB::cusparseSpMatDescr_t, + matC::cusparseSpMatDescr_t, + computeType::cudaDataType, + alg::cusparseSpGEAMAlg_t, + spgeamDescr::cusparseSpGEAMDescr_t, + externalBuffer::Ptr{Cvoid})::cusparseStatus_t +end + +@checked function cusparseSpGEAM(handle, opA, opB, alpha, matA, beta, matB, matC, + computeType, alg, spgeamDescr, externalBuffer) + initialize_context() + @gcsafe_ccall libcusparse.cusparseSpGEAM(handle::cusparseHandle_t, + opA::cusparseOperation_t, + opB::cusparseOperation_t, alpha::Ptr{Cvoid}, + matA::cusparseSpMatDescr_t, beta::Ptr{Cvoid}, + matB::cusparseSpMatDescr_t, + matC::cusparseSpMatDescr_t, + computeType::cudaDataType, + alg::cusparseSpGEAMAlg_t, + spgeamDescr::cusparseSpGEAMDescr_t, + externalBuffer::Ptr{Cvoid})::cusparseStatus_t +end + @cenum cusparseSDDMMAlg_t::UInt32 begin CUSPARSE_SDDMM_ALG_DEFAULT = 0 end diff --git a/lib/cutensor/src/libcutensor.jl b/lib/cutensor/src/libcutensor.jl index b33560b723..84ea9dca94 100644 --- a/lib/cutensor/src/libcutensor.jl +++ b/lib/cutensor/src/libcutensor.jl @@ -550,7 +550,7 @@ end beta::Ptr{Cvoid}, C::Ptr{CuPtr{Cvoid}}, D::Ptr{CuPtr{Cvoid}}, - workspace::CuPtr{Cvoid}, + workspace::Ptr{CuPtr{Cvoid}}, workspaceSize::UInt64, stream::cudaStream_t)::cutensorStatus_t end diff --git a/lib/nvml/src/libnvml.jl b/lib/nvml/src/libnvml.jl index 1ea71656dd..683c3a6cc6 100644 --- a/lib/nvml/src/libnvml.jl +++ b/lib/nvml/src/libnvml.jl @@ -47,6 +47,8 @@ end const nvmlMemoryErrorType_t = nvmlMemoryErrorType_enum +const nvmlCPERCursorHandle_t = Culonglong + mutable struct nvmlDevice_st end const nvmlDevice_t = Ptr{nvmlDevice_st} @@ -149,6 +151,16 @@ struct nvmlProcessDetail_v1_t usedGpuCcProtectedMemory::Culonglong end +@cenum nvmlProcessMode_enum::UInt32 begin + NVML_PROCESS_MODE_COMPUTE = 0 + NVML_PROCESS_MODE_GRAPHICS = 1 + NVML_PROCESS_MODE_MPS = 2 + NVML_PROCESS_MODE_ALL = 3 + NVML_PROCESS_MODE_MAX = 4 +end + +const nvmlProcessMode_t = nvmlProcessMode_enum + struct nvmlProcessDetailList_v1_t version::Cuint mode::Cuint @@ -650,6 +662,10 @@ end const nvmlPdi_t = nvmlPdi_v1_t +struct nvmlBBXTimeData_v1_t + timeRun::Cuint +end + @cenum nvmlEnableState_enum::UInt32 begin NVML_FEATURE_DISABLED = 0 NVML_FEATURE_ENABLED = 1 @@ -1019,6 +1035,15 @@ end const nvmlEccSramUniqueUncorrectedErrorCounts_t = nvmlEccSramUniqueUncorrectedErrorCounts_v1_t +struct nvmlRemappedRowsInfo_v2_t + corrActiveRemaps::Cuint + corrInactiveRemaps::Cuint + uncActiveRemaps::Cuint + uncInactiveRemaps::Cuint + bPending::Cuint + bFailureOccurred::Cuint +end + struct nvmlRusdSettings_v1_t version::Cuint pollMask::Culonglong @@ -1586,6 +1611,7 @@ const nvmlGridLicensableFeatures_t = nvmlGridLicensableFeatures_st NVML_GPU_RECOVERY_ACTION_NODE_REBOOT = 2 NVML_GPU_RECOVERY_ACTION_DRAIN_P2P = 3 NVML_GPU_RECOVERY_ACTION_DRAIN_AND_RESET = 4 + NVML_GPU_RECOVERY_ACTION_RECOVER_IMEX_DOMAIN = 5 end const nvmlDeviceGpuRecoveryAction_t = nvmlDeviceGpuRecoveryAction_s @@ -1939,6 +1965,19 @@ end const nvmlAccountingStats_t = nvmlAccountingStats_st +struct nvmlAccountingStats_v2_t + pid::Cuint + isRunning::Cuint + gpuUtilization::Cuint + memoryUtilization::Cuint + maxMemoryUsage::Culonglong + sampleCount::Cuint + sumGpuUtil::Culonglong + sumFbUtil::Culonglong + time::Culonglong + startTime::Culonglong +end + @cenum nvmlEncoderQueryType_enum::UInt32 begin NVML_ENCODER_QUERY_H264 = 0 NVML_ENCODER_QUERY_HEVC = 1 @@ -2126,6 +2165,22 @@ function nvmlErrorString(result) @gcsafe_ccall libnvml.nvmlErrorString(result::nvmlReturn_t)::Cstring end +@cenum nvmlCPERType_t::UInt32 begin + NVML_CPER_ACCESS_TYPE_GPU = 1 +end + +struct nvmlCPERCursor_v1_t + cperTypeMask::Cuint + uuid::NTuple{80,Cchar} + handle::nvmlCPERCursorHandle_t +end + +struct nvmlGetCPER_v1_t + cursor::nvmlCPERCursor_v1_t + buffer::Ptr{Cuchar} + bufferSize::Cuint +end + @checked function nvmlSystemGetDriverVersion(version, length) initialize_context() @gcsafe_ccall libnvml.nvmlSystemGetDriverVersion(version::Cstring, @@ -2179,6 +2234,11 @@ const nvmlSystemDriverBranchInfo_t = nvmlSystemDriverBranchInfo_v1_t length::Cuint)::nvmlReturn_t end +@checked function nvmlSystemGetCPER_v1(cper) + initialize_context() + @gcsafe_ccall libnvml.nvmlSystemGetCPER_v1(cper::Ptr{nvmlGetCPER_v1_t})::nvmlReturn_t +end + @checked function nvmlUnitGetCount(unitCount) initialize_context() @gcsafe_ccall libnvml.nvmlUnitGetCount(unitCount::Ptr{Cuint})::nvmlReturn_t @@ -2436,6 +2496,12 @@ end durationUs::Ptr{Culong})::nvmlReturn_t end +@checked function nvmlDeviceGetBBXTimeData_v1(device, timeData) + initialize_context() + @gcsafe_ccall libnvml.nvmlDeviceGetBBXTimeData_v1(device::nvmlDevice_t, + timeData::Ptr{nvmlBBXTimeData_v1_t})::nvmlReturn_t +end + @checked function nvmlDeviceGetDisplayMode(device, display) initialize_context() @gcsafe_ccall libnvml.nvmlDeviceGetDisplayMode(device::nvmlDevice_t, @@ -3256,6 +3322,12 @@ end bufferSize::Ptr{Cuint})::nvmlReturn_t end +@checked function nvmlDeviceGetAccountingStats_v2(device, stats) + initialize_context() + @gcsafe_ccall libnvml.nvmlDeviceGetAccountingStats_v2(device::nvmlDevice_t, + stats::Ptr{nvmlAccountingStats_v2_t})::nvmlReturn_t +end + @checked function nvmlDeviceGetRetiredPages(device, cause, pageCount, addresses) initialize_context() @gcsafe_ccall libnvml.nvmlDeviceGetRetiredPages(device::nvmlDevice_t, @@ -3517,6 +3589,7 @@ end NVML_NVLINK_VERSION_3_1 = 5 NVML_NVLINK_VERSION_4_0 = 6 NVML_NVLINK_VERSION_5_0 = 7 + NVML_NVLINK_VERSION_6_0 = 8 end const nvmlNvlinkVersion_t = nvmlNvlinkVersion_enum @@ -5054,6 +5127,42 @@ end NVML_GPM_METRIC_GR7_CTXSW_REQUESTS = 207 NVML_GPM_METRIC_GR7_CTXSW_CYCLES_PER_REQ = 208 NVML_GPM_METRIC_GR7_CTXSW_ACTIVE_PCT = 209 + NVML_GPM_METRIC_NVLINK_L18_RX_PER_SEC = 212 + NVML_GPM_METRIC_NVLINK_L18_TX_PER_SEC = 213 + NVML_GPM_METRIC_NVLINK_L19_RX_PER_SEC = 214 + NVML_GPM_METRIC_NVLINK_L19_TX_PER_SEC = 215 + NVML_GPM_METRIC_NVLINK_L20_RX_PER_SEC = 216 + NVML_GPM_METRIC_NVLINK_L20_TX_PER_SEC = 217 + NVML_GPM_METRIC_NVLINK_L21_RX_PER_SEC = 218 + NVML_GPM_METRIC_NVLINK_L21_TX_PER_SEC = 219 + NVML_GPM_METRIC_NVLINK_L22_RX_PER_SEC = 220 + NVML_GPM_METRIC_NVLINK_L22_TX_PER_SEC = 221 + NVML_GPM_METRIC_NVLINK_L23_RX_PER_SEC = 222 + NVML_GPM_METRIC_NVLINK_L23_TX_PER_SEC = 223 + NVML_GPM_METRIC_NVLINK_L24_RX_PER_SEC = 224 + NVML_GPM_METRIC_NVLINK_L24_TX_PER_SEC = 225 + NVML_GPM_METRIC_NVLINK_L25_RX_PER_SEC = 226 + NVML_GPM_METRIC_NVLINK_L25_TX_PER_SEC = 227 + NVML_GPM_METRIC_NVLINK_L26_RX_PER_SEC = 228 + NVML_GPM_METRIC_NVLINK_L26_TX_PER_SEC = 229 + NVML_GPM_METRIC_NVLINK_L27_RX_PER_SEC = 230 + NVML_GPM_METRIC_NVLINK_L27_TX_PER_SEC = 231 + NVML_GPM_METRIC_NVLINK_L28_RX_PER_SEC = 232 + NVML_GPM_METRIC_NVLINK_L28_TX_PER_SEC = 233 + NVML_GPM_METRIC_NVLINK_L29_RX_PER_SEC = 234 + NVML_GPM_METRIC_NVLINK_L29_TX_PER_SEC = 235 + NVML_GPM_METRIC_NVLINK_L30_RX_PER_SEC = 236 + NVML_GPM_METRIC_NVLINK_L30_TX_PER_SEC = 237 + NVML_GPM_METRIC_NVLINK_L31_RX_PER_SEC = 238 + NVML_GPM_METRIC_NVLINK_L31_TX_PER_SEC = 239 + NVML_GPM_METRIC_NVLINK_L32_RX_PER_SEC = 240 + NVML_GPM_METRIC_NVLINK_L32_TX_PER_SEC = 241 + NVML_GPM_METRIC_NVLINK_L33_RX_PER_SEC = 242 + NVML_GPM_METRIC_NVLINK_L33_TX_PER_SEC = 243 + NVML_GPM_METRIC_NVLINK_L34_RX_PER_SEC = 244 + NVML_GPM_METRIC_NVLINK_L34_TX_PER_SEC = 245 + NVML_GPM_METRIC_NVLINK_L35_RX_PER_SEC = 246 + NVML_GPM_METRIC_NVLINK_L35_TX_PER_SEC = 247 NVML_GPM_METRIC_SM_CYCLES_ELAPSED = 248 NVML_GPM_METRIC_SM_CYCLES_ACTIVE = 249 NVML_GPM_METRIC_MMA_CYCLES_ACTIVE = 250 @@ -5103,6 +5212,42 @@ end NVML_GPM_METRIC_NVLINK_L16_TX = 294 NVML_GPM_METRIC_NVLINK_L17_RX = 295 NVML_GPM_METRIC_NVLINK_L17_TX = 296 + NVML_GPM_METRIC_NVLINK_L18_RX = 297 + NVML_GPM_METRIC_NVLINK_L18_TX = 298 + NVML_GPM_METRIC_NVLINK_L19_RX = 299 + NVML_GPM_METRIC_NVLINK_L19_TX = 300 + NVML_GPM_METRIC_NVLINK_L20_RX = 301 + NVML_GPM_METRIC_NVLINK_L20_TX = 302 + NVML_GPM_METRIC_NVLINK_L21_RX = 303 + NVML_GPM_METRIC_NVLINK_L21_TX = 304 + NVML_GPM_METRIC_NVLINK_L22_RX = 305 + NVML_GPM_METRIC_NVLINK_L22_TX = 306 + NVML_GPM_METRIC_NVLINK_L23_RX = 307 + NVML_GPM_METRIC_NVLINK_L23_TX = 308 + NVML_GPM_METRIC_NVLINK_L24_RX = 309 + NVML_GPM_METRIC_NVLINK_L24_TX = 310 + NVML_GPM_METRIC_NVLINK_L25_RX = 311 + NVML_GPM_METRIC_NVLINK_L25_TX = 312 + NVML_GPM_METRIC_NVLINK_L26_RX = 313 + NVML_GPM_METRIC_NVLINK_L26_TX = 314 + NVML_GPM_METRIC_NVLINK_L27_RX = 315 + NVML_GPM_METRIC_NVLINK_L27_TX = 316 + NVML_GPM_METRIC_NVLINK_L28_RX = 317 + NVML_GPM_METRIC_NVLINK_L28_TX = 318 + NVML_GPM_METRIC_NVLINK_L29_RX = 319 + NVML_GPM_METRIC_NVLINK_L29_TX = 320 + NVML_GPM_METRIC_NVLINK_L30_RX = 321 + NVML_GPM_METRIC_NVLINK_L30_TX = 322 + NVML_GPM_METRIC_NVLINK_L31_RX = 323 + NVML_GPM_METRIC_NVLINK_L31_TX = 324 + NVML_GPM_METRIC_NVLINK_L32_RX = 325 + NVML_GPM_METRIC_NVLINK_L32_TX = 326 + NVML_GPM_METRIC_NVLINK_L33_RX = 327 + NVML_GPM_METRIC_NVLINK_L33_TX = 328 + NVML_GPM_METRIC_NVLINK_L34_RX = 329 + NVML_GPM_METRIC_NVLINK_L34_TX = 330 + NVML_GPM_METRIC_NVLINK_L35_RX = 331 + NVML_GPM_METRIC_NVLINK_L35_TX = 332 NVML_GPM_METRIC_MAX = 333 end @@ -5375,6 +5520,12 @@ end errorCounts::Ptr{nvmlEccSramUniqueUncorrectedErrorCounts_t})::nvmlReturn_t end +@checked function nvmlDeviceGetRemappedRows_v2(device, info) + initialize_context() + @gcsafe_ccall libnvml.nvmlDeviceGetRemappedRows_v2(device::nvmlDevice_t, + info::Ptr{nvmlRemappedRowsInfo_v2_t})::nvmlReturn_t +end + @checked function nvmlDeviceSetRusdSettings_v1(device, settings) initialize_context() @gcsafe_ccall libnvml.nvmlDeviceSetRusdSettings_v1(device::nvmlDevice_t, @@ -5632,7 +5783,7 @@ const nvmlDeviceAddressingMode_v1 = @NVML_STRUCT_VERSION(DeviceAddressingMode, 1 const nvmlRepairStatus_v1 = @NVML_STRUCT_VERSION(RepairStatus, 1) -const NVML_NVLINK_MAX_LINKS = 18 +const NVML_NVLINK_MAX_LINKS = 36 const NVML_TOPOLOGY_CPU = NVML_TOPOLOGY_NODE @@ -5739,6 +5890,8 @@ const NVML_DEVICE_ARCH_HOPPER = 9 const NVML_DEVICE_ARCH_BLACKWELL = 10 +const NVML_DEVICE_ARCH_RUBIN = 13 + const NVML_DEVICE_ARCH_UNKNOWN = 0xffffffff const NVML_BUS_TYPE_UNKNOWN = 0 @@ -5803,6 +5956,10 @@ const NVML_POWER_SCOPE_MODULE = Cuint(1) const NVML_POWER_SCOPE_MEMORY = Cuint(2) +const NVML_POWER_SCOPE_GPU_BASE = Cuint(3) + +const NVML_POWER_SCOPE_COUNT = Cuint(4) + const nvmlPowerValue_v2 = @NVML_STRUCT_VERSION(PowerValue, 2) const NVML_GRID_LICENSE_EXPIRY_NOT_AVAILABLE = 0 @@ -6506,7 +6663,27 @@ const NVML_FI_DEV_NVLINK_PLR_XMIT_BLOCKS = 294 const NVML_FI_DEV_NVLINK_PLR_XMIT_RETRY_BLOCKS = 295 -const NVML_FI_MAX = 296 +const NVML_FI_DEV_NVLINK_GET_DATA_RATE = 296 + +const NVML_FI_DEV_MMA_STALL_PERCENT = 297 + +const NVML_FI_DEV_MCLK_SWITCH_TYPE = 298 + +const NVML_FI_DEV_MCLK_MIN_SWITCH_INTERVAL_MILLISECONDS = 299 + +const NVML_FI_PWR_SMOOTHING_SOC_POWER_SMOOTHING_ENABLED = 300 + +const NVML_FI_DEV_REMAPPED_ROWS_COR_INACTIVE = 301 + +const NVML_FI_DEV_REMAPPED_ROWS_UNC_INACTIVE = 302 + +const NVML_FI_MAX = 303 + +const NVML_MCLK_SWITCH_TYPE_NOT_SUPPORTED = 0x00 + +const NVML_MCLK_SWITCH_TYPE_DEFERRED = 0x01 + +const NVML_MCLK_SWITCH_TYPE_RUNTIME = 0x02 const NVML_NVLINK_LOW_POWER_THRESHOLD_UNIT_100US = 0x00 @@ -6802,6 +6979,8 @@ const NVML_DEVICE_SERIAL_BUFFER_SIZE = 30 const NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE = 32 +const NVML_CPER_CURSOR_HANDLE_INIT = nvmlCPERCursorHandle_t(0) + const nvmlSystemDriverBranchInfo_v1 = @NVML_STRUCT_VERSION(SystemDriverBranchInfo, 1) const NVML_AFFINITY_SCOPE_NODE = 0 @@ -6888,7 +7067,9 @@ const NVML_GPU_INSTANCE_PROFILE_1_SLICE_ALL_ME = 0x0f const NVML_GPU_INSTANCE_PROFILE_2_SLICE_ALL_ME = 0x10 -const NVML_GPU_INSTANCE_PROFILE_COUNT = 0x11 +const NVML_GPU_INSTANCE_PROFILE_3_SLICE_GFX = 0x11 + +const NVML_GPU_INSTANCE_PROFILE_COUNT = 0x12 const NVML_GPU_INSTANCE_PROFILE_CAPS_P2P = 0x01 @@ -6918,7 +7099,9 @@ const NVML_COMPUTE_INSTANCE_PROFILE_6_SLICE = 0x06 const NVML_COMPUTE_INSTANCE_PROFILE_1_SLICE_REV1 = 0x07 -const NVML_COMPUTE_INSTANCE_PROFILE_COUNT = 0x08 +const NVML_COMPUTE_INSTANCE_PROFILE_7_SLICE_NVL = 0x08 + +const NVML_COMPUTE_INSTANCE_PROFILE_COUNT = 0x09 const NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED = 0x00 diff --git a/res/wrap/Manifest.toml b/res/wrap/Manifest.toml index 835abe4c02..1bbb50a446 100644 --- a/res/wrap/Manifest.toml +++ b/res/wrap/Manifest.toml @@ -29,21 +29,21 @@ version = "3.4.3" [[deps.CUDA_Driver_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "TOML"] -git-tree-sha1 = "061f39cc84e99928830aa1005d79f7e99097ba28" +git-tree-sha1 = "2d6222474d868469a72de5bd47c5c25c0e1fe518" uuid = "4ee394cb-3365-5eb0-8335-949819d2adfc" -version = "13.2.0+0" +version = "13.3.0+0" [[deps.CUDA_Runtime_jll]] deps = ["Artifacts", "CUDA_Driver_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] -git-tree-sha1 = "af17d37b5b8b4d7525f8902eba1ef6141a9a7d3b" +git-tree-sha1 = "a648a4cb5c2e5ae8bf2f6ff1f6985f4c6ede2e8d" uuid = "76a88914-d11a-5bdc-97e0-2f5a05c973a2" -version = "0.21.0+0" +version = "0.23.0+0" [[deps.CUDA_SDK_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "c1fd6fe1a1a198afa28b425737f6afffe667096e" +git-tree-sha1 = "5ad75f2abd7e66fb090da859e03f0e36c2720aaf" uuid = "6cbf2f2e-7e60-5632-ac76-dca2274e0be0" -version = "13.2.0+0" +version = "13.3.0+0" [[deps.CUDNN_jll]] deps = ["Artifacts", "CUDA_Runtime_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] @@ -65,15 +65,15 @@ version = "0.19.3" [[deps.Clang_unified_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "TOML", "Zlib_jll", "Zstd_jll", "libLLVM_jll"] -git-tree-sha1 = "fffacfad25e1f33d5b9864ff723d9180bc930148" +git-tree-sha1 = "05e53efbcd4efa450a3bf343b62793436b2cc5d1" uuid = "ffc816e1-ba66-5fa9-9ecc-bcc5cb19bea1" -version = "0.1.4+0" +version = "0.1.5+1" [[deps.CommonMark]] deps = ["PrecompileTools"] -git-tree-sha1 = "65ea18ada9814f09c5013924c42fe8b53d6ee467" +git-tree-sha1 = "019ad9e55bb3549403f2d5a9b314fbb29a806ecb" uuid = "a80b9123-70ca-4bc0-993e-6e3bcb318db6" -version = "0.10.3" +version = "1.0.1" [deps.CommonMark.extensions] CommonMarkMarkdownASTExt = "MarkdownAST" @@ -103,21 +103,21 @@ uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee" version = "1.11.0" [[deps.Glob]] -git-tree-sha1 = "83cb0092e2792b9e3a865b6655e88f5b862607e2" +git-tree-sha1 = "246c628cec062230b7d183aab88841fa94fcabe9" uuid = "c27321d9-0574-5035-807b-f59d2c89b15c" -version = "1.4.0" +version = "1.5.0" [[deps.JLLWrappers]] deps = ["Artifacts", "Preferences"] -git-tree-sha1 = "0533e564aae234aff59ab625543145446d8b6ec2" +git-tree-sha1 = "7204148362dafe5fe6a273f855b8ccbe4df8173e" uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" -version = "1.7.1" +version = "1.8.0" [[deps.JuliaFormatter]] deps = ["CommonMark", "Glob", "JuliaSyntax", "PrecompileTools", "TOML"] -git-tree-sha1 = "ec5e99ad66c51b6fcd3dbab6655b17118a29a588" +git-tree-sha1 = "734a648c54e0512112049150965e85c887ae7fa3" uuid = "98e50ef6-434e-11e9-1051-2b60c6c9e899" -version = "2.3.0" +version = "2.4.0" [[deps.JuliaSyntax]] git-tree-sha1 = "937da4713526b96ac9a178e2035019d3b78ead4a" @@ -204,9 +204,9 @@ version = "1.12.1" [[deps.PrecompileTools]] deps = ["Preferences"] -git-tree-sha1 = "07a921781cab75691315adc645096ed5e370cb77" +git-tree-sha1 = "edbeefc7a4889f528644251bdb5fc9ab5348bc2c" uuid = "aea7be01-6a6a-4083-8856-8a6e6704d82a" -version = "1.3.3" +version = "1.3.4" [[deps.Preferences]] deps = ["TOML"] diff --git a/res/wrap/custatevec.toml b/res/wrap/custatevec.toml index 1aef3fe7e6..3b63c0e646 100644 --- a/res/wrap/custatevec.toml +++ b/res/wrap/custatevec.toml @@ -67,9 +67,6 @@ needs_context = false [api.custatevecBatchMeasure.argtypes] 2 = "CuPtr{Cvoid}" -[api.custatevecMeasureBatched.argtypes] -2 = "CuPtr{Cvoid}" - [api.custatevecBatchMeasureWithOffset.argtypes] 2 = "CuPtr{Cvoid}" From 953b6e20ae38a128c59aa84299c85b8fec683dd5 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 27 May 2026 08:58:36 +0200 Subject: [PATCH 03/10] Bump compat databases. --- CUDACore/src/compatibility.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CUDACore/src/compatibility.jl b/CUDACore/src/compatibility.jl index 0f512a0255..d6dea25b25 100644 --- a/CUDACore/src/compatibility.jl +++ b/CUDACore/src/compatibility.jl @@ -133,6 +133,7 @@ const ptxas_ptx_db = Dict( v"9.0" => between(v"13.0", highest), v"9.1" => between(v"13.1", highest), v"9.2" => between(v"13.2", highest), + v"9.3" => between(v"13.3", highest), ) function ptxas_ptx_support(ver::VersionNumber) @@ -296,7 +297,8 @@ const llvm_ptx_db = Dict( v"8.7" => between(v"20", highest), v"8.8" => between(v"21", highest), v"9.0" => between(v"22", highest), - v"9.1" => between(v"24", highest), + v"9.1" => between(v"23", highest), + v"9.2" => between(v"23", highest), ) function llvm_ptx_support(ver::VersionNumber) From f525aae6c2b0b5191984b6c46ab075733a337a91 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 27 May 2026 09:22:56 +0200 Subject: [PATCH 04/10] cuFFT: fix error descriptions for the 13.x result enum CUDA 13.0 removed CUFFT_INCOMPLETE_PARAMETER_LIST, CUFFT_PARSE_ERROR and CUFFT_LICENSE_ERROR from cufftResult. Since the bindings are regenerated against 13.3, those names no longer exist, so description() threw an UndefVarError for any error code that fell through to them. Drop the dead branches and add descriptions for the new codes (CUFFT_MISSING_DEPENDENCY, CUFFT_NVRTC_FAILURE, CUFFT_NVJITLINK_FAILURE, CUFFT_NVSHMEM_FAILURE). Co-Authored-By: Claude Opus 4.7 (1M context) --- lib/cufft/src/error.jl | 14 ++++++++------ lib/cufft/src/wrappers.jl | 4 ++-- lib/cufft/test/errors.jl | 10 ++++++++++ 3 files changed, 20 insertions(+), 8 deletions(-) create mode 100644 lib/cufft/test/errors.jl diff --git a/lib/cufft/src/error.jl b/lib/cufft/src/error.jl index 15b212b798..5dfc6436d6 100644 --- a/lib/cufft/src/error.jl +++ b/lib/cufft/src/error.jl @@ -33,20 +33,22 @@ function description(err::CUFFTError) "user specified an invalid transform size" elseif err.code == CUFFT_UNALIGNED_DATA "cuFFT unaligned data" # No longer used - elseif err.code == CUFFT_INCOMPLETE_PARAMETER_LIST - "missing parameters in call" elseif err.code == CUFFT_INVALID_DEVICE "execution of a plan was on different GPU than plan creation" - elseif err.code == CUFFT_PARSE_ERROR - "internal plan database error" elseif err.code == CUFFT_NO_WORKSPACE "no workspace has been provided prior to plan execution" elseif err.code == CUFFT_NOT_IMPLEMENTED "function does not implement functionality for parameters given." - elseif err.code == CUFFT_LICENSE_ERROR - "cuFFT license error" # Used in previous versions. elseif err.code == CUFFT_NOT_SUPPORTED "operation is not supported for parameters given." + elseif err.code == CUFFT_MISSING_DEPENDENCY + "a required runtime dependency was not found" + elseif err.code == CUFFT_NVRTC_FAILURE + "NVRTC failed to compile a required kernel" + elseif err.code == CUFFT_NVJITLINK_FAILURE + "nvJitLink failed to link a required kernel" + elseif err.code == CUFFT_NVSHMEM_FAILURE + "an NVSHMEM operation failed" else "no description for this error" end diff --git a/lib/cufft/src/wrappers.jl b/lib/cufft/src/wrappers.jl index ad2ec12f8d..959194b704 100644 --- a/lib/cufft/src/wrappers.jl +++ b/lib/cufft/src/wrappers.jl @@ -30,8 +30,8 @@ function cufftMakePlan(output_type::Type{<:cufftNumber}, input_type::Type{<:cuff end # cuFFT half-precision transforms require all transform dim sizes to be # powers of 2 (NVIDIA cuFFT docs, "Half-precision transforms"). Catch this - # up-front so the user sees a Julia error rather than a bare - # CUFFT_INCOMPLETE_PARAMETER_LIST from inside the wrapper. + # up-front so the user sees a Julia error rather than a bare cuFFT error + # from inside the wrapper. if input_type <: Union{Float16, Complex{Float16}} || output_type <: Union{Float16, Complex{Float16}} for d in region diff --git a/lib/cufft/test/errors.jl b/lib/cufft/test/errors.jl new file mode 100644 index 0000000000..92bb5ccdee --- /dev/null +++ b/lib/cufft/test/errors.jl @@ -0,0 +1,10 @@ +@testset "error descriptions" begin + # every result code must have a printable description; this guards against + # referencing enum values that no longer exist in the headers (regression). + for code in instances(cuFFT.cufftResult) + err = cuFFT.CUFFTError(code) + msg = sprint(showerror, err) + @test occursin(string(code), msg) + @test !isempty(cuFFT.description(err)) + end +end From 3a4ffd70ec7f16dfd1d078b10132ffe9ff4792e2 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 27 May 2026 09:25:49 +0200 Subject: [PATCH 05/10] CUDADrv: use cuEventElapsedTime_v2 when available cuEventElapsedTime_v2 (CUDA 12.8+) supersedes the now-deprecated v1 entry point with improved accuracy and argument validation. Branch on driver_version() so we call it on new enough drivers and keep the v1 fallback otherwise. Covered by the existing "events" testset. Co-Authored-By: Claude Opus 4.7 (1M context) --- CUDACore/lib/cudadrv/events.jl | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/CUDACore/lib/cudadrv/events.jl b/CUDACore/lib/cudadrv/events.jl index 9b36cb6384..2912488206 100644 --- a/CUDACore/lib/cudadrv/events.jl +++ b/CUDACore/lib/cudadrv/events.jl @@ -85,7 +85,13 @@ Computes the elapsed time between two events (in seconds). """ function elapsed(start::CuEvent, stop::CuEvent) time_ref = Ref{Cfloat}() - cuEventElapsedTime(time_ref, start, stop) + # cuEventElapsedTime_v2 (CUDA 12.8+) has improved accuracy and validation; + # fall back to the deprecated v1 entry point on older drivers. + if driver_version() >= v"12.8" + cuEventElapsedTime_v2(time_ref, start, stop) + else + cuEventElapsedTime(time_ref, start, stop) + end return time_ref[]/1000 end From 1e962784c0f89adb8c89d7b7e35f34f68f75f740 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 27 May 2026 09:31:11 +0200 Subject: [PATCH 06/10] cuSPARSE: use the generic SpGEAM API for sparse addition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cuSPARSE 12.8.1 (CUDA 13.3) added the generic SpGEAM API for C = αA + βB, replacing the type-specific csrgeam2 routines. Prefer it when available and keep csrgeam2 as the fallback for older versions. Also fix the generated SpGEAM bindings: the device workspace was typed as a host Ptr{Cvoid} (it must be CuPtr{Cvoid}), and the alpha/beta scalars are now PtrOrCuPtr{Cvoid} to match the other generic APIs. Fixed in res/wrap too. Covered by the existing geam tests in interfaces/mul.jl. Co-Authored-By: Claude Opus 4.7 (1M context) --- lib/cusparse/src/extra.jl | 49 ++++++++++++++++++++++++++++++++- lib/cusparse/src/helpers.jl | 14 ++++++++++ lib/cusparse/src/libcusparse.jl | 18 ++++++------ res/wrap/cusparse.toml | 15 ++++++++++ 4 files changed, 86 insertions(+), 10 deletions(-) diff --git a/lib/cusparse/src/extra.jl b/lib/cusparse/src/extra.jl index 4cbbcd103e..09486bc8a5 100644 --- a/lib/cusparse/src/extra.jl +++ b/lib/cusparse/src/extra.jl @@ -7,12 +7,59 @@ Performs `C = alpha * A + beta * B`. `A` and `B` are sparse matrices defined in """ geam(alpha::Number, A::CuSparseMatrixCSR, beta::Number, B::CuSparseMatrixCSR, index::SparseChar) +function geam(alpha::Number, A::CuSparseMatrixCSR{T}, beta::Number, B::CuSparseMatrixCSR{T}, index::SparseChar) where {T <: BlasFloat} + # the generic SpGEAM API was added in cuSPARSE 12.8.1 (CUDA 13.3); use the + # legacy csrgeam2 path on older versions. + version() >= v"12.8.1" ? geam_spgeam(alpha, A, beta, B, index) : + geam_csrgeam2(alpha, A, beta, B, index) +end + +function geam_spgeam(alpha::Number, A::CuSparseMatrixCSR{T}, beta::Number, B::CuSparseMatrixCSR{T}, index::SparseChar) where {T <: BlasFloat} + m, n = size(A) + (m, n) == size(B) || throw(DimensionMismatch("dimensions must match: A has dims $(size(A)), B has dims $(size(B))")) + + alpha_ref = Ref{T}(convert(T, alpha)) + beta_ref = Ref{T}(convert(T, beta)) + descA = CuSparseMatrixDescriptor(A, index) + descB = CuSparseMatrixDescriptor(B, index) + + rowPtrC = CuVector{Cint}(undef, m + 1) + descC = CuSparseMatrixDescriptor(CuSparseMatrixCSR, rowPtrC, T, Cint, m, n, index) + spgeam_desc = CuSpGEAMDescriptor() + + local C + GC.@preserve rowPtrC begin + function bufferSize() + out = Ref{Csize_t}() + cusparseSpGEAM_bufferSize(handle(), 'N', 'N', alpha_ref, descA, beta_ref, descB, + descC, T, CUSPARSE_SPGEAM_ALG_DEFAULT, spgeam_desc, out) + return out[] + end + with_workspace(bufferSize) do buffer + # determine the sparsity pattern (fills rowPtrC) and the number of nonzeros + cusparseSpGEAM_nnz(handle(), 'N', 'N', alpha_ref, descA, beta_ref, descB, + descC, T, CUSPARSE_SPGEAM_ALG_DEFAULT, spgeam_desc, buffer) + nnzC = Ref{Int64}() + cusparseSpMatGetSize(descC, Ref{Int64}(), Ref{Int64}(), nnzC) + + colValC = CuVector{Cint}(undef, nnzC[]) + nzValC = CuVector{T}(undef, nnzC[]) + C = CuSparseMatrixCSR{T, Cint}(rowPtrC, colValC, nzValC, (m, n)) + cusparseCsrSetPointers(descC, C.rowPtr, C.colVal, C.nzVal) + + cusparseSpGEAM(handle(), 'N', 'N', alpha_ref, descA, beta_ref, descB, + descC, T, CUSPARSE_SPGEAM_ALG_DEFAULT, spgeam_desc, buffer) + end + end + return C +end + for (bname,gname,elty) in ((:cusparseScsrgeam2_bufferSizeExt, :cusparseScsrgeam2, :Float32), (:cusparseDcsrgeam2_bufferSizeExt, :cusparseDcsrgeam2, :Float64), (:cusparseCcsrgeam2_bufferSizeExt, :cusparseCcsrgeam2, :ComplexF32), (:cusparseZcsrgeam2_bufferSizeExt, :cusparseZcsrgeam2, :ComplexF64)) @eval begin - function geam(alpha::Number, A::CuSparseMatrixCSR{$elty}, beta::Number, B::CuSparseMatrixCSR{$elty}, index::SparseChar) + function geam_csrgeam2(alpha::Number, A::CuSparseMatrixCSR{$elty}, beta::Number, B::CuSparseMatrixCSR{$elty}, index::SparseChar) m, n = size(A) (m, n) == size(B) || throw(DimensionMismatch("dimensions must match: A has dims $(size(A)), B has dims $(size(B))")) descrA = CuMatrixDescriptor('G', 'L', 'N', index) diff --git a/lib/cusparse/src/helpers.jl b/lib/cusparse/src/helpers.jl index 55d8326247..fd0ddb2546 100644 --- a/lib/cusparse/src/helpers.jl +++ b/lib/cusparse/src/helpers.jl @@ -272,6 +272,20 @@ end Base.unsafe_convert(::Type{cusparseSpGEMMDescr_t}, desc::CuSpGEMMDescriptor) = desc.handle +mutable struct CuSpGEAMDescriptor + handle::cusparseSpGEAMDescr_t + + function CuSpGEAMDescriptor() + descr_ref = Ref{cusparseSpGEAMDescr_t}() + cusparseSpGEAM_createDescr(descr_ref) + obj = new(descr_ref[]) + finalizer(cusparseSpGEAM_destroyDescr, obj) + obj + end +end + +Base.unsafe_convert(::Type{cusparseSpGEAMDescr_t}, desc::CuSpGEAMDescriptor) = desc.handle + mutable struct CuSparseSpSVDescriptor handle::cusparseSpSVDescr_t diff --git a/lib/cusparse/src/libcusparse.jl b/lib/cusparse/src/libcusparse.jl index 9957dc5d1c..8183b49986 100644 --- a/lib/cusparse/src/libcusparse.jl +++ b/lib/cusparse/src/libcusparse.jl @@ -6247,15 +6247,15 @@ end @gcsafe_ccall libcusparse.cusparseSpGEAM_bufferSize(handle::cusparseHandle_t, opA::cusparseOperation_t, opB::cusparseOperation_t, - alpha::Ptr{Cvoid}, + alpha::PtrOrCuPtr{Cvoid}, matA::cusparseSpMatDescr_t, - beta::Ptr{Cvoid}, + beta::PtrOrCuPtr{Cvoid}, matB::cusparseSpMatDescr_t, matC::cusparseSpMatDescr_t, computeType::cudaDataType, alg::cusparseSpGEAMAlg_t, spgeamDescr::cusparseSpGEAMDescr_t, - bufferSize::Ptr{Csize_t})::cusparseStatus_t + bufferSize::Ref{Csize_t})::cusparseStatus_t end @checked function cusparseSpGEAM_nnz(handle, opA, opB, alpha, matA, beta, matB, matC, @@ -6264,15 +6264,15 @@ end @gcsafe_ccall libcusparse.cusparseSpGEAM_nnz(handle::cusparseHandle_t, opA::cusparseOperation_t, opB::cusparseOperation_t, - alpha::Ptr{Cvoid}, + alpha::PtrOrCuPtr{Cvoid}, matA::cusparseSpMatDescr_t, - beta::Ptr{Cvoid}, + beta::PtrOrCuPtr{Cvoid}, matB::cusparseSpMatDescr_t, matC::cusparseSpMatDescr_t, computeType::cudaDataType, alg::cusparseSpGEAMAlg_t, spgeamDescr::cusparseSpGEAMDescr_t, - externalBuffer::Ptr{Cvoid})::cusparseStatus_t + externalBuffer::CuPtr{Cvoid})::cusparseStatus_t end @checked function cusparseSpGEAM(handle, opA, opB, alpha, matA, beta, matB, matC, @@ -6280,14 +6280,14 @@ end initialize_context() @gcsafe_ccall libcusparse.cusparseSpGEAM(handle::cusparseHandle_t, opA::cusparseOperation_t, - opB::cusparseOperation_t, alpha::Ptr{Cvoid}, - matA::cusparseSpMatDescr_t, beta::Ptr{Cvoid}, + opB::cusparseOperation_t, alpha::PtrOrCuPtr{Cvoid}, + matA::cusparseSpMatDescr_t, beta::PtrOrCuPtr{Cvoid}, matB::cusparseSpMatDescr_t, matC::cusparseSpMatDescr_t, computeType::cudaDataType, alg::cusparseSpGEAMAlg_t, spgeamDescr::cusparseSpGEAMDescr_t, - externalBuffer::Ptr{Cvoid})::cusparseStatus_t + externalBuffer::CuPtr{Cvoid})::cusparseStatus_t end @cenum cusparseSDDMMAlg_t::UInt32 begin diff --git a/res/wrap/cusparse.toml b/res/wrap/cusparse.toml index 4292493f9d..b324cf5fba 100644 --- a/res/wrap/cusparse.toml +++ b/res/wrap/cusparse.toml @@ -920,6 +920,21 @@ needs_context = false 4 = "PtrOrCuPtr{Cvoid}" 7 = "PtrOrCuPtr{Cvoid}" +[api.cusparseSpGEAM_bufferSize.argtypes] +4 = "PtrOrCuPtr{Cvoid}" +6 = "PtrOrCuPtr{Cvoid}" +12 = "Ref{Csize_t}" + +[api.cusparseSpGEAM_nnz.argtypes] +4 = "PtrOrCuPtr{Cvoid}" +6 = "PtrOrCuPtr{Cvoid}" +12 = "CuPtr{Cvoid}" + +[api.cusparseSpGEAM.argtypes] +4 = "PtrOrCuPtr{Cvoid}" +6 = "PtrOrCuPtr{Cvoid}" +12 = "CuPtr{Cvoid}" + [api.cusparseConstrainedGeMM.argtypes] 4 = "PtrOrCuPtr{Cvoid}" 7 = "PtrOrCuPtr{Cvoid}" From 671d6c593e1f8ca2d288f42dc4abe0b381734e0e Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 27 May 2026 09:33:36 +0200 Subject: [PATCH 07/10] cuSPARSE: use native CSC in SpSV/SpSM when available cuSPARSE 12.8.1 (CUDA 13.3) added native CSC support to the triangular solve APIs. Use it instead of modelling a CSC matrix as its transposed CSR on new enough versions; the workaround couldn't represent transa = 'C', so the adjoint of a complex CSC matrix now works too. Relax the corresponding test skips accordingly. Co-Authored-By: Claude Opus 4.7 (1M context) --- lib/cusparse/src/generic.jl | 16 ++++++++++------ lib/cusparse/test/generic.jl | 6 ++++-- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/lib/cusparse/src/generic.jl b/lib/cusparse/src/generic.jl index e0d10e4a70..3470b20186 100644 --- a/lib/cusparse/src/generic.jl +++ b/lib/cusparse/src/generic.jl @@ -725,7 +725,10 @@ function sv!(transa::SparseChar, uplo::SparseChar, diag::SparseChar, # Support transa = 'C' for real matrices transa = T <: Real && transa == 'C' ? 'T' : transa - if isa(A, CuSparseMatrixCSC) && transa == 'C' && T <: Complex + # native CSC support was added in cuSPARSE 12.8.1 (CUDA 13.3); older versions + # model a CSC matrix as its transposed CSR, which cannot represent transa = 'C'. + csc_workaround = isa(A, CuSparseMatrixCSC) && version() < v"12.8.1" + if csc_workaround && transa == 'C' && T <: Complex throw(ArgumentError("Backward and forward sweeps with the adjoint of a complex CSC matrix is not supported. Use a CSR or COO matrix instead.")) end @@ -737,8 +740,7 @@ function sv!(transa::SparseChar, uplo::SparseChar, diag::SparseChar, (mX != mA) && throw(DimensionMismatch("X must have length $mA, but has length $mX")) (mY != mA) && throw(DimensionMismatch("Y must have length $mA, but has length $mY")) - if isa(A, CuSparseMatrixCSC) - # cusparseSpSV doesn't support CSC matrices so we use Aᵀ to model them as CSR matrices. + if csc_workaround descA = CuSparseMatrixDescriptor(A, index, transposed=true) transa = transa == 'N' ? 'T' : 'N' uplo = uplo == 'U' ? 'L' : 'U' @@ -780,7 +782,10 @@ function sm!(transa::SparseChar, transb::SparseChar, uplo::SparseChar, diag::Spa # In that case we need to update the descriptor of C such that it represents Bᵀ. is_C_transposed = (B === C) && (transb != 'N') - if isa(A, CuSparseMatrixCSC) && transa == 'C' && T <: Complex + # native CSC support was added in cuSPARSE 12.8.1 (CUDA 13.3); older versions + # model a CSC matrix as its transposed CSR, which cannot represent transa = 'C'. + csc_workaround = isa(A, CuSparseMatrixCSC) && version() < v"12.8.1" + if csc_workaround && transa == 'C' && T <: Complex throw(ArgumentError("Backward and forward sweeps with the adjoint of a complex CSC matrix is not supported. Use a CSR or COO matrix instead.")) end @@ -795,8 +800,7 @@ function sm!(transa::SparseChar, transb::SparseChar, uplo::SparseChar, diag::Spa (nB != nC) && (transb == 'N') && throw(DimensionMismatch("B and C must have the same number of columns, but B has $nB columns and C has $nC columns")) (mB != nC) && (transb != 'N') && throw(DimensionMismatch("B must have the same the number of rows that C has as columns, but B has $mB rows and C has $nC columns")) - if isa(A, CuSparseMatrixCSC) - # cusparseSpSM doesn't support CSC matrices so we use Aᵀ to model them as CSR matrices. + if csc_workaround descA = CuSparseMatrixDescriptor(A, index, transposed=true) transa = transa == 'N' ? 'T' : 'N' uplo = uplo == 'U' ? 'L' : 'U' diff --git a/lib/cusparse/test/generic.jl b/lib/cusparse/test/generic.jl index e441b3bc85..146f9b7ce4 100644 --- a/lib/cusparse/test/generic.jl +++ b/lib/cusparse/test/generic.jl @@ -196,7 +196,8 @@ for SparseMatrixType in [CuSparseMatrixCSC, CuSparseMatrixCSR, CuSparseMatrixCOO @testset "$SparseMatrixType -- sv! algo=$algo" for algo in SPSV_ALGOS[SparseMatrixType] @testset "sv! $T" for T in [Float64, ComplexF64] @testset "transa = $transa" for (transa, opa) in [('N', identity), ('T', transpose), ('C', adjoint)] - SparseMatrixType == CuSparseMatrixCSC && T <: Complex && transa == 'C' && continue + # adjoint of a complex CSC matrix needs native CSC support (cuSPARSE 12.8.1+) + SparseMatrixType == CuSparseMatrixCSC && T <: Complex && transa == 'C' && cuSPARSE.version() < v"12.8.1" && continue @testset "uplo = $uplo" for uplo in ('L', 'U') @testset "diag = $diag" for diag in ('U', 'N') A = rand(T, 10, 10) @@ -220,7 +221,8 @@ for SparseMatrixType in [CuSparseMatrixCSC, CuSparseMatrixCSR, CuSparseMatrixCOO @testset "$SparseMatrixType -- sm! algo=$algo" for algo in SPSM_ALGOS[SparseMatrixType] @testset "sm! $T" for T in [Float64, ComplexF64] @testset "transa = $transa" for (transa, opa) in [('N', identity), ('T', transpose), ('C', adjoint)] - SparseMatrixType == CuSparseMatrixCSC && T <: Complex && transa == 'C' && continue + # adjoint of a complex CSC matrix needs native CSC support (cuSPARSE 12.8.1+) + SparseMatrixType == CuSparseMatrixCSC && T <: Complex && transa == 'C' && cuSPARSE.version() < v"12.8.1" && continue @testset "transb = $transb" for (transb, opb) in [('N', identity), ('T', transpose)] @testset "uplo = $uplo" for uplo in ('L', 'U') @testset "diag = $diag" for diag in ('U', 'N') From 1cf51587e94eadec74871fd90c456809ae94c818 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 27 May 2026 09:53:24 +0200 Subject: [PATCH 08/10] cuBLAS: expose FP64 fixed-point and BF16x9 emulation math modes CUDA added tensor-core emulation of higher precisions: BF16x9 reproduces full FP32 accuracy (cuBLAS 12.9+) and the Ozaki fixed-point scheme emulates FP64 (cuBLAS 13.1+, i.e. CUDA 13.0 Update 2). Expose them through the existing `math_mode!`/`math_precision` mechanism: under FAST_MATH, a `:BFloat16x9` precision selects FP32 emulation and `:FixedPoint` selects FP64 emulation. The math mode is applied to the cuBLAS handle (covering plain GEMMs) and the matching compute types are returned from gemmExComputeType (covering gemmEx!); the handle now also re-applies when the precision alone changes. Version gates use the cuBLAS library version, which does not track the toolkit version (CUDA 13.0u2 ships cuBLAS 13.1.0, CUDA 13.3 ships 13.5.1). Co-Authored-By: Claude Opus 4.7 (1M context) --- lib/cublas/src/cuBLAS.jl | 30 +++++++++++++++++++----------- lib/cublas/src/wrappers.jl | 24 +++++++++++++++++++----- lib/cublas/test/level3/gemm.jl | 21 +++++++++++++++++++++ 3 files changed, 59 insertions(+), 16 deletions(-) diff --git a/lib/cublas/src/cuBLAS.jl b/lib/cublas/src/cuBLAS.jl index ab4c82e2d9..3c2a9482b4 100644 --- a/lib/cublas/src/cuBLAS.jl +++ b/lib/cublas/src/cuBLAS.jl @@ -46,7 +46,7 @@ include("wrappers.jl") # high-level integrations include("linalg.jl") -function math_mode!(handle, mode) +function math_mode!(handle, mode, precision=CUDACore.math_precision()) flags = 0 # https://github.com/facebookresearch/faiss/issues/1385 @@ -58,8 +58,16 @@ function math_mode!(handle, mode) elseif mode == CUDACore.DEFAULT_MATH CUBLAS_DEFAULT_MATH elseif mode == CUDACore.FAST_MATH - # we'll additionally select a compute-mode with reduced precision whenever possible - CUBLAS_TF32_TENSOR_OP_MATH + # downcast to a reduced-precision compute mode whenever possible; the + # emulation modes additionally engage matching emulated kernels for + # plain (non-gemmEx) GEMMs. See also `gemmExComputeType`. + if precision === :BFloat16x9 && version() >= v"12.9" + CUBLAS_FP32_EMULATED_BF16X9_MATH + elseif precision === :FixedPoint && version() >= v"13.1" + CUBLAS_FP64_EMULATED_FIXEDPOINT_MATH + else + CUBLAS_TF32_TENSOR_OP_MATH + end end cublasSetMathMode(handle, cublasMath_t(flags)) @@ -96,7 +104,7 @@ function handle_finalizer(h::Handle) push!(idle_handles, h.ctx, h.handle) end -const LibraryState = @NamedTuple{handle::Handle, stream::CuStream, math_mode::CUDACore.MathMode} +const LibraryState = @NamedTuple{handle::Handle, stream::CuStream, math_mode::CUDACore.MathMode, math_precision::Symbol} const state_cache = CUDACore.TaskLocalCache{CuContext, LibraryState}(:CUBLAS) function handle() @@ -112,9 +120,9 @@ function handle() cublasSetStream_v2(new_handle, cuda.stream) cublasSetPointerMode_v2(new_handle, CUBLAS_POINTER_MODE_DEVICE) - math_mode!(new_handle, cuda.math_mode) + math_mode!(new_handle, cuda.math_mode, cuda.math_precision) - (; handle=wrapped, cuda.stream, cuda.math_mode) + (; handle=wrapped, cuda.stream, cuda.math_mode, cuda.math_precision) end state = get!(states, cuda.context) do new_state(cuda) @@ -123,18 +131,18 @@ function handle() # update stream @noinline function update_stream(cuda, state) cublasSetStream_v2(state.handle, cuda.stream) - (; state.handle, stream=cuda.stream, state.math_mode) + (; state.handle, stream=cuda.stream, state.math_mode, state.math_precision) end if state.stream != cuda.stream states[cuda.context] = state = update_stream(cuda, state) end - # update math mode + # update math mode (the precision feeds into the emulation math modes) @noinline function update_math_mode(cuda, state) - math_mode!(state.handle, cuda.math_mode) - (; state.handle, state.stream, math_mode=cuda.math_mode) + math_mode!(state.handle, cuda.math_mode, cuda.math_precision) + (; state.handle, state.stream, math_mode=cuda.math_mode, math_precision=cuda.math_precision) end - if state.math_mode != cuda.math_mode + if state.math_mode != cuda.math_mode || state.math_precision != cuda.math_precision states[cuda.context] = state = update_math_mode(cuda, state) end diff --git a/lib/cublas/src/wrappers.jl b/lib/cublas/src/wrappers.jl index 2effe8179e..d398039716 100644 --- a/lib/cublas/src/wrappers.jl +++ b/lib/cublas/src/wrappers.jl @@ -50,9 +50,10 @@ function juliaStorageType(T::Type{<:Real}, ct::cublasComputeType_t) return T == BFloat16 ? BFloat16 : Float16 elseif ct == CUBLAS_COMPUTE_32F || ct == CUBLAS_COMPUTE_32F_PEDANTIC || ct == CUBLAS_COMPUTE_32F_FAST_16F || ct == CUBLAS_COMPUTE_32F_FAST_16BF || - ct == CUBLAS_COMPUTE_32F_FAST_TF32 + ct == CUBLAS_COMPUTE_32F_FAST_TF32 || ct == CUBLAS_COMPUTE_32F_EMULATED_16BFX9 return Float32 - elseif ct == CUBLAS_COMPUTE_64F || ct == CUBLAS_COMPUTE_64F_PEDANTIC + elseif ct == CUBLAS_COMPUTE_64F || ct == CUBLAS_COMPUTE_64F_PEDANTIC || + ct == CUBLAS_COMPUTE_64F_EMULATED_FIXEDPOINT return Float64 elseif ct == CUBLAS_COMPUTE_32I || ct == CUBLAS_COMPUTE_32I_PEDANTIC return Int32 @@ -66,9 +67,10 @@ function juliaStorageType(T::Type{<:Complex}, ct::cublasComputeType_t) return T == Complex{BFloat16} == Complex{BFloat16} : Complex{Float16} elseif ct == CUBLAS_COMPUTE_32F || ct == CUBLAS_COMPUTE_32F_PEDANTIC || ct == CUBLAS_COMPUTE_32F_FAST_16F || ct == CUBLAS_COMPUTE_32F_FAST_16BF || - ct == CUBLAS_COMPUTE_32F_FAST_TF32 + ct == CUBLAS_COMPUTE_32F_FAST_TF32 || ct == CUBLAS_COMPUTE_32F_EMULATED_16BFX9 return Complex{Float32} - elseif ct == CUBLAS_COMPUTE_64F || ct == CUBLAS_COMPUTE_64F_PEDANTIC + elseif ct == CUBLAS_COMPUTE_64F || ct == CUBLAS_COMPUTE_64F_PEDANTIC || + ct == CUBLAS_COMPUTE_64F_EMULATED_FIXEDPOINT return Complex{Float64} else throw(ArgumentError("Julia type equivalent for compute type $ct does not exist!")) @@ -1199,11 +1201,23 @@ function gemmExComputeType(TA, TB, TC, m, k, n) return CUBLAS_COMPUTE_32F_FAST_16F elseif reduced_precision === :BFloat16 return CUBLAS_COMPUTE_32F_FAST_16BF + elseif reduced_precision === :BFloat16x9 + # BF16x9 emulates full FP32 accuracy via tensor cores (cuBLAS 12.9+) + version() >= v"12.9" || + throw(ArgumentError("BFloat16x9 emulation requires cuBLAS 12.9 or higher")) + return CUBLAS_COMPUTE_32F_EMULATED_16BFX9 elseif reduced_precision === :TensorFloat32 return CUBLAS_COMPUTE_32F_FAST_TF32 - else + elseif reduced_precision !== :FixedPoint # FixedPoint only applies to Float64 throw(ArgumentError("Unknown reduced precision type $reduced_precision")) end + elseif (sig === (Float64, Float64) || + sig === (Complex{Float64}, Complex{Float64})) && + reduced_precision === :FixedPoint + # fixed-point (Ozaki) emulation of FP64 on tensor cores (cuBLAS 13.1+) + version() >= v"13.1" || + throw(ArgumentError("FixedPoint emulation requires cuBLAS 13.1 or higher")) + return CUBLAS_COMPUTE_64F_EMULATED_FIXEDPOINT end end diff --git a/lib/cublas/test/level3/gemm.jl b/lib/cublas/test/level3/gemm.jl index c64d777260..65a299bada 100644 --- a/lib/cublas/test/level3/gemm.jl +++ b/lib/cublas/test/level3/gemm.jl @@ -504,6 +504,27 @@ k = 13 @test C ≈ Array(dC) rtol=rtol end end + # emulation math modes: BF16x9 reproduces full FP32 accuracy (cuBLAS 12.9+), + # fixed-point emulates FP64 on tensor cores (cuBLAS 13.0+). + if cuBLAS.version() >= v"12.9" + for (AT, CT) in ((Float32, Float32), (ComplexF32, ComplexF32)) + CUDACore.math_mode!(CUDACore.FAST_MATH; precision=:BFloat16x9) + A = rand(AT, m, k); B = rand(AT, k, n) + dC = similar(CuArray(B), CT) + mul!(dC, CuArray(A), CuArray(B)) + @test A*B ≈ Array(dC) rtol=Base.rtoldefault(AT, AT, 0) + end + end + if cuBLAS.version() >= v"13.1" + for (AT, CT) in ((Float64, Float64), (ComplexF64, ComplexF64)) + CUDACore.math_mode!(CUDACore.FAST_MATH; precision=:FixedPoint) + A = rand(AT, m, k); B = rand(AT, k, n) + dC = similar(CuArray(B), CT) + mul!(dC, CuArray(A), CuArray(B)) + @test A*B ≈ Array(dC) rtol=Base.rtoldefault(AT, AT, 0) + end + end + CUDACore.math_mode!(CUDACore.FAST_MATH; precision = :Bad) @test_throws ArgumentError("Unknown reduced precision type Bad") cuBLAS.gemmExComputeType(Float32, Float32, Float32, m, k, n) finally From a7904cb265e94eec68c08235a44b2dba7460dbe0 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 27 May 2026 11:52:11 +0200 Subject: [PATCH 09/10] Remove test that passes now. --- test/core/cudadrv.jl | 7 ------- 1 file changed, 7 deletions(-) diff --git a/test/core/cudadrv.jl b/test/core/cudadrv.jl index acb1511dc6..4df1f3fef1 100644 --- a/test/core/cudadrv.jl +++ b/test/core/cudadrv.jl @@ -775,13 +775,6 @@ end add_data!(link, "vadd_parent", read(f, String)) end @test_throws ArgumentError add_data!(link, "vadd_parent", "\0") - - # object code - # TODO: test with valid object code - # NOTE: apparently, on Windows cuLinkAddData! _does_ accept object data containing \0 - if !Sys.iswindows() - @test_throws Exception add_data!(link, "vadd_parent", UInt8[0]) - end end @testset "error log" begin From cb21ca6d7aec3a28af50a3e0b60da10aa2f00978 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 28 May 2026 11:36:34 +0200 Subject: [PATCH 10/10] Add CI for CUDA 13.3. --- .buildkite/pipeline.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 3492c76350..ef2f2b54ac 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -100,6 +100,7 @@ steps: matrix: setup: cuda: + - "13.3" - "13.2" - "13.1" - "13.0"