diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 3492c76350..ef2f2b54ac 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -100,6 +100,7 @@ steps: matrix: setup: cuda: + - "13.3" - "13.2" - "13.1" - "13.0" diff --git a/CUDACore/Project.toml b/CUDACore/Project.toml index 820b3cf86e..85fb03c1b2 100644 --- a/CUDACore/Project.toml +++ b/CUDACore/Project.toml @@ -48,7 +48,7 @@ CUDA = "6" CUDA_Compiler_jll = "0.3, 0.4" CUDA_Driver_jll = "13" CUDA_Runtime_Discovery = "2" -CUDA_Runtime_jll = "0.22" +CUDA_Runtime_jll = "0.23" ChainRulesCore = "1" EnzymeCore = "0.8.2" ExprTools = "0.1" diff --git a/CUDACore/lib/cudadrv/events.jl b/CUDACore/lib/cudadrv/events.jl index 9b36cb6384..2912488206 100644 --- a/CUDACore/lib/cudadrv/events.jl +++ b/CUDACore/lib/cudadrv/events.jl @@ -85,7 +85,13 @@ Computes the elapsed time between two events (in seconds). """ function elapsed(start::CuEvent, stop::CuEvent) time_ref = Ref{Cfloat}() - cuEventElapsedTime(time_ref, start, stop) + # cuEventElapsedTime_v2 (CUDA 12.8+) has improved accuracy and validation; + # fall back to the deprecated v1 entry point on older drivers. + if driver_version() >= v"12.8" + cuEventElapsedTime_v2(time_ref, start, stop) + else + cuEventElapsedTime(time_ref, start, stop) + end return time_ref[]/1000 end diff --git a/CUDACore/lib/cudadrv/libcuda.jl b/CUDACore/lib/cudadrv/libcuda.jl index 8d1c56df79..4055feb378 100644 --- a/CUDACore/lib/cudadrv/libcuda.jl +++ b/CUDACore/lib/cudadrv/libcuda.jl @@ -146,6 +146,7 @@ const CUdevice = CUdevice_v1 CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION = 915 CUDA_ERROR_KEY_ROTATION = 916 CUDA_ERROR_STREAM_DETACHED = 917 + CUDA_ERROR_GRAPH_RECAPTURE_FAILURE = 918 CUDA_ERROR_UNKNOWN = 999 end @@ -1243,6 +1244,7 @@ end CU_GRAPH_NODE_TYPE_MEM_FREE = 11 CU_GRAPH_NODE_TYPE_BATCH_MEM_OP = 12 CU_GRAPH_NODE_TYPE_CONDITIONAL = 13 + CU_GRAPH_NODE_TYPE_RESERVED_16 = 16 end const CUgraphNodeType = CUgraphNodeType_enum @@ -1268,6 +1270,7 @@ function Base.getproperty(x::Ptr{CUgraphNodeParams_st}, f::Symbol) f === :free && return Ptr{CUDA_MEM_FREE_NODE_PARAMS}(x + 16) f === :memOp && return Ptr{CUDA_BATCH_MEM_OP_NODE_PARAMS_v2}(x + 16) f === :conditional && return Ptr{CUDA_CONDITIONAL_NODE_PARAMS}(x + 16) + f === :asBytes && return Ptr{NTuple{232,Cchar}}(x + 16) f === :reserved2 && return Ptr{Clonglong}(x + 248) return getfield(x, f) end @@ -1286,7 +1289,7 @@ end function Base.propertynames(x::CUgraphNodeParams_st, private::Bool=false) return (:type, :reserved0, :reserved1, :kernel, :memcpy, :memset, :host, :graph, :eventWait, :eventRecord, :extSemSignal, :extSemWait, :alloc, :free, :memOp, - :conditional, :reserved2, if private + :conditional, :asBytes, :reserved2, if private fieldnames(typeof(x)) else () @@ -1931,7 +1934,13 @@ const CUfilter_mode = CUfilter_mode_enum CU_DEVICE_ATTRIBUTE_HOST_ALLOC_DMA_BUF_SUPPORTED = 146 CU_DEVICE_ATTRIBUTE_ONLY_PARTIAL_HOST_NATIVE_ATOMIC_SUPPORTED = 147 CU_DEVICE_ATTRIBUTE_ATOMIC_REDUCTION_SUPPORTED = 148 - CU_DEVICE_ATTRIBUTE_MAX = 149 + CU_DEVICE_ATTRIBUTE_D3D12_CIG_STREAMS_SUPPORTED = 151 + CU_DEVICE_ATTRIBUTE_DMA_BUF_MMAP_SUPPORTED = 152 + CU_DEVICE_ATTRIBUTE_LOGICAL_ENDPOINT_UNICAST_SUPPORTED = 153 + CU_DEVICE_ATTRIBUTE_LOGICAL_ENDPOINT_MULTICAST_SUPPORTED = 154 + CU_DEVICE_ATTRIBUTE_LOGICAL_ENDPOINT_COUNTED_OPS_SUPPORTED = 155 + CU_DEVICE_ATTRIBUTE_LOGICAL_ENDPOINT_UNICAST_ACCESS_ON_OWNER_DEVICE_SUPPORTED = 156 + CU_DEVICE_ATTRIBUTE_MAX = 157 end const CUdevice_attribute = CUdevice_attribute_enum @@ -1996,7 +2005,8 @@ const CUpointer_attribute = CUpointer_attribute_enum CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH = 13 CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED = 14 CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE = 15 - CU_FUNC_ATTRIBUTE_MAX = 16 + CU_FUNC_ATTRIBUTE_DEVICE_NODE_UPDATE_SUPPORTED = 16 + CU_FUNC_ATTRIBUTE_MAX = 17 end const CUfunction_attribute = CUfunction_attribute_enum @@ -3940,8 +3950,7 @@ const CUcheckpointGpuPair = CUcheckpointGpuPair_st struct CUcheckpointRestoreArgs_st gpuPairs::Ptr{CUcheckpointGpuPair} gpuPairsCount::Cuint - reserved::NTuple{44,Cchar} - reserved1::cuuint64_t + reserved::NTuple{52,Cchar} end const CUcheckpointRestoreArgs = CUcheckpointRestoreArgs_st @@ -4905,6 +4914,154 @@ end option::CUmulticastGranularity_flags)::CUresult end +const CUlogicalEndpointId = cuuint32_t + +@cenum CUlogicalEndpointIpcHandleType_enum::UInt32 begin + CU_LOGICAL_ENDPOINT_IPC_HANDLE_TYPE_NONE = 0 + CU_LOGICAL_ENDPOINT_IPC_HANDLE_TYPE_FABRIC = 1 +end + +const CUlogicalEndpointIpcHandleType = CUlogicalEndpointIpcHandleType_enum + +struct CUlogicalEndpointFabricHandle_st + data::NTuple{64,Cuchar} +end + +const CUlogicalEndpointFabricHandle = CUlogicalEndpointFabricHandle_st + +@cenum CUlogicalEndpointType_enum::UInt32 begin + CU_LOGICAL_ENDPOINT_TYPE_INVALID = 0 + CU_LOGICAL_ENDPOINT_TYPE_UNICAST = 1 + CU_LOGICAL_ENDPOINT_TYPE_MULTICAST = 2 +end + +const CUlogicalEndpointType = CUlogicalEndpointType_enum + +@cenum CUlogicalEndpointFlag_enum::UInt32 begin + CU_LOGICAL_ENDPOINT_FLAG_NONE = 0 + CU_LOGICAL_ENDPOINT_FLAG_COUNTED_OPS = 1 +end + +const CUlogicalEndpointFlag = CUlogicalEndpointFlag_enum + +struct CUlogicalEndpointProp_struct + data::NTuple{24,UInt8} +end + +function Base.getproperty(x::Ptr{CUlogicalEndpointProp_struct}, f::Symbol) + f === :type && return Ptr{CUlogicalEndpointType}(x + 0) + f === :unicast && return Ptr{Cvoid}(x + 4) + f === :multicast && return Ptr{Cvoid}(x + 4) + f === :size && return Ptr{Culonglong}(x + 8) + f === :ipcHandleTypes && return Ptr{Cuint}(x + 16) + f === :flags && return Ptr{Cuint}(x + 20) + return getfield(x, f) +end + +function Base.getproperty(x::CUlogicalEndpointProp_struct, f::Symbol) + r = Ref{CUlogicalEndpointProp_struct}(x) + ptr = Base.unsafe_convert(Ptr{CUlogicalEndpointProp_struct}, r) + fptr = getproperty(ptr, f) + GC.@preserve r unsafe_load(fptr) +end + +function Base.setproperty!(x::Ptr{CUlogicalEndpointProp_struct}, f::Symbol, v) + return unsafe_store!(getproperty(x, f), v) +end + +function Base.propertynames(x::CUlogicalEndpointProp_struct, private::Bool=false) + return (:type, :unicast, :multicast, :size, :ipcHandleTypes, :flags, + if private + fieldnames(typeof(x)) + else + () + end...) +end + +const CUlogicalEndpointProp = CUlogicalEndpointProp_struct + +@checked function cuLogicalEndpointIdReserve(baseLeId, count) + initialize_context() + @gcsafe_ccall libcuda.cuLogicalEndpointIdReserve(baseLeId::Ptr{CUlogicalEndpointId}, + count::cuuint32_t)::CUresult +end + +@checked function cuLogicalEndpointIdRelease(baseLeId, count) + initialize_context() + @gcsafe_ccall libcuda.cuLogicalEndpointIdRelease(baseLeId::CUlogicalEndpointId, + count::cuuint32_t)::CUresult +end + +@checked function cuLogicalEndpointCreate(leId, prop) + initialize_context() + @gcsafe_ccall libcuda.cuLogicalEndpointCreate(leId::CUlogicalEndpointId, + prop::Ptr{CUlogicalEndpointProp})::CUresult +end + +@checked function cuLogicalEndpointAddDevice(leId, dev) + initialize_context() + @gcsafe_ccall libcuda.cuLogicalEndpointAddDevice(leId::CUlogicalEndpointId, + dev::CUdevice)::CUresult +end + +@checked function cuLogicalEndpointDestroy(leId) + initialize_context() + @gcsafe_ccall libcuda.cuLogicalEndpointDestroy(leId::CUlogicalEndpointId)::CUresult +end + +@checked function cuLogicalEndpointBindAddr(leId, dev, offset, ptr, size, flags) + initialize_context() + @gcsafe_ccall libcuda.cuLogicalEndpointBindAddr(leId::CUlogicalEndpointId, + dev::CUdevice, offset::cuuint64_t, + ptr::Ptr{Cvoid}, size::cuuint64_t, + flags::Culonglong)::CUresult +end + +@checked function cuLogicalEndpointBindMem(leId, dev, offset, memHandle, memOffset, size, + flags) + initialize_context() + @gcsafe_ccall libcuda.cuLogicalEndpointBindMem(leId::CUlogicalEndpointId, dev::CUdevice, + offset::cuuint64_t, + memHandle::CUmemGenericAllocationHandle, + memOffset::cuuint64_t, size::cuuint64_t, + flags::Culonglong)::CUresult +end + +@checked function cuLogicalEndpointUnbind(leId, dev, offset, size) + initialize_context() + @gcsafe_ccall libcuda.cuLogicalEndpointUnbind(leId::CUlogicalEndpointId, dev::CUdevice, + offset::cuuint64_t, + size::cuuint64_t)::CUresult +end + +@checked function cuLogicalEndpointExport(handle, leId, handleType) + initialize_context() + @gcsafe_ccall libcuda.cuLogicalEndpointExport(handle::Ptr{Cvoid}, + leId::CUlogicalEndpointId, + handleType::CUlogicalEndpointIpcHandleType)::CUresult +end + +@checked function cuLogicalEndpointImport(leId, handle, handleType) + initialize_context() + @gcsafe_ccall libcuda.cuLogicalEndpointImport(leId::CUlogicalEndpointId, + handle::Ptr{Cvoid}, + handleType::CUlogicalEndpointIpcHandleType)::CUresult +end + +@checked function cuLogicalEndpointGetLimits(bindAlignment, maxSize, prop) + initialize_context() + @gcsafe_ccall libcuda.cuLogicalEndpointGetLimits(bindAlignment::Ptr{cuuint64_t}, + maxSize::Ptr{cuuint64_t}, + prop::Ptr{CUlogicalEndpointProp})::CUresult +end + +@checked function cuLogicalEndpointQuery(leId, count, queryStatus) + initialize_context() + @gcsafe_ccall libcuda.cuLogicalEndpointQuery(leId::CUlogicalEndpointId, + count::cuuint32_t, + queryStatus::Ptr{Cint})::CUresult +end + @checked function cuPointerGetAttribute(data, attribute, ptr) initialize_context() @gcsafe_ccall libcuda.cuPointerGetAttribute(data::Ptr{Cvoid}, @@ -5048,6 +5205,27 @@ end userData::Ptr{Cvoid}, flags::Cuint)::CUresult end +@cenum CUgraphRecaptureStatus_enum::UInt32 begin + CU_GRAPH_RECAPTURE_ELIGIBLE_FOR_UPDATE = 0 + CU_GRAPH_RECAPTURE_INELIGIBLE_FOR_UPDATE = 1 + CU_GRAPH_RECAPTURE_ERROR = 2 +end + +const CUgraphRecaptureStatus = CUgraphRecaptureStatus_enum + +# typedef CUresult ( CUDA_CB * CUgraphRecaptureCallback ) ( void * data , CUgraphNode node , const CUgraphNodeParams * originalParams , const CUgraphNodeParams * recaptureParams , CUgraphRecaptureStatus status ) +const CUgraphRecaptureCallback = Ptr{Cvoid} + +@checked function cuStreamBeginRecaptureToGraph(hStream, mode, hGraph, callbackFunc, + userData) + initialize_context() + @gcsafe_ccall libcuda.cuStreamBeginRecaptureToGraph(hStream::CUstream, + mode::CUstreamCaptureMode, + hGraph::CUgraph, + callbackFunc::CUgraphRecaptureCallback, + userData::Ptr{Cvoid})::CUresult +end + @checked function cuStreamBeginCaptureToGraph(hStream, hGraph, dependencies, dependencyData, numDependencies, mode) initialize_context() @@ -5279,13 +5457,6 @@ end kernelParams::Ptr{Ptr{Cvoid}})::CUresult end -@checked function cuLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags) - initialize_context() - @gcsafe_ccall libcuda.cuLaunchCooperativeKernelMultiDevice(launchParamsList::Ptr{CUDA_LAUNCH_PARAMS}, - numDevices::Cuint, - flags::Cuint)::CUresult -end - @checked function cuLaunchHostFunc(hStream, fn, userData) initialize_context() @gcsafe_ccall libcuda.cuLaunchHostFunc(hStream::CUstream, fn::CUhostFn, @@ -5350,6 +5521,13 @@ end grid_height::Cint, hStream::CUstream)::CUresult end +@checked function cuLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags) + initialize_context() + @gcsafe_ccall libcuda.cuLaunchCooperativeKernelMultiDevice(launchParamsList::Ptr{CUDA_LAUNCH_PARAMS}, + numDevices::Cuint, + flags::Cuint)::CUresult +end + @checked function cuParamSetTexRef(hfunc, texunit, hTexRef) initialize_context() @gcsafe_ccall libcuda.cuParamSetTexRef(hfunc::CUfunction, texunit::Cint, @@ -6312,7 +6490,7 @@ end const CUcoredumpSettings = CUcoredumpSettings_enum -@cenum CUCoredumpGenerationFlags::UInt32 begin +@cenum CUCoredumpGenerationFlags::Int32 begin CU_COREDUMP_DEFAULT_FLAGS = 0 CU_COREDUMP_SKIP_NONRELOCATED_ELF_IMAGES = 1 CU_COREDUMP_SKIP_GLOBAL_MEMORY = 2 @@ -6321,6 +6499,9 @@ const CUcoredumpSettings = CUcoredumpSettings_enum CU_COREDUMP_SKIP_ABORT = 16 CU_COREDUMP_SKIP_CONSTBANK_MEMORY = 32 CU_COREDUMP_GZIP_COMPRESS = 64 + CU_COREDUMP_FAULTED_CONTEXTS_ONLY = 128 + CU_COREDUMP_NO_ERRBAR_AT_EXIT = 1073741824 + CU_COREDUMP_LOG_ONLY = -2147483648 CU_COREDUMP_LIGHTWEIGHT_FLAGS = 47 end @@ -6394,6 +6575,7 @@ mutable struct CUdevResourceDesc_st end const CUdevResourceDesc = Ptr{CUdevResourceDesc_st} @cenum CUgreenCtxCreate_flags::UInt32 begin + CU_GREEN_CTX_NONE = 0 CU_GREEN_CTX_DEFAULT_STREAM = 1 end diff --git a/CUDACore/src/compatibility.jl b/CUDACore/src/compatibility.jl index 0f512a0255..d6dea25b25 100644 --- a/CUDACore/src/compatibility.jl +++ b/CUDACore/src/compatibility.jl @@ -133,6 +133,7 @@ const ptxas_ptx_db = Dict( v"9.0" => between(v"13.0", highest), v"9.1" => between(v"13.1", highest), v"9.2" => between(v"13.2", highest), + v"9.3" => between(v"13.3", highest), ) function ptxas_ptx_support(ver::VersionNumber) @@ -296,7 +297,8 @@ const llvm_ptx_db = Dict( v"8.7" => between(v"20", highest), v"8.8" => between(v"21", highest), v"9.0" => between(v"22", highest), - v"9.1" => between(v"24", highest), + v"9.1" => between(v"23", highest), + v"9.2" => between(v"23", highest), ) function llvm_ptx_support(ver::VersionNumber) diff --git a/lib/cublas/Project.toml b/lib/cublas/Project.toml index 2fcdbae5f8..40cee2cf8d 100644 --- a/lib/cublas/Project.toml +++ b/lib/cublas/Project.toml @@ -30,7 +30,7 @@ BFloat16s = "0.5, 0.6" CEnum = "0.2, 0.3, 0.4, 0.5" CUDACore = "=6.1.0" CUDA_Runtime_Discovery = "2" -CUDA_Runtime_jll = "0.22" +CUDA_Runtime_jll = "0.23" EnzymeCore = "0.8.2" GPUArrays = "11.4.1" GPUToolbox = "0.3, 1" diff --git a/lib/cublas/src/cuBLAS.jl b/lib/cublas/src/cuBLAS.jl index ab4c82e2d9..3c2a9482b4 100644 --- a/lib/cublas/src/cuBLAS.jl +++ b/lib/cublas/src/cuBLAS.jl @@ -46,7 +46,7 @@ include("wrappers.jl") # high-level integrations include("linalg.jl") -function math_mode!(handle, mode) +function math_mode!(handle, mode, precision=CUDACore.math_precision()) flags = 0 # https://github.com/facebookresearch/faiss/issues/1385 @@ -58,8 +58,16 @@ function math_mode!(handle, mode) elseif mode == CUDACore.DEFAULT_MATH CUBLAS_DEFAULT_MATH elseif mode == CUDACore.FAST_MATH - # we'll additionally select a compute-mode with reduced precision whenever possible - CUBLAS_TF32_TENSOR_OP_MATH + # downcast to a reduced-precision compute mode whenever possible; the + # emulation modes additionally engage matching emulated kernels for + # plain (non-gemmEx) GEMMs. See also `gemmExComputeType`. + if precision === :BFloat16x9 && version() >= v"12.9" + CUBLAS_FP32_EMULATED_BF16X9_MATH + elseif precision === :FixedPoint && version() >= v"13.1" + CUBLAS_FP64_EMULATED_FIXEDPOINT_MATH + else + CUBLAS_TF32_TENSOR_OP_MATH + end end cublasSetMathMode(handle, cublasMath_t(flags)) @@ -96,7 +104,7 @@ function handle_finalizer(h::Handle) push!(idle_handles, h.ctx, h.handle) end -const LibraryState = @NamedTuple{handle::Handle, stream::CuStream, math_mode::CUDACore.MathMode} +const LibraryState = @NamedTuple{handle::Handle, stream::CuStream, math_mode::CUDACore.MathMode, math_precision::Symbol} const state_cache = CUDACore.TaskLocalCache{CuContext, LibraryState}(:CUBLAS) function handle() @@ -112,9 +120,9 @@ function handle() cublasSetStream_v2(new_handle, cuda.stream) cublasSetPointerMode_v2(new_handle, CUBLAS_POINTER_MODE_DEVICE) - math_mode!(new_handle, cuda.math_mode) + math_mode!(new_handle, cuda.math_mode, cuda.math_precision) - (; handle=wrapped, cuda.stream, cuda.math_mode) + (; handle=wrapped, cuda.stream, cuda.math_mode, cuda.math_precision) end state = get!(states, cuda.context) do new_state(cuda) @@ -123,18 +131,18 @@ function handle() # update stream @noinline function update_stream(cuda, state) cublasSetStream_v2(state.handle, cuda.stream) - (; state.handle, stream=cuda.stream, state.math_mode) + (; state.handle, stream=cuda.stream, state.math_mode, state.math_precision) end if state.stream != cuda.stream states[cuda.context] = state = update_stream(cuda, state) end - # update math mode + # update math mode (the precision feeds into the emulation math modes) @noinline function update_math_mode(cuda, state) - math_mode!(state.handle, cuda.math_mode) - (; state.handle, state.stream, math_mode=cuda.math_mode) + math_mode!(state.handle, cuda.math_mode, cuda.math_precision) + (; state.handle, state.stream, math_mode=cuda.math_mode, math_precision=cuda.math_precision) end - if state.math_mode != cuda.math_mode + if state.math_mode != cuda.math_mode || state.math_precision != cuda.math_precision states[cuda.context] = state = update_math_mode(cuda, state) end diff --git a/lib/cublas/src/libcublasLt.jl b/lib/cublas/src/libcublasLt.jl index 4094554ad6..493f1d5405 100644 --- a/lib/cublas/src/libcublasLt.jl +++ b/lib/cublas/src/libcublasLt.jl @@ -1318,6 +1318,25 @@ end returnAlgoCount::Ptr{Cint})::cublasStatus_t end +@checked function cublasLtMatmulAlgoGetHeuristicForStream(lightHandle, operationDesc, Adesc, + Bdesc, Cdesc, Ddesc, preference, + requestedAlgoCount, + heuristicResultsArray, + returnAlgoCount, stream) + initialize_context() + @gcsafe_ccall libcublasLt.cublasLtMatmulAlgoGetHeuristicForStream(lightHandle::cublasLtHandle_t, + operationDesc::cublasLtMatmulDesc_t, + Adesc::cublasLtMatrixLayout_t, + Bdesc::cublasLtMatrixLayout_t, + Cdesc::cublasLtMatrixLayout_t, + Ddesc::cublasLtMatrixLayout_t, + preference::cublasLtMatmulPreference_t, + requestedAlgoCount::Cint, + heuristicResultsArray::Ptr{cublasLtMatmulHeuristicResult_t}, + returnAlgoCount::Ptr{Cint}, + stream::cudaStream_t)::cublasStatus_t +end + @checked function cublasLtMatmulAlgoGetIds(lightHandle, computeType, scaleType, Atype, Btype, Ctype, Dtype, requestedAlgoCount, algoIdsArray, returnAlgoCount) @@ -1360,6 +1379,20 @@ end result::Ptr{cublasLtMatmulHeuristicResult_t})::cublasStatus_t end +@checked function cublasLtMatmulAlgoCheckForStream(lightHandle, operationDesc, Adesc, Bdesc, + Cdesc, Ddesc, algo, result, stream) + initialize_context() + @gcsafe_ccall libcublasLt.cublasLtMatmulAlgoCheckForStream(lightHandle::cublasLtHandle_t, + operationDesc::cublasLtMatmulDesc_t, + Adesc::cublasLtMatrixLayout_t, + Bdesc::cublasLtMatrixLayout_t, + Cdesc::cublasLtMatrixLayout_t, + Ddesc::cublasLtMatrixLayout_t, + algo::Ptr{cublasLtMatmulAlgo_t}, + result::Ptr{cublasLtMatmulHeuristicResult_t}, + stream::cudaStream_t)::cublasStatus_t +end + @cenum cublasLtMatmulAlgoCapAttributes_t::UInt32 begin CUBLASLT_ALGO_CAP_SPLITK_SUPPORT = 0 CUBLASLT_ALGO_CAP_REDUCTION_SCHEME_MASK = 1 @@ -1452,7 +1485,7 @@ end @gcsafe_ccall libcublasLt.cublasLtLoggerSetMask(mask::Cint)::cublasStatus_t end -# no prototype is found for this function at cublasLt.h:2795:29, please use with caution +# no prototype is found for this function at cublasLt.h:2869:29, please use with caution @checked function cublasLtLoggerForceDisable() initialize_context() @gcsafe_ccall libcublasLt.cublasLtLoggerForceDisable()::cublasStatus_t diff --git a/lib/cublas/src/wrappers.jl b/lib/cublas/src/wrappers.jl index 2effe8179e..d398039716 100644 --- a/lib/cublas/src/wrappers.jl +++ b/lib/cublas/src/wrappers.jl @@ -50,9 +50,10 @@ function juliaStorageType(T::Type{<:Real}, ct::cublasComputeType_t) return T == BFloat16 ? BFloat16 : Float16 elseif ct == CUBLAS_COMPUTE_32F || ct == CUBLAS_COMPUTE_32F_PEDANTIC || ct == CUBLAS_COMPUTE_32F_FAST_16F || ct == CUBLAS_COMPUTE_32F_FAST_16BF || - ct == CUBLAS_COMPUTE_32F_FAST_TF32 + ct == CUBLAS_COMPUTE_32F_FAST_TF32 || ct == CUBLAS_COMPUTE_32F_EMULATED_16BFX9 return Float32 - elseif ct == CUBLAS_COMPUTE_64F || ct == CUBLAS_COMPUTE_64F_PEDANTIC + elseif ct == CUBLAS_COMPUTE_64F || ct == CUBLAS_COMPUTE_64F_PEDANTIC || + ct == CUBLAS_COMPUTE_64F_EMULATED_FIXEDPOINT return Float64 elseif ct == CUBLAS_COMPUTE_32I || ct == CUBLAS_COMPUTE_32I_PEDANTIC return Int32 @@ -66,9 +67,10 @@ function juliaStorageType(T::Type{<:Complex}, ct::cublasComputeType_t) return T == Complex{BFloat16} == Complex{BFloat16} : Complex{Float16} elseif ct == CUBLAS_COMPUTE_32F || ct == CUBLAS_COMPUTE_32F_PEDANTIC || ct == CUBLAS_COMPUTE_32F_FAST_16F || ct == CUBLAS_COMPUTE_32F_FAST_16BF || - ct == CUBLAS_COMPUTE_32F_FAST_TF32 + ct == CUBLAS_COMPUTE_32F_FAST_TF32 || ct == CUBLAS_COMPUTE_32F_EMULATED_16BFX9 return Complex{Float32} - elseif ct == CUBLAS_COMPUTE_64F || ct == CUBLAS_COMPUTE_64F_PEDANTIC + elseif ct == CUBLAS_COMPUTE_64F || ct == CUBLAS_COMPUTE_64F_PEDANTIC || + ct == CUBLAS_COMPUTE_64F_EMULATED_FIXEDPOINT return Complex{Float64} else throw(ArgumentError("Julia type equivalent for compute type $ct does not exist!")) @@ -1199,11 +1201,23 @@ function gemmExComputeType(TA, TB, TC, m, k, n) return CUBLAS_COMPUTE_32F_FAST_16F elseif reduced_precision === :BFloat16 return CUBLAS_COMPUTE_32F_FAST_16BF + elseif reduced_precision === :BFloat16x9 + # BF16x9 emulates full FP32 accuracy via tensor cores (cuBLAS 12.9+) + version() >= v"12.9" || + throw(ArgumentError("BFloat16x9 emulation requires cuBLAS 12.9 or higher")) + return CUBLAS_COMPUTE_32F_EMULATED_16BFX9 elseif reduced_precision === :TensorFloat32 return CUBLAS_COMPUTE_32F_FAST_TF32 - else + elseif reduced_precision !== :FixedPoint # FixedPoint only applies to Float64 throw(ArgumentError("Unknown reduced precision type $reduced_precision")) end + elseif (sig === (Float64, Float64) || + sig === (Complex{Float64}, Complex{Float64})) && + reduced_precision === :FixedPoint + # fixed-point (Ozaki) emulation of FP64 on tensor cores (cuBLAS 13.1+) + version() >= v"13.1" || + throw(ArgumentError("FixedPoint emulation requires cuBLAS 13.1 or higher")) + return CUBLAS_COMPUTE_64F_EMULATED_FIXEDPOINT end end diff --git a/lib/cublas/test/level3/gemm.jl b/lib/cublas/test/level3/gemm.jl index c64d777260..65a299bada 100644 --- a/lib/cublas/test/level3/gemm.jl +++ b/lib/cublas/test/level3/gemm.jl @@ -504,6 +504,27 @@ k = 13 @test C ≈ Array(dC) rtol=rtol end end + # emulation math modes: BF16x9 reproduces full FP32 accuracy (cuBLAS 12.9+), + # fixed-point emulates FP64 on tensor cores (cuBLAS 13.0+). + if cuBLAS.version() >= v"12.9" + for (AT, CT) in ((Float32, Float32), (ComplexF32, ComplexF32)) + CUDACore.math_mode!(CUDACore.FAST_MATH; precision=:BFloat16x9) + A = rand(AT, m, k); B = rand(AT, k, n) + dC = similar(CuArray(B), CT) + mul!(dC, CuArray(A), CuArray(B)) + @test A*B ≈ Array(dC) rtol=Base.rtoldefault(AT, AT, 0) + end + end + if cuBLAS.version() >= v"13.1" + for (AT, CT) in ((Float64, Float64), (ComplexF64, ComplexF64)) + CUDACore.math_mode!(CUDACore.FAST_MATH; precision=:FixedPoint) + A = rand(AT, m, k); B = rand(AT, k, n) + dC = similar(CuArray(B), CT) + mul!(dC, CuArray(A), CuArray(B)) + @test A*B ≈ Array(dC) rtol=Base.rtoldefault(AT, AT, 0) + end + end + CUDACore.math_mode!(CUDACore.FAST_MATH; precision = :Bad) @test_throws ArgumentError("Unknown reduced precision type Bad") cuBLAS.gemmExComputeType(Float32, Float32, Float32, m, k, n) finally diff --git a/lib/cufft/Project.toml b/lib/cufft/Project.toml index d7e3bb27cc..fb6f726460 100644 --- a/lib/cufft/Project.toml +++ b/lib/cufft/Project.toml @@ -21,7 +21,7 @@ AbstractFFTs = "0.5, 1.0" CEnum = "0.2, 0.3, 0.4, 0.5" CUDACore = "=6.1.0" CUDA_Runtime_Discovery = "2" -CUDA_Runtime_jll = "0.22" +CUDA_Runtime_jll = "0.23" GPUToolbox = "0.3, 1" LinearAlgebra = "1" Reexport = "1.0" diff --git a/lib/cufft/src/error.jl b/lib/cufft/src/error.jl index 15b212b798..5dfc6436d6 100644 --- a/lib/cufft/src/error.jl +++ b/lib/cufft/src/error.jl @@ -33,20 +33,22 @@ function description(err::CUFFTError) "user specified an invalid transform size" elseif err.code == CUFFT_UNALIGNED_DATA "cuFFT unaligned data" # No longer used - elseif err.code == CUFFT_INCOMPLETE_PARAMETER_LIST - "missing parameters in call" elseif err.code == CUFFT_INVALID_DEVICE "execution of a plan was on different GPU than plan creation" - elseif err.code == CUFFT_PARSE_ERROR - "internal plan database error" elseif err.code == CUFFT_NO_WORKSPACE "no workspace has been provided prior to plan execution" elseif err.code == CUFFT_NOT_IMPLEMENTED "function does not implement functionality for parameters given." - elseif err.code == CUFFT_LICENSE_ERROR - "cuFFT license error" # Used in previous versions. elseif err.code == CUFFT_NOT_SUPPORTED "operation is not supported for parameters given." + elseif err.code == CUFFT_MISSING_DEPENDENCY + "a required runtime dependency was not found" + elseif err.code == CUFFT_NVRTC_FAILURE + "NVRTC failed to compile a required kernel" + elseif err.code == CUFFT_NVJITLINK_FAILURE + "nvJitLink failed to link a required kernel" + elseif err.code == CUFFT_NVSHMEM_FAILURE + "an NVSHMEM operation failed" else "no description for this error" end diff --git a/lib/cufft/src/wrappers.jl b/lib/cufft/src/wrappers.jl index ad2ec12f8d..959194b704 100644 --- a/lib/cufft/src/wrappers.jl +++ b/lib/cufft/src/wrappers.jl @@ -30,8 +30,8 @@ function cufftMakePlan(output_type::Type{<:cufftNumber}, input_type::Type{<:cuff end # cuFFT half-precision transforms require all transform dim sizes to be # powers of 2 (NVIDIA cuFFT docs, "Half-precision transforms"). Catch this - # up-front so the user sees a Julia error rather than a bare - # CUFFT_INCOMPLETE_PARAMETER_LIST from inside the wrapper. + # up-front so the user sees a Julia error rather than a bare cuFFT error + # from inside the wrapper. if input_type <: Union{Float16, Complex{Float16}} || output_type <: Union{Float16, Complex{Float16}} for d in region diff --git a/lib/cufft/test/errors.jl b/lib/cufft/test/errors.jl new file mode 100644 index 0000000000..92bb5ccdee --- /dev/null +++ b/lib/cufft/test/errors.jl @@ -0,0 +1,10 @@ +@testset "error descriptions" begin + # every result code must have a printable description; this guards against + # referencing enum values that no longer exist in the headers (regression). + for code in instances(cuFFT.cufftResult) + err = cuFFT.CUFFTError(code) + msg = sprint(showerror, err) + @test occursin(string(code), msg) + @test !isempty(cuFFT.description(err)) + end +end diff --git a/lib/cupti/Project.toml b/lib/cupti/Project.toml index 8d86aaaa11..a11d578acf 100644 --- a/lib/cupti/Project.toml +++ b/lib/cupti/Project.toml @@ -14,6 +14,6 @@ GPUToolbox = "096a3bc2-3ced-46d0-87f4-dd12716f4bfc" CEnum = "0.2, 0.3, 0.4, 0.5" CUDACore = "=6.1.0" CUDA_Runtime_Discovery = "2" -CUDA_Runtime_jll = "0.22" +CUDA_Runtime_jll = "0.23" GPUToolbox = "1.1" julia = "1.10" diff --git a/lib/cupti/src/libcupti.jl b/lib/cupti/src/libcupti.jl index 4ff3cd6f3b..c5895684f6 100644 --- a/lib/cupti/src/libcupti.jl +++ b/lib/cupti/src/libcupti.jl @@ -214,27 +214,27 @@ struct CUpti_CallbackData correlationId::UInt32 end -struct var"##Ctag#425" +struct var"##Ctag#419" data::NTuple{8,UInt8} end -function Base.getproperty(x::Ptr{var"##Ctag#425"}, f::Symbol) +function Base.getproperty(x::Ptr{var"##Ctag#419"}, f::Symbol) f === :stream && return Ptr{CUstream}(x + 0) return getfield(x, f) end -function Base.getproperty(x::var"##Ctag#425", f::Symbol) - r = Ref{var"##Ctag#425"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#425"}, r) +function Base.getproperty(x::var"##Ctag#419", f::Symbol) + r = Ref{var"##Ctag#419"}(x) + ptr = Base.unsafe_convert(Ptr{var"##Ctag#419"}, r) fptr = getproperty(ptr, f) GC.@preserve r unsafe_load(fptr) end -function Base.setproperty!(x::Ptr{var"##Ctag#425"}, f::Symbol, v) +function Base.setproperty!(x::Ptr{var"##Ctag#419"}, f::Symbol, v) return unsafe_store!(getproperty(x, f), v) end -function Base.propertynames(x::var"##Ctag#425", private::Bool=false) +function Base.propertynames(x::var"##Ctag#419", private::Bool=false) return (:stream, if private fieldnames(typeof(x)) else @@ -248,7 +248,7 @@ end function Base.getproperty(x::Ptr{CUpti_ResourceData}, f::Symbol) f === :context && return Ptr{CUcontext}(x + 0) - f === :resourceHandle && return Ptr{var"##Ctag#425"}(x + 8) + f === :resourceHandle && return Ptr{var"##Ctag#419"}(x + 8) f === :resourceDescriptor && return Ptr{Ptr{Cvoid}}(x + 16) return getfield(x, f) end @@ -1036,8 +1036,8 @@ struct CUpti_ActivityObjectKindId end function Base.getproperty(x::Ptr{CUpti_ActivityObjectKindId}, f::Symbol) - f === :pt && return Ptr{var"##Ctag#441"}(x + 0) - f === :dcs && return Ptr{var"##Ctag#442"}(x + 0) + f === :pt && return Ptr{var"##Ctag#437"}(x + 0) + f === :dcs && return Ptr{var"##Ctag#438"}(x + 0) return getfield(x, f) end @@ -1956,28 +1956,36 @@ end CUPTI_FUNC_SHMEM_LIMIT_FORCE_INT = 2147483647 end -struct var"##Ctag#443" +@cenum CUpti_FuncExecutionModel::UInt32 begin + CUPTI_FUNC_EXECUTION_MODEL_UNKNOWN = 0 + CUPTI_FUNC_EXECUTION_MODEL_SIMT = 1 + CUPTI_FUNC_EXECUTION_MODEL_TILE = 2 + CUPTI_FUNC_EXECUTION_MODEL_SIZE = 3 + CUPTI_FUNC_EXECUTION_MODEL_FORCE_INT = 2147483647 +end + +struct var"##Ctag#424" data::NTuple{1,UInt8} end -function Base.getproperty(x::Ptr{var"##Ctag#443"}, f::Symbol) +function Base.getproperty(x::Ptr{var"##Ctag#424"}, f::Symbol) f === :both && return Ptr{UInt8}(x + 0) - f === :config && return Ptr{var"##Ctag#444"}(x + 0) + f === :config && return Ptr{var"##Ctag#425"}(x + 0) return getfield(x, f) end -function Base.getproperty(x::var"##Ctag#443", f::Symbol) - r = Ref{var"##Ctag#443"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#443"}, r) +function Base.getproperty(x::var"##Ctag#424", f::Symbol) + r = Ref{var"##Ctag#424"}(x) + ptr = Base.unsafe_convert(Ptr{var"##Ctag#424"}, r) fptr = getproperty(ptr, f) GC.@preserve r unsafe_load(fptr) end -function Base.setproperty!(x::Ptr{var"##Ctag#443"}, f::Symbol, v) +function Base.setproperty!(x::Ptr{var"##Ctag#424"}, f::Symbol, v) return unsafe_store!(getproperty(x, f), v) end -function Base.propertynames(x::var"##Ctag#443", private::Bool=false) +function Base.propertynames(x::var"##Ctag#424", private::Bool=false) return (:both, :config, if private fieldnames(typeof(x)) else @@ -1985,13 +1993,13 @@ function Base.propertynames(x::var"##Ctag#443", private::Bool=false) end...) end -struct CUpti_ActivityKernel11 - data::NTuple{224,UInt8} +struct CUpti_ActivityKernel12 + data::NTuple{232,UInt8} end -function Base.getproperty(x::Ptr{CUpti_ActivityKernel11}, f::Symbol) +function Base.getproperty(x::Ptr{CUpti_ActivityKernel12}, f::Symbol) f === :kind && return Ptr{CUpti_ActivityKind}(x + 0) - f === :cacheConfig && return Ptr{var"##Ctag#443"}(x + 4) + f === :cacheConfig && return Ptr{var"##Ctag#424"}(x + 4) f === :sharedMemoryConfig && return Ptr{UInt8}(x + 5) f === :registersPerThread && return Ptr{UInt16}(x + 6) f === :partitionedGlobalCacheRequested && @@ -2042,21 +2050,23 @@ function Base.getproperty(x::Ptr{CUpti_ActivityKernel11}, f::Symbol) f === :padding3 && return Ptr{NTuple{7,UInt8}}(x + 209) f === :priority && return Ptr{Int32}(x + 216) f === :padding4 && return Ptr{UInt32}(x + 220) + f === :executionModel && return Ptr{UInt32}(x + 224) + f === :padding5 && return Ptr{UInt32}(x + 228) return getfield(x, f) end -function Base.getproperty(x::CUpti_ActivityKernel11, f::Symbol) - r = Ref{CUpti_ActivityKernel11}(x) - ptr = Base.unsafe_convert(Ptr{CUpti_ActivityKernel11}, r) +function Base.getproperty(x::CUpti_ActivityKernel12, f::Symbol) + r = Ref{CUpti_ActivityKernel12}(x) + ptr = Base.unsafe_convert(Ptr{CUpti_ActivityKernel12}, r) fptr = getproperty(ptr, f) GC.@preserve r unsafe_load(fptr) end -function Base.setproperty!(x::Ptr{CUpti_ActivityKernel11}, f::Symbol, v) +function Base.setproperty!(x::Ptr{CUpti_ActivityKernel12}, f::Symbol, v) return unsafe_store!(getproperty(x, f), v) end -function Base.propertynames(x::CUpti_ActivityKernel11, private::Bool=false) +function Base.propertynames(x::CUpti_ActivityKernel12, private::Bool=false) return (:kind, :cacheConfig, :sharedMemoryConfig, :registersPerThread, :partitionedGlobalCacheRequested, :partitionedGlobalCacheExecuted, :start, :_end, :completed, :deviceId, :contextId, :streamId, :gridX, :gridY, :gridZ, @@ -2067,7 +2077,7 @@ function Base.propertynames(x::CUpti_ActivityKernel11, private::Bool=false) :shmemLimitConfig, :graphId, :pAccessPolicyWindow, :channelID, :channelType, :clusterX, :clusterY, :clusterZ, :clusterSchedulingPolicy, :localMemoryTotal_v2, :maxPotentialClusterSize, :maxActiveClusters, :isDeviceLaunched, :padding3, - :priority, :padding4, if private + :priority, :padding4, :executionModel, :padding5, if private fieldnames(typeof(x)) else () @@ -2121,7 +2131,8 @@ end KERNEL_FIELD_MAX_ACTIVE_CLUSTERS = 43 KERNEL_FIELD_IS_DEVICE_LAUNCHED = 44 KERNEL_FIELD_LAUNCH_PRIORITY = 45 - KERNEL_FIELD_MAX = 46 + KERNEL_FIELD_EXECUTION_MODEL = 46 + KERNEL_FIELD_MAX = 47 end struct var"##Ctag#351" @@ -2660,28 +2671,28 @@ end DEVICE_FIELD_MAX = 38 end -struct var"##Ctag#411" +struct var"##Ctag#405" data::NTuple{4,UInt8} end -function Base.getproperty(x::Ptr{var"##Ctag#411"}, f::Symbol) +function Base.getproperty(x::Ptr{var"##Ctag#405"}, f::Symbol) f === :cu && return Ptr{CUdevice_attribute}(x + 0) f === :cupti && return Ptr{CUpti_DeviceAttribute}(x + 0) return getfield(x, f) end -function Base.getproperty(x::var"##Ctag#411", f::Symbol) - r = Ref{var"##Ctag#411"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#411"}, r) +function Base.getproperty(x::var"##Ctag#405", f::Symbol) + r = Ref{var"##Ctag#405"}(x) + ptr = Base.unsafe_convert(Ptr{var"##Ctag#405"}, r) fptr = getproperty(ptr, f) GC.@preserve r unsafe_load(fptr) end -function Base.setproperty!(x::Ptr{var"##Ctag#411"}, f::Symbol, v) +function Base.setproperty!(x::Ptr{var"##Ctag#405"}, f::Symbol, v) return unsafe_store!(getproperty(x, f), v) end -function Base.propertynames(x::var"##Ctag#411", private::Bool=false) +function Base.propertynames(x::var"##Ctag#405", private::Bool=false) return (:cu, :cupti, if private fieldnames(typeof(x)) else @@ -2689,11 +2700,11 @@ function Base.propertynames(x::var"##Ctag#411", private::Bool=false) end...) end -struct var"##Ctag#412" +struct var"##Ctag#406" data::NTuple{8,UInt8} end -function Base.getproperty(x::Ptr{var"##Ctag#412"}, f::Symbol) +function Base.getproperty(x::Ptr{var"##Ctag#406"}, f::Symbol) f === :vDouble && return Ptr{Cdouble}(x + 0) f === :vUint32 && return Ptr{UInt32}(x + 0) f === :vUint64 && return Ptr{UInt64}(x + 0) @@ -2702,18 +2713,18 @@ function Base.getproperty(x::Ptr{var"##Ctag#412"}, f::Symbol) return getfield(x, f) end -function Base.getproperty(x::var"##Ctag#412", f::Symbol) - r = Ref{var"##Ctag#412"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#412"}, r) +function Base.getproperty(x::var"##Ctag#406", f::Symbol) + r = Ref{var"##Ctag#406"}(x) + ptr = Base.unsafe_convert(Ptr{var"##Ctag#406"}, r) fptr = getproperty(ptr, f) GC.@preserve r unsafe_load(fptr) end -function Base.setproperty!(x::Ptr{var"##Ctag#412"}, f::Symbol, v) +function Base.setproperty!(x::Ptr{var"##Ctag#406"}, f::Symbol, v) return unsafe_store!(getproperty(x, f), v) end -function Base.propertynames(x::var"##Ctag#412", private::Bool=false) +function Base.propertynames(x::var"##Ctag#406", private::Bool=false) return (:vDouble, :vUint32, :vUint64, :vInt32, :vInt64, if private fieldnames(typeof(x)) else @@ -2729,8 +2740,8 @@ function Base.getproperty(x::Ptr{CUpti_ActivityDeviceAttribute}, f::Symbol) f === :kind && return Ptr{CUpti_ActivityKind}(x + 0) f === :flags && return Ptr{CUpti_ActivityFlag}(x + 4) f === :deviceId && return Ptr{UInt32}(x + 8) - f === :attribute && return Ptr{var"##Ctag#411"}(x + 12) - f === :value && return Ptr{var"##Ctag#412"}(x + 16) + f === :attribute && return Ptr{var"##Ctag#405"}(x + 12) + f === :value && return Ptr{var"##Ctag#406"}(x + 16) return getfield(x, f) end @@ -2995,15 +3006,36 @@ end OVERHEAD_FIELD_MAX = 8 end +struct CUpti_ActivityEnvironmentSpeed + smClock::UInt32 + memoryClock::UInt32 + pcieLinkGen::UInt32 + pcieLinkWidth::UInt32 + clocksThrottleReasons::CUpti_EnvironmentClocksThrottleReason +end + +struct CUpti_ActivityEnvironmentTemperature + gpuTemperature::UInt32 +end + +struct CUpti_ActivityEnvironmentPower + power::UInt32 + powerLimit::UInt32 +end + +struct CUpti_ActivityEnvironmentCooling + fanSpeed::UInt32 +end + struct var"##Ctag#400" data::NTuple{20,UInt8} end function Base.getproperty(x::Ptr{var"##Ctag#400"}, f::Symbol) - f === :speed && return Ptr{var"##Ctag#401"}(x + 0) - f === :temperature && return Ptr{var"##Ctag#402"}(x + 0) - f === :power && return Ptr{var"##Ctag#403"}(x + 0) - f === :cooling && return Ptr{var"##Ctag#404"}(x + 0) + f === :speed && return Ptr{CUpti_ActivityEnvironmentSpeed}(x + 0) + f === :temperature && return Ptr{CUpti_ActivityEnvironmentTemperature}(x + 0) + f === :power && return Ptr{CUpti_ActivityEnvironmentPower}(x + 0) + f === :cooling && return Ptr{CUpti_ActivityEnvironmentCooling}(x + 0) return getfield(x, f) end @@ -3059,6 +3091,15 @@ function Base.propertynames(x::CUpti_ActivityEnvironment, private::Bool=false) end...) end +@cenum CUpti_ActivityEnvironmentFieldIds::UInt32 begin + ENVIRONMENT_FIELD_KIND = 0 + ENVIRONMENT_FIELD_DEVICE_ID = 1 + ENVIRONMENT_FIELD_TIMESTAMP = 2 + ENVIRONMENT_FIELD_ENVIRONMENT_KIND = 3 + ENVIRONMENT_FIELD_ENVIRONMENT_KIND_DATA = 4 + ENVIRONMENT_FIELD_MAX = 5 +end + struct CUpti_ActivityInstructionExecution data::NTuple{48,UInt8} end @@ -3682,6 +3723,38 @@ function Base.propertynames(x::CUpti_ActivityOpenAccData, private::Bool=false) end...) end +@cenum CUpti_ActivityOpenAccDataFieldIds::UInt32 begin + OPENACC_DATA_FIELD_KIND = 0 + OPENACC_DATA_FIELD_EVENT_KIND = 1 + OPENACC_DATA_FIELD_PARENT_CONSTRUCT = 2 + OPENACC_DATA_FIELD_VERSION = 3 + OPENACC_DATA_FIELD_IMPLICIT = 4 + OPENACC_DATA_FIELD_DEVICE_TYPE = 5 + OPENACC_DATA_FIELD_DEVICE_NUMBER = 6 + OPENACC_DATA_FIELD_THREAD_ID = 7 + OPENACC_DATA_FIELD_ASYNC = 8 + OPENACC_DATA_FIELD_ASYNC_MAP = 9 + OPENACC_DATA_FIELD_LINE_NO = 10 + OPENACC_DATA_FIELD_END_LINE_NO = 11 + OPENACC_DATA_FIELD_FUNC_LINE_NO = 12 + OPENACC_DATA_FIELD_FUNC_END_LINE_NO = 13 + OPENACC_DATA_FIELD_START = 14 + OPENACC_DATA_FIELD_END = 15 + OPENACC_DATA_FIELD_CU_DEVICE_ID = 16 + OPENACC_DATA_FIELD_CU_CONTEXT_ID = 17 + OPENACC_DATA_FIELD_CU_STREAM_ID = 18 + OPENACC_DATA_FIELD_CU_PROCESS_ID = 19 + OPENACC_DATA_FIELD_CU_THREAD_ID = 20 + OPENACC_DATA_FIELD_EXTERNAL_ID = 21 + OPENACC_DATA_FIELD_SRC_FILE = 22 + OPENACC_DATA_FIELD_FUNC_NAME = 23 + OPENACC_DATA_FIELD_BYTES = 24 + OPENACC_DATA_FIELD_HOST_PTR = 25 + OPENACC_DATA_FIELD_DEVICE_PTR = 26 + OPENACC_DATA_FIELD_VAR_NAME = 27 + OPENACC_DATA_FIELD_MAX = 28 +end + struct CUpti_ActivityOpenAccLaunch data::NTuple{152,UInt8} end @@ -3741,6 +3814,38 @@ function Base.propertynames(x::CUpti_ActivityOpenAccLaunch, private::Bool=false) end...) end +@cenum CUpti_ActivityOpenAccLaunchFieldIds::UInt32 begin + OPENACC_LAUNCH_FIELD_KIND = 0 + OPENACC_LAUNCH_FIELD_EVENT_KIND = 1 + OPENACC_LAUNCH_FIELD_PARENT_CONSTRUCT = 2 + OPENACC_LAUNCH_FIELD_VERSION = 3 + OPENACC_LAUNCH_FIELD_IMPLICIT = 4 + OPENACC_LAUNCH_FIELD_DEVICE_TYPE = 5 + OPENACC_LAUNCH_FIELD_DEVICE_NUMBER = 6 + OPENACC_LAUNCH_FIELD_THREAD_ID = 7 + OPENACC_LAUNCH_FIELD_ASYNC = 8 + OPENACC_LAUNCH_FIELD_ASYNC_MAP = 9 + OPENACC_LAUNCH_FIELD_LINE_NO = 10 + OPENACC_LAUNCH_FIELD_END_LINE_NO = 11 + OPENACC_LAUNCH_FIELD_FUNC_LINE_NO = 12 + OPENACC_LAUNCH_FIELD_FUNC_END_LINE_NO = 13 + OPENACC_LAUNCH_FIELD_START = 14 + OPENACC_LAUNCH_FIELD_END = 15 + OPENACC_LAUNCH_FIELD_CU_DEVICE_ID = 16 + OPENACC_LAUNCH_FIELD_CU_CONTEXT_ID = 17 + OPENACC_LAUNCH_FIELD_CU_STREAM_ID = 18 + OPENACC_LAUNCH_FIELD_CU_PROCESS_ID = 19 + OPENACC_LAUNCH_FIELD_CU_THREAD_ID = 20 + OPENACC_LAUNCH_FIELD_EXTERNAL_ID = 21 + OPENACC_LAUNCH_FIELD_SRC_FILE = 22 + OPENACC_LAUNCH_FIELD_FUNC_NAME = 23 + OPENACC_LAUNCH_FIELD_NUM_GANGS = 24 + OPENACC_LAUNCH_FIELD_NUM_WORKERS = 25 + OPENACC_LAUNCH_FIELD_VECTOR_LENGTH = 26 + OPENACC_LAUNCH_FIELD_KERNEL_NAME = 27 + OPENACC_LAUNCH_FIELD_MAX = 28 +end + struct CUpti_ActivityOpenAccOther data::NTuple{120,UInt8} end @@ -3796,6 +3901,34 @@ function Base.propertynames(x::CUpti_ActivityOpenAccOther, private::Bool=false) end...) end +@cenum CUpti_ActivityOpenAccOtherFieldIds::UInt32 begin + OPENACC_OTHER_FIELD_KIND = 0 + OPENACC_OTHER_FIELD_EVENT_KIND = 1 + OPENACC_OTHER_FIELD_PARENT_CONSTRUCT = 2 + OPENACC_OTHER_FIELD_VERSION = 3 + OPENACC_OTHER_FIELD_IMPLICIT = 4 + OPENACC_OTHER_FIELD_DEVICE_TYPE = 5 + OPENACC_OTHER_FIELD_DEVICE_NUMBER = 6 + OPENACC_OTHER_FIELD_THREAD_ID = 7 + OPENACC_OTHER_FIELD_ASYNC = 8 + OPENACC_OTHER_FIELD_ASYNC_MAP = 9 + OPENACC_OTHER_FIELD_LINE_NO = 10 + OPENACC_OTHER_FIELD_END_LINE_NO = 11 + OPENACC_OTHER_FIELD_FUNC_LINE_NO = 12 + OPENACC_OTHER_FIELD_FUNC_END_LINE_NO = 13 + OPENACC_OTHER_FIELD_START = 14 + OPENACC_OTHER_FIELD_END = 15 + OPENACC_OTHER_FIELD_CU_DEVICE_ID = 16 + OPENACC_OTHER_FIELD_CU_CONTEXT_ID = 17 + OPENACC_OTHER_FIELD_CU_STREAM_ID = 18 + OPENACC_OTHER_FIELD_CU_PROCESS_ID = 19 + OPENACC_OTHER_FIELD_CU_THREAD_ID = 20 + OPENACC_OTHER_FIELD_EXTERNAL_ID = 21 + OPENACC_OTHER_FIELD_SRC_FILE = 22 + OPENACC_OTHER_FIELD_FUNC_NAME = 23 + OPENACC_OTHER_FIELD_MAX = 24 +end + struct CUpti_ActivityOpenMp data::NTuple{40,UInt8} end @@ -3832,6 +3965,16 @@ function Base.propertynames(x::CUpti_ActivityOpenMp, private::Bool=false) end...) end +@cenum CUpti_ActivityOpenMpFieldIds::UInt32 begin + OPENMP_FIELD_KIND = 0 + OPENMP_FIELD_EVENT_KIND = 1 + OPENMP_FIELD_START = 2 + OPENMP_FIELD_END = 3 + OPENMP_FIELD_CU_THREAD_ID = 4 + OPENMP_FIELD_CU_PROCESS_ID = 5 + OPENMP_FIELD_MAX = 6 +end + @cenum CUpti_ExternalCorrelationKind::UInt32 begin CUPTI_EXTERNAL_CORRELATION_KIND_INVALID = 0 CUPTI_EXTERNAL_CORRELATION_KIND_UNKNOWN = 1 @@ -3891,28 +4034,33 @@ end CUPTI_DEV_TYPE_FORCE_INT = 2147483647 end -struct var"##Ctag#445" +struct CUpti_ActivityNvLinkNpu + index::UInt32 + domainId::UInt32 +end + +struct var"##Ctag#441" data::NTuple{16,UInt8} end -function Base.getproperty(x::Ptr{var"##Ctag#445"}, f::Symbol) +function Base.getproperty(x::Ptr{var"##Ctag#441"}, f::Symbol) f === :uuidDev && return Ptr{CUuuid}(x + 0) - f === :npu && return Ptr{var"##Ctag#446"}(x + 0) + f === :npu && return Ptr{CUpti_ActivityNvLinkNpu}(x + 0) return getfield(x, f) end -function Base.getproperty(x::var"##Ctag#445", f::Symbol) - r = Ref{var"##Ctag#445"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#445"}, r) +function Base.getproperty(x::var"##Ctag#441", f::Symbol) + r = Ref{var"##Ctag#441"}(x) + ptr = Base.unsafe_convert(Ptr{var"##Ctag#441"}, r) fptr = getproperty(ptr, f) GC.@preserve r unsafe_load(fptr) end -function Base.setproperty!(x::Ptr{var"##Ctag#445"}, f::Symbol, v) +function Base.setproperty!(x::Ptr{var"##Ctag#441"}, f::Symbol, v) return unsafe_store!(getproperty(x, f), v) end -function Base.propertynames(x::var"##Ctag#445", private::Bool=false) +function Base.propertynames(x::var"##Ctag#441", private::Bool=false) return (:uuidDev, :npu, if private fieldnames(typeof(x)) else @@ -3920,28 +4068,28 @@ function Base.propertynames(x::var"##Ctag#445", private::Bool=false) end...) end -struct var"##Ctag#447" +struct var"##Ctag#442" data::NTuple{16,UInt8} end -function Base.getproperty(x::Ptr{var"##Ctag#447"}, f::Symbol) +function Base.getproperty(x::Ptr{var"##Ctag#442"}, f::Symbol) f === :uuidDev && return Ptr{CUuuid}(x + 0) - f === :npu && return Ptr{var"##Ctag#448"}(x + 0) + f === :npu && return Ptr{CUpti_ActivityNvLinkNpu}(x + 0) return getfield(x, f) end -function Base.getproperty(x::var"##Ctag#447", f::Symbol) - r = Ref{var"##Ctag#447"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#447"}, r) +function Base.getproperty(x::var"##Ctag#442", f::Symbol) + r = Ref{var"##Ctag#442"}(x) + ptr = Base.unsafe_convert(Ptr{var"##Ctag#442"}, r) fptr = getproperty(ptr, f) GC.@preserve r unsafe_load(fptr) end -function Base.setproperty!(x::Ptr{var"##Ctag#447"}, f::Symbol, v) +function Base.setproperty!(x::Ptr{var"##Ctag#442"}, f::Symbol, v) return unsafe_store!(getproperty(x, f), v) end -function Base.propertynames(x::var"##Ctag#447", private::Bool=false) +function Base.propertynames(x::var"##Ctag#442", private::Bool=false) return (:uuidDev, :npu, if private fieldnames(typeof(x)) else @@ -3958,8 +4106,8 @@ function Base.getproperty(x::Ptr{CUpti_ActivityNvLink5}, f::Symbol) f === :nvlinkVersion && return Ptr{UInt32}(x + 4) f === :typeDev0 && return Ptr{CUpti_DevType}(x + 8) f === :typeDev1 && return Ptr{CUpti_DevType}(x + 12) - f === :idDev0 && return Ptr{var"##Ctag#445"}(x + 16) - f === :idDev1 && return Ptr{var"##Ctag#447"}(x + 32) + f === :idDev0 && return Ptr{var"##Ctag#441"}(x + 16) + f === :idDev1 && return Ptr{var"##Ctag#442"}(x + 32) f === :flag && return Ptr{UInt32}(x + 48) f === :physicalNvLinkCount && return Ptr{UInt32}(x + 52) f === :portDev0 && return Ptr{Ptr{UInt32}}(x + 56) @@ -3991,34 +4139,62 @@ function Base.propertynames(x::CUpti_ActivityNvLink5, private::Bool=false) end...) end +@cenum CUpti_ActivityNvLinkFieldIds::UInt32 begin + NVLINK_FIELD_KIND = 0 + NVLINK_FIELD_NVLINK_VERSION = 1 + NVLINK_FIELD_TYPE_DEV0 = 2 + NVLINK_FIELD_TYPE_DEV1 = 3 + NVLINK_FIELD_ID_DEV0 = 4 + NVLINK_FIELD_ID_DEV1 = 5 + NVLINK_FIELD_FLAG = 6 + NVLINK_FIELD_PHYSICAL_NVLINK_COUNT = 7 + NVLINK_FIELD_PORT_DEV0 = 8 + NVLINK_FIELD_PORT_DEV1 = 9 + NVLINK_FIELD_BANDWIDTH = 10 + NVLINK_FIELD_NVSWITCH_CONNECTED = 11 + NVLINK_FIELD_MAX = 12 +end + @cenum CUpti_PcieDeviceType::UInt32 begin CUPTI_PCIE_DEVICE_TYPE_GPU = 0 CUPTI_PCIE_DEVICE_TYPE_BRIDGE = 1 CUPTI_PCIE_DEVICE_TYPE_FORCE_INT = 2147483647 end -struct var"##Ctag#405" +struct CUpti_ActivityPcieGpuAttr + uuidDev::CUuuid + peerDev::NTuple{32,CUdevice} +end + +struct CUpti_ActivityPcieBridgeAttr + secondaryBus::UInt16 + deviceId::UInt16 + vendorId::UInt16 + pad0::UInt16 +end + +struct var"##Ctag#401" data::NTuple{4,UInt8} end -function Base.getproperty(x::Ptr{var"##Ctag#405"}, f::Symbol) +function Base.getproperty(x::Ptr{var"##Ctag#401"}, f::Symbol) f === :devId && return Ptr{CUdevice}(x + 0) f === :bridgeId && return Ptr{UInt32}(x + 0) return getfield(x, f) end -function Base.getproperty(x::var"##Ctag#405", f::Symbol) - r = Ref{var"##Ctag#405"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#405"}, r) +function Base.getproperty(x::var"##Ctag#401", f::Symbol) + r = Ref{var"##Ctag#401"}(x) + ptr = Base.unsafe_convert(Ptr{var"##Ctag#401"}, r) fptr = getproperty(ptr, f) GC.@preserve r unsafe_load(fptr) end -function Base.setproperty!(x::Ptr{var"##Ctag#405"}, f::Symbol, v) +function Base.setproperty!(x::Ptr{var"##Ctag#401"}, f::Symbol, v) return unsafe_store!(getproperty(x, f), v) end -function Base.propertynames(x::var"##Ctag#405", private::Bool=false) +function Base.propertynames(x::var"##Ctag#401", private::Bool=false) return (:devId, :bridgeId, if private fieldnames(typeof(x)) else @@ -4026,28 +4202,28 @@ function Base.propertynames(x::var"##Ctag#405", private::Bool=false) end...) end -struct var"##Ctag#406" +struct var"##Ctag#402" data::NTuple{144,UInt8} end -function Base.getproperty(x::Ptr{var"##Ctag#406"}, f::Symbol) - f === :gpuAttr && return Ptr{var"##Ctag#407"}(x + 0) - f === :bridgeAttr && return Ptr{var"##Ctag#408"}(x + 0) +function Base.getproperty(x::Ptr{var"##Ctag#402"}, f::Symbol) + f === :gpuAttr && return Ptr{CUpti_ActivityPcieGpuAttr}(x + 0) + f === :bridgeAttr && return Ptr{CUpti_ActivityPcieBridgeAttr}(x + 0) return getfield(x, f) end -function Base.getproperty(x::var"##Ctag#406", f::Symbol) - r = Ref{var"##Ctag#406"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#406"}, r) +function Base.getproperty(x::var"##Ctag#402", f::Symbol) + r = Ref{var"##Ctag#402"}(x) + ptr = Base.unsafe_convert(Ptr{var"##Ctag#402"}, r) fptr = getproperty(ptr, f) GC.@preserve r unsafe_load(fptr) end -function Base.setproperty!(x::Ptr{var"##Ctag#406"}, f::Symbol, v) +function Base.setproperty!(x::Ptr{var"##Ctag#402"}, f::Symbol, v) return unsafe_store!(getproperty(x, f), v) end -function Base.propertynames(x::var"##Ctag#406", private::Bool=false) +function Base.propertynames(x::var"##Ctag#402", private::Bool=false) return (:gpuAttr, :bridgeAttr, if private fieldnames(typeof(x)) else @@ -4062,13 +4238,13 @@ end function Base.getproperty(x::Ptr{CUpti_ActivityPcie}, f::Symbol) f === :kind && return Ptr{CUpti_ActivityKind}(x + 0) f === :type && return Ptr{CUpti_PcieDeviceType}(x + 4) - f === :id && return Ptr{var"##Ctag#405"}(x + 8) + f === :id && return Ptr{var"##Ctag#401"}(x + 8) f === :domain && return Ptr{UInt32}(x + 12) f === :pcieGeneration && return Ptr{UInt16}(x + 16) f === :linkRate && return Ptr{UInt16}(x + 18) f === :linkWidth && return Ptr{UInt16}(x + 20) f === :upstreamBus && return Ptr{UInt16}(x + 22) - f === :attr && return Ptr{var"##Ctag#406"}(x + 24) + f === :attr && return Ptr{var"##Ctag#402"}(x + 24) return getfield(x, f) end @@ -4092,6 +4268,19 @@ function Base.propertynames(x::CUpti_ActivityPcie, private::Bool=false) end...) end +@cenum CUpti_ActivityPcieFieldIds::UInt32 begin + PCIE_FIELD_KIND = 0 + PCIE_FIELD_TYPE = 1 + PCIE_FIELD_ID = 2 + PCIE_FIELD_DOMAIN = 3 + PCIE_FIELD_PCIE_GENERATION = 4 + PCIE_FIELD_LINK_RATE = 5 + PCIE_FIELD_LINK_WIDTH = 6 + PCIE_FIELD_UPSTREAM_BUS = 7 + PCIE_FIELD_ATTR = 8 + PCIE_FIELD_MAX = 9 +end + @cenum CUpti_PcieGen::UInt32 begin CUPTI_PCIE_GEN_GEN1 = 1 CUPTI_PCIE_GEN_GEN2 = 2 @@ -4541,11 +4730,11 @@ end HOST_LAUNCH_FIELD_MAX = 9 end -struct CUpti_ActivityGreenContext - data::NTuple{152,UInt8} +struct CUpti_ActivityGreenContext2 + data::NTuple{168,UInt8} end -function Base.getproperty(x::Ptr{CUpti_ActivityGreenContext}, f::Symbol) +function Base.getproperty(x::Ptr{CUpti_ActivityGreenContext2}, f::Symbol) f === :kind && return Ptr{CUpti_ActivityKind}(x + 0) f === :contextId && return Ptr{UInt32}(x + 4) f === :parentContextId && return Ptr{UInt32}(x + 8) @@ -4555,23 +4744,28 @@ function Base.getproperty(x::Ptr{CUpti_ActivityGreenContext}, f::Symbol) f === :logicalTpcMaskSize && return Ptr{UInt8}(x + 22) f === :padding && return Ptr{UInt8}(x + 23) f === :logicalTpcMask && return Ptr{NTuple{32,UInt32}}(x + 24) + f === :workqueueResourceId && return Ptr{UInt64}(x + 152) + f === :workqueueConcurrencyLimit && return Ptr{UInt32}(x + 160) + f === :workqueueSharingScope && return Ptr{UInt32}(x + 164) return getfield(x, f) end -function Base.getproperty(x::CUpti_ActivityGreenContext, f::Symbol) - r = Ref{CUpti_ActivityGreenContext}(x) - ptr = Base.unsafe_convert(Ptr{CUpti_ActivityGreenContext}, r) +function Base.getproperty(x::CUpti_ActivityGreenContext2, f::Symbol) + r = Ref{CUpti_ActivityGreenContext2}(x) + ptr = Base.unsafe_convert(Ptr{CUpti_ActivityGreenContext2}, r) fptr = getproperty(ptr, f) GC.@preserve r unsafe_load(fptr) end -function Base.setproperty!(x::Ptr{CUpti_ActivityGreenContext}, f::Symbol, v) +function Base.setproperty!(x::Ptr{CUpti_ActivityGreenContext2}, f::Symbol, v) return unsafe_store!(getproperty(x, f), v) end -function Base.propertynames(x::CUpti_ActivityGreenContext, private::Bool=false) +function Base.propertynames(x::CUpti_ActivityGreenContext2, private::Bool=false) return (:kind, :contextId, :parentContextId, :deviceId, :numTpcs, :numMultiprocessors, - :logicalTpcMaskSize, :padding, :logicalTpcMask, if private + :logicalTpcMaskSize, :padding, :logicalTpcMask, :workqueueResourceId, + :workqueueConcurrencyLimit, :workqueueSharingScope, + if private fieldnames(typeof(x)) else () @@ -4587,7 +4781,10 @@ end GREEN_CONTEXT_FIELD_NUM_TPCS = 5 GREEN_CONTEXT_FIELD_LOGICAL_TPC_MASK_SIZE = 6 GREEN_CONTEXT_FIELD_LOGICAL_TPC_MASK = 7 - GREEN_CONTEXT_FIELD_MAX = 8 + GREEN_CONTEXT_FIELD_WORKQUEUE_RESOURCE_ID = 8 + GREEN_CONTEXT_FIELD_WORKQUEUE_CONCURRENCY_LIMIT = 9 + GREEN_CONTEXT_FIELD_WORKQUEUE_SHARING_SCOPE = 10 + GREEN_CONTEXT_FIELD_MAX = 11 end @cenum CUpti_ActivityAttribute::UInt32 begin @@ -4611,10 +4808,10 @@ end CUPTI_ACTIVITY_ATTR_ENABLE_CUDA_EVENT_DEVICE_TIMESTAMPS = 17 CUPTI_ACTIVITY_ATTR_ENABLE_KERNEL_LAUNCH_ATTRIBUTES = 18 CUPTI_ACTIVITY_ATTR_ENABLE_DEVICE_GRAPH_TRACE = 19 - CUPTI_ACTIVITY_ATTR_ENABLE_MULTI_SUBSCRIBER_GRAPH_LEVEL_TRACE = 20 + CUPTI_ACTIVITY_ATTR_ENABLE_MULTI_SUBSCRIBER_GRAPH_TRACE = 20 CUPTI_ACTIVITY_ATTR_THREAD_ID_TYPE = 21 CUPTI_ACTIVITY_ATTR_TIMESTAMP_CALLBACK = 22 - CUPTI_ACTIVITY_ATTR_CIG_MODE = 23 + CUPTI_ACTIVITY_ATTR_ENABLE_CIG_MODE = 23 CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_FORCE_INT = 2147483647 end @@ -4630,6 +4827,12 @@ end @gcsafe_ccall libcupti.cuptiGetTimestamp(timestamp::Ptr{UInt64})::CUptiResult end +@checked function cuptiGetTimestamp_v2(subscriber, timestamp) + initialize_context() + @gcsafe_ccall libcupti.cuptiGetTimestamp_v2(subscriber::CUpti_SubscriberHandle, + timestamp::Ptr{UInt64})::CUptiResult +end + @checked function cuptiGetContextId(context, contextId) initialize_context() @gcsafe_ccall libcupti.cuptiGetContextId(context::CUcontext, @@ -4748,6 +4951,15 @@ end dropped::Ptr{Csize_t})::CUptiResult end +@checked function cuptiActivityGetNumDroppedRecords_v2(subscriber, context, streamId, + dropped) + initialize_context() + @gcsafe_ccall libcupti.cuptiActivityGetNumDroppedRecords_v2(subscriber::CUpti_SubscriberHandle, + context::CUcontext, + streamId::UInt32, + dropped::Ptr{Csize_t})::CUptiResult +end + @checked function cuptiActivityGetNextRecord(buffer, validBufferSizeBytes, record) initialize_context() @gcsafe_ccall libcupti.cuptiActivityGetNextRecord(buffer::Ptr{UInt8}, @@ -4755,6 +4967,15 @@ end record::Ptr{Ptr{CUpti_Activity}})::CUptiResult end +@checked function cuptiActivityGetNextRecord_v2(subscriber, buffer, validBufferSizeBytes, + record) + initialize_context() + @gcsafe_ccall libcupti.cuptiActivityGetNextRecord_v2(subscriber::CUpti_SubscriberHandle, + buffer::Ptr{UInt8}, + validBufferSizeBytes::Csize_t, + record::Ptr{Ptr{CUpti_Activity}})::CUptiResult +end + # typedef void ( CUPTIAPI * CUpti_BuffersCallbackRequestFunc ) ( uint8_t * * buffer , size_t * size , size_t * maxNumRecords ) const CUpti_BuffersCallbackRequestFunc = Ptr{Cvoid} @@ -4908,6 +5129,20 @@ end lastId::Ptr{UInt64})::CUptiResult end +@checked function cuptiActivityPushExternalCorrelationId_v2(subscriber, kind, id) + initialize_context() + @gcsafe_ccall libcupti.cuptiActivityPushExternalCorrelationId_v2(subscriber::CUpti_SubscriberHandle, + kind::CUpti_ExternalCorrelationKind, + id::UInt64)::CUptiResult +end + +@checked function cuptiActivityPopExternalCorrelationId_v2(subscriber, kind, lastId) + initialize_context() + @gcsafe_ccall libcupti.cuptiActivityPopExternalCorrelationId_v2(subscriber::CUpti_SubscriberHandle, + kind::CUpti_ExternalCorrelationKind, + lastId::Ptr{UInt64})::CUptiResult +end + @checked function cuptiActivityEnableLatencyTimestamps(enable) initialize_context() @gcsafe_ccall libcupti.cuptiActivityEnableLatencyTimestamps(enable::UInt8)::CUptiResult @@ -4942,12 +5177,26 @@ end enable::UInt8)::CUptiResult end +@checked function cuptiActivityEnableDriverApi_v2(subscriber, cbid, enable) + initialize_context() + @gcsafe_ccall libcupti.cuptiActivityEnableDriverApi_v2(subscriber::CUpti_SubscriberHandle, + cbid::CUpti_CallbackId, + enable::UInt8)::CUptiResult +end + @checked function cuptiActivityEnableRuntimeApi(cbid, enable) initialize_context() @gcsafe_ccall libcupti.cuptiActivityEnableRuntimeApi(cbid::CUpti_CallbackId, enable::UInt8)::CUptiResult end +@checked function cuptiActivityEnableRuntimeApi_v2(subscriber, cbid, enable) + initialize_context() + @gcsafe_ccall libcupti.cuptiActivityEnableRuntimeApi_v2(subscriber::CUpti_SubscriberHandle, + cbid::CUpti_CallbackId, + enable::UInt8)::CUptiResult +end + @checked function cuptiActivityEnableHWTrace(enable) initialize_context() @gcsafe_ccall libcupti.cuptiActivityEnableHWTrace(enable::UInt8)::CUptiResult @@ -5429,28 +5678,28 @@ function Base.propertynames(x::CUpti_ActivityKernel, private::Bool=false) end...) end -struct var"##Ctag#409" +struct var"##Ctag#403" data::NTuple{1,UInt8} end -function Base.getproperty(x::Ptr{var"##Ctag#409"}, f::Symbol) +function Base.getproperty(x::Ptr{var"##Ctag#403"}, f::Symbol) f === :both && return Ptr{UInt8}(x + 0) - f === :config && return Ptr{var"##Ctag#410"}(x + 0) + f === :config && return Ptr{var"##Ctag#404"}(x + 0) return getfield(x, f) end -function Base.getproperty(x::var"##Ctag#409", f::Symbol) - r = Ref{var"##Ctag#409"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#409"}, r) +function Base.getproperty(x::var"##Ctag#403", f::Symbol) + r = Ref{var"##Ctag#403"}(x) + ptr = Base.unsafe_convert(Ptr{var"##Ctag#403"}, r) fptr = getproperty(ptr, f) GC.@preserve r unsafe_load(fptr) end -function Base.setproperty!(x::Ptr{var"##Ctag#409"}, f::Symbol, v) +function Base.setproperty!(x::Ptr{var"##Ctag#403"}, f::Symbol, v) return unsafe_store!(getproperty(x, f), v) end -function Base.propertynames(x::var"##Ctag#409", private::Bool=false) +function Base.propertynames(x::var"##Ctag#403", private::Bool=false) return (:both, :config, if private fieldnames(typeof(x)) else @@ -5464,7 +5713,7 @@ end function Base.getproperty(x::Ptr{CUpti_ActivityKernel2}, f::Symbol) f === :kind && return Ptr{CUpti_ActivityKind}(x + 0) - f === :cacheConfig && return Ptr{var"##Ctag#409"}(x + 4) + f === :cacheConfig && return Ptr{var"##Ctag#403"}(x + 4) f === :sharedMemoryConfig && return Ptr{UInt8}(x + 5) f === :registersPerThread && return Ptr{UInt16}(x + 6) f === :start && return Ptr{UInt64}(x + 8) @@ -6008,28 +6257,28 @@ function Base.propertynames(x::CUpti_ActivityKernel7, private::Bool=false) end...) end -struct var"##Ctag#423" +struct var"##Ctag#417" data::NTuple{1,UInt8} end -function Base.getproperty(x::Ptr{var"##Ctag#423"}, f::Symbol) +function Base.getproperty(x::Ptr{var"##Ctag#417"}, f::Symbol) f === :both && return Ptr{UInt8}(x + 0) - f === :config && return Ptr{var"##Ctag#424"}(x + 0) + f === :config && return Ptr{var"##Ctag#418"}(x + 0) return getfield(x, f) end -function Base.getproperty(x::var"##Ctag#423", f::Symbol) - r = Ref{var"##Ctag#423"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#423"}, r) +function Base.getproperty(x::var"##Ctag#417", f::Symbol) + r = Ref{var"##Ctag#417"}(x) + ptr = Base.unsafe_convert(Ptr{var"##Ctag#417"}, r) fptr = getproperty(ptr, f) GC.@preserve r unsafe_load(fptr) end -function Base.setproperty!(x::Ptr{var"##Ctag#423"}, f::Symbol, v) +function Base.setproperty!(x::Ptr{var"##Ctag#417"}, f::Symbol, v) return unsafe_store!(getproperty(x, f), v) end -function Base.propertynames(x::var"##Ctag#423", private::Bool=false) +function Base.propertynames(x::var"##Ctag#417", private::Bool=false) return (:both, :config, if private fieldnames(typeof(x)) else @@ -6043,7 +6292,7 @@ end function Base.getproperty(x::Ptr{CUpti_ActivityKernel8}, f::Symbol) f === :kind && return Ptr{CUpti_ActivityKind}(x + 0) - f === :cacheConfig && return Ptr{var"##Ctag#423"}(x + 4) + f === :cacheConfig && return Ptr{var"##Ctag#417"}(x + 4) f === :sharedMemoryConfig && return Ptr{UInt8}(x + 5) f === :registersPerThread && return Ptr{UInt16}(x + 6) f === :partitionedGlobalCacheRequested && @@ -6348,6 +6597,124 @@ function Base.propertynames(x::CUpti_ActivityKernel10, private::Bool=false) end...) end +struct var"##Ctag#439" + data::NTuple{1,UInt8} +end + +function Base.getproperty(x::Ptr{var"##Ctag#439"}, f::Symbol) + f === :both && return Ptr{UInt8}(x + 0) + f === :config && return Ptr{var"##Ctag#440"}(x + 0) + return getfield(x, f) +end + +function Base.getproperty(x::var"##Ctag#439", f::Symbol) + r = Ref{var"##Ctag#439"}(x) + ptr = Base.unsafe_convert(Ptr{var"##Ctag#439"}, r) + fptr = getproperty(ptr, f) + GC.@preserve r unsafe_load(fptr) +end + +function Base.setproperty!(x::Ptr{var"##Ctag#439"}, f::Symbol, v) + return unsafe_store!(getproperty(x, f), v) +end + +function Base.propertynames(x::var"##Ctag#439", private::Bool=false) + return (:both, :config, if private + fieldnames(typeof(x)) + else + () + end...) +end + +struct CUpti_ActivityKernel11 + data::NTuple{224,UInt8} +end + +function Base.getproperty(x::Ptr{CUpti_ActivityKernel11}, f::Symbol) + f === :kind && return Ptr{CUpti_ActivityKind}(x + 0) + f === :cacheConfig && return Ptr{var"##Ctag#439"}(x + 4) + f === :sharedMemoryConfig && return Ptr{UInt8}(x + 5) + f === :registersPerThread && return Ptr{UInt16}(x + 6) + f === :partitionedGlobalCacheRequested && + return Ptr{CUpti_ActivityPartitionedGlobalCacheConfig}(x + 8) + f === :partitionedGlobalCacheExecuted && + return Ptr{CUpti_ActivityPartitionedGlobalCacheConfig}(x + 12) + f === :start && return Ptr{UInt64}(x + 16) + f === :_end && return Ptr{UInt64}(x + 24) + f === :completed && return Ptr{UInt64}(x + 32) + f === :deviceId && return Ptr{UInt32}(x + 40) + f === :contextId && return Ptr{UInt32}(x + 44) + f === :streamId && return Ptr{UInt32}(x + 48) + f === :gridX && return Ptr{Int32}(x + 52) + f === :gridY && return Ptr{Int32}(x + 56) + f === :gridZ && return Ptr{Int32}(x + 60) + f === :blockX && return Ptr{Int32}(x + 64) + f === :blockY && return Ptr{Int32}(x + 68) + f === :blockZ && return Ptr{Int32}(x + 72) + f === :staticSharedMemory && return Ptr{Int32}(x + 76) + f === :dynamicSharedMemory && return Ptr{Int32}(x + 80) + f === :localMemoryPerThread && return Ptr{UInt32}(x + 84) + f === :localMemoryTotal && return Ptr{UInt32}(x + 88) + f === :correlationId && return Ptr{UInt32}(x + 92) + f === :gridId && return Ptr{Int64}(x + 96) + f === :name && return Ptr{Cstring}(x + 104) + f === :reserved0 && return Ptr{Ptr{Cvoid}}(x + 112) + f === :queued && return Ptr{UInt64}(x + 120) + f === :submitted && return Ptr{UInt64}(x + 128) + f === :launchType && return Ptr{UInt8}(x + 136) + f === :isSharedMemoryCarveoutRequested && return Ptr{UInt8}(x + 137) + f === :sharedMemoryCarveoutRequested && return Ptr{UInt8}(x + 138) + f === :padding && return Ptr{UInt8}(x + 139) + f === :sharedMemoryExecuted && return Ptr{UInt32}(x + 140) + f === :graphNodeId && return Ptr{UInt64}(x + 144) + f === :shmemLimitConfig && return Ptr{CUpti_FuncShmemLimitConfig}(x + 152) + f === :graphId && return Ptr{UInt32}(x + 156) + f === :pAccessPolicyWindow && return Ptr{Ptr{CUaccessPolicyWindow}}(x + 160) + f === :channelID && return Ptr{UInt32}(x + 168) + f === :channelType && return Ptr{CUpti_ChannelType}(x + 172) + f === :clusterX && return Ptr{UInt32}(x + 176) + f === :clusterY && return Ptr{UInt32}(x + 180) + f === :clusterZ && return Ptr{UInt32}(x + 184) + f === :clusterSchedulingPolicy && return Ptr{UInt32}(x + 188) + f === :localMemoryTotal_v2 && return Ptr{UInt64}(x + 192) + f === :maxPotentialClusterSize && return Ptr{UInt32}(x + 200) + f === :maxActiveClusters && return Ptr{UInt32}(x + 204) + f === :isDeviceLaunched && return Ptr{UInt8}(x + 208) + f === :padding3 && return Ptr{NTuple{7,UInt8}}(x + 209) + f === :priority && return Ptr{Int32}(x + 216) + f === :padding4 && return Ptr{UInt32}(x + 220) + return getfield(x, f) +end + +function Base.getproperty(x::CUpti_ActivityKernel11, f::Symbol) + r = Ref{CUpti_ActivityKernel11}(x) + ptr = Base.unsafe_convert(Ptr{CUpti_ActivityKernel11}, r) + fptr = getproperty(ptr, f) + GC.@preserve r unsafe_load(fptr) +end + +function Base.setproperty!(x::Ptr{CUpti_ActivityKernel11}, f::Symbol, v) + return unsafe_store!(getproperty(x, f), v) +end + +function Base.propertynames(x::CUpti_ActivityKernel11, private::Bool=false) + return (:kind, :cacheConfig, :sharedMemoryConfig, :registersPerThread, + :partitionedGlobalCacheRequested, :partitionedGlobalCacheExecuted, :start, + :_end, :completed, :deviceId, :contextId, :streamId, :gridX, :gridY, :gridZ, + :blockX, :blockY, :blockZ, :staticSharedMemory, :dynamicSharedMemory, + :localMemoryPerThread, :localMemoryTotal, :correlationId, :gridId, :name, + :reserved0, :queued, :submitted, :launchType, :isSharedMemoryCarveoutRequested, + :sharedMemoryCarveoutRequested, :padding, :sharedMemoryExecuted, :graphNodeId, + :shmemLimitConfig, :graphId, :pAccessPolicyWindow, :channelID, :channelType, + :clusterX, :clusterY, :clusterZ, :clusterSchedulingPolicy, :localMemoryTotal_v2, + :maxPotentialClusterSize, :maxActiveClusters, :isDeviceLaunched, :padding3, + :priority, :padding4, if private + fieldnames(typeof(x)) + else + () + end...) +end + struct CUpti_ActivityMemcpy data::NTuple{64,UInt8} end @@ -8067,6 +8434,43 @@ function Base.propertynames(x::CUpti_ActivityMarkerData, private::Bool=false) end...) end +struct CUpti_ActivityGreenContext + data::NTuple{152,UInt8} +end + +function Base.getproperty(x::Ptr{CUpti_ActivityGreenContext}, f::Symbol) + f === :kind && return Ptr{CUpti_ActivityKind}(x + 0) + f === :contextId && return Ptr{UInt32}(x + 4) + f === :parentContextId && return Ptr{UInt32}(x + 8) + f === :deviceId && return Ptr{UInt32}(x + 12) + f === :numTpcs && return Ptr{UInt32}(x + 16) + f === :numMultiprocessors && return Ptr{UInt16}(x + 20) + f === :logicalTpcMaskSize && return Ptr{UInt8}(x + 22) + f === :padding && return Ptr{UInt8}(x + 23) + f === :logicalTpcMask && return Ptr{NTuple{32,UInt32}}(x + 24) + return getfield(x, f) +end + +function Base.getproperty(x::CUpti_ActivityGreenContext, f::Symbol) + r = Ref{CUpti_ActivityGreenContext}(x) + ptr = Base.unsafe_convert(Ptr{CUpti_ActivityGreenContext}, r) + fptr = getproperty(ptr, f) + GC.@preserve r unsafe_load(fptr) +end + +function Base.setproperty!(x::Ptr{CUpti_ActivityGreenContext}, f::Symbol, v) + return unsafe_store!(getproperty(x, f), v) +end + +function Base.propertynames(x::CUpti_ActivityGreenContext, private::Bool=false) + return (:kind, :contextId, :parentContextId, :deviceId, :numTpcs, :numMultiprocessors, + :logicalTpcMaskSize, :padding, :logicalTpcMask, if private + fieldnames(typeof(x)) + else + () + end...) +end + @cenum CUpti_driver_api_trace_cbid_enum::UInt32 begin CUPTI_DRIVER_TRACE_CBID_INVALID = 0 CUPTI_DRIVER_TRACE_CBID_cuInit = 1 @@ -8914,7 +9318,9 @@ end CUPTI_DRIVER_TRACE_CBID_cuCoredumpRegisterCompleteCallback = 843 CUPTI_DRIVER_TRACE_CBID_cuCoredumpDeregisterStartCallback = 844 CUPTI_DRIVER_TRACE_CBID_cuCoredumpDeregisterCompleteCallback = 845 - CUPTI_DRIVER_TRACE_CBID_SIZE = 846 + CUPTI_DRIVER_TRACE_CBID_cuStreamBeginRecaptureToGraph = 846 + CUPTI_DRIVER_TRACE_CBID_cuStreamBeginRecaptureToGraph_ptsz = 847 + CUPTI_DRIVER_TRACE_CBID_SIZE = 848 CUPTI_DRIVER_TRACE_CBID_FORCE_INT = 2147483647 end @@ -9715,7 +10121,7 @@ struct CUpti_Profiler_GetCounterAvailability_Params ctx::CUcontext counterAvailabilityImageSize::Csize_t pCounterAvailabilityImage::Ptr{UInt8} - bAllowDeviceLevelCounters::Cint + bAllowDeviceLevelCounters::UInt8 end @cenum CUpti_Profiler_Support_Level::UInt32 begin @@ -9723,6 +10129,7 @@ end CUPTI_PROFILER_CONFIGURATION_UNSUPPORTED = 1 CUPTI_PROFILER_CONFIGURATION_DISABLED = 2 CUPTI_PROFILER_CONFIGURATION_SUPPORTED = 3 + CUPTI_PROFILER_CONFIGURATION_LIMITED_SUPPORT = 4 end @cenum CUpti_Profiler_API::UInt32 begin @@ -10184,79 +10591,13 @@ function Base.setproperty!(x::Ptr{var"##Ctag#399"}, f::Symbol, v) return unsafe_store!(getproperty(x, f), v) end -struct var"##Ctag#401" - smClock::UInt32 - memoryClock::UInt32 - pcieLinkGen::UInt32 - pcieLinkWidth::UInt32 - clocksThrottleReasons::CUpti_EnvironmentClocksThrottleReason -end -function Base.getproperty(x::Ptr{var"##Ctag#401"}, f::Symbol) - f === :smClock && return Ptr{UInt32}(x + 0) - f === :memoryClock && return Ptr{UInt32}(x + 4) - f === :pcieLinkGen && return Ptr{UInt32}(x + 8) - f === :pcieLinkWidth && return Ptr{UInt32}(x + 12) - f === :clocksThrottleReasons && - return Ptr{CUpti_EnvironmentClocksThrottleReason}(x + 16) - return getfield(x, f) -end - -function Base.getproperty(x::var"##Ctag#401", f::Symbol) - r = Ref{var"##Ctag#401"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#401"}, r) - fptr = getproperty(ptr, f) - GC.@preserve r unsafe_load(fptr) -end - -function Base.setproperty!(x::Ptr{var"##Ctag#401"}, f::Symbol, v) - return unsafe_store!(getproperty(x, f), v) -end - -struct var"##Ctag#402" - gpuTemperature::UInt32 -end -function Base.getproperty(x::Ptr{var"##Ctag#402"}, f::Symbol) - f === :gpuTemperature && return Ptr{UInt32}(x + 0) - return getfield(x, f) -end - -function Base.getproperty(x::var"##Ctag#402", f::Symbol) - r = Ref{var"##Ctag#402"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#402"}, r) - fptr = getproperty(ptr, f) - GC.@preserve r unsafe_load(fptr) -end - -function Base.setproperty!(x::Ptr{var"##Ctag#402"}, f::Symbol, v) - return unsafe_store!(getproperty(x, f), v) -end - -struct var"##Ctag#403" - power::UInt32 - powerLimit::UInt32 -end -function Base.getproperty(x::Ptr{var"##Ctag#403"}, f::Symbol) - f === :power && return Ptr{UInt32}(x + 0) - f === :powerLimit && return Ptr{UInt32}(x + 4) - return getfield(x, f) -end - -function Base.getproperty(x::var"##Ctag#403", f::Symbol) - r = Ref{var"##Ctag#403"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#403"}, r) - fptr = getproperty(ptr, f) - GC.@preserve r unsafe_load(fptr) -end - -function Base.setproperty!(x::Ptr{var"##Ctag#403"}, f::Symbol, v) - return unsafe_store!(getproperty(x, f), v) -end - struct var"##Ctag#404" - fanSpeed::UInt32 + requested::UInt8 + executed::UInt8 end function Base.getproperty(x::Ptr{var"##Ctag#404"}, f::Symbol) - f === :fanSpeed && return Ptr{UInt32}(x + 0) + f === :requested && return (Ptr{UInt8}(x + 0), 0, 4) + f === :executed && return (Ptr{UInt8}(x + 0), 4, 4) return getfield(x, f) end @@ -10271,198 +10612,110 @@ function Base.setproperty!(x::Ptr{var"##Ctag#404"}, f::Symbol, v) return unsafe_store!(getproperty(x, f), v) end -struct var"##Ctag#407" - uuidDev::CUuuid - peerDev::NTuple{32,CUdevice} -end -function Base.getproperty(x::Ptr{var"##Ctag#407"}, f::Symbol) - f === :uuidDev && return Ptr{CUuuid}(x + 0) - f === :peerDev && return Ptr{NTuple{32,CUdevice}}(x + 16) - return getfield(x, f) -end - -function Base.getproperty(x::var"##Ctag#407", f::Symbol) - r = Ref{var"##Ctag#407"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#407"}, r) - fptr = getproperty(ptr, f) - GC.@preserve r unsafe_load(fptr) -end - -function Base.setproperty!(x::Ptr{var"##Ctag#407"}, f::Symbol, v) - return unsafe_store!(getproperty(x, f), v) -end - -struct var"##Ctag#408" - secondaryBus::UInt16 - deviceId::UInt16 - vendorId::UInt16 - pad0::UInt16 -end -function Base.getproperty(x::Ptr{var"##Ctag#408"}, f::Symbol) - f === :secondaryBus && return Ptr{UInt16}(x + 0) - f === :deviceId && return Ptr{UInt16}(x + 2) - f === :vendorId && return Ptr{UInt16}(x + 4) - f === :pad0 && return Ptr{UInt16}(x + 6) - return getfield(x, f) -end - -function Base.getproperty(x::var"##Ctag#408", f::Symbol) - r = Ref{var"##Ctag#408"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#408"}, r) - fptr = getproperty(ptr, f) - GC.@preserve r unsafe_load(fptr) -end - -function Base.setproperty!(x::Ptr{var"##Ctag#408"}, f::Symbol, v) - return unsafe_store!(getproperty(x, f), v) -end - -struct var"##Ctag#410" +struct var"##Ctag#418" requested::UInt8 executed::UInt8 end -function Base.getproperty(x::Ptr{var"##Ctag#410"}, f::Symbol) +function Base.getproperty(x::Ptr{var"##Ctag#418"}, f::Symbol) f === :requested && return (Ptr{UInt8}(x + 0), 0, 4) f === :executed && return (Ptr{UInt8}(x + 0), 4, 4) return getfield(x, f) end -function Base.getproperty(x::var"##Ctag#410", f::Symbol) - r = Ref{var"##Ctag#410"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#410"}, r) +function Base.getproperty(x::var"##Ctag#418", f::Symbol) + r = Ref{var"##Ctag#418"}(x) + ptr = Base.unsafe_convert(Ptr{var"##Ctag#418"}, r) fptr = getproperty(ptr, f) GC.@preserve r unsafe_load(fptr) end -function Base.setproperty!(x::Ptr{var"##Ctag#410"}, f::Symbol, v) +function Base.setproperty!(x::Ptr{var"##Ctag#418"}, f::Symbol, v) return unsafe_store!(getproperty(x, f), v) end -struct var"##Ctag#424" +struct var"##Ctag#425" requested::UInt8 executed::UInt8 end -function Base.getproperty(x::Ptr{var"##Ctag#424"}, f::Symbol) +function Base.getproperty(x::Ptr{var"##Ctag#425"}, f::Symbol) f === :requested && return (Ptr{UInt8}(x + 0), 0, 4) f === :executed && return (Ptr{UInt8}(x + 0), 4, 4) return getfield(x, f) end -function Base.getproperty(x::var"##Ctag#424", f::Symbol) - r = Ref{var"##Ctag#424"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#424"}, r) +function Base.getproperty(x::var"##Ctag#425", f::Symbol) + r = Ref{var"##Ctag#425"}(x) + ptr = Base.unsafe_convert(Ptr{var"##Ctag#425"}, r) fptr = getproperty(ptr, f) GC.@preserve r unsafe_load(fptr) end -function Base.setproperty!(x::Ptr{var"##Ctag#424"}, f::Symbol, v) +function Base.setproperty!(x::Ptr{var"##Ctag#425"}, f::Symbol, v) return unsafe_store!(getproperty(x, f), v) end -struct var"##Ctag#441" +struct var"##Ctag#437" processId::UInt32 threadId::UInt32 end -function Base.getproperty(x::Ptr{var"##Ctag#441"}, f::Symbol) +function Base.getproperty(x::Ptr{var"##Ctag#437"}, f::Symbol) f === :processId && return Ptr{UInt32}(x + 0) f === :threadId && return Ptr{UInt32}(x + 4) return getfield(x, f) end -function Base.getproperty(x::var"##Ctag#441", f::Symbol) - r = Ref{var"##Ctag#441"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#441"}, r) +function Base.getproperty(x::var"##Ctag#437", f::Symbol) + r = Ref{var"##Ctag#437"}(x) + ptr = Base.unsafe_convert(Ptr{var"##Ctag#437"}, r) fptr = getproperty(ptr, f) GC.@preserve r unsafe_load(fptr) end -function Base.setproperty!(x::Ptr{var"##Ctag#441"}, f::Symbol, v) +function Base.setproperty!(x::Ptr{var"##Ctag#437"}, f::Symbol, v) return unsafe_store!(getproperty(x, f), v) end -struct var"##Ctag#442" +struct var"##Ctag#438" deviceId::UInt32 contextId::UInt32 streamId::UInt32 end -function Base.getproperty(x::Ptr{var"##Ctag#442"}, f::Symbol) +function Base.getproperty(x::Ptr{var"##Ctag#438"}, f::Symbol) f === :deviceId && return Ptr{UInt32}(x + 0) f === :contextId && return Ptr{UInt32}(x + 4) f === :streamId && return Ptr{UInt32}(x + 8) return getfield(x, f) end -function Base.getproperty(x::var"##Ctag#442", f::Symbol) - r = Ref{var"##Ctag#442"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#442"}, r) +function Base.getproperty(x::var"##Ctag#438", f::Symbol) + r = Ref{var"##Ctag#438"}(x) + ptr = Base.unsafe_convert(Ptr{var"##Ctag#438"}, r) fptr = getproperty(ptr, f) GC.@preserve r unsafe_load(fptr) end -function Base.setproperty!(x::Ptr{var"##Ctag#442"}, f::Symbol, v) +function Base.setproperty!(x::Ptr{var"##Ctag#438"}, f::Symbol, v) return unsafe_store!(getproperty(x, f), v) end -struct var"##Ctag#444" +struct var"##Ctag#440" requested::UInt8 executed::UInt8 end -function Base.getproperty(x::Ptr{var"##Ctag#444"}, f::Symbol) +function Base.getproperty(x::Ptr{var"##Ctag#440"}, f::Symbol) f === :requested && return (Ptr{UInt8}(x + 0), 0, 4) f === :executed && return (Ptr{UInt8}(x + 0), 4, 4) return getfield(x, f) end -function Base.getproperty(x::var"##Ctag#444", f::Symbol) - r = Ref{var"##Ctag#444"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#444"}, r) - fptr = getproperty(ptr, f) - GC.@preserve r unsafe_load(fptr) -end - -function Base.setproperty!(x::Ptr{var"##Ctag#444"}, f::Symbol, v) - return unsafe_store!(getproperty(x, f), v) -end - -struct var"##Ctag#446" - index::UInt32 - domainId::UInt32 -end -function Base.getproperty(x::Ptr{var"##Ctag#446"}, f::Symbol) - f === :index && return Ptr{UInt32}(x + 0) - f === :domainId && return Ptr{UInt32}(x + 4) - return getfield(x, f) -end - -function Base.getproperty(x::var"##Ctag#446", f::Symbol) - r = Ref{var"##Ctag#446"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#446"}, r) - fptr = getproperty(ptr, f) - GC.@preserve r unsafe_load(fptr) -end - -function Base.setproperty!(x::Ptr{var"##Ctag#446"}, f::Symbol, v) - return unsafe_store!(getproperty(x, f), v) -end - -struct var"##Ctag#448" - index::UInt32 - domainId::UInt32 -end -function Base.getproperty(x::Ptr{var"##Ctag#448"}, f::Symbol) - f === :index && return Ptr{UInt32}(x + 0) - f === :domainId && return Ptr{UInt32}(x + 4) - return getfield(x, f) -end - -function Base.getproperty(x::var"##Ctag#448", f::Symbol) - r = Ref{var"##Ctag#448"}(x) - ptr = Base.unsafe_convert(Ptr{var"##Ctag#448"}, r) +function Base.getproperty(x::var"##Ctag#440", f::Symbol) + r = Ref{var"##Ctag#440"}(x) + ptr = Base.unsafe_convert(Ptr{var"##Ctag#440"}, r) fptr = getproperty(ptr, f) GC.@preserve r unsafe_load(fptr) end -function Base.setproperty!(x::Ptr{var"##Ctag#448"}, f::Symbol, v) +function Base.setproperty!(x::Ptr{var"##Ctag#440"}, f::Symbol, v) return unsafe_store!(getproperty(x, f), v) end diff --git a/lib/curand/Project.toml b/lib/curand/Project.toml index 5177bcb3c2..48d41f8016 100644 --- a/lib/curand/Project.toml +++ b/lib/curand/Project.toml @@ -20,7 +20,7 @@ RandomNumbers = "e6cf234a-135c-5ec9-84dd-332b85af5143" CEnum = "0.2, 0.3, 0.4, 0.5" CUDACore = "=6.1.0" CUDA_Runtime_Discovery = "2" -CUDA_Runtime_jll = "0.22" +CUDA_Runtime_jll = "0.23" GPUToolbox = "0.3, 1" Random = "1" Random123 = "1.2" diff --git a/lib/cusolver/Project.toml b/lib/cusolver/Project.toml index 9df0ef4147..3aa905b5c1 100644 --- a/lib/cusolver/Project.toml +++ b/lib/cusolver/Project.toml @@ -25,7 +25,7 @@ cuSPARSE = {path = "../cusparse"} CEnum = "0.2, 0.3, 0.4, 0.5" CUDACore = "=6.1.0" CUDA_Runtime_Discovery = "2" -CUDA_Runtime_jll = "0.22" +CUDA_Runtime_jll = "0.23" GPUToolbox = "0.3, 1" LinearAlgebra = "1" SparseArrays = "1" diff --git a/lib/cusolver/src/libcusolver.jl b/lib/cusolver/src/libcusolver.jl index 063a51588a..e5d25d6020 100644 --- a/lib/cusolver/src/libcusolver.jl +++ b/lib/cusolver/src/libcusolver.jl @@ -54,6 +54,7 @@ const cusolverDnParams_t = Ptr{cusolverDnParams} CUSOLVERDN_GETRF = 0 CUSOLVERDN_POTRF = 1 CUSOLVERDN_SYEVBATCHED = 2 + CUSOLVERDN_GEQRF = 3 end const cusolver_int_t = Cint @@ -103,6 +104,12 @@ end CUSOLVER_EIG_RANGE_V = 1003 end +@cenum cusolverEigComp_t::UInt32 begin + CUSOLVER_EIG_COMP_N = 10 + CUSOLVER_EIG_COMP_I = 11 + CUSOLVER_EIG_COMP_V = 12 +end + @cenum cusolverNorm_t::UInt32 begin CUSOLVER_INF_NORM = 104 CUSOLVER_MAX_NORM = 105 @@ -4630,6 +4637,43 @@ end info::CuPtr{Cint})::cusolverStatus_t end +@checked function cusolverDnXstedc_bufferSize(handle, params, compz, n, dataTypeDE, D, E, + dataTypeZ, Z, ldz, computeType, + workspaceInBytesOnDevice, + workspaceInBytesOnHost) + initialize_context() + @gcsafe_ccall libcusolver.cusolverDnXstedc_bufferSize(handle::cusolverDnHandle_t, + params::cusolverDnParams_t, + compz::cusolverEigComp_t, + n::Int64, + dataTypeDE::cudaDataType, + D::Ptr{Cvoid}, E::Ptr{Cvoid}, + dataTypeZ::cudaDataType, + Z::Ptr{Cvoid}, ldz::Int64, + computeType::cudaDataType, + workspaceInBytesOnDevice::Ptr{Csize_t}, + workspaceInBytesOnHost::Ptr{Csize_t})::cusolverStatus_t +end + +@checked function cusolverDnXstedc(handle, params, compz, n, dataTypeDE, D, E, dataTypeZ, Z, + ldz, computeType, bufferOnDevice, + workspaceInBytesOnDevice, bufferOnHost, + workspaceInBytesOnHost, info) + initialize_context() + @gcsafe_ccall libcusolver.cusolverDnXstedc(handle::cusolverDnHandle_t, + params::cusolverDnParams_t, + compz::cusolverEigComp_t, n::Int64, + dataTypeDE::cudaDataType, D::Ptr{Cvoid}, + E::Ptr{Cvoid}, dataTypeZ::cudaDataType, + Z::Ptr{Cvoid}, ldz::Int64, + computeType::cudaDataType, + bufferOnDevice::Ptr{Cvoid}, + workspaceInBytesOnDevice::Csize_t, + bufferOnHost::Ptr{Cvoid}, + workspaceInBytesOnHost::Csize_t, + info::Ptr{Cint})::cusolverStatus_t +end + @checked function cusolverDnXsyevBatched_bufferSize(handle, params, jobz, uplo, n, dataTypeA, A, lda, dataTypeW, W, computeType, workspaceInBytesOnDevice, @@ -4974,12 +5018,52 @@ end @gcsafe_ccall libcusolver.cusolverDnLoggerSetMask(mask::Cint)::cusolverStatus_t end -# no prototype is found for this function at cusolverDn.h:4897:32, please use with caution +# no prototype is found for this function at cusolverDn.h:4932:32, please use with caution @checked function cusolverDnLoggerForceDisable() initialize_context() @gcsafe_ccall libcusolver.cusolverDnLoggerForceDisable()::cusolverStatus_t end +@checked function cusolverDnXpolar_bufferSize(handle, params, uplo, M, N, dataTypeA, A, lda, + dataTypeH, H, ldh, computeType, + workspaceInBytesOnDevice, + workspaceInBytesOnHost) + initialize_context() + @gcsafe_ccall libcusolver.cusolverDnXpolar_bufferSize(handle::cusolverDnHandle_t, + params::cusolverDnParams_t, + uplo::cublasFillMode_t, M::Int64, + N::Int64, dataTypeA::cudaDataType, + A::Ptr{Cvoid}, lda::Int64, + dataTypeH::cudaDataType, + H::Ptr{Cvoid}, ldh::Int64, + computeType::cudaDataType, + workspaceInBytesOnDevice::Ptr{Csize_t}, + workspaceInBytesOnHost::Ptr{Csize_t})::cusolverStatus_t +end + +@checked function cusolverDnXpolar(handle, params, uplo, M, N, dataTypeA, A, lda, dataTypeH, + H, ldh, computeType, bufferOnDevice, + workspaceInBytesOnDevice, bufferOnHost, + workspaceInBytesOnHost, d_res_nrm, d_A_nrmF, d_rcond, + d_info) + initialize_context() + @gcsafe_ccall libcusolver.cusolverDnXpolar(handle::cusolverDnHandle_t, + params::cusolverDnParams_t, + uplo::cublasFillMode_t, M::Int64, N::Int64, + dataTypeA::cudaDataType, A::Ptr{Cvoid}, + lda::Int64, dataTypeH::cudaDataType, + H::Ptr{Cvoid}, ldh::Int64, + computeType::cudaDataType, + bufferOnDevice::Ptr{Cvoid}, + workspaceInBytesOnDevice::Csize_t, + bufferOnHost::Ptr{Cvoid}, + workspaceInBytesOnHost::Csize_t, + d_res_nrm::Ptr{Cdouble}, + d_A_nrmF::Ptr{Cdouble}, + d_rcond::Ptr{Cdouble}, + d_info::Ptr{Cint})::cusolverStatus_t +end + mutable struct cusolverSpContext end const cusolverSpHandle_t = Ptr{cusolverSpContext} diff --git a/lib/cusparse/Project.toml b/lib/cusparse/Project.toml index 5f5125e9b5..edc866e4c0 100644 --- a/lib/cusparse/Project.toml +++ b/lib/cusparse/Project.toml @@ -29,7 +29,7 @@ Adapt = "4.4" CEnum = "0.2, 0.3, 0.4, 0.5" CUDACore = "=6.1.0" CUDA_Runtime_Discovery = "2" -CUDA_Runtime_jll = "0.22" +CUDA_Runtime_jll = "0.23" GPUArrays = "11.4.1" GPUToolbox = "0.3, 1" KernelAbstractions = "0.9.38" diff --git a/lib/cusparse/src/extra.jl b/lib/cusparse/src/extra.jl index 4cbbcd103e..09486bc8a5 100644 --- a/lib/cusparse/src/extra.jl +++ b/lib/cusparse/src/extra.jl @@ -7,12 +7,59 @@ Performs `C = alpha * A + beta * B`. `A` and `B` are sparse matrices defined in """ geam(alpha::Number, A::CuSparseMatrixCSR, beta::Number, B::CuSparseMatrixCSR, index::SparseChar) +function geam(alpha::Number, A::CuSparseMatrixCSR{T}, beta::Number, B::CuSparseMatrixCSR{T}, index::SparseChar) where {T <: BlasFloat} + # the generic SpGEAM API was added in cuSPARSE 12.8.1 (CUDA 13.3); use the + # legacy csrgeam2 path on older versions. + version() >= v"12.8.1" ? geam_spgeam(alpha, A, beta, B, index) : + geam_csrgeam2(alpha, A, beta, B, index) +end + +function geam_spgeam(alpha::Number, A::CuSparseMatrixCSR{T}, beta::Number, B::CuSparseMatrixCSR{T}, index::SparseChar) where {T <: BlasFloat} + m, n = size(A) + (m, n) == size(B) || throw(DimensionMismatch("dimensions must match: A has dims $(size(A)), B has dims $(size(B))")) + + alpha_ref = Ref{T}(convert(T, alpha)) + beta_ref = Ref{T}(convert(T, beta)) + descA = CuSparseMatrixDescriptor(A, index) + descB = CuSparseMatrixDescriptor(B, index) + + rowPtrC = CuVector{Cint}(undef, m + 1) + descC = CuSparseMatrixDescriptor(CuSparseMatrixCSR, rowPtrC, T, Cint, m, n, index) + spgeam_desc = CuSpGEAMDescriptor() + + local C + GC.@preserve rowPtrC begin + function bufferSize() + out = Ref{Csize_t}() + cusparseSpGEAM_bufferSize(handle(), 'N', 'N', alpha_ref, descA, beta_ref, descB, + descC, T, CUSPARSE_SPGEAM_ALG_DEFAULT, spgeam_desc, out) + return out[] + end + with_workspace(bufferSize) do buffer + # determine the sparsity pattern (fills rowPtrC) and the number of nonzeros + cusparseSpGEAM_nnz(handle(), 'N', 'N', alpha_ref, descA, beta_ref, descB, + descC, T, CUSPARSE_SPGEAM_ALG_DEFAULT, spgeam_desc, buffer) + nnzC = Ref{Int64}() + cusparseSpMatGetSize(descC, Ref{Int64}(), Ref{Int64}(), nnzC) + + colValC = CuVector{Cint}(undef, nnzC[]) + nzValC = CuVector{T}(undef, nnzC[]) + C = CuSparseMatrixCSR{T, Cint}(rowPtrC, colValC, nzValC, (m, n)) + cusparseCsrSetPointers(descC, C.rowPtr, C.colVal, C.nzVal) + + cusparseSpGEAM(handle(), 'N', 'N', alpha_ref, descA, beta_ref, descB, + descC, T, CUSPARSE_SPGEAM_ALG_DEFAULT, spgeam_desc, buffer) + end + end + return C +end + for (bname,gname,elty) in ((:cusparseScsrgeam2_bufferSizeExt, :cusparseScsrgeam2, :Float32), (:cusparseDcsrgeam2_bufferSizeExt, :cusparseDcsrgeam2, :Float64), (:cusparseCcsrgeam2_bufferSizeExt, :cusparseCcsrgeam2, :ComplexF32), (:cusparseZcsrgeam2_bufferSizeExt, :cusparseZcsrgeam2, :ComplexF64)) @eval begin - function geam(alpha::Number, A::CuSparseMatrixCSR{$elty}, beta::Number, B::CuSparseMatrixCSR{$elty}, index::SparseChar) + function geam_csrgeam2(alpha::Number, A::CuSparseMatrixCSR{$elty}, beta::Number, B::CuSparseMatrixCSR{$elty}, index::SparseChar) m, n = size(A) (m, n) == size(B) || throw(DimensionMismatch("dimensions must match: A has dims $(size(A)), B has dims $(size(B))")) descrA = CuMatrixDescriptor('G', 'L', 'N', index) diff --git a/lib/cusparse/src/generic.jl b/lib/cusparse/src/generic.jl index e0d10e4a70..3470b20186 100644 --- a/lib/cusparse/src/generic.jl +++ b/lib/cusparse/src/generic.jl @@ -725,7 +725,10 @@ function sv!(transa::SparseChar, uplo::SparseChar, diag::SparseChar, # Support transa = 'C' for real matrices transa = T <: Real && transa == 'C' ? 'T' : transa - if isa(A, CuSparseMatrixCSC) && transa == 'C' && T <: Complex + # native CSC support was added in cuSPARSE 12.8.1 (CUDA 13.3); older versions + # model a CSC matrix as its transposed CSR, which cannot represent transa = 'C'. + csc_workaround = isa(A, CuSparseMatrixCSC) && version() < v"12.8.1" + if csc_workaround && transa == 'C' && T <: Complex throw(ArgumentError("Backward and forward sweeps with the adjoint of a complex CSC matrix is not supported. Use a CSR or COO matrix instead.")) end @@ -737,8 +740,7 @@ function sv!(transa::SparseChar, uplo::SparseChar, diag::SparseChar, (mX != mA) && throw(DimensionMismatch("X must have length $mA, but has length $mX")) (mY != mA) && throw(DimensionMismatch("Y must have length $mA, but has length $mY")) - if isa(A, CuSparseMatrixCSC) - # cusparseSpSV doesn't support CSC matrices so we use Aᵀ to model them as CSR matrices. + if csc_workaround descA = CuSparseMatrixDescriptor(A, index, transposed=true) transa = transa == 'N' ? 'T' : 'N' uplo = uplo == 'U' ? 'L' : 'U' @@ -780,7 +782,10 @@ function sm!(transa::SparseChar, transb::SparseChar, uplo::SparseChar, diag::Spa # In that case we need to update the descriptor of C such that it represents Bᵀ. is_C_transposed = (B === C) && (transb != 'N') - if isa(A, CuSparseMatrixCSC) && transa == 'C' && T <: Complex + # native CSC support was added in cuSPARSE 12.8.1 (CUDA 13.3); older versions + # model a CSC matrix as its transposed CSR, which cannot represent transa = 'C'. + csc_workaround = isa(A, CuSparseMatrixCSC) && version() < v"12.8.1" + if csc_workaround && transa == 'C' && T <: Complex throw(ArgumentError("Backward and forward sweeps with the adjoint of a complex CSC matrix is not supported. Use a CSR or COO matrix instead.")) end @@ -795,8 +800,7 @@ function sm!(transa::SparseChar, transb::SparseChar, uplo::SparseChar, diag::Spa (nB != nC) && (transb == 'N') && throw(DimensionMismatch("B and C must have the same number of columns, but B has $nB columns and C has $nC columns")) (mB != nC) && (transb != 'N') && throw(DimensionMismatch("B must have the same the number of rows that C has as columns, but B has $mB rows and C has $nC columns")) - if isa(A, CuSparseMatrixCSC) - # cusparseSpSM doesn't support CSC matrices so we use Aᵀ to model them as CSR matrices. + if csc_workaround descA = CuSparseMatrixDescriptor(A, index, transposed=true) transa = transa == 'N' ? 'T' : 'N' uplo = uplo == 'U' ? 'L' : 'U' diff --git a/lib/cusparse/src/helpers.jl b/lib/cusparse/src/helpers.jl index 55d8326247..fd0ddb2546 100644 --- a/lib/cusparse/src/helpers.jl +++ b/lib/cusparse/src/helpers.jl @@ -272,6 +272,20 @@ end Base.unsafe_convert(::Type{cusparseSpGEMMDescr_t}, desc::CuSpGEMMDescriptor) = desc.handle +mutable struct CuSpGEAMDescriptor + handle::cusparseSpGEAMDescr_t + + function CuSpGEAMDescriptor() + descr_ref = Ref{cusparseSpGEAMDescr_t}() + cusparseSpGEAM_createDescr(descr_ref) + obj = new(descr_ref[]) + finalizer(cusparseSpGEAM_destroyDescr, obj) + obj + end +end + +Base.unsafe_convert(::Type{cusparseSpGEAMDescr_t}, desc::CuSpGEAMDescriptor) = desc.handle + mutable struct CuSparseSpSVDescriptor handle::cusparseSpSVDescr_t diff --git a/lib/cusparse/src/libcusparse.jl b/lib/cusparse/src/libcusparse.jl index fe205ce761..8183b49986 100644 --- a/lib/cusparse/src/libcusparse.jl +++ b/lib/cusparse/src/libcusparse.jl @@ -6222,6 +6222,74 @@ end spgemmDescr::cusparseSpGEMMDescr_t)::cusparseStatus_t end +@cenum cusparseSpGEAMAlg_t::UInt32 begin + CUSPARSE_SPGEAM_ALG_DEFAULT = 0 + CUSPARSE_SPGEAM_ALG1 = 1 +end + +mutable struct cusparseSpGEAMDescr end + +const cusparseSpGEAMDescr_t = Ptr{cusparseSpGEAMDescr} + +@checked function cusparseSpGEAM_createDescr(descr) + initialize_context() + @gcsafe_ccall libcusparse.cusparseSpGEAM_createDescr(descr::Ptr{cusparseSpGEAMDescr_t})::cusparseStatus_t +end + +@checked function cusparseSpGEAM_destroyDescr(descr) + initialize_context() + @gcsafe_ccall libcusparse.cusparseSpGEAM_destroyDescr(descr::cusparseSpGEAMDescr_t)::cusparseStatus_t +end + +@checked function cusparseSpGEAM_bufferSize(handle, opA, opB, alpha, matA, beta, matB, matC, + computeType, alg, spgeamDescr, bufferSize) + initialize_context() + @gcsafe_ccall libcusparse.cusparseSpGEAM_bufferSize(handle::cusparseHandle_t, + opA::cusparseOperation_t, + opB::cusparseOperation_t, + alpha::PtrOrCuPtr{Cvoid}, + matA::cusparseSpMatDescr_t, + beta::PtrOrCuPtr{Cvoid}, + matB::cusparseSpMatDescr_t, + matC::cusparseSpMatDescr_t, + computeType::cudaDataType, + alg::cusparseSpGEAMAlg_t, + spgeamDescr::cusparseSpGEAMDescr_t, + bufferSize::Ref{Csize_t})::cusparseStatus_t +end + +@checked function cusparseSpGEAM_nnz(handle, opA, opB, alpha, matA, beta, matB, matC, + computeType, alg, spgeamDescr, externalBuffer) + initialize_context() + @gcsafe_ccall libcusparse.cusparseSpGEAM_nnz(handle::cusparseHandle_t, + opA::cusparseOperation_t, + opB::cusparseOperation_t, + alpha::PtrOrCuPtr{Cvoid}, + matA::cusparseSpMatDescr_t, + beta::PtrOrCuPtr{Cvoid}, + matB::cusparseSpMatDescr_t, + matC::cusparseSpMatDescr_t, + computeType::cudaDataType, + alg::cusparseSpGEAMAlg_t, + spgeamDescr::cusparseSpGEAMDescr_t, + externalBuffer::CuPtr{Cvoid})::cusparseStatus_t +end + +@checked function cusparseSpGEAM(handle, opA, opB, alpha, matA, beta, matB, matC, + computeType, alg, spgeamDescr, externalBuffer) + initialize_context() + @gcsafe_ccall libcusparse.cusparseSpGEAM(handle::cusparseHandle_t, + opA::cusparseOperation_t, + opB::cusparseOperation_t, alpha::PtrOrCuPtr{Cvoid}, + matA::cusparseSpMatDescr_t, beta::PtrOrCuPtr{Cvoid}, + matB::cusparseSpMatDescr_t, + matC::cusparseSpMatDescr_t, + computeType::cudaDataType, + alg::cusparseSpGEAMAlg_t, + spgeamDescr::cusparseSpGEAMDescr_t, + externalBuffer::CuPtr{Cvoid})::cusparseStatus_t +end + @cenum cusparseSDDMMAlg_t::UInt32 begin CUSPARSE_SDDMM_ALG_DEFAULT = 0 end diff --git a/lib/cusparse/test/generic.jl b/lib/cusparse/test/generic.jl index e441b3bc85..146f9b7ce4 100644 --- a/lib/cusparse/test/generic.jl +++ b/lib/cusparse/test/generic.jl @@ -196,7 +196,8 @@ for SparseMatrixType in [CuSparseMatrixCSC, CuSparseMatrixCSR, CuSparseMatrixCOO @testset "$SparseMatrixType -- sv! algo=$algo" for algo in SPSV_ALGOS[SparseMatrixType] @testset "sv! $T" for T in [Float64, ComplexF64] @testset "transa = $transa" for (transa, opa) in [('N', identity), ('T', transpose), ('C', adjoint)] - SparseMatrixType == CuSparseMatrixCSC && T <: Complex && transa == 'C' && continue + # adjoint of a complex CSC matrix needs native CSC support (cuSPARSE 12.8.1+) + SparseMatrixType == CuSparseMatrixCSC && T <: Complex && transa == 'C' && cuSPARSE.version() < v"12.8.1" && continue @testset "uplo = $uplo" for uplo in ('L', 'U') @testset "diag = $diag" for diag in ('U', 'N') A = rand(T, 10, 10) @@ -220,7 +221,8 @@ for SparseMatrixType in [CuSparseMatrixCSC, CuSparseMatrixCSR, CuSparseMatrixCOO @testset "$SparseMatrixType -- sm! algo=$algo" for algo in SPSM_ALGOS[SparseMatrixType] @testset "sm! $T" for T in [Float64, ComplexF64] @testset "transa = $transa" for (transa, opa) in [('N', identity), ('T', transpose), ('C', adjoint)] - SparseMatrixType == CuSparseMatrixCSC && T <: Complex && transa == 'C' && continue + # adjoint of a complex CSC matrix needs native CSC support (cuSPARSE 12.8.1+) + SparseMatrixType == CuSparseMatrixCSC && T <: Complex && transa == 'C' && cuSPARSE.version() < v"12.8.1" && continue @testset "transb = $transb" for (transb, opb) in [('N', identity), ('T', transpose)] @testset "uplo = $uplo" for uplo in ('L', 'U') @testset "diag = $diag" for diag in ('U', 'N') diff --git a/lib/cutensor/src/libcutensor.jl b/lib/cutensor/src/libcutensor.jl index b33560b723..84ea9dca94 100644 --- a/lib/cutensor/src/libcutensor.jl +++ b/lib/cutensor/src/libcutensor.jl @@ -550,7 +550,7 @@ end beta::Ptr{Cvoid}, C::Ptr{CuPtr{Cvoid}}, D::Ptr{CuPtr{Cvoid}}, - workspace::CuPtr{Cvoid}, + workspace::Ptr{CuPtr{Cvoid}}, workspaceSize::UInt64, stream::cudaStream_t)::cutensorStatus_t end diff --git a/lib/nvml/src/libnvml.jl b/lib/nvml/src/libnvml.jl index 1ea71656dd..683c3a6cc6 100644 --- a/lib/nvml/src/libnvml.jl +++ b/lib/nvml/src/libnvml.jl @@ -47,6 +47,8 @@ end const nvmlMemoryErrorType_t = nvmlMemoryErrorType_enum +const nvmlCPERCursorHandle_t = Culonglong + mutable struct nvmlDevice_st end const nvmlDevice_t = Ptr{nvmlDevice_st} @@ -149,6 +151,16 @@ struct nvmlProcessDetail_v1_t usedGpuCcProtectedMemory::Culonglong end +@cenum nvmlProcessMode_enum::UInt32 begin + NVML_PROCESS_MODE_COMPUTE = 0 + NVML_PROCESS_MODE_GRAPHICS = 1 + NVML_PROCESS_MODE_MPS = 2 + NVML_PROCESS_MODE_ALL = 3 + NVML_PROCESS_MODE_MAX = 4 +end + +const nvmlProcessMode_t = nvmlProcessMode_enum + struct nvmlProcessDetailList_v1_t version::Cuint mode::Cuint @@ -650,6 +662,10 @@ end const nvmlPdi_t = nvmlPdi_v1_t +struct nvmlBBXTimeData_v1_t + timeRun::Cuint +end + @cenum nvmlEnableState_enum::UInt32 begin NVML_FEATURE_DISABLED = 0 NVML_FEATURE_ENABLED = 1 @@ -1019,6 +1035,15 @@ end const nvmlEccSramUniqueUncorrectedErrorCounts_t = nvmlEccSramUniqueUncorrectedErrorCounts_v1_t +struct nvmlRemappedRowsInfo_v2_t + corrActiveRemaps::Cuint + corrInactiveRemaps::Cuint + uncActiveRemaps::Cuint + uncInactiveRemaps::Cuint + bPending::Cuint + bFailureOccurred::Cuint +end + struct nvmlRusdSettings_v1_t version::Cuint pollMask::Culonglong @@ -1586,6 +1611,7 @@ const nvmlGridLicensableFeatures_t = nvmlGridLicensableFeatures_st NVML_GPU_RECOVERY_ACTION_NODE_REBOOT = 2 NVML_GPU_RECOVERY_ACTION_DRAIN_P2P = 3 NVML_GPU_RECOVERY_ACTION_DRAIN_AND_RESET = 4 + NVML_GPU_RECOVERY_ACTION_RECOVER_IMEX_DOMAIN = 5 end const nvmlDeviceGpuRecoveryAction_t = nvmlDeviceGpuRecoveryAction_s @@ -1939,6 +1965,19 @@ end const nvmlAccountingStats_t = nvmlAccountingStats_st +struct nvmlAccountingStats_v2_t + pid::Cuint + isRunning::Cuint + gpuUtilization::Cuint + memoryUtilization::Cuint + maxMemoryUsage::Culonglong + sampleCount::Cuint + sumGpuUtil::Culonglong + sumFbUtil::Culonglong + time::Culonglong + startTime::Culonglong +end + @cenum nvmlEncoderQueryType_enum::UInt32 begin NVML_ENCODER_QUERY_H264 = 0 NVML_ENCODER_QUERY_HEVC = 1 @@ -2126,6 +2165,22 @@ function nvmlErrorString(result) @gcsafe_ccall libnvml.nvmlErrorString(result::nvmlReturn_t)::Cstring end +@cenum nvmlCPERType_t::UInt32 begin + NVML_CPER_ACCESS_TYPE_GPU = 1 +end + +struct nvmlCPERCursor_v1_t + cperTypeMask::Cuint + uuid::NTuple{80,Cchar} + handle::nvmlCPERCursorHandle_t +end + +struct nvmlGetCPER_v1_t + cursor::nvmlCPERCursor_v1_t + buffer::Ptr{Cuchar} + bufferSize::Cuint +end + @checked function nvmlSystemGetDriverVersion(version, length) initialize_context() @gcsafe_ccall libnvml.nvmlSystemGetDriverVersion(version::Cstring, @@ -2179,6 +2234,11 @@ const nvmlSystemDriverBranchInfo_t = nvmlSystemDriverBranchInfo_v1_t length::Cuint)::nvmlReturn_t end +@checked function nvmlSystemGetCPER_v1(cper) + initialize_context() + @gcsafe_ccall libnvml.nvmlSystemGetCPER_v1(cper::Ptr{nvmlGetCPER_v1_t})::nvmlReturn_t +end + @checked function nvmlUnitGetCount(unitCount) initialize_context() @gcsafe_ccall libnvml.nvmlUnitGetCount(unitCount::Ptr{Cuint})::nvmlReturn_t @@ -2436,6 +2496,12 @@ end durationUs::Ptr{Culong})::nvmlReturn_t end +@checked function nvmlDeviceGetBBXTimeData_v1(device, timeData) + initialize_context() + @gcsafe_ccall libnvml.nvmlDeviceGetBBXTimeData_v1(device::nvmlDevice_t, + timeData::Ptr{nvmlBBXTimeData_v1_t})::nvmlReturn_t +end + @checked function nvmlDeviceGetDisplayMode(device, display) initialize_context() @gcsafe_ccall libnvml.nvmlDeviceGetDisplayMode(device::nvmlDevice_t, @@ -3256,6 +3322,12 @@ end bufferSize::Ptr{Cuint})::nvmlReturn_t end +@checked function nvmlDeviceGetAccountingStats_v2(device, stats) + initialize_context() + @gcsafe_ccall libnvml.nvmlDeviceGetAccountingStats_v2(device::nvmlDevice_t, + stats::Ptr{nvmlAccountingStats_v2_t})::nvmlReturn_t +end + @checked function nvmlDeviceGetRetiredPages(device, cause, pageCount, addresses) initialize_context() @gcsafe_ccall libnvml.nvmlDeviceGetRetiredPages(device::nvmlDevice_t, @@ -3517,6 +3589,7 @@ end NVML_NVLINK_VERSION_3_1 = 5 NVML_NVLINK_VERSION_4_0 = 6 NVML_NVLINK_VERSION_5_0 = 7 + NVML_NVLINK_VERSION_6_0 = 8 end const nvmlNvlinkVersion_t = nvmlNvlinkVersion_enum @@ -5054,6 +5127,42 @@ end NVML_GPM_METRIC_GR7_CTXSW_REQUESTS = 207 NVML_GPM_METRIC_GR7_CTXSW_CYCLES_PER_REQ = 208 NVML_GPM_METRIC_GR7_CTXSW_ACTIVE_PCT = 209 + NVML_GPM_METRIC_NVLINK_L18_RX_PER_SEC = 212 + NVML_GPM_METRIC_NVLINK_L18_TX_PER_SEC = 213 + NVML_GPM_METRIC_NVLINK_L19_RX_PER_SEC = 214 + NVML_GPM_METRIC_NVLINK_L19_TX_PER_SEC = 215 + NVML_GPM_METRIC_NVLINK_L20_RX_PER_SEC = 216 + NVML_GPM_METRIC_NVLINK_L20_TX_PER_SEC = 217 + NVML_GPM_METRIC_NVLINK_L21_RX_PER_SEC = 218 + NVML_GPM_METRIC_NVLINK_L21_TX_PER_SEC = 219 + NVML_GPM_METRIC_NVLINK_L22_RX_PER_SEC = 220 + NVML_GPM_METRIC_NVLINK_L22_TX_PER_SEC = 221 + NVML_GPM_METRIC_NVLINK_L23_RX_PER_SEC = 222 + NVML_GPM_METRIC_NVLINK_L23_TX_PER_SEC = 223 + NVML_GPM_METRIC_NVLINK_L24_RX_PER_SEC = 224 + NVML_GPM_METRIC_NVLINK_L24_TX_PER_SEC = 225 + NVML_GPM_METRIC_NVLINK_L25_RX_PER_SEC = 226 + NVML_GPM_METRIC_NVLINK_L25_TX_PER_SEC = 227 + NVML_GPM_METRIC_NVLINK_L26_RX_PER_SEC = 228 + NVML_GPM_METRIC_NVLINK_L26_TX_PER_SEC = 229 + NVML_GPM_METRIC_NVLINK_L27_RX_PER_SEC = 230 + NVML_GPM_METRIC_NVLINK_L27_TX_PER_SEC = 231 + NVML_GPM_METRIC_NVLINK_L28_RX_PER_SEC = 232 + NVML_GPM_METRIC_NVLINK_L28_TX_PER_SEC = 233 + NVML_GPM_METRIC_NVLINK_L29_RX_PER_SEC = 234 + NVML_GPM_METRIC_NVLINK_L29_TX_PER_SEC = 235 + NVML_GPM_METRIC_NVLINK_L30_RX_PER_SEC = 236 + NVML_GPM_METRIC_NVLINK_L30_TX_PER_SEC = 237 + NVML_GPM_METRIC_NVLINK_L31_RX_PER_SEC = 238 + NVML_GPM_METRIC_NVLINK_L31_TX_PER_SEC = 239 + NVML_GPM_METRIC_NVLINK_L32_RX_PER_SEC = 240 + NVML_GPM_METRIC_NVLINK_L32_TX_PER_SEC = 241 + NVML_GPM_METRIC_NVLINK_L33_RX_PER_SEC = 242 + NVML_GPM_METRIC_NVLINK_L33_TX_PER_SEC = 243 + NVML_GPM_METRIC_NVLINK_L34_RX_PER_SEC = 244 + NVML_GPM_METRIC_NVLINK_L34_TX_PER_SEC = 245 + NVML_GPM_METRIC_NVLINK_L35_RX_PER_SEC = 246 + NVML_GPM_METRIC_NVLINK_L35_TX_PER_SEC = 247 NVML_GPM_METRIC_SM_CYCLES_ELAPSED = 248 NVML_GPM_METRIC_SM_CYCLES_ACTIVE = 249 NVML_GPM_METRIC_MMA_CYCLES_ACTIVE = 250 @@ -5103,6 +5212,42 @@ end NVML_GPM_METRIC_NVLINK_L16_TX = 294 NVML_GPM_METRIC_NVLINK_L17_RX = 295 NVML_GPM_METRIC_NVLINK_L17_TX = 296 + NVML_GPM_METRIC_NVLINK_L18_RX = 297 + NVML_GPM_METRIC_NVLINK_L18_TX = 298 + NVML_GPM_METRIC_NVLINK_L19_RX = 299 + NVML_GPM_METRIC_NVLINK_L19_TX = 300 + NVML_GPM_METRIC_NVLINK_L20_RX = 301 + NVML_GPM_METRIC_NVLINK_L20_TX = 302 + NVML_GPM_METRIC_NVLINK_L21_RX = 303 + NVML_GPM_METRIC_NVLINK_L21_TX = 304 + NVML_GPM_METRIC_NVLINK_L22_RX = 305 + NVML_GPM_METRIC_NVLINK_L22_TX = 306 + NVML_GPM_METRIC_NVLINK_L23_RX = 307 + NVML_GPM_METRIC_NVLINK_L23_TX = 308 + NVML_GPM_METRIC_NVLINK_L24_RX = 309 + NVML_GPM_METRIC_NVLINK_L24_TX = 310 + NVML_GPM_METRIC_NVLINK_L25_RX = 311 + NVML_GPM_METRIC_NVLINK_L25_TX = 312 + NVML_GPM_METRIC_NVLINK_L26_RX = 313 + NVML_GPM_METRIC_NVLINK_L26_TX = 314 + NVML_GPM_METRIC_NVLINK_L27_RX = 315 + NVML_GPM_METRIC_NVLINK_L27_TX = 316 + NVML_GPM_METRIC_NVLINK_L28_RX = 317 + NVML_GPM_METRIC_NVLINK_L28_TX = 318 + NVML_GPM_METRIC_NVLINK_L29_RX = 319 + NVML_GPM_METRIC_NVLINK_L29_TX = 320 + NVML_GPM_METRIC_NVLINK_L30_RX = 321 + NVML_GPM_METRIC_NVLINK_L30_TX = 322 + NVML_GPM_METRIC_NVLINK_L31_RX = 323 + NVML_GPM_METRIC_NVLINK_L31_TX = 324 + NVML_GPM_METRIC_NVLINK_L32_RX = 325 + NVML_GPM_METRIC_NVLINK_L32_TX = 326 + NVML_GPM_METRIC_NVLINK_L33_RX = 327 + NVML_GPM_METRIC_NVLINK_L33_TX = 328 + NVML_GPM_METRIC_NVLINK_L34_RX = 329 + NVML_GPM_METRIC_NVLINK_L34_TX = 330 + NVML_GPM_METRIC_NVLINK_L35_RX = 331 + NVML_GPM_METRIC_NVLINK_L35_TX = 332 NVML_GPM_METRIC_MAX = 333 end @@ -5375,6 +5520,12 @@ end errorCounts::Ptr{nvmlEccSramUniqueUncorrectedErrorCounts_t})::nvmlReturn_t end +@checked function nvmlDeviceGetRemappedRows_v2(device, info) + initialize_context() + @gcsafe_ccall libnvml.nvmlDeviceGetRemappedRows_v2(device::nvmlDevice_t, + info::Ptr{nvmlRemappedRowsInfo_v2_t})::nvmlReturn_t +end + @checked function nvmlDeviceSetRusdSettings_v1(device, settings) initialize_context() @gcsafe_ccall libnvml.nvmlDeviceSetRusdSettings_v1(device::nvmlDevice_t, @@ -5632,7 +5783,7 @@ const nvmlDeviceAddressingMode_v1 = @NVML_STRUCT_VERSION(DeviceAddressingMode, 1 const nvmlRepairStatus_v1 = @NVML_STRUCT_VERSION(RepairStatus, 1) -const NVML_NVLINK_MAX_LINKS = 18 +const NVML_NVLINK_MAX_LINKS = 36 const NVML_TOPOLOGY_CPU = NVML_TOPOLOGY_NODE @@ -5739,6 +5890,8 @@ const NVML_DEVICE_ARCH_HOPPER = 9 const NVML_DEVICE_ARCH_BLACKWELL = 10 +const NVML_DEVICE_ARCH_RUBIN = 13 + const NVML_DEVICE_ARCH_UNKNOWN = 0xffffffff const NVML_BUS_TYPE_UNKNOWN = 0 @@ -5803,6 +5956,10 @@ const NVML_POWER_SCOPE_MODULE = Cuint(1) const NVML_POWER_SCOPE_MEMORY = Cuint(2) +const NVML_POWER_SCOPE_GPU_BASE = Cuint(3) + +const NVML_POWER_SCOPE_COUNT = Cuint(4) + const nvmlPowerValue_v2 = @NVML_STRUCT_VERSION(PowerValue, 2) const NVML_GRID_LICENSE_EXPIRY_NOT_AVAILABLE = 0 @@ -6506,7 +6663,27 @@ const NVML_FI_DEV_NVLINK_PLR_XMIT_BLOCKS = 294 const NVML_FI_DEV_NVLINK_PLR_XMIT_RETRY_BLOCKS = 295 -const NVML_FI_MAX = 296 +const NVML_FI_DEV_NVLINK_GET_DATA_RATE = 296 + +const NVML_FI_DEV_MMA_STALL_PERCENT = 297 + +const NVML_FI_DEV_MCLK_SWITCH_TYPE = 298 + +const NVML_FI_DEV_MCLK_MIN_SWITCH_INTERVAL_MILLISECONDS = 299 + +const NVML_FI_PWR_SMOOTHING_SOC_POWER_SMOOTHING_ENABLED = 300 + +const NVML_FI_DEV_REMAPPED_ROWS_COR_INACTIVE = 301 + +const NVML_FI_DEV_REMAPPED_ROWS_UNC_INACTIVE = 302 + +const NVML_FI_MAX = 303 + +const NVML_MCLK_SWITCH_TYPE_NOT_SUPPORTED = 0x00 + +const NVML_MCLK_SWITCH_TYPE_DEFERRED = 0x01 + +const NVML_MCLK_SWITCH_TYPE_RUNTIME = 0x02 const NVML_NVLINK_LOW_POWER_THRESHOLD_UNIT_100US = 0x00 @@ -6802,6 +6979,8 @@ const NVML_DEVICE_SERIAL_BUFFER_SIZE = 30 const NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE = 32 +const NVML_CPER_CURSOR_HANDLE_INIT = nvmlCPERCursorHandle_t(0) + const nvmlSystemDriverBranchInfo_v1 = @NVML_STRUCT_VERSION(SystemDriverBranchInfo, 1) const NVML_AFFINITY_SCOPE_NODE = 0 @@ -6888,7 +7067,9 @@ const NVML_GPU_INSTANCE_PROFILE_1_SLICE_ALL_ME = 0x0f const NVML_GPU_INSTANCE_PROFILE_2_SLICE_ALL_ME = 0x10 -const NVML_GPU_INSTANCE_PROFILE_COUNT = 0x11 +const NVML_GPU_INSTANCE_PROFILE_3_SLICE_GFX = 0x11 + +const NVML_GPU_INSTANCE_PROFILE_COUNT = 0x12 const NVML_GPU_INSTANCE_PROFILE_CAPS_P2P = 0x01 @@ -6918,7 +7099,9 @@ const NVML_COMPUTE_INSTANCE_PROFILE_6_SLICE = 0x06 const NVML_COMPUTE_INSTANCE_PROFILE_1_SLICE_REV1 = 0x07 -const NVML_COMPUTE_INSTANCE_PROFILE_COUNT = 0x08 +const NVML_COMPUTE_INSTANCE_PROFILE_7_SLICE_NVL = 0x08 + +const NVML_COMPUTE_INSTANCE_PROFILE_COUNT = 0x09 const NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED = 0x00 diff --git a/res/wrap/Manifest.toml b/res/wrap/Manifest.toml index 835abe4c02..1bbb50a446 100644 --- a/res/wrap/Manifest.toml +++ b/res/wrap/Manifest.toml @@ -29,21 +29,21 @@ version = "3.4.3" [[deps.CUDA_Driver_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "TOML"] -git-tree-sha1 = "061f39cc84e99928830aa1005d79f7e99097ba28" +git-tree-sha1 = "2d6222474d868469a72de5bd47c5c25c0e1fe518" uuid = "4ee394cb-3365-5eb0-8335-949819d2adfc" -version = "13.2.0+0" +version = "13.3.0+0" [[deps.CUDA_Runtime_jll]] deps = ["Artifacts", "CUDA_Driver_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] -git-tree-sha1 = "af17d37b5b8b4d7525f8902eba1ef6141a9a7d3b" +git-tree-sha1 = "a648a4cb5c2e5ae8bf2f6ff1f6985f4c6ede2e8d" uuid = "76a88914-d11a-5bdc-97e0-2f5a05c973a2" -version = "0.21.0+0" +version = "0.23.0+0" [[deps.CUDA_SDK_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "c1fd6fe1a1a198afa28b425737f6afffe667096e" +git-tree-sha1 = "5ad75f2abd7e66fb090da859e03f0e36c2720aaf" uuid = "6cbf2f2e-7e60-5632-ac76-dca2274e0be0" -version = "13.2.0+0" +version = "13.3.0+0" [[deps.CUDNN_jll]] deps = ["Artifacts", "CUDA_Runtime_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] @@ -65,15 +65,15 @@ version = "0.19.3" [[deps.Clang_unified_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "TOML", "Zlib_jll", "Zstd_jll", "libLLVM_jll"] -git-tree-sha1 = "fffacfad25e1f33d5b9864ff723d9180bc930148" +git-tree-sha1 = "05e53efbcd4efa450a3bf343b62793436b2cc5d1" uuid = "ffc816e1-ba66-5fa9-9ecc-bcc5cb19bea1" -version = "0.1.4+0" +version = "0.1.5+1" [[deps.CommonMark]] deps = ["PrecompileTools"] -git-tree-sha1 = "65ea18ada9814f09c5013924c42fe8b53d6ee467" +git-tree-sha1 = "019ad9e55bb3549403f2d5a9b314fbb29a806ecb" uuid = "a80b9123-70ca-4bc0-993e-6e3bcb318db6" -version = "0.10.3" +version = "1.0.1" [deps.CommonMark.extensions] CommonMarkMarkdownASTExt = "MarkdownAST" @@ -103,21 +103,21 @@ uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee" version = "1.11.0" [[deps.Glob]] -git-tree-sha1 = "83cb0092e2792b9e3a865b6655e88f5b862607e2" +git-tree-sha1 = "246c628cec062230b7d183aab88841fa94fcabe9" uuid = "c27321d9-0574-5035-807b-f59d2c89b15c" -version = "1.4.0" +version = "1.5.0" [[deps.JLLWrappers]] deps = ["Artifacts", "Preferences"] -git-tree-sha1 = "0533e564aae234aff59ab625543145446d8b6ec2" +git-tree-sha1 = "7204148362dafe5fe6a273f855b8ccbe4df8173e" uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" -version = "1.7.1" +version = "1.8.0" [[deps.JuliaFormatter]] deps = ["CommonMark", "Glob", "JuliaSyntax", "PrecompileTools", "TOML"] -git-tree-sha1 = "ec5e99ad66c51b6fcd3dbab6655b17118a29a588" +git-tree-sha1 = "734a648c54e0512112049150965e85c887ae7fa3" uuid = "98e50ef6-434e-11e9-1051-2b60c6c9e899" -version = "2.3.0" +version = "2.4.0" [[deps.JuliaSyntax]] git-tree-sha1 = "937da4713526b96ac9a178e2035019d3b78ead4a" @@ -204,9 +204,9 @@ version = "1.12.1" [[deps.PrecompileTools]] deps = ["Preferences"] -git-tree-sha1 = "07a921781cab75691315adc645096ed5e370cb77" +git-tree-sha1 = "edbeefc7a4889f528644251bdb5fc9ab5348bc2c" uuid = "aea7be01-6a6a-4083-8856-8a6e6704d82a" -version = "1.3.3" +version = "1.3.4" [[deps.Preferences]] deps = ["TOML"] diff --git a/res/wrap/cusparse.toml b/res/wrap/cusparse.toml index 4292493f9d..b324cf5fba 100644 --- a/res/wrap/cusparse.toml +++ b/res/wrap/cusparse.toml @@ -920,6 +920,21 @@ needs_context = false 4 = "PtrOrCuPtr{Cvoid}" 7 = "PtrOrCuPtr{Cvoid}" +[api.cusparseSpGEAM_bufferSize.argtypes] +4 = "PtrOrCuPtr{Cvoid}" +6 = "PtrOrCuPtr{Cvoid}" +12 = "Ref{Csize_t}" + +[api.cusparseSpGEAM_nnz.argtypes] +4 = "PtrOrCuPtr{Cvoid}" +6 = "PtrOrCuPtr{Cvoid}" +12 = "CuPtr{Cvoid}" + +[api.cusparseSpGEAM.argtypes] +4 = "PtrOrCuPtr{Cvoid}" +6 = "PtrOrCuPtr{Cvoid}" +12 = "CuPtr{Cvoid}" + [api.cusparseConstrainedGeMM.argtypes] 4 = "PtrOrCuPtr{Cvoid}" 7 = "PtrOrCuPtr{Cvoid}" diff --git a/res/wrap/custatevec.toml b/res/wrap/custatevec.toml index 1aef3fe7e6..3b63c0e646 100644 --- a/res/wrap/custatevec.toml +++ b/res/wrap/custatevec.toml @@ -67,9 +67,6 @@ needs_context = false [api.custatevecBatchMeasure.argtypes] 2 = "CuPtr{Cvoid}" -[api.custatevecMeasureBatched.argtypes] -2 = "CuPtr{Cvoid}" - [api.custatevecBatchMeasureWithOffset.argtypes] 2 = "CuPtr{Cvoid}" diff --git a/test/core/cudadrv.jl b/test/core/cudadrv.jl index acb1511dc6..4df1f3fef1 100644 --- a/test/core/cudadrv.jl +++ b/test/core/cudadrv.jl @@ -775,13 +775,6 @@ end add_data!(link, "vadd_parent", read(f, String)) end @test_throws ArgumentError add_data!(link, "vadd_parent", "\0") - - # object code - # TODO: test with valid object code - # NOTE: apparently, on Windows cuLinkAddData! _does_ accept object data containing \0 - if !Sys.iswindows() - @test_throws Exception add_data!(link, "vadd_parent", UInt8[0]) - end end @testset "error log" begin