[Do not merge] Test KernelIntrinsics#2944
Open
christiangnrd wants to merge 4 commits into
Open
Conversation
Contributor
|
Your PR requires formatting changes to meet the project's style guidelines. Click here to view the suggested changes.diff --git a/src/CUDAKernels.jl b/src/CUDAKernels.jl
index 5e39ab68e..7f8ec7ad4 100644
--- a/src/CUDAKernels.jl
+++ b/src/CUDAKernels.jl
@@ -162,29 +162,29 @@ end
KI.argconvert(::CUDABackend, arg) = cudaconvert(arg)
-function KI.kernel_function(::CUDABackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
+function KI.kernel_function(::CUDABackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT}
kern = cufunction(f, tt; name, kwargs...)
- KI.Kernel{CUDABackend, typeof(kern)}(CUDABackend(), kern)
+ return KI.Kernel{CUDABackend, typeof(kern)}(CUDABackend(), kern)
end
function (obj::KI.Kernel{CUDABackend})(args...; numworkgroups = 1, workgroupsize = 1)
KI.check_launch_args(numworkgroups, workgroupsize)
- obj.kern(args...; threads=workgroupsize, blocks=numworkgroups)
+ obj.kern(args...; threads = workgroupsize, blocks = numworkgroups)
return nothing
end
-function KI.kernel_max_work_group_size(kernel::KI.Kernel{<:CUDABackend}; max_work_items::Int=typemax(Int))::Int
+function KI.kernel_max_work_group_size(kernel::KI.Kernel{<:CUDABackend}; max_work_items::Int = typemax(Int))::Int
kernel_config = launch_configuration(kernel.kern.fun)
- Int(min(kernel_config.threads, max_work_items))
+ return Int(min(kernel_config.threads, max_work_items))
end
function KI.max_work_group_size(::CUDABackend)::Int
- Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK))
+ return Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK))
end
function KI.multiprocessor_count(::CUDABackend)::Int
- Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
+ return Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
end
## indexing
@@ -199,7 +199,7 @@ end
end
@device_override @inline function KI.get_global_id()
- return (; x = Int((blockIdx().x-1)*blockDim().x + threadIdx().x), y = Int((blockIdx().y-1)*blockDim().y + threadIdx().y), z = Int((blockIdx().z-1)*blockDim().z + threadIdx().z))
+ return (; x = Int((blockIdx().x - 1) * blockDim().x + threadIdx().x), y = Int((blockIdx().y - 1) * blockDim().y + threadIdx().y), z = Int((blockIdx().z - 1) * blockDim().z + threadIdx().z))
end
@device_override @inline function KI.get_local_size()
diff --git a/src/accumulate.jl b/src/accumulate.jl
index 54fab2119..e0631d387 100644
--- a/src/accumulate.jl
+++ b/src/accumulate.jl
@@ -22,9 +22,9 @@ function partial_scan(op::Function, output::AbstractArray{T}, input::AbstractArr
temp = CuDynamicSharedArray(T, (2*threads,))
# iterate the main dimension using threads and the first block dimension
- i = (KI.get_group_id().x-1i32) * KI.get_local_size().x + KI.get_local_id().x
+ i = (KI.get_group_id().x - 1i32) * KI.get_local_size().x + KI.get_local_id().x
# iterate the other dimensions using the remaining block dimensions
- j = (KI.get_group_id().z-1i32) * KI.get_num_groups().y + KI.get_group_id().y
+ j = (KI.get_group_id().z - 1i32) * KI.get_num_groups().y + KI.get_group_id().y
if j > length(Rother)
return
@@ -105,9 +105,9 @@ function aggregate_partial_scan(op::Function, output::AbstractArray,
block = KI.get_group_id().x
# iterate the main dimension using threads and the first block dimension
- i = (KI.get_group_id().x-1i32) * KI.get_local_size().x + KI.get_local_id().x
+ i = (KI.get_group_id().x - 1i32) * KI.get_local_size().x + KI.get_local_id().x
# iterate the other dimensions using the remaining block dimensions
- j = (KI.get_group_id().z-1i32) * KI.get_num_groups().y + KI.get_group_id().y
+ j = (KI.get_group_id().z - 1i32) * KI.get_num_groups().y + KI.get_group_id().y
@inbounds if i <= length(Rdim) && j <= length(Rother)
I = Rother[j]
diff --git a/src/device/random.jl b/src/device/random.jl
index 7d72d90a1..063c736ed 100644
--- a/src/device/random.jl
+++ b/src/device/random.jl
@@ -73,8 +73,8 @@ end
@inbounds global_random_counters()[warpId]
elseif field === :ctr2
globalId = KI.get_global_id().x +
- (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
- (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
+ (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
+ (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
globalId%UInt32
end::UInt32
end
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 97a4176b4..6fccff91e 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -294,8 +294,9 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
end
partial_kernel(f, op, init, Rreduce, Rother, Val(shuffle), partial, A;
- threads=partial_threads, blocks=partial_blocks, shmem=partial_shmem)
- # workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem)
+ threads = partial_threads, blocks = partial_blocks, shmem = partial_shmem
+ )
+ # workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem)
GPUArrays.mapreducedim!(identity, op, R, partial; init)
end
diff --git a/test/base/kernelabstractions.jl b/test/base/kernelabstractions.jl
index 2f2c4300b..1e674d3be 100644
--- a/test/base/kernelabstractions.jl
+++ b/test/base/kernelabstractions.jl
@@ -4,9 +4,14 @@ using SparseArrays
include(joinpath(dirname(pathof(KernelAbstractions)), "..", "test", "testsuite.jl"))
-Testsuite.testsuite(()->CUDABackend(false, false), "CUDA", CUDA, CuArray, CuDeviceArray; skip_tests=Set([
- "CPU synchronization",
- "fallback test: callable types",]))
+Testsuite.testsuite(
+ () -> CUDABackend(false, false), "CUDA", CUDA, CuArray, CuDeviceArray; skip_tests = Set(
+ [
+ "CPU synchronization",
+ "fallback test: callable types",
+ ]
+ )
+)
for (PreferBlocks, AlwaysInline) in Iterators.product((true, false), (true, false))
Testsuite.unittest_testsuite(()->CUDABackend(PreferBlocks, AlwaysInline), "CUDA", CUDA, CuDeviceArray)
end
diff --git a/test/runtests.jl b/test/runtests.jl
index 802c832e5..11584ca92 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,6 +1,6 @@
@static if VERSION < v"1.11" && get(ENV, "BUILDKITE_PIPELINE_NAME", "CUDA.jl") == "CUDA.jl"
using Pkg
- Pkg.add(url="https://github.com/JuliaGPU/KernelAbstractions.jl", rev="main")
+ Pkg.add(url = "https://github.com/JuliaGPU/KernelAbstractions.jl", rev = "main")
end
using Distributed |
edaeb41 to
8e3e1d4
Compare
christiangnrd
commented
Oct 22, 2025
christiangnrd
commented
Oct 22, 2025
christiangnrd
commented
Oct 22, 2025
497ef42 to
506e02d
Compare
Contributor
There was a problem hiding this comment.
CUDA.jl Benchmarks
Details
| Benchmark suite | Current: cc9e99d | Previous: 54c7586 | Ratio |
|---|---|---|---|
array/accumulate/Float32/1d |
102863 ns |
100040 ns |
1.03 |
array/accumulate/Float32/dims=1 |
76858 ns |
75598 ns |
1.02 |
array/accumulate/Float32/dims=1L |
1612907 ns |
1585923 ns |
1.02 |
array/accumulate/Float32/dims=2 |
142610 ns |
140396 ns |
1.02 |
array/accumulate/Float32/dims=2L |
663488 ns |
652952 ns |
1.02 |
array/accumulate/Int64/1d |
120081 ns |
116760 ns |
1.03 |
array/accumulate/Int64/dims=1 |
81357 ns |
78966 ns |
1.03 |
array/accumulate/Int64/dims=1L |
1738738 ns |
1697893 ns |
1.02 |
array/accumulate/Int64/dims=2 |
152875 ns |
150510 ns |
1.02 |
array/accumulate/Int64/dims=2L |
961211 ns |
959254 ns |
1.00 |
array/broadcast |
18714 ns |
18315 ns |
1.02 |
array/construct |
1233.7 ns |
1307.5 ns |
0.94 |
array/copy |
16727 ns |
16659 ns |
1.00 |
array/copyto!/cpu_to_gpu |
214216 ns |
211984 ns |
1.01 |
array/copyto!/gpu_to_cpu |
279232 ns |
280422 ns |
1.00 |
array/copyto!/gpu_to_gpu |
10501 ns |
10380 ns |
1.01 |
array/iteration/findall/bool |
135277 ns |
131500 ns |
1.03 |
array/iteration/findall/int |
149390 ns |
145666 ns |
1.03 |
array/iteration/findfirst/bool |
70218 ns |
68920 ns |
1.02 |
array/iteration/findfirst/int |
72711 ns |
71508 ns |
1.02 |
array/iteration/findmin/1d |
67591 ns |
66419 ns |
1.02 |
array/iteration/findmin/2d |
101064 ns |
101084 ns |
1.00 |
array/iteration/logical |
197508 ns |
190475 ns |
1.04 |
array/iteration/scalar |
65391 ns |
65525 ns |
1.00 |
array/permutedims/2d |
49687 ns |
49765 ns |
1.00 |
array/permutedims/3d |
52205 ns |
50702 ns |
1.03 |
array/permutedims/4d |
53378 ns |
51020 ns |
1.05 |
array/random/rand/Float32 |
11842 ns |
11632 ns |
1.02 |
array/random/rand/Int64 |
22838 ns |
22338 ns |
1.02 |
array/random/rand!/Float32 |
9865.333333333334 ns |
7935.333333333333 ns |
1.24 |
array/random/rand!/Int64 |
21292 ns |
17826 ns |
1.19 |
array/random/randn/Float32 |
36568 ns |
36306 ns |
1.01 |
array/random/randn!/Float32 |
24173 ns |
23975 ns |
1.01 |
array/reductions/mapreduce/Float32/1d |
34927 ns |
33733 ns |
1.04 |
array/reductions/mapreduce/Float32/dims=1 |
41939 ns |
38085 ns |
1.10 |
array/reductions/mapreduce/Float32/dims=1L |
51268 ns |
50507 ns |
1.02 |
array/reductions/mapreduce/Float32/dims=2 |
61266 ns |
55668 ns |
1.10 |
array/reductions/mapreduce/Float32/dims=2L |
69788 ns |
67497 ns |
1.03 |
array/reductions/mapreduce/Int64/1d |
43255 ns |
40096 ns |
1.08 |
array/reductions/mapreduce/Int64/dims=1 |
45532 ns |
40849 ns |
1.11 |
array/reductions/mapreduce/Int64/dims=1L |
87025 ns |
86591 ns |
1.01 |
array/reductions/mapreduce/Int64/dims=2 |
65507 ns |
58177 ns |
1.13 |
array/reductions/mapreduce/Int64/dims=2L |
86051 ns |
83232 ns |
1.03 |
array/reductions/reduce/Float32/1d |
35080 ns |
33766 ns |
1.04 |
array/reductions/reduce/Float32/dims=1 |
41985 ns |
38178 ns |
1.10 |
array/reductions/reduce/Float32/dims=1L |
51132 ns |
50557 ns |
1.01 |
array/reductions/reduce/Float32/dims=2 |
61001 ns |
55534 ns |
1.10 |
array/reductions/reduce/Float32/dims=2L |
70190 ns |
68199 ns |
1.03 |
array/reductions/reduce/Int64/1d |
42765 ns |
39982 ns |
1.07 |
array/reductions/reduce/Int64/dims=1 |
45585 ns |
41049 ns |
1.11 |
array/reductions/reduce/Int64/dims=1L |
86875 ns |
86662 ns |
1.00 |
array/reductions/reduce/Int64/dims=2 |
65593 ns |
58182 ns |
1.13 |
array/reductions/reduce/Int64/dims=2L |
85819 ns |
83392 ns |
1.03 |
array/reverse/1d |
17160 ns |
17096 ns |
1.00 |
array/reverse/1dL |
68165 ns |
67978 ns |
1.00 |
array/reverse/1dL_inplace |
65617 ns |
65357 ns |
1.00 |
array/reverse/1d_inplace |
8617.333333333334 ns |
8332.666666666666 ns |
1.03 |
array/reverse/2d |
20329 ns |
20040 ns |
1.01 |
array/reverse/2dL |
72257 ns |
71744 ns |
1.01 |
array/reverse/2dL_inplace |
65273 ns |
65037 ns |
1.00 |
array/reverse/2d_inplace |
10013 ns |
9712 ns |
1.03 |
array/sorting/1d |
2715945 ns |
2725346 ns |
1.00 |
array/sorting/2d |
1063388 ns |
1061549 ns |
1.00 |
array/sorting/by |
3269427 ns |
3268510 ns |
1.00 |
cuda/synchronization/context/auto |
1137.7 ns |
1148.8 ns |
0.99 |
cuda/synchronization/context/blocking |
947.2307692307693 ns |
947 ns |
1.00 |
cuda/synchronization/context/nonblocking |
6077.4 ns |
6099.8 ns |
1.00 |
cuda/synchronization/stream/auto |
996.2 ns |
1006 ns |
0.99 |
cuda/synchronization/stream/blocking |
802.9010989010989 ns |
859.8142857142857 ns |
0.93 |
cuda/synchronization/stream/nonblocking |
5941.4 ns |
5966.2 ns |
1.00 |
integration/byval/reference |
143449 ns |
143152 ns |
1.00 |
integration/byval/slices=1 |
145551 ns |
145205 ns |
1.00 |
integration/byval/slices=2 |
283846 ns |
283811 ns |
1.00 |
integration/byval/slices=3 |
422584 ns |
422282 ns |
1.00 |
integration/cudadevrt |
101944 ns |
101717 ns |
1.00 |
integration/volumerhs |
8890230 ns |
8896793 ns |
1.00 |
kernel/indexing |
12860 ns |
12625 ns |
1.02 |
kernel/indexing_checked |
13729 ns |
13467 ns |
1.02 |
kernel/launch |
2328.222222222222 ns |
2103.8888888888887 ns |
1.11 |
kernel/occupancy |
694.2013422818792 ns |
692.2105263157895 ns |
1.00 |
kernel/rand |
14406 ns |
14754 ns |
0.98 |
latency/import |
4017459962 ns |
3848826206 ns |
1.04 |
latency/precompile |
4758697989 ns |
4629101097 ns |
1.03 |
latency/ttfp |
4644072202 ns |
4482824517 ns |
1.04 |
This comment was automatically generated by workflow using github-action-benchmark.
aef3728 to
fef539a
Compare
e2d7489 to
180f4a5
Compare
180f4a5 to
d4c271b
Compare
80b75b2 to
14f29e5
Compare
14f29e5 to
b64dcd6
Compare
b64dcd6 to
3c75eb6
Compare
3c75eb6 to
487ad3b
Compare
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.This suggestion is invalid because no changes were made to the code.Suggestions cannot be applied while the pull request is closed.Suggestions cannot be applied while viewing a subset of changes.Only one suggestion per line can be applied in a batch.Add this suggestion to a batch that can be applied as a single commit.Applying suggestions on deleted lines is not supported.You must change the existing code in this line in order to create a valid suggestion.Outdated suggestions cannot be applied.This suggestion has been applied or marked resolved.Suggestions cannot be applied from pending reviews.Suggestions cannot be applied on multi-line comments.Suggestions cannot be applied while the pull request is queued to merge.Suggestion cannot be applied right now. Please check back later.
[only tests]
[only benchmarks]