Skip to content

[Do not merge] Test KernelIntrinsics#2944

Open
christiangnrd wants to merge 4 commits into
JuliaGPU:mainfrom
christiangnrd:intrinsics
Open

[Do not merge] Test KernelIntrinsics#2944
christiangnrd wants to merge 4 commits into
JuliaGPU:mainfrom
christiangnrd:intrinsics

Conversation

@christiangnrd
Copy link
Copy Markdown
Member

[only tests]
[only benchmarks]

@github-actions
Copy link
Copy Markdown
Contributor

github-actions Bot commented Oct 22, 2025

Your PR requires formatting changes to meet the project's style guidelines.
Please consider running Runic (git runic master) to apply these changes.

Click here to view the suggested changes.
diff --git a/src/CUDAKernels.jl b/src/CUDAKernels.jl
index 5e39ab68e..7f8ec7ad4 100644
--- a/src/CUDAKernels.jl
+++ b/src/CUDAKernels.jl
@@ -162,29 +162,29 @@ end
 
 KI.argconvert(::CUDABackend, arg) = cudaconvert(arg)
 
-function KI.kernel_function(::CUDABackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
+function KI.kernel_function(::CUDABackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT}
     kern = cufunction(f, tt; name, kwargs...)
-    KI.Kernel{CUDABackend, typeof(kern)}(CUDABackend(), kern)
+    return KI.Kernel{CUDABackend, typeof(kern)}(CUDABackend(), kern)
 end
 
 function (obj::KI.Kernel{CUDABackend})(args...; numworkgroups = 1, workgroupsize = 1)
     KI.check_launch_args(numworkgroups, workgroupsize)
 
-    obj.kern(args...; threads=workgroupsize, blocks=numworkgroups)
+    obj.kern(args...; threads = workgroupsize, blocks = numworkgroups)
     return nothing
 end
 
 
-function KI.kernel_max_work_group_size(kernel::KI.Kernel{<:CUDABackend}; max_work_items::Int=typemax(Int))::Int
+function KI.kernel_max_work_group_size(kernel::KI.Kernel{<:CUDABackend}; max_work_items::Int = typemax(Int))::Int
     kernel_config = launch_configuration(kernel.kern.fun)
 
-    Int(min(kernel_config.threads, max_work_items))
+    return Int(min(kernel_config.threads, max_work_items))
 end
 function KI.max_work_group_size(::CUDABackend)::Int
-    Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK))
+    return Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK))
 end
 function KI.multiprocessor_count(::CUDABackend)::Int
-    Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
+    return Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
 end
 
 ## indexing
@@ -199,7 +199,7 @@ end
 end
 
 @device_override @inline function KI.get_global_id()
-    return (; x = Int((blockIdx().x-1)*blockDim().x + threadIdx().x), y = Int((blockIdx().y-1)*blockDim().y + threadIdx().y), z = Int((blockIdx().z-1)*blockDim().z + threadIdx().z))
+    return (; x = Int((blockIdx().x - 1) * blockDim().x + threadIdx().x), y = Int((blockIdx().y - 1) * blockDim().y + threadIdx().y), z = Int((blockIdx().z - 1) * blockDim().z + threadIdx().z))
 end
 
 @device_override @inline function KI.get_local_size()
diff --git a/src/accumulate.jl b/src/accumulate.jl
index 54fab2119..e0631d387 100644
--- a/src/accumulate.jl
+++ b/src/accumulate.jl
@@ -22,9 +22,9 @@ function partial_scan(op::Function, output::AbstractArray{T}, input::AbstractArr
     temp = CuDynamicSharedArray(T, (2*threads,))
 
     # iterate the main dimension using threads and the first block dimension
-    i = (KI.get_group_id().x-1i32) * KI.get_local_size().x + KI.get_local_id().x
+    i = (KI.get_group_id().x - 1i32) * KI.get_local_size().x + KI.get_local_id().x
     # iterate the other dimensions using the remaining block dimensions
-    j = (KI.get_group_id().z-1i32) * KI.get_num_groups().y + KI.get_group_id().y
+    j = (KI.get_group_id().z - 1i32) * KI.get_num_groups().y + KI.get_group_id().y
 
     if j > length(Rother)
         return
@@ -105,9 +105,9 @@ function aggregate_partial_scan(op::Function, output::AbstractArray,
     block = KI.get_group_id().x
 
     # iterate the main dimension using threads and the first block dimension
-    i = (KI.get_group_id().x-1i32) * KI.get_local_size().x + KI.get_local_id().x
+    i = (KI.get_group_id().x - 1i32) * KI.get_local_size().x + KI.get_local_id().x
     # iterate the other dimensions using the remaining block dimensions
-    j = (KI.get_group_id().z-1i32) * KI.get_num_groups().y + KI.get_group_id().y
+    j = (KI.get_group_id().z - 1i32) * KI.get_num_groups().y + KI.get_group_id().y
 
     @inbounds if i <= length(Rdim) && j <= length(Rother)
         I = Rother[j]
diff --git a/src/device/random.jl b/src/device/random.jl
index 7d72d90a1..063c736ed 100644
--- a/src/device/random.jl
+++ b/src/device/random.jl
@@ -73,8 +73,8 @@ end
         @inbounds global_random_counters()[warpId]
     elseif field === :ctr2
         globalId = KI.get_global_id().x +
-                   (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
-                   (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
+            (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
+            (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
         globalId%UInt32
     end::UInt32
 end
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 97a4176b4..6fccff91e 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -294,8 +294,9 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
         end
 
         partial_kernel(f, op, init, Rreduce, Rother, Val(shuffle), partial, A;
-                    threads=partial_threads, blocks=partial_blocks, shmem=partial_shmem)
-                    # workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem)
+            threads = partial_threads, blocks = partial_blocks, shmem = partial_shmem
+        )
+        # workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem)
 
         GPUArrays.mapreducedim!(identity, op, R, partial; init)
     end
diff --git a/test/base/kernelabstractions.jl b/test/base/kernelabstractions.jl
index 2f2c4300b..1e674d3be 100644
--- a/test/base/kernelabstractions.jl
+++ b/test/base/kernelabstractions.jl
@@ -4,9 +4,14 @@ using SparseArrays
 
 include(joinpath(dirname(pathof(KernelAbstractions)), "..", "test", "testsuite.jl"))
 
-Testsuite.testsuite(()->CUDABackend(false, false), "CUDA", CUDA, CuArray, CuDeviceArray; skip_tests=Set([
-    "CPU synchronization",
-    "fallback test: callable types",]))
+Testsuite.testsuite(
+    () -> CUDABackend(false, false), "CUDA", CUDA, CuArray, CuDeviceArray; skip_tests = Set(
+        [
+            "CPU synchronization",
+            "fallback test: callable types",
+        ]
+    )
+)
 for (PreferBlocks, AlwaysInline) in Iterators.product((true, false), (true, false))
     Testsuite.unittest_testsuite(()->CUDABackend(PreferBlocks, AlwaysInline), "CUDA", CUDA, CuDeviceArray)
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index 802c832e5..11584ca92 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,6 +1,6 @@
 @static if VERSION < v"1.11" && get(ENV, "BUILDKITE_PIPELINE_NAME", "CUDA.jl") == "CUDA.jl"
     using Pkg
-    Pkg.add(url="https://github.com/JuliaGPU/KernelAbstractions.jl", rev="main")
+    Pkg.add(url = "https://github.com/JuliaGPU/KernelAbstractions.jl", rev = "main")
 end
 
 using Distributed

Comment thread src/mapreduce.jl Outdated
Comment thread src/mapreduce.jl Outdated
Comment thread src/CUDAKernels.jl Outdated
Copy link
Copy Markdown
Contributor

@github-actions github-actions Bot left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CUDA.jl Benchmarks

Details
Benchmark suite Current: cc9e99d Previous: 54c7586 Ratio
array/accumulate/Float32/1d 102863 ns 100040 ns 1.03
array/accumulate/Float32/dims=1 76858 ns 75598 ns 1.02
array/accumulate/Float32/dims=1L 1612907 ns 1585923 ns 1.02
array/accumulate/Float32/dims=2 142610 ns 140396 ns 1.02
array/accumulate/Float32/dims=2L 663488 ns 652952 ns 1.02
array/accumulate/Int64/1d 120081 ns 116760 ns 1.03
array/accumulate/Int64/dims=1 81357 ns 78966 ns 1.03
array/accumulate/Int64/dims=1L 1738738 ns 1697893 ns 1.02
array/accumulate/Int64/dims=2 152875 ns 150510 ns 1.02
array/accumulate/Int64/dims=2L 961211 ns 959254 ns 1.00
array/broadcast 18714 ns 18315 ns 1.02
array/construct 1233.7 ns 1307.5 ns 0.94
array/copy 16727 ns 16659 ns 1.00
array/copyto!/cpu_to_gpu 214216 ns 211984 ns 1.01
array/copyto!/gpu_to_cpu 279232 ns 280422 ns 1.00
array/copyto!/gpu_to_gpu 10501 ns 10380 ns 1.01
array/iteration/findall/bool 135277 ns 131500 ns 1.03
array/iteration/findall/int 149390 ns 145666 ns 1.03
array/iteration/findfirst/bool 70218 ns 68920 ns 1.02
array/iteration/findfirst/int 72711 ns 71508 ns 1.02
array/iteration/findmin/1d 67591 ns 66419 ns 1.02
array/iteration/findmin/2d 101064 ns 101084 ns 1.00
array/iteration/logical 197508 ns 190475 ns 1.04
array/iteration/scalar 65391 ns 65525 ns 1.00
array/permutedims/2d 49687 ns 49765 ns 1.00
array/permutedims/3d 52205 ns 50702 ns 1.03
array/permutedims/4d 53378 ns 51020 ns 1.05
array/random/rand/Float32 11842 ns 11632 ns 1.02
array/random/rand/Int64 22838 ns 22338 ns 1.02
array/random/rand!/Float32 9865.333333333334 ns 7935.333333333333 ns 1.24
array/random/rand!/Int64 21292 ns 17826 ns 1.19
array/random/randn/Float32 36568 ns 36306 ns 1.01
array/random/randn!/Float32 24173 ns 23975 ns 1.01
array/reductions/mapreduce/Float32/1d 34927 ns 33733 ns 1.04
array/reductions/mapreduce/Float32/dims=1 41939 ns 38085 ns 1.10
array/reductions/mapreduce/Float32/dims=1L 51268 ns 50507 ns 1.02
array/reductions/mapreduce/Float32/dims=2 61266 ns 55668 ns 1.10
array/reductions/mapreduce/Float32/dims=2L 69788 ns 67497 ns 1.03
array/reductions/mapreduce/Int64/1d 43255 ns 40096 ns 1.08
array/reductions/mapreduce/Int64/dims=1 45532 ns 40849 ns 1.11
array/reductions/mapreduce/Int64/dims=1L 87025 ns 86591 ns 1.01
array/reductions/mapreduce/Int64/dims=2 65507 ns 58177 ns 1.13
array/reductions/mapreduce/Int64/dims=2L 86051 ns 83232 ns 1.03
array/reductions/reduce/Float32/1d 35080 ns 33766 ns 1.04
array/reductions/reduce/Float32/dims=1 41985 ns 38178 ns 1.10
array/reductions/reduce/Float32/dims=1L 51132 ns 50557 ns 1.01
array/reductions/reduce/Float32/dims=2 61001 ns 55534 ns 1.10
array/reductions/reduce/Float32/dims=2L 70190 ns 68199 ns 1.03
array/reductions/reduce/Int64/1d 42765 ns 39982 ns 1.07
array/reductions/reduce/Int64/dims=1 45585 ns 41049 ns 1.11
array/reductions/reduce/Int64/dims=1L 86875 ns 86662 ns 1.00
array/reductions/reduce/Int64/dims=2 65593 ns 58182 ns 1.13
array/reductions/reduce/Int64/dims=2L 85819 ns 83392 ns 1.03
array/reverse/1d 17160 ns 17096 ns 1.00
array/reverse/1dL 68165 ns 67978 ns 1.00
array/reverse/1dL_inplace 65617 ns 65357 ns 1.00
array/reverse/1d_inplace 8617.333333333334 ns 8332.666666666666 ns 1.03
array/reverse/2d 20329 ns 20040 ns 1.01
array/reverse/2dL 72257 ns 71744 ns 1.01
array/reverse/2dL_inplace 65273 ns 65037 ns 1.00
array/reverse/2d_inplace 10013 ns 9712 ns 1.03
array/sorting/1d 2715945 ns 2725346 ns 1.00
array/sorting/2d 1063388 ns 1061549 ns 1.00
array/sorting/by 3269427 ns 3268510 ns 1.00
cuda/synchronization/context/auto 1137.7 ns 1148.8 ns 0.99
cuda/synchronization/context/blocking 947.2307692307693 ns 947 ns 1.00
cuda/synchronization/context/nonblocking 6077.4 ns 6099.8 ns 1.00
cuda/synchronization/stream/auto 996.2 ns 1006 ns 0.99
cuda/synchronization/stream/blocking 802.9010989010989 ns 859.8142857142857 ns 0.93
cuda/synchronization/stream/nonblocking 5941.4 ns 5966.2 ns 1.00
integration/byval/reference 143449 ns 143152 ns 1.00
integration/byval/slices=1 145551 ns 145205 ns 1.00
integration/byval/slices=2 283846 ns 283811 ns 1.00
integration/byval/slices=3 422584 ns 422282 ns 1.00
integration/cudadevrt 101944 ns 101717 ns 1.00
integration/volumerhs 8890230 ns 8896793 ns 1.00
kernel/indexing 12860 ns 12625 ns 1.02
kernel/indexing_checked 13729 ns 13467 ns 1.02
kernel/launch 2328.222222222222 ns 2103.8888888888887 ns 1.11
kernel/occupancy 694.2013422818792 ns 692.2105263157895 ns 1.00
kernel/rand 14406 ns 14754 ns 0.98
latency/import 4017459962 ns 3848826206 ns 1.04
latency/precompile 4758697989 ns 4629101097 ns 1.03
latency/ttfp 4644072202 ns 4482824517 ns 1.04

This comment was automatically generated by workflow using github-action-benchmark.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

1 participant