[Do not merge] Test KernelIntrinsics by christiangnrd · Pull Request #2944 · JuliaGPU/CUDA.jl

christiangnrd · 2025-10-22T14:56:20Z

[only tests]
[only benchmarks]

github-actions · 2025-10-22T14:57:06Z

Your PR requires formatting changes to meet the project's style guidelines.
Please consider running Runic (git runic master) to apply these changes.

Click here to view the suggested changes.

diff --git a/src/CUDAKernels.jl b/src/CUDAKernels.jl
index 5e39ab68e..7f8ec7ad4 100644
--- a/src/CUDAKernels.jl
+++ b/src/CUDAKernels.jl
@@ -162,29 +162,29 @@ end
 
 KI.argconvert(::CUDABackend, arg) = cudaconvert(arg)
 
-function KI.kernel_function(::CUDABackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
+function KI.kernel_function(::CUDABackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT}
     kern = cufunction(f, tt; name, kwargs...)
-    KI.Kernel{CUDABackend, typeof(kern)}(CUDABackend(), kern)
+    return KI.Kernel{CUDABackend, typeof(kern)}(CUDABackend(), kern)
 end
 
 function (obj::KI.Kernel{CUDABackend})(args...; numworkgroups = 1, workgroupsize = 1)
     KI.check_launch_args(numworkgroups, workgroupsize)
 
-    obj.kern(args...; threads=workgroupsize, blocks=numworkgroups)
+    obj.kern(args...; threads = workgroupsize, blocks = numworkgroups)
     return nothing
 end
 
 
-function KI.kernel_max_work_group_size(kernel::KI.Kernel{<:CUDABackend}; max_work_items::Int=typemax(Int))::Int
+function KI.kernel_max_work_group_size(kernel::KI.Kernel{<:CUDABackend}; max_work_items::Int = typemax(Int))::Int
     kernel_config = launch_configuration(kernel.kern.fun)
 
-    Int(min(kernel_config.threads, max_work_items))
+    return Int(min(kernel_config.threads, max_work_items))
 end
 function KI.max_work_group_size(::CUDABackend)::Int
-    Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK))
+    return Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK))
 end
 function KI.multiprocessor_count(::CUDABackend)::Int
-    Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
+    return Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
 end
 
 ## indexing
@@ -199,7 +199,7 @@ end
 end
 
 @device_override @inline function KI.get_global_id()
-    return (; x = Int((blockIdx().x-1)*blockDim().x + threadIdx().x), y = Int((blockIdx().y-1)*blockDim().y + threadIdx().y), z = Int((blockIdx().z-1)*blockDim().z + threadIdx().z))
+    return (; x = Int((blockIdx().x - 1) * blockDim().x + threadIdx().x), y = Int((blockIdx().y - 1) * blockDim().y + threadIdx().y), z = Int((blockIdx().z - 1) * blockDim().z + threadIdx().z))
 end
 
 @device_override @inline function KI.get_local_size()
diff --git a/src/accumulate.jl b/src/accumulate.jl
index 54fab2119..e0631d387 100644
--- a/src/accumulate.jl
+++ b/src/accumulate.jl
@@ -22,9 +22,9 @@ function partial_scan(op::Function, output::AbstractArray{T}, input::AbstractArr
     temp = CuDynamicSharedArray(T, (2*threads,))
 
     # iterate the main dimension using threads and the first block dimension
-    i = (KI.get_group_id().x-1i32) * KI.get_local_size().x + KI.get_local_id().x
+    i = (KI.get_group_id().x - 1i32) * KI.get_local_size().x + KI.get_local_id().x
     # iterate the other dimensions using the remaining block dimensions
-    j = (KI.get_group_id().z-1i32) * KI.get_num_groups().y + KI.get_group_id().y
+    j = (KI.get_group_id().z - 1i32) * KI.get_num_groups().y + KI.get_group_id().y
 
     if j > length(Rother)
         return
@@ -105,9 +105,9 @@ function aggregate_partial_scan(op::Function, output::AbstractArray,
     block = KI.get_group_id().x
 
     # iterate the main dimension using threads and the first block dimension
-    i = (KI.get_group_id().x-1i32) * KI.get_local_size().x + KI.get_local_id().x
+    i = (KI.get_group_id().x - 1i32) * KI.get_local_size().x + KI.get_local_id().x
     # iterate the other dimensions using the remaining block dimensions
-    j = (KI.get_group_id().z-1i32) * KI.get_num_groups().y + KI.get_group_id().y
+    j = (KI.get_group_id().z - 1i32) * KI.get_num_groups().y + KI.get_group_id().y
 
     @inbounds if i <= length(Rdim) && j <= length(Rother)
         I = Rother[j]
diff --git a/src/device/random.jl b/src/device/random.jl
index 7d72d90a1..063c736ed 100644
--- a/src/device/random.jl
+++ b/src/device/random.jl
@@ -73,8 +73,8 @@ end
         @inbounds global_random_counters()[warpId]
     elseif field === :ctr2
         globalId = KI.get_global_id().x +
-                   (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
-                   (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
+            (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
+            (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
         globalId%UInt32
     end::UInt32
 end
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 97a4176b4..6fccff91e 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -294,8 +294,9 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
         end
 
         partial_kernel(f, op, init, Rreduce, Rother, Val(shuffle), partial, A;
-                    threads=partial_threads, blocks=partial_blocks, shmem=partial_shmem)
-                    # workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem)
+            threads = partial_threads, blocks = partial_blocks, shmem = partial_shmem
+        )
+        # workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem)
 
         GPUArrays.mapreducedim!(identity, op, R, partial; init)
     end
diff --git a/test/base/kernelabstractions.jl b/test/base/kernelabstractions.jl
index 2f2c4300b..1e674d3be 100644
--- a/test/base/kernelabstractions.jl
+++ b/test/base/kernelabstractions.jl
@@ -4,9 +4,14 @@ using SparseArrays
 
 include(joinpath(dirname(pathof(KernelAbstractions)), "..", "test", "testsuite.jl"))
 
-Testsuite.testsuite(()->CUDABackend(false, false), "CUDA", CUDA, CuArray, CuDeviceArray; skip_tests=Set([
-    "CPU synchronization",
-    "fallback test: callable types",]))
+Testsuite.testsuite(
+    () -> CUDABackend(false, false), "CUDA", CUDA, CuArray, CuDeviceArray; skip_tests = Set(
+        [
+            "CPU synchronization",
+            "fallback test: callable types",
+        ]
+    )
+)
 for (PreferBlocks, AlwaysInline) in Iterators.product((true, false), (true, false))
     Testsuite.unittest_testsuite(()->CUDABackend(PreferBlocks, AlwaysInline), "CUDA", CUDA, CuDeviceArray)
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index 802c832e5..11584ca92 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,6 +1,6 @@
 @static if VERSION < v"1.11" && get(ENV, "BUILDKITE_PIPELINE_NAME", "CUDA.jl") == "CUDA.jl"
     using Pkg
-    Pkg.add(url="https://github.com/JuliaGPU/KernelAbstractions.jl", rev="main")
+    Pkg.add(url = "https://github.com/JuliaGPU/KernelAbstractions.jl", rev = "main")
 end
 
 using Distributed

github-actions

CUDA.jl Benchmarks

Details

Benchmark suite	Current: `cc9e99d`	Previous: `54c7586`	Ratio
`array/accumulate/Float32/1d`	`102863` ns	`100040` ns	`1.03`
`array/accumulate/Float32/dims=1`	`76858` ns	`75598` ns	`1.02`
`array/accumulate/Float32/dims=1L`	`1612907` ns	`1585923` ns	`1.02`
`array/accumulate/Float32/dims=2`	`142610` ns	`140396` ns	`1.02`
`array/accumulate/Float32/dims=2L`	`663488` ns	`652952` ns	`1.02`
`array/accumulate/Int64/1d`	`120081` ns	`116760` ns	`1.03`
`array/accumulate/Int64/dims=1`	`81357` ns	`78966` ns	`1.03`
`array/accumulate/Int64/dims=1L`	`1738738` ns	`1697893` ns	`1.02`
`array/accumulate/Int64/dims=2`	`152875` ns	`150510` ns	`1.02`
`array/accumulate/Int64/dims=2L`	`961211` ns	`959254` ns	`1.00`
`array/broadcast`	`18714` ns	`18315` ns	`1.02`
`array/construct`	`1233.7` ns	`1307.5` ns	`0.94`
`array/copy`	`16727` ns	`16659` ns	`1.00`
`array/copyto!/cpu_to_gpu`	`214216` ns	`211984` ns	`1.01`
`array/copyto!/gpu_to_cpu`	`279232` ns	`280422` ns	`1.00`
`array/copyto!/gpu_to_gpu`	`10501` ns	`10380` ns	`1.01`
`array/iteration/findall/bool`	`135277` ns	`131500` ns	`1.03`
`array/iteration/findall/int`	`149390` ns	`145666` ns	`1.03`
`array/iteration/findfirst/bool`	`70218` ns	`68920` ns	`1.02`
`array/iteration/findfirst/int`	`72711` ns	`71508` ns	`1.02`
`array/iteration/findmin/1d`	`67591` ns	`66419` ns	`1.02`
`array/iteration/findmin/2d`	`101064` ns	`101084` ns	`1.00`
`array/iteration/logical`	`197508` ns	`190475` ns	`1.04`
`array/iteration/scalar`	`65391` ns	`65525` ns	`1.00`
`array/permutedims/2d`	`49687` ns	`49765` ns	`1.00`
`array/permutedims/3d`	`52205` ns	`50702` ns	`1.03`
`array/permutedims/4d`	`53378` ns	`51020` ns	`1.05`
`array/random/rand/Float32`	`11842` ns	`11632` ns	`1.02`
`array/random/rand/Int64`	`22838` ns	`22338` ns	`1.02`
`array/random/rand!/Float32`	`9865.333333333334` ns	`7935.333333333333` ns	`1.24`
`array/random/rand!/Int64`	`21292` ns	`17826` ns	`1.19`
`array/random/randn/Float32`	`36568` ns	`36306` ns	`1.01`
`array/random/randn!/Float32`	`24173` ns	`23975` ns	`1.01`
`array/reductions/mapreduce/Float32/1d`	`34927` ns	`33733` ns	`1.04`
`array/reductions/mapreduce/Float32/dims=1`	`41939` ns	`38085` ns	`1.10`
`array/reductions/mapreduce/Float32/dims=1L`	`51268` ns	`50507` ns	`1.02`
`array/reductions/mapreduce/Float32/dims=2`	`61266` ns	`55668` ns	`1.10`
`array/reductions/mapreduce/Float32/dims=2L`	`69788` ns	`67497` ns	`1.03`
`array/reductions/mapreduce/Int64/1d`	`43255` ns	`40096` ns	`1.08`
`array/reductions/mapreduce/Int64/dims=1`	`45532` ns	`40849` ns	`1.11`
`array/reductions/mapreduce/Int64/dims=1L`	`87025` ns	`86591` ns	`1.01`
`array/reductions/mapreduce/Int64/dims=2`	`65507` ns	`58177` ns	`1.13`
`array/reductions/mapreduce/Int64/dims=2L`	`86051` ns	`83232` ns	`1.03`
`array/reductions/reduce/Float32/1d`	`35080` ns	`33766` ns	`1.04`
`array/reductions/reduce/Float32/dims=1`	`41985` ns	`38178` ns	`1.10`
`array/reductions/reduce/Float32/dims=1L`	`51132` ns	`50557` ns	`1.01`
`array/reductions/reduce/Float32/dims=2`	`61001` ns	`55534` ns	`1.10`
`array/reductions/reduce/Float32/dims=2L`	`70190` ns	`68199` ns	`1.03`
`array/reductions/reduce/Int64/1d`	`42765` ns	`39982` ns	`1.07`
`array/reductions/reduce/Int64/dims=1`	`45585` ns	`41049` ns	`1.11`
`array/reductions/reduce/Int64/dims=1L`	`86875` ns	`86662` ns	`1.00`
`array/reductions/reduce/Int64/dims=2`	`65593` ns	`58182` ns	`1.13`
`array/reductions/reduce/Int64/dims=2L`	`85819` ns	`83392` ns	`1.03`
`array/reverse/1d`	`17160` ns	`17096` ns	`1.00`
`array/reverse/1dL`	`68165` ns	`67978` ns	`1.00`
`array/reverse/1dL_inplace`	`65617` ns	`65357` ns	`1.00`
`array/reverse/1d_inplace`	`8617.333333333334` ns	`8332.666666666666` ns	`1.03`
`array/reverse/2d`	`20329` ns	`20040` ns	`1.01`
`array/reverse/2dL`	`72257` ns	`71744` ns	`1.01`
`array/reverse/2dL_inplace`	`65273` ns	`65037` ns	`1.00`
`array/reverse/2d_inplace`	`10013` ns	`9712` ns	`1.03`
`array/sorting/1d`	`2715945` ns	`2725346` ns	`1.00`
`array/sorting/2d`	`1063388` ns	`1061549` ns	`1.00`
`array/sorting/by`	`3269427` ns	`3268510` ns	`1.00`
`cuda/synchronization/context/auto`	`1137.7` ns	`1148.8` ns	`0.99`
`cuda/synchronization/context/blocking`	`947.2307692307693` ns	`947` ns	`1.00`
`cuda/synchronization/context/nonblocking`	`6077.4` ns	`6099.8` ns	`1.00`
`cuda/synchronization/stream/auto`	`996.2` ns	`1006` ns	`0.99`
`cuda/synchronization/stream/blocking`	`802.9010989010989` ns	`859.8142857142857` ns	`0.93`
`cuda/synchronization/stream/nonblocking`	`5941.4` ns	`5966.2` ns	`1.00`
`integration/byval/reference`	`143449` ns	`143152` ns	`1.00`
`integration/byval/slices=1`	`145551` ns	`145205` ns	`1.00`
`integration/byval/slices=2`	`283846` ns	`283811` ns	`1.00`
`integration/byval/slices=3`	`422584` ns	`422282` ns	`1.00`
`integration/cudadevrt`	`101944` ns	`101717` ns	`1.00`
`integration/volumerhs`	`8890230` ns	`8896793` ns	`1.00`
`kernel/indexing`	`12860` ns	`12625` ns	`1.02`
`kernel/indexing_checked`	`13729` ns	`13467` ns	`1.02`
`kernel/launch`	`2328.222222222222` ns	`2103.8888888888887` ns	`1.11`
`kernel/occupancy`	`694.2013422818792` ns	`692.2105263157895` ns	`1.00`
`kernel/rand`	`14406` ns	`14754` ns	`0.98`
`latency/import`	`4017459962` ns	`3848826206` ns	`1.04`
`latency/precompile`	`4758697989` ns	`4629101097` ns	`1.03`
`latency/ttfp`	`4644072202` ns	`4482824517` ns	`1.04`

This comment was automatically generated by workflow using github-action-benchmark.

sdf

christiangnrd force-pushed the intrinsics branch from edaeb41 to 8e3e1d4 Compare October 22, 2025 15:00

christiangnrd commented Oct 22, 2025

View reviewed changes

Comment thread src/mapreduce.jl Outdated

christiangnrd commented Oct 22, 2025

View reviewed changes

Comment thread src/mapreduce.jl Outdated

christiangnrd commented Oct 22, 2025

View reviewed changes

Comment thread src/CUDAKernels.jl Outdated

christiangnrd force-pushed the intrinsics branch from 497ef42 to 506e02d Compare October 22, 2025 17:23

github-actions Bot reviewed Oct 22, 2025

View reviewed changes

christiangnrd force-pushed the intrinsics branch 2 times, most recently from aef3728 to fef539a Compare November 6, 2025 14:51

christiangnrd force-pushed the intrinsics branch 6 times, most recently from e2d7489 to 180f4a5 Compare November 18, 2025 22:44

christiangnrd force-pushed the intrinsics branch from 180f4a5 to d4c271b Compare December 12, 2025 22:15

christiangnrd force-pushed the intrinsics branch from 80b75b2 to 14f29e5 Compare January 2, 2026 20:17

christiangnrd force-pushed the intrinsics branch from 14f29e5 to b64dcd6 Compare February 13, 2026 15:32

christiangnrd force-pushed the intrinsics branch from b64dcd6 to 3c75eb6 Compare March 24, 2026 16:47

maleadt force-pushed the master branch from f1e7455 to 5a6f767 Compare March 26, 2026 08:13

christiangnrd force-pushed the intrinsics branch from 3c75eb6 to 487ad3b Compare April 9, 2026 16:30

maleadt force-pushed the main branch from ce29a43 to d3fe605 Compare May 14, 2026 10:31

christiangnrd force-pushed the intrinsics branch from 487ad3b to 575e7cc Compare May 28, 2026 23:23

christiangnrd added 3 commits May 28, 2026 21:11

KernelIntrinsics

c35b8a2

Dogfood

ecb6032

Temp CI

cc9e99d

christiangnrd force-pushed the intrinsics branch from 40792e8 to cc9e99d Compare May 29, 2026 00:19

[temp] Pretend to support KA 0.9

2e789ec

sdf

christiangnrd force-pushed the intrinsics branch from 73a071f to 2e789ec Compare June 2, 2026 15:37

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[Do not merge] Test KernelIntrinsics#2944

[Do not merge] Test KernelIntrinsics#2944
christiangnrd wants to merge 4 commits into
JuliaGPU:mainfrom
christiangnrd:intrinsics

christiangnrd commented Oct 22, 2025

Uh oh!

github-actions Bot commented Oct 22, 2025 •

edited

Loading

Uh oh!

Uh oh!

Uh oh!

Uh oh!

github-actions Bot left a comment •

edited

Loading

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

1 participant

Conversation

christiangnrd commented Oct 22, 2025

Uh oh!

github-actions Bot commented Oct 22, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

github-actions Bot left a comment • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

CUDA.jl Benchmarks

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

1 participant

github-actions Bot commented Oct 22, 2025 •

edited

Loading

github-actions Bot left a comment •

edited

Loading