JuliaGPU · AntonOresten · May 29, 2026
diff --git a/examples/Project.toml b/examples/Project.toml
@@ -1,11 +1,8 @@
 [deps]
-CUDACore = "bd0ed864-bdfe-4181-a5ed-ce625a5fdea2"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
 NVTX = "5da4648a-3479-48b8-97b9-01cb529c0a1f"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
-cuBLAS = "182d3088-87b7-4494-8cad-fc6afaa545bc"
-cuFFT = "533571aa-0936-420e-b4be-9c66f5f626ca"
-cuRAND = "20fd9a0b-12d5-4c2f-a8af-7c34e9e60431"
 cuTile = "0dea8319-8c4a-4662-a73d-20234d115b9a"
 
 [sources]

diff --git a/examples/batchmatmul.jl b/examples/batchmatmul.jl
@@ -5,8 +5,7 @@
 # Uses Julia-idiomatic batch-last ordering: A(M, K, Batch), B(K, N, Batch), C(M, N, Batch)
 # This provides optimal memory access with Julia's column-major layout.
 
-using CUDACore, NVTX
-import cuRAND, cuBLAS
+using CUDA, NVTX
 using cuTile: cuTile
 import cuTile as ct
 
@@ -78,15 +77,15 @@ function run(data; tm::Int=128, tn::Int=128, tk::Int=64, nruns::Int=1, warmup::I
     (; A, B, C, M, N, Batch) = data
     grid = (cld(M, tm), cld(N, tn), Batch)
 
-    CUDACore.@sync for _ in 1:warmup
+    CUDA.@sync for _ in 1:warmup
         @cuda backend=cuTile blocks=grid batch_matmul_kernel(A, B, C, ct.Constant(tm), ct.Constant(tn), ct.Constant(tk))
     end
 
     times = Float64[]
     NVTX.@range "cuTile" begin
         for i in 1:nruns
             NVTX.@range "run $i" begin
-                t = CUDACore.@elapsed @cuda backend=cuTile blocks=grid batch_matmul_kernel(A, B, C, ct.Constant(tm), ct.Constant(tn), ct.Constant(tk))
+                t = CUDA.@elapsed @cuda backend=cuTile blocks=grid batch_matmul_kernel(A, B, C, ct.Constant(tm), ct.Constant(tn), ct.Constant(tk))
                 push!(times, t * 1000)  # ms
             end
         end
@@ -122,14 +121,14 @@ function run_others(data; nruns::Int=1, warmup::Int=0)
     C_cublas = similar(A, M, N, Batch)
 
-    # cuBLAS batched gemm via CUBLAS.gemm_strided_batched!
+    # cuBLAS batched gemm via cuBLAS.gemm_strided_batched!
-    CUDACore.@sync for _ in 1:warmup
+    CUDA.@sync for _ in 1:warmup
         cuBLAS.gemm_strided_batched!('N', 'N', one(eltype(A)), A, B, zero(eltype(A)), C_cublas)
     end
     times_cublas = Float64[]
     NVTX.@range "cuBLAS batched" begin
         for i in 1:nruns
             NVTX.@range "run $i" begin
-                t = CUDACore.@elapsed cuBLAS.gemm_strided_batched!('N', 'N', one(eltype(A)), A, B, zero(eltype(A)), C_cublas)
+                t = CUDA.@elapsed cuBLAS.gemm_strided_batched!('N', 'N', one(eltype(A)), A, B, zero(eltype(A)), C_cublas)
                 push!(times_cublas, t * 1000)
             end
         end

diff --git a/examples/benchmarks.jl b/examples/benchmarks.jl
@@ -3,7 +3,7 @@
 # Generic benchmark runner for cuTile.jl examples
 # Discovers and benchmarks all examples in the examples/ directory
 
-using CUDACore
+using CUDA
 
 #=============================================================================
  Configuration
@@ -140,7 +140,7 @@ function main(args...)
     println()
     println("Configuration:")
     println("  Runs: $NRUNS (+ $WARMUP warmup)")
-    println("  GPU: ", CUDACore.name(CUDACore.device()))
+    println("  GPU: ", CUDA.name(CUDA.device()))
 
     for name in discover_benchmarks(args...)
         println("\nBenchmarking $name...")

diff --git a/examples/fft.jl b/examples/fft.jl
@@ -8,8 +8,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-using CUDACore, NVTX
-import cuRAND, cuFFT
+using CUDA, NVTX
 using cuTile: cuTile
 import cuTile as ct
 using Test
@@ -220,15 +219,15 @@ function run(data; nruns::Int=1, warmup::Int=0)
     BS = 1
     grid = (batch ÷ BS, 1, 1)
 
-    CUDACore.@sync for _ in 1:warmup
+    CUDA.@sync for _ in 1:warmup
         @cuda backend=cuTile blocks=grid fft_kernel(x_packed, y_packed, W0_gpu, W1_gpu, W2_gpu, T0_gpu, T1_gpu, ct.Constant(N), ct.Constant(F0), ct.Constant(F1), ct.Constant(F2), ct.Constant(BS), ct.Constant(D))
     end
 
     times = Float64[]
     NVTX.@range "cuTile" begin
         for i in 1:nruns
             NVTX.@range "run $i" begin
-                t = CUDACore.@elapsed @cuda backend=cuTile blocks=grid fft_kernel(x_packed, y_packed, W0_gpu, W1_gpu, W2_gpu, T0_gpu, T1_gpu, ct.Constant(N), ct.Constant(F0), ct.Constant(F1), ct.Constant(F2), ct.Constant(BS), ct.Constant(D))
+                t = CUDA.@elapsed @cuda backend=cuTile blocks=grid fft_kernel(x_packed, y_packed, W0_gpu, W1_gpu, W2_gpu, T0_gpu, T1_gpu, ct.Constant(N), ct.Constant(F0), ct.Constant(F1), ct.Constant(F2), ct.Constant(BS), ct.Constant(D))
                 push!(times, t * 1000)  # ms
             end
         end
@@ -262,15 +261,15 @@ function run_others(data; nruns::Int=1, warmup::Int=0)
     results = Dict{String, Vector{Float64}}()
 
     plan = cuFFT.plan_fft!(input, 1)
-    CUDACore.@sync for _ in 1:warmup
+    CUDA.@sync for _ in 1:warmup
         plan * copy(input)
     end
     times_cufft = Float64[]
     NVTX.@range "cuFFT" begin
         for i in 1:nruns
             NVTX.@range "run $i" begin
                 input_copy = copy(input)
-                t = CUDACore.@elapsed plan * input_copy
+                t = CUDA.@elapsed plan * input_copy
                 push!(times_cufft, t * 1000)
             end
         end

diff --git a/examples/fmha.jl b/examples/fmha.jl
@@ -11,8 +11,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-using CUDACore, NVTX
-import cuRAND
+using CUDA, NVTX
 using cuTile: cuTile
 import cuTile as ct
 
@@ -254,7 +253,7 @@ end
 function run(data; nruns::Int=1, warmup::Int=0)
     (; Q, K, V, causal, tile_m, tile_n, query_group_size) = data
 
-    CUDACore.@sync for _ in 1:warmup
+    CUDA.@sync for _ in 1:warmup
         cutile_fmha(Q, K, V; tile_m, tile_n, query_group_size, causal)
     end
 
@@ -263,7 +262,7 @@ function run(data; nruns::Int=1, warmup::Int=0)
     NVTX.@range "cuTile" begin
         for i in 1:nruns
             NVTX.@range "run $i" begin
-                t = CUDACore.@elapsed begin
+                t = CUDA.@elapsed begin
                     out = cutile_fmha(Q, K, V; tile_m, tile_n, query_group_size, causal)
                 end
                 push!(times, t * 1000)  # ms

diff --git a/examples/layernorm.jl b/examples/layernorm.jl
@@ -5,8 +5,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-using CUDACore, NVTX
-import cuRAND
+using CUDA, NVTX
 using cuTile: cuTile
 import cuTile as ct
 
@@ -252,7 +251,7 @@ function run(data; TILE_N::Int=1024, TILE_M::Int=32, nruns::Int=1, warmup::Int=0
     end
 
     # Warmup
-    CUDACore.@sync for _ in 1:warmup
+    CUDA.@sync for _ in 1:warmup
         run_fwd()
         run_bwd()
     end
@@ -262,7 +261,7 @@ function run(data; TILE_N::Int=1024, TILE_M::Int=32, nruns::Int=1, warmup::Int=0
     NVTX.@range "cuTile Fwd" begin
         for i in 1:nruns
             NVTX.@range "run $i" begin
-                t = CUDACore.@elapsed run_fwd()
+                t = CUDA.@elapsed run_fwd()
                 push!(times_fwd, t * 1000)  # ms
             end
         end
@@ -273,7 +272,7 @@ function run(data; TILE_N::Int=1024, TILE_M::Int=32, nruns::Int=1, warmup::Int=0
     NVTX.@range "cuTile Bwd" begin
         for i in 1:nruns
             NVTX.@range "run $i" begin
-                t = CUDACore.@elapsed run_bwd()
+                t = CUDA.@elapsed run_bwd()
                 push!(times_bwd, t * 1000)  # ms
             end
         end

diff --git a/examples/matmul.jl b/examples/matmul.jl
@@ -2,8 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-using CUDACore, NVTX
-import cuRAND, cuBLAS
+using CUDA, NVTX
 using LinearAlgebra
 using cuTile: cuTile
 import cuTile as ct
@@ -87,7 +86,7 @@ function run(data; tm::Int=64, tn::Int=64, tk::Int=64, nruns::Int=1, warmup::Int
     NVTX.@range "cuTile" begin
         for i in 1:nruns
             NVTX.@range "run $i" begin
-                t = CUDACore.@elapsed @cuda backend=cuTile blocks=grid matmul_kernel(A, B, C, ct.Constant(tm), ct.Constant(tn), ct.Constant(tk))
+                t = CUDA.@elapsed @cuda backend=cuTile blocks=grid matmul_kernel(A, B, C, ct.Constant(tm), ct.Constant(tn), ct.Constant(tk))
                 push!(times, t * 1000)  # ms
             end
         end
@@ -117,14 +116,14 @@ function run_others(data; nruns::Int=1, warmup::Int=0)
     C_gpuarrays = similar(A, size(A, 1), size(B, 2))
 
     # GPUArrays (uses cuBLAS under the hood via LinearAlgebra.mul!)
-    CUDACore.@sync for _ in 1:warmup
+    CUDA.@sync for _ in 1:warmup
         mul!(C_gpuarrays, A, B)
     end
     times_gpuarrays = Float64[]
     NVTX.@range "cuBLAS" begin
         for i in 1:nruns
             NVTX.@range "run $i" begin
-                t = CUDACore.@elapsed mul!(C_gpuarrays, A, B)
+                t = CUDA.@elapsed mul!(C_gpuarrays, A, B)
                 push!(times_gpuarrays, t * 1000)
             end
         end

diff --git a/examples/moe.jl b/examples/moe.jl
@@ -8,8 +8,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-using CUDACore, NVTX
-import cuRAND
+using CUDA, NVTX
 using Random: randperm
 using cuTile: cuTile
 import cuTile as ct
@@ -226,9 +225,9 @@ function cutile_moe(hidden_states::CuArray{T}, w1, w2, topk_weights, topk_ids,
 
     # Intermediate caches: reversed from Python for column-major
     # Python (num_tokens, topk, dim) → Julia (dim, topk, num_tokens)
-    cache1 = CUDACore.zeros(T, intermediate_size * 2, topk, num_tokens)
-    cache2 = CUDACore.zeros(T, intermediate_size, total_tokens)
-    cache3 = CUDACore.zeros(T, hidden_size, topk, num_tokens)
+    cache1 = CUDA.zeros(T, intermediate_size * 2, topk, num_tokens)
+    cache2 = CUDA.zeros(T, intermediate_size, total_tokens)
+    cache3 = CUDA.zeros(T, hidden_size, topk, num_tokens)
 
     sorted_token_ids, sorted_expert_ids = moe_align_tile_size(
         Array(topk_ids), tile_m, num_experts)
@@ -346,7 +345,7 @@ end
 function run(data; nruns::Int=1, warmup::Int=0)
     (; hidden_states, w1, w2, topk_weights, topk_ids, tile_m, tile_n, tile_k) = data
 
-    CUDACore.@sync for _ in 1:warmup
+    CUDA.@sync for _ in 1:warmup
         cutile_moe(hidden_states, w1, w2, topk_weights, topk_ids, tile_m, tile_n, tile_k)
     end
 
@@ -355,7 +354,7 @@ function run(data; nruns::Int=1, warmup::Int=0)
     NVTX.@range "cuTile" begin
         for i in 1:nruns
             NVTX.@range "run $i" begin
-                t = CUDACore.@elapsed begin
+                t = CUDA.@elapsed begin
                     out = cutile_moe(hidden_states, w1, w2, topk_weights, topk_ids,
                                      tile_m, tile_n, tile_k)
                 end

diff --git a/examples/softmax.jl b/examples/softmax.jl
@@ -7,8 +7,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-using CUDACore, NVTX
-import cuRAND
+using CUDA, NVTX
 using cuTile: cuTile
 import cuTile as ct
 
@@ -142,7 +141,7 @@ function run(data; tile_tma::Int=next_power_of_2(data.N),
     end
 
     # Warmup
-    CUDACore.@sync for _ in 1:warmup
+    CUDA.@sync for _ in 1:warmup
         run_tma()
         run_chunked()
     end
@@ -152,7 +151,7 @@ function run(data; tile_tma::Int=next_power_of_2(data.N),
     NVTX.@range "cuTile TMA" begin
         for i in 1:nruns
             NVTX.@range "run $i" begin
-                t = CUDACore.@elapsed run_tma()
+                t = CUDA.@elapsed run_tma()
                 push!(times_tma, t * 1000)
             end
         end
@@ -163,7 +162,7 @@ function run(data; tile_tma::Int=next_power_of_2(data.N),
     NVTX.@range "cuTile Chunked" begin
         for i in 1:nruns
             NVTX.@range "run $i" begin
-                t = CUDACore.@elapsed run_chunked()
+                t = CUDA.@elapsed run_chunked()
                 push!(times_chunked, t * 1000)
             end
         end
@@ -215,14 +214,14 @@ function run_others(data; nruns::Int=1, warmup::Int=0)
         out .= exps ./ sum(exps; dims=1)
     end
 
-    CUDACore.@sync for _ in 1:warmup
+    CUDA.@sync for _ in 1:warmup
         gpu_softmax!()
     end
     times = Float64[]
     NVTX.@range "GPUArrays" begin
         for i in 1:nruns
             NVTX.@range "run $i" begin
-                t = CUDACore.@elapsed gpu_softmax!()
+                t = CUDA.@elapsed gpu_softmax!()
                 push!(times, t * 1000)
             end
         end

diff --git a/examples/transpose.jl b/examples/transpose.jl
@@ -2,8 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-using CUDACore, NVTX
-import cuRAND
+using CUDA, NVTX
 using cuTile: cuTile
 import cuTile as ct
 
@@ -39,15 +38,15 @@ function run(data; tm::Int=64, tn::Int=64, nruns::Int=1, warmup::Int=0)
     (; x, y, m, n) = data
     grid = (cld(m, tm), cld(n, tn))
 
-    CUDACore.@sync for _ in 1:warmup
+    CUDA.@sync for _ in 1:warmup
         @cuda backend=cuTile blocks=grid transpose_kernel(x, y, ct.Constant(tm), ct.Constant(tn))
     end
 
     times = Float64[]
     NVTX.@range "cuTile" begin
         for i in 1:nruns
             NVTX.@range "run $i" begin
-                t = CUDACore.@elapsed @cuda backend=cuTile blocks=grid transpose_kernel(x, y, ct.Constant(tm), ct.Constant(tn))
+                t = CUDA.@elapsed @cuda backend=cuTile blocks=grid transpose_kernel(x, y, ct.Constant(tm), ct.Constant(tn))
                 push!(times, t * 1000)  # ms
             end
         end
@@ -88,14 +87,14 @@ function run_others(data; nruns::Int=1, warmup::Int=0)
     y_simt = similar(x, n, m)
 
     # GPUArrays (permutedims)
-    CUDACore.@sync for _ in 1:warmup
+    CUDA.@sync for _ in 1:warmup
         permutedims!(y_gpuarrays, x, (2, 1))
     end
     times_gpuarrays = Float64[]
     NVTX.@range "GPUArrays" begin
         for i in 1:nruns
             NVTX.@range "run $i" begin
-                t = CUDACore.@elapsed permutedims!(y_gpuarrays, x, (2, 1))
+                t = CUDA.@elapsed permutedims!(y_gpuarrays, x, (2, 1))
                 push!(times_gpuarrays, t * 1000)
             end
         end
@@ -105,14 +104,14 @@ function run_others(data; nruns::Int=1, warmup::Int=0)
     # SIMT naive kernel
     threads = (16, 16)
     blocks = (cld(m, threads[1]), cld(n, threads[2]))
-    CUDACore.@sync for _ in 1:warmup
+    CUDA.@sync for _ in 1:warmup
         @cuda threads=threads blocks=blocks simt_naive_kernel(x, y_simt, m, n)
     end
     times_simt = Float64[]
     NVTX.@range "SIMT naive" begin
         for i in 1:nruns
             NVTX.@range "run $i" begin
-                t = CUDACore.@elapsed @cuda threads=threads blocks=blocks simt_naive_kernel(x, y_simt, m, n)
+                t = CUDA.@elapsed @cuda threads=threads blocks=blocks simt_naive_kernel(x, y_simt, m, n)
                 push!(times_simt, t * 1000)
             end
         end