From 34d4c774647d8ef7f18a8422578a8629576f10c1 Mon Sep 17 00:00:00 2001
From: AntonOresten <antonoresten@proton.me>
Date: Fri, 29 May 2026 21:56:14 +0200
Subject: [PATCH] Make test/Project.toml a superset of `examples/Project.toml`

---
 examples/Project.toml   |  5 +----
 examples/batchmatmul.jl | 11 +++++------
 examples/benchmarks.jl  |  4 ++--
 examples/fft.jl         | 11 +++++------
 examples/fmha.jl        |  7 +++----
 examples/layernorm.jl   |  9 ++++-----
 examples/matmul.jl      |  9 ++++-----
 examples/moe.jl         | 13 ++++++-------
 examples/softmax.jl     | 13 ++++++-------
 examples/transpose.jl   | 15 +++++++--------
 examples/vadd.jl        | 23 +++++++++++------------
 test/Project.toml       |  3 +++
 test/runtests.jl        | 14 ++++----------
 13 files changed, 61 insertions(+), 76 deletions(-)

diff --git a/examples/Project.toml b/examples/Project.toml
index e2bda4e7..6d20616b 100644
--- a/examples/Project.toml
+++ b/examples/Project.toml
@@ -1,11 +1,8 @@
 [deps]
-CUDACore = "bd0ed864-bdfe-4181-a5ed-ce625a5fdea2"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
 NVTX = "5da4648a-3479-48b8-97b9-01cb529c0a1f"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
-cuBLAS = "182d3088-87b7-4494-8cad-fc6afaa545bc"
-cuFFT = "533571aa-0936-420e-b4be-9c66f5f626ca"
-cuRAND = "20fd9a0b-12d5-4c2f-a8af-7c34e9e60431"
 cuTile = "0dea8319-8c4a-4662-a73d-20234d115b9a"
 
 [sources]
diff --git a/examples/batchmatmul.jl b/examples/batchmatmul.jl
index 5cfb595a..fe8779d4 100644
--- a/examples/batchmatmul.jl
+++ b/examples/batchmatmul.jl
@@ -5,8 +5,7 @@
 # Uses Julia-idiomatic batch-last ordering: A(M, K, Batch), B(K, N, Batch), C(M, N, Batch)
 # This provides optimal memory access with Julia's column-major layout.
 
-using CUDACore, NVTX
-import cuRAND, cuBLAS
+using CUDA, NVTX
 using cuTile: cuTile
 import cuTile as ct
 
@@ -78,7 +77,7 @@ function run(data; tm::Int=128, tn::Int=128, tk::Int=64, nruns::Int=1, warmup::I
     (; A, B, C, M, N, Batch) = data
     grid = (cld(M, tm), cld(N, tn), Batch)
 
-    CUDACore.@sync for _ in 1:warmup
+    CUDA.@sync for _ in 1:warmup
         @cuda backend=cuTile blocks=grid batch_matmul_kernel(A, B, C, ct.Constant(tm), ct.Constant(tn), ct.Constant(tk))
     end
 
@@ -86,7 +85,7 @@ function run(data; tm::Int=128, tn::Int=128, tk::Int=64, nruns::Int=1, warmup::I
     NVTX.@range "cuTile" begin
         for i in 1:nruns
             NVTX.@range "run $i" begin
-                t = CUDACore.@elapsed @cuda backend=cuTile blocks=grid batch_matmul_kernel(A, B, C, ct.Constant(tm), ct.Constant(tn), ct.Constant(tk))
+                t = CUDA.@elapsed @cuda backend=cuTile blocks=grid batch_matmul_kernel(A, B, C, ct.Constant(tm), ct.Constant(tn), ct.Constant(tk))
                 push!(times, t * 1000)  # ms
             end
         end
@@ -122,14 +121,14 @@ function run_others(data; nruns::Int=1, warmup::Int=0)
     C_cublas = similar(A, M, N, Batch)
 
     # cuBLAS batched gemm via CUBLAS.gemm_strided_batched!
-    CUDACore.@sync for _ in 1:warmup
+    CUDA.@sync for _ in 1:warmup
         cuBLAS.gemm_strided_batched!('N', 'N', one(eltype(A)), A, B, zero(eltype(A)), C_cublas)
     end
     times_cublas = Float64[]
     NVTX.@range "cuBLAS batched" begin
         for i in 1:nruns
             NVTX.@range "run $i" begin
-                t = CUDACore.@elapsed cuBLAS.gemm_strided_batched!('N', 'N', one(eltype(A)), A, B, zero(eltype(A)), C_cublas)
+                t = CUDA.@elapsed cuBLAS.gemm_strided_batched!('N', 'N', one(eltype(A)), A, B, zero(eltype(A)), C_cublas)
                 push!(times_cublas, t * 1000)
             end
         end
diff --git a/examples/benchmarks.jl b/examples/benchmarks.jl
index 02d94012..c2ed970c 100644
--- a/examples/benchmarks.jl
+++ b/examples/benchmarks.jl
@@ -3,7 +3,7 @@
 # Generic benchmark runner for cuTile.jl examples
 # Discovers and benchmarks all examples in the examples/ directory
 
-using CUDACore
+using CUDA
 
 #=============================================================================
  Configuration
@@ -140,7 +140,7 @@ function main(args...)
     println()
     println("Configuration:")
     println("  Runs: $NRUNS (+ $WARMUP warmup)")
-    println("  GPU: ", CUDACore.name(CUDACore.device()))
+    println("  GPU: ", CUDA.name(CUDA.device()))
 
     for name in discover_benchmarks(args...)
         println("\nBenchmarking $name...")
diff --git a/examples/fft.jl b/examples/fft.jl
index 612b317c..24b54c4e 100644
--- a/examples/fft.jl
+++ b/examples/fft.jl
@@ -8,8 +8,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-using CUDACore, NVTX
-import cuRAND, cuFFT
+using CUDA, NVTX
 using cuTile: cuTile
 import cuTile as ct
 using Test
@@ -220,7 +219,7 @@ function run(data; nruns::Int=1, warmup::Int=0)
     BS = 1
     grid = (batch ÷ BS, 1, 1)
 
-    CUDACore.@sync for _ in 1:warmup
+    CUDA.@sync for _ in 1:warmup
         @cuda backend=cuTile blocks=grid fft_kernel(x_packed, y_packed, W0_gpu, W1_gpu, W2_gpu, T0_gpu, T1_gpu, ct.Constant(N), ct.Constant(F0), ct.Constant(F1), ct.Constant(F2), ct.Constant(BS), ct.Constant(D))
     end
 
@@ -228,7 +227,7 @@ function run(data; nruns::Int=1, warmup::Int=0)
     NVTX.@range "cuTile" begin
         for i in 1:nruns
             NVTX.@range "run $i" begin
-                t = CUDACore.@elapsed @cuda backend=cuTile blocks=grid fft_kernel(x_packed, y_packed, W0_gpu, W1_gpu, W2_gpu, T0_gpu, T1_gpu, ct.Constant(N), ct.Constant(F0), ct.Constant(F1), ct.Constant(F2), ct.Constant(BS), ct.Constant(D))
+                t = CUDA.@elapsed @cuda backend=cuTile blocks=grid fft_kernel(x_packed, y_packed, W0_gpu, W1_gpu, W2_gpu, T0_gpu, T1_gpu, ct.Constant(N), ct.Constant(F0), ct.Constant(F1), ct.Constant(F2), ct.Constant(BS), ct.Constant(D))
                 push!(times, t * 1000)  # ms
             end
         end
@@ -262,7 +261,7 @@ function run_others(data; nruns::Int=1, warmup::Int=0)
     results = Dict{String, Vector{Float64}}()
 
     plan = cuFFT.plan_fft!(input, 1)
-    CUDACore.@sync for _ in 1:warmup
+    CUDA.@sync for _ in 1:warmup
         plan * copy(input)
     end
     times_cufft = Float64[]
@@ -270,7 +269,7 @@ function run_others(data; nruns::Int=1, warmup::Int=0)
         for i in 1:nruns
             NVTX.@range "run $i" begin
                 input_copy = copy(input)
-                t = CUDACore.@elapsed plan * input_copy
+                t = CUDA.@elapsed plan * input_copy
                 push!(times_cufft, t * 1000)
             end
         end
diff --git a/examples/fmha.jl b/examples/fmha.jl
index 2d0bbd8a..068442cb 100644
--- a/examples/fmha.jl
+++ b/examples/fmha.jl
@@ -11,8 +11,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-using CUDACore, NVTX
-import cuRAND
+using CUDA, NVTX
 using cuTile: cuTile
 import cuTile as ct
 
@@ -254,7 +253,7 @@ end
 function run(data; nruns::Int=1, warmup::Int=0)
     (; Q, K, V, causal, tile_m, tile_n, query_group_size) = data
 
-    CUDACore.@sync for _ in 1:warmup
+    CUDA.@sync for _ in 1:warmup
         cutile_fmha(Q, K, V; tile_m, tile_n, query_group_size, causal)
     end
 
@@ -263,7 +262,7 @@ function run(data; nruns::Int=1, warmup::Int=0)
     NVTX.@range "cuTile" begin
         for i in 1:nruns
             NVTX.@range "run $i" begin
-                t = CUDACore.@elapsed begin
+                t = CUDA.@elapsed begin
                     out = cutile_fmha(Q, K, V; tile_m, tile_n, query_group_size, causal)
                 end
                 push!(times, t * 1000)  # ms
diff --git a/examples/layernorm.jl b/examples/layernorm.jl
index b6a37d50..a769b22c 100644
--- a/examples/layernorm.jl
+++ b/examples/layernorm.jl
@@ -5,8 +5,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-using CUDACore, NVTX
-import cuRAND
+using CUDA, NVTX
 using cuTile: cuTile
 import cuTile as ct
 
@@ -252,7 +251,7 @@ function run(data; TILE_N::Int=1024, TILE_M::Int=32, nruns::Int=1, warmup::Int=0
     end
 
     # Warmup
-    CUDACore.@sync for _ in 1:warmup
+    CUDA.@sync for _ in 1:warmup
         run_fwd()
         run_bwd()
     end
@@ -262,7 +261,7 @@ function run(data; TILE_N::Int=1024, TILE_M::Int=32, nruns::Int=1, warmup::Int=0
     NVTX.@range "cuTile Fwd" begin
         for i in 1:nruns
             NVTX.@range "run $i" begin
-                t = CUDACore.@elapsed run_fwd()
+                t = CUDA.@elapsed run_fwd()
                 push!(times_fwd, t * 1000)  # ms
             end
         end
@@ -273,7 +272,7 @@ function run(data; TILE_N::Int=1024, TILE_M::Int=32, nruns::Int=1, warmup::Int=0
     NVTX.@range "cuTile Bwd" begin
         for i in 1:nruns
             NVTX.@range "run $i" begin
-                t = CUDACore.@elapsed run_bwd()
+                t = CUDA.@elapsed run_bwd()
                 push!(times_bwd, t * 1000)  # ms
             end
         end
diff --git a/examples/matmul.jl b/examples/matmul.jl
index 5c0c1e50..2ece28cb 100644
--- a/examples/matmul.jl
+++ b/examples/matmul.jl
@@ -2,8 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-using CUDACore, NVTX
-import cuRAND, cuBLAS
+using CUDA, NVTX
 using LinearAlgebra
 using cuTile: cuTile
 import cuTile as ct
@@ -87,7 +86,7 @@ function run(data; tm::Int=64, tn::Int=64, tk::Int=64, nruns::Int=1, warmup::Int
     NVTX.@range "cuTile" begin
         for i in 1:nruns
             NVTX.@range "run $i" begin
-                t = CUDACore.@elapsed @cuda backend=cuTile blocks=grid matmul_kernel(A, B, C, ct.Constant(tm), ct.Constant(tn), ct.Constant(tk))
+                t = CUDA.@elapsed @cuda backend=cuTile blocks=grid matmul_kernel(A, B, C, ct.Constant(tm), ct.Constant(tn), ct.Constant(tk))
                 push!(times, t * 1000)  # ms
             end
         end
@@ -117,14 +116,14 @@ function run_others(data; nruns::Int=1, warmup::Int=0)
     C_gpuarrays = similar(A, size(A, 1), size(B, 2))
 
     # GPUArrays (uses cuBLAS under the hood via LinearAlgebra.mul!)
-    CUDACore.@sync for _ in 1:warmup
+    CUDA.@sync for _ in 1:warmup
         mul!(C_gpuarrays, A, B)
     end
     times_gpuarrays = Float64[]
     NVTX.@range "cuBLAS" begin
         for i in 1:nruns
             NVTX.@range "run $i" begin
-                t = CUDACore.@elapsed mul!(C_gpuarrays, A, B)
+                t = CUDA.@elapsed mul!(C_gpuarrays, A, B)
                 push!(times_gpuarrays, t * 1000)
             end
         end
diff --git a/examples/moe.jl b/examples/moe.jl
index 7375a83f..99bdabb8 100644
--- a/examples/moe.jl
+++ b/examples/moe.jl
@@ -8,8 +8,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-using CUDACore, NVTX
-import cuRAND
+using CUDA, NVTX
 using Random: randperm
 using cuTile: cuTile
 import cuTile as ct
@@ -226,9 +225,9 @@ function cutile_moe(hidden_states::CuArray{T}, w1, w2, topk_weights, topk_ids,
 
     # Intermediate caches: reversed from Python for column-major
     # Python (num_tokens, topk, dim) → Julia (dim, topk, num_tokens)
-    cache1 = CUDACore.zeros(T, intermediate_size * 2, topk, num_tokens)
-    cache2 = CUDACore.zeros(T, intermediate_size, total_tokens)
-    cache3 = CUDACore.zeros(T, hidden_size, topk, num_tokens)
+    cache1 = CUDA.zeros(T, intermediate_size * 2, topk, num_tokens)
+    cache2 = CUDA.zeros(T, intermediate_size, total_tokens)
+    cache3 = CUDA.zeros(T, hidden_size, topk, num_tokens)
 
     sorted_token_ids, sorted_expert_ids = moe_align_tile_size(
         Array(topk_ids), tile_m, num_experts)
@@ -346,7 +345,7 @@ end
 function run(data; nruns::Int=1, warmup::Int=0)
     (; hidden_states, w1, w2, topk_weights, topk_ids, tile_m, tile_n, tile_k) = data
 
-    CUDACore.@sync for _ in 1:warmup
+    CUDA.@sync for _ in 1:warmup
         cutile_moe(hidden_states, w1, w2, topk_weights, topk_ids, tile_m, tile_n, tile_k)
     end
 
@@ -355,7 +354,7 @@ function run(data; nruns::Int=1, warmup::Int=0)
     NVTX.@range "cuTile" begin
         for i in 1:nruns
             NVTX.@range "run $i" begin
-                t = CUDACore.@elapsed begin
+                t = CUDA.@elapsed begin
                     out = cutile_moe(hidden_states, w1, w2, topk_weights, topk_ids,
                                      tile_m, tile_n, tile_k)
                 end
diff --git a/examples/softmax.jl b/examples/softmax.jl
index a2ff2d39..d880d618 100644
--- a/examples/softmax.jl
+++ b/examples/softmax.jl
@@ -7,8 +7,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-using CUDACore, NVTX
-import cuRAND
+using CUDA, NVTX
 using cuTile: cuTile
 import cuTile as ct
 
@@ -142,7 +141,7 @@ function run(data; tile_tma::Int=next_power_of_2(data.N),
     end
 
     # Warmup
-    CUDACore.@sync for _ in 1:warmup
+    CUDA.@sync for _ in 1:warmup
         run_tma()
         run_chunked()
     end
@@ -152,7 +151,7 @@ function run(data; tile_tma::Int=next_power_of_2(data.N),
     NVTX.@range "cuTile TMA" begin
         for i in 1:nruns
             NVTX.@range "run $i" begin
-                t = CUDACore.@elapsed run_tma()
+                t = CUDA.@elapsed run_tma()
                 push!(times_tma, t * 1000)
             end
         end
@@ -163,7 +162,7 @@ function run(data; tile_tma::Int=next_power_of_2(data.N),
     NVTX.@range "cuTile Chunked" begin
         for i in 1:nruns
             NVTX.@range "run $i" begin
-                t = CUDACore.@elapsed run_chunked()
+                t = CUDA.@elapsed run_chunked()
                 push!(times_chunked, t * 1000)
             end
         end
@@ -215,14 +214,14 @@ function run_others(data; nruns::Int=1, warmup::Int=0)
         out .= exps ./ sum(exps; dims=1)
     end
 
-    CUDACore.@sync for _ in 1:warmup
+    CUDA.@sync for _ in 1:warmup
         gpu_softmax!()
     end
     times = Float64[]
     NVTX.@range "GPUArrays" begin
         for i in 1:nruns
             NVTX.@range "run $i" begin
-                t = CUDACore.@elapsed gpu_softmax!()
+                t = CUDA.@elapsed gpu_softmax!()
                 push!(times, t * 1000)
             end
         end
diff --git a/examples/transpose.jl b/examples/transpose.jl
index 7762a0fc..3130dc42 100644
--- a/examples/transpose.jl
+++ b/examples/transpose.jl
@@ -2,8 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-using CUDACore, NVTX
-import cuRAND
+using CUDA, NVTX
 using cuTile: cuTile
 import cuTile as ct
 
@@ -39,7 +38,7 @@ function run(data; tm::Int=64, tn::Int=64, nruns::Int=1, warmup::Int=0)
     (; x, y, m, n) = data
     grid = (cld(m, tm), cld(n, tn))
 
-    CUDACore.@sync for _ in 1:warmup
+    CUDA.@sync for _ in 1:warmup
         @cuda backend=cuTile blocks=grid transpose_kernel(x, y, ct.Constant(tm), ct.Constant(tn))
     end
 
@@ -47,7 +46,7 @@ function run(data; tm::Int=64, tn::Int=64, nruns::Int=1, warmup::Int=0)
     NVTX.@range "cuTile" begin
         for i in 1:nruns
             NVTX.@range "run $i" begin
-                t = CUDACore.@elapsed @cuda backend=cuTile blocks=grid transpose_kernel(x, y, ct.Constant(tm), ct.Constant(tn))
+                t = CUDA.@elapsed @cuda backend=cuTile blocks=grid transpose_kernel(x, y, ct.Constant(tm), ct.Constant(tn))
                 push!(times, t * 1000)  # ms
             end
         end
@@ -88,14 +87,14 @@ function run_others(data; nruns::Int=1, warmup::Int=0)
     y_simt = similar(x, n, m)
 
     # GPUArrays (permutedims)
-    CUDACore.@sync for _ in 1:warmup
+    CUDA.@sync for _ in 1:warmup
         permutedims!(y_gpuarrays, x, (2, 1))
     end
     times_gpuarrays = Float64[]
     NVTX.@range "GPUArrays" begin
         for i in 1:nruns
             NVTX.@range "run $i" begin
-                t = CUDACore.@elapsed permutedims!(y_gpuarrays, x, (2, 1))
+                t = CUDA.@elapsed permutedims!(y_gpuarrays, x, (2, 1))
                 push!(times_gpuarrays, t * 1000)
             end
         end
@@ -105,14 +104,14 @@ function run_others(data; nruns::Int=1, warmup::Int=0)
     # SIMT naive kernel
     threads = (16, 16)
     blocks = (cld(m, threads[1]), cld(n, threads[2]))
-    CUDACore.@sync for _ in 1:warmup
+    CUDA.@sync for _ in 1:warmup
         @cuda threads=threads blocks=blocks simt_naive_kernel(x, y_simt, m, n)
     end
     times_simt = Float64[]
     NVTX.@range "SIMT naive" begin
         for i in 1:nruns
             NVTX.@range "run $i" begin
-                t = CUDACore.@elapsed @cuda threads=threads blocks=blocks simt_naive_kernel(x, y_simt, m, n)
+                t = CUDA.@elapsed @cuda threads=threads blocks=blocks simt_naive_kernel(x, y_simt, m, n)
                 push!(times_simt, t * 1000)
             end
         end
diff --git a/examples/vadd.jl b/examples/vadd.jl
index b5d68853..4180c895 100644
--- a/examples/vadd.jl
+++ b/examples/vadd.jl
@@ -2,8 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-using CUDACore, NVTX
-import cuRAND
+using CUDA, NVTX
 using cuTile: cuTile
 import cuTile as ct
 
@@ -72,7 +71,7 @@ function run(data; tile::Union{Int, Tuple{Int,Int}}=1024, nruns::Int=1, warmup::
         tile_x, tile_y = tile isa Tuple ? tile : (tile, tile)
         grid = (cld(m, tile_x), cld(n, tile_y))
 
-        CUDACore.@sync for _ in 1:warmup
+        CUDA.@sync for _ in 1:warmup
             @cuda backend=cuTile blocks=grid vec_add_kernel_2d(a, b, c, ct.Constant(tile_x), ct.Constant(tile_y))
         end
 
@@ -80,7 +79,7 @@ function run(data; tile::Union{Int, Tuple{Int,Int}}=1024, nruns::Int=1, warmup::
         NVTX.@range "cuTile" begin
             for i in 1:nruns
                 NVTX.@range "run $i" begin
-                    t = CUDACore.@elapsed @cuda backend=cuTile blocks=grid vec_add_kernel_2d(a, b, c, ct.Constant(tile_x), ct.Constant(tile_y))
+                    t = CUDA.@elapsed @cuda backend=cuTile blocks=grid vec_add_kernel_2d(a, b, c, ct.Constant(tile_x), ct.Constant(tile_y))
                     push!(times, t * 1000)  # ms
                 end
             end
@@ -92,7 +91,7 @@ function run(data; tile::Union{Int, Tuple{Int,Int}}=1024, nruns::Int=1, warmup::
         grid = cld(n, tile_val)
 
         if use_gather
-            CUDACore.@sync for _ in 1:warmup
+            CUDA.@sync for _ in 1:warmup
                 @cuda backend=cuTile blocks=grid vec_add_kernel_1d_gather(a, b, c, ct.Constant(tile_val))
             end
 
@@ -100,13 +99,13 @@ function run(data; tile::Union{Int, Tuple{Int,Int}}=1024, nruns::Int=1, warmup::
             NVTX.@range "cuTile" begin
                 for i in 1:nruns
                     NVTX.@range "run $i" begin
-                        t = CUDACore.@elapsed @cuda backend=cuTile blocks=grid vec_add_kernel_1d_gather(a, b, c, ct.Constant(tile_val))
+                        t = CUDA.@elapsed @cuda backend=cuTile blocks=grid vec_add_kernel_1d_gather(a, b, c, ct.Constant(tile_val))
                         push!(times, t * 1000)  # ms
                     end
                 end
             end
         else
-            CUDACore.@sync for _ in 1:warmup
+            CUDA.@sync for _ in 1:warmup
                 @cuda backend=cuTile blocks=grid vec_add_kernel_1d(a, b, c, ct.Constant(tile_val))
             end
 
@@ -114,7 +113,7 @@ function run(data; tile::Union{Int, Tuple{Int,Int}}=1024, nruns::Int=1, warmup::
             NVTX.@range "cuTile" begin
                 for i in 1:nruns
                     NVTX.@range "run $i" begin
-                        t = CUDACore.@elapsed @cuda backend=cuTile blocks=grid vec_add_kernel_1d(a, b, c, ct.Constant(tile_val))
+                        t = CUDA.@elapsed @cuda backend=cuTile blocks=grid vec_add_kernel_1d(a, b, c, ct.Constant(tile_val))
                         push!(times, t * 1000)  # ms
                     end
                 end
@@ -160,14 +159,14 @@ function run_others(data; nruns::Int=1, warmup::Int=0)
         c_simt = similar(c)
 
         # GPUArrays (broadcasting)
-        CUDACore.@sync for _ in 1:warmup
+        CUDA.@sync for _ in 1:warmup
             c_gpuarrays .= a .+ b
         end
         times_gpuarrays = Float64[]
         NVTX.@range "GPUArrays" begin
             for i in 1:nruns
                 NVTX.@range "run $i" begin
-                    t = CUDACore.@elapsed c_gpuarrays .= a .+ b
+                    t = CUDA.@elapsed c_gpuarrays .= a .+ b
                     push!(times_gpuarrays, t * 1000)
                 end
             end
@@ -177,14 +176,14 @@ function run_others(data; nruns::Int=1, warmup::Int=0)
         # SIMT kernel
         threads = 256
         blocks = cld(n, threads)
-        CUDACore.@sync for _ in 1:warmup
+        CUDA.@sync for _ in 1:warmup
             @cuda threads=threads blocks=blocks simt_kernel(a, b, c_simt, n)
         end
         times_simt = Float64[]
         NVTX.@range "SIMT" begin
             for i in 1:nruns
                 NVTX.@range "run $i" begin
-                    t = CUDACore.@elapsed @cuda threads=threads blocks=blocks simt_kernel(a, b, c_simt, n)
+                    t = CUDA.@elapsed @cuda threads=threads blocks=blocks simt_kernel(a, b, c_simt, n)
                     push!(times_simt, t * 1000)
                 end
             end
diff --git a/test/Project.toml b/test/Project.toml
index bc83556a..2e05a743 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -1,12 +1,14 @@
 [deps]
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 DLFP8Types = "f4c16678-4a16-415b-82ef-ed337c5d6c7c"
+FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
 FileCheck = "4e644321-382b-4b05-b0b6-5d23c3d944fb"
 IRStructurizer = "93e32bba-5bb8-402b-805d-ffb066edee93"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Microfloats = "31c70f10-a750-4521-b13c-797315ae2933"
 NVML = "611af6d1-644e-4c5d-bd58-854d7d1254b9"
+NVTX = "5da4648a-3479-48b8-97b9-01cb529c0a1f"
 ParallelTestRunner = "d3525ed8-44d0-4b2c-a655-542cee43accc"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
@@ -19,4 +21,5 @@ cuTile = "0dea8319-8c4a-4662-a73d-20234d115b9a"
 CUDA = "6.1"
 FileCheck = "1.0"
 NVML = "6.1"
+NVTX = "1.0"
 ParallelTestRunner = "2.0"
diff --git a/test/runtests.jl b/test/runtests.jl
index 6f8fe4b3..f74e92eb 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -24,7 +24,7 @@ Pkg.precompile()
 testsuite = find_tests(@__DIR__)
 delete!(testsuite, "setup")
 
-# Add examples to the test suite (requires workspaces, a Julia 1.12+ feature)
+# Add examples to the test suite (only on Julia 1.12+, where the examples are supported)
 examples_root = joinpath(@__DIR__, "..", "examples")
 if VERSION >= v"1.12"
     for (name, body) in find_tests(examples_root)
@@ -33,15 +33,9 @@ if VERSION >= v"1.12"
         dir = dirname(path)
         testsuite["examples/$name"] = quote
             cd($dir) do
-                project = Base.active_project()
-                Base.set_active_project($dir)
-                try
-                    redirect_stdout(devnull) do
-                        $body
-                        @eval main()
-                    end
-                finally
-                    Base.set_active_project(project)
+                redirect_stdout(devnull) do
+                    $body
+                    @eval main()
                 end
             end
         end