From 34d4c774647d8ef7f18a8422578a8629576f10c1 Mon Sep 17 00:00:00 2001 From: AntonOresten Date: Fri, 29 May 2026 21:56:14 +0200 Subject: [PATCH] Make test/Project.toml superset of `examples/Project.toml` --- examples/Project.toml | 5 +---- examples/batchmatmul.jl | 11 +++++------ examples/benchmarks.jl | 4 ++-- examples/fft.jl | 11 +++++------ examples/fmha.jl | 7 +++---- examples/layernorm.jl | 9 ++++----- examples/matmul.jl | 9 ++++----- examples/moe.jl | 13 ++++++------- examples/softmax.jl | 13 ++++++------- examples/transpose.jl | 15 +++++++-------- examples/vadd.jl | 23 +++++++++++------------ test/Project.toml | 3 +++ test/runtests.jl | 14 ++++---------- 13 files changed, 61 insertions(+), 76 deletions(-) diff --git a/examples/Project.toml b/examples/Project.toml index e2bda4e7..6d20616b 100644 --- a/examples/Project.toml +++ b/examples/Project.toml @@ -1,11 +1,8 @@ [deps] -CUDACore = "bd0ed864-bdfe-4181-a5ed-ce625a5fdea2" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" NVTX = "5da4648a-3479-48b8-97b9-01cb529c0a1f" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -cuBLAS = "182d3088-87b7-4494-8cad-fc6afaa545bc" -cuFFT = "533571aa-0936-420e-b4be-9c66f5f626ca" -cuRAND = "20fd9a0b-12d5-4c2f-a8af-7c34e9e60431" cuTile = "0dea8319-8c4a-4662-a73d-20234d115b9a" [sources] diff --git a/examples/batchmatmul.jl b/examples/batchmatmul.jl index 5cfb595a..fe8779d4 100644 --- a/examples/batchmatmul.jl +++ b/examples/batchmatmul.jl @@ -5,8 +5,7 @@ # Uses Julia-idiomatic batch-last ordering: A(M, K, Batch), B(K, N, Batch), C(M, N, Batch) # This provides optimal memory access with Julia's column-major layout. -using CUDACore, NVTX -import cuRAND, cuBLAS +using CUDA, NVTX using cuTile: cuTile import cuTile as ct @@ -78,7 +77,7 @@ function run(data; tm::Int=128, tn::Int=128, tk::Int=64, nruns::Int=1, warmup::I (; A, B, C, M, N, Batch) = data grid = (cld(M, tm), cld(N, tn), Batch) - CUDACore.@sync for _ in 1:warmup + CUDA.@sync for _ in 1:warmup @cuda backend=cuTile blocks=grid batch_matmul_kernel(A, B, C, ct.Constant(tm), ct.Constant(tn), ct.Constant(tk)) end @@ -86,7 +85,7 @@ function run(data; tm::Int=128, tn::Int=128, tk::Int=64, nruns::Int=1, warmup::I NVTX.@range "cuTile" begin for i in 1:nruns NVTX.@range "run $i" begin - t = CUDACore.@elapsed @cuda backend=cuTile blocks=grid batch_matmul_kernel(A, B, C, ct.Constant(tm), ct.Constant(tn), ct.Constant(tk)) + t = CUDA.@elapsed @cuda backend=cuTile blocks=grid batch_matmul_kernel(A, B, C, ct.Constant(tm), ct.Constant(tn), ct.Constant(tk)) push!(times, t * 1000) # ms end end @@ -122,14 +121,14 @@ function run_others(data; nruns::Int=1, warmup::Int=0) C_cublas = similar(A, M, N, Batch) # cuBLAS batched gemm via CUBLAS.gemm_strided_batched! - CUDACore.@sync for _ in 1:warmup + CUDA.@sync for _ in 1:warmup cuBLAS.gemm_strided_batched!('N', 'N', one(eltype(A)), A, B, zero(eltype(A)), C_cublas) end times_cublas = Float64[] NVTX.@range "cuBLAS batched" begin for i in 1:nruns NVTX.@range "run $i" begin - t = CUDACore.@elapsed cuBLAS.gemm_strided_batched!('N', 'N', one(eltype(A)), A, B, zero(eltype(A)), C_cublas) + t = CUDA.@elapsed cuBLAS.gemm_strided_batched!('N', 'N', one(eltype(A)), A, B, zero(eltype(A)), C_cublas) push!(times_cublas, t * 1000) end end diff --git a/examples/benchmarks.jl b/examples/benchmarks.jl index 02d94012..c2ed970c 100644 --- a/examples/benchmarks.jl +++ b/examples/benchmarks.jl @@ -3,7 +3,7 @@ # Generic benchmark runner for cuTile.jl examples # Discovers and benchmarks all examples in the examples/ directory -using CUDACore +using CUDA #============================================================================= Configuration @@ -140,7 +140,7 @@ function main(args...) println() println("Configuration:") println(" Runs: $NRUNS (+ $WARMUP warmup)") - println(" GPU: ", CUDACore.name(CUDACore.device())) + println(" GPU: ", CUDA.name(CUDA.device())) for name in discover_benchmarks(args...) println("\nBenchmarking $name...") diff --git a/examples/fft.jl b/examples/fft.jl index 612b317c..24b54c4e 100644 --- a/examples/fft.jl +++ b/examples/fft.jl @@ -8,8 +8,7 @@ # # SPDX-License-Identifier: Apache-2.0 -using CUDACore, NVTX -import cuRAND, cuFFT +using CUDA, NVTX using cuTile: cuTile import cuTile as ct using Test @@ -220,7 +219,7 @@ function run(data; nruns::Int=1, warmup::Int=0) BS = 1 grid = (batch ÷ BS, 1, 1) - CUDACore.@sync for _ in 1:warmup + CUDA.@sync for _ in 1:warmup @cuda backend=cuTile blocks=grid fft_kernel(x_packed, y_packed, W0_gpu, W1_gpu, W2_gpu, T0_gpu, T1_gpu, ct.Constant(N), ct.Constant(F0), ct.Constant(F1), ct.Constant(F2), ct.Constant(BS), ct.Constant(D)) end @@ -228,7 +227,7 @@ function run(data; nruns::Int=1, warmup::Int=0) NVTX.@range "cuTile" begin for i in 1:nruns NVTX.@range "run $i" begin - t = CUDACore.@elapsed @cuda backend=cuTile blocks=grid fft_kernel(x_packed, y_packed, W0_gpu, W1_gpu, W2_gpu, T0_gpu, T1_gpu, ct.Constant(N), ct.Constant(F0), ct.Constant(F1), ct.Constant(F2), ct.Constant(BS), ct.Constant(D)) + t = CUDA.@elapsed @cuda backend=cuTile blocks=grid fft_kernel(x_packed, y_packed, W0_gpu, W1_gpu, W2_gpu, T0_gpu, T1_gpu, ct.Constant(N), ct.Constant(F0), ct.Constant(F1), ct.Constant(F2), ct.Constant(BS), ct.Constant(D)) push!(times, t * 1000) # ms end end @@ -262,7 +261,7 @@ function run_others(data; nruns::Int=1, warmup::Int=0) results = Dict{String, Vector{Float64}}() plan = cuFFT.plan_fft!(input, 1) - CUDACore.@sync for _ in 1:warmup + CUDA.@sync for _ in 1:warmup plan * copy(input) end times_cufft = Float64[] @@ -270,7 +269,7 @@ function run_others(data; nruns::Int=1, warmup::Int=0) for i in 1:nruns NVTX.@range "run $i" begin input_copy = copy(input) - t = CUDACore.@elapsed plan * input_copy + t = CUDA.@elapsed plan * input_copy push!(times_cufft, t * 1000) end end diff --git a/examples/fmha.jl b/examples/fmha.jl index 2d0bbd8a..068442cb 100644 --- a/examples/fmha.jl +++ b/examples/fmha.jl @@ -11,8 +11,7 @@ # # SPDX-License-Identifier: Apache-2.0 -using CUDACore, NVTX -import cuRAND +using CUDA, NVTX using cuTile: cuTile import cuTile as ct @@ -254,7 +253,7 @@ end function run(data; nruns::Int=1, warmup::Int=0) (; Q, K, V, causal, tile_m, tile_n, query_group_size) = data - CUDACore.@sync for _ in 1:warmup + CUDA.@sync for _ in 1:warmup cutile_fmha(Q, K, V; tile_m, tile_n, query_group_size, causal) end @@ -263,7 +262,7 @@ function run(data; nruns::Int=1, warmup::Int=0) NVTX.@range "cuTile" begin for i in 1:nruns NVTX.@range "run $i" begin - t = CUDACore.@elapsed begin + t = CUDA.@elapsed begin out = cutile_fmha(Q, K, V; tile_m, tile_n, query_group_size, causal) end push!(times, t * 1000) # ms diff --git a/examples/layernorm.jl b/examples/layernorm.jl index b6a37d50..a769b22c 100644 --- a/examples/layernorm.jl +++ b/examples/layernorm.jl @@ -5,8 +5,7 @@ # # SPDX-License-Identifier: Apache-2.0 -using CUDACore, NVTX -import cuRAND +using CUDA, NVTX using cuTile: cuTile import cuTile as ct @@ -252,7 +251,7 @@ function run(data; TILE_N::Int=1024, TILE_M::Int=32, nruns::Int=1, warmup::Int=0 end # Warmup - CUDACore.@sync for _ in 1:warmup + CUDA.@sync for _ in 1:warmup run_fwd() run_bwd() end @@ -262,7 +261,7 @@ function run(data; TILE_N::Int=1024, TILE_M::Int=32, nruns::Int=1, warmup::Int=0 NVTX.@range "cuTile Fwd" begin for i in 1:nruns NVTX.@range "run $i" begin - t = CUDACore.@elapsed run_fwd() + t = CUDA.@elapsed run_fwd() push!(times_fwd, t * 1000) # ms end end @@ -273,7 +272,7 @@ function run(data; TILE_N::Int=1024, TILE_M::Int=32, nruns::Int=1, warmup::Int=0 NVTX.@range "cuTile Bwd" begin for i in 1:nruns NVTX.@range "run $i" begin - t = CUDACore.@elapsed run_bwd() + t = CUDA.@elapsed run_bwd() push!(times_bwd, t * 1000) # ms end end diff --git a/examples/matmul.jl b/examples/matmul.jl index 5c0c1e50..2ece28cb 100644 --- a/examples/matmul.jl +++ b/examples/matmul.jl @@ -2,8 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 -using CUDACore, NVTX -import cuRAND, cuBLAS +using CUDA, NVTX using LinearAlgebra using cuTile: cuTile import cuTile as ct @@ -87,7 +86,7 @@ function run(data; tm::Int=64, tn::Int=64, tk::Int=64, nruns::Int=1, warmup::Int NVTX.@range "cuTile" begin for i in 1:nruns NVTX.@range "run $i" begin - t = CUDACore.@elapsed @cuda backend=cuTile blocks=grid matmul_kernel(A, B, C, ct.Constant(tm), ct.Constant(tn), ct.Constant(tk)) + t = CUDA.@elapsed @cuda backend=cuTile blocks=grid matmul_kernel(A, B, C, ct.Constant(tm), ct.Constant(tn), ct.Constant(tk)) push!(times, t * 1000) # ms end end @@ -117,14 +116,14 @@ function run_others(data; nruns::Int=1, warmup::Int=0) C_gpuarrays = similar(A, size(A, 1), size(B, 2)) # GPUArrays (uses cuBLAS under the hood via LinearAlgebra.mul!) - CUDACore.@sync for _ in 1:warmup + CUDA.@sync for _ in 1:warmup mul!(C_gpuarrays, A, B) end times_gpuarrays = Float64[] NVTX.@range "cuBLAS" begin for i in 1:nruns NVTX.@range "run $i" begin - t = CUDACore.@elapsed mul!(C_gpuarrays, A, B) + t = CUDA.@elapsed mul!(C_gpuarrays, A, B) push!(times_gpuarrays, t * 1000) end end diff --git a/examples/moe.jl b/examples/moe.jl index 7375a83f..99bdabb8 100644 --- a/examples/moe.jl +++ b/examples/moe.jl @@ -8,8 +8,7 @@ # # SPDX-License-Identifier: Apache-2.0 -using CUDACore, NVTX -import cuRAND +using CUDA, NVTX using Random: randperm using cuTile: cuTile import cuTile as ct @@ -226,9 +225,9 @@ function cutile_moe(hidden_states::CuArray{T}, w1, w2, topk_weights, topk_ids, # Intermediate caches: reversed from Python for column-major # Python (num_tokens, topk, dim) → Julia (dim, topk, num_tokens) - cache1 = CUDACore.zeros(T, intermediate_size * 2, topk, num_tokens) - cache2 = CUDACore.zeros(T, intermediate_size, total_tokens) - cache3 = CUDACore.zeros(T, hidden_size, topk, num_tokens) + cache1 = CUDA.zeros(T, intermediate_size * 2, topk, num_tokens) + cache2 = CUDA.zeros(T, intermediate_size, total_tokens) + cache3 = CUDA.zeros(T, hidden_size, topk, num_tokens) sorted_token_ids, sorted_expert_ids = moe_align_tile_size( Array(topk_ids), tile_m, num_experts) @@ -346,7 +345,7 @@ end function run(data; nruns::Int=1, warmup::Int=0) (; hidden_states, w1, w2, topk_weights, topk_ids, tile_m, tile_n, tile_k) = data - CUDACore.@sync for _ in 1:warmup + CUDA.@sync for _ in 1:warmup cutile_moe(hidden_states, w1, w2, topk_weights, topk_ids, tile_m, tile_n, tile_k) end @@ -355,7 +354,7 @@ function run(data; nruns::Int=1, warmup::Int=0) NVTX.@range "cuTile" begin for i in 1:nruns NVTX.@range "run $i" begin - t = CUDACore.@elapsed begin + t = CUDA.@elapsed begin out = cutile_moe(hidden_states, w1, w2, topk_weights, topk_ids, tile_m, tile_n, tile_k) end diff --git a/examples/softmax.jl b/examples/softmax.jl index a2ff2d39..d880d618 100644 --- a/examples/softmax.jl +++ b/examples/softmax.jl @@ -7,8 +7,7 @@ # # SPDX-License-Identifier: Apache-2.0 -using CUDACore, NVTX -import cuRAND +using CUDA, NVTX using cuTile: cuTile import cuTile as ct @@ -142,7 +141,7 @@ function run(data; tile_tma::Int=next_power_of_2(data.N), end # Warmup - CUDACore.@sync for _ in 1:warmup + CUDA.@sync for _ in 1:warmup run_tma() run_chunked() end @@ -152,7 +151,7 @@ function run(data; tile_tma::Int=next_power_of_2(data.N), NVTX.@range "cuTile TMA" begin for i in 1:nruns NVTX.@range "run $i" begin - t = CUDACore.@elapsed run_tma() + t = CUDA.@elapsed run_tma() push!(times_tma, t * 1000) end end @@ -163,7 +162,7 @@ function run(data; tile_tma::Int=next_power_of_2(data.N), NVTX.@range "cuTile Chunked" begin for i in 1:nruns NVTX.@range "run $i" begin - t = CUDACore.@elapsed run_chunked() + t = CUDA.@elapsed run_chunked() push!(times_chunked, t * 1000) end end @@ -215,14 +214,14 @@ function run_others(data; nruns::Int=1, warmup::Int=0) out .= exps ./ sum(exps; dims=1) end - CUDACore.@sync for _ in 1:warmup + CUDA.@sync for _ in 1:warmup gpu_softmax!() end times = Float64[] NVTX.@range "GPUArrays" begin for i in 1:nruns NVTX.@range "run $i" begin - t = CUDACore.@elapsed gpu_softmax!() + t = CUDA.@elapsed gpu_softmax!() push!(times, t * 1000) end end diff --git a/examples/transpose.jl b/examples/transpose.jl index 7762a0fc..3130dc42 100644 --- a/examples/transpose.jl +++ b/examples/transpose.jl @@ -2,8 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 -using CUDACore, NVTX -import cuRAND +using CUDA, NVTX using cuTile: cuTile import cuTile as ct @@ -39,7 +38,7 @@ function run(data; tm::Int=64, tn::Int=64, nruns::Int=1, warmup::Int=0) (; x, y, m, n) = data grid = (cld(m, tm), cld(n, tn)) - CUDACore.@sync for _ in 1:warmup + CUDA.@sync for _ in 1:warmup @cuda backend=cuTile blocks=grid transpose_kernel(x, y, ct.Constant(tm), ct.Constant(tn)) end @@ -47,7 +46,7 @@ function run(data; tm::Int=64, tn::Int=64, nruns::Int=1, warmup::Int=0) NVTX.@range "cuTile" begin for i in 1:nruns NVTX.@range "run $i" begin - t = CUDACore.@elapsed @cuda backend=cuTile blocks=grid transpose_kernel(x, y, ct.Constant(tm), ct.Constant(tn)) + t = CUDA.@elapsed @cuda backend=cuTile blocks=grid transpose_kernel(x, y, ct.Constant(tm), ct.Constant(tn)) push!(times, t * 1000) # ms end end @@ -88,14 +87,14 @@ function run_others(data; nruns::Int=1, warmup::Int=0) y_simt = similar(x, n, m) # GPUArrays (permutedims) - CUDACore.@sync for _ in 1:warmup + CUDA.@sync for _ in 1:warmup permutedims!(y_gpuarrays, x, (2, 1)) end times_gpuarrays = Float64[] NVTX.@range "GPUArrays" begin for i in 1:nruns NVTX.@range "run $i" begin - t = CUDACore.@elapsed permutedims!(y_gpuarrays, x, (2, 1)) + t = CUDA.@elapsed permutedims!(y_gpuarrays, x, (2, 1)) push!(times_gpuarrays, t * 1000) end end @@ -105,14 +104,14 @@ function run_others(data; nruns::Int=1, warmup::Int=0) # SIMT naive kernel threads = (16, 16) blocks = (cld(m, threads[1]), cld(n, threads[2])) - CUDACore.@sync for _ in 1:warmup + CUDA.@sync for _ in 1:warmup @cuda threads=threads blocks=blocks simt_naive_kernel(x, y_simt, m, n) end times_simt = Float64[] NVTX.@range "SIMT naive" begin for i in 1:nruns NVTX.@range "run $i" begin - t = CUDACore.@elapsed @cuda threads=threads blocks=blocks simt_naive_kernel(x, y_simt, m, n) + t = CUDA.@elapsed @cuda threads=threads blocks=blocks simt_naive_kernel(x, y_simt, m, n) push!(times_simt, t * 1000) end end diff --git a/examples/vadd.jl b/examples/vadd.jl index b5d68853..4180c895 100644 --- a/examples/vadd.jl +++ b/examples/vadd.jl @@ -2,8 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 -using CUDACore, NVTX -import cuRAND +using CUDA, NVTX using cuTile: cuTile import cuTile as ct @@ -72,7 +71,7 @@ function run(data; tile::Union{Int, Tuple{Int,Int}}=1024, nruns::Int=1, warmup:: tile_x, tile_y = tile isa Tuple ? tile : (tile, tile) grid = (cld(m, tile_x), cld(n, tile_y)) - CUDACore.@sync for _ in 1:warmup + CUDA.@sync for _ in 1:warmup @cuda backend=cuTile blocks=grid vec_add_kernel_2d(a, b, c, ct.Constant(tile_x), ct.Constant(tile_y)) end @@ -80,7 +79,7 @@ function run(data; tile::Union{Int, Tuple{Int,Int}}=1024, nruns::Int=1, warmup:: NVTX.@range "cuTile" begin for i in 1:nruns NVTX.@range "run $i" begin - t = CUDACore.@elapsed @cuda backend=cuTile blocks=grid vec_add_kernel_2d(a, b, c, ct.Constant(tile_x), ct.Constant(tile_y)) + t = CUDA.@elapsed @cuda backend=cuTile blocks=grid vec_add_kernel_2d(a, b, c, ct.Constant(tile_x), ct.Constant(tile_y)) push!(times, t * 1000) # ms end end @@ -92,7 +91,7 @@ function run(data; tile::Union{Int, Tuple{Int,Int}}=1024, nruns::Int=1, warmup:: grid = cld(n, tile_val) if use_gather - CUDACore.@sync for _ in 1:warmup + CUDA.@sync for _ in 1:warmup @cuda backend=cuTile blocks=grid vec_add_kernel_1d_gather(a, b, c, ct.Constant(tile_val)) end @@ -100,13 +99,13 @@ function run(data; tile::Union{Int, Tuple{Int,Int}}=1024, nruns::Int=1, warmup:: NVTX.@range "cuTile" begin for i in 1:nruns NVTX.@range "run $i" begin - t = CUDACore.@elapsed @cuda backend=cuTile blocks=grid vec_add_kernel_1d_gather(a, b, c, ct.Constant(tile_val)) + t = CUDA.@elapsed @cuda backend=cuTile blocks=grid vec_add_kernel_1d_gather(a, b, c, ct.Constant(tile_val)) push!(times, t * 1000) # ms end end end else - CUDACore.@sync for _ in 1:warmup + CUDA.@sync for _ in 1:warmup @cuda backend=cuTile blocks=grid vec_add_kernel_1d(a, b, c, ct.Constant(tile_val)) end @@ -114,7 +113,7 @@ function run(data; tile::Union{Int, Tuple{Int,Int}}=1024, nruns::Int=1, warmup:: NVTX.@range "cuTile" begin for i in 1:nruns NVTX.@range "run $i" begin - t = CUDACore.@elapsed @cuda backend=cuTile blocks=grid vec_add_kernel_1d(a, b, c, ct.Constant(tile_val)) + t = CUDA.@elapsed @cuda backend=cuTile blocks=grid vec_add_kernel_1d(a, b, c, ct.Constant(tile_val)) push!(times, t * 1000) # ms end end @@ -160,14 +159,14 @@ function run_others(data; nruns::Int=1, warmup::Int=0) c_simt = similar(c) # GPUArrays (broadcasting) - CUDACore.@sync for _ in 1:warmup + CUDA.@sync for _ in 1:warmup c_gpuarrays .= a .+ b end times_gpuarrays = Float64[] NVTX.@range "GPUArrays" begin for i in 1:nruns NVTX.@range "run $i" begin - t = CUDACore.@elapsed c_gpuarrays .= a .+ b + t = CUDA.@elapsed c_gpuarrays .= a .+ b push!(times_gpuarrays, t * 1000) end end @@ -177,14 +176,14 @@ function run_others(data; nruns::Int=1, warmup::Int=0) # SIMT kernel threads = 256 blocks = cld(n, threads) - CUDACore.@sync for _ in 1:warmup + CUDA.@sync for _ in 1:warmup @cuda threads=threads blocks=blocks simt_kernel(a, b, c_simt, n) end times_simt = Float64[] NVTX.@range "SIMT" begin for i in 1:nruns NVTX.@range "run $i" begin - t = CUDACore.@elapsed @cuda threads=threads blocks=blocks simt_kernel(a, b, c_simt, n) + t = CUDA.@elapsed @cuda threads=threads blocks=blocks simt_kernel(a, b, c_simt, n) push!(times_simt, t * 1000) end end diff --git a/test/Project.toml b/test/Project.toml index bc83556a..2e05a743 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,12 +1,14 @@ [deps] CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" DLFP8Types = "f4c16678-4a16-415b-82ef-ed337c5d6c7c" +FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" FileCheck = "4e644321-382b-4b05-b0b6-5d23c3d944fb" IRStructurizer = "93e32bba-5bb8-402b-805d-ffb066edee93" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Microfloats = "31c70f10-a750-4521-b13c-797315ae2933" NVML = "611af6d1-644e-4c5d-bd58-854d7d1254b9" +NVTX = "5da4648a-3479-48b8-97b9-01cb529c0a1f" ParallelTestRunner = "d3525ed8-44d0-4b2c-a655-542cee43accc" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" @@ -19,4 +21,5 @@ cuTile = "0dea8319-8c4a-4662-a73d-20234d115b9a" CUDA = "6.1" FileCheck = "1.0" NVML = "6.1" +NVTX = "1.0" ParallelTestRunner = "2.0" diff --git a/test/runtests.jl b/test/runtests.jl index 6f8fe4b3..f74e92eb 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -24,7 +24,7 @@ Pkg.precompile() testsuite = find_tests(@__DIR__) delete!(testsuite, "setup") -# Add examples to the test suite (requires workspaces, a Julia 1.12+ feature) +# Add examples to the test suite (only on Julia 1.12+, where they're supported) examples_root = joinpath(@__DIR__, "..", "examples") if VERSION >= v"1.12" for (name, body) in find_tests(examples_root) @@ -33,15 +33,9 @@ if VERSION >= v"1.12" dir = dirname(path) testsuite["examples/$name"] = quote cd($dir) do - project = Base.active_project() - Base.set_active_project($dir) - try - redirect_stdout(devnull) do - $body - @eval main() - end - finally - Base.set_active_project(project) + redirect_stdout(devnull) do + $body + @eval main() end end end