Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 1 addition & 4 deletions examples/Project.toml
Original file line number Diff line number Diff line change
@@ -1,11 +1,8 @@
[deps]
CUDACore = "bd0ed864-bdfe-4181-a5ed-ce625a5fdea2"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
NVTX = "5da4648a-3479-48b8-97b9-01cb529c0a1f"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
cuBLAS = "182d3088-87b7-4494-8cad-fc6afaa545bc"
cuFFT = "533571aa-0936-420e-b4be-9c66f5f626ca"
cuRAND = "20fd9a0b-12d5-4c2f-a8af-7c34e9e60431"
cuTile = "0dea8319-8c4a-4662-a73d-20234d115b9a"

[sources]
Expand Down
11 changes: 5 additions & 6 deletions examples/batchmatmul.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@
# Uses Julia-idiomatic batch-last ordering: A(M, K, Batch), B(K, N, Batch), C(M, N, Batch)
# This provides optimal memory access with Julia's column-major layout.

using CUDACore, NVTX
import cuRAND, cuBLAS
using CUDA, NVTX
using cuTile: cuTile
import cuTile as ct

Expand Down Expand Up @@ -78,15 +77,15 @@ function run(data; tm::Int=128, tn::Int=128, tk::Int=64, nruns::Int=1, warmup::I
(; A, B, C, M, N, Batch) = data
grid = (cld(M, tm), cld(N, tn), Batch)

CUDACore.@sync for _ in 1:warmup
CUDA.@sync for _ in 1:warmup
@cuda backend=cuTile blocks=grid batch_matmul_kernel(A, B, C, ct.Constant(tm), ct.Constant(tn), ct.Constant(tk))
end

times = Float64[]
NVTX.@range "cuTile" begin
for i in 1:nruns
NVTX.@range "run $i" begin
t = CUDACore.@elapsed @cuda backend=cuTile blocks=grid batch_matmul_kernel(A, B, C, ct.Constant(tm), ct.Constant(tn), ct.Constant(tk))
t = CUDA.@elapsed @cuda backend=cuTile blocks=grid batch_matmul_kernel(A, B, C, ct.Constant(tm), ct.Constant(tn), ct.Constant(tk))
push!(times, t * 1000) # ms
end
end
Expand Down Expand Up @@ -122,14 +121,14 @@ function run_others(data; nruns::Int=1, warmup::Int=0)
C_cublas = similar(A, M, N, Batch)

# cuBLAS batched GEMM via cuBLAS.gemm_strided_batched!
CUDACore.@sync for _ in 1:warmup
CUDA.@sync for _ in 1:warmup
cuBLAS.gemm_strided_batched!('N', 'N', one(eltype(A)), A, B, zero(eltype(A)), C_cublas)
end
times_cublas = Float64[]
NVTX.@range "cuBLAS batched" begin
for i in 1:nruns
NVTX.@range "run $i" begin
t = CUDACore.@elapsed cuBLAS.gemm_strided_batched!('N', 'N', one(eltype(A)), A, B, zero(eltype(A)), C_cublas)
t = CUDA.@elapsed cuBLAS.gemm_strided_batched!('N', 'N', one(eltype(A)), A, B, zero(eltype(A)), C_cublas)
push!(times_cublas, t * 1000)
end
end
Expand Down
4 changes: 2 additions & 2 deletions examples/benchmarks.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# Generic benchmark runner for cuTile.jl examples
# Discovers and benchmarks all examples in the examples/ directory

using CUDACore
using CUDA

#=============================================================================
Configuration
Expand Down Expand Up @@ -140,7 +140,7 @@ function main(args...)
println()
println("Configuration:")
println(" Runs: $NRUNS (+ $WARMUP warmup)")
println(" GPU: ", CUDACore.name(CUDACore.device()))
println(" GPU: ", CUDA.name(CUDA.device()))

for name in discover_benchmarks(args...)
println("\nBenchmarking $name...")
Expand Down
11 changes: 5 additions & 6 deletions examples/fft.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@
#
# SPDX-License-Identifier: Apache-2.0

using CUDACore, NVTX
import cuRAND, cuFFT
using CUDA, NVTX
using cuTile: cuTile
import cuTile as ct
using Test
Expand Down Expand Up @@ -220,15 +219,15 @@ function run(data; nruns::Int=1, warmup::Int=0)
BS = 1
grid = (batch ÷ BS, 1, 1)

CUDACore.@sync for _ in 1:warmup
CUDA.@sync for _ in 1:warmup
@cuda backend=cuTile blocks=grid fft_kernel(x_packed, y_packed, W0_gpu, W1_gpu, W2_gpu, T0_gpu, T1_gpu, ct.Constant(N), ct.Constant(F0), ct.Constant(F1), ct.Constant(F2), ct.Constant(BS), ct.Constant(D))
end

times = Float64[]
NVTX.@range "cuTile" begin
for i in 1:nruns
NVTX.@range "run $i" begin
t = CUDACore.@elapsed @cuda backend=cuTile blocks=grid fft_kernel(x_packed, y_packed, W0_gpu, W1_gpu, W2_gpu, T0_gpu, T1_gpu, ct.Constant(N), ct.Constant(F0), ct.Constant(F1), ct.Constant(F2), ct.Constant(BS), ct.Constant(D))
t = CUDA.@elapsed @cuda backend=cuTile blocks=grid fft_kernel(x_packed, y_packed, W0_gpu, W1_gpu, W2_gpu, T0_gpu, T1_gpu, ct.Constant(N), ct.Constant(F0), ct.Constant(F1), ct.Constant(F2), ct.Constant(BS), ct.Constant(D))
push!(times, t * 1000) # ms
end
end
Expand Down Expand Up @@ -262,15 +261,15 @@ function run_others(data; nruns::Int=1, warmup::Int=0)
results = Dict{String, Vector{Float64}}()

plan = cuFFT.plan_fft!(input, 1)
CUDACore.@sync for _ in 1:warmup
CUDA.@sync for _ in 1:warmup
plan * copy(input)
end
times_cufft = Float64[]
NVTX.@range "cuFFT" begin
for i in 1:nruns
NVTX.@range "run $i" begin
input_copy = copy(input)
t = CUDACore.@elapsed plan * input_copy
t = CUDA.@elapsed plan * input_copy
push!(times_cufft, t * 1000)
end
end
Expand Down
7 changes: 3 additions & 4 deletions examples/fmha.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,7 @@
#
# SPDX-License-Identifier: Apache-2.0

using CUDACore, NVTX
import cuRAND
using CUDA, NVTX
using cuTile: cuTile
import cuTile as ct

Expand Down Expand Up @@ -254,7 +253,7 @@ end
function run(data; nruns::Int=1, warmup::Int=0)
(; Q, K, V, causal, tile_m, tile_n, query_group_size) = data

CUDACore.@sync for _ in 1:warmup
CUDA.@sync for _ in 1:warmup
cutile_fmha(Q, K, V; tile_m, tile_n, query_group_size, causal)
end

Expand All @@ -263,7 +262,7 @@ function run(data; nruns::Int=1, warmup::Int=0)
NVTX.@range "cuTile" begin
for i in 1:nruns
NVTX.@range "run $i" begin
t = CUDACore.@elapsed begin
t = CUDA.@elapsed begin
out = cutile_fmha(Q, K, V; tile_m, tile_n, query_group_size, causal)
end
push!(times, t * 1000) # ms
Expand Down
9 changes: 4 additions & 5 deletions examples/layernorm.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@
#
# SPDX-License-Identifier: Apache-2.0

using CUDACore, NVTX
import cuRAND
using CUDA, NVTX
using cuTile: cuTile
import cuTile as ct

Expand Down Expand Up @@ -252,7 +251,7 @@ function run(data; TILE_N::Int=1024, TILE_M::Int=32, nruns::Int=1, warmup::Int=0
end

# Warmup
CUDACore.@sync for _ in 1:warmup
CUDA.@sync for _ in 1:warmup
run_fwd()
run_bwd()
end
Expand All @@ -262,7 +261,7 @@ function run(data; TILE_N::Int=1024, TILE_M::Int=32, nruns::Int=1, warmup::Int=0
NVTX.@range "cuTile Fwd" begin
for i in 1:nruns
NVTX.@range "run $i" begin
t = CUDACore.@elapsed run_fwd()
t = CUDA.@elapsed run_fwd()
push!(times_fwd, t * 1000) # ms
end
end
Expand All @@ -273,7 +272,7 @@ function run(data; TILE_N::Int=1024, TILE_M::Int=32, nruns::Int=1, warmup::Int=0
NVTX.@range "cuTile Bwd" begin
for i in 1:nruns
NVTX.@range "run $i" begin
t = CUDACore.@elapsed run_bwd()
t = CUDA.@elapsed run_bwd()
push!(times_bwd, t * 1000) # ms
end
end
Expand Down
9 changes: 4 additions & 5 deletions examples/matmul.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@
#
# SPDX-License-Identifier: Apache-2.0

using CUDACore, NVTX
import cuRAND, cuBLAS
using CUDA, NVTX
using LinearAlgebra
using cuTile: cuTile
import cuTile as ct
Expand Down Expand Up @@ -87,7 +86,7 @@ function run(data; tm::Int=64, tn::Int=64, tk::Int=64, nruns::Int=1, warmup::Int
NVTX.@range "cuTile" begin
for i in 1:nruns
NVTX.@range "run $i" begin
t = CUDACore.@elapsed @cuda backend=cuTile blocks=grid matmul_kernel(A, B, C, ct.Constant(tm), ct.Constant(tn), ct.Constant(tk))
t = CUDA.@elapsed @cuda backend=cuTile blocks=grid matmul_kernel(A, B, C, ct.Constant(tm), ct.Constant(tn), ct.Constant(tk))
push!(times, t * 1000) # ms
end
end
Expand Down Expand Up @@ -117,14 +116,14 @@ function run_others(data; nruns::Int=1, warmup::Int=0)
C_gpuarrays = similar(A, size(A, 1), size(B, 2))

# GPUArrays (uses cuBLAS under the hood via LinearAlgebra.mul!)
CUDACore.@sync for _ in 1:warmup
CUDA.@sync for _ in 1:warmup
mul!(C_gpuarrays, A, B)
end
times_gpuarrays = Float64[]
NVTX.@range "cuBLAS" begin
for i in 1:nruns
NVTX.@range "run $i" begin
t = CUDACore.@elapsed mul!(C_gpuarrays, A, B)
t = CUDA.@elapsed mul!(C_gpuarrays, A, B)
push!(times_gpuarrays, t * 1000)
end
end
Expand Down
13 changes: 6 additions & 7 deletions examples/moe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@
#
# SPDX-License-Identifier: Apache-2.0

using CUDACore, NVTX
import cuRAND
using CUDA, NVTX
using Random: randperm
using cuTile: cuTile
import cuTile as ct
Expand Down Expand Up @@ -226,9 +225,9 @@ function cutile_moe(hidden_states::CuArray{T}, w1, w2, topk_weights, topk_ids,

# Intermediate caches: reversed from Python for column-major
# Python (num_tokens, topk, dim) → Julia (dim, topk, num_tokens)
cache1 = CUDACore.zeros(T, intermediate_size * 2, topk, num_tokens)
cache2 = CUDACore.zeros(T, intermediate_size, total_tokens)
cache3 = CUDACore.zeros(T, hidden_size, topk, num_tokens)
cache1 = CUDA.zeros(T, intermediate_size * 2, topk, num_tokens)
cache2 = CUDA.zeros(T, intermediate_size, total_tokens)
cache3 = CUDA.zeros(T, hidden_size, topk, num_tokens)

sorted_token_ids, sorted_expert_ids = moe_align_tile_size(
Array(topk_ids), tile_m, num_experts)
Expand Down Expand Up @@ -346,7 +345,7 @@ end
function run(data; nruns::Int=1, warmup::Int=0)
(; hidden_states, w1, w2, topk_weights, topk_ids, tile_m, tile_n, tile_k) = data

CUDACore.@sync for _ in 1:warmup
CUDA.@sync for _ in 1:warmup
cutile_moe(hidden_states, w1, w2, topk_weights, topk_ids, tile_m, tile_n, tile_k)
end

Expand All @@ -355,7 +354,7 @@ function run(data; nruns::Int=1, warmup::Int=0)
NVTX.@range "cuTile" begin
for i in 1:nruns
NVTX.@range "run $i" begin
t = CUDACore.@elapsed begin
t = CUDA.@elapsed begin
out = cutile_moe(hidden_states, w1, w2, topk_weights, topk_ids,
tile_m, tile_n, tile_k)
end
Expand Down
13 changes: 6 additions & 7 deletions examples/softmax.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,7 @@
#
# SPDX-License-Identifier: Apache-2.0

using CUDACore, NVTX
import cuRAND
using CUDA, NVTX
using cuTile: cuTile
import cuTile as ct

Expand Down Expand Up @@ -142,7 +141,7 @@ function run(data; tile_tma::Int=next_power_of_2(data.N),
end

# Warmup
CUDACore.@sync for _ in 1:warmup
CUDA.@sync for _ in 1:warmup
run_tma()
run_chunked()
end
Expand All @@ -152,7 +151,7 @@ function run(data; tile_tma::Int=next_power_of_2(data.N),
NVTX.@range "cuTile TMA" begin
for i in 1:nruns
NVTX.@range "run $i" begin
t = CUDACore.@elapsed run_tma()
t = CUDA.@elapsed run_tma()
push!(times_tma, t * 1000)
end
end
Expand All @@ -163,7 +162,7 @@ function run(data; tile_tma::Int=next_power_of_2(data.N),
NVTX.@range "cuTile Chunked" begin
for i in 1:nruns
NVTX.@range "run $i" begin
t = CUDACore.@elapsed run_chunked()
t = CUDA.@elapsed run_chunked()
push!(times_chunked, t * 1000)
end
end
Expand Down Expand Up @@ -215,14 +214,14 @@ function run_others(data; nruns::Int=1, warmup::Int=0)
out .= exps ./ sum(exps; dims=1)
end

CUDACore.@sync for _ in 1:warmup
CUDA.@sync for _ in 1:warmup
gpu_softmax!()
end
times = Float64[]
NVTX.@range "GPUArrays" begin
for i in 1:nruns
NVTX.@range "run $i" begin
t = CUDACore.@elapsed gpu_softmax!()
t = CUDA.@elapsed gpu_softmax!()
push!(times, t * 1000)
end
end
Expand Down
15 changes: 7 additions & 8 deletions examples/transpose.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@
#
# SPDX-License-Identifier: Apache-2.0

using CUDACore, NVTX
import cuRAND
using CUDA, NVTX
using cuTile: cuTile
import cuTile as ct

Expand Down Expand Up @@ -39,15 +38,15 @@ function run(data; tm::Int=64, tn::Int=64, nruns::Int=1, warmup::Int=0)
(; x, y, m, n) = data
grid = (cld(m, tm), cld(n, tn))

CUDACore.@sync for _ in 1:warmup
CUDA.@sync for _ in 1:warmup
@cuda backend=cuTile blocks=grid transpose_kernel(x, y, ct.Constant(tm), ct.Constant(tn))
end

times = Float64[]
NVTX.@range "cuTile" begin
for i in 1:nruns
NVTX.@range "run $i" begin
t = CUDACore.@elapsed @cuda backend=cuTile blocks=grid transpose_kernel(x, y, ct.Constant(tm), ct.Constant(tn))
t = CUDA.@elapsed @cuda backend=cuTile blocks=grid transpose_kernel(x, y, ct.Constant(tm), ct.Constant(tn))
push!(times, t * 1000) # ms
end
end
Expand Down Expand Up @@ -88,14 +87,14 @@ function run_others(data; nruns::Int=1, warmup::Int=0)
y_simt = similar(x, n, m)

# GPUArrays (permutedims)
CUDACore.@sync for _ in 1:warmup
CUDA.@sync for _ in 1:warmup
permutedims!(y_gpuarrays, x, (2, 1))
end
times_gpuarrays = Float64[]
NVTX.@range "GPUArrays" begin
for i in 1:nruns
NVTX.@range "run $i" begin
t = CUDACore.@elapsed permutedims!(y_gpuarrays, x, (2, 1))
t = CUDA.@elapsed permutedims!(y_gpuarrays, x, (2, 1))
push!(times_gpuarrays, t * 1000)
end
end
Expand All @@ -105,14 +104,14 @@ function run_others(data; nruns::Int=1, warmup::Int=0)
# SIMT naive kernel
threads = (16, 16)
blocks = (cld(m, threads[1]), cld(n, threads[2]))
CUDACore.@sync for _ in 1:warmup
CUDA.@sync for _ in 1:warmup
@cuda threads=threads blocks=blocks simt_naive_kernel(x, y_simt, m, n)
end
times_simt = Float64[]
NVTX.@range "SIMT naive" begin
for i in 1:nruns
NVTX.@range "run $i" begin
t = CUDACore.@elapsed @cuda threads=threads blocks=blocks simt_naive_kernel(x, y_simt, m, n)
t = CUDA.@elapsed @cuda threads=threads blocks=blocks simt_naive_kernel(x, y_simt, m, n)
push!(times_simt, t * 1000)
end
end
Expand Down
Loading