Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions problems/argmax/def.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,31 @@ def get_flops(self, test_case: Dict[str, Any]) -> int:
# Each reduction requires (reduce_size - 1) comparisons
return num_reductions * (reduce_size - 1)

def get_mem(self, test_case: Dict[str, Any]) -> int:
    """
    Estimate the bytes moved by the argmax problem, all assumed to be DRAM traffic.

    Args:
        test_case: The test case dictionary; reads "shape" and "dim"

    Returns:
        Bytes for the input tensor plus the index output (4 bytes per element)
    """
    dims = test_case["shape"]
    axis = test_case["dim"]

    # The input element count is the product of every extent in the shape
    input_count = 1
    for extent in dims:
        input_count = input_count * extent

    # Reducing along `axis` leaves one index per reduced slice
    output_count = input_count // dims[axis]

    # float32 input elements and int32 output indices are both 4 bytes wide
    bytes_per_element = 4
    return bytes_per_element * (input_count + output_count)

def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]:
"""
Get extra parameters to pass to the CUDA solution.
Expand Down
25 changes: 25 additions & 0 deletions problems/argmin/def.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,31 @@ def get_flops(self, test_case: Dict[str, Any]) -> int:
# Each reduction requires (reduce_size - 1) comparisons
return num_reductions * (reduce_size - 1)

def get_mem(self, test_case: Dict[str, Any]) -> int:
    """
    Estimate the bytes moved by the argmin problem, all assumed to be DRAM traffic.

    Args:
        test_case: The test case dictionary; reads "shape" and "dim"

    Returns:
        Combined size in bytes of the input tensor and the index output
    """
    tensor_shape = test_case["shape"]
    reduce_axis = test_case["dim"]

    # Multiply the extents together to count the input elements
    n_input = 1
    for axis_len in tensor_shape:
        n_input *= axis_len

    # One output index survives for each slice along the reduced axis
    n_output = n_input // tensor_shape[reduce_axis]

    # 4 bytes each: float32 on the input side, int32 on the output side
    element_size = 4
    total_count = n_input + n_output
    return total_count * element_size

def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]:
"""
Get extra parameters to pass to the CUDA solution.
Expand Down
22 changes: 22 additions & 0 deletions problems/avg-pool-1d/def.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,28 @@ def get_flops(self, test_case: Dict[str, Any]) -> int:
# Total FLOPs for the entire output
return H_out * ops_per_output

def get_mem(self, test_case: Dict[str, Any]) -> int:
    """
    Estimate the bytes moved by 1D average pooling, all assumed to be DRAM traffic.

    Args:
        test_case: The test case dictionary; reads "size", "kernel_size",
            "stride" and "padding"

    Returns:
        Bytes for the input signal plus the pooled output (float32 elements)
    """
    length = test_case["size"]
    window = test_case["kernel_size"]
    step = test_case["stride"]
    pad = test_case["padding"]

    # Number of window positions along the padded signal
    pooled_length = (length + 2 * pad - window) // step + 1

    # One read of the input and one write of the output, 4 bytes per float32
    bytes_per_element = 4
    return bytes_per_element * (length + pooled_length)

def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]:
"""
Get extra parameters to pass to the CUDA solution.
Expand Down
24 changes: 24 additions & 0 deletions problems/avg-pool-2d/def.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,30 @@ def get_flops(self, test_case: Dict[str, Any]) -> int:
# Total FLOPs for the entire output
return H_out * W_out * ops_per_output

def get_mem(self, test_case: Dict[str, Any]) -> int:
    """
    Estimate the bytes moved by 2D average pooling, all assumed to be DRAM traffic.

    Args:
        test_case: The test case dictionary; reads "height", "width",
            "kernel_size", "stride" and "padding"

    Returns:
        Bytes for the input plane plus the pooled output (float32 elements)
    """
    rows = test_case["height"]
    cols = test_case["width"]
    window = test_case["kernel_size"]
    step = test_case["stride"]
    pad = test_case["padding"]

    def pooled_extent(extent):
        # Window positions along one padded axis
        return (extent + 2 * pad - window) // step + 1

    input_count = rows * cols
    output_count = pooled_extent(rows) * pooled_extent(cols)

    # 4 bytes per float32 element, input read once and output written once
    bytes_per_element = 4
    return bytes_per_element * (input_count + output_count)

def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]:
"""
Get extra parameters to pass to the CUDA solution.
Expand Down
26 changes: 26 additions & 0 deletions problems/avg-pool-3d/def.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,32 @@ def get_flops(self, test_case: Dict[str, Any]) -> int:
# Total FLOPs for the entire output
return H_out * W_out * D_out * ops_per_output

def get_mem(self, test_case: Dict[str, Any]) -> int:
    """
    Estimate the bytes moved by 3D average pooling, all assumed to be DRAM traffic.

    Args:
        test_case: The test case dictionary; reads "height", "width", "depth",
            "kernel_size", "stride" and "padding"

    Returns:
        Bytes for the input volume plus the pooled output (float32 elements)
    """
    rows = test_case["height"]
    cols = test_case["width"]
    slices = test_case["depth"]
    window = test_case["kernel_size"]
    step = test_case["stride"]
    pad = test_case["padding"]

    # The same kernel, stride and padding apply to each of the three axes
    out_rows, out_cols, out_slices = [
        (extent + 2 * pad - window) // step + 1
        for extent in (rows, cols, slices)
    ]

    input_count = rows * cols * slices
    output_count = out_rows * out_cols * out_slices

    # 4 bytes per float32 element, input read once and output written once
    bytes_per_element = 4
    return bytes_per_element * (input_count + output_count)

def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]:
"""
Get extra parameters to pass to the CUDA solution.
Expand Down
33 changes: 33 additions & 0 deletions problems/batch-norm/def.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,39 @@ def get_flops(self, test_case: Dict[str, Any]) -> int:

return int(total_flops)

def get_mem(self, test_case: Dict[str, Any]) -> int:
    """
    Get the memory usage for the problem. Assumed to be all in DRAM

    Models a naive, unfused batch normalization in which the mean and the
    variance are materialized and read back.

    Args:
        test_case: The test case dictionary; reads "B", "F", "D1" and "D2"

    Returns:
        Memory usage in bytes
    """
    B = test_case["B"]
    F = test_case["F"]
    D1 = test_case["D1"]
    D2 = test_case["D2"]

    total_elements = B * F * D1 * D2
    num_features = F * D1 * D2

    # Naive batch normalization:
    # 1. Read x to compute mean      -> B*F*D1*D2
    # 2. Write mean                  -> F*D1*D2
    # 3. Read x to compute variance  -> B*F*D1*D2
    # 4. Write variance              -> F*D1*D2
    # 5. Read x to normalize         -> B*F*D1*D2
    # 6. Read mean                   -> F*D1*D2
    # 7. Read variance               -> F*D1*D2
    # 8. Write output                -> B*F*D1*D2
    x_reads = 3 * total_elements       # steps 1, 3, 5
    # Steps 2, 4, 6 and 7 are four per-feature transfers; the previous factor
    # of 3 dropped one of them.
    stat_traffic = 4 * num_features
    output_write = total_elements      # step 8

    dtype_bytes = 4  # 4 bytes per float32 element
    return (x_reads + stat_traffic + output_write) * dtype_bytes

def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]:
"""
Get extra parameters (dimensions) to pass to the CUDA solution.
Expand Down
21 changes: 21 additions & 0 deletions problems/box-blur/def.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,27 @@ def get_flops(self, test_case: Dict[str, Any]) -> int:
flops_per_pixel = kernel_size * kernel_size
return height * width * flops_per_pixel

def get_mem(self, test_case: Dict[str, Any]) -> int:
    """
    Get the memory usage for the problem. Assumed to be all in DRAM

    Only the input image and the output image are counted; the blur window
    does not contribute to the estimate.

    Args:
        test_case: The test case dictionary; reads "height" and "width"

    Returns:
        Memory usage in bytes
    """
    height = test_case["height"]
    width = test_case["width"]

    # Input image: height*width elements
    # Output: height*width elements
    # The kernel window is deliberately left out of the bandwidth estimate
    # (treated as small and cached), so "kernel_size" is not needed here.
    pixels = height * width

    dtype_bytes = 4  # 4 bytes per float32 element
    return (pixels + pixels) * dtype_bytes

def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]:
"""
Get extra parameters to pass to the CUDA solution.
Expand Down
19 changes: 19 additions & 0 deletions problems/conv-1d/def.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,25 @@ def get_flops(self, test_case: Dict[str, Any]) -> int:

return N * flops_per_element

def get_mem(self, test_case: Dict[str, Any]) -> int:
    """
    Estimate the bytes moved by the 1D convolution, all assumed to be DRAM traffic.

    Args:
        test_case: The test case dictionary; reads "signal_size" and "kernel_size"

    Returns:
        Bytes for the input signal, the kernel and the output (float32 elements)
    """
    signal_len = test_case["signal_size"]
    kernel_len = test_case["kernel_size"]

    # The output is counted at the same length as the input (padded convolution);
    # the kernel is small but included for completeness.
    element_count = signal_len + kernel_len + signal_len

    bytes_per_element = 4  # float32
    return bytes_per_element * element_count

def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]:
"""
Get extra parameters to pass to the CUDA solution.
Expand Down
21 changes: 21 additions & 0 deletions problems/conv-2d/def.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,27 @@ def get_flops(self, test_case: Dict[str, Any]) -> int:
# This is slightly different from our detailed calculation but aligns with the test code
return 2 * H * W * Kh * Kw

def get_mem(self, test_case: Dict[str, Any]) -> int:
    """
    Estimate the bytes moved by the 2D convolution, all assumed to be DRAM traffic.

    Args:
        test_case: The test case dictionary; reads "height", "width",
            "kernel_height" and "kernel_width"

    Returns:
        Bytes for the input image, the kernel and the output (float32 elements)
    """
    rows = test_case["height"]
    cols = test_case["width"]
    kernel_rows = test_case["kernel_height"]
    kernel_cols = test_case["kernel_width"]

    image_count = rows * cols                  # input image
    kernel_count = kernel_rows * kernel_cols   # small, but still counted
    output_count = rows * cols                 # counted at the input's size (padded)

    bytes_per_element = 4  # float32
    return bytes_per_element * (image_count + kernel_count + output_count)

def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]:
"""
Get extra parameters to pass to the CUDA solution.
Expand Down
19 changes: 19 additions & 0 deletions problems/conv-square-3d/def.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,25 @@ def get_flops(self, test_case: Dict[str, Any]) -> int:
# Following similar convention as 2D case, we use 2*size^3*K^3
return 2 * size * size * size * K * K * K

def get_mem(self, test_case: Dict[str, Any]) -> int:
    """
    Estimate the bytes moved by the cubic 3D convolution, all assumed to be DRAM traffic.

    Args:
        test_case: The test case dictionary; reads "size" and "kernel_size"

    Returns:
        Bytes for the input volume, the kernel and the output (float32 elements)
    """
    edge = test_case["size"]  # D=H=W
    kernel_edge = test_case["kernel_size"]

    volume_count = edge * edge * edge
    kernel_count = kernel_edge * kernel_edge * kernel_edge

    # The output is counted at the input's size (padded convolution), hence
    # the volume appears on both sides of the kernel term.
    bytes_per_element = 4  # float32
    return bytes_per_element * (volume_count + kernel_count + volume_count)

def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]:
"""
Get extra parameters to pass to the CUDA solution.
Expand Down
33 changes: 33 additions & 0 deletions problems/conv2d-relu-hardswish/def.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,39 @@ def get_flops(self, test_case: Dict[str, Any]) -> int:

return conv_flops + relu_flops + hardswish_flops

def get_mem(self, test_case: Dict[str, Any]) -> int:
    """
    Estimate the bytes moved by an unfused conv2d -> ReLU -> hardswish pipeline.

    Every intermediate is assumed to be written to DRAM and read back.

    Args:
        test_case: The test case dictionary; reads "height", "width",
            "kernel_height" and "kernel_width"

    Returns:
        Total bytes read and written (float32 elements)
    """
    rows = test_case["height"]
    cols = test_case["width"]
    kernel_rows = test_case["kernel_height"]
    kernel_cols = test_case["kernel_width"]

    plane = rows * cols
    kernel_count = kernel_rows * kernel_cols

    # Reads: input, kernel, conv_output (for ReLU), relu_output (for hardswish)
    read_count = plane + kernel_count + plane + plane
    # Writes: conv_output, relu_output, hardswish_output
    write_count = plane + plane + plane

    bytes_per_element = 4  # float32
    return bytes_per_element * (read_count + write_count)

def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]:
"""
Get extra parameters to pass to the CUDA solution.
Expand Down
40 changes: 40 additions & 0 deletions problems/cosine-similarity/def.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,46 @@ def get_flops(self, test_case: Dict[str, Any]) -> int:
# Total per vector pair: approximately 5*D + 3 FLOPs
return N * (5 * D + 3)

def get_mem(self, test_case: Dict[str, Any]) -> int:
    """
    Get the memory usage for the problem. Assumed to be all in DRAM

    Models a naive, unfused cosine similarity in which the dot product and
    both norms are materialized and read back.

    Args:
        test_case: The test case dictionary; reads "n" and "d"

    Returns:
        Memory usage in bytes
    """
    # Each key is looked up once (the lookups were previously duplicated).
    N = test_case["n"]
    D = test_case["d"]

    # Naive cosine similarity:
    # 1. Read predictions                                   -> N*D
    # 2. Read targets                                       -> N*D
    # 3. Write dot_product = predictions . targets          -> N (materialized)
    # 4. Read predictions (for norm)                        -> N*D
    # 5. Write norm_pred = ||predictions||                  -> N (materialized)
    # 6. Read targets (for norm)                            -> N*D
    # 7. Write norm_targ = ||targets||                      -> N (materialized)
    # 8. Read dot_product, norm_pred, norm_targ             -> 3*N
    # 9. Write similarity = dot / (norm_pred * norm_targ)   -> N
    # No read-back of the final similarity is counted.

    dtype_bytes = 4  # 4 bytes per float32 element
    return (2 * N * D +  # read predictions, targets (first time)
            N * D +      # read predictions (for norm)
            N * D +      # read targets (for norm)
            N +          # write dot_product
            N +          # read dot_product
            N +          # write norm_pred
            N +          # read norm_pred
            N +          # write norm_targ
            N +          # read norm_targ
            N) * dtype_bytes  # write similarity (output)

def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]:
"""
Get extra parameters to pass to the CUDA solution.
Expand Down
Loading