diff --git a/problems/argmax/def.py b/problems/argmax/def.py index 0b8b217..562d2d0 100644 --- a/problems/argmax/def.py +++ b/problems/argmax/def.py @@ -172,6 +172,31 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # Each reduction requires (reduce_size - 1) comparisons return num_reductions * (reduce_size - 1) + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + shape = test_case["shape"] + dim = test_case["dim"] + + # Total elements in the input tensor + total_elements = 1 + for s in shape: + total_elements *= s + + # Output tensor has reduced dimension (one less dimension) + reduce_size = shape[dim] + output_elements = total_elements // reduce_size + + dtype_bytes = 4 # 4 bytes per float32 element (input) and int32 element (output) + return (total_elements + output_elements) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/argmin/def.py b/problems/argmin/def.py index 7055569..f531322 100644 --- a/problems/argmin/def.py +++ b/problems/argmin/def.py @@ -171,6 +171,31 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # Each reduction requires (reduce_size - 1) comparisons return num_reductions * (reduce_size - 1) + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + shape = test_case["shape"] + dim = test_case["dim"] + + # Total elements in the input tensor + total_elements = 1 + for s in shape: + total_elements *= s + + # Output tensor has reduced dimension (one less dimension) + reduce_size = shape[dim] + output_elements = total_elements // reduce_size + + dtype_bytes = 4 # 4 bytes per float32 element (input) and int32 element (output) + return (total_elements + output_elements) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/avg-pool-1d/def.py b/problems/avg-pool-1d/def.py index 4e1f95f..a00bb3e 100644 --- a/problems/avg-pool-1d/def.py +++ b/problems/avg-pool-1d/def.py @@ -185,6 +185,28 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # Total FLOPs for the entire output return H_out * ops_per_output + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + H = test_case["size"] + K = test_case["kernel_size"] + S = test_case["stride"] + P = test_case["padding"] + + # Calculate output dimensions + H_out = ((H + 2 * P - K) // S) + 1 + + # Input: H elements, Output: H_out elements + dtype_bytes = 4 # 4 bytes per float32 element + return (H + H_out) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. 
diff --git a/problems/avg-pool-2d/def.py b/problems/avg-pool-2d/def.py index 12c0491..411b276 100644 --- a/problems/avg-pool-2d/def.py +++ b/problems/avg-pool-2d/def.py @@ -190,6 +190,30 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # Total FLOPs for the entire output return H_out * W_out * ops_per_output + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + H = test_case["height"] + W = test_case["width"] + K = test_case["kernel_size"] + S = test_case["stride"] + P = test_case["padding"] + + # Calculate output dimensions + H_out = ((H + 2 * P - K) // S) + 1 + W_out = ((W + 2 * P - K) // S) + 1 + + # Input: H*W elements, Output: H_out*W_out elements + dtype_bytes = 4 # 4 bytes per float32 element + return (H * W + H_out * W_out) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/avg-pool-3d/def.py b/problems/avg-pool-3d/def.py index ebc709c..4a48ee4 100644 --- a/problems/avg-pool-3d/def.py +++ b/problems/avg-pool-3d/def.py @@ -195,6 +195,32 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # Total FLOPs for the entire output return H_out * W_out * D_out * ops_per_output + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + H = test_case["height"] + W = test_case["width"] + D = test_case["depth"] + K = test_case["kernel_size"] + S = test_case["stride"] + P = test_case["padding"] + + # Calculate output dimensions + H_out = ((H + 2 * P - K) // S) + 1 + W_out = ((W + 2 * P - K) // S) + 1 + D_out = ((D + 2 * P - K) // S) + 1 + + # Input: H*W*D elements, Output: H_out*W_out*D_out elements + dtype_bytes = 4 # 4 bytes per float32 element + return (H * W * D + H_out * W_out * D_out) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/batch-norm/def.py b/problems/batch-norm/def.py index ca3aae9..a50904d 100644 --- a/problems/batch-norm/def.py +++ b/problems/batch-norm/def.py @@ -195,6 +195,39 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: return int(total_flops) + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + B = test_case["B"] + F = test_case["F"] + D1 = test_case["D1"] + D2 = test_case["D2"] + + total_elements = B * F * D1 * D2 + num_features = F * D1 * D2 + + # Naive batch normalization: + # 1. Read x to compute mean → B*F*D1*D2 + # 2. Write mean → F*D1*D2 + # 3. Read x to compute variance → B*F*D1*D2 + # 4. Write variance → F*D1*D2 + # 5. Read x to normalize → B*F*D1*D2 + # 6. Read mean → F*D1*D2 + # 7. Read variance → F*D1*D2 + # 8. Write output → B*F*D1*D2 + + dtype_bytes = 4 # 4 bytes per float32 element + return (3 * total_elements + # 3 reads of x + 4 * num_features + # mean write + variance write + mean read + variance read + total_elements) * dtype_bytes # output write + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters (dimensions) to pass to the CUDA solution. 
diff --git a/problems/box-blur/def.py b/problems/box-blur/def.py index 095aed2..8167792 100644 --- a/problems/box-blur/def.py +++ b/problems/box-blur/def.py @@ -188,6 +188,27 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: flops_per_pixel = kernel_size * kernel_size return height * width * flops_per_pixel + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + height = test_case["height"] + width = test_case["width"] + kernel_size = test_case["kernel_size"] + + # Input image: height*width elements + # Kernel: kernel_size*kernel_size elements (but this is typically small and reused) + # Output: height*width elements + # For memory bandwidth, we count input + output (kernel is small and cached) + dtype_bytes = 4 # 4 bytes per float32 element + return (height * width + height * width) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/conv-1d/def.py b/problems/conv-1d/def.py index cb655c9..ff5bd98 100644 --- a/problems/conv-1d/def.py +++ b/problems/conv-1d/def.py @@ -175,6 +175,25 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: return N * flops_per_element + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + N = test_case["signal_size"] + K = test_case["kernel_size"] + + # Input signal: N elements + # Kernel: K elements (typically small and reused, but counted for completeness) + # Output: N elements (same size as input due to padding) + dtype_bytes = 4 # 4 bytes per float32 element + return (N + K + N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/conv-2d/def.py b/problems/conv-2d/def.py index ee68f5a..42d2a21 100644 --- a/problems/conv-2d/def.py +++ b/problems/conv-2d/def.py @@ -189,6 +189,27 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # This is slightly different from our detailed calculation but aligns with the test code return 2 * H * W * Kh * Kw + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + H = test_case["height"] + W = test_case["width"] + Kh = test_case["kernel_height"] + Kw = test_case["kernel_width"] + + # Input image: H*W elements + # Kernel: Kh*Kw elements (typically small and reused) + # Output: H*W elements (same size as input due to padding) + dtype_bytes = 4 # 4 bytes per float32 element + return (H * W + Kh * Kw + H * W) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/conv-square-3d/def.py b/problems/conv-square-3d/def.py index 449583c..4b3fe1c 100644 --- a/problems/conv-square-3d/def.py +++ b/problems/conv-square-3d/def.py @@ -175,6 +175,25 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # Following similar convention as 2D case, we use 2*size^3*K^3 return 2 * size * size * size * K * K * K + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. 
Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + size = test_case["size"] # D=H=W + K = test_case["kernel_size"] + + # Input volume: size^3 elements + # Kernel: K^3 elements (typically small and reused) + # Output: size^3 elements (same size as input due to padding) + dtype_bytes = 4 # 4 bytes per float32 element + return (size * size * size + K * K * K + size * size * size) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/conv2d-relu-hardswish/def.py b/problems/conv2d-relu-hardswish/def.py index 0e9024f..e7a7957 100644 --- a/problems/conv2d-relu-hardswish/def.py +++ b/problems/conv2d-relu-hardswish/def.py @@ -191,6 +191,39 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: return conv_flops + relu_flops + hardswish_flops + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + H = test_case["height"] + W = test_case["width"] + Kh = test_case["kernel_height"] + Kw = test_case["kernel_width"] + + # Naive conv2d-relu-hardswish: + # 1. Read input → H*W + # 2. Read kernel → Kh*Kw + # 3. Write conv_output → H*W (materialized) + # 4. Read conv_output → H*W + # 5. Write relu_output → H*W (materialized) + # 6. Read relu_output → H*W + # 7. Write hardswish_output → H*W + + dtype_bytes = 4 # 4 bytes per float32 element + return (H * W + # read input + Kh * Kw + # read kernel + H * W + # write conv_output + H * W + # read conv_output + H * W + # write relu_output + H * W + # read relu_output + H * W) * dtype_bytes # write hardswish_output + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/cosine-similarity/def.py b/problems/cosine-similarity/def.py index fa9ea55..f86fb08 100644 --- a/problems/cosine-similarity/def.py +++ b/problems/cosine-similarity/def.py @@ -173,6 +173,43 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # Total per vector pair: approximately 5*D + 3 FLOPs return N * (5 * D + 3) + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + N = test_case["n"] + D = test_case["d"] + + # Naive cosine similarity: + # 1. Read predictions → N*D + # 2. Read targets → N*D + # 3. Write dot_product = predictions · targets → N (materialized) + # 4. Read predictions → N*D (for norm) + # 5. Write norm_pred = ||predictions|| → N (materialized) + # 6. Read targets → N*D (for norm) + # 7. Write norm_targ = ||targets|| → N (materialized) + # 8. Read dot_product, norm_pred, norm_targ → 3*N + # 9. Write similarity = dot / (norm_pred * norm_targ) → N + # 10. 
Read similarity → N (if needed for output) + + dtype_bytes = 4 # 4 bytes per float32 element + return (2 * N * D + # read predictions, targets (first time) + N * D + # read predictions (for norm) + N * D + # read targets (for norm) + N + # write dot_product + N + # read dot_product + N + # write norm_pred + N + # read norm_pred + N + # write norm_targ + N + # read norm_targ + N) * dtype_bytes # write similarity (output) + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/cumprod/def.py b/problems/cumprod/def.py index 5538ebc..07df3d3 100644 --- a/problems/cumprod/def.py +++ b/problems/cumprod/def.py @@ -143,6 +143,22 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: N = test_case["size"] return N - 1 + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + N = test_case["size"] + + # Input: N elements, Output: N elements (same size) + dtype_bytes = 4 # 4 bytes per float32 element + return (N + N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/cumsum/def.py b/problems/cumsum/def.py index db3d1c1..00ed29e 100644 --- a/problems/cumsum/def.py +++ b/problems/cumsum/def.py @@ -141,6 +141,22 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: N = test_case["size"] return N - 1 + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + N = test_case["size"] + + # Input: N elements, Output: N elements (same size) + dtype_bytes = 4 # 4 bytes per float32 element + return (N + N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/diagonal-matmul/def.py b/problems/diagonal-matmul/def.py index 685c30f..5ca8134 100644 --- a/problems/diagonal-matmul/def.py +++ b/problems/diagonal-matmul/def.py @@ -144,6 +144,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: N, M = test_case["dims"] return N * M + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + N, M = test_case["dims"] + + # Input: diagonal vector (N) + matrix B (N*M) + # Output: matrix C (N*M) + dtype_bytes = 4 # 4 bytes per float32 element + return (N + N * M + N * M) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/ecc-point-negation/def.py b/problems/ecc-point-negation/def.py index 66e4e06..9a365b5 100644 --- a/problems/ecc-point-negation/def.py +++ b/problems/ecc-point-negation/def.py @@ -171,6 +171,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: N = test_case["dims"][0] return 2 * N + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. 
Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + N = test_case["dims"][0] + + # Input: xs (N) + ys (N) - both uint64_t (8 bytes) + # Output: out_xy (2*N) - uint64_t (8 bytes) + dtype_bytes = 8 # 8 bytes per uint64_t element + return (N + N + 2 * N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Provide N as the trailing scalar param to the CUDA solution. diff --git a/problems/edge-detect/def.py b/problems/edge-detect/def.py index ba62162..f62f4ac 100644 --- a/problems/edge-detect/def.py +++ b/problems/edge-detect/def.py @@ -184,6 +184,24 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: interior_pixels = (height - 2) * (width - 2) return interior_pixels * 10 + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + height = test_case["height"] + width = test_case["width"] + + # Input image: height*width elements + # Output image: height*width elements (same size) + dtype_bytes = 4 # 4 bytes per float32 element + return (height * width + height * width) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/elu/def.py b/problems/elu/def.py index dc494e3..0488928 100644 --- a/problems/elu/def.py +++ b/problems/elu/def.py @@ -162,6 +162,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # - We approximate this as 3 FLOPs per element (worst case) return 3 * M * N + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + M = test_case["rows"] + N = test_case["cols"] + + # Input: M*N elements, Output: M*N elements (same size) + dtype_bytes = 4 # 4 bytes per float32 element + return (M * N + M * N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/frobenius-norm/def.py b/problems/frobenius-norm/def.py index 140720d..c3c5ed6 100644 --- a/problems/frobenius-norm/def.py +++ b/problems/frobenius-norm/def.py @@ -192,6 +192,25 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: return int(total_flops) + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + shape = test_case["shape"] + total_elements = 1 + for dim in shape: + total_elements *= dim + + # Input: total_elements, Output: total_elements (same shape) + dtype_bytes = 4 # 4 bytes per float32 element + return (total_elements + total_elements) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/gelu/def.py b/problems/gelu/def.py index 21713f6..559611a 100644 --- a/problems/gelu/def.py +++ b/problems/gelu/def.py @@ -167,6 +167,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: flops_per_element = 24 return M * N * flops_per_element + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. 
Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + M = test_case["rows"] + N = test_case["cols"] + + # Input: M*N elements, Output: M*N elements (same size) + dtype_bytes = 4 # 4 bytes per float32 element + return (M * N + M * N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/gemm-multiply-leakyrelu/def.py b/problems/gemm-multiply-leakyrelu/def.py index 33dc42b..c906950 100644 --- a/problems/gemm-multiply-leakyrelu/def.py +++ b/problems/gemm-multiply-leakyrelu/def.py @@ -197,6 +197,38 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: leaky_relu_flops = M * N return gemm_flops + multiply_flops + leaky_relu_flops + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + M, N, K = test_case["dims"] + + # Naive gemm-multiply-leakyrelu: + # 1. Read A → M*K + # 2. Read B → K*N + # 3. Write gemm_output → M*N (materialized) + # 4. Read gemm_output → M*N + # 5. Read C → M*N + # 6. Write multiply_output → M*N (materialized) + # 7. Read multiply_output → M*N + # 8. Write leakyrelu_output → M*N + + dtype_bytes = 4 # 4 bytes per float32 element + return (M * K + # read A + K * N + # read B + M * N + # write gemm_output + M * N + # read gemm_output + M * N + # read C + M * N + # write multiply_output + M * N + # read multiply_output + M * N) * dtype_bytes # write leakyrelu_output + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/gemm-relu/def.py b/problems/gemm-relu/def.py index b61e691..250027c 100644 --- a/problems/gemm-relu/def.py +++ b/problems/gemm-relu/def.py @@ -195,6 +195,36 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # - Each of the B*M output elements requires 1 addition return 2 * B * N * M + B * M + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + B = test_case["batch_size"] + N = test_case["in_features"] + M = test_case["out_features"] + + # Naive gemm-relu: + # 1. Read input → B*N + # 2. Read weights → M*N + # 3. Read bias → M + # 4. Write matmul_output → B*M (materialized) + # 5. Read matmul_output → B*M + # 6. Write relu_output → B*M + + dtype_bytes = 4 # 4 bytes per float32 element + return (B * N + # read input + M * N + # read weights + M + # read bias + B * M + # write matmul_output + B * M + # read matmul_output + B * M) * dtype_bytes # write relu_output + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/grayscale/def.py b/problems/grayscale/def.py index 8782ad5..c99fd50 100644 --- a/problems/grayscale/def.py +++ b/problems/grayscale/def.py @@ -164,6 +164,24 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # Total: 5 FLOPs per pixel return height * width * 5 + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. 
Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + height = test_case["height"] + width = test_case["width"] + + # Input: RGB image (height*width*3) + # Output: grayscale image (height*width) + dtype_bytes = 4 # 4 bytes per float32 element + return (height * width * 3 + height * width) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/hard-sigmoid/def.py b/problems/hard-sigmoid/def.py index 017013a..a3497e6 100644 --- a/problems/hard-sigmoid/def.py +++ b/problems/hard-sigmoid/def.py @@ -162,6 +162,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # - We count this as 1 FLOP per element as per the test case return M * N + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + M = test_case["rows"] + N = test_case["cols"] + + # Input: M*N elements, Output: M*N elements (same size) + dtype_bytes = 4 # 4 bytes per float32 element + return (M * N + M * N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/hinge-loss/def.py b/problems/hinge-loss/def.py index 38c10a2..13139f2 100644 --- a/problems/hinge-loss/def.py +++ b/problems/hinge-loss/def.py @@ -153,6 +153,37 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # 3. max(0, sub) (1 FLOP, comparison) return N * 3 + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + N = test_case["n"] + + # Naive hinge loss: + # 1. Read predictions → N + # 2. Read targets → N + # 3. Write mult = predictions * targets → N (materialized) + # 4. Read mult → N + # 5. Write sub = 1 - mult → N (materialized) + # 6. Read sub → N + # 7. Write max(0, sub) → N + # 8. Read max → N + # The output is the element-wise loss of size N, + # so no scalar sum is materialized. + # Total: 8N element-moves (5 reads + 3 writes) + + dtype_bytes = 4 # 4 bytes per float32 element + return (N + N + # read predictions, targets + N + N + # write and read mult + N + N + # write and read sub + N + N) * dtype_bytes # write and read max (output) + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/histogram/def.py b/problems/histogram/def.py index f04534c..ca70b37 100644 --- a/problems/histogram/def.py +++ b/problems/histogram/def.py @@ -163,6 +163,34 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # Total: 4 operations per pixel return height * width * 4 + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + height = test_case["height"] + width = test_case["width"] + num_bins = test_case["num_bins"] + + # Naive histogram: + # 1. Read input image → height*width + # 2. Read histogram bins (for each pixel, read current bin value) → height*width (reads to bins) + # 3. Write histogram bins (increment) → height*width (writes to bins) + # 4. 
Read histogram bins (final read) → num_bins + # 5. Write output histogram → num_bins + + dtype_bytes = 4 # 4 bytes per float32 element + return (height * width + # read input + height * width + # read bins during increment + height * width + # write bins during increment + num_bins + # read final bins + num_bins) * dtype_bytes # write output + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/huber-loss/def.py b/problems/huber-loss/def.py index a44b482..cc529d0 100644 --- a/problems/huber-loss/def.py +++ b/problems/huber-loss/def.py @@ -164,6 +164,34 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # Conservatively estimate as 5 FLOPs per element. return N * 5 + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + N = test_case["n"] + + # Naive huber loss: + # 1. Read predictions → N + # 2. Read targets → N + # 3. Write diff = predictions - targets → N (materialized) + # 4. Read diff → N + # 5. Write abs_diff = |diff| → N (materialized) + # 6. Read abs_diff → N + # 7. Write loss (based on condition) → N + # 8. Read loss → N (if needed) + + dtype_bytes = 4 # 4 bytes per float32 element + return (N + N + # read predictions, targets + N + N + # write and read diff + N + N + # write and read abs_diff + N + N) * dtype_bytes # write and read loss (output) + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/kl-loss/def.py b/problems/kl-loss/def.py index 5f6313d..19eaee3 100644 --- a/problems/kl-loss/def.py +++ b/problems/kl-loss/def.py @@ -199,6 +199,41 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # Conservatively estimate as 7 FLOPs per element return N * 7 + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + N = test_case["n"] + + # Naive KL divergence loss: + # 1. Read predictions → N + # 2. Read targets → N + # 3. Write pred_eps = predictions + eps → N (materialized) + # 4. Write targ_eps = targets + eps → N (materialized) + # 5. Read pred_eps → N + # 6. Write log_pred = log(pred_eps) → N (materialized) + # 7. Read targ_eps → N + # 8. Write log_targ = log(targ_eps) → N (materialized) + # 9. Read log_pred, log_targ → 2*N + # 10. Write log_diff = log_targ - log_pred → N (materialized) + # 11. Read log_diff, targets → 2*N + # 12. Write loss = targets * log_diff → N + + dtype_bytes = 4 # 4 bytes per float32 element + return (N + N + # read predictions, targets + N + N + # write pred_eps, targ_eps + N + N + # read pred_eps, write log_pred + N + N + # read targ_eps, write log_targ + N + N + # read log_pred, log_targ + N + N + # write and read log_diff + N + N) * dtype_bytes # read targets, write loss (output) + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. 
diff --git a/problems/l1-norm/def.py b/problems/l1-norm/def.py index 0f70c6f..2ecdfb3 100644 --- a/problems/l1-norm/def.py +++ b/problems/l1-norm/def.py @@ -191,6 +191,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: return int(total_flops) # Return as integer + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + B = test_case["B"] + D = test_case["D"] + + # Input: B*D elements, Output: B*D elements (same shape) + dtype_bytes = 4 # 4 bytes per float32 element + return (B * D + B * D) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters (dimensions) to pass to the CUDA solution. diff --git a/problems/l2-norm/def.py b/problems/l2-norm/def.py index 8fd8773..13e52d9 100644 --- a/problems/l2-norm/def.py +++ b/problems/l2-norm/def.py @@ -192,6 +192,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: return int(total_flops) # Return as integer + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + B = test_case["B"] + D = test_case["D"] + + # Input: B*D elements, Output: B*D elements (same shape) + dtype_bytes = 4 # 4 bytes per float32 element + return (B * D + B * D) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters (dimensions) to pass to the CUDA solution. diff --git a/problems/layer-norm/def.py b/problems/layer-norm/def.py index ed69714..ae64132 100644 --- a/problems/layer-norm/def.py +++ b/problems/layer-norm/def.py @@ -218,6 +218,44 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: return int(total_flops) # Return as integer + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + B = test_case["B"] + F = test_case["F"] + D1 = test_case["D1"] + D2 = test_case["D2"] + + total_elements = B * F * D1 * D2 + param_elements = F * D1 * D2 + + # Naive layer normalization per batch item: + # 1. Read x to compute mean → B*F*D1*D2 + # 2. Write mean → B elements + # 3. Read x to compute variance → B*F*D1*D2 + # 4. Write variance → B elements + # 5. Read x to normalize → B*F*D1*D2 + # 6. Read mean → B elements + # 7. Read variance → B elements + # 8. Read gamma → F*D1*D2 + # 9. Read beta → F*D1*D2 + # 10. Write output → B*F*D1*D2 + + dtype_bytes = 4 # 4 bytes per float32 element + return (3 * total_elements + # 3 reads of x + 2 * B + # mean write + variance write + 2 * B + # mean read + variance read + param_elements + # gamma read + param_elements + # beta read + total_elements) * dtype_bytes # output write + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters (dimensions) to pass to the CUDA solution. diff --git a/problems/leaky-relu/def.py b/problems/leaky-relu/def.py index b02cd49..f9ab084 100644 --- a/problems/leaky-relu/def.py +++ b/problems/leaky-relu/def.py @@ -167,6 +167,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # - We count this as 1 FLOP per element as per the test case return M * N + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. 
Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + M = test_case["rows"] + N = test_case["cols"] + + # Input: M*N elements, Output: M*N elements (same size) + dtype_bytes = 4 # 4 bytes per float32 element + return (M * N + M * N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/lower-trig-matmul/def.py b/problems/lower-trig-matmul/def.py index 057ade9..1e2ed52 100644 --- a/problems/lower-trig-matmul/def.py +++ b/problems/lower-trig-matmul/def.py @@ -172,6 +172,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: flops = N * (N + 1) * (N + 2) // 3 return flops + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + N = test_case["dims"][0] + + # Input: A (N*N) + B (N*N) - both lower triangular but stored as full matrices + # Output: C (N*N) - lower triangular but stored as full matrix + dtype_bytes = 4 # 4 bytes per float32 element + return (N * N + N * N + N * N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/matmul-3d/def.py b/problems/matmul-3d/def.py index f14d1c2..0da32e7 100644 --- a/problems/matmul-3d/def.py +++ b/problems/matmul-3d/def.py @@ -158,6 +158,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: N, M, K, L = test_case["dims"] return 2 * N * M * K * L + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + N, M, K, L = test_case["dims"] + + # Input: A (N*M*K) + B (K*L) + # Output: C (N*M*L) + dtype_bytes = 4 # 4 bytes per float32 element + return (N * M * K + K * L + N * M * L) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/matmul-4d/def.py b/problems/matmul-4d/def.py index 7b24129..94487ff 100644 --- a/problems/matmul-4d/def.py +++ b/problems/matmul-4d/def.py @@ -159,6 +159,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: b, i, j, l, k = test_case["dims"] return 2 * b * i * j * l * k + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + b, i, j, l, k = test_case["dims"] + + # Input: A (b*i*j*l) + B (l*k) + # Output: C (b*i*j*k) + dtype_bytes = 4 # 4 bytes per float32 element + return (b * i * j * l + l * k + b * i * j * k) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/matmul-sigmoid-sum/def.py b/problems/matmul-sigmoid-sum/def.py index 45225eb..9d9426c 100644 --- a/problems/matmul-sigmoid-sum/def.py +++ b/problems/matmul-sigmoid-sum/def.py @@ -164,6 +164,36 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: sum_flops = M * N - 1 return matmul_flops + sigmoid_flops + sum_flops + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. 
Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + M, N, K = test_case["dims"] + + # Naive matmul-sigmoid-sum: + # 1. Read A → M*K + # 2. Read B → K*N + # 3. Write matmul_output → M*N (materialized) + # 4. Read matmul_output → M*N + # 5. Write sigmoid_output → M*N (materialized) + # 6. Read sigmoid_output → M*N + # 7. Write sum → 1 + + dtype_bytes = 4 # 4 bytes per float32 element + return (M * K + # read A + K * N + # read B + M * N + # write matmul_output + M * N + # read matmul_output + M * N + # write sigmoid_output + M * N + # read sigmoid_output + 1) * dtype_bytes # write sum (output) + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/matmul-swish-scaling/def.py b/problems/matmul-swish-scaling/def.py index d5c22ed..ef79a2e 100644 --- a/problems/matmul-swish-scaling/def.py +++ b/problems/matmul-swish-scaling/def.py @@ -195,6 +195,36 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: scaling_flops = M * N return matmul_flops + swish_flops + scaling_flops + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + M, N, K = test_case["dims"] + + # Naive matmul-swish-scaling: + # 1. Read A → M*K + # 2. Read B → K*N + # 3. Write matmul_output → M*N (materialized) + # 4. Read matmul_output → M*N + # 5. Write swish_output → M*N (materialized) + # 6. Read swish_output → M*N + # 7. Write scaled_output → M*N + + dtype_bytes = 4 # 4 bytes per float32 element + return (M * K + # read A + K * N + # read B + M * N + # write matmul_output + M * N + # read matmul_output + M * N + # write swish_output + M * N + # read swish_output + M * N) * dtype_bytes # write scaled_output + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/matmul-swish/def.py b/problems/matmul-swish/def.py index 8accc19..1cf96ab 100644 --- a/problems/matmul-swish/def.py +++ b/problems/matmul-swish/def.py @@ -191,6 +191,42 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # Total FLOPs for all elements return B * O * flops_per_element + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + B = test_case["batch_size"] + I = test_case["in_features"] + O = test_case["out_features"] + + # Naive matmul-swish: + # 1. Read input → B*I + # 2. Read weight → O*I + # 3. Read bias → O + # 4. Write matmul_output → B*O (materialized) + # 5. Read matmul_output → B*O + # 6. Write sigmoid_output → B*O (materialized) + # 7. Read sigmoid_output → B*O + # 8. Read matmul_output → B*O (for swish multiply) + # 9. Write swish_output → B*O + + dtype_bytes = 4 # 4 bytes per float32 element + return (B * I + # read input + O * I + # read weight + O + # read bias + B * O + # write matmul_output + B * O + # read matmul_output + B * O + # write sigmoid_output + B * O + # read sigmoid_output + B * O + # read matmul_output (again) + B * O) * dtype_bytes # write swish_output + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. 
diff --git a/problems/matrix-multiplication/def.py b/problems/matrix-multiplication/def.py index d1a7732..6ae27f8 100644 --- a/problems/matrix-multiplication/def.py +++ b/problems/matrix-multiplication/def.py @@ -168,6 +168,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: M, N, K = test_case["dims"] return 2 * M * N * K + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + M, N, K = test_case["dims"] + + # Input: A (M*K) + B (K*N) + # Output: C (M*N) + dtype_bytes = 4 # 4 bytes per float32 element + return (M * K + K * N + M * N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/matrix-power/def.py b/problems/matrix-power/def.py index 4f31548..7724e15 100644 --- a/problems/matrix-power/def.py +++ b/problems/matrix-power/def.py @@ -169,6 +169,24 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: return (power - 1) * 2 * (N ** 3) + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + N = test_case["size"] + + # Input: matrix (N*N) + # Output: matrix^n (N*N) + # Note: Intermediate results may be stored, but we count input + final output + dtype_bytes = 4 # 4 bytes per float32 element + return (N * N + N * N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/matrix-scalar/def.py b/problems/matrix-scalar/def.py index 65e5179..4cd8c57 100644 --- a/problems/matrix-scalar/def.py +++ b/problems/matrix-scalar/def.py @@ -153,6 +153,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # - There are N*N elements in the matrix return N * N + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + N = test_case["size"] + + # Input: matrix (N*N) + # Output: matrix (N*N) (same size) + dtype_bytes = 4 # 4 bytes per float32 element + return (N * N + N * N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/matrix-vector/def.py b/problems/matrix-vector/def.py index d9af637..d6f04d6 100644 --- a/problems/matrix-vector/def.py +++ b/problems/matrix-vector/def.py @@ -156,6 +156,24 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # - Each MAD (Multiply-Add) counts as 2 FLOPs return M * K * 2 + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + M = test_case["rows"] + K = test_case["cols"] + + # Input: matrix (M*K) + vector (K) + # Output: vector (M) + dtype_bytes = 4 # 4 bytes per float32 element + return (M * K + K + M) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. 
diff --git a/problems/max-dim/def.py b/problems/max-dim/def.py index 30ef85a..c7390b7 100644 --- a/problems/max-dim/def.py +++ b/problems/max-dim/def.py @@ -169,6 +169,31 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # Each reduction requires (reduce_size - 1) comparisons return num_reductions * (reduce_size - 1) + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + shape = test_case["shape"] + dim = test_case["dim"] + + # Total elements in the input tensor + total_elements = 1 + for s in shape: + total_elements *= s + + # Output tensor has reduced dimension (one less dimension, but keepdim=True) + reduce_size = shape[dim] + output_elements = total_elements // reduce_size + + dtype_bytes = 4 # 4 bytes per float32 element + return (total_elements + output_elements) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/max-pool-1d/def.py b/problems/max-pool-1d/def.py index 1f284bc..69e5bba 100644 --- a/problems/max-pool-1d/def.py +++ b/problems/max-pool-1d/def.py @@ -189,6 +189,29 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # Total FLOPs (comparisons) for the entire output return H_out * comparisons_per_output + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + H = test_case["size"] + K = test_case["kernel_size"] + S = test_case["stride"] + P = test_case["padding"] + D = test_case["dilation"] + + # Calculate output dimensions + H_out = ((H + 2 * P - D * (K - 1) - 1) // S) + 1 + + # Input: H elements, Output: H_out elements + dtype_bytes = 4 # 4 bytes per float32 element + return (H + H_out) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/max-pool-2d/def.py b/problems/max-pool-2d/def.py index 55bfbf6..f91a24a 100644 --- a/problems/max-pool-2d/def.py +++ b/problems/max-pool-2d/def.py @@ -196,6 +196,30 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # Total FLOPs (comparisons) for the entire output return H_out * W_out * comparisons_per_output + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + H = test_case["height"] + W = test_case["width"] + K = test_case["kernel_size"] + S = test_case["stride"] + P = test_case["padding"] + + # Calculate output dimensions + H_out = ((H + 2 * P - K) // S) + 1 + W_out = ((W + 2 * P - K) // S) + 1 + + # Input: H*W elements, Output: H_out*W_out elements + dtype_bytes = 4 # 4 bytes per float32 element + return (H * W + H_out * W_out) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. 
diff --git a/problems/max-pool-3d/def.py b/problems/max-pool-3d/def.py index 013e019..9f44670 100644 --- a/problems/max-pool-3d/def.py +++ b/problems/max-pool-3d/def.py @@ -201,6 +201,32 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # Total FLOPs (comparisons) for the entire output return H_out * W_out * D_out * comparisons_per_output + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + H = test_case["height"] + W = test_case["width"] + D = test_case["depth"] + K = test_case["kernel_size"] + S = test_case["stride"] + P = test_case["padding"] + + # Calculate output dimensions + H_out = ((H + 2 * P - K) // S) + 1 + W_out = ((W + 2 * P - K) // S) + 1 + D_out = ((D + 2 * P - K) // S) + 1 + + # Input: H*W*D elements, Output: H_out*W_out*D_out elements + dtype_bytes = 4 # 4 bytes per float32 element + return (H * W * D + H_out * W_out * D_out) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/mean-dim/def.py b/problems/mean-dim/def.py index 4f67be1..8317fc5 100644 --- a/problems/mean-dim/def.py +++ b/problems/mean-dim/def.py @@ -170,6 +170,31 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # Each mean requires (reduce_size - 1) additions and 1 division return num_outputs * (reduce_size) + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + shape = test_case["shape"] + dim = test_case["dim"] + + # Total elements in the input tensor + total_elements = 1 + for s in shape: + total_elements *= s + + # Output tensor has reduced dimension (one less dimension, but keepdim=True) + reduce_size = shape[dim] + output_elements = total_elements // reduce_size + + dtype_bytes = 4 # 4 bytes per float32 element + return (total_elements + output_elements) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/min-dim/def.py b/problems/min-dim/def.py index c08d613..7d85f7c 100644 --- a/problems/min-dim/def.py +++ b/problems/min-dim/def.py @@ -170,6 +170,31 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # Each reduction requires (reduce_size - 1) comparisons return num_reductions * (reduce_size - 1) + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + shape = test_case["shape"] + dim = test_case["dim"] + + # Total elements in the input tensor + total_elements = 1 + for s in shape: + total_elements *= s + + # Output tensor has reduced dimension (one less dimension, but keepdim=True) + reduce_size = shape[dim] + output_elements = total_elements // reduce_size + + dtype_bytes = 4 # 4 bytes per float32 element + return (total_elements + output_elements) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. 
diff --git a/problems/min-spanning-tree/def.py b/problems/min-spanning-tree/def.py index c02a199..10aec70 100644 --- a/problems/min-spanning-tree/def.py +++ b/problems/min-spanning-tree/def.py @@ -186,19 +186,6 @@ def get_function_signature(self) -> Dict[str, Any]: "restype": None } - # def get_flops(self, test_case: Dict[str, Any]) -> int: - # """ - # Get the number of floating point operations for the problem. - - # Args: - # test_case: The test case dictionary - - # Returns: - # Number of floating point operations - # """ - # N = test_case["dims"][0] - # return N * N * 2 - def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/mse-loss/def.py b/problems/mse-loss/def.py index 6e8c915..37f1463 100644 --- a/problems/mse-loss/def.py +++ b/problems/mse-loss/def.py @@ -157,6 +157,40 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: return flops + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + shape = test_case["shape"] + + # Total elements in the tensor + total_elements = 1 + for s in shape: + total_elements *= s + + # Naive MSE loss: + # 1. Read predictions → total_elements + # 2. Read targets → total_elements + # 3. Write diff = predictions - targets → total_elements (materialized) + # 4. Read diff → total_elements + # 5. Write squared = diff^2 → total_elements (materialized) + # 6. Read squared → total_elements + # 7. Write sum → 1 (materialized) + # 8. Read sum → 1 + # 9. Write mean → 1 + + dtype_bytes = 4 # 4 bytes per float32 element + return (2 * total_elements + # read predictions, targets + 2 * total_elements + # write and read diff + 2 * total_elements + # write and read squared + 3) * dtype_bytes # write sum, read sum, write mean + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/poly-multiply-ff/def.py b/problems/poly-multiply-ff/def.py index adcedeb..aa336e2 100644 --- a/problems/poly-multiply-ff/def.py +++ b/problems/poly-multiply-ff/def.py @@ -122,5 +122,22 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: n = test_case["dims"][0] return 2 * n * n # one mul + one add per term + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + n = test_case["dims"][0] + + # Input: A (n) + B (n) - both uint32 + # Output: C (2*n - 1) - uint32 + dtype_bytes = 4 # 4 bytes per uint32 element + return (n + n + (2 * n - 1)) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: return [test_case["dims"][0]] diff --git a/problems/product-dim/def.py b/problems/product-dim/def.py index fa90222..ffc4660 100644 --- a/problems/product-dim/def.py +++ b/problems/product-dim/def.py @@ -170,6 +170,31 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # Each reduction requires (reduce_size - 1) multiplications return num_reductions * (reduce_size - 1) + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. 
Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + shape = test_case["shape"] + dim = test_case["dim"] + + # Total elements in the input tensor + total_elements = 1 + for s in shape: + total_elements *= s + + # Output tensor has reduced dimension (one less dimension, but keepdim=True) + reduce_size = shape[dim] + output_elements = total_elements // reduce_size + + dtype_bytes = 4 # 4 bytes per float32 element + return (total_elements + output_elements) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/relu/def.py b/problems/relu/def.py index f425d49..2ba4379 100644 --- a/problems/relu/def.py +++ b/problems/relu/def.py @@ -169,6 +169,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # - We count this as 1 FLOP per element as per the test case return M * N + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + M = test_case["rows"] + N = test_case["cols"] + + # Input: M*N elements, Output: M*N elements (same size) + dtype_bytes = 4 # 4 bytes per float32 element + return (M * N + M * N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/rms-norm/def.py b/problems/rms-norm/def.py index 1155d7b..fa749f7 100644 --- a/problems/rms-norm/def.py +++ b/problems/rms-norm/def.py @@ -190,6 +190,35 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: return int(total_flops) + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + shape = test_case["shape"] + batch_size = shape[0] + num_features = shape[1] + + total_elements = batch_size * num_features + + # Naive RMS normalization per batch item: + # 1. Read x to compute RMS (sum of squares) → batch_size*num_features + # 2. Write RMS → batch_size elements + # 3. Read x to normalize → batch_size*num_features + # 4. Read RMS → batch_size elements + # 5. Write output → batch_size*num_features + + dtype_bytes = 4 # 4 bytes per float32 element + return (2 * total_elements + # 2 reads of x + batch_size + # RMS write + batch_size + # RMS read + total_elements) * dtype_bytes # output write + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/running-sum-1d/def.py b/problems/running-sum-1d/def.py index 8fd8b98..c7e87d0 100644 --- a/problems/running-sum-1d/def.py +++ b/problems/running-sum-1d/def.py @@ -169,6 +169,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: return 2*N + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + N = test_case["signal_size"] + + # Input: signal (N elements) + # Output: running sum (N elements, same size due to padding) + dtype_bytes = 4 # 4 bytes per float32 element + return (N + N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. 
diff --git a/problems/scaled-dot-attention/def.py b/problems/scaled-dot-attention/def.py index 8e10c76..659d697 100644 --- a/problems/scaled-dot-attention/def.py +++ b/problems/scaled-dot-attention/def.py @@ -221,6 +221,37 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: return (4 * b * h * s * s * e) + (5 * b * h * s * s) + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + b = test_case["batch"] + h = test_case["heads"] + s = test_case["seq_len"] + e = test_case["embed_dim"] + + # Naive scaled dot-product attention: + # 1. Read Q, K for QK^T → 2 * b*h*s*e + # 2. Write QK^T → b*h*s*s + # 3. Read QK^T (scaling) → b*h*s*s + # 4. Write scaled → b*h*s*s + # 5. Softmax: read scaled (max) → b*h*s*s, read scaled (exp) → b*h*s*s, write exp → b*h*s*s, + # read exp (norm) → b*h*s*s, write softmax → b*h*s*s = 5 * b*h*s*s + # 6. Read softmax, V for matmul → b*h*s*s + b*h*s*e + # 7. Write output → b*h*s*e + + dtype_bytes = 4 # 4 bytes per float32 element + return (2 * b * h * s * e + # Q, K reads + 9 * b * h * s * s + # QK^T, scaled, softmax intermediates (1+1+1+5+1) + b * h * s * e + # V read + b * h * s * e) * dtype_bytes # output write + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/selu/def.py b/problems/selu/def.py index 72f61ef..902e6ba 100644 --- a/problems/selu/def.py +++ b/problems/selu/def.py @@ -164,6 +164,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # - We count this as 3 FLOPs per element for the SELU calculation return M * N * 3 + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + M = test_case["rows"] + N = test_case["cols"] + + # Input: M*N elements, Output: M*N elements (same size) + dtype_bytes = 4 # 4 bytes per float32 element + return (M * N + M * N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/sigmoid/def.py b/problems/sigmoid/def.py index b5d4681..7da4601 100644 --- a/problems/sigmoid/def.py +++ b/problems/sigmoid/def.py @@ -161,6 +161,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # - We count this as 1 FLOP per element as per the test case return M * N + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + M = test_case["rows"] + N = test_case["cols"] + + # Input: M*N elements, Output: M*N elements (same size) + dtype_bytes = 4 # 4 bytes per float32 element + return (M * N + M * N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. 
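The grouped return value in the scaled-dot-attention estimate is easiest to audit when each enumerated step gets its own term. A short, purely illustrative restatement of the same arithmetic (the function name is not part of the problem API):

def naive_attention_mem_terms(b, h, s, e, dtype_bytes=4):
    terms = [
        2 * b * h * s * e,  # read Q and K for QK^T
        b * h * s * s,      # write QK^T
        b * h * s * s,      # read QK^T for scaling
        b * h * s * s,      # write scaled scores
        5 * b * h * s * s,  # softmax: read scaled twice, write exp, read exp, write softmax
        b * h * s * s,      # read softmax for the attention-times-V matmul
        b * h * s * e,      # read V
        b * h * s * e,      # write output
    ]
    return sum(terms) * dtype_bytes

Summing the s*s terms gives the 9 * b*h*s*s factor used in get_mem above.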
diff --git a/problems/soft-plus/def.py b/problems/soft-plus/def.py index 17032dc..747d4ae 100644 --- a/problems/soft-plus/def.py +++ b/problems/soft-plus/def.py @@ -159,6 +159,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # - We count this as 3 FLOPs per element for the Softplus calculation (exp, add, log) return M * N * 3 + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + M = test_case["rows"] + N = test_case["cols"] + + # Input: M*N elements, Output: M*N elements (same size) + dtype_bytes = 4 # 4 bytes per float32 element + return (M * N + M * N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/softmax/def.py b/problems/softmax/def.py index 0f5f39d..398ba75 100644 --- a/problems/softmax/def.py +++ b/problems/softmax/def.py @@ -191,6 +191,34 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: return num_slices * flops_per_slice + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + shape = test_case["shape"] + + # Total elements in the input tensor + total_elements = 1 + for s in shape: + total_elements *= s + + # Naive stable softmax that materializes the exponential buffer: + # 1. Read x to compute max → N + # 2. Read x to compute exp and sum → N + # 3. Write e = exp(x-m) → N + # 4. Read e to normalize → N + # 5. Write y → N + # Total: 5N element-moves + + dtype_bytes = 4 # 4 bytes per float32 element + return 5 * total_elements * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/square-matmul/def.py b/problems/square-matmul/def.py index ddcadce..360edc2 100644 --- a/problems/square-matmul/def.py +++ b/problems/square-matmul/def.py @@ -151,6 +151,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # - There are N*N output elements return N * N * N * 2 + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + N = test_case["size"] + + # Input: A (N*N) + B (N*N) + # Output: C (N*N) + dtype_bytes = 4 # 4 bytes per float32 element + return (N * N + N * N + N * N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/sum-dim/def.py b/problems/sum-dim/def.py index 764179b..3b97713 100644 --- a/problems/sum-dim/def.py +++ b/problems/sum-dim/def.py @@ -168,6 +168,31 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # Each reduction requires (reduce_size - 1) additions return num_reductions * (reduce_size - 1) + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. 
Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + shape = test_case["shape"] + dim = test_case["dim"] + + # Total elements in the input tensor + total_elements = 1 + for s in shape: + total_elements *= s + + # Output tensor has reduced dimension (one less dimension, but keepdim=True) + reduce_size = shape[dim] + output_elements = total_elements // reduce_size + + dtype_bytes = 4 # 4 bytes per float32 element + return (total_elements + output_elements) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/swish/def.py b/problems/swish/def.py index 89dbf8d..325b700 100644 --- a/problems/swish/def.py +++ b/problems/swish/def.py @@ -162,6 +162,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # - We count this as 1 FLOP per element as per the test case return M * N + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + M = test_case["rows"] + N = test_case["cols"] + + # Input: M*N elements, Output: M*N elements (same size) + dtype_bytes = 4 # 4 bytes per float32 element + return (M * N + M * N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/symmetric-matmul/def.py b/problems/symmetric-matmul/def.py index ae8cc8d..324032a 100644 --- a/problems/symmetric-matmul/def.py +++ b/problems/symmetric-matmul/def.py @@ -157,6 +157,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # - There are N*N output elements return N * N * N * 2 + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + N = test_case["size"] + + # Input: A (N*N) + B (N*N) - both symmetric but stored as full matrices + # Output: C (N*N) + dtype_bytes = 4 # 4 bytes per float32 element + return (N * N + N * N + N * N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/tanh/def.py b/problems/tanh/def.py index 2cf35e7..7ac7206 100644 --- a/problems/tanh/def.py +++ b/problems/tanh/def.py @@ -163,6 +163,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # - We count this as 1 FLOP per element as per the test case return M * N + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + M = test_case["rows"] + N = test_case["cols"] + + # Input: M*N elements, Output: M*N elements (same size) + dtype_bytes = 4 # 4 bytes per float32 element + return (M * N + M * N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. 
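One way these per-test-case counts can be combined (a sketch only, not part of the problem definitions) is a FLOPs-per-DRAM-byte ratio, which separates compute-heavy kernels such as square-matmul from bandwidth-bound elementwise ones such as tanh:

def arithmetic_intensity(flops, mem_bytes):
    # FLOPs per DRAM byte, using the get_flops/get_mem conventions above.
    return flops / mem_bytes

N = 4096
matmul = arithmetic_intensity(2 * N * N * N, 3 * N * N * 4)   # ~683 FLOPs/byte
M, C = 4096, 4096
tanh_like = arithmetic_intensity(M * C, 2 * M * C * 4)        # 0.125 FLOPs/byte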
diff --git a/problems/threshold/def.py b/problems/threshold/def.py index 980cae9..565406a 100644 --- a/problems/threshold/def.py +++ b/problems/threshold/def.py @@ -163,6 +163,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # Total: 1 FLOP per pixel return height * width + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + height = test_case["height"] + width = test_case["width"] + + # Input: height*width elements, Output: height*width elements (same size) + dtype_bytes = 4 # 4 bytes per float32 element + return (height * width + height * width) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/triplet-margin/def.py b/problems/triplet-margin/def.py index 104d92b..a2ed7a8 100644 --- a/problems/triplet-margin/def.py +++ b/problems/triplet-margin/def.py @@ -201,6 +201,47 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: return int(total_flops) + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + batch = test_case["batch"] + embedding_dim = test_case["embedding_dim"] + total_elements = batch * embedding_dim + + # Naive triplet margin loss: + # 1. Read anchor → batch*embedding_dim + # 2. Read positive → batch*embedding_dim + # 3. Write dist_pos → batch (materialized) + # 4. Read anchor → batch*embedding_dim (for negative distance) + # 5. Read negative → batch*embedding_dim + # 6. Write dist_neg → batch (materialized) + # 7. Read dist_pos, dist_neg → 2*batch + # 8. Write margin_loss = dist_pos - dist_neg + margin → batch (materialized) + # 9. Read margin_loss → batch + # 10. Write max(0, margin_loss) → batch + # 11. Read max → batch + # 12. Write sum → 1 + + dtype_bytes = 4 # 4 bytes per float32 element + return (3 * total_elements + # read anchor (twice) + positive + total_elements + # read negative + batch + # write dist_pos + batch + # read dist_pos + batch + # write dist_neg + batch + # read dist_neg + batch + # write margin_loss + batch + # read margin_loss + batch + # write max + batch + # read max + 1) * dtype_bytes # write sum (output) + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/upper-trig-matmul/def.py b/problems/upper-trig-matmul/def.py index 55080ed..13e951a 100644 --- a/problems/upper-trig-matmul/def.py +++ b/problems/upper-trig-matmul/def.py @@ -179,6 +179,14 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: flops = N * (N + 1) * (N + 2) // 3 return flops + def get_mem(self, test_case: Dict[str, Any]) -> int: + N = test_case["dims"][0] + + # Input: A (N*N) + B (N*N) - both upper triangular but stored as full matrices + # Output: C (N*N) - upper triangular but stored as full matrix + dtype_bytes = 4 # 4 bytes per float32 element + return (N * N + N * N + N * N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. 
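The upper-trig-matmul estimate charges full dense storage for all three matrices, matching the comment that the triangles are stored as full N*N arrays. For comparison only (packed storage is not what the problem uses), a sketch of the dense and packed element counts:

def upper_trig_mem(N, packed=False, dtype_bytes=4):
    # Dense: the problem stores all N*N entries of A, B and C.
    # Packed: only the N*(N+1)//2 upper-triangular entries would be kept.
    elems = N * (N + 1) // 2 if packed else N * N
    return 3 * elems * dtype_bytes

# upper_trig_mem(1024) matches get_mem above; the packed variant is just over half.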
diff --git a/problems/vector-addition/def.py b/problems/vector-addition/def.py index 08bd870..199f030 100644 --- a/problems/vector-addition/def.py +++ b/problems/vector-addition/def.py @@ -134,6 +134,11 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: N = test_case["dims"][0] return N + def get_mem(self, test_case: Dict[str, Any]) -> int: + N = test_case["dims"][0] + dtype_bytes = 4 # 4 bytes per float32 element + return 3 * N * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/vector-multiply-ff/def.py b/problems/vector-multiply-ff/def.py index 157f13d..8e3c647 100644 --- a/problems/vector-multiply-ff/def.py +++ b/problems/vector-multiply-ff/def.py @@ -114,5 +114,13 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # One multiply + one modular reduction per element (approximate as 2 ops) return 2 * test_case["dims"][0] + def get_mem(self, test_case: Dict[str, Any]) -> int: + n = test_case["dims"][0] + + # Input: A (n) + B (n) - both uint32 + # Output: C (n) - uint32 + dtype_bytes = 4 # 4 bytes per uint32 element + return (n + n + n) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: return [test_case["dims"][0]]
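A small usage sketch for the streaming estimates at the end of the diff, with a hypothetical test case (the dictionary below is illustrative, not taken from the test suite):

vector_add_case = {"dims": [1 << 20]}   # hypothetical 1M-element case
n = vector_add_case["dims"][0]

# vector-addition: read A, read B, write C, all float32.
bytes_moved = 3 * n * 4
assert bytes_moved == 12 * 2**20        # 12 MiB for a 1M-element add

# vector-multiply-ff has the same shape of estimate: (n + n + n) * 4 bytes for
# two uint32 inputs and one uint32 output.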