tensara · knightron0 · Jan 17, 2026
diff --git a/problems/argmax/def.py b/problems/argmax/def.py
@@ -172,6 +172,31 @@ def get_flops(self, test_case: Dict[str, Any]) -> int:
         # Each reduction requires (reduce_size - 1) comparisons
         return num_reductions * (reduce_size - 1)
 
+    def get_mem(self, test_case: Dict[str, Any]) -> int:
+        """
+        Estimate the bytes of memory traffic for one argmax test case.
+
+        Args:
+            test_case: Dictionary providing "shape" and "dim"
+
+        Returns:
+            Bytes for the input tensor plus the index output
+        """
+        dims = test_case["shape"]
+        axis = test_case["dim"]
+
+        # Element count of the full input tensor
+        input_count = 1
+        for extent in dims:
+            input_count *= extent
+
+        # Reducing along `axis` leaves one index per remaining position
+        index_count = input_count // dims[axis]
+
+        # Both the float32 input and the int32 output use 4 bytes per element
+        bytes_per_element = 4
+        return bytes_per_element * (input_count + index_count)
+
     def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]:
         """
         Get extra parameters to pass to the CUDA solution.

diff --git a/problems/argmin/def.py b/problems/argmin/def.py
@@ -171,6 +171,31 @@ def get_flops(self, test_case: Dict[str, Any]) -> int:
         # Each reduction requires (reduce_size - 1) comparisons
         return num_reductions * (reduce_size - 1)
 
+    def get_mem(self, test_case: Dict[str, Any]) -> int:
+        """
+        Estimate the bytes of memory traffic for one argmin test case.
+
+        Args:
+            test_case: Dictionary providing "shape" and "dim"
+
+        Returns:
+            Bytes for the input tensor plus the index output
+        """
+        tensor_shape = test_case["shape"]
+        reduce_axis = test_case["dim"]
+
+        # Multiply every extent together to count the input elements
+        numel = 1
+        for extent in tensor_shape:
+            numel = numel * extent
+
+        # One output index per slice taken along the reduced axis
+        num_indices = numel // tensor_shape[reduce_axis]
+
+        # float32 input elements and int32 output elements are 4 bytes each
+        input_bytes = numel * 4
+        return input_bytes + num_indices * 4
+
     def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]:
         """
         Get extra parameters to pass to the CUDA solution.

diff --git a/problems/avg-pool-1d/def.py b/problems/avg-pool-1d/def.py
@@ -185,6 +185,28 @@ def get_flops(self, test_case: Dict[str, Any]) -> int:
         # Total FLOPs for the entire output
         return H_out * ops_per_output
 
+    def get_mem(self, test_case: Dict[str, Any]) -> int:
+        """
+        Estimate the bytes of memory traffic for one 1D average-pooling test case.
+
+        Args:
+            test_case: Dictionary providing "size", "kernel_size", "stride", "padding"
+
+        Returns:
+            Bytes for the input plus the pooled output, 4 bytes per element
+        """
+        length = test_case["size"]
+        window = test_case["kernel_size"]
+        step = test_case["stride"]
+        pad = test_case["padding"]
+
+        # Number of window positions over the padded input
+        padded_span = length + 2 * pad - window
+        pooled_length = padded_span // step + 1
+
+        # float32 elements: one per input position, one per output position
+        return 4 * (length + pooled_length)
+
     def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]:
         """
         Get extra parameters to pass to the CUDA solution.

diff --git a/problems/avg-pool-2d/def.py b/problems/avg-pool-2d/def.py
@@ -190,6 +190,30 @@ def get_flops(self, test_case: Dict[str, Any]) -> int:
         # Total FLOPs for the entire output
         return H_out * W_out * ops_per_output
 
+    def get_mem(self, test_case: Dict[str, Any]) -> int:
+        """
+        Estimate the bytes of memory traffic for one 2D average-pooling test case.
+
+        Args:
+            test_case: Dictionary providing "height", "width", "kernel_size", "stride", "padding"
+
+        Returns:
+            Bytes for the input plus the pooled output, 4 bytes per float32 element
+        """
+        rows = test_case["height"]
+        cols = test_case["width"]
+        window = test_case["kernel_size"]
+        step = test_case["stride"]
+        pad = test_case["padding"]
+
+        # Both axes share the same kernel size, stride and padding
+        out_rows = (rows + 2 * pad - window) // step + 1
+        out_cols = (cols + 2 * pad - window) // step + 1
+
+        input_bytes = rows * cols * 4
+        output_bytes = out_rows * out_cols * 4
+        return input_bytes + output_bytes
+
     def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]:
         """
         Get extra parameters to pass to the CUDA solution.

diff --git a/problems/avg-pool-3d/def.py b/problems/avg-pool-3d/def.py
@@ -195,6 +195,32 @@ def get_flops(self, test_case: Dict[str, Any]) -> int:
         # Total FLOPs for the entire output
         return H_out * W_out * D_out * ops_per_output
 
+    def get_mem(self, test_case: Dict[str, Any]) -> int:
+        """
+        Estimate the bytes of memory traffic for one 3D average-pooling test case.
+
+        Args:
+            test_case: Dictionary providing the volume extents and pooling parameters
+
+        Returns:
+            Bytes for the input plus the pooled output, 4 bytes per float32 element
+        """
+        rows = test_case["height"]
+        cols = test_case["width"]
+        slices = test_case["depth"]
+        window = test_case["kernel_size"]
+        step = test_case["stride"]
+        pad = test_case["padding"]
+
+        # All three axes share the same kernel size, stride and padding
+        out_rows = (rows + 2 * pad - window) // step + 1
+        out_cols = (cols + 2 * pad - window) // step + 1
+        out_slices = (slices + 2 * pad - window) // step + 1
+
+        input_count = rows * cols * slices
+        output_count = out_rows * out_cols * out_slices
+        return 4 * (input_count + output_count)
+
     def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]:
         """
         Get extra parameters to pass to the CUDA solution.

diff --git a/problems/batch-norm/def.py b/problems/batch-norm/def.py
@@ -195,6 +195,39 @@ def get_flops(self, test_case: Dict[str, Any]) -> int:
 
         return int(total_flops)
 
+    def get_mem(self, test_case: Dict[str, Any]) -> int:
+        """
+        Get the memory usage for the problem. Assumed to be all in DRAM
+
+        Args:
+            test_case: The test case dictionary (reads "B", "F", "D1", "D2")
+
+        Returns:
+            Memory usage in bytes for the naive multi-pass scheme listed in the body
+        """
+        B = test_case["B"]
+        F = test_case["F"]
+        D1 = test_case["D1"]
+        D2 = test_case["D2"]
+
+        total_elements = B * F * D1 * D2
+        num_features = F * D1 * D2
+
+        # Naive batch normalization:
+        # 1. Read x to compute mean → B*F*D1*D2
+        # 2. Write mean → F*D1*D2
+        # 3. Read x to compute variance → B*F*D1*D2
+        # 4. Write variance → F*D1*D2
+        # 5. Read x to normalize → B*F*D1*D2
+        # 6. Read mean → F*D1*D2
+        # 7. Read variance → F*D1*D2
+        # 8. Write output → B*F*D1*D2
+
+        dtype_bytes = 4  # 4 bytes per float32 element
+        return (3 * total_elements +      # 3 reads of x (steps 1, 3, 5)
+                4 * num_features +        # mean/variance writes + reads (steps 2, 4, 6, 7)
+                total_elements) * dtype_bytes  # output write (step 8)
+
     def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]:
         """
         Get extra parameters (dimensions) to pass to the CUDA solution.

diff --git a/problems/box-blur/def.py b/problems/box-blur/def.py
@@ -188,6 +188,27 @@ def get_flops(self, test_case: Dict[str, Any]) -> int:
         flops_per_pixel = kernel_size * kernel_size
         return height * width * flops_per_pixel
 
+    def get_mem(self, test_case: Dict[str, Any]) -> int:
+        """
+        Get the memory usage for the problem. Assumed to be all in DRAM
+
+        Args:
+            test_case: The test case dictionary (reads "height" and "width")
+
+        Returns:
+            Memory usage in bytes
+        """
+        height = test_case["height"]
+        width = test_case["width"]
+
+        # Input image: height*width elements
+        # Output: height*width elements
+        # The blur window is deliberately left out of the estimate: it is small
+        # and reused, so only the input and the output are counted and the
+        # test case's "kernel_size" does not affect the result.
+        dtype_bytes = 4  # 4 bytes per float32 element
+        return 2 * height * width * dtype_bytes
+
     def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]:
         """
         Get extra parameters to pass to the CUDA solution.

diff --git a/problems/conv-1d/def.py b/problems/conv-1d/def.py
@@ -175,6 +175,25 @@ def get_flops(self, test_case: Dict[str, Any]) -> int:
 
         return N * flops_per_element
 
+    def get_mem(self, test_case: Dict[str, Any]) -> int:
+        """
+        Estimate the bytes of memory traffic for one 1D convolution test case.
+
+        Args:
+            test_case: Dictionary providing "signal_size" and "kernel_size"
+
+        Returns:
+            Bytes for the signal, the kernel and the output, 4 bytes per element
+        """
+        signal_len = test_case["signal_size"]
+        kernel_len = test_case["kernel_size"]
+
+        # The output is counted at the same length as the input signal
+        read_count = signal_len + kernel_len  # signal plus kernel
+        write_count = signal_len
+        bytes_per_float = 4  # float32
+        return (read_count + write_count) * bytes_per_float
+
     def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]:
         """
         Get extra parameters to pass to the CUDA solution.

diff --git a/problems/conv-2d/def.py b/problems/conv-2d/def.py
@@ -189,6 +189,27 @@ def get_flops(self, test_case: Dict[str, Any]) -> int:
         # This is slightly different from our detailed calculation but aligns with the test code
         return 2 * H * W * Kh * Kw
 
+    def get_mem(self, test_case: Dict[str, Any]) -> int:
+        """
+        Estimate the bytes of memory traffic for one 2D convolution test case.
+
+        Args:
+            test_case: Dictionary providing image and kernel heights and widths
+
+        Returns:
+            Bytes for the image, the kernel and the output, 4 bytes per element
+        """
+        rows = test_case["height"]
+        cols = test_case["width"]
+        kernel_rows = test_case["kernel_height"]
+        kernel_cols = test_case["kernel_width"]
+
+        # The output is counted at the same size as the input image
+        image_count = rows * cols
+        kernel_count = kernel_rows * kernel_cols
+        # Image read + output write + kernel read, all float32 (4 bytes)
+        return 4 * (2 * image_count + kernel_count)
+
     def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]:
         """
         Get extra parameters to pass to the CUDA solution.

diff --git a/problems/conv-square-3d/def.py b/problems/conv-square-3d/def.py
@@ -175,6 +175,25 @@ def get_flops(self, test_case: Dict[str, Any]) -> int:
         # Following similar convention as 2D case, we use 2*size^3*K^3
         return 2 * size * size * size * K * K * K
 
+    def get_mem(self, test_case: Dict[str, Any]) -> int:
+        """
+        Estimate the bytes of memory traffic for one cubic 3D convolution test case.
+
+        Args:
+            test_case: Dictionary providing "size" and "kernel_size"
+
+        Returns:
+            Bytes for the volume, the kernel and the output, 4 bytes per element
+        """
+        edge = test_case["size"]  # shared by all three volume axes
+        kernel_edge = test_case["kernel_size"]
+
+        # The output is counted at the same size as the input volume
+        volume_count = edge * edge * edge
+        kernel_count = kernel_edge * kernel_edge * kernel_edge
+        # Volume read + output write + kernel read, all float32 (4 bytes)
+        return 4 * (2 * volume_count + kernel_count)
+
     def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]:
         """
         Get extra parameters to pass to the CUDA solution.

diff --git a/problems/conv2d-relu-hardswish/def.py b/problems/conv2d-relu-hardswish/def.py
@@ -191,6 +191,39 @@ def get_flops(self, test_case: Dict[str, Any]) -> int:
 
         return conv_flops + relu_flops + hardswish_flops
 
+    def get_mem(self, test_case: Dict[str, Any]) -> int:
+        """
+        Estimate the bytes of memory traffic for an unfused conv2d-relu-hardswish.
+
+        Args:
+            test_case: Dictionary providing image and kernel heights and widths
+
+        Returns:
+            Bytes moved when every intermediate result is written and re-read
+        """
+        rows = test_case["height"]
+        cols = test_case["width"]
+        kernel_rows = test_case["kernel_height"]
+        kernel_cols = test_case["kernel_width"]
+
+        # Unfused pipeline, each stage materializing a full-size plane:
+        # 1. read input              (plane)
+        # 2. read kernel             (kernel)
+        # 3. write conv output       (plane)
+        # 4. read conv output        (plane)
+        # 5. write relu output       (plane)
+        # 6. read relu output        (plane)
+        # 7. write hardswish output  (plane)
+
+        plane_count = rows * cols
+        kernel_count = kernel_rows * kernel_cols
+
+        plane_reads = 3 * plane_count  # steps 1, 4, 6
+        plane_writes = 3 * plane_count  # steps 3, 5, 7
+        bytes_per_float = 4  # float32
+        total_count = plane_reads + kernel_count + plane_writes
+        return total_count * bytes_per_float
+
     def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]:
         """
         Get extra parameters to pass to the CUDA solution.

diff --git a/problems/cosine-similarity/def.py b/problems/cosine-similarity/def.py
@@ -173,6 +173,46 @@ def get_flops(self, test_case: Dict[str, Any]) -> int:
         # Total per vector pair: approximately 5*D + 3 FLOPs
         return N * (5 * D + 3)
 
+    def get_mem(self, test_case: Dict[str, Any]) -> int:
+        """
+        Get the memory usage for the problem. Assumed to be all in DRAM
+
+        Counts the naive multi-pass scheme listed in the body; the optional
+        re-read of the final similarity (step 10) is not included in the total.
+
+        Args:
+            test_case: The test case dictionary (reads "n" and "d")
+
+        Returns:
+            Memory usage in bytes
+        """
+        N = test_case["n"]
+        D = test_case["d"]
+
+        # Naive cosine similarity:
+        # 1. Read predictions → N*D
+        # 2. Read targets → N*D
+        # 3. Write dot_product = predictions · targets → N (materialized)
+        # 4. Read predictions → N*D (for norm)
+        # 5. Write norm_pred = ||predictions|| → N (materialized)
+        # 6. Read targets → N*D (for norm)
+        # 7. Write norm_targ = ||targets|| → N (materialized)
+        # 8. Read dot_product, norm_pred, norm_targ → 3*N
+        # 9. Write similarity = dot / (norm_pred * norm_targ) → N
+        # 10. Read similarity → N (only if needed for output; not counted below)
+
+        dtype_bytes = 4  # 4 bytes per float32 element
+        return (2 * N * D +      # read predictions, targets (first time)
+                N * D +          # read predictions (for norm)
+                N * D +          # read targets (for norm)
+                N +              # write dot_product
+                N +              # read dot_product
+                N +              # write norm_pred
+                N +              # read norm_pred
+                N +              # write norm_targ
+                N +              # read norm_targ
+                N) * dtype_bytes  # write similarity (output)
+
     def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]:
         """
         Get extra parameters to pass to the CUDA solution.