diff --git a/problems/argmax/def.py b/problems/argmax/def.py index 0b8b217..562d2d0 100644 --- a/problems/argmax/def.py +++ b/problems/argmax/def.py @@ -172,6 +172,31 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # Each reduction requires (reduce_size - 1) comparisons return num_reductions * (reduce_size - 1) + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + shape = test_case["shape"] + dim = test_case["dim"] + + # Total elements in the input tensor + total_elements = 1 + for s in shape: + total_elements *= s + + # Output tensor has reduced dimension (one less dimension) + reduce_size = shape[dim] + output_elements = total_elements // reduce_size + + dtype_bytes = 4 # 4 bytes per float32 element (input) and int32 element (output) + return (total_elements + output_elements) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/argmin/def.py b/problems/argmin/def.py index 7055569..f531322 100644 --- a/problems/argmin/def.py +++ b/problems/argmin/def.py @@ -171,6 +171,31 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # Each reduction requires (reduce_size - 1) comparisons return num_reductions * (reduce_size - 1) + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + shape = test_case["shape"] + dim = test_case["dim"] + + # Total elements in the input tensor + total_elements = 1 + for s in shape: + total_elements *= s + + # Output tensor has reduced dimension (one less dimension) + reduce_size = shape[dim] + output_elements = total_elements // reduce_size + + dtype_bytes = 4 # 4 bytes per float32 element (input) and int32 element (output) + return (total_elements + output_elements) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/avg-pool-1d/def.py b/problems/avg-pool-1d/def.py index 4e1f95f..a00bb3e 100644 --- a/problems/avg-pool-1d/def.py +++ b/problems/avg-pool-1d/def.py @@ -185,6 +185,28 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # Total FLOPs for the entire output return H_out * ops_per_output + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + H = test_case["size"] + K = test_case["kernel_size"] + S = test_case["stride"] + P = test_case["padding"] + + # Calculate output dimensions + H_out = ((H + 2 * P - K) // S) + 1 + + # Input: H elements, Output: H_out elements + dtype_bytes = 4 # 4 bytes per float32 element + return (H + H_out) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. 
diff --git a/problems/avg-pool-2d/def.py b/problems/avg-pool-2d/def.py index 12c0491..411b276 100644 --- a/problems/avg-pool-2d/def.py +++ b/problems/avg-pool-2d/def.py @@ -190,6 +190,30 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # Total FLOPs for the entire output return H_out * W_out * ops_per_output + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + H = test_case["height"] + W = test_case["width"] + K = test_case["kernel_size"] + S = test_case["stride"] + P = test_case["padding"] + + # Calculate output dimensions + H_out = ((H + 2 * P - K) // S) + 1 + W_out = ((W + 2 * P - K) // S) + 1 + + # Input: H*W elements, Output: H_out*W_out elements + dtype_bytes = 4 # 4 bytes per float32 element + return (H * W + H_out * W_out) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/avg-pool-3d/def.py b/problems/avg-pool-3d/def.py index ebc709c..4a48ee4 100644 --- a/problems/avg-pool-3d/def.py +++ b/problems/avg-pool-3d/def.py @@ -195,6 +195,32 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # Total FLOPs for the entire output return H_out * W_out * D_out * ops_per_output + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + H = test_case["height"] + W = test_case["width"] + D = test_case["depth"] + K = test_case["kernel_size"] + S = test_case["stride"] + P = test_case["padding"] + + # Calculate output dimensions + H_out = ((H + 2 * P - K) // S) + 1 + W_out = ((W + 2 * P - K) // S) + 1 + D_out = ((D + 2 * P - K) // S) + 1 + + # Input: H*W*D elements, Output: H_out*W_out*D_out elements + dtype_bytes = 4 # 4 bytes per float32 element + return (H * W * D + H_out * W_out * D_out) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/batch-norm/def.py b/problems/batch-norm/def.py index ca3aae9..a50904d 100644 --- a/problems/batch-norm/def.py +++ b/problems/batch-norm/def.py @@ -195,6 +195,39 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: return int(total_flops) + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + B = test_case["B"] + F = test_case["F"] + D1 = test_case["D1"] + D2 = test_case["D2"] + + total_elements = B * F * D1 * D2 + num_features = F * D1 * D2 + + # Naive batch normalization: + # 1. Read x to compute mean → B*F*D1*D2 + # 2. Write mean → F*D1*D2 + # 3. Read x to compute variance → B*F*D1*D2 + # 4. Write variance → F*D1*D2 + # 5. Read x to normalize → B*F*D1*D2 + # 6. Read mean → F*D1*D2 + # 7. Read variance → F*D1*D2 + # 8. Write output → B*F*D1*D2 + + dtype_bytes = 4 # 4 bytes per float32 element + return (3 * total_elements + # 3 reads of x + 4 * num_features + # mean write + variance write + mean read + variance read + total_elements) * dtype_bytes # output write + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters (dimensions) to pass to the CUDA solution. 
diff --git a/problems/box-blur/def.py b/problems/box-blur/def.py index 095aed2..8167792 100644 --- a/problems/box-blur/def.py +++ b/problems/box-blur/def.py @@ -188,6 +188,27 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: flops_per_pixel = kernel_size * kernel_size return height * width * flops_per_pixel + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + height = test_case["height"] + width = test_case["width"] + kernel_size = test_case["kernel_size"] + + # Input image: height*width elements + # Kernel: kernel_size*kernel_size elements (but this is typically small and reused) + # Output: height*width elements + # For memory bandwidth, we count input + output (kernel is small and cached) + dtype_bytes = 4 # 4 bytes per float32 element + return (height * width + height * width) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/conv-1d/def.py b/problems/conv-1d/def.py index cb655c9..ff5bd98 100644 --- a/problems/conv-1d/def.py +++ b/problems/conv-1d/def.py @@ -175,6 +175,25 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: return N * flops_per_element + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + N = test_case["signal_size"] + K = test_case["kernel_size"] + + # Input signal: N elements + # Kernel: K elements (typically small and reused, but counted for completeness) + # Output: N elements (same size as input due to padding) + dtype_bytes = 4 # 4 bytes per float32 element + return (N + K + N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/conv-2d/def.py b/problems/conv-2d/def.py index ee68f5a..42d2a21 100644 --- a/problems/conv-2d/def.py +++ b/problems/conv-2d/def.py @@ -189,6 +189,27 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # This is slightly different from our detailed calculation but aligns with the test code return 2 * H * W * Kh * Kw + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + H = test_case["height"] + W = test_case["width"] + Kh = test_case["kernel_height"] + Kw = test_case["kernel_width"] + + # Input image: H*W elements + # Kernel: Kh*Kw elements (typically small and reused) + # Output: H*W elements (same size as input due to padding) + dtype_bytes = 4 # 4 bytes per float32 element + return (H * W + Kh * Kw + H * W) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/conv-square-3d/def.py b/problems/conv-square-3d/def.py index 449583c..4b3fe1c 100644 --- a/problems/conv-square-3d/def.py +++ b/problems/conv-square-3d/def.py @@ -175,6 +175,25 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # Following similar convention as 2D case, we use 2*size^3*K^3 return 2 * size * size * size * K * K * K + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. 
Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + size = test_case["size"] # D=H=W + K = test_case["kernel_size"] + + # Input volume: size^3 elements + # Kernel: K^3 elements (typically small and reused) + # Output: size^3 elements (same size as input due to padding) + dtype_bytes = 4 # 4 bytes per float32 element + return (size * size * size + K * K * K + size * size * size) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/conv2d-relu-hardswish/def.py b/problems/conv2d-relu-hardswish/def.py index 0e9024f..e7a7957 100644 --- a/problems/conv2d-relu-hardswish/def.py +++ b/problems/conv2d-relu-hardswish/def.py @@ -191,6 +191,39 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: return conv_flops + relu_flops + hardswish_flops + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + H = test_case["height"] + W = test_case["width"] + Kh = test_case["kernel_height"] + Kw = test_case["kernel_width"] + + # Naive conv2d-relu-hardswish: + # 1. Read input → H*W + # 2. Read kernel → Kh*Kw + # 3. Write conv_output → H*W (materialized) + # 4. Read conv_output → H*W + # 5. Write relu_output → H*W (materialized) + # 6. Read relu_output → H*W + # 7. Write hardswish_output → H*W + + dtype_bytes = 4 # 4 bytes per float32 element + return (H * W + # read input + Kh * Kw + # read kernel + H * W + # write conv_output + H * W + # read conv_output + H * W + # write relu_output + H * W + # read relu_output + H * W) * dtype_bytes # write hardswish_output + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/cosine-similarity/def.py b/problems/cosine-similarity/def.py index fa9ea55..f86fb08 100644 --- a/problems/cosine-similarity/def.py +++ b/problems/cosine-similarity/def.py @@ -173,6 +173,43 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # Total per vector pair: approximately 5*D + 3 FLOPs return N * (5 * D + 3) + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + N = test_case["n"] + D = test_case["d"] + + # Naive cosine similarity: + # 1. Read predictions → N*D + # 2. Read targets → N*D + # 3. Write dot_product = predictions · targets → N (materialized) + # 4. Read predictions → N*D (for norm) + # 5. Write norm_pred = ||predictions|| → N (materialized) + # 6. Read targets → N*D (for norm) + # 7. Write norm_targ = ||targets|| → N (materialized) + # 8. Read dot_product, norm_pred, norm_targ → 3*N + # 9. Write similarity = dot / (norm_pred * norm_targ) → N + # 10. 
Read similarity → N (if needed for output) + + dtype_bytes = 4 # 4 bytes per float32 element + return (2 * N * D + # read predictions, targets (first time) + N * D + # read predictions (for norm) + N * D + # read targets (for norm) + N + # write dot_product + N + # read dot_product + N + # write norm_pred + N + # read norm_pred + N + # write norm_targ + N + # read norm_targ + N) * dtype_bytes # write similarity (output) + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/cumprod/def.py b/problems/cumprod/def.py index 5538ebc..07df3d3 100644 --- a/problems/cumprod/def.py +++ b/problems/cumprod/def.py @@ -143,6 +143,22 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: N = test_case["size"] return N - 1 + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + N = test_case["size"] + + # Input: N elements, Output: N elements (same size) + dtype_bytes = 4 # 4 bytes per float32 element + return (N + N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/cumsum/def.py b/problems/cumsum/def.py index db3d1c1..00ed29e 100644 --- a/problems/cumsum/def.py +++ b/problems/cumsum/def.py @@ -141,6 +141,22 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: N = test_case["size"] return N - 1 + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + N = test_case["size"] + + # Input: N elements, Output: N elements (same size) + dtype_bytes = 4 # 4 bytes per float32 element + return (N + N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/diagonal-matmul/def.py b/problems/diagonal-matmul/def.py index 685c30f..5ca8134 100644 --- a/problems/diagonal-matmul/def.py +++ b/problems/diagonal-matmul/def.py @@ -144,6 +144,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: N, M = test_case["dims"] return N * M + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + N, M = test_case["dims"] + + # Input: diagonal vector (N) + matrix B (N*M) + # Output: matrix C (N*M) + dtype_bytes = 4 # 4 bytes per float32 element + return (N + N * M + N * M) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/ecc-point-negation/def.py b/problems/ecc-point-negation/def.py index 66e4e06..9a365b5 100644 --- a/problems/ecc-point-negation/def.py +++ b/problems/ecc-point-negation/def.py @@ -171,6 +171,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: N = test_case["dims"][0] return 2 * N + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. 
Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + N = test_case["dims"][0] + + # Input: xs (N) + ys (N) - both uint64_t (8 bytes) + # Output: out_xy (2*N) - uint64_t (8 bytes) + dtype_bytes = 8 # 8 bytes per uint64_t element + return (N + N + 2 * N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Provide N as the trailing scalar param to the CUDA solution. diff --git a/problems/edge-detect/def.py b/problems/edge-detect/def.py index ba62162..f62f4ac 100644 --- a/problems/edge-detect/def.py +++ b/problems/edge-detect/def.py @@ -184,6 +184,24 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: interior_pixels = (height - 2) * (width - 2) return interior_pixels * 10 + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + height = test_case["height"] + width = test_case["width"] + + # Input image: height*width elements + # Output image: height*width elements (same size) + dtype_bytes = 4 # 4 bytes per float32 element + return (height * width + height * width) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/elu/def.py b/problems/elu/def.py index dc494e3..0488928 100644 --- a/problems/elu/def.py +++ b/problems/elu/def.py @@ -162,6 +162,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # - We approximate this as 3 FLOPs per element (worst case) return 3 * M * N + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + M = test_case["rows"] + N = test_case["cols"] + + # Input: M*N elements, Output: M*N elements (same size) + dtype_bytes = 4 # 4 bytes per float32 element + return (M * N + M * N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/frobenius-norm/def.py b/problems/frobenius-norm/def.py index 140720d..c3c5ed6 100644 --- a/problems/frobenius-norm/def.py +++ b/problems/frobenius-norm/def.py @@ -192,6 +192,25 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: return int(total_flops) + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + shape = test_case["shape"] + total_elements = 1 + for dim in shape: + total_elements *= dim + + # Input: total_elements, Output: total_elements (same shape) + dtype_bytes = 4 # 4 bytes per float32 element + return (total_elements + total_elements) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/gelu/def.py b/problems/gelu/def.py index 21713f6..559611a 100644 --- a/problems/gelu/def.py +++ b/problems/gelu/def.py @@ -167,6 +167,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: flops_per_element = 24 return M * N * flops_per_element + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. 
Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + M = test_case["rows"] + N = test_case["cols"] + + # Input: M*N elements, Output: M*N elements (same size) + dtype_bytes = 4 # 4 bytes per float32 element + return (M * N + M * N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/gemm-multiply-leakyrelu/def.py b/problems/gemm-multiply-leakyrelu/def.py index 33dc42b..c906950 100644 --- a/problems/gemm-multiply-leakyrelu/def.py +++ b/problems/gemm-multiply-leakyrelu/def.py @@ -197,6 +197,38 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: leaky_relu_flops = M * N return gemm_flops + multiply_flops + leaky_relu_flops + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + M, N, K = test_case["dims"] + + # Naive gemm-multiply-leakyrelu: + # 1. Read A → M*K + # 2. Read B → K*N + # 3. Write gemm_output → M*N (materialized) + # 4. Read gemm_output → M*N + # 5. Read C → M*N + # 6. Write multiply_output → M*N (materialized) + # 7. Read multiply_output → M*N + # 8. Write leakyrelu_output → M*N + + dtype_bytes = 4 # 4 bytes per float32 element + return (M * K + # read A + K * N + # read B + M * N + # write gemm_output + M * N + # read gemm_output + M * N + # read C + M * N + # write multiply_output + M * N + # read multiply_output + M * N) * dtype_bytes # write leakyrelu_output + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/gemm-relu/def.py b/problems/gemm-relu/def.py index b61e691..250027c 100644 --- a/problems/gemm-relu/def.py +++ b/problems/gemm-relu/def.py @@ -195,6 +195,36 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # - Each of the B*M output elements requires 1 addition return 2 * B * N * M + B * M + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + B = test_case["batch_size"] + N = test_case["in_features"] + M = test_case["out_features"] + + # Naive gemm-relu: + # 1. Read input → B*N + # 2. Read weights → M*N + # 3. Read bias → M + # 4. Write matmul_output → B*M (materialized) + # 5. Read matmul_output → B*M + # 6. Write relu_output → B*M + + dtype_bytes = 4 # 4 bytes per float32 element + return (B * N + # read input + M * N + # read weights + M + # read bias + B * M + # write matmul_output + B * M + # read matmul_output + B * M) * dtype_bytes # write relu_output + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/grayscale/def.py b/problems/grayscale/def.py index 8782ad5..c99fd50 100644 --- a/problems/grayscale/def.py +++ b/problems/grayscale/def.py @@ -164,6 +164,24 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # Total: 5 FLOPs per pixel return height * width * 5 + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. 
Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + height = test_case["height"] + width = test_case["width"] + + # Input: RGB image (height*width*3) + # Output: grayscale image (height*width) + dtype_bytes = 4 # 4 bytes per float32 element + return (height * width * 3 + height * width) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/hard-sigmoid/def.py b/problems/hard-sigmoid/def.py index 017013a..a3497e6 100644 --- a/problems/hard-sigmoid/def.py +++ b/problems/hard-sigmoid/def.py @@ -162,6 +162,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # - We count this as 1 FLOP per element as per the test case return M * N + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + M = test_case["rows"] + N = test_case["cols"] + + # Input: M*N elements, Output: M*N elements (same size) + dtype_bytes = 4 # 4 bytes per float32 element + return (M * N + M * N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/hinge-loss/def.py b/problems/hinge-loss/def.py index 38c10a2..13139f2 100644 --- a/problems/hinge-loss/def.py +++ b/problems/hinge-loss/def.py @@ -153,6 +153,37 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # 3. max(0, sub) (1 FLOP, comparison) return N * 3 + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + N = test_case["n"] + + # Naive hinge loss: + # 1. Read predictions → N + # 2. Read targets → N + # 3. Write mult = predictions * targets → N (materialized) + # 4. Read mult → N + # 5. Write sub = 1 - mult → N (materialized) + # 6. Read sub → N + # 7. Write max(0, sub) → N + # 8. Read max → N + # The output is the element-wise loss of size N, + # so no scalar sum is materialized. + # Total: 8N element-moves (5 reads + 3 writes) + + dtype_bytes = 4 # 4 bytes per float32 element + return (N + N + # read predictions, targets + N + N + # write and read mult + N + N + # write and read sub + N + N) * dtype_bytes # write and read max (output) + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/histogram/def.py b/problems/histogram/def.py index f04534c..ca70b37 100644 --- a/problems/histogram/def.py +++ b/problems/histogram/def.py @@ -163,6 +163,34 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # Total: 4 operations per pixel return height * width * 4 + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + height = test_case["height"] + width = test_case["width"] + num_bins = test_case["num_bins"] + + # Naive histogram: + # 1. Read input image → height*width + # 2. Read histogram bins (for each pixel, read current bin value) → height*width (reads to bins) + # 3. Write histogram bins (increment) → height*width (writes to bins) + # 4. 
Read histogram bins (final read) → num_bins + # 5. Write output histogram → num_bins + + dtype_bytes = 4 # 4 bytes per float32 element + return (height * width + # read input + height * width + # read bins during increment + height * width + # write bins during increment + num_bins + # read final bins + num_bins) * dtype_bytes # write output + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/huber-loss/def.py b/problems/huber-loss/def.py index a44b482..cc529d0 100644 --- a/problems/huber-loss/def.py +++ b/problems/huber-loss/def.py @@ -164,6 +164,34 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # Conservatively estimate as 5 FLOPs per element. return N * 5 + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + N = test_case["n"] + + # Naive huber loss: + # 1. Read predictions → N + # 2. Read targets → N + # 3. Write diff = predictions - targets → N (materialized) + # 4. Read diff → N + # 5. Write abs_diff = |diff| → N (materialized) + # 6. Read abs_diff → N + # 7. Write loss (based on condition) → N + # 8. Read loss → N (if needed) + + dtype_bytes = 4 # 4 bytes per float32 element + return (N + N + # read predictions, targets + N + N + # write and read diff + N + N + # write and read abs_diff + N + N) * dtype_bytes # write and read loss (output) + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/kl-loss/def.py b/problems/kl-loss/def.py index 5f6313d..19eaee3 100644 --- a/problems/kl-loss/def.py +++ b/problems/kl-loss/def.py @@ -199,6 +199,41 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # Conservatively estimate as 7 FLOPs per element return N * 7 + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + N = test_case["n"] + + # Naive KL divergence loss: + # 1. Read predictions → N + # 2. Read targets → N + # 3. Write pred_eps = predictions + eps → N (materialized) + # 4. Write targ_eps = targets + eps → N (materialized) + # 5. Read pred_eps → N + # 6. Write log_pred = log(pred_eps) → N (materialized) + # 7. Read targ_eps → N + # 8. Write log_targ = log(targ_eps) → N (materialized) + # 9. Read log_pred, log_targ → 2*N + # 10. Write log_diff = log_targ - log_pred → N (materialized) + # 11. Read log_diff, targets → 2*N + # 12. Write loss = targets * log_diff → N + + dtype_bytes = 4 # 4 bytes per float32 element + return (N + N + # read predictions, targets + N + N + # write pred_eps, targ_eps + N + N + # read pred_eps, write log_pred + N + N + # read targ_eps, write log_targ + N + N + # read log_pred, log_targ + N + N + # write and read log_diff + N + N) * dtype_bytes # read targets, write loss (output) + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. 
diff --git a/problems/l1-norm/def.py b/problems/l1-norm/def.py index 0f70c6f..2ecdfb3 100644 --- a/problems/l1-norm/def.py +++ b/problems/l1-norm/def.py @@ -191,6 +191,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: return int(total_flops) # Return as integer + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + B = test_case["B"] + D = test_case["D"] + + # Input: B*D elements, Output: B*D elements (same shape) + dtype_bytes = 4 # 4 bytes per float32 element + return (B * D + B * D) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters (dimensions) to pass to the CUDA solution. diff --git a/problems/l2-norm/def.py b/problems/l2-norm/def.py index 8fd8773..13e52d9 100644 --- a/problems/l2-norm/def.py +++ b/problems/l2-norm/def.py @@ -192,6 +192,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: return int(total_flops) # Return as integer + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + B = test_case["B"] + D = test_case["D"] + + # Input: B*D elements, Output: B*D elements (same shape) + dtype_bytes = 4 # 4 bytes per float32 element + return (B * D + B * D) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters (dimensions) to pass to the CUDA solution. diff --git a/problems/layer-norm/def.py b/problems/layer-norm/def.py index ed69714..ae64132 100644 --- a/problems/layer-norm/def.py +++ b/problems/layer-norm/def.py @@ -218,6 +218,44 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: return int(total_flops) # Return as integer + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + B = test_case["B"] + F = test_case["F"] + D1 = test_case["D1"] + D2 = test_case["D2"] + + total_elements = B * F * D1 * D2 + param_elements = F * D1 * D2 + + # Naive layer normalization per batch item: + # 1. Read x to compute mean → B*F*D1*D2 + # 2. Write mean → B elements + # 3. Read x to compute variance → B*F*D1*D2 + # 4. Write variance → B elements + # 5. Read x to normalize → B*F*D1*D2 + # 6. Read mean → B elements + # 7. Read variance → B elements + # 8. Read gamma → F*D1*D2 + # 9. Read beta → F*D1*D2 + # 10. Write output → B*F*D1*D2 + + dtype_bytes = 4 # 4 bytes per float32 element + return (3 * total_elements + # 3 reads of x + 2 * B + # mean write + variance write + 2 * B + # mean read + variance read + param_elements + # gamma read + param_elements + # beta read + total_elements) * dtype_bytes # output write + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters (dimensions) to pass to the CUDA solution. diff --git a/problems/leaky-relu/def.py b/problems/leaky-relu/def.py index b02cd49..f9ab084 100644 --- a/problems/leaky-relu/def.py +++ b/problems/leaky-relu/def.py @@ -167,6 +167,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # - We count this as 1 FLOP per element as per the test case return M * N + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. 
Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + M = test_case["rows"] + N = test_case["cols"] + + # Input: M*N elements, Output: M*N elements (same size) + dtype_bytes = 4 # 4 bytes per float32 element + return (M * N + M * N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/lower-trig-matmul/def.py b/problems/lower-trig-matmul/def.py index 057ade9..1e2ed52 100644 --- a/problems/lower-trig-matmul/def.py +++ b/problems/lower-trig-matmul/def.py @@ -172,6 +172,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: flops = N * (N + 1) * (N + 2) // 3 return flops + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + N = test_case["dims"][0] + + # Input: A (N*N) + B (N*N) - both lower triangular but stored as full matrices + # Output: C (N*N) - lower triangular but stored as full matrix + dtype_bytes = 4 # 4 bytes per float32 element + return (N * N + N * N + N * N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/matmul-3d/def.py b/problems/matmul-3d/def.py index f14d1c2..0da32e7 100644 --- a/problems/matmul-3d/def.py +++ b/problems/matmul-3d/def.py @@ -158,6 +158,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: N, M, K, L = test_case["dims"] return 2 * N * M * K * L + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + N, M, K, L = test_case["dims"] + + # Input: A (N*M*K) + B (K*L) + # Output: C (N*M*L) + dtype_bytes = 4 # 4 bytes per float32 element + return (N * M * K + K * L + N * M * L) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/matmul-4d/def.py b/problems/matmul-4d/def.py index 7b24129..94487ff 100644 --- a/problems/matmul-4d/def.py +++ b/problems/matmul-4d/def.py @@ -159,6 +159,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: b, i, j, l, k = test_case["dims"] return 2 * b * i * j * l * k + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + b, i, j, l, k = test_case["dims"] + + # Input: A (b*i*j*l) + B (l*k) + # Output: C (b*i*j*k) + dtype_bytes = 4 # 4 bytes per float32 element + return (b * i * j * l + l * k + b * i * j * k) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/matmul-sigmoid-sum/def.py b/problems/matmul-sigmoid-sum/def.py index 45225eb..9d9426c 100644 --- a/problems/matmul-sigmoid-sum/def.py +++ b/problems/matmul-sigmoid-sum/def.py @@ -164,6 +164,36 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: sum_flops = M * N - 1 return matmul_flops + sigmoid_flops + sum_flops + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. 
Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + M, N, K = test_case["dims"] + + # Naive matmul-sigmoid-sum: + # 1. Read A → M*K + # 2. Read B → K*N + # 3. Write matmul_output → M*N (materialized) + # 4. Read matmul_output → M*N + # 5. Write sigmoid_output → M*N (materialized) + # 6. Read sigmoid_output → M*N + # 7. Write sum → 1 + + dtype_bytes = 4 # 4 bytes per float32 element + return (M * K + # read A + K * N + # read B + M * N + # write matmul_output + M * N + # read matmul_output + M * N + # write sigmoid_output + M * N + # read sigmoid_output + 1) * dtype_bytes # write sum (output) + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/matmul-swish-scaling/def.py b/problems/matmul-swish-scaling/def.py index d5c22ed..ef79a2e 100644 --- a/problems/matmul-swish-scaling/def.py +++ b/problems/matmul-swish-scaling/def.py @@ -195,6 +195,36 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: scaling_flops = M * N return matmul_flops + swish_flops + scaling_flops + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + M, N, K = test_case["dims"] + + # Naive matmul-swish-scaling: + # 1. Read A → M*K + # 2. Read B → K*N + # 3. Write matmul_output → M*N (materialized) + # 4. Read matmul_output → M*N + # 5. Write swish_output → M*N (materialized) + # 6. Read swish_output → M*N + # 7. Write scaled_output → M*N + + dtype_bytes = 4 # 4 bytes per float32 element + return (M * K + # read A + K * N + # read B + M * N + # write matmul_output + M * N + # read matmul_output + M * N + # write swish_output + M * N + # read swish_output + M * N) * dtype_bytes # write scaled_output + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/matmul-swish/def.py b/problems/matmul-swish/def.py index 8accc19..1cf96ab 100644 --- a/problems/matmul-swish/def.py +++ b/problems/matmul-swish/def.py @@ -191,6 +191,42 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # Total FLOPs for all elements return B * O * flops_per_element + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + B = test_case["batch_size"] + I = test_case["in_features"] + O = test_case["out_features"] + + # Naive matmul-swish: + # 1. Read input → B*I + # 2. Read weight → O*I + # 3. Read bias → O + # 4. Write matmul_output → B*O (materialized) + # 5. Read matmul_output → B*O + # 6. Write sigmoid_output → B*O (materialized) + # 7. Read sigmoid_output → B*O + # 8. Read matmul_output → B*O (for swish multiply) + # 9. Write swish_output → B*O + + dtype_bytes = 4 # 4 bytes per float32 element + return (B * I + # read input + O * I + # read weight + O + # read bias + B * O + # write matmul_output + B * O + # read matmul_output + B * O + # write sigmoid_output + B * O + # read sigmoid_output + B * O + # read matmul_output (again) + B * O) * dtype_bytes # write swish_output + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. 
diff --git a/problems/matrix-multiplication/def.py b/problems/matrix-multiplication/def.py index d1a7732..6ae27f8 100644 --- a/problems/matrix-multiplication/def.py +++ b/problems/matrix-multiplication/def.py @@ -168,6 +168,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: M, N, K = test_case["dims"] return 2 * M * N * K + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + M, N, K = test_case["dims"] + + # Input: A (M*K) + B (K*N) + # Output: C (M*N) + dtype_bytes = 4 # 4 bytes per float32 element + return (M * K + K * N + M * N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/matrix-power/def.py b/problems/matrix-power/def.py index 4f31548..7724e15 100644 --- a/problems/matrix-power/def.py +++ b/problems/matrix-power/def.py @@ -169,6 +169,24 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: return (power - 1) * 2 * (N ** 3) + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + N = test_case["size"] + + # Input: matrix (N*N) + # Output: matrix^n (N*N) + # Note: Intermediate results may be stored, but we count input + final output + dtype_bytes = 4 # 4 bytes per float32 element + return (N * N + N * N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/matrix-scalar/def.py b/problems/matrix-scalar/def.py index 65e5179..4cd8c57 100644 --- a/problems/matrix-scalar/def.py +++ b/problems/matrix-scalar/def.py @@ -153,6 +153,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # - There are N*N elements in the matrix return N * N + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + N = test_case["size"] + + # Input: matrix (N*N) + # Output: matrix (N*N) (same size) + dtype_bytes = 4 # 4 bytes per float32 element + return (N * N + N * N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/matrix-vector/def.py b/problems/matrix-vector/def.py index d9af637..d6f04d6 100644 --- a/problems/matrix-vector/def.py +++ b/problems/matrix-vector/def.py @@ -156,6 +156,24 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # - Each MAD (Multiply-Add) counts as 2 FLOPs return M * K * 2 + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + M = test_case["rows"] + K = test_case["cols"] + + # Input: matrix (M*K) + vector (K) + # Output: vector (M) + dtype_bytes = 4 # 4 bytes per float32 element + return (M * K + K + M) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. 
diff --git a/problems/max-dim/def.py b/problems/max-dim/def.py index 30ef85a..c7390b7 100644 --- a/problems/max-dim/def.py +++ b/problems/max-dim/def.py @@ -169,6 +169,31 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # Each reduction requires (reduce_size - 1) comparisons return num_reductions * (reduce_size - 1) + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + shape = test_case["shape"] + dim = test_case["dim"] + + # Total elements in the input tensor + total_elements = 1 + for s in shape: + total_elements *= s + + # Output tensor has reduced dimension (one less dimension, but keepdim=True) + reduce_size = shape[dim] + output_elements = total_elements // reduce_size + + dtype_bytes = 4 # 4 bytes per float32 element + return (total_elements + output_elements) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/max-pool-1d/def.py b/problems/max-pool-1d/def.py index 1f284bc..69e5bba 100644 --- a/problems/max-pool-1d/def.py +++ b/problems/max-pool-1d/def.py @@ -189,6 +189,29 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # Total FLOPs (comparisons) for the entire output return H_out * comparisons_per_output + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + H = test_case["size"] + K = test_case["kernel_size"] + S = test_case["stride"] + P = test_case["padding"] + D = test_case["dilation"] + + # Calculate output dimensions + H_out = ((H + 2 * P - D * (K - 1) - 1) // S) + 1 + + # Input: H elements, Output: H_out elements + dtype_bytes = 4 # 4 bytes per float32 element + return (H + H_out) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/max-pool-2d/def.py b/problems/max-pool-2d/def.py index 55bfbf6..f91a24a 100644 --- a/problems/max-pool-2d/def.py +++ b/problems/max-pool-2d/def.py @@ -196,6 +196,30 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # Total FLOPs (comparisons) for the entire output return H_out * W_out * comparisons_per_output + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + H = test_case["height"] + W = test_case["width"] + K = test_case["kernel_size"] + S = test_case["stride"] + P = test_case["padding"] + + # Calculate output dimensions + H_out = ((H + 2 * P - K) // S) + 1 + W_out = ((W + 2 * P - K) // S) + 1 + + # Input: H*W elements, Output: H_out*W_out elements + dtype_bytes = 4 # 4 bytes per float32 element + return (H * W + H_out * W_out) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. 
diff --git a/problems/max-pool-3d/def.py b/problems/max-pool-3d/def.py index 013e019..9f44670 100644 --- a/problems/max-pool-3d/def.py +++ b/problems/max-pool-3d/def.py @@ -201,6 +201,32 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # Total FLOPs (comparisons) for the entire output return H_out * W_out * D_out * comparisons_per_output + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + H = test_case["height"] + W = test_case["width"] + D = test_case["depth"] + K = test_case["kernel_size"] + S = test_case["stride"] + P = test_case["padding"] + + # Calculate output dimensions + H_out = ((H + 2 * P - K) // S) + 1 + W_out = ((W + 2 * P - K) // S) + 1 + D_out = ((D + 2 * P - K) // S) + 1 + + # Input: H*W*D elements, Output: H_out*W_out*D_out elements + dtype_bytes = 4 # 4 bytes per float32 element + return (H * W * D + H_out * W_out * D_out) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/mean-dim/def.py b/problems/mean-dim/def.py index 4f67be1..8317fc5 100644 --- a/problems/mean-dim/def.py +++ b/problems/mean-dim/def.py @@ -170,6 +170,31 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # Each mean requires (reduce_size - 1) additions and 1 division return num_outputs * (reduce_size) + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + shape = test_case["shape"] + dim = test_case["dim"] + + # Total elements in the input tensor + total_elements = 1 + for s in shape: + total_elements *= s + + # Output tensor has reduced dimension (one less dimension, but keepdim=True) + reduce_size = shape[dim] + output_elements = total_elements // reduce_size + + dtype_bytes = 4 # 4 bytes per float32 element + return (total_elements + output_elements) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/min-dim/def.py b/problems/min-dim/def.py index c08d613..7d85f7c 100644 --- a/problems/min-dim/def.py +++ b/problems/min-dim/def.py @@ -170,6 +170,31 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # Each reduction requires (reduce_size - 1) comparisons return num_reductions * (reduce_size - 1) + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + shape = test_case["shape"] + dim = test_case["dim"] + + # Total elements in the input tensor + total_elements = 1 + for s in shape: + total_elements *= s + + # Output tensor has reduced dimension (one less dimension, but keepdim=True) + reduce_size = shape[dim] + output_elements = total_elements // reduce_size + + dtype_bytes = 4 # 4 bytes per float32 element + return (total_elements + output_elements) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. 
diff --git a/problems/min-spanning-tree/def.py b/problems/min-spanning-tree/def.py index c02a199..10aec70 100644 --- a/problems/min-spanning-tree/def.py +++ b/problems/min-spanning-tree/def.py @@ -186,19 +186,6 @@ def get_function_signature(self) -> Dict[str, Any]: "restype": None } - # def get_flops(self, test_case: Dict[str, Any]) -> int: - # """ - # Get the number of floating point operations for the problem. - - # Args: - # test_case: The test case dictionary - - # Returns: - # Number of floating point operations - # """ - # N = test_case["dims"][0] - # return N * N * 2 - def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/mse-loss/def.py b/problems/mse-loss/def.py index 6e8c915..37f1463 100644 --- a/problems/mse-loss/def.py +++ b/problems/mse-loss/def.py @@ -157,6 +157,40 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: return flops + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + shape = test_case["shape"] + + # Total elements in the tensor + total_elements = 1 + for s in shape: + total_elements *= s + + # Naive MSE loss: + # 1. Read predictions → total_elements + # 2. Read targets → total_elements + # 3. Write diff = predictions - targets → total_elements (materialized) + # 4. Read diff → total_elements + # 5. Write squared = diff^2 → total_elements (materialized) + # 6. Read squared → total_elements + # 7. Write sum → 1 (materialized) + # 8. Read sum → 1 + # 9. Write mean → 1 + + dtype_bytes = 4 # 4 bytes per float32 element + return (2 * total_elements + # read predictions, targets + 2 * total_elements + # write and read diff + 2 * total_elements + # write and read squared + 3) * dtype_bytes # write sum, read sum, write mean + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/poly-multiply-ff/def.py b/problems/poly-multiply-ff/def.py index adcedeb..aa336e2 100644 --- a/problems/poly-multiply-ff/def.py +++ b/problems/poly-multiply-ff/def.py @@ -122,5 +122,22 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: n = test_case["dims"][0] return 2 * n * n # one mul + one add per term + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + n = test_case["dims"][0] + + # Input: A (n) + B (n) - both uint32 + # Output: C (2*n - 1) - uint32 + dtype_bytes = 4 # 4 bytes per uint32 element + return (n + n + (2 * n - 1)) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: return [test_case["dims"][0]] diff --git a/problems/product-dim/def.py b/problems/product-dim/def.py index fa90222..ffc4660 100644 --- a/problems/product-dim/def.py +++ b/problems/product-dim/def.py @@ -170,6 +170,31 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # Each reduction requires (reduce_size - 1) multiplications return num_reductions * (reduce_size - 1) + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. 
Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + shape = test_case["shape"] + dim = test_case["dim"] + + # Total elements in the input tensor + total_elements = 1 + for s in shape: + total_elements *= s + + # Output tensor has reduced dimension (one less dimension, but keepdim=True) + reduce_size = shape[dim] + output_elements = total_elements // reduce_size + + dtype_bytes = 4 # 4 bytes per float32 element + return (total_elements + output_elements) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/relu/def.py b/problems/relu/def.py index f425d49..2ba4379 100644 --- a/problems/relu/def.py +++ b/problems/relu/def.py @@ -169,6 +169,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # - We count this as 1 FLOP per element as per the test case return M * N + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + M = test_case["rows"] + N = test_case["cols"] + + # Input: M*N elements, Output: M*N elements (same size) + dtype_bytes = 4 # 4 bytes per float32 element + return (M * N + M * N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/rms-norm/def.py b/problems/rms-norm/def.py index 1155d7b..fa749f7 100644 --- a/problems/rms-norm/def.py +++ b/problems/rms-norm/def.py @@ -190,6 +190,35 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: return int(total_flops) + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + shape = test_case["shape"] + batch_size = shape[0] + num_features = shape[1] + + total_elements = batch_size * num_features + + # Naive RMS normalization per batch item: + # 1. Read x to compute RMS (sum of squares) → batch_size*num_features + # 2. Write RMS → batch_size elements + # 3. Read x to normalize → batch_size*num_features + # 4. Read RMS → batch_size elements + # 5. Write output → batch_size*num_features + + dtype_bytes = 4 # 4 bytes per float32 element + return (2 * total_elements + # 2 reads of x + batch_size + # RMS write + batch_size + # RMS read + total_elements) * dtype_bytes # output write + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/running-sum-1d/def.py b/problems/running-sum-1d/def.py index 8fd8b98..c7e87d0 100644 --- a/problems/running-sum-1d/def.py +++ b/problems/running-sum-1d/def.py @@ -169,6 +169,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: return 2*N + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + N = test_case["signal_size"] + + # Input: signal (N elements) + # Output: running sum (N elements, same size due to padding) + dtype_bytes = 4 # 4 bytes per float32 element + return (N + N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. 
diff --git a/problems/scaled-dot-attention/def.py b/problems/scaled-dot-attention/def.py index 8e10c76..659d697 100644 --- a/problems/scaled-dot-attention/def.py +++ b/problems/scaled-dot-attention/def.py @@ -221,6 +221,37 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: return (4 * b * h * s * s * e) + (5 * b * h * s * s) + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + b = test_case["batch"] + h = test_case["heads"] + s = test_case["seq_len"] + e = test_case["embed_dim"] + + # Naive scaled dot-product attention: + # 1. Read Q, K for QK^T → 2 * b*h*s*e + # 2. Write QK^T → b*h*s*s + # 3. Read QK^T (scaling) → b*h*s*s + # 4. Write scaled → b*h*s*s + # 5. Softmax: read scaled (max) → b*h*s*s, read scaled (exp) → b*h*s*s, write exp → b*h*s*s, + # read exp (norm) → b*h*s*s, write softmax → b*h*s*s = 5 * b*h*s*s + # 6. Read softmax, V for matmul → b*h*s*s + b*h*s*e + # 7. Write output → b*h*s*e + + dtype_bytes = 4 # 4 bytes per float32 element + return (2 * b * h * s * e + # Q, K reads + 9 * b * h * s * s + # QK^T, scaled, softmax intermediates (1+1+1+5+1) + b * h * s * e + # V read + b * h * s * e) * dtype_bytes # output write + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/selu/def.py b/problems/selu/def.py index 72f61ef..902e6ba 100644 --- a/problems/selu/def.py +++ b/problems/selu/def.py @@ -164,6 +164,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # - We count this as 3 FLOPs per element for the SELU calculation return M * N * 3 + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + M = test_case["rows"] + N = test_case["cols"] + + # Input: M*N elements, Output: M*N elements (same size) + dtype_bytes = 4 # 4 bytes per float32 element + return (M * N + M * N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/sigmoid/def.py b/problems/sigmoid/def.py index b5d4681..7da4601 100644 --- a/problems/sigmoid/def.py +++ b/problems/sigmoid/def.py @@ -161,6 +161,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # - We count this as 1 FLOP per element as per the test case return M * N + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + M = test_case["rows"] + N = test_case["cols"] + + # Input: M*N elements, Output: M*N elements (same size) + dtype_bytes = 4 # 4 bytes per float32 element + return (M * N + M * N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. 
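The grouped return value in the scaled-dot-attention estimate is easiest to audit when each enumerated step gets its own term. A short, purely illustrative restatement of the same arithmetic (the function name is not part of the problem API):

def naive_attention_mem_terms(b, h, s, e, dtype_bytes=4):
    terms = [
        2 * b * h * s * e,  # read Q and K for QK^T
        b * h * s * s,      # write QK^T
        b * h * s * s,      # read QK^T for scaling
        b * h * s * s,      # write scaled scores
        5 * b * h * s * s,  # softmax: read scaled twice, write exp, read exp, write softmax
        b * h * s * s,      # read softmax for the attention-times-V matmul
        b * h * s * e,      # read V
        b * h * s * e,      # write output
    ]
    return sum(terms) * dtype_bytes

Summing the s*s terms gives the 9 * b*h*s*s factor used in get_mem above.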
diff --git a/problems/soft-plus/def.py b/problems/soft-plus/def.py index 17032dc..747d4ae 100644 --- a/problems/soft-plus/def.py +++ b/problems/soft-plus/def.py @@ -159,6 +159,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # - We count this as 3 FLOPs per element for the Softplus calculation (exp, add, log) return M * N * 3 + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + M = test_case["rows"] + N = test_case["cols"] + + # Input: M*N elements, Output: M*N elements (same size) + dtype_bytes = 4 # 4 bytes per float32 element + return (M * N + M * N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/softmax/def.py b/problems/softmax/def.py index 0f5f39d..398ba75 100644 --- a/problems/softmax/def.py +++ b/problems/softmax/def.py @@ -191,6 +191,34 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: return num_slices * flops_per_slice + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + shape = test_case["shape"] + + # Total elements in the input tensor + total_elements = 1 + for s in shape: + total_elements *= s + + # Naive stable softmax that materializes the exponential buffer: + # 1. Read x to compute max → N + # 2. Read x to compute exp and sum → N + # 3. Write e = exp(x-m) → N + # 4. Read e to normalize → N + # 5. Write y → N + # Total: 5N element-moves + + dtype_bytes = 4 # 4 bytes per float32 element + return 5 * total_elements * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/square-matmul/def.py b/problems/square-matmul/def.py index ddcadce..360edc2 100644 --- a/problems/square-matmul/def.py +++ b/problems/square-matmul/def.py @@ -151,6 +151,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # - There are N*N output elements return N * N * N * 2 + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + N = test_case["size"] + + # Input: A (N*N) + B (N*N) + # Output: C (N*N) + dtype_bytes = 4 # 4 bytes per float32 element + return (N * N + N * N + N * N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/sum-dim/def.py b/problems/sum-dim/def.py index 764179b..3b97713 100644 --- a/problems/sum-dim/def.py +++ b/problems/sum-dim/def.py @@ -168,6 +168,31 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # Each reduction requires (reduce_size - 1) additions return num_reductions * (reduce_size - 1) + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. 
Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + shape = test_case["shape"] + dim = test_case["dim"] + + # Total elements in the input tensor + total_elements = 1 + for s in shape: + total_elements *= s + + # Output tensor has reduced dimension (one less dimension, but keepdim=True) + reduce_size = shape[dim] + output_elements = total_elements // reduce_size + + dtype_bytes = 4 # 4 bytes per float32 element + return (total_elements + output_elements) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/swish/def.py b/problems/swish/def.py index 89dbf8d..325b700 100644 --- a/problems/swish/def.py +++ b/problems/swish/def.py @@ -162,6 +162,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # - We count this as 1 FLOP per element as per the test case return M * N + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + M = test_case["rows"] + N = test_case["cols"] + + # Input: M*N elements, Output: M*N elements (same size) + dtype_bytes = 4 # 4 bytes per float32 element + return (M * N + M * N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/symmetric-matmul/def.py b/problems/symmetric-matmul/def.py index ae8cc8d..324032a 100644 --- a/problems/symmetric-matmul/def.py +++ b/problems/symmetric-matmul/def.py @@ -157,6 +157,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # - There are N*N output elements return N * N * N * 2 + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + N = test_case["size"] + + # Input: A (N*N) + B (N*N) - both symmetric but stored as full matrices + # Output: C (N*N) + dtype_bytes = 4 # 4 bytes per float32 element + return (N * N + N * N + N * N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/tanh/def.py b/problems/tanh/def.py index 2cf35e7..7ac7206 100644 --- a/problems/tanh/def.py +++ b/problems/tanh/def.py @@ -163,6 +163,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # - We count this as 1 FLOP per element as per the test case return M * N + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + M = test_case["rows"] + N = test_case["cols"] + + # Input: M*N elements, Output: M*N elements (same size) + dtype_bytes = 4 # 4 bytes per float32 element + return (M * N + M * N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. 
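One way these per-test-case counts can be combined (a sketch only, not part of the problem definitions) is a FLOPs-per-DRAM-byte ratio, which separates compute-heavy kernels such as square-matmul from bandwidth-bound elementwise ones such as tanh:

def arithmetic_intensity(flops, mem_bytes):
    # FLOPs per DRAM byte, using the get_flops/get_mem conventions above.
    return flops / mem_bytes

N = 4096
matmul = arithmetic_intensity(2 * N * N * N, 3 * N * N * 4)   # ~683 FLOPs/byte
M, C = 4096, 4096
tanh_like = arithmetic_intensity(M * C, 2 * M * C * 4)        # 0.125 FLOPs/byte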
diff --git a/problems/threshold/def.py b/problems/threshold/def.py index 980cae9..565406a 100644 --- a/problems/threshold/def.py +++ b/problems/threshold/def.py @@ -163,6 +163,23 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # Total: 1 FLOP per pixel return height * width + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + height = test_case["height"] + width = test_case["width"] + + # Input: height*width elements, Output: height*width elements (same size) + dtype_bytes = 4 # 4 bytes per float32 element + return (height * width + height * width) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/triplet-margin/def.py b/problems/triplet-margin/def.py index 104d92b..a2ed7a8 100644 --- a/problems/triplet-margin/def.py +++ b/problems/triplet-margin/def.py @@ -201,6 +201,47 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: return int(total_flops) + def get_mem(self, test_case: Dict[str, Any]) -> int: + """ + Get the memory usage for the problem. Assumed to be all in DRAM + + Args: + test_case: The test case dictionary + + Returns: + Memory usage in bytes + """ + batch = test_case["batch"] + embedding_dim = test_case["embedding_dim"] + total_elements = batch * embedding_dim + + # Naive triplet margin loss: + # 1. Read anchor → batch*embedding_dim + # 2. Read positive → batch*embedding_dim + # 3. Write dist_pos → batch (materialized) + # 4. Read anchor → batch*embedding_dim (for negative distance) + # 5. Read negative → batch*embedding_dim + # 6. Write dist_neg → batch (materialized) + # 7. Read dist_pos, dist_neg → 2*batch + # 8. Write margin_loss = dist_pos - dist_neg + margin → batch (materialized) + # 9. Read margin_loss → batch + # 10. Write max(0, margin_loss) → batch + # 11. Read max → batch + # 12. Write sum → 1 + + dtype_bytes = 4 # 4 bytes per float32 element + return (3 * total_elements + # read anchor (twice) + positive + total_elements + # read negative + batch + # write dist_pos + batch + # read dist_pos + batch + # write dist_neg + batch + # read dist_neg + batch + # write margin_loss + batch + # read margin_loss + batch + # write max + batch + # read max + 1) * dtype_bytes # write sum (output) + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/upper-trig-matmul/def.py b/problems/upper-trig-matmul/def.py index 55080ed..13e951a 100644 --- a/problems/upper-trig-matmul/def.py +++ b/problems/upper-trig-matmul/def.py @@ -179,6 +179,14 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: flops = N * (N + 1) * (N + 2) // 3 return flops + def get_mem(self, test_case: Dict[str, Any]) -> int: + N = test_case["dims"][0] + + # Input: A (N*N) + B (N*N) - both upper triangular but stored as full matrices + # Output: C (N*N) - upper triangular but stored as full matrix + dtype_bytes = 4 # 4 bytes per float32 element + return (N * N + N * N + N * N) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. 
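The upper-trig-matmul estimate charges full dense storage for all three matrices, matching the comment that the triangles are stored as full N*N arrays. For comparison only (packed storage is not what the problem uses), a sketch of the dense and packed element counts:

def upper_trig_mem(N, packed=False, dtype_bytes=4):
    # Dense: the problem stores all N*N entries of A, B and C.
    # Packed: only the N*(N+1)//2 upper-triangular entries would be kept.
    elems = N * (N + 1) // 2 if packed else N * N
    return 3 * elems * dtype_bytes

# upper_trig_mem(1024) matches get_mem above; the packed variant is just over half.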
diff --git a/problems/vector-addition/def.py b/problems/vector-addition/def.py index 08bd870..199f030 100644 --- a/problems/vector-addition/def.py +++ b/problems/vector-addition/def.py @@ -134,6 +134,11 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: N = test_case["dims"][0] return N + def get_mem(self, test_case: Dict[str, Any]) -> int: + N = test_case["dims"][0] + dtype_bytes = 4 # 4 bytes per float32 element + return 3 * N * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: """ Get extra parameters to pass to the CUDA solution. diff --git a/problems/vector-multiply-ff/def.py b/problems/vector-multiply-ff/def.py index 157f13d..8e3c647 100644 --- a/problems/vector-multiply-ff/def.py +++ b/problems/vector-multiply-ff/def.py @@ -114,5 +114,13 @@ def get_flops(self, test_case: Dict[str, Any]) -> int: # One multiply + one modular reduction per element (approximate as 2 ops) return 2 * test_case["dims"][0] + def get_mem(self, test_case: Dict[str, Any]) -> int: + n = test_case["dims"][0] + + # Input: A (n) + B (n) - both uint32 + # Output: C (n) - uint32 + dtype_bytes = 4 # 4 bytes per uint32 element + return (n + n + n) * dtype_bytes + def get_extra_params(self, test_case: Dict[str, Any]) -> List[Any]: return [test_case["dims"][0]]
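A small usage sketch for the streaming estimates at the end of the diff, with a hypothetical test case (the dictionary below is illustrative, not taken from the test suite):

vector_add_case = {"dims": [1 << 20]}   # hypothetical 1M-element case
n = vector_add_case["dims"][0]

# vector-addition: read A, read B, write C, all float32.
bytes_moved = 3 * n * 4
assert bytes_moved == 12 * 2**20        # 12 MiB for a 1M-element add

# vector-multiply-ff has the same shape of estimate: (n + n + n) * 4 bytes for
# two uint32 inputs and one uint32 output.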