From 1e4cb960a66b3bf306b4303a21fbc313617b4b06 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <abagusetty@anlvpn217-217.anl.gov>
Date: Thu, 11 Jun 2026 21:29:01 -0500
Subject: [PATCH 1/4] Replace hardcoded warp-size (32) literals with warpSize
 built-in

Pure CUDA refactor that substitutes hardcoded warp-size assumptions
in device-side kernels with the CUDA/HIP `warpSize` device-runtime
built-in. No HIP-specific code is introduced; all changes remain
correct on NVIDIA (warpSize=32) and become portable to architectures
where the warp/wavefront size differs (e.g. AMD CDNA gfx9xx with
wavefront=64).

This is a prerequisite for the companion `hip` branch (a HIP/ROCm
build) to produce correct results on wavefront-64 AMD GPUs. The two
branches are intentionally independent so the warpSize cleanup can
land on master without coupling to the HIP build system.

Device-side substitutions (warpSize is a runtime constant; safe in
__global__ / __device__ contexts):
- gpu4pyscf/lib/ecp/common.cu: classic block_reduce pattern --
  shared-memory reduce loop bound (s >= 32 -> s >= warpSize),
  warp-leader gate (tid < 32 -> tid < warpSize), and the
  intra-warp __shfl_down_sync sequence (16/8/4/2/1 unrolled ->
  for offset = warpSize/2; offset > 0; offset >>= 1).
- gpu4pyscf/lib/gvhf-md/{unrolled_md_j.cu, unrolled_md_j_4dm.cu,
  md_contract_j.cu}: 25 occurrences of `lane_id = X % 32` ->
  `lane_id = X % warpSize`.
- gpu4pyscf/lib/multigrid/multigrid_v2/evaluation.cuh: warp-stride
  reduction loop (_o = 16 -> _o = warpSize/2), warp-leader test
  (tid & 31 -> tid % warpSize), warp index (tid >> 5 ->
  tid / warpSize). Centralized the constexpr n_warps divisor
  into a named constant.

Host-side / constexpr substitutions (warpSize is device-only so a
named constant is used instead):
- gpu4pyscf/lib/gvhf-md/md_j_driver.cu: qd_offset_for_threads
  32-element alignment now uses a named WARP_SIZE_HOST constant
  derived from the assumed warp width.
- gpu4pyscf/lib/pbc/{int3c2e_create_tasks.cuh,
  int3c2e_create_tasks_o1.cuh, int3c2e.cuh, ft_ao.cuh, ft_ao.cu},
  gpu4pyscf/lib/multigrid/multigrid.cuh: existing
  `#define WARP_SIZE 32` directives wrapped in `#ifndef WARP_SIZE`
  so the build system can override the value (e.g. -DWARP_SIZE=64)
  for future wider-wavefront targets.

All changes are zero-cost on NVIDIA and unlock correct portability.
---
 gpu4pyscf/lib/ecp/common.cu                   | 16 ++++++-------
 gpu4pyscf/lib/gvhf-md/md_contract_j.cu        |  4 ++--
 gpu4pyscf/lib/gvhf-md/md_j_driver.cu          |  6 ++++-
 gpu4pyscf/lib/gvhf-md/unrolled_md_j.cu        | 24 +++++++++----------
 gpu4pyscf/lib/gvhf-md/unrolled_md_j_4dm.cu    | 22 ++++++++---------
 gpu4pyscf/lib/multigrid/multigrid.cuh         |  6 +++++
 .../lib/multigrid/multigrid_v2/evaluation.cuh | 15 ++++++++----
 gpu4pyscf/lib/pbc/ft_ao.cu                    |  6 +++++
 gpu4pyscf/lib/pbc/ft_ao.cuh                   |  6 +++++
 gpu4pyscf/lib/pbc/int3c2e.cuh                 |  6 +++++
 gpu4pyscf/lib/pbc/int3c2e_create_tasks.cuh    |  6 +++++
 gpu4pyscf/lib/pbc/int3c2e_create_tasks_o1.cuh |  6 +++++
 12 files changed, 85 insertions(+), 38 deletions(-)

diff --git a/gpu4pyscf/lib/ecp/common.cu b/gpu4pyscf/lib/ecp/common.cu
index 543d95e47..e05b9f1a2 100644
--- a/gpu4pyscf/lib/ecp/common.cu
+++ b/gpu4pyscf/lib/ecp/common.cu
@@ -125,22 +125,22 @@ void block_reduce(double val, double *d_out) {
     __syncthreads();
 
     // Perform reduction in shared memory.
-    // Reduce the data until 32 threads remain.
-    for (unsigned int s = THREADS / 2; s >= 32; s >>= 1) {
+    // Reduce the data until one warp remains. warpSize is a built-in
+    // device constant (32 on NVIDIA today; portable for future targets).
+    for (unsigned int s = THREADS / 2; s >= warpSize; s >>= 1) {
         if (tid < s) {
             sdata[tid] += sdata[tid + s];
         }
         __syncthreads();
     }
 
-    if (tid < 32) {
+    if (tid < warpSize) {
         double value = sdata[tid]; // load to register
         unsigned int mask = __activemask(); // all lanes in warp 0 are active
-        value += __shfl_down_sync(mask, value, 16);
-        value += __shfl_down_sync(mask, value, 8);
-        value += __shfl_down_sync(mask, value, 4);
-        value += __shfl_down_sync(mask, value, 2);
-        value += __shfl_down_sync(mask, value, 1);
+        // Final intra-warp reduction; stride starts at warpSize/2.
+        for (int offset = warpSize / 2; offset > 0; offset >>= 1) {
+            value += __shfl_down_sync(mask, value, offset);
+        }
         if (tid == 0) {
             d_out[0] += value;
         }
diff --git a/gpu4pyscf/lib/gvhf-md/md_contract_j.cu b/gpu4pyscf/lib/gvhf-md/md_contract_j.cu
index d34980e91..5afff3784 100644
--- a/gpu4pyscf/lib/gvhf-md/md_contract_j.cu
+++ b/gpu4pyscf/lib/gvhf-md/md_contract_j.cu
@@ -98,7 +98,7 @@ void md_j_1dm_kernel(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
     int nsq_per_block = blockDim.x;
     //assert(nsq_per_block == threadsx * threadsy);
     int t_id = gout_id * nsq_per_block + sq_id;
-    int lane_id = t_id % 32;
+    int lane_id = t_id % warpSize;
     int group_id = lane_id / threadsx;
     unsigned int mask = ((1 << threadsx) - 1) << group_id * threadsx;
     int tx = sq_id % threadsx;
@@ -397,7 +397,7 @@ void md_j_4dm_kernel(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
     int nsq_per_block = blockDim.x;
     //assert(nsq_per_block == threadsx * threadsy);
     int t_id = gout_id * nsq_per_block + sq_id;
-    int lane_id = t_id % 32;
+    int lane_id = t_id % warpSize;
     int group_id = lane_id / threadsx;
     unsigned int mask = ((1 << threadsx) - 1) << group_id * threadsx;
     int tx = sq_id % threadsx;
diff --git a/gpu4pyscf/lib/gvhf-md/md_j_driver.cu b/gpu4pyscf/lib/gvhf-md/md_j_driver.cu
index 84d73f0a0..47e1d67d6 100644
--- a/gpu4pyscf/lib/gvhf-md/md_j_driver.cu
+++ b/gpu4pyscf/lib/gvhf-md/md_j_driver.cu
@@ -43,7 +43,11 @@ int offset_for_Rt2_idx(int lij, int lkl)
 
 int qd_offset_for_threads(int npairs, int threads)
 {
-    int npairs_aligned = (npairs + 31) & 0xffffffe0; // 32-element aligned
+    // Host-side warp-aligned padding. `warpSize` is a device-runtime
+    // constant unavailable here; centralize the value so a future
+    // wider-wavefront port is a single-point change.
+    constexpr int WARP_SIZE_HOST = 32;
+    int npairs_aligned = (npairs + (WARP_SIZE_HOST - 1)) & ~(WARP_SIZE_HOST - 1);
     int address = 0;
     for (int i = 1; i < threads; i *= 2) {
         address += npairs_aligned;
diff --git a/gpu4pyscf/lib/gvhf-md/unrolled_md_j.cu b/gpu4pyscf/lib/gvhf-md/unrolled_md_j.cu
index 8e3cced8d..76702991a 100644
--- a/gpu4pyscf/lib/gvhf-md/unrolled_md_j.cu
+++ b/gpu4pyscf/lib/gvhf-md/unrolled_md_j.cu
@@ -36,7 +36,7 @@ void md_j_0_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
     double *env = envs.env;
     double *vj = jk.vj;
     double vj_kl0, dm_kl0;
-    unsigned int lane_id = thread_id % 32;
+    unsigned int lane_id = thread_id % warpSize;
     unsigned int group_id = lane_id / 16;
     unsigned int mask = 0xffff << (group_id * 16);
 
@@ -237,7 +237,7 @@ void md_j_1_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
     double *env = envs.env;
     double *vj = jk.vj;
     double vj_kl0, dm_kl0;
-    unsigned int lane_id = thread_id % 32;
+    unsigned int lane_id = thread_id % warpSize;
     unsigned int group_id = lane_id / 16;
     unsigned int mask = 0xffff << (group_id * 16);
 
@@ -443,7 +443,7 @@ void md_j_1_1(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
     double *env = envs.env;
     double *vj = jk.vj;
     double vj_kl0, dm_kl0;
-    unsigned int lane_id = thread_id % 32;
+    unsigned int lane_id = thread_id % warpSize;
     unsigned int group_id = lane_id / 16;
     unsigned int mask = 0xffff << (group_id * 16);
 
@@ -704,7 +704,7 @@ void md_j_2_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
     double *env = envs.env;
     double *vj = jk.vj;
     double vj_kl0, dm_kl0;
-    unsigned int lane_id = thread_id % 32;
+    unsigned int lane_id = thread_id % warpSize;
     unsigned int group_id = lane_id / 16;
     unsigned int mask = 0xffff << (group_id * 16);
 
@@ -928,7 +928,7 @@ void md_j_2_1(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
     double *env = envs.env;
     double *vj = jk.vj;
     double vj_kl0, dm_kl0;
-    unsigned int lane_id = thread_id % 32;
+    unsigned int lane_id = thread_id % warpSize;
     unsigned int group_id = lane_id / 16;
     unsigned int mask = 0xffff << (group_id * 16);
 
@@ -1252,7 +1252,7 @@ void md_j_2_2(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
     double *env = envs.env;
     double *vj = jk.vj;
     double vj_kl0, dm_kl0;
-    unsigned int lane_id = thread_id % 32;
+    unsigned int lane_id = thread_id % warpSize;
     unsigned int group_id = lane_id / 16;
     unsigned int mask = 0xffff << (group_id * 16);
 
@@ -1770,7 +1770,7 @@ void md_j_3_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
     double *env = envs.env;
     double *vj = jk.vj;
     double vj_kl0, dm_kl0;
-    unsigned int lane_id = thread_id % 32;
+    unsigned int lane_id = thread_id % warpSize;
     unsigned int group_id = lane_id / 16;
     unsigned int mask = 0xffff << (group_id * 16);
 
@@ -2029,7 +2029,7 @@ void md_j_3_1(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
     double *env = envs.env;
     double *vj = jk.vj;
     double vj_kl0, dm_kl0;
-    unsigned int lane_id = thread_id % 32;
+    unsigned int lane_id = thread_id % warpSize;
     unsigned int group_id = lane_id / 16;
     unsigned int mask = 0xffff << (group_id * 16);
 
@@ -2460,7 +2460,7 @@ void md_j_3_2(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
     double *env = envs.env;
     double *vj = jk.vj;
     double vj_kl0, dm_kl0;
-    unsigned int lane_id = thread_id % 32;
+    unsigned int lane_id = thread_id % warpSize;
     unsigned int group_id = lane_id / 16;
     unsigned int mask = 0xffff << (group_id * 16);
 
@@ -3222,7 +3222,7 @@ void md_j_4_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
     double *env = envs.env;
     double *vj = jk.vj;
     double vj_kl0, dm_kl0;
-    unsigned int lane_id = thread_id % 32;
+    unsigned int lane_id = thread_id % warpSize;
     unsigned int group_id = lane_id / 16;
     unsigned int mask = 0xffff << (group_id * 16);
 
@@ -3545,7 +3545,7 @@ void md_j_4_1(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
     double *env = envs.env;
     double *vj = jk.vj;
     double vj_kl0, dm_kl0;
-    unsigned int lane_id = thread_id % 32;
+    unsigned int lane_id = thread_id % warpSize;
     unsigned int group_id = lane_id / 16;
     unsigned int mask = 0xffff << (group_id * 16);
 
@@ -4151,7 +4151,7 @@ void md_j_5_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
     double *env = envs.env;
     double *vj = jk.vj;
     double vj_kl0, dm_kl0;
-    unsigned int lane_id = thread_id % 32;
+    unsigned int lane_id = thread_id % warpSize;
     unsigned int group_id = lane_id / 16;
     unsigned int mask = 0xffff << (group_id * 16);
 
diff --git a/gpu4pyscf/lib/gvhf-md/unrolled_md_j_4dm.cu b/gpu4pyscf/lib/gvhf-md/unrolled_md_j_4dm.cu
index b64bb0bd8..ba68832c4 100644
--- a/gpu4pyscf/lib/gvhf-md/unrolled_md_j_4dm.cu
+++ b/gpu4pyscf/lib/gvhf-md/unrolled_md_j_4dm.cu
@@ -32,7 +32,7 @@ void md_j_4dm_0_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
     double *env = envs.env;
     double vj_kl[8];
     double dm_kl[8];
-    unsigned int lane_id = sq_id % 32;
+    unsigned int lane_id = sq_id % warpSize;
     unsigned int group_id = lane_id / 16;
     unsigned int mask = 0xffff << (group_id * 16);
 
@@ -335,7 +335,7 @@ void md_j_4dm_1_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
     double *env = envs.env;
     double vj_kl[8];
     double dm_kl[8];
-    unsigned int lane_id = sq_id % 32;
+    unsigned int lane_id = sq_id % warpSize;
     unsigned int group_id = lane_id / 16;
     unsigned int mask = 0xffff << (group_id * 16);
 
@@ -709,7 +709,7 @@ void md_j_4dm_1_1(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
     double *env = envs.env;
     double vj_kl[8];
     double dm_kl[8];
-    unsigned int lane_id = sq_id % 32;
+    unsigned int lane_id = sq_id % warpSize;
     unsigned int group_id = lane_id / 16;
     unsigned int mask = 0xffff << (group_id * 16);
 
@@ -1477,7 +1477,7 @@ void md_j_4dm_2_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
     double *env = envs.env;
     double vj_kl[8];
     double dm_kl[8];
-    unsigned int lane_id = sq_id % 32;
+    unsigned int lane_id = sq_id % warpSize;
     unsigned int group_id = lane_id / 16;
     unsigned int mask = 0xffff << (group_id * 16);
 
@@ -2007,7 +2007,7 @@ void md_j_4dm_2_1(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
     double *env = envs.env;
     double vj_kl[4];
     double dm_kl[4];
-    unsigned int lane_id = sq_id % 32;
+    unsigned int lane_id = sq_id % warpSize;
     unsigned int group_id = lane_id / 16;
     unsigned int mask = 0xffff << (group_id * 16);
 
@@ -3036,7 +3036,7 @@ void md_j_4dm_2_2(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
     double *env = envs.env;
     double vj_kl[4];
     double dm_kl[4];
-    unsigned int lane_id = sq_id % 32;
+    unsigned int lane_id = sq_id % warpSize;
     unsigned int group_id = lane_id / 16;
     unsigned int mask = 0xffff << (group_id * 16);
 
@@ -5209,7 +5209,7 @@ void md_j_4dm_3_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
     double *env = envs.env;
     double vj_kl[4];
     double dm_kl[4];
-    unsigned int lane_id = sq_id % 32;
+    unsigned int lane_id = sq_id % warpSize;
     unsigned int group_id = lane_id / 16;
     unsigned int mask = 0xffff << (group_id * 16);
 
@@ -5856,7 +5856,7 @@ void md_j_4dm_3_1(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
     double *env = envs.env;
     double vj_kl[4];
     double dm_kl[4];
-    unsigned int lane_id = sq_id % 32;
+    unsigned int lane_id = sq_id % warpSize;
     unsigned int group_id = lane_id / 16;
     unsigned int mask = 0xffff << (group_id * 16);
 
@@ -7748,7 +7748,7 @@ void md_j_4dm_4_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
     double *env = envs.env;
     double vj_kl[2];
     double dm_kl[2];
-    unsigned int lane_id = sq_id % 32;
+    unsigned int lane_id = sq_id % warpSize;
     unsigned int group_id = lane_id / 16;
     unsigned int mask = 0xffff << (group_id * 16);
 
@@ -8330,7 +8330,7 @@ void md_j_4dm_4_1(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
     double *env = envs.env;
     double vj_kl[2];
     double dm_kl[2];
-    unsigned int lane_id = sq_id % 32;
+    unsigned int lane_id = sq_id % warpSize;
     unsigned int group_id = lane_id / 16;
     unsigned int mask = 0xffff << (group_id * 16);
 
@@ -9967,7 +9967,7 @@ void md_j_4dm_5_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
     double *env = envs.env;
     double vj_kl[2];
     double dm_kl[2];
-    unsigned int lane_id = sq_id % 32;
+    unsigned int lane_id = sq_id % warpSize;
     unsigned int group_id = lane_id / 16;
     unsigned int mask = 0xffff << (group_id * 16);
 
diff --git a/gpu4pyscf/lib/multigrid/multigrid.cuh b/gpu4pyscf/lib/multigrid/multigrid.cuh
index 44594bba0..9ed504739 100644
--- a/gpu4pyscf/lib/multigrid/multigrid.cuh
+++ b/gpu4pyscf/lib/multigrid/multigrid.cuh
@@ -16,7 +16,13 @@
 
 #include <stdint.h>
 
+// WARP_SIZE: compile-time constant used for shared-memory sizing.
+// `warpSize` (HIP/CUDA device-runtime built-in) is not constexpr,
+// so we keep a literal here. Guarded so the build can override
+// it (e.g. -DWARP_SIZE=64) for future wider-wavefront targets.
+#ifndef WARP_SIZE
 #define WARP_SIZE       32
+#endif
 #define WARPS           8
 #define THREADS         (WARP_SIZE*WARPS)
 #define LMAX            4
diff --git a/gpu4pyscf/lib/multigrid/multigrid_v2/evaluation.cuh b/gpu4pyscf/lib/multigrid/multigrid_v2/evaluation.cuh
index 05f6a5bec..f8cbec2cd 100644
--- a/gpu4pyscf/lib/multigrid/multigrid_v2/evaluation.cuh
+++ b/gpu4pyscf/lib/multigrid/multigrid_v2/evaluation.cuh
@@ -93,7 +93,11 @@ __global__ static void evaluate_density_kernel(
   KernelType
       prefactor[n_channels * n_i_cartesian_functions * n_j_cartesian_functions];
 
-  constexpr int n_warps = n_threads / 32;  /*L2*/
+  // Compile-time warp count; `warpSize` is a device-runtime constant
+  // (not constexpr), so we use the literal here. Centralizing the
+  // value will make the future HIP/wavefront port a single-point edit.
+  constexpr int WARP_SIZE_CT = 32;
+  constexpr int n_warps = n_threads / WARP_SIZE_CT;  /*L2*/
   __shared__ KernelType reduced_density_values[n_channels * n_warps * n_threads];
 
 #pragma unroll
@@ -329,11 +333,14 @@ __global__ static void evaluate_density_kernel(
             density_value_to_be_shared *= gaussian;
 
             KernelType _wv = density_value_to_be_shared; /*WS*/
+            // Intra-warp reduction using warpSize (device-runtime
+            // built-in). Stride starts at warpSize/2 and halves.
 #pragma unroll
-            for (int _o = 16; _o > 0; _o >>= 1)
+            for (int _o = warpSize / 2; _o > 0; _o >>= 1)
               _wv += __shfl_down_sync(0xffffffffu, _wv, _o);
-            if ((thread_id & 31) == 0) {  /*L2: warp-private store, no atomic*/
-              reduced_density_values[(i_channel * n_warps + (thread_id >> 5)) *
+            // Warp leader test + warp-index extraction using warpSize.
+            if ((thread_id % warpSize) == 0) {  /*L2: warp-private store, no atomic*/
+              reduced_density_values[(i_channel * n_warps + (thread_id / warpSize)) *
                                          n_threads +
                                      a_index * BLOCK_DIM_XYZ * BLOCK_DIM_XYZ +
                                      b_index * BLOCK_DIM_XYZ + c_index] += _wv;
diff --git a/gpu4pyscf/lib/pbc/ft_ao.cu b/gpu4pyscf/lib/pbc/ft_ao.cu
index e8b31aa84..fff9be2d8 100644
--- a/gpu4pyscf/lib/pbc/ft_ao.cu
+++ b/gpu4pyscf/lib/pbc/ft_ao.cu
@@ -21,7 +21,13 @@
 #include "gvhf-rys/vhf.cuh"
 #include "gvhf-rys/rys_contract_k.cuh"
 
+// WARP_SIZE: compile-time constant used for shared-memory sizing.
+// `warpSize` (HIP/CUDA device-runtime built-in) is not constexpr,
+// so we keep a literal here. Guarded so the build can override
+// it (e.g. -DWARP_SIZE=64) for future wider-wavefront targets.
+#ifndef WARP_SIZE
 #define WARP_SIZE       32
+#endif
 #define WARPS           8
 #define NG_PER_BLOCK    WARP_SIZE
 #define FT_AO_THREADS   (WARP_SIZE*4)
diff --git a/gpu4pyscf/lib/pbc/ft_ao.cuh b/gpu4pyscf/lib/pbc/ft_ao.cuh
index f054a0bba..fcdbfbfd6 100644
--- a/gpu4pyscf/lib/pbc/ft_ao.cuh
+++ b/gpu4pyscf/lib/pbc/ft_ao.cuh
@@ -18,7 +18,13 @@
 
 #include <stdint.h>
 
+// WARP_SIZE: compile-time constant used for shared-memory sizing.
+// `warpSize` (HIP/CUDA device-runtime built-in) is not constexpr,
+// so we keep a literal here. Guarded so the build can override
+// it (e.g. -DWARP_SIZE=64) for future wider-wavefront targets.
+#ifndef WARP_SIZE
 #define WARP_SIZE       32
+#endif
 #define FT_AO_THREADS   (WARP_SIZE*4)
 #define NG_PER_BLOCK    32
 
diff --git a/gpu4pyscf/lib/pbc/int3c2e.cuh b/gpu4pyscf/lib/pbc/int3c2e.cuh
index 3dac1bbaf..4e49d764d 100644
--- a/gpu4pyscf/lib/pbc/int3c2e.cuh
+++ b/gpu4pyscf/lib/pbc/int3c2e.cuh
@@ -18,7 +18,13 @@
 
 #include <stdint.h>
 
+// WARP_SIZE: compile-time constant used for shared-memory sizing.
+// `warpSize` (HIP/CUDA device-runtime built-in) is not constexpr,
+// so we keep a literal here. Guarded so the build can override
+// it (e.g. -DWARP_SIZE=64) for future wider-wavefront targets.
+#ifndef WARP_SIZE
 #define WARP_SIZE       32
+#endif
 // corresponding to 256 threads
 #define WARPS           8
 #define THREADS         (WARP_SIZE*WARPS)
diff --git a/gpu4pyscf/lib/pbc/int3c2e_create_tasks.cuh b/gpu4pyscf/lib/pbc/int3c2e_create_tasks.cuh
index 0aa549fe1..477536101 100644
--- a/gpu4pyscf/lib/pbc/int3c2e_create_tasks.cuh
+++ b/gpu4pyscf/lib/pbc/int3c2e_create_tasks.cuh
@@ -22,7 +22,13 @@
 #include <cub/cub.cuh>
 
 #define THREADS         256
+// WARP_SIZE: compile-time constant used for shared-memory sizing.
+// `warpSize` (HIP/CUDA device-runtime built-in) is not constexpr,
+// so we keep a literal here. Guarded so the build can override
+// it (e.g. -DWARP_SIZE=64) for future wider-wavefront targets.
+#ifndef WARP_SIZE
 #define WARP_SIZE       32
+#endif
 #define WARPS           8
 #define LMAX            4
 #define LMAX1           (LMAX+1)
diff --git a/gpu4pyscf/lib/pbc/int3c2e_create_tasks_o1.cuh b/gpu4pyscf/lib/pbc/int3c2e_create_tasks_o1.cuh
index eec32372a..ace518638 100644
--- a/gpu4pyscf/lib/pbc/int3c2e_create_tasks_o1.cuh
+++ b/gpu4pyscf/lib/pbc/int3c2e_create_tasks_o1.cuh
@@ -22,7 +22,13 @@
 #include "gvhf-rys/vhf.cuh"
 
 #define THREADS         256
+// WARP_SIZE: compile-time constant used for shared-memory sizing.
+// `warpSize` (HIP/CUDA device-runtime built-in) is not constexpr,
+// so we keep a literal here. Guarded so the build can override
+// it (e.g. -DWARP_SIZE=64) for future wider-wavefront targets.
+#ifndef WARP_SIZE
 #define WARP_SIZE       32
+#endif
 #define WARPS           8
 #define REMOTE_THRESHOLD 50
 

From 68bc7515b87b41968d10c9244aa3e16b818cf776 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <abagusetty@anlvpn217-217.anl.gov>
Date: Fri, 12 Jun 2026 11:56:03 -0500
Subject: [PATCH 2/4] Replace some of the hardcoded values with macros

---
 gpu4pyscf/lib/gvhf-md/md_contract_j.cu     |  8 ++++---
 gpu4pyscf/lib/gvhf-md/md_j.cuh             | 28 ++++++++++++++++++++++
 gpu4pyscf/lib/gvhf-md/md_j_driver.cu       |  9 ++++---
 gpu4pyscf/lib/gvhf-md/pbc_md_contract_j.cu |  8 ++++---
 4 files changed, 42 insertions(+), 11 deletions(-)

diff --git a/gpu4pyscf/lib/gvhf-md/md_contract_j.cu b/gpu4pyscf/lib/gvhf-md/md_contract_j.cu
index 5afff3784..f60b663b5 100644
--- a/gpu4pyscf/lib/gvhf-md/md_contract_j.cu
+++ b/gpu4pyscf/lib/gvhf-md/md_contract_j.cu
@@ -891,9 +891,11 @@ int MD_build_j(double *vj, double *dm, int n_dm, int dm_size,
     int nf3ij = (lij+1)*(lij+2)*(lij+3)/6;
     int nf3kl = (lkl+1)*(lkl+2)*(lkl+3)/6;
     int nf3ijkl = (order+1)*(order+2)*(order+3)/6;
-    // 16x16 threads are applied to all unrolled code
-    float *tile16_qd_ij_max = qd_ij_max + qd_offset_for_threads(npairs_ij, 16);
-    float *tile16_qd_kl_max = qd_kl_max + qd_offset_for_threads(npairs_kl, 16);
+    // MD_J_TILE_THREADS x MD_J_TILE_THREADS threads are applied to all
+    // unrolled code. The tile width matches the `dim3 threads(16,16)`
+    // launch geometry in unrolled_md_j*.cu and is defined in md_j.cuh.
+    float *tile16_qd_ij_max = qd_ij_max + qd_offset_for_threads(npairs_ij, MD_J_TILE_THREADS);
+    float *tile16_qd_kl_max = qd_kl_max + qd_offset_for_threads(npairs_kl, MD_J_TILE_THREADS);
     MDBoundsInfo bounds = {li, lj, lk, ll, lij, lkl, order, nf3ij, nf3kl, nf3ijkl,
         npairs_ij, npairs_kl, pair_ij_mapping, pair_kl_mapping,
         pair_ij_loc, pair_kl_loc, tile16_qd_ij_max, tile16_qd_kl_max,
diff --git a/gpu4pyscf/lib/gvhf-md/md_j.cuh b/gpu4pyscf/lib/gvhf-md/md_j.cuh
index 831ab1f39..17d8058a1 100644
--- a/gpu4pyscf/lib/gvhf-md/md_j.cuh
+++ b/gpu4pyscf/lib/gvhf-md/md_j.cuh
@@ -15,6 +15,34 @@
  */
 
 #pragma once
+
+// =====================================================================
+// Per-kernel-family constants for the md_j contraction.
+//
+// MD_J_TILE_THREADS
+//   Tile-row width used by the unrolled md_j_*_* kernels. The launch
+//   geometry for those kernels is `dim3 threads(16, 16)` and the
+//   per-row task batching uses `sq_id = tx + 16*ty`. The qd_offset
+//   layered pyramid is therefore indexed at this same width when the
+//   tile16 unrolled path is selected.
+//
+// MD_J_QD_ALIGN
+//   Alignment unit (in elements) for the per-power-of-two strata of
+//   the qd_*_max pyramid in qd_offset_for_threads(). Set to the warp
+//   width assumed by the kernels that consume qd_*_max so the strided
+//   accesses are warp-/cache-line-friendly. `warpSize` is a
+//   device-only built-in; this host-side constant mirrors it.
+//   Centralizing the value here makes a future wider-wavefront port
+//   a single-point change.
+// =====================================================================
+#ifndef MD_J_TILE_THREADS
+#define MD_J_TILE_THREADS 16
+#endif
+
+#ifndef MD_J_QD_ALIGN
+#define MD_J_QD_ALIGN     32
+#endif
+
 typedef struct {
     int li;
     int lj;
diff --git a/gpu4pyscf/lib/gvhf-md/md_j_driver.cu b/gpu4pyscf/lib/gvhf-md/md_j_driver.cu
index 47e1d67d6..d16a81754 100644
--- a/gpu4pyscf/lib/gvhf-md/md_j_driver.cu
+++ b/gpu4pyscf/lib/gvhf-md/md_j_driver.cu
@@ -43,11 +43,10 @@ int offset_for_Rt2_idx(int lij, int lkl)
 
 int qd_offset_for_threads(int npairs, int threads)
 {
-    // Host-side warp-aligned padding. `warpSize` is a device-runtime
-    // constant unavailable here; centralize the value so a future
-    // wider-wavefront port is a single-point change.
-    constexpr int WARP_SIZE_HOST = 32;
-    int npairs_aligned = (npairs + (WARP_SIZE_HOST - 1)) & ~(WARP_SIZE_HOST - 1);
+    // Layered pyramid of warp-aligned npairs strata. Alignment unit is
+    // MD_J_QD_ALIGN (defined in md_j.cuh) -- the host-side mirror of
+    // `warpSize` (which is device-only and cannot be used here).
+    int npairs_aligned = (npairs + (MD_J_QD_ALIGN - 1)) & ~(MD_J_QD_ALIGN - 1);
     int address = 0;
     for (int i = 1; i < threads; i *= 2) {
         address += npairs_aligned;
diff --git a/gpu4pyscf/lib/gvhf-md/pbc_md_contract_j.cu b/gpu4pyscf/lib/gvhf-md/pbc_md_contract_j.cu
index 93a2a97e0..f82bd6f5e 100644
--- a/gpu4pyscf/lib/gvhf-md/pbc_md_contract_j.cu
+++ b/gpu4pyscf/lib/gvhf-md/pbc_md_contract_j.cu
@@ -337,9 +337,11 @@ int PBC_build_j(double *vj, double *dm, int n_dm,
     int nf3ij = (lij+1)*(lij+2)*(lij+3)/6;
     int nf3kl = (lkl+1)*(lkl+2)*(lkl+3)/6;
     int nf3ijkl = (order+1)*(order+2)*(order+3)/6;
-    // 16x16 threads are applied to all unrolled code
-    float *tile16_qd_ij_max = qd_ij_max + qd_offset_for_threads(npairs_ij, 16);
-    float *tile16_qd_kl_max = qd_kl_max + qd_offset_for_threads(npairs_kl, 16);
+    // MD_J_TILE_THREADS x MD_J_TILE_THREADS threads are applied to all
+    // unrolled code. The tile width matches the `dim3 threads(16,16)`
+    // launch geometry in unrolled_md_j*.cu and is defined in md_j.cuh.
+    float *tile16_qd_ij_max = qd_ij_max + qd_offset_for_threads(npairs_ij, MD_J_TILE_THREADS);
+    float *tile16_qd_kl_max = qd_kl_max + qd_offset_for_threads(npairs_kl, MD_J_TILE_THREADS);
     MDBoundsInfo bounds = {li, lj, lk, ll, lij, lkl, order, nf3ij, nf3kl, nf3ijkl,
         npairs_ij, npairs_kl, (int *)pair_ij_mapping, (int *)pair_kl_mapping,
         pair_ij_loc, pair_kl_loc, tile16_qd_ij_max, tile16_qd_kl_max,

From 3fb415f88a8387ed2fca97eae6c40ee94eab5464 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <abagusetty@anlvpn217-217.anl.gov>
Date: Fri, 12 Jun 2026 11:56:31 -0500
Subject: [PATCH 3/4] fix a test case with cupy.linalg -> numpy.linalg

---
 gpu4pyscf/scf/tests/test_cphf.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/gpu4pyscf/scf/tests/test_cphf.py b/gpu4pyscf/scf/tests/test_cphf.py
index a1c483260..b2aba740e 100644
--- a/gpu4pyscf/scf/tests/test_cphf.py
+++ b/gpu4pyscf/scf/tests/test_cphf.py
@@ -83,8 +83,8 @@ def fx_gpu(mo1):
         s1vo = cupy.asarray(s1vo)
         mo1_gpu, e1_gpu = cphf_gpu.solve(fx_gpu, mo_energy, mo_occ, h1vo, s1vo, tol=1e-9)
 
-        assert cupy.linalg.norm(mo1_cpu - mo1_gpu.get()) < 1e-6
-        assert cupy.linalg.norm(e1_cpu - e1_gpu.get()) < 1e-6
+        assert numpy.linalg.norm(mo1_cpu - mo1_gpu.get()) < 1e-6
+        assert numpy.linalg.norm(e1_cpu - e1_gpu.get()) < 1e-6
 
     def test_cphf_with_guess(self):
         # Test GPU CPHF solver with an initial guess (mo10) against CPU default
@@ -127,8 +127,8 @@ def fx_gpu(mo1):
         
         mo1_gpu, e1_gpu = cphf_gpu.solve(fx_gpu, mo_energy, mo_occ, h1vo_gpu, s1vo_gpu, tol=1e-9, mo10=mo10_gpu)
 
-        assert cupy.linalg.norm(mo1_cpu - mo1_gpu.get()) < 1e-6
-        assert cupy.linalg.norm(e1_cpu - e1_gpu.get()) < 1e-6
+        assert numpy.linalg.norm(mo1_cpu - mo1_gpu.get()) < 1e-6
+        assert numpy.linalg.norm(e1_cpu - e1_gpu.get()) < 1e-6
 
     def test_ucphf(self):
         mf = scf.UHF(mol)
@@ -166,10 +166,10 @@ def fx_gpu(mo1):
         s1vo = cupy.asarray(s1vo)
         mo1_gpu, e1_gpu = ucphf_gpu.solve(fx_gpu, mo_energy, mo_occ, h1vo, s1vo, tol=1e-9)
 
-        assert cupy.linalg.norm(mo1_cpu[0] - mo1_gpu[0].get()) < 1e-6
-        assert cupy.linalg.norm(mo1_cpu[1] - mo1_gpu[1].get()) < 1e-6
-        assert cupy.linalg.norm(e1_cpu[0] - e1_gpu[0].get()) < 1e-6
-        assert cupy.linalg.norm(e1_cpu[1] - e1_gpu[1].get()) < 1e-6
+        assert numpy.linalg.norm(mo1_cpu[0] - mo1_gpu[0].get()) < 1e-6
+        assert numpy.linalg.norm(mo1_cpu[1] - mo1_gpu[1].get()) < 1e-6
+        assert numpy.linalg.norm(e1_cpu[0] - e1_gpu[0].get()) < 1e-6
+        assert numpy.linalg.norm(e1_cpu[1] - e1_gpu[1].get()) < 1e-6
 
 if __name__ == "__main__":
     print("Full Tests for CPHF/UCPHF")

From e64378cdd3c9704528d1b06b2050b8d6251fcd3a Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <abagusetty@anlvpn217-217.anl.gov>
Date: Fri, 12 Jun 2026 14:00:09 -0500
Subject: [PATCH 4/4] Replace a np.int32 initialization with device-resident
 cp.zeros

---
 gpu4pyscf/scf/j_engine.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gpu4pyscf/scf/j_engine.py b/gpu4pyscf/scf/j_engine.py
index 7f1876c04..e77fdddf5 100644
--- a/gpu4pyscf/scf/j_engine.py
+++ b/gpu4pyscf/scf/j_engine.py
@@ -125,7 +125,7 @@ def get_j(self, dms, verbose):
         ll = ls[:,None] + ls
         ll = ll.ravel()[pair_lst] # drops the pairs that do not contribute to integrals
         xyz_size = (ll+1)*(ll+2)*(ll+3)//6
-        pair_loc_gpu = cp.cumsum(cp.append(np.int32(0), xyz_size.ravel()), dtype=np.int32)
+        pair_loc_gpu = cp.cumsum(cp.append(cp.zeros(1, dtype=np.int32), xyz_size.ravel()), dtype=np.int32)
         xyz_size = ls = ll = None
 
         pair_lst = np.asarray(pair_lst.get(), dtype=np.int32)