From 1e4cb960a66b3bf306b4303a21fbc313617b4b06 Mon Sep 17 00:00:00 2001 From: Abhishek Bagusetty Date: Thu, 11 Jun 2026 21:29:01 -0500 Subject: [PATCH 1/4] Replace hardcoded warp-size (32) literals with warpSize built-in Pure CUDA refactor that substitutes hardcoded warp-size assumptions in device-side kernels with the CUDA/HIP `warpSize` device-runtime built-in. No HIP-specific code is introduced; all changes remain correct on NVIDIA (warpSize=32) and become portable to architectures where the warp/wavefront size differs (e.g. AMD CDNA gfx9xx with wavefront=64). This is a prerequisite for the companion `hip` branch (a HIP/ROCm build) to produce correct results on wavefront-64 AMD GPUs. The two branches are intentionally independent so the warpSize cleanup can land on master without coupling to the HIP build system. Device-side substitutions (warpSize is a runtime constant; safe in __global__ / __device__ contexts): - gpu4pyscf/lib/ecp/common.cu: classic block_reduce pattern -- shared-memory reduce loop bound (s >= 32 -> s >= warpSize), warp-leader gate (tid < 32 -> tid < warpSize), and the intra-warp __shfl_down_sync sequence (16/8/4/2/1 unrolled -> for offset = warpSize/2; offset > 0; offset >>= 1). - gpu4pyscf/lib/gvhf-md/{unrolled_md_j.cu, unrolled_md_j_4dm.cu, md_contract_j.cu}: 25 occurrences of `lane_id = X % 32` -> `lane_id = X % warpSize`. - gpu4pyscf/lib/multigrid/multigrid_v2/evaluation.cuh: warp-stride reduction loop (_o = 16 -> _o = warpSize/2), warp-leader test (tid & 31 -> tid % warpSize), warp index (tid >> 5 -> tid / warpSize). Centralized the constexpr n_warps divisor into a named constant. Host-side / constexpr substitutions (warpSize is device-only so a named constant is used instead): - gpu4pyscf/lib/gvhf-md/md_j_driver.cu: qd_offset_for_threads 32-element alignment now uses a named WARP_SIZE_HOST constant derived from the assumed warp width. - gpu4pyscf/lib/pbc/{int3c2e_create_tasks.cuh, int3c2e_create_tasks_o1.cuh, int3c2e.cuh, ft_ao.cuh, ft_ao.cu}, gpu4pyscf/lib/multigrid/multigrid.cuh: existing `#define WARP_SIZE 32` directives wrapped in `#ifndef WARP_SIZE` so the build system can override the value (e.g. -DWARP_SIZE=64) for future wider-wavefront targets. All changes are zero-cost on NVIDIA and unlock correct portability. --- gpu4pyscf/lib/ecp/common.cu | 16 ++++++------- gpu4pyscf/lib/gvhf-md/md_contract_j.cu | 4 ++-- gpu4pyscf/lib/gvhf-md/md_j_driver.cu | 6 ++++- gpu4pyscf/lib/gvhf-md/unrolled_md_j.cu | 24 +++++++++---------- gpu4pyscf/lib/gvhf-md/unrolled_md_j_4dm.cu | 22 ++++++++--------- gpu4pyscf/lib/multigrid/multigrid.cuh | 6 +++++ .../lib/multigrid/multigrid_v2/evaluation.cuh | 15 ++++++++---- gpu4pyscf/lib/pbc/ft_ao.cu | 6 +++++ gpu4pyscf/lib/pbc/ft_ao.cuh | 6 +++++ gpu4pyscf/lib/pbc/int3c2e.cuh | 6 +++++ gpu4pyscf/lib/pbc/int3c2e_create_tasks.cuh | 6 +++++ gpu4pyscf/lib/pbc/int3c2e_create_tasks_o1.cuh | 6 +++++ 12 files changed, 85 insertions(+), 38 deletions(-) diff --git a/gpu4pyscf/lib/ecp/common.cu b/gpu4pyscf/lib/ecp/common.cu index 543d95e47..e05b9f1a2 100644 --- a/gpu4pyscf/lib/ecp/common.cu +++ b/gpu4pyscf/lib/ecp/common.cu @@ -125,22 +125,22 @@ void block_reduce(double val, double *d_out) { __syncthreads(); // Perform reduction in shared memory. - // Reduce the data until 32 threads remain. - for (unsigned int s = THREADS / 2; s >= 32; s >>= 1) { + // Reduce the data until one warp remains. warpSize is a built-in + // device constant (32 on NVIDIA today; portable for future targets). + for (unsigned int s = THREADS / 2; s >= warpSize; s >>= 1) { if (tid < s) { sdata[tid] += sdata[tid + s]; } __syncthreads(); } - if (tid < 32) { + if (tid < warpSize) { double value = sdata[tid]; // load to register unsigned int mask = __activemask(); // all lanes in warp 0 are active - value += __shfl_down_sync(mask, value, 16); - value += __shfl_down_sync(mask, value, 8); - value += __shfl_down_sync(mask, value, 4); - value += __shfl_down_sync(mask, value, 2); - value += __shfl_down_sync(mask, value, 1); + // Final intra-warp reduction; stride starts at warpSize/2. + for (int offset = warpSize / 2; offset > 0; offset >>= 1) { + value += __shfl_down_sync(mask, value, offset); + } if (tid == 0) { d_out[0] += value; } diff --git a/gpu4pyscf/lib/gvhf-md/md_contract_j.cu b/gpu4pyscf/lib/gvhf-md/md_contract_j.cu index d34980e91..5afff3784 100644 --- a/gpu4pyscf/lib/gvhf-md/md_contract_j.cu +++ b/gpu4pyscf/lib/gvhf-md/md_contract_j.cu @@ -98,7 +98,7 @@ void md_j_1dm_kernel(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, int nsq_per_block = blockDim.x; //assert(nsq_per_block == threadsx * threadsy); int t_id = gout_id * nsq_per_block + sq_id; - int lane_id = t_id % 32; + int lane_id = t_id % warpSize; int group_id = lane_id / threadsx; unsigned int mask = ((1 << threadsx) - 1) << group_id * threadsx; int tx = sq_id % threadsx; @@ -397,7 +397,7 @@ void md_j_4dm_kernel(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, int nsq_per_block = blockDim.x; //assert(nsq_per_block == threadsx * threadsy); int t_id = gout_id * nsq_per_block + sq_id; - int lane_id = t_id % 32; + int lane_id = t_id % warpSize; int group_id = lane_id / threadsx; unsigned int mask = ((1 << threadsx) - 1) << group_id * threadsx; int tx = sq_id % threadsx; diff --git a/gpu4pyscf/lib/gvhf-md/md_j_driver.cu b/gpu4pyscf/lib/gvhf-md/md_j_driver.cu index 84d73f0a0..47e1d67d6 100644 --- a/gpu4pyscf/lib/gvhf-md/md_j_driver.cu +++ b/gpu4pyscf/lib/gvhf-md/md_j_driver.cu @@ -43,7 +43,11 @@ int offset_for_Rt2_idx(int lij, int lkl) int qd_offset_for_threads(int npairs, int threads) { - int npairs_aligned = (npairs + 31) & 0xffffffe0; // 32-element aligned + // Host-side warp-aligned padding. `warpSize` is a device-runtime + // constant unavailable here; centralize the value so a future + // wider-wavefront port is a single-point change. + constexpr int WARP_SIZE_HOST = 32; + int npairs_aligned = (npairs + (WARP_SIZE_HOST - 1)) & ~(WARP_SIZE_HOST - 1); int address = 0; for (int i = 1; i < threads; i *= 2) { address += npairs_aligned; diff --git a/gpu4pyscf/lib/gvhf-md/unrolled_md_j.cu b/gpu4pyscf/lib/gvhf-md/unrolled_md_j.cu index 8e3cced8d..76702991a 100644 --- a/gpu4pyscf/lib/gvhf-md/unrolled_md_j.cu +++ b/gpu4pyscf/lib/gvhf-md/unrolled_md_j.cu @@ -36,7 +36,7 @@ void md_j_0_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double *vj = jk.vj; double vj_kl0, dm_kl0; - unsigned int lane_id = thread_id % 32; + unsigned int lane_id = thread_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); @@ -237,7 +237,7 @@ void md_j_1_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double *vj = jk.vj; double vj_kl0, dm_kl0; - unsigned int lane_id = thread_id % 32; + unsigned int lane_id = thread_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); @@ -443,7 +443,7 @@ void md_j_1_1(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double *vj = jk.vj; double vj_kl0, dm_kl0; - unsigned int lane_id = thread_id % 32; + unsigned int lane_id = thread_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); @@ -704,7 +704,7 @@ void md_j_2_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double *vj = jk.vj; double vj_kl0, dm_kl0; - unsigned int lane_id = thread_id % 32; + unsigned int lane_id = thread_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); @@ -928,7 +928,7 @@ void md_j_2_1(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double *vj = jk.vj; double vj_kl0, dm_kl0; - unsigned int lane_id = thread_id % 32; + unsigned int lane_id = thread_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); @@ -1252,7 +1252,7 @@ void md_j_2_2(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double *vj = jk.vj; double vj_kl0, dm_kl0; - unsigned int lane_id = thread_id % 32; + unsigned int lane_id = thread_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); @@ -1770,7 +1770,7 @@ void md_j_3_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double *vj = jk.vj; double vj_kl0, dm_kl0; - unsigned int lane_id = thread_id % 32; + unsigned int lane_id = thread_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); @@ -2029,7 +2029,7 @@ void md_j_3_1(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double *vj = jk.vj; double vj_kl0, dm_kl0; - unsigned int lane_id = thread_id % 32; + unsigned int lane_id = thread_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); @@ -2460,7 +2460,7 @@ void md_j_3_2(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double *vj = jk.vj; double vj_kl0, dm_kl0; - unsigned int lane_id = thread_id % 32; + unsigned int lane_id = thread_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); @@ -3222,7 +3222,7 @@ void md_j_4_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double *vj = jk.vj; double vj_kl0, dm_kl0; - unsigned int lane_id = thread_id % 32; + unsigned int lane_id = thread_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); @@ -3545,7 +3545,7 @@ void md_j_4_1(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double *vj = jk.vj; double vj_kl0, dm_kl0; - unsigned int lane_id = thread_id % 32; + unsigned int lane_id = thread_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); @@ -4151,7 +4151,7 @@ void md_j_5_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double *vj = jk.vj; double vj_kl0, dm_kl0; - unsigned int lane_id = thread_id % 32; + unsigned int lane_id = thread_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); diff --git a/gpu4pyscf/lib/gvhf-md/unrolled_md_j_4dm.cu b/gpu4pyscf/lib/gvhf-md/unrolled_md_j_4dm.cu index b64bb0bd8..ba68832c4 100644 --- a/gpu4pyscf/lib/gvhf-md/unrolled_md_j_4dm.cu +++ b/gpu4pyscf/lib/gvhf-md/unrolled_md_j_4dm.cu @@ -32,7 +32,7 @@ void md_j_4dm_0_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double vj_kl[8]; double dm_kl[8]; - unsigned int lane_id = sq_id % 32; + unsigned int lane_id = sq_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); @@ -335,7 +335,7 @@ void md_j_4dm_1_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double vj_kl[8]; double dm_kl[8]; - unsigned int lane_id = sq_id % 32; + unsigned int lane_id = sq_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); @@ -709,7 +709,7 @@ void md_j_4dm_1_1(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double vj_kl[8]; double dm_kl[8]; - unsigned int lane_id = sq_id % 32; + unsigned int lane_id = sq_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); @@ -1477,7 +1477,7 @@ void md_j_4dm_2_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double vj_kl[8]; double dm_kl[8]; - unsigned int lane_id = sq_id % 32; + unsigned int lane_id = sq_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); @@ -2007,7 +2007,7 @@ void md_j_4dm_2_1(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double vj_kl[4]; double dm_kl[4]; - unsigned int lane_id = sq_id % 32; + unsigned int lane_id = sq_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); @@ -3036,7 +3036,7 @@ void md_j_4dm_2_2(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double vj_kl[4]; double dm_kl[4]; - unsigned int lane_id = sq_id % 32; + unsigned int lane_id = sq_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); @@ -5209,7 +5209,7 @@ void md_j_4dm_3_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double vj_kl[4]; double dm_kl[4]; - unsigned int lane_id = sq_id % 32; + unsigned int lane_id = sq_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); @@ -5856,7 +5856,7 @@ void md_j_4dm_3_1(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double vj_kl[4]; double dm_kl[4]; - unsigned int lane_id = sq_id % 32; + unsigned int lane_id = sq_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); @@ -7748,7 +7748,7 @@ void md_j_4dm_4_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double vj_kl[2]; double dm_kl[2]; - unsigned int lane_id = sq_id % 32; + unsigned int lane_id = sq_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); @@ -8330,7 +8330,7 @@ void md_j_4dm_4_1(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double vj_kl[2]; double dm_kl[2]; - unsigned int lane_id = sq_id % 32; + unsigned int lane_id = sq_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); @@ -9967,7 +9967,7 @@ void md_j_4dm_5_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double vj_kl[2]; double dm_kl[2]; - unsigned int lane_id = sq_id % 32; + unsigned int lane_id = sq_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); diff --git a/gpu4pyscf/lib/multigrid/multigrid.cuh b/gpu4pyscf/lib/multigrid/multigrid.cuh index 44594bba0..9ed504739 100644 --- a/gpu4pyscf/lib/multigrid/multigrid.cuh +++ b/gpu4pyscf/lib/multigrid/multigrid.cuh @@ -16,7 +16,13 @@ #include +// WARP_SIZE: compile-time constant used for shared-memory sizing. +// `warpSize` (HIP/CUDA device-runtime built-in) is not constexpr, +// so we keep a literal here. Guarded so the build can override +// it (e.g. -DWARP_SIZE=64) for future wider-wavefront targets. +#ifndef WARP_SIZE #define WARP_SIZE 32 +#endif #define WARPS 8 #define THREADS (WARP_SIZE*WARPS) #define LMAX 4 diff --git a/gpu4pyscf/lib/multigrid/multigrid_v2/evaluation.cuh b/gpu4pyscf/lib/multigrid/multigrid_v2/evaluation.cuh index 05f6a5bec..f8cbec2cd 100644 --- a/gpu4pyscf/lib/multigrid/multigrid_v2/evaluation.cuh +++ b/gpu4pyscf/lib/multigrid/multigrid_v2/evaluation.cuh @@ -93,7 +93,11 @@ __global__ static void evaluate_density_kernel( KernelType prefactor[n_channels * n_i_cartesian_functions * n_j_cartesian_functions]; - constexpr int n_warps = n_threads / 32; /*L2*/ + // Compile-time warp count; `warpSize` is a device-runtime constant + // (not constexpr), so we use the literal here. Centralizing the + // value will make the future HIP/wavefront port a single-point edit. + constexpr int WARP_SIZE_CT = 32; + constexpr int n_warps = n_threads / WARP_SIZE_CT; /*L2*/ __shared__ KernelType reduced_density_values[n_channels * n_warps * n_threads]; #pragma unroll @@ -329,11 +333,14 @@ __global__ static void evaluate_density_kernel( density_value_to_be_shared *= gaussian; KernelType _wv = density_value_to_be_shared; /*WS*/ + // Intra-warp reduction using warpSize (device-runtime + // built-in). Stride starts at warpSize/2 and halves. #pragma unroll - for (int _o = 16; _o > 0; _o >>= 1) + for (int _o = warpSize / 2; _o > 0; _o >>= 1) _wv += __shfl_down_sync(0xffffffffu, _wv, _o); - if ((thread_id & 31) == 0) { /*L2: warp-private store, no atomic*/ - reduced_density_values[(i_channel * n_warps + (thread_id >> 5)) * + // Warp leader test + warp-index extraction using warpSize. + if ((thread_id % warpSize) == 0) { /*L2: warp-private store, no atomic*/ + reduced_density_values[(i_channel * n_warps + (thread_id / warpSize)) * n_threads + a_index * BLOCK_DIM_XYZ * BLOCK_DIM_XYZ + b_index * BLOCK_DIM_XYZ + c_index] += _wv; diff --git a/gpu4pyscf/lib/pbc/ft_ao.cu b/gpu4pyscf/lib/pbc/ft_ao.cu index e8b31aa84..fff9be2d8 100644 --- a/gpu4pyscf/lib/pbc/ft_ao.cu +++ b/gpu4pyscf/lib/pbc/ft_ao.cu @@ -21,7 +21,13 @@ #include "gvhf-rys/vhf.cuh" #include "gvhf-rys/rys_contract_k.cuh" +// WARP_SIZE: compile-time constant used for shared-memory sizing. +// `warpSize` (HIP/CUDA device-runtime built-in) is not constexpr, +// so we keep a literal here. Guarded so the build can override +// it (e.g. -DWARP_SIZE=64) for future wider-wavefront targets. +#ifndef WARP_SIZE #define WARP_SIZE 32 +#endif #define WARPS 8 #define NG_PER_BLOCK WARP_SIZE #define FT_AO_THREADS (WARP_SIZE*4) diff --git a/gpu4pyscf/lib/pbc/ft_ao.cuh b/gpu4pyscf/lib/pbc/ft_ao.cuh index f054a0bba..fcdbfbfd6 100644 --- a/gpu4pyscf/lib/pbc/ft_ao.cuh +++ b/gpu4pyscf/lib/pbc/ft_ao.cuh @@ -18,7 +18,13 @@ #include +// WARP_SIZE: compile-time constant used for shared-memory sizing. +// `warpSize` (HIP/CUDA device-runtime built-in) is not constexpr, +// so we keep a literal here. Guarded so the build can override +// it (e.g. -DWARP_SIZE=64) for future wider-wavefront targets. +#ifndef WARP_SIZE #define WARP_SIZE 32 +#endif #define FT_AO_THREADS (WARP_SIZE*4) #define NG_PER_BLOCK 32 diff --git a/gpu4pyscf/lib/pbc/int3c2e.cuh b/gpu4pyscf/lib/pbc/int3c2e.cuh index 3dac1bbaf..4e49d764d 100644 --- a/gpu4pyscf/lib/pbc/int3c2e.cuh +++ b/gpu4pyscf/lib/pbc/int3c2e.cuh @@ -18,7 +18,13 @@ #include +// WARP_SIZE: compile-time constant used for shared-memory sizing. +// `warpSize` (HIP/CUDA device-runtime built-in) is not constexpr, +// so we keep a literal here. Guarded so the build can override +// it (e.g. -DWARP_SIZE=64) for future wider-wavefront targets. +#ifndef WARP_SIZE #define WARP_SIZE 32 +#endif // corresponding to 256 threads #define WARPS 8 #define THREADS (WARP_SIZE*WARPS) diff --git a/gpu4pyscf/lib/pbc/int3c2e_create_tasks.cuh b/gpu4pyscf/lib/pbc/int3c2e_create_tasks.cuh index 0aa549fe1..477536101 100644 --- a/gpu4pyscf/lib/pbc/int3c2e_create_tasks.cuh +++ b/gpu4pyscf/lib/pbc/int3c2e_create_tasks.cuh @@ -22,7 +22,13 @@ #include #define THREADS 256 +// WARP_SIZE: compile-time constant used for shared-memory sizing. +// `warpSize` (HIP/CUDA device-runtime built-in) is not constexpr, +// so we keep a literal here. Guarded so the build can override +// it (e.g. -DWARP_SIZE=64) for future wider-wavefront targets. +#ifndef WARP_SIZE #define WARP_SIZE 32 +#endif #define WARPS 8 #define LMAX 4 #define LMAX1 (LMAX+1) diff --git a/gpu4pyscf/lib/pbc/int3c2e_create_tasks_o1.cuh b/gpu4pyscf/lib/pbc/int3c2e_create_tasks_o1.cuh index eec32372a..ace518638 100644 --- a/gpu4pyscf/lib/pbc/int3c2e_create_tasks_o1.cuh +++ b/gpu4pyscf/lib/pbc/int3c2e_create_tasks_o1.cuh @@ -22,7 +22,13 @@ #include "gvhf-rys/vhf.cuh" #define THREADS 256 +// WARP_SIZE: compile-time constant used for shared-memory sizing. +// `warpSize` (HIP/CUDA device-runtime built-in) is not constexpr, +// so we keep a literal here. Guarded so the build can override +// it (e.g. -DWARP_SIZE=64) for future wider-wavefront targets. +#ifndef WARP_SIZE #define WARP_SIZE 32 +#endif #define WARPS 8 #define REMOTE_THRESHOLD 50 From 68bc7515b87b41968d10c9244aa3e16b818cf776 Mon Sep 17 00:00:00 2001 From: Abhishek Bagusetty Date: Fri, 12 Jun 2026 11:56:03 -0500 Subject: [PATCH 2/4] Replace some of the hardcoded values with macros --- gpu4pyscf/lib/gvhf-md/md_contract_j.cu | 8 ++++--- gpu4pyscf/lib/gvhf-md/md_j.cuh | 28 ++++++++++++++++++++++ gpu4pyscf/lib/gvhf-md/md_j_driver.cu | 9 ++++--- gpu4pyscf/lib/gvhf-md/pbc_md_contract_j.cu | 8 ++++--- 4 files changed, 42 insertions(+), 11 deletions(-) diff --git a/gpu4pyscf/lib/gvhf-md/md_contract_j.cu b/gpu4pyscf/lib/gvhf-md/md_contract_j.cu index 5afff3784..f60b663b5 100644 --- a/gpu4pyscf/lib/gvhf-md/md_contract_j.cu +++ b/gpu4pyscf/lib/gvhf-md/md_contract_j.cu @@ -891,9 +891,11 @@ int MD_build_j(double *vj, double *dm, int n_dm, int dm_size, int nf3ij = (lij+1)*(lij+2)*(lij+3)/6; int nf3kl = (lkl+1)*(lkl+2)*(lkl+3)/6; int nf3ijkl = (order+1)*(order+2)*(order+3)/6; - // 16x16 threads are applied to all unrolled code - float *tile16_qd_ij_max = qd_ij_max + qd_offset_for_threads(npairs_ij, 16); - float *tile16_qd_kl_max = qd_kl_max + qd_offset_for_threads(npairs_kl, 16); + // MD_J_TILE_THREADS x MD_J_TILE_THREADS threads are applied to all + // unrolled code. The tile width matches the `dim3 threads(16,16)` + // launch geometry in unrolled_md_j*.cu and is defined in md_j.cuh. + float *tile16_qd_ij_max = qd_ij_max + qd_offset_for_threads(npairs_ij, MD_J_TILE_THREADS); + float *tile16_qd_kl_max = qd_kl_max + qd_offset_for_threads(npairs_kl, MD_J_TILE_THREADS); MDBoundsInfo bounds = {li, lj, lk, ll, lij, lkl, order, nf3ij, nf3kl, nf3ijkl, npairs_ij, npairs_kl, pair_ij_mapping, pair_kl_mapping, pair_ij_loc, pair_kl_loc, tile16_qd_ij_max, tile16_qd_kl_max, diff --git a/gpu4pyscf/lib/gvhf-md/md_j.cuh b/gpu4pyscf/lib/gvhf-md/md_j.cuh index 831ab1f39..17d8058a1 100644 --- a/gpu4pyscf/lib/gvhf-md/md_j.cuh +++ b/gpu4pyscf/lib/gvhf-md/md_j.cuh @@ -15,6 +15,34 @@ */ #pragma once + +// ===================================================================== +// Per-kernel-family constants for the md_j contraction. +// +// MD_J_TILE_THREADS +// Tile-row width used by the unrolled md_j_*_* kernels. The launch +// geometry for those kernels is `dim3 threads(16, 16)` and the +// per-row task batching uses `sq_id = tx + 16*ty`. The qd_offset +// layered pyramid is therefore indexed at this same width when the +// tile16 unrolled path is selected. +// +// MD_J_QD_ALIGN +// Alignment unit (in elements) for the per-power-of-two strata of +// the qd_*_max pyramid in qd_offset_for_threads(). Set to the warp +// width assumed by the kernels that consume qd_*_max so the strided +// accesses are warp-/cache-line-friendly. `warpSize` is a +// device-only built-in; this host-side constant mirrors it. +// Centralizing the value here makes a future wider-wavefront port +// a single-point change. +// ===================================================================== +#ifndef MD_J_TILE_THREADS +#define MD_J_TILE_THREADS 16 +#endif + +#ifndef MD_J_QD_ALIGN +#define MD_J_QD_ALIGN 32 +#endif + typedef struct { int li; int lj; diff --git a/gpu4pyscf/lib/gvhf-md/md_j_driver.cu b/gpu4pyscf/lib/gvhf-md/md_j_driver.cu index 47e1d67d6..d16a81754 100644 --- a/gpu4pyscf/lib/gvhf-md/md_j_driver.cu +++ b/gpu4pyscf/lib/gvhf-md/md_j_driver.cu @@ -43,11 +43,10 @@ int offset_for_Rt2_idx(int lij, int lkl) int qd_offset_for_threads(int npairs, int threads) { - // Host-side warp-aligned padding. `warpSize` is a device-runtime - // constant unavailable here; centralize the value so a future - // wider-wavefront port is a single-point change. - constexpr int WARP_SIZE_HOST = 32; - int npairs_aligned = (npairs + (WARP_SIZE_HOST - 1)) & ~(WARP_SIZE_HOST - 1); + // Layered pyramid of warp-aligned npairs strata. Alignment unit is + // MD_J_QD_ALIGN (defined in md_j.cuh) -- the host-side mirror of + // `warpSize` (which is device-only and cannot be used here). + int npairs_aligned = (npairs + (MD_J_QD_ALIGN - 1)) & ~(MD_J_QD_ALIGN - 1); int address = 0; for (int i = 1; i < threads; i *= 2) { address += npairs_aligned; diff --git a/gpu4pyscf/lib/gvhf-md/pbc_md_contract_j.cu b/gpu4pyscf/lib/gvhf-md/pbc_md_contract_j.cu index 93a2a97e0..f82bd6f5e 100644 --- a/gpu4pyscf/lib/gvhf-md/pbc_md_contract_j.cu +++ b/gpu4pyscf/lib/gvhf-md/pbc_md_contract_j.cu @@ -337,9 +337,11 @@ int PBC_build_j(double *vj, double *dm, int n_dm, int nf3ij = (lij+1)*(lij+2)*(lij+3)/6; int nf3kl = (lkl+1)*(lkl+2)*(lkl+3)/6; int nf3ijkl = (order+1)*(order+2)*(order+3)/6; - // 16x16 threads are applied to all unrolled code - float *tile16_qd_ij_max = qd_ij_max + qd_offset_for_threads(npairs_ij, 16); - float *tile16_qd_kl_max = qd_kl_max + qd_offset_for_threads(npairs_kl, 16); + // MD_J_TILE_THREADS x MD_J_TILE_THREADS threads are applied to all + // unrolled code. The tile width matches the `dim3 threads(16,16)` + // launch geometry in unrolled_md_j*.cu and is defined in md_j.cuh. + float *tile16_qd_ij_max = qd_ij_max + qd_offset_for_threads(npairs_ij, MD_J_TILE_THREADS); + float *tile16_qd_kl_max = qd_kl_max + qd_offset_for_threads(npairs_kl, MD_J_TILE_THREADS); MDBoundsInfo bounds = {li, lj, lk, ll, lij, lkl, order, nf3ij, nf3kl, nf3ijkl, npairs_ij, npairs_kl, (int *)pair_ij_mapping, (int *)pair_kl_mapping, pair_ij_loc, pair_kl_loc, tile16_qd_ij_max, tile16_qd_kl_max, From 3fb415f88a8387ed2fca97eae6c40ee94eab5464 Mon Sep 17 00:00:00 2001 From: Abhishek Bagusetty Date: Fri, 12 Jun 2026 11:56:31 -0500 Subject: [PATCH 3/4] fix a test case with cupy.linalg -> numpy.linalg --- gpu4pyscf/scf/tests/test_cphf.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/gpu4pyscf/scf/tests/test_cphf.py b/gpu4pyscf/scf/tests/test_cphf.py index a1c483260..b2aba740e 100644 --- a/gpu4pyscf/scf/tests/test_cphf.py +++ b/gpu4pyscf/scf/tests/test_cphf.py @@ -83,8 +83,8 @@ def fx_gpu(mo1): s1vo = cupy.asarray(s1vo) mo1_gpu, e1_gpu = cphf_gpu.solve(fx_gpu, mo_energy, mo_occ, h1vo, s1vo, tol=1e-9) - assert cupy.linalg.norm(mo1_cpu - mo1_gpu.get()) < 1e-6 - assert cupy.linalg.norm(e1_cpu - e1_gpu.get()) < 1e-6 + assert numpy.linalg.norm(mo1_cpu - mo1_gpu.get()) < 1e-6 + assert numpy.linalg.norm(e1_cpu - e1_gpu.get()) < 1e-6 def test_cphf_with_guess(self): # Test GPU CPHF solver with an initial guess (mo10) against CPU default @@ -127,8 +127,8 @@ def fx_gpu(mo1): mo1_gpu, e1_gpu = cphf_gpu.solve(fx_gpu, mo_energy, mo_occ, h1vo_gpu, s1vo_gpu, tol=1e-9, mo10=mo10_gpu) - assert cupy.linalg.norm(mo1_cpu - mo1_gpu.get()) < 1e-6 - assert cupy.linalg.norm(e1_cpu - e1_gpu.get()) < 1e-6 + assert numpy.linalg.norm(mo1_cpu - mo1_gpu.get()) < 1e-6 + assert numpy.linalg.norm(e1_cpu - e1_gpu.get()) < 1e-6 def test_ucphf(self): mf = scf.UHF(mol) @@ -166,10 +166,10 @@ def fx_gpu(mo1): s1vo = cupy.asarray(s1vo) mo1_gpu, e1_gpu = ucphf_gpu.solve(fx_gpu, mo_energy, mo_occ, h1vo, s1vo, tol=1e-9) - assert cupy.linalg.norm(mo1_cpu[0] - mo1_gpu[0].get()) < 1e-6 - assert cupy.linalg.norm(mo1_cpu[1] - mo1_gpu[1].get()) < 1e-6 - assert cupy.linalg.norm(e1_cpu[0] - e1_gpu[0].get()) < 1e-6 - assert cupy.linalg.norm(e1_cpu[1] - e1_gpu[1].get()) < 1e-6 + assert numpy.linalg.norm(mo1_cpu[0] - mo1_gpu[0].get()) < 1e-6 + assert numpy.linalg.norm(mo1_cpu[1] - mo1_gpu[1].get()) < 1e-6 + assert numpy.linalg.norm(e1_cpu[0] - e1_gpu[0].get()) < 1e-6 + assert numpy.linalg.norm(e1_cpu[1] - e1_gpu[1].get()) < 1e-6 if __name__ == "__main__": print("Full Tests for CPHF/UCPHF") From e64378cdd3c9704528d1b06b2050b8d6251fcd3a Mon Sep 17 00:00:00 2001 From: Abhishek Bagusetty Date: Fri, 12 Jun 2026 14:00:09 -0500 Subject: [PATCH 4/4] Replace a np.int32 initialization with device-resident cp.zeros --- gpu4pyscf/scf/j_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu4pyscf/scf/j_engine.py b/gpu4pyscf/scf/j_engine.py index 7f1876c04..e77fdddf5 100644 --- a/gpu4pyscf/scf/j_engine.py +++ b/gpu4pyscf/scf/j_engine.py @@ -125,7 +125,7 @@ def get_j(self, dms, verbose): ll = ls[:,None] + ls ll = ll.ravel()[pair_lst] # drops the pairs that do not contribute to integrals xyz_size = (ll+1)*(ll+2)*(ll+3)//6 - pair_loc_gpu = cp.cumsum(cp.append(np.int32(0), xyz_size.ravel()), dtype=np.int32) + pair_loc_gpu = cp.cumsum(cp.append(cp.zeros(1, dtype=np.int32), xyz_size.ravel()), dtype=np.int32) xyz_size = ls = ll = None pair_lst = np.asarray(pair_lst.get(), dtype=np.int32)