diff --git a/gpu4pyscf/lib/ecp/common.cu b/gpu4pyscf/lib/ecp/common.cu index 543d95e47..e05b9f1a2 100644 --- a/gpu4pyscf/lib/ecp/common.cu +++ b/gpu4pyscf/lib/ecp/common.cu @@ -125,22 +125,22 @@ void block_reduce(double val, double *d_out) { __syncthreads(); // Perform reduction in shared memory. - // Reduce the data until 32 threads remain. - for (unsigned int s = THREADS / 2; s >= 32; s >>= 1) { + // Reduce the data until one warp remains. warpSize is a built-in + // device constant (32 on NVIDIA today; portable for future targets). + for (unsigned int s = THREADS / 2; s >= warpSize; s >>= 1) { if (tid < s) { sdata[tid] += sdata[tid + s]; } __syncthreads(); } - if (tid < 32) { + if (tid < warpSize) { double value = sdata[tid]; // load to register unsigned int mask = __activemask(); // all lanes in warp 0 are active - value += __shfl_down_sync(mask, value, 16); - value += __shfl_down_sync(mask, value, 8); - value += __shfl_down_sync(mask, value, 4); - value += __shfl_down_sync(mask, value, 2); - value += __shfl_down_sync(mask, value, 1); + // Final intra-warp reduction; stride starts at warpSize/2. + for (int offset = warpSize / 2; offset > 0; offset >>= 1) { + value += __shfl_down_sync(mask, value, offset); + } if (tid == 0) { d_out[0] += value; } diff --git a/gpu4pyscf/lib/gvhf-md/md_contract_j.cu b/gpu4pyscf/lib/gvhf-md/md_contract_j.cu index d34980e91..f60b663b5 100644 --- a/gpu4pyscf/lib/gvhf-md/md_contract_j.cu +++ b/gpu4pyscf/lib/gvhf-md/md_contract_j.cu @@ -98,7 +98,7 @@ void md_j_1dm_kernel(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, int nsq_per_block = blockDim.x; //assert(nsq_per_block == threadsx * threadsy); int t_id = gout_id * nsq_per_block + sq_id; - int lane_id = t_id % 32; + int lane_id = t_id % warpSize; int group_id = lane_id / threadsx; unsigned int mask = ((1 << threadsx) - 1) << group_id * threadsx; int tx = sq_id % threadsx; @@ -397,7 +397,7 @@ void md_j_4dm_kernel(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, int nsq_per_block = blockDim.x; //assert(nsq_per_block == threadsx * threadsy); int t_id = gout_id * nsq_per_block + sq_id; - int lane_id = t_id % 32; + int lane_id = t_id % warpSize; int group_id = lane_id / threadsx; unsigned int mask = ((1 << threadsx) - 1) << group_id * threadsx; int tx = sq_id % threadsx; @@ -891,9 +891,11 @@ int MD_build_j(double *vj, double *dm, int n_dm, int dm_size, int nf3ij = (lij+1)*(lij+2)*(lij+3)/6; int nf3kl = (lkl+1)*(lkl+2)*(lkl+3)/6; int nf3ijkl = (order+1)*(order+2)*(order+3)/6; - // 16x16 threads are applied to all unrolled code - float *tile16_qd_ij_max = qd_ij_max + qd_offset_for_threads(npairs_ij, 16); - float *tile16_qd_kl_max = qd_kl_max + qd_offset_for_threads(npairs_kl, 16); + // MD_J_TILE_THREADS x MD_J_TILE_THREADS threads are applied to all + // unrolled code. The tile width matches the `dim3 threads(16,16)` + // launch geometry in unrolled_md_j*.cu and is defined in md_j.cuh. + float *tile16_qd_ij_max = qd_ij_max + qd_offset_for_threads(npairs_ij, MD_J_TILE_THREADS); + float *tile16_qd_kl_max = qd_kl_max + qd_offset_for_threads(npairs_kl, MD_J_TILE_THREADS); MDBoundsInfo bounds = {li, lj, lk, ll, lij, lkl, order, nf3ij, nf3kl, nf3ijkl, npairs_ij, npairs_kl, pair_ij_mapping, pair_kl_mapping, pair_ij_loc, pair_kl_loc, tile16_qd_ij_max, tile16_qd_kl_max, diff --git a/gpu4pyscf/lib/gvhf-md/md_j.cuh b/gpu4pyscf/lib/gvhf-md/md_j.cuh index 831ab1f39..17d8058a1 100644 --- a/gpu4pyscf/lib/gvhf-md/md_j.cuh +++ b/gpu4pyscf/lib/gvhf-md/md_j.cuh @@ -15,6 +15,34 @@ */ #pragma once + +// ===================================================================== +// Per-kernel-family constants for the md_j contraction. +// +// MD_J_TILE_THREADS +// Tile-row width used by the unrolled md_j_*_* kernels. The launch +// geometry for those kernels is `dim3 threads(16, 16)` and the +// per-row task batching uses `sq_id = tx + 16*ty`. The qd_offset +// layered pyramid is therefore indexed at this same width when the +// tile16 unrolled path is selected. +// +// MD_J_QD_ALIGN +// Alignment unit (in elements) for the per-power-of-two strata of +// the qd_*_max pyramid in qd_offset_for_threads(). Set to the warp +// width assumed by the kernels that consume qd_*_max so the strided +// accesses are warp-/cache-line-friendly. `warpSize` is a +// device-only built-in; this host-side constant mirrors it. +// Centralizing the value here makes a future wider-wavefront port +// a single-point change. +// ===================================================================== +#ifndef MD_J_TILE_THREADS +#define MD_J_TILE_THREADS 16 +#endif + +#ifndef MD_J_QD_ALIGN +#define MD_J_QD_ALIGN 32 +#endif + typedef struct { int li; int lj; diff --git a/gpu4pyscf/lib/gvhf-md/md_j_driver.cu b/gpu4pyscf/lib/gvhf-md/md_j_driver.cu index 84d73f0a0..d16a81754 100644 --- a/gpu4pyscf/lib/gvhf-md/md_j_driver.cu +++ b/gpu4pyscf/lib/gvhf-md/md_j_driver.cu @@ -43,7 +43,10 @@ int offset_for_Rt2_idx(int lij, int lkl) int qd_offset_for_threads(int npairs, int threads) { - int npairs_aligned = (npairs + 31) & 0xffffffe0; // 32-element aligned + // Layered pyramid of warp-aligned npairs strata. Alignment unit is + // MD_J_QD_ALIGN (defined in md_j.cuh) -- the host-side mirror of + // `warpSize` (which is device-only and cannot be used here). + int npairs_aligned = (npairs + (MD_J_QD_ALIGN - 1)) & ~(MD_J_QD_ALIGN - 1); int address = 0; for (int i = 1; i < threads; i *= 2) { address += npairs_aligned; diff --git a/gpu4pyscf/lib/gvhf-md/pbc_md_contract_j.cu b/gpu4pyscf/lib/gvhf-md/pbc_md_contract_j.cu index 93a2a97e0..f82bd6f5e 100644 --- a/gpu4pyscf/lib/gvhf-md/pbc_md_contract_j.cu +++ b/gpu4pyscf/lib/gvhf-md/pbc_md_contract_j.cu @@ -337,9 +337,11 @@ int PBC_build_j(double *vj, double *dm, int n_dm, int nf3ij = (lij+1)*(lij+2)*(lij+3)/6; int nf3kl = (lkl+1)*(lkl+2)*(lkl+3)/6; int nf3ijkl = (order+1)*(order+2)*(order+3)/6; - // 16x16 threads are applied to all unrolled code - float *tile16_qd_ij_max = qd_ij_max + qd_offset_for_threads(npairs_ij, 16); - float *tile16_qd_kl_max = qd_kl_max + qd_offset_for_threads(npairs_kl, 16); + // MD_J_TILE_THREADS x MD_J_TILE_THREADS threads are applied to all + // unrolled code. The tile width matches the `dim3 threads(16,16)` + // launch geometry in unrolled_md_j*.cu and is defined in md_j.cuh. + float *tile16_qd_ij_max = qd_ij_max + qd_offset_for_threads(npairs_ij, MD_J_TILE_THREADS); + float *tile16_qd_kl_max = qd_kl_max + qd_offset_for_threads(npairs_kl, MD_J_TILE_THREADS); MDBoundsInfo bounds = {li, lj, lk, ll, lij, lkl, order, nf3ij, nf3kl, nf3ijkl, npairs_ij, npairs_kl, (int *)pair_ij_mapping, (int *)pair_kl_mapping, pair_ij_loc, pair_kl_loc, tile16_qd_ij_max, tile16_qd_kl_max, diff --git a/gpu4pyscf/lib/gvhf-md/unrolled_md_j.cu b/gpu4pyscf/lib/gvhf-md/unrolled_md_j.cu index 8e3cced8d..76702991a 100644 --- a/gpu4pyscf/lib/gvhf-md/unrolled_md_j.cu +++ b/gpu4pyscf/lib/gvhf-md/unrolled_md_j.cu @@ -36,7 +36,7 @@ void md_j_0_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double *vj = jk.vj; double vj_kl0, dm_kl0; - unsigned int lane_id = thread_id % 32; + unsigned int lane_id = thread_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); @@ -237,7 +237,7 @@ void md_j_1_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double *vj = jk.vj; double vj_kl0, dm_kl0; - unsigned int lane_id = thread_id % 32; + unsigned int lane_id = thread_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); @@ -443,7 +443,7 @@ void md_j_1_1(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double *vj = jk.vj; double vj_kl0, dm_kl0; - unsigned int lane_id = thread_id % 32; + unsigned int lane_id = thread_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); @@ -704,7 +704,7 @@ void md_j_2_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double *vj = jk.vj; double vj_kl0, dm_kl0; - unsigned int lane_id = thread_id % 32; + unsigned int lane_id = thread_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); @@ -928,7 +928,7 @@ void md_j_2_1(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double *vj = jk.vj; double vj_kl0, dm_kl0; - unsigned int lane_id = thread_id % 32; + unsigned int lane_id = thread_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); @@ -1252,7 +1252,7 @@ void md_j_2_2(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double *vj = jk.vj; double vj_kl0, dm_kl0; - unsigned int lane_id = thread_id % 32; + unsigned int lane_id = thread_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); @@ -1770,7 +1770,7 @@ void md_j_3_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double *vj = jk.vj; double vj_kl0, dm_kl0; - unsigned int lane_id = thread_id % 32; + unsigned int lane_id = thread_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); @@ -2029,7 +2029,7 @@ void md_j_3_1(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double *vj = jk.vj; double vj_kl0, dm_kl0; - unsigned int lane_id = thread_id % 32; + unsigned int lane_id = thread_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); @@ -2460,7 +2460,7 @@ void md_j_3_2(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double *vj = jk.vj; double vj_kl0, dm_kl0; - unsigned int lane_id = thread_id % 32; + unsigned int lane_id = thread_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); @@ -3222,7 +3222,7 @@ void md_j_4_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double *vj = jk.vj; double vj_kl0, dm_kl0; - unsigned int lane_id = thread_id % 32; + unsigned int lane_id = thread_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); @@ -3545,7 +3545,7 @@ void md_j_4_1(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double *vj = jk.vj; double vj_kl0, dm_kl0; - unsigned int lane_id = thread_id % 32; + unsigned int lane_id = thread_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); @@ -4151,7 +4151,7 @@ void md_j_5_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double *vj = jk.vj; double vj_kl0, dm_kl0; - unsigned int lane_id = thread_id % 32; + unsigned int lane_id = thread_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); diff --git a/gpu4pyscf/lib/gvhf-md/unrolled_md_j_4dm.cu b/gpu4pyscf/lib/gvhf-md/unrolled_md_j_4dm.cu index b64bb0bd8..ba68832c4 100644 --- a/gpu4pyscf/lib/gvhf-md/unrolled_md_j_4dm.cu +++ b/gpu4pyscf/lib/gvhf-md/unrolled_md_j_4dm.cu @@ -32,7 +32,7 @@ void md_j_4dm_0_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double vj_kl[8]; double dm_kl[8]; - unsigned int lane_id = sq_id % 32; + unsigned int lane_id = sq_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); @@ -335,7 +335,7 @@ void md_j_4dm_1_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double vj_kl[8]; double dm_kl[8]; - unsigned int lane_id = sq_id % 32; + unsigned int lane_id = sq_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); @@ -709,7 +709,7 @@ void md_j_4dm_1_1(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double vj_kl[8]; double dm_kl[8]; - unsigned int lane_id = sq_id % 32; + unsigned int lane_id = sq_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); @@ -1477,7 +1477,7 @@ void md_j_4dm_2_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double vj_kl[8]; double dm_kl[8]; - unsigned int lane_id = sq_id % 32; + unsigned int lane_id = sq_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); @@ -2007,7 +2007,7 @@ void md_j_4dm_2_1(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double vj_kl[4]; double dm_kl[4]; - unsigned int lane_id = sq_id % 32; + unsigned int lane_id = sq_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); @@ -3036,7 +3036,7 @@ void md_j_4dm_2_2(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double vj_kl[4]; double dm_kl[4]; - unsigned int lane_id = sq_id % 32; + unsigned int lane_id = sq_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); @@ -5209,7 +5209,7 @@ void md_j_4dm_3_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double vj_kl[4]; double dm_kl[4]; - unsigned int lane_id = sq_id % 32; + unsigned int lane_id = sq_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); @@ -5856,7 +5856,7 @@ void md_j_4dm_3_1(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double vj_kl[4]; double dm_kl[4]; - unsigned int lane_id = sq_id % 32; + unsigned int lane_id = sq_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); @@ -7748,7 +7748,7 @@ void md_j_4dm_4_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double vj_kl[2]; double dm_kl[2]; - unsigned int lane_id = sq_id % 32; + unsigned int lane_id = sq_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); @@ -8330,7 +8330,7 @@ void md_j_4dm_4_1(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double vj_kl[2]; double dm_kl[2]; - unsigned int lane_id = sq_id % 32; + unsigned int lane_id = sq_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); @@ -9967,7 +9967,7 @@ void md_j_4dm_5_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds, double *env = envs.env; double vj_kl[2]; double dm_kl[2]; - unsigned int lane_id = sq_id % 32; + unsigned int lane_id = sq_id % warpSize; unsigned int group_id = lane_id / 16; unsigned int mask = 0xffff << (group_id * 16); diff --git a/gpu4pyscf/lib/multigrid/multigrid.cuh b/gpu4pyscf/lib/multigrid/multigrid.cuh index 44594bba0..9ed504739 100644 --- a/gpu4pyscf/lib/multigrid/multigrid.cuh +++ b/gpu4pyscf/lib/multigrid/multigrid.cuh @@ -16,7 +16,13 @@ #include +// WARP_SIZE: compile-time constant used for shared-memory sizing. +// `warpSize` (HIP/CUDA device-runtime built-in) is not constexpr, +// so we keep a literal here. Guarded so the build can override +// it (e.g. -DWARP_SIZE=64) for future wider-wavefront targets. +#ifndef WARP_SIZE #define WARP_SIZE 32 +#endif #define WARPS 8 #define THREADS (WARP_SIZE*WARPS) #define LMAX 4 diff --git a/gpu4pyscf/lib/multigrid/multigrid_v2/evaluation.cuh b/gpu4pyscf/lib/multigrid/multigrid_v2/evaluation.cuh index 05f6a5bec..f8cbec2cd 100644 --- a/gpu4pyscf/lib/multigrid/multigrid_v2/evaluation.cuh +++ b/gpu4pyscf/lib/multigrid/multigrid_v2/evaluation.cuh @@ -93,7 +93,11 @@ __global__ static void evaluate_density_kernel( KernelType prefactor[n_channels * n_i_cartesian_functions * n_j_cartesian_functions]; - constexpr int n_warps = n_threads / 32; /*L2*/ + // Compile-time warp count; `warpSize` is a device-runtime constant + // (not constexpr), so we use the literal here. Centralizing the + // value will make the future HIP/wavefront port a single-point edit. + constexpr int WARP_SIZE_CT = 32; + constexpr int n_warps = n_threads / WARP_SIZE_CT; /*L2*/ __shared__ KernelType reduced_density_values[n_channels * n_warps * n_threads]; #pragma unroll @@ -329,11 +333,14 @@ __global__ static void evaluate_density_kernel( density_value_to_be_shared *= gaussian; KernelType _wv = density_value_to_be_shared; /*WS*/ + // Intra-warp reduction using warpSize (device-runtime + // built-in). Stride starts at warpSize/2 and halves. #pragma unroll - for (int _o = 16; _o > 0; _o >>= 1) + for (int _o = warpSize / 2; _o > 0; _o >>= 1) _wv += __shfl_down_sync(0xffffffffu, _wv, _o); - if ((thread_id & 31) == 0) { /*L2: warp-private store, no atomic*/ - reduced_density_values[(i_channel * n_warps + (thread_id >> 5)) * + // Warp leader test + warp-index extraction using warpSize. + if ((thread_id % warpSize) == 0) { /*L2: warp-private store, no atomic*/ + reduced_density_values[(i_channel * n_warps + (thread_id / warpSize)) * n_threads + a_index * BLOCK_DIM_XYZ * BLOCK_DIM_XYZ + b_index * BLOCK_DIM_XYZ + c_index] += _wv; diff --git a/gpu4pyscf/lib/pbc/ft_ao.cu b/gpu4pyscf/lib/pbc/ft_ao.cu index e8b31aa84..fff9be2d8 100644 --- a/gpu4pyscf/lib/pbc/ft_ao.cu +++ b/gpu4pyscf/lib/pbc/ft_ao.cu @@ -21,7 +21,13 @@ #include "gvhf-rys/vhf.cuh" #include "gvhf-rys/rys_contract_k.cuh" +// WARP_SIZE: compile-time constant used for shared-memory sizing. +// `warpSize` (HIP/CUDA device-runtime built-in) is not constexpr, +// so we keep a literal here. Guarded so the build can override +// it (e.g. -DWARP_SIZE=64) for future wider-wavefront targets. +#ifndef WARP_SIZE #define WARP_SIZE 32 +#endif #define WARPS 8 #define NG_PER_BLOCK WARP_SIZE #define FT_AO_THREADS (WARP_SIZE*4) diff --git a/gpu4pyscf/lib/pbc/ft_ao.cuh b/gpu4pyscf/lib/pbc/ft_ao.cuh index f054a0bba..fcdbfbfd6 100644 --- a/gpu4pyscf/lib/pbc/ft_ao.cuh +++ b/gpu4pyscf/lib/pbc/ft_ao.cuh @@ -18,7 +18,13 @@ #include +// WARP_SIZE: compile-time constant used for shared-memory sizing. +// `warpSize` (HIP/CUDA device-runtime built-in) is not constexpr, +// so we keep a literal here. Guarded so the build can override +// it (e.g. -DWARP_SIZE=64) for future wider-wavefront targets. +#ifndef WARP_SIZE #define WARP_SIZE 32 +#endif #define FT_AO_THREADS (WARP_SIZE*4) #define NG_PER_BLOCK 32 diff --git a/gpu4pyscf/lib/pbc/int3c2e.cuh b/gpu4pyscf/lib/pbc/int3c2e.cuh index 3dac1bbaf..4e49d764d 100644 --- a/gpu4pyscf/lib/pbc/int3c2e.cuh +++ b/gpu4pyscf/lib/pbc/int3c2e.cuh @@ -18,7 +18,13 @@ #include +// WARP_SIZE: compile-time constant used for shared-memory sizing. +// `warpSize` (HIP/CUDA device-runtime built-in) is not constexpr, +// so we keep a literal here. Guarded so the build can override +// it (e.g. -DWARP_SIZE=64) for future wider-wavefront targets. +#ifndef WARP_SIZE #define WARP_SIZE 32 +#endif // corresponding to 256 threads #define WARPS 8 #define THREADS (WARP_SIZE*WARPS) diff --git a/gpu4pyscf/lib/pbc/int3c2e_create_tasks.cuh b/gpu4pyscf/lib/pbc/int3c2e_create_tasks.cuh index 0aa549fe1..477536101 100644 --- a/gpu4pyscf/lib/pbc/int3c2e_create_tasks.cuh +++ b/gpu4pyscf/lib/pbc/int3c2e_create_tasks.cuh @@ -22,7 +22,13 @@ #include #define THREADS 256 +// WARP_SIZE: compile-time constant used for shared-memory sizing. +// `warpSize` (HIP/CUDA device-runtime built-in) is not constexpr, +// so we keep a literal here. Guarded so the build can override +// it (e.g. -DWARP_SIZE=64) for future wider-wavefront targets. +#ifndef WARP_SIZE #define WARP_SIZE 32 +#endif #define WARPS 8 #define LMAX 4 #define LMAX1 (LMAX+1) diff --git a/gpu4pyscf/lib/pbc/int3c2e_create_tasks_o1.cuh b/gpu4pyscf/lib/pbc/int3c2e_create_tasks_o1.cuh index eec32372a..ace518638 100644 --- a/gpu4pyscf/lib/pbc/int3c2e_create_tasks_o1.cuh +++ b/gpu4pyscf/lib/pbc/int3c2e_create_tasks_o1.cuh @@ -22,7 +22,13 @@ #include "gvhf-rys/vhf.cuh" #define THREADS 256 +// WARP_SIZE: compile-time constant used for shared-memory sizing. +// `warpSize` (HIP/CUDA device-runtime built-in) is not constexpr, +// so we keep a literal here. Guarded so the build can override +// it (e.g. -DWARP_SIZE=64) for future wider-wavefront targets. +#ifndef WARP_SIZE #define WARP_SIZE 32 +#endif #define WARPS 8 #define REMOTE_THRESHOLD 50 diff --git a/gpu4pyscf/scf/j_engine.py b/gpu4pyscf/scf/j_engine.py index 7f1876c04..e77fdddf5 100644 --- a/gpu4pyscf/scf/j_engine.py +++ b/gpu4pyscf/scf/j_engine.py @@ -125,7 +125,7 @@ def get_j(self, dms, verbose): ll = ls[:,None] + ls ll = ll.ravel()[pair_lst] # drops the pairs that do not contribute to integrals xyz_size = (ll+1)*(ll+2)*(ll+3)//6 - pair_loc_gpu = cp.cumsum(cp.append(np.int32(0), xyz_size.ravel()), dtype=np.int32) + pair_loc_gpu = cp.cumsum(cp.append(cp.zeros(1, dtype=np.int32), xyz_size.ravel()), dtype=np.int32) xyz_size = ls = ll = None pair_lst = np.asarray(pair_lst.get(), dtype=np.int32) diff --git a/gpu4pyscf/scf/tests/test_cphf.py b/gpu4pyscf/scf/tests/test_cphf.py index a1c483260..b2aba740e 100644 --- a/gpu4pyscf/scf/tests/test_cphf.py +++ b/gpu4pyscf/scf/tests/test_cphf.py @@ -83,8 +83,8 @@ def fx_gpu(mo1): s1vo = cupy.asarray(s1vo) mo1_gpu, e1_gpu = cphf_gpu.solve(fx_gpu, mo_energy, mo_occ, h1vo, s1vo, tol=1e-9) - assert cupy.linalg.norm(mo1_cpu - mo1_gpu.get()) < 1e-6 - assert cupy.linalg.norm(e1_cpu - e1_gpu.get()) < 1e-6 + assert numpy.linalg.norm(mo1_cpu - mo1_gpu.get()) < 1e-6 + assert numpy.linalg.norm(e1_cpu - e1_gpu.get()) < 1e-6 def test_cphf_with_guess(self): # Test GPU CPHF solver with an initial guess (mo10) against CPU default @@ -127,8 +127,8 @@ def fx_gpu(mo1): mo1_gpu, e1_gpu = cphf_gpu.solve(fx_gpu, mo_energy, mo_occ, h1vo_gpu, s1vo_gpu, tol=1e-9, mo10=mo10_gpu) - assert cupy.linalg.norm(mo1_cpu - mo1_gpu.get()) < 1e-6 - assert cupy.linalg.norm(e1_cpu - e1_gpu.get()) < 1e-6 + assert numpy.linalg.norm(mo1_cpu - mo1_gpu.get()) < 1e-6 + assert numpy.linalg.norm(e1_cpu - e1_gpu.get()) < 1e-6 def test_ucphf(self): mf = scf.UHF(mol) @@ -166,10 +166,10 @@ def fx_gpu(mo1): s1vo = cupy.asarray(s1vo) mo1_gpu, e1_gpu = ucphf_gpu.solve(fx_gpu, mo_energy, mo_occ, h1vo, s1vo, tol=1e-9) - assert cupy.linalg.norm(mo1_cpu[0] - mo1_gpu[0].get()) < 1e-6 - assert cupy.linalg.norm(mo1_cpu[1] - mo1_gpu[1].get()) < 1e-6 - assert cupy.linalg.norm(e1_cpu[0] - e1_gpu[0].get()) < 1e-6 - assert cupy.linalg.norm(e1_cpu[1] - e1_gpu[1].get()) < 1e-6 + assert numpy.linalg.norm(mo1_cpu[0] - mo1_gpu[0].get()) < 1e-6 + assert numpy.linalg.norm(mo1_cpu[1] - mo1_gpu[1].get()) < 1e-6 + assert numpy.linalg.norm(e1_cpu[0] - e1_gpu[0].get()) < 1e-6 + assert numpy.linalg.norm(e1_cpu[1] - e1_gpu[1].get()) < 1e-6 if __name__ == "__main__": print("Full Tests for CPHF/UCPHF")