Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions gpu4pyscf/lib/ecp/common.cu
Original file line number Diff line number Diff line change
Expand Up @@ -125,22 +125,22 @@ void block_reduce(double val, double *d_out) {
__syncthreads();

// Perform reduction in shared memory.
// Reduce the data until 32 threads remain.
for (unsigned int s = THREADS / 2; s >= 32; s >>= 1) {
// Reduce the data until one warp remains. warpSize is a built-in
// device constant (32 on NVIDIA today; portable for future targets).
for (unsigned int s = THREADS / 2; s >= warpSize; s >>= 1) {
if (tid < s) {
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}

if (tid < 32) {
if (tid < warpSize) {
double value = sdata[tid]; // load to register
unsigned int mask = __activemask(); // all lanes in warp 0 are active
value += __shfl_down_sync(mask, value, 16);
value += __shfl_down_sync(mask, value, 8);
value += __shfl_down_sync(mask, value, 4);
value += __shfl_down_sync(mask, value, 2);
value += __shfl_down_sync(mask, value, 1);
// Final intra-warp reduction; stride starts at warpSize/2.
for (int offset = warpSize / 2; offset > 0; offset >>= 1) {
value += __shfl_down_sync(mask, value, offset);
}
if (tid == 0) {
d_out[0] += value;
}
Expand Down
12 changes: 7 additions & 5 deletions gpu4pyscf/lib/gvhf-md/md_contract_j.cu
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ void md_j_1dm_kernel(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
int nsq_per_block = blockDim.x;
//assert(nsq_per_block == threadsx * threadsy);
int t_id = gout_id * nsq_per_block + sq_id;
int lane_id = t_id % 32;
int lane_id = t_id % warpSize;
int group_id = lane_id / threadsx;
unsigned int mask = ((1 << threadsx) - 1) << group_id * threadsx;
int tx = sq_id % threadsx;
Expand Down Expand Up @@ -397,7 +397,7 @@ void md_j_4dm_kernel(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
int nsq_per_block = blockDim.x;
//assert(nsq_per_block == threadsx * threadsy);
int t_id = gout_id * nsq_per_block + sq_id;
int lane_id = t_id % 32;
int lane_id = t_id % warpSize;
int group_id = lane_id / threadsx;
unsigned int mask = ((1 << threadsx) - 1) << group_id * threadsx;
int tx = sq_id % threadsx;
Expand Down Expand Up @@ -891,9 +891,11 @@ int MD_build_j(double *vj, double *dm, int n_dm, int dm_size,
int nf3ij = (lij+1)*(lij+2)*(lij+3)/6;
int nf3kl = (lkl+1)*(lkl+2)*(lkl+3)/6;
int nf3ijkl = (order+1)*(order+2)*(order+3)/6;
// 16x16 threads are applied to all unrolled code
float *tile16_qd_ij_max = qd_ij_max + qd_offset_for_threads(npairs_ij, 16);
float *tile16_qd_kl_max = qd_kl_max + qd_offset_for_threads(npairs_kl, 16);
// MD_J_TILE_THREADS x MD_J_TILE_THREADS threads are applied to all
// unrolled code. The tile width matches the `dim3 threads(16,16)`
// launch geometry in unrolled_md_j*.cu and is defined in md_j.cuh.
float *tile16_qd_ij_max = qd_ij_max + qd_offset_for_threads(npairs_ij, MD_J_TILE_THREADS);
float *tile16_qd_kl_max = qd_kl_max + qd_offset_for_threads(npairs_kl, MD_J_TILE_THREADS);
MDBoundsInfo bounds = {li, lj, lk, ll, lij, lkl, order, nf3ij, nf3kl, nf3ijkl,
npairs_ij, npairs_kl, pair_ij_mapping, pair_kl_mapping,
pair_ij_loc, pair_kl_loc, tile16_qd_ij_max, tile16_qd_kl_max,
Expand Down
28 changes: 28 additions & 0 deletions gpu4pyscf/lib/gvhf-md/md_j.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,34 @@
*/

#pragma once

// =====================================================================
// Per-kernel-family constants for the md_j contraction.
//
// MD_J_TILE_THREADS
// Tile-row width used by the unrolled md_j_*_* kernels. The launch
// geometry for those kernels is `dim3 threads(16, 16)` and the
// per-row task batching uses `sq_id = tx + 16*ty`. The qd_offset
// layered pyramid is therefore indexed at this same width when the
// tile16 unrolled path is selected.
//
// MD_J_QD_ALIGN
// Alignment unit (in elements) for the per-power-of-two strata of
// the qd_*_max pyramid in qd_offset_for_threads(). Set to the warp
// width assumed by the kernels that consume qd_*_max so the strided
// accesses are warp-/cache-line-friendly. `warpSize` is a
// device-only built-in; this host-side constant mirrors it.
// Centralizing the value here makes a future wider-wavefront port
// a single-point change.
// =====================================================================
#ifndef MD_J_TILE_THREADS
#define MD_J_TILE_THREADS 16
#endif

#ifndef MD_J_QD_ALIGN
#define MD_J_QD_ALIGN 32
#endif

typedef struct {
int li;
int lj;
Expand Down
5 changes: 4 additions & 1 deletion gpu4pyscf/lib/gvhf-md/md_j_driver.cu
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,10 @@ int offset_for_Rt2_idx(int lij, int lkl)

int qd_offset_for_threads(int npairs, int threads)
{
int npairs_aligned = (npairs + 31) & 0xffffffe0; // 32-element aligned
// Layered pyramid of warp-aligned npairs strata. Alignment unit is
// MD_J_QD_ALIGN (defined in md_j.cuh) -- the host-side mirror of
// `warpSize` (which is device-only and cannot be used here).
int npairs_aligned = (npairs + (MD_J_QD_ALIGN - 1)) & ~(MD_J_QD_ALIGN - 1);
int address = 0;
for (int i = 1; i < threads; i *= 2) {
address += npairs_aligned;
Expand Down
8 changes: 5 additions & 3 deletions gpu4pyscf/lib/gvhf-md/pbc_md_contract_j.cu
Original file line number Diff line number Diff line change
Expand Up @@ -337,9 +337,11 @@ int PBC_build_j(double *vj, double *dm, int n_dm,
int nf3ij = (lij+1)*(lij+2)*(lij+3)/6;
int nf3kl = (lkl+1)*(lkl+2)*(lkl+3)/6;
int nf3ijkl = (order+1)*(order+2)*(order+3)/6;
// 16x16 threads are applied to all unrolled code
float *tile16_qd_ij_max = qd_ij_max + qd_offset_for_threads(npairs_ij, 16);
float *tile16_qd_kl_max = qd_kl_max + qd_offset_for_threads(npairs_kl, 16);
// MD_J_TILE_THREADS x MD_J_TILE_THREADS threads are applied to all
// unrolled code. The tile width matches the `dim3 threads(16,16)`
// launch geometry in unrolled_md_j*.cu and is defined in md_j.cuh.
float *tile16_qd_ij_max = qd_ij_max + qd_offset_for_threads(npairs_ij, MD_J_TILE_THREADS);
float *tile16_qd_kl_max = qd_kl_max + qd_offset_for_threads(npairs_kl, MD_J_TILE_THREADS);
MDBoundsInfo bounds = {li, lj, lk, ll, lij, lkl, order, nf3ij, nf3kl, nf3ijkl,
npairs_ij, npairs_kl, (int *)pair_ij_mapping, (int *)pair_kl_mapping,
pair_ij_loc, pair_kl_loc, tile16_qd_ij_max, tile16_qd_kl_max,
Expand Down
24 changes: 12 additions & 12 deletions gpu4pyscf/lib/gvhf-md/unrolled_md_j.cu
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ void md_j_0_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
double *env = envs.env;
double *vj = jk.vj;
double vj_kl0, dm_kl0;
unsigned int lane_id = thread_id % 32;
unsigned int lane_id = thread_id % warpSize;
unsigned int group_id = lane_id / 16;
unsigned int mask = 0xffff << (group_id * 16);

Expand Down Expand Up @@ -237,7 +237,7 @@ void md_j_1_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
double *env = envs.env;
double *vj = jk.vj;
double vj_kl0, dm_kl0;
unsigned int lane_id = thread_id % 32;
unsigned int lane_id = thread_id % warpSize;
unsigned int group_id = lane_id / 16;
unsigned int mask = 0xffff << (group_id * 16);

Expand Down Expand Up @@ -443,7 +443,7 @@ void md_j_1_1(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
double *env = envs.env;
double *vj = jk.vj;
double vj_kl0, dm_kl0;
unsigned int lane_id = thread_id % 32;
unsigned int lane_id = thread_id % warpSize;
unsigned int group_id = lane_id / 16;
unsigned int mask = 0xffff << (group_id * 16);

Expand Down Expand Up @@ -704,7 +704,7 @@ void md_j_2_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
double *env = envs.env;
double *vj = jk.vj;
double vj_kl0, dm_kl0;
unsigned int lane_id = thread_id % 32;
unsigned int lane_id = thread_id % warpSize;
unsigned int group_id = lane_id / 16;
unsigned int mask = 0xffff << (group_id * 16);

Expand Down Expand Up @@ -928,7 +928,7 @@ void md_j_2_1(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
double *env = envs.env;
double *vj = jk.vj;
double vj_kl0, dm_kl0;
unsigned int lane_id = thread_id % 32;
unsigned int lane_id = thread_id % warpSize;
unsigned int group_id = lane_id / 16;
unsigned int mask = 0xffff << (group_id * 16);

Expand Down Expand Up @@ -1252,7 +1252,7 @@ void md_j_2_2(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
double *env = envs.env;
double *vj = jk.vj;
double vj_kl0, dm_kl0;
unsigned int lane_id = thread_id % 32;
unsigned int lane_id = thread_id % warpSize;
unsigned int group_id = lane_id / 16;
unsigned int mask = 0xffff << (group_id * 16);

Expand Down Expand Up @@ -1770,7 +1770,7 @@ void md_j_3_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
double *env = envs.env;
double *vj = jk.vj;
double vj_kl0, dm_kl0;
unsigned int lane_id = thread_id % 32;
unsigned int lane_id = thread_id % warpSize;
unsigned int group_id = lane_id / 16;
unsigned int mask = 0xffff << (group_id * 16);

Expand Down Expand Up @@ -2029,7 +2029,7 @@ void md_j_3_1(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
double *env = envs.env;
double *vj = jk.vj;
double vj_kl0, dm_kl0;
unsigned int lane_id = thread_id % 32;
unsigned int lane_id = thread_id % warpSize;
unsigned int group_id = lane_id / 16;
unsigned int mask = 0xffff << (group_id * 16);

Expand Down Expand Up @@ -2460,7 +2460,7 @@ void md_j_3_2(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
double *env = envs.env;
double *vj = jk.vj;
double vj_kl0, dm_kl0;
unsigned int lane_id = thread_id % 32;
unsigned int lane_id = thread_id % warpSize;
unsigned int group_id = lane_id / 16;
unsigned int mask = 0xffff << (group_id * 16);

Expand Down Expand Up @@ -3222,7 +3222,7 @@ void md_j_4_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
double *env = envs.env;
double *vj = jk.vj;
double vj_kl0, dm_kl0;
unsigned int lane_id = thread_id % 32;
unsigned int lane_id = thread_id % warpSize;
unsigned int group_id = lane_id / 16;
unsigned int mask = 0xffff << (group_id * 16);

Expand Down Expand Up @@ -3545,7 +3545,7 @@ void md_j_4_1(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
double *env = envs.env;
double *vj = jk.vj;
double vj_kl0, dm_kl0;
unsigned int lane_id = thread_id % 32;
unsigned int lane_id = thread_id % warpSize;
unsigned int group_id = lane_id / 16;
unsigned int mask = 0xffff << (group_id * 16);

Expand Down Expand Up @@ -4151,7 +4151,7 @@ void md_j_5_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
double *env = envs.env;
double *vj = jk.vj;
double vj_kl0, dm_kl0;
unsigned int lane_id = thread_id % 32;
unsigned int lane_id = thread_id % warpSize;
unsigned int group_id = lane_id / 16;
unsigned int mask = 0xffff << (group_id * 16);

Expand Down
22 changes: 11 additions & 11 deletions gpu4pyscf/lib/gvhf-md/unrolled_md_j_4dm.cu
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ void md_j_4dm_0_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
double *env = envs.env;
double vj_kl[8];
double dm_kl[8];
unsigned int lane_id = sq_id % 32;
unsigned int lane_id = sq_id % warpSize;
unsigned int group_id = lane_id / 16;
unsigned int mask = 0xffff << (group_id * 16);

Expand Down Expand Up @@ -335,7 +335,7 @@ void md_j_4dm_1_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
double *env = envs.env;
double vj_kl[8];
double dm_kl[8];
unsigned int lane_id = sq_id % 32;
unsigned int lane_id = sq_id % warpSize;
unsigned int group_id = lane_id / 16;
unsigned int mask = 0xffff << (group_id * 16);

Expand Down Expand Up @@ -709,7 +709,7 @@ void md_j_4dm_1_1(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
double *env = envs.env;
double vj_kl[8];
double dm_kl[8];
unsigned int lane_id = sq_id % 32;
unsigned int lane_id = sq_id % warpSize;
unsigned int group_id = lane_id / 16;
unsigned int mask = 0xffff << (group_id * 16);

Expand Down Expand Up @@ -1477,7 +1477,7 @@ void md_j_4dm_2_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
double *env = envs.env;
double vj_kl[8];
double dm_kl[8];
unsigned int lane_id = sq_id % 32;
unsigned int lane_id = sq_id % warpSize;
unsigned int group_id = lane_id / 16;
unsigned int mask = 0xffff << (group_id * 16);

Expand Down Expand Up @@ -2007,7 +2007,7 @@ void md_j_4dm_2_1(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
double *env = envs.env;
double vj_kl[4];
double dm_kl[4];
unsigned int lane_id = sq_id % 32;
unsigned int lane_id = sq_id % warpSize;
unsigned int group_id = lane_id / 16;
unsigned int mask = 0xffff << (group_id * 16);

Expand Down Expand Up @@ -3036,7 +3036,7 @@ void md_j_4dm_2_2(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
double *env = envs.env;
double vj_kl[4];
double dm_kl[4];
unsigned int lane_id = sq_id % 32;
unsigned int lane_id = sq_id % warpSize;
unsigned int group_id = lane_id / 16;
unsigned int mask = 0xffff << (group_id * 16);

Expand Down Expand Up @@ -5209,7 +5209,7 @@ void md_j_4dm_3_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
double *env = envs.env;
double vj_kl[4];
double dm_kl[4];
unsigned int lane_id = sq_id % 32;
unsigned int lane_id = sq_id % warpSize;
unsigned int group_id = lane_id / 16;
unsigned int mask = 0xffff << (group_id * 16);

Expand Down Expand Up @@ -5856,7 +5856,7 @@ void md_j_4dm_3_1(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
double *env = envs.env;
double vj_kl[4];
double dm_kl[4];
unsigned int lane_id = sq_id % 32;
unsigned int lane_id = sq_id % warpSize;
unsigned int group_id = lane_id / 16;
unsigned int mask = 0xffff << (group_id * 16);

Expand Down Expand Up @@ -7748,7 +7748,7 @@ void md_j_4dm_4_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
double *env = envs.env;
double vj_kl[2];
double dm_kl[2];
unsigned int lane_id = sq_id % 32;
unsigned int lane_id = sq_id % warpSize;
unsigned int group_id = lane_id / 16;
unsigned int mask = 0xffff << (group_id * 16);

Expand Down Expand Up @@ -8330,7 +8330,7 @@ void md_j_4dm_4_1(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
double *env = envs.env;
double vj_kl[2];
double dm_kl[2];
unsigned int lane_id = sq_id % 32;
unsigned int lane_id = sq_id % warpSize;
unsigned int group_id = lane_id / 16;
unsigned int mask = 0xffff << (group_id * 16);

Expand Down Expand Up @@ -9967,7 +9967,7 @@ void md_j_4dm_5_0(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
double *env = envs.env;
double vj_kl[2];
double dm_kl[2];
unsigned int lane_id = sq_id % 32;
unsigned int lane_id = sq_id % warpSize;
unsigned int group_id = lane_id / 16;
unsigned int mask = 0xffff << (group_id * 16);

Expand Down
6 changes: 6 additions & 0 deletions gpu4pyscf/lib/multigrid/multigrid.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,13 @@

#include <stdint.h>

// WARP_SIZE: compile-time constant used for shared-memory sizing.
// `warpSize` (HIP/CUDA device-runtime built-in) is not constexpr,
// so we keep a literal here. Guarded so the build can override
// it (e.g. -DWARP_SIZE=64) for future wider-wavefront targets.
#ifndef WARP_SIZE
#define WARP_SIZE 32
#endif
#define WARPS 8
#define THREADS (WARP_SIZE*WARPS)
#define LMAX 4
Expand Down
15 changes: 11 additions & 4 deletions gpu4pyscf/lib/multigrid/multigrid_v2/evaluation.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,11 @@ __global__ static void evaluate_density_kernel(
KernelType
prefactor[n_channels * n_i_cartesian_functions * n_j_cartesian_functions];

constexpr int n_warps = n_threads / 32; /*L2*/
// Compile-time warp count; `warpSize` is a device-runtime constant
// (not constexpr), so we use the literal here. Centralizing the
// value will make the future HIP/wavefront port a single-point edit.
constexpr int WARP_SIZE_CT = 32;
constexpr int n_warps = n_threads / WARP_SIZE_CT; /*L2*/
__shared__ KernelType reduced_density_values[n_channels * n_warps * n_threads];

#pragma unroll
Expand Down Expand Up @@ -329,11 +333,14 @@ __global__ static void evaluate_density_kernel(
density_value_to_be_shared *= gaussian;

KernelType _wv = density_value_to_be_shared; /*WS*/
// Intra-warp reduction using warpSize (device-runtime
// built-in). Stride starts at warpSize/2 and halves.
#pragma unroll
for (int _o = 16; _o > 0; _o >>= 1)
for (int _o = warpSize / 2; _o > 0; _o >>= 1)
_wv += __shfl_down_sync(0xffffffffu, _wv, _o);
if ((thread_id & 31) == 0) { /*L2: warp-private store, no atomic*/
reduced_density_values[(i_channel * n_warps + (thread_id >> 5)) *
// Warp leader test + warp-index extraction using warpSize.
if ((thread_id % warpSize) == 0) { /*L2: warp-private store, no atomic*/
reduced_density_values[(i_channel * n_warps + (thread_id / warpSize)) *
n_threads +
a_index * BLOCK_DIM_XYZ * BLOCK_DIM_XYZ +
b_index * BLOCK_DIM_XYZ + c_index] += _wv;
Expand Down
6 changes: 6 additions & 0 deletions gpu4pyscf/lib/pbc/ft_ao.cu
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,13 @@
#include "gvhf-rys/vhf.cuh"
#include "gvhf-rys/rys_contract_k.cuh"

// WARP_SIZE: compile-time constant used for shared-memory sizing.
// `warpSize` (HIP/CUDA device-runtime built-in) is not constexpr,
// so we keep a literal here. Guarded so the build can override
// it (e.g. -DWARP_SIZE=64) for future wider-wavefront targets.
#ifndef WARP_SIZE
#define WARP_SIZE 32
#endif
#define WARPS 8
#define NG_PER_BLOCK WARP_SIZE
#define FT_AO_THREADS (WARP_SIZE*4)
Expand Down
Loading
Loading