From 12b935421437bc9b1c64ad215046c1e91fe23f45 Mon Sep 17 00:00:00 2001 From: Rui Li Date: Fri, 8 Aug 2025 18:34:20 -0700 Subject: [PATCH 01/14] cache pseudo potential to avoid redundant timing in gradient --- gpu4pyscf/pbc/dft/multigrid_v2.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/gpu4pyscf/pbc/dft/multigrid_v2.py b/gpu4pyscf/pbc/dft/multigrid_v2.py index ef7aebd5e..fa9441856 100644 --- a/gpu4pyscf/pbc/dft/multigrid_v2.py +++ b/gpu4pyscf/pbc/dft/multigrid_v2.py @@ -1263,7 +1263,7 @@ def get_veff_ip1( xc_for_fock[:, 0] += coulomb_on_g_mesh if cell._pseudo: - xc_for_fock[:, 0] += multigrid_v1.eval_vpplocG_part1(cell, mesh) + xc_for_fock[:, 0] += ni.cached_vpplocG_part1 veff_gradient = convert_xc_on_g_mesh_to_fock_gradient( ni, xc_for_fock, dm_kpts, hermi, kpts_band @@ -1279,6 +1279,9 @@ def __init__(self, cell): self.mesh = cell.mesh self.tasks = None self.sorted_gaussian_pairs = None + self.build() + if cell._pseudo: + self.cached_vpplocG_part1 = multigrid_v1.eval_vpplocG_part1(cell, self.mesh) build = sort_gaussian_pairs From e3b9bf8927b207eaaccc8a80ab7d1e7222aeba12 Mon Sep 17 00:00:00 2001 From: Rui Li Date: Sun, 26 Oct 2025 00:11:51 -0700 Subject: [PATCH 02/14] add aggressive screening of gaussian pairs around the boundaries --- .../lib/multigrid/multigrid_v2/screen.cu | 30 ++ .../lib/multigrid/multigrid_v2/screening.cuh | 318 +++++++++++++++- .../lib/multigrid/multigrid_v2/utils.cuh | 23 ++ gpu4pyscf/pbc/dft/multigrid_v2.py | 338 ++++++++++++------ 4 files changed, 603 insertions(+), 106 deletions(-) diff --git a/gpu4pyscf/lib/multigrid/multigrid_v2/screen.cu b/gpu4pyscf/lib/multigrid/multigrid_v2/screen.cu index b6574d95d..39ebbcc39 100644 --- a/gpu4pyscf/lib/multigrid/multigrid_v2/screen.cu +++ b/gpu4pyscf/lib/multigrid/multigrid_v2/screen.cu @@ -206,4 +206,34 @@ void put_pairs_on_blocks( checkCudaErrors(cudaPeekAtLastError()); } + +int tailor_gaussian_pairs( + int *sorted_pairs_per_local_grid, int *n_pairs_per_local_grid, + const int i_angular, const int j_angular, const int *non_trivial_pairs, + const int *i_shells, const int *j_shells, const int n_j_shells, + const int *shell_to_ao_indices, + const int *accumulated_n_pairs_per_local_grid, + const int *sorted_block_index, const int n_contributing_blocks, + const int *image_indices, const double *vectors_to_neighboring_images, + const int n_images, const int *mesh, const int *atm, const int *bas, + const double *env, const int is_non_orthogonal, const double threshold, + const int derivative_order) { + if (is_non_orthogonal) { + return gpu4pyscf::gpbc::multi_grid::tailor_gaussian_pairs_driver( + sorted_pairs_per_local_grid, n_pairs_per_local_grid, i_angular, + j_angular, non_trivial_pairs, i_shells, j_shells, n_j_shells, + shell_to_ao_indices, accumulated_n_pairs_per_local_grid, + sorted_block_index, n_contributing_blocks, image_indices, + vectors_to_neighboring_images, n_images, mesh, atm, bas, env, threshold, + derivative_order); + } else { + return gpu4pyscf::gpbc::multi_grid::tailor_gaussian_pairs_driver( + sorted_pairs_per_local_grid, n_pairs_per_local_grid, i_angular, + j_angular, non_trivial_pairs, i_shells, j_shells, n_j_shells, + shell_to_ao_indices, accumulated_n_pairs_per_local_grid, + sorted_block_index, n_contributing_blocks, image_indices, + vectors_to_neighboring_images, n_images, mesh, atm, bas, env, threshold, + derivative_order); + } +} } diff --git a/gpu4pyscf/lib/multigrid/multigrid_v2/screening.cuh b/gpu4pyscf/lib/multigrid/multigrid_v2/screening.cuh index b9adb4ea6..fa3971250 100644 --- a/gpu4pyscf/lib/multigrid/multigrid_v2/screening.cuh +++ b/gpu4pyscf/lib/multigrid/multigrid_v2/screening.cuh @@ -26,7 +26,7 @@ #define EIJ_CUTOFF 60 #define BLOCK_DIM_XYZ 4 -#define EXP_OVERFLOW 400 +#define EXP_OVERFLOW 400 namespace gpu4pyscf::gpbc::multi_grid { @@ -644,4 +644,320 @@ __global__ void put_pairs_on_blocks_kernel( } } +template +__global__ static void tailor_gaussian_pairs_kernel( + int *sorted_pairs_per_local_grid, int *n_pairs_per_local_grid, + const int *non_trivial_pairs, const int *i_shells, const int *j_shells, + const int n_j_shells, const int *shell_to_ao_indices, + const int *accumulated_n_pairs_per_local_grid, + const int *sorted_block_index, const int *image_indices, + const double *vectors_to_neighboring_images, const int n_images, + const int mesh_a, const int mesh_b, const int mesh_c, const int *atm, + const int *bas, const double *env, const double threshold, + const int derivative_order) { + constexpr int n_threads = BLOCK_DIM_XYZ * BLOCK_DIM_XYZ * BLOCK_DIM_XYZ; + + const int block_index = sorted_block_index[blockIdx.x]; + const int n_blocks_b = (mesh_b + BLOCK_DIM_XYZ - 1) / BLOCK_DIM_XYZ; + const int n_blocks_c = (mesh_c + BLOCK_DIM_XYZ - 1) / BLOCK_DIM_XYZ; + + const int block_a_stride = n_blocks_b * n_blocks_c; + const int block_a_index = block_index / block_a_stride; + const int block_ab_index = block_index % block_a_stride; + const int block_b_index = block_ab_index / n_blocks_c; + const int block_c_index = block_ab_index % n_blocks_c; + + const int a_start = block_a_index * BLOCK_DIM_XYZ; + const int b_start = block_b_index * BLOCK_DIM_XYZ; + const int c_start = block_c_index * BLOCK_DIM_XYZ; + + const double start_position_x = + dxyz_dabc[0] * a_start + dxyz_dabc[3] * b_start + dxyz_dabc[6] * c_start; + const double start_position_y = + dxyz_dabc[1] * a_start + dxyz_dabc[4] * b_start + dxyz_dabc[7] * c_start; + const double start_position_z = + dxyz_dabc[2] * a_start + dxyz_dabc[5] * b_start + dxyz_dabc[8] * c_start; + + const double a_dot_b = dxyz_dabc[0] * dxyz_dabc[3] + + dxyz_dabc[1] * dxyz_dabc[4] + + dxyz_dabc[2] * dxyz_dabc[5]; + const double a_dot_c = dxyz_dabc[0] * dxyz_dabc[6] + + dxyz_dabc[1] * dxyz_dabc[7] + + dxyz_dabc[2] * dxyz_dabc[8]; + const double b_dot_c = dxyz_dabc[3] * dxyz_dabc[6] + + dxyz_dabc[4] * dxyz_dabc[7] + + dxyz_dabc[5] * dxyz_dabc[8]; + + const int a_upper = min(a_start + BLOCK_DIM_XYZ, mesh_a) - a_start; + const int b_upper = min(b_start + BLOCK_DIM_XYZ, mesh_b) - b_start; + const int c_upper = min(c_start + BLOCK_DIM_XYZ, mesh_c) - c_start; + + const int thread_id = threadIdx.x + threadIdx.y * BLOCK_DIM_XYZ + + threadIdx.z * BLOCK_DIM_XYZ * BLOCK_DIM_XYZ; + + const int start_pair_index = accumulated_n_pairs_per_local_grid[block_index]; + const int end_pair_index = + accumulated_n_pairs_per_local_grid[block_index + 1]; + const int n_pairs = end_pair_index - start_pair_index; + const int n_batches = (n_pairs + n_threads - 1) / n_threads; + + for (int i_batch = 0, i_pair_index = start_pair_index + thread_id; + i_batch < n_batches; i_batch++, i_pair_index += n_threads) { + const bool is_valid_pair = i_pair_index < end_pair_index; + const int i_pair = + is_valid_pair ? sorted_pairs_per_local_grid[i_pair_index] : 0; + + const int image_index = image_indices[i_pair]; + const int image_index_i = image_index / n_images; + const int image_index_j = image_index % n_images; + + const int shell_pair_index = non_trivial_pairs[i_pair]; + const int i_shell_index = shell_pair_index / n_j_shells; + const int j_shell_index = shell_pair_index % n_j_shells; + const int i_shell = i_shells[i_shell_index]; + const int j_shell = j_shells[j_shell_index]; + + const double i_exponent = env[bas(PTR_EXP, i_shell)]; + const int i_coord_offset = atm(PTR_COORD, bas(ATOM_OF, i_shell)); + const double i_x = + env[i_coord_offset] + vectors_to_neighboring_images[image_index_i * 3]; + const double i_y = env[i_coord_offset + 1] + + vectors_to_neighboring_images[image_index_i * 3 + 1]; + const double i_z = env[i_coord_offset + 2] + + vectors_to_neighboring_images[image_index_i * 3 + 2]; + const double i_coeff = env[bas(PTR_COEFF, i_shell)]; + + const double j_exponent = env[bas(PTR_EXP, j_shell)]; + const int j_coord_offset = atm(PTR_COORD, bas(ATOM_OF, j_shell)); + const double j_x = + env[j_coord_offset] + vectors_to_neighboring_images[image_index_j * 3]; + const double j_y = env[j_coord_offset + 1] + + vectors_to_neighboring_images[image_index_j * 3 + 1]; + const double j_z = env[j_coord_offset + 2] + + vectors_to_neighboring_images[image_index_j * 3 + 2]; + const double j_coeff = env[bas(PTR_COEFF, j_shell)]; + + const double ij_exponent = i_exponent + j_exponent; + const double ij_exponent_in_prefactor = + i_exponent * j_exponent / ij_exponent * + distance_squared(i_x - j_x, i_y - j_y, i_z - j_z); + + const double pair_x = (i_exponent * i_x + j_exponent * j_x) / ij_exponent; + const double pair_y = (i_exponent * i_y + j_exponent * j_y) / ij_exponent; + const double pair_z = (i_exponent * i_z + j_exponent * j_z) / ij_exponent; + + const double x0 = start_position_x - pair_x; + const double y0 = start_position_y - pair_y; + const double z0 = start_position_z - pair_z; + + const double gaussian_exponent_at_reference = + ij_exponent * distance_squared(x0, y0, z0); + + const double pair_prefactor = i_coeff * j_coeff * + common_fac_sp() * + common_fac_sp(); + + const double gaussian_starting_point = + is_valid_pair + ? exp(-(ij_exponent_in_prefactor + gaussian_exponent_at_reference) / + 3.0) + : 0; + + const double da_squared = + distance_squared(dxyz_dabc[0], dxyz_dabc[1], dxyz_dabc[2]); + const double db_squared = + distance_squared(dxyz_dabc[3], dxyz_dabc[4], dxyz_dabc[5]); + const double dc_squared = + distance_squared(dxyz_dabc[6], dxyz_dabc[7], dxyz_dabc[8]); + + const double exp_da_squared = exp(-2 * ij_exponent * da_squared); + const double exp_db_squared = exp(-2 * ij_exponent * db_squared); + const double exp_dc_squared = exp(-2 * ij_exponent * dc_squared); + + const double cross_term_a = + dxyz_dabc[0] * x0 + dxyz_dabc[1] * y0 + dxyz_dabc[2] * z0; + const double cross_term_b = + dxyz_dabc[3] * x0 + dxyz_dabc[4] * y0 + dxyz_dabc[5] * z0; + const double cross_term_c = + dxyz_dabc[6] * x0 + dxyz_dabc[7] * y0 + dxyz_dabc[8] * z0; + + const double recursion_factor_a_start = + exp(-ij_exponent * (2 * cross_term_a + da_squared)); + const double recursion_factor_b_start = + exp(-ij_exponent * (2 * cross_term_b + db_squared)); + const double recursion_factor_c_start = + exp(-ij_exponent * (2 * cross_term_c + dc_squared)); + + const double exp_dadb = exp(-2 * ij_exponent * a_dot_b); + const double exp_dadc = exp(-2 * ij_exponent * a_dot_c); + const double exp_dbdc = exp(-2 * ij_exponent * b_dot_c); + + int a_index, b_index, c_index; + double x, y, z; + double gaussian_x, gaussian_y, gaussian_z, recursion_factor_a, + recursion_factor_b, recursion_factor_c; + double recursion_factor_ab_pow_a = 1; + double recursion_factor_ac_pow_a = 1; + double recursion_factor_bc_pow_b = 1; + + if constexpr (is_non_orthogonal) { + // recursion_factor_ab_pow_a = 1; + // recursion_factor_ac_pow_a = 1; + } else { + x = start_position_x; + } + + double max_gaussian_value = 0; + + for (a_index = 0, gaussian_x = gaussian_starting_point, + recursion_factor_a = recursion_factor_a_start; + a_index < a_upper; a_index++, gaussian_x *= recursion_factor_a, + recursion_factor_a *= exp_da_squared) { + + if constexpr (is_non_orthogonal) { + recursion_factor_bc_pow_b = 1; + } else { + y = start_position_y; + } + for (b_index = 0, gaussian_y = gaussian_starting_point, + recursion_factor_b = recursion_factor_b_start; + b_index < b_upper; b_index++, + gaussian_y *= recursion_factor_b * recursion_factor_ab_pow_a, + recursion_factor_b *= exp_db_squared) { + + if constexpr (is_non_orthogonal) { + x = start_position_x + a_index * dxyz_dabc[0] + + b_index * dxyz_dabc[3]; + y = start_position_y + a_index * dxyz_dabc[1] + + b_index * dxyz_dabc[4]; + z = start_position_z + a_index * dxyz_dabc[2] + + b_index * dxyz_dabc[5]; + } else { + z = start_position_z; + } + for (c_index = 0, gaussian_z = gaussian_starting_point, + recursion_factor_c = recursion_factor_c_start; + c_index < c_upper; c_index++, + gaussian_z *= recursion_factor_c * recursion_factor_ac_pow_a * + recursion_factor_bc_pow_b, + recursion_factor_c *= exp_dc_squared) { + + const double r_i_squared = + distance_squared(x - i_x, y - i_y, z - i_z); + const double r_j_squared = + distance_squared(x - j_x, y - j_y, z - j_z); + const double r_p_squared = + distance_squared(x - pair_x, y - pair_y, z - pair_z); + + const double approxmate_polynomial = + approximate_polynomial_value( + r_i_squared, r_j_squared, r_p_squared, derivative_order); + + const double gaussian = gaussian_x * gaussian_y * gaussian_z; + + const double approximate_value = + abs(4.0 * M_PI * r_p_squared * pair_prefactor * + approxmate_polynomial * gaussian); + + max_gaussian_value = max(max_gaussian_value, approximate_value); + + if constexpr (is_non_orthogonal) { + x += dxyz_dabc[6]; + y += dxyz_dabc[7]; + z += dxyz_dabc[8]; + } else { + z += dxyz_dabc[8]; + } + } + + if constexpr (is_non_orthogonal) { + recursion_factor_bc_pow_b *= exp_dbdc; + } else { + y += dxyz_dabc[4]; + } + } + + if constexpr (is_non_orthogonal) { + recursion_factor_ab_pow_a *= exp_dadb; + recursion_factor_ac_pow_a *= exp_dadc; + } else { + x += dxyz_dabc[0]; + } + } + + if (max_gaussian_value < threshold && is_valid_pair) { + sorted_pairs_per_local_grid[i_pair_index] = -1; + atomicAdd(n_pairs_per_local_grid + block_index, -1); + } + } +} + +#define tailor_gaussian_pairs_kernel_macro(li, lj) \ + tailor_gaussian_pairs_kernel \ + <<>>( \ + sorted_pairs_per_local_grid, n_pairs_per_local_grid, \ + non_trivial_pairs, i_shells, j_shells, n_j_shells, \ + shell_to_ao_indices, accumulated_n_pairs_per_local_grid, \ + sorted_block_index, image_indices, vectors_to_neighboring_images, \ + n_images, mesh_a, mesh_b, mesh_c, atm, bas, env, threshold, \ + derivative_order); + +#define tailor_gaussian_pairs_kernel_case_macro(li, lj) \ + case (li * 10 + lj): \ + tailor_gaussian_pairs_kernel_macro(li, lj); \ + break + +template +int tailor_gaussian_pairs_driver( + int *sorted_pairs_per_local_grid, int *n_pairs_per_local_grid, + const int i_angular, const int j_angular, const int *non_trivial_pairs, + const int *i_shells, const int *j_shells, const int n_j_shells, + const int *shell_to_ao_indices, + const int *accumulated_n_pairs_per_local_grid, + const int *sorted_block_index, const int n_contributing_blocks, + const int *image_indices, const double *vectors_to_neighboring_images, + const int n_images, const int *mesh, const int *atm, const int *bas, + const double *env, const double threshold, const int derivative_order) { + dim3 block_size(BLOCK_DIM_XYZ, BLOCK_DIM_XYZ, BLOCK_DIM_XYZ); + int mesh_a = mesh[0]; + int mesh_b = mesh[1]; + int mesh_c = mesh[2]; + dim3 block_grid(n_contributing_blocks, 1, 1); + switch (i_angular * 10 + j_angular) { + tailor_gaussian_pairs_kernel_case_macro(0, 0); + tailor_gaussian_pairs_kernel_case_macro(0, 1); + tailor_gaussian_pairs_kernel_case_macro(0, 2); + tailor_gaussian_pairs_kernel_case_macro(0, 3); + tailor_gaussian_pairs_kernel_case_macro(0, 4); + tailor_gaussian_pairs_kernel_case_macro(1, 0); + tailor_gaussian_pairs_kernel_case_macro(1, 1); + tailor_gaussian_pairs_kernel_case_macro(1, 2); + tailor_gaussian_pairs_kernel_case_macro(1, 3); + tailor_gaussian_pairs_kernel_case_macro(1, 4); + tailor_gaussian_pairs_kernel_case_macro(2, 0); + tailor_gaussian_pairs_kernel_case_macro(2, 1); + tailor_gaussian_pairs_kernel_case_macro(2, 2); + tailor_gaussian_pairs_kernel_case_macro(2, 3); + tailor_gaussian_pairs_kernel_case_macro(2, 4); + tailor_gaussian_pairs_kernel_case_macro(3, 0); + tailor_gaussian_pairs_kernel_case_macro(3, 1); + tailor_gaussian_pairs_kernel_case_macro(3, 2); + tailor_gaussian_pairs_kernel_case_macro(3, 3); + tailor_gaussian_pairs_kernel_case_macro(3, 4); + tailor_gaussian_pairs_kernel_case_macro(4, 0); + tailor_gaussian_pairs_kernel_case_macro(4, 1); + tailor_gaussian_pairs_kernel_case_macro(4, 2); + tailor_gaussian_pairs_kernel_case_macro(4, 3); + tailor_gaussian_pairs_kernel_case_macro(4, 4); + default: + fprintf(stderr, + "angular momentum pair %d, %d is not supported in " + "evaluate_density_driver\n", + i_angular, j_angular); + return 1; + } + + return checkCudaErrors(cudaPeekAtLastError()); +} + } // namespace gpu4pyscf::gpbc::multi_grid diff --git a/gpu4pyscf/lib/multigrid/multigrid_v2/utils.cuh b/gpu4pyscf/lib/multigrid/multigrid_v2/utils.cuh index cd774255d..a878d76dd 100644 --- a/gpu4pyscf/lib/multigrid/multigrid_v2/utils.cuh +++ b/gpu4pyscf/lib/multigrid/multigrid_v2/utils.cuh @@ -26,6 +26,29 @@ __host__ __device__ T distance_squared(const T x, const T y, const T z) { return x * x + y * y + z * z; } +template +__host__ __device__ T approximate_polynomial_value(const double r_i_squared, + const double r_j_squared, + const double r_p_squared, + const int derivative_order) { + T result = + pow(r_i_squared, i_angular / 2.0) * pow(r_j_squared, j_angular / 2.0); + + if (derivative_order == 1) { + result *= -2.0 * sqrt(r_p_squared); + if constexpr (i_angular > 0) { + result += i_angular * pow(r_i_squared, (i_angular - 2) / 2.0) * + pow(r_j_squared, j_angular / 2.0); + } + + if constexpr (j_angular > 0) { + result += j_angular * pow(r_i_squared, i_angular / 2.0) * + pow(r_j_squared, (j_angular - 2) / 2.0); + } + } + return result; +} + template __device__ constexpr T common_fac_sp() { if constexpr (ANG == 0) { return 0.282094791773878143; diff --git a/gpu4pyscf/pbc/dft/multigrid_v2.py b/gpu4pyscf/pbc/dft/multigrid_v2.py index e09b4963a..32d49fc15 100644 --- a/gpu4pyscf/pbc/dft/multigrid_v2.py +++ b/gpu4pyscf/pbc/dft/multigrid_v2.py @@ -36,7 +36,7 @@ import gpu4pyscf.pbc.dft.multigrid as multigrid_v1 from gpu4pyscf.lib.cupy_helper import contract, tag_array, load_library -__all__ = ['MultiGridNumInt'] +__all__ = ["MultiGridNumInt"] libgpbc = load_library("libmgrid_v2") libgpbc.evaluate_density_driver.restype = ctypes.c_int @@ -76,19 +76,23 @@ def ifft_in_place(x): def unique_with_sort(x): # This function does the same thing as cp.unique(x, return_inverse=True). # It's not super optimized, but for whatever reason, cp.unique is very slow, so this one is better. - assert type(x) is cp.ndarray and (x.dtype == cp.int32 or x.dtype == cp.int64) and x.ndim == 1 + assert ( + type(x) is cp.ndarray + and (x.dtype == cp.int32 or x.dtype == cp.int64) + and x.ndim == 1 + ) n = x.shape[0] if n <= 1: return x, cp.zeros(n) sort_index = cp.argsort(x) - inverse_sort = cp.empty(n, dtype = cp.int64) - inverse_sort[sort_index] = cp.arange(0, n, dtype = cp.int64) + inverse_sort = cp.empty(n, dtype=cp.int64) + inverse_sort[sort_index] = cp.arange(0, n, dtype=cp.int64) x = x[sort_index] mask = cp.empty(n, dtype=cp.bool_) mask[0] = True - mask[1:] = (x[1:] != x[:-1]) + mask[1:] = x[1:] != x[:-1] x = x[mask] inverse_unique = cp.cumsum(mask, dtype=cp.int64) - 1 @@ -106,24 +110,29 @@ def image_pair_to_difference( translation_vectors = cp.asarray( cp.linalg.solve(lattice_vectors.T, vectors_to_neighboring_images.T).T, ) - translation_vectors = cp.asarray(cp.round(translation_vectors), dtype = cp.int32) + translation_vectors = cp.asarray(cp.round(translation_vectors), dtype=cp.int32) image_difference_full = ( # k_j - k_i corresponding to - translation_vectors[None,:,:] - translation_vectors[:,None,:] + translation_vectors[None, :, :] + - translation_vectors[:, None, :] ).reshape(-1, 3) max_offset = cp.max(cp.abs(image_difference_full)) + 1 - assert (max_offset * 2)**3 < np.iinfo(np.int32).max + assert (max_offset * 2) ** 3 < np.iinfo(np.int32).max image_difference_3in1 = image_difference_full + max_offset - image_difference_3in1 = image_difference_3in1[:, 0] * (max_offset * 2)**2 \ - + image_difference_3in1[:, 1] * (max_offset * 2) \ - + image_difference_3in1[:, 2] + image_difference_3in1 = ( + image_difference_3in1[:, 0] * (max_offset * 2) ** 2 + + image_difference_3in1[:, 1] * (max_offset * 2) + + image_difference_3in1[:, 2] + ) image_difference_3in1, inverse = unique_with_sort(image_difference_3in1) - translation_vectors = cp.empty([image_difference_3in1.shape[0], 3], dtype = cp.int32) - translation_vectors[:, 0] = image_difference_3in1 // (max_offset * 2)**2 - translation_vectors[:, 1] = (image_difference_3in1 % (max_offset * 2)**2) // (max_offset * 2) + translation_vectors = cp.empty([image_difference_3in1.shape[0], 3], dtype=cp.int32) + translation_vectors[:, 0] = image_difference_3in1 // (max_offset * 2) ** 2 + translation_vectors[:, 1] = (image_difference_3in1 % (max_offset * 2) ** 2) // ( + max_offset * 2 + ) translation_vectors[:, 2] = image_difference_3in1 % (max_offset * 2) translation_vectors -= max_offset @@ -138,6 +147,7 @@ def image_pair_to_difference( # where R1 is associated with the first orbital in a pair, and R2 associated to the second. return cp.asarray(difference_images), cp.asarray(inverse, dtype=cp.int32) + def image_phase_for_kpts(cell, neighboring_images, kpts=None): n_images = len(neighboring_images) if kpts is None or is_gamma_point(kpts): @@ -154,6 +164,7 @@ def image_phase_for_kpts(cell, neighboring_images, kpts=None): ) return phase_diff_among_images, image_pair_difference_index + def count_non_trivial_pairs( i_angular, j_angular, @@ -187,7 +198,9 @@ def count_non_trivial_pairs( ctypes.c_double(threshold_in_log), ) if err != 0: - raise RuntimeError(f'count_non_trivial_pairs for li={i_angular} lj={j_angular} failed') + raise RuntimeError( + f"count_non_trivial_pairs for li={i_angular} lj={j_angular} failed" + ) return int(n_pairs[0]) @@ -243,7 +256,9 @@ def screen_gaussian_pairs( ctypes.c_double(threshold_in_log), ) if err != 0: - raise RuntimeError(f'screen_gaussian_pairs for li={i_angular} lj={j_angular} failed') + raise RuntimeError( + f"screen_gaussian_pairs for li={i_angular} lj={j_angular} failed" + ) return ( screened_shell_pairs, image_indices, @@ -257,20 +272,26 @@ def assign_pairs_to_blocks( pairs_to_blocks_end, n_blocks_abc, n_indices, + i_angular, + j_angular, non_trivial_pairs, i_shells, j_shells, + shell_to_ao_indices, image_indices, vectors_to_neighboring_images, mesh, atm, bas, env, - has_warned_instability + has_warned_instability, + is_non_orthogonal, + threshold, + derivative_order, ): n_blocks = np.prod(n_blocks_abc) n_pairs_on_blocks = cp.zeros(n_blocks + 1, dtype=cp.int32) - n_unstable_pairs_on_blocks = cp.zeros(n_blocks + 1, dtype = cp.int32) + n_unstable_pairs_on_blocks = cp.zeros(n_blocks + 1, dtype=cp.int32) err = libgpbc.count_pairs_on_blocks( cast_to_pointer(n_pairs_on_blocks), cast_to_pointer(n_unstable_pairs_on_blocks), @@ -288,16 +309,17 @@ def assign_pairs_to_blocks( cast_to_pointer(mesh), cast_to_pointer(atm), cast_to_pointer(bas), - cast_to_pointer(env) + cast_to_pointer(env), ) - has_unstable_pairs = (n_unstable_pairs_on_blocks[-1] > 0) + has_unstable_pairs = n_unstable_pairs_on_blocks[-1] > 0 if not has_warned_instability and has_unstable_pairs: - warnings.warn("Numerical instability may occur due to presence of core electrons or insufficient ke_cutoff.") + warnings.warn( + "Numerical instability may occur due to presence of core electrons or insufficient ke_cutoff." + ) has_warned_instability = True - if err != 0: - raise RuntimeError('count_pairs_on_blocks failed') + raise RuntimeError("count_pairs_on_blocks failed") n_contributing_blocks = int(n_pairs_on_blocks[-1]) n_pairs_on_blocks = n_pairs_on_blocks[:-1] @@ -325,14 +347,45 @@ def assign_pairs_to_blocks( cast_to_pointer(mesh), cast_to_pointer(atm), cast_to_pointer(bas), - cast_to_pointer(env) + cast_to_pointer(env), + ) + + libgpbc.tailor_gaussian_pairs( + cast_to_pointer(pairs_on_blocks), + cast_to_pointer(n_pairs_on_blocks), + ctypes.c_int(i_angular), + ctypes.c_int(j_angular), + cast_to_pointer(non_trivial_pairs), + cast_to_pointer(i_shells), + cast_to_pointer(j_shells), + ctypes.c_int(len(j_shells)), + cast_to_pointer(shell_to_ao_indices), + cast_to_pointer(accumulated_n_pairs_per_block), + cast_to_pointer(sorted_block_index), + ctypes.c_int(n_contributing_blocks), + cast_to_pointer(image_indices), + cast_to_pointer(vectors_to_neighboring_images), + ctypes.c_int(len(vectors_to_neighboring_images)), + cast_to_pointer(mesh), + cast_to_pointer(atm), + cast_to_pointer(bas), + cast_to_pointer(env), + ctypes.c_int(is_non_orthogonal), + ctypes.c_double(threshold), + ctypes.c_int(derivative_order), ) + pairs_on_blocks = pairs_on_blocks[pairs_on_blocks >= 0] + sorted_block_index = cp.asarray(cp.argsort(-n_pairs_on_blocks), dtype=cp.int32) + n_contributing_blocks = cp.count_nonzero(n_pairs_on_blocks) + accumulated_n_pairs_per_block[1:] = cp.cumsum(n_pairs_on_blocks, dtype=cp.int32) + sorted_block_index = sorted_block_index[:n_contributing_blocks] + return ( pairs_on_blocks, accumulated_n_pairs_per_block, sorted_block_index, - has_warned_instability + has_warned_instability, ) @@ -355,7 +408,7 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"): libgpbc.update_lattice_vectors( lattice_vectors.ctypes, reciprocal_lattice_vectors.ctypes, - reciprocal_norms.ctypes + reciprocal_norms.ctypes, ) tasks = getattr(mydf, "tasks", None) @@ -366,6 +419,13 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"): t0 = log.timer("task generation", *t0) t1 = t0 pairs = [] + + derivative_order = 0 + if xc_type == "GGA": + derivative_order = 1 + if xc_type == "mGGA": + raise NotImplementedError("mGGA is not supported yet in multigrid_v2.") + for grids_localized, grids_diffused in tasks: subcell_in_localized_region = grids_localized.cell # the original grids_localized.mesh has dtype=np.int64, which can cause @@ -374,14 +434,14 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"): fft_grid = list( map( - lambda n_mesh_points: cp.round(cp.fft.fftfreq( - n_mesh_points, 1.0 / n_mesh_points - )).astype(cp.int32), + lambda n_mesh_points: cp.round( + cp.fft.fftfreq(n_mesh_points, 1.0 / n_mesh_points) + ).astype(cp.int32), mesh, ) ) - dxyz_dabc = lattice_vectors / mesh[:,None] + dxyz_dabc = lattice_vectors / mesh[:, None] libgpbc.update_dxyz_dabc(dxyz_dabc.ctypes) n_blocks_abc = np.asarray(np.ceil(mesh / block_size), dtype=cp.int32) equivalent_cell_in_localized, coeff_in_localized = ( @@ -421,8 +481,8 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"): concatenated_coeff = cp.asarray(concatenated_coeff) n_primitive_gtos_in_two_regions = multigrid._pgto_shells(grouped_cell) - rad = vol**(-1./3) * cell.rcut + 1 - surface = 4*np.pi * rad**2 + rad = vol ** (-1.0 / 3) * cell.rcut + 1 + surface = 4 * np.pi * rad**2 lattice_sum_factor = surface precision = cell.precision / lattice_sum_factor threshold_in_log = np.log(precision) @@ -497,27 +557,33 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"): gaussian_pair_indices, accumulated_counts, sorted_contributing_blocks, - has_warned_instability + has_warned_instability, ) = assign_pairs_to_blocks( pairs_to_blocks_begin, pairs_to_blocks_end, n_blocks_abc, n_indices, + i_angular, + j_angular, screened_shell_pairs, i_shells, j_shells, + shell_to_ao_indices, image_indices, vectors_to_neighboring_images, mesh, atm, bas, env, - has_warned_instability + has_warned_instability, + is_non_orthogonal, + cell.precision, + derivative_order, ) t1 = log.timer_debug2( "assigning pairs to blocks in angular pair" + str((i_angular, j_angular)), - *t1 + *t1, ) per_angular_pairs.append( { @@ -559,7 +625,9 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"): return mydf -def evaluate_density_wrapper(pairs_info, dm_slice, img_phase, ignore_imag=True, with_tau=False): +def evaluate_density_wrapper( + pairs_info, dm_slice, img_phase, ignore_imag=True, with_tau=False +): if with_tau: c_driver = libgpbc.evaluate_density_tau_driver else: @@ -600,9 +668,19 @@ def evaluate_density_wrapper(pairs_info, dm_slice, img_phase, ignore_imag=True, assert density_matrix_with_translation_real_part.size < np.iinfo(np.int32).max if with_tau: - density = cp.zeros((n_channels, 2, ) + tuple(pairs_info["mesh"]), dtype=density_matrix_with_translation_real_part.dtype) + density = cp.zeros( + ( + n_channels, + 2, + ) + + tuple(pairs_info["mesh"]), + dtype=density_matrix_with_translation_real_part.dtype, + ) else: - density = cp.zeros((n_channels,) + tuple(pairs_info["mesh"]), dtype=density_matrix_with_translation_real_part.dtype) + density = cp.zeros( + (n_channels,) + tuple(pairs_info["mesh"]), + dtype=density_matrix_with_translation_real_part.dtype, + ) for gaussians_per_angular_pair in pairs_info["per_angular_pairs"]: (i_angular, j_angular) = gaussians_per_angular_pair["angular"] @@ -637,11 +715,14 @@ def evaluate_density_wrapper(pairs_info, dm_slice, img_phase, ignore_imag=True, use_float_precision, ) if err != 0: - raise RuntimeError(f'evaluate_density_driver for li={i_angular} lj={j_angular} failed') + raise RuntimeError( + f"evaluate_density_driver for li={i_angular} lj={j_angular} failed" + ) return density -def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type='LDA'): + +def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"): dm_kpts = cp.asarray(dm_kpts, order="C") dms = _format_dms(dm_kpts, kpts) n_channels, n_k_points = dms.shape[:2] @@ -708,7 +789,7 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type='LDA'): img_phase = image_phase_for_kpts(cell, pairs["neighboring_images"], kpts) density = ( evaluate_density_wrapper( - pairs, coeff_sandwiched_density_matrix, img_phase, with_tau = with_tau + pairs, coeff_sandwiched_density_matrix, img_phase, with_tau=with_tau ) * weight_per_grid_point ) @@ -740,7 +821,7 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type='LDA'): ] += tau density_on_g_mesh = density_on_g_mesh.reshape([n_channels, density_slices, -1]) - if xc_type != 'LDA': + if xc_type != "LDA": density_on_g_mesh[:, 1:4] = pbc_tools._get_Gv(mydf.cell, mydf.mesh).T density_on_g_mesh[:, 1:4] *= density_on_g_mesh[:, :1] * 1j return density_on_g_mesh @@ -748,11 +829,13 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type='LDA'): def evaluate_xc_wrapper(pairs_info, xc_weights, img_phase, with_tau=False): if with_tau: - assert xc_weights.ndim == 3+2 and xc_weights.shape[1] == 2 + assert xc_weights.ndim == 3 + 2 and xc_weights.shape[1] == 2 n_channels = xc_weights.shape[0] # density_slices = 2 else: - assert (xc_weights.ndim == 3+2 and xc_weights.shape[1] == 1) or (xc_weights.ndim == 3+1) + assert (xc_weights.ndim == 3 + 2 and xc_weights.shape[1] == 1) or ( + xc_weights.ndim == 3 + 1 + ) n_channels = xc_weights.shape[0] # density_slices = 1 @@ -814,12 +897,12 @@ def evaluate_xc_wrapper(pairs_info, xc_weights, img_phase, with_tau=False): ) fock += fock_slice if err != 0: - raise RuntimeError(f'evaluate_xc_driver for li={i_angular} lj={j_angular} failed') + raise RuntimeError( + f"evaluate_xc_driver for li={i_angular} lj={j_angular} failed" + ) if not (n_k_points == 1 and n_difference_images == 1): - return cp.einsum( - "kt, ntij -> nkij", phase_diff_among_images, fock - ) + return cp.einsum("kt, ntij -> nkij", phase_diff_among_images, fock) else: return fock @@ -888,7 +971,9 @@ def convert_xc_on_g_mesh_to_fock( n_ao_in_localized = len(pairs["ao_indices_in_localized"]) libgpbc.update_dxyz_dabc(pairs["dxyz_dabc"].ctypes) img_phase = image_phase_for_kpts(cell, pairs["neighboring_images"], kpts) - fock_slice = evaluate_xc_wrapper(pairs, interpolated_xc, img_phase, with_tau=with_tau) + fock_slice = evaluate_xc_wrapper( + pairs, interpolated_xc, img_phase, with_tau=with_tau + ) fock_slice = cp.einsum("nkpq,pi->nkiq", fock_slice, pairs["coeff_in_localized"]) fock_slice = cp.einsum("nkiq,qj->nkij", fock_slice, pairs["concatenated_coeff"]) @@ -929,14 +1014,22 @@ def convert_xc_on_g_mesh_to_fock( def evaluate_xc_gradient_wrapper( - gradient, pairs_info, xc_weights, dm_slice, img_phase, ignore_imag=True, with_tau=False + gradient, + pairs_info, + xc_weights, + dm_slice, + img_phase, + ignore_imag=True, + with_tau=False, ): if with_tau: - assert xc_weights.ndim == 3+2 and xc_weights.shape[1] == 2 + assert xc_weights.ndim == 3 + 2 and xc_weights.shape[1] == 2 n_channels = xc_weights.shape[0] # density_slices = 2 else: - assert (xc_weights.ndim == 3+2 and xc_weights.shape[1] == 1) or (xc_weights.ndim == 3+1) + assert (xc_weights.ndim == 3 + 2 and xc_weights.shape[1] == 1) or ( + xc_weights.ndim == 3 + 1 + ) n_channels = xc_weights.shape[0] # density_slices = 1 @@ -1006,7 +1099,9 @@ def evaluate_xc_gradient_wrapper( use_float_precision, ) if err != 0: - raise RuntimeError(f'evaluate_xc_gradient_driver for li={i_angular} lj={j_angular} failed') + raise RuntimeError( + f"evaluate_xc_gradient_driver for li={i_angular} lj={j_angular} failed" + ) def convert_xc_on_g_mesh_to_fock_gradient( @@ -1088,7 +1183,8 @@ def convert_xc_on_g_mesh_to_fock_gradient( return gradient -#FIXME: merge to multigrid_v1.get_pp + +# FIXME: merge to multigrid_v1.get_pp def get_nuc(ni, kpts=None): if ni.sorted_gaussian_pairs is None: ni.build() @@ -1102,7 +1198,8 @@ def get_nuc(ni, kpts=None): vne = vne[0] return vne -#FIXME: merge to multigrid_v1.get_pp + +# FIXME: merge to multigrid_v1.get_pp def get_pp(ni, kpts=None): """Get the periodic pseudopotential nuc-el AO matrix, with G=0 removed.""" if ni.sorted_gaussian_pairs is None: @@ -1132,8 +1229,9 @@ def get_pp(ni, kpts=None): log.timer("get_pp", *t0) return vpp + def get_j_kpts(ni, dm_kpts, hermi=1, kpts=None, kpts_band=None): - '''Get the Coulomb (J) AO matrix at sampled k-points. + """Get the Coulomb (J) AO matrix at sampled k-points. Args: dm_kpts : (nkpts, nao, nao) ndarray or a list of (nkpts,nao,nao) ndarray @@ -1149,7 +1247,7 @@ def get_j_kpts(ni, dm_kpts, hermi=1, kpts=None, kpts_band=None): Returns: vj : (nkpts, nao, nao) ndarray or list of vj if the input dm_kpts is a list of DMs - ''' + """ if kpts is None: kpts = np.zeros((1, 3)) cell = ni.cell @@ -1165,9 +1263,7 @@ def get_j_kpts(ni, dm_kpts, hermi=1, kpts=None, kpts_band=None): Gv = pbc_tools._get_Gv(cell, mesh) coulomb_kernel_on_g_mesh = pbc_tools.get_coulG(cell, Gv=Gv) - coulomb_on_g_mesh = cp.einsum( - "ng, g -> g", density[:, 0], coulomb_kernel_on_g_mesh - ) + coulomb_on_g_mesh = cp.einsum("ng, g -> g", density[:, 0], coulomb_kernel_on_g_mesh) weight = cell.vol / ngrids density = density.reshape(-1, *mesh) @@ -1176,16 +1272,28 @@ def get_j_kpts(ni, dm_kpts, hermi=1, kpts=None, kpts_band=None): density = ifft_in_place(density).real.reshape(nset, -1, ngrids) density /= weight - #if kpts_band is not None: + # if kpts_band is not None: # ni = ni.copy().reset().build() kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band xc_for_fock = convert_xc_on_g_mesh_to_fock(ni, coulomb_on_g_mesh, hermi, kpts_band) t0 = log.timer("vj", *t0) return _format_jks(xc_for_fock, dm_kpts, input_band, kpts) -def nr_rks(ni, cell, grids, xc_code, dm_kpts, relativity=0, hermi=1, - kpts=None, kpts_band=None, with_j=False, verbose=None): - '''Compute the XC energy and RKS XC matrix at sampled k-points. + +def nr_rks( + ni, + cell, + grids, + xc_code, + dm_kpts, + relativity=0, + hermi=1, + kpts=None, + kpts_band=None, + with_j=False, + verbose=None, +): + """Compute the XC energy and RKS XC matrix at sampled k-points. multigrid version of function pbc.dft.numint.nr_rks. Args: @@ -1204,7 +1312,7 @@ def nr_rks(ni, cell, grids, xc_code, dm_kpts, relativity=0, hermi=1, nelec : number of electrons obtained from the numerical integration veff : (nkpts, nao, nao) ndarray or list of veff if the input dm_kpts is a list of DMs - ''' + """ cell = ni.cell log = logger.new_logger(cell, verbose) t0 = log.init_timer() @@ -1239,12 +1347,12 @@ def nr_rks(ni, cell, grids, xc_code, dm_kpts, relativity=0, hermi=1, density /= weight # eval_xc_eff supports float64 only - density = cp.asarray(density, dtype=np.float64, order='C') + density = cp.asarray(density, dtype=np.float64, order="C") if xc_type == "LDA": xc_for_energy, xc_for_fock = ni.eval_xc_eff( xc_code, density[0], deriv=1, xctype=xc_type )[:2] - elif xc_type == 'GGA' or xc_type == 'MGGA': + elif xc_type == "GGA" or xc_type == "MGGA": xc_for_energy, xc_for_fock = ni.eval_xc_eff( xc_code, density, deriv=1, xctype=xc_type )[:2] @@ -1270,10 +1378,13 @@ def nr_rks(ni, cell, grids, xc_code, dm_kpts, relativity=0, hermi=1, xc_for_fock = xc_for_fock.reshape((-1, ngrids)) elif xc_type == "MGGA": xc_for_fock[0] -= contract("gp, pg -> p", xc_for_fock[1:4], Gv) * 1j - xc_for_fock = cp.concatenate([ - xc_for_fock[0].reshape((-1, ngrids)), - xc_for_fock[4].reshape((-1, ngrids)), - ], axis = 0) + xc_for_fock = cp.concatenate( + [ + xc_for_fock[0].reshape((-1, ngrids)), + xc_for_fock[4].reshape((-1, ngrids)), + ], + axis=0, + ) else: raise ValueError(f"Incorrect xc_type = {xc_type}") @@ -1281,17 +1392,31 @@ def nr_rks(ni, cell, grids, xc_code, dm_kpts, relativity=0, hermi=1, xc_for_fock[0] += coulomb_on_g_mesh kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band - veff = convert_xc_on_g_mesh_to_fock(ni, xc_for_fock, hermi, kpts_band, with_tau = (xc_type == "MGGA")) + veff = convert_xc_on_g_mesh_to_fock( + ni, xc_for_fock, hermi, kpts_band, with_tau=(xc_type == "MGGA") + ) veff = _format_jks(veff, dm_kpts, input_band, kpts) veff = tag_array(veff, ecoul=coulomb_energy, exc=xc_energy_sum, vj=None, vk=None) t0 = log.timer("xc", *t0) return n_electrons, xc_energy_sum, veff + # Note nr_uks handles only one set of KUKS density matrices (alpha, beta) in # each call (nr_rks supports multiple sets of KRKS density matrices) -def nr_uks(ni, cell, grids, xc_code, dm_kpts, relativity=0, hermi=1, - kpts=None, kpts_band=None, with_j=False, verbose=None): - '''Compute the XC energy and UKS XC matrix at sampled k-points. +def nr_uks( + ni, + cell, + grids, + xc_code, + dm_kpts, + relativity=0, + hermi=1, + kpts=None, + kpts_band=None, + with_j=False, + verbose=None, +): + """Compute the XC energy and UKS XC matrix at sampled k-points. multigrid version of function pbc.dft.numint.nr_rks. Args: @@ -1310,7 +1435,7 @@ def nr_uks(ni, cell, grids, xc_code, dm_kpts, relativity=0, hermi=1, nelec : number of electrons obtained from the numerical integration veff : (nkpts, nao, nao) ndarray or list of veff if the input dm_kpts is a list of DMs - ''' + """ cell = ni.cell log = logger.new_logger(cell, verbose) t0 = log.init_timer() @@ -1346,12 +1471,12 @@ def nr_uks(ni, cell, grids, xc_code, dm_kpts, relativity=0, hermi=1, density /= weight # eval_xc_eff supports float64 only - density = cp.asarray(density, dtype=np.float64, order='C') + density = cp.asarray(density, dtype=np.float64, order="C") if xc_type == "LDA": xc_for_energy, xc_for_fock = ni.eval_xc_eff( - xc_code, density[:,0], deriv=1, xctype=xc_type + xc_code, density[:, 0], deriv=1, xctype=xc_type )[:2] - elif xc_type == 'GGA' or xc_type == 'MGGA': + elif xc_type == "GGA" or xc_type == "MGGA": xc_for_energy, xc_for_fock = ni.eval_xc_eff( xc_code, density, deriv=1, xctype=xc_type )[:2] @@ -1377,10 +1502,13 @@ def nr_uks(ni, cell, grids, xc_code, dm_kpts, relativity=0, hermi=1, xc_for_fock = xc_for_fock.reshape((nset, -1, ngrids)) elif xc_type == "MGGA": xc_for_fock[:, 0] -= contract("ngp, pg -> np", xc_for_fock[:, 1:4], Gv) * 1j - xc_for_fock = cp.concatenate([ - xc_for_fock[:, 0].reshape((nset, -1, ngrids)), - xc_for_fock[:, 4].reshape((nset, -1, ngrids)), - ], axis = 1) + xc_for_fock = cp.concatenate( + [ + xc_for_fock[:, 0].reshape((nset, -1, ngrids)), + xc_for_fock[:, 4].reshape((nset, -1, ngrids)), + ], + axis=1, + ) else: raise ValueError(f"Incorrect xc_type = {xc_type}") @@ -1388,15 +1516,17 @@ def nr_uks(ni, cell, grids, xc_code, dm_kpts, relativity=0, hermi=1, xc_for_fock[:, 0] += coulomb_on_g_mesh kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band - veff = convert_xc_on_g_mesh_to_fock(ni, xc_for_fock, hermi, kpts_band, with_tau = (xc_type == "MGGA")) + veff = convert_xc_on_g_mesh_to_fock( + ni, xc_for_fock, hermi, kpts_band, with_tau=(xc_type == "MGGA") + ) veff = _format_jks(veff, dm_kpts, input_band, kpts) veff = tag_array(veff, ecoul=coulomb_energy, exc=xc_energy_sum, vj=None, vk=None) t0 = log.timer("xc", *t0) return n_electrons, xc_energy_sum, veff + def get_rho(ni, dm, kpts=None): - '''Density in real space - ''' + """Density in real space""" cell = ni.cell mesh = ni.mesh ngrids = np.prod(mesh) @@ -1407,6 +1537,7 @@ def get_rho(ni, dm, kpts=None): rhoR = ifft_in_place(density.reshape(-1, *mesh)).real / weight return rhoR.reshape(-1, ngrids) + def get_veff_ip1( ni, xc_code, @@ -1436,9 +1567,7 @@ def get_veff_ip1( Gv = pbc_tools._get_Gv(cell, mesh) coulomb_kernel_on_g_mesh = pbc_tools.get_coulG(cell, Gv=Gv) - coulomb_on_g_mesh = cp.einsum( - "ng, g -> g", density[:, 0], coulomb_kernel_on_g_mesh - ) + coulomb_on_g_mesh = cp.einsum("ng, g -> g", density[:, 0], coulomb_kernel_on_g_mesh) weight = cell.vol / ngrids @@ -1453,13 +1582,9 @@ def get_veff_ip1( ) if nset == 1: - xc_for_fock = ni.eval_xc_eff( - xc_code, density[0], deriv=1, xctype=xc_type - )[1] + xc_for_fock = ni.eval_xc_eff(xc_code, density[0], deriv=1, xctype=xc_type)[1] else: - xc_for_fock = ni.eval_xc_eff( - xc_code, density, deriv=1, xctype=xc_type - )[1] + xc_for_fock = ni.eval_xc_eff(xc_code, density, deriv=1, xctype=xc_type)[1] xc_for_fock = xc_for_fock.reshape(nset, -1, *mesh) * weight xc_for_fock = fft_in_place(xc_for_fock).reshape(nset, -1, ngrids) @@ -1473,10 +1598,13 @@ def get_veff_ip1( xc_for_fock = xc_for_fock.reshape((nset, -1, ngrids)) elif xc_type == "MGGA": xc_for_fock[:, 0] -= contract("ngp, pg -> np", xc_for_fock[:, 1:4], Gv) * 1j - xc_for_fock = cp.concatenate([ - xc_for_fock[:, 0].reshape((nset, -1, ngrids)), - xc_for_fock[:, 4].reshape((nset, -1, ngrids)), - ], axis = 1) + xc_for_fock = cp.concatenate( + [ + xc_for_fock[:, 0].reshape((nset, -1, ngrids)), + xc_for_fock[:, 4].reshape((nset, -1, ngrids)), + ], + axis=1, + ) else: raise ValueError(f"Incorrect xc_type = {xc_type}") @@ -1488,13 +1616,14 @@ def get_veff_ip1( xc_for_fock[:, 0] += multigrid_v1.eval_vpplocG_part1(cell, mesh) veff_gradient = convert_xc_on_g_mesh_to_fock_gradient( - ni, xc_for_fock, dm_kpts, hermi, kpts_band, with_tau = (xc_type == "MGGA") + ni, xc_for_fock, dm_kpts, hermi, kpts_band, with_tau=(xc_type == "MGGA") ) t0 = log.timer("veff_gradient", *t0) return veff_gradient + class MultiGridNumInt(lib.StreamObject, numint.LibXCMixin): def __init__(self, cell): self.cell = cell @@ -1511,8 +1640,7 @@ def reset(self, cell=None): self.sorted_gaussian_pairs = None return self - def get_j(self, dm, hermi=1, kpts=None, kpts_band=None, - omega=None, exxdiv='ewald'): + def get_j(self, dm, hermi=1, kpts=None, kpts_band=None, omega=None, exxdiv="ewald"): if kpts is not None: raise NotImplementedError vj = get_j_kpts(self, dm, hermi, kpts, kpts_band) @@ -1524,7 +1652,7 @@ def get_j(self, dm, hermi=1, kpts=None, kpts_band=None, get_rho = get_rho nr_rks = nr_rks nr_uks = nr_uks - get_vxc = nr_vxc = NotImplemented #numint_cpu.KNumInt.nr_vxc + get_vxc = nr_vxc = NotImplemented # numint_cpu.KNumInt.nr_vxc eval_xc_eff = numint.eval_xc_eff _init_xcfuns = numint.NumInt._init_xcfuns @@ -1532,11 +1660,11 @@ def get_j(self, dm, hermi=1, kpts=None, kpts_band=None, nr_rks_fxc = NotImplemented nr_uks_fxc = NotImplemented nr_rks_fxc_st = NotImplemented - cache_xc_kernel = NotImplemented + cache_xc_kernel = NotImplemented cache_xc_kernel1 = NotImplemented to_gpu = utils.to_gpu device = utils.device def to_cpu(self): - raise RuntimeError('Not available') + raise RuntimeError("Not available") From 386cdf0eeefafc7914da6cf140d56054cfc1a2d9 Mon Sep 17 00:00:00 2001 From: Rui Li Date: Tue, 28 Oct 2025 16:51:21 -0700 Subject: [PATCH 03/14] add timer for individual kernel call --- gpu4pyscf/pbc/dft/multigrid_v2.py | 38 +++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/gpu4pyscf/pbc/dft/multigrid_v2.py b/gpu4pyscf/pbc/dft/multigrid_v2.py index 32d49fc15..464007854 100644 --- a/gpu4pyscf/pbc/dft/multigrid_v2.py +++ b/gpu4pyscf/pbc/dft/multigrid_v2.py @@ -626,7 +626,7 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"): def evaluate_density_wrapper( - pairs_info, dm_slice, img_phase, ignore_imag=True, with_tau=False + cell, pairs_info, dm_slice, img_phase, ignore_imag=True, with_tau=False ): if with_tau: c_driver = libgpbc.evaluate_density_tau_driver @@ -682,6 +682,9 @@ def evaluate_density_wrapper( dtype=density_matrix_with_translation_real_part.dtype, ) + log = logger.new_logger(cell, cell.verbose) + t0 = log.init_timer() + for gaussians_per_angular_pair in pairs_info["per_angular_pairs"]: (i_angular, j_angular) = gaussians_per_angular_pair["angular"] @@ -718,8 +721,9 @@ def evaluate_density_wrapper( raise RuntimeError( f"evaluate_density_driver for li={i_angular} lj={j_angular} failed" ) + t1 = log.timer_debug1("density kernel", *t0) - return density + return density, cp.cuda.get_elapsed_time(t0[-1], t1[-1]) def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"): @@ -746,6 +750,8 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"): density_on_g_mesh = cp.zeros( (n_channels, density_slices, nx, ny, nz), dtype=cp.complex128 ) + + kernel_time = 0 for pairs in mydf.sorted_gaussian_pairs: mesh = pairs["mesh"] @@ -787,13 +793,16 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"): libgpbc.update_dxyz_dabc(pairs["dxyz_dabc"].ctypes) img_phase = image_phase_for_kpts(cell, pairs["neighboring_images"], kpts) - density = ( + density, kernel_subroutine = ( evaluate_density_wrapper( - pairs, coeff_sandwiched_density_matrix, img_phase, with_tau=with_tau + cell, pairs, coeff_sandwiched_density_matrix, img_phase, with_tau=with_tau ) - * weight_per_grid_point ) + density *= weight_per_grid_point + + kernel_time += kernel_subroutine + if with_tau: assert density.shape[1] == 2 tau = density[:, 1] @@ -824,10 +833,11 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"): if xc_type != "LDA": density_on_g_mesh[:, 1:4] = pbc_tools._get_Gv(mydf.cell, mydf.mesh).T density_on_g_mesh[:, 1:4] *= density_on_g_mesh[:, :1] * 1j + print("density kernel: ", kernel_time, "ms") return density_on_g_mesh -def evaluate_xc_wrapper(pairs_info, xc_weights, img_phase, with_tau=False): +def evaluate_xc_wrapper(cell, pairs_info, xc_weights, img_phase, with_tau=False): if with_tau: assert xc_weights.ndim == 3 + 2 and xc_weights.shape[1] == 2 n_channels = xc_weights.shape[0] @@ -860,6 +870,10 @@ def evaluate_xc_wrapper(pairs_info, xc_weights, img_phase, with_tau=False): assert xc_weights.dtype == cp.float64 use_float_precision = ctypes.c_int(0) + + log = logger.new_logger(cell, cell.verbose) + t0 = log.init_timer() + for gaussians_per_angular_pair in pairs_info["per_angular_pairs"]: fock_slice = cp.zeros( (n_channels, n_difference_images, n_i_functions, n_j_functions), @@ -901,10 +915,12 @@ def evaluate_xc_wrapper(pairs_info, xc_weights, img_phase, with_tau=False): f"evaluate_xc_driver for li={i_angular} lj={j_angular} failed" ) + t1 = log.timer_debug1("xc kernel", *t0) + if not (n_k_points == 1 and n_difference_images == 1): return cp.einsum("kt, ntij -> nkij", phase_diff_among_images, fock) else: - return fock + return fock, cp.cuda.get_elapsed_time(t0[-1], t1[-1]) def convert_xc_on_g_mesh_to_fock( @@ -957,6 +973,7 @@ def convert_xc_on_g_mesh_to_fock( data_type = complex_type(cp.float64) fock = cp.zeros((n_channels, n_k_points, nao, nao), dtype=data_type) + kernel_time = 0 for pairs in mydf.sorted_gaussian_pairs: interpolated_xc = xc_on_g_mesh[ @@ -971,9 +988,10 @@ def convert_xc_on_g_mesh_to_fock( n_ao_in_localized = len(pairs["ao_indices_in_localized"]) libgpbc.update_dxyz_dabc(pairs["dxyz_dabc"].ctypes) img_phase = image_phase_for_kpts(cell, pairs["neighboring_images"], kpts) - fock_slice = evaluate_xc_wrapper( - pairs, interpolated_xc, img_phase, with_tau=with_tau + fock_slice, kernel_subroutine = evaluate_xc_wrapper( + cell, pairs, interpolated_xc, img_phase, with_tau=with_tau ) + kernel_time += kernel_subroutine fock_slice = cp.einsum("nkpq,pi->nkiq", fock_slice, pairs["coeff_in_localized"]) fock_slice = cp.einsum("nkiq,qj->nkij", fock_slice, pairs["concatenated_coeff"]) @@ -1010,6 +1028,8 @@ def convert_xc_on_g_mesh_to_fock( else: raise NotImplementedError + + print("timing for xc kernel: ", kernel_time, "ms") return fock From c345a268b13c82b279d2a9c92941b384d31a15d0 Mon Sep 17 00:00:00 2001 From: Rui Li Date: Wed, 29 Oct 2025 00:14:55 -0700 Subject: [PATCH 04/14] add detailed timing --- gpu4pyscf/pbc/dft/multigrid_v2.py | 85 ++++++++++++++++++++++--------- 1 file changed, 61 insertions(+), 24 deletions(-) diff --git a/gpu4pyscf/pbc/dft/multigrid_v2.py b/gpu4pyscf/pbc/dft/multigrid_v2.py index 464007854..ce8acabe6 100644 --- a/gpu4pyscf/pbc/dft/multigrid_v2.py +++ b/gpu4pyscf/pbc/dft/multigrid_v2.py @@ -626,7 +626,7 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"): def evaluate_density_wrapper( - cell, pairs_info, dm_slice, img_phase, ignore_imag=True, with_tau=False + pairs_info, dm_slice, img_phase, ignore_imag=True, with_tau=False ): if with_tau: c_driver = libgpbc.evaluate_density_tau_driver @@ -682,9 +682,6 @@ def evaluate_density_wrapper( dtype=density_matrix_with_translation_real_part.dtype, ) - log = logger.new_logger(cell, cell.verbose) - t0 = log.init_timer() - for gaussians_per_angular_pair in pairs_info["per_angular_pairs"]: (i_angular, j_angular) = gaussians_per_angular_pair["angular"] @@ -721,9 +718,8 @@ def evaluate_density_wrapper( raise RuntimeError( f"evaluate_density_driver for li={i_angular} lj={j_angular} failed" ) - t1 = log.timer_debug1("density kernel", *t0) - return density, cp.cuda.get_elapsed_time(t0[-1], t1[-1]) + return density def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"): @@ -745,6 +741,7 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"): raise ValueError(f"Incorrect xc_type = {xc_type}") cell = mydf.cell + log = logger.new_logger(cell, cell.verbose) nx, ny, nz = mydf.mesh density_on_g_mesh = cp.zeros( @@ -752,6 +749,8 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"): ) kernel_time = 0 + fft_time = 0 + contraction_time = 0 for pairs in mydf.sorted_gaussian_pairs: mesh = pairs["mesh"] @@ -759,6 +758,7 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"): n_grid_points = np.prod(mesh) weight_per_grid_point = 1.0 / n_k_points * cell.vol / n_grid_points + density_matrix_with_rows_in_localized = dms[ :, :, @@ -773,7 +773,11 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"): pairs["ao_indices_in_localized"], ] + n_ao_in_localized = density_matrix_with_rows_in_diffused.shape[3] + + t0 = log.init_timer() + density_matrix_with_rows_in_localized[ :, :, :, n_ao_in_localized: ] += density_matrix_with_rows_in_diffused.transpose(0, 1, 3, 2).conj() @@ -790,18 +794,20 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"): pairs["concatenated_coeff"], ) + contraction_time += log.timer_silent(*t0)[-1] + t0 = log.init_timer() + libgpbc.update_dxyz_dabc(pairs["dxyz_dabc"].ctypes) img_phase = image_phase_for_kpts(cell, pairs["neighboring_images"], kpts) - density, kernel_subroutine = ( + density = ( evaluate_density_wrapper( - cell, pairs, coeff_sandwiched_density_matrix, img_phase, with_tau=with_tau - ) + pairs, coeff_sandwiched_density_matrix, img_phase, with_tau=with_tau + ) * weight_per_grid_point ) - density *= weight_per_grid_point - - kernel_time += kernel_subroutine + kernel_time += log.timer_silent(*t0)[-1] + t0 = log.init_timer() if with_tau: assert density.shape[1] == 2 @@ -810,6 +816,8 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"): density = fft_in_place(density) + fft_time += log.timer_silent(*t0)[-1] + density_on_g_mesh[ :, 0, @@ -834,10 +842,13 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"): density_on_g_mesh[:, 1:4] = pbc_tools._get_Gv(mydf.cell, mydf.mesh).T density_on_g_mesh[:, 1:4] *= density_on_g_mesh[:, :1] * 1j print("density kernel: ", kernel_time, "ms") + print("density fft: ", fft_time, "ms") + print("density contraction: ", contraction_time, "ms") + return density_on_g_mesh -def evaluate_xc_wrapper(cell, pairs_info, xc_weights, img_phase, with_tau=False): +def evaluate_xc_wrapper(pairs_info, xc_weights, img_phase, with_tau=False): if with_tau: assert xc_weights.ndim == 3 + 2 and xc_weights.shape[1] == 2 n_channels = xc_weights.shape[0] @@ -870,10 +881,6 @@ def evaluate_xc_wrapper(cell, pairs_info, xc_weights, img_phase, with_tau=False) assert xc_weights.dtype == cp.float64 use_float_precision = ctypes.c_int(0) - - log = logger.new_logger(cell, cell.verbose) - t0 = log.init_timer() - for gaussians_per_angular_pair in pairs_info["per_angular_pairs"]: fock_slice = cp.zeros( (n_channels, n_difference_images, n_i_functions, n_j_functions), @@ -915,13 +922,11 @@ def evaluate_xc_wrapper(cell, pairs_info, xc_weights, img_phase, with_tau=False) f"evaluate_xc_driver for li={i_angular} lj={j_angular} failed" ) - t1 = log.timer_debug1("xc kernel", *t0) if not (n_k_points == 1 and n_difference_images == 1): return cp.einsum("kt, ntij -> nkij", phase_diff_among_images, fock) else: - return fock, cp.cuda.get_elapsed_time(t0[-1], t1[-1]) - + return fock def convert_xc_on_g_mesh_to_fock( mydf, @@ -974,8 +979,13 @@ def convert_xc_on_g_mesh_to_fock( fock = cp.zeros((n_channels, n_k_points, nao, nao), dtype=data_type) kernel_time = 0 + fft_time = 0 + contraction_time = 0 + log = logger.new_logger(cell, cell.verbose) for pairs in mydf.sorted_gaussian_pairs: + + t0 = log.init_timer() interpolated_xc = xc_on_g_mesh[ :, :, @@ -984,14 +994,19 @@ def convert_xc_on_g_mesh_to_fock( pairs["fft_grid"][2], ] interpolated_xc = cp.asarray(ifft_in_place(interpolated_xc).real, order="C") + fft_time += log.timer_silent(*t0)[-1] + t0 = log.init_timer() n_ao_in_localized = len(pairs["ao_indices_in_localized"]) libgpbc.update_dxyz_dabc(pairs["dxyz_dabc"].ctypes) img_phase = image_phase_for_kpts(cell, pairs["neighboring_images"], kpts) - fock_slice, kernel_subroutine = evaluate_xc_wrapper( - cell, pairs, interpolated_xc, img_phase, with_tau=with_tau + fock_slice = evaluate_xc_wrapper( + pairs, interpolated_xc, img_phase, with_tau=with_tau ) - kernel_time += kernel_subroutine + + kernel_time += log.timer_silent(*t0)[-1] + t0 = log.init_timer() + fock_slice = cp.einsum("nkpq,pi->nkiq", fock_slice, pairs["coeff_in_localized"]) fock_slice = cp.einsum("nkiq,qj->nkij", fock_slice, pairs["concatenated_coeff"]) @@ -1016,6 +1031,9 @@ def convert_xc_on_g_mesh_to_fock( pairs["ao_indices_in_localized"][:, None], pairs["ao_indices_in_diffused"], ] += fock_slice[:, :, :, n_ao_in_localized:] + + contraction_time += log.timer_silent(*t0)[-1] + if hermi == 1: fock[ :, @@ -1029,7 +1047,10 @@ def convert_xc_on_g_mesh_to_fock( raise NotImplementedError - print("timing for xc kernel: ", kernel_time, "ms") + print("xc kernel: ", kernel_time, "ms") + print("xc fft: ", fft_time, "ms") + print("xc contraction: ", contraction_time, "ms") + return fock @@ -1350,11 +1371,18 @@ def nr_rks( mesh = ni.mesh ngrids = np.prod(mesh) + tq = log.init_timer() density = evaluate_density_on_g_mesh(ni, dm_kpts, kpts, xc_type) + print("evaluate_density_on_g_mesh: ", log.timer_silent(*tq)[-1], "ms") rho_sf = density[0, 0] + tq = log.init_timer() Gv = pbc_tools._get_Gv(cell, mesh) + print("get_Gv: ", log.timer_silent(*tq)[-1], "ms") + + tq = log.init_timer() coulomb_kernel_on_g_mesh = pbc_tools.get_coulG(cell, Gv=Gv) + print("get_coulG: ", log.timer_silent(*tq)[-1], "ms") coulomb_on_g_mesh = rho_sf * coulomb_kernel_on_g_mesh coulomb_energy = 0.5 * rho_sf.conj().dot(coulomb_on_g_mesh).real coulomb_energy /= cell.vol @@ -1368,6 +1396,7 @@ def nr_rks( # eval_xc_eff supports float64 only density = cp.asarray(density, dtype=np.float64, order="C") + tq = log.init_timer() if xc_type == "LDA": xc_for_energy, xc_for_fock = ni.eval_xc_eff( xc_code, density[0], deriv=1, xctype=xc_type @@ -1379,16 +1408,22 @@ def nr_rks( else: raise ValueError(f"Incorrect xc_type = {xc_type}") + print("libxc: ", log.timer_silent(*tq)[-1], "ms") + rho_sf = density[0].real xc_energy_sum = rho_sf.dot(xc_for_energy.ravel()).get()[()] * weight # To reduce the memory usage, we reuse the xc_for_fock name. # Now xc_for_fock represents xc on G space xc_for_fock *= weight + tq = log.init_timer() xc_for_fock = fft_in_place(xc_for_fock.reshape(-1, *mesh)).reshape(-1, ngrids) + print("fft for xc:", log.timer_silent(*tq)[-1], "ms") log.debug("Multigrid exc %s nelec %s", xc_energy_sum, n_electrons) + + tq = log.init_timer() if xc_type == "LDA": pass elif xc_type == "GGA": @@ -1408,6 +1443,8 @@ def nr_rks( else: raise ValueError(f"Incorrect xc_type = {xc_type}") + print("merge Gv: ", log.timer_silent(*tq)[-1], "ms") + if with_j: xc_for_fock[0] += coulomb_on_g_mesh From b9756943d5c2dc8c082fae9c67722304c6928f0b Mon Sep 17 00:00:00 2001 From: Rui Li Date: Mon, 24 Nov 2025 14:50:37 -0800 Subject: [PATCH 05/14] improve primitive <-> contracted basis conversion and ig, gi-> g contraction --- gpu4pyscf/pbc/dft/multigrid_v2.py | 408 +++++++++++++++++++++++++----- 1 file changed, 347 insertions(+), 61 deletions(-) diff --git a/gpu4pyscf/pbc/dft/multigrid_v2.py b/gpu4pyscf/pbc/dft/multigrid_v2.py index ce8acabe6..3d03b7cf2 100644 --- a/gpu4pyscf/pbc/dft/multigrid_v2.py +++ b/gpu4pyscf/pbc/dft/multigrid_v2.py @@ -73,6 +73,140 @@ def ifft_in_place(x): return fft.ifftn(x, axes=(-3, -2, -1), overwrite_x=True) +def iG_density(density, cell): + mesh = cell.mesh + b = cp.asarray(cell.reciprocal_vectors()) + + mesh_in_int32 = np.array(mesh, dtype=np.int32) + block_dim = (8, 8, 8) + grid_dim = tuple( + np.array(np.ceil(mesh_in_int32 / np.array(block_dim)), dtype=np.int32) + ) + + result = cp.zeros((3, *mesh), dtype=cp.complex128).reshape(3,-1) + + custom_kernel = cp.RawKernel( + r""" + #include + extern "C" __global__ + void kernel(complex * result, + const complex * density, + const double * reciprocal_lattice, + const int mesh_a, + const int mesh_b, + const int mesh_c) { + int a = blockIdx.x * blockDim.x + threadIdx.x; + int b = blockIdx.y * blockDim.y + threadIdx.y; + int c = blockIdx.z * blockDim.z + threadIdx.z; + + if(a >= mesh_a || b >= mesh_b || c >= mesh_c) { + return; + } + + const size_t grid_index = a * mesh_b * mesh_c + b * mesh_c + c; + result += grid_index; + const complex density_value = density[grid_index] * complex{0.0, 1.0}; + + if(a >= (mesh_a + 1) / 2) a -= mesh_a; + if(b >= (mesh_b + 1) / 2) b -= mesh_b; + if(c >= (mesh_c + 1) / 2) c -= mesh_c; + + int n_grid = mesh_a * mesh_b * mesh_c; + + *result = (a * reciprocal_lattice[0] + + b * reciprocal_lattice[3] + + c * reciprocal_lattice[6]) * density_value; + result += n_grid; + *result = (a * reciprocal_lattice[1] + + b * reciprocal_lattice[4] + + c * reciprocal_lattice[7]) * density_value; + result += n_grid; + *result = (a * reciprocal_lattice[2] + + b * reciprocal_lattice[5] + + c * reciprocal_lattice[8]) * density_value; + } + """, + "kernel", + ) + + custom_kernel( + grid_dim, + block_dim, + (result, density, b, mesh_in_int32[0], mesh_in_int32[1], mesh_in_int32[2]), + ) + + return result + +def contract_iG_potential(gga_potential, cell): + mesh = cell.mesh + b = cp.asarray(cell.reciprocal_vectors()) + + mesh_in_int32 = np.array(mesh, dtype=np.int32) + block_dim = (8, 8, 8) + grid_dim = tuple( + np.array(np.ceil(mesh_in_int32 / np.array(block_dim)), dtype=np.int32) + ) + + custom_kernel = cp.RawKernel( + r""" + #include + extern "C" __global__ + void kernel(complex * gga_potential, + const double * reciprocal_lattice, + const int mesh_a, + const int mesh_b, + const int mesh_c) { + int a = blockIdx.x * blockDim.x + threadIdx.x; + int b = blockIdx.y * blockDim.y + threadIdx.y; + int c = blockIdx.z * blockDim.z + threadIdx.z; + + if(a >= mesh_a || b >= mesh_b || c >= mesh_c) { + return; + } + + int n_grid = mesh_a * mesh_b * mesh_c; + + const size_t grid_index = a * mesh_b * mesh_c + b * mesh_c + c; + gga_potential += grid_index; + + if(a >= (mesh_a + 1) / 2) a -= mesh_a; + if(b >= (mesh_b + 1) / 2) b -= mesh_b; + if(c >= (mesh_c + 1) / 2) c -= mesh_c; + complex potential_change = 0; + potential_change += + (a * reciprocal_lattice[0] + + b * reciprocal_lattice[3] + + c * reciprocal_lattice[6]) * + complex{0.0, 1.0} * + gga_potential[n_grid]; + + potential_change += + (a * reciprocal_lattice[1] + + b * reciprocal_lattice[4] + + c * reciprocal_lattice[7]) * + complex{0.0, 1.0} * + gga_potential[2 * n_grid]; + + potential_change += + (a * reciprocal_lattice[2] + + b * reciprocal_lattice[5] + + c * reciprocal_lattice[8]) * + complex{0.0, 1.0} * + gga_potential[3 * n_grid]; + + *gga_potential -= potential_change; + } + """, + "kernel", + ) + + custom_kernel( + grid_dim, + block_dim, + (gga_potential, b, mesh_in_int32[0], mesh_in_int32[1], mesh_in_int32[2]), + ) + + def unique_with_sort(x): # This function does the same thing as cp.unique(x, return_inverse=True). # It's not super optimized, but for whatever reason, cp.unique is very slow, so this one is better. @@ -100,6 +234,152 @@ def unique_with_sort(x): return x, inverse_unique[inverse_sort] +def unique_with_multiple_keys(x): + # This function expands the previous function to handle multiple keys + # shaped as [ (1, 2), (3, -4), ....] + assert ( + type(x) is cp.ndarray + and (x.dtype == cp.int32 or x.dtype == cp.int64) + and x.ndim == 2 + ) + x = x.T + n = x.shape[-1] + + inverse_sort = cp.zeros(n, dtype=cp.int64) + if n <= 1: + return x, inverse_sort + + sort_index = cp.lexsort(x) + inverse_sort[sort_index] = cp.arange(0, n, dtype=cp.int64) + x = x[:, sort_index].T + + mask = cp.empty(n, dtype=cp.bool_) + mask[0] = True + mask[1:] = cp.any(x[1:] != x[:-1], axis=-1) + + x = x[mask] + inverse_unique = cp.cumsum(mask, dtype=cp.int64) - 1 + + return x, inverse_unique[inverse_sort] + + +def sort_contraction_coefficients(coeff): + contraction_shapes = cp.array([i.shape for i in coeff]) + unique_shapes, inverse = unique_with_multiple_keys(contraction_shapes) + unique_shapes = unique_shapes.get() + inverse = inverse.get() + + sliced_axis = np.zeros((len(coeff) + 1, 2), dtype=np.int32) + sliced_axis[1:] = np.cumsum( + np.array([i.shape for i in coeff]), axis=0, dtype=np.int32 + ) + + left_basis_function_indices = [ + cp.arange(begin, end) + for begin, end in zip(sliced_axis[:-1, 0], sliced_axis[1:, 0]) + ] + right_basis_function_indices = [ + cp.arange(begin, end) + for begin, end in zip(sliced_axis[:-1, 1], sliced_axis[1:, 1]) + ] + + sorted_coeffs = [ + {"shape": shape, "coeffs": [], "left_indices": [], "right_indices": []} + for shape in unique_shapes + ] + + for category, coeffs, left, right in zip( + inverse, coeff, left_basis_function_indices, right_basis_function_indices + ): + sorted_coeffs[category]["coeffs"].append(coeffs) + sorted_coeffs[category]["left_indices"].append(left) + sorted_coeffs[category]["right_indices"].append(right) + + for category in sorted_coeffs: + category["coeffs"] = cp.array(category["coeffs"]) + category["left_indices"] = cp.concatenate(category["left_indices"]) + category["right_indices"] = cp.concatenate(category["right_indices"]) + + return sorted_coeffs, sliced_axis[-1] + + +def contracted_to_primitive( + batched_matrices, sorted_coeffs_left, sorted_coeffs_right, primitive_shape +): + + assert len(batched_matrices.shape) == 3 + + n_slices = batched_matrices.shape[0] + n_cols = batched_matrices.shape[2] + + n_rows_primitive = primitive_shape[0] + n_cols_primitive = primitive_shape[1] + + intermediate_shape = (n_slices, n_rows_primitive, n_cols) + intermediate = cp.zeros(intermediate_shape) + + for i in sorted_coeffs_left: + subarray_shape = (n_slices, -1, i["shape"][1], n_cols) + intermediate[:, i["left_indices"]] = cp.einsum( + "naij, api -> napj", + batched_matrices[:, i["right_indices"]].reshape(subarray_shape), + i["coeffs"], + ).reshape(n_slices, -1, n_cols) + + intermediate = intermediate.transpose(0, 2, 1) + + result_shape = (n_slices, n_cols_primitive, n_rows_primitive) + result = cp.zeros(result_shape) + + for i in sorted_coeffs_right: + subarray_shape = (n_slices, -1, i["shape"][1], n_rows_primitive) + result[:, i["left_indices"]] = cp.einsum( + "najp, aqj -> naqp", + intermediate[:, i["right_indices"]].reshape(subarray_shape), + i["coeffs"], + ).reshape(n_slices, -1, n_rows_primitive) + + return result.transpose(0, 2, 1) + + +def primitive_to_contracted( + batched_matrices, sorted_coeffs_left, sorted_coeffs_right, contracted_shape +): + + assert len(batched_matrices.shape) == 3 + + n_slices = batched_matrices.shape[0] + n_cols = batched_matrices.shape[2] + + n_rows_contracted = contracted_shape[0] + n_cols_contracted = contracted_shape[1] + + intermediate_shape = (n_slices, n_rows_contracted, n_cols) + intermediate = cp.zeros(intermediate_shape) + + for i in sorted_coeffs_left: + subarray_shape = (n_slices, -1, i["shape"][0], n_cols) + intermediate[:, i["right_indices"]] = cp.einsum( + "napq, api -> naiq", + batched_matrices[:, i["left_indices"]].reshape(subarray_shape), + i["coeffs"], + ).reshape(n_slices, -1, n_cols) + + intermediate = intermediate.transpose(0, 2, 1) + result_shape = (n_slices, n_cols_contracted, n_rows_contracted) + result = cp.zeros(result_shape) + + for i in sorted_coeffs_right: + subarray_shape = (n_slices, -1, i["shape"][0], n_rows_contracted) + result[:, i["right_indices"]] = cp.einsum( + "naqi, aqj -> naji", + intermediate[:, i["left_indices"]].reshape(subarray_shape), + i["coeffs"], + ).reshape(n_slices, -1, n_rows_contracted) + + return result.transpose(0, 2, 1) + + def image_pair_to_difference( vectors_to_neighboring_images, lattice_vectors, @@ -377,7 +657,7 @@ def assign_pairs_to_blocks( pairs_on_blocks = pairs_on_blocks[pairs_on_blocks >= 0] sorted_block_index = cp.asarray(cp.argsort(-n_pairs_on_blocks), dtype=cp.int32) - n_contributing_blocks = cp.count_nonzero(n_pairs_on_blocks) + n_contributing_blocks = cp.count_nonzero(n_pairs_on_blocks) accumulated_n_pairs_per_block[1:] = cp.cumsum(n_pairs_on_blocks, dtype=cp.int32) sorted_block_index = sorted_block_index[:n_contributing_blocks] @@ -445,7 +725,7 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"): libgpbc.update_dxyz_dabc(dxyz_dabc.ctypes) n_blocks_abc = np.asarray(np.ceil(mesh / block_size), dtype=cp.int32) equivalent_cell_in_localized, coeff_in_localized = ( - subcell_in_localized_region.decontract_basis(to_cart=True, aggregate=True) + subcell_in_localized_region.decontract_basis(to_cart=True) ) n_primitive_gtos_in_localized = multigrid._pgto_shells( @@ -460,13 +740,11 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"): if grids_diffused is None: grouped_cell = equivalent_cell_in_localized - concatenated_coeff = scipy.linalg.block_diag(coeff_in_localized) + concatenated_coeff = coeff_in_localized else: subcell_in_diffused_region = grids_diffused.cell equivalent_cell_in_diffused, coeff_in_diffused = ( - subcell_in_diffused_region.decontract_basis( - to_cart=True, aggregate=True - ) + subcell_in_diffused_region.decontract_basis(to_cart=True) ) grouped_cell = equivalent_cell_in_localized + equivalent_cell_in_diffused @@ -475,10 +753,15 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"): subcell_in_localized_region._atm ) - concatenated_coeff = scipy.linalg.block_diag( - coeff_in_localized, coeff_in_diffused - ) - concatenated_coeff = cp.asarray(concatenated_coeff) + concatenated_coeff = coeff_in_localized + coeff_in_diffused + + coeff_in_localized, localized_shape = sort_contraction_coefficients( + coeff_in_localized + ) + + concatenated_coeff, concatenated_shape = sort_contraction_coefficients( + concatenated_coeff + ) n_primitive_gtos_in_two_regions = multigrid._pgto_shells(grouped_cell) rad = vol ** (-1.0 / 3) * cell.rcut + 1 @@ -499,7 +782,6 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"): concatenated_ao_indices = cp.concatenate( (ao_indices_in_localized, ao_indices_in_diffused) ) - coeff_in_localized = cp.asarray(coeff_in_localized) per_angular_pairs = [] i_angulars = grouped_cell._bas[:n_primitive_gtos_in_localized, multigrid.ANG_OF] @@ -611,6 +893,8 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"): "concatenated_ao_indices": concatenated_ao_indices, "coeff_in_localized": coeff_in_localized, "concatenated_coeff": concatenated_coeff, + "primitive_shape": (localized_shape[0], concatenated_shape[0]), + "contracted_shape": (localized_shape[1], concatenated_shape[1]), "atm": atm, "bas": bas, "env": env, @@ -758,7 +1042,6 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"): n_grid_points = np.prod(mesh) weight_per_grid_point = 1.0 / n_k_points * cell.vol / n_grid_points - density_matrix_with_rows_in_localized = dms[ :, :, @@ -773,7 +1056,6 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"): pairs["ao_indices_in_localized"], ] - n_ao_in_localized = density_matrix_with_rows_in_diffused.shape[3] t0 = log.init_timer() @@ -782,17 +1064,20 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"): :, :, :, n_ao_in_localized: ] += density_matrix_with_rows_in_diffused.transpose(0, 1, 3, 2).conj() - coeff_sandwiched_density_matrix = cp.einsum( - "nkij,pi->nkpj", - density_matrix_with_rows_in_localized, - pairs["coeff_in_localized"], + n_sets, n_k_points = density_matrix_with_rows_in_localized.shape[:2] + + density_matrix_with_rows_in_localized = ( + density_matrix_with_rows_in_localized.reshape( + -1, *pairs["contracted_shape"] + ) ) - coeff_sandwiched_density_matrix = cp.einsum( - "nkpj, qj -> nkpq", - coeff_sandwiched_density_matrix, + coeff_sandwiched_density_matrix = contracted_to_primitive( + density_matrix_with_rows_in_localized, + pairs["coeff_in_localized"], pairs["concatenated_coeff"], - ) + pairs["primitive_shape"], + ).reshape(n_sets, n_k_points, *pairs["primitive_shape"]) contraction_time += log.timer_silent(*t0)[-1] t0 = log.init_timer() @@ -803,7 +1088,8 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"): density = ( evaluate_density_wrapper( pairs, coeff_sandwiched_density_matrix, img_phase, with_tau=with_tau - ) * weight_per_grid_point + ) + * weight_per_grid_point ) kernel_time += log.timer_silent(*t0)[-1] @@ -839,8 +1125,8 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"): density_on_g_mesh = density_on_g_mesh.reshape([n_channels, density_slices, -1]) if xc_type != "LDA": - density_on_g_mesh[:, 1:4] = pbc_tools._get_Gv(mydf.cell, mydf.mesh).T - density_on_g_mesh[:, 1:4] *= density_on_g_mesh[:, :1] * 1j + for i in range(len(density_on_g_mesh)): + density_on_g_mesh[i, 1:4] = iG_density(density_on_g_mesh[i], cell) print("density kernel: ", kernel_time, "ms") print("density fft: ", fft_time, "ms") print("density contraction: ", contraction_time, "ms") @@ -864,8 +1150,7 @@ def evaluate_xc_wrapper(pairs_info, xc_weights, img_phase, with_tau=False): c_driver = libgpbc.evaluate_xc_with_tau_driver else: c_driver = libgpbc.evaluate_xc_driver - n_i_functions = len(pairs_info["coeff_in_localized"]) - n_j_functions = len(pairs_info["concatenated_coeff"]) + n_i_functions, n_j_functions = pairs_info["primitive_shape"] phase_diff_among_images, image_pair_difference_index = img_phase n_k_points, n_difference_images = phase_diff_among_images.shape @@ -922,12 +1207,12 @@ def evaluate_xc_wrapper(pairs_info, xc_weights, img_phase, with_tau=False): f"evaluate_xc_driver for li={i_angular} lj={j_angular} failed" ) - if not (n_k_points == 1 and n_difference_images == 1): return cp.einsum("kt, ntij -> nkij", phase_diff_among_images, fock) else: return fock + def convert_xc_on_g_mesh_to_fock( mydf, xc_on_g_mesh, @@ -1000,15 +1285,21 @@ def convert_xc_on_g_mesh_to_fock( n_ao_in_localized = len(pairs["ao_indices_in_localized"]) libgpbc.update_dxyz_dabc(pairs["dxyz_dabc"].ctypes) img_phase = image_phase_for_kpts(cell, pairs["neighboring_images"], kpts) - fock_slice = evaluate_xc_wrapper( + fock_slice = evaluate_xc_wrapper( pairs, interpolated_xc, img_phase, with_tau=with_tau ) - kernel_time += log.timer_silent(*t0)[-1] t0 = log.init_timer() - - fock_slice = cp.einsum("nkpq,pi->nkiq", fock_slice, pairs["coeff_in_localized"]) - fock_slice = cp.einsum("nkiq,qj->nkij", fock_slice, pairs["concatenated_coeff"]) + n_sets, n_k_points = fock_slice.shape[:2] + + fock_slice = fock_slice.reshape(-1, *pairs["primitive_shape"]) + + fock_slice = primitive_to_contracted( + fock_slice, + pairs["coeff_in_localized"], + pairs["concatenated_coeff"], + pairs["contracted_shape"], + ).reshape(n_sets, n_k_points, *pairs["contracted_shape"]) # While mathematically it is correct to have concatenated # ao indices in the addition, but it is possible that the ao @@ -1046,7 +1337,6 @@ def convert_xc_on_g_mesh_to_fock( else: raise NotImplementedError - print("xc kernel: ", kernel_time, "ms") print("xc fft: ", fft_time, "ms") print("xc contraction: ", contraction_time, "ms") @@ -1197,17 +1487,18 @@ def convert_xc_on_g_mesh_to_fock_gradient( :, :, :, n_ao_in_localized: ] += density_matrix_with_rows_in_diffused.transpose(0, 1, 3, 2).conj() - coeff_sandwiched_density_matrix = cp.einsum( - "nkij,pi->nkpj", - density_matrix_slice, - pairs["coeff_in_localized"], + n_sets, n_k_points = density_matrix_slice.shape[:2] + + density_matrix_slice = density_matrix_slice.reshape( + -1, *pairs["contracted_shape"] ) - coeff_sandwiched_density_matrix = cp.einsum( - "nkpj, qj -> nkpq", - coeff_sandwiched_density_matrix, + coeff_sandwiched_density_matrix = contracted_to_primitive( + density_matrix_slice, + pairs["coeff_in_localized"], pairs["concatenated_coeff"], - ) + pairs["primitive_shape"], + ).reshape(n_sets, n_k_points, *pairs["primitive_shape"]) libgpbc.update_dxyz_dabc(pairs["dxyz_dabc"].ctypes) @@ -1378,12 +1669,9 @@ def nr_rks( tq = log.init_timer() Gv = pbc_tools._get_Gv(cell, mesh) - print("get_Gv: ", log.timer_silent(*tq)[-1], "ms") tq = log.init_timer() - coulomb_kernel_on_g_mesh = pbc_tools.get_coulG(cell, Gv=Gv) - print("get_coulG: ", log.timer_silent(*tq)[-1], "ms") - coulomb_on_g_mesh = rho_sf * coulomb_kernel_on_g_mesh + coulomb_on_g_mesh = rho_sf * ni.coulG coulomb_energy = 0.5 * rho_sf.conj().dot(coulomb_on_g_mesh).real coulomb_energy /= cell.vol log.debug("Multigrid Coulomb energy %s", coulomb_energy) @@ -1422,17 +1710,15 @@ def nr_rks( log.debug("Multigrid exc %s nelec %s", xc_energy_sum, n_electrons) - tq = log.init_timer() if xc_type == "LDA": pass elif xc_type == "GGA": - xc_for_fock = ( - xc_for_fock[0] - contract("gp, pg -> p", xc_for_fock[1:4], Gv) * 1j - ) + contract_iG_potential(xc_for_fock, cell) + xc_for_fock = xc_for_fock[0] xc_for_fock = xc_for_fock.reshape((-1, ngrids)) elif xc_type == "MGGA": - xc_for_fock[0] -= contract("gp, pg -> p", xc_for_fock[1:4], Gv) * 1j + xc_for_fock = contract_iG_potential(xc_for_fock, cell) xc_for_fock = cp.concatenate( [ xc_for_fock[0].reshape((-1, ngrids)), @@ -1514,8 +1800,7 @@ def nr_uks( rho_sf = density[0, 0] + density[1, 0] Gv = pbc_tools._get_Gv(cell, mesh) - coulomb_kernel_on_g_mesh = pbc_tools.get_coulG(cell, Gv=Gv) - coulomb_on_g_mesh = rho_sf * coulomb_kernel_on_g_mesh + coulomb_on_g_mesh = rho_sf * ni.coulG coulomb_energy = 0.5 * rho_sf.conj().dot(coulomb_on_g_mesh).real coulomb_energy /= cell.vol log.debug("Multigrid Coulomb energy %s", coulomb_energy) @@ -1553,12 +1838,12 @@ def nr_uks( if xc_type == "LDA": pass elif xc_type == "GGA": - xc_for_fock = ( - xc_for_fock[:, 0] - contract("ngp, pg -> np", xc_for_fock[:, 1:4], Gv) * 1j - ) + for i in xc_for_fock: + contract_iG_potential(i, cell) xc_for_fock = xc_for_fock.reshape((nset, -1, ngrids)) elif xc_type == "MGGA": - xc_for_fock[:, 0] -= contract("ngp, pg -> np", xc_for_fock[:, 1:4], Gv) * 1j + for i in xc_for_fock: + contract_iG_potential(i, cell) xc_for_fock = cp.concatenate( [ xc_for_fock[:, 0].reshape((nset, -1, ngrids)), @@ -1623,8 +1908,7 @@ def get_veff_ip1( density = evaluate_density_on_g_mesh(ni, dm_kpts, kpts, xc_type) Gv = pbc_tools._get_Gv(cell, mesh) - coulomb_kernel_on_g_mesh = pbc_tools.get_coulG(cell, Gv=Gv) - coulomb_on_g_mesh = cp.einsum("ng, g -> g", density[:, 0], coulomb_kernel_on_g_mesh) + coulomb_on_g_mesh = cp.einsum("ng, g -> g", density[:, 0], ni.coulG) weight = cell.vol / ngrids @@ -1649,9 +1933,9 @@ def get_veff_ip1( if xc_type == "LDA": pass elif xc_type == "GGA": - xc_for_fock = ( - xc_for_fock[:, 0] - contract("ngp, pg -> np", xc_for_fock[:, 1:4], Gv) * 1j - ) + for i in xc_for_fock: + contract_iG_potential(i, cell) + xc_for_fock = xc_for_fock[:,0] xc_for_fock = xc_for_fock.reshape((nset, -1, ngrids)) elif xc_type == "MGGA": xc_for_fock[:, 0] -= contract("ngp, pg -> np", xc_for_fock[:, 1:4], Gv) * 1j @@ -1687,6 +1971,8 @@ def __init__(self, cell): self.mesh = cell.mesh self.tasks = None self.sorted_gaussian_pairs = None + Gv = pbc_tools._get_Gv(cell, cell.mesh) + self.coulG = pbc_tools.get_coulG(cell, Gv=Gv) build = sort_gaussian_pairs From 38189ecf5f5203a52f7544def4161b4a53c8b05c Mon Sep 17 00:00:00 2001 From: Rui Li Date: Mon, 24 Nov 2025 14:58:32 -0800 Subject: [PATCH 06/14] clean timing printers --- gpu4pyscf/pbc/dft/multigrid_v2.py | 53 ++----------------------------- 1 file changed, 3 insertions(+), 50 deletions(-) diff --git a/gpu4pyscf/pbc/dft/multigrid_v2.py b/gpu4pyscf/pbc/dft/multigrid_v2.py index 3d03b7cf2..e184c6899 100644 --- a/gpu4pyscf/pbc/dft/multigrid_v2.py +++ b/gpu4pyscf/pbc/dft/multigrid_v2.py @@ -83,7 +83,7 @@ def iG_density(density, cell): np.array(np.ceil(mesh_in_int32 / np.array(block_dim)), dtype=np.int32) ) - result = cp.zeros((3, *mesh), dtype=cp.complex128).reshape(3,-1) + result = cp.zeros((3, *mesh), dtype=cp.complex128).reshape(3, -1) custom_kernel = cp.RawKernel( r""" @@ -137,6 +137,7 @@ def iG_density(density, cell): return result + def contract_iG_potential(gga_potential, cell): mesh = cell.mesh b = cp.asarray(cell.reciprocal_vectors()) @@ -1025,16 +1026,12 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"): raise ValueError(f"Incorrect xc_type = {xc_type}") cell = mydf.cell - log = logger.new_logger(cell, cell.verbose) nx, ny, nz = mydf.mesh density_on_g_mesh = cp.zeros( (n_channels, density_slices, nx, ny, nz), dtype=cp.complex128 ) - kernel_time = 0 - fft_time = 0 - contraction_time = 0 for pairs in mydf.sorted_gaussian_pairs: mesh = pairs["mesh"] @@ -1058,8 +1055,6 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"): n_ao_in_localized = density_matrix_with_rows_in_diffused.shape[3] - t0 = log.init_timer() - density_matrix_with_rows_in_localized[ :, :, :, n_ao_in_localized: ] += density_matrix_with_rows_in_diffused.transpose(0, 1, 3, 2).conj() @@ -1079,9 +1074,6 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"): pairs["primitive_shape"], ).reshape(n_sets, n_k_points, *pairs["primitive_shape"]) - contraction_time += log.timer_silent(*t0)[-1] - t0 = log.init_timer() - libgpbc.update_dxyz_dabc(pairs["dxyz_dabc"].ctypes) img_phase = image_phase_for_kpts(cell, pairs["neighboring_images"], kpts) @@ -1092,18 +1084,12 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"): * weight_per_grid_point ) - kernel_time += log.timer_silent(*t0)[-1] - t0 = log.init_timer() - if with_tau: assert density.shape[1] == 2 tau = density[:, 1] density = density[:, 0] density = fft_in_place(density) - - fft_time += log.timer_silent(*t0)[-1] - density_on_g_mesh[ :, 0, @@ -1127,9 +1113,6 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"): if xc_type != "LDA": for i in range(len(density_on_g_mesh)): density_on_g_mesh[i, 1:4] = iG_density(density_on_g_mesh[i], cell) - print("density kernel: ", kernel_time, "ms") - print("density fft: ", fft_time, "ms") - print("density contraction: ", contraction_time, "ms") return density_on_g_mesh @@ -1263,14 +1246,9 @@ def convert_xc_on_g_mesh_to_fock( data_type = complex_type(cp.float64) fock = cp.zeros((n_channels, n_k_points, nao, nao), dtype=data_type) - kernel_time = 0 - fft_time = 0 - contraction_time = 0 - log = logger.new_logger(cell, cell.verbose) for pairs in mydf.sorted_gaussian_pairs: - t0 = log.init_timer() interpolated_xc = xc_on_g_mesh[ :, :, @@ -1279,8 +1257,6 @@ def convert_xc_on_g_mesh_to_fock( pairs["fft_grid"][2], ] interpolated_xc = cp.asarray(ifft_in_place(interpolated_xc).real, order="C") - fft_time += log.timer_silent(*t0)[-1] - t0 = log.init_timer() n_ao_in_localized = len(pairs["ao_indices_in_localized"]) libgpbc.update_dxyz_dabc(pairs["dxyz_dabc"].ctypes) @@ -1288,8 +1264,6 @@ def convert_xc_on_g_mesh_to_fock( fock_slice = evaluate_xc_wrapper( pairs, interpolated_xc, img_phase, with_tau=with_tau ) - kernel_time += log.timer_silent(*t0)[-1] - t0 = log.init_timer() n_sets, n_k_points = fock_slice.shape[:2] fock_slice = fock_slice.reshape(-1, *pairs["primitive_shape"]) @@ -1323,8 +1297,6 @@ def convert_xc_on_g_mesh_to_fock( pairs["ao_indices_in_diffused"], ] += fock_slice[:, :, :, n_ao_in_localized:] - contraction_time += log.timer_silent(*t0)[-1] - if hermi == 1: fock[ :, @@ -1337,10 +1309,6 @@ def convert_xc_on_g_mesh_to_fock( else: raise NotImplementedError - print("xc kernel: ", kernel_time, "ms") - print("xc fft: ", fft_time, "ms") - print("xc contraction: ", contraction_time, "ms") - return fock @@ -1662,15 +1630,9 @@ def nr_rks( mesh = ni.mesh ngrids = np.prod(mesh) - tq = log.init_timer() density = evaluate_density_on_g_mesh(ni, dm_kpts, kpts, xc_type) - print("evaluate_density_on_g_mesh: ", log.timer_silent(*tq)[-1], "ms") rho_sf = density[0, 0] - tq = log.init_timer() - Gv = pbc_tools._get_Gv(cell, mesh) - - tq = log.init_timer() coulomb_on_g_mesh = rho_sf * ni.coulG coulomb_energy = 0.5 * rho_sf.conj().dot(coulomb_on_g_mesh).real coulomb_energy /= cell.vol @@ -1684,7 +1646,6 @@ def nr_rks( # eval_xc_eff supports float64 only density = cp.asarray(density, dtype=np.float64, order="C") - tq = log.init_timer() if xc_type == "LDA": xc_for_energy, xc_for_fock = ni.eval_xc_eff( xc_code, density[0], deriv=1, xctype=xc_type @@ -1696,21 +1657,16 @@ def nr_rks( else: raise ValueError(f"Incorrect xc_type = {xc_type}") - print("libxc: ", log.timer_silent(*tq)[-1], "ms") - rho_sf = density[0].real xc_energy_sum = rho_sf.dot(xc_for_energy.ravel()).get()[()] * weight # To reduce the memory usage, we reuse the xc_for_fock name. # Now xc_for_fock represents xc on G space xc_for_fock *= weight - tq = log.init_timer() xc_for_fock = fft_in_place(xc_for_fock.reshape(-1, *mesh)).reshape(-1, ngrids) - print("fft for xc:", log.timer_silent(*tq)[-1], "ms") log.debug("Multigrid exc %s nelec %s", xc_energy_sum, n_electrons) - tq = log.init_timer() if xc_type == "LDA": pass elif xc_type == "GGA": @@ -1729,8 +1685,6 @@ def nr_rks( else: raise ValueError(f"Incorrect xc_type = {xc_type}") - print("merge Gv: ", log.timer_silent(*tq)[-1], "ms") - if with_j: xc_for_fock[0] += coulomb_on_g_mesh @@ -1799,7 +1753,6 @@ def nr_uks( density = evaluate_density_on_g_mesh(ni, dm_kpts, kpts, xc_type) rho_sf = density[0, 0] + density[1, 0] - Gv = pbc_tools._get_Gv(cell, mesh) coulomb_on_g_mesh = rho_sf * ni.coulG coulomb_energy = 0.5 * rho_sf.conj().dot(coulomb_on_g_mesh).real coulomb_energy /= cell.vol @@ -1935,7 +1888,7 @@ def get_veff_ip1( elif xc_type == "GGA": for i in xc_for_fock: contract_iG_potential(i, cell) - xc_for_fock = xc_for_fock[:,0] + xc_for_fock = xc_for_fock[:, 0] xc_for_fock = xc_for_fock.reshape((nset, -1, ngrids)) elif xc_type == "MGGA": xc_for_fock[:, 0] -= contract("ngp, pg -> np", xc_for_fock[:, 1:4], Gv) * 1j From de2b63b7d3479b0dc825e9714cfda5b93c141d6e Mon Sep 17 00:00:00 2001 From: Rui Li Date: Thu, 4 Dec 2025 15:21:34 -0800 Subject: [PATCH 07/14] remove old caching --- gpu4pyscf/pbc/dft/multigrid_v2.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/gpu4pyscf/pbc/dft/multigrid_v2.py b/gpu4pyscf/pbc/dft/multigrid_v2.py index 0e7d04264..8671da0c6 100644 --- a/gpu4pyscf/pbc/dft/multigrid_v2.py +++ b/gpu4pyscf/pbc/dft/multigrid_v2.py @@ -1591,7 +1591,8 @@ def get_j_kpts(ni, dm_kpts, hermi=1, kpts=None, kpts_band=None): density = evaluate_density_on_g_mesh(ni, dm_kpts, kpts) Gv = pbc_tools._get_Gv(cell, mesh) - coulomb_kernel_on_g_mesh = pbc_tools.get_coulG(cell, Gv=Gv) + # coulomb_kernel_on_g_mesh = pbc_tools.get_coulG(cell, Gv=Gv) + coulomb_kernel_on_g_mesh = ni.coulG coulomb_on_g_mesh = cp.einsum("ng, g -> g", density[:, 0], coulomb_kernel_on_g_mesh) weight = cell.vol / ngrids @@ -1974,8 +1975,6 @@ def __init__(self, cell): Gv = pbc_tools._get_Gv(cell, cell.mesh) self.coulG = pbc_tools.get_coulG(cell, Gv=Gv) self.build() - if cell._pseudo: - self.cached_vpplocG_part1 = multigrid_v1.eval_vpplocG_part1(cell, self.mesh) build = sort_gaussian_pairs From 37d0d068071d678703d378d04ca45d8a51da376c Mon Sep 17 00:00:00 2001 From: Rui Li Date: Thu, 12 Feb 2026 14:21:38 -0800 Subject: [PATCH 08/14] add changes of pseudo-potential --- gpu4pyscf/pbc/dft/multigrid_v2.py | 737 +++++++++++++----------------- gpu4pyscf/pbc/grad/rhf.py | 35 +- 2 files changed, 338 insertions(+), 434 deletions(-) diff --git a/gpu4pyscf/pbc/dft/multigrid_v2.py b/gpu4pyscf/pbc/dft/multigrid_v2.py index 8671da0c6..ccedf4fb6 100644 --- a/gpu4pyscf/pbc/dft/multigrid_v2.py +++ b/gpu4pyscf/pbc/dft/multigrid_v2.py @@ -35,9 +35,9 @@ import gpu4pyscf.pbc.dft.multigrid as multigrid_v1 from gpu4pyscf.lib.cupy_helper import contract, tag_array, load_library -__all__ = ["MultiGridNumInt"] +__all__ = ['MultiGridNumInt'] -libgpbc = load_library("libmgrid_v2") +libgpbc = load_library('libmgrid_v2') libgpbc.evaluate_density_driver.restype = ctypes.c_int libgpbc.evaluate_xc_driver.restype = ctypes.c_int libgpbc.evaluate_xc_gradient_driver.restype = ctypes.c_int @@ -52,7 +52,7 @@ def complex_type(dtype): elif dtype == cp.float64: return cp.complex128 else: - raise ValueError("Invalid dtype") + raise ValueError('Invalid dtype') def cast_to_pointer(array): @@ -61,7 +61,7 @@ def cast_to_pointer(array): elif isinstance(array, np.ndarray): return array.ctypes.data_as(ctypes.c_void_p) else: - raise ValueError("Invalid array type") + raise ValueError('Invalid array type') def fft_in_place(x): @@ -78,9 +78,7 @@ def iG_density(density, cell): mesh_in_int32 = np.array(mesh, dtype=np.int32) block_dim = (8, 8, 8) - grid_dim = tuple( - np.array(np.ceil(mesh_in_int32 / np.array(block_dim)), dtype=np.int32) - ) + grid_dim = tuple(np.array(np.ceil(mesh_in_int32 / np.array(block_dim)), dtype=np.int32)) result = cp.zeros((3, *mesh), dtype=cp.complex128).reshape(3, -1) @@ -125,7 +123,7 @@ def iG_density(density, cell): c * reciprocal_lattice[8]) * density_value; } """, - "kernel", + 'kernel', ) custom_kernel( @@ -143,9 +141,7 @@ def contract_iG_potential(gga_potential, cell): mesh_in_int32 = np.array(mesh, dtype=np.int32) block_dim = (8, 8, 8) - grid_dim = tuple( - np.array(np.ceil(mesh_in_int32 / np.array(block_dim)), dtype=np.int32) - ) + grid_dim = tuple(np.array(np.ceil(mesh_in_int32 / np.array(block_dim)), dtype=np.int32)) custom_kernel = cp.RawKernel( r""" @@ -197,7 +193,7 @@ def contract_iG_potential(gga_potential, cell): *gga_potential -= potential_change; } """, - "kernel", + 'kernel', ) custom_kernel( @@ -210,11 +206,7 @@ def contract_iG_potential(gga_potential, cell): def unique_with_sort(x): # This function does the same thing as cp.unique(x, return_inverse=True). # It's not super optimized, but for whatever reason, cp.unique is very slow, so this one is better. - assert ( - type(x) is cp.ndarray - and (x.dtype == cp.int32 or x.dtype == cp.int64) - and x.ndim == 1 - ) + assert type(x) is cp.ndarray and (x.dtype == cp.int32 or x.dtype == cp.int64) and x.ndim == 1 n = x.shape[0] if n <= 1: return x, cp.zeros(n) @@ -237,11 +229,7 @@ def unique_with_sort(x): def unique_with_multiple_keys(x): # This function expands the previous function to handle multiple keys # shaped as [ (1, 2), (3, -4), ....] - assert ( - type(x) is cp.ndarray - and (x.dtype == cp.int32 or x.dtype == cp.int64) - and x.ndim == 2 - ) + assert type(x) is cp.ndarray and (x.dtype == cp.int32 or x.dtype == cp.int64) and x.ndim == 2 x = x.T n = x.shape[-1] @@ -270,43 +258,29 @@ def sort_contraction_coefficients(coeff): inverse = inverse.get() sliced_axis = np.zeros((len(coeff) + 1, 2), dtype=np.int32) - sliced_axis[1:] = np.cumsum( - np.array([i.shape for i in coeff]), axis=0, dtype=np.int32 - ) + sliced_axis[1:] = np.cumsum(np.array([i.shape for i in coeff]), axis=0, dtype=np.int32) - left_basis_function_indices = [ - cp.arange(begin, end) - for begin, end in zip(sliced_axis[:-1, 0], sliced_axis[1:, 0]) - ] + left_basis_function_indices = [cp.arange(begin, end) for begin, end in zip(sliced_axis[:-1, 0], sliced_axis[1:, 0])] right_basis_function_indices = [ - cp.arange(begin, end) - for begin, end in zip(sliced_axis[:-1, 1], sliced_axis[1:, 1]) + cp.arange(begin, end) for begin, end in zip(sliced_axis[:-1, 1], sliced_axis[1:, 1]) ] - sorted_coeffs = [ - {"shape": shape, "coeffs": [], "left_indices": [], "right_indices": []} - for shape in unique_shapes - ] + sorted_coeffs = [{'shape': shape, 'coeffs': [], 'left_indices': [], 'right_indices': []} for shape in unique_shapes] - for category, coeffs, left, right in zip( - inverse, coeff, left_basis_function_indices, right_basis_function_indices - ): - sorted_coeffs[category]["coeffs"].append(coeffs) - sorted_coeffs[category]["left_indices"].append(left) - sorted_coeffs[category]["right_indices"].append(right) + for category, coeffs, left, right in zip(inverse, coeff, left_basis_function_indices, right_basis_function_indices): + sorted_coeffs[category]['coeffs'].append(coeffs) + sorted_coeffs[category]['left_indices'].append(left) + sorted_coeffs[category]['right_indices'].append(right) for category in sorted_coeffs: - category["coeffs"] = cp.array(category["coeffs"]) - category["left_indices"] = cp.concatenate(category["left_indices"]) - category["right_indices"] = cp.concatenate(category["right_indices"]) + category['coeffs'] = cp.array(category['coeffs']) + category['left_indices'] = cp.concatenate(category['left_indices']) + category['right_indices'] = cp.concatenate(category['right_indices']) return sorted_coeffs, sliced_axis[-1] -def contracted_to_primitive( - batched_matrices, sorted_coeffs_left, sorted_coeffs_right, primitive_shape -): - +def contracted_to_primitive(batched_matrices, sorted_coeffs_left, sorted_coeffs_right, primitive_shape): assert len(batched_matrices.shape) == 3 n_slices = batched_matrices.shape[0] @@ -319,11 +293,11 @@ def contracted_to_primitive( intermediate = cp.zeros(intermediate_shape) for i in sorted_coeffs_left: - subarray_shape = (n_slices, -1, i["shape"][1], n_cols) - intermediate[:, i["left_indices"]] = cp.einsum( - "naij, api -> napj", - batched_matrices[:, i["right_indices"]].reshape(subarray_shape), - i["coeffs"], + subarray_shape = (n_slices, -1, i['shape'][1], n_cols) + intermediate[:, i['left_indices']] = cp.einsum( + 'naij, api -> napj', + batched_matrices[:, i['right_indices']].reshape(subarray_shape), + i['coeffs'], ).reshape(n_slices, -1, n_cols) intermediate = intermediate.transpose(0, 2, 1) @@ -332,20 +306,17 @@ def contracted_to_primitive( result = cp.zeros(result_shape) for i in sorted_coeffs_right: - subarray_shape = (n_slices, -1, i["shape"][1], n_rows_primitive) - result[:, i["left_indices"]] = cp.einsum( - "najp, aqj -> naqp", - intermediate[:, i["right_indices"]].reshape(subarray_shape), - i["coeffs"], + subarray_shape = (n_slices, -1, i['shape'][1], n_rows_primitive) + result[:, i['left_indices']] = cp.einsum( + 'najp, aqj -> naqp', + intermediate[:, i['right_indices']].reshape(subarray_shape), + i['coeffs'], ).reshape(n_slices, -1, n_rows_primitive) return result.transpose(0, 2, 1) -def primitive_to_contracted( - batched_matrices, sorted_coeffs_left, sorted_coeffs_right, contracted_shape -): - +def primitive_to_contracted(batched_matrices, sorted_coeffs_left, sorted_coeffs_right, contracted_shape): assert len(batched_matrices.shape) == 3 n_slices = batched_matrices.shape[0] @@ -358,11 +329,11 @@ def primitive_to_contracted( intermediate = cp.zeros(intermediate_shape) for i in sorted_coeffs_left: - subarray_shape = (n_slices, -1, i["shape"][0], n_cols) - intermediate[:, i["right_indices"]] = cp.einsum( - "napq, api -> naiq", - batched_matrices[:, i["left_indices"]].reshape(subarray_shape), - i["coeffs"], + subarray_shape = (n_slices, -1, i['shape'][0], n_cols) + intermediate[:, i['right_indices']] = cp.einsum( + 'napq, api -> naiq', + batched_matrices[:, i['left_indices']].reshape(subarray_shape), + i['coeffs'], ).reshape(n_slices, -1, n_cols) intermediate = intermediate.transpose(0, 2, 1) @@ -370,11 +341,11 @@ def primitive_to_contracted( result = cp.zeros(result_shape) for i in sorted_coeffs_right: - subarray_shape = (n_slices, -1, i["shape"][0], n_rows_contracted) - result[:, i["right_indices"]] = cp.einsum( - "naqi, aqj -> naji", - intermediate[:, i["left_indices"]].reshape(subarray_shape), - i["coeffs"], + subarray_shape = (n_slices, -1, i['shape'][0], n_rows_contracted) + result[:, i['right_indices']] = cp.einsum( + 'naqi, aqj -> naji', + intermediate[:, i['left_indices']].reshape(subarray_shape), + i['coeffs'], ).reshape(n_slices, -1, n_rows_contracted) return result.transpose(0, 2, 1) @@ -384,7 +355,7 @@ def image_pair_to_difference( vectors_to_neighboring_images, lattice_vectors, ): - ''' + """ Find unique image pairs for double lattice-sums associated with orbital products. When k-point phases are applied to orbital products with double lattice sum @@ -406,14 +377,14 @@ def image_pair_to_difference( A tuple containing: - The reduced lattice-sum vectors T for the unique image pairs. - A inverse mapping that restores the index of double lattice-sum from T. - ''' + """ vectors_to_neighboring_images = cp.asarray(vectors_to_neighboring_images) lattice_vectors = cp.asarray(lattice_vectors) translation_vectors = cp.asarray( cp.linalg.solve(lattice_vectors.T, vectors_to_neighboring_images.T).T, ) - translation_vectors = cp.asarray(cp.round(translation_vectors), dtype = cp.int32) + translation_vectors = cp.asarray(cp.round(translation_vectors), dtype=cp.int32) difference_images, inverse = _unique_image_pair(translation_vectors) difference_images = difference_images @ lattice_vectors @@ -421,30 +392,31 @@ def image_pair_to_difference( # where R1 is associated with the first orbital in a pair, and R2 associated to the second. return cp.asarray(difference_images), cp.asarray(inverse, dtype=cp.int32) + def _unique_image_pair(translation_vectors): - ''' + """ unqiue((-L[:,None] + L).reshape(-1, 3), axis=0, return_inverse=True) - ''' + """ image_difference_full = ( # -k_i + k_j corresponding to - translation_vectors[None,:,:] - translation_vectors[:,None,:] + translation_vectors[None, :, :] - translation_vectors[:, None, :] ).reshape(-1, 3) max_offset = (translation_vectors.max(axis=0) - translation_vectors.min(axis=0)).max() + 1 - assert (max_offset * 2)**3 < np.iinfo(np.int32).max + assert (max_offset * 2) ** 3 < np.iinfo(np.int32).max image_difference_3in1 = image_difference_full image_difference_3in1 += max_offset - image_difference_3in1 = image_difference_3in1[:, 0] * (max_offset * 2)**2 \ - + image_difference_3in1[:, 1] * (max_offset * 2) \ - + image_difference_3in1[:, 2] + image_difference_3in1 = ( + image_difference_3in1[:, 0] * (max_offset * 2) ** 2 + + image_difference_3in1[:, 1] * (max_offset * 2) + + image_difference_3in1[:, 2] + ) image_difference_3in1, inverse = unique_with_sort(image_difference_3in1) translation_vectors = cp.empty([image_difference_3in1.shape[0], 3], dtype=cp.int32) translation_vectors[:, 0] = image_difference_3in1 // (max_offset * 2) ** 2 - translation_vectors[:, 1] = (image_difference_3in1 % (max_offset * 2) ** 2) // ( - max_offset * 2 - ) + translation_vectors[:, 1] = (image_difference_3in1 % (max_offset * 2) ** 2) // (max_offset * 2) translation_vectors[:, 2] = image_difference_3in1 % (max_offset * 2) translation_vectors -= max_offset return translation_vectors, inverse @@ -461,9 +433,7 @@ def image_phase_for_kpts(cell, neighboring_images, kpts=None): neighboring_images, lattice_vectors, ) - phase_diff_among_images = cp.exp( - 1j * cp.asarray(kpts.reshape(-1, 3)).dot(difference_images.T) - ) + phase_diff_among_images = cp.exp(1j * cp.asarray(kpts.reshape(-1, 3)).dot(difference_images.T)) return phase_diff_among_images, image_pair_difference_index @@ -500,9 +470,7 @@ def count_non_trivial_pairs( ctypes.c_double(threshold_in_log), ) if err != 0: - raise RuntimeError( - f"count_non_trivial_pairs for li={i_angular} lj={j_angular} failed" - ) + raise RuntimeError(f'count_non_trivial_pairs for li={i_angular} lj={j_angular} failed') return int(n_pairs[0]) @@ -558,9 +526,7 @@ def screen_gaussian_pairs( ctypes.c_double(threshold_in_log), ) if err != 0: - raise RuntimeError( - f"screen_gaussian_pairs for li={i_angular} lj={j_angular} failed" - ) + raise RuntimeError(f'screen_gaussian_pairs for li={i_angular} lj={j_angular} failed') return ( screened_shell_pairs, image_indices, @@ -615,13 +581,11 @@ def assign_pairs_to_blocks( ) has_unstable_pairs = n_unstable_pairs_on_blocks[-1] > 0 if not has_warned_instability and has_unstable_pairs: - warnings.warn( - "Numerical instability may occur due to presence of core electrons or insufficient ke_cutoff." - ) + warnings.warn('Numerical instability may occur due to presence of core electrons or insufficient ke_cutoff.') has_warned_instability = True if err != 0: - raise RuntimeError("count_pairs_on_blocks failed") + raise RuntimeError('count_pairs_on_blocks failed') n_contributing_blocks = int(n_pairs_on_blocks[-1]) n_pairs_on_blocks = n_pairs_on_blocks[:-1] @@ -691,7 +655,7 @@ def assign_pairs_to_blocks( ) -def sort_gaussian_pairs(mydf, xc_type="LDA"): +def sort_gaussian_pairs(mydf, xc_type='LDA'): cell = mydf.cell log = logger.new_logger(cell) t0 = log.init_timer() @@ -704,7 +668,7 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"): is_non_orthogonal = 1 else: is_non_orthogonal = 0 - reciprocal_lattice_vectors = np.asarray(np.linalg.inv(lattice_vectors.T), order="C") + reciprocal_lattice_vectors = np.asarray(np.linalg.inv(lattice_vectors.T), order='C') reciprocal_norms = np.linalg.norm(reciprocal_lattice_vectors, axis=1) libgpbc.update_lattice_vectors( @@ -713,20 +677,20 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"): reciprocal_norms.ctypes, ) - tasks = getattr(mydf, "tasks", None) + tasks = getattr(mydf, 'tasks', None) if tasks is None: tasks = multigrid.multi_grids_tasks(cell, mydf.mesh, log) mydf.tasks = tasks - t0 = log.timer("task generation", *t0) + t0 = log.timer('task generation', *t0) t1 = t0 pairs = [] derivative_order = 0 - if xc_type == "GGA": + if xc_type == 'GGA': derivative_order = 1 - if xc_type == "mGGA": - raise NotImplementedError("mGGA is not supported yet in multigrid_v2.") + if xc_type == 'mGGA': + raise NotImplementedError('mGGA is not supported yet in multigrid_v2.') for grids_localized, grids_diffused in tasks: subcell_in_localized_region = grids_localized.cell @@ -736,9 +700,7 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"): fft_grid = list( map( - lambda n_mesh_points: cp.round( - cp.fft.fftfreq(n_mesh_points, 1.0 / n_mesh_points) - ).astype(cp.int32), + lambda n_mesh_points: cp.round(cp.fft.fftfreq(n_mesh_points, 1.0 / n_mesh_points)).astype(cp.int32), mesh, ) ) @@ -746,13 +708,9 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"): dxyz_dabc = lattice_vectors / mesh[:, None] libgpbc.update_dxyz_dabc(dxyz_dabc.ctypes) n_blocks_abc = np.asarray(np.ceil(mesh / block_size), dtype=cp.int32) - equivalent_cell_in_localized, coeff_in_localized = ( - subcell_in_localized_region.decontract_basis(to_cart=True) - ) + equivalent_cell_in_localized, coeff_in_localized = subcell_in_localized_region.decontract_basis(to_cart=True) - n_primitive_gtos_in_localized = multigrid._pgto_shells( - subcell_in_localized_region - ) + n_primitive_gtos_in_localized = multigrid._pgto_shells(subcell_in_localized_region) # theoretically we can use the rcut defined in localized cell to reduce the # number of images, but somehow it can introduce some error when the lattice @@ -765,25 +723,17 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"): concatenated_coeff = coeff_in_localized else: subcell_in_diffused_region = grids_diffused.cell - equivalent_cell_in_diffused, coeff_in_diffused = ( - subcell_in_diffused_region.decontract_basis(to_cart=True) - ) + equivalent_cell_in_diffused, coeff_in_diffused = subcell_in_diffused_region.decontract_basis(to_cart=True) grouped_cell = equivalent_cell_in_localized + equivalent_cell_in_diffused - grouped_cell._bas[n_primitive_gtos_in_localized:, 0] -= len( - subcell_in_localized_region._atm - ) + grouped_cell._bas[n_primitive_gtos_in_localized:, 0] -= len(subcell_in_localized_region._atm) concatenated_coeff = coeff_in_localized + coeff_in_diffused - coeff_in_localized, localized_shape = sort_contraction_coefficients( - coeff_in_localized - ) + coeff_in_localized, localized_shape = sort_contraction_coefficients(coeff_in_localized) - concatenated_coeff, concatenated_shape = sort_contraction_coefficients( - concatenated_coeff - ) + concatenated_coeff, concatenated_shape = sort_contraction_coefficients(concatenated_coeff) n_primitive_gtos_in_two_regions = multigrid._pgto_shells(grouped_cell) rad = vol ** (-1.0 / 3) * cell.rcut + 1 @@ -792,18 +742,14 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"): precision = cell.precision / lattice_sum_factor threshold_in_log = np.log(precision) - shell_to_ao_indices = cp.asarray( - gto.moleintor.make_loc(grouped_cell._bas, "cart"), dtype=cp.int32 - ) + shell_to_ao_indices = cp.asarray(gto.moleintor.make_loc(grouped_cell._bas, 'cart'), dtype=cp.int32) ao_indices_in_localized = cp.asarray(grids_localized.ao_idx, dtype=cp.int32) if grids_diffused is None: ao_indices_in_diffused = cp.array([], dtype=cp.int32) else: ao_indices_in_diffused = cp.asarray(grids_diffused.ao_idx, dtype=cp.int32) - concatenated_ao_indices = cp.concatenate( - (ao_indices_in_localized, ao_indices_in_diffused) - ) + concatenated_ao_indices = cp.concatenate((ao_indices_in_localized, ao_indices_in_diffused)) per_angular_pairs = [] i_angulars = grouped_cell._bas[:n_primitive_gtos_in_localized, multigrid.ANG_OF] @@ -813,9 +759,7 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"): i_shells = cp.asarray(np.where(i_angulars == l)[0], dtype=cp.int32) sorted_i_shells.append(i_shells) - j_angulars = grouped_cell._bas[ - :n_primitive_gtos_in_two_regions, multigrid.ANG_OF - ] + j_angulars = grouped_cell._bas[:n_primitive_gtos_in_two_regions, multigrid.ANG_OF] j_angulars_unique = np.unique(j_angulars) sorted_j_shells = [] for l in j_angulars_unique: @@ -826,7 +770,7 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"): bas = cp.asarray(grouped_cell._bas, dtype=cp.int32) env = cp.asarray(grouped_cell._env) - t1 = log.timer_debug2("routines before screening", *t1) + t1 = log.timer_debug2('routines before screening', *t1) has_warned_instability = False for i_angular, i_shells in zip(i_angulars_unique, sorted_i_shells): for j_angular, j_shells in zip(j_angulars_unique, sorted_j_shells): @@ -847,15 +791,9 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"): env, threshold_in_log, ) - t1 = log.timer_debug2( - "screening in angular pair" + str((i_angular, j_angular)), *t1 - ) - contributing_block_ranges = ( - pairs_to_blocks_end - pairs_to_blocks_begin + 1 - ) - n_contributing_blocks_per_pair = cp.prod( - contributing_block_ranges, axis=0 - ) + t1 = log.timer_debug2('screening in angular pair' + str((i_angular, j_angular)), *t1) + contributing_block_ranges = pairs_to_blocks_end - pairs_to_blocks_begin + 1 + n_contributing_blocks_per_pair = cp.prod(contributing_block_ranges, axis=0) n_indices = int(cp.sum(n_contributing_blocks_per_pair)) ( gaussian_pair_indices, @@ -885,60 +823,57 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"): derivative_order, ) t1 = log.timer_debug2( - "assigning pairs to blocks in angular pair" - + str((i_angular, j_angular)), + 'assigning pairs to blocks in angular pair' + str((i_angular, j_angular)), *t1, ) per_angular_pairs.append( { - "angular": (i_angular, j_angular), - "screened_shell_pairs": screened_shell_pairs, - "pair_indices_per_block": gaussian_pair_indices, - "accumulated_counts_per_block": accumulated_counts, - "sorted_block_index": sorted_contributing_blocks, - "image_indices": image_indices, - "i_shells": i_shells, - "j_shells": j_shells, - "shell_to_ao_indices": shell_to_ao_indices, + 'angular': (i_angular, j_angular), + 'screened_shell_pairs': screened_shell_pairs, + 'pair_indices_per_block': gaussian_pair_indices, + 'accumulated_counts_per_block': accumulated_counts, + 'sorted_block_index': sorted_contributing_blocks, + 'image_indices': image_indices, + 'i_shells': i_shells, + 'j_shells': j_shells, + 'shell_to_ao_indices': shell_to_ao_indices, } ) pairs.append( { - "per_angular_pairs": per_angular_pairs, - "neighboring_images": vectors_to_neighboring_images, - "grouped_cell": grouped_cell, - "mesh": mesh, # this one is on cpu memory - "fft_grid": fft_grid, - "ao_indices_in_localized": ao_indices_in_localized, - "ao_indices_in_diffused": ao_indices_in_diffused, - "concatenated_ao_indices": concatenated_ao_indices, - "coeff_in_localized": coeff_in_localized, - "concatenated_coeff": concatenated_coeff, - "primitive_shape": (localized_shape[0], concatenated_shape[0]), - "contracted_shape": (localized_shape[1], concatenated_shape[1]), - "atm": atm, - "bas": bas, - "env": env, - "dxyz_dabc": dxyz_dabc, - "is_non_orthogonal": is_non_orthogonal, + 'per_angular_pairs': per_angular_pairs, + 'neighboring_images': vectors_to_neighboring_images, + 'grouped_cell': grouped_cell, + 'mesh': mesh, # this one is on cpu memory + 'fft_grid': fft_grid, + 'ao_indices_in_localized': ao_indices_in_localized, + 'ao_indices_in_diffused': ao_indices_in_diffused, + 'concatenated_ao_indices': concatenated_ao_indices, + 'coeff_in_localized': coeff_in_localized, + 'concatenated_coeff': concatenated_coeff, + 'primitive_shape': (localized_shape[0], concatenated_shape[0]), + 'contracted_shape': (localized_shape[1], concatenated_shape[1]), + 'atm': atm, + 'bas': bas, + 'env': env, + 'dxyz_dabc': dxyz_dabc, + 'is_non_orthogonal': is_non_orthogonal, } ) mydf.sorted_gaussian_pairs = pairs - t0 = log.timer("sort_gaussian_pairs", *t0) + t0 = log.timer('sort_gaussian_pairs', *t0) return mydf -def evaluate_density_wrapper( - pairs_info, dm_slice, img_phase, ignore_imag=True, with_tau=False -): +def evaluate_density_wrapper(pairs_info, dm_slice, img_phase, ignore_imag=True, with_tau=False): if with_tau: c_driver = libgpbc.evaluate_density_tau_driver else: c_driver = libgpbc.evaluate_density_driver - n_images = pairs_info["neighboring_images"].shape[0] + n_images = pairs_info['neighboring_images'].shape[0] phase_diff_among_images, image_pair_difference_index = img_phase n_k_points, n_difference_images = phase_diff_among_images.shape if n_k_points == 1 and n_difference_images == 1: @@ -948,9 +883,7 @@ def evaluate_density_wrapper( # e^{i \vec{k} \cdot (\vec{R}_1 - \vec{R}_2)} # Because during grid density evaluation, rho = \sum_{\mu\nu} D_{\mu\nu} \mu \nu^* # The conjugate is on \nu, which is different from other Fock integrals - density_matrix_with_translation = cp.einsum( - "kt, ikpq->itpq", phase_diff_among_images.conj(), dm_slice - ) + density_matrix_with_translation = cp.einsum('kt, ikpq->itpq', phase_diff_among_images.conj(), dm_slice) n_channels, _, n_i_functions, n_j_functions = density_matrix_with_translation.shape @@ -962,9 +895,7 @@ def evaluate_density_wrapper( # assert abs(density_matrix_with_translation.imag).max() < real_dm_imag_threshold, \ # f"The dm transformed into real space contains large imaginary part " \ # f"(max = {abs(density_matrix_with_translation.imag).max()}) >= {real_dm_imag_threshold}" - density_matrix_with_translation_real_part = cp.asarray( - density_matrix_with_translation.real, order="C" - ) + density_matrix_with_translation_real_part = cp.asarray(density_matrix_with_translation.real, order='C') if density_matrix_with_translation_real_part.dtype == cp.float32: use_float_precision = ctypes.c_int(1) @@ -979,83 +910,78 @@ def evaluate_density_wrapper( n_channels, 2, ) - + tuple(pairs_info["mesh"]), + + tuple(pairs_info['mesh']), dtype=density_matrix_with_translation_real_part.dtype, ) else: density = cp.zeros( - (n_channels,) + tuple(pairs_info["mesh"]), + (n_channels,) + tuple(pairs_info['mesh']), dtype=density_matrix_with_translation_real_part.dtype, ) - for gaussians_per_angular_pair in pairs_info["per_angular_pairs"]: - (i_angular, j_angular) = gaussians_per_angular_pair["angular"] + for gaussians_per_angular_pair in pairs_info['per_angular_pairs']: + (i_angular, j_angular) = gaussians_per_angular_pair['angular'] err = c_driver( cast_to_pointer(density), cast_to_pointer(density_matrix_with_translation_real_part), ctypes.c_int(i_angular), ctypes.c_int(j_angular), - cast_to_pointer(gaussians_per_angular_pair["screened_shell_pairs"]), - cast_to_pointer(gaussians_per_angular_pair["i_shells"]), - cast_to_pointer(gaussians_per_angular_pair["j_shells"]), - ctypes.c_int(len(gaussians_per_angular_pair["j_shells"])), - cast_to_pointer(gaussians_per_angular_pair["shell_to_ao_indices"]), + cast_to_pointer(gaussians_per_angular_pair['screened_shell_pairs']), + cast_to_pointer(gaussians_per_angular_pair['i_shells']), + cast_to_pointer(gaussians_per_angular_pair['j_shells']), + ctypes.c_int(len(gaussians_per_angular_pair['j_shells'])), + cast_to_pointer(gaussians_per_angular_pair['shell_to_ao_indices']), ctypes.c_int(n_i_functions), ctypes.c_int(n_j_functions), - cast_to_pointer(gaussians_per_angular_pair["pair_indices_per_block"]), - cast_to_pointer(gaussians_per_angular_pair["accumulated_counts_per_block"]), - cast_to_pointer(gaussians_per_angular_pair["sorted_block_index"]), - ctypes.c_int(len(gaussians_per_angular_pair["sorted_block_index"])), - cast_to_pointer(gaussians_per_angular_pair["image_indices"]), - cast_to_pointer(pairs_info["neighboring_images"]), + cast_to_pointer(gaussians_per_angular_pair['pair_indices_per_block']), + cast_to_pointer(gaussians_per_angular_pair['accumulated_counts_per_block']), + cast_to_pointer(gaussians_per_angular_pair['sorted_block_index']), + ctypes.c_int(len(gaussians_per_angular_pair['sorted_block_index'])), + cast_to_pointer(gaussians_per_angular_pair['image_indices']), + cast_to_pointer(pairs_info['neighboring_images']), ctypes.c_int(n_images), cast_to_pointer(image_pair_difference_index), ctypes.c_int(n_difference_images), - (ctypes.c_int * 3)(*pairs_info["mesh"]), - cast_to_pointer(pairs_info["atm"]), - cast_to_pointer(pairs_info["bas"]), - cast_to_pointer(pairs_info["env"]), + (ctypes.c_int * 3)(*pairs_info['mesh']), + cast_to_pointer(pairs_info['atm']), + cast_to_pointer(pairs_info['bas']), + cast_to_pointer(pairs_info['env']), ctypes.c_int(n_channels), - ctypes.c_int(pairs_info["is_non_orthogonal"]), + ctypes.c_int(pairs_info['is_non_orthogonal']), use_float_precision, ) if err != 0: - raise RuntimeError( - f"evaluate_density_driver for li={i_angular} lj={j_angular} failed" - ) + raise RuntimeError(f'evaluate_density_driver for li={i_angular} lj={j_angular} failed') return density -def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"): - dm_kpts = cp.asarray(dm_kpts, order="C") +def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type='LDA'): + dm_kpts = cp.asarray(dm_kpts, order='C') dms = _format_dms(dm_kpts, kpts) n_channels, n_k_points = dms.shape[:2] if mydf.sorted_gaussian_pairs is None: mydf.build(xc_type) with_tau = False - if xc_type == "LDA": + if xc_type == 'LDA': density_slices = 1 - elif xc_type == "GGA": + elif xc_type == 'GGA': density_slices = 4 - elif xc_type == "MGGA": + elif xc_type == 'MGGA': density_slices = 5 with_tau = True else: - raise ValueError(f"Incorrect xc_type = {xc_type}") + raise ValueError(f'Incorrect xc_type = {xc_type}') cell = mydf.cell nx, ny, nz = mydf.mesh - density_on_g_mesh = cp.zeros( - (n_channels, density_slices, nx, ny, nz), dtype=cp.complex128 - ) + density_on_g_mesh = cp.zeros((n_channels, density_slices, nx, ny, nz), dtype=cp.complex128) for pairs in mydf.sorted_gaussian_pairs: - - mesh = pairs["mesh"] + mesh = pairs['mesh'] n_grid_points = np.prod(mesh) weight_per_grid_point = 1.0 / n_k_points * cell.vol / n_grid_points @@ -1063,45 +989,41 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"): density_matrix_with_rows_in_localized = dms[ :, :, - pairs["ao_indices_in_localized"][:, None], - pairs["concatenated_ao_indices"], + pairs['ao_indices_in_localized'][:, None], + pairs['concatenated_ao_indices'], ] density_matrix_with_rows_in_diffused = dms[ :, :, - pairs["ao_indices_in_diffused"][:, None], - pairs["ao_indices_in_localized"], + pairs['ao_indices_in_diffused'][:, None], + pairs['ao_indices_in_localized'], ] n_ao_in_localized = density_matrix_with_rows_in_diffused.shape[3] - density_matrix_with_rows_in_localized[ - :, :, :, n_ao_in_localized: - ] += density_matrix_with_rows_in_diffused.transpose(0, 1, 3, 2).conj() + density_matrix_with_rows_in_localized[:, :, :, n_ao_in_localized:] += ( + density_matrix_with_rows_in_diffused.transpose(0, 1, 3, 2).conj() + ) n_sets, n_k_points = density_matrix_with_rows_in_localized.shape[:2] - density_matrix_with_rows_in_localized = ( - density_matrix_with_rows_in_localized.reshape( - -1, *pairs["contracted_shape"] - ) + density_matrix_with_rows_in_localized = density_matrix_with_rows_in_localized.reshape( + -1, *pairs['contracted_shape'] ) coeff_sandwiched_density_matrix = contracted_to_primitive( density_matrix_with_rows_in_localized, - pairs["coeff_in_localized"], - pairs["concatenated_coeff"], - pairs["primitive_shape"], - ).reshape(n_sets, n_k_points, *pairs["primitive_shape"]) + pairs['coeff_in_localized'], + pairs['concatenated_coeff'], + pairs['primitive_shape'], + ).reshape(n_sets, n_k_points, *pairs['primitive_shape']) - libgpbc.update_dxyz_dabc(pairs["dxyz_dabc"].ctypes) + libgpbc.update_dxyz_dabc(pairs['dxyz_dabc'].ctypes) - img_phase = image_phase_for_kpts(cell, pairs["neighboring_images"], kpts) + img_phase = image_phase_for_kpts(cell, pairs['neighboring_images'], kpts) density = ( - evaluate_density_wrapper( - pairs, coeff_sandwiched_density_matrix, img_phase, with_tau=with_tau - ) + evaluate_density_wrapper(pairs, coeff_sandwiched_density_matrix, img_phase, with_tau=with_tau) * weight_per_grid_point ) @@ -1114,9 +1036,9 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"): density_on_g_mesh[ :, 0, - pairs["fft_grid"][0][:, None, None], - pairs["fft_grid"][1][:, None], - pairs["fft_grid"][2], + pairs['fft_grid'][0][:, None, None], + pairs['fft_grid'][1][:, None], + pairs['fft_grid'][2], ] += density if with_tau: @@ -1125,13 +1047,13 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"): density_on_g_mesh[ :, 4, - pairs["fft_grid"][0][:, None, None], - pairs["fft_grid"][1][:, None], - pairs["fft_grid"][2], + pairs['fft_grid'][0][:, None, None], + pairs['fft_grid'][1][:, None], + pairs['fft_grid'][2], ] += tau density_on_g_mesh = density_on_g_mesh.reshape([n_channels, density_slices, -1]) - if xc_type != "LDA": + if xc_type != 'LDA': for i in range(len(density_on_g_mesh)): density_on_g_mesh[i, 1:4] = iG_density(density_on_g_mesh[i], cell) @@ -1144,9 +1066,7 @@ def evaluate_xc_wrapper(pairs_info, xc_weights, img_phase, with_tau=False): n_channels = xc_weights.shape[0] # density_slices = 2 else: - assert (xc_weights.ndim == 3 + 2 and xc_weights.shape[1] == 1) or ( - xc_weights.ndim == 3 + 1 - ) + assert (xc_weights.ndim == 3 + 2 and xc_weights.shape[1] == 1) or (xc_weights.ndim == 3 + 1) n_channels = xc_weights.shape[0] # density_slices = 1 @@ -1154,11 +1074,11 @@ def evaluate_xc_wrapper(pairs_info, xc_weights, img_phase, with_tau=False): c_driver = libgpbc.evaluate_xc_with_tau_driver else: c_driver = libgpbc.evaluate_xc_driver - n_i_functions, n_j_functions = pairs_info["primitive_shape"] + n_i_functions, n_j_functions = pairs_info['primitive_shape'] phase_diff_among_images, image_pair_difference_index = img_phase n_k_points, n_difference_images = phase_diff_among_images.shape - n_images = pairs_info["neighboring_images"].shape[0] + n_images = pairs_info['neighboring_images'].shape[0] fock = cp.zeros( (n_channels, n_difference_images, n_i_functions, n_j_functions), @@ -1170,49 +1090,47 @@ def evaluate_xc_wrapper(pairs_info, xc_weights, img_phase, with_tau=False): assert xc_weights.dtype == cp.float64 use_float_precision = ctypes.c_int(0) - for gaussians_per_angular_pair in pairs_info["per_angular_pairs"]: + for gaussians_per_angular_pair in pairs_info['per_angular_pairs']: fock_slice = cp.zeros( (n_channels, n_difference_images, n_i_functions, n_j_functions), dtype=xc_weights.dtype, ) - (i_angular, j_angular) = gaussians_per_angular_pair["angular"] + (i_angular, j_angular) = gaussians_per_angular_pair['angular'] err = c_driver( cast_to_pointer(fock_slice), cast_to_pointer(xc_weights), ctypes.c_int(i_angular), ctypes.c_int(j_angular), - cast_to_pointer(gaussians_per_angular_pair["screened_shell_pairs"]), - cast_to_pointer(gaussians_per_angular_pair["i_shells"]), - cast_to_pointer(gaussians_per_angular_pair["j_shells"]), - ctypes.c_int(len(gaussians_per_angular_pair["j_shells"])), - cast_to_pointer(gaussians_per_angular_pair["shell_to_ao_indices"]), + cast_to_pointer(gaussians_per_angular_pair['screened_shell_pairs']), + cast_to_pointer(gaussians_per_angular_pair['i_shells']), + cast_to_pointer(gaussians_per_angular_pair['j_shells']), + ctypes.c_int(len(gaussians_per_angular_pair['j_shells'])), + cast_to_pointer(gaussians_per_angular_pair['shell_to_ao_indices']), ctypes.c_int(n_i_functions), ctypes.c_int(n_j_functions), - cast_to_pointer(gaussians_per_angular_pair["pair_indices_per_block"]), - cast_to_pointer(gaussians_per_angular_pair["accumulated_counts_per_block"]), - cast_to_pointer(gaussians_per_angular_pair["sorted_block_index"]), - ctypes.c_int(len(gaussians_per_angular_pair["sorted_block_index"])), - cast_to_pointer(gaussians_per_angular_pair["image_indices"]), - cast_to_pointer(pairs_info["neighboring_images"]), + cast_to_pointer(gaussians_per_angular_pair['pair_indices_per_block']), + cast_to_pointer(gaussians_per_angular_pair['accumulated_counts_per_block']), + cast_to_pointer(gaussians_per_angular_pair['sorted_block_index']), + ctypes.c_int(len(gaussians_per_angular_pair['sorted_block_index'])), + cast_to_pointer(gaussians_per_angular_pair['image_indices']), + cast_to_pointer(pairs_info['neighboring_images']), ctypes.c_int(n_images), cast_to_pointer(image_pair_difference_index), ctypes.c_int(n_difference_images), - cast_to_pointer(pairs_info["mesh"]), - cast_to_pointer(pairs_info["atm"]), - cast_to_pointer(pairs_info["bas"]), - cast_to_pointer(pairs_info["env"]), + cast_to_pointer(pairs_info['mesh']), + cast_to_pointer(pairs_info['atm']), + cast_to_pointer(pairs_info['bas']), + cast_to_pointer(pairs_info['env']), ctypes.c_int(n_channels), - ctypes.c_int(pairs_info["is_non_orthogonal"]), + ctypes.c_int(pairs_info['is_non_orthogonal']), use_float_precision, ) fock += fock_slice if err != 0: - raise RuntimeError( - f"evaluate_xc_driver for li={i_angular} lj={j_angular} failed" - ) + raise RuntimeError(f'evaluate_xc_driver for li={i_angular} lj={j_angular} failed') if not (n_k_points == 1 and n_difference_images == 1): - return cp.einsum("kt, ntij -> nkij", phase_diff_among_images, fock) + return cp.einsum('kt, ntij -> nkij', phase_diff_among_images, fock) else: return fock @@ -1235,7 +1153,7 @@ def convert_xc_on_g_mesh_to_fock( assert xc_on_g_mesh.shape[1] == 2 n_channels = xc_on_g_mesh.shape[0] else: - raise ValueError("Incorrect shape of xc_on_g_mesh = {xc_on_g_mesh.shape}") + raise ValueError('Incorrect shape of xc_on_g_mesh = {xc_on_g_mesh.shape}') density_slices = 2 else: if xc_on_g_mesh.ndim == 1: @@ -1246,7 +1164,7 @@ def convert_xc_on_g_mesh_to_fock( assert xc_on_g_mesh.shape[1] == 1 n_channels = xc_on_g_mesh.shape[0] else: - raise ValueError("Incorrect shape of xc_on_g_mesh = {xc_on_g_mesh.shape}") + raise ValueError('Incorrect shape of xc_on_g_mesh = {xc_on_g_mesh.shape}') density_slices = 1 xc_on_g_mesh = xc_on_g_mesh.reshape(n_channels, density_slices, *mydf.mesh) @@ -1269,32 +1187,29 @@ def convert_xc_on_g_mesh_to_fock( fock = cp.zeros((n_channels, n_k_points, nao, nao), dtype=data_type) for pairs in mydf.sorted_gaussian_pairs: - interpolated_xc = xc_on_g_mesh[ :, :, - pairs["fft_grid"][0][:, None, None], - pairs["fft_grid"][1][:, None], - pairs["fft_grid"][2], + pairs['fft_grid'][0][:, None, None], + pairs['fft_grid'][1][:, None], + pairs['fft_grid'][2], ] - interpolated_xc = cp.asarray(ifft_in_place(interpolated_xc).real, order="C") + interpolated_xc = cp.asarray(ifft_in_place(interpolated_xc).real, order='C') - n_ao_in_localized = len(pairs["ao_indices_in_localized"]) - libgpbc.update_dxyz_dabc(pairs["dxyz_dabc"].ctypes) - img_phase = image_phase_for_kpts(cell, pairs["neighboring_images"], kpts) - fock_slice = evaluate_xc_wrapper( - pairs, interpolated_xc, img_phase, with_tau=with_tau - ) + n_ao_in_localized = len(pairs['ao_indices_in_localized']) + libgpbc.update_dxyz_dabc(pairs['dxyz_dabc'].ctypes) + img_phase = image_phase_for_kpts(cell, pairs['neighboring_images'], kpts) + fock_slice = evaluate_xc_wrapper(pairs, interpolated_xc, img_phase, with_tau=with_tau) n_sets, n_k_points = fock_slice.shape[:2] - fock_slice = fock_slice.reshape(-1, *pairs["primitive_shape"]) + fock_slice = fock_slice.reshape(-1, *pairs['primitive_shape']) fock_slice = primitive_to_contracted( fock_slice, - pairs["coeff_in_localized"], - pairs["concatenated_coeff"], - pairs["contracted_shape"], - ).reshape(n_sets, n_k_points, *pairs["contracted_shape"]) + pairs['coeff_in_localized'], + pairs['concatenated_coeff'], + pairs['contracted_shape'], + ).reshape(n_sets, n_k_points, *pairs['contracted_shape']) # While mathematically it is correct to have concatenated # ao indices in the addition, but it is possible that the ao @@ -1308,25 +1223,23 @@ def convert_xc_on_g_mesh_to_fock( fock[ :, :, - pairs["ao_indices_in_localized"][:, None], - pairs["ao_indices_in_localized"], + pairs['ao_indices_in_localized'][:, None], + pairs['ao_indices_in_localized'], ] += fock_slice[:, :, :, :n_ao_in_localized] fock[ :, :, - pairs["ao_indices_in_localized"][:, None], - pairs["ao_indices_in_diffused"], + pairs['ao_indices_in_localized'][:, None], + pairs['ao_indices_in_diffused'], ] += fock_slice[:, :, :, n_ao_in_localized:] if hermi == 1: fock[ :, :, - pairs["ao_indices_in_diffused"][:, None], - pairs["ao_indices_in_localized"], - ] += ( - fock_slice[:, :, :, n_ao_in_localized:].transpose(0, 1, 3, 2).conj() - ) + pairs['ao_indices_in_diffused'][:, None], + pairs['ao_indices_in_localized'], + ] += fock_slice[:, :, :, n_ao_in_localized:].transpose(0, 1, 3, 2).conj() else: raise NotImplementedError @@ -1347,9 +1260,7 @@ def evaluate_xc_gradient_wrapper( n_channels = xc_weights.shape[0] # density_slices = 2 else: - assert (xc_weights.ndim == 3 + 2 and xc_weights.shape[1] == 1) or ( - xc_weights.ndim == 3 + 1 - ) + assert (xc_weights.ndim == 3 + 2 and xc_weights.shape[1] == 1) or (xc_weights.ndim == 3 + 1) n_channels = xc_weights.shape[0] # density_slices = 1 @@ -1365,63 +1276,57 @@ def evaluate_xc_gradient_wrapper( else: use_float_precision = ctypes.c_int(0) - n_images = pairs_info["neighboring_images"].shape[0] + n_images = pairs_info['neighboring_images'].shape[0] phase_diff_among_images, image_pair_difference_index = img_phase n_k_points, n_difference_images = phase_diff_among_images.shape if n_k_points == 1 and n_difference_images == 1: density_matrix_with_translation = dm_slice else: - density_matrix_with_translation = cp.einsum( - "kt, ikpq->itpq", phase_diff_among_images.conj(), dm_slice - ) + density_matrix_with_translation = cp.einsum('kt, ikpq->itpq', phase_diff_among_images.conj(), dm_slice) n_channels, _, n_i_functions, n_j_functions = density_matrix_with_translation.shape if ignore_imag is False: raise NotImplementedError - density_matrix_with_translation_real_part = cp.asarray( - density_matrix_with_translation.real, order="C" - ) + density_matrix_with_translation_real_part = cp.asarray(density_matrix_with_translation.real, order='C') assert gradient.dtype == density_matrix_with_translation_real_part.dtype - for gaussians_per_angular_pair in pairs_info["per_angular_pairs"]: - (i_angular, j_angular) = gaussians_per_angular_pair["angular"] + for gaussians_per_angular_pair in pairs_info['per_angular_pairs']: + (i_angular, j_angular) = gaussians_per_angular_pair['angular'] err = c_driver( cast_to_pointer(gradient), cast_to_pointer(xc_weights), cast_to_pointer(density_matrix_with_translation_real_part), ctypes.c_int(i_angular), ctypes.c_int(j_angular), - cast_to_pointer(gaussians_per_angular_pair["screened_shell_pairs"]), - cast_to_pointer(gaussians_per_angular_pair["i_shells"]), - cast_to_pointer(gaussians_per_angular_pair["j_shells"]), - ctypes.c_int(len(gaussians_per_angular_pair["j_shells"])), - cast_to_pointer(gaussians_per_angular_pair["shell_to_ao_indices"]), + cast_to_pointer(gaussians_per_angular_pair['screened_shell_pairs']), + cast_to_pointer(gaussians_per_angular_pair['i_shells']), + cast_to_pointer(gaussians_per_angular_pair['j_shells']), + ctypes.c_int(len(gaussians_per_angular_pair['j_shells'])), + cast_to_pointer(gaussians_per_angular_pair['shell_to_ao_indices']), ctypes.c_int(n_i_functions), ctypes.c_int(n_j_functions), - cast_to_pointer(gaussians_per_angular_pair["pair_indices_per_block"]), - cast_to_pointer(gaussians_per_angular_pair["accumulated_counts_per_block"]), - cast_to_pointer(gaussians_per_angular_pair["sorted_block_index"]), - ctypes.c_int(len(gaussians_per_angular_pair["sorted_block_index"])), - cast_to_pointer(gaussians_per_angular_pair["image_indices"]), - cast_to_pointer(pairs_info["neighboring_images"]), + cast_to_pointer(gaussians_per_angular_pair['pair_indices_per_block']), + cast_to_pointer(gaussians_per_angular_pair['accumulated_counts_per_block']), + cast_to_pointer(gaussians_per_angular_pair['sorted_block_index']), + ctypes.c_int(len(gaussians_per_angular_pair['sorted_block_index'])), + cast_to_pointer(gaussians_per_angular_pair['image_indices']), + cast_to_pointer(pairs_info['neighboring_images']), ctypes.c_int(n_images), cast_to_pointer(image_pair_difference_index), ctypes.c_int(n_difference_images), - cast_to_pointer(pairs_info["mesh"]), - cast_to_pointer(pairs_info["atm"]), - cast_to_pointer(pairs_info["bas"]), - cast_to_pointer(pairs_info["env"]), + cast_to_pointer(pairs_info['mesh']), + cast_to_pointer(pairs_info['atm']), + cast_to_pointer(pairs_info['bas']), + cast_to_pointer(pairs_info['env']), ctypes.c_int(n_channels), - ctypes.c_int(pairs_info["is_non_orthogonal"]), + ctypes.c_int(pairs_info['is_non_orthogonal']), use_float_precision, ) if err != 0: - raise RuntimeError( - f"evaluate_xc_gradient_driver for li={i_angular} lj={j_angular} failed" - ) + raise RuntimeError(f'evaluate_xc_gradient_driver for li={i_angular} lj={j_angular} failed') def convert_xc_on_g_mesh_to_fock_gradient( @@ -1433,7 +1338,7 @@ def convert_xc_on_g_mesh_to_fock_gradient( with_tau=False, ): cell = mydf.cell - dm_kpts = cp.asarray(dm_kpts, order="C") + dm_kpts = cp.asarray(dm_kpts, order='C') dms = _format_dms(dm_kpts, kpts) n_atoms = cell.natm @@ -1451,47 +1356,45 @@ def convert_xc_on_g_mesh_to_fock_gradient( interpolated_xc = xc_on_g_mesh[ :, :, - pairs["fft_grid"][0][:, None, None], - pairs["fft_grid"][1][:, None], - pairs["fft_grid"][2], + pairs['fft_grid'][0][:, None, None], + pairs['fft_grid'][1][:, None], + pairs['fft_grid'][2], ] - interpolated_xc = cp.asarray(ifft_in_place(interpolated_xc).real, order="C") + interpolated_xc = cp.asarray(ifft_in_place(interpolated_xc).real, order='C') density_matrix_slice = dms[ :, :, - pairs["ao_indices_in_localized"][:, None], - pairs["concatenated_ao_indices"], + pairs['ao_indices_in_localized'][:, None], + pairs['concatenated_ao_indices'], ] density_matrix_with_rows_in_diffused = dms[ :, :, - pairs["ao_indices_in_diffused"][:, None], - pairs["ao_indices_in_localized"], + pairs['ao_indices_in_diffused'][:, None], + pairs['ao_indices_in_localized'], ] n_ao_in_localized = density_matrix_slice.shape[2] - density_matrix_slice[ - :, :, :, n_ao_in_localized: - ] += density_matrix_with_rows_in_diffused.transpose(0, 1, 3, 2).conj() + density_matrix_slice[:, :, :, n_ao_in_localized:] += density_matrix_with_rows_in_diffused.transpose( + 0, 1, 3, 2 + ).conj() n_sets, n_k_points = density_matrix_slice.shape[:2] - density_matrix_slice = density_matrix_slice.reshape( - -1, *pairs["contracted_shape"] - ) + density_matrix_slice = density_matrix_slice.reshape(-1, *pairs['contracted_shape']) coeff_sandwiched_density_matrix = contracted_to_primitive( density_matrix_slice, - pairs["coeff_in_localized"], - pairs["concatenated_coeff"], - pairs["primitive_shape"], - ).reshape(n_sets, n_k_points, *pairs["primitive_shape"]) + pairs['coeff_in_localized'], + pairs['concatenated_coeff'], + pairs['primitive_shape'], + ).reshape(n_sets, n_k_points, *pairs['primitive_shape']) - libgpbc.update_dxyz_dabc(pairs["dxyz_dabc"].ctypes) + libgpbc.update_dxyz_dabc(pairs['dxyz_dabc'].ctypes) - img_phase = image_phase_for_kpts(cell, pairs["neighboring_images"], kpts) + img_phase = image_phase_for_kpts(cell, pairs['neighboring_images'], kpts) evaluate_xc_gradient_wrapper( gradient, pairs, @@ -1540,9 +1443,9 @@ def get_pp(ni, kpts=None): mesh = ni.mesh # Compute the vpplocG as # -einsum('ij,ij->j', pseudo.get_vlocG(cell, Gv), cell.get_SI(Gv)) - vpplocG = multigrid_v1.eval_vpplocG(cell, mesh) + vpplocG = ni.vpplocG vpp = convert_xc_on_g_mesh_to_fock(ni, vpplocG, hermi=1, kpts=kpts)[0] - t1 = log.timer_debug1("vpploc", *t0) + t1 = log.timer_debug1('vpploc', *t0) vppnl = pp_int.get_pp_nl(cell, kpts) for k, kpt in enumerate(kpts): @@ -1553,8 +1456,8 @@ def get_pp(ni, kpts=None): if is_single_kpt: vpp = vpp[0] - log.timer_debug1("vppnl", *t1) - log.timer("get_pp", *t0) + log.timer_debug1('vppnl', *t1) + log.timer('get_pp', *t0) return vpp @@ -1583,7 +1486,7 @@ def get_j_kpts(ni, dm_kpts, hermi=1, kpts=None, kpts_band=None): cell = ni.cell log = logger.new_logger(cell) t0 = log.init_timer() - dm_kpts = cp.asarray(dm_kpts, order="C") + dm_kpts = cp.asarray(dm_kpts, order='C') dms = _format_dms(dm_kpts, kpts) nset = dms.shape[0] mesh = ni.mesh @@ -1594,7 +1497,7 @@ def get_j_kpts(ni, dm_kpts, hermi=1, kpts=None, kpts_band=None): # coulomb_kernel_on_g_mesh = pbc_tools.get_coulG(cell, Gv=Gv) coulomb_kernel_on_g_mesh = ni.coulG - coulomb_on_g_mesh = cp.einsum("ng, g -> g", density[:, 0], coulomb_kernel_on_g_mesh) + coulomb_on_g_mesh = cp.einsum('ng, g -> g', density[:, 0], coulomb_kernel_on_g_mesh) weight = cell.vol / ngrids density = density.reshape(-1, *mesh) @@ -1607,7 +1510,7 @@ def get_j_kpts(ni, dm_kpts, hermi=1, kpts=None, kpts_band=None): # ni = ni.copy().reset().build() kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band xc_for_fock = convert_xc_on_g_mesh_to_fock(ni, coulomb_on_g_mesh, hermi, kpts_band) - t0 = log.timer("vj", *t0) + t0 = log.timer('vj', *t0) return _format_jks(xc_for_fock, dm_kpts, input_band, kpts) @@ -1655,7 +1558,7 @@ def nr_rks( kpts = np.zeros((1, 3)) else: kpts = kpts.reshape(-1, 3) - dm_kpts = cp.asarray(dm_kpts, order="C") + dm_kpts = cp.asarray(dm_kpts, order='C') dms = _format_dms(dm_kpts, kpts) nset = dms.shape[0] dms = None @@ -1670,8 +1573,8 @@ def nr_rks( coulomb_on_g_mesh = rho_sf * ni.coulG coulomb_energy = 0.5 * rho_sf.conj().dot(coulomb_on_g_mesh).real coulomb_energy /= cell.vol - log.debug("Multigrid Coulomb energy %s", coulomb_energy) - t0 = log.timer("coulomb", *t0) + log.debug('Multigrid Coulomb energy %s', coulomb_energy) + t0 = log.timer('coulomb', *t0) weight = cell.vol / ngrids density = ifft_in_place(density.reshape(-1, *mesh)).real.reshape(-1, ngrids) @@ -1679,17 +1582,13 @@ def nr_rks( density /= weight # eval_xc_eff supports float64 only - density = cp.asarray(density, dtype=np.float64, order="C") - if xc_type == "LDA": - xc_for_energy, xc_for_fock = ni.eval_xc_eff( - xc_code, density[0], deriv=1, xctype=xc_type - )[:2] - elif xc_type == "GGA" or xc_type == "MGGA": - xc_for_energy, xc_for_fock = ni.eval_xc_eff( - xc_code, density, deriv=1, xctype=xc_type - )[:2] + density = cp.asarray(density, dtype=np.float64, order='C') + if xc_type == 'LDA': + xc_for_energy, xc_for_fock = ni.eval_xc_eff(xc_code, density[0], deriv=1, xctype=xc_type)[:2] + elif xc_type == 'GGA' or xc_type == 'MGGA': + xc_for_energy, xc_for_fock = ni.eval_xc_eff(xc_code, density, deriv=1, xctype=xc_type)[:2] else: - raise ValueError(f"Incorrect xc_type = {xc_type}") + raise ValueError(f'Incorrect xc_type = {xc_type}') rho_sf = density[0].real xc_energy_sum = float(rho_sf.dot(xc_for_energy.ravel()).get()) * weight @@ -1699,15 +1598,15 @@ def nr_rks( xc_for_fock *= weight xc_for_fock = fft_in_place(xc_for_fock.reshape(-1, *mesh)).reshape(-1, ngrids) - log.debug("Multigrid exc %s nelec %s", xc_energy_sum, n_electrons) + log.debug('Multigrid exc %s nelec %s', xc_energy_sum, n_electrons) - if xc_type == "LDA": + if xc_type == 'LDA': pass - elif xc_type == "GGA": + elif xc_type == 'GGA': contract_iG_potential(xc_for_fock, cell) xc_for_fock = xc_for_fock[0] xc_for_fock = xc_for_fock.reshape((-1, ngrids)) - elif xc_type == "MGGA": + elif xc_type == 'MGGA': xc_for_fock = contract_iG_potential(xc_for_fock, cell) xc_for_fock = cp.concatenate( [ @@ -1717,18 +1616,16 @@ def nr_rks( axis=0, ) else: - raise ValueError(f"Incorrect xc_type = {xc_type}") + raise ValueError(f'Incorrect xc_type = {xc_type}') if with_j: xc_for_fock[0] += coulomb_on_g_mesh kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band - veff = convert_xc_on_g_mesh_to_fock( - ni, xc_for_fock, hermi, kpts_band, with_tau=(xc_type == "MGGA") - ) + veff = convert_xc_on_g_mesh_to_fock(ni, xc_for_fock, hermi, kpts_band, with_tau=(xc_type == 'MGGA')) veff = _format_jks(veff, dm_kpts, input_band, kpts) veff = tag_array(veff, ecoul=coulomb_energy, exc=xc_energy_sum, vj=None, vk=None) - t0 = log.timer("xc", *t0) + t0 = log.timer('xc', *t0) return n_electrons, xc_energy_sum, veff @@ -1778,7 +1675,7 @@ def nr_uks( kpts = np.zeros((1, 3)) else: kpts = kpts.reshape(-1, 3) - dm_kpts = cp.asarray(dm_kpts, order="C") + dm_kpts = cp.asarray(dm_kpts, order='C') dms = _format_dms(dm_kpts, kpts) nset = dms.shape[0] dms = None @@ -1793,8 +1690,8 @@ def nr_uks( coulomb_on_g_mesh = rho_sf * ni.coulG coulomb_energy = 0.5 * rho_sf.conj().dot(coulomb_on_g_mesh).real coulomb_energy /= cell.vol - log.debug("Multigrid Coulomb energy %s", coulomb_energy) - t0 = log.timer("coulomb", *t0) + log.debug('Multigrid Coulomb energy %s', coulomb_energy) + t0 = log.timer('coulomb', *t0) weight = cell.vol / ngrids density = density.reshape(-1, *mesh) @@ -1803,17 +1700,13 @@ def nr_uks( density /= weight # eval_xc_eff supports float64 only - density = cp.asarray(density, dtype=np.float64, order="C") - if xc_type == "LDA": - xc_for_energy, xc_for_fock = ni.eval_xc_eff( - xc_code, density[:, 0], deriv=1, xctype=xc_type - )[:2] - elif xc_type == "GGA" or xc_type == "MGGA": - xc_for_energy, xc_for_fock = ni.eval_xc_eff( - xc_code, density, deriv=1, xctype=xc_type - )[:2] + density = cp.asarray(density, dtype=np.float64, order='C') + if xc_type == 'LDA': + xc_for_energy, xc_for_fock = ni.eval_xc_eff(xc_code, density[:, 0], deriv=1, xctype=xc_type)[:2] + elif xc_type == 'GGA' or xc_type == 'MGGA': + xc_for_energy, xc_for_fock = ni.eval_xc_eff(xc_code, density, deriv=1, xctype=xc_type)[:2] else: - raise ValueError(f"Incorrect xc_type = {xc_type}") + raise ValueError(f'Incorrect xc_type = {xc_type}') rho_sf = (density[0, 0] + density[1, 0]).real xc_energy_sum = float(rho_sf.dot(xc_for_energy.ravel()).real.get()) * weight @@ -1823,15 +1716,15 @@ def nr_uks( xc_for_fock *= weight xc_for_fock = fft_in_place(xc_for_fock.reshape(-1, *mesh)).reshape(nset, -1, ngrids) - log.debug("Multigrid exc %s nelec %s", xc_energy_sum, n_electrons) + log.debug('Multigrid exc %s nelec %s', xc_energy_sum, n_electrons) - if xc_type == "LDA": + if xc_type == 'LDA': pass - elif xc_type == "GGA": + elif xc_type == 'GGA': for i in xc_for_fock: contract_iG_potential(i, cell) xc_for_fock = xc_for_fock.reshape((nset, -1, ngrids)) - elif xc_type == "MGGA": + elif xc_type == 'MGGA': for i in xc_for_fock: contract_iG_potential(i, cell) xc_for_fock = cp.concatenate( @@ -1842,18 +1735,16 @@ def nr_uks( axis=1, ) else: - raise ValueError(f"Incorrect xc_type = {xc_type}") + raise ValueError(f'Incorrect xc_type = {xc_type}') if with_j: xc_for_fock[:, 0] += coulomb_on_g_mesh kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band - veff = convert_xc_on_g_mesh_to_fock( - ni, xc_for_fock, hermi, kpts_band, with_tau=(xc_type == "MGGA") - ) + veff = convert_xc_on_g_mesh_to_fock(ni, xc_for_fock, hermi, kpts_band, with_tau=(xc_type == 'MGGA')) veff = _format_jks(veff, dm_kpts, input_band, kpts) veff = tag_array(veff, ecoul=coulomb_energy, exc=xc_energy_sum, vj=None, vk=None) - t0 = log.timer("xc", *t0) + t0 = log.timer('xc', *t0) return n_electrons, xc_energy_sum, veff @@ -1881,21 +1772,21 @@ def get_veff_ip1( with_pseudo_vloc_orbital_derivative=True, verbose=None, ): - '''Computes the derivatives of the Exc along with additional contributions + """Computes the derivatives of the Exc along with additional contributions from the Coulomb and pseudopotential terms. Note, the current return is the energy per cell scaled by the number of k-points. This should return the energy per cell directly and will be changed in future. - ''' + """ if kpts is None: kpts = np.zeros((1, 3)) else: kpts = kpts.reshape(-1, 3) - log = logger.new_logger(ni, verbose) - t0 = log.init_timer() cell = ni.cell - dm_kpts = cp.asarray(dm_kpts, order="C") + log = logger.new_logger(cell) + t0 = log.init_timer() + dm_kpts = cp.asarray(dm_kpts, order='C') dms = _format_dms(dm_kpts, kpts) nset = dms.shape[0] dms = None @@ -1907,7 +1798,7 @@ def get_veff_ip1( density = evaluate_density_on_g_mesh(ni, dm_kpts, kpts, xc_type) Gv = pbc_tools._get_Gv(cell, mesh) - coulomb_on_g_mesh = cp.einsum("ng, g -> g", density[:, 0], ni.coulG) + coulomb_on_g_mesh = cp.einsum('ng, g -> g', density[:, 0], ni.coulG) weight = cell.vol / ngrids @@ -1916,7 +1807,7 @@ def get_veff_ip1( density = ( cp.asarray( ifft_in_place(density.reshape(nset, -1, *mesh)).real, - order="C", + order='C', ).reshape(nset, -1, ngrids) / weight ) @@ -1929,15 +1820,15 @@ def get_veff_ip1( xc_for_fock = xc_for_fock.reshape(nset, -1, *mesh) * weight xc_for_fock = fft_in_place(xc_for_fock).reshape(nset, -1, ngrids) - if xc_type == "LDA": + if xc_type == 'LDA': pass - elif xc_type == "GGA": + elif xc_type == 'GGA': for i in xc_for_fock: contract_iG_potential(i, cell) xc_for_fock = xc_for_fock[:, 0] xc_for_fock = xc_for_fock.reshape((nset, -1, ngrids)) - elif xc_type == "MGGA": - xc_for_fock[:, 0] -= contract("ngp, pg -> np", xc_for_fock[:, 1:4], Gv) * 1j + elif xc_type == 'MGGA': + xc_for_fock[:, 0] -= contract('ngp, pg -> np', xc_for_fock[:, 1:4], Gv) * 1j xc_for_fock = cp.concatenate( [ xc_for_fock[:, 0].reshape((nset, -1, ngrids)), @@ -1946,22 +1837,19 @@ def get_veff_ip1( axis=1, ) else: - raise ValueError(f"Incorrect xc_type = {xc_type}") + raise ValueError(f'Incorrect xc_type = {xc_type}') if with_j: xc_for_fock[:, 0] += coulomb_on_g_mesh if with_pseudo_vloc_orbital_derivative: - if cell._pseudo: - xc_for_fock[:, 0] += multigrid_v1.eval_vpplocG(cell, mesh) - else: - xc_for_fock[:, 0] += multigrid_v1.eval_nucG(cell, mesh) + xc_for_fock[:, 0] += ni.vpplocG veff_gradient = convert_xc_on_g_mesh_to_fock_gradient( - ni, xc_for_fock, dm_kpts, hermi, kpts_band, with_tau=(xc_type == "MGGA") + ni, xc_for_fock, dm_kpts, hermi, kpts_band, with_tau=(xc_type == 'MGGA') ) - t0 = log.timer("veff_gradient", *t0) + t0 = log.timer('veff_gradient', *t0) return veff_gradient @@ -1974,6 +1862,11 @@ def __init__(self, cell): self.sorted_gaussian_pairs = None Gv = pbc_tools._get_Gv(cell, cell.mesh) self.coulG = pbc_tools.get_coulG(cell, Gv=Gv) + if cell._pseudo: + self.vpplocG = multigrid_v1.eval_vpplocG(cell, cell.mesh) + else: + self.vpplocG = multigrid_v1.eval_nucG(cell, cell.mesh) + self.build() build = sort_gaussian_pairs @@ -1985,7 +1878,7 @@ def reset(self, cell=None): self.sorted_gaussian_pairs = None return self - def get_j(self, dm, hermi=1, kpts=None, kpts_band=None, omega=None, exxdiv="ewald"): + def get_j(self, dm, hermi=1, kpts=None, kpts_band=None, omega=None, exxdiv='ewald'): if kpts is not None: raise NotImplementedError vj = get_j_kpts(self, dm, hermi, kpts, kpts_band) @@ -2012,4 +1905,4 @@ def get_j(self, dm, hermi=1, kpts=None, kpts_band=None, omega=None, exxdiv="ewal device = utils.device def to_cpu(self): - raise RuntimeError("Not available") + raise RuntimeError('Not available') diff --git a/gpu4pyscf/pbc/grad/rhf.py b/gpu4pyscf/pbc/grad/rhf.py index 491bf4be1..7670914c9 100644 --- a/gpu4pyscf/pbc/grad/rhf.py +++ b/gpu4pyscf/pbc/grad/rhf.py @@ -28,22 +28,23 @@ import gpu4pyscf.pbc.dft.multigrid as multigrid_v1 from gpu4pyscf.pbc.gto import int1e from gpu4pyscf.pbc.grad.pp import vppnl_nuc_grad +from gpu4pyscf.lib import logger, utils __all__ = ['Gradients'] + class GradientsBase(mol_rhf.GradientsBase): get_ovlp = NotImplemented grad_nuc = cpu_rhf.GradientsBase.grad_nuc def optimizer(self): - '''Geometry (atom positions and lattice) optimization solver - ''' + """Geometry (atom positions and lattice) optimization solver""" from gpu4pyscf.geomopt.ase_solver import GeometryOptimizer + return GeometryOptimizer(self.base) class Gradients(GradientsBase): - make_rdm1e = mol_rhf.Gradients.make_rdm1e def get_veff(self, cell=None, dm=None, kpt=None, verbose=None): @@ -57,6 +58,7 @@ def grad_elec( atmlst=None, ): from gpu4pyscf.pbc.grad.krhf import contract_h1e_dm + mf = self.base cell = mf.cell kpt = mf.kpt @@ -74,10 +76,14 @@ def grad_elec( atmlst = range(cell.natm) with_rsjk = mf.rsjk + + log = logger.new_logger(cell) + # TODO: handle all-electron+GGA and pseudo+GGA differently # pseudo+GGA does not need to evaluate the gradients with PBCJKMatrixOpt if with_rsjk is not None: from gpu4pyscf.pbc.scf.rsjk import PBCJKMatrixOpt + assert isinstance(with_rsjk, PBCJKMatrixOpt) if hasattr(mf, 'xc'): ni = mf._numint @@ -87,7 +93,9 @@ def grad_elec( with_rsjk = PBCJKMatrixOpt(cell, omega=omega).build() if with_rsjk.supmol is None: with_rsjk.build() - de = multigrid_v2.get_veff_ip1(ni, mf.xc, dm0, with_j=True, with_pseudo_vloc_orbital_derivative=True).get() + de = multigrid_v2.get_veff_ip1( + ni, mf.xc, dm0, with_j=True, with_pseudo_vloc_orbital_derivative=True + ).get() j_factor = 0 else: ni = multigrid_v2.MultiGridNumInt(cell).build() @@ -95,38 +103,41 @@ def grad_elec( de = 0 if cell._pseudo: vpplocG = multigrid_v1.eval_vpplocG(ni.cell, ni.mesh) - de = multigrid_v2.convert_xc_on_g_mesh_to_fock_gradient( - ni, vpplocG.reshape(1,1,-1), dm0).get() + de = multigrid_v2.convert_xc_on_g_mesh_to_fock_gradient(ni, vpplocG.reshape(1, 1, -1), dm0).get() else: raise NotImplementedError - ejk = with_rsjk._get_ejk_sr_ip1(dm0, kpts=kpt, exxdiv=mf.exxdiv, - j_factor=j_factor, k_factor=k_sr) - ejk += with_rsjk._get_ejk_lr_ip1(dm0, kpts=kpt, exxdiv=mf.exxdiv, - j_factor=j_factor, k_factor=k_lr) - de += ejk*2 + ejk = with_rsjk._get_ejk_sr_ip1(dm0, kpts=kpt, exxdiv=mf.exxdiv, j_factor=j_factor, k_factor=k_sr) + ejk += with_rsjk._get_ejk_lr_ip1(dm0, kpts=kpt, exxdiv=mf.exxdiv, j_factor=j_factor, k_factor=k_lr) + de += ejk * 2 else: assert hasattr(mf, 'xc'), 'HF gradients not supported' ni = mf._numint assert isinstance(ni, multigrid_v2.MultiGridNumInt) de = multigrid_v2.get_veff_ip1(ni, mf.xc, dm0, with_j=True, with_pseudo_vloc_orbital_derivative=True).get() + t0 = log.init_timer() s1 = int1e.int1e_ipovlp(cell)[0] de += contract_h1e_dm(cell, s1, dme0, hermi=1) + t0 = log.timer('ovlp gradient', *t0) # the CPU code requires the attribute .rhoG rhoG = multigrid_v2.evaluate_density_on_g_mesh(ni, dm0) - rhoG = rhoG[0,0] + rhoG = rhoG[0, 0] + if cell._pseudo: de += multigrid_v1.eval_vpplocG_SI_gradient(cell, ni.mesh, rhoG).get() de += vppnl_nuc_grad(cell, dm0.get()) else: de += multigrid_v1.eval_nucG_SI_gradient(cell, ni.mesh, rhoG).get() + t0 = log.timer('other pp gradient', *t0) rhoG = None core_hamiltonian_gradient = int1e.int1e_ipkin(cell)[0] de -= contract_h1e_dm(cell, core_hamiltonian_gradient, dm0, hermi=1) + t0 = log.timer('kinetic gradient', *t0) return de def get_stress(self): from gpu4pyscf.pbc.grad import rhf_stress + return rhf_stress.kernel(self) From ff0c6322a61649f5a012f4c586cae589d6b603fc Mon Sep 17 00:00:00 2001 From: Rui Li Date: Wed, 18 Mar 2026 14:11:54 -0700 Subject: [PATCH 09/14] adjust for k-point calculation --- gpu4pyscf/pbc/dft/multigrid_v2.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/gpu4pyscf/pbc/dft/multigrid_v2.py b/gpu4pyscf/pbc/dft/multigrid_v2.py index ccedf4fb6..0f22c23ca 100644 --- a/gpu4pyscf/pbc/dft/multigrid_v2.py +++ b/gpu4pyscf/pbc/dft/multigrid_v2.py @@ -8,8 +8,7 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. @@ -290,7 +289,7 @@ def contracted_to_primitive(batched_matrices, sorted_coeffs_left, sorted_coeffs_ n_cols_primitive = primitive_shape[1] intermediate_shape = (n_slices, n_rows_primitive, n_cols) - intermediate = cp.zeros(intermediate_shape) + intermediate = cp.zeros(intermediate_shape, dtype=batched_matrices.dtype) for i in sorted_coeffs_left: subarray_shape = (n_slices, -1, i['shape'][1], n_cols) @@ -303,7 +302,7 @@ def contracted_to_primitive(batched_matrices, sorted_coeffs_left, sorted_coeffs_ intermediate = intermediate.transpose(0, 2, 1) result_shape = (n_slices, n_cols_primitive, n_rows_primitive) - result = cp.zeros(result_shape) + result = cp.zeros(result_shape, dtype=batched_matrices.dtype) for i in sorted_coeffs_right: subarray_shape = (n_slices, -1, i['shape'][1], n_rows_primitive) @@ -326,7 +325,7 @@ def primitive_to_contracted(batched_matrices, sorted_coeffs_left, sorted_coeffs_ n_cols_contracted = contracted_shape[1] intermediate_shape = (n_slices, n_rows_contracted, n_cols) - intermediate = cp.zeros(intermediate_shape) + intermediate = cp.zeros(intermediate_shape, dtype=batched_matrices.dtype) for i in sorted_coeffs_left: subarray_shape = (n_slices, -1, i['shape'][0], n_cols) @@ -338,7 +337,7 @@ def primitive_to_contracted(batched_matrices, sorted_coeffs_left, sorted_coeffs_ intermediate = intermediate.transpose(0, 2, 1) result_shape = (n_slices, n_cols_contracted, n_rows_contracted) - result = cp.zeros(result_shape) + result = cp.zeros(result_shape, dtype=batched_matrices.dtype) for i in sorted_coeffs_right: subarray_shape = (n_slices, -1, i['shape'][0], n_rows_contracted) @@ -958,7 +957,6 @@ def evaluate_density_wrapper(pairs_info, dm_slice, img_phase, ignore_imag=True, def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type='LDA'): - dm_kpts = cp.asarray(dm_kpts, order='C') dms = _format_dms(dm_kpts, kpts) n_channels, n_k_points = dms.shape[:2] if mydf.sorted_gaussian_pairs is None: @@ -976,6 +974,7 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type='LDA'): raise ValueError(f'Incorrect xc_type = {xc_type}') cell = mydf.cell + log = logger.new_logger(cell, mydf.verbose) nx, ny, nz = mydf.mesh density_on_g_mesh = cp.zeros((n_channels, density_slices, nx, ny, nz), dtype=cp.complex128) @@ -986,6 +985,8 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type='LDA'): n_grid_points = np.prod(mesh) weight_per_grid_point = 1.0 / n_k_points * cell.vol / n_grid_points + breakpoint() + density_matrix_with_rows_in_localized = dms[ :, :, @@ -1144,7 +1145,7 @@ def convert_xc_on_g_mesh_to_fock( ): cell = mydf.cell nao = cell.nao_nr() - + log = logger.new_logger(cell, mydf.verbose) if with_tau: if xc_on_g_mesh.ndim == 2: assert xc_on_g_mesh.shape[0] == 2 From e507aa59eb928a55d089d815fa285f3701056c1f Mon Sep 17 00:00:00 2001 From: Rui Li Date: Wed, 18 Mar 2026 14:55:37 -0700 Subject: [PATCH 10/14] revert formatting --- gpu4pyscf/pbc/dft/multigrid_v2.py | 525 +++++++++++++++--------------- 1 file changed, 255 insertions(+), 270 deletions(-) diff --git a/gpu4pyscf/pbc/dft/multigrid_v2.py b/gpu4pyscf/pbc/dft/multigrid_v2.py index 8ce6e7439..c3160be0f 100644 --- a/gpu4pyscf/pbc/dft/multigrid_v2.py +++ b/gpu4pyscf/pbc/dft/multigrid_v2.py @@ -8,7 +8,8 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. @@ -36,7 +37,7 @@ __all__ = ['MultiGridNumInt'] -libgpbc = load_library('libmgrid_v2') +libgpbc = load_library("libmgrid_v2") libgpbc.evaluate_density_driver.restype = ctypes.c_int libgpbc.evaluate_xc_driver.restype = ctypes.c_int libgpbc.evaluate_xc_gradient_driver.restype = ctypes.c_int @@ -51,7 +52,7 @@ def complex_type(dtype): elif dtype == cp.float64: return cp.complex128 else: - raise ValueError('Invalid dtype') + raise ValueError("Invalid dtype") def cast_to_pointer(array): @@ -60,7 +61,7 @@ def cast_to_pointer(array): elif isinstance(array, np.ndarray): return array.ctypes.data_as(ctypes.c_void_p) else: - raise ValueError('Invalid array type') + raise ValueError("Invalid array type") def fft_in_place(x): @@ -211,8 +212,8 @@ def unique_with_sort(x): return x, cp.zeros(n) sort_index = cp.argsort(x) - inverse_sort = cp.empty(n, dtype=cp.int64) - inverse_sort[sort_index] = cp.arange(0, n, dtype=cp.int64) + inverse_sort = cp.empty(n, dtype = cp.int64) + inverse_sort[sort_index] = cp.arange(0, n, dtype = cp.int64) x = x[sort_index] mask = cp.empty(n, dtype=cp.bool_) @@ -354,7 +355,7 @@ def image_pair_to_difference( vectors_to_neighboring_images, lattice_vectors, ): - """ + ''' Find unique image pairs for double lattice-sums associated with orbital products. When k-point phases are applied to orbital products with double lattice sum @@ -376,14 +377,14 @@ def image_pair_to_difference( A tuple containing: - The reduced lattice-sum vectors T for the unique image pairs. - A inverse mapping that restores the index of double lattice-sum from T. - """ + ''' vectors_to_neighboring_images = cp.asarray(vectors_to_neighboring_images) lattice_vectors = cp.asarray(lattice_vectors) translation_vectors = cp.asarray( cp.linalg.solve(lattice_vectors.T, vectors_to_neighboring_images.T).T, ) - translation_vectors = cp.asarray(cp.round(translation_vectors), dtype=cp.int32) + translation_vectors = cp.asarray(cp.round(translation_vectors), dtype = cp.int32) difference_images, inverse = _unique_image_pair(translation_vectors) difference_images = difference_images @ lattice_vectors @@ -391,36 +392,32 @@ def image_pair_to_difference( # where R1 is associated with the first orbital in a pair, and R2 associated to the second. return cp.asarray(difference_images), cp.asarray(inverse, dtype=cp.int32) - def _unique_image_pair(translation_vectors): - """ + ''' unqiue((-L[:,None] + L).reshape(-1, 3), axis=0, return_inverse=True) - """ + ''' image_difference_full = ( # -k_i + k_j corresponding to - translation_vectors[None, :, :] - translation_vectors[:, None, :] + translation_vectors[None,:,:] - translation_vectors[:,None,:] ).reshape(-1, 3) max_offset = (translation_vectors.max(axis=0) - translation_vectors.min(axis=0)).max() + 1 - assert (max_offset * 2) ** 3 < np.iinfo(np.int32).max + assert (max_offset * 2)**3 < np.iinfo(np.int32).max image_difference_3in1 = image_difference_full image_difference_3in1 += max_offset - image_difference_3in1 = ( - image_difference_3in1[:, 0] * (max_offset * 2) ** 2 - + image_difference_3in1[:, 1] * (max_offset * 2) - + image_difference_3in1[:, 2] - ) + image_difference_3in1 = image_difference_3in1[:, 0] * (max_offset * 2)**2 \ + + image_difference_3in1[:, 1] * (max_offset * 2) \ + + image_difference_3in1[:, 2] image_difference_3in1, inverse = unique_with_sort(image_difference_3in1) - translation_vectors = cp.empty([image_difference_3in1.shape[0], 3], dtype=cp.int32) - translation_vectors[:, 0] = image_difference_3in1 // (max_offset * 2) ** 2 - translation_vectors[:, 1] = (image_difference_3in1 % (max_offset * 2) ** 2) // (max_offset * 2) + translation_vectors = cp.empty([image_difference_3in1.shape[0], 3], dtype = cp.int32) + translation_vectors[:, 0] = image_difference_3in1 // (max_offset * 2)**2 + translation_vectors[:, 1] = (image_difference_3in1 % (max_offset * 2)**2) // (max_offset * 2) translation_vectors[:, 2] = image_difference_3in1 % (max_offset * 2) translation_vectors -= max_offset return translation_vectors, inverse - def image_phase_for_kpts(cell, neighboring_images, kpts=None): n_images = len(neighboring_images) if kpts is None or is_gamma_point(kpts): @@ -432,10 +429,11 @@ def image_phase_for_kpts(cell, neighboring_images, kpts=None): neighboring_images, lattice_vectors, ) - phase_diff_among_images = cp.exp(1j * cp.asarray(kpts.reshape(-1, 3)).dot(difference_images.T)) + phase_diff_among_images = cp.exp( + 1j * cp.asarray(kpts.reshape(-1, 3)).dot(difference_images.T) + ) return phase_diff_among_images, image_pair_difference_index - def count_non_trivial_pairs( i_angular, j_angular, @@ -558,7 +556,7 @@ def assign_pairs_to_blocks( ): n_blocks = np.prod(n_blocks_abc) n_pairs_on_blocks = cp.zeros(n_blocks + 1, dtype=cp.int32) - n_unstable_pairs_on_blocks = cp.zeros(n_blocks + 1, dtype=cp.int32) + n_unstable_pairs_on_blocks = cp.zeros(n_blocks + 1, dtype = cp.int32) err = libgpbc.count_pairs_on_blocks( cast_to_pointer(n_pairs_on_blocks), cast_to_pointer(n_unstable_pairs_on_blocks), @@ -576,13 +574,14 @@ def assign_pairs_to_blocks( cast_to_pointer(mesh), cast_to_pointer(atm), cast_to_pointer(bas), - cast_to_pointer(env), + cast_to_pointer(env) ) - has_unstable_pairs = n_unstable_pairs_on_blocks[-1] > 0 + has_unstable_pairs = (n_unstable_pairs_on_blocks[-1] > 0) if not has_warned_instability and has_unstable_pairs: - warnings.warn('Numerical instability may occur due to presence of core electrons or insufficient ke_cutoff.') + warnings.warn("Numerical instability may occur due to presence of core electrons or insufficient ke_cutoff.") has_warned_instability = True + if err != 0: raise RuntimeError('count_pairs_on_blocks failed') @@ -612,7 +611,7 @@ def assign_pairs_to_blocks( cast_to_pointer(mesh), cast_to_pointer(atm), cast_to_pointer(bas), - cast_to_pointer(env), + cast_to_pointer(env) ) libgpbc.tailor_gaussian_pairs( @@ -650,11 +649,11 @@ def assign_pairs_to_blocks( pairs_on_blocks, accumulated_n_pairs_per_block, sorted_block_index, - has_warned_instability, + has_warned_instability ) -def sort_gaussian_pairs(mydf, xc_type='LDA'): +def sort_gaussian_pairs(mydf, xc_type="LDA"): cell = mydf.cell log = logger.new_logger(cell) t0 = log.init_timer() @@ -667,30 +666,23 @@ def sort_gaussian_pairs(mydf, xc_type='LDA'): is_non_orthogonal = 1 else: is_non_orthogonal = 0 - reciprocal_lattice_vectors = np.asarray(np.linalg.inv(lattice_vectors.T), order='C') + reciprocal_lattice_vectors = np.asarray(np.linalg.inv(lattice_vectors.T), order="C") reciprocal_norms = np.linalg.norm(reciprocal_lattice_vectors, axis=1) libgpbc.update_lattice_vectors( lattice_vectors.ctypes, reciprocal_lattice_vectors.ctypes, - reciprocal_norms.ctypes, + reciprocal_norms.ctypes ) - tasks = getattr(mydf, 'tasks', None) + tasks = getattr(mydf, "tasks", None) if tasks is None: tasks = multigrid.multi_grids_tasks(cell, mydf.mesh, log) mydf.tasks = tasks - t0 = log.timer('task generation', *t0) + t0 = log.timer("task generation", *t0) t1 = t0 pairs = [] - - derivative_order = 0 - if xc_type == 'GGA': - derivative_order = 1 - if xc_type == 'mGGA': - raise NotImplementedError('mGGA is not supported yet in multigrid_v2.') - for grids_localized, grids_diffused in tasks: subcell_in_localized_region = grids_localized.cell # the original grids_localized.mesh has dtype=np.int64, which can cause @@ -699,17 +691,23 @@ def sort_gaussian_pairs(mydf, xc_type='LDA'): fft_grid = list( map( - lambda n_mesh_points: cp.round(cp.fft.fftfreq(n_mesh_points, 1.0 / n_mesh_points)).astype(cp.int32), + lambda n_mesh_points: cp.round(cp.fft.fftfreq( + n_mesh_points, 1.0 / n_mesh_points + )).astype(cp.int32), mesh, ) ) - dxyz_dabc = lattice_vectors / mesh[:, None] + dxyz_dabc = lattice_vectors / mesh[:,None] libgpbc.update_dxyz_dabc(dxyz_dabc.ctypes) n_blocks_abc = np.asarray(np.ceil(mesh / block_size), dtype=cp.int32) - equivalent_cell_in_localized, coeff_in_localized = subcell_in_localized_region.decontract_basis(to_cart=True) + equivalent_cell_in_localized, coeff_in_localized = ( + subcell_in_localized_region.decontract_basis(to_cart=True, aggregate=True) + ) - n_primitive_gtos_in_localized = multigrid._pgto_shells(subcell_in_localized_region) + n_primitive_gtos_in_localized = multigrid._pgto_shells( + subcell_in_localized_region + ) # theoretically we can use the rcut defined in localized cell to reduce the # number of images, but somehow it can introduce some error when the lattice @@ -735,13 +733,15 @@ def sort_gaussian_pairs(mydf, xc_type='LDA'): concatenated_coeff, concatenated_shape = sort_contraction_coefficients(concatenated_coeff) n_primitive_gtos_in_two_regions = multigrid._pgto_shells(grouped_cell) - rad = vol ** (-1.0 / 3) * cell.rcut + 1 - surface = 4 * np.pi * rad**2 + rad = vol**(-1./3) * cell.rcut + 1 + surface = 4*np.pi * rad**2 lattice_sum_factor = surface precision = cell.precision / lattice_sum_factor threshold_in_log = np.log(precision) - shell_to_ao_indices = cp.asarray(gto.moleintor.make_loc(grouped_cell._bas, 'cart'), dtype=cp.int32) + shell_to_ao_indices = cp.asarray( + gto.moleintor.make_loc(grouped_cell._bas, "cart"), dtype=cp.int32 + ) ao_indices_in_localized = cp.asarray(grids_localized.ao_idx, dtype=cp.int32) if grids_diffused is None: ao_indices_in_diffused = cp.array([], dtype=cp.int32) @@ -758,7 +758,9 @@ def sort_gaussian_pairs(mydf, xc_type='LDA'): i_shells = cp.asarray(np.where(i_angulars == l)[0], dtype=cp.int32) sorted_i_shells.append(i_shells) - j_angulars = grouped_cell._bas[:n_primitive_gtos_in_two_regions, multigrid.ANG_OF] + j_angulars = grouped_cell._bas[ + :n_primitive_gtos_in_two_regions, multigrid.ANG_OF + ] j_angulars_unique = np.unique(j_angulars) sorted_j_shells = [] for l in j_angulars_unique: @@ -769,7 +771,7 @@ def sort_gaussian_pairs(mydf, xc_type='LDA'): bas = cp.asarray(grouped_cell._bas, dtype=cp.int32) env = cp.asarray(grouped_cell._env) - t1 = log.timer_debug2('routines before screening', *t1) + t1 = log.timer_debug2("routines before screening", *t1) has_warned_instability = False for i_angular, i_shells in zip(i_angulars_unique, sorted_i_shells): for j_angular, j_shells in zip(j_angulars_unique, sorted_j_shells): @@ -790,15 +792,21 @@ def sort_gaussian_pairs(mydf, xc_type='LDA'): env, threshold_in_log, ) - t1 = log.timer_debug2('screening in angular pair' + str((i_angular, j_angular)), *t1) - contributing_block_ranges = pairs_to_blocks_end - pairs_to_blocks_begin + 1 - n_contributing_blocks_per_pair = cp.prod(contributing_block_ranges, axis=0) + t1 = log.timer_debug2( + "screening in angular pair" + str((i_angular, j_angular)), *t1 + ) + contributing_block_ranges = ( + pairs_to_blocks_end - pairs_to_blocks_begin + 1 + ) + n_contributing_blocks_per_pair = cp.prod( + contributing_block_ranges, axis=0 + ) n_indices = int(cp.sum(n_contributing_blocks_per_pair)) ( gaussian_pair_indices, accumulated_counts, sorted_contributing_blocks, - has_warned_instability, + has_warned_instability ) = assign_pairs_to_blocks( pairs_to_blocks_begin, pairs_to_blocks_end, @@ -822,8 +830,9 @@ def sort_gaussian_pairs(mydf, xc_type='LDA'): derivative_order, ) t1 = log.timer_debug2( - 'assigning pairs to blocks in angular pair' + str((i_angular, j_angular)), - *t1, + "assigning pairs to blocks in angular pair" + + str((i_angular, j_angular)), + *t1 ) per_angular_pairs.append( { @@ -863,7 +872,7 @@ def sort_gaussian_pairs(mydf, xc_type='LDA'): mydf.sorted_gaussian_pairs = pairs - t0 = log.timer('sort_gaussian_pairs', *t0) + t0 = log.timer("sort_gaussian_pairs", *t0) return mydf @@ -872,7 +881,7 @@ def evaluate_density_wrapper(pairs_info, dm_slice, img_phase, ignore_imag=True, c_driver = libgpbc.evaluate_density_tau_driver else: c_driver = libgpbc.evaluate_density_driver - n_images = pairs_info['neighboring_images'].shape[0] + n_images = pairs_info["neighboring_images"].shape[0] phase_diff_among_images, image_pair_difference_index = img_phase n_k_points, n_difference_images = phase_diff_among_images.shape if n_k_points == 1 and n_difference_images == 1: @@ -882,7 +891,9 @@ def evaluate_density_wrapper(pairs_info, dm_slice, img_phase, ignore_imag=True, # e^{i \vec{k} \cdot (\vec{R}_1 - \vec{R}_2)} # Because during grid density evaluation, rho = \sum_{\mu\nu} D_{\mu\nu} \mu \nu^* # The conjugate is on \nu, which is different from other Fock integrals - density_matrix_with_translation = cp.einsum('kt, ikpq->itpq', phase_diff_among_images.conj(), dm_slice) + density_matrix_with_translation = cp.einsum( + "kt, ikpq->itpq", phase_diff_among_images.conj(), dm_slice + ) n_channels, _, n_i_functions, n_j_functions = density_matrix_with_translation.shape @@ -894,7 +905,9 @@ def evaluate_density_wrapper(pairs_info, dm_slice, img_phase, ignore_imag=True, # assert abs(density_matrix_with_translation.imag).max() < real_dm_imag_threshold, \ # f"The dm transformed into real space contains large imaginary part " \ # f"(max = {abs(density_matrix_with_translation.imag).max()}) >= {real_dm_imag_threshold}" - density_matrix_with_translation_real_part = cp.asarray(density_matrix_with_translation.real, order='C') + density_matrix_with_translation_real_part = cp.asarray( + density_matrix_with_translation.real, order="C" + ) if density_matrix_with_translation_real_part.dtype == cp.float32: use_float_precision = ctypes.c_int(1) @@ -904,50 +917,40 @@ def evaluate_density_wrapper(pairs_info, dm_slice, img_phase, ignore_imag=True, assert density_matrix_with_translation_real_part.size < np.iinfo(np.int32).max if with_tau: - density = cp.zeros( - ( - n_channels, - 2, - ) - + tuple(pairs_info['mesh']), - dtype=density_matrix_with_translation_real_part.dtype, - ) + density = cp.zeros((n_channels, 2, ) + tuple(pairs_info["mesh"]), dtype=density_matrix_with_translation_real_part.dtype) else: - density = cp.zeros( - (n_channels,) + tuple(pairs_info['mesh']), - dtype=density_matrix_with_translation_real_part.dtype, - ) + density = cp.zeros((n_channels,) + tuple(pairs_info["mesh"]), dtype=density_matrix_with_translation_real_part.dtype) - for gaussians_per_angular_pair in pairs_info['per_angular_pairs']: - (i_angular, j_angular) = gaussians_per_angular_pair['angular'] + for gaussians_per_angular_pair in pairs_info["per_angular_pairs"]: + (i_angular, j_angular) = gaussians_per_angular_pair["angular"] err = c_driver( cast_to_pointer(density), cast_to_pointer(density_matrix_with_translation_real_part), ctypes.c_int(i_angular), ctypes.c_int(j_angular), - cast_to_pointer(gaussians_per_angular_pair['screened_shell_pairs']), - cast_to_pointer(gaussians_per_angular_pair['i_shells']), - cast_to_pointer(gaussians_per_angular_pair['j_shells']), - ctypes.c_int(len(gaussians_per_angular_pair['j_shells'])), - cast_to_pointer(gaussians_per_angular_pair['shell_to_ao_indices']), + cast_to_pointer(gaussians_per_angular_pair["screened_shell_pairs"]), + cast_to_pointer(gaussians_per_angular_pair["i_shells"]), + cast_to_pointer(gaussians_per_angular_pair["j_shells"]), + ctypes.c_int(len(gaussians_per_angular_pair["j_shells"])), + cast_to_pointer(gaussians_per_angular_pair["shell_to_ao_indices"]), ctypes.c_int(n_i_functions), ctypes.c_int(n_j_functions), - cast_to_pointer(gaussians_per_angular_pair['pair_indices_per_block']), - cast_to_pointer(gaussians_per_angular_pair['accumulated_counts_per_block']), - cast_to_pointer(gaussians_per_angular_pair['sorted_block_index']), - ctypes.c_int(len(gaussians_per_angular_pair['sorted_block_index'])), - cast_to_pointer(gaussians_per_angular_pair['image_indices']), - cast_to_pointer(pairs_info['neighboring_images']), + cast_to_pointer(gaussians_per_angular_pair["pair_indices_per_block"]), + cast_to_pointer(gaussians_per_angular_pair["accumulated_counts_per_block"]), + cast_to_pointer(gaussians_per_angular_pair["sorted_block_index"]), + ctypes.c_int(len(gaussians_per_angular_pair["sorted_block_index"])), + cast_to_pointer(gaussians_per_angular_pair["image_indices"]), + cast_to_pointer(pairs_info["neighboring_images"]), ctypes.c_int(n_images), cast_to_pointer(image_pair_difference_index), ctypes.c_int(n_difference_images), - (ctypes.c_int * 3)(*pairs_info['mesh']), - cast_to_pointer(pairs_info['atm']), - cast_to_pointer(pairs_info['bas']), - cast_to_pointer(pairs_info['env']), + (ctypes.c_int * 3)(*pairs_info["mesh"]), + cast_to_pointer(pairs_info["atm"]), + cast_to_pointer(pairs_info["bas"]), + cast_to_pointer(pairs_info["env"]), ctypes.c_int(n_channels), - ctypes.c_int(pairs_info['is_non_orthogonal']), + ctypes.c_int(pairs_info["is_non_orthogonal"]), use_float_precision, ) if err != 0: @@ -955,8 +958,8 @@ def evaluate_density_wrapper(pairs_info, dm_slice, img_phase, ignore_imag=True, return density - def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type='LDA'): + dm_kpts = cp.asarray(dm_kpts, order="C") dms = _format_dms(dm_kpts, kpts) n_channels, n_k_points = dms.shape[:2] if mydf.sorted_gaussian_pairs is None: @@ -965,40 +968,39 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type='LDA'): with_tau = False if xc_type == "LDA" or xc_type == 'HF': density_slices = 1 - elif xc_type == 'GGA': + elif xc_type == "GGA": density_slices = 4 - elif xc_type == 'MGGA': + elif xc_type == "MGGA": density_slices = 5 with_tau = True else: - raise ValueError(f'Incorrect xc_type = {xc_type}') + raise ValueError(f"Incorrect xc_type = {xc_type}") cell = mydf.cell - log = logger.new_logger(cell, mydf.verbose) nx, ny, nz = mydf.mesh - density_on_g_mesh = cp.zeros((n_channels, density_slices, nx, ny, nz), dtype=cp.complex128) - + density_on_g_mesh = cp.zeros( + (n_channels, density_slices, nx, ny, nz), dtype=cp.complex128 + ) for pairs in mydf.sorted_gaussian_pairs: - mesh = pairs['mesh'] + + mesh = pairs["mesh"] n_grid_points = np.prod(mesh) weight_per_grid_point = 1.0 / n_k_points * cell.vol / n_grid_points - breakpoint() - density_matrix_with_rows_in_localized = dms[ :, :, - pairs['ao_indices_in_localized'][:, None], - pairs['concatenated_ao_indices'], + pairs["ao_indices_in_localized"][:, None], + pairs["concatenated_ao_indices"], ] density_matrix_with_rows_in_diffused = dms[ :, :, - pairs['ao_indices_in_diffused'][:, None], - pairs['ao_indices_in_localized'], + pairs["ao_indices_in_diffused"][:, None], + pairs["ao_indices_in_localized"], ] n_ao_in_localized = density_matrix_with_rows_in_diffused.shape[3] @@ -1022,9 +1024,11 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type='LDA'): libgpbc.update_dxyz_dabc(pairs['dxyz_dabc'].ctypes) - img_phase = image_phase_for_kpts(cell, pairs['neighboring_images'], kpts) + img_phase = image_phase_for_kpts(cell, pairs["neighboring_images"], kpts) density = ( - evaluate_density_wrapper(pairs, coeff_sandwiched_density_matrix, img_phase, with_tau=with_tau) + evaluate_density_wrapper( + pairs, coeff_sandwiched_density_matrix, img_phase, with_tau = with_tau + ) * weight_per_grid_point ) @@ -1034,12 +1038,13 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type='LDA'): density = density[:, 0] density = fft_in_place(density) + density_on_g_mesh[ :, 0, - pairs['fft_grid'][0][:, None, None], - pairs['fft_grid'][1][:, None], - pairs['fft_grid'][2], + pairs["fft_grid"][0][:, None, None], + pairs["fft_grid"][1][:, None], + pairs["fft_grid"][2], ] += density if with_tau: @@ -1048,9 +1053,9 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type='LDA'): density_on_g_mesh[ :, 4, - pairs['fft_grid'][0][:, None, None], - pairs['fft_grid'][1][:, None], - pairs['fft_grid'][2], + pairs["fft_grid"][0][:, None, None], + pairs["fft_grid"][1][:, None], + pairs["fft_grid"][2], ] += tau density_on_g_mesh = density_on_g_mesh.reshape([n_channels, density_slices, -1]) @@ -1063,11 +1068,11 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type='LDA'): def evaluate_xc_wrapper(pairs_info, xc_weights, img_phase, with_tau=False): if with_tau: - assert xc_weights.ndim == 3 + 2 and xc_weights.shape[1] == 2 + assert xc_weights.ndim == 3+2 and xc_weights.shape[1] == 2 n_channels = xc_weights.shape[0] # density_slices = 2 else: - assert (xc_weights.ndim == 3 + 2 and xc_weights.shape[1] == 1) or (xc_weights.ndim == 3 + 1) + assert (xc_weights.ndim == 3+2 and xc_weights.shape[1] == 1) or (xc_weights.ndim == 3+1) n_channels = xc_weights.shape[0] # density_slices = 1 @@ -1079,7 +1084,7 @@ def evaluate_xc_wrapper(pairs_info, xc_weights, img_phase, with_tau=False): phase_diff_among_images, image_pair_difference_index = img_phase n_k_points, n_difference_images = phase_diff_among_images.shape - n_images = pairs_info['neighboring_images'].shape[0] + n_images = pairs_info["neighboring_images"].shape[0] fock = cp.zeros( (n_channels, n_difference_images, n_i_functions, n_j_functions), @@ -1091,39 +1096,39 @@ def evaluate_xc_wrapper(pairs_info, xc_weights, img_phase, with_tau=False): assert xc_weights.dtype == cp.float64 use_float_precision = ctypes.c_int(0) - for gaussians_per_angular_pair in pairs_info['per_angular_pairs']: + for gaussians_per_angular_pair in pairs_info["per_angular_pairs"]: fock_slice = cp.zeros( (n_channels, n_difference_images, n_i_functions, n_j_functions), dtype=xc_weights.dtype, ) - (i_angular, j_angular) = gaussians_per_angular_pair['angular'] + (i_angular, j_angular) = gaussians_per_angular_pair["angular"] err = c_driver( cast_to_pointer(fock_slice), cast_to_pointer(xc_weights), ctypes.c_int(i_angular), ctypes.c_int(j_angular), - cast_to_pointer(gaussians_per_angular_pair['screened_shell_pairs']), - cast_to_pointer(gaussians_per_angular_pair['i_shells']), - cast_to_pointer(gaussians_per_angular_pair['j_shells']), - ctypes.c_int(len(gaussians_per_angular_pair['j_shells'])), - cast_to_pointer(gaussians_per_angular_pair['shell_to_ao_indices']), + cast_to_pointer(gaussians_per_angular_pair["screened_shell_pairs"]), + cast_to_pointer(gaussians_per_angular_pair["i_shells"]), + cast_to_pointer(gaussians_per_angular_pair["j_shells"]), + ctypes.c_int(len(gaussians_per_angular_pair["j_shells"])), + cast_to_pointer(gaussians_per_angular_pair["shell_to_ao_indices"]), ctypes.c_int(n_i_functions), ctypes.c_int(n_j_functions), - cast_to_pointer(gaussians_per_angular_pair['pair_indices_per_block']), - cast_to_pointer(gaussians_per_angular_pair['accumulated_counts_per_block']), - cast_to_pointer(gaussians_per_angular_pair['sorted_block_index']), - ctypes.c_int(len(gaussians_per_angular_pair['sorted_block_index'])), - cast_to_pointer(gaussians_per_angular_pair['image_indices']), - cast_to_pointer(pairs_info['neighboring_images']), + cast_to_pointer(gaussians_per_angular_pair["pair_indices_per_block"]), + cast_to_pointer(gaussians_per_angular_pair["accumulated_counts_per_block"]), + cast_to_pointer(gaussians_per_angular_pair["sorted_block_index"]), + ctypes.c_int(len(gaussians_per_angular_pair["sorted_block_index"])), + cast_to_pointer(gaussians_per_angular_pair["image_indices"]), + cast_to_pointer(pairs_info["neighboring_images"]), ctypes.c_int(n_images), cast_to_pointer(image_pair_difference_index), ctypes.c_int(n_difference_images), - cast_to_pointer(pairs_info['mesh']), - cast_to_pointer(pairs_info['atm']), - cast_to_pointer(pairs_info['bas']), - cast_to_pointer(pairs_info['env']), + cast_to_pointer(pairs_info["mesh"]), + cast_to_pointer(pairs_info["atm"]), + cast_to_pointer(pairs_info["bas"]), + cast_to_pointer(pairs_info["env"]), ctypes.c_int(n_channels), - ctypes.c_int(pairs_info['is_non_orthogonal']), + ctypes.c_int(pairs_info["is_non_orthogonal"]), use_float_precision, ) fock += fock_slice @@ -1131,7 +1136,9 @@ def evaluate_xc_wrapper(pairs_info, xc_weights, img_phase, with_tau=False): raise RuntimeError(f'evaluate_xc_driver for li={i_angular} lj={j_angular} failed') if not (n_k_points == 1 and n_difference_images == 1): - return cp.einsum('kt, ntij -> nkij', phase_diff_among_images, fock) + return cp.einsum( + "kt, ntij -> nkij", phase_diff_among_images, fock + ) else: return fock @@ -1145,7 +1152,7 @@ def convert_xc_on_g_mesh_to_fock( ): cell = mydf.cell nao = cell.nao_nr() - log = logger.new_logger(cell, mydf.verbose) + if with_tau: if xc_on_g_mesh.ndim == 2: assert xc_on_g_mesh.shape[0] == 2 @@ -1154,7 +1161,7 @@ def convert_xc_on_g_mesh_to_fock( assert xc_on_g_mesh.shape[1] == 2 n_channels = xc_on_g_mesh.shape[0] else: - raise ValueError('Incorrect shape of xc_on_g_mesh = {xc_on_g_mesh.shape}') + raise ValueError("Incorrect shape of xc_on_g_mesh = {xc_on_g_mesh.shape}") density_slices = 2 else: if xc_on_g_mesh.ndim == 1: @@ -1165,7 +1172,7 @@ def convert_xc_on_g_mesh_to_fock( assert xc_on_g_mesh.shape[1] == 1 n_channels = xc_on_g_mesh.shape[0] else: - raise ValueError('Incorrect shape of xc_on_g_mesh = {xc_on_g_mesh.shape}') + raise ValueError("Incorrect shape of xc_on_g_mesh = {xc_on_g_mesh.shape}") density_slices = 1 xc_on_g_mesh = xc_on_g_mesh.reshape(n_channels, density_slices, *mydf.mesh) @@ -1191,15 +1198,15 @@ def convert_xc_on_g_mesh_to_fock( interpolated_xc = xc_on_g_mesh[ :, :, - pairs['fft_grid'][0][:, None, None], - pairs['fft_grid'][1][:, None], - pairs['fft_grid'][2], + pairs["fft_grid"][0][:, None, None], + pairs["fft_grid"][1][:, None], + pairs["fft_grid"][2], ] - interpolated_xc = cp.asarray(ifft_in_place(interpolated_xc).real, order='C') + interpolated_xc = cp.asarray(ifft_in_place(interpolated_xc).real, order="C") - n_ao_in_localized = len(pairs['ao_indices_in_localized']) - libgpbc.update_dxyz_dabc(pairs['dxyz_dabc'].ctypes) - img_phase = image_phase_for_kpts(cell, pairs['neighboring_images'], kpts) + n_ao_in_localized = len(pairs["ao_indices_in_localized"]) + libgpbc.update_dxyz_dabc(pairs["dxyz_dabc"].ctypes) + img_phase = image_phase_for_kpts(cell, pairs["neighboring_images"], kpts) fock_slice = evaluate_xc_wrapper(pairs, interpolated_xc, img_phase, with_tau=with_tau) n_sets, n_k_points = fock_slice.shape[:2] @@ -1224,23 +1231,24 @@ def convert_xc_on_g_mesh_to_fock( fock[ :, :, - pairs['ao_indices_in_localized'][:, None], - pairs['ao_indices_in_localized'], + pairs["ao_indices_in_localized"][:, None], + pairs["ao_indices_in_localized"], ] += fock_slice[:, :, :, :n_ao_in_localized] fock[ :, :, - pairs['ao_indices_in_localized'][:, None], - pairs['ao_indices_in_diffused'], + pairs["ao_indices_in_localized"][:, None], + pairs["ao_indices_in_diffused"], ] += fock_slice[:, :, :, n_ao_in_localized:] - if hermi == 1: fock[ :, :, - pairs['ao_indices_in_diffused'][:, None], - pairs['ao_indices_in_localized'], - ] += fock_slice[:, :, :, n_ao_in_localized:].transpose(0, 1, 3, 2).conj() + pairs["ao_indices_in_diffused"][:, None], + pairs["ao_indices_in_localized"], + ] += ( + fock_slice[:, :, :, n_ao_in_localized:].transpose(0, 1, 3, 2).conj() + ) else: raise NotImplementedError @@ -1248,20 +1256,14 @@ def convert_xc_on_g_mesh_to_fock( def evaluate_xc_gradient_wrapper( - gradient, - pairs_info, - xc_weights, - dm_slice, - img_phase, - ignore_imag=True, - with_tau=False, + gradient, pairs_info, xc_weights, dm_slice, img_phase, ignore_imag=True, with_tau=False ): if with_tau: - assert xc_weights.ndim == 3 + 2 and xc_weights.shape[1] == 2 + assert xc_weights.ndim == 3+2 and xc_weights.shape[1] == 2 n_channels = xc_weights.shape[0] # density_slices = 2 else: - assert (xc_weights.ndim == 3 + 2 and xc_weights.shape[1] == 1) or (xc_weights.ndim == 3 + 1) + assert (xc_weights.ndim == 3+2 and xc_weights.shape[1] == 1) or (xc_weights.ndim == 3+1) n_channels = xc_weights.shape[0] # density_slices = 1 @@ -1277,53 +1279,57 @@ def evaluate_xc_gradient_wrapper( else: use_float_precision = ctypes.c_int(0) - n_images = pairs_info['neighboring_images'].shape[0] + n_images = pairs_info["neighboring_images"].shape[0] phase_diff_among_images, image_pair_difference_index = img_phase n_k_points, n_difference_images = phase_diff_among_images.shape if n_k_points == 1 and n_difference_images == 1: density_matrix_with_translation = dm_slice else: - density_matrix_with_translation = cp.einsum('kt, ikpq->itpq', phase_diff_among_images.conj(), dm_slice) + density_matrix_with_translation = cp.einsum( + "kt, ikpq->itpq", phase_diff_among_images.conj(), dm_slice + ) n_channels, _, n_i_functions, n_j_functions = density_matrix_with_translation.shape if ignore_imag is False: raise NotImplementedError - density_matrix_with_translation_real_part = cp.asarray(density_matrix_with_translation.real, order='C') + density_matrix_with_translation_real_part = cp.asarray( + density_matrix_with_translation.real, order="C" + ) assert gradient.dtype == density_matrix_with_translation_real_part.dtype - for gaussians_per_angular_pair in pairs_info['per_angular_pairs']: - (i_angular, j_angular) = gaussians_per_angular_pair['angular'] + for gaussians_per_angular_pair in pairs_info["per_angular_pairs"]: + (i_angular, j_angular) = gaussians_per_angular_pair["angular"] err = c_driver( cast_to_pointer(gradient), cast_to_pointer(xc_weights), cast_to_pointer(density_matrix_with_translation_real_part), ctypes.c_int(i_angular), ctypes.c_int(j_angular), - cast_to_pointer(gaussians_per_angular_pair['screened_shell_pairs']), - cast_to_pointer(gaussians_per_angular_pair['i_shells']), - cast_to_pointer(gaussians_per_angular_pair['j_shells']), - ctypes.c_int(len(gaussians_per_angular_pair['j_shells'])), - cast_to_pointer(gaussians_per_angular_pair['shell_to_ao_indices']), + cast_to_pointer(gaussians_per_angular_pair["screened_shell_pairs"]), + cast_to_pointer(gaussians_per_angular_pair["i_shells"]), + cast_to_pointer(gaussians_per_angular_pair["j_shells"]), + ctypes.c_int(len(gaussians_per_angular_pair["j_shells"])), + cast_to_pointer(gaussians_per_angular_pair["shell_to_ao_indices"]), ctypes.c_int(n_i_functions), ctypes.c_int(n_j_functions), - cast_to_pointer(gaussians_per_angular_pair['pair_indices_per_block']), - cast_to_pointer(gaussians_per_angular_pair['accumulated_counts_per_block']), - cast_to_pointer(gaussians_per_angular_pair['sorted_block_index']), - ctypes.c_int(len(gaussians_per_angular_pair['sorted_block_index'])), - cast_to_pointer(gaussians_per_angular_pair['image_indices']), - cast_to_pointer(pairs_info['neighboring_images']), + cast_to_pointer(gaussians_per_angular_pair["pair_indices_per_block"]), + cast_to_pointer(gaussians_per_angular_pair["accumulated_counts_per_block"]), + cast_to_pointer(gaussians_per_angular_pair["sorted_block_index"]), + ctypes.c_int(len(gaussians_per_angular_pair["sorted_block_index"])), + cast_to_pointer(gaussians_per_angular_pair["image_indices"]), + cast_to_pointer(pairs_info["neighboring_images"]), ctypes.c_int(n_images), cast_to_pointer(image_pair_difference_index), ctypes.c_int(n_difference_images), - cast_to_pointer(pairs_info['mesh']), - cast_to_pointer(pairs_info['atm']), - cast_to_pointer(pairs_info['bas']), - cast_to_pointer(pairs_info['env']), + cast_to_pointer(pairs_info["mesh"]), + cast_to_pointer(pairs_info["atm"]), + cast_to_pointer(pairs_info["bas"]), + cast_to_pointer(pairs_info["env"]), ctypes.c_int(n_channels), - ctypes.c_int(pairs_info['is_non_orthogonal']), + ctypes.c_int(pairs_info["is_non_orthogonal"]), use_float_precision, ) if err != 0: @@ -1339,7 +1345,7 @@ def convert_xc_on_g_mesh_to_fock_gradient( with_tau=False, ): cell = mydf.cell - dm_kpts = cp.asarray(dm_kpts, order='C') + dm_kpts = cp.asarray(dm_kpts, order="C") dms = _format_dms(dm_kpts, kpts) n_atoms = cell.natm @@ -1357,24 +1363,24 @@ def convert_xc_on_g_mesh_to_fock_gradient( interpolated_xc = xc_on_g_mesh[ :, :, - pairs['fft_grid'][0][:, None, None], - pairs['fft_grid'][1][:, None], - pairs['fft_grid'][2], + pairs["fft_grid"][0][:, None, None], + pairs["fft_grid"][1][:, None], + pairs["fft_grid"][2], ] - interpolated_xc = cp.asarray(ifft_in_place(interpolated_xc).real, order='C') + interpolated_xc = cp.asarray(ifft_in_place(interpolated_xc).real, order="C") density_matrix_slice = dms[ :, :, - pairs['ao_indices_in_localized'][:, None], - pairs['concatenated_ao_indices'], + pairs["ao_indices_in_localized"][:, None], + pairs["concatenated_ao_indices"], ] density_matrix_with_rows_in_diffused = dms[ :, :, - pairs['ao_indices_in_diffused'][:, None], - pairs['ao_indices_in_localized'], + pairs["ao_indices_in_diffused"][:, None], + pairs["ao_indices_in_localized"], ] n_ao_in_localized = density_matrix_slice.shape[2] @@ -1408,8 +1414,7 @@ def convert_xc_on_g_mesh_to_fock_gradient( return gradient - -# FIXME: merge to multigrid_v1.get_pp +#FIXME: merge to multigrid_v1.get_pp def get_nuc(ni, kpts=None): if ni.sorted_gaussian_pairs is None: ni.build() @@ -1427,8 +1432,7 @@ def get_nuc(ni, kpts=None): vne = vne[0] return vne - -# FIXME: merge to multigrid_v1.get_pp +#FIXME: merge to multigrid_v1.get_pp def get_pp(ni, kpts=None): """Get the periodic pseudopotential nuc-el AO matrix, with G=0 removed.""" if ni.sorted_gaussian_pairs is None: @@ -1446,7 +1450,7 @@ def get_pp(ni, kpts=None): # -einsum('ij,ij->j', pseudo.get_vlocG(cell, Gv), cell.get_SI(Gv)) vpplocG = ni.vpplocG vpp = convert_xc_on_g_mesh_to_fock(ni, vpplocG, hermi=1, kpts=kpts)[0] - t1 = log.timer_debug1('vpploc', *t0) + t1 = log.timer_debug1("vpploc", *t0) vppnl = pp_int.get_pp_nl(cell, kpts) for k, kpt in enumerate(kpts): @@ -1457,13 +1461,12 @@ def get_pp(ni, kpts=None): if is_single_kpt: vpp = vpp[0] - log.timer_debug1('vppnl', *t1) - log.timer('get_pp', *t0) + log.timer_debug1("vppnl", *t1) + log.timer("get_pp", *t0) return vpp - def get_j_kpts(ni, dm_kpts, hermi=1, kpts=None, kpts_band=None): - """Get the Coulomb (J) AO matrix at sampled k-points. + '''Get the Coulomb (J) AO matrix at sampled k-points. Args: dm_kpts : (*, nkpts, nao, nao) ndarray or a list of (nkpts,nao,nao) ndarray @@ -1479,7 +1482,7 @@ def get_j_kpts(ni, dm_kpts, hermi=1, kpts=None, kpts_band=None): Returns: vj : (*, nkpts, nao, nao) ndarray or list of vj if the input dm_kpts is a list of DMs - """ + ''' if kpts is None: kpts = np.zeros((1, 3)) else: @@ -1487,7 +1490,7 @@ def get_j_kpts(ni, dm_kpts, hermi=1, kpts=None, kpts_band=None): cell = ni.cell log = logger.new_logger(cell) t0 = log.init_timer() - dm_kpts = cp.asarray(dm_kpts, order='C') + dm_kpts = cp.asarray(dm_kpts, order="C") dms = _format_dms(dm_kpts, kpts) nset = dms.shape[0] mesh = ni.mesh @@ -1509,28 +1512,16 @@ def get_j_kpts(ni, dm_kpts, hermi=1, kpts=None, kpts_band=None): density = ifft_in_place(density).real.reshape(nset, -1, ngrids) density /= weight - # if kpts_band is not None: + #if kpts_band is not None: # ni = ni.copy().reset().build() kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band xc_for_fock = convert_xc_on_g_mesh_to_fock(ni, coulomb_on_g_mesh, hermi, kpts_band) - t0 = log.timer('vj', *t0) + t0 = log.timer("vj", *t0) return _format_jks(xc_for_fock, dm_kpts, input_band, kpts) - -def nr_rks( - ni, - cell, - grids, - xc_code, - dm_kpts, - relativity=0, - hermi=1, - kpts=None, - kpts_band=None, - with_j=False, - verbose=None, -): - """Compute the XC energy and RKS XC matrix at sampled k-points. +def nr_rks(ni, cell, grids, xc_code, dm_kpts, relativity=0, hermi=1, + kpts=None, kpts_band=None, with_j=False, verbose=None): + '''Compute the XC energy and RKS XC matrix at sampled k-points. multigrid version of function pbc.dft.numint.nr_rks. Args: @@ -1549,7 +1540,7 @@ def nr_rks( nelec : number of electrons obtained from the numerical integration veff : (nkpts, nao, nao) ndarray or list of veff if the input dm_kpts is a list of DMs - """ + ''' cell = ni.cell log = logger.new_logger(cell, verbose) t0 = log.init_timer() @@ -1561,7 +1552,7 @@ def nr_rks( kpts = np.zeros((1, 3)) else: kpts = kpts.reshape(-1, 3) - dm_kpts = cp.asarray(dm_kpts, order='C') + dm_kpts = cp.asarray(dm_kpts, order="C") dms = _format_dms(dm_kpts, kpts) nset = dms.shape[0] dms = None @@ -1591,9 +1582,11 @@ def nr_rks( xc_code, density[0], deriv=1, xctype=xc_type )[:2] elif xc_type == 'GGA' or xc_type == 'MGGA': - xc_for_energy, xc_for_fock = ni.eval_xc_eff(xc_code, density, deriv=1, xctype=xc_type)[:2] + xc_for_energy, xc_for_fock = ni.eval_xc_eff( + xc_code, density, deriv=1, xctype=xc_type + )[:2] else: - raise ValueError(f'Incorrect xc_type = {xc_type}') + raise ValueError(f"Incorrect xc_type = {xc_type}") rho_sf = density[0].real xc_energy_sum = float(rho_sf.dot(xc_for_energy.ravel()).get()) * weight @@ -1603,7 +1596,7 @@ def nr_rks( xc_for_fock *= weight xc_for_fock = fft_in_place(xc_for_fock.reshape(-1, *mesh)).reshape(-1, ngrids) - log.debug('Multigrid exc %s nelec %s', xc_energy_sum, n_electrons) + log.debug("Multigrid exc %s nelec %s", xc_energy_sum, n_electrons) if xc_type == "LDA" or xc_type == 'HF': pass @@ -1621,35 +1614,23 @@ def nr_rks( axis=0, ) else: - raise ValueError(f'Incorrect xc_type = {xc_type}') + raise ValueError(f"Incorrect xc_type = {xc_type}") if with_j: xc_for_fock[0] += coulomb_on_g_mesh kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band - veff = convert_xc_on_g_mesh_to_fock(ni, xc_for_fock, hermi, kpts_band, with_tau=(xc_type == 'MGGA')) + veff = convert_xc_on_g_mesh_to_fock(ni, xc_for_fock, hermi, kpts_band, with_tau = (xc_type == "MGGA")) veff = _format_jks(veff, dm_kpts, input_band, kpts) veff = tag_array(veff, ecoul=coulomb_energy, exc=xc_energy_sum, vj=None, vk=None) - t0 = log.timer('xc', *t0) + t0 = log.timer("xc", *t0) return n_electrons, xc_energy_sum, veff - # Note nr_uks handles only one set of KUKS density matrices (alpha, beta) in # each call (nr_rks supports multiple sets of KRKS density matrices) -def nr_uks( - ni, - cell, - grids, - xc_code, - dm_kpts, - relativity=0, - hermi=1, - kpts=None, - kpts_band=None, - with_j=False, - verbose=None, -): - """Compute the XC energy and UKS XC matrix at sampled k-points. +def nr_uks(ni, cell, grids, xc_code, dm_kpts, relativity=0, hermi=1, + kpts=None, kpts_band=None, with_j=False, verbose=None): + '''Compute the XC energy and UKS XC matrix at sampled k-points. multigrid version of function pbc.dft.numint.nr_rks. Args: @@ -1668,7 +1649,7 @@ def nr_uks( nelec : number of electrons obtained from the numerical integration veff : (nkpts, nao, nao) ndarray or list of veff if the input dm_kpts is a list of DMs - """ + ''' cell = ni.cell log = logger.new_logger(cell, verbose) t0 = log.init_timer() @@ -1680,7 +1661,7 @@ def nr_uks( kpts = np.zeros((1, 3)) else: kpts = kpts.reshape(-1, 3) - dm_kpts = cp.asarray(dm_kpts, order='C') + dm_kpts = cp.asarray(dm_kpts, order="C") dms = _format_dms(dm_kpts, kpts) nset = dms.shape[0] dms = None @@ -1695,8 +1676,8 @@ def nr_uks( coulomb_on_g_mesh = rho_sf * ni.coulG coulomb_energy = 0.5 * rho_sf.conj().dot(coulomb_on_g_mesh).real coulomb_energy /= cell.vol - log.debug('Multigrid Coulomb energy %s', coulomb_energy) - t0 = log.timer('coulomb', *t0) + log.debug("Multigrid Coulomb energy %s", coulomb_energy) + t0 = log.timer("coulomb", *t0) weight = cell.vol / ngrids density = density.reshape(-1, *mesh) @@ -1711,9 +1692,11 @@ def nr_uks( xc_code, density[:,0], deriv=1, xctype=xc_type )[:2] elif xc_type == 'GGA' or xc_type == 'MGGA': - xc_for_energy, xc_for_fock = ni.eval_xc_eff(xc_code, density, deriv=1, xctype=xc_type)[:2] + xc_for_energy, xc_for_fock = ni.eval_xc_eff( + xc_code, density, deriv=1, xctype=xc_type + )[:2] else: - raise ValueError(f'Incorrect xc_type = {xc_type}') + raise ValueError(f"Incorrect xc_type = {xc_type}") rho_sf = (density[0, 0] + density[1, 0]).real xc_energy_sum = float(rho_sf.dot(xc_for_energy.ravel()).real.get()) * weight @@ -1723,7 +1706,7 @@ def nr_uks( xc_for_fock *= weight xc_for_fock = fft_in_place(xc_for_fock.reshape(-1, *mesh)).reshape(nset, -1, ngrids) - log.debug('Multigrid exc %s nelec %s', xc_energy_sum, n_electrons) + log.debug("Multigrid exc %s nelec %s", xc_energy_sum, n_electrons) if xc_type == "LDA" or xc_type == 'HF': pass @@ -1742,21 +1725,21 @@ def nr_uks( axis=1, ) else: - raise ValueError(f'Incorrect xc_type = {xc_type}') + raise ValueError(f"Incorrect xc_type = {xc_type}") if with_j: xc_for_fock[:, 0] += coulomb_on_g_mesh kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band - veff = convert_xc_on_g_mesh_to_fock(ni, xc_for_fock, hermi, kpts_band, with_tau=(xc_type == 'MGGA')) + veff = convert_xc_on_g_mesh_to_fock(ni, xc_for_fock, hermi, kpts_band, with_tau = (xc_type == "MGGA")) veff = _format_jks(veff, dm_kpts, input_band, kpts) veff = tag_array(veff, ecoul=coulomb_energy, exc=xc_energy_sum, vj=None, vk=None) - t0 = log.timer('xc', *t0) + t0 = log.timer("xc", *t0) return n_electrons, xc_energy_sum, veff - def get_rho(ni, dm, kpts=None): - """Density in real space""" + '''Density in real space + ''' cell = ni.cell mesh = ni.mesh ngrids = np.prod(mesh) @@ -1767,7 +1750,6 @@ def get_rho(ni, dm, kpts=None): rhoR = ifft_in_place(density.reshape(-1, *mesh)).real / weight return rhoR.reshape(-1, ngrids) - def get_veff_ip1( ni, xc_code, @@ -1778,21 +1760,21 @@ def get_veff_ip1( with_pseudo_vloc_orbital_derivative=True, verbose=None, ): - """Computes the derivatives of the Exc along with additional contributions + '''Computes the derivatives of the Exc along with additional contributions from the Coulomb and pseudopotential terms. Note, the current return is the energy per cell scaled by the number of k-points. This should return the energy per cell directly and will be changed in future. - """ + ''' if kpts is None: kpts = np.zeros((1, 3)) else: kpts = kpts.reshape(-1, 3) - cell = ni.cell - log = logger.new_logger(cell) + log = logger.new_logger(ni, verbose) t0 = log.init_timer() - dm_kpts = cp.asarray(dm_kpts, order='C') + cell = ni.cell + dm_kpts = cp.asarray(dm_kpts, order="C") dms = _format_dms(dm_kpts, kpts) nset = dms.shape[0] dms = None @@ -1812,15 +1794,19 @@ def get_veff_ip1( density = ( cp.asarray( ifft_in_place(density.reshape(nset, -1, *mesh)).real, - order='C', + order="C", ).reshape(nset, -1, ngrids) / weight ) if nset == 1: - xc_for_fock = ni.eval_xc_eff(xc_code, density[0], deriv=1, xctype=xc_type)[1] + xc_for_fock = ni.eval_xc_eff( + xc_code, density[0], deriv=1, xctype=xc_type + )[1] else: - xc_for_fock = ni.eval_xc_eff(xc_code, density, deriv=1, xctype=xc_type)[1] + xc_for_fock = ni.eval_xc_eff( + xc_code, density, deriv=1, xctype=xc_type + )[1] xc_for_fock = xc_for_fock.reshape(nset, -1, *mesh) * weight xc_for_fock = fft_in_place(xc_for_fock).reshape(nset, -1, ngrids) @@ -1842,7 +1828,7 @@ def get_veff_ip1( axis=1, ) else: - raise ValueError(f'Incorrect xc_type = {xc_type}') + raise ValueError(f"Incorrect xc_type = {xc_type}") if with_j: xc_for_fock[:, 0] += coulomb_on_g_mesh @@ -1854,11 +1840,10 @@ def get_veff_ip1( ni, xc_for_fock, dm_kpts, hermi, kpts, with_tau = (xc_type == "MGGA") ) - t0 = log.timer('veff_gradient', *t0) + t0 = log.timer("veff_gradient", *t0) return veff_gradient - class MultiGridNumInt(lib.StreamObject, numint.LibXCMixin): def __init__(self, cell): self.cell = cell @@ -1893,7 +1878,7 @@ def get_j(self, dm, hermi=1, kpts=None, kpts_band=None): get_rho = get_rho nr_rks = nr_rks nr_uks = nr_uks - get_vxc = nr_vxc = NotImplemented # numint_cpu.KNumInt.nr_vxc + get_vxc = nr_vxc = NotImplemented #numint_cpu.KNumInt.nr_vxc eval_xc_eff = numint.eval_xc_eff _init_xcfuns = numint.NumInt._init_xcfuns @@ -1901,7 +1886,7 @@ def get_j(self, dm, hermi=1, kpts=None, kpts_band=None): nr_rks_fxc = NotImplemented nr_uks_fxc = NotImplemented nr_rks_fxc_st = NotImplemented - cache_xc_kernel = NotImplemented + cache_xc_kernel = NotImplemented cache_xc_kernel1 = NotImplemented to_gpu = utils.to_gpu From d568ac475180189258bd541cf7cc7f67b1061070 Mon Sep 17 00:00:00 2001 From: Rui Li Date: Wed, 18 Mar 2026 15:03:28 -0700 Subject: [PATCH 11/14] further revert formatting --- gpu4pyscf/pbc/dft/multigrid_v2.py | 18 +++++++++--------- gpu4pyscf/pbc/grad/rhf.py | 7 +++---- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/gpu4pyscf/pbc/dft/multigrid_v2.py b/gpu4pyscf/pbc/dft/multigrid_v2.py index c3160be0f..042bf92c0 100644 --- a/gpu4pyscf/pbc/dft/multigrid_v2.py +++ b/gpu4pyscf/pbc/dft/multigrid_v2.py @@ -836,15 +836,15 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"): ) per_angular_pairs.append( { - 'angular': (i_angular, j_angular), - 'screened_shell_pairs': screened_shell_pairs, - 'pair_indices_per_block': gaussian_pair_indices, - 'accumulated_counts_per_block': accumulated_counts, - 'sorted_block_index': sorted_contributing_blocks, - 'image_indices': image_indices, - 'i_shells': i_shells, - 'j_shells': j_shells, - 'shell_to_ao_indices': shell_to_ao_indices, + "angular": (i_angular, j_angular), + "screened_shell_pairs": screened_shell_pairs, + "pair_indices_per_block": gaussian_pair_indices, + "accumulated_counts_per_block": accumulated_counts, + "sorted_block_index": sorted_contributing_blocks, + "image_indices": image_indices, + "i_shells": i_shells, + "j_shells": j_shells, + "shell_to_ao_indices": shell_to_ao_indices, } ) diff --git a/gpu4pyscf/pbc/grad/rhf.py b/gpu4pyscf/pbc/grad/rhf.py index 8a86a719c..d04c63bbd 100644 --- a/gpu4pyscf/pbc/grad/rhf.py +++ b/gpu4pyscf/pbc/grad/rhf.py @@ -34,7 +34,6 @@ __all__ = ['Gradients'] - class GradientsBase(mol_rhf.GradientsBase): _keys = {'cell'} @@ -89,13 +88,14 @@ def kernel(self, mo_energy=None, mo_coeff=None, mo_occ=None): return self.de def optimizer(self): - """Geometry (atom positions and lattice) optimization solver""" + '''Geometry (atom positions and lattice) optimization solver + ''' from gpu4pyscf.geomopt.ase_solver import GeometryOptimizer - return GeometryOptimizer(self.base) class Gradients(GradientsBase): + make_rdm1e = mol_rhf.Gradients.make_rdm1e def energy_ee(self, dm): @@ -201,7 +201,6 @@ def grad_elec( def get_stress(self): from gpu4pyscf.pbc.grad import rhf_stress - return rhf_stress.kernel(self) def contract_h1e_dm(cell, h1e, dm, hermi=0): From 7896cb88c3832721503ba094dc701b4ae20f6fc0 Mon Sep 17 00:00:00 2001 From: Rui Li Date: Wed, 18 Mar 2026 15:05:09 -0700 Subject: [PATCH 12/14] final? revert of formatting changes --- gpu4pyscf/pbc/dft/multigrid_v2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu4pyscf/pbc/dft/multigrid_v2.py b/gpu4pyscf/pbc/dft/multigrid_v2.py index 042bf92c0..7377c8d3a 100644 --- a/gpu4pyscf/pbc/dft/multigrid_v2.py +++ b/gpu4pyscf/pbc/dft/multigrid_v2.py @@ -218,7 +218,7 @@ def unique_with_sort(x): mask = cp.empty(n, dtype=cp.bool_) mask[0] = True - mask[1:] = x[1:] != x[:-1] + mask[1:] = (x[1:] != x[:-1]) x = x[mask] inverse_unique = cp.cumsum(mask, dtype=cp.int64) - 1 From b74463f02ae9cc0cf811c4c2d31c4bebcbb26c88 Mon Sep 17 00:00:00 2001 From: Rui Li Date: Wed, 18 Mar 2026 21:31:16 -0700 Subject: [PATCH 13/14] fix bug from merge --- .../lib/multigrid/multigrid_v2/screening.cuh | 14 ++++----- .../lib/multigrid/multigrid_v2/utils.cuh | 29 ++++++++++++------- gpu4pyscf/pbc/dft/multigrid_v2.py | 15 +++++++--- 3 files changed, 35 insertions(+), 23 deletions(-) diff --git a/gpu4pyscf/lib/multigrid/multigrid_v2/screening.cuh b/gpu4pyscf/lib/multigrid/multigrid_v2/screening.cuh index fa3971250..7abdb14a5 100644 --- a/gpu4pyscf/lib/multigrid/multigrid_v2/screening.cuh +++ b/gpu4pyscf/lib/multigrid/multigrid_v2/screening.cuh @@ -842,21 +842,19 @@ __global__ static void tailor_gaussian_pairs_kernel( recursion_factor_bc_pow_b, recursion_factor_c *= exp_dc_squared) { - const double r_i_squared = - distance_squared(x - i_x, y - i_y, z - i_z); - const double r_j_squared = - distance_squared(x - j_x, y - j_y, z - j_z); - const double r_p_squared = - distance_squared(x - pair_x, y - pair_y, z - pair_z); + const double r_i = sqrt(distance_squared(x - i_x, y - i_y, z - i_z)); + const double r_j = sqrt(distance_squared(x - j_x, y - j_y, z - j_z)); + const double r_p = + sqrt(distance_squared(x - pair_x, y - pair_y, z - pair_z)); const double approxmate_polynomial = approximate_polynomial_value( - r_i_squared, r_j_squared, r_p_squared, derivative_order); + r_i, r_j, r_p, derivative_order); const double gaussian = gaussian_x * gaussian_y * gaussian_z; const double approximate_value = - abs(4.0 * M_PI * r_p_squared * pair_prefactor * + abs(4.0 * M_PI * r_p * r_p * pair_prefactor * approxmate_polynomial * gaussian); max_gaussian_value = max(max_gaussian_value, approximate_value); diff --git a/gpu4pyscf/lib/multigrid/multigrid_v2/utils.cuh b/gpu4pyscf/lib/multigrid/multigrid_v2/utils.cuh index a878d76dd..e7b063117 100644 --- a/gpu4pyscf/lib/multigrid/multigrid_v2/utils.cuh +++ b/gpu4pyscf/lib/multigrid/multigrid_v2/utils.cuh @@ -27,25 +27,32 @@ __host__ __device__ T distance_squared(const T x, const T y, const T z) { } template -__host__ __device__ T approximate_polynomial_value(const double r_i_squared, - const double r_j_squared, - const double r_p_squared, +__host__ __device__ T approximate_polynomial_value(const double r_i, + const double r_j, + const double r_p, const int derivative_order) { - T result = - pow(r_i_squared, i_angular / 2.0) * pow(r_j_squared, j_angular / 2.0); - if (derivative_order == 1) { - result *= -2.0 * sqrt(r_p_squared); + T result = pow(r_i, i_angular) * pow(r_j, j_angular); + + if (derivative_order > 0) { + result *= 2.0 * r_p; if constexpr (i_angular > 0) { - result += i_angular * pow(r_i_squared, (i_angular - 2) / 2.0) * - pow(r_j_squared, j_angular / 2.0); + result += i_angular * pow(r_i, i_angular - 1) * pow(r_j, j_angular); } if constexpr (j_angular > 0) { - result += j_angular * pow(r_i_squared, i_angular / 2.0) * - pow(r_j_squared, (j_angular - 2) / 2.0); + result += j_angular * pow(r_i, i_angular) * pow(r_j, j_angular - 1); } } + + if (derivative_order > 1) { + result *= 2.0 * r_p; + if constexpr (i_angular > 0 && j_angular > 0) { + result += i_angular * j_angular * pow(r_i, i_angular - 1) * + pow(r_j, j_angular - 1); + } + } + return result; } diff --git a/gpu4pyscf/pbc/dft/multigrid_v2.py b/gpu4pyscf/pbc/dft/multigrid_v2.py index 7377c8d3a..a7341de74 100644 --- a/gpu4pyscf/pbc/dft/multigrid_v2.py +++ b/gpu4pyscf/pbc/dft/multigrid_v2.py @@ -682,6 +682,12 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"): t0 = log.timer("task generation", *t0) t1 = t0 + derivative_order = 0 + if xc_type == 'GGA': + derivative_order = 1 + if xc_type == 'MGGA': + derivative_order = 2 + pairs = [] for grids_localized, grids_diffused in tasks: subcell_in_localized_region = grids_localized.cell @@ -702,7 +708,7 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"): libgpbc.update_dxyz_dabc(dxyz_dabc.ctypes) n_blocks_abc = np.asarray(np.ceil(mesh / block_size), dtype=cp.int32) equivalent_cell_in_localized, coeff_in_localized = ( - subcell_in_localized_region.decontract_basis(to_cart=True, aggregate=True) + subcell_in_localized_region.decontract_basis(to_cart=True) ) n_primitive_gtos_in_localized = multigrid._pgto_shells( @@ -1605,7 +1611,7 @@ def nr_rks(ni, cell, grids, xc_code, dm_kpts, relativity=0, hermi=1, xc_for_fock = xc_for_fock[0] xc_for_fock = xc_for_fock.reshape((-1, ngrids)) elif xc_type == 'MGGA': - xc_for_fock = contract_iG_potential(xc_for_fock, cell) + contract_iG_potential(xc_for_fock, cell) xc_for_fock = cp.concatenate( [ xc_for_fock[0].reshape((-1, ngrids)), @@ -1713,7 +1719,7 @@ def nr_uks(ni, cell, grids, xc_code, dm_kpts, relativity=0, hermi=1, elif xc_type == 'GGA': for i in xc_for_fock: contract_iG_potential(i, cell) - xc_for_fock = xc_for_fock.reshape((nset, -1, ngrids)) + xc_for_fock = xc_for_fock[:,0].reshape((nset, -1, ngrids)) elif xc_type == 'MGGA': for i in xc_for_fock: contract_iG_potential(i, cell) @@ -1819,7 +1825,8 @@ def get_veff_ip1( xc_for_fock = xc_for_fock[:, 0] xc_for_fock = xc_for_fock.reshape((nset, -1, ngrids)) elif xc_type == 'MGGA': - xc_for_fock[:, 0] -= contract('ngp, pg -> np', xc_for_fock[:, 1:4], Gv) * 1j + for i in xc_for_fock: + contract_iG_potential(i, cell) xc_for_fock = cp.concatenate( [ xc_for_fock[:, 0].reshape((nset, -1, ngrids)), From 97c4fb28475b8f1d1abb5f013f4435787cad7ff8 Mon Sep 17 00:00:00 2001 From: Rui Li Date: Thu, 19 Mar 2026 14:00:54 -0700 Subject: [PATCH 14/14] comment out tailoring to check numerical stability --- gpu4pyscf/pbc/dft/multigrid_v2.py | 60 +++++++++++++++---------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/gpu4pyscf/pbc/dft/multigrid_v2.py b/gpu4pyscf/pbc/dft/multigrid_v2.py index a7341de74..00b73e635 100644 --- a/gpu4pyscf/pbc/dft/multigrid_v2.py +++ b/gpu4pyscf/pbc/dft/multigrid_v2.py @@ -614,36 +614,36 @@ def assign_pairs_to_blocks( cast_to_pointer(env) ) - libgpbc.tailor_gaussian_pairs( - cast_to_pointer(pairs_on_blocks), - cast_to_pointer(n_pairs_on_blocks), - ctypes.c_int(i_angular), - ctypes.c_int(j_angular), - cast_to_pointer(non_trivial_pairs), - cast_to_pointer(i_shells), - cast_to_pointer(j_shells), - ctypes.c_int(len(j_shells)), - cast_to_pointer(shell_to_ao_indices), - cast_to_pointer(accumulated_n_pairs_per_block), - cast_to_pointer(sorted_block_index), - ctypes.c_int(n_contributing_blocks), - cast_to_pointer(image_indices), - cast_to_pointer(vectors_to_neighboring_images), - ctypes.c_int(len(vectors_to_neighboring_images)), - cast_to_pointer(mesh), - cast_to_pointer(atm), - cast_to_pointer(bas), - cast_to_pointer(env), - ctypes.c_int(is_non_orthogonal), - ctypes.c_double(threshold), - ctypes.c_int(derivative_order), - ) - - pairs_on_blocks = pairs_on_blocks[pairs_on_blocks >= 0] - sorted_block_index = cp.asarray(cp.argsort(-n_pairs_on_blocks), dtype=cp.int32) - n_contributing_blocks = cp.count_nonzero(n_pairs_on_blocks) - accumulated_n_pairs_per_block[1:] = cp.cumsum(n_pairs_on_blocks, dtype=cp.int32) - sorted_block_index = sorted_block_index[:n_contributing_blocks] + # libgpbc.tailor_gaussian_pairs( + # cast_to_pointer(pairs_on_blocks), + # cast_to_pointer(n_pairs_on_blocks), + # ctypes.c_int(i_angular), + # ctypes.c_int(j_angular), + # cast_to_pointer(non_trivial_pairs), + # cast_to_pointer(i_shells), + # cast_to_pointer(j_shells), + # ctypes.c_int(len(j_shells)), + # cast_to_pointer(shell_to_ao_indices), + # cast_to_pointer(accumulated_n_pairs_per_block), + # cast_to_pointer(sorted_block_index), + # ctypes.c_int(n_contributing_blocks), + # cast_to_pointer(image_indices), + # cast_to_pointer(vectors_to_neighboring_images), + # ctypes.c_int(len(vectors_to_neighboring_images)), + # cast_to_pointer(mesh), + # cast_to_pointer(atm), + # cast_to_pointer(bas), + # cast_to_pointer(env), + # ctypes.c_int(is_non_orthogonal), + # ctypes.c_double(threshold), + # ctypes.c_int(derivative_order), + # ) + + # pairs_on_blocks = pairs_on_blocks[pairs_on_blocks >= 0] + # sorted_block_index = cp.asarray(cp.argsort(-n_pairs_on_blocks), dtype=cp.int32) + # n_contributing_blocks = cp.count_nonzero(n_pairs_on_blocks) + # accumulated_n_pairs_per_block[1:] = cp.cumsum(n_pairs_on_blocks, dtype=cp.int32) + # sorted_block_index = sorted_block_index[:n_contributing_blocks] return ( pairs_on_blocks,