From 12b935421437bc9b1c64ad215046c1e91fe23f45 Mon Sep 17 00:00:00 2001
From: Rui Li <rli3@caltech.edu>
Date: Fri, 8 Aug 2025 18:34:20 -0700
Subject: [PATCH 01/14] cache pseudo potential to avoid redundant timing in
 gradient

---
 gpu4pyscf/pbc/dft/multigrid_v2.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/gpu4pyscf/pbc/dft/multigrid_v2.py b/gpu4pyscf/pbc/dft/multigrid_v2.py
index ef7aebd5e..fa9441856 100644
--- a/gpu4pyscf/pbc/dft/multigrid_v2.py
+++ b/gpu4pyscf/pbc/dft/multigrid_v2.py
@@ -1263,7 +1263,7 @@ def get_veff_ip1(
         xc_for_fock[:, 0] += coulomb_on_g_mesh
 
     if cell._pseudo:
-        xc_for_fock[:, 0] += multigrid_v1.eval_vpplocG_part1(cell, mesh)
+        xc_for_fock[:, 0] += ni.cached_vpplocG_part1
 
     veff_gradient = convert_xc_on_g_mesh_to_fock_gradient(
         ni, xc_for_fock, dm_kpts, hermi, kpts_band
@@ -1279,6 +1279,9 @@ def __init__(self, cell):
         self.mesh = cell.mesh
         self.tasks = None
         self.sorted_gaussian_pairs = None
+        self.build()
+        if cell._pseudo:
+            self.cached_vpplocG_part1 = multigrid_v1.eval_vpplocG_part1(cell, self.mesh)
 
     build = sort_gaussian_pairs
 

From e3b9bf8927b207eaaccc8a80ab7d1e7222aeba12 Mon Sep 17 00:00:00 2001
From: Rui Li <rli3@caltech.edu>
Date: Sun, 26 Oct 2025 00:11:51 -0700
Subject: [PATCH 02/14] add aggressive screening of gaussian pairs around the
 boundaries

---
 .../lib/multigrid/multigrid_v2/screen.cu      |  30 ++
 .../lib/multigrid/multigrid_v2/screening.cuh  | 318 +++++++++++++++-
 .../lib/multigrid/multigrid_v2/utils.cuh      |  23 ++
 gpu4pyscf/pbc/dft/multigrid_v2.py             | 338 ++++++++++++------
 4 files changed, 603 insertions(+), 106 deletions(-)

diff --git a/gpu4pyscf/lib/multigrid/multigrid_v2/screen.cu b/gpu4pyscf/lib/multigrid/multigrid_v2/screen.cu
index b6574d95d..39ebbcc39 100644
--- a/gpu4pyscf/lib/multigrid/multigrid_v2/screen.cu
+++ b/gpu4pyscf/lib/multigrid/multigrid_v2/screen.cu
@@ -206,4 +206,34 @@ void put_pairs_on_blocks(
 
   checkCudaErrors(cudaPeekAtLastError());
 }
+
+int tailor_gaussian_pairs(
+    int *sorted_pairs_per_local_grid, int *n_pairs_per_local_grid,
+    const int i_angular, const int j_angular, const int *non_trivial_pairs,
+    const int *i_shells, const int *j_shells, const int n_j_shells,
+    const int *shell_to_ao_indices,
+    const int *accumulated_n_pairs_per_local_grid,
+    const int *sorted_block_index, const int n_contributing_blocks,
+    const int *image_indices, const double *vectors_to_neighboring_images,
+    const int n_images, const int *mesh, const int *atm, const int *bas,
+    const double *env, const int is_non_orthogonal, const double threshold,
+    const int derivative_order) {
+  if (is_non_orthogonal) {
+    return gpu4pyscf::gpbc::multi_grid::tailor_gaussian_pairs_driver<true>(
+        sorted_pairs_per_local_grid, n_pairs_per_local_grid, i_angular,
+        j_angular, non_trivial_pairs, i_shells, j_shells, n_j_shells,
+        shell_to_ao_indices, accumulated_n_pairs_per_local_grid,
+        sorted_block_index, n_contributing_blocks, image_indices,
+        vectors_to_neighboring_images, n_images, mesh, atm, bas, env, threshold,
+        derivative_order);
+  } else {
+    return gpu4pyscf::gpbc::multi_grid::tailor_gaussian_pairs_driver<false>(
+        sorted_pairs_per_local_grid, n_pairs_per_local_grid, i_angular,
+        j_angular, non_trivial_pairs, i_shells, j_shells, n_j_shells,
+        shell_to_ao_indices, accumulated_n_pairs_per_local_grid,
+        sorted_block_index, n_contributing_blocks, image_indices,
+        vectors_to_neighboring_images, n_images, mesh, atm, bas, env, threshold,
+        derivative_order);
+  }
+}
 }
diff --git a/gpu4pyscf/lib/multigrid/multigrid_v2/screening.cuh b/gpu4pyscf/lib/multigrid/multigrid_v2/screening.cuh
index b9adb4ea6..fa3971250 100644
--- a/gpu4pyscf/lib/multigrid/multigrid_v2/screening.cuh
+++ b/gpu4pyscf/lib/multigrid/multigrid_v2/screening.cuh
@@ -26,7 +26,7 @@
 
 #define EIJ_CUTOFF 60
 #define BLOCK_DIM_XYZ 4
-#define EXP_OVERFLOW 400 
+#define EXP_OVERFLOW 400
 
 namespace gpu4pyscf::gpbc::multi_grid {
 
@@ -644,4 +644,320 @@ __global__ void put_pairs_on_blocks_kernel(
   }
 }
 
+template <int i_angular, int j_angular, bool is_non_orthogonal>
+__global__ static void tailor_gaussian_pairs_kernel(
+    int *sorted_pairs_per_local_grid, int *n_pairs_per_local_grid,
+    const int *non_trivial_pairs, const int *i_shells, const int *j_shells,
+    const int n_j_shells, const int *shell_to_ao_indices,
+    const int *accumulated_n_pairs_per_local_grid,
+    const int *sorted_block_index, const int *image_indices,
+    const double *vectors_to_neighboring_images, const int n_images,
+    const int mesh_a, const int mesh_b, const int mesh_c, const int *atm,
+    const int *bas, const double *env, const double threshold,
+    const int derivative_order) {
+  constexpr int n_threads = BLOCK_DIM_XYZ * BLOCK_DIM_XYZ * BLOCK_DIM_XYZ;
+
+  const int block_index = sorted_block_index[blockIdx.x];
+  const int n_blocks_b = (mesh_b + BLOCK_DIM_XYZ - 1) / BLOCK_DIM_XYZ;
+  const int n_blocks_c = (mesh_c + BLOCK_DIM_XYZ - 1) / BLOCK_DIM_XYZ;
+
+  const int block_a_stride = n_blocks_b * n_blocks_c;
+  const int block_a_index = block_index / block_a_stride;
+  const int block_ab_index = block_index % block_a_stride;
+  const int block_b_index = block_ab_index / n_blocks_c;
+  const int block_c_index = block_ab_index % n_blocks_c;
+
+  const int a_start = block_a_index * BLOCK_DIM_XYZ;
+  const int b_start = block_b_index * BLOCK_DIM_XYZ;
+  const int c_start = block_c_index * BLOCK_DIM_XYZ;
+
+  const double start_position_x =
+      dxyz_dabc[0] * a_start + dxyz_dabc[3] * b_start + dxyz_dabc[6] * c_start;
+  const double start_position_y =
+      dxyz_dabc[1] * a_start + dxyz_dabc[4] * b_start + dxyz_dabc[7] * c_start;
+  const double start_position_z =
+      dxyz_dabc[2] * a_start + dxyz_dabc[5] * b_start + dxyz_dabc[8] * c_start;
+
+  const double a_dot_b = dxyz_dabc[0] * dxyz_dabc[3] +
+                         dxyz_dabc[1] * dxyz_dabc[4] +
+                         dxyz_dabc[2] * dxyz_dabc[5];
+  const double a_dot_c = dxyz_dabc[0] * dxyz_dabc[6] +
+                         dxyz_dabc[1] * dxyz_dabc[7] +
+                         dxyz_dabc[2] * dxyz_dabc[8];
+  const double b_dot_c = dxyz_dabc[3] * dxyz_dabc[6] +
+                         dxyz_dabc[4] * dxyz_dabc[7] +
+                         dxyz_dabc[5] * dxyz_dabc[8];
+
+  const int a_upper = min(a_start + BLOCK_DIM_XYZ, mesh_a) - a_start;
+  const int b_upper = min(b_start + BLOCK_DIM_XYZ, mesh_b) - b_start;
+  const int c_upper = min(c_start + BLOCK_DIM_XYZ, mesh_c) - c_start;
+
+  const int thread_id = threadIdx.x + threadIdx.y * BLOCK_DIM_XYZ +
+                        threadIdx.z * BLOCK_DIM_XYZ * BLOCK_DIM_XYZ;
+
+  const int start_pair_index = accumulated_n_pairs_per_local_grid[block_index];
+  const int end_pair_index =
+      accumulated_n_pairs_per_local_grid[block_index + 1];
+  const int n_pairs = end_pair_index - start_pair_index;
+  const int n_batches = (n_pairs + n_threads - 1) / n_threads;
+
+  for (int i_batch = 0, i_pair_index = start_pair_index + thread_id;
+       i_batch < n_batches; i_batch++, i_pair_index += n_threads) {
+    const bool is_valid_pair = i_pair_index < end_pair_index;
+    const int i_pair =
+        is_valid_pair ? sorted_pairs_per_local_grid[i_pair_index] : 0;
+
+    const int image_index = image_indices[i_pair];
+    const int image_index_i = image_index / n_images;
+    const int image_index_j = image_index % n_images;
+
+    const int shell_pair_index = non_trivial_pairs[i_pair];
+    const int i_shell_index = shell_pair_index / n_j_shells;
+    const int j_shell_index = shell_pair_index % n_j_shells;
+    const int i_shell = i_shells[i_shell_index];
+    const int j_shell = j_shells[j_shell_index];
+
+    const double i_exponent = env[bas(PTR_EXP, i_shell)];
+    const int i_coord_offset = atm(PTR_COORD, bas(ATOM_OF, i_shell));
+    const double i_x =
+        env[i_coord_offset] + vectors_to_neighboring_images[image_index_i * 3];
+    const double i_y = env[i_coord_offset + 1] +
+                       vectors_to_neighboring_images[image_index_i * 3 + 1];
+    const double i_z = env[i_coord_offset + 2] +
+                       vectors_to_neighboring_images[image_index_i * 3 + 2];
+    const double i_coeff = env[bas(PTR_COEFF, i_shell)];
+
+    const double j_exponent = env[bas(PTR_EXP, j_shell)];
+    const int j_coord_offset = atm(PTR_COORD, bas(ATOM_OF, j_shell));
+    const double j_x =
+        env[j_coord_offset] + vectors_to_neighboring_images[image_index_j * 3];
+    const double j_y = env[j_coord_offset + 1] +
+                       vectors_to_neighboring_images[image_index_j * 3 + 1];
+    const double j_z = env[j_coord_offset + 2] +
+                       vectors_to_neighboring_images[image_index_j * 3 + 2];
+    const double j_coeff = env[bas(PTR_COEFF, j_shell)];
+
+    const double ij_exponent = i_exponent + j_exponent;
+    const double ij_exponent_in_prefactor =
+        i_exponent * j_exponent / ij_exponent *
+        distance_squared(i_x - j_x, i_y - j_y, i_z - j_z);
+
+    const double pair_x = (i_exponent * i_x + j_exponent * j_x) / ij_exponent;
+    const double pair_y = (i_exponent * i_y + j_exponent * j_y) / ij_exponent;
+    const double pair_z = (i_exponent * i_z + j_exponent * j_z) / ij_exponent;
+
+    const double x0 = start_position_x - pair_x;
+    const double y0 = start_position_y - pair_y;
+    const double z0 = start_position_z - pair_z;
+
+    const double gaussian_exponent_at_reference =
+        ij_exponent * distance_squared(x0, y0, z0);
+
+    const double pair_prefactor = i_coeff * j_coeff *
+                                  common_fac_sp<double, i_angular>() *
+                                  common_fac_sp<double, j_angular>();
+
+    const double gaussian_starting_point =
+        is_valid_pair
+            ? exp(-(ij_exponent_in_prefactor + gaussian_exponent_at_reference) /
+                  3.0)
+            : 0;
+
+    const double da_squared =
+        distance_squared(dxyz_dabc[0], dxyz_dabc[1], dxyz_dabc[2]);
+    const double db_squared =
+        distance_squared(dxyz_dabc[3], dxyz_dabc[4], dxyz_dabc[5]);
+    const double dc_squared =
+        distance_squared(dxyz_dabc[6], dxyz_dabc[7], dxyz_dabc[8]);
+
+    const double exp_da_squared = exp(-2 * ij_exponent * da_squared);
+    const double exp_db_squared = exp(-2 * ij_exponent * db_squared);
+    const double exp_dc_squared = exp(-2 * ij_exponent * dc_squared);
+
+    const double cross_term_a =
+        dxyz_dabc[0] * x0 + dxyz_dabc[1] * y0 + dxyz_dabc[2] * z0;
+    const double cross_term_b =
+        dxyz_dabc[3] * x0 + dxyz_dabc[4] * y0 + dxyz_dabc[5] * z0;
+    const double cross_term_c =
+        dxyz_dabc[6] * x0 + dxyz_dabc[7] * y0 + dxyz_dabc[8] * z0;
+
+    const double recursion_factor_a_start =
+        exp(-ij_exponent * (2 * cross_term_a + da_squared));
+    const double recursion_factor_b_start =
+        exp(-ij_exponent * (2 * cross_term_b + db_squared));
+    const double recursion_factor_c_start =
+        exp(-ij_exponent * (2 * cross_term_c + dc_squared));
+
+    const double exp_dadb = exp(-2 * ij_exponent * a_dot_b);
+    const double exp_dadc = exp(-2 * ij_exponent * a_dot_c);
+    const double exp_dbdc = exp(-2 * ij_exponent * b_dot_c);
+
+    int a_index, b_index, c_index;
+    double x, y, z;
+    double gaussian_x, gaussian_y, gaussian_z, recursion_factor_a,
+        recursion_factor_b, recursion_factor_c;
+    double recursion_factor_ab_pow_a = 1;
+    double recursion_factor_ac_pow_a = 1;
+    double recursion_factor_bc_pow_b = 1;
+
+    if constexpr (is_non_orthogonal) {
+      // recursion_factor_ab_pow_a = 1;
+      // recursion_factor_ac_pow_a = 1;
+    } else {
+      x = start_position_x;
+    }
+
+    double max_gaussian_value = 0;
+
+    for (a_index = 0, gaussian_x = gaussian_starting_point,
+        recursion_factor_a = recursion_factor_a_start;
+         a_index < a_upper; a_index++, gaussian_x *= recursion_factor_a,
+        recursion_factor_a *= exp_da_squared) {
+
+      if constexpr (is_non_orthogonal) {
+        recursion_factor_bc_pow_b = 1;
+      } else {
+        y = start_position_y;
+      }
+      for (b_index = 0, gaussian_y = gaussian_starting_point,
+          recursion_factor_b = recursion_factor_b_start;
+           b_index < b_upper; b_index++,
+          gaussian_y *= recursion_factor_b * recursion_factor_ab_pow_a,
+          recursion_factor_b *= exp_db_squared) {
+
+        if constexpr (is_non_orthogonal) {
+          x = start_position_x + a_index * dxyz_dabc[0] +
+              b_index * dxyz_dabc[3];
+          y = start_position_y + a_index * dxyz_dabc[1] +
+              b_index * dxyz_dabc[4];
+          z = start_position_z + a_index * dxyz_dabc[2] +
+              b_index * dxyz_dabc[5];
+        } else {
+          z = start_position_z;
+        }
+        for (c_index = 0, gaussian_z = gaussian_starting_point,
+            recursion_factor_c = recursion_factor_c_start;
+             c_index < c_upper; c_index++,
+            gaussian_z *= recursion_factor_c * recursion_factor_ac_pow_a *
+                                recursion_factor_bc_pow_b,
+            recursion_factor_c *= exp_dc_squared) {
+
+          const double r_i_squared =
+              distance_squared(x - i_x, y - i_y, z - i_z);
+          const double r_j_squared =
+              distance_squared(x - j_x, y - j_y, z - j_z);
+          const double r_p_squared =
+              distance_squared(x - pair_x, y - pair_y, z - pair_z);
+
+          const double approxmate_polynomial =
+              approximate_polynomial_value<double, i_angular, j_angular>(
+                  r_i_squared, r_j_squared, r_p_squared, derivative_order);
+
+          const double gaussian = gaussian_x * gaussian_y * gaussian_z;
+
+          const double approximate_value =
+              abs(4.0 * M_PI * r_p_squared * pair_prefactor *
+                  approxmate_polynomial * gaussian);
+
+          max_gaussian_value = max(max_gaussian_value, approximate_value);
+
+          if constexpr (is_non_orthogonal) {
+            x += dxyz_dabc[6];
+            y += dxyz_dabc[7];
+            z += dxyz_dabc[8];
+          } else {
+            z += dxyz_dabc[8];
+          }
+        }
+
+        if constexpr (is_non_orthogonal) {
+          recursion_factor_bc_pow_b *= exp_dbdc;
+        } else {
+          y += dxyz_dabc[4];
+        }
+      }
+
+      if constexpr (is_non_orthogonal) {
+        recursion_factor_ab_pow_a *= exp_dadb;
+        recursion_factor_ac_pow_a *= exp_dadc;
+      } else {
+        x += dxyz_dabc[0];
+      }
+    }
+
+    if (max_gaussian_value < threshold && is_valid_pair) {
+      sorted_pairs_per_local_grid[i_pair_index] = -1;
+      atomicAdd(n_pairs_per_local_grid + block_index, -1);
+    }
+  }
+}
+
+#define tailor_gaussian_pairs_kernel_macro(li, lj)                             \
+  tailor_gaussian_pairs_kernel<li, lj, is_non_orthogonal>                      \
+      <<<block_grid, block_size>>>(                                            \
+          sorted_pairs_per_local_grid, n_pairs_per_local_grid,                 \
+          non_trivial_pairs, i_shells, j_shells, n_j_shells,                   \
+          shell_to_ao_indices, accumulated_n_pairs_per_local_grid,             \
+          sorted_block_index, image_indices, vectors_to_neighboring_images,    \
+          n_images, mesh_a, mesh_b, mesh_c, atm, bas, env, threshold,          \
+          derivative_order);
+
+#define tailor_gaussian_pairs_kernel_case_macro(li, lj)                        \
+  case (li * 10 + lj):                                                         \
+    tailor_gaussian_pairs_kernel_macro(li, lj);                                \
+    break
+
+template <bool is_non_orthogonal>
+int tailor_gaussian_pairs_driver(
+    int *sorted_pairs_per_local_grid, int *n_pairs_per_local_grid,
+    const int i_angular, const int j_angular, const int *non_trivial_pairs,
+    const int *i_shells, const int *j_shells, const int n_j_shells,
+    const int *shell_to_ao_indices,
+    const int *accumulated_n_pairs_per_local_grid,
+    const int *sorted_block_index, const int n_contributing_blocks,
+    const int *image_indices, const double *vectors_to_neighboring_images,
+    const int n_images, const int *mesh, const int *atm, const int *bas,
+    const double *env, const double threshold, const int derivative_order) {
+  dim3 block_size(BLOCK_DIM_XYZ, BLOCK_DIM_XYZ, BLOCK_DIM_XYZ);
+  int mesh_a = mesh[0];
+  int mesh_b = mesh[1];
+  int mesh_c = mesh[2];
+  dim3 block_grid(n_contributing_blocks, 1, 1);
+  switch (i_angular * 10 + j_angular) {
+    tailor_gaussian_pairs_kernel_case_macro(0, 0);
+    tailor_gaussian_pairs_kernel_case_macro(0, 1);
+    tailor_gaussian_pairs_kernel_case_macro(0, 2);
+    tailor_gaussian_pairs_kernel_case_macro(0, 3);
+    tailor_gaussian_pairs_kernel_case_macro(0, 4);
+    tailor_gaussian_pairs_kernel_case_macro(1, 0);
+    tailor_gaussian_pairs_kernel_case_macro(1, 1);
+    tailor_gaussian_pairs_kernel_case_macro(1, 2);
+    tailor_gaussian_pairs_kernel_case_macro(1, 3);
+    tailor_gaussian_pairs_kernel_case_macro(1, 4);
+    tailor_gaussian_pairs_kernel_case_macro(2, 0);
+    tailor_gaussian_pairs_kernel_case_macro(2, 1);
+    tailor_gaussian_pairs_kernel_case_macro(2, 2);
+    tailor_gaussian_pairs_kernel_case_macro(2, 3);
+    tailor_gaussian_pairs_kernel_case_macro(2, 4);
+    tailor_gaussian_pairs_kernel_case_macro(3, 0);
+    tailor_gaussian_pairs_kernel_case_macro(3, 1);
+    tailor_gaussian_pairs_kernel_case_macro(3, 2);
+    tailor_gaussian_pairs_kernel_case_macro(3, 3);
+    tailor_gaussian_pairs_kernel_case_macro(3, 4);
+    tailor_gaussian_pairs_kernel_case_macro(4, 0);
+    tailor_gaussian_pairs_kernel_case_macro(4, 1);
+    tailor_gaussian_pairs_kernel_case_macro(4, 2);
+    tailor_gaussian_pairs_kernel_case_macro(4, 3);
+    tailor_gaussian_pairs_kernel_case_macro(4, 4);
+  default:
+    fprintf(stderr,
+            "angular momentum pair %d, %d is not supported in "
+            "evaluate_density_driver\n",
+            i_angular, j_angular);
+    return 1;
+  }
+
+  return checkCudaErrors(cudaPeekAtLastError());
+}
+
 } // namespace gpu4pyscf::gpbc::multi_grid
diff --git a/gpu4pyscf/lib/multigrid/multigrid_v2/utils.cuh b/gpu4pyscf/lib/multigrid/multigrid_v2/utils.cuh
index cd774255d..a878d76dd 100644
--- a/gpu4pyscf/lib/multigrid/multigrid_v2/utils.cuh
+++ b/gpu4pyscf/lib/multigrid/multigrid_v2/utils.cuh
@@ -26,6 +26,29 @@ __host__ __device__ T distance_squared(const T x, const T y, const T z) {
   return x * x + y * y + z * z;
 }
 
+template <typename T, int i_angular, int j_angular>
+__host__ __device__ T approximate_polynomial_value(const double r_i_squared,
+                                                   const double r_j_squared,
+                                                   const double r_p_squared,
+                                                   const int derivative_order) {
+  T result =
+      pow(r_i_squared, i_angular / 2.0) * pow(r_j_squared, j_angular / 2.0);
+
+  if (derivative_order == 1) {
+    result *= -2.0 * sqrt(r_p_squared);
+    if constexpr (i_angular > 0) {
+      result += i_angular * pow(r_i_squared, (i_angular - 2) / 2.0) *
+                pow(r_j_squared, j_angular / 2.0);
+    }
+
+    if constexpr (j_angular > 0) {
+      result += j_angular * pow(r_i_squared, i_angular / 2.0) *
+                pow(r_j_squared, (j_angular - 2) / 2.0);
+    }
+  }
+  return result;
+}
+
 template <typename T, int ANG> __device__ constexpr T common_fac_sp() {
   if constexpr (ANG == 0) {
     return 0.282094791773878143;
diff --git a/gpu4pyscf/pbc/dft/multigrid_v2.py b/gpu4pyscf/pbc/dft/multigrid_v2.py
index e09b4963a..32d49fc15 100644
--- a/gpu4pyscf/pbc/dft/multigrid_v2.py
+++ b/gpu4pyscf/pbc/dft/multigrid_v2.py
@@ -36,7 +36,7 @@
 import gpu4pyscf.pbc.dft.multigrid as multigrid_v1
 from gpu4pyscf.lib.cupy_helper import contract, tag_array, load_library
 
-__all__ = ['MultiGridNumInt']
+__all__ = ["MultiGridNumInt"]
 
 libgpbc = load_library("libmgrid_v2")
 libgpbc.evaluate_density_driver.restype = ctypes.c_int
@@ -76,19 +76,23 @@ def ifft_in_place(x):
 def unique_with_sort(x):
     # This function does the same thing as cp.unique(x, return_inverse=True).
     # It's not super optimized, but for whatever reason, cp.unique is very slow, so this one is better.
-    assert type(x) is cp.ndarray and (x.dtype == cp.int32 or x.dtype == cp.int64) and x.ndim == 1
+    assert (
+        type(x) is cp.ndarray
+        and (x.dtype == cp.int32 or x.dtype == cp.int64)
+        and x.ndim == 1
+    )
     n = x.shape[0]
     if n <= 1:
         return x, cp.zeros(n)
 
     sort_index = cp.argsort(x)
-    inverse_sort = cp.empty(n, dtype = cp.int64)
-    inverse_sort[sort_index] = cp.arange(0, n, dtype = cp.int64)
+    inverse_sort = cp.empty(n, dtype=cp.int64)
+    inverse_sort[sort_index] = cp.arange(0, n, dtype=cp.int64)
     x = x[sort_index]
 
     mask = cp.empty(n, dtype=cp.bool_)
     mask[0] = True
-    mask[1:] = (x[1:] != x[:-1])
+    mask[1:] = x[1:] != x[:-1]
 
     x = x[mask]
     inverse_unique = cp.cumsum(mask, dtype=cp.int64) - 1
@@ -106,24 +110,29 @@ def image_pair_to_difference(
     translation_vectors = cp.asarray(
         cp.linalg.solve(lattice_vectors.T, vectors_to_neighboring_images.T).T,
     )
-    translation_vectors = cp.asarray(cp.round(translation_vectors), dtype = cp.int32)
+    translation_vectors = cp.asarray(cp.round(translation_vectors), dtype=cp.int32)
     image_difference_full = (
         # k_j - k_i corresponding to <i|j>
-        translation_vectors[None,:,:] - translation_vectors[:,None,:]
+        translation_vectors[None, :, :]
+        - translation_vectors[:, None, :]
     ).reshape(-1, 3)
 
     max_offset = cp.max(cp.abs(image_difference_full)) + 1
-    assert (max_offset * 2)**3 < np.iinfo(np.int32).max
+    assert (max_offset * 2) ** 3 < np.iinfo(np.int32).max
     image_difference_3in1 = image_difference_full + max_offset
-    image_difference_3in1 = image_difference_3in1[:, 0] * (max_offset * 2)**2 \
-                          + image_difference_3in1[:, 1] * (max_offset * 2) \
-                          + image_difference_3in1[:, 2]
+    image_difference_3in1 = (
+        image_difference_3in1[:, 0] * (max_offset * 2) ** 2
+        + image_difference_3in1[:, 1] * (max_offset * 2)
+        + image_difference_3in1[:, 2]
+    )
 
     image_difference_3in1, inverse = unique_with_sort(image_difference_3in1)
 
-    translation_vectors = cp.empty([image_difference_3in1.shape[0], 3], dtype = cp.int32)
-    translation_vectors[:, 0] = image_difference_3in1 // (max_offset * 2)**2
-    translation_vectors[:, 1] = (image_difference_3in1 % (max_offset * 2)**2) // (max_offset * 2)
+    translation_vectors = cp.empty([image_difference_3in1.shape[0], 3], dtype=cp.int32)
+    translation_vectors[:, 0] = image_difference_3in1 // (max_offset * 2) ** 2
+    translation_vectors[:, 1] = (image_difference_3in1 % (max_offset * 2) ** 2) // (
+        max_offset * 2
+    )
     translation_vectors[:, 2] = image_difference_3in1 % (max_offset * 2)
     translation_vectors -= max_offset
 
@@ -138,6 +147,7 @@ def image_pair_to_difference(
     # where R1 is associated with the first orbital in a pair, and R2 associated to the second.
     return cp.asarray(difference_images), cp.asarray(inverse, dtype=cp.int32)
 
+
 def image_phase_for_kpts(cell, neighboring_images, kpts=None):
     n_images = len(neighboring_images)
     if kpts is None or is_gamma_point(kpts):
@@ -154,6 +164,7 @@ def image_phase_for_kpts(cell, neighboring_images, kpts=None):
         )
     return phase_diff_among_images, image_pair_difference_index
 
+
 def count_non_trivial_pairs(
     i_angular,
     j_angular,
@@ -187,7 +198,9 @@ def count_non_trivial_pairs(
         ctypes.c_double(threshold_in_log),
     )
     if err != 0:
-        raise RuntimeError(f'count_non_trivial_pairs for li={i_angular} lj={j_angular} failed')
+        raise RuntimeError(
+            f"count_non_trivial_pairs for li={i_angular} lj={j_angular} failed"
+        )
     return int(n_pairs[0])
 
 
@@ -243,7 +256,9 @@ def screen_gaussian_pairs(
         ctypes.c_double(threshold_in_log),
     )
     if err != 0:
-        raise RuntimeError(f'screen_gaussian_pairs for li={i_angular} lj={j_angular} failed')
+        raise RuntimeError(
+            f"screen_gaussian_pairs for li={i_angular} lj={j_angular} failed"
+        )
     return (
         screened_shell_pairs,
         image_indices,
@@ -257,20 +272,26 @@ def assign_pairs_to_blocks(
     pairs_to_blocks_end,
     n_blocks_abc,
     n_indices,
+    i_angular,
+    j_angular,
     non_trivial_pairs,
     i_shells,
     j_shells,
+    shell_to_ao_indices,
     image_indices,
     vectors_to_neighboring_images,
     mesh,
     atm,
     bas,
     env,
-    has_warned_instability
+    has_warned_instability,
+    is_non_orthogonal,
+    threshold,
+    derivative_order,
 ):
     n_blocks = np.prod(n_blocks_abc)
     n_pairs_on_blocks = cp.zeros(n_blocks + 1, dtype=cp.int32)
-    n_unstable_pairs_on_blocks = cp.zeros(n_blocks + 1, dtype = cp.int32)
+    n_unstable_pairs_on_blocks = cp.zeros(n_blocks + 1, dtype=cp.int32)
     err = libgpbc.count_pairs_on_blocks(
         cast_to_pointer(n_pairs_on_blocks),
         cast_to_pointer(n_unstable_pairs_on_blocks),
@@ -288,16 +309,17 @@ def assign_pairs_to_blocks(
         cast_to_pointer(mesh),
         cast_to_pointer(atm),
         cast_to_pointer(bas),
-        cast_to_pointer(env)
+        cast_to_pointer(env),
     )
-    has_unstable_pairs = (n_unstable_pairs_on_blocks[-1] > 0)
+    has_unstable_pairs = n_unstable_pairs_on_blocks[-1] > 0
     if not has_warned_instability and has_unstable_pairs:
-        warnings.warn("Numerical instability may occur due to presence of core electrons or insufficient ke_cutoff.")
+        warnings.warn(
+            "Numerical instability may occur due to presence of core electrons or insufficient ke_cutoff."
+        )
         has_warned_instability = True
 
-
     if err != 0:
-        raise RuntimeError('count_pairs_on_blocks failed')
+        raise RuntimeError("count_pairs_on_blocks failed")
 
     n_contributing_blocks = int(n_pairs_on_blocks[-1])
     n_pairs_on_blocks = n_pairs_on_blocks[:-1]
@@ -325,14 +347,45 @@ def assign_pairs_to_blocks(
         cast_to_pointer(mesh),
         cast_to_pointer(atm),
         cast_to_pointer(bas),
-        cast_to_pointer(env)
+        cast_to_pointer(env),
+    )
+
+    libgpbc.tailor_gaussian_pairs(
+        cast_to_pointer(pairs_on_blocks),
+        cast_to_pointer(n_pairs_on_blocks),
+        ctypes.c_int(i_angular),
+        ctypes.c_int(j_angular),
+        cast_to_pointer(non_trivial_pairs),
+        cast_to_pointer(i_shells),
+        cast_to_pointer(j_shells),
+        ctypes.c_int(len(j_shells)),
+        cast_to_pointer(shell_to_ao_indices),
+        cast_to_pointer(accumulated_n_pairs_per_block),
+        cast_to_pointer(sorted_block_index),
+        ctypes.c_int(n_contributing_blocks),
+        cast_to_pointer(image_indices),
+        cast_to_pointer(vectors_to_neighboring_images),
+        ctypes.c_int(len(vectors_to_neighboring_images)),
+        cast_to_pointer(mesh),
+        cast_to_pointer(atm),
+        cast_to_pointer(bas),
+        cast_to_pointer(env),
+        ctypes.c_int(is_non_orthogonal),
+        ctypes.c_double(threshold),
+        ctypes.c_int(derivative_order),
     )
 
+    pairs_on_blocks = pairs_on_blocks[pairs_on_blocks >= 0]
+    sorted_block_index = cp.asarray(cp.argsort(-n_pairs_on_blocks), dtype=cp.int32)
+    n_contributing_blocks = cp.count_nonzero(n_pairs_on_blocks) 
+    accumulated_n_pairs_per_block[1:] = cp.cumsum(n_pairs_on_blocks, dtype=cp.int32)
+    sorted_block_index = sorted_block_index[:n_contributing_blocks]
+
     return (
         pairs_on_blocks,
         accumulated_n_pairs_per_block,
         sorted_block_index,
-        has_warned_instability
+        has_warned_instability,
     )
 
 
@@ -355,7 +408,7 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"):
     libgpbc.update_lattice_vectors(
         lattice_vectors.ctypes,
         reciprocal_lattice_vectors.ctypes,
-        reciprocal_norms.ctypes
+        reciprocal_norms.ctypes,
     )
 
     tasks = getattr(mydf, "tasks", None)
@@ -366,6 +419,13 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"):
     t0 = log.timer("task generation", *t0)
     t1 = t0
     pairs = []
+
+    derivative_order = 0
+    if xc_type == "GGA":
+        derivative_order = 1
+    if xc_type == "mGGA":
+        raise NotImplementedError("mGGA is not supported yet in multigrid_v2.")
+
     for grids_localized, grids_diffused in tasks:
         subcell_in_localized_region = grids_localized.cell
         # the original grids_localized.mesh has dtype=np.int64, which can cause
@@ -374,14 +434,14 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"):
 
         fft_grid = list(
             map(
-                lambda n_mesh_points: cp.round(cp.fft.fftfreq(
-                    n_mesh_points, 1.0 / n_mesh_points
-                )).astype(cp.int32),
+                lambda n_mesh_points: cp.round(
+                    cp.fft.fftfreq(n_mesh_points, 1.0 / n_mesh_points)
+                ).astype(cp.int32),
                 mesh,
             )
         )
 
-        dxyz_dabc = lattice_vectors / mesh[:,None]
+        dxyz_dabc = lattice_vectors / mesh[:, None]
         libgpbc.update_dxyz_dabc(dxyz_dabc.ctypes)
         n_blocks_abc = np.asarray(np.ceil(mesh / block_size), dtype=cp.int32)
         equivalent_cell_in_localized, coeff_in_localized = (
@@ -421,8 +481,8 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"):
         concatenated_coeff = cp.asarray(concatenated_coeff)
 
         n_primitive_gtos_in_two_regions = multigrid._pgto_shells(grouped_cell)
-        rad = vol**(-1./3) * cell.rcut + 1
-        surface = 4*np.pi * rad**2
+        rad = vol ** (-1.0 / 3) * cell.rcut + 1
+        surface = 4 * np.pi * rad**2
         lattice_sum_factor = surface
         precision = cell.precision / lattice_sum_factor
         threshold_in_log = np.log(precision)
@@ -497,27 +557,33 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"):
                     gaussian_pair_indices,
                     accumulated_counts,
                     sorted_contributing_blocks,
-                    has_warned_instability
+                    has_warned_instability,
                 ) = assign_pairs_to_blocks(
                     pairs_to_blocks_begin,
                     pairs_to_blocks_end,
                     n_blocks_abc,
                     n_indices,
+                    i_angular,
+                    j_angular,
                     screened_shell_pairs,
                     i_shells,
                     j_shells,
+                    shell_to_ao_indices,
                     image_indices,
                     vectors_to_neighboring_images,
                     mesh,
                     atm,
                     bas,
                     env,
-                    has_warned_instability
+                    has_warned_instability,
+                    is_non_orthogonal,
+                    cell.precision,
+                    derivative_order,
                 )
                 t1 = log.timer_debug2(
                     "assigning pairs to blocks in angular pair"
                     + str((i_angular, j_angular)),
-                    *t1
+                    *t1,
                 )
                 per_angular_pairs.append(
                     {
@@ -559,7 +625,9 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"):
     return mydf
 
 
-def evaluate_density_wrapper(pairs_info, dm_slice, img_phase, ignore_imag=True, with_tau=False):
+def evaluate_density_wrapper(
+    pairs_info, dm_slice, img_phase, ignore_imag=True, with_tau=False
+):
     if with_tau:
         c_driver = libgpbc.evaluate_density_tau_driver
     else:
@@ -600,9 +668,19 @@ def evaluate_density_wrapper(pairs_info, dm_slice, img_phase, ignore_imag=True,
     assert density_matrix_with_translation_real_part.size < np.iinfo(np.int32).max
 
     if with_tau:
-        density = cp.zeros((n_channels, 2, ) + tuple(pairs_info["mesh"]), dtype=density_matrix_with_translation_real_part.dtype)
+        density = cp.zeros(
+            (
+                n_channels,
+                2,
+            )
+            + tuple(pairs_info["mesh"]),
+            dtype=density_matrix_with_translation_real_part.dtype,
+        )
     else:
-        density = cp.zeros((n_channels,) + tuple(pairs_info["mesh"]), dtype=density_matrix_with_translation_real_part.dtype)
+        density = cp.zeros(
+            (n_channels,) + tuple(pairs_info["mesh"]),
+            dtype=density_matrix_with_translation_real_part.dtype,
+        )
 
     for gaussians_per_angular_pair in pairs_info["per_angular_pairs"]:
         (i_angular, j_angular) = gaussians_per_angular_pair["angular"]
@@ -637,11 +715,14 @@ def evaluate_density_wrapper(pairs_info, dm_slice, img_phase, ignore_imag=True,
             use_float_precision,
         )
         if err != 0:
-            raise RuntimeError(f'evaluate_density_driver for li={i_angular} lj={j_angular} failed')
+            raise RuntimeError(
+                f"evaluate_density_driver for li={i_angular} lj={j_angular} failed"
+            )
 
     return density
 
-def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type='LDA'):
+
+def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"):
     dm_kpts = cp.asarray(dm_kpts, order="C")
     dms = _format_dms(dm_kpts, kpts)
     n_channels, n_k_points = dms.shape[:2]
@@ -708,7 +789,7 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type='LDA'):
         img_phase = image_phase_for_kpts(cell, pairs["neighboring_images"], kpts)
         density = (
             evaluate_density_wrapper(
-                pairs, coeff_sandwiched_density_matrix, img_phase, with_tau = with_tau
+                pairs, coeff_sandwiched_density_matrix, img_phase, with_tau=with_tau
             )
             * weight_per_grid_point
         )
@@ -740,7 +821,7 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type='LDA'):
             ] += tau
 
     density_on_g_mesh = density_on_g_mesh.reshape([n_channels, density_slices, -1])
-    if xc_type != 'LDA':
+    if xc_type != "LDA":
         density_on_g_mesh[:, 1:4] = pbc_tools._get_Gv(mydf.cell, mydf.mesh).T
         density_on_g_mesh[:, 1:4] *= density_on_g_mesh[:, :1] * 1j
     return density_on_g_mesh
@@ -748,11 +829,13 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type='LDA'):
 
 def evaluate_xc_wrapper(pairs_info, xc_weights, img_phase, with_tau=False):
     if with_tau:
-        assert xc_weights.ndim == 3+2 and xc_weights.shape[1] == 2
+        assert xc_weights.ndim == 3 + 2 and xc_weights.shape[1] == 2
         n_channels = xc_weights.shape[0]
         # density_slices = 2
     else:
-        assert (xc_weights.ndim == 3+2 and xc_weights.shape[1] == 1) or (xc_weights.ndim == 3+1)
+        assert (xc_weights.ndim == 3 + 2 and xc_weights.shape[1] == 1) or (
+            xc_weights.ndim == 3 + 1
+        )
         n_channels = xc_weights.shape[0]
         # density_slices = 1
 
@@ -814,12 +897,12 @@ def evaluate_xc_wrapper(pairs_info, xc_weights, img_phase, with_tau=False):
         )
         fock += fock_slice
         if err != 0:
-            raise RuntimeError(f'evaluate_xc_driver for li={i_angular} lj={j_angular} failed')
+            raise RuntimeError(
+                f"evaluate_xc_driver for li={i_angular} lj={j_angular} failed"
+            )
 
     if not (n_k_points == 1 and n_difference_images == 1):
-        return cp.einsum(
-            "kt, ntij -> nkij", phase_diff_among_images, fock
-        )
+        return cp.einsum("kt, ntij -> nkij", phase_diff_among_images, fock)
     else:
         return fock
 
@@ -888,7 +971,9 @@ def convert_xc_on_g_mesh_to_fock(
         n_ao_in_localized = len(pairs["ao_indices_in_localized"])
         libgpbc.update_dxyz_dabc(pairs["dxyz_dabc"].ctypes)
         img_phase = image_phase_for_kpts(cell, pairs["neighboring_images"], kpts)
-        fock_slice = evaluate_xc_wrapper(pairs, interpolated_xc, img_phase, with_tau=with_tau)
+        fock_slice = evaluate_xc_wrapper(
+            pairs, interpolated_xc, img_phase, with_tau=with_tau
+        )
         fock_slice = cp.einsum("nkpq,pi->nkiq", fock_slice, pairs["coeff_in_localized"])
         fock_slice = cp.einsum("nkiq,qj->nkij", fock_slice, pairs["concatenated_coeff"])
 
@@ -929,14 +1014,22 @@ def convert_xc_on_g_mesh_to_fock(
 
 
 def evaluate_xc_gradient_wrapper(
-    gradient, pairs_info, xc_weights, dm_slice, img_phase, ignore_imag=True, with_tau=False
+    gradient,
+    pairs_info,
+    xc_weights,
+    dm_slice,
+    img_phase,
+    ignore_imag=True,
+    with_tau=False,
 ):
     if with_tau:
-        assert xc_weights.ndim == 3+2 and xc_weights.shape[1] == 2
+        assert xc_weights.ndim == 3 + 2 and xc_weights.shape[1] == 2
         n_channels = xc_weights.shape[0]
         # density_slices = 2
     else:
-        assert (xc_weights.ndim == 3+2 and xc_weights.shape[1] == 1) or (xc_weights.ndim == 3+1)
+        assert (xc_weights.ndim == 3 + 2 and xc_weights.shape[1] == 1) or (
+            xc_weights.ndim == 3 + 1
+        )
         n_channels = xc_weights.shape[0]
         # density_slices = 1
 
@@ -1006,7 +1099,9 @@ def evaluate_xc_gradient_wrapper(
             use_float_precision,
         )
         if err != 0:
-            raise RuntimeError(f'evaluate_xc_gradient_driver for li={i_angular} lj={j_angular} failed')
+            raise RuntimeError(
+                f"evaluate_xc_gradient_driver for li={i_angular} lj={j_angular} failed"
+            )
 
 
 def convert_xc_on_g_mesh_to_fock_gradient(
@@ -1088,7 +1183,8 @@ def convert_xc_on_g_mesh_to_fock_gradient(
 
     return gradient
 
-#FIXME: merge to multigrid_v1.get_pp
+
+# FIXME: merge to multigrid_v1.get_pp
 def get_nuc(ni, kpts=None):
     if ni.sorted_gaussian_pairs is None:
         ni.build()
@@ -1102,7 +1198,8 @@ def get_nuc(ni, kpts=None):
         vne = vne[0]
     return vne
 
-#FIXME: merge to multigrid_v1.get_pp
+
+# FIXME: merge to multigrid_v1.get_pp
 def get_pp(ni, kpts=None):
     """Get the periodic pseudopotential nuc-el AO matrix, with G=0 removed."""
     if ni.sorted_gaussian_pairs is None:
@@ -1132,8 +1229,9 @@ def get_pp(ni, kpts=None):
     log.timer("get_pp", *t0)
     return vpp
 
+
 def get_j_kpts(ni, dm_kpts, hermi=1, kpts=None, kpts_band=None):
-    '''Get the Coulomb (J) AO matrix at sampled k-points.
+    """Get the Coulomb (J) AO matrix at sampled k-points.
 
     Args:
         dm_kpts : (nkpts, nao, nao) ndarray or a list of (nkpts,nao,nao) ndarray
@@ -1149,7 +1247,7 @@ def get_j_kpts(ni, dm_kpts, hermi=1, kpts=None, kpts_band=None):
     Returns:
         vj : (nkpts, nao, nao) ndarray
         or list of vj if the input dm_kpts is a list of DMs
-    '''
+    """
     if kpts is None:
         kpts = np.zeros((1, 3))
     cell = ni.cell
@@ -1165,9 +1263,7 @@ def get_j_kpts(ni, dm_kpts, hermi=1, kpts=None, kpts_band=None):
     Gv = pbc_tools._get_Gv(cell, mesh)
     coulomb_kernel_on_g_mesh = pbc_tools.get_coulG(cell, Gv=Gv)
 
-    coulomb_on_g_mesh = cp.einsum(
-        "ng, g -> g", density[:, 0], coulomb_kernel_on_g_mesh
-    )
+    coulomb_on_g_mesh = cp.einsum("ng, g -> g", density[:, 0], coulomb_kernel_on_g_mesh)
     weight = cell.vol / ngrids
 
     density = density.reshape(-1, *mesh)
@@ -1176,16 +1272,28 @@ def get_j_kpts(ni, dm_kpts, hermi=1, kpts=None, kpts_band=None):
     density = ifft_in_place(density).real.reshape(nset, -1, ngrids)
     density /= weight
 
-    #if kpts_band is not None:
+    # if kpts_band is not None:
     #    ni = ni.copy().reset().build()
     kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band
     xc_for_fock = convert_xc_on_g_mesh_to_fock(ni, coulomb_on_g_mesh, hermi, kpts_band)
     t0 = log.timer("vj", *t0)
     return _format_jks(xc_for_fock, dm_kpts, input_band, kpts)
 
-def nr_rks(ni, cell, grids, xc_code, dm_kpts, relativity=0, hermi=1,
-           kpts=None, kpts_band=None, with_j=False, verbose=None):
-    '''Compute the XC energy and RKS XC matrix at sampled k-points.
+
+def nr_rks(
+    ni,
+    cell,
+    grids,
+    xc_code,
+    dm_kpts,
+    relativity=0,
+    hermi=1,
+    kpts=None,
+    kpts_band=None,
+    with_j=False,
+    verbose=None,
+):
+    """Compute the XC energy and RKS XC matrix at sampled k-points.
     multigrid version of function pbc.dft.numint.nr_rks.
 
     Args:
@@ -1204,7 +1312,7 @@ def nr_rks(ni, cell, grids, xc_code, dm_kpts, relativity=0, hermi=1,
         nelec : number of electrons obtained from the numerical integration
         veff : (nkpts, nao, nao) ndarray
             or list of veff if the input dm_kpts is a list of DMs
-    '''
+    """
     cell = ni.cell
     log = logger.new_logger(cell, verbose)
     t0 = log.init_timer()
@@ -1239,12 +1347,12 @@ def nr_rks(ni, cell, grids, xc_code, dm_kpts, relativity=0, hermi=1,
     density /= weight
 
     # eval_xc_eff supports float64 only
-    density = cp.asarray(density, dtype=np.float64, order='C')
+    density = cp.asarray(density, dtype=np.float64, order="C")
     if xc_type == "LDA":
         xc_for_energy, xc_for_fock = ni.eval_xc_eff(
             xc_code, density[0], deriv=1, xctype=xc_type
         )[:2]
-    elif xc_type == 'GGA' or xc_type == 'MGGA':
+    elif xc_type == "GGA" or xc_type == "MGGA":
         xc_for_energy, xc_for_fock = ni.eval_xc_eff(
             xc_code, density, deriv=1, xctype=xc_type
         )[:2]
@@ -1270,10 +1378,13 @@ def nr_rks(ni, cell, grids, xc_code, dm_kpts, relativity=0, hermi=1,
         xc_for_fock = xc_for_fock.reshape((-1, ngrids))
     elif xc_type == "MGGA":
         xc_for_fock[0] -= contract("gp, pg -> p", xc_for_fock[1:4], Gv) * 1j
-        xc_for_fock = cp.concatenate([
-            xc_for_fock[0].reshape((-1, ngrids)),
-            xc_for_fock[4].reshape((-1, ngrids)),
-        ], axis = 0)
+        xc_for_fock = cp.concatenate(
+            [
+                xc_for_fock[0].reshape((-1, ngrids)),
+                xc_for_fock[4].reshape((-1, ngrids)),
+            ],
+            axis=0,
+        )
     else:
         raise ValueError(f"Incorrect xc_type = {xc_type}")
 
@@ -1281,17 +1392,31 @@ def nr_rks(ni, cell, grids, xc_code, dm_kpts, relativity=0, hermi=1,
         xc_for_fock[0] += coulomb_on_g_mesh
 
     kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band
-    veff = convert_xc_on_g_mesh_to_fock(ni, xc_for_fock, hermi, kpts_band, with_tau = (xc_type == "MGGA"))
+    veff = convert_xc_on_g_mesh_to_fock(
+        ni, xc_for_fock, hermi, kpts_band, with_tau=(xc_type == "MGGA")
+    )
     veff = _format_jks(veff, dm_kpts, input_band, kpts)
     veff = tag_array(veff, ecoul=coulomb_energy, exc=xc_energy_sum, vj=None, vk=None)
     t0 = log.timer("xc", *t0)
     return n_electrons, xc_energy_sum, veff
 
+
 # Note nr_uks handles only one set of KUKS density matrices (alpha, beta) in
 # each call (nr_rks supports multiple sets of KRKS density matrices)
-def nr_uks(ni, cell, grids, xc_code, dm_kpts, relativity=0, hermi=1,
-           kpts=None, kpts_band=None, with_j=False, verbose=None):
-    '''Compute the XC energy and UKS XC matrix at sampled k-points.
+def nr_uks(
+    ni,
+    cell,
+    grids,
+    xc_code,
+    dm_kpts,
+    relativity=0,
+    hermi=1,
+    kpts=None,
+    kpts_band=None,
+    with_j=False,
+    verbose=None,
+):
+    """Compute the XC energy and UKS XC matrix at sampled k-points.
     multigrid version of function pbc.dft.numint.nr_rks.
 
     Args:
@@ -1310,7 +1435,7 @@ def nr_uks(ni, cell, grids, xc_code, dm_kpts, relativity=0, hermi=1,
         nelec : number of electrons obtained from the numerical integration
         veff : (nkpts, nao, nao) ndarray
             or list of veff if the input dm_kpts is a list of DMs
-    '''
+    """
     cell = ni.cell
     log = logger.new_logger(cell, verbose)
     t0 = log.init_timer()
@@ -1346,12 +1471,12 @@ def nr_uks(ni, cell, grids, xc_code, dm_kpts, relativity=0, hermi=1,
     density /= weight
 
     # eval_xc_eff supports float64 only
-    density = cp.asarray(density, dtype=np.float64, order='C')
+    density = cp.asarray(density, dtype=np.float64, order="C")
     if xc_type == "LDA":
         xc_for_energy, xc_for_fock = ni.eval_xc_eff(
-            xc_code, density[:,0], deriv=1, xctype=xc_type
+            xc_code, density[:, 0], deriv=1, xctype=xc_type
         )[:2]
-    elif xc_type == 'GGA' or xc_type == 'MGGA':
+    elif xc_type == "GGA" or xc_type == "MGGA":
         xc_for_energy, xc_for_fock = ni.eval_xc_eff(
             xc_code, density, deriv=1, xctype=xc_type
         )[:2]
@@ -1377,10 +1502,13 @@ def nr_uks(ni, cell, grids, xc_code, dm_kpts, relativity=0, hermi=1,
         xc_for_fock = xc_for_fock.reshape((nset, -1, ngrids))
     elif xc_type == "MGGA":
         xc_for_fock[:, 0] -= contract("ngp, pg -> np", xc_for_fock[:, 1:4], Gv) * 1j
-        xc_for_fock = cp.concatenate([
-            xc_for_fock[:, 0].reshape((nset, -1, ngrids)),
-            xc_for_fock[:, 4].reshape((nset, -1, ngrids)),
-        ], axis = 1)
+        xc_for_fock = cp.concatenate(
+            [
+                xc_for_fock[:, 0].reshape((nset, -1, ngrids)),
+                xc_for_fock[:, 4].reshape((nset, -1, ngrids)),
+            ],
+            axis=1,
+        )
     else:
         raise ValueError(f"Incorrect xc_type = {xc_type}")
 
@@ -1388,15 +1516,17 @@ def nr_uks(ni, cell, grids, xc_code, dm_kpts, relativity=0, hermi=1,
         xc_for_fock[:, 0] += coulomb_on_g_mesh
 
     kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band
-    veff = convert_xc_on_g_mesh_to_fock(ni, xc_for_fock, hermi, kpts_band, with_tau = (xc_type == "MGGA"))
+    veff = convert_xc_on_g_mesh_to_fock(
+        ni, xc_for_fock, hermi, kpts_band, with_tau=(xc_type == "MGGA")
+    )
     veff = _format_jks(veff, dm_kpts, input_band, kpts)
     veff = tag_array(veff, ecoul=coulomb_energy, exc=xc_energy_sum, vj=None, vk=None)
     t0 = log.timer("xc", *t0)
     return n_electrons, xc_energy_sum, veff
 
+
 def get_rho(ni, dm, kpts=None):
-    '''Density in real space
-    '''
+    """Density in real space"""
     cell = ni.cell
     mesh = ni.mesh
     ngrids = np.prod(mesh)
@@ -1407,6 +1537,7 @@ def get_rho(ni, dm, kpts=None):
     rhoR = ifft_in_place(density.reshape(-1, *mesh)).real / weight
     return rhoR.reshape(-1, ngrids)
 
+
 def get_veff_ip1(
     ni,
     xc_code,
@@ -1436,9 +1567,7 @@ def get_veff_ip1(
 
     Gv = pbc_tools._get_Gv(cell, mesh)
     coulomb_kernel_on_g_mesh = pbc_tools.get_coulG(cell, Gv=Gv)
-    coulomb_on_g_mesh = cp.einsum(
-        "ng, g -> g", density[:, 0], coulomb_kernel_on_g_mesh
-    )
+    coulomb_on_g_mesh = cp.einsum("ng, g -> g", density[:, 0], coulomb_kernel_on_g_mesh)
 
     weight = cell.vol / ngrids
 
@@ -1453,13 +1582,9 @@ def get_veff_ip1(
     )
 
     if nset == 1:
-        xc_for_fock = ni.eval_xc_eff(
-            xc_code, density[0], deriv=1, xctype=xc_type
-        )[1]
+        xc_for_fock = ni.eval_xc_eff(xc_code, density[0], deriv=1, xctype=xc_type)[1]
     else:
-        xc_for_fock = ni.eval_xc_eff(
-            xc_code, density, deriv=1, xctype=xc_type
-        )[1]
+        xc_for_fock = ni.eval_xc_eff(xc_code, density, deriv=1, xctype=xc_type)[1]
 
     xc_for_fock = xc_for_fock.reshape(nset, -1, *mesh) * weight
     xc_for_fock = fft_in_place(xc_for_fock).reshape(nset, -1, ngrids)
@@ -1473,10 +1598,13 @@ def get_veff_ip1(
         xc_for_fock = xc_for_fock.reshape((nset, -1, ngrids))
     elif xc_type == "MGGA":
         xc_for_fock[:, 0] -= contract("ngp, pg -> np", xc_for_fock[:, 1:4], Gv) * 1j
-        xc_for_fock = cp.concatenate([
-            xc_for_fock[:, 0].reshape((nset, -1, ngrids)),
-            xc_for_fock[:, 4].reshape((nset, -1, ngrids)),
-        ], axis = 1)
+        xc_for_fock = cp.concatenate(
+            [
+                xc_for_fock[:, 0].reshape((nset, -1, ngrids)),
+                xc_for_fock[:, 4].reshape((nset, -1, ngrids)),
+            ],
+            axis=1,
+        )
     else:
         raise ValueError(f"Incorrect xc_type = {xc_type}")
 
@@ -1488,13 +1616,14 @@ def get_veff_ip1(
         xc_for_fock[:, 0] += multigrid_v1.eval_vpplocG_part1(cell, mesh)
 
     veff_gradient = convert_xc_on_g_mesh_to_fock_gradient(
-        ni, xc_for_fock, dm_kpts, hermi, kpts_band, with_tau = (xc_type == "MGGA")
+        ni, xc_for_fock, dm_kpts, hermi, kpts_band, with_tau=(xc_type == "MGGA")
     )
 
     t0 = log.timer("veff_gradient", *t0)
 
     return veff_gradient
 
+
 class MultiGridNumInt(lib.StreamObject, numint.LibXCMixin):
     def __init__(self, cell):
         self.cell = cell
@@ -1511,8 +1640,7 @@ def reset(self, cell=None):
         self.sorted_gaussian_pairs = None
         return self
 
-    def get_j(self, dm, hermi=1, kpts=None, kpts_band=None,
-              omega=None, exxdiv='ewald'):
+    def get_j(self, dm, hermi=1, kpts=None, kpts_band=None, omega=None, exxdiv="ewald"):
         if kpts is not None:
             raise NotImplementedError
         vj = get_j_kpts(self, dm, hermi, kpts, kpts_band)
@@ -1524,7 +1652,7 @@ def get_j(self, dm, hermi=1, kpts=None, kpts_band=None,
     get_rho = get_rho
     nr_rks = nr_rks
     nr_uks = nr_uks
-    get_vxc = nr_vxc = NotImplemented #numint_cpu.KNumInt.nr_vxc
+    get_vxc = nr_vxc = NotImplemented  # numint_cpu.KNumInt.nr_vxc
 
     eval_xc_eff = numint.eval_xc_eff
     _init_xcfuns = numint.NumInt._init_xcfuns
@@ -1532,11 +1660,11 @@ def get_j(self, dm, hermi=1, kpts=None, kpts_band=None,
     nr_rks_fxc = NotImplemented
     nr_uks_fxc = NotImplemented
     nr_rks_fxc_st = NotImplemented
-    cache_xc_kernel  = NotImplemented
+    cache_xc_kernel = NotImplemented
     cache_xc_kernel1 = NotImplemented
 
     to_gpu = utils.to_gpu
     device = utils.device
 
     def to_cpu(self):
-        raise RuntimeError('Not available')
+        raise RuntimeError("Not available")

From 386cdf0eeefafc7914da6cf140d56054cfc1a2d9 Mon Sep 17 00:00:00 2001
From: Rui Li <rli3@caltech.edu>
Date: Tue, 28 Oct 2025 16:51:21 -0700
Subject: [PATCH 03/14] add timer for individual kernel call

---
 gpu4pyscf/pbc/dft/multigrid_v2.py | 38 +++++++++++++++++++++++--------
 1 file changed, 29 insertions(+), 9 deletions(-)

diff --git a/gpu4pyscf/pbc/dft/multigrid_v2.py b/gpu4pyscf/pbc/dft/multigrid_v2.py
index 32d49fc15..464007854 100644
--- a/gpu4pyscf/pbc/dft/multigrid_v2.py
+++ b/gpu4pyscf/pbc/dft/multigrid_v2.py
@@ -626,7 +626,7 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"):
 
 
 def evaluate_density_wrapper(
-    pairs_info, dm_slice, img_phase, ignore_imag=True, with_tau=False
+    cell, pairs_info, dm_slice, img_phase, ignore_imag=True, with_tau=False
 ):
     if with_tau:
         c_driver = libgpbc.evaluate_density_tau_driver
@@ -682,6 +682,9 @@ def evaluate_density_wrapper(
             dtype=density_matrix_with_translation_real_part.dtype,
         )
 
+    log = logger.new_logger(cell, cell.verbose)
+    t0 = log.init_timer()
+
     for gaussians_per_angular_pair in pairs_info["per_angular_pairs"]:
         (i_angular, j_angular) = gaussians_per_angular_pair["angular"]
 
@@ -718,8 +721,9 @@ def evaluate_density_wrapper(
             raise RuntimeError(
                 f"evaluate_density_driver for li={i_angular} lj={j_angular} failed"
             )
+    t1 = log.timer_debug1("density kernel", *t0)
 
-    return density
+    return density, cp.cuda.get_elapsed_time(t0[-1], t1[-1])
 
 
 def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"):
@@ -746,6 +750,8 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"):
     density_on_g_mesh = cp.zeros(
         (n_channels, density_slices, nx, ny, nz), dtype=cp.complex128
     )
+
+    kernel_time = 0
     for pairs in mydf.sorted_gaussian_pairs:
 
         mesh = pairs["mesh"]
@@ -787,13 +793,16 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"):
         libgpbc.update_dxyz_dabc(pairs["dxyz_dabc"].ctypes)
 
         img_phase = image_phase_for_kpts(cell, pairs["neighboring_images"], kpts)
-        density = (
+        density, kernel_subroutine = (
             evaluate_density_wrapper(
-                pairs, coeff_sandwiched_density_matrix, img_phase, with_tau=with_tau
+                cell, pairs, coeff_sandwiched_density_matrix, img_phase, with_tau=with_tau
             )
-            * weight_per_grid_point
         )
 
+        density *= weight_per_grid_point
+
+        kernel_time += kernel_subroutine
+
         if with_tau:
             assert density.shape[1] == 2
             tau = density[:, 1]
@@ -824,10 +833,11 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"):
     if xc_type != "LDA":
         density_on_g_mesh[:, 1:4] = pbc_tools._get_Gv(mydf.cell, mydf.mesh).T
         density_on_g_mesh[:, 1:4] *= density_on_g_mesh[:, :1] * 1j
+    print("density kernel: ", kernel_time, "ms")
     return density_on_g_mesh
 
 
-def evaluate_xc_wrapper(pairs_info, xc_weights, img_phase, with_tau=False):
+def evaluate_xc_wrapper(cell, pairs_info, xc_weights, img_phase, with_tau=False):
     if with_tau:
         assert xc_weights.ndim == 3 + 2 and xc_weights.shape[1] == 2
         n_channels = xc_weights.shape[0]
@@ -860,6 +870,10 @@ def evaluate_xc_wrapper(pairs_info, xc_weights, img_phase, with_tau=False):
         assert xc_weights.dtype == cp.float64
         use_float_precision = ctypes.c_int(0)
 
+
+    log = logger.new_logger(cell, cell.verbose)
+    t0 = log.init_timer()
+
     for gaussians_per_angular_pair in pairs_info["per_angular_pairs"]:
         fock_slice = cp.zeros(
             (n_channels, n_difference_images, n_i_functions, n_j_functions),
@@ -901,10 +915,12 @@ def evaluate_xc_wrapper(pairs_info, xc_weights, img_phase, with_tau=False):
                 f"evaluate_xc_driver for li={i_angular} lj={j_angular} failed"
             )
 
+    t1 = log.timer_debug1("xc kernel", *t0)
+
     if not (n_k_points == 1 and n_difference_images == 1):
         return cp.einsum("kt, ntij -> nkij", phase_diff_among_images, fock)
     else:
-        return fock
+        return fock, cp.cuda.get_elapsed_time(t0[-1], t1[-1])
 
 
 def convert_xc_on_g_mesh_to_fock(
@@ -957,6 +973,7 @@ def convert_xc_on_g_mesh_to_fock(
         data_type = complex_type(cp.float64)
 
     fock = cp.zeros((n_channels, n_k_points, nao, nao), dtype=data_type)
+    kernel_time = 0
 
     for pairs in mydf.sorted_gaussian_pairs:
         interpolated_xc = xc_on_g_mesh[
@@ -971,9 +988,10 @@ def convert_xc_on_g_mesh_to_fock(
         n_ao_in_localized = len(pairs["ao_indices_in_localized"])
         libgpbc.update_dxyz_dabc(pairs["dxyz_dabc"].ctypes)
         img_phase = image_phase_for_kpts(cell, pairs["neighboring_images"], kpts)
-        fock_slice = evaluate_xc_wrapper(
-            pairs, interpolated_xc, img_phase, with_tau=with_tau
+        fock_slice, kernel_subroutine = evaluate_xc_wrapper(
+            cell, pairs, interpolated_xc, img_phase, with_tau=with_tau
         )
+        kernel_time += kernel_subroutine
         fock_slice = cp.einsum("nkpq,pi->nkiq", fock_slice, pairs["coeff_in_localized"])
         fock_slice = cp.einsum("nkiq,qj->nkij", fock_slice, pairs["concatenated_coeff"])
 
@@ -1010,6 +1028,8 @@ def convert_xc_on_g_mesh_to_fock(
         else:
             raise NotImplementedError
 
+
+    print("timing for xc kernel: ", kernel_time, "ms")
     return fock
 
 

From c345a268b13c82b279d2a9c92941b384d31a15d0 Mon Sep 17 00:00:00 2001
From: Rui Li <rli3@caltech.edu>
Date: Wed, 29 Oct 2025 00:14:55 -0700
Subject: [PATCH 04/14] add detailed timing

---
 gpu4pyscf/pbc/dft/multigrid_v2.py | 85 ++++++++++++++++++++++---------
 1 file changed, 61 insertions(+), 24 deletions(-)

diff --git a/gpu4pyscf/pbc/dft/multigrid_v2.py b/gpu4pyscf/pbc/dft/multigrid_v2.py
index 464007854..ce8acabe6 100644
--- a/gpu4pyscf/pbc/dft/multigrid_v2.py
+++ b/gpu4pyscf/pbc/dft/multigrid_v2.py
@@ -626,7 +626,7 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"):
 
 
 def evaluate_density_wrapper(
-    cell, pairs_info, dm_slice, img_phase, ignore_imag=True, with_tau=False
+    pairs_info, dm_slice, img_phase, ignore_imag=True, with_tau=False
 ):
     if with_tau:
         c_driver = libgpbc.evaluate_density_tau_driver
@@ -682,9 +682,6 @@ def evaluate_density_wrapper(
             dtype=density_matrix_with_translation_real_part.dtype,
         )
 
-    log = logger.new_logger(cell, cell.verbose)
-    t0 = log.init_timer()
-
     for gaussians_per_angular_pair in pairs_info["per_angular_pairs"]:
         (i_angular, j_angular) = gaussians_per_angular_pair["angular"]
 
@@ -721,9 +718,8 @@ def evaluate_density_wrapper(
             raise RuntimeError(
                 f"evaluate_density_driver for li={i_angular} lj={j_angular} failed"
             )
-    t1 = log.timer_debug1("density kernel", *t0)
 
-    return density, cp.cuda.get_elapsed_time(t0[-1], t1[-1])
+    return density
 
 
 def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"):
@@ -745,6 +741,7 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"):
         raise ValueError(f"Incorrect xc_type = {xc_type}")
 
     cell = mydf.cell
+    log = logger.new_logger(cell, cell.verbose)
 
     nx, ny, nz = mydf.mesh
     density_on_g_mesh = cp.zeros(
@@ -752,6 +749,8 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"):
     )
 
     kernel_time = 0
+    fft_time = 0
+    contraction_time = 0
     for pairs in mydf.sorted_gaussian_pairs:
 
         mesh = pairs["mesh"]
@@ -759,6 +758,7 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"):
         n_grid_points = np.prod(mesh)
         weight_per_grid_point = 1.0 / n_k_points * cell.vol / n_grid_points
 
+
         density_matrix_with_rows_in_localized = dms[
             :,
             :,
@@ -773,7 +773,11 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"):
             pairs["ao_indices_in_localized"],
         ]
 
+
         n_ao_in_localized = density_matrix_with_rows_in_diffused.shape[3]
+
+        t0 = log.init_timer()
+
         density_matrix_with_rows_in_localized[
             :, :, :, n_ao_in_localized:
         ] += density_matrix_with_rows_in_diffused.transpose(0, 1, 3, 2).conj()
@@ -790,18 +794,20 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"):
             pairs["concatenated_coeff"],
         )
 
+        contraction_time += log.timer_silent(*t0)[-1]
+        t0 = log.init_timer()
+
         libgpbc.update_dxyz_dabc(pairs["dxyz_dabc"].ctypes)
 
         img_phase = image_phase_for_kpts(cell, pairs["neighboring_images"], kpts)
-        density, kernel_subroutine = (
+        density = (
             evaluate_density_wrapper(
-                cell, pairs, coeff_sandwiched_density_matrix, img_phase, with_tau=with_tau
-            )
+                pairs, coeff_sandwiched_density_matrix, img_phase, with_tau=with_tau
+            ) * weight_per_grid_point
         )
 
-        density *= weight_per_grid_point
-
-        kernel_time += kernel_subroutine
+        kernel_time += log.timer_silent(*t0)[-1]
+        t0 = log.init_timer()
 
         if with_tau:
             assert density.shape[1] == 2
@@ -810,6 +816,8 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"):
 
         density = fft_in_place(density)
 
+        fft_time += log.timer_silent(*t0)[-1]
+
         density_on_g_mesh[
             :,
             0,
@@ -834,10 +842,13 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"):
         density_on_g_mesh[:, 1:4] = pbc_tools._get_Gv(mydf.cell, mydf.mesh).T
         density_on_g_mesh[:, 1:4] *= density_on_g_mesh[:, :1] * 1j
     print("density kernel: ", kernel_time, "ms")
+    print("density fft: ", fft_time, "ms")
+    print("density contraction: ", contraction_time, "ms")
+
     return density_on_g_mesh
 
 
-def evaluate_xc_wrapper(cell, pairs_info, xc_weights, img_phase, with_tau=False):
+def evaluate_xc_wrapper(pairs_info, xc_weights, img_phase, with_tau=False):
     if with_tau:
         assert xc_weights.ndim == 3 + 2 and xc_weights.shape[1] == 2
         n_channels = xc_weights.shape[0]
@@ -870,10 +881,6 @@ def evaluate_xc_wrapper(cell, pairs_info, xc_weights, img_phase, with_tau=False)
         assert xc_weights.dtype == cp.float64
         use_float_precision = ctypes.c_int(0)
 
-
-    log = logger.new_logger(cell, cell.verbose)
-    t0 = log.init_timer()
-
     for gaussians_per_angular_pair in pairs_info["per_angular_pairs"]:
         fock_slice = cp.zeros(
             (n_channels, n_difference_images, n_i_functions, n_j_functions),
@@ -915,13 +922,11 @@ def evaluate_xc_wrapper(cell, pairs_info, xc_weights, img_phase, with_tau=False)
                 f"evaluate_xc_driver for li={i_angular} lj={j_angular} failed"
             )
 
-    t1 = log.timer_debug1("xc kernel", *t0)
 
     if not (n_k_points == 1 and n_difference_images == 1):
         return cp.einsum("kt, ntij -> nkij", phase_diff_among_images, fock)
     else:
-        return fock, cp.cuda.get_elapsed_time(t0[-1], t1[-1])
-
+        return fock
 
 def convert_xc_on_g_mesh_to_fock(
     mydf,
@@ -974,8 +979,13 @@ def convert_xc_on_g_mesh_to_fock(
 
     fock = cp.zeros((n_channels, n_k_points, nao, nao), dtype=data_type)
     kernel_time = 0
+    fft_time = 0
+    contraction_time = 0
+    log = logger.new_logger(cell, cell.verbose)
 
     for pairs in mydf.sorted_gaussian_pairs:
+
+        t0 = log.init_timer()
         interpolated_xc = xc_on_g_mesh[
             :,
             :,
@@ -984,14 +994,19 @@ def convert_xc_on_g_mesh_to_fock(
             pairs["fft_grid"][2],
         ]
         interpolated_xc = cp.asarray(ifft_in_place(interpolated_xc).real, order="C")
+        fft_time += log.timer_silent(*t0)[-1]
+        t0 = log.init_timer()
 
         n_ao_in_localized = len(pairs["ao_indices_in_localized"])
         libgpbc.update_dxyz_dabc(pairs["dxyz_dabc"].ctypes)
         img_phase = image_phase_for_kpts(cell, pairs["neighboring_images"], kpts)
-        fock_slice, kernel_subroutine = evaluate_xc_wrapper(
-            cell, pairs, interpolated_xc, img_phase, with_tau=with_tau
+        fock_slice =  evaluate_xc_wrapper(
+            pairs, interpolated_xc, img_phase, with_tau=with_tau
         )
-        kernel_time += kernel_subroutine
+
+        kernel_time += log.timer_silent(*t0)[-1]
+        t0 = log.init_timer()
+        
         fock_slice = cp.einsum("nkpq,pi->nkiq", fock_slice, pairs["coeff_in_localized"])
         fock_slice = cp.einsum("nkiq,qj->nkij", fock_slice, pairs["concatenated_coeff"])
 
@@ -1016,6 +1031,9 @@ def convert_xc_on_g_mesh_to_fock(
             pairs["ao_indices_in_localized"][:, None],
             pairs["ao_indices_in_diffused"],
         ] += fock_slice[:, :, :, n_ao_in_localized:]
+
+        contraction_time += log.timer_silent(*t0)[-1]
+
         if hermi == 1:
             fock[
                 :,
@@ -1029,7 +1047,10 @@ def convert_xc_on_g_mesh_to_fock(
             raise NotImplementedError
 
 
-    print("timing for xc kernel: ", kernel_time, "ms")
+    print("xc kernel: ", kernel_time, "ms")
+    print("xc fft: ", fft_time, "ms")
+    print("xc contraction: ", contraction_time, "ms")
+
     return fock
 
 
@@ -1350,11 +1371,18 @@ def nr_rks(
     mesh = ni.mesh
     ngrids = np.prod(mesh)
 
+    tq = log.init_timer()
     density = evaluate_density_on_g_mesh(ni, dm_kpts, kpts, xc_type)
+    print("evaluate_density_on_g_mesh: ", log.timer_silent(*tq)[-1], "ms")
     rho_sf = density[0, 0]
 
+    tq = log.init_timer()
     Gv = pbc_tools._get_Gv(cell, mesh)
+    print("get_Gv: ", log.timer_silent(*tq)[-1], "ms")
+
+    tq = log.init_timer()
     coulomb_kernel_on_g_mesh = pbc_tools.get_coulG(cell, Gv=Gv)
+    print("get_coulG: ", log.timer_silent(*tq)[-1], "ms")
     coulomb_on_g_mesh = rho_sf * coulomb_kernel_on_g_mesh
     coulomb_energy = 0.5 * rho_sf.conj().dot(coulomb_on_g_mesh).real
     coulomb_energy /= cell.vol
@@ -1368,6 +1396,7 @@ def nr_rks(
 
     # eval_xc_eff supports float64 only
     density = cp.asarray(density, dtype=np.float64, order="C")
+    tq = log.init_timer()
     if xc_type == "LDA":
         xc_for_energy, xc_for_fock = ni.eval_xc_eff(
             xc_code, density[0], deriv=1, xctype=xc_type
@@ -1379,16 +1408,22 @@ def nr_rks(
     else:
         raise ValueError(f"Incorrect xc_type = {xc_type}")
 
+    print("libxc: ", log.timer_silent(*tq)[-1], "ms")
+
     rho_sf = density[0].real
     xc_energy_sum = rho_sf.dot(xc_for_energy.ravel()).get()[()] * weight
 
     # To reduce the memory usage, we reuse the xc_for_fock name.
     # Now xc_for_fock represents xc on G space
     xc_for_fock *= weight
+    tq = log.init_timer()
     xc_for_fock = fft_in_place(xc_for_fock.reshape(-1, *mesh)).reshape(-1, ngrids)
+    print("fft for xc:", log.timer_silent(*tq)[-1], "ms")
 
     log.debug("Multigrid exc %s  nelec %s", xc_energy_sum, n_electrons)
 
+
+    tq = log.init_timer()
     if xc_type == "LDA":
         pass
     elif xc_type == "GGA":
@@ -1408,6 +1443,8 @@ def nr_rks(
     else:
         raise ValueError(f"Incorrect xc_type = {xc_type}")
 
+    print("merge Gv: ", log.timer_silent(*tq)[-1], "ms")
+
     if with_j:
         xc_for_fock[0] += coulomb_on_g_mesh
 

From b9756943d5c2dc8c082fae9c67722304c6928f0b Mon Sep 17 00:00:00 2001
From: Rui Li <rli3@caltech.edu>
Date: Mon, 24 Nov 2025 14:50:37 -0800
Subject: [PATCH 05/14] improve primitive <-> contracted basis conversion and
 ig, gi-> g contraction

---
 gpu4pyscf/pbc/dft/multigrid_v2.py | 408 +++++++++++++++++++++++++-----
 1 file changed, 347 insertions(+), 61 deletions(-)

diff --git a/gpu4pyscf/pbc/dft/multigrid_v2.py b/gpu4pyscf/pbc/dft/multigrid_v2.py
index ce8acabe6..3d03b7cf2 100644
--- a/gpu4pyscf/pbc/dft/multigrid_v2.py
+++ b/gpu4pyscf/pbc/dft/multigrid_v2.py
@@ -73,6 +73,140 @@ def ifft_in_place(x):
     return fft.ifftn(x, axes=(-3, -2, -1), overwrite_x=True)
 
 
+def iG_density(density, cell):
+    mesh = cell.mesh
+    b = cp.asarray(cell.reciprocal_vectors())
+
+    mesh_in_int32 = np.array(mesh, dtype=np.int32)
+    block_dim = (8, 8, 8)
+    grid_dim = tuple(
+        np.array(np.ceil(mesh_in_int32 / np.array(block_dim)), dtype=np.int32)
+    )
+
+    result = cp.zeros((3, *mesh), dtype=cp.complex128).reshape(3,-1)
+
+    custom_kernel = cp.RawKernel(
+        r"""
+        #include <cupy/complex.cuh>
+        extern "C" __global__
+        void kernel(complex<double> * result,
+                    const complex<double> * density,
+                    const double * reciprocal_lattice,
+                    const int mesh_a,
+                    const int mesh_b,
+                    const int mesh_c) {
+            int a = blockIdx.x * blockDim.x + threadIdx.x;
+            int b = blockIdx.y * blockDim.y + threadIdx.y;
+            int c = blockIdx.z * blockDim.z + threadIdx.z;
+
+            if(a >= mesh_a || b >= mesh_b || c >= mesh_c) {
+                return;
+            }
+
+            const size_t grid_index = a * mesh_b * mesh_c + b * mesh_c + c;
+            result += grid_index;
+            const complex<double> density_value = density[grid_index] * complex<double>{0.0, 1.0};
+
+            if(a >= (mesh_a + 1) / 2) a -= mesh_a;
+            if(b >= (mesh_b + 1) / 2) b -= mesh_b;
+            if(c >= (mesh_c + 1) / 2) c -= mesh_c;
+
+            int n_grid = mesh_a * mesh_b * mesh_c;
+
+            *result = (a * reciprocal_lattice[0] +
+                       b * reciprocal_lattice[3] +
+                       c * reciprocal_lattice[6]) * density_value;
+            result += n_grid; 
+            *result = (a * reciprocal_lattice[1] +
+                       b * reciprocal_lattice[4] +
+                       c * reciprocal_lattice[7]) * density_value;
+            result += n_grid; 
+            *result = (a * reciprocal_lattice[2] +
+                       b * reciprocal_lattice[5] +
+                       c * reciprocal_lattice[8]) * density_value;
+        }
+    """,
+        "kernel",
+    )
+
+    custom_kernel(
+        grid_dim,
+        block_dim,
+        (result, density, b, mesh_in_int32[0], mesh_in_int32[1], mesh_in_int32[2]),
+    )
+
+    return result
+
+def contract_iG_potential(gga_potential, cell):
+    mesh = cell.mesh
+    b = cp.asarray(cell.reciprocal_vectors())
+
+    mesh_in_int32 = np.array(mesh, dtype=np.int32)
+    block_dim = (8, 8, 8)
+    grid_dim = tuple(
+        np.array(np.ceil(mesh_in_int32 / np.array(block_dim)), dtype=np.int32)
+    )
+
+    custom_kernel = cp.RawKernel(
+        r"""
+        #include <cupy/complex.cuh>
+        extern "C" __global__
+        void kernel(complex<double> * gga_potential,
+                    const double * reciprocal_lattice,
+                    const int mesh_a,
+                    const int mesh_b,
+                    const int mesh_c) {
+            int a = blockIdx.x * blockDim.x + threadIdx.x;
+            int b = blockIdx.y * blockDim.y + threadIdx.y;
+            int c = blockIdx.z * blockDim.z + threadIdx.z;
+
+            if(a >= mesh_a || b >= mesh_b || c >= mesh_c) {
+                return;
+            }
+
+            int n_grid = mesh_a * mesh_b * mesh_c;
+
+            const size_t grid_index = a * mesh_b * mesh_c + b * mesh_c + c;
+            gga_potential += grid_index;
+
+            if(a >= (mesh_a + 1) / 2) a -= mesh_a;
+            if(b >= (mesh_b + 1) / 2) b -= mesh_b;
+            if(c >= (mesh_c + 1) / 2) c -= mesh_c;
+            complex<double> potential_change = 0;
+            potential_change += 
+                      (a * reciprocal_lattice[0] +
+                       b * reciprocal_lattice[3] +
+                       c * reciprocal_lattice[6]) * 
+                       complex<double>{0.0, 1.0} * 
+                       gga_potential[n_grid];
+
+            potential_change += 
+                      (a * reciprocal_lattice[1] +
+                       b * reciprocal_lattice[4] +
+                       c * reciprocal_lattice[7]) * 
+                       complex<double>{0.0, 1.0} * 
+                       gga_potential[2 * n_grid];
+
+            potential_change += 
+                      (a * reciprocal_lattice[2] +
+                       b * reciprocal_lattice[5] +
+                       c * reciprocal_lattice[8]) * 
+                       complex<double>{0.0, 1.0} * 
+                       gga_potential[3 * n_grid];
+
+            *gga_potential -= potential_change; 
+        }
+    """,
+        "kernel",
+    )
+
+    custom_kernel(
+        grid_dim,
+        block_dim,
+        (gga_potential, b, mesh_in_int32[0], mesh_in_int32[1], mesh_in_int32[2]),
+    )
+
+
 def unique_with_sort(x):
     # This function does the same thing as cp.unique(x, return_inverse=True).
     # It's not super optimized, but for whatever reason, cp.unique is very slow, so this one is better.
@@ -100,6 +234,152 @@ def unique_with_sort(x):
     return x, inverse_unique[inverse_sort]
 
 
+def unique_with_multiple_keys(x):
+    # This function expands the previous function to handle multiple keys
+    # shaped as [ (1, 2), (3, -4), ....]
+    assert (
+        type(x) is cp.ndarray
+        and (x.dtype == cp.int32 or x.dtype == cp.int64)
+        and x.ndim == 2
+    )
+    x = x.T
+    n = x.shape[-1]
+
+    inverse_sort = cp.zeros(n, dtype=cp.int64)
+    if n <= 1:
+        return x, inverse_sort
+
+    sort_index = cp.lexsort(x)
+    inverse_sort[sort_index] = cp.arange(0, n, dtype=cp.int64)
+    x = x[:, sort_index].T
+
+    mask = cp.empty(n, dtype=cp.bool_)
+    mask[0] = True
+    mask[1:] = cp.any(x[1:] != x[:-1], axis=-1)
+
+    x = x[mask]
+    inverse_unique = cp.cumsum(mask, dtype=cp.int64) - 1
+
+    return x, inverse_unique[inverse_sort]
+
+
+def sort_contraction_coefficients(coeff):
+    contraction_shapes = cp.array([i.shape for i in coeff])
+    unique_shapes, inverse = unique_with_multiple_keys(contraction_shapes)
+    unique_shapes = unique_shapes.get()
+    inverse = inverse.get()
+
+    sliced_axis = np.zeros((len(coeff) + 1, 2), dtype=np.int32)
+    sliced_axis[1:] = np.cumsum(
+        np.array([i.shape for i in coeff]), axis=0, dtype=np.int32
+    )
+
+    left_basis_function_indices = [
+        cp.arange(begin, end)
+        for begin, end in zip(sliced_axis[:-1, 0], sliced_axis[1:, 0])
+    ]
+    right_basis_function_indices = [
+        cp.arange(begin, end)
+        for begin, end in zip(sliced_axis[:-1, 1], sliced_axis[1:, 1])
+    ]
+
+    sorted_coeffs = [
+        {"shape": shape, "coeffs": [], "left_indices": [], "right_indices": []}
+        for shape in unique_shapes
+    ]
+
+    for category, coeffs, left, right in zip(
+        inverse, coeff, left_basis_function_indices, right_basis_function_indices
+    ):
+        sorted_coeffs[category]["coeffs"].append(coeffs)
+        sorted_coeffs[category]["left_indices"].append(left)
+        sorted_coeffs[category]["right_indices"].append(right)
+
+    for category in sorted_coeffs:
+        category["coeffs"] = cp.array(category["coeffs"])
+        category["left_indices"] = cp.concatenate(category["left_indices"])
+        category["right_indices"] = cp.concatenate(category["right_indices"])
+
+    return sorted_coeffs, sliced_axis[-1]
+
+
+def contracted_to_primitive(
+    batched_matrices, sorted_coeffs_left, sorted_coeffs_right, primitive_shape
+):
+
+    assert len(batched_matrices.shape) == 3
+
+    n_slices = batched_matrices.shape[0]
+    n_cols = batched_matrices.shape[2]
+
+    n_rows_primitive = primitive_shape[0]
+    n_cols_primitive = primitive_shape[1]
+
+    intermediate_shape = (n_slices, n_rows_primitive, n_cols)
+    intermediate = cp.zeros(intermediate_shape)
+
+    for i in sorted_coeffs_left:
+        subarray_shape = (n_slices, -1, i["shape"][1], n_cols)
+        intermediate[:, i["left_indices"]] = cp.einsum(
+            "naij, api -> napj",
+            batched_matrices[:, i["right_indices"]].reshape(subarray_shape),
+            i["coeffs"],
+        ).reshape(n_slices, -1, n_cols)
+
+    intermediate = intermediate.transpose(0, 2, 1)
+
+    result_shape = (n_slices, n_cols_primitive, n_rows_primitive)
+    result = cp.zeros(result_shape)
+
+    for i in sorted_coeffs_right:
+        subarray_shape = (n_slices, -1, i["shape"][1], n_rows_primitive)
+        result[:, i["left_indices"]] = cp.einsum(
+            "najp, aqj -> naqp",
+            intermediate[:, i["right_indices"]].reshape(subarray_shape),
+            i["coeffs"],
+        ).reshape(n_slices, -1, n_rows_primitive)
+
+    return result.transpose(0, 2, 1)
+
+
+def primitive_to_contracted(
+    batched_matrices, sorted_coeffs_left, sorted_coeffs_right, contracted_shape
+):
+
+    assert len(batched_matrices.shape) == 3
+
+    n_slices = batched_matrices.shape[0]
+    n_cols = batched_matrices.shape[2]
+
+    n_rows_contracted = contracted_shape[0]
+    n_cols_contracted = contracted_shape[1]
+
+    intermediate_shape = (n_slices, n_rows_contracted, n_cols)
+    intermediate = cp.zeros(intermediate_shape)
+
+    for i in sorted_coeffs_left:
+        subarray_shape = (n_slices, -1, i["shape"][0], n_cols)
+        intermediate[:, i["right_indices"]] = cp.einsum(
+            "napq, api -> naiq",
+            batched_matrices[:, i["left_indices"]].reshape(subarray_shape),
+            i["coeffs"],
+        ).reshape(n_slices, -1, n_cols)
+
+    intermediate = intermediate.transpose(0, 2, 1)
+    result_shape = (n_slices, n_cols_contracted, n_rows_contracted)
+    result = cp.zeros(result_shape)
+
+    for i in sorted_coeffs_right:
+        subarray_shape = (n_slices, -1, i["shape"][0], n_rows_contracted)
+        result[:, i["right_indices"]] = cp.einsum(
+            "naqi, aqj -> naji",
+            intermediate[:, i["left_indices"]].reshape(subarray_shape),
+            i["coeffs"],
+        ).reshape(n_slices, -1, n_rows_contracted)
+
+    return result.transpose(0, 2, 1)
+
+
 def image_pair_to_difference(
     vectors_to_neighboring_images,
     lattice_vectors,
@@ -377,7 +657,7 @@ def assign_pairs_to_blocks(
 
     pairs_on_blocks = pairs_on_blocks[pairs_on_blocks >= 0]
     sorted_block_index = cp.asarray(cp.argsort(-n_pairs_on_blocks), dtype=cp.int32)
-    n_contributing_blocks = cp.count_nonzero(n_pairs_on_blocks) 
+    n_contributing_blocks = cp.count_nonzero(n_pairs_on_blocks)
     accumulated_n_pairs_per_block[1:] = cp.cumsum(n_pairs_on_blocks, dtype=cp.int32)
     sorted_block_index = sorted_block_index[:n_contributing_blocks]
 
@@ -445,7 +725,7 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"):
         libgpbc.update_dxyz_dabc(dxyz_dabc.ctypes)
         n_blocks_abc = np.asarray(np.ceil(mesh / block_size), dtype=cp.int32)
         equivalent_cell_in_localized, coeff_in_localized = (
-            subcell_in_localized_region.decontract_basis(to_cart=True, aggregate=True)
+            subcell_in_localized_region.decontract_basis(to_cart=True)
         )
 
         n_primitive_gtos_in_localized = multigrid._pgto_shells(
@@ -460,13 +740,11 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"):
 
         if grids_diffused is None:
             grouped_cell = equivalent_cell_in_localized
-            concatenated_coeff = scipy.linalg.block_diag(coeff_in_localized)
+            concatenated_coeff = coeff_in_localized
         else:
             subcell_in_diffused_region = grids_diffused.cell
             equivalent_cell_in_diffused, coeff_in_diffused = (
-                subcell_in_diffused_region.decontract_basis(
-                    to_cart=True, aggregate=True
-                )
+                subcell_in_diffused_region.decontract_basis(to_cart=True)
             )
 
             grouped_cell = equivalent_cell_in_localized + equivalent_cell_in_diffused
@@ -475,10 +753,15 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"):
                 subcell_in_localized_region._atm
             )
 
-            concatenated_coeff = scipy.linalg.block_diag(
-                coeff_in_localized, coeff_in_diffused
-            )
-        concatenated_coeff = cp.asarray(concatenated_coeff)
+            concatenated_coeff = coeff_in_localized + coeff_in_diffused
+
+        coeff_in_localized, localized_shape = sort_contraction_coefficients(
+            coeff_in_localized
+        )
+
+        concatenated_coeff, concatenated_shape = sort_contraction_coefficients(
+            concatenated_coeff
+        )
 
         n_primitive_gtos_in_two_regions = multigrid._pgto_shells(grouped_cell)
         rad = vol ** (-1.0 / 3) * cell.rcut + 1
@@ -499,7 +782,6 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"):
         concatenated_ao_indices = cp.concatenate(
             (ao_indices_in_localized, ao_indices_in_diffused)
         )
-        coeff_in_localized = cp.asarray(coeff_in_localized)
         per_angular_pairs = []
 
         i_angulars = grouped_cell._bas[:n_primitive_gtos_in_localized, multigrid.ANG_OF]
@@ -611,6 +893,8 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"):
                 "concatenated_ao_indices": concatenated_ao_indices,
                 "coeff_in_localized": coeff_in_localized,
                 "concatenated_coeff": concatenated_coeff,
+                "primitive_shape": (localized_shape[0], concatenated_shape[0]),
+                "contracted_shape": (localized_shape[1], concatenated_shape[1]),
                 "atm": atm,
                 "bas": bas,
                 "env": env,
@@ -758,7 +1042,6 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"):
         n_grid_points = np.prod(mesh)
         weight_per_grid_point = 1.0 / n_k_points * cell.vol / n_grid_points
 
-
         density_matrix_with_rows_in_localized = dms[
             :,
             :,
@@ -773,7 +1056,6 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"):
             pairs["ao_indices_in_localized"],
         ]
 
-
         n_ao_in_localized = density_matrix_with_rows_in_diffused.shape[3]
 
         t0 = log.init_timer()
@@ -782,17 +1064,20 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"):
             :, :, :, n_ao_in_localized:
         ] += density_matrix_with_rows_in_diffused.transpose(0, 1, 3, 2).conj()
 
-        coeff_sandwiched_density_matrix = cp.einsum(
-            "nkij,pi->nkpj",
-            density_matrix_with_rows_in_localized,
-            pairs["coeff_in_localized"],
+        n_sets, n_k_points = density_matrix_with_rows_in_localized.shape[:2]
+
+        density_matrix_with_rows_in_localized = (
+            density_matrix_with_rows_in_localized.reshape(
+                -1, *pairs["contracted_shape"]
+            )
         )
 
-        coeff_sandwiched_density_matrix = cp.einsum(
-            "nkpj, qj -> nkpq",
-            coeff_sandwiched_density_matrix,
+        coeff_sandwiched_density_matrix = contracted_to_primitive(
+            density_matrix_with_rows_in_localized,
+            pairs["coeff_in_localized"],
             pairs["concatenated_coeff"],
-        )
+            pairs["primitive_shape"],
+        ).reshape(n_sets, n_k_points, *pairs["primitive_shape"])
 
         contraction_time += log.timer_silent(*t0)[-1]
         t0 = log.init_timer()
@@ -803,7 +1088,8 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"):
         density = (
             evaluate_density_wrapper(
                 pairs, coeff_sandwiched_density_matrix, img_phase, with_tau=with_tau
-            ) * weight_per_grid_point
+            )
+            * weight_per_grid_point
         )
 
         kernel_time += log.timer_silent(*t0)[-1]
@@ -839,8 +1125,8 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"):
 
     density_on_g_mesh = density_on_g_mesh.reshape([n_channels, density_slices, -1])
     if xc_type != "LDA":
-        density_on_g_mesh[:, 1:4] = pbc_tools._get_Gv(mydf.cell, mydf.mesh).T
-        density_on_g_mesh[:, 1:4] *= density_on_g_mesh[:, :1] * 1j
+        for i in range(len(density_on_g_mesh)):
+            density_on_g_mesh[i, 1:4] = iG_density(density_on_g_mesh[i], cell)
     print("density kernel: ", kernel_time, "ms")
     print("density fft: ", fft_time, "ms")
     print("density contraction: ", contraction_time, "ms")
@@ -864,8 +1150,7 @@ def evaluate_xc_wrapper(pairs_info, xc_weights, img_phase, with_tau=False):
         c_driver = libgpbc.evaluate_xc_with_tau_driver
     else:
         c_driver = libgpbc.evaluate_xc_driver
-    n_i_functions = len(pairs_info["coeff_in_localized"])
-    n_j_functions = len(pairs_info["concatenated_coeff"])
+    n_i_functions, n_j_functions = pairs_info["primitive_shape"]
 
     phase_diff_among_images, image_pair_difference_index = img_phase
     n_k_points, n_difference_images = phase_diff_among_images.shape
@@ -922,12 +1207,12 @@ def evaluate_xc_wrapper(pairs_info, xc_weights, img_phase, with_tau=False):
                 f"evaluate_xc_driver for li={i_angular} lj={j_angular} failed"
             )
 
-
     if not (n_k_points == 1 and n_difference_images == 1):
         return cp.einsum("kt, ntij -> nkij", phase_diff_among_images, fock)
     else:
         return fock
 
+
 def convert_xc_on_g_mesh_to_fock(
     mydf,
     xc_on_g_mesh,
@@ -1000,15 +1285,21 @@ def convert_xc_on_g_mesh_to_fock(
         n_ao_in_localized = len(pairs["ao_indices_in_localized"])
         libgpbc.update_dxyz_dabc(pairs["dxyz_dabc"].ctypes)
         img_phase = image_phase_for_kpts(cell, pairs["neighboring_images"], kpts)
-        fock_slice =  evaluate_xc_wrapper(
+        fock_slice = evaluate_xc_wrapper(
             pairs, interpolated_xc, img_phase, with_tau=with_tau
         )
-
         kernel_time += log.timer_silent(*t0)[-1]
         t0 = log.init_timer()
-        
-        fock_slice = cp.einsum("nkpq,pi->nkiq", fock_slice, pairs["coeff_in_localized"])
-        fock_slice = cp.einsum("nkiq,qj->nkij", fock_slice, pairs["concatenated_coeff"])
+        n_sets, n_k_points = fock_slice.shape[:2]
+
+        fock_slice = fock_slice.reshape(-1, *pairs["primitive_shape"])
+
+        fock_slice = primitive_to_contracted(
+            fock_slice,
+            pairs["coeff_in_localized"],
+            pairs["concatenated_coeff"],
+            pairs["contracted_shape"],
+        ).reshape(n_sets, n_k_points, *pairs["contracted_shape"])
 
         # While mathematically it is correct to have concatenated
         # ao indices in the addition, but it is possible that the ao
@@ -1046,7 +1337,6 @@ def convert_xc_on_g_mesh_to_fock(
         else:
             raise NotImplementedError
 
-
     print("xc kernel: ", kernel_time, "ms")
     print("xc fft: ", fft_time, "ms")
     print("xc contraction: ", contraction_time, "ms")
@@ -1197,17 +1487,18 @@ def convert_xc_on_g_mesh_to_fock_gradient(
             :, :, :, n_ao_in_localized:
         ] += density_matrix_with_rows_in_diffused.transpose(0, 1, 3, 2).conj()
 
-        coeff_sandwiched_density_matrix = cp.einsum(
-            "nkij,pi->nkpj",
-            density_matrix_slice,
-            pairs["coeff_in_localized"],
+        n_sets, n_k_points = density_matrix_slice.shape[:2]
+
+        density_matrix_slice = density_matrix_slice.reshape(
+            -1, *pairs["contracted_shape"]
         )
 
-        coeff_sandwiched_density_matrix = cp.einsum(
-            "nkpj, qj -> nkpq",
-            coeff_sandwiched_density_matrix,
+        coeff_sandwiched_density_matrix = contracted_to_primitive(
+            density_matrix_slice,
+            pairs["coeff_in_localized"],
             pairs["concatenated_coeff"],
-        )
+            pairs["primitive_shape"],
+        ).reshape(n_sets, n_k_points, *pairs["primitive_shape"])
 
         libgpbc.update_dxyz_dabc(pairs["dxyz_dabc"].ctypes)
 
@@ -1378,12 +1669,9 @@ def nr_rks(
 
     tq = log.init_timer()
     Gv = pbc_tools._get_Gv(cell, mesh)
-    print("get_Gv: ", log.timer_silent(*tq)[-1], "ms")
 
     tq = log.init_timer()
-    coulomb_kernel_on_g_mesh = pbc_tools.get_coulG(cell, Gv=Gv)
-    print("get_coulG: ", log.timer_silent(*tq)[-1], "ms")
-    coulomb_on_g_mesh = rho_sf * coulomb_kernel_on_g_mesh
+    coulomb_on_g_mesh = rho_sf * ni.coulG
     coulomb_energy = 0.5 * rho_sf.conj().dot(coulomb_on_g_mesh).real
     coulomb_energy /= cell.vol
     log.debug("Multigrid Coulomb energy %s", coulomb_energy)
@@ -1422,17 +1710,15 @@ def nr_rks(
 
     log.debug("Multigrid exc %s  nelec %s", xc_energy_sum, n_electrons)
 
-
     tq = log.init_timer()
     if xc_type == "LDA":
         pass
     elif xc_type == "GGA":
-        xc_for_fock = (
-            xc_for_fock[0] - contract("gp, pg -> p", xc_for_fock[1:4], Gv) * 1j
-        )
+        contract_iG_potential(xc_for_fock, cell)
+        xc_for_fock = xc_for_fock[0]
         xc_for_fock = xc_for_fock.reshape((-1, ngrids))
     elif xc_type == "MGGA":
-        xc_for_fock[0] -= contract("gp, pg -> p", xc_for_fock[1:4], Gv) * 1j
+        xc_for_fock = contract_iG_potential(xc_for_fock, cell)
         xc_for_fock = cp.concatenate(
             [
                 xc_for_fock[0].reshape((-1, ngrids)),
@@ -1514,8 +1800,7 @@ def nr_uks(
     rho_sf = density[0, 0] + density[1, 0]
 
     Gv = pbc_tools._get_Gv(cell, mesh)
-    coulomb_kernel_on_g_mesh = pbc_tools.get_coulG(cell, Gv=Gv)
-    coulomb_on_g_mesh = rho_sf * coulomb_kernel_on_g_mesh
+    coulomb_on_g_mesh = rho_sf * ni.coulG
     coulomb_energy = 0.5 * rho_sf.conj().dot(coulomb_on_g_mesh).real
     coulomb_energy /= cell.vol
     log.debug("Multigrid Coulomb energy %s", coulomb_energy)
@@ -1553,12 +1838,12 @@ def nr_uks(
     if xc_type == "LDA":
         pass
     elif xc_type == "GGA":
-        xc_for_fock = (
-            xc_for_fock[:, 0] - contract("ngp, pg -> np", xc_for_fock[:, 1:4], Gv) * 1j
-        )
+        for i in xc_for_fock:
+            contract_iG_potential(i, cell)
         xc_for_fock = xc_for_fock.reshape((nset, -1, ngrids))
     elif xc_type == "MGGA":
-        xc_for_fock[:, 0] -= contract("ngp, pg -> np", xc_for_fock[:, 1:4], Gv) * 1j
+        for i in xc_for_fock:
+            contract_iG_potential(i, cell)
         xc_for_fock = cp.concatenate(
             [
                 xc_for_fock[:, 0].reshape((nset, -1, ngrids)),
@@ -1623,8 +1908,7 @@ def get_veff_ip1(
     density = evaluate_density_on_g_mesh(ni, dm_kpts, kpts, xc_type)
 
     Gv = pbc_tools._get_Gv(cell, mesh)
-    coulomb_kernel_on_g_mesh = pbc_tools.get_coulG(cell, Gv=Gv)
-    coulomb_on_g_mesh = cp.einsum("ng, g -> g", density[:, 0], coulomb_kernel_on_g_mesh)
+    coulomb_on_g_mesh = cp.einsum("ng, g -> g", density[:, 0], ni.coulG)
 
     weight = cell.vol / ngrids
 
@@ -1649,9 +1933,9 @@ def get_veff_ip1(
     if xc_type == "LDA":
         pass
     elif xc_type == "GGA":
-        xc_for_fock = (
-            xc_for_fock[:, 0] - contract("ngp, pg -> np", xc_for_fock[:, 1:4], Gv) * 1j
-        )
+        for i in xc_for_fock:
+            contract_iG_potential(i, cell)
+        xc_for_fock = xc_for_fock[:,0]
         xc_for_fock = xc_for_fock.reshape((nset, -1, ngrids))
     elif xc_type == "MGGA":
         xc_for_fock[:, 0] -= contract("ngp, pg -> np", xc_for_fock[:, 1:4], Gv) * 1j
@@ -1687,6 +1971,8 @@ def __init__(self, cell):
         self.mesh = cell.mesh
         self.tasks = None
         self.sorted_gaussian_pairs = None
+        Gv = pbc_tools._get_Gv(cell, cell.mesh)
+        self.coulG = pbc_tools.get_coulG(cell, Gv=Gv)
 
     build = sort_gaussian_pairs
 

From 38189ecf5f5203a52f7544def4161b4a53c8b05c Mon Sep 17 00:00:00 2001
From: Rui Li <rli3@caltech.edu>
Date: Mon, 24 Nov 2025 14:58:32 -0800
Subject: [PATCH 06/14] clean timing printers

---
 gpu4pyscf/pbc/dft/multigrid_v2.py | 53 ++-----------------------------
 1 file changed, 3 insertions(+), 50 deletions(-)

diff --git a/gpu4pyscf/pbc/dft/multigrid_v2.py b/gpu4pyscf/pbc/dft/multigrid_v2.py
index 3d03b7cf2..e184c6899 100644
--- a/gpu4pyscf/pbc/dft/multigrid_v2.py
+++ b/gpu4pyscf/pbc/dft/multigrid_v2.py
@@ -83,7 +83,7 @@ def iG_density(density, cell):
         np.array(np.ceil(mesh_in_int32 / np.array(block_dim)), dtype=np.int32)
     )
 
-    result = cp.zeros((3, *mesh), dtype=cp.complex128).reshape(3,-1)
+    result = cp.zeros((3, *mesh), dtype=cp.complex128).reshape(3, -1)
 
     custom_kernel = cp.RawKernel(
         r"""
@@ -137,6 +137,7 @@ def iG_density(density, cell):
 
     return result
 
+
 def contract_iG_potential(gga_potential, cell):
     mesh = cell.mesh
     b = cp.asarray(cell.reciprocal_vectors())
@@ -1025,16 +1026,12 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"):
         raise ValueError(f"Incorrect xc_type = {xc_type}")
 
     cell = mydf.cell
-    log = logger.new_logger(cell, cell.verbose)
 
     nx, ny, nz = mydf.mesh
     density_on_g_mesh = cp.zeros(
         (n_channels, density_slices, nx, ny, nz), dtype=cp.complex128
     )
 
-    kernel_time = 0
-    fft_time = 0
-    contraction_time = 0
     for pairs in mydf.sorted_gaussian_pairs:
 
         mesh = pairs["mesh"]
@@ -1058,8 +1055,6 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"):
 
         n_ao_in_localized = density_matrix_with_rows_in_diffused.shape[3]
 
-        t0 = log.init_timer()
-
         density_matrix_with_rows_in_localized[
             :, :, :, n_ao_in_localized:
         ] += density_matrix_with_rows_in_diffused.transpose(0, 1, 3, 2).conj()
@@ -1079,9 +1074,6 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"):
             pairs["primitive_shape"],
         ).reshape(n_sets, n_k_points, *pairs["primitive_shape"])
 
-        contraction_time += log.timer_silent(*t0)[-1]
-        t0 = log.init_timer()
-
         libgpbc.update_dxyz_dabc(pairs["dxyz_dabc"].ctypes)
 
         img_phase = image_phase_for_kpts(cell, pairs["neighboring_images"], kpts)
@@ -1092,18 +1084,12 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"):
             * weight_per_grid_point
         )
 
-        kernel_time += log.timer_silent(*t0)[-1]
-        t0 = log.init_timer()
-
         if with_tau:
             assert density.shape[1] == 2
             tau = density[:, 1]
             density = density[:, 0]
 
         density = fft_in_place(density)
-
-        fft_time += log.timer_silent(*t0)[-1]
-
         density_on_g_mesh[
             :,
             0,
@@ -1127,9 +1113,6 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"):
     if xc_type != "LDA":
         for i in range(len(density_on_g_mesh)):
             density_on_g_mesh[i, 1:4] = iG_density(density_on_g_mesh[i], cell)
-    print("density kernel: ", kernel_time, "ms")
-    print("density fft: ", fft_time, "ms")
-    print("density contraction: ", contraction_time, "ms")
 
     return density_on_g_mesh
 
@@ -1263,14 +1246,9 @@ def convert_xc_on_g_mesh_to_fock(
         data_type = complex_type(cp.float64)
 
     fock = cp.zeros((n_channels, n_k_points, nao, nao), dtype=data_type)
-    kernel_time = 0
-    fft_time = 0
-    contraction_time = 0
-    log = logger.new_logger(cell, cell.verbose)
 
     for pairs in mydf.sorted_gaussian_pairs:
 
-        t0 = log.init_timer()
         interpolated_xc = xc_on_g_mesh[
             :,
             :,
@@ -1279,8 +1257,6 @@ def convert_xc_on_g_mesh_to_fock(
             pairs["fft_grid"][2],
         ]
         interpolated_xc = cp.asarray(ifft_in_place(interpolated_xc).real, order="C")
-        fft_time += log.timer_silent(*t0)[-1]
-        t0 = log.init_timer()
 
         n_ao_in_localized = len(pairs["ao_indices_in_localized"])
         libgpbc.update_dxyz_dabc(pairs["dxyz_dabc"].ctypes)
@@ -1288,8 +1264,6 @@ def convert_xc_on_g_mesh_to_fock(
         fock_slice = evaluate_xc_wrapper(
             pairs, interpolated_xc, img_phase, with_tau=with_tau
         )
-        kernel_time += log.timer_silent(*t0)[-1]
-        t0 = log.init_timer()
         n_sets, n_k_points = fock_slice.shape[:2]
 
         fock_slice = fock_slice.reshape(-1, *pairs["primitive_shape"])
@@ -1323,8 +1297,6 @@ def convert_xc_on_g_mesh_to_fock(
             pairs["ao_indices_in_diffused"],
         ] += fock_slice[:, :, :, n_ao_in_localized:]
 
-        contraction_time += log.timer_silent(*t0)[-1]
-
         if hermi == 1:
             fock[
                 :,
@@ -1337,10 +1309,6 @@ def convert_xc_on_g_mesh_to_fock(
         else:
             raise NotImplementedError
 
-    print("xc kernel: ", kernel_time, "ms")
-    print("xc fft: ", fft_time, "ms")
-    print("xc contraction: ", contraction_time, "ms")
-
     return fock
 
 
@@ -1662,15 +1630,9 @@ def nr_rks(
     mesh = ni.mesh
     ngrids = np.prod(mesh)
 
-    tq = log.init_timer()
     density = evaluate_density_on_g_mesh(ni, dm_kpts, kpts, xc_type)
-    print("evaluate_density_on_g_mesh: ", log.timer_silent(*tq)[-1], "ms")
     rho_sf = density[0, 0]
 
-    tq = log.init_timer()
-    Gv = pbc_tools._get_Gv(cell, mesh)
-
-    tq = log.init_timer()
     coulomb_on_g_mesh = rho_sf * ni.coulG
     coulomb_energy = 0.5 * rho_sf.conj().dot(coulomb_on_g_mesh).real
     coulomb_energy /= cell.vol
@@ -1684,7 +1646,6 @@ def nr_rks(
 
     # eval_xc_eff supports float64 only
     density = cp.asarray(density, dtype=np.float64, order="C")
-    tq = log.init_timer()
     if xc_type == "LDA":
         xc_for_energy, xc_for_fock = ni.eval_xc_eff(
             xc_code, density[0], deriv=1, xctype=xc_type
@@ -1696,21 +1657,16 @@ def nr_rks(
     else:
         raise ValueError(f"Incorrect xc_type = {xc_type}")
 
-    print("libxc: ", log.timer_silent(*tq)[-1], "ms")
-
     rho_sf = density[0].real
     xc_energy_sum = rho_sf.dot(xc_for_energy.ravel()).get()[()] * weight
 
     # To reduce the memory usage, we reuse the xc_for_fock name.
     # Now xc_for_fock represents xc on G space
     xc_for_fock *= weight
-    tq = log.init_timer()
     xc_for_fock = fft_in_place(xc_for_fock.reshape(-1, *mesh)).reshape(-1, ngrids)
-    print("fft for xc:", log.timer_silent(*tq)[-1], "ms")
 
     log.debug("Multigrid exc %s  nelec %s", xc_energy_sum, n_electrons)
 
-    tq = log.init_timer()
     if xc_type == "LDA":
         pass
     elif xc_type == "GGA":
@@ -1729,8 +1685,6 @@ def nr_rks(
     else:
         raise ValueError(f"Incorrect xc_type = {xc_type}")
 
-    print("merge Gv: ", log.timer_silent(*tq)[-1], "ms")
-
     if with_j:
         xc_for_fock[0] += coulomb_on_g_mesh
 
@@ -1799,7 +1753,6 @@ def nr_uks(
     density = evaluate_density_on_g_mesh(ni, dm_kpts, kpts, xc_type)
     rho_sf = density[0, 0] + density[1, 0]
 
-    Gv = pbc_tools._get_Gv(cell, mesh)
     coulomb_on_g_mesh = rho_sf * ni.coulG
     coulomb_energy = 0.5 * rho_sf.conj().dot(coulomb_on_g_mesh).real
     coulomb_energy /= cell.vol
@@ -1935,7 +1888,7 @@ def get_veff_ip1(
     elif xc_type == "GGA":
         for i in xc_for_fock:
             contract_iG_potential(i, cell)
-        xc_for_fock = xc_for_fock[:,0]
+        xc_for_fock = xc_for_fock[:, 0]
         xc_for_fock = xc_for_fock.reshape((nset, -1, ngrids))
     elif xc_type == "MGGA":
         xc_for_fock[:, 0] -= contract("ngp, pg -> np", xc_for_fock[:, 1:4], Gv) * 1j

From de2b63b7d3479b0dc825e9714cfda5b93c141d6e Mon Sep 17 00:00:00 2001
From: Rui Li <rli3@caltech.edu>
Date: Thu, 4 Dec 2025 15:21:34 -0800
Subject: [PATCH 07/14] remove old caching

---
 gpu4pyscf/pbc/dft/multigrid_v2.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/gpu4pyscf/pbc/dft/multigrid_v2.py b/gpu4pyscf/pbc/dft/multigrid_v2.py
index 0e7d04264..8671da0c6 100644
--- a/gpu4pyscf/pbc/dft/multigrid_v2.py
+++ b/gpu4pyscf/pbc/dft/multigrid_v2.py
@@ -1591,7 +1591,8 @@ def get_j_kpts(ni, dm_kpts, hermi=1, kpts=None, kpts_band=None):
 
     density = evaluate_density_on_g_mesh(ni, dm_kpts, kpts)
     Gv = pbc_tools._get_Gv(cell, mesh)
-    coulomb_kernel_on_g_mesh = pbc_tools.get_coulG(cell, Gv=Gv)
+    # coulomb_kernel_on_g_mesh = pbc_tools.get_coulG(cell, Gv=Gv)
+    coulomb_kernel_on_g_mesh = ni.coulG
 
     coulomb_on_g_mesh = cp.einsum("ng, g -> g", density[:, 0], coulomb_kernel_on_g_mesh)
     weight = cell.vol / ngrids
@@ -1974,8 +1975,6 @@ def __init__(self, cell):
         Gv = pbc_tools._get_Gv(cell, cell.mesh)
         self.coulG = pbc_tools.get_coulG(cell, Gv=Gv)
         self.build()
-        if cell._pseudo:
-            self.cached_vpplocG_part1 = multigrid_v1.eval_vpplocG_part1(cell, self.mesh)
 
     build = sort_gaussian_pairs
 

From 37d0d068071d678703d378d04ca45d8a51da376c Mon Sep 17 00:00:00 2001
From: Rui Li <walterfeng@foxmail.com>
Date: Thu, 12 Feb 2026 14:21:38 -0800
Subject: [PATCH 08/14] add changes of pseudo-potential

---
 gpu4pyscf/pbc/dft/multigrid_v2.py | 737 +++++++++++++-----------------
 gpu4pyscf/pbc/grad/rhf.py         |  35 +-
 2 files changed, 338 insertions(+), 434 deletions(-)

diff --git a/gpu4pyscf/pbc/dft/multigrid_v2.py b/gpu4pyscf/pbc/dft/multigrid_v2.py
index 8671da0c6..ccedf4fb6 100644
--- a/gpu4pyscf/pbc/dft/multigrid_v2.py
+++ b/gpu4pyscf/pbc/dft/multigrid_v2.py
@@ -35,9 +35,9 @@
 import gpu4pyscf.pbc.dft.multigrid as multigrid_v1
 from gpu4pyscf.lib.cupy_helper import contract, tag_array, load_library
 
-__all__ = ["MultiGridNumInt"]
+__all__ = ['MultiGridNumInt']
 
-libgpbc = load_library("libmgrid_v2")
+libgpbc = load_library('libmgrid_v2')
 libgpbc.evaluate_density_driver.restype = ctypes.c_int
 libgpbc.evaluate_xc_driver.restype = ctypes.c_int
 libgpbc.evaluate_xc_gradient_driver.restype = ctypes.c_int
@@ -52,7 +52,7 @@ def complex_type(dtype):
     elif dtype == cp.float64:
         return cp.complex128
     else:
-        raise ValueError("Invalid dtype")
+        raise ValueError('Invalid dtype')
 
 
 def cast_to_pointer(array):
@@ -61,7 +61,7 @@ def cast_to_pointer(array):
     elif isinstance(array, np.ndarray):
         return array.ctypes.data_as(ctypes.c_void_p)
     else:
-        raise ValueError("Invalid array type")
+        raise ValueError('Invalid array type')
 
 
 def fft_in_place(x):
@@ -78,9 +78,7 @@ def iG_density(density, cell):
 
     mesh_in_int32 = np.array(mesh, dtype=np.int32)
     block_dim = (8, 8, 8)
-    grid_dim = tuple(
-        np.array(np.ceil(mesh_in_int32 / np.array(block_dim)), dtype=np.int32)
-    )
+    grid_dim = tuple(np.array(np.ceil(mesh_in_int32 / np.array(block_dim)), dtype=np.int32))
 
     result = cp.zeros((3, *mesh), dtype=cp.complex128).reshape(3, -1)
 
@@ -125,7 +123,7 @@ def iG_density(density, cell):
                        c * reciprocal_lattice[8]) * density_value;
         }
     """,
-        "kernel",
+        'kernel',
     )
 
     custom_kernel(
@@ -143,9 +141,7 @@ def contract_iG_potential(gga_potential, cell):
 
     mesh_in_int32 = np.array(mesh, dtype=np.int32)
     block_dim = (8, 8, 8)
-    grid_dim = tuple(
-        np.array(np.ceil(mesh_in_int32 / np.array(block_dim)), dtype=np.int32)
-    )
+    grid_dim = tuple(np.array(np.ceil(mesh_in_int32 / np.array(block_dim)), dtype=np.int32))
 
     custom_kernel = cp.RawKernel(
         r"""
@@ -197,7 +193,7 @@ def contract_iG_potential(gga_potential, cell):
             *gga_potential -= potential_change; 
         }
     """,
-        "kernel",
+        'kernel',
     )
 
     custom_kernel(
@@ -210,11 +206,7 @@ def contract_iG_potential(gga_potential, cell):
 def unique_with_sort(x):
     # This function does the same thing as cp.unique(x, return_inverse=True).
     # It's not super optimized, but for whatever reason, cp.unique is very slow, so this one is better.
-    assert (
-        type(x) is cp.ndarray
-        and (x.dtype == cp.int32 or x.dtype == cp.int64)
-        and x.ndim == 1
-    )
+    assert type(x) is cp.ndarray and (x.dtype == cp.int32 or x.dtype == cp.int64) and x.ndim == 1
     n = x.shape[0]
     if n <= 1:
         return x, cp.zeros(n)
@@ -237,11 +229,7 @@ def unique_with_sort(x):
 def unique_with_multiple_keys(x):
     # This function expands the previous function to handle multiple keys
     # shaped as [ (1, 2), (3, -4), ....]
-    assert (
-        type(x) is cp.ndarray
-        and (x.dtype == cp.int32 or x.dtype == cp.int64)
-        and x.ndim == 2
-    )
+    assert type(x) is cp.ndarray and (x.dtype == cp.int32 or x.dtype == cp.int64) and x.ndim == 2
     x = x.T
     n = x.shape[-1]
 
@@ -270,43 +258,29 @@ def sort_contraction_coefficients(coeff):
     inverse = inverse.get()
 
     sliced_axis = np.zeros((len(coeff) + 1, 2), dtype=np.int32)
-    sliced_axis[1:] = np.cumsum(
-        np.array([i.shape for i in coeff]), axis=0, dtype=np.int32
-    )
+    sliced_axis[1:] = np.cumsum(np.array([i.shape for i in coeff]), axis=0, dtype=np.int32)
 
-    left_basis_function_indices = [
-        cp.arange(begin, end)
-        for begin, end in zip(sliced_axis[:-1, 0], sliced_axis[1:, 0])
-    ]
+    left_basis_function_indices = [cp.arange(begin, end) for begin, end in zip(sliced_axis[:-1, 0], sliced_axis[1:, 0])]
     right_basis_function_indices = [
-        cp.arange(begin, end)
-        for begin, end in zip(sliced_axis[:-1, 1], sliced_axis[1:, 1])
+        cp.arange(begin, end) for begin, end in zip(sliced_axis[:-1, 1], sliced_axis[1:, 1])
     ]
 
-    sorted_coeffs = [
-        {"shape": shape, "coeffs": [], "left_indices": [], "right_indices": []}
-        for shape in unique_shapes
-    ]
+    sorted_coeffs = [{'shape': shape, 'coeffs': [], 'left_indices': [], 'right_indices': []} for shape in unique_shapes]
 
-    for category, coeffs, left, right in zip(
-        inverse, coeff, left_basis_function_indices, right_basis_function_indices
-    ):
-        sorted_coeffs[category]["coeffs"].append(coeffs)
-        sorted_coeffs[category]["left_indices"].append(left)
-        sorted_coeffs[category]["right_indices"].append(right)
+    for category, coeffs, left, right in zip(inverse, coeff, left_basis_function_indices, right_basis_function_indices):
+        sorted_coeffs[category]['coeffs'].append(coeffs)
+        sorted_coeffs[category]['left_indices'].append(left)
+        sorted_coeffs[category]['right_indices'].append(right)
 
     for category in sorted_coeffs:
-        category["coeffs"] = cp.array(category["coeffs"])
-        category["left_indices"] = cp.concatenate(category["left_indices"])
-        category["right_indices"] = cp.concatenate(category["right_indices"])
+        category['coeffs'] = cp.array(category['coeffs'])
+        category['left_indices'] = cp.concatenate(category['left_indices'])
+        category['right_indices'] = cp.concatenate(category['right_indices'])
 
     return sorted_coeffs, sliced_axis[-1]
 
 
-def contracted_to_primitive(
-    batched_matrices, sorted_coeffs_left, sorted_coeffs_right, primitive_shape
-):
-
+def contracted_to_primitive(batched_matrices, sorted_coeffs_left, sorted_coeffs_right, primitive_shape):
     assert len(batched_matrices.shape) == 3
 
     n_slices = batched_matrices.shape[0]
@@ -319,11 +293,11 @@ def contracted_to_primitive(
     intermediate = cp.zeros(intermediate_shape)
 
     for i in sorted_coeffs_left:
-        subarray_shape = (n_slices, -1, i["shape"][1], n_cols)
-        intermediate[:, i["left_indices"]] = cp.einsum(
-            "naij, api -> napj",
-            batched_matrices[:, i["right_indices"]].reshape(subarray_shape),
-            i["coeffs"],
+        subarray_shape = (n_slices, -1, i['shape'][1], n_cols)
+        intermediate[:, i['left_indices']] = cp.einsum(
+            'naij, api -> napj',
+            batched_matrices[:, i['right_indices']].reshape(subarray_shape),
+            i['coeffs'],
         ).reshape(n_slices, -1, n_cols)
 
     intermediate = intermediate.transpose(0, 2, 1)
@@ -332,20 +306,17 @@ def contracted_to_primitive(
     result = cp.zeros(result_shape)
 
     for i in sorted_coeffs_right:
-        subarray_shape = (n_slices, -1, i["shape"][1], n_rows_primitive)
-        result[:, i["left_indices"]] = cp.einsum(
-            "najp, aqj -> naqp",
-            intermediate[:, i["right_indices"]].reshape(subarray_shape),
-            i["coeffs"],
+        subarray_shape = (n_slices, -1, i['shape'][1], n_rows_primitive)
+        result[:, i['left_indices']] = cp.einsum(
+            'najp, aqj -> naqp',
+            intermediate[:, i['right_indices']].reshape(subarray_shape),
+            i['coeffs'],
         ).reshape(n_slices, -1, n_rows_primitive)
 
     return result.transpose(0, 2, 1)
 
 
-def primitive_to_contracted(
-    batched_matrices, sorted_coeffs_left, sorted_coeffs_right, contracted_shape
-):
-
+def primitive_to_contracted(batched_matrices, sorted_coeffs_left, sorted_coeffs_right, contracted_shape):
     assert len(batched_matrices.shape) == 3
 
     n_slices = batched_matrices.shape[0]
@@ -358,11 +329,11 @@ def primitive_to_contracted(
     intermediate = cp.zeros(intermediate_shape)
 
     for i in sorted_coeffs_left:
-        subarray_shape = (n_slices, -1, i["shape"][0], n_cols)
-        intermediate[:, i["right_indices"]] = cp.einsum(
-            "napq, api -> naiq",
-            batched_matrices[:, i["left_indices"]].reshape(subarray_shape),
-            i["coeffs"],
+        subarray_shape = (n_slices, -1, i['shape'][0], n_cols)
+        intermediate[:, i['right_indices']] = cp.einsum(
+            'napq, api -> naiq',
+            batched_matrices[:, i['left_indices']].reshape(subarray_shape),
+            i['coeffs'],
         ).reshape(n_slices, -1, n_cols)
 
     intermediate = intermediate.transpose(0, 2, 1)
@@ -370,11 +341,11 @@ def primitive_to_contracted(
     result = cp.zeros(result_shape)
 
     for i in sorted_coeffs_right:
-        subarray_shape = (n_slices, -1, i["shape"][0], n_rows_contracted)
-        result[:, i["right_indices"]] = cp.einsum(
-            "naqi, aqj -> naji",
-            intermediate[:, i["left_indices"]].reshape(subarray_shape),
-            i["coeffs"],
+        subarray_shape = (n_slices, -1, i['shape'][0], n_rows_contracted)
+        result[:, i['right_indices']] = cp.einsum(
+            'naqi, aqj -> naji',
+            intermediate[:, i['left_indices']].reshape(subarray_shape),
+            i['coeffs'],
         ).reshape(n_slices, -1, n_rows_contracted)
 
     return result.transpose(0, 2, 1)
@@ -384,7 +355,7 @@ def image_pair_to_difference(
     vectors_to_neighboring_images,
     lattice_vectors,
 ):
-    '''
+    """
     Find unique image pairs for double lattice-sums associated with orbital products.
 
     When k-point phases are applied to orbital products with double lattice sum
@@ -406,14 +377,14 @@ def image_pair_to_difference(
         A tuple containing:
         - The reduced lattice-sum vectors T for the unique image pairs.
         - A inverse mapping that restores the index of double lattice-sum from T.
-    '''
+    """
     vectors_to_neighboring_images = cp.asarray(vectors_to_neighboring_images)
     lattice_vectors = cp.asarray(lattice_vectors)
 
     translation_vectors = cp.asarray(
         cp.linalg.solve(lattice_vectors.T, vectors_to_neighboring_images.T).T,
     )
-    translation_vectors = cp.asarray(cp.round(translation_vectors), dtype = cp.int32)
+    translation_vectors = cp.asarray(cp.round(translation_vectors), dtype=cp.int32)
     difference_images, inverse = _unique_image_pair(translation_vectors)
     difference_images = difference_images @ lattice_vectors
 
@@ -421,30 +392,31 @@ def image_pair_to_difference(
     # where R1 is associated with the first orbital in a pair, and R2 associated to the second.
     return cp.asarray(difference_images), cp.asarray(inverse, dtype=cp.int32)
 
+
 def _unique_image_pair(translation_vectors):
-    '''
+    """
     unqiue((-L[:,None] + L).reshape(-1, 3), axis=0, return_inverse=True)
-    '''
+    """
     image_difference_full = (
         # -k_i + k_j corresponding to <i|j>
-        translation_vectors[None,:,:] - translation_vectors[:,None,:]
+        translation_vectors[None, :, :] - translation_vectors[:, None, :]
     ).reshape(-1, 3)
 
     max_offset = (translation_vectors.max(axis=0) - translation_vectors.min(axis=0)).max() + 1
-    assert (max_offset * 2)**3 < np.iinfo(np.int32).max
+    assert (max_offset * 2) ** 3 < np.iinfo(np.int32).max
     image_difference_3in1 = image_difference_full
     image_difference_3in1 += max_offset
-    image_difference_3in1 = image_difference_3in1[:, 0] * (max_offset * 2)**2 \
-                          + image_difference_3in1[:, 1] * (max_offset * 2) \
-                          + image_difference_3in1[:, 2]
+    image_difference_3in1 = (
+        image_difference_3in1[:, 0] * (max_offset * 2) ** 2
+        + image_difference_3in1[:, 1] * (max_offset * 2)
+        + image_difference_3in1[:, 2]
+    )
 
     image_difference_3in1, inverse = unique_with_sort(image_difference_3in1)
 
     translation_vectors = cp.empty([image_difference_3in1.shape[0], 3], dtype=cp.int32)
     translation_vectors[:, 0] = image_difference_3in1 // (max_offset * 2) ** 2
-    translation_vectors[:, 1] = (image_difference_3in1 % (max_offset * 2) ** 2) // (
-        max_offset * 2
-    )
+    translation_vectors[:, 1] = (image_difference_3in1 % (max_offset * 2) ** 2) // (max_offset * 2)
     translation_vectors[:, 2] = image_difference_3in1 % (max_offset * 2)
     translation_vectors -= max_offset
     return translation_vectors, inverse
@@ -461,9 +433,7 @@ def image_phase_for_kpts(cell, neighboring_images, kpts=None):
             neighboring_images,
             lattice_vectors,
         )
-        phase_diff_among_images = cp.exp(
-            1j * cp.asarray(kpts.reshape(-1, 3)).dot(difference_images.T)
-        )
+        phase_diff_among_images = cp.exp(1j * cp.asarray(kpts.reshape(-1, 3)).dot(difference_images.T))
     return phase_diff_among_images, image_pair_difference_index
 
 
@@ -500,9 +470,7 @@ def count_non_trivial_pairs(
         ctypes.c_double(threshold_in_log),
     )
     if err != 0:
-        raise RuntimeError(
-            f"count_non_trivial_pairs for li={i_angular} lj={j_angular} failed"
-        )
+        raise RuntimeError(f'count_non_trivial_pairs for li={i_angular} lj={j_angular} failed')
     return int(n_pairs[0])
 
 
@@ -558,9 +526,7 @@ def screen_gaussian_pairs(
         ctypes.c_double(threshold_in_log),
     )
     if err != 0:
-        raise RuntimeError(
-            f"screen_gaussian_pairs for li={i_angular} lj={j_angular} failed"
-        )
+        raise RuntimeError(f'screen_gaussian_pairs for li={i_angular} lj={j_angular} failed')
     return (
         screened_shell_pairs,
         image_indices,
@@ -615,13 +581,11 @@ def assign_pairs_to_blocks(
     )
     has_unstable_pairs = n_unstable_pairs_on_blocks[-1] > 0
     if not has_warned_instability and has_unstable_pairs:
-        warnings.warn(
-            "Numerical instability may occur due to presence of core electrons or insufficient ke_cutoff."
-        )
+        warnings.warn('Numerical instability may occur due to presence of core electrons or insufficient ke_cutoff.')
         has_warned_instability = True
 
     if err != 0:
-        raise RuntimeError("count_pairs_on_blocks failed")
+        raise RuntimeError('count_pairs_on_blocks failed')
 
     n_contributing_blocks = int(n_pairs_on_blocks[-1])
     n_pairs_on_blocks = n_pairs_on_blocks[:-1]
@@ -691,7 +655,7 @@ def assign_pairs_to_blocks(
     )
 
 
-def sort_gaussian_pairs(mydf, xc_type="LDA"):
+def sort_gaussian_pairs(mydf, xc_type='LDA'):
     cell = mydf.cell
     log = logger.new_logger(cell)
     t0 = log.init_timer()
@@ -704,7 +668,7 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"):
         is_non_orthogonal = 1
     else:
         is_non_orthogonal = 0
-    reciprocal_lattice_vectors = np.asarray(np.linalg.inv(lattice_vectors.T), order="C")
+    reciprocal_lattice_vectors = np.asarray(np.linalg.inv(lattice_vectors.T), order='C')
 
     reciprocal_norms = np.linalg.norm(reciprocal_lattice_vectors, axis=1)
     libgpbc.update_lattice_vectors(
@@ -713,20 +677,20 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"):
         reciprocal_norms.ctypes,
     )
 
-    tasks = getattr(mydf, "tasks", None)
+    tasks = getattr(mydf, 'tasks', None)
     if tasks is None:
         tasks = multigrid.multi_grids_tasks(cell, mydf.mesh, log)
         mydf.tasks = tasks
 
-    t0 = log.timer("task generation", *t0)
+    t0 = log.timer('task generation', *t0)
     t1 = t0
     pairs = []
 
     derivative_order = 0
-    if xc_type == "GGA":
+    if xc_type == 'GGA':
         derivative_order = 1
-    if xc_type == "mGGA":
-        raise NotImplementedError("mGGA is not supported yet in multigrid_v2.")
+    if xc_type == 'mGGA':
+        raise NotImplementedError('mGGA is not supported yet in multigrid_v2.')
 
     for grids_localized, grids_diffused in tasks:
         subcell_in_localized_region = grids_localized.cell
@@ -736,9 +700,7 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"):
 
         fft_grid = list(
             map(
-                lambda n_mesh_points: cp.round(
-                    cp.fft.fftfreq(n_mesh_points, 1.0 / n_mesh_points)
-                ).astype(cp.int32),
+                lambda n_mesh_points: cp.round(cp.fft.fftfreq(n_mesh_points, 1.0 / n_mesh_points)).astype(cp.int32),
                 mesh,
             )
         )
@@ -746,13 +708,9 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"):
         dxyz_dabc = lattice_vectors / mesh[:, None]
         libgpbc.update_dxyz_dabc(dxyz_dabc.ctypes)
         n_blocks_abc = np.asarray(np.ceil(mesh / block_size), dtype=cp.int32)
-        equivalent_cell_in_localized, coeff_in_localized = (
-            subcell_in_localized_region.decontract_basis(to_cart=True)
-        )
+        equivalent_cell_in_localized, coeff_in_localized = subcell_in_localized_region.decontract_basis(to_cart=True)
 
-        n_primitive_gtos_in_localized = multigrid._pgto_shells(
-            subcell_in_localized_region
-        )
+        n_primitive_gtos_in_localized = multigrid._pgto_shells(subcell_in_localized_region)
 
         # theoretically we can use the rcut defined in localized cell to reduce the
         # number of images, but somehow it can introduce some error when the lattice
@@ -765,25 +723,17 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"):
             concatenated_coeff = coeff_in_localized
         else:
             subcell_in_diffused_region = grids_diffused.cell
-            equivalent_cell_in_diffused, coeff_in_diffused = (
-                subcell_in_diffused_region.decontract_basis(to_cart=True)
-            )
+            equivalent_cell_in_diffused, coeff_in_diffused = subcell_in_diffused_region.decontract_basis(to_cart=True)
 
             grouped_cell = equivalent_cell_in_localized + equivalent_cell_in_diffused
 
-            grouped_cell._bas[n_primitive_gtos_in_localized:, 0] -= len(
-                subcell_in_localized_region._atm
-            )
+            grouped_cell._bas[n_primitive_gtos_in_localized:, 0] -= len(subcell_in_localized_region._atm)
 
             concatenated_coeff = coeff_in_localized + coeff_in_diffused
 
-        coeff_in_localized, localized_shape = sort_contraction_coefficients(
-            coeff_in_localized
-        )
+        coeff_in_localized, localized_shape = sort_contraction_coefficients(coeff_in_localized)
 
-        concatenated_coeff, concatenated_shape = sort_contraction_coefficients(
-            concatenated_coeff
-        )
+        concatenated_coeff, concatenated_shape = sort_contraction_coefficients(concatenated_coeff)
 
         n_primitive_gtos_in_two_regions = multigrid._pgto_shells(grouped_cell)
         rad = vol ** (-1.0 / 3) * cell.rcut + 1
@@ -792,18 +742,14 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"):
         precision = cell.precision / lattice_sum_factor
         threshold_in_log = np.log(precision)
 
-        shell_to_ao_indices = cp.asarray(
-            gto.moleintor.make_loc(grouped_cell._bas, "cart"), dtype=cp.int32
-        )
+        shell_to_ao_indices = cp.asarray(gto.moleintor.make_loc(grouped_cell._bas, 'cart'), dtype=cp.int32)
         ao_indices_in_localized = cp.asarray(grids_localized.ao_idx, dtype=cp.int32)
         if grids_diffused is None:
             ao_indices_in_diffused = cp.array([], dtype=cp.int32)
         else:
             ao_indices_in_diffused = cp.asarray(grids_diffused.ao_idx, dtype=cp.int32)
 
-        concatenated_ao_indices = cp.concatenate(
-            (ao_indices_in_localized, ao_indices_in_diffused)
-        )
+        concatenated_ao_indices = cp.concatenate((ao_indices_in_localized, ao_indices_in_diffused))
         per_angular_pairs = []
 
         i_angulars = grouped_cell._bas[:n_primitive_gtos_in_localized, multigrid.ANG_OF]
@@ -813,9 +759,7 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"):
             i_shells = cp.asarray(np.where(i_angulars == l)[0], dtype=cp.int32)
             sorted_i_shells.append(i_shells)
 
-        j_angulars = grouped_cell._bas[
-            :n_primitive_gtos_in_two_regions, multigrid.ANG_OF
-        ]
+        j_angulars = grouped_cell._bas[:n_primitive_gtos_in_two_regions, multigrid.ANG_OF]
         j_angulars_unique = np.unique(j_angulars)
         sorted_j_shells = []
         for l in j_angulars_unique:
@@ -826,7 +770,7 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"):
         bas = cp.asarray(grouped_cell._bas, dtype=cp.int32)
         env = cp.asarray(grouped_cell._env)
 
-        t1 = log.timer_debug2("routines before screening", *t1)
+        t1 = log.timer_debug2('routines before screening', *t1)
         has_warned_instability = False
         for i_angular, i_shells in zip(i_angulars_unique, sorted_i_shells):
             for j_angular, j_shells in zip(j_angulars_unique, sorted_j_shells):
@@ -847,15 +791,9 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"):
                     env,
                     threshold_in_log,
                 )
-                t1 = log.timer_debug2(
-                    "screening in angular pair" + str((i_angular, j_angular)), *t1
-                )
-                contributing_block_ranges = (
-                    pairs_to_blocks_end - pairs_to_blocks_begin + 1
-                )
-                n_contributing_blocks_per_pair = cp.prod(
-                    contributing_block_ranges, axis=0
-                )
+                t1 = log.timer_debug2('screening in angular pair' + str((i_angular, j_angular)), *t1)
+                contributing_block_ranges = pairs_to_blocks_end - pairs_to_blocks_begin + 1
+                n_contributing_blocks_per_pair = cp.prod(contributing_block_ranges, axis=0)
                 n_indices = int(cp.sum(n_contributing_blocks_per_pair))
                 (
                     gaussian_pair_indices,
@@ -885,60 +823,57 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"):
                     derivative_order,
                 )
                 t1 = log.timer_debug2(
-                    "assigning pairs to blocks in angular pair"
-                    + str((i_angular, j_angular)),
+                    'assigning pairs to blocks in angular pair' + str((i_angular, j_angular)),
                     *t1,
                 )
                 per_angular_pairs.append(
                     {
-                        "angular": (i_angular, j_angular),
-                        "screened_shell_pairs": screened_shell_pairs,
-                        "pair_indices_per_block": gaussian_pair_indices,
-                        "accumulated_counts_per_block": accumulated_counts,
-                        "sorted_block_index": sorted_contributing_blocks,
-                        "image_indices": image_indices,
-                        "i_shells": i_shells,
-                        "j_shells": j_shells,
-                        "shell_to_ao_indices": shell_to_ao_indices,
+                        'angular': (i_angular, j_angular),
+                        'screened_shell_pairs': screened_shell_pairs,
+                        'pair_indices_per_block': gaussian_pair_indices,
+                        'accumulated_counts_per_block': accumulated_counts,
+                        'sorted_block_index': sorted_contributing_blocks,
+                        'image_indices': image_indices,
+                        'i_shells': i_shells,
+                        'j_shells': j_shells,
+                        'shell_to_ao_indices': shell_to_ao_indices,
                     }
                 )
 
         pairs.append(
             {
-                "per_angular_pairs": per_angular_pairs,
-                "neighboring_images": vectors_to_neighboring_images,
-                "grouped_cell": grouped_cell,
-                "mesh": mesh,  # this one is on cpu memory
-                "fft_grid": fft_grid,
-                "ao_indices_in_localized": ao_indices_in_localized,
-                "ao_indices_in_diffused": ao_indices_in_diffused,
-                "concatenated_ao_indices": concatenated_ao_indices,
-                "coeff_in_localized": coeff_in_localized,
-                "concatenated_coeff": concatenated_coeff,
-                "primitive_shape": (localized_shape[0], concatenated_shape[0]),
-                "contracted_shape": (localized_shape[1], concatenated_shape[1]),
-                "atm": atm,
-                "bas": bas,
-                "env": env,
-                "dxyz_dabc": dxyz_dabc,
-                "is_non_orthogonal": is_non_orthogonal,
+                'per_angular_pairs': per_angular_pairs,
+                'neighboring_images': vectors_to_neighboring_images,
+                'grouped_cell': grouped_cell,
+                'mesh': mesh,  # this one is on cpu memory
+                'fft_grid': fft_grid,
+                'ao_indices_in_localized': ao_indices_in_localized,
+                'ao_indices_in_diffused': ao_indices_in_diffused,
+                'concatenated_ao_indices': concatenated_ao_indices,
+                'coeff_in_localized': coeff_in_localized,
+                'concatenated_coeff': concatenated_coeff,
+                'primitive_shape': (localized_shape[0], concatenated_shape[0]),
+                'contracted_shape': (localized_shape[1], concatenated_shape[1]),
+                'atm': atm,
+                'bas': bas,
+                'env': env,
+                'dxyz_dabc': dxyz_dabc,
+                'is_non_orthogonal': is_non_orthogonal,
             }
         )
 
     mydf.sorted_gaussian_pairs = pairs
 
-    t0 = log.timer("sort_gaussian_pairs", *t0)
+    t0 = log.timer('sort_gaussian_pairs', *t0)
     return mydf
 
 
-def evaluate_density_wrapper(
-    pairs_info, dm_slice, img_phase, ignore_imag=True, with_tau=False
-):
+def evaluate_density_wrapper(pairs_info, dm_slice, img_phase, ignore_imag=True, with_tau=False):
     if with_tau:
         c_driver = libgpbc.evaluate_density_tau_driver
     else:
         c_driver = libgpbc.evaluate_density_driver
-    n_images = pairs_info["neighboring_images"].shape[0]
+    n_images = pairs_info['neighboring_images'].shape[0]
     phase_diff_among_images, image_pair_difference_index = img_phase
     n_k_points, n_difference_images = phase_diff_among_images.shape
     if n_k_points == 1 and n_difference_images == 1:
@@ -948,9 +883,7 @@ def evaluate_density_wrapper(
         # e^{i \vec{k} \cdot (\vec{R}_1 - \vec{R}_2)}
         # Because during grid density evaluation, rho = \sum_{\mu\nu} D_{\mu\nu} \mu \nu^*
         # The conjugate is on \nu, which is different from other Fock integrals
-        density_matrix_with_translation = cp.einsum(
-            "kt, ikpq->itpq", phase_diff_among_images.conj(), dm_slice
-        )
+        density_matrix_with_translation = cp.einsum('kt, ikpq->itpq', phase_diff_among_images.conj(), dm_slice)
 
     n_channels, _, n_i_functions, n_j_functions = density_matrix_with_translation.shape
 
@@ -962,9 +895,7 @@ def evaluate_density_wrapper(
         # assert abs(density_matrix_with_translation.imag).max() < real_dm_imag_threshold, \
         #     f"The dm transformed into real space contains large imaginary part " \
         #     f"(max = {abs(density_matrix_with_translation.imag).max()}) >= {real_dm_imag_threshold}"
-    density_matrix_with_translation_real_part = cp.asarray(
-        density_matrix_with_translation.real, order="C"
-    )
+    density_matrix_with_translation_real_part = cp.asarray(density_matrix_with_translation.real, order='C')
 
     if density_matrix_with_translation_real_part.dtype == cp.float32:
         use_float_precision = ctypes.c_int(1)
@@ -979,83 +910,78 @@ def evaluate_density_wrapper(
                 n_channels,
                 2,
             )
-            + tuple(pairs_info["mesh"]),
+            + tuple(pairs_info['mesh']),
             dtype=density_matrix_with_translation_real_part.dtype,
         )
     else:
         density = cp.zeros(
-            (n_channels,) + tuple(pairs_info["mesh"]),
+            (n_channels,) + tuple(pairs_info['mesh']),
             dtype=density_matrix_with_translation_real_part.dtype,
         )
 
-    for gaussians_per_angular_pair in pairs_info["per_angular_pairs"]:
-        (i_angular, j_angular) = gaussians_per_angular_pair["angular"]
+    for gaussians_per_angular_pair in pairs_info['per_angular_pairs']:
+        (i_angular, j_angular) = gaussians_per_angular_pair['angular']
 
         err = c_driver(
             cast_to_pointer(density),
             cast_to_pointer(density_matrix_with_translation_real_part),
             ctypes.c_int(i_angular),
             ctypes.c_int(j_angular),
-            cast_to_pointer(gaussians_per_angular_pair["screened_shell_pairs"]),
-            cast_to_pointer(gaussians_per_angular_pair["i_shells"]),
-            cast_to_pointer(gaussians_per_angular_pair["j_shells"]),
-            ctypes.c_int(len(gaussians_per_angular_pair["j_shells"])),
-            cast_to_pointer(gaussians_per_angular_pair["shell_to_ao_indices"]),
+            cast_to_pointer(gaussians_per_angular_pair['screened_shell_pairs']),
+            cast_to_pointer(gaussians_per_angular_pair['i_shells']),
+            cast_to_pointer(gaussians_per_angular_pair['j_shells']),
+            ctypes.c_int(len(gaussians_per_angular_pair['j_shells'])),
+            cast_to_pointer(gaussians_per_angular_pair['shell_to_ao_indices']),
             ctypes.c_int(n_i_functions),
             ctypes.c_int(n_j_functions),
-            cast_to_pointer(gaussians_per_angular_pair["pair_indices_per_block"]),
-            cast_to_pointer(gaussians_per_angular_pair["accumulated_counts_per_block"]),
-            cast_to_pointer(gaussians_per_angular_pair["sorted_block_index"]),
-            ctypes.c_int(len(gaussians_per_angular_pair["sorted_block_index"])),
-            cast_to_pointer(gaussians_per_angular_pair["image_indices"]),
-            cast_to_pointer(pairs_info["neighboring_images"]),
+            cast_to_pointer(gaussians_per_angular_pair['pair_indices_per_block']),
+            cast_to_pointer(gaussians_per_angular_pair['accumulated_counts_per_block']),
+            cast_to_pointer(gaussians_per_angular_pair['sorted_block_index']),
+            ctypes.c_int(len(gaussians_per_angular_pair['sorted_block_index'])),
+            cast_to_pointer(gaussians_per_angular_pair['image_indices']),
+            cast_to_pointer(pairs_info['neighboring_images']),
             ctypes.c_int(n_images),
             cast_to_pointer(image_pair_difference_index),
             ctypes.c_int(n_difference_images),
-            (ctypes.c_int * 3)(*pairs_info["mesh"]),
-            cast_to_pointer(pairs_info["atm"]),
-            cast_to_pointer(pairs_info["bas"]),
-            cast_to_pointer(pairs_info["env"]),
+            (ctypes.c_int * 3)(*pairs_info['mesh']),
+            cast_to_pointer(pairs_info['atm']),
+            cast_to_pointer(pairs_info['bas']),
+            cast_to_pointer(pairs_info['env']),
             ctypes.c_int(n_channels),
-            ctypes.c_int(pairs_info["is_non_orthogonal"]),
+            ctypes.c_int(pairs_info['is_non_orthogonal']),
             use_float_precision,
         )
         if err != 0:
-            raise RuntimeError(
-                f"evaluate_density_driver for li={i_angular} lj={j_angular} failed"
-            )
+            raise RuntimeError(f'evaluate_density_driver for li={i_angular} lj={j_angular} failed')
 
     return density
 
 
-def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"):
-    dm_kpts = cp.asarray(dm_kpts, order="C")
+def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type='LDA'):
+    dm_kpts = cp.asarray(dm_kpts, order='C')
     dms = _format_dms(dm_kpts, kpts)
     n_channels, n_k_points = dms.shape[:2]
     if mydf.sorted_gaussian_pairs is None:
         mydf.build(xc_type)
 
     with_tau = False
-    if xc_type == "LDA":
+    if xc_type == 'LDA':
         density_slices = 1
-    elif xc_type == "GGA":
+    elif xc_type == 'GGA':
         density_slices = 4
-    elif xc_type == "MGGA":
+    elif xc_type == 'MGGA':
         density_slices = 5
         with_tau = True
     else:
-        raise ValueError(f"Incorrect xc_type = {xc_type}")
+        raise ValueError(f'Incorrect xc_type = {xc_type}')
 
     cell = mydf.cell
 
     nx, ny, nz = mydf.mesh
-    density_on_g_mesh = cp.zeros(
-        (n_channels, density_slices, nx, ny, nz), dtype=cp.complex128
-    )
+    density_on_g_mesh = cp.zeros((n_channels, density_slices, nx, ny, nz), dtype=cp.complex128)
 
     for pairs in mydf.sorted_gaussian_pairs:
-
-        mesh = pairs["mesh"]
+        mesh = pairs['mesh']
 
         n_grid_points = np.prod(mesh)
         weight_per_grid_point = 1.0 / n_k_points * cell.vol / n_grid_points
@@ -1063,45 +989,41 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"):
         density_matrix_with_rows_in_localized = dms[
             :,
             :,
-            pairs["ao_indices_in_localized"][:, None],
-            pairs["concatenated_ao_indices"],
+            pairs['ao_indices_in_localized'][:, None],
+            pairs['concatenated_ao_indices'],
         ]
 
         density_matrix_with_rows_in_diffused = dms[
             :,
             :,
-            pairs["ao_indices_in_diffused"][:, None],
-            pairs["ao_indices_in_localized"],
+            pairs['ao_indices_in_diffused'][:, None],
+            pairs['ao_indices_in_localized'],
         ]
 
         n_ao_in_localized = density_matrix_with_rows_in_diffused.shape[3]
 
-        density_matrix_with_rows_in_localized[
-            :, :, :, n_ao_in_localized:
-        ] += density_matrix_with_rows_in_diffused.transpose(0, 1, 3, 2).conj()
+        density_matrix_with_rows_in_localized[:, :, :, n_ao_in_localized:] += (
+            density_matrix_with_rows_in_diffused.transpose(0, 1, 3, 2).conj()
+        )
 
         n_sets, n_k_points = density_matrix_with_rows_in_localized.shape[:2]
 
-        density_matrix_with_rows_in_localized = (
-            density_matrix_with_rows_in_localized.reshape(
-                -1, *pairs["contracted_shape"]
-            )
+        density_matrix_with_rows_in_localized = density_matrix_with_rows_in_localized.reshape(
+            -1, *pairs['contracted_shape']
         )
 
         coeff_sandwiched_density_matrix = contracted_to_primitive(
             density_matrix_with_rows_in_localized,
-            pairs["coeff_in_localized"],
-            pairs["concatenated_coeff"],
-            pairs["primitive_shape"],
-        ).reshape(n_sets, n_k_points, *pairs["primitive_shape"])
+            pairs['coeff_in_localized'],
+            pairs['concatenated_coeff'],
+            pairs['primitive_shape'],
+        ).reshape(n_sets, n_k_points, *pairs['primitive_shape'])
 
-        libgpbc.update_dxyz_dabc(pairs["dxyz_dabc"].ctypes)
+        libgpbc.update_dxyz_dabc(pairs['dxyz_dabc'].ctypes)
 
-        img_phase = image_phase_for_kpts(cell, pairs["neighboring_images"], kpts)
+        img_phase = image_phase_for_kpts(cell, pairs['neighboring_images'], kpts)
         density = (
-            evaluate_density_wrapper(
-                pairs, coeff_sandwiched_density_matrix, img_phase, with_tau=with_tau
-            )
+            evaluate_density_wrapper(pairs, coeff_sandwiched_density_matrix, img_phase, with_tau=with_tau)
             * weight_per_grid_point
         )
 
@@ -1114,9 +1036,9 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"):
         density_on_g_mesh[
             :,
             0,
-            pairs["fft_grid"][0][:, None, None],
-            pairs["fft_grid"][1][:, None],
-            pairs["fft_grid"][2],
+            pairs['fft_grid'][0][:, None, None],
+            pairs['fft_grid'][1][:, None],
+            pairs['fft_grid'][2],
         ] += density
 
         if with_tau:
@@ -1125,13 +1047,13 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type="LDA"):
             density_on_g_mesh[
                 :,
                 4,
-                pairs["fft_grid"][0][:, None, None],
-                pairs["fft_grid"][1][:, None],
-                pairs["fft_grid"][2],
+                pairs['fft_grid'][0][:, None, None],
+                pairs['fft_grid'][1][:, None],
+                pairs['fft_grid'][2],
             ] += tau
 
     density_on_g_mesh = density_on_g_mesh.reshape([n_channels, density_slices, -1])
-    if xc_type != "LDA":
+    if xc_type != 'LDA':
         for i in range(len(density_on_g_mesh)):
             density_on_g_mesh[i, 1:4] = iG_density(density_on_g_mesh[i], cell)
 
@@ -1144,9 +1066,7 @@ def evaluate_xc_wrapper(pairs_info, xc_weights, img_phase, with_tau=False):
         n_channels = xc_weights.shape[0]
         # density_slices = 2
     else:
-        assert (xc_weights.ndim == 3 + 2 and xc_weights.shape[1] == 1) or (
-            xc_weights.ndim == 3 + 1
-        )
+        assert (xc_weights.ndim == 3 + 2 and xc_weights.shape[1] == 1) or (xc_weights.ndim == 3 + 1)
         n_channels = xc_weights.shape[0]
         # density_slices = 1
 
@@ -1154,11 +1074,11 @@ def evaluate_xc_wrapper(pairs_info, xc_weights, img_phase, with_tau=False):
         c_driver = libgpbc.evaluate_xc_with_tau_driver
     else:
         c_driver = libgpbc.evaluate_xc_driver
-    n_i_functions, n_j_functions = pairs_info["primitive_shape"]
+    n_i_functions, n_j_functions = pairs_info['primitive_shape']
 
     phase_diff_among_images, image_pair_difference_index = img_phase
     n_k_points, n_difference_images = phase_diff_among_images.shape
-    n_images = pairs_info["neighboring_images"].shape[0]
+    n_images = pairs_info['neighboring_images'].shape[0]
 
     fock = cp.zeros(
         (n_channels, n_difference_images, n_i_functions, n_j_functions),
@@ -1170,49 +1090,47 @@ def evaluate_xc_wrapper(pairs_info, xc_weights, img_phase, with_tau=False):
         assert xc_weights.dtype == cp.float64
         use_float_precision = ctypes.c_int(0)
 
-    for gaussians_per_angular_pair in pairs_info["per_angular_pairs"]:
+    for gaussians_per_angular_pair in pairs_info['per_angular_pairs']:
         fock_slice = cp.zeros(
             (n_channels, n_difference_images, n_i_functions, n_j_functions),
             dtype=xc_weights.dtype,
         )
-        (i_angular, j_angular) = gaussians_per_angular_pair["angular"]
+        (i_angular, j_angular) = gaussians_per_angular_pair['angular']
         err = c_driver(
             cast_to_pointer(fock_slice),
             cast_to_pointer(xc_weights),
             ctypes.c_int(i_angular),
             ctypes.c_int(j_angular),
-            cast_to_pointer(gaussians_per_angular_pair["screened_shell_pairs"]),
-            cast_to_pointer(gaussians_per_angular_pair["i_shells"]),
-            cast_to_pointer(gaussians_per_angular_pair["j_shells"]),
-            ctypes.c_int(len(gaussians_per_angular_pair["j_shells"])),
-            cast_to_pointer(gaussians_per_angular_pair["shell_to_ao_indices"]),
+            cast_to_pointer(gaussians_per_angular_pair['screened_shell_pairs']),
+            cast_to_pointer(gaussians_per_angular_pair['i_shells']),
+            cast_to_pointer(gaussians_per_angular_pair['j_shells']),
+            ctypes.c_int(len(gaussians_per_angular_pair['j_shells'])),
+            cast_to_pointer(gaussians_per_angular_pair['shell_to_ao_indices']),
             ctypes.c_int(n_i_functions),
             ctypes.c_int(n_j_functions),
-            cast_to_pointer(gaussians_per_angular_pair["pair_indices_per_block"]),
-            cast_to_pointer(gaussians_per_angular_pair["accumulated_counts_per_block"]),
-            cast_to_pointer(gaussians_per_angular_pair["sorted_block_index"]),
-            ctypes.c_int(len(gaussians_per_angular_pair["sorted_block_index"])),
-            cast_to_pointer(gaussians_per_angular_pair["image_indices"]),
-            cast_to_pointer(pairs_info["neighboring_images"]),
+            cast_to_pointer(gaussians_per_angular_pair['pair_indices_per_block']),
+            cast_to_pointer(gaussians_per_angular_pair['accumulated_counts_per_block']),
+            cast_to_pointer(gaussians_per_angular_pair['sorted_block_index']),
+            ctypes.c_int(len(gaussians_per_angular_pair['sorted_block_index'])),
+            cast_to_pointer(gaussians_per_angular_pair['image_indices']),
+            cast_to_pointer(pairs_info['neighboring_images']),
             ctypes.c_int(n_images),
             cast_to_pointer(image_pair_difference_index),
             ctypes.c_int(n_difference_images),
-            cast_to_pointer(pairs_info["mesh"]),
-            cast_to_pointer(pairs_info["atm"]),
-            cast_to_pointer(pairs_info["bas"]),
-            cast_to_pointer(pairs_info["env"]),
+            cast_to_pointer(pairs_info['mesh']),
+            cast_to_pointer(pairs_info['atm']),
+            cast_to_pointer(pairs_info['bas']),
+            cast_to_pointer(pairs_info['env']),
             ctypes.c_int(n_channels),
-            ctypes.c_int(pairs_info["is_non_orthogonal"]),
+            ctypes.c_int(pairs_info['is_non_orthogonal']),
             use_float_precision,
         )
         fock += fock_slice
         if err != 0:
-            raise RuntimeError(
-                f"evaluate_xc_driver for li={i_angular} lj={j_angular} failed"
-            )
+            raise RuntimeError(f'evaluate_xc_driver for li={i_angular} lj={j_angular} failed')
 
     if not (n_k_points == 1 and n_difference_images == 1):
-        return cp.einsum("kt, ntij -> nkij", phase_diff_among_images, fock)
+        return cp.einsum('kt, ntij -> nkij', phase_diff_among_images, fock)
     else:
         return fock
 
@@ -1235,7 +1153,7 @@ def convert_xc_on_g_mesh_to_fock(
             assert xc_on_g_mesh.shape[1] == 2
             n_channels = xc_on_g_mesh.shape[0]
         else:
-            raise ValueError("Incorrect shape of xc_on_g_mesh = {xc_on_g_mesh.shape}")
+            raise ValueError('Incorrect shape of xc_on_g_mesh = {xc_on_g_mesh.shape}')
         density_slices = 2
     else:
         if xc_on_g_mesh.ndim == 1:
@@ -1246,7 +1164,7 @@ def convert_xc_on_g_mesh_to_fock(
             assert xc_on_g_mesh.shape[1] == 1
             n_channels = xc_on_g_mesh.shape[0]
         else:
-            raise ValueError("Incorrect shape of xc_on_g_mesh = {xc_on_g_mesh.shape}")
+            raise ValueError('Incorrect shape of xc_on_g_mesh = {xc_on_g_mesh.shape}')
         density_slices = 1
 
     xc_on_g_mesh = xc_on_g_mesh.reshape(n_channels, density_slices, *mydf.mesh)
@@ -1269,32 +1187,29 @@ def convert_xc_on_g_mesh_to_fock(
     fock = cp.zeros((n_channels, n_k_points, nao, nao), dtype=data_type)
 
     for pairs in mydf.sorted_gaussian_pairs:
-
         interpolated_xc = xc_on_g_mesh[
             :,
             :,
-            pairs["fft_grid"][0][:, None, None],
-            pairs["fft_grid"][1][:, None],
-            pairs["fft_grid"][2],
+            pairs['fft_grid'][0][:, None, None],
+            pairs['fft_grid'][1][:, None],
+            pairs['fft_grid'][2],
         ]
-        interpolated_xc = cp.asarray(ifft_in_place(interpolated_xc).real, order="C")
+        interpolated_xc = cp.asarray(ifft_in_place(interpolated_xc).real, order='C')
 
-        n_ao_in_localized = len(pairs["ao_indices_in_localized"])
-        libgpbc.update_dxyz_dabc(pairs["dxyz_dabc"].ctypes)
-        img_phase = image_phase_for_kpts(cell, pairs["neighboring_images"], kpts)
-        fock_slice = evaluate_xc_wrapper(
-            pairs, interpolated_xc, img_phase, with_tau=with_tau
-        )
+        n_ao_in_localized = len(pairs['ao_indices_in_localized'])
+        libgpbc.update_dxyz_dabc(pairs['dxyz_dabc'].ctypes)
+        img_phase = image_phase_for_kpts(cell, pairs['neighboring_images'], kpts)
+        fock_slice = evaluate_xc_wrapper(pairs, interpolated_xc, img_phase, with_tau=with_tau)
         n_sets, n_k_points = fock_slice.shape[:2]
 
-        fock_slice = fock_slice.reshape(-1, *pairs["primitive_shape"])
+        fock_slice = fock_slice.reshape(-1, *pairs['primitive_shape'])
 
         fock_slice = primitive_to_contracted(
             fock_slice,
-            pairs["coeff_in_localized"],
-            pairs["concatenated_coeff"],
-            pairs["contracted_shape"],
-        ).reshape(n_sets, n_k_points, *pairs["contracted_shape"])
+            pairs['coeff_in_localized'],
+            pairs['concatenated_coeff'],
+            pairs['contracted_shape'],
+        ).reshape(n_sets, n_k_points, *pairs['contracted_shape'])
 
         # While mathematically it is correct to have concatenated
         # ao indices in the addition, but it is possible that the ao
@@ -1308,25 +1223,23 @@ def convert_xc_on_g_mesh_to_fock(
         fock[
             :,
             :,
-            pairs["ao_indices_in_localized"][:, None],
-            pairs["ao_indices_in_localized"],
+            pairs['ao_indices_in_localized'][:, None],
+            pairs['ao_indices_in_localized'],
         ] += fock_slice[:, :, :, :n_ao_in_localized]
         fock[
             :,
             :,
-            pairs["ao_indices_in_localized"][:, None],
-            pairs["ao_indices_in_diffused"],
+            pairs['ao_indices_in_localized'][:, None],
+            pairs['ao_indices_in_diffused'],
         ] += fock_slice[:, :, :, n_ao_in_localized:]
 
         if hermi == 1:
             fock[
                 :,
                 :,
-                pairs["ao_indices_in_diffused"][:, None],
-                pairs["ao_indices_in_localized"],
-            ] += (
-                fock_slice[:, :, :, n_ao_in_localized:].transpose(0, 1, 3, 2).conj()
-            )
+                pairs['ao_indices_in_diffused'][:, None],
+                pairs['ao_indices_in_localized'],
+            ] += fock_slice[:, :, :, n_ao_in_localized:].transpose(0, 1, 3, 2).conj()
         else:
             raise NotImplementedError
 
@@ -1347,9 +1260,7 @@ def evaluate_xc_gradient_wrapper(
         n_channels = xc_weights.shape[0]
         # density_slices = 2
     else:
-        assert (xc_weights.ndim == 3 + 2 and xc_weights.shape[1] == 1) or (
-            xc_weights.ndim == 3 + 1
-        )
+        assert (xc_weights.ndim == 3 + 2 and xc_weights.shape[1] == 1) or (xc_weights.ndim == 3 + 1)
         n_channels = xc_weights.shape[0]
         # density_slices = 1
 
@@ -1365,63 +1276,57 @@ def evaluate_xc_gradient_wrapper(
     else:
         use_float_precision = ctypes.c_int(0)
 
-    n_images = pairs_info["neighboring_images"].shape[0]
+    n_images = pairs_info['neighboring_images'].shape[0]
     phase_diff_among_images, image_pair_difference_index = img_phase
     n_k_points, n_difference_images = phase_diff_among_images.shape
 
     if n_k_points == 1 and n_difference_images == 1:
         density_matrix_with_translation = dm_slice
     else:
-        density_matrix_with_translation = cp.einsum(
-            "kt, ikpq->itpq", phase_diff_among_images.conj(), dm_slice
-        )
+        density_matrix_with_translation = cp.einsum('kt, ikpq->itpq', phase_diff_among_images.conj(), dm_slice)
 
     n_channels, _, n_i_functions, n_j_functions = density_matrix_with_translation.shape
     if ignore_imag is False:
         raise NotImplementedError
 
-    density_matrix_with_translation_real_part = cp.asarray(
-        density_matrix_with_translation.real, order="C"
-    )
+    density_matrix_with_translation_real_part = cp.asarray(density_matrix_with_translation.real, order='C')
 
     assert gradient.dtype == density_matrix_with_translation_real_part.dtype
 
-    for gaussians_per_angular_pair in pairs_info["per_angular_pairs"]:
-        (i_angular, j_angular) = gaussians_per_angular_pair["angular"]
+    for gaussians_per_angular_pair in pairs_info['per_angular_pairs']:
+        (i_angular, j_angular) = gaussians_per_angular_pair['angular']
         err = c_driver(
             cast_to_pointer(gradient),
             cast_to_pointer(xc_weights),
             cast_to_pointer(density_matrix_with_translation_real_part),
             ctypes.c_int(i_angular),
             ctypes.c_int(j_angular),
-            cast_to_pointer(gaussians_per_angular_pair["screened_shell_pairs"]),
-            cast_to_pointer(gaussians_per_angular_pair["i_shells"]),
-            cast_to_pointer(gaussians_per_angular_pair["j_shells"]),
-            ctypes.c_int(len(gaussians_per_angular_pair["j_shells"])),
-            cast_to_pointer(gaussians_per_angular_pair["shell_to_ao_indices"]),
+            cast_to_pointer(gaussians_per_angular_pair['screened_shell_pairs']),
+            cast_to_pointer(gaussians_per_angular_pair['i_shells']),
+            cast_to_pointer(gaussians_per_angular_pair['j_shells']),
+            ctypes.c_int(len(gaussians_per_angular_pair['j_shells'])),
+            cast_to_pointer(gaussians_per_angular_pair['shell_to_ao_indices']),
             ctypes.c_int(n_i_functions),
             ctypes.c_int(n_j_functions),
-            cast_to_pointer(gaussians_per_angular_pair["pair_indices_per_block"]),
-            cast_to_pointer(gaussians_per_angular_pair["accumulated_counts_per_block"]),
-            cast_to_pointer(gaussians_per_angular_pair["sorted_block_index"]),
-            ctypes.c_int(len(gaussians_per_angular_pair["sorted_block_index"])),
-            cast_to_pointer(gaussians_per_angular_pair["image_indices"]),
-            cast_to_pointer(pairs_info["neighboring_images"]),
+            cast_to_pointer(gaussians_per_angular_pair['pair_indices_per_block']),
+            cast_to_pointer(gaussians_per_angular_pair['accumulated_counts_per_block']),
+            cast_to_pointer(gaussians_per_angular_pair['sorted_block_index']),
+            ctypes.c_int(len(gaussians_per_angular_pair['sorted_block_index'])),
+            cast_to_pointer(gaussians_per_angular_pair['image_indices']),
+            cast_to_pointer(pairs_info['neighboring_images']),
             ctypes.c_int(n_images),
             cast_to_pointer(image_pair_difference_index),
             ctypes.c_int(n_difference_images),
-            cast_to_pointer(pairs_info["mesh"]),
-            cast_to_pointer(pairs_info["atm"]),
-            cast_to_pointer(pairs_info["bas"]),
-            cast_to_pointer(pairs_info["env"]),
+            cast_to_pointer(pairs_info['mesh']),
+            cast_to_pointer(pairs_info['atm']),
+            cast_to_pointer(pairs_info['bas']),
+            cast_to_pointer(pairs_info['env']),
             ctypes.c_int(n_channels),
-            ctypes.c_int(pairs_info["is_non_orthogonal"]),
+            ctypes.c_int(pairs_info['is_non_orthogonal']),
             use_float_precision,
         )
         if err != 0:
-            raise RuntimeError(
-                f"evaluate_xc_gradient_driver for li={i_angular} lj={j_angular} failed"
-            )
+            raise RuntimeError(f'evaluate_xc_gradient_driver for li={i_angular} lj={j_angular} failed')
 
 
 def convert_xc_on_g_mesh_to_fock_gradient(
@@ -1433,7 +1338,7 @@ def convert_xc_on_g_mesh_to_fock_gradient(
     with_tau=False,
 ):
     cell = mydf.cell
-    dm_kpts = cp.asarray(dm_kpts, order="C")
+    dm_kpts = cp.asarray(dm_kpts, order='C')
     dms = _format_dms(dm_kpts, kpts)
     n_atoms = cell.natm
 
@@ -1451,47 +1356,45 @@ def convert_xc_on_g_mesh_to_fock_gradient(
         interpolated_xc = xc_on_g_mesh[
             :,
             :,
-            pairs["fft_grid"][0][:, None, None],
-            pairs["fft_grid"][1][:, None],
-            pairs["fft_grid"][2],
+            pairs['fft_grid'][0][:, None, None],
+            pairs['fft_grid'][1][:, None],
+            pairs['fft_grid'][2],
         ]
 
-        interpolated_xc = cp.asarray(ifft_in_place(interpolated_xc).real, order="C")
+        interpolated_xc = cp.asarray(ifft_in_place(interpolated_xc).real, order='C')
 
         density_matrix_slice = dms[
             :,
             :,
-            pairs["ao_indices_in_localized"][:, None],
-            pairs["concatenated_ao_indices"],
+            pairs['ao_indices_in_localized'][:, None],
+            pairs['concatenated_ao_indices'],
         ]
         density_matrix_with_rows_in_diffused = dms[
             :,
             :,
-            pairs["ao_indices_in_diffused"][:, None],
-            pairs["ao_indices_in_localized"],
+            pairs['ao_indices_in_diffused'][:, None],
+            pairs['ao_indices_in_localized'],
         ]
 
         n_ao_in_localized = density_matrix_slice.shape[2]
-        density_matrix_slice[
-            :, :, :, n_ao_in_localized:
-        ] += density_matrix_with_rows_in_diffused.transpose(0, 1, 3, 2).conj()
+        density_matrix_slice[:, :, :, n_ao_in_localized:] += density_matrix_with_rows_in_diffused.transpose(
+            0, 1, 3, 2
+        ).conj()
 
         n_sets, n_k_points = density_matrix_slice.shape[:2]
 
-        density_matrix_slice = density_matrix_slice.reshape(
-            -1, *pairs["contracted_shape"]
-        )
+        density_matrix_slice = density_matrix_slice.reshape(-1, *pairs['contracted_shape'])
 
         coeff_sandwiched_density_matrix = contracted_to_primitive(
             density_matrix_slice,
-            pairs["coeff_in_localized"],
-            pairs["concatenated_coeff"],
-            pairs["primitive_shape"],
-        ).reshape(n_sets, n_k_points, *pairs["primitive_shape"])
+            pairs['coeff_in_localized'],
+            pairs['concatenated_coeff'],
+            pairs['primitive_shape'],
+        ).reshape(n_sets, n_k_points, *pairs['primitive_shape'])
 
-        libgpbc.update_dxyz_dabc(pairs["dxyz_dabc"].ctypes)
+        libgpbc.update_dxyz_dabc(pairs['dxyz_dabc'].ctypes)
 
-        img_phase = image_phase_for_kpts(cell, pairs["neighboring_images"], kpts)
+        img_phase = image_phase_for_kpts(cell, pairs['neighboring_images'], kpts)
         evaluate_xc_gradient_wrapper(
             gradient,
             pairs,
@@ -1540,9 +1443,9 @@ def get_pp(ni, kpts=None):
     mesh = ni.mesh
     # Compute the vpplocG as
     # -einsum('ij,ij->j', pseudo.get_vlocG(cell, Gv), cell.get_SI(Gv))
-    vpplocG = multigrid_v1.eval_vpplocG(cell, mesh)
+    vpplocG = ni.vpplocG
     vpp = convert_xc_on_g_mesh_to_fock(ni, vpplocG, hermi=1, kpts=kpts)[0]
-    t1 = log.timer_debug1("vpploc", *t0)
+    t1 = log.timer_debug1('vpploc', *t0)
 
     vppnl = pp_int.get_pp_nl(cell, kpts)
     for k, kpt in enumerate(kpts):
@@ -1553,8 +1456,8 @@ def get_pp(ni, kpts=None):
 
     if is_single_kpt:
         vpp = vpp[0]
-    log.timer_debug1("vppnl", *t1)
-    log.timer("get_pp", *t0)
+    log.timer_debug1('vppnl', *t1)
+    log.timer('get_pp', *t0)
     return vpp
 
 
@@ -1583,7 +1486,7 @@ def get_j_kpts(ni, dm_kpts, hermi=1, kpts=None, kpts_band=None):
     cell = ni.cell
     log = logger.new_logger(cell)
     t0 = log.init_timer()
-    dm_kpts = cp.asarray(dm_kpts, order="C")
+    dm_kpts = cp.asarray(dm_kpts, order='C')
     dms = _format_dms(dm_kpts, kpts)
     nset = dms.shape[0]
     mesh = ni.mesh
@@ -1594,7 +1497,7 @@ def get_j_kpts(ni, dm_kpts, hermi=1, kpts=None, kpts_band=None):
     # coulomb_kernel_on_g_mesh = pbc_tools.get_coulG(cell, Gv=Gv)
     coulomb_kernel_on_g_mesh = ni.coulG
 
-    coulomb_on_g_mesh = cp.einsum("ng, g -> g", density[:, 0], coulomb_kernel_on_g_mesh)
+    coulomb_on_g_mesh = cp.einsum('ng, g -> g', density[:, 0], coulomb_kernel_on_g_mesh)
     weight = cell.vol / ngrids
 
     density = density.reshape(-1, *mesh)
@@ -1607,7 +1510,7 @@ def get_j_kpts(ni, dm_kpts, hermi=1, kpts=None, kpts_band=None):
     #    ni = ni.copy().reset().build()
     kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band
     xc_for_fock = convert_xc_on_g_mesh_to_fock(ni, coulomb_on_g_mesh, hermi, kpts_band)
-    t0 = log.timer("vj", *t0)
+    t0 = log.timer('vj', *t0)
     return _format_jks(xc_for_fock, dm_kpts, input_band, kpts)
 
 
@@ -1655,7 +1558,7 @@ def nr_rks(
         kpts = np.zeros((1, 3))
     else:
         kpts = kpts.reshape(-1, 3)
-    dm_kpts = cp.asarray(dm_kpts, order="C")
+    dm_kpts = cp.asarray(dm_kpts, order='C')
     dms = _format_dms(dm_kpts, kpts)
     nset = dms.shape[0]
     dms = None
@@ -1670,8 +1573,8 @@ def nr_rks(
     coulomb_on_g_mesh = rho_sf * ni.coulG
     coulomb_energy = 0.5 * rho_sf.conj().dot(coulomb_on_g_mesh).real
     coulomb_energy /= cell.vol
-    log.debug("Multigrid Coulomb energy %s", coulomb_energy)
-    t0 = log.timer("coulomb", *t0)
+    log.debug('Multigrid Coulomb energy %s', coulomb_energy)
+    t0 = log.timer('coulomb', *t0)
     weight = cell.vol / ngrids
 
     density = ifft_in_place(density.reshape(-1, *mesh)).real.reshape(-1, ngrids)
@@ -1679,17 +1582,13 @@ def nr_rks(
     density /= weight
 
     # eval_xc_eff supports float64 only
-    density = cp.asarray(density, dtype=np.float64, order="C")
-    if xc_type == "LDA":
-        xc_for_energy, xc_for_fock = ni.eval_xc_eff(
-            xc_code, density[0], deriv=1, xctype=xc_type
-        )[:2]
-    elif xc_type == "GGA" or xc_type == "MGGA":
-        xc_for_energy, xc_for_fock = ni.eval_xc_eff(
-            xc_code, density, deriv=1, xctype=xc_type
-        )[:2]
+    density = cp.asarray(density, dtype=np.float64, order='C')
+    if xc_type == 'LDA':
+        xc_for_energy, xc_for_fock = ni.eval_xc_eff(xc_code, density[0], deriv=1, xctype=xc_type)[:2]
+    elif xc_type == 'GGA' or xc_type == 'MGGA':
+        xc_for_energy, xc_for_fock = ni.eval_xc_eff(xc_code, density, deriv=1, xctype=xc_type)[:2]
     else:
-        raise ValueError(f"Incorrect xc_type = {xc_type}")
+        raise ValueError(f'Incorrect xc_type = {xc_type}')
 
     rho_sf = density[0].real
     xc_energy_sum = float(rho_sf.dot(xc_for_energy.ravel()).get()) * weight
@@ -1699,15 +1598,15 @@ def nr_rks(
     xc_for_fock *= weight
     xc_for_fock = fft_in_place(xc_for_fock.reshape(-1, *mesh)).reshape(-1, ngrids)
 
-    log.debug("Multigrid exc %s  nelec %s", xc_energy_sum, n_electrons)
+    log.debug('Multigrid exc %s  nelec %s', xc_energy_sum, n_electrons)
 
-    if xc_type == "LDA":
+    if xc_type == 'LDA':
         pass
-    elif xc_type == "GGA":
+    elif xc_type == 'GGA':
         contract_iG_potential(xc_for_fock, cell)
         xc_for_fock = xc_for_fock[0]
         xc_for_fock = xc_for_fock.reshape((-1, ngrids))
-    elif xc_type == "MGGA":
+    elif xc_type == 'MGGA':
         xc_for_fock = contract_iG_potential(xc_for_fock, cell)
         xc_for_fock = cp.concatenate(
             [
@@ -1717,18 +1616,16 @@ def nr_rks(
             axis=0,
         )
     else:
-        raise ValueError(f"Incorrect xc_type = {xc_type}")
+        raise ValueError(f'Incorrect xc_type = {xc_type}')
 
     if with_j:
         xc_for_fock[0] += coulomb_on_g_mesh
 
     kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band
-    veff = convert_xc_on_g_mesh_to_fock(
-        ni, xc_for_fock, hermi, kpts_band, with_tau=(xc_type == "MGGA")
-    )
+    veff = convert_xc_on_g_mesh_to_fock(ni, xc_for_fock, hermi, kpts_band, with_tau=(xc_type == 'MGGA'))
     veff = _format_jks(veff, dm_kpts, input_band, kpts)
     veff = tag_array(veff, ecoul=coulomb_energy, exc=xc_energy_sum, vj=None, vk=None)
-    t0 = log.timer("xc", *t0)
+    t0 = log.timer('xc', *t0)
     return n_electrons, xc_energy_sum, veff
 
 
@@ -1778,7 +1675,7 @@ def nr_uks(
         kpts = np.zeros((1, 3))
     else:
         kpts = kpts.reshape(-1, 3)
-    dm_kpts = cp.asarray(dm_kpts, order="C")
+    dm_kpts = cp.asarray(dm_kpts, order='C')
     dms = _format_dms(dm_kpts, kpts)
     nset = dms.shape[0]
     dms = None
@@ -1793,8 +1690,8 @@ def nr_uks(
     coulomb_on_g_mesh = rho_sf * ni.coulG
     coulomb_energy = 0.5 * rho_sf.conj().dot(coulomb_on_g_mesh).real
     coulomb_energy /= cell.vol
-    log.debug("Multigrid Coulomb energy %s", coulomb_energy)
-    t0 = log.timer("coulomb", *t0)
+    log.debug('Multigrid Coulomb energy %s', coulomb_energy)
+    t0 = log.timer('coulomb', *t0)
     weight = cell.vol / ngrids
 
     density = density.reshape(-1, *mesh)
@@ -1803,17 +1700,13 @@ def nr_uks(
     density /= weight
 
     # eval_xc_eff supports float64 only
-    density = cp.asarray(density, dtype=np.float64, order="C")
-    if xc_type == "LDA":
-        xc_for_energy, xc_for_fock = ni.eval_xc_eff(
-            xc_code, density[:, 0], deriv=1, xctype=xc_type
-        )[:2]
-    elif xc_type == "GGA" or xc_type == "MGGA":
-        xc_for_energy, xc_for_fock = ni.eval_xc_eff(
-            xc_code, density, deriv=1, xctype=xc_type
-        )[:2]
+    density = cp.asarray(density, dtype=np.float64, order='C')
+    if xc_type == 'LDA':
+        xc_for_energy, xc_for_fock = ni.eval_xc_eff(xc_code, density[:, 0], deriv=1, xctype=xc_type)[:2]
+    elif xc_type == 'GGA' or xc_type == 'MGGA':
+        xc_for_energy, xc_for_fock = ni.eval_xc_eff(xc_code, density, deriv=1, xctype=xc_type)[:2]
     else:
-        raise ValueError(f"Incorrect xc_type = {xc_type}")
+        raise ValueError(f'Incorrect xc_type = {xc_type}')
 
     rho_sf = (density[0, 0] + density[1, 0]).real
     xc_energy_sum = float(rho_sf.dot(xc_for_energy.ravel()).real.get()) * weight
@@ -1823,15 +1716,15 @@ def nr_uks(
     xc_for_fock *= weight
     xc_for_fock = fft_in_place(xc_for_fock.reshape(-1, *mesh)).reshape(nset, -1, ngrids)
 
-    log.debug("Multigrid exc %s  nelec %s", xc_energy_sum, n_electrons)
+    log.debug('Multigrid exc %s  nelec %s', xc_energy_sum, n_electrons)
 
-    if xc_type == "LDA":
+    if xc_type == 'LDA':
         pass
-    elif xc_type == "GGA":
+    elif xc_type == 'GGA':
         for i in xc_for_fock:
             contract_iG_potential(i, cell)
         xc_for_fock = xc_for_fock.reshape((nset, -1, ngrids))
-    elif xc_type == "MGGA":
+    elif xc_type == 'MGGA':
         for i in xc_for_fock:
             contract_iG_potential(i, cell)
         xc_for_fock = cp.concatenate(
@@ -1842,18 +1735,16 @@ def nr_uks(
             axis=1,
         )
     else:
-        raise ValueError(f"Incorrect xc_type = {xc_type}")
+        raise ValueError(f'Incorrect xc_type = {xc_type}')
 
     if with_j:
         xc_for_fock[:, 0] += coulomb_on_g_mesh
 
     kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band
-    veff = convert_xc_on_g_mesh_to_fock(
-        ni, xc_for_fock, hermi, kpts_band, with_tau=(xc_type == "MGGA")
-    )
+    veff = convert_xc_on_g_mesh_to_fock(ni, xc_for_fock, hermi, kpts_band, with_tau=(xc_type == 'MGGA'))
     veff = _format_jks(veff, dm_kpts, input_band, kpts)
     veff = tag_array(veff, ecoul=coulomb_energy, exc=xc_energy_sum, vj=None, vk=None)
-    t0 = log.timer("xc", *t0)
+    t0 = log.timer('xc', *t0)
     return n_electrons, xc_energy_sum, veff
 
 
@@ -1881,21 +1772,21 @@ def get_veff_ip1(
     with_pseudo_vloc_orbital_derivative=True,
     verbose=None,
 ):
-    '''Computes the derivatives of the Exc along with additional contributions
+    """Computes the derivatives of the Exc along with additional contributions
     from the Coulomb and pseudopotential terms.
 
     Note, the current return is the energy per cell scaled by the number of
     k-points. This should return the energy per cell directly and will be
     changed in future.
-    '''
+    """
     if kpts is None:
         kpts = np.zeros((1, 3))
     else:
         kpts = kpts.reshape(-1, 3)
-    log = logger.new_logger(ni, verbose)
-    t0 = log.init_timer()
     cell = ni.cell
-    dm_kpts = cp.asarray(dm_kpts, order="C")
+    log = logger.new_logger(cell)
+    t0 = log.init_timer()
+    dm_kpts = cp.asarray(dm_kpts, order='C')
     dms = _format_dms(dm_kpts, kpts)
     nset = dms.shape[0]
     dms = None
@@ -1907,7 +1798,7 @@ def get_veff_ip1(
     density = evaluate_density_on_g_mesh(ni, dm_kpts, kpts, xc_type)
 
     Gv = pbc_tools._get_Gv(cell, mesh)
-    coulomb_on_g_mesh = cp.einsum("ng, g -> g", density[:, 0], ni.coulG)
+    coulomb_on_g_mesh = cp.einsum('ng, g -> g', density[:, 0], ni.coulG)
 
     weight = cell.vol / ngrids
 
@@ -1916,7 +1807,7 @@ def get_veff_ip1(
     density = (
         cp.asarray(
             ifft_in_place(density.reshape(nset, -1, *mesh)).real,
-            order="C",
+            order='C',
         ).reshape(nset, -1, ngrids)
         / weight
     )
@@ -1929,15 +1820,15 @@ def get_veff_ip1(
     xc_for_fock = xc_for_fock.reshape(nset, -1, *mesh) * weight
     xc_for_fock = fft_in_place(xc_for_fock).reshape(nset, -1, ngrids)
 
-    if xc_type == "LDA":
+    if xc_type == 'LDA':
         pass
-    elif xc_type == "GGA":
+    elif xc_type == 'GGA':
         for i in xc_for_fock:
             contract_iG_potential(i, cell)
         xc_for_fock = xc_for_fock[:, 0]
         xc_for_fock = xc_for_fock.reshape((nset, -1, ngrids))
-    elif xc_type == "MGGA":
-        xc_for_fock[:, 0] -= contract("ngp, pg -> np", xc_for_fock[:, 1:4], Gv) * 1j
+    elif xc_type == 'MGGA':
+        xc_for_fock[:, 0] -= contract('ngp, pg -> np', xc_for_fock[:, 1:4], Gv) * 1j
         xc_for_fock = cp.concatenate(
             [
                 xc_for_fock[:, 0].reshape((nset, -1, ngrids)),
@@ -1946,22 +1837,19 @@ def get_veff_ip1(
             axis=1,
         )
     else:
-        raise ValueError(f"Incorrect xc_type = {xc_type}")
+        raise ValueError(f'Incorrect xc_type = {xc_type}')
 
     if with_j:
         xc_for_fock[:, 0] += coulomb_on_g_mesh
 
     if with_pseudo_vloc_orbital_derivative:
-        if cell._pseudo:
-            xc_for_fock[:, 0] += multigrid_v1.eval_vpplocG(cell, mesh)
-        else:
-            xc_for_fock[:, 0] += multigrid_v1.eval_nucG(cell, mesh)
+        xc_for_fock[:, 0] += ni.vpplocG
 
     veff_gradient = convert_xc_on_g_mesh_to_fock_gradient(
-        ni, xc_for_fock, dm_kpts, hermi, kpts_band, with_tau=(xc_type == "MGGA")
+        ni, xc_for_fock, dm_kpts, hermi, kpts_band, with_tau=(xc_type == 'MGGA')
     )
 
-    t0 = log.timer("veff_gradient", *t0)
+    t0 = log.timer('veff_gradient', *t0)
 
     return veff_gradient
 
@@ -1974,6 +1862,11 @@ def __init__(self, cell):
         self.sorted_gaussian_pairs = None
         Gv = pbc_tools._get_Gv(cell, cell.mesh)
         self.coulG = pbc_tools.get_coulG(cell, Gv=Gv)
+        if cell._pseudo:
+            self.vpplocG = multigrid_v1.eval_vpplocG(cell, cell.mesh)
+        else:
+            self.vpplocG = multigrid_v1.eval_nucG(cell, cell.mesh)
+
         self.build()
 
     build = sort_gaussian_pairs
@@ -1985,7 +1878,7 @@ def reset(self, cell=None):
         self.sorted_gaussian_pairs = None
         return self
 
-    def get_j(self, dm, hermi=1, kpts=None, kpts_band=None, omega=None, exxdiv="ewald"):
+    def get_j(self, dm, hermi=1, kpts=None, kpts_band=None, omega=None, exxdiv='ewald'):
         if kpts is not None:
             raise NotImplementedError
         vj = get_j_kpts(self, dm, hermi, kpts, kpts_band)
@@ -2012,4 +1905,4 @@ def get_j(self, dm, hermi=1, kpts=None, kpts_band=None, omega=None, exxdiv="ewal
     device = utils.device
 
     def to_cpu(self):
-        raise RuntimeError("Not available")
+        raise RuntimeError('Not available')
diff --git a/gpu4pyscf/pbc/grad/rhf.py b/gpu4pyscf/pbc/grad/rhf.py
index 491bf4be1..7670914c9 100644
--- a/gpu4pyscf/pbc/grad/rhf.py
+++ b/gpu4pyscf/pbc/grad/rhf.py
@@ -28,22 +28,23 @@
 import gpu4pyscf.pbc.dft.multigrid as multigrid_v1
 from gpu4pyscf.pbc.gto import int1e
 from gpu4pyscf.pbc.grad.pp import vppnl_nuc_grad
+from gpu4pyscf.lib import logger, utils
 
 __all__ = ['Gradients']
 
+
 class GradientsBase(mol_rhf.GradientsBase):
     get_ovlp = NotImplemented
     grad_nuc = cpu_rhf.GradientsBase.grad_nuc
 
     def optimizer(self):
-        '''Geometry (atom positions and lattice) optimization solver
-        '''
+        """Geometry (atom positions and lattice) optimization solver"""
         from gpu4pyscf.geomopt.ase_solver import GeometryOptimizer
+
         return GeometryOptimizer(self.base)
 
 
 class Gradients(GradientsBase):
-
     make_rdm1e = mol_rhf.Gradients.make_rdm1e
 
     def get_veff(self, cell=None, dm=None, kpt=None, verbose=None):
@@ -57,6 +58,7 @@ def grad_elec(
         atmlst=None,
     ):
         from gpu4pyscf.pbc.grad.krhf import contract_h1e_dm
+
         mf = self.base
         cell = mf.cell
         kpt = mf.kpt
@@ -74,10 +76,14 @@ def grad_elec(
             atmlst = range(cell.natm)
 
         with_rsjk = mf.rsjk
+
+        log = logger.new_logger(cell)
+
         # TODO: handle all-electron+GGA and pseudo+GGA differently
         # pseudo+GGA does not need to evaluate the gradients with PBCJKMatrixOpt
         if with_rsjk is not None:
             from gpu4pyscf.pbc.scf.rsjk import PBCJKMatrixOpt
+
             assert isinstance(with_rsjk, PBCJKMatrixOpt)
             if hasattr(mf, 'xc'):
                 ni = mf._numint
@@ -87,7 +93,9 @@ def grad_elec(
                     with_rsjk = PBCJKMatrixOpt(cell, omega=omega).build()
                 if with_rsjk.supmol is None:
                     with_rsjk.build()
-                de = multigrid_v2.get_veff_ip1(ni, mf.xc, dm0, with_j=True, with_pseudo_vloc_orbital_derivative=True).get()
+                de = multigrid_v2.get_veff_ip1(
+                    ni, mf.xc, dm0, with_j=True, with_pseudo_vloc_orbital_derivative=True
+                ).get()
                 j_factor = 0
             else:
                 ni = multigrid_v2.MultiGridNumInt(cell).build()
@@ -95,38 +103,41 @@ def grad_elec(
                 de = 0
                 if cell._pseudo:
                     vpplocG = multigrid_v1.eval_vpplocG(ni.cell, ni.mesh)
-                    de = multigrid_v2.convert_xc_on_g_mesh_to_fock_gradient(
-                        ni, vpplocG.reshape(1,1,-1), dm0).get()
+                    de = multigrid_v2.convert_xc_on_g_mesh_to_fock_gradient(ni, vpplocG.reshape(1, 1, -1), dm0).get()
                 else:
                     raise NotImplementedError
-            ejk  = with_rsjk._get_ejk_sr_ip1(dm0, kpts=kpt, exxdiv=mf.exxdiv,
-                                             j_factor=j_factor, k_factor=k_sr)
-            ejk += with_rsjk._get_ejk_lr_ip1(dm0, kpts=kpt, exxdiv=mf.exxdiv,
-                                             j_factor=j_factor, k_factor=k_lr)
-            de += ejk*2
+            ejk = with_rsjk._get_ejk_sr_ip1(dm0, kpts=kpt, exxdiv=mf.exxdiv, j_factor=j_factor, k_factor=k_sr)
+            ejk += with_rsjk._get_ejk_lr_ip1(dm0, kpts=kpt, exxdiv=mf.exxdiv, j_factor=j_factor, k_factor=k_lr)
+            de += ejk * 2
         else:
             assert hasattr(mf, 'xc'), 'HF gradients not supported'
             ni = mf._numint
             assert isinstance(ni, multigrid_v2.MultiGridNumInt)
             de = multigrid_v2.get_veff_ip1(ni, mf.xc, dm0, with_j=True, with_pseudo_vloc_orbital_derivative=True).get()
 
+        t0 = log.init_timer()
         s1 = int1e.int1e_ipovlp(cell)[0]
         de += contract_h1e_dm(cell, s1, dme0, hermi=1)
+        t0 = log.timer('ovlp gradient', *t0)
 
         # the CPU code requires the attribute .rhoG
         rhoG = multigrid_v2.evaluate_density_on_g_mesh(ni, dm0)
-        rhoG = rhoG[0,0]
+        rhoG = rhoG[0, 0]
+
         if cell._pseudo:
             de += multigrid_v1.eval_vpplocG_SI_gradient(cell, ni.mesh, rhoG).get()
             de += vppnl_nuc_grad(cell, dm0.get())
         else:
             de += multigrid_v1.eval_nucG_SI_gradient(cell, ni.mesh, rhoG).get()
+        t0 = log.timer('other pp gradient', *t0)
         rhoG = None
         core_hamiltonian_gradient = int1e.int1e_ipkin(cell)[0]
         de -= contract_h1e_dm(cell, core_hamiltonian_gradient, dm0, hermi=1)
+        t0 = log.timer('kinetic gradient', *t0)
 
         return de
 
     def get_stress(self):
         from gpu4pyscf.pbc.grad import rhf_stress
+
         return rhf_stress.kernel(self)

From ff0c6322a61649f5a012f4c586cae589d6b603fc Mon Sep 17 00:00:00 2001
From: Rui Li <walterfeng@foxmail.com>
Date: Wed, 18 Mar 2026 14:11:54 -0700
Subject: [PATCH 09/14] adjust for k-point calculation

---
 gpu4pyscf/pbc/dft/multigrid_v2.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/gpu4pyscf/pbc/dft/multigrid_v2.py b/gpu4pyscf/pbc/dft/multigrid_v2.py
index ccedf4fb6..0f22c23ca 100644
--- a/gpu4pyscf/pbc/dft/multigrid_v2.py
+++ b/gpu4pyscf/pbc/dft/multigrid_v2.py
@@ -8,8 +8,7 @@
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
@@ -290,7 +289,7 @@ def contracted_to_primitive(batched_matrices, sorted_coeffs_left, sorted_coeffs_
     n_cols_primitive = primitive_shape[1]
 
     intermediate_shape = (n_slices, n_rows_primitive, n_cols)
-    intermediate = cp.zeros(intermediate_shape)
+    intermediate = cp.zeros(intermediate_shape, dtype=batched_matrices.dtype)
 
     for i in sorted_coeffs_left:
         subarray_shape = (n_slices, -1, i['shape'][1], n_cols)
@@ -303,7 +302,7 @@ def contracted_to_primitive(batched_matrices, sorted_coeffs_left, sorted_coeffs_
     intermediate = intermediate.transpose(0, 2, 1)
 
     result_shape = (n_slices, n_cols_primitive, n_rows_primitive)
-    result = cp.zeros(result_shape)
+    result = cp.zeros(result_shape, dtype=batched_matrices.dtype)
 
     for i in sorted_coeffs_right:
         subarray_shape = (n_slices, -1, i['shape'][1], n_rows_primitive)
@@ -326,7 +325,7 @@ def primitive_to_contracted(batched_matrices, sorted_coeffs_left, sorted_coeffs_
     n_cols_contracted = contracted_shape[1]
 
     intermediate_shape = (n_slices, n_rows_contracted, n_cols)
-    intermediate = cp.zeros(intermediate_shape)
+    intermediate = cp.zeros(intermediate_shape, dtype=batched_matrices.dtype)
 
     for i in sorted_coeffs_left:
         subarray_shape = (n_slices, -1, i['shape'][0], n_cols)
@@ -338,7 +337,7 @@ def primitive_to_contracted(batched_matrices, sorted_coeffs_left, sorted_coeffs_
 
     intermediate = intermediate.transpose(0, 2, 1)
     result_shape = (n_slices, n_cols_contracted, n_rows_contracted)
-    result = cp.zeros(result_shape)
+    result = cp.zeros(result_shape, dtype=batched_matrices.dtype)
 
     for i in sorted_coeffs_right:
         subarray_shape = (n_slices, -1, i['shape'][0], n_rows_contracted)
@@ -958,7 +957,6 @@ def evaluate_density_wrapper(pairs_info, dm_slice, img_phase, ignore_imag=True,
 
 
 def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type='LDA'):
-    dm_kpts = cp.asarray(dm_kpts, order='C')
     dms = _format_dms(dm_kpts, kpts)
     n_channels, n_k_points = dms.shape[:2]
     if mydf.sorted_gaussian_pairs is None:
@@ -976,6 +974,7 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type='LDA'):
         raise ValueError(f'Incorrect xc_type = {xc_type}')
 
     cell = mydf.cell
+    log = logger.new_logger(cell, mydf.verbose)
 
     nx, ny, nz = mydf.mesh
     density_on_g_mesh = cp.zeros((n_channels, density_slices, nx, ny, nz), dtype=cp.complex128)
@@ -986,6 +985,8 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type='LDA'):
         n_grid_points = np.prod(mesh)
         weight_per_grid_point = 1.0 / n_k_points * cell.vol / n_grid_points
 
+        breakpoint()
+
         density_matrix_with_rows_in_localized = dms[
             :,
             :,
@@ -1144,7 +1145,7 @@ def convert_xc_on_g_mesh_to_fock(
 ):
     cell = mydf.cell
     nao = cell.nao_nr()
-
+    log = logger.new_logger(cell, mydf.verbose)
     if with_tau:
         if xc_on_g_mesh.ndim == 2:
             assert xc_on_g_mesh.shape[0] == 2

From e507aa59eb928a55d089d815fa285f3701056c1f Mon Sep 17 00:00:00 2001
From: Rui Li <walterfeng@foxmail.com>
Date: Wed, 18 Mar 2026 14:55:37 -0700
Subject: [PATCH 10/14] revert formatting

---
 gpu4pyscf/pbc/dft/multigrid_v2.py | 525 +++++++++++++++---------------
 1 file changed, 255 insertions(+), 270 deletions(-)

diff --git a/gpu4pyscf/pbc/dft/multigrid_v2.py b/gpu4pyscf/pbc/dft/multigrid_v2.py
index 8ce6e7439..c3160be0f 100644
--- a/gpu4pyscf/pbc/dft/multigrid_v2.py
+++ b/gpu4pyscf/pbc/dft/multigrid_v2.py
@@ -8,7 +8,8 @@
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
@@ -36,7 +37,7 @@
 
 __all__ = ['MultiGridNumInt']
 
-libgpbc = load_library('libmgrid_v2')
+libgpbc = load_library("libmgrid_v2")
 libgpbc.evaluate_density_driver.restype = ctypes.c_int
 libgpbc.evaluate_xc_driver.restype = ctypes.c_int
 libgpbc.evaluate_xc_gradient_driver.restype = ctypes.c_int
@@ -51,7 +52,7 @@ def complex_type(dtype):
     elif dtype == cp.float64:
         return cp.complex128
     else:
-        raise ValueError('Invalid dtype')
+        raise ValueError("Invalid dtype")
 
 
 def cast_to_pointer(array):
@@ -60,7 +61,7 @@ def cast_to_pointer(array):
     elif isinstance(array, np.ndarray):
         return array.ctypes.data_as(ctypes.c_void_p)
     else:
-        raise ValueError('Invalid array type')
+        raise ValueError("Invalid array type")
 
 
 def fft_in_place(x):
@@ -211,8 +212,8 @@ def unique_with_sort(x):
         return x, cp.zeros(n)
 
     sort_index = cp.argsort(x)
-    inverse_sort = cp.empty(n, dtype=cp.int64)
-    inverse_sort[sort_index] = cp.arange(0, n, dtype=cp.int64)
+    inverse_sort = cp.empty(n, dtype = cp.int64)
+    inverse_sort[sort_index] = cp.arange(0, n, dtype = cp.int64)
     x = x[sort_index]
 
     mask = cp.empty(n, dtype=cp.bool_)
@@ -354,7 +355,7 @@ def image_pair_to_difference(
     vectors_to_neighboring_images,
     lattice_vectors,
 ):
-    """
+    '''
     Find unique image pairs for double lattice-sums associated with orbital products.
 
     When k-point phases are applied to orbital products with double lattice sum
@@ -376,14 +377,14 @@ def image_pair_to_difference(
         A tuple containing:
         - The reduced lattice-sum vectors T for the unique image pairs.
         - A inverse mapping that restores the index of double lattice-sum from T.
-    """
+    '''
     vectors_to_neighboring_images = cp.asarray(vectors_to_neighboring_images)
     lattice_vectors = cp.asarray(lattice_vectors)
 
     translation_vectors = cp.asarray(
         cp.linalg.solve(lattice_vectors.T, vectors_to_neighboring_images.T).T,
     )
-    translation_vectors = cp.asarray(cp.round(translation_vectors), dtype=cp.int32)
+    translation_vectors = cp.asarray(cp.round(translation_vectors), dtype = cp.int32)
     difference_images, inverse = _unique_image_pair(translation_vectors)
     difference_images = difference_images @ lattice_vectors
 
@@ -391,36 +392,32 @@ def image_pair_to_difference(
     # where R1 is associated with the first orbital in a pair, and R2 associated to the second.
     return cp.asarray(difference_images), cp.asarray(inverse, dtype=cp.int32)
 
-
 def _unique_image_pair(translation_vectors):
-    """
+    '''
     unqiue((-L[:,None] + L).reshape(-1, 3), axis=0, return_inverse=True)
-    """
+    '''
     image_difference_full = (
         # -k_i + k_j corresponding to <i|j>
-        translation_vectors[None, :, :] - translation_vectors[:, None, :]
+        translation_vectors[None,:,:] - translation_vectors[:,None,:]
     ).reshape(-1, 3)
 
     max_offset = (translation_vectors.max(axis=0) - translation_vectors.min(axis=0)).max() + 1
-    assert (max_offset * 2) ** 3 < np.iinfo(np.int32).max
+    assert (max_offset * 2)**3 < np.iinfo(np.int32).max
     image_difference_3in1 = image_difference_full
     image_difference_3in1 += max_offset
-    image_difference_3in1 = (
-        image_difference_3in1[:, 0] * (max_offset * 2) ** 2
-        + image_difference_3in1[:, 1] * (max_offset * 2)
-        + image_difference_3in1[:, 2]
-    )
+    image_difference_3in1 = image_difference_3in1[:, 0] * (max_offset * 2)**2 \
+                          + image_difference_3in1[:, 1] * (max_offset * 2) \
+                          + image_difference_3in1[:, 2]
 
     image_difference_3in1, inverse = unique_with_sort(image_difference_3in1)
 
-    translation_vectors = cp.empty([image_difference_3in1.shape[0], 3], dtype=cp.int32)
-    translation_vectors[:, 0] = image_difference_3in1 // (max_offset * 2) ** 2
-    translation_vectors[:, 1] = (image_difference_3in1 % (max_offset * 2) ** 2) // (max_offset * 2)
+    translation_vectors = cp.empty([image_difference_3in1.shape[0], 3], dtype = cp.int32)
+    translation_vectors[:, 0] = image_difference_3in1 // (max_offset * 2)**2
+    translation_vectors[:, 1] = (image_difference_3in1 % (max_offset * 2)**2) // (max_offset * 2)
     translation_vectors[:, 2] = image_difference_3in1 % (max_offset * 2)
     translation_vectors -= max_offset
     return translation_vectors, inverse
 
-
 def image_phase_for_kpts(cell, neighboring_images, kpts=None):
     n_images = len(neighboring_images)
     if kpts is None or is_gamma_point(kpts):
@@ -432,10 +429,11 @@ def image_phase_for_kpts(cell, neighboring_images, kpts=None):
             neighboring_images,
             lattice_vectors,
         )
-        phase_diff_among_images = cp.exp(1j * cp.asarray(kpts.reshape(-1, 3)).dot(difference_images.T))
+        phase_diff_among_images = cp.exp(
+            1j * cp.asarray(kpts.reshape(-1, 3)).dot(difference_images.T)
+        )
     return phase_diff_among_images, image_pair_difference_index
 
-
 def count_non_trivial_pairs(
     i_angular,
     j_angular,
@@ -558,7 +556,7 @@ def assign_pairs_to_blocks(
 ):
     n_blocks = np.prod(n_blocks_abc)
     n_pairs_on_blocks = cp.zeros(n_blocks + 1, dtype=cp.int32)
-    n_unstable_pairs_on_blocks = cp.zeros(n_blocks + 1, dtype=cp.int32)
+    n_unstable_pairs_on_blocks = cp.zeros(n_blocks + 1, dtype = cp.int32)
     err = libgpbc.count_pairs_on_blocks(
         cast_to_pointer(n_pairs_on_blocks),
         cast_to_pointer(n_unstable_pairs_on_blocks),
@@ -576,13 +574,14 @@ def assign_pairs_to_blocks(
         cast_to_pointer(mesh),
         cast_to_pointer(atm),
         cast_to_pointer(bas),
-        cast_to_pointer(env),
+        cast_to_pointer(env)
     )
-    has_unstable_pairs = n_unstable_pairs_on_blocks[-1] > 0
+    has_unstable_pairs = (n_unstable_pairs_on_blocks[-1] > 0)
     if not has_warned_instability and has_unstable_pairs:
-        warnings.warn('Numerical instability may occur due to presence of core electrons or insufficient ke_cutoff.')
+        warnings.warn("Numerical instability may occur due to presence of core electrons or insufficient ke_cutoff.")
         has_warned_instability = True
 
+
     if err != 0:
         raise RuntimeError('count_pairs_on_blocks failed')
 
@@ -612,7 +611,7 @@ def assign_pairs_to_blocks(
         cast_to_pointer(mesh),
         cast_to_pointer(atm),
         cast_to_pointer(bas),
-        cast_to_pointer(env),
+        cast_to_pointer(env)
     )
 
     libgpbc.tailor_gaussian_pairs(
@@ -650,11 +649,11 @@ def assign_pairs_to_blocks(
         pairs_on_blocks,
         accumulated_n_pairs_per_block,
         sorted_block_index,
-        has_warned_instability,
+        has_warned_instability
     )
 
 
-def sort_gaussian_pairs(mydf, xc_type='LDA'):
+def sort_gaussian_pairs(mydf, xc_type="LDA"):
     cell = mydf.cell
     log = logger.new_logger(cell)
     t0 = log.init_timer()
@@ -667,30 +666,23 @@ def sort_gaussian_pairs(mydf, xc_type='LDA'):
         is_non_orthogonal = 1
     else:
         is_non_orthogonal = 0
-    reciprocal_lattice_vectors = np.asarray(np.linalg.inv(lattice_vectors.T), order='C')
+    reciprocal_lattice_vectors = np.asarray(np.linalg.inv(lattice_vectors.T), order="C")
 
     reciprocal_norms = np.linalg.norm(reciprocal_lattice_vectors, axis=1)
     libgpbc.update_lattice_vectors(
         lattice_vectors.ctypes,
         reciprocal_lattice_vectors.ctypes,
-        reciprocal_norms.ctypes,
+        reciprocal_norms.ctypes
     )
 
-    tasks = getattr(mydf, 'tasks', None)
+    tasks = getattr(mydf, "tasks", None)
     if tasks is None:
         tasks = multigrid.multi_grids_tasks(cell, mydf.mesh, log)
         mydf.tasks = tasks
 
-    t0 = log.timer('task generation', *t0)
+    t0 = log.timer("task generation", *t0)
     t1 = t0
     pairs = []
-
-    derivative_order = 0
-    if xc_type == 'GGA':
-        derivative_order = 1
-    if xc_type == 'mGGA':
-        raise NotImplementedError('mGGA is not supported yet in multigrid_v2.')
-
     for grids_localized, grids_diffused in tasks:
         subcell_in_localized_region = grids_localized.cell
         # the original grids_localized.mesh has dtype=np.int64, which can cause
@@ -699,17 +691,23 @@ def sort_gaussian_pairs(mydf, xc_type='LDA'):
 
         fft_grid = list(
             map(
-                lambda n_mesh_points: cp.round(cp.fft.fftfreq(n_mesh_points, 1.0 / n_mesh_points)).astype(cp.int32),
+                lambda n_mesh_points: cp.round(cp.fft.fftfreq(
+                    n_mesh_points, 1.0 / n_mesh_points
+                )).astype(cp.int32),
                 mesh,
             )
         )
 
-        dxyz_dabc = lattice_vectors / mesh[:, None]
+        dxyz_dabc = lattice_vectors / mesh[:,None]
         libgpbc.update_dxyz_dabc(dxyz_dabc.ctypes)
         n_blocks_abc = np.asarray(np.ceil(mesh / block_size), dtype=cp.int32)
-        equivalent_cell_in_localized, coeff_in_localized = subcell_in_localized_region.decontract_basis(to_cart=True)
+        equivalent_cell_in_localized, coeff_in_localized = (
+            subcell_in_localized_region.decontract_basis(to_cart=True, aggregate=True)
+        )
 
-        n_primitive_gtos_in_localized = multigrid._pgto_shells(subcell_in_localized_region)
+        n_primitive_gtos_in_localized = multigrid._pgto_shells(
+            subcell_in_localized_region
+        )
 
         # theoretically we can use the rcut defined in localized cell to reduce the
         # number of images, but somehow it can introduce some error when the lattice
@@ -735,13 +733,15 @@ def sort_gaussian_pairs(mydf, xc_type='LDA'):
         concatenated_coeff, concatenated_shape = sort_contraction_coefficients(concatenated_coeff)
 
         n_primitive_gtos_in_two_regions = multigrid._pgto_shells(grouped_cell)
-        rad = vol ** (-1.0 / 3) * cell.rcut + 1
-        surface = 4 * np.pi * rad**2
+        rad = vol**(-1./3) * cell.rcut + 1
+        surface = 4*np.pi * rad**2
         lattice_sum_factor = surface
         precision = cell.precision / lattice_sum_factor
         threshold_in_log = np.log(precision)
 
-        shell_to_ao_indices = cp.asarray(gto.moleintor.make_loc(grouped_cell._bas, 'cart'), dtype=cp.int32)
+        shell_to_ao_indices = cp.asarray(
+            gto.moleintor.make_loc(grouped_cell._bas, "cart"), dtype=cp.int32
+        )
         ao_indices_in_localized = cp.asarray(grids_localized.ao_idx, dtype=cp.int32)
         if grids_diffused is None:
             ao_indices_in_diffused = cp.array([], dtype=cp.int32)
@@ -758,7 +758,9 @@ def sort_gaussian_pairs(mydf, xc_type='LDA'):
             i_shells = cp.asarray(np.where(i_angulars == l)[0], dtype=cp.int32)
             sorted_i_shells.append(i_shells)
 
-        j_angulars = grouped_cell._bas[:n_primitive_gtos_in_two_regions, multigrid.ANG_OF]
+        j_angulars = grouped_cell._bas[
+            :n_primitive_gtos_in_two_regions, multigrid.ANG_OF
+        ]
         j_angulars_unique = np.unique(j_angulars)
         sorted_j_shells = []
         for l in j_angulars_unique:
@@ -769,7 +771,7 @@ def sort_gaussian_pairs(mydf, xc_type='LDA'):
         bas = cp.asarray(grouped_cell._bas, dtype=cp.int32)
         env = cp.asarray(grouped_cell._env)
 
-        t1 = log.timer_debug2('routines before screening', *t1)
+        t1 = log.timer_debug2("routines before screening", *t1)
         has_warned_instability = False
         for i_angular, i_shells in zip(i_angulars_unique, sorted_i_shells):
             for j_angular, j_shells in zip(j_angulars_unique, sorted_j_shells):
@@ -790,15 +792,21 @@ def sort_gaussian_pairs(mydf, xc_type='LDA'):
                     env,
                     threshold_in_log,
                 )
-                t1 = log.timer_debug2('screening in angular pair' + str((i_angular, j_angular)), *t1)
-                contributing_block_ranges = pairs_to_blocks_end - pairs_to_blocks_begin + 1
-                n_contributing_blocks_per_pair = cp.prod(contributing_block_ranges, axis=0)
+                t1 = log.timer_debug2(
+                    "screening in angular pair" + str((i_angular, j_angular)), *t1
+                )
+                contributing_block_ranges = (
+                    pairs_to_blocks_end - pairs_to_blocks_begin + 1
+                )
+                n_contributing_blocks_per_pair = cp.prod(
+                    contributing_block_ranges, axis=0
+                )
                 n_indices = int(cp.sum(n_contributing_blocks_per_pair))
                 (
                     gaussian_pair_indices,
                     accumulated_counts,
                     sorted_contributing_blocks,
-                    has_warned_instability,
+                    has_warned_instability
                 ) = assign_pairs_to_blocks(
                     pairs_to_blocks_begin,
                     pairs_to_blocks_end,
@@ -822,8 +830,9 @@ def sort_gaussian_pairs(mydf, xc_type='LDA'):
                     derivative_order,
                 )
                 t1 = log.timer_debug2(
-                    'assigning pairs to blocks in angular pair' + str((i_angular, j_angular)),
-                    *t1,
+                    "assigning pairs to blocks in angular pair"
+                    + str((i_angular, j_angular)),
+                    *t1
                 )
                 per_angular_pairs.append(
                     {
@@ -863,7 +872,7 @@ def sort_gaussian_pairs(mydf, xc_type='LDA'):
 
     mydf.sorted_gaussian_pairs = pairs
 
-    t0 = log.timer('sort_gaussian_pairs', *t0)
+    t0 = log.timer("sort_gaussian_pairs", *t0)
     return mydf
 
 
@@ -872,7 +881,7 @@ def evaluate_density_wrapper(pairs_info, dm_slice, img_phase, ignore_imag=True,
         c_driver = libgpbc.evaluate_density_tau_driver
     else:
         c_driver = libgpbc.evaluate_density_driver
-    n_images = pairs_info['neighboring_images'].shape[0]
+    n_images = pairs_info["neighboring_images"].shape[0]
     phase_diff_among_images, image_pair_difference_index = img_phase
     n_k_points, n_difference_images = phase_diff_among_images.shape
     if n_k_points == 1 and n_difference_images == 1:
@@ -882,7 +891,9 @@ def evaluate_density_wrapper(pairs_info, dm_slice, img_phase, ignore_imag=True,
         # e^{i \vec{k} \cdot (\vec{R}_1 - \vec{R}_2)}
         # Because during grid density evaluation, rho = \sum_{\mu\nu} D_{\mu\nu} \mu \nu^*
         # The conjugate is on \nu, which is different from other Fock integrals
-        density_matrix_with_translation = cp.einsum('kt, ikpq->itpq', phase_diff_among_images.conj(), dm_slice)
+        density_matrix_with_translation = cp.einsum(
+            "kt, ikpq->itpq", phase_diff_among_images.conj(), dm_slice
+        )
 
     n_channels, _, n_i_functions, n_j_functions = density_matrix_with_translation.shape
 
@@ -894,7 +905,9 @@ def evaluate_density_wrapper(pairs_info, dm_slice, img_phase, ignore_imag=True,
         # assert abs(density_matrix_with_translation.imag).max() < real_dm_imag_threshold, \
         #     f"The dm transformed into real space contains large imaginary part " \
         #     f"(max = {abs(density_matrix_with_translation.imag).max()}) >= {real_dm_imag_threshold}"
-    density_matrix_with_translation_real_part = cp.asarray(density_matrix_with_translation.real, order='C')
+    density_matrix_with_translation_real_part = cp.asarray(
+        density_matrix_with_translation.real, order="C"
+    )
 
     if density_matrix_with_translation_real_part.dtype == cp.float32:
         use_float_precision = ctypes.c_int(1)
@@ -904,50 +917,40 @@ def evaluate_density_wrapper(pairs_info, dm_slice, img_phase, ignore_imag=True,
     assert density_matrix_with_translation_real_part.size < np.iinfo(np.int32).max
 
     if with_tau:
-        density = cp.zeros(
-            (
-                n_channels,
-                2,
-            )
-            + tuple(pairs_info['mesh']),
-            dtype=density_matrix_with_translation_real_part.dtype,
-        )
+        density = cp.zeros((n_channels, 2, ) + tuple(pairs_info["mesh"]), dtype=density_matrix_with_translation_real_part.dtype)
     else:
-        density = cp.zeros(
-            (n_channels,) + tuple(pairs_info['mesh']),
-            dtype=density_matrix_with_translation_real_part.dtype,
-        )
+        density = cp.zeros((n_channels,) + tuple(pairs_info["mesh"]), dtype=density_matrix_with_translation_real_part.dtype)
 
-    for gaussians_per_angular_pair in pairs_info['per_angular_pairs']:
-        (i_angular, j_angular) = gaussians_per_angular_pair['angular']
+    for gaussians_per_angular_pair in pairs_info["per_angular_pairs"]:
+        (i_angular, j_angular) = gaussians_per_angular_pair["angular"]
 
         err = c_driver(
             cast_to_pointer(density),
             cast_to_pointer(density_matrix_with_translation_real_part),
             ctypes.c_int(i_angular),
             ctypes.c_int(j_angular),
-            cast_to_pointer(gaussians_per_angular_pair['screened_shell_pairs']),
-            cast_to_pointer(gaussians_per_angular_pair['i_shells']),
-            cast_to_pointer(gaussians_per_angular_pair['j_shells']),
-            ctypes.c_int(len(gaussians_per_angular_pair['j_shells'])),
-            cast_to_pointer(gaussians_per_angular_pair['shell_to_ao_indices']),
+            cast_to_pointer(gaussians_per_angular_pair["screened_shell_pairs"]),
+            cast_to_pointer(gaussians_per_angular_pair["i_shells"]),
+            cast_to_pointer(gaussians_per_angular_pair["j_shells"]),
+            ctypes.c_int(len(gaussians_per_angular_pair["j_shells"])),
+            cast_to_pointer(gaussians_per_angular_pair["shell_to_ao_indices"]),
             ctypes.c_int(n_i_functions),
             ctypes.c_int(n_j_functions),
-            cast_to_pointer(gaussians_per_angular_pair['pair_indices_per_block']),
-            cast_to_pointer(gaussians_per_angular_pair['accumulated_counts_per_block']),
-            cast_to_pointer(gaussians_per_angular_pair['sorted_block_index']),
-            ctypes.c_int(len(gaussians_per_angular_pair['sorted_block_index'])),
-            cast_to_pointer(gaussians_per_angular_pair['image_indices']),
-            cast_to_pointer(pairs_info['neighboring_images']),
+            cast_to_pointer(gaussians_per_angular_pair["pair_indices_per_block"]),
+            cast_to_pointer(gaussians_per_angular_pair["accumulated_counts_per_block"]),
+            cast_to_pointer(gaussians_per_angular_pair["sorted_block_index"]),
+            ctypes.c_int(len(gaussians_per_angular_pair["sorted_block_index"])),
+            cast_to_pointer(gaussians_per_angular_pair["image_indices"]),
+            cast_to_pointer(pairs_info["neighboring_images"]),
             ctypes.c_int(n_images),
             cast_to_pointer(image_pair_difference_index),
             ctypes.c_int(n_difference_images),
-            (ctypes.c_int * 3)(*pairs_info['mesh']),
-            cast_to_pointer(pairs_info['atm']),
-            cast_to_pointer(pairs_info['bas']),
-            cast_to_pointer(pairs_info['env']),
+            (ctypes.c_int * 3)(*pairs_info["mesh"]),
+            cast_to_pointer(pairs_info["atm"]),
+            cast_to_pointer(pairs_info["bas"]),
+            cast_to_pointer(pairs_info["env"]),
             ctypes.c_int(n_channels),
-            ctypes.c_int(pairs_info['is_non_orthogonal']),
+            ctypes.c_int(pairs_info["is_non_orthogonal"]),
             use_float_precision,
         )
         if err != 0:
@@ -955,8 +958,8 @@ def evaluate_density_wrapper(pairs_info, dm_slice, img_phase, ignore_imag=True,
 
     return density
 
-
 def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type='LDA'):
+    dm_kpts = cp.asarray(dm_kpts, order="C")
     dms = _format_dms(dm_kpts, kpts)
     n_channels, n_k_points = dms.shape[:2]
     if mydf.sorted_gaussian_pairs is None:
@@ -965,40 +968,39 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type='LDA'):
     with_tau = False
     if xc_type == "LDA" or xc_type == 'HF':
         density_slices = 1
-    elif xc_type == 'GGA':
+    elif xc_type == "GGA":
         density_slices = 4
-    elif xc_type == 'MGGA':
+    elif xc_type == "MGGA":
         density_slices = 5
         with_tau = True
     else:
-        raise ValueError(f'Incorrect xc_type = {xc_type}')
+        raise ValueError(f"Incorrect xc_type = {xc_type}")
 
     cell = mydf.cell
-    log = logger.new_logger(cell, mydf.verbose)
 
     nx, ny, nz = mydf.mesh
-    density_on_g_mesh = cp.zeros((n_channels, density_slices, nx, ny, nz), dtype=cp.complex128)
-
+    density_on_g_mesh = cp.zeros(
+        (n_channels, density_slices, nx, ny, nz), dtype=cp.complex128
+    )
     for pairs in mydf.sorted_gaussian_pairs:
-        mesh = pairs['mesh']
+
+        mesh = pairs["mesh"]
 
         n_grid_points = np.prod(mesh)
         weight_per_grid_point = 1.0 / n_k_points * cell.vol / n_grid_points
 
-        breakpoint()
-
         density_matrix_with_rows_in_localized = dms[
             :,
             :,
-            pairs['ao_indices_in_localized'][:, None],
-            pairs['concatenated_ao_indices'],
+            pairs["ao_indices_in_localized"][:, None],
+            pairs["concatenated_ao_indices"],
         ]
 
         density_matrix_with_rows_in_diffused = dms[
             :,
             :,
-            pairs['ao_indices_in_diffused'][:, None],
-            pairs['ao_indices_in_localized'],
+            pairs["ao_indices_in_diffused"][:, None],
+            pairs["ao_indices_in_localized"],
         ]
 
         n_ao_in_localized = density_matrix_with_rows_in_diffused.shape[3]
@@ -1022,9 +1024,11 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type='LDA'):
 
         libgpbc.update_dxyz_dabc(pairs['dxyz_dabc'].ctypes)
 
-        img_phase = image_phase_for_kpts(cell, pairs['neighboring_images'], kpts)
+        img_phase = image_phase_for_kpts(cell, pairs["neighboring_images"], kpts)
         density = (
-            evaluate_density_wrapper(pairs, coeff_sandwiched_density_matrix, img_phase, with_tau=with_tau)
+            evaluate_density_wrapper(
+                pairs, coeff_sandwiched_density_matrix, img_phase, with_tau = with_tau
+            )
             * weight_per_grid_point
         )
 
@@ -1034,12 +1038,13 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type='LDA'):
             density = density[:, 0]
 
         density = fft_in_place(density)
+
         density_on_g_mesh[
             :,
             0,
-            pairs['fft_grid'][0][:, None, None],
-            pairs['fft_grid'][1][:, None],
-            pairs['fft_grid'][2],
+            pairs["fft_grid"][0][:, None, None],
+            pairs["fft_grid"][1][:, None],
+            pairs["fft_grid"][2],
         ] += density
 
         if with_tau:
@@ -1048,9 +1053,9 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type='LDA'):
             density_on_g_mesh[
                 :,
                 4,
-                pairs['fft_grid'][0][:, None, None],
-                pairs['fft_grid'][1][:, None],
-                pairs['fft_grid'][2],
+                pairs["fft_grid"][0][:, None, None],
+                pairs["fft_grid"][1][:, None],
+                pairs["fft_grid"][2],
             ] += tau
 
     density_on_g_mesh = density_on_g_mesh.reshape([n_channels, density_slices, -1])
@@ -1063,11 +1068,11 @@ def evaluate_density_on_g_mesh(mydf, dm_kpts, kpts=None, xc_type='LDA'):
 
 def evaluate_xc_wrapper(pairs_info, xc_weights, img_phase, with_tau=False):
     if with_tau:
-        assert xc_weights.ndim == 3 + 2 and xc_weights.shape[1] == 2
+        assert xc_weights.ndim == 3+2 and xc_weights.shape[1] == 2
         n_channels = xc_weights.shape[0]
         # density_slices = 2
     else:
-        assert (xc_weights.ndim == 3 + 2 and xc_weights.shape[1] == 1) or (xc_weights.ndim == 3 + 1)
+        assert (xc_weights.ndim == 3+2 and xc_weights.shape[1] == 1) or (xc_weights.ndim == 3+1)
         n_channels = xc_weights.shape[0]
         # density_slices = 1
 
@@ -1079,7 +1084,7 @@ def evaluate_xc_wrapper(pairs_info, xc_weights, img_phase, with_tau=False):
 
     phase_diff_among_images, image_pair_difference_index = img_phase
     n_k_points, n_difference_images = phase_diff_among_images.shape
-    n_images = pairs_info['neighboring_images'].shape[0]
+    n_images = pairs_info["neighboring_images"].shape[0]
 
     fock = cp.zeros(
         (n_channels, n_difference_images, n_i_functions, n_j_functions),
@@ -1091,39 +1096,39 @@ def evaluate_xc_wrapper(pairs_info, xc_weights, img_phase, with_tau=False):
         assert xc_weights.dtype == cp.float64
         use_float_precision = ctypes.c_int(0)
 
-    for gaussians_per_angular_pair in pairs_info['per_angular_pairs']:
+    for gaussians_per_angular_pair in pairs_info["per_angular_pairs"]:
         fock_slice = cp.zeros(
             (n_channels, n_difference_images, n_i_functions, n_j_functions),
             dtype=xc_weights.dtype,
         )
-        (i_angular, j_angular) = gaussians_per_angular_pair['angular']
+        (i_angular, j_angular) = gaussians_per_angular_pair["angular"]
         err = c_driver(
             cast_to_pointer(fock_slice),
             cast_to_pointer(xc_weights),
             ctypes.c_int(i_angular),
             ctypes.c_int(j_angular),
-            cast_to_pointer(gaussians_per_angular_pair['screened_shell_pairs']),
-            cast_to_pointer(gaussians_per_angular_pair['i_shells']),
-            cast_to_pointer(gaussians_per_angular_pair['j_shells']),
-            ctypes.c_int(len(gaussians_per_angular_pair['j_shells'])),
-            cast_to_pointer(gaussians_per_angular_pair['shell_to_ao_indices']),
+            cast_to_pointer(gaussians_per_angular_pair["screened_shell_pairs"]),
+            cast_to_pointer(gaussians_per_angular_pair["i_shells"]),
+            cast_to_pointer(gaussians_per_angular_pair["j_shells"]),
+            ctypes.c_int(len(gaussians_per_angular_pair["j_shells"])),
+            cast_to_pointer(gaussians_per_angular_pair["shell_to_ao_indices"]),
             ctypes.c_int(n_i_functions),
             ctypes.c_int(n_j_functions),
-            cast_to_pointer(gaussians_per_angular_pair['pair_indices_per_block']),
-            cast_to_pointer(gaussians_per_angular_pair['accumulated_counts_per_block']),
-            cast_to_pointer(gaussians_per_angular_pair['sorted_block_index']),
-            ctypes.c_int(len(gaussians_per_angular_pair['sorted_block_index'])),
-            cast_to_pointer(gaussians_per_angular_pair['image_indices']),
-            cast_to_pointer(pairs_info['neighboring_images']),
+            cast_to_pointer(gaussians_per_angular_pair["pair_indices_per_block"]),
+            cast_to_pointer(gaussians_per_angular_pair["accumulated_counts_per_block"]),
+            cast_to_pointer(gaussians_per_angular_pair["sorted_block_index"]),
+            ctypes.c_int(len(gaussians_per_angular_pair["sorted_block_index"])),
+            cast_to_pointer(gaussians_per_angular_pair["image_indices"]),
+            cast_to_pointer(pairs_info["neighboring_images"]),
             ctypes.c_int(n_images),
             cast_to_pointer(image_pair_difference_index),
             ctypes.c_int(n_difference_images),
-            cast_to_pointer(pairs_info['mesh']),
-            cast_to_pointer(pairs_info['atm']),
-            cast_to_pointer(pairs_info['bas']),
-            cast_to_pointer(pairs_info['env']),
+            cast_to_pointer(pairs_info["mesh"]),
+            cast_to_pointer(pairs_info["atm"]),
+            cast_to_pointer(pairs_info["bas"]),
+            cast_to_pointer(pairs_info["env"]),
             ctypes.c_int(n_channels),
-            ctypes.c_int(pairs_info['is_non_orthogonal']),
+            ctypes.c_int(pairs_info["is_non_orthogonal"]),
             use_float_precision,
         )
         fock += fock_slice
@@ -1131,7 +1136,9 @@ def evaluate_xc_wrapper(pairs_info, xc_weights, img_phase, with_tau=False):
             raise RuntimeError(f'evaluate_xc_driver for li={i_angular} lj={j_angular} failed')
 
     if not (n_k_points == 1 and n_difference_images == 1):
-        return cp.einsum('kt, ntij -> nkij', phase_diff_among_images, fock)
+        return cp.einsum(
+            "kt, ntij -> nkij", phase_diff_among_images, fock
+        )
     else:
         return fock
 
@@ -1145,7 +1152,7 @@ def convert_xc_on_g_mesh_to_fock(
 ):
     cell = mydf.cell
     nao = cell.nao_nr()
-    log = logger.new_logger(cell, mydf.verbose)
+
     if with_tau:
         if xc_on_g_mesh.ndim == 2:
             assert xc_on_g_mesh.shape[0] == 2
@@ -1154,7 +1161,7 @@ def convert_xc_on_g_mesh_to_fock(
             assert xc_on_g_mesh.shape[1] == 2
             n_channels = xc_on_g_mesh.shape[0]
         else:
-            raise ValueError('Incorrect shape of xc_on_g_mesh = {xc_on_g_mesh.shape}')
+            raise ValueError("Incorrect shape of xc_on_g_mesh = {xc_on_g_mesh.shape}")
         density_slices = 2
     else:
         if xc_on_g_mesh.ndim == 1:
@@ -1165,7 +1172,7 @@ def convert_xc_on_g_mesh_to_fock(
             assert xc_on_g_mesh.shape[1] == 1
             n_channels = xc_on_g_mesh.shape[0]
         else:
-            raise ValueError('Incorrect shape of xc_on_g_mesh = {xc_on_g_mesh.shape}')
+            raise ValueError("Incorrect shape of xc_on_g_mesh = {xc_on_g_mesh.shape}")
         density_slices = 1
 
     xc_on_g_mesh = xc_on_g_mesh.reshape(n_channels, density_slices, *mydf.mesh)
@@ -1191,15 +1198,15 @@ def convert_xc_on_g_mesh_to_fock(
         interpolated_xc = xc_on_g_mesh[
             :,
             :,
-            pairs['fft_grid'][0][:, None, None],
-            pairs['fft_grid'][1][:, None],
-            pairs['fft_grid'][2],
+            pairs["fft_grid"][0][:, None, None],
+            pairs["fft_grid"][1][:, None],
+            pairs["fft_grid"][2],
         ]
-        interpolated_xc = cp.asarray(ifft_in_place(interpolated_xc).real, order='C')
+        interpolated_xc = cp.asarray(ifft_in_place(interpolated_xc).real, order="C")
 
-        n_ao_in_localized = len(pairs['ao_indices_in_localized'])
-        libgpbc.update_dxyz_dabc(pairs['dxyz_dabc'].ctypes)
-        img_phase = image_phase_for_kpts(cell, pairs['neighboring_images'], kpts)
+        n_ao_in_localized = len(pairs["ao_indices_in_localized"])
+        libgpbc.update_dxyz_dabc(pairs["dxyz_dabc"].ctypes)
+        img_phase = image_phase_for_kpts(cell, pairs["neighboring_images"], kpts)
         fock_slice = evaluate_xc_wrapper(pairs, interpolated_xc, img_phase, with_tau=with_tau)
         n_sets, n_k_points = fock_slice.shape[:2]
 
@@ -1224,23 +1231,24 @@ def convert_xc_on_g_mesh_to_fock(
         fock[
             :,
             :,
-            pairs['ao_indices_in_localized'][:, None],
-            pairs['ao_indices_in_localized'],
+            pairs["ao_indices_in_localized"][:, None],
+            pairs["ao_indices_in_localized"],
         ] += fock_slice[:, :, :, :n_ao_in_localized]
         fock[
             :,
             :,
-            pairs['ao_indices_in_localized'][:, None],
-            pairs['ao_indices_in_diffused'],
+            pairs["ao_indices_in_localized"][:, None],
+            pairs["ao_indices_in_diffused"],
         ] += fock_slice[:, :, :, n_ao_in_localized:]
-
         if hermi == 1:
             fock[
                 :,
                 :,
-                pairs['ao_indices_in_diffused'][:, None],
-                pairs['ao_indices_in_localized'],
-            ] += fock_slice[:, :, :, n_ao_in_localized:].transpose(0, 1, 3, 2).conj()
+                pairs["ao_indices_in_diffused"][:, None],
+                pairs["ao_indices_in_localized"],
+            ] += (
+                fock_slice[:, :, :, n_ao_in_localized:].transpose(0, 1, 3, 2).conj()
+            )
         else:
             raise NotImplementedError
 
@@ -1248,20 +1256,14 @@ def convert_xc_on_g_mesh_to_fock(
 
 
 def evaluate_xc_gradient_wrapper(
-    gradient,
-    pairs_info,
-    xc_weights,
-    dm_slice,
-    img_phase,
-    ignore_imag=True,
-    with_tau=False,
+    gradient, pairs_info, xc_weights, dm_slice, img_phase, ignore_imag=True, with_tau=False
 ):
     if with_tau:
-        assert xc_weights.ndim == 3 + 2 and xc_weights.shape[1] == 2
+        assert xc_weights.ndim == 3+2 and xc_weights.shape[1] == 2
         n_channels = xc_weights.shape[0]
         # density_slices = 2
     else:
-        assert (xc_weights.ndim == 3 + 2 and xc_weights.shape[1] == 1) or (xc_weights.ndim == 3 + 1)
+        assert (xc_weights.ndim == 3+2 and xc_weights.shape[1] == 1) or (xc_weights.ndim == 3+1)
         n_channels = xc_weights.shape[0]
         # density_slices = 1
 
@@ -1277,53 +1279,57 @@ def evaluate_xc_gradient_wrapper(
     else:
         use_float_precision = ctypes.c_int(0)
 
-    n_images = pairs_info['neighboring_images'].shape[0]
+    n_images = pairs_info["neighboring_images"].shape[0]
     phase_diff_among_images, image_pair_difference_index = img_phase
     n_k_points, n_difference_images = phase_diff_among_images.shape
 
     if n_k_points == 1 and n_difference_images == 1:
         density_matrix_with_translation = dm_slice
     else:
-        density_matrix_with_translation = cp.einsum('kt, ikpq->itpq', phase_diff_among_images.conj(), dm_slice)
+        density_matrix_with_translation = cp.einsum(
+            "kt, ikpq->itpq", phase_diff_among_images.conj(), dm_slice
+        )
 
     n_channels, _, n_i_functions, n_j_functions = density_matrix_with_translation.shape
     if ignore_imag is False:
         raise NotImplementedError
 
-    density_matrix_with_translation_real_part = cp.asarray(density_matrix_with_translation.real, order='C')
+    density_matrix_with_translation_real_part = cp.asarray(
+        density_matrix_with_translation.real, order="C"
+    )
 
     assert gradient.dtype == density_matrix_with_translation_real_part.dtype
 
-    for gaussians_per_angular_pair in pairs_info['per_angular_pairs']:
-        (i_angular, j_angular) = gaussians_per_angular_pair['angular']
+    for gaussians_per_angular_pair in pairs_info["per_angular_pairs"]:
+        (i_angular, j_angular) = gaussians_per_angular_pair["angular"]
         err = c_driver(
             cast_to_pointer(gradient),
             cast_to_pointer(xc_weights),
             cast_to_pointer(density_matrix_with_translation_real_part),
             ctypes.c_int(i_angular),
             ctypes.c_int(j_angular),
-            cast_to_pointer(gaussians_per_angular_pair['screened_shell_pairs']),
-            cast_to_pointer(gaussians_per_angular_pair['i_shells']),
-            cast_to_pointer(gaussians_per_angular_pair['j_shells']),
-            ctypes.c_int(len(gaussians_per_angular_pair['j_shells'])),
-            cast_to_pointer(gaussians_per_angular_pair['shell_to_ao_indices']),
+            cast_to_pointer(gaussians_per_angular_pair["screened_shell_pairs"]),
+            cast_to_pointer(gaussians_per_angular_pair["i_shells"]),
+            cast_to_pointer(gaussians_per_angular_pair["j_shells"]),
+            ctypes.c_int(len(gaussians_per_angular_pair["j_shells"])),
+            cast_to_pointer(gaussians_per_angular_pair["shell_to_ao_indices"]),
             ctypes.c_int(n_i_functions),
             ctypes.c_int(n_j_functions),
-            cast_to_pointer(gaussians_per_angular_pair['pair_indices_per_block']),
-            cast_to_pointer(gaussians_per_angular_pair['accumulated_counts_per_block']),
-            cast_to_pointer(gaussians_per_angular_pair['sorted_block_index']),
-            ctypes.c_int(len(gaussians_per_angular_pair['sorted_block_index'])),
-            cast_to_pointer(gaussians_per_angular_pair['image_indices']),
-            cast_to_pointer(pairs_info['neighboring_images']),
+            cast_to_pointer(gaussians_per_angular_pair["pair_indices_per_block"]),
+            cast_to_pointer(gaussians_per_angular_pair["accumulated_counts_per_block"]),
+            cast_to_pointer(gaussians_per_angular_pair["sorted_block_index"]),
+            ctypes.c_int(len(gaussians_per_angular_pair["sorted_block_index"])),
+            cast_to_pointer(gaussians_per_angular_pair["image_indices"]),
+            cast_to_pointer(pairs_info["neighboring_images"]),
             ctypes.c_int(n_images),
             cast_to_pointer(image_pair_difference_index),
             ctypes.c_int(n_difference_images),
-            cast_to_pointer(pairs_info['mesh']),
-            cast_to_pointer(pairs_info['atm']),
-            cast_to_pointer(pairs_info['bas']),
-            cast_to_pointer(pairs_info['env']),
+            cast_to_pointer(pairs_info["mesh"]),
+            cast_to_pointer(pairs_info["atm"]),
+            cast_to_pointer(pairs_info["bas"]),
+            cast_to_pointer(pairs_info["env"]),
             ctypes.c_int(n_channels),
-            ctypes.c_int(pairs_info['is_non_orthogonal']),
+            ctypes.c_int(pairs_info["is_non_orthogonal"]),
             use_float_precision,
         )
         if err != 0:
@@ -1339,7 +1345,7 @@ def convert_xc_on_g_mesh_to_fock_gradient(
     with_tau=False,
 ):
     cell = mydf.cell
-    dm_kpts = cp.asarray(dm_kpts, order='C')
+    dm_kpts = cp.asarray(dm_kpts, order="C")
     dms = _format_dms(dm_kpts, kpts)
     n_atoms = cell.natm
 
@@ -1357,24 +1363,24 @@ def convert_xc_on_g_mesh_to_fock_gradient(
         interpolated_xc = xc_on_g_mesh[
             :,
             :,
-            pairs['fft_grid'][0][:, None, None],
-            pairs['fft_grid'][1][:, None],
-            pairs['fft_grid'][2],
+            pairs["fft_grid"][0][:, None, None],
+            pairs["fft_grid"][1][:, None],
+            pairs["fft_grid"][2],
         ]
 
-        interpolated_xc = cp.asarray(ifft_in_place(interpolated_xc).real, order='C')
+        interpolated_xc = cp.asarray(ifft_in_place(interpolated_xc).real, order="C")
 
         density_matrix_slice = dms[
             :,
             :,
-            pairs['ao_indices_in_localized'][:, None],
-            pairs['concatenated_ao_indices'],
+            pairs["ao_indices_in_localized"][:, None],
+            pairs["concatenated_ao_indices"],
         ]
         density_matrix_with_rows_in_diffused = dms[
             :,
             :,
-            pairs['ao_indices_in_diffused'][:, None],
-            pairs['ao_indices_in_localized'],
+            pairs["ao_indices_in_diffused"][:, None],
+            pairs["ao_indices_in_localized"],
         ]
 
         n_ao_in_localized = density_matrix_slice.shape[2]
@@ -1408,8 +1414,7 @@ def convert_xc_on_g_mesh_to_fock_gradient(
 
     return gradient
 
-
-# FIXME: merge to multigrid_v1.get_pp
+#FIXME: merge to multigrid_v1.get_pp
 def get_nuc(ni, kpts=None):
     if ni.sorted_gaussian_pairs is None:
         ni.build()
@@ -1427,8 +1432,7 @@ def get_nuc(ni, kpts=None):
         vne = vne[0]
     return vne
 
-
-# FIXME: merge to multigrid_v1.get_pp
+#FIXME: merge to multigrid_v1.get_pp
 def get_pp(ni, kpts=None):
     """Get the periodic pseudopotential nuc-el AO matrix, with G=0 removed."""
     if ni.sorted_gaussian_pairs is None:
@@ -1446,7 +1450,7 @@ def get_pp(ni, kpts=None):
     # -einsum('ij,ij->j', pseudo.get_vlocG(cell, Gv), cell.get_SI(Gv))
     vpplocG = ni.vpplocG
     vpp = convert_xc_on_g_mesh_to_fock(ni, vpplocG, hermi=1, kpts=kpts)[0]
-    t1 = log.timer_debug1('vpploc', *t0)
+    t1 = log.timer_debug1("vpploc", *t0)
 
     vppnl = pp_int.get_pp_nl(cell, kpts)
     for k, kpt in enumerate(kpts):
@@ -1457,13 +1461,12 @@ def get_pp(ni, kpts=None):
 
     if is_single_kpt:
         vpp = vpp[0]
-    log.timer_debug1('vppnl', *t1)
-    log.timer('get_pp', *t0)
+    log.timer_debug1("vppnl", *t1)
+    log.timer("get_pp", *t0)
     return vpp
 
-
 def get_j_kpts(ni, dm_kpts, hermi=1, kpts=None, kpts_band=None):
-    """Get the Coulomb (J) AO matrix at sampled k-points.
+    '''Get the Coulomb (J) AO matrix at sampled k-points.
 
     Args:
         dm_kpts : (*, nkpts, nao, nao) ndarray or a list of (nkpts,nao,nao) ndarray
@@ -1479,7 +1482,7 @@ def get_j_kpts(ni, dm_kpts, hermi=1, kpts=None, kpts_band=None):
     Returns:
         vj : (*, nkpts, nao, nao) ndarray
         or list of vj if the input dm_kpts is a list of DMs
-    """
+    '''
     if kpts is None:
         kpts = np.zeros((1, 3))
     else:
@@ -1487,7 +1490,7 @@ def get_j_kpts(ni, dm_kpts, hermi=1, kpts=None, kpts_band=None):
     cell = ni.cell
     log = logger.new_logger(cell)
     t0 = log.init_timer()
-    dm_kpts = cp.asarray(dm_kpts, order='C')
+    dm_kpts = cp.asarray(dm_kpts, order="C")
     dms = _format_dms(dm_kpts, kpts)
     nset = dms.shape[0]
     mesh = ni.mesh
@@ -1509,28 +1512,16 @@ def get_j_kpts(ni, dm_kpts, hermi=1, kpts=None, kpts_band=None):
     density = ifft_in_place(density).real.reshape(nset, -1, ngrids)
     density /= weight
 
-    # if kpts_band is not None:
+    #if kpts_band is not None:
     #    ni = ni.copy().reset().build()
     kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band
     xc_for_fock = convert_xc_on_g_mesh_to_fock(ni, coulomb_on_g_mesh, hermi, kpts_band)
-    t0 = log.timer('vj', *t0)
+    t0 = log.timer("vj", *t0)
     return _format_jks(xc_for_fock, dm_kpts, input_band, kpts)
 
-
-def nr_rks(
-    ni,
-    cell,
-    grids,
-    xc_code,
-    dm_kpts,
-    relativity=0,
-    hermi=1,
-    kpts=None,
-    kpts_band=None,
-    with_j=False,
-    verbose=None,
-):
-    """Compute the XC energy and RKS XC matrix at sampled k-points.
+def nr_rks(ni, cell, grids, xc_code, dm_kpts, relativity=0, hermi=1,
+           kpts=None, kpts_band=None, with_j=False, verbose=None):
+    '''Compute the XC energy and RKS XC matrix at sampled k-points.
     multigrid version of function pbc.dft.numint.nr_rks.
 
     Args:
@@ -1549,7 +1540,7 @@ def nr_rks(
         nelec : number of electrons obtained from the numerical integration
         veff : (nkpts, nao, nao) ndarray
             or list of veff if the input dm_kpts is a list of DMs
-    """
+    '''
     cell = ni.cell
     log = logger.new_logger(cell, verbose)
     t0 = log.init_timer()
@@ -1561,7 +1552,7 @@ def nr_rks(
         kpts = np.zeros((1, 3))
     else:
         kpts = kpts.reshape(-1, 3)
-    dm_kpts = cp.asarray(dm_kpts, order='C')
+    dm_kpts = cp.asarray(dm_kpts, order="C")
     dms = _format_dms(dm_kpts, kpts)
     nset = dms.shape[0]
     dms = None
@@ -1591,9 +1582,11 @@ def nr_rks(
             xc_code, density[0], deriv=1, xctype=xc_type
         )[:2]
     elif xc_type == 'GGA' or xc_type == 'MGGA':
-        xc_for_energy, xc_for_fock = ni.eval_xc_eff(xc_code, density, deriv=1, xctype=xc_type)[:2]
+        xc_for_energy, xc_for_fock = ni.eval_xc_eff(
+            xc_code, density, deriv=1, xctype=xc_type
+        )[:2]
     else:
-        raise ValueError(f'Incorrect xc_type = {xc_type}')
+        raise ValueError(f"Incorrect xc_type = {xc_type}")
 
     rho_sf = density[0].real
     xc_energy_sum = float(rho_sf.dot(xc_for_energy.ravel()).get()) * weight
@@ -1603,7 +1596,7 @@ def nr_rks(
     xc_for_fock *= weight
     xc_for_fock = fft_in_place(xc_for_fock.reshape(-1, *mesh)).reshape(-1, ngrids)
 
-    log.debug('Multigrid exc %s  nelec %s', xc_energy_sum, n_electrons)
+    log.debug("Multigrid exc %s  nelec %s", xc_energy_sum, n_electrons)
 
     if xc_type == "LDA" or xc_type == 'HF':
         pass
@@ -1621,35 +1614,23 @@ def nr_rks(
             axis=0,
         )
     else:
-        raise ValueError(f'Incorrect xc_type = {xc_type}')
+        raise ValueError(f"Incorrect xc_type = {xc_type}")
 
     if with_j:
         xc_for_fock[0] += coulomb_on_g_mesh
 
     kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band
-    veff = convert_xc_on_g_mesh_to_fock(ni, xc_for_fock, hermi, kpts_band, with_tau=(xc_type == 'MGGA'))
+    veff = convert_xc_on_g_mesh_to_fock(ni, xc_for_fock, hermi, kpts_band, with_tau = (xc_type == "MGGA"))
     veff = _format_jks(veff, dm_kpts, input_band, kpts)
     veff = tag_array(veff, ecoul=coulomb_energy, exc=xc_energy_sum, vj=None, vk=None)
-    t0 = log.timer('xc', *t0)
+    t0 = log.timer("xc", *t0)
     return n_electrons, xc_energy_sum, veff
 
-
 # Note nr_uks handles only one set of KUKS density matrices (alpha, beta) in
 # each call (nr_rks supports multiple sets of KRKS density matrices)
-def nr_uks(
-    ni,
-    cell,
-    grids,
-    xc_code,
-    dm_kpts,
-    relativity=0,
-    hermi=1,
-    kpts=None,
-    kpts_band=None,
-    with_j=False,
-    verbose=None,
-):
-    """Compute the XC energy and UKS XC matrix at sampled k-points.
+def nr_uks(ni, cell, grids, xc_code, dm_kpts, relativity=0, hermi=1,
+           kpts=None, kpts_band=None, with_j=False, verbose=None):
+    '''Compute the XC energy and UKS XC matrix at sampled k-points.
     multigrid version of function pbc.dft.numint.nr_rks.
 
     Args:
@@ -1668,7 +1649,7 @@ def nr_uks(
         nelec : number of electrons obtained from the numerical integration
         veff : (nkpts, nao, nao) ndarray
             or list of veff if the input dm_kpts is a list of DMs
-    """
+    '''
     cell = ni.cell
     log = logger.new_logger(cell, verbose)
     t0 = log.init_timer()
@@ -1680,7 +1661,7 @@ def nr_uks(
         kpts = np.zeros((1, 3))
     else:
         kpts = kpts.reshape(-1, 3)
-    dm_kpts = cp.asarray(dm_kpts, order='C')
+    dm_kpts = cp.asarray(dm_kpts, order="C")
     dms = _format_dms(dm_kpts, kpts)
     nset = dms.shape[0]
     dms = None
@@ -1695,8 +1676,8 @@ def nr_uks(
     coulomb_on_g_mesh = rho_sf * ni.coulG
     coulomb_energy = 0.5 * rho_sf.conj().dot(coulomb_on_g_mesh).real
     coulomb_energy /= cell.vol
-    log.debug('Multigrid Coulomb energy %s', coulomb_energy)
-    t0 = log.timer('coulomb', *t0)
+    log.debug("Multigrid Coulomb energy %s", coulomb_energy)
+    t0 = log.timer("coulomb", *t0)
     weight = cell.vol / ngrids
 
     density = density.reshape(-1, *mesh)
@@ -1711,9 +1692,11 @@ def nr_uks(
             xc_code, density[:,0], deriv=1, xctype=xc_type
         )[:2]
     elif xc_type == 'GGA' or xc_type == 'MGGA':
-        xc_for_energy, xc_for_fock = ni.eval_xc_eff(xc_code, density, deriv=1, xctype=xc_type)[:2]
+        xc_for_energy, xc_for_fock = ni.eval_xc_eff(
+            xc_code, density, deriv=1, xctype=xc_type
+        )[:2]
     else:
-        raise ValueError(f'Incorrect xc_type = {xc_type}')
+        raise ValueError(f"Incorrect xc_type = {xc_type}")
 
     rho_sf = (density[0, 0] + density[1, 0]).real
     xc_energy_sum = float(rho_sf.dot(xc_for_energy.ravel()).real.get()) * weight
@@ -1723,7 +1706,7 @@ def nr_uks(
     xc_for_fock *= weight
     xc_for_fock = fft_in_place(xc_for_fock.reshape(-1, *mesh)).reshape(nset, -1, ngrids)
 
-    log.debug('Multigrid exc %s  nelec %s', xc_energy_sum, n_electrons)
+    log.debug("Multigrid exc %s  nelec %s", xc_energy_sum, n_electrons)
 
     if xc_type == "LDA" or xc_type == 'HF':
         pass
@@ -1742,21 +1725,21 @@ def nr_uks(
             axis=1,
         )
     else:
-        raise ValueError(f'Incorrect xc_type = {xc_type}')
+        raise ValueError(f"Incorrect xc_type = {xc_type}")
 
     if with_j:
         xc_for_fock[:, 0] += coulomb_on_g_mesh
 
     kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band
-    veff = convert_xc_on_g_mesh_to_fock(ni, xc_for_fock, hermi, kpts_band, with_tau=(xc_type == 'MGGA'))
+    veff = convert_xc_on_g_mesh_to_fock(ni, xc_for_fock, hermi, kpts_band, with_tau = (xc_type == "MGGA"))
     veff = _format_jks(veff, dm_kpts, input_band, kpts)
     veff = tag_array(veff, ecoul=coulomb_energy, exc=xc_energy_sum, vj=None, vk=None)
-    t0 = log.timer('xc', *t0)
+    t0 = log.timer("xc", *t0)
     return n_electrons, xc_energy_sum, veff
 
-
 def get_rho(ni, dm, kpts=None):
-    """Density in real space"""
+    '''Density in real space
+    '''
     cell = ni.cell
     mesh = ni.mesh
     ngrids = np.prod(mesh)
@@ -1767,7 +1750,6 @@ def get_rho(ni, dm, kpts=None):
     rhoR = ifft_in_place(density.reshape(-1, *mesh)).real / weight
     return rhoR.reshape(-1, ngrids)
 
-
 def get_veff_ip1(
     ni,
     xc_code,
@@ -1778,21 +1760,21 @@ def get_veff_ip1(
     with_pseudo_vloc_orbital_derivative=True,
     verbose=None,
 ):
-    """Computes the derivatives of the Exc along with additional contributions
+    '''Computes the derivatives of the Exc along with additional contributions
     from the Coulomb and pseudopotential terms.
 
     Note, the current return is the energy per cell scaled by the number of
     k-points. This should return the energy per cell directly and will be
     changed in future.
-    """
+    '''
     if kpts is None:
         kpts = np.zeros((1, 3))
     else:
         kpts = kpts.reshape(-1, 3)
-    cell = ni.cell
-    log = logger.new_logger(cell)
+    log = logger.new_logger(ni, verbose)
     t0 = log.init_timer()
-    dm_kpts = cp.asarray(dm_kpts, order='C')
+    cell = ni.cell
+    dm_kpts = cp.asarray(dm_kpts, order="C")
     dms = _format_dms(dm_kpts, kpts)
     nset = dms.shape[0]
     dms = None
@@ -1812,15 +1794,19 @@ def get_veff_ip1(
     density = (
         cp.asarray(
             ifft_in_place(density.reshape(nset, -1, *mesh)).real,
-            order='C',
+            order="C",
         ).reshape(nset, -1, ngrids)
         / weight
     )
 
     if nset == 1:
-        xc_for_fock = ni.eval_xc_eff(xc_code, density[0], deriv=1, xctype=xc_type)[1]
+        xc_for_fock = ni.eval_xc_eff(
+            xc_code, density[0], deriv=1, xctype=xc_type
+        )[1]
     else:
-        xc_for_fock = ni.eval_xc_eff(xc_code, density, deriv=1, xctype=xc_type)[1]
+        xc_for_fock = ni.eval_xc_eff(
+            xc_code, density, deriv=1, xctype=xc_type
+        )[1]
 
     xc_for_fock = xc_for_fock.reshape(nset, -1, *mesh) * weight
     xc_for_fock = fft_in_place(xc_for_fock).reshape(nset, -1, ngrids)
@@ -1842,7 +1828,7 @@ def get_veff_ip1(
             axis=1,
         )
     else:
-        raise ValueError(f'Incorrect xc_type = {xc_type}')
+        raise ValueError(f"Incorrect xc_type = {xc_type}")
 
     if with_j:
         xc_for_fock[:, 0] += coulomb_on_g_mesh
@@ -1854,11 +1840,10 @@ def get_veff_ip1(
         ni, xc_for_fock, dm_kpts, hermi, kpts, with_tau = (xc_type == "MGGA")
     )
 
-    t0 = log.timer('veff_gradient', *t0)
+    t0 = log.timer("veff_gradient", *t0)
 
     return veff_gradient
 
-
 class MultiGridNumInt(lib.StreamObject, numint.LibXCMixin):
     def __init__(self, cell):
         self.cell = cell
@@ -1893,7 +1878,7 @@ def get_j(self, dm, hermi=1, kpts=None, kpts_band=None):
     get_rho = get_rho
     nr_rks = nr_rks
     nr_uks = nr_uks
-    get_vxc = nr_vxc = NotImplemented  # numint_cpu.KNumInt.nr_vxc
+    get_vxc = nr_vxc = NotImplemented #numint_cpu.KNumInt.nr_vxc
 
     eval_xc_eff = numint.eval_xc_eff
     _init_xcfuns = numint.NumInt._init_xcfuns
@@ -1901,7 +1886,7 @@ def get_j(self, dm, hermi=1, kpts=None, kpts_band=None):
     nr_rks_fxc = NotImplemented
     nr_uks_fxc = NotImplemented
     nr_rks_fxc_st = NotImplemented
-    cache_xc_kernel = NotImplemented
+    cache_xc_kernel  = NotImplemented
     cache_xc_kernel1 = NotImplemented
 
     to_gpu = utils.to_gpu

From d568ac475180189258bd541cf7cc7f67b1061070 Mon Sep 17 00:00:00 2001
From: Rui Li <walterfeng@foxmail.com>
Date: Wed, 18 Mar 2026 15:03:28 -0700
Subject: [PATCH 11/14] further revert formatting

---
 gpu4pyscf/pbc/dft/multigrid_v2.py | 18 +++++++++---------
 gpu4pyscf/pbc/grad/rhf.py         |  7 +++----
 2 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/gpu4pyscf/pbc/dft/multigrid_v2.py b/gpu4pyscf/pbc/dft/multigrid_v2.py
index c3160be0f..042bf92c0 100644
--- a/gpu4pyscf/pbc/dft/multigrid_v2.py
+++ b/gpu4pyscf/pbc/dft/multigrid_v2.py
@@ -836,15 +836,15 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"):
                 )
                 per_angular_pairs.append(
                     {
-                        'angular': (i_angular, j_angular),
-                        'screened_shell_pairs': screened_shell_pairs,
-                        'pair_indices_per_block': gaussian_pair_indices,
-                        'accumulated_counts_per_block': accumulated_counts,
-                        'sorted_block_index': sorted_contributing_blocks,
-                        'image_indices': image_indices,
-                        'i_shells': i_shells,
-                        'j_shells': j_shells,
-                        'shell_to_ao_indices': shell_to_ao_indices,
+                        "angular": (i_angular, j_angular),
+                        "screened_shell_pairs": screened_shell_pairs,
+                        "pair_indices_per_block": gaussian_pair_indices,
+                        "accumulated_counts_per_block": accumulated_counts,
+                        "sorted_block_index": sorted_contributing_blocks,
+                        "image_indices": image_indices,
+                        "i_shells": i_shells,
+                        "j_shells": j_shells,
+                        "shell_to_ao_indices": shell_to_ao_indices,
                     }
                 )
 
diff --git a/gpu4pyscf/pbc/grad/rhf.py b/gpu4pyscf/pbc/grad/rhf.py
index 8a86a719c..d04c63bbd 100644
--- a/gpu4pyscf/pbc/grad/rhf.py
+++ b/gpu4pyscf/pbc/grad/rhf.py
@@ -34,7 +34,6 @@
 
 __all__ = ['Gradients']
 
-
 class GradientsBase(mol_rhf.GradientsBase):
     _keys = {'cell'}
 
@@ -89,13 +88,14 @@ def kernel(self, mo_energy=None, mo_coeff=None, mo_occ=None):
         return self.de
 
     def optimizer(self):
-        """Geometry (atom positions and lattice) optimization solver"""
+        '''Geometry (atom positions and lattice) optimization solver
+        '''
         from gpu4pyscf.geomopt.ase_solver import GeometryOptimizer
-
         return GeometryOptimizer(self.base)
 
 
 class Gradients(GradientsBase):
+
     make_rdm1e = mol_rhf.Gradients.make_rdm1e
 
     def energy_ee(self, dm):
@@ -201,7 +201,6 @@ def grad_elec(
 
     def get_stress(self):
         from gpu4pyscf.pbc.grad import rhf_stress
-
         return rhf_stress.kernel(self)
 
 def contract_h1e_dm(cell, h1e, dm, hermi=0):

From 7896cb88c3832721503ba094dc701b4ae20f6fc0 Mon Sep 17 00:00:00 2001
From: Rui Li <walterfeng@foxmail.com>
Date: Wed, 18 Mar 2026 15:05:09 -0700
Subject: [PATCH 12/14] final? revert of formatting changes

---
 gpu4pyscf/pbc/dft/multigrid_v2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gpu4pyscf/pbc/dft/multigrid_v2.py b/gpu4pyscf/pbc/dft/multigrid_v2.py
index 042bf92c0..7377c8d3a 100644
--- a/gpu4pyscf/pbc/dft/multigrid_v2.py
+++ b/gpu4pyscf/pbc/dft/multigrid_v2.py
@@ -218,7 +218,7 @@ def unique_with_sort(x):
 
     mask = cp.empty(n, dtype=cp.bool_)
     mask[0] = True
-    mask[1:] = x[1:] != x[:-1]
+    mask[1:] = (x[1:] != x[:-1])
 
     x = x[mask]
     inverse_unique = cp.cumsum(mask, dtype=cp.int64) - 1

From b74463f02ae9cc0cf811c4c2d31c4bebcbb26c88 Mon Sep 17 00:00:00 2001
From: Rui Li <walterfeng@foxmail.com>
Date: Wed, 18 Mar 2026 21:31:16 -0700
Subject: [PATCH 13/14] fix bug from merge

---
 .../lib/multigrid/multigrid_v2/screening.cuh  | 14 ++++-----
 .../lib/multigrid/multigrid_v2/utils.cuh      | 29 ++++++++++++-------
 gpu4pyscf/pbc/dft/multigrid_v2.py             | 15 +++++++---
 3 files changed, 35 insertions(+), 23 deletions(-)

diff --git a/gpu4pyscf/lib/multigrid/multigrid_v2/screening.cuh b/gpu4pyscf/lib/multigrid/multigrid_v2/screening.cuh
index fa3971250..7abdb14a5 100644
--- a/gpu4pyscf/lib/multigrid/multigrid_v2/screening.cuh
+++ b/gpu4pyscf/lib/multigrid/multigrid_v2/screening.cuh
@@ -842,21 +842,19 @@ __global__ static void tailor_gaussian_pairs_kernel(
                                 recursion_factor_bc_pow_b,
             recursion_factor_c *= exp_dc_squared) {
 
-          const double r_i_squared =
-              distance_squared(x - i_x, y - i_y, z - i_z);
-          const double r_j_squared =
-              distance_squared(x - j_x, y - j_y, z - j_z);
-          const double r_p_squared =
-              distance_squared(x - pair_x, y - pair_y, z - pair_z);
+          const double r_i = sqrt(distance_squared(x - i_x, y - i_y, z - i_z));
+          const double r_j = sqrt(distance_squared(x - j_x, y - j_y, z - j_z));
+          const double r_p =
+              sqrt(distance_squared(x - pair_x, y - pair_y, z - pair_z));
 
           const double approxmate_polynomial =
               approximate_polynomial_value<double, i_angular, j_angular>(
-                  r_i_squared, r_j_squared, r_p_squared, derivative_order);
+                  r_i, r_j, r_p, derivative_order);
 
           const double gaussian = gaussian_x * gaussian_y * gaussian_z;
 
           const double approximate_value =
-              abs(4.0 * M_PI * r_p_squared * pair_prefactor *
+              abs(4.0 * M_PI * r_p * r_p * pair_prefactor *
                   approxmate_polynomial * gaussian);
 
           max_gaussian_value = max(max_gaussian_value, approximate_value);
diff --git a/gpu4pyscf/lib/multigrid/multigrid_v2/utils.cuh b/gpu4pyscf/lib/multigrid/multigrid_v2/utils.cuh
index a878d76dd..e7b063117 100644
--- a/gpu4pyscf/lib/multigrid/multigrid_v2/utils.cuh
+++ b/gpu4pyscf/lib/multigrid/multigrid_v2/utils.cuh
@@ -27,25 +27,32 @@ __host__ __device__ T distance_squared(const T x, const T y, const T z) {
 }
 
 template <typename T, int i_angular, int j_angular>
-__host__ __device__ T approximate_polynomial_value(const double r_i_squared,
-                                                   const double r_j_squared,
-                                                   const double r_p_squared,
+__host__ __device__ T approximate_polynomial_value(const double r_i,
+                                                   const double r_j,
+                                                   const double r_p,
                                                    const int derivative_order) {
-  T result =
-      pow(r_i_squared, i_angular / 2.0) * pow(r_j_squared, j_angular / 2.0);
 
-  if (derivative_order == 1) {
-    result *= -2.0 * sqrt(r_p_squared);
+  T result = pow(r_i, i_angular) * pow(r_j, j_angular);
+
+  if (derivative_order > 0) {
+    result *= 2.0 * r_p;
     if constexpr (i_angular > 0) {
-      result += i_angular * pow(r_i_squared, (i_angular - 2) / 2.0) *
-                pow(r_j_squared, j_angular / 2.0);
+      result += i_angular * pow(r_i, i_angular - 1) * pow(r_j, j_angular);
     }
 
     if constexpr (j_angular > 0) {
-      result += j_angular * pow(r_i_squared, i_angular / 2.0) *
-                pow(r_j_squared, (j_angular - 2) / 2.0);
+      result += j_angular * pow(r_i, i_angular) * pow(r_j, j_angular - 1);
     }
   }
+
+  if (derivative_order > 1) {
+    result *= 2.0 * r_p;
+    if constexpr (i_angular > 0 && j_angular > 0) {
+      result += i_angular * j_angular * pow(r_i, i_angular - 1) *
+                pow(r_j, j_angular - 1);
+    }
+  }
+
   return result;
 }
 
diff --git a/gpu4pyscf/pbc/dft/multigrid_v2.py b/gpu4pyscf/pbc/dft/multigrid_v2.py
index 7377c8d3a..a7341de74 100644
--- a/gpu4pyscf/pbc/dft/multigrid_v2.py
+++ b/gpu4pyscf/pbc/dft/multigrid_v2.py
@@ -682,6 +682,12 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"):
 
     t0 = log.timer("task generation", *t0)
     t1 = t0
+    derivative_order = 0
+    if xc_type == 'GGA':
+        derivative_order = 1
+    if xc_type == 'MGGA':
+        derivative_order = 2
+
     pairs = []
     for grids_localized, grids_diffused in tasks:
         subcell_in_localized_region = grids_localized.cell
@@ -702,7 +708,7 @@ def sort_gaussian_pairs(mydf, xc_type="LDA"):
         libgpbc.update_dxyz_dabc(dxyz_dabc.ctypes)
         n_blocks_abc = np.asarray(np.ceil(mesh / block_size), dtype=cp.int32)
         equivalent_cell_in_localized, coeff_in_localized = (
-            subcell_in_localized_region.decontract_basis(to_cart=True, aggregate=True)
+            subcell_in_localized_region.decontract_basis(to_cart=True)
         )
 
         n_primitive_gtos_in_localized = multigrid._pgto_shells(
@@ -1605,7 +1611,7 @@ def nr_rks(ni, cell, grids, xc_code, dm_kpts, relativity=0, hermi=1,
         xc_for_fock = xc_for_fock[0]
         xc_for_fock = xc_for_fock.reshape((-1, ngrids))
     elif xc_type == 'MGGA':
-        xc_for_fock = contract_iG_potential(xc_for_fock, cell)
+        contract_iG_potential(xc_for_fock, cell)
         xc_for_fock = cp.concatenate(
             [
                 xc_for_fock[0].reshape((-1, ngrids)),
@@ -1713,7 +1719,7 @@ def nr_uks(ni, cell, grids, xc_code, dm_kpts, relativity=0, hermi=1,
     elif xc_type == 'GGA':
         for i in xc_for_fock:
             contract_iG_potential(i, cell)
-        xc_for_fock = xc_for_fock.reshape((nset, -1, ngrids))
+        xc_for_fock = xc_for_fock[:,0].reshape((nset, -1, ngrids))
     elif xc_type == 'MGGA':
         for i in xc_for_fock:
             contract_iG_potential(i, cell)
@@ -1819,7 +1825,8 @@ def get_veff_ip1(
         xc_for_fock = xc_for_fock[:, 0]
         xc_for_fock = xc_for_fock.reshape((nset, -1, ngrids))
     elif xc_type == 'MGGA':
-        xc_for_fock[:, 0] -= contract('ngp, pg -> np', xc_for_fock[:, 1:4], Gv) * 1j
+        for i in xc_for_fock:
+            contract_iG_potential(i, cell)
         xc_for_fock = cp.concatenate(
             [
                 xc_for_fock[:, 0].reshape((nset, -1, ngrids)),

From 97c4fb28475b8f1d1abb5f013f4435787cad7ff8 Mon Sep 17 00:00:00 2001
From: Rui Li <walterfeng@foxmail.com>
Date: Thu, 19 Mar 2026 14:00:54 -0700
Subject: [PATCH 14/14] comment out tailoring to check numerical stability

---
 gpu4pyscf/pbc/dft/multigrid_v2.py | 60 +++++++++++++++----------------
 1 file changed, 30 insertions(+), 30 deletions(-)

diff --git a/gpu4pyscf/pbc/dft/multigrid_v2.py b/gpu4pyscf/pbc/dft/multigrid_v2.py
index a7341de74..00b73e635 100644
--- a/gpu4pyscf/pbc/dft/multigrid_v2.py
+++ b/gpu4pyscf/pbc/dft/multigrid_v2.py
@@ -614,36 +614,36 @@ def assign_pairs_to_blocks(
         cast_to_pointer(env)
     )
 
-    libgpbc.tailor_gaussian_pairs(
-        cast_to_pointer(pairs_on_blocks),
-        cast_to_pointer(n_pairs_on_blocks),
-        ctypes.c_int(i_angular),
-        ctypes.c_int(j_angular),
-        cast_to_pointer(non_trivial_pairs),
-        cast_to_pointer(i_shells),
-        cast_to_pointer(j_shells),
-        ctypes.c_int(len(j_shells)),
-        cast_to_pointer(shell_to_ao_indices),
-        cast_to_pointer(accumulated_n_pairs_per_block),
-        cast_to_pointer(sorted_block_index),
-        ctypes.c_int(n_contributing_blocks),
-        cast_to_pointer(image_indices),
-        cast_to_pointer(vectors_to_neighboring_images),
-        ctypes.c_int(len(vectors_to_neighboring_images)),
-        cast_to_pointer(mesh),
-        cast_to_pointer(atm),
-        cast_to_pointer(bas),
-        cast_to_pointer(env),
-        ctypes.c_int(is_non_orthogonal),
-        ctypes.c_double(threshold),
-        ctypes.c_int(derivative_order),
-    )
-
-    pairs_on_blocks = pairs_on_blocks[pairs_on_blocks >= 0]
-    sorted_block_index = cp.asarray(cp.argsort(-n_pairs_on_blocks), dtype=cp.int32)
-    n_contributing_blocks = cp.count_nonzero(n_pairs_on_blocks)
-    accumulated_n_pairs_per_block[1:] = cp.cumsum(n_pairs_on_blocks, dtype=cp.int32)
-    sorted_block_index = sorted_block_index[:n_contributing_blocks]
+    # libgpbc.tailor_gaussian_pairs(
+    #     cast_to_pointer(pairs_on_blocks),
+    #     cast_to_pointer(n_pairs_on_blocks),
+    #     ctypes.c_int(i_angular),
+    #     ctypes.c_int(j_angular),
+    #     cast_to_pointer(non_trivial_pairs),
+    #     cast_to_pointer(i_shells),
+    #     cast_to_pointer(j_shells),
+    #     ctypes.c_int(len(j_shells)),
+    #     cast_to_pointer(shell_to_ao_indices),
+    #     cast_to_pointer(accumulated_n_pairs_per_block),
+    #     cast_to_pointer(sorted_block_index),
+    #     ctypes.c_int(n_contributing_blocks),
+    #     cast_to_pointer(image_indices),
+    #     cast_to_pointer(vectors_to_neighboring_images),
+    #     ctypes.c_int(len(vectors_to_neighboring_images)),
+    #     cast_to_pointer(mesh),
+    #     cast_to_pointer(atm),
+    #     cast_to_pointer(bas),
+    #     cast_to_pointer(env),
+    #     ctypes.c_int(is_non_orthogonal),
+    #     ctypes.c_double(threshold),
+    #     ctypes.c_int(derivative_order),
+    # )
+
+    # pairs_on_blocks = pairs_on_blocks[pairs_on_blocks >= 0]
+    # sorted_block_index = cp.asarray(cp.argsort(-n_pairs_on_blocks), dtype=cp.int32)
+    # n_contributing_blocks = cp.count_nonzero(n_pairs_on_blocks)
+    # accumulated_n_pairs_per_block[1:] = cp.cumsum(n_pairs_on_blocks, dtype=cp.int32)
+    # sorted_block_index = sorted_block_index[:n_contributing_blocks]
 
     return (
         pairs_on_blocks,