From 611b1d6d76fa7f7fcedf2356bd03ab99cb1de910 Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Tue, 12 May 2026 14:43:46 +0800
Subject: [PATCH 01/30] dmet: add first version of single-shot DMET driver and tests

---
 gpu4pyscf/dmet/__init__.py        |  25 ++
 gpu4pyscf/dmet/dmet.py            | 658 ++++++++++++++++++++++++++++++
 gpu4pyscf/dmet/tests/test_dmet.py | 121 ++++++
 3 files changed, 804 insertions(+)
 create mode 100644 gpu4pyscf/dmet/__init__.py
 create mode 100644 gpu4pyscf/dmet/dmet.py
 create mode 100644 gpu4pyscf/dmet/tests/test_dmet.py

diff --git a/gpu4pyscf/dmet/__init__.py b/gpu4pyscf/dmet/__init__.py
new file mode 100644
index 000000000..883b3e735
--- /dev/null
+++ b/gpu4pyscf/dmet/__init__.py
@@ -0,0 +1,25 @@
+# Copyright 2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from .dmet import (
+    DMET,
+    get_fragment_ao_indices,
+    schmidt_decompose,
+    build_embedding_basis,
+    build_core_dm,
+    transform_h1,
+    transform_eri,
+    lowdin_orth,
+)
diff --git a/gpu4pyscf/dmet/dmet.py b/gpu4pyscf/dmet/dmet.py
new file mode 100644
index 000000000..7c7a893d9
--- /dev/null
+++ b/gpu4pyscf/dmet/dmet.py
@@ -0,0 +1,658 @@
+# Copyright 2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import copy
+import numpy as np
+import cupy as cp
+import pyscf
+from pyscf import gto, ao2mo
+import gpu4pyscf
+from gpu4pyscf.scf import hf as gpu_hf
+
+
+def _as_cupy(x):
+    if isinstance(x, cp.ndarray):
+        return x
+    return cp.asarray(x)
+
+
+# TODO: use already implemented lowdin_orth
+def lowdin_orth(s):
+    """
+    Loewdin symmetric orthogonalization.
+
+    Given an AO overlap matrix ``S``, return ``X = S^{-1/2}`` and
+    ``X_inv = S^{1/2}``. Eigenvalues of ``S`` smaller than 1e-12 are
+    treated as linearly dependent and dropped.
+
+    Returns
+    -------
+    X : cp.ndarray, shape (nao, nao_orth)
+        AO -> orthonormal AO transformation. Columns of ``X`` are the
+        coefficients of the orthonormal AOs in the AO basis.
+    X_inv : cp.ndarray, shape (nao_orth, nao)
+        Inverse transformation: ``X_inv = X^T S``.
+    """
+    s = _as_cupy(s)
+    s = 0.5 * (s + s.T)
+    eigvals, eigvecs = cp.linalg.eigh(s)
+    keep = eigvals > 1e-12
+    if not cp.all(keep):
+        eigvals = eigvals[keep]
+        eigvecs = eigvecs[:, keep]
+    inv_sqrt = 1.0 / cp.sqrt(eigvals)
+    sqrt = cp.sqrt(eigvals)
+    X = (eigvecs * inv_sqrt) @ eigvecs.T          # S^{-1/2}
+    X_inv = (eigvecs * sqrt) @ eigvecs.T          # S^{+1/2}
+    return X, X_inv
+
+
+def get_fragment_ao_indices(mol, frag_atoms):
+    """
+    Return the atomic-orbital indices that belong to the listed atoms.
+
+    Parameters
+    ----------
+    mol : pyscf.gto.Mole
+        The full system molecule.
+    frag_atoms : sequence of int
+        Atom indices that constitute the fragment.
+
+    Returns
+    -------
+    ao_indices : cp.ndarray of int
+        Sorted AO indices (in the AO ordering of ``mol``) that belong
+        to ``frag_atoms``.
+    """
+    aoslice = mol.aoslice_by_atom()
+    indices = []
+    for ia in frag_atoms:
+        ia = int(ia)
+        if ia < 0 or ia >= mol.natm:
+            raise ValueError(
+                f"Atom index {ia} is out of range [0, {mol.natm})."
+            )
+        p0, p1 = int(aoslice[ia, 2]), int(aoslice[ia, 3])
+        indices.extend(range(p0, p1))
+    indices = cp.asarray(sorted(indices), dtype=cp.int32)
+    if indices.size == 0:
+        raise ValueError(
+            "Fragment is empty: no atomic orbitals were selected."
+        )
+    return indices
+
+
+def schmidt_decompose(dm_full, frag_idx, env_idx, threshold=1e-5):
+    """
+    Schmidt decomposition.
+
+    Parameters
+    ----------
+    dm_full : array_like, shape (nao, nao)
+        Spin-summed 1-RDM in the full AO basis. The trace equals the
+        number of electrons.
+    frag_idx, env_idx : cp.ndarray
+        AO indices of fragment and environment, respectively.
+        ``frag_idx`` and ``env_idx`` together must form a partition of
+        ``range(nao)``.
+    threshold : float
+        Eigenvalue cutoff used to classify the environment orbitals.
+
+    Returns
+    -------
+    bath_orb : cp.ndarray, shape (n_env, n_bath)
+        Eigenvectors of D^E whose eigenvalues are within
+        (threshold, 2 - threshold).
+    core_orb : cp.ndarray, shape (n_env, n_core)
+        Eigenvectors of D^E whose eigenvalues exceed 2 - threshold.
+        These define the unentangled occupied (core) orbitals.
+    info : dict
+        Dictionary with eigenvalue arrays for each category and the
+        electron count of the core space.
+    """
+    dm = _as_cupy(dm_full)
+    if dm.ndim != 2 or dm.shape[0] != dm.shape[1]:
+        raise ValueError("dm_full must be a square 2D matrix.")
+
+    env_idx = _as_cupy(env_idx)
+    if env_idx.size == 0:
+        # Pure fragment, no environment to entangle with.
+        return (cp.zeros((0, 0)),
+                cp.zeros((0, 0)),
+                {'core': cp.zeros(0),
+                 'bath': cp.zeros(0),
+                 'virtual': cp.zeros(0),
+                 'n_core_electrons': 0})
+
+    # Symmetrize to suppress numerical asymmetry from the SCF solver
+    D_env = dm[env_idx[:, None], env_idx[None, :]]
+    D_env = 0.5 * (D_env + D_env.T)
+
+    eigvals, eigvecs = cp.linalg.eigh(D_env)
+
+    is_core = eigvals > (2.0 - threshold)
+    is_virt = eigvals < threshold
+    is_bath = ~(is_core | is_virt)
+
+    bath_orb = eigvecs[:, is_bath]
+    core_orb = eigvecs[:, is_core]
+
+    info = {
+        'core':    eigvals[is_core],
+        'bath':    eigvals[is_bath],
+        'virtual': eigvals[is_virt],
+        # Each unentangled-occupied orbital is doubly occupied in the
+        # spin-restricted formulation.
+        'n_core_electrons': 2 * int(is_core.sum()),
+    }
+    return bath_orb, core_orb, info
+
+
+def build_embedding_basis(nao, frag_idx, env_idx, bath_orb):
+    """
+    Construct the AO -> embedded transformation matrix B.
+
+    Columns of B are arranged as
+        [ fragment-AO basis (identity columns),
+          bath orbitals (eigenvectors lifted into the env block) ].
+
+    Parameters
+    ----------
+    nao : int
+        Number of atomic orbitals in the full system.
+    frag_idx : cp.ndarray of int
+        AO indices of the fragment.
+    env_idx : cp.ndarray of int
+        AO indices of the environment.
+    bath_orb : cp.ndarray, shape (n_env, n_bath)
+        Bath orbitals expressed in the environment AO subspace.
+
+    Returns
+    -------
+    B : cp.ndarray, shape (nao, n_frag + n_bath)
+        Transformation matrix whose columns span the embedded space A.
+    """
+    frag_idx = _as_cupy(frag_idx)
+    env_idx = _as_cupy(env_idx)
+    n_frag = frag_idx.size
+    n_bath = bath_orb.shape[1] if bath_orb.size else 0
+
+    B = cp.zeros((nao, n_frag + n_bath), dtype=float)
+    # Fragment columns: identity on fragment AOs
+    B[frag_idx, cp.arange(n_frag)] = 1.0
+    # Bath columns: embed env eigenvectors into the env rows
+    if n_bath > 0:
+        B[env_idx[:, None], cp.arange(n_bath)[None, :] + n_frag] = bath_orb
+    return B
+
+
+def build_core_dm(env_idx, core_orb, nao):
+    """
+    Build the spin-summed core 1-RDM in the full AO basis.
+
+    Each unentangled-occupied orbital is doubly occupied:
+
+        D_core = 2 * C_core C_core^T,
+
+    where C_core is the matrix of core orbitals lifted into the full
+    AO basis (the rows corresponding to fragment AOs are zero).
+    """
+    env_idx = _as_cupy(env_idx)
+    if core_orb.size == 0:
+        return cp.zeros((nao, nao), dtype=float)
+    C_core = cp.zeros((nao, core_orb.shape[1]), dtype=float)
+    C_core[env_idx, :] = core_orb
+    return 2.0 * (C_core @ C_core.T)
+
+
+# ---------------------------------------------------------------------------
+# Hamiltonian transformations
+# ---------------------------------------------------------------------------
+def transform_h1(h_ao, B):
+    """
+    Project a 1-electron operator from the full AO basis to the
+    embedded basis: ``h_emb = B^T h_ao B``.
+    """
+    h_emb = B.T @ h_ao @ B
+    return h_emb
+
+
+def transform_eri(mol, B):
+    """
+    Transform the four-index two-electron repulsion integrals from the
+    full AO basis to the embedded basis using ``pyscf.ao2mo``:
+
+        V^A_{xy,zw} = sum_{rstu} B^r_x B^s_y V^{rs}_{tu} B^t_z B^u_w.
+
+    The result is returned in 4-fold symmetric packed form, suitable
+    for assignment to ``mf._eri`` of an SCF object.
+
+    Parameters
+    ----------
+    mol : pyscf.gto.Mole
+        Full-system molecule providing the AO integrals.
+    B : cp.ndarray, shape (nao, nemb)
+        AO -> embedded transformation matrix.
+
+    Returns
+    -------
+    eri_emb : cp.ndarray
+        ERIs in the embedded basis (4-fold symmetric, packed).
+    """
+    nemb = B.shape[1]
+    # pyscf.ao2mo requires CPU numpy arrays
+    B_cpu = cp.asnumpy(B)
+    eri_emb = ao2mo.kernel(mol, B_cpu, compact=True)
+    # ``ao2mo.kernel`` already returns the 4-fold packed form for
+    # real, equal-MOs inputs; ensure consistent shape.
+    eri_emb = ao2mo.restore(4, eri_emb, nemb)
+    return cp.asarray(eri_emb)
+
+
+# ---------------------------------------------------------------------------
+# Embedded Mole helper
+# ---------------------------------------------------------------------------
+def _build_embedded_mole(nemb, n_emb_electrons, spin=0,
+                         verbose=0, max_memory=4000):
+    """
+    Build a placeholder ``pyscf.gto.Mole`` whose only role is to carry
+    the bookkeeping needed by a PySCF SCF driver: the number of
+    electrons, the number of orbitals, and the ``incore_anyway`` flag
+    (so that the driver consumes ``mf._eri`` directly instead of
+    rebuilding integrals from atomic basis functions).
+    """
+    if n_emb_electrons < 0:
+        raise ValueError(
+            f"Embedded electron count {n_emb_electrons} is negative; "
+            "check the fragment definition and the Schmidt threshold."
+        )
+    if n_emb_electrons > 2 * nemb:
+        raise ValueError(
+            f"Embedded electron count {n_emb_electrons} exceeds "
+            f"2 * nemb = {2 * nemb}; the embedded space is too small."
+        )
+
+    mol = gto.Mole()
+    mol.verbose = verbose
+    mol.max_memory = max_memory
+    mol.atom = []
+    mol.basis = {}
+    mol.unit = 'Bohr'
+    mol.spin = spin
+    mol.nelectron = int(n_emb_electrons)
+    mol.charge = 0
+    mol.incore_anyway = True
+    mol.build(parse_arg=False, dump_input=False)
+
+    # Override the basis-counting helpers so PySCF treats the molecule
+    # as having exactly nemb orbitals.
+    nemb_int = int(nemb)
+
+    def _nao_nr(self=mol, _n=nemb_int):
+        return _n
+
+    mol.nao_nr = _nao_nr
+    mol.nao = nemb_int
+    return mol
+
+
+def _instantiate_inner_mf(mf_template, embedded_mol):
+    """
+    Create an SCF/DFT object on ``embedded_mol`` that mirrors
+    the type/configuration of ``mf_template``. 
+    """
+    cls = type(mf_template)
+    try:
+        new_mf = cls(embedded_mol)
+    except TypeError:
+        new_mf = copy.copy(mf_template)
+        new_mf.mol = embedded_mol
+        new_mf.mo_coeff = None
+        new_mf.mo_energy = None
+        new_mf.mo_occ = None
+        new_mf.converged = False
+
+    # Propagate selected configuration parameters
+    for attr in ('xc', 'conv_tol', 'conv_tol_grad', 'max_cycle',
+                 'level_shift', 'damp', 'diis', 'verbose'):
+        if hasattr(mf_template, attr):
+            try:
+                setattr(new_mf, attr, getattr(mf_template, attr))
+            except Exception:
+                pass
+
+    if hasattr(mf_template, 'grids') and hasattr(new_mf, 'grids'):
+        for g_attr in ('level', 'prune', 'atom_grid'):
+            if hasattr(mf_template.grids, g_attr):
+                try:
+                    setattr(new_mf.grids, g_attr,
+                            getattr(mf_template.grids, g_attr))
+                except Exception:
+                    pass
+
+    return new_mf
+
+
+# ---------------------------------------------------------------------------
+# Main driver
+# ---------------------------------------------------------------------------
+class DMET:
+    """
+    Single-shot Density Matrix Embedding Theory driver.
+
+    Parameters
+    ----------
+    mf_outer : SCF object (gpu4pyscf)
+        Low-level mean-field on the full system. Must be (or be made)
+        converged before its 1-RDM is consumed. If ``mf_outer`` does
+        not yet hold a converged MO set, ``kernel()`` will run it.
+    mf_inner : SCF/DFT object (gpu4pyscf)
+        High-level mean-field template applied to the embedded cluster.
+        A fresh PySCF object of the same class is instantiated on
+        the embedded "mole" and patched with the embedded Hamiltonian
+        (h^A, V^A). The user-supplied object is left untouched.
+    frag_atoms : sequence of int, optional
+        Atom indices that define the fragment region A. Mutually
+        exclusive with ``frag_orbs``.
+    frag_orbs : sequence of int, optional
+        Explicit AO indices defining the fragment region.
+    threshold : float
+        Eigenvalue cutoff used to classify environment orbitals into
+        core / bath / virtual. Defaults to 1e-5.
+    """
+
+    def __init__(self, mf_outer, mf_inner,
+                 frag_atoms=None, frag_orbs=None,
+                 threshold=1e-5):
+        if mf_outer is None or mf_inner is None:
+            raise ValueError("mf_outer and mf_inner are both required.")
+        if frag_atoms is None and frag_orbs is None:
+            raise ValueError(
+                "Provide either 'frag_atoms' or 'frag_orbs' to define "
+                "the DMET fragment."
+            )
+        if frag_atoms is not None and frag_orbs is not None:
+            raise ValueError(
+                "Specify only one of 'frag_atoms' or 'frag_orbs'."
+            )
+        if not (0.0 < threshold < 1.0):
+            raise ValueError(
+                f"threshold must lie in (0, 1); got {threshold}."
+            )
+
+        self.mf_outer = mf_outer
+        self.mf_inner_template = mf_inner
+        self.full_mol = mf_outer.mol
+        self.threshold = float(threshold)
+
+        nao = int(self.full_mol.nao_nr())
+        if frag_atoms is not None:
+            self.frag_atoms = list(int(a) for a in frag_atoms)
+            self.frag_idx = get_fragment_ao_indices(
+                self.full_mol, self.frag_atoms)
+        else:
+            self.frag_atoms = None
+            self.frag_idx = cp.asarray(sorted(int(i) for i in frag_orbs),
+                                       dtype=cp.int32)
+
+        all_idx = cp.arange(nao, dtype=cp.int32)
+        env_mask = cp.ones(nao, dtype=bool)
+        env_mask[self.frag_idx] = False
+        self.env_idx = all_idx[env_mask]
+
+        # ---- intermediate / output caches ----
+        self.bath_orb = None         # (n_env, n_bath)
+        self.core_orb = None         # (n_env, n_core)
+        self.eig_info = None         # dict from schmidt_decompose
+        self.B = None                # AO -> embedded basis transform
+        self.dm_core = None          # full-AO core density matrix
+        self.h_emb = None            # embedded 1e Hamiltonian (cupy)
+        self.eri_emb = None          # embedded 2e Hamiltonian (cupy)
+        self.e_core = None           # core energy contribution
+        self.e_nuc = None            # nuclear repulsion energy
+        self.mf_inner = None         # patched inner SCF object
+        self.dm_emb_init = None      # initial embedded density matrix
+        self.e_inner = None          # inner SCF total energy w/ overrides
+        self.e_tot = None            # final DMET total energy
+
+    # ------------------------------------------------------------------
+    # Step 1: ensure low-level mean-field is converged
+    # ------------------------------------------------------------------
+    def _ensure_outer_converged(self):
+        if getattr(self.mf_outer, 'mo_coeff', None) is None or not getattr(self.mf_outer, 'converged', True):
+            self.mf_outer.kernel()
+
+    # ------------------------------------------------------------------
+    # Step 2: bath construction
+    # ------------------------------------------------------------------
+    def build_bath(self):
+        """
+        Run the Schmidt decomposition on the environment block of the
+        outer-SCF density matrix expressed in the Loewdin orthonormal
+        AO (OAO) basis. Populates ``self.bath_orb``, ``self.core_orb``,
+        ``self.eig_info``, ``self.B_oao``, ``self.X``, and ``self.B``
+        (the AO coefficients of the embedded orbitals).
+        """
+        self._ensure_outer_converged()
+        dm_full_ao = _as_cupy(self.mf_outer.make_rdm1())
+
+        # Loewdin orthogonalization of the AO basis
+        s_ao = _as_cupy(self.mf_outer.get_ovlp())
+        X, X_inv = lowdin_orth(s_ao)
+        # 1-RDM in the OAO basis: D' = S^{1/2} D S^{1/2}
+        dm_full_oao = X_inv @ dm_full_ao @ X_inv
+
+        bath_orb, core_orb, info = schmidt_decompose(
+            dm_full_oao, self.frag_idx, self.env_idx, self.threshold)
+
+        nao_oao = X.shape[1]
+        # OAO -> embedded transformation
+        B_oao = build_embedding_basis(nao_oao, self.frag_idx, self.env_idx,
+                                      bath_orb)
+        # AO coefficients of the embedded orbitals: C_emb = X B'
+        B_ao = X @ B_oao
+
+        # Core orbitals lifted from OAO env subspace into the AO basis.
+        if core_orb.size > 0:
+            C_core_oao = cp.zeros((nao_oao, core_orb.shape[1]), dtype=float)
+            C_core_oao[self.env_idx, :] = core_orb
+            C_core_ao = X @ C_core_oao
+            dm_core_ao = 2.0 * (C_core_ao @ C_core_ao.T)
+        else:
+            dm_core_ao = cp.zeros_like(dm_full_ao)
+
+        self.X = X
+        self.X_inv = X_inv
+        self.bath_orb = bath_orb
+        self.core_orb = core_orb
+        self.eig_info = info
+        self.B_oao = B_oao        # OAO -> embedded
+        self.B = B_ao             # AO  -> embedded (orthonormal columns)
+        self.dm_core = dm_core_ao
+        return self
+
+    # ------------------------------------------------------------------
+    # Step 3: build the embedded Hamiltonian
+    # ------------------------------------------------------------------
+    def build_embedded_hamiltonian(self):
+        """
+        Construct h^A and V^A in the embedded basis A and the
+        constant core energy.
+        """
+        if self.B is None:
+            self.build_bath()
+
+        mol = self.full_mol
+        # Bare 1e Hamiltonian on the full AO basis. Use the outer-mf
+        # implementation to inherit any custom modifications (ECPs,
+        # external charges, etc.).
+        h_ao = _as_cupy(self.mf_outer.get_hcore())
+
+        # Mean-field potential generated by the unentangled-occupied
+        # core orbitals in the full AO basis.
+        if self.eig_info['n_core_electrons'] > 0:
+            vj_core, vk_core = self.mf_outer.get_jk(mol, self.dm_core)
+            v_core_ao = _as_cupy(vj_core) - 0.5 * _as_cupy(vk_core)
+        else:
+            v_core_ao = cp.zeros_like(h_ao)
+
+        # 1-electron Hamiltonian in the embedded basis
+        h_emb = transform_h1(h_ao + v_core_ao, self.B)
+
+        # 2-electron Hamiltonian in the embedded basis
+        eri_emb = transform_eri(mol, self.B)
+
+        # Constant core energy: 1/2 Tr[D_core (h + (h + v_core))]
+        # = Tr[D_core h] + 1/2 Tr[D_core v_core]
+        if self.eig_info['n_core_electrons'] > 0:
+            e_core = (cp.einsum('ij,ji->', self.dm_core, h_ao)
+                      + 0.5 * cp.einsum('ij,ji->', self.dm_core, v_core_ao))
+        else:
+            e_core = 0.0
+
+        self.h_emb = h_emb
+        self.eri_emb = eri_emb
+        self.e_core = float(e_core)
+        self.e_nuc = float(mol.energy_nuc())
+        return self
+
+    # ------------------------------------------------------------------
+    # Step 4: build / patch the inner SCF object and solve
+    # ------------------------------------------------------------------
+    def _build_inner_mf(self):
+        """Instantiate the inner SCF on the embedded mole."""
+        if self.h_emb is None:
+            self.build_embedded_hamiltonian()
+
+        nemb = self.B.shape[1]
+        n_total_electrons = int(self.full_mol.nelectron)
+        n_emb_electrons = n_total_electrons \
+            - int(self.eig_info['n_core_electrons'])
+
+        emb_mol = _build_embedded_mole(
+            nemb=nemb,
+            n_emb_electrons=n_emb_electrons,
+            spin=int(getattr(self.full_mol, 'spin', 0)),
+            verbose=int(getattr(self.full_mol, 'verbose', 0)),
+            max_memory=int(getattr(self.full_mol, 'max_memory', 4000)),
+        )
+
+        mf_inner = _instantiate_inner_mf(self.mf_inner_template, emb_mol)
+
+        # ----- Patch the underlying Hamiltonian -----
+        h_emb = self.h_emb
+        ovlp = cp.eye(nemb)
+
+        mf_inner.get_hcore = lambda *args, **kwargs: h_emb
+        mf_inner.get_ovlp = lambda *args, **kwargs: ovlp
+        mf_inner.energy_nuc = lambda *args, **kwargs: self.e_nuc + self.e_core
+
+        # Use ao2mo's 8-fold packed format for the in-core ERIs so
+        # PySCF's optimized JK routines can be reused.
+        eri_emb_cpu = cp.asnumpy(self.eri_emb)
+        eri_8fold = ao2mo.restore(8, eri_emb_cpu, nemb)
+        mf_inner._eri = cp.asarray(eri_8fold)
+
+        # Initial guess: project the outer 1-RDM into the embedded
+        # basis. With C_emb expressed in AO coefficients, the projector
+        # is C_emb^T S D_AO S C_emb (which equals B_oao^T D_OAO B_oao).
+        s_ao = _as_cupy(self.mf_outer.get_ovlp())
+        dm_full_ao = _as_cupy(self.mf_outer.make_rdm1())
+        sB = s_ao @ self.B
+        dm_emb_init = sB.T @ dm_full_ao @ sB
+        
+        # Ensure exact electron count consistency
+        trace = float(cp.trace(dm_emb_init))
+        if trace > 0:
+            dm_emb_init = dm_emb_init * (n_emb_electrons / trace)
+        self.dm_emb_init = dm_emb_init
+
+        self.mf_inner = mf_inner
+        return mf_inner
+
+    def solve_embedded(self):
+        """Run the high-level embedded SCF and return its total energy."""
+        if self.mf_inner is None:
+            self._build_inner_mf()
+
+        e_inner = self.mf_inner.kernel(dm0=self.dm_emb_init)
+        if isinstance(e_inner, tuple):
+            e_inner = float(self.mf_inner.e_tot)
+        else:
+            e_inner = float(e_inner)
+        self.e_inner = e_inner
+        return e_inner
+
+    # ------------------------------------------------------------------
+    # Public entry point
+    # ------------------------------------------------------------------
+    def kernel(self):
+        """
+        Drive the full single-shot DMET workflow and return the total
+        energy.
+
+        E_DMET = E_inner_total
+
+        Note: the inner SCF's ``energy_nuc`` is set to (E_nuc + E_core),
+        so the energy returned by the inner solver already accounts for
+        the nuclear repulsion of the full system and the mean-field
+        contribution of the unentangled-occupied core orbitals.
+        """
+        self.build_bath()
+        self.build_embedded_hamiltonian()
+        self._build_inner_mf()
+        e_inner = self.solve_embedded()
+        self.e_tot = float(e_inner)
+        return self.e_tot
+
+    # ------------------------------------------------------------------
+    # Diagnostics
+    # ------------------------------------------------------------------
+    def energy_decomposition(self):
+        """
+        Return a dictionary describing the various energy contributions
+        gathered during the DMET calculation.
+        """
+        if self.e_tot is None:
+            self.kernel()
+        return {
+            'E_nuc':   self.e_nuc,
+            'E_core':  self.e_core,
+            'E_inner': self.e_inner,
+            'E_DMET':  self.e_tot,
+        }
+
+    def bath_summary(self):
+        """
+        Return a brief description of the Schmidt decomposition
+        outcome: the sizes of the fragment, bath, core and virtual
+        spaces, and the eigenvalue arrays of each environment block.
+        """
+        if self.eig_info is None:
+            self.build_bath()
+        return {
+            'n_fragment_aos': int(self.frag_idx.size),
+            'n_bath':         int(self.bath_orb.shape[1]),
+            'n_core':         int(self.core_orb.shape[1]),
+            'n_virtual':      int(self.eig_info['virtual'].size),
+            'core_eigvals':   self.eig_info['core'],
+            'bath_eigvals':   self.eig_info['bath'],
+            'virt_eigvals':   self.eig_info['virtual'],
+            'n_core_electrons': int(self.eig_info['n_core_electrons']),
+        }
+
+    def __call__(self):
+        """Allow ``DMET(...)()`` invocation in the PySCF mf style."""
+        return self.kernel()
diff --git a/gpu4pyscf/dmet/tests/test_dmet.py b/gpu4pyscf/dmet/tests/test_dmet.py
new file mode 100644
index 000000000..a3c8194ea
--- /dev/null
+++ b/gpu4pyscf/dmet/tests/test_dmet.py
@@ -0,0 +1,121 @@
+# Copyright 2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Basic correctness tests for the single-shot DMET driver.
+
+The cancellation property used here:
+
+    For a closed-shell system computed at the SAME mean-field level
+    (i.e. ``mf_inner`` and ``mf_outer`` share the same method and the
+    same orbital basis), the single-shot DMET total energy must
+    reproduce the full-system mean-field total energy exactly.
+"""
+
+import unittest
+import numpy as np
+from pyscf import gto, scf
+
+from gpu4pyscf.dmet import DMET
+
+
+class KnownValues(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.mol = gto.M(
+            atom='''
+            H 0.0 0.0 0.00
+            H 0.0 0.0 0.74
+            H 0.0 0.0 2.20
+            H 0.0 0.0 2.94
+            ''',
+            basis='sto-3g',
+            verbose=0,
+        )
+        cls.mf_ref = scf.RHF(cls.mol)
+        cls.e_ref = cls.mf_ref.kernel()
+
+    def test_self_consistency_two_atom_fragment(self):
+        # A single-shot DMET with the same low- and high-level method
+        # must reproduce the full-system mean-field energy.
+        mf_outer = scf.RHF(self.mol)
+        mf_outer.kernel()
+
+        mf_inner_template = scf.RHF(self.mol)
+
+        dmet = DMET(
+            mf_outer=mf_outer,
+            mf_inner=mf_inner_template,
+            frag_atoms=[0, 1],
+            threshold=1e-8,
+        )
+        e_dmet = dmet.kernel()
+
+        self.assertAlmostEqual(e_dmet, self.e_ref, places=7)
+
+    def test_self_consistency_single_atom_fragment(self):
+        mf_outer = scf.RHF(self.mol)
+        mf_outer.kernel()
+
+        mf_inner_template = scf.RHF(self.mol)
+
+        dmet = DMET(
+            mf_outer=mf_outer,
+            mf_inner=mf_inner_template,
+            frag_atoms=[0],
+            threshold=1e-8,
+        )
+        e_dmet = dmet.kernel()
+        self.assertAlmostEqual(e_dmet, self.e_ref, places=7)
+
+    def test_bath_summary(self):
+        mf_outer = scf.RHF(self.mol)
+        mf_outer.kernel()
+
+        dmet = DMET(
+            mf_outer=mf_outer,
+            mf_inner=scf.RHF(self.mol),
+            frag_atoms=[0, 1],
+            threshold=1e-6,
+        )
+        dmet.build_bath()
+        info = dmet.bath_summary()
+        # Two H atoms in STO-3G means 2 fragment AOs.
+        self.assertEqual(info['n_fragment_aos'], 2)
+        # Number of (bath + core + virtual) eigenvalues equals the
+        # environment AO count.
+        self.assertEqual(
+            info['n_bath'] + info['n_core'] + info['n_virtual'],
+            self.mol.nao_nr() - info['n_fragment_aos'],
+        )
+
+    def test_decomposition_keys(self):
+        mf_outer = scf.RHF(self.mol)
+        mf_outer.kernel()
+
+        dmet = DMET(
+            mf_outer=mf_outer,
+            mf_inner=scf.RHF(self.mol),
+            frag_atoms=[0, 1],
+            threshold=1e-8,
+        )
+        dmet.kernel()
+        decomp = dmet.energy_decomposition()
+        for key in ('E_nuc', 'E_core', 'E_inner', 'E_DMET'):
+            self.assertIn(key, decomp)
+
+
+if __name__ == '__main__':
+    print("Tests for single-shot DMET")
+    unittest.main()

From 2385803cfc507ad1457953a9659de952977ba9c0 Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Tue, 12 May 2026 15:05:03 +0800
Subject: [PATCH 02/30] debug

---
 gpu4pyscf/dmet/dmet.py | 553 +++++++++++++----------------------------
 1 file changed, 172 insertions(+), 381 deletions(-)

diff --git a/gpu4pyscf/dmet/dmet.py b/gpu4pyscf/dmet/dmet.py
index 7c7a893d9..6c67408b5 100644
--- a/gpu4pyscf/dmet/dmet.py
+++ b/gpu4pyscf/dmet/dmet.py
@@ -28,22 +28,9 @@ def _as_cupy(x):
     return cp.asarray(x)
 
 
-# TODO: use already implemented lowdin_orth
 def lowdin_orth(s):
     """
     Loewdin symmetric orthogonalization.
-
-    Given an AO overlap matrix ``S``, return ``X = S^{-1/2}`` and
-    ``X_inv = S^{1/2}``. Eigenvalues of ``S`` smaller than 1e-12 are
-    treated as linearly dependent and dropped.
-
-    Returns
-    -------
-    X : cp.ndarray, shape (nao, nao_orth)
-        AO -> orthonormal AO transformation. Columns of ``X`` are the
-        coefficients of the orthonormal AOs in the AO basis.
-    X_inv : cp.ndarray, shape (nao_orth, nao)
-        Inverse transformation: ``X_inv = X^T S``.
     """
     s = _as_cupy(s)
     s = 0.5 * (s + s.T)
@@ -62,81 +49,32 @@ def lowdin_orth(s):
 def get_fragment_ao_indices(mol, frag_atoms):
     """
     Return the atomic-orbital indices that belong to the listed atoms.
-
-    Parameters
-    ----------
-    mol : pyscf.gto.Mole
-        The full system molecule.
-    frag_atoms : sequence of int
-        Atom indices that constitute the fragment.
-
-    Returns
-    -------
-    ao_indices : cp.ndarray of int
-        Sorted AO indices (in the AO ordering of ``mol``) that belong
-        to ``frag_atoms``.
     """
     aoslice = mol.aoslice_by_atom()
     indices = []
     for ia in frag_atoms:
         ia = int(ia)
         if ia < 0 or ia >= mol.natm:
-            raise ValueError(
-                f"Atom index {ia} is out of range [0, {mol.natm})."
-            )
+            raise ValueError(f"Atom index {ia} is out of range [0, {mol.natm}).")
         p0, p1 = int(aoslice[ia, 2]), int(aoslice[ia, 3])
         indices.extend(range(p0, p1))
     indices = cp.asarray(sorted(indices), dtype=cp.int32)
     if indices.size == 0:
-        raise ValueError(
-            "Fragment is empty: no atomic orbitals were selected."
-        )
+        raise ValueError("Fragment is empty: no atomic orbitals were selected.")
     return indices
 
 
-def schmidt_decompose(dm_full, frag_idx, env_idx, threshold=1e-5):
+def schmidt_decompose(dm_full, env_idx, threshold=1e-5):
     """
     Schmidt decomposition.
-
-    Parameters
-    ----------
-    dm_full : array_like, shape (nao, nao)
-        Spin-summed 1-RDM in the full AO basis. The trace equals the
-        number of electrons.
-    frag_idx, env_idx : cp.ndarray
-        AO indices of fragment and environment, respectively.
-        ``frag_idx`` and ``env_idx`` together must form a partition of
-        ``range(nao)``.
-    threshold : float
-        Eigenvalue cutoff used to classify the environment orbitals.
-
-    Returns
-    -------
-    bath_orb : cp.ndarray, shape (n_env, n_bath)
-        Eigenvectors of D^E whose eigenvalues are within
-        (threshold, 2 - threshold).
-    core_orb : cp.ndarray, shape (n_env, n_core)
-        Eigenvectors of D^E whose eigenvalues exceed 2 - threshold.
-        These define the unentangled occupied (core) orbitals.
-    info : dict
-        Dictionary with eigenvalue arrays for each category and the
-        electron count of the core space.
     """
     dm = _as_cupy(dm_full)
-    if dm.ndim != 2 or dm.shape[0] != dm.shape[1]:
-        raise ValueError("dm_full must be a square 2D matrix.")
-
     env_idx = _as_cupy(env_idx)
     if env_idx.size == 0:
-        # Pure fragment, no environment to entangle with.
         return (cp.zeros((0, 0)),
                 cp.zeros((0, 0)),
-                {'core': cp.zeros(0),
-                 'bath': cp.zeros(0),
-                 'virtual': cp.zeros(0),
-                 'n_core_electrons': 0})
+                {'core': cp.zeros(0), 'bath': cp.zeros(0), 'virtual': cp.zeros(0), 'n_core_electrons': 0})
 
-    # Symmetrize to suppress numerical asymmetry from the SCF solver
     D_env = dm[env_idx[:, None], env_idx[None, :]]
     D_env = 0.5 * (D_env + D_env.T)
 
@@ -153,8 +91,6 @@ def schmidt_decompose(dm_full, frag_idx, env_idx, threshold=1e-5):
         'core':    eigvals[is_core],
         'bath':    eigvals[is_bath],
         'virtual': eigvals[is_virt],
-        # Each unentangled-occupied orbital is doubly occupied in the
-        # spin-restricted formulation.
         'n_core_electrons': 2 * int(is_core.sum()),
     }
     return bath_orb, core_orb, info
@@ -163,26 +99,6 @@ def schmidt_decompose(dm_full, frag_idx, env_idx, threshold=1e-5):
 def build_embedding_basis(nao, frag_idx, env_idx, bath_orb):
     """
     Construct the AO -> embedded transformation matrix B.
-
-    Columns of B are arranged as
-        [ fragment-AO basis (identity columns),
-          bath orbitals (eigenvectors lifted into the env block) ].
-
-    Parameters
-    ----------
-    nao : int
-        Number of atomic orbitals in the full system.
-    frag_idx : cp.ndarray of int
-        AO indices of the fragment.
-    env_idx : cp.ndarray of int
-        AO indices of the environment.
-    bath_orb : cp.ndarray, shape (n_env, n_bath)
-        Bath orbitals expressed in the environment AO subspace.
-
-    Returns
-    -------
-    B : cp.ndarray, shape (nao, n_frag + n_bath)
-        Transformation matrix whose columns span the embedded space A.
     """
     frag_idx = _as_cupy(frag_idx)
     env_idx = _as_cupy(env_idx)
@@ -190,9 +106,7 @@ def build_embedding_basis(nao, frag_idx, env_idx, bath_orb):
     n_bath = bath_orb.shape[1] if bath_orb.size else 0
 
     B = cp.zeros((nao, n_frag + n_bath), dtype=float)
-    # Fragment columns: identity on fragment AOs
     B[frag_idx, cp.arange(n_frag)] = 1.0
-    # Bath columns: embed env eigenvectors into the env rows
     if n_bath > 0:
         B[env_idx[:, None], cp.arange(n_bath)[None, :] + n_frag] = bath_orb
     return B
@@ -200,14 +114,7 @@ def build_embedding_basis(nao, frag_idx, env_idx, bath_orb):
 
 def build_core_dm(env_idx, core_orb, nao):
     """
-    Build the spin-summed core 1-RDM in the full AO basis.
-
-    Each unentangled-occupied orbital is doubly occupied:
-
-        D_core = 2 * C_core C_core^T,
-
-    where C_core is the matrix of core orbitals lifted into the full
-    AO basis (the rows corresponding to fragment AOs are zero).
+    Build the core 1-RDM in the full AO basis.
     """
     env_idx = _as_cupy(env_idx)
     if core_orb.size == 0:
@@ -217,72 +124,27 @@ def build_core_dm(env_idx, core_orb, nao):
     return 2.0 * (C_core @ C_core.T)
 
 
-# ---------------------------------------------------------------------------
-# Hamiltonian transformations
-# ---------------------------------------------------------------------------
 def transform_h1(h_ao, B):
     """
-    Project a 1-electron operator from the full AO basis to the
-    embedded basis: ``h_emb = B^T h_ao B``.
+    Project a 1-electron operator from the full AO basis to the embedded basis.
     """
-    h_emb = B.T @ h_ao @ B
-    return h_emb
+    return B.T @ h_ao @ B
 
 
 def transform_eri(mol, B):
     """
-    Transform the four-index two-electron repulsion integrals from the
-    full AO basis to the embedded basis using ``pyscf.ao2mo``:
-
-        V^A_{xy,zw} = sum_{rstu} B^r_x B^s_y V^{rs}_{tu} B^t_z B^u_w.
-
-    The result is returned in 4-fold symmetric packed form, suitable
-    for assignment to ``mf._eri`` of an SCF object.
-
-    Parameters
-    ----------
-    mol : pyscf.gto.Mole
-        Full-system molecule providing the AO integrals.
-    B : cp.ndarray, shape (nao, nemb)
-        AO -> embedded transformation matrix.
-
-    Returns
-    -------
-    eri_emb : cp.ndarray
-        ERIs in the embedded basis (4-fold symmetric, packed).
+    Transform the four-index two-electron repulsion integrals from the full AO basis.
     """
     nemb = B.shape[1]
-    # pyscf.ao2mo requires CPU numpy arrays
     B_cpu = cp.asnumpy(B)
     eri_emb = ao2mo.kernel(mol, B_cpu, compact=True)
-    # ``ao2mo.kernel`` already returns the 4-fold packed form for
-    # real, equal-MOs inputs; ensure consistent shape.
     eri_emb = ao2mo.restore(4, eri_emb, nemb)
     return cp.asarray(eri_emb)
 
 
-# ---------------------------------------------------------------------------
-# Embedded Mole helper
-# ---------------------------------------------------------------------------
-def _build_embedded_mole(nemb, n_emb_electrons, spin=0,
-                         verbose=0, max_memory=4000):
-    """
-    Build a placeholder ``pyscf.gto.Mole`` whose only role is to carry
-    the bookkeeping needed by a PySCF SCF driver: the number of
-    electrons, the number of orbitals, and the ``incore_anyway`` flag
-    (so that the driver consumes ``mf._eri`` directly instead of
-    rebuilding integrals from atomic basis functions).
-    """
-    if n_emb_electrons < 0:
-        raise ValueError(
-            f"Embedded electron count {n_emb_electrons} is negative; "
-            "check the fragment definition and the Schmidt threshold."
-        )
-    if n_emb_electrons > 2 * nemb:
-        raise ValueError(
-            f"Embedded electron count {n_emb_electrons} exceeds "
-            f"2 * nemb = {2 * nemb}; the embedded space is too small."
-        )
+def _build_embedded_mole(nemb, n_emb_electrons, spin=0, verbose=0, max_memory=4000):
+    if n_emb_electrons < 0 or n_emb_electrons > 2 * nemb:
+        raise ValueError(f"Invalid embedded electron count: {n_emb_electrons}")
 
     mol = gto.Mole()
     mol.verbose = verbose
@@ -296,10 +158,7 @@ def _build_embedded_mole(nemb, n_emb_electrons, spin=0,
     mol.incore_anyway = True
     mol.build(parse_arg=False, dump_input=False)
 
-    # Override the basis-counting helpers so PySCF treats the molecule
-    # as having exactly nemb orbitals.
     nemb_int = int(nemb)
-
     def _nao_nr(self=mol, _n=nemb_int):
         return _n
 
@@ -309,10 +168,6 @@ def _nao_nr(self=mol, _n=nemb_int):
 
 
 def _instantiate_inner_mf(mf_template, embedded_mol):
-    """
-    Create an SCF/DFT object on ``embedded_mol`` that mirrors
-    the type/configuration of ``mf_template``. 
-    """
     cls = type(mf_template)
     try:
         new_mf = cls(embedded_mol)
@@ -324,7 +179,6 @@ def _instantiate_inner_mf(mf_template, embedded_mol):
         new_mf.mo_occ = None
         new_mf.converged = False
 
-    # Propagate selected configuration parameters
     for attr in ('xc', 'conv_tol', 'conv_tol_grad', 'max_cycle',
                  'level_shift', 'damp', 'diis', 'verbose'):
         if hasattr(mf_template, attr):
@@ -337,322 +191,259 @@ def _instantiate_inner_mf(mf_template, embedded_mol):
         for g_attr in ('level', 'prune', 'atom_grid'):
             if hasattr(mf_template.grids, g_attr):
                 try:
-                    setattr(new_mf.grids, g_attr,
-                            getattr(mf_template.grids, g_attr))
+                    setattr(new_mf.grids, g_attr, getattr(mf_template.grids, g_attr))
                 except Exception:
                     pass
 
     return new_mf
 
 
-# ---------------------------------------------------------------------------
-# Main driver
-# ---------------------------------------------------------------------------
 class DMET:
     """
-    Single-shot Density Matrix Embedding Theory driver.
+    Density Matrix Embedding Theory driver with macroscopic iteration.
 
     Parameters
     ----------
     mf_outer : SCF object (gpu4pyscf)
-        Low-level mean-field on the full system. Must be (or be made)
-        converged before its 1-RDM is consumed. If ``mf_outer`` does
-        not yet hold a converged MO set, ``kernel()`` will run it.
+        Low-level mean-field on the full system.
     mf_inner : SCF/DFT object (gpu4pyscf)
         High-level mean-field template applied to the embedded cluster.
-        A fresh PySCF object of the same class is instantiated on
-        the embedded "mole" and patched with the embedded Hamiltonian
-        (h^A, V^A). The user-supplied object is left untouched.
-    frag_atoms : sequence of int, optional
-        Atom indices that define the fragment region A. Mutually
-        exclusive with ``frag_orbs``.
-    frag_orbs : sequence of int, optional
-        Explicit AO indices defining the fragment region.
+    fragments : list of lists of int
+        List of fragments, where each fragment is a list of atom indices.
     threshold : float
-        Eigenvalue cutoff used to classify environment orbitals into
-        core / bath / virtual. Defaults to 1e-5.
+        Eigenvalue cutoff used to classify environment orbitals.
+    max_macro_iter : int
+        Maximum number of macroscopic iterations for correlation potential (u).
+    macro_tol : float
+        Convergence tolerance for the difference in fragment 1-RDMs.
     """
 
-    def __init__(self, mf_outer, mf_inner,
-                 frag_atoms=None, frag_orbs=None,
-                 threshold=1e-5):
+    def __init__(self, mf_outer, mf_inner, fragments,
+                 threshold=1e-5, max_macro_iter=20, macro_tol=1e-4):
         if mf_outer is None or mf_inner is None:
             raise ValueError("mf_outer and mf_inner are both required.")
-        if frag_atoms is None and frag_orbs is None:
-            raise ValueError(
-                "Provide either 'frag_atoms' or 'frag_orbs' to define "
-                "the DMET fragment."
-            )
-        if frag_atoms is not None and frag_orbs is not None:
-            raise ValueError(
-                "Specify only one of 'frag_atoms' or 'frag_orbs'."
-            )
-        if not (0.0 < threshold < 1.0):
-            raise ValueError(
-                f"threshold must lie in (0, 1); got {threshold}."
-            )
+        if not fragments:
+            raise ValueError("Provide a list of fragments to define the DMET regions.")
 
         self.mf_outer = mf_outer
         self.mf_inner_template = mf_inner
         self.full_mol = mf_outer.mol
         self.threshold = float(threshold)
+        self.max_macro_iter = max_macro_iter
+        self.macro_tol = macro_tol
 
+        self.fragments = [list(int(a) for a in frag) for frag in fragments]
+        self.nfrags = len(self.fragments)
+        
         nao = int(self.full_mol.nao_nr())
-        if frag_atoms is not None:
-            self.frag_atoms = list(int(a) for a in frag_atoms)
-            self.frag_idx = get_fragment_ao_indices(
-                self.full_mol, self.frag_atoms)
-        else:
-            self.frag_atoms = None
-            self.frag_idx = cp.asarray(sorted(int(i) for i in frag_orbs),
-                                       dtype=cp.int32)
-
         all_idx = cp.arange(nao, dtype=cp.int32)
-        env_mask = cp.ones(nao, dtype=bool)
-        env_mask[self.frag_idx] = False
-        self.env_idx = all_idx[env_mask]
-
-        # ---- intermediate / output caches ----
-        self.bath_orb = None         # (n_env, n_bath)
-        self.core_orb = None         # (n_env, n_core)
-        self.eig_info = None         # dict from schmidt_decompose
-        self.B = None                # AO -> embedded basis transform
-        self.dm_core = None          # full-AO core density matrix
-        self.h_emb = None            # embedded 1e Hamiltonian (cupy)
-        self.eri_emb = None          # embedded 2e Hamiltonian (cupy)
-        self.e_core = None           # core energy contribution
-        self.e_nuc = None            # nuclear repulsion energy
-        self.mf_inner = None         # patched inner SCF object
-        self.dm_emb_init = None      # initial embedded density matrix
-        self.e_inner = None          # inner SCF total energy w/ overrides
-        self.e_tot = None            # final DMET total energy
-
-    # ------------------------------------------------------------------
-    # Step 1: ensure low-level mean-field is converged
-    # ------------------------------------------------------------------
-    def _ensure_outer_converged(self):
-        if getattr(self.mf_outer, 'mo_coeff', None) is None or not getattr(self.mf_outer, 'converged', True):
-            self.mf_outer.kernel()
-
-    # ------------------------------------------------------------------
-    # Step 2: bath construction
-    # ------------------------------------------------------------------
-    def build_bath(self):
+        
+        self.frag_idx = []
+        self.env_idx = []
+        for frag_atoms in self.fragments:
+            f_idx = get_fragment_ao_indices(self.full_mol, frag_atoms)
+            self.frag_idx.append(f_idx)
+            env_mask = cp.ones(nao, dtype=bool)
+            env_mask[f_idx] = False
+            self.env_idx.append(all_idx[env_mask])
+
+        # ---- intermediate / output caches (lists for multiple fragments) ----
+        self.bath_orb = [None] * self.nfrags
+        self.core_orb = [None] * self.nfrags
+        self.eig_info = [None] * self.nfrags
+        self.B_oao = [None] * self.nfrags
+        self.B = [None] * self.nfrags
+        self.dm_core = [None] * self.nfrags
+        self.h_emb = [None] * self.nfrags
+        self.eri_emb = [None] * self.nfrags
+        self.e_core = [None] * self.nfrags
+        self.mf_inner = [None] * self.nfrags
+        self.dm_emb_init = [None] * self.nfrags
+        self.e_inner = [None] * self.nfrags
+        self.e_tot = None            
+        self.u = cp.zeros((nao, nao))  # Global correlation potential
+
+    def build_bath(self, ifrag, dm_full_oao, X):
         """
-        Run the Schmidt decomposition on the environment block of the
-        outer-SCF density matrix expressed in the Loewdin orthonormal
-        AO (OAO) basis. Populates ``self.bath_orb``, ``self.core_orb``,
-        ``self.eig_info``, ``self.B_oao``, ``self.X``, and ``self.B``
-        (the AO coefficients of the embedded orbitals).
+        Run the Schmidt decomposition for a specific fragment.
         """
-        self._ensure_outer_converged()
-        dm_full_ao = _as_cupy(self.mf_outer.make_rdm1())
-
-        # Loewdin orthogonalization of the AO basis
-        s_ao = _as_cupy(self.mf_outer.get_ovlp())
-        X, X_inv = lowdin_orth(s_ao)
-        # 1-RDM in the OAO basis: D' = S^{1/2} D S^{1/2}
-        dm_full_oao = X_inv @ dm_full_ao @ X_inv
-
         bath_orb, core_orb, info = schmidt_decompose(
-            dm_full_oao, self.frag_idx, self.env_idx, self.threshold)
+            dm_full_oao, self.frag_idx[ifrag], self.env_idx[ifrag], self.threshold)
 
         nao_oao = X.shape[1]
-        # OAO -> embedded transformation
-        B_oao = build_embedding_basis(nao_oao, self.frag_idx, self.env_idx,
-                                      bath_orb)
-        # AO coefficients of the embedded orbitals: C_emb = X B'
+        B_oao = build_embedding_basis(nao_oao, self.frag_idx[ifrag], self.env_idx[ifrag], bath_orb)
         B_ao = X @ B_oao
 
-        # Core orbitals lifted from OAO env subspace into the AO basis.
         if core_orb.size > 0:
             C_core_oao = cp.zeros((nao_oao, core_orb.shape[1]), dtype=float)
-            C_core_oao[self.env_idx, :] = core_orb
+            C_core_oao[self.env_idx[ifrag], :] = core_orb
             C_core_ao = X @ C_core_oao
             dm_core_ao = 2.0 * (C_core_ao @ C_core_ao.T)
         else:
-            dm_core_ao = cp.zeros_like(dm_full_ao)
-
-        self.X = X
-        self.X_inv = X_inv
-        self.bath_orb = bath_orb
-        self.core_orb = core_orb
-        self.eig_info = info
-        self.B_oao = B_oao        # OAO -> embedded
-        self.B = B_ao             # AO  -> embedded (orthonormal columns)
-        self.dm_core = dm_core_ao
+            dm_core_ao = cp.zeros((X.shape[0], X.shape[0]), dtype=float)
+
+        self.bath_orb[ifrag] = bath_orb
+        self.core_orb[ifrag] = core_orb
+        self.eig_info[ifrag] = info
+        self.B_oao[ifrag] = B_oao        
+        self.B[ifrag] = B_ao             
+        self.dm_core[ifrag] = dm_core_ao
         return self
 
-    # ------------------------------------------------------------------
-    # Step 3: build the embedded Hamiltonian
-    # ------------------------------------------------------------------
-    def build_embedded_hamiltonian(self):
+    def build_embedded_hamiltonian(self, ifrag, hcore_orig):
         """
-        Construct h^A and V^A in the embedded basis A and the
-        constant core energy.
+        Construct h^A and V^A in the embedded basis A.
+        Uses bare hcore_orig (without the correlation potential 'u').
         """
-        if self.B is None:
-            self.build_bath()
-
         mol = self.full_mol
-        # Bare 1e Hamiltonian on the full AO basis. Use the outer-mf
-        # implementation to inherit any custom modifications (ECPs,
-        # external charges, etc.).
-        h_ao = _as_cupy(self.mf_outer.get_hcore())
-
-        # Mean-field potential generated by the unentangled-occupied
-        # core orbitals in the full AO basis.
-        if self.eig_info['n_core_electrons'] > 0:
-            vj_core, vk_core = self.mf_outer.get_jk(mol, self.dm_core)
+        h_ao = _as_cupy(hcore_orig)
+
+        if self.eig_info[ifrag]['n_core_electrons'] > 0:
+            vj_core, vk_core = self.mf_outer.get_jk(mol, self.dm_core[ifrag])
             v_core_ao = _as_cupy(vj_core) - 0.5 * _as_cupy(vk_core)
         else:
             v_core_ao = cp.zeros_like(h_ao)
 
-        # 1-electron Hamiltonian in the embedded basis
-        h_emb = transform_h1(h_ao + v_core_ao, self.B)
+        h_emb = transform_h1(h_ao + v_core_ao, self.B[ifrag])
+        eri_emb = transform_eri(mol, self.B[ifrag])
 
-        # 2-electron Hamiltonian in the embedded basis
-        eri_emb = transform_eri(mol, self.B)
-
-        # Constant core energy: 1/2 Tr[D_core (h + (h + v_core))]
-        # = Tr[D_core h] + 1/2 Tr[D_core v_core]
-        if self.eig_info['n_core_electrons'] > 0:
-            e_core = (cp.einsum('ij,ji->', self.dm_core, h_ao)
-                      + 0.5 * cp.einsum('ij,ji->', self.dm_core, v_core_ao))
+        if self.eig_info[ifrag]['n_core_electrons'] > 0:
+            e_core = (cp.einsum('ij,ji->', self.dm_core[ifrag], h_ao)
+                      + 0.5 * cp.einsum('ij,ji->', self.dm_core[ifrag], v_core_ao))
         else:
             e_core = 0.0
 
-        self.h_emb = h_emb
-        self.eri_emb = eri_emb
-        self.e_core = float(e_core)
-        self.e_nuc = float(mol.energy_nuc())
+        self.h_emb[ifrag] = h_emb
+        self.eri_emb[ifrag] = eri_emb
+        self.e_core[ifrag] = float(e_core)
         return self
 
-    # ------------------------------------------------------------------
-    # Step 4: build / patch the inner SCF object and solve
-    # ------------------------------------------------------------------
-    def _build_inner_mf(self):
+    def _build_inner_mf(self, ifrag, dm_full_ao):
         """Instantiate the inner SCF on the embedded mole."""
-        if self.h_emb is None:
-            self.build_embedded_hamiltonian()
-
-        nemb = self.B.shape[1]
+        nemb = self.B[ifrag].shape[1]
         n_total_electrons = int(self.full_mol.nelectron)
-        n_emb_electrons = n_total_electrons \
-            - int(self.eig_info['n_core_electrons'])
+        n_emb_electrons = n_total_electrons - int(self.eig_info[ifrag]['n_core_electrons'])
 
         emb_mol = _build_embedded_mole(
             nemb=nemb,
             n_emb_electrons=n_emb_electrons,
             spin=int(getattr(self.full_mol, 'spin', 0)),
-            verbose=int(getattr(self.full_mol, 'verbose', 0)),
+            verbose=0,
             max_memory=int(getattr(self.full_mol, 'max_memory', 4000)),
         )
 
         mf_inner = _instantiate_inner_mf(self.mf_inner_template, emb_mol)
 
-        # ----- Patch the underlying Hamiltonian -----
-        h_emb = self.h_emb
+        h_emb = self.h_emb[ifrag]
         ovlp = cp.eye(nemb)
 
+        # Base energy offset for debugging per fragment
+        e_nuc = float(self.full_mol.energy_nuc())
         mf_inner.get_hcore = lambda *args, **kwargs: h_emb
         mf_inner.get_ovlp = lambda *args, **kwargs: ovlp
-        mf_inner.energy_nuc = lambda *args, **kwargs: self.e_nuc + self.e_core
+        mf_inner.energy_nuc = lambda *args, **kwargs: e_nuc + self.e_core[ifrag]
 
-        # Use ao2mo's 8-fold packed format for the in-core ERIs so
-        # PySCF's optimized JK routines can be reused.
-        eri_emb_cpu = cp.asnumpy(self.eri_emb)
+        eri_emb_cpu = cp.asnumpy(self.eri_emb[ifrag])
         eri_8fold = ao2mo.restore(8, eri_emb_cpu, nemb)
         mf_inner._eri = cp.asarray(eri_8fold)
 
-        # Initial guess: project the outer 1-RDM into the embedded
-        # basis. With C_emb expressed in AO coefficients, the projector
-        # is C_emb^T S D_AO S C_emb (which equals B_oao^T D_OAO B_oao).
         s_ao = _as_cupy(self.mf_outer.get_ovlp())
-        dm_full_ao = _as_cupy(self.mf_outer.make_rdm1())
-        sB = s_ao @ self.B
+        sB = s_ao @ self.B[ifrag]
         dm_emb_init = sB.T @ dm_full_ao @ sB
         
-        # Ensure exact electron count consistency
         trace = float(cp.trace(dm_emb_init))
         if trace > 0:
             dm_emb_init = dm_emb_init * (n_emb_electrons / trace)
-        self.dm_emb_init = dm_emb_init
+        self.dm_emb_init[ifrag] = dm_emb_init
 
-        self.mf_inner = mf_inner
+        self.mf_inner[ifrag] = mf_inner
         return mf_inner
 
-    def solve_embedded(self):
-        """Run the high-level embedded SCF and return its total energy."""
-        if self.mf_inner is None:
-            self._build_inner_mf()
-
-        e_inner = self.mf_inner.kernel(dm0=self.dm_emb_init)
+    def solve_embedded(self, ifrag):
+        """Run the high-level embedded SCF for a specific fragment."""
+        e_inner = self.mf_inner[ifrag].kernel(dm0=self.dm_emb_init[ifrag])
         if isinstance(e_inner, tuple):
-            e_inner = float(self.mf_inner.e_tot)
+            e_inner = float(self.mf_inner[ifrag].e_tot)
         else:
             e_inner = float(e_inner)
-        self.e_inner = e_inner
+        self.e_inner[ifrag] = e_inner
         return e_inner
 
-    # ------------------------------------------------------------------
-    # Public entry point
-    # ------------------------------------------------------------------
     def kernel(self):
         """
-        Drive the full single-shot DMET workflow and return the total
-        energy.
+        Drive the macroscopic-iterating DMET workflow.
+        Returns the DMET total energy.
+        """
+        hcore_orig = _as_cupy(self.mf_outer.get_hcore())
+        s_ao = _as_cupy(self.mf_outer.get_ovlp())
+        X, X_inv = lowdin_orth(s_ao)
 
-        E_DMET = E_inner_total
+        for macro_iter in range(self.max_macro_iter):
+            # 1. Run low-level SCF with current correlation potential 'u'
+            self.mf_outer.get_hcore = lambda *args, **kwargs: cp.asnumpy(hcore_orig + self.u)
+            self.mf_outer.mo_coeff = None # Force re-run
+            self.mf_outer.kernel()
+            
+            dm_full_ao = _as_cupy(self.mf_outer.make_rdm1())
+            dm_full_oao = X_inv @ dm_full_ao @ X_inv
+
+            e_tot = 0.0
+            dm_inners = []
+
+            # 2. Loop over all fragments
+            for ifrag in range(self.nfrags):
+                self.build_bath(ifrag, dm_full_oao, X)
+                self.build_embedded_hamiltonian(ifrag, hcore_orig)
+                mf_inner = self._build_inner_mf(ifrag, dm_full_ao)
+                self.solve_embedded(ifrag)
+
+                dm_emb = _as_cupy(mf_inner.make_rdm1())
+                fock_emb = _as_cupy(mf_inner.get_fock(dm=mf_inner.make_rdm1()))
+                
+                # Transform inner DM back to full AO basis for D-matching
+                B = self.B[ifrag]
+                dm_inner_ao = B @ dm_emb @ B.T
+                dm_inners.append(dm_inner_ao)
+
+                # Extract Fragment Energy: 1/2 Tr_x [ D (h + F) ]
+                n_frag = self.frag_idx[ifrag].size
+                e_frag_elec = 0.5 * cp.sum(
+                    dm_emb[:n_frag, :] * (self.h_emb[ifrag][:n_frag, :] + fock_emb[:n_frag, :])
+                )
+                
+                # Extract Fragment Nuclear Energy
+                e_frag_nuc = 0.0
+                coords = self.full_mol.atom_coords()
+                charges = self.full_mol.atom_charges()
+                frag_atoms = self.fragments[ifrag]
+                for i in frag_atoms:
+                    for j in range(self.full_mol.natm):
+                        if i == j: continue
+                        r = np.linalg.norm(coords[i] - coords[j])
+                        factor = 0.5 if j in frag_atoms else 1.0
+                        e_frag_nuc += factor * charges[i] * charges[j] / r
+                
+                e_tot += float(e_frag_elec) + e_frag_nuc
+
+            # 3. Macroscopic iteration: update correlation potential 'u'
+            error = 0.0
+            for ifrag in range(self.nfrags):
+                idx = self.frag_idx[ifrag]
+                idx_mesh = cp.ix_(idx, idx)
+                # Cost function: \Delta D = D_inner - D_outer over fragment blocks
+                diff = dm_inners[ifrag][idx_mesh] - dm_full_ao[idx_mesh]
+                error += float(cp.linalg.norm(diff))
+                
+                # Simple gradient descent step with damping factor
+                self.u[idx_mesh] -= 0.5 * diff
+            
+            print(f"Macro Iter {macro_iter + 1:2d} | E_DMET = {e_tot:.8f} | max(dD) = {error:.6e}")
+            self.e_tot = e_tot
+            if error < self.macro_tol:
+                print("DMET macroscopic iterations converged.")
+                break
 
-        Note: the inner SCF's ``energy_nuc`` is set to (E_nuc + E_core),
-        so the energy returned by the inner solver already accounts for
-        the nuclear repulsion of the full system and the mean-field
-        contribution of the unentangled-occupied core orbitals.
-        """
-        self.build_bath()
-        self.build_embedded_hamiltonian()
-        self._build_inner_mf()
-        e_inner = self.solve_embedded()
-        self.e_tot = float(e_inner)
         return self.e_tot
 
-    # ------------------------------------------------------------------
-    # Diagnostics
-    # ------------------------------------------------------------------
-    def energy_decomposition(self):
-        """
-        Return a dictionary describing the various energy contributions
-        gathered during the DMET calculation.
-        """
-        if self.e_tot is None:
-            self.kernel()
-        return {
-            'E_nuc':   self.e_nuc,
-            'E_core':  self.e_core,
-            'E_inner': self.e_inner,
-            'E_DMET':  self.e_tot,
-        }
-
-    def bath_summary(self):
-        """
-        Return a brief description of the Schmidt decomposition
-        outcome: the sizes of the fragment, bath, core and virtual
-        spaces, and the eigenvalue arrays of each environment block.
-        """
-        if self.eig_info is None:
-            self.build_bath()
-        return {
-            'n_fragment_aos': int(self.frag_idx.size),
-            'n_bath':         int(self.bath_orb.shape[1]),
-            'n_core':         int(self.core_orb.shape[1]),
-            'n_virtual':      int(self.eig_info['virtual'].size),
-            'core_eigvals':   self.eig_info['core'],
-            'bath_eigvals':   self.eig_info['bath'],
-            'virt_eigvals':   self.eig_info['virtual'],
-            'n_core_electrons': int(self.eig_info['n_core_electrons']),
-        }
-
     def __call__(self):
-        """Allow ``DMET(...)()`` invocation in the PySCF mf style."""
-        return self.kernel()
+        return self.kernel()
\ No newline at end of file

From e551598d6880f5600d0a9734d37bf0eb8b7551f8 Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Tue, 12 May 2026 15:48:07 +0800
Subject: [PATCH 03/30] debug

---
 gpu4pyscf/dmet/dmet.py | 89 +++++++++++++++++++++++++++---------------
 1 file changed, 57 insertions(+), 32 deletions(-)

diff --git a/gpu4pyscf/dmet/dmet.py b/gpu4pyscf/dmet/dmet.py
index 6c67408b5..c0281c352 100644
--- a/gpu4pyscf/dmet/dmet.py
+++ b/gpu4pyscf/dmet/dmet.py
@@ -16,10 +16,7 @@
 import copy
 import numpy as np
 import cupy as cp
-import pyscf
-from pyscf import gto, ao2mo
-import gpu4pyscf
-from gpu4pyscf.scf import hf as gpu_hf
+from pyscf import gto
 
 
 def _as_cupy(x):
@@ -131,17 +128,6 @@ def transform_h1(h_ao, B):
     return B.T @ h_ao @ B
 
 
-def transform_eri(mol, B):
-    """
-    Transform the four-index two-electron repulsion integrals from the full AO basis.
-    """
-    nemb = B.shape[1]
-    B_cpu = cp.asnumpy(B)
-    eri_emb = ao2mo.kernel(mol, B_cpu, compact=True)
-    eri_emb = ao2mo.restore(4, eri_emb, nemb)
-    return cp.asarray(eri_emb)
-
-
 def _build_embedded_mole(nemb, n_emb_electrons, spin=0, verbose=0, max_memory=4000):
     if n_emb_electrons < 0 or n_emb_electrons > 2 * nemb:
         raise ValueError(f"Invalid embedded electron count: {n_emb_electrons}")
@@ -155,7 +141,6 @@ def _build_embedded_mole(nemb, n_emb_electrons, spin=0, verbose=0, max_memory=40
     mol.spin = spin
     mol.nelectron = int(n_emb_electrons)
     mol.charge = 0
-    mol.incore_anyway = True
     mol.build(parse_arg=False, dump_input=False)
 
     nemb_int = int(nemb)
@@ -255,7 +240,6 @@ def __init__(self, mf_outer, mf_inner, fragments,
         self.B = [None] * self.nfrags
         self.dm_core = [None] * self.nfrags
         self.h_emb = [None] * self.nfrags
-        self.eri_emb = [None] * self.nfrags
         self.e_core = [None] * self.nfrags
         self.mf_inner = [None] * self.nfrags
         self.dm_emb_init = [None] * self.nfrags
@@ -292,7 +276,7 @@ def build_bath(self, ifrag, dm_full_oao, X):
 
     def build_embedded_hamiltonian(self, ifrag, hcore_orig):
         """
-        Construct h^A and V^A in the embedded basis A.
+        Construct h^A in the embedded basis A.
         Uses bare hcore_orig (without the correlation potential 'u').
         """
         mol = self.full_mol
@@ -305,7 +289,6 @@ def build_embedded_hamiltonian(self, ifrag, hcore_orig):
             v_core_ao = cp.zeros_like(h_ao)
 
         h_emb = transform_h1(h_ao + v_core_ao, self.B[ifrag])
-        eri_emb = transform_eri(mol, self.B[ifrag])
 
         if self.eig_info[ifrag]['n_core_electrons'] > 0:
             e_core = (cp.einsum('ij,ji->', self.dm_core[ifrag], h_ao)
@@ -314,7 +297,6 @@ def build_embedded_hamiltonian(self, ifrag, hcore_orig):
             e_core = 0.0
 
         self.h_emb[ifrag] = h_emb
-        self.eri_emb[ifrag] = eri_emb
         self.e_core[ifrag] = float(e_core)
         return self
 
@@ -343,9 +325,39 @@ def _build_inner_mf(self, ifrag, dm_full_ao):
         mf_inner.get_ovlp = lambda *args, **kwargs: ovlp
         mf_inner.energy_nuc = lambda *args, **kwargs: e_nuc + self.e_core[ifrag]
 
-        eri_emb_cpu = cp.asnumpy(self.eri_emb[ifrag])
-        eri_8fold = ao2mo.restore(8, eri_emb_cpu, nemb)
-        mf_inner._eri = cp.asarray(eri_8fold)
+        # Overwrite get_jk to compute J and K on-the-fly using the outer MF
+        # without computing or storing 4-index ERIs.
+        def _get_jk(mol=None, dm=None, hermi=1, with_j=True, with_k=True, omega=None):
+            if dm is None:
+                dm = mf_inner.make_rdm1()
+            dm_cp = _as_cupy(dm)
+            B_mat = self.B[ifrag]
+            
+            # Project embedded dm to full AO basis
+            if dm_cp.ndim == 2:
+                dm_ao = B_mat @ dm_cp @ B_mat.T
+            else:
+                dm_ao = cp.einsum('pi,xij,qj->xpq', B_mat, dm_cp, B_mat)
+                
+            # Compute J and K in full AO basis using outer SCF's optimized routine
+            vj_ao, vk_ao = self.mf_outer.get_jk(self.full_mol, dm_ao, hermi, with_j, with_k, omega)
+            
+            # Project J and K back to embedded basis
+            vj_emb = vk_emb = None
+            if vj_ao is not None:
+                if dm_cp.ndim == 2:
+                    vj_emb = B_mat.T @ vj_ao @ B_mat
+                else:
+                    vj_emb = cp.einsum('pi,xpq,qj->xij', B_mat, vj_ao, B_mat)
+            if vk_ao is not None:
+                if dm_cp.ndim == 2:
+                    vk_emb = B_mat.T @ vk_ao @ B_mat
+                else:
+                    vk_emb = cp.einsum('pi,xpq,qj->xij', B_mat, vk_ao, B_mat)
+                    
+            return vj_emb, vk_emb
+
+        mf_inner.get_jk = _get_jk
 
         s_ao = _as_cupy(self.mf_outer.get_ovlp())
         sB = s_ao @ self.B[ifrag]
@@ -398,17 +410,28 @@ def kernel(self):
                 self.solve_embedded(ifrag)
 
                 dm_emb = _as_cupy(mf_inner.make_rdm1())
-                fock_emb = _as_cupy(mf_inner.get_fock(dm=mf_inner.make_rdm1()))
                 
-                # Transform inner DM back to full AO basis for D-matching
+                # Transform inner DM back to full AO basis
                 B = self.B[ifrag]
-                dm_inner_ao = B @ dm_emb @ B.T
-                dm_inners.append(dm_inner_ao)
+                dm_inner_active_ao = B @ dm_emb @ B.T
+                
+                dm_inner_full_ao = self.dm_core[ifrag] + dm_inner_active_ao
+                dm_inners.append(dm_inner_full_ao)
 
-                # Extract Fragment Energy: 1/2 Tr_x [ D (h + F) ]
-                n_frag = self.frag_idx[ifrag].size
+                # 2.1 Reconstruct the full effective Fock matrix for the embedded system in AO
+                vj, vk = self.mf_outer.get_jk(self.full_mol, dm_inner_full_ao)
+                fock_full_ao = hcore_orig + _as_cupy(vj) - 0.5 * _as_cupy(vk)
+                
+                # 2.2 Transform D, H, and F to the Lowdin orthogonalized (OAO) basis
+                dm_full_oao_inner = X_inv @ dm_inner_full_ao @ X_inv
+                hcore_oao = X.T @ hcore_orig @ X
+                fock_oao = X.T @ fock_full_ao @ X
+                
+                # 2.3 Extract Fragment Energy: 1/2 \sum_{i \in A, j} D_{ij}^{OAO} (H_{ij}^{OAO} + F_{ij}^{OAO})
+                # In symmetric orthogonalization, AO index mapping is perfectly preserved in OAO.
+                idx = self.frag_idx[ifrag]
                 e_frag_elec = 0.5 * cp.sum(
-                    dm_emb[:n_frag, :] * (self.h_emb[ifrag][:n_frag, :] + fock_emb[:n_frag, :])
+                    dm_full_oao_inner[idx, :] * (hcore_oao[idx, :] + fock_oao[idx, :])
                 )
                 
                 # Extract Fragment Nuclear Energy
@@ -430,11 +453,13 @@ def kernel(self):
             for ifrag in range(self.nfrags):
                 idx = self.frag_idx[ifrag]
                 idx_mesh = cp.ix_(idx, idx)
-                # Cost function: \Delta D = D_inner - D_outer over fragment blocks
+                
+                # Cost function: \Delta D = D_inner_full - D_outer_full over fragment blocks
                 diff = dm_inners[ifrag][idx_mesh] - dm_full_ao[idx_mesh]
                 error += float(cp.linalg.norm(diff))
                 
-                # Simple gradient descent step with damping factor
+                # Simple gradient descent step
+                # Note: 0.5 is a hyperparameter. If it oscillates, reduce it (e.g. to 0.1).
                 self.u[idx_mesh] -= 0.5 * diff
             
             print(f"Macro Iter {macro_iter + 1:2d} | E_DMET = {e_tot:.8f} | max(dD) = {error:.6e}")

From 94ed8e4e2295fa39a3bf9ea0fb5c41d2f7c8e17f Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Wed, 13 May 2026 11:12:51 +0800
Subject: [PATCH 04/30] runnable, but needs debugging

---
 gpu4pyscf/dmet/__init__.py        |  11 +--
 gpu4pyscf/dmet/dmet.py            | 108 ++++++++++++---------
 gpu4pyscf/dmet/tests/test_dmet.py | 150 ++++++++++++------------------
 3 files changed, 128 insertions(+), 141 deletions(-)

diff --git a/gpu4pyscf/dmet/__init__.py b/gpu4pyscf/dmet/__init__.py
index 883b3e735..3b9c8ea05 100644
--- a/gpu4pyscf/dmet/__init__.py
+++ b/gpu4pyscf/dmet/__init__.py
@@ -13,13 +13,4 @@
 # limitations under the License.
 
 
-from .dmet import (
-    DMET,
-    get_fragment_ao_indices,
-    schmidt_decompose,
-    build_embedding_basis,
-    build_core_dm,
-    transform_h1,
-    transform_eri,
-    lowdin_orth,
-)
+from .dmet import DMET
diff --git a/gpu4pyscf/dmet/dmet.py b/gpu4pyscf/dmet/dmet.py
index c0281c352..0ccb8ca23 100644
--- a/gpu4pyscf/dmet/dmet.py
+++ b/gpu4pyscf/dmet/dmet.py
@@ -61,34 +61,50 @@ def get_fragment_ao_indices(mol, frag_atoms):
     return indices
 
 
-def schmidt_decompose(dm_full, env_idx, threshold=1e-5):
+def schmidt_decompose(mo_coeff_oao, mo_occ, frag_idx, env_idx, threshold=1e-5):
     """
-    Schmidt decomposition.
+    Schmidt decomposition via SVD of the occupied orbital coefficients on the fragment.
+    Strictly follows the original 2012 DMET formulation.
     """
-    dm = _as_cupy(dm_full)
+    mo_coeff_oao = _as_cupy(mo_coeff_oao)
+    mo_occ = _as_cupy(mo_occ)
     env_idx = _as_cupy(env_idx)
-    if env_idx.size == 0:
-        return (cp.zeros((0, 0)),
-                cp.zeros((0, 0)),
-                {'core': cp.zeros(0), 'bath': cp.zeros(0), 'virtual': cp.zeros(0), 'n_core_electrons': 0})
-
-    D_env = dm[env_idx[:, None], env_idx[None, :]]
-    D_env = 0.5 * (D_env + D_env.T)
-
-    eigvals, eigvecs = cp.linalg.eigh(D_env)
-
-    is_core = eigvals > (2.0 - threshold)
-    is_virt = eigvals < threshold
-    is_bath = ~(is_core | is_virt)
-
-    bath_orb = eigvecs[:, is_bath]
-    core_orb = eigvecs[:, is_core]
-
+    frag_idx = _as_cupy(frag_idx)
+    
+    # Filter strictly occupied orbitals
+    occ_mask = mo_occ > 1e-8
+    C_occ = mo_coeff_oao[:, occ_mask]
+    
+    if env_idx.size == 0 or C_occ.shape[1] == 0:
+        return (cp.zeros((0, 0)), cp.zeros((0, 0)), 
+                {'n_core_electrons': 0})
+        
+    # Fragment block of occupied orbitals
+    C_A = C_occ[frag_idx, :]
+    
+    # SVD of C_A: C_A = U * S * Vh
+    U, S, Vh = cp.linalg.svd(C_A, full_matrices=True)
+    
+    # Rotate all occupied orbitals according to Vh
+    C_rot = C_occ @ Vh.T
+    
+    is_bath = S > threshold
+    is_core_small = S <= threshold
+    n_sv = len(S)
+    
+    # Entangled bath orbitals (environment part)
+    bath_orb = C_rot[env_idx, :n_sv][:, is_bath]
+    norms = cp.linalg.norm(bath_orb, axis=0)
+    norms[norms < 1e-12] = 1.0  # Safe division
+    bath_orb = bath_orb / norms
+    
+    # Pure environment core orbitals come from null space + small singular values
+    core_orb_small = C_rot[env_idx, :n_sv][:, is_core_small]
+    core_orb_null = C_rot[env_idx, n_sv:]
+    core_orb = cp.hstack([core_orb_small, core_orb_null])
+    
     info = {
-        'core':    eigvals[is_core],
-        'bath':    eigvals[is_bath],
-        'virtual': eigvals[is_virt],
-        'n_core_electrons': 2 * int(is_core.sum()),
+        'n_core_electrons': 2 * core_orb.shape[1]
     }
     return bath_orb, core_orb, info
 
@@ -245,14 +261,15 @@ def __init__(self, mf_outer, mf_inner, fragments,
         self.dm_emb_init = [None] * self.nfrags
         self.e_inner = [None] * self.nfrags
         self.e_tot = None            
-        self.u = cp.zeros((nao, nao))  # Global correlation potential
+        self.u_oao = cp.zeros((nao, nao))  # Global correlation potential
 
-    def build_bath(self, ifrag, dm_full_oao, X):
+    def build_bath(self, ifrag, mo_coeff, mo_occ, X_inv, X):
         """
         Run the Schmidt decomposition for a specific fragment.
         """
+        mo_coeff_oao = X_inv @ _as_cupy(mo_coeff)
         bath_orb, core_orb, info = schmidt_decompose(
-            dm_full_oao, self.frag_idx[ifrag], self.env_idx[ifrag], self.threshold)
+            mo_coeff_oao, mo_occ, self.frag_idx[ifrag], self.env_idx[ifrag], self.threshold)
 
         nao_oao = X.shape[1]
         B_oao = build_embedding_basis(nao_oao, self.frag_idx[ifrag], self.env_idx[ifrag], bath_orb)
@@ -301,7 +318,6 @@ def build_embedded_hamiltonian(self, ifrag, hcore_orig):
         return self
 
     def _build_inner_mf(self, ifrag, dm_full_ao):
-        """Instantiate the inner SCF on the embedded mole."""
         nemb = self.B[ifrag].shape[1]
         n_total_electrons = int(self.full_mol.nelectron)
         n_emb_electrons = n_total_electrons - int(self.eig_info[ifrag]['n_core_electrons'])
@@ -359,6 +375,14 @@ def _get_jk(mol=None, dm=None, hermi=1, with_j=True, with_k=True, omega=None):
 
         mf_inner.get_jk = _get_jk
 
+        def _get_veff(mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
+            if dm is None:
+                dm = mf_inner.make_rdm1()
+            vj, vk = _get_jk(mol, dm, hermi=hermi)
+            return vj - 0.5 * vk
+        
+        mf_inner.get_veff = _get_veff
+
         s_ao = _as_cupy(self.mf_outer.get_ovlp())
         sB = s_ao @ self.B[ifrag]
         dm_emb_init = sB.T @ dm_full_ao @ sB
@@ -391,20 +415,22 @@ def kernel(self):
         X, X_inv = lowdin_orth(s_ao)
 
         for macro_iter in range(self.max_macro_iter):
-            # 1. Run low-level SCF with current correlation potential 'u'
-            self.mf_outer.get_hcore = lambda *args, **kwargs: cp.asnumpy(hcore_orig + self.u)
+            u_ao = X_inv @ self.u_oao @ X_inv
+
+            # Run low-level SCF with current correlation potential 'u'
+            self.mf_outer.get_hcore = lambda *args, **kwargs: cp.asnumpy(hcore_orig + u_ao)
             self.mf_outer.mo_coeff = None # Force re-run
             self.mf_outer.kernel()
             
+            mo_coeff = _as_cupy(self.mf_outer.mo_coeff)
+            mo_occ = _as_cupy(self.mf_outer.mo_occ)
             dm_full_ao = _as_cupy(self.mf_outer.make_rdm1())
-            dm_full_oao = X_inv @ dm_full_ao @ X_inv
 
             e_tot = 0.0
             dm_inners = []
 
-            # 2. Loop over all fragments
             for ifrag in range(self.nfrags):
-                self.build_bath(ifrag, dm_full_oao, X)
+                self.build_bath(ifrag, mo_coeff, mo_occ, X_inv, X)
                 self.build_embedded_hamiltonian(ifrag, hcore_orig)
                 mf_inner = self._build_inner_mf(ifrag, dm_full_ao)
                 self.solve_embedded(ifrag)
@@ -418,16 +444,16 @@ def kernel(self):
                 dm_inner_full_ao = self.dm_core[ifrag] + dm_inner_active_ao
                 dm_inners.append(dm_inner_full_ao)
 
-                # 2.1 Reconstruct the full effective Fock matrix for the embedded system in AO
+                # Reconstruct the full effective Fock matrix for the embedded system in AO
                 vj, vk = self.mf_outer.get_jk(self.full_mol, dm_inner_full_ao)
                 fock_full_ao = hcore_orig + _as_cupy(vj) - 0.5 * _as_cupy(vk)
                 
-                # 2.2 Transform D, H, and F to the Lowdin orthogonalized (OAO) basis
+                # Transform D, H, and F to the Lowdin orthogonalized (OAO) basis
                 dm_full_oao_inner = X_inv @ dm_inner_full_ao @ X_inv
                 hcore_oao = X.T @ hcore_orig @ X
                 fock_oao = X.T @ fock_full_ao @ X
                 
-                # 2.3 Extract Fragment Energy: 1/2 \sum_{i \in A, j} D_{ij}^{OAO} (H_{ij}^{OAO} + F_{ij}^{OAO})
+                # Extract Fragment Energy: 1/2 \sum_{i \in A, j} D_{ij}^{OAO} (H_{ij}^{OAO} + F_{ij}^{OAO})
                 # In symmetric orthogonalization, AO index mapping is perfectly preserved in OAO.
                 idx = self.frag_idx[ifrag]
                 e_frag_elec = 0.5 * cp.sum(
@@ -443,24 +469,22 @@ def kernel(self):
                     for j in range(self.full_mol.natm):
                         if i == j: continue
                         r = np.linalg.norm(coords[i] - coords[j])
-                        factor = 0.5 if j in frag_atoms else 1.0
-                        e_frag_nuc += factor * charges[i] * charges[j] / r
+                        e_frag_nuc += 0.5 * charges[i] * charges[j] / r
                 
                 e_tot += float(e_frag_elec) + e_frag_nuc
 
-            # 3. Macroscopic iteration: update correlation potential 'u'
+            # Mupdate correlation potential 'u'
             error = 0.0
             for ifrag in range(self.nfrags):
                 idx = self.frag_idx[ifrag]
                 idx_mesh = cp.ix_(idx, idx)
                 
-                # Cost function: \Delta D = D_inner_full - D_outer_full over fragment blocks
                 diff = dm_inners[ifrag][idx_mesh] - dm_full_ao[idx_mesh]
                 error += float(cp.linalg.norm(diff))
                 
                 # Simple gradient descent step
-                # Note: 0.5 is a hyperparameter. If it oscillates, reduce it (e.g. to 0.1).
-                self.u[idx_mesh] -= 0.5 * diff
+                # TODO: 0.5 is a hyperparameter. If it oscillates, reduce it (e.g. to 0.1).
+                self.u_oao[idx_mesh] -= 0.5 * diff
             
             print(f"Macro Iter {macro_iter + 1:2d} | E_DMET = {e_tot:.8f} | max(dD) = {error:.6e}")
             self.e_tot = e_tot
diff --git a/gpu4pyscf/dmet/tests/test_dmet.py b/gpu4pyscf/dmet/tests/test_dmet.py
index a3c8194ea..93312df3d 100644
--- a/gpu4pyscf/dmet/tests/test_dmet.py
+++ b/gpu4pyscf/dmet/tests/test_dmet.py
@@ -12,110 +12,82 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""
-Basic correctness tests for the single-shot DMET driver.
-
-The cancellation property used here:
-
-    For a closed-shell system computed at the SAME mean-field level
-    (i.e. ``mf_inner`` and ``mf_outer`` share the same method and the
-    same orbital basis), the single-shot DMET total energy must
-    reproduce the full-system mean-field total energy exactly.
-"""
 
 import unittest
-import numpy as np
-from pyscf import gto, scf
-
-from gpu4pyscf.dmet import DMET
+import cupy as cp
+from pyscf import gto
+from gpu4pyscf.scf import hf as gpu_hf
+from gpu4pyscf.dmet import DMET 
 
 
 class KnownValues(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        cls.mol = gto.M(
-            atom='''
-            H 0.0 0.0 0.00
-            H 0.0 0.0 0.74
-            H 0.0 0.0 2.20
-            H 0.0 0.0 2.94
-            ''',
-            basis='sto-3g',
-            verbose=0,
-        )
-        cls.mf_ref = scf.RHF(cls.mol)
-        cls.e_ref = cls.mf_ref.kernel()
-
-    def test_self_consistency_two_atom_fragment(self):
-        # A single-shot DMET with the same low- and high-level method
-        # must reproduce the full-system mean-field energy.
-        mf_outer = scf.RHF(self.mol)
-        mf_outer.kernel()
-
-        mf_inner_template = scf.RHF(self.mol)
-
-        dmet = DMET(
-            mf_outer=mf_outer,
-            mf_inner=mf_inner_template,
-            frag_atoms=[0, 1],
-            threshold=1e-8,
-        )
-        e_dmet = dmet.kernel()
 
-        self.assertAlmostEqual(e_dmet, self.e_ref, places=7)
+        cls.mol = gto.Mole()
+        cls.mol.atom = '''
+            H 0.0 0.0 0.0
+            H 0.0 0.0 1.0
+            H 0.0 0.0 2.0
+            H 0.0 0.0 3.0
+        '''
+        cls.mol.basis = 'sto-3g'
+        cls.mol.spin = 0
+        cls.mol.charge = 0
+        cls.mol.verbose = 0
+        cls.mol.build()
 
-    def test_self_consistency_single_atom_fragment(self):
-        mf_outer = scf.RHF(self.mol)
-        mf_outer.kernel()
+        cls.fragments = [[0, 1], [2, 3]]
 
-        mf_inner_template = scf.RHF(self.mol)
+        cls.mf_outer = gpu_hf.RHF(cls.mol)
+        cls.mf_inner_template = gpu_hf.RHF(cls.mol)
 
-        dmet = DMET(
-            mf_outer=mf_outer,
-            mf_inner=mf_inner_template,
-            frag_atoms=[0],
-            threshold=1e-8,
-        )
-        e_dmet = dmet.kernel()
-        self.assertAlmostEqual(e_dmet, self.e_ref, places=7)
-
-    def test_bath_summary(self):
-        mf_outer = scf.RHF(self.mol)
-        mf_outer.kernel()
-
-        dmet = DMET(
-            mf_outer=mf_outer,
-            mf_inner=scf.RHF(self.mol),
-            frag_atoms=[0, 1],
-            threshold=1e-6,
+    @classmethod
+    def tearDownClass(cls):
+        del cls.mol
+        del cls.mf_outer
+        del cls.mf_inner_template
+        cp.get_default_memory_pool().free_all_blocks()
+
+    def test_dmet_initialization(self):
+        dmet_solver = DMET(
+            mf_outer=self.mf_outer,
+            mf_inner=self.mf_inner_template,
+            fragments=self.fragments,
+            threshold=1e-5
         )
-        dmet.build_bath()
-        info = dmet.bath_summary()
-        # Two H atoms in STO-3G means 2 fragment AOs.
-        self.assertEqual(info['n_fragment_aos'], 2)
-        # Number of (bath + core + virtual) eigenvalues equals the
-        # environment AO count.
-        self.assertEqual(
-            info['n_bath'] + info['n_core'] + info['n_virtual'],
-            self.mol.nao_nr() - info['n_fragment_aos'],
+
+        nao = self.mol.nao_nr()
+        
+        self.assertEqual(dmet_solver.nfrags, 2, "Number of fragments should be 2.")
+        self.assertEqual(len(dmet_solver.frag_idx), 2, "Fragment indices list should have length 2.")
+        
+        self.assertEqual(dmet_solver.u_oao.shape, (nao, nao), "Correlation potential u_oao should be of shape (nao, nao).")
+        self.assertTrue(isinstance(dmet_solver.u_oao, cp.ndarray), "Correlation potential should be a CuPy array.")
+
+    def test_dmet_execution_and_convergence(self):
+        dmet_solver = DMET(
+            mf_outer=self.mf_outer,
+            mf_inner=self.mf_inner_template,
+            fragments=self.fragments,
+            threshold=1e-5,
+            max_macro_iter=20,
+            macro_tol=1e-3
         )
 
-    def test_decomposition_keys(self):
-        mf_outer = scf.RHF(self.mol)
-        mf_outer.kernel()
+        e_tot = dmet_solver.kernel()
 
-        dmet = DMET(
-            mf_outer=mf_outer,
-            mf_inner=scf.RHF(self.mol),
-            frag_atoms=[0, 1],
-            threshold=1e-8,
-        )
-        dmet.kernel()
-        decomp = dmet.energy_decomposition()
-        for key in ('E_nuc', 'E_core', 'E_inner', 'E_DMET'):
-            self.assertIn(key, decomp)
+        self.assertIsNotNone(e_tot, "DMET kernel should return a valid energy value, not None.")
+        self.assertIsInstance(e_tot, float, "The returned total energy must be a float.")
+
+        self.assertLess(e_tot, 0.0, "Total energy of H4 molecule should be negative.")
+
+        self.assertIsNotNone(dmet_solver.bath_orb[0], "Bath orbitals for fragment 0 should be generated.")
+        self.assertIsNotNone(dmet_solver.h_emb[0], "Embedded Hamiltonian for fragment 0 should be generated.")
+        
+        self.assertTrue(isinstance(dmet_solver.dm_core[0], cp.ndarray), "Core density matrix should be a CuPy array.")
 
 
 if __name__ == '__main__':
-    print("Tests for single-shot DMET")
-    unittest.main()
+    print("Full Tests for DMET")
+    unittest.main()
\ No newline at end of file

From bc48b6dc4883536088b92de62c03493a6e72663c Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Thu, 14 May 2026 16:36:08 +0800
Subject: [PATCH 05/30] H4 passed, C4 has a bug, needs debugging

---
 gpu4pyscf/dmet/dmet.py            | 86 ++++++++++++++++++-------------
 gpu4pyscf/dmet/tests/test_dmet.py | 27 ++++++----
 gpu4pyscf/scf/hf.py               | 33 ++++++++++++
 3 files changed, 100 insertions(+), 46 deletions(-)

diff --git a/gpu4pyscf/dmet/dmet.py b/gpu4pyscf/dmet/dmet.py
index 0ccb8ca23..a53b6a289 100644
--- a/gpu4pyscf/dmet/dmet.py
+++ b/gpu4pyscf/dmet/dmet.py
@@ -17,6 +17,7 @@
 import numpy as np
 import cupy as cp
 from pyscf import gto
+import pyscf.ao2mo  # Added for exact 4-index ERI transformation
 
 
 def _as_cupy(x):
@@ -26,9 +27,6 @@ def _as_cupy(x):
 
 
 def lowdin_orth(s):
-    """
-    Loewdin symmetric orthogonalization.
-    """
     s = _as_cupy(s)
     s = 0.5 * (s + s.T)
     eigvals, eigvecs = cp.linalg.eigh(s)
@@ -38,8 +36,8 @@ def lowdin_orth(s):
         eigvecs = eigvecs[:, keep]
     inv_sqrt = 1.0 / cp.sqrt(eigvals)
     sqrt = cp.sqrt(eigvals)
-    X = (eigvecs * inv_sqrt) @ eigvecs.T          # S^{-1/2}
-    X_inv = (eigvecs * sqrt) @ eigvecs.T          # S^{+1/2}
+    X = (eigvecs * inv_sqrt) @ eigvecs.T # S^{-1/2}
+    X_inv = (eigvecs * sqrt) @ eigvecs.T # S^{+1/2}
     return X, X_inv
 
 
@@ -63,7 +61,6 @@ def get_fragment_ao_indices(mol, frag_atoms):
 
 def schmidt_decompose(mo_coeff_oao, mo_occ, frag_idx, env_idx, threshold=1e-5):
     """
-    Schmidt decomposition via SVD of the occupied orbital coefficients on the fragment.
     Strictly follows the original 2012 DMET formulation.
     """
     mo_coeff_oao = _as_cupy(mo_coeff_oao)
@@ -71,7 +68,6 @@ def schmidt_decompose(mo_coeff_oao, mo_occ, frag_idx, env_idx, threshold=1e-5):
     env_idx = _as_cupy(env_idx)
     frag_idx = _as_cupy(frag_idx)
     
-    # Filter strictly occupied orbitals
     occ_mask = mo_occ > 1e-8
     C_occ = mo_coeff_oao[:, occ_mask]
     
@@ -79,13 +75,10 @@ def schmidt_decompose(mo_coeff_oao, mo_occ, frag_idx, env_idx, threshold=1e-5):
         return (cp.zeros((0, 0)), cp.zeros((0, 0)), 
                 {'n_core_electrons': 0})
         
-    # Fragment block of occupied orbitals
     C_A = C_occ[frag_idx, :]
     
-    # SVD of C_A: C_A = U * S * Vh
     U, S, Vh = cp.linalg.svd(C_A, full_matrices=True)
     
-    # Rotate all occupied orbitals according to Vh
     C_rot = C_occ @ Vh.T
     
     is_bath = S > threshold
@@ -95,7 +88,7 @@ def schmidt_decompose(mo_coeff_oao, mo_occ, frag_idx, env_idx, threshold=1e-5):
     # Entangled bath orbitals (environment part)
     bath_orb = C_rot[env_idx, :n_sv][:, is_bath]
     norms = cp.linalg.norm(bath_orb, axis=0)
-    norms[norms < 1e-12] = 1.0  # Safe division
+    norms[norms < 1e-12] = 1.0 # This should not happen
     bath_orb = bath_orb / norms
     
     # Pure environment core orbitals come from null space + small singular values
@@ -113,6 +106,9 @@ def build_embedding_basis(nao, frag_idx, env_idx, bath_orb):
     """
     Construct the AO -> embedded transformation matrix B.
     """
+    # Due to the Carlson-Keller theorem, the lowdin OAO basis 
+    # and the AO basis is 1-to-1 match.
+    # Therefore, we can use the fragment indices to construct the embedding matrix.
     frag_idx = _as_cupy(frag_idx)
     env_idx = _as_cupy(env_idx)
     n_frag = frag_idx.size
@@ -207,8 +203,8 @@ class DMET:
     ----------
     mf_outer : SCF object (gpu4pyscf)
         Low-level mean-field on the full system.
-    mf_inner : SCF/DFT object (gpu4pyscf)
-        High-level mean-field template applied to the embedded cluster.
+    mf_inner : SCF/DFT/post-HF object (gpu4pyscf)
+        High-level mean-field or post-HF template applied to the embedded cluster.
     fragments : list of lists of int
         List of fragments, where each fragment is a list of atom indices.
     threshold : float
@@ -248,7 +244,6 @@ def __init__(self, mf_outer, mf_inner, fragments,
             env_mask[f_idx] = False
             self.env_idx.append(all_idx[env_mask])
 
-        # ---- intermediate / output caches (lists for multiple fragments) ----
         self.bath_orb = [None] * self.nfrags
         self.core_orb = [None] * self.nfrags
         self.eig_info = [None] * self.nfrags
@@ -313,11 +308,12 @@ def build_embedded_hamiltonian(self, ifrag, hcore_orig):
         else:
             e_core = 0.0
 
-        self.h_emb[ifrag] = h_emb
+        self.h_emb[ifrag] = h_emb # embeding basis
         self.e_core[ifrag] = float(e_core)
         return self
 
     def _build_inner_mf(self, ifrag, dm_full_ao):
+        # TODO: Handle post-HF case!
         nemb = self.B[ifrag].shape[1]
         n_total_electrons = int(self.full_mol.nelectron)
         n_emb_electrons = n_total_electrons - int(self.eig_info[ifrag]['n_core_electrons'])
@@ -375,6 +371,7 @@ def _get_jk(mol=None, dm=None, hermi=1, with_j=True, with_k=True, omega=None):
 
         mf_inner.get_jk = _get_jk
 
+        # TODO: this is only works for SCF, even not for DFT or post-HF!
         def _get_veff(mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
             if dm is None:
                 dm = mf_inner.make_rdm1()
@@ -382,7 +379,8 @@ def _get_veff(mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
             return vj - 0.5 * vk
         
         mf_inner.get_veff = _get_veff
-
+        
+        # using s to make the upper index to the lower index
         s_ao = _as_cupy(self.mf_outer.get_ovlp())
         sB = s_ao @ self.B[ifrag]
         dm_emb_init = sB.T @ dm_full_ao @ sB
@@ -396,7 +394,6 @@ def _get_veff(mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
         return mf_inner
 
     def solve_embedded(self, ifrag):
-        """Run the high-level embedded SCF for a specific fragment."""
         e_inner = self.mf_inner[ifrag].kernel(dm0=self.dm_emb_init[ifrag])
         if isinstance(e_inner, tuple):
             e_inner = float(self.mf_inner[ifrag].e_tot)
@@ -406,10 +403,6 @@ def solve_embedded(self, ifrag):
         return e_inner
 
     def kernel(self):
-        """
-        Drive the macroscopic-iterating DMET workflow.
-        Returns the DMET total energy.
-        """
         hcore_orig = _as_cupy(self.mf_outer.get_hcore())
         s_ao = _as_cupy(self.mf_outer.get_ovlp())
         X, X_inv = lowdin_orth(s_ao)
@@ -444,21 +437,38 @@ def kernel(self):
                 dm_inner_full_ao = self.dm_core[ifrag] + dm_inner_active_ao
                 dm_inners.append(dm_inner_full_ao)
 
-                # Reconstruct the full effective Fock matrix for the embedded system in AO
-                vj, vk = self.mf_outer.get_jk(self.full_mol, dm_inner_full_ao)
-                fock_full_ao = hcore_orig + _as_cupy(vj) - 0.5 * _as_cupy(vk)
+                # Compute Embedded 4-index ERI for Exact Correlation Energy
+                nemb = B.shape[1]
+                # TODO: this can be replaced by a more efficient routine
+                B_cpu = cp.asnumpy(B)
+                eri_emb_cpu = pyscf.ao2mo.kernel(self.full_mol, B_cpu)
+                eri_emb_cpu = pyscf.ao2mo.restore(1, eri_emb_cpu, nemb) # Restore to 4D array
+                eri_emb = _as_cupy(eri_emb_cpu)
                 
-                # Transform D, H, and F to the Lowdin orthogonalized (OAO) basis
-                dm_full_oao_inner = X_inv @ dm_inner_full_ao @ X_inv
-                hcore_oao = X.T @ hcore_orig @ X
-                fock_oao = X.T @ fock_full_ao @ X
+                # Extract 1-RDM and 2-RDM
+                dm1_emb = dm_emb
+                if hasattr(mf_inner, 'make_rdm2'):
+                    dm2_emb = _as_cupy(mf_inner.make_rdm2())
+                else:
+                    # using the HF 2-RDM formulation
+                    dm2_emb = (cp.einsum('ij,kl->ijkl', dm1_emb, dm1_emb) 
+                               - 0.5 * cp.einsum('il,jk->ijkl', dm1_emb, dm1_emb))
+                
+                # By construction, fragment orbitals are precisely the first n_frag indices
+                n_frag = len(self.fragments[ifrag])
                 
-                # Extract Fragment Energy: 1/2 \sum_{i \in A, j} D_{ij}^{OAO} (H_{ij}^{OAO} + F_{ij}^{OAO})
-                # In symmetric orthogonalization, AO index mapping is perfectly preserved in OAO.
+                # Extract Fragment Electronic Energy
+                e_frag_elec = cp.sum(dm1_emb[:n_frag, :] * self.h_emb[ifrag][:n_frag, :])
+                e_frag_elec += 0.5 * cp.sum(dm2_emb[:n_frag, :, :, :] * eri_emb[:n_frag, :, :, :])
+
+                # Extract Fragment Core Energy Partition in AO basis
+                # TODO: this is only works for SCF, even not for DFT or post-HF!
+                vj_core, vk_core = self.mf_outer.get_jk(self.full_mol, self.dm_core[ifrag])
+                v_core_ao = _as_cupy(vj_core) - 0.5 * _as_cupy(vk_core)
                 idx = self.frag_idx[ifrag]
-                e_frag_elec = 0.5 * cp.sum(
-                    dm_full_oao_inner[idx, :] * (hcore_oao[idx, :] + fock_oao[idx, :])
-                )
+                
+                e_frag_core = cp.sum(self.dm_core[ifrag][idx, :] * hcore_orig[idx, :]) + \
+                              0.5 * cp.sum(self.dm_core[ifrag][idx, :] * v_core_ao[idx, :])
                 
                 # Extract Fragment Nuclear Energy
                 e_frag_nuc = 0.0
@@ -471,15 +481,19 @@ def kernel(self):
                         r = np.linalg.norm(coords[i] - coords[j])
                         e_frag_nuc += 0.5 * charges[i] * charges[j] / r
                 
-                e_tot += float(e_frag_elec) + e_frag_nuc
+                e_tot += float(e_frag_elec) + float(e_frag_core) + e_frag_nuc
 
-            # Mupdate correlation potential 'u'
+            # Strictly use OAO basis to evaluate density differences
+            dm_low_oao = X_inv @ dm_full_ao @ X_inv
+            
             error = 0.0
             for ifrag in range(self.nfrags):
                 idx = self.frag_idx[ifrag]
                 idx_mesh = cp.ix_(idx, idx)
                 
-                diff = dm_inners[ifrag][idx_mesh] - dm_full_ao[idx_mesh]
+                dm_high_oao = X_inv @ dm_inners[ifrag] @ X_inv
+                
+                diff = dm_high_oao[idx_mesh] - dm_low_oao[idx_mesh]
                 error += float(cp.linalg.norm(diff))
                 
                 # Simple gradient descent step
diff --git a/gpu4pyscf/dmet/tests/test_dmet.py b/gpu4pyscf/dmet/tests/test_dmet.py
index 93312df3d..ee0ba7f6f 100644
--- a/gpu4pyscf/dmet/tests/test_dmet.py
+++ b/gpu4pyscf/dmet/tests/test_dmet.py
@@ -14,10 +14,12 @@
 
 
 import unittest
+import numpy as np
 import cupy as cp
 from pyscf import gto
 from gpu4pyscf.scf import hf as gpu_hf
 from gpu4pyscf.dmet import DMET 
+from gpu4pyscf import dmet
 
 
 class KnownValues(unittest.TestCase):
@@ -34,7 +36,7 @@ def setUpClass(cls):
         cls.mol.basis = 'sto-3g'
         cls.mol.spin = 0
         cls.mol.charge = 0
-        cls.mol.verbose = 0
+        # cls.mol.verbose = 0
         cls.mol.build()
 
         cls.fragments = [[0, 1], [2, 3]]
@@ -47,7 +49,6 @@ def tearDownClass(cls):
         del cls.mol
         del cls.mf_outer
         del cls.mf_inner_template
-        cp.get_default_memory_pool().free_all_blocks()
 
     def test_dmet_initialization(self):
         dmet_solver = DMET(
@@ -65,6 +66,18 @@ def test_dmet_initialization(self):
         self.assertEqual(dmet_solver.u_oao.shape, (nao, nao), "Correlation potential u_oao should be of shape (nao, nao).")
         self.assertTrue(isinstance(dmet_solver.u_oao, cp.ndarray), "Correlation potential should be a CuPy array.")
 
+    def test_lowdin(self):
+        ovlp = self.mf_outer.get_ovlp()
+        X, _ = dmet.dmet.lowdin_orth(ovlp)
+        X_ref = cp.array([[ 1.1214051976, -0.3278815514,  0.0611473762, -0.0095874461],
+                          [-0.3278815514,  1.2643824327, -0.3597401082,  0.0611473762],
+                          [ 0.0611473762, -0.3597401082,  1.2643824327, -0.3278815514],
+                          [-0.0095874461,  0.0611473762, -0.3278815514,  1.1214051976]])
+        assert np.abs(X - X_ref).max() < 1e-8, "Lowdin orthogonalization should yield a close-to-identity matrix."
+
+    def test_schmidt(self):
+        pass
+
     def test_dmet_execution_and_convergence(self):
         dmet_solver = DMET(
             mf_outer=self.mf_outer,
@@ -77,15 +90,9 @@ def test_dmet_execution_and_convergence(self):
 
         e_tot = dmet_solver.kernel()
 
-        self.assertIsNotNone(e_tot, "DMET kernel should return a valid energy value, not None.")
-        self.assertIsInstance(e_tot, float, "The returned total energy must be a float.")
-
-        self.assertLess(e_tot, 0.0, "Total energy of H4 molecule should be negative.")
-
-        self.assertIsNotNone(dmet_solver.bath_orb[0], "Bath orbitals for fragment 0 should be generated.")
-        self.assertIsNotNone(dmet_solver.h_emb[0], "Embedded Hamiltonian for fragment 0 should be generated.")
+        e_tot_ref = self.mf_outer.kernel()
         
-        self.assertTrue(isinstance(dmet_solver.dm_core[0], cp.ndarray), "Core density matrix should be a CuPy array.")
+        assert np.abs(e_tot - e_tot_ref) < 1e-8, "DMET energy should be close to the reference energy."
 
 
 if __name__ == '__main__':
diff --git a/gpu4pyscf/scf/hf.py b/gpu4pyscf/scf/hf.py
index 56c1d30eb..f18a84cac 100644
--- a/gpu4pyscf/scf/hf.py
+++ b/gpu4pyscf/scf/hf.py
@@ -662,6 +662,34 @@ def __call__(self, mol_or_geom, **kwargs):
         self._last_mol_fp = mol.ao_loc
         return e_tot
 
+
+def make_rdm2(mo_coeff, mo_occ, **kwargs):
+    '''Two-particle density matrix in AO representation
+
+    NOTE the indices of the two-particle density matrix is ordered to
+
+    dm2[p,q,r,s] = <q^+ s^+ r p>.
+
+    HF energy can be computed
+    E = einsum('pq,qp', hcore, 1pdm) + einsum('pqrs,pqrs', eri, 2pdm) / 2
+    where h1[p,q] = <p|h|q> and eri[p,q,r,s] = (pq|rs)
+to make the density matrix consistent with the density matrix obtained
+    from post-HF methods,
+
+    Args:
+        mo_coeff : 2D ndarray
+            Orbital coefficients. Each column is one orbital.
+        mo_occ : 1D ndarray
+            Occupancy
+    Returns:
+        Two-particle density matrix, 4D ndarray
+    '''
+    dm1 = make_rdm1(mo_coeff, mo_occ, **kwargs)
+    dm2 = (cupy.einsum('ij,kl->ijkl', dm1, dm1)
+         - cupy.einsum('ij,kl->iklj', dm1, dm1)/2)
+    return dm2
+
+
 class SCF(pyscf_lib.StreamObject):
 
     # attributes
@@ -869,6 +897,11 @@ def make_rdm1(self, mo_coeff=None, mo_occ=None, **kwargs):
         if mo_coeff is None: mo_coeff = self.mo_coeff
         return make_rdm1(mo_coeff, mo_occ)
 
+    def make_rdm2(self, mo_coeff=None, mo_occ=None, **kwargs):
+        if mo_occ is None: mo_occ = self.mo_occ
+        if mo_coeff is None: mo_coeff = self.mo_coeff
+        return make_rdm2(mo_coeff, mo_occ)
+
     def dip_moment(self, mol=None, dm=None, unit='Debye', origin=None,
                    verbose=logger.NOTE):
         if mol is None: mol = self.mol

From fa41b1268cc6764ada087fdfcfe96fb6ae27017c Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Fri, 15 May 2026 10:32:32 +0800
Subject: [PATCH 06/30] runnable code, just for HF-in-HF embedding

---
 gpu4pyscf/dmet/dmet.py            | 40 +++++++++++++++-------------
 gpu4pyscf/dmet/tests/test_dmet.py | 44 ++++++++++++++++++++++++++++++-
 2 files changed, 65 insertions(+), 19 deletions(-)

diff --git a/gpu4pyscf/dmet/dmet.py b/gpu4pyscf/dmet/dmet.py
index a53b6a289..6cf499f00 100644
--- a/gpu4pyscf/dmet/dmet.py
+++ b/gpu4pyscf/dmet/dmet.py
@@ -17,6 +17,8 @@
 import numpy as np
 import cupy as cp
 from pyscf import gto
+from pyscf import lib
+from gpu4pyscf.lib import logger
 import pyscf.ao2mo  # Added for exact 4-index ERI transformation
 
 
@@ -195,7 +197,7 @@ def _instantiate_inner_mf(mf_template, embedded_mol):
     return new_mf
 
 
-class DMET:
+class DMET(lib.StreamObject):
     """
     Density Matrix Embedding Theory driver with macroscopic iteration.
 
@@ -216,12 +218,17 @@ class DMET:
     """
 
     def __init__(self, mf_outer, mf_inner, fragments,
-                 threshold=1e-5, max_macro_iter=20, macro_tol=1e-4):
+                 threshold=1e-5, max_macro_iter=20, macro_tol=1e-4, verbose=None):
         if mf_outer is None or mf_inner is None:
             raise ValueError("mf_outer and mf_inner are both required.")
         if not fragments:
             raise ValueError("Provide a list of fragments to define the DMET regions.")
-
+        
+        if verbose is None:
+            verbose = mf_outer.verbose
+        else:
+            verbose = int(verbose)
+        self.log = logger.new_logger(mf_outer, verbose)
         self.mf_outer = mf_outer
         self.mf_inner_template = mf_inner
         self.full_mol = mf_outer.mol
@@ -408,6 +415,7 @@ def kernel(self):
         X, X_inv = lowdin_orth(s_ao)
 
         for macro_iter in range(self.max_macro_iter):
+            self.log.info(f"Macro Iter {macro_iter}")
             u_ao = X_inv @ self.u_oao @ X_inv
 
             # Run low-level SCF with current correlation potential 'u'
@@ -455,22 +463,18 @@ def kernel(self):
                                - 0.5 * cp.einsum('il,jk->ijkl', dm1_emb, dm1_emb))
                 
                 # By construction, fragment orbitals are precisely the first n_frag indices
-                n_frag = len(self.fragments[ifrag])
+                n_frag = self.frag_idx[ifrag].size
                 
-                # Extract Fragment Electronic Energy
-                e_frag_elec = cp.sum(dm1_emb[:n_frag, :] * self.h_emb[ifrag][:n_frag, :])
-                e_frag_elec += 0.5 * cp.sum(dm2_emb[:n_frag, :, :, :] * eri_emb[:n_frag, :, :, :])
-
-                # Extract Fragment Core Energy Partition in AO basis
                 # TODO: this is only works for SCF, even not for DFT or post-HF!
                 vj_core, vk_core = self.mf_outer.get_jk(self.full_mol, self.dm_core[ifrag])
                 v_core_ao = _as_cupy(vj_core) - 0.5 * _as_cupy(vk_core)
-                idx = self.frag_idx[ifrag]
+                v_core_emb = B.T @ v_core_ao @ B
                 
-                e_frag_core = cp.sum(self.dm_core[ifrag][idx, :] * hcore_orig[idx, :]) + \
-                              0.5 * cp.sum(self.dm_core[ifrag][idx, :] * v_core_ao[idx, :])
+                # Apply 0.5 factor to core potential to avoid double counting across fragments
+                h_eval = self.h_emb[ifrag] - 0.5 * v_core_emb
+                e_frag_elec = cp.sum(dm1_emb[:n_frag, :] * h_eval[:n_frag, :])
+                e_frag_elec += 0.5 * cp.sum(dm2_emb[:n_frag, :, :, :] * eri_emb[:n_frag, :, :, :])
                 
-                # Extract Fragment Nuclear Energy
                 e_frag_nuc = 0.0
                 coords = self.full_mol.atom_coords()
                 charges = self.full_mol.atom_charges()
@@ -480,10 +484,10 @@ def kernel(self):
                         if i == j: continue
                         r = np.linalg.norm(coords[i] - coords[j])
                         e_frag_nuc += 0.5 * charges[i] * charges[j] / r
-                
-                e_tot += float(e_frag_elec) + float(e_frag_core) + e_frag_nuc
+                        
+                self.log.info(f"Fragment {ifrag} Electronic Energy: {float(e_frag_elec):.8f} | Nuclear Energy: {e_frag_nuc:.8f}")
+                e_tot += float(e_frag_elec) + e_frag_nuc
 
-            # Strictly use OAO basis to evaluate density differences
             dm_low_oao = X_inv @ dm_full_ao @ X_inv
             
             error = 0.0
@@ -500,10 +504,10 @@ def kernel(self):
                 # TODO: 0.5 is a hyperparameter. If it oscillates, reduce it (e.g. to 0.1).
                 self.u_oao[idx_mesh] -= 0.5 * diff
             
-            print(f"Macro Iter {macro_iter + 1:2d} | E_DMET = {e_tot:.8f} | max(dD) = {error:.6e}")
+            self.log.info(f"Macro Iter {macro_iter + 1:2d} | E_DMET = {e_tot:.8f} | max(dD) = {error:.6e}")
             self.e_tot = e_tot
             if error < self.macro_tol:
-                print("DMET macroscopic iterations converged.")
+                self.log.info("DMET macroscopic iterations converged.")
                 break
 
         return self.e_tot
diff --git a/gpu4pyscf/dmet/tests/test_dmet.py b/gpu4pyscf/dmet/tests/test_dmet.py
index ee0ba7f6f..81a7b3a0f 100644
--- a/gpu4pyscf/dmet/tests/test_dmet.py
+++ b/gpu4pyscf/dmet/tests/test_dmet.py
@@ -36,7 +36,7 @@ def setUpClass(cls):
         cls.mol.basis = 'sto-3g'
         cls.mol.spin = 0
         cls.mol.charge = 0
-        # cls.mol.verbose = 0
+        cls.mol.verbose = 0
         cls.mol.build()
 
         cls.fragments = [[0, 1], [2, 3]]
@@ -44,11 +44,38 @@ def setUpClass(cls):
         cls.mf_outer = gpu_hf.RHF(cls.mol)
         cls.mf_inner_template = gpu_hf.RHF(cls.mol)
 
+        cls.mol2 = gto.Mole()
+        cls.mol2.atom = '''
+            C      -0.76091    -0.00000     0.00000
+            C       0.76091    -0.00000     0.00000
+            H      -1.16001     1.02029     0.00000
+            H      -1.16001    -0.51014    -0.88357
+            H      -1.16001    -0.51014     0.88357
+            H       1.16001    -1.02029     0.00000
+            H       1.16001     0.51014     0.88357
+            H       1.16001     0.51014    -0.88357    
+        '''
+        cls.mol2.basis = '6-31g'
+        cls.mol2.spin = 0
+        cls.mol2.charge = 0
+        cls.mol2.verbose = 0
+        cls.mol2.build()
+
+        cls.fragments2 = [[0, 2, 3, 4], [1, 5, 6, 7]]
+
+        cls.mf_outer2 = gpu_hf.RHF(cls.mol2)
+        cls.mf_outer2.conv_tol = 1e-12
+        cls.mf_inner_template2 = gpu_hf.RHF(cls.mol2)
+        cls.mf_inner_template2.conv_tol = 1e-12
+
     @classmethod
     def tearDownClass(cls):
         del cls.mol
         del cls.mf_outer
         del cls.mf_inner_template
+        del cls.mol2
+        del cls.mf_outer2
+        del cls.mf_inner_template2
 
     def test_dmet_initialization(self):
         dmet_solver = DMET(
@@ -94,6 +121,21 @@ def test_dmet_execution_and_convergence(self):
         
         assert np.abs(e_tot - e_tot_ref) < 1e-8, "DMET energy should be close to the reference energy."
 
+        dmet_solver2 = DMET(
+            mf_outer=self.mf_outer2,
+            mf_inner=self.mf_inner_template2,
+            fragments=self.fragments2,
+            threshold=1e-5,
+            max_macro_iter=20,
+            macro_tol=1e-3
+        )
+
+        e_tot = dmet_solver2.kernel()
+        self.mf_outer2.mo_coeff = None
+        e_tot_ref = self.mf_outer2.kernel()
+        
+        assert np.abs(e_tot - e_tot_ref) < 1e-8, "DMET energy should be close to the reference energy."
+
 
 if __name__ == '__main__':
     print("Full Tests for DMET")

From a219258bd6982c7e27991aa647d3638490c5b896 Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Fri, 15 May 2026 11:22:25 +0800
Subject: [PATCH 07/30] add a new routine for calculating fragment energies for
 mean field methods

---
 gpu4pyscf/dmet/dmet.py | 46 +++++++++++++++++++++++++-----------------
 1 file changed, 27 insertions(+), 19 deletions(-)

diff --git a/gpu4pyscf/dmet/dmet.py b/gpu4pyscf/dmet/dmet.py
index 6cf499f00..7b53aa4f3 100644
--- a/gpu4pyscf/dmet/dmet.py
+++ b/gpu4pyscf/dmet/dmet.py
@@ -445,24 +445,8 @@ def kernel(self):
                 dm_inner_full_ao = self.dm_core[ifrag] + dm_inner_active_ao
                 dm_inners.append(dm_inner_full_ao)
 
-                # Compute Embedded 4-index ERI for Exact Correlation Energy
-                nemb = B.shape[1]
-                # TODO: this can be replaced by a more efficient routine
-                B_cpu = cp.asnumpy(B)
-                eri_emb_cpu = pyscf.ao2mo.kernel(self.full_mol, B_cpu)
-                eri_emb_cpu = pyscf.ao2mo.restore(1, eri_emb_cpu, nemb) # Restore to 4D array
-                eri_emb = _as_cupy(eri_emb_cpu)
-                
-                # Extract 1-RDM and 2-RDM
                 dm1_emb = dm_emb
-                if hasattr(mf_inner, 'make_rdm2'):
-                    dm2_emb = _as_cupy(mf_inner.make_rdm2())
-                else:
-                    # using the HF 2-RDM formulation
-                    dm2_emb = (cp.einsum('ij,kl->ijkl', dm1_emb, dm1_emb) 
-                               - 0.5 * cp.einsum('il,jk->ijkl', dm1_emb, dm1_emb))
                 
-                # By construction, fragment orbitals are precisely the first n_frag indices
                 n_frag = self.frag_idx[ifrag].size
                 
                 # TODO: this is only works for SCF, even not for DFT or post-HF!
@@ -473,7 +457,31 @@ def kernel(self):
                 # Apply 0.5 factor to core potential to avoid double counting across fragments
                 h_eval = self.h_emb[ifrag] - 0.5 * v_core_emb
                 e_frag_elec = cp.sum(dm1_emb[:n_frag, :] * h_eval[:n_frag, :])
-                e_frag_elec += 0.5 * cp.sum(dm2_emb[:n_frag, :, :, :] * eri_emb[:n_frag, :, :, :])
+
+                # Check if the inner solver is a mean-field template by looking for 'get_veff'
+                is_mean_field = hasattr(self.mf_inner_template, 'get_veff')
+
+                if not is_mean_field:
+                    self.log.info("using non-mean-field solver")
+                    nemb = B.shape[1]
+                    # TODO: this can be replaced by a more efficient routine
+                    B_cpu = cp.asnumpy(B)
+                    eri_emb_cpu = pyscf.ao2mo.kernel(self.full_mol, B_cpu)
+                    eri_emb_cpu = pyscf.ao2mo.restore(1, eri_emb_cpu, nemb) # Restore to 4D array
+                    eri_emb = _as_cupy(eri_emb_cpu)
+                    
+                    if hasattr(mf_inner, 'make_rdm2'):
+                        dm2_emb = _as_cupy(mf_inner.make_rdm2())
+                    else:
+                        # Fallback using the HF 2-RDM formulation for post-HF methods lacking make_rdm2
+                        dm2_emb = (cp.einsum('ij,kl->ijkl', dm1_emb, dm1_emb) 
+                                   - 0.5 * cp.einsum('il,jk->ijkl', dm1_emb, dm1_emb))
+                    
+                    e_frag_elec += 0.5 * cp.sum(dm2_emb[:n_frag, :, :, :] * eri_emb[:n_frag, :, :, :])
+                else:
+                    self.log.info("using mean-field solver")
+                    vj_emb, vk_emb = mf_inner.get_jk(dm=dm1_emb)
+                    e_frag_elec += 0.5 * cp.sum(dm1_emb[:n_frag, :] * (_as_cupy(vj_emb) - 0.5 * _as_cupy(vk_emb))[:n_frag, :])
                 
                 e_frag_nuc = 0.0
                 coords = self.full_mol.atom_coords()
@@ -504,10 +512,10 @@ def kernel(self):
                 # TODO: 0.5 is a hyperparameter. If it oscillates, reduce it (e.g. to 0.1).
                 self.u_oao[idx_mesh] -= 0.5 * diff
             
-            self.log.info(f"Macro Iter {macro_iter + 1:2d} | E_DMET = {e_tot:.8f} | max(dD) = {error:.6e}")
+            self.log.note(f"Macro Iter {macro_iter + 1:2d} | E_DMET = {e_tot:.8f} | max(dD) = {error:.6e}")
             self.e_tot = e_tot
             if error < self.macro_tol:
-                self.log.info("DMET macroscopic iterations converged.")
+                self.log.note("DMET macroscopic iterations converged.")
                 break
 
         return self.e_tot

From 0437ddeff73e1f1076ceb08f8df25b887b6a93b0 Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Fri, 15 May 2026 14:36:42 +0800
Subject: [PATCH 08/30] WIP: adding unit tests for Schmidt decomposition and DFT

---
 gpu4pyscf/dmet/dmet.py            | 61 +++++++++++++------------------
 gpu4pyscf/dmet/tests/test_dmet.py | 29 ++++++++++++++-
 2 files changed, 52 insertions(+), 38 deletions(-)

diff --git a/gpu4pyscf/dmet/dmet.py b/gpu4pyscf/dmet/dmet.py
index 7b53aa4f3..7e8d75df7 100644
--- a/gpu4pyscf/dmet/dmet.py
+++ b/gpu4pyscf/dmet/dmet.py
@@ -19,7 +19,7 @@
 from pyscf import gto
 from pyscf import lib
 from gpu4pyscf.lib import logger
-import pyscf.ao2mo  # Added for exact 4-index ERI transformation
+import pyscf.ao2mo
 
 
 def _as_cupy(x):
@@ -257,6 +257,7 @@ def __init__(self, mf_outer, mf_inner, fragments,
         self.B_oao = [None] * self.nfrags
         self.B = [None] * self.nfrags
         self.dm_core = [None] * self.nfrags
+        self.v_core_ao = [None] * self.nfrags
         self.h_emb = [None] * self.nfrags
         self.e_core = [None] * self.nfrags
         self.mf_inner = [None] * self.nfrags
@@ -302,10 +303,11 @@ def build_embedded_hamiltonian(self, ifrag, hcore_orig):
         h_ao = _as_cupy(hcore_orig)
 
         if self.eig_info[ifrag]['n_core_electrons'] > 0:
-            vj_core, vk_core = self.mf_outer.get_jk(mol, self.dm_core[ifrag])
-            v_core_ao = _as_cupy(vj_core) - 0.5 * _as_cupy(vk_core)
+            v_core_ao = _as_cupy(self.mf_outer.get_veff(mol, self.dm_core[ifrag]))
         else:
             v_core_ao = cp.zeros_like(h_ao)
+            
+        self.v_core_ao[ifrag] = v_core_ao
 
         h_emb = transform_h1(h_ao + v_core_ao, self.B[ifrag])
 
@@ -344,9 +346,8 @@ def _build_inner_mf(self, ifrag, dm_full_ao):
         mf_inner.get_ovlp = lambda *args, **kwargs: ovlp
         mf_inner.energy_nuc = lambda *args, **kwargs: e_nuc + self.e_core[ifrag]
 
-        # Overwrite get_jk to compute J and K on-the-fly using the outer MF
-        # without computing or storing 4-index ERIs.
-        def _get_jk(mol=None, dm=None, hermi=1, with_j=True, with_k=True, omega=None):
+        # Overwrite get_veff to compute on-the-fly using the outer MF
+        def _get_veff(mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
             if dm is None:
                 dm = mf_inner.make_rdm1()
             dm_cp = _as_cupy(dm)
@@ -358,33 +359,21 @@ def _get_jk(mol=None, dm=None, hermi=1, with_j=True, with_k=True, omega=None):
             else:
                 dm_ao = cp.einsum('pi,xij,qj->xpq', B_mat, dm_cp, B_mat)
                 
-            # Compute J and K in full AO basis using outer SCF's optimized routine
-            vj_ao, vk_ao = self.mf_outer.get_jk(self.full_mol, dm_ao, hermi, with_j, with_k, omega)
+            dm_full_ao = self.dm_core[ifrag] + dm_ao
             
-            # Project J and K back to embedded basis
-            vj_emb = vk_emb = None
-            if vj_ao is not None:
-                if dm_cp.ndim == 2:
-                    vj_emb = B_mat.T @ vj_ao @ B_mat
-                else:
-                    vj_emb = cp.einsum('pi,xpq,qj->xij', B_mat, vj_ao, B_mat)
-            if vk_ao is not None:
-                if dm_cp.ndim == 2:
-                    vk_emb = B_mat.T @ vk_ao @ B_mat
-                else:
-                    vk_emb = cp.einsum('pi,xpq,qj->xij', B_mat, vk_ao, B_mat)
+            # Compute Veff in full AO basis using outer SCF's optimized routine
+            v_eff_full = self.mf_outer.get_veff(self.full_mol, dm_full_ao, hermi=hermi)
+            v_eff_active = _as_cupy(v_eff_full) - self.v_core_ao[ifrag]
+            
+            # Project Veff back to embedded basis
+            if dm_cp.ndim == 2:
+                v_eff_emb = B_mat.T @ v_eff_active @ B_mat
+            else:
+                v_eff_emb = cp.einsum('pi,xpq,qj->xij', B_mat, v_eff_active, B_mat)
                     
-            return vj_emb, vk_emb
-
-        mf_inner.get_jk = _get_jk
+            return v_eff_emb
 
-        # TODO: this is only works for SCF, even not for DFT or post-HF!
-        def _get_veff(mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
-            if dm is None:
-                dm = mf_inner.make_rdm1()
-            vj, vk = _get_jk(mol, dm, hermi=hermi)
-            return vj - 0.5 * vk
-        
+        # TODO: this is only works for HF/DFT, not for post-HF!
         mf_inner.get_veff = _get_veff
         
         # using s to make the upper index to the lower index
@@ -449,19 +438,19 @@ def kernel(self):
                 
                 n_frag = self.frag_idx[ifrag].size
                 
-                # TODO: this is only works for SCF, even not for DFT or post-HF!
-                vj_core, vk_core = self.mf_outer.get_jk(self.full_mol, self.dm_core[ifrag])
-                v_core_ao = _as_cupy(vj_core) - 0.5 * _as_cupy(vk_core)
+                # TODO: this is only works for HF/DFT, not for post-HF!
+                v_core_ao = self.v_core_ao[ifrag]
                 v_core_emb = B.T @ v_core_ao @ B
                 
                 # Apply 0.5 factor to core potential to avoid double counting across fragments
                 h_eval = self.h_emb[ifrag] - 0.5 * v_core_emb
                 e_frag_elec = cp.sum(dm1_emb[:n_frag, :] * h_eval[:n_frag, :])
 
-                # Check if the inner solver is a mean-field template by looking for 'get_veff'
+                # Check if the inner solver is a mean-field template
                 is_mean_field = hasattr(self.mf_inner_template, 'get_veff')
 
                 if not is_mean_field:
+                    raise NotImplementedError("Only mean-field solver is supported for DMET.")
                     self.log.info("using non-mean-field solver")
                     nemb = B.shape[1]
                     # TODO: this can be replaced by a more efficient routine
@@ -480,8 +469,8 @@ def kernel(self):
                     e_frag_elec += 0.5 * cp.sum(dm2_emb[:n_frag, :, :, :] * eri_emb[:n_frag, :, :, :])
                 else:
                     self.log.info("using mean-field solver")
-                    vj_emb, vk_emb = mf_inner.get_jk(dm=dm1_emb)
-                    e_frag_elec += 0.5 * cp.sum(dm1_emb[:n_frag, :] * (_as_cupy(vj_emb) - 0.5 * _as_cupy(vk_emb))[:n_frag, :])
+                    v_eff_emb = mf_inner.get_veff(dm=dm1_emb)
+                    e_frag_elec += 0.5 * cp.sum(dm1_emb[:n_frag, :] * _as_cupy(v_eff_emb)[:n_frag, :])
                 
                 e_frag_nuc = 0.0
                 coords = self.full_mol.atom_coords()
diff --git a/gpu4pyscf/dmet/tests/test_dmet.py b/gpu4pyscf/dmet/tests/test_dmet.py
index 81a7b3a0f..646d07044 100644
--- a/gpu4pyscf/dmet/tests/test_dmet.py
+++ b/gpu4pyscf/dmet/tests/test_dmet.py
@@ -18,6 +18,7 @@
 import cupy as cp
 from pyscf import gto
 from gpu4pyscf.scf import hf as gpu_hf
+from gpu4pyscf.dft import rks
 from gpu4pyscf.dmet import DMET 
 from gpu4pyscf import dmet
 
@@ -68,6 +69,11 @@ def setUpClass(cls):
         cls.mf_inner_template2 = gpu_hf.RHF(cls.mol2)
         cls.mf_inner_template2.conv_tol = 1e-12
 
+        cls.mf_outer3 = rks.RKS(cls.mol2)
+        cls.mf_outer3.conv_tol = 1e-12
+        cls.mf_inner_template3 = rks.RKS(cls.mol2)
+        cls.mf_inner_template3.conv_tol = 1e-12
+
     @classmethod
     def tearDownClass(cls):
         del cls.mol
@@ -95,15 +101,34 @@ def test_dmet_initialization(self):
 
     def test_lowdin(self):
         ovlp = self.mf_outer.get_ovlp()
-        X, _ = dmet.dmet.lowdin_orth(ovlp)
+        X, X_inv = dmet.dmet.lowdin_orth(ovlp)
         X_ref = cp.array([[ 1.1214051976, -0.3278815514,  0.0611473762, -0.0095874461],
                           [-0.3278815514,  1.2643824327, -0.3597401082,  0.0611473762],
                           [ 0.0611473762, -0.3597401082,  1.2643824327, -0.3278815514],
                           [-0.0095874461,  0.0611473762, -0.3278815514,  1.1214051976]])
+        identity = cp.eye(4)
+        assert np.abs(X@X_inv - identity).max() < 1e-8, "Lowdin orthogonalization should yield an identity matrix."
         assert np.abs(X - X_ref).max() < 1e-8, "Lowdin orthogonalization should yield a close-to-identity matrix."
 
     def test_schmidt(self):
-        pass
+        mol = gto.Mole()
+        mol.atom = '''
+            H 0.0 0.0 0.0
+            H 0.0 0.0 1.0
+            H 0.0 0.0 2.0
+            H 0.0 0.0 3.0
+        '''
+        mol.basis = '6-31g'
+        mol.spin = 0
+        mol.charge = 0
+        mol.verbose = 0
+        mol.build()
+
+        mf = gpu_hf.RHF(mol)
+        mf.kernel()
+        
+        s = 
+
 
     def test_dmet_execution_and_convergence(self):
         dmet_solver = DMET(

From 7e3f8461f8def6af5d773ea611a9f7fa0c637484 Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Mon, 18 May 2026 09:13:07 +0800
Subject: [PATCH 09/30] add unit tests

---
 gpu4pyscf/dmet/tests/test_dmet.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/gpu4pyscf/dmet/tests/test_dmet.py b/gpu4pyscf/dmet/tests/test_dmet.py
index 646d07044..d559f13e7 100644
--- a/gpu4pyscf/dmet/tests/test_dmet.py
+++ b/gpu4pyscf/dmet/tests/test_dmet.py
@@ -127,8 +127,19 @@ def test_schmidt(self):
         mf = gpu_hf.RHF(mol)
         mf.kernel()
         
-        s = 
-
+        s = mf.get_ovlp()
+        mo_coeff = mf.mo_coeff
+        X, X_inv = dmet.dmet.lowdin_orth(s)
+        mo_coeff_oao = X@mo_coeff
+        C_occ = mo_coeff_oao[:, :2]
+        C_A = mo_coeff_oao[:4, :2]
+        U, S, Vh = cp.linalg.svd(C_A, full_matrices=True)
+        C_rot = C_occ @ Vh.T
+        bath_orb_ref = C_rot[4:]
+        norms = cp.linalg.norm(bath_orb_ref, axis=0)
+        bath_orb_ref /= norms
+        bath_orb = dmet.dmet.schmidt_decompose(mo_coeff_oao, mf.mo_occ, [0,1,2,3], [4,5,6,7])[0]
+        assert np.abs(bath_orb.get() - bath_orb_ref.get()).max() < 1e-8, "Schmidt decomposition should yield close-to-identity matrices."
 
     def test_dmet_execution_and_convergence(self):
         dmet_solver = DMET(

From fc6767a3877c334785af212d00bedc3fddca5acb Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Mon, 18 May 2026 10:27:59 +0800
Subject: [PATCH 10/30] DFT runnable, needs debugging

---
 gpu4pyscf/dmet/dmet.py | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/gpu4pyscf/dmet/dmet.py b/gpu4pyscf/dmet/dmet.py
index 7e8d75df7..689733419 100644
--- a/gpu4pyscf/dmet/dmet.py
+++ b/gpu4pyscf/dmet/dmet.py
@@ -19,6 +19,7 @@
 from pyscf import gto
 from pyscf import lib
 from gpu4pyscf.lib import logger
+from gpu4pyscf.lib.cupy_helper import tag_array
 import pyscf.ao2mo
 
 
@@ -370,6 +371,19 @@ def _get_veff(mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
                 v_eff_emb = B_mat.T @ v_eff_active @ B_mat
             else:
                 v_eff_emb = cp.einsum('pi,xpq,qj->xij', B_mat, v_eff_active, B_mat)
+            
+            ecoul = getattr(v_eff_full, 'ecoul', 0.0)
+            exc = getattr(v_eff_full, 'exc', 0.0)
+            if hasattr(v_eff_full, 'vj'): 
+                vj = getattr(v_eff_full, 'vj')
+            else:
+                vj = cp.zeros_like(v_eff_emb)
+            if hasattr(v_eff_full, 'vk'): 
+                vk = getattr(v_eff_full, 'vk')
+            else:
+                vk = cp.zeros_like(v_eff_emb)
+            
+            v_eff_emb = tag_array(v_eff_emb, ecoul=ecoul, exc=exc, vj=vj, vk=vk)
                     
             return v_eff_emb
 
@@ -399,6 +413,7 @@ def solve_embedded(self, ifrag):
         return e_inner
 
     def kernel(self):
+        orig_outer_get_hcore = self.mf_outer.get_hcore
         hcore_orig = _as_cupy(self.mf_outer.get_hcore())
         s_ao = _as_cupy(self.mf_outer.get_ovlp())
         X, X_inv = lowdin_orth(s_ao)
@@ -506,6 +521,19 @@ def kernel(self):
             if error < self.macro_tol:
                 self.log.note("DMET macroscopic iterations converged.")
                 break
+        
+        # Restore outer mean-field to its original unpolluted state
+        self.mf_outer.get_hcore = orig_outer_get_hcore
+        self.mf_outer.mo_coeff = None
+        self.mf_outer.mo_energy = None
+        self.mf_outer.mo_occ = None
+        
+        # Free up memory and break closures in inner mean-fields
+        for ifrag in range(self.nfrags):
+            if self.mf_inner[ifrag] is not None:
+                self.mf_inner[ifrag].mo_coeff = None
+                self.mf_inner[ifrag].mo_occ = None
+                self.mf_inner[ifrag].mo_energy = None
 
         return self.e_tot
 

From d71f94b228962c4e545d2dad996fa5ea77bc9ed9 Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Wed, 20 May 2026 12:34:53 +0800
Subject: [PATCH 11/30] move the code

---
 .../{dmet => qmmm/embedding}/__init__.py      |   2 +-
 .../dmet.py => qmmm/embedding/embedding.py}   | 194 ++++++++++++------
 .../embedding/tests/test_dmet_embeding.py}    |  35 +++-
 3 files changed, 163 insertions(+), 68 deletions(-)
 rename gpu4pyscf/{dmet => qmmm/embedding}/__init__.py (95%)
 rename gpu4pyscf/{dmet/dmet.py => qmmm/embedding/embedding.py} (67%)
 rename gpu4pyscf/{dmet/tests/test_dmet.py => qmmm/embedding/tests/test_dmet_embeding.py} (80%)

diff --git a/gpu4pyscf/dmet/__init__.py b/gpu4pyscf/qmmm/embedding/__init__.py
similarity index 95%
rename from gpu4pyscf/dmet/__init__.py
rename to gpu4pyscf/qmmm/embedding/__init__.py
index 3b9c8ea05..e829ae4bb 100644
--- a/gpu4pyscf/dmet/__init__.py
+++ b/gpu4pyscf/qmmm/embedding/__init__.py
@@ -13,4 +13,4 @@
 # limitations under the License.
 
 
-from .dmet import DMET
+from .embedding import DMET
diff --git a/gpu4pyscf/dmet/dmet.py b/gpu4pyscf/qmmm/embedding/embedding.py
similarity index 67%
rename from gpu4pyscf/dmet/dmet.py
rename to gpu4pyscf/qmmm/embedding/embedding.py
index 689733419..e439ecd25 100644
--- a/gpu4pyscf/dmet/dmet.py
+++ b/gpu4pyscf/qmmm/embedding/embedding.py
@@ -63,9 +63,6 @@ def get_fragment_ao_indices(mol, frag_atoms):
 
 
 def schmidt_decompose(mo_coeff_oao, mo_occ, frag_idx, env_idx, threshold=1e-5):
-    """
-    Strictly follows the original 2012 DMET formulation.
-    """
     mo_coeff_oao = _as_cupy(mo_coeff_oao)
     mo_occ = _as_cupy(mo_occ)
     env_idx = _as_cupy(env_idx)
@@ -216,10 +213,13 @@ class DMET(lib.StreamObject):
         Maximum number of macroscopic iterations for correlation potential (u).
     macro_tol : float
         Convergence tolerance for the difference in fragment 1-RDMs.
+    energy_method : str
+        Method for calculating the total energy: 'direct' or 'delta'.
     """
 
     def __init__(self, mf_outer, mf_inner, fragments,
-                 threshold=1e-5, max_macro_iter=20, macro_tol=1e-4, verbose=None):
+                 threshold=1e-5, max_macro_iter=20, macro_tol=1e-4, 
+                 energy_method='direct', verbose=None):
         if mf_outer is None or mf_inner is None:
             raise ValueError("mf_outer and mf_inner are both required.")
         if not fragments:
@@ -236,6 +236,10 @@ def __init__(self, mf_outer, mf_inner, fragments,
         self.threshold = float(threshold)
         self.max_macro_iter = max_macro_iter
         self.macro_tol = macro_tol
+        
+        self.energy_method = energy_method.lower()
+        if self.energy_method not in ['direct', 'delta']:
+            raise ValueError("energy_method must be 'direct' or 'delta'")
 
         self.fragments = [list(int(a) for a in frag) for frag in fragments]
         self.nfrags = len(self.fragments)
@@ -323,7 +327,6 @@ def build_embedded_hamiltonian(self, ifrag, hcore_orig):
         return self
 
     def _build_inner_mf(self, ifrag, dm_full_ao):
-        # TODO: Handle post-HF case!
         nemb = self.B[ifrag].shape[1]
         n_total_electrons = int(self.full_mol.nelectron)
         n_emb_electrons = n_total_electrons - int(self.eig_info[ifrag]['n_core_electrons'])
@@ -338,21 +341,33 @@ def _build_inner_mf(self, ifrag, dm_full_ao):
 
         mf_inner = _instantiate_inner_mf(self.mf_inner_template, emb_mol)
 
-        h_emb = self.h_emb[ifrag]
+        B_mat = self.B[ifrag]
+        
+        if hasattr(self.mf_inner_template, 'get_veff'):
+            v_core_inner_ao = _as_cupy(self.mf_inner_template.get_veff(self.full_mol, self.dm_core[ifrag]))
+        else:
+            v_core_inner_ao = cp.zeros_like(self.dm_core[ifrag])
+
+        h_ao = _as_cupy(self.mf_outer.get_hcore())
+        # The inner Hamiltonian gets the strict high-level background potential
+        h_emb_inner = B_mat.T @ (h_ao + v_core_inner_ao) @ B_mat
         ovlp = cp.eye(nemb)
 
-        # Base energy offset for debugging per fragment
         e_nuc = float(self.full_mol.energy_nuc())
-        mf_inner.get_hcore = lambda *args, **kwargs: h_emb
+        mf_inner.get_hcore = lambda *args, **kwargs: h_emb_inner
         mf_inner.get_ovlp = lambda *args, **kwargs: ovlp
-        mf_inner.energy_nuc = lambda *args, **kwargs: e_nuc + self.e_core[ifrag]
+        
+        # Energy offset for inner solver debugging aligns with inner core potential
+        # This 0.5 will be removed for 1-fragment systems.
+        e_core_inner = float(cp.einsum('ij,ji->', self.dm_core[ifrag], h_ao) + 
+                             0.5 * cp.einsum('ij,ji->', self.dm_core[ifrag], v_core_inner_ao))
+        mf_inner.energy_nuc = lambda *args, **kwargs: e_nuc + e_core_inner
 
-        # Overwrite get_veff to compute on-the-fly using the outer MF
+        # Overwrite get_veff to compute on-the-fly using the inner template
         def _get_veff(mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
             if dm is None:
                 dm = mf_inner.make_rdm1()
             dm_cp = _as_cupy(dm)
-            B_mat = self.B[ifrag]
             
             # Project embedded dm to full AO basis
             if dm_cp.ndim == 2:
@@ -360,11 +375,11 @@ def _get_veff(mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
             else:
                 dm_ao = cp.einsum('pi,xij,qj->xpq', B_mat, dm_cp, B_mat)
                 
-            dm_full_ao = self.dm_core[ifrag] + dm_ao
+            dm_full_ao_inner = self.dm_core[ifrag] + dm_ao
             
-            # Compute Veff in full AO basis using outer SCF's optimized routine
-            v_eff_full = self.mf_outer.get_veff(self.full_mol, dm_full_ao, hermi=hermi)
-            v_eff_active = _as_cupy(v_eff_full) - self.v_core_ao[ifrag]
+            # [FIXED] Compute Veff in full AO basis using inner template strictly
+            v_eff_full = self.mf_inner_template.get_veff(self.full_mol, dm_full_ao_inner, hermi=hermi)
+            v_eff_active = _as_cupy(v_eff_full) - v_core_inner_ao
             
             # Project Veff back to embedded basis
             if dm_cp.ndim == 2:
@@ -387,7 +402,6 @@ def _get_veff(mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
                     
             return v_eff_emb
 
-        # TODO: this is only works for HF/DFT, not for post-HF!
         mf_inner.get_veff = _get_veff
         
         # using s to make the upper index to the lower index
@@ -430,8 +444,13 @@ def kernel(self):
             mo_coeff = _as_cupy(self.mf_outer.mo_coeff)
             mo_occ = _as_cupy(self.mf_outer.mo_occ)
             dm_full_ao = _as_cupy(self.mf_outer.make_rdm1())
-
-            e_tot = 0.0
+            
+            if self.energy_method == 'delta':
+                # Remove the correlation potential penalty from the total energy to get the physical base energy
+                e_tot = self.mf_outer.e_tot - float(cp.sum(dm_full_ao * u_ao))
+            else:
+                e_tot = 0.0
+                
             dm_inners = []
 
             for ifrag in range(self.nfrags):
@@ -453,52 +472,112 @@ def kernel(self):
                 
                 n_frag = self.frag_idx[ifrag].size
                 
-                # TODO: this is only works for HF/DFT, not for post-HF!
+                # Outer (Low-level) environment embedding
                 v_core_ao = self.v_core_ao[ifrag]
                 v_core_emb = B.T @ v_core_ao @ B
                 
                 # Apply 0.5 factor to core potential to avoid double counting across fragments
+                # TODO: The 0.5 factor should be removed for ONIOM energy of just 1 fragment.
                 h_eval = self.h_emb[ifrag] - 0.5 * v_core_emb
-                e_frag_elec = cp.sum(dm1_emb[:n_frag, :] * h_eval[:n_frag, :])
-
-                # Check if the inner solver is a mean-field template
+                
                 is_mean_field = hasattr(self.mf_inner_template, 'get_veff')
 
-                if not is_mean_field:
-                    raise NotImplementedError("Only mean-field solver is supported for DMET.")
-                    self.log.info("using non-mean-field solver")
-                    nemb = B.shape[1]
-                    # TODO: this can be replaced by a more efficient routine
-                    B_cpu = cp.asnumpy(B)
-                    eri_emb_cpu = pyscf.ao2mo.kernel(self.full_mol, B_cpu)
-                    eri_emb_cpu = pyscf.ao2mo.restore(1, eri_emb_cpu, nemb) # Restore to 4D array
-                    eri_emb = _as_cupy(eri_emb_cpu)
-                    
-                    if hasattr(mf_inner, 'make_rdm2'):
-                        dm2_emb = _as_cupy(mf_inner.make_rdm2())
+                # [FIXED] Inner (High-level) evaluation uses its own core functional to prevent cross-talk
+                if is_mean_field:
+                    v_core_inner_ao = _as_cupy(self.mf_inner_template.get_veff(self.full_mol, self.dm_core[ifrag]))
+                    v_core_inner_emb = B.T @ v_core_inner_ao @ B
+                    h_ao = _as_cupy(hcore_orig)
+                    h_emb_inner = B.T @ (h_ao + v_core_inner_ao) @ B
+                    h_eval_high = h_emb_inner - 0.5 * v_core_inner_emb
+                else:
+                    h_eval_high = h_eval
+
+                if self.energy_method == 'direct':
+                    e_frag_elec = cp.sum(dm1_emb[:n_frag, :] * h_eval_high[:n_frag, :])
+                    if not is_mean_field:
+                        raise NotImplementedError("Only mean-field solver is supported for DMET.")
+                        self.log.info("using non-mean-field solver")
+                        nemb = B.shape[1]
+                        # TODO: this can be replaced by a more efficient routine
+                        B_cpu = cp.asnumpy(B)
+                        eri_emb_cpu = pyscf.ao2mo.kernel(self.full_mol, B_cpu)
+                        eri_emb_cpu = pyscf.ao2mo.restore(1, eri_emb_cpu, nemb) # Restore to 4D array
+                        eri_emb = _as_cupy(eri_emb_cpu)
+                        
+                        if hasattr(mf_inner, 'make_rdm2'):
+                            dm2_emb = _as_cupy(mf_inner.make_rdm2())
+                        else:
+                            # Fallback using the HF 2-RDM formulation for post-HF methods lacking make_rdm2
+                            dm2_emb = (cp.einsum('ij,kl->ijkl', dm1_emb, dm1_emb) 
+                                       - 0.5 * cp.einsum('il,jk->ijkl', dm1_emb, dm1_emb))
+                        
+                        e_frag_elec += 0.5 * cp.sum(dm2_emb[:n_frag, :, :, :] * eri_emb[:n_frag, :, :, :])
                     else:
-                        # Fallback using the HF 2-RDM formulation for post-HF methods lacking make_rdm2
-                        dm2_emb = (cp.einsum('ij,kl->ijkl', dm1_emb, dm1_emb) 
-                                   - 0.5 * cp.einsum('il,jk->ijkl', dm1_emb, dm1_emb))
+                        self.log.info("using mean-field solver")
+                        v_eff_emb = mf_inner.get_veff(dm=dm1_emb)
+                        e_frag_elec += 0.5 * cp.sum(dm1_emb[:n_frag, :] * _as_cupy(v_eff_emb)[:n_frag, :])
                     
-                    e_frag_elec += 0.5 * cp.sum(dm2_emb[:n_frag, :, :, :] * eri_emb[:n_frag, :, :, :])
-                else:
-                    self.log.info("using mean-field solver")
-                    v_eff_emb = mf_inner.get_veff(dm=dm1_emb)
-                    e_frag_elec += 0.5 * cp.sum(dm1_emb[:n_frag, :] * _as_cupy(v_eff_emb)[:n_frag, :])
-                
-                e_frag_nuc = 0.0
-                coords = self.full_mol.atom_coords()
-                charges = self.full_mol.atom_charges()
-                frag_atoms = self.fragments[ifrag]
-                for i in frag_atoms:
-                    for j in range(self.full_mol.natm):
-                        if i == j: continue
-                        r = np.linalg.norm(coords[i] - coords[j])
-                        e_frag_nuc += 0.5 * charges[i] * charges[j] / r
+                    e_frag_nuc = 0.0
+                    coords = self.full_mol.atom_coords()
+                    charges = self.full_mol.atom_charges()
+                    frag_atoms = self.fragments[ifrag]
+                    for i in frag_atoms:
+                        for j in range(self.full_mol.natm):
+                            if i == j: continue
+                            r = np.linalg.norm(coords[i] - coords[j])
+                            e_frag_nuc += 0.5 * charges[i] * charges[j] / r
+                            
+                    self.log.info(f"Fragment {ifrag} Electronic Energy: {float(e_frag_elec):.8f} | Nuclear Energy: {e_frag_nuc:.8f}")
+                    e_tot += float(e_frag_elec) + e_frag_nuc
+
+                elif self.energy_method == 'delta':
+                    dm1_emb_high = dm1_emb
+                    dm1_emb_low = self.dm_emb_init[ifrag]
+                    
+                    # Compute High-Level pseudo energy (using strictly high-level core potential evaluation)
+                    e_high = cp.sum(dm1_emb_high[:n_frag, :] * h_eval_high[:n_frag, :])
+                    
+                    # Compute Low-Level pseudo energy (using strictly low-level core potential evaluation)
+                    e_low = cp.sum(dm1_emb_low[:n_frag, :] * h_eval[:n_frag, :])
+                    
+                    if not is_mean_field:
+                        raise NotImplementedError("Only mean-field solver is supported for DMET.")
+                        self.log.info("using non-mean-field solver")
+                        nemb = B.shape[1]
+                        B_cpu = cp.asnumpy(B)
+                        eri_emb_cpu = pyscf.ao2mo.kernel(self.full_mol, B_cpu)
+                        eri_emb_cpu = pyscf.ao2mo.restore(1, eri_emb_cpu, nemb)
+                        eri_emb = _as_cupy(eri_emb_cpu)
                         
-                self.log.info(f"Fragment {ifrag} Electronic Energy: {float(e_frag_elec):.8f} | Nuclear Energy: {e_frag_nuc:.8f}")
-                e_tot += float(e_frag_elec) + e_frag_nuc
+                        if hasattr(mf_inner, 'make_rdm2'):
+                            dm2_emb_high = _as_cupy(mf_inner.make_rdm2())
+                        else:
+                            dm2_emb_high = (cp.einsum('ij,kl->ijkl', dm1_emb_high, dm1_emb_high) 
+                                       - 0.5 * cp.einsum('il,jk->ijkl', dm1_emb_high, dm1_emb_high))
+                        e_high += 0.5 * cp.sum(dm2_emb_high[:n_frag, :, :, :] * eri_emb[:n_frag, :, :, :])
+                        
+                        # Low-level is always un-correlated 2-RDM
+                        dm2_emb_low = (cp.einsum('ij,kl->ijkl', dm1_emb_low, dm1_emb_low) 
+                                       - 0.5 * cp.einsum('il,jk->ijkl', dm1_emb_low, dm1_emb_low))
+                        e_low += 0.5 * cp.sum(dm2_emb_low[:n_frag, :, :, :] * eri_emb[:n_frag, :, :, :])
+                    else:
+                        self.log.info("using mean-field solver")
+                        v_eff_emb_high = mf_inner.get_veff(dm=dm1_emb_high)
+                        e_high += 0.5 * cp.sum(dm1_emb_high[:n_frag, :] * _as_cupy(v_eff_emb_high)[:n_frag, :])
+                        
+                        # [FIXED] Compute Veff for the low-level density explicitly using the outer functional
+                        dm_ao_low = B @ dm1_emb_low @ B.T
+                        dm_full_ao_low = self.dm_core[ifrag] + dm_ao_low
+                        
+                        v_eff_full_low = self.mf_outer.get_veff(self.full_mol, dm_full_ao_low)
+                        v_eff_active_low = _as_cupy(v_eff_full_low) - self.v_core_ao[ifrag]
+                        v_eff_emb_low = B.T @ v_eff_active_low @ B
+                        
+                        e_low += 0.5 * cp.sum(dm1_emb_low[:n_frag, :] * v_eff_emb_low[:n_frag, :])
+                    
+                    delta_e = float(e_high - e_low)
+                    self.log.info(f"Fragment {ifrag} Delta E (Correlation Improvement): {delta_e:.8f}")
+                    e_tot += delta_e
 
             dm_low_oao = X_inv @ dm_full_ao @ X_inv
             
@@ -527,13 +606,6 @@ def kernel(self):
         self.mf_outer.mo_coeff = None
         self.mf_outer.mo_energy = None
         self.mf_outer.mo_occ = None
-        
-        # Free up memory and break closures in inner mean-fields
-        for ifrag in range(self.nfrags):
-            if self.mf_inner[ifrag] is not None:
-                self.mf_inner[ifrag].mo_coeff = None
-                self.mf_inner[ifrag].mo_occ = None
-                self.mf_inner[ifrag].mo_energy = None
 
         return self.e_tot
 
diff --git a/gpu4pyscf/dmet/tests/test_dmet.py b/gpu4pyscf/qmmm/embedding/tests/test_dmet_embeding.py
similarity index 80%
rename from gpu4pyscf/dmet/tests/test_dmet.py
rename to gpu4pyscf/qmmm/embedding/tests/test_dmet_embeding.py
index d559f13e7..8f55b69f0 100644
--- a/gpu4pyscf/dmet/tests/test_dmet.py
+++ b/gpu4pyscf/qmmm/embedding/tests/test_dmet_embeding.py
@@ -19,8 +19,8 @@
 from pyscf import gto
 from gpu4pyscf.scf import hf as gpu_hf
 from gpu4pyscf.dft import rks
-from gpu4pyscf.dmet import DMET 
-from gpu4pyscf import dmet
+from gpu4pyscf.qmmm.embedding.embedding import DMET 
+from gpu4pyscf.qmmm.embedding import embedding
 
 
 class KnownValues(unittest.TestCase):
@@ -43,7 +43,9 @@ def setUpClass(cls):
         cls.fragments = [[0, 1], [2, 3]]
 
         cls.mf_outer = gpu_hf.RHF(cls.mol)
+        cls.mf_outer.conv_tol = 1e-14
         cls.mf_inner_template = gpu_hf.RHF(cls.mol)
+        cls.mf_inner_template.conv_tol = 1e-14
 
         cls.mol2 = gto.Mole()
         cls.mol2.atom = '''
@@ -101,7 +103,7 @@ def test_dmet_initialization(self):
 
     def test_lowdin(self):
         ovlp = self.mf_outer.get_ovlp()
-        X, X_inv = dmet.dmet.lowdin_orth(ovlp)
+        X, X_inv = embedding.lowdin_orth(ovlp)
         X_ref = cp.array([[ 1.1214051976, -0.3278815514,  0.0611473762, -0.0095874461],
                           [-0.3278815514,  1.2643824327, -0.3597401082,  0.0611473762],
                           [ 0.0611473762, -0.3597401082,  1.2643824327, -0.3278815514],
@@ -129,7 +131,7 @@ def test_schmidt(self):
         
         s = mf.get_ovlp()
         mo_coeff = mf.mo_coeff
-        X, X_inv = dmet.dmet.lowdin_orth(s)
+        X, X_inv = embedding.lowdin_orth(s)
         mo_coeff_oao = X@mo_coeff
         C_occ = mo_coeff_oao[:, :2]
         C_A = mo_coeff_oao[:4, :2]
@@ -138,7 +140,7 @@ def test_schmidt(self):
         bath_orb_ref = C_rot[4:]
         norms = cp.linalg.norm(bath_orb_ref, axis=0)
         bath_orb_ref /= norms
-        bath_orb = dmet.dmet.schmidt_decompose(mo_coeff_oao, mf.mo_occ, [0,1,2,3], [4,5,6,7])[0]
+        bath_orb = embedding.schmidt_decompose(mo_coeff_oao, mf.mo_occ, [0,1,2,3], [4,5,6,7])[0]
         assert np.abs(bath_orb.get() - bath_orb_ref.get()).max() < 1e-8, "Schmidt decomposition should yield close-to-identity matrices."
 
     def test_dmet_execution_and_convergence(self):
@@ -156,7 +158,8 @@ def test_dmet_execution_and_convergence(self):
         e_tot_ref = self.mf_outer.kernel()
         
         assert np.abs(e_tot - e_tot_ref) < 1e-8, "DMET energy should be close to the reference energy."
-
+        assert np.abs(dmet_solver.u_oao).sum() < 1e-8, "Correlation potential should be close to zero."
+        
         dmet_solver2 = DMET(
             mf_outer=self.mf_outer2,
             mf_inner=self.mf_inner_template2,
@@ -169,8 +172,28 @@ def test_dmet_execution_and_convergence(self):
         e_tot = dmet_solver2.kernel()
         self.mf_outer2.mo_coeff = None
         e_tot_ref = self.mf_outer2.kernel()
+
+        dmet_solver2_iter1 = DMET(
+            mf_outer=self.mf_outer2,
+            mf_inner=self.mf_inner_template2,
+            fragments=self.fragments2,
+            threshold=1e-5,
+            max_macro_iter=1,
+            macro_tol=1e-3
+        )
+        e_tot_iter1 = dmet_solver2_iter1.kernel()
+
+        total_elec_dmet = 0.0
+        for ifrag in range(dmet_solver2.nfrags):
+            dm_high = cp.asnumpy(dmet_solver2.mf_inner[ifrag].make_rdm1())
+            n_frag_orbs = len(dmet_solver2.frag_idx[ifrag])
+            total_elec_dmet += np.trace(dm_high[:n_frag_orbs, :n_frag_orbs])
         
+        assert np.abs(total_elec_dmet - (self.mol2.nelec[0] + self.mol2.nelec[1])) < 1e-8, \
+            "Sum of numbers of electrons from fragments should be close to the total number."
         assert np.abs(e_tot - e_tot_ref) < 1e-8, "DMET energy should be close to the reference energy."
+        assert np.abs(e_tot_iter1 - e_tot) < 1e-8, "DMET energy should be converged in 1 macro iteration."
+        assert np.abs(dmet_solver2.u_oao).sum() < 1e-8, "Correlation potential should be close to zero."
 
 
 if __name__ == '__main__':

From 1c602261d6f940d055cbf2b0632288705c8cd1eb Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Wed, 20 May 2026 13:46:18 +0800
Subject: [PATCH 12/30] add embedding for 1-fragment DFT

---
 gpu4pyscf/qmmm/embedding/embeding_dft.py | 152 +++++++++++++++++++++++
 1 file changed, 152 insertions(+)
 create mode 100644 gpu4pyscf/qmmm/embedding/embeding_dft.py

diff --git a/gpu4pyscf/qmmm/embedding/embeding_dft.py b/gpu4pyscf/qmmm/embedding/embeding_dft.py
new file mode 100644
index 000000000..443dbca26
--- /dev/null
+++ b/gpu4pyscf/qmmm/embedding/embeding_dft.py
@@ -0,0 +1,152 @@
+# Copyright 2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import cupy as cp
+import numpy as np
+import pyscf.ao2mo
+from gpu4pyscf.lib.cupy_helper import tag_array
+
+# Import your original DMET base class and helper functions
+# from dmet import DMET, lowdin_orth, _as_cupy
+from .dmet import DMET, lowdin_orth, _as_cupy
+
+
+class SingleFragmentEmbedding(DMET):
+    """
+    Single-Fragment ONIOM-like Embedding driver inheriting from the DMET base class.
+    
+    This class overrides the initialization and kernel to perform a single-shot,
+    single-fragment delta-method energy evaluation without macroscopic iterations.
+    It rigorously traces over the entire active space (Fragment + Bath) to capture
+    full polarization correlation, eliminating the 0.5 double-counting factor.
+    """
+    
+    def __init__(self, mf_outer, mf_inner, fragment, threshold=1e-5, verbose=None):
+        """
+        Parameters
+        ----------
+        mf_outer : SCF object
+            Low-level mean-field on the full system (e.g., PBE).
+        mf_inner : SCF/DFT/post-HF object
+            High-level template applied to the embedded cluster (e.g., B3LYP).
+        fragment : list of int
+            A single list of atom indices defining the QM region.
+        threshold : float
+            Eigenvalue cutoff used to classify environment orbitals.
+        """
+        # Wrap the single fragment into a list of lists to satisfy parent DMET __init__
+        fragments = [fragment]
+        
+        # Initialize parent class. 
+        # Force max_macro_iter=1 and energy_method='delta' strictly
+        super().__init__(mf_outer, mf_inner, fragments,
+                         threshold=threshold, max_macro_iter=1, 
+                         energy_method='delta', verbose=verbose)
+        
+        # Expose the single fragment directly for user convenience
+        self.fragment = self.fragments[0]
+        
+    def kernel(self):
+        """
+        Executes the single-shot embedding workflow.
+        """
+        # 1. Run Outer Mean-Field (if not already converged)
+        if not self.mf_outer.converged:
+            self.mf_outer.kernel()
+            
+        e_global_low = self.mf_outer.e_tot
+        mo_coeff = _as_cupy(self.mf_outer.mo_coeff)
+        mo_occ = _as_cupy(self.mf_outer.mo_occ)
+        dm_full_ao_low = _as_cupy(self.mf_outer.make_rdm1())
+        
+        hcore_orig = _as_cupy(self.mf_outer.get_hcore())
+        s_ao = _as_cupy(self.mf_outer.get_ovlp())
+        X, X_inv = lowdin_orth(s_ao)
+
+        ifrag = 0 # Strictly single fragment at index 0
+        
+        # 2. Schmidt Decomposition & Bath Construction using parent methods
+        self.build_bath(ifrag, mo_coeff, mo_occ, X_inv, X)
+        self.build_embedded_hamiltonian(ifrag, hcore_orig)
+        
+        # 3. Build and Run Inner embedded solver
+        # _build_inner_mf already encapsulates the rigorous dual-functional core potential logic
+        mf_inner = self._build_inner_mf(ifrag, dm_full_ao_low)
+        self.log.info("Running high-level inner solver...")
+        self.solve_embedded(ifrag)
+        
+        dm_emb_high = _as_cupy(mf_inner.make_rdm1())
+        dm_emb_low = self.dm_emb_init[ifrag]
+        
+        B = self.B[ifrag]
+        nemb = B.shape[1]
+        is_mean_field = hasattr(self.mf_inner_template, 'get_veff')
+        
+        # 4. Evaluate Energy using strict Delta Method
+        
+        # --- Evaluate High-Level trace ---
+        # Note: Trace is implicitly over the FULL active space (dm_emb_high * h_eval_high).
+        # No 0.5 reduction factor is applied for the core potential since there are no other fragments.
+        if is_mean_field:
+            v_core_inner_ao = _as_cupy(self.mf_inner_template.get_veff(self.full_mol, self.dm_core[ifrag]))
+            h_eval_high = B.T @ (hcore_orig + v_core_inner_ao) @ B
+        else:
+            h_eval_high = self.h_emb[ifrag]
+            
+        e_high = cp.sum(dm_emb_high * h_eval_high) 
+        
+        if is_mean_field:
+            v_eff_emb_high = mf_inner.get_veff(dm=dm_emb_high)
+            e_high += 0.5 * cp.sum(dm_emb_high * _as_cupy(v_eff_emb_high))
+        else:
+            # WFT evaluation over full active space (kept for future CCSD/MP2 extensions)
+            B_cpu = cp.asnumpy(B)
+            eri_emb_cpu = pyscf.ao2mo.kernel(self.full_mol, B_cpu)
+            eri_emb_cpu = pyscf.ao2mo.restore(1, eri_emb_cpu, nemb)
+            eri_emb = _as_cupy(eri_emb_cpu)
+            
+            if hasattr(mf_inner, 'make_rdm2'):
+                dm2_emb_high = _as_cupy(mf_inner.make_rdm2())
+            else:
+                dm2_emb_high = (cp.einsum('ij,kl->ijkl', dm_emb_high, dm_emb_high) 
+                           - 0.5 * cp.einsum('il,jk->ijkl', dm_emb_high, dm_emb_high))
+            e_high += 0.5 * cp.sum(dm2_emb_high * eri_emb)
+            
+        # --- Evaluate Low-Level trace ---
+        # self.h_emb strictly contains 1.0 * v_core_outer_ao natively
+        h_eval_low = self.h_emb[ifrag] 
+        e_low = cp.sum(dm_emb_low * h_eval_low)
+        
+        if is_mean_field:
+            # Reconstruct full low-level density strictly from embedded projection
+            dm_full_ao_low_reconstructed = self.dm_core[ifrag] + B @ dm_emb_low @ B.T
+            v_eff_full_low = self.mf_outer.get_veff(self.full_mol, dm_full_ao_low_reconstructed)
+            v_eff_active_low = _as_cupy(v_eff_full_low) - self.v_core_ao[ifrag]
+            v_eff_emb_low = B.T @ v_eff_active_low @ B
+            
+            e_low += 0.5 * cp.sum(dm_emb_low * v_eff_emb_low)
+        else:
+            dm2_emb_low = (cp.einsum('ij,kl->ijkl', dm_emb_low, dm_emb_low) 
+                       - 0.5 * cp.einsum('il,jk->ijkl', dm_emb_low, dm_emb_low))
+            e_low += 0.5 * cp.sum(dm2_emb_low * eri_emb)
+        
+        # --- Assembly ---
+        delta_e = float(e_high - e_low)
+        self.log.note(f"Global Low-Level E : {e_global_low:.8f}")
+        self.log.note(f"Active Space dE    : {delta_e:.8f}")
+        
+        self.e_tot = e_global_low + delta_e
+        self.log.note(f"Total Embedded E   : {self.e_tot:.8f}")
+
+        return self.e_tot
\ No newline at end of file

From 0ed250150de5d107d2189c9948ea430e44a6552b Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Wed, 20 May 2026 16:28:47 +0800
Subject: [PATCH 13/30] in debugging

---
 gpu4pyscf/qmmm/embedding/embeding_dft.py | 145 ++++++++++++++---------
 1 file changed, 87 insertions(+), 58 deletions(-)

diff --git a/gpu4pyscf/qmmm/embedding/embeding_dft.py b/gpu4pyscf/qmmm/embedding/embeding_dft.py
index 443dbca26..87b17940f 100644
--- a/gpu4pyscf/qmmm/embedding/embeding_dft.py
+++ b/gpu4pyscf/qmmm/embedding/embeding_dft.py
@@ -16,18 +16,15 @@
 import numpy as np
 import pyscf.ao2mo
 from gpu4pyscf.lib.cupy_helper import tag_array
-
-# Import your original DMET base class and helper functions
-# from dmet import DMET, lowdin_orth, _as_cupy
-from .dmet import DMET, lowdin_orth, _as_cupy
+from gpu4pyscf.qmmm.embedding.embedding import DMET, lowdin_orth, _as_cupy
 
 
 class SingleFragmentEmbedding(DMET):
     """
-    Single-Fragment ONIOM-like Embedding driver inheriting from the DMET base class.
+    Single-Fragment ONIOM-like embedding.
     
-    This class overrides the initialization and kernel to perform a single-shot,
-    single-fragment delta-method energy evaluation without macroscopic iterations.
+    This class performs a single-shot,
+    single-fragment delta-method energy evaluation WITHOUT macroscopic iterations.
     It rigorously traces over the entire active space (Fragment + Bath) to capture
     full polarization correlation, eliminating the 0.5 double-counting factor.
     """
@@ -35,7 +32,7 @@ class SingleFragmentEmbedding(DMET):
     def __init__(self, mf_outer, mf_inner, fragment, threshold=1e-5, verbose=None):
         """
         Parameters
-        ----------
+        ----------
         mf_outer : SCF object
             Low-level mean-field on the full system (e.g., PBE).
         mf_inner : SCF/DFT/post-HF object
@@ -45,11 +42,8 @@ def __init__(self, mf_outer, mf_inner, fragment, threshold=1e-5, verbose=None):
         threshold : float
             Eigenvalue cutoff used to classify environment orbitals.
         """
-        # Wrap the single fragment into a list of lists to satisfy parent DMET __init__
         fragments = [fragment]
         
-        # Initialize parent class. 
-        # Force max_macro_iter=1 and energy_method='delta' strictly
         super().__init__(mf_outer, mf_inner, fragments,
                          threshold=threshold, max_macro_iter=1, 
                          energy_method='delta', verbose=verbose)
@@ -58,10 +52,7 @@ def __init__(self, mf_outer, mf_inner, fragment, threshold=1e-5, verbose=None):
         self.fragment = self.fragments[0]
         
     def kernel(self):
-        """
-        Executes the single-shot embedding workflow.
-        """
-        # 1. Run Outer Mean-Field (if not already converged)
+
         if not self.mf_outer.converged:
             self.mf_outer.kernel()
             
@@ -74,14 +65,12 @@ def kernel(self):
         s_ao = _as_cupy(self.mf_outer.get_ovlp())
         X, X_inv = lowdin_orth(s_ao)
 
-        ifrag = 0 # Strictly single fragment at index 0
+        ifrag = 0
         
-        # 2. Schmidt Decomposition & Bath Construction using parent methods
         self.build_bath(ifrag, mo_coeff, mo_occ, X_inv, X)
         self.build_embedded_hamiltonian(ifrag, hcore_orig)
         
-        # 3. Build and Run Inner embedded solver
-        # _build_inner_mf already encapsulates the rigorous dual-functional core potential logic
+        # Build and Run Inner embedded solver
         mf_inner = self._build_inner_mf(ifrag, dm_full_ao_low)
         self.log.info("Running high-level inner solver...")
         self.solve_embedded(ifrag)
@@ -90,58 +79,98 @@ def kernel(self):
         dm_emb_low = self.dm_emb_init[ifrag]
         
         B = self.B[ifrag]
-        nemb = B.shape[1]
         is_mean_field = hasattr(self.mf_inner_template, 'get_veff')
         
-        # 4. Evaluate Energy using strict Delta Method
-        
-        # --- Evaluate High-Level trace ---
-        # Note: Trace is implicitly over the FULL active space (dm_emb_high * h_eval_high).
-        # No 0.5 reduction factor is applied for the core potential since there are no other fragments.
+        # Evaluate High-Level trace
         if is_mean_field:
-            v_core_inner_ao = _as_cupy(self.mf_inner_template.get_veff(self.full_mol, self.dm_core[ifrag]))
-            h_eval_high = B.T @ (hcore_orig + v_core_inner_ao) @ B
-        else:
-            h_eval_high = self.h_emb[ifrag]
+            # Bare one-electron Hamiltonian trace
+            h_eval_bare = B.T @ hcore_orig @ B
+            e_high_h = cp.sum(dm_emb_high * h_eval_bare)
             
-        e_high = cp.sum(dm_emb_high * h_eval_high) 
-        
-        if is_mean_field:
-            v_eff_emb_high = mf_inner.get_veff(dm=dm_emb_high)
-            e_high += 0.5 * cp.sum(dm_emb_high * _as_cupy(v_eff_emb_high))
-        else:
-            # WFT evaluation over full active space (kept for future CCSD/MP2 extensions)
-            B_cpu = cp.asnumpy(B)
-            eri_emb_cpu = pyscf.ao2mo.kernel(self.full_mol, B_cpu)
-            eri_emb_cpu = pyscf.ao2mo.restore(1, eri_emb_cpu, nemb)
-            eri_emb = _as_cupy(eri_emb_cpu)
+            # Full density reconstruction
+            dm_full_ao_high = self.dm_core[ifrag] + B @ dm_emb_high @ B.T
+            v_eff_full_high = self.mf_inner_template.get_veff(self.full_mol, dm_full_ao_high)
             
-            if hasattr(mf_inner, 'make_rdm2'):
-                dm2_emb_high = _as_cupy(mf_inner.make_rdm2())
-            else:
-                dm2_emb_high = (cp.einsum('ij,kl->ijkl', dm_emb_high, dm_emb_high) 
-                           - 0.5 * cp.einsum('il,jk->ijkl', dm_emb_high, dm_emb_high))
-            e_high += 0.5 * cp.sum(dm2_emb_high * eri_emb)
+            # Coulomb J interaction traced over active space
+            vj_full_high = getattr(v_eff_full_high, 'vj', None)
+            vj_emb_high = B.T @ _as_cupy(vj_full_high) @ B
+            e_high_J = 0.5 * cp.sum(dm_emb_high * vj_emb_high)
             
-        # --- Evaluate Low-Level trace ---
-        # self.h_emb strictly contains 1.0 * v_core_outer_ao natively
-        h_eval_low = self.h_emb[ifrag] 
-        e_low = cp.sum(dm_emb_low * h_eval_low)
-        
+            # Exact Exchange interaction traced over active space + Grid XC extraction
+            exc_tot_high = getattr(v_eff_full_high, 'exc', 0.0)
+            vk_full_high = getattr(v_eff_full_high, 'vk', None)
+            
+            e_high_K = 0.0
+            grid_exc_tot_high = exc_tot_high
+            if vk_full_high is not None:
+                vk_full_high = _as_cupy(vk_full_high)
+                vk_emb_high = B.T @ vk_full_high @ B
+                e_high_K = -0.5 * cp.sum(dm_emb_high * vk_emb_high)
+                e_K_global_high = -0.5 * cp.sum(dm_full_ao_high * vk_full_high)
+                # Isolate the pure non-linear grid integration part
+                grid_exc_tot_high = exc_tot_high - e_K_global_high
+                
+            # Core evaluation for pure Grid XC subtraction
+            v_eff_core_high = self.mf_inner_template.get_veff(self.full_mol, self.dm_core[ifrag])
+            exc_core_high = getattr(v_eff_core_high, 'exc', 0.0)
+            vk_core_high = getattr(v_eff_core_high, 'vk', None)
+            
+            grid_exc_core_high = exc_core_high
+            if vk_core_high is not None:
+                vk_core_high = _as_cupy(vk_core_high)
+                e_K_global_core_high = -0.25 * cp.sum(self.dm_core[ifrag] * vk_core_high)
+                grid_exc_core_high = exc_core_high - e_K_global_core_high
+            
+            e_high = e_high_h + e_high_J + e_high_K + grid_exc_tot_high - grid_exc_core_high
+        else:
+            raise NotImplementedError("WFT evaluation is not implemented for this class.")
+            
+        # Evaluate Low-Level trace (Exact Real-Space XC Integration)
         if is_mean_field:
+            # 1. Bare one-electron Hamiltonian trace
+            e_low_h = cp.sum(dm_emb_low * h_eval_bare)
+            
             # Reconstruct full low-level density strictly from embedded projection
             dm_full_ao_low_reconstructed = self.dm_core[ifrag] + B @ dm_emb_low @ B.T
             v_eff_full_low = self.mf_outer.get_veff(self.full_mol, dm_full_ao_low_reconstructed)
-            v_eff_active_low = _as_cupy(v_eff_full_low) - self.v_core_ao[ifrag]
-            v_eff_emb_low = B.T @ v_eff_active_low @ B
             
-            e_low += 0.5 * cp.sum(dm_emb_low * v_eff_emb_low)
+            # 2. Coulomb (J) interaction traced over active space
+            vj_full_low = getattr(v_eff_full_low, 'vj', None)
+            if vj_full_low is None:
+                vj_full_low = self.mf_outer.get_j(self.full_mol, dm_full_ao_low_reconstructed)
+            vj_emb_low = B.T @ _as_cupy(vj_full_low) @ B
+            e_low_J = 0.5 * cp.sum(dm_emb_low * vj_emb_low)
+            
+            # 3. Exact Exchange (K) interaction traced over active space + Grid XC extraction
+            exc_tot_low = getattr(v_eff_full_low, 'exc', 0.0)
+            vk_full_low = getattr(v_eff_full_low, 'vk', None)
+            
+            e_low_K = 0.0
+            grid_exc_tot_low = exc_tot_low
+            if vk_full_low is not None:
+                vk_full_low = _as_cupy(vk_full_low)
+                vk_emb_low = B.T @ vk_full_low @ B
+                e_low_K = -0.5 * cp.sum(dm_emb_low * vk_emb_low)
+                e_K_global_low = -0.5 * cp.sum(dm_full_ao_low_reconstructed * vk_full_low)
+                # Isolate the pure non-linear grid integration part
+                grid_exc_tot_low = exc_tot_low - e_K_global_low
+                
+            # Core evaluation for pure Grid XC subtraction
+            v_eff_core_low = self.mf_outer.get_veff(self.full_mol, self.dm_core[ifrag])
+            exc_core_low = getattr(v_eff_core_low, 'exc', 0.0)
+            vk_core_low = getattr(v_eff_core_low, 'vk', None)
+            
+            grid_exc_core_low = exc_core_low
+            if vk_core_low is not None:
+                vk_core_low = _as_cupy(vk_core_low)
+                e_K_global_core_low = -0.25 * cp.sum(self.dm_core[ifrag] * vk_core_low)
+                grid_exc_core_low = exc_core_low - e_K_global_core_low
+                
+            e_low = e_low_h + e_low_J + e_low_K + grid_exc_tot_low - grid_exc_core_low
         else:
-            dm2_emb_low = (cp.einsum('ij,kl->ijkl', dm_emb_low, dm_emb_low) 
-                       - 0.5 * cp.einsum('il,jk->ijkl', dm_emb_low, dm_emb_low))
-            e_low += 0.5 * cp.sum(dm2_emb_low * eri_emb)
+            raise NotImplementedError("WFT evaluation is not implemented for this class.")
         
-        # --- Assembly ---
+        # Assembly
         delta_e = float(e_high - e_low)
         self.log.note(f"Global Low-Level E : {e_global_low:.8f}")
         self.log.note(f"Active Space dE    : {delta_e:.8f}")

From 945fb6aabc5c951f861f1cc14ed6770d9e9b2c26 Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Wed, 20 May 2026 17:28:01 +0800
Subject: [PATCH 14/30] finish writing, needs debug

---
 gpu4pyscf/qmmm/embedding/__init__.py          |   1 +
 gpu4pyscf/qmmm/embedding/embeding_dft.py      | 144 +++++++-----------
 .../embedding/tests/test_dft_embedding.py     |   0
 ...met_embeding.py => test_dmet_embedding.py} |   0
 4 files changed, 58 insertions(+), 87 deletions(-)
 create mode 100644 gpu4pyscf/qmmm/embedding/tests/test_dft_embedding.py
 rename gpu4pyscf/qmmm/embedding/tests/{test_dmet_embeding.py => test_dmet_embedding.py} (100%)

diff --git a/gpu4pyscf/qmmm/embedding/__init__.py b/gpu4pyscf/qmmm/embedding/__init__.py
index e829ae4bb..01eaa5903 100644
--- a/gpu4pyscf/qmmm/embedding/__init__.py
+++ b/gpu4pyscf/qmmm/embedding/__init__.py
@@ -14,3 +14,4 @@
 
 
 from .embedding import DMET
+from .embeding_dft import SingleFragmentEmbedding
diff --git a/gpu4pyscf/qmmm/embedding/embeding_dft.py b/gpu4pyscf/qmmm/embedding/embeding_dft.py
index 87b17940f..9c8d206ce 100644
--- a/gpu4pyscf/qmmm/embedding/embeding_dft.py
+++ b/gpu4pyscf/qmmm/embedding/embeding_dft.py
@@ -45,14 +45,53 @@ def __init__(self, mf_outer, mf_inner, fragment, threshold=1e-5, verbose=None):
         fragments = [fragment]
         
         super().__init__(mf_outer, mf_inner, fragments,
-                         threshold=threshold, max_macro_iter=1, 
-                         energy_method='delta', verbose=verbose)
+                         threshold=threshold, max_macro_iter=1, verbose=verbose)
         
-        # Expose the single fragment directly for user convenience
         self.fragment = self.fragments[0]
         
-    def kernel(self):
+    def _evaluate_embedded_energy(self, mf_obj, dm_emb, h_eval_bare, B, dm_core):
+        # Bare one-electron Hamiltonian trace
+        e_h = cp.sum(dm_emb * h_eval_bare)
+        
+        # Full density reconstruction
+        dm_full_ao = dm_core + B @ dm_emb @ B.T
+        v_eff_full = mf_obj.get_veff(self.full_mol, dm_full_ao)
+        
+        # Coulomb J interaction traced over active space
+        vj_full = getattr(v_eff_full, 'vj', None)
+        if vj_full is None:
+            vj_full = mf_obj.get_j(self.full_mol, dm_full_ao)
+        vj_emb = B.T @ _as_cupy(vj_full) @ B
+        e_J = 0.5 * cp.sum(dm_emb * vj_emb)
+        
+        # Exact Exchange interaction traced over active space + Grid XC extraction
+        exc_tot = getattr(v_eff_full, 'exc', 0.0)
+        vk_full = getattr(v_eff_full, 'vk', None)
+        
+        e_K = 0.0
+        grid_exc_tot = exc_tot
+        if vk_full is not None:
+            vk_full = _as_cupy(vk_full)
+            vk_emb = B.T @ vk_full @ B
+            e_K = -0.5 * cp.sum(dm_emb * vk_emb)
+            e_K_global = -0.5 * cp.sum(dm_full_ao * vk_full)
+            # Isolate the pure non-linear grid integration part
+            grid_exc_tot = exc_tot - e_K_global
+            
+        # Core evaluation for pure Grid XC subtraction
+        v_eff_core = mf_obj.get_veff(self.full_mol, dm_core)
+        exc_core = getattr(v_eff_core, 'exc', 0.0)
+        vk_core = getattr(v_eff_core, 'vk', None)
+        
+        grid_exc_core = exc_core
+        if vk_core is not None:
+            vk_core = _as_cupy(vk_core)
+            e_K_global_core = -0.5 * cp.sum(dm_core * vk_core)
+            grid_exc_core = exc_core - e_K_global_core
+        
+        return e_h + e_J + e_K + grid_exc_tot - grid_exc_core
 
+    def kernel(self):
         if not self.mf_outer.converged:
             self.mf_outer.kernel()
             
@@ -79,98 +118,25 @@ def kernel(self):
         dm_emb_low = self.dm_emb_init[ifrag]
         
         B = self.B[ifrag]
+        dm_core = self.dm_core[ifrag]
         is_mean_field = hasattr(self.mf_inner_template, 'get_veff')
         
-        # Evaluate High-Level trace
         if is_mean_field:
-            # Bare one-electron Hamiltonian trace
+            # Bare one-electron Hamiltonian evaluated in active space
             h_eval_bare = B.T @ hcore_orig @ B
-            e_high_h = cp.sum(dm_emb_high * h_eval_bare)
-            
-            # Full density reconstruction
-            dm_full_ao_high = self.dm_core[ifrag] + B @ dm_emb_high @ B.T
-            v_eff_full_high = self.mf_inner_template.get_veff(self.full_mol, dm_full_ao_high)
-            
-            # Coulomb J interaction traced over active space
-            vj_full_high = getattr(v_eff_full_high, 'vj', None)
-            vj_emb_high = B.T @ _as_cupy(vj_full_high) @ B
-            e_high_J = 0.5 * cp.sum(dm_emb_high * vj_emb_high)
-            
-            # Exact Exchange interaction traced over active space + Grid XC extraction
-            exc_tot_high = getattr(v_eff_full_high, 'exc', 0.0)
-            vk_full_high = getattr(v_eff_full_high, 'vk', None)
             
-            e_high_K = 0.0
-            grid_exc_tot_high = exc_tot_high
-            if vk_full_high is not None:
-                vk_full_high = _as_cupy(vk_full_high)
-                vk_emb_high = B.T @ vk_full_high @ B
-                e_high_K = -0.5 * cp.sum(dm_emb_high * vk_emb_high)
-                e_K_global_high = -0.5 * cp.sum(dm_full_ao_high * vk_full_high)
-                # Isolate the pure non-linear grid integration part
-                grid_exc_tot_high = exc_tot_high - e_K_global_high
-                
-            # Core evaluation for pure Grid XC subtraction
-            v_eff_core_high = self.mf_inner_template.get_veff(self.full_mol, self.dm_core[ifrag])
-            exc_core_high = getattr(v_eff_core_high, 'exc', 0.0)
-            vk_core_high = getattr(v_eff_core_high, 'vk', None)
+            # Evaluate High-Level trace
+            e_high = self._evaluate_embedded_energy(
+                self.mf_inner_template, dm_emb_high, h_eval_bare, B, dm_core
+            )
             
-            grid_exc_core_high = exc_core_high
-            if vk_core_high is not None:
-                vk_core_high = _as_cupy(vk_core_high)
-                e_K_global_core_high = -0.25 * cp.sum(self.dm_core[ifrag] * vk_core_high)
-                grid_exc_core_high = exc_core_high - e_K_global_core_high
-            
-            e_high = e_high_h + e_high_J + e_high_K + grid_exc_tot_high - grid_exc_core_high
-        else:
-            raise NotImplementedError("WFT evaluation is not implemented for this class.")
-            
-        # Evaluate Low-Level trace (Exact Real-Space XC Integration)
-        if is_mean_field:
-            # 1. Bare one-electron Hamiltonian trace
-            e_low_h = cp.sum(dm_emb_low * h_eval_bare)
-            
-            # Reconstruct full low-level density strictly from embedded projection
-            dm_full_ao_low_reconstructed = self.dm_core[ifrag] + B @ dm_emb_low @ B.T
-            v_eff_full_low = self.mf_outer.get_veff(self.full_mol, dm_full_ao_low_reconstructed)
-            
-            # 2. Coulomb (J) interaction traced over active space
-            vj_full_low = getattr(v_eff_full_low, 'vj', None)
-            if vj_full_low is None:
-                vj_full_low = self.mf_outer.get_j(self.full_mol, dm_full_ao_low_reconstructed)
-            vj_emb_low = B.T @ _as_cupy(vj_full_low) @ B
-            e_low_J = 0.5 * cp.sum(dm_emb_low * vj_emb_low)
-            
-            # 3. Exact Exchange (K) interaction traced over active space + Grid XC extraction
-            exc_tot_low = getattr(v_eff_full_low, 'exc', 0.0)
-            vk_full_low = getattr(v_eff_full_low, 'vk', None)
-            
-            e_low_K = 0.0
-            grid_exc_tot_low = exc_tot_low
-            if vk_full_low is not None:
-                vk_full_low = _as_cupy(vk_full_low)
-                vk_emb_low = B.T @ vk_full_low @ B
-                e_low_K = -0.5 * cp.sum(dm_emb_low * vk_emb_low)
-                e_K_global_low = -0.5 * cp.sum(dm_full_ao_low_reconstructed * vk_full_low)
-                # Isolate the pure non-linear grid integration part
-                grid_exc_tot_low = exc_tot_low - e_K_global_low
-                
-            # Core evaluation for pure Grid XC subtraction
-            v_eff_core_low = self.mf_outer.get_veff(self.full_mol, self.dm_core[ifrag])
-            exc_core_low = getattr(v_eff_core_low, 'exc', 0.0)
-            vk_core_low = getattr(v_eff_core_low, 'vk', None)
-            
-            grid_exc_core_low = exc_core_low
-            if vk_core_low is not None:
-                vk_core_low = _as_cupy(vk_core_low)
-                e_K_global_core_low = -0.25 * cp.sum(self.dm_core[ifrag] * vk_core_low)
-                grid_exc_core_low = exc_core_low - e_K_global_core_low
-                
-            e_low = e_low_h + e_low_J + e_low_K + grid_exc_tot_low - grid_exc_core_low
+            # Evaluate Low-Level trace
+            e_low = self._evaluate_embedded_energy(
+                self.mf_outer, dm_emb_low, h_eval_bare, B, dm_core
+            )
         else:
             raise NotImplementedError("WFT evaluation is not implemented for this class.")
         
-        # Assembly
         delta_e = float(e_high - e_low)
         self.log.note(f"Global Low-Level E : {e_global_low:.8f}")
         self.log.note(f"Active Space dE    : {delta_e:.8f}")
@@ -178,4 +144,8 @@ def kernel(self):
         self.e_tot = e_global_low + delta_e
         self.log.note(f"Total Embedded E   : {self.e_tot:.8f}")
 
+        self.mf_outer.mo_coeff = None
+        self.mf_outer.mo_energy = None
+        self.mf_outer.mo_occ = None
+
         return self.e_tot
\ No newline at end of file
diff --git a/gpu4pyscf/qmmm/embedding/tests/test_dft_embedding.py b/gpu4pyscf/qmmm/embedding/tests/test_dft_embedding.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/gpu4pyscf/qmmm/embedding/tests/test_dmet_embeding.py b/gpu4pyscf/qmmm/embedding/tests/test_dmet_embedding.py
similarity index 100%
rename from gpu4pyscf/qmmm/embedding/tests/test_dmet_embeding.py
rename to gpu4pyscf/qmmm/embedding/tests/test_dmet_embedding.py

From f6877de3c934575432733ca7af23d6ed7e831586 Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Thu, 21 May 2026 15:26:44 +0800
Subject: [PATCH 15/30] finish debug the embedding DFT

---
 gpu4pyscf/qmmm/embedding/embedding.py    | 208 +++++++----------------
 gpu4pyscf/qmmm/embedding/embeding_dft.py |  10 +-
 2 files changed, 66 insertions(+), 152 deletions(-)

diff --git a/gpu4pyscf/qmmm/embedding/embedding.py b/gpu4pyscf/qmmm/embedding/embedding.py
index e439ecd25..40f3ae7da 100644
--- a/gpu4pyscf/qmmm/embedding/embedding.py
+++ b/gpu4pyscf/qmmm/embedding/embedding.py
@@ -122,9 +122,6 @@ def build_embedding_basis(nao, frag_idx, env_idx, bath_orb):
 
 
 def build_core_dm(env_idx, core_orb, nao):
-    """
-    Build the core 1-RDM in the full AO basis.
-    """
     env_idx = _as_cupy(env_idx)
     if core_orb.size == 0:
         return cp.zeros((nao, nao), dtype=float)
@@ -201,9 +198,9 @@ class DMET(lib.StreamObject):
 
     Parameters
     ----------
-    mf_outer : SCF object (gpu4pyscf)
+    mf_outer : SCF object
         Low-level mean-field on the full system.
-    mf_inner : SCF/DFT/post-HF object (gpu4pyscf)
+    mf_inner : SCF/post-HF object
         High-level mean-field or post-HF template applied to the embedded cluster.
     fragments : list of lists of int
         List of fragments, where each fragment is a list of atom indices.
@@ -213,13 +210,11 @@ class DMET(lib.StreamObject):
         Maximum number of macroscopic iterations for correlation potential (u).
     macro_tol : float
         Convergence tolerance for the difference in fragment 1-RDMs.
-    energy_method : str
-        Method for calculating the total energy: 'direct' or 'delta'.
     """
 
     def __init__(self, mf_outer, mf_inner, fragments,
                  threshold=1e-5, max_macro_iter=20, macro_tol=1e-4, 
-                 energy_method='direct', verbose=None):
+                 verbose=None):
         if mf_outer is None or mf_inner is None:
             raise ValueError("mf_outer and mf_inner are both required.")
         if not fragments:
@@ -230,16 +225,12 @@ def __init__(self, mf_outer, mf_inner, fragments,
         else:
             verbose = int(verbose)
         self.log = logger.new_logger(mf_outer, verbose)
-        self.mf_outer = mf_outer
-        self.mf_inner_template = mf_inner
+        self.mf_outer = mf_outer.copy()
+        self.mf_inner_template = mf_inner.copy()
         self.full_mol = mf_outer.mol
         self.threshold = float(threshold)
         self.max_macro_iter = max_macro_iter
         self.macro_tol = macro_tol
-        
-        self.energy_method = energy_method.lower()
-        if self.energy_method not in ['direct', 'delta']:
-            raise ValueError("energy_method must be 'direct' or 'delta'")
 
         self.fragments = [list(int(a) for a in frag) for frag in fragments]
         self.nfrags = len(self.fragments)
@@ -341,29 +332,18 @@ def _build_inner_mf(self, ifrag, dm_full_ao):
 
         mf_inner = _instantiate_inner_mf(self.mf_inner_template, emb_mol)
 
-        B_mat = self.B[ifrag]
-        
-        if hasattr(self.mf_inner_template, 'get_veff'):
-            v_core_inner_ao = _as_cupy(self.mf_inner_template.get_veff(self.full_mol, self.dm_core[ifrag]))
-        else:
-            v_core_inner_ao = cp.zeros_like(self.dm_core[ifrag])
-
-        h_ao = _as_cupy(self.mf_outer.get_hcore())
-        # The inner Hamiltonian gets the strict high-level background potential
-        h_emb_inner = B_mat.T @ (h_ao + v_core_inner_ao) @ B_mat
+        h_emb = self.h_emb[ifrag]
         ovlp = cp.eye(nemb)
 
         e_nuc = float(self.full_mol.energy_nuc())
-        mf_inner.get_hcore = lambda *args, **kwargs: h_emb_inner
+        mf_inner.get_hcore = lambda *args, **kwargs: h_emb
         mf_inner.get_ovlp = lambda *args, **kwargs: ovlp
-        
-        # Energy offset for inner solver debugging aligns with inner core potential
-        # This 0.5 will be removed for 1-fragment systmes.
-        e_core_inner = float(cp.einsum('ij,ji->', self.dm_core[ifrag], h_ao) + 
-                             0.5 * cp.einsum('ij,ji->', self.dm_core[ifrag], v_core_inner_ao))
-        mf_inner.energy_nuc = lambda *args, **kwargs: e_nuc + e_core_inner
+        mf_inner.energy_nuc = lambda *args, **kwargs: e_nuc + self.e_core[ifrag]
+
+        B_mat = self.B[ifrag]
+        v_core_ao = self.v_core_ao[ifrag]
 
-        # Overwrite get_veff to compute on-the-fly using the inner template
+        # Overwrite get_veff to compute on-the-fly in the full AO basis using the inner template
         def _get_veff(mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
             if dm is None:
                 dm = mf_inner.make_rdm1()
@@ -377,36 +357,41 @@ def _get_veff(mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
                 
             dm_full_ao_inner = self.dm_core[ifrag] + dm_ao
             
-            # [FIXED] Compute Veff in full AO basis using inner template strictly
+            # For pure HF this may be redundant because J and K are linear,
+            # but this is also used for DFT-based methods, so we use the delta method.
             v_eff_full = self.mf_inner_template.get_veff(self.full_mol, dm_full_ao_inner, hermi=hermi)
-            v_eff_active = _as_cupy(v_eff_full) - v_core_inner_ao
+            v_eff_active = _as_cupy(v_eff_full) - v_core_ao
             
             # Project Veff back to embedded basis
             if dm_cp.ndim == 2:
                 v_eff_emb = B_mat.T @ v_eff_active @ B_mat
             else:
                 v_eff_emb = cp.einsum('pi,xpq,qj->xij', B_mat, v_eff_active, B_mat)
-            
+
             ecoul = getattr(v_eff_full, 'ecoul', 0.0)
             exc = getattr(v_eff_full, 'exc', 0.0)
-            if hasattr(v_eff_full, 'vj'): 
-                vj = getattr(v_eff_full, 'vj')
+            vj_full = getattr(v_eff_full, 'vj', None)
+            if vj_full is not None:
+                vj_emb = B_mat.T @ _as_cupy(vj_full) @ B_mat
             else:
-                vj = cp.zeros_like(v_eff_emb)
-            if hasattr(v_eff_full, 'vk'): 
-                vk = getattr(v_eff_full, 'vk')
+                vj_emb = cp.zeros_like(v_eff_emb)
+                
+            vk_full = getattr(v_eff_full, 'vk', None)
+            if vk_full is not None:
+                vk_emb = B_mat.T @ _as_cupy(vk_full) @ B_mat
             else:
-                vk = cp.zeros_like(v_eff_emb)
+                vk_emb = cp.zeros_like(v_eff_emb)
+            
+            v_eff_emb = tag_array(v_eff_emb, ecoul=ecoul, exc=exc, vj=vj_emb, vk=vk_emb)
             
-            v_eff_emb = tag_array(v_eff_emb, ecoul=ecoul, exc=exc, vj=vj, vk=vk)
-                    
             return v_eff_emb
 
         mf_inner.get_veff = _get_veff
-        
+
         # using s to make the upper index to the lower index
         s_ao = _as_cupy(self.mf_outer.get_ovlp())
         sB = s_ao @ self.B[ifrag]
+        # Because BSC_core = 0, the following is equivalent to using dm_full_ao - dm_core_ao
         dm_emb_init = sB.T @ dm_full_ao @ sB
         
         trace = float(cp.trace(dm_emb_init))
@@ -445,12 +430,7 @@ def kernel(self):
             mo_occ = _as_cupy(self.mf_outer.mo_occ)
             dm_full_ao = _as_cupy(self.mf_outer.make_rdm1())
             
-            if self.energy_method == 'delta':
-                # Remove the correlation potential penalty from the total energy to get the physical base energy
-                e_tot = self.mf_outer.e_tot - float(cp.sum(dm_full_ao * u_ao))
-            else:
-                e_tot = 0.0
-                
+            e_tot = 0.0
             dm_inners = []
 
             for ifrag in range(self.nfrags):
@@ -469,7 +449,6 @@ def kernel(self):
                 dm_inners.append(dm_inner_full_ao)
 
                 dm1_emb = dm_emb
-                
                 n_frag = self.frag_idx[ifrag].size
                 
                 # Outer (Low-level) environment embedding
@@ -477,107 +456,46 @@ def kernel(self):
                 v_core_emb = B.T @ v_core_ao @ B
                 
                 # Apply 0.5 factor to core potential to avoid double counting across fragments
-                # TODO: The 0.5 factor should be removed for ONIOM energy of just 1 fragment.
+                # The 0.5 factor should be removed for ONIOM energy of just 1 fragment.
                 h_eval = self.h_emb[ifrag] - 0.5 * v_core_emb
                 
                 is_mean_field = hasattr(self.mf_inner_template, 'get_veff')
 
-                # [FIXED] Inner (High-level) evaluation uses its own core functional to prevent cross-talk
-                if is_mean_field:
-                    v_core_inner_ao = _as_cupy(self.mf_inner_template.get_veff(self.full_mol, self.dm_core[ifrag]))
-                    v_core_inner_emb = B.T @ v_core_inner_ao @ B
-                    h_ao = _as_cupy(hcore_orig)
-                    h_emb_inner = B.T @ (h_ao + v_core_inner_ao) @ B
-                    h_eval_high = h_emb_inner - 0.5 * v_core_inner_emb
-                else:
-                    h_eval_high = h_eval
-
-                if self.energy_method == 'direct':
-                    e_frag_elec = cp.sum(dm1_emb[:n_frag, :] * h_eval_high[:n_frag, :])
-                    if not is_mean_field:
-                        raise NotImplementedError("Only mean-field solver is supported for DMET.")
-                        self.log.info("using non-mean-field solver")
-                        nemb = B.shape[1]
-                        # TODO: this can be replaced by a more efficient routine
-                        B_cpu = cp.asnumpy(B)
-                        eri_emb_cpu = pyscf.ao2mo.kernel(self.full_mol, B_cpu)
-                        eri_emb_cpu = pyscf.ao2mo.restore(1, eri_emb_cpu, nemb) # Restore to 4D array
-                        eri_emb = _as_cupy(eri_emb_cpu)
-                        
-                        if hasattr(mf_inner, 'make_rdm2'):
-                            dm2_emb = _as_cupy(mf_inner.make_rdm2())
-                        else:
-                            # Fallback using the HF 2-RDM formulation for post-HF methods lacking make_rdm2
-                            dm2_emb = (cp.einsum('ij,kl->ijkl', dm1_emb, dm1_emb) 
-                                       - 0.5 * cp.einsum('il,jk->ijkl', dm1_emb, dm1_emb))
-                        
-                        e_frag_elec += 0.5 * cp.sum(dm2_emb[:n_frag, :, :, :] * eri_emb[:n_frag, :, :, :])
-                    else:
-                        self.log.info("using mean-field solver")
-                        v_eff_emb = mf_inner.get_veff(dm=dm1_emb)
-                        e_frag_elec += 0.5 * cp.sum(dm1_emb[:n_frag, :] * _as_cupy(v_eff_emb)[:n_frag, :])
-                    
-                    e_frag_nuc = 0.0
-                    coords = self.full_mol.atom_coords()
-                    charges = self.full_mol.atom_charges()
-                    frag_atoms = self.fragments[ifrag]
-                    for i in frag_atoms:
-                        for j in range(self.full_mol.natm):
-                            if i == j: continue
-                            r = np.linalg.norm(coords[i] - coords[j])
-                            e_frag_nuc += 0.5 * charges[i] * charges[j] / r
-                            
-                    self.log.info(f"Fragment {ifrag} Electronic Energy: {float(e_frag_elec):.8f} | Nuclear Energy: {e_frag_nuc:.8f}")
-                    e_tot += float(e_frag_elec) + e_frag_nuc
-
-                elif self.energy_method == 'delta':
-                    dm1_emb_high = dm1_emb
-                    dm1_emb_low = self.dm_emb_init[ifrag]
-                    
-                    # Compute High-Level pseudo energy (using strictly high-level core potential evaluation)
-                    e_high = cp.sum(dm1_emb_high[:n_frag, :] * h_eval_high[:n_frag, :])
-                    
-                    # Compute Low-Level pseudo energy (using strictly low-level core potential evaluation)
-                    e_low = cp.sum(dm1_emb_low[:n_frag, :] * h_eval[:n_frag, :])
+                e_frag_elec = cp.sum(dm1_emb[:n_frag, :] * h_eval[:n_frag, :])
+                if not is_mean_field:
+                    self.log.info("using non-mean-field solver")
+                    nemb = B.shape[1]
+                    # TODO: this can be replaced by a more efficient routine
+                    B_cpu = cp.asnumpy(B)
+                    eri_emb_cpu = pyscf.ao2mo.kernel(self.full_mol, B_cpu)
+                    eri_emb_cpu = pyscf.ao2mo.restore(1, eri_emb_cpu, nemb) # Restore to 4D array
+                    eri_emb = _as_cupy(eri_emb_cpu)
                     
-                    if not is_mean_field:
-                        raise NotImplementedError("Only mean-field solver is supported for DMET.")
-                        self.log.info("using non-mean-field solver")
-                        nemb = B.shape[1]
-                        B_cpu = cp.asnumpy(B)
-                        eri_emb_cpu = pyscf.ao2mo.kernel(self.full_mol, B_cpu)
-                        eri_emb_cpu = pyscf.ao2mo.restore(1, eri_emb_cpu, nemb)
-                        eri_emb = _as_cupy(eri_emb_cpu)
-                        
-                        if hasattr(mf_inner, 'make_rdm2'):
-                            dm2_emb_high = _as_cupy(mf_inner.make_rdm2())
-                        else:
-                            dm2_emb_high = (cp.einsum('ij,kl->ijkl', dm1_emb_high, dm1_emb_high) 
-                                       - 0.5 * cp.einsum('il,jk->ijkl', dm1_emb_high, dm1_emb_high))
-                        e_high += 0.5 * cp.sum(dm2_emb_high[:n_frag, :, :, :] * eri_emb[:n_frag, :, :, :])
-                        
-                        # Low-level is always un-correlated 2-RDM
-                        dm2_emb_low = (cp.einsum('ij,kl->ijkl', dm1_emb_low, dm1_emb_low) 
-                                       - 0.5 * cp.einsum('il,jk->ijkl', dm1_emb_low, dm1_emb_low))
-                        e_low += 0.5 * cp.sum(dm2_emb_low[:n_frag, :, :, :] * eri_emb[:n_frag, :, :, :])
+                    if hasattr(mf_inner, 'make_rdm2'):
+                        dm2_emb = _as_cupy(mf_inner.make_rdm2())
                     else:
-                        self.log.info("using mean-field solver")
-                        v_eff_emb_high = mf_inner.get_veff(dm=dm1_emb_high)
-                        e_high += 0.5 * cp.sum(dm1_emb_high[:n_frag, :] * _as_cupy(v_eff_emb_high)[:n_frag, :])
-                        
-                        # [FIXED] Compute Veff for the low-level density explicitly using the outer functional
-                        dm_ao_low = B @ dm1_emb_low @ B.T
-                        dm_full_ao_low = self.dm_core[ifrag] + dm_ao_low
-                        
-                        v_eff_full_low = self.mf_outer.get_veff(self.full_mol, dm_full_ao_low)
-                        v_eff_active_low = _as_cupy(v_eff_full_low) - self.v_core_ao[ifrag]
-                        v_eff_emb_low = B.T @ v_eff_active_low @ B
-                        
-                        e_low += 0.5 * cp.sum(dm1_emb_low[:n_frag, :] * v_eff_emb_low[:n_frag, :])
+                        # Fallback using the HF 2-RDM formulation for post-HF methods lacking make_rdm2
+                        dm2_emb = (cp.einsum('ij,kl->ijkl', dm1_emb, dm1_emb) 
+                                   - 0.5 * cp.einsum('il,jk->ijkl', dm1_emb, dm1_emb))
                     
-                    delta_e = float(e_high - e_low)
-                    self.log.info(f"Fragment {ifrag} Delta E (Correlation Improvement): {delta_e:.8f}")
-                    e_tot += delta_e
+                    e_frag_elec += 0.5 * cp.sum(dm2_emb[:n_frag, :, :, :] * eri_emb[:n_frag, :, :, :])
+                else:
+                    self.log.info("using mean-field solver")
+                    v_eff_emb = mf_inner.get_veff(dm=dm1_emb)
+                    e_frag_elec += 0.5 * cp.sum(dm1_emb[:n_frag, :] * _as_cupy(v_eff_emb)[:n_frag, :])
+                
+                e_frag_nuc = 0.0
+                coords = self.full_mol.atom_coords()
+                charges = self.full_mol.atom_charges()
+                frag_atoms = self.fragments[ifrag]
+                for i in frag_atoms:
+                    for j in range(self.full_mol.natm):
+                        if i == j: continue
+                        r = np.linalg.norm(coords[i] - coords[j])
+                        e_frag_nuc += 0.5 * charges[i] * charges[j] / r
+                        
+                self.log.info(f"Fragment {ifrag} Electronic Energy: {float(e_frag_elec):.8f} | Nuclear Energy: {e_frag_nuc:.8f}")
+                e_tot += float(e_frag_elec) + e_frag_nuc
 
             dm_low_oao = X_inv @ dm_full_ao @ X_inv
             
diff --git a/gpu4pyscf/qmmm/embedding/embeding_dft.py b/gpu4pyscf/qmmm/embedding/embeding_dft.py
index 9c8d206ce..418b357ae 100644
--- a/gpu4pyscf/qmmm/embedding/embeding_dft.py
+++ b/gpu4pyscf/qmmm/embedding/embeding_dft.py
@@ -25,8 +25,8 @@ class SingleFragmentEmbedding(DMET):
     
     This class performs a single-shot,
     single-fragment delta-method energy evaluation WITHOUT macroscopic iterations.
-    It rigorously traces over the entire active space (Fragment + Bath) to capture
-    full polarization correlation, eliminating the 0.5 double-counting factor.
+    It rigorously traces over the entire active space (ffagment + bath) to capture
+    full polarization correlation,.
     """
     
     def __init__(self, mf_outer, mf_inner, fragment, threshold=1e-5, verbose=None):
@@ -143,9 +143,5 @@ def kernel(self):
         
         self.e_tot = e_global_low + delta_e
         self.log.note(f"Total Embedded E   : {self.e_tot:.8f}")
-
-        self.mf_outer.mo_coeff = None
-        self.mf_outer.mo_energy = None
-        self.mf_outer.mo_occ = None
-
+        
         return self.e_tot
\ No newline at end of file

From 41fdbc2ce282a6906c9562a8bfe45d27aee0f9d8 Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Fri, 22 May 2026 11:28:07 +0800
Subject: [PATCH 16/30] - fix some bugs, which generating some unphysical bath
 orbitals - add more unit test covering this issue - add two examples

---
 examples/embedding/48-dmet-embedding.py       |  86 +++++++++++
 examples/embedding/49-dft-dmet-embedding.py   |  84 +++++++++++
 gpu4pyscf/qmmm/embedding/embedding.py         |  24 ++-
 .../embedding/tests/test_dft_embedding.py     | 137 ++++++++++++++++++
 .../embedding/tests/test_dmet_embedding.py    | 104 +++++++++++--
 5 files changed, 420 insertions(+), 15 deletions(-)
 create mode 100644 examples/embedding/48-dmet-embedding.py
 create mode 100644 examples/embedding/49-dft-dmet-embedding.py

diff --git a/examples/embedding/48-dmet-embedding.py b/examples/embedding/48-dmet-embedding.py
new file mode 100644
index 000000000..f2adc8b1b
--- /dev/null
+++ b/examples/embedding/48-dmet-embedding.py
@@ -0,0 +1,86 @@
+# Copyright 2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Example 48: Standard Multi-Fragment Self-Consistent DMET Calculation.
+
+This script demonstrates how to partition a molecule into multiple fragments
+and optimize the global correlation potential (u_oao) to match high-level and 
+low-level local 1-RDM density matrices self-consistently.
+"""
+
+from pyscf import gto
+from gpu4pyscf.scf import hf as gpu_hf
+from gpu4pyscf.qmmm.embedding.embedding import DMET
+
+def run_dmet_example():
+    # 1. Define the system (Ethane molecule with 6-31G basis)
+    mol = gto.Mole()
+    mol.atom = '''
+        C      -0.76091    -0.00000     0.00000
+        C       0.76091    -0.00000     0.00000
+        H      -1.16001     1.02029     0.00000
+        H      -1.16001    -0.51014    -0.88357
+        H      -1.16001    -0.51014     0.88357
+        H       1.16001    -1.02029     0.00000
+        H       1.16001     0.51014     0.88357
+        H       1.16001     0.51014    -0.88357    
+    '''
+    mol.basis = '6-31g'
+    mol.verbose = 4  # Set verbose to see detailed DMET iteration logs
+    mol.build()
+
+    print("--- Step 1: Initialize Low-Level and High-Level Solver Templates ---")
+    # In this classic exact-back-to-exact test case, we nest RHF within RHF.
+    # DMET should converge the correlation potential to exactly zero.
+    mf_outer = gpu_hf.RHF(mol)
+    mf_outer.conv_tol = 1e-12
+    
+    mf_inner_template = gpu_hf.RHF(mol)
+    mf_inner_template.conv_tol = 1e-12
+
+    print("\n--- Step 2: Define Molecular Fragments ---")
+    # Partition the Ethane molecule into two methyl fragments based on atom indices:
+    # Fragment 0: First Methyl group [C1, H1, H2, H3]
+    # Fragment 1: Second Methyl group [C2, H4, H5, H6]
+    fragments = [
+        [0, 2, 3, 4],
+        [1, 5, 6, 7]
+    ]
+    print(f"Fragment 0 atom indices: {fragments[0]}")
+    print(f"Fragment 1 atom indices: {fragments[1]}")
+
+    print("\n--- Step 3: Setup and Execute the Self-Consistent DMET Solver ---")
+    dmet_solver = DMET(
+        mf_outer=mf_outer,
+        mf_inner=mf_inner_template,
+        fragments=fragments,
+        threshold=1e-5,       # SVD eigenvalue threshold for bath selection
+        max_macro_iter=20,    # Max macro loops for correlation potential fitting
+        macro_tol=1e-4        # Convergence tolerance for the density matching cost
+    )
+
+    # Trigger the DMET macroscopic self-consistent optimization
+    e_dmet = dmet_solver.kernel()
+
+    print("\n--- Final Results Summary ---")
+    # Run the raw full system RHF as an exact reference
+    e_hf_ref = mf_outer.kernel()
+    
+    print(f"Global Reference RHF Energy  : {e_hf_ref:.8f} Hartree") # -79.19706462
+    print(f"Macroscopic DMET Total Energy: {e_dmet:.8f} Hartree") # -79.19706462
+    print(f"Absolute Energy Deviation    : {abs(e_dmet - e_hf_ref):.2e} Hartree") # 9.15e-11 Hartree
+
+if __name__ == '__main__':
+    run_dmet_example()
\ No newline at end of file
diff --git a/examples/embedding/49-dft-dmet-embedding.py b/examples/embedding/49-dft-dmet-embedding.py
new file mode 100644
index 000000000..575e44da2
--- /dev/null
+++ b/examples/embedding/49-dft-dmet-embedding.py
@@ -0,0 +1,84 @@
+# Copyright 2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Example 49: Single-Fragment Delta-Method DFT-in-DFT Embedding.
+
+This script demonstrates how to embed a high-level hybrid DFT functional (B3LYP)
+into a localized region of a low-level GGA DFT environment (PBE) using a 
+highly-optimized projection basis without macroscopic iterations.
+"""
+
+from pyscf import gto
+from gpu4pyscf.dft import rks
+from gpu4pyscf.qmmm.embedding.embeding_dft import SingleFragmentEmbedding
+
+def run_dft_embedding_example():
+    # 1. Define the system (Ethane molecule with 6-31G basis)
+    mol = gto.Mole()
+    mol.atom = '''
+        C      -0.76091    -0.00000     0.00000
+        C       0.76091    -0.00000     0.00000
+        H      -1.16001     1.02029     0.00000
+        H      -1.16001    -0.51014    -0.88357
+        H      -1.16001    -0.51014     0.88357
+        H       1.16001    -1.02029     0.00000
+        H       1.16001     0.51014     0.88357
+        H       1.16001     0.51014    -0.88357    
+    '''
+    mol.basis = '6-31g'
+    mol.verbose = 4  # Enable to monitor localized cluster basis dimensions and logs
+    mol.build()
+
+    print("--- Step 1: Prepare Environment (PBE) and Active Region (B3LYP) Solvers ---")
+    # Low-level full system solver (Environment description)
+    mf_outer = rks.RKS(mol, xc='PBE')
+    mf_outer.conv_tol = 1e-10
+    
+    # High-level solver template (Active cluster description)
+    mf_inner_template = rks.RKS(mol, xc='B3LYP')
+    mf_inner_template.conv_tol = 1e-10
+
+    print("\n--- Step 2: Define Single Target Active Fragment ---")
+    # Select only one methyl group as the active QM region. 
+    # The other half will automatically serve as the embedding environment.
+    active_fragment = [0, 2, 3, 4]
+    print(f"Target QM Active Region atom indices: {active_fragment}")
+
+    print("\n--- Step 3: Initialize and Run Single Fragment Embedding ---")
+    # Construct the single-shot embedding object. Notice that mf_inner_template 
+    # will be cloned internally via .copy() to completely avoid cache poisoning.
+    emb_obj = SingleFragmentEmbedding(
+        mf_outer=mf_outer,
+        mf_inner=mf_inner_template,
+        fragment=active_fragment,
+        threshold=1e-5  # Filters out pure fragment states and numerical noise
+    )
+
+    # Compute the final multi-scale total energy via the delta method:
+    # E_tot = E_PBE(Full) + [E_B3LYP(Active) - E_PBE(Active)]
+    e_embedded_tot = emb_obj.kernel()
+
+    print("\n--- Step 4: Verification of Template Isolation ---")
+    # Verify that our protection armor works seamlessly: 
+    # Executing the template after embedding must converge successfully without any side effects.
+    print("Verifying inner template isolation status...")
+    mf_inner_template.kernel()
+    if mf_inner_template.converged:
+        print("Template isolation check passed successfully! No cache poisoning detected.")
+    else:
+        print("Warning: Template convergence failed, check cache isolation leaks.")
+
+if __name__ == '__main__':
+    run_dft_embedding_example()
\ No newline at end of file
diff --git a/gpu4pyscf/qmmm/embedding/embedding.py b/gpu4pyscf/qmmm/embedding/embedding.py
index 40f3ae7da..4df7ca5e7 100644
--- a/gpu4pyscf/qmmm/embedding/embedding.py
+++ b/gpu4pyscf/qmmm/embedding/embedding.py
@@ -72,8 +72,9 @@ def schmidt_decompose(mo_coeff_oao, mo_occ, frag_idx, env_idx, threshold=1e-5):
     C_occ = mo_coeff_oao[:, occ_mask]
     
     if env_idx.size == 0 or C_occ.shape[1] == 0:
+        s_dummy = cp.ones(C_occ.shape[1]) if env_idx.size == 0 else cp.zeros(0)
         return (cp.zeros((0, 0)), cp.zeros((0, 0)), 
-                {'n_core_electrons': 0})
+                {'n_core_electrons': 0, 'singular_values': s_dummy})
         
     C_A = C_occ[frag_idx, :]
     
@@ -81,14 +82,14 @@ def schmidt_decompose(mo_coeff_oao, mo_occ, frag_idx, env_idx, threshold=1e-5):
     
     C_rot = C_occ @ Vh.T
     
-    is_bath = S > threshold
+    is_bath = (S > threshold) & (S < 1.0 - threshold) # Exclude singular values close to 1.0
     is_core_small = S <= threshold
     n_sv = len(S)
     
     # Entangled bath orbitals (environment part)
     bath_orb = C_rot[env_idx, :n_sv][:, is_bath]
     norms = cp.linalg.norm(bath_orb, axis=0)
-    norms[norms < 1e-12] = 1.0 # This should not happen
+    norms[norms < 1e-12] = 1.0 # This may happen, if s=1.0, which will add a new null vector to B!
     bath_orb = bath_orb / norms
     
     # Pure environment core orbitals come from null space + small singular values
@@ -97,7 +98,8 @@ def schmidt_decompose(mo_coeff_oao, mo_occ, frag_idx, env_idx, threshold=1e-5):
     core_orb = cp.hstack([core_orb_small, core_orb_null])
     
     info = {
-        'n_core_electrons': 2 * core_orb.shape[1]
+        'n_core_electrons': 2 * core_orb.shape[1],
+        'singular_values': S
     }
     return bath_orb, core_orb, info
 
@@ -288,6 +290,20 @@ def build_bath(self, ifrag, mo_coeff, mo_occ, X_inv, X):
         self.B_oao[ifrag] = B_oao        
         self.B[ifrag] = B_ao             
         self.dm_core[ifrag] = dm_core_ao
+
+        n_frag = int(self.frag_idx[ifrag].size)
+        n_bath = int(bath_orb.shape[1] if bath_orb.size else 0)
+        n_core = int(core_orb.shape[1] if core_orb.size else 0)
+
+        self.log.info(f"Fragment {ifrag} Schmidt decomposition singular values:")
+        self.log.info(f"    {info['singular_values']}")
+        
+        self.log.info(f"Fragment {ifrag} embedding basis partition:")
+        self.log.info(f"    Number of Fragment AOs : {n_frag}")
+        self.log.info(f"    Number of Bath Orbitals: {n_bath}")
+        self.log.info(f"    Number of Core Orbitals: {n_core} ({info['n_core_electrons']} electrons)")
+        self.log.info(f"    Total Embedded Space   : {n_frag + n_bath} / {nao_oao} (full AO)")
+
         return self
 
     def build_embedded_hamiltonian(self, ifrag, hcore_orig):
diff --git a/gpu4pyscf/qmmm/embedding/tests/test_dft_embedding.py b/gpu4pyscf/qmmm/embedding/tests/test_dft_embedding.py
index e69de29bb..47400776f 100644
--- a/gpu4pyscf/qmmm/embedding/tests/test_dft_embedding.py
+++ b/gpu4pyscf/qmmm/embedding/tests/test_dft_embedding.py
@@ -0,0 +1,137 @@
+# Copyright 2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import unittest
+import numpy as np
+import cupy as cp
+from pyscf import gto
+from gpu4pyscf.scf import hf as gpu_hf
+from gpu4pyscf.dft import rks
+from gpu4pyscf.qmmm.embedding import embedding
+from gpu4pyscf.qmmm.embedding.embeding_dft import SingleFragmentEmbedding
+
+
+class KnownValues(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+
+        cls.mol = gto.Mole()
+        cls.mol.atom = '''
+            C      -0.76091    -0.00000     0.00000
+            C       0.76091    -0.00000     0.00000
+            H      -1.16001     1.02029     0.00000
+            H      -1.16001    -0.51014    -0.88357
+            H      -1.16001    -0.51014     0.88357
+            H       1.16001    -1.02029     0.00000
+            H       1.16001     0.51014     0.88357
+            H       1.16001     0.51014    -0.88357    
+        '''
+        cls.mol.basis = '6-31g'
+        cls.mol.spin = 0
+        cls.mol.charge = 0
+        cls.mol.verbose = 0
+        cls.mol.build()
+
+        cls.fragments = [[0, 1], [2, 3]]
+
+    @classmethod
+    def tearDownClass(cls):
+        del cls.mol
+    
+    def test_b3lyp_in_b3lyp(self):
+        """Embedding B3LYP in B3LYP must reproduce the full-system B3LYP energy."""
+        mf_outer = rks.RKS(self.mol, xc='B3LYP')
+        mf_inner_template = rks.RKS(self.mol, xc='B3LYP')
+
+        emb_obj = SingleFragmentEmbedding(mf_outer, mf_inner_template, [0, 2, 3, 4])
+        emb_obj.kernel()
+
+        # Reference: plain full-system SCF with the same functional.
+        e_ref = mf_outer.kernel()
+        assert np.abs(e_ref - emb_obj.e_tot) < 1e-8, f"Reference energy {e_ref} != Embedding energy {emb_obj.e_tot}"
+
+    def test_b3lyp_in_pbe(self):
+        """With every atom in the fragment, the result must equal full B3LYP."""
+        mf_outer = rks.RKS(self.mol, xc='PBE')
+        mf_inner_template = rks.RKS(self.mol, xc='B3LYP')
+        # All 8 atoms of the molecule are active, so no environment remains.
+        emb_obj = SingleFragmentEmbedding(mf_outer, mf_inner_template, list(range(8)))
+        emb_obj.kernel()
+
+        e_ref = mf_inner_template.kernel()
+        assert np.abs(e_ref - emb_obj.e_tot) < 1e-8, f"Reference energy {e_ref} != Embedding energy {emb_obj.e_tot}"
+
+    def test_algebraic_properties(self):
+        mf_outer = rks.RKS(self.mol, xc='PBE')
+        mf_inner = rks.RKS(self.mol, xc='PBE')
+        
+        emb_obj = SingleFragmentEmbedding(mf_outer, mf_inner, [0, 1, 2])
+        emb_obj.kernel()
+
+        S_ao = cp.asarray(mf_outer.get_ovlp())
+        B = emb_obj.B[0]
+        D_core = emb_obj.dm_core[0]
+
+        # Check B^T * S * B == I (Orthonormality of embedding basis)
+        ortho_check = B.T @ S_ao @ B
+        identity = cp.eye(B.shape[1])
+        max_ortho_err = float(cp.abs(ortho_check - identity).max())
+        self.assertTrue(max_ortho_err < 1e-10, 
+                        f"Basis B is not orthogonal, max error: {max_ortho_err}")
+
+        # Check Spatial Isolation (Core DM projected onto the active space must be zero)
+        core_overlap = B.T @ S_ao @ D_core @ S_ao @ B
+        max_overlap_err = float(cp.abs(core_overlap).max())
+        self.assertTrue(max_overlap_err < 1e-10, 
+                        f"Core DM leaks into Active Space, max error: {max_overlap_err}")
+
+    def test_electron_conservation(self):
+        mf_outer = rks.RKS(self.mol, xc='PBE')
+        mf_inner = rks.RKS(self.mol, xc='B3LYP')
+        emb_obj = SingleFragmentEmbedding(mf_outer, mf_inner, [0, 1])
+        emb_obj.kernel()
+        
+        S_ao = cp.asarray(mf_outer.get_ovlp())
+        D_emb_high = cp.asarray(emb_obj.mf_inner[0].make_rdm1())
+        D_core = emb_obj.dm_core[0]
+        B = emb_obj.B[0]
+        
+        # Project local active density back to full AO basis
+        D_emb_ao = B @ D_emb_high @ B.T # Identity S ignored
+        D_total_ao = D_core + D_emb_ao
+        
+        n_elec_calc = float(cp.trace(D_total_ao @ S_ao))
+        n_elec_exact = float(self.mol.nelectron)
+        
+        self.assertAlmostEqual(n_elec_calc, n_elec_exact, places=8, 
+                               msg=f"Electron loss: {n_elec_calc} != {n_elec_exact}")
+
+    def test_template_isolation_and_convergence(self):
+        mf_outer = rks.RKS(self.mol, xc='PBE')
+        mf_inner_template = rks.RKS(self.mol, xc='PBE')
+        
+        emb_obj = SingleFragmentEmbedding(mf_outer, mf_inner_template, [0, 2, 3, 4], threshold=-1.0)
+        emb_obj.kernel()
+        
+        mf_inner_template.kernel()
+        
+        # Assert the template is still clean and converges properly
+        self.assertTrue(mf_inner_template.converged, 
+                        "Template object was poisoned and failed to converge!")
+
+
+if __name__ == '__main__':
+    print("Full Tests for ONIOM-like DFT embedding.")
+    unittest.main()
diff --git a/gpu4pyscf/qmmm/embedding/tests/test_dmet_embedding.py b/gpu4pyscf/qmmm/embedding/tests/test_dmet_embedding.py
index 8f55b69f0..2d3353ae8 100644
--- a/gpu4pyscf/qmmm/embedding/tests/test_dmet_embedding.py
+++ b/gpu4pyscf/qmmm/embedding/tests/test_dmet_embedding.py
@@ -71,11 +71,6 @@ def setUpClass(cls):
         cls.mf_inner_template2 = gpu_hf.RHF(cls.mol2)
         cls.mf_inner_template2.conv_tol = 1e-12
 
-        cls.mf_outer3 = rks.RKS(cls.mol2)
-        cls.mf_outer3.conv_tol = 1e-12
-        cls.mf_inner_template3 = rks.RKS(cls.mol2)
-        cls.mf_inner_template3.conv_tol = 1e-12
-
     @classmethod
     def tearDownClass(cls):
         del cls.mol
@@ -132,16 +127,30 @@ def test_schmidt(self):
         s = mf.get_ovlp()
         mo_coeff = mf.mo_coeff
         X, X_inv = embedding.lowdin_orth(s)
-        mo_coeff_oao = X@mo_coeff
+        mo_coeff_oao = X @ mo_coeff
         C_occ = mo_coeff_oao[:, :2]
         C_A = mo_coeff_oao[:4, :2]
+        
         U, S, Vh = cp.linalg.svd(C_A, full_matrices=True)
         C_rot = C_occ @ Vh.T
-        bath_orb_ref = C_rot[4:]
-        norms = cp.linalg.norm(bath_orb_ref, axis=0)
-        bath_orb_ref /= norms
-        bath_orb = embedding.schmidt_decompose(mo_coeff_oao, mf.mo_occ, [0,1,2,3], [4,5,6,7])[0]
-        assert np.abs(bath_orb.get() - bath_orb_ref.get()).max() < 1e-8, "Schmidt decomposition should yield close-to-identity matrices."
+        
+        threshold = 1e-5
+        is_bath = (S > threshold) & (S < 1.0 - threshold)
+        n_sv = len(S)
+        
+        bath_orb_ref = C_rot[4:, :n_sv][:, is_bath]
+        if bath_orb_ref.size > 0:
+            norms = cp.linalg.norm(bath_orb_ref, axis=0)
+            norms[norms < 1e-12] = 1.0
+            bath_orb_ref /= norms
+            
+        bath_orb = embedding.schmidt_decompose(mo_coeff_oao, mf.mo_occ, [0,1,2,3], [4,5,6,7], threshold=threshold)[0]
+        
+        self.assertEqual(bath_orb.shape, bath_orb_ref.shape, 
+                         "Matrix shapes must match after filtering pure fragment orbitals.")
+        if bath_orb.size > 0:
+            assert np.abs(bath_orb.get() - bath_orb_ref.get()).max() < 1e-8, \
+                "Schmidt decomposition should yield close-to-identity matrices."
 
     def test_dmet_execution_and_convergence(self):
         dmet_solver = DMET(
@@ -195,6 +204,79 @@ def test_dmet_execution_and_convergence(self):
         assert np.abs(e_tot_iter1 - e_tot) < 1e-8, "DMET energy should be converged in 1 macro iteration."
         assert np.abs(dmet_solver2.u_oao).sum() < 1e-8, "Correlation potential should be close to zero."
 
+    def test_multifragment_algebraic_and_conservation(self):
+        dmet_solver = DMET(
+            mf_outer=self.mf_outer2,
+            mf_inner=self.mf_inner_template2,
+            fragments=self.fragments2,
+            threshold=1e-5,
+            max_macro_iter=1
+        )
+        dmet_solver.kernel()
+
+        S_ao = cp.asarray(self.mf_outer2.get_ovlp())
+        n_total_elec = float(self.mol2.nelectron)
+
+        for ifrag in range(dmet_solver.nfrags):
+            B = dmet_solver.B[ifrag]
+            D_core = dmet_solver.dm_core[ifrag]
+            D_emb_high = cp.asarray(dmet_solver.mf_inner[ifrag].make_rdm1())
+
+            # Check B^T * S * B == I for each fragment
+            ortho_check = B.T @ S_ao @ B
+            identity = cp.eye(B.shape[1])
+            max_ortho_err = float(cp.abs(ortho_check - identity).max())
+            self.assertTrue(max_ortho_err < 1e-10, 
+                            f"Fragment {ifrag}: Basis B is not orthonormal. Max err: {max_ortho_err}")
+
+            # Check Core DM spatial isolation from the active space
+            core_overlap = B.T @ S_ao @ D_core @ S_ao @ B
+            max_overlap_err = float(cp.abs(core_overlap).max())
+            self.assertTrue(max_overlap_err < 1e-10, 
+                            f"Fragment {ifrag}: Core DM leaks into Active Space. Max err: {max_overlap_err}")
+
+            # Check total electron conservation for this fragment representation
+            D_emb_ao = B @ D_emb_high @ B.T
+            D_total_ao = D_core + D_emb_ao
+            n_elec_calc = float(cp.trace(D_total_ao @ S_ao))
+            self.assertAlmostEqual(n_elec_calc, n_total_elec, places=8,
+                                   msg=f"Fragment {ifrag}: Electron loss detected. {n_elec_calc} != {n_total_elec}")
+
+    def test_dmet_template_isolation(self):
+        dmet_solver = DMET(
+            mf_outer=self.mf_outer2,
+            mf_inner=self.mf_inner_template2,
+            fragments=self.fragments2,
+            threshold=1e-5,
+            max_macro_iter=3,
+            macro_tol=1e-3
+        )
+        dmet_solver.kernel()
+
+        self.mf_inner_template2.mo_coeff = None
+        self.mf_inner_template2.kernel()
+        
+        self.assertTrue(self.mf_inner_template2.converged, 
+                        "The inner template was poisoned by DMET macro-loops and failed to converge!")
+
+    def test_correlation_potential_symmetry(self):
+        dmet_solver = DMET(
+            mf_outer=self.mf_outer,
+            mf_inner=self.mf_inner_template,
+            fragments=self.fragments,
+            threshold=1e-5,
+            max_macro_iter=2
+        )
+        dmet_solver.kernel()
+
+        u = dmet_solver.u_oao
+        
+        sym_err = float(cp.abs(u - u.T).max())
+        self.assertTrue(sym_err < 1e-12, f"Correlation potential u_oao is not symmetric. Max err: {sym_err}")
+        
+        max_u_val = float(cp.abs(u).max())
+        self.assertTrue(max_u_val < 1e-7, f"Trivial correlation potential should be zero, but got max: {max_u_val}")
+
 
 if __name__ == '__main__':
     print("Full Tests for DMET")

From fde13a1aabcb481a7ccca0980fb013a765f57c78 Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Fri, 22 May 2026 13:08:41 +0800
Subject: [PATCH 17/30] fix some typos

---
 gpu4pyscf/qmmm/embedding/__init__.py                           | 2 +-
 gpu4pyscf/qmmm/embedding/{embeding_dft.py => embedding_dft.py} | 0
 gpu4pyscf/qmmm/embedding/tests/test_dft_embedding.py           | 2 +-
 3 files changed, 2 insertions(+), 2 deletions(-)
 rename gpu4pyscf/qmmm/embedding/{embeding_dft.py => embedding_dft.py} (100%)

diff --git a/gpu4pyscf/qmmm/embedding/__init__.py b/gpu4pyscf/qmmm/embedding/__init__.py
index 01eaa5903..6884d9f8e 100644
--- a/gpu4pyscf/qmmm/embedding/__init__.py
+++ b/gpu4pyscf/qmmm/embedding/__init__.py
@@ -14,4 +14,4 @@
 
 
 from .embedding import DMET
-from .embeding_dft import SingleFragmentEmbedding
+from .embedding_dft import SingleFragmentEmbedding
diff --git a/gpu4pyscf/qmmm/embedding/embeding_dft.py b/gpu4pyscf/qmmm/embedding/embedding_dft.py
similarity index 100%
rename from gpu4pyscf/qmmm/embedding/embeding_dft.py
rename to gpu4pyscf/qmmm/embedding/embedding_dft.py
diff --git a/gpu4pyscf/qmmm/embedding/tests/test_dft_embedding.py b/gpu4pyscf/qmmm/embedding/tests/test_dft_embedding.py
index 47400776f..6e2d4243f 100644
--- a/gpu4pyscf/qmmm/embedding/tests/test_dft_embedding.py
+++ b/gpu4pyscf/qmmm/embedding/tests/test_dft_embedding.py
@@ -20,7 +20,7 @@
 from gpu4pyscf.scf import hf as gpu_hf
 from gpu4pyscf.dft import rks
 from gpu4pyscf.qmmm.embedding import embedding
-from gpu4pyscf.qmmm.embedding.embeding_dft import SingleFragmentEmbedding
+from gpu4pyscf.qmmm.embedding.embedding_dft import SingleFragmentEmbedding
 
 
 class KnownValues(unittest.TestCase):

From f8445c62b30427b4f23c04fe05c1b3c11994261b Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Fri, 22 May 2026 14:18:02 +0800
Subject: [PATCH 18/30] fix some typos

---
 examples/embedding/49-dft-dmet-embedding.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/embedding/49-dft-dmet-embedding.py b/examples/embedding/49-dft-dmet-embedding.py
index 575e44da2..a3fe49afc 100644
--- a/examples/embedding/49-dft-dmet-embedding.py
+++ b/examples/embedding/49-dft-dmet-embedding.py
@@ -22,7 +22,7 @@
 
 from pyscf import gto
 from gpu4pyscf.dft import rks
-from gpu4pyscf.qmmm.embedding.embeding_dft import SingleFragmentEmbedding
+from gpu4pyscf.qmmm.embedding.embedding_dft import SingleFragmentEmbedding
 
 def run_dft_embedding_example():
     # 1. Define the system (Ethane molecule with 6-31G basis)

From 4071c501d8583b9c0037c82bdb70f8dab9833142 Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Fri, 22 May 2026 14:35:30 +0800
Subject: [PATCH 19/30] add some comments

---
 gpu4pyscf/qmmm/embedding/embedding.py     | 14 +++++---------
 gpu4pyscf/qmmm/embedding/embedding_dft.py | 10 +++-------
 2 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/gpu4pyscf/qmmm/embedding/embedding.py b/gpu4pyscf/qmmm/embedding/embedding.py
index 4df7ca5e7..9ab8fd587 100644
--- a/gpu4pyscf/qmmm/embedding/embedding.py
+++ b/gpu4pyscf/qmmm/embedding/embedding.py
@@ -45,9 +45,6 @@ def lowdin_orth(s):
 
 
 def get_fragment_ao_indices(mol, frag_atoms):
-    """
-    Return the atomic-orbital indices that belong to the listed atoms.
-    """
     aoslice = mol.aoslice_by_atom()
     indices = []
     for ia in frag_atoms:
@@ -82,7 +79,8 @@ def schmidt_decompose(mo_coeff_oao, mo_occ, frag_idx, env_idx, threshold=1e-5):
     
     C_rot = C_occ @ Vh.T
     
-    is_bath = (S > threshold) & (S < 1.0 - threshold) # Exclude singular values close to 1.0
+    # Exclude singular values close to 1.0, which are fragment orbitals
+    is_bath = (S > threshold) & (S < 1.0 - threshold)
     is_core_small = S <= threshold
     n_sv = len(S)
     
@@ -106,7 +104,7 @@ def schmidt_decompose(mo_coeff_oao, mo_occ, frag_idx, env_idx, threshold=1e-5):
 
 def build_embedding_basis(nao, frag_idx, env_idx, bath_orb):
     """
-    Construct the AO -> embedded transformation matrix B.
+    Construct the AO -> embedded transformation matrix B^{mu}_{k}
     """
     # Due to the Carlson-Keller theorem, the lowdin OAO basis 
     # and the AO basis is 1-to-1 match.
@@ -366,10 +364,7 @@ def _get_veff(mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
             dm_cp = _as_cupy(dm)
             
             # Project embedded dm to full AO basis
-            if dm_cp.ndim == 2:
-                dm_ao = B_mat @ dm_cp @ B_mat.T
-            else:
-                dm_ao = cp.einsum('pi,xij,qj->xpq', B_mat, dm_cp, B_mat)
+            dm_ao = B_mat @ dm_cp @ B_mat.T
                 
             dm_full_ao_inner = self.dm_core[ifrag] + dm_ao
             
@@ -479,6 +474,7 @@ def kernel(self):
 
                 e_frag_elec = cp.sum(dm1_emb[:n_frag, :] * h_eval[:n_frag, :])
                 if not is_mean_field:
+                    raise NotImplementedError("Non-mean-field solver not implemented, needs thorough testing...")
                     self.log.info("using non-mean-field solver")
                     nemb = B.shape[1]
                     # TODO: this can be replaced by a more efficient routine
diff --git a/gpu4pyscf/qmmm/embedding/embedding_dft.py b/gpu4pyscf/qmmm/embedding/embedding_dft.py
index 418b357ae..019f60934 100644
--- a/gpu4pyscf/qmmm/embedding/embedding_dft.py
+++ b/gpu4pyscf/qmmm/embedding/embedding_dft.py
@@ -21,12 +21,10 @@
 
 class SingleFragmentEmbedding(DMET):
     """
-    Single-Fragment ONIOM-like embedding.
+    Single-Fragment ONIOM-like embedding for DFT.
     
     This class performs a single-shot,
     single-fragment delta-method energy evaluation WITHOUT macroscopic iterations.
-    It rigorously traces over the entire active space (ffagment + bath) to capture
-    full polarization correlation,.
     """
     
     def __init__(self, mf_outer, mf_inner, fragment, threshold=1e-5, verbose=None):
@@ -50,7 +48,6 @@ def __init__(self, mf_outer, mf_inner, fragment, threshold=1e-5, verbose=None):
         self.fragment = self.fragments[0]
         
     def _evaluate_embedded_energy(self, mf_obj, dm_emb, h_eval_bare, B, dm_core):
-        # Bare one-electron Hamiltonian trace
         e_h = cp.sum(dm_emb * h_eval_bare)
         
         # Full density reconstruction
@@ -122,15 +119,14 @@ def kernel(self):
         is_mean_field = hasattr(self.mf_inner_template, 'get_veff')
         
         if is_mean_field:
-            # Bare one-electron Hamiltonian evaluated in active space
             h_eval_bare = B.T @ hcore_orig @ B
             
-            # Evaluate High-Level trace
+            # Evaluate High-Level energy
             e_high = self._evaluate_embedded_energy(
                 self.mf_inner_template, dm_emb_high, h_eval_bare, B, dm_core
             )
             
-            # Evaluate Low-Level trace
+            # Evaluate Low-Level energy
             e_low = self._evaluate_embedded_energy(
                 self.mf_outer, dm_emb_low, h_eval_bare, B, dm_core
             )

From 9c2b47e4c159e35b2f1e8f097e47cc3eb033be0d Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Tue, 26 May 2026 07:55:49 +0800
Subject: [PATCH 20/30] - debug the error in evaluating energies; - debug the
 schmidt procedure - add more tests

---
 gpu4pyscf/qmmm/embedding/embedding.py         |  26 +++-
 gpu4pyscf/qmmm/embedding/embedding_dft.py     |  40 +----
 .../embedding/tests/test_dft_embedding.py     |  58 +++++++
 .../embedding/tests/test_dmet_embedding.py    | 145 ++++++++++++------
 4 files changed, 182 insertions(+), 87 deletions(-)

diff --git a/gpu4pyscf/qmmm/embedding/embedding.py b/gpu4pyscf/qmmm/embedding/embedding.py
index 9ab8fd587..462ac95b3 100644
--- a/gpu4pyscf/qmmm/embedding/embedding.py
+++ b/gpu4pyscf/qmmm/embedding/embedding.py
@@ -79,16 +79,28 @@ def schmidt_decompose(mo_coeff_oao, mo_occ, frag_idx, env_idx, threshold=1e-5):
     
     C_rot = C_occ @ Vh.T
     
-    # Exclude singular values close to 1.0, which are fragment orbitals
-    is_bath = (S > threshold) & (S < 1.0 - threshold)
+    # Broadly select all potential bath orbitals (including pure fragment ones S ~ 1.0)
+    is_bath_candidate = S > threshold
     is_core_small = S <= threshold
     n_sv = len(S)
     
-    # Entangled bath orbitals (environment part)
-    bath_orb = C_rot[env_idx, :n_sv][:, is_bath]
-    norms = cp.linalg.norm(bath_orb, axis=0)
-    norms[norms < 1e-12] = 1.0 # This may happen, if s=1.0, which will add a new null vector to B!
-    bath_orb = bath_orb / norms
+    # Extract the environment part for these candidates
+    raw_bath_orb = C_rot[env_idx, :n_sv][:, is_bath_candidate]
+    
+    # Calculate their true physical norms in the environment space
+    norms = cp.linalg.norm(raw_bath_orb, axis=0)
+    
+    # Keep only those with a mathematically meaningful environment tail.
+    # This automatically drops pure fragment orbitals (norm ~ 0) preventing null vectors,
+    # while safely preserving orbitals with legitimate tiny tails (like in STO-3G).
+    valid_mask = norms > 1e-10
+    
+    # Apply the mask to both the orbitals and their norms
+    bath_orb = raw_bath_orb[:, valid_mask]
+    valid_norms = norms[valid_mask]
+    
+    # Safely normalize the surviving valid bath orbitals
+    bath_orb = bath_orb / valid_norms
     
     # Pure environment core orbitals come from null space + small singular values
     core_orb_small = C_rot[env_idx, :n_sv][:, is_core_small]
diff --git a/gpu4pyscf/qmmm/embedding/embedding_dft.py b/gpu4pyscf/qmmm/embedding/embedding_dft.py
index 019f60934..8639576a8 100644
--- a/gpu4pyscf/qmmm/embedding/embedding_dft.py
+++ b/gpu4pyscf/qmmm/embedding/embedding_dft.py
@@ -48,45 +48,17 @@ def __init__(self, mf_outer, mf_inner, fragment, threshold=1e-5, verbose=None):
         self.fragment = self.fragments[0]
         
     def _evaluate_embedded_energy(self, mf_obj, dm_emb, h_eval_bare, B, dm_core):
-        e_h = cp.sum(dm_emb * h_eval_bare)
+        e_h_active = cp.sum(dm_emb * h_eval_bare)
         
-        # Full density reconstruction
         dm_full_ao = dm_core + B @ dm_emb @ B.T
-        v_eff_full = mf_obj.get_veff(self.full_mol, dm_full_ao)
-        
-        # Coulomb J interaction traced over active space
-        vj_full = getattr(v_eff_full, 'vj', None)
-        if vj_full is None:
-            vj_full = mf_obj.get_j(self.full_mol, dm_full_ao)
-        vj_emb = B.T @ _as_cupy(vj_full) @ B
-        e_J = 0.5 * cp.sum(dm_emb * vj_emb)
-        
-        # Exact Exchange interaction traced over active space + Grid XC extraction
-        exc_tot = getattr(v_eff_full, 'exc', 0.0)
-        vk_full = getattr(v_eff_full, 'vk', None)
         
-        e_K = 0.0
-        grid_exc_tot = exc_tot
-        if vk_full is not None:
-            vk_full = _as_cupy(vk_full)
-            vk_emb = B.T @ vk_full @ B
-            e_K = -0.5 * cp.sum(dm_emb * vk_emb)
-            e_K_global = -0.5 * cp.sum(dm_full_ao * vk_full)
-            # Isolate the pure non-linear grid integration part
-            grid_exc_tot = exc_tot - e_K_global
-            
-        # Core evaluation for pure Grid XC subtraction
+        v_eff_full = mf_obj.get_veff(self.full_mol, dm_full_ao)
         v_eff_core = mf_obj.get_veff(self.full_mol, dm_core)
-        exc_core = getattr(v_eff_core, 'exc', 0.0)
-        vk_core = getattr(v_eff_core, 'vk', None)
-        
-        grid_exc_core = exc_core
-        if vk_core is not None:
-            vk_core = _as_cupy(vk_core)
-            e_K_global_core = -0.5 * cp.sum(dm_core * vk_core)
-            grid_exc_core = exc_core - e_K_global_core
         
-        return e_h + e_J + e_K + grid_exc_tot - grid_exc_core
+        e_2e_full = getattr(v_eff_full, 'ecoul', 0.0) + getattr(v_eff_full, 'exc', 0.0)
+        e_2e_core = getattr(v_eff_core, 'ecoul', 0.0) + getattr(v_eff_core, 'exc', 0.0)
+        # E_active = E_1e(Active) + [E_2e(Full) - E_2e(Core)]
+        return e_h_active + e_2e_full - e_2e_core
 
     def kernel(self):
         if not self.mf_outer.converged:
diff --git a/gpu4pyscf/qmmm/embedding/tests/test_dft_embedding.py b/gpu4pyscf/qmmm/embedding/tests/test_dft_embedding.py
index 6e2d4243f..4b643724d 100644
--- a/gpu4pyscf/qmmm/embedding/tests/test_dft_embedding.py
+++ b/gpu4pyscf/qmmm/embedding/tests/test_dft_embedding.py
@@ -131,6 +131,64 @@ def test_template_isolation_and_convergence(self):
         self.assertTrue(mf_inner_template.converged, 
                         "Template object was poisoned and failed to converge!")
 
+    def test_hexane_core_isolation_and_exactness(self):
+        mol = gto.Mole()
+        mol.atom = '''
+            C   1.4522500000  -2.8230000000   0.0000000000
+            C   1.4522500000  -1.2830000000   0.0000000000
+            C   0.0002500000  -0.7700000000   0.0000000000
+            C   0.0002500000   0.7700000000   0.0000000000
+            C  -1.4517500000   1.2830000000   0.0000000000
+            C  -1.4517500000   2.8230000000   0.0000000000
+            H   2.4792500000  -3.1870000000   0.0000000000
+            H   0.9382500000  -3.1870000000   0.8900000000
+            H   0.9382500000  -3.1870000000  -0.8900000000
+            H   1.9652500000  -0.9200000000   0.8900000000
+            H   1.9652500000  -0.9200000000  -0.8900000000
+            H  -0.5137500000  -1.1330000000  -0.8900000000
+            H  -0.5137500000  -1.1330000000   0.8900000000
+            H   0.5132500000   1.1330000000   0.8900000000
+            H   0.5132500000   1.1330000000  -0.8900000000
+            H  -1.9657500000   0.9200000000  -0.8900000000
+            H  -1.9657500000   0.9200000000   0.8900000000
+            H  -2.4797500000   3.1870000000   0.0000000000
+            H  -0.9377500000   3.1870000000   0.8900000000
+            H  -0.9377500000   3.1870000000  -0.8900000000
+        '''
+        mol.basis = 'sto3g'
+        mol.spin = 0
+        mol.verbose = 0
+        mol.build()
+
+        mf_outer = rks.RKS(mol, xc='PBE')
+        mf_inner = rks.RKS(mol, xc='PBE')
+        
+        methyl_fragment = [0, 6, 7, 8]
+        emb_obj = SingleFragmentEmbedding(mf_outer, mf_inner, methyl_fragment, threshold=1e-5)
+        emb_obj.kernel()
+        
+        mf_outer.kernel()
+        e_global = mf_outer.e_tot
+        e_embedded = emb_obj.e_tot
+        self.assertTrue(np.abs(e_global - e_embedded) < 1e-6, 
+                        f"PBE-in-PBE Exactness failed! Error: {np.abs(e_global - e_embedded)}")
+        
+        dm_core_sum = float(cp.sum(emb_obj.dm_core[0]))
+        self.assertTrue(dm_core_sum > 1.0, 
+                        "Hexane test did not generate a non-trivial Core DM. SVD truncation might be failing.")
+
+    def test_pure_dft_vk_bypass(self):
+        mf_outer = rks.RKS(self.mol, xc='PBE')
+        mf_inner = rks.RKS(self.mol, xc='PBE')
+        
+        emb_obj = SingleFragmentEmbedding(mf_outer, mf_inner, self.fragments[0])
+        try:
+            emb_obj.kernel()
+        except AttributeError as e:
+            self.fail(f"Embedding failed for Pure DFT due to missing vk attribute: {e}")
+            
+        self.assertTrue(emb_obj.e_tot is not None, "Pure DFT embedding failed to return an energy.")
+
 
 if __name__ == '__main__':
     print("Full Tests for ONIOM-like DFT embedding.")
diff --git a/gpu4pyscf/qmmm/embedding/tests/test_dmet_embedding.py b/gpu4pyscf/qmmm/embedding/tests/test_dmet_embedding.py
index 2d3353ae8..ca0c1b6ab 100644
--- a/gpu4pyscf/qmmm/embedding/tests/test_dmet_embedding.py
+++ b/gpu4pyscf/qmmm/embedding/tests/test_dmet_embedding.py
@@ -71,6 +71,47 @@ def setUpClass(cls):
         cls.mf_inner_template2 = gpu_hf.RHF(cls.mol2)
         cls.mf_inner_template2.conv_tol = 1e-12
 
+        cls.mol3 = gto.Mole()
+        cls.mol3.atom = '''
+            C   1.4522500000  -2.8230000000   0.0000000000
+            C   1.4522500000  -1.2830000000   0.0000000000
+            C   0.0002500000  -0.7700000000   0.0000000000
+            C   0.0002500000   0.7700000000   0.0000000000
+            C  -1.4517500000   1.2830000000   0.0000000000
+            C  -1.4517500000   2.8230000000   0.0000000000
+            H   2.4792500000  -3.1870000000   0.0000000000
+            H   0.9382500000  -3.1870000000   0.8900000000
+            H   0.9382500000  -3.1870000000  -0.8900000000
+            H   1.9652500000  -0.9200000000   0.8900000000
+            H   1.9652500000  -0.9200000000  -0.8900000000
+            H  -0.5137500000  -1.1330000000  -0.8900000000
+            H  -0.5137500000  -1.1330000000   0.8900000000
+            H   0.5132500000   1.1330000000   0.8900000000
+            H   0.5132500000   1.1330000000  -0.8900000000
+            H  -1.9657500000   0.9200000000  -0.8900000000
+            H  -1.9657500000   0.9200000000   0.8900000000
+            H  -2.4797500000   3.1870000000   0.0000000000
+            H  -0.9377500000   3.1870000000   0.8900000000
+            H  -0.9377500000   3.1870000000  -0.8900000000   
+        '''
+        cls.mol3.basis = '6-31g'
+        cls.mol3.spin = 0
+        cls.mol3.charge = 0
+        cls.mol3.verbose = 0
+        cls.mol3.build()
+
+        cls.fragments3 = [[0, 6, 7, 8],
+                        [1, 9, 10],
+                        [2, 11, 12],
+                        [3, 13, 14],
+                        [4, 15, 16],
+                        [5, 17, 18, 19]]
+
+        cls.mf_outer3 = gpu_hf.RHF(cls.mol3)
+        cls.mf_outer3.conv_tol = 1e-12
+        cls.mf_inner_template3 = gpu_hf.RHF(cls.mol3)
+        cls.mf_inner_template3.conv_tol = 1e-12
+
     @classmethod
     def tearDownClass(cls):
         del cls.mol
@@ -108,6 +149,10 @@ def test_lowdin(self):
         assert np.abs(X - X_ref).max() < 1e-8, "Lowdin orthogonalization should yield a close-to-identity matrix."
 
     def test_schmidt(self):
+        """
+        Test Schmidt decomposition with the rigorous norm-based filtering logic 
+        to prevent null vectors and preserve legitimate physical tails.
+        """
         mol = gto.Mole()
         mol.atom = '''
             H 0.0 0.0 0.0
@@ -135,22 +180,27 @@ def test_schmidt(self):
         C_rot = C_occ @ Vh.T
         
         threshold = 1e-5
-        is_bath = (S > threshold) & (S < 1.0 - threshold)
+        is_bath_candidate = S > threshold
         n_sv = len(S)
         
-        bath_orb_ref = C_rot[4:, :n_sv][:, is_bath]
-        if bath_orb_ref.size > 0:
-            norms = cp.linalg.norm(bath_orb_ref, axis=0)
-            norms[norms < 1e-12] = 1.0
-            bath_orb_ref /= norms
+        raw_bath_orb_ref = C_rot[4:, :n_sv][:, is_bath_candidate]
+        if raw_bath_orb_ref.size > 0:
+            norms = cp.linalg.norm(raw_bath_orb_ref, axis=0)
+            valid_mask = norms > 1e-10
+            bath_orb_ref = raw_bath_orb_ref[:, valid_mask]
+            valid_norms = norms[valid_mask]
+            if bath_orb_ref.size > 0:
+                bath_orb_ref /= valid_norms
+        else:
+            bath_orb_ref = raw_bath_orb_ref
             
         bath_orb = embedding.schmidt_decompose(mo_coeff_oao, mf.mo_occ, [0,1,2,3], [4,5,6,7], threshold=threshold)[0]
         
         self.assertEqual(bath_orb.shape, bath_orb_ref.shape, 
-                         "Matrix shapes must match after filtering pure fragment orbitals.")
+                         "Matrix shapes must match after norm-based filtering.")
         if bath_orb.size > 0:
             assert np.abs(bath_orb.get() - bath_orb_ref.get()).max() < 1e-8, \
-                "Schmidt decomposition should yield close-to-identity matrices."
+                "Schmidt decomposition should yield highly accurate normalized basis vectors."
 
     def test_dmet_execution_and_convergence(self):
         dmet_solver = DMET(
@@ -204,44 +254,6 @@ def test_dmet_execution_and_convergence(self):
         assert np.abs(e_tot_iter1 - e_tot) < 1e-8, "DMET energy should be converged in 1 macro iteration."
         assert np.abs(dmet_solver2.u_oao).sum() < 1e-8, "Correlation potential should be close to zero."
 
-    def test_multifragment_algebraic_and_conservation(self):
-        dmet_solver = DMET(
-            mf_outer=self.mf_outer2,
-            mf_inner=self.mf_inner_template2,
-            fragments=self.fragments2,
-            threshold=1e-5,
-            max_macro_iter=1
-        )
-        dmet_solver.kernel()
-
-        S_ao = cp.asarray(self.mf_outer2.get_ovlp())
-        n_total_elec = float(self.mol2.nelectron)
-
-        for ifrag in range(dmet_solver.nfrags):
-            B = dmet_solver.B[ifrag]
-            D_core = dmet_solver.dm_core[ifrag]
-            D_emb_high = cp.asarray(dmet_solver.mf_inner[ifrag].make_rdm1())
-
-            # Check B^T * S * B == I for each fragment
-            ortho_check = B.T @ S_ao @ B
-            identity = cp.eye(B.shape[1])
-            max_ortho_err = float(cp.abs(ortho_check - identity).max())
-            self.assertTrue(max_ortho_err < 1e-10, 
-                            f"Fragment {ifrag}: Basis B is not orthonormal. Max err: {max_ortho_err}")
-
-            # Check Core DM spatial isolation from the active space
-            core_overlap = B.T @ S_ao @ D_core @ S_ao @ B
-            max_overlap_err = float(cp.abs(core_overlap).max())
-            self.assertTrue(max_overlap_err < 1e-10, 
-                            f"Fragment {ifrag}: Core DM leaks into Active Space. Max err: {max_overlap_err}")
-
-            # Check total electron conservation for this fragment representation
-            D_emb_ao = B @ D_emb_high @ B.T
-            D_total_ao = D_core + D_emb_ao
-            n_elec_calc = float(cp.trace(D_total_ao @ S_ao))
-            self.assertAlmostEqual(n_elec_calc, n_total_elec, places=8,
-                                   msg=f"Fragment {ifrag}: Electron loss detected. {n_elec_calc} != {n_total_elec}")
-
     def test_dmet_template_isolation(self):
         dmet_solver = DMET(
             mf_outer=self.mf_outer2,
@@ -277,6 +289,47 @@ def test_correlation_potential_symmetry(self):
         max_u_val = float(cp.abs(u).max())
         self.assertTrue(max_u_val < 1e-7, f"Trivial correlation potential should be zero, but got max: {max_u_val}")
 
+    def test_multifragment_algebraic_and_conservation(self):
+        dmet_solver = DMET(
+            mf_outer=self.mf_outer3,
+            mf_inner=self.mf_inner_template3,
+            fragments=self.fragments3,
+            threshold=1e-5,
+            max_macro_iter=1
+        )
+        dmet_solver.kernel()
+
+        S_ao = cp.asarray(self.mf_outer3.get_ovlp())
+        n_total_elec = float(self.mol3.nelectron)
+
+        e_ref = self.mf_outer3.kernel()
+        assert np.abs(e_ref - dmet_solver.e_tot) < 1e-8, f"Reference energy {e_ref} != Embedding energy {dmet_solver.e_tot}"
+
+        for ifrag in range(dmet_solver.nfrags):
+            B = dmet_solver.B[ifrag]
+            D_core = dmet_solver.dm_core[ifrag]
+            D_emb_high = cp.asarray(dmet_solver.mf_inner[ifrag].make_rdm1())
+
+            # Check B^T * S * B == I for each fragment
+            ortho_check = B.T @ S_ao @ B
+            identity = cp.eye(B.shape[1])
+            max_ortho_err = float(cp.abs(ortho_check - identity).max())
+            self.assertTrue(max_ortho_err < 1e-10, 
+                            f"Fragment {ifrag}: Basis B is not orthonormal. Max err: {max_ortho_err}")
+
+            # Check Core DM spatial isolation from the active space
+            core_overlap = B.T @ S_ao @ D_core @ S_ao @ B
+            max_overlap_err = float(cp.abs(core_overlap).max())
+            self.assertTrue(max_overlap_err < 1e-10, 
+                            f"Fragment {ifrag}: Core DM leaks into Active Space. Max err: {max_overlap_err}")
+
+            # Check total electron conservation for this fragment representation
+            D_emb_ao = B @ D_emb_high @ B.T
+            D_total_ao = D_core + D_emb_ao
+            n_elec_calc = float(cp.trace(D_total_ao @ S_ao))
+            self.assertAlmostEqual(n_elec_calc, n_total_elec, places=8,
+                                   msg=f"Fragment {ifrag}: Electron loss detected. {n_elec_calc} != {n_total_elec}")
+
 
 if __name__ == '__main__':
     print("Full Tests for DMET")

From 80a23f8fbf5fcc16d7457081f6c8e0282074eb0c Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Tue, 26 May 2026 14:10:21 +0800
Subject: [PATCH 21/30] rebase master

---
 gpu4pyscf/scf/hf.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/gpu4pyscf/scf/hf.py b/gpu4pyscf/scf/hf.py
index f18a84cac..a46b1dbb1 100644
--- a/gpu4pyscf/scf/hf.py
+++ b/gpu4pyscf/scf/hf.py
@@ -841,7 +841,6 @@ def dump_flags(self, verbose=None):
     init_guess_by_chkfile    = return_cupy_array(hf_cpu.SCF.init_guess_by_chkfile)
     from_chk                 = return_cupy_array(hf_cpu.SCF.from_chk)
     get_init_guess           = hf_cpu.SCF.get_init_guess
-    make_rdm2                = NotImplemented
     energy_elec              = NotImplemented
     energy_tot               = energy_tot
     energy_nuc               = hf_cpu.SCF.energy_nuc

From afc35a6795f3d266257d68e6ba703b1e6f9ed6cc Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Wed, 27 May 2026 10:31:42 +0800
Subject: [PATCH 22/30] begin writing embedding_dft_harris.py (license header only)

---
 gpu4pyscf/qmmm/embedding/embedding_dft_harris.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)
 create mode 100644 gpu4pyscf/qmmm/embedding/embedding_dft_harris.py

diff --git a/gpu4pyscf/qmmm/embedding/embedding_dft_harris.py b/gpu4pyscf/qmmm/embedding/embedding_dft_harris.py
new file mode 100644
index 000000000..3787aed7e
--- /dev/null
+++ b/gpu4pyscf/qmmm/embedding/embedding_dft_harris.py
@@ -0,0 +1,13 @@
+# Copyright 2021-2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
\ No newline at end of file

From a0ca103b69865eefbc5f3134f60099a41f840dff Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Thu, 28 May 2026 15:08:25 +0800
Subject: [PATCH 23/30] add first draft of HarrisRKS and SingleFragmentEmbedding_ML

---
 .../qmmm/embedding/embedding_dft_harris.py    | 247 +++++++++++++++++-
 1 file changed, 246 insertions(+), 1 deletion(-)

diff --git a/gpu4pyscf/qmmm/embedding/embedding_dft_harris.py b/gpu4pyscf/qmmm/embedding/embedding_dft_harris.py
index 3787aed7e..ed1bc983b 100644
--- a/gpu4pyscf/qmmm/embedding/embedding_dft_harris.py
+++ b/gpu4pyscf/qmmm/embedding/embedding_dft_harris.py
@@ -10,4 +10,249 @@
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
-# limitations under the License.
\ No newline at end of file
+# limitations under the License.
+
+import numpy as np
+import cupy as cp
+
+from pyscf import lib
+from gpu4pyscf.dft import rks
+from gpu4pyscf.lib.cupy_helper import _as_cupy
+from gpu4pyscf.qmmm.embedding.embedding import DMET, lowdin_orth
+
+class HarrisRKS(rks.RKS):
+    """
+    Harris RKS class based on machine learning (ML) predicted density.
+    
+    This class bypasses traditional SCF iterations. Instead, it relies entirely 
+    on an external ML density evaluation function to construct the global effective 
+    potential and calculate the double counting energy.
+    """
+    def __init__(self, mol, eval_density_func, xc='LDA,VWN'):
+        super().__init__(mol)
+        self.xc = xc
+        self.max_cycle = 1  
+        
+        # eval_density_func is the external ML interface.
+        # Signature: def func(mol, grids, atomic_weights=None)
+        # Returns 7 elements:
+        #   1. vj: Coulomb potential matrix (AO basis)
+        #   2. vk: Exact exchange potential matrix (AO basis, can be None for pure DFT)
+        #   3. vxc: Exchange-correlation potential matrix (AO basis)
+        #   4. e_j: Coulomb energy (scalar)
+        #   5. e_k: Exact exchange energy (scalar, can be 0.0 for pure DFT)
+        #   6. e_xc: Exchange-correlation energy (scalar)
+        #   7. int_rho_vxc: Integral of rho * V_xc (scalar)
+        self.eval_density_func = eval_density_func
+        
+        # Cache for global evaluation results to avoid redundant ML inferences
+        self._v_eff_global = None
+        self._e_dc_global = None
+
+    def get_veff(self, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
+
+        if mol is None: 
+            mol = self.mol
+        
+        if self._v_eff_global is not None:
+            return self._v_eff_global
+            
+        if self.grids.coords is None:
+            self.grids.build()
+            
+        vj, vk, vxc, e_j, e_k, e_xc, int_rho_vxc = self.eval_density_func(
+            mol, self.xc, self.grids, atomic_weights=None
+        )
+        
+        v_eff_ao = _as_cupy(vj) + _as_cupy(vxc)
+        if vk is not None:
+            v_eff_ao -= _as_cupy(vk)
+            e_k = float(e_k)
+        else:
+            e_k = 0.0
+            
+        # Assemble double counting energy
+        e_dc = float(e_j) - e_k + float(int_rho_vxc) - float(e_xc)
+        
+        self._v_eff_global = v_eff_ao
+        self._e_dc_global = e_dc
+        return self._v_eff_global
+
+    def energy_elec(self, dm=None, h1e=None, vhf=None):
+        """
+        Overrides electronic energy evaluation using the Harris energy formula:
+        E_elec = Tr[D * (h + Veff)] - E_DC
+        """
+        if dm is None: dm = self.make_rdm1()
+        if h1e is None: h1e = self.get_hcore()
+        if vhf is None: vhf = self.get_veff(self.mol, dm)
+        
+        dm_cp = _as_cupy(dm)
+        h1e_cp = _as_cupy(h1e)
+        vhf_cp = _as_cupy(vhf)
+        
+        fock = h1e_cp + vhf_cp
+        e_band = float(cp.sum(dm_cp * fock))
+        
+        e_elec = e_band - self._e_dc_global
+        return e_elec, self._e_dc_global
+
+    def get_local_veff_and_dc(self, atomic_weights):
+
+        if self.grids.coords is None:
+            self.grids.build()
+            
+        vj, vk, vxc, e_j, e_k, e_xc, int_rho_vxc = self.eval_density_func(
+            self.mol, self.xc, self.grids, atomic_weights=atomic_weights
+        )
+        
+        v_eff_ao_local = _as_cupy(vj) + _as_cupy(vxc)
+        if vk is not None:
+            v_eff_ao_local -= _as_cupy(vk)
+            e_k = float(e_k)
+        else:
+            e_k = 0.0
+            
+        e_dc_local = float(e_j) - e_k + float(int_rho_vxc) - float(e_xc)
+        
+        return v_eff_ao_local, e_dc_local
+
+
+class SingleFragmentEmbedding_ML(DMET):
+    """
+    Single-Fragment ONIOM-like embedding utilizing ML density scaling.
+    
+    This class performs DMET bond-breaking via SVD, maps the DMET orbital
+    population to atomic weights, extracts a perfectly matched local ML density, 
+    and evaluates the total energy using ONIOM error cancellation.
+    """
+    def __init__(self, mf_outer, mf_inner, fragment, threshold=1e-5, verbose=None):
+        """
+        Parameters
+        ----------
+        mf_outer : HarrisRKS object
+            The global low-level solver driven by ML density.
+        mf_inner : SCF/DFT/post-HF object
+            The high-level solver applied to the embedded fragment+bath cluster.
+        fragment : list of int
+            List of atom indices defining the core QM region.
+        threshold : float
+            Eigenvalue cutoff for the Schmidt decomposition to classify bath orbitals.
+        """
+        fragments = [fragment]
+        super().__init__(mf_outer, mf_inner, fragments,
+                         threshold=threshold, max_macro_iter=1, verbose=verbose)
+        self.fragment = self.fragments[0]
+
+    def _get_atomic_weights(self, dm_active_ao, dm_full_ao, s_ao, mol):
+        """
+        Calculate the projection weight (w_A) for each atom 
+        using Mulliken population analysis of the fragment+bath (FB) orbitals.
+        """
+        pop_active = cp.einsum('ij,ji->i', dm_active_ao, s_ao)
+        pop_full = cp.einsum('ij,ji->i', dm_full_ao, s_ao)
+        
+        aoslice = mol.aoslice_by_atom()
+        weights = np.zeros(mol.natm)
+        
+        for ia in range(mol.natm):
+            p0, p1 = aoslice[ia, 2], aoslice[ia, 3]
+            if p1 > p0:
+                n_active = float(cp.sum(pop_active[p0:p1]))
+                n_full = float(cp.sum(pop_full[p0:p1]))
+                
+                if n_full > 1e-12:
+                    w = n_active / n_full
+                    weights[ia] = max(0.0, min(1.0, w))
+                else:
+                    weights[ia] = 0.0
+                    
+        return weights
+
+    def _get_scaled_nuclear_energy(self, mol, weights):
+        coords = mol.atom_coords()
+        charges = mol.atom_charges()
+        e_nuc_local = 0.0
+        
+        for i in range(mol.natm):
+            if weights[i] < 1e-8: 
+                continue
+            for j in range(i + 1, mol.natm):
+                if weights[j] < 1e-8: 
+                    continue
+                r = np.linalg.norm(coords[i] - coords[j])
+                # Scale repulsion by the product of atomic inclusion weights
+                e_nuc_local += weights[i] * weights[j] * charges[i] * charges[j] / r
+                
+        return e_nuc_local
+
+    def kernel(self):
+
+        if not self.mf_outer.converged:
+            self.mf_outer.kernel()
+            
+        e_global_low = self.mf_outer.e_tot
+        self.log.note(f"Step 1: Global Low-Level E (Harris) = {e_global_low:.8f}")
+        
+        mo_coeff = _as_cupy(self.mf_outer.mo_coeff)
+        mo_occ = _as_cupy(self.mf_outer.mo_occ)
+        dm_full_ao_low = _as_cupy(self.mf_outer.make_rdm1())
+        hcore_orig = _as_cupy(self.mf_outer.get_hcore())
+        s_ao = _as_cupy(self.mf_outer.get_ovlp())
+        X, X_inv = lowdin_orth(s_ao)
+
+        # DMET Schmidt decomposition to extract bath orbitals
+        ifrag = 0
+        self.build_bath(ifrag, mo_coeff, mo_occ, X_inv, X)
+        B = self.B[ifrag]
+        
+        # Project density to active space and back to AO for population analysis
+        dm_emb_low = B.T @ dm_full_ao_low @ B
+        dm_active_ao = B @ dm_emb_low @ B.T
+        
+        # Calculate mapping weights w_A
+        self.log.info("Step 2 & 3: DMET SVD and calculating Atomic Weights...")
+        w_A = self._get_atomic_weights(dm_active_ao, dm_full_ao_low, s_ao, self.full_mol)
+
+        # Retrieve local ML effective potential and double counting energy
+        self.log.info("Step 4: Extracting matched local ML density components...")
+        v_eff_ao_local, e_dc_local = self.mf_outer.get_local_veff_and_dc(atomic_weights=w_A)
+        e_nn_local = self._get_scaled_nuclear_energy(self.full_mol, w_A)
+
+        # Calculate strictly matched local low-level energy (E_L^local)
+        fock_ao_local = hcore_orig + v_eff_ao_local
+        fock_fb_local = B.T @ fock_ao_local @ B
+        e_band_local = float(cp.sum(dm_emb_low * fock_fb_local))
+        e_local_low = e_band_local - e_dc_local + e_nn_local
+        self.log.note(f"Step 5: Matched Local Low-Level E   = {e_local_low:.8f}")
+
+        # Construct pure environment core Hamiltonian and run high-level SCF
+        fock_ao_global = hcore_orig + self.mf_outer.get_veff()
+        fock_fb_global = B.T @ fock_ao_global @ B
+        
+        v_eff_fb_local = B.T @ v_eff_ao_local @ B
+        
+        # Effective core Hamiltonian isolates the environment potential
+        h_core_fb_eff = fock_fb_global - v_eff_fb_local
+        
+        self.h_emb[ifrag] = h_core_fb_eff  
+        self.e_core[ifrag] = 0.0  # ONIOM framework implies E_core shift is 0
+        
+        self.log.info("Step 6: Running high-level inner SCF in embedding space...")
+        # Build the inner solver (automatically stored in self.mf_inner[ifrag])
+        self._build_inner_mf(ifrag, dm_full_ao_low)
+        
+        # Solve the embedded cluster problem
+        self.solve_embedded(ifrag)
+        
+        e_local_high = self.e_inner[ifrag] + e_nn_local
+        self.log.note(f"Step 6: Local High-Level E (SCF)    = {e_local_high:.8f}")
+
+        # Exact ONIOM energy assembly
+        self.e_tot = e_global_low - e_local_low + e_local_high
+        
+        self.log.note("="*50)
+        self.log.note(f"FINAL ONIOM TOTAL ENERGY = {self.e_tot:.8f}")
+        self.log.note("="*50)
+        
+        return self.e_tot
\ No newline at end of file

From 4b596ed6419d452faf4aa861c98dcfa43b6611f8 Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Thu, 28 May 2026 15:28:56 +0800
Subject: [PATCH 24/30] add the density-dependent weight partition

---
 .../qmmm/embedding/embedding_dft_harris.py    | 83 ++++++++++---------
 1 file changed, 45 insertions(+), 38 deletions(-)

diff --git a/gpu4pyscf/qmmm/embedding/embedding_dft_harris.py b/gpu4pyscf/qmmm/embedding/embedding_dft_harris.py
index ed1bc983b..916fa2e84 100644
--- a/gpu4pyscf/qmmm/embedding/embedding_dft_harris.py
+++ b/gpu4pyscf/qmmm/embedding/embedding_dft_harris.py
@@ -34,7 +34,7 @@ def __init__(self, mol, eval_density_func, xc='LDA,VWN'):
         self.max_cycle = 1  
         
         # eval_density_func is the external ML interface.
-        # Signature: def func(mol, grids, atomic_weights=None)
+        # Signature: def func(mol, xc, grids, atomic_weights=None, grid_weights=None)
         # Returns 7 elements:
         #   1. vj: Coulomb potential matrix (AO basis)
         #   2. vk: Exact exchange potential matrix (AO basis, can be None for pure DFT)
@@ -60,8 +60,9 @@ def get_veff(self, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
         if self.grids.coords is None:
             self.grids.build()
             
+        # Global evaluation uses no weights
         vj, vk, vxc, e_j, e_k, e_xc, int_rho_vxc = self.eval_density_func(
-            mol, self.xc, self.grids, atomic_weights=None
+            mol, self.xc, self.grids, atomic_weights=None, grid_weights=None
         )
         
         v_eff_ao = _as_cupy(vj) + _as_cupy(vxc)
@@ -97,13 +98,16 @@ def energy_elec(self, dm=None, h1e=None, vhf=None):
         e_elec = e_band - self._e_dc_global
         return e_elec, self._e_dc_global
 
-    def get_local_veff_and_dc(self, atomic_weights):
-
+    def get_local_veff_and_dc(self, atomic_weights=None, grid_weights=None):
+        # Pass both weight options to the external ML interface. 
+        # The ML function should apply the provided one appropriately.
         if self.grids.coords is None:
             self.grids.build()
             
         vj, vk, vxc, e_j, e_k, e_xc, int_rho_vxc = self.eval_density_func(
-            self.mol, self.xc, self.grids, atomic_weights=atomic_weights
+            self.mol, self.xc, self.grids, 
+            atomic_weights=atomic_weights, 
+            grid_weights=grid_weights
         )
         
         v_eff_ao_local = _as_cupy(vj) + _as_cupy(vxc)
@@ -126,7 +130,7 @@ class SingleFragmentEmbedding_ML(DMET):
     population to atomic weights, extracts a perfectly matched local ML density, 
     and evaluates the total energy using ONIOM error cancellation.
     """
-    def __init__(self, mf_outer, mf_inner, fragment, threshold=1e-5, verbose=None):
+    def __init__(self, mf_outer, mf_inner, fragment, threshold=1e-5, partition_type='atom', verbose=None):
         """
         Parameters
         ----------
@@ -138,17 +142,17 @@ def __init__(self, mf_outer, mf_inner, fragment, threshold=1e-5, verbose=None):
             List of atom indices defining the core QM region.
         threshold : float
             Eigenvalue cutoff for the Schmidt decomposition to classify bath orbitals.
+        partition_type : str
+            'atom' for Mulliken population-based atomic weights.
+            'grid' for real-space density-based grid weights w(r) = rho_local(r) / rho_global(r).
         """
         fragments = [fragment]
         super().__init__(mf_outer, mf_inner, fragments,
                          threshold=threshold, max_macro_iter=1, verbose=verbose)
         self.fragment = self.fragments[0]
+        self.partition_type = partition_type
 
     def _get_atomic_weights(self, dm_active_ao, dm_full_ao, s_ao, mol):
-        """
-        Calculate the projection weight (w_A) for each atom 
-        using Mulliken population analysis of the fragment+bath (FB) orbitals.
-        """
         pop_active = cp.einsum('ij,ji->i', dm_active_ao, s_ao)
         pop_full = cp.einsum('ij,ji->i', dm_full_ao, s_ao)
         
@@ -169,22 +173,18 @@ def _get_atomic_weights(self, dm_active_ao, dm_full_ao, s_ao, mol):
                     
         return weights
 
-    def _get_scaled_nuclear_energy(self, mol, weights):
-        coords = mol.atom_coords()
-        charges = mol.atom_charges()
-        e_nuc_local = 0.0
+    def _get_grid_weights(self, dm_active_ao, dm_full_ao, mol, grids):
+
+        ni = self.mf_outer._numint
         
-        for i in range(mol.natm):
-            if weights[i] < 1e-8: 
-                continue
-            for j in range(i + 1, mol.natm):
-                if weights[j] < 1e-8: 
-                    continue
-                r = np.linalg.norm(coords[i] - coords[j])
-                # Scale repulsion by the product of atomic inclusion weights
-                e_nuc_local += weights[i] * weights[j] * charges[i] * charges[j] / r
-                
-        return e_nuc_local
+        rho_active = ni.get_rho(mol, dm_active_ao, grids)
+        rho_full   = ni.get_rho(mol, dm_full_ao, grids)
+        
+        weights = rho_active / cp.maximum(rho_full, 1e-12)
+        
+        weights = cp.clip(weights, 0.0, 1.0)
+        
+        return weights
 
     def kernel(self):
 
@@ -210,20 +210,31 @@ def kernel(self):
         dm_emb_low = B.T @ dm_full_ao_low @ B
         dm_active_ao = B @ dm_emb_low @ B.T
         
-        # Calculate mapping weights w_A
-        self.log.info("Step 2 & 3: DMET SVD and calculating Atomic Weights...")
-        w_A = self._get_atomic_weights(dm_active_ao, dm_full_ao_low, s_ao, self.full_mol)
+        # Calculate mapping weights and extract local ML components based on partition_type
+        if self.partition_type == 'atom':
+            self.log.info("Step 2 & 3: DMET SVD and calculating Atomic Weights...")
+            w_A = self._get_atomic_weights(dm_active_ao, dm_full_ao_low, s_ao, self.full_mol)
+            self.log.info("Step 4: Extracting matched local ML density components (Atom-based)...")
+            v_eff_ao_local, e_dc_local = self.mf_outer.get_local_veff_and_dc(atomic_weights=w_A)
+            
+        elif self.partition_type == 'grid':
+            self.log.info("Step 2 & 3: DMET SVD and calculating Grid Weights w(r)...")
+            if self.mf_outer.grids.coords is None:
+                self.mf_outer.grids.build()
+            w_grid = self._get_grid_weights(dm_active_ao, dm_full_ao_low, self.full_mol, self.mf_outer.grids)
+            self.log.info("Step 4: Extracting matched local ML density components (Grid-based)...")
+            v_eff_ao_local, e_dc_local = self.mf_outer.get_local_veff_and_dc(grid_weights=w_grid)
+            
+        else:
+            raise ValueError(f"Unknown partition_type: {self.partition_type}. Use 'atom' or 'grid'.")
 
-        # Retrieve local ML effective potential and double counting energy
-        self.log.info("Step 4: Extracting matched local ML density components...")
-        v_eff_ao_local, e_dc_local = self.mf_outer.get_local_veff_and_dc(atomic_weights=w_A)
-        e_nn_local = self._get_scaled_nuclear_energy(self.full_mol, w_A)
+        e_nuc_constant = self.full_mol.energy_nuc()
 
         # Calculate strictly matched local low-level energy (E_L^local)
         fock_ao_local = hcore_orig + v_eff_ao_local
         fock_fb_local = B.T @ fock_ao_local @ B
         e_band_local = float(cp.sum(dm_emb_low * fock_fb_local))
-        e_local_low = e_band_local - e_dc_local + e_nn_local
+        e_local_low = e_band_local - e_dc_local + e_nuc_constant
         self.log.note(f"Step 5: Matched Local Low-Level E   = {e_local_low:.8f}")
 
         # Construct pure environment core Hamiltonian and run high-level SCF
@@ -239,16 +250,12 @@ def kernel(self):
         self.e_core[ifrag] = 0.0  # ONIOM framework implies E_core shift is 0
         
         self.log.info("Step 6: Running high-level inner SCF in embedding space...")
-        # Build the inner solver (automatically stored in self.mf_inner[ifrag])
         self._build_inner_mf(ifrag, dm_full_ao_low)
-        
-        # Solve the embedded cluster problem
         self.solve_embedded(ifrag)
         
-        e_local_high = self.e_inner[ifrag] + e_nn_local
+        e_local_high = self.e_inner[ifrag]
         self.log.note(f"Step 6: Local High-Level E (SCF)    = {e_local_high:.8f}")
 
-        # Exact ONIOM energy assembly
         self.e_tot = e_global_low - e_local_low + e_local_high
         
         self.log.note("="*50)

From e36c253985ee1462c12c04338edee5c3e316efed Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Thu, 28 May 2026 16:56:32 +0800
Subject: [PATCH 25/30] fix the incorrect non-linear treatment for v

---
 .../qmmm/embedding/embedding_dft_harris.py    | 51 ++++++++++---------
 1 file changed, 28 insertions(+), 23 deletions(-)

diff --git a/gpu4pyscf/qmmm/embedding/embedding_dft_harris.py b/gpu4pyscf/qmmm/embedding/embedding_dft_harris.py
index 916fa2e84..c89c54d6b 100644
--- a/gpu4pyscf/qmmm/embedding/embedding_dft_harris.py
+++ b/gpu4pyscf/qmmm/embedding/embedding_dft_harris.py
@@ -17,8 +17,7 @@
 
 from pyscf import lib
 from gpu4pyscf.dft import rks
-from gpu4pyscf.lib.cupy_helper import _as_cupy
-from gpu4pyscf.qmmm.embedding.embedding import DMET, lowdin_orth
+from gpu4pyscf.qmmm.embedding.embedding import DMET, lowdin_orth, _as_cupy
 
 class HarrisRKS(rks.RKS):
     """
@@ -201,8 +200,8 @@ def kernel(self):
         s_ao = _as_cupy(self.mf_outer.get_ovlp())
         X, X_inv = lowdin_orth(s_ao)
 
-        # DMET Schmidt decomposition to extract bath orbitals
         ifrag = 0
+
         self.build_bath(ifrag, mo_coeff, mo_occ, X_inv, X)
         B = self.B[ifrag]
         
@@ -213,41 +212,47 @@ def kernel(self):
         # Calculate mapping weights and extract local ML components based on partition_type
         if self.partition_type == 'atom':
             self.log.info("Step 2 & 3: DMET SVD and calculating Atomic Weights...")
-            w_A = self._get_atomic_weights(dm_active_ao, dm_full_ao_low, s_ao, self.full_mol)
-            self.log.info("Step 4: Extracting matched local ML density components (Atom-based)...")
-            v_eff_ao_local, e_dc_local = self.mf_outer.get_local_veff_and_dc(atomic_weights=w_A)
+            w_active = self._get_atomic_weights(dm_active_ao, dm_full_ao_low, s_ao, self.full_mol)
+            w_core = 1.0 - w_active
+            
+            self.log.info("Step 4a: Extracting pure CORE potential using (1-w)...")
+            v_core_ao, _ = self.mf_outer.get_local_veff_and_dc(atomic_weights=w_core)
+            
+            self.log.info("Step 4b: Extracting ACTIVE components for Double Counting...")
+            v_eff_ao_local, e_dc_local = self.mf_outer.get_local_veff_and_dc(atomic_weights=w_active)
             
         elif self.partition_type == 'grid':
             self.log.info("Step 2 & 3: DMET SVD and calculating Grid Weights w(r)...")
             if self.mf_outer.grids.coords is None:
                 self.mf_outer.grids.build()
-            w_grid = self._get_grid_weights(dm_active_ao, dm_full_ao_low, self.full_mol, self.mf_outer.grids)
-            self.log.info("Step 4: Extracting matched local ML density components (Grid-based)...")
-            v_eff_ao_local, e_dc_local = self.mf_outer.get_local_veff_and_dc(grid_weights=w_grid)
+            w_active = self._get_grid_weights(dm_active_ao, dm_full_ao_low, self.full_mol, self.mf_outer.grids)
+            w_core = 1.0 - w_active
+            
+            self.log.info("Step 4a: Extracting pure CORE potential using (1-w)...")
+            v_core_ao, _ = self.mf_outer.get_local_veff_and_dc(grid_weights=w_core)
+            
+            self.log.info("Step 4b: Extracting ACTIVE components for Double Counting...")
+            v_eff_ao_local, e_dc_local = self.mf_outer.get_local_veff_and_dc(grid_weights=w_active)
             
         else:
             raise ValueError(f"Unknown partition_type: {self.partition_type}. Use 'atom' or 'grid'.")
 
         e_nuc_constant = self.full_mol.energy_nuc()
 
-        # Calculate strictly matched local low-level energy (E_L^local)
-        fock_ao_local = hcore_orig + v_eff_ao_local
-        fock_fb_local = B.T @ fock_ao_local @ B
+        # Construct exact embedded Hamiltonian: h_emb = B^T (h_core^AO + V_core) B
+        fock_core_ao = hcore_orig + v_core_ao
+        h_core_fb_eff = B.T @ fock_core_ao @ B
+        
+        self.h_emb[ifrag] = h_core_fb_eff  
+        self.e_core[ifrag] = 0.0  # ONIOM framework implies E_core shift is 0
+
+        fock_fb_local = h_core_fb_eff + (B.T @ v_eff_ao_local @ B)
         e_band_local = float(cp.sum(dm_emb_low * fock_fb_local))
         e_local_low = e_band_local - e_dc_local + e_nuc_constant
         self.log.note(f"Step 5: Matched Local Low-Level E   = {e_local_low:.8f}")
 
-        # Construct pure environment core Hamiltonian and run high-level SCF
-        fock_ao_global = hcore_orig + self.mf_outer.get_veff()
-        fock_fb_global = B.T @ fock_ao_global @ B
-        
-        v_eff_fb_local = B.T @ v_eff_ao_local @ B
-        
-        # Effective core Hamiltonian isolates the environment potential
-        h_core_fb_eff = fock_fb_global - v_eff_fb_local
-        
-        self.h_emb[ifrag] = h_core_fb_eff  
-        self.e_core[ifrag] = 0.0  # ONIOM framework implies E_core shift is 0
+        self.dm_core[ifrag] = cp.zeros_like(dm_full_ao_low)
+        self.v_core_ao[ifrag] = cp.zeros_like(dm_full_ao_low)
         
         self.log.info("Step 6: Running high-level inner SCF in embedding space...")
         self._build_inner_mf(ifrag, dm_full_ao_low)

From bca0dae0aba24344345780388a7ce485aa6207cb Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Thu, 28 May 2026 17:49:11 +0800
Subject: [PATCH 26/30] fix some bugs

---
 .../qmmm/embedding/embedding_dft_harris.py    | 135 +++++++++++++-----
 1 file changed, 96 insertions(+), 39 deletions(-)

diff --git a/gpu4pyscf/qmmm/embedding/embedding_dft_harris.py b/gpu4pyscf/qmmm/embedding/embedding_dft_harris.py
index c89c54d6b..42e38ad3f 100644
--- a/gpu4pyscf/qmmm/embedding/embedding_dft_harris.py
+++ b/gpu4pyscf/qmmm/embedding/embedding_dft_harris.py
@@ -17,6 +17,7 @@
 
 from pyscf import lib
 from gpu4pyscf.dft import rks
+from gpu4pyscf.lib.cupy_helper import tag_array
 from gpu4pyscf.qmmm.embedding.embedding import DMET, lowdin_orth, _as_cupy
 
 class HarrisRKS(rks.RKS):
@@ -48,7 +49,7 @@ def __init__(self, mol, eval_density_func, xc='LDA,VWN'):
         self._v_eff_global = None
         self._e_dc_global = None
 
-    def get_veff(self, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
+    def _get_harris_veff(self, mol=None):
 
         if mol is None: 
             mol = self.mol
@@ -78,6 +79,51 @@ def get_veff(self, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
         self._e_dc_global = e_dc
         return self._v_eff_global
 
+    def get_veff(self, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
+        if mol is None: mol = self.mol
+        if dm is None: dm = self.make_rdm1()
+        
+        dm_cp = _as_cupy(dm)
+        s_ao = _as_cupy(self.get_ovlp())
+        
+        # Calculate the actual number of electrons represented by the density matrix in AO basis
+        nelec_dm = float(cp.sum(dm_cp * s_ao))
+        
+        # Handle zero density matrix under full-system inclusion limit safely
+        if nelec_dm < 1e-4:
+            v_eff_ao = cp.zeros_like(dm_cp)
+            return tag_array(v_eff_ao, ecoul=0.0, exc=0.0, vj=cp.zeros_like(dm_cp), vk=cp.zeros_like(dm_cp))
+            
+        # Rigorous electron count inspection instead of the non-orthogonal matrix trace
+        if nelec_dm > self.mol.nelectron - 0.5:
+            v_eff_ao = self._get_harris_veff(mol)
+            e_2e = float(cp.sum(dm_cp * v_eff_ao)) - self._e_dc_global
+            return tag_array(v_eff_ao, ecoul=e_2e, exc=0.0, vj=v_eff_ao, vk=cp.zeros_like(v_eff_ao))
+        else:
+            # Core evaluation using the pre-stored complementary weights
+            if self.grids.coords is None:
+                self.grids.build()
+            if isinstance(self.current_w_core, cp.ndarray) and self.current_w_core.ndim == 1:
+                vj, vk, vxc, e_j, e_k, e_xc, int_rho_vxc = self.eval_density_func(
+                    mol, self.xc, self.grids, atomic_weights=None, grid_weights=self.current_w_core
+                )
+            else:
+                vj, vk, vxc, e_j, e_k, e_xc, int_rho_vxc = self.eval_density_func(
+                    mol, self.xc, self.grids, atomic_weights=self.current_w_core, grid_weights=None
+                )
+            v_eff_ao = _as_cupy(vj) + _as_cupy(vxc)
+            if vk is not None: v_eff_ao -= _as_cupy(vk)
+            e_k = float(e_k) if vk is not None else 0.0
+            e_dc = float(e_j) - e_k + float(int_rho_vxc) - float(e_xc)
+            e_2e = float(cp.sum(dm_cp * v_eff_ao)) - e_dc
+            return tag_array(v_eff_ao, ecoul=e_2e, exc=0.0, vj=_as_cupy(vj), vk=_as_cupy(vk) if vk is not None else cp.zeros_like(v_eff_ao))
+
+    def kernel(self, dm0=None, **kwargs):
+        # Pass through to the standard solver, get_veff handles everything natively via electron counting
+        e_tot = rks.RKS.kernel(self, dm0=dm0, **kwargs)
+        self.converged = True
+        return e_tot
+
     def energy_elec(self, dm=None, h1e=None, vhf=None):
         """
         Overrides electronic energy evaluation using the Harris energy formula:
@@ -85,7 +131,7 @@ def energy_elec(self, dm=None, h1e=None, vhf=None):
         """
         if dm is None: dm = self.make_rdm1()
         if h1e is None: h1e = self.get_hcore()
-        if vhf is None: vhf = self.get_veff(self.mol, dm)
+        if vhf is None: vhf = self._get_harris_veff(self.mol)
         
         dm_cp = _as_cupy(dm)
         h1e_cp = _as_cupy(h1e)
@@ -185,6 +231,19 @@ def _get_grid_weights(self, dm_active_ao, dm_full_ao, mol, grids):
         
         return weights
 
+    def _evaluate_embedded_energy(self, mf_obj, dm_emb, h_eval_bare, B, dm_core):
+        e_h_active = cp.sum(dm_emb * h_eval_bare)
+        
+        dm_full_ao = dm_core + B @ dm_emb @ B.T
+        
+        v_eff_full = mf_obj.get_veff(self.full_mol, dm_full_ao)
+        v_eff_core = mf_obj.get_veff(self.full_mol, dm_core)
+        
+        e_2e_full = getattr(v_eff_full, 'ecoul', 0.0) + getattr(v_eff_full, 'exc', 0.0)
+        e_2e_core = getattr(v_eff_core, 'ecoul', 0.0) + getattr(v_eff_core, 'exc', 0.0)
+        # E_active = E_1e(Active) + [E_2e(Full) - E_2e(Core)]
+        return e_h_active + e_2e_full - e_2e_core
+
     def kernel(self):
 
         if not self.mf_outer.converged:
@@ -205,8 +264,8 @@ def kernel(self):
         self.build_bath(ifrag, mo_coeff, mo_occ, X_inv, X)
         B = self.B[ifrag]
         
-        # Project density to active space and back to AO for population analysis
-        dm_emb_low = B.T @ dm_full_ao_low @ B
+        # Rigorous density matrix projection incorporating the non-orthogonal overlap metric S
+        dm_emb_low = B.T @ s_ao @ dm_full_ao_low @ s_ao @ B
         dm_active_ao = B @ dm_emb_low @ B.T
         
         # Calculate mapping weights and extract local ML components based on partition_type
@@ -215,12 +274,6 @@ def kernel(self):
             w_active = self._get_atomic_weights(dm_active_ao, dm_full_ao_low, s_ao, self.full_mol)
             w_core = 1.0 - w_active
             
-            self.log.info("Step 4a: Extracting pure CORE potential using (1-w)...")
-            v_core_ao, _ = self.mf_outer.get_local_veff_and_dc(atomic_weights=w_core)
-            
-            self.log.info("Step 4b: Extracting ACTIVE components for Double Counting...")
-            v_eff_ao_local, e_dc_local = self.mf_outer.get_local_veff_and_dc(atomic_weights=w_active)
-            
         elif self.partition_type == 'grid':
             self.log.info("Step 2 & 3: DMET SVD and calculating Grid Weights w(r)...")
             if self.mf_outer.grids.coords is None:
@@ -228,43 +281,47 @@ def kernel(self):
             w_active = self._get_grid_weights(dm_active_ao, dm_full_ao_low, self.full_mol, self.mf_outer.grids)
             w_core = 1.0 - w_active
             
-            self.log.info("Step 4a: Extracting pure CORE potential using (1-w)...")
-            v_core_ao, _ = self.mf_outer.get_local_veff_and_dc(grid_weights=w_core)
-            
-            self.log.info("Step 4b: Extracting ACTIVE components for Double Counting...")
-            v_eff_ao_local, e_dc_local = self.mf_outer.get_local_veff_and_dc(grid_weights=w_active)
-            
         else:
             raise ValueError(f"Unknown partition_type: {self.partition_type}. Use 'atom' or 'grid'.")
+        print("debug w_core:", w_core)
 
-        e_nuc_constant = self.full_mol.energy_nuc()
+        # Store w_core into mf_outer for automated core potential evaluation via trace inspection
+        self.mf_outer.current_w_core = w_core
 
-        # Construct exact embedded Hamiltonian: h_emb = B^T (h_core^AO + V_core) B
-        fock_core_ao = hcore_orig + v_core_ao
-        h_core_fb_eff = B.T @ fock_core_ao @ B
-        
-        self.h_emb[ifrag] = h_core_fb_eff  
-        self.e_core[ifrag] = 0.0  # ONIOM framework implies E_core shift is 0
-
-        fock_fb_local = h_core_fb_eff + (B.T @ v_eff_ao_local @ B)
-        e_band_local = float(cp.sum(dm_emb_low * fock_fb_local))
-        e_local_low = e_band_local - e_dc_local + e_nuc_constant
-        self.log.note(f"Step 5: Matched Local Low-Level E   = {e_local_low:.8f}")
-
-        self.dm_core[ifrag] = cp.zeros_like(dm_full_ao_low)
-        self.v_core_ao[ifrag] = cp.zeros_like(dm_full_ao_low)
+        # Standard DMET embedded Hamiltonian and core potentials construction
+        self.build_embedded_hamiltonian(ifrag, hcore_orig)
         
         self.log.info("Step 6: Running high-level inner SCF in embedding space...")
-        self._build_inner_mf(ifrag, dm_full_ao_low)
+        mf_inner = self._build_inner_mf(ifrag, dm_full_ao_low)
         self.solve_embedded(ifrag)
         
-        e_local_high = self.e_inner[ifrag]
-        self.log.note(f"Step 6: Local High-Level E (SCF)    = {e_local_high:.8f}")
-
-        self.e_tot = e_global_low - e_local_low + e_local_high
+        dm_emb_high = _as_cupy(mf_inner.make_rdm1())
+        dm_emb_low = self.dm_emb_init[ifrag]
+        
+        B = self.B[ifrag]
+        dm_core = self.dm_core[ifrag]
+        is_mean_field = hasattr(self.mf_inner_template, 'get_veff')
+        
+        if is_mean_field:
+            h_eval_bare = B.T @ hcore_orig @ B
+            
+            # Evaluate High-Level energy
+            e_high = self._evaluate_embedded_energy(
+                self.mf_inner_template, dm_emb_high, h_eval_bare, B, dm_core
+            )
+            
+            # Evaluate Low-Level energy
+            e_low = self._evaluate_embedded_energy(
+                self.mf_outer, dm_emb_low, h_eval_bare, B, dm_core
+            )
+        else:
+            raise NotImplementedError("WFT evaluation is not implemented for this class.")
+        
+        delta_e = float(e_high - e_low)
+        self.log.note(f"Global Low-Level E : {e_global_low:.8f}")
+        self.log.note(f"Active Space dE    : {delta_e:.8f}")
         
-        self.log.note("="*50)
-        self.log.note(f"FINAL ONIOM TOTAL ENERGY = {self.e_tot:.8f}")
-        self.log.note("="*50)
+        self.e_tot = e_global_low + delta_e
+        self.log.note(f"Total Embedded E   : {self.e_tot:.8f}")
         
         return self.e_tot
\ No newline at end of file

From 6669fa636096a25f7ba1e59ba9316ade7a89f29f Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Fri, 29 May 2026 16:56:06 +0800
Subject: [PATCH 27/30] Use the ML density only for the global energy and
 density construction. The low-level energy is evaluated via SCF

---
 .../qmmm/embedding/embedding_dft_harris.py    | 222 +++++-------------
 1 file changed, 53 insertions(+), 169 deletions(-)

diff --git a/gpu4pyscf/qmmm/embedding/embedding_dft_harris.py b/gpu4pyscf/qmmm/embedding/embedding_dft_harris.py
index 42e38ad3f..aee5a1060 100644
--- a/gpu4pyscf/qmmm/embedding/embedding_dft_harris.py
+++ b/gpu4pyscf/qmmm/embedding/embedding_dft_harris.py
@@ -14,11 +14,12 @@
 
 import numpy as np
 import cupy as cp
-
 from pyscf import lib
 from gpu4pyscf.dft import rks
 from gpu4pyscf.lib.cupy_helper import tag_array
 from gpu4pyscf.qmmm.embedding.embedding import DMET, lowdin_orth, _as_cupy
+from gpu4pyscf.qmmm.embedding.embedding_dft import SingleFragmentEmbedding
+
 
 class HarrisRKS(rks.RKS):
     """
@@ -45,12 +46,11 @@ def __init__(self, mol, eval_density_func, xc='LDA,VWN'):
         #   7. int_rho_vxc: Integral of rho * V_xc (scalar)
         self.eval_density_func = eval_density_func
         
-        # Cache for global evaluation results to avoid redundant ML inferences
         self._v_eff_global = None
         self._e_dc_global = None
+        self._use_harris_veff = False
 
     def _get_harris_veff(self, mol=None):
-
         if mol is None: 
             mol = self.mol
         
@@ -72,110 +72,72 @@ def _get_harris_veff(self, mol=None):
         else:
             e_k = 0.0
             
-        # Assemble double counting energy
+        # double counting energy
         e_dc = float(e_j) - e_k + float(int_rho_vxc) - float(e_xc)
         
+        vk_array = _as_cupy(vk) if vk is not None else cp.zeros_like(v_eff_ao)
+        v_eff_ao = tag_array(v_eff_ao, ecoul=float(e_j) - e_k, exc=float(e_xc), vj=_as_cupy(vj), vk=vk_array)
+        
         self._v_eff_global = v_eff_ao
         self._e_dc_global = e_dc
         return self._v_eff_global
 
     def get_veff(self, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
-        if mol is None: mol = self.mol
-        if dm is None: dm = self.make_rdm1()
-        
-        dm_cp = _as_cupy(dm)
-        s_ao = _as_cupy(self.get_ovlp())
-        
-        # Calculate the actual number of electrons represented by the density matrix in AO basis
-        nelec_dm = float(cp.sum(dm_cp * s_ao))
-        
-        # Handle zero density matrix under full-system inclusion limit safely
-        if nelec_dm < 1e-4:
-            v_eff_ao = cp.zeros_like(dm_cp)
-            return tag_array(v_eff_ao, ecoul=0.0, exc=0.0, vj=cp.zeros_like(dm_cp), vk=cp.zeros_like(dm_cp))
-            
-        # Rigorous electron count inspection instead of the non-orthogonal matrix trace
-        if nelec_dm > self.mol.nelectron - 0.5:
-            v_eff_ao = self._get_harris_veff(mol)
-            e_2e = float(cp.sum(dm_cp * v_eff_ao)) - self._e_dc_global
-            return tag_array(v_eff_ao, ecoul=e_2e, exc=0.0, vj=v_eff_ao, vk=cp.zeros_like(v_eff_ao))
-        else:
-            # Core evaluation using the pre-stored complementary weights
-            if self.grids.coords is None:
-                self.grids.build()
-            if isinstance(self.current_w_core, cp.ndarray) and self.current_w_core.ndim == 1:
-                vj, vk, vxc, e_j, e_k, e_xc, int_rho_vxc = self.eval_density_func(
-                    mol, self.xc, self.grids, atomic_weights=None, grid_weights=self.current_w_core
-                )
-            else:
-                vj, vk, vxc, e_j, e_k, e_xc, int_rho_vxc = self.eval_density_func(
-                    mol, self.xc, self.grids, atomic_weights=self.current_w_core, grid_weights=None
-                )
-            v_eff_ao = _as_cupy(vj) + _as_cupy(vxc)
-            if vk is not None: v_eff_ao -= _as_cupy(vk)
-            e_k = float(e_k) if vk is not None else 0.0
-            e_dc = float(e_j) - e_k + float(int_rho_vxc) - float(e_xc)
-            e_2e = float(cp.sum(dm_cp * v_eff_ao)) - e_dc
-            return tag_array(v_eff_ao, ecoul=e_2e, exc=0.0, vj=_as_cupy(vj), vk=_as_cupy(vk) if vk is not None else cp.zeros_like(v_eff_ao))
+        # Use ML evaluation ONLY during the global SCF step.
+        # For standard embedding steps, fallback to the native exact DFT evaluation.
+        if getattr(self, '_use_harris_veff', False):
+            return self._get_harris_veff(mol)
+        return rks.RKS.get_veff(self, mol, dm, dm_last, vhf_last, hermi)
 
     def kernel(self, dm0=None, **kwargs):
-        # Pass through to the standard solver, get_veff handles everything natively via electron counting
-        e_tot = rks.RKS.kernel(self, dm0=dm0, **kwargs)
+
+        if self.max_cycle != 1:
+            lib.logger.warn(self, "HarrisRKS is a non-iterative method. "
+                                  f"Overriding max_cycle from {self.max_cycle} to 1.")
+            self.max_cycle = 1
+
+        # Temporarily enable Harris ML potential for the global 1-step evaluation
+        self._use_harris_veff = True
+        try:
+            e_tot = rks.RKS.kernel(self, dm0=dm0, **kwargs)
+        finally:
+            self._use_harris_veff = False
+            
         self.converged = True
         return e_tot
 
     def energy_elec(self, dm=None, h1e=None, vhf=None):
         """
-        Overrides electronic energy evaluation using the Harris energy formula:
         E_elec = Tr[D * (h + Veff)] - E_DC
         """
-        if dm is None: dm = self.make_rdm1()
-        if h1e is None: h1e = self.get_hcore()
-        if vhf is None: vhf = self._get_harris_veff(self.mol)
-        
-        dm_cp = _as_cupy(dm)
-        h1e_cp = _as_cupy(h1e)
-        vhf_cp = _as_cupy(vhf)
-        
-        fock = h1e_cp + vhf_cp
-        e_band = float(cp.sum(dm_cp * fock))
-        
-        e_elec = e_band - self._e_dc_global
-        return e_elec, self._e_dc_global
-
-    def get_local_veff_and_dc(self, atomic_weights=None, grid_weights=None):
-        # Pass both weight options to the external ML interface. 
-        # The ML function should apply the provided one appropriately.
-        if self.grids.coords is None:
-            self.grids.build()
+        if getattr(self, '_use_harris_veff', False):
+            if dm is None: dm = self.make_rdm1()
+            if h1e is None: h1e = self.get_hcore()
+            if vhf is None: vhf = self._get_harris_veff(self.mol)
             
-        vj, vk, vxc, e_j, e_k, e_xc, int_rho_vxc = self.eval_density_func(
-            self.mol, self.xc, self.grids, 
-            atomic_weights=atomic_weights, 
-            grid_weights=grid_weights
-        )
-        
-        v_eff_ao_local = _as_cupy(vj) + _as_cupy(vxc)
-        if vk is not None:
-            v_eff_ao_local -= _as_cupy(vk)
-            e_k = float(e_k)
-        else:
-            e_k = 0.0
+            dm_cp = _as_cupy(dm)
+            h1e_cp = _as_cupy(h1e)
+            vhf_cp = _as_cupy(vhf)
             
-        e_dc_local = float(e_j) - e_k + float(int_rho_vxc) - float(e_xc)
-        
-        return v_eff_ao_local, e_dc_local
+            fock = h1e_cp + vhf_cp
+            e_band = float(cp.sum(dm_cp * fock))
+            
+            e_elec = e_band - self._e_dc_global
+            return e_elec, self._e_dc_global
+        else:
+            # Fallback to standard energy evaluation during embedding steps
+            return rks.RKS.energy_elec(self, dm, h1e, vhf)
 
 
-class SingleFragmentEmbedding_ML(DMET):
+class SingleFragmentEmbedding_ML(SingleFragmentEmbedding):
     """
-    Single-Fragment ONIOM-like embedding utilizing ML density scaling.
+    Single-Fragment ONIOM-like embedding utilizing ML density for the global low-level.
     
-    This class performs DMET bond-breaking via SVD, maps the DMET orbital
-    population to atomic weights, extracts a perfectly matched local ML density, 
-    and evaluates the total energy using ONIOM error cancellation.
+    This class performs DMET bond-breaking via SVD, and evaluates the local embedded 
+    energies using rigorous standard SCF evaluations to guarantee exact error cancellation 
+    between the high-level and low-level local calculations.
     """
-    def __init__(self, mf_outer, mf_inner, fragment, threshold=1e-5, partition_type='atom', verbose=None):
+    def __init__(self, mf_outer, mf_inner, fragment, threshold=1e-5, verbose=None):
         """
         Parameters
         ----------
@@ -187,62 +149,10 @@ def __init__(self, mf_outer, mf_inner, fragment, threshold=1e-5, partition_type=
             List of atom indices defining the core QM region.
         threshold : float
             Eigenvalue cutoff for the Schmidt decomposition to classify bath orbitals.
-        partition_type : str
-            'atom' for Mulliken population-based atomic weights.
-            'grid' for real-space density-based grid weights w(r) = rho_local(r) / rho_global(r).
         """
-        fragments = [fragment]
-        super().__init__(mf_outer, mf_inner, fragments,
-                         threshold=threshold, max_macro_iter=1, verbose=verbose)
+        super().__init__(mf_outer, mf_inner, fragment,
+                         threshold=threshold, verbose=verbose)
         self.fragment = self.fragments[0]
-        self.partition_type = partition_type
-
-    def _get_atomic_weights(self, dm_active_ao, dm_full_ao, s_ao, mol):
-        pop_active = cp.einsum('ij,ji->i', dm_active_ao, s_ao)
-        pop_full = cp.einsum('ij,ji->i', dm_full_ao, s_ao)
-        
-        aoslice = mol.aoslice_by_atom()
-        weights = np.zeros(mol.natm)
-        
-        for ia in range(mol.natm):
-            p0, p1 = aoslice[ia, 2], aoslice[ia, 3]
-            if p1 > p0:
-                n_active = float(cp.sum(pop_active[p0:p1]))
-                n_full = float(cp.sum(pop_full[p0:p1]))
-                
-                if n_full > 1e-12:
-                    w = n_active / n_full
-                    weights[ia] = max(0.0, min(1.0, w))
-                else:
-                    weights[ia] = 0.0
-                    
-        return weights
-
-    def _get_grid_weights(self, dm_active_ao, dm_full_ao, mol, grids):
-
-        ni = self.mf_outer._numint
-        
-        rho_active = ni.get_rho(mol, dm_active_ao, grids)
-        rho_full   = ni.get_rho(mol, dm_full_ao, grids)
-        
-        weights = rho_active / cp.maximum(rho_full, 1e-12)
-        
-        weights = cp.clip(weights, 0.0, 1.0)
-        
-        return weights
-
-    def _evaluate_embedded_energy(self, mf_obj, dm_emb, h_eval_bare, B, dm_core):
-        e_h_active = cp.sum(dm_emb * h_eval_bare)
-        
-        dm_full_ao = dm_core + B @ dm_emb @ B.T
-        
-        v_eff_full = mf_obj.get_veff(self.full_mol, dm_full_ao)
-        v_eff_core = mf_obj.get_veff(self.full_mol, dm_core)
-        
-        e_2e_full = getattr(v_eff_full, 'ecoul', 0.0) + getattr(v_eff_full, 'exc', 0.0)
-        e_2e_core = getattr(v_eff_core, 'ecoul', 0.0) + getattr(v_eff_core, 'exc', 0.0)
-        # E_active = E_1e(Active) + [E_2e(Full) - E_2e(Core)]
-        return e_h_active + e_2e_full - e_2e_core
 
     def kernel(self):
 
@@ -250,7 +160,7 @@ def kernel(self):
             self.mf_outer.kernel()
             
         e_global_low = self.mf_outer.e_tot
-        self.log.note(f"Step 1: Global Low-Level E (Harris) = {e_global_low:.8f}")
+        self.log.note(f"Global Low-Level E (Harris) = {e_global_low:.8f}")
         
         mo_coeff = _as_cupy(self.mf_outer.mo_coeff)
         mo_occ = _as_cupy(self.mf_outer.mo_occ)
@@ -262,36 +172,9 @@ def kernel(self):
         ifrag = 0
 
         self.build_bath(ifrag, mo_coeff, mo_occ, X_inv, X)
-        B = self.B[ifrag]
-        
-        # Rigorous density matrix projection incorporating the non-orthogonal overlap metric S
-        dm_emb_low = B.T @ s_ao @ dm_full_ao_low @ s_ao @ B
-        dm_active_ao = B @ dm_emb_low @ B.T
-        
-        # Calculate mapping weights and extract local ML components based on partition_type
-        if self.partition_type == 'atom':
-            self.log.info("Step 2 & 3: DMET SVD and calculating Atomic Weights...")
-            w_active = self._get_atomic_weights(dm_active_ao, dm_full_ao_low, s_ao, self.full_mol)
-            w_core = 1.0 - w_active
-            
-        elif self.partition_type == 'grid':
-            self.log.info("Step 2 & 3: DMET SVD and calculating Grid Weights w(r)...")
-            if self.mf_outer.grids.coords is None:
-                self.mf_outer.grids.build()
-            w_active = self._get_grid_weights(dm_active_ao, dm_full_ao_low, self.full_mol, self.mf_outer.grids)
-            w_core = 1.0 - w_active
-            
-        else:
-            raise ValueError(f"Unknown partition_type: {self.partition_type}. Use 'atom' or 'grid'.")
-        print("debug w_core:", w_core)
-
-        # Store w_core into mf_outer for automated core potential evaluation via trace inspection
-        self.mf_outer.current_w_core = w_core
-
-        # Standard DMET embedded Hamiltonian and core potentials construction
         self.build_embedded_hamiltonian(ifrag, hcore_orig)
         
-        self.log.info("Step 6: Running high-level inner SCF in embedding space...")
+        self.log.info("Running high-level inner SCF in embedding space...")
         mf_inner = self._build_inner_mf(ifrag, dm_full_ao_low)
         self.solve_embedded(ifrag)
         
@@ -305,15 +188,16 @@ def kernel(self):
         if is_mean_field:
             h_eval_bare = B.T @ hcore_orig @ B
             
-            # Evaluate High-Level energy
             e_high = self._evaluate_embedded_energy(
                 self.mf_inner_template, dm_emb_high, h_eval_bare, B, dm_core
             )
+            self.log.note(f"High-Level E : {e_high:.8f}")
             
-            # Evaluate Low-Level energy
+            # Evaluate Low-Level energy (mf_outer will automatically use exact get_veff for xc here)
             e_low = self._evaluate_embedded_energy(
                 self.mf_outer, dm_emb_low, h_eval_bare, B, dm_core
             )
+            self.log.note(f"Low-Level E : {e_low:.8f}")
         else:
             raise NotImplementedError("WFT evaluation is not implemented for this class.")
         

From 6972d2346a3a6006aa095c89898f3bcb1aa919bf Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Mon, 1 Jun 2026 08:58:03 +0800
Subject: [PATCH 28/30] add the unit test for the ML-density oniom embedding.

---
 .../tests/test_dft_embedding_harris.py        | 135 ++++++++++++++++++
 1 file changed, 135 insertions(+)
 create mode 100644 gpu4pyscf/qmmm/embedding/tests/test_dft_embedding_harris.py

diff --git a/gpu4pyscf/qmmm/embedding/tests/test_dft_embedding_harris.py b/gpu4pyscf/qmmm/embedding/tests/test_dft_embedding_harris.py
new file mode 100644
index 000000000..631bb9314
--- /dev/null
+++ b/gpu4pyscf/qmmm/embedding/tests/test_dft_embedding_harris.py
@@ -0,0 +1,135 @@
+# Copyright 2021-2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import cupy as cp
+from pyscf import gto
+from gpu4pyscf.dft import rks
+from gpu4pyscf.qmmm.embedding.embedding_dft import SingleFragmentEmbedding
+from gpu4pyscf.qmmm.embedding.embedding_dft_harris import HarrisRKS, SingleFragmentEmbedding_ML
+
+
+def dummy_eval_density_func(mol, xc, grids, atomic_weights=None, grid_weights=None):
+    mf = rks.RKS(mol)
+    mf.xc = xc
+    mf.grids = grids
+    mf.verbose = 0
+    mf.kernel()
+    
+    dm = cp.asarray(mf.make_rdm1())
+    
+    # Calculate exact J and K matrices
+    vj, vk = mf.get_jk(mol, dm)
+    e_j = 0.5 * float(cp.sum(dm * vj))
+    
+    is_hybrid = mf._numint.libxc.is_hybrid_xc(xc)
+    if is_hybrid:
+        hyb = mf._numint.libxc.hybrid_coeff(xc, spin=mol.spin)
+        vk = vk * hyb
+        e_k = 0.5 * float(cp.sum(dm * vk))
+    else:
+        vk = None
+        e_k = 0.0
+        
+    # Calculate exact Vxc and Exc
+    _, e_xc, vxc = mf._numint.nr_rks(mol, grids, xc, dm)
+    int_rho_vxc = float(cp.sum(dm * vxc))
+    
+    return vj, vk, vxc, e_j, e_k, float(e_xc), int_rho_vxc
+
+
+class TestMLEmbedding(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.mol = gto.Mole()
+        cls.mol.atom = '''
+            C      -0.76091    -0.00000     0.00000
+            C       0.76091    -0.00000     0.00000
+            H      -1.16001     1.02029     0.00000
+            H      -1.16001    -0.51014    -0.88357
+            H      -1.16001    -0.51014     0.88357
+            H       1.16001    -1.02029     0.00000
+            H       1.16001     0.51014     0.88357
+            H       1.16001     0.51014    -0.88357    
+        '''
+        cls.mol.basis = '6-31g'
+        cls.mol.spin = 0
+        cls.mol.charge = 0
+        cls.mol.verbose = 0
+        cls.mol.build()
+
+        cls.methyl_fragment = [0, 2, 3, 4]
+        cls.full_fragment = [i for i in range(cls.mol.natm)]
+
+    @classmethod
+    def tearDownClass(cls):
+        del cls.mol
+
+    def test_harris_rks_exactness(self):
+        mf_ref = rks.RKS(self.mol, xc='PBE')
+        mf_ref.verbose = 0
+        e_ref = mf_ref.kernel()
+
+        mf_harris = HarrisRKS(self.mol, dummy_eval_density_func, xc='PBE')
+        mf_harris.verbose = 0
+        e_harris = mf_harris.kernel()
+
+        self.assertAlmostEqual(e_ref, e_harris, places=8, 
+                               msg=f"HarrisRKS energy {e_harris} differs from exact RKS {e_ref}")
+
+    def test_full_system_pbe_in_pbe(self):
+        mf_outer = HarrisRKS(self.mol, dummy_eval_density_func, xc='PBE')
+        mf_inner = rks.RKS(self.mol, xc='PBE')
+        
+        emb_obj = SingleFragmentEmbedding_ML(mf_outer, mf_inner, self.full_fragment, verbose=0)
+        emb_obj.kernel()
+        
+        mf_outer.kernel()
+        e_global = mf_outer.e_tot
+        e_emb = emb_obj.e_tot
+        
+        self.assertAlmostEqual(e_global, e_emb, places=8, 
+                               msg="Full-system PBE-in-PBE failed exact cancellation.")
+
+    def test_equivalence_to_standard_embedding(self):
+
+        mf_outer_std = rks.RKS(self.mol, xc='PBE')
+        mf_inner_std = rks.RKS(self.mol, xc='B3LYP')
+        emb_std = SingleFragmentEmbedding(mf_outer_std, mf_inner_std, self.methyl_fragment, verbose=0)
+        e_std = emb_std.kernel()
+
+        mf_outer_ml = HarrisRKS(self.mol, dummy_eval_density_func, xc='PBE')
+        mf_inner_ml = rks.RKS(self.mol, xc='B3LYP')
+        emb_ml = SingleFragmentEmbedding_ML(mf_outer_ml, mf_inner_ml, self.methyl_fragment, verbose=0)
+        e_ml = emb_ml.kernel()
+
+        self.assertAlmostEqual(e_std, e_ml, places=8, 
+                               msg=f"ML Embedding {e_ml} diverged from Standard Embedding {e_std}!")
+
+    def test_harris_max_cycle_override(self):
+
+        mf_harris = HarrisRKS(self.mol, dummy_eval_density_func, xc='PBE')
+        mf_harris.max_cycle = 100 
+        mf_harris.verbose = 0
+        
+        mf_harris.kernel()
+        
+        self.assertEqual(mf_harris.max_cycle, 1, 
+                         "HarrisRKS failed to override malicious max_cycle setting!")
+
+if __name__ == '__main__':
+    print("Full Tests for ML-Driven ONIOM-like Embedding...")
+    unittest.main()
+

From 59fe2ec63e51372dbd5980e31d180cbf3a088327 Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Mon, 1 Jun 2026 09:52:37 +0800
Subject: [PATCH 29/30] add the example and fix some typos

---
 .../50-example_ml_density_embedding.py        | 124 ++++++++++++++++++
 .../qmmm/embedding/embedding_dft_harris.py    |   6 +-
 .../tests/test_dft_embedding_harris.py        |   2 +-
 3 files changed, 127 insertions(+), 5 deletions(-)
 create mode 100644 examples/embedding/50-example_ml_density_embedding.py

diff --git a/examples/embedding/50-example_ml_density_embedding.py b/examples/embedding/50-example_ml_density_embedding.py
new file mode 100644
index 000000000..b3f408d73
--- /dev/null
+++ b/examples/embedding/50-example_ml_density_embedding.py
@@ -0,0 +1,124 @@
+# Copyright 2021-2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Example: ML-Driven DFT Embedding (ONIOM-like scheme)
+
+This example demonstrates how to use the `HarrisRKS` and `SingleFragmentEmbedding_ML` 
+classes to perform a multi-scale quantum chemistry calculation (QM/QM). 
+It uses a dummy ML density evaluator to simulate an ultra-fast global PBE calculation, 
+and then performs a rigorous B3LYP high-level calculation only on the active fragment.
+"""
+
+import numpy as np
+import cupy as cp
+from pyscf import gto
+from gpu4pyscf.dft import rks
+from gpu4pyscf.qmmm.embedding.embedding_dft_harris import HarrisRKS, SingleFragmentEmbedding_ML
+
+
+def dummy_eval_density_func(mol, xc, grids):
+    """
+    A pure DFT surrogate that mimics the behavior of an ML density predictor.
+    It performs a standard SCF to convergence and returns the exact potentials 
+    and energies, acting as the "Ground Truth" ML model.
+    """
+    print("\n[ML Surrogate] Generating density and effective potentials...")
+    mf = rks.RKS(mol)
+    mf.xc = xc
+    mf.grids = grids
+    mf.verbose = 0
+    mf.kernel()
+    
+    dm = cp.asarray(mf.make_rdm1())
+    vj, vk = mf.get_jk(mol, dm)
+    e_j = 0.5 * float(cp.sum(dm * vj))
+    
+    is_hybrid = mf._numint.libxc.is_hybrid_xc(xc)
+    if is_hybrid:
+        hyb = mf._numint.libxc.hybrid_coeff(xc, spin=mol.spin)
+        vk = vk * hyb
+        e_k = 0.5 * float(cp.sum(dm * vk))
+    else:
+        vk = None
+        e_k = 0.0
+        
+    _, e_xc, vxc = mf._numint.nr_rks(mol, grids, xc, dm)
+    int_rho_vxc = float(cp.sum(dm * vxc))
+    
+    print("[ML Surrogate] Potential generation completed.\n")
+    return vj, vk, vxc, e_j, e_k, float(e_xc), int_rho_vxc
+
+
+def main():
+    # 1. Build a target molecule (e.g., Hexane)
+    mol = gto.Mole()
+    mol.atom = '''
+        C   1.4522500000  -2.8230000000   0.0000000000
+        C   1.4522500000  -1.2830000000   0.0000000000
+        C   0.0002500000  -0.7700000000   0.0000000000
+        C   0.0002500000   0.7700000000   0.0000000000
+        C  -1.4517500000   1.2830000000   0.0000000000
+        C  -1.4517500000   2.8230000000   0.0000000000
+        H   2.4792500000  -3.1870000000   0.0000000000
+        H   0.9382500000  -3.1870000000   0.8900000000
+        H   0.9382500000  -3.1870000000  -0.8900000000
+        H   1.9652500000  -0.9200000000   0.8900000000
+        H   1.9652500000  -0.9200000000  -0.8900000000
+        H  -0.5137500000  -1.1330000000  -0.8900000000
+        H  -0.5137500000  -1.1330000000   0.8900000000
+        H   0.5132500000   1.1330000000   0.8900000000
+        H   0.5132500000   1.1330000000  -0.8900000000
+        H  -1.9657500000   0.9200000000  -0.8900000000
+        H  -1.9657500000   0.9200000000   0.8900000000
+        H  -2.4797500000   3.1870000000   0.0000000000
+        H  -0.9377500000   3.1870000000   0.8900000000
+        H  -0.9377500000   3.1870000000  -0.8900000000
+    '''
+    mol.basis = 'sto3g' # Use a small basis set for quick demonstration
+    mol.spin = 0
+    mol.verbose = 4
+    mol.build()
+
+    # 2. Define the active region (e.g., the terminal methyl group: C + 3xH)
+    methyl_fragment = [0, 6, 7, 8]
+    
+    print("==================================================")
+    print("   Starting ML-Driven DFT Embedding Calculation   ")
+    print("==================================================")
+
+    # 3. Setup the Global Low-Level Solver (driven by ML)
+    # This evaluates the full system using the Harris functional approach in 1 step.
+    mf_outer = HarrisRKS(mol, dummy_eval_density_func, xc='PBE')
+    
+    # 4. Setup the Local High-Level Solver (Standard rigorous DFT)
+    # This will only be executed within the embedded active space.
+    mf_inner = rks.RKS(mol, xc='B3LYP')
+    
+    # 5. Initialize and execute the ML Embedding framework
+    emb_obj = SingleFragmentEmbedding_ML(mf_outer, mf_inner, methyl_fragment)
+    e_tot = emb_obj.kernel()
+    
+    print("\n==================================================")
+    print("                 Summary of Results               ")
+    print("==================================================")
+    print(f"Global Low-Level E (ML-PBE) : {mf_outer.e_tot:.8f} Hartree")
+    print(f"High-Level Local E (B3LYP)  : {emb_obj.e_inner[0]:.8f} Hartree")
+    print(f"Low-Level Local E (PBE)     : {emb_obj.e_inner[0] - emb_obj.e_tot + mf_outer.e_tot:.8f} Hartree") # Reverse engineered for display
+    print(f"--------------------------------------------------")
+    print(f"FINAL ONIOM TOTAL ENERGY    : {e_tot:.8f} Hartree")
+    print("==================================================")
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/gpu4pyscf/qmmm/embedding/embedding_dft_harris.py b/gpu4pyscf/qmmm/embedding/embedding_dft_harris.py
index aee5a1060..01e342d15 100644
--- a/gpu4pyscf/qmmm/embedding/embedding_dft_harris.py
+++ b/gpu4pyscf/qmmm/embedding/embedding_dft_harris.py
@@ -35,7 +35,7 @@ def __init__(self, mol, eval_density_func, xc='LDA,VWN'):
         self.max_cycle = 1  
         
         # eval_density_func is the external ML interface.
-        # Signature: def func(mol, xc, grids, atomic_weights=None, grid_weights=None)
+        # Signature: def func(mol, xc, grids)
         # Returns 7 elements:
         #   1. vj: Coulomb potential matrix (AO basis)
         #   2. vk: Exact exchange potential matrix (AO basis, can be None for pure DFT)
@@ -60,10 +60,8 @@ def _get_harris_veff(self, mol=None):
         if self.grids.coords is None:
             self.grids.build()
             
-        # Global evaluation uses no weights
         vj, vk, vxc, e_j, e_k, e_xc, int_rho_vxc = self.eval_density_func(
-            mol, self.xc, self.grids, atomic_weights=None, grid_weights=None
-        )
+            mol, self.xc, self.grids)
         
         v_eff_ao = _as_cupy(vj) + _as_cupy(vxc)
         if vk is not None:
diff --git a/gpu4pyscf/qmmm/embedding/tests/test_dft_embedding_harris.py b/gpu4pyscf/qmmm/embedding/tests/test_dft_embedding_harris.py
index 631bb9314..3a2db6773 100644
--- a/gpu4pyscf/qmmm/embedding/tests/test_dft_embedding_harris.py
+++ b/gpu4pyscf/qmmm/embedding/tests/test_dft_embedding_harris.py
@@ -21,7 +21,7 @@
 from gpu4pyscf.qmmm.embedding.embedding_dft_harris import HarrisRKS, SingleFragmentEmbedding_ML
 
 
-def dummy_eval_density_func(mol, xc, grids, atomic_weights=None, grid_weights=None):
+def dummy_eval_density_func(mol, xc, grids):
     mf = rks.RKS(mol)
     mf.xc = xc
     mf.grids = grids

From accdebf34fcd2ae5fa733b2b07bf9421474fb687 Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Thu, 4 Jun 2026 14:54:14 +0800
Subject: [PATCH 30/30] print the true energy in dmet oniom

---
 gpu4pyscf/qmmm/embedding/embedding.py         |  7 ++-
 gpu4pyscf/qmmm/embedding/embedding_dft.py     | 61 ++++++++++++++++---
 .../qmmm/embedding/embedding_dft_harris.py    | 46 ++++++++++++++
 3 files changed, 106 insertions(+), 8 deletions(-)

diff --git a/gpu4pyscf/qmmm/embedding/embedding.py b/gpu4pyscf/qmmm/embedding/embedding.py
index 462ac95b3..d3ff8385c 100644
--- a/gpu4pyscf/qmmm/embedding/embedding.py
+++ b/gpu4pyscf/qmmm/embedding/embedding.py
@@ -80,7 +80,7 @@ def schmidt_decompose(mo_coeff_oao, mo_occ, frag_idx, env_idx, threshold=1e-5):
     C_rot = C_occ @ Vh.T
     
     # Broadly select all potential bath orbitals (including pure fragment ones S ~ 1.0)
-    is_bath_candidate = S > threshold
+    is_bath_candidate = (S > threshold) #& (S < 1.0 - threshold)
     is_core_small = S <= threshold
     n_sv = len(S)
     
@@ -461,6 +461,11 @@ def kernel(self):
                 self.build_embedded_hamiltonian(ifrag, hcore_orig)
                 mf_inner = self._build_inner_mf(ifrag, dm_full_ao)
                 self.solve_embedded(ifrag)
+                if not self.mf_inner[ifrag].converged:
+                    raise RuntimeError(
+                        f"Embedded high-level SCF did not converge for fragment {ifrag}; "
+                        "do not use this density for delta energy."
+                    )
 
                 dm_emb = _as_cupy(mf_inner.make_rdm1())
                 
diff --git a/gpu4pyscf/qmmm/embedding/embedding_dft.py b/gpu4pyscf/qmmm/embedding/embedding_dft.py
index 8639576a8..b405598a9 100644
--- a/gpu4pyscf/qmmm/embedding/embedding_dft.py
+++ b/gpu4pyscf/qmmm/embedding/embedding_dft.py
@@ -46,19 +46,20 @@ def __init__(self, mf_outer, mf_inner, fragment, threshold=1e-5, verbose=None):
                          threshold=threshold, max_macro_iter=1, verbose=verbose)
         
         self.fragment = self.fragments[0]
-        
+    
     def _evaluate_embedded_energy(self, mf_obj, dm_emb, h_eval_bare, B, dm_core):
-        e_h_active = cp.sum(dm_emb * h_eval_bare)
+        e_h_active = float(cp.sum(dm_emb * h_eval_bare))
         
         dm_full_ao = dm_core + B @ dm_emb @ B.T
         
         v_eff_full = mf_obj.get_veff(self.full_mol, dm_full_ao)
-        v_eff_core = mf_obj.get_veff(self.full_mol, dm_core)
+        e_2e_full = float(getattr(v_eff_full, 'ecoul', 0.0) + getattr(v_eff_full, 'exc', 0.0))
         
-        e_2e_full = getattr(v_eff_full, 'ecoul', 0.0) + getattr(v_eff_full, 'exc', 0.0)
-        e_2e_core = getattr(v_eff_core, 'ecoul', 0.0) + getattr(v_eff_core, 'exc', 0.0)
-        # E_active = E_1e(Active) + [E_2e(Full) - E_2e(Core)]
-        return e_h_active + e_2e_full - e_2e_core
+        hcore_orig = _as_cupy(self.mf_outer.get_hcore())
+        e_1e_core = float(cp.sum(dm_core * hcore_orig))
+        
+        e_nuc = float(self.full_mol.energy_nuc())
+        return e_nuc + e_1e_core + e_h_active + e_2e_full
 
     def kernel(self):
         if not self.mf_outer.converged:
@@ -80,8 +81,54 @@ def kernel(self):
         
         # Build and Run Inner embedded solver
         mf_inner = self._build_inner_mf(ifrag, dm_full_ao_low)
+        
+        B_mat = self.B[ifrag]
+        dm_core_mat = self.dm_core[ifrag]
+        h_eval_bare_mat = B_mat.T @ hcore_orig @ B_mat
+
+        # Add the missing core 1-electron energy (kinetic + nuclear attraction from the frozen core)
+        e1_core = float(cp.sum(dm_core_mat * hcore_orig))
+        
+        # Precompute the frozen core's 2-electron energy (constant during inner SCF)
+        v_eff_core_high = self.mf_inner_template.get_veff(self.full_mol, dm_core_mat)
+        e_coul_core = float(getattr(v_eff_core_high, 'ecoul', 0.0))
+        e_xc_core = float(getattr(v_eff_core_high, 'exc', 0.0))
+        
+        e_nuc_full = float(self.full_mol.energy_nuc())
+        mf_inner.energy_nuc = lambda *args, **kwargs: e_nuc_full
+        
+        # Override energy_elec so the inner SCF reports e1 (active + frozen core) plus the 2e energy taken from vhf
+        def custom_energy_elec(dm=None, h1e=None, vhf=None):
+            if dm is None: dm = mf_inner.make_rdm1()
+            if vhf is None: vhf = mf_inner.get_veff(mf_inner.mol, dm)
+            
+            dm_cp = _as_cupy(dm)
+            
+            # e1: Active space single-electron energy + Core single-electron energy
+            e1_active = float(cp.sum(dm_cp * h_eval_bare_mat))
+            e1 = e1_active + e1_core
+            
+            # e2: 2e energy as reported by vhf (ecoul + exc); the core part is subtracted only in scf_summary below
+            ecoul_full = float(getattr(vhf, 'ecoul', 0.0))
+            exc_full = float(getattr(vhf, 'exc', 0.0))
+            e2 = ecoul_full + exc_full
+            
+            # Update scf_summary for meaningful PySCF debugging output
+            mf_inner.scf_summary['e1'] = e1
+            mf_inner.scf_summary['coul'] = ecoul_full - e_coul_core
+            mf_inner.scf_summary['exc'] = exc_full - e_xc_core
+            
+            return e1 + e2, e2
+            
+        mf_inner.energy_elec = custom_energy_elec
+        
         self.log.info("Running high-level inner solver...")
         self.solve_embedded(ifrag)
+        if not self.mf_inner[ifrag].converged:
+            raise RuntimeError(
+                f"Embedded high-level SCF did not converge for fragment {ifrag}; "
+                "do not use this density for delta energy."
+            )
         
         dm_emb_high = _as_cupy(mf_inner.make_rdm1())
         dm_emb_low = self.dm_emb_init[ifrag]
diff --git a/gpu4pyscf/qmmm/embedding/embedding_dft_harris.py b/gpu4pyscf/qmmm/embedding/embedding_dft_harris.py
index 01e342d15..2c8db5586 100644
--- a/gpu4pyscf/qmmm/embedding/embedding_dft_harris.py
+++ b/gpu4pyscf/qmmm/embedding/embedding_dft_harris.py
@@ -174,7 +174,53 @@ def kernel(self):
         
         self.log.info("Running high-level inner SCF in embedding space...")
         mf_inner = self._build_inner_mf(ifrag, dm_full_ao_low)
+        
+        B_mat = self.B[ifrag]
+        dm_core_mat = self.dm_core[ifrag]
+        h_eval_bare_mat = B_mat.T @ hcore_orig @ B_mat
+        
+        # Add the missing core 1-electron energy (kinetic + nuclear attraction from the frozen core)
+        e1_core = float(cp.sum(dm_core_mat * hcore_orig))
+        
+        # Precompute the frozen core's 2-electron energy (constant during inner SCF)
+        v_eff_core_high = self.mf_inner_template.get_veff(self.full_mol, dm_core_mat)
+        e_coul_core = float(getattr(v_eff_core_high, 'ecoul', 0.0))
+        e_xc_core = float(getattr(v_eff_core_high, 'exc', 0.0))
+        
+        e_nuc_full = float(self.full_mol.energy_nuc())
+        mf_inner.energy_nuc = lambda *args, **kwargs: e_nuc_full
+        
+        # Override energy_elec to print the true full system energy
+        def custom_energy_elec(dm=None, h1e=None, vhf=None):
+            if dm is None: dm = mf_inner.make_rdm1()
+            if vhf is None: vhf = mf_inner.get_veff(mf_inner.mol, dm)
+            
+            dm_cp = _as_cupy(dm)
+            
+            # e1: Active space single-electron energy + Core single-electron energy
+            e1_active = float(cp.sum(dm_cp * h_eval_bare_mat))
+            e1 = e1_active + e1_core
+            
+            # e2: 2e energy as reported by vhf (ecoul + exc); the core part is subtracted only in scf_summary below
+            ecoul_full = float(getattr(vhf, 'ecoul', 0.0))
+            exc_full = float(getattr(vhf, 'exc', 0.0))
+            e2 = ecoul_full + exc_full
+            
+            # Update scf_summary for meaningful debugging output
+            mf_inner.scf_summary['e1'] = e1
+            mf_inner.scf_summary['coul'] = ecoul_full - e_coul_core
+            mf_inner.scf_summary['exc'] = exc_full - e_xc_core
+            
+            return e1 + e2, e2
+            
+        mf_inner.energy_elec = custom_energy_elec
+
         self.solve_embedded(ifrag)
+        if not self.mf_inner[ifrag].converged:
+            raise RuntimeError(
+                f"Embedded high-level SCF did not converge for fragment {ifrag}; "
+                "do not use this density for delta energy."
+            )
         
         dm_emb_high = _as_cupy(mf_inner.make_rdm1())
         dm_emb_low = self.dm_emb_init[ifrag]