diff --git a/examples/embedding/48-dmet-embedding.py b/examples/embedding/48-dmet-embedding.py
new file mode 100644
index 000000000..f2adc8b1b
--- /dev/null
+++ b/examples/embedding/48-dmet-embedding.py
@@ -0,0 +1,86 @@
+# Copyright 2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Example 48: Standard Multi-Fragment Self-Consistent DMET Calculation.
+
+This script demonstrates how to partition a molecule into multiple fragments
+and optimize the global correlation potential (u_oao) to match high-level and 
+low-level local 1-RDM density matrices self-consistently.
+"""
+
+from pyscf import gto
+from gpu4pyscf.scf import hf as gpu_hf
+from gpu4pyscf.qmmm.embedding.embedding import DMET
+
+def run_dmet_example():
+    # 1. Define the system (Ethane molecule with 6-31G basis)
+    mol = gto.Mole()
+    mol.atom = '''
+        C      -0.76091    -0.00000     0.00000
+        C       0.76091    -0.00000     0.00000
+        H      -1.16001     1.02029     0.00000
+        H      -1.16001    -0.51014    -0.88357
+        H      -1.16001    -0.51014     0.88357
+        H       1.16001    -1.02029     0.00000
+        H       1.16001     0.51014     0.88357
+        H       1.16001     0.51014    -0.88357    
+    '''
+    mol.basis = '6-31g'
+    mol.verbose = 4  # Set verbose to see detailed DMET iteration logs
+    mol.build()
+
+    print("--- Step 1: Initialize Low-Level and High-Level Solver Templates ---")
+    # In this classic exact-back-to-exact test case, we nest RHF within RHF.
+    # DMET should converge the correlation potential to exactly zero.
+    mf_outer = gpu_hf.RHF(mol)
+    mf_outer.conv_tol = 1e-12
+    
+    mf_inner_template = gpu_hf.RHF(mol)
+    mf_inner_template.conv_tol = 1e-12
+
+    print("\n--- Step 2: Define Molecular Fragments ---")
+    # Partition the Ethane molecule into two methyl fragments based on atom indices:
+    # Fragment 0: First Methyl group [C1, H1, H2, H3]
+    # Fragment 1: Second Methyl group [C2, H4, H5, H6]
+    fragments = [
+        [0, 2, 3, 4],
+        [1, 5, 6, 7]
+    ]
+    print(f"Fragment 0 atom indices: {fragments[0]}")
+    print(f"Fragment 1 atom indices: {fragments[1]}")
+
+    print("\n--- Step 3: Setup and Execute the Self-Consistent DMET Solver ---")
+    dmet_solver = DMET(
+        mf_outer=mf_outer,
+        mf_inner=mf_inner_template,
+        fragments=fragments,
+        threshold=1e-5,       # SVD eigenvalue threshold for bath selection
+        max_macro_iter=20,    # Max macro loops for correlation potential fitting
+        macro_tol=1e-4        # Convergence tolerance for the density matching cost
+    )
+
+    # Trigger the DMET macroscopic self-consistent optimization
+    e_dmet = dmet_solver.kernel()
+
+    print("\n--- Final Results Summary ---")
+    # Run the raw full system RHF as an exact reference
+    e_hf_ref = mf_outer.kernel()
+    
+    print(f"Global Reference RHF Energy  : {e_hf_ref:.8f} Hartree") # -79.19706462
+    print(f"Macroscopic DMET Total Energy: {e_dmet:.8f} Hartree") # -79.19706462
+    print(f"Absolute Energy Deviation    : {abs(e_dmet - e_hf_ref):.2e} Hartree") # 9.15e-11 Hartree
+
+if __name__ == '__main__':
+    run_dmet_example()
\ No newline at end of file
diff --git a/examples/embedding/49-dft-dmet-embedding.py b/examples/embedding/49-dft-dmet-embedding.py
new file mode 100644
index 000000000..a3fe49afc
--- /dev/null
+++ b/examples/embedding/49-dft-dmet-embedding.py
@@ -0,0 +1,84 @@
+# Copyright 2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Example 49: Single-Fragment Delta-Method DFT-in-DFT Embedding.
+
+This script demonstrates how to embed a high-level hybrid DFT functional (B3LYP)
+into a localized region of a low-level GGA DFT environment (PBE) using a 
+highly-optimized projection basis without macroscopic iterations.
+"""
+
+from pyscf import gto
+from gpu4pyscf.dft import rks
+from gpu4pyscf.qmmm.embedding.embedding_dft import SingleFragmentEmbedding
+
+def run_dft_embedding_example():
+    # 1. Define the system (Ethane molecule with 6-31G basis)
+    mol = gto.Mole()
+    mol.atom = '''
+        C      -0.76091    -0.00000     0.00000
+        C       0.76091    -0.00000     0.00000
+        H      -1.16001     1.02029     0.00000
+        H      -1.16001    -0.51014    -0.88357
+        H      -1.16001    -0.51014     0.88357
+        H       1.16001    -1.02029     0.00000
+        H       1.16001     0.51014     0.88357
+        H       1.16001     0.51014    -0.88357    
+    '''
+    mol.basis = '6-31g'
+    mol.verbose = 4  # Enable to monitor localized cluster basis dimensions and logs
+    mol.build()
+
+    print("--- Step 1: Prepare Environment (PBE) and Active Region (B3LYP) Solvers ---")
+    # Low-level full system solver (Environment description)
+    mf_outer = rks.RKS(mol, xc='PBE')
+    mf_outer.conv_tol = 1e-10
+    
+    # High-level solver template (Active cluster description)
+    mf_inner_template = rks.RKS(mol, xc='B3LYP')
+    mf_inner_template.conv_tol = 1e-10
+
+    print("\n--- Step 2: Define Single Target Active Fragment ---")
+    # Select only one methyl group as the active QM region. 
+    # The other half will automatically serve as the embedding environment.
+    active_fragment = [0, 2, 3, 4]
+    print(f"Target QM Active Region atom indices: {active_fragment}")
+
+    print("\n--- Step 3: Initialize and Run Single Fragment Embedding ---")
+    # Construct the single-shot embedding object. Notice that mf_inner_template 
+    # will be cloned internally via .copy() to completely avoid cache poisoning.
+    emb_obj = SingleFragmentEmbedding(
+        mf_outer=mf_outer,
+        mf_inner=mf_inner_template,
+        fragment=active_fragment,
+        threshold=1e-5  # Filters out pure fragment states and numerical noise
+    )
+
+    # Compute the final multi-scale total energy via the delta method:
+    # E_tot = E_PBE(Full) + [E_B3LYP(Active) - E_PBE(Active)]
+    e_embedded_tot = emb_obj.kernel()
+
+    print("\n--- Step 4: Verification of Template Isolation ---")
+    # Verify that our protection armor works seamlessly: 
+    # Executing the template after embedding must converge successfully without any side effects.
+    print("Verifying inner template isolation status...")
+    mf_inner_template.kernel()
+    if mf_inner_template.converged:
+        print("Template isolation check passed successfully! No cache poisoning detected.")
+    else:
+        print("Warning: Template convergence failed, check cache isolation leaks.")
+
+if __name__ == '__main__':
+    run_dft_embedding_example()
\ No newline at end of file
diff --git a/examples/embedding/50-example_ml_density_embedding.py b/examples/embedding/50-example_ml_density_embedding.py
new file mode 100644
index 000000000..b3f408d73
--- /dev/null
+++ b/examples/embedding/50-example_ml_density_embedding.py
@@ -0,0 +1,124 @@
+# Copyright 2021-2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Example: ML-Driven DFT Embedding (ONIOM-like scheme)
+
+This example demonstrates how to use the `HarrisRKS` and `SingleFragmentEmbedding_ML` 
+classes to perform a multi-scale quantum chemistry calculation (QM/QM). 
+It uses a dummy ML density evaluator to simulate an ultra-fast global PBE calculation, 
+and then performs a rigorous B3LYP high-level calculation only on the active fragment.
+"""
+
+import numpy as np
+import cupy as cp
+from pyscf import gto
+from gpu4pyscf.dft import rks
+from gpu4pyscf.qmmm.embedding.embedding_dft_harris import HarrisRKS, SingleFragmentEmbedding_ML
+
+
+def dummy_eval_density_func(mol, xc, grids):
+    """
+    A pure DFT surrogate that mimics the behavior of an ML density predictor.
+    It performs a standard SCF to convergence and returns the exact potentials 
+    and energies, acting as the "Ground Truth" ML model.
+    """
+    print("\n[ML Surrogate] Generating density and effective potentials...")
+    mf = rks.RKS(mol)
+    mf.xc = xc
+    mf.grids = grids
+    mf.verbose = 0
+    mf.kernel()
+    
+    dm = cp.asarray(mf.make_rdm1())
+    vj, vk = mf.get_jk(mol, dm)
+    e_j = 0.5 * float(cp.sum(dm * vj))
+    
+    is_hybrid = mf._numint.libxc.is_hybrid_xc(xc)
+    if is_hybrid:
+        hyb = mf._numint.libxc.hybrid_coeff(xc, spin=mol.spin)
+        vk = vk * hyb
+        e_k = 0.5 * float(cp.sum(dm * vk))
+    else:
+        vk = None
+        e_k = 0.0
+        
+    _, e_xc, vxc = mf._numint.nr_rks(mol, grids, xc, dm)
+    int_rho_vxc = float(cp.sum(dm * vxc))
+    
+    print("[ML Surrogate] Potential generation completed.\n")
+    return vj, vk, vxc, e_j, e_k, float(e_xc), int_rho_vxc
+
+
+def main():
+    # 1. Build a target molecule (e.g., Hexane)
+    mol = gto.Mole()
+    mol.atom = '''
+        C   1.4522500000  -2.8230000000   0.0000000000
+        C   1.4522500000  -1.2830000000   0.0000000000
+        C   0.0002500000  -0.7700000000   0.0000000000
+        C   0.0002500000   0.7700000000   0.0000000000
+        C  -1.4517500000   1.2830000000   0.0000000000
+        C  -1.4517500000   2.8230000000   0.0000000000
+        H   2.4792500000  -3.1870000000   0.0000000000
+        H   0.9382500000  -3.1870000000   0.8900000000
+        H   0.9382500000  -3.1870000000  -0.8900000000
+        H   1.9652500000  -0.9200000000   0.8900000000
+        H   1.9652500000  -0.9200000000  -0.8900000000
+        H  -0.5137500000  -1.1330000000  -0.8900000000
+        H  -0.5137500000  -1.1330000000   0.8900000000
+        H   0.5132500000   1.1330000000   0.8900000000
+        H   0.5132500000   1.1330000000  -0.8900000000
+        H  -1.9657500000   0.9200000000  -0.8900000000
+        H  -1.9657500000   0.9200000000   0.8900000000
+        H  -2.4797500000   3.1870000000   0.0000000000
+        H  -0.9377500000   3.1870000000   0.8900000000
+        H  -0.9377500000   3.1870000000  -0.8900000000
+    '''
+    mol.basis = 'sto3g' # Use a small basis set for quick demonstration
+    mol.spin = 0
+    mol.verbose = 4
+    mol.build()
+
+    # 2. Define the active region (e.g., the terminal methyl group: C + 3xH)
+    methyl_fragment = [0, 6, 7, 8]
+    
+    print("==================================================")
+    print("   Starting ML-Driven DFT Embedding Calculation   ")
+    print("==================================================")
+
+    # 3. Setup the Global Low-Level Solver (driven by ML)
+    # This evaluates the full system using the Harris functional approach in 1 step.
+    mf_outer = HarrisRKS(mol, dummy_eval_density_func, xc='PBE')
+    
+    # 4. Setup the Local High-Level Solver (Standard rigorous DFT)
+    # This will only be executed within the embedded active space.
+    mf_inner = rks.RKS(mol, xc='B3LYP')
+    
+    # 5. Initialize and execute the ML Embedding framework
+    emb_obj = SingleFragmentEmbedding_ML(mf_outer, mf_inner, methyl_fragment)
+    e_tot = emb_obj.kernel()
+    
+    print("\n==================================================")
+    print("                 Summary of Results               ")
+    print("==================================================")
+    print(f"Global Low-Level E (ML-PBE) : {mf_outer.e_tot:.8f} Hartree")
+    print(f"High-Level Local E (B3LYP)  : {emb_obj.e_inner[0]:.8f} Hartree")
+    print(f"Low-Level Local E (PBE)     : {emb_obj.e_inner[0] - emb_obj.e_tot + mf_outer.e_tot:.8f} Hartree") # Reverse engineered for display
+    print(f"--------------------------------------------------")
+    print(f"FINAL ONIOM TOTAL ENERGY    : {e_tot:.8f} Hartree")
+    print("==================================================")
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/gpu4pyscf/qmmm/embedding/__init__.py b/gpu4pyscf/qmmm/embedding/__init__.py
new file mode 100644
index 000000000..6884d9f8e
--- /dev/null
+++ b/gpu4pyscf/qmmm/embedding/__init__.py
@@ -0,0 +1,17 @@
+# Copyright 2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from .embedding import DMET
+from .embedding_dft import SingleFragmentEmbedding
diff --git a/gpu4pyscf/qmmm/embedding/embedding.py b/gpu4pyscf/qmmm/embedding/embedding.py
new file mode 100644
index 000000000..d3ff8385c
--- /dev/null
+++ b/gpu4pyscf/qmmm/embedding/embedding.py
@@ -0,0 +1,560 @@
+# Copyright 2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import copy
+import numpy as np
+import cupy as cp
+from pyscf import gto
+from pyscf import lib
+from gpu4pyscf.lib import logger
+from gpu4pyscf.lib.cupy_helper import tag_array
+import pyscf.ao2mo
+
+
+def _as_cupy(x):
+    if isinstance(x, cp.ndarray):
+        return x
+    return cp.asarray(x)
+
+
+def lowdin_orth(s):
+    s = _as_cupy(s)
+    s = 0.5 * (s + s.T)
+    eigvals, eigvecs = cp.linalg.eigh(s)
+    keep = eigvals > 1e-12
+    if not cp.all(keep):
+        eigvals = eigvals[keep]
+        eigvecs = eigvecs[:, keep]
+    inv_sqrt = 1.0 / cp.sqrt(eigvals)
+    sqrt = cp.sqrt(eigvals)
+    X = (eigvecs * inv_sqrt) @ eigvecs.T # S^{-1/2}
+    X_inv = (eigvecs * sqrt) @ eigvecs.T # S^{+1/2}
+    return X, X_inv
+
+
+def get_fragment_ao_indices(mol, frag_atoms):
+    aoslice = mol.aoslice_by_atom()
+    indices = []
+    for ia in frag_atoms:
+        ia = int(ia)
+        if ia < 0 or ia >= mol.natm:
+            raise ValueError(f"Atom index {ia} is out of range [0, {mol.natm}).")
+        p0, p1 = int(aoslice[ia, 2]), int(aoslice[ia, 3])
+        indices.extend(range(p0, p1))
+    indices = cp.asarray(sorted(indices), dtype=cp.int32)
+    if indices.size == 0:
+        raise ValueError("Fragment is empty: no atomic orbitals were selected.")
+    return indices
+
+
+def schmidt_decompose(mo_coeff_oao, mo_occ, frag_idx, env_idx, threshold=1e-5):
+    mo_coeff_oao = _as_cupy(mo_coeff_oao)
+    mo_occ = _as_cupy(mo_occ)
+    env_idx = _as_cupy(env_idx)
+    frag_idx = _as_cupy(frag_idx)
+    
+    occ_mask = mo_occ > 1e-8
+    C_occ = mo_coeff_oao[:, occ_mask]
+    
+    if env_idx.size == 0 or C_occ.shape[1] == 0:
+        s_dummy = cp.ones(C_occ.shape[1]) if env_idx.size == 0 else cp.zeros(0)
+        return (cp.zeros((0, 0)), cp.zeros((0, 0)), 
+                {'n_core_electrons': 0, 'singular_values': s_dummy})
+        
+    C_A = C_occ[frag_idx, :]
+    
+    U, S, Vh = cp.linalg.svd(C_A, full_matrices=True)
+    
+    C_rot = C_occ @ Vh.T
+    
+    # Broadly select all potential bath orbitals (including pure fragment ones S ~ 1.0)
+    is_bath_candidate = (S > threshold) #& (S < 1.0 - threshold)
+    is_core_small = S <= threshold
+    n_sv = len(S)
+    
+    # Extract the environment part for these candidates
+    raw_bath_orb = C_rot[env_idx, :n_sv][:, is_bath_candidate]
+    
+    # Calculate their true physical norms in the environment space
+    norms = cp.linalg.norm(raw_bath_orb, axis=0)
+    
+    # Keep only those with a mathematically meaningful environment tail.
+    # This automatically drops pure fragment orbitals (norm ~ 0) preventing null vectors,
+    # while safely preserving orbitals with legitimate tiny tails (like in STO-3G).
+    valid_mask = norms > 1e-10
+    
+    # Apply the mask to both the orbitals and their norms
+    bath_orb = raw_bath_orb[:, valid_mask]
+    valid_norms = norms[valid_mask]
+    
+    # Safely normalize the surviving valid bath orbitals
+    bath_orb = bath_orb / valid_norms
+    
+    # Pure environment core orbitals come from null space + small singular values
+    core_orb_small = C_rot[env_idx, :n_sv][:, is_core_small]
+    core_orb_null = C_rot[env_idx, n_sv:]
+    core_orb = cp.hstack([core_orb_small, core_orb_null])
+    
+    info = {
+        'n_core_electrons': 2 * core_orb.shape[1],
+        'singular_values': S
+    }
+    return bath_orb, core_orb, info
+
+
+def build_embedding_basis(nao, frag_idx, env_idx, bath_orb):
+    """
+    Construct the AO -> embedded transformation matrix B^{mu}_{k}
+    """
+    # Due to the Carlson-Keller theorem, the lowdin OAO basis 
+    # and the AO basis is 1-to-1 match.
+    # Therefore, we can use the fragment indices to construct the embedding matrix.
+    frag_idx = _as_cupy(frag_idx)
+    env_idx = _as_cupy(env_idx)
+    n_frag = frag_idx.size
+    n_bath = bath_orb.shape[1] if bath_orb.size else 0
+
+    B = cp.zeros((nao, n_frag + n_bath), dtype=float)
+    B[frag_idx, cp.arange(n_frag)] = 1.0
+    if n_bath > 0:
+        B[env_idx[:, None], cp.arange(n_bath)[None, :] + n_frag] = bath_orb
+    return B
+
+
+def build_core_dm(env_idx, core_orb, nao):
+    env_idx = _as_cupy(env_idx)
+    if core_orb.size == 0:
+        return cp.zeros((nao, nao), dtype=float)
+    C_core = cp.zeros((nao, core_orb.shape[1]), dtype=float)
+    C_core[env_idx, :] = core_orb
+    return 2.0 * (C_core @ C_core.T)
+
+
+def transform_h1(h_ao, B):
+    """
+    Project a 1-electron operator from the full AO basis to the embedded basis.
+    """
+    return B.T @ h_ao @ B
+
+
+def _build_embedded_mole(nemb, n_emb_electrons, spin=0, verbose=0, max_memory=4000):
+    if n_emb_electrons < 0 or n_emb_electrons > 2 * nemb:
+        raise ValueError(f"Invalid embedded electron count: {n_emb_electrons}")
+
+    mol = gto.Mole()
+    mol.verbose = verbose
+    mol.max_memory = max_memory
+    mol.atom = []
+    mol.basis = {}
+    mol.unit = 'Bohr'
+    mol.spin = spin
+    mol.nelectron = int(n_emb_electrons)
+    mol.charge = 0
+    mol.build(parse_arg=False, dump_input=False)
+
+    nemb_int = int(nemb)
+    def _nao_nr(self=mol, _n=nemb_int):
+        return _n
+
+    mol.nao_nr = _nao_nr
+    mol.nao = nemb_int
+    return mol
+
+
+def _instantiate_inner_mf(mf_template, embedded_mol):
+    cls = type(mf_template)
+    try:
+        new_mf = cls(embedded_mol)
+    except TypeError:
+        new_mf = copy.copy(mf_template)
+        new_mf.mol = embedded_mol
+        new_mf.mo_coeff = None
+        new_mf.mo_energy = None
+        new_mf.mo_occ = None
+        new_mf.converged = False
+
+    for attr in ('xc', 'conv_tol', 'conv_tol_grad', 'max_cycle',
+                 'level_shift', 'damp', 'diis', 'verbose'):
+        if hasattr(mf_template, attr):
+            try:
+                setattr(new_mf, attr, getattr(mf_template, attr))
+            except Exception:
+                pass
+
+    if hasattr(mf_template, 'grids') and hasattr(new_mf, 'grids'):
+        for g_attr in ('level', 'prune', 'atom_grid'):
+            if hasattr(mf_template.grids, g_attr):
+                try:
+                    setattr(new_mf.grids, g_attr, getattr(mf_template.grids, g_attr))
+                except Exception:
+                    pass
+
+    return new_mf
+
+
+class DMET(lib.StreamObject):
+    """
+    Density Matrix Embedding Theory driver with macroscopic iteration.
+
+    Parameters
+    ----------
+    mf_outer : SCF object
+        Low-level mean-field on the full system.
+    mf_inner : SCF/post-HF object
+        High-level mean-field or post-HF template applied to the embedded cluster.
+    fragments : list of lists of int
+        List of fragments, where each fragment is a list of atom indices.
+    threshold : float
+        Eigenvalue cutoff used to classify environment orbitals.
+    max_macro_iter : int
+        Maximum number of macroscopic iterations for correlation potential (u).
+    macro_tol : float
+        Convergence tolerance for the difference in fragment 1-RDMs.
+    """
+
+    def __init__(self, mf_outer, mf_inner, fragments,
+                 threshold=1e-5, max_macro_iter=20, macro_tol=1e-4, 
+                 verbose=None):
+        if mf_outer is None or mf_inner is None:
+            raise ValueError("mf_outer and mf_inner are both required.")
+        if not fragments:
+            raise ValueError("Provide a list of fragments to define the DMET regions.")
+        
+        if verbose is None:
+            verbose = mf_outer.verbose
+        else:
+            verbose = int(verbose)
+        self.log = logger.new_logger(mf_outer, verbose)
+        self.mf_outer = mf_outer.copy()
+        self.mf_inner_template = mf_inner.copy()
+        self.full_mol = mf_outer.mol
+        self.threshold = float(threshold)
+        self.max_macro_iter = max_macro_iter
+        self.macro_tol = macro_tol
+
+        self.fragments = [list(int(a) for a in frag) for frag in fragments]
+        self.nfrags = len(self.fragments)
+        
+        nao = int(self.full_mol.nao_nr())
+        all_idx = cp.arange(nao, dtype=cp.int32)
+        
+        self.frag_idx = []
+        self.env_idx = []
+        for frag_atoms in self.fragments:
+            f_idx = get_fragment_ao_indices(self.full_mol, frag_atoms)
+            self.frag_idx.append(f_idx)
+            env_mask = cp.ones(nao, dtype=bool)
+            env_mask[f_idx] = False
+            self.env_idx.append(all_idx[env_mask])
+
+        self.bath_orb = [None] * self.nfrags
+        self.core_orb = [None] * self.nfrags
+        self.eig_info = [None] * self.nfrags
+        self.B_oao = [None] * self.nfrags
+        self.B = [None] * self.nfrags
+        self.dm_core = [None] * self.nfrags
+        self.v_core_ao = [None] * self.nfrags
+        self.h_emb = [None] * self.nfrags
+        self.e_core = [None] * self.nfrags
+        self.mf_inner = [None] * self.nfrags
+        self.dm_emb_init = [None] * self.nfrags
+        self.e_inner = [None] * self.nfrags
+        self.e_tot = None            
+        self.u_oao = cp.zeros((nao, nao))  # Global correlation potential
+
+    def build_bath(self, ifrag, mo_coeff, mo_occ, X_inv, X):
+        """
+        Run the Schmidt decomposition for a specific fragment.
+        """
+        mo_coeff_oao = X_inv @ _as_cupy(mo_coeff)
+        bath_orb, core_orb, info = schmidt_decompose(
+            mo_coeff_oao, mo_occ, self.frag_idx[ifrag], self.env_idx[ifrag], self.threshold)
+
+        nao_oao = X.shape[1]
+        B_oao = build_embedding_basis(nao_oao, self.frag_idx[ifrag], self.env_idx[ifrag], bath_orb)
+        B_ao = X @ B_oao
+
+        if core_orb.size > 0:
+            C_core_oao = cp.zeros((nao_oao, core_orb.shape[1]), dtype=float)
+            C_core_oao[self.env_idx[ifrag], :] = core_orb
+            C_core_ao = X @ C_core_oao
+            dm_core_ao = 2.0 * (C_core_ao @ C_core_ao.T)
+        else:
+            dm_core_ao = cp.zeros((X.shape[0], X.shape[0]), dtype=float)
+
+        self.bath_orb[ifrag] = bath_orb
+        self.core_orb[ifrag] = core_orb
+        self.eig_info[ifrag] = info
+        self.B_oao[ifrag] = B_oao        
+        self.B[ifrag] = B_ao             
+        self.dm_core[ifrag] = dm_core_ao
+
+        n_frag = int(self.frag_idx[ifrag].size)
+        n_bath = int(bath_orb.shape[1] if bath_orb.size else 0)
+        n_core = int(core_orb.shape[1] if core_orb.size else 0)
+
+        self.log.info(f"Fragment {ifrag} Schmidt decomposition singular values:")
+        self.log.info(f"    {info['singular_values']}")
+        
+        self.log.info(f"Fragment {ifrag} embedding basis partition:")
+        self.log.info(f"    Number of Fragment AOs : {n_frag}")
+        self.log.info(f"    Number of Bath Orbitals: {n_bath}")
+        self.log.info(f"    Number of Core Orbitals: {n_core} ({info['n_core_electrons']} electrons)")
+        self.log.info(f"    Total Embedded Space   : {n_frag + n_bath} / {nao_oao} (full AO)")
+
+        return self
+
+    def build_embedded_hamiltonian(self, ifrag, hcore_orig):
+        """
+        Construct h^A in the embedded basis A.
+        Uses bare hcore_orig (without the correlation potential 'u').
+        """
+        mol = self.full_mol
+        h_ao = _as_cupy(hcore_orig)
+
+        if self.eig_info[ifrag]['n_core_electrons'] > 0:
+            v_core_ao = _as_cupy(self.mf_outer.get_veff(mol, self.dm_core[ifrag]))
+        else:
+            v_core_ao = cp.zeros_like(h_ao)
+            
+        self.v_core_ao[ifrag] = v_core_ao
+
+        h_emb = transform_h1(h_ao + v_core_ao, self.B[ifrag])
+
+        if self.eig_info[ifrag]['n_core_electrons'] > 0:
+            e_core = (cp.einsum('ij,ji->', self.dm_core[ifrag], h_ao)
+                      + 0.5 * cp.einsum('ij,ji->', self.dm_core[ifrag], v_core_ao))
+        else:
+            e_core = 0.0
+
+        self.h_emb[ifrag] = h_emb # embeding basis
+        self.e_core[ifrag] = float(e_core)
+        return self
+
+    def _build_inner_mf(self, ifrag, dm_full_ao):
+        nemb = self.B[ifrag].shape[1]
+        n_total_electrons = int(self.full_mol.nelectron)
+        n_emb_electrons = n_total_electrons - int(self.eig_info[ifrag]['n_core_electrons'])
+
+        emb_mol = _build_embedded_mole(
+            nemb=nemb,
+            n_emb_electrons=n_emb_electrons,
+            spin=int(getattr(self.full_mol, 'spin', 0)),
+            verbose=0,
+            max_memory=int(getattr(self.full_mol, 'max_memory', 4000)),
+        )
+
+        mf_inner = _instantiate_inner_mf(self.mf_inner_template, emb_mol)
+
+        h_emb = self.h_emb[ifrag]
+        ovlp = cp.eye(nemb)
+
+        e_nuc = float(self.full_mol.energy_nuc())
+        mf_inner.get_hcore = lambda *args, **kwargs: h_emb
+        mf_inner.get_ovlp = lambda *args, **kwargs: ovlp
+        mf_inner.energy_nuc = lambda *args, **kwargs: e_nuc + self.e_core[ifrag]
+
+        B_mat = self.B[ifrag]
+        v_core_ao = self.v_core_ao[ifrag]
+
+        # Overwrite get_veff to compute on-the-fly using the outer HF
+        def _get_veff(mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
+            if dm is None:
+                dm = mf_inner.make_rdm1()
+            dm_cp = _as_cupy(dm)
+            
+            # Project embedded dm to full AO basis
+            dm_ao = B_mat @ dm_cp @ B_mat.T
+                
+            dm_full_ao_inner = self.dm_core[ifrag] + dm_ao
+            
+            # For pure HF, this may be redundant, cause J, K are linear,
+            # but this is also used for DFT based method, so, we use the delta method.
+            v_eff_full = self.mf_inner_template.get_veff(self.full_mol, dm_full_ao_inner, hermi=hermi)
+            v_eff_active = _as_cupy(v_eff_full) - v_core_ao
+            
+            # Project Veff back to embedded basis
+            if dm_cp.ndim == 2:
+                v_eff_emb = B_mat.T @ v_eff_active @ B_mat
+            else:
+                v_eff_emb = cp.einsum('pi,xpq,qj->xij', B_mat, v_eff_active, B_mat)
+
+            ecoul = getattr(v_eff_full, 'ecoul', 0.0)
+            exc = getattr(v_eff_full, 'exc', 0.0)
+            vj_full = getattr(v_eff_full, 'vj', None)
+            if vj_full is not None:
+                vj_emb = B_mat.T @ _as_cupy(vj_full) @ B_mat
+            else:
+                vj_emb = cp.zeros_like(v_eff_emb)
+                
+            vk_full = getattr(v_eff_full, 'vk', None)
+            if vk_full is not None:
+                vk_emb = B_mat.T @ _as_cupy(vk_full) @ B_mat
+            else:
+                vk_emb = cp.zeros_like(v_eff_emb)
+            
+            v_eff_emb = tag_array(v_eff_emb, ecoul=ecoul, exc=exc, vj=vj_emb, vk=vk_emb)
+            
+            return v_eff_emb
+
+        mf_inner.get_veff = _get_veff
+
+        # using s to make the upper index to the lower index
+        s_ao = _as_cupy(self.mf_outer.get_ovlp())
+        sB = s_ao @ self.B[ifrag]
+        # Due to ths BSC_core = 0, this the following is equivelent to dm_full_ao - dm_core_ao
+        dm_emb_init = sB.T @ dm_full_ao @ sB
+        
+        trace = float(cp.trace(dm_emb_init))
+        if trace > 0:
+            dm_emb_init = dm_emb_init * (n_emb_electrons / trace)
+        self.dm_emb_init[ifrag] = dm_emb_init
+
+        self.mf_inner[ifrag] = mf_inner
+        return mf_inner
+
+    def solve_embedded(self, ifrag):
+        e_inner = self.mf_inner[ifrag].kernel(dm0=self.dm_emb_init[ifrag])
+        if isinstance(e_inner, tuple):
+            e_inner = float(self.mf_inner[ifrag].e_tot)
+        else:
+            e_inner = float(e_inner)
+        self.e_inner[ifrag] = e_inner
+        return e_inner
+
+    def kernel(self):
+        orig_outer_get_hcore = self.mf_outer.get_hcore
+        hcore_orig = _as_cupy(self.mf_outer.get_hcore())
+        s_ao = _as_cupy(self.mf_outer.get_ovlp())
+        X, X_inv = lowdin_orth(s_ao)
+
+        for macro_iter in range(self.max_macro_iter):
+            self.log.info(f"Macro Iter {macro_iter}")
+            u_ao = X_inv @ self.u_oao @ X_inv
+
+            # Run low-level SCF with current correlation potential 'u'
+            self.mf_outer.get_hcore = lambda *args, **kwargs: cp.asnumpy(hcore_orig + u_ao)
+            self.mf_outer.mo_coeff = None # Force re-run
+            self.mf_outer.kernel()
+            
+            mo_coeff = _as_cupy(self.mf_outer.mo_coeff)
+            mo_occ = _as_cupy(self.mf_outer.mo_occ)
+            dm_full_ao = _as_cupy(self.mf_outer.make_rdm1())
+            
+            e_tot = 0.0
+            dm_inners = []
+
+            for ifrag in range(self.nfrags):
+                self.build_bath(ifrag, mo_coeff, mo_occ, X_inv, X)
+                self.build_embedded_hamiltonian(ifrag, hcore_orig)
+                mf_inner = self._build_inner_mf(ifrag, dm_full_ao)
+                self.solve_embedded(ifrag)
+                if not self.mf_inner[ifrag].converged:
+                    raise RuntimeError(
+                        f"Embedded high-level SCF did not converge for fragment {ifrag}; "
+                        "do not use this density for delta energy."
+                    )
+
+                dm_emb = _as_cupy(mf_inner.make_rdm1())
+                
+                # Transform inner DM back to full AO basis
+                B = self.B[ifrag]
+                dm_inner_active_ao = B @ dm_emb @ B.T
+                
+                dm_inner_full_ao = self.dm_core[ifrag] + dm_inner_active_ao
+                dm_inners.append(dm_inner_full_ao)
+
+                dm1_emb = dm_emb
+                n_frag = self.frag_idx[ifrag].size
+                
+                # Outer (Low-level) environment embedding
+                v_core_ao = self.v_core_ao[ifrag]
+                v_core_emb = B.T @ v_core_ao @ B
+                
+                # Apply 0.5 factor to core potential to avoid double counting across fragments
+                # The 0.5 factor should be removed for ONIOM energy of just 1 fragment.
+                h_eval = self.h_emb[ifrag] - 0.5 * v_core_emb
+                
+                is_mean_field = hasattr(self.mf_inner_template, 'get_veff')
+
+                e_frag_elec = cp.sum(dm1_emb[:n_frag, :] * h_eval[:n_frag, :])
+                if not is_mean_field:
+                    raise NotImplementedError("Non-mean-field solver not implemented, needs thorough testing...")
+                    self.log.info("using non-mean-field solver")
+                    nemb = B.shape[1]
+                    # TODO: this can be replaced by a more efficient routine
+                    B_cpu = cp.asnumpy(B)
+                    eri_emb_cpu = pyscf.ao2mo.kernel(self.full_mol, B_cpu)
+                    eri_emb_cpu = pyscf.ao2mo.restore(1, eri_emb_cpu, nemb) # Restore to 4D array
+                    eri_emb = _as_cupy(eri_emb_cpu)
+                    
+                    if hasattr(mf_inner, 'make_rdm2'):
+                        dm2_emb = _as_cupy(mf_inner.make_rdm2())
+                    else:
+                        # Fallback using the HF 2-RDM formulation for post-HF methods lacking make_rdm2
+                        dm2_emb = (cp.einsum('ij,kl->ijkl', dm1_emb, dm1_emb) 
+                                   - 0.5 * cp.einsum('il,jk->ijkl', dm1_emb, dm1_emb))
+                    
+                    e_frag_elec += 0.5 * cp.sum(dm2_emb[:n_frag, :, :, :] * eri_emb[:n_frag, :, :, :])
+                else:
+                    self.log.info("using mean-field solver")
+                    v_eff_emb = mf_inner.get_veff(dm=dm1_emb)
+                    e_frag_elec += 0.5 * cp.sum(dm1_emb[:n_frag, :] * _as_cupy(v_eff_emb)[:n_frag, :])
+                
+                e_frag_nuc = 0.0
+                coords = self.full_mol.atom_coords()
+                charges = self.full_mol.atom_charges()
+                frag_atoms = self.fragments[ifrag]
+                for i in frag_atoms:
+                    for j in range(self.full_mol.natm):
+                        if i == j: continue
+                        r = np.linalg.norm(coords[i] - coords[j])
+                        e_frag_nuc += 0.5 * charges[i] * charges[j] / r
+                        
+                self.log.info(f"Fragment {ifrag} Electronic Energy: {float(e_frag_elec):.8f} | Nuclear Energy: {e_frag_nuc:.8f}")
+                e_tot += float(e_frag_elec) + e_frag_nuc
+
+            dm_low_oao = X_inv @ dm_full_ao @ X_inv
+            
+            error = 0.0
+            for ifrag in range(self.nfrags):
+                idx = self.frag_idx[ifrag]
+                idx_mesh = cp.ix_(idx, idx)
+                
+                dm_high_oao = X_inv @ dm_inners[ifrag] @ X_inv
+                
+                diff = dm_high_oao[idx_mesh] - dm_low_oao[idx_mesh]
+                error += float(cp.linalg.norm(diff))
+                
+                # Simple gradient descent step
+                # TODO: 0.5 is a hyperparameter. If it oscillates, reduce it (e.g. to 0.1).
+                self.u_oao[idx_mesh] -= 0.5 * diff
+            
+            self.log.note(f"Macro Iter {macro_iter + 1:2d} | E_DMET = {e_tot:.8f} | max(dD) = {error:.6e}")
+            self.e_tot = e_tot
+            if error < self.macro_tol:
+                self.log.note("DMET macroscopic iterations converged.")
+                break
+        
+        # Restore outer mean-field to its original unpolluted state
+        self.mf_outer.get_hcore = orig_outer_get_hcore
+        self.mf_outer.mo_coeff = None
+        self.mf_outer.mo_energy = None
+        self.mf_outer.mo_occ = None
+
+        return self.e_tot
+
+    def __call__(self):
+        return self.kernel()
\ No newline at end of file
diff --git a/gpu4pyscf/qmmm/embedding/embedding_dft.py b/gpu4pyscf/qmmm/embedding/embedding_dft.py
new file mode 100644
index 000000000..b405598a9
--- /dev/null
+++ b/gpu4pyscf/qmmm/embedding/embedding_dft.py
@@ -0,0 +1,162 @@
+# Copyright 2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import cupy as cp
+import numpy as np
+import pyscf.ao2mo
+from gpu4pyscf.lib.cupy_helper import tag_array
+from gpu4pyscf.qmmm.embedding.embedding import DMET, lowdin_orth, _as_cupy
+
+
+class SingleFragmentEmbedding(DMET):
+    """
+    Single-Fragment ONIOM-like embedding for DFT.
+    
+    This class performs a single-shot,
+    single-fragment delta-method energy evaluation WITHOUT macroscopic iterations.
+    """
+    
+    def __init__(self, mf_outer, mf_inner, fragment, threshold=1e-5, verbose=None):
+        """
+        Parameters
+       -------
+        mf_outer : SCF object
+            Low-level mean-field on the full system (e.g., PBE).
+        mf_inner : SCF/DFT/post-HF object
+            High-level template applied to the embedded cluster (e.g., B3LYP).
+        fragment : list of int
+            A single list of atom indices defining the QM region.
+        threshold : float
+            Eigenvalue cutoff used to classify environment orbitals.
+        """
+        fragments = [fragment]
+        
+        super().__init__(mf_outer, mf_inner, fragments,
+                         threshold=threshold, max_macro_iter=1, verbose=verbose)
+        
+        self.fragment = self.fragments[0]
+    
+    def _evaluate_embedded_energy(self, mf_obj, dm_emb, h_eval_bare, B, dm_core):
+        e_h_active = float(cp.sum(dm_emb * h_eval_bare))
+        
+        dm_full_ao = dm_core + B @ dm_emb @ B.T
+        
+        v_eff_full = mf_obj.get_veff(self.full_mol, dm_full_ao)
+        e_2e_full = float(getattr(v_eff_full, 'ecoul', 0.0) + getattr(v_eff_full, 'exc', 0.0))
+        
+        hcore_orig = _as_cupy(self.mf_outer.get_hcore())
+        e_1e_core = float(cp.sum(dm_core * hcore_orig))
+        
+        e_nuc = float(self.full_mol.energy_nuc())
+        return e_nuc + e_1e_core + e_h_active + e_2e_full
+
+    def kernel(self):
+        if not self.mf_outer.converged:
+            self.mf_outer.kernel()
+            
+        e_global_low = self.mf_outer.e_tot
+        mo_coeff = _as_cupy(self.mf_outer.mo_coeff)
+        mo_occ = _as_cupy(self.mf_outer.mo_occ)
+        dm_full_ao_low = _as_cupy(self.mf_outer.make_rdm1())
+        
+        hcore_orig = _as_cupy(self.mf_outer.get_hcore())
+        s_ao = _as_cupy(self.mf_outer.get_ovlp())
+        X, X_inv = lowdin_orth(s_ao)
+
+        ifrag = 0
+        
+        self.build_bath(ifrag, mo_coeff, mo_occ, X_inv, X)
+        self.build_embedded_hamiltonian(ifrag, hcore_orig)
+        
+        # Build and Run Inner embedded solver
+        mf_inner = self._build_inner_mf(ifrag, dm_full_ao_low)
+        
+        B_mat = self.B[ifrag]
+        dm_core_mat = self.dm_core[ifrag]
+        h_eval_bare_mat = B_mat.T @ hcore_orig @ B_mat
+
+        # Add the missing core 1-electron energy (kinetic + nuclear attraction from the frozen core)
+        e1_core = float(cp.sum(dm_core_mat * hcore_orig))
+        
+        # Precompute the frozen core's 2-electron energy (constant during inner SCF)
+        v_eff_core_high = self.mf_inner_template.get_veff(self.full_mol, dm_core_mat)
+        e_coul_core = float(getattr(v_eff_core_high, 'ecoul', 0.0))
+        e_xc_core = float(getattr(v_eff_core_high, 'exc', 0.0))
+        
+        e_nuc_full = float(self.full_mol.energy_nuc())
+        mf_inner.energy_nuc = lambda *args, **kwargs: e_nuc_full
+        
+        # Override energy_elec to print the true ONIOM energy difference
+        def custom_energy_elec(dm=None, h1e=None, vhf=None):
+            if dm is None: dm = mf_inner.make_rdm1()
+            if vhf is None: vhf = mf_inner.get_veff(mf_inner.mol, dm)
+            
+            dm_cp = _as_cupy(dm)
+            
+            # e1: Active space single-electron energy + Core single-electron energy
+            e1_active = float(cp.sum(dm_cp * h_eval_bare_mat))
+            e1 = e1_active + e1_core
+            
+            # e2: Full system 2e energy minus core 2e energy
+            ecoul_full = float(getattr(vhf, 'ecoul', 0.0))
+            exc_full = float(getattr(vhf, 'exc', 0.0))
+            e2 = ecoul_full + exc_full
+            
+            # Update scf_summary for meaningful PySCF debugging output
+            mf_inner.scf_summary['e1'] = e1
+            mf_inner.scf_summary['coul'] = ecoul_full - e_coul_core
+            mf_inner.scf_summary['exc'] = exc_full - e_xc_core
+            
+            return e1 + e2, e2
+            
+        mf_inner.energy_elec = custom_energy_elec
+        
+        self.log.info("Running high-level inner solver...")
+        self.solve_embedded(ifrag)
+        if not self.mf_inner[ifrag].converged:
+            raise RuntimeError(
+                f"Embedded high-level SCF did not converge for fragment {ifrag}; "
+                "do not use this density for delta energy."
+            )
+        
+        dm_emb_high = _as_cupy(mf_inner.make_rdm1())
+        dm_emb_low = self.dm_emb_init[ifrag]
+        
+        B = self.B[ifrag]
+        dm_core = self.dm_core[ifrag]
+        is_mean_field = hasattr(self.mf_inner_template, 'get_veff')
+        
+        if is_mean_field:
+            h_eval_bare = B.T @ hcore_orig @ B
+            
+            # Evaluate High-Level energy
+            e_high = self._evaluate_embedded_energy(
+                self.mf_inner_template, dm_emb_high, h_eval_bare, B, dm_core
+            )
+            
+            # Evaluate Low-Level energy
+            e_low = self._evaluate_embedded_energy(
+                self.mf_outer, dm_emb_low, h_eval_bare, B, dm_core
+            )
+        else:
+            raise NotImplementedError("WFT evaluation is not implemented for this class.")
+        
+        delta_e = float(e_high - e_low)
+        self.log.note(f"Global Low-Level E : {e_global_low:.8f}")
+        self.log.note(f"Active Space dE    : {delta_e:.8f}")
+        
+        self.e_tot = e_global_low + delta_e
+        self.log.note(f"Total Embedded E   : {self.e_tot:.8f}")
+        
+        return self.e_tot
\ No newline at end of file
diff --git a/gpu4pyscf/qmmm/embedding/embedding_dft_harris.py b/gpu4pyscf/qmmm/embedding/embedding_dft_harris.py
new file mode 100644
index 000000000..2c8db5586
--- /dev/null
+++ b/gpu4pyscf/qmmm/embedding/embedding_dft_harris.py
@@ -0,0 +1,255 @@
+# Copyright 2021-2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import cupy as cp
+from pyscf import lib
+from gpu4pyscf.dft import rks
+from gpu4pyscf.lib.cupy_helper import tag_array
+from gpu4pyscf.qmmm.embedding.embedding import DMET, lowdin_orth, _as_cupy
+from gpu4pyscf.qmmm.embedding.embedding_dft import SingleFragmentEmbedding
+
+
+class HarrisRKS(rks.RKS):
+    """
+    Harris RKS class based on machine learning (ML) predicted density.
+    
+    This class bypasses traditional SCF iterations. Instead, it relies entirely 
+    on an external ML density evaluation function to construct the global effective 
+    potential and calculate the double counting energy.
+    """
+    def __init__(self, mol, eval_density_func, xc='LDA,VWN'):
+        super().__init__(mol)
+        self.xc = xc
+        self.max_cycle = 1  
+        
+        # eval_density_func is the external ML interface.
+        # Signature: def func(mol, xc, grids)
+        # Returns 7 elements:
+        #   1. vj: Coulomb potential matrix (AO basis)
+        #   2. vk: Exact exchange potential matrix (AO basis, can be None for pure DFT)
+        #   3. vxc: Exchange-correlation potential matrix (AO basis)
+        #   4. e_j: Coulomb energy (scalar)
+        #   5. e_k: Exact exchange energy (scalar, can be 0.0 for pure DFT)
+        #   6. e_xc: Exchange-correlation energy (scalar)
+        #   7. int_rho_vxc: Integral of rho * V_xc (scalar)
+        self.eval_density_func = eval_density_func
+        
+        self._v_eff_global = None
+        self._e_dc_global = None
+        self._use_harris_veff = False
+
+    def _get_harris_veff(self, mol=None):
+        if mol is None: 
+            mol = self.mol
+        
+        if self._v_eff_global is not None:
+            return self._v_eff_global
+            
+        if self.grids.coords is None:
+            self.grids.build()
+            
+        vj, vk, vxc, e_j, e_k, e_xc, int_rho_vxc = self.eval_density_func(
+            mol, self.xc, self.grids)
+        
+        v_eff_ao = _as_cupy(vj) + _as_cupy(vxc)
+        if vk is not None:
+            v_eff_ao -= _as_cupy(vk)
+            e_k = float(e_k)
+        else:
+            e_k = 0.0
+            
+        # double counting energy
+        e_dc = float(e_j) - e_k + float(int_rho_vxc) - float(e_xc)
+        
+        vk_array = _as_cupy(vk) if vk is not None else cp.zeros_like(v_eff_ao)
+        v_eff_ao = tag_array(v_eff_ao, ecoul=float(e_j) - e_k, exc=float(e_xc), vj=_as_cupy(vj), vk=vk_array)
+        
+        self._v_eff_global = v_eff_ao
+        self._e_dc_global = e_dc
+        return self._v_eff_global
+
+    def get_veff(self, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
+        # Use ML evaluation ONLY during the global SCF step.
+        # For standard embedding steps, fallback to the native exact DFT evaluation.
+        if getattr(self, '_use_harris_veff', False):
+            return self._get_harris_veff(mol)
+        return rks.RKS.get_veff(self, mol, dm, dm_last, vhf_last, hermi)
+
+    def kernel(self, dm0=None, **kwargs):
+
+        if self.max_cycle != 1:
+            lib.logger.warn(self, "HarrisRKS is a non-iterative method. "
+                                  f"Overriding max_cycle from {self.max_cycle} to 1.")
+            self.max_cycle = 1
+
+        # Temporarily enable Harris ML potential for the global 1-step evaluation
+        self._use_harris_veff = True
+        try:
+            e_tot = rks.RKS.kernel(self, dm0=dm0, **kwargs)
+        finally:
+            self._use_harris_veff = False
+            
+        self.converged = True
+        return e_tot
+
+    def energy_elec(self, dm=None, h1e=None, vhf=None):
+        """
+        E_elec = Tr[D * (h + Veff)] - E_DC
+        """
+        if getattr(self, '_use_harris_veff', False):
+            if dm is None: dm = self.make_rdm1()
+            if h1e is None: h1e = self.get_hcore()
+            if vhf is None: vhf = self._get_harris_veff(self.mol)
+            
+            dm_cp = _as_cupy(dm)
+            h1e_cp = _as_cupy(h1e)
+            vhf_cp = _as_cupy(vhf)
+            
+            fock = h1e_cp + vhf_cp
+            e_band = float(cp.sum(dm_cp * fock))
+            
+            e_elec = e_band - self._e_dc_global
+            return e_elec, self._e_dc_global
+        else:
+            # Fallback to standard energy evaluation during embedding steps
+            return rks.RKS.energy_elec(self, dm, h1e, vhf)
+
+
+class SingleFragmentEmbedding_ML(SingleFragmentEmbedding):
+    """
+    Single-Fragment ONIOM-like embedding utilizing ML density for the global low-level.
+    
+    This class performs DMET bond-breaking via SVD, and evaluates the local embedded 
+    energies using rigorous standard SCF evaluations to guarantee exact error cancellation 
+    between the high-level and low-level local calculations.
+    """
+    def __init__(self, mf_outer, mf_inner, fragment, threshold=1e-5, verbose=None):
+        """
+        Parameters
+        ----------
+        mf_outer : HarrisRKS object
+            The global low-level solver driven by ML density.
+        mf_inner : SCF/DFT/post-HF object
+            The high-level solver applied to the embedded fragment+bath cluster.
+        fragment : list of int
+            List of atom indices defining the core QM region.
+        threshold : float
+            Eigenvalue cutoff for the Schmidt decomposition to classify bath orbitals.
+        """
+        super().__init__(mf_outer, mf_inner, fragment,
+                         threshold=threshold, verbose=verbose)
+        self.fragment = self.fragments[0]
+
+    def kernel(self):
+
+        if not self.mf_outer.converged:
+            self.mf_outer.kernel()
+            
+        e_global_low = self.mf_outer.e_tot
+        self.log.note(f"Global Low-Level E (Harris) = {e_global_low:.8f}")
+        
+        mo_coeff = _as_cupy(self.mf_outer.mo_coeff)
+        mo_occ = _as_cupy(self.mf_outer.mo_occ)
+        dm_full_ao_low = _as_cupy(self.mf_outer.make_rdm1())
+        hcore_orig = _as_cupy(self.mf_outer.get_hcore())
+        s_ao = _as_cupy(self.mf_outer.get_ovlp())
+        X, X_inv = lowdin_orth(s_ao)
+
+        ifrag = 0
+
+        self.build_bath(ifrag, mo_coeff, mo_occ, X_inv, X)
+        self.build_embedded_hamiltonian(ifrag, hcore_orig)
+        
+        self.log.info("Running high-level inner SCF in embedding space...")
+        mf_inner = self._build_inner_mf(ifrag, dm_full_ao_low)
+        
+        B_mat = self.B[ifrag]
+        dm_core_mat = self.dm_core[ifrag]
+        h_eval_bare_mat = B_mat.T @ hcore_orig @ B_mat
+        
+        # Add the missing core 1-electron energy (kinetic + nuclear attraction from the frozen core)
+        e1_core = float(cp.sum(dm_core_mat * hcore_orig))
+        
+        # Precompute the frozen core's 2-electron energy (constant during inner SCF)
+        v_eff_core_high = self.mf_inner_template.get_veff(self.full_mol, dm_core_mat)
+        e_coul_core = float(getattr(v_eff_core_high, 'ecoul', 0.0))
+        e_xc_core = float(getattr(v_eff_core_high, 'exc', 0.0))
+        
+        e_nuc_full = float(self.full_mol.energy_nuc())
+        mf_inner.energy_nuc = lambda *args, **kwargs: e_nuc_full
+        
+        # Override energy_elec to print the true full system energy
+        def custom_energy_elec(dm=None, h1e=None, vhf=None):
+            if dm is None: dm = mf_inner.make_rdm1()
+            if vhf is None: vhf = mf_inner.get_veff(mf_inner.mol, dm)
+            
+            dm_cp = _as_cupy(dm)
+            
+            # e1: Active space single-electron energy + Core single-electron energy
+            e1_active = float(cp.sum(dm_cp * h_eval_bare_mat))
+            e1 = e1_active + e1_core
+            
+            # e2: Full system 2e energy minus core 2e energy
+            ecoul_full = float(getattr(vhf, 'ecoul', 0.0))
+            exc_full = float(getattr(vhf, 'exc', 0.0))
+            e2 = ecoul_full + exc_full
+            
+            # Update scf_summary for meaningful debugging output
+            mf_inner.scf_summary['e1'] = e1
+            mf_inner.scf_summary['coul'] = ecoul_full - e_coul_core
+            mf_inner.scf_summary['exc'] = exc_full - e_xc_core
+            
+            return e1 + e2, e2
+            
+        mf_inner.energy_elec = custom_energy_elec
+
+        self.solve_embedded(ifrag)
+        if not self.mf_inner[ifrag].converged:
+            raise RuntimeError(
+                f"Embedded high-level SCF did not converge for fragment {ifrag}; "
+                "do not use this density for delta energy."
+            )
+        
+        dm_emb_high = _as_cupy(mf_inner.make_rdm1())
+        dm_emb_low = self.dm_emb_init[ifrag]
+        
+        B = self.B[ifrag]
+        dm_core = self.dm_core[ifrag]
+        is_mean_field = hasattr(self.mf_inner_template, 'get_veff')
+        
+        if is_mean_field:
+            h_eval_bare = B.T @ hcore_orig @ B
+            
+            e_high = self._evaluate_embedded_energy(
+                self.mf_inner_template, dm_emb_high, h_eval_bare, B, dm_core
+            )
+            self.log.note(f"High-Level E : {e_high:.8f}")
+            
+            # Evaluate Low-Level energy (mf_outer will automatically use exact get_veff for xc here)
+            e_low = self._evaluate_embedded_energy(
+                self.mf_outer, dm_emb_low, h_eval_bare, B, dm_core
+            )
+            self.log.note(f"Low-Level E : {e_low:.8f}")
+        else:
+            raise NotImplementedError("WFT evaluation is not implemented for this class.")
+        
+        delta_e = float(e_high - e_low)
+        self.log.note(f"Global Low-Level E : {e_global_low:.8f}")
+        self.log.note(f"Active Space dE    : {delta_e:.8f}")
+        
+        self.e_tot = e_global_low + delta_e
+        self.log.note(f"Total Embedded E   : {self.e_tot:.8f}")
+        
+        return self.e_tot
\ No newline at end of file
diff --git a/gpu4pyscf/qmmm/embedding/tests/test_dft_embedding.py b/gpu4pyscf/qmmm/embedding/tests/test_dft_embedding.py
new file mode 100644
index 000000000..4b643724d
--- /dev/null
+++ b/gpu4pyscf/qmmm/embedding/tests/test_dft_embedding.py
@@ -0,0 +1,195 @@
+# Copyright 2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import unittest
+import numpy as np
+import cupy as cp
+from pyscf import gto
+from gpu4pyscf.scf import hf as gpu_hf
+from gpu4pyscf.dft import rks
+from gpu4pyscf.qmmm.embedding import embedding
+from gpu4pyscf.qmmm.embedding.embedding_dft import SingleFragmentEmbedding
+
+
+class KnownValues(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+
+        cls.mol = gto.Mole()
+        cls.mol.atom = '''
+            C      -0.76091    -0.00000     0.00000
+            C       0.76091    -0.00000     0.00000
+            H      -1.16001     1.02029     0.00000
+            H      -1.16001    -0.51014    -0.88357
+            H      -1.16001    -0.51014     0.88357
+            H       1.16001    -1.02029     0.00000
+            H       1.16001     0.51014     0.88357
+            H       1.16001     0.51014    -0.88357    
+        '''
+        cls.mol.basis = '6-31g'
+        cls.mol.spin = 0
+        cls.mol.charge = 0
+        cls.mol.verbose = 0
+        cls.mol.build()
+
+        cls.fragments = [[0, 1], [2, 3]]
+
+    @classmethod
+    def tearDownClass(cls):
+        del cls.mol
+    
+    def test_b3lyp_in_b3lyp(self):
+
+        mf_outer = rks.RKS(self.mol, xc='B3LYP')
+        mf_inner_template = rks.RKS(self.mol, xc='B3LYP')
+
+        emb_obj = SingleFragmentEmbedding(mf_outer, mf_inner_template, [0, 2, 3, 4])
+        emb_obj.kernel()
+
+        e_ref = mf_outer.kernel()
+
+        assert np.abs(e_ref - emb_obj.e_tot) < 1e-8, f"Reference energy {e_ref} != Embedding energy {emb_obj.energy}"
+
+    def test_b3lyp_in_pbe(self):
+        mf_outer = rks.RKS(self.mol, xc='PBE')
+        mf_inner_template = rks.RKS(self.mol, xc='B3LYP')
+
+        emb_obj = SingleFragmentEmbedding(mf_outer, mf_inner_template, [i for i in range(8)])
+        emb_obj.kernel()
+
+        e_ref = mf_inner_template.kernel()
+
+        assert np.abs(e_ref - emb_obj.e_tot) < 1e-8, f"Reference energy {e_ref} != Embedding energy {emb_obj.energy}"
+
+    def test_algebraic_properties(self):
+        mf_outer = rks.RKS(self.mol, xc='PBE')
+        mf_inner = rks.RKS(self.mol, xc='PBE')
+        
+        emb_obj = SingleFragmentEmbedding(mf_outer, mf_inner, [0, 1, 2])
+        emb_obj.kernel()
+
+        S_ao = cp.asarray(mf_outer.get_ovlp())
+        B = emb_obj.B[0]
+        D_core = emb_obj.dm_core[0]
+
+        # Check B^T * S * B == I (Orthonormality of embedding basis)
+        ortho_check = B.T @ S_ao @ B
+        identity = cp.eye(B.shape[1])
+        max_ortho_err = float(cp.abs(ortho_check - identity).max())
+        self.assertTrue(max_ortho_err < 1e-10, 
+                        f"Basis B is not orthogonal, max error: {max_ortho_err}")
+
+        # Check Spatial Isolation (Core DM projected onto the active space must be zero)
+        core_overlap = B.T @ S_ao @ D_core @ S_ao @ B
+        max_overlap_err = float(cp.abs(core_overlap).max())
+        self.assertTrue(max_overlap_err < 1e-10, 
+                        f"Core DM leaks into Active Space, max error: {max_overlap_err}")
+
+    def test_electron_conservation(self):
+        mf_outer = rks.RKS(self.mol, xc='PBE')
+        mf_inner = rks.RKS(self.mol, xc='B3LYP')
+        emb_obj = SingleFragmentEmbedding(mf_outer, mf_inner, [0, 1])
+        emb_obj.kernel()
+        
+        S_ao = cp.asarray(mf_outer.get_ovlp())
+        D_emb_high = cp.asarray(emb_obj.mf_inner[0].make_rdm1())
+        D_core = emb_obj.dm_core[0]
+        B = emb_obj.B[0]
+        
+        # Project local active density back to full AO basis
+        D_emb_ao = B @ D_emb_high @ B.T # Identity S ignored
+        D_total_ao = D_core + D_emb_ao
+        
+        n_elec_calc = float(cp.trace(D_total_ao @ S_ao))
+        n_elec_exact = float(self.mol.nelectron)
+        
+        self.assertAlmostEqual(n_elec_calc, n_elec_exact, places=8, 
+                               msg=f"Electron loss: {n_elec_calc} != {n_elec_exact}")
+
+    def test_template_isolation_and_convergence(self):
+        mf_outer = rks.RKS(self.mol, xc='PBE')
+        mf_inner_template = rks.RKS(self.mol, xc='PBE')
+        
+        emb_obj = SingleFragmentEmbedding(mf_outer, mf_inner_template, [0, 2, 3, 4], threshold=-1.0)
+        emb_obj.kernel()
+        
+        mf_inner_template.kernel()
+        
+        # Assert the template is still clean and converges properly
+        self.assertTrue(mf_inner_template.converged, 
+                        "Template object was poisoned and failed to converge!")
+
+    def test_hexane_core_isolation_and_exactness(self):
+        mol = gto.Mole()
+        mol.atom = '''
+            C   1.4522500000  -2.8230000000   0.0000000000
+            C   1.4522500000  -1.2830000000   0.0000000000
+            C   0.0002500000  -0.7700000000   0.0000000000
+            C   0.0002500000   0.7700000000   0.0000000000
+            C  -1.4517500000   1.2830000000   0.0000000000
+            C  -1.4517500000   2.8230000000   0.0000000000
+            H   2.4792500000  -3.1870000000   0.0000000000
+            H   0.9382500000  -3.1870000000   0.8900000000
+            H   0.9382500000  -3.1870000000  -0.8900000000
+            H   1.9652500000  -0.9200000000   0.8900000000
+            H   1.9652500000  -0.9200000000  -0.8900000000
+            H  -0.5137500000  -1.1330000000  -0.8900000000
+            H  -0.5137500000  -1.1330000000   0.8900000000
+            H   0.5132500000   1.1330000000   0.8900000000
+            H   0.5132500000   1.1330000000  -0.8900000000
+            H  -1.9657500000   0.9200000000  -0.8900000000
+            H  -1.9657500000   0.9200000000   0.8900000000
+            H  -2.4797500000   3.1870000000   0.0000000000
+            H  -0.9377500000   3.1870000000   0.8900000000
+            H  -0.9377500000   3.1870000000  -0.8900000000
+        '''
+        mol.basis = 'sto3g'
+        mol.spin = 0
+        mol.verbose = 0
+        mol.build()
+
+        mf_outer = rks.RKS(mol, xc='PBE')
+        mf_inner = rks.RKS(mol, xc='PBE')
+        
+        methyl_fragment = [0, 6, 7, 8]
+        emb_obj = SingleFragmentEmbedding(mf_outer, mf_inner, methyl_fragment, threshold=1e-5)
+        emb_obj.kernel()
+        
+        mf_outer.kernel()
+        e_global = mf_outer.e_tot
+        e_embedded = emb_obj.e_tot
+        self.assertTrue(np.abs(e_global - e_embedded) < 1e-6, 
+                        f"PBE-in-PBE Exactness failed! Error: {np.abs(e_global - e_embedded)}")
+        
+        dm_core_sum = float(cp.sum(emb_obj.dm_core[0]))
+        self.assertTrue(dm_core_sum > 1.0, 
+                        "Hexane test did not generate a non-trivial Core DM. SVD truncation might be failing.")
+
+    def test_pure_dft_vk_bypass(self):
+        mf_outer = rks.RKS(self.mol, xc='PBE')
+        mf_inner = rks.RKS(self.mol, xc='PBE')
+        
+        emb_obj = SingleFragmentEmbedding(mf_outer, mf_inner, self.fragments[0])
+        try:
+            emb_obj.kernel()
+        except AttributeError as e:
+            self.fail(f"Embedding failed for Pure DFT due to missing vk attribute: {e}")
+            
+        self.assertTrue(emb_obj.e_tot is not None, "Pure DFT embedding failed to return an energy.")
+
+
+if __name__ == '__main__':
+    print("Full Tests for ONIOM-like DFT embedding.")
+    unittest.main()
diff --git a/gpu4pyscf/qmmm/embedding/tests/test_dft_embedding_harris.py b/gpu4pyscf/qmmm/embedding/tests/test_dft_embedding_harris.py
new file mode 100644
index 000000000..3a2db6773
--- /dev/null
+++ b/gpu4pyscf/qmmm/embedding/tests/test_dft_embedding_harris.py
@@ -0,0 +1,135 @@
+# Copyright 2021-2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import cupy as cp
+from pyscf import gto
+from gpu4pyscf.dft import rks
+from gpu4pyscf.qmmm.embedding.embedding_dft import SingleFragmentEmbedding
+from gpu4pyscf.qmmm.embedding.embedding_dft_harris import HarrisRKS, SingleFragmentEmbedding_ML
+
+
+def dummy_eval_density_func(mol, xc, grids):
+    mf = rks.RKS(mol)
+    mf.xc = xc
+    mf.grids = grids
+    mf.verbose = 0
+    mf.kernel()
+    
+    dm = cp.asarray(mf.make_rdm1())
+    
+    # Calculate exact J and K matrices
+    vj, vk = mf.get_jk(mol, dm)
+    e_j = 0.5 * float(cp.sum(dm * vj))
+    
+    is_hybrid = mf._numint.libxc.is_hybrid_xc(xc)
+    if is_hybrid:
+        hyb = mf._numint.libxc.hybrid_coeff(xc, spin=mol.spin)
+        vk = vk * hyb
+        e_k = 0.5 * float(cp.sum(dm * vk))
+    else:
+        vk = None
+        e_k = 0.0
+        
+    # Calculate exact Vxc and Exc
+    _, e_xc, vxc = mf._numint.nr_rks(mol, grids, xc, dm)
+    int_rho_vxc = float(cp.sum(dm * vxc))
+    
+    return vj, vk, vxc, e_j, e_k, float(e_xc), int_rho_vxc
+
+
+class TestMLEmbedding(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.mol = gto.Mole()
+        cls.mol.atom = '''
+            C      -0.76091    -0.00000     0.00000
+            C       0.76091    -0.00000     0.00000
+            H      -1.16001     1.02029     0.00000
+            H      -1.16001    -0.51014    -0.88357
+            H      -1.16001    -0.51014     0.88357
+            H       1.16001    -1.02029     0.00000
+            H       1.16001     0.51014     0.88357
+            H       1.16001     0.51014    -0.88357    
+        '''
+        cls.mol.basis = '6-31g'
+        cls.mol.spin = 0
+        cls.mol.charge = 0
+        cls.mol.verbose = 0
+        cls.mol.build()
+
+        cls.methyl_fragment = [0, 2, 3, 4]
+        cls.full_fragment = [i for i in range(cls.mol.natm)]
+
+    @classmethod
+    def tearDownClass(cls):
+        del cls.mol
+
+    def test_harris_rks_exactness(self):
+        mf_ref = rks.RKS(self.mol, xc='PBE')
+        mf_ref.verbose = 0
+        e_ref = mf_ref.kernel()
+
+        mf_harris = HarrisRKS(self.mol, dummy_eval_density_func, xc='PBE')
+        mf_harris.verbose = 0
+        e_harris = mf_harris.kernel()
+
+        self.assertAlmostEqual(e_ref, e_harris, places=8, 
+                               msg=f"HarrisRKS energy {e_harris} differs from exact RKS {e_ref}")
+
+    def test_full_system_pbe_in_pbe(self):
+        mf_outer = HarrisRKS(self.mol, dummy_eval_density_func, xc='PBE')
+        mf_inner = rks.RKS(self.mol, xc='PBE')
+        
+        emb_obj = SingleFragmentEmbedding_ML(mf_outer, mf_inner, self.full_fragment, verbose=0)
+        emb_obj.kernel()
+        
+        mf_outer.kernel()
+        e_global = mf_outer.e_tot
+        e_emb = emb_obj.e_tot
+        
+        self.assertAlmostEqual(e_global, e_emb, places=8, 
+                               msg="Full-system PBE-in-PBE failed exact cancellation.")
+
+    def test_equivalence_to_standard_embedding(self):
+
+        mf_outer_std = rks.RKS(self.mol, xc='PBE')
+        mf_inner_std = rks.RKS(self.mol, xc='B3LYP')
+        emb_std = SingleFragmentEmbedding(mf_outer_std, mf_inner_std, self.methyl_fragment, verbose=0)
+        e_std = emb_std.kernel()
+
+        mf_outer_ml = HarrisRKS(self.mol, dummy_eval_density_func, xc='PBE')
+        mf_inner_ml = rks.RKS(self.mol, xc='B3LYP')
+        emb_ml = SingleFragmentEmbedding_ML(mf_outer_ml, mf_inner_ml, self.methyl_fragment, verbose=0)
+        e_ml = emb_ml.kernel()
+
+        self.assertAlmostEqual(e_std, e_ml, places=8, 
+                               msg=f"ML Embedding {e_ml} diverged from Standard Embedding {e_std}!")
+
+    def test_harris_max_cycle_override(self):
+
+        mf_harris = HarrisRKS(self.mol, dummy_eval_density_func, xc='PBE')
+        mf_harris.max_cycle = 100 
+        mf_harris.verbose = 0
+        
+        mf_harris.kernel()
+        
+        self.assertEqual(mf_harris.max_cycle, 1, 
+                         "HarrisRKS failed to override malicious max_cycle setting!")
+
+if __name__ == '__main__':
+    print("Full Tests for ML-Driven ONIOM-like Embedding...")
+    unittest.main()
+
diff --git a/gpu4pyscf/qmmm/embedding/tests/test_dmet_embedding.py b/gpu4pyscf/qmmm/embedding/tests/test_dmet_embedding.py
new file mode 100644
index 000000000..ca0c1b6ab
--- /dev/null
+++ b/gpu4pyscf/qmmm/embedding/tests/test_dmet_embedding.py
@@ -0,0 +1,336 @@
+# Copyright 2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import unittest
+import numpy as np
+import cupy as cp
+from pyscf import gto
+from gpu4pyscf.scf import hf as gpu_hf
+from gpu4pyscf.dft import rks
+from gpu4pyscf.qmmm.embedding.embedding import DMET 
+from gpu4pyscf.qmmm.embedding import embedding
+
+
+class KnownValues(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+
+        cls.mol = gto.Mole()
+        cls.mol.atom = '''
+            H 0.0 0.0 0.0
+            H 0.0 0.0 1.0
+            H 0.0 0.0 2.0
+            H 0.0 0.0 3.0
+        '''
+        cls.mol.basis = 'sto-3g'
+        cls.mol.spin = 0
+        cls.mol.charge = 0
+        cls.mol.verbose = 0
+        cls.mol.build()
+
+        cls.fragments = [[0, 1], [2, 3]]
+
+        cls.mf_outer = gpu_hf.RHF(cls.mol)
+        cls.mf_outer.conv_tol = 1e-14
+        cls.mf_inner_template = gpu_hf.RHF(cls.mol)
+        cls.mf_inner_template.conv_tol = 1e-14
+
+        cls.mol2 = gto.Mole()
+        cls.mol2.atom = '''
+            C      -0.76091    -0.00000     0.00000
+            C       0.76091    -0.00000     0.00000
+            H      -1.16001     1.02029     0.00000
+            H      -1.16001    -0.51014    -0.88357
+            H      -1.16001    -0.51014     0.88357
+            H       1.16001    -1.02029     0.00000
+            H       1.16001     0.51014     0.88357
+            H       1.16001     0.51014    -0.88357    
+        '''
+        cls.mol2.basis = '6-31g'
+        cls.mol2.spin = 0
+        cls.mol2.charge = 0
+        cls.mol2.verbose = 0
+        cls.mol2.build()
+
+        cls.fragments2 = [[0, 2, 3, 4], [1, 5, 6, 7]]
+
+        cls.mf_outer2 = gpu_hf.RHF(cls.mol2)
+        cls.mf_outer2.conv_tol = 1e-12
+        cls.mf_inner_template2 = gpu_hf.RHF(cls.mol2)
+        cls.mf_inner_template2.conv_tol = 1e-12
+
+        cls.mol3 = gto.Mole()
+        cls.mol3.atom = '''
+            C   1.4522500000  -2.8230000000   0.0000000000
+            C   1.4522500000  -1.2830000000   0.0000000000
+            C   0.0002500000  -0.7700000000   0.0000000000
+            C   0.0002500000   0.7700000000   0.0000000000
+            C  -1.4517500000   1.2830000000   0.0000000000
+            C  -1.4517500000   2.8230000000   0.0000000000
+            H   2.4792500000  -3.1870000000   0.0000000000
+            H   0.9382500000  -3.1870000000   0.8900000000
+            H   0.9382500000  -3.1870000000  -0.8900000000
+            H   1.9652500000  -0.9200000000   0.8900000000
+            H   1.9652500000  -0.9200000000  -0.8900000000
+            H  -0.5137500000  -1.1330000000  -0.8900000000
+            H  -0.5137500000  -1.1330000000   0.8900000000
+            H   0.5132500000   1.1330000000   0.8900000000
+            H   0.5132500000   1.1330000000  -0.8900000000
+            H  -1.9657500000   0.9200000000  -0.8900000000
+            H  -1.9657500000   0.9200000000   0.8900000000
+            H  -2.4797500000   3.1870000000   0.0000000000
+            H  -0.9377500000   3.1870000000   0.8900000000
+            H  -0.9377500000   3.1870000000  -0.8900000000   
+        '''
+        cls.mol3.basis = '6-31g'
+        cls.mol3.spin = 0
+        cls.mol3.charge = 0
+        cls.mol3.verbose = 0
+        cls.mol3.build()
+
+        cls.fragments3 = [[0, 6, 7, 8],
+                        [1, 9, 10],
+                        [2, 11, 12],
+                        [3, 13, 14],
+                        [4, 15, 16],
+                        [5, 17, 18, 19]]
+
+        cls.mf_outer3 = gpu_hf.RHF(cls.mol3)
+        cls.mf_outer3.conv_tol = 1e-12
+        cls.mf_inner_template3 = gpu_hf.RHF(cls.mol3)
+        cls.mf_inner_template3.conv_tol = 1e-12
+
+    @classmethod
+    def tearDownClass(cls):
+        del cls.mol
+        del cls.mf_outer
+        del cls.mf_inner_template
+        del cls.mol2
+        del cls.mf_outer2
+        del cls.mf_inner_template2
+
+    def test_dmet_initialization(self):
+        dmet_solver = DMET(
+            mf_outer=self.mf_outer,
+            mf_inner=self.mf_inner_template,
+            fragments=self.fragments,
+            threshold=1e-5
+        )
+
+        nao = self.mol.nao_nr()
+        
+        self.assertEqual(dmet_solver.nfrags, 2, "Number of fragments should be 2.")
+        self.assertEqual(len(dmet_solver.frag_idx), 2, "Fragment indices list should have length 2.")
+        
+        self.assertEqual(dmet_solver.u_oao.shape, (nao, nao), "Correlation potential u_oao should be of shape (nao, nao).")
+        self.assertTrue(isinstance(dmet_solver.u_oao, cp.ndarray), "Correlation potential should be a CuPy array.")
+
+    def test_lowdin(self):
+        ovlp = self.mf_outer.get_ovlp()
+        X, X_inv = embedding.lowdin_orth(ovlp)
+        X_ref = cp.array([[ 1.1214051976, -0.3278815514,  0.0611473762, -0.0095874461],
+                          [-0.3278815514,  1.2643824327, -0.3597401082,  0.0611473762],
+                          [ 0.0611473762, -0.3597401082,  1.2643824327, -0.3278815514],
+                          [-0.0095874461,  0.0611473762, -0.3278815514,  1.1214051976]])
+        identity = cp.eye(4)
+        assert np.abs(X@X_inv - identity).max() < 1e-8, "Lowdin orthogonalization should yield an identity matrix."
+        assert np.abs(X - X_ref).max() < 1e-8, "Lowdin orthogonalization should yield a close-to-identity matrix."
+
+    def test_schmidt(self):
+        """
+        Test Schmidt decomposition with the rigorous norm-based filtering logic 
+        to prevent null vectors and preserve legitimate physical tails.
+        """
+        mol = gto.Mole()
+        mol.atom = '''
+            H 0.0 0.0 0.0
+            H 0.0 0.0 1.0
+            H 0.0 0.0 2.0
+            H 0.0 0.0 3.0
+        '''
+        mol.basis = '6-31g'
+        mol.spin = 0
+        mol.charge = 0
+        mol.verbose = 0
+        mol.build()
+
+        mf = gpu_hf.RHF(mol)
+        mf.kernel()
+        
+        s = mf.get_ovlp()
+        mo_coeff = mf.mo_coeff
+        X, X_inv = embedding.lowdin_orth(s)
+        mo_coeff_oao = X @ mo_coeff
+        C_occ = mo_coeff_oao[:, :2]
+        C_A = mo_coeff_oao[:4, :2]
+        
+        U, S, Vh = cp.linalg.svd(C_A, full_matrices=True)
+        C_rot = C_occ @ Vh.T
+        
+        threshold = 1e-5
+        is_bath_candidate = S > threshold
+        n_sv = len(S)
+        
+        raw_bath_orb_ref = C_rot[4:, :n_sv][:, is_bath_candidate]
+        if raw_bath_orb_ref.size > 0:
+            norms = cp.linalg.norm(raw_bath_orb_ref, axis=0)
+            valid_mask = norms > 1e-10
+            bath_orb_ref = raw_bath_orb_ref[:, valid_mask]
+            valid_norms = norms[valid_mask]
+            if bath_orb_ref.size > 0:
+                bath_orb_ref /= valid_norms
+        else:
+            bath_orb_ref = raw_bath_orb_ref
+            
+        bath_orb = embedding.schmidt_decompose(mo_coeff_oao, mf.mo_occ, [0,1,2,3], [4,5,6,7], threshold=threshold)[0]
+        
+        self.assertEqual(bath_orb.shape, bath_orb_ref.shape, 
+                         "Matrix shapes must match after norm-based filtering.")
+        if bath_orb.size > 0:
+            assert np.abs(bath_orb.get() - bath_orb_ref.get()).max() < 1e-8, \
+                "Schmidt decomposition should yield highly accurate normalized basis vectors."
+
+    def test_dmet_execution_and_convergence(self):
+        dmet_solver = DMET(
+            mf_outer=self.mf_outer,
+            mf_inner=self.mf_inner_template,
+            fragments=self.fragments,
+            threshold=1e-5,
+            max_macro_iter=20,
+            macro_tol=1e-3
+        )
+
+        e_tot = dmet_solver.kernel()
+
+        e_tot_ref = self.mf_outer.kernel()
+        
+        assert np.abs(e_tot - e_tot_ref) < 1e-8, "DMET energy should be close to the reference energy."
+        assert np.abs(dmet_solver.u_oao).sum() < 1e-8, "Correlation potential should be close to zero."
+        
+        dmet_solver2 = DMET(
+            mf_outer=self.mf_outer2,
+            mf_inner=self.mf_inner_template2,
+            fragments=self.fragments2,
+            threshold=1e-5,
+            max_macro_iter=20,
+            macro_tol=1e-3
+        )
+
+        e_tot = dmet_solver2.kernel()
+        self.mf_outer2.mo_coeff = None
+        e_tot_ref = self.mf_outer2.kernel()
+
+        dmet_solver2_iter1 = DMET(
+            mf_outer=self.mf_outer2,
+            mf_inner=self.mf_inner_template2,
+            fragments=self.fragments2,
+            threshold=1e-5,
+            max_macro_iter=1,
+            macro_tol=1e-3
+        )
+        e_tot_iter1 = dmet_solver2_iter1.kernel()
+
+        total_elec_dmet = 0.0
+        for ifrag in range(dmet_solver2.nfrags):
+            dm_high = cp.asnumpy(dmet_solver2.mf_inner[ifrag].make_rdm1())
+            n_frag_orbs = len(dmet_solver2.frag_idx[ifrag])
+            total_elec_dmet += np.trace(dm_high[:n_frag_orbs, :n_frag_orbs])
+        
+        assert np.abs(total_elec_dmet - (self.mol2.nelec[0] + self.mol2.nelec[1])) < 1e-8, \
+            "Sum of numbers of electrons from fragments should be close to the total number."
+        assert np.abs(e_tot - e_tot_ref) < 1e-8, "DMET energy should be close to the reference energy."
+        assert np.abs(e_tot_iter1 - e_tot) < 1e-8, "DMET energy should be converged in 1 macro iteration."
+        assert np.abs(dmet_solver2.u_oao).sum() < 1e-8, "Correlation potential should be close to zero."
+
+    def test_dmet_template_isolation(self):
+        dmet_solver = DMET(
+            mf_outer=self.mf_outer2,
+            mf_inner=self.mf_inner_template2,
+            fragments=self.fragments2,
+            threshold=1e-5,
+            max_macro_iter=3,
+            macro_tol=1e-3
+        )
+        dmet_solver.kernel()
+
+        self.mf_inner_template2.mo_coeff = None
+        self.mf_inner_template2.kernel()
+        
+        self.assertTrue(self.mf_inner_template2.converged, 
+                        "The inner template was poisoned by DMET macro-loops and failed to converge!")
+
+    def test_correlation_potential_symmetry(self):
+        dmet_solver = DMET(
+            mf_outer=self.mf_outer,
+            mf_inner=self.mf_inner_template,
+            fragments=self.fragments,
+            threshold=1e-5,
+            max_macro_iter=2
+        )
+        dmet_solver.kernel()
+
+        u = dmet_solver.u_oao
+        
+        sym_err = float(cp.abs(u - u.T).max())
+        self.assertTrue(sym_err < 1e-12, f"Correlation potential u_oao is not symmetric. Max err: {sym_err}")
+        
+        max_u_val = float(cp.abs(u).max())
+        self.assertTrue(max_u_val < 1e-7, f"Trivial correlation potential should be zero, but got max: {max_u_val}")
+
+    def test_multifragment_algebraic_and_conservation(self):
+        dmet_solver = DMET(
+            mf_outer=self.mf_outer3,
+            mf_inner=self.mf_inner_template3,
+            fragments=self.fragments3,
+            threshold=1e-5,
+            max_macro_iter=1
+        )
+        dmet_solver.kernel()
+
+        S_ao = cp.asarray(self.mf_outer3.get_ovlp())
+        n_total_elec = float(self.mol3.nelectron)
+
+        e_ref = self.mf_outer3.kernel()
+        assert np.abs(e_ref - dmet_solver.e_tot) < 1e-8, f"Reference energy {e_ref} != Embedding energy {dmet_solver.e_tot}"
+
+        for ifrag in range(dmet_solver.nfrags):
+            B = dmet_solver.B[ifrag]
+            D_core = dmet_solver.dm_core[ifrag]
+            D_emb_high = cp.asarray(dmet_solver.mf_inner[ifrag].make_rdm1())
+
+            # Check B^T * S * B == I for each fragment
+            ortho_check = B.T @ S_ao @ B
+            identity = cp.eye(B.shape[1])
+            max_ortho_err = float(cp.abs(ortho_check - identity).max())
+            self.assertTrue(max_ortho_err < 1e-10, 
+                            f"Fragment {ifrag}: Basis B is not orthonormal. Max err: {max_ortho_err}")
+
+            # Check Core DM spatial isolation from the active space
+            core_overlap = B.T @ S_ao @ D_core @ S_ao @ B
+            max_overlap_err = float(cp.abs(core_overlap).max())
+            self.assertTrue(max_overlap_err < 1e-10, 
+                            f"Fragment {ifrag}: Core DM leaks into Active Space. Max err: {max_overlap_err}")
+
+            # Check total electron conservation for this fragment representation
+            D_emb_ao = B @ D_emb_high @ B.T
+            D_total_ao = D_core + D_emb_ao
+            n_elec_calc = float(cp.trace(D_total_ao @ S_ao))
+            self.assertAlmostEqual(n_elec_calc, n_total_elec, places=8,
+                                   msg=f"Fragment {ifrag}: Electron loss detected. {n_elec_calc} != {n_total_elec}")
+
+
+if __name__ == '__main__':
+    print("Full Tests for DMET")
+    unittest.main()
\ No newline at end of file
diff --git a/gpu4pyscf/scf/hf.py b/gpu4pyscf/scf/hf.py
index 56c1d30eb..a46b1dbb1 100644
--- a/gpu4pyscf/scf/hf.py
+++ b/gpu4pyscf/scf/hf.py
@@ -662,6 +662,34 @@ def __call__(self, mol_or_geom, **kwargs):
         self._last_mol_fp = mol.ao_loc
         return e_tot
 
+
+def make_rdm2(mo_coeff, mo_occ, **kwargs):
+    '''Two-particle density matrix in AO representation
+
+    NOTE the indices of the two-particle density matrix is ordered to
+
+    dm2[p,q,r,s] = <q^+ s^+ r p>.
+
+    HF energy can be computed
+    E = einsum('pq,qp', hcore, 1pdm) + einsum('pqrs,pqrs', eri, 2pdm) / 2
+    where h1[p,q] = <p|h|q> and eri[p,q,r,s] = (pq|rs)
+to make the density matrix consistent with the density matrix obtained
+    from post-HF methods,
+
+    Args:
+        mo_coeff : 2D ndarray
+            Orbital coefficients. Each column is one orbital.
+        mo_occ : 1D ndarray
+            Occupancy
+    Returns:
+        Two-particle density matrix, 4D ndarray
+    '''
+    dm1 = make_rdm1(mo_coeff, mo_occ, **kwargs)
+    dm2 = (cupy.einsum('ij,kl->ijkl', dm1, dm1)
+         - cupy.einsum('ij,kl->iklj', dm1, dm1)/2)
+    return dm2
+
+
 class SCF(pyscf_lib.StreamObject):
 
     # attributes
@@ -813,7 +841,6 @@ def dump_flags(self, verbose=None):
     init_guess_by_chkfile    = return_cupy_array(hf_cpu.SCF.init_guess_by_chkfile)
     from_chk                 = return_cupy_array(hf_cpu.SCF.from_chk)
     get_init_guess           = hf_cpu.SCF.get_init_guess
-    make_rdm2                = NotImplemented
     energy_elec              = NotImplemented
     energy_tot               = energy_tot
     energy_nuc               = hf_cpu.SCF.energy_nuc
@@ -869,6 +896,11 @@ def make_rdm1(self, mo_coeff=None, mo_occ=None, **kwargs):
         if mo_coeff is None: mo_coeff = self.mo_coeff
         return make_rdm1(mo_coeff, mo_occ)
 
+    def make_rdm2(self, mo_coeff=None, mo_occ=None, **kwargs):
+        if mo_occ is None: mo_occ = self.mo_occ
+        if mo_coeff is None: mo_coeff = self.mo_coeff
+        return make_rdm2(mo_coeff, mo_occ)
+
     def dip_moment(self, mol=None, dm=None, unit='Debye', origin=None,
                    verbose=logger.NOTE):
         if mol is None: mol = self.mol