diff --git a/CHANGELOG b/CHANGELOG index 0abc3af7b..6824ccea9 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,10 @@ +v1.8.0 (2026-??-??) +------------------- +* API updates + - The eval_xc_eff function of numint module now returns exc as a 1-dimensional + (N,) array instead of a (N, 1) array. + + v1.7.1 (2026-05-28) ------------------- * New Features diff --git a/gpu4pyscf/dft/libxc.py b/gpu4pyscf/dft/libxc.py index 5ff9eddf9..27eea1d71 100644 --- a/gpu4pyscf/dft/libxc.py +++ b/gpu4pyscf/dft/libxc.py @@ -297,7 +297,7 @@ def compute(self, inp, output=None, do_exc=True, do_vxc=True, do_fxc=False, do_k setattr(out_params, label, array[0].data.ptr) setattr(buf_params, label, array[1].data.ptr) stream = cupy.cuda.get_current_stream() - lapl = cupy.empty(1) + lapl = cupy.empty(0) err = libgdft.GDFT_xc_mgga( stream.ptr, self.xc_func, diff --git a/gpu4pyscf/dft/mcfun_gpu.py b/gpu4pyscf/dft/mcfun_gpu.py index 7ee5c629b..219238ba2 100644 --- a/gpu4pyscf/dft/mcfun_gpu.py +++ b/gpu4pyscf/dft/mcfun_gpu.py @@ -151,7 +151,6 @@ def eval_xc_collinear_spin(func, rho_tm, deriv, spin_samples): xc_orig = func(rho_ts, deriv) exc_eff = xc_orig[0] - exc_eff = exc_eff[:,0] omega = omega.reshape(3, ngrids) if deriv > 0: diff --git a/gpu4pyscf/dft/numint.py b/gpu4pyscf/dft/numint.py index c50f9a678..35d2d29ac 100644 --- a/gpu4pyscf/dft/numint.py +++ b/gpu4pyscf/dft/numint.py @@ -25,7 +25,8 @@ from gpu4pyscf.gto.mole import basis_seg_contraction from gpu4pyscf.lib.cupy_helper import ( contract, get_avail_mem, load_library, add_sparse, release_gpu_stack, transpose_sum, - grouped_dot, grouped_gemm, reduce_to_device, take_last2d, ndarray, batched_vec3_norm2) + grouped_dot, grouped_gemm, reduce_to_device, take_last2d, ndarray, + batched_vec_norm2, batched_vec_dot, MEMPOOL_THRESHOLD) from gpu4pyscf.dft import xc_deriv, libxc from gpu4pyscf.lib import logger from gpu4pyscf.lib.multi_gpu import lru_cache @@ -347,18 +348,18 @@ def _vv10nlc(rho_drho, coords, weights, nlc_pars): rho_i = rho_drho[0] - rho_nonzero_mask = cupy.logical_and( + rho_nonzero_mask = cupy.where(cupy.logical_and( rho_i >= NLC_REMOVE_ZERO_RHO_GRID_THRESHOLD, cupy.abs(weights) > 1e-14, - ) + ))[0] rho_i = rho_i[rho_nonzero_mask] coords = cupy.ascontiguousarray(coords[rho_nonzero_mask]) weights = weights[rho_nonzero_mask] ngrids = coords.shape[0] - nabla_rho_i = cupy.ascontiguousarray(rho_drho[1:4, rho_nonzero_mask]) - gamma_i = batched_vec3_norm2(nabla_rho_i) + nabla_rho_i = rho_drho[1:4, rho_nonzero_mask] + gamma_i = batched_vec_norm2(nabla_rho_i.T) del nabla_rho_i omega_i = cupy.empty(ngrids) @@ -498,10 +499,10 @@ def _nr_rks_task(ni, mol, grids, xc_code, dm, mo_coeff, mo_occ, nelec = float(den.sum()) # libxc calls are still running on default stream if xctype != 'HF': - exc, vxc = ni.eval_xc_eff(xc_code, rho_tot, deriv=1, xctype=xctype)[:2] + exc, vxc = ni.eval_xc_eff(xc_code, rho_tot, deriv=1, xctype=xctype, spin=0)[:2] vxc = cupy.asarray(vxc, order='C') exc = cupy.asarray(exc, order='C') - excsum = float(cupy.dot(den, exc[:,0]).get()) + excsum = float(cupy.dot(den, exc).get()) wv = vxc wv *= weights if xctype == 'GGA': @@ -748,15 +749,12 @@ def nr_rks_group(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, wv = [] for i in range(nset): - if xctype == 'LDA': - exc, vxc = ni.eval_xc_eff(xc_code, rho_tot[i][0], deriv=1, xctype=xctype)[:2] - else: - exc, vxc = ni.eval_xc_eff(xc_code, rho_tot[i], deriv=1, xctype=xctype)[:2] + exc, vxc = ni.eval_xc_eff(xc_code, rho_tot[i], deriv=1, xctype=xctype, spin=0)[:2] vxc = cupy.asarray(vxc, order='C') exc = cupy.asarray(exc, order='C') den = rho_tot[i][0] * grids.weights nelec[i] = den.sum() - excsum[i] = cupy.sum(den * exc[:,0]) + excsum[i] = cupy.sum(den * exc) wv.append(vxc * grids.weights) if xctype == 'GGA': wv[i][0] *= .5 @@ -917,7 +915,7 @@ def _nr_uks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, dm_mask_buf = mo_buf = mo_coeff = None weights = cupy.asarray(grids.weights[grid_start:grid_end]) nelec = rho_tot[:,:,0].dot(weights).get() # 'sng,g->sn' - exc = cupy.empty((nset, ngrids_local, 1)) + exc = cupy.empty((nset, ngrids_local)) if xctype == 'LDA': vxc = cupy.zeros((nset, 2, 1, ngrids_local)) elif xctype == 'GGA': @@ -926,8 +924,9 @@ def _nr_uks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, vxc = cupy.zeros((nset, 2, 5, ngrids_local)) if xctype != 'HF': for i in range(nset): - exc[i], vxc[i] = ni.eval_xc_eff(xc_code, rho_tot[:,i,:,:], deriv=1, xctype=xctype)[:2] - excsum = cupy.einsum('ijg,g,jg->j', rho_tot[:,:,0], weights, exc[:,:,0]).get() + exc[i], vxc[i] = ni.eval_xc_eff(xc_code, rho_tot[:,i,:,:], + deriv=1, xctype=xctype, spin=1)[:2] + excsum = cupy.einsum('ijg,g,jg->j', rho_tot[:,:,0], weights, exc).get() wv = vxc * weights if xctype == 'GGA': wv[:,:,0] *= .5 @@ -1771,7 +1770,7 @@ def cache_xc_kernel(ni, mol, grids, xc_code, mo_coeff, mo_occ, spin=0, rho = cupy.stack([cupy.hstack(rhoa), cupy.hstack(rhob)], axis=0) t0 = log.timer_debug1('eval rho in fxc', *t0) if xctype != 'HF': - vxc, fxc = ni.eval_xc_eff(xc_code, rho, deriv=2, xctype=xctype)[1:3] + vxc, fxc = ni.eval_xc_eff(xc_code, rho, deriv=2, xctype=xctype, spin=spin)[1:3] else: vxc = 0 fxc = 0 @@ -1782,16 +1781,8 @@ def cache_xc_kernel(ni, mol, grids, xc_code, mo_coeff, mo_occ, spin=0, def batch_square(a): return a[0]**2 + a[1]**2 + a[2]**2 -def batch_square_inplace(a, out=None): - if out is None: - out = cupy.empty_like(a[0]) - cupy.square(a[0], out=out) - out += a[1] * a[1] - out += a[2] * a[2] - return out - def eval_xc_eff(ni, xc_code, rho, deriv=1, omega=None, xctype=None, - verbose=None, spin=None, buf=None): + verbose=None, spin=None, work=None): ''' Different from PySCF, this function employ cuda version libxc ''' @@ -1810,11 +1801,11 @@ def eval_xc_eff(ni, xc_code, rho, deriv=1, omega=None, xctype=None, if not all(x.on_gpu for x, w in xcfuns): ni_cpu = ni.to_cpu() ret = ni_cpu.eval_xc_eff(xc_code, rho.get(), deriv, xctype=xctype) - ret[0] = cupy.asarray(ret[0])[:,None] - for i in range(deriv): - ret[i+1] = cupy.asarray(ret[i+1]) + for i in range(deriv+1): + ret[i] = cupy.asarray(ret[i]) return ret + buf = work inp = {} if spin == 0: assert rho.dtype == np.float64 @@ -1823,49 +1814,33 @@ def eval_xc_eff(ni, xc_code, rho, deriv=1, omega=None, xctype=None, inp['rho'] = rho.ravel() elif xctype in ['GGA', 'MGGA']: inp['rho'] = rho[0] - sigma1 = ndarray(ngrids, buffer=buf) - inp['sigma'] = batch_square_inplace(rho[1:4], out=sigma1) + inp['sigma'] = batched_vec_norm2(rho[1:4].T, out=buf) if xctype == 'MGGA': inp['tau'] = rho[-1] # can be 4 (without laplacian) or 5 (with laplacian) else: - assert rho[0].dtype == np.float64 + assert rho.dtype == np.float64 ngrids = rho.shape[-1] if xctype == 'LDA' or xctype == 'HF': - rho2 = ndarray((ngrids, 2), buffer=buf) - rho2[:,0] = rho[0].ravel() - rho2[:,1] = rho[1].ravel() - inp['rho'] = rho2 + inp['rho'] = rho2 = ndarray((ngrids, 2), buffer=buf) + rho2[:] = rho.reshape(2, ngrids).T elif xctype == 'GGA': buf = ndarray((5, ngrids), buffer=buf) - rho2 = ndarray((ngrids, 2), buffer=buf[:2]) - sigma3 = ndarray((ngrids, 3), buffer=buf[2:]) - rho2[:,0] = rho[0,0] - rho2[:,1] = rho[1,0] - inp['rho'] = rho2 - batch_square_inplace(rho[0, 1:4], out=sigma3[:, 0]) - cupy.multiply(rho[0, 1], rho[1, 1], out=sigma3[:, 1]) - sigma3[:, 1] += rho[0,2]*rho[1,2] - sigma3[:, 1] += rho[0,3]*rho[1,3] - batch_square_inplace(rho[1, 1:4], out=sigma3[:, 2]) - inp['sigma'] = sigma3 + inp['rho'] = rho2 = ndarray((ngrids, 2), buffer=buf[:2]) + inp['sigma'] = sigma3 = ndarray((ngrids, 3), buffer=buf[2:]) + sigma3[:, 0] = batched_vec_norm2(rho[0, 1:4].T, out=buf) + sigma3[:, 1] = batched_vec_dot(rho[0,1:4].T, rho[1,1:4].T, out=buf) + sigma3[:, 2] = batched_vec_norm2(rho[1, 1:4].T, out=buf) + rho2[:] = rho[:,0].T else: # MGGA buf = ndarray((7, ngrids), buffer=buf) - rho2 = ndarray((ngrids, 2), buffer=buf[:2]) - sigma3 = ndarray((ngrids, 3), buffer=buf[2:5]) - tau2 = ndarray((ngrids, 2), buffer=buf[5:]) - rho2[:,0] = rho[0,0] - rho2[:,1] = rho[1,0] - inp['rho'] = rho2 - batch_square_inplace(rho[0, 1:4], out=sigma3[:, 0]) - cupy.multiply(rho[0, 1], rho[1, 1], out=sigma3[:, 1]) - sigma3[:, 1] += rho[0,2]*rho[1,2] - sigma3[:, 1] += rho[0,3]*rho[1,3] - batch_square_inplace(rho[1, 1:4], out=sigma3[:, 2]) - inp['sigma'] = sigma3 - tau2[:, 0] = rho[0,-1] - tau2[:, 1] = rho[1,-1] - inp['tau'] = tau2 # can be 4 (without laplacian) or 5 (with laplacian) - + inp['rho'] = rho2 = ndarray((ngrids, 2), buffer=buf[:2]) + inp['sigma'] = sigma3 = ndarray((ngrids, 3), buffer=buf[2:5]) + inp['tau'] = tau2 = ndarray((ngrids, 2), buffer=buf[5:]) + sigma3[:, 0] = batched_vec_norm2(rho[0, 1:4].T, out=buf) + sigma3[:, 1] = batched_vec_dot(rho[0,1:4].T, rho[1,1:4].T, out=buf) + sigma3[:, 2] = batched_vec_norm2(rho[1, 1:4].T, out=buf) + rho2[:] = rho[:,0].T + tau2[:] = rho[:,-1].T do_vxc = True do_fxc = deriv > 1 do_kxc = deriv > 2 @@ -1903,11 +1878,13 @@ def eval_xc_eff(ni, xc_code, rho, deriv=1, omega=None, xctype=None, ret_full[label] += val else: ret_full[label] = val + val = None + vxc = None fxc = None kxc = None - exc = ret_full["zk"] + exc = ret_full["zk"].ravel() vxc = [ret_full[label] for label in vxc_labels if label in ret_full] if do_fxc: fxc = [ret_full[label] for label in fxc_labels if label in ret_full] @@ -1916,8 +1893,8 @@ def eval_xc_eff(ni, xc_code, rho, deriv=1, omega=None, xctype=None, if do_kxc: kxc = xc_deriv.transform_kxc(rho, fxc, kxc, xctype, spin) if do_fxc: - fxc = xc_deriv.transform_fxc(rho, vxc, fxc, xctype, spin) - vxc = xc_deriv.transform_vxc(rho, vxc, xctype, spin) + fxc = xc_deriv.transform_fxc(rho, vxc, fxc, xctype, spin, work) + vxc = xc_deriv.transform_vxc(rho, vxc, xctype, spin, work) return exc, vxc, fxc, kxc @lru_cache(10) @@ -2218,7 +2195,6 @@ def build(self, mol, coords): cache_xc_kernel = cache_xc_kernel # cannot patch this function - eval_xc_eff = eval_xc_eff block_loop = _block_loop eval_ao = staticmethod(eval_ao) eval_rho = staticmethod(eval_rho) @@ -2240,6 +2216,67 @@ def reset(self): self.non0ao_idx = {} return self + def eval_xc_eff(self, xc_code, rho, deriv=1, *, omega=None, xctype=None, + spin=None, work=None): + if spin is None: + if rho.ndim >= 2 and rho.shape[0] == 2: + spin = 1 + else: + spin = 0 + + if xctype is None: + xctype = self._xc_type(xc_code) + + if spin == 0: + nvar = 1 + else: + if xctype == 'LDA' or xctype == 'HF': + nvar = 2 + elif xctype == 'GGA': + nvar = 5 + else: + nvar = 7 + if deriv == 3: + nvar = 216 + elif deriv == 2: + if spin == 1 and 'GGA' in xctype: + nvar = 36 + elif deriv == 1: + if spin == 1 and 'GGA' in xctype: + nvar = 10 + + ngrids = rho.shape[-1] + if work is None: + blksize = int(MEMPOOL_THRESHOLD / 8 / nvar) + blksize = min(ngrids, blksize // 64 * 64) + work = cupy.empty((nvar, blksize)) + else: + blksize = int(work.nbytes / 8 / nvar) + blksize = min(ngrids, blksize // 64 * 64) + if blksize == 0: + work = None # The input workspace is too small + + if xctype == 'LDA' or xctype == 'HF': + nvar = 1 + elif xctype == 'GGA': + nvar = 4 + else: + nvar = 5 + out = [None] * 4 + for i in range(deriv+1): + if spin == 0: + out[i] = cupy.empty([nvar] * i + [ngrids]) + else: + out[i] = cupy.empty([2, nvar] * i + [ngrids]) + + for p0, p1 in lib.prange(0, ngrids, blksize): + rho_sub = cupy.asarray(rho[...,p0:p1], order='C') + res = eval_xc_eff(self, xc_code, rho_sub, deriv=deriv, + xctype=xctype, spin=spin, work=work) + for i in range(deriv+1): + out[i][...,p0:p1] = res[i] + return out + def _contract_rho(bra, ket, rho=None): nao, ngrids = bra.shape rho = ndarray((ngrids,), buffer=rho) diff --git a/gpu4pyscf/dft/tests/test_libxc.py b/gpu4pyscf/dft/tests/test_libxc.py index 8421ab02e..4d9a1a777 100644 --- a/gpu4pyscf/dft/tests/test_libxc.py +++ b/gpu4pyscf/dft/tests/test_libxc.py @@ -70,14 +70,14 @@ def _check_xc(self, xc, spin=0, deriv=2, fxc_tol=1e-10, kxc_tol=1e-10): exc_cpu, vxc_cpu, fxc_cpu, kxc_cpu = ni_cpu.eval_xc_eff(xc, rho, deriv=deriv, xctype=xctype) exc_gpu, vxc_gpu, fxc_gpu, kxc_gpu = ni_gpu.eval_xc_eff(xc, cupy.array(rho), deriv=deriv, xctype=xctype) - print(f"{xc} {spin} exc", _diff(exc_gpu[:,0].get(), exc_cpu).max()) + print(f"{xc} {spin} exc", _diff(exc_gpu.get(), exc_cpu).max()) print(f"{xc} {spin} vxc", _diff(vxc_gpu.get(), vxc_cpu).max()) if fxc_gpu is not None: print(f"{xc} {spin} fxc", _diff(fxc_gpu.get(), fxc_cpu).max()) if kxc_gpu is not None: print(f"{xc} {spin} kxc", _diff(kxc_gpu.get(), kxc_cpu).max()) - assert _diff(exc_gpu[:,0].get(), exc_cpu).max() < 1e-10 + assert _diff(exc_gpu.get(), exc_cpu).max() < 1e-10 assert _diff(vxc_gpu.get(), vxc_cpu).max() < 1e-10 if fxc_gpu is not None: assert _diff(fxc_gpu.get(), fxc_cpu).max() < fxc_tol @@ -85,7 +85,7 @@ def _check_xc(self, xc, spin=0, deriv=2, fxc_tol=1e-10, kxc_tol=1e-10): assert _diff(kxc_gpu.get(), kxc_cpu).max() < kxc_tol def test_LDA(self): - whether_use_gpu = os.environ.get('LIBXC_ON_GPU', '0') == '1' + whether_use_gpu = True if whether_use_gpu: deriv = 3 print("test LDA with deriv 3") @@ -95,7 +95,7 @@ def test_LDA(self): self._check_xc('LDA_C_VWN', deriv=deriv) def test_GGA(self): - whether_use_gpu = os.environ.get('LIBXC_ON_GPU', '0') == '1' + whether_use_gpu = True if whether_use_gpu: deriv = 3 else: @@ -105,7 +105,7 @@ def test_GGA(self): self._check_xc('GGA_C_PBE', fxc_tol=1e-4, deriv=deriv, kxc_tol=3e2) def test_mGGA(self): - whether_use_gpu = os.environ.get('LIBXC_ON_GPU', '0') == '1' + whether_use_gpu = True if whether_use_gpu: deriv = 3 else: diff --git a/gpu4pyscf/dft/tests/test_numint.py b/gpu4pyscf/dft/tests/test_numint.py index 49d49181a..de51ba2ef 100644 --- a/gpu4pyscf/dft/tests/test_numint.py +++ b/gpu4pyscf/dft/tests/test_numint.py @@ -14,8 +14,9 @@ import unittest import numpy as np -import pyscf import cupy +import cupy as cp +import pyscf from pyscf import lib, scf from pyscf.dft.numint import NumInt as pyscf_numint from gpu4pyscf.dft import Grids @@ -23,6 +24,7 @@ from gpu4pyscf.dft.numint import NumInt from gpu4pyscf import dft from gpu4pyscf.dft import gen_grid +from gpu4pyscf.dft import xc_deriv def setUpModule(): global mol, grids_cpu, grids_gpu, dm, dm0, dm1, mo_occ, mo_coeff @@ -442,6 +444,60 @@ def test_get_rho_with_derivatives_unrestricted_dm_input(self): assert ref_rho.shape == (2, 5, mf.grids.coords.shape[0]) assert np.max(np.abs(test_rho - ref_rho)) < 1e-11 + def test_ud2ts(self): + matrix = cp.array([[0.5, 0.5], + [0.5, -0.5]]) + v_ud = cp.random.rand(2,4,10) + ref = cp.einsum('ra,axg->rxg', matrix, v_ud) + assert abs(xc_deriv.ud2ts(v_ud) - ref).max() < 1e-14 + + v_ud = cp.random.rand(2,4,2,4,10) + ref = cp.einsum('ra,tb,axbyg->rxtyg', matrix, matrix, v_ud) + assert abs(xc_deriv.ud2ts(v_ud) - ref).max() < 1e-14 + + v_ud = cp.random.rand(2,4,2,4,2,4,10) + ref = cp.einsum('ra,tb,sc,axbyczg->rxtyszg', matrix, matrix, matrix, v_ud) + assert abs(xc_deriv.ud2ts(v_ud) - ref).max() < 1e-14 + + def test_eval_xc_eff_limited_memory(self): + ni_cpu = pyscf_numint() + ni_gpu = NumInt() + ngrids = 20000 + with lib.temporary_env(numint, MEMPOOL_THRESHOLD=80000): + cp.random.seed(2) + rho = cp.random.rand(ngrids) * 1e-1 + .5 + dat = ni_gpu.eval_xc_eff('lda', rho, deriv=1) + ref = ni_cpu.eval_xc_eff('lda', rho.get(), deriv=1) + assert abs(dat[1].get() - ref[1]).max() < 1e-14 + + rho = cp.random.rand(2, ngrids) * 1e-1 + .5 + dat = ni_gpu.eval_xc_eff('lda', rho, deriv=2) + ref = ni_cpu.eval_xc_eff('lda', rho.get(), deriv=2) + assert abs(dat[1].get() - ref[1]).max() < 1e-14 + assert abs(dat[2].get() - ref[2]).max() < 1e-14 + + rho = cp.random.rand(4, ngrids) * 1e-1 + .5 + dat = ni_gpu.eval_xc_eff('pbe', rho, deriv=1) + ref = ni_cpu.eval_xc_eff('pbe', rho.get(), deriv=1) + assert abs(dat[1].get() - ref[1]).max() < 1e-14 + + rho = cp.random.rand(2, 4, ngrids) * 1e-1 + .5 + dat = ni_gpu.eval_xc_eff('pbe', rho, deriv=2) + ref = ni_cpu.eval_xc_eff('pbe', rho.get(), deriv=2) + assert abs(dat[1].get() - ref[1]).max() < 1e-14 + assert abs(dat[2].get() - ref[2]).max() < 1e-14 + + rho = cp.random.rand(5, ngrids) * 1e-1 + .5 + dat = ni_gpu.eval_xc_eff('r2scan', rho, deriv=1) + ref = ni_cpu.eval_xc_eff('r2scan', rho.get(), deriv=1) + assert abs(dat[1].get() - ref[1]).max() < 1e-14 + + rho = cp.random.rand(2, 5, ngrids) * 1e-1 + .5 + dat = ni_gpu.eval_xc_eff('r2scan', rho, deriv=2) + ref = ni_cpu.eval_xc_eff('r2scan', rho.get(), deriv=2) + assert abs(dat[1].get() - ref[1]).max() < 1e-14 + assert abs(dat[2].get() - ref[2]).max() < 1e-14 + if __name__ == "__main__": print("Full Tests for dft numint") unittest.main() diff --git a/gpu4pyscf/dft/xc_deriv.py b/gpu4pyscf/dft/xc_deriv.py index 88b66bd69..23b83b1fa 100644 --- a/gpu4pyscf/dft/xc_deriv.py +++ b/gpu4pyscf/dft/xc_deriv.py @@ -16,11 +16,10 @@ Transform XC functional derivatives between different representations ''' import numpy as np -import cupy -from pyscf.dft.xc_deriv import _stack_fg, _stack_frr, _stack_fgg -from gpu4pyscf.lib.cupy_helper import contract +import cupy as cp +from gpu4pyscf.lib.cupy_helper import contract, ndarray -def transform_vxc(rho, vxc, xctype, spin=0): +def transform_vxc(rho, vxc, xctype, spin=0, work=None): r''' The output tensor has the shape: * spin polarized @@ -32,7 +31,7 @@ def transform_vxc(rho, vxc, xctype, spin=0): GGA : [4,N] MGGA: [5,N] ''' - rho = cupy.asarray(rho, order='C') + rho = cp.asarray(rho, order='C') if xctype == 'GGA': order = 1 nvar = 4 @@ -54,24 +53,30 @@ def transform_vxc(rho, vxc, xctype, spin=0): if order == 0: vp = fr.reshape(2, nvar, ngrids) else: - vp = cupy.empty((2, nvar, ngrids)) + vp = cp.empty((2, nvar, ngrids)) vp[:,0] = fr - #vp[:,1:4] = _stack_fg(fg, rho=rho) - vp[:,1:4] = contract('abg,bxg->axg', _stack_fg(fg), rho[:,1:4]) + #vp[:,1:4] = cp.einsum('abg,axg->bxg', _stack_fg(fg), rho[:,1:4]) + buf = ndarray((10, ngrids), buffer=work) + buf1 = buf[4:].reshape(2, 3, ngrids) + fg = _stack_fg(fg, out=buf) + cp.multiply(fg[0,:,None], rho[0,1:4], out=vp[:,1:4]) + vp[:,1:4] += cp.multiply(fg[1,:,None], rho[1,1:4], out=buf1) if order > 1: vp[:,4] = ft else: if order == 0: vp = fr.reshape(nvar, ngrids) else: - vp = cupy.empty((nvar, ngrids)) + vp = cp.empty((nvar, ngrids)) vp[0] = fr - vp[1:4] = 2 * fg * rho[1:4] + # vp[1:4] = 2 * fg * rho[1:4] + cp.multiply(fg, rho[1:4], out=vp[1:4]) + vp[1:4] *= 2 if order > 1: vp[4] = ft return vp -def transform_fxc(rho, vxc, fxc, xctype, spin=0): +def transform_fxc(rho, vxc, fxc, xctype, spin=0, work=None): r''' The output tensor has the shape: * spin polarized @@ -80,7 +85,7 @@ def transform_fxc(rho, vxc, fxc, xctype, spin=0): MGGA: [2,5,2,5,N] * spin unpolarized is not implemented ''' - rho = cupy.asarray(rho, order='C') + rho = cp.asarray(rho, order='C') if xctype == 'GGA': order = 1 nvar = 4 @@ -101,32 +106,37 @@ def transform_fxc(rho, vxc, fxc, xctype, spin=0): ngrids = rho.shape[-1] if spin == 1: if order == 0: - vp = _stack_frr(frr).reshape(2,nvar, 2,nvar, ngrids).transpose(1,3,0,2,4) + vp = _stack_frr(frr).reshape(2,nvar, 2,nvar, ngrids) else: - vp = cupy.empty((2,nvar, 2,nvar, ngrids)).transpose(1,3,0,2,4) - vp[0,0] = _stack_frr(frr) + vp = cp.empty((nvar, nvar, 2,2,ngrids)) + _stack_frr(frr, out=vp[0,0]) i3 = np.arange(3) - qgg = _stack_fgg(fgg) - qgg = cupy.einsum('abcdg,axg->xbcdg', qgg, rho[:,1:4]) - qgg = cupy.einsum('xbcdg,cyg->xybdg', qgg, rho[:,1:4]) - #qgg = _stack_fgg(fgg, rho=rho).transpose(1,3,0,2,4) - qgg[i3,i3] += _stack_fg(fg) - vp[1:4,1:4] = qgg + qgg = _stack_fgg(fgg, out=work) + #:qgg = cp.einsum('abcdg,axg->xbcdg', qgg, rho[:,1:4]) + #:qgg = cp.einsum('xbcdg,cyg->xybdg', qgg, rho[:,1:4]) + #:qgg[i3,i3] += _stack_fg(fg) + #:vp[1:4,1:4] = qgg + tmp = qgg[0] * rho[0,1:4,None,None,None] + tmp += qgg[1] * rho[1,1:4,None,None,None] + qgg = vp[1:4,1:4] + cp.multiply(tmp[:,None,:,0], rho[0,1:4,None,None], out=qgg) + qgg += tmp[:,None,:,1] * rho[1,1:4,None,None] + qgg[i3,i3] += _stack_fg(fg, out=work) frg = frg.reshape(2,3,ngrids) - qrg = _stack_fg(frg, axis=1) - qrg = cupy.einsum('rabg,axg->xrbg', qrg, rho[:,1:4]) - #qrg = _stack_fg(frg, axis=1, rho=rho).transpose(2,0,1,3) - vp[0,1:4] = qrg - vp[1:4,0] = qrg.transpose(0,2,1,3) + qrg = _stack_fg(frg, axis=1, out=work) + #:vp[0,1:4] = cp.einsum('rabg,axg->xrbg', qrg, rho[:,1:4]) + vp[0,1:4] = qrg[:,0] * rho[0,1:4,None,None] + vp[0,1:4] += qrg[:,1] * rho[1,1:4,None,None] + vp[1:4,0] = vp[0,1:4].transpose(0,2,1,3) if order > 1: fgt = fgt.reshape(3,2,ngrids) - qgt = _stack_fg(fgt, axis=0) - qgt = cupy.einsum('abrg,axg->xbrg', qgt, rho[:,1:4]) - # qgt = _stack_fg(fgt, axis=0, rho=rho).transpose(1,0,2,3) - vp[1:4,4] = qgt - vp[4,1:4] = qgt.transpose(0,2,1,3) + qgt = _stack_fg(fgt, axis=0, out=work) + #:vp[1:4,4] = cp.einsum('abrg,axg->xbrg', qgt, rho[:,1:4]) + vp[1:4,4] = qgt[0] * rho[0,1:4,None,None] + vp[1:4,4] += qgt[1] * rho[1,1:4,None,None] + vp[4,1:4] = vp[1:4,4].transpose(0,2,1,3) qrt = frt.reshape(2,2,ngrids) vp[0,4] = qrt @@ -134,21 +144,33 @@ def transform_fxc(rho, vxc, fxc, xctype, spin=0): vp[4,4] = _stack_frr(ftt) - vp = vp.transpose(2,0,3,1,4) + if order != 0: + vp = vp.transpose(2,0,3,1,4) else: if order == 0: vp = frr.reshape(nvar, nvar, ngrids) else: - vp = cupy.empty((nvar, nvar, ngrids)) + vp = cp.empty((nvar, nvar, ngrids)) vp[0,0] = frr i3 = np.arange(3) - qgg = 4 * fgg * rho[1:4] * rho[1:4,None] + #:qgg = 4 * fgg * rho[1:4] * rho[1:4,None] + #:qgg[i3,i3] += fg * 2 + #:vp[1:4,1:4] = qgg + qgg = vp[1:4,1:4] + cp.multiply(rho[1:4], rho[1:4,None], qgg) + qgg *= fgg + qgg *= 4 qgg[i3,i3] += fg * 2 - vp[1:4,1:4] = qgg - vp[0,1:4] = vp[1:4,0] = 2 * frg * rho[1:4] + #:vp[0,1:4] = vp[1:4,0] = 2 * frg * rho[1:4] + cp.multiply(frg, rho[1:4], out=vp[0,1:4]) + vp[0,1:4] *= 2 + vp[1:4,0] = vp[0,1:4] if order > 1: - vp[4,1:4] = vp[1:4,4] = 2 * fgt * rho[1:4] + #:vp[4,1:4] = vp[1:4,4] = 2 * fgt * rho[1:4] + cp.multiply(fgt, rho[1:4], out=vp[4,1:4]) + vp[4,1:4] *= 2 + vp[1:4,4] = vp[4,1:4] vp[0,4] = frt vp[4,0] = frt vp[4,4] = ftt @@ -166,7 +188,7 @@ def transform_kxc(rho, fxc, kxc, xctype, spin=0): GGA : [4,4,4,N] MGGA: [5,5,5,N] ''' - rho = cupy.asarray(rho, order='C') + rho = cp.asarray(rho, order='C') if xctype == 'GGA': order = 1 nvar = 4 @@ -191,17 +213,17 @@ def transform_kxc(rho, fxc, kxc, xctype, spin=0): if order == 0: vp = _stack_frrr(frrr).reshape(2,nvar, 2,nvar, 2,nvar, ngrids).transpose(1,3,5,0,2,4,6) else: - vp = cupy.empty((2,nvar, 2,nvar, 2,nvar, ngrids)).transpose(1,3,5,0,2,4,6) + vp = cp.empty((2,nvar, 2,nvar, 2,nvar, ngrids)).transpose(1,3,5,0,2,4,6) vp[0,0,0] = _stack_frrr(frrr) i3 = np.arange(3) qggg = _stack_fggg(fggg) - qggg = contract('abcdefg,axg->xbcdefg', qggg, rho[:,1:4]) - qggg = contract('xbcdefg,cyg->xybdefg', qggg, rho[:,1:4]) - qggg = contract('xybdefg,ezg->xyzbdfg', qggg, rho[:,1:4]) + qggg = cp.einsum('abcdefg,axg->xbcdefg', qggg, rho[:,1:4]) + qggg = cp.einsum('xbcdefg,cyg->xybdefg', qggg, rho[:,1:4]) + qggg = cp.einsum('xybdefg,ezg->xyzbdfg', qggg, rho[:,1:4]) # qggg = _stack_fggg(fggg, rho=rho).transpose(1,3,5,0,2,4,6) - # qggg = cupy.asarray(qggg) + # qggg = cp.asarray(qggg) qgg = _stack_fgg(fgg) - qgg = contract('abcdg,axg->xbcdg', qgg, rho[:,1:4]) + qgg = cp.einsum('abcdg,axg->xbcdg', qgg, rho[:,1:4]) for i in range(3): qggg[:,i,i] += qgg qggg[i,:,i] += qgg.transpose(0,2,1,3,4) @@ -210,11 +232,11 @@ def transform_kxc(rho, fxc, kxc, xctype, spin=0): frgg = frgg.reshape(2,6,ngrids) qrgg = _stack_fgg(frgg, axis=1) - qrgg = contract('rabcdg,axg->xrbcdg', qrgg, rho[:,1:4]) - qrgg = contract('xrbcdg,cyg->xyrbdg', qrgg, rho[:,1:4]) + qrgg = cp.einsum('rabcdg,axg->xrbcdg', qrgg, rho[:,1:4]) + qrgg = cp.einsum('xrbcdg,cyg->xyrbdg', qrgg, rho[:,1:4]) # qrgg = _stack_fgg(frgg.get(), axis=1, rho=rho.get()).transpose(2,4,0,1,3,5) qrg = _stack_fg(frg.reshape(2,3,ngrids), axis=1) - # qrgg = cupy.asarray(qrgg) + # qrgg = cp.asarray(qrgg) qrgg[i3,i3] += qrg vp[0,1:4,1:4] = qrgg vp[1:4,0,1:4] = qrgg.transpose(0,1,3,2,4,5) @@ -223,9 +245,9 @@ def transform_kxc(rho, fxc, kxc, xctype, spin=0): frrg = frrg.reshape(3,3,ngrids) qrrg = _stack_frr(frrg, axis=0) qrrg = _stack_fg(qrrg, axis=2) - qrrg = contract('rsabg,axg->rsxbg', qrrg, rho[:,1:4]).transpose(2,0,1,3,4) + qrrg = cp.einsum('rsabg,axg->rsxbg', qrrg, rho[:,1:4]).transpose(2,0,1,3,4) # qrrg = _stack_fg(qrrg.get(), axis=2, rho=rho.get()).transpose(3,0,1,2,4) - # qrrg = cupy.asarray(qrrg) + # qrrg = cp.asarray(qrrg) vp[0,0,1:4] = qrrg vp[0,1:4,0] = qrrg.transpose(0,1,3,2,4) vp[1:4,0,0] = qrrg.transpose(0,3,1,2,4) @@ -233,8 +255,8 @@ def transform_kxc(rho, fxc, kxc, xctype, spin=0): if order > 1: fggt = fggt.reshape(6,2,ngrids) qggt = _stack_fgg(fggt, axis=0) - qggt = contract('abcdrg,axg->xbcdrg', qggt, rho[:,1:4]) - qggt = contract('xbcdrg,cyg->xybdrg', qggt, rho[:,1:4]) + qggt = cp.einsum('abcdrg,axg->xbcdrg', qggt, rho[:,1:4]) + qggt = cp.einsum('xbcdrg,cyg->xybdrg', qggt, rho[:,1:4]) # qggt = _stack_fgg(fggt, axis=0, rho=rho).transpose(1,3,0,2,4,5) qgt = _stack_fg(fgt.reshape(3,2,ngrids), axis=0) i3 = np.arange(3) @@ -245,7 +267,7 @@ def transform_kxc(rho, fxc, kxc, xctype, spin=0): qgtt = _stack_frr(fgtt.reshape(3,3,ngrids), axis=1) qgtt = _stack_fg(qgtt, axis=0) - qgtt = contract('abrsg,axg->xbrsg', qgtt, rho[:,1:4]) + qgtt = cp.einsum('abrsg,axg->xbrsg', qgtt, rho[:,1:4]) # qgtt = _stack_fg(qgtt, axis=0, rho=rho).transpose(1,0,2,3,4) vp[1:4,4,4] = qgtt vp[4,1:4,4] = qgtt.transpose(0,2,1,3,4) @@ -253,7 +275,7 @@ def transform_kxc(rho, fxc, kxc, xctype, spin=0): frgt = frgt.reshape(2,3,2,ngrids) qrgt = _stack_fg(frgt, axis=1) - qrgt = contract('rabsg,axg->xrbsg', qrgt, rho[:,1:4]) + qrgt = cp.einsum('rabsg,axg->xrbsg', qrgt, rho[:,1:4]) # qrgt = _stack_fg(frgt, axis=1, rho=rho).transpose(2,0,1,3,4) vp[0,1:4,4] = qrgt vp[0,4,1:4] = qrgt.transpose(0,1,3,2,4) @@ -280,7 +302,7 @@ def transform_kxc(rho, fxc, kxc, xctype, spin=0): if order == 0: vp = frrr.reshape(nvar, nvar, nvar, ngrids) else: - vp = cupy.empty((nvar, nvar, nvar, ngrids)) + vp = cp.empty((nvar, nvar, nvar, ngrids)) vp[0,0,0] = frrr i3 = np.arange(3) qggg = 8 * fggg * rho[1:4] * rho[1:4,None] * rho[1:4,None,None] @@ -344,7 +366,7 @@ def _stack_frrr(frrr, axis=0): [[1, 2], [2, 3]]] return frrr[tuple(slices)] -def _stack_fggg(fggg, axis=0, rho=None): +def _stack_fggg(fggg, axis=0): ''' fggg [uu_uu_uu, uu_uu_ud, uu_uu_dd, uu_ud_ud, uu_ud_dd, uu_dd_dd, ud_ud_ud, ud_ud_dd, ud_dd_dd, dd_dd_dd] -> tensor with shape [2,2, 2,2, 2,2, ...] @@ -356,27 +378,82 @@ def _stack_fggg(fggg, axis=0, rho=None): [[1, 3, 4], [3, 6, 7], [4, 7, 8]], [[2, 4, 5], [4, 7, 8], [5, 8, 9]]] fggg = fggg[tuple(slices)] - fggg = _stack_fg(fggg, axis=axis+2, rho=rho) - fggg = _stack_fg(fggg, axis=axis+1, rho=rho) - return _stack_fg(fggg, axis=axis, rho=rho) + fggg = _stack_fg(fggg, axis=axis+2) + fggg = _stack_fg(fggg, axis=axis+1) + return _stack_fg(fggg, axis=axis) def ud2ts(v_ud): - v_ts = cupy.asarray(v_ud) + v_ud = cp.asarray(v_ud) order = v_ud.ndim // 2 - - if order == 0 and v_ts.shape[0] != 2: + if order == 0 and v_ud.shape[0] != 2: raise ValueError("No spin axis found in the input array.") - matrix = cupy.array([[0.5, 0.5], - [0.5, -0.5]]) + #:matrix = cp.array([[0.5, 0.5], + #: [0.5, -0.5]]) if order == 1: - v_ts = contract('ra,axg->rxg', matrix, v_ud) + #:v_ts = cp.einsum('ra,axg->rxg', matrix, v_ud) + v_ts = cp.empty_like(v_ud) + cp.add(v_ud[0], v_ud[1], out=v_ts[0]) + cp.subtract(v_ud[0], v_ud[1], out=v_ts[1]) + v_ts *= .5 elif order == 2: - v_ts = cupy.einsum('ra,tb,axbyg->rxtyg', matrix, matrix, v_ud) + #:v_ts = cp.einsum('ra,tb,axbyg->rxtyg', matrix, matrix, v_ud) + tmp = cp.empty_like(v_ud) + cp.add(v_ud[0], v_ud[1], out=tmp[0]) + cp.subtract(v_ud[0], v_ud[1], out=tmp[1]) + v_ts = cp.empty_like(v_ud) + cp.add(tmp[:,:,0], tmp[:,:,1], out=v_ts[:,:,0]) + cp.subtract(tmp[:,:,0], tmp[:,:,1], out=v_ts[:,:,1]) + v_ts *= .25 elif order == 3: - v_ts = cupy.einsum('ra,tb,sc,axbyczg->rxtyszg', matrix, matrix, matrix, v_ud) + #:v_ts = cp.einsum('ra,tb,sc,axbyczg->rxtyszg', matrix, matrix, matrix, v_ud) + v_ts = cp.empty_like(v_ud) + tmp = cp.empty_like(v_ud) + cp.add(v_ud[0], v_ud[1], out=v_ts[0]) + cp.subtract(v_ud[0], v_ud[1], out=v_ts[1]) + cp.add(v_ts[:,:,0], v_ts[:,:,1], out=tmp[:,:,0]) + cp.subtract(v_ts[:,:,0], v_ts[:,:,1], out=tmp[:,:,1]) + cp.add(tmp[:,:,:,:,0], tmp[:,:,:,:,1], out=v_ts[:,:,:,:,0]) + cp.subtract(tmp[:,:,:,:,0], tmp[:,:,:,:,1], out=v_ts[:,:,:,:,1]) + v_ts *= .125 else: raise NotImplementedError(f"Order {order} not implemented.") - return v_ts + +def _stack_fg(fg, axis=0, out=None): + '''fg [uu, ud, dd] -> [[uu*2, ud], [du, dd*2]]''' + qg = _stack_frr(fg, axis, out) + if axis == 0: + qg[0,0] *= 2 + qg[1,1] *= 2 + elif axis == 1: + qg[:,0,0] *= 2 + qg[:,1,1] *= 2 + elif axis == 2: + qg[:,:,0,0] *= 2 + qg[:,:,1,1] *= 2 + else: + raise NotImplementedError + return qg + +def _stack_frr(frr, axis=0, out=None): + '''frr [u_u, u_d, d_d] -> [[u_u, u_d], [d_u, d_d]]''' + assert frr.shape[axis] == 3 + out = ndarray(frr.shape[:axis] + (4,) + frr.shape[axis+1:], buffer=out) + cp.take(frr, np.array([0, 1, 1, 2]), axis=axis, out=out) + return out.reshape(frr.shape[:axis] + (2, 2) + frr.shape[axis+1:]) + +def _stack_fgg(fgg, axis=0, out=None): + ''' + fgg [uu_uu, uu_ud, uu_dd, ud_ud, ud_dd, dd_dd] -> + [[uu_uu, ud_ud, ud_dd], + [ud_uu, ud_ud, ud_dd], + [dd_uu, dd_ud, dd_dd]] -> tensor with shape [2,2, 2,2, ...] + ''' + assert fgg.shape[axis] == 6 + tmp = ndarray(fgg.shape[:axis] + (9,) + fgg.shape[axis+1:], buffer=out) + cp.take(fgg, np.array([0,1,2, 1,3,4, 2,4,5]), axis=axis, out=tmp) + tmp = tmp.reshape(fgg.shape[:axis] + (3, 3) + fgg.shape[axis+1:]) + tmp = _stack_fg(tmp, axis=axis+1) + return _stack_fg(tmp, axis=axis, out=out) diff --git a/gpu4pyscf/grad/rks.py b/gpu4pyscf/grad/rks.py index 81e11cf38..68631630a 100644 --- a/gpu4pyscf/grad/rks.py +++ b/gpu4pyscf/grad/rks.py @@ -28,7 +28,7 @@ from gpu4pyscf.dft import gen_grid from gpu4pyscf.lib.cupy_helper import ( contract, get_avail_mem, add_sparse, tag_array, sandwich_dot, - reduce_to_device, take_last2d, ndarray, batched_vec3_norm2) + reduce_to_device, take_last2d, ndarray, batched_vec_norm2) from gpu4pyscf.lib import logger from gpu4pyscf.__config__ import num_devices from gpu4pyscf.dft.numint import NLC_REMOVE_ZERO_RHO_GRID_THRESHOLD @@ -170,7 +170,7 @@ def _get_exc_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, mo_coeff_mask = cupy.take(mo_coeff, idx, axis=0, out=mo_buf[:len(idx)]) rho = numint.eval_rho2(_sorted_mol, ao_mask[0], mo_coeff_mask, mo_occ, None, xctype, buf=aow_buf) - vxc = ni.eval_xc_eff(xc_code, rho, 1, xctype=xctype)[1][0] + vxc = ni.eval_xc_eff(xc_code, rho, 1, xctype=xctype, spin=0)[1][0] wv = cupy.multiply(weight, vxc, out=vxc) aow = numint._scale_ao(ao_mask[0], wv, out=aow_buf) vtmp = _d1_dot_(ao_mask[1:4], aow.T, out=vtmp_buf) @@ -185,7 +185,7 @@ def _get_exc_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, mo_coeff_mask = cupy.take(mo_coeff, idx, axis=0, out=mo_buf[:len(idx)]) rho = numint.eval_rho2(_sorted_mol, ao_mask[:4], mo_coeff_mask, mo_occ, None, xctype, buf=aow_buf) - vxc = ni.eval_xc_eff(xc_code, rho, 1, xctype=xctype, buf=aow_buf)[1] + vxc = ni.eval_xc_eff(xc_code, rho, 1, xctype=xctype, spin=0, work=aow_buf)[1] wv = cupy.multiply(weight, vxc, out=vxc) wv[0] *= .5 vtmp = _gga_grad_sum_(ao_mask, wv, buf=aow_buf, out=vtmp_buf) @@ -203,7 +203,7 @@ def _get_exc_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, mo_coeff_mask = cupy.take(mo_coeff, idx, axis=0, out=mo_buf[:len(idx)]) rho = numint.eval_rho2(_sorted_mol, ao_mask[:4], mo_coeff_mask, mo_occ, None, xctype, with_lapl=False, buf=aow_buf) - vxc = ni.eval_xc_eff(xc_code, rho, 1, xctype=xctype, buf=aow_buf)[1] + vxc = ni.eval_xc_eff(xc_code, rho, 1, xctype=xctype, spin=0, work=aow_buf)[1] wv = cupy.multiply(weight, vxc, out=vxc) wv[0] *= .5 wv[4] *= .5 # for the factor 1/2 in tau @@ -461,8 +461,7 @@ def get_exc_full_response(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, rho[:, g0:g1] = numint.eval_rho(_sorted_mol, ao, dms_masked, xctype = xctype, hermi = 1) assert g1 == ngrids - exc, vxc = ni.eval_xc_eff(xc_code, rho, 1, xctype=xctype)[:2] - exc = exc[:,0] + exc, vxc = ni.eval_xc_eff(xc_code, rho, 1, xctype=xctype, spin=0)[:2] wv = grids.weights * vxc nonzero_weight_mask = cupy.abs(grids.weights) > 1e-14 @@ -613,7 +612,7 @@ def get_nlc_exc_full_response(ni, mol, grids, xc_code, dms, relativity=0, hermi= ngrids = grids_coords.shape[0] nabla_rho_i = cupy.ascontiguousarray(rho_drho[1:4, rho_nonzero_mask]) - gamma_i = batched_vec3_norm2(nabla_rho_i) + gamma_i = batched_vec_norm2(nabla_rho_i.T) omega_i = cupy.empty(ngrids) domega_drho_i = cupy.empty(ngrids) diff --git a/gpu4pyscf/grad/tdrks.py b/gpu4pyscf/grad/tdrks.py index 38a36a2ed..4f0292ed5 100644 --- a/gpu4pyscf/grad/tdrks.py +++ b/gpu4pyscf/grad/tdrks.py @@ -386,20 +386,7 @@ def _contract_xc_kernel(td_grad, xc_code, dmvo, dmoo=None, ao0 = ao mo_coeff_mask = mo_coeff[mask, :] rho = ni.eval_rho2(_sorted_mol, ao0, mo_coeff_mask, mo_occ, mask, xctype, with_lapl=False) - # quick fix - if deriv > 2: - whether_use_gpu = os.environ.get('LIBXC_ON_GPU', '0') == '1' - if not whether_use_gpu: - ni_cpu = numint_cpu() - # TODO: If the libxc is stablized, this should be gpulized - vxc, fxc, kxc = ni_cpu.eval_xc_eff(xc_code, rho.get(), deriv, xctype=xctype)[1:] - if isinstance(vxc,np.ndarray): vxc = cp.asarray(vxc) - if isinstance(fxc,np.ndarray): fxc = cp.asarray(fxc) - if isinstance(kxc,np.ndarray): kxc = cp.asarray(kxc) - else: - vxc, fxc, kxc = ni.eval_xc_eff(xc_code, rho, deriv, xctype=xctype)[1:] - else: - vxc, fxc, kxc = ni.eval_xc_eff(xc_code, rho, deriv, xctype=xctype)[1:] + vxc, fxc, kxc = ni.eval_xc_eff(xc_code, rho, deriv, xctype=xctype, spin=0)[1:] dmvo_mask = dmvo[mask[:, None], mask] rho1 = ( ni.eval_rho(_sorted_mol, ao0, dmvo_mask, mask, xctype, hermi=1, with_lapl=False) * 2 @@ -451,16 +438,7 @@ def _contract_xc_kernel(td_grad, xc_code, dmvo, dmoo=None, rho = ni.eval_rho2(_sorted_mol, ao0, mo_coeff_mask, mo_occ, mask, xctype, with_lapl=False) rho *= 0.5 rho = cp.repeat(rho[cp.newaxis], 2, axis=0) - # quick fix - # if deriv > 2: - # ni_cpu = numint_cpu() - # vxc, fxc, kxc = ni_cpu.eval_xc_eff(xc_code, rho.get(), deriv, xctype=xctype)[1:] - # if isinstance(vxc,np.ndarray): vxc = cp.asarray(vxc) - # if isinstance(fxc,np.ndarray): fxc = cp.asarray(fxc) - # if isinstance(kxc,np.ndarray): kxc = cp.asarray(kxc) - # else: - # vxc, fxc, kxc = ni.eval_xc_eff(xc_code, rho, deriv, xctype=xctype)[1:] - vxc, fxc, kxc = ni.eval_xc_eff(xc_code, rho, deriv, xctype=xctype)[1:] + vxc, fxc, kxc = ni.eval_xc_eff(xc_code, rho, deriv, xctype=xctype, spin=1)[1:] # fxc_t couples triplet excitation amplitudes # 1/2 int (tia - tIA) fxc (tjb - tJB) = tia fxc_t tjb fxc_t = fxc[:, :, 0] - fxc[:, :, 1] diff --git a/gpu4pyscf/grad/tduks.py b/gpu4pyscf/grad/tduks.py index e9c473350..58cba3ff1 100644 --- a/gpu4pyscf/grad/tduks.py +++ b/gpu4pyscf/grad/tduks.py @@ -407,20 +407,7 @@ def _contract_xc_kernel(td_grad, xc_code, dmvo, dmoo=None, with_vxc=True, with_k rho = cp.asarray(( ni.eval_rho2(_sorted_mol, ao0, mo_coeff_mask_a, mo_occ[0], mask, xctype,with_lapl=False), ni.eval_rho2(_sorted_mol, ao0, mo_coeff_mask_b, mo_occ[1], mask, xctype, with_lapl=False))) - if deriv > 2: - whether_use_gpu = os.environ.get('LIBXC_ON_GPU', '0') == '1' - if not whether_use_gpu: - ni_cpu = numint_cpu() - # TODO: If the libxc is stablized, this should be gpulized - # vxc, fxc, kxc = ni.eval_xc_eff(xc_code, rho, deriv, xctype=xctype)[1:] - vxc, fxc, kxc = ni_cpu.eval_xc_eff(xc_code, rho.get(), deriv, xctype=xctype)[1:] - vxc = cp.asarray(vxc) - fxc = cp.asarray(fxc) - kxc = cp.asarray(kxc) - else: - vxc, fxc, kxc = ni.eval_xc_eff(xc_code, rho, deriv, xctype=xctype)[1:] - else: - vxc, fxc, kxc = ni.eval_xc_eff(xc_code, rho, deriv, xctype=xctype)[1:] + vxc, fxc, kxc = ni.eval_xc_eff(xc_code, rho, deriv, xctype=xctype, spin=1)[1:] dmvo_mask_a = dmvo[0, mask[:, None], mask] dmvo_mask_b = dmvo[1, mask[:, None], mask] rho1 = cp.asarray(( diff --git a/gpu4pyscf/grad/tduks_sf.py b/gpu4pyscf/grad/tduks_sf.py index 72bfe5d34..8dec52ec2 100644 --- a/gpu4pyscf/grad/tduks_sf.py +++ b/gpu4pyscf/grad/tduks_sf.py @@ -371,15 +371,7 @@ def _contract_xc_kernel(td_grad, xc_code, dmvo, dmoo=None, with_vxc=True, with_k ) if td_grad.base.collinear == 'mcol': - whether_use_gpu = os.environ.get('LIBXC_ON_GPU', '0') == '1' - if deriv == 3: - if whether_use_gpu: - eval_xc_eff = mcfun_eval_xc_adapter_sf(ni, xc_code, td_grad.base.collinear_samples) - else: - ni_cpu = ni.to_cpu() - eval_xc_eff = mcfun_eval_xc_adapter_sf(ni_cpu, xc_code, td_grad.base.collinear_samples) - else: - eval_xc_eff = mcfun_eval_xc_adapter_sf(ni, xc_code, td_grad.base.collinear_samples) + eval_xc_eff = mcfun_eval_xc_adapter_sf(ni, xc_code, td_grad.base.collinear_samples) elif td_grad.base.collinear == 'ncol': raise NotImplementedError('Locally collinear approach is not implemented') diff --git a/gpu4pyscf/grad/uks.py b/gpu4pyscf/grad/uks.py index b42ef3739..a79696de2 100644 --- a/gpu4pyscf/grad/uks.py +++ b/gpu4pyscf/grad/uks.py @@ -158,7 +158,7 @@ def _get_exc_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, mo_coeff_mask = cupy.take(mo_coeff[1], idx, axis=0, out=mo_buf[:nao_sub]) eval_rho2(_sorted_mol, ao_mask[0], mo_coeff_mask, mo_occ[1], None, xctype, buf=aow_buf, out=rho[1]) - vxc = ni.eval_xc_eff(xc_code, rho, 1, xctype=xctype)[1][:,0] + vxc = ni.eval_xc_eff(xc_code, rho, 1, xctype=xctype, spin=1)[1][:,0] wv = cupy.multiply(weight, vxc, out=vxc) aow = numint._scale_ao(ao_mask[0], wv[0], out=aow_buf) vtmp = rks_grad._d1_dot_(ao_mask[1:4], aow.T, out=vtmp_buf) @@ -181,7 +181,7 @@ def _get_exc_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, mo_coeff_mask = cupy.take(mo_coeff[1], idx, axis=0, out=mo_buf[:nao_sub]) eval_rho2(_sorted_mol, ao_mask[:4], mo_coeff_mask, mo_occ[1], None, xctype, buf=aow_buf, out=rho[1]) - vxc = ni.eval_xc_eff(xc_code, rho, 1, xctype=xctype)[1] + vxc = ni.eval_xc_eff(xc_code, rho, 1, xctype=xctype, spin=1)[1] wv = cupy.multiply(weight, vxc, out=vxc) wv[:,0] *= .5 vtmp = rks_grad._gga_grad_sum_(ao_mask, wv[0], buf=aow_buf, out=vtmp_buf) @@ -208,7 +208,7 @@ def _get_exc_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, mo_coeff_mask = cupy.take(mo_coeff[1], idx, axis=0, out=mo_buf[:nao_sub]) eval_rho2(_sorted_mol, ao_mask[:4], mo_coeff_mask, mo_occ[1], None, xctype, buf=aow_buf, out=rho[1]) - vxc = ni.eval_xc_eff(xc_code, rho, 1, xctype=xctype)[1] + vxc = ni.eval_xc_eff(xc_code, rho, 1, xctype=xctype, spin=1)[1] wv = cupy.multiply(weight, vxc, out=vxc) wv[:,0] *= .5 wv[:,4] *= .5 # for the factor 1/2 in tau @@ -304,8 +304,7 @@ def get_exc_full_response(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, rho[1, :, g0:g1] = numint.eval_rho(_sorted_mol, ao, dmb_masked, xctype = xctype, hermi = 1) assert g1 == ngrids - exc, vxc = ni.eval_xc_eff(xc_code, rho, 1, xctype=xctype)[:2] - exc = exc[:,0] + exc, vxc = ni.eval_xc_eff(xc_code, rho, 1, xctype=xctype, spin=1)[:2] wv = grids.weights * vxc nonzero_weight_mask = cupy.abs(grids.weights) > 1e-14 diff --git a/gpu4pyscf/hessian/rks.py b/gpu4pyscf/hessian/rks.py index 43c0a5e5c..aa5f0873e 100644 --- a/gpu4pyscf/hessian/rks.py +++ b/gpu4pyscf/hessian/rks.py @@ -28,8 +28,9 @@ from gpu4pyscf.grad import rhf as rhf_grad from gpu4pyscf.grad import rks as rks_grad from gpu4pyscf.dft import numint -from gpu4pyscf.lib.cupy_helper import (contract, add_sparse, get_avail_mem, - reduce_to_device, transpose_sum, take_last2d, batched_vec3_norm2) +from gpu4pyscf.lib.cupy_helper import ( + contract, add_sparse, get_avail_mem, reduce_to_device, transpose_sum, + take_last2d, batched_vec_norm2) from gpu4pyscf.lib import logger from gpu4pyscf.__config__ import num_devices, min_grid_blksize from gpu4pyscf.dft.numint import NLC_REMOVE_ZERO_RHO_GRID_THRESHOLD, _contract_rho1_fxc @@ -231,7 +232,7 @@ def _get_vxc_diag(hessobj, mo_coeff, mo_occ, max_memory): mo_coeff_mask = mo_coeff[mask,:] rho = numint.eval_rho2(_sorted_mol, ao[0], mo_coeff_mask, mo_occ, mask, xctype, buf=aow_buf, out=rho) - vxc = ni.eval_xc_eff(mf.xc, rho, 1, xctype=xctype)[1][0] + vxc = ni.eval_xc_eff(mf.xc, rho, 1, xctype=xctype, spin=0)[1][0] wv = cupy.multiply(weight, vxc, out=vxc) aow = cupy.ndarray((nao_sub, blk_size), memptr=aow_buf.data) aow = numint._scale_ao(ao[0], wv, out=aow) @@ -260,7 +261,7 @@ def contract_ao(ao, aoidx, wv, buf, aow, out): mo_coeff_mask = mo_coeff[mask,:] rho = numint.eval_rho2(_sorted_mol, ao[:4], mo_coeff_mask, mo_occ, mask, xctype, buf=aow_buf, out=rho) - vxc = ni.eval_xc_eff(mf.xc, rho, 1, xctype=xctype)[1] + vxc = ni.eval_xc_eff(mf.xc, rho, 1, xctype=xctype, spin=0)[1] wv = cupy.multiply(weight, vxc, out=vxc) aow = cupy.ndarray((nao_sub, blk_size), memptr=aow_buf.data) aow = numint._scale_ao(ao[:4], wv[:4], out=aow) @@ -295,7 +296,7 @@ def contract_ao(ao, aoidx, wv, buf, aow, out): mo_coeff_mask = mo_coeff[mask,:] rho = numint.eval_rho2(_sorted_mol, ao[:10], mo_coeff_mask, mo_occ, mask, xctype, buf=aow_buf, out=rho) - vxc = ni.eval_xc_eff(mf.xc, rho, 1, xctype=xctype)[1] + vxc = ni.eval_xc_eff(mf.xc, rho, 1, xctype=xctype, spin=0)[1] wv = cupy.multiply(weight, vxc, out=vxc) wv[4] *= .5 # for the factor 1/2 in tau aow = cupy.ndarray((3, nao_sub, blk_size), memptr=aow_buf.data) @@ -457,7 +458,7 @@ def _get_vxc_deriv2_task(hessobj, grids, mo_coeff, mo_occ, max_memory, device_id rho = numint.eval_rho2(_sorted_mol, ao1[0], mo_coeff, mo_occ, mask, xctype, buf=aow_buf, out=rho) t1 = log.timer_debug2('eval rho', *t1) - vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3] + vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype, spin=0)[1:3] t1 = log.timer_debug2('eval vxc', *t1) wv1 = cupy.multiply(weight, vxc[0], out=vxc[0]) wf = cupy.multiply(weight, fxc[0,0], out=fxc[0,0]) @@ -514,7 +515,7 @@ def _get_vxc_deriv2_task(hessobj, grids, mo_coeff, mo_occ, max_memory, device_id ao = contract('nip,ij->njp', ao_mask, coeff[mask], out=ao1) rho = numint.eval_rho2(_sorted_mol, ao[:4], mo_coeff, mo_occ, mask, xctype, buf=aow_buf, out=rho) t1 = log.timer_debug2('eval rho', *t1) - vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3] + vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype, spin=0)[1:3] t1 = log.timer_debug2('eval vxc', *t1) wv1 = cupy.multiply(weight, vxc, out=vxc) wf = cupy.multiply(weight, fxc, out=fxc) @@ -574,7 +575,7 @@ def _get_vxc_deriv2_task(hessobj, grids, mo_coeff, mo_occ, max_memory, device_id ao = contract('nip,ij->njp', ao_mask, coeff[mask], out=ao1) rho = numint.eval_rho2(_sorted_mol, ao[:10], mo_coeff, mo_occ, mask, xctype, buf=aow_buf, out=rho) t1 = log.timer_debug2('eval rho', *t1) - vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3] + vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype, spin=0)[1:3] t1 = log.timer_debug2('eval vxc', *t1) wv1 = cupy.multiply(weight, vxc, out=vxc) wf = cupy.multiply(weight, fxc, out=fxc) @@ -802,8 +803,8 @@ def _get_enlc_deriv2(hessobj, mo_coeff, mo_occ, max_memory, log = None): grids_weights = grids.weights[rho_nonzero_mask] ngrids = grids_coords.shape[0] - nabla_rho_i = cupy.ascontiguousarray(rho_drho[1:4, rho_nonzero_mask]) - gamma_i = batched_vec3_norm2(nabla_rho_i) + nabla_rho_i = rho_drho[1:4, rho_nonzero_mask] + gamma_i = batched_vec_norm2(nabla_rho_i.T) stream = cupy.cuda.get_current_stream() @@ -1307,7 +1308,7 @@ def _get_vxc_deriv1_task(hessobj, grids, mo_coeff, mo_occ, max_memory, device_id rho = numint.eval_rho2(_sorted_mol, ao1[0], mo_coeff, mo_occ, mask, xctype, buf=aow_buf, out=rho) t1 = log.timer_debug2('eval rho', *t1) - vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3] + vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype, spin=0)[1:3] t1 = log.timer_debug2('eval vxc', *t1) wv1 = cupy.multiply(weight, vxc[0], out=vxc[0]) wf = cupy.multiply(weight, fxc[0,0], out=fxc[0,0]) @@ -1355,7 +1356,7 @@ def _get_vxc_deriv1_task(hessobj, grids, mo_coeff, mo_occ, max_memory, device_id ao1 = contract('nip,ij->njp', ao, coeff[mask], out=ao1) rho = numint.eval_rho2(_sorted_mol, ao1[:4], mo_coeff, mo_occ, mask, xctype, buf=aow_buf, out=rho) t1 = log.timer_debug2('eval rho', *t1) - vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3] + vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype, spin=0)[1:3] t1 = log.timer_debug2('eval vxc', *t1) wv = cupy.multiply(weight, vxc, out=vxc) wv[0] *= .5 @@ -1404,7 +1405,7 @@ def _get_vxc_deriv1_task(hessobj, grids, mo_coeff, mo_occ, max_memory, device_id ao1 = contract('nip,ij->njp', ao, coeff[mask], out=ao1) rho = numint.eval_rho2(_sorted_mol, ao1[:10], mo_coeff, mo_occ, mask, xctype, buf=aow_buf, out=rho) t1 = log.timer_debug2('eval rho', *t1) - vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3] + vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype, spin=0)[1:3] t1 = log.timer_debug2('eval vxc', *t0) wv = cupy.multiply(weight, vxc, out=vxc) wf = cupy.multiply(weight, fxc, out=fxc) @@ -1559,7 +1560,7 @@ def _get_vxc_deriv1_grid_response(hessobj, mo_coeff, mo_occ, max_memory): mocc_masked = mocc_sorted[idx, :] rho = numint.eval_rho(_sorted_mol, ao[0], dm0_masked, xctype = xctype, hermi = 1) - vxc, fxc = ni.eval_xc_eff(mf.xc, rho, deriv = 2, xctype=xctype)[1:3] + vxc, fxc = ni.eval_xc_eff(mf.xc, rho, deriv = 2, xctype=xctype, spin=0)[1:3] del rho depsilon_drho = vxc[0] # Just of shape (ngrids,) @@ -1634,7 +1635,7 @@ def _get_vxc_deriv1_grid_response(hessobj, mo_coeff, mo_occ, max_memory): mocc_masked = mocc_sorted[idx, :] rho = numint.eval_rho(_sorted_mol, ao[:4], dm0_masked, xctype = xctype, hermi = 1) - vxc, fxc = ni.eval_xc_eff(mf.xc, rho, deriv = 2, xctype=xctype)[1:3] + vxc, fxc = ni.eval_xc_eff(mf.xc, rho, deriv = 2, xctype=xctype, spin=0)[1:3] del rho dw_dA = get_dweight_dA(_sorted_mol, grids, (g0,g1)) @@ -1745,7 +1746,7 @@ def _get_vxc_deriv1_grid_response(hessobj, mo_coeff, mo_occ, max_memory): mocc_masked = mocc_sorted[idx, :] rho = numint.eval_rho(_sorted_mol, ao[:4], dm0_masked, xctype = xctype, hermi = 1) - vxc, fxc = ni.eval_xc_eff(mf.xc, rho, deriv = 2, xctype=xctype)[1:3] + vxc, fxc = ni.eval_xc_eff(mf.xc, rho, deriv = 2, xctype=xctype, spin=0)[1:3] del rho dw_dA = get_dweight_dA(_sorted_mol, grids, (g0,g1)) @@ -2121,8 +2122,8 @@ def _get_vnlc_deriv1(hessobj, mo_coeff, mo_occ, max_memory): grids_weights = grids.weights[rho_nonzero_mask] ngrids = grids_coords.shape[0] - nabla_rho_i = cupy.ascontiguousarray(rho_drho[1:4, rho_nonzero_mask]) - gamma_i = batched_vec3_norm2(nabla_rho_i) + nabla_rho_i = rho_drho[1:4, rho_nonzero_mask] + gamma_i = batched_vec_norm2(nabla_rho_i.T) stream = cupy.cuda.get_current_stream() @@ -3237,9 +3238,9 @@ def _get_exc_deriv2_grid_response(hessobj, mo_coeff, mo_occ, max_memory): dm0_masked = take_last2d(dm0_sorted, idx, out = dm_mask_buf) rho = numint.eval_rho(_sorted_mol, ao, dm0_masked, xctype = xctype, hermi = 1) - exc = ni.eval_xc_eff(mf.xc, rho, deriv = 0, xctype=xctype)[0] + exc = ni.eval_xc_eff(mf.xc, rho, deriv = 0, xctype=xctype, spin=0)[0] - epsilon = exc[:, 0] * rho + epsilon = exc * rho del rho, exc d2w_dAdB = get_d2weight_dAdB(_sorted_mol, grids, (g0,g1)) @@ -3265,7 +3266,7 @@ def _get_exc_deriv2_grid_response(hessobj, mo_coeff, mo_occ, max_memory): dm0_masked = take_last2d(dm0_sorted, idx, out = dm_mask_buf) rho = numint.eval_rho(_sorted_mol, ao[0], dm0_masked, xctype = xctype, hermi = 1) - vxc, fxc = ni.eval_xc_eff(mf.xc, rho, deriv = 2, xctype = xctype)[1:3] + vxc, fxc = ni.eval_xc_eff(mf.xc, rho, deriv = 2, xctype = xctype, spin=0)[1:3] del rho depsilon_drho = vxc[0] @@ -3314,9 +3315,9 @@ def _get_exc_deriv2_grid_response(hessobj, mo_coeff, mo_occ, max_memory): dm0_masked = take_last2d(dm0_sorted, idx, out = dm_mask_buf) rho = numint.eval_rho(_sorted_mol, ao[:4], dm0_masked, xctype = xctype, hermi = 1) - exc = ni.eval_xc_eff(mf.xc, rho, deriv = 0, xctype=xctype)[0] + exc = ni.eval_xc_eff(mf.xc, rho, deriv = 0, xctype=xctype, spin=0)[0] - epsilon = exc[:, 0] * rho[0, :] + epsilon = exc * rho[0, :] del rho, exc d2w_dAdB = get_d2weight_dAdB(_sorted_mol, grids, (g0,g1)) @@ -3343,7 +3344,7 @@ def _get_exc_deriv2_grid_response(hessobj, mo_coeff, mo_occ, max_memory): dm0_masked = take_last2d(dm0_sorted, idx, out = dm_mask_buf) rho = numint.eval_rho(_sorted_mol, ao[:4], dm0_masked, xctype = xctype, hermi = 1) - vxc, fxc = ni.eval_xc_eff(mf.xc, rho, deriv = 2, xctype = xctype)[1:3] + vxc, fxc = ni.eval_xc_eff(mf.xc, rho, deriv = 2, xctype = xctype, spin=0)[1:3] del rho depsilon_drho = vxc[0] @@ -3399,9 +3400,9 @@ def _get_exc_deriv2_grid_response(hessobj, mo_coeff, mo_occ, max_memory): dm0_masked = take_last2d(dm0_sorted, idx, out = dm_mask_buf) rho = numint.eval_rho(_sorted_mol, ao[:4], dm0_masked, xctype = xctype, hermi = 1) - exc = ni.eval_xc_eff(mf.xc, rho, deriv = 0, xctype=xctype)[0] + exc = ni.eval_xc_eff(mf.xc, rho, deriv = 0, xctype=xctype, spin=0)[0] - epsilon = exc[:, 0] * rho[0, :] + epsilon = exc * rho[0, :] del rho, exc d2w_dAdB = get_d2weight_dAdB(_sorted_mol, grids, (g0,g1)) @@ -3428,7 +3429,7 @@ def _get_exc_deriv2_grid_response(hessobj, mo_coeff, mo_occ, max_memory): dm0_masked = take_last2d(dm0_sorted, idx, out = dm_mask_buf) rho = numint.eval_rho(_sorted_mol, ao[:4], dm0_masked, xctype = xctype, hermi = 1) - vxc, fxc = ni.eval_xc_eff(mf.xc, rho, deriv = 2, xctype = xctype)[1:3] + vxc, fxc = ni.eval_xc_eff(mf.xc, rho, deriv = 2, xctype = xctype, spin=0)[1:3] del rho depsilon_drho = vxc[0] @@ -3696,7 +3697,7 @@ def nr_rks_fnlc_mo(mf, mol, mo_coeff, mo_occ, dm1s, return_in_mo = True): grids_weights = grids.weights[rho_nonzero_mask] ngrids = grids_coords.shape[0] - gamma_i = batched_vec3_norm2(nabla_rho_i) + gamma_i = batched_vec_norm2(nabla_rho_i.T) stream = cupy.cuda.get_current_stream() diff --git a/gpu4pyscf/hessian/uks.py b/gpu4pyscf/hessian/uks.py index 89c242017..a494332ff 100644 --- a/gpu4pyscf/hessian/uks.py +++ b/gpu4pyscf/hessian/uks.py @@ -243,7 +243,7 @@ def _get_vxc_diag(hessobj, mo_coeff, mo_occ, max_memory): mo_coeff_mask = mo_coeff[:,mask,:] rhoa = numint.eval_rho2(_sorted_mol, ao[0], mo_coeff_mask[0], mo_occ[0], mask, xctype) rhob = numint.eval_rho2(_sorted_mol, ao[0], mo_coeff_mask[1], mo_occ[1], mask, xctype) - vxc = ni.eval_xc_eff(mf.xc, cupy.asarray((rhoa,rhob)), 1, xctype=xctype)[1] + vxc = ni.eval_xc_eff(mf.xc, cupy.asarray((rhoa,rhob)), 1, xctype=xctype, spin=1)[1] wv = weight * vxc[:,0] aowa = numint._scale_ao(ao[0], wv[0]) aowb = numint._scale_ao(ao[0], wv[1]) @@ -267,7 +267,7 @@ def contract_(ao, aoidx, wv, mask): mo_coeff_mask = mo_coeff[:,mask,:] rhoa = numint.eval_rho2(_sorted_mol, ao[:4], mo_coeff_mask[0], mo_occ[0], mask, xctype) rhob = numint.eval_rho2(_sorted_mol, ao[:4], mo_coeff_mask[1], mo_occ[1], mask, xctype) - vxc = ni.eval_xc_eff(mf.xc, cupy.asarray((rhoa,rhob)), 1, xctype=xctype)[1] + vxc = ni.eval_xc_eff(mf.xc, cupy.asarray((rhoa,rhob)), 1, xctype=xctype, spin=1)[1] wv = weight * vxc #:aow = numpy.einsum('npi,np->pi', ao[:4], wv[:4]) aowa = numint._scale_ao(ao[:4], wv[0,:4]) @@ -306,7 +306,7 @@ def contract_(ao, aoidx, wv, mask): mo_coeff_mask = mo_coeff[:,mask,:] rhoa = numint.eval_rho2(_sorted_mol, ao[:10], mo_coeff_mask[0], mo_occ[0], mask, xctype) rhob = numint.eval_rho2(_sorted_mol, ao[:10], mo_coeff_mask[1], mo_occ[1], mask, xctype) - vxc = ni.eval_xc_eff(mf.xc, cupy.asarray((rhoa,rhob)), 1, xctype=xctype)[1] + vxc = ni.eval_xc_eff(mf.xc, cupy.asarray((rhoa,rhob)), 1, xctype=xctype, spin=1)[1] wv = weight * vxc wv[:,4] *= .5 # for the factor 1/2 in tau #:aow = numpy.einsum('npi,np->pi', ao[:4], wv[:4]) @@ -463,7 +463,7 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): rhoa = numint.eval_rho2(_sorted_mol, ao[0], mo_coeff[0], mo_occ[0], mask, xctype) rhob = numint.eval_rho2(_sorted_mol, ao[0], mo_coeff[1], mo_occ[1], mask, xctype) t1 = log.timer_debug2('eval rho', *t1) - vxc, fxc = ni.eval_xc_eff(mf.xc, cupy.asarray((rhoa,rhob)), 2, xctype=xctype)[1:3] + vxc, fxc = ni.eval_xc_eff(mf.xc, cupy.asarray((rhoa,rhob)), 2, xctype=xctype, spin=1)[1:3] t1 = log.timer_debug2('eval vxc', *t1) wv = weight * vxc[:,0] aowa = [numint._scale_ao(ao[i], wv[0]) for i in range(1, 4)] @@ -512,7 +512,7 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): rhoa = numint.eval_rho2(_sorted_mol, ao[:4], mo_coeff[0], mo_occ[0], mask, xctype) rhob = numint.eval_rho2(_sorted_mol, ao[:4], mo_coeff[1], mo_occ[1], mask, xctype) t1 = log.timer_debug2('eval rho', *t1) - vxc, fxc = ni.eval_xc_eff(mf.xc, cupy.asarray((rhoa,rhob)), 2, xctype=xctype)[1:3] + vxc, fxc = ni.eval_xc_eff(mf.xc, cupy.asarray((rhoa,rhob)), 2, xctype=xctype, spin=1)[1:3] t1 = log.timer_debug2('eval vxc', *t1) wv = weight * vxc wv[:,0] *= .5 @@ -574,7 +574,7 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): rhoa = numint.eval_rho2(_sorted_mol, ao[:10], mo_coeff[0], mo_occ[0], mask, xctype) rhob = numint.eval_rho2(_sorted_mol, ao[:10], mo_coeff[1], mo_occ[1], mask, xctype) t1 = log.timer_debug2('eval rho', *t1) - vxc, fxc = ni.eval_xc_eff(mf.xc, cupy.asarray((rhoa, rhob)), 2, xctype=xctype)[1:3] + vxc, fxc = ni.eval_xc_eff(mf.xc, cupy.asarray((rhoa, rhob)), 2, xctype=xctype, spin=1)[1:3] t1 = log.timer_debug2('eval vxc', *t1) wv = weight * vxc wv[:,0] *= .5 @@ -968,10 +968,10 @@ def _get_exc_deriv2_grid_response(hessobj, mo_coeff, mo_occ, max_memory): rhoa = numint.eval_rho(_sorted_mol, ao, dm0_masked[0], xctype = xctype, hermi = 1) rhob = numint.eval_rho(_sorted_mol, ao, dm0_masked[1], xctype = xctype, hermi = 1) rho = cupy.asarray((rhoa, rhob)) - exc = ni.eval_xc_eff(mf.xc, rho, deriv = 0, xctype=xctype)[0] + exc = ni.eval_xc_eff(mf.xc, rho, deriv = 0, xctype=xctype, spin=1)[0] del rho - epsilon = exc[:, 0] * (rhoa + rhob) + epsilon = exc * (rhoa + rhob) del rhoa, rhob, exc d2w_dAdB = get_d2weight_dAdB(_sorted_mol, grids, (g0,g1)) @@ -999,7 +999,7 @@ def _get_exc_deriv2_grid_response(hessobj, mo_coeff, mo_occ, max_memory): rhoa = numint.eval_rho(_sorted_mol, ao[0], dm0_masked[0], xctype = xctype, hermi = 1) rhob = numint.eval_rho(_sorted_mol, ao[0], dm0_masked[1], xctype = xctype, hermi = 1) rho = cupy.asarray((rhoa, rhob)) - vxc, fxc = ni.eval_xc_eff(mf.xc, rho, deriv = 2, xctype = xctype)[1:3] + vxc, fxc = ni.eval_xc_eff(mf.xc, rho, deriv = 2, xctype = xctype, spin=1)[1:3] del rhoa, rhob, rho depsilon_drho = vxc[:,0,:] # Just of shape (2,ngrids) @@ -1053,10 +1053,10 @@ def _get_exc_deriv2_grid_response(hessobj, mo_coeff, mo_occ, max_memory): rhoa = numint.eval_rho(_sorted_mol, ao, dm0_masked[0], xctype = xctype, hermi = 1) rhob = numint.eval_rho(_sorted_mol, ao, dm0_masked[1], xctype = xctype, hermi = 1) rho = cupy.asarray((rhoa, rhob)) - exc = ni.eval_xc_eff(mf.xc, rho, deriv = 0, xctype=xctype)[0] + exc = ni.eval_xc_eff(mf.xc, rho, deriv = 0, xctype=xctype, spin=1)[0] del rho - epsilon = exc[:, 0] * (rhoa[0, :] + rhob[0, :]) + epsilon = exc * (rhoa[0, :] + rhob[0, :]) del rhoa, rhob, exc d2w_dAdB = get_d2weight_dAdB(_sorted_mol, grids, (g0,g1)) @@ -1085,7 +1085,7 @@ def _get_exc_deriv2_grid_response(hessobj, mo_coeff, mo_occ, max_memory): rhoa = numint.eval_rho(_sorted_mol, ao[:4], dm0_masked[0], xctype = xctype, hermi = 1) rhob = numint.eval_rho(_sorted_mol, ao[:4], dm0_masked[1], xctype = xctype, hermi = 1) rho = cupy.asarray((rhoa, rhob)) - vxc, fxc = ni.eval_xc_eff(mf.xc, rho, deriv = 2, xctype = xctype)[1:3] + vxc, fxc = ni.eval_xc_eff(mf.xc, rho, deriv = 2, xctype = xctype, spin=1)[1:3] del rhoa, rhob, rho depsilon_drho = vxc[:, 0, :] @@ -1144,10 +1144,10 @@ def _get_exc_deriv2_grid_response(hessobj, mo_coeff, mo_occ, max_memory): rhoa = numint.eval_rho(_sorted_mol, ao[:4], dm0_masked[0], xctype = xctype, hermi = 1) rhob = numint.eval_rho(_sorted_mol, ao[:4], dm0_masked[1], xctype = xctype, hermi = 1) rho = cupy.asarray((rhoa, rhob)) - exc = ni.eval_xc_eff(mf.xc, rho, deriv = 0, xctype=xctype)[0] + exc = ni.eval_xc_eff(mf.xc, rho, deriv = 0, xctype=xctype, spin=1)[0] del rho - epsilon = exc[:, 0] * (rhoa[0, :] + rhob[0, :]) + epsilon = exc * (rhoa[0, :] + rhob[0, :]) del rhoa, rhob, exc d2w_dAdB = get_d2weight_dAdB(_sorted_mol, grids, (g0,g1)) @@ -1176,7 +1176,7 @@ def _get_exc_deriv2_grid_response(hessobj, mo_coeff, mo_occ, max_memory): rhoa = numint.eval_rho(_sorted_mol, ao[:4], dm0_masked[0], xctype = xctype, hermi = 1) rhob = numint.eval_rho(_sorted_mol, ao[:4], dm0_masked[1], xctype = xctype, hermi = 1) rho = cupy.asarray((rhoa, rhob)) - vxc, fxc = ni.eval_xc_eff(mf.xc, rho, deriv = 2, xctype = xctype)[1:3] + vxc, fxc = ni.eval_xc_eff(mf.xc, rho, deriv = 2, xctype = xctype, spin=1)[1:3] del rhoa, rhob, rho depsilon_drho = vxc[:, 0, :] @@ -1287,7 +1287,7 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory): rhoa = numint.eval_rho2(_sorted_mol, ao[0], mo_coeff[0], mo_occ[0], mask, xctype) rhob = numint.eval_rho2(_sorted_mol, ao[0], mo_coeff[1], mo_occ[1], mask, xctype) t1 = log.timer_debug2('eval rho', *t1) - vxc, fxc = ni.eval_xc_eff(mf.xc, cupy.asarray((rhoa,rhob)), 2, xctype=xctype)[1:3] + vxc, fxc = ni.eval_xc_eff(mf.xc, cupy.asarray((rhoa,rhob)), 2, xctype=xctype, spin=1)[1:3] t1 = log.timer_debug2('eval vxc', *t1) wv = weight * vxc[:,0] aow = numint._scale_ao(ao[0], wv[0]) @@ -1326,7 +1326,7 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory): rhoa = numint.eval_rho2(_sorted_mol, ao[:4], mo_coeff[0], mo_occ[0], mask, xctype) rhob = numint.eval_rho2(_sorted_mol, ao[:4], mo_coeff[1], mo_occ[1], mask, xctype) t1 = log.timer_debug2('eval rho', *t1) - vxc, fxc = ni.eval_xc_eff(mf.xc, cupy.asarray((rhoa,rhob)), 2, xctype=xctype)[1:3] + vxc, fxc = ni.eval_xc_eff(mf.xc, cupy.asarray((rhoa,rhob)), 2, xctype=xctype, spin=1)[1:3] t1 = log.timer_debug2('eval vxc', *t1) wv = weight * vxc wv[:,0] *= .5 @@ -1368,7 +1368,7 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory): rhoa = numint.eval_rho2(_sorted_mol, ao[:10], mo_coeff[0], mo_occ[0], mask, xctype) rhob = numint.eval_rho2(_sorted_mol, ao[:10], mo_coeff[1], mo_occ[1], mask, xctype) t1 = log.timer_debug2('eval rho', *t1) - vxc, fxc = ni.eval_xc_eff(mf.xc, cupy.asarray((rhoa,rhob)), 2, xctype=xctype)[1:3] + vxc, fxc = ni.eval_xc_eff(mf.xc, cupy.asarray((rhoa,rhob)), 2, xctype=xctype, spin=1)[1:3] t1 = log.timer_debug2('eval vxc', *t0) wv = weight * vxc wv[:,0] *= .5 @@ -1507,7 +1507,7 @@ def _get_vxc_deriv1_grid_response(hessobj, mo_coeff, mo_occ, max_memory): rhoa = numint.eval_rho(_sorted_mol, ao[0], dm0_masked[0], xctype = xctype, hermi = 1) rhob = numint.eval_rho(_sorted_mol, ao[0], dm0_masked[1], xctype = xctype, hermi = 1) rho = cupy.asarray((rhoa, rhob)) - vxc, fxc = ni.eval_xc_eff(mf.xc, rho, deriv = 2, xctype=xctype)[1:3] + vxc, fxc = ni.eval_xc_eff(mf.xc, rho, deriv = 2, xctype=xctype, spin=1)[1:3] del rhoa, rhob, rho depsilon_drho = vxc[:, 0, :] # Just of shape (2, ngrids) @@ -1594,7 +1594,7 @@ def _get_vxc_deriv1_grid_response(hessobj, mo_coeff, mo_occ, max_memory): rhoa = numint.eval_rho(_sorted_mol, ao[:4], dm0_masked[0], xctype = xctype, hermi = 1) rhob = numint.eval_rho(_sorted_mol, ao[:4], dm0_masked[1], xctype = xctype, hermi = 1) rho = cupy.asarray((rhoa, rhob)) - vxc, fxc = ni.eval_xc_eff(mf.xc, rho, deriv = 2, xctype=xctype)[1:3] + vxc, fxc = ni.eval_xc_eff(mf.xc, rho, deriv = 2, xctype=xctype, spin=1)[1:3] del rhoa, rhob, rho dw_dA = get_dweight_dA(_sorted_mol, grids, (g0,g1)) @@ -1716,7 +1716,7 @@ def _get_vxc_deriv1_grid_response(hessobj, mo_coeff, mo_occ, max_memory): rhoa = numint.eval_rho(_sorted_mol, ao[:4], dm0_masked[0], xctype = xctype, hermi = 1) rhob = numint.eval_rho(_sorted_mol, ao[:4], dm0_masked[1], xctype = xctype, hermi = 1) rho = cupy.asarray((rhoa, rhob)) - vxc, fxc = ni.eval_xc_eff(mf.xc, rho, deriv = 2, xctype=xctype)[1:3] + vxc, fxc = ni.eval_xc_eff(mf.xc, rho, deriv = 2, xctype=xctype, spin=1)[1:3] del rhoa, rhob, rho dw_dA = get_dweight_dA(_sorted_mol, grids, (g0,g1)) diff --git a/gpu4pyscf/lib/cupy_helper.py b/gpu4pyscf/lib/cupy_helper.py index 54eb75b4e..723cbb2a5 100644 --- a/gpu4pyscf/lib/cupy_helper.py +++ b/gpu4pyscf/lib/cupy_helper.py @@ -1173,7 +1173,9 @@ def sandwich_dot(a, c, out=None): out = out[0] return out -def set_conditional_mempool_malloc(n_bytes_threshold=100000000): +MEMPOOL_THRESHOLD = 100000000 + +def set_conditional_mempool_malloc(n_bytes_threshold=MEMPOOL_THRESHOLD): ''' Customize CuPy memory allocator. @@ -1210,41 +1212,107 @@ def batched_vec3_norm2(batched_vec3): assert n * 3 < np.iinfo(np.int32).max if order == "c": - fn_name = "vec3_norm2_kernel_c_order" - if fn_name not in _kernel_registery: - kernel_code = r''' - extern "C" __global__ - void vec3_norm2_kernel_c_order(const double* __restrict__ vec3, double* __restrict__ norm2, const int n) { - const int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i >= n) return; - const double x = vec3[i * 3 + 0]; - const double y = vec3[i * 3 + 1]; - const double z = vec3[i * 3 + 2]; - norm2[i] = x*x + y*y + z*z; - } - ''' - _kernel_registery[fn_name] = cupy.RawKernel(kernel_code, fn_name) + return batched_vec_norm2(batched_vec3) + else: + return batched_vec_norm2(batched_vec3.T) + +def batched_vec_norm2(vec, out=None): + ''' + einsum('gx,gx->g', vec, vec) + ''' + vec = cupy.asarray(vec) + assert vec.dtype == cupy.float64 + assert vec.ndim == 2 + n, x = vec.shape + c_order = vec.flags.c_contiguous + + if c_order: + fn_name = f'vec{x}_norm2_kernel_c_order' else: - fn_name = "vec3_norm2_kernel_f_order" - if fn_name not in _kernel_registery: - kernel_code = r''' - extern "C" __global__ - void vec3_norm2_kernel_f_order(const double* __restrict__ vec3, double* __restrict__ norm2, const int n) { - const int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i >= n) return; - const double x = vec3[n * 0 + i]; - const double y = vec3[n * 1 + i]; - const double z = vec3[n * 2 + i]; - norm2[i] = x*x + y*y + z*z; + assert vec.flags.f_contiguous + fn_name = f'vec{x}_norm2_kernel_f_order' + + if fn_name not in _kernel_registery: + if c_order: + loop = ('for (int j = 0; j < ' + str(x) + '; j++) {' + 'double s = vec[i*' + str(x) + '+j];' + 'val += s * s; }') + else: + loop = ('for (int j = 0; j < ' + str(x) + '; j++) {' + 'double s = vec[n*j+i];' + 'val += s * s; }') + kernel_code = ( + r'''extern "C" __global__ void ''' + + fn_name + r'''(double* __restrict__ vec, double* __restrict__ norm2, long long n, long long m) { + size_t off = (size_t)blockIdx.x * m * blockDim.x; + for (int k = 0; k < m; k++) { + size_t i = off + blockDim.x * k + threadIdx.x; + if (i >= n) break; + double val = 0; + ''' + loop + ''' + norm2[i] = val; } - ''' - _kernel_registery[fn_name] = cupy.RawKernel(kernel_code, fn_name) + }''') + _kernel_registery[fn_name] = cupy.RawKernel(kernel_code, fn_name) + kernel = _kernel_registery[fn_name] + out = ndarray(n, np.float64, out) + m = max(n // (2000 * 1024), 1) + kernel(((n + m*1024 - 1) // (m*1024),), (1024,), (vec, out, n, m)) + return out - batched_norm2 = cupy.zeros(n, dtype = cupy.float64) - kernel(((n + 1024 - 1) // 1024,), (1024,), (batched_vec3, batched_norm2, cupy.int32(n))) +def batched_vec_dot(vec1, vec2, out=None): + ''' + einsum('gx,gx->g', vec1, vec2) + ''' + vec1 = cupy.asarray(vec1) + vec2 = cupy.asarray(vec2) + assert vec1.dtype == cupy.float64 + assert vec2.dtype == cupy.float64 + assert vec1.ndim == 2 + assert vec1.shape == vec2.shape + n, x = vec1.shape + c_order = vec1.flags.c_contiguous + + if c_order: + assert vec2.flags.c_contiguous + fn_name = f'vec{x}_dot_kernel_c_order' + else: + assert vec1.flags.f_contiguous + assert vec2.flags.f_contiguous + fn_name = f'vec{x}_dot_kernel_f_order' - return batched_norm2 + if fn_name not in _kernel_registery: + if c_order: + loop = ('for (int j = 0; j < ' + str(x) + '; j++) {' + 'double s1 = vec1[i*' + str(x) + '+j];' + 'double s2 = vec2[i*' + str(x) + '+j];' + 'val += s1 * s2; }') + else: + loop = ('for (int j = 0; j < ' + str(x) + '; j++) {' + 'double s1 = vec1[n*j+i];' + 'double s2 = vec2[n*j+i];' + 'val += s1 * s2; }') + kernel_code = ( + r'''extern "C" __global__ void ''' + + fn_name + r'''(double* __restrict__ vec1, double* __restrict__ vec2, + double* __restrict__ norm2, long long n, long long m) { + size_t off = (size_t)blockIdx.x * m * blockDim.x; + for (int k = 0; k < m; k++) { + size_t i = off + blockDim.x * k + threadIdx.x; + if (i >= n) break; + double val = 0; + ''' + loop + ''' + norm2[i] = val; + } + }''') + _kernel_registery[fn_name] = cupy.RawKernel(kernel_code, fn_name) + + kernel = _kernel_registery[fn_name] + out = ndarray(n, np.float64, out) + m = max(n // (2000 * 1024), 1) + kernel(((n + m*1024 - 1) // (m*1024),), (1024,), (vec1, vec2, out, n, m)) + return out cholesky = cusolver.cholesky diff --git a/gpu4pyscf/nac/tdrks_grad_nacv.py b/gpu4pyscf/nac/tdrks_grad_nacv.py index cd3113323..b3b3e8ccc 100644 --- a/gpu4pyscf/nac/tdrks_grad_nacv.py +++ b/gpu4pyscf/nac/tdrks_grad_nacv.py @@ -145,12 +145,7 @@ def _contract_xc_kernel_batched(td_grad, xc_code, dmvoI, dmvoJ=None, dmoo_batch= rho *= 0.5 rho = cp.repeat(rho[cp.newaxis], 2, axis=0) - if deriv > 2 and os.environ.get('LIBXC_ON_GPU', '0') != '1': - ni_cpu = numint_cpu() - vxc, fxc, kxc = ni_cpu.eval_xc_eff(xc_code, rho.get(), deriv, xctype=xctype)[1:] - vxc, fxc, kxc = cp.asarray(vxc), cp.asarray(fxc), cp.asarray(kxc) - else: - vxc, fxc, kxc = ni.eval_xc_eff(xc_code, rho, deriv, xctype=xctype)[1:] + vxc, fxc, kxc = ni.eval_xc_eff(xc_code, rho, deriv, xctype=xctype, spin=0)[1:] # Pre-calculate Non-singlet coupling factors outside batch loop if not singlet: @@ -793,4 +788,4 @@ def __init__(self, td): def get_nacv_multi(self, x_list, y_list, E_list, singlet=True, ge_targets=None, ee_pairs=None, grad_state_idx=None, atmlst=None, verbose=logger.INFO): return get_nacv_multi(self, x_list, y_list, E_list, singlet=singlet, ge_targets=ge_targets, - ee_pairs=ee_pairs, grad_state_idx=grad_state_idx, atmlst=atmlst, verbose=verbose) \ No newline at end of file + ee_pairs=ee_pairs, grad_state_idx=grad_state_idx, atmlst=atmlst, verbose=verbose) diff --git a/gpu4pyscf/pbc/dft/multigrid.py b/gpu4pyscf/pbc/dft/multigrid.py index 534726319..b1dda8157 100644 --- a/gpu4pyscf/pbc/dft/multigrid.py +++ b/gpu4pyscf/pbc/dft/multigrid.py @@ -535,11 +535,8 @@ def nr_rks(ni, cell, grids, xc_code, dm_kpts, relativity=0, hermi=1, rhoR = cp.asarray(rhoR.reshape(nvar,ngrids), order='C') nelec = float(rhoR[0].sum().real.get()) * weight - if xctype == 'LDA': - exc, vxc = ni.eval_xc_eff(xc_code, rhoR[0], deriv=1, xctype=xctype)[:2] - else: - exc, vxc = ni.eval_xc_eff(xc_code, rhoR, deriv=1, xctype=xctype)[:2] - excsum = float(rhoR[0].dot(exc[:,0]).real.get()) * weight + exc, vxc = ni.eval_xc_eff(xc_code, rhoR, deriv=1, xctype=xctype, spin=0)[:2] + excsum = float(rhoR[0].dot(exc).real.get()) * weight wv = weight * vxc wv_freq = tools.fft(wv, mesh).reshape(nvar,ngrids) rhoR = rhoG = exc = vxc = wv = None @@ -637,11 +634,8 @@ def nr_uks(ni, cell, grids, xc_code, dm_kpts, relativity=0, hermi=1, rhoR = cp.asarray(rhoR.reshape(2,nvar,ngrids), order='C') nelec = rhoR[:,0].sum(axis=-1).get() * weight - if xctype == 'LDA': - exc, vxc = ni.eval_xc_eff(xc_code, rhoR[:,0], deriv=1, xctype=xctype)[:2] - else: - exc, vxc = ni.eval_xc_eff(xc_code, rhoR, deriv=1, xctype=xctype)[:2] - excsum = float(rhoR[:,0].dot(exc[:,0]).sum().real.get()) * weight + exc, vxc = ni.eval_xc_eff(xc_code, rhoR, deriv=1, xctype=xctype, spin=1)[:2] + excsum = float(rhoR[:,0].dot(exc).sum().real.get()) * weight wv = (weight * vxc).reshape(2*nvar,ngrids) wv_freq = tools.fft(wv, mesh).reshape(2,nvar,ngrids) rhoR = rhoG = exc = vxc = wv = None @@ -1472,7 +1466,7 @@ def get_j(self, dm, hermi=1, kpts=None, kpts_band=None): nr_uks = nr_uks get_vxc = nr_vxc = NotImplemented #numint_cpu.KNumInt.nr_vxc - eval_xc_eff = numint.eval_xc_eff + eval_xc_eff = numint.NumInt.eval_xc_eff _init_xcfuns = numint.NumInt._init_xcfuns nr_rks_fxc = NotImplemented diff --git a/gpu4pyscf/pbc/dft/multigrid_v2.py b/gpu4pyscf/pbc/dft/multigrid_v2.py index 8ec4a7c52..8d7e95934 100644 --- a/gpu4pyscf/pbc/dft/multigrid_v2.py +++ b/gpu4pyscf/pbc/dft/multigrid_v2.py @@ -1277,16 +1277,9 @@ def nr_rks(ni, cell, grids, xc_code, dm_kpts, relativity=0, hermi=1, # eval_xc_eff supports float64 only density = cp.asarray(density, dtype=np.float64, order='C') - if xc_type == "LDA" or xc_type == 'HF': - xc_for_energy, xc_for_fock = ni.eval_xc_eff( - xc_code, density[0], deriv=1, xctype=xc_type - )[:2] - elif xc_type == 'GGA' or xc_type == 'MGGA': - xc_for_energy, xc_for_fock = ni.eval_xc_eff( - xc_code, density, deriv=1, xctype=xc_type - )[:2] - else: - raise ValueError(f"Incorrect xc_type = {xc_type}") + xc_for_energy, xc_for_fock = ni.eval_xc_eff( + xc_code, density, deriv=1, xctype=xc_type, spin=0 + )[:2] rho_sf = density[0].real xc_energy_sum = float(rho_sf.dot(xc_for_energy.ravel()).get()) * weight @@ -1388,16 +1381,9 @@ def nr_uks(ni, cell, grids, xc_code, dm_kpts, relativity=0, hermi=1, # eval_xc_eff supports float64 only density = cp.asarray(density, dtype=np.float64, order='C') - if xc_type == "LDA" or xc_type == 'HF': - xc_for_energy, xc_for_fock = ni.eval_xc_eff( - xc_code, density[:,0], deriv=1, xctype=xc_type - )[:2] - elif xc_type == 'GGA' or xc_type == 'MGGA': - xc_for_energy, xc_for_fock = ni.eval_xc_eff( - xc_code, density, deriv=1, xctype=xc_type - )[:2] - else: - raise ValueError(f"Incorrect xc_type = {xc_type}") + xc_for_energy, xc_for_fock = ni.eval_xc_eff( + xc_code, density, deriv=1, xctype=xc_type, spin=1 + )[:2] rho_sf = (density[0, 0] + density[1, 0]).real xc_energy_sum = float(rho_sf.dot(xc_for_energy.ravel()).real.get()) * weight @@ -1500,13 +1486,14 @@ def get_veff_ip1( / weight ) - if nset == 1: + if nset == 1: # RHF xc_for_fock = ni.eval_xc_eff( - xc_code, density[0], deriv=1, xctype=xc_type + xc_code, density[0], deriv=1, xctype=xc_type, spin=0 )[1] - else: + else: # UHF + assert nset == 2 xc_for_fock = ni.eval_xc_eff( - xc_code, density, deriv=1, xctype=xc_type + xc_code, density, deriv=1, xctype=xc_type, spin=1 )[1] xc_for_fock = xc_for_fock.reshape(nset, -1, *mesh) * weight @@ -1573,7 +1560,7 @@ def get_j(self, dm, hermi=1, kpts=None, kpts_band=None): nr_uks = nr_uks get_vxc = nr_vxc = NotImplemented #numint_cpu.KNumInt.nr_vxc - eval_xc_eff = numint.eval_xc_eff + eval_xc_eff = numint.NumInt.eval_xc_eff _init_xcfuns = numint.NumInt._init_xcfuns nr_rks_fxc = NotImplemented diff --git a/gpu4pyscf/pbc/dft/numint.py b/gpu4pyscf/pbc/dft/numint.py index 49f9be3dc..952a5ac94 100644 --- a/gpu4pyscf/pbc/dft/numint.py +++ b/gpu4pyscf/pbc/dft/numint.py @@ -398,10 +398,10 @@ def nr_rks(ni, cell, grids, xc_code, dm_kpts, relativity=0, hermi=1, p0, p1 = p1, p1 + weight.size rho[:,p0:p1] = ni.eval_rho(cell, ao_ks, dm_kpts, xctype=xctype, hermi=hermi) - exc, vxc = ni.eval_xc_eff(xc_code, rho, deriv=1, xctype=xctype)[:2] + exc, vxc = ni.eval_xc_eff(xc_code, rho, deriv=1, xctype=xctype, spin=0)[:2] den = rho[0] * split_grids.weights nelec += den.sum() - excsum += den.dot(exc[:,0]).get()[()] + excsum += den.dot(exc).get()[()] wv = vxc * split_grids.weights # *.5 for v+v.conj().T at the end @@ -494,10 +494,10 @@ def nr_uks(ni, cell, grids, xc_code, dm_kpts, relativity=0, hermi=1, rho[0,:,p0:p1] = ni.eval_rho(cell, ao_ks, dm_kpts[0], xctype=xctype, hermi=hermi) rho[1,:,p0:p1] = ni.eval_rho(cell, ao_ks, dm_kpts[1], xctype=xctype, hermi=hermi) - exc, vxc = ni.eval_xc_eff(xc_code, rho, deriv=1, xctype=xctype)[:2] + exc, vxc = ni.eval_xc_eff(xc_code, rho, deriv=1, xctype=xctype, spin=1)[:2] den = rho[:,0] * split_grids.weights nelec += den.sum(axis=1) - excsum += den.dot(exc[:,0]).sum().get()[()] + excsum += den.dot(exc).sum().get()[()] wv = vxc * split_grids.weights # *.5 for v+v.conj().T at the end @@ -644,7 +644,7 @@ def block_loop(self, cell, grids, deriv=0, kpts=None, sort_grids=False): yield ao_ks, weight, coords ao_ks = None - eval_xc_eff = numint.eval_xc_eff + eval_xc_eff = numint.NumInt.eval_xc_eff _init_xcfuns = numint.NumInt._init_xcfuns nr_rks = nr_rks diff --git a/gpu4pyscf/pbc/grad/krks.py b/gpu4pyscf/pbc/grad/krks.py index 87759f154..f75edf5ef 100644 --- a/gpu4pyscf/pbc/grad/krks.py +++ b/gpu4pyscf/pbc/grad/krks.py @@ -89,7 +89,7 @@ def get_vxc(ni, cell, grids, xc_code, dm_kpts, kpts, hermi=1): for ao_ks, weight, coords in ni.block_loop(cell, grids, ao_deriv, kpts, sort_grids=True): rho = ni.eval_rho(cell, ao_ks[:,0], dm_kpts, xctype=xctype, hermi=hermi) - vxc = ni.eval_xc_eff(xc_code, rho, deriv=1, xctype=xctype)[1] + vxc = ni.eval_xc_eff(xc_code, rho, deriv=1, xctype=xctype, spin=0)[1] wv = weight * vxc[0] aow = cp.einsum('kpi,p->kpi', ao_ks[:,0], wv) for kn in range(nkpts): @@ -100,7 +100,7 @@ def get_vxc(ni, cell, grids, xc_code, dm_kpts, kpts, hermi=1): for ao_ks, weight, coords in ni.block_loop(cell, grids, ao_deriv, kpts, sort_grids=True): rho = ni.eval_rho(cell, ao_ks[:,:4], dm_kpts, xctype=xctype, hermi=hermi) - vxc = ni.eval_xc_eff(xc_code, rho, deriv=1, xctype=xctype)[1] + vxc = ni.eval_xc_eff(xc_code, rho, deriv=1, xctype=xctype, spin=0)[1] wv = weight * vxc wv[0] *= .5 for kn in range(nkpts): @@ -111,7 +111,7 @@ def get_vxc(ni, cell, grids, xc_code, dm_kpts, kpts, hermi=1): for ao_ks, weight, coords in ni.block_loop(cell, grids, ao_deriv, kpts, sort_grids=True): rho = ni.eval_rho(cell, ao_ks[:,:4], dm_kpts, xctype=xctype, hermi=hermi) - vxc = ni.eval_xc_eff(xc_code, rho, deriv=1, xctype=xctype)[1] + vxc = ni.eval_xc_eff(xc_code, rho, deriv=1, xctype=xctype, spin=0)[1] wv = weight * vxc wv[0] *= .5 wv[4] *= .5 # for the factor 1/2 in tau diff --git a/gpu4pyscf/pbc/grad/kuks.py b/gpu4pyscf/pbc/grad/kuks.py index 92460fdbc..fd8c91386 100644 --- a/gpu4pyscf/pbc/grad/kuks.py +++ b/gpu4pyscf/pbc/grad/kuks.py @@ -87,7 +87,7 @@ def get_vxc(ni, cell, grids, xc_code, dm_kpts, kpts, hermi=1): rho_a = ni.eval_rho(cell, ao_ks[:,0], dm_kpts[0], xctype=xctype, hermi=hermi) rho_b = ni.eval_rho(cell, ao_ks[:,0], dm_kpts[1], xctype=xctype, hermi=hermi) rho = cp.stack([rho_a, rho_b], axis=0) - vxc = ni.eval_xc_eff(xc_code, rho, deriv=1, xctype=xctype)[1] + vxc = ni.eval_xc_eff(xc_code, rho, deriv=1, xctype=xctype, spin=1)[1] wv = weight * vxc[:,0] aowa = cp.einsum('xpi,p->xpi', ao_ks[:,0], wv[0]) aowb = cp.einsum('xpi,p->xpi', ao_ks[:,0], wv[1]) @@ -102,7 +102,7 @@ def get_vxc(ni, cell, grids, xc_code, dm_kpts, kpts, hermi=1): rho_a = ni.eval_rho(cell, ao_ks[:,:4], dm_kpts[0], xctype=xctype, hermi=hermi) rho_b = ni.eval_rho(cell, ao_ks[:,:4], dm_kpts[1], xctype=xctype, hermi=hermi) rho = cp.stack([rho_a, rho_b], axis=0) - vxc = ni.eval_xc_eff(xc_code, rho, deriv=1, xctype=xctype)[1] + vxc = ni.eval_xc_eff(xc_code, rho, deriv=1, xctype=xctype, spin=1)[1] wv = weight * vxc wv[:,0] *= .5 for kn in range(nkpts): @@ -116,7 +116,7 @@ def get_vxc(ni, cell, grids, xc_code, dm_kpts, kpts, hermi=1): rho_a = ni.eval_rho(cell, ao_ks[:,:4], dm_kpts[0], xctype=xctype, hermi=hermi) rho_b = ni.eval_rho(cell, ao_ks[:,:4], dm_kpts[1], xctype=xctype, hermi=hermi) rho = cp.stack([rho_a, rho_b], axis=0) - vxc = ni.eval_xc_eff(xc_code, rho, deriv=1, xctype=xctype)[1] + vxc = ni.eval_xc_eff(xc_code, rho, deriv=1, xctype=xctype, spin=1)[1] wv = weight * vxc wv[:,0] *= .5 wv[:,4] *= .5 # for the factor 1/2 in tau diff --git a/gpu4pyscf/properties/shielding.py b/gpu4pyscf/properties/shielding.py index aa0d8a2bc..831f207f2 100644 --- a/gpu4pyscf/properties/shielding.py +++ b/gpu4pyscf/properties/shielding.py @@ -85,7 +85,7 @@ def nr_rks(ni, mol, grids, xc_code, dms): for ao, index, weight, coords in ni.block_loop(_sorted_mol, grids, nao, ao_deriv): mo_coeff_mask = mo_coeff[index,:] rho = numint.eval_rho2(_sorted_mol, ao, mo_coeff_mask, mo_occ, None, xctype) - vxc = ni.eval_xc_eff(xc_code, rho, deriv=1, xctype=xctype)[1] + vxc = ni.eval_xc_eff(xc_code, rho, deriv=1, xctype=xctype, spin=0)[1] if xctype == 'LDA': wv = weight * vxc[0] giao = _sorted_mol.eval_gto('GTOval_ig', coords.get(), comp=3) # (#C(0 1) g) |AO> diff --git a/gpu4pyscf/tdscf/_uhf_resp_sf.py b/gpu4pyscf/tdscf/_uhf_resp_sf.py index 4b8e02094..21211a36c 100644 --- a/gpu4pyscf/tdscf/_uhf_resp_sf.py +++ b/gpu4pyscf/tdscf/_uhf_resp_sf.py @@ -179,7 +179,7 @@ def vind(dm1): # This function is copied from pyscf.dft.numint2c.py def __mcfun_fn_eval_xc(ni, xc_code, xctype, rho, deriv): - evfk = ni.eval_xc_eff(xc_code, rho, deriv=deriv, xctype=xctype) + evfk = ni.eval_xc_eff(xc_code, rho, deriv=deriv, xctype=xctype, spin=1) evfk = list(evfk) for order in range(1, deriv + 1): if evfk[order] is not None: @@ -194,11 +194,10 @@ def __mcfun_fn_eval_xc2(ni, xc_code, xctype, rho, deriv): if not isinstance(s, cp.ndarray): s = cp.asarray(s) rho = cp.stack([(t + s) * 0.5, (t - s) * 0.5]) - spin = 1 if isinstance(ni, pyscf_numint.NumInt): - evfk = ni.eval_xc_eff(xc_code, rho.get(), deriv=deriv, xctype=xctype) + evfk = ni.eval_xc_eff(xc_code, rho.get(), deriv=deriv, xctype=xctype, spin=1) else: - evfk = ni.eval_xc_eff(xc_code, rho, deriv=deriv, xctype=xctype, spin=spin) + evfk = ni.eval_xc_eff(xc_code, rho, deriv=deriv, xctype=xctype, spin=1) evfk = list(evfk) for order in range(1, deriv + 1): if evfk[order] is not None: @@ -256,11 +255,5 @@ def cache_xc_kernel_sf(ni, mol, grids, xc_code, mo_coeff, mo_occ, collinear_samp vxc, fxc = eval_xc_eff(xc_code, rho_z, deriv=2, xctype=xctype)[1:3] return None, vxc, fxc elif deriv == 3: - whether_use_gpu = os.environ.get('LIBXC_ON_GPU', '0') == '1' - if whether_use_gpu: - vxc, fxc, kxc = eval_xc_eff(xc_code, rho_z, deriv=3, xctype=xctype)[1:4] - else: - ni_cpu = ni.to_cpu() - eval_xc_eff = mcfun_eval_xc_adapter_sf(ni_cpu, xc_code, collinear_samples) - vxc, fxc, kxc = eval_xc_eff(xc_code, rho_z, deriv=3, xctype=xctype)[1:4] + vxc, fxc, kxc = eval_xc_eff(xc_code, rho_z, deriv=3, xctype=xctype)[1:4] return None, vxc, fxc, kxc diff --git a/gpu4pyscf/tdscf/rhf.py b/gpu4pyscf/tdscf/rhf.py index 29d8ed0d5..716f8bd2a 100644 --- a/gpu4pyscf/tdscf/rhf.py +++ b/gpu4pyscf/tdscf/rhf.py @@ -205,10 +205,10 @@ def add_hf_(a, b, hyb=1): rho = ni.eval_rho2(_sorted_mol, ao, mo_coeff_mask, mo_occ, mask, xctype, with_lapl=False) if singlet or singlet is None: - fxc = ni.eval_xc_eff(mf.xc, rho, deriv=2, xctype=xctype)[2] + fxc = ni.eval_xc_eff(mf.xc, rho, deriv=2, xctype=xctype, spin=0)[2] wfxc = fxc[0,0] * weight else: - fxc = ni.eval_xc_eff(mf.xc, cp.stack((rho, rho)) * 0.5, deriv=2, xctype=xctype)[2] + fxc = ni.eval_xc_eff(mf.xc, cp.stack((rho, rho)) * 0.5, deriv=2, xctype=xctype, spin=1)[2] wfxc = (fxc[0, 0, 0, 0] - fxc[1, 0, 0, 0]) * 0.5 * weight orbo_mask = orbo[mask] orbv_mask = orbv[mask] @@ -228,10 +228,10 @@ def add_hf_(a, b, hyb=1): rho = ni.eval_rho2(_sorted_mol, ao, mo_coeff_mask, mo_occ, mask, xctype, with_lapl=False) if singlet or singlet is None: - fxc = ni.eval_xc_eff(mf.xc, rho, deriv=2, xctype=xctype)[2] + fxc = ni.eval_xc_eff(mf.xc, rho, deriv=2, xctype=xctype, spin=0)[2] wfxc = fxc * weight else: - fxc = ni.eval_xc_eff(mf.xc, cp.stack((rho, rho)) * 0.5, deriv=2, xctype=xctype)[2] + fxc = ni.eval_xc_eff(mf.xc, cp.stack((rho, rho)) * 0.5, deriv=2, xctype=xctype, spin=1)[2] wfxc = (fxc[0, :, 0, :] - fxc[1, :, 0, :]) * 0.5 * weight orbo_mask = orbo[mask] orbv_mask = orbv[mask] @@ -258,10 +258,10 @@ def add_hf_(a, b, hyb=1): rho = ni.eval_rho2(_sorted_mol, ao, mo_coeff_mask, mo_occ, mask, xctype, with_lapl=False) if singlet or singlet is None: - fxc = ni.eval_xc_eff(mf.xc, rho, deriv=2, xctype=xctype)[2] + fxc = ni.eval_xc_eff(mf.xc, rho, deriv=2, xctype=xctype, spin=0)[2] wfxc = fxc * weight else: - fxc = ni.eval_xc_eff(mf.xc, cp.stack((rho, rho))*0.5, deriv=2, xctype=xctype)[2] + fxc = ni.eval_xc_eff(mf.xc, cp.stack((rho, rho))*0.5, deriv=2, xctype=xctype, spin=1)[2] wfxc = (fxc[0, :, 0, :] - fxc[1, :, 0, :]) * 0.5 * weight orbo_mask = orbo[mask] orbv_mask = orbv[mask] diff --git a/gpu4pyscf/tdscf/uhf.py b/gpu4pyscf/tdscf/uhf.py index 6333b4dfc..544d1f0e6 100644 --- a/gpu4pyscf/tdscf/uhf.py +++ b/gpu4pyscf/tdscf/uhf.py @@ -277,7 +277,7 @@ def add_hf_(a, b, hyb=1): ni.eval_rho2(_sorted_mol, ao, mo_coeff_mask_b, mo_occ[1], mask, xctype, with_lapl=False))) - fxc = ni.eval_xc_eff(mf.xc, rho, deriv=2, xctype=xctype)[2] + fxc = ni.eval_xc_eff(mf.xc, rho, deriv=2, xctype=xctype, spin=1)[2] wfxc = fxc[:,0,:,0] * weight orbo_a_mask = orbo_a[mask] orbv_a_mask = orbv_a[mask] @@ -315,7 +315,7 @@ def add_hf_(a, b, hyb=1): mo_occ[0], mask, xctype, with_lapl=False), ni.eval_rho2(_sorted_mol, ao, mo_coeff_mask_b, mo_occ[1], mask, xctype, with_lapl=False))) - fxc = ni.eval_xc_eff(mf.xc, rho, deriv=2, xctype=xctype)[2] + fxc = ni.eval_xc_eff(mf.xc, rho, deriv=2, xctype=xctype, spin=1)[2] wfxc = fxc * weight orbo_a_mask = orbo_a[mask] orbv_a_mask = orbv_a[mask] @@ -361,7 +361,7 @@ def add_hf_(a, b, hyb=1): mo_occ[0], mask, xctype, with_lapl=False), ni.eval_rho2(_sorted_mol, ao, mo_coeff_mask_b, mo_occ[1], mask, xctype, with_lapl=False))) - fxc = ni.eval_xc_eff(mf.xc, rho, deriv=2, xctype=xctype)[2] + fxc = ni.eval_xc_eff(mf.xc, rho, deriv=2, xctype=xctype, spin=1)[2] wfxc = fxc * weight orbo_a_mask = orbo_a[mask] orbv_a_mask = orbv_a[mask]