From be1aef0080bca6ec09836884964b400a3d988ba8 Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Thu, 2 Nov 2023 21:02:25 -0700 Subject: [PATCH] Optimize hessian 3 (#54) * numpy -> cupy for solvent * for linter * remove grad switch from pcm.py * passed flake8 * solvent integrals on GPU * flake8 * compatiable with pyscf-2.4.0 * added solvent * fixed issues for to_cpu * store intermeidate variable on CPU * cupy.einsum -> contract * optimized dft integration for gradient and hessian * remove lprof * fixed a bug in nlc * precompute fxc_x * optimize hessian & gpu timer * remove scale_ao --- examples/00-h2o.py | 4 +- examples/dft_driver.py | 2 +- gpu4pyscf/df/df.py | 4 +- gpu4pyscf/df/df_jk.py | 4 +- gpu4pyscf/df/grad/rhf.py | 4 +- gpu4pyscf/df/grad/rks.py | 2 +- gpu4pyscf/df/hessian/rhf.py | 32 +-- gpu4pyscf/df/hessian/rks.py | 2 +- gpu4pyscf/df/int3c2e.py | 28 +-- gpu4pyscf/dft/libxc.py | 35 +-- gpu4pyscf/dft/numint.py | 184 ++++++++++++--- gpu4pyscf/dft/rks.py | 6 +- gpu4pyscf/grad/rhf.py | 40 ++-- gpu4pyscf/grad/rks.py | 68 +++--- gpu4pyscf/hessian/rks.py | 142 +++++++---- gpu4pyscf/lib/cupy_helper.py | 18 +- gpu4pyscf/lib/cupy_helper/add_sparse.cu | 13 +- gpu4pyscf/lib/cutensor.py | 27 ++- gpu4pyscf/lib/gdft/CMakeLists.txt | 2 +- gpu4pyscf/lib/gdft/contract_rho.cu | 299 +++++++++++++++++++++++- gpu4pyscf/lib/gdft/nr_eval_gto.cu | 27 --- gpu4pyscf/lib/logger.py | 51 +++- gpu4pyscf/scf/hf.py | 14 +- 23 files changed, 748 insertions(+), 260 deletions(-) diff --git a/examples/00-h2o.py b/examples/00-h2o.py index 7f17e62d..7df44258 100644 --- a/examples/00-h2o.py +++ b/examples/00-h2o.py @@ -24,7 +24,7 @@ H 0.7570000000 0.0000000000 -0.4696000000 ''' -xc='LDA' +xc='B3LYP' bas='def2-tzvpp' auxbasis='def2-tzvpp-jkfit' scf_tol = 1e-10 @@ -34,7 +34,7 @@ mol = pyscf.M(atom=atom, basis=bas, max_memory=32000) -mol.verbose = 4 +mol.verbose = 6 mf_GPU = rks.RKS(mol, xc=xc).density_fit(auxbasis=auxbasis) mf_GPU.grids.level = grids_level mf_GPU.conv_tol = scf_tol diff --git 
a/examples/dft_driver.py b/examples/dft_driver.py index 3b68d665..12628086 100644 --- a/examples/dft_driver.py +++ b/examples/dft_driver.py @@ -35,7 +35,7 @@ basis=bas, max_memory=32000) # set verbose >= 6 for debugging timer -mol.verbose = 6 +mol.verbose = 1 mf_df = rks.RKS(mol, xc=args.xc).density_fit(auxbasis=args.auxbasis) if args.solvent: diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py index 28998230..da59f9da 100644 --- a/gpu4pyscf/df/df.py +++ b/gpu4pyscf/df/df.py @@ -67,8 +67,8 @@ def build(self, direct_scf_tol=1e-14, omega=None): idx = np.arange(nao) self.diag_idx = cupy.asarray(idx*(idx+1)//2+idx) - t0 = (logger.process_clock(), logger.perf_counter()) log = logger.new_logger(mol, mol.verbose) + t0 = log.init_timer() if auxmol is None: self.auxmol = auxmol = addons.make_auxmol(mol, self.auxbasis) @@ -217,7 +217,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False): nq = len(intopt.log_qs) for cp_ij_id, _ in enumerate(intopt.log_qs): if len(intopt.ao_pairs_row[cp_ij_id]) == 0: continue - t1 = (logger.process_clock(), logger.perf_counter()) + t1 = log.init_timer() cpi = intopt.cp_idx[cp_ij_id] cpj = intopt.cp_jdx[cp_ij_id] li = intopt.angular[cpi] diff --git a/gpu4pyscf/df/df_jk.py b/gpu4pyscf/df/df_jk.py index e27f16f4..76f4bed0 100644 --- a/gpu4pyscf/df/df_jk.py +++ b/gpu4pyscf/df/df_jk.py @@ -20,9 +20,9 @@ import cupy import numpy from pyscf import lib, scf, __config__ -from pyscf.lib import logger from pyscf.scf import dhf from pyscf.df import df_jk, addons +from gpu4pyscf.lib import logger from gpu4pyscf.lib.cupy_helper import contract, take_last2d, transpose_sum, load_library, get_avail_mem from gpu4pyscf.dft import rks, numint from gpu4pyscf.scf import hf @@ -250,7 +250,7 @@ def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e- nao = dms_tag.shape[-1] dms = dms_tag.reshape([-1,nao,nao]) nset = dms.shape[0] - t0 = (logger.process_clock(), logger.perf_counter()) + t0 = log.init_timer() if 
dfobj._cderi is None: log.debug('CDERI not found, build...') dfobj.build(direct_scf_tol=direct_scf_tol, omega=omega) diff --git a/gpu4pyscf/df/grad/rhf.py b/gpu4pyscf/df/grad/rhf.py index 84f5ed23..febdee40 100644 --- a/gpu4pyscf/df/grad/rhf.py +++ b/gpu4pyscf/df/grad/rhf.py @@ -18,12 +18,12 @@ import cupy from cupyx.scipy.linalg import solve_triangular from pyscf.df.grad import rhf -from pyscf.lib import logger from pyscf import lib, scf, gto from gpu4pyscf.df import int3c2e from gpu4pyscf.lib.cupy_helper import print_mem_info, tag_array, unpack_tril, contract, load_library from gpu4pyscf.grad.rhf import grad_elec from gpu4pyscf import __config__ +from gpu4pyscf.lib import logger libcupy_helper = load_library('libcupy_helper') @@ -154,7 +154,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega vkaux = cupy.zeros((3,naux_cart)) cupy.get_default_memory_pool().free_all_blocks() for cp_kl_id in range(len(intopt.aux_log_qs)): - t1 = (logger.process_clock(), logger.perf_counter()) + t1 = log.init_timer() k0, k1 = intopt.cart_aux_loc[cp_kl_id], intopt.cart_aux_loc[cp_kl_id+1] assert k1-k0 <= block_size if with_j: diff --git a/gpu4pyscf/df/grad/rks.py b/gpu4pyscf/df/grad/rks.py index 2ef88e86..4708c4d7 100644 --- a/gpu4pyscf/df/grad/rks.py +++ b/gpu4pyscf/df/grad/rks.py @@ -29,7 +29,7 @@ def get_veff(ks_grad, mol=None, dm=None): ''' if mol is None: mol = ks_grad.mol if dm is None: dm = ks_grad.base.make_rdm1() - t0 = (logger.process_clock(), logger.perf_counter()) + t0 = logger.init_timer(ks_grad) mf = ks_grad.base ni = mf._numint diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py index 46cb38ec..1141de1f 100644 --- a/gpu4pyscf/df/hessian/rhf.py +++ b/gpu4pyscf/df/hessian/rhf.py @@ -55,7 +55,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, '''Partial derivative ''' log = logger.new_logger(hessobj, verbose) - time0 = t1 = (logger.process_clock(), logger.perf_counter()) + time0 = t1 = 
log.init_timer() mol = hessobj.mol mf = hessobj.base @@ -393,12 +393,14 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None, with_k=True, omega=None): log = logger.new_logger(hessobj, verbose) - t0 = (logger.process_clock(), logger.perf_counter()) + t0 = log.init_timer() mol = hessobj.mol if atmlst is None: atmlst = range(mol.natm) # FIXME with_k = True + mo_coeff = cupy.asarray(mo_coeff) + mo_occ = cupy.asarray(mo_occ) mf = hessobj.base #auxmol = hessobj.base.with_df.auxmol @@ -441,7 +443,7 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, rhok0_Pl_ = np.empty_like(wk_Pl_) for p0, p1 in lib.prange(0,nao,64): wk_tmp = cupy.asarray(wk_Pl_[:,p0:p1]) - rhok0_Pl_[:,p0:p1] = cupy.einsum('pq,qio->pio', int2c_inv, wk_tmp).get() + rhok0_Pl_[:,p0:p1] = contract('pq,qio->pio', int2c_inv, wk_tmp).get() wj = wk_Pl_ = wk_P__ = int2c_inv = int2c = None # int3c_ip1 contributions @@ -449,8 +451,8 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, vj1_buf = vj1_buf[cupy.ix_(numpy.arange(3), rev_ao_idx, rev_ao_idx)] vk1_buf = vk1_buf[cupy.ix_(numpy.arange(3), rev_ao_idx, rev_ao_idx)] - vj1_int3c_ip1 = -cupy.einsum('nxiq,ip->nxpq', vj1_ao, mo_coeff) - vk1_int3c_ip1 = -cupy.einsum('nxiq,ip->nxpq', vk1_ao, mo_coeff) + vj1_int3c_ip1 = -contract('nxiq,ip->nxpq', vj1_ao, mo_coeff) + vk1_int3c_ip1 = -contract('nxiq,ip->nxpq', vk1_ao, mo_coeff) vj1_ao = vk1_ao = None t0 = log.timer_debug1('Fock matrix due to int3c2e_ip1', *t0) @@ -475,15 +477,15 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, for p0, p1 in lib.prange(0,nao,64): rhok_tmp = cupy.asarray(rhok0_Pl_[:,p0:p1]) - vj1_tmp = cupy.einsum('pio,xp->xpio', rhok_tmp, wj0_10) + vj1_tmp = contract('pio,xp->xpio', rhok_tmp, wj0_10) - wk0_10_Pl_ = cupy.einsum('xqp,pio->xqio', int2c_ip1, rhok_tmp) - vj1_tmp += cupy.einsum('xpio,p->xpio', wk0_10_Pl_, rhoj0) - 
vj1_int3c_ip2[:,:,p0:p1] += cupy.einsum('xpio,pa->axio', vj1_tmp, aux2atom) + wk0_10_Pl_ = contract('xqp,pio->xqio', int2c_ip1, rhok_tmp) + vj1_tmp += contract('xpio,p->xpio', wk0_10_Pl_, rhoj0) + vj1_int3c_ip2[:,:,p0:p1] += contract('xpio,pa->axio', vj1_tmp, aux2atom) if with_k: - vk1_tmp = 2.0 * cupy.einsum('xpio,pro->xpir', wk0_10_Pl_, rhok0_P__) - vk1_tmp += 2.0 * cupy.einsum('xpro,pir->xpio', wk0_10_P__, rhok_tmp) - vk1_int3c_ip2[:,:,p0:p1] += cupy.einsum('xpio,pa->axio', vk1_tmp, aux2atom) + vk1_tmp = 2.0 * contract('xpio,pro->xpir', wk0_10_Pl_, rhok0_P__) + vk1_tmp += 2.0 * contract('xpro,pir->xpio', wk0_10_P__, rhok_tmp) + vk1_int3c_ip2[:,:,p0:p1] += contract('xpio,pa->axio', vk1_tmp, aux2atom) wj0_10 = wk0_10_P__ = rhok0_P__ = int2c_ip1 = None vj1_tmp = vk1_tmp = wk0_10_Pl_ = rhoj0 = rhok0_Pl_ = None aux2atom = None @@ -498,8 +500,8 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, # ========================== sorted AO end ================================ def _ao2mo(mat): - tmp = cupy.einsum('xij,jo->xio', mat, mocc) - return cupy.einsum('xik,ip->xpk', tmp, mo_coeff) + tmp = contract('xij,jo->xio', mat, mocc) + return contract('xik,ip->xpk', tmp, mo_coeff) vj1_int3c = vj1_int3c_ip1 + vj1_int3c_ip2 vj1_int3c_ip1 = vj1_int3c_ip2 = None @@ -522,7 +524,7 @@ def _ao2mo(mat): vk1_ao[:,:,p0:p1] -= vk1_buf[:,p0:p1,:].transpose(0,2,1) h1 = hcore_deriv(ia) - h1 = _ao2mo(h1) + h1 = _ao2mo(cupy.asarray(h1)) vj1 = vj1_int3c[ia] + _ao2mo(vj1_ao) if with_k: vk1 = vk1_int3c[ia] + _ao2mo(vk1_ao) diff --git a/gpu4pyscf/df/hessian/rks.py b/gpu4pyscf/df/hessian/rks.py index d8986f1f..b7432314 100644 --- a/gpu4pyscf/df/hessian/rks.py +++ b/gpu4pyscf/df/hessian/rks.py @@ -37,7 +37,7 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None, max_memory=4000, verbose=None): log = logger.new_logger(hessobj, verbose) - time0 = t1 = (logger.process_clock(), logger.perf_counter()) + time0 = t1 = log.init_timer() mol = hessobj.mol mf 
= hessobj.base diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py index fc755a27..f3a43442 100644 --- a/gpu4pyscf/df/int3c2e.py +++ b/gpu4pyscf/df/int3c2e.py @@ -184,7 +184,7 @@ def build(self, cutoff=1e-14, group_size=None, a tot_mol is created with concatenating [mol, fake_mol, aux_mol] we will pair (ao,ao) and (aux,1) separately. ''' - cput0 = (logger.process_clock(), logger.perf_counter()) + cput0 = logger.init_timer(self.mol) sorted_mol, sorted_idx, uniq_l_ctr, l_ctr_counts = sort_mol(self.mol) if group_size is not None : uniq_l_ctr, l_ctr_counts = _split_l_ctr_groups(uniq_l_ctr, l_ctr_counts, group_size) @@ -314,6 +314,7 @@ def build(self, cutoff=1e-14, group_size=None, tot_mol._atm.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(tot_mol.natm), tot_mol._bas.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(tot_mol.nbas), tot_mol._env.ctypes.data_as(ctypes.c_void_p)) + cput1 = logger.timer_debug1(tot_mol, 'Initialize GPU cache', *cput1) self.bas_pairs_locs = bas_pairs_locs ncptype = len(self.log_qs) @@ -745,29 +746,24 @@ def get_int3c2e_ip1_vjk(intopt, rhoj, rhok, dm0_tag, aoslices, with_k=True, omeg vj1_buf[:,i0:i1,j0:j1] += contract('xpji,p->xij', int3c_blk, rhoj[k0:k1]) # initialize intermediate variables if count % ncp_ij == 0: - rhoj0 = cupy.zeros([3,k1-k0,nao_sph]) rhok_tmp = cupy.asarray(rhok[k0:k1]) - vj1_ao = cupy.zeros([3,nao_sph,nao_sph,nocc]) if with_k: rhok0_slice = contract('pio,Jo->piJ', rhok_tmp, orbo) * 2 rhok0 = contract('pli,lo->poi', rhok0_slice, orbo) - int3c_ip1_occ = cupy.zeros([3,k1-k0,nao_sph,nocc]) - vk1_ao = cupy.zeros([3,nao_sph,nao_sph,nocc]) - # contraction - rhoj0[:,:,i0:i1] += contract('xpji,ij->xpi', int3c_blk, dm0_tag[i0:i1,j0:j1]) + rhoj0 = contract('xpji,ij->xpi', int3c_blk, dm0_tag[i0:i1,j0:j1]) + vj1_ao = contract('pJo,xpi->xiJo', rhok_tmp, rhoj0) + vj1 += 2.0*contract('xiJo,ia->axJo', vj1_ao, ao2atom[i0:i1]) + if with_k: - int3c_ip1_occ[:,:,i0:i1] += contract('xpji,jo->xpio', int3c_blk, orbo[j0:j1]) - 
vk1_ao[:,i0:i1,j0:j1] += contract('xpji,poi->xijo', int3c_blk, rhok0[:,:,i0:i1]) vk1_buf[:,i0:i1] += contract('xpji,plj->xil', int3c_blk, rhok0_slice[:,:,j0:j1]) - # reduction - if (count+1) % ncp_ij == 0: - vj1_ao += contract('pjo,xpi->xijo', rhok_tmp, rhoj0) - vj1 += 2.0*contract('xiko,ia->axko', vj1_ao, ao2atom) - if with_k: - vk1_ao += contract('xpio,pki->xiko', int3c_ip1_occ, rhok0_slice) - vk1 += contract('xiko,ia->axko', vk1_ao, ao2atom) + vk1_ao = contract('xpji,poi->xijo', int3c_blk, rhok0[:,:,i0:i1]) + vk1[:,:,j0:j1] += contract('xijo,ia->axjo', vk1_ao, ao2atom[i0:i1]) + + int3c_ip1_occ = contract('xpji,jo->xpio', int3c_blk, orbo[j0:j1]) + vk1_ao = contract('xpio,pJi->xiJo', int3c_ip1_occ, rhok0_slice[:,:,i0:i1]) + vk1 += contract('xiJo,ia->axJo', vk1_ao, ao2atom[i0:i1]) count += 1 return vj1_buf, vk1_buf, vj1, vk1 diff --git a/gpu4pyscf/dft/libxc.py b/gpu4pyscf/dft/libxc.py index 65c05a77..5ed63b48 100644 --- a/gpu4pyscf/dft/libxc.py +++ b/gpu4pyscf/dft/libxc.py @@ -21,7 +21,7 @@ import cupy from pyscf import dft -libxc = np.ctypeslib.load_library( +_libxc = np.ctypeslib.load_library( 'libxc', os.path.abspath(os.path.join(__file__, '..', '..', 'lib', 'deps', 'lib'))) def _check_arrays(current_arrays, fields, factor, required): @@ -45,21 +45,21 @@ class _xcfun(ctypes.Structure): pass _xc_func_p = ctypes.POINTER(_xcfun) -libxc.xc_func_alloc.restype = _xc_func_p -libxc.xc_func_init.argtypes = (_xc_func_p, ctypes.c_int, ctypes.c_int) -libxc.xc_func_end.argtypes = (_xc_func_p, ) -libxc.xc_func_free.argtypes = (_xc_func_p, ) +_libxc.xc_func_alloc.restype = _xc_func_p +_libxc.xc_func_init.argtypes = (_xc_func_p, ctypes.c_int, ctypes.c_int) +_libxc.xc_func_end.argtypes = (_xc_func_p, ) +_libxc.xc_func_free.argtypes = (_xc_func_p, ) class XCfun: def __init__(self, xc, spin): assert spin == 'unpolarized' self._spin = 1 - self.xc_func = libxc.xc_func_alloc() + self.xc_func = _libxc.xc_func_alloc() if isinstance(xc, str): - self.func_id = 
libxc.xc_functional_get_number(ctypes.c_char_p(xc.encode())) + self.func_id = _libxc.xc_functional_get_number(ctypes.c_char_p(xc.encode())) else: self.func_id = xc - ret = libxc.xc_func_init(self.xc_func, self.func_id, self._spin) + ret = _libxc.xc_func_init(self.xc_func, self.func_id, self._spin) if ret != 0: raise RuntimeError('failed to initialize xc fun') self._family = dft.libxc.xc_type(xc) @@ -67,9 +67,10 @@ def __init__(self, xc, spin): def __del__(self): if self.xc_func is None: return - libxc.xc_func_end(self.xc_func) - libxc.xc_func_free(self.xc_func) - + # TODO: deallocate xc func + #_libxc.xc_func_end(self.xc_func) + #_libxc.xc_func_free(self.xc_func) + def needs_laplacian(self): return dft.libxc.needs_laplacian(self.func_id) @@ -85,7 +86,7 @@ def compute(self, inp, output=None, do_exc=True, do_vxc=True, do_fxc=False, do_k npoints = int(inp["rho"].size / self._spin) if (inp["rho"].size % self._spin): raise ValueError("Rho input has an invalid shape, must be divisible by %d" % self._spin) - + # Find the right compute function args = [self.xc_func, ctypes.c_size_t(npoints)] if self._family == 'LDA': @@ -114,7 +115,7 @@ def compute(self, inp, output=None, do_exc=True, do_vxc=True, do_fxc=False, do_k if(isinstance(arg, cupy.ndarray)): arg = ctypes.cast(arg.data.ptr, ctypes.c_void_p) cuda_args.append(arg) - libxc.xc_lda(*cuda_args) + _libxc.xc_lda(*cuda_args) elif self._family == 'GGA': input_labels = ["rho", "sigma"] input_num_args = 2 @@ -141,7 +142,7 @@ def compute(self, inp, output=None, do_exc=True, do_vxc=True, do_fxc=False, do_k if(isinstance(arg, cupy.ndarray)): arg = ctypes.cast(arg.data.ptr, ctypes.c_void_p) cuda_args.append(arg) - libxc.xc_gga(*cuda_args) + _libxc.xc_gga(*cuda_args) elif self._family == 'MGGA': # Build input args @@ -178,7 +179,7 @@ def compute(self, inp, output=None, do_exc=True, do_vxc=True, do_fxc=False, do_k output = _check_arrays(output, output_labels[5:15], npoints, do_fxc) output = _check_arrays(output, 
output_labels[15:35], npoints, do_kxc) output = _check_arrays(output, output_labels[35:70], npoints, do_lxc) - + args.extend([ inp[x] for x in input_labels]) if not self.needs_laplacian(): args.insert(-1, cupy.empty((1))) # Add none ptr to laplacian @@ -189,10 +190,10 @@ def compute(self, inp, output=None, do_exc=True, do_vxc=True, do_fxc=False, do_k if(isinstance(arg, cupy.ndarray)): arg = ctypes.cast(arg.data.ptr, ctypes.c_void_p) cuda_args.append(arg) - libxc.xc_mgga(*cuda_args) + _libxc.xc_mgga(*cuda_args) else: raise KeyError("Functional kind not recognized!") - + return {k: v for k, v in zip(output_labels, args[2+input_num_args:]) if v is not None} diff --git a/gpu4pyscf/dft/numint.py b/gpu4pyscf/dft/numint.py index cc9c0e6b..6ca2a9f8 100644 --- a/gpu4pyscf/dft/numint.py +++ b/gpu4pyscf/dft/numint.py @@ -21,13 +21,13 @@ import cupy from pyscf import gto, lib, dft -from pyscf.lib import logger from pyscf.dft import numint from pyscf.gto.eval_gto import NBINS, CUTOFF, make_screen_index from gpu4pyscf.scf.hf import basis_seg_contraction from gpu4pyscf.lib.cupy_helper import contract, get_avail_mem, load_library, add_sparse, release_gpu_stack from gpu4pyscf.dft import xc_deriv, xc_alias, libxc from gpu4pyscf import __config__ +from gpu4pyscf.lib import logger LMAX_ON_GPU = 6 BAS_ALIGNED = 4 @@ -35,6 +35,7 @@ MIN_BLK_SIZE = getattr(__config__, 'min_grid_blksize', 64*64) ALIGNED = getattr(__config__, 'grid_aligned', 16*16) AO_THRESHOLD = 1e-12 +AO_ALIGNMENT = 32 # Should we release the cupy cache? 
FREE_CUPY_CACHE = False @@ -269,6 +270,42 @@ def eval_rho3(mol, ao, c0, mo1, non0tab=None, xctype='LDA', rho[tau_idx] *= .5 return rho +def eval_rho4(mol, ao, c0, mo1, non0tab=None, xctype='LDA', + with_lapl=True, verbose=None): + ''' ao: nd x nao x ng + c0: nd x nocc x ng + mo1: na x nao x nocc + ''' + xctype = xctype.upper() + if xctype == 'LDA' or xctype == 'HF': + _, ngrids = ao.shape + else: + _, ngrids = ao[0].shape + + na = mo1.shape[0] + cpos1= mo1 + if xctype == 'LDA' or xctype == 'HF': + c_0 = contract('aio,ig->aog', cpos1, ao)#cupy.dot(cpos1.T, ao) + rho = cupy.empty([na,ngrids]) + for i in range(na): + rho[i] = _contract_rho(c0, c_0[i]) + rho *= 2.0 + elif xctype in ('GGA', 'NLC'): + c_0 = contract('nig,aio->anog', ao, cpos1) + rho = cupy.empty([na, 4, ngrids]) + for i in range(na): + _contract_rho_gga(c0, c_0[i], rho=rho[i]) + + else: # meta-GGA + if with_lapl: + raise NotImplementedError("mGGA with lapl not implemented") + rho = cupy.empty((na,5,ngrids)) + c_0 = contract('nig,aio->anog', ao, cpos1) + for i in range(na): + _contract_rho_mgga(c0, c_0[i], rho=rho[i]) + + return rho + def _vv10nlc(rho, coords, vvrho, vvweight, vvcoords, nlc_pars): thresh=1e-8 @@ -408,15 +445,14 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, for ao_mask, idx, weight, _ in ni.block_loop(mol, grids, nao, ao_deriv): for i in range(nset): - t0 = (logger.process_clock(), logger.perf_counter()) - #rho = eval_rho(opt.mol, ao, dms[i], xctype=xctype, hermi=1) - #rho = _make_rho(ao, dms[i], xctype=xctype) + t0 = log.init_timer() if mo_coeff is None: rho = eval_rho(mol, ao_mask, dms[i][np.ix_(idx,idx)], xctype=xctype, hermi=1) else: mo_coeff_mask = mo_coeff[idx,:] rho = eval_rho2(mol, ao_mask, mo_coeff_mask, mo_occ, None, xctype) + t1 = log.timer_debug1('eval rho', *t0) exc, vxc = ni.eval_xc_eff(xc_code, rho, deriv=1, xctype=xctype)[:2] vxc = cupy.asarray(vxc, order='C') exc = cupy.asarray(exc, order='C') @@ -662,8 +698,8 @@ def nr_rks_fxc(ni, mol, grids, xc_code, 
dm0=None, dms=None, relativity=0, hermi= ao_deriv = 1 p0 = 0 p1 = 0 - t0 = (logger.process_clock(), logger.perf_counter()) for ao, mask, weights, coords in ni.block_loop(opt.mol, grids, nao, ao_deriv): + t0 = log.init_timer() p0, p1 = p1, p1+len(weights) # precompute molecular orbitals if with_mocc: @@ -671,44 +707,51 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi= if xctype == 'LDA': c0 = _dot_ao_dm(mol, ao, occ_coeff_mask, None, None, None) elif xctype == "GGA": - c0 = cupy.empty([4,occ_coeff.shape[1],p1-p0]) - for i in range(4): - c0[i] = _dot_ao_dm(mol, ao[i], occ_coeff_mask, None, None, None) + c0 = contract('nig,io->nog', ao, occ_coeff_mask) else: # mgga - c0 = cupy.empty([4,occ_coeff.shape[1],p1-p0]) - for i in range(4): - c0[i] = _dot_ao_dm(mol, ao[i], occ_coeff_mask, None, None, None) + c0 = contract('nig,io->nog', ao, occ_coeff_mask) + + if with_mocc: + rho1 = eval_rho4(opt.mol, ao, c0, mo1[:,mask], xctype=xctype, with_lapl=False) + else: + # slow version + rho1 = [] + for i in range(nset): + rho_tmp = eval_rho(opt.mol, ao, dms[i][np.ix_(mask,mask)], xctype=xctype, hermi=hermi, with_lapl=False) + rho1.append(rho_tmp) + rho1 = cupy.stack(rho1, axis=0) + t0 = log.timer_debug1('rho', *t0) + # precompute fxc_w if xctype == 'LDA': fxc_w = fxc[0,0,p0:p1] * weights + wv = rho1 * fxc_w else: fxc_w = fxc[:,:,p0:p1] * weights - # loop perturbed molecular orbitals - for i in range(nset): - if with_mocc: - rho1 = eval_rho3(opt.mol, ao, c0, mo1[i][mask], xctype=xctype, with_lapl=False) - else: - rho1 = eval_rho(opt.mol, ao, dms[i][np.ix_(mask,mask)], xctype=xctype, hermi=hermi, with_lapl=False) + wv = contract('axg,xyg->ayg', rho1, fxc_w) + + for i in range(nset): if xctype == 'LDA': - wv = rho1 * fxc_w - vmat_tmp = ao.dot(_scale_ao(ao, wv).T) + vmat_tmp = ao.dot(_scale_ao(ao, wv[i]).T) add_sparse(vmat[i], vmat_tmp, mask) elif xctype == 'GGA': - wv = cupy.einsum('xg,xyg->yg', rho1, fxc_w) - wv[0] *= .5 - vmat_tmp = 
ao[0].dot(_scale_ao(ao, wv).T) + wv[i,0] *= .5 + aow = _scale_ao(ao, wv[i]) + vmat_tmp = aow.dot(ao[0].T) add_sparse(vmat[i], vmat_tmp, mask) elif xctype == 'NLC': raise NotImplementedError('NLC') else: - wv = cupy.einsum('xg,xyg->yg', rho1, fxc_w) - wv[[0, 4]] *= .5 - vmat_tmp = ao[0].dot(_scale_ao(ao[:4], wv[:4]).T) - vmat_tmp+= _tau_dot(ao, ao, wv[4]) + wv[i,0] *= .5 + wv[i,4] *= .5 + vmat_tmp = ao[0].dot(_scale_ao(ao[:4], wv[i,:4]).T) + vmat_tmp+= _tau_dot(ao, ao, wv[i,4]) add_sparse(vmat[i], vmat_tmp, mask) + t0 = log.timer_debug1('vxc', *t0) ao = c0 = rho1 = None + vmat = contract('pi,npq->niq', coeff, vmat) vmat = contract('qj,niq->nij', coeff, vmat) if xctype != 'LDA': @@ -723,6 +766,7 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi= return cupy.asarray(vmat) + def nr_rks_fxc_st(ni, mol, grids, xc_code, dm0=None, dms_alpha=None, relativity=0, singlet=True, rho0=None, vxc=None, fxc=None, max_memory=2000, verbose=None): @@ -851,7 +895,7 @@ def nr_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, 2D array of shape (nao,nao) where nao is the number of AO functions. ''' log = logger.new_logger(mol, verbose) - t0 = (logger.process_clock(), logger.perf_counter()) + t0 = log.init_timer() opt = getattr(ni, 'gdftopt', None) if opt is None: ni.build(mol, grids.coords) @@ -1040,6 +1084,7 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000, ''' Define this macro to loop over grids by blocks. 
Sparsity is not implemented yet + sorted_ao: by default ao_value is sorted for GPU ''' if grids.coords is None: grids.build(with_non0tab=True) @@ -1050,7 +1095,7 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000, log = logger.new_logger(mol, mol.verbose) if blksize is None: - cupy.get_default_memory_pool().free_all_blocks() + #cupy.get_default_memory_pool().free_all_blocks() mem_avail = get_avail_mem() blksize = int((mem_avail*.2/8/((comp+1)*nao + extra))/ ALIGNED) * ALIGNED blksize = min(blksize, MIN_BLK_SIZE) @@ -1070,23 +1115,31 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000, coords = grids.coords[ip0:ip1] weight = grids.weights[ip0:ip1] #sindex = ni.screen_index[ip0//GRID_BLKSIZE:] - t0 = (logger.process_clock(), logger.perf_counter()) + t0 = log.init_timer() ao = eval_ao(ni, mol, coords, deriv) - log.timer_debug1('eval ao', *t0) + t0 = log.timer_debug1('eval ao', *t0) # cache ao indices if (deriv, block_id, blksize, ngrids) not in ni.non0ao_idx: - t0 = (logger.process_clock(), logger.perf_counter()) + t0 = log.init_timer() if deriv == 0: mask = cupy.any(cupy.abs(ao) > AO_THRESHOLD, axis=[1]) - idx = cupy.argwhere(mask).astype(np.int32)[:,0] + all_idx = cupy.arange(ao.shape[0], dtype=np.int32) + idx = all_idx[mask] + pad = (len(idx) + AO_ALIGNMENT - 1) // AO_ALIGNMENT * AO_ALIGNMENT - len(idx) + zero_idx = all_idx[~mask][:pad] + idx = cupy.hstack([idx, zero_idx]) ao_mask = ao[idx,:] else: mask = cupy.any(cupy.abs(ao) > AO_THRESHOLD, axis=[0,2]) - idx = cupy.argwhere(mask).astype(np.int32)[:,0] + all_idx = cupy.arange(ao.shape[1], dtype=np.int32) + idx = all_idx[mask] + pad = (len(idx) + AO_ALIGNMENT - 1) // AO_ALIGNMENT * AO_ALIGNMENT - len(idx) + zero_idx = all_idx[~mask][:pad] + idx = cupy.hstack([idx, zero_idx]) ao_mask = ao[:,idx,:] ni.non0ao_idx[deriv, block_id, blksize, ngrids] = idx - log.timer_debug1('initialize ao sparsity', *t0) + log.timer_debug1('init ao sparsity', *t0) else: idx = ni.non0ao_idx[deriv, 
block_id, blksize, ngrids] if deriv == 0: @@ -1094,7 +1147,7 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000, else: ao_mask = ao[:,idx,:] block_id += 1 - log.timer_debug1('eval rho', *t0) + log.timer_debug1('extract sparse ao', *t0) yield ao_mask, idx, weight, coords class NumInt(numint.NumInt): @@ -1185,6 +1238,63 @@ def _contract_rho(bra, ket, rho=None): rho = cupy.einsum('ig,ig->g', bra, ket) return rho +def _contract_rho1(bra, ket, rho=None): + ''' xip,ip->xp + ''' + if bra.ndim == 2: + bra = cupy.expand_dims(bra, axis=0) + nvar, nao, ngrids = bra.shape + if rho is None: + rho = cupy.empty([nvar, ngrids]) + + for i in range(nvar): + stream = cupy.cuda.get_current_stream() + err = libgdft.GDFTcontract_rho( + ctypes.cast(stream.ptr, ctypes.c_void_p), + ctypes.cast(rho[i].data.ptr, ctypes.c_void_p), + ctypes.cast(bra[i].data.ptr, ctypes.c_void_p), + ctypes.cast(ket.data.ptr, ctypes.c_void_p), + ctypes.c_int(ngrids), ctypes.c_int(nao)) + if err != 0: + raise RuntimeError('CUDA Error') + return rho + +def _contract_rho_gga(bra, ket, rho=None): + ''' ig,nig->ng + ''' + n, nao, ngrids = bra.shape + assert n == 4 + if rho is None: + rho = cupy.empty([4,ngrids]) + stream = cupy.cuda.get_current_stream() + err = libgdft.GDFTcontract_rho_gga( + ctypes.cast(stream.ptr, ctypes.c_void_p), + ctypes.cast(rho.data.ptr, ctypes.c_void_p), + ctypes.cast(bra.data.ptr, ctypes.c_void_p), + ctypes.cast(ket.data.ptr, ctypes.c_void_p), + ctypes.c_int(ngrids), ctypes.c_int(nao)) + if err != 0: + raise RuntimeError('CUDA Error') + return rho + +def _contract_rho_mgga(bra, ket, rho=None): + ''' nig,nig->ng + ''' + n, nao, ngrids = bra.shape + assert n == 4 + if rho is None: + rho = cupy.empty([5,ngrids]) + stream = cupy.cuda.get_current_stream() + err = libgdft.GDFTcontract_rho_mgga( + ctypes.cast(stream.ptr, ctypes.c_void_p), + ctypes.cast(rho.data.ptr, ctypes.c_void_p), + ctypes.cast(bra.data.ptr, ctypes.c_void_p), + ctypes.cast(ket.data.ptr, ctypes.c_void_p), + 
ctypes.c_int(ngrids), ctypes.c_int(nao)) + if err != 0: + raise RuntimeError('CUDA Error') + return rho + def _dot_ao_dm(mol, ao, dm, non0tab, shls_slice, ao_loc, out=None): return cupy.dot(dm.T, ao) @@ -1272,7 +1382,7 @@ def _scale_ao(ao, wv, out=None): assert wv.size == ngrids else: if ao[0].flags.f_contiguous: - return contract('nip,np->ip', ao, wv) + return cupy.einsum('nip,np->ip', ao, wv) nvar, nao, ngrids = ao.shape assert wv.shape == (nvar, ngrids) @@ -1365,6 +1475,8 @@ def build(self, mol=None): coeff = np.vstack([coeff, np.zeros((paddings, coeff.shape[1]))]) pmol._decontracted = True self.mol = pmol + inv_idx = np.argsort(ao_idx, kind='stable').astype(np.int32) + self.rev_ao_idx = cupy.asarray(inv_idx) self.coeff = coeff[ao_idx] self.l_ctr_offsets = np.append(0, np.cumsum(l_ctr_counts)).astype(np.int32) self.l_bas_offsets = np.append(0, np.cumsum(l_counts)).astype(np.int32) diff --git a/gpu4pyscf/dft/rks.py b/gpu4pyscf/dft/rks.py index ea12f511..b4fd72b0 100644 --- a/gpu4pyscf/dft/rks.py +++ b/gpu4pyscf/dft/rks.py @@ -68,7 +68,7 @@ def initialize_grids(ks, mol=None, dm=None): # Initialize self.grids the first time call get_veff if mol is None: mol = ks.mol if ks.grids.coords is None: - t0 = (logger.process_clock(), logger.perf_counter()) + t0 = logger.init_timer(ks) ks.grids.build() #ks.grids.build(with_non0tab=True) ks.grids.weights = cupy.asarray(ks.grids.weights) @@ -82,7 +82,7 @@ def initialize_grids(ks, mol=None, dm=None): is_nlc = ks.nlc or ks._numint.libxc.is_nlc(ks.xc) if is_nlc and ks.nlcgrids.coords is None: if ks.nlcgrids.coords is None: - t0 = (logger.process_clock(), logger.perf_counter()) + t0 = logger.init_timer(ks) #ks.nlcgrids.build(with_non0tab=True) ks.nlcgrids.build() ks.nlcgrids.weights = cupy.asarray(ks.nlcgrids.weights) @@ -124,7 +124,7 @@ def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1): if mol is None: mol = ks.mol if dm is None: dm = ks.make_rdm1() - t0 = (logger.process_clock(), logger.perf_counter()) + t0 = 
logger.init_timer(ks) if ks.grids.coords is None: ks.grids.ao_values = None initialize_grids(ks, mol, dm) diff --git a/gpu4pyscf/grad/rhf.py b/gpu4pyscf/grad/rhf.py index 59a3aeac..16a3b53e 100644 --- a/gpu4pyscf/grad/rhf.py +++ b/gpu4pyscf/grad/rhf.py @@ -19,12 +19,12 @@ import cupy import numpy from pyscf import lib, gto -from pyscf.lib import logger from pyscf.grad import rhf from gpu4pyscf.lib.cupy_helper import load_library from gpu4pyscf.scf.hf import _VHFOpt -from gpu4pyscf.lib.cupy_helper import tag_array +from gpu4pyscf.lib.cupy_helper import tag_array, contract from gpu4pyscf.df import int3c2e #TODO: move int3c2e to out of df +from gpu4pyscf.lib import logger LMAX_ON_GPU = 3 FREE_CUPY_CACHE = True @@ -255,8 +255,8 @@ def get_jk(mol, dm, hermi=1, vhfopt=None, with_j=True, with_k=True, omega=None, if atmlst is None: atmlst = range(mol.natm) - cput0 = (logger.process_clock(), logger.perf_counter()) log = logger.new_logger(mol, verbose) + cput0 = log.init_timer() if hermi != 1: raise NotImplementedError('JK-builder only supports hermitian density matrix') if omega is None: @@ -328,7 +328,7 @@ def get_jk(mol, dm, hermi=1, vhfopt=None, with_j=True, with_k=True, omega=None, dm_shl = cupy.asarray(np.log(dm_shl)) nshls = dm_shl.shape[0] t0 = time.perf_counter() - + if hermi != 1: dm_ctr_cond = (dm_ctr_cond + dm_ctr_cond.T) * .5 fn = libgvhf.GINTget_veff_ip1 @@ -347,7 +347,7 @@ def get_jk(mol, dm, hermi=1, vhfopt=None, with_j=True, with_k=True, omega=None, ll = vhfopt.uniq_l_ctr[cpl,0] if lk > LMAX_ON_GPU or ll > LMAX_ON_GPU or log_q_kl.size == 0: continue - + # TODO: determine cutoff based on the relevant maximum value of dm blocks? 
sub_dm_cond = max(dm_ctr_cond[cpi,cpj], dm_ctr_cond[cpk,cpl], dm_ctr_cond[cpi,cpk], dm_ctr_cond[cpj,cpk], @@ -416,8 +416,6 @@ def get_jk(mol, dm, hermi=1, vhfopt=None, with_j=True, with_k=True, omega=None, coeff = dms = None cupy.get_default_memory_pool().free_all_blocks() - #if vj is not None: vj_per_atom = vj_per_atom.T - #if vk is not None: vk_per_atom = vk_per_atom.T if out_cupy: return vj_per_atom, vk_per_atom else: @@ -427,8 +425,8 @@ def get_jk(mol, dm, hermi=1, vhfopt=None, with_j=True, with_k=True, omega=None, def _get_jk(gradient_object, mol=None, dm=None, hermi=1, with_j=True, with_k=True, omega=None): mf = gradient_object.base - cput0 = (logger.process_clock(), logger.perf_counter()) log = logger.new_logger(gradient_object) + cput0 = log.init_timer() log.debug3('apply get_grad_jk on gpu') if hasattr(mf, '_opt_gpu'): vhfopt = mf._opt_gpu @@ -457,7 +455,7 @@ def get_dh1e_ecp(mol, dm): for ia in ecp_atoms: with mol.with_rinv_at_nucleus(ia): ecp = mol.intor('ECPscalar_iprinv', comp=3) - dh1e_ecp[ia] = cupy.einsum('xij,ij->x', ecp, dm) + dh1e_ecp[ia] = contract('xij,ij->x', cupy.asarray(ecp), dm) return 2.0 * dh1e_ecp def grad_nuc(mf_grad, atmlst=None): @@ -489,11 +487,11 @@ def grad_elec(mf_grad, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None): atmlst = range(mol.natm) aoslices = mol.aoslice_by_atom() - t0 = (logger.process_clock(), logger.perf_counter()) if mo_energy is None: mo_energy = mf.mo_energy if mo_occ is None: mo_occ = mf.mo_occ if mo_coeff is None: mo_coeff = mf.mo_coeff log = logger.Logger(mf_grad.stdout, mf_grad.verbose) + t0 = log.init_timer() mo_energy = cupy.asarray(mo_energy) mo_occ = cupy.asarray(mo_occ) @@ -515,9 +513,9 @@ def calculate_h1e(h1_gpu, s1_gpu): with lib.call_in_background(calculate_h1e) as calculate_hs: calculate_hs(h1, s1) # (i | \nabla hcore | j) - t3 = log.timer_debug1("get_dh1e", *t0) + t3 = log.init_timer() dh1e = int3c2e.get_dh1e(mol, dm0) - + t4 = log.timer_debug1("get_dh1e", *t3) if mol.has_ecp(): dh1e += 
get_dh1e_ecp(mol, dm0) @@ -527,20 +525,20 @@ def calculate_h1e(h1_gpu, s1_gpu): log.debug('Computing Gradients of NR-HF Coulomb repulsion') dm0 = tag_array(dm0, mo_coeff=mo_coeff, mo_occ=mo_occ) - + extra_force = cupy.zeros((len(atmlst),3)) for k, ia in enumerate(atmlst): extra_force[k] += mf_grad.extra_force(ia, locals()) - + t2 = log.timer_debug1('gradients of 2e part', *t1) - - dh = cupy.einsum('xij,ij->xi', h1, dm0) - ds = cupy.einsum('xij,ij->xi', s1, dme0) + + dh = contract('xij,ij->xi', h1, dm0) + ds = contract('xij,ij->xi', s1, dme0) delec = 2.0*(dh - ds) - + delec = cupy.asarray([cupy.sum(delec[:, p0:p1], axis=1) for p0, p1 in aoslices[:,2:]]) de = 2.0 * dvhf + dh1e + delec + extra_force - + if(hasattr(mf, 'disp') and mf.disp is not None): g_disp = mf_grad.get_dispersion() mf_grad.grad_disp = g_disp @@ -565,13 +563,13 @@ class Gradients(rhf.Gradients): def get_j(self, mol=None, dm=None, hermi=0, omega=None): vj, _ = self.get_jk(mol, dm, with_k=False, omega=omega) return vj - + def get_k(self, mol=None, dm=None, hermi=0, omega=None): _, vk = self.get_jk(mol, dm, with_j=False, omega=omega) return vk def extra_force(self, atom_id, envs): - ''' + ''' grid response is implemented get_veff ''' return 0 \ No newline at end of file diff --git a/gpu4pyscf/grad/rks.py b/gpu4pyscf/grad/rks.py index f22e8184..316478cc 100644 --- a/gpu4pyscf/grad/rks.py +++ b/gpu4pyscf/grad/rks.py @@ -17,6 +17,7 @@ # Modified by Xiaojie Wu '''Non-relativistic RKS analytical nuclear gradients''' +import ctypes import numpy import cupy import pyscf @@ -27,12 +28,15 @@ from gpu4pyscf.grad import rhf as rhf_grad from gpu4pyscf.dft import numint, xc_deriv, rks from gpu4pyscf.dft.numint import _GDFTOpt, AO_THRESHOLD -from gpu4pyscf.lib.cupy_helper import contract, get_avail_mem, add_sparse, tag_array +from gpu4pyscf.lib.cupy_helper import contract, get_avail_mem, add_sparse, tag_array, load_library from pyscf import __config__ MIN_BLK_SIZE = getattr(__config__, 'min_grid_blksize', 128*128) 
ALIGNED = getattr(__config__, 'grid_aligned', 16*16) +libgdft = load_library('libgdft') +libgdft.GDFT_make_dR_dao_w.restype = ctypes.c_int + def _get_veff(ks_grad, mol=None, dm=None): ''' First order derivative of DFT effective potential matrix (wrt electron coordinates) @@ -124,7 +128,6 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, nset = len(dms) assert nset == 1 - if xctype == 'LDA': ao_deriv = 1 else: @@ -141,13 +144,7 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, wv = weight * vxc[0] aow = numint._scale_ao(ao_mask[0], wv) vtmp = _d1_dot_(ao_mask[1:4], aow.T) - #idx = cupy.ix_(mask, mask) - #vmat[idm][0][idx] += vtmp[0] - #vmat[idm][1][idx] += vtmp[1] - #vmat[idm][2][idx] += vtmp[2] - add_sparse(vmat[idm][0], vtmp[0], idx) - add_sparse(vmat[idm][1], vtmp[1], idx) - add_sparse(vmat[idm][2], vtmp[2], idx) + add_sparse(vmat[idm], vtmp, idx) elif xctype == 'GGA': ao_deriv = 2 for ao_mask, idx, weight, _ in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory): @@ -158,13 +155,7 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, wv = weight * vxc wv[0] *= .5 vtmp = _gga_grad_sum_(ao_mask, wv) - #idx = cupy.ix_(mask, mask) - #vmat[idm][0][idx] += vtmp[0] - #vmat[idm][1][idx] += vtmp[1] - #vmat[idm][2][idx] += vtmp[2] - add_sparse(vmat[idm][0], vtmp[0], idx) - add_sparse(vmat[idm][1], vtmp[1], idx) - add_sparse(vmat[idm][2], vtmp[2], idx) + add_sparse(vmat[idm], vtmp, idx) elif xctype == 'NLC': raise NotImplementedError('NLC') @@ -180,14 +171,7 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, wv[4] *= .5 # for the factor 1/2 in tau vtmp = _gga_grad_sum_(ao_mask, wv) vtmp += _tau_grad_dot_(ao_mask, wv[4]) - #idx = cupy.ix_(mask, mask) - #vmat[idm][0][idx] += vtmp[0] - #vmat[idm][1][idx] += vtmp[1] - #vmat[idm][2][idx] += vtmp[2] - add_sparse(vmat[idm][0], vtmp[0], idx) - add_sparse(vmat[idm][1], vtmp[1], idx) - add_sparse(vmat[idm][2], vtmp[2], idx) - + add_sparse(vmat[idm], vtmp, idx) 
vmat = [cupy.einsum('pi,npq,qj->nij', coeff, v, coeff) for v in vmat] exc = None if nset == 1: @@ -243,9 +227,7 @@ def get_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, wv = vv_vxc[:,p0:p1] * weight wv[0] *= .5 # *.5 because vmat + vmat.T at the end vmat_tmp = _gga_grad_sum_(ao_mask, wv) - add_sparse(vmat[0], vmat_tmp[0], mask) - add_sparse(vmat[1], vmat_tmp[1], mask) - add_sparse(vmat[2], vmat_tmp[2], mask) + add_sparse(vmat, vmat_tmp, mask) vmat = contract('npq,qj->npj', vmat, coeff) vmat = contract('pi,npj->nij', coeff, vmat) @@ -255,6 +237,7 @@ def get_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, def _make_dR_dao_w(ao, wv): #:aow = numpy.einsum('npi,p->npi', ao[1:4], wv[0]) + ''' aow = [ numint._scale_ao(ao[1], wv[0]), # dX nabla_x numint._scale_ao(ao[2], wv[0]), # dX nabla_y @@ -272,13 +255,34 @@ def _make_dR_dao_w(ao, wv): aow[2] += numint._scale_ao(ao[6], wv[1]) # dZ nabla_x aow[2] += numint._scale_ao(ao[8], wv[2]) # dZ nabla_y aow[2] += numint._scale_ao(ao[9], wv[3]) # dZ nabla_z + ''' + assert ao.flags.c_contiguous + assert wv.flags.c_contiguous + + _, nao, ngrids = ao.shape + aow = cupy.empty([3,nao,ngrids]) + stream = cupy.cuda.get_current_stream() + err = libgdft.GDFT_make_dR_dao_w( + ctypes.cast(stream.ptr, ctypes.c_void_p), + ctypes.cast(aow.data.ptr, ctypes.c_void_p), + ctypes.cast(ao.data.ptr, ctypes.c_void_p), + ctypes.cast(wv.data.ptr, ctypes.c_void_p), + ctypes.c_int(ngrids), ctypes.c_int(nao)) + if err != 0: + raise RuntimeError('CUDA Error') return aow -def _d1_dot_(ao1, ao2): - vmat0 = cupy.dot(ao1[0], ao2) - vmat1 = cupy.dot(ao1[1], ao2) - vmat2 = cupy.dot(ao1[2], ao2) - return cupy.stack([vmat0,vmat1,vmat2]) +def _d1_dot_(ao1, ao2, out=None): + if out is None: + vmat0 = cupy.dot(ao1[0], ao2) + vmat1 = cupy.dot(ao1[1], ao2) + vmat2 = cupy.dot(ao1[2], ao2) + return cupy.stack([vmat0,vmat1,vmat2]) + else: + cupy.dot(ao1[0], ao2, out=out[0]) + cupy.dot(ao1[1], ao2, out=out[1]) + cupy.dot(ao1[2], ao2, out=out[2]) + 
return out def _gga_grad_sum_(ao, wv): #:aow = numpy.einsum('npi,np->pi', ao[:4], wv[:4]) diff --git a/gpu4pyscf/hessian/rks.py b/gpu4pyscf/hessian/rks.py index d3898d42..82269257 100644 --- a/gpu4pyscf/hessian/rks.py +++ b/gpu4pyscf/hessian/rks.py @@ -309,35 +309,35 @@ def _make_dR_rho1(ao, ao_dm0, atm_id, aoslices, xctype): ao_dm0_y = ao_dm0[2][p0:p1] ao_dm0_z = ao_dm0[3][p0:p1] # (d_X \nabla mu) dot \nalba nu DM_{mu,nu} - rho1[0,4] += cupy.einsum('ip,ip->p', ao[XX,p0:p1], ao_dm0_x) - rho1[0,4] += cupy.einsum('ip,ip->p', ao[XY,p0:p1], ao_dm0_y) - rho1[0,4] += cupy.einsum('ip,ip->p', ao[XZ,p0:p1], ao_dm0_z) - rho1[1,4] += cupy.einsum('ip,ip->p', ao[YX,p0:p1], ao_dm0_x) - rho1[1,4] += cupy.einsum('ip,ip->p', ao[YY,p0:p1], ao_dm0_y) - rho1[1,4] += cupy.einsum('ip,ip->p', ao[YZ,p0:p1], ao_dm0_z) - rho1[2,4] += cupy.einsum('ip,ip->p', ao[ZX,p0:p1], ao_dm0_x) - rho1[2,4] += cupy.einsum('ip,ip->p', ao[ZY,p0:p1], ao_dm0_y) - rho1[2,4] += cupy.einsum('ip,ip->p', ao[ZZ,p0:p1], ao_dm0_z) + rho1[0,4] += numint._contract_rho(ao[XX,p0:p1], ao_dm0_x) + rho1[0,4] += numint._contract_rho(ao[XY,p0:p1], ao_dm0_y) + rho1[0,4] += numint._contract_rho(ao[XZ,p0:p1], ao_dm0_z) + rho1[1,4] += numint._contract_rho(ao[YX,p0:p1], ao_dm0_x) + rho1[1,4] += numint._contract_rho(ao[YY,p0:p1], ao_dm0_y) + rho1[1,4] += numint._contract_rho(ao[YZ,p0:p1], ao_dm0_z) + rho1[2,4] += numint._contract_rho(ao[ZX,p0:p1], ao_dm0_x) + rho1[2,4] += numint._contract_rho(ao[ZY,p0:p1], ao_dm0_y) + rho1[2,4] += numint._contract_rho(ao[ZZ,p0:p1], ao_dm0_z) rho1[:,4] *= .5 else: raise RuntimeError ao_dm0_0 = ao_dm0[0][p0:p1] # (d_X \nabla_x mu) nu DM_{mu,nu} - rho1[:,0] = cupy.einsum('xip,ip->xp', ao[1:4,p0:p1], ao_dm0_0) - rho1[0,1]+= cupy.einsum('ip,ip->p', ao[XX,p0:p1], ao_dm0_0) - rho1[0,2]+= cupy.einsum('ip,ip->p', ao[XY,p0:p1], ao_dm0_0) - rho1[0,3]+= cupy.einsum('ip,ip->p', ao[XZ,p0:p1], ao_dm0_0) - rho1[1,1]+= cupy.einsum('ip,ip->p', ao[YX,p0:p1], ao_dm0_0) - rho1[1,2]+= cupy.einsum('ip,ip->p', 
ao[YY,p0:p1], ao_dm0_0) - rho1[1,3]+= cupy.einsum('ip,ip->p', ao[YZ,p0:p1], ao_dm0_0) - rho1[2,1]+= cupy.einsum('ip,ip->p', ao[ZX,p0:p1], ao_dm0_0) - rho1[2,2]+= cupy.einsum('ip,ip->p', ao[ZY,p0:p1], ao_dm0_0) - rho1[2,3]+= cupy.einsum('ip,ip->p', ao[ZZ,p0:p1], ao_dm0_0) + rho1[:,0] = numint._contract_rho1(ao[1:4,p0:p1], ao_dm0_0) + rho1[0,1]+= numint._contract_rho(ao[XX,p0:p1], ao_dm0_0) + rho1[0,2]+= numint._contract_rho(ao[XY,p0:p1], ao_dm0_0) + rho1[0,3]+= numint._contract_rho(ao[XZ,p0:p1], ao_dm0_0) + rho1[1,1]+= numint._contract_rho(ao[YX,p0:p1], ao_dm0_0) + rho1[1,2]+= numint._contract_rho(ao[YY,p0:p1], ao_dm0_0) + rho1[1,3]+= numint._contract_rho(ao[YZ,p0:p1], ao_dm0_0) + rho1[2,1]+= numint._contract_rho(ao[ZX,p0:p1], ao_dm0_0) + rho1[2,2]+= numint._contract_rho(ao[ZY,p0:p1], ao_dm0_0) + rho1[2,3]+= numint._contract_rho(ao[ZZ,p0:p1], ao_dm0_0) # (d_X mu) (\nabla_x nu) DM_{mu,nu} - rho1[:,1] += cupy.einsum('xip,ip->xp', ao[1:4,p0:p1], ao_dm0[1][p0:p1]) - rho1[:,2] += cupy.einsum('xip,ip->xp', ao[1:4,p0:p1], ao_dm0[2][p0:p1]) - rho1[:,3] += cupy.einsum('xip,ip->xp', ao[1:4,p0:p1], ao_dm0[3][p0:p1]) + rho1[:,1] += numint._contract_rho1(ao[1:4,p0:p1], ao_dm0[1][p0:p1]) + rho1[:,2] += numint._contract_rho1(ao[1:4,p0:p1], ao_dm0[2][p0:p1]) + rho1[:,3] += numint._contract_rho1(ao[1:4,p0:p1], ao_dm0[3][p0:p1]) # *2 for |mu> DM njp', ao, coeff[mask]) + t0 = log.init_timer() + nao_non0 = len(mask) + ao = contract('nip,ij->njp', ao_mask, coeff[mask]) rho = numint.eval_rho2(opt.mol, ao[0], mo_coeff, mo_occ, mask, xctype) + t0 = log.timer_debug1('eval rho', *t0) vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3] + t0 = log.timer_debug1('eval vxc', *t0) wv = weight * vxc[0] aow = [numint._scale_ao(ao[i], wv) for i in range(1, 4)] _d1d2_dot_(ipip, mol, aow, ao[1:4], mask, ao_loc, False) @@ -409,10 +413,14 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): rho1 = contract('xig,ig->xg', ao[1:,p0:p1,:], ao_dm0[p0:p1,:]) * 2 # aow ~ rho1 ~ d/dR1 wv = wf 
* rho1 - aow = [numint._scale_ao(ao[0], wv[i]) for i in range(3)] - _d1d2_dot_(vmat[ia], mol, ao[1:4], aow, mask, ao_loc, False) + vmat_tmp = cupy.zeros([3,3,nao_non0,nao_non0]) + aow = [numint._scale_ao(ao_mask[0], wv[i]) for i in range(3)] + _d1d2_dot_(vmat_tmp, mol, ao_mask[1:4], aow, mask, ao_loc, False) + vmat_tmp = contract('pi,xypq->xyiq', coeff[mask], vmat_tmp) + vmat_tmp = contract('qj,xyiq->xyij', coeff[mask], vmat_tmp) + vmat[ia] += vmat_tmp ao_dm0 = aow = None - + t0 = log.timer_debug1('integration', *t0) for ia in range(mol.natm): p0, p1 = aoslices[ia][2:] vmat[ia,:,:,:,p0:p1] += ipip[:,:,:,p0:p1] @@ -420,29 +428,44 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): elif xctype == 'GGA': ao_deriv = 2 comp = (ao_deriv+1)*(ao_deriv+2)*(ao_deriv+3)//6 - for ao, mask, weight, coords \ + for ao_mask, mask, weight, coords \ in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory, extra=5*comp*nao): - # TODO: improve efficiency - ao = contract('nip,ij->njp', ao, coeff[mask]) + t0 = log.init_timer() + nao_non0 = len(mask) + ao = contract('nip,ij->njp', ao_mask, coeff[mask]) rho = numint.eval_rho2(opt.mol, ao[:4], mo_coeff, mo_occ, mask, xctype) + t0 = log.timer_debug1('eval rho', *t0) vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3] + t0 = log.timer_debug1('eval vxc', *t0) wv = weight * vxc wv[0] *= .5 aow = rks_grad._make_dR_dao_w(ao, wv) _d1d2_dot_(ipip, mol, aow, ao[1:4], mask, ao_loc, False) ao_dm0 = [numint._dot_ao_dm(mol, ao[i], dm0, mask, shls_slice, ao_loc) for i in range(4)] wf = weight * fxc + for ia in range(mol.natm): dR_rho1 = _make_dR_rho1(ao, ao_dm0, ia, aoslices, xctype) wv = contract('xyg,sxg->syg', wf, dR_rho1) wv[:,0] *= .5 + ''' for i in range(3): aow = rks_grad._make_dR_dao_w(ao, wv[i]) vmat[ia,i] += rks_grad._d1_dot_(aow, ao[0].T) aow = [numint._scale_ao(ao[:4], wv[i,:4]) for i in range(3)] _d1d2_dot_(vmat[ia], mol, ao[1:4], aow, mask, ao_loc, False) + ''' + vmat_tmp = cupy.empty([3,3,nao_non0,nao_non0]) + for i 
in range(3): + aow = rks_grad._make_dR_dao_w(ao_mask, wv[i]) + rks_grad._d1_dot_(aow, ao_mask[0].T, out=vmat_tmp[i]) + aow = [numint._scale_ao(ao_mask[:4], wv[i,:4]) for i in range(3)] + _d1d2_dot_(vmat_tmp, mol, ao_mask[1:4], aow, mask, ao_loc, False) + vmat_tmp = contract('pi,xypq->xyiq', coeff[mask], vmat_tmp) + vmat_tmp = contract('qj,xyiq->xyij', coeff[mask], vmat_tmp) + vmat[ia] += vmat_tmp ao_dm0 = aow = None - + t0 = log.timer_debug1('integration', *t0) for ia in range(mol.natm): p0, p1 = aoslices[ia][2:] vmat[ia,:,:,:,p0:p1] += ipip[:,:,:,p0:p1] @@ -453,11 +476,15 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): YX, YY, YZ = 5, 7, 8 ZX, ZY, ZZ = 6, 8, 9 ao_deriv = 2 - for ao, mask, weight, coords \ + for ao_mask, mask, weight, coords \ in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory): - ao = contract('nip,ij->njp', ao, coeff[mask]) + t0 = log.init_timer() + nao_non0 = len(mask) + ao = contract('nip,ij->njp', ao_mask, coeff[mask]) rho = numint.eval_rho2(opt.mol, ao[:10], mo_coeff, mo_occ, mask, xctype) + t0 = log.timer_debug1('eval rho', *t0) vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3] + t0 = log.timer_debug1('eval vxc', *t0) wv = weight * vxc wv[0] *= .5 wv[4] *= .25 @@ -476,20 +503,28 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): wv = contract('xyg,sxg->syg', wf, dR_rho1) wv[:,0] *= .5 wv[:,4] *= .5 # for the factor 1/2 in tau + ''' for i in range(3): aow = rks_grad._make_dR_dao_w(ao, wv[i]) vmat[ia,i] += rks_grad._d1_dot_(aow, ao[0].T) - - aow = [numint._scale_ao(ao[:4], wv[i,:4]) for i in range(3)] - _d1d2_dot_(vmat[ia], mol, ao[1:4], aow, mask, ao_loc, False) - - aow = [numint._scale_ao(ao[1], wv[i,4]) for i in range(3)] - _d1d2_dot_(vmat[ia], mol, [ao[XX], ao[XY], ao[XZ]], aow, mask, ao_loc, False) - aow = [numint._scale_ao(ao[2], wv[i,4]) for i in range(3)] - _d1d2_dot_(vmat[ia], mol, [ao[YX], ao[YY], ao[YZ]], aow, mask, ao_loc, False) - aow = [numint._scale_ao(ao[3], wv[i,4]) for i in 
range(3)] - _d1d2_dot_(vmat[ia], mol, [ao[ZX], ao[ZY], ao[ZZ]], aow, mask, ao_loc, False) - + ''' + vmat_tmp = cupy.empty([3,3,nao_non0,nao_non0]) + for i in range(3): + aow = rks_grad._make_dR_dao_w(ao_mask, wv[i]) + rks_grad._d1_dot_(aow, ao_mask[0].T, out=vmat_tmp[i]) + aow = [numint._scale_ao(ao_mask[:4], wv[i,:4]) for i in range(3)] + _d1d2_dot_(vmat_tmp, mol, ao_mask[1:4], aow, mask, ao_loc, False) + + aow = [numint._scale_ao(ao_mask[1], wv[i,4]) for i in range(3)] + _d1d2_dot_(vmat_tmp, mol, [ao_mask[XX], ao_mask[XY], ao_mask[XZ]], aow, mask, ao_loc, False) + aow = [numint._scale_ao(ao_mask[2], wv[i,4]) for i in range(3)] + _d1d2_dot_(vmat_tmp, mol, [ao_mask[YX], ao_mask[YY], ao_mask[YZ]], aow, mask, ao_loc, False) + aow = [numint._scale_ao(ao_mask[3], wv[i,4]) for i in range(3)] + _d1d2_dot_(vmat_tmp, mol, [ao_mask[ZX], ao_mask[ZY], ao_mask[ZZ]], aow, mask, ao_loc, False) + vmat_tmp = contract('pi,xypq->xyiq', coeff[mask], vmat_tmp) + vmat_tmp = contract('qj,xyiq->xyij', coeff[mask], vmat_tmp) + vmat[ia] += vmat_tmp + t0 = log.timer_debug1('integration', *t0) for ia in range(mol.natm): p0, p1 = aoslices[ia][2:] vmat[ia,:,:,:,p0:p1] += ipip[:,:,:,p0:p1] @@ -500,6 +535,7 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory): mol = hessobj.mol mf = hessobj.base + log = logger.new_logger(mol, mol.verbose) if hessobj.grids is not None: grids = hessobj.grids else: @@ -533,9 +569,12 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory): ao_deriv = 1 for ao, mask, weight, coords \ in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory): + t0 = log.init_timer() ao = contract('nip,ij->njp', ao, coeff[mask]) rho = numint.eval_rho2(opt.mol, ao[0], mo_coeff, mo_occ, mask, xctype) + t0 = log.timer_debug1('eval rho', *t0) vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3] + t0 = log.timer_debug1('eval vxc', *t0) wv = weight * vxc[0] aow = numint._scale_ao(ao[0], wv) v_ip += 
rks_grad._d1_dot_(ao[1:4], aow.T) @@ -550,15 +589,17 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory): aow = [numint._scale_ao(ao[0], wv[i]) for i in range(3)] vmat[ia] += rks_grad._d1_dot_(aow, ao[0].T) ao_dm0 = aow = None - + t0 = log.timer_debug1('integration', *t0) elif xctype == 'GGA': ao_deriv = 2 for ao, mask, weight, coords \ in ni.block_loop(mol, grids, nao, ao_deriv, max_memory): - # TODO: improve efficiency + t0 = log.init_timer() ao = contract('nip,ij->njp', ao, coeff[mask]) rho = numint.eval_rho2(mol, ao[:4], mo_coeff, mo_occ, mask, xctype) + t0 = log.timer_debug1('eval rho', *t0) vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3] + t0 = log.timer_debug1('eval vxc', *t0) wv = weight * vxc wv[0] *= .5 v_ip += rks_grad._gga_grad_sum_(ao, wv) @@ -572,17 +613,20 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory): wv[:,0] *= .5 aow = [numint._scale_ao(ao[:4], wv[i,:4]) for i in range(3)] vmat[ia] += rks_grad._d1_dot_(aow, ao[0].T) + t0 = log.timer_debug1('integration', *t0) ao_dm0 = aow = None - # TODO: debug and test elif xctype == 'MGGA': if grids.level < 5: logger.warn(mol, 'MGGA Hessian is sensitive to dft grids.') ao_deriv = 2 for ao, mask, weight, coords \ in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory): + t0 = log.init_timer() ao = contract('nip,ij->njp', ao, coeff[mask]) rho = numint.eval_rho2(opt.mol, ao[:10], mo_coeff, mo_occ, mask, xctype) + t0 = log.timer_debug1('eval rho', *t0) vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3] + t0 = log.timer_debug1('eval vxc', *t0) wv = weight * vxc wv[0] *= .5 wv[4] *= .5 # for the factor 1/2 in tau @@ -602,7 +646,7 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory): aow = [numint._scale_ao(ao[j], wv[i,4]) for i in range(3)] vmat[ia] += rks_grad._d1_dot_(aow, ao[j].T) ao_dm0 = aow = None - + t0 = log.timer_debug1('integration', *t0) for ia in range(mol.natm): p0, p1 = aoslices[ia][2:] vmat[ia,:,p0:p1] += v_ip[:,p0:p1] diff --git 
a/gpu4pyscf/lib/cupy_helper.py b/gpu4pyscf/lib/cupy_helper.py index 2285b9dd..337d2c6a 100644 --- a/gpu4pyscf/lib/cupy_helper.py +++ b/gpu4pyscf/lib/cupy_helper.py @@ -142,17 +142,25 @@ def unpack_sparse(cderi_sparse, row, col, p0, p1, nao, out=None, stream=None): def add_sparse(a, b, indices): ''' - a[np.ix_(indices, indices)] += b + a[..., np.ix_(indices, indices)] += b ''' - n = a.shape[0] - m = b.shape[0] - + assert a.flags.c_contiguous + assert b.flags.c_contiguous + n = a.shape[-1] + m = b.shape[-1] + if a.ndim > 2: + count = np.prod(a.shape[:-2]) + elif a.ndim == 2: + count = 1 + else: + raise RuntimeError('add_sparse only supports 2d or 3d tensor') err = libcupy_helper.add_sparse( ctypes.cast(a.data.ptr, ctypes.c_void_p), ctypes.cast(b.data.ptr, ctypes.c_void_p), ctypes.cast(indices.data.ptr, ctypes.c_void_p), ctypes.c_int(n), - ctypes.c_int(m) + ctypes.c_int(m), + ctypes.c_int(count) ) if err != 0: raise RecursionError('failed in sparse_add2d') diff --git a/gpu4pyscf/lib/cupy_helper/add_sparse.cu b/gpu4pyscf/lib/cupy_helper/add_sparse.cu index eddbf92a..d8033015 100644 --- a/gpu4pyscf/lib/cupy_helper/add_sparse.cu +++ b/gpu4pyscf/lib/cupy_helper/add_sparse.cu @@ -22,8 +22,8 @@ #define THREADS 32 #define BLOCK_DIM 32 -__global__ -void _add_sparse(double *a, double *b, int *indices, int n, int m) +__global__ +void _add_sparse(double *a, double *b, int *indices, int n, int m, int count) { int row = blockIdx.x * BLOCK_DIM + threadIdx.x; int col = blockIdx.y * BLOCK_DIM + threadIdx.y; @@ -32,17 +32,18 @@ void _add_sparse(double *a, double *b, int *indices, int n, int m) } int idx_a = indices[row] * n + indices[col]; int idx_b = row * m + col; - - a[idx_a] += b[idx_b]; + for (int i = 0; i < count; i++){ + a[idx_a + i*n*n] += b[idx_b + i*m*m]; + } } extern "C" { __host__ -int add_sparse(double *a, double *b, int *indices, int n, int m){ +int add_sparse(double *a, double *b, int *indices, int n, int m, int count){ int ntile = (m + THREADS - 1) / THREADS; dim3
threads(THREADS, THREADS); dim3 blocks(ntile, ntile); - _add_sparse<<>>(a, b, indices, n, m); + _add_sparse<<>>(a, b, indices, n, m, count); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { return 1; diff --git a/gpu4pyscf/lib/cutensor.py b/gpu4pyscf/lib/cutensor.py index c590b7f6..aca3b082 100644 --- a/gpu4pyscf/lib/cutensor.py +++ b/gpu4pyscf/lib/cutensor.py @@ -19,6 +19,7 @@ from cupyx import cutensor from cupy_backends.cuda.libs import cutensor as cutensor_backend from cupy_backends.cuda.libs.cutensor import Handle +from gpu4pyscf.lib import logger libcutensor = None for lib_path in _preload_libs['cutensor']: @@ -31,6 +32,8 @@ _handle = Handle() _modes = {} _contraction_descriptors = {} +_contraction_plans = {} +_contraction_finds = {} cutensor_backend.init(_handle) @@ -82,10 +85,25 @@ def create_contraction_descriptor(handle, return desc def create_contraction_find(handle, algo=cutensor_backend.ALGO_DEFAULT): - find = cutensor_backend.ContractionFind() - cutensor_backend.initContractionFind(handle, find, algo) + key = (handle.ptr, algo) + if key in _contraction_finds: + find = _contraction_finds[key] + else: + find = cutensor_backend.ContractionFind() + cutensor_backend.initContractionFind(handle, find, algo) + _contraction_finds[key] = find return find +def create_contraction_plan(handle, desc, find, ws_size): + key = (handle.ptr, desc.ptr, find.ptr, ws_size) + if key in _contraction_plans: + plan = _contraction_plans[key] + else: + plan = cutensor_backend.ContractionPlan() + cutensor_backend.initContractionPlan(handle, plan, desc, find, ws_size) + _contraction_plans[key] = plan + return plan + def contraction(pattern, a, b, alpha, beta, out=None): pattern = pattern.replace(" ", "") str_a, rest = pattern.split(',') @@ -121,14 +139,15 @@ def contraction(pattern, a, b, alpha, beta, out=None): ws_size = cutensor_backend.contractionGetWorkspaceSize(_handle, desc, find, cutensor_backend.WORKSPACE_MIN) ws = cupy.empty(ws_size, dtype=np.int8) - plan = 
cutensor_backend.ContractionPlan() - cutensor_backend.initContractionPlan(_handle, plan, desc, find, ws_size) + plan = create_contraction_plan(_handle, desc, find, ws_size) alpha = np.asarray(alpha) beta = np.asarray(beta) + cutensor_backend.contraction(_handle, plan, alpha.ctypes.data, a.data.ptr, b.data.ptr, beta.ctypes.data, c.data.ptr, out.data.ptr, ws.data.ptr, ws_size) + return out import os diff --git a/gpu4pyscf/lib/gdft/CMakeLists.txt b/gpu4pyscf/lib/gdft/CMakeLists.txt index fc32786c..9aef39b0 100644 --- a/gpu4pyscf/lib/gdft/CMakeLists.txt +++ b/gpu4pyscf/lib/gdft/CMakeLists.txt @@ -15,7 +15,7 @@ #set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -arch=sm_80 --ptxas-options=-v") -add_library(gdft SHARED +add_library(gdft SHARED nr_eval_gto.cu contract_rho.cu gen_grids.cu diff --git a/gpu4pyscf/lib/gdft/contract_rho.cu b/gpu4pyscf/lib/gdft/contract_rho.cu index 4d1dc0b2..1957928e 100644 --- a/gpu4pyscf/lib/gdft/contract_rho.cu +++ b/gpu4pyscf/lib/gdft/contract_rho.cu @@ -17,6 +17,12 @@ * along with this program. If not, see . */ +#include +#include +#include +#include +#include +#include #include "contract_rho.cuh" // TODO: improve this? 
__global__ @@ -26,20 +32,20 @@ void GDFTcontract_rho_kernel(double *rho, double *bra, double *ket, int ngrids, const bool active = grid_id < ngrids; size_t Ngrids = ngrids; - int ao_id; double v = 0; if (active){ - for (ao_id = threadIdx.y; ao_id < nao; ao_id += BLKSIZEY) { - v += bra[grid_id + ao_id * Ngrids] * ket[grid_id + ao_id * Ngrids]; + for (int ao_id = threadIdx.y; ao_id < nao; ao_id += BLKSIZEY) { + int ket_idx = grid_id + ao_id * Ngrids; + v += bra[ket_idx] * ket[ket_idx]; } } - + __shared__ double buf[BLKSIZEX*(BLKSIZEY+1)]; int ix = threadIdx.x; int iy = threadIdx.y; int ixy = ix + BLKSIZEX * iy; buf[ixy] = v; __syncthreads(); - // assume block dim = 32 x 32 + if (blockDim.y >= 32 && iy < 16) buf[ixy] += buf[ixy + BLKSIZEX * 16]; __syncthreads(); if (blockDim.y >= 16 && iy < 8) buf[ixy] += buf[ixy + BLKSIZEX * 8]; __syncthreads(); if (blockDim.y >= 8 && iy < 4) buf[ixy] += buf[ixy + BLKSIZEX * 4]; __syncthreads(); @@ -51,6 +57,162 @@ void GDFTcontract_rho_kernel(double *rho, double *bra, double *ket, int ngrids, } } +__global__ +void GDFTcontract_rho4_kernel(double *rho, double *bra, double *ket, int ngrids, int nao, int count) +{ + int grid_id = blockIdx.x * blockDim.x + threadIdx.x; + const bool active = grid_id < ngrids; + size_t ket_stride = nao * ngrids; + size_t rho_stride = count * ngrids; + + __shared__ double buf[BLKSIZEX*(BLKSIZEY+1)]; + + for (int ia = 0; ia < count; ia++){ + double v[4] = {0.0, 0.0, 0.0, 0.0}; + if (active){ + for (int ao_id = threadIdx.y; ao_id < nao; ao_id += BLKSIZEY) { + int ket_idx = grid_id + ao_id * ngrids; + double bra_tmp = bra[ket_idx + ia * ket_stride]; + v[0] += bra_tmp * ket[0*ket_stride + ket_idx]; + v[1] += bra_tmp * ket[1*ket_stride + ket_idx]; + v[2] += bra_tmp * ket[2*ket_stride + ket_idx]; + v[3] += bra_tmp * ket[3*ket_stride + ket_idx]; + } + } + + int ix = threadIdx.x; + int iy = threadIdx.y; + int ixy = ix + BLKSIZEX * iy; + for (int i = 0; i < 4; i++){ + buf[ixy] = v[i]; __syncthreads(); + if 
(blockDim.y >= 32 && iy < 16) buf[ixy] += buf[ixy + BLKSIZEX * 16]; __syncthreads(); + if (blockDim.y >= 16 && iy < 8) buf[ixy] += buf[ixy + BLKSIZEX * 8]; __syncthreads(); + if (blockDim.y >= 8 && iy < 4) buf[ixy] += buf[ixy + BLKSIZEX * 4]; __syncthreads(); + if (blockDim.y >= 4 && iy < 2) buf[ixy] += buf[ixy + BLKSIZEX * 2]; __syncthreads(); + if (blockDim.y >= 2 && iy < 1) buf[ixy] += buf[ixy + BLKSIZEX * 1]; __syncthreads(); + + if (iy == 0 && active) { + rho[grid_id + ia * ngrids + rho_stride * i] = buf[ix]; + } + } + } +} + +__global__ +void GDFTcontract_rho_gga_kernel(double *rho, double *bra, double *ket, int ngrids, int nao) +{ + int grid_id = blockIdx.x * blockDim.x + threadIdx.x; + const bool active = grid_id < ngrids; + + size_t Ngrids = ngrids; + size_t ket_stride = nao * ngrids; + + double v[4] = {0.0, 0.0, 0.0, 0.0}; + if (active){ + for (int ao_id = threadIdx.y; ao_id < nao; ao_id += BLKSIZEY) { + int ket_idx = grid_id + ao_id * Ngrids; + double bra_tmp = bra[ket_idx]; + double ket_tmp = ket[ket_idx]; + + v[0] += bra_tmp * ket_tmp; + + ket_idx += ket_stride; + v[1] += bra_tmp * ket[ket_idx]; + v[1] += ket_tmp * bra[ket_idx]; + + ket_idx += ket_stride; + v[2] += bra_tmp * ket[ket_idx]; + v[2] += ket_tmp * bra[ket_idx]; + + ket_idx += ket_stride; + v[3] += bra_tmp * ket[ket_idx]; + v[3] += ket_tmp * bra[ket_idx]; + } + } + + __shared__ double buf[BLKSIZEX*(BLKSIZEY+1)]; + int ix = threadIdx.x; + int iy = threadIdx.y; + int ixy = ix + BLKSIZEX * iy; + + for (int i = 0; i < 4; i++){ + buf[ixy] = v[i]; __syncthreads(); + if (blockDim.y >= 32 && iy < 16) buf[ixy] += buf[ixy + BLKSIZEX * 16]; __syncthreads(); + if (blockDim.y >= 16 && iy < 8) buf[ixy] += buf[ixy + BLKSIZEX * 8]; __syncthreads(); + if (blockDim.y >= 8 && iy < 4) buf[ixy] += buf[ixy + BLKSIZEX * 4]; __syncthreads(); + if (blockDim.y >= 4 && iy < 2) buf[ixy] += buf[ixy + BLKSIZEX * 2]; __syncthreads(); + if (blockDim.y >= 2 && iy < 1) buf[ixy] += buf[ixy + BLKSIZEX * 1]; __syncthreads(); + + 
if (iy == 0 && active) { + rho[grid_id + ngrids * i] = 2.0 * buf[ix]; + } + } +} + + +__global__ +void GDFTcontract_rho_mgga_kernel(double *rho, double *bra, double *ket, int ngrids, int nao) +{ + int grid_id = blockIdx.x * blockDim.x + threadIdx.x; + const bool active = grid_id < ngrids; + + size_t Ngrids = ngrids; + size_t ket_stride = nao * ngrids; + + double v[5] = {0.0, 0.0, 0.0, 0.0, 0.0}; + if (active){ + for (int ao_id = threadIdx.y; ao_id < nao; ao_id += BLKSIZEY) { + int ket_idx = grid_id + ao_id * Ngrids; + double bra_tmp0 = bra[ket_idx]; + double ket_tmp0 = ket[ket_idx]; + + v[0] += bra_tmp0 * ket_tmp0; + + ket_idx += ket_stride; + double bra_tmp1 = bra[ket_idx]; + double ket_tmp1 = ket[ket_idx]; + v[1] += bra_tmp0 * ket_tmp1; + v[1] += ket_tmp0 * bra_tmp1; + v[4] += bra_tmp1 * ket_tmp1; + + ket_idx += ket_stride; + bra_tmp1 = bra[ket_idx]; + ket_tmp1 = ket[ket_idx]; + v[2] += bra_tmp0 * ket_tmp1; + v[2] += ket_tmp0 * bra_tmp1; + v[4] += bra_tmp1 * ket_tmp1; + + ket_idx += ket_stride; + bra_tmp1 = bra[ket_idx]; + ket_tmp1 = ket[ket_idx]; + v[3] += bra_tmp0 * ket_tmp1; + v[3] += ket_tmp0 * bra_tmp1; + v[4] += bra_tmp1 * ket_tmp1; + + } + } + + v[4] *= 0.5; + + __shared__ double buf[BLKSIZEX*(BLKSIZEY+1)]; + int ix = threadIdx.x; + int iy = threadIdx.y; + int ixy = ix + BLKSIZEX * iy; + + for (int i = 0; i < 5; i++){ + buf[ixy] = v[i]; __syncthreads(); + if (blockDim.y >= 32 && iy < 16) buf[ixy] += buf[ixy + BLKSIZEX * 16]; __syncthreads(); + if (blockDim.y >= 16 && iy < 8) buf[ixy] += buf[ixy + BLKSIZEX * 8]; __syncthreads(); + if (blockDim.y >= 8 && iy < 4) buf[ixy] += buf[ixy + BLKSIZEX * 4]; __syncthreads(); + if (blockDim.y >= 4 && iy < 2) buf[ixy] += buf[ixy + BLKSIZEX * 2]; __syncthreads(); + if (blockDim.y >= 2 && iy < 1) buf[ixy] += buf[ixy + BLKSIZEX * 1]; __syncthreads(); + + if (iy == 0 && active) { + rho[grid_id + ngrids * i] = 2.0 * buf[ix]; + } + } +} + __global__ void GDFTscale_ao_kernel(double *out, double *ket, double *wv, int ngrids, 
int nao, int nvar) @@ -71,3 +233,130 @@ void GDFTscale_ao_kernel(double *out, double *ket, double *wv, } out[ixy] = val; } + +__global__ +void GDFT_make_dR_dao_w_kernel(double *out, double *ket, double *wv, + int ngrids, int nao) +{ + int grid_id = blockIdx.x * blockDim.x + threadIdx.x; + int ao_id = blockIdx.y * blockDim.y + threadIdx.y; + if (grid_id >= ngrids || ao_id >= nao) { + return; + } + + size_t Ngrids = ngrids; + size_t Nag = nao * Ngrids; + size_t ixy = grid_id + ao_id * Ngrids; + + double wv0 = wv[grid_id + ngrids * 0]; + double wv1 = wv[grid_id + ngrids * 1]; + double wv2 = wv[grid_id + ngrids * 2]; + double wv3 = wv[grid_id + ngrids * 3]; + + double ket5 = ket[ixy + Nag * 5]; + double ket6 = ket[ixy + Nag * 6]; + double val; + val = ket[ixy + Nag * 1] * wv0; + val+= ket[ixy + Nag * 4] * wv1; + val+= ket5 * wv2; + val+= ket6 * wv3; + out[ixy + Nag * 0] = val; + + double ket8 = ket[ixy + Nag * 8]; + val = ket[ixy + Nag * 2] * wv0; + val+= ket5 * wv1; + val+= ket[ixy + Nag * 7] * wv2; + val+= ket8 * wv3; + out[ixy + Nag * 1] = val; + + val = ket[ixy + Nag * 3] * wv0; + val+= ket6 * wv1; + val+= ket8 * wv2; + val+= ket[ixy + Nag * 9] * wv3; + out[ixy + Nag * 2] = val; +} + + +extern "C"{ +__host__ +int GDFTcontract_rho(cudaStream_t stream, double *rho, double *bra, double *ket, int ngrids, int nao) +{ + dim3 threads(BLKSIZEX, BLKSIZEY); + dim3 blocks((ngrids+BLKSIZEX-1)/BLKSIZEX); + GDFTcontract_rho_kernel<<<blocks, threads, 0, stream>>>(rho, bra, ket, ngrids, nao); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA Error of GDFTcontract_rho: %s\n", cudaGetErrorString(err)); + return 1; + } + return 0; +} + +int GDFTcontract_rho4(cudaStream_t stream, double *rho, double *bra, double *ket, int ngrids, int nao, int count) +{ + dim3 threads(BLKSIZEX, BLKSIZEY); + dim3 blocks((ngrids+BLKSIZEX-1)/BLKSIZEX); + GDFTcontract_rho4_kernel<<<blocks, threads, 0, stream>>>(rho, bra, ket, ngrids, nao, count); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + 
fprintf(stderr, "CUDA Error of GDFTcontract_rho4: %s\n", cudaGetErrorString(err)); + return 1; + } + return 0; +} + +int GDFTcontract_rho_gga(cudaStream_t stream, double *rho, double *bra, double *ket, int ngrids, int nao) +{ + dim3 threads(BLKSIZEX, BLKSIZEY); + dim3 blocks((ngrids+BLKSIZEX-1)/BLKSIZEX); + GDFTcontract_rho_gga_kernel<<<blocks, threads, 0, stream>>>(rho, bra, ket, ngrids, nao); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA Error of GDFTcontract_rho_gga: %s\n", cudaGetErrorString(err)); + return 1; + } + return 0; +} + +int GDFTcontract_rho_mgga(cudaStream_t stream, double *rho, double *bra, double *ket, int ngrids, int nao) +{ + dim3 threads(BLKSIZEX, BLKSIZEY); + dim3 blocks((ngrids+BLKSIZEX-1)/BLKSIZEX); + GDFTcontract_rho_mgga_kernel<<<blocks, threads, 0, stream>>>(rho, bra, ket, ngrids, nao); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA Error of GDFTcontract_rho_mgga: %s\n", cudaGetErrorString(err)); + return 1; + } + return 0; +} + +int GDFT_make_dR_dao_w(cudaStream_t stream, double *out, double *ket, double *wv, + int ngrids, int nao) +{ + dim3 threads(BLKSIZEX, BLKSIZEY); + dim3 blocks((ngrids+BLKSIZEX-1)/BLKSIZEX, (nao+BLKSIZEY-1)/BLKSIZEY); + GDFT_make_dR_dao_w_kernel<<<blocks, threads, 0, stream>>>(out, ket, wv, ngrids, nao); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA Error of GDFT_make_dR_dao_w: %s\n", cudaGetErrorString(err)); + return 1; + } + return 0; +} + +int GDFTscale_ao(cudaStream_t stream, double *out, double *ket, double *wv, + int ngrids, int nao, int nvar) +{ + dim3 threads(BLKSIZEX, BLKSIZEY); + dim3 blocks((ngrids+BLKSIZEX-1)/BLKSIZEX, (nao+BLKSIZEY-1)/BLKSIZEY); + GDFTscale_ao_kernel<<<blocks, threads, 0, stream>>>(out, ket, wv, ngrids, nao, nvar); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA Error of GDFTscale_ao: %s\n", cudaGetErrorString(err)); + return 1; + } + return 0; +} + +} \ No newline at end of file diff --git 
a/gpu4pyscf/lib/gdft/nr_eval_gto.cu b/gpu4pyscf/lib/gdft/nr_eval_gto.cu index b87ca434..da59b1c9 100644 --- a/gpu4pyscf/lib/gdft/nr_eval_gto.cu +++ b/gpu4pyscf/lib/gdft/nr_eval_gto.cu @@ -1640,31 +1640,4 @@ int GDFTeval_gto(cudaStream_t stream, double *ao, int deriv, int cart, //FREE(d_grids); return 0; } - -int GDFTcontract_rho(cudaStream_t stream, double *rho, double *bra, double *ket, int ngrids, int nao) -{ - dim3 threads(BLKSIZEX, BLKSIZEY); - dim3 blocks((ngrids+BLKSIZEX-1)/BLKSIZEX); - GDFTcontract_rho_kernel<<>>(rho, bra, ket, ngrids, nao); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - fprintf(stderr, "CUDA Error of GDFTcontract_rho: %s\n", cudaGetErrorString(err)); - return 1; - } - return 0; -} - -int GDFTscale_ao(cudaStream_t stream, double *out, double *ket, double *wv, - int ngrids, int nao, int nvar) -{ - dim3 threads(BLKSIZEX, BLKSIZEY); - dim3 blocks((ngrids+BLKSIZEX-1)/BLKSIZEX, (nao+BLKSIZEY-1)/BLKSIZEY); - GDFTscale_ao_kernel<<>>(out, ket, wv, ngrids, nao, nvar); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - fprintf(stderr, "CUDA Error of GDFTscale_ao: %s\n", cudaGetErrorString(err)); - return 1; - } - return 0; -} } diff --git a/gpu4pyscf/lib/logger.py b/gpu4pyscf/lib/logger.py index 58c3f45f..60497816 100644 --- a/gpu4pyscf/lib/logger.py +++ b/gpu4pyscf/lib/logger.py @@ -25,6 +25,8 @@ WARN = lib.logger.WARN DEBUG = lib.logger.DEBUG DEBUG1= lib.logger.DEBUG1 +TIMER_LEVEL = lib.logger.TIMER_LEVEL +flush = lib.logger.flush if sys.version_info < (3, 0): process_clock = time.clock @@ -33,27 +35,66 @@ process_clock = time.process_time perf_counter = time.perf_counter -def _timer_debug1(rec, msg, cpu0=None, wall0=None, sync=True): + +def init_timer(rec): + if rec.verbose >= TIMER_LEVEL: + e0 = cupy.cuda.Event() + e0.record() + return (process_clock(), perf_counter(), e0) + elif rec.verbose >= DEBUG: + return (process_clock(), perf_counter()) + else: + return process_clock(), + +def timer(rec, msg, 
cpu0=None, wall0=None, gpu0=None): + if cpu0 is None: + cpu0 = rec._t0 + if wall0 and gpu0: + rec._t0, rec._w0, rec._e0 = process_clock(), perf_counter(), cupy.cuda.Event() + if rec.verbose >= TIMER_LEVEL: + rec._e0.record() + rec._e0.synchronize() + flush(rec, ' CPU time for %20s %9.2f sec, wall time %9.2f sec, GPU time for %9.2f ms' + % (msg, rec._t0-cpu0, rec._w0-wall0, cupy.cuda.get_elapsed_time(gpu0,rec._e0))) + return rec._t0, rec._w0, rec._e0 + elif wall0: + rec._t0, rec._w0 = process_clock(), perf_counter() + if rec.verbose >= TIMER_LEVEL: + flush(rec, ' CPU time for %20s %9.2f sec, wall time %9.2f sec' + % (msg, rec._t0-cpu0, rec._w0-wall0)) + return rec._t0, rec._w0 + else: + rec._t0 = process_clock() + if rec.verbose >= TIMER_LEVEL: + flush(rec, ' CPU time for %20s %9.2f sec' % (msg, rec._t0-cpu0)) + return rec._t0, + +def _timer_debug1(rec, msg, cpu0=None, wall0=None, gpu0=None, sync=True): if rec.verbose >= DEBUG1: - if(sync): cupy.cuda.stream.get_current_stream().synchronize() - return timer(rec, msg, cpu0, wall0) + return timer(rec, msg, cpu0, wall0, gpu0) + elif wall0 and gpu0: + rec._t0, rec._w0, rec._e0 = process_clock(), perf_counter(), cupy.cuda.Event() + rec._e0.record() + return rec._t0, rec._w0, rec._e0 elif wall0: rec._t0, rec._w0 = process_clock(), perf_counter() return rec._t0, rec._w0 else: rec._t0 = process_clock() - return rec._t0 + return rec._t0, info = lib.logger.info debug = lib.logger.debug debug1 = lib.logger.debug1 -timer = lib.logger.timer +debug2 = lib.logger.debug2 timer_debug1 = _timer_debug1 class Logger(lib.logger.Logger): def __init__(self, stdout=sys.stdout, verbose=NOTE): super().__init__(stdout=stdout, verbose=verbose) timer_debug1 = _timer_debug1 + timer = timer + init_timer = init_timer def new_logger(rec=None, verbose=None): '''Create and return a :class:`Logger` object diff --git a/gpu4pyscf/scf/hf.py b/gpu4pyscf/scf/hf.py index 1d58a02e..8e1a9855 100644 --- a/gpu4pyscf/scf/hf.py +++ b/gpu4pyscf/scf/hf.py @@ -25,11 
+25,11 @@ from functools import reduce from pyscf import gto from pyscf import lib as pyscf_lib -from pyscf.lib import logger from pyscf.scf import hf, jk, _vhf from gpu4pyscf import lib from gpu4pyscf.lib.cupy_helper import eigh, load_library, tag_array from gpu4pyscf.scf import diis +from gpu4pyscf.lib import logger LMAX_ON_GPU = 4 FREE_CUPY_CACHE = True @@ -40,8 +40,8 @@ def get_jk(mol, dm, hermi=1, vhfopt=None, with_j=True, with_k=True, omega=None, verbose=None): '''Compute J, K matrices with CPU-GPU hybrid algorithm ''' - cput0 = (logger.process_clock(), logger.perf_counter()) log = logger.new_logger(mol, verbose) + cput0 = log.init_timer() if hermi != 1: raise NotImplementedError('JK-builder only supports hermitian density matrix') if omega is None: @@ -253,8 +253,8 @@ def _get_jk(mf, mol=None, dm=None, hermi=1, with_j=True, with_k=True, if omega is not None: assert omega >= 0 - cput0 = (logger.process_clock(), logger.perf_counter()) log = logger.new_logger(mf) + cput0 = log.init_timer() log.debug3('apply get_jk on gpu') if omega is None: if hasattr(mf, '_opt_gpu'): @@ -369,9 +369,9 @@ def _kernel(mf, conv_tol=1e-10, conv_tol_grad=None, dump_chk=True, dm0=None, callback=None, conv_check=True, **kwargs): conv_tol = mf.conv_tol mol = mf.mol - t0 = (logger.process_clock(), logger.perf_counter()) verbose = mf.verbose log = logger.new_logger(mol, verbose) + t0 = log.init_timer() if(conv_tol_grad is None): conv_tol_grad = conv_tol**.5 logger.info(mf, 'Set gradient conv threshold to %g', conv_tol_grad) @@ -415,7 +415,7 @@ def _kernel(mf, conv_tol=1e-10, conv_tol_grad=None, t_beg = time.time() for cycle in range(mf.max_cycle): - t0 = (logger.process_clock(), logger.perf_counter()) + t0 = log.init_timer() dm_last = dm last_hf_e = e_tot @@ -575,7 +575,7 @@ class RHF(hf.RHF): quad_moment = _quad_moment def scf(self, dm0=None, **kwargs): - cput0 = (logger.process_clock(), logger.perf_counter()) + cput0 = logger.init_timer(self) self.dump_flags() self.build(self.mol) @@ 
-630,8 +630,8 @@ def __init__(self, mol, intor, prescreen='CVHFnoscreen', self._dmcondname = dmcondname def build(self, cutoff=1e-13, group_size=None, diag_block_with_triu=False): - cput0 = (logger.process_clock(), logger.perf_counter()) mol = self.mol + cput0 = logger.init_timer(mol) # Sort basis according to angular momentum and contraction patterns so # as to group the basis functions to blocks in GPU kernel. l_ctrs = mol._bas[:,[gto.ANG_OF, gto.NPRIM_OF]]