diff --git a/examples/18-to_gpu0.py b/examples/18-to_gpu0.py new file mode 100644 index 00000000..d017047a --- /dev/null +++ b/examples/18-to_gpu0.py @@ -0,0 +1,46 @@ +# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import pyscf +from pyscf import lib +from pyscf.dft import rks +lib.num_threads(8) + +atom = ''' +O 0.0000000000 -0.0000000000 0.1174000000 +H -0.7570000000 -0.0000000000 -0.4696000000 +H 0.7570000000 0.0000000000 -0.4696000000 +''' + +mol = pyscf.M(atom=atom, basis='def2-tzvpp', max_memory=32000, output='./pyscf.log') + +mol.verbose = 4 +mf = rks.RKS(mol, xc='B3LYP').density_fit() +mf_GPU = mf.to_gpu() + +# Compute Energy +e_dft = mf_GPU.kernel() +print(f"total energy = {e_dft}") + +# Compute Gradient +g = mf_GPU.nuc_grad_method() +g.max_memory = 20000 +g.auxbasis_response = True +g_dft = g.kernel() + +# Compute Hessian +h = mf_GPU.Hessian() +h.auxbasis_response = 2 +h_dft = h.kernel() diff --git a/examples/19-pcm_optimize.py b/examples/19-pcm_optimize.py new file mode 100644 index 00000000..75ba681b --- /dev/null +++ b/examples/19-pcm_optimize.py @@ -0,0 +1,41 @@ +# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import numpy as np +import pyscf +from pyscf import lib, df +from gpu4pyscf.dft import rks +from pyscf.geomopt.geometric_solver import optimize +lib.num_threads(8) + +atom =''' +O 0.0000000000 -0.0000000000 0.1174000000 +H -0.7570000000 -0.0000000000 -0.4696000000 +H 0.7570000000 0.0000000000 -0.4696000000 +''' + +mol = pyscf.M(atom=atom, basis='def2-tzvpp', verbose=0) + +mf = rks.RKS(mol, xc='HYB_GGA_XC_B3LYP').density_fit() +mf = mf.PCM() +mf.verbose = 3 +mf.grids.atom_grid = (99,590) +mf.small_rho_cutoff = 1e-10 +mf.with_solvent.lebedev_order = 29 # 302 Lebedev grids +mf.with_solvent.method = 'C-PCM' +mf.with_solvent.eps = 78.3553 + +mol_eq = optimize(mf, maxsteps=20) + diff --git a/examples/dft_driver.py b/examples/dft_driver.py index 80830e6c..9d17560c 100644 --- a/examples/dft_driver.py +++ b/examples/dft_driver.py @@ -27,6 +27,7 @@ parser.add_argument("--solvent", type=str, default='') args = parser.parse_args() +lib.num_threads(16) start_time = time.time() bas = args.basis mol = pyscf.M( @@ -34,10 +35,10 @@ basis=bas, max_memory=32000) # set verbose >= 6 for debugging timer -mol.verbose = 4 +mol.verbose = 7 mf_df = rks.RKS(mol, xc=args.xc).density_fit(auxbasis=args.auxbasis) -mf_df.verbose = 4 +mf_df.verbose = 7 if args.solvent: mf_df = mf_df.PCM() diff --git a/gpu4pyscf/__config__.py b/gpu4pyscf/__config__.py index 5b740207..93346e36 100644 --- a/gpu4pyscf/__config__.py +++ b/gpu4pyscf/__config__.py @@ -5,7 +5,7 @@ # such as A100-80G if props['totalGlobalMem'] >= 64 * GB: min_ao_blksize = 256 - min_grid_blksize = 256*256 + min_grid_blksize = 128*128 ao_aligned = 32 grid_aligned = 128 mem_fraction = 0.9 diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py index ff3c6877..e6cd9d21 100644 --- a/gpu4pyscf/df/df.py +++ b/gpu4pyscf/df/df.py @@ -57,18 +57,6 @@ def build(self, direct_scf_tol=1e-14, omega=None): auxmol = self.auxmol self.nao = mol.nao - # cache indices for better performance - nao = mol.nao - tril_row, tril_col = cupy.tril_indices(nao) - tril_row = cupy.asarray(tril_row) - tril_col = cupy.asarray(tril_col) - - self.tril_row = tril_row - self.tril_col = tril_col - - idx = np.arange(nao) - self.diag_idx = cupy.asarray(idx*(idx+1)//2+idx) - log = logger.new_logger(mol, mol.verbose) t0 = log.init_timer() if auxmol is None: @@ -234,20 +222,20 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False): nj = j1 - j0 if sr_only: # TODO: in-place implementation or short-range kernel - ints_slices = cupy.zeros([naoaux, nj, ni], order='C') + ints_slices = cupy.empty([naoaux, nj, ni], order='C') for cp_kl_id, _ in enumerate(intopt.aux_log_qs): k0 = intopt.sph_aux_loc[cp_kl_id] k1 = intopt.sph_aux_loc[cp_kl_id+1] int3c2e.get_int3c2e_slice(intopt, cp_ij_id, cp_kl_id, out=ints_slices[k0:k1]) if omega is not None: - ints_slices_lr = cupy.zeros([naoaux, nj, ni], order='C') + ints_slices_lr = cupy.empty([naoaux, nj, ni], order='C') for cp_kl_id, _ in enumerate(intopt.aux_log_qs): k0 = intopt.sph_aux_loc[cp_kl_id] k1 = intopt.sph_aux_loc[cp_kl_id+1] int3c2e.get_int3c2e_slice(intopt, cp_ij_id, cp_kl_id, out=ints_slices[k0:k1], omega=omega) ints_slices -= ints_slices_lr else: - ints_slices = cupy.zeros([naoaux, nj, ni], order='C') + ints_slices = cupy.empty([naoaux, nj, ni], order='C') for cp_kl_id, _ in enumerate(intopt.aux_log_qs): k0 = intopt.sph_aux_loc[cp_kl_id] k1 = intopt.sph_aux_loc[cp_kl_id+1] @@ -261,11 +249,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False): row = intopt.ao_pairs_row[cp_ij_id] - i0 col = intopt.ao_pairs_col[cp_ij_id] - j0 - if cpi == cpj: - #ints_slices = ints_slices + ints_slices.transpose([0,2,1]) - transpose_sum(ints_slices) ints_slices = ints_slices[:,col,row] - if cd_low.tag == 'eig': cderi_block = cupy.dot(cd_low.T, ints_slices) ints_slices = None diff --git a/gpu4pyscf/df/df_jk.py b/gpu4pyscf/df/df_jk.py index ff0cbd7e..fe3c748e 100644 --- a/gpu4pyscf/df/df_jk.py +++ b/gpu4pyscf/df/df_jk.py @@ -253,10 +253,11 @@ def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e- nao = dms_tag.shape[-1] dms = dms_tag.reshape([-1,nao,nao]) nset = dms.shape[0] - t0 = log.init_timer() + t1 = t0 = log.init_timer() if dfobj._cderi is None: log.debug('CDERI not found, build...') dfobj.build(direct_scf_tol=direct_scf_tol, omega=omega) + t1 = log.timer_debug1('init jk', *t0) assert nao == dfobj.nao vj = None @@ -264,7 +265,6 @@ def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e- ao_idx = dfobj.intopt.sph_ao_idx dms = take_last2d(dms, ao_idx) - t1 = log.timer_debug1('init jk', *t0) rows = dfobj.intopt.cderi_row cols = dfobj.intopt.cderi_col if with_j: diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py index ff3d0ef2..a6e793f6 100644 --- a/gpu4pyscf/df/hessian/rhf.py +++ b/gpu4pyscf/df/hessian/rhf.py @@ -37,6 +37,7 @@ import cupy import numpy as np from pyscf import lib, df +from gpu4pyscf.grad import rhf as rhf_grad from gpu4pyscf.hessian import rhf as rhf_hess from gpu4pyscf.lib.cupy_helper import contract, tag_array, release_gpu_stack, print_mem_info, take_last2d from gpu4pyscf.df import int3c2e @@ -283,6 +284,8 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, hk_aux_aux += .5 * contract('pqxy,pq->pqxy', rho2c_11, int2c_inv) # (00|1)(1|00) rho2c_0 = rho2c_10 = rho2c_11 = rho2c0_10 = rho2c1_10 = rho2c0_11 = int2c_ip_ip = None wk_ip2_P__ = int2c_ip1_inv = None + t1 = log.timer_debug1('contract int2c_*', *t1) + ao_idx = np.argsort(intopt.sph_ao_idx) aux_idx = np.argsort(intopt.sph_aux_idx) rev_ao_ao = cupy.ix_(ao_idx, ao_idx) @@ -372,7 +375,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, e1[j0,i0] = e1[i0,j0].T ej[j0,i0] = ej[i0,j0].T ek[j0,i0] = ek[i0,j0].T - + t1 = log.timer_debug1('hcore contribution', *t1) log.timer('RHF partial hessian', *time0) return e1, ej, ek @@ -398,6 +401,7 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): else: return chkfile ''' + def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None, with_k=True, omega=None): log = logger.new_logger(hessobj, verbose) @@ -521,8 +525,9 @@ def _ao2mo(mat): vk1_int3c = vk1_int3c_ip1 + vk1_int3c_ip2 vk1_int3c_ip1 = vk1_int3c_ip2 = None + grad_hcore = rhf_grad.get_grad_hcore(hessobj.base.nuc_grad_method()) cupy.get_default_memory_pool().free_all_blocks() - hcore_deriv = hessobj.base.nuc_grad_method().hcore_generator(mol) + #hcore_deriv = hessobj.base.nuc_grad_method().hcore_generator(mol) vk1 = None for i0, ia in enumerate(atmlst): shl0, shl1, p0, p1 = aoslices[ia] @@ -535,8 +540,9 @@ def _ao2mo(mat): vk1_ao[:,p0:p1,:] -= vk1_buf[:,p0:p1,:] vk1_ao[:,:,p0:p1] -= vk1_buf[:,p0:p1,:].transpose(0,2,1) - h1 = hcore_deriv(ia) - h1 = _ao2mo(cupy.asarray(h1, order='C')) + h1 = grad_hcore[:,i0] + #h1 = hcore_deriv(ia) + #h1 = _ao2mo(cupy.asarray(h1, order='C')) vj1 = vj1_int3c[ia] + _ao2mo(vj1_ao) if with_k: vk1 = vk1_int3c[ia] + _ao2mo(vk1_ao) diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py index 56668354..e45f11dc 100644 --- a/gpu4pyscf/df/int3c2e.py +++ b/gpu4pyscf/df/int3c2e.py @@ -306,10 +306,7 @@ def build(self, cutoff=1e-14, group_size=None, ncptype = len(log_qs) self.bpcache = ctypes.POINTER(BasisProdCache)() - if diag_block_with_triu: - scale_shellpair_diag = 1. - else: - scale_shellpair_diag = 0.5 + scale_shellpair_diag = 1. libgint.GINTinit_basis_prod( ctypes.byref(self.bpcache), ctypes.c_double(scale_shellpair_diag), ao_loc.ctypes.data_as(ctypes.c_void_p), @@ -1194,6 +1191,32 @@ def get_dh1e(mol, dm0): dh1e[k0:k1,:3] += cupy.einsum('xkji,ij->kx', int3c_blk, dm0_sorted[i0:i1,j0:j1]) return 2.0 * cupy.einsum('kx,k->kx', dh1e, -charges) +def get_d2h1e(mol, dm0): + natm = mol.natm + coords = mol.atom_coords() + charges = mol.atom_charges() + fakemol = gto.fakemol_for_charges(coords) + + nao = mol.nao + d2h1e_diag = cupy.zeros([natm,9]) + d2h1e_offdiag = cupy.zeros([natm, nao, 9]) + intopt = VHFOpt(mol, fakemol, 'int2e') + intopt.build(1e-14, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE) + dm0_sorted = take_last2d(dm0, intopt.sph_ao_idx) + for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ipip1'): + d2h1e_diag[k0:k1,:9] -= contract('xaji,ij->ax', int3c_blk, dm0_sorted[i0:i1,j0:j1]) + d2h1e_offdiag[k0:k1,i0:i1,:9] += contract('xaji,ij->aix', int3c_blk, dm0_sorted[i0:i1,j0:j1]) + + for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ipvip1'): + d2h1e_diag[k0:k1,:9] -= contract('xaji,ij->ax', int3c_blk, dm0_sorted[i0:i1,j0:j1]) + d2h1e_offdiag[k0:k1,i0:i1,:9] += contract('xaji,ij->aix', int3c_blk, dm0_sorted[i0:i1,j0:j1]) + aoslices = mol.aoslice_by_atom() + ao2atom = get_ao2atom(intopt, aoslices) + d2h1e = contract('aix,ib->abx', d2h1e_offdiag, ao2atom) + d2h1e[np.diag_indices(natm), :] += d2h1e_diag + return 2.0 * cupy.einsum('abx,a->xab', d2h1e, charges) + #return 2.0 * cupy.einsum('ijx,i->kx', dh1e, -charges) + def get_int3c2e_slice(intopt, cp_ij_id, cp_aux_id, aosym=None, out=None, omega=None, stream=None): ''' Generate one int3c2e block for given ij, k @@ -1443,14 +1466,6 @@ def get_pairing(p_offsets, q_offsets, q_cond, for q0, q1 in zip(q_offsets[:-1], q_offsets[1:]): if aosym and q0 < p0 or not aosym: q_sub = q_cond[p0:p1,q0:q1].ravel() - ''' - idx = q_sub.argsort(axis=None)[::-1] - q_sorted = q_sub[idx] - mask = q_sorted > cutoff - idx = idx[mask] - ishs, jshs = np.unravel_index(idx, (p1-p0, q1-q0)) - print(ishs.shape) - ''' mask = q_sub > cutoff ishs, jshs = np.indices((p1-p0,q1-q0)) ishs = ishs.ravel()[mask] @@ -1464,13 +1479,6 @@ def get_pairing(p_offsets, q_offsets, q_cond, log_qs.append(log_q) elif aosym and p0 == q0 and p1 == q1: q_sub = q_cond[p0:p1,p0:p1].ravel() - ''' - idx = q_sub.argsort(axis=None)[::-1] - q_sorted = q_sub[idx] - ishs, jshs = np.unravel_index(idx, (p1-p0, p1-p0)) - mask = q_sorted > cutoff - ''' - ishs, jshs = np.indices((p1-p0, p1-p0)) ishs = ishs.ravel() jshs = jshs.ravel() diff --git a/gpu4pyscf/df/tests/test_int3c2e.py b/gpu4pyscf/df/tests/test_int3c2e.py index 8e02ac3d..6c556061 100644 --- a/gpu4pyscf/df/tests/test_int3c2e.py +++ b/gpu4pyscf/df/tests/test_int3c2e.py @@ -116,6 +116,32 @@ def test_int1e_iprinv(self): h1ao = mol.intor('int1e_iprinv', comp=3) # <\nabla|1/r|> assert np.linalg.norm(int3c[:,:,:,i] - h1ao) < 1e-8 + def test_int1e_ipiprinv(self): + from pyscf import gto + coords = mol.atom_coords() + charges = mol.atom_charges() + + fakemol = gto.fakemol_for_charges(coords) + int3c = int3c2e.get_int3c2e_general(mol, fakemol, ip_type='ipip1').get() + + for i,q in enumerate(charges): + mol.set_rinv_origin(coords[i]) + h1ao = mol.intor('int1e_ipiprinv', comp=9) # <\nabla|1/r|> + assert np.linalg.norm(int3c[:,:,:,i] - h1ao) < 1e-8 + + def test_int1e_iprinvip(self): + from pyscf import gto + coords = mol.atom_coords() + charges = mol.atom_charges() + + fakemol = gto.fakemol_for_charges(coords) + int3c = int3c2e.get_int3c2e_general(mol, fakemol, ip_type='ipvip1').get() + + for i,q in enumerate(charges): + mol.set_rinv_origin(coords[i]) + h1ao = mol.intor('int1e_iprinvip', comp=9) # <\nabla|1/r|> + assert np.linalg.norm(int3c[:,:,:,i] - h1ao) < 1e-8 + if __name__ == "__main__": print("Full Tests for int3c") unittest.main() diff --git a/gpu4pyscf/dft/gen_grid.py b/gpu4pyscf/dft/gen_grid.py index 5ced5d94..5528e1e5 100644 --- a/gpu4pyscf/dft/gen_grid.py +++ b/gpu4pyscf/dft/gen_grid.py @@ -279,14 +279,16 @@ def get_partition(mol, atom_grids_tab, grid_coord and grid_weight arrays. grid_coord array has shape (N,3); weight 1D array has N elements. ''' + atm_coords = numpy.asarray(mol.atom_coords() , order='C') + atm_coords = cupy.asarray(atm_coords) + ''' if callable(radii_adjust) and atomic_radii is not None: f_radii_adjust = radii_adjust(mol, atomic_radii) else: f_radii_adjust = None - atm_coords = numpy.asarray(mol.atom_coords() , order='C') atm_dist = gto.inter_distance(mol) - atm_coords = cupy.asarray(atm_coords) atm_dist = cupy.asarray(atm_dist) + if (becke_scheme is original_becke and (radii_adjust is radi.treutler_atomic_radii_adjust or radii_adjust is radi.becke_atomic_radii_adjust or @@ -324,7 +326,7 @@ def gen_grid_partition(coords): pbecke[i] *= .5 * (1-g) pbecke[j] *= .5 * (1+g) return pbecke - + ''' coords_all = [] weights_all = [] # support atomic_radii_adjust = None diff --git a/gpu4pyscf/dft/numint.py b/gpu4pyscf/dft/numint.py index 0e46c611..0ca4cd75 100644 --- a/gpu4pyscf/dft/numint.py +++ b/gpu4pyscf/dft/numint.py @@ -31,7 +31,7 @@ from gpu4pyscf.lib import logger LMAX_ON_GPU = 6 -BAS_ALIGNED = 4 +BAS_ALIGNED = 1 GRID_BLKSIZE = 32 MIN_BLK_SIZE = getattr(__config__, 'min_grid_blksize', 64*64) ALIGNED = getattr(__config__, 'grid_aligned', 16*16) @@ -448,7 +448,9 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, nao, nao0 = coeff.shape dms = cupy.asarray(dms) dm_shape = dms.shape - dms = [coeff @ dm @ coeff.T for dm in dms.reshape(-1,nao0,nao0)] + #dms = [coeff @ dm @ coeff.T for dm in dms.reshape(-1,nao0,nao0)] + dms = dms.reshape(-1,nao0,nao0) + dms = take_last2d(dms, opt.ao_idx) nset = len(dms) if mo_coeff is not None: @@ -482,19 +484,20 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, rho_tot = cupy.empty([nset,6,ngrids]) p0 = p1 = 0 + t1 = t0 = log.init_timer() for ao_mask, idx, weight, _ in ni.block_loop(mol, grids, nao, ao_deriv): p1 = p0 + weight.size for i in range(nset): - t0 = log.init_timer() if mo_coeff is None: rho_tot[i,:,p0:p1] = eval_rho(mol, ao_mask, dms[i][np.ix_(idx,idx)], xctype=xctype, hermi=1) else: mo_coeff_mask = mo_coeff[idx,:] rho_tot[i,:,p0:p1] = eval_rho2(mol, ao_mask, mo_coeff_mask, mo_occ, None, xctype) - t1 = log.timer_debug1('eval rho', *t0) p0 = p1 + t1 = log.timer_debug2('eval rho', *t1) + t0 = log.timer_debug1('eval rho', *t0) - vxc_tot = [] + wv = [] for i in range(nset): if xctype == 'LDA': exc, vxc = ni.eval_xc_eff(xc_code, rho_tot[i][0], deriv=1, xctype=xctype)[:2] @@ -505,69 +508,37 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, den = rho_tot[i][0] * grids.weights nelec[i] = den.sum() excsum[i] = cupy.sum(den * exc[:,0]) - vxc_tot.append(vxc) - t1 = log.timer_debug1('eval vxc', *t1) + wv.append(vxc * grids.weights) + if xctype == 'GGA': + wv[i][0] *= .5 + if xctype == 'MGGA': + wv[i][[0,4]] *= .5 + t0 = log.timer_debug1('eval vxc', *t0) + t1 = t0 p0 = p1 = 0 for ao_mask, idx, weight, _ in ni.block_loop(mol, grids, nao, ao_deriv): p1 = p0 + weight.size for i in range(nset): - vxc = vxc_tot[i][:,p0:p1] if xctype == 'LDA': - #den = rho * weight - wv = weight * vxc[0] - ''' - if USE_SPARSITY == 0: - vmat[i] += ao.dot(_scale_ao(ao, wv).T) - elif USE_SPARSITY == 1: - _dot_ao_ao_sparse(ao, ao, wv, nbins, sindex, ao_loc, - pair2shls_full, pairs_locs_full, vmat[i]) - ''' if USE_SPARSITY == 2: - aow = _scale_ao(ao_mask, wv) - # vmat[i][cupy.ix_(mask, mask)] += ao_mask.dot(aow.T) + aow = _scale_ao(ao_mask, wv[i][0,p0:p1]) add_sparse(vmat[i], ao_mask.dot(aow.T), idx) else: raise NotImplementedError(f'USE_SPARSITY = {USE_SPARSITY} is not implemented') elif xctype == 'GGA': - #den = rho[0] * weight - wv = vxc * weight - wv[0] *= .5 - ''' - if USE_SPARSITY == 0: - vmat[i] += ao[0].dot(_scale_ao(ao, wv).T) - elif USE_SPARSITY == 1: - aow = _scale_ao(ao, wv) - _dot_ao_ao_sparse(ao[0], aow, None, nbins, sindex, ao_loc, - pair2shls_full, pairs_locs_full, vmat[i]) - ''' if USE_SPARSITY == 2: - aow = _scale_ao(ao_mask, wv) + aow = _scale_ao(ao_mask, wv[i][:,p0:p1]) add_sparse(vmat[i], ao_mask[0].dot(aow.T), idx) else: raise NotImplementedError(f'USE_SPARSITY = {USE_SPARSITY} is not implemented') elif xctype == 'NLC': raise NotImplementedError('NLC') elif xctype == 'MGGA': - #den = rho[0] * weight - wv = vxc * weight - wv[[0, 4]] *= .5 # *.5 for v+v.T - ''' - if USE_SPARSITY == 0: - aow = _scale_ao(ao[:4], wv[:4]) - vmat[i] += ao[0].dot(aow.T) - vmat[i] += _tau_dot(ao, ao, wv[4]) - elif USE_SPARSITY == 1: - _dot_ao_ao_sparse(ao[0], aow, None, nbins, sindex, ao_loc, - pair2shls_full, pairs_locs_full, vmat[i]) - _tau_dot_sparse(ao, ao, wv[4], nbins, sindex, ao_loc, - pair2shls_full, pairs_locs_full, vmat[i]) - ''' if USE_SPARSITY == 2: - aow = _scale_ao(ao_mask, wv[:4]) + aow = _scale_ao(ao_mask, wv[i][:4,p0:p1]) vtmp = ao_mask[0].dot(aow.T) - vtmp+= _tau_dot(ao_mask, ao_mask, wv[4]) - #vmat[i][cupy.ix_(mask, mask)] += vtmp + vtmp+= _tau_dot(ao_mask, ao_mask, wv[i][4,p0:p1]) add_sparse(vmat[i], vtmp, idx) else: raise NotImplementedError(f'USE_SPARSITY = {USE_SPARSITY} is not implemented') @@ -575,17 +546,14 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, pass else: raise NotImplementedError(f'numint.nr_rks for functional {xc_code}') - t1 = log.timer_debug1('integration', *t1) p0 = p1 - - vmat = contract('pi,npq->niq', coeff, vmat) - vmat = contract('qj,niq->nij', coeff, vmat) - #rev_ao_idx = opt.rev_ao_idx - #vmat = take_last2d(vmat, rev_ao_idx) + t1 = log.timer_debug2('integration', *t1) + t0 = log.timer_debug1('vxc integration', *t0) + rev_ao_idx = opt.rev_ao_idx + vmat = take_last2d(vmat, rev_ao_idx) if xctype != 'LDA': - vmat = vmat + vmat.transpose([0,2,1]) - #transpose_sum(vmat) + transpose_sum(vmat) if FREE_CUPY_CACHE: dms = None @@ -719,27 +687,27 @@ def get_rho(ni, mol, dm, grids, max_memory=2000, verbose=None): log = logger.new_logger(mol, verbose) coeff = cupy.asarray(opt.coeff) nao = coeff.shape[0] - dm = coeff @ cupy.asarray(dm) @ coeff.T - mo_coeff = getattr(dm, 'mo_coeff', None) mo_occ = getattr(dm,'mo_occ', None) + dm = coeff @ cupy.asarray(dm) @ coeff.T if mo_coeff is not None: mo_coeff = coeff @ mo_coeff ngrids = grids.weights.size rho = cupy.empty(ngrids) p0 = p1 = 0 + t1 = t0 = log.init_timer() for ao_mask, idx, weight, _ in ni.block_loop(mol, grids, nao, 0): p1 = p0 + weight.size - t0 = log.init_timer() if mo_coeff is None: rho[p0:p1] = eval_rho(mol, ao_mask, dm[np.ix_(idx,idx)], xctype='LDA', hermi=1) else: mo_coeff_mask = mo_coeff[idx,:] rho[p0:p1] = eval_rho2(mol, ao_mask, mo_coeff_mask, mo_occ, None, 'LDA') p0 = p1 - log.timer_debug1('eval rho', *t0) + t1 = log.timer_debug2('eval rho slice', *t1) + t0 = log.timer_debug1('eval rho', *t0) if FREE_CUPY_CACHE: dm = None @@ -763,10 +731,9 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi= # AO basis -> gdftopt AO basis with_mocc = hasattr(dms, 'mo1') if with_mocc: - mo1 = contract('nio,pi->npo', dms.mo1, coeff) * 2.0**0.5 - occ_coeff = contract('io,pi->po', dms.occ_coeff, coeff) * 2.0**0.5 - dms = contract('nij,qj->niq', dms, coeff) - dms = contract('pi,niq->npq', coeff, dms) + mo1 = dms.mo1[:,opt.ao_idx] * 2.0**0.5 + occ_coeff = dms.occ_coeff[opt.ao_idx] * 2.0**0.5 + dms = take_last2d(dms, opt.ao_idx) nset = len(dms) vmat = cupy.zeros((nset, nao, nao)) @@ -776,8 +743,8 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi= ao_deriv = 1 p0 = 0 p1 = 0 + t1 = t0 = log.init_timer() for ao, mask, weights, coords in ni.block_loop(opt.mol, grids, nao, ao_deriv): - t0 = log.init_timer() p0, p1 = p1, p1+len(weights) # precompute molecular orbitals if with_mocc: @@ -798,7 +765,7 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi= rho_tmp = eval_rho(opt.mol, ao, dms[i][np.ix_(mask,mask)], xctype=xctype, hermi=hermi, with_lapl=False) rho1.append(rho_tmp) rho1 = cupy.stack(rho1, axis=0) - t0 = log.timer_debug1('rho', *t0) + t1 = log.timer_debug2('eval rho', *t1) # precompute fxc_w if xctype == 'LDA': @@ -826,14 +793,14 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi= vmat_tmp+= _tau_dot(ao, ao, wv[i,4]) add_sparse(vmat[i], vmat_tmp, mask) - t0 = log.timer_debug1('vxc', *t0) + t1 = log.timer_debug2('integration', *t1) ao = c0 = rho1 = None + t0 = log.timer_debug1('vxc', *t0) - vmat = contract('pi,npq->niq', coeff, vmat) - vmat = contract('qj,niq->nij', coeff, vmat) + vmat = take_last2d(vmat, opt.rev_ao_idx) if xctype != 'LDA': - #transpose_sum(vmat) - vmat = vmat + vmat.transpose([0,2,1]) + transpose_sum(vmat) + if FREE_CUPY_CACHE: dms = None cupy.get_default_memory_pool().free_all_blocks() @@ -1067,13 +1034,14 @@ def nr_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, wv = vv_vxc[:,p0:p1] * weight wv[0] *= .5 aow = _scale_ao(ao, wv) - #vmat += ao[0].dot(aow.T) add_sparse(vmat, ao[0].dot(aow.T), mask) t1 = log.timer_debug1('integration', *t1) - vmat = vmat + vmat.T - vmat = contract('pi,pq->iq', coeff, vmat) - vmat = contract('qj,iq->ij', coeff, vmat) + transpose_sum(vmat) + vmat = take_last2d(vmat, opt.rev_ao_idx) + #vmat = vmat + vmat.T + #vmat = contract('pi,pq->iq', coeff, vmat) + #vmat = contract('qj,iq->ij', coeff, vmat) log.timer_debug1('eval vv10', *t0) return nelec, excsum, vmat @@ -1101,28 +1069,31 @@ def cache_xc_kernel(ni, mol, grids, xc_code, mo_coeff, mo_occ, spin=0, if spin == 0: mo_coeff = coeff @ mo_coeff rho = [] - t0 = log.init_timer() + t1 = t0 = log.init_timer() for ao_mask, idx, weight, _ in ni.block_loop(mol, grids, nao, ao_deriv): mo_coeff_mask = mo_coeff[idx,:] rho_slice = eval_rho2(mol, ao_mask, mo_coeff_mask, mo_occ, None, xctype) rho.append(rho_slice) - t0 = log.timer_debug1('eval rho in fxc', *t0) + t1 = log.timer_debug2('eval rho slice', *t1) rho = cupy.hstack(rho) + t0 = log.timer_debug1('eval rho in fxc', *t0) else: mo_coeff = contract('ip,npj->nij', coeff, cupy.asarray(mo_coeff)) rhoa = [] rhob = [] - t0 = log.init_timer() + t1 = t0 = log.init_timer() for ao_mask, idx, weight, _ in ni.block_loop(mol, grids, nao, ao_deriv): mo_coeff_mask = mo_coeff[:,idx,:] rhoa_slice = eval_rho2(mol, ao_mask, mo_coeff_mask[0], mo_occ[0], None, xctype) rhob_slice = eval_rho2(mol, ao_mask, mo_coeff_mask[1], mo_occ[1], None, xctype) rhoa.append(rhoa_slice) rhob.append(rhob_slice) - t0 = log.timer_debug1('eval rho in fxc', *t0) + t1 = log.timer_debug2('eval rho in fxc', *t1) #rho = (cupy.hstack(rhoa), cupy.hstack(rhob)) rho = cupy.stack([cupy.hstack(rhoa), cupy.hstack(rhob)], axis=0) + t0 = log.timer_debug1('eval rho in fxc', *t0) vxc, fxc = ni.eval_xc_eff(xc_code, rho, deriv=2, xctype=xctype)[1:3] + t0 = log.timer_debug1('eval fxc', *t0) return rho, vxc, fxc @cupy.fuse() @@ -1267,18 +1238,18 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000, mol = opt.mol with opt.gdft_envs_cache(): block_id = 0 + t1 = log.init_timer() for ip0, ip1 in lib.prange(0, ngrids, blksize): coords = grids.coords[ip0:ip1] weight = grids.weights[ip0:ip1] # cache ao indices - if (deriv, block_id, blksize, ngrids) not in ni.non0ao_idx: + if (block_id, blksize, ngrids) not in ni.non0ao_idx: stream = cupy.cuda.get_current_stream() cutoff = AO_THRESHOLD ng = ip1 - ip0 ao_loc = mol.ao_loc_nr() nbas = mol.nbas - t0 = log.init_timer() non0shl_idx = cupy.zeros(len(ao_loc)-1, dtype=np.int32) libgdft.GDFTscreen_index( ctypes.cast(stream.ptr, ctypes.c_void_p), @@ -1317,24 +1288,24 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000, idx = cupy.hstack([idx, zero_idx[:pad]]) pad = min(pad, len(zero_idx)) non0shl_idx = cupy.asarray(np.where(non0shl_idx)[0], dtype=np.int32) - ni.non0ao_idx[deriv, block_id, blksize, ngrids] = (pad, idx, non0shl_idx, ctr_offsets_slice, ao_loc_slice) - log.timer_debug1('init ao sparsity', *t0) + ni.non0ao_idx[block_id, blksize, ngrids] = (pad, idx, non0shl_idx, ctr_offsets_slice, ao_loc_slice) + t1 = log.timer_debug2('init ao sparsity', *t1) else: - pad, idx, non0shl_idx, ctr_offsets_slice, ao_loc_slice = ni.non0ao_idx[deriv, block_id, blksize, ngrids] - t0 = log.init_timer() + pad, idx, non0shl_idx, ctr_offsets_slice, ao_loc_slice = ni.non0ao_idx[block_id, blksize, ngrids] + ao_mask = eval_ao( ni, mol, coords, deriv, nao_slice=len(idx), shls_slice=non0shl_idx, ao_loc_slice=ao_loc_slice, ctr_offsets_slice=ctr_offsets_slice) + t1 = log.timer_debug2('evaluate ao slice', *t1) if pad > 0: if deriv == 0: ao_mask[-pad:,:] = 0.0 else: ao_mask[:,-pad:,:] = 0.0 block_id += 1 - log.timer_debug1('evaluate ao slice', *t0) yield ao_mask, idx, weight, coords class NumInt(numint.NumInt): @@ -1665,6 +1636,7 @@ def build(self, mol=None): pmol._decontracted = True self.mol = pmol inv_idx = np.argsort(ao_idx, kind='stable').astype(np.int32) + self.ao_idx = cupy.asarray(ao_idx, dtype=np.int32) self.rev_ao_idx = cupy.asarray(inv_idx, dtype=np.int32) self.coeff = coeff[ao_idx] self.l_ctr_offsets = np.append(0, np.cumsum(l_ctr_counts)).astype(np.int32) diff --git a/gpu4pyscf/dft/rks.py b/gpu4pyscf/dft/rks.py index 7b9dd588..70b15723 100644 --- a/gpu4pyscf/dft/rks.py +++ b/gpu4pyscf/dft/rks.py @@ -57,6 +57,7 @@ def prune_small_rho_grids_(ks, mol, dm, grids): grids.coords = cupy.vstack( [grids.coords, pad]) grids.weights = cupy.hstack([grids.weights, cupy.zeros(padding)]) + # make_mask has to be executed on cpu for now. #grids.non0tab = grids.make_mask(mol, grids.coords) #grids.screen_index = grids.non0tab diff --git a/gpu4pyscf/grad/rhf.py b/gpu4pyscf/grad/rhf.py index c96820b6..9f1f9efc 100644 --- a/gpu4pyscf/grad/rhf.py +++ b/gpu4pyscf/grad/rhf.py @@ -22,7 +22,7 @@ from pyscf.grad import rhf from gpu4pyscf.lib.cupy_helper import load_library from gpu4pyscf.scf.hf import _VHFOpt -from gpu4pyscf.lib.cupy_helper import tag_array, contract +from gpu4pyscf.lib.cupy_helper import tag_array, contract, take_last2d from gpu4pyscf.df import int3c2e #TODO: move int3c2e to out of df from gpu4pyscf.lib import logger @@ -552,6 +552,50 @@ def calculate_h1e(h1_gpu, s1_gpu): de -= cupy.sum(de, axis=0)/len(atmlst) return de.get() +def get_grad_hcore(mf_grad): + mf = mf_grad.base + mol = mf.mol + natm = mol.natm + nao = mol.nao + mo_occ = mf.mo_occ + mo_coeff = cupy.asarray(mf.mo_coeff) + orbo = mo_coeff[:,mo_occ>0] + nocc = orbo.shape[1] + + dh1e = cupy.zeros([3,natm,nao,nocc]) + coords = mol.atom_coords() + charges = cupy.asarray(mol.atom_charges(), dtype=np.float64) + fakemol = gto.fakemol_for_charges(coords) + intopt = int3c2e.VHFOpt(mol, fakemol, 'int2e') + intopt.build(1e-14, diag_block_with_triu=True, aosym=False, group_size=int3c2e.BLKSIZE, group_size_aux=int3c2e.BLKSIZE) + orbo_sorted = orbo[intopt.sph_ao_idx] + mo_coeff_sorted = mo_coeff[intopt.sph_ao_idx] + for i0,i1,j0,j1,k0,k1,int3c_blk in int3c2e.loop_int3c2e_general(intopt, ip_type='ip1'): + dh1e[:,k0:k1,j0:j1,:] += contract('xkji,io->xkjo', int3c_blk, orbo_sorted[i0:i1]) + dh1e[:,k0:k1,i0:i1,:] += contract('xkji,jo->xkio', int3c_blk, orbo_sorted[j0:j1]) + dh1e = contract('xkjo,k->xkjo', dh1e, -charges) + dh1e = contract('xkjo,jp->xkpo', dh1e, mo_coeff_sorted) + + h1 = mf_grad.get_hcore(mol) + aoslices = mol.aoslice_by_atom() + with_ecp = mol.has_ecp() + if with_ecp: + ecp_atoms = set(mol._ecpbas[:,gto.ATOM_OF]) + else: + ecp_atoms = () + for atm_id in range(natm): + shl0, shl1, p0, p1 = aoslices[atm_id] + h1ao = numpy.zeros([3,nao,nao]) + with mol.with_rinv_at_nucleus(atm_id): + if with_ecp and atm_id in ecp_atoms: + h1ao += mol.intor('ECPscalar_iprinv', comp=3) + h1ao[:,p0:p1] += h1[:,p0:p1] + h1ao += h1ao.transpose([0,2,1]) + h1ao = cupy.asarray(h1ao) + h1mo = contract('xij,jo->xio', h1ao, orbo) + dh1e[:,atm_id] += contract('xio,ip->xpo', h1mo, mo_coeff) + return dh1e#2.0 * cupy.einsum('kx,k->kx', dh1e, -charges) + class Gradients(rhf.Gradients): from gpu4pyscf.lib.utils import to_cpu, to_gpu, device diff --git a/gpu4pyscf/grad/rks.py b/gpu4pyscf/grad/rks.py index 82b0befd..be6732d6 100644 --- a/gpu4pyscf/grad/rks.py +++ b/gpu4pyscf/grad/rks.py @@ -27,7 +27,8 @@ from gpu4pyscf.grad import rhf as rhf_grad from gpu4pyscf.dft import numint, xc_deriv, rks from gpu4pyscf.dft.numint import _GDFTOpt, AO_THRESHOLD -from gpu4pyscf.lib.cupy_helper import contract, get_avail_mem, add_sparse, tag_array, load_library +from gpu4pyscf.lib.cupy_helper import ( + contract, get_avail_mem, add_sparse, tag_array, load_library, take_last2d) from gpu4pyscf.lib import logger from pyscf import __config__ @@ -121,10 +122,12 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, mo_coeff = cupy.asarray(dms.mo_coeff) coeff = cupy.asarray(opt.coeff) nao, nao0 = coeff.shape - dms = cupy.asarray(dms) - dms = [cupy.einsum('pi,ij,qj->pq', coeff, dm, coeff) - for dm in dms.reshape(-1,nao0,nao0)] - mo_coeff = coeff @ mo_coeff + dms = cupy.asarray(dms).reshape(-1,nao0,nao0) + dms = take_last2d(dms, opt.ao_idx) + #dms = [cupy.einsum('pi,ij,qj->pq', coeff, dm, coeff) + # for dm in dms.reshape(-1,nao0,nao0)] + #mo_coeff = coeff @ mo_coeff + mo_coeff = mo_coeff[opt.ao_idx] nset = len(dms) assert nset == 1 @@ -172,7 +175,8 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, vtmp = _gga_grad_sum_(ao_mask, wv) vtmp += _tau_grad_dot_(ao_mask, wv[4]) add_sparse(vmat[idm], vtmp, idx) - vmat = [cupy.einsum('pi,npq,qj->nij', coeff, v, coeff) for v in vmat] + #vmat = [cupy.einsum('pi,npq,qj->nij', coeff, v, coeff) for v in vmat] + vmat = take_last2d(vmat, opt.rev_ao_idx) exc = None if nset == 1: vmat = vmat[0] diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py index b6421424..5efb8f12 100644 --- a/gpu4pyscf/hessian/rhf.py +++ b/gpu4pyscf/hessian/rhf.py @@ -57,6 +57,7 @@ def hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, mo_coeff = cupy.asarray(mo_coeff) de2 = hessobj.partial_hess_elec(mo_energy, mo_coeff, mo_occ, atmlst, max_memory, log) + t1 = log.timer_debug1('hess elec', *t1) if h1ao is None: h1ao = hessobj.make_h1(mo_coeff, mo_occ, hessobj.chkfile, atmlst, log) t1 = log.timer_debug1('making H1', *t1) @@ -503,11 +504,13 @@ def hcore_generator(self, mol=None): else: ecp_atoms = () aoslices = mol.aoslice_by_atom() - nbas = mol.nbas nao = mol.nao_nr() h1aa, h1ab = self.get_hcore(mol) h1aa = cupy.asarray(h1aa) h1ab = cupy.asarray(h1ab) + + rinv2aa_all = {} + rinv2ab_all = {} def get_hcore(iatm, jatm): ish0, ish1, i0, i1 = aoslices[iatm] jsh0, jsh1, j0, j1 = aoslices[jatm] @@ -515,15 +518,20 @@ def get_hcore(iatm, jatm): zj = mol.atom_charge(jatm) if iatm == jatm: with mol.with_rinv_at_nucleus(iatm): - rinv2aa = mol.intor('int1e_ipiprinv', comp=9) - rinv2ab = mol.intor('int1e_iprinvip', comp=9) - rinv2aa *= zi - rinv2ab *= zi - if with_ecp and iatm in ecp_atoms: - rinv2aa -= mol.intor('ECPscalar_ipiprinv', comp=9) - rinv2ab -= mol.intor('ECPscalar_iprinvip', comp=9) - rinv2aa = cupy.asarray(rinv2aa) - rinv2ab = cupy.asarray(rinv2ab) + if iatm not in rinv2aa_all: + rinv2aa = zi * mol.intor('int1e_ipiprinv', comp=9) + if with_ecp and iatm in ecp_atoms: + rinv2aa -= mol.intor('ECPscalar_ipiprinv', comp=9) + rinv2aa_all[iatm] = rinv2aa + rinv2aa = cupy.asarray(rinv2aa_all[iatm]) + + if iatm not in rinv2ab_all: + rinv2ab = zi * mol.intor('int1e_iprinvip', comp=9) + if with_ecp and iatm in ecp_atoms: + rinv2ab -= mol.intor('ECPscalar_iprinvip', comp=9) + rinv2ab_all[iatm] = rinv2ab + rinv2ab = cupy.asarray(rinv2ab_all[iatm]) + rinv2aa = rinv2aa.reshape(3,3,nao,nao) rinv2ab = rinv2ab.reshape(3,3,nao,nao) hcore = -rinv2aa - rinv2ab @@ -538,28 +546,42 @@ def get_hcore(iatm, jatm): hcore = cupy.zeros((3,3,nao,nao)) hcore[:,:,i0:i1,j0:j1] += h1ab[:,:,i0:i1,j0:j1] with mol.with_rinv_at_nucleus(iatm): - shls_slice = (jsh0, jsh1, 0, nbas) - rinv2aa = mol.intor('int1e_ipiprinv', comp=9, shls_slice=shls_slice) - rinv2ab = mol.intor('int1e_iprinvip', comp=9, shls_slice=shls_slice) - rinv2aa *= zi - rinv2ab *= zi - if with_ecp and iatm in ecp_atoms: - rinv2aa -= mol.intor('ECPscalar_ipiprinv', comp=9, shls_slice=shls_slice) - rinv2ab -= mol.intor('ECPscalar_iprinvip', comp=9, shls_slice=shls_slice) - rinv2aa = cupy.asarray(rinv2aa) - rinv2ab = cupy.asarray(rinv2ab) + #rinv2aa = mol.intor('int1e_ipiprinv', comp=9, shls_slice=shls_slice) + if iatm not in rinv2aa_all: + rinv2aa = zi * mol.intor('int1e_ipiprinv', comp=9) + if with_ecp and iatm in ecp_atoms: + rinv2aa -= mol.intor('ECPscalar_ipiprinv', comp=9) + rinv2aa_all[iatm] = rinv2aa + rinv2aa = cupy.asarray(rinv2aa_all[iatm][:,j0:j1]) + + #rinv2ab = mol.intor('int1e_iprinvip', comp=9, shls_slice=shls_slice) + if iatm not in rinv2ab_all: + rinv2ab = zi * mol.intor('int1e_iprinvip', comp=9) + if with_ecp and iatm in ecp_atoms: + rinv2ab -= mol.intor('ECPscalar_iprinvip', comp=9) + rinv2ab_all[iatm] = rinv2ab + rinv2ab = cupy.asarray(rinv2ab_all[iatm][:,j0:j1]) + hcore[:,:,j0:j1] += rinv2aa.reshape(3,3,j1-j0,nao) hcore[:,:,j0:j1] += rinv2ab.reshape(3,3,j1-j0,nao).transpose(1,0,2,3) with mol.with_rinv_at_nucleus(jatm): - shls_slice = (ish0, ish1, 0, nbas) - rinv2aa = mol.intor('int1e_ipiprinv', comp=9, shls_slice=shls_slice) - rinv2ab = mol.intor('int1e_iprinvip', comp=9, shls_slice=shls_slice) - rinv2aa *= zj - rinv2ab *= zj - if with_ecp and jatm in ecp_atoms: - rinv2aa -= mol.intor('ECPscalar_ipiprinv', comp=9, shls_slice=shls_slice) - rinv2ab -= mol.intor('ECPscalar_iprinvip', comp=9, shls_slice=shls_slice) + # rinv2aa = zj * mol.intor('int1e_ipiprinv', comp=9, shls_slice=shls_slice) + if jatm not in rinv2aa_all: + rinv2aa = zj * mol.intor('int1e_ipiprinv', comp=9) + if with_ecp and jatm in ecp_atoms: + rinv2aa -= mol.intor('ECPscalar_ipiprinv', comp=9) + rinv2aa_all[jatm] = rinv2aa + rinv2aa = cupy.asarray(rinv2aa_all[jatm][:,i0:i1]) + + # rinv2ab = mol.intor('int1e_iprinvip', comp=9, shls_slice=shls_slice) + if jatm not in rinv2ab_all: + rinv2ab = zj * mol.intor('int1e_iprinvip', comp=9) + if with_ecp and jatm in ecp_atoms: + rinv2ab -= mol.intor('ECPscalar_iprinvip', comp=9) + rinv2ab_all[jatm] = rinv2ab + rinv2ab = cupy.asarray(rinv2ab_all[jatm][:,i0:i1]) + rinv2aa = cupy.asarray(rinv2aa) rinv2ab = cupy.asarray(rinv2ab) hcore[:,:,i0:i1] += rinv2aa.reshape(3,3,i1-i0,nao) diff --git a/gpu4pyscf/hessian/rks.py b/gpu4pyscf/hessian/rks.py index 2082e707..3e36825c 100644 --- a/gpu4pyscf/hessian/rks.py +++ b/gpu4pyscf/hessian/rks.py @@ -27,7 +27,7 @@ from gpu4pyscf.hessian import rhf as rhf_hess from gpu4pyscf.grad import rks as rks_grad from gpu4pyscf.dft import numint -from gpu4pyscf.lib.cupy_helper import contract, add_sparse +from gpu4pyscf.lib.cupy_helper import contract, add_sparse, take_last2d from gpu4pyscf.lib import logger # import pyscf.grad.rks to activate nuc_grad_method method @@ -393,15 +393,15 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): ipip = cupy.zeros((3,3,nao,nao)) if xctype == 'LDA': ao_deriv = 1 + t1 = log.init_timer() for ao_mask, mask, weight, coords \ in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory): - t0 = log.init_timer() nao_non0 = len(mask) ao = contract('nip,ij->njp', ao_mask, coeff[mask]) rho = numint.eval_rho2(opt.mol, ao[0], mo_coeff, mo_occ, mask, xctype) - t0 = log.timer_debug1('eval rho', *t0) + t1 = log.timer_debug2('eval rho', *t1) vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3] - t0 = log.timer_debug1('eval vxc', *t0) + t1 = log.timer_debug2('eval vxc', *t1) wv = weight * vxc[0] aow = [numint._scale_ao(ao[i], wv) for i in range(1, 4)] _d1d2_dot_(ipip, mol, aow, ao[1:4], mask, ao_loc, False) @@ -417,27 +417,28 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): vmat_tmp = cupy.zeros([3,3,nao_non0,nao_non0]) aow = [numint._scale_ao(ao_mask[0], wv[i]) for i in range(3)] _d1d2_dot_(vmat_tmp, mol, ao_mask[1:4], aow, mask, ao_loc, False) - vmat_tmp = contract('pi,xypq->xyiq', coeff[mask], vmat_tmp) - vmat_tmp = contract('qj,xyiq->xyij', coeff[mask], vmat_tmp) - vmat[ia] += vmat_tmp + #vmat_tmp = contract('pi,xypq->xyiq', coeff[mask], vmat_tmp) + #vmat_tmp = contract('qj,xyiq->xyij', coeff[mask], vmat_tmp) + #vmat[ia] += vmat_tmp + add_sparse(vmat[ia], vmat_tmp, mask) ao_dm0 = aow = None - t0 = log.timer_debug1('integration', *t0) + t1 = log.timer_debug2('integration', *t1) for ia in range(mol.natm): + vmat[ia] = take_last2d(vmat[ia], opt.rev_ao_idx) p0, p1 = aoslices[ia][2:] vmat[ia,:,:,:,p0:p1] += ipip[:,:,:,p0:p1] elif xctype == 'GGA': ao_deriv = 2 - comp = (ao_deriv+1)*(ao_deriv+2)*(ao_deriv+3)//6 + t1 = log.init_timer() for ao_mask, mask, weight, coords \ - in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory, extra=5*comp*nao): - t0 = log.init_timer() + in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory): nao_non0 = len(mask) ao = contract('nip,ij->njp', ao_mask, coeff[mask]) rho = numint.eval_rho2(opt.mol, ao[:4], mo_coeff, mo_occ, mask, xctype) - t0 = log.timer_debug1('eval rho', *t0) + t1 = log.timer_debug2('eval rho', *t1) vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3] - t0 = log.timer_debug1('eval vxc', *t0) + t1 = log.timer_debug2('eval vxc', *t1) wv = weight * vxc wv[0] *= .5 aow = rks_grad._make_dR_dao_w(ao, wv) @@ -462,12 +463,14 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): rks_grad._d1_dot_(aow, ao_mask[0].T, out=vmat_tmp[i]) aow = [numint._scale_ao(ao_mask[:4], wv[i,:4]) for i in range(3)] _d1d2_dot_(vmat_tmp, mol, ao_mask[1:4], aow, mask, ao_loc, False) - vmat_tmp = contract('pi,xypq->xyiq', coeff[mask], vmat_tmp) - vmat_tmp = contract('qj,xyiq->xyij', coeff[mask], vmat_tmp) - vmat[ia] += vmat_tmp + #vmat_tmp = contract('pi,xypq->xyiq', coeff[mask], vmat_tmp) + #vmat_tmp = contract('qj,xyiq->xyij', coeff[mask], vmat_tmp) + #vmat[ia] += vmat_tmp + add_sparse(vmat[ia], vmat_tmp, mask) ao_dm0 = aow = None - t0 = log.timer_debug1('integration', *t0) + t1 = log.timer_debug2('integration', *t1) for ia in range(mol.natm): + vmat[ia] = take_last2d(vmat[ia], opt.rev_ao_idx) p0, p1 = aoslices[ia][2:] vmat[ia,:,:,:,p0:p1] += ipip[:,:,:,p0:p1] vmat[ia,:,:,:,p0:p1] += ipip[:,:,p0:p1].transpose(1,0,3,2) @@ -477,15 +480,15 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): YX, YY, YZ = 5, 7, 8 ZX, ZY, ZZ = 6, 8, 9 ao_deriv = 2 + t1 = log.init_timer() for ao_mask, mask, weight, coords \ in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory): - t0 = log.init_timer() nao_non0 = len(mask) ao = contract('nip,ij->njp', ao_mask, coeff[mask]) rho = numint.eval_rho2(opt.mol, ao[:10], mo_coeff, mo_occ, mask, xctype) - t0 = log.timer_debug1('eval rho', *t0) + t1 = log.timer_debug2('eval rho', *t1) vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3] - t0 = log.timer_debug1('eval vxc', *t0) + t1 = log.timer_debug2('eval vxc', *t1) wv = weight * vxc wv[0] *= .5 wv[4] *= .25 @@ -522,11 +525,13 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): _d1d2_dot_(vmat_tmp, mol, [ao_mask[YX], ao_mask[YY], ao_mask[YZ]], aow, mask, ao_loc, False) aow = [numint._scale_ao(ao_mask[3], wv[i,4]) for i in range(3)] _d1d2_dot_(vmat_tmp, mol, [ao_mask[ZX], ao_mask[ZY], ao_mask[ZZ]], aow, mask, ao_loc, False) - vmat_tmp = contract('pi,xypq->xyiq', coeff[mask], vmat_tmp) - vmat_tmp = contract('qj,xyiq->xyij', coeff[mask], vmat_tmp) - vmat[ia] += vmat_tmp - t0 = log.timer_debug1('integration', *t0) + #vmat_tmp = contract('pi,xypq->xyiq', coeff[mask], vmat_tmp) + #vmat_tmp = contract('qj,xyiq->xyij', coeff[mask], vmat_tmp) + #vmat[ia] += vmat_tmp + add_sparse(vmat[ia], vmat_tmp, mask) + t1 = log.timer_debug2('integration', *t1) for ia in range(mol.natm): + vmat[ia] = take_last2d(vmat[ia], opt.rev_ao_idx) p0, p1 = aoslices[ia][2:] vmat[ia,:,:,:,p0:p1] += ipip[:,:,:,p0:p1] vmat[ia,:,:,:,p0:p1] += ipip[:,:,p0:p1].transpose(1,0,3,2) @@ -570,14 +575,14 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory): max_memory = max(2000, max_memory-vmat.size*8/1e6) if xctype == 'LDA': ao_deriv = 1 + t1 = t0 = log.init_timer() for ao, mask, weight, coords \ in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory): - t0 = log.init_timer() ao = contract('nip,ij->njp', ao, coeff[mask]) rho = numint.eval_rho2(opt.mol, ao[0], mo_coeff, mo_occ, mask, xctype) - t0 = log.timer_debug1('eval rho', *t0) + t1 = log.timer_debug2('eval rho', *t1) vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3] - t0 = log.timer_debug1('eval vxc', *t0) + t1 = log.timer_debug2('eval vxc', *t1) wv = weight * vxc[0] aow = numint._scale_ao(ao[0], wv) v_ip += rks_grad._d1_dot_(ao[1:4], aow.T) @@ -592,17 +597,17 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory): aow = [numint._scale_ao(ao[0], wv[i]) for i in range(3)] vmat[ia] += rks_grad._d1_dot_(aow, ao[0].T) ao_dm0 = aow = None - t0 = log.timer_debug1('integration', *t0) + t1 = log.timer_debug2('integration', *t1) elif xctype == 'GGA': ao_deriv = 2 + t1 = t0 = log.init_timer() for ao, mask, weight, coords \ in ni.block_loop(mol, grids, nao, ao_deriv, max_memory): - t0 = log.init_timer() ao = contract('nip,ij->njp', ao, coeff[mask]) rho = numint.eval_rho2(mol, ao[:4], mo_coeff, mo_occ, mask, xctype) - t0 = log.timer_debug1('eval rho', *t0) + t1 = log.timer_debug2('eval rho', *t1) vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3] - t0 = log.timer_debug1('eval vxc', *t0) + t1 = log.timer_debug2('eval vxc', *t1) wv = weight * vxc wv[0] *= .5 v_ip += rks_grad._gga_grad_sum_(ao, wv) @@ -616,20 +621,20 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory): wv[:,0] *= .5 aow = [numint._scale_ao(ao[:4], wv[i,:4]) for i in range(3)] vmat[ia] += rks_grad._d1_dot_(aow, ao[0].T) - t0 = log.timer_debug1('integration', *t0) + t1 = log.timer_debug2('integration', *t1) ao_dm0 = aow = None elif xctype == 'MGGA': if grids.level < 5: log.warn('MGGA Hessian is sensitive to dft grids.') ao_deriv = 2 + t1 = t0 = log.init_timer() for ao, mask, weight, coords \ in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory): - t0 = log.init_timer() ao = contract('nip,ij->njp', ao, coeff[mask]) rho = numint.eval_rho2(opt.mol, ao[:10], mo_coeff, mo_occ, mask, xctype) - t0 = log.timer_debug1('eval rho', *t0) + t1 = log.timer_debug2('eval rho', *t1) vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3] - t0 = log.timer_debug1('eval vxc', *t0) + t1 = log.timer_debug2('eval vxc', *t0) wv = weight * vxc wv[0] *= .5 wv[4] *= .5 # for the factor 1/2 in tau @@ -649,7 +654,7 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory): aow = [numint._scale_ao(ao[j], wv[i,4]) for i in range(3)] vmat[ia] += rks_grad._d1_dot_(aow, ao[j].T) ao_dm0 = aow = None - t0 = log.timer_debug1('integration', *t0) + t1 = log.timer_debug2('integration', *t1) for ia in range(mol.natm): p0, p1 = aoslices[ia][2:] vmat[ia,:,p0:p1] += v_ip[:,p0:p1] diff --git a/gpu4pyscf/lib/cupy_helper.py b/gpu4pyscf/lib/cupy_helper.py index 9088583c..3e9414ef 100644 --- a/gpu4pyscf/lib/cupy_helper.py +++ b/gpu4pyscf/lib/cupy_helper.py @@ -165,9 +165,27 @@ def add_sparse(a, b, indices): ctypes.c_int(count) ) if err != 0: - raise RecursionError('failed in sparse_add2d') + raise RuntimeError('failed in sparse_add2d') return a +def dist_matrix(coords, out=None): + assert coords.flags.c_contiguous + n = coords.shape[0] + if out is None: + out = cupy.empty([n,n]) + + stream = cupy.cuda.get_current_stream() + err = libcupy_helper.dist_matrix( + ctypes.cast(stream.ptr, ctypes.c_void_p), + ctypes.cast(out.data.ptr, ctypes.c_void_p), + ctypes.cast(coords.data.ptr, ctypes.c_void_p), + ctypes.cast(coords.data.ptr, ctypes.c_void_p), + ctypes.c_int(n), + ) + if err != 0: + raise RuntimeError('failed in calculating distance matrix') + return out + def block_c2s_diag(ncart, nsph, angular, counts): ''' constract a cartesian to spherical transformation of n shells @@ -175,21 +193,18 @@ def block_c2s_diag(ncart, nsph, angular, counts): nshells = np.sum(counts) cart2sph = cupy.zeros([ncart, nsph]) - rows = [0] - cols = [0] + + rows = [np.array([0], dtype='int32')] + cols = [np.array([0], dtype='int32')] offsets = [] for l, count in zip(angular, counts): - for _ in range(count): - r, c = c2s_l[l].shape - rows.append(rows[-1] + r) - cols.append(cols[-1] + c) - offsets.append(c2s_offset[l]) - rows = np.asarray(rows, dtype='int32') - cols = np.asarray(cols, dtype='int32') - offsets = np.asarray(offsets, dtype='int32') + r, c = c2s_l[l].shape + rows.append(rows[-1][-1] + np.arange(1,count+1, dtype='int32') * r) + cols.append(cols[-1][-1] + np.arange(1,count+1, dtype='int32') * c) + offsets += [c2s_offset[l]] * count - rows = cupy.asarray(rows, dtype='int32') - cols = cupy.asarray(cols, dtype='int32') + rows = cupy.hstack(rows) + cols = cupy.hstack(cols) offsets = cupy.asarray(offsets, dtype='int32') stream = cupy.cuda.get_current_stream() @@ -271,8 +286,10 @@ def transpose_sum(a): return a + a.transpose(0,2,1) ''' assert a.flags.c_contiguous - assert a.ndim == 3 n = a.shape[-1] + if a.ndim == 2: + a = a.reshape([-1,n,n]) + assert a.ndim == 3 count = a.shape[0] stream = cupy.cuda.get_current_stream() err = libcupy_helper.transpose_sum( diff --git a/gpu4pyscf/lib/cupy_helper/CMakeLists.txt b/gpu4pyscf/lib/cupy_helper/CMakeLists.txt index 3f59109e..445a10f6 100644 --- a/gpu4pyscf/lib/cupy_helper/CMakeLists.txt +++ b/gpu4pyscf/lib/cupy_helper/CMakeLists.txt @@ -17,13 +17,14 @@ #set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -arch=sm_80") -add_library(cupy_helper SHARED +add_library(cupy_helper SHARED transpose.cu block_diag.cu contract_cderi_k.cu take_last2d.cu async_d2h_2d.cu add_sparse.cu + dist_matrix.cu ) set_target_properties(cupy_helper PROPERTIES diff --git a/gpu4pyscf/lib/cupy_helper/dist_matrix.cu b/gpu4pyscf/lib/cupy_helper/dist_matrix.cu new file mode 100644 index 00000000..77f78de7 --- /dev/null +++ b/gpu4pyscf/lib/cupy_helper/dist_matrix.cu @@ -0,0 +1,49 @@ +/* Copyright 2023 The GPU4PySCF Authors. All Rights Reserved. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#define THREADS 32 + +__global__ +static void _calc_distances(double *dist, const double *x, const double *y, int n) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + if (i >= n || j >= n){ + return; + } + + double dx = x[3*i] - y[3*j]; + double dy = x[3*i+1] - y[3*j+1]; + double dz = x[3*i+2] - y[3*j+2]; + dist[i*n+j] = norm3d(dx, dy, dz); +} + +extern "C" { +int dist_matrix(cudaStream_t stream, double *dist, const double *x, const double *y, int n) +{ + int ntile = (n + THREADS - 1) / THREADS; + dim3 threads(THREADS, THREADS); + dim3 blocks(ntile, ntile); + _calc_distances<<>>(dist, x, y, n); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + return 1; + } + return 0; +} +} diff --git a/gpu4pyscf/lib/gdft/gen_grids.cu b/gpu4pyscf/lib/gdft/gen_grids.cu index d71eedf9..67cb869e 100644 --- a/gpu4pyscf/lib/gdft/gen_grids.cu +++ b/gpu4pyscf/lib/gdft/gen_grids.cu @@ -67,7 +67,7 @@ int ngrids, int natm) dx = xi - xj_t; dy = yi - yj_t; dz = zi - zj_t; - double dij = norm3d(dx, dy, dz); + double dij = rnorm3d(dx, dy, dz); // distance between atom i and atom j dij_smem[tx] = dij; @@ -88,7 +88,7 @@ int ngrids, int natm) double dij = dij_smem[l]; double aij = a_smem[l]; - double g = (atom_i == atom_j) ? 0.0 : (dig - djg) / dij; + double g = (atom_i == atom_j) ? 0.0 : (dig - djg) * dij; // atomic radii adjust function double g1 = g*g - 1.0; diff --git a/gpu4pyscf/lib/gint/g3c2e.cu b/gpu4pyscf/lib/gint/g3c2e.cu index 2395fbbe..c0f0f425 100644 --- a/gpu4pyscf/lib/gint/g3c2e.cu +++ b/gpu4pyscf/lib/gint/g3c2e.cu @@ -5,7 +5,7 @@ void GINTfill_int3c2e_kernel(GINTEnvVars envs, ERITensor eri, BasisProdOffsets o int ntasks_kl = offsets.ntasks_kl; int task_ij = blockIdx.x * blockDim.x + threadIdx.x; int task_kl = blockIdx.y * blockDim.y + threadIdx.y; - + if (task_ij >= ntasks_ij || task_kl >= ntasks_kl) { return; } @@ -26,7 +26,7 @@ void GINTfill_int3c2e_kernel(GINTEnvVars envs, ERITensor eri, BasisProdOffsets o int lsh = bas_pair2ket[bas_kl]; double uw[NROOTS*2]; double g[GSIZE_INT3C]; - + double* __restrict__ a12 = c_bpcache.a12; double* __restrict__ x12 = c_bpcache.x12; double* __restrict__ y12 = c_bpcache.y12; @@ -48,9 +48,9 @@ void GINTfill_int3c2e_kernel(GINTEnvVars envs, ERITensor eri, BasisProdOffsets o as_ksh = lsh; as_lsh = ksh; } + GINTmemset_int3c2e(envs, eri, ish, jsh, ksh); for (ij = prim_ij; ij < prim_ij+nprim_ij; ++ij) { for (kl = prim_kl; kl < prim_kl+nprim_kl; ++kl) { - double aij = a12[ij]; double xij = x12[ij]; double yij = y12[ij]; @@ -65,7 +65,7 @@ void GINTfill_int3c2e_kernel(GINTEnvVars envs, ERITensor eri, BasisProdOffsets o double aijkl = aij + akl; double a1 = aij * akl; double a0 = a1 / aijkl; - double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0; + double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0; a0 *= theta; double x = a0 * (xijxkl * xijxkl + yijykl * yijykl + zijzkl * zijzkl); GINTrys_root(x, uw); diff --git a/gpu4pyscf/lib/gint/gout3c2e.cu b/gpu4pyscf/lib/gint/gout3c2e.cu index fb4ff75e..c6fb1eba 100644 --- a/gpu4pyscf/lib/gint/gout3c2e.cu +++ b/gpu4pyscf/lib/gint/gout3c2e.cu @@ -235,15 +235,18 @@ static void GINTwrite_int3c2e_ipip_direct(GINTEnvVars envs, ERITensor eri, doubl double eri_zy = 0; double eri_zz = 0; for (int ir = 0; ir < NROOTS; ++ir){ - eri_xx += g3[ix + ir] * g0[iy + ir] * g0[iz + ir]; - eri_xy += g2[ix + ir] * g1[iy + ir] * g0[iz + ir]; - eri_xz += g2[ix + ir] * g0[iy + ir] * g1[iz + ir]; - eri_yx += g1[ix + ir] * g2[iy + ir] * g0[iz + ir]; - eri_yy += g0[ix + ir] * g3[iy + ir] * g0[iz + ir]; - eri_yz += g0[ix + ir] * g2[iy + ir] * g1[iz + ir]; - eri_zx += g1[ix + ir] * g0[iy + ir] * g2[iz + ir]; - eri_zy += g0[ix + ir] * g1[iy + ir] * g2[iz + ir]; - eri_zz += g0[ix + ir] * g0[iy + ir] * g3[iz + ir]; + double g0_x = g0[ix + ir]; + double g0_y = g0[iy + ir]; + double g0_z = g0[iz + ir]; + eri_xx += g3[ix + ir] * g0_y * g0_z ; + eri_xy += g2[ix + ir] * g1[iy + ir] * g0_z ; + eri_xz += g2[ix + ir] * g0_y * g1[iz + ir]; + eri_yx += g1[ix + ir] * g2[iy + ir] * g0_z ; + eri_yy += g0_x * g3[iy + ir] * g0_z ; + eri_yz += g0_x * g2[iy + ir] * g1[iz + ir]; + eri_zx += g1[ix + ir] * g0_y * g2[iz + ir]; + eri_zy += g0_x * g1[iy + ir] * g2[iz + ir]; + eri_zz += g0_x * g0_y * g3[iz + ir]; } off = i+jstride*j; pxx_eri[off] += eri_xx; @@ -314,13 +317,38 @@ static void GINTwrite_int3c2e_ip_direct(GINTEnvVars envs, ERITensor eri, double* } } +template __device__ +static void GINTmemset_int3c2e(GINTEnvVars envs, ERITensor eri, int ish, int jsh, int ksh) +{ + int *ao_loc = c_bpcache.ao_loc; + size_t jstride = eri.stride_j; + size_t kstride = eri.stride_k; + int i0 = ao_loc[ish ] - eri.ao_offsets_i; + int i1 = ao_loc[ish+1] - eri.ao_offsets_i; + int j0 = ao_loc[jsh ] - eri.ao_offsets_j; + int j1 = ao_loc[jsh+1] - eri.ao_offsets_j; + int k0 = ao_loc[ksh ] - eri.ao_offsets_k; + int k1 = ao_loc[ksh+1] - eri.ao_offsets_k; + int i, j, k, n; + double* __restrict__ p_eri; + + for (n = 0, k = k0; k < k1; ++k) { + p_eri = eri.data + k * kstride; + + for (j = j0; j < j1; ++j) { + for (i = i0; i < i1; ++i, ++n) { + p_eri[i+jstride*j] = 0; + } + } + } +} + template __device__ static void GINTwrite_int3c2e_direct(GINTEnvVars envs, ERITensor eri, double* g, int ish, int jsh, int ksh) { int *ao_loc = c_bpcache.ao_loc; size_t jstride = eri.stride_j; size_t kstride = eri.stride_k; - size_t lstride = eri.stride_l; int i0 = ao_loc[ish ] - eri.ao_offsets_i; int i1 = ao_loc[ish+1] - eri.ao_offsets_i; int j0 = ao_loc[jsh ] - eri.ao_offsets_j; @@ -337,7 +365,7 @@ static void GINTwrite_int3c2e_direct(GINTEnvVars envs, ERITensor eri, double* g, int ix, iy, iz, off; for (n = 0, k = k0; k < k1; ++k) { - p_eri = eri.data + 0 * lstride + k * kstride; + p_eri = eri.data + k * kstride; for (j = j0; j < j1; ++j) { for (i = i0; i < i1; ++i, ++n) { diff --git a/gpu4pyscf/lib/gint/nr_fill_ao_int3c2e.cu b/gpu4pyscf/lib/gint/nr_fill_ao_int3c2e.cu index 0953fc10..bcb5e382 100644 --- a/gpu4pyscf/lib/gint/nr_fill_ao_int3c2e.cu +++ b/gpu4pyscf/lib/gint/nr_fill_ao_int3c2e.cu @@ -41,7 +41,7 @@ static int GINTfill_int3c2e_tasks(ERITensor *eri, BasisProdOffsets *offsets, GIN int ntasks_kl = offsets->ntasks_kl; assert(ntasks_kl < 65536*THREADSY); int type_ijkl; - + dim3 threads(THREADSX, THREADSY); dim3 blocks((ntasks_ij+THREADSX-1)/THREADSX, (ntasks_kl+THREADSY-1)/THREADSY); switch (nrys_roots) { @@ -121,14 +121,14 @@ int GINTfill_int3c2e(cudaStream_t stream, BasisProdCache *bpcache, double *eri, ContractionProdType *cp_kl = bpcache->cptype + cp_kl_id; GINTEnvVars envs; int ng[4] = {0,0,0,0}; - + GINTinit_EnvVars(&envs, cp_ij, cp_kl, ng); envs.omega = omega; if (envs.nrys_roots > 8) { return 2; } - + // TODO: improve the efficiency by unrolling if (envs.nrys_roots > 1) { int16_t *idx4c = (int16_t *)malloc(sizeof(int16_t) * envs.nf * 3); @@ -136,9 +136,9 @@ int GINTfill_int3c2e(cudaStream_t stream, BasisProdCache *bpcache, double *eri, checkCudaErrors(cudaMemcpyToSymbol(c_idx4c, idx4c, sizeof(int16_t)*envs.nf*3)); free(idx4c); } - + int kl_bin, ij_bin1; - + //checkCudaErrors(cudaMemcpyToSymbol(c_envs, &envs, sizeof(GINTEnvVars))); // move bpcache to constant memory checkCudaErrors(cudaMemcpyToSymbol(c_bpcache, bpcache, sizeof(BasisProdCache))); @@ -154,7 +154,7 @@ int GINTfill_int3c2e(cudaStream_t stream, BasisProdCache *bpcache, double *eri, eritensor.nao = nao; eritensor.data = eri; BasisProdOffsets offsets; - + int *bas_pairs_locs = bpcache->bas_pairs_locs; int *primitive_pairs_locs = bpcache->primitive_pairs_locs; for (kl_bin = 0; kl_bin < nbins; kl_bin++) { @@ -182,12 +182,12 @@ int GINTfill_int3c2e(cudaStream_t stream, BasisProdCache *bpcache, double *eri, int err = -1; err = GINTfill_int3c2e_tasks(&eritensor, &offsets, &envs, stream); - + if (err != 0) { return err; } } - + return 0; } } diff --git a/gpu4pyscf/lib/gvhf/g3c2e.cuh b/gpu4pyscf/lib/gvhf/g3c2e.cuh index c82d037c..9195cddd 100644 --- a/gpu4pyscf/lib/gvhf/g3c2e.cuh +++ b/gpu4pyscf/lib/gvhf/g3c2e.cuh @@ -58,7 +58,7 @@ static void GINTkernel_int3c2e_ip1_getjk(JKMatrix jk, double* __restrict__ gout, double sx = gout[3*n]; double sy = gout[3*n + 1]; double sz = gout[3*n + 2]; - + double rhoj_tmp = dm[off_dm] * rhoj[k]; double rhok_tmp = rhok[off_rhok]; @@ -73,7 +73,7 @@ static void GINTkernel_int3c2e_ip1_getjk(JKMatrix jk, double* __restrict__ gout, } } } - + for (i = i0; i < i1; ++i){ int ii = 3*(i-i0); atomicAdd(vj + i + 0*nao, j3[ii + 0]); @@ -107,22 +107,22 @@ static void GINTkernel_int3c2e_ip2_getjk(JKMatrix jk, double* __restrict__ gout, double* __restrict__ dm = jk.dm; double j3[GPU_CART_MAX * 3]; double k3[GPU_CART_MAX * 3]; - + for (k = 0; k < (k1-k0) * 3; k++){ j3[k] = 0.0; k3[k] = 0.0; } - + for (n = 0, k = k0; k < k1; ++k) { for (j = j0; j < j1; ++j) { for (i = i0; i < i1; ++i, ++n) { off_dm = i + nao*j; off_rhok = i + nao*j + k*nao*nao; - + double sx = gout[3 * n]; double sy = gout[3 * n + 1]; double sz = gout[3 * n + 2]; - + double rhoj_tmp = dm[off_dm] * rhoj[k]; double rhok_tmp = rhok[off_rhok]; @@ -143,7 +143,7 @@ static void GINTkernel_int3c2e_ip2_getjk(JKMatrix jk, double* __restrict__ gout, atomicAdd(vj + k + 0*naux, j3[kk + 0]); atomicAdd(vj + k + 1*naux, j3[kk + 1]); atomicAdd(vj + k + 2*naux, j3[kk + 2]); - + atomicAdd(vk + k + 0*naux, k3[kk + 0]); atomicAdd(vk + k + 1*naux, k3[kk + 1]); atomicAdd(vk + k + 2*naux, k3[kk + 2]); @@ -169,7 +169,7 @@ static void GINTkernel_int3c2e_getj_pass1(GINTEnvVars envs, JKMatrix jk, double* int i, j, k; double* __restrict__ rhoj = jk.rhoj; double* __restrict__ dm = jk.dm; - + int i_l = envs.i_l; int j_l = envs.j_l; int k_l = envs.k_l; @@ -243,11 +243,11 @@ static void GINTkernel_int3c2e_getj_pass2(GINTEnvVars envs, JKMatrix jk, double* double vj_tmp = 0.0; for (k = k0; k < k1; ++k){ int kp = k - k0; - + int loc_k = c_l_locs[k_l] + kp; int loc_j = c_l_locs[j_l] + jp; int loc_i = c_l_locs[i_l] + ip; - + int ix = dk * idx[loc_k] + dj * idx[loc_j] + di * idx[loc_i]; int iy = dk * idy[loc_k] + dj * idy[loc_j] + di * idy[loc_i] + envs.g_size; int iz = dk * idz[loc_k] + dj * idz[loc_j] + di * idz[loc_i] + envs.g_size * 2; @@ -306,7 +306,7 @@ static void GINTkernel_int3c2e_ip1_getjk_direct(GINTEnvVars envs, JKMatrix jk, d int jp = j - j0; for (i = i0; i < i1; ++i) { int ip = i - i0; - + int loc_k = c_l_locs[k_l] + kp; int loc_j = c_l_locs[j_l] + jp; int loc_i = c_l_locs[i_l] + ip; @@ -327,7 +327,7 @@ static void GINTkernel_int3c2e_ip1_getjk_direct(GINTEnvVars envs, JKMatrix jk, d sy += gx * f[iy + ir] * gz; sz += gx * gy * f[iz + ir]; } - + int ii = 3*(i-i0); off_rhok = i + nao*j + k*nao*nao; double rhok_tmp = rhok[off_rhok]; @@ -348,7 +348,7 @@ static void GINTkernel_int3c2e_ip1_getjk_direct(GINTEnvVars envs, JKMatrix jk, d int jp = j - j0; for (i = i0; i < i1; ++i) { int ip = i - i0; - + int loc_k = c_l_locs[k_l] + kp; int loc_j = c_l_locs[j_l] + jp; int loc_i = c_l_locs[i_l] + ip; @@ -388,7 +388,7 @@ static void GINTkernel_int3c2e_ip1_getjk_direct(GINTEnvVars envs, JKMatrix jk, d int jp = j - j0; for (i = i0; i < i1; ++i) { int ip = i - i0; - + int loc_k = c_l_locs[k_l] + kp; int loc_j = c_l_locs[j_l] + jp; int loc_i = c_l_locs[i_l] + ip; @@ -453,7 +453,7 @@ static void GINTkernel_int3c2e_ip2_getjk_direct(GINTEnvVars envs, JKMatrix jk, d double* __restrict__ rhoj = jk.rhoj; double* __restrict__ rhok = jk.rhok; double* __restrict__ dm = jk.dm; - + int i_l = envs.i_l; int j_l = envs.j_l; int k_l = envs.k_l; @@ -468,7 +468,7 @@ static void GINTkernel_int3c2e_ip2_getjk_direct(GINTEnvVars envs, JKMatrix jk, d int jp = j - j0; for (i = i0; i < i1; ++i) { int ip = i - i0; - + int loc_k = c_l_locs[k_l] + kp; int loc_j = c_l_locs[j_l] + jp; int loc_i = c_l_locs[i_l] + ip; @@ -476,7 +476,7 @@ static void GINTkernel_int3c2e_ip2_getjk_direct(GINTEnvVars envs, JKMatrix jk, d int ix = dk * idx[loc_k] + dj * idx[loc_j] + di * idx[loc_i]; int iy = dk * idy[loc_k] + dj * idy[loc_j] + di * idy[loc_i] + g_size; int iz = dk * idz[loc_k] + dj * idz[loc_j] + di * idz[loc_i] + g_size * 2; - + double sx = 0.0; double sy = 0.0; double sz = 0.0; @@ -510,7 +510,7 @@ static void GINTkernel_int3c2e_ip2_getjk_direct(GINTEnvVars envs, JKMatrix jk, d int jp = j - j0; for (i = i0; i < i1; ++i) { int ip = i - i0; - + int loc_k = c_l_locs[k_l] + kp; int loc_j = c_l_locs[j_l] + jp; int loc_i = c_l_locs[i_l] + ip; @@ -518,7 +518,7 @@ static void GINTkernel_int3c2e_ip2_getjk_direct(GINTEnvVars envs, JKMatrix jk, d int ix = dk * idx[loc_k] + dj * idx[loc_j] + di * idx[loc_i]; int iy = dk * idy[loc_k] + dj * idy[loc_j] + di * idy[loc_i] + g_size; int iz = dk * idz[loc_k] + dj * idz[loc_j] + di * idz[loc_i] + g_size * 2; - + double sx = 0.0; double sy = 0.0; double sz = 0.0; @@ -551,7 +551,7 @@ static void GINTkernel_int3c2e_ip2_getjk_direct(GINTEnvVars envs, JKMatrix jk, d int jp = j - j0; for (i = i0; i < i1; ++i) { int ip = i - i0; - + int loc_k = c_l_locs[k_l] + kp; int loc_j = c_l_locs[j_l] + jp; int loc_i = c_l_locs[i_l] + ip; @@ -559,7 +559,7 @@ static void GINTkernel_int3c2e_ip2_getjk_direct(GINTEnvVars envs, JKMatrix jk, d int ix = dk * idx[loc_k] + dj * idx[loc_j] + di * idx[loc_i]; int iy = dk * idy[loc_k] + dj * idy[loc_j] + di * idy[loc_i] + g_size; int iz = dk * idz[loc_k] + dj * idz[loc_j] + di * idz[loc_i] + g_size * 2; - + double sx = 0.0; double sy = 0.0; double sz = 0.0; @@ -579,7 +579,7 @@ static void GINTkernel_int3c2e_ip2_getjk_direct(GINTEnvVars envs, JKMatrix jk, d j3[kk + 0] += sx * rhoj_tmp; j3[kk + 1] += sy * rhoj_tmp; j3[kk + 2] += sz * rhoj_tmp; - + off_rhok = i + nao*j + k*nao*nao; double rhok_tmp = rhok[off_rhok]; k3[kk + 0] += sx * rhok_tmp; @@ -602,7 +602,7 @@ static void write_int3c2e_ip1_jk(JKMatrix jk, double* j3, double* k3, int ish){ int tx = threadIdx.x; int ty = threadIdx.y; __shared__ double sdata[THREADSX][THREADSY]; - + if (vj != NULL){ for (int i = i0; i < i1; ++i){ for (int j = 0; j < 3; j++){ @@ -644,7 +644,7 @@ static void write_int3c2e_ip2_jk(JKMatrix jk, double *j3, double* k3, int ksh){ int tx = threadIdx.x; int ty = threadIdx.y; __shared__ double sdata[THREADSX][THREADSY]; - + if (vj != NULL){ for (int k = k0; k < k1; ++k){ for (int j = 0; j < 3; j++){ @@ -741,7 +741,7 @@ static void GINTrun_int3c2e_ip1_jk_kernel1000(GINTEnvVars envs, JKMatrix jk, Bas double aijkl = aij + akl; double a1 = aij * akl; double a0 = a1 / aijkl; - double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0; + double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0; a0 *= theta; double x = a0 * (xijxkl * xijxkl + yijykl * yijykl + zijzkl * zijzkl); double fac = eij * ekl * sqrt(a0 / (a1 * a1 * a1)); @@ -770,11 +770,11 @@ static void GINTrun_int3c2e_ip1_jk_kernel1000(GINTEnvVars envs, JKMatrix jk, Bas double g_3 = c00y; double g_4 = norm * fac * weight0; double g_5 = g_4 * c00z; - + double f_1 = ai2 * g_1; double f_3 = ai2 * g_3; double f_5 = ai2 * g_5; - + gout0 += f_1 * g_2 * g_4; gout1 += g_0 * f_3 * g_4; gout2 += g_0 * g_2 * f_5; @@ -784,14 +784,14 @@ static void GINTrun_int3c2e_ip1_jk_kernel1000(GINTEnvVars envs, JKMatrix jk, Bas int i0 = ao_loc[ish] - jk.ao_offsets_i; int j0 = ao_loc[jsh] - jk.ao_offsets_j; int k0 = ao_loc[ksh] - jk.ao_offsets_k; - + int nao = jk.nao; double* __restrict__ dm = jk.dm; double* __restrict__ rhok = jk.rhok; double* __restrict__ rhoj = jk.rhoj; double* __restrict__ vj = jk.vj; double* __restrict__ vk = jk.vk; - + int tx = threadIdx.x; int ty = threadIdx.y; __shared__ double sdata[THREADSX][THREADSY]; @@ -903,7 +903,7 @@ static void GINTrun_int3c2e_ip2_jk_kernel0010(GINTEnvVars envs, JKMatrix jk, Bas double aijkl = aij + akl; double a1 = aij * akl; double a0 = a1 / aijkl; - double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0; + double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0; a0 *= theta; double x = a0 * (xijxkl * xijxkl + yijykl * yijykl + zijzkl * zijzkl); double fac = norm * eij * ekl * sqrt(a0 / (a1 * a1 * a1)); diff --git a/gpu4pyscf/lib/gvhf/g3c2e_pass1.cu b/gpu4pyscf/lib/gvhf/g3c2e_pass1.cu index 0bd30319..cc100d15 100644 --- a/gpu4pyscf/lib/gvhf/g3c2e_pass1.cu +++ b/gpu4pyscf/lib/gvhf/g3c2e_pass1.cu @@ -13,7 +13,7 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ - + template __global__ void GINTint3c2e_pass1_j_kernel(GINTEnvVars envs, JKMatrix jk, BasisProdOffsets offsets) @@ -22,7 +22,7 @@ void GINTint3c2e_pass1_j_kernel(GINTEnvVars envs, JKMatrix jk, BasisProdOffsets int ntasks_kl = offsets.ntasks_kl; int task_ij = blockIdx.x * blockDim.x + threadIdx.x; int task_kl = blockIdx.y * blockDim.y + threadIdx.y; - + if (task_ij >= ntasks_ij || task_kl >= ntasks_kl) { return; } @@ -42,12 +42,16 @@ void GINTint3c2e_pass1_j_kernel(GINTEnvVars envs, JKMatrix jk, BasisProdOffsets int lsh = bas_pair2ket[bas_kl]; double uw[NROOTS*2]; double g[GSIZE]; - + double* __restrict__ a12 = c_bpcache.a12; double* __restrict__ x12 = c_bpcache.x12; double* __restrict__ y12 = c_bpcache.y12; double* __restrict__ z12 = c_bpcache.z12; + if (ish == jsh){ + norm *= .5; + } + int ij, kl; int as_ish, as_jsh, as_ksh, as_lsh; if (envs.ibase) { @@ -70,7 +74,7 @@ void GINTint3c2e_pass1_j_kernel(GINTEnvVars envs, JKMatrix jk, BasisProdOffsets double xij = x12[ij]; double yij = y12[ij]; double zij = z12[ij]; - for (kl = prim_kl; kl < prim_kl+nprim_kl; ++kl) { + for (kl = prim_kl; kl < prim_kl+nprim_kl; ++kl) { double akl = a12[kl]; double xkl = x12[kl]; double ykl = y12[kl]; diff --git a/gpu4pyscf/lib/gvhf/g3c2e_pass1_root1.cu b/gpu4pyscf/lib/gvhf/g3c2e_pass1_root1.cu index cb9dbf11..1216726a 100644 --- a/gpu4pyscf/lib/gvhf/g3c2e_pass1_root1.cu +++ b/gpu4pyscf/lib/gvhf/g3c2e_pass1_root1.cu @@ -41,6 +41,9 @@ static void GINTint3c2e_pass1_j_kernel0000(GINTEnvVars envs, JKMatrix jk, BasisP double* __restrict__ x12 = c_bpcache.x12; double* __restrict__ y12 = c_bpcache.y12; double* __restrict__ z12 = c_bpcache.z12; + if (ish == jsh){ + norm *= .5; + } int ij, kl; double gout0 = 0; for (ij = prim_ij; ij < prim_ij+nprim_ij; ++ij) { @@ -70,7 +73,7 @@ static void GINTint3c2e_pass1_j_kernel0000(GINTEnvVars envs, JKMatrix jk, BasisP } gout0 += fac; } } - + int *ao_loc = c_bpcache.ao_loc; int nao = jk.nao; int i0 = ao_loc[ish] - jk.ao_offsets_i; @@ -107,6 +110,9 @@ static void GINTint3c2e_pass1_j_kernel0010(GINTEnvVars envs, JKMatrix jk, BasisP double* __restrict__ x12 = c_bpcache.x12; double* __restrict__ y12 = c_bpcache.y12; double* __restrict__ z12 = c_bpcache.z12; + if (ish == jsh){ + norm *= .5; + } int ij, kl; int prim_ij0, prim_ij1, prim_kl0, prim_kl1; int nbas = c_bpcache.nbas; @@ -215,6 +221,9 @@ static void GINTint3c2e_pass1_j_kernel1000(GINTEnvVars envs, JKMatrix jk, BasisP double* __restrict__ x12 = c_bpcache.x12; double* __restrict__ y12 = c_bpcache.y12; double* __restrict__ z12 = c_bpcache.z12; + if (ish == jsh){ + norm *= .5; + } int ij, kl; int prim_ij0, prim_ij1, prim_kl0, prim_kl1; int nbas = c_bpcache.nbas; diff --git a/gpu4pyscf/lib/gvhf/g3c2e_pass2.cu b/gpu4pyscf/lib/gvhf/g3c2e_pass2.cu index 83d37261..6e395ce7 100644 --- a/gpu4pyscf/lib/gvhf/g3c2e_pass2.cu +++ b/gpu4pyscf/lib/gvhf/g3c2e_pass2.cu @@ -13,7 +13,7 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ - + template __global__ void GINTint3c2e_pass2_j_kernel(GINTEnvVars envs, JKMatrix jk, BasisProdOffsets offsets) @@ -22,7 +22,7 @@ void GINTint3c2e_pass2_j_kernel(GINTEnvVars envs, JKMatrix jk, BasisProdOffsets int ntasks_kl = offsets.ntasks_kl; int task_ij = blockIdx.x * blockDim.x + threadIdx.x; int task_kl = blockIdx.y * blockDim.y + threadIdx.y; - + if (task_ij >= ntasks_ij || task_kl >= ntasks_kl) { return; } @@ -42,7 +42,9 @@ void GINTint3c2e_pass2_j_kernel(GINTEnvVars envs, JKMatrix jk, BasisProdOffsets int lsh = bas_pair2ket[bas_kl]; double uw[NROOTS*2]; double g[GSIZE]; - + if (ish == jsh){ + norm *= .5; + } double* __restrict__ a12 = c_bpcache.a12; double* __restrict__ x12 = c_bpcache.x12; double* __restrict__ y12 = c_bpcache.y12; @@ -70,7 +72,7 @@ void GINTint3c2e_pass2_j_kernel(GINTEnvVars envs, JKMatrix jk, BasisProdOffsets double xij = x12[ij]; double yij = y12[ij]; double zij = z12[ij]; - for (kl = prim_kl; kl < prim_kl+nprim_kl; ++kl) { + for (kl = prim_kl; kl < prim_kl+nprim_kl; ++kl) { double akl = a12[kl]; double xkl = x12[kl]; double ykl = y12[kl]; diff --git a/gpu4pyscf/lib/gvhf/g3c2e_pass2_root1.cu b/gpu4pyscf/lib/gvhf/g3c2e_pass2_root1.cu index 2f76578c..47c65d3f 100644 --- a/gpu4pyscf/lib/gvhf/g3c2e_pass2_root1.cu +++ b/gpu4pyscf/lib/gvhf/g3c2e_pass2_root1.cu @@ -42,6 +42,9 @@ static void GINTint3c2e_pass2_j_kernel0000(GINTEnvVars envs, JKMatrix jk, BasisP double* __restrict__ x12 = c_bpcache.x12; double* __restrict__ y12 = c_bpcache.y12; double* __restrict__ z12 = c_bpcache.z12; + if (ish == jsh){ + norm *= .5; + } int ij, kl; double gout0 = 0; for (ij = prim_ij; ij < prim_ij+nprim_ij; ++ij) { @@ -71,7 +74,7 @@ static void GINTint3c2e_pass2_j_kernel0000(GINTEnvVars envs, JKMatrix jk, BasisP } gout0 += fac; } } - + int *ao_loc = c_bpcache.ao_loc; int nao = jk.nao; int i0 = ao_loc[ish] - jk.ao_offsets_i; @@ -113,7 +116,9 @@ static void GINTint3c2e_pass2_j_kernel0010(GINTEnvVars envs, JKMatrix jk, BasisP double* __restrict__ bas_x = c_bpcache.bas_coords; double* __restrict__ bas_y = bas_x + nbas; double* __restrict__ bas_z = bas_y + nbas; - + if (ish == jsh){ + norm *= .5; + } double gout0 = 0; double gout1 = 0; double gout2 = 0; @@ -219,7 +224,9 @@ static void GINTint3c2e_pass2_j_kernel1000(GINTEnvVars envs, JKMatrix jk, BasisP double* __restrict__ bas_x = c_bpcache.bas_coords; double* __restrict__ bas_y = bas_x + nbas; double* __restrict__ bas_z = bas_y + nbas; - + if (ish == jsh){ + norm *= .5; + } double gout0 = 0; double gout1 = 0; double gout2 = 0; diff --git a/gpu4pyscf/lib/logger.py b/gpu4pyscf/lib/logger.py index 7b46fb27..5ca2ec13 100644 --- a/gpu4pyscf/lib/logger.py +++ b/gpu4pyscf/lib/logger.py @@ -25,6 +25,7 @@ WARN = lib.logger.WARN DEBUG = lib.logger.DEBUG DEBUG1= lib.logger.DEBUG1 +DEBUG2= lib.logger.DEBUG2 TIMER_LEVEL = lib.logger.TIMER_LEVEL flush = lib.logger.flush @@ -84,17 +85,24 @@ def _timer_debug1(rec, msg, cpu0=None, wall0=None, gpu0=None, sync=True): rec._t0 = process_clock() return rec._t0, +def _timer_debug2(rec, msg, cpu0=None, wall0=None, gpu0=None, sync=True): + if rec.verbose >= DEBUG2: + return timer(rec, msg, cpu0, wall0, gpu0) + return cpu0, wall0, gpu0 + info = lib.logger.info note = lib.logger.note debug = lib.logger.debug debug1 = lib.logger.debug1 debug2 = lib.logger.debug2 timer_debug1 = _timer_debug1 +timer_debug2 = _timer_debug2 class Logger(lib.logger.Logger): def __init__(self, stdout=sys.stdout, verbose=NOTE): super().__init__(stdout=stdout, verbose=verbose) timer_debug1 = _timer_debug1 + timer_debug2 = _timer_debug2 timer = timer init_timer = init_timer diff --git a/gpu4pyscf/lib/tests/test_cupy_helper.py b/gpu4pyscf/lib/tests/test_cupy_helper.py index 614e0fa3..116f4d5e 100644 --- a/gpu4pyscf/lib/tests/test_cupy_helper.py +++ b/gpu4pyscf/lib/tests/test_cupy_helper.py @@ -18,7 +18,7 @@ import cupy from gpu4pyscf.lib.cupy_helper import ( take_last2d, transpose_sum, krylov, unpack_sparse, - add_sparse) + add_sparse, dist_matrix) class KnownValues(unittest.TestCase): def test_take_last2d(self): @@ -69,6 +69,13 @@ def test_sparse(self): add_sparse(a, b, indices) assert cupy.linalg.norm(a - a0) < 1e-10 + def test_dist_matrix(self): + a = cupy.random.rand(4, 3) + rij = cupy.sum((a[:,None,:] - a[None,:,:])**2, axis=2)**0.5 + + rij0 = dist_matrix(a) + assert cupy.linalg.norm(rij - rij0) < 1e-10 + if __name__ == "__main__": print("Full tests for cupy helper module") unittest.main() \ No newline at end of file diff --git a/gpu4pyscf/solvent/pcm.py b/gpu4pyscf/solvent/pcm.py index 428e5a08..d2a4e9b6 100644 --- a/gpu4pyscf/solvent/pcm.py +++ b/gpu4pyscf/solvent/pcm.py @@ -29,6 +29,7 @@ from gpu4pyscf.solvent import _attach_solvent from gpu4pyscf.df import int3c2e from gpu4pyscf.lib import logger +from gpu4pyscf.lib.cupy_helper import dist_matrix libdft = lib.load_library('libdft') @@ -191,7 +192,8 @@ def get_D_S(surface, with_S=True, with_D=False): xi_i, xi_j = cupy.meshgrid(charge_exp, charge_exp, indexing='ij') xi_ij = xi_i * xi_j / (xi_i**2 + xi_j**2)**0.5 #rij = scipy.spatial.distance.cdist(grid_coords, grid_coords) - rij = cupy.sum((grid_coords[:,None,:] - grid_coords[None,:,:])**2, axis=2)**0.5 + #rij = cupy.sum((grid_coords[:,None,:] - grid_coords[None,:,:])**2, axis=2)**0.5 + rij = dist_matrix(grid_coords) xi_r_ij = xi_ij * rij cupy.fill_diagonal(rij, 1) S = scipy.special.erf(xi_r_ij) / rij