diff --git a/examples/18-to_gpu0.py b/examples/18-to_gpu0.py
new file mode 100644
index 00000000..d017047a
--- /dev/null
+++ b/examples/18-to_gpu0.py
@@ -0,0 +1,46 @@
+# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+
+import pyscf
+from pyscf import lib
+from pyscf.dft import rks
+lib.num_threads(8)
+
+atom = '''
+O 0.0000000000 -0.0000000000 0.1174000000
+H -0.7570000000 -0.0000000000 -0.4696000000
+H 0.7570000000 0.0000000000 -0.4696000000
+'''
+
+mol = pyscf.M(atom=atom, basis='def2-tzvpp', max_memory=32000, output='./pyscf.log')
+
+mol.verbose = 4
+mf = rks.RKS(mol, xc='B3LYP').density_fit()
+mf_GPU = mf.to_gpu()
+
+# Compute Energy
+e_dft = mf_GPU.kernel()
+print(f"total energy = {e_dft}")
+
+# Compute Gradient
+g = mf_GPU.nuc_grad_method()
+g.max_memory = 20000
+g.auxbasis_response = True
+g_dft = g.kernel()
+
+# Compute Hessian
+h = mf_GPU.Hessian()
+h.auxbasis_response = 2
+h_dft = h.kernel()
diff --git a/examples/19-pcm_optimize.py b/examples/19-pcm_optimize.py
new file mode 100644
index 00000000..75ba681b
--- /dev/null
+++ b/examples/19-pcm_optimize.py
@@ -0,0 +1,41 @@
+# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+
+import numpy as np
+import pyscf
+from pyscf import lib, df
+from gpu4pyscf.dft import rks
+from pyscf.geomopt.geometric_solver import optimize
+lib.num_threads(8)
+
+atom ='''
+O 0.0000000000 -0.0000000000 0.1174000000
+H -0.7570000000 -0.0000000000 -0.4696000000
+H 0.7570000000 0.0000000000 -0.4696000000
+'''
+
+mol = pyscf.M(atom=atom, basis='def2-tzvpp', verbose=0)
+
+mf = rks.RKS(mol, xc='HYB_GGA_XC_B3LYP').density_fit()
+mf = mf.PCM()
+mf.verbose = 3
+mf.grids.atom_grid = (99,590)
+mf.small_rho_cutoff = 1e-10
+mf.with_solvent.lebedev_order = 29 # 302 Lebedev grids
+mf.with_solvent.method = 'C-PCM'
+mf.with_solvent.eps = 78.3553
+
+mol_eq = optimize(mf, maxsteps=20)
+
diff --git a/examples/dft_driver.py b/examples/dft_driver.py
index 80830e6c..9d17560c 100644
--- a/examples/dft_driver.py
+++ b/examples/dft_driver.py
@@ -27,6 +27,7 @@
parser.add_argument("--solvent", type=str, default='')
args = parser.parse_args()
+lib.num_threads(16)
start_time = time.time()
bas = args.basis
mol = pyscf.M(
@@ -34,10 +35,10 @@
basis=bas,
max_memory=32000)
# set verbose >= 6 for debugging timer
-mol.verbose = 4
+mol.verbose = 7
mf_df = rks.RKS(mol, xc=args.xc).density_fit(auxbasis=args.auxbasis)
-mf_df.verbose = 4
+mf_df.verbose = 7
if args.solvent:
mf_df = mf_df.PCM()
diff --git a/gpu4pyscf/__config__.py b/gpu4pyscf/__config__.py
index 5b740207..93346e36 100644
--- a/gpu4pyscf/__config__.py
+++ b/gpu4pyscf/__config__.py
@@ -5,7 +5,7 @@
# such as A100-80G
if props['totalGlobalMem'] >= 64 * GB:
min_ao_blksize = 256
- min_grid_blksize = 256*256
+ min_grid_blksize = 128*128
ao_aligned = 32
grid_aligned = 128
mem_fraction = 0.9
diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py
index ff3c6877..e6cd9d21 100644
--- a/gpu4pyscf/df/df.py
+++ b/gpu4pyscf/df/df.py
@@ -57,18 +57,6 @@ def build(self, direct_scf_tol=1e-14, omega=None):
auxmol = self.auxmol
self.nao = mol.nao
- # cache indices for better performance
- nao = mol.nao
- tril_row, tril_col = cupy.tril_indices(nao)
- tril_row = cupy.asarray(tril_row)
- tril_col = cupy.asarray(tril_col)
-
- self.tril_row = tril_row
- self.tril_col = tril_col
-
- idx = np.arange(nao)
- self.diag_idx = cupy.asarray(idx*(idx+1)//2+idx)
-
log = logger.new_logger(mol, mol.verbose)
t0 = log.init_timer()
if auxmol is None:
@@ -234,20 +222,20 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False):
nj = j1 - j0
if sr_only:
# TODO: in-place implementation or short-range kernel
- ints_slices = cupy.zeros([naoaux, nj, ni], order='C')
+ ints_slices = cupy.empty([naoaux, nj, ni], order='C')
for cp_kl_id, _ in enumerate(intopt.aux_log_qs):
k0 = intopt.sph_aux_loc[cp_kl_id]
k1 = intopt.sph_aux_loc[cp_kl_id+1]
int3c2e.get_int3c2e_slice(intopt, cp_ij_id, cp_kl_id, out=ints_slices[k0:k1])
if omega is not None:
- ints_slices_lr = cupy.zeros([naoaux, nj, ni], order='C')
+ ints_slices_lr = cupy.empty([naoaux, nj, ni], order='C')
for cp_kl_id, _ in enumerate(intopt.aux_log_qs):
k0 = intopt.sph_aux_loc[cp_kl_id]
k1 = intopt.sph_aux_loc[cp_kl_id+1]
int3c2e.get_int3c2e_slice(intopt, cp_ij_id, cp_kl_id, out=ints_slices[k0:k1], omega=omega)
ints_slices -= ints_slices_lr
else:
- ints_slices = cupy.zeros([naoaux, nj, ni], order='C')
+ ints_slices = cupy.empty([naoaux, nj, ni], order='C')
for cp_kl_id, _ in enumerate(intopt.aux_log_qs):
k0 = intopt.sph_aux_loc[cp_kl_id]
k1 = intopt.sph_aux_loc[cp_kl_id+1]
@@ -261,11 +249,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False):
row = intopt.ao_pairs_row[cp_ij_id] - i0
col = intopt.ao_pairs_col[cp_ij_id] - j0
- if cpi == cpj:
- #ints_slices = ints_slices + ints_slices.transpose([0,2,1])
- transpose_sum(ints_slices)
ints_slices = ints_slices[:,col,row]
-
if cd_low.tag == 'eig':
cderi_block = cupy.dot(cd_low.T, ints_slices)
ints_slices = None
diff --git a/gpu4pyscf/df/df_jk.py b/gpu4pyscf/df/df_jk.py
index ff0cbd7e..fe3c748e 100644
--- a/gpu4pyscf/df/df_jk.py
+++ b/gpu4pyscf/df/df_jk.py
@@ -253,10 +253,11 @@ def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e-
nao = dms_tag.shape[-1]
dms = dms_tag.reshape([-1,nao,nao])
nset = dms.shape[0]
- t0 = log.init_timer()
+ t1 = t0 = log.init_timer()
if dfobj._cderi is None:
log.debug('CDERI not found, build...')
dfobj.build(direct_scf_tol=direct_scf_tol, omega=omega)
+ t1 = log.timer_debug1('init jk', *t0)
assert nao == dfobj.nao
vj = None
@@ -264,7 +265,6 @@ def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e-
ao_idx = dfobj.intopt.sph_ao_idx
dms = take_last2d(dms, ao_idx)
- t1 = log.timer_debug1('init jk', *t0)
rows = dfobj.intopt.cderi_row
cols = dfobj.intopt.cderi_col
if with_j:
diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py
index ff3d0ef2..a6e793f6 100644
--- a/gpu4pyscf/df/hessian/rhf.py
+++ b/gpu4pyscf/df/hessian/rhf.py
@@ -37,6 +37,7 @@
import cupy
import numpy as np
from pyscf import lib, df
+from gpu4pyscf.grad import rhf as rhf_grad
from gpu4pyscf.hessian import rhf as rhf_hess
from gpu4pyscf.lib.cupy_helper import contract, tag_array, release_gpu_stack, print_mem_info, take_last2d
from gpu4pyscf.df import int3c2e
@@ -283,6 +284,8 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
hk_aux_aux += .5 * contract('pqxy,pq->pqxy', rho2c_11, int2c_inv) # (00|1)(1|00)
rho2c_0 = rho2c_10 = rho2c_11 = rho2c0_10 = rho2c1_10 = rho2c0_11 = int2c_ip_ip = None
wk_ip2_P__ = int2c_ip1_inv = None
+ t1 = log.timer_debug1('contract int2c_*', *t1)
+
ao_idx = np.argsort(intopt.sph_ao_idx)
aux_idx = np.argsort(intopt.sph_aux_idx)
rev_ao_ao = cupy.ix_(ao_idx, ao_idx)
@@ -372,7 +375,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
e1[j0,i0] = e1[i0,j0].T
ej[j0,i0] = ej[i0,j0].T
ek[j0,i0] = ek[i0,j0].T
-
+ t1 = log.timer_debug1('hcore contribution', *t1)
log.timer('RHF partial hessian', *time0)
return e1, ej, ek
@@ -398,6 +401,7 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
else:
return chkfile
'''
+
def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
verbose=None, with_k=True, omega=None):
log = logger.new_logger(hessobj, verbose)
@@ -521,8 +525,9 @@ def _ao2mo(mat):
vk1_int3c = vk1_int3c_ip1 + vk1_int3c_ip2
vk1_int3c_ip1 = vk1_int3c_ip2 = None
+ grad_hcore = rhf_grad.get_grad_hcore(hessobj.base.nuc_grad_method())
cupy.get_default_memory_pool().free_all_blocks()
- hcore_deriv = hessobj.base.nuc_grad_method().hcore_generator(mol)
+ #hcore_deriv = hessobj.base.nuc_grad_method().hcore_generator(mol)
vk1 = None
for i0, ia in enumerate(atmlst):
shl0, shl1, p0, p1 = aoslices[ia]
@@ -535,8 +540,9 @@ def _ao2mo(mat):
vk1_ao[:,p0:p1,:] -= vk1_buf[:,p0:p1,:]
vk1_ao[:,:,p0:p1] -= vk1_buf[:,p0:p1,:].transpose(0,2,1)
- h1 = hcore_deriv(ia)
- h1 = _ao2mo(cupy.asarray(h1, order='C'))
+ h1 = grad_hcore[:,i0]
+ #h1 = hcore_deriv(ia)
+ #h1 = _ao2mo(cupy.asarray(h1, order='C'))
vj1 = vj1_int3c[ia] + _ao2mo(vj1_ao)
if with_k:
vk1 = vk1_int3c[ia] + _ao2mo(vk1_ao)
diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py
index 56668354..e45f11dc 100644
--- a/gpu4pyscf/df/int3c2e.py
+++ b/gpu4pyscf/df/int3c2e.py
@@ -306,10 +306,7 @@ def build(self, cutoff=1e-14, group_size=None,
ncptype = len(log_qs)
self.bpcache = ctypes.POINTER(BasisProdCache)()
- if diag_block_with_triu:
- scale_shellpair_diag = 1.
- else:
- scale_shellpair_diag = 0.5
+ scale_shellpair_diag = 1.
libgint.GINTinit_basis_prod(
ctypes.byref(self.bpcache), ctypes.c_double(scale_shellpair_diag),
ao_loc.ctypes.data_as(ctypes.c_void_p),
@@ -1194,6 +1191,32 @@ def get_dh1e(mol, dm0):
dh1e[k0:k1,:3] += cupy.einsum('xkji,ij->kx', int3c_blk, dm0_sorted[i0:i1,j0:j1])
return 2.0 * cupy.einsum('kx,k->kx', dh1e, -charges)
+def get_d2h1e(mol, dm0):
+ natm = mol.natm
+ coords = mol.atom_coords()
+ charges = mol.atom_charges()
+ fakemol = gto.fakemol_for_charges(coords)
+
+ nao = mol.nao
+ d2h1e_diag = cupy.zeros([natm,9])
+ d2h1e_offdiag = cupy.zeros([natm, nao, 9])
+ intopt = VHFOpt(mol, fakemol, 'int2e')
+ intopt.build(1e-14, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE)
+ dm0_sorted = take_last2d(dm0, intopt.sph_ao_idx)
+ for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ipip1'):
+ d2h1e_diag[k0:k1,:9] -= contract('xaji,ij->ax', int3c_blk, dm0_sorted[i0:i1,j0:j1])
+ d2h1e_offdiag[k0:k1,i0:i1,:9] += contract('xaji,ij->aix', int3c_blk, dm0_sorted[i0:i1,j0:j1])
+
+ for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ipvip1'):
+ d2h1e_diag[k0:k1,:9] -= contract('xaji,ij->ax', int3c_blk, dm0_sorted[i0:i1,j0:j1])
+ d2h1e_offdiag[k0:k1,i0:i1,:9] += contract('xaji,ij->aix', int3c_blk, dm0_sorted[i0:i1,j0:j1])
+ aoslices = mol.aoslice_by_atom()
+ ao2atom = get_ao2atom(intopt, aoslices)
+ d2h1e = contract('aix,ib->abx', d2h1e_offdiag, ao2atom)
+ d2h1e[np.diag_indices(natm), :] += d2h1e_diag
+ return 2.0 * cupy.einsum('abx,a->xab', d2h1e, charges)
+ #return 2.0 * cupy.einsum('ijx,i->kx', dh1e, -charges)
+
def get_int3c2e_slice(intopt, cp_ij_id, cp_aux_id, aosym=None, out=None, omega=None, stream=None):
'''
Generate one int3c2e block for given ij, k
@@ -1443,14 +1466,6 @@ def get_pairing(p_offsets, q_offsets, q_cond,
for q0, q1 in zip(q_offsets[:-1], q_offsets[1:]):
if aosym and q0 < p0 or not aosym:
q_sub = q_cond[p0:p1,q0:q1].ravel()
- '''
- idx = q_sub.argsort(axis=None)[::-1]
- q_sorted = q_sub[idx]
- mask = q_sorted > cutoff
- idx = idx[mask]
- ishs, jshs = np.unravel_index(idx, (p1-p0, q1-q0))
- print(ishs.shape)
- '''
mask = q_sub > cutoff
ishs, jshs = np.indices((p1-p0,q1-q0))
ishs = ishs.ravel()[mask]
@@ -1464,13 +1479,6 @@ def get_pairing(p_offsets, q_offsets, q_cond,
log_qs.append(log_q)
elif aosym and p0 == q0 and p1 == q1:
q_sub = q_cond[p0:p1,p0:p1].ravel()
- '''
- idx = q_sub.argsort(axis=None)[::-1]
- q_sorted = q_sub[idx]
- ishs, jshs = np.unravel_index(idx, (p1-p0, p1-p0))
- mask = q_sorted > cutoff
- '''
-
ishs, jshs = np.indices((p1-p0, p1-p0))
ishs = ishs.ravel()
jshs = jshs.ravel()
diff --git a/gpu4pyscf/df/tests/test_int3c2e.py b/gpu4pyscf/df/tests/test_int3c2e.py
index 8e02ac3d..6c556061 100644
--- a/gpu4pyscf/df/tests/test_int3c2e.py
+++ b/gpu4pyscf/df/tests/test_int3c2e.py
@@ -116,6 +116,32 @@ def test_int1e_iprinv(self):
h1ao = mol.intor('int1e_iprinv', comp=3) # <\nabla|1/r|>
assert np.linalg.norm(int3c[:,:,:,i] - h1ao) < 1e-8
+ def test_int1e_ipiprinv(self):
+ from pyscf import gto
+ coords = mol.atom_coords()
+ charges = mol.atom_charges()
+
+ fakemol = gto.fakemol_for_charges(coords)
+ int3c = int3c2e.get_int3c2e_general(mol, fakemol, ip_type='ipip1').get()
+
+ for i,q in enumerate(charges):
+ mol.set_rinv_origin(coords[i])
+ h1ao = mol.intor('int1e_ipiprinv', comp=9) # <\nabla|1/r|>
+ assert np.linalg.norm(int3c[:,:,:,i] - h1ao) < 1e-8
+
+ def test_int1e_iprinvip(self):
+ from pyscf import gto
+ coords = mol.atom_coords()
+ charges = mol.atom_charges()
+
+ fakemol = gto.fakemol_for_charges(coords)
+ int3c = int3c2e.get_int3c2e_general(mol, fakemol, ip_type='ipvip1').get()
+
+ for i,q in enumerate(charges):
+ mol.set_rinv_origin(coords[i])
+ h1ao = mol.intor('int1e_iprinvip', comp=9) # <\nabla|1/r|>
+ assert np.linalg.norm(int3c[:,:,:,i] - h1ao) < 1e-8
+
if __name__ == "__main__":
print("Full Tests for int3c")
unittest.main()
diff --git a/gpu4pyscf/dft/gen_grid.py b/gpu4pyscf/dft/gen_grid.py
index 5ced5d94..5528e1e5 100644
--- a/gpu4pyscf/dft/gen_grid.py
+++ b/gpu4pyscf/dft/gen_grid.py
@@ -279,14 +279,16 @@ def get_partition(mol, atom_grids_tab,
grid_coord and grid_weight arrays. grid_coord array has shape (N,3);
weight 1D array has N elements.
'''
+ atm_coords = numpy.asarray(mol.atom_coords() , order='C')
+ atm_coords = cupy.asarray(atm_coords)
+ '''
if callable(radii_adjust) and atomic_radii is not None:
f_radii_adjust = radii_adjust(mol, atomic_radii)
else:
f_radii_adjust = None
- atm_coords = numpy.asarray(mol.atom_coords() , order='C')
atm_dist = gto.inter_distance(mol)
- atm_coords = cupy.asarray(atm_coords)
atm_dist = cupy.asarray(atm_dist)
+
if (becke_scheme is original_becke and
(radii_adjust is radi.treutler_atomic_radii_adjust or
radii_adjust is radi.becke_atomic_radii_adjust or
@@ -324,7 +326,7 @@ def gen_grid_partition(coords):
pbecke[i] *= .5 * (1-g)
pbecke[j] *= .5 * (1+g)
return pbecke
-
+ '''
coords_all = []
weights_all = []
# support atomic_radii_adjust = None
diff --git a/gpu4pyscf/dft/numint.py b/gpu4pyscf/dft/numint.py
index 0e46c611..0ca4cd75 100644
--- a/gpu4pyscf/dft/numint.py
+++ b/gpu4pyscf/dft/numint.py
@@ -31,7 +31,7 @@
from gpu4pyscf.lib import logger
LMAX_ON_GPU = 6
-BAS_ALIGNED = 4
+BAS_ALIGNED = 1
GRID_BLKSIZE = 32
MIN_BLK_SIZE = getattr(__config__, 'min_grid_blksize', 64*64)
ALIGNED = getattr(__config__, 'grid_aligned', 16*16)
@@ -448,7 +448,9 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
nao, nao0 = coeff.shape
dms = cupy.asarray(dms)
dm_shape = dms.shape
- dms = [coeff @ dm @ coeff.T for dm in dms.reshape(-1,nao0,nao0)]
+ #dms = [coeff @ dm @ coeff.T for dm in dms.reshape(-1,nao0,nao0)]
+ dms = dms.reshape(-1,nao0,nao0)
+ dms = take_last2d(dms, opt.ao_idx)
nset = len(dms)
if mo_coeff is not None:
@@ -482,19 +484,20 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
rho_tot = cupy.empty([nset,6,ngrids])
p0 = p1 = 0
+ t1 = t0 = log.init_timer()
for ao_mask, idx, weight, _ in ni.block_loop(mol, grids, nao, ao_deriv):
p1 = p0 + weight.size
for i in range(nset):
- t0 = log.init_timer()
if mo_coeff is None:
rho_tot[i,:,p0:p1] = eval_rho(mol, ao_mask, dms[i][np.ix_(idx,idx)], xctype=xctype, hermi=1)
else:
mo_coeff_mask = mo_coeff[idx,:]
rho_tot[i,:,p0:p1] = eval_rho2(mol, ao_mask, mo_coeff_mask, mo_occ, None, xctype)
- t1 = log.timer_debug1('eval rho', *t0)
p0 = p1
+ t1 = log.timer_debug2('eval rho', *t1)
+ t0 = log.timer_debug1('eval rho', *t0)
- vxc_tot = []
+ wv = []
for i in range(nset):
if xctype == 'LDA':
exc, vxc = ni.eval_xc_eff(xc_code, rho_tot[i][0], deriv=1, xctype=xctype)[:2]
@@ -505,69 +508,37 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
den = rho_tot[i][0] * grids.weights
nelec[i] = den.sum()
excsum[i] = cupy.sum(den * exc[:,0])
- vxc_tot.append(vxc)
- t1 = log.timer_debug1('eval vxc', *t1)
+ wv.append(vxc * grids.weights)
+ if xctype == 'GGA':
+ wv[i][0] *= .5
+ if xctype == 'MGGA':
+ wv[i][[0,4]] *= .5
+ t0 = log.timer_debug1('eval vxc', *t0)
+ t1 = t0
p0 = p1 = 0
for ao_mask, idx, weight, _ in ni.block_loop(mol, grids, nao, ao_deriv):
p1 = p0 + weight.size
for i in range(nset):
- vxc = vxc_tot[i][:,p0:p1]
if xctype == 'LDA':
- #den = rho * weight
- wv = weight * vxc[0]
- '''
- if USE_SPARSITY == 0:
- vmat[i] += ao.dot(_scale_ao(ao, wv).T)
- elif USE_SPARSITY == 1:
- _dot_ao_ao_sparse(ao, ao, wv, nbins, sindex, ao_loc,
- pair2shls_full, pairs_locs_full, vmat[i])
- '''
if USE_SPARSITY == 2:
- aow = _scale_ao(ao_mask, wv)
- # vmat[i][cupy.ix_(mask, mask)] += ao_mask.dot(aow.T)
+ aow = _scale_ao(ao_mask, wv[i][0,p0:p1])
add_sparse(vmat[i], ao_mask.dot(aow.T), idx)
else:
raise NotImplementedError(f'USE_SPARSITY = {USE_SPARSITY} is not implemented')
elif xctype == 'GGA':
- #den = rho[0] * weight
- wv = vxc * weight
- wv[0] *= .5
- '''
- if USE_SPARSITY == 0:
- vmat[i] += ao[0].dot(_scale_ao(ao, wv).T)
- elif USE_SPARSITY == 1:
- aow = _scale_ao(ao, wv)
- _dot_ao_ao_sparse(ao[0], aow, None, nbins, sindex, ao_loc,
- pair2shls_full, pairs_locs_full, vmat[i])
- '''
if USE_SPARSITY == 2:
- aow = _scale_ao(ao_mask, wv)
+ aow = _scale_ao(ao_mask, wv[i][:,p0:p1])
add_sparse(vmat[i], ao_mask[0].dot(aow.T), idx)
else:
raise NotImplementedError(f'USE_SPARSITY = {USE_SPARSITY} is not implemented')
elif xctype == 'NLC':
raise NotImplementedError('NLC')
elif xctype == 'MGGA':
- #den = rho[0] * weight
- wv = vxc * weight
- wv[[0, 4]] *= .5 # *.5 for v+v.T
- '''
- if USE_SPARSITY == 0:
- aow = _scale_ao(ao[:4], wv[:4])
- vmat[i] += ao[0].dot(aow.T)
- vmat[i] += _tau_dot(ao, ao, wv[4])
- elif USE_SPARSITY == 1:
- _dot_ao_ao_sparse(ao[0], aow, None, nbins, sindex, ao_loc,
- pair2shls_full, pairs_locs_full, vmat[i])
- _tau_dot_sparse(ao, ao, wv[4], nbins, sindex, ao_loc,
- pair2shls_full, pairs_locs_full, vmat[i])
- '''
if USE_SPARSITY == 2:
- aow = _scale_ao(ao_mask, wv[:4])
+ aow = _scale_ao(ao_mask, wv[i][:4,p0:p1])
vtmp = ao_mask[0].dot(aow.T)
- vtmp+= _tau_dot(ao_mask, ao_mask, wv[4])
- #vmat[i][cupy.ix_(mask, mask)] += vtmp
+ vtmp+= _tau_dot(ao_mask, ao_mask, wv[i][4,p0:p1])
add_sparse(vmat[i], vtmp, idx)
else:
raise NotImplementedError(f'USE_SPARSITY = {USE_SPARSITY} is not implemented')
@@ -575,17 +546,14 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
pass
else:
raise NotImplementedError(f'numint.nr_rks for functional {xc_code}')
- t1 = log.timer_debug1('integration', *t1)
p0 = p1
-
- vmat = contract('pi,npq->niq', coeff, vmat)
- vmat = contract('qj,niq->nij', coeff, vmat)
- #rev_ao_idx = opt.rev_ao_idx
- #vmat = take_last2d(vmat, rev_ao_idx)
+ t1 = log.timer_debug2('integration', *t1)
+ t0 = log.timer_debug1('vxc integration', *t0)
+ rev_ao_idx = opt.rev_ao_idx
+ vmat = take_last2d(vmat, rev_ao_idx)
if xctype != 'LDA':
- vmat = vmat + vmat.transpose([0,2,1])
- #transpose_sum(vmat)
+ transpose_sum(vmat)
if FREE_CUPY_CACHE:
dms = None
@@ -719,27 +687,27 @@ def get_rho(ni, mol, dm, grids, max_memory=2000, verbose=None):
log = logger.new_logger(mol, verbose)
coeff = cupy.asarray(opt.coeff)
nao = coeff.shape[0]
- dm = coeff @ cupy.asarray(dm) @ coeff.T
-
mo_coeff = getattr(dm, 'mo_coeff', None)
mo_occ = getattr(dm,'mo_occ', None)
+ dm = coeff @ cupy.asarray(dm) @ coeff.T
if mo_coeff is not None:
mo_coeff = coeff @ mo_coeff
ngrids = grids.weights.size
rho = cupy.empty(ngrids)
p0 = p1 = 0
+ t1 = t0 = log.init_timer()
for ao_mask, idx, weight, _ in ni.block_loop(mol, grids, nao, 0):
p1 = p0 + weight.size
- t0 = log.init_timer()
if mo_coeff is None:
rho[p0:p1] = eval_rho(mol, ao_mask, dm[np.ix_(idx,idx)], xctype='LDA', hermi=1)
else:
mo_coeff_mask = mo_coeff[idx,:]
rho[p0:p1] = eval_rho2(mol, ao_mask, mo_coeff_mask, mo_occ, None, 'LDA')
p0 = p1
- log.timer_debug1('eval rho', *t0)
+ t1 = log.timer_debug2('eval rho slice', *t1)
+ t0 = log.timer_debug1('eval rho', *t0)
if FREE_CUPY_CACHE:
dm = None
@@ -763,10 +731,9 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi=
# AO basis -> gdftopt AO basis
with_mocc = hasattr(dms, 'mo1')
if with_mocc:
- mo1 = contract('nio,pi->npo', dms.mo1, coeff) * 2.0**0.5
- occ_coeff = contract('io,pi->po', dms.occ_coeff, coeff) * 2.0**0.5
- dms = contract('nij,qj->niq', dms, coeff)
- dms = contract('pi,niq->npq', coeff, dms)
+ mo1 = dms.mo1[:,opt.ao_idx] * 2.0**0.5
+ occ_coeff = dms.occ_coeff[opt.ao_idx] * 2.0**0.5
+ dms = take_last2d(dms, opt.ao_idx)
nset = len(dms)
vmat = cupy.zeros((nset, nao, nao))
@@ -776,8 +743,8 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi=
ao_deriv = 1
p0 = 0
p1 = 0
+ t1 = t0 = log.init_timer()
for ao, mask, weights, coords in ni.block_loop(opt.mol, grids, nao, ao_deriv):
- t0 = log.init_timer()
p0, p1 = p1, p1+len(weights)
# precompute molecular orbitals
if with_mocc:
@@ -798,7 +765,7 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi=
rho_tmp = eval_rho(opt.mol, ao, dms[i][np.ix_(mask,mask)], xctype=xctype, hermi=hermi, with_lapl=False)
rho1.append(rho_tmp)
rho1 = cupy.stack(rho1, axis=0)
- t0 = log.timer_debug1('rho', *t0)
+ t1 = log.timer_debug2('eval rho', *t1)
# precompute fxc_w
if xctype == 'LDA':
@@ -826,14 +793,14 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi=
vmat_tmp+= _tau_dot(ao, ao, wv[i,4])
add_sparse(vmat[i], vmat_tmp, mask)
- t0 = log.timer_debug1('vxc', *t0)
+ t1 = log.timer_debug2('integration', *t1)
ao = c0 = rho1 = None
+ t0 = log.timer_debug1('vxc', *t0)
- vmat = contract('pi,npq->niq', coeff, vmat)
- vmat = contract('qj,niq->nij', coeff, vmat)
+ vmat = take_last2d(vmat, opt.rev_ao_idx)
if xctype != 'LDA':
- #transpose_sum(vmat)
- vmat = vmat + vmat.transpose([0,2,1])
+ transpose_sum(vmat)
+
if FREE_CUPY_CACHE:
dms = None
cupy.get_default_memory_pool().free_all_blocks()
@@ -1067,13 +1034,14 @@ def nr_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
wv = vv_vxc[:,p0:p1] * weight
wv[0] *= .5
aow = _scale_ao(ao, wv)
- #vmat += ao[0].dot(aow.T)
add_sparse(vmat, ao[0].dot(aow.T), mask)
t1 = log.timer_debug1('integration', *t1)
- vmat = vmat + vmat.T
- vmat = contract('pi,pq->iq', coeff, vmat)
- vmat = contract('qj,iq->ij', coeff, vmat)
+ transpose_sum(vmat)
+ vmat = take_last2d(vmat, opt.rev_ao_idx)
+ #vmat = vmat + vmat.T
+ #vmat = contract('pi,pq->iq', coeff, vmat)
+ #vmat = contract('qj,iq->ij', coeff, vmat)
log.timer_debug1('eval vv10', *t0)
return nelec, excsum, vmat
@@ -1101,28 +1069,31 @@ def cache_xc_kernel(ni, mol, grids, xc_code, mo_coeff, mo_occ, spin=0,
if spin == 0:
mo_coeff = coeff @ mo_coeff
rho = []
- t0 = log.init_timer()
+ t1 = t0 = log.init_timer()
for ao_mask, idx, weight, _ in ni.block_loop(mol, grids, nao, ao_deriv):
mo_coeff_mask = mo_coeff[idx,:]
rho_slice = eval_rho2(mol, ao_mask, mo_coeff_mask, mo_occ, None, xctype)
rho.append(rho_slice)
- t0 = log.timer_debug1('eval rho in fxc', *t0)
+ t1 = log.timer_debug2('eval rho slice', *t1)
rho = cupy.hstack(rho)
+ t0 = log.timer_debug1('eval rho in fxc', *t0)
else:
mo_coeff = contract('ip,npj->nij', coeff, cupy.asarray(mo_coeff))
rhoa = []
rhob = []
- t0 = log.init_timer()
+ t1 = t0 = log.init_timer()
for ao_mask, idx, weight, _ in ni.block_loop(mol, grids, nao, ao_deriv):
mo_coeff_mask = mo_coeff[:,idx,:]
rhoa_slice = eval_rho2(mol, ao_mask, mo_coeff_mask[0], mo_occ[0], None, xctype)
rhob_slice = eval_rho2(mol, ao_mask, mo_coeff_mask[1], mo_occ[1], None, xctype)
rhoa.append(rhoa_slice)
rhob.append(rhob_slice)
- t0 = log.timer_debug1('eval rho in fxc', *t0)
+ t1 = log.timer_debug2('eval rho in fxc', *t1)
#rho = (cupy.hstack(rhoa), cupy.hstack(rhob))
rho = cupy.stack([cupy.hstack(rhoa), cupy.hstack(rhob)], axis=0)
+ t0 = log.timer_debug1('eval rho in fxc', *t0)
vxc, fxc = ni.eval_xc_eff(xc_code, rho, deriv=2, xctype=xctype)[1:3]
+ t0 = log.timer_debug1('eval fxc', *t0)
return rho, vxc, fxc
@cupy.fuse()
@@ -1267,18 +1238,18 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000,
mol = opt.mol
with opt.gdft_envs_cache():
block_id = 0
+ t1 = log.init_timer()
for ip0, ip1 in lib.prange(0, ngrids, blksize):
coords = grids.coords[ip0:ip1]
weight = grids.weights[ip0:ip1]
# cache ao indices
- if (deriv, block_id, blksize, ngrids) not in ni.non0ao_idx:
+ if (block_id, blksize, ngrids) not in ni.non0ao_idx:
stream = cupy.cuda.get_current_stream()
cutoff = AO_THRESHOLD
ng = ip1 - ip0
ao_loc = mol.ao_loc_nr()
nbas = mol.nbas
- t0 = log.init_timer()
non0shl_idx = cupy.zeros(len(ao_loc)-1, dtype=np.int32)
libgdft.GDFTscreen_index(
ctypes.cast(stream.ptr, ctypes.c_void_p),
@@ -1317,24 +1288,24 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000,
idx = cupy.hstack([idx, zero_idx[:pad]])
pad = min(pad, len(zero_idx))
non0shl_idx = cupy.asarray(np.where(non0shl_idx)[0], dtype=np.int32)
- ni.non0ao_idx[deriv, block_id, blksize, ngrids] = (pad, idx, non0shl_idx, ctr_offsets_slice, ao_loc_slice)
- log.timer_debug1('init ao sparsity', *t0)
+ ni.non0ao_idx[block_id, blksize, ngrids] = (pad, idx, non0shl_idx, ctr_offsets_slice, ao_loc_slice)
+ t1 = log.timer_debug2('init ao sparsity', *t1)
else:
- pad, idx, non0shl_idx, ctr_offsets_slice, ao_loc_slice = ni.non0ao_idx[deriv, block_id, blksize, ngrids]
- t0 = log.init_timer()
+ pad, idx, non0shl_idx, ctr_offsets_slice, ao_loc_slice = ni.non0ao_idx[block_id, blksize, ngrids]
+
ao_mask = eval_ao(
ni, mol, coords, deriv,
nao_slice=len(idx),
shls_slice=non0shl_idx,
ao_loc_slice=ao_loc_slice,
ctr_offsets_slice=ctr_offsets_slice)
+ t1 = log.timer_debug2('evaluate ao slice', *t1)
if pad > 0:
if deriv == 0:
ao_mask[-pad:,:] = 0.0
else:
ao_mask[:,-pad:,:] = 0.0
block_id += 1
- log.timer_debug1('evaluate ao slice', *t0)
yield ao_mask, idx, weight, coords
class NumInt(numint.NumInt):
@@ -1665,6 +1636,7 @@ def build(self, mol=None):
pmol._decontracted = True
self.mol = pmol
inv_idx = np.argsort(ao_idx, kind='stable').astype(np.int32)
+ self.ao_idx = cupy.asarray(ao_idx, dtype=np.int32)
self.rev_ao_idx = cupy.asarray(inv_idx, dtype=np.int32)
self.coeff = coeff[ao_idx]
self.l_ctr_offsets = np.append(0, np.cumsum(l_ctr_counts)).astype(np.int32)
diff --git a/gpu4pyscf/dft/rks.py b/gpu4pyscf/dft/rks.py
index 7b9dd588..70b15723 100644
--- a/gpu4pyscf/dft/rks.py
+++ b/gpu4pyscf/dft/rks.py
@@ -57,6 +57,7 @@ def prune_small_rho_grids_(ks, mol, dm, grids):
grids.coords = cupy.vstack(
[grids.coords, pad])
grids.weights = cupy.hstack([grids.weights, cupy.zeros(padding)])
+
# make_mask has to be executed on cpu for now.
#grids.non0tab = grids.make_mask(mol, grids.coords)
#grids.screen_index = grids.non0tab
diff --git a/gpu4pyscf/grad/rhf.py b/gpu4pyscf/grad/rhf.py
index c96820b6..9f1f9efc 100644
--- a/gpu4pyscf/grad/rhf.py
+++ b/gpu4pyscf/grad/rhf.py
@@ -22,7 +22,7 @@
from pyscf.grad import rhf
from gpu4pyscf.lib.cupy_helper import load_library
from gpu4pyscf.scf.hf import _VHFOpt
-from gpu4pyscf.lib.cupy_helper import tag_array, contract
+from gpu4pyscf.lib.cupy_helper import tag_array, contract, take_last2d
from gpu4pyscf.df import int3c2e #TODO: move int3c2e to out of df
from gpu4pyscf.lib import logger
@@ -552,6 +552,50 @@ def calculate_h1e(h1_gpu, s1_gpu):
de -= cupy.sum(de, axis=0)/len(atmlst)
return de.get()
+def get_grad_hcore(mf_grad):
+ mf = mf_grad.base
+ mol = mf.mol
+ natm = mol.natm
+ nao = mol.nao
+ mo_occ = mf.mo_occ
+ mo_coeff = cupy.asarray(mf.mo_coeff)
+ orbo = mo_coeff[:,mo_occ>0]
+ nocc = orbo.shape[1]
+
+ dh1e = cupy.zeros([3,natm,nao,nocc])
+ coords = mol.atom_coords()
+ charges = cupy.asarray(mol.atom_charges(), dtype=np.float64)
+ fakemol = gto.fakemol_for_charges(coords)
+ intopt = int3c2e.VHFOpt(mol, fakemol, 'int2e')
+ intopt.build(1e-14, diag_block_with_triu=True, aosym=False, group_size=int3c2e.BLKSIZE, group_size_aux=int3c2e.BLKSIZE)
+ orbo_sorted = orbo[intopt.sph_ao_idx]
+ mo_coeff_sorted = mo_coeff[intopt.sph_ao_idx]
+ for i0,i1,j0,j1,k0,k1,int3c_blk in int3c2e.loop_int3c2e_general(intopt, ip_type='ip1'):
+ dh1e[:,k0:k1,j0:j1,:] += contract('xkji,io->xkjo', int3c_blk, orbo_sorted[i0:i1])
+ dh1e[:,k0:k1,i0:i1,:] += contract('xkji,jo->xkio', int3c_blk, orbo_sorted[j0:j1])
+ dh1e = contract('xkjo,k->xkjo', dh1e, -charges)
+ dh1e = contract('xkjo,jp->xkpo', dh1e, mo_coeff_sorted)
+
+ h1 = mf_grad.get_hcore(mol)
+ aoslices = mol.aoslice_by_atom()
+ with_ecp = mol.has_ecp()
+ if with_ecp:
+ ecp_atoms = set(mol._ecpbas[:,gto.ATOM_OF])
+ else:
+ ecp_atoms = ()
+ for atm_id in range(natm):
+ shl0, shl1, p0, p1 = aoslices[atm_id]
+ h1ao = numpy.zeros([3,nao,nao])
+ with mol.with_rinv_at_nucleus(atm_id):
+ if with_ecp and atm_id in ecp_atoms:
+ h1ao += mol.intor('ECPscalar_iprinv', comp=3)
+ h1ao[:,p0:p1] += h1[:,p0:p1]
+ h1ao += h1ao.transpose([0,2,1])
+ h1ao = cupy.asarray(h1ao)
+ h1mo = contract('xij,jo->xio', h1ao, orbo)
+ dh1e[:,atm_id] += contract('xio,ip->xpo', h1mo, mo_coeff)
+ return dh1e#2.0 * cupy.einsum('kx,k->kx', dh1e, -charges)
+
class Gradients(rhf.Gradients):
from gpu4pyscf.lib.utils import to_cpu, to_gpu, device
diff --git a/gpu4pyscf/grad/rks.py b/gpu4pyscf/grad/rks.py
index 82b0befd..be6732d6 100644
--- a/gpu4pyscf/grad/rks.py
+++ b/gpu4pyscf/grad/rks.py
@@ -27,7 +27,8 @@
from gpu4pyscf.grad import rhf as rhf_grad
from gpu4pyscf.dft import numint, xc_deriv, rks
from gpu4pyscf.dft.numint import _GDFTOpt, AO_THRESHOLD
-from gpu4pyscf.lib.cupy_helper import contract, get_avail_mem, add_sparse, tag_array, load_library
+from gpu4pyscf.lib.cupy_helper import (
+ contract, get_avail_mem, add_sparse, tag_array, load_library, take_last2d)
from gpu4pyscf.lib import logger
from pyscf import __config__
@@ -121,10 +122,12 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
mo_coeff = cupy.asarray(dms.mo_coeff)
coeff = cupy.asarray(opt.coeff)
nao, nao0 = coeff.shape
- dms = cupy.asarray(dms)
- dms = [cupy.einsum('pi,ij,qj->pq', coeff, dm, coeff)
- for dm in dms.reshape(-1,nao0,nao0)]
- mo_coeff = coeff @ mo_coeff
+ dms = cupy.asarray(dms).reshape(-1,nao0,nao0)
+ dms = take_last2d(dms, opt.ao_idx)
+ #dms = [cupy.einsum('pi,ij,qj->pq', coeff, dm, coeff)
+ # for dm in dms.reshape(-1,nao0,nao0)]
+ #mo_coeff = coeff @ mo_coeff
+ mo_coeff = mo_coeff[opt.ao_idx]
nset = len(dms)
assert nset == 1
@@ -172,7 +175,8 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
vtmp = _gga_grad_sum_(ao_mask, wv)
vtmp += _tau_grad_dot_(ao_mask, wv[4])
add_sparse(vmat[idm], vtmp, idx)
- vmat = [cupy.einsum('pi,npq,qj->nij', coeff, v, coeff) for v in vmat]
+ #vmat = [cupy.einsum('pi,npq,qj->nij', coeff, v, coeff) for v in vmat]
+ vmat = take_last2d(vmat, opt.rev_ao_idx)
exc = None
if nset == 1:
vmat = vmat[0]
diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py
index b6421424..5efb8f12 100644
--- a/gpu4pyscf/hessian/rhf.py
+++ b/gpu4pyscf/hessian/rhf.py
@@ -57,6 +57,7 @@ def hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
mo_coeff = cupy.asarray(mo_coeff)
de2 = hessobj.partial_hess_elec(mo_energy, mo_coeff, mo_occ, atmlst,
max_memory, log)
+ t1 = log.timer_debug1('hess elec', *t1)
if h1ao is None:
h1ao = hessobj.make_h1(mo_coeff, mo_occ, hessobj.chkfile, atmlst, log)
t1 = log.timer_debug1('making H1', *t1)
@@ -503,11 +504,13 @@ def hcore_generator(self, mol=None):
else:
ecp_atoms = ()
aoslices = mol.aoslice_by_atom()
- nbas = mol.nbas
nao = mol.nao_nr()
h1aa, h1ab = self.get_hcore(mol)
h1aa = cupy.asarray(h1aa)
h1ab = cupy.asarray(h1ab)
+
+ rinv2aa_all = {}
+ rinv2ab_all = {}
def get_hcore(iatm, jatm):
ish0, ish1, i0, i1 = aoslices[iatm]
jsh0, jsh1, j0, j1 = aoslices[jatm]
@@ -515,15 +518,20 @@ def get_hcore(iatm, jatm):
zj = mol.atom_charge(jatm)
if iatm == jatm:
with mol.with_rinv_at_nucleus(iatm):
- rinv2aa = mol.intor('int1e_ipiprinv', comp=9)
- rinv2ab = mol.intor('int1e_iprinvip', comp=9)
- rinv2aa *= zi
- rinv2ab *= zi
- if with_ecp and iatm in ecp_atoms:
- rinv2aa -= mol.intor('ECPscalar_ipiprinv', comp=9)
- rinv2ab -= mol.intor('ECPscalar_iprinvip', comp=9)
- rinv2aa = cupy.asarray(rinv2aa)
- rinv2ab = cupy.asarray(rinv2ab)
+ if iatm not in rinv2aa_all:
+ rinv2aa = zi * mol.intor('int1e_ipiprinv', comp=9)
+ if with_ecp and iatm in ecp_atoms:
+ rinv2aa -= mol.intor('ECPscalar_ipiprinv', comp=9)
+ rinv2aa_all[iatm] = rinv2aa
+ rinv2aa = cupy.asarray(rinv2aa_all[iatm])
+
+ if iatm not in rinv2ab_all:
+ rinv2ab = zi * mol.intor('int1e_iprinvip', comp=9)
+ if with_ecp and iatm in ecp_atoms:
+ rinv2ab -= mol.intor('ECPscalar_iprinvip', comp=9)
+ rinv2ab_all[iatm] = rinv2ab
+ rinv2ab = cupy.asarray(rinv2ab_all[iatm])
+
rinv2aa = rinv2aa.reshape(3,3,nao,nao)
rinv2ab = rinv2ab.reshape(3,3,nao,nao)
hcore = -rinv2aa - rinv2ab
@@ -538,28 +546,42 @@ def get_hcore(iatm, jatm):
hcore = cupy.zeros((3,3,nao,nao))
hcore[:,:,i0:i1,j0:j1] += h1ab[:,:,i0:i1,j0:j1]
with mol.with_rinv_at_nucleus(iatm):
- shls_slice = (jsh0, jsh1, 0, nbas)
- rinv2aa = mol.intor('int1e_ipiprinv', comp=9, shls_slice=shls_slice)
- rinv2ab = mol.intor('int1e_iprinvip', comp=9, shls_slice=shls_slice)
- rinv2aa *= zi
- rinv2ab *= zi
- if with_ecp and iatm in ecp_atoms:
- rinv2aa -= mol.intor('ECPscalar_ipiprinv', comp=9, shls_slice=shls_slice)
- rinv2ab -= mol.intor('ECPscalar_iprinvip', comp=9, shls_slice=shls_slice)
- rinv2aa = cupy.asarray(rinv2aa)
- rinv2ab = cupy.asarray(rinv2ab)
+ #rinv2aa = mol.intor('int1e_ipiprinv', comp=9, shls_slice=shls_slice)
+ if iatm not in rinv2aa_all:
+ rinv2aa = zi * mol.intor('int1e_ipiprinv', comp=9)
+ if with_ecp and iatm in ecp_atoms:
+ rinv2aa -= mol.intor('ECPscalar_ipiprinv', comp=9)
+ rinv2aa_all[iatm] = rinv2aa
+ rinv2aa = cupy.asarray(rinv2aa_all[iatm][:,j0:j1])
+
+ #rinv2ab = mol.intor('int1e_iprinvip', comp=9, shls_slice=shls_slice)
+ if iatm not in rinv2ab_all:
+ rinv2ab = zi * mol.intor('int1e_iprinvip', comp=9)
+ if with_ecp and iatm in ecp_atoms:
+ rinv2ab -= mol.intor('ECPscalar_iprinvip', comp=9)
+ rinv2ab_all[iatm] = rinv2ab
+ rinv2ab = cupy.asarray(rinv2ab_all[iatm][:,j0:j1])
+
hcore[:,:,j0:j1] += rinv2aa.reshape(3,3,j1-j0,nao)
hcore[:,:,j0:j1] += rinv2ab.reshape(3,3,j1-j0,nao).transpose(1,0,2,3)
with mol.with_rinv_at_nucleus(jatm):
- shls_slice = (ish0, ish1, 0, nbas)
- rinv2aa = mol.intor('int1e_ipiprinv', comp=9, shls_slice=shls_slice)
- rinv2ab = mol.intor('int1e_iprinvip', comp=9, shls_slice=shls_slice)
- rinv2aa *= zj
- rinv2ab *= zj
- if with_ecp and jatm in ecp_atoms:
- rinv2aa -= mol.intor('ECPscalar_ipiprinv', comp=9, shls_slice=shls_slice)
- rinv2ab -= mol.intor('ECPscalar_iprinvip', comp=9, shls_slice=shls_slice)
+ # rinv2aa = zj * mol.intor('int1e_ipiprinv', comp=9, shls_slice=shls_slice)
+ if jatm not in rinv2aa_all:
+ rinv2aa = zj * mol.intor('int1e_ipiprinv', comp=9)
+ if with_ecp and jatm in ecp_atoms:
+ rinv2aa -= mol.intor('ECPscalar_ipiprinv', comp=9)
+ rinv2aa_all[jatm] = rinv2aa
+ rinv2aa = cupy.asarray(rinv2aa_all[jatm][:,i0:i1])
+
+ # rinv2ab = mol.intor('int1e_iprinvip', comp=9, shls_slice=shls_slice)
+ if jatm not in rinv2ab_all:
+ rinv2ab = zj * mol.intor('int1e_iprinvip', comp=9)
+ if with_ecp and jatm in ecp_atoms:
+ rinv2ab -= mol.intor('ECPscalar_iprinvip', comp=9)
+ rinv2ab_all[jatm] = rinv2ab
+ rinv2ab = cupy.asarray(rinv2ab_all[jatm][:,i0:i1])
+
rinv2aa = cupy.asarray(rinv2aa)
rinv2ab = cupy.asarray(rinv2ab)
hcore[:,:,i0:i1] += rinv2aa.reshape(3,3,i1-i0,nao)
diff --git a/gpu4pyscf/hessian/rks.py b/gpu4pyscf/hessian/rks.py
index 2082e707..3e36825c 100644
--- a/gpu4pyscf/hessian/rks.py
+++ b/gpu4pyscf/hessian/rks.py
@@ -27,7 +27,7 @@
from gpu4pyscf.hessian import rhf as rhf_hess
from gpu4pyscf.grad import rks as rks_grad
from gpu4pyscf.dft import numint
-from gpu4pyscf.lib.cupy_helper import contract, add_sparse
+from gpu4pyscf.lib.cupy_helper import contract, add_sparse, take_last2d
from gpu4pyscf.lib import logger
# import pyscf.grad.rks to activate nuc_grad_method method
@@ -393,15 +393,15 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
ipip = cupy.zeros((3,3,nao,nao))
if xctype == 'LDA':
ao_deriv = 1
+ t1 = log.init_timer()
for ao_mask, mask, weight, coords \
in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory):
- t0 = log.init_timer()
nao_non0 = len(mask)
ao = contract('nip,ij->njp', ao_mask, coeff[mask])
rho = numint.eval_rho2(opt.mol, ao[0], mo_coeff, mo_occ, mask, xctype)
- t0 = log.timer_debug1('eval rho', *t0)
+ t1 = log.timer_debug2('eval rho', *t1)
vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3]
- t0 = log.timer_debug1('eval vxc', *t0)
+ t1 = log.timer_debug2('eval vxc', *t1)
wv = weight * vxc[0]
aow = [numint._scale_ao(ao[i], wv) for i in range(1, 4)]
_d1d2_dot_(ipip, mol, aow, ao[1:4], mask, ao_loc, False)
@@ -417,27 +417,28 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
vmat_tmp = cupy.zeros([3,3,nao_non0,nao_non0])
aow = [numint._scale_ao(ao_mask[0], wv[i]) for i in range(3)]
_d1d2_dot_(vmat_tmp, mol, ao_mask[1:4], aow, mask, ao_loc, False)
- vmat_tmp = contract('pi,xypq->xyiq', coeff[mask], vmat_tmp)
- vmat_tmp = contract('qj,xyiq->xyij', coeff[mask], vmat_tmp)
- vmat[ia] += vmat_tmp
+ #vmat_tmp = contract('pi,xypq->xyiq', coeff[mask], vmat_tmp)
+ #vmat_tmp = contract('qj,xyiq->xyij', coeff[mask], vmat_tmp)
+ #vmat[ia] += vmat_tmp
+ add_sparse(vmat[ia], vmat_tmp, mask)
ao_dm0 = aow = None
- t0 = log.timer_debug1('integration', *t0)
+ t1 = log.timer_debug2('integration', *t1)
for ia in range(mol.natm):
+ vmat[ia] = take_last2d(vmat[ia], opt.rev_ao_idx)
p0, p1 = aoslices[ia][2:]
vmat[ia,:,:,:,p0:p1] += ipip[:,:,:,p0:p1]
elif xctype == 'GGA':
ao_deriv = 2
- comp = (ao_deriv+1)*(ao_deriv+2)*(ao_deriv+3)//6
+ t1 = log.init_timer()
for ao_mask, mask, weight, coords \
- in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory, extra=5*comp*nao):
- t0 = log.init_timer()
+ in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory):
nao_non0 = len(mask)
ao = contract('nip,ij->njp', ao_mask, coeff[mask])
rho = numint.eval_rho2(opt.mol, ao[:4], mo_coeff, mo_occ, mask, xctype)
- t0 = log.timer_debug1('eval rho', *t0)
+ t1 = log.timer_debug2('eval rho', *t1)
vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3]
- t0 = log.timer_debug1('eval vxc', *t0)
+ t1 = log.timer_debug2('eval vxc', *t1)
wv = weight * vxc
wv[0] *= .5
aow = rks_grad._make_dR_dao_w(ao, wv)
@@ -462,12 +463,14 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
rks_grad._d1_dot_(aow, ao_mask[0].T, out=vmat_tmp[i])
aow = [numint._scale_ao(ao_mask[:4], wv[i,:4]) for i in range(3)]
_d1d2_dot_(vmat_tmp, mol, ao_mask[1:4], aow, mask, ao_loc, False)
- vmat_tmp = contract('pi,xypq->xyiq', coeff[mask], vmat_tmp)
- vmat_tmp = contract('qj,xyiq->xyij', coeff[mask], vmat_tmp)
- vmat[ia] += vmat_tmp
+ #vmat_tmp = contract('pi,xypq->xyiq', coeff[mask], vmat_tmp)
+ #vmat_tmp = contract('qj,xyiq->xyij', coeff[mask], vmat_tmp)
+ #vmat[ia] += vmat_tmp
+ add_sparse(vmat[ia], vmat_tmp, mask)
ao_dm0 = aow = None
- t0 = log.timer_debug1('integration', *t0)
+ t1 = log.timer_debug2('integration', *t1)
for ia in range(mol.natm):
+ vmat[ia] = take_last2d(vmat[ia], opt.rev_ao_idx)
p0, p1 = aoslices[ia][2:]
vmat[ia,:,:,:,p0:p1] += ipip[:,:,:,p0:p1]
vmat[ia,:,:,:,p0:p1] += ipip[:,:,p0:p1].transpose(1,0,3,2)
@@ -477,15 +480,15 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
YX, YY, YZ = 5, 7, 8
ZX, ZY, ZZ = 6, 8, 9
ao_deriv = 2
+ t1 = log.init_timer()
for ao_mask, mask, weight, coords \
in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory):
- t0 = log.init_timer()
nao_non0 = len(mask)
ao = contract('nip,ij->njp', ao_mask, coeff[mask])
rho = numint.eval_rho2(opt.mol, ao[:10], mo_coeff, mo_occ, mask, xctype)
- t0 = log.timer_debug1('eval rho', *t0)
+ t1 = log.timer_debug2('eval rho', *t1)
vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3]
- t0 = log.timer_debug1('eval vxc', *t0)
+ t1 = log.timer_debug2('eval vxc', *t1)
wv = weight * vxc
wv[0] *= .5
wv[4] *= .25
@@ -522,11 +525,13 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
_d1d2_dot_(vmat_tmp, mol, [ao_mask[YX], ao_mask[YY], ao_mask[YZ]], aow, mask, ao_loc, False)
aow = [numint._scale_ao(ao_mask[3], wv[i,4]) for i in range(3)]
_d1d2_dot_(vmat_tmp, mol, [ao_mask[ZX], ao_mask[ZY], ao_mask[ZZ]], aow, mask, ao_loc, False)
- vmat_tmp = contract('pi,xypq->xyiq', coeff[mask], vmat_tmp)
- vmat_tmp = contract('qj,xyiq->xyij', coeff[mask], vmat_tmp)
- vmat[ia] += vmat_tmp
- t0 = log.timer_debug1('integration', *t0)
+ #vmat_tmp = contract('pi,xypq->xyiq', coeff[mask], vmat_tmp)
+ #vmat_tmp = contract('qj,xyiq->xyij', coeff[mask], vmat_tmp)
+ #vmat[ia] += vmat_tmp
+ add_sparse(vmat[ia], vmat_tmp, mask)
+ t1 = log.timer_debug2('integration', *t1)
for ia in range(mol.natm):
+ vmat[ia] = take_last2d(vmat[ia], opt.rev_ao_idx)
p0, p1 = aoslices[ia][2:]
vmat[ia,:,:,:,p0:p1] += ipip[:,:,:,p0:p1]
vmat[ia,:,:,:,p0:p1] += ipip[:,:,p0:p1].transpose(1,0,3,2)
@@ -570,14 +575,14 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory):
max_memory = max(2000, max_memory-vmat.size*8/1e6)
if xctype == 'LDA':
ao_deriv = 1
+ t1 = t0 = log.init_timer()
for ao, mask, weight, coords \
in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory):
- t0 = log.init_timer()
ao = contract('nip,ij->njp', ao, coeff[mask])
rho = numint.eval_rho2(opt.mol, ao[0], mo_coeff, mo_occ, mask, xctype)
- t0 = log.timer_debug1('eval rho', *t0)
+ t1 = log.timer_debug2('eval rho', *t1)
vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3]
- t0 = log.timer_debug1('eval vxc', *t0)
+ t1 = log.timer_debug2('eval vxc', *t1)
wv = weight * vxc[0]
aow = numint._scale_ao(ao[0], wv)
v_ip += rks_grad._d1_dot_(ao[1:4], aow.T)
@@ -592,17 +597,17 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory):
aow = [numint._scale_ao(ao[0], wv[i]) for i in range(3)]
vmat[ia] += rks_grad._d1_dot_(aow, ao[0].T)
ao_dm0 = aow = None
- t0 = log.timer_debug1('integration', *t0)
+ t1 = log.timer_debug2('integration', *t1)
elif xctype == 'GGA':
ao_deriv = 2
+ t1 = t0 = log.init_timer()
for ao, mask, weight, coords \
in ni.block_loop(mol, grids, nao, ao_deriv, max_memory):
- t0 = log.init_timer()
ao = contract('nip,ij->njp', ao, coeff[mask])
rho = numint.eval_rho2(mol, ao[:4], mo_coeff, mo_occ, mask, xctype)
- t0 = log.timer_debug1('eval rho', *t0)
+ t1 = log.timer_debug2('eval rho', *t1)
vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3]
- t0 = log.timer_debug1('eval vxc', *t0)
+ t1 = log.timer_debug2('eval vxc', *t1)
wv = weight * vxc
wv[0] *= .5
v_ip += rks_grad._gga_grad_sum_(ao, wv)
@@ -616,20 +621,20 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory):
wv[:,0] *= .5
aow = [numint._scale_ao(ao[:4], wv[i,:4]) for i in range(3)]
vmat[ia] += rks_grad._d1_dot_(aow, ao[0].T)
- t0 = log.timer_debug1('integration', *t0)
+ t1 = log.timer_debug2('integration', *t1)
ao_dm0 = aow = None
elif xctype == 'MGGA':
if grids.level < 5:
log.warn('MGGA Hessian is sensitive to dft grids.')
ao_deriv = 2
+ t1 = t0 = log.init_timer()
for ao, mask, weight, coords \
in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory):
- t0 = log.init_timer()
ao = contract('nip,ij->njp', ao, coeff[mask])
rho = numint.eval_rho2(opt.mol, ao[:10], mo_coeff, mo_occ, mask, xctype)
- t0 = log.timer_debug1('eval rho', *t0)
+ t1 = log.timer_debug2('eval rho', *t1)
vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3]
- t0 = log.timer_debug1('eval vxc', *t0)
+ t1 = log.timer_debug2('eval vxc', *t0)
wv = weight * vxc
wv[0] *= .5
wv[4] *= .5 # for the factor 1/2 in tau
@@ -649,7 +654,7 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory):
aow = [numint._scale_ao(ao[j], wv[i,4]) for i in range(3)]
vmat[ia] += rks_grad._d1_dot_(aow, ao[j].T)
ao_dm0 = aow = None
- t0 = log.timer_debug1('integration', *t0)
+ t1 = log.timer_debug2('integration', *t1)
for ia in range(mol.natm):
p0, p1 = aoslices[ia][2:]
vmat[ia,:,p0:p1] += v_ip[:,p0:p1]
diff --git a/gpu4pyscf/lib/cupy_helper.py b/gpu4pyscf/lib/cupy_helper.py
index 9088583c..3e9414ef 100644
--- a/gpu4pyscf/lib/cupy_helper.py
+++ b/gpu4pyscf/lib/cupy_helper.py
@@ -165,9 +165,27 @@ def add_sparse(a, b, indices):
ctypes.c_int(count)
)
if err != 0:
- raise RecursionError('failed in sparse_add2d')
+ raise RuntimeError('failed in sparse_add2d')
return a
+def dist_matrix(coords, out=None):
+ assert coords.flags.c_contiguous
+ n = coords.shape[0]
+ if out is None:
+ out = cupy.empty([n,n])
+
+ stream = cupy.cuda.get_current_stream()
+ err = libcupy_helper.dist_matrix(
+ ctypes.cast(stream.ptr, ctypes.c_void_p),
+ ctypes.cast(out.data.ptr, ctypes.c_void_p),
+ ctypes.cast(coords.data.ptr, ctypes.c_void_p),
+ ctypes.cast(coords.data.ptr, ctypes.c_void_p),
+ ctypes.c_int(n),
+ )
+ if err != 0:
+ raise RuntimeError('failed in calculating distance matrix')
+ return out
+
def block_c2s_diag(ncart, nsph, angular, counts):
'''
constract a cartesian to spherical transformation of n shells
@@ -175,21 +193,18 @@ def block_c2s_diag(ncart, nsph, angular, counts):
nshells = np.sum(counts)
cart2sph = cupy.zeros([ncart, nsph])
- rows = [0]
- cols = [0]
+
+ rows = [np.array([0], dtype='int32')]
+ cols = [np.array([0], dtype='int32')]
offsets = []
for l, count in zip(angular, counts):
- for _ in range(count):
- r, c = c2s_l[l].shape
- rows.append(rows[-1] + r)
- cols.append(cols[-1] + c)
- offsets.append(c2s_offset[l])
- rows = np.asarray(rows, dtype='int32')
- cols = np.asarray(cols, dtype='int32')
- offsets = np.asarray(offsets, dtype='int32')
+ r, c = c2s_l[l].shape
+ rows.append(rows[-1][-1] + np.arange(1,count+1, dtype='int32') * r)
+ cols.append(cols[-1][-1] + np.arange(1,count+1, dtype='int32') * c)
+ offsets += [c2s_offset[l]] * count
- rows = cupy.asarray(rows, dtype='int32')
- cols = cupy.asarray(cols, dtype='int32')
+ rows = cupy.hstack(rows)
+ cols = cupy.hstack(cols)
offsets = cupy.asarray(offsets, dtype='int32')
stream = cupy.cuda.get_current_stream()
@@ -271,8 +286,10 @@ def transpose_sum(a):
return a + a.transpose(0,2,1)
'''
assert a.flags.c_contiguous
- assert a.ndim == 3
n = a.shape[-1]
+ if a.ndim == 2:
+ a = a.reshape([-1,n,n])
+ assert a.ndim == 3
count = a.shape[0]
stream = cupy.cuda.get_current_stream()
err = libcupy_helper.transpose_sum(
diff --git a/gpu4pyscf/lib/cupy_helper/CMakeLists.txt b/gpu4pyscf/lib/cupy_helper/CMakeLists.txt
index 3f59109e..445a10f6 100644
--- a/gpu4pyscf/lib/cupy_helper/CMakeLists.txt
+++ b/gpu4pyscf/lib/cupy_helper/CMakeLists.txt
@@ -17,13 +17,14 @@
#set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -arch=sm_80")
-add_library(cupy_helper SHARED
+add_library(cupy_helper SHARED
transpose.cu
block_diag.cu
contract_cderi_k.cu
take_last2d.cu
async_d2h_2d.cu
add_sparse.cu
+ dist_matrix.cu
)
set_target_properties(cupy_helper PROPERTIES
diff --git a/gpu4pyscf/lib/cupy_helper/dist_matrix.cu b/gpu4pyscf/lib/cupy_helper/dist_matrix.cu
new file mode 100644
index 00000000..77f78de7
--- /dev/null
+++ b/gpu4pyscf/lib/cupy_helper/dist_matrix.cu
@@ -0,0 +1,49 @@
+/* Copyright 2023 The GPU4PySCF Authors. All Rights Reserved.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see .
+ */
+
+#include
+#include
+#define THREADS 32
+
+__global__
+static void _calc_distances(double *dist, const double *x, const double *y, int n)
+{
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ if (i >= n || j >= n){
+ return;
+ }
+
+ double dx = x[3*i] - y[3*j];
+ double dy = x[3*i+1] - y[3*j+1];
+ double dz = x[3*i+2] - y[3*j+2];
+ dist[i*n+j] = norm3d(dx, dy, dz);
+}
+
+extern "C" {
+int dist_matrix(cudaStream_t stream, double *dist, const double *x, const double *y, int n)
+{
+ int ntile = (n + THREADS - 1) / THREADS;
+ dim3 threads(THREADS, THREADS);
+ dim3 blocks(ntile, ntile);
+ _calc_distances<<>>(dist, x, y, n);
+ cudaError_t err = cudaGetLastError();
+ if (err != cudaSuccess) {
+ return 1;
+ }
+ return 0;
+}
+}
diff --git a/gpu4pyscf/lib/gdft/gen_grids.cu b/gpu4pyscf/lib/gdft/gen_grids.cu
index d71eedf9..67cb869e 100644
--- a/gpu4pyscf/lib/gdft/gen_grids.cu
+++ b/gpu4pyscf/lib/gdft/gen_grids.cu
@@ -67,7 +67,7 @@ int ngrids, int natm)
dx = xi - xj_t;
dy = yi - yj_t;
dz = zi - zj_t;
- double dij = norm3d(dx, dy, dz);
+ double dij = rnorm3d(dx, dy, dz);
// distance between atom i and atom j
dij_smem[tx] = dij;
@@ -88,7 +88,7 @@ int ngrids, int natm)
double dij = dij_smem[l];
double aij = a_smem[l];
- double g = (atom_i == atom_j) ? 0.0 : (dig - djg) / dij;
+ double g = (atom_i == atom_j) ? 0.0 : (dig - djg) * dij;
// atomic radii adjust function
double g1 = g*g - 1.0;
diff --git a/gpu4pyscf/lib/gint/g3c2e.cu b/gpu4pyscf/lib/gint/g3c2e.cu
index 2395fbbe..c0f0f425 100644
--- a/gpu4pyscf/lib/gint/g3c2e.cu
+++ b/gpu4pyscf/lib/gint/g3c2e.cu
@@ -5,7 +5,7 @@ void GINTfill_int3c2e_kernel(GINTEnvVars envs, ERITensor eri, BasisProdOffsets o
int ntasks_kl = offsets.ntasks_kl;
int task_ij = blockIdx.x * blockDim.x + threadIdx.x;
int task_kl = blockIdx.y * blockDim.y + threadIdx.y;
-
+
if (task_ij >= ntasks_ij || task_kl >= ntasks_kl) {
return;
}
@@ -26,7 +26,7 @@ void GINTfill_int3c2e_kernel(GINTEnvVars envs, ERITensor eri, BasisProdOffsets o
int lsh = bas_pair2ket[bas_kl];
double uw[NROOTS*2];
double g[GSIZE_INT3C];
-
+
double* __restrict__ a12 = c_bpcache.a12;
double* __restrict__ x12 = c_bpcache.x12;
double* __restrict__ y12 = c_bpcache.y12;
@@ -48,9 +48,9 @@ void GINTfill_int3c2e_kernel(GINTEnvVars envs, ERITensor eri, BasisProdOffsets o
as_ksh = lsh;
as_lsh = ksh;
}
+ GINTmemset_int3c2e(envs, eri, ish, jsh, ksh);
for (ij = prim_ij; ij < prim_ij+nprim_ij; ++ij) {
for (kl = prim_kl; kl < prim_kl+nprim_kl; ++kl) {
-
double aij = a12[ij];
double xij = x12[ij];
double yij = y12[ij];
@@ -65,7 +65,7 @@ void GINTfill_int3c2e_kernel(GINTEnvVars envs, ERITensor eri, BasisProdOffsets o
double aijkl = aij + akl;
double a1 = aij * akl;
double a0 = a1 / aijkl;
- double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0;
+ double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0;
a0 *= theta;
double x = a0 * (xijxkl * xijxkl + yijykl * yijykl + zijzkl * zijzkl);
GINTrys_root(x, uw);
diff --git a/gpu4pyscf/lib/gint/gout3c2e.cu b/gpu4pyscf/lib/gint/gout3c2e.cu
index fb4ff75e..c6fb1eba 100644
--- a/gpu4pyscf/lib/gint/gout3c2e.cu
+++ b/gpu4pyscf/lib/gint/gout3c2e.cu
@@ -235,15 +235,18 @@ static void GINTwrite_int3c2e_ipip_direct(GINTEnvVars envs, ERITensor eri, doubl
double eri_zy = 0;
double eri_zz = 0;
for (int ir = 0; ir < NROOTS; ++ir){
- eri_xx += g3[ix + ir] * g0[iy + ir] * g0[iz + ir];
- eri_xy += g2[ix + ir] * g1[iy + ir] * g0[iz + ir];
- eri_xz += g2[ix + ir] * g0[iy + ir] * g1[iz + ir];
- eri_yx += g1[ix + ir] * g2[iy + ir] * g0[iz + ir];
- eri_yy += g0[ix + ir] * g3[iy + ir] * g0[iz + ir];
- eri_yz += g0[ix + ir] * g2[iy + ir] * g1[iz + ir];
- eri_zx += g1[ix + ir] * g0[iy + ir] * g2[iz + ir];
- eri_zy += g0[ix + ir] * g1[iy + ir] * g2[iz + ir];
- eri_zz += g0[ix + ir] * g0[iy + ir] * g3[iz + ir];
+ double g0_x = g0[ix + ir];
+ double g0_y = g0[iy + ir];
+ double g0_z = g0[iz + ir];
+ eri_xx += g3[ix + ir] * g0_y * g0_z ;
+ eri_xy += g2[ix + ir] * g1[iy + ir] * g0_z ;
+ eri_xz += g2[ix + ir] * g0_y * g1[iz + ir];
+ eri_yx += g1[ix + ir] * g2[iy + ir] * g0_z ;
+ eri_yy += g0_x * g3[iy + ir] * g0_z ;
+ eri_yz += g0_x * g2[iy + ir] * g1[iz + ir];
+ eri_zx += g1[ix + ir] * g0_y * g2[iz + ir];
+ eri_zy += g0_x * g1[iy + ir] * g2[iz + ir];
+ eri_zz += g0_x * g0_y * g3[iz + ir];
}
off = i+jstride*j;
pxx_eri[off] += eri_xx;
@@ -314,13 +317,38 @@ static void GINTwrite_int3c2e_ip_direct(GINTEnvVars envs, ERITensor eri, double*
}
}
+template __device__
+static void GINTmemset_int3c2e(GINTEnvVars envs, ERITensor eri, int ish, int jsh, int ksh)
+{
+ int *ao_loc = c_bpcache.ao_loc;
+ size_t jstride = eri.stride_j;
+ size_t kstride = eri.stride_k;
+ int i0 = ao_loc[ish ] - eri.ao_offsets_i;
+ int i1 = ao_loc[ish+1] - eri.ao_offsets_i;
+ int j0 = ao_loc[jsh ] - eri.ao_offsets_j;
+ int j1 = ao_loc[jsh+1] - eri.ao_offsets_j;
+ int k0 = ao_loc[ksh ] - eri.ao_offsets_k;
+ int k1 = ao_loc[ksh+1] - eri.ao_offsets_k;
+ int i, j, k, n;
+ double* __restrict__ p_eri;
+
+ for (n = 0, k = k0; k < k1; ++k) {
+ p_eri = eri.data + k * kstride;
+
+ for (j = j0; j < j1; ++j) {
+ for (i = i0; i < i1; ++i, ++n) {
+ p_eri[i+jstride*j] = 0;
+ }
+ }
+ }
+}
+
template __device__
static void GINTwrite_int3c2e_direct(GINTEnvVars envs, ERITensor eri, double* g, int ish, int jsh, int ksh)
{
int *ao_loc = c_bpcache.ao_loc;
size_t jstride = eri.stride_j;
size_t kstride = eri.stride_k;
- size_t lstride = eri.stride_l;
int i0 = ao_loc[ish ] - eri.ao_offsets_i;
int i1 = ao_loc[ish+1] - eri.ao_offsets_i;
int j0 = ao_loc[jsh ] - eri.ao_offsets_j;
@@ -337,7 +365,7 @@ static void GINTwrite_int3c2e_direct(GINTEnvVars envs, ERITensor eri, double* g,
int ix, iy, iz, off;
for (n = 0, k = k0; k < k1; ++k) {
- p_eri = eri.data + 0 * lstride + k * kstride;
+ p_eri = eri.data + k * kstride;
for (j = j0; j < j1; ++j) {
for (i = i0; i < i1; ++i, ++n) {
diff --git a/gpu4pyscf/lib/gint/nr_fill_ao_int3c2e.cu b/gpu4pyscf/lib/gint/nr_fill_ao_int3c2e.cu
index 0953fc10..bcb5e382 100644
--- a/gpu4pyscf/lib/gint/nr_fill_ao_int3c2e.cu
+++ b/gpu4pyscf/lib/gint/nr_fill_ao_int3c2e.cu
@@ -41,7 +41,7 @@ static int GINTfill_int3c2e_tasks(ERITensor *eri, BasisProdOffsets *offsets, GIN
int ntasks_kl = offsets->ntasks_kl;
assert(ntasks_kl < 65536*THREADSY);
int type_ijkl;
-
+
dim3 threads(THREADSX, THREADSY);
dim3 blocks((ntasks_ij+THREADSX-1)/THREADSX, (ntasks_kl+THREADSY-1)/THREADSY);
switch (nrys_roots) {
@@ -121,14 +121,14 @@ int GINTfill_int3c2e(cudaStream_t stream, BasisProdCache *bpcache, double *eri,
ContractionProdType *cp_kl = bpcache->cptype + cp_kl_id;
GINTEnvVars envs;
int ng[4] = {0,0,0,0};
-
+
GINTinit_EnvVars(&envs, cp_ij, cp_kl, ng);
envs.omega = omega;
if (envs.nrys_roots > 8) {
return 2;
}
-
+
// TODO: improve the efficiency by unrolling
if (envs.nrys_roots > 1) {
int16_t *idx4c = (int16_t *)malloc(sizeof(int16_t) * envs.nf * 3);
@@ -136,9 +136,9 @@ int GINTfill_int3c2e(cudaStream_t stream, BasisProdCache *bpcache, double *eri,
checkCudaErrors(cudaMemcpyToSymbol(c_idx4c, idx4c, sizeof(int16_t)*envs.nf*3));
free(idx4c);
}
-
+
int kl_bin, ij_bin1;
-
+
//checkCudaErrors(cudaMemcpyToSymbol(c_envs, &envs, sizeof(GINTEnvVars)));
// move bpcache to constant memory
checkCudaErrors(cudaMemcpyToSymbol(c_bpcache, bpcache, sizeof(BasisProdCache)));
@@ -154,7 +154,7 @@ int GINTfill_int3c2e(cudaStream_t stream, BasisProdCache *bpcache, double *eri,
eritensor.nao = nao;
eritensor.data = eri;
BasisProdOffsets offsets;
-
+
int *bas_pairs_locs = bpcache->bas_pairs_locs;
int *primitive_pairs_locs = bpcache->primitive_pairs_locs;
for (kl_bin = 0; kl_bin < nbins; kl_bin++) {
@@ -182,12 +182,12 @@ int GINTfill_int3c2e(cudaStream_t stream, BasisProdCache *bpcache, double *eri,
int err = -1;
err = GINTfill_int3c2e_tasks(&eritensor, &offsets, &envs, stream);
-
+
if (err != 0) {
return err;
}
}
-
+
return 0;
}
}
diff --git a/gpu4pyscf/lib/gvhf/g3c2e.cuh b/gpu4pyscf/lib/gvhf/g3c2e.cuh
index c82d037c..9195cddd 100644
--- a/gpu4pyscf/lib/gvhf/g3c2e.cuh
+++ b/gpu4pyscf/lib/gvhf/g3c2e.cuh
@@ -58,7 +58,7 @@ static void GINTkernel_int3c2e_ip1_getjk(JKMatrix jk, double* __restrict__ gout,
double sx = gout[3*n];
double sy = gout[3*n + 1];
double sz = gout[3*n + 2];
-
+
double rhoj_tmp = dm[off_dm] * rhoj[k];
double rhok_tmp = rhok[off_rhok];
@@ -73,7 +73,7 @@ static void GINTkernel_int3c2e_ip1_getjk(JKMatrix jk, double* __restrict__ gout,
}
}
}
-
+
for (i = i0; i < i1; ++i){
int ii = 3*(i-i0);
atomicAdd(vj + i + 0*nao, j3[ii + 0]);
@@ -107,22 +107,22 @@ static void GINTkernel_int3c2e_ip2_getjk(JKMatrix jk, double* __restrict__ gout,
double* __restrict__ dm = jk.dm;
double j3[GPU_CART_MAX * 3];
double k3[GPU_CART_MAX * 3];
-
+
for (k = 0; k < (k1-k0) * 3; k++){
j3[k] = 0.0;
k3[k] = 0.0;
}
-
+
for (n = 0, k = k0; k < k1; ++k) {
for (j = j0; j < j1; ++j) {
for (i = i0; i < i1; ++i, ++n) {
off_dm = i + nao*j;
off_rhok = i + nao*j + k*nao*nao;
-
+
double sx = gout[3 * n];
double sy = gout[3 * n + 1];
double sz = gout[3 * n + 2];
-
+
double rhoj_tmp = dm[off_dm] * rhoj[k];
double rhok_tmp = rhok[off_rhok];
@@ -143,7 +143,7 @@ static void GINTkernel_int3c2e_ip2_getjk(JKMatrix jk, double* __restrict__ gout,
atomicAdd(vj + k + 0*naux, j3[kk + 0]);
atomicAdd(vj + k + 1*naux, j3[kk + 1]);
atomicAdd(vj + k + 2*naux, j3[kk + 2]);
-
+
atomicAdd(vk + k + 0*naux, k3[kk + 0]);
atomicAdd(vk + k + 1*naux, k3[kk + 1]);
atomicAdd(vk + k + 2*naux, k3[kk + 2]);
@@ -169,7 +169,7 @@ static void GINTkernel_int3c2e_getj_pass1(GINTEnvVars envs, JKMatrix jk, double*
int i, j, k;
double* __restrict__ rhoj = jk.rhoj;
double* __restrict__ dm = jk.dm;
-
+
int i_l = envs.i_l;
int j_l = envs.j_l;
int k_l = envs.k_l;
@@ -243,11 +243,11 @@ static void GINTkernel_int3c2e_getj_pass2(GINTEnvVars envs, JKMatrix jk, double*
double vj_tmp = 0.0;
for (k = k0; k < k1; ++k){
int kp = k - k0;
-
+
int loc_k = c_l_locs[k_l] + kp;
int loc_j = c_l_locs[j_l] + jp;
int loc_i = c_l_locs[i_l] + ip;
-
+
int ix = dk * idx[loc_k] + dj * idx[loc_j] + di * idx[loc_i];
int iy = dk * idy[loc_k] + dj * idy[loc_j] + di * idy[loc_i] + envs.g_size;
int iz = dk * idz[loc_k] + dj * idz[loc_j] + di * idz[loc_i] + envs.g_size * 2;
@@ -306,7 +306,7 @@ static void GINTkernel_int3c2e_ip1_getjk_direct(GINTEnvVars envs, JKMatrix jk, d
int jp = j - j0;
for (i = i0; i < i1; ++i) {
int ip = i - i0;
-
+
int loc_k = c_l_locs[k_l] + kp;
int loc_j = c_l_locs[j_l] + jp;
int loc_i = c_l_locs[i_l] + ip;
@@ -327,7 +327,7 @@ static void GINTkernel_int3c2e_ip1_getjk_direct(GINTEnvVars envs, JKMatrix jk, d
sy += gx * f[iy + ir] * gz;
sz += gx * gy * f[iz + ir];
}
-
+
int ii = 3*(i-i0);
off_rhok = i + nao*j + k*nao*nao;
double rhok_tmp = rhok[off_rhok];
@@ -348,7 +348,7 @@ static void GINTkernel_int3c2e_ip1_getjk_direct(GINTEnvVars envs, JKMatrix jk, d
int jp = j - j0;
for (i = i0; i < i1; ++i) {
int ip = i - i0;
-
+
int loc_k = c_l_locs[k_l] + kp;
int loc_j = c_l_locs[j_l] + jp;
int loc_i = c_l_locs[i_l] + ip;
@@ -388,7 +388,7 @@ static void GINTkernel_int3c2e_ip1_getjk_direct(GINTEnvVars envs, JKMatrix jk, d
int jp = j - j0;
for (i = i0; i < i1; ++i) {
int ip = i - i0;
-
+
int loc_k = c_l_locs[k_l] + kp;
int loc_j = c_l_locs[j_l] + jp;
int loc_i = c_l_locs[i_l] + ip;
@@ -453,7 +453,7 @@ static void GINTkernel_int3c2e_ip2_getjk_direct(GINTEnvVars envs, JKMatrix jk, d
double* __restrict__ rhoj = jk.rhoj;
double* __restrict__ rhok = jk.rhok;
double* __restrict__ dm = jk.dm;
-
+
int i_l = envs.i_l;
int j_l = envs.j_l;
int k_l = envs.k_l;
@@ -468,7 +468,7 @@ static void GINTkernel_int3c2e_ip2_getjk_direct(GINTEnvVars envs, JKMatrix jk, d
int jp = j - j0;
for (i = i0; i < i1; ++i) {
int ip = i - i0;
-
+
int loc_k = c_l_locs[k_l] + kp;
int loc_j = c_l_locs[j_l] + jp;
int loc_i = c_l_locs[i_l] + ip;
@@ -476,7 +476,7 @@ static void GINTkernel_int3c2e_ip2_getjk_direct(GINTEnvVars envs, JKMatrix jk, d
int ix = dk * idx[loc_k] + dj * idx[loc_j] + di * idx[loc_i];
int iy = dk * idy[loc_k] + dj * idy[loc_j] + di * idy[loc_i] + g_size;
int iz = dk * idz[loc_k] + dj * idz[loc_j] + di * idz[loc_i] + g_size * 2;
-
+
double sx = 0.0;
double sy = 0.0;
double sz = 0.0;
@@ -510,7 +510,7 @@ static void GINTkernel_int3c2e_ip2_getjk_direct(GINTEnvVars envs, JKMatrix jk, d
int jp = j - j0;
for (i = i0; i < i1; ++i) {
int ip = i - i0;
-
+
int loc_k = c_l_locs[k_l] + kp;
int loc_j = c_l_locs[j_l] + jp;
int loc_i = c_l_locs[i_l] + ip;
@@ -518,7 +518,7 @@ static void GINTkernel_int3c2e_ip2_getjk_direct(GINTEnvVars envs, JKMatrix jk, d
int ix = dk * idx[loc_k] + dj * idx[loc_j] + di * idx[loc_i];
int iy = dk * idy[loc_k] + dj * idy[loc_j] + di * idy[loc_i] + g_size;
int iz = dk * idz[loc_k] + dj * idz[loc_j] + di * idz[loc_i] + g_size * 2;
-
+
double sx = 0.0;
double sy = 0.0;
double sz = 0.0;
@@ -551,7 +551,7 @@ static void GINTkernel_int3c2e_ip2_getjk_direct(GINTEnvVars envs, JKMatrix jk, d
int jp = j - j0;
for (i = i0; i < i1; ++i) {
int ip = i - i0;
-
+
int loc_k = c_l_locs[k_l] + kp;
int loc_j = c_l_locs[j_l] + jp;
int loc_i = c_l_locs[i_l] + ip;
@@ -559,7 +559,7 @@ static void GINTkernel_int3c2e_ip2_getjk_direct(GINTEnvVars envs, JKMatrix jk, d
int ix = dk * idx[loc_k] + dj * idx[loc_j] + di * idx[loc_i];
int iy = dk * idy[loc_k] + dj * idy[loc_j] + di * idy[loc_i] + g_size;
int iz = dk * idz[loc_k] + dj * idz[loc_j] + di * idz[loc_i] + g_size * 2;
-
+
double sx = 0.0;
double sy = 0.0;
double sz = 0.0;
@@ -579,7 +579,7 @@ static void GINTkernel_int3c2e_ip2_getjk_direct(GINTEnvVars envs, JKMatrix jk, d
j3[kk + 0] += sx * rhoj_tmp;
j3[kk + 1] += sy * rhoj_tmp;
j3[kk + 2] += sz * rhoj_tmp;
-
+
off_rhok = i + nao*j + k*nao*nao;
double rhok_tmp = rhok[off_rhok];
k3[kk + 0] += sx * rhok_tmp;
@@ -602,7 +602,7 @@ static void write_int3c2e_ip1_jk(JKMatrix jk, double* j3, double* k3, int ish){
int tx = threadIdx.x;
int ty = threadIdx.y;
__shared__ double sdata[THREADSX][THREADSY];
-
+
if (vj != NULL){
for (int i = i0; i < i1; ++i){
for (int j = 0; j < 3; j++){
@@ -644,7 +644,7 @@ static void write_int3c2e_ip2_jk(JKMatrix jk, double *j3, double* k3, int ksh){
int tx = threadIdx.x;
int ty = threadIdx.y;
__shared__ double sdata[THREADSX][THREADSY];
-
+
if (vj != NULL){
for (int k = k0; k < k1; ++k){
for (int j = 0; j < 3; j++){
@@ -741,7 +741,7 @@ static void GINTrun_int3c2e_ip1_jk_kernel1000(GINTEnvVars envs, JKMatrix jk, Bas
double aijkl = aij + akl;
double a1 = aij * akl;
double a0 = a1 / aijkl;
- double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0;
+ double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0;
a0 *= theta;
double x = a0 * (xijxkl * xijxkl + yijykl * yijykl + zijzkl * zijzkl);
double fac = eij * ekl * sqrt(a0 / (a1 * a1 * a1));
@@ -770,11 +770,11 @@ static void GINTrun_int3c2e_ip1_jk_kernel1000(GINTEnvVars envs, JKMatrix jk, Bas
double g_3 = c00y;
double g_4 = norm * fac * weight0;
double g_5 = g_4 * c00z;
-
+
double f_1 = ai2 * g_1;
double f_3 = ai2 * g_3;
double f_5 = ai2 * g_5;
-
+
gout0 += f_1 * g_2 * g_4;
gout1 += g_0 * f_3 * g_4;
gout2 += g_0 * g_2 * f_5;
@@ -784,14 +784,14 @@ static void GINTrun_int3c2e_ip1_jk_kernel1000(GINTEnvVars envs, JKMatrix jk, Bas
int i0 = ao_loc[ish] - jk.ao_offsets_i;
int j0 = ao_loc[jsh] - jk.ao_offsets_j;
int k0 = ao_loc[ksh] - jk.ao_offsets_k;
-
+
int nao = jk.nao;
double* __restrict__ dm = jk.dm;
double* __restrict__ rhok = jk.rhok;
double* __restrict__ rhoj = jk.rhoj;
double* __restrict__ vj = jk.vj;
double* __restrict__ vk = jk.vk;
-
+
int tx = threadIdx.x;
int ty = threadIdx.y;
__shared__ double sdata[THREADSX][THREADSY];
@@ -903,7 +903,7 @@ static void GINTrun_int3c2e_ip2_jk_kernel0010(GINTEnvVars envs, JKMatrix jk, Bas
double aijkl = aij + akl;
double a1 = aij * akl;
double a0 = a1 / aijkl;
- double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0;
+ double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0;
a0 *= theta;
double x = a0 * (xijxkl * xijxkl + yijykl * yijykl + zijzkl * zijzkl);
double fac = norm * eij * ekl * sqrt(a0 / (a1 * a1 * a1));
diff --git a/gpu4pyscf/lib/gvhf/g3c2e_pass1.cu b/gpu4pyscf/lib/gvhf/g3c2e_pass1.cu
index 0bd30319..cc100d15 100644
--- a/gpu4pyscf/lib/gvhf/g3c2e_pass1.cu
+++ b/gpu4pyscf/lib/gvhf/g3c2e_pass1.cu
@@ -13,7 +13,7 @@
* You should have received a copy of the GNU General Public License
* along with this program. If not, see .
*/
-
+
template __global__
void GINTint3c2e_pass1_j_kernel(GINTEnvVars envs, JKMatrix jk, BasisProdOffsets offsets)
@@ -22,7 +22,7 @@ void GINTint3c2e_pass1_j_kernel(GINTEnvVars envs, JKMatrix jk, BasisProdOffsets
int ntasks_kl = offsets.ntasks_kl;
int task_ij = blockIdx.x * blockDim.x + threadIdx.x;
int task_kl = blockIdx.y * blockDim.y + threadIdx.y;
-
+
if (task_ij >= ntasks_ij || task_kl >= ntasks_kl) {
return;
}
@@ -42,12 +42,16 @@ void GINTint3c2e_pass1_j_kernel(GINTEnvVars envs, JKMatrix jk, BasisProdOffsets
int lsh = bas_pair2ket[bas_kl];
double uw[NROOTS*2];
double g[GSIZE];
-
+
double* __restrict__ a12 = c_bpcache.a12;
double* __restrict__ x12 = c_bpcache.x12;
double* __restrict__ y12 = c_bpcache.y12;
double* __restrict__ z12 = c_bpcache.z12;
+ if (ish == jsh){
+ norm *= .5;
+ }
+
int ij, kl;
int as_ish, as_jsh, as_ksh, as_lsh;
if (envs.ibase) {
@@ -70,7 +74,7 @@ void GINTint3c2e_pass1_j_kernel(GINTEnvVars envs, JKMatrix jk, BasisProdOffsets
double xij = x12[ij];
double yij = y12[ij];
double zij = z12[ij];
- for (kl = prim_kl; kl < prim_kl+nprim_kl; ++kl) {
+ for (kl = prim_kl; kl < prim_kl+nprim_kl; ++kl) {
double akl = a12[kl];
double xkl = x12[kl];
double ykl = y12[kl];
diff --git a/gpu4pyscf/lib/gvhf/g3c2e_pass1_root1.cu b/gpu4pyscf/lib/gvhf/g3c2e_pass1_root1.cu
index cb9dbf11..1216726a 100644
--- a/gpu4pyscf/lib/gvhf/g3c2e_pass1_root1.cu
+++ b/gpu4pyscf/lib/gvhf/g3c2e_pass1_root1.cu
@@ -41,6 +41,9 @@ static void GINTint3c2e_pass1_j_kernel0000(GINTEnvVars envs, JKMatrix jk, BasisP
double* __restrict__ x12 = c_bpcache.x12;
double* __restrict__ y12 = c_bpcache.y12;
double* __restrict__ z12 = c_bpcache.z12;
+ if (ish == jsh){
+ norm *= .5;
+ }
int ij, kl;
double gout0 = 0;
for (ij = prim_ij; ij < prim_ij+nprim_ij; ++ij) {
@@ -70,7 +73,7 @@ static void GINTint3c2e_pass1_j_kernel0000(GINTEnvVars envs, JKMatrix jk, BasisP
}
gout0 += fac;
} }
-
+
int *ao_loc = c_bpcache.ao_loc;
int nao = jk.nao;
int i0 = ao_loc[ish] - jk.ao_offsets_i;
@@ -107,6 +110,9 @@ static void GINTint3c2e_pass1_j_kernel0010(GINTEnvVars envs, JKMatrix jk, BasisP
double* __restrict__ x12 = c_bpcache.x12;
double* __restrict__ y12 = c_bpcache.y12;
double* __restrict__ z12 = c_bpcache.z12;
+ if (ish == jsh){
+ norm *= .5;
+ }
int ij, kl;
int prim_ij0, prim_ij1, prim_kl0, prim_kl1;
int nbas = c_bpcache.nbas;
@@ -215,6 +221,9 @@ static void GINTint3c2e_pass1_j_kernel1000(GINTEnvVars envs, JKMatrix jk, BasisP
double* __restrict__ x12 = c_bpcache.x12;
double* __restrict__ y12 = c_bpcache.y12;
double* __restrict__ z12 = c_bpcache.z12;
+ if (ish == jsh){
+ norm *= .5;
+ }
int ij, kl;
int prim_ij0, prim_ij1, prim_kl0, prim_kl1;
int nbas = c_bpcache.nbas;
diff --git a/gpu4pyscf/lib/gvhf/g3c2e_pass2.cu b/gpu4pyscf/lib/gvhf/g3c2e_pass2.cu
index 83d37261..6e395ce7 100644
--- a/gpu4pyscf/lib/gvhf/g3c2e_pass2.cu
+++ b/gpu4pyscf/lib/gvhf/g3c2e_pass2.cu
@@ -13,7 +13,7 @@
* You should have received a copy of the GNU General Public License
* along with this program. If not, see .
*/
-
+
template __global__
void GINTint3c2e_pass2_j_kernel(GINTEnvVars envs, JKMatrix jk, BasisProdOffsets offsets)
@@ -22,7 +22,7 @@ void GINTint3c2e_pass2_j_kernel(GINTEnvVars envs, JKMatrix jk, BasisProdOffsets
int ntasks_kl = offsets.ntasks_kl;
int task_ij = blockIdx.x * blockDim.x + threadIdx.x;
int task_kl = blockIdx.y * blockDim.y + threadIdx.y;
-
+
if (task_ij >= ntasks_ij || task_kl >= ntasks_kl) {
return;
}
@@ -42,7 +42,9 @@ void GINTint3c2e_pass2_j_kernel(GINTEnvVars envs, JKMatrix jk, BasisProdOffsets
int lsh = bas_pair2ket[bas_kl];
double uw[NROOTS*2];
double g[GSIZE];
-
+ if (ish == jsh){
+ norm *= .5;
+ }
double* __restrict__ a12 = c_bpcache.a12;
double* __restrict__ x12 = c_bpcache.x12;
double* __restrict__ y12 = c_bpcache.y12;
@@ -70,7 +72,7 @@ void GINTint3c2e_pass2_j_kernel(GINTEnvVars envs, JKMatrix jk, BasisProdOffsets
double xij = x12[ij];
double yij = y12[ij];
double zij = z12[ij];
- for (kl = prim_kl; kl < prim_kl+nprim_kl; ++kl) {
+ for (kl = prim_kl; kl < prim_kl+nprim_kl; ++kl) {
double akl = a12[kl];
double xkl = x12[kl];
double ykl = y12[kl];
diff --git a/gpu4pyscf/lib/gvhf/g3c2e_pass2_root1.cu b/gpu4pyscf/lib/gvhf/g3c2e_pass2_root1.cu
index 2f76578c..47c65d3f 100644
--- a/gpu4pyscf/lib/gvhf/g3c2e_pass2_root1.cu
+++ b/gpu4pyscf/lib/gvhf/g3c2e_pass2_root1.cu
@@ -42,6 +42,9 @@ static void GINTint3c2e_pass2_j_kernel0000(GINTEnvVars envs, JKMatrix jk, BasisP
double* __restrict__ x12 = c_bpcache.x12;
double* __restrict__ y12 = c_bpcache.y12;
double* __restrict__ z12 = c_bpcache.z12;
+ if (ish == jsh){
+ norm *= .5;
+ }
int ij, kl;
double gout0 = 0;
for (ij = prim_ij; ij < prim_ij+nprim_ij; ++ij) {
@@ -71,7 +74,7 @@ static void GINTint3c2e_pass2_j_kernel0000(GINTEnvVars envs, JKMatrix jk, BasisP
}
gout0 += fac;
} }
-
+
int *ao_loc = c_bpcache.ao_loc;
int nao = jk.nao;
int i0 = ao_loc[ish] - jk.ao_offsets_i;
@@ -113,7 +116,9 @@ static void GINTint3c2e_pass2_j_kernel0010(GINTEnvVars envs, JKMatrix jk, BasisP
double* __restrict__ bas_x = c_bpcache.bas_coords;
double* __restrict__ bas_y = bas_x + nbas;
double* __restrict__ bas_z = bas_y + nbas;
-
+ if (ish == jsh){
+ norm *= .5;
+ }
double gout0 = 0;
double gout1 = 0;
double gout2 = 0;
@@ -219,7 +224,9 @@ static void GINTint3c2e_pass2_j_kernel1000(GINTEnvVars envs, JKMatrix jk, BasisP
double* __restrict__ bas_x = c_bpcache.bas_coords;
double* __restrict__ bas_y = bas_x + nbas;
double* __restrict__ bas_z = bas_y + nbas;
-
+ if (ish == jsh){
+ norm *= .5;
+ }
double gout0 = 0;
double gout1 = 0;
double gout2 = 0;
diff --git a/gpu4pyscf/lib/logger.py b/gpu4pyscf/lib/logger.py
index 7b46fb27..5ca2ec13 100644
--- a/gpu4pyscf/lib/logger.py
+++ b/gpu4pyscf/lib/logger.py
@@ -25,6 +25,7 @@
WARN = lib.logger.WARN
DEBUG = lib.logger.DEBUG
DEBUG1= lib.logger.DEBUG1
+DEBUG2= lib.logger.DEBUG2
TIMER_LEVEL = lib.logger.TIMER_LEVEL
flush = lib.logger.flush
@@ -84,17 +85,24 @@ def _timer_debug1(rec, msg, cpu0=None, wall0=None, gpu0=None, sync=True):
rec._t0 = process_clock()
return rec._t0,
+def _timer_debug2(rec, msg, cpu0=None, wall0=None, gpu0=None, sync=True):
+ if rec.verbose >= DEBUG2:
+ return timer(rec, msg, cpu0, wall0, gpu0)
+ return cpu0, wall0, gpu0
+
info = lib.logger.info
note = lib.logger.note
debug = lib.logger.debug
debug1 = lib.logger.debug1
debug2 = lib.logger.debug2
timer_debug1 = _timer_debug1
+timer_debug2 = _timer_debug2
class Logger(lib.logger.Logger):
def __init__(self, stdout=sys.stdout, verbose=NOTE):
super().__init__(stdout=stdout, verbose=verbose)
timer_debug1 = _timer_debug1
+ timer_debug2 = _timer_debug2
timer = timer
init_timer = init_timer
diff --git a/gpu4pyscf/lib/tests/test_cupy_helper.py b/gpu4pyscf/lib/tests/test_cupy_helper.py
index 614e0fa3..116f4d5e 100644
--- a/gpu4pyscf/lib/tests/test_cupy_helper.py
+++ b/gpu4pyscf/lib/tests/test_cupy_helper.py
@@ -18,7 +18,7 @@
import cupy
from gpu4pyscf.lib.cupy_helper import (
take_last2d, transpose_sum, krylov, unpack_sparse,
- add_sparse)
+ add_sparse, dist_matrix)
class KnownValues(unittest.TestCase):
def test_take_last2d(self):
@@ -69,6 +69,13 @@ def test_sparse(self):
add_sparse(a, b, indices)
assert cupy.linalg.norm(a - a0) < 1e-10
+ def test_dist_matrix(self):
+ a = cupy.random.rand(4, 3)
+ rij = cupy.sum((a[:,None,:] - a[None,:,:])**2, axis=2)**0.5
+
+ rij0 = dist_matrix(a)
+ assert cupy.linalg.norm(rij - rij0) < 1e-10
+
if __name__ == "__main__":
print("Full tests for cupy helper module")
unittest.main()
\ No newline at end of file
diff --git a/gpu4pyscf/solvent/pcm.py b/gpu4pyscf/solvent/pcm.py
index 428e5a08..d2a4e9b6 100644
--- a/gpu4pyscf/solvent/pcm.py
+++ b/gpu4pyscf/solvent/pcm.py
@@ -29,6 +29,7 @@
from gpu4pyscf.solvent import _attach_solvent
from gpu4pyscf.df import int3c2e
from gpu4pyscf.lib import logger
+from gpu4pyscf.lib.cupy_helper import dist_matrix
libdft = lib.load_library('libdft')
@@ -191,7 +192,8 @@ def get_D_S(surface, with_S=True, with_D=False):
xi_i, xi_j = cupy.meshgrid(charge_exp, charge_exp, indexing='ij')
xi_ij = xi_i * xi_j / (xi_i**2 + xi_j**2)**0.5
#rij = scipy.spatial.distance.cdist(grid_coords, grid_coords)
- rij = cupy.sum((grid_coords[:,None,:] - grid_coords[None,:,:])**2, axis=2)**0.5
+ #rij = cupy.sum((grid_coords[:,None,:] - grid_coords[None,:,:])**2, axis=2)**0.5
+ rij = dist_matrix(grid_coords)
xi_r_ij = xi_ij * rij
cupy.fill_diagonal(rij, 1)
S = scipy.special.erf(xi_r_ij) / rij