diff --git a/examples/18-to_gpu0.py b/examples/18-to_gpu0.py
new file mode 100644
index 00000000..d017047a
--- /dev/null
+++ b/examples/18-to_gpu0.py
@@ -0,0 +1,46 @@
+# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import pyscf
+from pyscf import lib
+from pyscf.dft import rks
+lib.num_threads(8)
+
+atom = '''
+O       0.0000000000    -0.0000000000     0.1174000000
+H      -0.7570000000    -0.0000000000    -0.4696000000
+H       0.7570000000     0.0000000000    -0.4696000000
+'''
+
+mol = pyscf.M(atom=atom, basis='def2-tzvpp', max_memory=32000, output='./pyscf.log')
+
+mol.verbose = 4
+mf = rks.RKS(mol, xc='B3LYP').density_fit()
+mf_GPU = mf.to_gpu()
+
+# Compute Energy
+e_dft = mf_GPU.kernel()
+print(f"total energy = {e_dft}")
+
+# Compute Gradient
+g = mf_GPU.nuc_grad_method()
+g.max_memory = 20000
+g.auxbasis_response = True
+g_dft = g.kernel()
+
+# Compute Hessian
+h = mf_GPU.Hessian()
+h.auxbasis_response = 2
+h_dft = h.kernel()
diff --git a/examples/19-pcm_optimize.py b/examples/19-pcm_optimize.py
new file mode 100644
index 00000000..75ba681b
--- /dev/null
+++ b/examples/19-pcm_optimize.py
@@ -0,0 +1,41 @@
+# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import numpy as np
+import pyscf
+from pyscf import lib, df
+from gpu4pyscf.dft import rks
+from pyscf.geomopt.geometric_solver import optimize
+lib.num_threads(8)
+
+atom ='''
+O       0.0000000000    -0.0000000000     0.1174000000
+H      -0.7570000000    -0.0000000000    -0.4696000000
+H       0.7570000000     0.0000000000    -0.4696000000
+'''
+
+mol = pyscf.M(atom=atom, basis='def2-tzvpp', verbose=0)
+
+mf = rks.RKS(mol, xc='HYB_GGA_XC_B3LYP').density_fit()
+mf = mf.PCM()
+mf.verbose = 3
+mf.grids.atom_grid = (99,590)
+mf.small_rho_cutoff = 1e-10
+mf.with_solvent.lebedev_order = 29 # 302 Lebedev grids
+mf.with_solvent.method = 'C-PCM'
+mf.with_solvent.eps = 78.3553
+
+mol_eq = optimize(mf, maxsteps=20)
+
diff --git a/examples/dft_driver.py b/examples/dft_driver.py
index 80830e6c..9d17560c 100644
--- a/examples/dft_driver.py
+++ b/examples/dft_driver.py
@@ -27,6 +27,7 @@
 parser.add_argument("--solvent",  type=str, default='')
 args = parser.parse_args()
 
+lib.num_threads(16)
 start_time = time.time()
 bas = args.basis
 mol = pyscf.M(
@@ -34,10 +35,10 @@
     basis=bas,
     max_memory=32000)
 # set verbose >= 6 for debugging timer
-mol.verbose = 4
+mol.verbose = 7
 
 mf_df = rks.RKS(mol, xc=args.xc).density_fit(auxbasis=args.auxbasis)
-mf_df.verbose = 4
+mf_df.verbose = 7
 
 if args.solvent:
     mf_df = mf_df.PCM()
diff --git a/gpu4pyscf/__config__.py b/gpu4pyscf/__config__.py
index 5b740207..93346e36 100644
--- a/gpu4pyscf/__config__.py
+++ b/gpu4pyscf/__config__.py
@@ -5,7 +5,7 @@
 # such as A100-80G
 if props['totalGlobalMem'] >= 64 * GB:
     min_ao_blksize = 256
-    min_grid_blksize = 256*256
+    min_grid_blksize = 128*128
     ao_aligned = 32
     grid_aligned = 128
     mem_fraction = 0.9
diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py
index ff3c6877..e6cd9d21 100644
--- a/gpu4pyscf/df/df.py
+++ b/gpu4pyscf/df/df.py
@@ -57,18 +57,6 @@ def build(self, direct_scf_tol=1e-14, omega=None):
         auxmol = self.auxmol
         self.nao = mol.nao
 
-        # cache indices for better performance
-        nao = mol.nao
-        tril_row, tril_col = cupy.tril_indices(nao)
-        tril_row = cupy.asarray(tril_row)
-        tril_col = cupy.asarray(tril_col)
-
-        self.tril_row = tril_row
-        self.tril_col = tril_col
-
-        idx = np.arange(nao)
-        self.diag_idx = cupy.asarray(idx*(idx+1)//2+idx)
-
         log = logger.new_logger(mol, mol.verbose)
         t0 = log.init_timer()
         if auxmol is None:
@@ -234,20 +222,20 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False):
         nj = j1 - j0
         if sr_only:
             # TODO: in-place implementation or short-range kernel
-            ints_slices = cupy.zeros([naoaux, nj, ni], order='C')
+            ints_slices = cupy.empty([naoaux, nj, ni], order='C')
             for cp_kl_id, _ in enumerate(intopt.aux_log_qs):
                 k0 = intopt.sph_aux_loc[cp_kl_id]
                 k1 = intopt.sph_aux_loc[cp_kl_id+1]
                 int3c2e.get_int3c2e_slice(intopt, cp_ij_id, cp_kl_id, out=ints_slices[k0:k1])
             if omega is not None:
-                ints_slices_lr = cupy.zeros([naoaux, nj, ni], order='C')
+                ints_slices_lr = cupy.empty([naoaux, nj, ni], order='C')
                 for cp_kl_id, _ in enumerate(intopt.aux_log_qs):
                     k0 = intopt.sph_aux_loc[cp_kl_id]
                     k1 = intopt.sph_aux_loc[cp_kl_id+1]
                     int3c2e.get_int3c2e_slice(intopt, cp_ij_id, cp_kl_id, out=ints_slices[k0:k1], omega=omega)
                 ints_slices -= ints_slices_lr
         else:
-            ints_slices = cupy.zeros([naoaux, nj, ni], order='C')
+            ints_slices = cupy.empty([naoaux, nj, ni], order='C')
             for cp_kl_id, _ in enumerate(intopt.aux_log_qs):
                 k0 = intopt.sph_aux_loc[cp_kl_id]
                 k1 = intopt.sph_aux_loc[cp_kl_id+1]
@@ -261,11 +249,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False):
 
         row = intopt.ao_pairs_row[cp_ij_id] - i0
         col = intopt.ao_pairs_col[cp_ij_id] - j0
-        if cpi == cpj:
-            #ints_slices = ints_slices + ints_slices.transpose([0,2,1])
-            transpose_sum(ints_slices)
         ints_slices = ints_slices[:,col,row]
-
         if cd_low.tag == 'eig':
             cderi_block = cupy.dot(cd_low.T, ints_slices)
             ints_slices = None
diff --git a/gpu4pyscf/df/df_jk.py b/gpu4pyscf/df/df_jk.py
index ff0cbd7e..fe3c748e 100644
--- a/gpu4pyscf/df/df_jk.py
+++ b/gpu4pyscf/df/df_jk.py
@@ -253,10 +253,11 @@ def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e-
     nao = dms_tag.shape[-1]
     dms = dms_tag.reshape([-1,nao,nao])
     nset = dms.shape[0]
-    t0 = log.init_timer()
+    t1 = t0 = log.init_timer()
     if dfobj._cderi is None:
         log.debug('CDERI not found, build...')
         dfobj.build(direct_scf_tol=direct_scf_tol, omega=omega)
+        t1 = log.timer_debug1('init jk', *t0)
 
     assert nao == dfobj.nao
     vj = None
@@ -264,7 +265,6 @@ def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e-
     ao_idx = dfobj.intopt.sph_ao_idx
     dms = take_last2d(dms, ao_idx)
 
-    t1 = log.timer_debug1('init jk', *t0)
     rows = dfobj.intopt.cderi_row
     cols = dfobj.intopt.cderi_col
     if with_j:
diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py
index ff3d0ef2..a6e793f6 100644
--- a/gpu4pyscf/df/hessian/rhf.py
+++ b/gpu4pyscf/df/hessian/rhf.py
@@ -37,6 +37,7 @@
 import cupy
 import numpy as np
 from pyscf import lib, df
+from gpu4pyscf.grad import rhf as rhf_grad
 from gpu4pyscf.hessian import rhf as rhf_hess
 from gpu4pyscf.lib.cupy_helper import contract, tag_array, release_gpu_stack, print_mem_info, take_last2d
 from gpu4pyscf.df import int3c2e
@@ -283,6 +284,8 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
             hk_aux_aux += .5 * contract('pqxy,pq->pqxy', rho2c_11, int2c_inv)      # (00|1)(1|00)
             rho2c_0 = rho2c_10 = rho2c_11 = rho2c0_10 = rho2c1_10 = rho2c0_11 = int2c_ip_ip = None
             wk_ip2_P__ = int2c_ip1_inv = None
+    t1 = log.timer_debug1('contract int2c_*', *t1)
+
     ao_idx = np.argsort(intopt.sph_ao_idx)
     aux_idx = np.argsort(intopt.sph_aux_idx)
     rev_ao_ao = cupy.ix_(ao_idx, ao_idx)
@@ -372,7 +375,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
             e1[j0,i0] = e1[i0,j0].T
             ej[j0,i0] = ej[i0,j0].T
             ek[j0,i0] = ek[i0,j0].T
-
+    t1 = log.timer_debug1('hcore contribution', *t1)
     log.timer('RHF partial hessian', *time0)
     return e1, ej, ek
 
@@ -398,6 +401,7 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
     else:
         return chkfile
     '''
+
 def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
             verbose=None, with_k=True, omega=None):
     log = logger.new_logger(hessobj, verbose)
@@ -521,8 +525,9 @@ def _ao2mo(mat):
         vk1_int3c = vk1_int3c_ip1 + vk1_int3c_ip2
         vk1_int3c_ip1 = vk1_int3c_ip2 = None
 
+    grad_hcore = rhf_grad.get_grad_hcore(hessobj.base.nuc_grad_method())
     cupy.get_default_memory_pool().free_all_blocks()
-    hcore_deriv = hessobj.base.nuc_grad_method().hcore_generator(mol)
+    #hcore_deriv = hessobj.base.nuc_grad_method().hcore_generator(mol)
     vk1 = None
     for i0, ia in enumerate(atmlst):
         shl0, shl1, p0, p1 = aoslices[ia]
@@ -535,8 +540,9 @@ def _ao2mo(mat):
             vk1_ao[:,p0:p1,:] -= vk1_buf[:,p0:p1,:]
             vk1_ao[:,:,p0:p1] -= vk1_buf[:,p0:p1,:].transpose(0,2,1)
 
-        h1 = hcore_deriv(ia)
-        h1 = _ao2mo(cupy.asarray(h1, order='C'))
+        h1 =  grad_hcore[:,i0]
+        #h1 = hcore_deriv(ia)
+        #h1 = _ao2mo(cupy.asarray(h1, order='C'))
         vj1 = vj1_int3c[ia] + _ao2mo(vj1_ao)
         if with_k:
             vk1 = vk1_int3c[ia] + _ao2mo(vk1_ao)
diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py
index 56668354..e45f11dc 100644
--- a/gpu4pyscf/df/int3c2e.py
+++ b/gpu4pyscf/df/int3c2e.py
@@ -306,10 +306,7 @@ def build(self, cutoff=1e-14, group_size=None,
         ncptype = len(log_qs)
 
         self.bpcache = ctypes.POINTER(BasisProdCache)()
-        if diag_block_with_triu:
-            scale_shellpair_diag = 1.
-        else:
-            scale_shellpair_diag = 0.5
+        scale_shellpair_diag = 1.
         libgint.GINTinit_basis_prod(
             ctypes.byref(self.bpcache), ctypes.c_double(scale_shellpair_diag),
             ao_loc.ctypes.data_as(ctypes.c_void_p),
@@ -1194,6 +1191,32 @@ def get_dh1e(mol, dm0):
         dh1e[k0:k1,:3] += cupy.einsum('xkji,ij->kx', int3c_blk, dm0_sorted[i0:i1,j0:j1])
     return 2.0 * cupy.einsum('kx,k->kx', dh1e, -charges)
 
+def get_d2h1e(mol, dm0):
+    natm = mol.natm
+    coords = mol.atom_coords()
+    charges = mol.atom_charges()
+    fakemol = gto.fakemol_for_charges(coords)
+
+    nao = mol.nao
+    d2h1e_diag = cupy.zeros([natm,9])
+    d2h1e_offdiag = cupy.zeros([natm, nao, 9])
+    intopt = VHFOpt(mol, fakemol, 'int2e')
+    intopt.build(1e-14, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE)
+    dm0_sorted = take_last2d(dm0, intopt.sph_ao_idx)
+    for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ipip1'):
+        d2h1e_diag[k0:k1,:9] -= contract('xaji,ij->ax', int3c_blk, dm0_sorted[i0:i1,j0:j1])
+        d2h1e_offdiag[k0:k1,i0:i1,:9] += contract('xaji,ij->aix', int3c_blk, dm0_sorted[i0:i1,j0:j1])
+
+    for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ipvip1'):
+        d2h1e_diag[k0:k1,:9] -= contract('xaji,ij->ax', int3c_blk, dm0_sorted[i0:i1,j0:j1])
+        d2h1e_offdiag[k0:k1,i0:i1,:9] += contract('xaji,ij->aix', int3c_blk, dm0_sorted[i0:i1,j0:j1])
+    aoslices = mol.aoslice_by_atom()
+    ao2atom = get_ao2atom(intopt, aoslices)
+    d2h1e = contract('aix,ib->abx', d2h1e_offdiag, ao2atom)
+    d2h1e[np.diag_indices(natm), :] += d2h1e_diag
+    return 2.0 * cupy.einsum('abx,a->xab', d2h1e, charges)
+    #return 2.0 * cupy.einsum('ijx,i->kx', dh1e, -charges)
+
 def get_int3c2e_slice(intopt, cp_ij_id, cp_aux_id, aosym=None, out=None, omega=None, stream=None):
     '''
     Generate one int3c2e block for given ij, k
@@ -1443,14 +1466,6 @@ def get_pairing(p_offsets, q_offsets, q_cond,
         for q0, q1 in zip(q_offsets[:-1], q_offsets[1:]):
             if aosym and q0 < p0 or not aosym:
                 q_sub = q_cond[p0:p1,q0:q1].ravel()
-                '''
-                idx = q_sub.argsort(axis=None)[::-1]
-                q_sorted = q_sub[idx]
-                mask = q_sorted > cutoff
-                idx = idx[mask]
-                ishs, jshs = np.unravel_index(idx, (p1-p0, q1-q0))
-                print(ishs.shape)
-                '''
                 mask = q_sub > cutoff
                 ishs, jshs = np.indices((p1-p0,q1-q0))
                 ishs = ishs.ravel()[mask]
@@ -1464,13 +1479,6 @@ def get_pairing(p_offsets, q_offsets, q_cond,
                 log_qs.append(log_q)
             elif aosym and p0 == q0 and p1 == q1:
                 q_sub = q_cond[p0:p1,p0:p1].ravel()
-                '''
-                idx = q_sub.argsort(axis=None)[::-1]
-                q_sorted = q_sub[idx]
-                ishs, jshs = np.unravel_index(idx, (p1-p0, p1-p0))
-                mask = q_sorted > cutoff
-                '''
-
                 ishs, jshs = np.indices((p1-p0, p1-p0))
                 ishs = ishs.ravel()
                 jshs = jshs.ravel()
diff --git a/gpu4pyscf/df/tests/test_int3c2e.py b/gpu4pyscf/df/tests/test_int3c2e.py
index 8e02ac3d..6c556061 100644
--- a/gpu4pyscf/df/tests/test_int3c2e.py
+++ b/gpu4pyscf/df/tests/test_int3c2e.py
@@ -116,6 +116,32 @@ def test_int1e_iprinv(self):
             h1ao = mol.intor('int1e_iprinv', comp=3) # <\nabla|1/r|>
             assert np.linalg.norm(int3c[:,:,:,i] - h1ao) < 1e-8
 
+    def test_int1e_ipiprinv(self):
+        from pyscf import gto
+        coords = mol.atom_coords()
+        charges = mol.atom_charges()
+
+        fakemol = gto.fakemol_for_charges(coords)
+        int3c = int3c2e.get_int3c2e_general(mol, fakemol, ip_type='ipip1').get()
+
+        for i,q in enumerate(charges):
+            mol.set_rinv_origin(coords[i])
+            h1ao = mol.intor('int1e_ipiprinv', comp=9) # <\nabla|1/r|>
+            assert np.linalg.norm(int3c[:,:,:,i] - h1ao) < 1e-8
+
+    def test_int1e_iprinvip(self):
+        from pyscf import gto
+        coords = mol.atom_coords()
+        charges = mol.atom_charges()
+
+        fakemol = gto.fakemol_for_charges(coords)
+        int3c = int3c2e.get_int3c2e_general(mol, fakemol, ip_type='ipvip1').get()
+
+        for i,q in enumerate(charges):
+            mol.set_rinv_origin(coords[i])
+            h1ao = mol.intor('int1e_iprinvip', comp=9) # <\nabla|1/r|>
+            assert np.linalg.norm(int3c[:,:,:,i] - h1ao) < 1e-8
+
 if __name__ == "__main__":
     print("Full Tests for int3c")
     unittest.main()
diff --git a/gpu4pyscf/dft/gen_grid.py b/gpu4pyscf/dft/gen_grid.py
index 5ced5d94..5528e1e5 100644
--- a/gpu4pyscf/dft/gen_grid.py
+++ b/gpu4pyscf/dft/gen_grid.py
@@ -279,14 +279,16 @@ def get_partition(mol, atom_grids_tab,
         grid_coord and grid_weight arrays.  grid_coord array has shape (N,3);
         weight 1D array has N elements.
     '''
+    atm_coords = numpy.asarray(mol.atom_coords() , order='C')
+    atm_coords = cupy.asarray(atm_coords)
+    '''
     if callable(radii_adjust) and atomic_radii is not None:
         f_radii_adjust = radii_adjust(mol, atomic_radii)
     else:
         f_radii_adjust = None
-    atm_coords = numpy.asarray(mol.atom_coords() , order='C')
     atm_dist = gto.inter_distance(mol)
-    atm_coords = cupy.asarray(atm_coords)
     atm_dist = cupy.asarray(atm_dist)
+
     if (becke_scheme is original_becke and
         (radii_adjust is radi.treutler_atomic_radii_adjust or
          radii_adjust is radi.becke_atomic_radii_adjust or
@@ -324,7 +326,7 @@ def gen_grid_partition(coords):
                     pbecke[i] *= .5 * (1-g)
                     pbecke[j] *= .5 * (1+g)
             return pbecke
-
+    '''
     coords_all = []
     weights_all = []
     # support atomic_radii_adjust = None
diff --git a/gpu4pyscf/dft/numint.py b/gpu4pyscf/dft/numint.py
index 0e46c611..0ca4cd75 100644
--- a/gpu4pyscf/dft/numint.py
+++ b/gpu4pyscf/dft/numint.py
@@ -31,7 +31,7 @@
 from gpu4pyscf.lib import logger
 
 LMAX_ON_GPU = 6
-BAS_ALIGNED = 4
+BAS_ALIGNED = 1
 GRID_BLKSIZE = 32
 MIN_BLK_SIZE = getattr(__config__, 'min_grid_blksize', 64*64)
 ALIGNED = getattr(__config__, 'grid_aligned', 16*16)
@@ -448,7 +448,9 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
     nao, nao0 = coeff.shape
     dms = cupy.asarray(dms)
     dm_shape = dms.shape
-    dms = [coeff @ dm @ coeff.T for dm in dms.reshape(-1,nao0,nao0)]
+    #dms = [coeff @ dm @ coeff.T for dm in dms.reshape(-1,nao0,nao0)]
+    dms = dms.reshape(-1,nao0,nao0)
+    dms = take_last2d(dms, opt.ao_idx)
     nset = len(dms)
 
     if mo_coeff is not None:
@@ -482,19 +484,20 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
         rho_tot = cupy.empty([nset,6,ngrids])
 
     p0 = p1 = 0
+    t1 = t0 = log.init_timer()
     for ao_mask, idx, weight, _ in ni.block_loop(mol, grids, nao, ao_deriv):
         p1 = p0 + weight.size
         for i in range(nset):
-            t0 = log.init_timer()
             if mo_coeff is None:
                 rho_tot[i,:,p0:p1] = eval_rho(mol, ao_mask, dms[i][np.ix_(idx,idx)], xctype=xctype, hermi=1)
             else:
                 mo_coeff_mask = mo_coeff[idx,:]
                 rho_tot[i,:,p0:p1] = eval_rho2(mol, ao_mask, mo_coeff_mask, mo_occ, None, xctype)
-            t1 = log.timer_debug1('eval rho', *t0)
         p0 = p1
+        t1 = log.timer_debug2('eval rho', *t1)
+    t0 = log.timer_debug1('eval rho', *t0)
 
-    vxc_tot = []
+    wv = []
     for i in range(nset):
         if xctype == 'LDA':
             exc, vxc = ni.eval_xc_eff(xc_code, rho_tot[i][0], deriv=1, xctype=xctype)[:2]
@@ -505,69 +508,37 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
         den = rho_tot[i][0] * grids.weights
         nelec[i] = den.sum()
         excsum[i] = cupy.sum(den * exc[:,0])
-        vxc_tot.append(vxc)
-        t1 = log.timer_debug1('eval vxc', *t1)
+        wv.append(vxc * grids.weights)
+        if xctype == 'GGA':
+            wv[i][0] *= .5
+        if xctype == 'MGGA':
+            wv[i][[0,4]] *= .5
+    t0 = log.timer_debug1('eval vxc', *t0)
 
+    t1 = t0
     p0 = p1 = 0
     for ao_mask, idx, weight, _ in ni.block_loop(mol, grids, nao, ao_deriv):
         p1 = p0 + weight.size
         for i in range(nset):
-            vxc = vxc_tot[i][:,p0:p1]
             if xctype == 'LDA':
-                #den = rho * weight
-                wv = weight * vxc[0]
-                '''
-                if USE_SPARSITY == 0:
-                    vmat[i] += ao.dot(_scale_ao(ao, wv).T)
-                elif USE_SPARSITY == 1:
-                    _dot_ao_ao_sparse(ao, ao, wv, nbins, sindex, ao_loc,
-                        pair2shls_full, pairs_locs_full, vmat[i])
-                '''
                 if USE_SPARSITY == 2:
-                    aow = _scale_ao(ao_mask, wv)
-                    # vmat[i][cupy.ix_(mask, mask)] += ao_mask.dot(aow.T)
+                    aow = _scale_ao(ao_mask, wv[i][0,p0:p1])
                     add_sparse(vmat[i], ao_mask.dot(aow.T), idx)
                 else:
                     raise NotImplementedError(f'USE_SPARSITY = {USE_SPARSITY} is not implemented')
             elif xctype == 'GGA':
-                #den = rho[0] * weight
-                wv = vxc * weight
-                wv[0] *= .5
-                '''
-                if USE_SPARSITY == 0:
-                    vmat[i] += ao[0].dot(_scale_ao(ao, wv).T)
-                elif USE_SPARSITY == 1:
-                    aow = _scale_ao(ao, wv)
-                    _dot_ao_ao_sparse(ao[0], aow, None, nbins, sindex, ao_loc,
-                        pair2shls_full, pairs_locs_full, vmat[i])
-                '''
                 if USE_SPARSITY == 2:
-                    aow = _scale_ao(ao_mask, wv)
+                    aow = _scale_ao(ao_mask, wv[i][:,p0:p1])
                     add_sparse(vmat[i], ao_mask[0].dot(aow.T), idx)
                 else:
                     raise NotImplementedError(f'USE_SPARSITY = {USE_SPARSITY} is not implemented')
             elif xctype == 'NLC':
                 raise NotImplementedError('NLC')
             elif xctype == 'MGGA':
-                #den = rho[0] * weight
-                wv = vxc * weight
-                wv[[0, 4]] *= .5  # *.5 for v+v.T
-                '''
-                if USE_SPARSITY == 0:
-                    aow = _scale_ao(ao[:4], wv[:4])
-                    vmat[i] += ao[0].dot(aow.T)
-                    vmat[i] += _tau_dot(ao, ao, wv[4])
-                elif USE_SPARSITY == 1:
-                    _dot_ao_ao_sparse(ao[0], aow, None, nbins, sindex, ao_loc,
-                        pair2shls_full, pairs_locs_full, vmat[i])
-                    _tau_dot_sparse(ao, ao, wv[4], nbins, sindex, ao_loc,
-                        pair2shls_full, pairs_locs_full, vmat[i])
-                '''
                 if USE_SPARSITY == 2:
-                    aow = _scale_ao(ao_mask, wv[:4])
+                    aow = _scale_ao(ao_mask, wv[i][:4,p0:p1])
                     vtmp = ao_mask[0].dot(aow.T)
-                    vtmp+= _tau_dot(ao_mask, ao_mask, wv[4])
-                    #vmat[i][cupy.ix_(mask, mask)] += vtmp
+                    vtmp+= _tau_dot(ao_mask, ao_mask, wv[i][4,p0:p1])
                     add_sparse(vmat[i], vtmp, idx)
                 else:
                     raise NotImplementedError(f'USE_SPARSITY = {USE_SPARSITY} is not implemented')
@@ -575,17 +546,14 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
                 pass
             else:
                 raise NotImplementedError(f'numint.nr_rks for functional {xc_code}')
-            t1 = log.timer_debug1('integration', *t1)
         p0 = p1
-
-    vmat = contract('pi,npq->niq', coeff, vmat)
-    vmat = contract('qj,niq->nij', coeff, vmat)
-    #rev_ao_idx = opt.rev_ao_idx
-    #vmat = take_last2d(vmat, rev_ao_idx)
+        t1 = log.timer_debug2('integration', *t1)
+    t0 = log.timer_debug1('vxc integration', *t0)
+    rev_ao_idx = opt.rev_ao_idx
+    vmat = take_last2d(vmat, rev_ao_idx)
 
     if xctype != 'LDA':
-        vmat = vmat + vmat.transpose([0,2,1])
-        #transpose_sum(vmat)
+        transpose_sum(vmat)
 
     if FREE_CUPY_CACHE:
         dms = None
@@ -719,27 +687,27 @@ def get_rho(ni, mol, dm, grids, max_memory=2000, verbose=None):
     log = logger.new_logger(mol, verbose)
     coeff = cupy.asarray(opt.coeff)
     nao = coeff.shape[0]
-    dm = coeff @ cupy.asarray(dm) @ coeff.T
-
     mo_coeff = getattr(dm, 'mo_coeff', None)
     mo_occ = getattr(dm,'mo_occ', None)
 
+    dm = coeff @ cupy.asarray(dm) @ coeff.T
     if mo_coeff is not None:
         mo_coeff = coeff @ mo_coeff
 
     ngrids = grids.weights.size
     rho = cupy.empty(ngrids)
     p0 = p1 = 0
+    t1 = t0 = log.init_timer()
     for ao_mask, idx, weight, _ in ni.block_loop(mol, grids, nao, 0):
         p1 = p0 + weight.size
-        t0 = log.init_timer()
         if mo_coeff is None:
             rho[p0:p1] = eval_rho(mol, ao_mask, dm[np.ix_(idx,idx)], xctype='LDA', hermi=1)
         else:
             mo_coeff_mask = mo_coeff[idx,:]
             rho[p0:p1] = eval_rho2(mol, ao_mask, mo_coeff_mask, mo_occ, None, 'LDA')
         p0 = p1
-        log.timer_debug1('eval rho', *t0)
+        t1 = log.timer_debug2('eval rho slice', *t1)
+    t0 = log.timer_debug1('eval rho', *t0)
 
     if FREE_CUPY_CACHE:
         dm = None
@@ -763,10 +731,9 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi=
     # AO basis -> gdftopt AO basis
     with_mocc = hasattr(dms, 'mo1')
     if with_mocc:
-        mo1 = contract('nio,pi->npo', dms.mo1, coeff) * 2.0**0.5
-        occ_coeff = contract('io,pi->po', dms.occ_coeff, coeff) * 2.0**0.5
-    dms = contract('nij,qj->niq', dms, coeff)
-    dms = contract('pi,niq->npq', coeff, dms)
+        mo1 = dms.mo1[:,opt.ao_idx] * 2.0**0.5
+        occ_coeff = dms.occ_coeff[opt.ao_idx] * 2.0**0.5
+    dms = take_last2d(dms, opt.ao_idx)
     nset = len(dms)
     vmat = cupy.zeros((nset, nao, nao))
 
@@ -776,8 +743,8 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi=
         ao_deriv = 1
     p0 = 0
     p1 = 0
+    t1 = t0 = log.init_timer()
     for ao, mask, weights, coords in ni.block_loop(opt.mol, grids, nao, ao_deriv):
-        t0 = log.init_timer()
         p0, p1 = p1, p1+len(weights)
         # precompute molecular orbitals
         if with_mocc:
@@ -798,7 +765,7 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi=
                 rho_tmp = eval_rho(opt.mol, ao, dms[i][np.ix_(mask,mask)], xctype=xctype, hermi=hermi, with_lapl=False)
                 rho1.append(rho_tmp)
             rho1 = cupy.stack(rho1, axis=0)
-        t0 = log.timer_debug1('rho', *t0)
+        t1 = log.timer_debug2('eval rho', *t1)
 
         # precompute fxc_w
         if xctype == 'LDA':
@@ -826,14 +793,14 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi=
                 vmat_tmp+= _tau_dot(ao, ao, wv[i,4])
                 add_sparse(vmat[i], vmat_tmp, mask)
 
-        t0 = log.timer_debug1('vxc', *t0)
+        t1 = log.timer_debug2('integration', *t1)
         ao = c0 = rho1 = None
+    t0 = log.timer_debug1('vxc', *t0)
 
-    vmat = contract('pi,npq->niq', coeff, vmat)
-    vmat = contract('qj,niq->nij', coeff, vmat)
+    vmat = take_last2d(vmat, opt.rev_ao_idx)
     if xctype != 'LDA':
-        #transpose_sum(vmat)
-        vmat = vmat + vmat.transpose([0,2,1])
+        transpose_sum(vmat)
+
     if FREE_CUPY_CACHE:
         dms = None
         cupy.get_default_memory_pool().free_all_blocks()
@@ -1067,13 +1034,14 @@ def nr_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
         wv = vv_vxc[:,p0:p1] * weight
         wv[0] *= .5
         aow = _scale_ao(ao, wv)
-        #vmat += ao[0].dot(aow.T)
         add_sparse(vmat, ao[0].dot(aow.T), mask)
     t1 = log.timer_debug1('integration', *t1)
 
-    vmat = vmat + vmat.T
-    vmat = contract('pi,pq->iq', coeff, vmat)
-    vmat = contract('qj,iq->ij', coeff, vmat)
+    transpose_sum(vmat)
+    vmat = take_last2d(vmat, opt.rev_ao_idx)
+    #vmat = vmat + vmat.T
+    #vmat = contract('pi,pq->iq', coeff, vmat)
+    #vmat = contract('qj,iq->ij', coeff, vmat)
     log.timer_debug1('eval vv10', *t0)
     return nelec, excsum, vmat
 
@@ -1101,28 +1069,31 @@ def cache_xc_kernel(ni, mol, grids, xc_code, mo_coeff, mo_occ, spin=0,
     if spin == 0:
         mo_coeff = coeff @ mo_coeff
         rho = []
-        t0 = log.init_timer()
+        t1 = t0 = log.init_timer()
         for ao_mask, idx, weight, _ in ni.block_loop(mol, grids, nao, ao_deriv):
             mo_coeff_mask = mo_coeff[idx,:]
             rho_slice = eval_rho2(mol, ao_mask, mo_coeff_mask, mo_occ, None, xctype)
             rho.append(rho_slice)
-            t0 = log.timer_debug1('eval rho in fxc', *t0)
+            t1 = log.timer_debug2('eval rho slice', *t1)
         rho = cupy.hstack(rho)
+        t0 = log.timer_debug1('eval rho in fxc', *t0)
     else:
         mo_coeff = contract('ip,npj->nij', coeff, cupy.asarray(mo_coeff))
         rhoa = []
         rhob = []
-        t0 = log.init_timer()
+        t1 = t0 = log.init_timer()
         for ao_mask, idx, weight, _ in ni.block_loop(mol, grids, nao, ao_deriv):
             mo_coeff_mask = mo_coeff[:,idx,:]
             rhoa_slice = eval_rho2(mol, ao_mask, mo_coeff_mask[0], mo_occ[0], None, xctype)
             rhob_slice = eval_rho2(mol, ao_mask, mo_coeff_mask[1], mo_occ[1], None, xctype)
             rhoa.append(rhoa_slice)
             rhob.append(rhob_slice)
-            t0 = log.timer_debug1('eval rho in fxc', *t0)
+            t1 = log.timer_debug2('eval rho in fxc', *t1)
         #rho = (cupy.hstack(rhoa), cupy.hstack(rhob))
         rho = cupy.stack([cupy.hstack(rhoa), cupy.hstack(rhob)], axis=0)
+        t0 = log.timer_debug1('eval rho in fxc', *t0)
     vxc, fxc = ni.eval_xc_eff(xc_code, rho, deriv=2, xctype=xctype)[1:3]
+    t0 = log.timer_debug1('eval fxc', *t0)
     return rho, vxc, fxc
 
 @cupy.fuse()
@@ -1267,18 +1238,18 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000,
     mol = opt.mol
     with opt.gdft_envs_cache():
         block_id = 0
+        t1 = log.init_timer()
         for ip0, ip1 in lib.prange(0, ngrids, blksize):
             coords = grids.coords[ip0:ip1]
             weight = grids.weights[ip0:ip1]
 
             # cache ao indices
-            if (deriv, block_id, blksize, ngrids) not in ni.non0ao_idx:
+            if (block_id, blksize, ngrids) not in ni.non0ao_idx:
                 stream = cupy.cuda.get_current_stream()
                 cutoff = AO_THRESHOLD
                 ng = ip1 - ip0
                 ao_loc = mol.ao_loc_nr()
                 nbas = mol.nbas
-                t0 = log.init_timer()
                 non0shl_idx = cupy.zeros(len(ao_loc)-1, dtype=np.int32)
                 libgdft.GDFTscreen_index(
                     ctypes.cast(stream.ptr, ctypes.c_void_p),
@@ -1317,24 +1288,24 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000,
                 idx = cupy.hstack([idx, zero_idx[:pad]])
                 pad = min(pad, len(zero_idx))
                 non0shl_idx = cupy.asarray(np.where(non0shl_idx)[0], dtype=np.int32)
-                ni.non0ao_idx[deriv, block_id, blksize, ngrids] = (pad, idx, non0shl_idx, ctr_offsets_slice, ao_loc_slice)
-                log.timer_debug1('init ao sparsity', *t0)
+                ni.non0ao_idx[block_id, blksize, ngrids] = (pad, idx, non0shl_idx, ctr_offsets_slice, ao_loc_slice)
+                t1 = log.timer_debug2('init ao sparsity', *t1)
             else:
-                pad, idx, non0shl_idx, ctr_offsets_slice, ao_loc_slice = ni.non0ao_idx[deriv, block_id, blksize, ngrids]
-            t0 = log.init_timer()
+                pad, idx, non0shl_idx, ctr_offsets_slice, ao_loc_slice = ni.non0ao_idx[block_id, blksize, ngrids]
+
             ao_mask = eval_ao(
                 ni, mol, coords, deriv,
                 nao_slice=len(idx),
                 shls_slice=non0shl_idx,
                 ao_loc_slice=ao_loc_slice,
                 ctr_offsets_slice=ctr_offsets_slice)
+            t1 = log.timer_debug2('evaluate ao slice', *t1)
             if pad > 0:
                 if deriv == 0:
                     ao_mask[-pad:,:] = 0.0
                 else:
                     ao_mask[:,-pad:,:] = 0.0
             block_id += 1
-            log.timer_debug1('evaluate ao slice', *t0)
             yield ao_mask, idx, weight, coords
 
 class NumInt(numint.NumInt):
@@ -1665,6 +1636,7 @@ def build(self, mol=None):
         pmol._decontracted = True
         self.mol = pmol
         inv_idx = np.argsort(ao_idx, kind='stable').astype(np.int32)
+        self.ao_idx = cupy.asarray(ao_idx, dtype=np.int32)
         self.rev_ao_idx = cupy.asarray(inv_idx, dtype=np.int32)
         self.coeff = coeff[ao_idx]
         self.l_ctr_offsets = np.append(0, np.cumsum(l_ctr_counts)).astype(np.int32)
diff --git a/gpu4pyscf/dft/rks.py b/gpu4pyscf/dft/rks.py
index 7b9dd588..70b15723 100644
--- a/gpu4pyscf/dft/rks.py
+++ b/gpu4pyscf/dft/rks.py
@@ -57,6 +57,7 @@ def prune_small_rho_grids_(ks, mol, dm, grids):
                 grids.coords = cupy.vstack(
                         [grids.coords, pad])
                 grids.weights = cupy.hstack([grids.weights, cupy.zeros(padding)])
+
         # make_mask has to be executed on cpu for now.
         #grids.non0tab = grids.make_mask(mol, grids.coords)
         #grids.screen_index = grids.non0tab
diff --git a/gpu4pyscf/grad/rhf.py b/gpu4pyscf/grad/rhf.py
index c96820b6..9f1f9efc 100644
--- a/gpu4pyscf/grad/rhf.py
+++ b/gpu4pyscf/grad/rhf.py
@@ -22,7 +22,7 @@
 from pyscf.grad import rhf
 from gpu4pyscf.lib.cupy_helper import load_library
 from gpu4pyscf.scf.hf import _VHFOpt
-from gpu4pyscf.lib.cupy_helper import tag_array, contract
+from gpu4pyscf.lib.cupy_helper import tag_array, contract, take_last2d
 from gpu4pyscf.df import int3c2e      #TODO: move int3c2e to out of df
 from gpu4pyscf.lib import logger
 
@@ -552,6 +552,50 @@ def calculate_h1e(h1_gpu, s1_gpu):
     de -= cupy.sum(de, axis=0)/len(atmlst)
     return de.get()
 
+def get_grad_hcore(mf_grad):
+    mf = mf_grad.base
+    mol = mf.mol
+    natm = mol.natm
+    nao = mol.nao
+    mo_occ = mf.mo_occ
+    mo_coeff = cupy.asarray(mf.mo_coeff)
+    orbo = mo_coeff[:,mo_occ>0]
+    nocc = orbo.shape[1]
+
+    dh1e = cupy.zeros([3,natm,nao,nocc])
+    coords = mol.atom_coords()
+    charges = cupy.asarray(mol.atom_charges(), dtype=np.float64)
+    fakemol = gto.fakemol_for_charges(coords)
+    intopt = int3c2e.VHFOpt(mol, fakemol, 'int2e')
+    intopt.build(1e-14, diag_block_with_triu=True, aosym=False, group_size=int3c2e.BLKSIZE, group_size_aux=int3c2e.BLKSIZE)
+    orbo_sorted = orbo[intopt.sph_ao_idx]
+    mo_coeff_sorted = mo_coeff[intopt.sph_ao_idx]
+    for i0,i1,j0,j1,k0,k1,int3c_blk in int3c2e.loop_int3c2e_general(intopt, ip_type='ip1'):
+        dh1e[:,k0:k1,j0:j1,:] += contract('xkji,io->xkjo', int3c_blk, orbo_sorted[i0:i1])
+        dh1e[:,k0:k1,i0:i1,:] += contract('xkji,jo->xkio', int3c_blk, orbo_sorted[j0:j1])
+    dh1e = contract('xkjo,k->xkjo', dh1e, -charges)
+    dh1e = contract('xkjo,jp->xkpo', dh1e, mo_coeff_sorted)
+
+    h1 = mf_grad.get_hcore(mol)
+    aoslices = mol.aoslice_by_atom()
+    with_ecp = mol.has_ecp()
+    if with_ecp:
+        ecp_atoms = set(mol._ecpbas[:,gto.ATOM_OF])
+    else:
+        ecp_atoms = ()
+    for atm_id in range(natm):
+        shl0, shl1, p0, p1 = aoslices[atm_id]
+        h1ao = numpy.zeros([3,nao,nao])
+        with mol.with_rinv_at_nucleus(atm_id):
+            if with_ecp and atm_id in ecp_atoms:
+                h1ao += mol.intor('ECPscalar_iprinv', comp=3)
+        h1ao[:,p0:p1] += h1[:,p0:p1]
+        h1ao += h1ao.transpose([0,2,1])
+        h1ao = cupy.asarray(h1ao)
+        h1mo = contract('xij,jo->xio', h1ao, orbo)
+        dh1e[:,atm_id] += contract('xio,ip->xpo', h1mo, mo_coeff)
+    return dh1e#2.0 * cupy.einsum('kx,k->kx', dh1e, -charges)
+
 class Gradients(rhf.Gradients):
     from gpu4pyscf.lib.utils import to_cpu, to_gpu, device
 
diff --git a/gpu4pyscf/grad/rks.py b/gpu4pyscf/grad/rks.py
index 82b0befd..be6732d6 100644
--- a/gpu4pyscf/grad/rks.py
+++ b/gpu4pyscf/grad/rks.py
@@ -27,7 +27,8 @@
 from gpu4pyscf.grad import rhf as rhf_grad
 from gpu4pyscf.dft import numint, xc_deriv, rks
 from gpu4pyscf.dft.numint import _GDFTOpt, AO_THRESHOLD
-from gpu4pyscf.lib.cupy_helper import contract, get_avail_mem, add_sparse, tag_array, load_library
+from gpu4pyscf.lib.cupy_helper import (
+    contract, get_avail_mem, add_sparse, tag_array, load_library, take_last2d)
 from gpu4pyscf.lib import logger
 from pyscf import __config__
 
@@ -121,10 +122,12 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
     mo_coeff = cupy.asarray(dms.mo_coeff)
     coeff = cupy.asarray(opt.coeff)
     nao, nao0 = coeff.shape
-    dms = cupy.asarray(dms)
-    dms = [cupy.einsum('pi,ij,qj->pq', coeff, dm, coeff)
-           for dm in dms.reshape(-1,nao0,nao0)]
-    mo_coeff = coeff @ mo_coeff
+    dms = cupy.asarray(dms).reshape(-1,nao0,nao0)
+    dms = take_last2d(dms, opt.ao_idx)
+    #dms = [cupy.einsum('pi,ij,qj->pq', coeff, dm, coeff)
+    #       for dm in dms.reshape(-1,nao0,nao0)]
+    #mo_coeff = coeff @ mo_coeff
+    mo_coeff = mo_coeff[opt.ao_idx]
 
     nset = len(dms)
     assert nset == 1
@@ -172,7 +175,8 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
                 vtmp = _gga_grad_sum_(ao_mask, wv)
                 vtmp += _tau_grad_dot_(ao_mask, wv[4])
                 add_sparse(vmat[idm], vtmp, idx)
-    vmat = [cupy.einsum('pi,npq,qj->nij', coeff, v, coeff) for v in vmat]
+    #vmat = [cupy.einsum('pi,npq,qj->nij', coeff, v, coeff) for v in vmat]
+    vmat = take_last2d(vmat, opt.rev_ao_idx)
     exc = None
     if nset == 1:
         vmat = vmat[0]
diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py
index b6421424..5efb8f12 100644
--- a/gpu4pyscf/hessian/rhf.py
+++ b/gpu4pyscf/hessian/rhf.py
@@ -57,6 +57,7 @@ def hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     mo_coeff = cupy.asarray(mo_coeff)
     de2 = hessobj.partial_hess_elec(mo_energy, mo_coeff, mo_occ, atmlst,
                                     max_memory, log)
+    t1 = log.timer_debug1('hess elec', *t1)
     if h1ao is None:
         h1ao = hessobj.make_h1(mo_coeff, mo_occ, hessobj.chkfile, atmlst, log)
         t1 = log.timer_debug1('making H1', *t1)
@@ -503,11 +504,13 @@ def hcore_generator(self, mol=None):
         else:
             ecp_atoms = ()
         aoslices = mol.aoslice_by_atom()
-        nbas = mol.nbas
         nao = mol.nao_nr()
         h1aa, h1ab = self.get_hcore(mol)
         h1aa = cupy.asarray(h1aa)
         h1ab = cupy.asarray(h1ab)
+
+        rinv2aa_all = {}
+        rinv2ab_all = {}
         def get_hcore(iatm, jatm):
             ish0, ish1, i0, i1 = aoslices[iatm]
             jsh0, jsh1, j0, j1 = aoslices[jatm]
@@ -515,15 +518,20 @@ def get_hcore(iatm, jatm):
             zj = mol.atom_charge(jatm)
             if iatm == jatm:
                 with mol.with_rinv_at_nucleus(iatm):
-                    rinv2aa = mol.intor('int1e_ipiprinv', comp=9)
-                    rinv2ab = mol.intor('int1e_iprinvip', comp=9)
-                    rinv2aa *= zi
-                    rinv2ab *= zi
-                    if with_ecp and iatm in ecp_atoms:
-                        rinv2aa -= mol.intor('ECPscalar_ipiprinv', comp=9)
-                        rinv2ab -= mol.intor('ECPscalar_iprinvip', comp=9)
-                rinv2aa = cupy.asarray(rinv2aa)
-                rinv2ab = cupy.asarray(rinv2ab)
+                    if iatm not in rinv2aa_all:
+                        rinv2aa = zi * mol.intor('int1e_ipiprinv', comp=9)
+                        if with_ecp and iatm in ecp_atoms:
+                            rinv2aa -= mol.intor('ECPscalar_ipiprinv', comp=9)
+                        rinv2aa_all[iatm] = rinv2aa
+                    rinv2aa = cupy.asarray(rinv2aa_all[iatm])
+
+                    if iatm not in rinv2ab_all:
+                        rinv2ab = zi * mol.intor('int1e_iprinvip', comp=9)
+                        if with_ecp and iatm in ecp_atoms:
+                            rinv2ab -= mol.intor('ECPscalar_iprinvip', comp=9)
+                        rinv2ab_all[iatm] = rinv2ab
+                    rinv2ab = cupy.asarray(rinv2ab_all[iatm])
+
                 rinv2aa = rinv2aa.reshape(3,3,nao,nao)
                 rinv2ab = rinv2ab.reshape(3,3,nao,nao)
                 hcore = -rinv2aa - rinv2ab
@@ -538,28 +546,42 @@ def get_hcore(iatm, jatm):
                 hcore = cupy.zeros((3,3,nao,nao))
                 hcore[:,:,i0:i1,j0:j1] += h1ab[:,:,i0:i1,j0:j1]
                 with mol.with_rinv_at_nucleus(iatm):
-                    shls_slice = (jsh0, jsh1, 0, nbas)
-                    rinv2aa = mol.intor('int1e_ipiprinv', comp=9, shls_slice=shls_slice)
-                    rinv2ab = mol.intor('int1e_iprinvip', comp=9, shls_slice=shls_slice)
-                    rinv2aa *= zi
-                    rinv2ab *= zi
-                    if with_ecp and iatm in ecp_atoms:
-                        rinv2aa -= mol.intor('ECPscalar_ipiprinv', comp=9, shls_slice=shls_slice)
-                        rinv2ab -= mol.intor('ECPscalar_iprinvip', comp=9, shls_slice=shls_slice)
-                    rinv2aa = cupy.asarray(rinv2aa)
-                    rinv2ab = cupy.asarray(rinv2ab)
+                    #rinv2aa = mol.intor('int1e_ipiprinv', comp=9, shls_slice=shls_slice)
+                    if iatm not in rinv2aa_all:
+                        rinv2aa = zi * mol.intor('int1e_ipiprinv', comp=9)
+                        if with_ecp and iatm in ecp_atoms:
+                            rinv2aa -= mol.intor('ECPscalar_ipiprinv', comp=9)
+                        rinv2aa_all[iatm] = rinv2aa
+                    rinv2aa = cupy.asarray(rinv2aa_all[iatm][:,j0:j1])
+
+                    #rinv2ab = mol.intor('int1e_iprinvip', comp=9, shls_slice=shls_slice)
+                    if iatm not in rinv2ab_all:
+                        rinv2ab = zi * mol.intor('int1e_iprinvip', comp=9)
+                        if with_ecp and iatm in ecp_atoms:
+                            rinv2ab -= mol.intor('ECPscalar_iprinvip', comp=9)
+                        rinv2ab_all[iatm] = rinv2ab
+                    rinv2ab = cupy.asarray(rinv2ab_all[iatm][:,j0:j1])
+
                     hcore[:,:,j0:j1] += rinv2aa.reshape(3,3,j1-j0,nao)
                     hcore[:,:,j0:j1] += rinv2ab.reshape(3,3,j1-j0,nao).transpose(1,0,2,3)
 
                 with mol.with_rinv_at_nucleus(jatm):
-                    shls_slice = (ish0, ish1, 0, nbas)
-                    rinv2aa = mol.intor('int1e_ipiprinv', comp=9, shls_slice=shls_slice)
-                    rinv2ab = mol.intor('int1e_iprinvip', comp=9, shls_slice=shls_slice)
-                    rinv2aa *= zj
-                    rinv2ab *= zj
-                    if with_ecp and jatm in ecp_atoms:
-                        rinv2aa -= mol.intor('ECPscalar_ipiprinv', comp=9, shls_slice=shls_slice)
-                        rinv2ab -= mol.intor('ECPscalar_iprinvip', comp=9, shls_slice=shls_slice)
+                    # rinv2aa = zj * mol.intor('int1e_ipiprinv', comp=9, shls_slice=shls_slice)
+                    if jatm not in rinv2aa_all:
+                        rinv2aa = zj * mol.intor('int1e_ipiprinv', comp=9)
+                        if with_ecp and jatm in ecp_atoms:
+                            rinv2aa -= mol.intor('ECPscalar_ipiprinv', comp=9)
+                        rinv2aa_all[jatm] = rinv2aa
+                    rinv2aa = cupy.asarray(rinv2aa_all[jatm][:,i0:i1])
+
+                    # rinv2ab = mol.intor('int1e_iprinvip', comp=9, shls_slice=shls_slice)
+                    if jatm not in rinv2ab_all:
+                        rinv2ab = zj * mol.intor('int1e_iprinvip', comp=9)
+                        if with_ecp and jatm in ecp_atoms:
+                            rinv2ab -= mol.intor('ECPscalar_iprinvip', comp=9)
+                        rinv2ab_all[jatm] = rinv2ab
+                    rinv2ab = cupy.asarray(rinv2ab_all[jatm][:,i0:i1])
+
                     rinv2aa = cupy.asarray(rinv2aa)
                     rinv2ab = cupy.asarray(rinv2ab)
                     hcore[:,:,i0:i1] += rinv2aa.reshape(3,3,i1-i0,nao)
diff --git a/gpu4pyscf/hessian/rks.py b/gpu4pyscf/hessian/rks.py
index 2082e707..3e36825c 100644
--- a/gpu4pyscf/hessian/rks.py
+++ b/gpu4pyscf/hessian/rks.py
@@ -27,7 +27,7 @@
 from gpu4pyscf.hessian import rhf as rhf_hess
 from gpu4pyscf.grad import rks as rks_grad
 from gpu4pyscf.dft import numint
-from gpu4pyscf.lib.cupy_helper import contract, add_sparse
+from gpu4pyscf.lib.cupy_helper import contract, add_sparse, take_last2d
 from gpu4pyscf.lib import logger
 
 # import pyscf.grad.rks to activate nuc_grad_method method
@@ -393,15 +393,15 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
     ipip = cupy.zeros((3,3,nao,nao))
     if xctype == 'LDA':
         ao_deriv = 1
+        t1 = log.init_timer()
         for ao_mask, mask, weight, coords \
                 in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory):
-            t0 = log.init_timer()
             nao_non0 = len(mask)
             ao = contract('nip,ij->njp', ao_mask, coeff[mask])
             rho = numint.eval_rho2(opt.mol, ao[0], mo_coeff, mo_occ, mask, xctype)
-            t0 = log.timer_debug1('eval rho', *t0)
+            t1 = log.timer_debug2('eval rho', *t1)
             vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3]
-            t0 = log.timer_debug1('eval vxc', *t0)
+            t1 = log.timer_debug2('eval vxc', *t1)
             wv = weight * vxc[0]
             aow = [numint._scale_ao(ao[i], wv) for i in range(1, 4)]
             _d1d2_dot_(ipip, mol, aow, ao[1:4], mask, ao_loc, False)
@@ -417,27 +417,28 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
                 vmat_tmp = cupy.zeros([3,3,nao_non0,nao_non0])
                 aow = [numint._scale_ao(ao_mask[0], wv[i]) for i in range(3)]
                 _d1d2_dot_(vmat_tmp, mol, ao_mask[1:4], aow, mask, ao_loc, False)
-                vmat_tmp = contract('pi,xypq->xyiq', coeff[mask], vmat_tmp)
-                vmat_tmp = contract('qj,xyiq->xyij', coeff[mask], vmat_tmp)
-                vmat[ia] += vmat_tmp
+                #vmat_tmp = contract('pi,xypq->xyiq', coeff[mask], vmat_tmp)
+                #vmat_tmp = contract('qj,xyiq->xyij', coeff[mask], vmat_tmp)
+                #vmat[ia] += vmat_tmp
+                add_sparse(vmat[ia], vmat_tmp, mask)
             ao_dm0 = aow = None
-            t0 = log.timer_debug1('integration', *t0)
+            t1 = log.timer_debug2('integration', *t1)
         for ia in range(mol.natm):
+            vmat[ia] = take_last2d(vmat[ia], opt.rev_ao_idx)
             p0, p1 = aoslices[ia][2:]
             vmat[ia,:,:,:,p0:p1] += ipip[:,:,:,p0:p1]
 
     elif xctype == 'GGA':
         ao_deriv = 2
-        comp = (ao_deriv+1)*(ao_deriv+2)*(ao_deriv+3)//6
+        t1 = log.init_timer()
         for ao_mask, mask, weight, coords \
-                in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory, extra=5*comp*nao):
-            t0 = log.init_timer()
+                in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory):
             nao_non0 = len(mask)
             ao = contract('nip,ij->njp', ao_mask, coeff[mask])
             rho = numint.eval_rho2(opt.mol, ao[:4], mo_coeff, mo_occ, mask, xctype)
-            t0 = log.timer_debug1('eval rho', *t0)
+            t1 = log.timer_debug2('eval rho', *t1)
             vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3]
-            t0 = log.timer_debug1('eval vxc', *t0)
+            t1 = log.timer_debug2('eval vxc', *t1)
             wv = weight * vxc
             wv[0] *= .5
             aow = rks_grad._make_dR_dao_w(ao, wv)
@@ -462,12 +463,14 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
                     rks_grad._d1_dot_(aow, ao_mask[0].T, out=vmat_tmp[i])
                 aow = [numint._scale_ao(ao_mask[:4], wv[i,:4]) for i in range(3)]
                 _d1d2_dot_(vmat_tmp, mol, ao_mask[1:4], aow, mask, ao_loc, False)
-                vmat_tmp = contract('pi,xypq->xyiq', coeff[mask], vmat_tmp)
-                vmat_tmp = contract('qj,xyiq->xyij', coeff[mask], vmat_tmp)
-                vmat[ia] += vmat_tmp
+                #vmat_tmp = contract('pi,xypq->xyiq', coeff[mask], vmat_tmp)
+                #vmat_tmp = contract('qj,xyiq->xyij', coeff[mask], vmat_tmp)
+                #vmat[ia] += vmat_tmp
+                add_sparse(vmat[ia], vmat_tmp, mask)
             ao_dm0 = aow = None
-            t0 = log.timer_debug1('integration', *t0)
+            t1 = log.timer_debug2('integration', *t1)
         for ia in range(mol.natm):
+            vmat[ia] = take_last2d(vmat[ia], opt.rev_ao_idx)
             p0, p1 = aoslices[ia][2:]
             vmat[ia,:,:,:,p0:p1] += ipip[:,:,:,p0:p1]
             vmat[ia,:,:,:,p0:p1] += ipip[:,:,p0:p1].transpose(1,0,3,2)
@@ -477,15 +480,15 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
         YX, YY, YZ = 5, 7, 8
         ZX, ZY, ZZ = 6, 8, 9
         ao_deriv = 2
+        t1 = log.init_timer()
         for ao_mask, mask, weight, coords \
                 in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory):
-            t0 = log.init_timer()
             nao_non0 = len(mask)
             ao = contract('nip,ij->njp', ao_mask, coeff[mask])
             rho = numint.eval_rho2(opt.mol, ao[:10], mo_coeff, mo_occ, mask, xctype)
-            t0 = log.timer_debug1('eval rho', *t0)
+            t1 = log.timer_debug2('eval rho', *t1)
             vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3]
-            t0 = log.timer_debug1('eval vxc', *t0)
+            t1 = log.timer_debug2('eval vxc', *t1)
             wv = weight * vxc
             wv[0] *= .5
             wv[4] *= .25
@@ -522,11 +525,13 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
                 _d1d2_dot_(vmat_tmp, mol, [ao_mask[YX], ao_mask[YY], ao_mask[YZ]], aow, mask, ao_loc, False)
                 aow = [numint._scale_ao(ao_mask[3], wv[i,4]) for i in range(3)]
                 _d1d2_dot_(vmat_tmp, mol, [ao_mask[ZX], ao_mask[ZY], ao_mask[ZZ]], aow, mask, ao_loc, False)
-                vmat_tmp = contract('pi,xypq->xyiq', coeff[mask], vmat_tmp)
-                vmat_tmp = contract('qj,xyiq->xyij', coeff[mask], vmat_tmp)
-                vmat[ia] += vmat_tmp
-            t0 = log.timer_debug1('integration', *t0)
+                #vmat_tmp = contract('pi,xypq->xyiq', coeff[mask], vmat_tmp)
+                #vmat_tmp = contract('qj,xyiq->xyij', coeff[mask], vmat_tmp)
+                #vmat[ia] += vmat_tmp
+                add_sparse(vmat[ia], vmat_tmp, mask)
+            t1 = log.timer_debug2('integration', *t1)
         for ia in range(mol.natm):
+            vmat[ia] = take_last2d(vmat[ia], opt.rev_ao_idx)
             p0, p1 = aoslices[ia][2:]
             vmat[ia,:,:,:,p0:p1] += ipip[:,:,:,p0:p1]
             vmat[ia,:,:,:,p0:p1] += ipip[:,:,p0:p1].transpose(1,0,3,2)
@@ -570,14 +575,14 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory):
     max_memory = max(2000, max_memory-vmat.size*8/1e6)
     if xctype == 'LDA':
         ao_deriv = 1
+        t1 = t0 = log.init_timer()
         for ao, mask, weight, coords \
                 in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory):
-            t0 = log.init_timer()
             ao = contract('nip,ij->njp', ao, coeff[mask])
             rho = numint.eval_rho2(opt.mol, ao[0], mo_coeff, mo_occ, mask, xctype)
-            t0 = log.timer_debug1('eval rho', *t0)
+            t1 = log.timer_debug2('eval rho', *t1)
             vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3]
-            t0 = log.timer_debug1('eval vxc', *t0)
+            t1 = log.timer_debug2('eval vxc', *t1)
             wv = weight * vxc[0]
             aow = numint._scale_ao(ao[0], wv)
             v_ip += rks_grad._d1_dot_(ao[1:4], aow.T)
@@ -592,17 +597,17 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory):
                 aow = [numint._scale_ao(ao[0], wv[i]) for i in range(3)]
                 vmat[ia] += rks_grad._d1_dot_(aow, ao[0].T)
             ao_dm0 = aow = None
-            t0 = log.timer_debug1('integration', *t0)
+            t1 = log.timer_debug2('integration', *t1)
     elif xctype == 'GGA':
         ao_deriv = 2
+        t1 = t0 = log.init_timer()
         for ao, mask, weight, coords \
                 in ni.block_loop(mol, grids, nao, ao_deriv, max_memory):
-            t0 = log.init_timer()
             ao = contract('nip,ij->njp', ao, coeff[mask])
             rho = numint.eval_rho2(mol, ao[:4], mo_coeff, mo_occ, mask, xctype)
-            t0 = log.timer_debug1('eval rho', *t0)
+            t1 = log.timer_debug2('eval rho', *t1)
             vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3]
-            t0 = log.timer_debug1('eval vxc', *t0)
+            t1 = log.timer_debug2('eval vxc', *t1)
             wv = weight * vxc
             wv[0] *= .5
             v_ip += rks_grad._gga_grad_sum_(ao, wv)
@@ -616,20 +621,20 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory):
                 wv[:,0] *= .5
                 aow = [numint._scale_ao(ao[:4], wv[i,:4]) for i in range(3)]
                 vmat[ia] += rks_grad._d1_dot_(aow, ao[0].T)
-            t0 = log.timer_debug1('integration', *t0)
+            t1 = log.timer_debug2('integration', *t1)
             ao_dm0 = aow = None
     elif xctype == 'MGGA':
         if grids.level < 5:
             log.warn('MGGA Hessian is sensitive to dft grids.')
         ao_deriv = 2
+        t1 = t0 = log.init_timer()
         for ao, mask, weight, coords \
                 in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory):
-            t0 = log.init_timer()
             ao = contract('nip,ij->njp', ao, coeff[mask])
             rho = numint.eval_rho2(opt.mol, ao[:10], mo_coeff, mo_occ, mask, xctype)
-            t0 = log.timer_debug1('eval rho', *t0)
+            t1 = log.timer_debug2('eval rho', *t1)
             vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3]
-            t0 = log.timer_debug1('eval vxc', *t0)
+            t1 = log.timer_debug2('eval vxc', *t0)
             wv = weight * vxc
             wv[0] *= .5
             wv[4] *= .5  # for the factor 1/2 in tau
@@ -649,7 +654,7 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory):
                     aow = [numint._scale_ao(ao[j], wv[i,4]) for i in range(3)]
                     vmat[ia] += rks_grad._d1_dot_(aow, ao[j].T)
             ao_dm0 = aow = None
-            t0 = log.timer_debug1('integration', *t0)
+            t1 = log.timer_debug2('integration', *t1)
     for ia in range(mol.natm):
         p0, p1 = aoslices[ia][2:]
         vmat[ia,:,p0:p1] += v_ip[:,p0:p1]
diff --git a/gpu4pyscf/lib/cupy_helper.py b/gpu4pyscf/lib/cupy_helper.py
index 9088583c..3e9414ef 100644
--- a/gpu4pyscf/lib/cupy_helper.py
+++ b/gpu4pyscf/lib/cupy_helper.py
@@ -165,9 +165,27 @@ def add_sparse(a, b, indices):
         ctypes.c_int(count)
     )
     if err != 0:
-        raise RecursionError('failed in sparse_add2d')
+        raise RuntimeError('failed in sparse_add2d')
     return a
 
+def dist_matrix(coords, out=None):
+    assert coords.flags.c_contiguous
+    n = coords.shape[0]
+    if out is None:
+        out = cupy.empty([n,n])
+
+    stream = cupy.cuda.get_current_stream()
+    err = libcupy_helper.dist_matrix(
+        ctypes.cast(stream.ptr, ctypes.c_void_p),
+        ctypes.cast(out.data.ptr, ctypes.c_void_p),
+        ctypes.cast(coords.data.ptr, ctypes.c_void_p),
+        ctypes.cast(coords.data.ptr, ctypes.c_void_p),
+        ctypes.c_int(n),
+    )
+    if err != 0:
+        raise RuntimeError('failed in calculating distance matrix')
+    return out
+
 def block_c2s_diag(ncart, nsph, angular, counts):
     '''
     constract a cartesian to spherical transformation of n shells
@@ -175,21 +193,18 @@ def block_c2s_diag(ncart, nsph, angular, counts):
 
     nshells = np.sum(counts)
     cart2sph = cupy.zeros([ncart, nsph])
-    rows = [0]
-    cols = [0]
+
+    rows = [np.array([0], dtype='int32')]
+    cols = [np.array([0], dtype='int32')]
     offsets = []
     for l, count in zip(angular, counts):
-        for _ in range(count):
-            r, c = c2s_l[l].shape
-            rows.append(rows[-1] + r)
-            cols.append(cols[-1] + c)
-            offsets.append(c2s_offset[l])
-    rows = np.asarray(rows, dtype='int32')
-    cols = np.asarray(cols, dtype='int32')
-    offsets = np.asarray(offsets, dtype='int32')
+        r, c = c2s_l[l].shape
+        rows.append(rows[-1][-1] + np.arange(1,count+1, dtype='int32') * r)
+        cols.append(cols[-1][-1] + np.arange(1,count+1, dtype='int32') * c)
+        offsets += [c2s_offset[l]] * count
 
-    rows = cupy.asarray(rows, dtype='int32')
-    cols = cupy.asarray(cols, dtype='int32')
+    rows = cupy.hstack(rows)
+    cols = cupy.hstack(cols)
     offsets = cupy.asarray(offsets, dtype='int32')
 
     stream = cupy.cuda.get_current_stream()
@@ -271,8 +286,10 @@ def transpose_sum(a):
     return a + a.transpose(0,2,1)
     '''
     assert a.flags.c_contiguous
-    assert a.ndim == 3
     n = a.shape[-1]
+    if a.ndim == 2:
+        a = a.reshape([-1,n,n])
+    assert a.ndim == 3
     count = a.shape[0]
     stream = cupy.cuda.get_current_stream()
     err = libcupy_helper.transpose_sum(
diff --git a/gpu4pyscf/lib/cupy_helper/CMakeLists.txt b/gpu4pyscf/lib/cupy_helper/CMakeLists.txt
index 3f59109e..445a10f6 100644
--- a/gpu4pyscf/lib/cupy_helper/CMakeLists.txt
+++ b/gpu4pyscf/lib/cupy_helper/CMakeLists.txt
@@ -17,13 +17,14 @@
 
 #set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -arch=sm_80")
 
-add_library(cupy_helper SHARED 
+add_library(cupy_helper SHARED
   transpose.cu
   block_diag.cu
   contract_cderi_k.cu
   take_last2d.cu
   async_d2h_2d.cu
   add_sparse.cu
+  dist_matrix.cu
 )
 
 set_target_properties(cupy_helper PROPERTIES
diff --git a/gpu4pyscf/lib/cupy_helper/dist_matrix.cu b/gpu4pyscf/lib/cupy_helper/dist_matrix.cu
new file mode 100644
index 00000000..77f78de7
--- /dev/null
+++ b/gpu4pyscf/lib/cupy_helper/dist_matrix.cu
@@ -0,0 +1,49 @@
+/* Copyright 2023 The GPU4PySCF Authors. All Rights Reserved.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <cuda_runtime.h>
+#include <stdio.h>
+#define THREADS        32
+
+__global__
+static void _calc_distances(double *dist, const double *x, const double *y, int n)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    int j = blockIdx.y * blockDim.y + threadIdx.y;
+    if (i >= n || j >= n){
+        return;
+    }
+
+    double dx = x[3*i]   - y[3*j];
+    double dy = x[3*i+1] - y[3*j+1];
+    double dz = x[3*i+2] - y[3*j+2];
+    dist[i*n+j] = norm3d(dx, dy, dz);
+}
+
+extern "C" {
+int dist_matrix(cudaStream_t stream, double *dist, const double *x, const double *y, int n)
+{
+    int ntile = (n + THREADS - 1) / THREADS;
+    dim3 threads(THREADS, THREADS);
+    dim3 blocks(ntile, ntile);
+    _calc_distances<<<blocks, threads, 0, stream>>>(dist, x, y, n);
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        return 1;
+    }
+    return 0;
+}
+}
diff --git a/gpu4pyscf/lib/gdft/gen_grids.cu b/gpu4pyscf/lib/gdft/gen_grids.cu
index d71eedf9..67cb869e 100644
--- a/gpu4pyscf/lib/gdft/gen_grids.cu
+++ b/gpu4pyscf/lib/gdft/gen_grids.cu
@@ -67,7 +67,7 @@ int ngrids, int natm)
                 dx = xi - xj_t;
                 dy = yi - yj_t;
                 dz = zi - zj_t;
-                double dij = norm3d(dx, dy, dz);
+                double dij = rnorm3d(dx, dy, dz);
 
                 // distance between atom i and atom j
                 dij_smem[tx] = dij;
@@ -88,7 +88,7 @@ int ngrids, int natm)
 
                 double dij = dij_smem[l];
                 double aij = a_smem[l];
-                double g = (atom_i == atom_j) ? 0.0 : (dig - djg) / dij;
+                double g = (atom_i == atom_j) ? 0.0 : (dig - djg) * dij;
 
                 // atomic radii adjust function
                 double g1 = g*g - 1.0;
diff --git a/gpu4pyscf/lib/gint/g3c2e.cu b/gpu4pyscf/lib/gint/g3c2e.cu
index 2395fbbe..c0f0f425 100644
--- a/gpu4pyscf/lib/gint/g3c2e.cu
+++ b/gpu4pyscf/lib/gint/g3c2e.cu
@@ -5,7 +5,7 @@ void GINTfill_int3c2e_kernel(GINTEnvVars envs, ERITensor eri, BasisProdOffsets o
     int ntasks_kl = offsets.ntasks_kl;
     int task_ij = blockIdx.x * blockDim.x + threadIdx.x;
     int task_kl = blockIdx.y * blockDim.y + threadIdx.y;
-    
+
     if (task_ij >= ntasks_ij || task_kl >= ntasks_kl) {
         return;
     }
@@ -26,7 +26,7 @@ void GINTfill_int3c2e_kernel(GINTEnvVars envs, ERITensor eri, BasisProdOffsets o
     int lsh = bas_pair2ket[bas_kl];
     double uw[NROOTS*2];
     double g[GSIZE_INT3C];
-    
+
     double* __restrict__ a12 = c_bpcache.a12;
     double* __restrict__ x12 = c_bpcache.x12;
     double* __restrict__ y12 = c_bpcache.y12;
@@ -48,9 +48,9 @@ void GINTfill_int3c2e_kernel(GINTEnvVars envs, ERITensor eri, BasisProdOffsets o
         as_ksh = lsh;
         as_lsh = ksh;
     }
+    GINTmemset_int3c2e<NROOTS>(envs, eri, ish, jsh, ksh);
     for (ij = prim_ij; ij < prim_ij+nprim_ij; ++ij) {
         for (kl = prim_kl; kl < prim_kl+nprim_kl; ++kl) {
-            
             double aij = a12[ij];
             double xij = x12[ij];
             double yij = y12[ij];
@@ -65,7 +65,7 @@ void GINTfill_int3c2e_kernel(GINTEnvVars envs, ERITensor eri, BasisProdOffsets o
             double aijkl = aij + akl;
             double a1 = aij * akl;
             double a0 = a1 / aijkl;
-            double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0; 
+            double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0;
             a0 *= theta;
             double x = a0 * (xijxkl * xijxkl + yijykl * yijykl + zijzkl * zijzkl);
             GINTrys_root<NROOTS>(x, uw);
diff --git a/gpu4pyscf/lib/gint/gout3c2e.cu b/gpu4pyscf/lib/gint/gout3c2e.cu
index fb4ff75e..c6fb1eba 100644
--- a/gpu4pyscf/lib/gint/gout3c2e.cu
+++ b/gpu4pyscf/lib/gint/gout3c2e.cu
@@ -235,15 +235,18 @@ static void GINTwrite_int3c2e_ipip_direct(GINTEnvVars envs, ERITensor eri, doubl
                 double eri_zy = 0;
                 double eri_zz = 0;
                 for (int ir = 0; ir < NROOTS; ++ir){
-                    eri_xx += g3[ix + ir] * g0[iy + ir] * g0[iz + ir];
-                    eri_xy += g2[ix + ir] * g1[iy + ir] * g0[iz + ir];
-                    eri_xz += g2[ix + ir] * g0[iy + ir] * g1[iz + ir];
-                    eri_yx += g1[ix + ir] * g2[iy + ir] * g0[iz + ir];
-                    eri_yy += g0[ix + ir] * g3[iy + ir] * g0[iz + ir];
-                    eri_yz += g0[ix + ir] * g2[iy + ir] * g1[iz + ir];
-                    eri_zx += g1[ix + ir] * g0[iy + ir] * g2[iz + ir];
-                    eri_zy += g0[ix + ir] * g1[iy + ir] * g2[iz + ir];
-                    eri_zz += g0[ix + ir] * g0[iy + ir] * g3[iz + ir];
+                    double g0_x = g0[ix + ir];
+                    double g0_y = g0[iy + ir];
+                    double g0_z = g0[iz + ir];
+                    eri_xx += g3[ix + ir] * g0_y        * g0_z       ;
+                    eri_xy += g2[ix + ir] * g1[iy + ir] * g0_z       ;
+                    eri_xz += g2[ix + ir] * g0_y        * g1[iz + ir];
+                    eri_yx += g1[ix + ir] * g2[iy + ir] * g0_z       ;
+                    eri_yy += g0_x        * g3[iy + ir] * g0_z       ;
+                    eri_yz += g0_x        * g2[iy + ir] * g1[iz + ir];
+                    eri_zx += g1[ix + ir] * g0_y        * g2[iz + ir];
+                    eri_zy += g0_x        * g1[iy + ir] * g2[iz + ir];
+                    eri_zz += g0_x        * g0_y        * g3[iz + ir];
                 }
                 off = i+jstride*j;
                 pxx_eri[off] += eri_xx;
@@ -314,13 +317,38 @@ static void GINTwrite_int3c2e_ip_direct(GINTEnvVars envs, ERITensor eri, double*
     }
 }
 
+template <int NROOTS> __device__
+static void GINTmemset_int3c2e(GINTEnvVars envs, ERITensor eri, int ish, int jsh, int ksh)
+{
+    int *ao_loc = c_bpcache.ao_loc;
+    size_t jstride = eri.stride_j;
+    size_t kstride = eri.stride_k;
+    int i0 = ao_loc[ish  ] - eri.ao_offsets_i;
+    int i1 = ao_loc[ish+1] - eri.ao_offsets_i;
+    int j0 = ao_loc[jsh  ] - eri.ao_offsets_j;
+    int j1 = ao_loc[jsh+1] - eri.ao_offsets_j;
+    int k0 = ao_loc[ksh  ] - eri.ao_offsets_k;
+    int k1 = ao_loc[ksh+1] - eri.ao_offsets_k;
+    int i, j, k, n;
+    double* __restrict__ p_eri;
+
+    for (n = 0, k = k0; k < k1; ++k) {
+        p_eri = eri.data + k * kstride;
+
+        for (j = j0; j < j1; ++j) {
+            for (i = i0; i < i1; ++i, ++n) {
+                p_eri[i+jstride*j] = 0;
+            }
+        }
+    }
+}
+
 template <int NROOTS> __device__
 static void GINTwrite_int3c2e_direct(GINTEnvVars envs, ERITensor eri, double* g, int ish, int jsh, int ksh)
 {
     int *ao_loc = c_bpcache.ao_loc;
     size_t jstride = eri.stride_j;
     size_t kstride = eri.stride_k;
-    size_t lstride = eri.stride_l;
     int i0 = ao_loc[ish  ] - eri.ao_offsets_i;
     int i1 = ao_loc[ish+1] - eri.ao_offsets_i;
     int j0 = ao_loc[jsh  ] - eri.ao_offsets_j;
@@ -337,7 +365,7 @@ static void GINTwrite_int3c2e_direct(GINTEnvVars envs, ERITensor eri, double* g,
     int ix, iy, iz, off;
 
     for (n = 0, k = k0; k < k1; ++k) {
-        p_eri = eri.data + 0 * lstride + k * kstride;
+        p_eri = eri.data + k * kstride;
 
         for (j = j0; j < j1; ++j) {
             for (i = i0; i < i1; ++i, ++n) {
diff --git a/gpu4pyscf/lib/gint/nr_fill_ao_int3c2e.cu b/gpu4pyscf/lib/gint/nr_fill_ao_int3c2e.cu
index 0953fc10..bcb5e382 100644
--- a/gpu4pyscf/lib/gint/nr_fill_ao_int3c2e.cu
+++ b/gpu4pyscf/lib/gint/nr_fill_ao_int3c2e.cu
@@ -41,7 +41,7 @@ static int GINTfill_int3c2e_tasks(ERITensor *eri, BasisProdOffsets *offsets, GIN
     int ntasks_kl = offsets->ntasks_kl;
     assert(ntasks_kl < 65536*THREADSY);
     int type_ijkl;
-    
+
     dim3 threads(THREADSX, THREADSY);
     dim3 blocks((ntasks_ij+THREADSX-1)/THREADSX, (ntasks_kl+THREADSY-1)/THREADSY);
     switch (nrys_roots) {
@@ -121,14 +121,14 @@ int GINTfill_int3c2e(cudaStream_t stream, BasisProdCache *bpcache, double *eri,
     ContractionProdType *cp_kl = bpcache->cptype + cp_kl_id;
     GINTEnvVars envs;
     int ng[4] = {0,0,0,0};
-    
+
     GINTinit_EnvVars(&envs, cp_ij, cp_kl, ng);
     envs.omega = omega;
 
     if (envs.nrys_roots > 8) {
         return 2;
     }
-    
+
     // TODO: improve the efficiency by unrolling
     if (envs.nrys_roots > 1) {
         int16_t *idx4c = (int16_t *)malloc(sizeof(int16_t) * envs.nf * 3);
@@ -136,9 +136,9 @@ int GINTfill_int3c2e(cudaStream_t stream, BasisProdCache *bpcache, double *eri,
         checkCudaErrors(cudaMemcpyToSymbol(c_idx4c, idx4c, sizeof(int16_t)*envs.nf*3));
         free(idx4c);
     }
-    
+
     int kl_bin, ij_bin1;
-    
+
     //checkCudaErrors(cudaMemcpyToSymbol(c_envs, &envs, sizeof(GINTEnvVars)));
     // move bpcache to constant memory
     checkCudaErrors(cudaMemcpyToSymbol(c_bpcache, bpcache, sizeof(BasisProdCache)));
@@ -154,7 +154,7 @@ int GINTfill_int3c2e(cudaStream_t stream, BasisProdCache *bpcache, double *eri,
     eritensor.nao = nao;
     eritensor.data = eri;
     BasisProdOffsets offsets;
-    
+
     int *bas_pairs_locs = bpcache->bas_pairs_locs;
     int *primitive_pairs_locs = bpcache->primitive_pairs_locs;
     for (kl_bin = 0; kl_bin < nbins; kl_bin++) {
@@ -182,12 +182,12 @@ int GINTfill_int3c2e(cudaStream_t stream, BasisProdCache *bpcache, double *eri,
 
         int err = -1;
         err = GINTfill_int3c2e_tasks(&eritensor, &offsets, &envs, stream);
-        
+
         if (err != 0) {
             return err;
         }
     }
-    
+
     return 0;
 }
 }
diff --git a/gpu4pyscf/lib/gvhf/g3c2e.cuh b/gpu4pyscf/lib/gvhf/g3c2e.cuh
index c82d037c..9195cddd 100644
--- a/gpu4pyscf/lib/gvhf/g3c2e.cuh
+++ b/gpu4pyscf/lib/gvhf/g3c2e.cuh
@@ -58,7 +58,7 @@ static void GINTkernel_int3c2e_ip1_getjk(JKMatrix jk, double* __restrict__ gout,
                 double sx = gout[3*n];
                 double sy = gout[3*n + 1];
                 double sz = gout[3*n + 2];
-                
+
                 double rhoj_tmp = dm[off_dm] * rhoj[k];
                 double rhok_tmp = rhok[off_rhok];
 
@@ -73,7 +73,7 @@ static void GINTkernel_int3c2e_ip1_getjk(JKMatrix jk, double* __restrict__ gout,
             }
         }
     }
-    
+
     for (i = i0; i < i1; ++i){
         int ii = 3*(i-i0);
         atomicAdd(vj + i + 0*nao, j3[ii + 0]);
@@ -107,22 +107,22 @@ static void GINTkernel_int3c2e_ip2_getjk(JKMatrix jk, double* __restrict__ gout,
     double* __restrict__ dm = jk.dm;
     double j3[GPU_CART_MAX * 3];
     double k3[GPU_CART_MAX * 3];
-    
+
     for (k = 0; k < (k1-k0) * 3; k++){
         j3[k] = 0.0;
         k3[k] = 0.0;
     }
-    
+
     for (n = 0, k = k0; k < k1; ++k) {
         for (j = j0; j < j1; ++j) {
             for (i = i0; i < i1; ++i, ++n) {
                 off_dm = i + nao*j;
                 off_rhok = i + nao*j + k*nao*nao;
-                
+
                 double sx = gout[3 * n];
                 double sy = gout[3 * n + 1];
                 double sz = gout[3 * n + 2];
-                
+
                 double rhoj_tmp = dm[off_dm] * rhoj[k];
                 double rhok_tmp = rhok[off_rhok];
 
@@ -143,7 +143,7 @@ static void GINTkernel_int3c2e_ip2_getjk(JKMatrix jk, double* __restrict__ gout,
         atomicAdd(vj + k + 0*naux, j3[kk + 0]);
         atomicAdd(vj + k + 1*naux, j3[kk + 1]);
         atomicAdd(vj + k + 2*naux, j3[kk + 2]);
-        
+
         atomicAdd(vk + k + 0*naux, k3[kk + 0]);
         atomicAdd(vk + k + 1*naux, k3[kk + 1]);
         atomicAdd(vk + k + 2*naux, k3[kk + 2]);
@@ -169,7 +169,7 @@ static void GINTkernel_int3c2e_getj_pass1(GINTEnvVars envs, JKMatrix jk, double*
     int i, j, k;
     double* __restrict__ rhoj = jk.rhoj;
     double* __restrict__ dm = jk.dm;
-    
+
     int i_l = envs.i_l;
     int j_l = envs.j_l;
     int k_l = envs.k_l;
@@ -243,11 +243,11 @@ static void GINTkernel_int3c2e_getj_pass2(GINTEnvVars envs, JKMatrix jk, double*
             double vj_tmp = 0.0;
             for (k = k0; k < k1; ++k){
                 int kp = k - k0;
-                
+
                 int loc_k = c_l_locs[k_l] + kp;
                 int loc_j = c_l_locs[j_l] + jp;
                 int loc_i = c_l_locs[i_l] + ip;
-                
+
                 int ix = dk * idx[loc_k] + dj * idx[loc_j] + di * idx[loc_i];
                 int iy = dk * idy[loc_k] + dj * idy[loc_j] + di * idy[loc_i] + envs.g_size;
                 int iz = dk * idz[loc_k] + dj * idz[loc_j] + di * idz[loc_i] + envs.g_size * 2;
@@ -306,7 +306,7 @@ static void GINTkernel_int3c2e_ip1_getjk_direct(GINTEnvVars envs, JKMatrix jk, d
                 int jp = j - j0;
                 for (i = i0; i < i1; ++i) {
                     int ip = i - i0;
-                
+
                     int loc_k = c_l_locs[k_l] + kp;
                     int loc_j = c_l_locs[j_l] + jp;
                     int loc_i = c_l_locs[i_l] + ip;
@@ -327,7 +327,7 @@ static void GINTkernel_int3c2e_ip1_getjk_direct(GINTEnvVars envs, JKMatrix jk, d
                         sy += gx * f[iy + ir] * gz;
                         sz += gx * gy * f[iz + ir];
                     }
-                
+
                     int ii = 3*(i-i0);
                     off_rhok = i + nao*j + k*nao*nao;
                     double rhok_tmp = rhok[off_rhok];
@@ -348,7 +348,7 @@ static void GINTkernel_int3c2e_ip1_getjk_direct(GINTEnvVars envs, JKMatrix jk, d
                 int jp = j - j0;
                 for (i = i0; i < i1; ++i) {
                     int ip = i - i0;
-                
+
                     int loc_k = c_l_locs[k_l] + kp;
                     int loc_j = c_l_locs[j_l] + jp;
                     int loc_i = c_l_locs[i_l] + ip;
@@ -388,7 +388,7 @@ static void GINTkernel_int3c2e_ip1_getjk_direct(GINTEnvVars envs, JKMatrix jk, d
             int jp = j - j0;
             for (i = i0; i < i1; ++i) {
                 int ip = i - i0;
-                
+
                 int loc_k = c_l_locs[k_l] + kp;
                 int loc_j = c_l_locs[j_l] + jp;
                 int loc_i = c_l_locs[i_l] + ip;
@@ -453,7 +453,7 @@ static void GINTkernel_int3c2e_ip2_getjk_direct(GINTEnvVars envs, JKMatrix jk, d
     double* __restrict__ rhoj = jk.rhoj;
     double* __restrict__ rhok = jk.rhok;
     double* __restrict__ dm = jk.dm;
-    
+
     int i_l = envs.i_l;
     int j_l = envs.j_l;
     int k_l = envs.k_l;
@@ -468,7 +468,7 @@ static void GINTkernel_int3c2e_ip2_getjk_direct(GINTEnvVars envs, JKMatrix jk, d
                 int jp = j - j0;
                 for (i = i0; i < i1; ++i) {
                     int ip = i - i0;
-                
+
                     int loc_k = c_l_locs[k_l] + kp;
                     int loc_j = c_l_locs[j_l] + jp;
                     int loc_i = c_l_locs[i_l] + ip;
@@ -476,7 +476,7 @@ static void GINTkernel_int3c2e_ip2_getjk_direct(GINTEnvVars envs, JKMatrix jk, d
                     int ix = dk * idx[loc_k] + dj * idx[loc_j] + di * idx[loc_i];
                     int iy = dk * idy[loc_k] + dj * idy[loc_j] + di * idy[loc_i] + g_size;
                     int iz = dk * idz[loc_k] + dj * idz[loc_j] + di * idz[loc_i] + g_size * 2;
-                
+
                     double sx = 0.0;
                     double sy = 0.0;
                     double sz = 0.0;
@@ -510,7 +510,7 @@ static void GINTkernel_int3c2e_ip2_getjk_direct(GINTEnvVars envs, JKMatrix jk, d
                 int jp = j - j0;
                 for (i = i0; i < i1; ++i) {
                     int ip = i - i0;
-                
+
                     int loc_k = c_l_locs[k_l] + kp;
                     int loc_j = c_l_locs[j_l] + jp;
                     int loc_i = c_l_locs[i_l] + ip;
@@ -518,7 +518,7 @@ static void GINTkernel_int3c2e_ip2_getjk_direct(GINTEnvVars envs, JKMatrix jk, d
                     int ix = dk * idx[loc_k] + dj * idx[loc_j] + di * idx[loc_i];
                     int iy = dk * idy[loc_k] + dj * idy[loc_j] + di * idy[loc_i] + g_size;
                     int iz = dk * idz[loc_k] + dj * idz[loc_j] + di * idz[loc_i] + g_size * 2;
-                
+
                     double sx = 0.0;
                     double sy = 0.0;
                     double sz = 0.0;
@@ -551,7 +551,7 @@ static void GINTkernel_int3c2e_ip2_getjk_direct(GINTEnvVars envs, JKMatrix jk, d
             int jp = j - j0;
             for (i = i0; i < i1; ++i) {
                 int ip = i - i0;
-                
+
                 int loc_k = c_l_locs[k_l] + kp;
                 int loc_j = c_l_locs[j_l] + jp;
                 int loc_i = c_l_locs[i_l] + ip;
@@ -559,7 +559,7 @@ static void GINTkernel_int3c2e_ip2_getjk_direct(GINTEnvVars envs, JKMatrix jk, d
                 int ix = dk * idx[loc_k] + dj * idx[loc_j] + di * idx[loc_i];
                 int iy = dk * idy[loc_k] + dj * idy[loc_j] + di * idy[loc_i] + g_size;
                 int iz = dk * idz[loc_k] + dj * idz[loc_j] + di * idz[loc_i] + g_size * 2;
-                
+
                 double sx = 0.0;
                 double sy = 0.0;
                 double sz = 0.0;
@@ -579,7 +579,7 @@ static void GINTkernel_int3c2e_ip2_getjk_direct(GINTEnvVars envs, JKMatrix jk, d
                 j3[kk + 0] += sx * rhoj_tmp;
                 j3[kk + 1] += sy * rhoj_tmp;
                 j3[kk + 2] += sz * rhoj_tmp;
-                
+
                 off_rhok = i + nao*j + k*nao*nao;
                 double rhok_tmp = rhok[off_rhok];
                 k3[kk + 0] += sx * rhok_tmp;
@@ -602,7 +602,7 @@ static void write_int3c2e_ip1_jk(JKMatrix jk, double* j3, double* k3, int ish){
     int tx = threadIdx.x;
     int ty = threadIdx.y;
     __shared__ double sdata[THREADSX][THREADSY];
-    
+
     if (vj != NULL){
         for (int i = i0; i < i1; ++i){
             for (int j = 0; j < 3; j++){
@@ -644,7 +644,7 @@ static void write_int3c2e_ip2_jk(JKMatrix jk, double *j3, double* k3, int ksh){
     int tx = threadIdx.x;
     int ty = threadIdx.y;
     __shared__ double sdata[THREADSX][THREADSY];
-    
+
     if (vj != NULL){
         for (int k = k0; k < k1; ++k){
             for (int j = 0; j < 3; j++){
@@ -741,7 +741,7 @@ static void GINTrun_int3c2e_ip1_jk_kernel1000(GINTEnvVars envs, JKMatrix jk, Bas
         double aijkl = aij + akl;
         double a1 = aij * akl;
         double a0 = a1 / aijkl;
-        double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0; 
+        double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0;
         a0 *= theta;
         double x = a0 * (xijxkl * xijxkl + yijykl * yijykl + zijzkl * zijzkl);
         double fac = eij * ekl * sqrt(a0 / (a1 * a1 * a1));
@@ -770,11 +770,11 @@ static void GINTrun_int3c2e_ip1_jk_kernel1000(GINTEnvVars envs, JKMatrix jk, Bas
         double g_3 = c00y;
         double g_4 = norm * fac * weight0;
         double g_5 = g_4 * c00z;
-        
+
         double f_1 = ai2 * g_1;
         double f_3 = ai2 * g_3;
         double f_5 = ai2 * g_5;
-        
+
         gout0 += f_1 * g_2 * g_4;
         gout1 += g_0 * f_3 * g_4;
         gout2 += g_0 * g_2 * f_5;
@@ -784,14 +784,14 @@ static void GINTrun_int3c2e_ip1_jk_kernel1000(GINTEnvVars envs, JKMatrix jk, Bas
     int i0 = ao_loc[ish] - jk.ao_offsets_i;
     int j0 = ao_loc[jsh] - jk.ao_offsets_j;
     int k0 = ao_loc[ksh] - jk.ao_offsets_k;
-    
+
     int nao = jk.nao;
     double* __restrict__ dm = jk.dm;
     double* __restrict__ rhok = jk.rhok;
     double* __restrict__ rhoj = jk.rhoj;
     double* __restrict__ vj = jk.vj;
     double* __restrict__ vk = jk.vk;
-    
+
     int tx = threadIdx.x;
     int ty = threadIdx.y;
     __shared__ double sdata[THREADSX][THREADSY];
@@ -903,7 +903,7 @@ static void GINTrun_int3c2e_ip2_jk_kernel0010(GINTEnvVars envs, JKMatrix jk, Bas
         double aijkl = aij + akl;
         double a1 = aij * akl;
         double a0 = a1 / aijkl;
-        double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0; 
+        double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0;
         a0 *= theta;
         double x = a0 * (xijxkl * xijxkl + yijykl * yijykl + zijzkl * zijzkl);
         double fac = norm * eij * ekl * sqrt(a0 / (a1 * a1 * a1));
diff --git a/gpu4pyscf/lib/gvhf/g3c2e_pass1.cu b/gpu4pyscf/lib/gvhf/g3c2e_pass1.cu
index 0bd30319..cc100d15 100644
--- a/gpu4pyscf/lib/gvhf/g3c2e_pass1.cu
+++ b/gpu4pyscf/lib/gvhf/g3c2e_pass1.cu
@@ -13,7 +13,7 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
- 
+
 
 template <int NROOTS, int GSIZE> __global__
 void GINTint3c2e_pass1_j_kernel(GINTEnvVars envs, JKMatrix jk, BasisProdOffsets offsets)
@@ -22,7 +22,7 @@ void GINTint3c2e_pass1_j_kernel(GINTEnvVars envs, JKMatrix jk, BasisProdOffsets
     int ntasks_kl = offsets.ntasks_kl;
     int task_ij = blockIdx.x * blockDim.x + threadIdx.x;
     int task_kl = blockIdx.y * blockDim.y + threadIdx.y;
-    
+
     if (task_ij >= ntasks_ij || task_kl >= ntasks_kl) {
         return;
     }
@@ -42,12 +42,16 @@ void GINTint3c2e_pass1_j_kernel(GINTEnvVars envs, JKMatrix jk, BasisProdOffsets
     int lsh = bas_pair2ket[bas_kl];
     double uw[NROOTS*2];
     double g[GSIZE];
-    
+
     double* __restrict__ a12 = c_bpcache.a12;
     double* __restrict__ x12 = c_bpcache.x12;
     double* __restrict__ y12 = c_bpcache.y12;
     double* __restrict__ z12 = c_bpcache.z12;
 
+    if (ish == jsh){
+        norm *= .5;
+    }
+
     int ij, kl;
     int as_ish, as_jsh, as_ksh, as_lsh;
     if (envs.ibase) {
@@ -70,7 +74,7 @@ void GINTint3c2e_pass1_j_kernel(GINTEnvVars envs, JKMatrix jk, BasisProdOffsets
         double xij = x12[ij];
         double yij = y12[ij];
         double zij = z12[ij];
-        for (kl = prim_kl; kl < prim_kl+nprim_kl; ++kl) {  
+        for (kl = prim_kl; kl < prim_kl+nprim_kl; ++kl) {
             double akl = a12[kl];
             double xkl = x12[kl];
             double ykl = y12[kl];
diff --git a/gpu4pyscf/lib/gvhf/g3c2e_pass1_root1.cu b/gpu4pyscf/lib/gvhf/g3c2e_pass1_root1.cu
index cb9dbf11..1216726a 100644
--- a/gpu4pyscf/lib/gvhf/g3c2e_pass1_root1.cu
+++ b/gpu4pyscf/lib/gvhf/g3c2e_pass1_root1.cu
@@ -41,6 +41,9 @@ static void GINTint3c2e_pass1_j_kernel0000(GINTEnvVars envs, JKMatrix jk, BasisP
     double* __restrict__ x12 = c_bpcache.x12;
     double* __restrict__ y12 = c_bpcache.y12;
     double* __restrict__ z12 = c_bpcache.z12;
+    if (ish == jsh){
+        norm *= .5;
+    }
     int ij, kl;
     double gout0 = 0;
     for (ij = prim_ij; ij < prim_ij+nprim_ij; ++ij) {
@@ -70,7 +73,7 @@ static void GINTint3c2e_pass1_j_kernel0000(GINTEnvVars envs, JKMatrix jk, BasisP
         }
         gout0 += fac;
     } }
-    
+
     int *ao_loc = c_bpcache.ao_loc;
     int nao = jk.nao;
     int i0 = ao_loc[ish] - jk.ao_offsets_i;
@@ -107,6 +110,9 @@ static void GINTint3c2e_pass1_j_kernel0010(GINTEnvVars envs, JKMatrix jk, BasisP
     double* __restrict__ x12 = c_bpcache.x12;
     double* __restrict__ y12 = c_bpcache.y12;
     double* __restrict__ z12 = c_bpcache.z12;
+    if (ish == jsh){
+        norm *= .5;
+    }
     int ij, kl;
     int prim_ij0, prim_ij1, prim_kl0, prim_kl1;
     int nbas = c_bpcache.nbas;
@@ -215,6 +221,9 @@ static void GINTint3c2e_pass1_j_kernel1000(GINTEnvVars envs, JKMatrix jk, BasisP
     double* __restrict__ x12 = c_bpcache.x12;
     double* __restrict__ y12 = c_bpcache.y12;
     double* __restrict__ z12 = c_bpcache.z12;
+    if (ish == jsh){
+        norm *= .5;
+    }
     int ij, kl;
     int prim_ij0, prim_ij1, prim_kl0, prim_kl1;
     int nbas = c_bpcache.nbas;
diff --git a/gpu4pyscf/lib/gvhf/g3c2e_pass2.cu b/gpu4pyscf/lib/gvhf/g3c2e_pass2.cu
index 83d37261..6e395ce7 100644
--- a/gpu4pyscf/lib/gvhf/g3c2e_pass2.cu
+++ b/gpu4pyscf/lib/gvhf/g3c2e_pass2.cu
@@ -13,7 +13,7 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
- 
+
 
 template <int NROOTS, int GSIZE> __global__
 void GINTint3c2e_pass2_j_kernel(GINTEnvVars envs, JKMatrix jk, BasisProdOffsets offsets)
@@ -22,7 +22,7 @@ void GINTint3c2e_pass2_j_kernel(GINTEnvVars envs, JKMatrix jk, BasisProdOffsets
     int ntasks_kl = offsets.ntasks_kl;
     int task_ij = blockIdx.x * blockDim.x + threadIdx.x;
     int task_kl = blockIdx.y * blockDim.y + threadIdx.y;
-    
+
     if (task_ij >= ntasks_ij || task_kl >= ntasks_kl) {
         return;
     }
@@ -42,7 +42,9 @@ void GINTint3c2e_pass2_j_kernel(GINTEnvVars envs, JKMatrix jk, BasisProdOffsets
     int lsh = bas_pair2ket[bas_kl];
     double uw[NROOTS*2];
     double g[GSIZE];
-    
+    if (ish == jsh){
+        norm *= .5;
+    }
     double* __restrict__ a12 = c_bpcache.a12;
     double* __restrict__ x12 = c_bpcache.x12;
     double* __restrict__ y12 = c_bpcache.y12;
@@ -70,7 +72,7 @@ void GINTint3c2e_pass2_j_kernel(GINTEnvVars envs, JKMatrix jk, BasisProdOffsets
         double xij = x12[ij];
         double yij = y12[ij];
         double zij = z12[ij];
-        for (kl = prim_kl; kl < prim_kl+nprim_kl; ++kl) {  
+        for (kl = prim_kl; kl < prim_kl+nprim_kl; ++kl) {
             double akl = a12[kl];
             double xkl = x12[kl];
             double ykl = y12[kl];
diff --git a/gpu4pyscf/lib/gvhf/g3c2e_pass2_root1.cu b/gpu4pyscf/lib/gvhf/g3c2e_pass2_root1.cu
index 2f76578c..47c65d3f 100644
--- a/gpu4pyscf/lib/gvhf/g3c2e_pass2_root1.cu
+++ b/gpu4pyscf/lib/gvhf/g3c2e_pass2_root1.cu
@@ -42,6 +42,9 @@ static void GINTint3c2e_pass2_j_kernel0000(GINTEnvVars envs, JKMatrix jk, BasisP
     double* __restrict__ x12 = c_bpcache.x12;
     double* __restrict__ y12 = c_bpcache.y12;
     double* __restrict__ z12 = c_bpcache.z12;
+    if (ish == jsh){
+        norm *= .5;
+    }
     int ij, kl;
     double gout0 = 0;
     for (ij = prim_ij; ij < prim_ij+nprim_ij; ++ij) {
@@ -71,7 +74,7 @@ static void GINTint3c2e_pass2_j_kernel0000(GINTEnvVars envs, JKMatrix jk, BasisP
         }
         gout0 += fac;
     } }
-    
+
     int *ao_loc = c_bpcache.ao_loc;
     int nao = jk.nao;
     int i0 = ao_loc[ish] - jk.ao_offsets_i;
@@ -113,7 +116,9 @@ static void GINTint3c2e_pass2_j_kernel0010(GINTEnvVars envs, JKMatrix jk, BasisP
     double* __restrict__ bas_x = c_bpcache.bas_coords;
     double* __restrict__ bas_y = bas_x + nbas;
     double* __restrict__ bas_z = bas_y + nbas;
-
+    if (ish == jsh){
+        norm *= .5;
+    }
     double gout0 = 0;
     double gout1 = 0;
     double gout2 = 0;
@@ -219,7 +224,9 @@ static void GINTint3c2e_pass2_j_kernel1000(GINTEnvVars envs, JKMatrix jk, BasisP
     double* __restrict__ bas_x = c_bpcache.bas_coords;
     double* __restrict__ bas_y = bas_x + nbas;
     double* __restrict__ bas_z = bas_y + nbas;
-
+    if (ish == jsh){
+        norm *= .5;
+    }
     double gout0 = 0;
     double gout1 = 0;
     double gout2 = 0;
diff --git a/gpu4pyscf/lib/logger.py b/gpu4pyscf/lib/logger.py
index 7b46fb27..5ca2ec13 100644
--- a/gpu4pyscf/lib/logger.py
+++ b/gpu4pyscf/lib/logger.py
@@ -25,6 +25,7 @@
 WARN = lib.logger.WARN
 DEBUG = lib.logger.DEBUG
 DEBUG1= lib.logger.DEBUG1
+DEBUG2= lib.logger.DEBUG2
 TIMER_LEVEL = lib.logger.TIMER_LEVEL
 flush = lib.logger.flush
 
@@ -84,17 +85,24 @@ def _timer_debug1(rec, msg, cpu0=None, wall0=None, gpu0=None, sync=True):
         rec._t0 = process_clock()
         return rec._t0,
 
+def _timer_debug2(rec, msg, cpu0=None, wall0=None, gpu0=None, sync=True):
+    if rec.verbose >= DEBUG2:
+        return timer(rec, msg, cpu0, wall0, gpu0)
+    return cpu0, wall0, gpu0
+
 info = lib.logger.info
 note = lib.logger.note
 debug = lib.logger.debug
 debug1 = lib.logger.debug1
 debug2 = lib.logger.debug2
 timer_debug1 = _timer_debug1
+timer_debug2 = _timer_debug2
 
 class Logger(lib.logger.Logger):
     def __init__(self, stdout=sys.stdout, verbose=NOTE):
         super().__init__(stdout=stdout, verbose=verbose)
     timer_debug1 = _timer_debug1
+    timer_debug2 = _timer_debug2
     timer = timer
     init_timer = init_timer
 
diff --git a/gpu4pyscf/lib/tests/test_cupy_helper.py b/gpu4pyscf/lib/tests/test_cupy_helper.py
index 614e0fa3..116f4d5e 100644
--- a/gpu4pyscf/lib/tests/test_cupy_helper.py
+++ b/gpu4pyscf/lib/tests/test_cupy_helper.py
@@ -18,7 +18,7 @@
 import cupy
 from gpu4pyscf.lib.cupy_helper import (
     take_last2d, transpose_sum, krylov, unpack_sparse,
-    add_sparse)
+    add_sparse, dist_matrix)
 
 class KnownValues(unittest.TestCase):
     def test_take_last2d(self):
@@ -69,6 +69,13 @@ def test_sparse(self):
         add_sparse(a, b, indices)
         assert cupy.linalg.norm(a - a0) < 1e-10
 
+    def test_dist_matrix(self):
+        a = cupy.random.rand(4, 3)
+        rij = cupy.sum((a[:,None,:] - a[None,:,:])**2, axis=2)**0.5
+
+        rij0 = dist_matrix(a)
+        assert cupy.linalg.norm(rij - rij0) < 1e-10
+
 if __name__ == "__main__":
     print("Full tests for cupy helper module")
     unittest.main()
\ No newline at end of file
diff --git a/gpu4pyscf/solvent/pcm.py b/gpu4pyscf/solvent/pcm.py
index 428e5a08..d2a4e9b6 100644
--- a/gpu4pyscf/solvent/pcm.py
+++ b/gpu4pyscf/solvent/pcm.py
@@ -29,6 +29,7 @@
 from gpu4pyscf.solvent import _attach_solvent
 from gpu4pyscf.df import int3c2e
 from gpu4pyscf.lib import logger
+from gpu4pyscf.lib.cupy_helper import dist_matrix
 
 libdft = lib.load_library('libdft')
 
@@ -191,7 +192,8 @@ def get_D_S(surface, with_S=True, with_D=False):
     xi_i, xi_j = cupy.meshgrid(charge_exp, charge_exp, indexing='ij')
     xi_ij = xi_i * xi_j / (xi_i**2 + xi_j**2)**0.5
     #rij = scipy.spatial.distance.cdist(grid_coords, grid_coords)
-    rij = cupy.sum((grid_coords[:,None,:] - grid_coords[None,:,:])**2, axis=2)**0.5
+    #rij = cupy.sum((grid_coords[:,None,:] - grid_coords[None,:,:])**2, axis=2)**0.5
+    rij = dist_matrix(grid_coords)
     xi_r_ij = xi_ij * rij
     cupy.fill_diagonal(rij, 1)
     S = scipy.special.erf(xi_r_ij) / rij