From ca953ed24d000a4e3f3e266309b4d3dfb2b0c3cd Mon Sep 17 00:00:00 2001
From: Qiming Sun <osirpt.sun@gmail.com>
Date: Mon, 16 Oct 2023 14:44:30 -0700
Subject: [PATCH 01/19] cuda12 for libxc wheel

---
 .github/workflows/libxc_wheel.yml | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/libxc_wheel.yml b/.github/workflows/libxc_wheel.yml
index 13540ec3..a6f8f73f 100644
--- a/.github/workflows/libxc_wheel.yml
+++ b/.github/workflows/libxc_wheel.yml
@@ -6,15 +6,18 @@ on:
 jobs:
   release-pypi-linux:
     runs-on: ubuntu-latest
-    env:
-      img: wxj6000/manylinux2014:cuda118
+    strategy:
+      matrix:
+        cuda-version:
+        - cuda118
+        - cuda121
     steps:
     - name: Checkout
       uses: actions/checkout@v3
     - name: Build wheels
       run: |
         docker run --rm -v ${{ github.workspace }}:/gpu4pyscf:rw \
-        ${{ env.img }} \
+        wxj6000/manylinux2014:${{ matrix.cuda-version }} \
         bash -exc 'sh /gpu4pyscf/builder/build_libxc.sh'
     - name: List available wheels
       run: |

From 6a686897c81894b932eab82c8603b9a700544867 Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <wxj6000@gmail.com>
Date: Mon, 16 Oct 2023 15:05:59 -0700
Subject: [PATCH 02/19] support various einsum functions (#44)

* remove cutensor, cublas, cusolver in wheels

* fixed UHF in __init__.py

* support various einsum

* correct for linter
---
 examples/13-einsum_engine.py | 34 ++++++++++++++++++++
 gpu4pyscf/df/df_jk.py        |  6 ++--
 gpu4pyscf/df/grad/rhf.py     |  1 -
 gpu4pyscf/lib/cupy_helper.py | 33 +++++++++----------
 gpu4pyscf/lib/cutensor.py    | 61 +++++++++++++++++++++++++-----------
 5 files changed, 96 insertions(+), 39 deletions(-)
 create mode 100644 examples/13-einsum_engine.py

diff --git a/examples/13-einsum_engine.py b/examples/13-einsum_engine.py
new file mode 100644
index 00000000..f3954222
--- /dev/null
+++ b/examples/13-einsum_engine.py
@@ -0,0 +1,34 @@
+# gpu4pyscf is a plugin to use Nvidia GPU in PySCF package
+#
+# Copyright (C) 2022 Qiming Sun
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import os
+os.environ['CONTRACT_ENGINE'] = 'opt_einsum' # 'cupy', 'cuquantum'
+
+import pyscf
+from gpu4pyscf.dft import rks
+
+atom ='''
+O       0.0000000000    -0.0000000000     0.1174000000
+H      -0.7570000000    -0.0000000000    -0.4696000000
+H       0.7570000000     0.0000000000    -0.4696000000
+'''
+
+mol = pyscf.M(atom=atom, basis='def2-tzvpp')
+mf = rks.RKS(mol, xc='LDA').density_fit()
+
+e_dft = mf.kernel()  # compute total energy
+print(f"total energy = {e_dft}")
\ No newline at end of file
diff --git a/gpu4pyscf/df/df_jk.py b/gpu4pyscf/df/df_jk.py
index fdd857c5..d52dfff2 100644
--- a/gpu4pyscf/df/df_jk.py
+++ b/gpu4pyscf/df/df_jk.py
@@ -162,7 +162,7 @@ def get_veff(self, mol=None, dm=None, dm_last=None, vhf_last=0, hermi=1):
             '''
             if mol is None: mol = self.mol
             if dm is None: dm = self.make_rdm1()
-            
+
             # for DFT
             if mf_class == rks.RKS:
                 return rks.get_veff(self, dm=dm)
@@ -248,7 +248,7 @@ def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e-
     outputs and input are on the same device
     TODO: separate into three cases: j only, k only, j and k
     '''
-    
+
     log = logger.new_logger(dfobj.mol, dfobj.verbose)
     out_shape = dms_tag.shape
     out_cupy = isinstance(dms_tag, cupy.ndarray)
@@ -290,7 +290,7 @@ def get_j(cderi_sparse):
         vj_tmp[:,cols,rows] = vj_sparse
         vj_sparse = None
         return vj_tmp
-    
+
     # SCF K matrix with occ
     if nset == 1 and hasattr(dms_tag, 'occ_coeff'):
         occ_coeff = cupy.asarray(dms_tag.occ_coeff[ao_idx, :], order='C')
diff --git a/gpu4pyscf/df/grad/rhf.py b/gpu4pyscf/df/grad/rhf.py
index 0598418c..292d9309 100644
--- a/gpu4pyscf/df/grad/rhf.py
+++ b/gpu4pyscf/df/grad/rhf.py
@@ -16,7 +16,6 @@
 
 import numpy
 import cupy
-import pyscf
 from cupyx.scipy.linalg import solve_triangular
 from pyscf.df.grad import rhf
 from pyscf.lib import logger
diff --git a/gpu4pyscf/lib/cupy_helper.py b/gpu4pyscf/lib/cupy_helper.py
index 412e74d5..9f2cef59 100644
--- a/gpu4pyscf/lib/cupy_helper.py
+++ b/gpu4pyscf/lib/cupy_helper.py
@@ -24,7 +24,8 @@
 from gpu4pyscf.gto import mole
 from gpu4pyscf.lib.cutensor import contract
 from gpu4pyscf.lib.cusolver import eigh, cholesky  #NOQA
-LMAX_ON_GPU = 8
+
+LMAX_ON_GPU = 6
 DSOLVE_LINDEP = 1e-15
 
 c2s_l = mole.get_cart2sph(lmax=LMAX_ON_GPU)
@@ -64,7 +65,7 @@ def print_mem_info():
     #mem_stack = stack_size_per_thread
     GB = 1024 * 1024 * 1024
     print(f'mem_avail: {mem_avail/GB:.3f} GB, total_mem: {total_mem/GB:.3f} GB, used_mem: {used_mem/GB:.3f} GB,mem_limt: {mem_limit/GB:.3f} GB')
-    
+
 def get_avail_mem():
     mempool = cupy.get_default_memory_pool()
     used_mem = mempool.used_bytes()
@@ -83,7 +84,7 @@ def device2host_2d(a_cpu, a_gpu, stream=None):
     libcupy_helper.async_d2h_2d(
         ctypes.cast(stream.ptr, ctypes.c_void_p),
         a_cpu.ctypes.data_as(ctypes.c_void_p),
-        ctypes.c_int(a_cpu.strides[0]), 
+        ctypes.c_int(a_cpu.strides[0]),
         ctypes.cast(a_gpu.data.ptr, ctypes.c_void_p),
         ctypes.c_int(a_gpu.strides[0]),
         ctypes.c_int(a_gpu.shape[0]),
@@ -146,7 +147,7 @@ def add_sparse(a, b, indices):
     '''
     n = a.shape[0]
     m = b.shape[0]
-    
+
     err = libcupy_helper.add_sparse(
         ctypes.cast(a.data.ptr, ctypes.c_void_p),
         ctypes.cast(b.data.ptr, ctypes.c_void_p),
@@ -205,7 +206,7 @@ def block_diag(blocks, out=None):
     rows = np.cumsum(np.asarray([0] + [x.shape[0] for x in blocks]))
     cols = np.cumsum(np.asarray([0] + [x.shape[1] for x in blocks]))
     offsets = np.cumsum(np.asarray([0] + [x.shape[0]*x.shape[1] for x in blocks]))
-    
+
     m, n = rows[-1], cols[-1]
     if out is None: out = cupy.zeros([m, n])
     rows = cupy.asarray(rows, dtype='int32')
@@ -227,7 +228,7 @@ def block_diag(blocks, out=None):
     if err != 0:
         raise RuntimeError('failed in block_diag kernel')
     return out
-    
+
 def take_last2d(a, indices, out=None):
     '''
     reorder the last 2 dimensions with 'indices', the first n-2 indices do not change
@@ -303,7 +304,7 @@ def cart2sph(t, axis=0, ang=1, out=None):
     '''
     transform 'axis' of a tensor from cartesian basis into spherical basis
     '''
-    if(ang <= 1): 
+    if(ang <= 1):
         if(out is not None): out[:] = t
         return t
     size = list(t.shape)
@@ -314,9 +315,9 @@ def cart2sph(t, axis=0, ang=1, out=None):
     i0 = max(1, np.prod(size[:axis]))
     i3 = max(1, np.prod(size[axis+1:]))
     out_shape = size[:axis] + [nli*li_size[1]] + size[axis+1:]
-    
+
     t_cart = t.reshape([i0*nli, li_size[0], i3])
-    if(out is not None): 
+    if(out is not None):
         out = out.reshape([i0*nli, li_size[1], i3])
     t_sph = contract('min,ip->mpn', t_cart, c2s, out=out)
     return t_sph.reshape(out_shape)
@@ -364,7 +365,7 @@ def krylov(aop, b, x0=None, tol=1e-10, max_cycle=30, dot=cupy.dot,
 
     if not (isinstance(b, cupy.ndarray) and b.ndim == 1):
         b = cupy.asarray(b)
-    
+
     if x0 is None:
         x1 = b
     else:
@@ -402,7 +403,7 @@ def krylov(aop, b, x0=None, tol=1e-10, max_cycle=30, dot=cupy.dot,
         ax.extend(axt)
         if callable(callback):
             callback(cycle, xs, ax)
-        
+
         x1 = axt.copy()
         for i in range(len(xs)):
             xsi = cupy.asarray(xs[i])
@@ -419,22 +420,22 @@ def krylov(aop, b, x0=None, tol=1e-10, max_cycle=30, dot=cupy.dot,
                 idx.append(i)
                 innerprod.append(innerprod1)
         log.debug('krylov cycle %d  r = %g', cycle, max_innerprod**.5)
-        
+
         if max_innerprod < lindep or max_innerprod < tol**2:
             break
 
         x1 = x1[idx]
-        
+
     xs = cupy.asarray(xs)
     ax = cupy.asarray(ax)
     nd = cycle + 1
 
     h = cupy.einsum('in,jn->ij', xs, ax)
-    
+
     # Add the contribution of I in (1+a)
     h += cupy.diag(cupy.asarray(innerprod[:nd]))
     g = cupy.zeros((nd,nroots), dtype=x1.dtype)
-    
+
     if b.ndim == 1:
         g[0] = innerprod[0]
     else:
@@ -447,7 +448,7 @@ def krylov(aop, b, x0=None, tol=1e-10, max_cycle=30, dot=cupy.dot,
             for j in range(nroots):
                 g[i,j] = cupy.dot(xsi.conj(), b[j])
         '''
-    
+
     c = cupy.linalg.solve(h, g)
     x = _gen_x0(c, cupy.asarray(xs))
     if b.ndim == 1:
diff --git a/gpu4pyscf/lib/cutensor.py b/gpu4pyscf/lib/cutensor.py
index c7703005..c590b7f6 100644
--- a/gpu4pyscf/lib/cutensor.py
+++ b/gpu4pyscf/lib/cutensor.py
@@ -13,9 +13,6 @@
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
-
-import os
-import ctypes
 import numpy as np
 import cupy
 from cupy._environment import _preload_libs
@@ -31,9 +28,6 @@
     except Exception:
         continue
 
-if libcutensor is None:
-    print('cannot find cutensor')
-
 _handle = Handle()
 _modes = {}
 _contraction_descriptors = {}
@@ -50,7 +44,7 @@ def _create_mode_with_cache(mode):
         else:
             raise TypeError('Cannot create tensor mode: {}'.format(type(x)))
     key = tuple(integer_mode)
-    
+
     if key in _modes:
         mode = _modes[key]
     else:
@@ -70,11 +64,11 @@ def create_contraction_descriptor(handle,
            desc_a.ptr, mode_a.data, alignment_req_A,
            desc_b.ptr, mode_b.data, alignment_req_B,
            desc_c.ptr, mode_c.data, alignment_req_C)
-    
+
     if key in _contraction_descriptors:
         desc = _contraction_descriptors[key]
         return desc
-    
+
     desc = cutensor_backend.ContractionDescriptor()
     cutensor_backend.initContractionDescriptor(
         handle,
@@ -99,11 +93,11 @@ def contraction(pattern, a, b, alpha, beta, out=None):
     key = str_a + str_b
     val = list(a.shape) + list(b.shape)
     shape = {k:v for k, v in zip(key, val)}
-    
+
     mode_a = list(str_a)
     mode_b = list(str_b)
     mode_c = list(str_c)
-    
+
     if(out is not None):
         c = out
     else:
@@ -126,7 +120,7 @@ def contraction(pattern, a, b, alpha, beta, out=None):
     except Exception:
         ws_size = cutensor_backend.contractionGetWorkspaceSize(_handle, desc, find, cutensor_backend.WORKSPACE_MIN)
         ws = cupy.empty(ws_size, dtype=np.int8)
-    
+
     plan = cutensor_backend.ContractionPlan()
     cutensor_backend.initContractionPlan(_handle, plan, desc, find, ws_size)
     alpha = np.asarray(alpha)
@@ -137,11 +131,40 @@ def contraction(pattern, a, b, alpha, beta, out=None):
                              ws.data.ptr, ws_size)
     return out
 
-def contract(pattern, a, b, alpha=1.0, beta=0.0, out=None):
-    '''
-    a wrapper for general tensor contraction
-    pattern has to be a standard einsum notation
-    '''
-    c = contraction(pattern, a, b, alpha, beta, out=out)
+import os
+if 'CONTRACT_ENGINE' in os.environ:
+    contract_engine = os.environ['CONTRACT_ENGINE']
+else:
+    contract_engine = None
 
-    return c
\ No newline at end of file
+if libcutensor is None:
+    contract_engine = 'cupy'
+
+# override the 'contract' function if einsum is customized or cutensor is not found
+if contract_engine is not None:
+    einsum = None
+    if contract_engine == 'opt_einsum':
+        import opt_einsum
+        einsum = opt_einsum.contract
+    elif contract_engine == 'cuquantum':
+        from cuquantum import contract as einsum
+    elif contract_engine == 'cupy':
+        einsum = cupy.einsum
+    else:
+        raise RuntimeError('unknown tensor contraction engine.')
+
+    import warnings
+    warnings.warn(f'using {contract_engine} as the tensor contraction engine.')
+    def contract(pattern, a, b, alpha=1.0, beta=0.0, out=None):
+        res = alpha*einsum(pattern, a, b)  # apply alpha even when out is None
+        if out is None:
+            return cupy.asarray(res, order='C')
+        out[:] = res if beta == 0 else res + beta*out  # beta == 0: do not read out
+        return cupy.asarray(out, order='C')
+else:
+    def contract(pattern, a, b, alpha=1.0, beta=0.0, out=None):
+        '''
+        a wrapper for general tensor contraction
+        pattern has to be a standard einsum notation
+        '''
+        return contraction(pattern, a, b, alpha, beta, out=out)

From 036f9bca6120783a1ae118556ef98340d2a9fe0c Mon Sep 17 00:00:00 2001
From: Qiming Sun <osirpt.sun@gmail.com>
Date: Mon, 16 Oct 2023 15:20:03 -0700
Subject: [PATCH 03/19] fix libxc wheel

---
 .github/workflows/libxc_wheel.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/libxc_wheel.yml b/.github/workflows/libxc_wheel.yml
index a6f8f73f..7e54b6c5 100644
--- a/.github/workflows/libxc_wheel.yml
+++ b/.github/workflows/libxc_wheel.yml
@@ -7,6 +7,7 @@ jobs:
   release-pypi-linux:
     runs-on: ubuntu-latest
     strategy:
+      fail-fast: false
       matrix:
         cuda-version:
         - cuda118

From 0abb83485e4c7241b318e9566dbe3d4f842e43ef Mon Sep 17 00:00:00 2001
From: Qiming Sun <osirpt.sun@gmail.com>
Date: Mon, 16 Oct 2023 16:01:02 -0700
Subject: [PATCH 04/19] Remove unrelated packages from wheel

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index fd675dfe..f72cad48 100755
--- a/setup.py
+++ b/setup.py
@@ -113,7 +113,7 @@ def initialize_with_default_plat_name(self):
     package_dir={'gpu4pyscf': 'gpu4pyscf'},  # packages are under directory pyscf
     # include *.so *.dat files. They are now placed in MANIFEST.in
     include_package_data=True,  # include everything in source control
-    packages=[*find_namespace_packages('.'), 'gpu4pyscf', 'gpu4pyscf.lib'],
+    packages=['gpu4pyscf', 'gpu4pyscf.lib'],
     tests_require=[
         "pytest==7.2.0",
         "pytest-cov==4.0.0",

From 9eab37c9fc7bec9174b97209cdd7b7644aded3c2 Mon Sep 17 00:00:00 2001
From: Qiming Sun <osirpt.sun@gmail.com>
Date: Mon, 16 Oct 2023 16:05:53 -0700
Subject: [PATCH 05/19] Fix packages in setup.py

---
 setup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index f72cad48..fd5efdab 100755
--- a/setup.py
+++ b/setup.py
@@ -22,7 +22,7 @@
 import subprocess
 import re
 
-from setuptools import setup, find_packages, Extension, find_namespace_packages
+from setuptools import setup, find_packages, Extension
 from setuptools.command.build_py import build_py
 from distutils.util import get_platform
 
@@ -113,7 +113,7 @@ def initialize_with_default_plat_name(self):
     package_dir={'gpu4pyscf': 'gpu4pyscf'},  # packages are under directory pyscf
     # include *.so *.dat files. They are now placed in MANIFEST.in
     include_package_data=True,  # include everything in source control
-    packages=['gpu4pyscf', 'gpu4pyscf.lib'],
+    packages=find_packages(exclude=['*test*', '*examples*', '*docker*']),
     tests_require=[
         "pytest==7.2.0",
         "pytest-cov==4.0.0",

From 741423f75caae0e89f1746df14908d436cf651ad Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <wxj6000@gmail.com>
Date: Tue, 17 Oct 2023 20:03:30 -0700
Subject: [PATCH 06/19] accelerate solvent models with GPU (#48)

* numpy -> cupy for solvent

* for linter

* remove grad switch from pcm.py

* passed flake8

* solvent integrals on GPU

* flake8
---
 benchmarks/scf/generate_tables.ipynb     | 108 ++----
 examples/14-pcm_solvent.py               |  38 ++
 examples/dft_driver.py                   |  18 +-
 gpu4pyscf/__init__.py                    |   2 +-
 gpu4pyscf/df/int3c2e.py                  |   4 +-
 gpu4pyscf/grad/rks.py                    |   3 +-
 gpu4pyscf/solvent/__init__.py            |  37 ++
 gpu4pyscf/solvent/_attach_solvent.py     | 130 +++++++
 gpu4pyscf/solvent/grad/pcm.py            | 218 ++++++-----
 gpu4pyscf/solvent/pcm.py                 | 470 +++++------------------
 gpu4pyscf/solvent/tests/test_pcm.py      |  13 +-
 gpu4pyscf/solvent/tests/test_pcm_grad.py |  39 +-
 12 files changed, 485 insertions(+), 595 deletions(-)
 create mode 100644 examples/14-pcm_solvent.py
 create mode 100644 gpu4pyscf/solvent/__init__.py
 create mode 100644 gpu4pyscf/solvent/_attach_solvent.py

diff --git a/benchmarks/scf/generate_tables.ipynb b/benchmarks/scf/generate_tables.ipynb
index e49ae7cc..c3eb0dd2 100644
--- a/benchmarks/scf/generate_tables.ipynb
+++ b/benchmarks/scf/generate_tables.ipynb
@@ -2,12 +2,8 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {
-    "vscode": {
-     "languageId": "python"
-    }
-   },
+   "execution_count": 10,
+   "metadata": {},
    "outputs": [
     {
      "data": {
@@ -15,7 +11,7 @@
        "''"
       ]
      },
-     "execution_count": 7,
+     "execution_count": 9,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -34,12 +30,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
-   "metadata": {
-    "vscode": {
-     "languageId": "python"
-    }
-   },
+   "execution_count": 12,
+   "metadata": {},
    "outputs": [
     {
      "data": {
@@ -47,7 +39,7 @@
        "''"
       ]
      },
-     "execution_count": 21,
+     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     },
@@ -57,15 +49,16 @@
      "text": [
       "|   mol |   natm |    LDA |    PBE |   B3LYP |    M06 |   wB97m-v |\n",
       "|------:|-------:|-------:|-------:|--------:|-------:|----------:|\n",
-      "|     2 |      3 |   0.22 |   0.32 |    0.13 |   0.24 |      0.69 |\n",
-      "|     3 |     15 |   0.81 |   1.35 |    1.45 |   1.8  |      4.85 |\n",
-      "|     4 |     30 |   1.83 |   2.76 |    4.1  |   6.66 |      7.61 |\n",
-      "|     5 |     60 |   2.88 |   3.71 |    7.24 |   8.36 |      9.44 |\n",
-      "|     6 |     96 |   4.27 |   4.48 |    7.73 |  10    |      9.79 |\n",
-      "|     7 |    141 |   3.94 |   4.07 |    8.57 |  10.39 |      9.39 |\n",
-      "|     8 |    228 | nan    | nan    |  nan    | nan    |    nan    |\n",
-      "|     9 |    300 | nan    | nan    |  nan    | nan    |    nan    |\n",
-      "|    10 |    417 | nan    | nan    |  nan    | nan    |    nan    |\n"
+      "|     2 |      3 |   0.22 |   0.32 |    0.27 |   0.25 |      0.69 |\n",
+      "|     3 |     15 |   0.68 |   0.25 |    1.58 |   2.61 |      4.84 |\n",
+      "|     4 |     30 |   1.59 |   2.63 |    4.09 |   6.93 |      8.17 |\n",
+      "|     5 |     60 |   2.86 |   3.64 |    7.15 |   8.44 |      9.44 |\n",
+      "|     6 |     96 |   4.34 |   4.39 |    7.75 |  10.58 |      9.87 |\n",
+      "|     7 |    141 |   4.07 |   4.1  |    8.87 |  10.47 |     10.13 |\n",
+      "|     8 |    228 |   4.34 |   4.58 |    9.39 |  10.48 |      9.36 |\n",
+      "|     9 |    300 |   5.05 |   5.21 |    9.35 |  11.36 |    nan    |\n",
+      "|    10 |    417 |   4.91 | nan    |  nan    | nan    |    nan    |\n",
+      "|    10 |    nan | nan    | nan    |  nan    | nan    |    nan    |\n"
      ]
     }
    ],
@@ -104,12 +97,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
-   "metadata": {
-    "vscode": {
-     "languageId": "python"
-    }
-   },
+   "execution_count": 14,
+   "metadata": {},
    "outputs": [
     {
      "data": {
@@ -117,7 +106,7 @@
        "''"
       ]
      },
-     "execution_count": 23,
+     "execution_count": 13,
      "metadata": {},
      "output_type": "execute_result"
     },
@@ -127,15 +116,16 @@
      "text": [
       "|   mol |   natm |    LDA |    PBE |   B3LYP |    M06 |   wB97m-v |\n",
       "|------:|-------:|-------:|-------:|--------:|-------:|----------:|\n",
-      "|     2 |      3 |   0.85 |   0.84 |    0.77 |   0.74 |      0.57 |\n",
-      "|     3 |     15 |   0.56 |   0.89 |    1.44 |   1.57 |      1.52 |\n",
-      "|     4 |     30 |   0.59 |   1.03 |    2.13 |   2.08 |      1.9  |\n",
-      "|     5 |     60 |   0.53 |   0.87 |    2.45 |   2.35 |      1.73 |\n",
-      "|     6 |     96 |   0.61 |   0.87 |    2.43 |   2.37 |      1.59 |\n",
-      "|     7 |    141 |   0.92 |   1.08 |    2.61 |   2.62 |      1.53 |\n",
-      "|     8 |    228 | nan    | nan    |  nan    | nan    |    nan    |\n",
-      "|     9 |    300 | nan    | nan    |  nan    | nan    |    nan    |\n",
-      "|    10 |    417 | nan    | nan    |  nan    | nan    |    nan    |\n"
+      "|     2 |      3 |   0.82 |   0.89 |    0.75 |   0.82 |      0.6  |\n",
+      "|     3 |     15 |   0.39 |   0.19 |    1.46 |   1.52 |      1.47 |\n",
+      "|     4 |     30 |   0.56 |   1.04 |    2.07 |   2.25 |      1.89 |\n",
+      "|     5 |     60 |   0.54 |   0.87 |    2.42 |   2.4  |      1.77 |\n",
+      "|     6 |     96 |   0.6  |   0.87 |    2.36 |   2.51 |      1.53 |\n",
+      "|     7 |    141 |   0.93 |   1.1  |    2.61 |   2.59 |      1.55 |\n",
+      "|     8 |    228 |   1.92 |   1.9  |    3.37 |   3.39 |      1.83 |\n",
+      "|     9 |    300 |   2.26 |   2.02 |    3.06 |   3.59 |    nan    |\n",
+      "|    10 |    417 |   2.46 | nan    |  nan    | nan    |    nan    |\n",
+      "|    10 |    nan | nan    | nan    |  nan    | nan    |    nan    |\n"
      ]
     }
    ],
@@ -146,66 +136,42 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "vscode": {
-     "languageId": "python"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": []
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "vscode": {
-     "languageId": "python"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": []
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "vscode": {
-     "languageId": "python"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": []
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "vscode": {
-     "languageId": "python"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": []
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "vscode": {
-     "languageId": "python"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": []
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "vscode": {
-     "languageId": "python"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": []
   }
@@ -223,10 +189,10 @@
    },
    "file_extension": ".py",
    "mimetype": "text/x-python",
-   "name": "Python3 with MLSQL",
+   "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "0.1"
+   "version": "3.9.2"
   },
   "orig_nbformat": 4
  },
diff --git a/examples/14-pcm_solvent.py b/examples/14-pcm_solvent.py
new file mode 100644
index 00000000..3def0ee4
--- /dev/null
+++ b/examples/14-pcm_solvent.py
@@ -0,0 +1,38 @@
+# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import pyscf
+from pyscf import lib
+from gpu4pyscf.dft import rks
+lib.num_threads(8)
+
+atom ='''
+O       0.0000000000    -0.0000000000     0.1174000000
+H      -0.7570000000    -0.0000000000    -0.4696000000
+H       0.7570000000     0.0000000000    -0.4696000000
+'''
+mol = pyscf.M(atom=atom, basis='def2-tzvpp', verbose=4)
+
+mf = rks.RKS(mol, xc='HYB_GGA_XC_B3LYP').density_fit()
+mf = mf.PCM()
+mf.grids.atom_grid = (99,590)
+mf.with_solvent.lebedev_order = 29 # 302 Lebedev grids
+mf.with_solvent.method = 'IEF-PCM'
+mf.with_solvent.eps = 78.3553
+mf.kernel()
+
+g = mf.nuc_grad_method()
+g.auxbasis_response = True
+f = g.kernel()
diff --git a/examples/dft_driver.py b/examples/dft_driver.py
index 0a7073cf..65ca2ad5 100644
--- a/examples/dft_driver.py
+++ b/examples/dft_driver.py
@@ -23,26 +23,28 @@
 import argparse
 
 parser = argparse.ArgumentParser(description='Run DFT with GPU4PySCF for molecules')
-parser.add_argument("--input", type=str, default='benzene/coord')
-parser.add_argument("--basis", type=str, default='def2-tzvpp')
-parser.add_argument("--auxbasis", type=str, default='def2-tzvpp-jkfit')
+parser.add_argument("--input",    type=str,  default='benzene/coord')
+parser.add_argument("--basis",    type=str,  default='def2-tzvpp')
+parser.add_argument("--auxbasis", type=str,  default='def2-tzvpp-jkfit')
+parser.add_argument("--solvent",  type=lambda s: s.lower() in ('1', 'true', 'yes'), default=False)  # bool('False') is True
 args = parser.parse_args()
 
 start_time = time.time()
 bas = args.basis
 mol = pyscf.M(
-    atom=args.input, 
-    basis=bas, 
+    atom=args.input,
+    basis=bas,
     max_memory=32000)
 # set verbose >= 6 for debugging timer
-mol.verbose = 6
-print(mol.nao)
+mol.verbose = 4
 
 mf_df = rks.RKS(mol, xc='HYB_GGA_XC_B3LYP').density_fit(auxbasis=args.auxbasis)
+if args.solvent:
+    mf_df = mf_df.PCM()
 mf_df.grids.atom_grid = (99,590)
 mf_df.kernel()
+
 print('compute time for energy: {}s'.format((time.time() - start_time)))
-exit()
 start_time = time.time()
 g = mf_df.nuc_grad_method()
 g.auxbasis_response = True
diff --git a/gpu4pyscf/__init__.py b/gpu4pyscf/__init__.py
index 4d7323eb..a52a096a 100644
--- a/gpu4pyscf/__init__.py
+++ b/gpu4pyscf/__init__.py
@@ -1,2 +1,2 @@
 from . import lib, grad, hessian, solvent, scf, dft
-__version__ = '0.6.1'
+__version__ = '0.6.1'
\ No newline at end of file
diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py
index 9f01ac3b..7995c6d8 100644
--- a/gpu4pyscf/df/int3c2e.py
+++ b/gpu4pyscf/df/int3c2e.py
@@ -1097,7 +1097,7 @@ def get_dh1e(mol, dm0):
     intopt = VHFOpt(mol, fakemol, 'int2e')
     intopt.build(1e-14, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE)
     dm0_sorted = dm0[cupy.ix_(intopt.sph_ao_idx, intopt.sph_ao_idx)]
-    
+
     dh1e = cupy.zeros([natm,3])
     for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ip1'):
         dh1e[k0:k1,:3] += cupy.einsum('xkji,ij->kx', int3c_blk, dm0_sorted[i0:i1,j0:j1])
@@ -1120,7 +1120,7 @@ def get_int3c2e_slice(intopt, cp_ij_id, cp_aux_id, aosym=None, out=None, omega=N
 
     log_q_ij = intopt.log_qs[cp_ij_id]
     log_q_kl = intopt.aux_log_qs[cp_aux_id]
-    
+
     nbins = 1
     bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32)
     bins_locs_kl = np.array([0, len(log_q_kl)], dtype=np.int32)
diff --git a/gpu4pyscf/grad/rks.py b/gpu4pyscf/grad/rks.py
index 8695927c..276e15b0 100644
--- a/gpu4pyscf/grad/rks.py
+++ b/gpu4pyscf/grad/rks.py
@@ -501,8 +501,7 @@ def get_du(ia, ib):  # JCP 98, 5612 (1993); (B10)
         yield coords, w0, w1
 
 class Gradients(rhf_grad.Gradients, pyscf.grad.rks.Gradients):
-    device = 'gpu'
-    get_veff = patch_cpu_kernel(pyscf.grad.rks.Gradients.get_veff)(_get_veff)
+    from gpu4pyscf.lib.utils import to_cpu, to_gpu, device
     
     def get_dispersion(self):
         if self.base.disp[:2].upper() == 'D3':
diff --git a/gpu4pyscf/solvent/__init__.py b/gpu4pyscf/solvent/__init__.py
new file mode 100644
index 00000000..157e7129
--- /dev/null
+++ b/gpu4pyscf/solvent/__init__.py
@@ -0,0 +1,37 @@
+# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+from gpu4pyscf.solvent import pcm
+
+def PCM(method_or_mol, solvent_obj=None, dm=None):
+    '''Initialize PCM model.
+
+    A Mole returns a bare pcm.PCM object; an SCF object is passed to
+    pcm.pcm_for_scf; any other input raises NotImplementedError.
+
+    >>> mf = PCM(scf.RHF(mol))
+    >>> mf.kernel()
+    >>> sol = PCM(mol)
+    >>> mf = PCM(scf.RHF(mol), sol)
+    '''
+    from pyscf import gto
+    from pyscf import scf
+
+    if isinstance(method_or_mol, gto.mole.Mole):
+        return pcm.PCM(method_or_mol)
+    elif isinstance(method_or_mol, scf.hf.SCF):
+        return pcm.pcm_for_scf(method_or_mol, solvent_obj, dm)
+    else:
+        raise NotImplementedError('PCM model only support SCF')
diff --git a/gpu4pyscf/solvent/_attach_solvent.py b/gpu4pyscf/solvent/_attach_solvent.py
new file mode 100644
index 00000000..da556302
--- /dev/null
+++ b/gpu4pyscf/solvent/_attach_solvent.py
@@ -0,0 +1,130 @@
+# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+from pyscf import lib
+from pyscf.lib import logger
+from pyscf.solvent._attach_solvent import _Solvation
+from gpu4pyscf.lib.cupy_helper import tag_array
+from gpu4pyscf import scf
+
+# NOTE: copied from pyscf, different from the latest version
+
+def _for_scf(mf, solvent_obj, dm=None):
+    '''Add solvent model to SCF (HF and DFT) method.
+
+    Kwargs:
+        dm : if given, the solvent does not respond to changes of the density
+            matrix. A frozen solvent potential is added to the results.
+    '''
+    if isinstance(mf, _Solvation):
+        mf.with_solvent = solvent_obj
+        return mf
+
+    oldMF = mf.__class__
+
+    if dm is not None:
+        solvent_obj.e, solvent_obj.v = solvent_obj.kernel(dm)
+        solvent_obj.frozen = True
+
+    class SCFWithSolvent(_Solvation, oldMF):
+        def __init__(self, mf, solvent):
+            self.__dict__.update(mf.__dict__)
+            self.with_solvent = solvent
+            self._keys.update(['with_solvent'])
+
+        def dump_flags(self, verbose=None):
+            oldMF.dump_flags(self, verbose)
+            self.with_solvent.check_sanity()
+            self.with_solvent.dump_flags(verbose)
+            return self
+
+        def reset(self, mol=None):
+            self.with_solvent.reset(mol)
+            return oldMF.reset(self, mol)
+
+        # Note v_solvent should not be added to get_hcore for scf methods.
+        # get_hcore is overloaded by many post-HF methods. Modifying
+        # SCF.get_hcore may lead to errors.
+
+        def get_veff(self, mol=None, dm=None, *args, **kwargs):
+            vhf = oldMF.get_veff(self, mol, dm, *args, **kwargs)
+            with_solvent = self.with_solvent
+            if not with_solvent.frozen:
+                with_solvent.e, with_solvent.v = with_solvent.kernel(dm)
+            e_solvent, v_solvent = with_solvent.e, with_solvent.v
+
+            # NOTE: v_solvent should not be added to vhf in this place. This is
+            # because vhf is used as the reference for direct_scf in the next
+            # iteration. If v_solvent is added here, it may break direct SCF.
+            return tag_array(vhf, e_solvent=e_solvent, v_solvent=v_solvent)
+
+        def get_fock(self, h1e=None, s1e=None, vhf=None, dm=None, cycle=-1,
+                     diis=None, diis_start_cycle=None,
+                     level_shift_factor=None, damp_factor=None):
+            # DIIS is called inside oldMF.get_fock. v_solvent, as a function of
+            # dm, should be extrapolated as well. To enable it, v_solvent has to be
+            # added to the Fock matrix before DIIS is called.
+            if getattr(vhf, 'v_solvent', None) is None:
+                vhf = self.get_veff(self.mol, dm)
+            return oldMF.get_fock(self, h1e, s1e, vhf+vhf.v_solvent, dm, cycle, diis,
+                                  diis_start_cycle, level_shift_factor, damp_factor)
+
+        def energy_elec(self, dm=None, h1e=None, vhf=None):
+            if dm is None:
+                dm = self.make_rdm1()
+            if getattr(vhf, 'e_solvent', None) is None:
+                vhf = self.get_veff(self.mol, dm)
+            e_tot, e_coul = oldMF.energy_elec(self, dm, h1e, vhf)
+            e_tot += vhf.e_solvent
+            self.scf_summary['e_solvent'] = vhf.e_solvent.real
+            logger.debug(self, 'Solvent Energy = %.15g', vhf.e_solvent)
+            return e_tot, e_coul
+
+        def nuc_grad_method(self):
+            grad_method = oldMF.nuc_grad_method(self)
+            return self.with_solvent.nuc_grad_method(grad_method)
+
+        Gradients = nuc_grad_method
+
+        def gen_response(self, *args, **kwargs):
+            vind = oldMF.gen_response(self, *args, **kwargs)
+            is_uhf = isinstance(self, scf.uhf.UHF)
+            # singlet=None is orbital hessian or CPHF type response function
+            singlet = kwargs.get('singlet', True)
+            singlet = singlet or singlet is None
+            def vind_with_solvent(dm1):
+                v = vind(dm1)
+                if self.with_solvent.equilibrium_solvation:
+                    if is_uhf:
+                        v_solvent = self.with_solvent._B_dot_x(dm1)
+                        v += v_solvent[0] + v_solvent[1]
+                    elif singlet:
+                        v += self.with_solvent._B_dot_x(dm1)
+                return v
+            return vind_with_solvent
+
+        def stability(self, *args, **kwargs):
+            # When computing the orbital Hessian, the second-order derivatives
+            # of the solvent energy need to be computed. This is enabled by
+            # the attribute equilibrium_solvation in the gen_response method.
+            # If solvent was frozen, its contribution is treated as the
+            # external potential. The response of solvent does not need to
+            # be considered in stability analysis.
+            with lib.temporary_env(self.with_solvent,
+                                   equilibrium_solvation=not self.with_solvent.frozen):
+                return oldMF.stability(self, *args, **kwargs)
+
+    mf1 = SCFWithSolvent(mf, solvent_obj)
+    return mf1
\ No newline at end of file
diff --git a/gpu4pyscf/solvent/grad/pcm.py b/gpu4pyscf/solvent/grad/pcm.py
index a1f2b689..7c9d0047 100644
--- a/gpu4pyscf/solvent/grad/pcm.py
+++ b/gpu4pyscf/solvent/grad/pcm.py
@@ -19,24 +19,20 @@
 # pylint: disable=C0103
 
 import numpy
-import scipy
-import ctypes
+import cupy
+from cupyx import scipy
 from pyscf import lib
 from pyscf.lib import logger
 from pyscf import gto, df
-from pyscf.dft import gen_grid
-from pyscf.data import radii
-from pyscf.solvent import ddcosmo
-from pyscf.solvent import _attach_solvent
-
-from gpu4pyscf.solvent import pcm
+from pyscf.grad import rhf as rhf_grad
 from gpu4pyscf.solvent.pcm import PI, switch_h
+from gpu4pyscf.df import int3c2e
 
 libdft = lib.load_library('libdft')
 
 def grad_switch_h(x):
     ''' first derivative of h(x)'''
-    dy = 30.0*x**2 - 60.0*x**3 + 30.0*x**4 
+    dy = 30.0*x**2 - 60.0*x**3 + 30.0*x**4
     dy[x<0] = 0.0
     dy[x>1] = 0.0
     return dy
@@ -62,15 +58,15 @@ def get_dF_dA(surface):
 
     ngrids = grid_coords.shape[0]
     natom = atom_coords.shape[0]
-    dF = numpy.zeros([ngrids, natom, 3])
-    dA = numpy.zeros([ngrids, natom, 3])
-    
+    dF = cupy.zeros([ngrids, natom, 3])
+    dA = cupy.zeros([ngrids, natom, 3])
+
     for ia in range(atom_coords.shape[0]):
         p0,p1 = surface['gslice_by_atom'][ia]
         coords = grid_coords[p0:p1]
         p1 = p0 + coords.shape[0]
-        ri_rJ = numpy.expand_dims(coords, axis=1) - atom_coords
-        riJ = numpy.linalg.norm(ri_rJ, axis=-1)
+        ri_rJ = cupy.expand_dims(coords, axis=1) - atom_coords
+        riJ = cupy.linalg.norm(ri_rJ, axis=-1)
         diJ = (riJ - R_in_J) / R_sw_J
         diJ[:,ia] = 1.0
         diJ[diJ < 1e-8] = 0.0
@@ -79,25 +75,25 @@ def get_dF_dA(surface):
 
         fiJ = switch_h(diJ)
         dfiJ = grad_switch_h(diJ) / (fiJ * riJ * R_sw_J)
-        dfiJ = numpy.expand_dims(dfiJ, axis=-1) * ri_rJ
+        dfiJ = cupy.expand_dims(dfiJ, axis=-1) * ri_rJ
 
         Fi = switch_fun[p0:p1]
         Ai = area[p0:p1]
-        
+
         # grids response
-        Fi = numpy.expand_dims(Fi, axis=-1)
-        Ai = numpy.expand_dims(Ai, axis=-1)
-        dFi_grid = numpy.sum(dfiJ, axis=1)
-        
+        Fi = cupy.expand_dims(Fi, axis=-1)
+        Ai = cupy.expand_dims(Ai, axis=-1)
+        dFi_grid = cupy.sum(dfiJ, axis=1)
+
         dF[p0:p1,ia,:] += Fi * dFi_grid
         dA[p0:p1,ia,:] += Ai * dFi_grid
 
         # atom response
-        Fi = numpy.expand_dims(Fi, axis=-2)
-        Ai = numpy.expand_dims(Ai, axis=-2)
+        Fi = cupy.expand_dims(Fi, axis=-2)
+        Ai = cupy.expand_dims(Ai, axis=-2)
         dF[p0:p1,:,:] -= Fi * dfiJ
         dA[p0:p1,:,:] -= Ai * dfiJ
-    
+
     return dF, dA
 
 def get_dD_dS(surface, dF, with_S=True, with_D=False):
@@ -110,35 +106,35 @@ def get_dD_dS(surface, dF, with_S=True, with_D=False):
     norm_vec    = surface['norm_vec']
     switch_fun  = surface['switch_fun']
 
-    xi_i, xi_j = numpy.meshgrid(exponents, exponents, indexing='ij')
+    xi_i, xi_j = cupy.meshgrid(exponents, exponents, indexing='ij')
     xi_ij = xi_i * xi_j / (xi_i**2 + xi_j**2)**0.5
-    ri_rj = numpy.expand_dims(grid_coords, axis=1) - grid_coords
-    rij = numpy.linalg.norm(ri_rj, axis=-1)
+    ri_rj = cupy.expand_dims(grid_coords, axis=1) - grid_coords
+    rij = cupy.linalg.norm(ri_rj, axis=-1)
     xi_r_ij = xi_ij * rij
-    numpy.fill_diagonal(rij, 1)
-    
-    dS_dr = -(scipy.special.erf(xi_r_ij) - 2.0*xi_r_ij/PI**0.5*numpy.exp(-xi_r_ij**2))/rij**2
-    numpy.fill_diagonal(dS_dr, 0)
-    
-    dS_dr= numpy.expand_dims(dS_dr, axis=-1)
-    drij = ri_rj/numpy.expand_dims(rij, axis=-1)
+    cupy.fill_diagonal(rij, 1)
+
+    dS_dr = -(scipy.special.erf(xi_r_ij) - 2.0*xi_r_ij/PI**0.5*cupy.exp(-xi_r_ij**2))/rij**2
+    cupy.fill_diagonal(dS_dr, 0)
+
+    dS_dr= cupy.expand_dims(dS_dr, axis=-1)
+    drij = ri_rj/cupy.expand_dims(rij, axis=-1)
     dS = dS_dr * drij
 
     dD = None
     if with_D:
-        nj_rij = numpy.sum(ri_rj * norm_vec, axis=-1)
-        dD_dri = 4.0*xi_r_ij**2 * xi_ij / PI**0.5 * numpy.exp(-xi_r_ij**2) * nj_rij / rij**3
-        numpy.fill_diagonal(dD_dri, 0.0)
-        
-        rij = numpy.expand_dims(rij, axis=-1)
-        nj_rij = numpy.expand_dims(nj_rij, axis=-1)
-        nj = numpy.expand_dims(norm_vec, axis=0)
-        dD_dri = numpy.expand_dims(dD_dri, axis=-1)
-        
+        nj_rij = cupy.sum(ri_rj * norm_vec, axis=-1)
+        dD_dri = 4.0*xi_r_ij**2 * xi_ij / PI**0.5 * cupy.exp(-xi_r_ij**2) * nj_rij / rij**3
+        cupy.fill_diagonal(dD_dri, 0.0)
+
+        rij = cupy.expand_dims(rij, axis=-1)
+        nj_rij = cupy.expand_dims(nj_rij, axis=-1)
+        nj = cupy.expand_dims(norm_vec, axis=0)
+        dD_dri = cupy.expand_dims(dD_dri, axis=-1)
+
         dD = dD_dri * drij + dS_dr * (-nj/rij + 3.0*nj_rij/rij**2 * drij)
 
     dSii_dF = -exponents * (2.0/PI)**0.5 / switch_fun**2
-    dSii = numpy.expand_dims(dSii_dF, axis=(1,2)) * dF
+    dSii = cupy.expand_dims(dSii_dF, axis=(1,2)) * dF
 
     return dD, dS, dSii
 
@@ -148,8 +144,7 @@ def grad_kernel(pcmobj, dm):
     v^T* d(K^-1 R)v = v^T*K^-1(dR - dK K^-1R)v = v^T K^-1(dR - dK q)
     '''
     mol = pcmobj.mol
-    nao = mol.nao
-    aoslice = mol.aoslice_by_atom()
+
     gridslice    = pcmobj.surface['gslice_by_atom']
     grid_coords  = pcmobj.surface['grid_coords']
     exponents    = pcmobj.surface['charge_exp']
@@ -161,53 +156,53 @@ def grad_kernel(pcmobj, dm):
     q            = pcmobj._intermediates['q']
     q_sym        = pcmobj._intermediates['q_sym']
 
-    vK_1 = numpy.linalg.solve(K.T, v_grids)
+    vK_1 = cupy.linalg.solve(K.T, v_grids)
 
     # ----------------- potential response -----------------------
-    max_memory = pcmobj.max_memory - lib.current_memory()[0]
-    blksize = int(max(max_memory*.9e6/8/nao**2, 400))
-    ngrids = grid_coords.shape[0]
     atom_coords = mol.atom_coords(unit='B')
 
-    dvj = numpy.zeros([nao,3])
-    dq = numpy.zeros([ngrids,3])
-    for p0, p1 in lib.prange(0, ngrids, blksize):
-        fakemol = gto.fakemol_for_charges(grid_coords[p0:p1], expnt=exponents**2)
-        # charge response
-        v_nj_ip1 = df.incore.aux_e2(mol, fakemol, intor='int3c2e_ip1', aosym='s1', comp=3)
-        vj = numpy.einsum('xijn,n->xij', v_nj_ip1, q_sym)
-        dvj += numpy.einsum('xij,ij->ix', vj, dm)
-        dvj += numpy.einsum('xij,ji->ix', vj, dm)
-
-        # electronic potential response
-        v_nj_ip2 = df.incore.aux_e2(mol, fakemol, intor='int3c2e_ip2', aosym='s1', comp=3)
-        dq_slice = numpy.einsum('xijn,ij->nx', v_nj_ip2, dm)
-        dq[p0:p1] = numpy.einsum('nx,n->nx', dq_slice, q_sym[p0:p1])
-
-    de = numpy.zeros_like(atom_coords)        
-    de += numpy.asarray([numpy.sum(dq[p0:p1], axis=0) for p0,p1 in gridslice])
-    de += numpy.asarray([numpy.sum(dvj[p0:p1], axis=0) for p0,p1 in aoslice[:,2:]])
-    
+    intopt = pcmobj.intopt
+    intopt.clear()
+    # rebuild with aosym=False
+    intopt.build(1e-14, diag_block_with_triu=True, aosym=False)
+    coeff = intopt.coeff
+    dm_cart = cupy.einsum('pi,ij,qj->pq', coeff, dm, coeff)
+
+    dvj, _ = int3c2e.get_int3c2e_ip_jk(intopt, 0, 'ip1', q_sym, None, dm_cart)
+    dq, _ = int3c2e.get_int3c2e_ip_jk(intopt, 0, 'ip2', q_sym, None, dm_cart)
+
+    cart_ao_idx = intopt.cart_ao_idx
+    rev_cart_ao_idx = numpy.argsort(cart_ao_idx)
+    dvj = dvj[:,rev_cart_ao_idx]
+
+    aoslice = intopt.mol.aoslice_by_atom()
+    dq = cupy.asarray([cupy.sum(dq[:,p0:p1], axis=1) for p0,p1 in gridslice])
+    dvj= 2.0 * cupy.asarray([cupy.sum(dvj[:,p0:p1], axis=1) for p0,p1 in aoslice[:,2:]])
+    de = dq + dvj
+
     atom_charges = mol.atom_charges()
     fakemol_nuc = gto.fakemol_for_charges(atom_coords)
-    
+    fakemol = gto.fakemol_for_charges(grid_coords.get(), expnt=exponents.get()**2)
+
     # nuclei response
     int2c2e_ip1 = mol._add_suffix('int2c2e_ip1')
     v_ng_ip1 = gto.mole.intor_cross(int2c2e_ip1, fakemol_nuc, fakemol)
-    dv_g = numpy.einsum('g,xng->nx', q_sym, v_ng_ip1)
-    de -= numpy.einsum('nx,n->nx', dv_g, atom_charges)
+    v_ng_ip1 = cupy.asarray(v_ng_ip1)
+    dv_g = cupy.einsum('g,xng->nx', q_sym, v_ng_ip1)
+    de -= cupy.einsum('nx,n->nx', dv_g, atom_charges)
 
     # nuclei potential response
     int2c2e_ip2 = mol._add_suffix('int2c2e_ip2')
     v_ng_ip2 = gto.mole.intor_cross(int2c2e_ip2, fakemol_nuc, fakemol)
-    dv_g = numpy.einsum('n,xng->gx', atom_charges, v_ng_ip2)
-    dv_g = numpy.einsum('gx,g->gx', dv_g, q_sym)
-    de -= numpy.asarray([numpy.sum(dv_g[p0:p1], axis=0) for p0,p1 in gridslice])
-    
+    v_ng_ip2 = cupy.asarray(v_ng_ip2)
+    dv_g = cupy.einsum('n,xng->gx', atom_charges, v_ng_ip2)
+    dv_g = cupy.einsum('gx,g->gx', dv_g, q_sym)
+    de -= cupy.asarray([cupy.sum(dv_g[p0:p1], axis=0) for p0,p1 in gridslice])
+
     ## --------------- response from stiffness matrices ----------------
     gridslice = pcmobj.surface['gslice_by_atom']
     dF, dA = get_dF_dA(pcmobj.surface)
-    
+
     with_D = pcmobj.method.upper() == 'IEF-PCM' or pcmobj.method.upper() == 'SS(V)PE'
     dD, dS, dSii = get_dD_dS(pcmobj.surface, dF, with_D=with_D, with_S=True)
 
@@ -215,57 +210,57 @@ def grad_kernel(pcmobj, dm):
         DA = D*A
 
     epsilon = pcmobj.eps
-    
+
     #de_dF = v0 * -dSii_dF * q
     #de += 0.5*numpy.einsum('i,inx->nx', de_dF, dF)
     # dQ = v^T K^-1 (dR - dK K^-1 R) v
     if pcmobj.method.upper() == 'C-PCM' or pcmobj.method.upper() == 'COSMO':
         # dR = 0, dK = dS
-        de_dS = numpy.einsum('i,ijx,j->ix', vK_1, dS, q)
-        de -= numpy.asarray([numpy.sum(de_dS[p0:p1], axis=0) for p0,p1, in gridslice])
-        de -= 0.5*numpy.einsum('i,ijx,i->jx', vK_1, dSii, q)
-    
+        de_dS = cupy.einsum('i,ijx,j->ix', vK_1, dS, q)
+        de -= cupy.asarray([cupy.sum(de_dS[p0:p1], axis=0) for p0,p1, in gridslice])
+        de -= 0.5*cupy.einsum('i,ijx,i->jx', vK_1, dSii, q)
+
     elif pcmobj.method.upper() == 'IEF-PCM' or pcmobj.method.upper() == 'SS(V)PE':
         # IEF-PCM and SS(V)PE formally are the same in gradient calculation
-        # dR = f_eps/(2*pi) * (dD*A + D*dA), 
+        # dR = f_eps/(2*pi) * (dD*A + D*dA),
         # dK = dS - f_eps/(2*pi) * (dD*A*S + D*dA*S + D*A*dS)
         f_epsilon = (epsilon - 1.0)/(epsilon + 1.0)
         fac = f_epsilon/(2.0*PI)
 
         Av = A*v_grids
-        de_dR  = 0.5*fac * numpy.einsum('i,ijx,j->ix', vK_1, dD, Av)
-        de_dR -= 0.5*fac * numpy.einsum('i,ijx,j->jx', vK_1, dD, Av)
-        de_dR  = numpy.asarray([numpy.sum(de_dR[p0:p1], axis=0) for p0,p1 in gridslice])
-        de_dR += 0.5*fac * numpy.einsum('i,ij,jnx,j->nx', vK_1, D, dA, v_grids)
-        
-        de_dS0  = 0.5*numpy.einsum('i,ijx,j->ix', vK_1, dS, q)
-        de_dS0 -= 0.5*numpy.einsum('i,ijx,j->jx', vK_1, dS, q)
-        de_dS0  = numpy.asarray([numpy.sum(de_dS0[p0:p1], axis=0) for p0,p1 in gridslice])
-        de_dS0 += 0.5*numpy.einsum('i,inx,i->nx', vK_1, dSii, q)
-        
-        vK_1_DA = numpy.dot(vK_1, DA)
-        de_dS1  = 0.5*numpy.einsum('j,jkx,k->jx', vK_1_DA, dS, q)
-        de_dS1 -= 0.5*numpy.einsum('j,jkx,k->kx', vK_1_DA, dS, q)
-        de_dS1  = numpy.asarray([numpy.sum(de_dS1[p0:p1], axis=0) for p0,p1 in gridslice])
-        de_dS1 += 0.5*numpy.einsum('j,jnx,j->nx', vK_1_DA, dSii, q)
-
-        Sq = numpy.dot(S,q)
+        de_dR  = 0.5*fac * cupy.einsum('i,ijx,j->ix', vK_1, dD, Av)
+        de_dR -= 0.5*fac * cupy.einsum('i,ijx,j->jx', vK_1, dD, Av)
+        de_dR  = cupy.asarray([cupy.sum(de_dR[p0:p1], axis=0) for p0,p1 in gridslice])
+        de_dR += 0.5*fac * cupy.einsum('i,ij,jnx,j->nx', vK_1, D, dA, v_grids)
+
+        de_dS0  = 0.5*cupy.einsum('i,ijx,j->ix', vK_1, dS, q)
+        de_dS0 -= 0.5*cupy.einsum('i,ijx,j->jx', vK_1, dS, q)
+        de_dS0  = cupy.asarray([cupy.sum(de_dS0[p0:p1], axis=0) for p0,p1 in gridslice])
+        de_dS0 += 0.5*cupy.einsum('i,inx,i->nx', vK_1, dSii, q)
+
+        vK_1_DA = cupy.dot(vK_1, DA)
+        de_dS1  = 0.5*cupy.einsum('j,jkx,k->jx', vK_1_DA, dS, q)
+        de_dS1 -= 0.5*cupy.einsum('j,jkx,k->kx', vK_1_DA, dS, q)
+        de_dS1  = cupy.asarray([cupy.sum(de_dS1[p0:p1], axis=0) for p0,p1 in gridslice])
+        de_dS1 += 0.5*cupy.einsum('j,jnx,j->nx', vK_1_DA, dSii, q)
+
+        Sq = cupy.dot(S,q)
         ASq = A*Sq
-        de_dD  = 0.5*numpy.einsum('i,ijx,j->ix', vK_1, dD, ASq)
-        de_dD -= 0.5*numpy.einsum('i,ijx,j->jx', vK_1, dD, ASq)
-        de_dD  = numpy.asarray([numpy.sum(de_dD[p0:p1], axis=0) for p0,p1 in gridslice])
+        de_dD  = 0.5*cupy.einsum('i,ijx,j->ix', vK_1, dD, ASq)
+        de_dD -= 0.5*cupy.einsum('i,ijx,j->jx', vK_1, dD, ASq)
+        de_dD  = cupy.asarray([cupy.sum(de_dD[p0:p1], axis=0) for p0,p1 in gridslice])
 
-        vK_1_D = numpy.dot(vK_1, D)
-        de_dA = 0.5*numpy.einsum('j,jnx,j->nx', vK_1_D, dA, Sq)
+        vK_1_D = cupy.dot(vK_1, D)
+        de_dA = 0.5*cupy.einsum('j,jnx,j->nx', vK_1_D, dA, Sq)
 
         de_dK = de_dS0 - fac * (de_dD + de_dA + de_dS1)
         de += de_dR - de_dK
     else:
         raise RuntimeError(f"Unknown implicit solvent model: {pcmobj.method}")
-    
-    return de
-    
-def make_grad_object(mf, grad_method):
+
+    return de.get()
+
+def make_grad_object(grad_method):
     '''
     return solvent gradient object
     '''
@@ -281,15 +276,16 @@ def kernel(self, *args, dm=None, atmlst=None, **kwargs):
             dm = kwargs.pop('dm', None)
             if dm is None:
                 dm = self.base.make_rdm1(ao_repr=True)
-            
+
             self.de_solvent = grad_kernel(self.base.with_solvent, dm)
             self.de_solute = grad_method_class.kernel(self, *args, **kwargs)
             self.de = self.de_solute + self.de_solvent
-            
+
             if self.verbose >= logger.NOTE:
                 logger.note(self, '--------------- %s (+%s) gradients ---------------',
                             self.base.__class__.__name__,
                             self.base.with_solvent.__class__.__name__)
+                rhf_grad._write(self, self.mol, self.de, self.atmlst)
                 logger.note(self, '----------------------------------------------')
             return self.de
 
@@ -300,4 +296,4 @@ def _finalize(self):
 
     return WithSolventGrad(grad_method)
 
-pcm.PCM.nuc_grad_method = make_grad_object
\ No newline at end of file
+#pcm.PCM.nuc_grad_method = make_grad_object
\ No newline at end of file
diff --git a/gpu4pyscf/solvent/pcm.py b/gpu4pyscf/solvent/pcm.py
index aaed5922..b7a1d181 100644
--- a/gpu4pyscf/solvent/pcm.py
+++ b/gpu4pyscf/solvent/pcm.py
@@ -17,17 +17,18 @@
 PCM family solvent model
 '''
 # pylint: disable=C0103
-
-import numpy
-import scipy
 import ctypes
+import numpy
+import cupy
+import cupyx.scipy as scipy
 from pyscf import lib
 from pyscf.lib import logger
 from pyscf import gto, df
 from pyscf.dft import gen_grid
 from pyscf.data import radii
 from pyscf.solvent import ddcosmo
-from pyscf.solvent import _attach_solvent
+from gpu4pyscf.solvent import _attach_solvent
+from gpu4pyscf.df import int3c2e
 
 libdft = lib.load_library('libdft')
 
@@ -37,13 +38,9 @@ def pcm_for_scf(mf, solvent_obj=None, dm=None):
         solvent_obj = PCM(mf.mol)
     return _attach_solvent._for_scf(mf, solvent_obj, dm)
 
-
-# Inject ddPCM to other methods
-from pyscf import scf
-from pyscf import mcscf
-from pyscf import mp, ci, cc
-from pyscf import tdscf
-scf.hf.SCF.PCM    = scf.hf.SCF.PCM    = pcm_for_scf
+# Inject PCM to SCF, TODO: add it to other methods later
+from gpu4pyscf import scf
+scf.hf.RHF.PCM    = scf.hf.RHF.PCM    = pcm_for_scf
 
 # TABLE II,  J. Chem. Phys. 122, 194110 (2005)
 XI = {
@@ -85,7 +82,7 @@ def pcm_for_scf(mf, solvent_obj=None, dm=None):
 
 def switch_h(x):
     '''
-    switching function (eq. 3.19)  
+    switching function (eq. 3.19)
     J. Chem. Phys. 133, 244111 (2010)
     notice the typo in the paper
     '''
@@ -94,33 +91,20 @@ def switch_h(x):
     y[x>1] = 1.0
     return y
 
-def grad_switch_h(x):
-    ''' first derivative of h(x)'''
-    dy = 30.0*x**2 - 60.0*x**3 + 30.0*x**4 
-    dy[x<0] = 0.0
-    dy[x>1] = 0.0
-    return dy
-
-def gradgrad_switch_h(x):
-    ''' 2nd derivative of h(x) '''
-    ddy = 60.0*x - 180.0*x**2 + 120*x**3
-    ddy[x<0] = 0.0
-    ddy[x>1] = 0.0
-    return ddy
-
 def gen_surface(mol, ng=302, vdw_scale=1.2):
     '''J. Phys. Chem. A 1999, 103, 11060-11079'''
     unit_sphere = numpy.empty((ng,4))
     libdft.MakeAngularGrid(unit_sphere.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(ng))
+    unit_sphere = cupy.asarray(unit_sphere)
 
-    atom_coords = mol.atom_coords(unit='B')
+    atom_coords = cupy.asarray(mol.atom_coords(unit='B'))
     charges = mol.atom_charges()
-    N_J = ng * numpy.ones(mol.natm)
-    R_J = numpy.asarray([vdw_scale*Bondi[chg] for chg in charges])
+    N_J = ng * cupy.ones(mol.natm)
+    R_J = cupy.asarray([vdw_scale*Bondi[chg] for chg in charges])
     R_sw_J = R_J * (14.0 / N_J)**0.5
     alpha_J = 1.0/2.0 + R_J/R_sw_J - ((R_J/R_sw_J)**2 - 1.0/28)**0.5
     R_in_J = R_J - alpha_J * R_sw_J
-    
+
     grid_coords = []
     weights = []
     charge_exp = []
@@ -134,16 +118,17 @@ def gen_surface(mol, ng=302, vdw_scale=1.2):
         symb = mol.atom_symbol(ia)
         chg = gto.charge(symb)
         r_vdw = vdw_scale*Bondi[chg]
-        
+
         atom_grid = r_vdw * unit_sphere[:,:3] + atom_coords[ia,:]
-        riJ = scipy.spatial.distance.cdist(atom_grid[:,:3], atom_coords)
+        #riJ = scipy.spatial.distance.cdist(atom_grid[:,:3], atom_coords)
+        riJ = cupy.sum((atom_grid[:,None,:] - atom_coords[None,:,:])**2, axis=2)**0.5
         diJ = (riJ - R_in_J) / R_sw_J
         diJ[:,ia] = 1.0
-        diJ[diJ < 1e-8] = 0.0
+        diJ[diJ<1e-8] = 0.0
         fiJ = switch_h(diJ)
-        
+
         w = unit_sphere[:,3] * 4.0 * PI
-        swf = numpy.prod(fiJ, axis=1) 
+        swf = cupy.prod(fiJ, axis=1)
         idx = w*swf > 1e-16
 
         p0, p1 = p1, p1+sum(idx)
@@ -154,17 +139,17 @@ def gen_surface(mol, ng=302, vdw_scale=1.2):
         norm_vec.append(unit_sphere[idx,:3])
         xi = XI[ng] / (r_vdw * w[idx]**0.5)
         charge_exp.append(xi)
-        R_vdw.append(numpy.ones(sum(idx)) * r_vdw)
+        R_vdw.append(cupy.ones(idx.sum().get()) * r_vdw)
         area.append(w[idx]*r_vdw**2*swf[idx])
-    
-    grid_coords = numpy.vstack(grid_coords)
-    norm_vec = numpy.vstack(norm_vec)
-    weights = numpy.concatenate(weights)
-    charge_exp = numpy.concatenate(charge_exp)
-    switch_fun = numpy.concatenate(switch_fun)
-    area = numpy.concatenate(area)
-    R_vdw = numpy.concatenate(R_vdw)
-    
+
+    grid_coords = cupy.vstack(grid_coords)
+    norm_vec = cupy.vstack(norm_vec)
+    weights = cupy.concatenate(weights)
+    charge_exp = cupy.concatenate(charge_exp)
+    switch_fun = cupy.concatenate(switch_fun)
+    area = cupy.concatenate(area)
+    R_vdw = cupy.concatenate(R_vdw)
+
     surface = {
         'ng': ng,
         'gslice_by_atom': gslice_by_atom,
@@ -191,58 +176,6 @@ def get_F_A(surface):
     A = weights*R_vdw**2*switch_fun
     return switch_fun, A
 
-def get_dF_dA(surface):
-    '''
-    J. Chem. Phys. 133, 244111 (2010), Appendix C
-    '''
-
-    atom_coords = surface['atom_coords']
-    grid_coords = surface['grid_coords']
-    switch_fun  = surface['switch_fun']
-    area        = surface['area']
-    R_in_J      = surface['R_in_J']
-    R_sw_J      = surface['R_sw_J']
-
-    ngrids = grid_coords.shape[0]
-    natom = atom_coords.shape[0]
-    dF = numpy.zeros([ngrids, natom, 3])
-    dA = numpy.zeros([ngrids, natom, 3])
-    
-    for ia in range(atom_coords.shape[0]):
-        p0,p1 = surface['gslice_by_atom'][ia]
-        coords = grid_coords[p0:p1]
-        p1 = p0 + coords.shape[0]
-        ri_rJ = numpy.expand_dims(coords, axis=1) - atom_coords
-        riJ = numpy.linalg.norm(ri_rJ, axis=-1)
-        diJ = (riJ - R_in_J) / R_sw_J
-        diJ[:,ia] = 1.0
-        diJ[diJ < 1e-8] = 0.0
-        ri_rJ[:,ia,:] = 0.0
-        ri_rJ[diJ < 1e-8] = 0.0
-
-        fiJ = switch_h(diJ)
-        dfiJ = grad_switch_h(diJ) / (fiJ * riJ * R_sw_J)
-        dfiJ = numpy.expand_dims(dfiJ, axis=-1) * ri_rJ
-
-        Fi = switch_fun[p0:p1]
-        Ai = area[p0:p1]
-        
-        # grids response
-        Fi = numpy.expand_dims(Fi, axis=-1)
-        Ai = numpy.expand_dims(Ai, axis=-1)
-        dFi_grid = numpy.sum(dfiJ, axis=1)
-        
-        dF[p0:p1,ia,:] += Fi * dFi_grid
-        dA[p0:p1,ia,:] += Ai * dFi_grid
-
-        # atom response
-        Fi = numpy.expand_dims(Fi, axis=-2)
-        Ai = numpy.expand_dims(Ai, axis=-2)
-        dF[p0:p1,:,:] -= Fi * dfiJ
-        dA[p0:p1,:,:] -= Ai * dfiJ
-    
-    return dF, dA
-
 def get_D_S(surface, with_S=True, with_D=False):
     '''
     generate D and S matrix in  J. Chem. Phys. 133, 244111 (2010)
@@ -254,223 +187,24 @@ def get_D_S(surface, with_S=True, with_D=False):
     norm_vec    = surface['norm_vec']
     R_vdw       = surface['R_vdw']
 
-    xi_i, xi_j = numpy.meshgrid(charge_exp, charge_exp, indexing='ij')
+    xi_i, xi_j = cupy.meshgrid(charge_exp, charge_exp, indexing='ij')
     xi_ij = xi_i * xi_j / (xi_i**2 + xi_j**2)**0.5
-    rij = scipy.spatial.distance.cdist(grid_coords, grid_coords)
+    #rij = scipy.spatial.distance.cdist(grid_coords, grid_coords)
+    rij = cupy.sum((grid_coords[:,None,:] - grid_coords[None,:,:])**2, axis=2)**0.5
     xi_r_ij = xi_ij * rij
-    numpy.fill_diagonal(rij, 1)
+    cupy.fill_diagonal(rij, 1)
     S = scipy.special.erf(xi_r_ij) / rij
-    numpy.fill_diagonal(S, charge_exp * (2.0 / PI)**0.5 / switch_fun)
-    
+    cupy.fill_diagonal(S, charge_exp * (2.0 / PI)**0.5 / switch_fun)
+
     D = None
     if with_D:
-        drij = numpy.expand_dims(grid_coords, axis=1) - grid_coords
-        nrij = numpy.sum(drij * norm_vec, axis=-1)
-        
-        D = S*nrij/rij**2 -2.0*xi_r_ij/PI**0.5*numpy.exp(-xi_r_ij**2)*nrij/rij**3
-        numpy.fill_diagonal(D, -charge_exp * (2.0 / PI)**0.5 / (2.0 * R_vdw))
+        drij = cupy.expand_dims(grid_coords, axis=1) - grid_coords
+        nrij = cupy.sum(drij * norm_vec, axis=-1)
 
-    return D, S
+        D = S*nrij/rij**2 -2.0*xi_r_ij/PI**0.5*cupy.exp(-xi_r_ij**2)*nrij/rij**3
+        cupy.fill_diagonal(D, -charge_exp * (2.0 / PI)**0.5 / (2.0 * R_vdw))
 
-def get_dD_dS(surface, dF, with_S=True, with_D=False):
-    '''
-    derivative of D and S w.r.t grids, partial_i D_ij = -partial_j D_ij
-    S is symmetric, D is not
-    '''
-    grid_coords = surface['grid_coords']
-    exponents   = surface['charge_exp']
-    norm_vec    = surface['norm_vec']
-    switch_fun  = surface['switch_fun']
-
-    xi_i, xi_j = numpy.meshgrid(exponents, exponents, indexing='ij')
-    xi_ij = xi_i * xi_j / (xi_i**2 + xi_j**2)**0.5
-    ri_rj = numpy.expand_dims(grid_coords, axis=1) - grid_coords
-    rij = numpy.linalg.norm(ri_rj, axis=-1)
-    xi_r_ij = xi_ij * rij
-    numpy.fill_diagonal(rij, 1)
-    
-    dS_dr = -(scipy.special.erf(xi_r_ij) - 2.0*xi_r_ij/PI**0.5*numpy.exp(-xi_r_ij**2))/rij**2
-    numpy.fill_diagonal(dS_dr, 0)
-    
-    dS_dr= numpy.expand_dims(dS_dr, axis=-1)
-    drij = ri_rj/numpy.expand_dims(rij, axis=-1)
-    dS = dS_dr * drij
-
-    dD = None
-    if with_D:
-        nj_rij = numpy.sum(ri_rj * norm_vec, axis=-1)
-        dD_dri = 4.0*xi_r_ij**2 * xi_ij / PI**0.5 * numpy.exp(-xi_r_ij**2) * nj_rij / rij**3
-        numpy.fill_diagonal(dD_dri, 0.0)
-        
-        rij = numpy.expand_dims(rij, axis=-1)
-        nj_rij = numpy.expand_dims(nj_rij, axis=-1)
-        nj = numpy.expand_dims(norm_vec, axis=0)
-        dD_dri = numpy.expand_dims(dD_dri, axis=-1)
-        
-        dD = dD_dri * drij + dS_dr * (-nj/rij + 3.0*nj_rij/rij**2 * drij)
-
-    dSii_dF = -exponents * (2.0/PI)**0.5 / switch_fun**2
-    dSii = numpy.expand_dims(dSii_dF, axis=(1,2)) * dF
-
-    return dD, dS, dSii
-
-def grad_kernel(pcmobj, dm):
-    '''
-    dE = 0.5*v* d(K^-1 R) *v + q*dv
-    v^T* d(K^-1 R)v = v^T*K^-1(dR - dK K^-1R)v = v^T K^-1(dR - dK q)
-    '''
-    mol = pcmobj.mol
-    nao = mol.nao
-    aoslice = mol.aoslice_by_atom()
-    gridslice    = pcmobj.surface['gslice_by_atom']
-    grid_coords  = pcmobj.surface['grid_coords']
-    exponents    = pcmobj.surface['charge_exp']
-    v_grids      = pcmobj._intermediates['v_grids']
-    A            = pcmobj._intermediates['A']
-    D            = pcmobj._intermediates['D']
-    S            = pcmobj._intermediates['S']
-    K            = pcmobj._intermediates['K']
-    q            = pcmobj._intermediates['q']
-    q_sym        = pcmobj._intermediates['q_sym']
-
-    vK_1 = numpy.linalg.solve(K.T, v_grids)
-
-    # ----------------- potential response -----------------------
-    max_memory = pcmobj.max_memory - lib.current_memory()[0]
-    blksize = int(max(max_memory*.9e6/8/nao**2, 400))
-    ngrids = grid_coords.shape[0]
-    atom_coords = mol.atom_coords(unit='B')
-
-    dvj = numpy.zeros([nao,3])
-    dq = numpy.zeros([ngrids,3])
-    for p0, p1 in lib.prange(0, ngrids, blksize):
-        fakemol = gto.fakemol_for_charges(grid_coords[p0:p1], expnt=exponents**2)
-        # charge response
-        v_nj_ip1 = df.incore.aux_e2(mol, fakemol, intor='int3c2e_ip1', aosym='s1', comp=3)
-        vj = numpy.einsum('xijn,n->xij', v_nj_ip1, q_sym)
-        dvj += numpy.einsum('xij,ij->ix', vj, dm)
-        dvj += numpy.einsum('xij,ji->ix', vj, dm)
-
-        # electronic potential response
-        v_nj_ip2 = df.incore.aux_e2(mol, fakemol, intor='int3c2e_ip2', aosym='s1', comp=3)
-        dq_slice = numpy.einsum('xijn,ij->nx', v_nj_ip2, dm)
-        dq[p0:p1] = numpy.einsum('nx,n->nx', dq_slice, q_sym[p0:p1])
-
-    de = numpy.zeros_like(atom_coords)        
-    de += numpy.asarray([numpy.sum(dq[p0:p1], axis=0) for p0,p1 in gridslice])
-    de += numpy.asarray([numpy.sum(dvj[p0:p1], axis=0) for p0,p1 in aoslice[:,2:]])
-    
-    atom_charges = mol.atom_charges()
-    fakemol_nuc = gto.fakemol_for_charges(atom_coords)
-    
-    # nuclei response
-    int2c2e_ip1 = mol._add_suffix('int2c2e_ip1')
-    v_ng_ip1 = gto.mole.intor_cross(int2c2e_ip1, fakemol_nuc, fakemol)
-    dv_g = numpy.einsum('g,xng->nx', q_sym, v_ng_ip1)
-    de -= numpy.einsum('nx,n->nx', dv_g, atom_charges)
-
-    # nuclei potential response
-    int2c2e_ip2 = mol._add_suffix('int2c2e_ip2')
-    v_ng_ip2 = gto.mole.intor_cross(int2c2e_ip2, fakemol_nuc, fakemol)
-    dv_g = numpy.einsum('n,xng->gx', atom_charges, v_ng_ip2)
-    dv_g = numpy.einsum('gx,g->gx', dv_g, q_sym)
-    de -= numpy.asarray([numpy.sum(dv_g[p0:p1], axis=0) for p0,p1 in gridslice])
-    
-    ## --------------- response from stiffness matrices ----------------
-    gridslice = pcmobj.surface['gslice_by_atom']
-    dF, dA = get_dF_dA(pcmobj.surface)
-    
-    with_D = pcmobj.method.upper() == 'IEF-PCM' or pcmobj.method.upper() == 'SS(V)PE'
-    dD, dS, dSii = get_dD_dS(pcmobj.surface, dF, with_D=with_D, with_S=True)
-
-    if pcmobj.method.upper() == 'IEF-PCM' or pcmobj.method.upper() == 'SS(V)PE':
-        DA = D*A
-
-    epsilon = pcmobj.eps
-    
-    #de_dF = v0 * -dSii_dF * q
-    #de += 0.5*numpy.einsum('i,inx->nx', de_dF, dF)
-    # dQ = v^T K^-1 (dR - dK K^-1 R) v
-    if pcmobj.method.upper() == 'C-PCM' or pcmobj.method.upper() == 'COSMO':
-        # dR = 0, dK = dS
-        de_dS = numpy.einsum('i,ijx,j->ix', vK_1, dS, q)
-        de -= numpy.asarray([numpy.sum(de_dS[p0:p1], axis=0) for p0,p1, in gridslice])
-        de -= 0.5*numpy.einsum('i,ijx,i->jx', vK_1, dSii, q)
-    
-    elif pcmobj.method.upper() == 'IEF-PCM' or pcmobj.method.upper() == 'SS(V)PE':
-        # IEF-PCM and SS(V)PE formally are the same in gradient calculation
-        # dR = f_eps/(2*pi) * (dD*A + D*dA), 
-        # dK = dS - f_eps/(2*pi) * (dD*A*S + D*dA*S + D*A*dS)
-        f_epsilon = (epsilon - 1.0)/(epsilon + 1.0)
-        fac = f_epsilon/(2.0*PI)
-
-        Av = A*v_grids
-        de_dR  = 0.5*fac * numpy.einsum('i,ijx,j->ix', vK_1, dD, Av)
-        de_dR -= 0.5*fac * numpy.einsum('i,ijx,j->jx', vK_1, dD, Av)
-        de_dR  = numpy.asarray([numpy.sum(de_dR[p0:p1], axis=0) for p0,p1 in gridslice])
-        de_dR += 0.5*fac * numpy.einsum('i,ij,jnx,j->nx', vK_1, D, dA, v_grids)
-        
-        de_dS0  = 0.5*numpy.einsum('i,ijx,j->ix', vK_1, dS, q)
-        de_dS0 -= 0.5*numpy.einsum('i,ijx,j->jx', vK_1, dS, q)
-        de_dS0  = numpy.asarray([numpy.sum(de_dS0[p0:p1], axis=0) for p0,p1 in gridslice])
-        de_dS0 += 0.5*numpy.einsum('i,inx,i->nx', vK_1, dSii, q)
-        
-        vK_1_DA = numpy.dot(vK_1, DA)
-        de_dS1  = 0.5*numpy.einsum('j,jkx,k->jx', vK_1_DA, dS, q)
-        de_dS1 -= 0.5*numpy.einsum('j,jkx,k->kx', vK_1_DA, dS, q)
-        de_dS1  = numpy.asarray([numpy.sum(de_dS1[p0:p1], axis=0) for p0,p1 in gridslice])
-        de_dS1 += 0.5*numpy.einsum('j,jnx,j->nx', vK_1_DA, dSii, q)
-
-        Sq = numpy.dot(S,q)
-        ASq = A*Sq
-        de_dD  = 0.5*numpy.einsum('i,ijx,j->ix', vK_1, dD, ASq)
-        de_dD -= 0.5*numpy.einsum('i,ijx,j->jx', vK_1, dD, ASq)
-        de_dD  = numpy.asarray([numpy.sum(de_dD[p0:p1], axis=0) for p0,p1 in gridslice])
-
-        vK_1_D = numpy.dot(vK_1, D)
-        de_dA = 0.5*numpy.einsum('j,jnx,j->nx', vK_1_D, dA, Sq)
-
-        de_dK = de_dS0 - fac * (de_dD + de_dA + de_dS1)
-        de += de_dR - de_dK
-    else:
-        raise RuntimeError(f"Unknown implicit solvent model: {pcmobj.method}")
-    
-    return de
-        
-def make_grad_object(grad_method):
-    '''
-    return solvent gradient object
-    '''
-    grad_method_class = grad_method.__class__
-    class WithSolventGrad(grad_method_class):
-        def __init__(self, grad_method):
-            self.__dict__.update(grad_method.__dict__)
-            self.de_solvent = None
-            self.de_solute = None
-            self._keys = self._keys.union(['de_solvent', 'de_solute'])
-
-        def kernel(self, *args, dm=None, atmlst=None, **kwargs):
-            dm = kwargs.pop('dm', None)
-            if dm is None:
-                dm = self.base.make_rdm1(ao_repr=True)
-            
-            self.de_solvent = grad_kernel(self.base.with_solvent, dm)
-            self.de_solute = grad_method_class.kernel(self, *args, **kwargs)
-            self.de = self.de_solute + self.de_solvent
-            
-            if self.verbose >= logger.NOTE:
-                logger.note(self, '--------------- %s (+%s) gradients ---------------',
-                            self.base.__class__.__name__,
-                            self.base.with_solvent.__class__.__name__)
-                logger.note(self, '----------------------------------------------')
-            return self.de
-
-        def _finalize(self):
-            # disable _finalize. It is called in grad_method.kernel method
-            # where self.de was not yet initialized.
-            pass
-
-    return WithSolventGrad(grad_method)
+    return D, S
 
 class PCM(ddcosmo.DDCOSMO):
     def __init__(self, mol):
@@ -481,10 +215,7 @@ def __init__(self, mol):
         self._intermediates = {}
 
     def dump_flags(self, verbose=None):
-        logger.info(self, '******** %s (In testing) ********', self.__class__)
-        logger.warn(self, 'ddPCM is an experimental feature. It is '
-                    'still in testing.\nFeatures and APIs may be changed '
-                    'in the future.')
+        logger.info(self, '******** %s ********', self.__class__)
         logger.info(self, 'lebedev_order = %s (%d grids per sphere)',
                     self.lebedev_order, gen_grid.LEBEDEV_ORDER[self.lebedev_order])
         logger.info(self, 'lmax = %s'         , self.lmax)
@@ -502,48 +233,65 @@ def build(self, ng=None):
         vdw_scale = self.vdw_scale
         self.radii_table = vdw_scale * Bondi
         mol = self.mol
-        if ng is None: 
+        if ng is None:
             ng = gen_grid.LEBEDEV_ORDER[self.lebedev_order]
-        
+
         self.surface = gen_surface(mol, ng=ng, vdw_scale=vdw_scale)
         self._intermediates = {}
         F, A = get_F_A(self.surface)
         D, S = get_D_S(self.surface, with_S=True, with_D=True)
-        
+
         epsilon = self.eps
         if self.method.upper() == 'C-PCM':
             f_epsilon = (epsilon-1.)/epsilon
             K = S
-            R = -f_epsilon * numpy.eye(K.shape[0])
+            R = -f_epsilon * cupy.eye(K.shape[0])
         elif self.method.upper() == 'COSMO':
             f_epsilon = (epsilon - 1.0)/(epsilon + 1.0/2.0)
             K = S
-            R = -f_epsilon * numpy.eye(K.shape[0])
+            R = -f_epsilon * cupy.eye(K.shape[0])
         elif self.method.upper() == 'IEF-PCM':
             f_epsilon = (epsilon - 1.0)/(epsilon + 1.0)
             DA = D*A
-            DAS = numpy.dot(DA, S)
+            DAS = cupy.dot(DA, S)
             K = S - f_epsilon/(2.0*PI) * DAS
-            R = -f_epsilon * (numpy.eye(K.shape[0]) - 1.0/(2.0*PI)*DA)            
+            R = -f_epsilon * (cupy.eye(K.shape[0]) - 1.0/(2.0*PI)*DA)
         elif self.method.upper() == 'SS(V)PE':
             f_epsilon = (epsilon - 1.0)/(epsilon + 1.0)
             DA = D*A
-            DAS = numpy.dot(DA, S)
+            DAS = cupy.dot(DA, S)
             K = S - f_epsilon/(4.0*PI) * (DAS + DAS.T)
-            R = -f_epsilon * (numpy.eye(K.shape[0]) - 1.0/(2.0*PI)*DA)
+            R = -f_epsilon * (cupy.eye(K.shape[0]) - 1.0/(2.0*PI)*DA)
         else:
             raise RuntimeError(f"Unknown implicit solvent model: {self.method}")
 
         intermediates = {
-            'S': S,
-            'D': D,
-            'A': A,
-            'K': K,
-            'R': R,
+            'S': cupy.asarray(S),
+            'D': cupy.asarray(D),
+            'A': cupy.asarray(A),
+            'K': cupy.asarray(K),
+            'R': cupy.asarray(R),
             'f_epsilon': f_epsilon
         }
         self._intermediates.update(intermediates)
 
+        charge_exp  = self.surface['charge_exp']
+        grid_coords = self.surface['grid_coords']
+        atom_coords = mol.atom_coords(unit='B')
+        atom_charges = mol.atom_charges()
+
+        # Move this to GPU
+        auxmol = gto.fakemol_for_charges(grid_coords.get(), expnt=charge_exp.get()**2)
+        intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e')
+        intopt.build(1e-14, diag_block_with_triu=False, aosym=True, group_size=256)
+        self.intopt = intopt
+
+        int2c2e = mol._add_suffix('int2c2e')
+        fakemol_nuc = gto.fakemol_for_charges(atom_coords)
+        v_ng = gto.mole.intor_cross(int2c2e, fakemol_nuc, auxmol)
+        v_grids_n = numpy.dot(atom_charges, v_ng)
+        self.v_grids_n = cupy.asarray(v_grids_n)
+
     def _get_vind(self, dms):
         if not self._intermediates or self.grids.coords is None:
             self.build()
@@ -553,16 +301,16 @@ def _get_vind(self, dms):
 
         K = self._intermediates['K']
         R = self._intermediates['R']
-        v_grids = self._get_v(self.surface, dms)
-        b = numpy.dot(R, v_grids)
-        q = numpy.linalg.solve(K, b)
+        v_grids = self._get_v(dms)
+        b = cupy.dot(R, v_grids)
+        q = cupy.linalg.solve(K, b)
 
-        vK_1 = numpy.linalg.solve(K.T, v_grids)
-        q_sym = (q + numpy.dot(R.T, vK_1))/2.0
+        vK_1 = cupy.linalg.solve(K.T, v_grids)
+        q_sym = (q + cupy.dot(R.T, vK_1))/2.0
 
         vmat = self._get_vmat(q_sym)
-        epcm = 0.5 * numpy.dot(q_sym, v_grids)
-        
+        epcm = 0.5 * cupy.dot(q_sym, v_grids)
+
         self._intermediates['K'] = K
         self._intermediates['R'] = R
         self._intermediates['q'] = q
@@ -571,54 +319,26 @@ def _get_vind(self, dms):
 
         return epcm, vmat
 
-    def _get_v(self, surface, dms):
+    def _get_v(self, dms):
         '''
         return electrostatic potential on surface
         '''
-        mol = self.mol
-        nao = dms.shape[-1]
-        atom_coords = mol.atom_coords(unit='B')
-        atom_charges = mol.atom_charges()
-        grid_coords = surface['grid_coords']
-        exponents   = surface['charge_exp']
-
-        max_memory = self.max_memory - lib.current_memory()[0]
-        blksize = int(max(max_memory*.9e6/8/nao**2, 400))
-        ngrids = grid_coords.shape[0]
-        int3c2e = mol._add_suffix('int3c2e')
-        cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e)
-        v_grids_e = numpy.empty(ngrids)
-        for p0, p1 in lib.prange(0, ngrids, blksize):
-            fakemol = gto.fakemol_for_charges(grid_coords[p0:p1], expnt=exponents**2)
-            v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e, aosym='s1', cintopt=cintopt)
-            v_grids_e[p0:p1] = numpy.einsum('ijL,ij->L',v_nj, dms[0])
-
-        int2c2e = mol._add_suffix('int2c2e')
-
-        fakemol_nuc = gto.fakemol_for_charges(atom_coords)
-        v_ng = gto.mole.intor_cross(int2c2e, fakemol_nuc, fakemol)
-        v_grids_n = numpy.dot(atom_charges, v_ng)
-        
-        v_grids = v_grids_n - v_grids_e
+        v_grids_e = 2.0*int3c2e.get_j_int3c2e_pass1(self.intopt, dms[0])
+        v_grids = self.v_grids_n - v_grids_e
         return v_grids
 
     def _get_vmat(self, q):
-        mol = self.mol
-        nao = mol.nao
-        grid_coords = self.surface['grid_coords']
-        exponents   = self.surface['charge_exp']
-        max_memory = self.max_memory - lib.current_memory()[0]
-        blksize = int(max(max_memory*.9e6/8/nao**2, 400))
-        ngrids = grid_coords.shape[0]
-        int3c2e = mol._add_suffix('int3c2e')
-        cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e)
-        vmat = numpy.zeros([nao,nao])
-        for p0, p1 in lib.prange(0, ngrids, blksize):
-            fakemol = gto.fakemol_for_charges(grid_coords[p0:p1], expnt=exponents**2)
-            v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e, aosym='s1', cintopt=cintopt)
-            vmat += -numpy.einsum('ijL,L->ij', v_nj, q[p0:p1])
-        return vmat
-    
+        return -int3c2e.get_j_int3c2e_pass2(self.intopt, q)
+
     def nuc_grad_method(self, grad_method):
-        return make_grad_object(grad_method)
+        from gpu4pyscf.solvent.grad import pcm as pcm_grad
+        if self.frozen:
+            raise RuntimeError('Frozen solvent model is not supported')
+        from gpu4pyscf import scf
+        if isinstance(grad_method.base, scf.hf.RHF):
+            return pcm_grad.make_grad_object(grad_method)
+        else:
+            raise RuntimeError('Only SCF gradient is supported')
 
+    def Hessian(self):
+        raise NotImplementedError('not implemented yet')
diff --git a/gpu4pyscf/solvent/tests/test_pcm.py b/gpu4pyscf/solvent/tests/test_pcm.py
index f0087a1a..2a78decc 100644
--- a/gpu4pyscf/solvent/tests/test_pcm.py
+++ b/gpu4pyscf/solvent/tests/test_pcm.py
@@ -15,13 +15,14 @@
 
 import unittest
 import numpy
-from pyscf import scf, gto, df
-from gpu4pyscf.solvent import pcm 
+from pyscf import gto, df
+from gpu4pyscf import scf
+from gpu4pyscf.solvent import pcm
 
 def setUpModule():
     global mol, epsilon, lebedev_order
     mol = gto.Mole()
-    mol.atom = ''' 
+    mol.atom = '''
 O       0.0000000000    -0.0000000000     0.1174000000
 H      -0.7570000000    -0.0000000000    -0.4696000000
 H       0.7570000000     0.0000000000    -0.4696000000
@@ -59,7 +60,7 @@ def test_COSMO(self):
         e_tot = mf.kernel()
         print(f"Energy error in COSMO: {numpy.abs(e_tot - -74.96900351922464)}")
         assert numpy.abs(e_tot - -74.96900351922464) < 1e-9
-    
+
     def test_IEFPCM(self):
         cm = pcm.PCM(mol)
         cm.eps = epsilon
@@ -70,7 +71,7 @@ def test_IEFPCM(self):
         e_tot = mf.kernel()
         print(f"Energy error in IEF-PCM: {numpy.abs(e_tot - -74.9690111344)}")
         assert numpy.abs(e_tot - -74.9690111344) < 1e-9
-    
+
     def test_SSVPE(self):
         cm = pcm.PCM(mol)
         cm.eps = epsilon
@@ -81,7 +82,7 @@ def test_SSVPE(self):
         e_tot = mf.kernel()
         print(f"Energy error in SS(V)PE: {numpy.abs(e_tot - -74.9689577454)}")
         assert numpy.abs(e_tot - -74.9689577454) < 1e-9
-    
+
 if __name__ == "__main__":
     print("Full Tests for PCMs")
     unittest.main()
\ No newline at end of file
diff --git a/gpu4pyscf/solvent/tests/test_pcm_grad.py b/gpu4pyscf/solvent/tests/test_pcm_grad.py
index 732aa5d2..677aa285 100644
--- a/gpu4pyscf/solvent/tests/test_pcm_grad.py
+++ b/gpu4pyscf/solvent/tests/test_pcm_grad.py
@@ -15,14 +15,15 @@
 
 import unittest
 import numpy
-from pyscf import scf, gto, df
-from gpu4pyscf.solvent import pcm 
+from pyscf import gto
+from gpu4pyscf import scf
+from gpu4pyscf.solvent import pcm
 from gpu4pyscf.solvent.grad import pcm as pcm_grad
 
 def setUpModule():
     global mol, epsilon, lebedev_order
     mol = gto.Mole()
-    mol.atom = ''' 
+    mol.atom = '''
 O       0.0000000000    -0.0000000000     0.1174000000
 H      -0.7570000000    -0.0000000000    -0.4696000000
 H       0.7570000000     0.0000000000    -0.4696000000
@@ -44,10 +45,10 @@ def test_dA_dF(self):
         cm.lebedev_order = 3
         cm.method = 'IEF-PCM'
         cm.build()
-        
+
         dF, dA = pcm_grad.get_dF_dA(cm.surface)
         dD, dS, dSii = pcm_grad.get_dD_dS(cm.surface, dF, with_S=True, with_D=True)
-        
+
         def get_FADS(mol):
             mol.build()
             cm = pcm.PCM(mol)
@@ -59,7 +60,7 @@ def get_FADS(mol):
             D = cm._intermediates['D']
             S = cm._intermediates['S']
             return F, A, D, S
-        
+
         eps = 1e-5
         for ia in range(mol.natm):
             p0,p1 = cm.surface['gslice_by_atom'][ia]
@@ -84,7 +85,7 @@ def get_FADS(mol):
 
                 assert numpy.linalg.norm(dF0 - dF[:,ia,j]) < 1e-8
                 assert numpy.linalg.norm(dA0 - dA[:,ia,j]) < 1e-8
-                
+
                 # the diagonal entries are calcualted separately
                 assert numpy.linalg.norm(dSii[:,ia,j] - numpy.diag(dS0)) < 1e-8
                 numpy.fill_diagonal(dS0, 0)
@@ -93,12 +94,12 @@ def get_FADS(mol):
                 dS_ia[p0:p1] = dS[p0:p1,:,j]
                 dS_ia[:,p0:p1] -= dS[:,p0:p1,j]
                 assert numpy.linalg.norm(dS0 - dS_ia) < 1e-8
-            
+
                 dD_ia = numpy.zeros_like(dD0)
                 dD_ia[p0:p1] = dD[p0:p1,:,j]
                 dD_ia[:,p0:p1] -= dD[:,p0:p1,j]
                 assert numpy.linalg.norm(dD0 - dD_ia) < 1e-8
-    
+
     def test_grad_CPCM(self):
         cm = pcm.PCM(mol)
         cm.eps = epsilon
@@ -117,10 +118,10 @@ def test_grad_CPCM(self):
              [0.49773047433563E-15,  -0.12128126037559E-15,  -0.58936988992306E-01],
              [0.22810111996954E-01,  -0.68951901317025E-17,   0.29468494708267E-01],
             [-0.22810111996957E-01,   0.12949813945902E-15,   0.29468494708266E-01]])
-        
+
         print(f"Gradient error in CPCM: {numpy.linalg.norm(g0 - grad)}")
         assert numpy.linalg.norm(g0 - grad) < 1e-9
-    
+
     def test_grad_COSMO(self):
         cm = pcm.PCM(mol)
         cm.eps = epsilon
@@ -134,15 +135,15 @@ def test_grad_COSMO(self):
 
         g = mf.nuc_grad_method()
         grad = g.kernel()
-        
+
         g0 = numpy.asarray(
             [[-1.33560836e-16,  8.70874355e-17, -5.89638726e-02],
              [ 2.28202396e-02,  2.63784344e-17,  2.94819363e-02],
              [-2.28202396e-02, -1.08799896e-16,  2.94819363e-02]])
-        
+
         print(f"Gradient error in COSMO: {numpy.linalg.norm(g0 - grad)}")
         assert numpy.linalg.norm(g0 - grad) < 1e-9
-    
+
     def test_grad_IEFPCM(self):
         cm = pcm.PCM(mol)
         cm.eps = epsilon
@@ -153,17 +154,17 @@ def test_grad_IEFPCM(self):
         mf.verbose = 0
         mf.conv_tol = 1e-12
         e_tot = mf.kernel()
-        
+
         g = mf.nuc_grad_method()
         grad = g.kernel()
-        
+
         g0 = numpy.asarray([
              [0.18357915015649E-14,   0.14192681822347E-15,  -0.58988087999658E-01],
              [0.22822709179063E-01,  -0.10002010417168E-15,   0.29494044211805E-01],
             [-0.22822709179066E-01,  -0.31051364515588E-16,   0.29494044211806E-01]])
         print(f"Gradient error in IEFPCM: {numpy.linalg.norm(g0 - grad)}")
         assert numpy.linalg.norm(g0 - grad) < 1e-9
-    
+
     def test_grad_SSVPE(self):
         cm = pcm.PCM(mol)
         cm.eps = epsilon
@@ -177,14 +178,14 @@ def test_grad_SSVPE(self):
 
         g = mf.nuc_grad_method()
         grad = g.kernel()
-        
+
         g0 = numpy.asarray([
              [0.76104817971710E-15,   0.11185701540547E-15,  -0.58909172879217E-01],
              [0.22862990009767E-01,  -0.13861633974903E-15,   0.29454586651678E-01],
             [-0.22862990009769E-01,   0.34988765678591E-16,   0.29454586651679E-01]])
         print(f"Gradient error in SS(V)PE: {numpy.linalg.norm(g0 - grad)}")
         assert numpy.linalg.norm(g0 - grad) < 1e-9
-    
+
 if __name__ == "__main__":
     print("Full Tests for Gradient of PCMs")
     unittest.main()
\ No newline at end of file

From 98741874341b546d881d9e6143d435fe86596baf Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <wxj6000@gmail.com>
Date: Tue, 17 Oct 2023 20:05:40 -0700
Subject: [PATCH 07/19] Hotfix 0.6.1 (#49)

* numpy -> cupy for solvent

* for linter

* remove grad switch from pcm.py

* passed flake8

* solvent integrals on GPU

* flake8

* compatible with pyscf-2.4.0
---
 gpu4pyscf/__init__.py                 |   2 +-
 gpu4pyscf/dft/gen_grid.py             | 129 ++++----------------------
 gpu4pyscf/grad/rks.py                 |  38 ++++----
 gpu4pyscf/grad/tests/test_rks_grad.py |  27 +++---
 4 files changed, 53 insertions(+), 143 deletions(-)

diff --git a/gpu4pyscf/__init__.py b/gpu4pyscf/__init__.py
index a52a096a..143af69c 100644
--- a/gpu4pyscf/__init__.py
+++ b/gpu4pyscf/__init__.py
@@ -1,2 +1,2 @@
 from . import lib, grad, hessian, solvent, scf, dft
-__version__ = '0.6.1'
\ No newline at end of file
+__version__ = '0.6.2'
diff --git a/gpu4pyscf/dft/gen_grid.py b/gpu4pyscf/dft/gen_grid.py
index 8e70d549..ea76e2f8 100644
--- a/gpu4pyscf/dft/gen_grid.py
+++ b/gpu4pyscf/dft/gen_grid.py
@@ -31,7 +31,6 @@
 import cupy
 from pyscf import lib
 from pyscf.lib import logger
-#from pyscf.dft import radi
 from pyscf import gto
 from pyscf.gto.eval_gto import BLKSIZE, NBINS, CUTOFF, make_screen_index
 from pyscf import __config__
@@ -262,7 +261,7 @@ def gen_atomic_grids(mol, atom_grid={}, radi_method=radi.gauss_chebyshev,
                 vol.append(cupy.einsum('i,j->ji', rad_weight[idx], grid[:,3]).ravel())
 
             atom_grids_tab[symb] = (cupy.vstack(coords), cupy.hstack(vol))
-            
+
     return atom_grids_tab
 
 def get_partition(mol, atom_grids_tab,
@@ -291,26 +290,6 @@ def get_partition(mol, atom_grids_tab,
         (radii_adjust is radi.treutler_atomic_radii_adjust or
          radii_adjust is radi.becke_atomic_radii_adjust or
          f_radii_adjust is None)):
-        '''
-        if f_radii_adjust is None:
-            p_radii_table = lib.c_null_ptr()
-        else:
-            f_radii_table = numpy.asarray([f_radii_adjust(i, j, 0)
-                                           for i in range(mol.natm)
-                                           for j in range(mol.natm)])
-            p_radii_table = f_radii_table.ctypes.data_as(ctypes.c_void_p)
-
-        def gen_grid_partition0(coords):
-            coords = numpy.asarray(coords, order='F')
-            ngrids = coords.shape[0]
-            pbecke = numpy.empty((mol.natm,ngrids))
-            libdft.VXCgen_grid(pbecke.ctypes.data_as(ctypes.c_void_p),
-                               coords.ctypes.data_as(ctypes.c_void_p),
-                               atm_coords.ctypes.data_as(ctypes.c_void_p),
-                               p_radii_table,
-                               ctypes.c_int(mol.natm), ctypes.c_int(ngrids))
-            return pbecke
-        '''
         def gen_grid_partition(coords):
             grid_dist = cupy.linalg.norm(coords[None,:,:] - atm_coords[:,None,:], axis=-1)
             r12 = grid_dist[:,None,:] - grid_dist[None,:,:]
@@ -426,7 +405,8 @@ def _load_conf(mod, name, default):
     else:
         return var
 
-class Grids(lib.StreamObject):
+from pyscf.dft import gen_grid
+class Grids(gen_grid.Grids):
     '''DFT mesh grids
 
     Attributes for Grids:
@@ -501,30 +481,6 @@ class Grids(lib.StreamObject):
 
     alignment = ALIGNMENT_UNIT
     cutoff = CUTOFF
-    
-    def __init__(self, mol):
-        self.mol = mol
-        self.stdout = mol.stdout
-        self.verbose = mol.verbose
-        self.symmetry = mol.symmetry
-        self.atom_grid = {}
-
-##################################################
-# don't modify the following attributes, they are not input options
-        self.non0tab = None
-        # Integral screen index ~= NBINS + log(ao).
-        # screen_index > 0 for non-zero AOs
-        self.screen_index = None
-        self.coords  = None
-        self.weights = None
-        self._keys = set(self.__dict__.keys()).update([
-            'atomic_radii', 'radii_adjust', 'radi_method', 'becke_scheme',
-            'prune', 'level', 'alignment', 'cutoff',
-        ])
-
-    @property
-    def size(self):
-        return getattr(self.weights, 'size', 0)
 
     def __setattr__(self, key, val):
         if key in ('atom_grid', 'atomic_radii', 'radii_adjust', 'radi_method',
@@ -532,20 +488,6 @@ def __setattr__(self, key, val):
             self.reset()
         super(Grids, self).__setattr__(key, val)
 
-    def dump_flags(self, verbose=None):
-        logger.info(self, 'radial grids: %s', self.radi_method.__doc__)
-        logger.info(self, 'becke partition: %s', self.becke_scheme.__doc__)
-        logger.info(self, 'pruning grids: %s', self.prune)
-        logger.info(self, 'grids dens level: %d', self.level)
-        logger.info(self, 'symmetrized grids: %s', self.symmetry)
-        if self.radii_adjust is not None:
-            logger.info(self, 'atomic radii adjust function: %s',
-                        self.radii_adjust)
-            logger.debug2(self, 'atomic_radii : %s', self.atomic_radii)
-        if self.atom_grid:
-            logger.info(self, 'User specified grid scheme %s', str(self.atom_grid))
-        return self
-
     def build(self, mol=None, with_non0tab=False, sort_grids=True, **kwargs):
         if mol is None: mol = self.mol
         if self.verbose >= logger.WARN:
@@ -564,10 +506,10 @@ def build(self, mol=None, with_non0tab=False, sort_grids=True, **kwargs):
             padding = _padding_size(self.size, self.alignment)
             logger.debug(self, 'Padding %d grids', padding)
             if padding > 0:
-                self.coords = numpy.vstack(
+                # cupy.vstack and cupy.hstack convert numpy array into cupy array first
+                self.coords = cupy.vstack(
                     [self.coords, numpy.repeat([[1e4]*3], padding, axis=0)])
-                self.weights = numpy.hstack([self.weights, numpy.zeros(padding)])
-
+                self.weights = cupy.hstack([self.weights, numpy.zeros(padding)])
         if with_non0tab:
             self.non0tab = self.make_mask(mol, self.coords)
             self.screen_index = self.non0tab
@@ -612,62 +554,27 @@ def prune_by_density_(self, rho, threshold=0):
             return self
 
         mol = self.mol
-        n = numpy.dot(rho, self.weights)
+        n = cupy.dot(rho, self.weights)
         if abs(n-mol.nelectron) < NELEC_ERROR_TOL*n:
             rho *= self.weights
             idx = abs(rho) > threshold / self.weights.size
             logger.debug(self, 'Drop grids %d',
-                         self.weights.size - numpy.count_nonzero(idx))
-            self.coords  = numpy.asarray(self.coords [idx], order='C')
-            self.weights = numpy.asarray(self.weights[idx], order='C')
+                         self.weights.size - cupy.count_nonzero(idx))
+            self.coords  = cupy.asarray(self.coords [idx], order='C')
+            self.weights = cupy.asarray(self.weights[idx], order='C')
             if self.alignment > 1:
                 padding = _padding_size(self.size, self.alignment)
                 logger.debug(self, 'prune_by_density_: %d padding grids', padding)
                 if padding > 0:
-                    self.coords = numpy.vstack(
-                        [self.coords, numpy.repeat([[1e4]*3], padding, axis=0)])
-                    self.weights = numpy.hstack([self.weights, numpy.zeros(padding)])
+                    self.coords = cupy.vstack(
+                        [self.coords, cupy.repeat([[1e4]*3], padding, axis=0)])
+                    self.weights = cupy.hstack([self.weights, cupy.zeros(padding)])
             self.non0tab = self.make_mask(mol, self.coords)
             self.screen_index = self.non0tab
         return self
 
-
-def _default_rad(nuc, level=3):
-    '''Number of radial grids '''
-    tab   = numpy.array( (2 , 10, 18, 36, 54, 86, 118))
-    period = (nuc > tab).sum()
-    return RAD_GRIDS[level,period]
-#                Period    1   2   3   4   5   6   7        # level
-RAD_GRIDS = numpy.array((( 10, 15, 20, 30, 35, 40, 50),     # 0
-                         ( 30, 40, 50, 60, 65, 70, 75),     # 1
-                         ( 40, 60, 65, 75, 80, 85, 90),     # 2
-                         ( 50, 75, 80, 90, 95,100,105),     # 3
-                         ( 60, 90, 95,105,110,115,120),     # 4
-                         ( 70,105,110,120,125,130,135),     # 5
-                         ( 80,120,125,135,140,145,150),     # 6
-                         ( 90,135,140,150,155,160,165),     # 7
-                         (100,150,155,165,170,175,180),     # 8
-                         (200,200,200,200,200,200,200),))   # 9
-
-def _default_ang(nuc, level=3):
-    '''Order of angular grids. See LEBEDEV_ORDER for the mapping of
-    the order and the number of angular grids'''
-    tab   = numpy.array( (2 , 10, 18, 36, 54, 86, 118))
-    period = (nuc > tab).sum()
-    return LEBEDEV_ORDER[ANG_ORDER[level,period]]
-#               Period    1   2   3   4   5   6   7         # level
-ANG_ORDER = numpy.array(((11, 15, 17, 17, 17, 17, 17 ),     # 0
-                         (17, 23, 23, 23, 23, 23, 23 ),     # 1
-                         (23, 29, 29, 29, 29, 29, 29 ),     # 2
-                         (29, 29, 35, 35, 35, 35, 35 ),     # 3
-                         (35, 41, 41, 41, 41, 41, 41 ),     # 4
-                         (41, 47, 47, 47, 47, 47, 47 ),     # 5
-                         (47, 53, 53, 53, 53, 53, 53 ),     # 6
-                         (53, 59, 59, 59, 59, 59, 59 ),     # 7
-                         (59, 59, 59, 59, 59, 59, 59 ),     # 8
-                         (65, 65, 65, 65, 65, 65, 65 ),))   # 9
-
-def _padding_size(ngrids, alignment):
-    if alignment <= 1:
-        return 0
-    return (ngrids + alignment - 1) // alignment * alignment - ngrids
+_default_rad = gen_grid._default_rad
+RAD_GRIDS = gen_grid.RAD_GRIDS
+_default_ang = gen_grid._default_ang
+ANG_ORDER = gen_grid.ANG_ORDER
+_padding_size = gen_grid._padding_size
diff --git a/gpu4pyscf/grad/rks.py b/gpu4pyscf/grad/rks.py
index 276e15b0..008baa40 100644
--- a/gpu4pyscf/grad/rks.py
+++ b/gpu4pyscf/grad/rks.py
@@ -50,7 +50,7 @@ def _get_veff(ks_grad, mol=None, dm=None):
         grids = ks_grad.grids
     else:
         grids = mf.grids
-    
+
     if grids.coords is None:
         grids.build(sort_grids=True)
 
@@ -89,7 +89,7 @@ def _get_veff(ks_grad, mol=None, dm=None):
     occ_coeff = cupy.asarray(mf.mo_coeff[:, mf.mo_occ>0.5], order='C')
     tmp = contract('nij,jk->nik', vxc, occ_coeff)
     vxc = 2.0*contract('nik,ik->ni', tmp, occ_coeff)
-    
+
     aoslices = mol.aoslice_by_atom()
     vxc = [vxc[:,p0:p1].sum(axis=1) for p0, p1 in aoslices[:,2:]]
     vxc = cupy.asarray(vxc)
@@ -116,7 +116,7 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
         opt = ni.gdftopt
     mo_occ = cupy.asarray(dms.mo_occ)
     mo_coeff = cupy.asarray(dms.mo_coeff)
-    
+
     coeff = cupy.asarray(opt.coeff)
     nao, nao0 = coeff.shape
     dms = cupy.asarray(dms)
@@ -124,7 +124,7 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
            for dm in dms.reshape(-1,nao0,nao0)]
     mo_coeff = coeff @ mo_coeff
     nset = len(dms)
-    
+
     with opt.gdft_envs_cache():
         if xctype == 'LDA':
             ao_deriv = 1
@@ -136,10 +136,10 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
         block_size = int((mem_avail*.4/8/(comp+1)/nao - 3*nao*2)/ ALIGNED) * ALIGNED
         block_size = min(block_size, MIN_BLK_SIZE)
         log.debug1('Available GPU mem %f Mb, block_size %d', mem_avail/1e6, block_size)
-        
+
         if block_size < ALIGNED:
             raise RuntimeError('Not enough GPU memory')
-        
+
         vmat = cupy.zeros((nset,3,nao,nao))
         if xctype == 'LDA':
             ao_deriv = 1
@@ -207,7 +207,7 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
     exc = None
     if nset == 1:
         vmat = vmat[0]
-    
+
     # - sign because nabla_X = -nabla_x
     return exc, -vmat
 
@@ -221,7 +221,7 @@ def get_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
 
     mo_occ = cupy.asarray(dms.mo_occ)
     mo_coeff = cupy.asarray(dms.mo_coeff)
-    
+
     coeff = cupy.asarray(opt.coeff)
     nao, nao0 = coeff.shape
     dms = cupy.asarray(dms)
@@ -255,9 +255,9 @@ def get_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
         wv = vv_vxc[:,p0:p1] * weight
         wv[0] *= .5  # *.5 because vmat + vmat.T at the end
         vmat += _gga_grad_sum_(ao, wv)
-    
+
     vmat = cupy.einsum('pi,npq,qj->nij', coeff, vmat, coeff)
-    
+
     exc = None
     # - sign because nabla_X = -nabla_x
     return exc, -vmat
@@ -288,7 +288,7 @@ def _d1_dot_(ao1, ao2):
     vmat1 = cupy.dot(ao1[1], ao2)
     vmat2 = cupy.dot(ao1[2], ao2)
     return cupy.stack([vmat0,vmat1,vmat2])
-    
+
 def _gga_grad_sum_(ao, wv):
     #:aow = numpy.einsum('npi,np->pi', ao[:4], wv[:4])
     aow = numint._scale_ao(ao[:4], wv[:4])
@@ -296,7 +296,7 @@ def _gga_grad_sum_(ao, wv):
     aow = _make_dR_dao_w(ao, wv[:4])
     vmat += _d1_dot_(aow, ao[0].T)
     return vmat
-    
+
 # XX, XY, XZ = 4, 5, 6
 # YX, YY, YZ = 5, 7, 8
 # ZX, ZY, ZZ = 6, 8, 9
@@ -342,10 +342,10 @@ def get_vxc_full_response(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
         block_size = int((mem_avail*.4/8/(comp+1)/nao - 3*nao*2)/ ALIGNED) * ALIGNED
         block_size = min(block_size, MIN_BLK_SIZE)
         log.debug1('Available GPU mem %f Mb, block_size %d', mem_avail/1e6, block_size)
-        
+
         if block_size < ALIGNED:
             raise RuntimeError('Not enough GPU memory')
-        
+
         for atm_id, (coords, weight, weight1) in enumerate(grids_response_cc(grids)):
             ngrids = weight.size
             for p0, p1 in lib.prange(0,ngrids,block_size):
@@ -371,7 +371,7 @@ def get_vxc_full_response(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
                     wv = weight[p0:p1] * vxc
                     wv[0] *= .5
                     wv[4] *= .5  # for the factor 1/2 in tau
-                    
+
                     vmat += _gga_grad_sum_(ao, wv)
                     vmat += _tau_grad_dot_(ao, wv[4])
 
@@ -502,7 +502,9 @@ def get_du(ia, ib):  # JCP 98, 5612 (1993); (B10)
 
 class Gradients(rhf_grad.Gradients, pyscf.grad.rks.Gradients):
     from gpu4pyscf.lib.utils import to_cpu, to_gpu, device
-    
+
+    get_veff = _get_veff
+
     def get_dispersion(self):
         if self.base.disp[:2].upper() == 'D3':
             from pyscf import lib
@@ -511,12 +513,12 @@ def get_dispersion(self):
                 d3 = disp.DFTD3Dispersion(self.mol, xc=self.base.xc, version=self.base.disp)
                 _, g_d3 = d3.kernel()
             return g_d3
-        
+
         if self.base.disp[:2].upper() == 'D4':
             from pyscf.data.elements import charge
             atoms = numpy.array([ charge(a[0]) for a in self.mol._atom])
             coords = self.mol.atom_coords()
-            
+
             from pyscf import lib
             with lib.with_omp_threads(1):
                 from dftd4.interface import DampingParam, DispersionModel
diff --git a/gpu4pyscf/grad/tests/test_rks_grad.py b/gpu4pyscf/grad/tests/test_rks_grad.py
index 04d7c7c8..59f36f14 100644
--- a/gpu4pyscf/grad/tests/test_rks_grad.py
+++ b/gpu4pyscf/grad/tests/test_rks_grad.py
@@ -40,7 +40,7 @@ def tearDownModule():
     global mol
     mol.stdout.close()
     del mol
-    
+
 def _check_grad(grid_response=False, xc='B3LYP', disp='d3bj', tol=1e-6):
     mf = rks.RKS(mol, xc=xc)
     mf.direct_scf_tol = 1e-14
@@ -50,11 +50,10 @@ def _check_grad(grid_response=False, xc='B3LYP', disp='d3bj', tol=1e-6):
     if mf._numint.libxc.is_nlc(mf.xc):
         mf.nlcgrids.level = nlcgrids_level
     mf.kernel()
-
     cpu_gradient = pyscf.grad.RKS(mf)
     cpu_gradient.grid_response = grid_response
     g_cpu = cpu_gradient.kernel()
-    
+
 
     # TODO: use to_gpu functionality
     mf.__class__ = gpu4pyscf.dft.rks.RKS
@@ -63,49 +62,51 @@ def _check_grad(grid_response=False, xc='B3LYP', disp='d3bj', tol=1e-6):
     mf.grids.level = grids_level
     mf.grids.prune = None
     mf.grids.small_rho_cutoff = 1e-30
+    mf.grids.build()
     if mf._numint.libxc.is_nlc(mf.xc):
         mf.nlcgrids = gpu4pyscf.dft.gen_grid.Grids(mol)
         mf.nlcgrids.level = nlcgrids_level
-    
+        mf.nlcgrids.build()
+
     gpu_gradient = gpu4pyscf.grad.RKS(mf)
     gpu_gradient.grid_response = grid_response
     g_gpu = gpu_gradient.kernel()
     assert(cupy.linalg.norm(g_cpu - g_gpu) < tol)
 
 class KnownValues(unittest.TestCase):
-    
+
     def test_grad_with_grids_response(self):
         print("-----testing DFT gradient with grids response----")
         _check_grad(grid_response=True, tol=1e-5)
-    
+
     def test_grad_without_grids_response(self):
         print('-----testing DFT gradient without grids response----')
         _check_grad(grid_response=False, tol=1e-5)
-    
+
     def test_grad_lda(self):
         print("-----LDA testing-------")
         _check_grad(xc='LDA', disp=None, tol=1e-5)
-    
+
     def test_grad_gga(self):
         print('-----GGA testing-------')
         _check_grad(xc='PBE', disp=None, tol=1e-5)
-    
+
     def test_grad_hybrid(self):
         print('------hybrid GGA testing--------')
         _check_grad(xc='B3LYP', disp=None, tol=1e-5)
-    
+
     def test_grad_mgga(self):
         print('-------mGGA testing-------------')
         _check_grad(xc='m06', disp=None, tol=1e-4)
-    
+
     def test_grad_rsh(self):
         print('--------RSH testing-------------')
         _check_grad(xc='wb97', disp=None, tol=1e-4)
-    
+
     def test_grad_nlc(self):
         print('--------nlc testing-------------')
         _check_grad(xc='HYB_MGGA_XC_WB97M_V', disp=None, tol=1e-5)
-    
+
 if __name__ == "__main__":
     print("Full Tests for Gradient")
     unittest.main()

From 4b1e36de4204b012f7cba6c2137fa5e55299b91b Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <wxj6000@gmail.com>
Date: Tue, 17 Oct 2023 23:31:53 -0700
Subject: [PATCH 08/19] added __init__.py files (#50)

---
 MANIFEST.in                      |  3 ++-
 gpu4pyscf/__init__.py            |  2 +-
 gpu4pyscf/df/__init__.py         | 15 +++++++++++++++
 gpu4pyscf/df/cderi.py            |  6 +++---
 gpu4pyscf/df/grad/__init__.py    | 19 +++++++++++++++++++
 gpu4pyscf/df/hessian/__init__.py | 19 +++++++++++++++++++
 gpu4pyscf/gto/__init__.py        | 14 ++++++++++++++
 gpu4pyscf/lib/__init__.py        | 15 +++++++++++++++
 8 files changed, 88 insertions(+), 5 deletions(-)
 create mode 100644 gpu4pyscf/df/__init__.py
 create mode 100644 gpu4pyscf/df/grad/__init__.py
 create mode 100644 gpu4pyscf/df/hessian/__init__.py
 create mode 100644 gpu4pyscf/gto/__init__.py

diff --git a/MANIFEST.in b/MANIFEST.in
index bb38fb88..c53a5f21 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -2,13 +2,14 @@ include MANIFEST.in
 include README.md setup.py CHANGELOG AUTHORS LICENSE NOTICE
 
 global-exclude *.py[cod]
+global-exclude *~
 #global-exclude *.cu
 #global-exclude *.h
 #global-exclude *.c
 #global-exclude *.cuh
 #global-exclude *.sh
 
-prune */__pycache__ 
+prune */__pycache__
 recursive-exclude */__pycache__ *
 
 prune gpu4pyscf/lib/build
diff --git a/gpu4pyscf/__init__.py b/gpu4pyscf/__init__.py
index 143af69c..7fb02e52 100644
--- a/gpu4pyscf/__init__.py
+++ b/gpu4pyscf/__init__.py
@@ -1,2 +1,2 @@
 from . import lib, grad, hessian, solvent, scf, dft
-__version__ = '0.6.2'
+__version__ = '0.6.3'
diff --git a/gpu4pyscf/df/__init__.py b/gpu4pyscf/df/__init__.py
new file mode 100644
index 00000000..6716c097
--- /dev/null
+++ b/gpu4pyscf/df/__init__.py
@@ -0,0 +1,15 @@
+# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
diff --git a/gpu4pyscf/df/cderi.py b/gpu4pyscf/df/cderi.py
index 07ee360e..c1ae59c7 100644
--- a/gpu4pyscf/df/cderi.py
+++ b/gpu4pyscf/df/cderi.py
@@ -40,7 +40,7 @@ def __init__(self, nao, naux, nblocks) -> None:
             ctypes.c_int(nblocks),
             ctypes.c_int(nao))
         return
-    
+
     def __del__(self):
         self.row = []
         self.col = []
@@ -57,8 +57,8 @@ def add_block(self, data, rows, cols):
         assert rows.dtype == cupy.int64 and cols.dtype == cupy.int64
         nij = len(rows)
         err = libcupy_helper.add_block(
-            ctypes.byref(self.handle), 
-            ctypes.c_int(nij), 
+            ctypes.byref(self.handle),
+            ctypes.c_int(nij),
             ctypes.c_int(self.naux),
             ctypes.cast(rows.data.ptr, ctypes.c_void_p),
             ctypes.cast(cols.data.ptr, ctypes.c_void_p),
diff --git a/gpu4pyscf/df/grad/__init__.py b/gpu4pyscf/df/grad/__init__.py
new file mode 100644
index 00000000..22c672e3
--- /dev/null
+++ b/gpu4pyscf/df/grad/__init__.py
@@ -0,0 +1,19 @@
+# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+from . import rhf, rks
+
+RHF = rhf.Gradients
+RKS = rks.Gradients
\ No newline at end of file
diff --git a/gpu4pyscf/df/hessian/__init__.py b/gpu4pyscf/df/hessian/__init__.py
new file mode 100644
index 00000000..2b55ed12
--- /dev/null
+++ b/gpu4pyscf/df/hessian/__init__.py
@@ -0,0 +1,19 @@
+# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+from . import rhf, rks
+
+RHF = rhf.Hessian
+RKS = rks.Hessian
\ No newline at end of file
diff --git a/gpu4pyscf/gto/__init__.py b/gpu4pyscf/gto/__init__.py
new file mode 100644
index 00000000..25a4587e
--- /dev/null
+++ b/gpu4pyscf/gto/__init__.py
@@ -0,0 +1,14 @@
+# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
diff --git a/gpu4pyscf/lib/__init__.py b/gpu4pyscf/lib/__init__.py
index 8ef56b43..147324d9 100644
--- a/gpu4pyscf/lib/__init__.py
+++ b/gpu4pyscf/lib/__init__.py
@@ -1,3 +1,18 @@
+# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
 import os
 import numpy
 from gpu4pyscf.lib import diis

From d22e8823e85a5e354cb4087c28a2ea9aacc16441 Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <wxj6000@gmail.com>
Date: Tue, 17 Oct 2023 23:59:27 -0700
Subject: [PATCH 09/19] Create __init__.py

---
 gpu4pyscf/solvent/grad/__init__.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)
 create mode 100644 gpu4pyscf/solvent/grad/__init__.py

diff --git a/gpu4pyscf/solvent/grad/__init__.py b/gpu4pyscf/solvent/grad/__init__.py
new file mode 100644
index 00000000..25a4587e
--- /dev/null
+++ b/gpu4pyscf/solvent/grad/__init__.py
@@ -0,0 +1,14 @@
+# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.

From 3b7b0917f1dd7ab0053083b19982342b3c080b11 Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <wxj6000@gmail.com>
Date: Wed, 18 Oct 2023 00:18:29 -0700
Subject: [PATCH 10/19] Update README.md

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 0020f873..58b87b32 100644
--- a/README.md
+++ b/README.md
@@ -47,9 +47,10 @@ Features
 - SCF, analytical Gradient, and analytical Hessian calculations for Hartree-Fock and DFT;
 - LDA, GGA, mGGA, hybrid, and range-separated functionals via [libXC](https://gitlab.com/libxc/libxc/-/tree/master/);
 - Geometry optimization and transition state search via [geomeTRIC](https://geometric.readthedocs.io/en/latest/);
-- Dispersion corrections via [DFT3](https://github.com/dftd3/simple-dftd3) and [DFT4](https://github.com/dftd4/dftd4);
+- Dispersion corrections via [DFTD3](https://github.com/dftd3/simple-dftd3) and [DFTD4](https://github.com/dftd4/dftd4);
 - Nonlocal functional correction (vv10) for SCF and gradient;
 - ECP is supported and calculated on CPU;
+- PCM solvent models and their analytical gradients;
 
 Limitations
 --------

From 7c343e4711341e0c50f958bbaadca2285ce7daf2 Mon Sep 17 00:00:00 2001
From: Qiming Sun <osirpt.sun@gmail.com>
Date: Wed, 18 Oct 2023 12:43:56 -0700
Subject: [PATCH 11/19] Refactor _DFHF class. Add tests for to_cpu (#46)

* Refactor _DFHF class. Add tests for to_cpu

* Undefined variables

* Update df_jk.py

---------

Co-authored-by: Xiaojie Wu <wxj6000@gmail.com>
---
 gpu4pyscf/df/df.py                |   1 +
 gpu4pyscf/df/df_jk.py             | 278 ++++++++++++++++--------------
 gpu4pyscf/df/tests/test_df_scf.py |  25 ++-
 gpu4pyscf/dft/gks.py              |   3 +-
 gpu4pyscf/dft/rks.py              |  51 +++---
 gpu4pyscf/lib/utils.py            |   4 +-
 gpu4pyscf/scf/hf.py               |  21 ++-
 gpu4pyscf/scf/tests/test_scf.py   |  17 +-
 8 files changed, 234 insertions(+), 166 deletions(-)

diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py
index 75e0753a..cc199ca2 100644
--- a/gpu4pyscf/df/df.py
+++ b/gpu4pyscf/df/df.py
@@ -47,6 +47,7 @@ def __init__(self, mol, auxbasis=None):
     def to_cpu(self):
         from gpu4pyscf.lib.utils import to_cpu
         obj = to_cpu(self)
+        del obj.intopt, obj.cd_low, obj.nao, obj.naux
         return obj.reset()
 
     def build(self, direct_scf_tol=1e-14, omega=None):
diff --git a/gpu4pyscf/df/df_jk.py b/gpu4pyscf/df/df_jk.py
index d52dfff2..c46bb999 100644
--- a/gpu4pyscf/df/df_jk.py
+++ b/gpu4pyscf/df/df_jk.py
@@ -96,8 +96,6 @@ def _density_fit(mf, auxbasis=None, with_df=None, only_dfj=False):
         with_df.verbose = mf.verbose
         with_df.auxbasis = auxbasis
 
-    mf_class = mf.__class__
-
     if isinstance(mf, df_jk._DFHF):
         if mf.with_df is None:
             mf.with_df = with_df
@@ -108,139 +106,153 @@ def _density_fit(mf, auxbasis=None, with_df=None, only_dfj=False):
             mf.only_dfj = only_dfj
         return mf
 
-    class DensityFitting(df_jk._DFHF, mf_class):
-        __doc__ = '''
-        Density fitting SCF class
-        Attributes for density-fitting SCF:
-            auxbasis : str or basis dict
-                Same format to the input attribute mol.basis.
-                The default basis 'weigend+etb' means weigend-coulomb-fit basis
-                for light elements and even-tempered basis for heavy elements.
-            with_df : DF object
-                Set mf.with_df = None to switch off density fitting mode.
-        See also the documents of class %s for other SCF attributes.
-        ''' % mf_class
-
-        from gpu4pyscf.lib.utils import to_cpu, to_gpu, device
-
-        def __init__(self, mf, dfobj, only_dfj):
-            self.__dict__.update(mf.__dict__)
-            self._eri = None
-            self.rhoj = None
-            self.rhok = None
-            self.direct_scf = False
-            self.with_df = dfobj
-            self.only_dfj = only_dfj
-            self._keys = self._keys.union(['with_df', 'only_dfj'])
-
-        init_workflow = init_workflow
-
-        def reset(self, mol=None):
-            self.with_df.reset(mol)
-            return mf_class.reset(self, mol)
-
-        def get_jk(self, mol=None, dm=None, hermi=1, with_j=True, with_k=True,
-                   omega=None):
-            if dm is None: dm = self.make_rdm1()
-            if self.with_df and self.only_dfj:
-                vj = vk = None
-                if with_j:
-                    vj, vk = self.with_df.get_jk(dm, hermi, True, False,
-                                                 self.direct_scf_tol, omega)
-                if with_k:
-                    vk = mf_class.get_jk(self, mol, dm, hermi, False, True, omega)[1]
-            elif self.with_df:
-                vj, vk = self.with_df.get_jk(dm, hermi, with_j, with_k,
-                                             self.direct_scf_tol, omega)
-            else:
-                vj, vk = mf_class.get_jk(self, mol, dm, hermi, with_j, with_k, omega)
-            return vj, vk
-
-        def get_veff(self, mol=None, dm=None, dm_last=None, vhf_last=0, hermi=1):
-            '''
-            effective potential
-            '''
-            if mol is None: mol = self.mol
-            if dm is None: dm = self.make_rdm1()
-
-            # for DFT
-            if mf_class == rks.RKS:
-                return rks.get_veff(self, dm=dm)
-
-            if self.direct_scf:
-                ddm = cupy.asarray(dm) - dm_last
-                vj, vk = self.get_jk(mol, ddm, hermi=hermi)
-                return vhf_last + vj - vk * .5
-            else:
-                vj, vk = self.get_jk(mol, dm, hermi=hermi)
-                return vj - vk * .5
-
-        def energy_elec(self, dm=None, h1e=None, vhf=None):
-            '''
-            electronic energy
-            '''
-            if dm is None: dm = self.make_rdm1()
-            if h1e is None: h1e = self.get_hcore()
-            if vhf is None: vhf = self.get_veff(self.mol, dm)
-            # for DFT
-            if mf_class == rks.RKS:
-                e1 = cupy.sum(h1e*dm)
-                ecoul = self.ecoul
-                exc = self.exc
-                e2 = ecoul + exc
-                #logger.debug(self, f'E1 = {e1}, Ecoul = {ecoul}, Exc = {exc}')
-                return e1+e2, e2
-
-            e1 = cupy.einsum('ij,ji->', h1e, dm).real
-            e_coul = cupy.einsum('ij,ji->', vhf, dm).real * .5
-            self.scf_summary['e1'] = e1
-            self.scf_summary['e2'] = e_coul
-            #logger.debug(self, 'E1 = %s  E_coul = %s', e1, e_coul)
-            return e1+e_coul, e_coul
-
-        def energy_tot(self, dm, h1e, vhf=None):
-            '''
-            compute tot energy
-            '''
-            nuc = self.energy_nuc()
-            e_tot = self.energy_elec(dm, h1e, vhf)[0] + nuc
-            self.scf_summary['nuc'] = nuc.real
-            return e_tot
-
-        def nuc_grad_method(self):
-            if mf_class == rks.RKS:
-                from gpu4pyscf.df.grad import rks as rks_grad
-                return rks_grad.Gradients(self)
-            if mf_class == hf.RHF:
-                from gpu4pyscf.df.grad import rhf as rhf_grad
-                return rhf_grad.Gradients(self)
-            raise NotImplementedError()
-
-
-        def Hessian(self):
-            from gpu4pyscf.df.hessian import rhf, rks
-            if isinstance(self, scf.rhf.RHF):
-                if isinstance(self, scf.hf.KohnShamDFT):
-                    return rks.Hessian(self)
-                else:
-                    return rhf.Hessian(self)
-            else:
-                raise NotImplementedError
+    dfmf = _DFHF(mf, with_df, only_dfj)
+    return lib.set_class(dfmf, (_DFHF, mf.__class__))
 
-        # for pyscf 1.0, 1.1 compatibility
-        @property
-        def _cderi(self):
-            naux = self.with_df.get_naoaux()
-            return next(self.with_df.loop(blksize=naux))
-        @_cderi.setter
-        def _cderi(self, x):
-            self.with_df._cderi = x
+class _DFHF(df_jk._DFHF):
+    '''
+    Density fitting SCF class
+    Attributes for density-fitting SCF:
+        auxbasis : str or basis dict
+            Same format to the input attribute mol.basis.
+            The default basis 'weigend+etb' means weigend-coulomb-fit basis
+            for light elements and even-tempered basis for heavy elements.
+        with_df : DF object
+            Set mf.with_df = None to switch off density fitting mode.
+    '''
 
-        @property
-        def auxbasis(self):
-            return getattr(self.with_df, 'auxbasis', None)
+    from gpu4pyscf.lib.utils import to_gpu, device
+
+    def __init__(self, mf, dfobj, only_dfj):
+        self.__dict__.update(mf.__dict__)
+        self._eri = None
+        self.rhoj = None
+        self.rhok = None
+        self.direct_scf = False
+        self.with_df = dfobj
+        self.only_dfj = only_dfj
+        self._keys = self._keys.union(['with_df', 'only_dfj'])
+
+    def undo_df(self):
+        '''Remove the DFHF Mixin'''
+        obj = lib.view(self, lib.drop_class(self.__class__, _DFHF))
+        del obj.rhoj, obj.rhok, obj.with_df, obj.only_dfj
+        return obj
+
+    def reset(self, mol=None):
+        self.with_df.reset(mol)
+        return super().reset(mol)
+
+    init_workflow = init_workflow
+
+    def get_jk(self, mol=None, dm=None, hermi=1, with_j=True, with_k=True,
+               omega=None):
+        if dm is None: dm = self.make_rdm1()
+        if self.with_df and self.only_dfj:
+            vj = vk = None
+            if with_j:
+                vj, vk = self.with_df.get_jk(dm, hermi, True, False,
+                                             self.direct_scf_tol, omega)
+            if with_k:
+                vk = super().get_jk(mol, dm, hermi, False, True, omega)[1]
+        elif self.with_df:
+            vj, vk = self.with_df.get_jk(dm, hermi, with_j, with_k,
+                                         self.direct_scf_tol, omega)
+        else:
+            vj, vk = super().get_jk(mol, dm, hermi, with_j, with_k, omega)
+        return vj, vk
 
-    return DensityFitting(mf, with_df, only_dfj)
+    def nuc_grad_method(self):
+        if isinstance(self, rks.RKS):
+            from gpu4pyscf.df.grad import rks as rks_grad
+            return rks_grad.Gradients(self)
+        if isinstance(self, hf.RHF):
+            from gpu4pyscf.df.grad import rhf as rhf_grad
+            return rhf_grad.Gradients(self)
+        raise NotImplementedError()
+
+    def Hessian(self):
+        from pyscf.dft.rks import KohnShamDFT
+        from gpu4pyscf.df.hessian import rhf, rks
+        if isinstance(self, scf.rhf.RHF):
+            if isinstance(self, KohnShamDFT):
+                return rks.Hessian(self)
+            else:
+                return rhf.Hessian(self)
+        else:
+            raise NotImplementedError
+
+    @property
+    def auxbasis(self):
+        return getattr(self.with_df, 'auxbasis', None)
+     
+    def get_veff(self, mol=None, dm=None, dm_last=None, vhf_last=0, hermi=1):
+        '''
+        Effective potential: rks.get_veff for RKS-based objects, else vj - vk/2.
+        '''
+        if mol is None: mol = self.mol
+        if dm is None: dm = self.make_rdm1()
+
+        # for DFT; super() never compares equal to a class, so test the instance
+        if isinstance(self, rks.RKS):
+            return rks.get_veff(self, dm=dm)
+
+        if self.direct_scf and dm_last is not None:
+            ddm = cupy.asarray(dm) - dm_last
+            vj, vk = self.get_jk(mol, ddm, hermi=hermi)
+            return vhf_last + vj - vk * .5
+        else:
+            vj, vk = self.get_jk(mol, dm, hermi=hermi)
+            return vj - vk * .5
+
+    def energy_elec(self, dm=None, h1e=None, vhf=None):
+        '''
+        Return (electronic energy, two-electron part) for the density matrix dm.
+        '''
+        if dm is None: dm = self.make_rdm1()
+        if h1e is None: h1e = self.get_hcore()
+        if vhf is None: vhf = self.get_veff(self.mol, dm)
+        # for DFT; same instance test as get_veff
+        if isinstance(self, rks.RKS):
+            e1 = cupy.sum(h1e*dm)
+            ecoul = vhf.ecoul
+            exc = vhf.exc
+            e2 = ecoul + exc
+            # ecoul/exc are read from the tags rks.get_veff attaches to vhf
+            return e1+e2, e2
+
+        e1 = cupy.einsum('ij,ji->', h1e, dm).real
+        e_coul = cupy.einsum('ij,ji->', vhf, dm).real * .5
+        self.scf_summary['e1'] = e1
+        self.scf_summary['e2'] = e_coul
+        #logger.debug(self, 'E1 = %s  E_coul = %s', e1, e_coul)
+        return e1+e_coul, e_coul
+
+    def energy_tot(self, dm, h1e, vhf=None):
+        '''
+        compute tot energy
+        '''
+        nuc = self.energy_nuc()
+        e_tot = self.energy_elec(dm, h1e, vhf)[0] + nuc
+        self.scf_summary['nuc'] = nuc.real
+        return e_tot
+
+
+    def to_cpu(self):
+        obj = self.undo_df().to_cpu().density_fit()  # drop the DF mixin, convert, then call density_fit() on the result
+        keys = dir(obj)  # attribute names of the rebuilt object, captured before copying state
+        obj.__dict__.update(self.__dict__)
+
+        for key in set(dir(self)).difference(keys):  # names present on self but not on the rebuilt object
+            delattr(obj, key)
+
+        for key in keys:
+            val = getattr(obj, key)
+            if isinstance(val, cupy.ndarray):
+                setattr(obj, key, cupy.asnumpy(val))  # cupy array -> numpy array
+            elif hasattr(val, 'to_cpu'):
+                setattr(obj, key, val.to_cpu())  # attribute provides its own to_cpu conversion
+        return obj
 
 def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e-14, omega=None):
     '''
@@ -387,4 +399,4 @@ def get_j(dfobj, dm, hermi=1, direct_scf_tol=1e-13):
     vj = int3c2e.get_j_int3c2e_pass2(intopt, rhoj)
     return vj
 
-density_fit = _density_fit
\ No newline at end of file
+density_fit = _density_fit
diff --git a/gpu4pyscf/df/tests/test_df_scf.py b/gpu4pyscf/df/tests/test_df_scf.py
index ba959b69..1b042096 100644
--- a/gpu4pyscf/df/tests/test_df_scf.py
+++ b/gpu4pyscf/df/tests/test_df_scf.py
@@ -17,7 +17,9 @@
 import numpy as np
 import pyscf
 from pyscf import lib
+from pyscf.df import df_jk as cpu_df_jk
 from gpu4pyscf import scf
+from gpu4pyscf.df import df_jk
 from gpu4pyscf.dft import rks
 
 lib.num_threads(8)
@@ -37,7 +39,7 @@ def setUpModule():
     mol.output = '/dev/null'
     mol.build()
     mol.verbose = 1
-    
+
 def tearDownModule():
     global mol
     mol.stdout.close()
@@ -57,7 +59,7 @@ def test_rhf(self):
         mf = scf.RHF(mol).density_fit(auxbasis='def2-tzvpp-jkfit')
         e_tot = mf.kernel()
         assert np.allclose(e_tot, -76.0624582299)
-    
+
     def test_rks_lda(self):
         print('------- LDA ----------------')
         e_tot = run_dft("LDA_X,LDA_C_VWN")
@@ -67,17 +69,17 @@ def test_rks_pbe(self):
         print('------- PBE ----------------')
         e_tot = run_dft('PBE')
         assert np.allclose(e_tot, -76.3800181250)
-    
+
     def test_rks_b3lyp(self):
         print('-------- B3LYP -------------')
         e_tot = run_dft('B3LYP')
         assert np.allclose(e_tot, -76.4666493796)
-    
+
     def test_rks_m06(self):
         print('--------- M06 --------------')
         e_tot = run_dft("M06")
         assert np.allclose(e_tot, -76.4265841359)
-    
+
     def test_rks_wb97(self):
         print('-------- wB97 --------------')
         e_tot = run_dft("HYB_GGA_XC_WB97")
@@ -88,6 +90,19 @@ def test_rks_wb97(self):
         e_tot = run_dft("HYB_MGGA_XC_WB97M_V")
         assert np.allclose(e_tot, -76.4334567297)
 
+    def test_to_cpu(self):
+        mf = scf.RHF(mol).density_fit().to_cpu()
+        assert isinstance(mf, cpu_df_jk._DFHF)
+        mf = mf.to_gpu()
+        assert isinstance(mf, df_jk._DFHF)
+
+        mf = rks.RKS(mol).density_fit().to_cpu()
+        assert isinstance(mf, cpu_df_jk._DFHF)
+        assert 'gpu' not in mf.grids.__module__
+        mf = mf.to_gpu()
+        assert isinstance(mf, df_jk._DFHF)
+        assert 'gpu' in mf.grids.__module__
+
 if __name__ == "__main__":
     print("Full Tests for SCF")
     unittest.main()
diff --git a/gpu4pyscf/dft/gks.py b/gpu4pyscf/dft/gks.py
index 53992f0e..a94e31f5 100644
--- a/gpu4pyscf/dft/gks.py
+++ b/gpu4pyscf/dft/gks.py
@@ -23,8 +23,7 @@ class GKS(gks.GKS):
     from gpu4pyscf.lib.utils import to_cpu, to_gpu, device
 
     def __init__(self, mol, xc='LDA,VWN'):
-        super().__init__(mol, xc)
-        self._numint = numint.NumInt()
+        raise NotImplementedError
 
     get_jk = GHF.get_jk
     _eigh = GHF._eigh
diff --git a/gpu4pyscf/dft/rks.py b/gpu4pyscf/dft/rks.py
index 8c33ee95..31e67cc8 100644
--- a/gpu4pyscf/dft/rks.py
+++ b/gpu4pyscf/dft/rks.py
@@ -195,11 +195,38 @@ def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
     else:
         ecoul = None
     t0 = logger.timer_debug1(ks, 'jk total', *t0)
-    ks.ecoul = ecoul
-    ks.exc = exc
     vxc = tag_array(vxc, ecoul=ecoul, exc=exc, vj=vj, vk=vk)
     return vxc
 
+def energy_elec(ks, dm=None, h1e=None, vhf=None):
+    r'''Electronic part of RKS energy.
+
+    Note this function has a side effect: it updates ks.scf_summary.
+
+    Args:
+        ks : an instance of DFT class
+
+        dm : 2D ndarray
+            one-particle density matrix
+        h1e : 2D ndarray
+            Core hamiltonian
+
+    Returns:
+        RKS electronic energy and the 2-electron contribution
+    '''
+    if dm is None: dm = ks.make_rdm1()
+    if h1e is None: h1e = ks.get_hcore()
+    if vhf is None: vhf = ks.get_veff(ks.mol, dm)
+    e1 = cupy.einsum('ij,ji->', h1e, dm).real
+    ecoul = vhf.ecoul.real
+    exc = vhf.exc.real
+    e2 = ecoul + exc
+    ks.scf_summary['e1'] = e1
+    ks.scf_summary['coul'] = ecoul
+    ks.scf_summary['exc'] = exc
+    logger.debug(ks, 'E1 = %s  Ecoul = %s  Exc = %s', e1, ecoul, exc)
+    return e1+e2, e2
+
 class RKS(scf.hf.RHF, rks.RKS):
     from gpu4pyscf.lib.utils import to_cpu, to_gpu, device
 
@@ -241,27 +268,11 @@ def reset(self, mol=None):
         self._numint.gdftopt = None
         return self
 
-    def energy_elec(self, dm=None, h1e=None, vhf=None):
-        if dm is None: dm = self.make_rdm1()
-        if h1e is None: h1e = self.get_hcore()
-        if vhf is None: vhf = self.get_veff(self.mol, dm)
-
-        e1 = cupy.sum(h1e*dm)
-        ecoul = self.ecoul
-        exc = self.exc
-        e2 = ecoul + exc
-        return e1+e2, e2
-
-    def energy_tot(self, dm, h1e, vhf=None):
-        nuc = self.energy_nuc()
-        e_tot = self.energy_elec(dm, h1e, vhf)[0] + nuc
-        self.scf_summary['nuc'] = nuc.real
-        return e_tot
-
     def nuc_grad_method(self):
         from gpu4pyscf.grad import rks as rks_grad
         return rks_grad.Gradients(self)
-    
+
+    energy_elec = energy_elec
     get_jk = RHF.get_jk
     get_veff = get_veff
     _eigh = RHF._eigh
diff --git a/gpu4pyscf/lib/utils.py b/gpu4pyscf/lib/utils.py
index 4489fab3..a5cf6187 100644
--- a/gpu4pyscf/lib/utils.py
+++ b/gpu4pyscf/lib/utils.py
@@ -38,10 +38,10 @@ def to_cpu(method):
             break
     method = method.view(pyscf_cls)
 
-    keys = set()
+    keys = []
     for cls in pyscf_cls.__mro__[:-1]:
         if hasattr(cls, '_keys'):
-            keys.update(cls._keys)
+            keys.extend(cls._keys)
     if keys:
         keys = set(keys).intersection(method.__dict__)
 
diff --git a/gpu4pyscf/scf/hf.py b/gpu4pyscf/scf/hf.py
index a3fb3417..1d58a02e 100644
--- a/gpu4pyscf/scf/hf.py
+++ b/gpu4pyscf/scf/hf.py
@@ -301,7 +301,7 @@ def get_occ(mf, mo_energy=None, mo_coeff=None):
     return mo_occ
 
 def get_veff(mf, mol=None, dm=None, dm_last=None, vhf_last=None, hermi=1, vhfopt=None):
-    if dm_last is None:
+    if dm_last is None or not mf.direct_scf:
         vj, vk = mf.get_jk(mol, cupy.asarray(dm), hermi)
         return vj - vk * .5
     else:
@@ -351,6 +351,20 @@ def get_fock(mf, h1e=None, s1e=None, vhf=None, dm=None, cycle=-1, diis=None,
         f = level_shift(s1e, dm*.5, f, level_shift_factor)
     return f
 
+def energy_elec(self, dm=None, h1e=None, vhf=None):
+    '''
+    electronic energy
+    '''
+    if dm is None: dm = self.make_rdm1()
+    if h1e is None: h1e = self.get_hcore()
+    if vhf is None: vhf = self.get_veff(self.mol, dm)
+    e1 = cupy.einsum('ij,ji->', h1e, dm).real
+    e_coul = cupy.einsum('ij,ji->', vhf, dm).real * .5
+    self.scf_summary['e1'] = e1
+    self.scf_summary['e2'] = e_coul
+    logger.debug(self, 'E1 = %s  E_coul = %s', e1, e_coul)
+    return e1+e_coul, e_coul
+
 def _kernel(mf, conv_tol=1e-10, conv_tol_grad=None,
            dump_chk=True, dm0=None, callback=None, conv_check=True, **kwargs):
     conv_tol = mf.conv_tol
@@ -371,7 +385,7 @@ def _kernel(mf, conv_tol=1e-10, conv_tol_grad=None,
         mo_occ = cupy.asarray(dm0.mo_occ)
         occ_coeff = cupy.asarray(mo_coeff[:,mo_occ>0])
         dm = tag_array(dm, occ_coeff=occ_coeff, mo_occ=mo_occ, mo_coeff=mo_coeff)
-    
+
     # use optimized workflow if possible
     if hasattr(mf, 'init_workflow'):
         mf.init_workflow(dm0=dm)
@@ -552,6 +566,7 @@ class RHF(hf.RHF):
     #_eigh = staticmethod(_eigh)
     _eigh = _eigh
     make_rdm1 = make_rdm1
+    energy_elec = energy_elec
     get_fock = get_fock
     get_occ = get_occ
     get_veff = get_veff
@@ -595,7 +610,7 @@ def reset(self, mol=None):
     def nuc_grad_method(self):
         from gpu4pyscf.grad import rhf
         return rhf.Gradients(self)
-    
+
     def density_fit(self, auxbasis=None, with_df=None, only_dfj=False):
         import gpu4pyscf.df.df_jk
         return gpu4pyscf.df.df_jk.density_fit(self, auxbasis, with_df, only_dfj)
diff --git a/gpu4pyscf/scf/tests/test_scf.py b/gpu4pyscf/scf/tests/test_scf.py
index c5b32ace..3dd94806 100644
--- a/gpu4pyscf/scf/tests/test_scf.py
+++ b/gpu4pyscf/scf/tests/test_scf.py
@@ -18,6 +18,8 @@
 import cupy
 import pyscf
 from pyscf import lib
+from pyscf import scf as cpu_scf
+from pyscf import dft as cpu_dft
 from gpu4pyscf import scf
 from gpu4pyscf.dft import rks
 
@@ -47,6 +49,19 @@ def test_rhf(self):
         e_tot = mf.kernel()
         assert np.allclose(e_tot, -76.0667232412)
 
+    def test_to_cpu(self):
+        mf = scf.RHF(mol).to_cpu()
+        assert isinstance(mf, cpu_scf.RHF)
+        mf = mf.to_gpu()
+        assert isinstance(mf, scf.RHF)
+
+        mf = rks.RKS(mol).to_cpu()
+        assert isinstance(mf, cpu_dft.rks.RKS)
+        assert 'gpu' not in mf.grids.__module__
+        mf = mf.to_gpu()
+        assert isinstance(mf, rks.RKS)
+        assert 'gpu' in mf.grids.__module__
+
 if __name__ == "__main__":
     print("Full Tests for SCF")
-    unittest.main()
\ No newline at end of file
+    unittest.main()

From 5f880a250ad3e67c40df6b0fd5f29038396f63ea Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <wxj6000@gmail.com>
Date: Tue, 24 Oct 2023 15:54:19 -0700
Subject: [PATCH 12/19] Optimize hessian intermediate variables (#51)

* numpy -> cupy for solvent

* for linter

* remove grad switch from pcm.py

* passed flake8

* solvent integrals on GPU

* flake8

* compatible with pyscf-2.4.0

* added solvent

* fixed issues for to_cpu

* store intermediate variable on CPU

* cupy.einsum -> contract
---
 benchmarks/df/dft_driver.py            |  13 +-
 benchmarks/df/run_gpu4pyscf.sh         |   2 +-
 examples/00-h2o.py                     |   4 +-
 examples/dft_driver.py                 |  20 +--
 examples/sp.in                         |  20 ---
 gpu4pyscf/df/df.py                     |   8 +-
 gpu4pyscf/df/df_jk.py                  |  38 ++----
 gpu4pyscf/df/hessian/rhf.py            | 146 ++++++++++++---------
 gpu4pyscf/df/int3c2e.py                | 168 ++++++++++++++++---------
 gpu4pyscf/df/tests/test_df_scf.py      |  16 ++-
 gpu4pyscf/hessian/rhf.py               |   3 +-
 gpu4pyscf/lib/cupy_helper.py           |   3 +-
 gpu4pyscf/lib/gdft/nr_eval_gto.cu      |  38 +++---
 gpu4pyscf/lib/gdft/nr_numint_sparse.cu |   6 +-
 gpu4pyscf/lib/gdft/vv10.cu             |  26 ++--
 gpu4pyscf/lib/utils.py                 |   1 -
 gpu4pyscf/scf/tests/test_scf.py        |  23 ++--
 gpu4pyscf/solvent/grad/pcm.py          |   1 -
 gpu4pyscf/solvent/pcm.py               |   2 +-
 19 files changed, 290 insertions(+), 248 deletions(-)
 delete mode 100644 examples/sp.in

diff --git a/benchmarks/df/dft_driver.py b/benchmarks/df/dft_driver.py
index b2682fec..ea979df8 100644
--- a/benchmarks/df/dft_driver.py
+++ b/benchmarks/df/dft_driver.py
@@ -16,6 +16,8 @@
 parser.add_argument('--input_path',   type=str, default='./')
 parser.add_argument('--output_path',  type=str, default='./')
 parser.add_argument('--with_hessian', type=bool, default=False)
+parser.add_argument('--solvent',      type=str, default='')
+
 args = parser.parse_args()
 bas = args.basis
 verbose = args.verbose
@@ -39,13 +41,18 @@
     output_file = 'PySCF-16-cores-CPU.csv'
 output_file = args.output_path + output_file
 
-def run_dft(filename):  
+def run_dft(filename):
     mol = pyscf.M(atom=filename, basis=bas, max_memory=64000)
-    start_time = time.time()  
+    start_time = time.time()
     # set verbose >= 6 for debugging timer
     mol.verbose = 4 #verbose
     mol.max_memory = 40000
     mf = rks.RKS(mol, xc=xc).density_fit(auxbasis='def2-universal-jkfit')
+    if args.solvent:
+        mf = mf.PCM()
+        mf.lebedev_order = 29
+        mf.method = 'IEF-PCM'
+
     mf.grids.atom_grid = (99,590)
     mf.chkfile = None
     prep_time = time.time() - start_time
@@ -75,7 +82,7 @@ def run_dft(filename):
     # calculate hessian
     if args.device == 'GPU':
         cupy.get_default_memory_pool().free_all_blocks()
-    
+
     hess_time = -1
     if args.with_hessian:
         try:
diff --git a/benchmarks/df/run_gpu4pyscf.sh b/benchmarks/df/run_gpu4pyscf.sh
index c3c5dc6b..c50cfceb 100644
--- a/benchmarks/df/run_gpu4pyscf.sh
+++ b/benchmarks/df/run_gpu4pyscf.sh
@@ -3,7 +3,7 @@
 DIR="./organic/xc"
 [ ! -d "$DIR" ] && mkdir -p "$DIR"
 for xc in LDA PBE B3LYP M06 wB97m-v
-do 
+do
     python3 dft_driver.py --input_path ../molecules/organic/ --output_path ./organic/xc/$xc/ --xc $xc
 done
 exit
diff --git a/examples/00-h2o.py b/examples/00-h2o.py
index 622a8194..7f17e62d 100644
--- a/examples/00-h2o.py
+++ b/examples/00-h2o.py
@@ -18,7 +18,7 @@
 from gpu4pyscf.dft import rks
 lib.num_threads(8)
 
-atom =''' 
+atom ='''
 O       0.0000000000    -0.0000000000     0.1174000000
 H      -0.7570000000    -0.0000000000    -0.4696000000
 H       0.7570000000     0.0000000000    -0.4696000000
@@ -34,7 +34,7 @@
 
 mol = pyscf.M(atom=atom, basis=bas, max_memory=32000)
 
-mol.verbose = 1
+mol.verbose = 4
 mf_GPU = rks.RKS(mol, xc=xc).density_fit(auxbasis=auxbasis)
 mf_GPU.grids.level = grids_level
 mf_GPU.conv_tol = scf_tol
diff --git a/examples/dft_driver.py b/examples/dft_driver.py
index 65ca2ad5..3b68d665 100644
--- a/examples/dft_driver.py
+++ b/examples/dft_driver.py
@@ -15,17 +15,16 @@
 
 import pyscf
 import time
+import argparse
 from pyscf import lib
-
 from gpu4pyscf.dft import rks
 lib.num_threads(8)
 
-import argparse
-
 parser = argparse.ArgumentParser(description='Run DFT with GPU4PySCF for molecules')
 parser.add_argument("--input",    type=str,  default='benzene/coord')
 parser.add_argument("--basis",    type=str,  default='def2-tzvpp')
 parser.add_argument("--auxbasis", type=str,  default='def2-tzvpp-jkfit')
+parser.add_argument("--xc",       type=str,  default='B3LYP')
 parser.add_argument("--solvent",  type=bool, default=False)
 args = parser.parse_args()
 
@@ -36,23 +35,28 @@
     basis=bas,
     max_memory=32000)
 # set verbose >= 6 for debugging timer
-mol.verbose = 4
+mol.verbose = 6
 
-mf_df = rks.RKS(mol, xc='HYB_GGA_XC_B3LYP').density_fit(auxbasis=args.auxbasis)
+mf_df = rks.RKS(mol, xc=args.xc).density_fit(auxbasis=args.auxbasis)
 if args.solvent:
     mf_df = mf_df.PCM()
+    mf_df.lebedev_order = 29
+    mf_df.method = 'IEF-PCM'
 mf_df.grids.atom_grid = (99,590)
 mf_df.kernel()
+scf_time = time.time() - start_time
+print(f'compute time for energy: {scf_time:.3f} s')
 
-print('compute time for energy: {}s'.format((time.time() - start_time)))
 start_time = time.time()
 g = mf_df.nuc_grad_method()
 g.auxbasis_response = True
 f = g.kernel()
-print('compute time for gradient: {}s'.format((time.time() - start_time)))
+grad_time = time.time() - start_time
+print(f'compute time for gradient: {grad_time:.3f} s')
 
 start_time = time.time()
 h = mf_df.Hessian()
 h.auxbasis_response = 2
 h_dft = h.kernel()
-print('compute time for hessian: {}s'.format((time.time() - start_time)))
+hess_time = time.time() - start_time
+print(f'compute time for hessian: {hess_time:.3f} s')
diff --git a/examples/sp.in b/examples/sp.in
deleted file mode 100644
index aba5a01a..00000000
--- a/examples/sp.in
+++ /dev/null
@@ -1,20 +0,0 @@
-$molecule
-0 1
-O       0.0000000000    -0.0000000000     0.1174000000
-H      -0.7570000000    -0.0000000000    -0.4696000000
-H       0.7570000000     0.0000000000    -0.4696000000
-$end
-
-$rem
-JOBTYPE  sp
-METHOD  B3LYP
-DFT_D D3_BJ
-BASIS   def2-tzvpp
-SCF_CONVERGENCE  10
-THRESH  14
-RI_J    TRUE
-RI_K    TRUE
-AUX_BASIS RIJK-def2-tzvpp
-PURECART 1111
-$end
-
diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py
index cc199ca2..28998230 100644
--- a/gpu4pyscf/df/df.py
+++ b/gpu4pyscf/df/df.py
@@ -132,7 +132,7 @@ def get_blksize(self, extra=0, nao=None):
             raise RuntimeError("Not enough GPU memory")
         return blksize
 
-    
+
     def loop(self, blksize=None, unpack=True):
         '''
         loop over all cderi and unpack
@@ -208,12 +208,10 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False):
     else:
         use_gpu_memory = False
     if(not use_gpu_memory):
-        import warnings
-        warnings.warn("Not enough GPU memory")
+        log.debug("Not enough GPU memory")
         # TODO: async allocate memory
         mem = cupy.cuda.alloc_pinned_memory(naux * npair * 8)
         cderi = np.ndarray([naux, npair], dtype=np.float64, order='C', buffer=mem)
-
     data_stream = cupy.cuda.stream.Stream(non_blocking=False)
     count = 0
     nq = len(intopt.log_qs)
@@ -260,7 +258,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False):
         if cpi == cpj:
             ints_slices = ints_slices + ints_slices.transpose([0,2,1])
         ints_slices = ints_slices[:,col,row]
-        
+
         if cd_low.tag == 'eig':
             cderi_block = cupy.dot(cd_low.T, ints_slices)
             ints_slices = None
diff --git a/gpu4pyscf/df/df_jk.py b/gpu4pyscf/df/df_jk.py
index c46bb999..7f6c3933 100644
--- a/gpu4pyscf/df/df_jk.py
+++ b/gpu4pyscf/df/df_jk.py
@@ -121,7 +121,7 @@ class _DFHF(df_jk._DFHF):
             Set mf.with_df = None to switch off density fitting mode.
     '''
 
-    from gpu4pyscf.lib.utils import to_gpu, device
+    from gpu4pyscf.lib.utils import to_cpu, to_gpu, device
 
     def __init__(self, mf, dfobj, only_dfj):
         self.__dict__.update(mf.__dict__)
@@ -131,7 +131,7 @@ def __init__(self, mf, dfobj, only_dfj):
         self.direct_scf = False
         self.with_df = dfobj
         self.only_dfj = only_dfj
-        self._keys = self._keys.union(['with_df', 'only_dfj'])
+        self._keys = mf._keys.union(['with_df', 'only_dfj'])
 
     def undo_df(self):
         '''Remove the DFHF Mixin'''
@@ -185,7 +185,7 @@ def Hessian(self):
     @property
     def auxbasis(self):
         return getattr(self.with_df, 'auxbasis', None)
-     
+
     def get_veff(self, mol=None, dm=None, dm_last=None, vhf_last=0, hermi=1):
         '''
         effective potential
@@ -194,7 +194,7 @@ def get_veff(self, mol=None, dm=None, dm_last=None, vhf_last=0, hermi=1):
         if dm is None: dm = self.make_rdm1()
 
         # for DFT
-        if super() == rks.RKS:
+        if isinstance(self, scf.hf.KohnShamDFT):
             return rks.get_veff(self, dm=dm)
 
         if self.direct_scf:
@@ -205,29 +205,6 @@ def get_veff(self, mol=None, dm=None, dm_last=None, vhf_last=0, hermi=1):
             vj, vk = self.get_jk(mol, dm, hermi=hermi)
             return vj - vk * .5
 
-    def energy_elec(self, dm=None, h1e=None, vhf=None):
-        '''
-        electronic energy
-        '''
-        if dm is None: dm = self.make_rdm1()
-        if h1e is None: h1e = self.get_hcore()
-        if vhf is None: vhf = self.get_veff(self.mol, dm)
-        # for DFT
-        if super() == rks.RKS:
-            e1 = cupy.sum(h1e*dm)
-            ecoul = self.ecoul
-            exc = self.exc
-            e2 = ecoul + exc
-            #logger.debug(self, f'E1 = {e1}, Ecoul = {ecoul}, Exc = {exc}')
-            return e1+e2, e2
-
-        e1 = cupy.einsum('ij,ji->', h1e, dm).real
-        e_coul = cupy.einsum('ij,ji->', vhf, dm).real * .5
-        self.scf_summary['e1'] = e1
-        self.scf_summary['e2'] = e_coul
-        #logger.debug(self, 'E1 = %s  E_coul = %s', e1, e_coul)
-        return e1+e_coul, e_coul
-
     def energy_tot(self, dm, h1e, vhf=None):
         '''
         compute tot energy
@@ -237,13 +214,13 @@ def energy_tot(self, dm, h1e, vhf=None):
         self.scf_summary['nuc'] = nuc.real
         return e_tot
 
-
+    '''
     def to_cpu(self):
         obj = self.undo_df().to_cpu().density_fit()
         keys = dir(obj)
         obj.__dict__.update(self.__dict__)
-
         for key in set(dir(self)).difference(keys):
+            print(key)
             delattr(obj, key)
 
         for key in keys:
@@ -253,6 +230,7 @@ def to_cpu(self):
             elif hasattr(val, 'to_cpu'):
                 setattr(obj, key, val.to_cpu())
         return obj
+    '''
 
 def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e-14, omega=None):
     '''
@@ -274,7 +252,7 @@ def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e-
     nset = dms.shape[0]
     t0 = (logger.process_clock(), logger.perf_counter())
     if dfobj._cderi is None:
-        log.warn('CDERI not found, build...')
+        log.debug('CDERI not found, build...')
         dfobj.build(direct_scf_tol=direct_scf_tol, omega=omega)
 
     assert nao == dfobj.nao
diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py
index 335f01ca..dfcccddb 100644
--- a/gpu4pyscf/df/hessian/rhf.py
+++ b/gpu4pyscf/df/hessian/rhf.py
@@ -38,7 +38,7 @@
 import numpy as np
 from pyscf import lib, df
 from gpu4pyscf.hessian import rhf as rhf_hess
-from gpu4pyscf.lib.cupy_helper import contract, tag_array, release_gpu_stack
+from gpu4pyscf.lib.cupy_helper import contract, tag_array, release_gpu_stack, print_mem_info
 from gpu4pyscf.df import int3c2e
 from gpu4pyscf.lib import logger
 
@@ -94,7 +94,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
 
     # ================================ sorted AO begin ===============================================
     intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e')
-    intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size_aux=128, group_size=128)
+    intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size=64, group_size_aux=32)
     sph_ao_idx = intopt.sph_ao_idx
     sph_aux_idx = intopt.sph_aux_idx
 
@@ -117,11 +117,10 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
         hk_ao_aux = cupy.zeros([nao,naux,3,3])
 
     #  int3c contributions
-    wj, wk_Pl_ = int3c2e.get_int3c2e_wjk(mol, auxmol, dm0_tag, omega=omega)
+    wj, _, wk_P__ = int3c2e.get_int3c2e_wjk(mol, auxmol, dm0_tag, omega=omega)
     rhoj0_P = contract('pq,q->p', int2c_inv, wj)
-    wk_P__ = contract('Lio,ir->Lro', wk_Pl_, mocc_2)
     rhok0_P__ = contract('pq,qij->pij', int2c_inv, wk_P__)
-    wj = wk_P__ = wk_Pl_ = None
+    wj = wk_P__ = None
     t1 = log.timer_debug1('intermediate variables with int3c2e', *t1)
 
     # int3c_ip2 contributions
@@ -143,34 +142,46 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
         wj1_01 = None
 
     if with_k:
-        for p0, p1 in lib.prange(0,naux,64):
-            rhok1_Pko = contract('pq,iqox->pxio', int2c_inv[p0:p1], wk1_Pko)
-            # (10|0)(0|10) without response of RI basis
-            vk2_ip1_ip1 = cupy.einsum('ipox,pyko->kixy', wk1_Pko[:,p0:p1], rhok1_Pko)
-            hk_ao_ao += cupy.einsum('kixy,ki->ikxy', vk2_ip1_ip1, dm0)
-            vk2_ip1_ip1 = None
-            # (10|0)(0|01) without response of RI basis
-            bra = cupy.einsum('pyko,io->ikpy', rhok1_Pko, mocc_2)
-            ket = cupy.einsum('ipox,ko->ipkx', wk1_Pko[:,p0:p1], mocc_2)
-            hk_ao_ao += cupy.einsum('ikpy,ipkx->ikxy', bra, ket)
-            bra = ket = None
+        if hessobj.auxbasis_response:
+            wk1_P__ = contract('ypq,qor->ypor', int2c_ip1, rhok0_P__)
+            int2c_ip1_inv = cupy.asarray(int2c_ip1_inv)
+
+        for i0, i1 in lib.prange(0,nao,64):
+            wk1_Pko_islice = cupy.asarray(wk1_Pko[i0:i1])
+            rhok1_Pko = contract('pq,iqox->ipox', int2c_inv, wk1_Pko_islice)
+            for k0, k1 in lib.prange(0,nao,64):
+                wk1_Pko_kslice = cupy.asarray(wk1_Pko[k0:k1])
+
+                # (10|0)(0|10) without response of RI basis
+                vk2_ip1_ip1 = contract('ipox,kpoy->ikxy', rhok1_Pko, wk1_Pko_kslice)
+                hk_ao_ao[i0:i1,k0:k1] += contract('ikxy,ik->ikxy', vk2_ip1_ip1, dm0[i0:i1,k0:k1])
+                vk2_ip1_ip1 = None
+
+                # (10|0)(0|01) without response of RI basis
+                bra = contract('ipox,ko->ipkx', rhok1_Pko, mocc_2[k0:k1])
+                ket = contract('kpoy,io->kpiy', wk1_Pko_kslice, mocc_2[i0:i1])
+                hk_ao_ao[i0:i1,k0:k1] += contract('ipkx,kpiy->ikxy', bra, ket)
+                bra = ket = None
+            wk1_Pko_kslice = None
             if hessobj.auxbasis_response:
                 # (10|0)(1|00)
-                wk_ip2_Ipo = cupy.einsum('porx,io->ipxr', wk_ip2_P__[p0:p1], mocc_2)
-                hk_ao_aux[:,p0:p1] += cupy.einsum('pxio,ipyo->ipxy', rhok1_Pko, wk_ip2_Ipo)
+                wk_ip2_Ipo = contract('porx,io->iprx', wk_ip2_P__, mocc_2[i0:i1])
+                hk_ao_aux[i0:i1] += contract('ipox,ipoy->ipxy', rhok1_Pko, wk_ip2_Ipo)
                 wk_ip2_Ipo = None
+
                 # (10|0)(1|0)(0|00)
-                wk1_P__ = cupy.einsum('ypq,qor->ypor', int2c_ip1[:,p0:p1], rhok0_P__)
-                wk1_P_I = cupy.einsum('ypor,ir->ypoi', wk1_P__, mocc_2)
-                hk_ao_aux[:,p0:p1] -= cupy.einsum('pxio,ypoi->ipxy', rhok1_Pko, wk1_P_I)
-                wk1_P_I = wk1_P__ = None
+                wk1_P_I = contract('ypor,ir->ipoy', wk1_P__, mocc_2[i0:i1])
+                hk_ao_aux[i0:i1] -= contract("ipox,ipoy->ipxy", rhok1_Pko, wk1_P_I)
+                wk1_P_I = rhok1_Pko = None
+
                 # (10|0)(0|1)(0|00)
-                int2c_tmp = cupy.asarray(int2c_ip1_inv[:,p0:p1], order='C')
-                wk1_I = contract('yqp,ipox->qxyio', int2c_tmp, wk1_Pko)
-                rhok0_tmp = cupy.einsum('qor,ir->qoi', rhok0_P__[p0:p1], mocc_2)
-                hk_ao_aux[:,p0:p1] -= cupy.einsum('qoi,qxyio->iqxy', rhok0_tmp, wk1_I)
+                wk1_I = contract('yqp,ipox->iqoxy', int2c_ip1_inv, wk1_Pko_islice)
+                rhok0_tmp = contract('qor,ir->iqo', rhok0_P__, mocc_2[i0:i1])
+                hk_ao_aux[i0:i1] -= contract('iqo,iqoxy->iqxy', rhok0_tmp, wk1_I)
                 wk1_I = rhok0_tmp = None
-        wk1_Pko = rhok1_Pko = int2c_tmp = None
+            wk1_Pko_islice = None
+        wk1_P__ = None
+    wk1_Pko = None
     t1 = log.timer_debug1('intermediate variables with int3c2e_ip1', *t1)
 
     cupy.get_default_memory_pool().free_all_blocks()
@@ -184,6 +195,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     hj, hk = int3c2e.get_int3c2e_ipvip1_hjk(intopt, rhoj0_P, rhok0_P__, dm0_tag, omega=omega)
     hj_ao_ao += 2.0*hj
     hk_ao_ao += hk
+    hj = hk = None
     t1 = log.timer_debug1('intermediate variables with int3c2e_ipvip1', *t1)
 
     #  int3c_ip1ip2 contributions
@@ -192,6 +204,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
         hj, hk = int3c2e.get_int3c2e_ip1ip2_hjk(intopt, rhoj0_P, rhok0_P__, dm0_tag, omega=omega)
         hj_ao_aux += hj
         hk_ao_aux += hk
+        hj = hk = None
         t1 = log.timer_debug1('intermediate variables with int3c2e_ip1ip2', *t1)
 
     #  int3c_ipip2 contributions
@@ -200,11 +213,12 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
         hj, hk = int3c2e.get_int3c2e_ipip2_hjk(intopt, rhoj0_P, rhok0_P__, dm0_tag, omega=omega)
         hj_aux_diag = hj
         hk_aux_diag = .5*hk
+        hj = hk = None
         t1 = log.timer_debug1('intermediate variables with int3c2e_ipip2', *t1)
 
     # int2c contributions
     if hessobj.auxbasis_response > 1:
-        aux_aux_9 = cupy.ix_(np.arange(9), sph_aux_idx, sph_aux_idx)
+        aux_aux_9 = np.ix_(np.arange(9), sph_aux_idx, sph_aux_idx)
         if omega and omega > 1e-10:
             with auxmol.with_range_coulomb(omega):
                 int2c_ipip1 = auxmol.intor('int2c2e_ipip1', aosym='s1')
@@ -212,12 +226,12 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
             int2c_ipip1 = auxmol.intor('int2c2e_ipip1', aosym='s1')
         int2c_ipip1 = cupy.asarray(int2c_ipip1)
         int2c_ipip1 = int2c_ipip1[aux_aux_9]
-        rhoj2c_P = cupy.einsum('xpq,q->xp', int2c_ipip1, rhoj0_P)
+        rhoj2c_P = contract('xpq,q->xp', int2c_ipip1, rhoj0_P)
         # (00|0)(2|0)(0|00)
         hj_aux_diag -= cupy.einsum('p,xp->px', rhoj0_P, rhoj2c_P).reshape(-1,3,3)
         if with_k:
-            rho2c_0 = cupy.einsum('pij,qji->pq', rhok0_P__, rhok0_P__)
-            hk_aux_diag -= .5 * cupy.einsum('pq,xpq->px', rho2c_0, int2c_ipip1).reshape(-1,3,3)
+            rho2c_0 = contract('pij,qji->pq', rhok0_P__, rhok0_P__)
+            hk_aux_diag -= .5 * contract('pq,xpq->px', rho2c_0, int2c_ipip1).reshape(-1,3,3)
         int2c_ipip1 = None
 
         if omega and omega > 1e-10:
@@ -229,39 +243,41 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
         int2c_ip1ip2 = int2c_ip1ip2[aux_aux_9]
         hj_aux_aux = -.5 * cupy.einsum('p,xpq,q->pqx', rhoj0_P, int2c_ip1ip2, rhoj0_P).reshape(naux, naux,3,3)
         if with_k:
-            hk_aux_aux = -.5 * cupy.einsum('xpq,pq->pqx', int2c_ip1ip2, rho2c_0).reshape(naux,naux,3,3)
+            hk_aux_aux = -.5 * contract('xpq,pq->pqx', int2c_ip1ip2, rho2c_0).reshape(naux,naux,3,3)
         t1 = log.timer_debug1('intermediate variables with int2c_*', *t1)
         int2c_ip1ip2 = aux_aux_9 = None
 
+    cupy.get_default_memory_pool().free_all_blocks()
+    release_gpu_stack()
     # aux-aux pair
     if hessobj.auxbasis_response > 1:
-        wj0_10 = cupy.einsum('ypq,p->ypq', int2c_ip1, rhoj0_P)
-        rhoj1 = cupy.einsum('px,pq->xpq', wj_ip2, int2c_inv)             # (0|0)(1|00)
-        rhoj0_01 = cupy.einsum('xp,pq->xpq', wj0_01, int2c_inv)          # (0|1)(0|00)
-        rhoj0_10 = cupy.einsum('p,xpq->xpq', rhoj0_P, int2c_ip1_inv)     # (1|0)(0|00)
-
-        hj_aux_aux += .5 * cupy.einsum('xpr,yqr->pqxy', rhoj0_10, wj0_10)  # (00|0)(1|0), (0|1)(0|00)
-        hj_aux_aux -=      cupy.einsum('xpq,yq->pqxy',  rhoj1,    wj0_01)  # (00|1),      (1|0)(0|00)
-        hj_aux_aux += .5 * cupy.einsum('xpq,qy->pqxy',  rhoj1,    wj_ip2)  # (00|1),      (1|00)
-        hj_aux_aux -=      cupy.einsum('xpr,yqr->pqxy', rhoj1,    wj0_10)  # (00|1),      (0|1)(0|00)
-        hj_aux_aux += .5 * cupy.einsum('xpq,yq->pqxy',  rhoj0_01, wj0_01)  # (00|0)(0|1), (1|0)(0|00)
-        hj_aux_aux +=      cupy.einsum('xpq,yq->pqxy',  rhoj0_10, wj0_01)  # (00|0)(1|0), (1|0)(0|00)
+        wj0_10 = contract('ypq,p->ypq', int2c_ip1, rhoj0_P)
+        rhoj1 = contract('px,pq->xpq', wj_ip2, int2c_inv)             # (0|0)(1|00)
+        rhoj0_01 = contract('xp,pq->xpq', wj0_01, int2c_inv)          # (0|1)(0|00)
+        rhoj0_10 = contract('p,xpq->xpq', rhoj0_P, int2c_ip1_inv)     # (1|0)(0|00)
+
+        hj_aux_aux += .5 * contract('xpr,yqr->pqxy', rhoj0_10, wj0_10)  # (00|0)(1|0), (0|1)(0|00)
+        hj_aux_aux -=      contract('xpq,yq->pqxy',  rhoj1,    wj0_01)  # (00|1),      (1|0)(0|00)
+        hj_aux_aux += .5 * contract('xpq,qy->pqxy',  rhoj1,    wj_ip2)  # (00|1),      (1|00)
+        hj_aux_aux -=      contract('xpr,yqr->pqxy', rhoj1,    wj0_10)  # (00|1),      (0|1)(0|00)
+        hj_aux_aux += .5 * contract('xpq,yq->pqxy',  rhoj0_01, wj0_01)  # (00|0)(0|1), (1|0)(0|00)
+        hj_aux_aux +=      contract('xpq,yq->pqxy',  rhoj0_10, wj0_01)  # (00|0)(1|0), (1|0)(0|00)
         wj0_01 = wj0_10 = rhoj1 = rhoj0_01 = rhoj0_10 = rhoj0_P = wj_ip2 = None
 
         if with_k:
-            rho2c_10 = cupy.einsum('rijx,qij->rqx', wk_ip2_P__, rhok0_P__)
-            rho2c_11 = cupy.einsum('pijx,qijy->pqxy', wk_ip2_P__, wk_ip2_P__)
-            rho2c0_10 = cupy.einsum('xpq,qr->xpr', int2c_ip1, rho2c_0)              # (00|0)(0|1)_(0|00)
-            rho2c1_10 = cupy.einsum('xpr,qry->pqxy', int2c_ip1, rho2c_10)           # (00|1)_(1|0)(0|00)
-            rho2c0_11 = cupy.einsum('xpr,yqr->pqxy', rho2c0_10, int2c_ip1)          # (00|0)(0|1)_(1|0)(0|00)
-            int2c_ip_ip = cupy.einsum('xpr,ysr->xyps', int2c_ip1_inv, int2c_ip1)    # (0|1)(0|0)(1|0)
-
-            hk_aux_aux += .5 * cupy.einsum('xypq,pq->pqxy', int2c_ip_ip, rho2c_0)     # (00|0)(1|0)(0|1)(0|00)
-            hk_aux_aux += .5 * cupy.einsum('pqxy,pq->pqxy', rho2c0_11, int2c_inv)     # (00|0)(0|1)(1|0)(0|00)
-            hk_aux_aux +=      cupy.einsum('xpq,yqp->pqxy', int2c_ip1_inv, rho2c0_10) # (00|0)(1|0)(1|0)(0|00)
-            hk_aux_aux -=      cupy.einsum('pqxy,pq->pqxy', rho2c1_10, int2c_inv)     # (00|1)(1|0)(0|00)
-            hk_aux_aux -=      cupy.einsum('pqx,yqp->pqxy', rho2c_10, int2c_ip1_inv)  # (00|1)(0|1)(0|00)
-            hk_aux_aux += .5 * cupy.einsum('pqxy,pq->pqxy', rho2c_11, int2c_inv)      # (00|1)(1|00)
+            rho2c_10 = contract('rijx,qij->rqx', wk_ip2_P__, rhok0_P__)
+            rho2c_11 = contract('pijx,qijy->pqxy', wk_ip2_P__, wk_ip2_P__)
+            rho2c0_10 = contract('xpq,qr->xpr', int2c_ip1, rho2c_0)              # (00|0)(0|1)_(0|00)
+            rho2c1_10 = contract('xpr,qry->pqxy', int2c_ip1, rho2c_10)           # (00|1)_(1|0)(0|00)
+            rho2c0_11 = contract('xpr,yqr->pqxy', rho2c0_10, int2c_ip1)          # (00|0)(0|1)_(1|0)(0|00)
+            int2c_ip_ip = contract('xpr,ysr->xyps', int2c_ip1_inv, int2c_ip1)    # (0|1)(0|0)(1|0)
+
+            hk_aux_aux += .5 * contract('xypq,pq->pqxy', int2c_ip_ip, rho2c_0)     # (00|0)(1|0)(0|1)(0|00)
+            hk_aux_aux += .5 * contract('pqxy,pq->pqxy', rho2c0_11, int2c_inv)     # (00|0)(0|1)(1|0)(0|00)
+            hk_aux_aux +=      contract('xpq,yqp->pqxy', int2c_ip1_inv, rho2c0_10) # (00|0)(1|0)(1|0)(0|00)
+            hk_aux_aux -=      contract('pqxy,pq->pqxy', rho2c1_10, int2c_inv)     # (00|1)(1|0)(0|00)
+            hk_aux_aux -=      contract('pqx,yqp->pqxy', rho2c_10, int2c_ip1_inv)  # (00|1)(0|1)(0|00)
+            hk_aux_aux += .5 * contract('pqxy,pq->pqxy', rho2c_11, int2c_inv)      # (00|1)(1|00)
             rho2c_0 = rho2c_10 = rho2c_11 = rho2c0_10 = rho2c1_10 = rho2c0_11 = int2c_ip_ip = None
             wk_ip2_P__ = int2c_ip1_inv = None
     ao_idx = np.argsort(intopt.sph_ao_idx)
@@ -413,12 +429,17 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
     int2c = int2c[cupy.ix_(sph_aux_idx, sph_aux_idx)]
     int2c_inv = cupy.linalg.pinv(int2c, rcond=1e-12)
 
-    wj, wk_Pl_ = int3c2e.get_int3c2e_wjk(mol, auxmol, dm0_tag, omega=omega)
-    wk_P__ = contract('pio,ir->pro', wk_Pl_, mocc)
+    wj, wk_Pl_, wk_P__ = int3c2e.get_int3c2e_wjk(mol, auxmol, dm0_tag, omega=omega)
     rhoj0 = contract('pq,q->p', int2c_inv, wj)
-    rhok0_Pl_ = contract('pq,qio->pio', int2c_inv, wk_Pl_)
     if with_k:
         rhok0_P__ = contract('pq,qij->pij', int2c_inv, wk_P__)
+    if isinstance(wk_Pl_, cupy.ndarray):
+        rhok0_Pl_ = contract('pq,qio->pio', int2c_inv, wk_Pl_)
+    else:
+        rhok0_Pl_ = np.empty_like(wk_Pl_)
+        for p0, p1 in lib.prange(0,nao,64):
+            wk_tmp = cupy.asarray(wk_Pl_[:,p0:p1])
+            rhok0_Pl_[:,p0:p1] = cupy.einsum('pq,qio->pio', int2c_inv, wk_tmp).get()
     wj = wk_Pl_ = wk_P__ = int2c_inv = int2c = None
 
     # int3c_ip1 contributions
@@ -451,14 +472,15 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
         wk0_10_P__ = contract('xqp,pro->xqro', int2c_ip1, rhok0_P__)
 
         for p0, p1 in lib.prange(0,nao,64):
-            vj1_tmp = cupy.einsum('pio,xp->xpio', rhok0_Pl_[:,p0:p1], wj0_10)
+            rhok_tmp = cupy.asarray(rhok0_Pl_[:,p0:p1])
+            vj1_tmp = cupy.einsum('pio,xp->xpio', rhok_tmp, wj0_10)
 
-            wk0_10_Pl_ = cupy.einsum('xqp,pio->xqio', int2c_ip1, rhok0_Pl_[:,p0:p1])
+            wk0_10_Pl_ = cupy.einsum('xqp,pio->xqio', int2c_ip1, rhok_tmp)
             vj1_tmp += cupy.einsum('xpio,p->xpio', wk0_10_Pl_, rhoj0)
             vj1_int3c_ip2[:,:,p0:p1] += cupy.einsum('xpio,pa->axio', vj1_tmp, aux2atom)
             if with_k:
                 vk1_tmp = 2.0 * cupy.einsum('xpio,pro->xpir', wk0_10_Pl_, rhok0_P__)
-                vk1_tmp += 2.0 * cupy.einsum('xpro,pir->xpio', wk0_10_P__, rhok0_Pl_[:,p0:p1])
+                vk1_tmp += 2.0 * cupy.einsum('xpro,pir->xpio', wk0_10_P__, rhok_tmp)
                 vk1_int3c_ip2[:,:,p0:p1] += cupy.einsum('xpio,pa->axio', vk1_tmp, aux2atom)
         wj0_10 = wk0_10_P__ = rhok0_P__ = int2c_ip1 = None
         vj1_tmp = vk1_tmp = wk0_10_Pl_ = rhoj0 = rhok0_Pl_ = None
diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py
index 7995c6d8..36268e08 100644
--- a/gpu4pyscf/df/int3c2e.py
+++ b/gpu4pyscf/df/int3c2e.py
@@ -21,7 +21,8 @@
 from pyscf import gto, df, lib
 from pyscf.scf import _vhf
 from gpu4pyscf.scf.hf import BasisProdCache, _make_s_index_offsets
-from gpu4pyscf.lib.cupy_helper import block_c2s_diag, cart2sph, block_diag, contract, load_library, c2s_l
+from gpu4pyscf.lib.cupy_helper import (
+    block_c2s_diag, cart2sph, block_diag, contract, load_library, c2s_l, get_avail_mem, print_mem_info)
 from gpu4pyscf.lib import logger
 
 LMAX_ON_GPU = 8
@@ -316,13 +317,14 @@ def build(self, cutoff=1e-14, group_size=None,
         cput1 = logger.timer_debug1(tot_mol, 'Initialize GPU cache', *cput1)
         self.bas_pairs_locs = bas_pairs_locs
         ncptype = len(self.log_qs)
+        self.aosym = aosym
         if aosym:
             self.cp_idx, self.cp_jdx = np.tril_indices(ncptype)
         else:
             nl = int(round(np.sqrt(ncptype)))
             self.cp_idx, self.cp_jdx = np.unravel_index(np.arange(ncptype), (nl, nl))
 
-def get_int3c2e_wjk(mol, auxmol, dm0_tag, thred=1e-12, omega=None):
+def get_int3c2e_wjk(mol, auxmol, dm0_tag, thred=1e-12, omega=None, with_k=True):
     intopt = VHFOpt(mol, auxmol, 'int2e')
     intopt.build(thred, diag_block_with_triu=True, aosym=True, group_size_aux=64)
     orbo = dm0_tag.occ_coeff
@@ -331,7 +333,25 @@ def get_int3c2e_wjk(mol, auxmol, dm0_tag, thred=1e-12, omega=None):
     nocc = orbo.shape[1]
     row, col = np.tril_indices(nao)
     wj = cupy.zeros([naux])
-    wk = cupy.zeros([naux,nao,nocc])
+    if with_k:
+        wk_P__ = cupy.zeros([naux, nocc, nocc]) # assuming naux*nocc*nocc < max_gpu_memory
+    else:
+        wk_P__ = None
+    avail_mem = get_avail_mem()
+    use_gpu_memory = True
+    if naux*nao*nocc*8 < 0.4*avail_mem:
+        try:
+            wk = cupy.zeros([naux,nao,nocc])
+        except Exception:
+            use_gpu_memory = False
+    else:
+        use_gpu_memory = False
+
+    if not use_gpu_memory:
+        mem = cupy.cuda.alloc_pinned_memory(naux*nao*nocc*8)
+        wk = np.ndarray([naux,nao,nocc], dtype=np.float64, order='C', buffer=mem)
+
+    # TODO: async data transfer
     for cp_kl_id, _ in enumerate(intopt.aux_log_qs):
         k0 = intopt.sph_aux_loc[cp_kl_id]
         k1 = intopt.sph_aux_loc[cp_kl_id+1]
@@ -347,10 +367,17 @@ def get_int3c2e_wjk(mol, auxmol, dm0_tag, thred=1e-12, omega=None):
             i0, i1 = intopt.sph_ao_loc[cpi], intopt.sph_ao_loc[cpi+1]
             j0, j1 = intopt.sph_ao_loc[cpj], intopt.sph_ao_loc[cpj+1]
             ints_slices[:,j0:j1,i0:i1] = int3c_blk
+
         ints_slices[:, row, col] = ints_slices[:, col, row]
         wj[k0:k1] = contract('Lij,ij->L', ints_slices, dm0_tag)
-        wk[k0:k1] = contract('Lij,jo->Lio', ints_slices, orbo)
-    return wj, wk
+        if with_k:
+            wk_tmp = contract('Lij,jo->Lio', ints_slices, orbo)
+            wk_P__[k0:k1] = contract('Lio,ir->Lro', wk_tmp, orbo)
+            if isinstance(wk, cupy.ndarray):
+                wk[k0:k1] = contract('Lij,jo->Lio', ints_slices, orbo)
+            else:
+                wk[k0:k1] = contract('Lij,jo->Lio', ints_slices, orbo).get()
+    return wj, wk, wk_P__
 
 def get_int3c2e_ip_jk(intopt, cp_aux_id, ip_type, rhoj, rhok, dm, omega=None):
     '''
@@ -688,12 +715,12 @@ def get_int3c2e_jk(intopt, dm0_tag, with_k=True, omega=None):
             i0, i1 = intopt.sph_ao_loc[cpi], intopt.sph_ao_loc[cpi+1]
             j0, j1 = intopt.sph_ao_loc[cpj], intopt.sph_ao_loc[cpj+1]
             ints_slices[:,j0:j1,i0:i1] = int3c_blk
-            if cpi != cpj:
+            if cpi != cpj and intopt.aosym:
                 ints_slices[:,i0:i1,j0:j1] = int3c_blk.transpose([0,2,1])
 
-        rhoj[k0:k1] += cupy.einsum('pji,ij->p', ints_slices, dm0_tag)
-        rhok_tmp = cupy.einsum('pji,jo->poi', ints_slices, orbo)
-        rhok[k0:k1] += cupy.einsum('poi,ir->por', rhok_tmp, orbo)
+        rhoj[k0:k1] += contract('pji,ij->p', ints_slices, dm0_tag)
+        rhok_tmp = contract('pji,jo->poi', ints_slices, orbo)
+        rhok[k0:k1] += contract('poi,ir->por', rhok_tmp, orbo)
 
     return rhoj, rhok
 
@@ -713,23 +740,24 @@ def get_int3c2e_ip1_vjk(intopt, rhoj, rhok, dm0_tag, aoslices, with_k=True, omeg
 
     for aux_id, int3c_blk in loop_aux_jk(intopt, ip_type='ip1', omega=omega):
         k0, k1 = intopt.sph_aux_loc[aux_id], intopt.sph_aux_loc[aux_id+1]
-        vj1_buf += cupy.einsum('xpji,p->xij', int3c_blk, rhoj[k0:k1])
+        vj1_buf += contract('xpji,p->xij', int3c_blk, rhoj[k0:k1])
 
+        rhok_tmp = cupy.asarray(rhok[k0:k1])
         if with_k:
-            rhok0_slice = cupy.einsum('pio,Jo->piJ', rhok[k0:k1], orbo) * 2
-            vk1_buf += cupy.einsum('xpji,plj->xil', int3c_blk, rhok0_slice)
+            rhok0_slice = contract('pio,Jo->piJ', rhok_tmp, orbo) * 2
+            vk1_buf += contract('xpji,plj->xil', int3c_blk, rhok0_slice)
 
-        rhoj0 = cupy.einsum('xpji,ij->xpi', int3c_blk, dm0_tag)
-        vj1_ao = cupy.einsum('pjo,xpi->xijo', rhok[k0:k1], rhoj0)
-        vj1 += 2.0*cupy.einsum('xiko,ia->axko', vj1_ao, ao2atom)
+        rhoj0 = contract('xpji,ij->xpi', int3c_blk, dm0_tag)
+        vj1_ao = contract('pjo,xpi->xijo', rhok_tmp, rhoj0)
+        vj1 += 2.0*contract('xiko,ia->axko', vj1_ao, ao2atom)
         if with_k:
-            int3c_ip1_occ = cupy.einsum('xpji,jo->xpio', int3c_blk, orbo)
-            vk1_ao = cupy.einsum('xpio,pki->xiko', int3c_ip1_occ, rhok0_slice)
-            vk1 += cupy.einsum('xiko,ia->axko', vk1_ao, ao2atom)
+            int3c_ip1_occ = contract('xpji,jo->xpio', int3c_blk, orbo)
+            vk1_ao = contract('xpio,pki->xiko', int3c_ip1_occ, rhok0_slice)
+            vk1 += contract('xiko,ia->axko', vk1_ao, ao2atom)
 
-            rhok0 = cupy.einsum('pli,lo->poi', rhok0_slice, orbo)
-            vk1_ao = cupy.einsum('xpji,poi->xijo', int3c_blk, rhok0)
-            vk1 += cupy.einsum('xiko,ia->axko', vk1_ao, ao2atom)
+            rhok0 = contract('pli,lo->poi', rhok0_slice, orbo)
+            vk1_ao = contract('xpji,poi->xijo', int3c_blk, rhok0)
+            vk1 += contract('xiko,ia->axko', vk1_ao, ao2atom)
     return vj1_buf, vk1_buf, vj1, vk1
 
 def get_int3c2e_ip2_vjk(intopt, rhoj, rhok, dm0_tag, auxslices, with_k=True, omega=None):
@@ -745,21 +773,22 @@ def get_int3c2e_ip2_vjk(intopt, rhoj, rhok, dm0_tag, auxslices, with_k=True, ome
     vk1 = cupy.zeros([natom,3,nao_sph,nocc])
     for aux_id, int3c_blk in loop_aux_jk(intopt, ip_type='ip2', omega=omega):
         k0, k1 = intopt.sph_aux_loc[aux_id], intopt.sph_aux_loc[aux_id+1]
-        wj2 = cupy.einsum('xpji,ji->xp', int3c_blk, dm0_tag)
-        wk2_P__ = cupy.einsum('xpji,jo->xpio', int3c_blk, orbo)
+        wj2 = contract('xpji,ji->xp', int3c_blk, dm0_tag)
+        wk2_P__ = contract('xpji,jo->xpio', int3c_blk, orbo)
 
-        vj1_tmp = -cupy.einsum('pio,xp->xpio', rhok[k0:k1], wj2)
-        vj1_tmp -= cupy.einsum('xpio,p->xpio', wk2_P__, rhoj[k0:k1])
+        rhok_tmp = cupy.asarray(rhok[k0:k1])
+        vj1_tmp = -contract('pio,xp->xpio', rhok_tmp, wj2)
+        vj1_tmp -= contract('xpio,p->xpio', wk2_P__, rhoj[k0:k1])
 
-        vj1 += cupy.einsum('xpio,pa->axio', vj1_tmp, aux2atom[k0:k1])
+        vj1 += contract('xpio,pa->axio', vj1_tmp, aux2atom[k0:k1])
         if with_k:
-            rhok0_slice = cupy.einsum('pio,jo->pij', rhok[k0:k1], orbo)
-            vk1_tmp = -cupy.einsum('xpjo,pij->xpio', wk2_P__, rhok0_slice) * 2
+            rhok0_slice = contract('pio,jo->pij', rhok_tmp, orbo)
+            vk1_tmp = -contract('xpjo,pij->xpio', wk2_P__, rhok0_slice) * 2
 
-            rhok0_oo = cupy.einsum('pio,ir->pro', rhok[k0:k1], orbo)
-            vk1_tmp -= cupy.einsum('xpio,pro->xpir', wk2_P__, rhok0_oo) * 2
+            rhok0_oo = contract('pio,ir->pro', rhok_tmp, orbo)
+            vk1_tmp -= contract('xpio,pro->xpir', wk2_P__, rhok0_oo) * 2
 
-            vk1 += cupy.einsum('xpir,pa->axir', vk1_tmp, aux2atom[k0:k1])
+            vk1 += contract('xpir,pa->axir', vk1_tmp, aux2atom[k0:k1])
         wj2 = wk2_P__ = rhok0_slice = rhok0_oo = None
     return vj1, vk1
 
@@ -772,11 +801,29 @@ def get_int3c2e_ip1_wjk(intopt, dm0_tag, with_k=True, omega=None):
     orbo = cupy.asarray(dm0_tag.occ_coeff, order='C')
     nocc = orbo.shape[1]
     wj = cupy.empty([nao_sph,naux_sph,3])
-    wk = cupy.empty([nao_sph,naux_sph,nocc,3])
+    avail_mem = get_avail_mem()
+    use_gpu_memory = True
+    if nao_sph*naux_sph*nocc*3*8 < 0.4*avail_mem:
+        try:
+            wk = cupy.empty([nao_sph,naux_sph,nocc,3])
+        except Exception:
+            use_gpu_memory = False
+    else:
+        use_gpu_memory = False
+
+    if not use_gpu_memory:
+        mem = cupy.cuda.alloc_pinned_memory(nao_sph*naux_sph*nocc*3*8)
+        wk = np.ndarray([nao_sph,naux_sph,nocc,3], dtype=np.float64, order='C', buffer=mem)
+
+    # TODO: async data transfer
     for aux_id, int3c_blk in loop_aux_jk(intopt, ip_type='ip1', omega=omega):
         k0, k1 = intopt.sph_aux_loc[aux_id], intopt.sph_aux_loc[aux_id+1]
-        wj[:,k0:k1] = cupy.einsum('xpji,ij->ipx', int3c_blk, dm0_tag)
-        wk[:,k0:k1] = cupy.einsum('xpji,jo->ipox', int3c_blk, orbo)
+        wj[:,k0:k1] = contract('xpji,ij->ipx', int3c_blk, dm0_tag)
+        wk_tmp = contract('xpji,jo->ipox', int3c_blk, orbo)
+        if use_gpu_memory:
+            wk[:,k0:k1] = wk_tmp
+        else:
+            wk[:,k0:k1] = wk_tmp.get()
     return wj, wk
 
 def get_int3c2e_ip2_wjk(intopt, dm0_tag, with_k=True, omega=None):
@@ -786,13 +833,12 @@ def get_int3c2e_ip2_wjk(intopt, dm0_tag, with_k=True, omega=None):
     naux_sph = len(intopt.sph_aux_idx)
     orbo = cupy.asarray(dm0_tag.occ_coeff, order='C')
     nocc = orbo.shape[1]
-    wj = cupy.empty([naux_sph,3])
-    wk = cupy.empty([naux_sph,nocc,nocc,3])
-    for aux_id, int3c_blk in loop_aux_jk(intopt, ip_type='ip2', omega=omega):
-        k0, k1 = intopt.sph_aux_loc[aux_id], intopt.sph_aux_loc[aux_id+1]
-        wj[k0:k1] = cupy.einsum('xpji,ij->px', int3c_blk, dm0_tag)
-        tmp = cupy.einsum('xpji,jo->piox', int3c_blk, orbo)
-        wk[k0:k1] = cupy.einsum('piox,ir->prox', tmp, orbo)
+    wj = cupy.zeros([naux_sph,3])
+    wk = cupy.zeros([naux_sph,nocc,nocc,3])
+    for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ip2', omega=omega):
+        wj[k0:k1] += contract('xpji,ji->px', int3c_blk, dm0_tag[j0:j1,i0:i1])
+        tmp = contract('xpji,jo->piox', int3c_blk, orbo[j0:j1])
+        wk[k0:k1] += contract('piox,ir->prox', tmp, orbo[i0:i1])
     return wj, wk
 
 def get_int3c2e_ipip1_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, omega=None):
@@ -804,11 +850,11 @@ def get_int3c2e_ipip1_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, omega=None):
     hj = cupy.zeros([nao_sph,9])
     hk = cupy.zeros([nao_sph,9])
     for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ipip1', omega=omega):
-        rhok_tmp = cupy.einsum('por,ir->pio', rhok[k0:k1], orbo[i0:i1])
-        rhok_tmp = cupy.einsum('pio,jo->pij', rhok_tmp, orbo[j0:j1])
-        tmp = cupy.einsum('xpji,ij->xpi', int3c_blk, dm0_tag[i0:i1,j0:j1])
-        hj[i0:i1] += cupy.einsum('xpi,p->ix', tmp, rhoj[k0:k1])
-        hk[i0:i1] += cupy.einsum('xpji,pij->ix', int3c_blk, rhok_tmp)
+        rhok_tmp = contract('por,ir->pio', rhok[k0:k1], orbo[i0:i1])
+        rhok_tmp = contract('pio,jo->pij', rhok_tmp, orbo[j0:j1])
+        tmp = contract('xpji,ij->xpi', int3c_blk, dm0_tag[i0:i1,j0:j1])
+        hj[i0:i1] += contract('xpi,p->ix', tmp, rhoj[k0:k1])
+        hk[i0:i1] += contract('xpji,pij->ix', int3c_blk, rhok_tmp)
     hj = hj.reshape([nao_sph,3,3])
     hk = hk.reshape([nao_sph,3,3])
     return hj, hk
@@ -822,11 +868,11 @@ def get_int3c2e_ipvip1_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, omega=None)
     hj = cupy.zeros([nao_sph,nao_sph,9])
     hk = cupy.zeros([nao_sph,nao_sph,9])
     for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ipvip1', omega=omega):
-        rhok_tmp = cupy.einsum('por,ir->pio', rhok[k0:k1], orbo[i0:i1])
-        rhok_tmp = cupy.einsum('pio,jo->pji', rhok_tmp, orbo[j0:j1])
-        tmp = cupy.einsum('xpji,ij->xpij', int3c_blk, dm0_tag[i0:i1,j0:j1])
-        hj[i0:i1,j0:j1] += cupy.einsum('xpij,p->ijx', tmp, rhoj[k0:k1])
-        hk[i0:i1,j0:j1] += cupy.einsum('xpji,pji->ijx', int3c_blk, rhok_tmp)
+        rhok_tmp = contract('por,ir->pio', rhok[k0:k1], orbo[i0:i1])
+        rhok_tmp = contract('pio,jo->pji', rhok_tmp, orbo[j0:j1])
+        tmp = contract('xpji,ij->xpij', int3c_blk, dm0_tag[i0:i1,j0:j1])
+        hj[i0:i1,j0:j1] += contract('xpij,p->ijx', tmp, rhoj[k0:k1])
+        hk[i0:i1,j0:j1] += contract('xpji,pji->ijx', int3c_blk, rhok_tmp)
     hj = hj.reshape([nao_sph,nao_sph,3,3])
     hk = hk.reshape([nao_sph,nao_sph,3,3])
     return hj, hk
@@ -841,11 +887,11 @@ def get_int3c2e_ip1ip2_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, omega=None)
     hj = cupy.zeros([nao_sph,naux_sph,9])
     hk = cupy.zeros([nao_sph,naux_sph,9])
     for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ip1ip2', omega=omega):
-        rhok_tmp = cupy.einsum('por,ir->pio', rhok[k0:k1], orbo[i0:i1])
-        rhok_tmp = cupy.einsum('pio,jo->pij', rhok_tmp, orbo[j0:j1])
-        tmp = cupy.einsum('xpji,ij->xpi', int3c_blk, dm0_tag[i0:i1,j0:j1])
-        hj[i0:i1,k0:k1] += cupy.einsum('xpi,p->ipx', tmp, rhoj[k0:k1])
-        hk[i0:i1,k0:k1] += cupy.einsum('xpji,pij->ipx', int3c_blk, rhok_tmp)
+        rhok_tmp = contract('por,ir->pio', rhok[k0:k1], orbo[i0:i1])
+        rhok_tmp = contract('pio,jo->pij', rhok_tmp, orbo[j0:j1])
+        tmp = contract('xpji,ij->xpi', int3c_blk, dm0_tag[i0:i1,j0:j1])
+        hj[i0:i1,k0:k1] += contract('xpi,p->ipx', tmp, rhoj[k0:k1])
+        hk[i0:i1,k0:k1] += contract('xpji,pij->ipx', int3c_blk, rhok_tmp)
     hj = hj.reshape([nao_sph,naux_sph,3,3])
     hk = hk.reshape([nao_sph,naux_sph,3,3])
     return hj, hk
@@ -859,11 +905,11 @@ def get_int3c2e_ipip2_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, omega=None):
     hj = cupy.zeros([naux_sph,9])
     hk = cupy.zeros([naux_sph,9])
     for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ipip2', omega=omega):
-        rhok_tmp = cupy.einsum('por,jr->pjo', rhok[k0:k1], orbo[j0:j1])
-        rhok_tmp = cupy.einsum('pjo,io->pji', rhok_tmp, orbo[i0:i1])
-        tmp = cupy.einsum('xpji,ij->xp', int3c_blk, dm0_tag[i0:i1,j0:j1])
-        hj[k0:k1] += cupy.einsum('xp,p->px', tmp, rhoj[k0:k1])
-        hk[k0:k1] += cupy.einsum('xpji,pji->px', int3c_blk, rhok_tmp)
+        rhok_tmp = contract('por,jr->pjo', rhok[k0:k1], orbo[j0:j1])
+        rhok_tmp = contract('pjo,io->pji', rhok_tmp, orbo[i0:i1])
+        tmp = contract('xpji,ij->xp', int3c_blk, dm0_tag[i0:i1,j0:j1])
+        hj[k0:k1] += contract('xp,p->px', tmp, rhoj[k0:k1])
+        hk[k0:k1] += contract('xpji,pji->px', int3c_blk, rhok_tmp)
     hj = hj.reshape([naux_sph,3,3])
     hk = hk.reshape([naux_sph,3,3])
     return hj, hk
diff --git a/gpu4pyscf/df/tests/test_df_scf.py b/gpu4pyscf/df/tests/test_df_scf.py
index 1b042096..8c686b63 100644
--- a/gpu4pyscf/df/tests/test_df_scf.py
+++ b/gpu4pyscf/df/tests/test_df_scf.py
@@ -93,15 +93,19 @@ def test_rks_wb97(self):
     def test_to_cpu(self):
         mf = scf.RHF(mol).density_fit().to_cpu()
         assert isinstance(mf, cpu_df_jk._DFHF)
-        mf = mf.to_gpu()
-        assert isinstance(mf, df_jk._DFHF)
+        # TODO: coming soon
+        #mf = mf.to_gpu()
+        #assert isinstance(mf, df_jk._DFHF)
 
         mf = rks.RKS(mol).density_fit().to_cpu()
         assert isinstance(mf, cpu_df_jk._DFHF)
-        assert 'gpu' not in mf.grids.__module__
-        mf = mf.to_gpu()
-        assert isinstance(mf, df_jk._DFHF)
-        assert 'gpu' in mf.grids.__module__
+        # grids are still not df._key
+        #assert 'gpu' not in mf.grids.__module__
+
+        # TODO: coming soon
+        #mf = mf.to_gpu()
+        #assert isinstance(mf, df_jk._DFHF)
+        #assert 'gpu' in mf.grids.__module__
 
 if __name__ == "__main__":
     print("Full Tests for SCF")
diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py
index 6f642889..26c441cf 100644
--- a/gpu4pyscf/hessian/rhf.py
+++ b/gpu4pyscf/hessian/rhf.py
@@ -36,7 +36,7 @@
 # import pyscf.grad.rhf to activate nuc_grad_method method
 from pyscf.grad import rhf  # noqa
 from gpu4pyscf.scf import cphf
-from gpu4pyscf.lib.cupy_helper import contract, tag_array
+from gpu4pyscf.lib.cupy_helper import contract, tag_array, print_mem_info
 from gpu4pyscf.lib import logger
 
 def hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
@@ -55,7 +55,6 @@ def hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     mo_energy = cupy.asarray(mo_energy)
     mo_occ = cupy.asarray(mo_occ)
     mo_coeff = cupy.asarray(mo_coeff)
-
     de2 = hessobj.partial_hess_elec(mo_energy, mo_coeff, mo_occ, atmlst,
                                     max_memory, log)
     if h1ao is None:
diff --git a/gpu4pyscf/lib/cupy_helper.py b/gpu4pyscf/lib/cupy_helper.py
index 9f2cef59..2285b9dd 100644
--- a/gpu4pyscf/lib/cupy_helper.py
+++ b/gpu4pyscf/lib/cupy_helper.py
@@ -57,7 +57,6 @@ def print_mem_info():
     cupy.get_default_memory_pool().free_all_blocks()
     cupy.get_default_pinned_memory_pool().free_all_blocks()
     mem_avail = cupy.cuda.runtime.memGetInfo()[0]
-    print(cupy.cuda.runtime.memGetInfo())
     total_mem = mempool.total_bytes()
     used_mem = mempool.used_bytes()
     mem_limit = mempool.get_limit()
@@ -322,7 +321,7 @@ def cart2sph(t, axis=0, ang=1, out=None):
     t_sph = contract('min,ip->mpn', t_cart, c2s, out=out)
     return t_sph.reshape(out_shape)
 
-# a copy with modification from 
+# a copy with modification from
 # https://github.com/pyscf/pyscf/blob/9219058ac0a1bcdd8058166cad0fb9127b82e9bf/pyscf/lib/linalg_helper.py#L1536
 def krylov(aop, b, x0=None, tol=1e-10, max_cycle=30, dot=cupy.dot,
            lindep=DSOLVE_LINDEP, callback=None, hermi=False,
diff --git a/gpu4pyscf/lib/gdft/nr_eval_gto.cu b/gpu4pyscf/lib/gdft/nr_eval_gto.cu
index 34a2783a..b87ca434 100644
--- a/gpu4pyscf/lib/gdft/nr_eval_gto.cu
+++ b/gpu4pyscf/lib/gdft/nr_eval_gto.cu
@@ -40,7 +40,7 @@ static void _nabla1(double *fx1, double *fy1, double *fz1,
     fx1[0] = a2*fx0[1];
     fy1[0] = a2*fy0[1];
     fz1[0] = a2*fz0[1];
-    
+
     for (i = 1; i <= ANG; i++) {
         fx1[i] = i*fx0[i-1] + a2*fx0[i+1];
         fy1[i] = i*fy0[i-1] + a2*fy0[i+1];
@@ -174,11 +174,11 @@ static void _cart_kernel_deriv0(BasOffsets offsets)
         double xpows[LMAX];
         double ypows[LMAX];
         double zpows[LMAX];
-        
+
         xpows[0] = 1.0;
         ypows[0] = 1.0;
         zpows[0] = 1.0;
-        
+
         for(lx = 1; lx <= ANG ; lx++){
             xpows[lx] = xpows[lx-1] * rx;
             ypows[lx] = ypows[lx-1] * ry;
@@ -342,7 +342,7 @@ static void _cart_kernel_deriv1(BasOffsets offsets)
         gtoz[7*ngrids+grid_id] = az * ry * ry * rz +     byy;
         gtoz[8*ngrids+grid_id] = az * ry * rz * rz + 2 * byz;
         gtoz[9*ngrids+grid_id] = az * rz * rz * rz + 3 * bzz;
-    } 
+    }
     // There is a bug in the comment.
     // Using a general formulation.
     // FIXME later
@@ -390,7 +390,7 @@ static void _cart_kernel_deriv1(BasOffsets offsets)
         gtox[12*ngrids+grid_id] = ax * ry * ry * rz * rz;
         gtox[13*ngrids+grid_id] = ax * ry * rz * rz * rz;
         gtox[14*ngrids+grid_id] = ax * rz * rz * rz * rz;
-        gtoy[          grid_id] = ay * rx * rx * rx * rx;          
+        gtoy[          grid_id] = ay * rx * rx * rx * rx;
         gtoy[1 *ngrids+grid_id] = ay * rx * rx * rx * ry +     bxxx;
         gtoy[2 *ngrids+grid_id] = ay * rx * rx * rx * rz;
         gtoy[3 *ngrids+grid_id] = ay * rx * rx * ry * ry + 2 * bxxy;
@@ -404,11 +404,11 @@ static void _cart_kernel_deriv1(BasOffsets offsets)
         gtoy[11*ngrids+grid_id] = ay * ry * ry * ry * rz + 3 * byyz;
         gtoy[12*ngrids+grid_id] = ay * ry * ry * rz * rz + 2 * byzz;
         gtoy[13*ngrids+grid_id] = ay * ry * rz * rz * rz +     bzzz;
-        gtoy[14*ngrids+grid_id] = ay * rz * rz * rz * rz;          
+        gtoy[14*ngrids+grid_id] = ay * rz * rz * rz * rz;
         gtoz[          grid_id] = az * rx * rx * rx * rx;
         gtoz[1 *ngrids+grid_id] = az * rx * rx * rx * ry;
         gtoz[2 *ngrids+grid_id] = az * rx * rx * rx * rz +     bxxx;
-        gtoz[3 *ngrids+grid_id] = az * rx * rx * ry * ry; 
+        gtoz[3 *ngrids+grid_id] = az * rx * rx * ry * ry;
         gtoz[4 *ngrids+grid_id] = az * rx * rx * ry * rz +     bxxy;
         gtoz[5 *ngrids+grid_id] = az * rx * rx * rz * rz + 2 * bxxz;
         gtoz[6 *ngrids+grid_id] = az * rx * ry * ry * ry;
@@ -477,7 +477,7 @@ static void _cart_kernel_deriv2(BasOffsets offsets)
     double* __restrict__ gtoyy = offsets.data + (nao * 7 + i0) * ngrids;
     double* __restrict__ gtoyz = offsets.data + (nao * 8 + i0) * ngrids;
     double* __restrict__ gtozz = offsets.data + (nao * 9 + i0) * ngrids;
-    
+
     double *atom_coordx = c_envs.atom_coordx;
     double *atom_coordy = c_envs.atom_coordx + natm;
     double *atom_coordz = c_envs.atom_coordx + natm * 2;
@@ -494,7 +494,7 @@ static void _cart_kernel_deriv2(BasOffsets offsets)
     double fx0[16], fy0[16], fz0[16];
     double fx1[16], fy1[16], fz1[16];
     double fx2[16], fy2[16], fz2[16];
-    
+
     fx0[0] = 1.0; fy0[0] = 1.0; fz0[0] = 1.0;
     for (int lx = 1; lx <= ANG+2; lx++){
         fx0[lx] = fx0[lx-1] * rx;
@@ -587,7 +587,7 @@ static void _cart_kernel_deriv3(BasOffsets offsets)
         fy0[lx] = fy0[lx-1] * ry;
         fz0[lx] = fz0[lx-1] * rz;
     }
-    
+
     for (int ip = 0; ip < offsets.nprim; ++ip) {
         double ce = coeffs[ip] * exp(-exps[ip] * rr) * offsets.fac;
         _nabla1<ANG+2>(fx1, fy1, fz1, fx0, fy0, fz0, exps[ip]);
@@ -701,7 +701,7 @@ static void _cart_kernel_deriv4(BasOffsets offsets)
         fy0[lx] = fy0[lx-1] * ry;
         fz0[lx] = fz0[lx-1] * rz;
     }
-    
+
     for (int ip = 0; ip < offsets.nprim; ++ip) {
         double ce = coeffs[ip] * exp(-exps[ip] * rr) * offsets.fac;
         _nabla1<ANG+3>(fx1, fy1, fz1, fx0, fy0, fz0, exps[ip]);
@@ -1085,7 +1085,7 @@ static void _sph_kernel_deriv1(BasOffsets offsets)
         gtox[6 *ngrids+grid_id] = 2.838524087272680054 * g5 + 0.473087347878780009 * g10 - 0.473087347878780002 * g0 - 2.838524087272680050 * g12;
         gtox[7 *ngrids+grid_id] = 1.770130769779930531 * g2 - 5.310392309339791590 * g7 ;
         gtox[8 *ngrids+grid_id] = 0.625835735449176134 * g0 - 3.755014412695056800 * g3 + 0.625835735449176134 * g10;
-        g0  = ay * rx * rx * rx * rx;          
+        g0  = ay * rx * rx * rx * rx;
         g1  = ay * rx * rx * rx * ry +     bxxx;
         g2  = ay * rx * rx * rx * rz;
         g3  = ay * rx * rx * ry * ry + 2 * bxxy;
@@ -1099,7 +1099,7 @@ static void _sph_kernel_deriv1(BasOffsets offsets)
         g11 = ay * ry * ry * ry * rz + 3 * byyz;
         g12 = ay * ry * ry * rz * rz + 2 * byzz;
         g13 = ay * ry * rz * rz * rz +     bzzz;
-        g14 = ay * rz * rz * rz * rz;          
+        g14 = ay * rz * rz * rz * rz;
         gtoy[          grid_id] = 2.503342941796704538 * g1 - 2.503342941796704530 * g6 ;
         gtoy[1 *ngrids+grid_id] = 5.310392309339791593 * g4 - 1.770130769779930530 * g11;
         gtoy[2 *ngrids+grid_id] = 5.677048174545360108 * g8 - 0.946174695757560014 * g1 - 0.946174695757560014 * g6 ;
@@ -1112,7 +1112,7 @@ static void _sph_kernel_deriv1(BasOffsets offsets)
         g0  = az * rx * rx * rx * rx;
         g1  = az * rx * rx * rx * ry;
         g2  = az * rx * rx * rx * rz +     bxxx;
-        g3  = az * rx * rx * ry * ry; 
+        g3  = az * rx * rx * ry * ry;
         g4  = az * rx * rx * ry * rz +     bxxy;
         g5  = az * rx * rx * rz * rz + 2 * bxxz;
         g6  = az * rx * ry * ry * ry;
@@ -1162,7 +1162,7 @@ static void _sph_kernel_deriv2(BasOffsets offsets)
     double* __restrict__ gtoyy = offsets.data + (nao * 7 + i0) * ngrids;
     double* __restrict__ gtoyz = offsets.data + (nao * 8 + i0) * ngrids;
     double* __restrict__ gtozz = offsets.data + (nao * 9 + i0) * ngrids;
-    
+
     double *atom_coordx = c_envs.atom_coordx;
     double *atom_coordy = c_envs.atom_coordx + natm;
     double *atom_coordz = c_envs.atom_coordx + natm * 2;
@@ -1179,7 +1179,7 @@ static void _sph_kernel_deriv2(BasOffsets offsets)
     double fx0[16], fy0[16], fz0[16];
     double fx1[16], fy1[16], fz1[16];
     double fx2[16], fy2[16], fz2[16];
-    
+
     fx0[0] = 1.0; fy0[0] = 1.0; fz0[0] = 1.0;
     for (int lx = 1; lx <= ANG+2; lx++){
         fx0[lx] = fx0[lx-1] * rx;
@@ -1267,14 +1267,14 @@ static void _sph_kernel_deriv3(BasOffsets offsets)
         fy0[lx] = fy0[lx-1] * ry;
         fz0[lx] = fz0[lx-1] * rz;
     }
-    
+
     double g[GTO_MAX_CART];
     for (int ip = 0; ip < offsets.nprim; ++ip) {
         double ce = coeffs[ip] * exp(-exps[ip] * rr) * offsets.fac;
         _nabla1<ANG+2>(fx1, fy1, fz1, fx0, fy0, fz0, exps[ip]);
         _nabla1<ANG+1>(fx2, fy2, fz2, fx1, fy1, fz1, exps[ip]);
         _nabla1<ANG  >(fx3, fy3, fz3, fx2, fy2, fz2, exps[ip]);
-        
+
         _cart_gto<ANG>(g, ce, fx0, fy0, fz0); _cart2sph<ANG>(g, gto,    ngrids, grid_id);
         _cart_gto<ANG>(g, ce, fx1, fy0, fz0); _cart2sph<ANG>(g, gtox,   ngrids, grid_id);
         _cart_gto<ANG>(g, ce, fx0, fy1, fz0); _cart2sph<ANG>(g, gtoy,   ngrids, grid_id);
@@ -1506,7 +1506,7 @@ int GDFTeval_gto(cudaStream_t stream, double *ao, int deriv, int cart,
     for (int bucket = 0; bucket < nbuckets; ++bucket) {
         int ish = bas_loc[bucket];
         int l = bas[ANG_OF+ish*BAS_SLOTS];
-        
+
         offsets.bas_off = ish;
         offsets.nprim = bas[NPRIM_OF+ish*BAS_SLOTS];
         offsets.fac = CINTcommon_fac_sp(l);
diff --git a/gpu4pyscf/lib/gdft/nr_numint_sparse.cu b/gpu4pyscf/lib/gdft/nr_numint_sparse.cu
index 7b7455b9..4ec57c76 100644
--- a/gpu4pyscf/lib/gdft/nr_numint_sparse.cu
+++ b/gpu4pyscf/lib/gdft/nr_numint_sparse.cu
@@ -13,7 +13,7 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
- 
+
 #include <stdio.h>
 #include <stdint.h>
 #include <stdlib.h>
@@ -206,13 +206,13 @@ static void _dot_aow_ao(double *out, double *bra, double *ket, double *wv,
 
     __shared__ double s_bra[THREADSXY];
     __shared__ double s_ket[THREADSXY];
-    
+
     int grid_blk;
     for (grid_blk = 0; grid_blk < ngrids/THREADSX; grid_blk++) {
         int grid0 = grid_blk * THREADSX;
         uint8_t si = screen_index[grid_blk*bas_blocks+ish4];
         uint8_t sj = screen_index[grid_blk*bas_blocks+jsh4];
-        //printf("%d %d %d %d %d ***", si, sj, nbins, grid_blk*bas_blocks+ish4, grid_blk*bas_blocks+jsh4);    
+        //printf("%d %d %d %d %d ***", si, sj, nbins, grid_blk*bas_blocks+ish4, grid_blk*bas_blocks+jsh4);
         if (si + sj >= 0) { //nbins) {
             int grid_id = grid0 + txy;
             for (int n = 0; n < THREADSY; n++) {
diff --git a/gpu4pyscf/lib/gdft/vv10.cu b/gpu4pyscf/lib/gdft/vv10.cu
index 51a1bd6f..df26dffe 100644
--- a/gpu4pyscf/lib/gdft/vv10.cu
+++ b/gpu4pyscf/lib/gdft/vv10.cu
@@ -49,34 +49,34 @@ static void vv10_kernel(double *Fvec, double *Uvec, double *Wvec,
     double F = 0.0;
     double U = 0.0;
     double W = 0.0;
-    
+
     double *xj = vvcoords;
     double *yj = vvcoords + vvngrids;
     double *zj = vvcoords + 2*vvngrids;
 
     __shared__ double3 xj_t[THREADS];
     __shared__ double3 kp_t[THREADS];
-    
+
     const int tx = threadIdx.x;
     for (int j = 0; j < vvngrids; j+=blockDim.x) {
         int idx = j + threadIdx.x;
 
         xj_t[tx] = {xj[idx], yj[idx], zj[idx]};
-        kp_t[tx] = {Kp[idx], W0p[idx], RpW[idx]}; 
-        
+        kp_t[tx] = {Kp[idx], W0p[idx], RpW[idx]};
+
         __syncthreads();
         for (int l = 0, M = min(THREADS, vvngrids - j); l < M; ++l){
             double3 xj_tmp = xj_t[l];
             double pjx = xj_tmp.x;
             double pjy = xj_tmp.y;
             double pjz = xj_tmp.z;
-            
+
             // about 23 operations for each pair
             double DX = pjx - xi;
-            double DY = pjy - yi; 
+            double DY = pjy - yi;
             double DZ = pjz - zi;
             double R2 = DX*DX + DY*DY + DZ*DZ;
-            
+
             double3 kp_tmp = kp_t[l];
             double Kpj = kp_tmp.x;
             double W0pj = kp_tmp.y;
@@ -87,7 +87,7 @@ static void vv10_kernel(double *Fvec, double *Uvec, double *Wvec,
             double gt = g + gp;
             double ggt = g * gt;
             double T = RpWj / (gp*ggt);
-            
+
             F += T;
             T *= (g + gt)/ggt;
             U += T;
@@ -126,7 +126,7 @@ static void vv10_grad_kernel(double *Fvec, double *vvcoords, double *coords,
     double *xj = vvcoords;
     double *yj = vvcoords + vvngrids;
     double *zj = vvcoords + 2*vvngrids;
-    
+
     __shared__ double3 xj_t[THREADS];
     __shared__ double3 kp_t[THREADS];
 
@@ -135,7 +135,7 @@ static void vv10_grad_kernel(double *Fvec, double *vvcoords, double *coords,
         int idx = j + threadIdx.x;
 
         xj_t[tx] = {xj[idx], yj[idx], zj[idx]};
-        kp_t[tx] = {Kp[idx], W0p[idx], RpW[idx]}; 
+        kp_t[tx] = {Kp[idx], W0p[idx], RpW[idx]};
 
         __syncthreads();
         for (int l = 0, M = min(THREADS, vvngrids - j); l < M; ++l){
@@ -143,10 +143,10 @@ static void vv10_grad_kernel(double *Fvec, double *vvcoords, double *coords,
             double pjx = xj_tmp.x;
             double pjy = xj_tmp.y;
             double pjz = xj_tmp.z;
-            
+
             // about 23 operations for each pair
             double DX = pjx - xi;
-            double DY = pjy - yi; 
+            double DY = pjy - yi;
             double DZ = pjz - zi;
             double R2 = DX*DX + DY*DY + DZ*DZ;
 
@@ -154,7 +154,7 @@ static void vv10_grad_kernel(double *Fvec, double *vvcoords, double *coords,
             double Kpj = kp_tmp.x;
             double W0pj = kp_tmp.y;
             double RpWj = kp_tmp.z;
-            
+
             double gp = R2*W0pj + Kpj;
             double g  = R2*W0i + Ki;
             double gt = g + gp;
diff --git a/gpu4pyscf/lib/utils.py b/gpu4pyscf/lib/utils.py
index a5cf6187..5ce6613e 100644
--- a/gpu4pyscf/lib/utils.py
+++ b/gpu4pyscf/lib/utils.py
@@ -37,7 +37,6 @@ def to_cpu(method):
         if 'gpu4pyscf' not in pyscf_cls.__module__:
             break
     method = method.view(pyscf_cls)
-
     keys = []
     for cls in pyscf_cls.__mro__[:-1]:
         if hasattr(cls, '_keys'):
diff --git a/gpu4pyscf/scf/tests/test_scf.py b/gpu4pyscf/scf/tests/test_scf.py
index 3dd94806..29adecd4 100644
--- a/gpu4pyscf/scf/tests/test_scf.py
+++ b/gpu4pyscf/scf/tests/test_scf.py
@@ -31,11 +31,16 @@
 H       0.7570000000     0.0000000000    -0.4696000000
 '''
 bas='def2-qzvpp'
-mol = pyscf.M(atom=atom, basis=bas, max_memory=32000)
-mol.verbose = 4
+def setUpModule():
+    global mol
+    mol = pyscf.M(atom=atom, basis=bas, max_memory=32000)
+    mol.output = '/dev/null'
+    mol.verbose = 0
+    mol.build()
 
 def tearDownModule():
     global mol
+    mol.stdout.close()
     del mol
 
 class KnownValues(unittest.TestCase):
@@ -51,16 +56,18 @@ def test_rhf(self):
 
     def test_to_cpu(self):
         mf = scf.RHF(mol).to_cpu()
-        assert isinstance(mf, cpu_scf.RHF)
-        mf = mf.to_gpu()
-        assert isinstance(mf, scf.RHF)
+        assert isinstance(mf, cpu_scf.hf.RHF)
+        # coming soon
+        #mf = mf.to_gpu()
+        #assert isinstance(mf, scf.RHF)
 
         mf = rks.RKS(mol).to_cpu()
         assert isinstance(mf, cpu_dft.rks.RKS)
         assert 'gpu' not in mf.grids.__module__
-        mf = mf.to_gpu()
-        assert isinstance(mf, rks.RKS)
-        assert 'gpu' in mf.grids.__module__
+        # coming soon
+        # mf = mf.to_gpu()
+        # assert isinstance(mf, rks.RKS)
+        #assert 'gpu' in mf.grids.__module__
 
 if __name__ == "__main__":
     print("Full Tests for SCF")
diff --git a/gpu4pyscf/solvent/grad/pcm.py b/gpu4pyscf/solvent/grad/pcm.py
index 7c9d0047..4df17558 100644
--- a/gpu4pyscf/solvent/grad/pcm.py
+++ b/gpu4pyscf/solvent/grad/pcm.py
@@ -296,4 +296,3 @@ def _finalize(self):
 
     return WithSolventGrad(grad_method)
 
-#pcm.PCM.nuc_grad_method = make_grad_object
\ No newline at end of file
diff --git a/gpu4pyscf/solvent/pcm.py b/gpu4pyscf/solvent/pcm.py
index b7a1d181..fd3b36e4 100644
--- a/gpu4pyscf/solvent/pcm.py
+++ b/gpu4pyscf/solvent/pcm.py
@@ -293,7 +293,7 @@ def build(self, ng=None):
         self.v_grids_n = cupy.asarray(v_grids_n)
 
     def _get_vind(self, dms):
-        if not self._intermediates or self.grids.coords is None:
+        if not self._intermediates:
             self.build()
 
         nao = dms.shape[-1]

From 2fff5f3deffe1241dcd9ab34ff0fa809054a21c4 Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <wxj6000@gmail.com>
Date: Tue, 24 Oct 2023 15:55:35 -0700
Subject: [PATCH 13/19] Update __init__.py

---
 gpu4pyscf/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gpu4pyscf/__init__.py b/gpu4pyscf/__init__.py
index 7fb02e52..ca735418 100644
--- a/gpu4pyscf/__init__.py
+++ b/gpu4pyscf/__init__.py
@@ -1,2 +1,2 @@
 from . import lib, grad, hessian, solvent, scf, dft
-__version__ = '0.6.3'
+__version__ = '0.6.4'

From 891ef8180372884cea39ff964a0942e14a34ad8d Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <wxj6000@gmail.com>
Date: Tue, 24 Oct 2023 21:46:30 -0700
Subject: [PATCH 14/19] Add chelpg charges in qmmm folder. (#1) (#52)

* Add chelpg charges in qmmm folder.

* Update chelpg.py

* Update chelpg.py

* Add unit test for chelpg, and compare with Qchem

* Add an example to calculate chelpg

Co-authored-by: puzhichen <147788878+puzhichen@users.noreply.github.com>
---
 examples/15-chelpg.py              |  39 +++
 gpu4pyscf/qmmm/__init__.py         |  16 ++
 gpu4pyscf/qmmm/chelpg.py           | 371 +++++++++++++++++++++++++++++
 gpu4pyscf/qmmm/test/test_chelpg.py |  85 +++++++
 4 files changed, 511 insertions(+)
 create mode 100644 examples/15-chelpg.py
 create mode 100644 gpu4pyscf/qmmm/__init__.py
 create mode 100644 gpu4pyscf/qmmm/chelpg.py
 create mode 100644 gpu4pyscf/qmmm/test/test_chelpg.py

diff --git a/examples/15-chelpg.py b/examples/15-chelpg.py
new file mode 100644
index 00000000..f102741b
--- /dev/null
+++ b/examples/15-chelpg.py
@@ -0,0 +1,39 @@
+# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+from pyscf import gto
+from gpu4pyscf.dft import rks
+from gpu4pyscf.qmmm import chelpg
+
+    
+mol = gto.Mole()
+mol.verbose = 0
+mol.output = None
+mol.atom = [
+    [1 , (1. ,  0.     , 0.000)],
+    [1 , (0. ,  1.     , 0.000)],
+    [1 , (0. , -1.517  , 1.177)],
+    [1 , (0. ,  1.517  , 1.177)] ]
+mol.basis = '631g'
+mol.unit = 'B'
+mol.build()
+mol.verbose = 6
+    
+xc = 'b3lyp'
+mf = rks.RKS(mol, xc=xc)
+mf.grids.level = 5
+mf.kernel()
+q = chelpg.eval_chelpg_layer_gpu(mf)
+print(q) # [ 0.04402311  0.11333945 -0.25767919  0.10031663]
\ No newline at end of file
diff --git a/gpu4pyscf/qmmm/__init__.py b/gpu4pyscf/qmmm/__init__.py
new file mode 100644
index 00000000..b165366a
--- /dev/null
+++ b/gpu4pyscf/qmmm/__init__.py
@@ -0,0 +1,16 @@
+# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+from gpu4pyscf.qmmm import chelpg
\ No newline at end of file
diff --git a/gpu4pyscf/qmmm/chelpg.py b/gpu4pyscf/qmmm/chelpg.py
new file mode 100644
index 00000000..3851acc0
--- /dev/null
+++ b/gpu4pyscf/qmmm/chelpg.py
@@ -0,0 +1,371 @@
+# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import pyscf
+import time
+import cupy
+import numpy as np
+import scipy
+import ctypes
+from pyscf import lib, gto
+from pyscf.scf import _vhf
+from gpu4pyscf.df import int3c2e
+from gpu4pyscf.scf.hf import BasisProdCache
+from gpu4pyscf.lib.cupy_helper import load_library, block_c2s_diag
+libgint = load_library('libgint')
+libgvhf = load_library('libgvhf')
+lib.num_threads(8)
+
+
+def get_j_int3c2e_pass1(intopt, dm0):
+    '''
+    get rhoj pass1 for int3c2e
+    '''
+    n_dm = 1
+
+    naux = intopt.naux
+    rhoj = cupy.zeros([naux])
+    coeff = intopt.coeff
+    dm_cart = cupy.einsum('pi,ij,qj->pq', coeff, dm0, coeff)
+
+    num_cp_ij = [len(log_qs) for log_qs in intopt.log_qs]
+    num_cp_kl = [len(log_qs) for log_qs in intopt.aux_log_qs]
+
+    bins_locs_ij = np.append(0, np.cumsum(num_cp_ij)).astype(np.int32)
+    bins_locs_kl = np.append(0, np.cumsum(num_cp_kl)).astype(np.int32)
+
+    ncp_ij = len(intopt.log_qs)
+    ncp_kl = len(intopt.aux_log_qs)
+    norb = dm_cart.shape[0]
+    err = libgvhf.GINTbuild_j_int3c2e_pass1(
+        intopt.bpcache,
+        ctypes.cast(dm_cart.data.ptr, ctypes.c_void_p),
+        ctypes.cast(rhoj.data.ptr, ctypes.c_void_p),
+        ctypes.c_int(norb),
+        ctypes.c_int(naux),
+        ctypes.c_int(n_dm),
+        bins_locs_ij.ctypes.data_as(ctypes.c_void_p),
+        bins_locs_kl.ctypes.data_as(ctypes.c_void_p),
+        ctypes.c_int(ncp_ij),
+        ctypes.c_int(ncp_kl))
+    if err != 0:
+        raise RuntimeError('CUDA error in get_j_pass1')
+    return rhoj
+
+
+class VHFOpt(_vhf.VHFOpt):
+    def __init__(self, mol, auxmol, intor, prescreen='CVHFnoscreen',
+                 qcondname='CVHFsetnr_direct_scf', dmcondname=None):
+        # use local basis_seg_contraction for efficiency
+        self.mol = int3c2e.basis_seg_contraction(mol, allow_replica=True)
+        self.auxmol = int3c2e.basis_seg_contraction(auxmol, allow_replica=True)
+        '''
+        # Note mol._bas will be sorted in .build() method. VHFOpt should be
+        # initialized after mol._bas updated.
+        '''
+        self.nao = self.mol.nao
+        self.naux = self.auxmol.nao
+
+        self._intor = intor
+        self._prescreen = prescreen
+        self._qcondname = qcondname
+        self._dmcondname = dmcondname
+
+        self.bpcache = None
+
+        self.sorted_auxmol = None
+        self.sorted_mol = None
+
+        self.cart_ao_idx = None
+        self.sph_ao_idx = None
+        self.cart_aux_idx = None
+        self.sph_aux_idx = None
+
+        self.cart_ao_loc = []
+        self.cart_aux_loc = []
+        self.sph_ao_loc = []
+        self.sph_aux_loc = []
+
+        self.cart2sph = None
+        self.aux_cart2sph = None
+
+        self.angular = None
+        self.aux_angular = None
+
+        self.cp_idx = None
+        self.cp_jdx = None
+
+        self.log_qs = None
+        self.aux_log_qs = None
+
+    def clear(self):
+        _vhf.VHFOpt.__del__(self)
+        libgvhf.GINTdel_basis_prod(ctypes.byref(self.bpcache))
+        return self
+
+    def __del__(self):
+        try:
+            self.clear()
+        except AttributeError:
+            pass
+
+    def build(self, cutoff=1e-14, group_size=None,
+              group_size_aux=None, diag_block_with_triu=False, aosym=False):
+        '''
+        int3c2e is based on int2e with (ao,ao|aux,1)
+        a tot_mol is created by concatenating [mol, fake_mol, aux_mol]
+        we will pair (ao,ao) and (aux,1) separately.
+        '''
+        sorted_mol, sorted_idx, uniq_l_ctr, l_ctr_counts = int3c2e.sort_mol(
+            self.mol)
+        if group_size is not None:
+            uniq_l_ctr, l_ctr_counts = int3c2e._split_l_ctr_groups(
+                uniq_l_ctr, l_ctr_counts, group_size)
+        self.sorted_mol = sorted_mol
+
+        # sort fake mol
+        fake_mol = int3c2e.make_fake_mol()
+        _, _, fake_uniq_l_ctr, fake_l_ctr_counts = int3c2e.sort_mol(fake_mol)
+
+        # sort auxiliary mol
+        sorted_auxmol, sorted_aux_idx, aux_uniq_l_ctr, aux_l_ctr_counts = int3c2e.sort_mol(
+            self.auxmol)
+        if group_size_aux is not None:
+            aux_uniq_l_ctr, aux_l_ctr_counts = int3c2e._split_l_ctr_groups(
+                aux_uniq_l_ctr, aux_l_ctr_counts, group_size_aux)
+        self.sorted_auxmol = sorted_auxmol
+        tmp_mol = gto.mole.conc_mol(fake_mol, sorted_auxmol)
+        tot_mol = gto.mole.conc_mol(sorted_mol, tmp_mol)
+
+        # Initialize vhfopt after reordering mol._bas
+        _vhf.VHFOpt.__init__(self, sorted_mol, self._intor, self._prescreen,
+                             self._qcondname, self._dmcondname)
+        self.direct_scf_tol = cutoff
+
+        # TODO: is it more accurate to filter with overlap_cond (or exp_cond)?
+        q_cond = self.get_q_cond()
+        l_ctr_offsets = np.append(0, np.cumsum(l_ctr_counts))
+        log_qs, pair2bra, pair2ket = int3c2e.get_pairing(
+            l_ctr_offsets, l_ctr_offsets, q_cond,
+            diag_block_with_triu=diag_block_with_triu, aosym=aosym)
+        self.log_qs = log_qs.copy()
+
+        # contraction coefficient for ao basis
+        cart_ao_loc = self.sorted_mol.ao_loc_nr(cart=True)
+        sph_ao_loc = self.sorted_mol.ao_loc_nr(cart=False)
+        self.cart_ao_loc = [cart_ao_loc[cp] for cp in l_ctr_offsets]
+        self.sph_ao_loc = [sph_ao_loc[cp] for cp in l_ctr_offsets]
+        self.angular = [l[0] for l in uniq_l_ctr]
+
+        cart_ao_loc = self.mol.ao_loc_nr(cart=True)
+        sph_ao_loc = self.mol.ao_loc_nr(cart=False)
+        nao = sph_ao_loc[-1]
+        ao_idx = np.array_split(np.arange(nao), sph_ao_loc[1:-1])
+        self.sph_ao_idx = np.hstack([ao_idx[i] for i in sorted_idx])
+
+        # cartesian ao index
+        nao = cart_ao_loc[-1]
+        ao_idx = np.array_split(np.arange(nao), cart_ao_loc[1:-1])
+        self.cart_ao_idx = np.hstack([ao_idx[i] for i in sorted_idx])
+        ncart = cart_ao_loc[-1]
+        nsph = sph_ao_loc[-1]
+        self.cart2sph = block_c2s_diag(ncart, nsph, self.angular, l_ctr_counts)
+        inv_idx = np.argsort(self.sph_ao_idx, kind='stable').astype(np.int32)
+        self.coeff = self.cart2sph[:, inv_idx]
+
+        # pairing auxiliary basis with fake basis set
+        fake_l_ctr_offsets = np.append(0, np.cumsum(fake_l_ctr_counts))
+        fake_l_ctr_offsets += l_ctr_offsets[-1]
+
+        aux_l_ctr_offsets = np.append(0, np.cumsum(aux_l_ctr_counts))
+
+        # contraction coefficient for auxiliary basis
+        cart_aux_loc = self.sorted_auxmol.ao_loc_nr(cart=True)
+        sph_aux_loc = self.sorted_auxmol.ao_loc_nr(cart=False)
+        self.cart_aux_loc = [cart_aux_loc[cp] for cp in aux_l_ctr_offsets]
+        self.sph_aux_loc = [sph_aux_loc[cp] for cp in aux_l_ctr_offsets]
+        self.aux_angular = [l[0] for l in aux_uniq_l_ctr]
+
+        cart_aux_loc = self.auxmol.ao_loc_nr(cart=True)
+        sph_aux_loc = self.auxmol.ao_loc_nr(cart=False)
+        ncart = cart_aux_loc[-1]
+        nsph = sph_aux_loc[-1]
+        # inv_idx = np.argsort(self.sph_aux_idx, kind='stable').astype(np.int32)
+        aux_l_ctr_offsets += fake_l_ctr_offsets[-1]
+
+        # hardcoded for grids
+        aux_pair2bra = [np.arange(aux_l_ctr_offsets[0], aux_l_ctr_offsets[-1])]
+        aux_pair2ket = [np.ones(ncart) * fake_l_ctr_offsets[0]]
+        aux_log_qs = [np.ones(ncart)]
+
+        self.aux_log_qs = aux_log_qs.copy()
+        pair2bra += aux_pair2bra
+        pair2ket += aux_pair2ket
+
+        uniq_l_ctr = np.concatenate(
+            [uniq_l_ctr, fake_uniq_l_ctr, aux_uniq_l_ctr])
+        l_ctr_offsets = np.concatenate([
+            l_ctr_offsets,
+            fake_l_ctr_offsets[1:],
+            aux_l_ctr_offsets[1:]])
+
+        bas_pair2shls = np.hstack(
+            pair2bra + pair2ket).astype(np.int32).reshape(2, -1)
+        bas_pairs_locs = np.append(0, np.cumsum(
+            [x.size for x in pair2bra])).astype(np.int32)
+        log_qs = log_qs + aux_log_qs
+        ao_loc = tot_mol.ao_loc_nr(cart=True)
+        ncptype = len(log_qs)
+
+        self.bpcache = ctypes.POINTER(BasisProdCache)()
+        if diag_block_with_triu:
+            scale_shellpair_diag = 1.
+        else:
+            scale_shellpair_diag = 0.5
+        libgint.GINTinit_basis_prod(
+            ctypes.byref(self.bpcache), ctypes.c_double(scale_shellpair_diag),
+            ao_loc.ctypes.data_as(ctypes.c_void_p),
+            bas_pair2shls.ctypes.data_as(ctypes.c_void_p),
+            bas_pairs_locs.ctypes.data_as(
+                ctypes.c_void_p), ctypes.c_int(ncptype),
+            tot_mol._atm.ctypes.data_as(
+                ctypes.c_void_p), ctypes.c_int(tot_mol.natm),
+            tot_mol._bas.ctypes.data_as(
+                ctypes.c_void_p), ctypes.c_int(tot_mol.nbas),
+            tot_mol._env.ctypes.data_as(ctypes.c_void_p))
+        self.bas_pairs_locs = bas_pairs_locs
+        ncptype = len(self.log_qs)
+        if aosym:
+            self.cp_idx, self.cp_jdx = np.tril_indices(ncptype)
+        else:
+            nl = int(round(np.sqrt(ncptype)))
+            self.cp_idx, self.cp_jdx = np.unravel_index(
+                np.arange(ncptype), (nl, nl))
+
+
+def eval_chelpg_layer_gpu(mf, deltaR=0.3, Rhead=2.8, ifqchem=True):
+    """Cal chelpg charge
+
+    Args:
+        mf: mean field object in pyscf
+        deltaR (float, optional): the grid interval in the cube, in Angstrom. Defaults to 0.3.
+        Rhead (float, optional): the head length, in Angstrom. Defaults to 2.8.
+        ifqchem (bool, optional): whether to use the modification in Q-Chem. Defaults to True.
+
+    Returns:
+        numpy.array: charges
+    """
+    t0 = time.process_time()
+    t0w = time.time()
+    BOHR = 0.52917721092  # Angstroms
+    atomcoords = mf.mol.atom_coords(unit='B')
+    dm = cupy.array(mf.make_rdm1())
+    RVDW_bondi = {1: 1.1/BOHR, 2: 1.40/BOHR,
+                  3: 1.82/BOHR, 6: 1.70/BOHR, 7: 1.55/BOHR, 8: 1.52/BOHR, 9: 1.47/BOHR, 10: 1.54/BOHR,
+                  11: 2.27/BOHR, 12: 1.73/BOHR, 14: 2.10/BOHR, 15: 1.80/BOHR, 16: 1.80/BOHR, 17: 1.75/BOHR, 18: 1.88/BOHR,
+                  19: 2.75/BOHR, 35: 1.85/BOHR}
+
+    Roff = Rhead/BOHR
+    Deltar = 0.1
+
+    # smoothing function
+    def tau_f(R, Rcut, Roff):
+        return (R - Rcut)**2 * (3*Roff - Rcut - 2*R) / (Roff - Rcut)**3
+
+    Rshort = np.array([RVDW_bondi[iatom] for iatom in mf.mol._atm[:, 0]])
+    idxxmin = np.argmin(atomcoords[:, 0] - Rshort)
+    idxxmax = np.argmax(atomcoords[:, 0] + Rshort)
+    idxymin = np.argmin(atomcoords[:, 1] - Rshort)
+    idxymax = np.argmax(atomcoords[:, 1] + Rshort)
+    idxzmin = np.argmin(atomcoords[:, 2] - Rshort)
+    idxzmax = np.argmax(atomcoords[:, 2] + Rshort)
+    atomtypes = np.array(mf.mol._atm[:, 0])
+    # Generate the grids in the cube
+    xmin = atomcoords[:, 0].min() - Rhead/BOHR - RVDW_bondi[atomtypes[idxxmin]]
+    xmax = atomcoords[:, 0].max() + Rhead/BOHR + RVDW_bondi[atomtypes[idxxmax]]
+    ymin = atomcoords[:, 1].min() - Rhead/BOHR - RVDW_bondi[atomtypes[idxymin]]
+    ymax = atomcoords[:, 1].max() + Rhead/BOHR + RVDW_bondi[atomtypes[idxymax]]
+    zmin = atomcoords[:, 2].min() - Rhead/BOHR - RVDW_bondi[atomtypes[idxzmin]]
+    zmax = atomcoords[:, 2].max() + Rhead/BOHR + RVDW_bondi[atomtypes[idxzmax]]
+    x = np.arange(xmin, xmax, deltaR/BOHR)
+    y = np.arange(ymin, ymax, deltaR/BOHR)
+    z = np.arange(zmin, zmax, deltaR/BOHR)
+    gridcoords = np.meshgrid(x, y, z)
+    gridcoords = np.vstack(list(map(np.ravel, gridcoords))).T
+
+    # [natom, ngrids] distance between an atom and a grid
+    r_pX = scipy.spatial.distance.cdist(atomcoords, gridcoords)
+    # delete the grid points inside the vdW surface and outside the Rhead surface.
+    # the minimum distance to any atom
+    Rkmin = (r_pX - np.expand_dims(Rshort, axis=1)).min(axis=0)
+    Ron = Rshort + Deltar
+    Rlong = Roff - Deltar
+    AJk = np.ones(r_pX.shape)  # the short-range weight
+    idx = r_pX < np.expand_dims(Rshort, axis=1)
+    AJk[idx] = 0
+    if ifqchem:
+        idx2 = (r_pX < np.expand_dims(Ron, axis=1)) * \
+            (r_pX >= np.expand_dims(Rshort, axis=1))
+        AJk[idx2] = tau_f(r_pX, np.expand_dims(Rshort, axis=1),
+                          np.expand_dims(Ron, axis=1))[idx2]
+        wLR = 1 - tau_f(Rkmin, Rlong, Roff)  # the long-range weight
+        idx1 = Rkmin < Rlong
+        idx2 = Rkmin > Roff
+        wLR[idx1] = 1
+        wLR[idx2] = 0
+    else:
+        wLR = np.ones(r_pX.shape[-1])  # the long-range weight
+        idx = Rkmin > Roff
+        wLR[idx] = 0
+    w = wLR*np.prod(AJk, axis=0)  # weight for a specific point
+    idx = w <= 1.0E-14
+    w = np.delete(w, idx)
+    r_pX = np.delete(r_pX, idx, axis=1)
+    gridcoords = np.delete(gridcoords, idx, axis=0)
+
+    ngrids = gridcoords.shape[0]
+    r_pX = cupy.array(r_pX)
+    r_pX_potential = 1/r_pX
+    potential_real = cupy.dot(cupy.array(
+        mf.mol.atom_charges()), r_pX_potential)
+    nbatch = 256*256
+
+    # assert nbatch < ngrids
+    fmol = pyscf.gto.fakemol_for_charges(gridcoords[:nbatch])
+    intopt = VHFOpt(mf.mol, fmol, 'int2e')
+    for ibatch in range(0, ngrids, nbatch):
+        max_grid = min(ibatch+nbatch, ngrids)
+        num_grids = max_grid - ibatch
+        ptr = intopt.auxmol._atm[:num_grids, gto.PTR_COORD]
+        intopt.auxmol._env[np.vstack(
+            (ptr, ptr+1, ptr+2)).T] = gridcoords[ibatch:max_grid]
+        intopt.build(1e-14, diag_block_with_triu=False, aosym=True)
+        potential_real[ibatch:max_grid] -= 2.0 * \
+            get_j_int3c2e_pass1(intopt, dm)[:num_grids]
+
+    w = cupy.array(w)
+    r_pX_potential_omega = r_pX_potential*w
+    GXA = r_pX_potential_omega@r_pX_potential.T
+    eX = r_pX_potential_omega@potential_real
+    GXA_inv = cupy.linalg.inv(GXA)
+    g = GXA_inv@eX
+    alpha = (g.sum() - mf.mol.charge)/(GXA_inv.sum())
+    q = g - alpha*GXA_inv@cupy.ones((mf.mol.natm))
+    t6 = time.process_time()
+    t6w = time.time()
+    print("Total cpu time: ", t6 - t0)
+    print("Total wall time: ", t6w - t0w)
+    return q
+
diff --git a/gpu4pyscf/qmmm/test/test_chelpg.py b/gpu4pyscf/qmmm/test/test_chelpg.py
new file mode 100644
index 00000000..836ce15e
--- /dev/null
+++ b/gpu4pyscf/qmmm/test/test_chelpg.py
@@ -0,0 +1,85 @@
+# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import unittest
+import numpy as np
+import pyscf
+from pyscf import lib
+from gpu4pyscf.dft import rks
+from gpu4pyscf.qmmm import chelpg
+
+lib.num_threads(8)
+
+atom = '''
+O       0.0000000000    -0.0000000000     0.1174000000
+H      -0.7570000000    -0.0000000000    -0.4696000000
+H       0.7570000000     0.0000000000    -0.4696000000
+'''
+
+bas='def2tzvpp'
+grids_level = 5
+
+def setUpModule():
+    global mol
+    mol = pyscf.M(atom=atom, basis=bas, max_memory=32000)
+    mol.output = '/dev/null'
+    mol.build()
+    mol.verbose = 1
+
+def tearDownModule():
+    global mol
+    mol.stdout.close()
+    del mol
+
+def run_dft_chelpg(xc, deltaR):
+    mf = rks.RKS(mol, xc=xc)
+    mf.grids.level = grids_level
+    e_dft = mf.kernel()
+    q = chelpg.eval_chelpg_layer_gpu(mf, deltaR=deltaR)
+    return e_dft, q
+    
+
+class KnownValues(unittest.TestCase):
+    '''
+    known values are obtained by Q-Chem
+    $rem
+    JOBTYP  SP
+    METHOD  b3lyp
+    BASIS   def2-tzvpp
+    XC_GRID 000099000590
+    CHELPG_DX 2
+    CHELPG        TRUE
+    SCF_CONVERGENCE 10
+    $end
+    
+        Ground-State ChElPG Net Atomic Charges
+
+     Atom                 Charge (a.u.)
+  ----------------------------------------
+      1 O                    -0.712558
+      2 H                     0.356292
+      3 H                     0.356266
+  ----------------------------------------
+    '''
+    def test_rks_b3lyp(self):
+        print('-------- B3LYP -------------')
+        e_tot, q = run_dft_chelpg('B3LYP', 0.1)
+        assert np.allclose(e_tot, -76.4666495181)
+        assert np.allclose(q, np.array([-0.712558, 0.356292, 0.356266]))
+        
+
+if __name__ == "__main__":
+    print("Full Tests for SCF")
+    unittest.main()
\ No newline at end of file

From 79041b420b6e77b86ea82edc97789a849ad3af0b Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <wxj6000@gmail.com>
Date: Fri, 27 Oct 2023 17:53:53 -0700
Subject: [PATCH 15/19] Optimize hessian 2 (#53)

* numpy -> cupy for solvent

* for linter

* remove grad switch from pcm.py

* passed flake8

* solvent integrals on GPU

* flake8

* compatible with pyscf-2.4.0

* added solvent

* fixed issues for to_cpu

* store intermediate variable on CPU

* cupy.einsum -> contract

* optimized dft integration for gradient and hessian

* remove lprof

* fixed a bug in nlc

* precompute fxc_x
---
 gpu4pyscf/df/df_jk.py              |   4 +-
 gpu4pyscf/df/grad/rhf.py           |   2 +-
 gpu4pyscf/df/grad/rks.py           |   2 +-
 gpu4pyscf/df/hessian/rhf.py        |   6 +-
 gpu4pyscf/df/hessian/rks.py        |   1 +
 gpu4pyscf/df/int3c2e.py            | 119 +++++++++++++--------
 gpu4pyscf/df/tests/test_df_grad.py |  28 ++---
 gpu4pyscf/df/tests/test_df_scf.py  |  39 +++++--
 gpu4pyscf/dft/numint.py            | 154 +++++++++++++++------------
 gpu4pyscf/dft/rks.py               |   8 +-
 gpu4pyscf/grad/rks.py              | 161 ++++++++++++++---------------
 gpu4pyscf/hessian/rhf.py           |  26 ++---
 gpu4pyscf/hessian/rks.py           | 124 ++++++++++++----------
 setup.py                           |   2 +-
 14 files changed, 377 insertions(+), 299 deletions(-)

diff --git a/gpu4pyscf/df/df_jk.py b/gpu4pyscf/df/df_jk.py
index 7f6c3933..e27f16f4 100644
--- a/gpu4pyscf/df/df_jk.py
+++ b/gpu4pyscf/df/df_jk.py
@@ -310,8 +310,8 @@ def get_j(cderi_sparse):
                 rhok = contract('Lij,jk->Lki', cderi, occ_coeff)
                 for i in range(mo1.shape[0]):
                     rhok1 = contract('Lij,jk->Lki', cderi, mo1[i])
-                    vk[i] += contract('Lki,Lkj->ij', rhok, rhok1)
-                    #contract('Lki,Lkj->ij', rhok, rhok1, alpha=1.0, beta=1.0, out=vk[i])
+                    #vk[i] += contract('Lki,Lkj->ij', rhok, rhok1)
+                    contract('Lki,Lkj->ij', rhok, rhok1, alpha=1.0, beta=1.0, out=vk[i])
         occ_coeff = rhok1 = rhok = mo1 = None
         if with_k:
             vk = vk + vk.transpose(0,2,1)
diff --git a/gpu4pyscf/df/grad/rhf.py b/gpu4pyscf/df/grad/rhf.py
index 292d9309..84f5ed23 100644
--- a/gpu4pyscf/df/grad/rhf.py
+++ b/gpu4pyscf/df/grad/rhf.py
@@ -34,7 +34,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
     if mol is None: mol = mf_grad.mol
     #TODO: dm has to be the SCF density matrix in this version.  dm should be
     # extended to any 1-particle density matrix
-    
+
     if(dm0 is None): dm0 = mf_grad.base.make_rdm1()
     mf = mf_grad.base
     if omega is None:
diff --git a/gpu4pyscf/df/grad/rks.py b/gpu4pyscf/df/grad/rks.py
index 99a8ccf4..2ef88e86 100644
--- a/gpu4pyscf/df/grad/rks.py
+++ b/gpu4pyscf/df/grad/rks.py
@@ -129,7 +129,7 @@ def get_j(self, mol=None, dm=None, hermi=0, omega=None):
     def get_k(self, mol=None, dm=None, hermi=0, omega=None):
         _, vk, _, vkaux = self.get_jk(mol, dm, with_j=False, omega=omega)
         return vk, vkaux
-        
+
     def extra_force(self, atom_id, envs):
         if self.auxbasis_response:
             return envs['dvhf'].aux[atom_id]
diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py
index dfcccddb..46cb38ec 100644
--- a/gpu4pyscf/df/hessian/rhf.py
+++ b/gpu4pyscf/df/hessian/rhf.py
@@ -42,6 +42,8 @@
 from gpu4pyscf.df import int3c2e
 from gpu4pyscf.lib import logger
 
+BLKSIZE = 128
+
 def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
                       atmlst=None, max_memory=4000, verbose=None):
     e1, ej, ek = _partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ,
@@ -94,7 +96,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
 
     # ================================ sorted AO begin ===============================================
     intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e')
-    intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size=64, group_size_aux=32)
+    intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE)
     sph_ao_idx = intopt.sph_ao_idx
     sph_aux_idx = intopt.sph_aux_idx
 
@@ -416,7 +418,7 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
     int2c = cupy.asarray(int2c)
     # ======================= sorted AO begin ======================================
     intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e')
-    intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size_aux=64, group_size=64)
+    intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size_aux=BLKSIZE, group_size=BLKSIZE)
     sph_ao_idx = intopt.sph_ao_idx
     sph_aux_idx = intopt.sph_aux_idx
     rev_ao_idx = np.argsort(intopt.sph_ao_idx)
diff --git a/gpu4pyscf/df/hessian/rks.py b/gpu4pyscf/df/hessian/rks.py
index a90b93e7..d8986f1f 100644
--- a/gpu4pyscf/df/hessian/rks.py
+++ b/gpu4pyscf/df/hessian/rks.py
@@ -65,6 +65,7 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
         de2 -= (alpha - hyb) * ek_lr
 
     max_memory = None
+    t1 = log.timer_debug1('computing ej, ek', *t1)
     veff_diag = rks_hess._get_vxc_diag(hessobj, mo_coeff, mo_occ, max_memory)
     t1 = log.timer_debug1('computing veff_diag', *t1)
     aoslices = mol.aoslice_by_atom()
diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py
index 36268e08..fc755a27 100644
--- a/gpu4pyscf/df/int3c2e.py
+++ b/gpu4pyscf/df/int3c2e.py
@@ -517,6 +517,7 @@ def loop_int3c2e_general(intopt, ip_type='', omega=None, stream=None):
 
 def loop_aux_jk(intopt, ip_type='', omega=None, stream=None):
     '''
+    **** deprecated **********
     loop over all int3c2e blocks
     - outer loop for k
     - inner loop for ij pair
@@ -738,26 +739,37 @@ def get_int3c2e_ip1_vjk(intopt, rhoj, rhok, dm0_tag, aoslices, with_k=True, omeg
     vj1 = cupy.zeros([natom,3,nao_sph,nocc])
     vk1 = cupy.zeros([natom,3,nao_sph,nocc])
 
-    for aux_id, int3c_blk in loop_aux_jk(intopt, ip_type='ip1', omega=omega):
-        k0, k1 = intopt.sph_aux_loc[aux_id], intopt.sph_aux_loc[aux_id+1]
-        vj1_buf += contract('xpji,p->xij', int3c_blk, rhoj[k0:k1])
-
-        rhok_tmp = cupy.asarray(rhok[k0:k1])
-        if with_k:
-            rhok0_slice = contract('pio,Jo->piJ', rhok_tmp, orbo) * 2
-            vk1_buf += contract('xpji,plj->xil', int3c_blk, rhok0_slice)
-
-        rhoj0 = contract('xpji,ij->xpi', int3c_blk, dm0_tag)
-        vj1_ao = contract('pjo,xpi->xijo', rhok_tmp, rhoj0)
-        vj1 += 2.0*contract('xiko,ia->axko', vj1_ao, ao2atom)
+    ncp_ij = len(intopt.log_qs)
+    count = 0
+    for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ip1', omega=omega):
+        vj1_buf[:,i0:i1,j0:j1] += contract('xpji,p->xij', int3c_blk, rhoj[k0:k1])
+        # initialize intermediate variables
+        if count % ncp_ij == 0:
+            rhoj0 = cupy.zeros([3,k1-k0,nao_sph])
+            rhok_tmp = cupy.asarray(rhok[k0:k1])
+            vj1_ao = cupy.zeros([3,nao_sph,nao_sph,nocc])
+            if with_k:
+                rhok0_slice = contract('pio,Jo->piJ', rhok_tmp, orbo) * 2
+                rhok0 = contract('pli,lo->poi', rhok0_slice, orbo)
+                int3c_ip1_occ = cupy.zeros([3,k1-k0,nao_sph,nocc])
+                vk1_ao = cupy.zeros([3,nao_sph,nao_sph,nocc])
+
+        # contraction
+        rhoj0[:,:,i0:i1] += contract('xpji,ij->xpi', int3c_blk, dm0_tag[i0:i1,j0:j1])
         if with_k:
-            int3c_ip1_occ = contract('xpji,jo->xpio', int3c_blk, orbo)
-            vk1_ao = contract('xpio,pki->xiko', int3c_ip1_occ, rhok0_slice)
-            vk1 += contract('xiko,ia->axko', vk1_ao, ao2atom)
+            int3c_ip1_occ[:,:,i0:i1] += contract('xpji,jo->xpio', int3c_blk, orbo[j0:j1])
+            vk1_ao[:,i0:i1,j0:j1] += contract('xpji,poi->xijo', int3c_blk, rhok0[:,:,i0:i1])
+            vk1_buf[:,i0:i1] += contract('xpji,plj->xil', int3c_blk, rhok0_slice[:,:,j0:j1])
+
+        # reduction
+        if (count+1) % ncp_ij == 0:
+            vj1_ao += contract('pjo,xpi->xijo', rhok_tmp, rhoj0)
+            vj1 += 2.0*contract('xiko,ia->axko', vj1_ao, ao2atom)
+            if with_k:
+                vk1_ao += contract('xpio,pki->xiko', int3c_ip1_occ, rhok0_slice)
+                vk1 += contract('xiko,ia->axko', vk1_ao, ao2atom)
+        count += 1
 
-            rhok0 = contract('pli,lo->poi', rhok0_slice, orbo)
-            vk1_ao = contract('xpji,poi->xijo', int3c_blk, rhok0)
-            vk1 += contract('xiko,ia->axko', vk1_ao, ao2atom)
     return vj1_buf, vk1_buf, vj1, vk1
 
 def get_int3c2e_ip2_vjk(intopt, rhoj, rhok, dm0_tag, auxslices, with_k=True, omega=None):
@@ -771,25 +783,36 @@ def get_int3c2e_ip2_vjk(intopt, rhoj, rhok, dm0_tag, auxslices, with_k=True, ome
     nocc = orbo.shape[1]
     vj1 = cupy.zeros([natom,3,nao_sph,nocc])
     vk1 = cupy.zeros([natom,3,nao_sph,nocc])
-    for aux_id, int3c_blk in loop_aux_jk(intopt, ip_type='ip2', omega=omega):
-        k0, k1 = intopt.sph_aux_loc[aux_id], intopt.sph_aux_loc[aux_id+1]
-        wj2 = contract('xpji,ji->xp', int3c_blk, dm0_tag)
-        wk2_P__ = contract('xpji,jo->xpio', int3c_blk, orbo)
-
-        rhok_tmp = cupy.asarray(rhok[k0:k1])
-        vj1_tmp = -contract('pio,xp->xpio', rhok_tmp, wj2)
-        vj1_tmp -= contract('xpio,p->xpio', wk2_P__, rhoj[k0:k1])
-
-        vj1 += contract('xpio,pa->axio', vj1_tmp, aux2atom[k0:k1])
-        if with_k:
-            rhok0_slice = contract('pio,jo->pij', rhok_tmp, orbo)
-            vk1_tmp = -contract('xpjo,pij->xpio', wk2_P__, rhok0_slice) * 2
 
-            rhok0_oo = contract('pio,ir->pro', rhok_tmp, orbo)
-            vk1_tmp -= contract('xpio,pro->xpir', wk2_P__, rhok0_oo) * 2
-
-            vk1 += contract('xpir,pa->axir', vk1_tmp, aux2atom[k0:k1])
-        wj2 = wk2_P__ = rhok0_slice = rhok0_oo = None
+    ncp_ij = len(intopt.log_qs)
+    count = 0
+    for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ip2', omega=omega):
+        # initialize intermediate variables
+        if count % ncp_ij == 0:
+            wj2 = cupy.zeros([3,k1-k0])
+            wk2_P__ = cupy.zeros([3,k1-k0,nao_sph,nocc])
+
+        # contraction
+        wj2 += contract('xpji,ji->xp', int3c_blk, dm0_tag[j0:j1,i0:i1])
+        wk2_P__[:,:,i0:i1] += contract('xpji,jo->xpio', int3c_blk, orbo[j0:j1])
+
+        # reduction
+        if (count+1) % ncp_ij == 0:
+            rhok_tmp = cupy.asarray(rhok[k0:k1])
+            vj1_tmp = -contract('pio,xp->xpio', rhok_tmp, wj2)
+            vj1_tmp -= contract('xpio,p->xpio', wk2_P__, rhoj[k0:k1])
+
+            vj1 += contract('xpio,pa->axio', vj1_tmp, aux2atom[k0:k1])
+            if with_k:
+                rhok0_slice = contract('pio,jo->pij', rhok_tmp, orbo)
+                vk1_tmp = -contract('xpjo,pij->xpio', wk2_P__, rhok0_slice) * 2
+
+                rhok0_oo = contract('pio,ir->pro', rhok_tmp, orbo)
+                vk1_tmp -= contract('xpio,pro->xpir', wk2_P__, rhok0_oo) * 2
+
+                vk1 += contract('xpir,pa->axir', vk1_tmp, aux2atom[k0:k1])
+            wj2 = wk2_P__ = rhok0_slice = rhok0_oo = None
+        count += 1
     return vj1, vk1
 
 def get_int3c2e_ip1_wjk(intopt, dm0_tag, with_k=True, omega=None):
@@ -800,7 +823,8 @@ def get_int3c2e_ip1_wjk(intopt, dm0_tag, with_k=True, omega=None):
     naux_sph = len(intopt.sph_aux_idx)
     orbo = cupy.asarray(dm0_tag.occ_coeff, order='C')
     nocc = orbo.shape[1]
-    wj = cupy.empty([nao_sph,naux_sph,3])
+
+    wj = cupy.zeros([nao_sph,naux_sph,3])
     avail_mem = get_avail_mem()
     use_gpu_memory = True
     if nao_sph*naux_sph*nocc*3*8 < 0.4*avail_mem:
@@ -816,14 +840,19 @@ def get_int3c2e_ip1_wjk(intopt, dm0_tag, with_k=True, omega=None):
         wk = np.ndarray([nao_sph,naux_sph,nocc,3], dtype=np.float64, order='C', buffer=mem)
 
     # TODO: async data transfer
-    for aux_id, int3c_blk in loop_aux_jk(intopt, ip_type='ip1', omega=omega):
-        k0, k1 = intopt.sph_aux_loc[aux_id], intopt.sph_aux_loc[aux_id+1]
-        wj[:,k0:k1] = contract('xpji,ij->ipx', int3c_blk, dm0_tag)
-        wk_tmp = contract('xpji,jo->ipox', int3c_blk, orbo)
-        if use_gpu_memory:
-            wk[:,k0:k1] = wk_tmp
-        else:
-            wk[:,k0:k1] = wk_tmp.get()
+    ncp_ij = len(intopt.log_qs)
+    count = 0
+    for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ip1', omega=omega):
+        if count % ncp_ij == 0:
+            wk_tmp = cupy.zeros([nao_sph, k1-k0, nocc, 3])
+        wj[i0:i1,k0:k1] += contract('xpji,ij->ipx', int3c_blk, dm0_tag[i0:i1,j0:j1])
+        wk_tmp[i0:i1,:] += contract('xpji,jo->ipox', int3c_blk, orbo[j0:j1])
+        if (count+1) % ncp_ij == 0:
+            if use_gpu_memory:
+                wk[:,k0:k1] = wk_tmp
+            else:
+                wk[:,k0:k1] = wk_tmp.get()
+        count += 1
     return wj, wk
 
 def get_int3c2e_ip2_wjk(intopt, dm0_tag, with_k=True, omega=None):
diff --git a/gpu4pyscf/df/tests/test_df_grad.py b/gpu4pyscf/df/tests/test_df_grad.py
index ade19e87..9156c2c0 100644
--- a/gpu4pyscf/df/tests/test_df_grad.py
+++ b/gpu4pyscf/df/tests/test_df_grad.py
@@ -40,25 +40,29 @@
 auxbasis0='def2-tzvpp-jkfit'
 disp0='d3bj'
 grids_level = 6
+nlcgrids_level = 3
 def setUpModule():
     global mol
     mol = pyscf.M(atom=atom, basis=bas0, max_memory=32000)
     mol.output = '/dev/null'
     mol.build()
     mol.verbose = 1
-    
+
 eps = 1.0/1024
 
 def tearDownModule():
     global mol
     mol.stdout.close()
     del mol
-    
+
 def _check_grad(grid_response=False, xc=xc0, disp=disp0, tol=1e-6):
     mf = rks.RKS(mol, xc=xc, disp=disp).density_fit(auxbasis=auxbasis0)
     mf.grids.level = grids_level
-    mf.conv_tol = 1e-12
+    mf.nlcgrids.level = nlcgrids_level
+    mf.conv_tol = 1e-10
+    mf.verbose = 1
     e_tot = mf.kernel()
+
     g = mf.nuc_grad_method()
     g.auxbasis_response = True
     g.grid_response = grid_response
@@ -94,39 +98,39 @@ def _check_grad(grid_response=False, xc=xc0, disp=disp0, tol=1e-6):
     assert(cupy.linalg.norm(g_analy - grad_fd) < tol)
 
 class KnownValues(unittest.TestCase):
-    
+
     def test_grad_with_grids_response(self):
         print("-----testing DF DFT gradient with grids response----")
         _check_grad(grid_response=True)
-    
+
     def test_grad_without_grids_response(self):
         print('-----testing DF DFT gradient without grids response----')
         _check_grad(grid_response=False)
-    
+
     def test_grad_lda(self):
         print("-----LDA testing-------")
         _check_grad(xc='LDA', disp=None, tol=1e-6)
-    
+
     def test_grad_gga(self):
         print('-----GGA testing-------')
         _check_grad(xc='PBE', disp=None, tol=1e-6)
-    
+
     def test_grad_hybrid(self):
         print('------hybrid GGA testing--------')
         _check_grad(xc='B3LYP', disp=None, tol=1e-6)
-    
+
     def test_grad_mgga(self):
         print('-------mGGA testing-------------')
         _check_grad(xc='m06', disp=None, tol=1e-4)
-    
+
     def test_grad_rsh(self):
         print('--------RSH testing-------------')
         _check_grad(xc='wb97', disp=None, tol=1e-4)
-    
+
     def test_grad_nlc(self):
         print('--------nlc testing-------------')
         _check_grad(xc='HYB_MGGA_XC_WB97M_V', disp=None, tol=1e-6)
-    
+
 if __name__ == "__main__":
     print("Full Tests for DF Gradient")
     unittest.main()
diff --git a/gpu4pyscf/df/tests/test_df_scf.py b/gpu4pyscf/df/tests/test_df_scf.py
index 8c686b63..eef66e16 100644
--- a/gpu4pyscf/df/tests/test_df_scf.py
+++ b/gpu4pyscf/df/tests/test_df_scf.py
@@ -47,7 +47,9 @@ def tearDownModule():
 
 def run_dft(xc):
     mf = rks.RKS(mol, xc=xc).density_fit(auxbasis='def2-tzvpp-jkfit')
-    mf.grids.level = grids_level
+    mf.grids.atom_grid = (99,590)
+    mf.nlcgrids.atom_grid = (50,194)
+    mf.conv_tol = 1e-10
     e_dft = mf.kernel()
     return e_dft
 
@@ -56,39 +58,54 @@ class KnownValues(unittest.TestCase):
     known values are obtained by Q-Chem
     '''
     def test_rhf(self):
+        print('------- HF -----------------')
         mf = scf.RHF(mol).density_fit(auxbasis='def2-tzvpp-jkfit')
         e_tot = mf.kernel()
-        assert np.allclose(e_tot, -76.0624582299)
+        e_qchem = -76.0624582299
+        print(f'diff from qchem {e_tot - e_qchem}')
+        assert np.allclose(e_tot, e_qchem)
 
     def test_rks_lda(self):
         print('------- LDA ----------------')
-        e_tot = run_dft("LDA_X,LDA_C_VWN")
-        assert np.allclose(e_tot, -75.9046407209)
+        e_tot = run_dft("LDA,VWN5")
+        e_qchem = -75.9046407209
+        print(f'diff from qchem {e_tot - e_qchem}')
+        assert np.allclose(e_tot, e_qchem)
 
     def test_rks_pbe(self):
         print('------- PBE ----------------')
         e_tot = run_dft('PBE')
-        assert np.allclose(e_tot, -76.3800181250)
+        e_qchem = -76.3800181250
+        print(f'diff from qchem {e_tot - e_qchem}')
+        assert np.allclose(e_tot, e_qchem)
 
     def test_rks_b3lyp(self):
         print('-------- B3LYP -------------')
         e_tot = run_dft('B3LYP')
-        assert np.allclose(e_tot, -76.4666493796)
+        e_qchem = -76.4666493796
+        print(f'diff from qchem {e_tot - e_qchem}')
+        assert np.allclose(e_tot, e_qchem)
 
     def test_rks_m06(self):
         print('--------- M06 --------------')
         e_tot = run_dft("M06")
-        assert np.allclose(e_tot, -76.4265841359)
+        e_qchem = -76.4265841359
+        print(f'diff from qchem {e_tot - e_qchem}')
+        assert np.allclose(e_tot, e_qchem)
 
     def test_rks_wb97(self):
         print('-------- wB97 --------------')
         e_tot = run_dft("HYB_GGA_XC_WB97")
-        assert np.allclose(e_tot, -76.4486277053)
+        e_qchem = -76.4486277053
+        print(f'diff from qchem {e_tot - e_qchem}')
+        assert np.allclose(e_tot, e_qchem)
 
-    def test_rks_wb97(self):
-        print('-------- wB97 --------------')
+    def test_rks_wb97m_v(self):
+        print('-------- wB97m-v --------------')
         e_tot = run_dft("HYB_MGGA_XC_WB97M_V")
-        assert np.allclose(e_tot, -76.4334567297)
+        e_qchem = -76.4334567297
+        print(f'diff from qchem {e_tot - e_qchem}')
+        assert np.allclose(e_tot, e_qchem)
 
     def test_to_cpu(self):
         mf = scf.RHF(mol).density_fit().to_cpu()
diff --git a/gpu4pyscf/dft/numint.py b/gpu4pyscf/dft/numint.py
index 2f4b3811..cc9c0e6b 100644
--- a/gpu4pyscf/dft/numint.py
+++ b/gpu4pyscf/dft/numint.py
@@ -160,7 +160,7 @@ def eval_rho2(mol, ao, mo_coeff, mo_occ, non0tab=None, xctype='LDA',
 
     shls_slice = (0, mol.nbas)
     ao_loc = mol.ao_loc_nr()
-    
+
     #cpos = cupy.einsum('ij,j->ij', mo_coeff[:,mo_occ>0], cupy.sqrt(mo_occ[mo_occ>0]))
     cpos = mo_coeff[:,mo_occ>0] * cupy.sqrt(mo_occ[mo_occ>0])
     if xctype == 'LDA' or xctype == 'HF':
@@ -228,10 +228,10 @@ def eval_rho3(mol, ao, c0, mo1, non0tab=None, xctype='LDA',
     elif xctype in ('GGA', 'NLC'):
         rho = cupy.empty((4,ngrids))
         c_0 = contract('nig,io->nog', ao, cpos1)
-        rho[0] = _contract_rho(c0[0], c_0[0])
+        _contract_rho(c0[0], c_0[0], rho=rho[0])
         for i in range(1, 4):
-            rho[i] = _contract_rho(c_0[0], c0[i])
-            rho[i]+= _contract_rho(c0[0], c_0[i])
+            _contract_rho(c_0[0], c0[i], rho=rho[i])
+            rho[i] += _contract_rho(c0[0], c_0[i])
         rho *= 2.0
     else: # meta-GGA
         # TODO: complete this
@@ -382,14 +382,15 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
     dm_shape = dms.shape
     dms = [coeff @ dm @ coeff.T for dm in dms.reshape(-1,nao0,nao0)]
     nset = len(dms)
-    ao_loc = mol.ao_loc_nr()
-    
-    if mo_coeff is not None: 
+
+    if mo_coeff is not None:
         mo_coeff = coeff @ mo_coeff
 
     nelec = cupy.zeros(nset)
     excsum = cupy.zeros(nset)
     vmat = cupy.zeros((nset, nao, nao))
+    '''
+    ao_loc = mol.ao_loc_nr()
     if USE_SPARSITY == 1:
         nbins = NBINS * 2 - int(NBINS * np.log(ni.cutoff) / np.log(grids.cutoff))
         pair2shls, pairs_locs = _make_pairs2shls_idx(ni.pair_mask, opt.l_bas_offsets, hermi)
@@ -398,89 +399,69 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
                                                                opt.l_bas_offsets)
         else:
             pair2shls_full, pairs_locs_full = pair2shls, pairs_locs
-
+    '''
     release_gpu_stack()
     if xctype == 'LDA':
         ao_deriv = 0
     else:
         ao_deriv = 1
-    
-    block_id = 0
-    for ao, sindex, weight, _ in ni.block_loop(mol, grids, nao, ao_deriv, blksize=ni.grid_blksize):
-        if ni.grid_blksize is None:
-            ni.grid_blksize = weight.shape[0]
-        
-        # cache ao indices
-        if block_id not in ni.non0ao_idx:
-            t0 = (logger.process_clock(), logger.perf_counter())
-            if xctype == 'LDA':
-                mask = cupy.any(cupy.abs(ao) > AO_THRESHOLD, axis=[1])
-                idx = cupy.argwhere(mask).astype(np.int32)[:,0]
-                ao_mask = ao[idx,:]
-            else:
-                mask = cupy.any(cupy.abs(ao) > AO_THRESHOLD, axis=[0,2])
-                idx = cupy.argwhere(mask).astype(np.int32)[:,0]
-                ao_mask = ao[:,idx,:]
-            ni.non0ao_idx[block_id] = idx
-            t1 = log.timer_debug1('initialize ao sparsity', *t0)
-        else:
-            idx = ni.non0ao_idx[block_id]
-            if xctype == 'LDA':
-                ao_mask = ao[idx,:]
-            else:
-                ao_mask = ao[:,idx,:]
-        block_id += 1
+
+    for ao_mask, idx, weight, _ in ni.block_loop(mol, grids, nao, ao_deriv):
         for i in range(nset):
             t0 = (logger.process_clock(), logger.perf_counter())
             #rho = eval_rho(opt.mol, ao, dms[i], xctype=xctype, hermi=1)
             #rho = _make_rho(ao, dms[i], xctype=xctype)
             if mo_coeff is None:
-                rho = eval_rho(mol, ao, dms[i], xctype=xctype, hermi=1)
+                rho = eval_rho(mol, ao_mask, dms[i][np.ix_(idx,idx)], xctype=xctype, hermi=1)
             else:
                 mo_coeff_mask = mo_coeff[idx,:]
                 rho = eval_rho2(mol, ao_mask, mo_coeff_mask, mo_occ, None, xctype)
 
-            t1 = log.timer_debug1('eval rho', *t0)
             exc, vxc = ni.eval_xc_eff(xc_code, rho, deriv=1, xctype=xctype)[:2]
             vxc = cupy.asarray(vxc, order='C')
             exc = cupy.asarray(exc, order='C')
-            t1 = log.timer_debug1('eval vxc', *t1)
+            t1 = log.timer_debug1('eval vxc', *t0)
             if xctype == 'LDA':
                 den = rho * weight
                 wv = weight * vxc[0]
+                '''
                 if USE_SPARSITY == 0:
                     vmat[i] += ao.dot(_scale_ao(ao, wv).T)
                 elif USE_SPARSITY == 1:
                     _dot_ao_ao_sparse(ao, ao, wv, nbins, sindex, ao_loc,
                         pair2shls_full, pairs_locs_full, vmat[i])
-                elif USE_SPARSITY == 2:
+                '''
+                if USE_SPARSITY == 2:
                     aow = _scale_ao(ao_mask, wv)
                     # vmat[i][cupy.ix_(mask, mask)] += ao_mask.dot(aow.T)
                     add_sparse(vmat[i], ao_mask.dot(aow.T), idx)
                 else:
-                    raise NotImplementedError('Not implemented yet')
+                    raise NotImplementedError(f'USE_SPARSITY = {USE_SPARSITY} is not implemented')
             elif xctype == 'GGA':
                 den = rho[0] * weight
                 wv = vxc * weight
                 wv[0] *= .5
+                '''
                 if USE_SPARSITY == 0:
                     vmat[i] += ao[0].dot(_scale_ao(ao, wv).T)
                 elif USE_SPARSITY == 1:
                     aow = _scale_ao(ao, wv)
                     _dot_ao_ao_sparse(ao[0], aow, None, nbins, sindex, ao_loc,
                         pair2shls_full, pairs_locs_full, vmat[i])
-                elif USE_SPARSITY == 2:
+                '''
+                if USE_SPARSITY == 2:
                     aow = _scale_ao(ao_mask, wv)
                     #vmat[i][cupy.ix_(mask, mask)] += ao_mask[0].dot(aow.T)
                     add_sparse(vmat[i], ao_mask[0].dot(aow.T), idx)
                 else:
-                    raise NotImplementedError('Not implemented yet')
+                    raise NotImplementedError(f'USE_SPARSITY = {USE_SPARSITY} is not implemented')
             elif xctype == 'NLC':
                 raise NotImplementedError('NLC')
             elif xctype == 'MGGA':
                 den = rho[0] * weight
                 wv = vxc * weight
                 wv[[0, 4]] *= .5  # *.5 for v+v.T
+                '''
                 if USE_SPARSITY == 0:
                     aow = _scale_ao(ao[:4], wv[:4])
                     vmat[i] += ao[0].dot(aow.T)
@@ -490,12 +471,15 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
                         pair2shls_full, pairs_locs_full, vmat[i])
                     _tau_dot_sparse(ao, ao, wv[4], nbins, sindex, ao_loc,
                         pair2shls_full, pairs_locs_full, vmat[i])
-                else:
+                '''
+                if USE_SPARSITY == 2:
                     aow = _scale_ao(ao_mask, wv[:4])
                     vtmp = ao_mask[0].dot(aow.T)
                     vtmp+= _tau_dot(ao_mask, ao_mask, wv[4])
                     #vmat[i][cupy.ix_(mask, mask)] += vtmp
                     add_sparse(vmat[i], vtmp, idx)
+                else:
+                    raise NotImplementedError(f'USE_SPARSITY = {USE_SPARSITY} is not implemented')
             elif xctype == 'HF':
                 pass
             else:
@@ -503,7 +487,6 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
             #nelec[i] += den.sum()
             excsum[i] += cupy.dot(den, exc)[0]
             t1 = log.timer_debug1('integration', *t1)
-        ao = None
 
     vmat = contract('pi,npq->niq', coeff, vmat)
     vmat = contract('qj,niq->nij', coeff, vmat)
@@ -666,8 +649,8 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi=
     # AO basis -> gdftopt AO basis
     with_mocc = hasattr(dms, 'mo1')
     if with_mocc:
-        mo1 = cupy.einsum('nio,pi->npo', dms.mo1, coeff) * 2.0**0.5
-        occ_coeff = cupy.einsum('io,pi->po', dms.occ_coeff, coeff) * 2.0**0.5
+        mo1 = contract('nio,pi->npo', dms.mo1, coeff) * 2.0**0.5
+        occ_coeff = contract('io,pi->po', dms.occ_coeff, coeff) * 2.0**0.5
     dms = contract('nij,qj->niq', dms, coeff)
     dms = contract('pi,niq->npq', coeff, dms)
     nset = len(dms)
@@ -682,43 +665,52 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi=
     t0 = (logger.process_clock(), logger.perf_counter())
     for ao, mask, weights, coords in ni.block_loop(opt.mol, grids, nao, ao_deriv):
         p0, p1 = p1, p1+len(weights)
-        # precompute the first half
+        # precompute molecular orbitals
         if with_mocc:
+            occ_coeff_mask = occ_coeff[mask]
             if xctype == 'LDA':
-                c0 = _dot_ao_dm(mol, ao, occ_coeff, None, None, None)
+                c0 = _dot_ao_dm(mol, ao, occ_coeff_mask, None, None, None)
             elif xctype == "GGA":
                 c0 = cupy.empty([4,occ_coeff.shape[1],p1-p0])
                 for i in range(4):
-                    c0[i] = _dot_ao_dm(mol, ao[i], occ_coeff, None, None, None)
+                    c0[i] = _dot_ao_dm(mol, ao[i], occ_coeff_mask, None, None, None)
             else: # mgga
                 c0 = cupy.empty([4,occ_coeff.shape[1],p1-p0])
                 for i in range(4):
-                    c0[i] = _dot_ao_dm(mol, ao[i], occ_coeff, None, None, None)
-        # loop the second half
+                    c0[i] = _dot_ao_dm(mol, ao[i], occ_coeff_mask, None, None, None)
+        # precompute fxc_w
+        if xctype == 'LDA':
+            fxc_w = fxc[0,0,p0:p1] * weights
+        else:
+            fxc_w = fxc[:,:,p0:p1] * weights
+        # loop perturbed molecular orbitals
         for i in range(nset):
             if with_mocc:
-                rho1 = eval_rho3(opt.mol, ao, c0, mo1[i], xctype=xctype, with_lapl=False)
+                rho1 = eval_rho3(opt.mol, ao, c0, mo1[i][mask], xctype=xctype, with_lapl=False)
             else:
-                rho1 = eval_rho(opt.mol, ao, dms[i], xctype=xctype, hermi=hermi, with_lapl=False)
+                rho1 = eval_rho(opt.mol, ao, dms[i][np.ix_(mask,mask)], xctype=xctype, hermi=hermi, with_lapl=False)
 
             if xctype == 'LDA':
-                wv = rho1 * fxc[0,0,p0:p1] * weights
-                vmat[i] += ao.dot(_scale_ao(ao, wv).T)
+                wv = rho1 * fxc_w
+                vmat_tmp = ao.dot(_scale_ao(ao, wv).T)
+                add_sparse(vmat[i], vmat_tmp, mask)
             elif xctype == 'GGA':
-                wv = cupy.einsum('xg,xyg->yg', rho1, fxc[:,:,p0:p1]) * weights
+                wv = cupy.einsum('xg,xyg->yg', rho1, fxc_w)
                 wv[0] *= .5
-                vmat[i] += ao[0].dot(_scale_ao(ao, wv).T)
+                vmat_tmp = ao[0].dot(_scale_ao(ao, wv).T)
+                add_sparse(vmat[i], vmat_tmp, mask)
             elif xctype == 'NLC':
                 raise NotImplementedError('NLC')
             else:
-                wv = cupy.einsum('xg,xyg->yg', rho1, fxc[:,:,p0:p1]) * weights
+                wv = cupy.einsum('xg,xyg->yg', rho1, fxc_w)
                 wv[[0, 4]] *= .5
-                vmat[i] += ao[0].dot(_scale_ao(ao[:4], wv[:4]).T)
-                vmat[i] += _tau_dot(ao, ao, wv[4])
+                vmat_tmp = ao[0].dot(_scale_ao(ao[:4], wv[:4]).T)
+                vmat_tmp+= _tau_dot(ao, ao, wv[4])
+                add_sparse(vmat[i], vmat_tmp, mask)
         t0 = log.timer_debug1('vxc', *t0)
         ao = c0 = rho1 = None
-    vmat = cupy.einsum('pi,npq->niq', coeff, vmat)
-    vmat = cupy.einsum('qj,niq->nij', coeff, vmat)
+    vmat = contract('pi,npq->niq', coeff, vmat)
+    vmat = contract('qj,niq->nij', coeff, vmat)
     if xctype != 'LDA':
         #transpose_sum(vmat)
         vmat = vmat + vmat.transpose([0,2,1])
@@ -872,7 +864,7 @@ def nr_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
     vvrho = []
     for ao, mask, weight, coords \
             in ni.block_loop(mol, grids, nao, ao_deriv, max_memory=max_memory):
-        rho = eval_rho(opt.mol, ao, dms[0], xctype='GGA', hermi=1)
+        rho = eval_rho(opt.mol, ao, dms[0][np.ix_(mask,mask)], xctype='GGA', hermi=1)
         vvrho.append(rho)
     rho = cupy.hstack(vvrho)
     exc = 0
@@ -895,7 +887,9 @@ def nr_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
         wv = vv_vxc[:,p0:p1] * weight
         wv[0] *= .5
         aow = _scale_ao(ao, wv)
-        vmat += ao[0].dot(aow.T)
+        #vmat += ao[0].dot(aow.T)
+        add_sparse(vmat, ao[0].dot(aow.T), mask)
+
     vmat = vmat + vmat.T
     vmat = contract('pi,pq->iq', coeff, vmat)
     vmat = contract('qj,iq->ij', coeff, vmat)
@@ -1058,7 +1052,7 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000,
     if blksize is None:
         cupy.get_default_memory_pool().free_all_blocks()
         mem_avail = get_avail_mem()
-        blksize = int((mem_avail*.2/8/((comp+1)*nao + extra) - nao*2)/ ALIGNED) * ALIGNED
+        blksize = int((mem_avail*.2/8/((comp+1)*nao + extra))/ ALIGNED) * ALIGNED
         blksize = min(blksize, MIN_BLK_SIZE)
         log.debug1('Available GPU mem %f Mb, block_size %d', mem_avail/1e6, blksize)
         if blksize < ALIGNED:
@@ -1071,14 +1065,37 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000,
 
     mol = opt.mol
     with opt.gdft_envs_cache():
+        block_id = 0
         for ip0, ip1 in lib.prange(0, ngrids, blksize):
             coords = grids.coords[ip0:ip1]
             weight = grids.weights[ip0:ip1]
-            sindex = None#ni.screen_index[ip0//GRID_BLKSIZE:]
+            #sindex = ni.screen_index[ip0//GRID_BLKSIZE:]
             t0 = (logger.process_clock(), logger.perf_counter())
             ao = eval_ao(ni, mol, coords, deriv)
             log.timer_debug1('eval ao', *t0)
-            yield ao, sindex, weight, coords
+
+            # cache ao indices
+            if (deriv, block_id, blksize, ngrids) not in ni.non0ao_idx:
+                t0 = (logger.process_clock(), logger.perf_counter())
+                if deriv == 0:
+                    mask = cupy.any(cupy.abs(ao) > AO_THRESHOLD, axis=[1])
+                    idx = cupy.argwhere(mask).astype(np.int32)[:,0]
+                    ao_mask = ao[idx,:]
+                else:
+                    mask = cupy.any(cupy.abs(ao) > AO_THRESHOLD, axis=[0,2])
+                    idx = cupy.argwhere(mask).astype(np.int32)[:,0]
+                    ao_mask = ao[:,idx,:]
+                ni.non0ao_idx[deriv, block_id, blksize, ngrids] = idx
+                log.timer_debug1('initialize ao sparsity', *t0)
+            else:
+                idx = ni.non0ao_idx[deriv, block_id, blksize, ngrids]
+                if deriv == 0:
+                    ao_mask = ao[idx,:]
+                else:
+                    ao_mask = ao[:,idx,:]
+            block_id += 1
+            log.timer_debug1('eval rho', *t0)
+            yield ao_mask, idx, weight, coords
 
 class NumInt(numint.NumInt):
     from gpu4pyscf.lib.utils import to_cpu, to_gpu, device
@@ -1149,11 +1166,12 @@ def _make_pairs2shls_idx(pair_mask, l_bas_loc, hermi=0):
             pair2bra + pair2ket).astype(np.int32).reshape(2,-1)
     return bas_pair2shls, bas_pairs_locs
 
-def _contract_rho(bra, ket):
+def _contract_rho(bra, ket, rho=None):
     if bra.flags.c_contiguous and ket.flags.c_contiguous:
         assert bra.shape == ket.shape
         nao, ngrids = bra.shape
-        rho = cupy.empty(ngrids)
+        if rho is None:
+            rho = cupy.empty(ngrids)
         stream = cupy.cuda.get_current_stream()
         err = libgdft.GDFTcontract_rho(
             ctypes.cast(stream.ptr, ctypes.c_void_p),
diff --git a/gpu4pyscf/dft/rks.py b/gpu4pyscf/dft/rks.py
index 31e67cc8..ea12f511 100644
--- a/gpu4pyscf/dft/rks.py
+++ b/gpu4pyscf/dft/rks.py
@@ -79,7 +79,6 @@ def initialize_grids(ks, mol=None, dm=None):
             # Filter grids the first time setup grids
             ks.grids = prune_small_rho_grids_(ks, ks.mol, dm, ks.grids)
         t0 = logger.timer_debug1(ks, 'setting up grids', *t0)
-
         is_nlc = ks.nlc or ks._numint.libxc.is_nlc(ks.xc)
         if is_nlc and ks.nlcgrids.coords is None:
             if ks.nlcgrids.coords is None:
@@ -235,7 +234,14 @@ def __init__(self, mol, xc='LDA,VWN', disp=None):
         self._numint = numint.NumInt(xc=xc)
         self.disp = disp
         self.screen_tol = 1e-14
+
+        grids_level = self.grids.level
         self.grids = gen_grid.Grids(mol)
+        self.grids.level = grids_level
+
+        nlcgrids_level = self.nlcgrids.level
+        self.nlcgrids = gen_grid.Grids(mol)
+        self.nlcgrids.level = nlcgrids_level
 
     def get_dispersion(self):
         if self.disp is None:
diff --git a/gpu4pyscf/grad/rks.py b/gpu4pyscf/grad/rks.py
index 008baa40..f22e8184 100644
--- a/gpu4pyscf/grad/rks.py
+++ b/gpu4pyscf/grad/rks.py
@@ -108,7 +108,6 @@ def _get_veff(ks_grad, mol=None, dm=None):
 
 def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
             max_memory=2000, verbose=None):
-    log = logger.new_logger(mol, verbose)
     xctype = ni._xc_type(xc_code)
     opt = getattr(ni, 'gdftopt', None)
     if opt is None:
@@ -116,93 +115,79 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
         opt = ni.gdftopt
     mo_occ = cupy.asarray(dms.mo_occ)
     mo_coeff = cupy.asarray(dms.mo_coeff)
-
     coeff = cupy.asarray(opt.coeff)
     nao, nao0 = coeff.shape
     dms = cupy.asarray(dms)
     dms = [cupy.einsum('pi,ij,qj->pq', coeff, dm, coeff)
            for dm in dms.reshape(-1,nao0,nao0)]
     mo_coeff = coeff @ mo_coeff
-    nset = len(dms)
-
-    with opt.gdft_envs_cache():
-        if xctype == 'LDA':
-            ao_deriv = 1
-        else:
-            ao_deriv = 2
 
-        mem_avail = get_avail_mem()
-        comp = (ao_deriv+1)*(ao_deriv+2)*(ao_deriv+3)//6
-        block_size = int((mem_avail*.4/8/(comp+1)/nao - 3*nao*2)/ ALIGNED) * ALIGNED
-        block_size = min(block_size, MIN_BLK_SIZE)
-        log.debug1('Available GPU mem %f Mb, block_size %d', mem_avail/1e6, block_size)
+    nset = len(dms)
+    assert nset == 1
 
-        if block_size < ALIGNED:
-            raise RuntimeError('Not enough GPU memory')
+    if xctype == 'LDA':
+        ao_deriv = 1
+    else:
+        ao_deriv = 2
+
+    vmat = cupy.zeros((nset,3,nao,nao))
+    if xctype == 'LDA':
+        ao_deriv = 1
+        for ao_mask, idx, weight, _ in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory):
+            for idm in range(nset):
+                mo_coeff_mask = mo_coeff[idx,:]
+                rho = numint.eval_rho2(opt.mol, ao_mask[0], mo_coeff_mask, mo_occ, None, xctype)
+                vxc = ni.eval_xc_eff(xc_code, rho, 1, xctype=xctype)[1]
+                wv = weight * vxc[0]
+                aow = numint._scale_ao(ao_mask[0], wv)
+                vtmp = _d1_dot_(ao_mask[1:4], aow.T)
+                #idx = cupy.ix_(mask, mask)
+                #vmat[idm][0][idx] += vtmp[0]
+                #vmat[idm][1][idx] += vtmp[1]
+                #vmat[idm][2][idx] += vtmp[2]
+                add_sparse(vmat[idm][0], vtmp[0], idx)
+                add_sparse(vmat[idm][1], vtmp[1], idx)
+                add_sparse(vmat[idm][2], vtmp[2], idx)
+    elif xctype == 'GGA':
+        ao_deriv = 2
+        for ao_mask, idx, weight, _ in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory):
+            for idm in range(nset):
+                mo_coeff_mask = mo_coeff[idx,:]
+                rho = numint.eval_rho2(opt.mol, ao_mask[:4], mo_coeff_mask, mo_occ, None, xctype)
+                vxc = ni.eval_xc_eff(xc_code, rho, 1, xctype=xctype)[1]
+                wv = weight * vxc
+                wv[0] *= .5
+                vtmp = _gga_grad_sum_(ao_mask, wv)
+                #idx = cupy.ix_(mask, mask)
+                #vmat[idm][0][idx] += vtmp[0]
+                #vmat[idm][1][idx] += vtmp[1]
+                #vmat[idm][2][idx] += vtmp[2]
+                add_sparse(vmat[idm][0], vtmp[0], idx)
+                add_sparse(vmat[idm][1], vtmp[1], idx)
+                add_sparse(vmat[idm][2], vtmp[2], idx)
+    elif xctype == 'NLC':
+        raise NotImplementedError('NLC')
+
+    elif xctype == 'MGGA':
+        ao_deriv = 2
+        for ao_mask, idx, weight, _ in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory):
+            for idm in range(nset):
+                mo_coeff_mask = mo_coeff[idx,:]
+                rho = numint.eval_rho2(opt.mol, ao_mask[:10], mo_coeff_mask, mo_occ, None, xctype, with_lapl=False)
+                vxc = ni.eval_xc_eff(xc_code, rho, 1, xctype=xctype)[1]
+                wv = weight * vxc
+                wv[0] *= .5
+                wv[4] *= .5  # for the factor 1/2 in tau
+                vtmp = _gga_grad_sum_(ao_mask, wv)
+                vtmp += _tau_grad_dot_(ao_mask, wv[4])
+                #idx = cupy.ix_(mask, mask)
+                #vmat[idm][0][idx] += vtmp[0]
+                #vmat[idm][1][idx] += vtmp[1]
+                #vmat[idm][2][idx] += vtmp[2]
+                add_sparse(vmat[idm][0], vtmp[0], idx)
+                add_sparse(vmat[idm][1], vtmp[1], idx)
+                add_sparse(vmat[idm][2], vtmp[2], idx)
 
-        vmat = cupy.zeros((nset,3,nao,nao))
-        if xctype == 'LDA':
-            ao_deriv = 1
-            for ao, _, weight, _ in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory):
-                for idm in range(nset):
-                    rho = numint.eval_rho2(opt.mol, ao[0], mo_coeff, mo_occ, None, xctype)
-                    vxc = ni.eval_xc_eff(xc_code, rho, 1, xctype=xctype)[1]
-                    wv = weight * vxc[0]
-                    mask = cupy.any(cupy.abs(ao) > AO_THRESHOLD, axis=[0,2])
-                    idx = cupy.argwhere(mask).astype(numpy.int32)[:,0]
-                    ao_mask = ao[:,idx,:]
-                    aow = numint._scale_ao(ao_mask[0], wv)
-                    vtmp = _d1_dot_(ao_mask[1:4], aow.T)
-                    #idx = cupy.ix_(mask, mask)
-                    #vmat[idm][0][idx] += vtmp[0]
-                    #vmat[idm][1][idx] += vtmp[1]
-                    #vmat[idm][2][idx] += vtmp[2]
-                    add_sparse(vmat[idm][0], vtmp[0], idx)
-                    add_sparse(vmat[idm][1], vtmp[1], idx)
-                    add_sparse(vmat[idm][2], vtmp[2], idx)
-        elif xctype == 'GGA':
-            ao_deriv = 2
-            for ao, _, weight, _ in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory):
-                for idm in range(nset):
-                    rho = numint.eval_rho2(opt.mol, ao[:4], mo_coeff, mo_occ, None, xctype)
-                    vxc = ni.eval_xc_eff(xc_code, rho, 1, xctype=xctype)[1]
-                    wv = weight * vxc
-                    wv[0] *= .5
-                    mask = cupy.any(cupy.abs(ao) > AO_THRESHOLD, axis=[0,2])
-                    idx = cupy.argwhere(mask).astype(numpy.int32)[:,0]
-                    ao_mask = ao[:,idx,:]
-                    vtmp = _gga_grad_sum_(ao_mask, wv)
-                    #idx = cupy.ix_(mask, mask)
-                    #vmat[idm][0][idx] += vtmp[0]
-                    #vmat[idm][1][idx] += vtmp[1]
-                    #vmat[idm][2][idx] += vtmp[2]
-                    add_sparse(vmat[idm][0], vtmp[0], idx)
-                    add_sparse(vmat[idm][1], vtmp[1], idx)
-                    add_sparse(vmat[idm][2], vtmp[2], idx)
-        elif xctype == 'NLC':
-            raise NotImplementedError('NLC')
-
-        elif xctype == 'MGGA':
-            ao_deriv = 2
-            for ao, _, weight, _ in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory):
-                for idm in range(nset):
-                    rho = numint.eval_rho2(opt.mol, ao[:10], mo_coeff, mo_occ, None, xctype)
-                    vxc = ni.eval_xc_eff(xc_code, rho, 1, xctype=xctype)[1]
-                    wv = weight * vxc
-                    wv[0] *= .5
-                    wv[4] *= .5  # for the factor 1/2 in tau
-                    mask = cupy.any(cupy.abs(ao) > AO_THRESHOLD, axis=[0,2])
-                    idx = cupy.argwhere(mask).astype(numpy.int32)[:,0]
-                    ao_mask = ao[:,idx,:]
-                    vtmp = _gga_grad_sum_(ao_mask, wv)
-                    vtmp += _tau_grad_dot_(ao_mask, wv[4])
-                    #idx = cupy.ix_(mask, mask)
-                    #vmat[idm][0][idx] += vtmp[0]
-                    #vmat[idm][1][idx] += vtmp[1]
-                    #vmat[idm][2][idx] += vtmp[2]
-                    add_sparse(vmat[idm][0], vtmp[0], idx)
-                    add_sparse(vmat[idm][1], vtmp[1], idx)
-                    add_sparse(vmat[idm][2], vtmp[2], idx)
     vmat = [cupy.einsum('pi,npq,qj->nij', coeff, v, coeff) for v in vmat]
     exc = None
     if nset == 1:
@@ -222,6 +207,7 @@ def get_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
     mo_occ = cupy.asarray(dms.mo_occ)
     mo_coeff = cupy.asarray(dms.mo_coeff)
 
+    mol = opt.mol
     coeff = cupy.asarray(opt.coeff)
     nao, nao0 = coeff.shape
     dms = cupy.asarray(dms)
@@ -238,26 +224,31 @@ def get_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
 
     ao_deriv = 2
     vvrho = []
-    for ao, mask, weight, coords \
+    for ao_mask, mask, weight, coords \
             in ni.block_loop(mol, grids, nao, ao_deriv, max_memory=max_memory):
-        rho = numint.eval_rho2(opt.mol, ao[:4], mo_coeff, mo_occ, None, xctype)
+        mo_coeff_mask = mo_coeff[mask]
+        rho = numint.eval_rho2(mol, ao_mask[:4], mo_coeff_mask, mo_occ, None, xctype, with_lapl=False)
         vvrho.append(rho)
     rho = cupy.hstack(vvrho)
+
     vxc = numint._vv10nlc(rho, grids.coords, rho, grids.weights,
                           grids.coords, nlc_pars)[1]
     vv_vxc = xc_deriv.transform_vxc(rho, vxc, 'GGA', spin=0)
 
     vmat = cupy.zeros((3,nao,nao))
     p1 = 0
-    for ao, mask, weight, coords \
+    for ao_mask, mask, weight, coords \
             in ni.block_loop(mol, grids, nao, ao_deriv, max_memory):
         p0, p1 = p1, p1 + weight.size
         wv = vv_vxc[:,p0:p1] * weight
         wv[0] *= .5  # *.5 because vmat + vmat.T at the end
-        vmat += _gga_grad_sum_(ao, wv)
-
-    vmat = cupy.einsum('pi,npq,qj->nij', coeff, vmat, coeff)
+        vmat_tmp = _gga_grad_sum_(ao_mask, wv)
+        add_sparse(vmat[0], vmat_tmp[0], mask)
+        add_sparse(vmat[1], vmat_tmp[1], mask)
+        add_sparse(vmat[2], vmat_tmp[2], mask)
 
+    vmat = contract('npq,qj->npj', vmat, coeff)
+    vmat = contract('pi,npj->nij', coeff, vmat)
     exc = None
     # - sign because nabla_X = -nabla_x
     return exc, -vmat
diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py
index 26c441cf..190a118d 100644
--- a/gpu4pyscf/hessian/rhf.py
+++ b/gpu4pyscf/hessian/rhf.py
@@ -84,11 +84,11 @@ def hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
         s1ao[:,p0:p1] += s1a[:,p0:p1]
         s1ao[:,:,p0:p1] += s1a[:,p0:p1].transpose(0,2,1)
 
-        tmp = cupy.einsum('xpq,pi->xiq', s1ao, mocc)
-        s1oo = cupy.einsum('xiq,qj->xij', tmp, mocc)
+        tmp = contract('xpq,pi->xiq', s1ao, mocc)
+        s1oo = contract('xiq,qj->xij', tmp, mocc)
 
         #s1oo = cupy.einsum('xpq,pi,qj->xij', s1ao, mocc, mocc)
-        s1mo = cupy.einsum('xij,ip->xpj', s1ao, mo_coeff)
+        s1mo = contract('xij,ip->xpj', s1ao, mo_coeff)
 
         for j0 in range(i0+1):
             ja = atmlst[j0]
@@ -96,10 +96,10 @@ def hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
 # *2 for double occupancy, *2 for +c.c.
             #dm1 = cupy.einsum('ypi,qi->ypq', mo1[ja], mocc)
             #de2_gpu[i0,j0] += cupy.einsum('xpq,ypq->xy', h1ao[ia], dm1) * 4
-            de2[i0,j0] += cupy.einsum('xpi,ypi->xy', h1ao[ia], mo1[ja]) * 4
+            de2[i0,j0] += contract('xpi,ypi->xy', h1ao[ia], mo1[ja]) * 4
             dm1 = cupy.einsum('ypi,qi,i->ypq', mo1[ja], mocc, mo_energy[mo_occ>0])
-            de2[i0,j0] -= cupy.einsum('xpq,ypq->xy', s1mo, dm1) * 4
-            de2[i0,j0] -= cupy.einsum('xpq,ypq->xy', s1oo, mo_e1[ja]) * 2
+            de2[i0,j0] -= contract('xpq,ypq->xy', s1mo, dm1) * 4
+            de2[i0,j0] -= contract('xpq,ypq->xy', s1oo, mo_e1[ja]) * 2
         for j0 in range(i0):
             de2[j0,i0] = de2[i0,j0].T
 
@@ -325,11 +325,11 @@ def solve_mo1(mf, mo_energy, mo_coeff, mo_occ, h1mo,
     s1a = cupy.asarray(s1a)
 
     def _ao2mo(mat):
-        tmp = cupy.einsum('xij,jo->xio', mat, mocc)
-        return cupy.einsum('xik,ip->xpk', tmp, mo_coeff)
+        tmp = contract('xij,jo->xio', mat, mocc)
+        return contract('xik,ip->xpk', tmp, mo_coeff)
     cupy.get_default_memory_pool().free_all_blocks()
     # TODO: calculate blksize dynamically
-    blksize = 10
+    blksize = 8
     mo1s = [None] * mol.natm
     e1s = [None] * mol.natm
     aoslices = mol.aoslice_by_atom()
@@ -371,13 +371,13 @@ def gen_vind(mf, mo_coeff, mo_occ):
     def fx(mo1):
         mo1 = cupy.asarray(mo1)
         mo1 = mo1.reshape(-1,nmo,nocc)
-        mo1_mo = cupy.einsum('npo,ip->nio', mo1, mo_coeff)
-        dm1 = cupy.einsum('nio,jo->nij', 2.0*mo1_mo, mocc)
+        mo1_mo = contract('npo,ip->nio', mo1, mo_coeff)
+        dm1 = contract('nio,jo->nij', 2.0*mo1_mo, mocc)
         dm1 = dm1 + dm1.transpose(0,2,1)
         dm1 = tag_array(dm1, mo1=mo1_mo, occ_coeff=mocc, mo_occ=mo_occ)
         v1 = vresp(dm1)
-        tmp = cupy.einsum('nij,jo->nio', v1, mocc)
-        v1vo = cupy.einsum('nio,ip->npo', tmp, mo_coeff)
+        tmp = contract('nij,jo->nio', v1, mocc)
+        v1vo = contract('nio,ip->npo', tmp, mo_coeff)
         return v1vo
     return fx
 
diff --git a/gpu4pyscf/hessian/rks.py b/gpu4pyscf/hessian/rks.py
index 19041963..d3898d42 100644
--- a/gpu4pyscf/hessian/rks.py
+++ b/gpu4pyscf/hessian/rks.py
@@ -27,7 +27,7 @@
 from gpu4pyscf.hessian import rhf as rhf_hess
 from gpu4pyscf.grad import rks as rks_grad
 from gpu4pyscf.dft import numint
-from gpu4pyscf.lib.cupy_helper import contract
+from gpu4pyscf.lib.cupy_helper import contract, add_sparse
 from gpu4pyscf.lib import logger
 
 # import pyscf.grad.rks to activate nuc_grad_method method
@@ -97,10 +97,10 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
             veff -= (alpha-hyb)*.5 * vk1.transpose(0,2,1).reshape(3,3,nao,nao)
             t1 = log.timer_debug1('range-separated int2e_ipvip1 for atom %d'%ia, *t1)
             vk1 = vk2 = None
-        de2[i0,i0] += cupy.einsum('xypq,pq->xy', veff_diag[:,:,p0:p1], dm0[p0:p1])*2
+        de2[i0,i0] += contract('xypq,pq->xy', veff_diag[:,:,p0:p1], dm0[p0:p1])*2
         for j0, ja in enumerate(atmlst[:i0+1]):
             q0, q1 = aoslices[ja][2:]
-            de2[i0,j0] += cupy.einsum('xypq,pq->xy', veff[:,:,q0:q1], dm0[q0:q1])*2
+            de2[i0,j0] += contract('xypq,pq->xy', veff[:,:,q0:q1], dm0[q0:q1])*2
 
         for j0 in range(i0):
             de2[j0,i0] = de2[i0,j0].T
@@ -213,12 +213,14 @@ def _get_vxc_diag(hessobj, mo_coeff, mo_occ, max_memory):
         ao_deriv = 2
         for ao, mask, weight, coords \
                 in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory):
-            rho = numint.eval_rho2(opt.mol, ao[0], mo_coeff, mo_occ, mask, xctype)
+            mo_coeff_mask = mo_coeff[mask,:]
+            rho = numint.eval_rho2(opt.mol, ao[0], mo_coeff_mask, mo_occ, mask, xctype)
             vxc = ni.eval_xc_eff(mf.xc, rho, 1, xctype=xctype)[1]
             wv = weight * vxc[0]
             aow = numint._scale_ao(ao[0], wv)
             for i in range(6):
-                vmat[i] += numint._dot_ao_ao(mol, ao[i+4], aow, mask, shls_slice, ao_loc)
+                vmat_tmp = numint._dot_ao_ao(mol, ao[i+4], aow, mask, shls_slice, ao_loc)
+                add_sparse(vmat[i], vmat_tmp, mask)
             aow = None
 
     elif xctype == 'GGA':
@@ -226,20 +228,22 @@ def contract_(mat, ao, aoidx, wv, mask):
             aow = numint._scale_ao(ao[aoidx[0]], wv[1])
             aow+= numint._scale_ao(ao[aoidx[1]], wv[2])
             aow+= numint._scale_ao(ao[aoidx[2]], wv[3])
-            mat += numint._dot_ao_ao(mol, aow, ao[0], mask, shls_slice, ao_loc)
+            mat_tmp = numint._dot_ao_ao(mol, aow, ao[0], mask, shls_slice, ao_loc)
+            add_sparse(mat, mat_tmp, mask)
 
         ao_deriv = 3
         for ao, mask, weight, coords \
                 in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory):
-            rho = numint.eval_rho2(opt.mol, ao[:4], mo_coeff, mo_occ, mask, xctype)
+            mo_coeff_mask = mo_coeff[mask,:]
+            rho = numint.eval_rho2(opt.mol, ao[:4], mo_coeff_mask, mo_occ, mask, xctype)
             vxc = ni.eval_xc_eff(mf.xc, rho, 1, xctype=xctype)[1]
             wv = weight * vxc
             #:aow = numpy.einsum('npi,np->pi', ao[:4], wv[:4])
             aow = numint._scale_ao(ao[:4], wv[:4])
 
             for i in range(6):
-                vmat[i] += numint._dot_ao_ao(mol, ao[i+4], aow, mask, shls_slice, ao_loc)
-
+                vmat_tmp = numint._dot_ao_ao(mol, ao[i+4], aow, mask, shls_slice, ao_loc)
+                add_sparse(vmat[i], vmat_tmp, mask)
             contract_(vmat[0], ao, [XXX,XXY,XXZ], wv, mask)
             contract_(vmat[1], ao, [XXY,XYY,XYZ], wv, mask)
             contract_(vmat[2], ao, [XXZ,XYZ,XZZ], wv, mask)
@@ -253,19 +257,22 @@ def contract_(mat, ao, aoidx, wv, mask):
             aow = numint._scale_ao(ao[aoidx[0]], wv[1])
             aow+= numint._scale_ao(ao[aoidx[1]], wv[2])
             aow+= numint._scale_ao(ao[aoidx[2]], wv[3])
-            mat += numint._dot_ao_ao(mol, aow, ao[0], mask, shls_slice, ao_loc)
+            mat_tmp = numint._dot_ao_ao(mol, aow, ao[0], mask, shls_slice, ao_loc)
+            add_sparse(mat, mat_tmp, mask)
 
         ao_deriv = 3
         for ao, mask, weight, coords \
                 in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory):
-            rho = numint.eval_rho2(opt.mol, ao[:10], mo_coeff, mo_occ, mask, xctype)
+            mo_coeff_mask = mo_coeff[mask,:]
+            rho = numint.eval_rho2(opt.mol, ao[:10], mo_coeff_mask, mo_occ, mask, xctype)
             vxc = ni.eval_xc_eff(mf.xc, rho, 1, xctype=xctype)[1]
             wv = weight * vxc
             wv[4] *= .5  # for the factor 1/2 in tau
             #:aow = numpy.einsum('npi,np->pi', ao[:4], wv[:4])
             aow = numint._scale_ao(ao[:4], wv[:4])
             for i in range(6):
-                vmat[i] += numint._dot_ao_ao(mol, ao[i+4], aow, mask, shls_slice, ao_loc)
+                vmat_tmp = numint._dot_ao_ao(mol, ao[i+4], aow, mask, shls_slice, ao_loc)
+                add_sparse(vmat[i], vmat_tmp, mask)
 
             contract_(vmat[0], ao, [XXX,XXY,XXZ], wv, mask)
             contract_(vmat[1], ao, [XXY,XYY,XYZ], wv, mask)
@@ -276,12 +283,14 @@ def contract_(mat, ao, aoidx, wv, mask):
 
             aow = [numint._scale_ao(ao[i], wv[4]) for i in range(1, 4)]
             for i, j in enumerate([XXX, XXY, XXZ, XYY, XYZ, XZZ]):
-                vmat[i] += numint._dot_ao_ao(mol, ao[j], aow[0], mask, shls_slice, ao_loc)
+                vmat_tmp = numint._dot_ao_ao(mol, ao[j], aow[0], mask, shls_slice, ao_loc)
+                add_sparse(vmat[i], vmat_tmp, mask)
             for i, j in enumerate([XXY, XYY, XYZ, YYY, YYZ, YZZ]):
-                vmat[i] += numint._dot_ao_ao(mol, ao[j], aow[1], mask, shls_slice, ao_loc)
+                vmat_tmp = numint._dot_ao_ao(mol, ao[j], aow[1], mask, shls_slice, ao_loc)
+                add_sparse(vmat[i], vmat_tmp, mask)
             for i, j in enumerate([XXZ, XYZ, XZZ, YYZ, YZZ, ZZZ]):
-                vmat[i] += numint._dot_ao_ao(mol, ao[j], aow[2], mask, shls_slice, ao_loc)
-
+                vmat_tmp = numint._dot_ao_ao(mol, ao[j], aow[2], mask, shls_slice, ao_loc)
+                add_sparse(vmat[i], vmat_tmp, mask)
     vmat = vmat[[0,1,2,
                  1,3,4,
                  2,4,5]]
@@ -290,49 +299,45 @@ def contract_(mat, ao, aoidx, wv, mask):
     return vmat.reshape(3,3,nao_sph,nao_sph)
 
 def _make_dR_rho1(ao, ao_dm0, atm_id, aoslices, xctype):
-    # TODO: hard coded
-    ao = ao.transpose([0,2,1])
-    ao_dm0 = [x.T for x in ao_dm0]
-
     p0, p1 = aoslices[atm_id][2:]
-    ngrids = ao[0].shape[0]
+    ngrids = ao[0].shape[1]
     if xctype == 'GGA':
         rho1 = cupy.zeros((3,4,ngrids))
     elif xctype == 'MGGA':
         rho1 = cupy.zeros((3,5,ngrids))
-        ao_dm0_x = ao_dm0[1][:,p0:p1]
-        ao_dm0_y = ao_dm0[2][:,p0:p1]
-        ao_dm0_z = ao_dm0[3][:,p0:p1]
+        ao_dm0_x = ao_dm0[1][p0:p1]
+        ao_dm0_y = ao_dm0[2][p0:p1]
+        ao_dm0_z = ao_dm0[3][p0:p1]
         # (d_X \nabla mu) dot \nalba nu DM_{mu,nu}
-        rho1[0,4] += cupy.einsum('pi,pi->p', ao[XX,:,p0:p1], ao_dm0_x)
-        rho1[0,4] += cupy.einsum('pi,pi->p', ao[XY,:,p0:p1], ao_dm0_y)
-        rho1[0,4] += cupy.einsum('pi,pi->p', ao[XZ,:,p0:p1], ao_dm0_z)
-        rho1[1,4] += cupy.einsum('pi,pi->p', ao[YX,:,p0:p1], ao_dm0_x)
-        rho1[1,4] += cupy.einsum('pi,pi->p', ao[YY,:,p0:p1], ao_dm0_y)
-        rho1[1,4] += cupy.einsum('pi,pi->p', ao[YZ,:,p0:p1], ao_dm0_z)
-        rho1[2,4] += cupy.einsum('pi,pi->p', ao[ZX,:,p0:p1], ao_dm0_x)
-        rho1[2,4] += cupy.einsum('pi,pi->p', ao[ZY,:,p0:p1], ao_dm0_y)
-        rho1[2,4] += cupy.einsum('pi,pi->p', ao[ZZ,:,p0:p1], ao_dm0_z)
+        rho1[0,4] += cupy.einsum('ip,ip->p', ao[XX,p0:p1], ao_dm0_x)
+        rho1[0,4] += cupy.einsum('ip,ip->p', ao[XY,p0:p1], ao_dm0_y)
+        rho1[0,4] += cupy.einsum('ip,ip->p', ao[XZ,p0:p1], ao_dm0_z)
+        rho1[1,4] += cupy.einsum('ip,ip->p', ao[YX,p0:p1], ao_dm0_x)
+        rho1[1,4] += cupy.einsum('ip,ip->p', ao[YY,p0:p1], ao_dm0_y)
+        rho1[1,4] += cupy.einsum('ip,ip->p', ao[YZ,p0:p1], ao_dm0_z)
+        rho1[2,4] += cupy.einsum('ip,ip->p', ao[ZX,p0:p1], ao_dm0_x)
+        rho1[2,4] += cupy.einsum('ip,ip->p', ao[ZY,p0:p1], ao_dm0_y)
+        rho1[2,4] += cupy.einsum('ip,ip->p', ao[ZZ,p0:p1], ao_dm0_z)
         rho1[:,4] *= .5
     else:
         raise RuntimeError
 
-    ao_dm0_0 = ao_dm0[0][:,p0:p1]
+    ao_dm0_0 = ao_dm0[0][p0:p1]
     # (d_X \nabla_x mu) nu DM_{mu,nu}
-    rho1[:,0] = cupy.einsum('xpi,pi->xp', ao[1:4,:,p0:p1], ao_dm0_0)
-    rho1[0,1]+= cupy.einsum('pi,pi->p', ao[XX,:,p0:p1], ao_dm0_0)
-    rho1[0,2]+= cupy.einsum('pi,pi->p', ao[XY,:,p0:p1], ao_dm0_0)
-    rho1[0,3]+= cupy.einsum('pi,pi->p', ao[XZ,:,p0:p1], ao_dm0_0)
-    rho1[1,1]+= cupy.einsum('pi,pi->p', ao[YX,:,p0:p1], ao_dm0_0)
-    rho1[1,2]+= cupy.einsum('pi,pi->p', ao[YY,:,p0:p1], ao_dm0_0)
-    rho1[1,3]+= cupy.einsum('pi,pi->p', ao[YZ,:,p0:p1], ao_dm0_0)
-    rho1[2,1]+= cupy.einsum('pi,pi->p', ao[ZX,:,p0:p1], ao_dm0_0)
-    rho1[2,2]+= cupy.einsum('pi,pi->p', ao[ZY,:,p0:p1], ao_dm0_0)
-    rho1[2,3]+= cupy.einsum('pi,pi->p', ao[ZZ,:,p0:p1], ao_dm0_0)
+    rho1[:,0] = cupy.einsum('xip,ip->xp', ao[1:4,p0:p1], ao_dm0_0)
+    rho1[0,1]+= cupy.einsum('ip,ip->p', ao[XX,p0:p1], ao_dm0_0)
+    rho1[0,2]+= cupy.einsum('ip,ip->p', ao[XY,p0:p1], ao_dm0_0)
+    rho1[0,3]+= cupy.einsum('ip,ip->p', ao[XZ,p0:p1], ao_dm0_0)
+    rho1[1,1]+= cupy.einsum('ip,ip->p', ao[YX,p0:p1], ao_dm0_0)
+    rho1[1,2]+= cupy.einsum('ip,ip->p', ao[YY,p0:p1], ao_dm0_0)
+    rho1[1,3]+= cupy.einsum('ip,ip->p', ao[YZ,p0:p1], ao_dm0_0)
+    rho1[2,1]+= cupy.einsum('ip,ip->p', ao[ZX,p0:p1], ao_dm0_0)
+    rho1[2,2]+= cupy.einsum('ip,ip->p', ao[ZY,p0:p1], ao_dm0_0)
+    rho1[2,3]+= cupy.einsum('ip,ip->p', ao[ZZ,p0:p1], ao_dm0_0)
     # (d_X mu) (\nabla_x nu) DM_{mu,nu}
-    rho1[:,1] += cupy.einsum('xpi,pi->xp', ao[1:4,:,p0:p1], ao_dm0[1][:,p0:p1])
-    rho1[:,2] += cupy.einsum('xpi,pi->xp', ao[1:4,:,p0:p1], ao_dm0[2][:,p0:p1])
-    rho1[:,3] += cupy.einsum('xpi,pi->xp', ao[1:4,:,p0:p1], ao_dm0[3][:,p0:p1])
+    rho1[:,1] += cupy.einsum('xip,ip->xp', ao[1:4,p0:p1], ao_dm0[1][p0:p1])
+    rho1[:,2] += cupy.einsum('xip,ip->xp', ao[1:4,p0:p1], ao_dm0[2][p0:p1])
+    rho1[:,3] += cupy.einsum('xip,ip->xp', ao[1:4,p0:p1], ao_dm0[3][p0:p1])
 
     # *2 for |mu> DM <d_X nu|
     return rho1 * 2
@@ -378,13 +383,18 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
 
     dm0 = mf.make_rdm1(mo_coeff, mo_occ)
 
+    ## transform object in sorted AO
+    #mo_coeff = coeff @ mo_coeff
+    #dm0 = coeff @ dm0
+    #dm0 = dm0 @ coeff.T
+
     vmat = cupy.zeros((mol.natm,3,3,nao,nao))
     ipip = cupy.zeros((3,3,nao,nao))
     if xctype == 'LDA':
         ao_deriv = 1
         for ao, mask, weight, coords \
                 in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory):
-            ao = contract('nip,ij->njp', ao, coeff)
+            ao = contract('nip,ij->njp', ao, coeff[mask])
             rho = numint.eval_rho2(opt.mol, ao[0], mo_coeff, mo_occ, mask, xctype)
             vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3]
             wv = weight * vxc[0]
@@ -396,7 +406,7 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
             for ia in range(mol.natm):
                 p0, p1 = aoslices[ia][2:]
                 # *2 for \nabla|ket> in rho1
-                rho1 = cupy.einsum('xig,ig->xg', ao[1:,p0:p1,:], ao_dm0[p0:p1,:]) * 2
+                rho1 = contract('xig,ig->xg', ao[1:,p0:p1,:], ao_dm0[p0:p1,:]) * 2
                 # aow ~ rho1 ~ d/dR1
                 wv = wf * rho1
                 aow = [numint._scale_ao(ao[0], wv[i]) for i in range(3)]
@@ -413,7 +423,7 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
         for ao, mask, weight, coords \
                 in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory, extra=5*comp*nao):
             # TODO: improve efficiency
-            ao = contract('nip,ij->njp', ao, coeff)
+            ao = contract('nip,ij->njp', ao, coeff[mask])
             rho = numint.eval_rho2(opt.mol, ao[:4], mo_coeff, mo_occ, mask, xctype)
             vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3]
             wv = weight * vxc
@@ -445,7 +455,7 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
         ao_deriv = 2
         for ao, mask, weight, coords \
                 in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory):
-            ao = contract('nip,ij->njp', ao, coeff)
+            ao = contract('nip,ij->njp', ao, coeff[mask])
             rho = numint.eval_rho2(opt.mol, ao[:10], mo_coeff, mo_occ, mask, xctype)
             vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3]
             wv = weight * vxc
@@ -523,7 +533,7 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory):
         ao_deriv = 1
         for ao, mask, weight, coords \
                 in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory):
-            ao = contract('nip,ij->njp', ao, coeff)
+            ao = contract('nip,ij->njp', ao, coeff[mask])
             rho = numint.eval_rho2(opt.mol, ao[0], mo_coeff, mo_occ, mask, xctype)
             vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3]
             wv = weight * vxc[0]
@@ -546,7 +556,7 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory):
         for ao, mask, weight, coords \
                 in ni.block_loop(mol, grids, nao, ao_deriv, max_memory):
             # TODO: improve efficiency
-            ao = contract('nip,ij->njp', ao, coeff)
+            ao = contract('nip,ij->njp', ao, coeff[mask])
             rho = numint.eval_rho2(mol, ao[:4], mo_coeff, mo_occ, mask, xctype)
             vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3]
             wv = weight * vxc
@@ -570,7 +580,7 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory):
         ao_deriv = 2
         for ao, mask, weight, coords \
                 in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory):
-            ao = contract('nip,ij->njp', ao, coeff)
+            ao = contract('nip,ij->njp', ao, coeff[mask])
             rho = numint.eval_rho2(opt.mol, ao[:10], mo_coeff, mo_occ, mask, xctype)
             vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3]
             wv = weight * vxc
@@ -583,7 +593,7 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory):
             wf = weight * fxc
             for ia in range(mol.natm):
                 dR_rho1 = _make_dR_rho1(ao, ao_dm0, ia, aoslices, xctype)
-                wv = cupy.einsum('xyg,sxg->syg', wf, dR_rho1)
+                wv = contract('xyg,sxg->syg', wf, dR_rho1)
                 wv[:,0] *= .5
                 wv[:,4] *= .25
                 aow = [numint._scale_ao(ao[:4], wv[i,:4]) for i in range(3)]
@@ -597,8 +607,8 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory):
         p0, p1 = aoslices[ia][2:]
         vmat[ia,:,p0:p1] += v_ip[:,p0:p1]
         vmat[ia] = -vmat[ia] - vmat[ia].transpose(0,2,1)
-    vmat = cupy.einsum("kxij,jq->kxiq", vmat, mocc)
-    vmat = cupy.einsum("kxiq,ip->kxpq", vmat, mo_coeff)
+    vmat = contract("kxij,jq->kxiq", vmat, mocc)
+    vmat = contract("kxiq,ip->kxpq", vmat, mo_coeff)
 
     return vmat
 
diff --git a/setup.py b/setup.py
index fd5efdab..57372a31 100755
--- a/setup.py
+++ b/setup.py
@@ -122,7 +122,7 @@ def initialize_with_default_plat_name(self):
     ],
     cmdclass={'build_py': CMakeBuildPy},
     install_requires=[
-        'pyscf>=2.3.0',
+        'pyscf>=2.4.0',
         f'cupy-cuda{CUDA_VERSION}>=12.0',
         'dftd3==0.7.0',
         'dftd4==3.5.0',

From 8635711970a899ea108ecf8b47ff1d0e15d9646c Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <wxj6000@gmail.com>
Date: Fri, 27 Oct 2023 17:54:43 -0700
Subject: [PATCH 16/19] Update __init__.py

---
 gpu4pyscf/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gpu4pyscf/__init__.py b/gpu4pyscf/__init__.py
index ca735418..681a2e88 100644
--- a/gpu4pyscf/__init__.py
+++ b/gpu4pyscf/__init__.py
@@ -1,2 +1,2 @@
 from . import lib, grad, hessian, solvent, scf, dft
-__version__ = '0.6.4'
+__version__ = '0.6.5'

From be1aef0080bca6ec09836884964b400a3d988ba8 Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <wxj6000@gmail.com>
Date: Thu, 2 Nov 2023 21:02:25 -0700
Subject: [PATCH 17/19] Optimize hessian 3 (#54)

* numpy -> cupy for solvent

* for linter

* remove grad switch from pcm.py

* passed flake8

* solvent integrals on GPU

* flake8

* compatible with pyscf-2.4.0

* added solvent

* fixed issues for to_cpu

* store intermediate variables on CPU

* cupy.einsum -> contract

* optimized dft integration for gradient and hessian

* remove lprof

* fixed a bug in nlc

* precompute fxc_x

* optimize hessian & gpu timer

* remove scale_ao
---
 examples/00-h2o.py                      |   4 +-
 examples/dft_driver.py                  |   2 +-
 gpu4pyscf/df/df.py                      |   4 +-
 gpu4pyscf/df/df_jk.py                   |   4 +-
 gpu4pyscf/df/grad/rhf.py                |   4 +-
 gpu4pyscf/df/grad/rks.py                |   2 +-
 gpu4pyscf/df/hessian/rhf.py             |  32 +--
 gpu4pyscf/df/hessian/rks.py             |   2 +-
 gpu4pyscf/df/int3c2e.py                 |  28 +--
 gpu4pyscf/dft/libxc.py                  |  35 +--
 gpu4pyscf/dft/numint.py                 | 184 ++++++++++++---
 gpu4pyscf/dft/rks.py                    |   6 +-
 gpu4pyscf/grad/rhf.py                   |  40 ++--
 gpu4pyscf/grad/rks.py                   |  68 +++---
 gpu4pyscf/hessian/rks.py                | 142 +++++++----
 gpu4pyscf/lib/cupy_helper.py            |  18 +-
 gpu4pyscf/lib/cupy_helper/add_sparse.cu |  13 +-
 gpu4pyscf/lib/cutensor.py               |  27 ++-
 gpu4pyscf/lib/gdft/CMakeLists.txt       |   2 +-
 gpu4pyscf/lib/gdft/contract_rho.cu      | 299 +++++++++++++++++++++++-
 gpu4pyscf/lib/gdft/nr_eval_gto.cu       |  27 ---
 gpu4pyscf/lib/logger.py                 |  51 +++-
 gpu4pyscf/scf/hf.py                     |  14 +-
 23 files changed, 748 insertions(+), 260 deletions(-)

diff --git a/examples/00-h2o.py b/examples/00-h2o.py
index 7f17e62d..7df44258 100644
--- a/examples/00-h2o.py
+++ b/examples/00-h2o.py
@@ -24,7 +24,7 @@
 H       0.7570000000     0.0000000000    -0.4696000000
 '''
 
-xc='LDA'
+xc='B3LYP'
 bas='def2-tzvpp'
 auxbasis='def2-tzvpp-jkfit'
 scf_tol = 1e-10
@@ -34,7 +34,7 @@
 
 mol = pyscf.M(atom=atom, basis=bas, max_memory=32000)
 
-mol.verbose = 4
+mol.verbose = 6
 mf_GPU = rks.RKS(mol, xc=xc).density_fit(auxbasis=auxbasis)
 mf_GPU.grids.level = grids_level
 mf_GPU.conv_tol = scf_tol
diff --git a/examples/dft_driver.py b/examples/dft_driver.py
index 3b68d665..12628086 100644
--- a/examples/dft_driver.py
+++ b/examples/dft_driver.py
@@ -35,7 +35,7 @@
     basis=bas,
     max_memory=32000)
 # set verbose >= 6 for debugging timer
-mol.verbose = 6
+mol.verbose = 1
 
 mf_df = rks.RKS(mol, xc=args.xc).density_fit(auxbasis=args.auxbasis)
 if args.solvent:
diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py
index 28998230..da59f9da 100644
--- a/gpu4pyscf/df/df.py
+++ b/gpu4pyscf/df/df.py
@@ -67,8 +67,8 @@ def build(self, direct_scf_tol=1e-14, omega=None):
         idx = np.arange(nao)
         self.diag_idx = cupy.asarray(idx*(idx+1)//2+idx)
 
-        t0 = (logger.process_clock(), logger.perf_counter())
         log = logger.new_logger(mol, mol.verbose)
+        t0 = log.init_timer()
         if auxmol is None:
             self.auxmol = auxmol = addons.make_auxmol(mol, self.auxbasis)
 
@@ -217,7 +217,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False):
     nq = len(intopt.log_qs)
     for cp_ij_id, _ in enumerate(intopt.log_qs):
         if len(intopt.ao_pairs_row[cp_ij_id]) == 0: continue
-        t1 = (logger.process_clock(), logger.perf_counter())
+        t1 = log.init_timer()
         cpi = intopt.cp_idx[cp_ij_id]
         cpj = intopt.cp_jdx[cp_ij_id]
         li = intopt.angular[cpi]
diff --git a/gpu4pyscf/df/df_jk.py b/gpu4pyscf/df/df_jk.py
index e27f16f4..76f4bed0 100644
--- a/gpu4pyscf/df/df_jk.py
+++ b/gpu4pyscf/df/df_jk.py
@@ -20,9 +20,9 @@
 import cupy
 import numpy
 from pyscf import lib, scf, __config__
-from pyscf.lib import logger
 from pyscf.scf import dhf
 from pyscf.df import df_jk, addons
+from gpu4pyscf.lib import logger
 from gpu4pyscf.lib.cupy_helper import contract, take_last2d, transpose_sum, load_library, get_avail_mem
 from gpu4pyscf.dft import rks, numint
 from gpu4pyscf.scf import hf
@@ -250,7 +250,7 @@ def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e-
     nao = dms_tag.shape[-1]
     dms = dms_tag.reshape([-1,nao,nao])
     nset = dms.shape[0]
-    t0 = (logger.process_clock(), logger.perf_counter())
+    t0 = log.init_timer()
     if dfobj._cderi is None:
         log.debug('CDERI not found, build...')
         dfobj.build(direct_scf_tol=direct_scf_tol, omega=omega)
diff --git a/gpu4pyscf/df/grad/rhf.py b/gpu4pyscf/df/grad/rhf.py
index 84f5ed23..febdee40 100644
--- a/gpu4pyscf/df/grad/rhf.py
+++ b/gpu4pyscf/df/grad/rhf.py
@@ -18,12 +18,12 @@
 import cupy
 from cupyx.scipy.linalg import solve_triangular
 from pyscf.df.grad import rhf
-from pyscf.lib import logger
 from pyscf import lib, scf, gto
 from gpu4pyscf.df import int3c2e
 from gpu4pyscf.lib.cupy_helper import print_mem_info, tag_array, unpack_tril, contract, load_library
 from gpu4pyscf.grad.rhf import grad_elec
 from gpu4pyscf import __config__
+from gpu4pyscf.lib import logger
 
 libcupy_helper = load_library('libcupy_helper')
 
@@ -154,7 +154,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
         vkaux = cupy.zeros((3,naux_cart))
     cupy.get_default_memory_pool().free_all_blocks()
     for cp_kl_id in range(len(intopt.aux_log_qs)):
-        t1 = (logger.process_clock(), logger.perf_counter())
+        t1 = log.init_timer()
         k0, k1 = intopt.cart_aux_loc[cp_kl_id], intopt.cart_aux_loc[cp_kl_id+1]
         assert k1-k0 <= block_size
         if with_j:
diff --git a/gpu4pyscf/df/grad/rks.py b/gpu4pyscf/df/grad/rks.py
index 2ef88e86..4708c4d7 100644
--- a/gpu4pyscf/df/grad/rks.py
+++ b/gpu4pyscf/df/grad/rks.py
@@ -29,7 +29,7 @@ def get_veff(ks_grad, mol=None, dm=None):
     '''
     if mol is None: mol = ks_grad.mol
     if dm is None: dm = ks_grad.base.make_rdm1()
-    t0 = (logger.process_clock(), logger.perf_counter())
+    t0 = logger.init_timer(ks_grad)
 
     mf = ks_grad.base
     ni = mf._numint
diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py
index 46cb38ec..1141de1f 100644
--- a/gpu4pyscf/df/hessian/rhf.py
+++ b/gpu4pyscf/df/hessian/rhf.py
@@ -55,7 +55,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     '''Partial derivative
     '''
     log = logger.new_logger(hessobj, verbose)
-    time0 = t1 = (logger.process_clock(), logger.perf_counter())
+    time0 = t1 = log.init_timer()
 
     mol = hessobj.mol
     mf = hessobj.base
@@ -393,12 +393,14 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
 def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
             verbose=None, with_k=True, omega=None):
     log = logger.new_logger(hessobj, verbose)
-    t0 = (logger.process_clock(), logger.perf_counter())
+    t0 = log.init_timer()
     mol = hessobj.mol
     if atmlst is None:
         atmlst = range(mol.natm)
     # FIXME
     with_k = True
+    mo_coeff = cupy.asarray(mo_coeff)
+    mo_occ = cupy.asarray(mo_occ)
 
     mf = hessobj.base
     #auxmol = hessobj.base.with_df.auxmol
@@ -441,7 +443,7 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
         rhok0_Pl_ = np.empty_like(wk_Pl_)
         for p0, p1 in lib.prange(0,nao,64):
             wk_tmp = cupy.asarray(wk_Pl_[:,p0:p1])
-            rhok0_Pl_[:,p0:p1] = cupy.einsum('pq,qio->pio', int2c_inv, wk_tmp).get()
+            rhok0_Pl_[:,p0:p1] = contract('pq,qio->pio', int2c_inv, wk_tmp).get()
     wj = wk_Pl_ = wk_P__ = int2c_inv = int2c = None
 
     # int3c_ip1 contributions
@@ -449,8 +451,8 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
     vj1_buf = vj1_buf[cupy.ix_(numpy.arange(3), rev_ao_idx, rev_ao_idx)]
     vk1_buf = vk1_buf[cupy.ix_(numpy.arange(3), rev_ao_idx, rev_ao_idx)]
 
-    vj1_int3c_ip1 = -cupy.einsum('nxiq,ip->nxpq', vj1_ao, mo_coeff)
-    vk1_int3c_ip1 = -cupy.einsum('nxiq,ip->nxpq', vk1_ao, mo_coeff)
+    vj1_int3c_ip1 = -contract('nxiq,ip->nxpq', vj1_ao, mo_coeff)
+    vk1_int3c_ip1 = -contract('nxiq,ip->nxpq', vk1_ao, mo_coeff)
     vj1_ao = vk1_ao = None
     t0 = log.timer_debug1('Fock matrix due to int3c2e_ip1', *t0)
 
@@ -475,15 +477,15 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
 
         for p0, p1 in lib.prange(0,nao,64):
             rhok_tmp = cupy.asarray(rhok0_Pl_[:,p0:p1])
-            vj1_tmp = cupy.einsum('pio,xp->xpio', rhok_tmp, wj0_10)
+            vj1_tmp = contract('pio,xp->xpio', rhok_tmp, wj0_10)
 
-            wk0_10_Pl_ = cupy.einsum('xqp,pio->xqio', int2c_ip1, rhok_tmp)
-            vj1_tmp += cupy.einsum('xpio,p->xpio', wk0_10_Pl_, rhoj0)
-            vj1_int3c_ip2[:,:,p0:p1] += cupy.einsum('xpio,pa->axio', vj1_tmp, aux2atom)
+            wk0_10_Pl_ = contract('xqp,pio->xqio', int2c_ip1, rhok_tmp)
+            vj1_tmp += contract('xpio,p->xpio', wk0_10_Pl_, rhoj0)
+            vj1_int3c_ip2[:,:,p0:p1] += contract('xpio,pa->axio', vj1_tmp, aux2atom)
             if with_k:
-                vk1_tmp = 2.0 * cupy.einsum('xpio,pro->xpir', wk0_10_Pl_, rhok0_P__)
-                vk1_tmp += 2.0 * cupy.einsum('xpro,pir->xpio', wk0_10_P__, rhok_tmp)
-                vk1_int3c_ip2[:,:,p0:p1] += cupy.einsum('xpio,pa->axio', vk1_tmp, aux2atom)
+                vk1_tmp = 2.0 * contract('xpio,pro->xpir', wk0_10_Pl_, rhok0_P__)
+                vk1_tmp += 2.0 * contract('xpro,pir->xpio', wk0_10_P__, rhok_tmp)
+                vk1_int3c_ip2[:,:,p0:p1] += contract('xpio,pa->axio', vk1_tmp, aux2atom)
         wj0_10 = wk0_10_P__ = rhok0_P__ = int2c_ip1 = None
         vj1_tmp = vk1_tmp = wk0_10_Pl_ = rhoj0 = rhok0_Pl_ = None
         aux2atom = None
@@ -498,8 +500,8 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
 
     # ========================== sorted AO end ================================
     def _ao2mo(mat):
-        tmp = cupy.einsum('xij,jo->xio', mat, mocc)
-        return cupy.einsum('xik,ip->xpk', tmp, mo_coeff)
+        tmp = contract('xij,jo->xio', mat, mocc)
+        return contract('xik,ip->xpk', tmp, mo_coeff)
 
     vj1_int3c = vj1_int3c_ip1 + vj1_int3c_ip2
     vj1_int3c_ip1 = vj1_int3c_ip2 = None
@@ -522,7 +524,7 @@ def _ao2mo(mat):
             vk1_ao[:,:,p0:p1] -= vk1_buf[:,p0:p1,:].transpose(0,2,1)
 
         h1 = hcore_deriv(ia)
-        h1 = _ao2mo(h1)
+        h1 = _ao2mo(cupy.asarray(h1))
         vj1 = vj1_int3c[ia] + _ao2mo(vj1_ao)
         if with_k:
             vk1 = vk1_int3c[ia] + _ao2mo(vk1_ao)
diff --git a/gpu4pyscf/df/hessian/rks.py b/gpu4pyscf/df/hessian/rks.py
index d8986f1f..b7432314 100644
--- a/gpu4pyscf/df/hessian/rks.py
+++ b/gpu4pyscf/df/hessian/rks.py
@@ -37,7 +37,7 @@
 def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
                       atmlst=None, max_memory=4000, verbose=None):
     log = logger.new_logger(hessobj, verbose)
-    time0 = t1 = (logger.process_clock(), logger.perf_counter())
+    time0 = t1 = log.init_timer()
 
     mol = hessobj.mol
     mf = hessobj.base
diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py
index fc755a27..f3a43442 100644
--- a/gpu4pyscf/df/int3c2e.py
+++ b/gpu4pyscf/df/int3c2e.py
@@ -184,7 +184,7 @@ def build(self, cutoff=1e-14, group_size=None,
         a tot_mol is created with concatenating [mol, fake_mol, aux_mol]
         we will pair (ao,ao) and (aux,1) separately.
         '''
-        cput0 = (logger.process_clock(), logger.perf_counter())
+        cput0 = logger.init_timer(self.mol)
         sorted_mol, sorted_idx, uniq_l_ctr, l_ctr_counts = sort_mol(self.mol)
         if group_size is not None :
             uniq_l_ctr, l_ctr_counts = _split_l_ctr_groups(uniq_l_ctr, l_ctr_counts, group_size)
@@ -314,6 +314,7 @@ def build(self, cutoff=1e-14, group_size=None,
             tot_mol._atm.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(tot_mol.natm),
             tot_mol._bas.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(tot_mol.nbas),
             tot_mol._env.ctypes.data_as(ctypes.c_void_p))
+
         cput1 = logger.timer_debug1(tot_mol, 'Initialize GPU cache', *cput1)
         self.bas_pairs_locs = bas_pairs_locs
         ncptype = len(self.log_qs)
@@ -745,29 +746,24 @@ def get_int3c2e_ip1_vjk(intopt, rhoj, rhok, dm0_tag, aoslices, with_k=True, omeg
         vj1_buf[:,i0:i1,j0:j1] += contract('xpji,p->xij', int3c_blk, rhoj[k0:k1])
         # initialize intermediate variables
         if count % ncp_ij == 0:
-            rhoj0 = cupy.zeros([3,k1-k0,nao_sph])
             rhok_tmp = cupy.asarray(rhok[k0:k1])
-            vj1_ao = cupy.zeros([3,nao_sph,nao_sph,nocc])
             if with_k:
                 rhok0_slice = contract('pio,Jo->piJ', rhok_tmp, orbo) * 2
                 rhok0 = contract('pli,lo->poi', rhok0_slice, orbo)
-                int3c_ip1_occ = cupy.zeros([3,k1-k0,nao_sph,nocc])
-                vk1_ao = cupy.zeros([3,nao_sph,nao_sph,nocc])
 
-        # contraction
-        rhoj0[:,:,i0:i1] += contract('xpji,ij->xpi', int3c_blk, dm0_tag[i0:i1,j0:j1])
+        rhoj0 = contract('xpji,ij->xpi', int3c_blk, dm0_tag[i0:i1,j0:j1])
+        vj1_ao = contract('pJo,xpi->xiJo', rhok_tmp, rhoj0)
+        vj1 += 2.0*contract('xiJo,ia->axJo', vj1_ao, ao2atom[i0:i1])
+
         if with_k:
-            int3c_ip1_occ[:,:,i0:i1] += contract('xpji,jo->xpio', int3c_blk, orbo[j0:j1])
-            vk1_ao[:,i0:i1,j0:j1] += contract('xpji,poi->xijo', int3c_blk, rhok0[:,:,i0:i1])
             vk1_buf[:,i0:i1] += contract('xpji,plj->xil', int3c_blk, rhok0_slice[:,:,j0:j1])
 
-        # reduction
-        if (count+1) % ncp_ij == 0:
-            vj1_ao += contract('pjo,xpi->xijo', rhok_tmp, rhoj0)
-            vj1 += 2.0*contract('xiko,ia->axko', vj1_ao, ao2atom)
-            if with_k:
-                vk1_ao += contract('xpio,pki->xiko', int3c_ip1_occ, rhok0_slice)
-                vk1 += contract('xiko,ia->axko', vk1_ao, ao2atom)
+            vk1_ao = contract('xpji,poi->xijo', int3c_blk, rhok0[:,:,i0:i1])
+            vk1[:,:,j0:j1] += contract('xijo,ia->axjo', vk1_ao, ao2atom[i0:i1])
+
+            int3c_ip1_occ = contract('xpji,jo->xpio', int3c_blk, orbo[j0:j1])
+            vk1_ao = contract('xpio,pJi->xiJo', int3c_ip1_occ, rhok0_slice[:,:,i0:i1])
+            vk1 += contract('xiJo,ia->axJo', vk1_ao, ao2atom[i0:i1])
         count += 1
 
     return vj1_buf, vk1_buf, vj1, vk1
diff --git a/gpu4pyscf/dft/libxc.py b/gpu4pyscf/dft/libxc.py
index 65c05a77..5ed63b48 100644
--- a/gpu4pyscf/dft/libxc.py
+++ b/gpu4pyscf/dft/libxc.py
@@ -21,7 +21,7 @@
 import cupy
 from pyscf import dft
 
-libxc = np.ctypeslib.load_library(
+_libxc = np.ctypeslib.load_library(
     'libxc', os.path.abspath(os.path.join(__file__, '..', '..', 'lib', 'deps', 'lib')))
 
 def _check_arrays(current_arrays, fields, factor, required):
@@ -45,21 +45,21 @@ class _xcfun(ctypes.Structure):
     pass
 
 _xc_func_p = ctypes.POINTER(_xcfun)
-libxc.xc_func_alloc.restype = _xc_func_p
-libxc.xc_func_init.argtypes = (_xc_func_p, ctypes.c_int, ctypes.c_int)
-libxc.xc_func_end.argtypes = (_xc_func_p, )
-libxc.xc_func_free.argtypes = (_xc_func_p, )
+_libxc.xc_func_alloc.restype = _xc_func_p
+_libxc.xc_func_init.argtypes = (_xc_func_p, ctypes.c_int, ctypes.c_int)
+_libxc.xc_func_end.argtypes = (_xc_func_p, )
+_libxc.xc_func_free.argtypes = (_xc_func_p, )
 
 class XCfun:
     def __init__(self, xc, spin):
         assert spin == 'unpolarized'
         self._spin = 1
-        self.xc_func = libxc.xc_func_alloc()
+        self.xc_func = _libxc.xc_func_alloc()
         if isinstance(xc, str):
-            self.func_id = libxc.xc_functional_get_number(ctypes.c_char_p(xc.encode()))
+            self.func_id = _libxc.xc_functional_get_number(ctypes.c_char_p(xc.encode()))
         else:
             self.func_id = xc
-        ret = libxc.xc_func_init(self.xc_func, self.func_id, self._spin)
+        ret = _libxc.xc_func_init(self.xc_func, self.func_id, self._spin)
         if ret != 0:
             raise RuntimeError('failed to initialize xc fun')
         self._family = dft.libxc.xc_type(xc)
@@ -67,9 +67,10 @@ def __init__(self, xc, spin):
     def __del__(self):
         if self.xc_func is None:
             return
-        libxc.xc_func_end(self.xc_func)
-        libxc.xc_func_free(self.xc_func)
-        
+        # TODO: deallocate xc func
+        #_libxc.xc_func_end(self.xc_func)
+        #_libxc.xc_func_free(self.xc_func)
+
     def needs_laplacian(self):
         return dft.libxc.needs_laplacian(self.func_id)
 
@@ -85,7 +86,7 @@ def compute(self, inp, output=None, do_exc=True, do_vxc=True, do_fxc=False, do_k
         npoints = int(inp["rho"].size / self._spin)
         if (inp["rho"].size % self._spin):
             raise ValueError("Rho input has an invalid shape, must be divisible by %d" % self._spin)
-        
+
         # Find the right compute function
         args = [self.xc_func, ctypes.c_size_t(npoints)]
         if self._family == 'LDA':
@@ -114,7 +115,7 @@ def compute(self, inp, output=None, do_exc=True, do_vxc=True, do_fxc=False, do_k
                 if(isinstance(arg, cupy.ndarray)):
                     arg = ctypes.cast(arg.data.ptr, ctypes.c_void_p)
                 cuda_args.append(arg)
-            libxc.xc_lda(*cuda_args)
+            _libxc.xc_lda(*cuda_args)
         elif self._family == 'GGA':
             input_labels   = ["rho", "sigma"]
             input_num_args = 2
@@ -141,7 +142,7 @@ def compute(self, inp, output=None, do_exc=True, do_vxc=True, do_fxc=False, do_k
                 if(isinstance(arg, cupy.ndarray)):
                     arg = ctypes.cast(arg.data.ptr, ctypes.c_void_p)
                 cuda_args.append(arg)
-            libxc.xc_gga(*cuda_args)
+            _libxc.xc_gga(*cuda_args)
 
         elif self._family == 'MGGA':
             # Build input args
@@ -178,7 +179,7 @@ def compute(self, inp, output=None, do_exc=True, do_vxc=True, do_fxc=False, do_k
             output = _check_arrays(output, output_labels[5:15], npoints, do_fxc)
             output = _check_arrays(output, output_labels[15:35], npoints, do_kxc)
             output = _check_arrays(output, output_labels[35:70], npoints, do_lxc)
-            
+
             args.extend([   inp[x] for x in  input_labels])
             if not self.needs_laplacian():
                 args.insert(-1, cupy.empty((1)))  # Add none ptr to laplacian
@@ -189,10 +190,10 @@ def compute(self, inp, output=None, do_exc=True, do_vxc=True, do_fxc=False, do_k
                 if(isinstance(arg, cupy.ndarray)):
                     arg = ctypes.cast(arg.data.ptr, ctypes.c_void_p)
                 cuda_args.append(arg)
-            libxc.xc_mgga(*cuda_args)
+            _libxc.xc_mgga(*cuda_args)
         else:
             raise KeyError("Functional kind not recognized!")
-        
+
         return {k: v for k, v in zip(output_labels, args[2+input_num_args:]) if v is not None}
 
 
diff --git a/gpu4pyscf/dft/numint.py b/gpu4pyscf/dft/numint.py
index cc9c0e6b..6ca2a9f8 100644
--- a/gpu4pyscf/dft/numint.py
+++ b/gpu4pyscf/dft/numint.py
@@ -21,13 +21,13 @@
 import cupy
 
 from pyscf import gto, lib, dft
-from pyscf.lib import logger
 from pyscf.dft import numint
 from pyscf.gto.eval_gto import NBINS, CUTOFF, make_screen_index
 from gpu4pyscf.scf.hf import basis_seg_contraction
 from gpu4pyscf.lib.cupy_helper import contract, get_avail_mem, load_library, add_sparse, release_gpu_stack
 from gpu4pyscf.dft import xc_deriv, xc_alias, libxc
 from gpu4pyscf import __config__
+from gpu4pyscf.lib import logger
 
 LMAX_ON_GPU = 6
 BAS_ALIGNED = 4
@@ -35,6 +35,7 @@
 MIN_BLK_SIZE = getattr(__config__, 'min_grid_blksize', 64*64)
 ALIGNED = getattr(__config__, 'grid_aligned', 16*16)
 AO_THRESHOLD = 1e-12
+AO_ALIGNMENT = 32
 
 # Should we release the cupy cache?
 FREE_CUPY_CACHE = False
@@ -269,6 +270,42 @@ def eval_rho3(mol, ao, c0, mo1, non0tab=None, xctype='LDA',
         rho[tau_idx] *= .5
     return rho
 
+def eval_rho4(mol, ao, c0, mo1, non0tab=None, xctype='LDA',
+              with_lapl=True, verbose=None):
+    '''Evaluate rho on a grid block for a batch of na orbital sets (mol, non0tab, verbose are not read here).
+        Shapes: ao: nd x nao x ng (unpacked as 2-D, nao x ng, for LDA/HF); c0: nd x nocc x ng; mo1: na x nao x nocc
+        Returns rho: na x ng (LDA/HF), na x 4 x ng (GGA/NLC), na x 5 x ng (meta-GGA; raises NotImplementedError if with_lapl)
+    '''
+    xctype = xctype.upper()
+    if xctype == 'LDA' or xctype == 'HF':
+        _, ngrids = ao.shape
+    else:
+        _, ngrids = ao[0].shape
+
+    na = mo1.shape[0]
+    cpos1= mo1
+    if xctype == 'LDA' or xctype == 'HF':
+        c_0 = contract('aio,ig->aog', cpos1, ao)  # per set a: cpos1[a].T @ ao
+        rho = cupy.empty([na,ngrids])
+        for i in range(na):
+            rho[i] = _contract_rho(c0, c_0[i])
+        rho *= 2.0
+    elif xctype in ('GGA', 'NLC'):
+        c_0 = contract('nig,aio->anog', ao, cpos1)
+        rho = cupy.empty([na, 4, ngrids])
+        for i in range(na):
+            _contract_rho_gga(c0, c_0[i], rho=rho[i])
+
+    else: # meta-GGA; note the default with_lapl=True always raises in this branch
+        if with_lapl:
+            raise NotImplementedError("mGGA with lapl not implemented")
+        rho = cupy.empty((na,5,ngrids))
+        c_0 = contract('nig,aio->anog', ao, cpos1)
+        for i in range(na):
+            _contract_rho_mgga(c0, c_0[i], rho=rho[i])
+
+    return rho
+
 def _vv10nlc(rho, coords, vvrho, vvweight, vvcoords, nlc_pars):
     thresh=1e-8
 
@@ -408,15 +445,14 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
 
     for ao_mask, idx, weight, _ in ni.block_loop(mol, grids, nao, ao_deriv):
         for i in range(nset):
-            t0 = (logger.process_clock(), logger.perf_counter())
-            #rho = eval_rho(opt.mol, ao, dms[i], xctype=xctype, hermi=1)
-            #rho = _make_rho(ao, dms[i], xctype=xctype)
+            t0 = log.init_timer()
             if mo_coeff is None:
                 rho = eval_rho(mol, ao_mask, dms[i][np.ix_(idx,idx)], xctype=xctype, hermi=1)
             else:
                 mo_coeff_mask = mo_coeff[idx,:]
                 rho = eval_rho2(mol, ao_mask, mo_coeff_mask, mo_occ, None, xctype)
 
+            t1 = log.timer_debug1('eval rho', *t0)
             exc, vxc = ni.eval_xc_eff(xc_code, rho, deriv=1, xctype=xctype)[:2]
             vxc = cupy.asarray(vxc, order='C')
             exc = cupy.asarray(exc, order='C')
@@ -662,8 +698,8 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi=
         ao_deriv = 1
     p0 = 0
     p1 = 0
-    t0 = (logger.process_clock(), logger.perf_counter())
     for ao, mask, weights, coords in ni.block_loop(opt.mol, grids, nao, ao_deriv):
+        t0 = log.init_timer()
         p0, p1 = p1, p1+len(weights)
         # precompute molecular orbitals
         if with_mocc:
@@ -671,44 +707,51 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi=
             if xctype == 'LDA':
                 c0 = _dot_ao_dm(mol, ao, occ_coeff_mask, None, None, None)
             elif xctype == "GGA":
-                c0 = cupy.empty([4,occ_coeff.shape[1],p1-p0])
-                for i in range(4):
-                    c0[i] = _dot_ao_dm(mol, ao[i], occ_coeff_mask, None, None, None)
+                c0 = contract('nig,io->nog', ao, occ_coeff_mask)
             else: # mgga
-                c0 = cupy.empty([4,occ_coeff.shape[1],p1-p0])
-                for i in range(4):
-                    c0[i] = _dot_ao_dm(mol, ao[i], occ_coeff_mask, None, None, None)
+                c0 = contract('nig,io->nog', ao, occ_coeff_mask)
+
+        if with_mocc:
+            rho1 = eval_rho4(opt.mol, ao, c0, mo1[:,mask], xctype=xctype, with_lapl=False)
+        else:
+            # slow version
+            rho1 = []
+            for i in range(nset):
+                rho_tmp = eval_rho(opt.mol, ao, dms[i][np.ix_(mask,mask)], xctype=xctype, hermi=hermi, with_lapl=False)
+                rho1.append(rho_tmp)
+            rho1 = cupy.stack(rho1, axis=0)
+        t0 = log.timer_debug1('rho', *t0)
+
         # precompute fxc_w
         if xctype == 'LDA':
             fxc_w = fxc[0,0,p0:p1] * weights
+            wv = rho1 * fxc_w
         else:
             fxc_w = fxc[:,:,p0:p1] * weights
-        # loop perturbed molecular orbitals
-        for i in range(nset):
-            if with_mocc:
-                rho1 = eval_rho3(opt.mol, ao, c0, mo1[i][mask], xctype=xctype, with_lapl=False)
-            else:
-                rho1 = eval_rho(opt.mol, ao, dms[i][np.ix_(mask,mask)], xctype=xctype, hermi=hermi, with_lapl=False)
+            wv = contract('axg,xyg->ayg', rho1, fxc_w)
 
+
+        for i in range(nset):
             if xctype == 'LDA':
-                wv = rho1 * fxc_w
-                vmat_tmp = ao.dot(_scale_ao(ao, wv).T)
+                vmat_tmp = ao.dot(_scale_ao(ao, wv[i]).T)
                 add_sparse(vmat[i], vmat_tmp, mask)
             elif xctype == 'GGA':
-                wv = cupy.einsum('xg,xyg->yg', rho1, fxc_w)
-                wv[0] *= .5
-                vmat_tmp = ao[0].dot(_scale_ao(ao, wv).T)
+                wv[i,0] *= .5
+                aow = _scale_ao(ao, wv[i])
+                vmat_tmp = aow.dot(ao[0].T)
                 add_sparse(vmat[i], vmat_tmp, mask)
             elif xctype == 'NLC':
                 raise NotImplementedError('NLC')
             else:
-                wv = cupy.einsum('xg,xyg->yg', rho1, fxc_w)
-                wv[[0, 4]] *= .5
-                vmat_tmp = ao[0].dot(_scale_ao(ao[:4], wv[:4]).T)
-                vmat_tmp+= _tau_dot(ao, ao, wv[4])
+                wv[i,0] *= .5
+                wv[i,4] *= .5
+                vmat_tmp = ao[0].dot(_scale_ao(ao[:4], wv[i,:4]).T)
+                vmat_tmp+= _tau_dot(ao, ao, wv[i,4])
                 add_sparse(vmat[i], vmat_tmp, mask)
+
         t0 = log.timer_debug1('vxc', *t0)
         ao = c0 = rho1 = None
+
     vmat = contract('pi,npq->niq', coeff, vmat)
     vmat = contract('qj,niq->nij', coeff, vmat)
     if xctype != 'LDA':
@@ -723,6 +766,7 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi=
 
     return cupy.asarray(vmat)
 
+
 def nr_rks_fxc_st(ni, mol, grids, xc_code, dm0=None, dms_alpha=None,
                   relativity=0, singlet=True, rho0=None, vxc=None, fxc=None,
                   max_memory=2000, verbose=None):
@@ -851,7 +895,7 @@ def nr_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
         2D array of shape (nao,nao) where nao is the number of AO functions.
     '''
     log = logger.new_logger(mol, verbose)
-    t0 = (logger.process_clock(), logger.perf_counter())
+    t0 = log.init_timer()
     opt = getattr(ni, 'gdftopt', None)
     if opt is None:
         ni.build(mol, grids.coords)
@@ -1040,6 +1084,7 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000,
     '''
     Define this macro to loop over grids by blocks.
     Sparsity is not implemented yet
+    sorted_ao: by default ao_value is sorted for GPU
     '''
     if grids.coords is None:
         grids.build(with_non0tab=True)
@@ -1050,7 +1095,7 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000,
     log = logger.new_logger(mol, mol.verbose)
 
     if blksize is None:
-        cupy.get_default_memory_pool().free_all_blocks()
+        #cupy.get_default_memory_pool().free_all_blocks()
         mem_avail = get_avail_mem()
         blksize = int((mem_avail*.2/8/((comp+1)*nao + extra))/ ALIGNED) * ALIGNED
         blksize = min(blksize, MIN_BLK_SIZE)
@@ -1070,23 +1115,31 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000,
             coords = grids.coords[ip0:ip1]
             weight = grids.weights[ip0:ip1]
             #sindex = ni.screen_index[ip0//GRID_BLKSIZE:]
-            t0 = (logger.process_clock(), logger.perf_counter())
+            t0 = log.init_timer()
             ao = eval_ao(ni, mol, coords, deriv)
-            log.timer_debug1('eval ao', *t0)
+            t0 = log.timer_debug1('eval ao', *t0)
 
             # cache ao indices
             if (deriv, block_id, blksize, ngrids) not in ni.non0ao_idx:
-                t0 = (logger.process_clock(), logger.perf_counter())
+                t0 = log.init_timer()
                 if deriv == 0:
                     mask = cupy.any(cupy.abs(ao) > AO_THRESHOLD, axis=[1])
-                    idx = cupy.argwhere(mask).astype(np.int32)[:,0]
+                    all_idx = cupy.arange(ao.shape[0], dtype=np.int32)
+                    idx = all_idx[mask]
+                    pad = (len(idx) + AO_ALIGNMENT - 1) // AO_ALIGNMENT * AO_ALIGNMENT - len(idx)
+                    zero_idx = all_idx[~mask][:pad]
+                    idx = cupy.hstack([idx, zero_idx])
                     ao_mask = ao[idx,:]
                 else:
                     mask = cupy.any(cupy.abs(ao) > AO_THRESHOLD, axis=[0,2])
-                    idx = cupy.argwhere(mask).astype(np.int32)[:,0]
+                    all_idx = cupy.arange(ao.shape[1], dtype=np.int32)
+                    idx = all_idx[mask]
+                    pad = (len(idx) + AO_ALIGNMENT - 1) // AO_ALIGNMENT * AO_ALIGNMENT - len(idx)
+                    zero_idx = all_idx[~mask][:pad]
+                    idx = cupy.hstack([idx, zero_idx])
                     ao_mask = ao[:,idx,:]
                 ni.non0ao_idx[deriv, block_id, blksize, ngrids] = idx
-                log.timer_debug1('initialize ao sparsity', *t0)
+                log.timer_debug1('init ao sparsity', *t0)
             else:
                 idx = ni.non0ao_idx[deriv, block_id, blksize, ngrids]
                 if deriv == 0:
@@ -1094,7 +1147,7 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000,
                 else:
                     ao_mask = ao[:,idx,:]
             block_id += 1
-            log.timer_debug1('eval rho', *t0)
+            log.timer_debug1('extract sparse ao', *t0)
             yield ao_mask, idx, weight, coords
 
 class NumInt(numint.NumInt):
@@ -1185,6 +1238,63 @@ def _contract_rho(bra, ket, rho=None):
         rho = cupy.einsum('ig,ig->g', bra, ket)
     return rho
 
+def _contract_rho1(bra, ket, rho=None):
+    ''' xip,ip->xp : one libgdft.GDFTcontract_rho call per leading slice bra[i], writing into rho[i].
+    A 2-D bra is promoted to nvar=1; rho is allocated as nvar x ngrids when None; RuntimeError on nonzero status. '''
+    if bra.ndim == 2:
+        bra = cupy.expand_dims(bra, axis=0)
+    nvar, nao, ngrids = bra.shape
+    if rho is None:
+        rho = cupy.empty([nvar, ngrids])
+
+    for i in range(nvar):
+        stream = cupy.cuda.get_current_stream()
+        err = libgdft.GDFTcontract_rho(
+            ctypes.cast(stream.ptr, ctypes.c_void_p),
+            ctypes.cast(rho[i].data.ptr, ctypes.c_void_p),
+            ctypes.cast(bra[i].data.ptr, ctypes.c_void_p),
+            ctypes.cast(ket.data.ptr, ctypes.c_void_p),
+            ctypes.c_int(ngrids), ctypes.c_int(nao))
+        if err != 0:
+            raise RuntimeError('CUDA Error')
+    return rho
+
+def _contract_rho_gga(bra, ket, rho=None):
+    ''' ig,nig->ng via libgdft.GDFTcontract_rho_gga; fills (and returns) rho, allocated as 4 x ngrids when None.
+    NOTE(review): the label shows a 2-D first operand, but bra is unpacked as 3-D (4 x nao x ngrids) - confirm. '''
+    n, nao, ngrids = bra.shape
+    assert n == 4
+    if rho is None:
+        rho = cupy.empty([4,ngrids])
+    stream = cupy.cuda.get_current_stream()
+    err = libgdft.GDFTcontract_rho_gga(
+        ctypes.cast(stream.ptr, ctypes.c_void_p),
+        ctypes.cast(rho.data.ptr, ctypes.c_void_p),
+        ctypes.cast(bra.data.ptr, ctypes.c_void_p),
+        ctypes.cast(ket.data.ptr, ctypes.c_void_p),
+        ctypes.c_int(ngrids), ctypes.c_int(nao))
+    if err != 0:
+        raise RuntimeError('CUDA Error')
+    return rho
+
+def _contract_rho_mgga(bra, ket, rho=None):
+    ''' nig,nig->ng via libgdft.GDFTcontract_rho_mgga; fills (and returns) rho, allocated as 5 x ngrids when None.
+    NOTE(review): bra is asserted to have 4 components while rho has 5 rows - presumably row 4 is tau; confirm. '''
+    n, nao, ngrids = bra.shape
+    assert n == 4
+    if rho is None:
+        rho = cupy.empty([5,ngrids])
+    stream = cupy.cuda.get_current_stream()
+    err = libgdft.GDFTcontract_rho_mgga(
+        ctypes.cast(stream.ptr, ctypes.c_void_p),
+        ctypes.cast(rho.data.ptr, ctypes.c_void_p),
+        ctypes.cast(bra.data.ptr, ctypes.c_void_p),
+        ctypes.cast(ket.data.ptr, ctypes.c_void_p),
+        ctypes.c_int(ngrids), ctypes.c_int(nao))
+    if err != 0:
+        raise RuntimeError('CUDA Error')
+    return rho
+
 def _dot_ao_dm(mol, ao, dm, non0tab, shls_slice, ao_loc, out=None):
     return cupy.dot(dm.T, ao)
 
@@ -1272,7 +1382,7 @@ def _scale_ao(ao, wv, out=None):
         assert wv.size == ngrids
     else:
         if ao[0].flags.f_contiguous:
-            return contract('nip,np->ip', ao, wv)
+            return cupy.einsum('nip,np->ip', ao, wv)
         nvar, nao, ngrids = ao.shape
         assert wv.shape == (nvar, ngrids)
 
@@ -1365,6 +1475,8 @@ def build(self, mol=None):
             coeff = np.vstack([coeff, np.zeros((paddings, coeff.shape[1]))])
         pmol._decontracted = True
         self.mol = pmol
+        inv_idx = np.argsort(ao_idx, kind='stable').astype(np.int32)
+        self.rev_ao_idx = cupy.asarray(inv_idx)
         self.coeff = coeff[ao_idx]
         self.l_ctr_offsets = np.append(0, np.cumsum(l_ctr_counts)).astype(np.int32)
         self.l_bas_offsets = np.append(0, np.cumsum(l_counts)).astype(np.int32)
diff --git a/gpu4pyscf/dft/rks.py b/gpu4pyscf/dft/rks.py
index ea12f511..b4fd72b0 100644
--- a/gpu4pyscf/dft/rks.py
+++ b/gpu4pyscf/dft/rks.py
@@ -68,7 +68,7 @@ def initialize_grids(ks, mol=None, dm=None):
     # Initialize self.grids the first time call get_veff
     if mol is None: mol = ks.mol
     if ks.grids.coords is None:
-        t0 = (logger.process_clock(), logger.perf_counter())
+        t0 = logger.init_timer(ks)
         ks.grids.build()
         #ks.grids.build(with_non0tab=True)
         ks.grids.weights = cupy.asarray(ks.grids.weights)
@@ -82,7 +82,7 @@ def initialize_grids(ks, mol=None, dm=None):
         is_nlc = ks.nlc or ks._numint.libxc.is_nlc(ks.xc)
         if is_nlc and ks.nlcgrids.coords is None:
             if ks.nlcgrids.coords is None:
-                t0 = (logger.process_clock(), logger.perf_counter())
+                t0 = logger.init_timer(ks)
                 #ks.nlcgrids.build(with_non0tab=True)
                 ks.nlcgrids.build()
                 ks.nlcgrids.weights = cupy.asarray(ks.nlcgrids.weights)
@@ -124,7 +124,7 @@ def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
 
     if mol is None: mol = ks.mol
     if dm is None: dm = ks.make_rdm1()
-    t0 = (logger.process_clock(), logger.perf_counter())
+    t0 = logger.init_timer(ks)
     if ks.grids.coords is None:
         ks.grids.ao_values = None
     initialize_grids(ks, mol, dm)
diff --git a/gpu4pyscf/grad/rhf.py b/gpu4pyscf/grad/rhf.py
index 59a3aeac..16a3b53e 100644
--- a/gpu4pyscf/grad/rhf.py
+++ b/gpu4pyscf/grad/rhf.py
@@ -19,12 +19,12 @@
 import cupy
 import numpy
 from pyscf import lib, gto
-from pyscf.lib import logger
 from pyscf.grad import rhf
 from gpu4pyscf.lib.cupy_helper import load_library
 from gpu4pyscf.scf.hf import _VHFOpt
-from gpu4pyscf.lib.cupy_helper import tag_array
+from gpu4pyscf.lib.cupy_helper import tag_array, contract
 from gpu4pyscf.df import int3c2e      #TODO: move int3c2e to out of df
+from gpu4pyscf.lib import logger
 
 LMAX_ON_GPU = 3
 FREE_CUPY_CACHE = True
@@ -255,8 +255,8 @@ def get_jk(mol, dm, hermi=1, vhfopt=None, with_j=True, with_k=True, omega=None,
     if atmlst is None:
         atmlst = range(mol.natm)
 
-    cput0 = (logger.process_clock(), logger.perf_counter())
     log = logger.new_logger(mol, verbose)
+    cput0 = log.init_timer()
     if hermi != 1:
         raise NotImplementedError('JK-builder only supports hermitian density matrix')
     if omega is None:
@@ -328,7 +328,7 @@ def get_jk(mol, dm, hermi=1, vhfopt=None, with_j=True, with_k=True, omega=None,
     dm_shl = cupy.asarray(np.log(dm_shl))
     nshls = dm_shl.shape[0]
     t0 = time.perf_counter()
-    
+
     if hermi != 1:
         dm_ctr_cond = (dm_ctr_cond + dm_ctr_cond.T) * .5
     fn = libgvhf.GINTget_veff_ip1
@@ -347,7 +347,7 @@ def get_jk(mol, dm, hermi=1, vhfopt=None, with_j=True, with_k=True, omega=None,
             ll = vhfopt.uniq_l_ctr[cpl,0]
             if lk > LMAX_ON_GPU or ll > LMAX_ON_GPU or log_q_kl.size == 0:
                 continue
-            
+
             # TODO: determine cutoff based on the relevant maximum value of dm blocks?
             sub_dm_cond = max(dm_ctr_cond[cpi,cpj], dm_ctr_cond[cpk,cpl],
                               dm_ctr_cond[cpi,cpk], dm_ctr_cond[cpj,cpk],
@@ -416,8 +416,6 @@ def get_jk(mol, dm, hermi=1, vhfopt=None, with_j=True, with_k=True, omega=None,
         coeff = dms = None
         cupy.get_default_memory_pool().free_all_blocks()
 
-    #if vj is not None: vj_per_atom = vj_per_atom.T
-    #if vk is not None: vk_per_atom = vk_per_atom.T
     if out_cupy:
         return vj_per_atom, vk_per_atom
     else:
@@ -427,8 +425,8 @@ def get_jk(mol, dm, hermi=1, vhfopt=None, with_j=True, with_k=True, omega=None,
 def _get_jk(gradient_object, mol=None, dm=None, hermi=1, with_j=True, with_k=True,
             omega=None):
     mf = gradient_object.base
-    cput0 = (logger.process_clock(), logger.perf_counter())
     log = logger.new_logger(gradient_object)
+    cput0 = log.init_timer()
     log.debug3('apply get_grad_jk on gpu')
     if hasattr(mf, '_opt_gpu'):
         vhfopt = mf._opt_gpu
@@ -457,7 +455,7 @@ def get_dh1e_ecp(mol, dm):
     for ia in ecp_atoms:
         with mol.with_rinv_at_nucleus(ia):
             ecp = mol.intor('ECPscalar_iprinv', comp=3)
-            dh1e_ecp[ia] = cupy.einsum('xij,ij->x', ecp, dm)
+            dh1e_ecp[ia] = contract('xij,ij->x', cupy.asarray(ecp), dm)
     return 2.0 * dh1e_ecp
 
 def grad_nuc(mf_grad, atmlst=None):
@@ -489,11 +487,11 @@ def grad_elec(mf_grad, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None):
         atmlst = range(mol.natm)
     aoslices = mol.aoslice_by_atom()
 
-    t0 = (logger.process_clock(), logger.perf_counter())
     if mo_energy is None: mo_energy = mf.mo_energy
     if mo_occ is None:    mo_occ = mf.mo_occ
     if mo_coeff is None:  mo_coeff = mf.mo_coeff
     log = logger.Logger(mf_grad.stdout, mf_grad.verbose)
+    t0 = log.init_timer()
 
     mo_energy = cupy.asarray(mo_energy)
     mo_occ = cupy.asarray(mo_occ)
@@ -515,9 +513,9 @@ def calculate_h1e(h1_gpu, s1_gpu):
     with lib.call_in_background(calculate_h1e) as calculate_hs:
         calculate_hs(h1, s1)
         # (i | \nabla hcore | j)
-        t3 = log.timer_debug1("get_dh1e", *t0)
+        t3 = log.init_timer()
         dh1e = int3c2e.get_dh1e(mol, dm0)
-        
+
         t4 = log.timer_debug1("get_dh1e", *t3)
         if mol.has_ecp():
             dh1e += get_dh1e_ecp(mol, dm0)
@@ -527,20 +525,20 @@ def calculate_h1e(h1_gpu, s1_gpu):
         log.debug('Computing Gradients of NR-HF Coulomb repulsion')
 
         dm0 = tag_array(dm0, mo_coeff=mo_coeff, mo_occ=mo_occ)
-        
+
         extra_force = cupy.zeros((len(atmlst),3))
         for k, ia in enumerate(atmlst):
             extra_force[k] += mf_grad.extra_force(ia, locals())
-        
+
         t2 = log.timer_debug1('gradients of 2e part', *t1)
-        
-    dh = cupy.einsum('xij,ij->xi', h1, dm0)
-    ds = cupy.einsum('xij,ij->xi', s1, dme0)
+
+    dh = contract('xij,ij->xi', h1, dm0)
+    ds = contract('xij,ij->xi', s1, dme0)
     delec = 2.0*(dh - ds)
-    
+
     delec = cupy.asarray([cupy.sum(delec[:, p0:p1], axis=1) for p0, p1 in aoslices[:,2:]])
     de = 2.0 * dvhf + dh1e + delec + extra_force
-    
+
     if(hasattr(mf, 'disp') and mf.disp is not None):
         g_disp = mf_grad.get_dispersion()
         mf_grad.grad_disp = g_disp
@@ -565,13 +563,13 @@ class Gradients(rhf.Gradients):
     def get_j(self, mol=None, dm=None, hermi=0, omega=None):
         vj, _ = self.get_jk(mol, dm, with_k=False, omega=omega)
         return vj
-    
+
     def get_k(self, mol=None, dm=None, hermi=0, omega=None):
         _, vk = self.get_jk(mol, dm, with_j=False, omega=omega)
         return vk
 
     def extra_force(self, atom_id, envs):
-        ''' 
+        '''
         grid response is implemented get_veff
         '''
         return 0
\ No newline at end of file
diff --git a/gpu4pyscf/grad/rks.py b/gpu4pyscf/grad/rks.py
index f22e8184..316478cc 100644
--- a/gpu4pyscf/grad/rks.py
+++ b/gpu4pyscf/grad/rks.py
@@ -17,6 +17,7 @@
 # Modified by Xiaojie Wu <wxj6000@gmail.com>
 
 '''Non-relativistic RKS analytical nuclear gradients'''
+import ctypes
 import numpy
 import cupy
 import pyscf
@@ -27,12 +28,15 @@
 from gpu4pyscf.grad import rhf as rhf_grad
 from gpu4pyscf.dft import numint, xc_deriv, rks
 from gpu4pyscf.dft.numint import _GDFTOpt, AO_THRESHOLD
-from gpu4pyscf.lib.cupy_helper import contract, get_avail_mem, add_sparse, tag_array
+from gpu4pyscf.lib.cupy_helper import contract, get_avail_mem, add_sparse, tag_array, load_library
 from pyscf import __config__
 
 MIN_BLK_SIZE = getattr(__config__, 'min_grid_blksize', 128*128)
 ALIGNED = getattr(__config__, 'grid_aligned', 16*16)
 
+libgdft = load_library('libgdft')
+libgdft.GDFT_make_dR_dao_w.restype = ctypes.c_int
+
 def _get_veff(ks_grad, mol=None, dm=None):
     '''
     First order derivative of DFT effective potential matrix (wrt electron coordinates)
@@ -124,7 +128,6 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
 
     nset = len(dms)
     assert nset == 1
-
     if xctype == 'LDA':
         ao_deriv = 1
     else:
@@ -141,13 +144,7 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
                 wv = weight * vxc[0]
                 aow = numint._scale_ao(ao_mask[0], wv)
                 vtmp = _d1_dot_(ao_mask[1:4], aow.T)
-                #idx = cupy.ix_(mask, mask)
-                #vmat[idm][0][idx] += vtmp[0]
-                #vmat[idm][1][idx] += vtmp[1]
-                #vmat[idm][2][idx] += vtmp[2]
-                add_sparse(vmat[idm][0], vtmp[0], idx)
-                add_sparse(vmat[idm][1], vtmp[1], idx)
-                add_sparse(vmat[idm][2], vtmp[2], idx)
+                add_sparse(vmat[idm], vtmp, idx)
     elif xctype == 'GGA':
         ao_deriv = 2
         for ao_mask, idx, weight, _ in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory):
@@ -158,13 +155,7 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
                 wv = weight * vxc
                 wv[0] *= .5
                 vtmp = _gga_grad_sum_(ao_mask, wv)
-                #idx = cupy.ix_(mask, mask)
-                #vmat[idm][0][idx] += vtmp[0]
-                #vmat[idm][1][idx] += vtmp[1]
-                #vmat[idm][2][idx] += vtmp[2]
-                add_sparse(vmat[idm][0], vtmp[0], idx)
-                add_sparse(vmat[idm][1], vtmp[1], idx)
-                add_sparse(vmat[idm][2], vtmp[2], idx)
+                add_sparse(vmat[idm], vtmp, idx)
     elif xctype == 'NLC':
         raise NotImplementedError('NLC')
 
@@ -180,14 +171,7 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
                 wv[4] *= .5  # for the factor 1/2 in tau
                 vtmp = _gga_grad_sum_(ao_mask, wv)
                 vtmp += _tau_grad_dot_(ao_mask, wv[4])
-                #idx = cupy.ix_(mask, mask)
-                #vmat[idm][0][idx] += vtmp[0]
-                #vmat[idm][1][idx] += vtmp[1]
-                #vmat[idm][2][idx] += vtmp[2]
-                add_sparse(vmat[idm][0], vtmp[0], idx)
-                add_sparse(vmat[idm][1], vtmp[1], idx)
-                add_sparse(vmat[idm][2], vtmp[2], idx)
-
+                add_sparse(vmat[idm], vtmp, idx)
     vmat = [cupy.einsum('pi,npq,qj->nij', coeff, v, coeff) for v in vmat]
     exc = None
     if nset == 1:
@@ -243,9 +227,7 @@ def get_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
         wv = vv_vxc[:,p0:p1] * weight
         wv[0] *= .5  # *.5 because vmat + vmat.T at the end
         vmat_tmp = _gga_grad_sum_(ao_mask, wv)
-        add_sparse(vmat[0], vmat_tmp[0], mask)
-        add_sparse(vmat[1], vmat_tmp[1], mask)
-        add_sparse(vmat[2], vmat_tmp[2], mask)
+        add_sparse(vmat, vmat_tmp, mask)
 
     vmat = contract('npq,qj->npj', vmat, coeff)
     vmat = contract('pi,npj->nij', coeff, vmat)
@@ -255,6 +237,7 @@ def get_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
 
 def _make_dR_dao_w(ao, wv):
     #:aow = numpy.einsum('npi,p->npi', ao[1:4], wv[0])
+    '''
     aow = [
         numint._scale_ao(ao[1], wv[0]),  # dX nabla_x
         numint._scale_ao(ao[2], wv[0]),  # dX nabla_y
@@ -272,13 +255,34 @@ def _make_dR_dao_w(ao, wv):
     aow[2] += numint._scale_ao(ao[6], wv[1])  # dZ nabla_x
     aow[2] += numint._scale_ao(ao[8], wv[2])  # dZ nabla_y
     aow[2] += numint._scale_ao(ao[9], wv[3])  # dZ nabla_z
+    '''
+    assert ao.flags.c_contiguous
+    assert wv.flags.c_contiguous
+
+    _, nao, ngrids = ao.shape
+    aow = cupy.empty([3,nao,ngrids])
+    stream = cupy.cuda.get_current_stream()
+    err = libgdft.GDFT_make_dR_dao_w(
+        ctypes.cast(stream.ptr, ctypes.c_void_p),
+        ctypes.cast(aow.data.ptr, ctypes.c_void_p),
+        ctypes.cast(ao.data.ptr, ctypes.c_void_p),
+        ctypes.cast(wv.data.ptr, ctypes.c_void_p),
+        ctypes.c_int(ngrids), ctypes.c_int(nao))
+    if err != 0:
+        raise RuntimeError('CUDA Error')
     return aow
 
-def _d1_dot_(ao1, ao2):
-    vmat0 = cupy.dot(ao1[0], ao2)
-    vmat1 = cupy.dot(ao1[1], ao2)
-    vmat2 = cupy.dot(ao1[2], ao2)
-    return cupy.stack([vmat0,vmat1,vmat2])
+def _d1_dot_(ao1, ao2, out=None):
+    '''
+    Multiply each of ao1[0], ao1[1], ao1[2] with ao2 via cupy.dot.
+    Without `out` the three products are returned stacked into a new array;
+    otherwise they are written into out[0], out[1], out[2] and `out` is returned.
+    '''
+    if out is not None:
+        for i in range(3):
+            cupy.dot(ao1[i], ao2, out=out[i])
+        return out
+    return cupy.stack([cupy.dot(ao1[i], ao2) for i in range(3)])
 
 def _gga_grad_sum_(ao, wv):
     #:aow = numpy.einsum('npi,np->pi', ao[:4], wv[:4])
diff --git a/gpu4pyscf/hessian/rks.py b/gpu4pyscf/hessian/rks.py
index d3898d42..82269257 100644
--- a/gpu4pyscf/hessian/rks.py
+++ b/gpu4pyscf/hessian/rks.py
@@ -309,35 +309,35 @@ def _make_dR_rho1(ao, ao_dm0, atm_id, aoslices, xctype):
         ao_dm0_y = ao_dm0[2][p0:p1]
         ao_dm0_z = ao_dm0[3][p0:p1]
         # (d_X \nabla mu) dot \nalba nu DM_{mu,nu}
-        rho1[0,4] += cupy.einsum('ip,ip->p', ao[XX,p0:p1], ao_dm0_x)
-        rho1[0,4] += cupy.einsum('ip,ip->p', ao[XY,p0:p1], ao_dm0_y)
-        rho1[0,4] += cupy.einsum('ip,ip->p', ao[XZ,p0:p1], ao_dm0_z)
-        rho1[1,4] += cupy.einsum('ip,ip->p', ao[YX,p0:p1], ao_dm0_x)
-        rho1[1,4] += cupy.einsum('ip,ip->p', ao[YY,p0:p1], ao_dm0_y)
-        rho1[1,4] += cupy.einsum('ip,ip->p', ao[YZ,p0:p1], ao_dm0_z)
-        rho1[2,4] += cupy.einsum('ip,ip->p', ao[ZX,p0:p1], ao_dm0_x)
-        rho1[2,4] += cupy.einsum('ip,ip->p', ao[ZY,p0:p1], ao_dm0_y)
-        rho1[2,4] += cupy.einsum('ip,ip->p', ao[ZZ,p0:p1], ao_dm0_z)
+        rho1[0,4] += numint._contract_rho(ao[XX,p0:p1], ao_dm0_x)
+        rho1[0,4] += numint._contract_rho(ao[XY,p0:p1], ao_dm0_y)
+        rho1[0,4] += numint._contract_rho(ao[XZ,p0:p1], ao_dm0_z)
+        rho1[1,4] += numint._contract_rho(ao[YX,p0:p1], ao_dm0_x)
+        rho1[1,4] += numint._contract_rho(ao[YY,p0:p1], ao_dm0_y)
+        rho1[1,4] += numint._contract_rho(ao[YZ,p0:p1], ao_dm0_z)
+        rho1[2,4] += numint._contract_rho(ao[ZX,p0:p1], ao_dm0_x)
+        rho1[2,4] += numint._contract_rho(ao[ZY,p0:p1], ao_dm0_y)
+        rho1[2,4] += numint._contract_rho(ao[ZZ,p0:p1], ao_dm0_z)
         rho1[:,4] *= .5
     else:
         raise RuntimeError
 
     ao_dm0_0 = ao_dm0[0][p0:p1]
     # (d_X \nabla_x mu) nu DM_{mu,nu}
-    rho1[:,0] = cupy.einsum('xip,ip->xp', ao[1:4,p0:p1], ao_dm0_0)
-    rho1[0,1]+= cupy.einsum('ip,ip->p', ao[XX,p0:p1], ao_dm0_0)
-    rho1[0,2]+= cupy.einsum('ip,ip->p', ao[XY,p0:p1], ao_dm0_0)
-    rho1[0,3]+= cupy.einsum('ip,ip->p', ao[XZ,p0:p1], ao_dm0_0)
-    rho1[1,1]+= cupy.einsum('ip,ip->p', ao[YX,p0:p1], ao_dm0_0)
-    rho1[1,2]+= cupy.einsum('ip,ip->p', ao[YY,p0:p1], ao_dm0_0)
-    rho1[1,3]+= cupy.einsum('ip,ip->p', ao[YZ,p0:p1], ao_dm0_0)
-    rho1[2,1]+= cupy.einsum('ip,ip->p', ao[ZX,p0:p1], ao_dm0_0)
-    rho1[2,2]+= cupy.einsum('ip,ip->p', ao[ZY,p0:p1], ao_dm0_0)
-    rho1[2,3]+= cupy.einsum('ip,ip->p', ao[ZZ,p0:p1], ao_dm0_0)
+    rho1[:,0] = numint._contract_rho1(ao[1:4,p0:p1], ao_dm0_0)
+    rho1[0,1]+= numint._contract_rho(ao[XX,p0:p1], ao_dm0_0)
+    rho1[0,2]+= numint._contract_rho(ao[XY,p0:p1], ao_dm0_0)
+    rho1[0,3]+= numint._contract_rho(ao[XZ,p0:p1], ao_dm0_0)
+    rho1[1,1]+= numint._contract_rho(ao[YX,p0:p1], ao_dm0_0)
+    rho1[1,2]+= numint._contract_rho(ao[YY,p0:p1], ao_dm0_0)
+    rho1[1,3]+= numint._contract_rho(ao[YZ,p0:p1], ao_dm0_0)
+    rho1[2,1]+= numint._contract_rho(ao[ZX,p0:p1], ao_dm0_0)
+    rho1[2,2]+= numint._contract_rho(ao[ZY,p0:p1], ao_dm0_0)
+    rho1[2,3]+= numint._contract_rho(ao[ZZ,p0:p1], ao_dm0_0)
     # (d_X mu) (\nabla_x nu) DM_{mu,nu}
-    rho1[:,1] += cupy.einsum('xip,ip->xp', ao[1:4,p0:p1], ao_dm0[1][p0:p1])
-    rho1[:,2] += cupy.einsum('xip,ip->xp', ao[1:4,p0:p1], ao_dm0[2][p0:p1])
-    rho1[:,3] += cupy.einsum('xip,ip->xp', ao[1:4,p0:p1], ao_dm0[3][p0:p1])
+    rho1[:,1] += numint._contract_rho1(ao[1:4,p0:p1], ao_dm0[1][p0:p1])
+    rho1[:,2] += numint._contract_rho1(ao[1:4,p0:p1], ao_dm0[2][p0:p1])
+    rho1[:,3] += numint._contract_rho1(ao[1:4,p0:p1], ao_dm0[3][p0:p1])
 
     # *2 for |mu> DM <d_X nu|
     return rho1 * 2
@@ -358,6 +358,7 @@ def _d1d2_dot_(vmat, mol, ao1, ao2, mask, ao_loc, dR1_on_bra=True):
 def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
     mol = hessobj.mol
     mf = hessobj.base
+    log = logger.new_logger(mol, mol.verbose)
     if hessobj.grids is not None:
         grids = hessobj.grids
     else:
@@ -380,7 +381,6 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
     if opt is None:
         raise RuntimeError("DFT Options are not initialized")
     coeff = cupy.asarray(opt.coeff)
-
     dm0 = mf.make_rdm1(mo_coeff, mo_occ)
 
     ## transform object in sorted AO
@@ -392,11 +392,15 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
     ipip = cupy.zeros((3,3,nao,nao))
     if xctype == 'LDA':
         ao_deriv = 1
-        for ao, mask, weight, coords \
+        for ao_mask, mask, weight, coords \
                 in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory):
-            ao = contract('nip,ij->njp', ao, coeff[mask])
+            t0 = log.init_timer()
+            nao_non0 = len(mask)
+            ao = contract('nip,ij->njp', ao_mask, coeff[mask])
             rho = numint.eval_rho2(opt.mol, ao[0], mo_coeff, mo_occ, mask, xctype)
+            t0 = log.timer_debug1('eval rho', *t0)
             vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3]
+            t0 = log.timer_debug1('eval vxc', *t0)
             wv = weight * vxc[0]
             aow = [numint._scale_ao(ao[i], wv) for i in range(1, 4)]
             _d1d2_dot_(ipip, mol, aow, ao[1:4], mask, ao_loc, False)
@@ -409,10 +413,14 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
                 rho1 = contract('xig,ig->xg', ao[1:,p0:p1,:], ao_dm0[p0:p1,:]) * 2
                 # aow ~ rho1 ~ d/dR1
                 wv = wf * rho1
-                aow = [numint._scale_ao(ao[0], wv[i]) for i in range(3)]
-                _d1d2_dot_(vmat[ia], mol, ao[1:4], aow, mask, ao_loc, False)
+                vmat_tmp = cupy.zeros([3,3,nao_non0,nao_non0])
+                aow = [numint._scale_ao(ao_mask[0], wv[i]) for i in range(3)]
+                _d1d2_dot_(vmat_tmp, mol, ao_mask[1:4], aow, mask, ao_loc, False)
+                vmat_tmp = contract('pi,xypq->xyiq', coeff[mask], vmat_tmp)
+                vmat_tmp = contract('qj,xyiq->xyij', coeff[mask], vmat_tmp)
+                vmat[ia] += vmat_tmp
             ao_dm0 = aow = None
-
+            t0 = log.timer_debug1('integration', *t0)
         for ia in range(mol.natm):
             p0, p1 = aoslices[ia][2:]
             vmat[ia,:,:,:,p0:p1] += ipip[:,:,:,p0:p1]
@@ -420,29 +428,44 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
     elif xctype == 'GGA':
         ao_deriv = 2
         comp = (ao_deriv+1)*(ao_deriv+2)*(ao_deriv+3)//6
-        for ao, mask, weight, coords \
+        for ao_mask, mask, weight, coords \
                 in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory, extra=5*comp*nao):
-            # TODO: improve efficiency
-            ao = contract('nip,ij->njp', ao, coeff[mask])
+            t0 = log.init_timer()
+            nao_non0 = len(mask)
+            ao = contract('nip,ij->njp', ao_mask, coeff[mask])
             rho = numint.eval_rho2(opt.mol, ao[:4], mo_coeff, mo_occ, mask, xctype)
+            t0 = log.timer_debug1('eval rho', *t0)
             vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3]
+            t0 = log.timer_debug1('eval vxc', *t0)
             wv = weight * vxc
             wv[0] *= .5
             aow = rks_grad._make_dR_dao_w(ao, wv)
             _d1d2_dot_(ipip, mol, aow, ao[1:4], mask, ao_loc, False)
             ao_dm0 = [numint._dot_ao_dm(mol, ao[i], dm0, mask, shls_slice, ao_loc) for i in range(4)]
             wf = weight * fxc
+
             for ia in range(mol.natm):
                 dR_rho1 = _make_dR_rho1(ao, ao_dm0, ia, aoslices, xctype)
                 wv = contract('xyg,sxg->syg', wf, dR_rho1)
                 wv[:,0] *= .5
+                '''
                 for i in range(3):
                     aow = rks_grad._make_dR_dao_w(ao, wv[i])
                     vmat[ia,i] += rks_grad._d1_dot_(aow, ao[0].T)
                 aow = [numint._scale_ao(ao[:4], wv[i,:4]) for i in range(3)]
                 _d1d2_dot_(vmat[ia], mol, ao[1:4], aow, mask, ao_loc, False)
+                '''
+                vmat_tmp = cupy.empty([3,3,nao_non0,nao_non0])
+                for i in range(3):
+                    aow = rks_grad._make_dR_dao_w(ao_mask, wv[i])
+                    rks_grad._d1_dot_(aow, ao_mask[0].T, out=vmat_tmp[i])
+                aow = [numint._scale_ao(ao_mask[:4], wv[i,:4]) for i in range(3)]
+                _d1d2_dot_(vmat_tmp, mol, ao_mask[1:4], aow, mask, ao_loc, False)
+                vmat_tmp = contract('pi,xypq->xyiq', coeff[mask], vmat_tmp)
+                vmat_tmp = contract('qj,xyiq->xyij', coeff[mask], vmat_tmp)
+                vmat[ia] += vmat_tmp
             ao_dm0 = aow = None
-
+            t0 = log.timer_debug1('integration', *t0)
         for ia in range(mol.natm):
             p0, p1 = aoslices[ia][2:]
             vmat[ia,:,:,:,p0:p1] += ipip[:,:,:,p0:p1]
@@ -453,11 +476,15 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
         YX, YY, YZ = 5, 7, 8
         ZX, ZY, ZZ = 6, 8, 9
         ao_deriv = 2
-        for ao, mask, weight, coords \
+        for ao_mask, mask, weight, coords \
                 in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory):
-            ao = contract('nip,ij->njp', ao, coeff[mask])
+            t0 = log.init_timer()
+            nao_non0 = len(mask)
+            ao = contract('nip,ij->njp', ao_mask, coeff[mask])
             rho = numint.eval_rho2(opt.mol, ao[:10], mo_coeff, mo_occ, mask, xctype)
+            t0 = log.timer_debug1('eval rho', *t0)
             vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3]
+            t0 = log.timer_debug1('eval vxc', *t0)
             wv = weight * vxc
             wv[0] *= .5
             wv[4] *= .25
@@ -476,20 +503,28 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
                 wv = contract('xyg,sxg->syg', wf, dR_rho1)
                 wv[:,0] *= .5
                 wv[:,4] *= .5  # for the factor 1/2 in tau
+                '''
                 for i in range(3):
                     aow = rks_grad._make_dR_dao_w(ao, wv[i])
                     vmat[ia,i] += rks_grad._d1_dot_(aow, ao[0].T)
-
-                aow = [numint._scale_ao(ao[:4], wv[i,:4]) for i in range(3)]
-                _d1d2_dot_(vmat[ia], mol, ao[1:4], aow, mask, ao_loc, False)
-
-                aow = [numint._scale_ao(ao[1], wv[i,4]) for i in range(3)]
-                _d1d2_dot_(vmat[ia], mol, [ao[XX], ao[XY], ao[XZ]], aow, mask, ao_loc, False)
-                aow = [numint._scale_ao(ao[2], wv[i,4]) for i in range(3)]
-                _d1d2_dot_(vmat[ia], mol, [ao[YX], ao[YY], ao[YZ]], aow, mask, ao_loc, False)
-                aow = [numint._scale_ao(ao[3], wv[i,4]) for i in range(3)]
-                _d1d2_dot_(vmat[ia], mol, [ao[ZX], ao[ZY], ao[ZZ]], aow, mask, ao_loc, False)
-
+                '''
+                vmat_tmp = cupy.empty([3,3,nao_non0,nao_non0])
+                for i in range(3):
+                    aow = rks_grad._make_dR_dao_w(ao_mask, wv[i])
+                    rks_grad._d1_dot_(aow, ao_mask[0].T, out=vmat_tmp[i])
+                aow = [numint._scale_ao(ao_mask[:4], wv[i,:4]) for i in range(3)]
+                _d1d2_dot_(vmat_tmp, mol, ao_mask[1:4], aow, mask, ao_loc, False)
+
+                aow = [numint._scale_ao(ao_mask[1], wv[i,4]) for i in range(3)]
+                _d1d2_dot_(vmat_tmp, mol, [ao_mask[XX], ao_mask[XY], ao_mask[XZ]], aow, mask, ao_loc, False)
+                aow = [numint._scale_ao(ao_mask[2], wv[i,4]) for i in range(3)]
+                _d1d2_dot_(vmat_tmp, mol, [ao_mask[YX], ao_mask[YY], ao_mask[YZ]], aow, mask, ao_loc, False)
+                aow = [numint._scale_ao(ao_mask[3], wv[i,4]) for i in range(3)]
+                _d1d2_dot_(vmat_tmp, mol, [ao_mask[ZX], ao_mask[ZY], ao_mask[ZZ]], aow, mask, ao_loc, False)
+                vmat_tmp = contract('pi,xypq->xyiq', coeff[mask], vmat_tmp)
+                vmat_tmp = contract('qj,xyiq->xyij', coeff[mask], vmat_tmp)
+                vmat[ia] += vmat_tmp
+            t0 = log.timer_debug1('integration', *t0)
         for ia in range(mol.natm):
             p0, p1 = aoslices[ia][2:]
             vmat[ia,:,:,:,p0:p1] += ipip[:,:,:,p0:p1]
@@ -500,6 +535,7 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
 def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory):
     mol = hessobj.mol
     mf = hessobj.base
+    log = logger.new_logger(mol, mol.verbose)
     if hessobj.grids is not None:
         grids = hessobj.grids
     else:
@@ -533,9 +569,12 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory):
         ao_deriv = 1
         for ao, mask, weight, coords \
                 in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory):
+            t0 = log.init_timer()
             ao = contract('nip,ij->njp', ao, coeff[mask])
             rho = numint.eval_rho2(opt.mol, ao[0], mo_coeff, mo_occ, mask, xctype)
+            t0 = log.timer_debug1('eval rho', *t0)
             vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3]
+            t0 = log.timer_debug1('eval vxc', *t0)
             wv = weight * vxc[0]
             aow = numint._scale_ao(ao[0], wv)
             v_ip += rks_grad._d1_dot_(ao[1:4], aow.T)
@@ -550,15 +589,17 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory):
                 aow = [numint._scale_ao(ao[0], wv[i]) for i in range(3)]
                 vmat[ia] += rks_grad._d1_dot_(aow, ao[0].T)
             ao_dm0 = aow = None
-
+            t0 = log.timer_debug1('integration', *t0)
     elif xctype == 'GGA':
         ao_deriv = 2
         for ao, mask, weight, coords \
                 in ni.block_loop(mol, grids, nao, ao_deriv, max_memory):
-            # TODO: improve efficiency
+            t0 = log.init_timer()
             ao = contract('nip,ij->njp', ao, coeff[mask])
             rho = numint.eval_rho2(mol, ao[:4], mo_coeff, mo_occ, mask, xctype)
+            t0 = log.timer_debug1('eval rho', *t0)
             vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3]
+            t0 = log.timer_debug1('eval vxc', *t0)
             wv = weight * vxc
             wv[0] *= .5
             v_ip += rks_grad._gga_grad_sum_(ao, wv)
@@ -572,17 +613,20 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory):
                 wv[:,0] *= .5
                 aow = [numint._scale_ao(ao[:4], wv[i,:4]) for i in range(3)]
                 vmat[ia] += rks_grad._d1_dot_(aow, ao[0].T)
+            t0 = log.timer_debug1('integration', *t0)
             ao_dm0 = aow = None
-    # TODO: debug and test
     elif xctype == 'MGGA':
         if grids.level < 5:
             logger.warn(mol, 'MGGA Hessian is sensitive to dft grids.')
         ao_deriv = 2
         for ao, mask, weight, coords \
                 in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory):
+            t0 = log.init_timer()
             ao = contract('nip,ij->njp', ao, coeff[mask])
             rho = numint.eval_rho2(opt.mol, ao[:10], mo_coeff, mo_occ, mask, xctype)
+            t0 = log.timer_debug1('eval rho', *t0)
             vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3]
+            t0 = log.timer_debug1('eval vxc', *t0)
             wv = weight * vxc
             wv[0] *= .5
             wv[4] *= .5  # for the factor 1/2 in tau
@@ -602,7 +646,7 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory):
                     aow = [numint._scale_ao(ao[j], wv[i,4]) for i in range(3)]
                     vmat[ia] += rks_grad._d1_dot_(aow, ao[j].T)
             ao_dm0 = aow = None
-
+            t0 = log.timer_debug1('integration', *t0)
     for ia in range(mol.natm):
         p0, p1 = aoslices[ia][2:]
         vmat[ia,:,p0:p1] += v_ip[:,p0:p1]
diff --git a/gpu4pyscf/lib/cupy_helper.py b/gpu4pyscf/lib/cupy_helper.py
index 2285b9dd..337d2c6a 100644
--- a/gpu4pyscf/lib/cupy_helper.py
+++ b/gpu4pyscf/lib/cupy_helper.py
@@ -142,17 +142,25 @@ def unpack_sparse(cderi_sparse, row, col, p0, p1, nao, out=None, stream=None):
 
 def add_sparse(a, b, indices):
     '''
-    a[np.ix_(indices, indices)] += b
+    a[(...,) + np.ix_(indices, indices)] += b
     '''
-    n = a.shape[0]
-    m = b.shape[0]
-
+    assert a.flags.c_contiguous
+    assert b.flags.c_contiguous
+    n = a.shape[-1]
+    m = b.shape[-1]
+    if a.ndim > 2:
+        count = np.prod(a.shape[:-2])
+    elif a.ndim == 2:
+        count = 1
+    else:
+        raise RuntimeError('add_sparse only supports 2d or 3d tensor')
     err = libcupy_helper.add_sparse(
         ctypes.cast(a.data.ptr, ctypes.c_void_p),
         ctypes.cast(b.data.ptr, ctypes.c_void_p),
         ctypes.cast(indices.data.ptr, ctypes.c_void_p),
         ctypes.c_int(n),
-        ctypes.c_int(m)
+        ctypes.c_int(m),
+        ctypes.c_int(count)
     )
     if err != 0:
         raise RecursionError('failed in sparse_add2d')
diff --git a/gpu4pyscf/lib/cupy_helper/add_sparse.cu b/gpu4pyscf/lib/cupy_helper/add_sparse.cu
index eddbf92a..d8033015 100644
--- a/gpu4pyscf/lib/cupy_helper/add_sparse.cu
+++ b/gpu4pyscf/lib/cupy_helper/add_sparse.cu
@@ -22,8 +22,8 @@
 #define THREADS        32
 #define BLOCK_DIM   32
 
-__global__ 
-void _add_sparse(double *a, double *b, int *indices, int n, int m)
+__global__
+void _add_sparse(double *a, double *b, int *indices, int n, int m, int count)
 {
 	int row = blockIdx.x * BLOCK_DIM + threadIdx.x;
     int col = blockIdx.y * BLOCK_DIM + threadIdx.y;
@@ -32,17 +32,18 @@ void _add_sparse(double *a, double *b, int *indices, int n, int m)
     }
     int idx_a = indices[row] * n + indices[col];
     int idx_b = row * m + col;
-
-    a[idx_a] += b[idx_b];
+    for (int i = 0; i < count; i++){
+        a[idx_a + i*n*n] += b[idx_b + i*m*m];
+    }
 }
 
 extern "C" {
 __host__
-int add_sparse(double *a, double *b, int *indices, int n, int m){
+int add_sparse(double *a, double *b, int *indices, int n, int m, int count){
     int ntile = (m + THREADS - 1) / THREADS;
     dim3 threads(THREADS, THREADS);
     dim3 blocks(ntile, ntile);
-    _add_sparse<<<blocks, threads>>>(a, b, indices, n, m);
+    _add_sparse<<<blocks, threads>>>(a, b, indices, n, m, count);
     cudaError_t err = cudaGetLastError();
     if (err != cudaSuccess) {
         return 1;
diff --git a/gpu4pyscf/lib/cutensor.py b/gpu4pyscf/lib/cutensor.py
index c590b7f6..aca3b082 100644
--- a/gpu4pyscf/lib/cutensor.py
+++ b/gpu4pyscf/lib/cutensor.py
@@ -19,6 +19,7 @@
 from cupyx import cutensor
 from cupy_backends.cuda.libs import cutensor as cutensor_backend
 from cupy_backends.cuda.libs.cutensor import Handle
+from gpu4pyscf.lib import logger
 
 libcutensor = None
 for lib_path in _preload_libs['cutensor']:
@@ -31,6 +32,8 @@
 _handle = Handle()
 _modes = {}
 _contraction_descriptors = {}
+_contraction_plans = {}
+_contraction_finds = {}
 
 cutensor_backend.init(_handle)
 
@@ -82,10 +85,25 @@ def create_contraction_descriptor(handle,
     return desc
 
 def create_contraction_find(handle, algo=cutensor_backend.ALGO_DEFAULT):
-    find = cutensor_backend.ContractionFind()
-    cutensor_backend.initContractionFind(handle, find, algo)
+    # Reuse one ContractionFind per (handle, algo) instead of re-initializing it.
+    cache_key = (handle.ptr, algo)
+    find = _contraction_finds.get(cache_key)
+    if find is None:
+        find = cutensor_backend.ContractionFind()
+        cutensor_backend.initContractionFind(handle, find, algo)
+        _contraction_finds[cache_key] = find
     return find
 
+def create_contraction_plan(handle, desc, find, ws_size):
+    '''Return the ContractionPlan cached for (handle, desc, find, ws_size), creating it on first use.'''
+    cache_key = (handle.ptr, desc.ptr, find.ptr, ws_size)
+    plan = _contraction_plans.get(cache_key)
+    if plan is None:
+        plan = cutensor_backend.ContractionPlan()
+        cutensor_backend.initContractionPlan(handle, plan, desc, find, ws_size)
+        _contraction_plans[cache_key] = plan
+    return plan
+
 def contraction(pattern, a, b, alpha, beta, out=None):
     pattern = pattern.replace(" ", "")
     str_a, rest = pattern.split(',')
@@ -121,14 +139,15 @@ def contraction(pattern, a, b, alpha, beta, out=None):
         ws_size = cutensor_backend.contractionGetWorkspaceSize(_handle, desc, find, cutensor_backend.WORKSPACE_MIN)
         ws = cupy.empty(ws_size, dtype=np.int8)
 
-    plan = cutensor_backend.ContractionPlan()
-    cutensor_backend.initContractionPlan(_handle, plan, desc, find, ws_size)
+    plan = create_contraction_plan(_handle, desc, find, ws_size)
     alpha = np.asarray(alpha)
     beta = np.asarray(beta)
+
     cutensor_backend.contraction(_handle, plan,
                              alpha.ctypes.data, a.data.ptr, b.data.ptr,
                              beta.ctypes.data, c.data.ptr, out.data.ptr,
                              ws.data.ptr, ws_size)
+
     return out
 
 import os
diff --git a/gpu4pyscf/lib/gdft/CMakeLists.txt b/gpu4pyscf/lib/gdft/CMakeLists.txt
index fc32786c..9aef39b0 100644
--- a/gpu4pyscf/lib/gdft/CMakeLists.txt
+++ b/gpu4pyscf/lib/gdft/CMakeLists.txt
@@ -15,7 +15,7 @@
 
 #set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -arch=sm_80 --ptxas-options=-v")
 
-add_library(gdft SHARED 
+add_library(gdft SHARED
   nr_eval_gto.cu
   contract_rho.cu
   gen_grids.cu
diff --git a/gpu4pyscf/lib/gdft/contract_rho.cu b/gpu4pyscf/lib/gdft/contract_rho.cu
index 4d1dc0b2..1957928e 100644
--- a/gpu4pyscf/lib/gdft/contract_rho.cu
+++ b/gpu4pyscf/lib/gdft/contract_rho.cu
@@ -17,6 +17,12 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <cuda_runtime.h>
 #include "contract_rho.cuh"
 // TODO: improve this?
 __global__
@@ -26,20 +32,20 @@ void GDFTcontract_rho_kernel(double *rho, double *bra, double *ket, int ngrids,
     const bool active = grid_id < ngrids;
 
     size_t Ngrids = ngrids;
-    int ao_id;
     double v = 0;
     if (active){
-        for (ao_id = threadIdx.y; ao_id < nao; ao_id += BLKSIZEY) {
-            v += bra[grid_id + ao_id * Ngrids] * ket[grid_id + ao_id * Ngrids];
+        for (int ao_id = threadIdx.y; ao_id < nao; ao_id += BLKSIZEY) {
+            size_t ket_idx = grid_id + ao_id * Ngrids;  // 64-bit offset: ao_id*ngrids may not fit in an int
+            v += bra[ket_idx] * ket[ket_idx];
         }
     }
-    
+
     __shared__ double buf[BLKSIZEX*(BLKSIZEY+1)];
     int ix = threadIdx.x;
     int iy = threadIdx.y;
     int ixy = ix + BLKSIZEX * iy;
     buf[ixy] = v;   __syncthreads();
-    // assume block dim = 32 x 32
+
     if (blockDim.y >= 32 && iy < 16) buf[ixy] += buf[ixy + BLKSIZEX * 16]; __syncthreads();
     if (blockDim.y >= 16 && iy < 8)  buf[ixy] += buf[ixy + BLKSIZEX * 8];  __syncthreads();
     if (blockDim.y >= 8  && iy < 4)  buf[ixy] += buf[ixy + BLKSIZEX * 4];  __syncthreads();
@@ -51,6 +57,162 @@ void GDFTcontract_rho_kernel(double *rho, double *bra, double *ket, int ngrids,
     }
 }
 
+// rho[i,ia,g] = sum_ao bra[ia,ao,g] * ket[i,ao,g] for i = 0..3 and ia = 0..count-1.
+// threadIdx.y strides over the AOs; the partial sums are then reduced over y in shared memory.
+__global__
+void GDFTcontract_rho4_kernel(double *rho, double *bra, double *ket, int ngrids, int nao, int count)
+{
+    int grid_id = blockIdx.x * blockDim.x + threadIdx.x;
+    const bool active = grid_id < ngrids;
+    // Form strides and offsets in size_t: int products such as nao*ngrids can overflow.
+    size_t ket_stride = (size_t)nao * ngrids;
+    size_t rho_stride = (size_t)count * ngrids;
+    __shared__ double buf[BLKSIZEX*(BLKSIZEY+1)];
+    for (int ia = 0; ia < count; ia++){
+        double v[4] = {0.0, 0.0, 0.0, 0.0};
+        if (active){
+            for (int ao_id = threadIdx.y; ao_id < nao; ao_id += BLKSIZEY) {
+                size_t ket_idx = grid_id + (size_t)ao_id * ngrids;
+                double bra_tmp = bra[ket_idx + ia * ket_stride];
+                v[0] += bra_tmp * ket[0*ket_stride + ket_idx];
+                v[1] += bra_tmp * ket[1*ket_stride + ket_idx];
+                v[2] += bra_tmp * ket[2*ket_stride + ket_idx];
+                v[3] += bra_tmp * ket[3*ket_stride + ket_idx];
+            }
+        }
+        // Inactive threads keep v = 0 but still reach every __syncthreads() below.
+        int ix = threadIdx.x;
+        int iy = threadIdx.y;
+        int ixy = ix + BLKSIZEX * iy;
+        for (int i = 0; i < 4; i++){
+            buf[ixy] = v[i];   __syncthreads();
+            if (blockDim.y >= 32 && iy < 16) buf[ixy] += buf[ixy + BLKSIZEX * 16]; __syncthreads();
+            if (blockDim.y >= 16 && iy < 8)  buf[ixy] += buf[ixy + BLKSIZEX * 8];  __syncthreads();
+            if (blockDim.y >= 8  && iy < 4)  buf[ixy] += buf[ixy + BLKSIZEX * 4];  __syncthreads();
+            if (blockDim.y >= 4  && iy < 2)  buf[ixy] += buf[ixy + BLKSIZEX * 2];  __syncthreads();
+            if (blockDim.y >= 2  && iy < 1)  buf[ixy] += buf[ixy + BLKSIZEX * 1];  __syncthreads();
+            if (iy == 0 && active) {
+                rho[grid_id + (size_t)ia * ngrids + rho_stride * i] = buf[ix];
+            }
+        }
+    }
+}
+
+// GGA-type contraction over AOs, with slice c of bra/ket stored at offset c*nao*ngrids:
+// rho[0] = 2*sum(bra0*ket0), rho[c] = 2*sum(bra0*ketc + ket0*brac) for c = 1..3.
+__global__
+void GDFTcontract_rho_gga_kernel(double *rho, double *bra, double *ket, int ngrids, int nao)
+{
+    int grid_id = blockIdx.x * blockDim.x + threadIdx.x;
+    const bool active = grid_id < ngrids;
+    // Sizes and offsets stay in size_t: ket_idx reaches up to 4*nao*ngrids, which can exceed the int range.
+    size_t Ngrids = ngrids;
+    size_t ket_stride = nao * Ngrids;
+    double v[4] = {0.0, 0.0, 0.0, 0.0};
+    if (active){
+        for (int ao_id = threadIdx.y; ao_id < nao; ao_id += BLKSIZEY) {
+            size_t ket_idx = grid_id + ao_id * Ngrids;
+            double bra_tmp = bra[ket_idx];
+            double ket_tmp = ket[ket_idx];
+            v[0] += bra_tmp * ket_tmp;
+
+            ket_idx += ket_stride;
+            v[1] += bra_tmp * ket[ket_idx];
+            v[1] += ket_tmp * bra[ket_idx];
+
+            ket_idx += ket_stride;
+            v[2] += bra_tmp * ket[ket_idx];
+            v[2] += ket_tmp * bra[ket_idx];
+
+            ket_idx += ket_stride;
+            v[3] += bra_tmp * ket[ket_idx];
+            v[3] += ket_tmp * bra[ket_idx];
+        }
+    }
+
+    __shared__ double buf[BLKSIZEX*(BLKSIZEY+1)];
+    int ix = threadIdx.x;
+    int iy = threadIdx.y;
+    int ixy = ix + BLKSIZEX * iy;
+
+    for (int i = 0; i < 4; i++){
+        buf[ixy] = v[i];   __syncthreads();
+        if (blockDim.y >= 32 && iy < 16) buf[ixy] += buf[ixy + BLKSIZEX * 16]; __syncthreads();
+        if (blockDim.y >= 16 && iy < 8)  buf[ixy] += buf[ixy + BLKSIZEX * 8];  __syncthreads();
+        if (blockDim.y >= 8  && iy < 4)  buf[ixy] += buf[ixy + BLKSIZEX * 4];  __syncthreads();
+        if (blockDim.y >= 4  && iy < 2)  buf[ixy] += buf[ixy + BLKSIZEX * 2];  __syncthreads();
+        if (blockDim.y >= 2  && iy < 1)  buf[ixy] += buf[ixy + BLKSIZEX * 1];  __syncthreads();
+
+        if (iy == 0 && active) {
+            rho[grid_id + Ngrids * i] = 2.0 * buf[ix];
+        }
+    }
+}
+
+
+// MGGA-type contraction over AOs (slice c of bra/ket at offset c*nao*ngrids): rho[0] = 2*sum(bra0*ket0),
+// rho[c] = 2*sum(bra0*ketc + ket0*brac) for c = 1..3, and rho[4] = sum over c = 1..3 of sum(brac*ketc).
+__global__
+void GDFTcontract_rho_mgga_kernel(double *rho, double *bra, double *ket, int ngrids, int nao)
+{
+    int grid_id = blockIdx.x * blockDim.x + threadIdx.x;
+    const bool active = grid_id < ngrids;
+    // Sizes and offsets stay in size_t: ket_idx reaches up to 4*nao*ngrids, which can exceed the int range.
+    size_t Ngrids = ngrids;
+    size_t ket_stride = nao * Ngrids;
+
+    double v[5] = {0.0, 0.0, 0.0, 0.0, 0.0};
+    if (active){
+        for (int ao_id = threadIdx.y; ao_id < nao; ao_id += BLKSIZEY) {
+            size_t ket_idx = grid_id + ao_id * Ngrids;
+            double bra_tmp0 = bra[ket_idx];
+            double ket_tmp0 = ket[ket_idx];
+            v[0] += bra_tmp0 * ket_tmp0;
+
+            ket_idx += ket_stride;
+            double bra_tmp1 = bra[ket_idx];
+            double ket_tmp1 = ket[ket_idx];
+            v[1] += bra_tmp0 * ket_tmp1;
+            v[1] += ket_tmp0 * bra_tmp1;
+            v[4] += bra_tmp1 * ket_tmp1;
+
+            ket_idx += ket_stride;
+            bra_tmp1 = bra[ket_idx];
+            ket_tmp1 = ket[ket_idx];
+            v[2] += bra_tmp0 * ket_tmp1;
+            v[2] += ket_tmp0 * bra_tmp1;
+            v[4] += bra_tmp1 * ket_tmp1;
+
+            ket_idx += ket_stride;
+            bra_tmp1 = bra[ket_idx];
+            ket_tmp1 = ket[ket_idx];
+            v[3] += bra_tmp0 * ket_tmp1;
+            v[3] += ket_tmp0 * bra_tmp1;
+            v[4] += bra_tmp1 * ket_tmp1;
+        }
+    }
+
+    v[4] *= 0.5;
+
+    __shared__ double buf[BLKSIZEX*(BLKSIZEY+1)];
+    int ix = threadIdx.x;
+    int iy = threadIdx.y;
+    int ixy = ix + BLKSIZEX * iy;
+
+    for (int i = 0; i < 5; i++){
+        buf[ixy] = v[i];   __syncthreads();
+        if (blockDim.y >= 32 && iy < 16) buf[ixy] += buf[ixy + BLKSIZEX * 16]; __syncthreads();
+        if (blockDim.y >= 16 && iy < 8)  buf[ixy] += buf[ixy + BLKSIZEX * 8];  __syncthreads();
+        if (blockDim.y >= 8  && iy < 4)  buf[ixy] += buf[ixy + BLKSIZEX * 4];  __syncthreads();
+        if (blockDim.y >= 4  && iy < 2)  buf[ixy] += buf[ixy + BLKSIZEX * 2];  __syncthreads();
+        if (blockDim.y >= 2  && iy < 1)  buf[ixy] += buf[ixy + BLKSIZEX * 1];  __syncthreads();
+
+        if (iy == 0 && active) {
+            rho[grid_id + Ngrids * i] = 2.0 * buf[ix];
+        }
+    }
+}
+
 __global__
 void GDFTscale_ao_kernel(double *out, double *ket, double *wv,
                          int ngrids, int nao, int nvar)
@@ -71,3 +233,130 @@ void GDFTscale_ao_kernel(double *out, double *ket, double *wv,
     }
     out[ixy] = val;
 }
+
+__global__
+void GDFT_make_dR_dao_w_kernel(double *out, double *ket, double *wv,  // writes 3 components of out from 9 components of ket weighted by 4 components of wv
+                         int ngrids, int nao)
+{
+    int grid_id = blockIdx.x * blockDim.x + threadIdx.x;  // one thread per (grid point, AO) pair
+    int ao_id = blockIdx.y * blockDim.y + threadIdx.y;
+    if (grid_id >= ngrids || ao_id >= nao) {  // threads past either extent do nothing
+        return;
+    }
+
+    size_t Ngrids = ngrids;  // size_t copy so the offset products below are not evaluated in int
+    size_t Nag = nao * Ngrids;  // stride between consecutive components of ket and out
+    size_t ixy = grid_id + ao_id * Ngrids;  // offset of this (ao, grid) element inside one component
+
+    double wv0 = wv[grid_id + ngrids * 0];  // wv holds 4 values per grid point, component-major
+    double wv1 = wv[grid_id + ngrids * 1];
+    double wv2 = wv[grid_id + ngrids * 2];
+    double wv3 = wv[grid_id + ngrids * 3];
+
+    double ket5 = ket[ixy + Nag * 5];  // components 5, 6, 8 are each used twice, so load them once
+    double ket6 = ket[ixy + Nag * 6];
+    double val;
+    val = ket[ixy + Nag * 1] * wv0;  // out[0] = k1*wv0 + k4*wv1 + k5*wv2 + k6*wv3
+    val+= ket[ixy + Nag * 4] * wv1;
+    val+= ket5 * wv2;
+    val+= ket6 * wv3;
+    out[ixy + Nag * 0] = val;
+
+    double ket8 = ket[ixy + Nag * 8];
+    val = ket[ixy + Nag * 2] * wv0;  // out[1] = k2*wv0 + k5*wv1 + k7*wv2 + k8*wv3
+    val+= ket5 * wv1;
+    val+= ket[ixy + Nag * 7] * wv2;
+    val+= ket8 * wv3;
+    out[ixy + Nag * 1] = val;
+
+    val = ket[ixy + Nag * 3] * wv0;  // out[2] = k3*wv0 + k6*wv1 + k8*wv2 + k9*wv3
+    val+= ket6 * wv1;
+    val+= ket8 * wv2;
+    val+= ket[ixy + Nag * 9] * wv3;  // components 4..9 act as the symmetric block [[4,5,6],[5,7,8],[6,8,9]]; presumably AO second derivatives - TODO confirm
+    out[ixy + Nag * 2] = val;
+}
+
+
+extern "C"{
+__host__
+int GDFTcontract_rho(cudaStream_t stream, double *rho, double *bra, double *ket, int ngrids, int nao)  // launches GDFTcontract_rho_kernel on stream; returns 0 on success, 1 on error
+{
+    dim3 threads(BLKSIZEX, BLKSIZEY);
+    dim3 blocks((ngrids+BLKSIZEX-1)/BLKSIZEX);  // ceil(ngrids / BLKSIZEX) blocks along x; grid y stays 1
+    GDFTcontract_rho_kernel<<<blocks, threads, 0, stream>>>(rho, bra, ket, ngrids, nao);
+    cudaError_t err = cudaGetLastError();  // queried right after the launch; the stream is not synchronized here
+    if (err != cudaSuccess) {
+        fprintf(stderr, "CUDA Error of GDFTcontract_rho: %s\n", cudaGetErrorString(err));
+        return 1;
+    }
+    return 0;
+}
+
+int GDFTcontract_rho4(cudaStream_t stream, double *rho, double *bra, double *ket, int ngrids, int nao, int count)  // launches GDFTcontract_rho4_kernel on stream; returns 0 on success, 1 on error
+{
+    dim3 threads(BLKSIZEX, BLKSIZEY);
+    dim3 blocks((ngrids+BLKSIZEX-1)/BLKSIZEX);  // ceil(ngrids / BLKSIZEX) blocks along x; grid y stays 1
+    GDFTcontract_rho4_kernel<<<blocks, threads, 0, stream>>>(rho, bra, ket, ngrids, nao, count);
+    cudaError_t err = cudaGetLastError();  // queried right after the launch; the stream is not synchronized here
+    if (err != cudaSuccess) {
+        fprintf(stderr, "CUDA Error of GDFTcontract_rho4: %s\n", cudaGetErrorString(err));  // name this wrapper, not GDFTcontract_rho
+        return 1;
+    }
+    return 0;
+}
+
+int GDFTcontract_rho_gga(cudaStream_t stream, double *rho, double *bra, double *ket, int ngrids, int nao)  // launches GDFTcontract_rho_gga_kernel on stream; returns 0 on success, 1 on error
+{
+    dim3 threads(BLKSIZEX, BLKSIZEY);
+    dim3 blocks((ngrids+BLKSIZEX-1)/BLKSIZEX);  // ceil(ngrids / BLKSIZEX) blocks along x; grid y stays 1
+    GDFTcontract_rho_gga_kernel<<<blocks, threads, 0, stream>>>(rho, bra, ket, ngrids, nao);
+    cudaError_t err = cudaGetLastError();  // queried right after the launch; the stream is not synchronized here
+    if (err != cudaSuccess) {
+        fprintf(stderr, "CUDA Error of GDFTcontract_rho_gga: %s\n", cudaGetErrorString(err));
+        return 1;
+    }
+    return 0;
+}
+
+int GDFTcontract_rho_mgga(cudaStream_t stream, double *rho, double *bra, double *ket, int ngrids, int nao)  // launches GDFTcontract_rho_mgga_kernel on stream; returns 0 on success, 1 on error
+{
+    dim3 threads(BLKSIZEX, BLKSIZEY);
+    dim3 blocks((ngrids+BLKSIZEX-1)/BLKSIZEX);  // ceil(ngrids / BLKSIZEX) blocks along x; grid y stays 1
+    GDFTcontract_rho_mgga_kernel<<<blocks, threads, 0, stream>>>(rho, bra, ket, ngrids, nao);
+    cudaError_t err = cudaGetLastError();  // queried right after the launch; the stream is not synchronized here
+    if (err != cudaSuccess) {
+        fprintf(stderr, "CUDA Error of GDFTcontract_rho_mgga: %s\n", cudaGetErrorString(err));
+        return 1;
+    }
+    return 0;
+}
+
+int GDFT_make_dR_dao_w(cudaStream_t stream, double *out, double *ket, double *wv,  // launches GDFT_make_dR_dao_w_kernel on stream; returns 0 on success, 1 on error
+                 int ngrids, int nao)
+{
+    dim3 threads(BLKSIZEX, BLKSIZEY);
+    dim3 blocks((ngrids+BLKSIZEX-1)/BLKSIZEX, (nao+BLKSIZEY-1)/BLKSIZEY);  // 2-D grid: ceil over grid points in x, ceil over AOs in y
+    GDFT_make_dR_dao_w_kernel<<<blocks, threads, 0, stream>>>(out, ket, wv, ngrids, nao);
+    cudaError_t err = cudaGetLastError();  // queried right after the launch; the stream is not synchronized here
+    if (err != cudaSuccess) {
+        fprintf(stderr, "CUDA Error of GDFT_make_dR_dao_w: %s\n", cudaGetErrorString(err));
+        return 1;
+    }
+    return 0;
+}
+
+int GDFTscale_ao(cudaStream_t stream, double *out, double *ket, double *wv,  // launches GDFTscale_ao_kernel on stream; returns 0 on success, 1 on error
+                 int ngrids, int nao, int nvar)
+{
+    dim3 threads(BLKSIZEX, BLKSIZEY);
+    dim3 blocks((ngrids+BLKSIZEX-1)/BLKSIZEX, (nao+BLKSIZEY-1)/BLKSIZEY);  // 2-D grid: ceil over grid points in x, ceil over AOs in y
+    GDFTscale_ao_kernel<<<blocks, threads, 0, stream>>>(out, ket, wv, ngrids, nao, nvar);
+    cudaError_t err = cudaGetLastError();  // queried right after the launch; the stream is not synchronized here
+    if (err != cudaSuccess) {
+        fprintf(stderr, "CUDA Error of GDFTscale_ao: %s\n", cudaGetErrorString(err));
+        return 1;
+    }
+    return 0;
+}
+
+}
\ No newline at end of file
diff --git a/gpu4pyscf/lib/gdft/nr_eval_gto.cu b/gpu4pyscf/lib/gdft/nr_eval_gto.cu
index b87ca434..da59b1c9 100644
--- a/gpu4pyscf/lib/gdft/nr_eval_gto.cu
+++ b/gpu4pyscf/lib/gdft/nr_eval_gto.cu
@@ -1640,31 +1640,4 @@ int GDFTeval_gto(cudaStream_t stream, double *ao, int deriv, int cart,
     //FREE(d_grids);
     return 0;
 }
-
-int GDFTcontract_rho(cudaStream_t stream, double *rho, double *bra, double *ket, int ngrids, int nao)
-{
-    dim3 threads(BLKSIZEX, BLKSIZEY);
-    dim3 blocks((ngrids+BLKSIZEX-1)/BLKSIZEX);
-    GDFTcontract_rho_kernel<<<blocks, threads, 0, stream>>>(rho, bra, ket, ngrids, nao);
-    cudaError_t err = cudaGetLastError();
-    if (err != cudaSuccess) {
-        fprintf(stderr, "CUDA Error of GDFTcontract_rho: %s\n", cudaGetErrorString(err));
-        return 1;
-    }
-    return 0;
-}
-
-int GDFTscale_ao(cudaStream_t stream, double *out, double *ket, double *wv,
-                 int ngrids, int nao, int nvar)
-{
-    dim3 threads(BLKSIZEX, BLKSIZEY);
-    dim3 blocks((ngrids+BLKSIZEX-1)/BLKSIZEX, (nao+BLKSIZEY-1)/BLKSIZEY);
-    GDFTscale_ao_kernel<<<blocks, threads, 0, stream>>>(out, ket, wv, ngrids, nao, nvar);
-    cudaError_t err = cudaGetLastError();
-    if (err != cudaSuccess) {
-        fprintf(stderr, "CUDA Error of GDFTscale_ao: %s\n", cudaGetErrorString(err));
-        return 1;
-    }
-    return 0;
-}
 }
diff --git a/gpu4pyscf/lib/logger.py b/gpu4pyscf/lib/logger.py
index 58c3f45f..60497816 100644
--- a/gpu4pyscf/lib/logger.py
+++ b/gpu4pyscf/lib/logger.py
@@ -25,6 +25,8 @@
 WARN = lib.logger.WARN
 DEBUG = lib.logger.DEBUG
 DEBUG1= lib.logger.DEBUG1
+TIMER_LEVEL = lib.logger.TIMER_LEVEL
+flush = lib.logger.flush
 
 if sys.version_info < (3, 0):
     process_clock = time.clock
@@ -33,27 +35,66 @@
     process_clock = time.process_time
     perf_counter = time.perf_counter
 
-def _timer_debug1(rec, msg, cpu0=None, wall0=None, sync=True):
+
+def init_timer(rec):  # returns a tuple of start marks whose length (1, 2 or 3) depends on rec.verbose
+    if rec.verbose >= TIMER_LEVEL:
+        e0 = cupy.cuda.Event()
+        e0.record()  # GPU start mark, included only at TIMER_LEVEL and above
+        return (process_clock(), perf_counter(), e0)
+    elif rec.verbose >= DEBUG:
+        return (process_clock(), perf_counter())  # CPU and wall marks, no GPU event
+    else:
+        return process_clock(),  # trailing comma keeps the result a tuple like the other branches
+
+def timer(rec, msg, cpu0=None, wall0=None, gpu0=None):  # prints time elapsed since the given marks (at TIMER_LEVEL+) and returns fresh marks as a tuple
+    if cpu0 is None:
+        cpu0 = rec._t0  # no CPU mark given: fall back to the one stored on rec
+    if wall0 and gpu0:  # CPU + wall + GPU marks supplied
+        rec._t0, rec._w0, rec._e0 = process_clock(), perf_counter(), cupy.cuda.Event()
+        if rec.verbose >= TIMER_LEVEL:
+            rec._e0.record()  # the new event is recorded only when the timing is printed
+            rec._e0.synchronize()  # wait for the event before reading the elapsed GPU time
+            flush(rec, '    CPU time for %20s %9.2f sec, wall time %9.2f sec, GPU time for %9.2f ms'
+                  % (msg, rec._t0-cpu0, rec._w0-wall0, cupy.cuda.get_elapsed_time(gpu0,rec._e0)))
+        return rec._t0, rec._w0, rec._e0
+    elif wall0:  # CPU + wall marks only
+        rec._t0, rec._w0 = process_clock(), perf_counter()
+        if rec.verbose >= TIMER_LEVEL:
+            flush(rec, '    CPU time for %20s %9.2f sec, wall time %9.2f sec'
+                  % (msg, rec._t0-cpu0, rec._w0-wall0))
+        return rec._t0, rec._w0
+    else:  # CPU mark only
+        rec._t0 = process_clock()
+        if rec.verbose >= TIMER_LEVEL:
+            flush(rec, '    CPU time for %20s %9.2f sec' % (msg, rec._t0-cpu0))
+        return rec._t0,  # 1-tuple, same shape convention as the branches above
+
+def _timer_debug1(rec, msg, cpu0=None, wall0=None, gpu0=None, sync=True):  # sync is still accepted but is not read by the new body
     if rec.verbose >= DEBUG1:
-        if(sync): cupy.cuda.stream.get_current_stream().synchronize()
-        return timer(rec, msg, cpu0, wall0)
+        return timer(rec, msg, cpu0, wall0, gpu0)  # at DEBUG1 and above, delegate to timer()
+    elif wall0 and gpu0:
+        rec._t0, rec._w0, rec._e0 = process_clock(), perf_counter(), cupy.cuda.Event()
+        rec._e0.record()  # refresh the GPU mark without printing anything
+        return rec._t0, rec._w0, rec._e0
     elif wall0:
         rec._t0, rec._w0 = process_clock(), perf_counter()
         return rec._t0, rec._w0
     else:
         rec._t0 = process_clock()
-        return rec._t0
+        return rec._t0,  # returns a 1-tuple where the removed line returned a bare float
 
 info = lib.logger.info
 debug = lib.logger.debug
 debug1 = lib.logger.debug1
-timer = lib.logger.timer
+debug2 = lib.logger.debug2
 timer_debug1 = _timer_debug1
 
 class Logger(lib.logger.Logger):
     def __init__(self, stdout=sys.stdout, verbose=NOTE):
         super().__init__(stdout=stdout, verbose=verbose)
     timer_debug1 = _timer_debug1
+    timer = timer  # right-hand side resolves to the module-level timer(), exposed here as a method
+    init_timer = init_timer  # likewise exposes the module-level init_timer() as a method
 
 def new_logger(rec=None, verbose=None):
     '''Create and return a :class:`Logger` object
diff --git a/gpu4pyscf/scf/hf.py b/gpu4pyscf/scf/hf.py
index 1d58a02e..8e1a9855 100644
--- a/gpu4pyscf/scf/hf.py
+++ b/gpu4pyscf/scf/hf.py
@@ -25,11 +25,11 @@
 from functools import reduce
 from pyscf import gto
 from pyscf import lib as pyscf_lib
-from pyscf.lib import logger
 from pyscf.scf import hf, jk, _vhf
 from gpu4pyscf import lib
 from gpu4pyscf.lib.cupy_helper import eigh, load_library, tag_array
 from gpu4pyscf.scf import diis
+from gpu4pyscf.lib import logger
 
 LMAX_ON_GPU = 4
 FREE_CUPY_CACHE = True
@@ -40,8 +40,8 @@ def get_jk(mol, dm, hermi=1, vhfopt=None, with_j=True, with_k=True, omega=None,
            verbose=None):
     '''Compute J, K matrices with CPU-GPU hybrid algorithm
     '''
-    cput0 = (logger.process_clock(), logger.perf_counter())
     log = logger.new_logger(mol, verbose)
+    cput0 = log.init_timer()
     if hermi != 1:
         raise NotImplementedError('JK-builder only supports hermitian density matrix')
     if omega is None:
@@ -253,8 +253,8 @@ def _get_jk(mf, mol=None, dm=None, hermi=1, with_j=True, with_k=True,
     if omega is not None:
         assert omega >= 0
 
-    cput0 = (logger.process_clock(), logger.perf_counter())
     log = logger.new_logger(mf)
+    cput0 = log.init_timer()
     log.debug3('apply get_jk on gpu')
     if omega is None:
         if hasattr(mf, '_opt_gpu'):
@@ -369,9 +369,9 @@ def _kernel(mf, conv_tol=1e-10, conv_tol_grad=None,
            dump_chk=True, dm0=None, callback=None, conv_check=True, **kwargs):
     conv_tol = mf.conv_tol
     mol = mf.mol
-    t0 = (logger.process_clock(), logger.perf_counter())
     verbose = mf.verbose
     log = logger.new_logger(mol, verbose)
+    t0 = log.init_timer()
     if(conv_tol_grad is None):
         conv_tol_grad = conv_tol**.5
         logger.info(mf, 'Set gradient conv threshold to %g', conv_tol_grad)
@@ -415,7 +415,7 @@ def _kernel(mf, conv_tol=1e-10, conv_tol_grad=None,
 
     t_beg = time.time()
     for cycle in range(mf.max_cycle):
-        t0 = (logger.process_clock(), logger.perf_counter())
+        t0 = log.init_timer()
         dm_last = dm
         last_hf_e = e_tot
 
@@ -575,7 +575,7 @@ class RHF(hf.RHF):
     quad_moment = _quad_moment
 
     def scf(self, dm0=None, **kwargs):
-        cput0 = (logger.process_clock(), logger.perf_counter())
+        cput0 = logger.init_timer(self)
 
         self.dump_flags()
         self.build(self.mol)
@@ -630,8 +630,8 @@ def __init__(self, mol, intor, prescreen='CVHFnoscreen',
         self._dmcondname = dmcondname
 
     def build(self, cutoff=1e-13, group_size=None, diag_block_with_triu=False):
-        cput0 = (logger.process_clock(), logger.perf_counter())
         mol = self.mol
+        cput0 = logger.init_timer(mol)
         # Sort basis according to angular momentum and contraction patterns so
         # as to group the basis functions to blocks in GPU kernel.
         l_ctrs = mol._bas[:,[gto.ANG_OF, gto.NPRIM_OF]]

From 564b3fe5dcd7258609f2f2b4346b60e1a7c785f4 Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <wxj6000@gmail.com>
Date: Sat, 4 Nov 2023 14:18:21 -0700
Subject: [PATCH 18/19] Update README.md

---
 README.md | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 58b87b32..b7100771 100644
--- a/README.md
+++ b/README.md
@@ -4,40 +4,40 @@ Installation
 --------
 
 For **CUDA 11.x**
-```
+```sh
 pip3 install gpu4pyscf-cuda11x
 ```
 and install cutensor
-```
+```sh
 python -m cupyx.tools.install_library --cuda 11.x --library cutensor
 ```
 
 For **CUDA 12.x**
-```
+```sh
 pip3 install gpu4pyscf-cuda12x
 ```
 and install cutensor
-```
+```sh
 python -m cupyx.tools.install_library --cuda 12.x --library cutensor
 ```
 
 Compilation
 --------
 The package provides ```dockerfiles/compile/Dockerfile``` for creating the CUDA environment. One can compile the package with
-```
+```sh
 sh build.sh
 ```
 This script will automatically download LibXC, and compile it with CUDA. The script will also build the wheel for installation. The compilation can take more than 5 mins. Then, one can either install the wheel with
-```
+```sh
 cd output
 pip3 install gpu4pyscf-*
 ```
 or simply add it to ```PYTHONPATH```
-```
+```sh
 export PYTHONPATH="${PYTHONPATH}:/your-local-path/gpu4pyscf"
 ```
 Then install cutensor for acceleration
-```
+```sh
 python -m cupyx.tools.install_library --cuda 11.x --library cutensor
 ```
 
@@ -64,7 +64,7 @@ Limitations
 
 Examples
 --------
-```
+```python
 import pyscf
 from gpu4pyscf.dft import rks
 

From e110e9a2ec0c61399b647182cef41d1e3aeb2c95 Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <wxj6000@gmail.com>
Date: Wed, 8 Nov 2023 18:20:10 -0800
Subject: [PATCH 19/19] Add chelpg charges in qmmm folder. (#1) (#56)

* Add chelpg charges in qmmm folder.

* Update chelpg.py

* Update chelpg.py

* Add unit test for chelpg, and compare with Qchem

* Add an example to calculate chelpg

Co-authored-by: puzhichen <147788878+puzhichen@users.noreply.github.com>