diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml index 5a0d1bd1..7f33e0dd 100644 --- a/.github/workflows/unittest.yml +++ b/.github/workflows/unittest.yml @@ -24,7 +24,7 @@ jobs: pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple python3 -m pip install --upgrade pip pip3 install flake8 pytest coverage pytest-cov pyscf-dispersion - pip3 install "pyscf>2.5" + pip3 install pyscf --upgrade pip3 install numpy --upgrade pip3 install h5py --upgrade pip3 install gpu4pyscf-libxc-cuda12x --upgrade diff --git a/README.md b/README.md index dbe59a0e..6eeb2e82 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,7 @@ Features - Density fitting scheme and direct SCF scheme; - SCF, analytical Gradient, and analytical Hessian calculations for Hartree-Fock and DFT; - LDA, GGA, mGGA, hybrid, and range-separated functionals via [libXC](https://gitlab.com/libxc/libxc/-/tree/master/); +- Spin-conserved and spin-flip TDA and TDDFT for excited states; - Geometry optimization and transition state search via [geomeTRIC](https://geometric.readthedocs.io/en/latest/); - Dispersion corrections via [DFTD3](https://github.com/dftd3/simple-dftd3) and [DFTD4](https://github.com/dftd4/dftd4); - Nonlocal functional correction (vv10) for SCF and gradient; diff --git a/examples/00-h2o.py b/examples/00-h2o.py index 2bf6c993..5ed2b6d1 100644 --- a/examples/00-h2o.py +++ b/examples/00-h2o.py @@ -60,6 +60,7 @@ # Compute Hessian h = mf_GPU.Hessian() h.auxbasis_response = 2 # 0: no aux contribution, 1: some contributions, 2: all +mf_GPU.cphf_grids.atom_grid = (50,194) # customize grids for solving CPSCF equation, SG1 by default h_dft = h.kernel() # harmonic analysis diff --git a/examples/24-cp_bsse.py b/examples/24-cp_bsse.py new file mode 100644 index 00000000..4ac8dc10 --- /dev/null +++ b/examples/24-cp_bsse.py @@ -0,0 +1,67 @@ +# Copyright 2024 The GPU4PySCF Authors. All Rights Reserved. + # +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see .
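+ +# The counterpoise (CP) scheme of Boys and Bernardi removes the basis set superposition error (BSSE): +# each monomer energy is recomputed in the full dimer basis by padding the monomer with the partner's +# ghost atoms ('X-' prefix), which carry basis functions but no nuclear charge or electrons, so that +# E_int(CP) = E(AB) - E(A; AB basis) - E(B; AB basis)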
+ +#################################################### +# Example of interaction energy with counterpoise correction +#################################################### + +import pyscf +from gpu4pyscf.dft import rks + +atom_A = [ +('O', (0.000000, 0.000000, 0.000000)), +('H', (0.000000, 0.757160, 0.586260)), +('H', (0.000000, -0.757160, 0.586260)) +] + +atom_B = [ +('O', (0.000000, 0.000000, 2.913530)), +('H', (0.000000, 0.757160, 3.499790)), +('H', (0.000000, -0.757160, 3.499790)) +] + +atom_AB = atom_A + atom_B + +mol_A = pyscf.M(atom=atom_A, basis='cc-pVDZ').build() +mol_B = pyscf.M(atom=atom_B, basis='cc-pVDZ').build() +mol_AB = pyscf.M(atom=atom_AB, basis='cc-pVDZ').build() + +# Monomer A in the dimer basis +mol_A_ghost = mol_A.copy() +ghost_atoms_B = mol_B.atom +mol_A_ghost.atom.extend([('X-' + atom[0], atom[1]) for atom in ghost_atoms_B]) +mol_A_ghost.build() + +# Monomer B in the dimer basis +mol_B_ghost = mol_B.copy() +ghost_atoms_A = mol_A.atom +mol_B_ghost.atom.extend([('X-' + atom[0], atom[1]) for atom in ghost_atoms_A]) +mol_B_ghost.build() + +def solve_dft(mol, xc='b3lyp'): + mf = rks.RKS(mol, xc=xc).density_fit() + mf.grids.atom_grid = (99,590) + return mf.kernel() + +E_AB = solve_dft(mol_AB) +E_A = solve_dft(mol_A) +E_B = solve_dft(mol_B) +interaction_energy_no_bsse = E_AB - (E_A + E_B) +print(f"Interaction Energy without BSSE Correction: {interaction_energy_no_bsse:.6f} Hartree") + +E_A_ghost = solve_dft(mol_A_ghost) +E_B_ghost = solve_dft(mol_B_ghost) +interaction_energy_bsse = E_AB - (E_A_ghost + E_B_ghost) +print(f"Interaction Energy with BSSE Correction: {interaction_energy_bsse:.6f} Hartree") diff --git a/gpu4pyscf/__config__.py b/gpu4pyscf/__config__.py index 5ecab3d4..73e90830 100644 --- a/gpu4pyscf/__config__.py +++ b/gpu4pyscf/__config__.py @@ -2,37 +2,16 @@ props = cupy.cuda.runtime.getDeviceProperties(0) GB = 1024*1024*1024 -# such as A100-80G -if props['totalGlobalMem'] >= 64 * GB: - min_ao_blksize = 128 - min_grid_blksize = 128*128 - ao_aligned = 32 - grid_aligned = 256 - mem_fraction = 0.9 - number_of_threads = 2048 * 108 -# such as V100-32G -elif props['totalGlobalMem'] >= 32 * GB: - min_ao_blksize = 128 - min_grid_blksize = 128*128 - ao_aligned = 32 - grid_aligned = 256 - mem_fraction = 0.9 - number_of_threads = 1024 * 80 -# such as A30-24GB -elif props['totalGlobalMem'] >= 16 * GB: - min_ao_blksize = 128 - min_grid_blksize = 128*128 - ao_aligned = 32 - grid_aligned = 256 - mem_fraction = 0.9 - number_of_threads = 1024 * 80 -# other gaming cards -else: +min_ao_blksize = 128 +min_grid_blksize = 128*128 +ao_aligned = 32 +grid_aligned = 256 + +# Use smaller blksize for old gaming GPUs +if props['totalGlobalMem'] < 16 * GB: min_ao_blksize = 64 min_grid_blksize = 64*64 - ao_aligned = 32 - grid_aligned = 128 - mem_fraction = 0.9 - number_of_threads = 1024 * 80 +# Use 90% of the global memory for CuPy memory pool +mem_fraction = 0.9 cupy.get_default_memory_pool().set_limit(fraction=mem_fraction) diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py index e8f66422..2ef8680a 100644 --- a/gpu4pyscf/df/df.py +++ b/gpu4pyscf/df/df.py @@ -91,7 +91,7 @@ def build(self, direct_scf_tol=1e-14, omega=None): log.timer_debug1('prepare intopt', *t0) self.j2c = j2c.copy() - j2c = take_last2d(j2c, intopt.aux_ao_idx) + j2c = intopt.sort_orbitals(j2c, aux_axis=[0,1]) try: self.cd_low = cholesky(j2c) self.cd_low = tag_array(self.cd_low, tag='cd') @@ -108,6 +108,7 @@ def build(self, direct_scf_tol=1e-14, omega=None): self._cderi = cholesky_eri_gpu(intopt, mol, auxmol,
self.cd_low, omega=omega) log.timer_debug1('cholesky_eri', *t0) self.intopt = intopt + return self def get_jk(self, dm, hermi=1, with_j=True, with_k=True, direct_scf_tol=getattr(__config__, 'scf_hf_SCF_direct_scf_tol', 1e-13), diff --git a/gpu4pyscf/df/df_jk.py b/gpu4pyscf/df/df_jk.py index 5a271903..ed181f62 100644 --- a/gpu4pyscf/df/df_jk.py +++ b/gpu4pyscf/df/df_jk.py @@ -1,17 +1,18 @@ -#!/usr/bin/env python -# Copyright 2014-2019 The PySCF Developers. All Rights Reserved. # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. # -# http://www.apache.org/licenses/LICENSE-2.0 +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . # # Author: Qiming Sun # Modified by Xiaojie Wu @@ -242,7 +243,7 @@ def to_cpu(self): obj = self.undo_df().to_cpu().density_fit() return utils.to_cpu(self, obj) -def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e-14, omega=None): +def get_jk(dfobj, dms_tag, hermi=0, with_j=True, with_k=True, direct_scf_tol=1e-14, omega=None): ''' get jk with density fitting outputs and input are on the same device @@ -268,31 +269,37 @@ def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e- assert nao == dfobj.nao vj = vk = None - ao_idx = dfobj.intopt.ao_idx - dms = take_last2d(dms, ao_idx) + intopt = dfobj.intopt + dms = intopt.sort_orbitals(dms, axis=[1,2]) dms_shape = dms.shape - rows = dfobj.intopt.cderi_row - cols = dfobj.intopt.cderi_col - + rows = intopt.cderi_row + cols = intopt.cderi_col + if with_j: dm_sparse = dms[:,rows,cols] - dm_sparse[:, dfobj.intopt.cderi_diag] *= .5 + if hermi == 0: + dm_sparse += dms[:,cols,rows] + else: + dm_sparse *= 2 + dm_sparse[:, intopt.cderi_diag] *= .5 if with_k: vk = cupy.zeros_like(dms) - + # SCF K matrix with occ if getattr(dms_tag, 'mo_coeff', None) is not None: + assert hermi == 1 mo_occ = dms_tag.mo_occ mo_coeff = dms_tag.mo_coeff nmo = mo_occ.shape[-1] mo_coeff = mo_coeff.reshape(-1,nao,nmo) mo_occ = mo_occ.reshape(-1,nmo) + mo_coeff = intopt.sort_orbitals(mo_coeff, axis=[1]) nocc = 0 occ_coeff = [0]*nset for i in range(nset): occ_idx = mo_occ[i] > 0 - occ_coeff[i] = mo_coeff[i][:,occ_idx][ao_idx] * mo_occ[i][occ_idx]**0.5 + occ_coeff[i] = mo_coeff[i][:,occ_idx] * mo_occ[i][occ_idx]**0.5 nocc += mo_occ[i].sum() blksize = dfobj.get_blksize(extra=nao*nocc) if with_j: @@ -300,7 +307,7 @@ def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e- for cderi, cderi_sparse in dfobj.loop(blksize=blksize, 
unpack=with_k): # leading dimension is 1 if with_j: - rhoj = 2.0*dm_sparse.dot(cderi_sparse) + rhoj = dm_sparse.dot(cderi_sparse) vj_packed += cupy.dot(rhoj, cderi_sparse.T) cderi_sparse = rhoj = None for i in range(nset): @@ -316,18 +323,18 @@ def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e- vj[:,rows,cols] = vj_packed vj[:,cols,rows] = vj_packed - # CP-HF K matrix elif hasattr(dms_tag, 'mo1'): + # K matrix in CP-HF or TDDFT occ_coeffs = dms_tag.occ_coeff mo1s = dms_tag.mo1 - mo_occ = dms_tag.mo_occ - if not isinstance(occ_coeffs, list): - occ_coeffs = [occ_coeffs * 2.0] # For restricted - if not isinstance(mo1s, list): + if not isinstance(occ_coeffs, (tuple, list)): + # *2 for double occupancy in RHF/RKS + occ_coeffs = [occ_coeffs * 2.0] + if not isinstance(mo1s, (tuple, list)): mo1s = [mo1s] - occ_coeffs = [occ_coeff[ao_idx] for occ_coeff in occ_coeffs] - mo1s = [mo1[:,ao_idx] for mo1 in mo1s] + occ_coeffs = [intopt.sort_orbitals(occ_coeff, axis=[0]) for occ_coeff in occ_coeffs] + mo1s = [intopt.sort_orbitals(mo1, axis=[1]) for mo1 in mo1s] if with_j: vj_sparse = cupy.zeros_like(dm_sparse) @@ -336,7 +343,7 @@ def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e- blksize = dfobj.get_blksize(extra=2*nao*nocc) for cderi, cderi_sparse in dfobj.loop(blksize=blksize, unpack=with_k): if with_j: - rhoj = 2.0*dm_sparse.dot(cderi_sparse) + rhoj = dm_sparse.dot(cderi_sparse) vj_sparse += cupy.dot(rhoj, cderi_sparse.T) rhoj = None cderi_sparse = None @@ -346,8 +353,8 @@ def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e- rhok = contract('Lij,jk->Lki', cderi, occ_coeff).reshape([-1,nao]) for i in range(mo1.shape[0]): rhok1 = contract('Lij,jk->Lki', cderi, mo1[i]).reshape([-1,nao]) - #contract('Lki,Lkj->ij', rhok, rhok1, alpha=1.0, beta=1.0, out=vk[iset]) - vk[iset] += cupy.dot(rhok.T, rhok1) + #contract('Lki,Lkj->ij', rhok1, rhok, alpha=1.0, beta=1.0, out=vk[iset]) + vk[iset] += cupy.dot(rhok1.T, rhok) iset += 1 mo1 = rhok1 = rhok = None cderi = None @@ -356,7 +363,7 @@ def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e- vj = cupy.zeros(dms_shape) vj[:,rows,cols] = vj_sparse vj[:,cols,rows] = vj_sparse - if with_k: + if with_k and hermi: transpose_sum(vk) vj_sparse = None # general K matrix with density matrix @@ -366,25 +373,24 @@ def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e- blksize = dfobj.get_blksize() for cderi, cderi_sparse in dfobj.loop(blksize=blksize, unpack=with_k): if with_j: - rhoj = 2.0*dm_sparse.dot(cderi_sparse) + rhoj = dm_sparse.dot(cderi_sparse) vj_sparse += cupy.dot(rhoj, cderi_sparse.T) if with_k: for k in range(nset): rhok = contract('Lij,jk->Lki', cderi, dms[k]).reshape([-1,nao]) - #vk[k] += contract('Lki,Lkj->ij', cderi, rhok) - vk[k] += cupy.dot(cderi.reshape([-1,nao]).T, rhok) + #vk[k] += contract('Lki,Lkj->ij', rhok, cderi) + vk[k] += cupy.dot(rhok.T, cderi.reshape([-1,nao])) if with_j: vj = cupy.zeros(dms_shape) vj[:,rows,cols] = vj_sparse vj[:,cols,rows] = vj_sparse rhok = None - rev_ao_idx = dfobj.intopt.rev_ao_idx if with_j: - vj = take_last2d(vj, rev_ao_idx) + vj = intopt.unsort_orbitals(vj, axis=[1,2]) vj = vj.reshape(out_shape) if with_k: - vk = take_last2d(vk, rev_ao_idx) + vk = intopt.unsort_orbitals(vk, axis=[1,2]) vk = vk.reshape(out_shape) t1 = log.timer_debug1('vj and vk', *t1) if out_cupy: diff --git a/gpu4pyscf/df/grad/rhf.py b/gpu4pyscf/df/grad/rhf.py index 05a09639..15645846 100644 --- 
a/gpu4pyscf/df/grad/rhf.py +++ b/gpu4pyscf/df/grad/rhf.py @@ -17,7 +17,7 @@ import numpy import cupy from cupyx.scipy.linalg import solve_triangular -from pyscf import scf +from pyscf import scf, gto from gpu4pyscf.df import int3c2e, df from gpu4pyscf.lib.cupy_helper import (print_mem_info, tag_array, unpack_tril, contract, load_library, take_last2d, cholesky) @@ -88,11 +88,11 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega raise NotImplementedError() mo_coeff = cupy.asarray(mf_grad.base.mo_coeff) mo_occ = cupy.asarray(mf_grad.base.mo_occ) - ao_idx = intopt.ao_idx - dm = take_last2d(dm0, ao_idx) + dm = intopt.sort_orbitals(dm0, axis=[0,1]) orbo = mo_coeff[:,mo_occ>0] * mo_occ[mo_occ>0] ** 0.5 - orbo = orbo[ao_idx, :] + mo_coeff = None + orbo = intopt.sort_orbitals(orbo, axis=[0]) nocc = orbo.shape[-1] # (L|ij) -> rhoj: (L), rhok: (L|oo) @@ -126,8 +126,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega else: int2c_e1 = auxmol.intor('int2c2e_ip1') int2c_e1 = cupy.asarray(int2c_e1) - aux_ao_idx = intopt.aux_ao_idx - rev_aux_idx = numpy.argsort(aux_ao_idx) + auxslices = auxmol.aoslice_by_atom() aux_cart2sph = intopt.aux_cart2sph low_t = low.T.copy() @@ -141,7 +140,8 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega rhoj_cart = contract('pq,q->p', aux_cart2sph, rhoj) else: rhoj_cart = rhoj - rhoj = rhoj[rev_aux_idx] + + rhoj = intopt.unsort_orbitals(rhoj, aux_axis=[0]) tmp = contract('xpq,q->xp', int2c_e1, rhoj) vjaux = -contract('xp,p->xp', tmp, rhoj) vjaux_2c = cupy.array([-vjaux[:,p0:p1].sum(axis=1) for p0, p1 in auxslices[:,2:]]) @@ -153,7 +153,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega #rhok = solve_triangular(low_t, rhok, lower=False) rhok = solve_triangular(low_t, rhok.reshape(naux, -1), lower=False, overwrite_b=True).reshape(naux, nocc, nocc) tmp = contract('pij,qij->pq', rhok, rhok) - tmp = take_last2d(tmp, rev_aux_idx) + tmp = intopt.unsort_orbitals(tmp, aux_axis=[0,1]) vkaux = -contract('xpq,pq->xp', int2c_e1, tmp) vkaux_2c = cupy.array([-vkaux[:,p0:p1].sum(axis=1) for p0, p1 in auxslices[:,2:]]) vkaux = tmp = None @@ -166,26 +166,25 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega t0 = log.timer_debug1('rhoj and rhok', *t0) int2c_e1 = None - nao_cart = intopt.mol.nao + nao_cart = intopt._sorted_mol.nao block_size = with_df.get_blksize(nao=nao_cart) intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e') intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size_aux=block_size)#, group_size=block_size) - if not intopt._mol.cart: + dm_cart = dm + orbo_cart = orbo + if not mol.cart: # sph2cart for ao cart2sph = intopt.cart2sph orbo_cart = cart2sph @ orbo dm_cart = cart2sph @ dm @ cart2sph.T - else: - dm_cart = dm - orbo_cart = orbo - dm = orbo = None + dm = orbo = None vj = vk = rhoj_tmp = rhok_tmp = None vjaux = vkaux = None - naux_cart = intopt.auxmol.nao + naux_cart = intopt._sorted_auxmol.nao if with_j: vj = cupy.zeros((3,nao_cart), order='C') vjaux = cupy.zeros((3,naux_cart)) @@ -193,8 +192,8 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega vk = cupy.zeros((3,nao_cart), order='C') vkaux = cupy.zeros((3,naux_cart)) cupy.get_default_memory_pool().free_all_blocks() + t1 = log.init_timer() for cp_kl_id in range(len(intopt.aux_log_qs)): - t1 = log.init_timer() k0, k1 = intopt.cart_aux_loc[cp_kl_id], intopt.cart_aux_loc[cp_kl_id+1] assert k1-k0 <= block_size 
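+ # each pass handles one block [k0:k1) of cartesian auxiliary functions; the contractions below accumulate the int3c2e-derivative contributions to vj/vk (AO part) and vjaux/vkaux (auxiliary part) for this block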
if with_j: @@ -233,33 +232,36 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega rhoj_tmp = rhok_tmp = vj_tmp = vk_tmp = None t1 = log.timer_debug1(f'calculate {cp_kl_id:3d} / {len(intopt.aux_log_qs):3d}, {k1-k0:3d} slices', *t1) - - # vj and vk are still in cartesian - cart_ao_idx = intopt.cart_ao_idx - rev_cart_ao_idx = numpy.argsort(cart_ao_idx) - aoslices = intopt.mol.aoslice_by_atom() + + # NOTE: vj and vk are still in cartesian + _sorted_mol = intopt._sorted_mol + natm = _sorted_mol.natm + ao2atom = numpy.zeros([nao_cart, natm]) + ao_loc = _sorted_mol.ao_loc + for ibas, iatm in enumerate(_sorted_mol._bas[:,gto.ATOM_OF]): + ao2atom[ao_loc[ibas]:ao_loc[ibas+1],iatm] = 1 + ao2atom = cupy.asarray(ao2atom) if with_j: - vj = vj[:, rev_cart_ao_idx] - vj = [-vj[:,p0:p1].sum(axis=1) for p0, p1 in aoslices[:,2:]] - vj = cupy.asarray(vj) + vj = -ao2atom.T @ vj.T if with_k: - vk = vk[:, rev_cart_ao_idx] - vk = [-vk[:,p0:p1].sum(axis=1) for p0, p1 in aoslices[:,2:]] - vk = cupy.asarray(vk) + vk = -ao2atom.T @ vk.T t0 = log.timer_debug1('(di,j|P) and (i,j|dP)', *t0) - cart_aux_idx = intopt.cart_aux_idx - rev_cart_aux_idx = numpy.argsort(cart_aux_idx) - auxslices = intopt.auxmol.aoslice_by_atom() + _sorted_auxmol = intopt._sorted_auxmol + natm = _sorted_auxmol.natm + aux2atom = numpy.zeros([naux_cart, natm]) + ao_loc = _sorted_auxmol.ao_loc + for ibas, iatm in enumerate(_sorted_auxmol._bas[:,gto.ATOM_OF]): + aux2atom[ao_loc[ibas]:ao_loc[ibas+1],iatm] = 1 + aux2atom = cupy.asarray(aux2atom) if with_j: - vjaux = vjaux[:, rev_cart_aux_idx] - vjaux_3c = cupy.asarray([-vjaux[:,p0:p1].sum(axis=1) for p0, p1 in auxslices[:,2:]]) - vjaux = vjaux_2c + vjaux_3c + vjaux_3c = aux2atom.T @ vjaux.T + vjaux = vjaux_2c - vjaux_3c if with_k: - vkaux = vkaux[:, rev_cart_aux_idx] - vkaux_3c = cupy.asarray([-vkaux[:,p0:p1].sum(axis=1) for p0, p1 in auxslices[:,2:]]) - vkaux = vkaux_2c + vkaux_3c + vkaux_3c = aux2atom.T @ vkaux.T + vkaux = vkaux_2c - vkaux_3c + return vj, vk, vjaux, vkaux @@ -303,4 +305,4 @@ def extra_force(self, atom_id, envs): else: return 0 -Grad = Gradients +Grad = Gradients \ No newline at end of file diff --git a/gpu4pyscf/df/grad/uhf.py b/gpu4pyscf/df/grad/uhf.py index c19cc3d6..5dcb7c23 100644 --- a/gpu4pyscf/df/grad/uhf.py +++ b/gpu4pyscf/df/grad/uhf.py @@ -17,7 +17,7 @@ import cupy import copy from cupyx.scipy.linalg import solve_triangular -from pyscf import scf +from pyscf import scf, gto from gpu4pyscf.df import int3c2e from gpu4pyscf.lib.cupy_helper import tag_array, contract, load_library, take_last2d from gpu4pyscf.grad import uhf as uhf_grad @@ -68,13 +68,14 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega mo_coeff = cupy.asarray(mf_grad.base.mo_coeff) if mo_occ is None: mo_occ = cupy.asarray(mf_grad.base.mo_occ) - ao_idx = intopt.ao_idx - dm = take_last2d(dm0, ao_idx) + + dm = intopt.sort_orbitals(dm0, axis=[0,1]) if dm2 is not None: - dm2_tmp = take_last2d(dm2, ao_idx) + dm2_tmp = intopt.sort_orbitals(dm2, axis=[0,1]) + # (L|ij) -> rhoj: (L), rhok: (L|oo) orbo = mo_coeff[:,mo_occ>0] * mo_occ[mo_occ>0] ** 0.5 - orbo = orbo[ao_idx, :] + orbo = intopt.sort_orbitals(orbo, axis=[0]) nocc = orbo.shape[-1] # (L|ij) -> rhoj: (L), rhok: (L|oo) @@ -115,8 +116,6 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega else: int2c_e1 = auxmol.intor('int2c2e_ip1') int2c_e1 = cupy.asarray(int2c_e1) - aux_ao_idx = intopt.aux_ao_idx - rev_aux_idx = np.argsort(aux_ao_idx) auxslices = 
auxmol.aoslice_by_atom() aux_cart2sph = intopt.aux_cart2sph low_t = low.T.copy() @@ -133,11 +132,11 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega rhoj_cart = contract('pq,q->p', aux_cart2sph, rhoj) else: rhoj_cart = rhoj - - rhoj = rhoj[rev_aux_idx] + rhoj = intopt.unsort_orbitals(rhoj, aux_axis=[0]) if dm2 is not None: - rhoj2 = rhoj2[rev_aux_idx] + rhoj2 = intopt.unsort_orbitals(rhoj2, aux_axis=[0]) + tmp = contract('xpq,q->xp', int2c_e1, rhoj) if dm2 is not None: vjaux = -contract('xp,p->xp', tmp, rhoj2) @@ -151,7 +150,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega elif low.tag == 'cd': rhok = solve_triangular(low_t, rhok.reshape(naux, -1), lower=False, overwrite_b=True).reshape(naux, nocc, nocc) tmp = contract('pij,qij->pq', rhok, rhok) - tmp = take_last2d(tmp, rev_aux_idx) + tmp = intopt.unsort_orbitals(tmp, aux_axis=[0,1]) vkaux = -contract('xpq,pq->xp', int2c_e1, tmp) vkaux_2c = cupy.array([-vkaux[:,p0:p1].sum(axis=1) for p0, p1 in auxslices[:,2:]]) vkaux = tmp = None @@ -164,33 +163,34 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega t0 = log.timer_debug1('rhoj and rhok', *t0) int2c_e1 = None - nao_cart = intopt.mol.nao + nao_cart = intopt._sorted_mol.nao block_size = with_df.get_blksize(nao=nao_cart) intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e') intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size_aux=block_size)#, group_size=block_size) - if not intopt._mol.cart: + + if not mol.cart: # sph2cart for ao cart2sph = intopt.cart2sph orbo_cart = cart2sph @ orbo if dm2 is None: dm_cart = cart2sph @ dm @ cart2sph.T else: - dm2_tmp = take_last2d(dm2, ao_idx) + dm2_tmp = intopt.sort_orbitals(dm2, axis=[0,1]) dm_cart = cart2sph @ dm2_tmp @ cart2sph.T else: if dm2 is None: dm_cart = dm else: - dm_cart = take_last2d(dm2, ao_idx) + dm_cart = intopt.sort_orbitals(dm2, axis=[0,1]) orbo_cart = orbo dm = orbo = None vj = vk = rhoj_tmp = rhok_tmp = None vjaux = vkaux = None - naux_cart = intopt.auxmol.nao + naux_cart = intopt._sorted_auxmol.nao if with_j: vj = cupy.zeros((3,nao_cart), order='C') vjaux = cupy.zeros((3,naux_cart)) @@ -198,8 +198,8 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega vk = cupy.zeros((3,nao_cart), order='C') vkaux = cupy.zeros((3,naux_cart)) cupy.get_default_memory_pool().free_all_blocks() + t1 = log.init_timer() for cp_kl_id in range(len(intopt.aux_log_qs)): - t1 = log.init_timer() k0, k1 = intopt.cart_aux_loc[cp_kl_id], intopt.cart_aux_loc[cp_kl_id+1] assert k1-k0 <= block_size if with_j: @@ -239,32 +239,34 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega rhoj_tmp = rhok_tmp = vj_tmp = vk_tmp = None t1 = log.timer_debug1(f'calculate {cp_kl_id:3d} / {len(intopt.aux_log_qs):3d}, {k1-k0:3d} slices', *t1) - cart_ao_idx = intopt.cart_ao_idx - rev_cart_ao_idx = np.argsort(cart_ao_idx) - aoslices = intopt.mol.aoslice_by_atom() + # NOTE: vj and vk are still in cartesian + _sorted_mol = intopt._sorted_mol + natm = _sorted_mol.natm + ao2atom = np.zeros([nao_cart, natm]) + ao_loc = _sorted_mol.ao_loc + for ibas, iatm in enumerate(_sorted_mol._bas[:,gto.ATOM_OF]): + ao2atom[ao_loc[ibas]:ao_loc[ibas+1],iatm] = 1 + ao2atom = cupy.asarray(ao2atom) if with_j: - vj = vj[:, rev_cart_ao_idx] - vj = [-vj[:,p0:p1].sum(axis=1) for p0, p1 in aoslices[:,2:]] - vj = cupy.asarray(vj) + vj = -ao2atom.T @ vj.T if with_k: - vk = vk[:, rev_cart_ao_idx] - vk = [-vk[:,p0:p1].sum(axis=1) for p0, 
p1 in aoslices[:,2:]] - vk = cupy.asarray(vk) + vk = -ao2atom.T @ vk.T t0 = log.timer_debug1('(di,j|P) and (i,j|dP)', *t0) - cart_aux_idx = intopt.cart_aux_idx - rev_cart_aux_idx = np.argsort(cart_aux_idx) - auxslices = intopt.auxmol.aoslice_by_atom() - + _sorted_auxmol = intopt._sorted_auxmol + natm = _sorted_auxmol.natm + aux2atom = np.zeros([naux_cart, natm]) + ao_loc = _sorted_auxmol.ao_loc + for ibas, iatm in enumerate(_sorted_auxmol._bas[:,gto.ATOM_OF]): + aux2atom[ao_loc[ibas]:ao_loc[ibas+1],iatm] = 1 + aux2atom = cupy.asarray(aux2atom) if with_j: - vjaux = vjaux[:, rev_cart_aux_idx] - vjaux_3c = cupy.asarray([-vjaux[:,p0:p1].sum(axis=1) for p0, p1 in auxslices[:,2:]]) - vjaux = vjaux_2c + vjaux_3c + vjaux_3c = aux2atom.T @ vjaux.T + vjaux = vjaux_2c - vjaux_3c if with_k: - vkaux = vkaux[:, rev_cart_aux_idx] - vkaux_3c = cupy.asarray([-vkaux[:,p0:p1].sum(axis=1) for p0, p1 in auxslices[:,2:]]) - vkaux = vkaux_2c + vkaux_3c + vkaux_3c = aux2atom.T @ vkaux.T + vkaux = vkaux_2c - vkaux_3c return vj, vk, vjaux, vkaux class Gradients(uhf_grad.Gradients): diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py index cc669174..b09e41af 100644 --- a/gpu4pyscf/df/hessian/rhf.py +++ b/gpu4pyscf/df/hessian/rhf.py @@ -96,19 +96,17 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, # ================================ sorted AO begin =============================================== intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e') intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE) - ao_idx = intopt.ao_idx - aux_ao_idx = intopt.aux_ao_idx - naux = len(aux_ao_idx) - mocc_2 = mocc_2[ao_idx, :] - dm0 = take_last2d(dm0, ao_idx) + naux = auxmol.nao #len(aux_ao_idx) + mocc_2 = intopt.sort_orbitals(mocc_2, axis=[0]) + dm0 = intopt.sort_orbitals(dm0, axis=[0,1]) dm0_tag = tag_array(dm0, occ_coeff=mocc_2) int2c = cupy.asarray(int2c, order='C') - int2c = take_last2d(int2c, aux_ao_idx) + int2c = intopt.sort_orbitals(int2c, aux_axis=[0,1]) solve_j2c = _gen_metric_solver(int2c) int2c_ip1 = cupy.asarray(int2c_ip1, order='C') - int2c_ip1 = take_last2d(int2c_ip1, aux_ao_idx) + int2c_ip1 = intopt.sort_orbitals(int2c_ip1, aux_axis=[1,2]) hj_ao_ao = cupy.zeros([nao,nao,3,3]) hk_ao_ao = cupy.zeros([nao,nao,3,3]) @@ -255,7 +253,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, else: int2c_ipip1 = auxmol.intor('int2c2e_ipip1', aosym='s1') int2c_ipip1 = cupy.asarray(int2c_ipip1, order='C') - int2c_ipip1 = take_last2d(int2c_ipip1, aux_ao_idx) + int2c_ipip1 = intopt.sort_orbitals(int2c_ipip1, aux_axis=[1,2]) rhoj2c_P = contract('xpq,q->xp', int2c_ipip1, rhoj0_P) # (00|0)(2|0)(0|00) # p,xp->px @@ -271,7 +269,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, else: int2c_ip1ip2 = auxmol.intor('int2c2e_ip1ip2', aosym='s1') int2c_ip1ip2 = cupy.asarray(int2c_ip1ip2, order='C') - int2c_ip1ip2 = take_last2d(int2c_ip1ip2, aux_ao_idx) + int2c_ip1ip2 = intopt.sort_orbitals(int2c_ip1ip2, aux_axis=[1,2]) hj_aux_aux = -.5 * contract('p,xpq->pqx', rhoj0_P, int2c_ip1ip2*rhoj0_P).reshape(naux, naux,3,3) if with_k: hk_aux_aux = -.5 * contract('xpq,pq->pqx', int2c_ip1ip2, rho2c_0).reshape(naux,naux,3,3) @@ -329,29 +327,22 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, rho2c_10= int2c_ip1_inv = None t1 = log.timer_debug1('contract int2c_*', *t1) - ao_idx = np.argsort(intopt.ao_idx) - aux_idx = np.argsort(intopt.aux_ao_idx) - rev_ao_ao = cupy.ix_(ao_idx, 
ao_idx) - dm0 = dm0[rev_ao_ao] - hj_ao_diag = hj_ao_diag[ao_idx] - hj_ao_ao = hj_ao_ao[rev_ao_ao] + dm0 = intopt.unsort_orbitals(dm0, axis=[0,1]) + hj_ao_diag = intopt.unsort_orbitals(hj_ao_diag, axis=[0]) + hj_ao_ao = intopt.unsort_orbitals(hj_ao_ao, axis=[0,1]) if hessobj.auxbasis_response: - rev_ao_aux = cupy.ix_(ao_idx, aux_idx) - hj_ao_aux = hj_ao_aux[rev_ao_aux] + hj_ao_aux = intopt.unsort_orbitals(hj_ao_aux, axis=[0], aux_axis=[1]) if hessobj.auxbasis_response > 1: - rev_aux_aux = cupy.ix_(aux_idx, aux_idx) - hj_aux_diag = hj_aux_diag[aux_idx] - hj_aux_aux = hj_aux_aux[rev_aux_aux] - + hj_aux_diag = intopt.unsort_orbitals(hj_aux_diag, aux_axis=[0]) + hj_aux_aux = intopt.unsort_orbitals(hj_aux_aux, aux_axis=[0,1]) if with_k: - hk_ao_diag = hk_ao_diag[ao_idx] - hk_ao_ao = hk_ao_ao[rev_ao_ao] + hk_ao_diag = intopt.unsort_orbitals(hk_ao_diag, axis=[0]) + hk_ao_ao = intopt.unsort_orbitals(hk_ao_ao, axis=[0,1]) if hessobj.auxbasis_response: - hk_ao_aux = hk_ao_aux[rev_ao_aux] + hk_ao_aux = intopt.unsort_orbitals(hk_ao_aux, axis=[0], aux_axis=[1]) if hessobj.auxbasis_response > 1: - hk_aux_diag = hk_aux_diag[aux_idx] - hk_aux_aux = hk_aux_aux[rev_aux_aux] - + hk_aux_diag = intopt.unsort_orbitals(hk_aux_diag, aux_axis=[0]) + hk_aux_aux = intopt.unsort_orbitals(hk_aux_aux, aux_axis=[0,1]) #======================================== sort AO end =========================================== # Energy weighted density matrix # pi,qi,i->pq @@ -460,7 +451,6 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, mo_occ = cupy.asarray(mo_occ, order='C') mf = hessobj.base - #auxmol = hessobj.base.with_df.auxmol auxmol = df.addons.make_auxmol(mol, auxbasis=mf.with_df.auxbasis) aoslices = mol.aoslice_by_atom() auxslices = auxmol.aoslice_by_atom() @@ -486,16 +476,14 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, aosym=False, group_size_aux=BLKSIZE, group_size=BLKSIZE) - ao_idx = intopt.ao_idx - aux_ao_idx = intopt.aux_ao_idx - naux = len(aux_ao_idx) - mocc = mocc[ao_idx, :] + naux = auxmol.nao + mocc = intopt.sort_orbitals(mocc, axis=[0]) nocc = mocc.shape[1] - mo_coeff = mo_coeff[ao_idx,:] - dm0 = take_last2d(dm0, ao_idx) + mo_coeff = intopt.sort_orbitals(mo_coeff, axis=[0]) + dm0 = intopt.sort_orbitals(dm0, axis=[0,1]) dm0_tag = tag_array(dm0, occ_coeff=mocc) - - int2c = take_last2d(int2c, aux_ao_idx) + + int2c = intopt.sort_orbitals(int2c, aux_axis=[0,1]) solve_j2c = _gen_metric_solver(int2c) wj, wk_Pl_ = int3c2e.get_int3c2e_wjk(mol, auxmol, dm0_tag, omega=omega) rhoj0 = solve_j2c(wj) @@ -530,7 +518,7 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, else: int2c_ip1 = auxmol.intor('int2c2e_ip1', aosym='s1') int2c_ip1 = cupy.asarray(int2c_ip1, order='C') - int2c_ip1 = take_last2d(int2c_ip1, aux_ao_idx) + int2c_ip1 = intopt.sort_orbitals(int2c_ip1, aux_axis=[1,2]) # Generate rhok0_P__ if isinstance(rhok0_Pl_, cupy.ndarray): @@ -583,17 +571,17 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, rhoj0 = rhok0_Pl_ = None vk1_ao *= 2.0 vk1_buf *= 2.0 - rev_ao_idx = np.argsort(ao_idx) - vj1_buf = take_last2d(vj1_buf, rev_ao_idx) - vk1_buf = take_last2d(vk1_buf, rev_ao_idx) + + vj1_buf = intopt.unsort_orbitals(vj1_buf, axis=[1,2]) + vk1_buf = intopt.unsort_orbitals(vk1_buf, axis=[1,2]) vj1_int3c_ip1 = -contract('nxiq,ip->nxpq', vj1_ao, mo_coeff) vk1_int3c_ip1 = -contract('nxiq,ip->nxpq', vk1_ao, mo_coeff) vj1_ao = vk1_ao = None t0 = log.timer_debug1('Fock matrix due to int3c2e_ip1', *t0) - mocc = mocc[rev_ao_idx] - mo_coeff = 
mo_coeff[rev_ao_idx] + mocc = intopt.unsort_orbitals(mocc, axis=[0]) + mo_coeff = intopt.unsort_orbitals(mo_coeff, axis=[0]) release_gpu_stack() # ========================== sorted AO end ================================ diff --git a/gpu4pyscf/df/hessian/rks.py b/gpu4pyscf/df/hessian/rks.py index 468a0add..014142fa 100644 --- a/gpu4pyscf/df/hessian/rks.py +++ b/gpu4pyscf/df/hessian/rks.py @@ -55,7 +55,7 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, raise NotImplementedError omega, alpha, hyb = mf._numint.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin) - with_k = abs(hyb) > 1e-10 + with_k = mf._numint.libxc.is_hybrid_xc(mf.xc) de2, ej, ek = df_rhf_hess._partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ, atmlst, max_memory, verbose, with_k=with_k) @@ -98,11 +98,12 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): mem_now = lib.current_memory()[0] max_memory = max(2000, mf.max_memory*.9-mem_now) h1mo = rks_hess._get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory) + with_k = ni.libxc.is_hybrid_xc(mf.xc) for ia, h1, vj1, vk1 in df_rhf_hess._gen_jk(hessobj, mo_coeff, mo_occ, chkfile, - atmlst, verbose, abs(hyb) > 1e-10): + atmlst, verbose, with_k): h1mo[ia] += h1 + vj1 - if abs(hyb) > 1e-10 or abs(alpha-hyb) > 1e-10: + if with_k: h1mo[ia] -= .5 * hyb * vk1 if abs(omega) > 1e-10 and abs(alpha-hyb) > 1e-10: for ia, h1, vj1_lr, vk1_lr in df_rhf_hess._gen_jk(hessobj, mo_coeff, mo_occ, chkfile, diff --git a/gpu4pyscf/df/hessian/tests/test_df_uks_hessian.py b/gpu4pyscf/df/hessian/tests/test_df_uks_hessian.py index 490608e6..c39f0172 100644 --- a/gpu4pyscf/df/hessian/tests/test_df_uks_hessian.py +++ b/gpu4pyscf/df/hessian/tests/test_df_uks_hessian.py @@ -83,6 +83,7 @@ def test_df_gga(self): mf = mf.to_gpu() hessobj = mf.Hessian() + hessobj.base.cphf_grids = hessobj.base.grids hess_gpu = hessobj.kernel() assert numpy.linalg.norm(hess_cpu - hess_gpu) < 1e-5 @@ -98,9 +99,11 @@ def test_df_mgga(self): mf = mf.to_gpu() hessobj = mf.Hessian() + hessobj.base.cphf_grids = hessobj.base.grids hess_gpu = hessobj.kernel() assert numpy.linalg.norm(hess_cpu - hess_gpu) < 1e-5 if __name__ == "__main__": print("Full Tests for DF UKS Hessian") - unittest.main() \ No newline at end of file + unittest.main() + \ No newline at end of file diff --git a/gpu4pyscf/df/hessian/uhf.py b/gpu4pyscf/df/hessian/uhf.py index a66b6557..71a6c7dc 100644 --- a/gpu4pyscf/df/hessian/uhf.py +++ b/gpu4pyscf/df/hessian/uhf.py @@ -100,23 +100,23 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, # ================================ sorted AO begin =============================================== intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e') intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE) - ao_idx = intopt.ao_idx - aux_ao_idx = intopt.aux_ao_idx - mocca = mocca[ao_idx, :] - moccb = moccb[ao_idx, :] - dm0a = take_last2d(dm0a, ao_idx) - dm0b = take_last2d(dm0b, ao_idx) + mocca = intopt.sort_orbitals(mocca, axis=[0]) + moccb = intopt.sort_orbitals(moccb, axis=[0]) + dm0a = intopt.sort_orbitals(dm0a, axis=[0,1]) + dm0b = intopt.sort_orbitals(dm0b, axis=[0,1]) + dm0a_tag = tag_array(dm0a, occ_coeff=mocca) dm0b_tag = tag_array(dm0b, occ_coeff=moccb) int2c = cupy.asarray(int2c, order='C') - int2c = take_last2d(int2c, aux_ao_idx) + int2c = intopt.sort_orbitals(int2c, aux_axis=[0,1]) + int2c_inv = pinv(int2c, lindep=LINEAR_DEP_THR) solve_j2c = _gen_metric_solver(int2c) int2c = None int2c_ip1 = 
cupy.asarray(int2c_ip1, order='C') - int2c_ip1 = take_last2d(int2c_ip1, aux_ao_idx) + int2c_ip1 = intopt.sort_orbitals(int2c_ip1, aux_axis=[1,2]) hj_ao_ao = cupy.zeros([nao,nao,3,3]) hk_ao_ao = cupy.zeros([nao,nao,3,3]) @@ -272,7 +272,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, else: int2c_ipip1 = auxmol.intor('int2c2e_ipip1', aosym='s1') int2c_ipip1 = cupy.asarray(int2c_ipip1, order='C') - int2c_ipip1 = take_last2d(int2c_ipip1, aux_ao_idx) + int2c_ipip1 = intopt.sort_orbitals(int2c_ipip1, aux_axis=[1,2]) rhoj2c_P = contract('xpq,q->xp', int2c_ipip1, rhoj0_P) # (00|0)(2|0)(0|00) # p,xp->px @@ -289,7 +289,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, else: int2c_ip1ip2 = auxmol.intor('int2c2e_ip1ip2', aosym='s1') int2c_ip1ip2 = cupy.asarray(int2c_ip1ip2, order='C') - int2c_ip1ip2 = take_last2d(int2c_ip1ip2, aux_ao_idx) + int2c_ip1ip2 = intopt.sort_orbitals(int2c_ip1ip2, aux_axis=[1,2]) hj_aux_aux = -.5 * contract('p,xpq->pqx', rhoj0_P, int2c_ip1ip2*rhoj0_P).reshape(naux, naux,3,3) if with_k: hk_aux_aux = -.5 * contract('xpq,pq->pqx', int2c_ip1ip2, rho2c_0).reshape(naux,naux,3,3) @@ -349,32 +349,23 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, rho2c_10= int2c_ip1_inv = None t1 = log.timer_debug1('contract int2c_*', *t1) - ao_idx = np.argsort(intopt.ao_idx) - aux_idx = np.argsort(intopt.aux_ao_idx) - rev_ao_ao = cupy.ix_(ao_idx, ao_idx) - #dm0 = dm0[rev_ao_ao] - hj_ao_diag = hj_ao_diag[ao_idx] - hj_ao_ao = hj_ao_ao[rev_ao_ao] + hj_ao_diag = intopt.unsort_orbitals(hj_ao_diag, axis=[0]) + hj_ao_ao = intopt.unsort_orbitals(hj_ao_ao, axis=[0,1]) if hessobj.auxbasis_response: - rev_ao_aux = cupy.ix_(ao_idx, aux_idx) - hj_ao_aux = hj_ao_aux[rev_ao_aux] + hj_ao_aux = intopt.unsort_orbitals(hj_ao_aux, axis=[0], aux_axis=[1]) if hessobj.auxbasis_response > 1: - rev_aux_aux = cupy.ix_(aux_idx, aux_idx) - hj_aux_diag = hj_aux_diag[aux_idx] - hj_aux_aux = hj_aux_aux[rev_aux_aux] - + hj_aux_diag = intopt.unsort_orbitals(hj_aux_diag, aux_axis=[0]) + hj_aux_aux = intopt.unsort_orbitals(hj_aux_aux, aux_axis=[0,1]) if with_k: - hk_ao_diag = hk_ao_diag[ao_idx] - hk_ao_ao = hk_ao_ao[rev_ao_ao] + hk_ao_diag = intopt.unsort_orbitals(hk_ao_diag, axis=[0]) + hk_ao_ao = intopt.unsort_orbitals(hk_ao_ao, axis=[0,1]) if hessobj.auxbasis_response: - hk_ao_aux = hk_ao_aux[rev_ao_aux] + hk_ao_aux = intopt.unsort_orbitals(hk_ao_aux, axis=[0], aux_axis=[1]) if hessobj.auxbasis_response > 1: - hk_aux_diag = hk_aux_diag[aux_idx] - hk_aux_aux = hk_aux_aux[rev_aux_aux] - - mocca = mocca[ao_idx] - moccb = moccb[ao_idx] - + hk_aux_diag = intopt.unsort_orbitals(hk_aux_diag, aux_axis=[0]) + hk_aux_aux = intopt.unsort_orbitals(hk_aux_aux, aux_axis=[0,1]) + mocca = intopt.unsort_orbitals(mocca, axis=[0]) + moccb = intopt.unsort_orbitals(moccb, axis=[0]) #======================================== sort AO end =========================================== # Energy weighted density matrix # pi,qi,i->pq @@ -517,17 +508,15 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, aosym=False, group_size_aux=BLKSIZE, group_size=BLKSIZE) - ao_idx = intopt.ao_idx - aux_ao_idx = intopt.aux_ao_idx - - mocca = mocca[ao_idx, :] - moccb = moccb[ao_idx, :] - mo_coeff = mo_coeff[:, ao_idx,:] - dm0a = take_last2d(dm0a, ao_idx) - dm0b = take_last2d(dm0b, ao_idx) + + mocca = intopt.sort_orbitals(mocca, axis=[0]) + moccb = intopt.sort_orbitals(moccb, axis=[0]) + mo_coeff = intopt.sort_orbitals(mo_coeff, axis=[1]) + dm0a = 
intopt.sort_orbitals(dm0a, axis=[0,1]) + dm0b = intopt.sort_orbitals(dm0b, axis=[0,1]) dm0 = dm0a + dm0b - int2c = take_last2d(int2c, aux_ao_idx) + int2c = intopt.sort_orbitals(int2c, aux_axis=[0,1]) solve_j2c = _gen_metric_solver(int2c) int2c = None @@ -567,10 +556,10 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, vj1_buf, vk1a_buf, vj1a_ao, vk1a_ao = fn(intopt, rhoj0, rhok0a_Pl_, dm0_tag, aoslices, omega=omega) dm0_tag = tag_array(dm0, occ_coeff=moccb) vj1_buf, vk1b_buf, vj1b_ao, vk1b_ao = fn(intopt, rhoj0, rhok0b_Pl_, dm0_tag, aoslices, omega=omega) - rev_ao_idx = np.argsort(ao_idx) - vj1_buf = take_last2d(vj1_buf, rev_ao_idx) - vk1a_buf = take_last2d(vk1a_buf, rev_ao_idx) - vk1b_buf = take_last2d(vk1b_buf, rev_ao_idx) + + vj1_buf = intopt.unsort_orbitals(vj1_buf, axis=[1,2]) + vk1a_buf = intopt.unsort_orbitals(vk1a_buf, axis=[1,2]) + vk1b_buf = intopt.unsort_orbitals(vk1b_buf, axis=[1,2]) vj1a_int3c = -contract('nxiq,ip->nxpq', vj1a_ao, mo_coeff[0]) vj1b_int3c = -contract('nxiq,ip->nxpq', vj1b_ao, mo_coeff[1]) @@ -597,13 +586,13 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, else: int2c_ip1 = auxmol.intor('int2c2e_ip1', aosym='s1') int2c_ip1 = cupy.asarray(int2c_ip1, order='C') - int2c_ip1 = take_last2d(int2c_ip1, aux_ao_idx) + int2c_ip1 = intopt.sort_orbitals(int2c_ip1, aux_axis=[1,2]) # generate rhok0_P__ if isinstance(rhok0a_Pl_, cupy.ndarray): rhok0a_P__ = contract('pio,ir->pro', rhok0a_Pl_, mocca) else: - naux = len(aux_ao_idx) + naux = auxmol.nao nocc = mocca.shape[1] rhok0a_P__ = cupy.empty([naux,nocc,nocc]) for p0, p1 in lib.prange(0,naux,64): @@ -615,7 +604,7 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, if isinstance(rhok0b_Pl_, cupy.ndarray): rhok0b_P__ = contract('pio,ir->pro', rhok0b_Pl_, moccb) else: - naux = len(aux_ao_idx) + naux = auxmol.nao nocc = moccb.shape[1] rhok0b_P__ = cupy.empty([naux,nocc,nocc]) for p0, p1 in lib.prange(0,naux,64): @@ -670,9 +659,9 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, vk1a_int3c_ip2 = vk1b_int3c_ip2 = None t0 = log.timer_debug1('Fock matrix due to int3c2e_ip2', *t0) - mocca = mocca[rev_ao_idx] - moccb = moccb[rev_ao_idx] - mo_coeff = mo_coeff[:,rev_ao_idx] + mocca = intopt.unsort_orbitals(mocca, axis=[0]) + moccb = intopt.unsort_orbitals(moccb, axis=[0]) + mo_coeff = intopt.unsort_orbitals(mo_coeff, axis=[1]) release_gpu_stack() # ========================== sorted AO end ================================ diff --git a/gpu4pyscf/df/hessian/uks.py b/gpu4pyscf/df/hessian/uks.py index 9ab957be..3a4dbd52 100644 --- a/gpu4pyscf/df/hessian/uks.py +++ b/gpu4pyscf/df/hessian/uks.py @@ -57,7 +57,7 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, raise NotImplementedError omega, alpha, hyb = mf._numint.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin) - with_k = abs(hyb) > 1e-10 + with_k = mf._numint.libxc.is_hybrid_xc(mf.xc) de2, ej, ek = df_uhf_hess._partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ, atmlst, max_memory, verbose, with_k=with_k) @@ -103,13 +103,14 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): mem_now = lib.current_memory()[0] max_memory = max(2000, mf.max_memory*.9-mem_now) h1moa, h1mob = uks_hess._get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory) + with_k = ni.libxc.is_hybrid_xc(mf.xc) for ia, h1, vj1, vk1 in df_uhf_hess._gen_jk(hessobj, mo_coeff, mo_occ, chkfile, - atmlst, verbose, abs(hyb) > 1e-10): + atmlst, verbose, with_k): h1moa[ia] += h1[0] + vj1[0] h1mob[ia] += h1[1] 
+ vj1[1] - if abs(hyb) > 1e-10 or abs(alpha-hyb) > 1e-10: + if with_k: vk1a, vk1b = vk1 h1moa[ia] -= hyb * vk1a h1mob[ia] -= hyb * vk1b diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py index f2aa0a5a..834c587c 100644 --- a/gpu4pyscf/df/int3c2e.py +++ b/gpu4pyscf/df/int3c2e.py @@ -64,19 +64,13 @@ def make_fake_mol(): class VHFOpt(_vhf.VHFOpt): def __init__(self, mol, auxmol, intor, prescreen='CVHFnoscreen', qcondname='CVHFsetnr_direct_scf', dmcondname=None): - # use local basis_seg_contraction for efficiency - # TODO: switch _mol and mol - self.mol = basis_seg_contraction(mol,allow_replica=True) - self.auxmol = basis_seg_contraction(auxmol, allow_replica=True) - self._mol = mol - self._auxmol = auxmol + self.mol = mol # original mol + self.auxmol = auxmol # original auxiliary mol + self._sorted_mol = None # sorted mol + self._sorted_auxmol = None # sorted auxiliary mol - ''' - # Note mol._bas will be sorted in .build() method. VHFOpt should be - # initialized after mol._bas updated. - ''' - self.nao = self.mol.nao - self.naux = self.auxmol.nao + self._ao_idx = None + self._aux_ao_idx = None self._intor = intor self._prescreen = prescreen @@ -85,11 +79,6 @@ def __init__(self, mol, auxmol, intor, prescreen='CVHFnoscreen', self.bpcache = None - self.cart_ao_idx = None - self.sph_ao_idx = None - self.cart_aux_idx = None - self.sph_aux_idx = None - self.cart_ao_loc = [] self.cart_aux_loc = [] self.sph_ao_loc = [] @@ -128,14 +117,16 @@ def build(self, cutoff=1e-14, group_size=None, a tot_mol is created with concatenating [mol, fake_mol, aux_mol] we will pair (ao,ao) and (aux,1) separately. ''' - _mol = self._mol - _auxmol = self._auxmol - mol = self.mol - auxmol = self.auxmol + _mol = self.mol + _auxmol = self.auxmol + mol = basis_seg_contraction(_mol,allow_replica=True) + auxmol = basis_seg_contraction(_auxmol, allow_replica=True) + log = logger.new_logger(_mol, _mol.verbose) cput0 = log.init_timer() - sorted_mol, sorted_idx, uniq_l_ctr, l_ctr_counts = sort_mol(mol, log=log) + _sorted_mol, sorted_idx, uniq_l_ctr, l_ctr_counts = sort_mol(mol, log=log) + if group_size is not None : uniq_l_ctr, l_ctr_counts = _split_l_ctr_groups(uniq_l_ctr, l_ctr_counts, group_size) self.nctr = len(uniq_l_ctr) @@ -145,16 +136,16 @@ def build(self, cutoff=1e-14, group_size=None, _, _, fake_uniq_l_ctr, fake_l_ctr_counts = sort_mol(fake_mol, log=log) # sort auxiliary mol - sorted_auxmol, sorted_aux_idx, aux_uniq_l_ctr, aux_l_ctr_counts = sort_mol(auxmol, log=log) + _sorted_auxmol, sorted_aux_idx, aux_uniq_l_ctr, aux_l_ctr_counts = sort_mol(auxmol, log=log) if group_size_aux is not None: aux_uniq_l_ctr, aux_l_ctr_counts = _split_l_ctr_groups(aux_uniq_l_ctr, aux_l_ctr_counts, group_size_aux) - - tot_mol = sorted_mol + fake_mol + sorted_auxmol - tot_mol.cart = True - self.tot_mol = tot_mol + + _tot_mol = _sorted_mol + fake_mol + _sorted_auxmol + _tot_mol.cart = True + self._tot_mol = _tot_mol # Initialize vhfopt after reordering mol._bas - _vhf.VHFOpt.__init__(self, sorted_mol, self._intor, self._prescreen, + _vhf.VHFOpt.__init__(self, _sorted_mol, self._intor, self._prescreen, self._qcondname, self._dmcondname) self.direct_scf_tol = cutoff @@ -169,32 +160,19 @@ def build(self, cutoff=1e-14, group_size=None, cput1 = log.timer_debug1('Get pairing', *cput1) # contraction coefficient for ao basis - cart_ao_loc = sorted_mol.ao_loc_nr(cart=True) - sph_ao_loc = sorted_mol.ao_loc_nr(cart=False) + cart_ao_loc = _sorted_mol.ao_loc_nr(cart=True) + sph_ao_loc = _sorted_mol.ao_loc_nr(cart=False) self.cart_ao_loc = 
[cart_ao_loc[cp] for cp in l_ctr_offsets] self.sph_ao_loc = [sph_ao_loc[cp] for cp in l_ctr_offsets] self.angular = [l[0] for l in uniq_l_ctr] - cart_ao_loc = mol.ao_loc_nr(cart=True) - sph_ao_loc = mol.ao_loc_nr(cart=False) - nao = sph_ao_loc[-1] - ao_idx = np.array_split(np.arange(nao), sph_ao_loc[1:-1]) - self.sph_ao_idx = np.hstack([ao_idx[i] for i in sorted_idx]) + # Sorted AO indices + ao_loc = mol.ao_loc_nr(cart=_mol.cart) + ao_idx = np.array_split(np.arange(_mol.nao), ao_loc[1:-1]) + self._ao_idx = np.hstack([ao_idx[i] for i in sorted_idx]) # cartesian ao index - nao = cart_ao_loc[-1] - ao_idx = np.array_split(np.arange(nao), cart_ao_loc[1:-1]) - self.cart_ao_idx = np.hstack([ao_idx[i] for i in sorted_idx]) - ncart = cart_ao_loc[-1] - nsph = sph_ao_loc[-1] - self.cart2sph = block_c2s_diag(ncart, nsph, self.angular, l_ctr_counts) - - if _mol.cart: - inv_idx = np.argsort(self.cart_ao_idx, kind='stable').astype(np.int32) - self.coeff = cupy.eye(ncart)[:,inv_idx] - else: - inv_idx = np.argsort(self.sph_ao_idx, kind='stable').astype(np.int32) - self.coeff = self.cart2sph[:, inv_idx] + self.cart2sph = block_c2s_diag(self.angular, l_ctr_counts) cput1 = log.timer_debug1('AO cart2sph coeff', *cput1) # pairing auxiliary basis with fake basis set @@ -203,36 +181,22 @@ def build(self, cutoff=1e-14, group_size=None, aux_l_ctr_offsets = np.append(0, np.cumsum(aux_l_ctr_counts)) # contraction coefficient for auxiliary basis - cart_aux_loc = sorted_auxmol.ao_loc_nr(cart=True) - sph_aux_loc = sorted_auxmol.ao_loc_nr(cart=False) + cart_aux_loc = _sorted_auxmol.ao_loc_nr(cart=True) + sph_aux_loc = _sorted_auxmol.ao_loc_nr(cart=False) self.cart_aux_loc = [cart_aux_loc[cp] for cp in aux_l_ctr_offsets] self.sph_aux_loc = [sph_aux_loc[cp] for cp in aux_l_ctr_offsets] self.aux_angular = [l[0] for l in aux_uniq_l_ctr] - cart_aux_loc = self.auxmol.ao_loc_nr(cart=True) - sph_aux_loc = self.auxmol.ao_loc_nr(cart=False) - naux = sph_aux_loc[-1] - ao_idx = np.array_split(np.arange(naux), sph_aux_loc[1:-1]) - self.sph_aux_idx = np.hstack([ao_idx[i] for i in sorted_aux_idx]) + aux_loc = _auxmol.ao_loc_nr(cart=_auxmol.cart) + ao_idx = np.array_split(np.arange(_auxmol.nao), aux_loc[1:-1]) + self._aux_ao_idx = np.hstack([ao_idx[i] for i in sorted_aux_idx]) # cartesian aux index - naux = cart_aux_loc[-1] - ao_idx = np.array_split(np.arange(naux), cart_aux_loc[1:-1]) - self.cart_aux_idx = np.hstack([ao_idx[i] for i in sorted_aux_idx]) - ncart = cart_aux_loc[-1] - nsph = sph_aux_loc[-1] - self.aux_cart2sph = block_c2s_diag(ncart, nsph, self.aux_angular, aux_l_ctr_counts) - - if _auxmol.cart: - inv_idx = np.argsort(self.cart_aux_idx, kind='stable').astype(np.int32) - self.aux_coeff = cupy.eye(ncart)[:,inv_idx] - else: - inv_idx = np.argsort(self.sph_aux_idx, kind='stable').astype(np.int32) - self.aux_coeff = self.aux_cart2sph[:, inv_idx] + self.aux_cart2sph = block_c2s_diag(self.aux_angular, aux_l_ctr_counts) aux_l_ctr_offsets += fake_l_ctr_offsets[-1] cput1 = log.timer_debug1('aux cart2sph coeff', *cput1) - ao_loc = sorted_mol.ao_loc_nr(cart=_mol.cart) + ao_loc = _sorted_mol.ao_loc_nr(cart=_mol.cart) self.ao_pairs_row, self.ao_pairs_col = get_ao_pairs(pair2bra, pair2ket, ao_loc) cderi_row = cupy.hstack(self.ao_pairs_row) cderi_col = cupy.hstack(self.ao_pairs_col) @@ -268,7 +232,7 @@ def build(self, cutoff=1e-14, group_size=None, bas_pair2shls = np.hstack(pair2bra + pair2ket).astype(np.int32).reshape(2,-1) bas_pairs_locs = np.append(0, np.cumsum([x.size for x in pair2bra])).astype(np.int32) log_qs = log_qs + aux_log_qs 
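+ # ao_loc indexes the concatenated _tot_mol (= _sorted_mol + fake_mol + _sorted_auxmol, forced cartesian) built above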
- ao_loc = tot_mol.ao_loc_nr(cart=True) + ao_loc = _tot_mol.ao_loc_nr(cart=True) ncptype = len(log_qs) self.bpcache = ctypes.POINTER(BasisProdCache)() @@ -278,9 +242,9 @@ def build(self, cutoff=1e-14, group_size=None, ao_loc.ctypes.data_as(ctypes.c_void_p), bas_pair2shls.ctypes.data_as(ctypes.c_void_p), bas_pairs_locs.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(ncptype), - tot_mol._atm.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(tot_mol.natm), - tot_mol._bas.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(tot_mol.nbas), - tot_mol._env.ctypes.data_as(ctypes.c_void_p)) + _tot_mol._atm.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(_tot_mol.natm), + _tot_mol._bas.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(_tot_mol.nbas), + _tot_mol._env.ctypes.data_as(ctypes.c_void_p)) cput1 = log.timer_debug1('Initialize GPU cache', *cput1) self.bas_pairs_locs = bas_pairs_locs @@ -294,25 +258,79 @@ def build(self, cutoff=1e-14, group_size=None, if _mol.cart: self.ao_loc = self.cart_ao_loc - self.ao_idx = self.cart_ao_idx else: self.ao_loc = self.sph_ao_loc - self.ao_idx = self.sph_ao_idx if _auxmol.cart: self.aux_ao_loc = self.cart_aux_loc - self.aux_ao_idx = self.cart_aux_idx else: self.aux_ao_loc = self.sph_aux_loc - self.aux_ao_idx = self.sph_aux_idx - self.rev_ao_idx = np.argsort(self.ao_idx, kind='stable').astype(np.int32) - self.ao_idx = cupy.array(self.ao_idx) - self.cart_ao_idx = cupy.array(self.cart_ao_idx) - self.sph_ao_idx = cupy.array(self.sph_ao_idx) - self.aux_ao_idx = cupy.array(self.aux_ao_idx) - self.cart_aux_idx = cupy.array(self.cart_aux_idx) - self.sph_aux_idx = cupy.array(self.sph_aux_idx) - self.rev_ao_idx = cupy.array(self.rev_ao_idx) + self._sorted_mol = _sorted_mol + self._sorted_auxmol = _sorted_auxmol + + def sort_orbitals(self, mat, axis=[], aux_axis=[]): + ''' Transform given axis of a matrix into sorted AO, + and transform given auxiliary axis of a matrix into sorted auxiliary AO + ''' + idx = self._ao_idx + aux_idx = self._aux_ao_idx + shape_ones = (1,) * mat.ndim + fancy_index = [] + for dim, n in enumerate(mat.shape): + if dim in axis: + assert n == len(idx) + indices = idx + elif dim in aux_axis: + assert n == len(aux_idx) + indices = aux_idx + else: + indices = np.arange(n) + idx_shape = shape_ones[:dim] + (-1,) + shape_ones[dim+1:] + fancy_index.append(indices.reshape(idx_shape)) + return mat[tuple(fancy_index)] + + def unsort_orbitals(self, sorted_mat, axis=[], aux_axis=[]): + ''' Transform given axis of a matrix into original AO, + and transform given auxiliary axis of a matrix into original auxiliary AO + ''' + idx = self._ao_idx + aux_idx = self._aux_ao_idx + shape_ones = (1,) * sorted_mat.ndim + fancy_index = [] + for dim, n in enumerate(sorted_mat.shape): + if dim in axis: + assert n == len(idx) + indices = idx + elif dim in aux_axis: + assert n == len(aux_idx) + indices = aux_idx + else: + indices = np.arange(n) + idx_shape = shape_ones[:dim] + (-1,) + shape_ones[dim+1:] + fancy_index.append(indices.reshape(idx_shape)) + mat = cupy.empty_like(sorted_mat) + mat[tuple(fancy_index)] = sorted_mat + return mat + + @property + def coeff(self): + nao = self.mol.nao + if self.mol.cart: + coeff = cupy.eye(nao) + self._coeff = self.unsort_orbitals(coeff, axis=[1]) + else: + self._coeff = self.unsort_orbitals(self.cart2sph, axis=[1]) + return self._coeff + + @property + def aux_coeff(self): + naux = self.auxmol.nao + if self.auxmol.cart: + coeff = cupy.eye(naux) + self._aux_coeff = self.unsort_orbitals(coeff, aux_axis=[1]) + else: + self._aux_coeff = 
self.unsort_orbitals(self.aux_cart2sph, aux_axis=[1]) + return self._aux_coeff def get_int3c2e_wjk(mol, auxmol, dm0_tag, thred=1e-12, omega=None, with_k=True): log = logger.new_logger(mol, mol.verbose) @@ -351,7 +369,7 @@ def get_int3c2e_wjk(mol, auxmol, dm0_tag, thred=1e-12, omega=None, with_k=True): li = intopt.angular[cpi] lj = intopt.angular[cpj] int3c_blk = get_int3c2e_slice(intopt, cp_ij_id, cp_kl_id, omega=omega) - if not intopt._mol.cart: + if not intopt.mol.cart: int3c_blk = cart2sph(int3c_blk, axis=1, ang=lj) int3c_blk = cart2sph(int3c_blk, axis=2, ang=li) i0, i1 = intopt.ao_loc[cpi], intopt.ao_loc[cpi+1] @@ -378,7 +396,7 @@ def get_int3c2e_ip_jk(intopt, cp_aux_id, ip_type, rhoj, rhok, dm, omega=None): ''' fn = getattr(libgvhf, 'GINTbuild_int3c2e_' + ip_type + '_jk') if omega is None: omega = 0.0 - nao = intopt.mol.nao + nao = intopt._sorted_mol.nao n_dm = 1 cp_kl_id = cp_aux_id + len(intopt.log_qs) @@ -451,19 +469,19 @@ def loop_int3c2e_general(intopt, ip_type='', omega=None, stream=None): if omega is None: omega = 0.0 if stream is None: stream = cupy.cuda.get_current_stream() - nao = intopt.mol.nao - naux = intopt.auxmol.nao + nao = intopt._sorted_mol.nao + naux = intopt._sorted_auxmol.nao norb = nao + naux + 1 ao_loc = intopt.ao_loc aux_ao_loc = intopt.aux_ao_loc comp = 3**order - lmax = intopt.mol._bas[:gto.ANG_OF].max() - aux_lmax = intopt.auxmol._bas[:gto.ANG_OF].max() + lmax = intopt._sorted_mol._bas[:gto.ANG_OF].max() + aux_lmax = intopt._sorted_auxmol._bas[:gto.ANG_OF].max() nroots = (lmax + aux_lmax + order)//2 + 1 if nroots > NROOT_ON_GPU: from pyscf.gto.moleintor import getints, make_cintopt - pmol = intopt.tot_mol + pmol = intopt._tot_mol intor = pmol._add_suffix('int3c2e_' + ip_type) opt = make_cintopt(pmol._atm, pmol._bas, pmol._env, intor) @@ -519,9 +537,9 @@ def loop_int3c2e_general(intopt, ip_type='', omega=None, stream=None): int3c_cpu = getints(intor, pmol._atm, pmol._bas, pmol._env, shls_slice, cintopt=opt).transpose([0,3,2,1]) int3c_blk = cupy.asarray(int3c_cpu) - if not intopt._auxmol.cart: + if not intopt.auxmol.cart: int3c_blk = cart2sph(int3c_blk, axis=1, ang=lk) - if not intopt._mol.cart: + if not intopt.mol.cart: int3c_blk = cart2sph(int3c_blk, axis=2, ang=lj) int3c_blk = cart2sph(int3c_blk, axis=3, ang=li) @@ -550,9 +568,9 @@ def loop_aux_jk(intopt, ip_type='', omega=None, stream=None): if omega is None: omega = 0.0 if stream is None: stream = cupy.cuda.get_current_stream() - nao = len(intopt.ao_idx) - nao_cart = intopt.mol.nao - naux_cart = intopt.auxmol.nao + nao = intopt.mol.nao + nao_cart = intopt._sorted_mol.nao + naux_cart = intopt._sorted_auxmol.nao norb_cart = nao_cart + naux_cart + 1 ao_loc = intopt.ao_loc aux_ao_loc = intopt.aux_ao_loc @@ -615,20 +633,20 @@ def loop_aux_jk(intopt, ip_type='', omega=None, stream=None): yield aux_id, ints_slices def get_ao2atom(intopt, aoslices): - ao_idx = intopt.ao_idx - ao2atom = cupy.zeros([len(ao_idx), len(aoslices)]) + nao = intopt.mol.nao + ao2atom = cupy.zeros([nao, len(aoslices)]) for ia, aoslice in enumerate(aoslices): _, _, p0, p1 = aoslice ao2atom[p0:p1,ia] = 1.0 - return ao2atom[ao_idx,:] + return intopt.sort_orbitals(ao2atom, axis=[0]) def get_aux2atom(intopt, auxslices): - aux_ao_idx = intopt.aux_ao_idx - aux2atom = cupy.zeros([len(aux_ao_idx), len(auxslices)]) + naux = intopt.auxmol.nao + aux2atom = cupy.zeros([naux, len(auxslices)]) for ia, auxslice in enumerate(auxslices): _, _, p0, p1 = auxslice aux2atom[p0:p1,ia] = 1.0 - return aux2atom[aux_ao_idx,:] + return intopt.sort_orbitals(aux2atom, 
aux_axis=[0]) def get_j_int3c2e_pass1(intopt, dm0, sort_j=True): ''' @@ -636,22 +654,24 @@ def get_j_int3c2e_pass1(intopt, dm0, sort_j=True): ''' n_dm = 1 - naux = intopt.cart_aux_loc[-1]#len(intopt.cart_aux_idx) - rhoj = cupy.zeros([naux]) + naux = intopt._sorted_auxmol.nao + coeff = intopt.coeff if dm0.ndim == 3: dm0 = dm0[0] + dm0[1] dm_cart = coeff @ dm0 @ coeff.T - + num_cp_ij = [len(log_qs) for log_qs in intopt.log_qs] num_cp_kl = [len(log_qs) for log_qs in intopt.aux_log_qs] bins_locs_ij = np.append(0, np.cumsum(num_cp_ij)).astype(np.int32) bins_locs_kl = np.append(0, np.cumsum(num_cp_kl)).astype(np.int32) - + ncp_ij = len(intopt.log_qs) ncp_kl = len(intopt.aux_log_qs) norb = dm_cart.shape[0] + + rhoj = cupy.zeros([naux]) err = libgvhf.GINTbuild_j_int3c2e_pass1( intopt.bpcache, ctypes.cast(dm_cart.data.ptr, ctypes.c_void_p), @@ -665,7 +685,7 @@ def get_j_int3c2e_pass1(intopt, dm0, sort_j=True): ctypes.c_int(ncp_kl)) if err != 0: raise RuntimeError('CUDA error in get_j_pass1') - + if sort_j: aux_coeff = intopt.aux_coeff rhoj = cupy.dot(rhoj, aux_coeff) @@ -676,8 +696,8 @@ def get_j_int3c2e_pass2(intopt, rhoj): get vj pass2 for int3c2e ''' n_dm = 1 - norb = len(intopt.cart_ao_idx) - naux = len(intopt.cart_aux_idx) + norb = intopt._sorted_mol.nao + naux = intopt._sorted_auxmol.nao vj = cupy.zeros([norb, norb]) num_cp_ij = [len(log_qs) for log_qs in intopt.log_qs] @@ -688,9 +708,10 @@ def get_j_int3c2e_pass2(intopt, rhoj): ncp_ij = len(intopt.log_qs) ncp_kl = len(intopt.aux_log_qs) - - aux_coeff = intopt.aux_coeff - rhoj = cupy.dot(aux_coeff, rhoj) + + rhoj = intopt.sort_orbitals(rhoj, aux_axis=[0]) + if not intopt.auxmol.cart: + rhoj = intopt.aux_cart2sph @ rhoj err = libgvhf.GINTbuild_j_int3c2e_pass2( intopt.bpcache, @@ -706,8 +727,11 @@ def get_j_int3c2e_pass2(intopt, rhoj): if err != 0: raise RuntimeError('CUDA error in get_j_pass2') - coeff = intopt.coeff - vj = coeff.T @ vj @ coeff + + if not intopt.mol.cart: + cart2sph = intopt.cart2sph + vj = cart2sph.T @ vj @ cart2sph + vj = intopt.unsort_orbitals(vj, axis=[0,1]) vj = vj + vj.T return vj @@ -719,7 +743,7 @@ def get_int3c2e_jk(mol, auxmol, dm0_tag, with_k=True, omega=None): intopt.build(1e-14, diag_block_with_triu=True, aosym=True, group_size=BLKSIZE, group_size_aux=BLKSIZE) if omega is None: omega = 0.0 - naux = len(intopt.aux_ao_idx) + naux = auxmol.nao orbo = cupy.asarray(dm0_tag.occ_coeff, order='C') nocc = orbo.shape[1] rhoj = cupy.empty([naux]) @@ -736,7 +760,7 @@ def get_int3c2e_jk(mol, auxmol, dm0_tag, with_k=True, omega=None): li = intopt.angular[cpi] lj = intopt.angular[cpj] int3c_blk = get_int3c2e_slice(intopt, cp_ij_id, cp_kl_id, omega=omega) - if not intopt._mol.cart: + if not intopt.mol.cart: int3c_blk = cart2sph(int3c_blk, axis=1, ang=lj) int3c_blk = cart2sph(int3c_blk, axis=2, ang=li) i0, i1 = intopt.ao_loc[cpi], intopt.ao_loc[cpi+1] @@ -761,8 +785,8 @@ def get_int3c2e_ip1_vjk(intopt, rhoj, rhok, dm0_tag, aoslices, with_k=True, omeg # vj and vk responses (due to int3c2e_ip1) to changes in atomic positions ''' ao2atom = get_ao2atom(intopt, aoslices) - natom = len(aoslices) - nao = len(intopt.ao_idx) + natom = intopt.mol.natm + nao = intopt.mol.nao orbo = cupy.asarray(dm0_tag.occ_coeff, order='C') nocc = orbo.shape[1] vj1_buf = cupy.zeros([3,nao,nao]) @@ -820,8 +844,8 @@ def get_int3c2e_ip2_vjk(intopt, rhoj, rhok, dm0_tag, auxslices, with_k=True, ome vj and vk responses (due to int3c2e_ip2) to changes in atomic positions ''' aux2atom = get_aux2atom(intopt, auxslices) - natom = len(auxslices) - nao = 
len(intopt.ao_idx) + natom = intopt.mol.natm + nao = intopt.mol.nao orbo = cupy.asarray(dm0_tag.occ_coeff, order='C') nocc = orbo.shape[1] vj1 = cupy.zeros([natom,3,nao,nocc]) @@ -863,8 +887,8 @@ def get_int3c2e_ip1_wjk(intopt, dm0_tag, with_k=True, omega=None): ''' get wj and wk for int3c2e_ip1 ''' - nao = len(intopt.ao_idx) - naux = len(intopt.aux_ao_idx) + nao = intopt.mol.nao + naux = intopt.auxmol.nao orbo = cupy.asarray(dm0_tag.occ_coeff, order='C') nocc = orbo.shape[1] @@ -903,7 +927,7 @@ def get_int3c2e_ip2_wjk(intopt, dm0_tag, with_k=True, omega=None): ''' get wj and wk for int3c2e_ip2 ''' - naux = len(intopt.aux_ao_idx) + naux = intopt.auxmol.nao orbo = cupy.asarray(dm0_tag.occ_coeff, order='C') nocc = orbo.shape[1] wj = cupy.zeros([naux,3]) @@ -918,12 +942,12 @@ def get_int3c2e_ipip1_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, omega=None): ''' get hj and hk with int3c2e_ipip1 ''' - nao_sph = dm0_tag.shape[0] + nao = dm0_tag.shape[0] orbo = cupy.asarray(dm0_tag.occ_coeff, order='C') - hj = cupy.zeros([nao_sph,9]) + hj = cupy.zeros([nao,9]) hk = None if with_k: - hk = cupy.zeros([nao_sph,9]) + hk = cupy.zeros([nao,9]) for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ipip1', omega=omega): tmp = contract('xpji,ij->xpi', int3c_blk, dm0_tag[i0:i1,j0:j1]) hj[i0:i1] += contract('xpi,p->ix', tmp, rhoj[k0:k1]) @@ -931,21 +955,21 @@ def get_int3c2e_ipip1_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, omega=None): rhok_tmp = contract('por,ir->pio', rhok[k0:k1], orbo[i0:i1]) rhok_tmp = contract('pio,jo->pij', rhok_tmp, orbo[j0:j1]) hk[i0:i1] += contract('xpji,pij->ix', int3c_blk, rhok_tmp) - hj = hj.reshape([nao_sph,3,3]) + hj = hj.reshape([nao,3,3]) if with_k: - hk = hk.reshape([nao_sph,3,3]) + hk = hk.reshape([nao,3,3]) return hj, hk def get_int3c2e_ipvip1_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, omega=None): ''' # get hj and hk with int3c2e_ipvip1 ''' - nao_sph = dm0_tag.shape[0] + nao = dm0_tag.shape[0] orbo = cupy.asarray(dm0_tag.occ_coeff, order='C') - hj = cupy.zeros([nao_sph,nao_sph,9]) + hj = cupy.zeros([nao,nao,9]) hk = None if with_k: - hk = cupy.zeros([nao_sph,nao_sph,9]) + hk = cupy.zeros([nao,nao,9]) for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ipvip1', omega=omega): tmp = contract('xpji,ij->xpij', int3c_blk, dm0_tag[i0:i1,j0:j1]) hj[i0:i1,j0:j1] += contract('xpij,p->ijx', tmp, rhoj[k0:k1]) @@ -953,22 +977,22 @@ def get_int3c2e_ipvip1_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, omega=None) rhok_tmp = contract('por,ir->pio', rhok[k0:k1], orbo[i0:i1]) rhok_tmp = contract('pio,jo->pji', rhok_tmp, orbo[j0:j1]) hk[i0:i1,j0:j1] += contract('xpji,pji->ijx', int3c_blk, rhok_tmp) - hj = hj.reshape([nao_sph,nao_sph,3,3]) + hj = hj.reshape([nao,nao,3,3]) if with_k: - hk = hk.reshape([nao_sph,nao_sph,3,3]) + hk = hk.reshape([nao,nao,3,3]) return hj, hk def get_int3c2e_ip1ip2_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, omega=None): ''' # get hj and hk with int3c2e_ip1ip2 ''' - nao_sph = dm0_tag.shape[0] - naux_sph = rhok.shape[0] + nao = dm0_tag.shape[0] + naux = rhok.shape[0] orbo = cupy.asarray(dm0_tag.occ_coeff, order='C') - hj = cupy.zeros([nao_sph,naux_sph,9]) + hj = cupy.zeros([nao,naux,9]) hk = None if with_k: - hk = cupy.zeros([nao_sph,naux_sph,9]) + hk = cupy.zeros([nao,naux,9]) for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ip1ip2', omega=omega): tmp = contract('xpji,ij->xpi', int3c_blk, dm0_tag[i0:i1,j0:j1]) hj[i0:i1,k0:k1] += contract('xpi,p->ipx', tmp, rhoj[k0:k1]) @@ -976,21 +1000,21 @@ def 
get_int3c2e_ip1ip2_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, omega=None) rhok_tmp = contract('por,ir->pio', rhok[k0:k1], orbo[i0:i1]) rhok_tmp = contract('pio,jo->pij', rhok_tmp, orbo[j0:j1]) hk[i0:i1,k0:k1] += contract('xpji,pij->ipx', int3c_blk, rhok_tmp) - hj = hj.reshape([nao_sph,naux_sph,3,3]) + hj = hj.reshape([nao,naux,3,3]) if with_k: - hk = hk.reshape([nao_sph,naux_sph,3,3]) + hk = hk.reshape([nao,naux,3,3]) return hj, hk def get_int3c2e_ipip2_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, omega=None): ''' # get hj and hk with int3c2e_ipip2 ''' - naux_sph = rhok.shape[0] + naux = rhok.shape[0] orbo = cupy.asarray(dm0_tag.occ_coeff, order='C') - hj = cupy.zeros([naux_sph,9]) + hj = cupy.zeros([naux,9]) hk = None if with_k: - hk = cupy.zeros([naux_sph,9]) + hk = cupy.zeros([naux,9]) for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ipip2', omega=omega): tmp = contract('xpji,ij->xp', int3c_blk, dm0_tag[i0:i1,j0:j1]) hj[k0:k1] += contract('xp,p->px', tmp, rhoj[k0:k1]) @@ -998,9 +1022,9 @@ def get_int3c2e_ipip2_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, omega=None): rhok_tmp = contract('por,jr->pjo', rhok[k0:k1], orbo[j0:j1]) rhok_tmp = contract('pjo,io->pji', rhok_tmp, orbo[i0:i1]) hk[k0:k1] += contract('xpji,pji->px', int3c_blk, rhok_tmp) - hj = hj.reshape([naux_sph,3,3]) + hj = hj.reshape([naux,3,3]) if with_k: - hk = hk.reshape([naux_sph,3,3]) + hk = hk.reshape([naux,3,3]) return hj, hk def get_hess_nuc_elec(mol, dm): @@ -1016,8 +1040,7 @@ def get_hess_nuc_elec(mol, dm): fakemol.stdout = mol.stdout intopt = VHFOpt(mol, fakemol, 'int2e') intopt.build(1e-14, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE) - ao_idx = intopt.ao_idx - dm = take_last2d(cupy.asarray(dm), ao_idx) + dm = intopt.sort_orbitals(cupy.asarray(dm), axis=[0,1]) natm = mol.natm nao = mol.nao @@ -1172,9 +1195,9 @@ def get_int3c2e_ip(mol, auxmol=None, ip_type=1, auxbasis='weigend+etb', direct_s if err != 0: raise RuntimeError("int3c2e_ip failed\n") - if not intopt._auxmol.cart: + if not intopt.auxmol.cart: int3c_blk = cart2sph(int3c_blk, axis=1, ang=lk) - if not intopt._mol.cart: + if not intopt.mol.cart: int3c_blk = cart2sph(int3c_blk, axis=2, ang=lj) int3c_blk = cart2sph(int3c_blk, axis=3, ang=li) @@ -1183,13 +1206,9 @@ def get_int3c2e_ip(mol, auxmol=None, ip_type=1, auxbasis='weigend+etb', direct_s k0, k1 = aux_ao_loc[aux_id], aux_ao_loc[aux_id+1] int3c[:, k0:k1, j0:j1, i0:i1] = int3c_blk - ao_idx = np.argsort(intopt.ao_idx) - aux_idx = np.argsort(intopt.aux_ao_idx) - int3c = int3c[cupy.ix_(np.arange(3), aux_idx, ao_idx, ao_idx)] - + int3c = intopt.unsort_orbitals(int3c, aux_axis=[1], axis=[2,3]) return int3c.transpose([0,3,2,1]) - def get_int3c2e_general(mol, auxmol=None, ip_type='', auxbasis='weigend+etb', direct_scf_tol=1e-13, omega=None, stream=None): ''' Generate full int3c2e type tensor on GPU @@ -1219,13 +1238,12 @@ def get_int3c2e_general(mol, auxmol=None, ip_type='', auxbasis='weigend+etb', di nroots = (lmax + aux_lmax + order)//2 + 1 if nroots > NROOT_ON_GPU: from pyscf.gto.moleintor import getints, make_cintopt - mol = intopt.mol - pmol = intopt.tot_mol + pmol = intopt._tot_mol intor = pmol._add_suffix('int3c2e_' + ip_type) opt = make_cintopt(pmol._atm, pmol._bas, pmol._env, intor) - nao_cart = intopt.mol.nao - naux_cart = intopt.auxmol.nao + nao_cart = intopt._sorted_mol.nao + naux_cart = intopt._sorted_auxmol.nao norb_cart = nao_cart + naux_cart + 1 ao_loc = intopt.ao_loc aux_ao_loc = intopt.aux_ao_loc @@ -1281,9 +1299,9 @@ def 
get_int3c2e_general(mol, auxmol=None, ip_type='', auxbasis='weigend+etb', di int3c_cpu = getints(intor, pmol._atm, pmol._bas, pmol._env, shls_slice, cintopt=opt).transpose([0,3,2,1]) int3c_blk = cupy.asarray(int3c_cpu) - if not intopt._auxmol.cart: + if not intopt.auxmol.cart: int3c_blk = cart2sph(int3c_blk, axis=1, ang=lk) - if not intopt._mol.cart: + if not intopt.mol.cart: int3c_blk = cart2sph(int3c_blk, axis=2, ang=lj) int3c_blk = cart2sph(int3c_blk, axis=3, ang=li) @@ -1293,10 +1311,7 @@ def get_int3c2e_general(mol, auxmol=None, ip_type='', auxbasis='weigend+etb', di int3c[:, k0:k1, j0:j1, i0:i1] = int3c_blk - ao_idx = np.argsort(intopt.ao_idx) - aux_idx = np.argsort(intopt.aux_ao_idx) - int3c = int3c[cupy.ix_(np.arange(comp), aux_idx, ao_idx, ao_idx)] - + int3c = intopt.unsort_orbitals(int3c, aux_axis=[1], axis=[2,3]) return int3c.transpose([0,3,2,1]) def get_dh1e(mol, dm0): @@ -1313,7 +1328,7 @@ def get_dh1e(mol, dm0): fakemol.stdout = mol.stdout intopt = VHFOpt(mol, fakemol, 'int2e') intopt.build(1e-14, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE) - dm0_sorted = take_last2d(dm0, intopt.ao_idx) + dm0_sorted = intopt.sort_orbitals(dm0, axis=[0,1]) dh1e = cupy.zeros([natm,3]) for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ip1'): dh1e[k0:k1,:3] += contract('xkji,ij->kx', int3c_blk, dm0_sorted[i0:i1,j0:j1]) @@ -1332,7 +1347,7 @@ def get_d2h1e(mol, dm0): d2h1e_offdiag = cupy.zeros([natm, nao, 9]) intopt = VHFOpt(mol, fakemol, 'int2e') intopt.build(1e-14, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE) - dm0_sorted = take_last2d(dm0, intopt.ao_idx) + dm0_sorted = intopt.sort_orbitals(dm0, axis=[0,1]) for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ipip1'): d2h1e_diag[k0:k1,:9] -= contract('xaji,ij->ax', int3c_blk, dm0_sorted[i0:i1,j0:j1]) d2h1e_offdiag[k0:k1,i0:i1,:9] += contract('xaji,ij->aix', int3c_blk, dm0_sorted[i0:i1,j0:j1]) @@ -1352,8 +1367,8 @@ def get_int3c2e_slice(intopt, cp_ij_id, cp_aux_id, cart=False, aosym=None, out=N ''' if stream is None: stream = cupy.cuda.get_current_stream() if omega is None: omega = 0.0 - nao_cart = intopt.mol.nao - naux_cart = intopt.auxmol.nao + nao_cart = intopt._sorted_mol.nao + naux_cart = intopt._sorted_auxmol.nao norb_cart = nao_cart + naux_cart + 1 cpi = intopt.cp_idx[cp_ij_id] @@ -1381,7 +1396,7 @@ def get_int3c2e_slice(intopt, cp_ij_id, cp_aux_id, cart=False, aosym=None, out=N # if possible, write the data into the given allocated space # otherwise, need a temporary space for cart2sph ''' - if out is None or (lk > 1 and not intopt._auxmol.cart): + if out is None or (lk > 1 and not intopt.auxmol.cart): int3c_blk = cupy.zeros([nk,nj,ni], order='C') strides = np.array([1, ni, ni*nj, 1], dtype=np.int32) else: @@ -1408,7 +1423,7 @@ def get_int3c2e_slice(intopt, cp_ij_id, cp_aux_id, cart=False, aosym=None, out=N raise RuntimeError('GINT_fill_int2e failed') # move this operation to j2c? 
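The intopt.unsort_orbitals(int3c, aux_axis=[1], axis=[2,3]) calls above replace the removed argsort-based fancy indexing. A minimal NumPy sketch of the equivalence on a 2-D slice, with idx standing in for the stored _ao_idx permutation (illustrative only, not part of the patch):

import numpy as np

idx = np.array([2, 0, 3, 1])      # forward permutation, as kept in intopt._ao_idx
mat = np.random.rand(4, 4)        # a block expressed in sorted-AO order
rev = np.argsort(idx)             # removed pattern: gather with the inverse permutation
ref = mat[np.ix_(rev, rev)]
out = np.empty_like(mat)          # new pattern: scatter with the forward permutation
out[np.ix_(idx, idx)] = mat
assert np.allclose(out, ref)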
- if lk > 1 and intopt._auxmol.cart == 0: + if lk > 1 and intopt.auxmol.cart == 0: int3c_blk = cart2sph(int3c_blk, axis=0, ang=lk, out=out) return int3c_blk @@ -1445,10 +1460,7 @@ def get_int3c2e(mol, auxmol=None, auxbasis='weigend+etb', direct_scf_tol=1e-13, int3c[:, j0:j1, i0:i1] = int3c_slice row, col = np.tril_indices(nao) int3c[:, row, col] = int3c[:, col, row] - ao_idx = np.argsort(intopt.ao_idx) - aux_id = np.argsort(intopt.aux_ao_idx) - int3c = int3c[np.ix_(aux_id, ao_idx, ao_idx)] - + int3c = intopt.unsort_orbitals(int3c, aux_axis=[0], axis=[1,2]) return int3c.transpose([2,1,0]) def sort_mol(mol0, cart=True, log=None): diff --git a/gpu4pyscf/df/tests/test_jk.py b/gpu4pyscf/df/tests/test_df_jk.py similarity index 54% rename from gpu4pyscf/df/tests/test_jk.py rename to gpu4pyscf/df/tests/test_df_jk.py index f353e529..6fb3f841 100644 --- a/gpu4pyscf/df/tests/test_jk.py +++ b/gpu4pyscf/df/tests/test_df_jk.py @@ -17,9 +17,10 @@ import numpy as np import cupy import pyscf -from pyscf import df +from pyscf import df, lib from gpu4pyscf import scf as gpu_scf from gpu4pyscf.df import int3c2e, df_jk +from gpu4pyscf.df.df import DF atom=''' Ti 0.0 0.0 0.0 @@ -31,18 +32,20 @@ bas='def2-tzvpp' def setUpModule(): - global mol, auxmol - mol = pyscf.M(atom=atom, basis=bas, max_memory=32000) - mol.output = '/dev/null' - mol.cart = True - mol.build() - mol.verbose = 1 + global mol, mol_sph, auxmol, auxmol_sph + mol = pyscf.M(atom=atom, basis=bas, output='/dev/null', cart=True, verbose=1) auxmol = df.addons.make_auxmol(mol, auxbasis='sto3g') + mol_sph = pyscf.M(atom=atom, basis=bas, output='/dev/null', cart=False, verbose=1) + auxmol_sph = df.addons.make_auxmol(mol_sph, auxbasis='sto3g') + def tearDownModule(): - global mol, auxmol + global mol, mol_sph, auxmol, auxmol_sph mol.stdout.close() - del mol, auxmol + mol_sph.stdout.close() + auxmol.stdout.close() + auxmol_sph.stdout.close() + del mol, auxmol, mol_sph, auxmol_sph class KnownValues(unittest.TestCase): @@ -51,7 +54,7 @@ def test_vj_incore(self): intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e') intopt.build(1e-14, diag_block_with_triu=False, aosym=True) cupy.random.seed(np.asarray(1, dtype=np.uint64)) - nao = len(intopt.ao_idx) + nao = intopt.mol.nao dm = cupy.random.rand(nao, nao) dm = dm + dm.T @@ -64,6 +67,25 @@ def test_vj_incore(self): vj_outcore = cupy.einsum('ijL,L->ij', int3c_gpu, rhoj_outcore) vj_incore = int3c2e.get_j_int3c2e_pass2(intopt, rhoj_incore) assert cupy.linalg.norm(vj_outcore - vj_incore) < 1e-5 + + def test_vj_sph_incore(self): + int3c_gpu = int3c2e.get_int3c2e(mol_sph, auxmol, aosym=True, direct_scf_tol=1e-14) + intopt = int3c2e.VHFOpt(mol_sph, auxmol, 'int2e') + intopt.build(1e-14, diag_block_with_triu=False, aosym=True) + cupy.random.seed(np.asarray(1, dtype=np.uint64)) + nao = intopt.mol.nao + dm = cupy.random.rand(nao, nao) + dm = dm + dm.T + + # pass 1 + rhoj_outcore = cupy.einsum('ijL,ij->L', int3c_gpu, dm) + rhoj_incore = 2.0*int3c2e.get_j_int3c2e_pass1(intopt, dm) + assert cupy.linalg.norm(rhoj_outcore - rhoj_incore) < 1e-8 + + # pass 2 + vj_outcore = cupy.einsum('ijL,L->ij', int3c_gpu, rhoj_outcore) + vj_incore = int3c2e.get_j_int3c2e_pass2(intopt, rhoj_incore) + assert cupy.linalg.norm(vj_outcore - vj_incore) < 1e-5 def test_j_outcore(self): cupy.random.seed(np.asarray(1, dtype=np.uint64)) @@ -72,10 +94,22 @@ def test_j_outcore(self): dm = dm + dm.T mf = gpu_scf.RHF(mol).density_fit() mf.kernel() - vj0, _ = mf.get_jk(dm=dm, with_j=True, with_k=False) + vj0, _ = mf.get_jk(dm=dm, with_j=True, with_k=False, hermi=1) 
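hermi=1 in the get_jk call above promises a symmetric density matrix, so the kernel may symmetrize intermediates; hermi=0 (exercised by test_jk_hermi0 below) must handle a general dm. A conventional 4-index reference for that case, sketched on a hypothetical small molecule (illustrative only):

import numpy as np
from pyscf import gto

mol = gto.M(atom='H 0 0 0; H 0 0 0.74', basis='sto-3g')
eri = mol.intor('int2e')                   # (ij|kl) in chemists' notation
dm = np.random.rand(mol.nao, mol.nao)      # deliberately non-symmetric (hermi=0)
vj = np.einsum('ijkl,lk->ij', eri, dm)     # J_ij = (ij|kl) dm_lk
vk = np.einsum('ijkl,jk->il', eri, dm)     # K_il = (ij|kl) dm_jk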
vj = df_jk.get_j(mf.with_df, dm) assert cupy.linalg.norm(vj - vj0) < 1e-4 + + def test_jk_hermi0(self): + dfobj = DF(mol, 'sto3g').build() + np.random.seed(3) + nao = mol.nao + dm = np.random.rand(nao, nao) + refj, refk = dfobj.to_cpu().get_jk(dm, hermi=0) + vj, vk = dfobj.get_jk(dm, hermi=0) + assert abs(vj - refj).max() < 1e-9 + assert abs(vk - refk).max() < 1e-9 + assert abs(lib.fp(vj) - 455.864593801164).max() < 1e-9 + assert abs(lib.fp(vk) - 37.7022369618297).max() < 1e-9 if __name__ == "__main__": print("Full Tests for DF JK") - unittest.main() \ No newline at end of file + unittest.main() diff --git a/gpu4pyscf/df/tests/test_df_rhf.py b/gpu4pyscf/df/tests/test_df_rhf.py index 3852c70b..abb2da46 100644 --- a/gpu4pyscf/df/tests/test_df_rhf.py +++ b/gpu4pyscf/df/tests/test_df_rhf.py @@ -31,15 +31,11 @@ def setUpModule(): global mol_sph, mol_cart - mol_sph = pyscf.M(atom=atom, basis=bas, max_memory=32000, cart=0) - mol_sph.output = '/dev/null' - mol_sph.build() - mol_sph.verbose = 1 + mol_sph = pyscf.M(atom=atom, basis=bas, cart=0, + symmetry=True, output='/dev/null', verbose=1) - mol_cart = pyscf.M(atom=atom, basis=bas, max_memory=32000, cart=1) - mol_cart.output = '/dev/null' - mol_cart.build() - mol_cart.verbose = 1 + mol_cart = pyscf.M(atom=atom, basis=bas, cart=1, + output='/dev/null', verbose=1) def tearDownModule(): global mol_sph, mol_cart diff --git a/gpu4pyscf/df/tests/test_df_rks.py b/gpu4pyscf/df/tests/test_df_rks.py index 4cd40701..1aa69944 100644 --- a/gpu4pyscf/df/tests/test_df_rks.py +++ b/gpu4pyscf/df/tests/test_df_rks.py @@ -31,15 +31,11 @@ def setUpModule(): global mol_sph, mol_cart - mol_sph = pyscf.M(atom=atom, basis=bas, max_memory=32000, cart=0) - mol_sph.output = '/dev/null' - mol_sph.build() - mol_sph.verbose = 1 - - mol_cart = pyscf.M(atom=atom, basis=bas, max_memory=32000, cart=1) - mol_cart.output = '/dev/null' - mol_cart.build() - mol_cart.verbose = 1 + mol_sph = pyscf.M(atom=atom, basis=bas, max_memory=32000, cart=0, + output='/dev/null', verbose=1) + + mol_cart = pyscf.M(atom=atom, basis=bas, max_memory=32000, cart=1, + output='/dev/null', verbose=1) def tearDownModule(): global mol_sph, mol_cart diff --git a/gpu4pyscf/df/tests/test_df_rks_grad.py b/gpu4pyscf/df/tests/test_df_rks_grad.py index ea382e66..a218630d 100644 --- a/gpu4pyscf/df/tests/test_df_rks_grad.py +++ b/gpu4pyscf/df/tests/test_df_rks_grad.py @@ -117,17 +117,17 @@ def _vs_cpu(mol, grid_response=False, xc=xc0, disp=disp0, tol=1e-9): assert abs(g_analy - ref).max() < tol class KnownValues(unittest.TestCase): - + def test_grad_with_grids_response(self): print("-----testing DF DFT gradient with grids response----") _check_grad(mol_sph, grid_response=True, xc='LDA', disp=None) _check_grad(mol_sph, grid_response=True, xc='B3LYP', disp=None) _check_grad(mol_sph, grid_response=True, xc='m06', disp=None, tol=1e-4) - + def test_grad_lda(self): print("-----LDA testing-------") _vs_cpu(mol_sph, xc='LDA', disp=None) - + def test_grad_gga(self): print('-----GGA testing-------') _vs_cpu(mol_sph, xc='PBE', disp=None) @@ -147,7 +147,7 @@ def test_grad_rsh(self): def test_grad_nlc(self): print('--------nlc testing-------------') _vs_cpu(mol_sph, xc='HYB_MGGA_XC_WB97M_V', disp=None, tol=1e-7) - + def test_grad_cart(self): print('------ Cart testing--------') _vs_cpu(mol_cart, xc='B3LYP', disp=None) @@ -163,7 +163,7 @@ def test_grad_d4(self): def test_grad_wb97m_d3bj(self): print('------ wB97m-d3bj --------') _vs_cpu(mol_sph, xc='wb97m-d3bj', tol=1e-8) - + if __name__ == "__main__": print("Full Tests for DF 
Gradient") unittest.main() diff --git a/gpu4pyscf/dft/__init__.py b/gpu4pyscf/dft/__init__.py index d1ae3570..c65e412d 100644 --- a/gpu4pyscf/dft/__init__.py +++ b/gpu4pyscf/dft/__init__.py @@ -1,9 +1,9 @@ from . import rks -from .rks import RKS +from .rks import RKS, KohnShamDFT from .uks import UKS from .gks import GKS from .roks import ROKS -from gpu4pyscf.dft.gen_grid import Grids +from .gen_grid import Grids def KS(mol, xc='LDA,VWN'): if mol.spin == 0: diff --git a/gpu4pyscf/dft/gks.py b/gpu4pyscf/dft/gks.py index dda28353..3f709733 100644 --- a/gpu4pyscf/dft/gks.py +++ b/gpu4pyscf/dft/gks.py @@ -26,6 +26,7 @@ class GKS(gks.GKS, GHF): def __init__(self, mol, xc='LDA,VWN'): raise NotImplementedError + reset = rks.RKS.reset energy_elec = rks.RKS.energy_elec get_veff = NotImplemented nuc_grad_method = NotImplemented diff --git a/gpu4pyscf/dft/libxc.py b/gpu4pyscf/dft/libxc.py index 8a07e3c3..850a879a 100644 --- a/gpu4pyscf/dft/libxc.py +++ b/gpu4pyscf/dft/libxc.py @@ -124,17 +124,18 @@ def _check_arrays(current_arrays, fields, sizes, factor, required): """ A specialized function built to construct and check the sizes of arrays given to the LibXCFunctional class. """ - # Nothing supplied so we build it out if current_arrays is None: current_arrays = {} + if not required: + for label in fields: + current_arrays[label] = None + return current_arrays + for label in fields: - if required: - size = sizes[label] - current_arrays[label] = cupy.empty((factor, size), dtype=np.float64) - else: - current_arrays[label] = None # cupy.empty((1)) + size = sizes[label] + current_arrays[label] = cupy.empty((factor, size), dtype=np.float64) return current_arrays @@ -150,6 +151,7 @@ class _xcfun(ctypes.Structure): class XCfun: def __init__(self, xc, spin): + self.spin = spin self._spin = 1 if spin == 'unpolarized' else 2 self.xc_func = _libxc.xc_func_alloc() if isinstance(xc, str): @@ -178,6 +180,9 @@ def needs_laplacian(self): rsh_coeff = dft.libxc.rsh_coeff def compute(self, inp, output=None, do_exc=True, do_vxc=True, do_fxc=False, do_kxc=False, do_lxc=False): + # TODO: turn to dft.libxc.eval_xc for do_kxc and do_lxc + assert not do_kxc + assert not do_lxc if isinstance(inp, cupy.ndarray): inp = {"rho": cupy.asarray(inp, dtype=cupy.double)} elif isinstance(inp, dict): @@ -207,12 +212,6 @@ def compute(self, inp, output=None, do_exc=True, do_vxc=True, do_fxc=False, do_k args.extend([ inp[x] for x in input_labels]) args.extend([output[x] for x in output_labels]) - cuda_args = [] - for arg in args: - if(isinstance(arg, cupy.ndarray)): - arg = ctypes.cast(arg.data.ptr, ctypes.c_void_p) - cuda_args.append(arg) - #_libxc.xc_lda(*cuda_args) out_params = xc_lda_out_params() buf_params = xc_lda_out_params() @@ -246,12 +245,6 @@ def compute(self, inp, output=None, do_exc=True, do_vxc=True, do_fxc=False, do_k args.extend([ inp[x] for x in input_labels]) args.extend([output[x] for x in output_labels]) - cuda_args = [] - for arg in args: - if(isinstance(arg, cupy.ndarray)): - arg = ctypes.cast(arg.data.ptr, ctypes.c_void_p) - cuda_args.append(arg) - #_libxc.xc_gga(*cuda_args) out_params = xc_gga_out_params() buf_params = xc_gga_out_params() @@ -295,12 +288,6 @@ def compute(self, inp, output=None, do_exc=True, do_vxc=True, do_fxc=False, do_k args.insert(-1, cupy.empty((1))) # Add none ptr to laplacian #args.insert(-1, cupy.zeros_like(inp['rho'])) args.extend([output[x] for x in output_labels]) - cuda_args = [] - for arg in args: - if(isinstance(arg, cupy.ndarray)): - arg = ctypes.cast(arg.data.ptr, ctypes.c_void_p) - 
cuda_args.append(arg) - #_libxc.xc_mgga(*cuda_args) out_params = xc_mgga_out_params() buf_params = xc_mgga_out_params() @@ -310,13 +297,14 @@ def compute(self, inp, output=None, do_exc=True, do_vxc=True, do_fxc=False, do_k setattr(buf_params, label, buf[label].data.ptr) setattr(out_params, label, output[label].data.ptr) stream = cupy.cuda.get_current_stream() + lapl = cupy.empty(1) err = libgdft.GDFT_xc_mgga( stream.ptr, self.xc_func, npoints, inp['rho'].data.ptr, inp['sigma'].data.ptr, - cupy.empty(1).data.ptr, + lapl.data.ptr, inp['tau'].data.ptr, ctypes.byref(out_params), ctypes.byref(buf_params) diff --git a/gpu4pyscf/dft/numint.py b/gpu4pyscf/dft/numint.py index b68d5368..c1bb1180 100644 --- a/gpu4pyscf/dft/numint.py +++ b/gpu4pyscf/dft/numint.py @@ -16,6 +16,7 @@ # along with this program. If not, see . import ctypes +from functools import lru_cache import contextlib import numpy as np import cupy @@ -25,7 +26,7 @@ from pyscf.gto.eval_gto import NBINS, CUTOFF, make_screen_index from gpu4pyscf.gto.mole import basis_seg_contraction from gpu4pyscf.lib.cupy_helper import ( - contract, get_avail_mem, load_library, add_sparse, release_gpu_stack, take_last2d, transpose_sum, + contract, get_avail_mem, load_library, add_sparse, release_gpu_stack, transpose_sum, grouped_dot, grouped_gemm) from gpu4pyscf.dft import xc_deriv, xc_alias, libxc from gpu4pyscf import __config__ @@ -41,7 +42,6 @@ # Should we release the cupy cache? FREE_CUPY_CACHE = False -MGGA_DENSITY_LAPL = False USE_SPARSITY = 2 # 0: no sparsity, 1: in-house GEMM, 2: sparse in AO direction libgdft = load_library('libgdft') @@ -52,23 +52,26 @@ libgdft.GDFTdot_ao_ao_sparse.restype = ctypes.c_int libgdft.GDFTdot_aow_ao_sparse.restype = ctypes.c_int -def eval_ao(ni, mol, coords, deriv=0, shls_slice=None, nao_slice=None, ao_loc_slice=None, - non0tab=None, out=None, verbose=None, ctr_offsets_slice=None): +def eval_ao(mol, coords, deriv=0, shls_slice=None, nao_slice=None, ao_loc_slice=None, + non0tab=None, out=None, verbose=None, ctr_offsets_slice=None, gdftopt=None, + transpose=True): ''' evaluate ao values for given coords and shell indices Kwargs: shls_slice : offsets of shell slices to be evaluated ao_loc_slice: offsets of ao slices to be evaluated ctr_offsets_slice: offsets of contraction patterns Returns: ao: comp x nao_slice x ngrids, ao is in C-contiguous. + comp x ngrids x nao_slice if transpose, to be compatible with PySCF.
''' - opt = getattr(ni, 'gdftopt', None) - with_opt = True - if opt is None or mol not in [opt.mol, opt._sorted_mol]: - ni.build(mol, coords) - opt = ni.gdftopt - with_opt = False - mol = None + if gdftopt is None: + opt = _GDFTOpt.from_mol(mol) + with opt.gdft_envs_cache(): + return eval_ao( + mol, coords, deriv, shls_slice, nao_slice, ao_loc_slice, + non0tab, out, verbose, ctr_offsets_slice, opt, transpose) + + opt = gdftopt _sorted_mol = opt._sorted_mol if shls_slice is None: @@ -78,6 +81,9 @@ def eval_ao(ni, mol, coords, deriv=0, shls_slice=None, nao_slice=None, ao_loc_sl ao_loc_slice = cupy.asarray(_sorted_mol.ao_loc_nr()) nao_slice = _sorted_mol.nao else: + assert ao_loc_slice is not None + assert nao_slice is not None + assert ctr_offsets_slice is not None ctr_offsets = opt.l_ctr_offsets nctr = ctr_offsets.size - 1 @@ -96,44 +102,34 @@ def eval_ao(ni, mol, coords, deriv=0, shls_slice=None, nao_slice=None, ao_loc_sl if out is None: out = cupy.empty((comp, nao_slice, ngrids), order='C') - if not with_opt: - # mol may be different to _GDFTOpt._sorted_mol. - # nao should be consistent with the _GDFTOpt._sorted_mol object - coeff = cupy.asarray(opt.coeff) - with opt.gdft_envs_cache(): - err = libgdft.GDFTeval_gto( - ctypes.cast(stream.ptr, ctypes.c_void_p), - ctypes.cast(out.data.ptr, ctypes.c_void_p), - ctypes.c_int(deriv), ctypes.c_int(_sorted_mol.cart), - ctypes.cast(coords.data.ptr, ctypes.c_void_p), ctypes.c_int(ngrids), - ctypes.cast(shls_slice.data.ptr, ctypes.c_void_p), - ctypes.cast(ao_loc_slice.data.ptr, ctypes.c_void_p), - ctypes.c_int(nao_slice), - ctr_offsets.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(nctr), - ctr_offsets_slice.ctypes.data_as(ctypes.c_void_p), - _sorted_mol._bas.ctypes.data_as(ctypes.c_void_p)) - out = contract('nig,ij->njg', out, coeff).transpose([0,2,1]) - else: - err = libgdft.GDFTeval_gto( - ctypes.cast(stream.ptr, ctypes.c_void_p), - ctypes.cast(out.data.ptr, ctypes.c_void_p), - ctypes.c_int(deriv), ctypes.c_int(_sorted_mol.cart), - ctypes.cast(coords.data.ptr, ctypes.c_void_p), ctypes.c_int(ngrids), - ctypes.cast(shls_slice.data.ptr, ctypes.c_void_p), - ctypes.cast(ao_loc_slice.data.ptr, ctypes.c_void_p), - ctypes.c_int(nao_slice), - ctr_offsets.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(nctr), - ctr_offsets_slice.ctypes.data_as(ctypes.c_void_p), - _sorted_mol._bas.ctypes.data_as(ctypes.c_void_p)) + err = libgdft.GDFTeval_gto( + ctypes.cast(stream.ptr, ctypes.c_void_p), + ctypes.cast(out.data.ptr, ctypes.c_void_p), + ctypes.c_int(deriv), ctypes.c_int(_sorted_mol.cart), + ctypes.cast(coords.data.ptr, ctypes.c_void_p), ctypes.c_int(ngrids), + ctypes.cast(shls_slice.data.ptr, ctypes.c_void_p), + ctypes.cast(ao_loc_slice.data.ptr, ctypes.c_void_p), + ctypes.c_int(nao_slice), + ctr_offsets.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(nctr), + ctr_offsets_slice.ctypes.data_as(ctypes.c_void_p), + _sorted_mol._bas.ctypes.data_as(ctypes.c_void_p)) + if err != 0: raise RuntimeError('CUDA Error in evaluating AO') + if mol is not _sorted_mol: + coeff = cupy.asarray(opt.coeff) + out = contract('nig,ij->njg', out, coeff) + + if transpose: + out = out.transpose(0,2,1) + if deriv == 0: out = out[0] return out def eval_rho(mol, ao, dm, non0tab=None, xctype='LDA', hermi=0, - with_lapl=True, verbose=None): + with_lapl=False, verbose=None): xctype = xctype.upper() if xctype in ('LDA', 'HF'): _, ngrids = ao.shape @@ -153,17 +149,13 @@ def eval_rho(mol, ao, dm, non0tab=None, xctype='LDA', hermi=0, if hermi: rho[1:4] *= 2 # *2 for + einsum('pi,ij,pj->p', ao[i], dm, 
ao[0]) else: - c0 = dm.dot(ao[0]) + c0 = dm.T.dot(ao[0]) for i in range(1, 4): rho[i] += _contract_rho(ao[i], c0) else: # meta-GGA - if with_lapl: - # rho[4] = \nabla^2 rho, rho[5] = 1/2 |nabla f|^2 - rho = cupy.empty((6,ngrids)) - tau_idx = 5 - else: - rho = cupy.empty((5,ngrids)) - tau_idx = 4 + assert not with_lapl + rho = cupy.empty((5,ngrids)) + tau_idx = 4 c0 = dm.dot(ao[0]) rho[0] = _contract_rho(c0, ao[0]) @@ -181,11 +173,11 @@ def eval_rho(mol, ao, dm, non0tab=None, xctype='LDA', hermi=0, return rho def eval_rho1(mol, ao, mo_coeff, mo_occ, non0tab=None, xctype='LDA', - with_lapl=True, verbose=None): + with_lapl=False, verbose=None): raise NotImplementedError def eval_rho2(mol, ao, mo_coeff, mo_occ, non0tab=None, xctype='LDA', - with_lapl=True, verbose=None, out=None): + with_lapl=False, verbose=None, out=None): xctype = xctype.upper() if xctype == 'LDA' or xctype == 'HF': _, ngrids = ao.shape @@ -205,40 +197,24 @@ def eval_rho2(mol, ao, mo_coeff, mo_occ, non0tab=None, xctype='LDA', _contract_rho(c0, c1, rho=rho[i]) rho[1:] *= 2 else: # meta-GGA - if with_lapl: - # rho[4] = \nabla^2 rho, rho[5] = 1/2 |nabla f|^2 - rho = cupy.empty((6,ngrids)) - tau_idx = 5 - else: - rho = cupy.empty((5,ngrids)) - tau_idx = 4 + assert not with_lapl + rho = cupy.empty((5,ngrids)) + tau_idx = 4 c0 = cupy.dot(cpos.T, ao[0]) _contract_rho(c0, c0, rho=rho[0]) - rho[tau_idx] = 0 for i in range(1, 4): c1 = cupy.dot(cpos.T, ao[i]) rho[i] = _contract_rho(c0, c1) rho[tau_idx] += _contract_rho(c1, c1) - if with_lapl: - if ao.shape[0] > 4: - XX, YY, ZZ = 4, 7, 9 - ao2 = ao[XX] + ao[YY] + ao[ZZ] - c1 = cupy.dot(cpos.T, ao2) - #:rho[4] = numpy.einsum('pi,pi->p', c0, c1) - rho[4] = _contract_rho(c0, c1) - rho[4] += rho[5] - rho[4] *= 2 - else: - rho[4] = 0 rho[1:4] *= 2 rho[tau_idx] *= .5 return rho def eval_rho3(mol, ao, c0, mo1, non0tab=None, xctype='LDA', - with_lapl=True, verbose=None): + with_lapl=False, verbose=None): xctype = xctype.upper() if xctype == 'LDA' or xctype == 'HF': _, ngrids = ao.shape @@ -261,15 +237,9 @@ def eval_rho3(mol, ao, c0, mo1, non0tab=None, xctype='LDA', rho[i] += _contract_rho(c0[0], c_0[i]) rho *= 2.0 else: # meta-GGA - # TODO: complete this - if with_lapl: - raise NotImplementedError("mGGA with lapl not implemented") - # rho[4] = \nabla^2 rho, rho[5] = 1/2 |nabla f|^2 - rho = cupy.empty((6,ngrids)) - tau_idx = 5 - else: - rho = cupy.empty((5,ngrids)) - tau_idx = 4 + assert not with_lapl + rho = cupy.empty((5,ngrids)) + tau_idx = 4 c_0 = contract('nig,io->nog', ao, cpos1) #:rho[0] = numpy.einsum('pi,pi->p', c0, c0) rho[0] = _contract_rho(c0[0], c_0[0]) @@ -281,27 +251,22 @@ def eval_rho3(mol, ao, c0, mo1, non0tab=None, xctype='LDA', rho[i]+= _contract_rho(c0[0], c_0[i]) rho[tau_idx] += _contract_rho(c_0[i], c0[i]) rho *= 2.0 - if with_lapl: - raise NotImplementedError("mGGA with lapl not implemented") - if ao.shape[0] > 4: - XX, YY, ZZ = 4, 7, 9 - ao2 = ao[XX] + ao[YY] + ao[ZZ] - c1 = _dot_ao_dm(mol, ao2, cpos1, non0tab, shls_slice, ao_loc) - #:rho[4] = numpy.einsum('pi,pi->p', c0, c1) - rho[4] = _contract_rho(c0, c1) - rho[4] += rho[5] - rho[4] *= 2 - else: - rho[4] = 0 rho[tau_idx] *= .5 return rho -def eval_rho4(mol, ao, c0, mo1, non0tab=None, xctype='LDA', - with_lapl=True, verbose=None): - ''' ao: nd x nao x ng - c0: nd x nocc x ng - mo1: na x nao x nocc +def eval_rho4(mol, ao, mo0, mo1, non0tab=None, xctype='LDA', hermi=0, + with_lapl=False, verbose=None): + '''Evaluate density using first order orbitals. 
This density is typically + derived from the non-symmetric density matrix (hermi=0) in TDDFT + dm[i] = mo0.dot(mo1[i].T) and the symmetric density matrix (hermi=1) in CPHF + dm[i] = mo0.dot(mo1[i].T) + mo1[i].dot(mo0.T) + + ao: nd x nao x ng + mo0: nao x nocc + mo1: na x nao x nocc ''' + log = logger.new_logger(mol, verbose) + t0 = log.init_timer() xctype = xctype.upper() if xctype == 'LDA' or xctype == 'HF': _, ngrids = ao.shape @@ -309,30 +274,32 @@ def eval_rho4(mol, ao, c0, mo1, non0tab=None, xctype='LDA', _, ngrids = ao[0].shape na = mo1.shape[0] - cpos1= mo1 if xctype == 'LDA' or xctype == 'HF': - c_0 = contract('aio,ig->aog', cpos1, ao)#cupy.dot(cpos1.T, ao) + c0 = mo0.T.dot(ao) + t1 = log.timer_debug2('eval occ_coeff', *t0) + c_0 = contract('aio,ig->aog', mo1, ao) rho = cupy.empty([na,ngrids]) for i in range(na): rho[i] = _contract_rho(c0, c_0[i]) - rho *= 2.0 elif xctype in ('GGA', 'NLC'): - log = logger.new_logger(mol, mol.verbose) - t0 = log.init_timer() - c_0 = contract('nig,aio->anog', ao, cpos1) - t0 = log.timer_debug2('ao * cpos', *t0) + c0 = contract('nig,io->nog', ao, mo0) + t1 = log.timer_debug2('eval occ_coeff', *t0) + c_0 = contract('nig,aio->anog', ao, mo1) + t1 = log.timer_debug2('ao * cpos', *t1) rho = cupy.empty([na, 4, ngrids]) for i in range(na): _contract_rho_gga(c0, c_0[i], rho=rho[i]) - t0 = log.timer_debug2('contract rho', *t0) else: # meta-GGA - if with_lapl: - raise NotImplementedError("mGGA with lapl not implemented") + assert not with_lapl rho = cupy.empty((na,5,ngrids)) - c_0 = contract('nig,aio->anog', ao, cpos1) + c0 = contract('nig,io->nog', ao, mo0) + c_0 = contract('nig,aio->anog', ao, mo1) for i in range(na): _contract_rho_mgga(c0, c_0[i], rho=rho[i]) - + if hermi: + # corresponding to the density of ao * mo1[i].dot(mo0.T) * ao + rho *= 2.
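A NumPy sketch of the LDA branch of eval_rho4 above, including the hermi doubling (shapes are illustrative; dm1[a] = mo0 @ mo1[a].T as in the docstring):

import numpy as np

nao, nocc, ng, na = 6, 2, 10, 3
ao = np.random.rand(nao, ng)
mo0 = np.random.rand(nao, nocc)
mo1 = np.random.rand(na, nao, nocc)
c0 = mo0.T @ ao                                  # nocc x ng
c_0 = np.einsum('aio,ig->aog', mo1, ao)
rho = np.einsum('og,aog->ag', c0, c_0)           # density of dm1[a] = mo0 @ mo1[a].T
dm1 = np.einsum('io,ajo->aij', mo0, mo1)
ref = np.einsum('ig,aij,jg->ag', ao, dm1 + dm1.transpose(0, 2, 1), ao)
assert np.allclose(2 * rho, ref)                 # hermi=1 doubles rho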
+ t0 = log.timer_debug2('contract rho', *t0) return rho def _vv10nlc(rho, coords, vvrho, vvweight, vvcoords, nlc_pars): @@ -435,7 +402,7 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, log = logger.new_logger(mol, verbose) xctype = ni._xc_type(xc_code) opt = getattr(ni, 'gdftopt', None) - if opt is None or mol not in [opt.mol, opt._sorted_mol]: + if opt is None: ni.build(mol, grids.coords) opt = ni.gdftopt @@ -443,17 +410,14 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, mo_occ = getattr(dms,'mo_occ', None) mol = None _sorted_mol = opt._sorted_mol - coeff = cupy.asarray(opt.coeff) - nao, nao0 = coeff.shape + nao, nao0 = opt.coeff.shape dms = cupy.asarray(dms) dm_shape = dms.shape - #dms = [coeff @ dm @ coeff.T for dm in dms.reshape(-1,nao0,nao0)] - dms = dms.reshape(-1,nao0,nao0) - dms = take_last2d(dms, opt.ao_idx) + dms = opt.sort_orbitals(dms.reshape(-1,nao0,nao0), axis=[1,2]) nset = len(dms) if mo_coeff is not None: - mo_coeff = mo_coeff[opt.ao_idx] + mo_coeff = opt.sort_orbitals(mo_coeff, axis=[0]) nelec = cupy.empty(nset) excsum = cupy.empty(nset) @@ -464,27 +428,24 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, ao_deriv = 0 else: ao_deriv = 1 - with_lapl = MGGA_DENSITY_LAPL ngrids = grids.weights.size if xctype == 'LDA': rho_tot = cupy.empty([nset,1,ngrids]) elif xctype == 'GGA': rho_tot = cupy.empty([nset,4,ngrids]) else: - if with_lapl: - rho_tot = cupy.empty([nset,6,ngrids]) - else: - rho_tot = cupy.empty([nset,5,ngrids]) + rho_tot = cupy.empty([nset,5,ngrids]) p0 = p1 = 0 t1 = t0 = log.init_timer() - for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv): + for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, + max_memory=max_memory): p1 = p0 + weight.size for i in range(nset): if mo_coeff is None: - rho_tot[i,:,p0:p1] = eval_rho(_sorted_mol, ao_mask, dms[i][np.ix_(idx,idx)], xctype=xctype, hermi=1, with_lapl=with_lapl) + rho_tot[i,:,p0:p1] = eval_rho(_sorted_mol, ao_mask, dms[i][idx[:,None],idx], xctype=xctype, hermi=1) else: mo_coeff_mask = mo_coeff[idx,:] - rho_tot[i,:,p0:p1] = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask, mo_occ, None, xctype, with_lapl) + rho_tot[i,:,p0:p1] = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask, mo_occ, None, xctype) p0 = p1 t1 = log.timer_debug2('eval rho slice', *t1) t0 = log.timer_debug1('eval rho', *t0) @@ -501,6 +462,7 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, excsum[i] = cupy.dot(den, exc[:,0]) wv.append(vxc * grids.weights) + # *.5 for v+v.conj().T at the end if xctype == 'GGA': wv[i][0] *= .5 if xctype == 'MGGA': @@ -512,7 +474,8 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, t1 = t0 p0 = p1 = 0 - for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv): + for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, + max_memory=max_memory): p1 = p0 + weight.size for i in range(nset): if xctype == 'LDA': @@ -535,8 +498,7 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, p0 = p1 t1 = log.timer_debug2('integration', *t1) t0 = log.timer_debug1('vxc integration', *t0) - rev_ao_idx = opt.rev_ao_idx - vmat = take_last2d(vmat, rev_ao_idx) + vmat = opt.unsort_orbitals(vmat, axis=[1,2]) if xctype != 'LDA': transpose_sum(vmat) @@ -553,7 +515,7 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, return nelec, excsum, vmat def eval_rho_group(mol, ao_group, mo_coeff_group, mo_occ, non0tab=None, xctype='LDA', - with_lapl=True, 
verbose=None, out=None): + with_lapl=False, verbose=None, out=None): groups = len(ao_group) xctype = xctype.upper() if xctype == 'LDA' or xctype == 'HF': @@ -600,6 +562,7 @@ def eval_rho_group(mol, ao_group, mo_coeff_group, mo_occ, non0tab=None, xctype=' rho[1:] *= 2 rho_group.append(rho) else: # meta-GGA + assert not with_lapl c0_group = [] cpos_group4 = [] ao_group4 = [] @@ -646,7 +609,7 @@ def nr_rks_group(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, log = logger.new_logger(mol, verbose) xctype = ni._xc_type(xc_code) opt = getattr(ni, 'gdftopt', None) - if opt is None or mol not in [opt.mol, opt._sorted_mol]: + if opt is None: ni.build(mol, grids.coords) opt = ni.gdftopt @@ -655,17 +618,14 @@ def nr_rks_group(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, mol = None _sorted_mol = opt._sorted_mol - coeff = cupy.asarray(opt.coeff) - nao, nao0 = coeff.shape + nao, nao0 = opt.coeff.shape dms = cupy.asarray(dms) dm_shape = dms.shape - #dms = [coeff @ dm @ coeff.T for dm in dms.reshape(-1,nao0,nao0)] - dms = dms.reshape(-1,nao0,nao0) - dms = take_last2d(dms, opt.ao_idx) + dms = opt.sort_orbitals(dms.reshape(-1,nao0,nao0), axis=[1,2]) nset = len(dms) if mo_coeff is not None: - mo_coeff = mo_coeff[opt.ao_idx] + mo_coeff = opt.sort_orbitals(mo_coeff, axis=[0]) nelec = cupy.zeros(nset) excsum = cupy.zeros(nset) @@ -676,27 +636,24 @@ def nr_rks_group(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, ao_deriv = 0 else: ao_deriv = 1 - with_lapl = MGGA_DENSITY_LAPL ngrids = grids.weights.size if xctype == 'LDA': rho_tot = cupy.empty([nset,1,ngrids]) elif xctype == 'GGA': rho_tot = cupy.empty([nset,4,ngrids]) else: - if with_lapl: - rho_tot = cupy.empty([nset,6,ngrids]) - else: - rho_tot = cupy.empty([nset,5,ngrids]) + rho_tot = cupy.empty([nset,5,ngrids]) p0 = p1 = 0 t1 = t0 = log.init_timer() - for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv): + for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, + max_memory=max_memory): p1 = p0 + weight.size for i in range(nset): if mo_coeff is None: - rho_tot[i,:,p0:p1] = eval_rho(_sorted_mol, ao_mask, dms[i][np.ix_(idx,idx)], xctype=xctype, hermi=1, with_lapl=with_lapl) + rho_tot[i,:,p0:p1] = eval_rho(_sorted_mol, ao_mask, dms[i][idx[:,None],idx], xctype=xctype, hermi=1) else: mo_coeff_mask = mo_coeff[idx,:] - rho_tot[i,:,p0:p1] = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask, mo_occ, None, xctype, with_lapl) + rho_tot[i,:,p0:p1] = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask, mo_occ, None, xctype) p0 = p1 t1 = log.timer_debug2('eval rho slice', *t1) t0 = log.timer_debug1('eval rho', *t0) @@ -772,8 +729,7 @@ def nr_rks_group(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, raise NotImplementedError(f'numint.nr_rks for functional {xc_code}') t1 = log.timer_debug2('integration', *t1) t0 = log.timer_debug1('vxc integration', *t0) - rev_ao_idx = opt.rev_ao_idx - vmat = take_last2d(vmat, rev_ao_idx) + vmat = opt.unsort_orbitals(vmat, axis=[1,2]) if xctype != 'LDA': transpose_sum(vmat) @@ -794,7 +750,7 @@ def nr_uks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, log = logger.new_logger(mol, verbose) xctype = ni._xc_type(xc_code) opt = getattr(ni, 'gdftopt', None) - if opt is None or mol not in [opt.mol, opt._sorted_mol]: + if opt is None: ni.build(mol, grids.coords) opt = ni.gdftopt @@ -802,18 +758,17 @@ def nr_uks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, mo_occ = getattr(dms,'mo_occ', None) mol = None _sorted_mol = opt._sorted_mol - coeff = cupy.asarray(opt.coeff) - 
nao, nao0 = coeff.shape + nao, nao0 = opt.coeff.shape dma, dmb = dms dm_shape = dma.shape dma = cupy.asarray(dma).reshape(-1,nao0,nao0) dmb = cupy.asarray(dmb).reshape(-1,nao0,nao0) - dma = [coeff @ dm @ coeff.T for dm in dma] - dmb = [coeff @ dm @ coeff.T for dm in dmb] + dma = opt.sort_orbitals(dma, axis=[1,2]) + dmb = opt.sort_orbitals(dmb, axis=[1,2]) nset = len(dma) if mo_coeff is not None: - mo_coeff = coeff @ mo_coeff + mo_coeff = opt.sort_orbitals(mo_coeff, axis=[1]) nelec = np.zeros((2,nset)) excsum = np.zeros(nset) @@ -825,18 +780,18 @@ def nr_uks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, ao_deriv = 0 else: ao_deriv = 1 - with_lapl = MGGA_DENSITY_LAPL - for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv): + for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, + max_memory=max_memory): for i in range(nset): t0 = log.init_timer() if mo_coeff is None: - rho_a = eval_rho(_sorted_mol, ao_mask, dma[i][np.ix_(idx,idx)], xctype=xctype, hermi=1, with_lapl=with_lapl) - rho_b = eval_rho(_sorted_mol, ao_mask, dmb[i][np.ix_(idx,idx)], xctype=xctype, hermi=1, with_lapl=with_lapl) + rho_a = eval_rho(_sorted_mol, ao_mask, dma[i][idx[:,None],idx], xctype=xctype, hermi=1) + rho_b = eval_rho(_sorted_mol, ao_mask, dmb[i][idx[:,None],idx], xctype=xctype, hermi=1) else: mo_coeff_mask = mo_coeff[:, idx,:] - rho_a = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask[0], mo_occ[0], None, xctype, with_lapl) - rho_b = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask[1], mo_occ[1], None, xctype, with_lapl) + rho_a = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask[0], mo_occ[0], None, xctype) + rho_b = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask[1], mo_occ[1], None, xctype) rho = cupy.stack([rho_a, rho_b], axis=0) exc, vxc = ni.eval_xc_eff(xc_code, rho, deriv=1, xctype=xctype)[:2] @@ -882,8 +837,8 @@ def nr_uks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, excsum[i] += cupy.dot(den_b, exc[:,0]) t1 = log.timer_debug1('integration', *t1) - vmata = [coeff.T @ v @ coeff for v in vmata] - vmatb = [coeff.T @ v @ coeff for v in vmatb] + vmata = opt.unsort_orbitals(vmata, axis=[1,2]) + vmatb = opt.unsort_orbitals(vmatb, axis=[1,2]) if xctype != 'LDA': for i in range(nset): vmata[i] = vmata[i] + vmata[i].T @@ -918,7 +873,6 @@ def get_rho(ni, mol, dm, grids, max_memory=2000, verbose=None): dm = coeff @ cupy.asarray(dm) @ coeff.T if mo_coeff is not None: mo_coeff = coeff @ mo_coeff - with_lapl = MGGA_DENSITY_LAPL mem_avail = get_avail_mem() blksize = mem_avail*.2/8/nao//ALIGNED * ALIGNED @@ -932,11 +886,11 @@ def get_rho(ni, mol, dm, grids, max_memory=2000, verbose=None): t1 = t0 = log.init_timer() for p0, p1 in lib.prange(0,ngrids,blksize): coords = grids.coords[p0:p1] - ao = eval_ao(ni, _sorted_mol, coords, 0) + ao = eval_ao(_sorted_mol, coords, 0, gdftopt=opt, transpose=False) if mo_coeff is None: - rho[p0:p1] = eval_rho(_sorted_mol, ao, dm, xctype='LDA', hermi=1, with_lapl=with_lapl) + rho[p0:p1] = eval_rho(_sorted_mol, ao, dm, xctype='LDA', hermi=1) else: - rho[p0:p1] = eval_rho2(_sorted_mol, ao, mo_coeff, mo_occ, None, 'LDA', with_lapl) + rho[p0:p1] = eval_rho2(_sorted_mol, ao, mo_coeff, mo_occ, None, 'LDA') t1 = log.timer_debug2('eval rho slice', *t1) t0 = log.timer_debug1('eval rho', *t0) @@ -957,16 +911,15 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi= opt = ni.gdftopt _sorted_mol = opt.mol - coeff = cupy.asarray(opt.coeff) - nao, nao0 = coeff.shape + nao, nao0 = opt.coeff.shape dms = cupy.asarray(dms) 
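The dms[i, mask[:,None], mask] pattern used in the loop below is broadcasting-based fancy indexing; it selects the same sub-block as the np.ix_ form it replaces. A quick check:

import numpy as np

a = np.arange(16).reshape(4, 4)
idx = np.array([3, 1])
assert np.array_equal(a[idx[:, None], idx], a[np.ix_(idx, idx)])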
dm_shape = dms.shape # AO basis -> gdftopt AO basis with_mocc = hasattr(dms, 'mo1') if with_mocc: - mo1 = dms.mo1[:,opt.ao_idx] * 2.0**0.5 - occ_coeff = dms.occ_coeff[opt.ao_idx] * 2.0**0.5 - dms = take_last2d(dms, opt.ao_idx) + mo1 = opt.sort_orbitals(dms.mo1, axis=[1]) + occ_coeff = opt.sort_orbitals(dms.occ_coeff, axis=[0]) * 2.0 + dms = opt.sort_orbitals(dms.reshape(-1,nao0,nao0), axis=[1,2]) nset = len(dms) vmat = cupy.zeros((nset, nao, nao)) @@ -974,29 +927,23 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi= ao_deriv = 0 else: ao_deriv = 1 - with_lapl = MGGA_DENSITY_LAPL p0 = 0 p1 = 0 t1 = t0 = log.init_timer() - for ao, mask, weights, coords in ni.block_loop(_sorted_mol, grids, nao, ao_deriv): + for ao, mask, weights, coords in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, + max_memory=max_memory): p0, p1 = p1, p1+len(weights) # precompute molecular orbitals if with_mocc: occ_coeff_mask = occ_coeff[mask] - if xctype == 'LDA': - c0 = _dot_ao_dm(_sorted_mol, ao, occ_coeff_mask, None, None, None) - elif xctype == "GGA": - c0 = contract('nig,io->nog', ao, occ_coeff_mask) - else: # mgga - c0 = contract('nig,io->nog', ao, occ_coeff_mask) - t1 = log.timer_debug2(f'eval occ_coeff, with mocc: {with_mocc}', *t1) - if with_mocc: - rho1 = eval_rho4(_sorted_mol, ao, c0, mo1[:,mask], xctype=xctype, with_lapl=False) + rho1 = eval_rho4(_sorted_mol, ao, occ_coeff_mask, mo1[:,mask], + xctype=xctype, hermi=hermi) else: # slow version rho1 = [] for i in range(nset): - rho_tmp = eval_rho(_sorted_mol, ao, dms[i][np.ix_(mask,mask)], xctype=xctype, hermi=hermi, with_lapl=with_lapl) + rho_tmp = eval_rho(_sorted_mol, ao, dms[i,mask[:,None],mask], + xctype=xctype, hermi=hermi) rho1.append(rho_tmp) rho1 = cupy.stack(rho1, axis=0) t1 = log.timer_debug2('eval rho', *t1) @@ -1012,12 +959,10 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi= for i in range(nset): if xctype == 'LDA': vmat_tmp = ao.dot(_scale_ao(ao, wv[i]).T) - add_sparse(vmat[i], vmat_tmp, mask) elif xctype == 'GGA': wv[i,0] *= .5 aow = _scale_ao(ao, wv[i]) vmat_tmp = aow.dot(ao[0].T) - add_sparse(vmat[i], vmat_tmp, mask) elif xctype == 'NLC': raise NotImplementedError('NLC') else: @@ -1025,13 +970,13 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi= wv[i,4] *= .5 vmat_tmp = ao[0].dot(_scale_ao(ao[:4], wv[i,:4]).T) vmat_tmp+= _tau_dot(ao, ao, wv[i,4]) - add_sparse(vmat[i], vmat_tmp, mask) + add_sparse(vmat[i], vmat_tmp, mask) t1 = log.timer_debug2('integration', *t1) - ao = c0 = rho1 = None + ao = rho1 = None t0 = log.timer_debug1('vxc', *t0) - vmat = take_last2d(vmat, opt.rev_ao_idx) + vmat = opt.unsort_orbitals(vmat, axis=[1,2]) if xctype != 'LDA': transpose_sum(vmat) @@ -1054,7 +999,8 @@ def nr_rks_fxc_st(ni, mol, grids, xc_code, dm0=None, dms_alpha=None, fxc = fxc[0,:,0] + fxc[0,:,1] else: fxc = fxc[0,:,0] - fxc[0,:,1] - return nr_rks_fxc(ni, mol, grids, xc_code, dm0, dms_alpha, hermi=0, fxc=fxc) + return nr_rks_fxc(ni, mol, grids, xc_code, dm0, dms_alpha, hermi=0, fxc=fxc, + max_memory=max_memory, verbose=verbose) def nr_uks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi=0, @@ -1069,8 +1015,7 @@ def nr_uks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi= opt = ni.gdftopt mol = None _sorted_mol = opt._sorted_mol - coeff = cupy.asarray(opt.coeff) - nao, nao0 = coeff.shape + nao, nao0 = opt.coeff.shape dma, dmb = dms dm_shape = dma.shape # AO basis -> gdftopt AO basis @@ -1078,17 +1023,15 @@ def 
nr_uks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi= if with_mocc: mo1a, mo1b = dms.mo1 occ_coeffa, occ_coeffb = dms.occ_coeff - mo1a = contract('nio,pi->npo', mo1a, coeff) - mo1b = contract('nio,pi->npo', mo1b, coeff) - occ_coeff_a = contract('io,pi->po', occ_coeffa, coeff) - occ_coeff_b = contract('io,pi->po', occ_coeffb, coeff) + mo1a = opt.sort_orbitals(mo1a, axis=[1]) + mo1b = opt.sort_orbitals(mo1b, axis=[1]) + occ_coeff_a = opt.sort_orbitals(occ_coeffa, axis=[0]) + occ_coeff_b = opt.sort_orbitals(occ_coeffb, axis=[0]) dma = cupy.asarray(dma).reshape(-1,nao0,nao0) dmb = cupy.asarray(dmb).reshape(-1,nao0,nao0) - dma = contract('nij,qj->niq', dma, coeff) - dma = contract('pi,niq->npq', coeff, dma) - dmb = contract('nij,qj->niq', dmb, coeff) - dmb = contract('pi,niq->npq', coeff, dmb) + dma = opt.sort_orbitals(dma, axis=[1,2]) + dmb = opt.sort_orbitals(dmb, axis=[1,2]) nset = len(dma) vmata = cupy.zeros((nset, nao, nao)) @@ -1096,84 +1039,65 @@ def nr_uks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi= if xctype == 'LDA': ao_deriv = 0 + nvar = 1 + elif xctype == 'GGA': + ao_deriv = 1 + nvar = 4 else: ao_deriv = 1 - with_lapl = MGGA_DENSITY_LAPL - p0 = 0 - p1 = 0 - for ao, mask, weights, coords in ni.block_loop(_sorted_mol, grids, nao, ao_deriv): + nvar = 5 + p0 = p1 = 0 + for ao, mask, weights, coords in ni.block_loop( + _sorted_mol, grids, nao, ao_deriv, max_memory=max_memory): t0 = log.init_timer() p0, p1 = p1, p1+len(weights) + # precompute fxc_w + fxc_w = fxc[:,:,:,:,p0:p1] * weights + # precompute molecular orbitals if with_mocc: occ_coeff_a_mask = occ_coeff_a[mask] occ_coeff_b_mask = occ_coeff_b[mask] - if xctype == 'LDA': - c0_a = _dot_ao_dm(_sorted_mol, ao, occ_coeff_a_mask, None, None, None) - c0_b = _dot_ao_dm(_sorted_mol, ao, occ_coeff_b_mask, None, None, None) - elif xctype == "GGA": - c0_a = contract('nig,io->nog', ao, occ_coeff_a_mask) - c0_b = contract('nig,io->nog', ao, occ_coeff_b_mask) - else: # mgga - c0_a = contract('nig,io->nog', ao, occ_coeff_a_mask) - c0_b = contract('nig,io->nog', ao, occ_coeff_b_mask) - - if with_mocc: - rho1a = eval_rho4(_sorted_mol, ao, c0_a, mo1a[:,mask], xctype=xctype, with_lapl=with_lapl) - rho1b = eval_rho4(_sorted_mol, ao, c0_b, mo1b[:,mask], xctype=xctype, with_lapl=with_lapl) - else: - # slow version - rho1a = [] - rho1b = [] + rho1a = eval_rho4(_sorted_mol, ao, occ_coeff_a_mask, mo1a[:,mask], + xctype=xctype, hermi=hermi) + rho1b = eval_rho4(_sorted_mol, ao, occ_coeff_b_mask, mo1b[:,mask], + xctype=xctype, hermi=hermi) + rho1 = cupy.stack([rho1a, rho1b]).reshape(2, nset, nvar, p1-p0) + else: # slow version + rho1 = cupy.empty((2, nset, nvar, p1-p0)) for i in range(nset): - rho_tmp = eval_rho(_sorted_mol, ao, dma[i][np.ix_(mask,mask)], xctype=xctype, hermi=hermi, with_lapl=with_lapl) - rho1a.append(rho_tmp) - rho_tmp = eval_rho(_sorted_mol, ao, dmb[i][np.ix_(mask,mask)], xctype=xctype, hermi=hermi, with_lapl=with_lapl) - rho1b.append(rho_tmp) - rho1a = cupy.stack(rho1a, axis=0) - rho1b = cupy.stack(rho1b, axis=0) - rho1 = cupy.stack([rho1a, rho1b], axis=0) + rho1[0,i] = eval_rho(_sorted_mol, ao, dma[i,mask[:,None],mask], + xctype=xctype, hermi=hermi) + rho1[1,i] = eval_rho(_sorted_mol, ao, dmb[i,mask[:,None],mask], + xctype=xctype, hermi=hermi) t0 = log.timer_debug1('rho', *t0) - # precompute fxc_w - if xctype == 'LDA': - fxc_w = fxc[:,0,:,0,p0:p1] * weights - else: - fxc_w = fxc[:,:,:,:,p0:p1] * weights - for i in range(nset): + wv = contract('axg,axbyg->byg', rho1[:,i], fxc_w) if xctype == 
'LDA': - wv = contract('ag,abg->bg', rho1[:,i], fxc_w) - va = ao.dot(_scale_ao(ao, wv[0]).T) - vb = ao.dot(_scale_ao(ao, wv[1]).T) - add_sparse(vmata[i], va, mask) - add_sparse(vmatb[i], vb, mask) + va = ao.dot(_scale_ao(ao, wv[0,0]).T) + vb = ao.dot(_scale_ao(ao, wv[1,0]).T) elif xctype == 'GGA': - wv = contract('axg,axbyg->byg', rho1[:,i], fxc_w) - wv[:,0] *= .5 + wv[:,0] *= .5 # for transpose_sum at the end va = ao[0].dot(_scale_ao(ao, wv[0]).T) vb = ao[0].dot(_scale_ao(ao, wv[1]).T) - add_sparse(vmata[i], va, mask) - add_sparse(vmatb[i], vb, mask) elif xctype == 'NLC': raise NotImplementedError('NLC') else: - wv = contract('axg,axbyg->byg', rho1[:,i], fxc_w) - wv[:,[0, 4]] *= .5 + wv[:,[0,4]] *= .5 # for transpose_sum at the end va = ao[0].dot(_scale_ao(ao[:4], wv[0,:4]).T) vb = ao[0].dot(_scale_ao(ao[:4], wv[1,:4]).T) va += _tau_dot(ao, ao, wv[0,4]) vb += _tau_dot(ao, ao, wv[1,4]) - add_sparse(vmata[i], va, mask) - add_sparse(vmatb[i], vb, mask) - vmata = [coeff.T @ v @ coeff for v in vmata] - vmatb = [coeff.T @ v @ coeff for v in vmatb] + add_sparse(vmata[i], va, mask) + add_sparse(vmatb[i], vb, mask) + vmata = opt.unsort_orbitals(vmata, axis=[1,2]) + vmatb = opt.unsort_orbitals(vmatb, axis=[1,2]) if xctype != 'LDA': # For real orbitals, K_{ia,bj} = K_{ia,jb}. It simplifies real fxc_jb # [(\nabla mu) nu + mu (\nabla nu)] * fxc_jb = ((\nabla mu) nu f_jb) + h.c. - for i in range(nset): - vmata[i] = vmata[i] + vmata[i].T - vmatb[i] = vmatb[i] + vmatb[i].T + transpose_sum(vmata) + transpose_sum(vmatb) if FREE_CUPY_CACHE: dma = dmb = None @@ -1228,23 +1152,22 @@ def nr_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, nao, nao0 = opt.coeff.shape mol = None _sorted_mol = opt._sorted_mol - coeff = cupy.asarray(opt.coeff) - dms = [coeff @ dm @ coeff.T for dm in dms.reshape(-1,nao0,nao0)] + + dms = dms.reshape(-1,nao0,nao0) assert len(dms) == 1 + dms = opt.sort_orbitals(dms, axis=[1,2]) if mo_coeff is not None: - mo_coeff = coeff @ mo_coeff - with_lapl = MGGA_DENSITY_LAPL + mo_coeff = opt.sort_orbitals(mo_coeff, axis=[0]) ao_deriv = 1 vvrho = [] for ao, idx, weight, coords \ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, max_memory=max_memory): - #rho = eval_rho(opt.mol, ao, dms[0][np.ix_(mask,mask)], xctype='GGA', hermi=1) if mo_coeff is None: - rho = eval_rho(_sorted_mol, ao, dms[0][np.ix_(idx,idx)], xctype='GGA', hermi=1, with_lapl=with_lapl) + rho = eval_rho(_sorted_mol, ao, dms[0][idx[:,None],idx], xctype='GGA', hermi=1) else: mo_coeff_mask = mo_coeff[idx,:] - rho = eval_rho2(_sorted_mol, ao, mo_coeff_mask, mo_occ, None, 'GGA', with_lapl) + rho = eval_rho2(_sorted_mol, ao, mo_coeff_mask, mo_occ, None, 'GGA') vvrho.append(rho) rho = cupy.hstack(vvrho) @@ -1277,7 +1200,7 @@ def nr_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, t1 = log.timer_debug1('integration', *t1) transpose_sum(vmat) - vmat = take_last2d(vmat, opt.rev_ao_idx) + vmat = opt.unsort_orbitals(vmat, axis=[0,1]) log.timer_debug1('eval vv10', *t0) return nelec, excsum, vmat @@ -1293,7 +1216,6 @@ def cache_xc_kernel(ni, mol, grids, xc_code, mo_coeff, mo_occ, spin=0, raise NotImplementedError('NLC') else: ao_deriv = 0 - with_lapl = MGGA_DENSITY_LAPL opt = getattr(ni, 'gdftopt', None) if opt is None or mol not in [opt.mol, opt._sorted_mol]: ni.build(mol, grids.coords) @@ -1301,28 +1223,34 @@ def cache_xc_kernel(ni, mol, grids, xc_code, mo_coeff, mo_occ, spin=0, mol = None _sorted_mol = opt._sorted_mol - coeff = cupy.asarray(opt.coeff) - nao = coeff.shape[0] - if spin == 0: - mo_coeff = coeff @ 
mo_coeff + mo_coeff = cupy.asarray(mo_coeff) + nao = opt.coeff.shape[0] + if mo_coeff.ndim == 2: # RHF + mo_coeff = opt.sort_orbitals(mo_coeff, axis=[0]) rho = [] t1 = t0 = log.init_timer() - for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv): + for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, + max_memory=max_memory): mo_coeff_mask = mo_coeff[idx,:] - rho_slice = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask, mo_occ, None, xctype, with_lapl) + rho_slice = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask, mo_occ, None, xctype) rho.append(rho_slice) t1 = log.timer_debug2('eval rho slice', *t1) rho = cupy.hstack(rho) + if spin == 1: # RKS with nr_rks_fxc_st + rho *= .5 + rho = cupy.repeat(rho[None], 2, axis=0) t0 = log.timer_debug1('eval rho in fxc', *t0) else: - mo_coeff = contract('ip,npj->nij', coeff, cupy.asarray(mo_coeff)) + assert spin == 1 + mo_coeff = opt.sort_orbitals(mo_coeff, axis=[1]) rhoa = [] rhob = [] t1 = t0 = log.init_timer() - for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv): + for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, + max_memory=max_memory): mo_coeff_mask = mo_coeff[:,idx,:] - rhoa_slice = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask[0], mo_occ[0], None, xctype, with_lapl) - rhob_slice = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask[1], mo_occ[1], None, xctype, with_lapl) + rhoa_slice = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask[0], mo_occ[0], None, xctype) + rhob_slice = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask[1], mo_occ[1], None, xctype) rhoa.append(rhoa_slice) rhob.append(rhob_slice) t1 = log.timer_debug2('eval rho in fxc', *t1) @@ -1348,7 +1276,8 @@ def eval_xc_eff(ni, xc_code, rho, deriv=1, omega=None, xctype=None, verbose=None if omega is None: omega = ni.omega if xctype is None: xctype = ni._xc_type(xc_code) - if ni.xcfuns is None: ni.xcfuns = _init_xcfuns(xc_code, spin_polarized) + + xcfuns = ni._init_xcfuns(xc_code, spin_polarized) inp = {} if not spin_polarized: @@ -1391,13 +1320,13 @@ def eval_xc_eff(ni, xc_code, rho, deriv=1, omega=None, xctype=None, verbose=None "v3sigma2lapl", "v3sigma2tau", "v3sigmalapl2", "v3sigmalapltau", "v3sigmatau2", "v3lapl3", "v3lapl2tau", "v3lapltau2", "v3tau3"] - if len(ni.xcfuns) == 1: - xcfun, _ = ni.xcfuns[0] + if len(xcfuns) == 1: + xcfun, _ = xcfuns[0] xc_res = xcfun.compute(inp, do_exc=True, do_vxc=do_vxc, do_fxc=do_fxc, do_kxc=do_kxc) ret_full = xc_res else: ret_full = {} - for xcfun, w in ni.xcfuns: + for xcfun, w in xcfuns: xc_res = xcfun.compute(inp, do_exc=True, do_vxc=do_vxc, do_fxc=do_fxc, do_kxc=do_kxc) for label in xc_res: if label in ret_full: @@ -1539,11 +1468,14 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000, pad, idx, non0shl_idx, ctr_offsets_slice, ao_loc_slice = ni.non0ao_idx[block_id, blksize, ngrids] ao_mask = eval_ao( - ni, _sorted_mol, coords, deriv, + _sorted_mol, coords, deriv, nao_slice=len(idx), shls_slice=non0shl_idx, ao_loc_slice=ao_loc_slice, - ctr_offsets_slice=ctr_offsets_slice) + ctr_offsets_slice=ctr_offsets_slice, + gdftopt=opt, + transpose=False + ) t1 = log.timer_debug2('evaluate ao slice', *t1) if pad > 0: @@ -1579,7 +1511,7 @@ def _grouped_block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000, raise RuntimeError('Not enough GPU memory') opt = getattr(ni, 'gdftopt', None) - if opt is None or mol not in [opt.mol, opt._sorted_mol]: + if opt is None: ni.build(mol, grids.coords) opt = ni.gdftopt @@ -1590,7 +1522,6 @@ def 
_grouped_block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000, total_used_bytes = 0 mem_limit = get_avail_mem() - mol = None _sorted_mol = opt._sorted_mol with opt.gdft_envs_cache(): block_id = 0 @@ -1605,11 +1536,14 @@ def _grouped_block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000, pad, idx, non0shl_idx, ctr_offsets_slice, ao_loc_slice = ni.non0ao_idx[block_id, blksize, ngrids] ao_mask = eval_ao( - ni, _sorted_mol, coords, deriv, + _sorted_mol, coords, deriv, nao_slice=len(idx), shls_slice=non0shl_idx, ao_loc_slice=ao_loc_slice, - ctr_offsets_slice=ctr_offsets_slice) + ctr_offsets_slice=ctr_offsets_slice, + gdftopt=opt, + transpose=False + ) if pad > 0: if deriv == 0: @@ -1660,7 +1594,7 @@ def _xc_type(self, xc_code): class NumInt(lib.StreamObject, LibXCMixin): from gpu4pyscf.lib.utils import to_gpu, device - _keys = {'screen_idx', 'xcfuns', 'gdftopt'} + _keys = {'screen_index', 'xcfuns', 'gdftopt', 'pair_mask', 'grid_blksize', 'non0ao_idx'} gdftopt = None pair_mask = None screen_index = None @@ -1700,14 +1634,27 @@ def build(self, mol, coords): # cannot patch this function eval_xc_eff = eval_xc_eff block_loop = _block_loop - eval_rho2 = eval_rho2 - eval_ao = eval_ao - #eval_rho2 = staticmethod(eval_rho2) + eval_ao = staticmethod(eval_ao) + eval_rho = staticmethod(eval_rho) + eval_rho2 = staticmethod(eval_rho2) def to_cpu(self): ni = numint.NumInt() return ni + @lru_cache(10) + def _init_xcfuns(self, xc_code, spin): + return _init_xcfuns(xc_code, spin) + + def reset(self): + self.gdftopt = None + self.pair_mask = None + self.screen_index = None + self.xcfuns = None + self.grid_blksize = None + self.non0ao_idx = {} + return self + def _make_pairs2shls_idx(pair_mask, l_bas_loc, hermi=0): if hermi: pair_mask = np.tril(pair_mask) @@ -1985,9 +1932,7 @@ def build(self, mol=None): coeff = np.vstack([coeff, np.zeros((paddings, coeff.shape[1]))]) pmol._decontracted = True self._sorted_mol = pmol - inv_idx = np.argsort(ao_idx, kind='stable').astype(np.int32) - self.ao_idx = cupy.asarray(ao_idx, dtype=np.int32) - self.rev_ao_idx = cupy.asarray(inv_idx, dtype=np.int32) + self._ao_idx = cupy.asarray(ao_idx, dtype=np.int32) self.coeff = coeff[ao_idx] self.l_ctr_offsets = np.append(0, np.cumsum(l_ctr_counts)).astype(np.int32) self.l_bas_offsets = np.append(0, np.cumsum(l_counts)).astype(np.int32) @@ -2014,5 +1959,40 @@ def gdft_envs_cache(self): finally: libgdft.GDFTdel_envs(ctypes.byref(self.envs_cache)) + def sort_orbitals(self, mat, axis=[]): + ''' Transform given axis of a matrix into sorted AO + ''' + idx = self._ao_idx + shape_ones = (1,) * mat.ndim + fancy_index = [] + for dim, n in enumerate(mat.shape): + if dim in axis: + assert n == len(idx) + indices = idx + else: + indices = np.arange(n) + idx_shape = shape_ones[:dim] + (-1,) + shape_ones[dim+1:] + fancy_index.append(indices.reshape(idx_shape)) + return mat[tuple(fancy_index)] + + def unsort_orbitals(self, sorted_mat, axis=[], out=None): + ''' Transform given axis of a matrix into original AO + ''' + idx = self._ao_idx + shape_ones = (1,) * sorted_mat.ndim + fancy_index = [] + for dim, n in enumerate(sorted_mat.shape): + if dim in axis: + assert n == len(idx) + indices = idx + else: + indices = np.arange(n) + idx_shape = shape_ones[:dim] + (-1,) + shape_ones[dim+1:] + fancy_index.append(indices.reshape(idx_shape)) + if out is None: + out = cupy.empty_like(sorted_mat) + out[tuple(fancy_index)] = sorted_mat + return out + class _GDFTEnvsCache(ctypes.Structure): pass diff --git a/gpu4pyscf/dft/rks.py b/gpu4pyscf/dft/rks.py 
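The sort_orbitals/unsort_orbitals pair defined above replaces the previous take_last2d/rev_ao_idx bookkeeping: sort_orbitals gathers the requested axes through the stored _ao_idx permutation, and unsort_orbitals scatters a sorted array back through the same index, so the inverse permutation no longer needs to be stored. A minimal standalone NumPy sketch of the round trip (illustrative only; idx stands in for opt._ao_idx):

import numpy as np

idx = np.array([2, 0, 1])            # stand-in for opt._ao_idx
m = np.arange(9.0).reshape(3, 3)

# sort_orbitals(m, axis=[0,1]): gather both axes through idx
sorted_m = m[idx[:, None], idx]

# unsort_orbitals(sorted_m, axis=[0,1]): scatter back through the same
# fancy index; no inverse permutation (rev_ao_idx) is required
restored = np.empty_like(sorted_m)
restored[idx[:, None], idx] = sorted_m
assert np.array_equal(restored, m)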
index fb3820b3..333034f8 100644 --- a/gpu4pyscf/dft/rks.py +++ b/gpu4pyscf/dft/rks.py @@ -25,15 +25,13 @@ from gpu4pyscf.lib import logger from gpu4pyscf.dft import numint, gen_grid from gpu4pyscf.scf import hf -from gpu4pyscf.lib.cupy_helper import load_library, tag_array +from gpu4pyscf.lib.cupy_helper import tag_array from pyscf import __config__ __all__ = [ - 'get_veff', 'RKS' + 'get_veff', 'RKS', 'KohnShamDFT', ] -libcupy_helper = load_library('libcupy_helper') - def prune_small_rho_grids_(ks, mol, dm, grids): rho = ks._numint.get_rho(mol, dm, grids, ks.max_memory, verbose=ks.verbose) @@ -134,16 +132,14 @@ def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1): if hermi == 2: # because rho = 0 n, exc, vxc = 0, 0, 0 else: - max_memory = ks.max_memory - lib.current_memory()[0] - n, exc, vxc = ni.nr_rks(mol, ks.grids, ks.xc, dm, max_memory=max_memory) + n, exc, vxc = ni.nr_rks(mol, ks.grids, ks.xc, dm) if ks.do_nlc(): if ni.libxc.is_nlc(ks.xc): xc = ks.xc else: assert ni.libxc.is_nlc(ks.nlc) xc = ks.nlc - n, enlc, vnlc = ni.nr_nlc_vxc(mol, ks.nlcgrids, xc, dm, - max_memory=max_memory) + n, enlc, vnlc = ni.nr_nlc_vxc(mol, ks.nlcgrids, xc, dm) exc += enlc vxc += vnlc @@ -151,8 +147,7 @@ def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1): t0 = logger.timer_debug1(ks, 'vxc tot', *t0) #enabling range-separated hybrids - omega, alpha, hyb = ni.rsh_and_hybrid_coeff(ks.xc, spin=mol.spin) - if abs(hyb) < 1e-10 and abs(alpha) < 1e-10: + if not ni.libxc.is_hybrid_xc(ks.xc): vk = None if (ks._eri is None and ks.direct_scf and getattr(vhf_last, 'vj', None) is not None): @@ -164,6 +159,7 @@ def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1): vxc += vj else: + omega, alpha, hyb = ni.rsh_and_hybrid_coeff(ks.xc, spin=mol.spin) if (ks._eri is None and ks.direct_scf and getattr(vhf_last, 'vk', None) is not None): ddm = cupy.asarray(dm) - cupy.asarray(dm_last) @@ -232,6 +228,16 @@ def energy_elec(ks, dm=None, h1e=None, vhf=None): # Inherit pyscf KohnShamDFT class since this is tested in the pyscf dispersion code class KohnShamDFT(rks.KohnShamDFT): + _keys = {'cphf_grids', *rks.KohnShamDFT._keys} + + to_rhf = NotImplemented + to_uhf = NotImplemented + to_ghf = NotImplemented + to_hf = NotImplemented + to_rks = NotImplemented + to_uks = NotImplemented + to_gks = NotImplemented + _keys = rks.KohnShamDFT._keys def __init__(self, xc='LDA,VWN'): @@ -245,6 +251,14 @@ def __init__(self, xc='LDA,VWN'): self.nlcgrids = gen_grid.Grids(self.mol) self.nlcgrids.level = getattr( __config__, 'dft_rks_RKS_nlcgrids_level', self.nlcgrids.level) + + # Default CPHF grids is SG1 grids + # Reference: + # https://gaussian.com/integral/?tabid=1#Integral_keyword__Grid_option + self.cphf_grids = gen_grid.Grids(self.mol) + self.cphf_grids.prune = gen_grid.sg1_prune + self.cphf_grids.atom_grid = (50,194) + # Use rho to filter grids self.small_rho_cutoff = getattr( __config__, 'dft_rks_RKS_small_rho_cutoff', 1e-7) @@ -261,7 +275,7 @@ def omega(self, v): def dump_flags(self, verbose=None): # TODO: add this later return - + reset = rks.KohnShamDFT.reset do_nlc = rks.KohnShamDFT.do_nlc @@ -285,7 +299,8 @@ def reset(self, mol=None): hf.SCF.reset(self, mol) self.grids.reset(mol) self.nlcgrids.reset(mol) - self._numint.gdftopt = None + self.cphf_grids.reset(mol) + self._numint.reset() return self def nuc_grad_method(self): diff --git a/gpu4pyscf/dft/tests/test_ao_values.py b/gpu4pyscf/dft/tests/test_ao_values.py index 86d52d6c..8a1a1457 100644 --- a/gpu4pyscf/dft/tests/test_ao_values.py +++ 
b/gpu4pyscf/dft/tests/test_ao_values.py @@ -55,40 +55,35 @@ def test_ao_sph_deriv0(self): coords = np.random.random((100,3)) ao = mol_sph.eval_gto('GTOval_sph_deriv0', coords) ao_cpu = cupy.asarray(ao) - ni = NumInt() - ao_gpu = numint.eval_ao(ni, mol_sph, coords, deriv=0) + ao_gpu = numint.eval_ao(mol_sph, coords, deriv=0) assert cupy.linalg.norm(ao_cpu - ao_gpu) < 1e-8 - + def test_ao_sph_deriv1(self): coords = np.random.random((100,3)) ao = mol_sph.eval_gto('GTOval_sph_deriv1', coords) ao_cpu = cupy.asarray(ao) - ni = NumInt() - ao_gpu = numint.eval_ao(ni, mol_sph, coords, deriv=1) + ao_gpu = numint.eval_ao(mol_sph, coords, deriv=1) assert cupy.linalg.norm(ao_cpu - ao_gpu) < 1e-8 def test_ao_sph_deriv2(self): coords = np.random.random((4,3)) ao = mol_sph.eval_gto('GTOval_sph_deriv2', coords) ao_cpu = cupy.asarray(ao) - ni = NumInt() - ao_gpu = numint.eval_ao(ni, mol_sph, coords, deriv=2) + ao_gpu = numint.eval_ao(mol_sph, coords, deriv=2) assert cupy.linalg.norm(ao_cpu - ao_gpu) < 1e-8 def test_ao_sph_deriv3(self): coords = np.random.random((100,3)) ao = mol_sph.eval_gto('GTOval_sph_deriv3', coords) ao_cpu = cupy.asarray(ao) - ni = NumInt() - ao_gpu = numint.eval_ao(ni, mol_sph, coords, deriv=3) + ao_gpu = numint.eval_ao(mol_sph, coords, deriv=3) assert cupy.linalg.norm(ao_cpu - ao_gpu) < 1e-8 def test_ao_sph_deriv4(self): coords = np.random.random((100,3)) ao = mol_sph.eval_gto('GTOval_sph_deriv4', coords) ao_cpu = cupy.asarray(ao) - ni = NumInt() - ao_gpu = numint.eval_ao(ni, mol_sph, coords, deriv=4) + ao_gpu = numint.eval_ao(mol_sph, coords, deriv=4) assert cupy.linalg.norm(ao_cpu - ao_gpu) < 1e-8 # cart mol @@ -96,24 +91,21 @@ def test_ao_cart_deriv0(self): coords = np.random.random((100,3)) ao = mol_cart.eval_gto('GTOval_cart_deriv0', coords) ao_cpu = cupy.asarray(ao) - ni = NumInt() - ao_gpu = numint.eval_ao(ni, mol_cart, coords, deriv=0) + ao_gpu = numint.eval_ao(mol_cart, coords, deriv=0) assert cupy.linalg.norm(ao_cpu - ao_gpu) < 1e-8 def test_ao_cart_deriv1(self): coords = np.random.random((100,3)) ao = mol_cart.eval_gto('GTOval_cart_deriv1', coords) ao_cpu = cupy.asarray(ao) - ni = NumInt() - ao_gpu = numint.eval_ao(ni, mol_cart, coords, deriv=1) + ao_gpu = numint.eval_ao(mol_cart, coords, deriv=1) assert cupy.linalg.norm(ao_cpu - ao_gpu) < 1e-8 def test_ao_cart_deriv2(self): coords = np.random.random((100,3)) ao = mol_cart.eval_gto('GTOval_cart_deriv2', coords) ao_cpu = cupy.asarray(ao) - ni = NumInt() - ao_gpu = numint.eval_ao(ni, mol_cart, coords, deriv=2) + ao_gpu = numint.eval_ao(mol_cart, coords, deriv=2) assert cupy.linalg.norm(ao_cpu - ao_gpu) < 1e-8 def test_ao_cart_deriv3(self): @@ -128,8 +120,7 @@ def test_ao_cart_deriv4(self): coords = np.random.random((100,3)) ao = mol_cart.eval_gto('GTOval_cart_deriv4', coords) ao_cpu = cupy.asarray(ao) - ni = NumInt() - ao_gpu = numint.eval_ao(ni, mol_cart, coords, deriv=4) + ao_gpu = numint.eval_ao(mol_cart, coords, deriv=4) assert cupy.linalg.norm(ao_cpu - ao_gpu) < 1e-8 if __name__ == "__main__": diff --git a/gpu4pyscf/dft/tests/test_libxc.py b/gpu4pyscf/dft/tests/test_libxc.py index 229f0854..80d305aa 100644 --- a/gpu4pyscf/dft/tests/test_libxc.py +++ b/gpu4pyscf/dft/tests/test_libxc.py @@ -47,8 +47,12 @@ def tearDownModule(): mol.stdout.close() del mol +def _diff(dat, ref): + d = dat - ref + return np.min((abs(d/(ref+1e-300)), abs(d)), axis=0) + class KnownValues(unittest.TestCase): - def _check_xc(self, xc): + def _check_xc(self, xc, spin=0, fxc_tol=1e-10, kxc_tol=1e-10): ni_cpu = numint_cpu() ni_gpu = numint_gpu() 
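A note on the _diff helper added to test_libxc.py above: for each element it keeps the smaller of the relative and the absolute error, so near-zero reference values (where the relative error diverges) are scored by absolute deviation while everything else is scored relatively; the 1e-300 guard avoids division by zero. A small self-contained illustration of that behavior:

import numpy as np

def _diff(dat, ref):
    d = dat - ref
    return np.min((abs(d/(ref+1e-300)), abs(d)), axis=0)

ref = np.array([0.0, 1e-14, 2.0])
dat = ref + 1e-12        # small absolute deviation everywhere
# A purely relative test would fail on the first two entries; the mixed
# metric falls back to the absolute error there.
assert _diff(dat, ref).max() < 1e-10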
xctype = ni_cpu._xc_type(xc) @@ -60,26 +64,42 @@ def _check_xc(self, xc): grids = Grids(mol).build() ao = ni_cpu.eval_ao(mol, grids.coords, ao_deriv) rho = ni_cpu.eval_rho(mol, ao, dm0, xctype=xctype) + if spin != 0: + rho = (rho, rho) exc_cpu, vxc_cpu, fxc_cpu, kxc_cpu = ni_cpu.eval_xc_eff(xc, rho, deriv=2, xctype=xctype) exc_gpu, vxc_gpu, fxc_gpu, kxc_gpu = ni_gpu.eval_xc_eff(xc, cupy.array(rho), deriv=2, xctype=xctype) - assert(np.linalg.norm((exc_gpu[:,0].get() - exc_cpu)) < 1e-10) - assert(np.linalg.norm((vxc_gpu.get() - vxc_cpu)) < 1e-10) + assert _diff(exc_gpu[:,0].get(), exc_cpu).max() < 1e-10 + assert _diff(vxc_gpu.get(), vxc_cpu).max() < 1e-10 if fxc_gpu is not None: - assert(np.linalg.norm((fxc_gpu.get() - fxc_cpu))/np.linalg.norm(fxc_cpu) < 1e-6) + assert _diff(fxc_gpu.get(), fxc_cpu).max() < fxc_tol if kxc_gpu is not None: - assert(np.linalg.norm(kxc_gpu.get() - kxc_cpu) < 1e-5) + assert _diff(kxc_gpu.get(), kxc_cpu).max() < kxc_tol def test_LDA(self): self._check_xc('LDA_C_VWN') def test_GGA(self): - self._check_xc('GGA_C_PBE') + self._check_xc('HYB_GGA_XC_B3LYP') + self._check_xc('GGA_X_B88', fxc_tol=1e-10) + self._check_xc('GGA_C_PBE', fxc_tol=1e-5) def test_mGGA(self): - self._check_xc('MGGA_C_M06') + self._check_xc('MGGA_C_M06', fxc_tol=1e-5) + + def test_u_LDA(self): + self._check_xc('LDA_C_VWN', spin=1) + + def test_u_GGA(self): + # large errors found in B88 for the spin polarized case + self._check_xc('HYB_GGA_XC_B3LYP', spin=1, fxc_tol=1e-3) + self._check_xc('GGA_X_B88', spin=1, fxc_tol=1e-1) + self._check_xc('GGA_C_PBE', spin=1, fxc_tol=1e-5) + + def test_u_mGGA(self): + self._check_xc('MGGA_C_M06', spin=1, fxc_tol=1e-5) if __name__ == "__main__": print("Full Tests for xc fun") - unittest.main() \ No newline at end of file + unittest.main() diff --git a/gpu4pyscf/dft/tests/test_numint.py b/gpu4pyscf/dft/tests/test_numint.py index ba34f63d..505df831 100644 --- a/gpu4pyscf/dft/tests/test_numint.py +++ b/gpu4pyscf/dft/tests/test_numint.py @@ -155,7 +155,7 @@ def test_rks_gga(self): def test_rks_mgga(self): self._check_vxc('nr_rks', MGGA_M06) - + def test_uks_lda(self): self._check_vxc('nr_uks', LDA)#'lda', -6.362059440515177) @@ -212,7 +212,25 @@ def test_vv10(self): v = dft.numint._vv10nlc(rho, coords, vvrho, vvweight, vvcoords, nlc_pars) self.assertAlmostEqual(lib.fp(v[0].get()), 0.15894647203764295, 8) self.assertAlmostEqual(lib.fp(v[1].get()), 0.20500922537924576, 8) - return + + def test_eval_rho(self): + np.random.seed(1) + dm = np.random.random(dm0.shape) + ni_gpu = NumInt() + ni_cpu = pyscf_numint() + for xctype in ('LDA', 'GGA', 'MGGA'): + deriv = 1 + if xctype == 'LDA': + deriv = 0 + ao_gpu = ni_gpu.eval_ao(mol, grids_gpu.coords, deriv=deriv, transpose=False) + ao_cpu = ni_cpu.eval_ao(mol, grids_cpu.coords, deriv=deriv) + rho = ni_gpu.eval_rho(mol, ao_gpu, dm, xctype=xctype, hermi=0, with_lapl=False) + ref = ni_cpu.eval_rho(mol, ao_cpu, dm, xctype=xctype, hermi=0, with_lapl=False) + self.assertAlmostEqual(abs(rho.get() - ref).max(), 0, 10) + + rho = ni_gpu.eval_rho(mol, ao_gpu, dm0, xctype=xctype, hermi=1, with_lapl=False) + ref = ni_cpu.eval_rho(mol, ao_cpu, dm0, xctype=xctype, hermi=1, with_lapl=False) + self.assertAlmostEqual(abs(rho.get() - ref).max(), 0, 10) if __name__ == "__main__": print("Full Tests for dft numint") diff --git a/gpu4pyscf/dft/uks.py b/gpu4pyscf/dft/uks.py index 398f8b81..7ccf20c7 100644 --- a/gpu4pyscf/dft/uks.py +++ b/gpu4pyscf/dft/uks.py @@ -133,7 +133,8 @@ def reset(self, mol=None): hf.SCF.reset(self, mol) self.grids.reset(mol) 
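The new test_eval_rho above checks the GPU eval_rho against PySCF's CPU reference for each functional rung; note the transpose=False flag to eval_ao, which, judging from the eval_ao changes elsewhere in this diff, keeps the AO values in the GPU-native AO-major layout consumed by the density kernels rather than transposing to the CPU-style grid-major layout. A hedged usage sketch, with mol and a symmetric density matrix dm0 assumed to exist:

import numpy as np
from gpu4pyscf.dft.numint import NumInt

ni = NumInt()
coords = np.random.random((64, 3))
# AO-major output; the transpose is only needed for CPU-style consumers
ao = ni.eval_ao(mol, coords, deriv=0, transpose=False)
rho = ni.eval_rho(mol, ao, dm0, xctype='LDA', hermi=1, with_lapl=False)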
self.nlcgrids.reset(mol) - self._numint.gdftopt = None + self.cphf_grids.reset(mol) + self._numint.reset() return self def nuc_grad_method(self): @@ -145,4 +146,4 @@ def to_cpu(self): mf = uks.UKS(self.mol, xc=self.xc) mf.disp = self.disp utils.to_cpu(self, mf) - return mf \ No newline at end of file + return mf diff --git a/gpu4pyscf/grad/rhf.py b/gpu4pyscf/grad/rhf.py index 7cc5e78d..70ab8240 100644 --- a/gpu4pyscf/grad/rhf.py +++ b/gpu4pyscf/grad/rhf.py @@ -256,8 +256,8 @@ def get_grad_hcore(mf_grad, mo_coeff=None, mo_occ=None): intopt = int3c2e.VHFOpt(mol, fakemol, 'int2e') intopt.build(1e-14, diag_block_with_triu=True, aosym=False, group_size=int3c2e.BLKSIZE, group_size_aux=int3c2e.BLKSIZE) - orbo_sorted = orbo[intopt.ao_idx] - mo_coeff_sorted = mo_coeff[intopt.ao_idx] + orbo_sorted = intopt.sort_orbitals(orbo, axis=[0]) + mo_coeff_sorted = intopt.sort_orbitals(mo_coeff, axis=[0]) for i0,i1,j0,j1,k0,k1,int3c_blk in int3c2e.loop_int3c2e_general(intopt, ip_type='ip1'): dh1e[k0:k1,:,j0:j1,:] += contract('xkji,io->kxjo', int3c_blk, orbo_sorted[i0:i1]) dh1e[k0:k1,:,i0:i1,:] += contract('xkji,jo->kxio', int3c_blk, orbo_sorted[j0:j1]) diff --git a/gpu4pyscf/grad/rks.py b/gpu4pyscf/grad/rks.py index 2ef4a6d8..1fd43ac0 100644 --- a/gpu4pyscf/grad/rks.py +++ b/gpu4pyscf/grad/rks.py @@ -135,9 +135,8 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, coeff = cupy.asarray(opt.coeff) nao, nao0 = coeff.shape dms = cupy.asarray(dms).reshape(-1,nao0,nao0) - dms = take_last2d(dms, opt.ao_idx) - mo_coeff = mo_coeff[opt.ao_idx] - + dms = opt.sort_orbitals(dms, axis=[1,2]) + mo_coeff = opt.sort_orbitals(mo_coeff, axis=[0]) nset = len(dms) assert nset == 1 vmat = cupy.zeros((nset,3,nao,nao)) @@ -179,8 +178,7 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, vtmp = _gga_grad_sum_(ao_mask, wv) vtmp += _tau_grad_dot_(ao_mask, wv[4]) add_sparse(vmat[idm], vtmp, idx) - #vmat = [cupy.einsum('pi,npq,qj->nij', coeff, v, coeff) for v in vmat] - vmat = take_last2d(vmat, opt.rev_ao_idx) + vmat = opt.unsort_orbitals(vmat, axis=[2,3]) exc = None if nset == 1: vmat = vmat[0] @@ -203,10 +201,9 @@ def get_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, _sorted_mol = opt._sorted_mol coeff = cupy.asarray(opt.coeff) nao, nao0 = coeff.shape - dms = cupy.asarray(dms) - dms = [coeff @ dm @ coeff.T - for dm in dms.reshape(-1,nao0,nao0)] - mo_coeff = coeff @ mo_coeff + dms = cupy.asarray(dms).reshape(-1,nao0,nao0) + dms = opt.sort_orbitals(dms, axis=[1,2]) + mo_coeff = opt.sort_orbitals(mo_coeff, axis=[0]) nset = len(dms) assert nset == 1 @@ -238,10 +235,7 @@ def get_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, vmat_tmp = _gga_grad_sum_(ao_mask, wv) add_sparse(vmat, vmat_tmp, mask) - #vmat = contract('npq,qj->npj', vmat, coeff) - #vmat = contract('pi,npj->nij', coeff, vmat) - rev_ao_idx = opt.rev_ao_idx - vmat = take_last2d(vmat, rev_ao_idx) + vmat = opt.unsort_orbitals(vmat, axis=[1,2]) exc = None # - sign because nabla_X = -nabla_x return exc, -vmat @@ -358,7 +352,7 @@ def get_vxc_full_response(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, for atm_id, (coords, weight, weight1) in enumerate(grids_response_cc(grids)): ngrids = weight.size for p0, p1 in lib.prange(0,ngrids,block_size): - ao = numint.eval_ao(ni, _sorted_mol, coords[p0:p1, :], ao_deriv) + ao = numint.eval_ao(_sorted_mol, coords[p0:p1, :], ao_deriv, gdftopt=opt, transpose=False) if xctype == 'LDA': rho = numint.eval_rho(_sorted_mol, ao[0], dms, @@ -409,7 +403,7 @@ def get_vxc_full_response(ni, 
mol, grids, xc_code, dms, relativity=0, hermi=1, #:vmat = cupy.einsum('pi,npq,qj->nij', coeff, vmat, coeff) vmat = sandwich_dot(vmat, coeff) - + # - sign because nabla_X = -nabla_x return excsum, -vmat @@ -424,7 +418,7 @@ def grids_response_cc(grids): atm_dist = gto.inter_distance(mol, atm_coords) atm_dist = cupy.asarray(atm_dist) atm_coords = cupy.asarray(atm_coords) - + def _radii_adjust(mol, atomic_radii): charges = mol.atom_charges() if grids.radii_adjust == radi.treutler_atomic_radii_adjust: diff --git a/gpu4pyscf/grad/uks.py b/gpu4pyscf/grad/uks.py index 32848381..32d18207 100644 --- a/gpu4pyscf/grad/uks.py +++ b/gpu4pyscf/grad/uks.py @@ -90,7 +90,7 @@ def get_veff(ks_grad, mol=None, dm=None, verbose=None): vxc_tmp[0] += vnlc vxc_tmp[1] += vnlc t0 = logger.timer(ks_grad, 'vxc', *t0) - + mo_coeff_alpha = mf.mo_coeff[0] mo_coeff_beta = mf.mo_coeff[1] occ_coeff0 = cupy.asarray(mo_coeff_alpha[:, mf.mo_occ[0]>0.5], order='C') @@ -139,9 +139,8 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, coeff = cupy.asarray(opt.coeff) nao, nao0 = coeff.shape dms = cupy.asarray(dms) - dms = take_last2d(dms, opt.ao_idx) - mo_coeff = mo_coeff[:, opt.ao_idx] - + dms = opt.sort_orbitals(dms, axis=[1,2]) + mo_coeff = opt.sort_orbitals(mo_coeff, axis=[1]) nset = len(dms) vmat = cupy.zeros((nset,3,nao,nao)) if xctype == 'LDA': @@ -193,7 +192,7 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, vtmp += rks_grad._tau_grad_dot_(ao_mask, wv[1,4]) add_sparse(vmat[1], vtmp, idx) - vmat = take_last2d(vmat, opt.rev_ao_idx) + vmat = opt.unsort_orbitals(vmat, axis=[2,3]) exc = None # - sign because nabla_X = -nabla_x @@ -216,8 +215,7 @@ def get_vxc_full_response(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, nao, nao0 = coeff.shape dms = cupy.asarray(dms) assert dms.ndim == 3 and dms.shape[0] == 2 - #:dms = cupy.einsum('pi,nij,qj->npq', coeff, dms, coeff) - dms = sandwich_dot(dms.reshape(-1,nao0,nao0), coeff.T) + dms = opt.sort_orbitals(dms.reshape(-1,nao0,nao0), axis=[1,2]) excsum = cupy.zeros((natm, 3)) vmat = cupy.zeros((2,3,nao,nao)) @@ -239,7 +237,7 @@ def get_vxc_full_response(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, for atm_id, (coords, weight, weight1) in enumerate(rks_grad.grids_response_cc(grids)): ngrids = weight.size for p0, p1 in lib.prange(0,ngrids,block_size): - ao = numint.eval_ao(ni, _sorted_mol, coords[p0:p1, :], ao_deriv) + ao = numint.eval_ao(_sorted_mol, coords[p0:p1, :], ao_deriv, gdftopt=opt, transpose=False) if xctype == 'LDA': rho_a = numint.eval_rho(_sorted_mol, ao[0], dms[0], xctype=xctype, hermi=1, with_lapl=False) @@ -304,9 +302,7 @@ def get_vxc_full_response(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, excsum[atm_id] += cupy.einsum('xij,ji->x', vtmp, dms[1]) * 2 rho = vxc = None - #:vmat = cupy.einsum('pi,snpq,qj->snij', coeff, vmat, coeff) - vmat = sandwich_dot(vmat.reshape(6,nao,nao), coeff).reshape(2,3,nao0,nao0) - + vmat = opt.unsort_orbitals(vmat, axis=[2,3]) # - sign because nabla_X = -nabla_x return excsum, -vmat @@ -326,8 +322,8 @@ def get_nlc_vxc(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, relativity=0, he _sorted_mol = opt._sorted_mol coeff = cupy.asarray(opt.coeff) nao, nao0 = coeff.shape - mo_coeff_0 = coeff @ mo_coeff[0] - mo_coeff_1 = coeff @ mo_coeff[1] + mo_coeff_0 = opt.sort_orbitals(mo_coeff[0], axis=[0]) + mo_coeff_1 = opt.sort_orbitals(mo_coeff[1], axis=[0]) nset = 1 assert nset == 1 @@ -361,8 +357,7 @@ def get_nlc_vxc(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, relativity=0, he vmat_tmp = 
rks_grad._gga_grad_sum_(ao_mask, wv) add_sparse(vmat, vmat_tmp, mask) - rev_ao_idx = opt.rev_ao_idx - vmat = take_last2d(vmat, rev_ao_idx) + vmat = opt.unsort_orbitals(vmat, axis=[1,2]) exc = None # - sign because nabla_X = -nabla_x return exc, -vmat diff --git a/gpu4pyscf/gto/mole.py b/gpu4pyscf/gto/mole.py index 83e3e323..01af5ca0 100644 --- a/gpu4pyscf/gto/mole.py +++ b/gpu4pyscf/gto/mole.py @@ -86,7 +86,7 @@ def basis_seg_contraction(mol, allow_replica=False): pmol.output = mol.output pmol.verbose = mol.verbose pmol.stdout = mol.stdout - pmol.cart = True + pmol.cart = True #mol.cart pmol._bas = np.asarray(np.vstack(_bas), dtype=np.int32) pmol._env = _env return pmol diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py index 41669c93..3d2545e2 100644 --- a/gpu4pyscf/hessian/rhf.py +++ b/gpu4pyscf/hessian/rhf.py @@ -561,15 +561,17 @@ def gen_vind(mf, mo_coeff, mo_occ): nao, nmo = mo_coeff.shape mocc = mo_coeff[:,mo_occ>0] nocc = mocc.shape[1] - vresp = mf.gen_response(mo_coeff, mo_occ, hermi=1) + mocc_2 = mocc * 2 + grids = getattr(mf, 'cphf_grids', None) + vresp = mf.gen_response(mo_coeff, mo_occ, hermi=1, grids=grids) def fx(mo1): mo1 = cupy.asarray(mo1) mo1 = mo1.reshape(-1,nmo,nocc) mo1_mo = contract('npo,ip->nio', mo1, mo_coeff) - #dm1 = contract('nio,jo->nij', 2.0*mo1_mo, mocc) + #dm1 = contract('nio,jo->nij', mo1_mo, mocc_2) #dm1 = dm1 + dm1.transpose(0,2,1) - dm1 = mo1_mo.dot(2.0*mocc.T) + dm1 = mo1_mo.dot(mocc_2.T) transpose_sum(dm1) dm1 = tag_array(dm1, mo1=mo1_mo, occ_coeff=mocc, mo_occ=mo_occ) v1 = vresp(dm1) diff --git a/gpu4pyscf/hessian/rks.py b/gpu4pyscf/hessian/rks.py index 942438f9..4f03da9e 100644 --- a/gpu4pyscf/hessian/rks.py +++ b/gpu4pyscf/hessian/rks.py @@ -52,7 +52,7 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, if mf.do_nlc(): raise NotImplementedError omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin) - with_k = abs(hyb) > 1e-10 + with_k = ni.libxc.is_hybrid_xc(mf.xc) de2, ej, ek = rhf_hess._partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ, atmlst, max_memory, verbose, with_k=with_k) @@ -103,7 +103,7 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): mf = hessobj.base ni = mf._numint omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin) - with_k = abs(hyb) > 1e-10 + with_k = ni.libxc.is_hybrid_xc(mf.xc) avail_mem -= 8 * h1mo.size slice_size = int(avail_mem*0.5) // (8*3*nao*nao) @@ -146,7 +146,6 @@ def _get_vxc_diag(hessobj, mo_coeff, mo_occ, max_memory): mo_occ = cupy.asarray(mo_occ) mo_coeff = cupy.asarray(mo_coeff) - nao_sph = mo_coeff.shape[0] ni = mf._numint xctype = ni._xc_type(mf.xc) shls_slice = (0, mol.nbas) @@ -157,8 +156,7 @@ def _get_vxc_diag(hessobj, mo_coeff, mo_occ, max_memory): ni.build(mol, grids.coords) opt = ni.gdftopt _sorted_mol = opt._sorted_mol - coeff = cupy.asarray(opt.coeff) - mo_coeff = coeff @ mo_coeff + mo_coeff = opt.sort_orbitals(mo_coeff, axis=[0]) nao = mo_coeff.shape[0] vmat = cupy.zeros((6,nao,nao)) @@ -251,9 +249,8 @@ def contract_(ao, aoidx, wv, mask): 1,3,4, 2,4,5]] - vmat = contract('npq,qj->npj', vmat, coeff) - vmat = contract('pi,npj->nij', coeff, vmat) - return vmat.reshape(3,3,nao_sph,nao_sph) + vmat = opt.unsort_orbitals(vmat, axis=[1,2]) + return vmat.reshape(3,3,nao,nao) def _make_dR_rho1(ao, ao_dm0, atm_id, aoslices, xctype): p0, p1 = aoslices[atm_id][2:] @@ -344,7 +341,7 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): _sorted_mol = opt._sorted_mol coeff = cupy.asarray(opt.coeff) dm0 = 
mf.make_rdm1(mo_coeff, mo_occ) - dm0_sorted = take_last2d(dm0, opt.ao_idx) + dm0_sorted = opt.sort_orbitals(dm0, axis=[0,1]) vmat_dm = cupy.zeros((_sorted_mol.natm,3,3,nao)) ipip = cupy.zeros((3,3,nao,nao)) if xctype == 'LDA': @@ -361,7 +358,7 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): wv = weight * vxc[0] aow = [numint._scale_ao(ao[i], wv) for i in range(1, 4)] _d1d2_dot_(ipip, mol, aow, ao[1:4], mask, ao_loc, False) - dm0_mask = dm0_sorted[numpy.ix_(mask, mask)] + dm0_mask = dm0_sorted[mask[:,None], mask] ao_dm_mask = contract('nig,ij->njg', ao_mask[:4], dm0_mask) ao_dm0 = numint._dot_ao_dm(mol, ao[0], dm0, mask, shls_slice, ao_loc) @@ -379,7 +376,7 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): ao_dm0 = aow = None t1 = log.timer_debug2('integration', *t1) for ia in range(_sorted_mol.natm): - vmat_dm[ia] = vmat_dm[ia][:,:,opt.rev_ao_idx] + vmat_dm[ia][:,:,opt._ao_idx] = vmat_dm[ia] p0, p1 = aoslices[ia][2:] vmat_dm[ia] += contract('xypq,pq->xyp', ipip[:,:,:,p0:p1], dm0[:,p0:p1]) elif xctype == 'GGA': @@ -399,7 +396,7 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): _d1d2_dot_(ipip, mol, aow, ao[1:4], mask, ao_loc, False) ao_dm0 = [numint._dot_ao_dm(mol, ao[i], dm0, mask, shls_slice, ao_loc) for i in range(4)] wf = weight * fxc - dm0_mask = dm0_sorted[numpy.ix_(mask, mask)] + dm0_mask = dm0_sorted[mask[:,None], mask] ao_dm_mask = contract('nig,ij->njg', ao_mask[:4], dm0_mask) vmat_dm_tmp = cupy.empty([3,3,nao_non0]) for ia in range(_sorted_mol.natm): @@ -416,7 +413,7 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): ao_dm0 = aow = None t1 = log.timer_debug2('integration', *t1) for ia in range(_sorted_mol.natm): - vmat_dm[ia] = vmat_dm[ia][:,:,opt.rev_ao_idx] + vmat_dm[ia][:,:,opt._ao_idx] = vmat_dm[ia] p0, p1 = aoslices[ia][2:] vmat_dm[ia] += contract('xypq,pq->xyp', ipip[:,:,:,p0:p1], dm0[:,p0:p1]) vmat_dm[ia] += contract('yxqp,pq->xyp', ipip[:,:,p0:p1], dm0[:,p0:p1]) @@ -444,7 +441,7 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): _d1d2_dot_(ipip, mol, [aow[0], aow[1], aow[2]], [ao[XX], ao[XY], ao[XZ]], mask, ao_loc, False) _d1d2_dot_(ipip, mol, [aow[1], aow[3], aow[4]], [ao[YX], ao[YY], ao[YZ]], mask, ao_loc, False) _d1d2_dot_(ipip, mol, [aow[2], aow[4], aow[5]], [ao[ZX], ao[ZY], ao[ZZ]], mask, ao_loc, False) - dm0_mask = dm0_sorted[numpy.ix_(mask, mask)] + dm0_mask = dm0_sorted[mask[:,None], mask] ao_dm0 = [numint._dot_ao_dm(mol, ao[i], dm0, mask, shls_slice, ao_loc) for i in range(4)] ao_dm_mask = contract('nig,ij->njg', ao_mask[:4], dm0_mask) wf = weight * fxc @@ -483,7 +480,7 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): vmat_dm[ia][:,:,mask] += vmat_dm_tmp t1 = log.timer_debug2('integration', *t1) for ia in range(_sorted_mol.natm): - vmat_dm[ia] = vmat_dm[ia][:,:,opt.rev_ao_idx] + vmat_dm[ia][:,:,opt._ao_idx] = vmat_dm[ia] p0, p1 = aoslices[ia][2:] vmat_dm[ia] += contract('xypq,pq->xyp', ipip[:,:,:,p0:p1], dm0[:,p0:p1]) vmat_dm[ia] += contract('yxqp,pq->xyp', ipip[:,:,p0:p1], dm0[:,p0:p1]) diff --git a/gpu4pyscf/hessian/tests/test_rks_hessian.py b/gpu4pyscf/hessian/tests/test_rks_hessian.py index bdc1b2f6..bbe272d3 100644 --- a/gpu4pyscf/hessian/tests/test_rks_hessian.py +++ b/gpu4pyscf/hessian/tests/test_rks_hessian.py @@ -70,7 +70,9 @@ def _check_vxc(method, xc='LDA'): def _vs_cpu(mf, tol=1e-7): mf.conv_tol_cpscf = 1e-8 ref = mf.Hessian().kernel() - e2_gpu = mf.Hessian().to_gpu().kernel() + hessobj = mf.Hessian().to_gpu() + hessobj.base.cphf_grids = hessobj.base.grids + e2_gpu = 
hessobj.kernel() assert abs(ref - e2_gpu).max() < tol class KnownValues(unittest.TestCase): diff --git a/gpu4pyscf/hessian/tests/test_uks_hessian.py b/gpu4pyscf/hessian/tests/test_uks_hessian.py index c9853579..76beb1e8 100644 --- a/gpu4pyscf/hessian/tests/test_uks_hessian.py +++ b/gpu4pyscf/hessian/tests/test_uks_hessian.py @@ -81,7 +81,9 @@ def _check_vxc(method, xc='LDA'): def _vs_cpu(mf, tol=1e-7): mf.conv_tol_cpscf = 1e-8 ref = mf.Hessian().kernel() - e2_gpu = mf.Hessian().to_gpu().kernel() + hessobj = mf.Hessian().to_gpu() + hessobj.base.cphf_grids = hessobj.base.grids + e2_gpu = hessobj.kernel() assert abs(ref - e2_gpu).max() < tol class KnownValues(unittest.TestCase): diff --git a/gpu4pyscf/hessian/uhf.py b/gpu4pyscf/hessian/uhf.py index a338dc59..76f9ae9f 100644 --- a/gpu4pyscf/hessian/uhf.py +++ b/gpu4pyscf/hessian/uhf.py @@ -324,7 +324,8 @@ def gen_vind(mf, mo_coeff, mo_occ): moccb = mo_coeff[1][:,mo_occ[1]>0] nocca = mocca.shape[1] noccb = moccb.shape[1] - vresp = mf.gen_response(mo_coeff, mo_occ, hermi=1) + grids = getattr(mf, 'cphf_grids', None) + vresp = mf.gen_response(mo_coeff, mo_occ, hermi=1, grids=grids) def fx(mo1): mo1 = cupy.asarray(mo1) diff --git a/gpu4pyscf/hessian/uks.py b/gpu4pyscf/hessian/uks.py index b4d9fc48..00c861b3 100644 --- a/gpu4pyscf/hessian/uks.py +++ b/gpu4pyscf/hessian/uks.py @@ -53,7 +53,7 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, if mf.nlc != '': raise NotImplementedError omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin) - with_k = abs(hyb) > 1e-10 + with_k = ni.libxc.is_hybrid_xc(mf.xc) de2, ej, ek = uhf_hess._partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ, atmlst, max_memory, verbose, with_k=with_k) @@ -112,7 +112,7 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): mf = hessobj.base ni = mf._numint omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin) - with_k = abs(hyb) > 1e-10 + with_k = ni.libxc.is_hybrid_xc(mf.xc) avail_mem -= 8 * (h1moa.size + h1mob.size) slice_size = int(avail_mem*0.5) // (8*3*nao*nao) @@ -183,8 +183,7 @@ def _get_vxc_diag(hessobj, mo_coeff, mo_occ, max_memory): opt = ni.gdftopt _sorted_mol = opt._sorted_mol - coeff = cupy.asarray(opt.coeff) - mo_coeff = contract('nij,pi->npj', mo_coeff, coeff) + mo_coeff = opt.sort_orbitals(mo_coeff, axis=[1]) nao = mo_coeff.shape[1] # TODO: check mol in opt? 
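Both Hessian tests above pin cphf_grids to the SCF grids before comparing with the CPU reference, because the GPU code now solves the CPSCF equations on a coarser SG-1-style (50,194) grid by default (see the rks.py hunk earlier in this diff) while the CPU implementation uses the SCF grids. A hedged sketch of the same override in user code (geometry and basis are placeholders):

import pyscf
from gpu4pyscf.dft import rks

mol = pyscf.M(atom='O 0 0 0; H 0 0.757 0.587; H 0 -0.757 0.587',
              basis='def2-svp')
mf = rks.RKS(mol, xc='b3lyp')
mf.kernel()

# Use the (finer) SCF grids for the CPHF step instead of the SG-1 default
mf.cphf_grids = mf.grids
h_dft = mf.Hessian().kernel()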
vmata = cupy.zeros((6,nao,nao)) @@ -304,10 +303,10 @@ def contract_(ao, aoidx, wv, mask): vmatb = vmatb[[0,1,2, 1,3,4, 2,4,5]] - vmata = contract('npq,qj->npj', vmata, coeff) - vmata = contract('pi,npj->nij', coeff, vmata).reshape(3,3,nao_sph,nao_sph) - vmatb = contract('npq,qj->npj', vmatb, coeff) - vmatb = contract('pi,npj->nij', coeff, vmatb).reshape(3,3,nao_sph,nao_sph) + vmata = opt.unsort_orbitals(vmata, axis=[1,2]) + vmata = vmata.reshape(3,3,nao_sph,nao_sph) + vmatb = opt.unsort_orbitals(vmatb, axis=[1,2]) + vmatb = vmatb.reshape(3,3,nao_sph,nao_sph) return vmata, vmatb def _make_dR_rho1(ao, ao_dm0, atm_id, aoslices, xctype): @@ -400,8 +399,8 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): coeff = cupy.asarray(opt.coeff) dm0a, dm0b = mf.make_rdm1(mo_coeff, mo_occ) - dm0a_sorted = take_last2d(dm0a, opt.ao_idx) - dm0b_sorted = take_last2d(dm0b, opt.ao_idx) + dm0a_sorted = opt.sort_orbitals(dm0a, axis=[0,1]) + dm0b_sorted = opt.sort_orbitals(dm0b, axis=[0,1]) vmata_dm = cupy.zeros((mol.natm,3,3,nao)) vmatb_dm = cupy.zeros((mol.natm,3,3,nao)) ipipa = cupy.zeros((3,3,nao,nao)) @@ -423,8 +422,8 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): _d1d2_dot_(ipipa, mol, aowa, ao[1:4], mask, ao_loc, False) aowb = [numint._scale_ao(ao[i], wv[1]) for i in range(1, 4)] _d1d2_dot_(ipipb, mol, aowb, ao[1:4], mask, ao_loc, False) - dm0a_mask = dm0a_sorted[numpy.ix_(mask, mask)] - dm0b_mask = dm0b_sorted[numpy.ix_(mask, mask)] + dm0a_mask = dm0a_sorted[mask[:,None], mask] + dm0b_mask = dm0b_sorted[mask[:,None], mask] ao_dma_mask = contract('nig,ij->njg', ao_mask[:4], dm0a_mask) ao_dmb_mask = contract('nig,ij->njg', ao_mask[:4], dm0b_mask) @@ -451,8 +450,8 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): t1 = log.timer_debug2('integration', *t1) for ia in range(_sorted_mol.natm): p0, p1 = aoslices[ia][2:] - vmata_dm[ia] = vmata_dm[ia][:,:,opt.rev_ao_idx] - vmatb_dm[ia] = vmatb_dm[ia][:,:,opt.rev_ao_idx] + vmata_dm[ia][:,:,opt._ao_idx] = vmata_dm[ia] + vmatb_dm[ia][:,:,opt._ao_idx] = vmatb_dm[ia] vmata_dm[ia] += contract('xypq,pq->xyp', ipipa[:,:,:,p0:p1], dm0a[:,p0:p1]) vmatb_dm[ia] += contract('xypq,pq->xyp', ipipb[:,:,:,p0:p1], dm0b[:,p0:p1]) elif xctype == 'GGA': @@ -476,8 +475,8 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): ao_dm0a = [numint._dot_ao_dm(mol, ao[i], dm0a, mask, shls_slice, ao_loc) for i in range(4)] ao_dm0b = [numint._dot_ao_dm(mol, ao[i], dm0b, mask, shls_slice, ao_loc) for i in range(4)] wf = weight * fxc - dm0a_mask = dm0a_sorted[numpy.ix_(mask, mask)] - dm0b_mask = dm0b_sorted[numpy.ix_(mask, mask)] + dm0a_mask = dm0a_sorted[mask[:,None], mask] + dm0b_mask = dm0b_sorted[mask[:,None], mask] ao_dma_mask = contract('nig,ij->njg', ao_mask[:4], dm0a_mask) ao_dmb_mask = contract('nig,ij->njg', ao_mask[:4], dm0b_mask) vmata_dm_tmp = cupy.empty([3,3,nao_non0]) @@ -507,8 +506,8 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): ao_dm0a = ao_dm0b = aow = None t1 = log.timer_debug2('integration', *t1) for ia in range(_sorted_mol.natm): - vmata_dm[ia] = vmata_dm[ia][:,:,opt.rev_ao_idx] - vmatb_dm[ia] = vmatb_dm[ia][:,:,opt.rev_ao_idx] + vmata_dm[ia][:,:,opt._ao_idx] = vmata_dm[ia] + vmatb_dm[ia][:,:,opt._ao_idx] = vmatb_dm[ia] p0, p1 = aoslices[ia][2:] vmata_dm[ia] += contract('xypq,pq->xyp', ipipa[:,:,:,p0:p1], dm0a[:,p0:p1]) vmata_dm[ia] += contract('yxqp,pq->xyp', ipipa[:,:,p0:p1], dm0a[:,p0:p1]) @@ -546,8 +545,8 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): _d1d2_dot_(ipipb, mol, [aow[1], aow[3], 
aow[4]], [ao[YX], ao[YY], ao[YZ]], mask, ao_loc, False) _d1d2_dot_(ipipb, mol, [aow[2], aow[4], aow[5]], [ao[ZX], ao[ZY], ao[ZZ]], mask, ao_loc, False) - dm0a_mask = dm0a_sorted[numpy.ix_(mask, mask)] - dm0b_mask = dm0b_sorted[numpy.ix_(mask, mask)] + dm0a_mask = dm0a_sorted[mask[:,None], mask] + dm0b_mask = dm0b_sorted[mask[:,None], mask] ao_dm0a = [numint._dot_ao_dm(mol, ao[i], dm0a, mask, shls_slice, ao_loc) for i in range(4)] ao_dm0b = [numint._dot_ao_dm(mol, ao[i], dm0b, mask, shls_slice, ao_loc) for i in range(4)] ao_dma_mask = contract('nig,ij->njg', ao_mask[:4], dm0a_mask) @@ -622,8 +621,8 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): vmatb_dm[ia][:,:,mask] += vmatb_dm_tmp t1 = log.timer_debug2('integration', *t1) for ia in range(_sorted_mol.natm): - vmata_dm[ia] = vmata_dm[ia][:,:,opt.rev_ao_idx] - vmatb_dm[ia] = vmatb_dm[ia][:,:,opt.rev_ao_idx] + vmata_dm[ia][:,:,opt._ao_idx] = vmata_dm[ia] + vmatb_dm[ia][:,:,opt._ao_idx] = vmatb_dm[ia] p0, p1 = aoslices[ia][2:] vmata_dm[ia] += contract('xypq,pq->xyp', ipipa[:,:,:,p0:p1], dm0a[:,p0:p1]) vmata_dm[ia] += contract('yxqp,pq->xyp', ipipa[:,:,p0:p1], dm0a[:,p0:p1]) diff --git a/gpu4pyscf/lib/CMakeLists.txt b/gpu4pyscf/lib/CMakeLists.txt index e5115f5b..4390407e 100644 --- a/gpu4pyscf/lib/CMakeLists.txt +++ b/gpu4pyscf/lib/CMakeLists.txt @@ -148,6 +148,7 @@ if(BUILD_SOLVENT) endif() add_subdirectory(gvhf-rys) +add_subdirectory(gvhf-md) option(BUILD_LIBXC "Using libxc for DFT" ON) if(BUILD_LIBXC) diff --git a/gpu4pyscf/lib/cupy_helper.py b/gpu4pyscf/lib/cupy_helper.py index a3b3b341..2edfd17e 100644 --- a/gpu4pyscf/lib/cupy_helper.py +++ b/gpu4pyscf/lib/cupy_helper.py @@ -226,9 +226,12 @@ def dist_matrix(x, y, out=None): raise RuntimeError('failed in calculating distance matrix') return out -def block_c2s_diag(ncart, nsph, angular, counts): +def block_c2s_diag(angular, counts): ''' - constract a cartesian to spherical transformation of n shells + Diagonal blocked cartesian to spherical transformation + Args: + angular (list): angular momentum type, e.g. 
[0,1,2,3] + counts (list): count of each angular momentum ''' if _data['c2s'] is None: c2s_data = cupy.concatenate([cupy.asarray(x.ravel()) for x in c2s_l]) @@ -246,7 +249,8 @@ def block_c2s_diag(ncart, nsph, angular, counts): offsets += [c2s_offset[l]] * count rows = cupy.hstack(rows) cols = cupy.hstack(cols) - + + ncart, nsph = int(rows[-1]), int(cols[-1]) cart2sph = cupy.zeros([ncart, nsph]) offsets = cupy.asarray(offsets, dtype='int32') @@ -358,11 +362,12 @@ def transpose_sum(a, stream=None): return a + a.transpose(0,2,1) ''' assert a.flags.c_contiguous - n = a.shape[-1] + out = a if a.ndim == 2: - a = a.reshape([-1,n,n]) + a = a[None] assert a.ndim == 3 - count = a.shape[0] + count, m, n = a.shape + assert m == n stream = cupy.cuda.get_current_stream() err = libcupy_helper.transpose_sum( ctypes.cast(stream.ptr, ctypes.c_void_p), @@ -372,7 +377,7 @@ def transpose_sum(a, stream=None): ) if err != 0: raise RuntimeError('failed in transpose_sum kernel') - return a + return out # for i > j of 2d mat, mat[j,i] = mat[i,j] def hermi_triu(mat, hermi=1, inplace=True): @@ -911,10 +916,11 @@ def sandwich_dot(a, c, out=None): a = a[None] counts = a.shape[0] m = c.shape[1] - out = cupy.empty((counts, m, m)) + dtype = np.result_type(a, c) + out = cupy.empty((counts, m, m), dtype=dtype) tmp = None for i in range(counts): - tmp = cupy.dot(c.T, a[i], out=tmp) + tmp = cupy.dot(c.conj().T, a[i], out=tmp) cupy.dot(tmp, c, out=out[i]) if a_ndim == 2: out = out[0] diff --git a/gpu4pyscf/lib/cusolver.py b/gpu4pyscf/lib/cusolver.py index 27fcb0b0..454567bd 100644 --- a/gpu4pyscf/lib/cusolver.py +++ b/gpu4pyscf/lib/cusolver.py @@ -66,22 +66,65 @@ ctypes.c_void_p # *devInfo ] +# https://docs.nvidia.com/cuda/cusolver/index.html#cusolverdn-t-sygvd +libcusolver.cusolverDnZhegvd_bufferSize.argtypes = [ + ctypes.c_void_p, # handle + ctypes.c_int, # itype + ctypes.c_int, # jobz + ctypes.c_int, # uplo + ctypes.c_int, # n + ctypes.c_void_p, # *A + ctypes.c_int, # lda + ctypes.c_void_p, # *B + ctypes.c_int, # ldb + ctypes.c_void_p, # *w + ctypes.c_void_p # *lwork +] + +libcusolver.cusolverDnZhegvd.argtypes = [ + ctypes.c_void_p, # handle + ctypes.c_int, # itype + ctypes.c_int, # jobz + ctypes.c_int, # uplo + ctypes.c_int, # n + ctypes.c_void_p, # *A + ctypes.c_int, # lda + ctypes.c_void_p, # *B + ctypes.c_int, # ldb + ctypes.c_void_p, # *w + ctypes.c_void_p, # *work + ctypes.c_int, # lwork + ctypes.c_void_p # *devInfo +] + def eigh(h, s): ''' solve generalized eigenvalue problem ''' + assert h.dtype == s.dtype + assert h.dtype in (np.float64, np.complex128) n = h.shape[0] w = cupy.zeros(n) - A = h.copy() - B = s.copy() + if h.dtype == np.complex128 and h.flags.c_contiguous: + # zhegvd requires the matrices in F-order. 
For hermitian matrices, + # .T.copy() is equivalent to .conj() + A = h.conj() + B = s.conj() + else: + A = h.copy() + B = s.copy() _handle = device.get_cusolver_handle() # TODO: reuse workspace - if n in _buffersize: - lwork = _buffersize[n] + if (h.dtype, n) in _buffersize: + lwork = _buffersize[h.dtype, n] else: - lwork = ctypes.c_int() - status = libcusolver.cusolverDnDsygvd_bufferSize( + lwork = ctypes.c_int(0) + if h.dtype == np.float64: + fn = libcusolver.cusolverDnDsygvd_bufferSize + else: + fn = libcusolver.cusolverDnZhegvd_bufferSize + status = fn( _handle, CUSOLVER_EIG_TYPE_1, CUSOLVER_EIG_MODE_VECTOR, @@ -98,10 +141,14 @@ def eigh(h, s): if status != 0: raise RuntimeError("failed in buffer size") - - work = cupy.empty(lwork) + + if h.dtype == np.float64: + fn = libcusolver.cusolverDnDsygvd + else: + fn = libcusolver.cusolverDnZhegvd + work = cupy.empty(lwork, dtype=h.dtype) devInfo = cupy.empty(1, dtype=np.int32) - status = libcusolver.cusolverDnDsygvd( + status = fn( _handle, CUSOLVER_EIG_TYPE_1, CUSOLVER_EIG_MODE_VECTOR, @@ -116,7 +163,7 @@ def eigh(h, s): lwork, devInfo.data.ptr ) - + if status != 0: raise RuntimeError("failed in eigh kernel") return w, A.T @@ -126,10 +173,14 @@ def cholesky(A): assert A.flags['C_CONTIGUOUS'] x = A.copy() handle = device.get_cusolver_handle() - potrf = cusolver.dpotrf - potrf_bufferSize = cusolver.dpotrf_bufferSize + if A.dtype == np.float64: + potrf = cusolver.dpotrf + potrf_bufferSize = cusolver.dpotrf_bufferSize + else: + potrf = cusolver.zpotrf + potrf_bufferSize = cusolver.zpotrf_bufferSize buffersize = potrf_bufferSize(handle, cublas.CUBLAS_FILL_MODE_UPPER, n, x.data.ptr, n) - workspace = cupy.empty(buffersize) + workspace = cupy.empty(buffersize, dtype=A.dtype) dev_info = cupy.empty(1, dtype=np.int32) potrf(handle, cublas.CUBLAS_FILL_MODE_UPPER, n, x.data.ptr, n, workspace.data.ptr, buffersize, dev_info.data.ptr) @@ -137,4 +188,4 @@ def cholesky(A): if dev_info[0] != 0: raise RuntimeError('failed to perform Cholesky Decomposition') cupy.linalg._util._tril(x,k=0) - return x \ No newline at end of file + return x diff --git a/gpu4pyscf/lib/cutensor.py b/gpu4pyscf/lib/cutensor.py index 07d35547..573e1777 100644 --- a/gpu4pyscf/lib/cutensor.py +++ b/gpu4pyscf/lib/cutensor.py @@ -42,20 +42,20 @@ def _auto_create_mode(array, mode): 'ndim mismatch: {} != {}'.format(array.ndim, mode.ndim)) return mode -def _create_tensor_descriptor(a): - handle = cutensor._get_handle() - key = (handle.ptr, a.dtype, tuple(a.shape), tuple(a.strides)) - # hard coded - alignment_req = 8 - if key not in _tensor_descriptors: - num_modes = a.ndim - extent = np.array(a.shape, dtype=np.int64) - stride = np.array(a.strides, dtype=np.int64) // a.itemsize - cutensor_dtype = cutensor._get_cutensor_dtype(a.dtype) - _tensor_descriptors[key] = cutensor.TensorDescriptor( - handle.ptr, num_modes, extent.ctypes.data, stride.ctypes.data, - cutensor_dtype, alignment_req=alignment_req) - return _tensor_descriptors[key] +#def _create_tensor_descriptor(a): +# handle = cutensor._get_handle() +# key = (handle.ptr, a.dtype, tuple(a.shape), tuple(a.strides)) +# # hard coded +# alignment_req = 8 +# if key not in _tensor_descriptors: +# num_modes = a.ndim +# extent = np.array(a.shape, dtype=np.int64) +# stride = np.array(a.strides, dtype=np.int64) // a.itemsize +# cutensor_dtype = cutensor._get_cutensor_dtype(a.dtype) +# _tensor_descriptors[key] = cutensor.TensorDescriptor( +# handle.ptr, num_modes, extent.ctypes.data, stride.ctypes.data, +# cutensor_dtype, alignment_req=alignment_req) +# 
return _tensor_descriptors[key] def contraction( pattern, a, b, alpha, beta, @@ -80,14 +80,14 @@ def contraction( mode_b = list(str_b) mode_c = list(str_c) - if(out is not None): - c = out - else: - c = cupy.empty([shape[k] for k in str_c], order='C') + if out is None: + dtype = np.result_type(a, b, alpha) + out = cupy.empty([shape[k] for k in str_c], order='C', dtype=dtype) + c = out - desc_a = _create_tensor_descriptor(a) - desc_b = _create_tensor_descriptor(b) - desc_c = _create_tensor_descriptor(c) + desc_a = cutensor.create_tensor_descriptor(a) + desc_b = cutensor.create_tensor_descriptor(b) + desc_c = cutensor.create_tensor_descriptor(c) mode_a = _auto_create_mode(a, mode_a) mode_b = _auto_create_mode(b, mode_b) diff --git a/gpu4pyscf/lib/gdft/contract_rho.cu b/gpu4pyscf/lib/gdft/contract_rho.cu index 5c6dbd1c..1f6a6939 100644 --- a/gpu4pyscf/lib/gdft/contract_rho.cu +++ b/gpu4pyscf/lib/gdft/contract_rho.cu @@ -56,6 +56,7 @@ void GDFTcontract_rho_kernel(double *rho, double *bra, double *ket, int ngrids, } } +// half of the GGA rho __global__ void GDFTcontract_rho4_kernel(double *rho, double *bra, double *ket, int ngrids, int nao, int count) { @@ -109,7 +110,7 @@ void GDFTcontract_rho_gga_kernel(double *rho, double *bra, double *ket, int ngri double v[4] = {0.0, 0.0, 0.0, 0.0}; if (active){ for (int ao_id = threadIdx.y; ao_id < nao; ao_id += BLKSIZEY) { - int ket_idx = grid_id + ao_id * Ngrids; + size_t ket_idx = grid_id + ao_id * Ngrids; double bra_tmp = bra[ket_idx]; double ket_tmp = ket[ket_idx]; @@ -143,7 +144,7 @@ void GDFTcontract_rho_gga_kernel(double *rho, double *bra, double *ket, int ngri if (blockDim.y >= 2 && iy < 1) buf[ixy] += buf[ixy + BLKSIZEX * 1]; __syncthreads(); if (iy == 0 && active) { - rho[grid_id + ngrids * i] = 2.0 * buf[ix]; + rho[grid_id + ngrids * i] = buf[ix]; } } } @@ -161,7 +162,7 @@ void GDFTcontract_rho_mgga_kernel(double *rho, double *bra, double *ket, int ngr double v[5] = {0.0, 0.0, 0.0, 0.0, 0.0}; if (active){ for (int ao_id = threadIdx.y; ao_id < nao; ao_id += BLKSIZEY) { - int ket_idx = grid_id + ao_id * Ngrids; + size_t ket_idx = grid_id + ao_id * Ngrids; double bra_tmp0 = bra[ket_idx]; double ket_tmp0 = ket[ket_idx]; @@ -207,7 +208,7 @@ void GDFTcontract_rho_mgga_kernel(double *rho, double *bra, double *ket, int ngr if (blockDim.y >= 2 && iy < 1) buf[ixy] += buf[ixy + BLKSIZEX * 1]; __syncthreads(); if (iy == 0 && active) { - rho[grid_id + ngrids * i] = 2.0 * buf[ix]; + rho[grid_id + ngrids * i] = buf[ix]; } } } @@ -358,4 +359,4 @@ int GDFTscale_ao(cudaStream_t stream, double *out, double *ket, double *wv, return 0; } -} \ No newline at end of file +} diff --git a/gpu4pyscf/lib/gdft/libxc.cu b/gpu4pyscf/lib/gdft/libxc.cu index 639eecc6..3eeb1b76 100644 --- a/gpu4pyscf/lib/gdft/libxc.cu +++ b/gpu4pyscf/lib/gdft/libxc.cu @@ -73,37 +73,121 @@ void _memset_lda(xc_lda_out_params *out, int order, int np, const xc_dimensions if(order >= 0) cudaMemset(out->zk, 0, sizeof(double)*np*dim->zk); if(order >= 1) cudaMemset(out->vrho, 0, sizeof(double)*np*dim->vrho); if(order >= 2) cudaMemset(out->v2rho2, 0, sizeof(double)*np*dim->v2rho2); + if(order >= 3) cudaMemset(out->v3rho3, 0, sizeof(double)*np*dim->v3rho3); + if(order >= 4) cudaMemset(out->v4rho4, 0, sizeof(double)*np*dim->v4rho4); } __host__ void _memset_gga(xc_gga_out_params *out, int order, int np, const xc_dimensions *dim){ if(order >= 0) cudaMemset(out->zk, 0, sizeof(double)*np*dim->zk); - if(order >= 1) cudaMemset(out->vrho, 0, sizeof(double)*np*dim->vrho); - if(order >= 1) cudaMemset(out->vsigma, 
0, sizeof(double)*np*dim->vsigma); // (sigma, lapl, tau) - if(order >= 2) cudaMemset(out->v2rho2, 0, sizeof(double)*np*dim->v2rho2); - if(order >= 2) cudaMemset(out->v2rhosigma, 0, sizeof(double)*np*dim->v2rhosigma); - if(order >= 2) cudaMemset(out->v2sigma2, 0, sizeof(double)*np*dim->v2sigma2); + if(order >= 1) { + cudaMemset(out->vrho, 0, sizeof(double)*np*dim->vrho); + cudaMemset(out->vsigma, 0, sizeof(double)*np*dim->vsigma); // (sigma, lapl, tau) + } + if(order >= 2) { + cudaMemset(out->v2rho2, 0, sizeof(double)*np*dim->v2rho2); + cudaMemset(out->v2rhosigma, 0, sizeof(double)*np*dim->v2rhosigma); + cudaMemset(out->v2sigma2, 0, sizeof(double)*np*dim->v2sigma2); + } + if(order >= 3) { + cudaMemset(out->v3rho3, 0, sizeof(double)*np*dim->v3rho3); + cudaMemset(out->v3rho2sigma, 0, sizeof(double)*np*dim->v3rho2sigma); + cudaMemset(out->v3rhosigma2, 0, sizeof(double)*np*dim->v3rhosigma2); + cudaMemset(out->v3sigma3, 0, sizeof(double)*np*dim->v3sigma3); + } + if(order >= 4) { + cudaMemset(out->v4rho4, 0, sizeof(double)*np*dim->v4rho4); + cudaMemset(out->v4rho3sigma, 0, sizeof(double)*np*dim->v4rho3sigma); + cudaMemset(out->v4rho2sigma2, 0, sizeof(double)*np*dim->v4rho2sigma2); + cudaMemset(out->v4rhosigma3, 0, sizeof(double)*np*dim->v4rhosigma3); + cudaMemset(out->v4sigma4, 0, sizeof(double)*np*dim->v4sigma4); + } } __host__ void _memset_mgga(xc_mgga_out_params *out, int order, int np, const xc_dimensions *dim){ if(order >= 0) cudaMemset(out->zk, 0, sizeof(double)*np*dim->zk); - if(order >= 1) cudaMemset(out->vrho, 0, sizeof(double)*np*dim->vrho); - if(order >= 1) cudaMemset(out->vsigma, 0, sizeof(double)*np*dim->vsigma); - if(order >= 1 && out->vlapl != NULL) cudaMemset(out->vlapl, 0, sizeof(double)*np*dim->vlapl); // (sigma, lapl, tau) - if(order >= 1) cudaMemset(out->vtau, 0, sizeof(double)*np*dim->vtau); + if(order >= 1) { + cudaMemset(out->vrho, 0, sizeof(double)*np*dim->vrho); + cudaMemset(out->vsigma, 0, sizeof(double)*np*dim->vsigma); + cudaMemset(out->vtau, 0, sizeof(double)*np*dim->vtau); + if(out->vlapl != NULL) cudaMemset(out->vlapl, 0, sizeof(double)*np*dim->vlapl); // (sigma, lapl, tau) + } - if(order >= 2) cudaMemset(out->v2rho2, 0, sizeof(double)*np*dim->v2rho2); - if(order >= 2) cudaMemset(out->v2rhosigma, 0, sizeof(double)*np*dim->v2rhosigma); - if(order >= 2 && out->v2rholapl != NULL) cudaMemset(out->v2rholapl, 0, sizeof(double)*np*dim->v2rholapl); - if(order >= 2) cudaMemset(out->v2rhotau, 0, sizeof(double)*np*dim->v2rhotau); - if(order >= 2) cudaMemset(out->v2sigma2, 0, sizeof(double)*np*dim->v2sigma2); - if(order >= 2 && out->v2sigmalapl != NULL) cudaMemset(out->v2sigmalapl, 0, sizeof(double)*np*dim->v2sigmalapl); - if(order >= 2) cudaMemset(out->v2sigmatau, 0, sizeof(double)*np*dim->v2sigmatau); - if(order >= 2 && out->v2lapl2 != NULL) cudaMemset(out->v2lapl2, 0, sizeof(double)*np*dim->v2lapl2); - if(order >= 2 && out->v2lapltau != NULL) cudaMemset(out->v2lapltau, 0, sizeof(double)*np*dim->v2lapltau); - if(order >= 2) cudaMemset(out->v2tau2, 0, sizeof(double)*np*dim->v2tau2); + if(order >= 2) { + cudaMemset(out->v2rho2, 0, sizeof(double)*np*dim->v2rho2); + cudaMemset(out->v2rhosigma, 0, sizeof(double)*np*dim->v2rhosigma); + cudaMemset(out->v2rhotau, 0, sizeof(double)*np*dim->v2rhotau); + cudaMemset(out->v2sigma2, 0, sizeof(double)*np*dim->v2sigma2); + cudaMemset(out->v2sigmatau, 0, sizeof(double)*np*dim->v2sigmatau); + cudaMemset(out->v2tau2, 0, sizeof(double)*np*dim->v2tau2); + if(out->v2rholapl != NULL) cudaMemset(out->v2rholapl, 0, sizeof(double)*np*dim->v2rholapl); 
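+ // Note: only the lapl-coupled derivative buffers carry NULL guards in
+ // this function; they appear to be allocated only when Laplacian
+ // derivatives are actually requested, while the rho/sigma/tau buffers
+ // are assumed always present.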
+ if(out->v2sigmalapl != NULL) cudaMemset(out->v2sigmalapl, 0, sizeof(double)*np*dim->v2sigmalapl); + if(out->v2lapl2 != NULL) cudaMemset(out->v2lapl2, 0, sizeof(double)*np*dim->v2lapl2); + if(out->v2lapltau != NULL) cudaMemset(out->v2lapltau, 0, sizeof(double)*np*dim->v2lapltau); + } + + if (order >= 3) { + cudaMemset(out->v3rho3 , 0, sizeof(double)*np*dim->v3rho3); + cudaMemset(out->v3rho2sigma , 0, sizeof(double)*np*dim->v3rho2sigma); + cudaMemset(out->v3rho2tau , 0, sizeof(double)*np*dim->v3rho2tau); + cudaMemset(out->v3rhosigma2 , 0, sizeof(double)*np*dim->v3rhosigma2); + cudaMemset(out->v3rhosigmatau , 0, sizeof(double)*np*dim->v3rhosigmatau); + cudaMemset(out->v3rhotau2 , 0, sizeof(double)*np*dim->v3rhotau2); + cudaMemset(out->v3sigma3 , 0, sizeof(double)*np*dim->v3sigma3); + cudaMemset(out->v3sigma2tau , 0, sizeof(double)*np*dim->v3sigma2tau); + cudaMemset(out->v3sigmatau2 , 0, sizeof(double)*np*dim->v3sigmatau2); + cudaMemset(out->v3tau3 , 0, sizeof(double)*np*dim->v3tau3); + if (out->v3rho2lapl != NULL) cudaMemset(out->v3rho2lapl , 0, sizeof(double)*np*dim->v3rho2lapl); + if (out->v3rhosigmalapl!= NULL) cudaMemset(out->v3rhosigmalapl, 0, sizeof(double)*np*dim->v3rhosigmalapl); + if (out->v3rholapl2 != NULL) cudaMemset(out->v3rholapl2 , 0, sizeof(double)*np*dim->v3rholapl2); + if (out->v3rholapltau != NULL) cudaMemset(out->v3rholapltau , 0, sizeof(double)*np*dim->v3rholapltau); + if (out->v3sigma2lapl != NULL) cudaMemset(out->v3sigma2lapl , 0, sizeof(double)*np*dim->v3sigma2lapl); + if (out->v3sigmalapl2 != NULL) cudaMemset(out->v3sigmalapl2 , 0, sizeof(double)*np*dim->v3sigmalapl2); + if (out->v3sigmalapltau!= NULL) cudaMemset(out->v3sigmalapltau, 0, sizeof(double)*np*dim->v3sigmalapltau); + if (out->v3lapl3 != NULL) cudaMemset(out->v3lapl3 , 0, sizeof(double)*np*dim->v3lapl3); + if (out->v3lapl2tau != NULL) cudaMemset(out->v3lapl2tau , 0, sizeof(double)*np*dim->v3lapl2tau); + if (out->v3lapltau2 != NULL) cudaMemset(out->v3lapltau2 , 0, sizeof(double)*np*dim->v3lapltau2); + } + + if (order >= 4) { + cudaMemset(out->v4rho4 , 0, sizeof(double)*np*dim->v4rho4); + cudaMemset(out->v4rho3sigma , 0, sizeof(double)*np*dim->v4rho3sigma); + cudaMemset(out->v4rho3tau , 0, sizeof(double)*np*dim->v4rho3tau); + cudaMemset(out->v4rho2sigma2 , 0, sizeof(double)*np*dim->v4rho2sigma2); + cudaMemset(out->v4rho2sigmatau , 0, sizeof(double)*np*dim->v4rho2sigmatau); + cudaMemset(out->v4rho2tau2 , 0, sizeof(double)*np*dim->v4rho2tau2); + cudaMemset(out->v4rhosigma3 , 0, sizeof(double)*np*dim->v4rhosigma3); + cudaMemset(out->v4rhosigma2tau , 0, sizeof(double)*np*dim->v4rhosigma2tau); + cudaMemset(out->v4rhosigmatau2 , 0, sizeof(double)*np*dim->v4rhosigmatau2); + cudaMemset(out->v4rhotau3 , 0, sizeof(double)*np*dim->v4rhotau3); + cudaMemset(out->v4sigma4 , 0, sizeof(double)*np*dim->v4sigma4); + cudaMemset(out->v4sigma3tau , 0, sizeof(double)*np*dim->v4sigma3tau); + cudaMemset(out->v4sigma2tau2 , 0, sizeof(double)*np*dim->v4sigma2tau2); + cudaMemset(out->v4sigmatau3 , 0, sizeof(double)*np*dim->v4sigmatau3); + cudaMemset(out->v4tau4 , 0, sizeof(double)*np*dim->v4tau4); + if (out->v4rho3lapl != NULL) cudaMemset(out->v4rho3lapl , 0, sizeof(double)*np*dim->v4rho3lapl); + if (out->v4rho2sigmalapl != NULL) cudaMemset(out->v4rho2sigmalapl , 0, sizeof(double)*np*dim->v4rho2sigmalapl); + if (out->v4rho2lapl2 != NULL) cudaMemset(out->v4rho2lapl2 , 0, sizeof(double)*np*dim->v4rho2lapl2); + if (out->v4rho2lapltau != NULL) cudaMemset(out->v4rho2lapltau , 0, sizeof(double)*np*dim->v4rho2lapltau); + if 
(out->v4rhosigma2lapl != NULL) cudaMemset(out->v4rhosigma2lapl , 0, sizeof(double)*np*dim->v4rhosigma2lapl); + if (out->v4rhosigmalapl2 != NULL) cudaMemset(out->v4rhosigmalapl2 , 0, sizeof(double)*np*dim->v4rhosigmalapl2); + if (out->v4rhosigmalapltau!= NULL) cudaMemset(out->v4rhosigmalapltau, 0, sizeof(double)*np*dim->v4rhosigmalapltau); + if (out->v4rholapl3 != NULL) cudaMemset(out->v4rholapl3 , 0, sizeof(double)*np*dim->v4rholapl3); + if (out->v4rholapl2tau != NULL) cudaMemset(out->v4rholapl2tau , 0, sizeof(double)*np*dim->v4rholapl2tau); + if (out->v4rholapltau2 != NULL) cudaMemset(out->v4rholapltau2 , 0, sizeof(double)*np*dim->v4rholapltau2); + if (out->v4sigma3lapl != NULL) cudaMemset(out->v4sigma3lapl , 0, sizeof(double)*np*dim->v4sigma3lapl); + if (out->v4sigma2lapl2 != NULL) cudaMemset(out->v4sigma2lapl2 , 0, sizeof(double)*np*dim->v4sigma2lapl2); + if (out->v4sigma2lapltau != NULL) cudaMemset(out->v4sigma2lapltau , 0, sizeof(double)*np*dim->v4sigma2lapltau); + if (out->v4sigmalapl3 != NULL) cudaMemset(out->v4sigmalapl3 , 0, sizeof(double)*np*dim->v4sigmalapl3); + if (out->v4sigmalapl2tau != NULL) cudaMemset(out->v4sigmalapl2tau , 0, sizeof(double)*np*dim->v4sigmalapl2tau); + if (out->v4sigmalapltau2 != NULL) cudaMemset(out->v4sigmalapltau2 , 0, sizeof(double)*np*dim->v4sigmalapltau2); + if (out->v4lapl4 != NULL) cudaMemset(out->v4lapl4 , 0, sizeof(double)*np*dim->v4lapl4); + if (out->v4lapl3tau != NULL) cudaMemset(out->v4lapl3tau , 0, sizeof(double)*np*dim->v4lapl3tau); + if (out->v4lapl2tau2 != NULL) cudaMemset(out->v4lapl2tau2 , 0, sizeof(double)*np*dim->v4lapl2tau2); + if (out->v4lapltau3 != NULL) cudaMemset(out->v4lapltau3 , 0, sizeof(double)*np*dim->v4lapltau3); + } } __host__ diff --git a/gpu4pyscf/lib/gvhf-md/CMakeLists.txt b/gpu4pyscf/lib/gvhf-md/CMakeLists.txt new file mode 100644 index 00000000..c241d1c2 --- /dev/null +++ b/gpu4pyscf/lib/gvhf-md/CMakeLists.txt @@ -0,0 +1,17 @@ +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --ptxas-options=-v")# -maxrregcount=128") + +add_library(gvhf_md SHARED + md_contract_j.cu md_j_driver.cu md_pairdata.c unrolled_md_j.cu +) + +#option(BUILD_SHARED_LIBS "build shared libraries" 1) +#option(ENABLE_STATIC "Enforce static library build" 0) +#if(ENABLE_STATIC) +# set(BUILD_SHARED_LIBS 0) +#endif() + +set_target_properties(gvhf_md PROPERTIES + LIBRARY_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR} + CUDA_SEPARABLE_COMPILATION ON) + +target_link_libraries(gvhf_md OpenMP::OpenMP_C) diff --git a/gpu4pyscf/lib/gvhf-md/md_contract_j.cu b/gpu4pyscf/lib/gvhf-md/md_contract_j.cu new file mode 100644 index 00000000..2d1b3a12 --- /dev/null +++ b/gpu4pyscf/lib/gvhf-md/md_contract_j.cu @@ -0,0 +1,467 @@ +#include +#include +#include +#include + +#include "gvhf-rys/vhf.cuh" +#include "gvhf-rys/gamma_inc.cu" + +#define TILEX 2 +#define TILEY 4 + +extern __constant__ uint16_t c_Rt_idx[]; +extern __constant__ uint16_t c_Rt_offsets[]; + +#define ADDR(l, t, u, v) \ + ((l+1)*(l+2)*(l+3)/6 - ((l)-(t)+1)*((l)-(t)+2)*((l)-(t)+3)/6 + \ + ((l)-(t)+1)*((l)-(t)+2)/2 - ((l)-(t)-(u)+1)*((l)-(t)-(u)+2)/2 + (v)) + +__device__ +static void iter_Rt_n(double *out, double *Rt, double rx, double ry, double rz, + int l, int sq_id, int nsq_per_block) +{ + uint16_t *p1 = c_Rt_idx + c_Rt_offsets[l]; + double *pout = out + nsq_per_block; + int k = 0; + for (int v = 0, i = 0; v < l; ++v) { + pout[sq_id+k*nsq_per_block] = rz * Rt[sq_id+i*nsq_per_block] + v * Rt[sq_id+p1[k]*nsq_per_block]; + ++k; ++i; + } + for (int u = 0, i = 0; u < l; ++u) { + for (int v = 0; v < l-u; ++v) { + 
pout[sq_id+k*nsq_per_block] = ry * Rt[sq_id+i*nsq_per_block] + u * Rt[sq_id+p1[k]*nsq_per_block]; + ++k; ++i; + } + } + //int nf3 = l*(l+1)*(l+2)/6; + //Fold3Index *fold3idx = c_i_in_fold3idx + (l-1)*nf3/4;; + //for (int i = 0; i < nf3; ++i) { + // Fold3Index f3i = fold3idx[i]; + // int t = f3i.x; + // pout[sq_id+(k+i)*nsq_per_block] = rx * Rt[sq_id+i*nsq_per_block] + // + t * Rt[sq_id+p1[k+i]*nsq_per_block]; + //} + for (int t = 0, i = 0; t < l; ++t) { + // corresponding to the nested loops + // for (u = 0; u < l-t; ++u) for (v = 0; v < l-t-u; ++v) + for (int uv = 0; uv < (l-t) * (l-t+1) / 2; ++uv) { + pout[sq_id+(k+i)*nsq_per_block] = rx * Rt[sq_id+i*nsq_per_block] + + t * Rt[sq_id+p1[k+i]*nsq_per_block]; + ++i; + } + } +} + +#if CUDA_VERSION >= 12040 +__global__ __maxnreg__(128) +#else +__global__ +#endif +void md_j_kernel(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds) +{ + int *pair_ij_mapping = bounds.tile_ij_mapping; + int *pair_kl_mapping = bounds.tile_kl_mapping; + int threadsx = blockDim.x; + int threadsy = blockDim.y; + int bsizex = threadsx * TILEX; + int bsizey = threadsy * TILEY; + int task_ij0 = blockIdx.x * bsizex; + int task_kl0 = blockIdx.y * bsizey; + int pair_ij0 = pair_ij_mapping[task_ij0]; + int pair_kl0 = pair_kl_mapping[task_kl0]; + float *q_cond = bounds.q_cond; + if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) { + return; + } + + int tx = threadIdx.x; + int ty = threadIdx.y; + int sq_id = tx + threadsx * ty; + int nsq_per_block = threadsx * threadsy; + int gout_id = threadIdx.z; + int gout_stride = blockDim.z; + int t_id = sq_id + nsq_per_block * gout_id; + int threads = nsq_per_block * gout_stride; + int li = bounds.li; + int lj = bounds.lj; + int lk = bounds.lk; + int ll = bounds.ll; + int lij = li + lj; + int lkl = lk + ll; + int order = lij + lkl; + int nf3ijkl = (order+1)*(order+2)*(order+3)/6; + int *bas = envs.bas; + int *dm_pair_loc = envs.ao_loc; + int nbas = envs.nbas; + double *env = envs.env; + double *dm = jk.dm; + double *vj = jk.vj; + int nf3ij = (lij+1)*(lij+2)*(lij+3)/6; + int nf3kl = (lkl+1)*(lkl+2)*(lkl+3)/6; + int ij_fold3idx_cum = lij*nf3ij/4; + int kl_fold3idx_cum = lkl*nf3kl/4; + Fold3Index *ij_fold3idx = c_i_in_fold3idx + ij_fold3idx_cum; + Fold3Index *kl_fold3idx = c_i_in_fold3idx + kl_fold3idx_cum; + + int npairs_ij = bounds.npairs_ij; + int npairs_kl = bounds.npairs_kl; + extern __shared__ double gamma_inc[]; + double *Rp_cache = gamma_inc + (order+1) * nsq_per_block; + double *Rq_cache = Rp_cache + bsizex*4; + double *vj_ij_cache = Rq_cache + bsizey*4; + double *vj_kl_cache = vj_ij_cache + nf3ij * bsizex; + + // zero out all cache; + for (int n = t_id; n < (bsizex*4 + bsizey*4 + nf3ij*bsizex + nf3kl*bsizey); n += threads) { + Rp_cache[n] = 0.; + } + __syncthreads(); + if (t_id < bsizex) { + int task_ij = blockIdx.x * bsizex + t_id; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int ish = pair_ij / nbas; + int jsh = pair_ij % nbas; + double ai = env[bas[ish*BAS_SLOTS+PTR_EXP]]; + double aj = env[bas[jsh*BAS_SLOTS+PTR_EXP]]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double aij = ai + aj; + double xij = (ai * ri[0] + aj * rj[0]) / aij; + double yij = (ai * ri[1] + aj * rj[1]) / aij; + double zij = (ai * ri[2] + aj * rj[2]) / aij; + Rp_cache[t_id+0*bsizex] = xij; + Rp_cache[t_id+1*bsizex] = yij; + Rp_cache[t_id+2*bsizex] = zij; + Rp_cache[t_id+3*bsizex] = aij; + } else { + Rp_cache[t_id+3*bsizex] = 1.; + } + } + if (t_id < bsizey) { 
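+ // Cache the Gaussian-product center Rq = (ak*rk + al*rl) / (ak + al) and
+ // the combined exponent akl for each kl pair of this block; padded slots
+ // get akl = 1 so 1/(aij*akl*sqrt(aij+akl)) stays finite.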
+ int task_kl = blockIdx.y * bsizey + t_id; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int ksh = pair_kl / nbas; + int lsh = pair_kl % nbas; + double ak = env[bas[ksh*BAS_SLOTS+PTR_EXP]]; + double al = env[bas[lsh*BAS_SLOTS+PTR_EXP]]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD]; + double akl = ak + al; + double xkl = (ak * rk[0] + al * rl[0]) / akl; + double ykl = (ak * rk[1] + al * rl[1]) / akl; + double zkl = (ak * rk[2] + al * rl[2]) / akl; + Rq_cache[t_id+0*bsizey] = xkl; + Rq_cache[t_id+1*bsizey] = ykl; + Rq_cache[t_id+2*bsizey] = zkl; + Rq_cache[t_id+3*bsizey] = akl; + } else { + Rq_cache[t_id+3*bsizey] = 1.; + } + } + //for (int n = ty+threadsy*gout_id; n < nf3ij*TILEX; n += threadsy*gout_stride) { + // int i = n / TILEX; + // int tile = n % TILEX; + // int task_ij = blockIdx.x * bsizex + tile * threadsx + tx; + // if (task_ij < npairs_ij) { + // int pair_ij = pair_ij_mapping[task_ij]; + // int dm_ij_pair0 = dm_pair_loc[pair_ij]; + // int sq_ij = tx + tile * threadsx; + // dm_ij_cache[sq_ij+i*bsizex] = dm[dm_ij_pair0+i]; + // } + //} + //for (int n = tx+threadsx*gout_id; n < nf3kl*TILEY; n += threadsx*gout_stride) { + // int i = n / TILEY; + // int tile = n % TILEY; + // int task_kl = blockIdx.y * bsizey + tile * threadsy + ty; + // if (task_kl < npairs_kl) { + // int pair_kl = pair_kl_mapping[task_kl]; + // int dm_kl_pair0 = dm_pair_loc[pair_kl]; + // int sq_kl = ty + tile * threadsy; + // dm_kl_cache[sq_kl+i*bsizey] = dm[dm_kl_pair0+i]; + // } + //} + __syncthreads(); + + for (int batch_ij = 0; batch_ij < TILEX; ++batch_ij) { + for (int batch_kl = 0; batch_kl < TILEY; ++batch_kl) { + int task_ij0 = blockIdx.x * bsizex + batch_ij * threadsx; + int task_kl0 = blockIdx.y * bsizey + batch_kl * threadsy; + if (task_ij0 >= npairs_ij || task_kl0 >= npairs_kl) { + continue; + } + int pair_ij0 = pair_ij_mapping[task_ij0]; + int pair_kl0 = pair_kl_mapping[task_kl0]; + if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) { + continue; + } + + int sq_ij = tx + batch_ij * threadsx; + int sq_kl = ty + batch_kl * threadsy; + int task_ij = task_ij0 + tx; + int task_kl = task_kl0 + ty; + double fac_sym = PI_FAC; + if (task_ij >= npairs_ij) { + task_ij = task_ij0; + fac_sym = 0.; + } + if (task_kl >= npairs_kl) { + task_kl = task_kl0; + fac_sym = 0.; + } + int pair_ij = pair_ij_mapping[task_ij]; + int pair_kl = pair_kl_mapping[task_kl]; + + int ish = pair_ij / nbas; + int jsh = pair_ij % nbas; + int ksh = pair_kl / nbas; + int lsh = pair_kl % nbas; + if (ish == jsh) fac_sym *= .5; + if (ksh == lsh) fac_sym *= .5; + if (pair_ij_mapping == pair_kl_mapping) { + if (task_ij == task_kl) fac_sym *= .5; + // TODO: skip certain blocks when task_ij < task_kl + if (task_ij < task_kl) fac_sym = 0.; + } + int dm_ij_pair0 = dm_pair_loc[pair_ij]; + int dm_kl_pair0 = dm_pair_loc[pair_kl]; + double *Rt, *buf; + if (gout_id == 0) { + double xij = Rp_cache[sq_ij+0*bsizex]; + double yij = Rp_cache[sq_ij+1*bsizex]; + double zij = Rp_cache[sq_ij+2*bsizex]; + double aij = Rp_cache[sq_ij+3*bsizex]; + double xkl = Rq_cache[sq_kl+0*bsizey]; + double ykl = Rq_cache[sq_kl+1*bsizey]; + double zkl = Rq_cache[sq_kl+2*bsizey]; + double akl = Rq_cache[sq_kl+3*bsizey]; + double fac = fac_sym / (aij*akl*sqrt(aij+akl)); + double xpq = xij - xkl; + double ypq = yij - ykl; + double zpq = zij - zkl; + double rr = xpq*xpq + ypq*ypq + zpq*zpq; + double theta = aij * akl / (aij + akl); + double theta_rr = theta * rr; + 
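+ // gamma_inc[m] receives the Boys function F_m(theta*rr); the loop below
+ // rescales to R^m_{000} = fac * (-2*theta)^m * F_m, the seed values of
+ // the Hermite integral recursion.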
eval_gamma_inc_fn(gamma_inc, theta_rr, order, sq_id, nsq_per_block); + double a2 = -2. * theta; + gamma_inc[sq_id] *= fac; + for (int i = 1; i <= order; i++) { + fac *= a2; + gamma_inc[sq_id+i*nsq_per_block] *= fac; + } + if (order % 2 == 0) { + Rt = vj_kl_cache + nf3kl*bsizey; + buf = Rt + nf3ijkl * nsq_per_block; + } else { + buf = vj_kl_cache + nf3kl*bsizey; + Rt = buf + nf3ijkl * nsq_per_block; + } + Rt[sq_id] = gamma_inc[sq_id+order*nsq_per_block]; + for (int n = 1; n <= order; ++n) { + // swap input and output + double *tmp = buf; + buf = Rt; + Rt = tmp; + Rt[sq_id] = gamma_inc[sq_id+(order-n)*nsq_per_block]; + switch (n) { + case 1: + Rt[sq_id+1*nsq_per_block] = zpq * buf[sq_id+0*nsq_per_block]; + Rt[sq_id+2*nsq_per_block] = ypq * buf[sq_id+0*nsq_per_block]; + Rt[sq_id+3*nsq_per_block] = xpq * buf[sq_id+0*nsq_per_block]; + break; + case 2: + Rt[sq_id+1*nsq_per_block] = zpq * buf[sq_id+0*nsq_per_block]; + Rt[sq_id+2*nsq_per_block] = zpq * buf[sq_id+1*nsq_per_block] + buf[sq_id+0*nsq_per_block]; + Rt[sq_id+3*nsq_per_block] = ypq * buf[sq_id+0*nsq_per_block]; + Rt[sq_id+4*nsq_per_block] = ypq * buf[sq_id+1*nsq_per_block]; + Rt[sq_id+5*nsq_per_block] = ypq * buf[sq_id+2*nsq_per_block] + buf[sq_id+0*nsq_per_block]; + Rt[sq_id+6*nsq_per_block] = xpq * buf[sq_id+0*nsq_per_block]; + Rt[sq_id+7*nsq_per_block] = xpq * buf[sq_id+1*nsq_per_block]; + Rt[sq_id+8*nsq_per_block] = xpq * buf[sq_id+2*nsq_per_block]; + Rt[sq_id+9*nsq_per_block] = xpq * buf[sq_id+3*nsq_per_block] + buf[sq_id+0*nsq_per_block]; + break; + case 3: + Rt[sq_id+1*nsq_per_block] = zpq * buf[sq_id+0*nsq_per_block]; + Rt[sq_id+2*nsq_per_block] = zpq * buf[sq_id+1*nsq_per_block] + buf[sq_id+0*nsq_per_block]; + Rt[sq_id+3*nsq_per_block] = zpq * buf[sq_id+2*nsq_per_block] + 2 * buf[sq_id+1*nsq_per_block]; + Rt[sq_id+4*nsq_per_block] = ypq * buf[sq_id+0*nsq_per_block]; + Rt[sq_id+5*nsq_per_block] = ypq * buf[sq_id+1*nsq_per_block]; + Rt[sq_id+6*nsq_per_block] = ypq * buf[sq_id+2*nsq_per_block]; + Rt[sq_id+7*nsq_per_block] = ypq * buf[sq_id+3*nsq_per_block] + buf[sq_id+0*nsq_per_block]; + Rt[sq_id+8*nsq_per_block] = ypq * buf[sq_id+4*nsq_per_block] + buf[sq_id+1*nsq_per_block]; + Rt[sq_id+9*nsq_per_block] = ypq * buf[sq_id+5*nsq_per_block] + 2 * buf[sq_id+3*nsq_per_block]; + Rt[sq_id+10*nsq_per_block] = xpq * buf[sq_id+0*nsq_per_block]; + Rt[sq_id+11*nsq_per_block] = xpq * buf[sq_id+1*nsq_per_block]; + Rt[sq_id+12*nsq_per_block] = xpq * buf[sq_id+2*nsq_per_block]; + Rt[sq_id+13*nsq_per_block] = xpq * buf[sq_id+3*nsq_per_block]; + Rt[sq_id+14*nsq_per_block] = xpq * buf[sq_id+4*nsq_per_block]; + Rt[sq_id+15*nsq_per_block] = xpq * buf[sq_id+5*nsq_per_block]; + Rt[sq_id+16*nsq_per_block] = xpq * buf[sq_id+6*nsq_per_block] + buf[sq_id+0*nsq_per_block]; + Rt[sq_id+17*nsq_per_block] = xpq * buf[sq_id+7*nsq_per_block] + buf[sq_id+1*nsq_per_block]; + Rt[sq_id+18*nsq_per_block] = xpq * buf[sq_id+8*nsq_per_block] + buf[sq_id+3*nsq_per_block]; + Rt[sq_id+19*nsq_per_block] = xpq * buf[sq_id+9*nsq_per_block] + 2 * buf[sq_id+6*nsq_per_block]; + break; + case 4: + Rt[sq_id+1*nsq_per_block] = zpq * buf[sq_id+0*nsq_per_block]; + Rt[sq_id+2*nsq_per_block] = zpq * buf[sq_id+1*nsq_per_block] + buf[sq_id+0*nsq_per_block]; + Rt[sq_id+3*nsq_per_block] = zpq * buf[sq_id+2*nsq_per_block] + 2 * buf[sq_id+1*nsq_per_block]; + Rt[sq_id+4*nsq_per_block] = zpq * buf[sq_id+3*nsq_per_block] + 3 * buf[sq_id+2*nsq_per_block]; + Rt[sq_id+5*nsq_per_block] = ypq * buf[sq_id+0*nsq_per_block]; + Rt[sq_id+6*nsq_per_block] = ypq * buf[sq_id+1*nsq_per_block]; + 
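+ // Cases 1-4 hard-code the order-n Hermite table in v-fastest ordering;
+ // orders above 4 fall back to the table-driven iter_Rt_n below.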
Rt[sq_id+7*nsq_per_block] = ypq * buf[sq_id+2*nsq_per_block]; + Rt[sq_id+8*nsq_per_block] = ypq * buf[sq_id+3*nsq_per_block]; + Rt[sq_id+9*nsq_per_block] = ypq * buf[sq_id+4*nsq_per_block] + buf[sq_id+0*nsq_per_block]; + Rt[sq_id+10*nsq_per_block] = ypq * buf[sq_id+5*nsq_per_block] + buf[sq_id+1*nsq_per_block]; + Rt[sq_id+11*nsq_per_block] = ypq * buf[sq_id+6*nsq_per_block] + buf[sq_id+2*nsq_per_block]; + Rt[sq_id+12*nsq_per_block] = ypq * buf[sq_id+7*nsq_per_block] + 2 * buf[sq_id+4*nsq_per_block]; + Rt[sq_id+13*nsq_per_block] = ypq * buf[sq_id+8*nsq_per_block] + 2 * buf[sq_id+5*nsq_per_block]; + Rt[sq_id+14*nsq_per_block] = ypq * buf[sq_id+9*nsq_per_block] + 3 * buf[sq_id+7*nsq_per_block]; + Rt[sq_id+15*nsq_per_block] = xpq * buf[sq_id+0*nsq_per_block]; + Rt[sq_id+16*nsq_per_block] = xpq * buf[sq_id+1*nsq_per_block]; + Rt[sq_id+17*nsq_per_block] = xpq * buf[sq_id+2*nsq_per_block]; + Rt[sq_id+18*nsq_per_block] = xpq * buf[sq_id+3*nsq_per_block]; + Rt[sq_id+19*nsq_per_block] = xpq * buf[sq_id+4*nsq_per_block]; + Rt[sq_id+20*nsq_per_block] = xpq * buf[sq_id+5*nsq_per_block]; + Rt[sq_id+21*nsq_per_block] = xpq * buf[sq_id+6*nsq_per_block]; + Rt[sq_id+22*nsq_per_block] = xpq * buf[sq_id+7*nsq_per_block]; + Rt[sq_id+23*nsq_per_block] = xpq * buf[sq_id+8*nsq_per_block]; + Rt[sq_id+24*nsq_per_block] = xpq * buf[sq_id+9*nsq_per_block]; + Rt[sq_id+25*nsq_per_block] = xpq * buf[sq_id+10*nsq_per_block] + buf[sq_id+0*nsq_per_block]; + Rt[sq_id+26*nsq_per_block] = xpq * buf[sq_id+11*nsq_per_block] + buf[sq_id+1*nsq_per_block]; + Rt[sq_id+27*nsq_per_block] = xpq * buf[sq_id+12*nsq_per_block] + buf[sq_id+2*nsq_per_block]; + Rt[sq_id+28*nsq_per_block] = xpq * buf[sq_id+13*nsq_per_block] + buf[sq_id+4*nsq_per_block]; + Rt[sq_id+29*nsq_per_block] = xpq * buf[sq_id+14*nsq_per_block] + buf[sq_id+5*nsq_per_block]; + Rt[sq_id+30*nsq_per_block] = xpq * buf[sq_id+15*nsq_per_block] + buf[sq_id+7*nsq_per_block]; + Rt[sq_id+31*nsq_per_block] = xpq * buf[sq_id+16*nsq_per_block] + 2 * buf[sq_id+10*nsq_per_block]; + Rt[sq_id+32*nsq_per_block] = xpq * buf[sq_id+17*nsq_per_block] + 2 * buf[sq_id+11*nsq_per_block]; + Rt[sq_id+33*nsq_per_block] = xpq * buf[sq_id+18*nsq_per_block] + 2 * buf[sq_id+13*nsq_per_block]; + Rt[sq_id+34*nsq_per_block] = xpq * buf[sq_id+19*nsq_per_block] + 3 * buf[sq_id+16*nsq_per_block]; + break; + default: iter_Rt_n(Rt, buf, xpq, ypq, zpq, n, sq_id, nsq_per_block); + } + } + } + + Rt = vj_kl_cache + nf3kl*bsizey; + double *vj_cache = Rt + nf3ijkl * nsq_per_block; + //for (k = 0, e = 0; e <= l1; ++e) { + //for (f = 0; f <= l1-e; ++f) { + //for (g = 0; g <= l1-e-f; ++g, ++k) { + // double rho_kl_val = rho_kl[k]; + // double jvec_kl_val = 0.; + // double fac = 1; + // if ((e + f + g) % 2 != 0) { + // fac = -1; + // } + // for (i = 0, t = 0; t <= l2; ++t) { + // for (u = 0; u <= l2-t; ++u) { + // for (v = 0; v <= l2-t-u; ++v, ++i) { + // s = fac * R[e+t,f+u,g+v] + // jvec_kl_val += s * rho_ij[i]; + // jvec_ij[i] += s * rho_kl_val; + // } } } + // jvec_kl[k] += jvec_kl_val; + //} } } + for (int k = gout_id; k < nf3kl+gout_id; k += gout_stride) { + __syncthreads(); + double vj_kl = 0.; + if (k < nf3kl) { + Fold3Index f3k = kl_fold3idx[k]; + int e = f3k.x; + int f = f3k.y; + int g = f3k.z; + double fac = 1.; + if ((e + f + g) % 2 != 0) { + fac = -1.; + } + for (int i = 0, t = 0; t <= lij; ++t) { + for (int u = 0; u <= lij-t; ++u) { + for (int v = 0; v <= lij-t-u; ++v, ++i) { + //double s = Rt[sq_id+ADDR(order,e+t,f+u,g+v)*nsq_per_block]; + int ix = order-e-t; + int xoffset = ix*(ix+1)*(ix+2)/6; + int iy 
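+ // Flat index of R_{e+t, f+u, g+v}: with ix = order-e-t and iy = ix-f-u,
+ // nf3ijkl - ix*(ix+1)*(ix+2)/6 - (iy+1)*(iy+2)/2 + (g+v) is algebraically
+ // identical to the ADDR(order, e+t, f+u, g+v) macro.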
= ix-f-u;
+ int i2y = (iy+1)*(iy+2)/2;
+ double s = Rt[sq_id+(nf3ijkl-xoffset-i2y+g+v)*nsq_per_block];
+ vj_kl += fac * s * dm[dm_ij_pair0+i];
+ } } }
+ //atomicAdd(vj+dm_kl_pair0+k, vj_kl);
+ }
+ vj_cache[t_id] = vj_kl;
+ for (int stride = threadsx/2; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (tx < stride) {
+ vj_cache[t_id] += vj_cache[t_id + stride];
+ }
+ }
+ __syncthreads();
+ if (tx == 0 && task_kl0+ty < npairs_kl) {
+ vj_kl_cache[sq_kl+k*bsizey] += vj_cache[t_id];
+ }
+ }
+
+ for (int i = gout_id; i < nf3ij+gout_id; i += gout_stride) {
+ __syncthreads();
+ double vj_ij = 0.;
+ if (i < nf3ij) {
+ Fold3Index f3i = ij_fold3idx[i];
+ int t = f3i.x;
+ int u = f3i.y;
+ int v = f3i.z;
+ for (int k = 0, e = 0; e <= lkl; ++e) {
+ for (int f = 0; f <= lkl-e; ++f) {
+ for (int g = 0; g <= lkl-e-f; ++g, ++k) {
+ //double s = Rt[sq_id+ADDR(order,e+t,f+u,g+v)*nsq_per_block];
+ int ix = order-e-t;
+ int xoffset = ix*(ix+1)*(ix+2)/6;
+ int iy = ix-f-u;
+ int i2y = (iy+1)*(iy+2)/2;
+ double s = Rt[sq_id+(nf3ijkl-xoffset-i2y+g+v)*nsq_per_block];
+ double d = dm[dm_kl_pair0+k];
+ if ((e + f + g) % 2 == 0) {
+ vj_ij += s * d;
+ } else {
+ vj_ij -= s * d;
+ }
+ } } }
+ //atomicAdd(vj+dm_ij_pair0+i, vj_ij);
+ }
+ vj_cache[t_id] = vj_ij;
+ for (int stride = threadsy/2; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[t_id] += vj_cache[t_id + stride*threadsx];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+i*bsizex] += vj_cache[t_id];
+ }
+ }
+ __syncthreads();
+ } }
+
+ for (int n = ty+threadsy*gout_id; n < nf3ij*TILEX; n += threadsy*gout_stride) {
+ int i = n / TILEX;
+ int tile = n % TILEX;
+ int task_ij = blockIdx.x * bsizex + tile * threadsx + tx;
+ if (task_ij < npairs_ij) {
+ int pair_ij = pair_ij_mapping[task_ij];
+ int dm_ij_pair0 = dm_pair_loc[pair_ij];
+ int sq_ij = tx + tile * threadsx;
+ atomicAdd(vj+dm_ij_pair0+i, vj_ij_cache[sq_ij+i*bsizex]);
+ }
+ }
+ for (int n = tx+threadsx*gout_id; n < nf3kl*TILEY; n += threadsx*gout_stride) {
+ int i = n / TILEY;
+ int tile = n % TILEY;
+ int task_kl = blockIdx.y * bsizey + tile * threadsy + ty;
+ if (task_kl < npairs_kl) {
+ int pair_kl = pair_kl_mapping[task_kl];
+ int dm_kl_pair0 = dm_pair_loc[pair_kl];
+ int sq_kl = ty + tile * threadsy;
+ atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*bsizey]);
+ }
+ }
+}
diff --git a/gpu4pyscf/lib/gvhf-md/md_j_driver.cu b/gpu4pyscf/lib/gvhf-md/md_j_driver.cu
new file mode 100644
index 00000000..e48407a6
--- /dev/null
+++ b/gpu4pyscf/lib/gvhf-md/md_j_driver.cu
@@ -0,0 +1,434 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include "gvhf-rys/vhf.cuh"
+
+#define TILEX 2
+#define TILEY 4
+
+__constant__ uint16_t c_Rt_idx[5967];
+__constant__ uint16_t c_Rt_offsets[19];
+__constant__ Fold2Index c_i_in_fold2idx[165];
+__constant__ Fold3Index c_i_in_fold3idx[495];
+
+
+extern __global__ void md_j_kernel(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds);
+int md_j_unrolled(RysIntEnvVars *envs, JKMatrix *jk, BoundsInfo *bounds,
+ int *scheme, int workers, double omega);
+void set_md_j_unrolled_shm_size();
+
+static uint16_t Rt_idx[] = {
+// l = 1
+0,0,0,
+// l = 2
+0,0,0,0,0,0,0,0,0,
+// l = 3
+0,0,1,0,0,0,0,1,3,0,0,0,0,0,0,0,1,3,6,
+// l = 4
+0,0,1,2,0,0,0,0,0,1,2,4,5,7,0,0,0,0,0,0,
+0,0,0,0,0,1,2,4,5,7,10,11,13,16,
+// l = 5
+0,0,1,2,3,0,0,0,0,0,0,1,2,3,5,6,7,9,10,12,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,3,5,
+6,7,9,10,12,15,16,17,19,20,22,25,26,28,31,
+// l = 6
+0,0,1,2,3,4,0,0,0,0,0,0,0,1,2,3,4,6,7,8,
+9,11,12,13,15,16,18,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,1,2,3,4,6,7,8,9,11,12,13, +15,16,18,21,22,23,24,26,27,28,30,31,33,36,37,38,40,41,43,46, +47,49,52, +// l = 7 +0,0,1,2,3,4,5,0,0,0,0,0,0,0,0,1,2,3,4,5, +7,8,9,10,11,13,14,15,16,18,19,20,22,23,25,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,1,2,3,4,5,7,8,9,10,11,13,14,15,16,18,19, +20,22,23,25,28,29,30,31,32,34,35,36,37,39,40,41,43,44,46,49, +50,51,52,54,55,56,58,59,61,64,65,66,68,69,71,74,75,77,80, +// l = 8 +0,0,1,2,3,4,5,6,0,0,0,0,0,0,0,0,0,1,2,3, +4,5,6,8,9,10,11,12,13,15,16,17,18,19,21,22,23,24,26,27, +28,30,31,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,1,2,3,4,5,6,8,9,10,11,12,13,15,16,17,18,19,21,22, +23,24,26,27,28,30,31,33,36,37,38,39,40,41,43,44,45,46,47,49, +50,51,52,54,55,56,58,59,61,64,65,66,67,68,70,71,72,73,75,76, +77,79,80,82,85,86,87,88,90,91,92,94,95,97,100,101,102,104,105,107, +110,111,113,116, +// l = 9 +0,0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0,0,0,1, +2,3,4,5,6,7,9,10,11,12,13,14,15,17,18,19,20,21,22,24, +25,26,27,28,30,31,32,33,35,36,37,39,40,42,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +1,2,3,4,5,6,7,9,10,11,12,13,14,15,17,18,19,20,21,22, +24,25,26,27,28,30,31,32,33,35,36,37,39,40,42,45,46,47,48,49, +50,51,53,54,55,56,57,58,60,61,62,63,64,66,67,68,69,71,72,73, +75,76,78,81,82,83,84,85,86,88,89,90,91,92,94,95,96,97,99,100, +101,103,104,106,109,110,111,112,113,115,116,117,118,120,121,122,124,125,127,130, +131,132,133,135,136,137,139,140,142,145,146,147,149,150,152,155,156,158,161, +// l = 10 +0,0,1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0,0,0, +0,1,2,3,4,5,6,7,8,10,11,12,13,14,15,16,17,19,20,21, +22,23,24,25,27,28,29,30,31,32,34,35,36,37,38,40,41,42,43,45, +46,47,49,50,52,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,1,2,3,4,5,6,7,8,10,11,12,13,14,15,16,17,19,20,21, +22,23,24,25,27,28,29,30,31,32,34,35,36,37,38,40,41,42,43,45, +46,47,49,50,52,55,56,57,58,59,60,61,62,64,65,66,67,68,69,70, +72,73,74,75,76,77,79,80,81,82,83,85,86,87,88,90,91,92,94,95, +97,100,101,102,103,104,105,106,108,109,110,111,112,113,115,116,117,118,119,121, +122,123,124,126,127,128,130,131,133,136,137,138,139,140,141,143,144,145,146,147, +149,150,151,152,154,155,156,158,159,161,164,165,166,167,168,170,171,172,173,175, +176,177,179,180,182,185,186,187,188,190,191,192,194,195,197,200,201,202,204,205, +207,210,211,213,216, +// l = 11 +0,0,1,2,3,4,5,6,7,8,9,0,0,0,0,0,0,0,0,0, +0,0,0,1,2,3,4,5,6,7,8,9,11,12,13,14,15,16,17,18, +19,21,22,23,24,25,26,27,28,30,31,32,33,34,35,36,38,39,40,41, +42,43,45,46,47,48,49,51,52,53,54,56,57,58,60,61,63,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,1,2,3,4,5,6,7,8,9,11,12,13,14,15,16,17, +18,19,21,22,23,24,25,26,27,28,30,31,32,33,34,35,36,38,39,40, +41,42,43,45,46,47,48,49,51,52,53,54,56,57,58,60,61,63,66,67, +68,69,70,71,72,73,74,76,77,78,79,80,81,82,83,85,86,87,88,89, +90,91,93,94,95,96,97,98,100,101,102,103,104,106,107,108,109,111,112,113, +115,116,118,121,122,123,124,125,126,127,128,130,131,132,133,134,135,136,138,139, +140,141,142,143,145,146,147,148,149,151,152,153,154,156,157,158,160,161,163,166, +167,168,169,170,171,172,174,175,176,177,178,179,181,182,183,184,185,187,188,189, +190,192,193,194,196,197,199,202,203,204,205,206,207,209,210,211,212,213,215,216, +217,218,220,221,222,224,225,227,230,231,232,233,234,236,237,238,239,241,242,243, 
+245,246,248,251,252,253,254,256,257,258,260,261,263,266,267,268,270,271,273,276, +277,279,282, +// l = 12 +0,0,1,2,3,4,5,6,7,8,9,10,0,0,0,0,0,0,0,0, +0,0,0,0,0,1,2,3,4,5,6,7,8,9,10,12,13,14,15,16, +17,18,19,20,21,23,24,25,26,27,28,29,30,31,33,34,35,36,37,38, +39,40,42,43,44,45,46,47,48,50,51,52,53,54,55,57,58,59,60,61, +63,64,65,66,68,69,70,72,73,75,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,1,2,3,4,5,6,7,8,9,10,12, +13,14,15,16,17,18,19,20,21,23,24,25,26,27,28,29,30,31,33,34, +35,36,37,38,39,40,42,43,44,45,46,47,48,50,51,52,53,54,55,57, +58,59,60,61,63,64,65,66,68,69,70,72,73,75,78,79,80,81,82,83, +84,85,86,87,89,90,91,92,93,94,95,96,97,99,100,101,102,103,104,105, +106,108,109,110,111,112,113,114,116,117,118,119,120,121,123,124,125,126,127,129, +130,131,132,134,135,136,138,139,141,144,145,146,147,148,149,150,151,152,154,155, +156,157,158,159,160,161,163,164,165,166,167,168,169,171,172,173,174,175,176,178, +179,180,181,182,184,185,186,187,189,190,191,193,194,196,199,200,201,202,203,204, +205,206,208,209,210,211,212,213,214,216,217,218,219,220,221,223,224,225,226,227, +229,230,231,232,234,235,236,238,239,241,244,245,246,247,248,249,250,252,253,254, +255,256,257,259,260,261,262,263,265,266,267,268,270,271,272,274,275,277,280,281, +282,283,284,285,287,288,289,290,291,293,294,295,296,298,299,300,302,303,305,308, +309,310,311,312,314,315,316,317,319,320,321,323,324,326,329,330,331,332,334,335, +336,338,339,341,344,345,346,348,349,351,354,355,357,360, +// l = 13 +0,0,1,2,3,4,5,6,7,8,9,10,11,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,1,2,3,4,5,6,7,8,9,10,11,13,14, +15,16,17,18,19,20,21,22,23,25,26,27,28,29,30,31,32,33,34,36, +37,38,39,40,41,42,43,44,46,47,48,49,50,51,52,53,55,56,57,58, +59,60,61,63,64,65,66,67,68,70,71,72,73,74,76,77,78,79,81,82, +83,85,86,88,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,3,4, +5,6,7,8,9,10,11,13,14,15,16,17,18,19,20,21,22,23,25,26, +27,28,29,30,31,32,33,34,36,37,38,39,40,41,42,43,44,46,47,48, +49,50,51,52,53,55,56,57,58,59,60,61,63,64,65,66,67,68,70,71, +72,73,74,76,77,78,79,81,82,83,85,86,88,91,92,93,94,95,96,97, +98,99,100,101,103,104,105,106,107,108,109,110,111,112,114,115,116,117,118,119, +120,121,122,124,125,126,127,128,129,130,131,133,134,135,136,137,138,139,141,142, +143,144,145,146,148,149,150,151,152,154,155,156,157,159,160,161,163,164,166,169, +170,171,172,173,174,175,176,177,178,180,181,182,183,184,185,186,187,188,190,191, +192,193,194,195,196,197,199,200,201,202,203,204,205,207,208,209,210,211,212,214, +215,216,217,218,220,221,222,223,225,226,227,229,230,232,235,236,237,238,239,240, +241,242,243,245,246,247,248,249,250,251,252,254,255,256,257,258,259,260,262,263, +264,265,266,267,269,270,271,272,273,275,276,277,278,280,281,282,284,285,287,290, +291,292,293,294,295,296,297,299,300,301,302,303,304,305,307,308,309,310,311,312, +314,315,316,317,318,320,321,322,323,325,326,327,329,330,332,335,336,337,338,339, +340,341,343,344,345,346,347,348,350,351,352,353,354,356,357,358,359,361,362,363, +365,366,368,371,372,373,374,375,376,378,379,380,381,382,384,385,386,387,389,390, +391,393,394,396,399,400,401,402,403,405,406,407,408,410,411,412,414,415,417,420, +421,422,423,425,426,427,429,430,432,435,436,437,439,440,442,445,446,448,451, +// l = 14 +0,0,1,2,3,4,5,6,7,8,9,10,11,12,0,0,0,0,0,0, 
+0,0,0,0,0,0,0,0,0,1,2,3,4,5,6,7,8,9,10,11, +12,14,15,16,17,18,19,20,21,22,23,24,25,27,28,29,30,31,32,33, +34,35,36,37,39,40,41,42,43,44,45,46,47,48,50,51,52,53,54,55, +56,57,58,60,61,62,63,64,65,66,67,69,70,71,72,73,74,75,77,78, +79,80,81,82,84,85,86,87,88,90,91,92,93,95,96,97,99,100,102,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,1,2,3,4,5,6,7,8,9,10,11,12,14,15,16, +17,18,19,20,21,22,23,24,25,27,28,29,30,31,32,33,34,35,36,37, +39,40,41,42,43,44,45,46,47,48,50,51,52,53,54,55,56,57,58,60, +61,62,63,64,65,66,67,69,70,71,72,73,74,75,77,78,79,80,81,82, +84,85,86,87,88,90,91,92,93,95,96,97,99,100,102,105,106,107,108,109, +110,111,112,113,114,115,116,118,119,120,121,122,123,124,125,126,127,128,130,131, +132,133,134,135,136,137,138,139,141,142,143,144,145,146,147,148,149,151,152,153, +154,155,156,157,158,160,161,162,163,164,165,166,168,169,170,171,172,173,175,176, +177,178,179,181,182,183,184,186,187,188,190,191,193,196,197,198,199,200,201,202, +203,204,205,206,208,209,210,211,212,213,214,215,216,217,219,220,221,222,223,224, +225,226,227,229,230,231,232,233,234,235,236,238,239,240,241,242,243,244,246,247, +248,249,250,251,253,254,255,256,257,259,260,261,262,264,265,266,268,269,271,274, +275,276,277,278,279,280,281,282,283,285,286,287,288,289,290,291,292,293,295,296, +297,298,299,300,301,302,304,305,306,307,308,309,310,312,313,314,315,316,317,319, +320,321,322,323,325,326,327,328,330,331,332,334,335,337,340,341,342,343,344,345, +346,347,348,350,351,352,353,354,355,356,357,359,360,361,362,363,364,365,367,368, +369,370,371,372,374,375,376,377,378,380,381,382,383,385,386,387,389,390,392,395, +396,397,398,399,400,401,402,404,405,406,407,408,409,410,412,413,414,415,416,417, +419,420,421,422,423,425,426,427,428,430,431,432,434,435,437,440,441,442,443,444, +445,446,448,449,450,451,452,453,455,456,457,458,459,461,462,463,464,466,467,468, +470,471,473,476,477,478,479,480,481,483,484,485,486,487,489,490,491,492,494,495, +496,498,499,501,504,505,506,507,508,510,511,512,513,515,516,517,519,520,522,525, +526,527,528,530,531,532,534,535,537,540,541,542,544,545,547,550,551,553,556, +// l = 15 +0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,1,2,3,4,5,6,7,8,9, +10,11,12,13,15,16,17,18,19,20,21,22,23,24,25,26,27,29,30,31, +32,33,34,35,36,37,38,39,40,42,43,44,45,46,47,48,49,50,51,52, +54,55,56,57,58,59,60,61,62,63,65,66,67,68,69,70,71,72,73,75, +76,77,78,79,80,81,82,84,85,86,87,88,89,90,92,93,94,95,96,97, +99,100,101,102,103,105,106,107,108,110,111,112,114,115,117,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,3,4, +5,6,7,8,9,10,11,12,13,15,16,17,18,19,20,21,22,23,24,25, +26,27,29,30,31,32,33,34,35,36,37,38,39,40,42,43,44,45,46,47, +48,49,50,51,52,54,55,56,57,58,59,60,61,62,63,65,66,67,68,69, +70,71,72,73,75,76,77,78,79,80,81,82,84,85,86,87,88,89,90,92, +93,94,95,96,97,99,100,101,102,103,105,106,107,108,110,111,112,114,115,117, +120,121,122,123,124,125,126,127,128,129,130,131,132,134,135,136,137,138,139,140, +141,142,143,144,145,147,148,149,150,151,152,153,154,155,156,157,159,160,161,162, +163,164,165,166,167,168,170,171,172,173,174,175,176,177,178,180,181,182,183,184, 
+185,186,187,189,190,191,192,193,194,195,197,198,199,200,201,202,204,205,206,207, +208,210,211,212,213,215,216,217,219,220,222,225,226,227,228,229,230,231,232,233, +234,235,236,238,239,240,241,242,243,244,245,246,247,248,250,251,252,253,254,255, +256,257,258,259,261,262,263,264,265,266,267,268,269,271,272,273,274,275,276,277, +278,280,281,282,283,284,285,286,288,289,290,291,292,293,295,296,297,298,299,301, +302,303,304,306,307,308,310,311,313,316,317,318,319,320,321,322,323,324,325,326, +328,329,330,331,332,333,334,335,336,337,339,340,341,342,343,344,345,346,347,349, +350,351,352,353,354,355,356,358,359,360,361,362,363,364,366,367,368,369,370,371, +373,374,375,376,377,379,380,381,382,384,385,386,388,389,391,394,395,396,397,398, +399,400,401,402,403,405,406,407,408,409,410,411,412,413,415,416,417,418,419,420, +421,422,424,425,426,427,428,429,430,432,433,434,435,436,437,439,440,441,442,443, +445,446,447,448,450,451,452,454,455,457,460,461,462,463,464,465,466,467,468,470, +471,472,473,474,475,476,477,479,480,481,482,483,484,485,487,488,489,490,491,492, +494,495,496,497,498,500,501,502,503,505,506,507,509,510,512,515,516,517,518,519, +520,521,522,524,525,526,527,528,529,530,532,533,534,535,536,537,539,540,541,542, +543,545,546,547,548,550,551,552,554,555,557,560,561,562,563,564,565,566,568,569, +570,571,572,573,575,576,577,578,579,581,582,583,584,586,587,588,590,591,593,596, +597,598,599,600,601,603,604,605,606,607,609,610,611,612,614,615,616,618,619,621, +624,625,626,627,628,630,631,632,633,635,636,637,639,640,642,645,646,647,648,650, +651,652,654,655,657,660,661,662,664,665,667,670,671,673,676, +// l = 16 +0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,3,4,5,6,7, +8,9,10,11,12,13,14,16,17,18,19,20,21,22,23,24,25,26,27,28, +29,31,32,33,34,35,36,37,38,39,40,41,42,43,45,46,47,48,49,50, +51,52,53,54,55,56,58,59,60,61,62,63,64,65,66,67,68,70,71,72, +73,74,75,76,77,78,79,81,82,83,84,85,86,87,88,89,91,92,93,94, +95,96,97,98,100,101,102,103,104,105,106,108,109,110,111,112,113,115,116,117, +118,119,121,122,123,124,126,127,128,130,131,133,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,1,2,3,4,5,6,7,8,9,10,11, +12,13,14,16,17,18,19,20,21,22,23,24,25,26,27,28,29,31,32,33, +34,35,36,37,38,39,40,41,42,43,45,46,47,48,49,50,51,52,53,54, +55,56,58,59,60,61,62,63,64,65,66,67,68,70,71,72,73,74,75,76, +77,78,79,81,82,83,84,85,86,87,88,89,91,92,93,94,95,96,97,98, +100,101,102,103,104,105,106,108,109,110,111,112,113,115,116,117,118,119,121,122, +123,124,126,127,128,130,131,133,136,137,138,139,140,141,142,143,144,145,146,147, +148,149,151,152,153,154,155,156,157,158,159,160,161,162,163,165,166,167,168,169, +170,171,172,173,174,175,176,178,179,180,181,182,183,184,185,186,187,188,190,191, +192,193,194,195,196,197,198,199,201,202,203,204,205,206,207,208,209,211,212,213, +214,215,216,217,218,220,221,222,223,224,225,226,228,229,230,231,232,233,235,236, +237,238,239,241,242,243,244,246,247,248,250,251,253,256,257,258,259,260,261,262, +263,264,265,266,267,268,270,271,272,273,274,275,276,277,278,279,280,281,283,284, +285,286,287,288,289,290,291,292,293,295,296,297,298,299,300,301,302,303,304,306, +307,308,309,310,311,312,313,314,316,317,318,319,320,321,322,323,325,326,327,328, 
+329,330,331,333,334,335,336,337,338,340,341,342,343,344,346,347,348,349,351,352, +353,355,356,358,361,362,363,364,365,366,367,368,369,370,371,372,374,375,376,377, +378,379,380,381,382,383,384,386,387,388,389,390,391,392,393,394,395,397,398,399, +400,401,402,403,404,405,407,408,409,410,411,412,413,414,416,417,418,419,420,421, +422,424,425,426,427,428,429,431,432,433,434,435,437,438,439,440,442,443,444,446, +447,449,452,453,454,455,456,457,458,459,460,461,462,464,465,466,467,468,469,470, +471,472,473,475,476,477,478,479,480,481,482,483,485,486,487,488,489,490,491,492, +494,495,496,497,498,499,500,502,503,504,505,506,507,509,510,511,512,513,515,516, +517,518,520,521,522,524,525,527,530,531,532,533,534,535,536,537,538,539,541,542, +543,544,545,546,547,548,549,551,552,553,554,555,556,557,558,560,561,562,563,564, +565,566,568,569,570,571,572,573,575,576,577,578,579,581,582,583,584,586,587,588, +590,591,593,596,597,598,599,600,601,602,603,604,606,607,608,609,610,611,612,613, +615,616,617,618,619,620,621,623,624,625,626,627,628,630,631,632,633,634,636,637, +638,639,641,642,643,645,646,648,651,652,653,654,655,656,657,658,660,661,662,663, +664,665,666,668,669,670,671,672,673,675,676,677,678,679,681,682,683,684,686,687, +688,690,691,693,696,697,698,699,700,701,702,704,705,706,707,708,709,711,712,713, +714,715,717,718,719,720,722,723,724,726,727,729,732,733,734,735,736,737,739,740, +741,742,743,745,746,747,748,750,751,752,754,755,757,760,761,762,763,764,766,767, +768,769,771,772,773,775,776,778,781,782,783,784,786,787,788,790,791,793,796,797, +798,800,801,803,806,807,809,812, +// l = 17 +0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,3,4,5, +6,7,8,9,10,11,12,13,14,15,17,18,19,20,21,22,23,24,25,26, +27,28,29,30,31,33,34,35,36,37,38,39,40,41,42,43,44,45,46,48, +49,50,51,52,53,54,55,56,57,58,59,60,62,63,64,65,66,67,68,69, +70,71,72,73,75,76,77,78,79,80,81,82,83,84,85,87,88,89,90,91, +92,93,94,95,96,98,99,100,101,102,103,104,105,106,108,109,110,111,112,113, +114,115,117,118,119,120,121,122,123,125,126,127,128,129,130,132,133,134,135,136, +138,139,140,141,143,144,145,147,148,150,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17, +18,19,20,21,22,23,24,25,26,27,28,29,30,31,33,34,35,36,37,38, +39,40,41,42,43,44,45,46,48,49,50,51,52,53,54,55,56,57,58,59, +60,62,63,64,65,66,67,68,69,70,71,72,73,75,76,77,78,79,80,81, +82,83,84,85,87,88,89,90,91,92,93,94,95,96,98,99,100,101,102,103, +104,105,106,108,109,110,111,112,113,114,115,117,118,119,120,121,122,123,125,126, +127,128,129,130,132,133,134,135,136,138,139,140,141,143,144,145,147,148,150,153, +154,155,156,157,158,159,160,161,162,163,164,165,166,167,169,170,171,172,173,174, +175,176,177,178,179,180,181,182,184,185,186,187,188,189,190,191,192,193,194,195, +196,198,199,200,201,202,203,204,205,206,207,208,209,211,212,213,214,215,216,217, +218,219,220,221,223,224,225,226,227,228,229,230,231,232,234,235,236,237,238,239, +240,241,242,244,245,246,247,248,249,250,251,253,254,255,256,257,258,259,261,262, +263,264,265,266,268,269,270,271,272,274,275,276,277,279,280,281,283,284,286,289, +290,291,292,293,294,295,296,297,298,299,300,301,302,304,305,306,307,308,309,310, 
+311,312,313,314,315,316,318,319,320,321,322,323,324,325,326,327,328,329,331,332, +333,334,335,336,337,338,339,340,341,343,344,345,346,347,348,349,350,351,352,354, +355,356,357,358,359,360,361,362,364,365,366,367,368,369,370,371,373,374,375,376, +377,378,379,381,382,383,384,385,386,388,389,390,391,392,394,395,396,397,399,400, +401,403,404,406,409,410,411,412,413,414,415,416,417,418,419,420,421,423,424,425, +426,427,428,429,430,431,432,433,434,436,437,438,439,440,441,442,443,444,445,446, +448,449,450,451,452,453,454,455,456,457,459,460,461,462,463,464,465,466,467,469, +470,471,472,473,474,475,476,478,479,480,481,482,483,484,486,487,488,489,490,491, +493,494,495,496,497,499,500,501,502,504,505,506,508,509,511,514,515,516,517,518, +519,520,521,522,523,524,525,527,528,529,530,531,532,533,534,535,536,537,539,540, +541,542,543,544,545,546,547,548,550,551,552,553,554,555,556,557,558,560,561,562, +563,564,565,566,567,569,570,571,572,573,574,575,577,578,579,580,581,582,584,585, +586,587,588,590,591,592,593,595,596,597,599,600,602,605,606,607,608,609,610,611, +612,613,614,615,617,618,619,620,621,622,623,624,625,626,628,629,630,631,632,633, +634,635,636,638,639,640,641,642,643,644,645,647,648,649,650,651,652,653,655,656, +657,658,659,660,662,663,664,665,666,668,669,670,671,673,674,675,677,678,680,683, +684,685,686,687,688,689,690,691,692,694,695,696,697,698,699,700,701,702,704,705, +706,707,708,709,710,711,713,714,715,716,717,718,719,721,722,723,724,725,726,728, +729,730,731,732,734,735,736,737,739,740,741,743,744,746,749,750,751,752,753,754, +755,756,757,759,760,761,762,763,764,765,766,768,769,770,771,772,773,774,776,777, +778,779,780,781,783,784,785,786,787,789,790,791,792,794,795,796,798,799,801,804, +805,806,807,808,809,810,811,813,814,815,816,817,818,819,821,822,823,824,825,826, +828,829,830,831,832,834,835,836,837,839,840,841,843,844,846,849,850,851,852,853, +854,855,857,858,859,860,861,862,864,865,866,867,868,870,871,872,873,875,876,877, +879,880,882,885,886,887,888,889,890,892,893,894,895,896,898,899,900,901,903,904, +905,907,908,910,913,914,915,916,917,919,920,921,922,924,925,926,928,929,931,934, +935,936,937,939,940,941,943,944,946,949,950,951,953,954,956,959,960,962,965, +}; + +// l*(l+1)*(l+2)*(l+3)//24 - l +static uint16_t Rt_idx_offsets[] = { +0,0,3,12,31,65,120,203,322,486,705,990,1353,1807,2366,3045,3860,4828,5967, +}; + +extern "C" { +int MD_build_j(double *vj, double *dm, int n_dm, int nao, + RysIntEnvVars envs, int *scheme, int *shls_slice, + int ntile_ij_pairs, int ntile_kl_pairs, + int *tile_ij_mapping, int *tile_kl_mapping, float *tile_q_cond, + float *q_cond, float *dm_cond, float cutoff, + uint32_t *batch_head, int workers, double omega, + int *atm, int natm, int *bas, int nbas, double *env) +{ + uint16_t ish0 = shls_slice[0]; + uint16_t jsh0 = shls_slice[2]; + uint16_t ksh0 = shls_slice[4]; + uint16_t lsh0 = shls_slice[6]; + uint8_t li = bas[ANG_OF + ish0*BAS_SLOTS]; + uint8_t lj = bas[ANG_OF + jsh0*BAS_SLOTS]; + uint8_t lk = bas[ANG_OF + ksh0*BAS_SLOTS]; + uint8_t ll = bas[ANG_OF + lsh0*BAS_SLOTS]; + uint8_t order = li + lj + lk + ll; + BoundsInfo bounds = {li, lj, lk, ll, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, + ntile_ij_pairs, ntile_kl_pairs, tile_ij_mapping, tile_kl_mapping, + q_cond, dm_cond, cutoff}; + + JKMatrix jk = {vj, NULL, dm, (uint16_t)n_dm}; + + if (!md_j_unrolled(&envs, &jk, &bounds, scheme, workers, omega)) { + int lij = li + lj; + int lkl = lk + ll; + int threads_ij = scheme[0]; + int threads_kl = scheme[1]; + int bsizex = threads_ij * TILEX; + int bsizey = 
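+ // Generic fallback launch: the dynamic shared-memory budget computed below
+ // covers the Boys table ((order+1) doubles per thread square), the Rp/Rq
+ // pair caches, the per-pair vj accumulators, and the double-buffered
+ // Hermite R table used by md_j_kernel.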
threads_kl * TILEY;
+ int nsq_per_block = threads_ij * threads_kl;
+ int gout_stride = scheme[2];
+ dim3 threads(threads_ij, threads_kl, gout_stride);
+ int nf3ij = (lij+1)*(lij+2)*(lij+3)/6;
+ int nf3kl = (lkl+1)*(lkl+2)*(lkl+3)/6;
+ int buflen = (order+1) * nsq_per_block
+ + bsizex * (4+nf3ij) + bsizey * (4+nf3kl)
+ + (order+1)*(order+2)*(order+3)/6 * nsq_per_block;
+ buflen += MAX(order*(order+1)*(order+2)/6, gout_stride) * nsq_per_block;
+ int blocks_ij = (ntile_ij_pairs + bsizex - 1) / bsizex;
+ int blocks_kl = (ntile_kl_pairs + bsizey - 1) / bsizey;
+ dim3 blocks(blocks_ij, blocks_kl);
+ md_j_kernel<<<blocks, threads, buflen*sizeof(double)>>>(envs, jk, bounds);
+ }
+ cudaError_t err = cudaGetLastError();
+ if (err != cudaSuccess) {
+ fprintf(stderr, "CUDA Error in MD_build_j: %s\n", cudaGetErrorString(err));
+ return 1;
+ }
+ return 0;
+}
+
+void init_mdj_constant(int shm_size)
+{
+ Fold2Index i_in_fold2idx[165];
+ Fold3Index i_in_fold3idx[495];
+ int n2 = 0;
+ int n3 = 0;
+ for (int l = 0; l <= LMAX*2; ++l) {
+ for (int i = 0, ijk = 0; i <= l; ++i) {
+ for (int j = 0; j <= l-i; ++j, ++n2) {
+ i_in_fold2idx[n2].x = i;
+ i_in_fold2idx[n2].y = j;
+ i_in_fold2idx[n2].fold3offset = ijk;
+ for (int k = 0; k <= l-i-j; ++k, ++n3, ++ijk) {
+ i_in_fold3idx[n3].x = i;
+ i_in_fold3idx[n3].y = j;
+ i_in_fold3idx[n3].z = k;
+ i_in_fold3idx[n3].fold2yz = (l+1)*(l+2)/2 - (l-j+1)*(l-j+2)/2 + k;
+ }
+ } }
+ }
+ cudaMemcpyToSymbol(c_Rt_idx, Rt_idx, sizeof(Rt_idx)); // reuse this buffer to store Rt1_idx
+ cudaMemcpyToSymbol(c_Rt_offsets, Rt_idx_offsets, sizeof(Rt_idx_offsets));
+ cudaMemcpyToSymbol(c_i_in_fold2idx, i_in_fold2idx, 165*sizeof(Fold2Index));
+ cudaMemcpyToSymbol(c_i_in_fold3idx, i_in_fold3idx, 495*sizeof(Fold3Index));
+ cudaFuncSetAttribute(md_j_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size);
+ set_md_j_unrolled_shm_size();
+}
+}
diff --git a/gpu4pyscf/lib/gvhf-md/md_pairdata.c b/gpu4pyscf/lib/gvhf-md/md_pairdata.c
new file mode 100644
index 00000000..8b8b7017
--- /dev/null
+++ b/gpu4pyscf/lib/gvhf-md/md_pairdata.c
@@ -0,0 +1,203 @@
+#include <stdlib.h>
+#include <math.h>
+#include "gvhf-rys/vhf.cuh"
+
+#define Ex_at(i,j,t) Ex[(i)*stride1+(j)*stride2+t]
+#define Ey_at(i,j,t) Ey[(i)*stride1+(j)*stride2+t]
+#define Ez_at(i,j,t) Ez[(i)*stride1+(j)*stride2+t]
+
+void get_E_cart_components(double *Ecart, int li, int lj, double ai, double aj,
+ double *Ra, double *Rb)
+{
+ double aij = ai + aj;
+ double xixj = Ra[0] - Rb[0];
+ double yiyj = Ra[1] - Rb[1];
+ double zizj = Ra[2] - Rb[2];
+ double theta_ij = ai * aj / aij;
+ double Kab = exp(-theta_ij * (xixj*xixj + yiyj*yiyj + zizj*zizj));
+ double Xp = (ai * Ra[0] + aj * Rb[0]) / aij;
+ double Yp = (ai * Ra[1] + aj * Rb[1]) / aij;
+ double Zp = (ai * Ra[2] + aj * Rb[2]) / aij;
+ double Xpa = Xp - Ra[0];
+ double Ypa = Yp - Ra[1];
+ double Zpa = Zp - Ra[2];
+ double Xpb = Xp - Rb[0];
+ double Ypb = Yp - Rb[1];
+ double Zpb = Zp - Rb[2];
+ int lij = li + lj;
+ int stride2 = lij+1;
+ int stride1 = (lj+1) * stride2;
+ int Ex_size = (li+1) * stride1;
+ double *Ex = Ecart;
+ double *Ey = Ex + Ex_size;
+ double *Ez = Ey + Ex_size;
+ int i, j, t;
+ double fac, fac1;
+
+ Ex_at(0,0,0) = 1.;
+ Ey_at(0,0,0) = 1.;
+ Ez_at(0,0,0) = Kab;
+ for (t = 1; t <= lij; t++) {
+ Ex_at(0,0,t) = 0.;
+ Ey_at(0,0,t) = 0.;
+ Ez_at(0,0,t) = 0.;
+ }
+
+ for (j = 1; j <= lj; j++) {
+ Ex_at(0,j,0) = Xpb * Ex_at(0,j-1,0) + Ex_at(0,j-1,1);
+ Ey_at(0,j,0) = Ypb * Ey_at(0,j-1,0) + Ey_at(0,j-1,1);
+ Ez_at(0,j,0) = Zpb * Ez_at(0,j-1,0) + Ez_at(0,j-1,1);
+ for (t = 1; t <= lij; t++) {
+ fac = j/(2*aij*t);
+ Ex_at(0,j,t) = fac *
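+ // Hermite expansion coefficients: for t > 0,
+ // E_t^{i,j} = (i*E_{t-1}^{i-1,j} + j*E_{t-1}^{i,j-1}) / (2*aij*t),
+ // and the t = 0 term follows E_0^{i,j} = Xpb*E_0^{i,j-1} + E_1^{i,j-1}.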
Ex_at(0,j-1,t-1); + Ey_at(0,j,t) = fac * Ey_at(0,j-1,t-1); + Ez_at(0,j,t) = fac * Ez_at(0,j-1,t-1); + } + } + + for (i = 1; i <= li; i++) { + Ex_at(i,0,0) = Xpa * Ex_at(i-1,0,0) + Ex_at(i-1,0,1); + Ey_at(i,0,0) = Ypa * Ey_at(i-1,0,0) + Ey_at(i-1,0,1); + Ez_at(i,0,0) = Zpa * Ez_at(i-1,0,0) + Ez_at(i-1,0,1); + for (t = 1; t <= lij; t++) { + fac = i/(2*aij*t); + Ex_at(i,0,t) = fac * Ex_at(i-1,0,t-1); + Ey_at(i,0,t) = fac * Ey_at(i-1,0,t-1); + Ez_at(i,0,t) = fac * Ez_at(i-1,0,t-1); + } + } + + for (i = 1; i <= li; i++) { + for (j = 1; j <= lj; j++) { + Ex_at(i,j,0) = Xpb * Ex_at(i,j-1,0) + Ex_at(i,j-1,1); + Ey_at(i,j,0) = Ypb * Ey_at(i,j-1,0) + Ey_at(i,j-1,1); + Ez_at(i,j,0) = Zpb * Ez_at(i,j-1,0) + Ez_at(i,j-1,1); + for (t = 1; t <= lij; t++) { + fac = i/(2*aij*t); + fac1 = j/(2*aij*t); + Ex_at(i,j,t) = fac*Ex_at(i-1,j,t-1) + fac1*Ex_at(i,j-1,t-1); + Ey_at(i,j,t) = fac*Ey_at(i-1,j,t-1) + fac1*Ey_at(i,j-1,t-1); + Ez_at(i,j,t) = fac*Ez_at(i-1,j,t-1) + fac1*Ez_at(i,j-1,t-1); + } + } + } +} + +// Shape of E tensor is [:li+lj,:li,:lj] +void get_E_tensor(double *Et, int li, int lj, double ai, double aj, + double *Ra, double *Rb, double *buf) +{ + get_E_cart_components(buf, li, lj, ai, aj, Ra, Rb); + int lij = li + lj; + int stride2 = lij+1; + int stride1 = (lj+1) * stride2; + int Ex_size = (li+1) * stride1; + double *Ex = buf; + double *Ey = Ex + Ex_size; + double *Ez = Ey + Ex_size; + int t, u, v, n; + int ix, iy, iz; + int jx, jy, jz; + + n = 0; + // products subject to t+u+v <= li+lj + for (t = 0; t <= lij; t++) { + for (u = 0; u <= lij-t; u++) { + for (v = 0; v <= lij-t-u; v++) { + for (ix = li; ix >= 0; ix--) { + for (iy = li-ix; iy >= 0; iy--) { + iz = li - ix - iy; + for (jx = lj; jx >= 0; jx--) { + for (jy = lj-jx; jy >= 0; jy--) { + jz = lj - jx - jy; + Et[n] = Ex_at(ix,jx,t) * Ey_at(iy,jy,u) * Ez_at(iz,jz,v); + n++; + } } + } } + } } } +} + +void Et_dot_dm(double *Et_dm, double *dm, int *ao_loc, int *pair_loc, + int *bas, int nbas, double *env) +{ + int l2 = 2*LMAX; + int Et_size = (l2+1)*(l2+2)*(l2+3)/6*NCART_MAX*NCART_MAX; + int Ex_size = (2*LMAX+1)*(LMAX+1)*(LMAX+1); + double *Et = malloc(sizeof(double) * (Et_size+3*Ex_size)); + double *buf = Et + Et_size; + + size_t nao = ao_loc[nbas]; + for (int ish = 0; ish < nbas; ish++) { + int li = bas[ish*BAS_SLOTS+ANG_OF]; + int i0 = ao_loc[ish]; + double ai = env[bas[ish*BAS_SLOTS+PTR_EXP]]; + double ci = env[bas[ish*BAS_SLOTS+PTR_COEFF]]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + for (int jsh = 0; jsh <= ish; jsh++) { + int lj = bas[jsh*BAS_SLOTS+ANG_OF]; + int j0 = ao_loc[jsh]; + double aj = env[bas[jsh*BAS_SLOTS+PTR_EXP]]; + double cj = env[bas[jsh*BAS_SLOTS+PTR_COEFF]]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double *rho = Et_dm + pair_loc[ish*nbas+jsh]; + int lij = li + lj; + int nfi = (li + 1) * (li + 2) / 2; + int nfj = (lj + 1) * (lj + 2) / 2; + int Et_len = (lij + 1) * (lij + 2) * (lij + 3) / 6; + get_E_tensor(Et, li, lj, ai, aj, ri, rj, buf); + double cc = ci * cj; + double *pdm = dm + j0*nao + i0; + for (int n = 0, t = 0; t < Et_len; t++) { + double rho_t = 0.; + for (int i = 0; i < nfi; i++) { + for (int j = 0; j < nfj; j++, n++) { + rho_t += Et[n] * cc * pdm[j*nao+i]; + } } + rho[t] = rho_t; + } + } + } + free(Et); +} + +void jengine_dot_Et(double *vj, double *jvec, int *ao_loc, int *pair_loc, + int *bas, int nbas, double *env) +{ + int l2 = 2*LMAX; + int Et_size = (l2+1)*(l2+2)*(l2+3)/6*NCART_MAX*NCART_MAX; + int Ex_size = (2*LMAX+1)*(LMAX+1)*(LMAX+1); + double *Et = malloc(sizeof(double) * 
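+ // jengine_dot_Et is the transpose of Et_dot_dm: it scatters the Hermite-space
+ // J vector back onto AO shell pairs, vj[i,j] += sum_t E_t^{ij} * jvec_ij[t],
+ // using the same scratch layout (Et tensor followed by the three Cartesian
+ // E tables).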
(Et_size+3*Ex_size)); + double *buf = Et + Et_size; + + size_t nao = ao_loc[nbas]; + for (int ish = 0; ish < nbas; ish++) { + int li = bas[ish*BAS_SLOTS+ANG_OF]; + int i0 = ao_loc[ish]; + double ai = env[bas[ish*BAS_SLOTS+PTR_EXP]]; + double ci = env[bas[ish*BAS_SLOTS+PTR_COEFF]]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + for (int jsh = 0; jsh <= ish; jsh++) { + int lj = bas[jsh*BAS_SLOTS+ANG_OF]; + int j0 = ao_loc[jsh]; + double aj = env[bas[jsh*BAS_SLOTS+PTR_EXP]]; + double cj = env[bas[jsh*BAS_SLOTS+PTR_COEFF]]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double *jvec_ij = jvec + pair_loc[ish*nbas+jsh]; + int lij = li + lj; + int nfi = (li + 1) * (li + 2) / 2; + int nfj = (lj + 1) * (lj + 2) / 2; + int Et_len = (lij + 1) * (lij + 2) * (lij + 3) / 6; + get_E_tensor(Et, li, lj, ai, aj, ri, rj, buf); + double cc = ci * cj; + double *pj = vj + i0*nao+j0; + for (int n = 0, t = 0; t < Et_len; t++) { + double fac = cc * jvec_ij[t]; + for (int i = 0; i < nfi; i++) { + for (int j = 0; j < nfj; j++, n++) { + pj[i*nao+j] += Et[n] * fac; + } } + } + } + } + free(Et); +} diff --git a/gpu4pyscf/lib/gvhf-md/unrolled_md_j.cu b/gpu4pyscf/lib/gvhf-md/unrolled_md_j.cu new file mode 100644 index 00000000..61a679f3 --- /dev/null +++ b/gpu4pyscf/lib/gvhf-md/unrolled_md_j.cu @@ -0,0 +1,5077 @@ +#include "gvhf-rys/vhf.cuh" +#include "gvhf-rys/gamma_inc_unrolled.cu" + + +// TILEX=16, TILEY=16, cache_dm=True +__global__ +void md_j_0_0(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds) +{ + int *pair_ij_mapping = bounds.tile_ij_mapping; + int *pair_kl_mapping = bounds.tile_kl_mapping; + int task_ij0 = blockIdx.x * 256; + int task_kl0 = blockIdx.y * 256; + int pair_ij0 = pair_ij_mapping[task_ij0]; + int pair_kl0 = pair_kl_mapping[task_kl0]; + float *q_cond = bounds.q_cond; + if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) { + return; + } + + int tx = threadIdx.x; + int ty = threadIdx.y; + int sq_id = tx + 16 * ty; + int *bas = envs.bas; + int *dm_pair_loc = envs.ao_loc; + int nbas = envs.nbas; + double *env = envs.env; + double *dm = jk.dm; + double *vj = jk.vj; + double vj_ij, vj_kl; + + int npairs_ij = bounds.npairs_ij; + int npairs_kl = bounds.npairs_kl; + extern __shared__ double gamma_inc[]; + double *Rp_cache = gamma_inc + 256; + double *Rq_cache = Rp_cache + 1024; + double *vj_ij_cache = Rq_cache + 1024; + double *vj_kl_cache = vj_ij_cache + 256; + double *vj_cache = vj_kl_cache + 256; + double *dm_ij_cache = vj_cache + 256; + double *dm_kl_cache = dm_ij_cache + 256; + // zero out all cache; + for (int n = sq_id; n < 3328; n += 256) { + Rp_cache[n] = 0.; + } + __syncthreads(); + + if (sq_id < 256) { + int task_ij = blockIdx.x * 256 + sq_id; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int ish = pair_ij / nbas; + int jsh = pair_ij % nbas; + double ai = env[bas[ish*BAS_SLOTS+PTR_EXP]]; + double aj = env[bas[jsh*BAS_SLOTS+PTR_EXP]]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double aij = ai + aj; + double xij = (ai * ri[0] + aj * rj[0]) / aij; + double yij = (ai * ri[1] + aj * rj[1]) / aij; + double zij = (ai * ri[2] + aj * rj[2]) / aij; + Rp_cache[sq_id+0] = xij; + Rp_cache[sq_id+256] = yij; + Rp_cache[sq_id+512] = zij; + Rp_cache[sq_id+768] = aij; + } else { + Rp_cache[sq_id+768] = 1.; + } + } + if (sq_id < 256) { + int task_kl = blockIdx.y * 256 + sq_id; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int ksh = pair_kl / nbas; + int lsh = pair_kl % 
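+ // Shell pairs are encoded as pair = ish*nbas + jsh (ksh*nbas + lsh on the
+ // kl side), so a divmod by nbas recovers the two shell indices.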
nbas; + double ak = env[bas[ksh*BAS_SLOTS+PTR_EXP]]; + double al = env[bas[lsh*BAS_SLOTS+PTR_EXP]]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD]; + double akl = ak + al; + double xkl = (ak * rk[0] + al * rl[0]) / akl; + double ykl = (ak * rk[1] + al * rl[1]) / akl; + double zkl = (ak * rk[2] + al * rl[2]) / akl; + Rq_cache[sq_id+0] = xkl; + Rq_cache[sq_id+256] = ykl; + Rq_cache[sq_id+512] = zkl; + Rq_cache[sq_id+768] = akl; + } else { + Rq_cache[sq_id+768] = 1.; + } + } + for (int n = ty; n < 16; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_ij = blockIdx.x * 256 + tile * 16 + tx; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int dm_ij_pair0 = dm_pair_loc[pair_ij]; + int sq_ij = tx + tile * 16; + dm_ij_cache[sq_ij+i*256] = dm[dm_ij_pair0+i]; + } + } + for (int n = tx; n < 16; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_kl = blockIdx.y * 256 + tile * 16 + ty; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int dm_kl_pair0 = dm_pair_loc[pair_kl]; + int sq_kl = ty + tile * 16; + atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*256]); + dm_kl_cache[sq_kl+i*256] = dm[dm_kl_pair0+i]; + } + } + __syncthreads(); + + for (int batch_ij = 0; batch_ij < 16; ++batch_ij) { + for (int batch_kl = 0; batch_kl < 16; ++batch_kl) { + int task_ij0 = blockIdx.x * 256 + batch_ij * 16; + int task_kl0 = blockIdx.y * 256 + batch_kl * 16; + if (task_ij0 >= npairs_ij || task_kl0 >= npairs_kl) { + continue; + } + int pair_ij0 = pair_ij_mapping[task_ij0]; + int pair_kl0 = pair_kl_mapping[task_kl0]; + if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) { + continue; + } + + int sq_ij = tx + batch_ij * 16; + int sq_kl = ty + batch_kl * 16; + int task_ij = task_ij0 + tx; + int task_kl = task_kl0 + ty; + double fac_sym = PI_FAC; + if (task_ij >= npairs_ij) { + task_ij = task_ij0; + fac_sym = 0.; + } + if (task_kl >= npairs_kl) { + task_kl = task_kl0; + fac_sym = 0.; + } + int pair_ij = pair_ij_mapping[task_ij]; + int pair_kl = pair_kl_mapping[task_kl]; + + int ish = pair_ij / nbas; + int jsh = pair_ij % nbas; + int ksh = pair_kl / nbas; + int lsh = pair_kl % nbas; + if (ish == jsh) fac_sym *= .5; + if (ksh == lsh) fac_sym *= .5; + if (pair_ij_mapping == pair_kl_mapping) { + if (task_ij == task_kl) fac_sym *= .5; + if (task_ij < task_kl) fac_sym = 0.; + } + double xij = Rp_cache[sq_ij+0]; + double yij = Rp_cache[sq_ij+256]; + double zij = Rp_cache[sq_ij+512]; + double aij = Rp_cache[sq_ij+768]; + double xkl = Rq_cache[sq_kl+0]; + double ykl = Rq_cache[sq_kl+256]; + double zkl = Rq_cache[sq_kl+512]; + double akl = Rq_cache[sq_kl+768]; + double fac = fac_sym / (aij*akl*sqrt(aij+akl)); + double xpq = xij - xkl; + double ypq = yij - ykl; + double zpq = zij - zkl; + double rr = xpq*xpq + ypq*ypq + zpq*zpq; + double theta = aij * akl / (aij + akl); + double theta_rr = theta * rr; + eval_gamma_inc_fn(gamma_inc, theta_rr, 0); + double a2 = -2. 
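+ // Order-0 class: only the Boys value F_0 enters, so the rescaling loop
+ // below (i running from 1 to 0) is empty and each pair contracts a single
+ // gamma_inc element.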
* theta; + gamma_inc[sq_id] *= fac; + for (int i = 1; i <= 0; i++) { + fac *= a2; + gamma_inc[sq_id+i*256] *= fac; + } + vj_kl = 0.; + vj_kl += gamma_inc[sq_id+0*256] * dm_ij_cache[sq_ij+0]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+0] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += gamma_inc[sq_id+0*256] * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+0] += vj_cache[sq_id]; + } + __syncthreads(); + } } + for (int n = ty; n < 16; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_ij = blockIdx.x * 256 + tile * 16 + tx; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int dm_ij_pair0 = dm_pair_loc[pair_ij]; + int sq_ij = tx + tile * 16; + atomicAdd(vj+dm_ij_pair0+i, vj_ij_cache[sq_ij+i*256]); + } + } + for (int n = tx; n < 16; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_kl = blockIdx.y * 256 + tile * 16 + ty; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int dm_kl_pair0 = dm_pair_loc[pair_kl]; + int sq_kl = ty + tile * 16; + atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*256]); + } + } +} + +// TILEX=16, TILEY=16, cache_dm=True +__global__ +void md_j_1_0(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds) +{ + int *pair_ij_mapping = bounds.tile_ij_mapping; + int *pair_kl_mapping = bounds.tile_kl_mapping; + int task_ij0 = blockIdx.x * 256; + int task_kl0 = blockIdx.y * 256; + int pair_ij0 = pair_ij_mapping[task_ij0]; + int pair_kl0 = pair_kl_mapping[task_kl0]; + float *q_cond = bounds.q_cond; + if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) { + return; + } + + int tx = threadIdx.x; + int ty = threadIdx.y; + int sq_id = tx + 16 * ty; + int *bas = envs.bas; + int *dm_pair_loc = envs.ao_loc; + int nbas = envs.nbas; + double *env = envs.env; + double *dm = jk.dm; + double *vj = jk.vj; + double vj_ij, vj_kl; + + int npairs_ij = bounds.npairs_ij; + int npairs_kl = bounds.npairs_kl; + extern __shared__ double gamma_inc[]; + double *Rp_cache = gamma_inc + 512; + double *Rq_cache = Rp_cache + 1024; + double *vj_ij_cache = Rq_cache + 1024; + double *vj_kl_cache = vj_ij_cache + 1024; + double *vj_cache = vj_kl_cache + 256; + double *dm_ij_cache = vj_cache + 256; + double *dm_kl_cache = dm_ij_cache + 1024; + // zero out all cache; + for (int n = sq_id; n < 4864; n += 256) { + Rp_cache[n] = 0.; + } + __syncthreads(); + + if (sq_id < 256) { + int task_ij = blockIdx.x * 256 + sq_id; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int ish = pair_ij / nbas; + int jsh = pair_ij % nbas; + double ai = env[bas[ish*BAS_SLOTS+PTR_EXP]]; + double aj = env[bas[jsh*BAS_SLOTS+PTR_EXP]]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double aij = ai + aj; + double xij = (ai * ri[0] + aj * rj[0]) / aij; + double yij = (ai * ri[1] + aj * rj[1]) / aij; + double zij = (ai * ri[2] + aj * rj[2]) / aij; + Rp_cache[sq_id+0] = xij; + Rp_cache[sq_id+256] = yij; + Rp_cache[sq_id+512] = zij; + Rp_cache[sq_id+768] = aij; + } else { + Rp_cache[sq_id+768] = 1.; + } + } + if (sq_id 
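+ // md_j_1_0 covers lij = 1, lkl = 0: four Hermite components per bra pair
+ // (R_000, R_001, R_010, R_100) against a single ket component.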
< 256) { + int task_kl = blockIdx.y * 256 + sq_id; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int ksh = pair_kl / nbas; + int lsh = pair_kl % nbas; + double ak = env[bas[ksh*BAS_SLOTS+PTR_EXP]]; + double al = env[bas[lsh*BAS_SLOTS+PTR_EXP]]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD]; + double akl = ak + al; + double xkl = (ak * rk[0] + al * rl[0]) / akl; + double ykl = (ak * rk[1] + al * rl[1]) / akl; + double zkl = (ak * rk[2] + al * rl[2]) / akl; + Rq_cache[sq_id+0] = xkl; + Rq_cache[sq_id+256] = ykl; + Rq_cache[sq_id+512] = zkl; + Rq_cache[sq_id+768] = akl; + } else { + Rq_cache[sq_id+768] = 1.; + } + } + for (int n = ty; n < 64; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_ij = blockIdx.x * 256 + tile * 16 + tx; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int dm_ij_pair0 = dm_pair_loc[pair_ij]; + int sq_ij = tx + tile * 16; + dm_ij_cache[sq_ij+i*256] = dm[dm_ij_pair0+i]; + } + } + for (int n = tx; n < 16; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_kl = blockIdx.y * 256 + tile * 16 + ty; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int dm_kl_pair0 = dm_pair_loc[pair_kl]; + int sq_kl = ty + tile * 16; + atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*256]); + dm_kl_cache[sq_kl+i*256] = dm[dm_kl_pair0+i]; + } + } + __syncthreads(); + + for (int batch_ij = 0; batch_ij < 16; ++batch_ij) { + for (int batch_kl = 0; batch_kl < 16; ++batch_kl) { + int task_ij0 = blockIdx.x * 256 + batch_ij * 16; + int task_kl0 = blockIdx.y * 256 + batch_kl * 16; + if (task_ij0 >= npairs_ij || task_kl0 >= npairs_kl) { + continue; + } + int pair_ij0 = pair_ij_mapping[task_ij0]; + int pair_kl0 = pair_kl_mapping[task_kl0]; + if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) { + continue; + } + + int sq_ij = tx + batch_ij * 16; + int sq_kl = ty + batch_kl * 16; + int task_ij = task_ij0 + tx; + int task_kl = task_kl0 + ty; + double fac_sym = PI_FAC; + if (task_ij >= npairs_ij) { + task_ij = task_ij0; + fac_sym = 0.; + } + if (task_kl >= npairs_kl) { + task_kl = task_kl0; + fac_sym = 0.; + } + int pair_ij = pair_ij_mapping[task_ij]; + int pair_kl = pair_kl_mapping[task_kl]; + + int ish = pair_ij / nbas; + int jsh = pair_ij % nbas; + int ksh = pair_kl / nbas; + int lsh = pair_kl % nbas; + if (ish == jsh) fac_sym *= .5; + if (ksh == lsh) fac_sym *= .5; + if (pair_ij_mapping == pair_kl_mapping) { + if (task_ij == task_kl) fac_sym *= .5; + if (task_ij < task_kl) fac_sym = 0.; + } + double xij = Rp_cache[sq_ij+0]; + double yij = Rp_cache[sq_ij+256]; + double zij = Rp_cache[sq_ij+512]; + double aij = Rp_cache[sq_ij+768]; + double xkl = Rq_cache[sq_kl+0]; + double ykl = Rq_cache[sq_kl+256]; + double zkl = Rq_cache[sq_kl+512]; + double akl = Rq_cache[sq_kl+768]; + double fac = fac_sym / (aij*akl*sqrt(aij+akl)); + double xpq = xij - xkl; + double ypq = yij - ykl; + double zpq = zij - zkl; + double rr = xpq*xpq + ypq*ypq + zpq*zpq; + double theta = aij * akl / (aij + akl); + double theta_rr = theta * rr; + eval_gamma_inc_fn(gamma_inc, theta_rr, 1); + double a2 = -2. 
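+ // First-order Hermite integrals are formed inline below:
+ // R_0_0_0_1 = zpq * gamma_inc[1], R_0_0_1_0 = ypq * gamma_inc[1],
+ // R_0_1_0_0 = xpq * gamma_inc[1], with gamma_inc[1] already rescaled by
+ // fac * (-2*theta).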
* theta; + gamma_inc[sq_id] *= fac; + for (int i = 1; i <= 1; i++) { + fac *= a2; + gamma_inc[sq_id+i*256] *= fac; + } + vj_kl = 0.; + vj_kl += gamma_inc[sq_id+0*256] * dm_ij_cache[sq_ij+0]; + double R_0_0_0_1 = zpq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_0_1 * dm_ij_cache[sq_ij+256]; + double R_0_0_1_0 = ypq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_1_0 * dm_ij_cache[sq_ij+512]; + double R_0_1_0_0 = xpq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_1_0_0 * dm_ij_cache[sq_ij+768]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+0] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += gamma_inc[sq_id+0*256] * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+0] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_0_1 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+256] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_1_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+512] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_0_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+768] += vj_cache[sq_id]; + } + __syncthreads(); + } } + for (int n = ty; n < 64; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_ij = blockIdx.x * 256 + tile * 16 + tx; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int dm_ij_pair0 = dm_pair_loc[pair_ij]; + int sq_ij = tx + tile * 16; + atomicAdd(vj+dm_ij_pair0+i, vj_ij_cache[sq_ij+i*256]); + } + } + for (int n = tx; n < 16; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_kl = blockIdx.y * 256 + tile * 16 + ty; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int dm_kl_pair0 = dm_pair_loc[pair_kl]; + int sq_kl = ty + tile * 16; + atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*256]); + } + } +} + +// TILEX=16, TILEY=16, cache_dm=True +__global__ +void md_j_1_1(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds) +{ + int *pair_ij_mapping = bounds.tile_ij_mapping; + int *pair_kl_mapping = bounds.tile_kl_mapping; + int task_ij0 = blockIdx.x * 256; + int task_kl0 = blockIdx.y * 256; + int pair_ij0 = pair_ij_mapping[task_ij0]; + int pair_kl0 = pair_kl_mapping[task_kl0]; + float *q_cond = bounds.q_cond; + if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) { + return; + } + + int tx = threadIdx.x; + int ty = threadIdx.y; + int sq_id = tx + 16 * ty; + int *bas = envs.bas; + int 
*dm_pair_loc = envs.ao_loc; + int nbas = envs.nbas; + double *env = envs.env; + double *dm = jk.dm; + double *vj = jk.vj; + double vj_ij, vj_kl; + + int npairs_ij = bounds.npairs_ij; + int npairs_kl = bounds.npairs_kl; + extern __shared__ double gamma_inc[]; + double *Rp_cache = gamma_inc + 768; + double *Rq_cache = Rp_cache + 1024; + double *vj_ij_cache = Rq_cache + 1024; + double *vj_kl_cache = vj_ij_cache + 1024; + double *vj_cache = vj_kl_cache + 1024; + double *dm_ij_cache = vj_cache + 256; + double *dm_kl_cache = dm_ij_cache + 1024; + // zero out all cache; + for (int n = sq_id; n < 6400; n += 256) { + Rp_cache[n] = 0.; + } + __syncthreads(); + + if (sq_id < 256) { + int task_ij = blockIdx.x * 256 + sq_id; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int ish = pair_ij / nbas; + int jsh = pair_ij % nbas; + double ai = env[bas[ish*BAS_SLOTS+PTR_EXP]]; + double aj = env[bas[jsh*BAS_SLOTS+PTR_EXP]]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double aij = ai + aj; + double xij = (ai * ri[0] + aj * rj[0]) / aij; + double yij = (ai * ri[1] + aj * rj[1]) / aij; + double zij = (ai * ri[2] + aj * rj[2]) / aij; + Rp_cache[sq_id+0] = xij; + Rp_cache[sq_id+256] = yij; + Rp_cache[sq_id+512] = zij; + Rp_cache[sq_id+768] = aij; + } else { + Rp_cache[sq_id+768] = 1.; + } + } + if (sq_id < 256) { + int task_kl = blockIdx.y * 256 + sq_id; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int ksh = pair_kl / nbas; + int lsh = pair_kl % nbas; + double ak = env[bas[ksh*BAS_SLOTS+PTR_EXP]]; + double al = env[bas[lsh*BAS_SLOTS+PTR_EXP]]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD]; + double akl = ak + al; + double xkl = (ak * rk[0] + al * rl[0]) / akl; + double ykl = (ak * rk[1] + al * rl[1]) / akl; + double zkl = (ak * rk[2] + al * rl[2]) / akl; + Rq_cache[sq_id+0] = xkl; + Rq_cache[sq_id+256] = ykl; + Rq_cache[sq_id+512] = zkl; + Rq_cache[sq_id+768] = akl; + } else { + Rq_cache[sq_id+768] = 1.; + } + } + for (int n = ty; n < 64; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_ij = blockIdx.x * 256 + tile * 16 + tx; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int dm_ij_pair0 = dm_pair_loc[pair_ij]; + int sq_ij = tx + tile * 16; + dm_ij_cache[sq_ij+i*256] = dm[dm_ij_pair0+i]; + } + } + for (int n = tx; n < 64; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_kl = blockIdx.y * 256 + tile * 16 + ty; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int dm_kl_pair0 = dm_pair_loc[pair_kl]; + int sq_kl = ty + tile * 16; + atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*256]); + dm_kl_cache[sq_kl+i*256] = dm[dm_kl_pair0+i]; + } + } + __syncthreads(); + + for (int batch_ij = 0; batch_ij < 16; ++batch_ij) { + for (int batch_kl = 0; batch_kl < 16; ++batch_kl) { + int task_ij0 = blockIdx.x * 256 + batch_ij * 16; + int task_kl0 = blockIdx.y * 256 + batch_kl * 16; + if (task_ij0 >= npairs_ij || task_kl0 >= npairs_kl) { + continue; + } + int pair_ij0 = pair_ij_mapping[task_ij0]; + int pair_kl0 = pair_kl_mapping[task_kl0]; + if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) { + continue; + } + + int sq_ij = tx + batch_ij * 16; + int sq_kl = ty + batch_kl * 16; + int task_ij = task_ij0 + tx; + int task_kl = task_kl0 + ty; + double fac_sym = PI_FAC; + if (task_ij >= npairs_ij) { + task_ij = task_ij0; + fac_sym = 0.; + } + if (task_kl >= npairs_kl) { + task_kl 
= task_kl0; + fac_sym = 0.; + } + int pair_ij = pair_ij_mapping[task_ij]; + int pair_kl = pair_kl_mapping[task_kl]; + + int ish = pair_ij / nbas; + int jsh = pair_ij % nbas; + int ksh = pair_kl / nbas; + int lsh = pair_kl % nbas; + if (ish == jsh) fac_sym *= .5; + if (ksh == lsh) fac_sym *= .5; + if (pair_ij_mapping == pair_kl_mapping) { + if (task_ij == task_kl) fac_sym *= .5; + if (task_ij < task_kl) fac_sym = 0.; + } + double xij = Rp_cache[sq_ij+0]; + double yij = Rp_cache[sq_ij+256]; + double zij = Rp_cache[sq_ij+512]; + double aij = Rp_cache[sq_ij+768]; + double xkl = Rq_cache[sq_kl+0]; + double ykl = Rq_cache[sq_kl+256]; + double zkl = Rq_cache[sq_kl+512]; + double akl = Rq_cache[sq_kl+768]; + double fac = fac_sym / (aij*akl*sqrt(aij+akl)); + double xpq = xij - xkl; + double ypq = yij - ykl; + double zpq = zij - zkl; + double rr = xpq*xpq + ypq*ypq + zpq*zpq; + double theta = aij * akl / (aij + akl); + double theta_rr = theta * rr; + eval_gamma_inc_fn(gamma_inc, theta_rr, 2); + double a2 = -2. * theta; + gamma_inc[sq_id] *= fac; + for (int i = 1; i <= 2; i++) { + fac *= a2; + gamma_inc[sq_id+i*256] *= fac; + } + vj_kl = 0.; + vj_kl += gamma_inc[sq_id+0*256] * dm_ij_cache[sq_ij+0]; + double R_0_0_0_1 = zpq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_0_1 * dm_ij_cache[sq_ij+256]; + double R_0_0_1_0 = ypq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_1_0 * dm_ij_cache[sq_ij+512]; + double R_0_1_0_0 = xpq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_1_0_0 * dm_ij_cache[sq_ij+768]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+0] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl -= R_0_0_0_1 * dm_ij_cache[sq_ij+0]; + double R_1_0_0_1 = zpq * gamma_inc[sq_id+2*256]; + double R_0_0_0_2 = zpq * R_1_0_0_1 + 1 * gamma_inc[sq_id+1*256]; + vj_kl -= R_0_0_0_2 * dm_ij_cache[sq_ij+256]; + double R_0_0_1_1 = ypq * R_1_0_0_1; + vj_kl -= R_0_0_1_1 * dm_ij_cache[sq_ij+512]; + double R_0_1_0_1 = xpq * R_1_0_0_1; + vj_kl -= R_0_1_0_1 * dm_ij_cache[sq_ij+768]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+256] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl -= R_0_0_1_0 * dm_ij_cache[sq_ij+0]; + vj_kl -= R_0_0_1_1 * dm_ij_cache[sq_ij+256]; + double R_1_0_1_0 = ypq * gamma_inc[sq_id+2*256]; + double R_0_0_2_0 = ypq * R_1_0_1_0 + 1 * gamma_inc[sq_id+1*256]; + vj_kl -= R_0_0_2_0 * dm_ij_cache[sq_ij+512]; + double R_0_1_1_0 = xpq * R_1_0_1_0; + vj_kl -= R_0_1_1_0 * dm_ij_cache[sq_ij+768]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+512] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl -= R_0_1_0_0 * dm_ij_cache[sq_ij+0]; + vj_kl -= R_0_1_0_1 * dm_ij_cache[sq_ij+256]; + vj_kl -= R_0_1_1_0 * dm_ij_cache[sq_ij+512]; + double R_1_1_0_0 = xpq * gamma_inc[sq_id+2*256]; + double R_0_2_0_0 = xpq * R_1_1_0_0 + 1 * gamma_inc[sq_id+1*256]; + vj_kl -= R_0_2_0_0 * dm_ij_cache[sq_ij+768]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 
0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+768] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += gamma_inc[sq_id+0*256] * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_0_1 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_0_1_0 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_1_0_0 * dm_kl_cache[sq_kl+768]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+0] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_0_1 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_0_2 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_0_1_1 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_1_0_1 * dm_kl_cache[sq_kl+768]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+256] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_1_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_1_1 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_0_2_0 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_1_1_0 * dm_kl_cache[sq_kl+768]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+512] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_0_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_1_0_1 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_1_1_0 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_2_0_0 * dm_kl_cache[sq_kl+768]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+768] += vj_cache[sq_id]; + } + __syncthreads(); + } } + for (int n = ty; n < 64; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_ij = blockIdx.x * 256 + tile * 16 + tx; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int dm_ij_pair0 = dm_pair_loc[pair_ij]; + int sq_ij = tx + tile * 16; + atomicAdd(vj+dm_ij_pair0+i, vj_ij_cache[sq_ij+i*256]); + } + } + for (int n = tx; n < 64; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_kl = blockIdx.y * 256 + tile * 16 + ty; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int dm_kl_pair0 = dm_pair_loc[pair_kl]; + int sq_kl = ty + tile * 16; + atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*256]); + } + } +} + +// TILEX=16, TILEY=8, cache_dm=True +__global__ +void md_j_1_2(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds) +{ + int *pair_ij_mapping = bounds.tile_ij_mapping; + int *pair_kl_mapping = bounds.tile_kl_mapping; + int task_ij0 = blockIdx.x * 256; + int task_kl0 = blockIdx.y * 128; + int pair_ij0 = pair_ij_mapping[task_ij0]; + int pair_kl0 = pair_kl_mapping[task_kl0]; + float *q_cond = bounds.q_cond; + if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) { + return; + } + + int tx = threadIdx.x; + int ty = threadIdx.y; + int sq_id = tx + 16 * ty; + int *bas = envs.bas; + int 
*dm_pair_loc = envs.ao_loc; + int nbas = envs.nbas; + double *env = envs.env; + double *dm = jk.dm; + double *vj = jk.vj; + double vj_ij, vj_kl; + + int npairs_ij = bounds.npairs_ij; + int npairs_kl = bounds.npairs_kl; + extern __shared__ double gamma_inc[]; + double *Rp_cache = gamma_inc + 1024; + double *Rq_cache = Rp_cache + 1024; + double *vj_ij_cache = Rq_cache + 512; + double *vj_kl_cache = vj_ij_cache + 1024; + double *vj_cache = vj_kl_cache + 1280; + double *dm_ij_cache = vj_cache + 256; + double *dm_kl_cache = dm_ij_cache + 1024; + // zero out all cache; + for (int n = sq_id; n < 6400; n += 256) { + Rp_cache[n] = 0.; + } + __syncthreads(); + + if (sq_id < 256) { + int task_ij = blockIdx.x * 256 + sq_id; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int ish = pair_ij / nbas; + int jsh = pair_ij % nbas; + double ai = env[bas[ish*BAS_SLOTS+PTR_EXP]]; + double aj = env[bas[jsh*BAS_SLOTS+PTR_EXP]]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double aij = ai + aj; + double xij = (ai * ri[0] + aj * rj[0]) / aij; + double yij = (ai * ri[1] + aj * rj[1]) / aij; + double zij = (ai * ri[2] + aj * rj[2]) / aij; + Rp_cache[sq_id+0] = xij; + Rp_cache[sq_id+256] = yij; + Rp_cache[sq_id+512] = zij; + Rp_cache[sq_id+768] = aij; + } else { + Rp_cache[sq_id+768] = 1.; + } + } + if (sq_id < 128) { + int task_kl = blockIdx.y * 128 + sq_id; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int ksh = pair_kl / nbas; + int lsh = pair_kl % nbas; + double ak = env[bas[ksh*BAS_SLOTS+PTR_EXP]]; + double al = env[bas[lsh*BAS_SLOTS+PTR_EXP]]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD]; + double akl = ak + al; + double xkl = (ak * rk[0] + al * rl[0]) / akl; + double ykl = (ak * rk[1] + al * rl[1]) / akl; + double zkl = (ak * rk[2] + al * rl[2]) / akl; + Rq_cache[sq_id+0] = xkl; + Rq_cache[sq_id+128] = ykl; + Rq_cache[sq_id+256] = zkl; + Rq_cache[sq_id+384] = akl; + } else { + Rq_cache[sq_id+384] = 1.; + } + } + for (int n = ty; n < 64; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_ij = blockIdx.x * 256 + tile * 16 + tx; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int dm_ij_pair0 = dm_pair_loc[pair_ij]; + int sq_ij = tx + tile * 16; + dm_ij_cache[sq_ij+i*256] = dm[dm_ij_pair0+i]; + } + } + for (int n = tx; n < 80; n += 16) { + int i = n / 8; + int tile = n % 8; + int task_kl = blockIdx.y * 128 + tile * 16 + ty; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int dm_kl_pair0 = dm_pair_loc[pair_kl]; + int sq_kl = ty + tile * 16; + atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*128]); + dm_kl_cache[sq_kl+i*128] = dm[dm_kl_pair0+i]; + } + } + __syncthreads(); + + for (int batch_ij = 0; batch_ij < 16; ++batch_ij) { + for (int batch_kl = 0; batch_kl < 8; ++batch_kl) { + int task_ij0 = blockIdx.x * 256 + batch_ij * 16; + int task_kl0 = blockIdx.y * 128 + batch_kl * 16; + if (task_ij0 >= npairs_ij || task_kl0 >= npairs_kl) { + continue; + } + int pair_ij0 = pair_ij_mapping[task_ij0]; + int pair_kl0 = pair_kl_mapping[task_kl0]; + if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) { + continue; + } + + int sq_ij = tx + batch_ij * 16; + int sq_kl = ty + batch_kl * 16; + int task_ij = task_ij0 + tx; + int task_kl = task_kl0 + ty; + double fac_sym = PI_FAC; + if (task_ij >= npairs_ij) { + task_ij = task_ij0; + fac_sym = 0.; + } + if (task_kl >= npairs_kl) { + task_kl = 
task_kl0; + fac_sym = 0.; + } + int pair_ij = pair_ij_mapping[task_ij]; + int pair_kl = pair_kl_mapping[task_kl]; + + int ish = pair_ij / nbas; + int jsh = pair_ij % nbas; + int ksh = pair_kl / nbas; + int lsh = pair_kl % nbas; + if (ish == jsh) fac_sym *= .5; + if (ksh == lsh) fac_sym *= .5; + if (pair_ij_mapping == pair_kl_mapping) { + if (task_ij == task_kl) fac_sym *= .5; + if (task_ij < task_kl) fac_sym = 0.; + } + double xij = Rp_cache[sq_ij+0]; + double yij = Rp_cache[sq_ij+256]; + double zij = Rp_cache[sq_ij+512]; + double aij = Rp_cache[sq_ij+768]; + double xkl = Rq_cache[sq_kl+0]; + double ykl = Rq_cache[sq_kl+128]; + double zkl = Rq_cache[sq_kl+256]; + double akl = Rq_cache[sq_kl+384]; + double fac = fac_sym / (aij*akl*sqrt(aij+akl)); + double xpq = xij - xkl; + double ypq = yij - ykl; + double zpq = zij - zkl; + double rr = xpq*xpq + ypq*ypq + zpq*zpq; + double theta = aij * akl / (aij + akl); + double theta_rr = theta * rr; + eval_gamma_inc_fn(gamma_inc, theta_rr, 3); + double a2 = -2. * theta; + gamma_inc[sq_id] *= fac; + for (int i = 1; i <= 3; i++) { + fac *= a2; + gamma_inc[sq_id+i*256] *= fac; + } + vj_kl = 0.; + vj_kl += gamma_inc[sq_id+0*256] * dm_ij_cache[sq_ij+0]; + double R_0_0_0_1 = zpq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_0_1 * dm_ij_cache[sq_ij+256]; + double R_0_0_1_0 = ypq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_1_0 * dm_ij_cache[sq_ij+512]; + double R_0_1_0_0 = xpq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_1_0_0 * dm_ij_cache[sq_ij+768]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+0] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl -= R_0_0_0_1 * dm_ij_cache[sq_ij+0]; + double R_1_0_0_1 = zpq * gamma_inc[sq_id+2*256]; + double R_0_0_0_2 = zpq * R_1_0_0_1 + 1 * gamma_inc[sq_id+1*256]; + vj_kl -= R_0_0_0_2 * dm_ij_cache[sq_ij+256]; + double R_0_0_1_1 = ypq * R_1_0_0_1; + vj_kl -= R_0_0_1_1 * dm_ij_cache[sq_ij+512]; + double R_0_1_0_1 = xpq * R_1_0_0_1; + vj_kl -= R_0_1_0_1 * dm_ij_cache[sq_ij+768]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+128] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl += R_0_0_0_2 * dm_ij_cache[sq_ij+0]; + double R_2_0_0_1 = zpq * gamma_inc[sq_id+3*256]; + double R_1_0_0_2 = zpq * R_2_0_0_1 + 1 * gamma_inc[sq_id+2*256]; + double R_0_0_0_3 = zpq * R_1_0_0_2 + 2 * R_1_0_0_1; + vj_kl += R_0_0_0_3 * dm_ij_cache[sq_ij+256]; + double R_0_0_1_2 = ypq * R_1_0_0_2; + vj_kl += R_0_0_1_2 * dm_ij_cache[sq_ij+512]; + double R_0_1_0_2 = xpq * R_1_0_0_2; + vj_kl += R_0_1_0_2 * dm_ij_cache[sq_ij+768]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+256] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl -= R_0_0_1_0 * dm_ij_cache[sq_ij+0]; + vj_kl -= R_0_0_1_1 * dm_ij_cache[sq_ij+256]; + double R_1_0_1_0 = ypq * gamma_inc[sq_id+2*256]; + double R_0_0_2_0 = ypq * R_1_0_1_0 + 1 * gamma_inc[sq_id+1*256]; + vj_kl -= R_0_0_2_0 * dm_ij_cache[sq_ij+512]; + double R_0_1_1_0 = xpq * 
R_1_0_1_0; + vj_kl -= R_0_1_1_0 * dm_ij_cache[sq_ij+768]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+384] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl += R_0_0_1_1 * dm_ij_cache[sq_ij+0]; + vj_kl += R_0_0_1_2 * dm_ij_cache[sq_ij+256]; + double R_1_0_1_1 = ypq * R_2_0_0_1; + double R_0_0_2_1 = ypq * R_1_0_1_1 + 1 * R_1_0_0_1; + vj_kl += R_0_0_2_1 * dm_ij_cache[sq_ij+512]; + double R_0_1_1_1 = xpq * R_1_0_1_1; + vj_kl += R_0_1_1_1 * dm_ij_cache[sq_ij+768]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+512] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl += R_0_0_2_0 * dm_ij_cache[sq_ij+0]; + vj_kl += R_0_0_2_1 * dm_ij_cache[sq_ij+256]; + double R_2_0_1_0 = ypq * gamma_inc[sq_id+3*256]; + double R_1_0_2_0 = ypq * R_2_0_1_0 + 1 * gamma_inc[sq_id+2*256]; + double R_0_0_3_0 = ypq * R_1_0_2_0 + 2 * R_1_0_1_0; + vj_kl += R_0_0_3_0 * dm_ij_cache[sq_ij+512]; + double R_0_1_2_0 = xpq * R_1_0_2_0; + vj_kl += R_0_1_2_0 * dm_ij_cache[sq_ij+768]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+640] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl -= R_0_1_0_0 * dm_ij_cache[sq_ij+0]; + vj_kl -= R_0_1_0_1 * dm_ij_cache[sq_ij+256]; + vj_kl -= R_0_1_1_0 * dm_ij_cache[sq_ij+512]; + double R_1_1_0_0 = xpq * gamma_inc[sq_id+2*256]; + double R_0_2_0_0 = xpq * R_1_1_0_0 + 1 * gamma_inc[sq_id+1*256]; + vj_kl -= R_0_2_0_0 * dm_ij_cache[sq_ij+768]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+768] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl += R_0_1_0_1 * dm_ij_cache[sq_ij+0]; + vj_kl += R_0_1_0_2 * dm_ij_cache[sq_ij+256]; + vj_kl += R_0_1_1_1 * dm_ij_cache[sq_ij+512]; + double R_1_1_0_1 = xpq * R_2_0_0_1; + double R_0_2_0_1 = xpq * R_1_1_0_1 + 1 * R_1_0_0_1; + vj_kl += R_0_2_0_1 * dm_ij_cache[sq_ij+768]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+896] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl += R_0_1_1_0 * dm_ij_cache[sq_ij+0]; + vj_kl += R_0_1_1_1 * dm_ij_cache[sq_ij+256]; + vj_kl += R_0_1_2_0 * dm_ij_cache[sq_ij+512]; + double R_1_1_1_0 = xpq * R_2_0_1_0; + double R_0_2_1_0 = xpq * R_1_1_1_0 + 1 * R_1_0_1_0; + vj_kl += R_0_2_1_0 * dm_ij_cache[sq_ij+768]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+1024] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl += R_0_2_0_0 
* dm_ij_cache[sq_ij+0]; + vj_kl += R_0_2_0_1 * dm_ij_cache[sq_ij+256]; + vj_kl += R_0_2_1_0 * dm_ij_cache[sq_ij+512]; + double R_2_1_0_0 = xpq * gamma_inc[sq_id+3*256]; + double R_1_2_0_0 = xpq * R_2_1_0_0 + 1 * gamma_inc[sq_id+2*256]; + double R_0_3_0_0 = xpq * R_1_2_0_0 + 2 * R_1_1_0_0; + vj_kl += R_0_3_0_0 * dm_ij_cache[sq_ij+768]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+1152] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += gamma_inc[sq_id+0*256] * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_0_1 * dm_kl_cache[sq_kl+128]; + vj_ij += R_0_0_0_2 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_0_1_0 * dm_kl_cache[sq_kl+384]; + vj_ij += R_0_0_1_1 * dm_kl_cache[sq_kl+512]; + vj_ij += R_0_0_2_0 * dm_kl_cache[sq_kl+640]; + vj_ij -= R_0_1_0_0 * dm_kl_cache[sq_kl+768]; + vj_ij += R_0_1_0_1 * dm_kl_cache[sq_kl+896]; + vj_ij += R_0_1_1_0 * dm_kl_cache[sq_kl+1024]; + vj_ij += R_0_2_0_0 * dm_kl_cache[sq_kl+1152]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+0] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_0_1 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_0_2 * dm_kl_cache[sq_kl+128]; + vj_ij += R_0_0_0_3 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_0_1_1 * dm_kl_cache[sq_kl+384]; + vj_ij += R_0_0_1_2 * dm_kl_cache[sq_kl+512]; + vj_ij += R_0_0_2_1 * dm_kl_cache[sq_kl+640]; + vj_ij -= R_0_1_0_1 * dm_kl_cache[sq_kl+768]; + vj_ij += R_0_1_0_2 * dm_kl_cache[sq_kl+896]; + vj_ij += R_0_1_1_1 * dm_kl_cache[sq_kl+1024]; + vj_ij += R_0_2_0_1 * dm_kl_cache[sq_kl+1152]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+256] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_1_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_1_1 * dm_kl_cache[sq_kl+128]; + vj_ij += R_0_0_1_2 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_0_2_0 * dm_kl_cache[sq_kl+384]; + vj_ij += R_0_0_2_1 * dm_kl_cache[sq_kl+512]; + vj_ij += R_0_0_3_0 * dm_kl_cache[sq_kl+640]; + vj_ij -= R_0_1_1_0 * dm_kl_cache[sq_kl+768]; + vj_ij += R_0_1_1_1 * dm_kl_cache[sq_kl+896]; + vj_ij += R_0_1_2_0 * dm_kl_cache[sq_kl+1024]; + vj_ij += R_0_2_1_0 * dm_kl_cache[sq_kl+1152]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+512] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_0_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_1_0_1 * dm_kl_cache[sq_kl+128]; + vj_ij += R_0_1_0_2 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_1_1_0 * dm_kl_cache[sq_kl+384]; + vj_ij += R_0_1_1_1 * dm_kl_cache[sq_kl+512]; + vj_ij += R_0_1_2_0 * dm_kl_cache[sq_kl+640]; + vj_ij -= R_0_2_0_0 * dm_kl_cache[sq_kl+768]; + vj_ij += R_0_2_0_1 * dm_kl_cache[sq_kl+896]; + vj_ij += R_0_2_1_0 * dm_kl_cache[sq_kl+1024]; + vj_ij += R_0_3_0_0 * dm_kl_cache[sq_kl+1152]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + 
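+ // reduce the 16 partial sums along ty with a shared-memory binary tree; after the loop the ty==0 row holds the full vj contribution for this ij pair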
for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+768] += vj_cache[sq_id]; + } + __syncthreads(); + } } + for (int n = ty; n < 64; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_ij = blockIdx.x * 256 + tile * 16 + tx; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int dm_ij_pair0 = dm_pair_loc[pair_ij]; + int sq_ij = tx + tile * 16; + atomicAdd(vj+dm_ij_pair0+i, vj_ij_cache[sq_ij+i*256]); + } + } + for (int n = tx; n < 80; n += 16) { + int i = n / 8; + int tile = n % 8; + int task_kl = blockIdx.y * 128 + tile * 16 + ty; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int dm_kl_pair0 = dm_pair_loc[pair_kl]; + int sq_kl = ty + tile * 16; + atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*128]); + } + } +} + +// TILEX=16, TILEY=16, cache_dm=True +__global__ +void md_j_2_0(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds) +{ + int *pair_ij_mapping = bounds.tile_ij_mapping; + int *pair_kl_mapping = bounds.tile_kl_mapping; + int task_ij0 = blockIdx.x * 256; + int task_kl0 = blockIdx.y * 256; + int pair_ij0 = pair_ij_mapping[task_ij0]; + int pair_kl0 = pair_kl_mapping[task_kl0]; + float *q_cond = bounds.q_cond; + if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) { + return; + } + + int tx = threadIdx.x; + int ty = threadIdx.y; + int sq_id = tx + 16 * ty; + int *bas = envs.bas; + int *dm_pair_loc = envs.ao_loc; + int nbas = envs.nbas; + double *env = envs.env; + double *dm = jk.dm; + double *vj = jk.vj; + double vj_ij, vj_kl; + + int npairs_ij = bounds.npairs_ij; + int npairs_kl = bounds.npairs_kl; + extern __shared__ double gamma_inc[]; + double *Rp_cache = gamma_inc + 768; + double *Rq_cache = Rp_cache + 1024; + double *vj_ij_cache = Rq_cache + 1024; + double *vj_kl_cache = vj_ij_cache + 2560; + double *vj_cache = vj_kl_cache + 256; + double *dm_ij_cache = vj_cache + 256; + double *dm_kl_cache = dm_ij_cache + 2560; + // zero out all cache; + for (int n = sq_id; n < 7936; n += 256) { + Rp_cache[n] = 0.; + } + __syncthreads(); + + if (sq_id < 256) { + int task_ij = blockIdx.x * 256 + sq_id; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int ish = pair_ij / nbas; + int jsh = pair_ij % nbas; + double ai = env[bas[ish*BAS_SLOTS+PTR_EXP]]; + double aj = env[bas[jsh*BAS_SLOTS+PTR_EXP]]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double aij = ai + aj; + double xij = (ai * ri[0] + aj * rj[0]) / aij; + double yij = (ai * ri[1] + aj * rj[1]) / aij; + double zij = (ai * ri[2] + aj * rj[2]) / aij; + Rp_cache[sq_id+0] = xij; + Rp_cache[sq_id+256] = yij; + Rp_cache[sq_id+512] = zij; + Rp_cache[sq_id+768] = aij; + } else { + Rp_cache[sq_id+768] = 1.; + } + } + if (sq_id < 256) { + int task_kl = blockIdx.y * 256 + sq_id; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int ksh = pair_kl / nbas; + int lsh = pair_kl % nbas; + double ak = env[bas[ksh*BAS_SLOTS+PTR_EXP]]; + double al = env[bas[lsh*BAS_SLOTS+PTR_EXP]]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD]; + double akl = ak + al; + double xkl = (ak * rk[0] + al * rl[0]) / akl; + double ykl = (ak * rk[1] + al * rl[1]) / akl; + double zkl = (ak * rk[2] + al * rl[2]) / akl; + Rq_cache[sq_id+0] = xkl; + Rq_cache[sq_id+256] = ykl; + 
Rq_cache[sq_id+512] = zkl; + Rq_cache[sq_id+768] = akl; + } else { + Rq_cache[sq_id+768] = 1.; + } + } + for (int n = ty; n < 160; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_ij = blockIdx.x * 256 + tile * 16 + tx; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int dm_ij_pair0 = dm_pair_loc[pair_ij]; + int sq_ij = tx + tile * 16; + dm_ij_cache[sq_ij+i*256] = dm[dm_ij_pair0+i]; + } + } + for (int n = tx; n < 16; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_kl = blockIdx.y * 256 + tile * 16 + ty; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int dm_kl_pair0 = dm_pair_loc[pair_kl]; + int sq_kl = ty + tile * 16; + atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*256]); + dm_kl_cache[sq_kl+i*256] = dm[dm_kl_pair0+i]; + } + } + __syncthreads(); + + for (int batch_ij = 0; batch_ij < 16; ++batch_ij) { + for (int batch_kl = 0; batch_kl < 16; ++batch_kl) { + int task_ij0 = blockIdx.x * 256 + batch_ij * 16; + int task_kl0 = blockIdx.y * 256 + batch_kl * 16; + if (task_ij0 >= npairs_ij || task_kl0 >= npairs_kl) { + continue; + } + int pair_ij0 = pair_ij_mapping[task_ij0]; + int pair_kl0 = pair_kl_mapping[task_kl0]; + if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) { + continue; + } + + int sq_ij = tx + batch_ij * 16; + int sq_kl = ty + batch_kl * 16; + int task_ij = task_ij0 + tx; + int task_kl = task_kl0 + ty; + double fac_sym = PI_FAC; + if (task_ij >= npairs_ij) { + task_ij = task_ij0; + fac_sym = 0.; + } + if (task_kl >= npairs_kl) { + task_kl = task_kl0; + fac_sym = 0.; + } + int pair_ij = pair_ij_mapping[task_ij]; + int pair_kl = pair_kl_mapping[task_kl]; + + int ish = pair_ij / nbas; + int jsh = pair_ij % nbas; + int ksh = pair_kl / nbas; + int lsh = pair_kl % nbas; + if (ish == jsh) fac_sym *= .5; + if (ksh == lsh) fac_sym *= .5; + if (pair_ij_mapping == pair_kl_mapping) { + if (task_ij == task_kl) fac_sym *= .5; + if (task_ij < task_kl) fac_sym = 0.; + } + double xij = Rp_cache[sq_ij+0]; + double yij = Rp_cache[sq_ij+256]; + double zij = Rp_cache[sq_ij+512]; + double aij = Rp_cache[sq_ij+768]; + double xkl = Rq_cache[sq_kl+0]; + double ykl = Rq_cache[sq_kl+256]; + double zkl = Rq_cache[sq_kl+512]; + double akl = Rq_cache[sq_kl+768]; + double fac = fac_sym / (aij*akl*sqrt(aij+akl)); + double xpq = xij - xkl; + double ypq = yij - ykl; + double zpq = zij - zkl; + double rr = xpq*xpq + ypq*ypq + zpq*zpq; + double theta = aij * akl / (aij + akl); + double theta_rr = theta * rr; + eval_gamma_inc_fn(gamma_inc, theta_rr, 2); + double a2 = -2. 
* theta; + gamma_inc[sq_id] *= fac; + for (int i = 1; i <= 2; i++) { + fac *= a2; + gamma_inc[sq_id+i*256] *= fac; + } + vj_kl = 0.; + vj_kl += gamma_inc[sq_id+0*256] * dm_ij_cache[sq_ij+0]; + double R_0_0_0_1 = zpq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_0_1 * dm_ij_cache[sq_ij+256]; + double R_1_0_0_1 = zpq * gamma_inc[sq_id+2*256]; + double R_0_0_0_2 = zpq * R_1_0_0_1 + 1 * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_0_2 * dm_ij_cache[sq_ij+512]; + double R_0_0_1_0 = ypq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_1_0 * dm_ij_cache[sq_ij+768]; + double R_0_0_1_1 = ypq * R_1_0_0_1; + vj_kl += R_0_0_1_1 * dm_ij_cache[sq_ij+1024]; + double R_1_0_1_0 = ypq * gamma_inc[sq_id+2*256]; + double R_0_0_2_0 = ypq * R_1_0_1_0 + 1 * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_2_0 * dm_ij_cache[sq_ij+1280]; + double R_0_1_0_0 = xpq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_1_0_0 * dm_ij_cache[sq_ij+1536]; + double R_0_1_0_1 = xpq * R_1_0_0_1; + vj_kl += R_0_1_0_1 * dm_ij_cache[sq_ij+1792]; + double R_0_1_1_0 = xpq * R_1_0_1_0; + vj_kl += R_0_1_1_0 * dm_ij_cache[sq_ij+2048]; + double R_1_1_0_0 = xpq * gamma_inc[sq_id+2*256]; + double R_0_2_0_0 = xpq * R_1_1_0_0 + 1 * gamma_inc[sq_id+1*256]; + vj_kl += R_0_2_0_0 * dm_ij_cache[sq_ij+2304]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+0] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += gamma_inc[sq_id+0*256] * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+0] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_0_1 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+256] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_0_2 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+512] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_1_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+768] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_1_1 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1024] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_2_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + 
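+ // the active thread set halves on each pass (stride = 8, 4, 2, 1 along ty)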
if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1280] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_0_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1536] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_0_1 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1792] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_1_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+2048] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_2_0_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+2304] += vj_cache[sq_id]; + } + __syncthreads(); + } } + for (int n = ty; n < 160; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_ij = blockIdx.x * 256 + tile * 16 + tx; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int dm_ij_pair0 = dm_pair_loc[pair_ij]; + int sq_ij = tx + tile * 16; + atomicAdd(vj+dm_ij_pair0+i, vj_ij_cache[sq_ij+i*256]); + } + } + for (int n = tx; n < 16; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_kl = blockIdx.y * 256 + tile * 16 + ty; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int dm_kl_pair0 = dm_pair_loc[pair_kl]; + int sq_kl = ty + tile * 16; + atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*256]); + } + } +} + +// TILEX=16, TILEY=8, cache_dm=True +__global__ +void md_j_2_1(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds) +{ + int *pair_ij_mapping = bounds.tile_ij_mapping; + int *pair_kl_mapping = bounds.tile_kl_mapping; + int task_ij0 = blockIdx.x * 256; + int task_kl0 = blockIdx.y * 128; + int pair_ij0 = pair_ij_mapping[task_ij0]; + int pair_kl0 = pair_kl_mapping[task_kl0]; + float *q_cond = bounds.q_cond; + if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) { + return; + } + + int tx = threadIdx.x; + int ty = threadIdx.y; + int sq_id = tx + 16 * ty; + int *bas = envs.bas; + int *dm_pair_loc = envs.ao_loc; + int nbas = envs.nbas; + double *env = envs.env; + double *dm = jk.dm; + double *vj = jk.vj; + double vj_ij, vj_kl; + + int npairs_ij = bounds.npairs_ij; + int npairs_kl = bounds.npairs_kl; + extern __shared__ double gamma_inc[]; + double *Rp_cache = gamma_inc + 1024; + double *Rq_cache = Rp_cache + 1024; + double *vj_ij_cache = Rq_cache + 512; + double *vj_kl_cache = vj_ij_cache + 2560; + double *vj_cache = vj_kl_cache + 512; + double *dm_ij_cache = vj_cache + 256; + double *dm_kl_cache = dm_ij_cache + 2560; + // zero out all cache; + for (int n = sq_id; n < 7936; n += 
256) { + Rp_cache[n] = 0.; + } + __syncthreads(); + + if (sq_id < 256) { + int task_ij = blockIdx.x * 256 + sq_id; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int ish = pair_ij / nbas; + int jsh = pair_ij % nbas; + double ai = env[bas[ish*BAS_SLOTS+PTR_EXP]]; + double aj = env[bas[jsh*BAS_SLOTS+PTR_EXP]]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double aij = ai + aj; + double xij = (ai * ri[0] + aj * rj[0]) / aij; + double yij = (ai * ri[1] + aj * rj[1]) / aij; + double zij = (ai * ri[2] + aj * rj[2]) / aij; + Rp_cache[sq_id+0] = xij; + Rp_cache[sq_id+256] = yij; + Rp_cache[sq_id+512] = zij; + Rp_cache[sq_id+768] = aij; + } else { + Rp_cache[sq_id+768] = 1.; + } + } + if (sq_id < 128) { + int task_kl = blockIdx.y * 128 + sq_id; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int ksh = pair_kl / nbas; + int lsh = pair_kl % nbas; + double ak = env[bas[ksh*BAS_SLOTS+PTR_EXP]]; + double al = env[bas[lsh*BAS_SLOTS+PTR_EXP]]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD]; + double akl = ak + al; + double xkl = (ak * rk[0] + al * rl[0]) / akl; + double ykl = (ak * rk[1] + al * rl[1]) / akl; + double zkl = (ak * rk[2] + al * rl[2]) / akl; + Rq_cache[sq_id+0] = xkl; + Rq_cache[sq_id+128] = ykl; + Rq_cache[sq_id+256] = zkl; + Rq_cache[sq_id+384] = akl; + } else { + Rq_cache[sq_id+384] = 1.; + } + } + for (int n = ty; n < 160; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_ij = blockIdx.x * 256 + tile * 16 + tx; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int dm_ij_pair0 = dm_pair_loc[pair_ij]; + int sq_ij = tx + tile * 16; + dm_ij_cache[sq_ij+i*256] = dm[dm_ij_pair0+i]; + } + } + for (int n = tx; n < 32; n += 16) { + int i = n / 8; + int tile = n % 8; + int task_kl = blockIdx.y * 128 + tile * 16 + ty; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int dm_kl_pair0 = dm_pair_loc[pair_kl]; + int sq_kl = ty + tile * 16; + atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*128]); + dm_kl_cache[sq_kl+i*128] = dm[dm_kl_pair0+i]; + } + } + __syncthreads(); + + for (int batch_ij = 0; batch_ij < 16; ++batch_ij) { + for (int batch_kl = 0; batch_kl < 8; ++batch_kl) { + int task_ij0 = blockIdx.x * 256 + batch_ij * 16; + int task_kl0 = blockIdx.y * 128 + batch_kl * 16; + if (task_ij0 >= npairs_ij || task_kl0 >= npairs_kl) { + continue; + } + int pair_ij0 = pair_ij_mapping[task_ij0]; + int pair_kl0 = pair_kl_mapping[task_kl0]; + if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) { + continue; + } + + int sq_ij = tx + batch_ij * 16; + int sq_kl = ty + batch_kl * 16; + int task_ij = task_ij0 + tx; + int task_kl = task_kl0 + ty; + double fac_sym = PI_FAC; + if (task_ij >= npairs_ij) { + task_ij = task_ij0; + fac_sym = 0.; + } + if (task_kl >= npairs_kl) { + task_kl = task_kl0; + fac_sym = 0.; + } + int pair_ij = pair_ij_mapping[task_ij]; + int pair_kl = pair_kl_mapping[task_kl]; + + int ish = pair_ij / nbas; + int jsh = pair_ij % nbas; + int ksh = pair_kl / nbas; + int lsh = pair_kl % nbas; + if (ish == jsh) fac_sym *= .5; + if (ksh == lsh) fac_sym *= .5; + if (pair_ij_mapping == pair_kl_mapping) { + if (task_ij == task_kl) fac_sym *= .5; + if (task_ij < task_kl) fac_sym = 0.; + } + double xij = Rp_cache[sq_ij+0]; + double yij = Rp_cache[sq_ij+256]; + double zij = Rp_cache[sq_ij+512]; + double aij = Rp_cache[sq_ij+768]; + double xkl = Rq_cache[sq_kl+0]; + double 
ykl = Rq_cache[sq_kl+128]; + double zkl = Rq_cache[sq_kl+256]; + double akl = Rq_cache[sq_kl+384]; + double fac = fac_sym / (aij*akl*sqrt(aij+akl)); + double xpq = xij - xkl; + double ypq = yij - ykl; + double zpq = zij - zkl; + double rr = xpq*xpq + ypq*ypq + zpq*zpq; + double theta = aij * akl / (aij + akl); + double theta_rr = theta * rr; + eval_gamma_inc_fn(gamma_inc, theta_rr, 3); + double a2 = -2. * theta; + gamma_inc[sq_id] *= fac; + for (int i = 1; i <= 3; i++) { + fac *= a2; + gamma_inc[sq_id+i*256] *= fac; + } + vj_kl = 0.; + vj_kl += gamma_inc[sq_id+0*256] * dm_ij_cache[sq_ij+0]; + double R_0_0_0_1 = zpq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_0_1 * dm_ij_cache[sq_ij+256]; + double R_1_0_0_1 = zpq * gamma_inc[sq_id+2*256]; + double R_0_0_0_2 = zpq * R_1_0_0_1 + 1 * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_0_2 * dm_ij_cache[sq_ij+512]; + double R_0_0_1_0 = ypq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_1_0 * dm_ij_cache[sq_ij+768]; + double R_0_0_1_1 = ypq * R_1_0_0_1; + vj_kl += R_0_0_1_1 * dm_ij_cache[sq_ij+1024]; + double R_1_0_1_0 = ypq * gamma_inc[sq_id+2*256]; + double R_0_0_2_0 = ypq * R_1_0_1_0 + 1 * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_2_0 * dm_ij_cache[sq_ij+1280]; + double R_0_1_0_0 = xpq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_1_0_0 * dm_ij_cache[sq_ij+1536]; + double R_0_1_0_1 = xpq * R_1_0_0_1; + vj_kl += R_0_1_0_1 * dm_ij_cache[sq_ij+1792]; + double R_0_1_1_0 = xpq * R_1_0_1_0; + vj_kl += R_0_1_1_0 * dm_ij_cache[sq_ij+2048]; + double R_1_1_0_0 = xpq * gamma_inc[sq_id+2*256]; + double R_0_2_0_0 = xpq * R_1_1_0_0 + 1 * gamma_inc[sq_id+1*256]; + vj_kl += R_0_2_0_0 * dm_ij_cache[sq_ij+2304]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+0] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl -= R_0_0_0_1 * dm_ij_cache[sq_ij+0]; + vj_kl -= R_0_0_0_2 * dm_ij_cache[sq_ij+256]; + double R_2_0_0_1 = zpq * gamma_inc[sq_id+3*256]; + double R_1_0_0_2 = zpq * R_2_0_0_1 + 1 * gamma_inc[sq_id+2*256]; + double R_0_0_0_3 = zpq * R_1_0_0_2 + 2 * R_1_0_0_1; + vj_kl -= R_0_0_0_3 * dm_ij_cache[sq_ij+512]; + vj_kl -= R_0_0_1_1 * dm_ij_cache[sq_ij+768]; + double R_0_0_1_2 = ypq * R_1_0_0_2; + vj_kl -= R_0_0_1_2 * dm_ij_cache[sq_ij+1024]; + double R_1_0_1_1 = ypq * R_2_0_0_1; + double R_0_0_2_1 = ypq * R_1_0_1_1 + 1 * R_1_0_0_1; + vj_kl -= R_0_0_2_1 * dm_ij_cache[sq_ij+1280]; + vj_kl -= R_0_1_0_1 * dm_ij_cache[sq_ij+1536]; + double R_0_1_0_2 = xpq * R_1_0_0_2; + vj_kl -= R_0_1_0_2 * dm_ij_cache[sq_ij+1792]; + double R_0_1_1_1 = xpq * R_1_0_1_1; + vj_kl -= R_0_1_1_1 * dm_ij_cache[sq_ij+2048]; + double R_1_1_0_1 = xpq * R_2_0_0_1; + double R_0_2_0_1 = xpq * R_1_1_0_1 + 1 * R_1_0_0_1; + vj_kl -= R_0_2_0_1 * dm_ij_cache[sq_ij+2304]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+128] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl -= R_0_0_1_0 * dm_ij_cache[sq_ij+0]; + vj_kl -= R_0_0_1_1 * dm_ij_cache[sq_ij+256]; + vj_kl -= R_0_0_1_2 * dm_ij_cache[sq_ij+512]; + vj_kl -= R_0_0_2_0 * dm_ij_cache[sq_ij+768]; + vj_kl -= R_0_0_2_1 * dm_ij_cache[sq_ij+1024]; + double R_2_0_1_0 = ypq * gamma_inc[sq_id+3*256]; + double R_1_0_2_0 = ypq * R_2_0_1_0 + 1 * 
gamma_inc[sq_id+2*256]; + double R_0_0_3_0 = ypq * R_1_0_2_0 + 2 * R_1_0_1_0; + vj_kl -= R_0_0_3_0 * dm_ij_cache[sq_ij+1280]; + vj_kl -= R_0_1_1_0 * dm_ij_cache[sq_ij+1536]; + vj_kl -= R_0_1_1_1 * dm_ij_cache[sq_ij+1792]; + double R_0_1_2_0 = xpq * R_1_0_2_0; + vj_kl -= R_0_1_2_0 * dm_ij_cache[sq_ij+2048]; + double R_1_1_1_0 = xpq * R_2_0_1_0; + double R_0_2_1_0 = xpq * R_1_1_1_0 + 1 * R_1_0_1_0; + vj_kl -= R_0_2_1_0 * dm_ij_cache[sq_ij+2304]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+256] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl -= R_0_1_0_0 * dm_ij_cache[sq_ij+0]; + vj_kl -= R_0_1_0_1 * dm_ij_cache[sq_ij+256]; + vj_kl -= R_0_1_0_2 * dm_ij_cache[sq_ij+512]; + vj_kl -= R_0_1_1_0 * dm_ij_cache[sq_ij+768]; + vj_kl -= R_0_1_1_1 * dm_ij_cache[sq_ij+1024]; + vj_kl -= R_0_1_2_0 * dm_ij_cache[sq_ij+1280]; + vj_kl -= R_0_2_0_0 * dm_ij_cache[sq_ij+1536]; + vj_kl -= R_0_2_0_1 * dm_ij_cache[sq_ij+1792]; + vj_kl -= R_0_2_1_0 * dm_ij_cache[sq_ij+2048]; + double R_2_1_0_0 = xpq * gamma_inc[sq_id+3*256]; + double R_1_2_0_0 = xpq * R_2_1_0_0 + 1 * gamma_inc[sq_id+2*256]; + double R_0_3_0_0 = xpq * R_1_2_0_0 + 2 * R_1_1_0_0; + vj_kl -= R_0_3_0_0 * dm_ij_cache[sq_ij+2304]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+384] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += gamma_inc[sq_id+0*256] * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_0_1 * dm_kl_cache[sq_kl+128]; + vj_ij -= R_0_0_1_0 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_1_0_0 * dm_kl_cache[sq_kl+384]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+0] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_0_1 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_0_2 * dm_kl_cache[sq_kl+128]; + vj_ij -= R_0_0_1_1 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_1_0_1 * dm_kl_cache[sq_kl+384]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+256] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_0_2 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_0_3 * dm_kl_cache[sq_kl+128]; + vj_ij -= R_0_0_1_2 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_1_0_2 * dm_kl_cache[sq_kl+384]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+512] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_1_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_1_1 * dm_kl_cache[sq_kl+128]; + vj_ij -= R_0_0_2_0 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_1_1_0 * dm_kl_cache[sq_kl+384]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + 
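+ // barrier first so every partial sum written in the previous pass is visible before folding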
__syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+768] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_1_1 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_1_2 * dm_kl_cache[sq_kl+128]; + vj_ij -= R_0_0_2_1 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_1_1_1 * dm_kl_cache[sq_kl+384]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1024] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_2_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_2_1 * dm_kl_cache[sq_kl+128]; + vj_ij -= R_0_0_3_0 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_1_2_0 * dm_kl_cache[sq_kl+384]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1280] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_0_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_1_0_1 * dm_kl_cache[sq_kl+128]; + vj_ij -= R_0_1_1_0 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_2_0_0 * dm_kl_cache[sq_kl+384]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1536] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_0_1 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_1_0_2 * dm_kl_cache[sq_kl+128]; + vj_ij -= R_0_1_1_1 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_2_0_1 * dm_kl_cache[sq_kl+384]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1792] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_1_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_1_1_1 * dm_kl_cache[sq_kl+128]; + vj_ij -= R_0_1_2_0 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_2_1_0 * dm_kl_cache[sq_kl+384]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+2048] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_2_0_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_2_0_1 * dm_kl_cache[sq_kl+128]; + vj_ij -= R_0_2_1_0 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_3_0_0 * dm_kl_cache[sq_kl+384]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+2304] += vj_cache[sq_id]; + } + __syncthreads(); + } } + for (int n = ty; n < 160; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_ij = blockIdx.x * 256 + tile * 16 + tx; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int dm_ij_pair0 = dm_pair_loc[pair_ij]; + int sq_ij = tx + tile * 16; + 
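+ // atomicAdd is required here: blocks assigned different kl tiles accumulate into the same vj entries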
atomicAdd(vj+dm_ij_pair0+i, vj_ij_cache[sq_ij+i*256]); + } + } + for (int n = tx; n < 32; n += 16) { + int i = n / 8; + int tile = n % 8; + int task_kl = blockIdx.y * 128 + tile * 16 + ty; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int dm_kl_pair0 = dm_pair_loc[pair_kl]; + int sq_kl = ty + tile * 16; + atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*128]); + } + } +} + +// TILEX=16, TILEY=4, cache_dm=True +__global__ +void md_j_2_2(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds) +{ + int *pair_ij_mapping = bounds.tile_ij_mapping; + int *pair_kl_mapping = bounds.tile_kl_mapping; + int task_ij0 = blockIdx.x * 256; + int task_kl0 = blockIdx.y * 64; + int pair_ij0 = pair_ij_mapping[task_ij0]; + int pair_kl0 = pair_kl_mapping[task_kl0]; + float *q_cond = bounds.q_cond; + if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) { + return; + } + + int tx = threadIdx.x; + int ty = threadIdx.y; + int sq_id = tx + 16 * ty; + int *bas = envs.bas; + int *dm_pair_loc = envs.ao_loc; + int nbas = envs.nbas; + double *env = envs.env; + double *dm = jk.dm; + double *vj = jk.vj; + double vj_ij, vj_kl; + + int npairs_ij = bounds.npairs_ij; + int npairs_kl = bounds.npairs_kl; + extern __shared__ double gamma_inc[]; + double *Rp_cache = gamma_inc + 1280; + double *Rq_cache = Rp_cache + 1024; + double *vj_ij_cache = Rq_cache + 256; + double *vj_kl_cache = vj_ij_cache + 2560; + double *vj_cache = vj_kl_cache + 640; + double *dm_ij_cache = vj_cache + 256; + double *dm_kl_cache = dm_ij_cache + 2560; + // zero out all cache; + for (int n = sq_id; n < 7936; n += 256) { + Rp_cache[n] = 0.; + } + __syncthreads(); + + if (sq_id < 256) { + int task_ij = blockIdx.x * 256 + sq_id; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int ish = pair_ij / nbas; + int jsh = pair_ij % nbas; + double ai = env[bas[ish*BAS_SLOTS+PTR_EXP]]; + double aj = env[bas[jsh*BAS_SLOTS+PTR_EXP]]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double aij = ai + aj; + double xij = (ai * ri[0] + aj * rj[0]) / aij; + double yij = (ai * ri[1] + aj * rj[1]) / aij; + double zij = (ai * ri[2] + aj * rj[2]) / aij; + Rp_cache[sq_id+0] = xij; + Rp_cache[sq_id+256] = yij; + Rp_cache[sq_id+512] = zij; + Rp_cache[sq_id+768] = aij; + } else { + Rp_cache[sq_id+768] = 1.; + } + } + if (sq_id < 64) { + int task_kl = blockIdx.y * 64 + sq_id; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int ksh = pair_kl / nbas; + int lsh = pair_kl % nbas; + double ak = env[bas[ksh*BAS_SLOTS+PTR_EXP]]; + double al = env[bas[lsh*BAS_SLOTS+PTR_EXP]]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD]; + double akl = ak + al; + double xkl = (ak * rk[0] + al * rl[0]) / akl; + double ykl = (ak * rk[1] + al * rl[1]) / akl; + double zkl = (ak * rk[2] + al * rl[2]) / akl; + Rq_cache[sq_id+0] = xkl; + Rq_cache[sq_id+64] = ykl; + Rq_cache[sq_id+128] = zkl; + Rq_cache[sq_id+192] = akl; + } else { + Rq_cache[sq_id+192] = 1.; + } + } + for (int n = ty; n < 160; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_ij = blockIdx.x * 256 + tile * 16 + tx; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int dm_ij_pair0 = dm_pair_loc[pair_ij]; + int sq_ij = tx + tile * 16; + dm_ij_cache[sq_ij+i*256] = dm[dm_ij_pair0+i]; + } + } + for (int n = tx; n < 40; n += 16) { + int i = n / 4; + int tile = n % 4; + int task_kl = blockIdx.y * 64 + tile * 16 + ty; + if 
(task_kl < npairs_kl) {
+            int pair_kl = pair_kl_mapping[task_kl];
+            int dm_kl_pair0 = dm_pair_loc[pair_kl];
+            int sq_kl = ty + tile * 16;
+            // stage the kl-pair density into shared memory
+            dm_kl_cache[sq_kl+i*64] = dm[dm_kl_pair0+i];
+        }
+    }
+    __syncthreads();
+
+    for (int batch_ij = 0; batch_ij < 16; ++batch_ij) {
+    for (int batch_kl = 0; batch_kl < 4; ++batch_kl) {
+        int task_ij0 = blockIdx.x * 256 + batch_ij * 16;
+        int task_kl0 = blockIdx.y * 64 + batch_kl * 16;
+        if (task_ij0 >= npairs_ij || task_kl0 >= npairs_kl) {
+            continue;
+        }
+        int pair_ij0 = pair_ij_mapping[task_ij0];
+        int pair_kl0 = pair_kl_mapping[task_kl0];
+        if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) {
+            continue;
+        }
+
+        int sq_ij = tx + batch_ij * 16;
+        int sq_kl = ty + batch_kl * 16;
+        int task_ij = task_ij0 + tx;
+        int task_kl = task_kl0 + ty;
+        double fac_sym = PI_FAC;
+        if (task_ij >= npairs_ij) {
+            task_ij = task_ij0;
+            fac_sym = 0.;
+        }
+        if (task_kl >= npairs_kl) {
+            task_kl = task_kl0;
+            fac_sym = 0.;
+        }
+        int pair_ij = pair_ij_mapping[task_ij];
+        int pair_kl = pair_kl_mapping[task_kl];
+
+        int ish = pair_ij / nbas;
+        int jsh = pair_ij % nbas;
+        int ksh = pair_kl / nbas;
+        int lsh = pair_kl % nbas;
+        if (ish == jsh) fac_sym *= .5;
+        if (ksh == lsh) fac_sym *= .5;
+        if (pair_ij_mapping == pair_kl_mapping) {
+            if (task_ij == task_kl) fac_sym *= .5;
+            if (task_ij < task_kl) fac_sym = 0.;
+        }
+        double xij = Rp_cache[sq_ij+0];
+        double yij = Rp_cache[sq_ij+256];
+        double zij = Rp_cache[sq_ij+512];
+        double aij = Rp_cache[sq_ij+768];
+        double xkl = Rq_cache[sq_kl+0];
+        double ykl = Rq_cache[sq_kl+64];
+        double zkl = Rq_cache[sq_kl+128];
+        double akl = Rq_cache[sq_kl+192];
+        double fac = fac_sym / (aij*akl*sqrt(aij+akl));
+        double xpq = xij - xkl;
+        double ypq = yij - ykl;
+        double zpq = zij - zkl;
+        double rr = xpq*xpq + ypq*ypq + zpq*zpq;
+        double theta = aij * akl / (aij + akl);
+        double theta_rr = theta * rr;
+        eval_gamma_inc_fn(gamma_inc, theta_rr, 4);
+        double a2 = -2.
* theta; + gamma_inc[sq_id] *= fac; + for (int i = 1; i <= 4; i++) { + fac *= a2; + gamma_inc[sq_id+i*256] *= fac; + } + vj_kl = 0.; + vj_kl += gamma_inc[sq_id+0*256] * dm_ij_cache[sq_ij+0]; + double R_0_0_0_1 = zpq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_0_1 * dm_ij_cache[sq_ij+256]; + double R_1_0_0_1 = zpq * gamma_inc[sq_id+2*256]; + double R_0_0_0_2 = zpq * R_1_0_0_1 + 1 * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_0_2 * dm_ij_cache[sq_ij+512]; + double R_0_0_1_0 = ypq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_1_0 * dm_ij_cache[sq_ij+768]; + double R_0_0_1_1 = ypq * R_1_0_0_1; + vj_kl += R_0_0_1_1 * dm_ij_cache[sq_ij+1024]; + double R_1_0_1_0 = ypq * gamma_inc[sq_id+2*256]; + double R_0_0_2_0 = ypq * R_1_0_1_0 + 1 * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_2_0 * dm_ij_cache[sq_ij+1280]; + double R_0_1_0_0 = xpq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_1_0_0 * dm_ij_cache[sq_ij+1536]; + double R_0_1_0_1 = xpq * R_1_0_0_1; + vj_kl += R_0_1_0_1 * dm_ij_cache[sq_ij+1792]; + double R_0_1_1_0 = xpq * R_1_0_1_0; + vj_kl += R_0_1_1_0 * dm_ij_cache[sq_ij+2048]; + double R_1_1_0_0 = xpq * gamma_inc[sq_id+2*256]; + double R_0_2_0_0 = xpq * R_1_1_0_0 + 1 * gamma_inc[sq_id+1*256]; + vj_kl += R_0_2_0_0 * dm_ij_cache[sq_ij+2304]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+0] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl -= R_0_0_0_1 * dm_ij_cache[sq_ij+0]; + vj_kl -= R_0_0_0_2 * dm_ij_cache[sq_ij+256]; + double R_2_0_0_1 = zpq * gamma_inc[sq_id+3*256]; + double R_1_0_0_2 = zpq * R_2_0_0_1 + 1 * gamma_inc[sq_id+2*256]; + double R_0_0_0_3 = zpq * R_1_0_0_2 + 2 * R_1_0_0_1; + vj_kl -= R_0_0_0_3 * dm_ij_cache[sq_ij+512]; + vj_kl -= R_0_0_1_1 * dm_ij_cache[sq_ij+768]; + double R_0_0_1_2 = ypq * R_1_0_0_2; + vj_kl -= R_0_0_1_2 * dm_ij_cache[sq_ij+1024]; + double R_1_0_1_1 = ypq * R_2_0_0_1; + double R_0_0_2_1 = ypq * R_1_0_1_1 + 1 * R_1_0_0_1; + vj_kl -= R_0_0_2_1 * dm_ij_cache[sq_ij+1280]; + vj_kl -= R_0_1_0_1 * dm_ij_cache[sq_ij+1536]; + double R_0_1_0_2 = xpq * R_1_0_0_2; + vj_kl -= R_0_1_0_2 * dm_ij_cache[sq_ij+1792]; + double R_0_1_1_1 = xpq * R_1_0_1_1; + vj_kl -= R_0_1_1_1 * dm_ij_cache[sq_ij+2048]; + double R_1_1_0_1 = xpq * R_2_0_0_1; + double R_0_2_0_1 = xpq * R_1_1_0_1 + 1 * R_1_0_0_1; + vj_kl -= R_0_2_0_1 * dm_ij_cache[sq_ij+2304]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+64] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl += R_0_0_0_2 * dm_ij_cache[sq_ij+0]; + vj_kl += R_0_0_0_3 * dm_ij_cache[sq_ij+256]; + double R_3_0_0_1 = zpq * gamma_inc[sq_id+4*256]; + double R_2_0_0_2 = zpq * R_3_0_0_1 + 1 * gamma_inc[sq_id+3*256]; + double R_1_0_0_3 = zpq * R_2_0_0_2 + 2 * R_2_0_0_1; + double R_0_0_0_4 = zpq * R_1_0_0_3 + 3 * R_1_0_0_2; + vj_kl += R_0_0_0_4 * dm_ij_cache[sq_ij+512]; + vj_kl += R_0_0_1_2 * dm_ij_cache[sq_ij+768]; + double R_0_0_1_3 = ypq * R_1_0_0_3; + vj_kl += R_0_0_1_3 * dm_ij_cache[sq_ij+1024]; + double R_1_0_1_2 = ypq * R_2_0_0_2; + double R_0_0_2_2 = ypq * R_1_0_1_2 + 1 * R_1_0_0_2; + vj_kl += R_0_0_2_2 * dm_ij_cache[sq_ij+1280]; + vj_kl += R_0_1_0_2 * dm_ij_cache[sq_ij+1536]; + double R_0_1_0_3 = xpq * R_1_0_0_3; + vj_kl += 
R_0_1_0_3 * dm_ij_cache[sq_ij+1792]; + double R_0_1_1_2 = xpq * R_1_0_1_2; + vj_kl += R_0_1_1_2 * dm_ij_cache[sq_ij+2048]; + double R_1_1_0_2 = xpq * R_2_0_0_2; + double R_0_2_0_2 = xpq * R_1_1_0_2 + 1 * R_1_0_0_2; + vj_kl += R_0_2_0_2 * dm_ij_cache[sq_ij+2304]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+128] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl -= R_0_0_1_0 * dm_ij_cache[sq_ij+0]; + vj_kl -= R_0_0_1_1 * dm_ij_cache[sq_ij+256]; + vj_kl -= R_0_0_1_2 * dm_ij_cache[sq_ij+512]; + vj_kl -= R_0_0_2_0 * dm_ij_cache[sq_ij+768]; + vj_kl -= R_0_0_2_1 * dm_ij_cache[sq_ij+1024]; + double R_2_0_1_0 = ypq * gamma_inc[sq_id+3*256]; + double R_1_0_2_0 = ypq * R_2_0_1_0 + 1 * gamma_inc[sq_id+2*256]; + double R_0_0_3_0 = ypq * R_1_0_2_0 + 2 * R_1_0_1_0; + vj_kl -= R_0_0_3_0 * dm_ij_cache[sq_ij+1280]; + vj_kl -= R_0_1_1_0 * dm_ij_cache[sq_ij+1536]; + vj_kl -= R_0_1_1_1 * dm_ij_cache[sq_ij+1792]; + double R_0_1_2_0 = xpq * R_1_0_2_0; + vj_kl -= R_0_1_2_0 * dm_ij_cache[sq_ij+2048]; + double R_1_1_1_0 = xpq * R_2_0_1_0; + double R_0_2_1_0 = xpq * R_1_1_1_0 + 1 * R_1_0_1_0; + vj_kl -= R_0_2_1_0 * dm_ij_cache[sq_ij+2304]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+192] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl += R_0_0_1_1 * dm_ij_cache[sq_ij+0]; + vj_kl += R_0_0_1_2 * dm_ij_cache[sq_ij+256]; + vj_kl += R_0_0_1_3 * dm_ij_cache[sq_ij+512]; + vj_kl += R_0_0_2_1 * dm_ij_cache[sq_ij+768]; + vj_kl += R_0_0_2_2 * dm_ij_cache[sq_ij+1024]; + double R_2_0_1_1 = ypq * R_3_0_0_1; + double R_1_0_2_1 = ypq * R_2_0_1_1 + 1 * R_2_0_0_1; + double R_0_0_3_1 = ypq * R_1_0_2_1 + 2 * R_1_0_1_1; + vj_kl += R_0_0_3_1 * dm_ij_cache[sq_ij+1280]; + vj_kl += R_0_1_1_1 * dm_ij_cache[sq_ij+1536]; + vj_kl += R_0_1_1_2 * dm_ij_cache[sq_ij+1792]; + double R_0_1_2_1 = xpq * R_1_0_2_1; + vj_kl += R_0_1_2_1 * dm_ij_cache[sq_ij+2048]; + double R_1_1_1_1 = xpq * R_2_0_1_1; + double R_0_2_1_1 = xpq * R_1_1_1_1 + 1 * R_1_0_1_1; + vj_kl += R_0_2_1_1 * dm_ij_cache[sq_ij+2304]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+256] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl += R_0_0_2_0 * dm_ij_cache[sq_ij+0]; + vj_kl += R_0_0_2_1 * dm_ij_cache[sq_ij+256]; + vj_kl += R_0_0_2_2 * dm_ij_cache[sq_ij+512]; + vj_kl += R_0_0_3_0 * dm_ij_cache[sq_ij+768]; + vj_kl += R_0_0_3_1 * dm_ij_cache[sq_ij+1024]; + double R_3_0_1_0 = ypq * gamma_inc[sq_id+4*256]; + double R_2_0_2_0 = ypq * R_3_0_1_0 + 1 * gamma_inc[sq_id+3*256]; + double R_1_0_3_0 = ypq * R_2_0_2_0 + 2 * R_2_0_1_0; + double R_0_0_4_0 = ypq * R_1_0_3_0 + 3 * R_1_0_2_0; + vj_kl += R_0_0_4_0 * dm_ij_cache[sq_ij+1280]; + vj_kl += R_0_1_2_0 * dm_ij_cache[sq_ij+1536]; + vj_kl += R_0_1_2_1 * dm_ij_cache[sq_ij+1792]; + double R_0_1_3_0 = xpq * R_1_0_3_0; + vj_kl += R_0_1_3_0 * dm_ij_cache[sq_ij+2048]; + double R_1_1_2_0 = xpq * R_2_0_2_0; + double R_0_2_2_0 = xpq * R_1_1_2_0 + 1 * R_1_0_2_0; + vj_kl += R_0_2_2_0 * 
dm_ij_cache[sq_ij+2304]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+320] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl -= R_0_1_0_0 * dm_ij_cache[sq_ij+0]; + vj_kl -= R_0_1_0_1 * dm_ij_cache[sq_ij+256]; + vj_kl -= R_0_1_0_2 * dm_ij_cache[sq_ij+512]; + vj_kl -= R_0_1_1_0 * dm_ij_cache[sq_ij+768]; + vj_kl -= R_0_1_1_1 * dm_ij_cache[sq_ij+1024]; + vj_kl -= R_0_1_2_0 * dm_ij_cache[sq_ij+1280]; + vj_kl -= R_0_2_0_0 * dm_ij_cache[sq_ij+1536]; + vj_kl -= R_0_2_0_1 * dm_ij_cache[sq_ij+1792]; + vj_kl -= R_0_2_1_0 * dm_ij_cache[sq_ij+2048]; + double R_2_1_0_0 = xpq * gamma_inc[sq_id+3*256]; + double R_1_2_0_0 = xpq * R_2_1_0_0 + 1 * gamma_inc[sq_id+2*256]; + double R_0_3_0_0 = xpq * R_1_2_0_0 + 2 * R_1_1_0_0; + vj_kl -= R_0_3_0_0 * dm_ij_cache[sq_ij+2304]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+384] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl += R_0_1_0_1 * dm_ij_cache[sq_ij+0]; + vj_kl += R_0_1_0_2 * dm_ij_cache[sq_ij+256]; + vj_kl += R_0_1_0_3 * dm_ij_cache[sq_ij+512]; + vj_kl += R_0_1_1_1 * dm_ij_cache[sq_ij+768]; + vj_kl += R_0_1_1_2 * dm_ij_cache[sq_ij+1024]; + vj_kl += R_0_1_2_1 * dm_ij_cache[sq_ij+1280]; + vj_kl += R_0_2_0_1 * dm_ij_cache[sq_ij+1536]; + vj_kl += R_0_2_0_2 * dm_ij_cache[sq_ij+1792]; + vj_kl += R_0_2_1_1 * dm_ij_cache[sq_ij+2048]; + double R_2_1_0_1 = xpq * R_3_0_0_1; + double R_1_2_0_1 = xpq * R_2_1_0_1 + 1 * R_2_0_0_1; + double R_0_3_0_1 = xpq * R_1_2_0_1 + 2 * R_1_1_0_1; + vj_kl += R_0_3_0_1 * dm_ij_cache[sq_ij+2304]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+448] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl += R_0_1_1_0 * dm_ij_cache[sq_ij+0]; + vj_kl += R_0_1_1_1 * dm_ij_cache[sq_ij+256]; + vj_kl += R_0_1_1_2 * dm_ij_cache[sq_ij+512]; + vj_kl += R_0_1_2_0 * dm_ij_cache[sq_ij+768]; + vj_kl += R_0_1_2_1 * dm_ij_cache[sq_ij+1024]; + vj_kl += R_0_1_3_0 * dm_ij_cache[sq_ij+1280]; + vj_kl += R_0_2_1_0 * dm_ij_cache[sq_ij+1536]; + vj_kl += R_0_2_1_1 * dm_ij_cache[sq_ij+1792]; + vj_kl += R_0_2_2_0 * dm_ij_cache[sq_ij+2048]; + double R_2_1_1_0 = xpq * R_3_0_1_0; + double R_1_2_1_0 = xpq * R_2_1_1_0 + 1 * R_2_0_1_0; + double R_0_3_1_0 = xpq * R_1_2_1_0 + 2 * R_1_1_1_0; + vj_kl += R_0_3_1_0 * dm_ij_cache[sq_ij+2304]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+512] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl += R_0_2_0_0 * dm_ij_cache[sq_ij+0]; + vj_kl += R_0_2_0_1 * dm_ij_cache[sq_ij+256]; + vj_kl += R_0_2_0_2 * dm_ij_cache[sq_ij+512]; + vj_kl += R_0_2_1_0 * dm_ij_cache[sq_ij+768]; + vj_kl += R_0_2_1_1 * dm_ij_cache[sq_ij+1024]; + vj_kl += R_0_2_2_0 * dm_ij_cache[sq_ij+1280]; + vj_kl += R_0_3_0_0 * dm_ij_cache[sq_ij+1536]; + vj_kl += R_0_3_0_1 * 
dm_ij_cache[sq_ij+1792]; + vj_kl += R_0_3_1_0 * dm_ij_cache[sq_ij+2048]; + double R_3_1_0_0 = xpq * gamma_inc[sq_id+4*256]; + double R_2_2_0_0 = xpq * R_3_1_0_0 + 1 * gamma_inc[sq_id+3*256]; + double R_1_3_0_0 = xpq * R_2_2_0_0 + 2 * R_2_1_0_0; + double R_0_4_0_0 = xpq * R_1_3_0_0 + 3 * R_1_2_0_0; + vj_kl += R_0_4_0_0 * dm_ij_cache[sq_ij+2304]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+576] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += gamma_inc[sq_id+0*256] * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_0_1 * dm_kl_cache[sq_kl+64]; + vj_ij += R_0_0_0_2 * dm_kl_cache[sq_kl+128]; + vj_ij -= R_0_0_1_0 * dm_kl_cache[sq_kl+192]; + vj_ij += R_0_0_1_1 * dm_kl_cache[sq_kl+256]; + vj_ij += R_0_0_2_0 * dm_kl_cache[sq_kl+320]; + vj_ij -= R_0_1_0_0 * dm_kl_cache[sq_kl+384]; + vj_ij += R_0_1_0_1 * dm_kl_cache[sq_kl+448]; + vj_ij += R_0_1_1_0 * dm_kl_cache[sq_kl+512]; + vj_ij += R_0_2_0_0 * dm_kl_cache[sq_kl+576]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+0] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_0_1 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_0_2 * dm_kl_cache[sq_kl+64]; + vj_ij += R_0_0_0_3 * dm_kl_cache[sq_kl+128]; + vj_ij -= R_0_0_1_1 * dm_kl_cache[sq_kl+192]; + vj_ij += R_0_0_1_2 * dm_kl_cache[sq_kl+256]; + vj_ij += R_0_0_2_1 * dm_kl_cache[sq_kl+320]; + vj_ij -= R_0_1_0_1 * dm_kl_cache[sq_kl+384]; + vj_ij += R_0_1_0_2 * dm_kl_cache[sq_kl+448]; + vj_ij += R_0_1_1_1 * dm_kl_cache[sq_kl+512]; + vj_ij += R_0_2_0_1 * dm_kl_cache[sq_kl+576]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+256] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_0_2 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_0_3 * dm_kl_cache[sq_kl+64]; + vj_ij += R_0_0_0_4 * dm_kl_cache[sq_kl+128]; + vj_ij -= R_0_0_1_2 * dm_kl_cache[sq_kl+192]; + vj_ij += R_0_0_1_3 * dm_kl_cache[sq_kl+256]; + vj_ij += R_0_0_2_2 * dm_kl_cache[sq_kl+320]; + vj_ij -= R_0_1_0_2 * dm_kl_cache[sq_kl+384]; + vj_ij += R_0_1_0_3 * dm_kl_cache[sq_kl+448]; + vj_ij += R_0_1_1_2 * dm_kl_cache[sq_kl+512]; + vj_ij += R_0_2_0_2 * dm_kl_cache[sq_kl+576]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+512] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_1_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_1_1 * dm_kl_cache[sq_kl+64]; + vj_ij += R_0_0_1_2 * dm_kl_cache[sq_kl+128]; + vj_ij -= R_0_0_2_0 * dm_kl_cache[sq_kl+192]; + vj_ij += R_0_0_2_1 * dm_kl_cache[sq_kl+256]; + vj_ij += R_0_0_3_0 * dm_kl_cache[sq_kl+320]; + vj_ij -= R_0_1_1_0 * dm_kl_cache[sq_kl+384]; + vj_ij += R_0_1_1_1 * dm_kl_cache[sq_kl+448]; + vj_ij += R_0_1_2_0 * dm_kl_cache[sq_kl+512]; + vj_ij += R_0_2_1_0 * dm_kl_cache[sq_kl+576]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for 
(int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+768] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_1_1 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_1_2 * dm_kl_cache[sq_kl+64]; + vj_ij += R_0_0_1_3 * dm_kl_cache[sq_kl+128]; + vj_ij -= R_0_0_2_1 * dm_kl_cache[sq_kl+192]; + vj_ij += R_0_0_2_2 * dm_kl_cache[sq_kl+256]; + vj_ij += R_0_0_3_1 * dm_kl_cache[sq_kl+320]; + vj_ij -= R_0_1_1_1 * dm_kl_cache[sq_kl+384]; + vj_ij += R_0_1_1_2 * dm_kl_cache[sq_kl+448]; + vj_ij += R_0_1_2_1 * dm_kl_cache[sq_kl+512]; + vj_ij += R_0_2_1_1 * dm_kl_cache[sq_kl+576]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1024] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_2_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_2_1 * dm_kl_cache[sq_kl+64]; + vj_ij += R_0_0_2_2 * dm_kl_cache[sq_kl+128]; + vj_ij -= R_0_0_3_0 * dm_kl_cache[sq_kl+192]; + vj_ij += R_0_0_3_1 * dm_kl_cache[sq_kl+256]; + vj_ij += R_0_0_4_0 * dm_kl_cache[sq_kl+320]; + vj_ij -= R_0_1_2_0 * dm_kl_cache[sq_kl+384]; + vj_ij += R_0_1_2_1 * dm_kl_cache[sq_kl+448]; + vj_ij += R_0_1_3_0 * dm_kl_cache[sq_kl+512]; + vj_ij += R_0_2_2_0 * dm_kl_cache[sq_kl+576]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1280] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_0_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_1_0_1 * dm_kl_cache[sq_kl+64]; + vj_ij += R_0_1_0_2 * dm_kl_cache[sq_kl+128]; + vj_ij -= R_0_1_1_0 * dm_kl_cache[sq_kl+192]; + vj_ij += R_0_1_1_1 * dm_kl_cache[sq_kl+256]; + vj_ij += R_0_1_2_0 * dm_kl_cache[sq_kl+320]; + vj_ij -= R_0_2_0_0 * dm_kl_cache[sq_kl+384]; + vj_ij += R_0_2_0_1 * dm_kl_cache[sq_kl+448]; + vj_ij += R_0_2_1_0 * dm_kl_cache[sq_kl+512]; + vj_ij += R_0_3_0_0 * dm_kl_cache[sq_kl+576]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1536] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_0_1 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_1_0_2 * dm_kl_cache[sq_kl+64]; + vj_ij += R_0_1_0_3 * dm_kl_cache[sq_kl+128]; + vj_ij -= R_0_1_1_1 * dm_kl_cache[sq_kl+192]; + vj_ij += R_0_1_1_2 * dm_kl_cache[sq_kl+256]; + vj_ij += R_0_1_2_1 * dm_kl_cache[sq_kl+320]; + vj_ij -= R_0_2_0_1 * dm_kl_cache[sq_kl+384]; + vj_ij += R_0_2_0_2 * dm_kl_cache[sq_kl+448]; + vj_ij += R_0_2_1_1 * dm_kl_cache[sq_kl+512]; + vj_ij += R_0_3_0_1 * dm_kl_cache[sq_kl+576]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1792] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_1_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_1_1_1 * dm_kl_cache[sq_kl+64]; + vj_ij += R_0_1_1_2 * 
dm_kl_cache[sq_kl+128]; + vj_ij -= R_0_1_2_0 * dm_kl_cache[sq_kl+192]; + vj_ij += R_0_1_2_1 * dm_kl_cache[sq_kl+256]; + vj_ij += R_0_1_3_0 * dm_kl_cache[sq_kl+320]; + vj_ij -= R_0_2_1_0 * dm_kl_cache[sq_kl+384]; + vj_ij += R_0_2_1_1 * dm_kl_cache[sq_kl+448]; + vj_ij += R_0_2_2_0 * dm_kl_cache[sq_kl+512]; + vj_ij += R_0_3_1_0 * dm_kl_cache[sq_kl+576]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+2048] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_2_0_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_2_0_1 * dm_kl_cache[sq_kl+64]; + vj_ij += R_0_2_0_2 * dm_kl_cache[sq_kl+128]; + vj_ij -= R_0_2_1_0 * dm_kl_cache[sq_kl+192]; + vj_ij += R_0_2_1_1 * dm_kl_cache[sq_kl+256]; + vj_ij += R_0_2_2_0 * dm_kl_cache[sq_kl+320]; + vj_ij -= R_0_3_0_0 * dm_kl_cache[sq_kl+384]; + vj_ij += R_0_3_0_1 * dm_kl_cache[sq_kl+448]; + vj_ij += R_0_3_1_0 * dm_kl_cache[sq_kl+512]; + vj_ij += R_0_4_0_0 * dm_kl_cache[sq_kl+576]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+2304] += vj_cache[sq_id]; + } + __syncthreads(); + } } + for (int n = ty; n < 160; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_ij = blockIdx.x * 256 + tile * 16 + tx; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int dm_ij_pair0 = dm_pair_loc[pair_ij]; + int sq_ij = tx + tile * 16; + atomicAdd(vj+dm_ij_pair0+i, vj_ij_cache[sq_ij+i*256]); + } + } + for (int n = tx; n < 40; n += 16) { + int i = n / 4; + int tile = n % 4; + int task_kl = blockIdx.y * 64 + tile * 16 + ty; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int dm_kl_pair0 = dm_pair_loc[pair_kl]; + int sq_kl = ty + tile * 16; + atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*64]); + } + } +} + +// TILEX=8, TILEY=16, cache_dm=True +__global__ +void md_j_3_0(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds) +{ + int *pair_ij_mapping = bounds.tile_ij_mapping; + int *pair_kl_mapping = bounds.tile_kl_mapping; + int task_ij0 = blockIdx.x * 128; + int task_kl0 = blockIdx.y * 256; + int pair_ij0 = pair_ij_mapping[task_ij0]; + int pair_kl0 = pair_kl_mapping[task_kl0]; + float *q_cond = bounds.q_cond; + if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) { + return; + } + + int tx = threadIdx.x; + int ty = threadIdx.y; + int sq_id = tx + 16 * ty; + int *bas = envs.bas; + int *dm_pair_loc = envs.ao_loc; + int nbas = envs.nbas; + double *env = envs.env; + double *dm = jk.dm; + double *vj = jk.vj; + double vj_ij, vj_kl; + + int npairs_ij = bounds.npairs_ij; + int npairs_kl = bounds.npairs_kl; + extern __shared__ double gamma_inc[]; + double *Rp_cache = gamma_inc + 1024; + double *Rq_cache = Rp_cache + 512; + double *vj_ij_cache = Rq_cache + 1024; + double *vj_kl_cache = vj_ij_cache + 2560; + double *vj_cache = vj_kl_cache + 256; + double *dm_ij_cache = vj_cache + 256; + double *dm_kl_cache = dm_ij_cache + 2560; + // zero out all cache; + for (int n = sq_id; n < 7424; n += 256) { + Rp_cache[n] = 0.; + } + __syncthreads(); + + if (sq_id < 128) { + int task_ij = blockIdx.x * 128 + sq_id; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int ish = pair_ij / 
nbas;
+            int jsh = pair_ij % nbas;
+            double ai = env[bas[ish*BAS_SLOTS+PTR_EXP]];
+            double aj = env[bas[jsh*BAS_SLOTS+PTR_EXP]];
+            double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+            double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+            double aij = ai + aj;
+            double xij = (ai * ri[0] + aj * rj[0]) / aij;
+            double yij = (ai * ri[1] + aj * rj[1]) / aij;
+            double zij = (ai * ri[2] + aj * rj[2]) / aij;
+            Rp_cache[sq_id+0] = xij;
+            Rp_cache[sq_id+128] = yij;
+            Rp_cache[sq_id+256] = zij;
+            Rp_cache[sq_id+384] = aij;
+        } else {
+            Rp_cache[sq_id+384] = 1.;
+        }
+    }
+    if (sq_id < 256) {
+        int task_kl = blockIdx.y * 256 + sq_id;
+        if (task_kl < npairs_kl) {
+            int pair_kl = pair_kl_mapping[task_kl];
+            int ksh = pair_kl / nbas;
+            int lsh = pair_kl % nbas;
+            double ak = env[bas[ksh*BAS_SLOTS+PTR_EXP]];
+            double al = env[bas[lsh*BAS_SLOTS+PTR_EXP]];
+            double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+            double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD];
+            double akl = ak + al;
+            double xkl = (ak * rk[0] + al * rl[0]) / akl;
+            double ykl = (ak * rk[1] + al * rl[1]) / akl;
+            double zkl = (ak * rk[2] + al * rl[2]) / akl;
+            Rq_cache[sq_id+0] = xkl;
+            Rq_cache[sq_id+256] = ykl;
+            Rq_cache[sq_id+512] = zkl;
+            Rq_cache[sq_id+768] = akl;
+        } else {
+            Rq_cache[sq_id+768] = 1.;
+        }
+    }
+    for (int n = ty; n < 160; n += 16) {
+        int i = n / 8;
+        int tile = n % 8;
+        int task_ij = blockIdx.x * 128 + tile * 16 + tx;
+        if (task_ij < npairs_ij) {
+            int pair_ij = pair_ij_mapping[task_ij];
+            int dm_ij_pair0 = dm_pair_loc[pair_ij];
+            int sq_ij = tx + tile * 16;
+            dm_ij_cache[sq_ij+i*128] = dm[dm_ij_pair0+i];
+        }
+    }
+    for (int n = tx; n < 16; n += 16) {
+        int i = n / 16;
+        int tile = n % 16;
+        int task_kl = blockIdx.y * 256 + tile * 16 + ty;
+        if (task_kl < npairs_kl) {
+            int pair_kl = pair_kl_mapping[task_kl];
+            int dm_kl_pair0 = dm_pair_loc[pair_kl];
+            int sq_kl = ty + tile * 16;
+            // stage the kl-pair density into shared memory
+            dm_kl_cache[sq_kl+i*256] = dm[dm_kl_pair0+i];
+        }
+    }
+    __syncthreads();
+
+    for (int batch_ij = 0; batch_ij < 8; ++batch_ij) {
+    for (int batch_kl = 0; batch_kl < 16; ++batch_kl) {
+        int task_ij0 = blockIdx.x * 128 + batch_ij * 16;
+        int task_kl0 = blockIdx.y * 256 + batch_kl * 16;
+        if (task_ij0 >= npairs_ij || task_kl0 >= npairs_kl) {
+            continue;
+        }
+        int pair_ij0 = pair_ij_mapping[task_ij0];
+        int pair_kl0 = pair_kl_mapping[task_kl0];
+        if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) {
+            continue;
+        }
+
+        int sq_ij = tx + batch_ij * 16;
+        int sq_kl = ty + batch_kl * 16;
+        int task_ij = task_ij0 + tx;
+        int task_kl = task_kl0 + ty;
+        double fac_sym = PI_FAC;
+        if (task_ij >= npairs_ij) {
+            task_ij = task_ij0;
+            fac_sym = 0.;
+        }
+        if (task_kl >= npairs_kl) {
+            task_kl = task_kl0;
+            fac_sym = 0.;
+        }
+        int pair_ij = pair_ij_mapping[task_ij];
+        int pair_kl = pair_kl_mapping[task_kl];
+
+        int ish = pair_ij / nbas;
+        int jsh = pair_ij % nbas;
+        int ksh = pair_kl / nbas;
+        int lsh = pair_kl % nbas;
+        if (ish == jsh) fac_sym *= .5;
+        if (ksh == lsh) fac_sym *= .5;
+        if (pair_ij_mapping == pair_kl_mapping) {
+            if (task_ij == task_kl) fac_sym *= .5;
+            if (task_ij < task_kl) fac_sym = 0.;
+        }
+        double xij = Rp_cache[sq_ij+0];
+        double yij = Rp_cache[sq_ij+128];
+        double zij = Rp_cache[sq_ij+256];
+        double aij = Rp_cache[sq_ij+384];
+        double xkl = Rq_cache[sq_kl+0];
+        double ykl = Rq_cache[sq_kl+256];
+        double zkl = Rq_cache[sq_kl+512];
+        double akl = Rq_cache[sq_kl+768];
+        double fac = fac_sym / (aij*akl*sqrt(aij+akl));
+        double xpq = xij - xkl;
+        double ypq = yij - ykl;
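+        // theta_rr below is the Boys-function argument theta * |P-Q|^2;
+        // eval_gamma_inc_fn evaluates the incomplete-gamma (Boys) values F_m,
+        // from which the McMurchie-Davidson recurrences build the Hermite
+        // integrals R_{t,u,v} contracted with the density caches.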
double zpq = zij - zkl; + double rr = xpq*xpq + ypq*ypq + zpq*zpq; + double theta = aij * akl / (aij + akl); + double theta_rr = theta * rr; + eval_gamma_inc_fn(gamma_inc, theta_rr, 3); + double a2 = -2. * theta; + gamma_inc[sq_id] *= fac; + for (int i = 1; i <= 3; i++) { + fac *= a2; + gamma_inc[sq_id+i*256] *= fac; + } + vj_kl = 0.; + vj_kl += gamma_inc[sq_id+0*256] * dm_ij_cache[sq_ij+0]; + double R_0_0_0_1 = zpq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_0_1 * dm_ij_cache[sq_ij+128]; + double R_1_0_0_1 = zpq * gamma_inc[sq_id+2*256]; + double R_0_0_0_2 = zpq * R_1_0_0_1 + 1 * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_0_2 * dm_ij_cache[sq_ij+256]; + double R_2_0_0_1 = zpq * gamma_inc[sq_id+3*256]; + double R_1_0_0_2 = zpq * R_2_0_0_1 + 1 * gamma_inc[sq_id+2*256]; + double R_0_0_0_3 = zpq * R_1_0_0_2 + 2 * R_1_0_0_1; + vj_kl += R_0_0_0_3 * dm_ij_cache[sq_ij+384]; + double R_0_0_1_0 = ypq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_1_0 * dm_ij_cache[sq_ij+512]; + double R_0_0_1_1 = ypq * R_1_0_0_1; + vj_kl += R_0_0_1_1 * dm_ij_cache[sq_ij+640]; + double R_0_0_1_2 = ypq * R_1_0_0_2; + vj_kl += R_0_0_1_2 * dm_ij_cache[sq_ij+768]; + double R_1_0_1_0 = ypq * gamma_inc[sq_id+2*256]; + double R_0_0_2_0 = ypq * R_1_0_1_0 + 1 * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_2_0 * dm_ij_cache[sq_ij+896]; + double R_1_0_1_1 = ypq * R_2_0_0_1; + double R_0_0_2_1 = ypq * R_1_0_1_1 + 1 * R_1_0_0_1; + vj_kl += R_0_0_2_1 * dm_ij_cache[sq_ij+1024]; + double R_2_0_1_0 = ypq * gamma_inc[sq_id+3*256]; + double R_1_0_2_0 = ypq * R_2_0_1_0 + 1 * gamma_inc[sq_id+2*256]; + double R_0_0_3_0 = ypq * R_1_0_2_0 + 2 * R_1_0_1_0; + vj_kl += R_0_0_3_0 * dm_ij_cache[sq_ij+1152]; + double R_0_1_0_0 = xpq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_1_0_0 * dm_ij_cache[sq_ij+1280]; + double R_0_1_0_1 = xpq * R_1_0_0_1; + vj_kl += R_0_1_0_1 * dm_ij_cache[sq_ij+1408]; + double R_0_1_0_2 = xpq * R_1_0_0_2; + vj_kl += R_0_1_0_2 * dm_ij_cache[sq_ij+1536]; + double R_0_1_1_0 = xpq * R_1_0_1_0; + vj_kl += R_0_1_1_0 * dm_ij_cache[sq_ij+1664]; + double R_0_1_1_1 = xpq * R_1_0_1_1; + vj_kl += R_0_1_1_1 * dm_ij_cache[sq_ij+1792]; + double R_0_1_2_0 = xpq * R_1_0_2_0; + vj_kl += R_0_1_2_0 * dm_ij_cache[sq_ij+1920]; + double R_1_1_0_0 = xpq * gamma_inc[sq_id+2*256]; + double R_0_2_0_0 = xpq * R_1_1_0_0 + 1 * gamma_inc[sq_id+1*256]; + vj_kl += R_0_2_0_0 * dm_ij_cache[sq_ij+2048]; + double R_1_1_0_1 = xpq * R_2_0_0_1; + double R_0_2_0_1 = xpq * R_1_1_0_1 + 1 * R_1_0_0_1; + vj_kl += R_0_2_0_1 * dm_ij_cache[sq_ij+2176]; + double R_1_1_1_0 = xpq * R_2_0_1_0; + double R_0_2_1_0 = xpq * R_1_1_1_0 + 1 * R_1_0_1_0; + vj_kl += R_0_2_1_0 * dm_ij_cache[sq_ij+2304]; + double R_2_1_0_0 = xpq * gamma_inc[sq_id+3*256]; + double R_1_2_0_0 = xpq * R_2_1_0_0 + 1 * gamma_inc[sq_id+2*256]; + double R_0_3_0_0 = xpq * R_1_2_0_0 + 2 * R_1_1_0_0; + vj_kl += R_0_3_0_0 * dm_ij_cache[sq_ij+2432]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+0] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += gamma_inc[sq_id+0*256] * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+0] += vj_cache[sq_id]; + } + vj_ij = 
0.; + vj_ij += R_0_0_0_1 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+128] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_0_2 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+256] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_0_3 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+384] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_1_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+512] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_1_1 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+640] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_1_2 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+768] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_2_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+896] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_2_1 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1024] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_3_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1152] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_0_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if 
(ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1280] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_0_1 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1408] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_0_2 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1536] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_1_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1664] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_1_1 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1792] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_2_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1920] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_2_0_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+2048] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_2_0_1 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+2176] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_2_1_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+2304] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_3_0_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+2432] += vj_cache[sq_id]; + } + __syncthreads(); + } } + for (int n = ty; n < 160; n += 16) { + int i = n / 8; + int tile = n % 8; + int task_ij = blockIdx.x * 128 + tile * 16 + tx; + if (task_ij < 
npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int dm_ij_pair0 = dm_pair_loc[pair_ij]; + int sq_ij = tx + tile * 16; + atomicAdd(vj+dm_ij_pair0+i, vj_ij_cache[sq_ij+i*128]); + } + } + for (int n = tx; n < 16; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_kl = blockIdx.y * 256 + tile * 16 + ty; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int dm_kl_pair0 = dm_pair_loc[pair_kl]; + int sq_kl = ty + tile * 16; + atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*256]); + } + } +} + +// TILEX=4, TILEY=16, cache_dm=True +__global__ +void md_j_3_1(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds) +{ + int *pair_ij_mapping = bounds.tile_ij_mapping; + int *pair_kl_mapping = bounds.tile_kl_mapping; + int task_ij0 = blockIdx.x * 64; + int task_kl0 = blockIdx.y * 256; + int pair_ij0 = pair_ij_mapping[task_ij0]; + int pair_kl0 = pair_kl_mapping[task_kl0]; + float *q_cond = bounds.q_cond; + if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) { + return; + } + + int tx = threadIdx.x; + int ty = threadIdx.y; + int sq_id = tx + 16 * ty; + int *bas = envs.bas; + int *dm_pair_loc = envs.ao_loc; + int nbas = envs.nbas; + double *env = envs.env; + double *dm = jk.dm; + double *vj = jk.vj; + double vj_ij, vj_kl; + + int npairs_ij = bounds.npairs_ij; + int npairs_kl = bounds.npairs_kl; + extern __shared__ double gamma_inc[]; + double *Rp_cache = gamma_inc + 1280; + double *Rq_cache = Rp_cache + 256; + double *vj_ij_cache = Rq_cache + 1024; + double *vj_kl_cache = vj_ij_cache + 1280; + double *vj_cache = vj_kl_cache + 1024; + double *dm_ij_cache = vj_cache + 256; + double *dm_kl_cache = dm_ij_cache + 1280; + // zero out all cache; + for (int n = sq_id; n < 6144; n += 256) { + Rp_cache[n] = 0.; + } + __syncthreads(); + + if (sq_id < 64) { + int task_ij = blockIdx.x * 64 + sq_id; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int ish = pair_ij / nbas; + int jsh = pair_ij % nbas; + double ai = env[bas[ish*BAS_SLOTS+PTR_EXP]]; + double aj = env[bas[jsh*BAS_SLOTS+PTR_EXP]]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double aij = ai + aj; + double xij = (ai * ri[0] + aj * rj[0]) / aij; + double yij = (ai * ri[1] + aj * rj[1]) / aij; + double zij = (ai * ri[2] + aj * rj[2]) / aij; + Rp_cache[sq_id+0] = xij; + Rp_cache[sq_id+64] = yij; + Rp_cache[sq_id+128] = zij; + Rp_cache[sq_id+192] = aij; + } else { + Rp_cache[sq_id+192] = 1.; + } + } + if (sq_id < 256) { + int task_kl = blockIdx.y * 256 + sq_id; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int ksh = pair_kl / nbas; + int lsh = pair_kl % nbas; + double ak = env[bas[ksh*BAS_SLOTS+PTR_EXP]]; + double al = env[bas[lsh*BAS_SLOTS+PTR_EXP]]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD]; + double akl = ak + al; + double xkl = (ak * rk[0] + al * rl[0]) / akl; + double ykl = (ak * rk[1] + al * rl[1]) / akl; + double zkl = (ak * rk[2] + al * rl[2]) / akl; + Rq_cache[sq_id+0] = xkl; + Rq_cache[sq_id+256] = ykl; + Rq_cache[sq_id+512] = zkl; + Rq_cache[sq_id+768] = akl; + } else { + Rq_cache[sq_id+768] = 1.; + } + } + for (int n = ty; n < 80; n += 16) { + int i = n / 4; + int tile = n % 4; + int task_ij = blockIdx.x * 64 + tile * 16 + tx; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int dm_ij_pair0 = dm_pair_loc[pair_ij]; + int sq_ij = tx + tile * 16; + dm_ij_cache[sq_ij+i*64] = dm[dm_ij_pair0+i]; + } + } + for 
(int n = tx; n < 64; n += 16) {
+        int i = n / 16;
+        int tile = n % 16;
+        int task_kl = blockIdx.y * 256 + tile * 16 + ty;
+        if (task_kl < npairs_kl) {
+            int pair_kl = pair_kl_mapping[task_kl];
+            int dm_kl_pair0 = dm_pair_loc[pair_kl];
+            int sq_kl = ty + tile * 16;
+            // stage the kl-pair density into shared memory
+            dm_kl_cache[sq_kl+i*256] = dm[dm_kl_pair0+i];
+        }
+    }
+    __syncthreads();
+
+    for (int batch_ij = 0; batch_ij < 4; ++batch_ij) {
+    for (int batch_kl = 0; batch_kl < 16; ++batch_kl) {
+        int task_ij0 = blockIdx.x * 64 + batch_ij * 16;
+        int task_kl0 = blockIdx.y * 256 + batch_kl * 16;
+        if (task_ij0 >= npairs_ij || task_kl0 >= npairs_kl) {
+            continue;
+        }
+        int pair_ij0 = pair_ij_mapping[task_ij0];
+        int pair_kl0 = pair_kl_mapping[task_kl0];
+        if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) {
+            continue;
+        }
+
+        int sq_ij = tx + batch_ij * 16;
+        int sq_kl = ty + batch_kl * 16;
+        int task_ij = task_ij0 + tx;
+        int task_kl = task_kl0 + ty;
+        double fac_sym = PI_FAC;
+        if (task_ij >= npairs_ij) {
+            task_ij = task_ij0;
+            fac_sym = 0.;
+        }
+        if (task_kl >= npairs_kl) {
+            task_kl = task_kl0;
+            fac_sym = 0.;
+        }
+        int pair_ij = pair_ij_mapping[task_ij];
+        int pair_kl = pair_kl_mapping[task_kl];
+
+        int ish = pair_ij / nbas;
+        int jsh = pair_ij % nbas;
+        int ksh = pair_kl / nbas;
+        int lsh = pair_kl % nbas;
+        if (ish == jsh) fac_sym *= .5;
+        if (ksh == lsh) fac_sym *= .5;
+        if (pair_ij_mapping == pair_kl_mapping) {
+            if (task_ij == task_kl) fac_sym *= .5;
+            if (task_ij < task_kl) fac_sym = 0.;
+        }
+        double xij = Rp_cache[sq_ij+0];
+        double yij = Rp_cache[sq_ij+64];
+        double zij = Rp_cache[sq_ij+128];
+        double aij = Rp_cache[sq_ij+192];
+        double xkl = Rq_cache[sq_kl+0];
+        double ykl = Rq_cache[sq_kl+256];
+        double zkl = Rq_cache[sq_kl+512];
+        double akl = Rq_cache[sq_kl+768];
+        double fac = fac_sym / (aij*akl*sqrt(aij+akl));
+        double xpq = xij - xkl;
+        double ypq = yij - ykl;
+        double zpq = zij - zkl;
+        double rr = xpq*xpq + ypq*ypq + zpq*zpq;
+        double theta = aij * akl / (aij + akl);
+        double theta_rr = theta * rr;
+        eval_gamma_inc_fn(gamma_inc, theta_rr, 4);
+        double a2 = -2.
* theta; + gamma_inc[sq_id] *= fac; + for (int i = 1; i <= 4; i++) { + fac *= a2; + gamma_inc[sq_id+i*256] *= fac; + } + vj_kl = 0.; + vj_kl += gamma_inc[sq_id+0*256] * dm_ij_cache[sq_ij+0]; + double R_0_0_0_1 = zpq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_0_1 * dm_ij_cache[sq_ij+64]; + double R_1_0_0_1 = zpq * gamma_inc[sq_id+2*256]; + double R_0_0_0_2 = zpq * R_1_0_0_1 + 1 * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_0_2 * dm_ij_cache[sq_ij+128]; + double R_2_0_0_1 = zpq * gamma_inc[sq_id+3*256]; + double R_1_0_0_2 = zpq * R_2_0_0_1 + 1 * gamma_inc[sq_id+2*256]; + double R_0_0_0_3 = zpq * R_1_0_0_2 + 2 * R_1_0_0_1; + vj_kl += R_0_0_0_3 * dm_ij_cache[sq_ij+192]; + double R_0_0_1_0 = ypq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_1_0 * dm_ij_cache[sq_ij+256]; + double R_0_0_1_1 = ypq * R_1_0_0_1; + vj_kl += R_0_0_1_1 * dm_ij_cache[sq_ij+320]; + double R_0_0_1_2 = ypq * R_1_0_0_2; + vj_kl += R_0_0_1_2 * dm_ij_cache[sq_ij+384]; + double R_1_0_1_0 = ypq * gamma_inc[sq_id+2*256]; + double R_0_0_2_0 = ypq * R_1_0_1_0 + 1 * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_2_0 * dm_ij_cache[sq_ij+448]; + double R_1_0_1_1 = ypq * R_2_0_0_1; + double R_0_0_2_1 = ypq * R_1_0_1_1 + 1 * R_1_0_0_1; + vj_kl += R_0_0_2_1 * dm_ij_cache[sq_ij+512]; + double R_2_0_1_0 = ypq * gamma_inc[sq_id+3*256]; + double R_1_0_2_0 = ypq * R_2_0_1_0 + 1 * gamma_inc[sq_id+2*256]; + double R_0_0_3_0 = ypq * R_1_0_2_0 + 2 * R_1_0_1_0; + vj_kl += R_0_0_3_0 * dm_ij_cache[sq_ij+576]; + double R_0_1_0_0 = xpq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_1_0_0 * dm_ij_cache[sq_ij+640]; + double R_0_1_0_1 = xpq * R_1_0_0_1; + vj_kl += R_0_1_0_1 * dm_ij_cache[sq_ij+704]; + double R_0_1_0_2 = xpq * R_1_0_0_2; + vj_kl += R_0_1_0_2 * dm_ij_cache[sq_ij+768]; + double R_0_1_1_0 = xpq * R_1_0_1_0; + vj_kl += R_0_1_1_0 * dm_ij_cache[sq_ij+832]; + double R_0_1_1_1 = xpq * R_1_0_1_1; + vj_kl += R_0_1_1_1 * dm_ij_cache[sq_ij+896]; + double R_0_1_2_0 = xpq * R_1_0_2_0; + vj_kl += R_0_1_2_0 * dm_ij_cache[sq_ij+960]; + double R_1_1_0_0 = xpq * gamma_inc[sq_id+2*256]; + double R_0_2_0_0 = xpq * R_1_1_0_0 + 1 * gamma_inc[sq_id+1*256]; + vj_kl += R_0_2_0_0 * dm_ij_cache[sq_ij+1024]; + double R_1_1_0_1 = xpq * R_2_0_0_1; + double R_0_2_0_1 = xpq * R_1_1_0_1 + 1 * R_1_0_0_1; + vj_kl += R_0_2_0_1 * dm_ij_cache[sq_ij+1088]; + double R_1_1_1_0 = xpq * R_2_0_1_0; + double R_0_2_1_0 = xpq * R_1_1_1_0 + 1 * R_1_0_1_0; + vj_kl += R_0_2_1_0 * dm_ij_cache[sq_ij+1152]; + double R_2_1_0_0 = xpq * gamma_inc[sq_id+3*256]; + double R_1_2_0_0 = xpq * R_2_1_0_0 + 1 * gamma_inc[sq_id+2*256]; + double R_0_3_0_0 = xpq * R_1_2_0_0 + 2 * R_1_1_0_0; + vj_kl += R_0_3_0_0 * dm_ij_cache[sq_ij+1216]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+0] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl -= R_0_0_0_1 * dm_ij_cache[sq_ij+0]; + vj_kl -= R_0_0_0_2 * dm_ij_cache[sq_ij+64]; + vj_kl -= R_0_0_0_3 * dm_ij_cache[sq_ij+128]; + double R_3_0_0_1 = zpq * gamma_inc[sq_id+4*256]; + double R_2_0_0_2 = zpq * R_3_0_0_1 + 1 * gamma_inc[sq_id+3*256]; + double R_1_0_0_3 = zpq * R_2_0_0_2 + 2 * R_2_0_0_1; + double R_0_0_0_4 = zpq * R_1_0_0_3 + 3 * R_1_0_0_2; + vj_kl -= R_0_0_0_4 * dm_ij_cache[sq_ij+192]; + vj_kl -= R_0_0_1_1 * dm_ij_cache[sq_ij+256]; + vj_kl -= R_0_0_1_2 * dm_ij_cache[sq_ij+320]; + double R_0_0_1_3 = ypq * R_1_0_0_3; + vj_kl -= R_0_0_1_3 * 
dm_ij_cache[sq_ij+384]; + vj_kl -= R_0_0_2_1 * dm_ij_cache[sq_ij+448]; + double R_1_0_1_2 = ypq * R_2_0_0_2; + double R_0_0_2_2 = ypq * R_1_0_1_2 + 1 * R_1_0_0_2; + vj_kl -= R_0_0_2_2 * dm_ij_cache[sq_ij+512]; + double R_2_0_1_1 = ypq * R_3_0_0_1; + double R_1_0_2_1 = ypq * R_2_0_1_1 + 1 * R_2_0_0_1; + double R_0_0_3_1 = ypq * R_1_0_2_1 + 2 * R_1_0_1_1; + vj_kl -= R_0_0_3_1 * dm_ij_cache[sq_ij+576]; + vj_kl -= R_0_1_0_1 * dm_ij_cache[sq_ij+640]; + vj_kl -= R_0_1_0_2 * dm_ij_cache[sq_ij+704]; + double R_0_1_0_3 = xpq * R_1_0_0_3; + vj_kl -= R_0_1_0_3 * dm_ij_cache[sq_ij+768]; + vj_kl -= R_0_1_1_1 * dm_ij_cache[sq_ij+832]; + double R_0_1_1_2 = xpq * R_1_0_1_2; + vj_kl -= R_0_1_1_2 * dm_ij_cache[sq_ij+896]; + double R_0_1_2_1 = xpq * R_1_0_2_1; + vj_kl -= R_0_1_2_1 * dm_ij_cache[sq_ij+960]; + vj_kl -= R_0_2_0_1 * dm_ij_cache[sq_ij+1024]; + double R_1_1_0_2 = xpq * R_2_0_0_2; + double R_0_2_0_2 = xpq * R_1_1_0_2 + 1 * R_1_0_0_2; + vj_kl -= R_0_2_0_2 * dm_ij_cache[sq_ij+1088]; + double R_1_1_1_1 = xpq * R_2_0_1_1; + double R_0_2_1_1 = xpq * R_1_1_1_1 + 1 * R_1_0_1_1; + vj_kl -= R_0_2_1_1 * dm_ij_cache[sq_ij+1152]; + double R_2_1_0_1 = xpq * R_3_0_0_1; + double R_1_2_0_1 = xpq * R_2_1_0_1 + 1 * R_2_0_0_1; + double R_0_3_0_1 = xpq * R_1_2_0_1 + 2 * R_1_1_0_1; + vj_kl -= R_0_3_0_1 * dm_ij_cache[sq_ij+1216]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+256] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl -= R_0_0_1_0 * dm_ij_cache[sq_ij+0]; + vj_kl -= R_0_0_1_1 * dm_ij_cache[sq_ij+64]; + vj_kl -= R_0_0_1_2 * dm_ij_cache[sq_ij+128]; + vj_kl -= R_0_0_1_3 * dm_ij_cache[sq_ij+192]; + vj_kl -= R_0_0_2_0 * dm_ij_cache[sq_ij+256]; + vj_kl -= R_0_0_2_1 * dm_ij_cache[sq_ij+320]; + vj_kl -= R_0_0_2_2 * dm_ij_cache[sq_ij+384]; + vj_kl -= R_0_0_3_0 * dm_ij_cache[sq_ij+448]; + vj_kl -= R_0_0_3_1 * dm_ij_cache[sq_ij+512]; + double R_3_0_1_0 = ypq * gamma_inc[sq_id+4*256]; + double R_2_0_2_0 = ypq * R_3_0_1_0 + 1 * gamma_inc[sq_id+3*256]; + double R_1_0_3_0 = ypq * R_2_0_2_0 + 2 * R_2_0_1_0; + double R_0_0_4_0 = ypq * R_1_0_3_0 + 3 * R_1_0_2_0; + vj_kl -= R_0_0_4_0 * dm_ij_cache[sq_ij+576]; + vj_kl -= R_0_1_1_0 * dm_ij_cache[sq_ij+640]; + vj_kl -= R_0_1_1_1 * dm_ij_cache[sq_ij+704]; + vj_kl -= R_0_1_1_2 * dm_ij_cache[sq_ij+768]; + vj_kl -= R_0_1_2_0 * dm_ij_cache[sq_ij+832]; + vj_kl -= R_0_1_2_1 * dm_ij_cache[sq_ij+896]; + double R_0_1_3_0 = xpq * R_1_0_3_0; + vj_kl -= R_0_1_3_0 * dm_ij_cache[sq_ij+960]; + vj_kl -= R_0_2_1_0 * dm_ij_cache[sq_ij+1024]; + vj_kl -= R_0_2_1_1 * dm_ij_cache[sq_ij+1088]; + double R_1_1_2_0 = xpq * R_2_0_2_0; + double R_0_2_2_0 = xpq * R_1_1_2_0 + 1 * R_1_0_2_0; + vj_kl -= R_0_2_2_0 * dm_ij_cache[sq_ij+1152]; + double R_2_1_1_0 = xpq * R_3_0_1_0; + double R_1_2_1_0 = xpq * R_2_1_1_0 + 1 * R_2_0_1_0; + double R_0_3_1_0 = xpq * R_1_2_1_0 + 2 * R_1_1_1_0; + vj_kl -= R_0_3_1_0 * dm_ij_cache[sq_ij+1216]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+512] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl -= R_0_1_0_0 * dm_ij_cache[sq_ij+0]; + vj_kl -= R_0_1_0_1 * dm_ij_cache[sq_ij+64]; + vj_kl -= R_0_1_0_2 * dm_ij_cache[sq_ij+128]; + vj_kl -= 
R_0_1_0_3 * dm_ij_cache[sq_ij+192]; + vj_kl -= R_0_1_1_0 * dm_ij_cache[sq_ij+256]; + vj_kl -= R_0_1_1_1 * dm_ij_cache[sq_ij+320]; + vj_kl -= R_0_1_1_2 * dm_ij_cache[sq_ij+384]; + vj_kl -= R_0_1_2_0 * dm_ij_cache[sq_ij+448]; + vj_kl -= R_0_1_2_1 * dm_ij_cache[sq_ij+512]; + vj_kl -= R_0_1_3_0 * dm_ij_cache[sq_ij+576]; + vj_kl -= R_0_2_0_0 * dm_ij_cache[sq_ij+640]; + vj_kl -= R_0_2_0_1 * dm_ij_cache[sq_ij+704]; + vj_kl -= R_0_2_0_2 * dm_ij_cache[sq_ij+768]; + vj_kl -= R_0_2_1_0 * dm_ij_cache[sq_ij+832]; + vj_kl -= R_0_2_1_1 * dm_ij_cache[sq_ij+896]; + vj_kl -= R_0_2_2_0 * dm_ij_cache[sq_ij+960]; + vj_kl -= R_0_3_0_0 * dm_ij_cache[sq_ij+1024]; + vj_kl -= R_0_3_0_1 * dm_ij_cache[sq_ij+1088]; + vj_kl -= R_0_3_1_0 * dm_ij_cache[sq_ij+1152]; + double R_3_1_0_0 = xpq * gamma_inc[sq_id+4*256]; + double R_2_2_0_0 = xpq * R_3_1_0_0 + 1 * gamma_inc[sq_id+3*256]; + double R_1_3_0_0 = xpq * R_2_2_0_0 + 2 * R_2_1_0_0; + double R_0_4_0_0 = xpq * R_1_3_0_0 + 3 * R_1_2_0_0; + vj_kl -= R_0_4_0_0 * dm_ij_cache[sq_ij+1216]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+768] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += gamma_inc[sq_id+0*256] * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_0_1 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_0_1_0 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_1_0_0 * dm_kl_cache[sq_kl+768]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+0] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_0_1 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_0_2 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_0_1_1 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_1_0_1 * dm_kl_cache[sq_kl+768]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+64] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_0_2 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_0_3 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_0_1_2 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_1_0_2 * dm_kl_cache[sq_kl+768]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+128] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_0_3 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_0_4 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_0_1_3 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_1_0_3 * dm_kl_cache[sq_kl+768]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+192] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_1_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_1_1 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_0_2_0 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_1_1_0 * dm_kl_cache[sq_kl+768]; 
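+            // Tree-reduce vj_ij over the 16 ty lanes: sq_id = tx + 16*ty, so the
+            // stride*16 offset walks one column of the 16x16 vj_cache tile.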
+ __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+256] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_1_1 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_1_2 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_0_2_1 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_1_1_1 * dm_kl_cache[sq_kl+768]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+320] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_1_2 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_1_3 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_0_2_2 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_1_1_2 * dm_kl_cache[sq_kl+768]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+384] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_2_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_2_1 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_0_3_0 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_1_2_0 * dm_kl_cache[sq_kl+768]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+448] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_2_1 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_2_2 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_0_3_1 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_1_2_1 * dm_kl_cache[sq_kl+768]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+512] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_3_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_3_1 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_0_4_0 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_1_3_0 * dm_kl_cache[sq_kl+768]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+576] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_0_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_1_0_1 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_1_1_0 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_2_0_0 * dm_kl_cache[sq_kl+768]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+640] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_0_1 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_1_0_2 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_1_1_1 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_2_0_1 * dm_kl_cache[sq_kl+768]; + 
__syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+704] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_0_2 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_1_0_3 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_1_1_2 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_2_0_2 * dm_kl_cache[sq_kl+768]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+768] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_1_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_1_1_1 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_1_2_0 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_2_1_0 * dm_kl_cache[sq_kl+768]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+832] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_1_1 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_1_1_2 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_1_2_1 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_2_1_1 * dm_kl_cache[sq_kl+768]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+896] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_2_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_1_2_1 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_1_3_0 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_2_2_0 * dm_kl_cache[sq_kl+768]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+960] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_2_0_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_2_0_1 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_2_1_0 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_3_0_0 * dm_kl_cache[sq_kl+768]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1024] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_2_0_1 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_2_0_2 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_2_1_1 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_3_0_1 * dm_kl_cache[sq_kl+768]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1088] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_2_1_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_2_1_1 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_2_2_0 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_3_1_0 * dm_kl_cache[sq_kl+768]; + 
__syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1152] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_3_0_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_3_0_1 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_3_1_0 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_4_0_0 * dm_kl_cache[sq_kl+768]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1216] += vj_cache[sq_id]; + } + __syncthreads(); + } } + for (int n = ty; n < 80; n += 16) { + int i = n / 4; + int tile = n % 4; + int task_ij = blockIdx.x * 64 + tile * 16 + tx; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int dm_ij_pair0 = dm_pair_loc[pair_ij]; + int sq_ij = tx + tile * 16; + atomicAdd(vj+dm_ij_pair0+i, vj_ij_cache[sq_ij+i*64]); + } + } + for (int n = tx; n < 64; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_kl = blockIdx.y * 256 + tile * 16 + ty; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int dm_kl_pair0 = dm_pair_loc[pair_kl]; + int sq_kl = ty + tile * 16; + atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*256]); + } + } +} + +// TILEX=4, TILEY=16, cache_dm=True +__global__ +void md_j_4_0(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds) +{ + int *pair_ij_mapping = bounds.tile_ij_mapping; + int *pair_kl_mapping = bounds.tile_kl_mapping; + int task_ij0 = blockIdx.x * 64; + int task_kl0 = blockIdx.y * 256; + int pair_ij0 = pair_ij_mapping[task_ij0]; + int pair_kl0 = pair_kl_mapping[task_kl0]; + float *q_cond = bounds.q_cond; + if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) { + return; + } + + int tx = threadIdx.x; + int ty = threadIdx.y; + int sq_id = tx + 16 * ty; + int *bas = envs.bas; + int *dm_pair_loc = envs.ao_loc; + int nbas = envs.nbas; + double *env = envs.env; + double *dm = jk.dm; + double *vj = jk.vj; + double vj_ij, vj_kl; + + int npairs_ij = bounds.npairs_ij; + int npairs_kl = bounds.npairs_kl; + extern __shared__ double gamma_inc[]; + double *Rp_cache = gamma_inc + 1280; + double *Rq_cache = Rp_cache + 256; + double *vj_ij_cache = Rq_cache + 1024; + double *vj_kl_cache = vj_ij_cache + 2240; + double *vj_cache = vj_kl_cache + 256; + double *dm_ij_cache = vj_cache + 256; + double *dm_kl_cache = dm_ij_cache + 2240; + // zero out all cache; + for (int n = sq_id; n < 6528; n += 256) { + Rp_cache[n] = 0.; + } + __syncthreads(); + + if (sq_id < 64) { + int task_ij = blockIdx.x * 64 + sq_id; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int ish = pair_ij / nbas; + int jsh = pair_ij % nbas; + double ai = env[bas[ish*BAS_SLOTS+PTR_EXP]]; + double aj = env[bas[jsh*BAS_SLOTS+PTR_EXP]]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double aij = ai + aj; + double xij = (ai * ri[0] + aj * rj[0]) / aij; + double yij = (ai * ri[1] + aj * rj[1]) / aij; + double zij = (ai * ri[2] + aj * rj[2]) / aij; + Rp_cache[sq_id+0] = xij; + Rp_cache[sq_id+64] = yij; + Rp_cache[sq_id+128] = zij; + Rp_cache[sq_id+192] = aij; + } else { + Rp_cache[sq_id+192] = 1.; + } + } + if (sq_id < 256) { + int task_kl = blockIdx.y * 256 + sq_id; + if (task_kl 
< npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int ksh = pair_kl / nbas; + int lsh = pair_kl % nbas; + double ak = env[bas[ksh*BAS_SLOTS+PTR_EXP]]; + double al = env[bas[lsh*BAS_SLOTS+PTR_EXP]]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD]; + double akl = ak + al; + double xkl = (ak * rk[0] + al * rl[0]) / akl; + double ykl = (ak * rk[1] + al * rl[1]) / akl; + double zkl = (ak * rk[2] + al * rl[2]) / akl; + Rq_cache[sq_id+0] = xkl; + Rq_cache[sq_id+256] = ykl; + Rq_cache[sq_id+512] = zkl; + Rq_cache[sq_id+768] = akl; + } else { + Rq_cache[sq_id+768] = 1.; + } + } + for (int n = ty; n < 140; n += 16) { + int i = n / 4; + int tile = n % 4; + int task_ij = blockIdx.x * 64 + tile * 16 + tx; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int dm_ij_pair0 = dm_pair_loc[pair_ij]; + int sq_ij = tx + tile * 16; + dm_ij_cache[sq_ij+i*64] = dm[dm_ij_pair0+i]; + } + } + for (int n = tx; n < 16; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_kl = blockIdx.y * 256 + tile * 16 + ty; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int dm_kl_pair0 = dm_pair_loc[pair_kl]; + int sq_kl = ty + tile * 16; + dm_kl_cache[sq_kl+i*256] = dm[dm_kl_pair0+i]; + } + } + __syncthreads(); + + for (int batch_ij = 0; batch_ij < 4; ++batch_ij) { + for (int batch_kl = 0; batch_kl < 16; ++batch_kl) { + int task_ij0 = blockIdx.x * 64 + batch_ij * 16; + int task_kl0 = blockIdx.y * 256 + batch_kl * 16; + if (task_ij0 >= npairs_ij || task_kl0 >= npairs_kl) { + continue; + } + int pair_ij0 = pair_ij_mapping[task_ij0]; + int pair_kl0 = pair_kl_mapping[task_kl0]; + if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) { + continue; + } + + int sq_ij = tx + batch_ij * 16; + int sq_kl = ty + batch_kl * 16; + int task_ij = task_ij0 + tx; + int task_kl = task_kl0 + ty; + double fac_sym = PI_FAC; + if (task_ij >= npairs_ij) { + task_ij = task_ij0; + fac_sym = 0.; + } + if (task_kl >= npairs_kl) { + task_kl = task_kl0; + fac_sym = 0.; + } + int pair_ij = pair_ij_mapping[task_ij]; + int pair_kl = pair_kl_mapping[task_kl]; + + int ish = pair_ij / nbas; + int jsh = pair_ij % nbas; + int ksh = pair_kl / nbas; + int lsh = pair_kl % nbas; + if (ish == jsh) fac_sym *= .5; + if (ksh == lsh) fac_sym *= .5; + if (pair_ij_mapping == pair_kl_mapping) { + if (task_ij == task_kl) fac_sym *= .5; + if (task_ij < task_kl) fac_sym = 0.; + } + double xij = Rp_cache[sq_ij+0]; + double yij = Rp_cache[sq_ij+64]; + double zij = Rp_cache[sq_ij+128]; + double aij = Rp_cache[sq_ij+192]; + double xkl = Rq_cache[sq_kl+0]; + double ykl = Rq_cache[sq_kl+256]; + double zkl = Rq_cache[sq_kl+512]; + double akl = Rq_cache[sq_kl+768]; + double fac = fac_sym / (aij*akl*sqrt(aij+akl)); + double xpq = xij - xkl; + double ypq = yij - ykl; + double zpq = zij - zkl; + double rr = xpq*xpq + ypq*ypq + zpq*zpq; + double theta = aij * akl / (aij + akl); + double theta_rr = theta * rr; + eval_gamma_inc_fn(gamma_inc, theta_rr, 4); + double a2 = -2.
* theta; + gamma_inc[sq_id] *= fac; + for (int i = 1; i <= 4; i++) { + fac *= a2; + gamma_inc[sq_id+i*256] *= fac; + } + vj_kl = 0.; + vj_kl += gamma_inc[sq_id+0*256] * dm_ij_cache[sq_ij+0]; + double R_0_0_0_1 = zpq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_0_1 * dm_ij_cache[sq_ij+64]; + double R_1_0_0_1 = zpq * gamma_inc[sq_id+2*256]; + double R_0_0_0_2 = zpq * R_1_0_0_1 + 1 * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_0_2 * dm_ij_cache[sq_ij+128]; + double R_2_0_0_1 = zpq * gamma_inc[sq_id+3*256]; + double R_1_0_0_2 = zpq * R_2_0_0_1 + 1 * gamma_inc[sq_id+2*256]; + double R_0_0_0_3 = zpq * R_1_0_0_2 + 2 * R_1_0_0_1; + vj_kl += R_0_0_0_3 * dm_ij_cache[sq_ij+192]; + double R_3_0_0_1 = zpq * gamma_inc[sq_id+4*256]; + double R_2_0_0_2 = zpq * R_3_0_0_1 + 1 * gamma_inc[sq_id+3*256]; + double R_1_0_0_3 = zpq * R_2_0_0_2 + 2 * R_2_0_0_1; + double R_0_0_0_4 = zpq * R_1_0_0_3 + 3 * R_1_0_0_2; + vj_kl += R_0_0_0_4 * dm_ij_cache[sq_ij+256]; + double R_0_0_1_0 = ypq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_1_0 * dm_ij_cache[sq_ij+320]; + double R_0_0_1_1 = ypq * R_1_0_0_1; + vj_kl += R_0_0_1_1 * dm_ij_cache[sq_ij+384]; + double R_0_0_1_2 = ypq * R_1_0_0_2; + vj_kl += R_0_0_1_2 * dm_ij_cache[sq_ij+448]; + double R_0_0_1_3 = ypq * R_1_0_0_3; + vj_kl += R_0_0_1_3 * dm_ij_cache[sq_ij+512]; + double R_1_0_1_0 = ypq * gamma_inc[sq_id+2*256]; + double R_0_0_2_0 = ypq * R_1_0_1_0 + 1 * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_2_0 * dm_ij_cache[sq_ij+576]; + double R_1_0_1_1 = ypq * R_2_0_0_1; + double R_0_0_2_1 = ypq * R_1_0_1_1 + 1 * R_1_0_0_1; + vj_kl += R_0_0_2_1 * dm_ij_cache[sq_ij+640]; + double R_1_0_1_2 = ypq * R_2_0_0_2; + double R_0_0_2_2 = ypq * R_1_0_1_2 + 1 * R_1_0_0_2; + vj_kl += R_0_0_2_2 * dm_ij_cache[sq_ij+704]; + double R_2_0_1_0 = ypq * gamma_inc[sq_id+3*256]; + double R_1_0_2_0 = ypq * R_2_0_1_0 + 1 * gamma_inc[sq_id+2*256]; + double R_0_0_3_0 = ypq * R_1_0_2_0 + 2 * R_1_0_1_0; + vj_kl += R_0_0_3_0 * dm_ij_cache[sq_ij+768]; + double R_2_0_1_1 = ypq * R_3_0_0_1; + double R_1_0_2_1 = ypq * R_2_0_1_1 + 1 * R_2_0_0_1; + double R_0_0_3_1 = ypq * R_1_0_2_1 + 2 * R_1_0_1_1; + vj_kl += R_0_0_3_1 * dm_ij_cache[sq_ij+832]; + double R_3_0_1_0 = ypq * gamma_inc[sq_id+4*256]; + double R_2_0_2_0 = ypq * R_3_0_1_0 + 1 * gamma_inc[sq_id+3*256]; + double R_1_0_3_0 = ypq * R_2_0_2_0 + 2 * R_2_0_1_0; + double R_0_0_4_0 = ypq * R_1_0_3_0 + 3 * R_1_0_2_0; + vj_kl += R_0_0_4_0 * dm_ij_cache[sq_ij+896]; + double R_0_1_0_0 = xpq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_1_0_0 * dm_ij_cache[sq_ij+960]; + double R_0_1_0_1 = xpq * R_1_0_0_1; + vj_kl += R_0_1_0_1 * dm_ij_cache[sq_ij+1024]; + double R_0_1_0_2 = xpq * R_1_0_0_2; + vj_kl += R_0_1_0_2 * dm_ij_cache[sq_ij+1088]; + double R_0_1_0_3 = xpq * R_1_0_0_3; + vj_kl += R_0_1_0_3 * dm_ij_cache[sq_ij+1152]; + double R_0_1_1_0 = xpq * R_1_0_1_0; + vj_kl += R_0_1_1_0 * dm_ij_cache[sq_ij+1216]; + double R_0_1_1_1 = xpq * R_1_0_1_1; + vj_kl += R_0_1_1_1 * dm_ij_cache[sq_ij+1280]; + double R_0_1_1_2 = xpq * R_1_0_1_2; + vj_kl += R_0_1_1_2 * dm_ij_cache[sq_ij+1344]; + double R_0_1_2_0 = xpq * R_1_0_2_0; + vj_kl += R_0_1_2_0 * dm_ij_cache[sq_ij+1408]; + double R_0_1_2_1 = xpq * R_1_0_2_1; + vj_kl += R_0_1_2_1 * dm_ij_cache[sq_ij+1472]; + double R_0_1_3_0 = xpq * R_1_0_3_0; + vj_kl += R_0_1_3_0 * dm_ij_cache[sq_ij+1536]; + double R_1_1_0_0 = xpq * gamma_inc[sq_id+2*256]; + double R_0_2_0_0 = xpq * R_1_1_0_0 + 1 * gamma_inc[sq_id+1*256]; + vj_kl += R_0_2_0_0 * dm_ij_cache[sq_ij+1600]; + double R_1_1_0_1 = xpq * R_2_0_0_1; + double R_0_2_0_1 = xpq * R_1_1_0_1 + 1 * 
R_1_0_0_1; + vj_kl += R_0_2_0_1 * dm_ij_cache[sq_ij+1664]; + double R_1_1_0_2 = xpq * R_2_0_0_2; + double R_0_2_0_2 = xpq * R_1_1_0_2 + 1 * R_1_0_0_2; + vj_kl += R_0_2_0_2 * dm_ij_cache[sq_ij+1728]; + double R_1_1_1_0 = xpq * R_2_0_1_0; + double R_0_2_1_0 = xpq * R_1_1_1_0 + 1 * R_1_0_1_0; + vj_kl += R_0_2_1_0 * dm_ij_cache[sq_ij+1792]; + double R_1_1_1_1 = xpq * R_2_0_1_1; + double R_0_2_1_1 = xpq * R_1_1_1_1 + 1 * R_1_0_1_1; + vj_kl += R_0_2_1_1 * dm_ij_cache[sq_ij+1856]; + double R_1_1_2_0 = xpq * R_2_0_2_0; + double R_0_2_2_0 = xpq * R_1_1_2_0 + 1 * R_1_0_2_0; + vj_kl += R_0_2_2_0 * dm_ij_cache[sq_ij+1920]; + double R_2_1_0_0 = xpq * gamma_inc[sq_id+3*256]; + double R_1_2_0_0 = xpq * R_2_1_0_0 + 1 * gamma_inc[sq_id+2*256]; + double R_0_3_0_0 = xpq * R_1_2_0_0 + 2 * R_1_1_0_0; + vj_kl += R_0_3_0_0 * dm_ij_cache[sq_ij+1984]; + double R_2_1_0_1 = xpq * R_3_0_0_1; + double R_1_2_0_1 = xpq * R_2_1_0_1 + 1 * R_2_0_0_1; + double R_0_3_0_1 = xpq * R_1_2_0_1 + 2 * R_1_1_0_1; + vj_kl += R_0_3_0_1 * dm_ij_cache[sq_ij+2048]; + double R_2_1_1_0 = xpq * R_3_0_1_0; + double R_1_2_1_0 = xpq * R_2_1_1_0 + 1 * R_2_0_1_0; + double R_0_3_1_0 = xpq * R_1_2_1_0 + 2 * R_1_1_1_0; + vj_kl += R_0_3_1_0 * dm_ij_cache[sq_ij+2112]; + double R_3_1_0_0 = xpq * gamma_inc[sq_id+4*256]; + double R_2_2_0_0 = xpq * R_3_1_0_0 + 1 * gamma_inc[sq_id+3*256]; + double R_1_3_0_0 = xpq * R_2_2_0_0 + 2 * R_2_1_0_0; + double R_0_4_0_0 = xpq * R_1_3_0_0 + 3 * R_1_2_0_0; + vj_kl += R_0_4_0_0 * dm_ij_cache[sq_ij+2176]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+0] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += gamma_inc[sq_id+0*256] * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+0] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_0_1 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+64] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_0_2 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+128] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_0_3 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+192] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_0_4 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty 
== 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+256] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_1_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+320] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_1_1 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+384] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_1_2 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+448] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_1_3 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+512] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_2_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+576] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_2_1 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+640] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_2_2 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+704] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_3_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+768] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_3_1 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+832] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_4_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < 
stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+896] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_0_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+960] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_0_1 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1024] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_0_2 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1088] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_0_3 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1152] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_1_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1216] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_1_1 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1280] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_1_2 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1344] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_2_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1408] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_2_1 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1472] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_3_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + 
vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1536] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_2_0_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1600] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_2_0_1 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1664] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_2_0_2 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1728] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_2_1_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1792] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_2_1_1 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1856] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_2_2_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1920] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_3_0_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1984] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_3_0_1 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+2048] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_3_1_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + 
vj_ij_cache[sq_ij+2112] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_4_0_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+2176] += vj_cache[sq_id]; + } + __syncthreads(); + } } + for (int n = ty; n < 140; n += 16) { + int i = n / 4; + int tile = n % 4; + int task_ij = blockIdx.x * 64 + tile * 16 + tx; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int dm_ij_pair0 = dm_pair_loc[pair_ij]; + int sq_ij = tx + tile * 16; + atomicAdd(vj+dm_ij_pair0+i, vj_ij_cache[sq_ij+i*64]); + } + } + for (int n = tx; n < 16; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_kl = blockIdx.y * 256 + tile * 16 + ty; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int dm_kl_pair0 = dm_pair_loc[pair_kl]; + int sq_kl = ty + tile * 16; + atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*256]); + } + } +} + +int md_j_unrolled(RysIntEnvVars *envs, JKMatrix *jk, BoundsInfo *bounds, + int *scheme, int workers, double omega) +{ + int li = bounds->li; + int lj = bounds->lj; + int lk = bounds->lk; + int ll = bounds->ll; + int lij = li + lj; + int lkl = lk + ll; + dim3 threads(16, 16); + dim3 blocks; + int ijkl = lij*9 + lkl; + switch (ijkl) { + case 0: // lij=0, lkl=0 + blocks.x = (bounds->npairs_ij + 255) / 256; + blocks.y = (bounds->npairs_kl + 255) / 256; + md_j_0_0<<<blocks, threads, 3584*sizeof(double)>>>(*envs, *jk, *bounds); break; + case 9: // lij=1, lkl=0 + blocks.x = (bounds->npairs_ij + 255) / 256; + blocks.y = (bounds->npairs_kl + 255) / 256; + md_j_1_0<<<blocks, threads, 5376*sizeof(double)>>>(*envs, *jk, *bounds); break; + case 10: // lij=1, lkl=1 + blocks.x = (bounds->npairs_ij + 255) / 256; + blocks.y = (bounds->npairs_kl + 255) / 256; + md_j_1_1<<<blocks, threads, 7168*sizeof(double)>>>(*envs, *jk, *bounds); break; + case 11: // lij=1, lkl=2 + blocks.x = (bounds->npairs_ij + 255) / 256; + blocks.y = (bounds->npairs_kl + 127) / 128; + md_j_1_2<<<blocks, threads, 7424*sizeof(double)>>>(*envs, *jk, *bounds); break; + case 18: // lij=2, lkl=0 + blocks.x = (bounds->npairs_ij + 255) / 256; + blocks.y = (bounds->npairs_kl + 255) / 256; + md_j_2_0<<<blocks, threads, 8704*sizeof(double)>>>(*envs, *jk, *bounds); break; + case 19: // lij=2, lkl=1 + blocks.x = (bounds->npairs_ij + 255) / 256; + blocks.y = (bounds->npairs_kl + 127) / 128; + md_j_2_1<<<blocks, threads, 8960*sizeof(double)>>>(*envs, *jk, *bounds); break; + case 20: // lij=2, lkl=2 + blocks.x = (bounds->npairs_ij + 255) / 256; + blocks.y = (bounds->npairs_kl + 63) / 64; + md_j_2_2<<<blocks, threads, 9216*sizeof(double)>>>(*envs, *jk, *bounds); break; + case 27: // lij=3, lkl=0 + blocks.x = (bounds->npairs_ij + 127) / 128; + blocks.y = (bounds->npairs_kl + 255) / 256; + md_j_3_0<<<blocks, threads, 8448*sizeof(double)>>>(*envs, *jk, *bounds); break; + case 28: // lij=3, lkl=1 + blocks.x = (bounds->npairs_ij + 63) / 64; + blocks.y = (bounds->npairs_kl + 255) / 256; + md_j_3_1<<<blocks, threads, 7424*sizeof(double)>>>(*envs, *jk, *bounds); break; + case 36: // lij=4, lkl=0 + blocks.x = (bounds->npairs_ij + 63) / 64; + blocks.y = (bounds->npairs_kl + 255) / 256; + md_j_4_0<<<blocks, threads, 7808*sizeof(double)>>>(*envs, *jk, *bounds); break; + default: return 0; + } + return 1; +} + +void set_md_j_unrolled_shm_size() +{ + cudaFuncSetAttribute(md_j_0_0, cudaFuncAttributeMaxDynamicSharedMemorySize, 3584*sizeof(double)); + cudaFuncSetAttribute(md_j_1_0, cudaFuncAttributeMaxDynamicSharedMemorySize, 5376*sizeof(double)); + cudaFuncSetAttribute(md_j_1_1, cudaFuncAttributeMaxDynamicSharedMemorySize, 7168*sizeof(double)); + cudaFuncSetAttribute(md_j_1_2, cudaFuncAttributeMaxDynamicSharedMemorySize, 7424*sizeof(double)); +
cudaFuncSetAttribute(md_j_2_0, cudaFuncAttributeMaxDynamicSharedMemorySize, 8704*sizeof(double)); + cudaFuncSetAttribute(md_j_2_1, cudaFuncAttributeMaxDynamicSharedMemorySize, 8960*sizeof(double)); + cudaFuncSetAttribute(md_j_2_2, cudaFuncAttributeMaxDynamicSharedMemorySize, 9216*sizeof(double)); + cudaFuncSetAttribute(md_j_3_0, cudaFuncAttributeMaxDynamicSharedMemorySize, 8448*sizeof(double)); + cudaFuncSetAttribute(md_j_3_1, cudaFuncAttributeMaxDynamicSharedMemorySize, 7424*sizeof(double)); + cudaFuncSetAttribute(md_j_4_0, cudaFuncAttributeMaxDynamicSharedMemorySize, 7808*sizeof(double)); +} diff --git a/gpu4pyscf/lib/gvhf-rys/gamma_inc.cu b/gpu4pyscf/lib/gvhf-rys/gamma_inc.cu index 3953c63b..89758bfd 100644 --- a/gpu4pyscf/lib/gvhf-rys/gamma_inc.cu +++ b/gpu4pyscf/lib/gvhf-rys/gamma_inc.cu @@ -4,10 +4,8 @@ #define SQRTPIE4 .886226925452758013 __device__ -static void eval_gamma_inc_fn(double *f, double t, int m) +static void eval_gamma_inc_fn(double *f, double t, int m, int sq_id, int block_size) { - int sq_id = threadIdx.x; - int block_size = blockDim.x; if (t < EPS_FLOAT64) { f[sq_id] = 1.; for (int i = 1; i <= m; i++) { diff --git a/gpu4pyscf/lib/gvhf-rys/gamma_inc_unrolled.cu b/gpu4pyscf/lib/gvhf-rys/gamma_inc_unrolled.cu index 17c8c570..88ba3436 100644 --- a/gpu4pyscf/lib/gvhf-rys/gamma_inc_unrolled.cu +++ b/gpu4pyscf/lib/gvhf-rys/gamma_inc_unrolled.cu @@ -6,8 +6,8 @@ __device__ static void eval_gamma_inc_fn(double *f, double t, int m) { - int t_id = threadIdx.x + blockDim.x * threadIdx.y; - int block_size = blockDim.x * blockDim.y; + int t_id = threadIdx.x + blockDim.x * threadIdx.y + blockDim.x * blockDim.y * threadIdx.z; + int block_size = blockDim.x * blockDim.y * blockDim.z; if (t < EPS_FLOAT64) { f[t_id] = 1.; for (int i = 1; i <= m; i++) { diff --git a/gpu4pyscf/lib/tests/test_cusolver.py b/gpu4pyscf/lib/tests/test_cusolver.py index e69de29b..0f4941c7 100644 --- a/gpu4pyscf/lib/tests/test_cusolver.py +++ b/gpu4pyscf/lib/tests/test_cusolver.py @@ -0,0 +1,64 @@ +# Copyright 2024 The GPU4PySCF Authors. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
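A note on set_md_j_unrolled_shm_size above: kernels that declare an `extern __shared__` buffer larger than the default 48 KB per-block limit must opt in via cudaFuncSetAttribute before they can be launched, which is why each unrolled kernel registers its exact buffer size (e.g. 7808 doubles for md_j_4_0, matching its gamma_inc/Rp/Rq/vj/dm cache layout). The same opt-in is available from Python through CuPy; the following is a minimal sketch with a made-up kernel, not code from this patch:

    import cupy as cp
    import numpy as np

    code = r'''
    extern "C" __global__ void fill(double *out, int n) {
        extern __shared__ double buf[];   // dynamic shared memory, sized at launch
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) out[i] = 1.0;
    }
    '''
    kern = cp.RawKernel(code, 'fill')
    nbytes = 7808 * 8                            # 62464 bytes, above the 48 KB default
    kern.max_dynamic_shared_size_bytes = nbytes  # CuPy's wrapper around cudaFuncSetAttribute
    out = cp.empty(256, dtype=cp.float64)
    kern((1,), (256,), (out, np.int32(256)), shared_mem=nbytes)

Without the attribute assignment, a launch requesting more than 48 KB of dynamic shared memory fails even on GPUs whose hardware supports larger allocations.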
+ +import unittest +import numpy as np +import scipy.linalg +import cupy as cp +from gpu4pyscf.lib.cusolver import eigh, cholesky + +def test_eigh_real(): + np.random.seed(6) + n = 12 + a = np.random.rand(n, n) + a = a + a.T + b = np.random.rand(n, n) + b = b.dot(b.T) + ref = scipy.linalg.eigh(a, b) + e, c = eigh(cp.asarray(a), cp.asarray(b)) + assert abs(e.get() - ref[0]).max() < 1e-10 + ovlp = c.get().T.dot(b).dot(ref[1]) + assert abs(abs(ovlp) - np.eye(n)).max() < 1e-10 + +def test_eigh_cmplx(): + np.random.seed(6) + n = 12 + a = np.random.rand(n, n) + np.random.rand(n, n) * 1j + a = a + a.conj().T + b = np.random.rand(n, n) + np.random.rand(n, n) * 1j + b = b.dot(b.conj().T) + ref = scipy.linalg.eigh(a, b) + e, c = eigh(cp.asarray(a), cp.asarray(b)) + assert abs(e.get() - ref[0]).max() < 1e-10 + ovlp = c.get().conj().T.dot(b).dot(ref[1]) + assert abs(abs(ovlp) - np.eye(n)).max() < 1e-10 + +def test_cholesky_real(): + np.random.seed(6) + n = 12 + a = np.random.rand(n, n) + a = a.dot(a.T) + ref = np.linalg.cholesky(a) + x = cholesky(cp.asarray(a)) + assert abs(x.get() - ref).max() < 1e-12 + +def test_cholesky_cmplx(): + np.random.seed(6) + n = 12 + a = np.random.rand(n, n) + np.random.rand(n, n) * 1j + a = a.dot(a.conj().T) + ref = np.linalg.cholesky(a) + x = cholesky(cp.asarray(a)) + assert abs(x.get() - ref).max() < 1e-12 diff --git a/gpu4pyscf/lib/tests/test_cutensor.py b/gpu4pyscf/lib/tests/test_cutensor.py index ca338331..3e9ef1c4 100644 --- a/gpu4pyscf/lib/tests/test_cutensor.py +++ b/gpu4pyscf/lib/tests/test_cutensor.py @@ -38,6 +38,13 @@ def test_contract(self): c_contract = contract('lkji,jk->il', a, b[10:20,10:20]) assert cupy.linalg.norm(c_einsum - c_contract) < 1e-10 + def test_complex_valued(self): + a = cupy.random.rand(10,9,11) + cupy.random.rand(10,9,11)*1j + b = cupy.random.rand(11,7,13) + cupy.random.rand(11,7,13)*1j + c_einsum = cupy.einsum('ijk,ikl->jl', a[3:9,:,4:10], b[3:9,:6, 7:13]) + c_contract = contract('ijk,ikl->jl', a[3:9,:,4:10], b[3:9,:6, 7:13]) + assert cupy.linalg.norm(c_einsum - c_contract) < 1e-10 + def test_cache(self): a = cupy.random.rand(20,20,20,20) b = cupy.random.rand(20,20) @@ -52,4 +59,4 @@ def test_cache(self): if __name__ == "__main__": print("Full tests for cutensor module") - unittest.main() \ No newline at end of file + unittest.main() diff --git a/gpu4pyscf/mp/dfmp2.py b/gpu4pyscf/mp/dfmp2.py index 753b987c..d8c3b0c2 100644 --- a/gpu4pyscf/mp/dfmp2.py +++ b/gpu4pyscf/mp/dfmp2.py @@ -100,8 +100,7 @@ def loop_ao2mo(self, mo_coeff, nocc): mo_coeff = cupy.asarray(mo_coeff, order='C') Lov = None with_df = self.with_df - ao_idx = with_df.intopt.ao_idx - mo_coeff = mo_coeff[ao_idx] + mo_coeff = with_df.intopt.sort_orbitals(mo_coeff, axis=[0]) orbo = mo_coeff[:,:nocc] orbv = mo_coeff[:,nocc:] blksize = with_df.get_blksize() diff --git a/gpu4pyscf/mp/tests/test_mp2.py b/gpu4pyscf/mp/tests/test_mp2.py index 43142fd8..9cffad01 100644 --- a/gpu4pyscf/mp/tests/test_mp2.py +++ b/gpu4pyscf/mp/tests/test_mp2.py @@ -155,4 +155,4 @@ def test_to_gpu(self): if __name__ == "__main__": print("Full Tests for mp2") - unittest.main() \ No newline at end of file + unittest.main() diff --git a/gpu4pyscf/pbc/__init__.py b/gpu4pyscf/pbc/__init__.py new file mode 100644 index 00000000..f7ec6fe8 --- /dev/null +++ b/gpu4pyscf/pbc/__init__.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. 
+# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +from gpu4pyscf.pbc import scf +from gpu4pyscf.pbc import dft diff --git a/gpu4pyscf/pbc/df/__init__.py b/gpu4pyscf/pbc/df/__init__.py new file mode 100644 index 00000000..6b9e0c3f --- /dev/null +++ b/gpu4pyscf/pbc/df/__init__.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +from . import fft +#from . import aft +#from . import df +from .fft import FFTDF +#from .df import DF, GDF +#from .aft import AFTDF + +class DF: pass # Just a placeholder diff --git a/gpu4pyscf/pbc/df/fft.py b/gpu4pyscf/pbc/df/fft.py new file mode 100644 index 00000000..f84894ac --- /dev/null +++ b/gpu4pyscf/pbc/df/fft.py @@ -0,0 +1,272 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
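The fft.py module below implements the GPW (Gaussian and plane-wave) evaluation of local potentials: assemble the relevant density in reciprocal space, multiply by the Coulomb kernel 4*pi/G**2 with the divergent G=0 term removed, and inverse-FFT back onto the uniform real-space grid; get_nuc, get_pp, and the fft_jk routines all follow this pattern. A toy NumPy sketch of that reciprocal-space Poisson step, with an invented cubic cell and mesh:

    import numpy as np

    L, n = 10.0, 32                     # hypothetical cell edge (Bohr) and mesh size
    gx = 2 * np.pi * np.fft.fftfreq(n, d=L/n)   # 1D G components of the cubic cell
    G2 = gx[:,None,None]**2 + gx[None,:,None]**2 + gx[None,None,:]**2

    rhoR = np.random.rand(n, n, n)      # stand-in for a real-space density
    rhoG = np.fft.fftn(rhoR)
    coulG = np.divide(4*np.pi, G2, out=np.zeros_like(G2), where=G2 > 0)
    vR = np.fft.ifftn(coulG * rhoG).real   # the corresponding Coulomb potential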
+ +'''GPW method''' + +import numpy as np +import cupy as cp +from pyscf import gto +from pyscf import lib +from pyscf.pbc.df import fft as fft_cpu +from pyscf.pbc.df import aft as aft_cpu +from pyscf.pbc.df.aft import _check_kpts, ft_ao +from pyscf.pbc.gto import pseudo +from pyscf.pbc.lib.kpts_helper import is_zero +from gpu4pyscf.lib import logger, utils +from gpu4pyscf.pbc import tools +from gpu4pyscf.pbc.df import fft_jk + +__all__ = [ + 'get_nuc', 'get_pp', 'get_SI', 'FFTDF' +] + +def get_nuc(mydf, kpts=None): + from gpu4pyscf.pbc.dft import numint + kpts, is_single_kpt = _check_kpts(mydf, kpts) + cell = mydf.cell + assert cell.low_dim_ft_type != 'inf_vacuum' + assert cell.dimension > 1 + mesh = mydf.mesh + charge = cp.asarray(-cell.atom_charges()) + Gv = cell.get_Gv(mesh) + SI = get_SI(cell, mesh=mesh) + rhoG = charge.dot(SI) + + coulG = tools.get_coulG(cell, mesh=mesh, Gv=Gv) + vneG = rhoG * coulG + vneR = tools.ifft(vneG, mesh).real + + nkpts = len(kpts) + nao = cell.nao + if is_zero(kpts): + vne = cp.zeros((nkpts,nao,nao)) + else: + vne = cp.zeros((nkpts,nao,nao), dtype=np.complex128) + kpts = np.asarray(kpts) + ao_ks = numint.eval_ao_kpts(cell, mydf.grids.coords, kpts) + for k, ao in enumerate(ao_ks): + vne[k] += (ao.conj().T*vneR).dot(ao) + + if is_single_kpt: + vne = vne[0] + return vne + +def get_pp(mydf, kpts=None): + '''Get the periodic pseudopotential nuc-el AO matrix, with G=0 removed. + ''' + from gpu4pyscf.pbc.dft import numint + kpts, is_single_kpt = _check_kpts(mydf, kpts) + cell = mydf.cell + assert cell.low_dim_ft_type != 'inf_vacuum' + assert cell.dimension > 1 + mesh = mydf.mesh + Gv = cell.get_Gv(mesh) + SI = get_SI(cell, mesh=mesh) + vpplocG = pseudo.get_vlocG(cell, Gv) + vpplocG = -np.einsum('ij,ij->j', SI, vpplocG) + vpplocG = cp.asarray(vpplocG) + # vpploc evaluated in real-space + vpplocR = tools.ifft(vpplocG, mesh).real + + ngrids = len(vpplocG) + nkpts = len(kpts) + nao = cell.nao + if is_zero(kpts): + vpp = cp.zeros((nkpts,nao,nao)) + else: + vpp = cp.zeros((nkpts,nao,nao), dtype=np.complex128) + kpts = np.asarray(kpts) + ao_ks = numint.eval_ao_kpts(cell, mydf.grids.coords, kpts) + for k, ao in enumerate(ao_ks): + vpp[k] += (ao.conj().T*vpplocR).dot(ao) + + # vppnonloc evaluated in reciprocal space + fakemol = gto.Mole() + fakemol._atm = np.zeros((1,gto.ATM_SLOTS), dtype=np.int32) + fakemol._bas = np.zeros((1,gto.BAS_SLOTS), dtype=np.int32) + ptr = gto.PTR_ENV_START + fakemol._env = np.zeros(ptr+10) + fakemol._bas[0,gto.NPRIM_OF ] = 1 + fakemol._bas[0,gto.NCTR_OF ] = 1 + fakemol._bas[0,gto.PTR_EXP ] = ptr+3 + fakemol._bas[0,gto.PTR_COEFF] = ptr+4 + + # buf for SPG_lmi upto l=0..3 and nl=3 + buf = np.empty((48,ngrids), dtype=np.complex128) + def vppnl_by_k(kpt): + Gk = Gv + kpt + G_rad = lib.norm(Gk, axis=1) + aokG = ft_ao.ft_ao(cell, Gv, kpt=kpt) * (1/cell.vol)**.5 + vppnl = 0 + for ia in range(cell.natm): + symb = cell.atom_symbol(ia) + if symb not in cell._pseudo: + continue + pp = cell._pseudo[symb] + p1 = 0 + for l, proj in enumerate(pp[5:]): + rl, nl, hl = proj + if nl > 0: + fakemol._bas[0,gto.ANG_OF] = l + fakemol._env[ptr+3] = .5*rl**2 + fakemol._env[ptr+4] = rl**(l+1.5)*np.pi**1.25 + pYlm_part = fakemol.eval_gto('GTOval', Gk) + + p0, p1 = p1, p1+nl*(l*2+1) + # pYlm is real, SI[ia] is complex + pYlm = np.ndarray((nl,l*2+1,ngrids), dtype=np.complex128, buffer=buf[p0:p1]) + for k in range(nl): + qkl = pseudo.pp._qli(G_rad*rl, l, k) + pYlm[k] = pYlm_part.T * qkl + #:SPG_lmi = np.einsum('g,nmg->nmg', SI[ia].conj(), pYlm) + #:SPG_lm_aoG = 
np.einsum('nmg,gp->nmp', SPG_lmi, aokG) + #:tmp = np.einsum('ij,jmp->imp', hl, SPG_lm_aoG) + #:vppnl += np.einsum('imp,imq->pq', SPG_lm_aoG.conj(), tmp) + if p1 > 0: + SPG_lmi = buf[:p1] + SPG_lmi *= SI[ia].conj() + SPG_lm_aoGs = lib.zdot(SPG_lmi, aokG) + p1 = 0 + for l, proj in enumerate(pp[5:]): + rl, nl, hl = proj + if nl > 0: + p0, p1 = p1, p1+nl*(l*2+1) + hl = np.asarray(hl) + SPG_lm_aoG = SPG_lm_aoGs[p0:p1].reshape(nl,l*2+1,-1) + tmp = np.einsum('ij,jmp->imp', hl, SPG_lm_aoG) + vppnl += np.einsum('imp,imq->pq', SPG_lm_aoG.conj(), tmp) + return vppnl * (1./cell.vol) + + for k, kpt in enumerate(kpts): + vppnl = vppnl_by_k(kpt) + if is_zero(kpt): + vpp[k] += cp.asarray(vppnl.real) + else: + vpp[k] += cp.asarray(vppnl) + + if is_single_kpt: + vpp = vpp[0] + return vpp + +def get_SI(cell, Gv=None, mesh=None, atmlst=None): + '''Calculate the structure factor (0D, 1D, 2D, 3D) for all atoms; see MH (3.34). + + Args: + cell : instance of :class:`Cell` + + Gv : (N,3) array + G vectors + + atmlst : list of ints, optional + Indices of atoms for which the structure factors are computed. + + Returns: + SI : (natm, ngrids) ndarray, dtype=np.complex128 + The structure factor for each atom at each G-vector. + ''' + coords = cp.asarray(cell.atom_coords()) + if atmlst is not None: + coords = coords[np.asarray(atmlst)] + if Gv is None: + if mesh is None: + mesh = cell.mesh + basex, basey, basez = cell.get_Gv_weights(mesh)[1] + basex = cp.asarray(basex) + basey = cp.asarray(basey) + basez = cp.asarray(basez) + b = cp.asarray(cell.reciprocal_vectors()) + rb = coords.dot(b.T) + SIx = cp.exp(-1j*rb[:,0,None] * basex) + SIy = cp.exp(-1j*rb[:,1,None] * basey) + SIz = cp.exp(-1j*rb[:,2,None] * basez) + SI = SIx[:,:,None,None] * SIy[:,None,:,None] * SIz[:,None,None,:] + natm = coords.shape[0] + SI = SI.reshape(natm, -1) + else: + SI = cp.exp(-1j*coords.dot(cp.asarray(Gv).T)) + return SI + + +class FFTDF(lib.StreamObject): + '''Density expansion on plane waves (GPW method) + ''' + + blockdim = 240 + + _keys = fft_cpu.FFTDF._keys + + def __init__(self, cell, kpts=np.zeros((1,3))): + from gpu4pyscf.pbc.dft import gen_grid + from gpu4pyscf.pbc.dft import numint + self.cell = cell + self.stdout = cell.stdout + self.verbose = cell.verbose + self.max_memory = cell.max_memory + self.kpts = kpts + self.grids = gen_grid.UniformGrids(cell) + + # The following attributes are not input options. + # self.exxdiv has no effects. It was set in the get_k_kpts function to + # mimic the KRHF/KUHF object in the call to tools.get_coulG. 
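+ # (The exchange G=0 divergence itself is handled in fft_jk.get_k_kpts + # through its exxdiv argument, e.g. the 'ewald' probe-charge correction.)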
+ self.exxdiv = None + self._numint = numint.KNumInt() + self._rsh_df = {} # Range separated Coulomb DF objects + + mesh = fft_cpu.FFTDF.mesh + dump_flags = fft_cpu.FFTDF.dump_flags + check_sanity = fft_cpu.FFTDF.check_sanity + build = fft_cpu.FFTDF.build + reset = fft_cpu.FFTDF.reset + + aoR_loop = NotImplemented + + get_pp = get_pp + get_nuc = get_nuc + + def get_jk(self, dm, hermi=1, kpts=None, kpts_band=None, + with_j=True, with_k=True, omega=None, exxdiv=None): + if omega is not None: # J/K for RSH functionals + with self.range_coulomb(omega) as rsh_df: + return rsh_df.get_jk(dm, hermi, kpts, kpts_band, with_j, with_k, + omega=None, exxdiv=exxdiv) + + kpts, is_single_kpt = _check_kpts(self, kpts) + if is_single_kpt: + vj, vk = fft_jk.get_jk(self, dm, hermi, kpts[0], kpts_band, + with_j, with_k, exxdiv) + else: + vj = vk = None + if with_k: + vk = fft_jk.get_k_kpts(self, dm, hermi, kpts, kpts_band, exxdiv) + if with_j: + vj = fft_jk.get_j_kpts(self, dm, hermi, kpts, kpts_band) + return vj, vk + + get_eri = get_ao_eri = NotImplemented + ao2mo = get_mo_eri = NotImplemented + ao2mo_7d = NotImplemented + get_ao_pairs_G = get_ao_pairs = NotImplemented + get_mo_pairs_G = get_mo_pairs = NotImplemented + + range_coulomb = aft_cpu.AFTDF.range_coulomb + + to_gpu = utils.to_gpu + device = utils.device + + def to_cpu(self): + obj = utils.to_cpu(self) + return obj.reset() diff --git a/gpu4pyscf/pbc/df/fft_jk.py b/gpu4pyscf/pbc/df/fft_jk.py new file mode 100644 index 00000000..31e9a5d7 --- /dev/null +++ b/gpu4pyscf/pbc/df/fft_jk.py @@ -0,0 +1,346 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +''' +JK with GPW +''' + +import numpy as np +import cupy as cp +from pyscf import lib +from pyscf.pbc.lib.kpts_helper import is_zero, member +from pyscf.pbc.df.df_jk import _format_dms, _format_kpts_band, _format_jks +from gpu4pyscf.lib import logger +from gpu4pyscf.lib.cupy_helper import contract +from gpu4pyscf.pbc import tools + +__all__ = [ + 'get_j_kpts', 'get_k_kpts', 'get_jk', 'get_j', 'get_k', + 'get_j_e1_kpts', 'get_k_e1_kpts' +] + +def get_j_kpts(mydf, dm_kpts, hermi=1, kpts=np.zeros((1,3)), kpts_band=None): + '''Get the Coulomb (J) AO matrix at sampled k-points. + + Args: + dm_kpts : (nkpts, nao, nao) ndarray or a list of (nkpts,nao,nao) ndarray + Density matrix at each k-point. If a list of k-point DMs, e.g., + UHF alpha and beta DM, the alpha and beta DMs are contracted + separately. + kpts : (nkpts, 3) ndarray + + Kwargs: + kpts_band : (3,) ndarray or (*,3) ndarray + A list of arbitrary "band" k-points at which to evaluate the matrix.
+ + Returns: + vj : (nkpts, nao, nao) ndarray + or a list of vj if the input dm_kpts is a list of DMs + ''' + cell = mydf.cell + mesh = mydf.mesh + assert cell.low_dim_ft_type != 'inf_vacuum' + assert cell.dimension > 1 + + ni = mydf._numint + dm_kpts = cp.asarray(dm_kpts, order='C') + dms = _format_dms(dm_kpts, kpts) + nset, nkpts, nao = dms.shape[:3] + + coulG = tools.get_coulG(cell, mesh=mesh) + ngrids = len(coulG) + + if hermi == 1 or is_zero(kpts): + vR = cp.zeros((nset,ngrids)) + ao_ks = ni.eval_ao(cell, mydf.grids.coords, kpts) + for i in range(nset): + rhoR = ni.eval_rho(cell, ao_ks, dms[i], hermi=hermi).real + rhoG = tools.fft(rhoR, mesh) + vG = coulG * rhoG + vR[i] = tools.ifft(vG, mesh).real + else: + vR = cp.zeros((nset,ngrids), dtype=np.complex128) + ao_ks = ni.eval_ao(cell, mydf.grids.coords, kpts) + for i in range(nset): + rhoR = ni.eval_rho(cell, ao_ks, dms[i], hermi=hermi) + rhoG = tools.fft(rhoR, mesh) + vG = coulG * rhoG + vR[i] = tools.ifft(vG, mesh) + + vR *= cell.vol / ngrids + kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band + nband = len(kpts_band) + if is_zero(kpts_band): + vj_kpts = cp.zeros((nset,nband,nao,nao)) + else: + vj_kpts = cp.zeros((nset,nband,nao,nao), dtype=np.complex128) + + if input_band is not None: + ao_ks = ni.eval_ao(cell, mydf.grids.coords, kpts_band) + for k, ao in enumerate(ao_ks): + for i in range(nset): + aow = ao * vR[i,:,None] + vj_kpts[i,k] += ao.conj().T.dot(aow) + + return _format_jks(vj_kpts, dm_kpts, input_band, kpts) + +def get_k_kpts(mydf, dm_kpts, hermi=1, kpts=np.zeros((1,3)), kpts_band=None, + exxdiv=None): + '''Get the exchange (K) AO matrix at sampled k-points. + + Args: + dm_kpts : (nkpts, nao, nao) ndarray + Density matrix at each k-point + kpts : (nkpts, 3) ndarray + + Kwargs: + hermi : int + Whether K matrix is hermitian + + | 0 : not hermitian and not symmetric + | 1 : hermitian + + kpts_band : (3,) ndarray or (*,3) ndarray + A list of arbitrary "band" k-points at which to evaluate the matrix.
+ + Returns: + vk : (nkpts, nao, nao) ndarray + or a list of vk if the input dm_kpts is a list of DMs + ''' + cell = mydf.cell + mesh = mydf.mesh + assert cell.low_dim_ft_type != 'inf_vacuum' + assert cell.dimension > 1 + coords = mydf.grids.coords + ngrids = coords.shape[0] + + if getattr(dm_kpts, 'mo_coeff', None) is not None: + mo_coeff = dm_kpts.mo_coeff + mo_occ = dm_kpts.mo_occ + else: + mo_coeff = None + + ni = mydf._numint + kpts = np.asarray(kpts) + dm_kpts = cp.asarray(dm_kpts, order='C') + dms = _format_dms(dm_kpts, kpts) + nset, nkpts, nao = dms.shape[:3] + + weight = 1./nkpts * (cell.vol/ngrids) + + kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band + nband = len(kpts_band) + + if is_zero(kpts_band) and is_zero(kpts): + vk_kpts = cp.zeros((nset,nband,nao,nao), dtype=dms.dtype) + else: + vk_kpts = cp.zeros((nset,nband,nao,nao), dtype=np.complex128) + + ao2_kpts = ni.eval_ao(cell, coords, kpts=kpts) + if input_band is None: + ao1_kpts = ao2_kpts + else: + ao1_kpts = ni.eval_ao(cell, coords, kpts=kpts_band) + + if mo_coeff is not None and nset == 1: + mo2_kpts = [ + ao.dot(mo[:,occ>0] * occ[occ>0]**.5) + for occ, mo, ao in zip(mo_occ, mo_coeff, ao2_kpts)] + ao2_kpts = mo2_kpts + else: + mo2_kpts = None + + vR_dm = cp.empty((nset,nao,ngrids), dtype=vk_kpts.dtype) + blksize = 32 + + for k2, ao2 in enumerate(ao2_kpts): + ao2T = ao2.T + kpt2 = kpts[k2] + naoj = ao2.shape[1] + if mo2_kpts is None: + ao_dms = [dms[i,k2].dot(ao2T.conj()) for i in range(nset)] + else: + ao_dms = [ao2T.conj()] + + for k1, ao1 in enumerate(ao1_kpts): + ao1T = ao1.T + kpt1 = kpts_band[k1] + + # If we have an ewald exxdiv, we add the G=0 correction near the + # end of the function to bypass any discretization errors + # that arise from the FFT. + if exxdiv == 'ewald': + coulG = tools.get_coulG(cell, kpt2-kpt1, False, mydf, mesh) + else: + coulG = tools.get_coulG(cell, kpt2-kpt1, exxdiv, mydf, mesh) + if is_zero(kpt1-kpt2): + expmikr = cp.array(1.) + else: + expmikr = cp.exp(-1j * coords.dot(cp.asarray(kpt2-kpt1))) + + for p0, p1 in lib.prange(0, nao, blksize): + rho1 = contract('ig,jg->ijg', ao1T[p0:p1].conj()*expmikr, ao2T) + vG = tools.fft(rho1.reshape(-1,ngrids), mesh) + rho1 = None + vG *= coulG + vR = tools.ifft(vG, mesh).reshape(p1-p0,naoj,ngrids) + vG = None + if vk_kpts.dtype == np.double: + vR = vR.real + for i in range(nset): + vR_dm[i,p0:p1] = contract('ijg,jg->ig', vR, ao_dms[i]) + vR = None + vR_dm *= expmikr.conj() + + for i in range(nset): + vk_kpts[i,k1] += weight * vR_dm[i].dot(ao1) + + # Call _ewald_exxdiv_for_G0 to add the G=0 component back into vk_kpts. + # Note in the _ewald_exxdiv_for_G0 implementation, the G=0 treatments are + # different for 1D/2D and 3D systems. The special treatments for 1D and 2D + # can only be used with the AFTDF/GDF/MDF methods. In the FFTDF method, 1D, + # 2D and 3D systems should all use the ewald probe-charge correction. + if exxdiv == 'ewald': + vk_kpts = _ewald_exxdiv_for_G0(cell, kpts, dms, vk_kpts, kpts_band=kpts_band) + + return _format_jks(vk_kpts, dm_kpts, input_band, kpts) + +def get_jk(mydf, dm, hermi=1, kpt=np.zeros(3), kpts_band=None, + with_j=True, with_k=True, exxdiv=None): + '''Get the Coulomb (J) and exchange (K) AO matrices for the given density matrix.
+ + Args: + dm : ndarray or list of ndarrays + A density matrix or a list of density matrices + + Kwargs: + hermi : int + Whether the J, K matrices are hermitian + | 0 : not hermitian and not symmetric + | 1 : hermitian + | 2 : anti-hermitian + kpt : (3,) ndarray + The "inner" dummy k-point at which the DM was evaluated (or + sampled). + kpts_band : (3,) ndarray or (*,3) ndarray + The "outer" primary k-point at which J and K are evaluated. + + Returns: + The function returns one J and one K matrix, corresponding to the input + density matrix (both order and shape). + ''' + dm = cp.asarray(dm, order='C') + vj = vk = None + if with_j: + vj = get_j(mydf, dm, hermi, kpt, kpts_band) + if with_k: + vk = get_k(mydf, dm, hermi, kpt, kpts_band, exxdiv) + return vj, vk + +def get_j(mydf, dm, hermi=1, kpt=np.zeros(3), kpts_band=None): + '''Get the Coulomb (J) AO matrix for the given density matrix. + + Args: + dm : ndarray or list of ndarrays + A density matrix or a list of density matrices + + Kwargs: + hermi : int + Whether the J matrix is hermitian + | 0 : not hermitian and not symmetric + | 1 : hermitian + | 2 : anti-hermitian + kpt : (3,) ndarray + The "inner" dummy k-point at which the DM was evaluated (or + sampled). + kpts_band : (3,) ndarray or (*,3) ndarray + The "outer" primary k-point at which J is evaluated. + + Returns: + The function returns one J matrix, corresponding to the input + density matrix (both order and shape). + ''' + dm = cp.asarray(dm, order='C') + nao = dm.shape[-1] + dm_kpts = dm.reshape(-1,1,nao,nao) + vj = get_j_kpts(mydf, dm_kpts, hermi, kpt.reshape(1,3), kpts_band) + if kpts_band is None: + vj = vj[:,0,:,:] + if dm.ndim == 2: + vj = vj[0] + return vj + + +def get_k(mydf, dm, hermi=1, kpt=np.zeros(3), kpts_band=None, exxdiv=None): + '''Get the exchange (K) AO matrix for the given density matrix. + + Args: + dm : ndarray or list of ndarrays + A density matrix or a list of density matrices + + Kwargs: + hermi : int + Whether the K matrix is hermitian + | 0 : not hermitian and not symmetric + | 1 : hermitian + | 2 : anti-hermitian + kpt : (3,) ndarray + The "inner" dummy k-point at which the DM was evaluated (or + sampled). + kpts_band : (3,) ndarray or (*,3) ndarray + The "outer" primary k-point at which K is evaluated. + + Returns: + The function returns one K matrix, corresponding to the input + density matrix (both order and shape).
+ ''' + dm = cp.asarray(dm, order='C') + nao = dm.shape[-1] + dm_kpts = dm.reshape(-1,1,nao,nao) + vk = get_k_kpts(mydf, dm_kpts, hermi, kpt.reshape(1,3), kpts_band, exxdiv) + if kpts_band is None: + vk = vk[:,0,:,:] + if dm.ndim == 2: + vk = vk[0] + return vk + +get_j_e1_kpts = NotImplemented +get_k_e1_kpts = NotImplemented + +def _ewald_exxdiv_for_G0(cell, kpts, dms, vk, kpts_band=None): + from pyscf.pbc.tools.pbc import madelung + s = cp.asarray(cell.pbc_intor('int1e_ovlp', hermi=1, kpts=kpts)) + m = madelung(cell, kpts) + if kpts is None: + for i,dm in enumerate(dms): + vk[i] += m * s.dot(dm).dot(s) + elif np.shape(kpts) == (3,): + if kpts_band is None or is_zero(kpts_band-kpts): + for i,dm in enumerate(dms): + vk[i] += m * s.dot(dm).dot(s) + + elif kpts_band is None or np.array_equal(kpts, kpts_band): + for k in range(len(kpts)): + for i,dm in enumerate(dms): + vk[i,k] += m * s[k].dot(dm[k]).dot(s[k]) + else: + for k, kpt in enumerate(kpts): + for kp in member(kpt, kpts_band.reshape(-1,3)): + for i,dm in enumerate(dms): + vk[i,kp] += m * s[k].dot(dm[k]).dot(s[k]) + return vk diff --git a/gpu4pyscf/pbc/dft/__init__.py b/gpu4pyscf/pbc/dft/__init__.py new file mode 100644 index 00000000..1de0a907 --- /dev/null +++ b/gpu4pyscf/pbc/dft/__init__.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +'''Kohn-Sham DFT for periodic systems +''' + +from .gen_grid import UniformGrids, BeckeGrids +from . import rks +#from . import uks +#from . import krks +#from . import kuks +from .rks import KohnShamDFT + +RKS = rks.RKS +#UKS = uks.UKS +#KRKS = krks.KRKS +#KUKS = kuks.KUKS diff --git a/gpu4pyscf/pbc/dft/gen_grid.py b/gpu4pyscf/pbc/dft/gen_grid.py new file mode 100644 index 00000000..af1c40b4 --- /dev/null +++ b/gpu4pyscf/pbc/dft/gen_grid.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
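For reference, the single-k-point branch of `_ewald_exxdiv_for_G0` reduces to one rank-preserving update per density matrix; a sketch under the same assumptions, where `s` is the AO overlap and `m` the Madelung constant from `pyscf.pbc.tools.pbc.madelung`:

```python
import numpy as np

def add_ewald_g0(vk, dm, s, m):
    # Probe-charge correction: each K matrix gains m * S D S, restoring the
    # G=0 term that the exxdiv='ewald' branch deferred inside get_k_kpts.
    return vk + m * s.dot(dm).dot(s)
```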
+ +import ctypes +import numpy as np +import cupy as cp +from pyscf import lib +from pyscf.lib import logger +from pyscf.pbc.dft import gen_grid as gen_grid_cpu +from pyscf.pbc.gto.cell import get_uniform_grids +from gpu4pyscf.lib import utils + +class UniformGrids(lib.StreamObject): + '''Uniform Grid class.''' + + def __init__(self, cell): + self.cell = cell + self.stdout = cell.stdout + self.verbose = cell.verbose + self.mesh = cell.mesh + self.non0tab = None + self._coords = None + self._weights = None + + @property + def coords(self): + if self._coords is not None: + return self._coords + else: + return cp.asarray(get_uniform_grids(self.cell, self.mesh)) + @coords.setter + def coords(self, x): + self._coords = x + + @property + def weights(self): + if self._weights is not None: + return self._weights + else: + ngrids = np.prod(self.mesh) + weights = cp.empty(ngrids) + weights[:] = self.cell.vol / ngrids + return weights + @weights.setter + def weights(self, x): + self._weights = x + + @property + def size(self): + return np.prod(self.mesh) + + reset = gen_grid_cpu.UniformGrids.reset + build = gen_grid_cpu.UniformGrids.build + dump_flags = gen_grid_cpu.UniformGrids.dump_flags + kernel = gen_grid_cpu.UniformGrids.kernel + + to_gpu = utils.to_gpu + device = utils.device + + def to_cpu(self): + obj = utils.to_cpu(self) + return obj.reset() + +class BeckeGrids: + pass diff --git a/gpu4pyscf/pbc/dft/numint.py b/gpu4pyscf/pbc/dft/numint.py new file mode 100644 index 00000000..7ecf6202 --- /dev/null +++ b/gpu4pyscf/pbc/dft/numint.py @@ -0,0 +1,433 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import numpy as np +import cupy as cp +from pyscf import lib +from pyscf.pbc.lib.kpts_helper import is_zero +from pyscf.pbc.df.df_jk import _format_dms, _format_kpts_band, _format_jks +from pyscf.pbc.dft import numint as numint_cpu +from pyscf.dft.gen_grid import CUTOFF +from pyscf.pbc.lib.kpts import KPoints +from gpu4pyscf.dft import numint +from gpu4pyscf.lib.cupy_helper import transpose_sum, contract, get_avail_mem +from gpu4pyscf.lib import utils + +MIN_BLK_SIZE = numint.MIN_BLK_SIZE +ALIGNED = numint.ALIGNED + +def eval_ao(cell, coords, kpt=np.zeros(3), deriv=0, relativity=0, shls_slice=None, + non0tab=None, cutoff=None, out=None, verbose=None): + '''Collocate AO crystal orbitals (opt. gradients) on the real-space grid. + + Args: + cell : instance of :class:`Cell` + + coords : (nx*ny*nz, 3) ndarray + The real-space grid point coordinates. + + Kwargs: + kpt : (3,) ndarray + The k-point corresponding to the crystal AO. + deriv : int + AO derivative order. It affects the shape of the return array. + If deriv=0, the returned AO values are stored in a (N,nao) array. + Otherwise the AO values are stored in an array of shape (M,N,nao). 
+ Here N is the number of grids, nao is the number of AO functions, + M is the size associated to the derivative deriv. + + Returns: + aoR : ([4,] nx*ny*nz, nao=cell.nao_nr()) ndarray + The value of the AO crystal orbitals on the real-space grid by default. + If deriv=1, also contains the value of the orbitals gradient in the + x, y, and z directions. It can be either complex or float array, + depending on the kpt argument. If kpt is not given (gamma point), + aoR is a float array. + ''' + ao_kpts = eval_ao_kpts(cell, coords, np.reshape(kpt, (-1,3)), deriv) + return ao_kpts[0] + +def eval_ao_kpts(cell, coords, kpts=None, deriv=0, relativity=0, + shls_slice=None, non0tab=None, cutoff=None, out=None, verbose=None): + ''' + Returns: + ao_kpts: (nkpts, [comp], ngrids, nao) ndarray + AO values at each k-point + ''' + return [cp.asarray(ao) for ao in numint_cpu.eval_ao_kpts(cell, coords.get(), kpts, deriv)] + + +def eval_rho(cell, ao, dm, non0tab=None, xctype='LDA', hermi=0, with_lapl=False, + verbose=None): + '''Collocate the density (opt. gradients) on the real-space grid. + + Args: + cell : instance of :class:`Mole` or :class:`Cell` + + ao : ([4,] nx*ny*nz, nao=cell.nao_nr()) ndarray + The value of the AO crystal orbitals on the real-space grid by default. + If xctype='GGA', also contains the value of the gradient in the x, y, + and z directions. + + Returns: + rho : ([4,] nx*ny*nz) ndarray + The value of the density on the real-space grid. If xctype='GGA', + also contains the value of the gradient in the x, y, and z + directions. + + See Also: + pyscf.dft.numint.eval_rho + + ''' + if np.iscomplexobj(ao) or np.iscomplexobj(dm): + ngrids, nao = ao.shape[-2:] + ao_loc = cell.ao_loc_nr() + assert nao == ao_loc[-1] + dm = cp.asarray(dm, dtype=np.complex128) + + if hermi == 1: + def dot_bra(bra, aodm): + rho = contract('pi,pi->p', bra.real, aodm.real) + rho += contract('pi,pi->p', bra.imag, aodm.imag) + return rho + dtype = np.float64 + else: + def dot_bra(bra, aodm): + return contract('pi,pi->p', bra.conj(), aodm) + dtype = np.complex128 + + if xctype == 'LDA' or xctype == 'HF': + c0 = ao.dot(dm) + rho = dot_bra(ao, c0) + + elif xctype == 'GGA': + rho = cp.empty((4,ngrids), dtype=dtype) + c0 = ao[0].dot(dm) + rho[0] = dot_bra(ao[0], c0) + for i in range(1, 4): + rho[i] = dot_bra(ao[i], c0) + if hermi == 1: + rho[1:4] *= 2 + else: + c1 = ao[0].dot(dm.conj().T) + for i in range(1, 4): + rho[i] += dot_bra(c1, ao[i]) + + else: # MGGA + assert not with_lapl + rho = cp.empty((5,ngrids), dtype=dtype) + tau_idx = 4 + c0 = ao[0].dot(dm) + rho[0] = dot_bra(ao[0], c0) + rho[tau_idx] = 0 + for i in range(1, 4): + c1 = ao[i].dot(dm) + rho[tau_idx] += dot_bra(ao[i], c1) + rho[i] = dot_bra(ao[i], c0) + if hermi == 1: + rho[i] *= 2 + else: + rho[i] += dot_bra(ao[0], c1) + rho[tau_idx] *= .5 + else: + # real orbitals and real DM + # TODO: call numint.eval_rho. 
However, the structure of ao is not compatible + # rho = numint.eval_rho(cell, ao, dm, non0tab, xctype, hermi, with_lapl, verbose) + ngrids, nao = ao.shape[-2:] + ao_loc = cell.ao_loc_nr() + assert nao == ao_loc[-1] + + def dot_bra(bra, aodm): + return contract('pi,pi->p', bra, aodm) + + if xctype == 'LDA' or xctype == 'HF': + c0 = ao.dot(dm) + rho = dot_bra(ao, c0) + + elif xctype == 'GGA': + rho = cp.empty((4,ngrids)) + c0 = ao[0].dot(dm) + rho[0] = dot_bra(ao[0], c0) + for i in range(1, 4): + rho[i] = dot_bra(ao[i], c0) + if hermi == 1: + rho[1:4] *= 2 + else: + c1 = ao[0].dot(dm.T) + for i in range(1, 4): + rho[i] += dot_bra(c1, ao[i]) + + else: # MGGA + assert not with_lapl + rho = cp.empty((5,ngrids)) + tau_idx = 4 + c0 = ao[0].dot(dm) + rho[0] = dot_bra(ao[0], c0) + rho[tau_idx] = 0 + for i in range(1, 4): + c1 = ao[i].dot(dm) + rho[tau_idx] += dot_bra(ao[i], c1) + rho[i] = dot_bra(ao[i], c0) + if hermi == 1: + rho[i] *= 2 + else: + rho[i] += dot_bra(ao[0], c1) + rho[tau_idx] *= .5 + return rho + +nr_uks_vxc = nr_uks = NotImplemented +nr_nlc_vxc = NotImplemented +nr_rks_fxc = NotImplemented +nr_rks_fxc_st = NotImplemented +nr_uks_fxc = NotImplemented +cache_xc_kernel = NotImplemented +cache_xc_kernel1 = NotImplemented + + +def get_rho(ni, cell, dm, grids, kpts=np.zeros((1,3)), max_memory=2000): + '''Density in real space + ''' + assert dm.ndim == 2 or dm.shape[0] == 1 + rho = cp.empty(grids.size) + nao = cell.nao + p1 = 0 + for ao_k1, ao_k2, mask, weight, coords \ + in ni.block_loop(cell, grids, nao, 0, kpts, None, max_memory): + p0, p1 = p1, p1 + weight.size + rho[p0:p1] = ni.eval_rho(cell, ao_k1, dm, xctype='LDA', hermi=1) + return rho + +def _scale_ao(ao, wv, out=None): + # TODO: reuse gpu4pyscf.dft.numint._scale_ao + if wv.ndim == 1: + return ao * wv[:,None] + else: + return contract('ngi,ng->gi', ao, wv) + +def _tau_dot(bra, ket, wv): + '''1/2 ''' + # TODO: reuse gpu4pyscf.dft.numint._tau_dot + wv = .5 * wv + mat = bra[1].conj().T.dot(_scale_ao(ket[1], wv)) + mat += bra[2].conj().T.dot(_scale_ao(ket[2], wv)) + mat += bra[3].conj().T.dot(_scale_ao(ket[3], wv)) + return mat + +class NumInt(lib.StreamObject, numint.LibXCMixin): + '''Generalization of pyscf's NumInt class for a single k-point shift and + periodic images. 
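The hermi=1 shortcut used in both `eval_rho` branches is worth spelling out: for a symmetric density matrix, each gradient component needs only one contraction, doubled. A small NumPy sketch of the real GGA branch (hypothetical function name):

```python
import numpy as np

def eval_rho_gga(ao, dm):
    # ao: (4, ngrids, nao) AO values plus x,y,z gradients; dm: real symmetric
    ngrids = ao.shape[1]
    c0 = ao[0].dot(dm)                            # (ngrids, nao)
    rho = np.empty((4, ngrids))
    rho[0] = np.einsum('pi,pi->p', ao[0], c0)     # density
    for i in range(1, 4):
        # hermi=1: <d_mu|D|nu> + <mu|D|d_nu> collapses to 2*<d_mu|D|nu>
        rho[i] = 2 * np.einsum('pi,pi->p', ao[i], c0)
    return rho
```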
+ ''' + + get_vxc = nr_vxc = numint_cpu.NumInt.nr_vxc + + def nr_rks(self, cell, grids, xc_code, dms, relativity=0, hermi=1, + kpt=None, kpts_band=None, max_memory=2000, verbose=None): + if kpt is None: + kpt = np.zeros(3) + xctype = self._xc_type(xc_code) + if xctype == 'LDA': + ao_deriv = 0 + nvar = 1 + elif xctype == 'GGA': + ao_deriv = 1 + nvar = 4 + elif xctype == 'MGGA': + ao_deriv = 1 + nvar = 5 + elif xctype == 'HF': + return 0, 0, cp.zeros_like(dms) + else: + raise NotImplementedError(f'nr_rks for functional {xc_code}') + + dms = cp.asarray(dms) + dm_shape = dms.shape + nao = dm_shape[-1] + dms = dms.reshape(nao,nao) + ngrids = grids.size + + rho = cp.empty([nvar,ngrids]) + p0 = p1 = 0 + for ao_ks, weight, coords \ + in self.block_loop(cell, grids, ao_deriv, kpt=kpt): + p0, p1 = p1, p1 + weight.size + rho[:,p0:p1] = eval_rho(cell, ao_ks[0], dms, xctype=xctype, hermi=hermi) + + if xctype == 'LDA': + exc, vxc = self.eval_xc_eff(xc_code, rho[0], deriv=1, xctype=xctype)[:2] + else: + exc, vxc = self.eval_xc_eff(xc_code, rho, deriv=1, xctype=xctype)[:2] + den = rho[0] * grids.weights + nelec = den.sum() + excsum = cp.sum(den * exc[:,0]) + + wv = vxc * grids.weights + # *.5 for v+v.conj().T at the end + if xctype == 'GGA': + wv[0] *= .5 + elif xctype == 'MGGA': + wv[[0,4]] *= .5 + + kpts_band, input_band = _format_kpts_band(kpts_band, kpt), kpts_band + nband = len(kpts_band) + if is_zero(kpts_band): + vmat = cp.zeros((nband, nao, nao)) + else: + vmat = cp.zeros((nband, nao, nao), dtype=np.complex128) + v_hermi = 1 # the output matrix must be hermitian + p0 = p1 = 0 + for ao_ks, weight, coords \ + in self.block_loop(cell, grids, ao_deriv, kpts_band=kpts_band): + p0, p1 = p1, p1 + weight.size + for k, ao in enumerate(ao_ks): + if xctype == 'LDA': + aow = _scale_ao(ao, wv[0,p0:p1]) + vmat[k] += ao.conj().T.dot(aow) + elif xctype == 'GGA': + aow = _scale_ao(ao[:4], wv[:4,p0:p1]) + vmat[k] += ao[0].conj().T.dot(aow) + elif xctype == 'MGGA': + aow = _scale_ao(ao[:4], wv[:4,p0:p1]) + vmat[k] += ao[0].conj().T.dot(aow) + vmat[k] += _tau_dot(ao, ao, wv[4,p0:p1]) + + if v_hermi and xctype != 'LDA': + vmat = vmat + vmat.transpose(0, 2, 1).conj() + if input_band is None: + vmat = vmat[0] + return nelec, excsum, vmat + + def nr_uks(self, cell, grids, xc_code, dms, relativity=0, hermi=1, + kpt=None, kpts_band=None, max_memory=2000, verbose=None): + raise NotImplementedError + + def block_loop(self, cell, grids, deriv=0, kpt=None, kpts_band=None): + '''Define this macro to loop over grids by blocks. 
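The `wv[0] *= .5` scaling in `nr_rks` above pairs with the final `vmat + vmat†` symmetrization; a compact sketch of that convention for the GGA case (hypothetical helper, real orbitals assumed):

```python
import numpy as np

def gga_vxc_matrix(ao, wv):
    # ao: (4, ngrids, nao); wv: (4, ngrids) weighted XC potential, with the
    # rho-term already halved exactly as prepared in nr_rks above.
    aow = np.einsum('ngi,ng->gi', ao, wv)   # the _scale_ao pattern
    v = ao[0].T.dot(aow)
    return v + v.T                          # v + v^H restores the halved term
```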
+ ''' + nao = cell.nao + grids_coords = grids.coords + grids_weights = grids.weights + ngrids = grids_coords.shape[0] + comp = (deriv+1)*(deriv+2)*(deriv+3)//6 + + #cupy.get_default_memory_pool().free_all_blocks() + mem_avail = get_avail_mem() + blksize = int((mem_avail*.2/8/((comp+1)*nao))/ ALIGNED) * ALIGNED + blksize = min(blksize, MIN_BLK_SIZE) + if blksize < ALIGNED: + raise RuntimeError('Not enough GPU memory') + + if kpts_band is None: + if kpt is None: + kpts = np.zeros((1, 3)) + else: + kpts = np.reshape(kpt, (1, 3)) + elif kpt is None: + kpts = np.reshape(kpts_band, (-1, 3)) + else: + raise RuntimeError('Cannot produce AOs for kpt and kpts_band in the same run') + + for ip0, ip1 in lib.prange(0, ngrids, blksize): + coords = grids_coords[ip0:ip1] + weight = grids_weights[ip0:ip1] + ao_ks = eval_ao_kpts(cell, coords, kpts, deriv=deriv) + yield ao_ks, weight, coords + ao_ks = None + + eval_xc_eff = numint.eval_xc_eff + _init_xcfuns = numint.NumInt._init_xcfuns + + get_fxc = nr_fxc = numint_cpu.NumInt.nr_fxc + nr_rks_fxc = nr_rks_fxc + nr_uks_fxc = nr_uks_fxc + nr_rks_fxc_st = nr_rks_fxc_st + nr_nlc_vxc = nr_nlc_vxc + cache_xc_kernel = cache_xc_kernel + cache_xc_kernel1 = cache_xc_kernel1 + get_rho = get_rho + + eval_ao = staticmethod(eval_ao) + eval_rho = staticmethod(eval_rho) + eval_rho2 = NotImplemented + eval_rho1 = NotImplemented + + to_gpu = utils.to_gpu + device = utils.device + + def to_cpu(self): + return numint_cpu.NumInt() + +_NumInt = NumInt + + +class KNumInt(lib.StreamObject, numint.LibXCMixin): + '''Generalization of pyscf's NumInt class for k-point sampling and + periodic images. + ''' + def __init__(self, kpts=np.zeros((1,3))): + self.kpts = np.reshape(kpts, (-1,3)) + + eval_ao = staticmethod(eval_ao_kpts) + + make_mask = NotImplemented + + def eval_rho(self, cell, ao_kpts, dm_kpts, non0tab=None, xctype='LDA', + hermi=0, with_lapl=True, verbose=None): + '''Collocate the density (opt. gradients) on the real-space grid. + + Args: + cell : Mole or Cell object + ao_kpts : (nkpts, ngrids, nao) ndarray + AO values at each k-point + dm_kpts: (nkpts, nao, nao) ndarray + Density matrix at each k-point + + Returns: + rhoR : (ngrids,) ndarray + ''' + nkpts = len(ao_kpts) + rho_ks = [eval_rho(cell, ao_kpts[k], dm_kpts[k], non0tab, xctype, + hermi, with_lapl, verbose) + for k in range(nkpts)] + dtype = np.result_type(*rho_ks) + rho = cp.zeros(rho_ks[0].shape, dtype=dtype) + for k in range(nkpts): + rho += rho_ks[k] + rho *= 1./nkpts + return rho + + get_vxc = nr_vxc = numint_cpu.KNumInt.nr_vxc + eval_rho1 = NotImplemented + nr_rks = NotImplemented + nr_uks = NotImplemented + + block_loop = NotImplemented + eval_rho2 = NotImplemented + get_vxc = nr_vxc = numint_cpu.KNumInt.nr_vxc + nr_rks_fxc = nr_rks_fxc + nr_uks_fxc = nr_uks_fxc + nr_rks_fxc_st = nr_rks_fxc_st + cache_xc_kernel = cache_xc_kernel + cache_xc_kernel1 = cache_xc_kernel1 + get_rho = get_rho + + to_gpu = utils.to_gpu + device = utils.device + + def to_cpu(self): + return numint_cpu.KNumInt() + +_KNumInt = KNumInt diff --git a/gpu4pyscf/pbc/dft/rks.py b/gpu4pyscf/pbc/dft/rks.py new file mode 100644 index 00000000..f514b9e4 --- /dev/null +++ b/gpu4pyscf/pbc/dft/rks.py @@ -0,0 +1,228 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. 
+# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +''' +Non-relativistic Restricted Kohn-Sham for periodic systems at a single k-point +''' + + +import numpy as np +import cupy as cp +from pyscf import lib +from pyscf.pbc.dft import rks as ks_cpu +from pyscf.pbc.scf import khf +from pyscf.pbc.dft import multigrid +from gpu4pyscf.lib import logger, utils +from gpu4pyscf.dft import rks as mol_ks +from gpu4pyscf.pbc.scf import hf as pbchf +from gpu4pyscf.pbc.dft import gen_grid +from gpu4pyscf.pbc.dft import numint +from gpu4pyscf.lib.cupy_helper import contract, tag_array +from pyscf import __config__ + +__all__ = [ + 'get_veff', 'RKS', 'KohnShamDFT', +] + +def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1, + kpt=None, kpts_band=None): + '''Coulomb + XC functional + + .. note:: + This function will change the ks object. + + Args: + ks : an instance of :class:`RKS` + XC functional are controlled by ks.xc attribute. Attribute + ks.grids might be initialized. + dm : ndarray or list of ndarrays + A density matrix or a list of density matrices + + Returns: + matrix Veff = J + Vxc. Veff can be a list matrices, if the input + dm is a list of density matrices. + ''' + if cell is None: cell = ks.cell + if dm is None: dm = ks.make_rdm1() + if kpt is None: kpt = ks.kpt + t0 = logger.init_timer(ks) + + ni = ks._numint + hybrid = ni.libxc.is_hybrid_xc(ks.xc) + + if isinstance(ks.with_df, multigrid.MultiGridFFTDF): + if ks.do_nlc(): + raise NotImplementedError(f'MultiGrid for NLC functional {ks.xc} + {ks.nlc}') + + ground_state = (isinstance(dm, cp.ndarray) and dm.ndim == 2 + and kpts_band is None) + ks.initialize_grids(cell, dm, kpt, ground_state) + + if hermi == 2: # because rho = 0 + n, exc, vxc = 0, 0, 0 + else: + n, exc, vxc = ni.nr_rks(cell, ks.grids, ks.xc, dm, 0, hermi, + kpt, kpts_band) + logger.info(ks, 'nelec by numeric integration = %s', n) + if ks.do_nlc(): + if ni.libxc.is_nlc(ks.xc): + xc = ks.xc + else: + assert ni.libxc.is_nlc(ks.nlc) + xc = ks.nlc + n, enlc, vnlc = ni.nr_nlc_vxc(cell, ks.nlcgrids, xc, dm, 0, hermi, kpt) + exc += enlc + vxc += vnlc + logger.info(ks, 'nelec with nlc grids = %s', n) + t0 = logger.timer(ks, 'vxc', *t0) + + if not hybrid: + vj = ks.get_j(cell, dm, hermi, kpt, kpts_band) + vxc += vj + else: + omega, alpha, hyb = ni.rsh_and_hybrid_coeff(ks.xc, spin=cell.spin) + if omega == 0: + vj, vk = ks.get_jk(cell, dm, hermi, kpt, kpts_band) + vk *= hyb + elif alpha == 0: # LR=0, only SR exchange + vj = ks.get_j(cell, dm, hermi, kpt, kpts_band) + vk = ks.get_k(cell, dm, hermi, kpt, kpts_band, omega=-omega) + vk *= hyb + elif hyb == 0: # SR=0, only LR exchange + vj = ks.get_j(cell, dm, hermi, kpt, kpts_band) + vk = ks.get_k(cell, dm, hermi, kpt, kpts_band, omega=omega) + vk *= alpha + else: # SR and LR exchange with different ratios + vj, vk = ks.get_jk(cell, dm, hermi, kpt, kpts_band) + vk *= hyb + vklr = ks.get_k(cell, dm, hermi, kpt, kpts_band, omega=omega) + vklr *= (alpha - hyb) + vk += vklr + 
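The four branches above only decide how many K builds are required; in the general range-separated case the assembled exchange is one full-range K scaled by `hyb` plus one `omega`-screened K scaled by `alpha - hyb`. A hypothetical condensation:

```python
def rsh_exchange(get_k, dm, omega, alpha, hyb):
    # General RSH case; get_veff above special-cases alpha == 0 and hyb == 0
    # so that those functionals pay for a single K build only.
    vk = get_k(dm) * hyb
    if abs(omega) > 1e-10:
        vk += get_k(dm, omega=omega) * (alpha - hyb)
    return vk
```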
vxc += vj - vk * .5 + + if ground_state: + exc -= contract('ij,ji->', dm, vk).real * .5 * .5 + + if ground_state: + ecoul = contract('ij,ji->', dm, vj).real * .5 + else: + ecoul = None + + vxc = tag_array(vxc, ecoul=ecoul, exc=exc, vj=None, vk=None) + return vxc + +def prune_small_rho_grids_(ks, cell, dm, grids, kpts): + raise NotImplementedError + +def get_rho(mf, dm=None, grids=None, kpt=None): + if dm is None: dm = mf.make_rdm1() + if grids is None: grids = mf.grids + if kpt is None: kpt = mf.kpt + if dm[0].ndim == 2: # the UKS density matrix + dm = dm[0] + dm[1] + if isinstance(mf.with_df, multigrid.MultiGridFFTDF): + rho = mf.with_df.get_rho(dm, kpt) + else: + rho = mf._numint.get_rho(mf.cell, dm, grids, kpt, mf.max_memory) + return rho + + +class KohnShamDFT(mol_ks.KohnShamDFT): + '''PBC-KS''' + + _keys = ks_cpu.KohnShamDFT._keys + + def __init__(self, xc='LDA,VWN'): + self.xc = xc + self.grids = gen_grid.UniformGrids(self.cell) + self.nlc = '' + self.nlcgrids = gen_grid.UniformGrids(self.cell) + self.small_rho_cutoff = getattr( + __config__, 'dft_rks_RKS_small_rho_cutoff', 1e-7) + if isinstance(self, khf.KSCF): + self._numint = numint.KNumInt(self.kpts) + else: + self._numint = numint.NumInt() + + build = ks_cpu.KohnShamDFT.build + reset = ks_cpu.KohnShamDFT.reset + dump_flags = ks_cpu.KohnShamDFT.dump_flags + + get_veff = NotImplemented + get_rho = get_rho + + density_fit = NotImplemented + rs_density_fit = NotImplemented + + jk_method = NotImplemented + + to_rks = NotImplemented + to_uks = NotImplemented + to_gks = NotImplemented + to_hf = NotImplemented + + def initialize_grids(self, cell, dm, kpts, ground_state=True): + '''Initialize self.grids the first time call get_veff''' + if self.grids.coords is None: + t0 = (logger.process_clock(), logger.perf_counter()) + self.grids.build(with_non0tab=True) + if (isinstance(self.grids, gen_grid.BeckeGrids) and + self.small_rho_cutoff > 1e-20 and ground_state): + self.grids = prune_small_rho_grids_( + self, self.cell, dm, self.grids, kpts) + t0 = logger.timer(self, 'setting up grids', *t0) + is_nlc = self.do_nlc() + if is_nlc and self.nlcgrids.coords is None: + t0 = (logger.process_clock(), logger.perf_counter()) + self.nlcgrids.build(with_non0tab=True) + if (isinstance(self.grids, gen_grid.BeckeGrids) and + self.small_rho_cutoff > 1e-20 and ground_state): + self.nlcgrids = prune_small_rho_grids_( + self, self.cell, dm, self.nlcgrids, kpts) + t0 = logger.timer(self, 'setting up nlc grids', *t0) + return self + +# Update the KohnShamDFT label in pbc.scf.hf module +pbchf.KohnShamDFT = KohnShamDFT + + +class RKS(KohnShamDFT, pbchf.RHF): + '''RKS class adapted for PBCs. + + This is a literal duplication of the molecular RKS class with some `mol` + variables replaced by `cell`. 
+ ''' + + def __init__(self, cell, kpt=np.zeros(3), xc='LDA,VWN', exxdiv='ewald'): + pbchf.RHF.__init__(self, cell, kpt, exxdiv=exxdiv) + KohnShamDFT.__init__(self, xc) + + def dump_flags(self, verbose=None): + pbchf.RHF.dump_flags(self, verbose) + KohnShamDFT.dump_flags(self, verbose) + return self + + get_veff = get_veff + energy_elec = mol_ks.energy_elec + + to_gpu = utils.to_gpu + device = utils.device + + def to_cpu(self): + mf = ks_cpu.RKS(self.cell) + utils.to_cpu(self, out=mf) + return mf diff --git a/gpu4pyscf/pbc/dft/tests/test_pbc_rks.py b/gpu4pyscf/pbc/dft/tests/test_pbc_rks.py new file mode 100644 index 00000000..1489ee40 --- /dev/null +++ b/gpu4pyscf/pbc/dft/tests/test_pbc_rks.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import unittest +import tempfile +import numpy as np +from pyscf.pbc import gto as pbcgto +from gpu4pyscf.pbc import dft as pbcdft + + +class KnownValues(unittest.TestCase): + @classmethod + def setUpClass(cls): + global cell + L = 4 + n = 21 + cell = pbcgto.Cell() + cell.build(unit = 'B', + a = ((L,0,0),(0,L,0),(0,0,L)), + mesh = [n,n,n], + atom = [['He', (L/2.-.5,L/2.,L/2.-.5)], + ['He', (L/2. 
,L/2.,L/2.+.5)]], + basis = { 'He': [[0, (0.8, 1.0)], + [0, (1.0, 1.0)], + [0, (1.2, 1.0)]]}) + cls.cell = cell + + @classmethod + def tearDownClass(cls): + global cell + del cell + + def test_lda_fft(self): + mf = pbcdft.RKS(cell, xc='lda,vwn').run() + mf_ref = mf.to_cpu().run() + self.assertAlmostEqual(mf.e_tot, mf_ref.e_tot, 7) + + # test bands + np.random.seed(1) + kpts_band = np.random.random((2,3)) + e0, c0 = mf_ref.get_bands(kpts_band) + e1, c1 = mf.get_bands(kpts_band) + self.assertAlmostEqual(abs(e1[0].get() - e0[0]).max(), 0, 7) + self.assertAlmostEqual(abs(e1[1].get() - e0[1]).max(), 0, 7) + + def test_gga_fft(self): + mf = pbcdft.RKS(cell, xc='pbe0').run() + mf_ref = mf.to_cpu().run() + self.assertAlmostEqual(mf.e_tot, mf_ref.e_tot, 7) + + # test bands + np.random.seed(1) + kpts_band = np.random.random((2,3)) + e0, c0 = mf_ref.get_bands(kpts_band) + e1, c1 = mf.get_bands(kpts_band) + self.assertAlmostEqual(abs(e1[0].get() - e0[0]).max(), 0, 7) + self.assertAlmostEqual(abs(e1[1].get() - e0[1]).max(), 0, 7) + + def test_rsh_fft(self): + mf = pbcdft.RKS(cell, xc='camb3lyp').run() + mf_ref = mf.to_cpu().run() + self.assertAlmostEqual(mf.e_tot, mf_ref.e_tot, 7) + + # test bands + np.random.seed(1) + kpts_band = np.random.random((2,3)) + e0, c0 = mf_ref.get_bands(kpts_band) + e1, c1 = mf.get_bands(kpts_band) + self.assertAlmostEqual(abs(e1[0].get() - e0[0]).max(), 0, 7) + self.assertAlmostEqual(abs(e1[1].get() - e0[1]).max(), 0, 7) + + def test_lda_fft_with_kpt(self): + np.random.seed(1) + k = np.random.random(3) + mf = pbcdft.RKS(cell, xc='lda,vwn', kpt=k).run() + mf_ref = mf.to_cpu().run() + self.assertAlmostEqual(mf.e_tot, mf_ref.e_tot, 7) + + # test bands + np.random.seed(1) + kpts_band = np.random.random((2,3)) + e0, c0 = mf_ref.get_bands(kpts_band) + e1, c1 = mf.get_bands(kpts_band) + self.assertAlmostEqual(abs(e1[0].get() - e0[0]).max(), 0, 7) + self.assertAlmostEqual(abs(e1[1].get() - e0[1]).max(), 0, 7) + + def test_gga_fft_with_kpt(self): + np.random.seed(1) + k = np.random.random(3) + mf = pbcdft.RKS(cell, xc='pbe0', kpt=k).run() + mf_ref = mf.to_cpu().run() + self.assertAlmostEqual(mf.e_tot, mf_ref.e_tot, 7) + + # test bands + np.random.seed(1) + kpts_band = np.random.random((2,3)) + e0, c0 = mf_ref.get_bands(kpts_band) + e1, c1 = mf.get_bands(kpts_band) + self.assertAlmostEqual(abs(e1[0].get() - e0[0]).max(), 0, 7) + self.assertAlmostEqual(abs(e1[1].get() - e0[1]).max(), 0, 7) + + def test_rsh_fft_with_kpt(self): + np.random.seed(1) + k = np.random.random(3) + mf = pbcdft.RKS(cell, xc='camb3lyp', kpt=k).run(conv_tol=1e-8) + mf_ref = mf.to_cpu().run() + self.assertAlmostEqual(mf.e_tot, mf_ref.e_tot, 7) + + # test bands + np.random.seed(1) + kpts_band = np.random.random((2,3)) + e0, c0 = mf_ref.get_bands(kpts_band) + e1, c1 = mf.get_bands(kpts_band) + self.assertAlmostEqual(abs(e1[0].get() - e0[0]).max(), 0, 7) + self.assertAlmostEqual(abs(e1[1].get() - e0[1]).max(), 0, 7) + +if __name__ == '__main__': + print("Full Tests for pbc.dft.rks") + unittest.main() diff --git a/gpu4pyscf/pbc/scf/__init__.py b/gpu4pyscf/pbc/scf/__init__.py new file mode 100644 index 00000000..70ac5a5b --- /dev/null +++ b/gpu4pyscf/pbc/scf/__init__.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. 
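These tests double as usage documentation; a minimal end-to-end sketch modeled on the fixture above (small helium cell, FFT integrals, CPU cross-check):

```python
import numpy as np
from pyscf.pbc import gto as pbcgto
from gpu4pyscf.pbc import dft as pbcdft

cell = pbcgto.Cell()
cell.build(unit='B', a=np.eye(3) * 4, mesh=[21] * 3,
           atom=[['He', (1.5, 2.0, 1.5)], ['He', (2.0, 2.0, 2.5)]],
           basis={'He': [[0, (1.0, 1.0)]]})

mf = pbcdft.RKS(cell, xc='pbe0').run()    # SCF on the GPU
e_ref = mf.to_cpu().run().e_tot           # CPU reference, ~1e-7 Ha agreement
e_bands = mf.get_bands(np.random.random((2, 3)))[0]
```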
+# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +'''Hartree-Fock for periodic systems +''' + +from .import hf +#from . import uhf +#from . import khf +#from . import kuhf + +rhf = hf +#krhf = khf + +#UHF = uhf.UHF +RHF = rhf.RHF +#KRHF = krhf.KRHF +#KUHF = kuhf.KRHF diff --git a/gpu4pyscf/pbc/scf/hf.py b/gpu4pyscf/pbc/scf/hf.py new file mode 100644 index 00000000..83ad7b47 --- /dev/null +++ b/gpu4pyscf/pbc/scf/hf.py @@ -0,0 +1,268 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +''' +Hartree-Fock for periodic systems at a single k-point +''' + +import numpy as np +import cupy as cp +from pyscf import lib +from pyscf.pbc.scf import hf as hf_cpu +from gpu4pyscf.lib import logger, utils +from gpu4pyscf.lib.cupy_helper import return_cupy_array, contract +from gpu4pyscf.scf import hf as mol_hf +from gpu4pyscf.pbc import df + +__all__ = [ + 'RHF', 'SCF' +] + +def get_bands(mf, kpts_band, cell=None, dm=None, kpt=None): + '''Get energy bands at the given (arbitrary) 'band' k-points. 
+ + Returns: + mo_energy : (nmo,) ndarray or a list of (nmo,) ndarray + Bands energies E_n(k) + mo_coeff : (nao, nmo) ndarray or a list of (nao,nmo) ndarray + Band orbitals psi_n(k) + ''' + if cell is None: cell = mf.cell + if dm is None: dm = mf.make_rdm1() + if kpt is None: kpt = mf.kpt + + kpts_band = np.asarray(kpts_band) + single_kpt_band = (getattr(kpts_band, 'ndim', None) == 1) + kpts_band = kpts_band.reshape(-1,3) + + fock = mf.get_veff(cell, dm, kpt=kpt, kpts_band=kpts_band) + fock += mf.get_hcore(cell, kpts_band) + s1e = mf.get_ovlp(cell, kpts_band) + nkpts = len(kpts_band) + mo_energy = [] + mo_coeff = [] + for k in range(nkpts): + e, c = mf.eig(fock[k], s1e[k]) + mo_energy.append(e) + mo_coeff.append(c) + + if single_kpt_band: + mo_energy = mo_energy[0] + mo_coeff = mo_coeff[0] + return mo_energy, mo_coeff + +get_fock = mol_hf.get_fock +get_occ = mol_hf.get_occ +get_grad = mol_hf.get_grad +make_rdm1 = mol_hf.make_rdm1 +energy_elec = mol_hf.energy_elec + +def get_rho(mf, dm=None, grids=None, kpt=None): + '''Compute density in real space + ''' + from gpu4pyscf.pbc.dft import gen_grid + from gpu4pyscf.pbc.dft import numint + if dm is None: + dm = mf.make_rdm1() + if getattr(dm, 'ndim', None) != 2: # UHF + dm = dm[0] + dm[1] + if grids is None: + grids = gen_grid.UniformGrids(mf.cell) + if kpt is None: + kpt = mf.kpt + ni = numint.NumInt() + return ni.get_rho(mf.cell, dm, grids, kpt, mf.max_memory) + +class SCF(mol_hf.SCF): + '''SCF base class adapted for PBCs. + + Attributes: + kpt : (3,) ndarray + The AO k-point in Cartesian coordinates, in units of 1/Bohr. + + exxdiv : str + Exchange divergence treatment, can be one of + + | None : ignore G=0 contribution in exchange + | 'ewald' : Ewald probe charge correction [JCP 122, 234102 (2005); DOI:10.1063/1.1926272] + + with_df : density fitting object + Default is the instance of FFTDF class (GPW method). + ''' + + _keys = hf_cpu.SCF._keys + + def __init__(self, cell, kpt=np.zeros(3), exxdiv='ewald'): + if not cell._built: + cell.build() + mol_hf.SCF.__init__(self, cell) + self.with_df = df.FFTDF(cell) + # Range separation JK builder + self.rsjk = None + self.exxdiv = exxdiv + self.kpt = kpt + self.conv_tol = 1e-8 + if cell.precision: + self.conv_tol = max(cell.precision * 10, 1e-8) + + def check_sanity(self): + if (isinstance(self.exxdiv, str) and self.exxdiv.lower() != 'ewald' and + isinstance(self.with_df, df.DF)): + logger.warn(self, 'exxdiv %s is not supported in DF', self.exxdiv) + + if self.verbose >= logger.DEBUG: + super().check_sanity() + return self + + kpt = hf_cpu.SCF.kpt + kpts = hf_cpu.SCF.kpts + mol = hf_cpu.SCF.mol # required by the hf.kernel + + reset = hf_cpu.SCF.reset + build = hf_cpu.SCF.build + dump_flags = hf_cpu.SCF.dump_flags + + get_bands = get_bands + get_rho = get_rho + + get_ovlp = return_cupy_array(hf_cpu.SCF.get_ovlp) + + def get_hcore(self, cell=None, kpt=None): + if cell is None: cell = self.cell + if kpt is None: kpt = self.kpt + if cell.pseudo: + nuc = self.with_df.get_pp(kpt) + else: + nuc = self.with_df.get_nuc(kpt) + if len(cell._ecpbas) > 0: + raise NotImplementedError('ECP in PBC SCF') + return nuc + cp.asarray(cell.pbc_intor('int1e_kin', 1, 1, kpt)) + + def get_jk(self, cell=None, dm=None, hermi=1, kpt=None, kpts_band=None, + with_j=True, with_k=True, omega=None, **kwargs): + r'''Get Coulomb (J) and exchange (K) following :func:`scf.hf.RHF.get_jk_`. + for particular k-point (kpt). + + When kpts_band is given, the J, K matrices on kpts_band are evaluated. 
+ + J_{pq} = \sum_{rs} (pq|rs) dm[s,r] + K_{pq} = \sum_{rs} (pr|sq) dm[r,s] + + where r,s are orbitals on kpt. p and q are orbitals on kpts_band + if kpts_band is given otherwise p and q are orbitals on kpt. + ''' + if cell is None: cell = self.cell + if dm is None: dm = self.make_rdm1() + if kpt is None: kpt = self.kpt + + cpu0 = logger.init_timer(self) + dm = cp.asarray(dm) + nao = dm.shape[-1] + vj, vk = self.with_df.get_jk(dm.reshape(-1,nao,nao), hermi, kpt, kpts_band, + with_j, with_k, omega, exxdiv=self.exxdiv) + if with_j: + vj = _format_jks(vj, dm, kpts_band) + if with_k: + vk = _format_jks(vk, dm, kpts_band) + logger.timer(self, 'vj and vk', *cpu0) + return vj, vk + + def get_j(self, cell=None, dm=None, hermi=1, kpt=None, kpts_band=None, + omega=None): + r'''Compute J matrix for the given density matrix and k-point (kpt). + When kpts_band is given, the J matrices on kpts_band are evaluated. + + J_{pq} = \sum_{rs} (pq|rs) dm[s,r] + + where r,s are orbitals on kpt. p and q are orbitals on kpts_band + if kpts_band is given otherwise p and q are orbitals on kpt. + ''' + return self.get_jk(cell, dm, hermi, kpt, kpts_band, with_k=False, + omega=omega)[0] + + def get_k(self, cell=None, dm=None, hermi=1, kpt=None, kpts_band=None, + omega=None): + '''Compute K matrix for the given density matrix. + ''' + return self.get_jk(cell, dm, hermi, kpt, kpts_band, with_j=False, + omega=omega)[1] + + get_veff = hf_cpu.SCF.get_veff + energy_nuc = hf_cpu.SCF.energy_nuc + _finalize = hf_cpu.SCF._finalize + + def get_init_guess(self, cell=None, key='minao', s1e=None): + if cell is None: cell = self.cell + dm = mol_hf.SCF.get_init_guess(self, cell, key) + dm = normalize_dm_(self, dm, s1e) + return dm + + init_guess_by_1e = hf_cpu.SCF.init_guess_by_1e + init_guess_by_chkfile = hf_cpu.SCF.init_guess_by_chkfile + from_chk = hf_cpu.SCF.from_chk + dump_chk = hf_cpu.SCF.dump_chk + analyze = NotImplemented + mulliken_pop = NotImplemented + density_fit = NotImplemented + rs_density_fit = NotImplemented + x2c = x2c1e = sfx2c1e = NotImplemented + spin_square = NotImplemented + dip_moment = NotImplemented + + +class KohnShamDFT: + '''A mock DFT base class + + The base class is defined in the pbc.dft.rks module. This class can + be used to verify if an SCF object is an pbc-Hartree-Fock method or an + pbc-DFT method. It should be overwritten by the actual KohnShamDFT class + when loading dft module. + ''' + + +class RHF(SCF): + + to_gpu = utils.to_gpu + device = utils.device + + def to_cpu(self): + mf = hf_cpu.RHF(self.cell) + utils.to_cpu(self, out=mf) + return mf + +def _format_jks(vj, dm, kpts_band): + if kpts_band is None: + vj = vj.reshape(dm.shape) + elif kpts_band.ndim == 1: # a single k-point on bands + vj = vj.reshape(dm.shape) + elif getattr(dm, "ndim", 0) == 2: + vj = vj[0] + return vj + +def normalize_dm_(mf, dm, s1e=None): + ''' + Force density matrices integrated to the correct number of electrons. + ''' + cell = mf.cell + if s1e is None: + s1e = mf.get_ovlp(cell) + ne = contract('ij,ji->', dm, s1e).real + if abs(ne - cell.nelectron) > 0.01: + logger.debug(mf, 'Big errors in the electron number of initial guess ' + 'density matrix (Ne/cell = %g)!', ne) + dm *= cell.nelectron / ne + return dm diff --git a/gpu4pyscf/pbc/scf/tests/test_pbc_scf_hf.py b/gpu4pyscf/pbc/scf/tests/test_pbc_scf_hf.py new file mode 100644 index 00000000..ca11d5b0 --- /dev/null +++ b/gpu4pyscf/pbc/scf/tests/test_pbc_scf_hf.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. 
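`normalize_dm_` below guards the initial guess with nothing more than the trace relation Tr(D S) = N_elec; a standalone sketch of that check:

```python
from gpu4pyscf.lib.cupy_helper import contract

def normalized_guess(dm, s1e, nelectron):
    # Tr(D S) counts the electrons carried by the guess density matrix;
    # rescale when it drifts from the cell's electron count by > 0.01 e.
    ne = float(contract('ij,ji->', dm, s1e).real)
    if abs(ne - nelectron) > 0.01:
        dm = dm * (nelectron / ne)
    return dm
```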
All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import unittest +import numpy as np +import cupy as cp +from pyscf import lib +from pyscf.pbc.scf import hf as pbchf_cpu +from pyscf.pbc import gto as pbcgto +from gpu4pyscf.pbc import scf + +class KnownValues(unittest.TestCase): + @classmethod + def setUpClass(cls): + L = 4 + n = 21 + cell = pbcgto.Cell() + cell.build(unit = 'B', + verbose = 7, + output = '/dev/null', + a = ((L,0,0),(0,L,0),(0,0,L)), + mesh = [n,n,n], + atom = [['He', (L/2.-.5,L/2.,L/2.-.5)], + ['He', (L/2. ,L/2.,L/2.+.5)]], + basis = { 'He': [[0, (0.8, 1.0)], + [0, (1.0, 1.0)], + [0, (1.2, 1.0)]]}) + cls.cell = cell + + @classmethod + def tearDownClass(cls): + cls.cell.stdout.close() + + def test_rhf_exx_ewald(self): + cell = self.cell + mf = scf.RHF(cell, exxdiv='ewald').run() + self.assertAlmostEqual(mf.e_tot, -4.3511582284698633, 7) + self.assertTrue(mf.mo_coeff.dtype == np.double) + #kmf = scf.KRHF(cell, [[0,0,0]], exxdiv='ewald').run() + #self.assertAlmostEqual(mf.e_tot, kmf.e_tot, 8) + + # test bands + np.random.seed(1) + kpts_band = np.random.random((2,3)) + e1, c1 = mf.get_bands(kpts_band) + #e0, c0 = kmf.get_bands(kpts_band) + #self.assertAlmostEqual(abs(e0[0]-e1[0]).max(), 0, 7) + #self.assertAlmostEqual(abs(e0[1]-e1[1]).max(), 0, 7) + self.assertAlmostEqual(lib.fp(e1[0].get()), -6.2986775452228283, 6) + self.assertAlmostEqual(lib.fp(e1[1].get()), -7.6616273746782362, 6) + + def test_rhf_exx_ewald_with_kpt(self): + np.random.seed(1) + k = np.random.random(3) + cell = self.cell + mf = scf.RHF(cell, k, exxdiv='ewald') + e1 = mf.kernel() + self.assertAlmostEqual(e1, -4.2048655827967139, 7) + self.assertTrue(mf.mo_coeff.dtype == np.complex128) + + #kmf = scf.KRHF(cell, k, exxdiv='ewald') + #e0 = kmf.kernel() + #self.assertTrue(np.allclose(e0,e1)) + + # test bands + np.random.seed(1) + kpt_band = np.random.random(3) + e1, c1 = mf.get_bands(kpt_band) + #e0, c0 = kmf.get_bands(kpt_band) + #self.assertAlmostEqual(abs(e0-e1).max(), 0, 7) + self.assertAlmostEqual(lib.fp(e1.get()), -6.8312867098806249, 6) + + def test_rhf_exx_None(self): + cell = self.cell + mf = scf.RHF(cell, exxdiv=None) + e1 = mf.kernel() + self.assertAlmostEqual(e1, -2.9325094887283196, 7) + self.assertTrue(mf.mo_coeff.dtype == np.double) + + #mf = scf.KRHF(cell, [[0,0,0]], exxdiv=None) + #e0 = mf.kernel() + #self.assertTrue(np.allclose(e0,e1)) + + np.random.seed(1) + k = np.random.random(3) + mf = scf.RHF(cell, k, exxdiv=None) + mf.init_guess = 'hcore' + e1 = mf.kernel() + self.assertAlmostEqual(e1, -2.7862168430230341, 7) + self.assertTrue(mf.mo_coeff.dtype == np.complex128) + + #mf = scf.KRHF(cell, k, exxdiv=None) + #mf.init_guess = 'hcore' + #e0 = mf.kernel() + #self.assertTrue(np.allclose(e0,e1)) + + def test_jk(self): + cell = self.cell + nao = cell.nao + np.random.seed(2) + dm = np.random.random((2,nao,nao)) + .5j*np.random.random((2,nao,nao)) + dm = dm + dm.conj().transpose(0,2,1) + ref = 
pbchf_cpu.RHF(cell).get_jk(cell, dm) + + dm = cp.asarray(dm) + vj, vk = scf.RHF(cell).get_jk(cell, dm) + self.assertAlmostEqual(abs(vj.get() - ref[0]).max(), 0, 9) + self.assertAlmostEqual(abs(vk.get() - ref[1]).max(), 0, 9) + + +if __name__ == '__main__': + print("Full Tests for pbc.scf.hf") + unittest.main() diff --git a/gpu4pyscf/pbc/tools/__init__.py b/gpu4pyscf/pbc/tools/__init__.py new file mode 100644 index 00000000..12b67013 --- /dev/null +++ b/gpu4pyscf/pbc/tools/__init__.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +from .pbc import * diff --git a/gpu4pyscf/pbc/tools/pbc.py b/gpu4pyscf/pbc/tools/pbc.py new file mode 100644 index 00000000..c5fc91e8 --- /dev/null +++ b/gpu4pyscf/pbc/tools/pbc.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import numpy as np +import cupy as cp +from gpu4pyscf.lib.cupy_helper import return_cupy_array +from pyscf.pbc.tools.pbc import get_coulG + +get_coulG = return_cupy_array(get_coulG) + +def fft(f, mesh): + '''Perform the 3D FFT from real (R) to reciprocal (G) space. + + After FFT, (u, v, w) -> (j, k, l). + (jkl) is in the index order of Gv. + + FFT normalization factor is 1., as in MH and in `numpy.fft`. + + Args: + f : (nx*ny*nz,) ndarray + The function to be FFT'd, flattened to a 1D array corresponding + to the index order of :func:`cartesian_prod`. + mesh : (3,) ndarray of ints (= nx,ny,nz) + The number G-vectors along each direction. + + Returns: + (nx*ny*nz,) ndarray + The FFT 1D array in same index order as Gv (natural order of + numpy.fft). + + ''' + if f.size == 0: + return cp.zeros_like(f) + + f3d = cp.asarray(f).reshape(-1, *mesh) + assert (f3d.shape[0] == 1 or f[0].size == f3d[0].size) + g3d = cp.fft.fftn(f3d, axes=(1,2,3)) + ngrids = np.prod(mesh) + if f.ndim == 1 or (f.ndim == 3 and f.size == ngrids): + return g3d.ravel() + else: + return g3d.reshape(-1, ngrids) + +def ifft(g, mesh): + '''Perform the 3D inverse FFT from reciprocal (G) space to real (R) space. + + Inverse FFT normalization factor is 1./N, same as in `numpy.fft` but + **different** from MH (they use 1.). 
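The normalization convention stated in these docstrings matches `numpy.fft` (and `cupy.fft`): factor 1 on the forward transform, 1/N on the inverse. A quick roundtrip check:

```python
import numpy as np

mesh = (4, 4, 4)
f = np.random.rand(int(np.prod(mesh)))
g = np.fft.fftn(f.reshape(mesh)).ravel()        # forward: factor 1
f_back = np.fft.ifftn(g.reshape(mesh)).ravel()  # inverse: factor 1/N
assert np.allclose(f, f_back.real)              # fft/ifft are exact inverses
```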
+ + Args: + g : (nx*ny*nz,) ndarray + The function to be inverse FFT'd, flattened to a 1D array + corresponding to the index order of `span3`. + mesh : (3,) ndarray of ints (= nx,ny,nz) + The number G-vectors along each direction. + + Returns: + (nx*ny*nz,) ndarray + The inverse FFT 1D array in same index order as Gv (natural order + of numpy.fft). + + ''' + if g.size == 0: + return cp.zeros_like(g) + + g3d = cp.asarray(g).reshape(-1, *mesh) + assert (g3d.shape[0] == 1 or g[0].size == g3d[0].size) + f3d = cp.fft.ifftn(g3d, axes=(1,2,3)) + ngrids = np.prod(mesh) + if g.ndim == 1 or (g.ndim == 3 and g.size == ngrids): + return f3d.ravel() + else: + return f3d.reshape(-1, ngrids) + + +def fftk(f, mesh, expmikr): + r'''Perform the 3D FFT of a real-space function which is (periodic*e^{ikr}). + + fk(k+G) = \sum_r fk(r) e^{-i(k+G)r} = \sum_r [f(k)e^{-ikr}] e^{-iGr} + ''' + return fft(f*expmikr, mesh) + + +def ifftk(g, mesh, expikr): + r'''Perform the 3D inverse FFT of f(k+G) into a function which is (periodic*e^{ikr}). + + fk(r) = (1/Ng) \sum_G fk(k+G) e^{i(k+G)r} = (1/Ng) \sum_G [fk(k+G)e^{iGr}] e^{ikr} + ''' + return ifft(g, mesh) * expikr diff --git a/gpu4pyscf/properties/shielding.py b/gpu4pyscf/properties/shielding.py index 1ef5e844..ae98dc04 100644 --- a/gpu4pyscf/properties/shielding.py +++ b/gpu4pyscf/properties/shielding.py @@ -18,7 +18,7 @@ from pyscf.data import nist from pyscf.scf import _vhf, jk from gpu4pyscf.dft import numint -from gpu4pyscf.lib.cupy_helper import contract, take_last2d, add_sparse +from gpu4pyscf.lib.cupy_helper import contract, sandwich_dot, add_sparse from gpu4pyscf.scf import cphf def gen_vind(mf, mo_coeff, mo_occ): @@ -37,23 +37,20 @@ def gen_vind(mf, mo_coeff, mo_occ): mvir = mo_coeff[:, mo_occ == 0] nocc = mocc.shape[1] nvir = nmo - nocc - omega, alpha, hyb = mf._numint.rsh_and_hybrid_coeff( - mf.xc, spin=mf.mol.spin) + omega, alpha, hyb = mf._numint.rsh_and_hybrid_coeff(mf.xc, spin=mf.mol.spin) + # FIXME: check if hybrid + # FIXME: handle rsh def fx(mo1): mo1 = mo1.reshape(-1, nvir, nocc) # * the saving pattern mo1_mo_real = contract('nai,ua->nui', mo1, mvir) dm1 = 2*contract('nui,vi->nuv', mo1_mo_real, mocc.conj()) - dm1 -= dm1.transpose(0, 2, 1) + dm1 = dm1 - dm1.transpose(0, 2, 1) if hasattr(mf,'with_df'): - v1 = cupy.empty((3, nao, nao)) - for i in range(3): - v1[i] =+mf.get_jk(mf.mol, dm1[i], hermi=2, with_j=False)[1]*0.5*hyb + vk = mf.get_jk(mf.mol, dm1, hermi=2, with_j=False)[1] else: - v1 = np.empty((3, nao, nao)) - for i in range(3): - v1[i] = -jk.get_jk(mf.mol, dm1[i].get(), 'ijkl,jk->il')*0.5*hyb - v1 = cupy.array(v1) + vk = cupy.array(jk.get_jk(mf.mol, dm1.get(), ['ijkl,jk->il']*3)) + v1 = -.5*hyb * vk tmp = contract('nuv,vi->nui', v1, mocc) v1vo = contract('nui,ua->nai', tmp, mvir.conj()) @@ -68,7 +65,7 @@ def nr_rks(ni, mol, grids, xc_code, dms): mo_coeff = getattr(dms, 'mo_coeff', None) mo_occ = getattr(dms, 'mo_occ', None) nao = mo_coeff.shape[1] - + opt = getattr(ni, 'gdftopt', None) if opt is None: ni.build(mol, grids.coords) @@ -77,9 +74,8 @@ def nr_rks(ni, mol, grids, xc_code, dms): coeff = cupy.asarray(opt.coeff) nao, nao0 = coeff.shape - dms = cupy.asarray(dms).reshape(-1,nao0,nao0) - dms = take_last2d(dms, opt.ao_idx) - mo_coeff = mo_coeff[opt.ao_idx] + dms = sandwich_dot(cupy.asarray(dms).reshape(-1,nao0,nao0), coeff.T) + mo_coeff = coeff.dot(mo_coeff) vmat = cupy.zeros((3, nao, nao)) if xctype == 'LDA': @@ -100,7 +96,7 @@ def nr_rks(ni, mol, grids, xc_code, dms): vtmp = contract('pu,p,vp->uv', giao_aux[idirect], wv, ao) vtmp = 
cupy.ascontiguousarray(vtmp) add_sparse(vmat[idirect], vtmp, index) - + elif xctype == 'GGA': wv = vxc * weight giao = _sorted_mol.eval_gto('GTOval_ig', coords.get(), comp=3) @@ -133,7 +129,7 @@ def nr_rks(ni, mol, grids, xc_code, dms): ao = None - vmat = take_last2d(vmat, opt.rev_ao_idx) + vmat = sandwich_dot(vmat, coeff) if numint.FREE_CUPY_CACHE: dms = None @@ -164,8 +160,9 @@ def get_vxc(mf, dm0): vk = None vxc += vj else: - omega, alpha, hyb = mf._numint.rsh_and_hybrid_coeff( - mf.xc, spin=mf.mol.spin) + omega, alpha, hyb = mf._numint.rsh_and_hybrid_coeff(mf.xc, spin=mf.mol.spin) + # FIXME: check if hybrid + # FIXME: handle rsh vxc += vj - vk*hyb*0.5 return vxc @@ -211,19 +208,16 @@ def eval_shielding(mf): s1jk = -contract('xiq,qj->xij', tmp, mocc)*0.5 tmp = contract('nai,ua->nui', s1jk, mocc) s1jkdm1 = contract('nui,vi->nuv', tmp, mocc.conj())*2 - s1jkdm1 -= s1jkdm1.transpose(0, 2, 1) - omega, alpha, hyb = mf._numint.rsh_and_hybrid_coeff( - mf.xc, spin=mf.mol.spin) + s1jkdm1 = s1jkdm1 - s1jkdm1.transpose(0, 2, 1) + omega, alpha, hyb = mf._numint.rsh_and_hybrid_coeff(mf.xc, spin=mf.mol.spin) + # FIXME: check if hybrid + # FIXME: handle rsh + if hasattr(mf,'with_df'): - vk2 = cupy.empty((3, nao, nao)) - for i in range(3): - vk2[i] = +mf.get_jk(mf.mol, s1jkdm1[i], hermi=2, with_j=False)[1]*0.5*hyb - + vk = mf.get_jk(mf.mol, s1jkdm1, hermi=2, with_j=False)[1] else: - vk2 = np.empty((3, nao, nao)) - for i in range(3): - vk2[i] = -jk.get_jk(mf.mol, s1jkdm1[i].get(), 'ijkl,jk->il')*0.5*hyb - vk2 = cupy.array(vk2) + vk = cupy.array(jk.get_jk(mf.mol, s1jkdm1.get(), ['ijkl,jk->il']*3)) + vk2 = -.5*hyb * vk h1ao += vk2 tmp = contract('xuv,ua->xav', h1ao, mvir) veff_ai = contract('xav,vi->xai', tmp, mocc) diff --git a/gpu4pyscf/properties/tests/test_shielding.py b/gpu4pyscf/properties/tests/test_shielding.py index e2415c80..0bbe9c07 100644 --- a/gpu4pyscf/properties/tests/test_shielding.py +++ b/gpu4pyscf/properties/tests/test_shielding.py @@ -135,4 +135,4 @@ def test_rks_b3lyp_df(self): if __name__ == "__main__": print("Full Tests for nmr shielding constants") - unittest.main() \ No newline at end of file + unittest.main() diff --git a/gpu4pyscf/qmmm/chelpg.py b/gpu4pyscf/qmmm/chelpg.py index 874ab513..c2c8b056 100644 --- a/gpu4pyscf/qmmm/chelpg.py +++ b/gpu4pyscf/qmmm/chelpg.py @@ -48,7 +48,7 @@ def _build_VHFOpt(intopt, cutoff=1e-14, group_size=None, _, _, fake_uniq_l_ctr, fake_l_ctr_counts = int3c2e.sort_mol(fake_mol) # sort auxiliary mol - sorted_auxmol, sorted_aux_idx, aux_uniq_l_ctr, aux_l_ctr_counts = int3c2e.sort_mol( + sorted_auxmol, _, aux_uniq_l_ctr, aux_l_ctr_counts = int3c2e.sort_mol( intopt.auxmol) if group_size_aux is not None: aux_uniq_l_ctr, aux_l_ctr_counts = int3c2e._split_l_ctr_groups( @@ -88,10 +88,7 @@ def _build_VHFOpt(intopt, cutoff=1e-14, group_size=None, ao_idx = np.array_split(np.arange(nao), cart_ao_loc[1:-1]) intopt.cart_ao_idx = np.hstack([ao_idx[i] for i in sorted_idx]) ncart = cart_ao_loc[-1] - nsph = sph_ao_loc[-1] - intopt.cart2sph = block_c2s_diag(ncart, nsph, intopt.angular, l_ctr_counts) - inv_idx = np.argsort(intopt.sph_ao_idx, kind='stable').astype(np.int32) - intopt.coeff = intopt.cart2sph[:, inv_idx] + intopt.cart2sph = block_c2s_diag(intopt.angular, l_ctr_counts) # pairing auxiliary basis with fake basis set fake_l_ctr_offsets = np.append(0, np.cumsum(fake_l_ctr_counts)) @@ -109,7 +106,6 @@ def _build_VHFOpt(intopt, cutoff=1e-14, group_size=None, cart_aux_loc = intopt.auxmol.ao_loc_nr(cart=True) sph_aux_loc = intopt.auxmol.ao_loc_nr(cart=False) ncart = 
cart_aux_loc[-1] - nsph = sph_aux_loc[-1] # inv_idx = np.argsort(intopt.sph_aux_idx, kind='stable').astype(np.int32) aux_l_ctr_offsets += fake_l_ctr_offsets[-1] @@ -159,6 +155,13 @@ def _build_VHFOpt(intopt, cutoff=1e-14, group_size=None, intopt.cp_idx, intopt.cp_jdx = np.unravel_index( np.arange(ncptype), (nl, nl)) + intopt._sorted_mol = sorted_mol + intopt._sorted_auxmol = sorted_auxmol + if intopt.mol.cart: + intopt._ao_idx = intopt.cart_ao_idx + else: + intopt._ao_idx = intopt.sph_ao_idx + def eval_chelpg_layer_gpu(mf, deltaR=0.3, Rhead=2.8, ifqchem=True, Rvdw=modified_Bondi, verbose=None): """Cal chelpg charge diff --git a/gpu4pyscf/qmmm/pbc/itrf.py b/gpu4pyscf/qmmm/pbc/itrf.py index f704133e..986ae2f2 100644 --- a/gpu4pyscf/qmmm/pbc/itrf.py +++ b/gpu4pyscf/qmmm/pbc/itrf.py @@ -1026,7 +1026,8 @@ def calculate_h1e(self, h1_gpu): v = cp.zeros_like(g_qm) for i0,i1,j0,j1,k0,k1,j3c in int3c2e.loop_int3c2e_general(intopt, ip_type='ip1'): v[:,i0:i1,j0:j1] += contract('xkji,k->xij', j3c, charges[k0:k1]) - g_qm += cupy_helper.take_last2d(v, intopt.rev_ao_idx) + v = intopt.unsort_orbitals(v, axis=[1,2]) + g_qm += v #cupy_helper.take_last2d(v, intopt.rev_ao_idx) elif mm_mol.charge_model == 'point' and len(coords) != 0: max_memory = self.max_memory - lib.current_memory()[0] blksize = int(min(max_memory*1e6/8/nao**2/3, 200)) @@ -1079,7 +1080,7 @@ def grad_hcore_mm(self, dm, mol=None): intopt.build(self.base.direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size=int3c2e.BLKSIZE, group_size_aux=int3c2e.BLKSIZE) - dm_ = cupy_helper.take_last2d(dm, intopt.sph_ao_idx) + dm_ = intopt.sort_orbitals(dm, axis=[0,1]) for i0,i1,j0,j1,k0,k1,j3c in int3c2e.loop_int3c2e_general(intopt, ip_type='ip2'): j3c = contract('xkji,k->xkji', j3c, charges[k0:k1]) g_[k0:k1] += contract('xkji,ij->kx', j3c, dm_[i0:i1,j0:j1]) diff --git a/gpu4pyscf/scf/_response_functions.py b/gpu4pyscf/scf/_response_functions.py index 6677cf6f..b86b0514 100644 --- a/gpu4pyscf/scf/_response_functions.py +++ b/gpu4pyscf/scf/_response_functions.py @@ -19,7 +19,7 @@ from gpu4pyscf.scf import hf, uhf def _gen_rhf_response(mf, mo_coeff=None, mo_occ=None, - singlet=None, hermi=0, max_memory=None): + singlet=None, hermi=0, grids=None, max_memory=None): '''Generate a function to compute the product of RHF response function and RHF density matrices. @@ -31,24 +31,29 @@ def _gen_rhf_response(mf, mo_coeff=None, mo_occ=None, if mo_coeff is None: mo_coeff = mf.mo_coeff if mo_occ is None: mo_occ = mf.mo_occ mol = mf.mol + if isinstance(mf, hf.KohnShamDFT): + if grids is None: + grids = mf.grids + if grids and grids.coords is None: + grids.build(mol=mol, with_non0tab=False, sort_grids=True) ni = mf._numint ni.libxc.test_deriv_order(mf.xc, 2, raise_error=True) - if getattr(mf, 'nlc', '') != '': + if mf.do_nlc(): logger.warn(mf, 'NLC functional found in DFT object. Its second ' 'deriviative is not available. 
Its contribution is ' 'not included in the response function.') omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, mol.spin) - hybrid = abs(hyb) > 1e-10 + hybrid = ni.libxc.is_hybrid_xc(mf.xc) if singlet is None: # for ground state orbital hessian - rho0, vxc, fxc = ni.cache_xc_kernel(mol, mf.grids, mf.xc, - mo_coeff, mo_occ, 0) + spin = 0 else: - rho0, vxc, fxc = ni.cache_xc_kernel(mol, mf.grids, mf.xc, - [mo_coeff]*2, [mo_occ*.5]*2, spin=1) - dm0 = None #mf.make_rdm1(mo_coeff, mo_occ) + spin = 1 + rho0, vxc, fxc = ni.cache_xc_kernel( + mol, grids, mf.xc, mo_coeff, mo_occ, spin, max_memory=max_memory) + dm0 = None if singlet is None: # Without specify singlet, used in ground state orbital hessian @@ -57,9 +62,9 @@ def vind(dm1): if hermi == 2: v1 = cupy.zeros_like(dm1) else: - v1 = ni.nr_rks_fxc(mol, mf.grids, mf.xc, dm0, dm1, 0, hermi, + v1 = ni.nr_rks_fxc(mol, grids, mf.xc, dm0, dm1, 0, hermi, rho0, vxc, fxc, max_memory=max_memory) - if hybrid or abs(alpha) > 1e-10: + if hybrid: if hermi != 2: vj, vk = mf.get_jk(mol, dm1, hermi=hermi) vk *= hyb @@ -71,8 +76,45 @@ def vind(dm1): elif hermi != 2: v1 += mf.get_j(mol, dm1, hermi=hermi) return v1 - else: - raise NotImplementedError('only singlet response is supported!') + + elif singlet: + fxc *= .5 + def vind(dm1): + if hermi == 2: + v1 = cupy.zeros_like(dm1) + else: + # nr_rks_fxc_st requires alpha of dm1, dm1*.5 should be scaled + v1 = ni.nr_rks_fxc_st(mol, grids, mf.xc, dm0, dm1, 0, True, + rho0, vxc, fxc, max_memory=max_memory) + if hybrid: + if hermi != 2: + vj, vk = mf.get_jk(mol, dm1, hermi=hermi) + vk *= hyb + if abs(omega) > 1e-10: # For range separated Coulomb + vk += mf.get_k(mol, dm1, hermi, omega) * (alpha-hyb) + v1 += vj - .5 * vk + else: + v1 -= .5 * hyb * mf.get_k(mol, dm1, hermi=hermi) + elif hermi != 2: + v1 += mf.get_j(mol, dm1, hermi=hermi) + return v1 + + else: # triplet + fxc *= .5 + def vind(dm1): + if hermi == 2: + v1 = cupy.zeros_like(dm1) + else: + # nr_rks_fxc_st requires alpha of dm1, dm1*.5 should be scaled + v1 = ni.nr_rks_fxc_st(mol, grids, mf.xc, dm0, dm1, 0, False, + rho0, vxc, fxc, max_memory=max_memory) + if hybrid: + vk = mf.get_k(mol, dm1, hermi=hermi) + vk *= hyb + if abs(omega) > 1e-10: # For range separated Coulomb + vk += mf.get_k(mol, dm1, hermi, omega) * (alpha-hyb) + v1 += -.5 * vk + return v1 else: # HF if (singlet is None or singlet) and hermi != 2: @@ -87,7 +129,7 @@ def vind(dm1): def _gen_uhf_response(mf, mo_coeff=None, mo_occ=None, - with_j=True, hermi=0, max_memory=None): + with_j=True, hermi=0, grids=None, max_memory=None): '''Generate a function to compute the product of UHF response function and UHF density matrices. 
    '''
@@ -96,6 +138,10 @@ def _gen_uhf_response(mf, mo_coeff=None, mo_occ=None,
     if mo_occ is None: mo_occ = mf.mo_occ
     mol = mf.mol
     if isinstance(mf, hf.KohnShamDFT):
+        if grids is None:
+            grids = mf.grids
+        if grids and grids.coords is None:
+            grids.build(mol=mol, with_non0tab=False, sort_grids=True)
         ni = mf._numint
         ni.libxc.test_deriv_order(mf.xc, 2, raise_error=True)
         if mf.do_nlc():
@@ -105,19 +151,15 @@ def _gen_uhf_response(mf, mo_coeff=None, mo_occ=None,
         omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, mol.spin)
         hybrid = ni.libxc.is_hybrid_xc(mf.xc)
-        rho0, vxc, fxc = ni.cache_xc_kernel(mol, mf.grids, mf.xc,
+        rho0, vxc, fxc = ni.cache_xc_kernel(mol, grids, mf.xc,
                                             mo_coeff, mo_occ, 1)
         dm0 = None
-        if max_memory is None:
-            mem_now = lib.current_memory()[0]
-            max_memory = max(2000, mf.max_memory*.8-mem_now)
-
         def vind(dm1):
             if hermi == 2:
                 v1 = cupy.zeros_like(dm1)
             else:
-                v1 = ni.nr_uks_fxc(mol, mf.grids, mf.xc, dm0, dm1, 0, hermi,
+                v1 = ni.nr_uks_fxc(mol, grids, mf.xc, dm0, dm1, 0, hermi,
                                    rho0, vxc, fxc, max_memory=max_memory)
             if not hybrid:
                 if with_j:
diff --git a/gpu4pyscf/scf/hf.py b/gpu4pyscf/scf/hf.py
index b84d0a58..a069d89b 100644
--- a/gpu4pyscf/scf/hf.py
+++ b/gpu4pyscf/scf/hf.py
@@ -25,13 +25,14 @@ from pyscf.scf import hf
 from pyscf.scf import chkfile
 from gpu4pyscf import lib
+from gpu4pyscf.lib import utils
 from gpu4pyscf.lib.cupy_helper import eigh, tag_array, return_cupy_array, cond
 from gpu4pyscf.scf import diis, jk
 from gpu4pyscf.lib import logger

 __all__ = [
     'get_jk', 'get_occ', 'get_grad', 'damping', 'level_shift', 'get_fock',
-    'energy_elec', 'RHF'
+    'energy_elec', 'RHF', 'SCF'
 ]

 def get_jk(mol, dm, hermi=1, vhfopt=None, with_j=True, with_k=True, omega=None,
@@ -238,33 +239,13 @@ def _kernel(mf, conv_tol=1e-10, conv_tol_grad=None,
             scf_conv = True
             break

-    if(cycle == mf.max_cycle):
-        logger.warn("SCF failed to converge")
+    if (cycle + 1 == mf.max_cycle):
+        assert not scf_conv
+        logger.warn(mf, "SCF failed to converge")

     return scf_conv, e_tot, mo_energy, mo_coeff, mo_occ

-def _quad_moment(mf, mol=None, dm=None, unit='Debye-Ang'):
-    from pyscf.data import nist
-    if mol is None: mol = mf.mol
-    if dm is None: dm = mf.make_rdm1()
-    nao = mol.nao
-    with mol.with_common_orig((0,0,0)):
-        ao_quad = mol.intor_symmetric('int1e_rr').reshape(3,3,nao,nao)
-
-    el_quad = np.einsum('xyij,ji->xy', ao_quad, dm).real
-
-    # Nuclear contribution
-    charges = mol.atom_charges()
-    coords = mol.atom_coords()
-    nucl_quad = np.einsum('i,ix,iy->xy', charges, coords, coords)
-
-    mol_quad = nucl_quad - el_quad
-
-    if unit.upper() == 'DEBYE-ANG':
-        mol_quad *= nist.AU2DEBYE * nist.BOHR
-    return mol_quad
-
 def energy_tot(mf, dm=None, h1e=None, vhf=None):
     r'''Total Hartree-Fock energy, electronic part plus nuclear repulsion
     See :func:`scf.hf.energy_elec` for the electron part
@@ -310,6 +291,27 @@ def scf(mf, dm0=None, **kwargs):
         mf._finalize()
         return mf.e_tot

+def canonicalize(mf, mo_coeff, mo_occ, fock=None):
+    '''Canonicalization diagonalizes the Fock matrix within occupied, open,
+    virtual subspaces separately (without changing occupancy).
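    A minimal usage sketch, assuming a converged gpu4pyscf RHF object mf
    (after a second-order solve the orbitals are generally not Fock
    eigenvectors until canonicalized):

        dm = mf.make_rdm1(mf.mo_coeff, mf.mo_occ)
        fock = mf.get_fock(dm=dm)
        mo_e, mo = canonicalize(mf, mf.mo_coeff, mf.mo_occ, fock)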
+ ''' + if fock is None: + dm = mf.make_rdm1(mo_coeff, mo_occ) + fock = mf.get_fock(dm=dm) + coreidx = mo_occ == 2 + viridx = mo_occ == 0 + openidx = ~(coreidx | viridx) + mo = cupy.empty_like(mo_coeff) + mo_e = cupy.empty(mo_occ.size) + for idx in (coreidx, openidx, viridx): + if cupy.any(idx) > 0: + orb = mo_coeff[:,idx] + f1 = orb.conj().T.dot(fock).dot(orb) + e, c = cupy.linalg.eigh(f1) + mo[:,idx] = orb.dot(c) + mo_e[idx] = e + return mo_e, mo + def as_scanner(mf): if isinstance(mf, pyscf_lib.SinglePointScanner): return mf @@ -354,9 +356,10 @@ class SCF(pyscf_lib.StreamObject): conv_tol_grad = hf.SCF.conv_tol_grad max_cycle = hf.SCF.max_cycle init_guess = hf.SCF.init_guess + conv_tol_cpscf = 1e-4 disp = None - DIIS = hf.SCF.DIIS + DIIS = diis.SCF_DIIS diis = hf.SCF.diis diis_space = hf.SCF.diis_space diis_damp = hf.SCF.diis_damp @@ -410,9 +413,11 @@ def check_sanity(self): build = hf.SCF.build opt = NotImplemented dump_flags = hf.SCF.dump_flags - get_fock = hf.SCF.get_fock - get_occ = hf.SCF.get_occ - get_grad = hf.SCF.get_grad + get_hcore = return_cupy_array(hf.SCF.get_hcore) + get_ovlp = return_cupy_array(hf.SCF.get_ovlp) + get_fock = get_fock + get_occ = get_occ + get_grad = staticmethod(get_grad) dump_chk = hf.SCF.dump_chk init_guess_by_minao = hf.SCF.init_guess_by_minao init_guess_by_atom = hf.SCF.init_guess_by_atom @@ -421,41 +426,65 @@ def check_sanity(self): init_guess_by_1e = hf.SCF.init_guess_by_1e init_guess_by_chkfile = hf.SCF.init_guess_by_chkfile from_chk = hf.SCF.from_chk - get_init_guess = hf.SCF.get_init_guess - make_rdm1 = hf.SCF.make_rdm1 - make_rdm2 = hf.SCF.make_rdm2 - energy_elec = hf.SCF.energy_elec - energy_tot = hf.SCF.energy_tot + get_init_guess = return_cupy_array(hf.SCF.get_init_guess) + make_rdm1 = make_rdm1 + make_rdm2 = NotImplemented + energy_elec = energy_elec + energy_tot = energy_tot energy_nuc = hf.SCF.energy_nuc check_convergence = None _eigh = staticmethod(eigh) eig = hf.SCF.eig do_disp = hf.SCF.do_disp get_dispersion = hf.SCF.get_dispersion - - scf = hf.SCF.scf + kernel = scf = scf as_scanner = hf.SCF.as_scanner _finalize = hf.SCF._finalize init_direct_scf = hf.SCF.init_direct_scf - get_jk = hf.SCF.get_jk + get_jk = _get_jk get_j = hf.SCF.get_j get_k = hf.SCF.get_k - get_veff = hf.SCF.get_veff - analyze = hf.SCF.analyze + get_veff = NotImplemented mulliken_meta = hf.SCF.mulliken_meta pop = hf.SCF.pop - dip_moment = hf.SCF.dip_moment _is_mem_enough = NotImplemented density_fit = NotImplemented - sfx2c1e = NotImplemented - x2c1e = NotImplemented - x2c = NotImplemented newton = NotImplemented - remove_soscf = NotImplemented + x2c = x2c1e = sfx2c1e = NotImplemented stability = NotImplemented nuc_grad_method = NotImplemented update_ = NotImplemented + canonicalize = NotImplemented istype = hf.SCF.istype + to_rhf = NotImplemented + to_uhf = NotImplemented + to_ghf = NotImplemented + to_rks = NotImplemented + to_uks = NotImplemented + to_gks = NotImplemented + to_ks = NotImplemented + canonicalize = NotImplemented + mulliken_pop = NotImplemented + mulliken_meta = NotImplemented + + def dip_moment(self, mol=None, dm=None, unit='Debye', origin=None, + verbose=logger.NOTE): + if mol is None: mol = self.mol + if dm is None: dm = self.make_rdm1() + return hf.dip_moment(mol, dm.get(), unit, origin, verbose) + + def quad_moment(self, mol=None, dm=None, unit='DebyeAngstrom', origin=None, + verbose=logger.NOTE): + if mol is None: mol = self.mol + if dm is None: dm = self.make_rdm1() + return hf.quad_moment(mol, dm.get(), unit, origin, verbose) + + def 
remove_soscf(self): + lib.logger.warn('remove_soscf has no effect in current version') + return self + + def analyze(self, *args, **kwargs): + return self.to_cpu().analyze() def reset(self, mol=None): if mol is not None: @@ -469,7 +498,6 @@ class KohnShamDFT: A mock DFT base class, to be compatible with PySCF ''' -from gpu4pyscf.lib import utils class RHF(SCF): to_gpu = utils.to_gpu @@ -477,42 +505,8 @@ class RHF(SCF): _keys = {'e_disp', 'h1e', 's1e', 'e_mf', 'conv_tol_cpscf', 'disp_with_3body'} - conv_tol_cpscf = 1e-4 - DIIS = diis.SCF_DIIS - get_jk = _get_jk - _eigh = staticmethod(eigh) - make_rdm1 = make_rdm1 - energy_elec = energy_elec - get_fock = get_fock - get_occ = get_occ get_veff = get_veff - get_grad = staticmethod(get_grad) - quad_moment = _quad_moment - energy_tot = energy_tot - - get_hcore = return_cupy_array(hf.RHF.get_hcore) - get_ovlp = return_cupy_array(hf.RHF.get_ovlp) - get_init_guess = return_cupy_array(hf.RHF.get_init_guess) - init_direct_scf = NotImplemented - make_rdm2 = NotImplemented - newton = NotImplemented - x2c = x2c1e = sfx2c1e = NotImplemented - to_rhf = NotImplemented - to_uhf = NotImplemented - to_ghf = NotImplemented - to_rks = NotImplemented - to_uks = NotImplemented - to_gks = NotImplemented - to_ks = NotImplemented - canonicalize = NotImplemented - # TODO: Enable followings after testing - analyze = NotImplemented - stability = NotImplemented - mulliken_pop = NotImplemented - mulliken_meta = NotImplemented - - scf = scf - kernel = scf + canonicalize = canonicalize def check_sanity(self): mol = self.mol @@ -529,6 +523,10 @@ def density_fit(self, auxbasis=None, with_df=None, only_dfj=False): import gpu4pyscf.df.df_jk return gpu4pyscf.df.df_jk.density_fit(self, auxbasis, with_df, only_dfj) + def newton(self): + from gpu4pyscf.scf.soscf import newton + return newton(self) + def to_cpu(self): mf = hf.RHF(self.mol) utils.to_cpu(self, out=mf) diff --git a/gpu4pyscf/scf/hf_symm.py b/gpu4pyscf/scf/hf_symm.py new file mode 100644 index 00000000..486c02fd --- /dev/null +++ b/gpu4pyscf/scf/hf_symm.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python +# +# Copyright 2024 The PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
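# Hedged note: point-group symmetry adaptation is not implemented on GPU, so
# the SymAdapted* names in this module are plain aliases that keep imports
# written against pyscf.scf.hf_symm working.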
+ +from gpu4pyscf.scf.hf import RHF +from gpu4pyscf.scf.rohf import ROHF + +SymAdaptedRHF = RHF +SymAdaptedROHF = ROHF diff --git a/gpu4pyscf/scf/int2c2e.py b/gpu4pyscf/scf/int2c2e.py index 8ec1564d..0dbc8730 100644 --- a/gpu4pyscf/scf/int2c2e.py +++ b/gpu4pyscf/scf/int2c2e.py @@ -33,7 +33,7 @@ def get_int2c2e_sorted(mol, intopt=None, direct_scf_tol=1e-13, aosym=None, omega nao = mol.nao rows, cols = np.tril_indices(nao) - nao_cart = intopt.mol.nao + nao_cart = intopt._sorted_mol.nao norb_cart = nao_cart + 1 int2c = cupy.zeros([nao_cart, nao_cart], order='F') @@ -137,5 +137,5 @@ def get_int2c2e(mol, direct_scf_tol=1e-13): intopt = VHFOpt(mol, mol, 'int2e') intopt.build(direct_scf_tol, diag_block_with_triu=True, aosym=True) int2c = get_int2c2e_sorted(mol, intopt=intopt) - int2c = take_last2d(int2c, intopt.rev_ao_idx) + int2c = intopt.unsort_orbitals(int2c, axis=[0,1]) return int2c diff --git a/gpu4pyscf/scf/j_engine.py b/gpu4pyscf/scf/j_engine.py new file mode 100644 index 00000000..7ec884b8 --- /dev/null +++ b/gpu4pyscf/scf/j_engine.py @@ -0,0 +1,230 @@ +#!/usr/bin/env python +# +# Copyright 2024 The PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +''' +J engine using McMurchie-Davidson algorithm +''' + +import ctypes +import functools +import math +import numpy as np +import cupy as cp +import scipy.linalg +from pyscf import lib +from pyscf import __config__ +from gpu4pyscf.lib.cupy_helper import load_library, condense, sandwich_dot, transpose_sum +from gpu4pyscf.__config__ import props as gpu_specs +from gpu4pyscf.lib import logger +from gpu4pyscf.scf import jk +from gpu4pyscf.scf.jk import _make_j_engine_pair_locs, RysIntEnvVars + +__all__ = [ + 'get_j', +] + +PTR_BAS_COORD = 7 +LMAX = 4 +SHM_SIZE = getattr(__config__, 'GPU_SHM_SIZE', + int(gpu_specs['sharedMemPerBlockOptin']//9)*8) +THREADS = 256 + +libvhf_md = load_library('libgvhf_md') +libvhf_md.MD_build_j.restype = ctypes.c_int + +def get_j(mol, dm, hermi=1, vhfopt=None, omega=None, verbose=None): + '''Compute J matrix + ''' + log = logger.new_logger(mol, verbose) + cput0 = log.init_timer() + if vhfopt is None: + with mol.with_range_coulomb(omega): + vhfopt = _VHFOpt(mol).build() + if omega is None: + omega = mol.omega + + mol = vhfopt.mol + nbas = mol.nbas + nao, nao_orig = vhfopt.coeff.shape + dm = cp.asarray(dm, order='C') + dms = dm.reshape(-1,nao_orig,nao_orig) + n_dm = dms.shape[0] + assert n_dm == 1 + #:dms = cp.einsum('pi,nij,qj->npq', vhfopt.coeff, dms, vhfopt.coeff) + dms = sandwich_dot(dms, vhfopt.coeff.T) + dms = cp.asarray(dms, order='C') + if hermi != 1: + dms = transpose_sum(dms) + else: + dms *= 2. 
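# Clarifying comment (an assumption about intent): the J engine contracts only
# symmetry-unique shell pairs, so it needs the symmetrized density dm + dm.T;
# when the input is already hermitian, scaling by 2 is equivalent and cheaper.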
+ + ao_loc = mol.ao_loc + dm_cond = cp.log(condense('absmax', dms, ao_loc) + 1e-300).astype(np.float32) + log_max_dm = dm_cond.max() + log_cutoff = math.log(vhfopt.direct_scf_tol) + + dms = dms.get() + pair_loc = _make_j_engine_pair_locs(mol) + dm_xyz = np.zeros(pair_loc[-1]) + # Must use this modified _env to ensure the consistency with GPU kernel + # In this _env, normalization coefficients for s and p funcitons are scaled. + _env = vhfopt._mol_gpu[2].get() + libvhf_md.Et_dot_dm( + dm_xyz.ctypes, dms.ctypes, ao_loc.ctypes, pair_loc.ctypes, + mol._bas.ctypes, ctypes.c_int(mol.nbas), _env.ctypes) + dm_xyz = cp.asarray(dm_xyz) + vj_xyz = cp.zeros_like(dm_xyz) + + pair_loc_on_gpu = cp.asarray(pair_loc) + rys_envs = RysIntEnvVars( + mol.natm, mol.nbas, + vhfopt.rys_envs.atm, vhfopt.rys_envs.bas, vhfopt.rys_envs.env, + pair_loc_on_gpu.data.ptr, + ) + + libvhf_md.init_mdj_constant(ctypes.c_int(SHM_SIZE)) + + uniq_l_ctr = vhfopt.uniq_l_ctr + uniq_l = uniq_l_ctr[:,0] + l_ctr_bas_loc = vhfopt.l_ctr_offsets + l_symb = [lib.param.ANGULAR[i] for i in uniq_l] + n_groups = len(uniq_l_ctr) + tile_mappings = {} + workers = gpu_specs['multiProcessorCount'] + info = cp.empty(2, dtype=np.uint32) + + for i in range(n_groups): + for j in range(i+1): + ish0, ish1 = l_ctr_bas_loc[i], l_ctr_bas_loc[i+1] + jsh0, jsh1 = l_ctr_bas_loc[j], l_ctr_bas_loc[j+1] + ij_shls = (ish0, ish1, jsh0, jsh1) + sub_q = vhfopt.q_cond[ish0:ish1,jsh0:jsh1] + mask = sub_q > log_cutoff# - log_max_dm + if i == j: + mask = cp.tril(mask) + t_ij = (cp.arange(ish0, ish1, dtype=np.int32)[:,None] * nbas + + cp.arange(jsh0, jsh1, dtype=np.int32)) + idx = cp.argsort(sub_q[mask])[::-1] + tile_mappings[i,j] = t_ij[mask][idx] + t1 = t2 = log.timer_debug1('q_cond and dm_cond', *cput0) + + timing_collection = {} + kern_counts = 0 + kern = libvhf_md.MD_build_j + + for i in range(n_groups): + for j in range(i+1): + ij_shls = (l_ctr_bas_loc[i], l_ctr_bas_loc[i+1], + l_ctr_bas_loc[j], l_ctr_bas_loc[j+1]) + tile_ij_mapping = tile_mappings[i,j] + for k in range(i+1): + for l in range(k+1): + if i == k and j < l: continue + llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})' + kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1], + l_ctr_bas_loc[l], l_ctr_bas_loc[l+1]) + tile_kl_mapping = tile_mappings[k,l] + scheme = _md_j_engine_quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]]) + err = kern( + ctypes.cast(vj_xyz.data.ptr, ctypes.c_void_p), + ctypes.cast(dm_xyz.data.ptr, ctypes.c_void_p), + ctypes.c_int(n_dm), ctypes.c_int(nao), + rys_envs, (ctypes.c_int*3)(*scheme), + (ctypes.c_int*8)(*ij_shls, *kl_shls), + ctypes.c_int(tile_ij_mapping.size), + ctypes.c_int(tile_kl_mapping.size), + ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p), + ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p), + ctypes.cast(vhfopt.tile_q_cond.data.ptr, ctypes.c_void_p), + ctypes.cast(vhfopt.q_cond.data.ptr, ctypes.c_void_p), + lib.c_null_ptr(), + ctypes.c_float(log_cutoff-log_max_dm), + ctypes.cast(info.data.ptr, ctypes.c_void_p), + ctypes.c_int(workers), ctypes.c_double(omega), + mol._atm.ctypes, ctypes.c_int(mol.natm), + mol._bas.ctypes, ctypes.c_int(mol.nbas), _env.ctypes) + if err != 0: + raise RuntimeError(f'RYS_build_jk kernel for {llll} failed') + if log.verbose >= logger.DEBUG1: + ntasks = tile_ij_mapping.size * tile_kl_mapping.size + t1, t1p = log.timer_debug1(f'processing {llll}, tasks ~= {ntasks}', *t1), t1 + if llll not in timing_collection: + timing_collection[llll] = 0 + timing_collection[llll] += t1[1] - t1p[1] + kern_counts += 1 + + if log.verbose >= 
logger.DEBUG1: + log.debug1('kernel launches %d', kern_counts) + for llll, t in timing_collection.items(): + log.debug1('%s wall time %.2f', llll, t) + cp.cuda.Stream.null.synchronize() + log.timer_debug1('cuda kernel', *t2) + + vj_xyz = vj_xyz.get() + vj = np.zeros_like(dms) + libvhf_md.jengine_dot_Et( + vj.ctypes, vj_xyz.ctypes, ao_loc.ctypes, pair_loc.ctypes, + mol._bas.ctypes, ctypes.c_int(mol.nbas), _env.ctypes) + #:vj = cp.einsum('pi,npq,qj->nij', vhfopt.coeff, cp.asarray(vj), vhfopt.coeff) + vj = sandwich_dot(vj, vhfopt.coeff) + vj = transpose_sum(vj) + vj = vj.reshape(dm.shape) + log.timer('vj', *cput0) + return vj + +class _VHFOpt(jk._VHFOpt): + def __init__(self, mol, cutoff=1e-13): + self.mol, self.coeff = mol.decontract_basis(to_cart=True, aggregate=True) + self.direct_scf_tol = cutoff + self.uniq_l_ctr = None + self.l_ctr_offsets = None + self.q_cond = None + self.tile_q_cond = None + self.tile = 1 + +def _md_j_engine_quartets_scheme(mol, l_ctr_pattern, shm_size=SHM_SIZE): + ls = l_ctr_pattern[:,0] + li, lj, lk, ll = ls + order = li + lj + lk + ll + lij = li + lj + lkl = lk + ll + nf3ij = (lij+1)*(lij+2)*(lij+3)//6 + nf3kl = (lkl+1)*(lkl+2)*(lkl+3)//6 + unit = order+1 + (order+1)*(order+2)*(2*order+3)//6 + counts = shm_size // (unit*8) + if counts >= THREADS: + nsq = THREADS + else: + nsq = _nearest_power2(counts) + ij = _nearest_power2(int(nsq**.5)) + kl = nsq // ij + tilex, tiley = 2, 4 + cache_size = ij*tilex * (4+nf3ij) + kl*tiley * (4+nf3kl) + while (nsq * unit + cache_size) * 8 > shm_size: + nsq //= 2 + ij = _nearest_power2(int(nsq**.5)) + kl = nsq // ij + cache_size = ij*tilex * (4+nf3ij) + kl*tiley * (4+nf3kl) + gout_stride = THREADS // nsq + return ij, kl, gout_stride + +def _nearest_power2(n): + t = 0 + while n > 1: + n >>= 1 + t += 1 + return 2**t diff --git a/gpu4pyscf/scf/jk.py b/gpu4pyscf/scf/jk.py index 13b19277..939ba956 100644 --- a/gpu4pyscf/scf/jk.py +++ b/gpu4pyscf/scf/jk.py @@ -1,3 +1,24 @@ +#!/usr/bin/env python +# +# Copyright 2024 The PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +''' +Compute J/K matrices +''' + import ctypes import math import numpy as np diff --git a/gpu4pyscf/scf/rohf.py b/gpu4pyscf/scf/rohf.py index 9e80a93b..67153195 100644 --- a/gpu4pyscf/scf/rohf.py +++ b/gpu4pyscf/scf/rohf.py @@ -15,29 +15,76 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . +from functools import reduce import numpy as np import cupy -from pyscf.scf import rohf +from pyscf.scf import rohf as rohf_cpu from gpu4pyscf.scf import hf, uhf -from gpu4pyscf.lib.cupy_helper import tag_array +from gpu4pyscf.lib.cupy_helper import tag_array, contract -class ROHF(rohf.ROHF, hf.RHF): +def get_roothaan_fock(focka_fockb, dma_dmb, s): + '''Roothaan's effective fock. + Ref. 
http://www-theor.ch.cam.ac.uk/people/ross/thesis/node15.html
+
+    ======== ======== ====== =========
+    space     closed   open   virtual
+    ======== ======== ====== =========
+    closed      Fc       Fb      Fc
+    open        Fb       Fc      Fa
+    virtual     Fc       Fa      Fc
+    ======== ======== ====== =========
+
+    where Fc = (Fa + Fb) / 2
+
+    Returns:
+        Roothaan effective Fock matrix
+    '''
+    nao = s.shape[0]
+    focka, fockb = focka_fockb
+    dma, dmb = dma_dmb
+    fc = (focka + fockb) * .5
+# Projector for core, open-shell, and virtual
+    pc = cupy.dot(dmb, s)
+    po = cupy.dot(dma-dmb, s)
+    pv = cupy.eye(nao) - cupy.dot(dma, s)
+    fock = reduce(cupy.dot, (pc.conj().T, fc, pc)) * .5
+    fock += reduce(cupy.dot, (po.conj().T, fc, po)) * .5
+    fock += reduce(cupy.dot, (pv.conj().T, fc, pv)) * .5
+    fock += reduce(cupy.dot, (po.conj().T, fockb, pc))
+    fock += reduce(cupy.dot, (po.conj().T, focka, pv))
+    fock += reduce(cupy.dot, (pv.conj().T, fc, pc))
+    fock = fock + fock.conj().T
+    fock = tag_array(fock, focka=focka, fockb=fockb)
+    return fock
+
+def canonicalize(mf, mo_coeff, mo_occ, fock=None):
+    '''Canonicalization diagonalizes the Fock matrix within occupied, open,
+    virtual subspaces separately (without changing occupancy).
+    '''
+    if getattr(fock, 'focka', None) is None:
+        dm = mf.make_rdm1(mo_coeff, mo_occ)
+        fock = mf.get_fock(dm=dm)
+    mo_e, mo_coeff = hf.canonicalize(mf, mo_coeff, mo_occ, fock)
+    fa, fb = fock.focka, fock.fockb
+    mo_ea = contract('pi,pi->i', mo_coeff.conj(), fa.dot(mo_coeff)).real
+    mo_eb = contract('pi,pi->i', mo_coeff.conj(), fb.dot(mo_coeff)).real
+    mo_e = tag_array(mo_e, mo_ea=mo_ea, mo_eb=mo_eb)
+    return mo_e, mo_coeff
+
+class ROHF(hf.RHF):
     from gpu4pyscf.lib.utils import to_cpu, to_gpu, device

+    nelec = rohf_cpu.ROHF.nelec
     get_jk = hf._get_jk
-    _eigh = hf.RHF._eigh
+    _eigh = staticmethod(hf.eigh)
     scf = kernel = hf.RHF.kernel
     # FIXME: Needs more tests for get_fock and get_occ
-    get_fock = hf.return_cupy_array(rohf.ROHF.get_fock)
-    get_occ = hf.return_cupy_array(rohf.ROHF.get_occ)
+    get_occ = hf.return_cupy_array(rohf_cpu.ROHF.get_occ)
     get_hcore = hf.RHF.get_hcore
     get_ovlp = hf.RHF.get_ovlp
     get_init_guess = uhf.UHF.get_init_guess
-    make_rdm1 = hf.return_cupy_array(rohf.ROHF.make_rdm1)
     make_rdm2 = NotImplemented
-    dump_chk = NotImplemented
-    newton = NotImplemented
     x2c = x2c1e = sfx2c1e = NotImplemented
     to_rhf = NotImplemented
     to_uhf = NotImplemented
@@ -46,18 +93,83 @@ class ROHF(rohf.ROHF, hf.RHF):
     to_uks = NotImplemented
     to_gks = NotImplemented
     to_ks = NotImplemented
-    canonicalize = NotImplemented
     analyze = NotImplemented
     stability = NotImplemented
     mulliken_pop = NotImplemented
     mulliken_meta = NotImplemented
     nuc_grad_method = NotImplemented
+    canonicalize = canonicalize
+
+    def make_rdm1(self, mo_coeff, mo_occ, **kwargs):
+        '''One-particle density matrix. mo_occ is a 1D array with occupancies 0, 1, or 2.
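        An illustrative decomposition, assuming a doublet (one open shell):

            mo_occ  = [2, 2, 2, 2, 1, 0, ...]
            mo_occa = mo_occ > 0     # alpha occupations: [1, 1, 1, 1, 1, 0, ...]
            mo_occb = mo_occ == 2    # beta occupations:  [1, 1, 1, 1, 0, 0, ...]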
+ ''' + if isinstance(mo_occ, cupy.ndarray) and mo_occ.ndim == 1: + mo_occa = (mo_occ > 0).astype(np.double) + mo_occb = (mo_occ ==2).astype(np.double) + else: + mo_occa, mo_occb = mo_occ + dm_a = cupy.dot(mo_coeff*mo_occa, mo_coeff.conj().T) + dm_b = cupy.dot(mo_coeff*mo_occb, mo_coeff.conj().T) + return tag_array((dm_a, dm_b), mo_coeff=mo_coeff, mo_occ=mo_occ) + + def eig(self, fock, s): + e, c = self._eigh(fock, s) + if getattr(fock, 'focka', None) is not None: + mo_ea = contract('pi,pi->i', c.conj(), fock.focka.dot(c)).real + mo_eb = contract('pi,pi->i', c.conj(), fock.fockb.dot(c)).real + e = tag_array(e, mo_ea=mo_ea, mo_eb=mo_eb) + return e, c + + def energy_elec(self, dm=None, h1e=None, vhf=None): + if dm is None: dm = self.make_rdm1() + elif isinstance(dm, cupy.ndarray) and dm.ndim == 2: + dm = [dm*.5, dm*.5] + return uhf.energy_elec(self, dm, h1e, vhf) + + def get_fock(self, h1e=None, s1e=None, vhf=None, dm=None, cycle=-1, diis=None, + diis_start_cycle=None, level_shift_factor=None, damp_factor=None, + fock_last=None): + '''Build fock matrix based on Roothaan's effective fock. + See also :func:`get_roothaan_fock` + ''' + if h1e is None: h1e = self.get_hcore() + if s1e is None: s1e = self.get_ovlp() + if vhf is None: vhf = self.get_veff(self.mol, dm) + if dm is None: dm = self.make_rdm1() + if isinstance(dm, cupy.ndarray) and dm.ndim == 2: + dm = [dm*.5, dm*.5] +# To Get orbital energy in get_occ, we saved alpha and beta fock, because +# Roothaan effective Fock cannot provide correct orbital energy with `eig` +# TODO, check other treatment J. Chem. Phys. 133, 141102 + focka = h1e + vhf[0] + fockb = h1e + vhf[1] + f = get_roothaan_fock((focka,fockb), dm, s1e) + if cycle < 0 and diis is None: # Not inside the SCF iteration + return f + + if diis_start_cycle is None: + diis_start_cycle = self.diis_start_cycle + if level_shift_factor is None: + level_shift_factor = self.level_shift + if damp_factor is None: + damp_factor = self.damp + + dm_tot = dm[0] + dm[1] + if 0 <= cycle < diis_start_cycle-1 and abs(damp_factor) > 1e-4 and fock_last is not None: + raise NotImplementedError('ROHF Fock-damping') + if diis and cycle >= diis_start_cycle: + f = diis.update(s1e, dm_tot, f, self, h1e, vhf, f_prev=fock_last) + if abs(level_shift_factor) > 1e-4: + f = hf.level_shift(s1e, dm_tot*.5, f, level_shift_factor) + f = tag_array(f, focka=focka, fockb=fockb) + return f + def get_veff(self, mol=None, dm=None, dm_last=None, vhf_last=0, hermi=1): if mol is None: mol = self.mol if dm is None: dm = self.make_rdm1() if getattr(dm, 'ndim', 0) == 2: - dm = cupy.asarray((dm*.5,dm*.5)) + dm = cupy.stack((dm*.5,dm*.5)) if dm_last is None or not self.direct_scf: if getattr(dm, 'mo_coeff', None) is not None: @@ -74,3 +186,35 @@ def get_veff(self, mol=None, dm=None, dm_last=None, vhf_last=0, hermi=1): vhf = vj[0] + vj[1] - vk vhf += vhf_last return vhf + + def get_grad(self, mo_coeff, mo_occ, fock): + '''ROHF gradients is the off-diagonal block [co + cv + ov], where + [ cc co cv ] + [ oc oo ov ] + [ vc vo vv ] + ''' + occidxa = mo_occ > 0 + occidxb = mo_occ == 2 + viridxa = ~occidxa + viridxb = ~occidxb + uniq_var_a = viridxa.reshape(-1,1) & occidxa + uniq_var_b = viridxb.reshape(-1,1) & occidxb + + if getattr(fock, 'focka', None) is not None: + focka = fock.focka + fockb = fock.fockb + elif isinstance(fock, (tuple, list)) or getattr(fock, 'ndim', None) == 3: + focka, fockb = fock + else: + focka = fockb = fock + focka = mo_coeff.conj().T.dot(focka).dot(mo_coeff) + fockb = 
mo_coeff.conj().T.dot(fockb).dot(mo_coeff) + + g = cupy.zeros_like(focka) + g[uniq_var_a] = focka[uniq_var_a] + g[uniq_var_b] += fockb[uniq_var_b] + return g[uniq_var_a | uniq_var_b] + + def newton(self): + from gpu4pyscf.scf.soscf import newton + return newton(self) diff --git a/gpu4pyscf/scf/soscf.py b/gpu4pyscf/scf/soscf.py new file mode 100644 index 00000000..f64aa441 --- /dev/null +++ b/gpu4pyscf/scf/soscf.py @@ -0,0 +1,704 @@ +#!/usr/bin/env python +# +# Copyright 2024 The PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +''' +Second order SCF solver +''' + +import sys +import math +import numpy as np +import cupy as cp +import scipy.linalg +from cupyx.scipy.linalg import expm +from pyscf import lib +from pyscf.scf import chkfile +from pyscf.soscf import ciah +from pyscf.soscf.newton_ah import _CIAH_SOSCF as _SOSCF_cpu +from gpu4pyscf.lib import logger +from gpu4pyscf.scf import hf, rohf, uhf +from gpu4pyscf.lib.cupy_helper import transpose_sum, contract +from gpu4pyscf.lib import utils + +def gen_g_hop_rhf(mf, mo_coeff, mo_occ, fock_ao=None, h1e=None): + assert mo_coeff.dtype == np.float64 + occidx = cp.nonzero(mo_occ==2)[0] + viridx = cp.nonzero(mo_occ==0)[0] + orbo = mo_coeff[:,occidx] + orbv = mo_coeff[:,viridx] + nocc = orbo.shape[1] + nvir = orbv.shape[1] + + if fock_ao is None: + dm0 = mf.make_rdm1(mo_coeff, mo_occ) + fock_ao = mf.get_fock(h1e, dm=dm0) + fock = mo_coeff.conj().T.dot(fock_ao).dot(mo_coeff) + foo = fock[occidx[:,None],occidx] + fvv = fock[viridx[:,None],viridx] + + g = fock[viridx[:,None],occidx] * 2 + h_diag = (fvv.diagonal().real[:,None] - foo.diagonal().real) * 2 + + vind = mf.gen_response(mo_coeff, mo_occ, singlet=None, hermi=1) + + def h_op(x): + x = x.reshape(nvir,nocc) + x2 = contract('ps,sq->pq', fvv, x) + x2-= contract('ps,rp->rs', foo, x) + + # *2 for double occupancy + dm1 = orbv.dot(x*2).dot(orbo.conj().T) + dm1 = transpose_sum(dm1) + v1 = vind(dm1) + x2 += orbv.conj().T.dot(v1).dot(orbo) + return x2.ravel() * 2 + + return g.reshape(-1), h_op, h_diag.reshape(-1) + +def gen_g_hop_rohf(mf, mo_coeff, mo_occ, fock_ao=None, h1e=None): + if getattr(fock_ao, 'focka', None) is None: + dm0 = mf.make_rdm1(mo_coeff, mo_occ) + fock_ao = mf.get_fock(h1e, dm=dm0) + fock_ao = fock_ao.focka, fock_ao.fockb + mo_occa = occidxa = mo_occ > 0 + mo_occb = occidxb = mo_occ ==2 + ug, uh_op, uh_diag = gen_g_hop_uhf( + mf, (mo_coeff,)*2, (mo_occa,mo_occb), fock_ao, None) + + viridxa = ~occidxa + viridxb = ~occidxb + uniq_var_a = viridxa[:,None] & occidxa + uniq_var_b = viridxb[:,None] & occidxb + uniq_ab = uniq_var_a | uniq_var_b + nmo = mo_coeff.shape[-1] + nocca, noccb = mf.nelec + nvira = nmo - nocca + + def sum_ab(x): + x1 = cp.zeros((nmo,nmo), dtype=x.dtype) + x1[uniq_var_a] = x[:nvira*nocca] + x1[uniq_var_b] += x[nvira*nocca:] + return x1[uniq_ab] + + g = sum_ab(ug) + h_diag = sum_ab(uh_diag) + def h_op(x): + x1 = cp.zeros((nmo,nmo), dtype=x.dtype) + # unpack ROHF rotation 
parameters + x1[uniq_ab] = x + x1 = cp.hstack((x1[uniq_var_a],x1[uniq_var_b])) + return sum_ab(uh_op(x1)) + + return g, h_op, h_diag + +def gen_g_hop_uhf(mf, mo_coeff, mo_occ, fock_ao=None, h1e=None): + assert mo_coeff[0].dtype == np.float64 + occidxa = cp.nonzero(mo_occ[0] > 0)[0] + occidxb = cp.nonzero(mo_occ[1] > 0)[0] + viridxa = cp.nonzero(mo_occ[0] == 0)[0] + viridxb = cp.nonzero(mo_occ[1] == 0)[0] + orboa = mo_coeff[0][:,occidxa] + orbob = mo_coeff[1][:,occidxb] + orbva = mo_coeff[0][:,viridxa] + orbvb = mo_coeff[1][:,viridxb] + nmo = mo_occ[0].size + nocca, noccb = mf.nelec + nvira = nmo - nocca + nvirb = nmo - noccb + + if fock_ao is None: + dm0 = mf.make_rdm1(mo_coeff, mo_occ) + fock_ao = mf.get_fock(h1e, dm=dm0) + focka = mo_coeff[0].conj().T.dot(fock_ao[0]).dot(mo_coeff[0]) + fockb = mo_coeff[1].conj().T.dot(fock_ao[1]).dot(mo_coeff[1]) + fooa = focka[occidxa[:,None],occidxa] + fvva = focka[viridxa[:,None],viridxa] + foob = fockb[occidxb[:,None],occidxb] + fvvb = fockb[viridxb[:,None],viridxb] + + g = cp.hstack((focka[viridxa[:,None],occidxa].ravel(), + fockb[viridxb[:,None],occidxb].ravel())) + h_diaga = fvva.diagonal().real[:,None] - fooa.diagonal().real + h_diagb = fvvb.diagonal().real[:,None] - foob.diagonal().real + h_diag = cp.hstack((h_diaga.reshape(-1), h_diagb.reshape(-1))) + + vind = mf.gen_response(mo_coeff, mo_occ, hermi=1) + + def h_op(x): + x1a = x[:nvira*nocca].reshape(nvira,nocca) + x1b = x[nvira*nocca:].reshape(nvirb,noccb) + x2a = contract('pr,rq->pq', fvva, x1a) + x2a-= contract('sq,ps->pq', fooa, x1a) + x2b = contract('pr,rq->pq', fvvb, x1b) + x2b-= contract('sq,ps->pq', foob, x1b) + + d1a = orbva.dot(x1a).dot(orboa.conj().T) + d1b = orbvb.dot(x1b).dot(orbob.conj().T) + dm1 = cp.array([transpose_sum(d1a), + transpose_sum(d1b)]) + v1 = vind(dm1) + x2a += orbva.conj().T.dot(v1[0]).dot(orboa) + x2b += orbvb.conj().T.dot(v1[1]).dot(orbob) + return cp.hstack((x2a.ravel(), x2b.ravel())) + + return g, h_op, h_diag + + +def _rotate_orb_cc(mf, h1e, s1e, conv_tol_grad=None, verbose=None): + log = logger.new_logger(mf, verbose) + + if conv_tol_grad is None: + conv_tol_grad = (mf.conv_tol*.1)**.5 + #TODO: dynamically adjust max_stepsize, as done in mc1step.py + + def precond(x, e): + hdiagd = h_diag-(e-mf.ah_level_shift) + hdiagd[abs(hdiagd)<1e-8] = 1e-8 + x = x/hdiagd + return x + + t3m = log.init_timer() + u = g_kf = g_orb = norm_gorb = dxi = kfcount = jkcount = None + dm0 = vhf0 = None + g_op = lambda: g_orb + while True: + mo_coeff, mo_occ, dm0, vhf0, e_tot = (yield u, g_kf, kfcount, jkcount, dm0, vhf0) + fock_ao = mf.get_fock(h1e, s1e, vhf0, dm0) + + g_kf, h_op, h_diag = mf.gen_g_hop(mo_coeff, mo_occ, fock_ao) + norm_gkf = cp.linalg.norm(g_kf) + if g_orb is None: + log.debug(' |g|= %4.3g (keyframe)', norm_gkf) + kf_trust_region = mf.kf_trust_region + x0_guess = g_kf + else: + norm_dg = cp.linalg.norm(g_kf-g_orb) + log.debug(' |g|= %4.3g (keyframe), |g-correction|= %4.3g', + norm_gkf, norm_dg) + kf_trust_region = min(max(norm_gorb/(norm_dg+1e-9), mf.kf_trust_region), 10) + log.debug1('Set kf_trust_region = %g', kf_trust_region) + x0_guess = dxi + g_orb = g_kf + norm_gorb = norm_gkf + problem_size = g_orb.size + + ah_conv_tol = min(norm_gorb**2, mf.ah_conv_tol) + # increase the AH accuracy when approach convergence + ah_start_cycle = mf.ah_start_cycle + imic = 0 + dr = 0. + u = 1. 
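# Added bookkeeping notes (hedged): within one keyframe, dr accumulates the
# net orbital-rotation step and u the total rotation matrix; ikf counts micro
# iterations since the last keyframe, ihop counts augmented-Hessian (Davidson)
# iterations, and kfcount/jkcount track Fock-build work for logging.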
+ ukf = None + jkcount = 0 + kfcount = 0 + ikf = 0 + ihop = 0 + + for ah_end, ihop, w, dxi, hdxi, residual, seig \ + in _davidson_cc(h_op, g_op, precond, x0_guess, + tol=ah_conv_tol, max_cycle=mf.ah_max_cycle, + lindep=mf.ah_lindep, verbose=log): + norm_residual = cp.linalg.norm(residual) + ah_start_tol = min(norm_gorb*5, mf.ah_start_tol) + if (ah_end or ihop == mf.ah_max_cycle or # make sure to use the last step + ((norm_residual < ah_start_tol) and (ihop >= ah_start_cycle)) or + (seig < mf.ah_lindep)): + imic += 1 + dxmax = abs(dxi).max() + if ihop == problem_size: + log.debug1('... Hx=g fully converged for small systems') + elif dxmax > mf.max_stepsize: + scale = mf.max_stepsize / dxmax + log.debug1('... scale rotation size %g', scale) + dxi *= scale + hdxi *= scale + + dr = dr + dxi + g_orb = g_orb + hdxi + norm_dr = cp.linalg.norm(dr) + norm_gorb = cp.linalg.norm(g_orb) + norm_dxi = cp.linalg.norm(dxi) + log.debug(' imic %d(%d) |g|= %4.3g |dxi|= %4.3g ' + 'max(|x|)= %4.3g |dr|= %4.3g eig= %4.3g seig= %4.3g', + imic, ihop, norm_gorb, norm_dxi, + dxmax, norm_dr, w, seig) + + max_cycle = max(mf.max_cycle_inner, + mf.max_cycle_inner-int(math.log(norm_gkf+1e-9)*2)) + log.debug1('Set ah_start_tol %g, ah_start_cycle %d, max_cycle %d', + ah_start_tol, ah_start_cycle, max_cycle) + ikf += 1 + if imic > 3 and norm_gorb > norm_gkf*mf.ah_grad_trust_region: + g_orb = g_orb - hdxi + dr -= dxi + norm_gorb = cp.linalg.norm(g_orb) + log.debug('|g| >> keyframe, Restore previouse step') + break + + elif (imic >= max_cycle or norm_gorb < conv_tol_grad/mf.ah_grad_trust_region): + break + + elif (ikf > 2 and # avoid frequent keyframe + #TODO: replace it with keyframe_scheduler + (ikf >= max(mf.kf_interval, mf.kf_interval-math.log(norm_dr+1e-9)) or + # Insert keyframe if the keyframe and the estimated g_orb are too different + norm_gorb < norm_gkf/kf_trust_region)): + ikf = 0 + u = mf.update_rotate_matrix(dr, mo_occ, mo_coeff=mo_coeff) + if ukf is not None: + u = mf.rotate_mo(ukf, u) + ukf = u + dr[:] = 0 + mo1 = mf.rotate_mo(mo_coeff, u) + dm = mf.make_rdm1(mo1, mo_occ) + # use mf._scf.get_veff to avoid density-fit mf polluting get_veff + vhf0 = mf._scf.get_veff(mf._scf.mol, dm, dm_last=dm0, vhf_last=vhf0) + dm0 = dm + # Use API to compute fock instead of "fock=h1e+vhf0". This is because get_fock + # is the hook being overloaded in many places. + fock_ao = mf.get_fock(h1e, s1e, vhf0, dm0) + g_kf1 = mf.get_grad(mo1, mo_occ, fock_ao) + norm_gkf1 = cp.linalg.norm(g_kf1) + norm_dg = cp.linalg.norm(g_kf1-g_orb) + jkcount += 1 + kfcount += 1 + if log.verbose >= logger.DEBUG: + e_tot, e_last = mf._scf.energy_tot(dm, h1e, vhf0), e_tot + log.debug('Adjust keyframe g_orb to |g|= %4.3g ' + '|g-correction|=%4.3g E=%.12g dE=%.5g', + norm_gkf1, norm_dg, e_tot, e_tot-e_last) + + if (norm_dg < norm_gorb*mf.ah_grad_trust_region # kf not too diff + #or norm_gkf1 < norm_gkf # grad is decaying + # close to solution + or norm_gkf1 < conv_tol_grad*mf.ah_grad_trust_region): + kf_trust_region = min(max(norm_gorb/(norm_dg+1e-9), mf.kf_trust_region), 10) + log.debug1('Set kf_trust_region = %g', kf_trust_region) + g_orb = g_kf = g_kf1 + norm_gorb = norm_gkf = norm_gkf1 + else: + g_orb = g_orb - hdxi + dr -= dxi + norm_gorb = cp.linalg.norm(g_orb) + log.debug('Out of trust region. 
Restore previouse step') + break + + if ihop > 0: + u = mf.update_rotate_matrix(dr, mo_occ, mo_coeff=mo_coeff) + if ukf is not None: + u = mf.rotate_mo(ukf, u) + jkcount += ihop + 1 + log.debug(' tot inner=%d %d JK |g|= %4.3g |u-1|= %4.3g', + imic, jkcount, norm_gorb, cp.linalg.norm(dr)) + h_op = h_diag = None + t3m = log.timer('aug_hess in %d inner iters' % imic, *t3m) + +def _davidson_cc(h_op, g_op, precond, x0, tol=1e-10, xs=[], ax=[], + max_cycle=30, lindep=1e-14, verbose=logger.WARN): + if isinstance(verbose, logger.Logger): + log = verbose + else: + log = logger.Logger(sys.stdout, verbose) + + toloose = tol**.5 + # the first trial vector is (1,0,0,...), which is not included in xs + xs = list(xs) + ax = list(ax) + nx = len(xs) + + problem_size = x0.size + max_cycle = min(max_cycle, problem_size) + heff = np.zeros((max_cycle+nx+1,max_cycle+nx+1), dtype=x0.dtype) + ovlp = np.eye(max_cycle+nx+1, dtype=x0.dtype) + if nx == 0: + xs.append(x0) + ax.append(h_op(x0)) + else: + for i in range(1, nx+1): + for j in range(1, i+1): + heff[i,j] = xs[i-1].conj().dot(ax[j-1]) + ovlp[i,j] = xs[i-1].conj().dot(xs[j-1]) + heff[1:i,i] = heff[i,1:i].conj() + ovlp[1:i,i] = ovlp[i,1:i].conj() + + w_t = 0 + for istep in range(max_cycle): + g = g_op() + nx = len(xs) + for i in range(nx): + heff[i+1,0] = xs[i].conj().dot(g) + heff[nx,i+1] = xs[nx-1].conj().dot(ax[i]) + ovlp[nx,i+1] = xs[nx-1].conj().dot(xs[i]) + heff[0,:nx+1] = heff[:nx+1,0].conj() + heff[1:nx,nx] = heff[nx,1:nx].conj() + ovlp[1:nx,nx] = ovlp[nx,1:nx].conj() + nvec = nx + 1 + #s0 = scipy.linalg.eigh(ovlp[:nvec,:nvec])[0][0] + #if s0 < lindep: + # yield True, istep, w_t, xtrial, hx, dx, s0 + # break + wlast = w_t + xtrial, w_t, v_t, index, seig = \ + _regular_step(heff[:nvec,:nvec], ovlp[:nvec,:nvec], xs, + lindep, log) + s0 = seig[0] + hx = _dgemv(v_t[1:], ax) + # note g*v_t[0], as the first trial vector is (1,0,0,...) + dx = hx + g*v_t[0] - w_t * v_t[0]*xtrial + norm_dx = np.linalg.norm(dx) + log.debug1('... AH step %d index= %d |dx|= %.5g eig= %.5g v[0]= %.5g lindep= %.5g', + istep+1, index, norm_dx, w_t, v_t[0].real, s0) + hx *= 1/v_t[0] # == h_op(xtrial) + if ((abs(w_t-wlast) < tol and norm_dx < toloose) or + s0 < lindep or + istep+1 == problem_size): + # Avoid adding more trial vectors if hessian converged + yield True, istep+1, w_t, xtrial, hx, dx, s0 + if s0 < lindep or norm_dx < lindep:# or np.linalg.norm(xtrial) < lindep: + # stop the iteration because eigenvectors would be barely updated + break + else: + yield False, istep+1, w_t, xtrial, hx, dx, s0 + x0 = precond(dx, w_t) + xs.append(x0) + ax.append(h_op(x0)) + +def _regular_step(heff, ovlp, xs, lindep, log, root_id=0): + w, v, seig = lib.safe_eigh(heff, ovlp, lindep) + #if e[0] < -.1: + # sel = 0 + #else: + # There exists systems that the first eigenvalue of AH is -inf. + # Dynamically choosing the eigenvectors may be better. 
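# Added comment (hedged): select the lowest AH root whose eigenvector keeps a
# significant component (> 0.1) along the reference direction (1, 0, 0, ...);
# a root with v[0] ~ 0 predicts no meaningful orbital step.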
+ idx = np.nonzero(abs(v[0]) > 0.1)[0] + sel = idx[root_id] + log.debug1('CIAH eigen-sel %s', sel) + w_t = w[sel] + + if w_t < 1e-4: + try: + e, c = scipy.linalg.eigh(heff[1:,1:], ovlp[1:,1:]) + except scipy.linalg.LinAlgError: + e, c = lib.safe_eigh(heff[1:,1:], ovlp[1:,1:], lindep)[:2] + if np.any(e < -1e-5): + log.debug('Negative hessians found %s', e[e<0]) + + xtrial = _dgemv(v[1:,sel]/v[0,sel], xs) + return xtrial, w_t, v[:,sel], sel, seig + +def _dgemv(v, m): + vm = v[0] * m[0] + for i,vi in enumerate(v[1:]): + vm += vi * m[i+1] + return vm + + +def kernel(mf, mo_coeff=None, mo_occ=None, dm=None, + conv_tol=1e-10, conv_tol_grad=None, max_cycle=50, dump_chk=True, + callback=None, verbose=logger.NOTE): + log = logger.new_logger(mf, verbose) + cput0 = log.init_timer() + mol = mf._scf.mol + assert mol is mf.mol + + if conv_tol_grad is None: + conv_tol_grad = conv_tol**.5 + log.info('Set conv_tol_grad to %g', conv_tol_grad) + + # call mf._scf.get_hcore, mf._scf.get_ovlp because they might be overloaded + h1e = mf._scf.get_hcore(mol) + s1e = mf._scf.get_ovlp(mol) + + if mo_coeff is not None and mo_occ is not None: + dm = mf.make_rdm1(mo_coeff, mo_occ) + # call mf._scf.get_veff, to avoid "newton().density_fit()" polluting get_veff + vhf = mf._scf.get_veff(mol, dm) + fock = mf.get_fock(h1e, s1e, vhf, dm, level_shift_factor=0) + mo_energy, mo_tmp = mf.eig(fock, s1e) + mf.get_occ(mo_energy, mo_tmp) + mo_tmp = None + + else: + if dm is None: + dm = mf.get_init_guess(mol, mf.init_guess) + vhf = mf._scf.get_veff(mol, dm) + fock = mf.get_fock(h1e, s1e, vhf, dm, level_shift_factor=0) + mo_energy, mo_coeff = mf.eig(fock, s1e) + mo_occ = mf.get_occ(mo_energy, mo_coeff) + dm, dm_last = mf.make_rdm1(mo_coeff, mo_occ), dm + vhf = mf._scf.get_veff(mol, dm, dm_last=dm_last, vhf_last=vhf) + + # Save mo_coeff and mo_occ because they are needed by function rotate_mo + mf.mo_coeff, mf.mo_occ = mo_coeff, mo_occ + + e_tot = mf._scf.energy_tot(dm, h1e, vhf) + fock = mf.get_fock(h1e, s1e, vhf, dm, level_shift_factor=0) + log.info('Initial guess E= %.15g |g|= %g', e_tot, + cp.linalg.norm(mf._scf.get_grad(mo_coeff, mo_occ, fock))) + + if dump_chk and mf.chkfile: + chkfile.save_mol(mol, mf.chkfile) + + # Copy the integral file to soscf object to avoid the integrals being + # cached twice. + if mol is mf.mol and not getattr(mf, 'with_df', None): + mf._eri = mf._scf._eri + + rotaiter = _rotate_orb_cc(mf, h1e, s1e, conv_tol_grad, verbose=log) + next(rotaiter) # start the iterator + kftot = jktot = 0 + norm_gorb = 0. 
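# Added comment (hedged): _rotate_orb_cc is driven as a coroutine. Each macro
# cycle below sends (mo_coeff, mo_occ, dm, vhf, e_tot) into it and receives
# the rotation u together with the gradient, the keyframe/JK counters, and the
# latest density and veff.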
+    scf_conv = False
+    cput1 = log.timer('initializing second order scf', *cput0)
+
+    for imacro in range(max_cycle):
+        u, g_orb, kfcount, jkcount, dm_last, vhf = \
+                rotaiter.send((mo_coeff, mo_occ, dm, vhf, e_tot))
+        kftot += kfcount + 1
+        jktot += jkcount + 1
+
+        last_hf_e = e_tot
+        norm_gorb = cp.linalg.norm(g_orb)
+        mo_coeff = mf.rotate_mo(mo_coeff, u, log)
+        dm = mf.make_rdm1(mo_coeff, mo_occ)
+        vhf = mf._scf.get_veff(mol, dm, dm_last=dm_last, vhf_last=vhf)
+        fock = mf.get_fock(h1e, s1e, vhf, dm, level_shift_factor=0)
+        # NOTE: DO NOT change the initial guess mo_occ, mo_coeff
+        if mf.verbose >= logger.DEBUG:
+            mo_energy, mo_tmp = mf.eig(fock, s1e)
+            mf.get_occ(mo_energy, mo_tmp)
+        # call mf._scf.energy_tot for dft, because the (dft).get_veff step saved _exc in mf._scf
+        e_tot = mf._scf.energy_tot(dm, h1e, vhf)
+
+        log.info('macro= %d E= %.15g delta_E= %g |g|= %g %d KF %d JK',
+                 imacro, e_tot, e_tot-last_hf_e, norm_gorb,
+                 kfcount+1, jkcount)
+        cput1 = log.timer('cycle= %d'%(imacro+1), *cput1)
+
+        if callable(mf.check_convergence):
+            scf_conv = mf.check_convergence(locals())
+        elif abs(e_tot-last_hf_e) < conv_tol and norm_gorb < conv_tol_grad:
+            scf_conv = True
+
+        if dump_chk:
+            mf.dump_chk(locals())
+
+        if callable(callback):
+            callback(locals())
+
+        if scf_conv:
+            break
+
+    if callable(callback):
+        callback(locals())
+
+    rotaiter.close()
+    mo_energy, mo_coeff1 = mf._scf.canonicalize(mo_coeff, mo_occ, fock)
+    if mf.canonicalization:
+        log.info('Canonicalize SCF orbitals')
+        mo_coeff = mo_coeff1
+        if dump_chk:
+            mf.dump_chk(locals())
+    log.info('macro X = %d E=%.15g |g|= %g total %d KF %d JK',
+             imacro+1, e_tot, norm_gorb, kftot+1, jktot+1)
+
+    if cp.any(mo_occ==0):
+        homo = mo_energy[mo_occ>0].max()
+        lumo = mo_energy[mo_occ==0].min()
+        if homo > lumo:
+            log.warn('canonicalized orbital HOMO %s > LUMO %s ', homo, lumo)
+    return scf_conv, e_tot, mo_energy, mo_coeff, mo_occ
+
+# A tag to label the derived SCF class
+class _CIAH_SOSCF:
+    '''
+    Attributes for Newton solver:
+        max_cycle_inner : int
+            AH iterations within each macro iteration. Default is 10.
+        max_stepsize : float
+            The step size for orbital rotation. A small step size is preferred.
+            Default is 0.05.
+        canonicalization : bool
+            To control whether to canonicalize the orbitals optimized by the
+            Newton solver. Default is True.
+    '''
+
+    __name_mixin__ = 'SecondOrder'
+
+    max_cycle_inner = _SOSCF_cpu.max_cycle_inner
+    max_stepsize = _SOSCF_cpu.max_stepsize
+    canonicalization = _SOSCF_cpu.canonicalization
+
+    ah_start_tol = _SOSCF_cpu.ah_start_tol
+    ah_start_cycle = _SOSCF_cpu.ah_start_cycle
+    ah_level_shift = _SOSCF_cpu.ah_level_shift
+    ah_conv_tol = _SOSCF_cpu.ah_conv_tol
+    ah_lindep = _SOSCF_cpu.ah_lindep
+    ah_max_cycle = _SOSCF_cpu.ah_max_cycle
+    ah_grad_trust_region = _SOSCF_cpu.ah_grad_trust_region
+    kf_interval = _SOSCF_cpu.kf_interval
+    kf_trust_region = _SOSCF_cpu.kf_trust_region
+
+    _keys = _SOSCF_cpu._keys
+
+    to_gpu = utils.to_gpu
+    device = utils.device
+    to_cpu = utils.to_cpu
+
+    def __init__(self, mf):
+        self.__dict__.update(mf.__dict__)
+        self._scf = mf
+
+    def undo_soscf(self):
+        '''Remove the SOSCF Mixin'''
+        from gpu4pyscf.df.df_jk import _DFHF
+        if isinstance(self, _DFHF) and not isinstance(self._scf, _DFHF):
+            # where density fitting is only applied on the SOSCF hessian
+            mf = self.undo_df()
+        else:
+            mf = self
+        obj = lib.view(mf, lib.drop_class(mf.__class__, _CIAH_SOSCF))
+        del obj._scf
+        # When both self and self._scf are DF objects, they may be different df
+        # objects.
The DF object of the base scf object should be used. + if hasattr(self._scf, 'with_df'): + obj.with_df = self._scf.with_df + return obj + + undo_newton = undo_soscf + + def dump_flags(self, verbose=None): + log = logger.new_logger(self, verbose) + log.info('\n') + super().dump_flags(verbose) + log.info('******** %s Newton solver flags ********', self._scf.__class__) + log.info('max_cycle_inner = %d', self.max_cycle_inner) + log.info('max_stepsize = %g', self.max_stepsize) + log.info('ah_start_tol = %g', self.ah_start_tol) + log.info('ah_level_shift = %g', self.ah_level_shift) + log.info('ah_conv_tol = %g', self.ah_conv_tol) + log.info('ah_lindep = %g', self.ah_lindep) + log.info('ah_start_cycle = %d', self.ah_start_cycle) + log.info('ah_max_cycle = %d', self.ah_max_cycle) + log.info('ah_grad_trust_region = %g', self.ah_grad_trust_region) + log.info('kf_interval = %d', self.kf_interval) + log.info('kf_trust_region = %d', self.kf_trust_region) + log.info('canonicalization = %s', self.canonicalization) + return self + + build = _SOSCF_cpu.build + reset = _SOSCF_cpu.reset + + def kernel(self, mo_coeff=None, mo_occ=None, dm0=None): + if mo_coeff is None: mo_coeff = self.mo_coeff + if mo_occ is None: mo_occ = self.mo_occ + cput0 = logger.init_timer(self) + self.build(self.mol) + self.dump_flags() + + self.converged, self.e_tot, \ + self.mo_energy, self.mo_coeff, self.mo_occ = \ + kernel(self, mo_coeff, mo_occ, dm0, conv_tol=self.conv_tol, + conv_tol_grad=self.conv_tol_grad, + max_cycle=self.max_cycle, + callback=self.callback, verbose=self.verbose) + + logger.timer(self, 'Second order SCF', *cput0) + self._finalize() + return self.e_tot + + from_dm = _SOSCF_cpu.from_dm + + gen_g_hop = gen_g_hop_rhf + + def update_rotate_matrix(self, dx, mo_occ, u0=1, mo_coeff=None): + nmo = len(mo_occ) + x1 = cp.zeros((nmo,nmo), dtype=dx.dtype) + occidxa = mo_occ>0 + occidxb = mo_occ==2 + viridxa = ~occidxa + viridxb = ~occidxb + mask = (viridxa[:,None] & occidxa) | (viridxb[:,None] & occidxb) + x1[mask] = dx + dr = x1 - x1.conj().T + u = expm(dr) + if isinstance(u0, cp.ndarray): + u = u0.dot(u) + return u + + def rotate_mo(self, mo_coeff, u, log=None): + return mo_coeff.dot(u) + +class _SecondOrderROHF(_CIAH_SOSCF): + gen_g_hop = gen_g_hop_rohf + +class _SecondOrderUHF(_CIAH_SOSCF): + gen_g_hop = gen_g_hop_uhf + + def update_rotate_matrix(self, dx, mo_occ, u0=1, mo_coeff=None): + occidxa = mo_occ[0] > 0 + occidxb = mo_occ[1] > 0 + viridxa = ~occidxa + viridxb = ~occidxb + + nmo = len(occidxa) + dr = cp.zeros((2,nmo,nmo), dtype=dx.dtype) + uniq = cp.array((viridxa[:,None] & occidxa, + viridxb[:,None] & occidxb)) + dr[uniq] = dx + dr = dr - dr.conj().transpose(0,2,1) + + if isinstance(u0, int) and u0 == 1: + return cp.asarray((expm(dr[0]), expm(dr[1]))) + else: + return cp.asarray((u0[0].dot(expm(dr[0])), + u0[1].dot(expm(dr[1])))) + + def rotate_mo(self, mo_coeff, u, log=None): + mo = cp.asarray((mo_coeff[0].dot(u[0]), + mo_coeff[1].dot(u[1]))) + return mo + + def kernel(self, mo_coeff=None, mo_occ=None, dm0=None): + if isinstance(mo_coeff, cp.ndarray) and mo_coeff.ndim == 2: + mo_coeff = (mo_coeff, mo_coeff) + if isinstance(mo_occ, cp.ndarray) and mo_occ.ndim == 1: + mo_occ = (cp.asarray(mo_occ >0, dtype=np.float64), + cp.asarray(mo_occ==2, dtype=np.float64)) + return _CIAH_SOSCF.kernel(self, mo_coeff, mo_occ, dm0) + +class _SecondOrderRHF(_CIAH_SOSCF): + gen_g_hop = gen_g_hop_rhf + +def newton(mf): + if isinstance(mf, _CIAH_SOSCF): + return mf + + assert isinstance(mf, hf.SCF) + + if mf.istype('ROHF'): + cls = 
_SecondOrderROHF + elif mf.istype('UHF'): + cls = _SecondOrderUHF + elif mf.istype('GHF'): + raise NotImplementedError + elif mf.istype('RDHF'): + raise NotImplementedError + elif mf.istype('DHF'): + raise NotImplementedError + else: + cls = _SecondOrderRHF + return lib.set_class(cls(mf), (cls, mf.__class__)) diff --git a/gpu4pyscf/scf/tests/test_scf_j_engine.py b/gpu4pyscf/scf/tests/test_scf_j_engine.py new file mode 100644 index 00000000..19291e5a --- /dev/null +++ b/gpu4pyscf/scf/tests/test_scf_j_engine.py @@ -0,0 +1,45 @@ +# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import unittest +import numpy as np +import pyscf +from pyscf import lib +from gpu4pyscf.scf import j_engine +from pyscf.scf.hf import get_jk + +def test_j_engine(): + mol = pyscf.M( + atom = ''' + O 0.000 -0. 0.1174 + H -0.757 4. -0.4696 + H 0.757 4. -0.4696 + C 1. 1. 0. + H 4. 0. 3. + H 0. 1. .6 + ''', + basis='def2-tzvp', + unit='B',) + + np.random.seed(9) + nao = mol.nao + dm = np.random.rand(nao, nao) + dm = dm.dot(dm.T) + + vj = j_engine.get_j(mol, dm) + vj1 = vj.get() + ref = get_jk(mol, dm, with_k=False)[0] + assert abs(lib.fp(vj1) - -2327.4715195591784) < 1e-9 + assert abs(vj1 - ref).max() < 1e-9 diff --git a/gpu4pyscf/scf/tests/test_soscf.py b/gpu4pyscf/scf/tests/test_soscf.py new file mode 100644 index 00000000..b7fa3990 --- /dev/null +++ b/gpu4pyscf/scf/tests/test_soscf.py @@ -0,0 +1,224 @@ +# Copyright 2024 The GPU4PySCF Authors. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import unittest +import cupy as cp +from pyscf import gto +from gpu4pyscf import scf +from gpu4pyscf import dft + +def setUpModule(): + global h2o_z0, h2o_z1 + h2o_z0 = gto.M( + verbose = 5, + output = '/dev/null', + atom = [ + ["O" , (0. , 0. , 0.)], + [1 , (0. , -0.757 , 0.587)], + [1 , (0. , 0.757 , 0.587)] ], + basis = '6-31g') + + h2o_z1 = gto.M( + verbose = 5, + output = '/dev/null', + atom = [ + ["O" , (0. , 0. , 0.)], + [1 , (0. , -0.757 , 0.587)], + [1 , (0. 
, 0.757 , 0.587)] ], + basis = '6-31g', + charge = 1, + spin = 1,) + +def tearDownModule(): + global h2o_z0, h2o_z1 + h2o_z0.stdout.close() + h2o_z1.stdout.close() + del h2o_z0, h2o_z1 + +class KnownValues(unittest.TestCase): + def test_nr_rhf(self): + mf = scf.RHF(h2o_z0) + mf.max_cycle = 1 + mf.conv_check = False + mf.kernel() + nr = mf.newton() + nr.max_cycle = 2 + nr.conv_tol_grad = 1e-5 + self.assertAlmostEqual(nr.kernel(), -75.98394849812, 9) + + def test_nr_rohf(self): + mf = scf.ROHF(h2o_z1) + mf.max_cycle = 1 + mf.conv_check = False + mf.kernel() + nr = mf.newton() + nr.max_cycle = 20 + nr.conv_tol_grad = 1e-5 + self.assertAlmostEqual(nr.kernel(), -75.5783963795897, 9) + + def test_nr_uhf(self): + mf = scf.UHF(h2o_z1) + mf.max_cycle = 1 + mf.conv_check = False + mf.kernel() + nr = mf.newton() + nr.max_cycle = 2 + nr.conv_tol_grad = 1e-5 + self.assertAlmostEqual(nr.kernel(), -75.58051984397145, 9) + + def test_nr_rks_lda(self): + mf = dft.RKS(h2o_z0) + eref = mf.kernel() + mf.max_cycle = 1 + mf.conv_check = False + mf.kernel() + nr = mf.newton() + nr.max_cycle = 3 + nr.conv_tol_grad = 1e-5 + self.assertAlmostEqual(nr.kernel(), eref, 9) + + def test_nr_rks_rsh(self): + '''test range-separated Coulomb''' + mf = dft.RKS(h2o_z0) + mf.xc = 'wb97x' + eref = mf.kernel() + mf.max_cycle = 1 + mf.conv_check = False + mf.kernel() + nr = mf.newton() + nr.max_cycle = 3 + nr.conv_tol_grad = 1e-5 + self.assertAlmostEqual(nr.kernel(), eref, 9) + + def test_nr_rks(self): + mf = dft.RKS(h2o_z0) + mf.xc = 'b3lyp' + eref = mf.kernel() + mf.max_cycle = 1 + mf.conv_check = False + mf.kernel() + nr = mf.newton() + nr.max_cycle = 3 + nr.conv_tol_grad = 1e-5 + self.assertAlmostEqual(nr.kernel(), eref, 9) + + def test_rks_gen_g_hop(self): + mf = dft.RKS(h2o_z0) + mf.grids.build() + mf.xc = 'b3lyp' + nao = h2o_z0.nao_nr() + mo = cp.random.random((nao,nao)) + mo_occ = cp.zeros(nao) + mo_occ[:5] = 2 + nocc, nvir = 5, nao-5 + dm1 = cp.random.random(nvir*nocc) + nr = mf.newton() + g, hop, hdiag = nr.gen_g_hop(mo, mo_occ) + mf_cpu = mf.to_cpu().newton() + hop_ref = mf_cpu.gen_g_hop(mo.get(), mo_occ.get())[1] + self.assertAlmostEqual(abs(hop(dm1).get() - hop_ref(dm1.get())).max(), 0, 9) + + def test_nr_roks(self): + mf = dft.RKS(h2o_z1) + mf.xc = 'b3lyp' + eref = mf.kernel() + + mf.max_cycle = 1 + mf.conv_check = False + mf.kernel() + nr = mf.newton() + nr.max_cycle = 3 + nr.conv_tol_grad = 1e-5 + self.assertAlmostEqual(nr.kernel(), eref, 9) + + def test_nr_uks_lda(self): + mf = dft.UKS(h2o_z1) + eref = mf.kernel() + + mf.max_cycle = 1 + mf.conv_check = False + mf.kernel() + nr = mf.newton() + nr.max_cycle = 2 + nr.conv_tol_grad = 1e-5 + self.assertAlmostEqual(nr.kernel(), eref, 9) + + def test_nr_uks_rsh(self): + '''test range-separated Coulomb''' + mf = dft.UKS(h2o_z1) + mf.xc = 'wb97x' + eref = mf.kernel() + + mf.max_cycle = 1 + mf.conv_check = False + mf.kernel() + nr = mf.newton() + nr.max_cycle = 3 + nr.conv_tol_grad = 1e-5 + self.assertAlmostEqual(nr.kernel(), eref, 9) + + def test_nr_uks(self): + mf = dft.UKS(h2o_z1) + mf.xc = 'b3lyp' + eref = mf.kernel() + + mf.max_cycle = 1 + mf.conv_check = False + mf.kernel() + nr = mf.newton() + nr.max_cycle = 3 + nr.conv_tol_grad = 1e-5 + self.assertAlmostEqual(nr.kernel(), eref, 9) + + def test_uks_gen_g_hop(self): + mf = dft.UKS(h2o_z0) + mf.grids.build() + mf.xc = 'hse06' + nao = h2o_z0.nao_nr() + mo = cp.random.random((2, nao,nao)) + mo_occ = cp.zeros((2,nao)) + mo_occ[:,:5] = 1 + nocc, nvir = 5, nao-5 + dm1 = cp.random.random(nvir*nocc*2) + nr = mf.newton() + g, 
hop, hdiag = nr.gen_g_hop(mo, mo_occ) + mf_cpu = mf.to_cpu().newton() + hop_ref = mf_cpu.gen_g_hop(mo.get(), mo_occ.get())[1] + self.assertAlmostEqual(abs(hop(dm1).get() - hop_ref(dm1.get())).max(), 0, 9) + + def test_with_df(self): + mf = scf.RHF(h2o_z0).density_fit().newton().run() + self.assertTrue(mf._eri is None) + self.assertAlmostEqual(mf.e_tot, -75.983944727996, 9) + self.assertEqual(mf.__class__.__name__, 'SecondOrderDFRHF') + + mf = scf.RHF(h2o_z0).newton().density_fit().run() + self.assertTrue(mf._eri is None) + self.assertAlmostEqual(mf.e_tot, -75.9839484980661, 9) + mf = mf.undo_newton() + self.assertEqual(mf.__class__.__name__, 'RHF') + + def test_secondary_auxbasis(self): + mf_ref = scf.UHF(h2o_z0).run() + mf = scf.UHF(h2o_z0).newton().density_fit(auxbasis=[[0, [1., 1.]]]).run() + self.assertAlmostEqual(mf_ref.e_tot, mf.e_tot, 8) + + mf_ref = scf.UHF(h2o_z0).density_fit().run() + mf = scf.UHF(h2o_z0).density_fit().newton().density_fit(auxbasis=[[0, [1., 1.]]]).run() + self.assertAlmostEqual(mf_ref.e_tot, mf.e_tot, 8) + +if __name__ == "__main__": + print("Full Tests for Newton solver") + unittest.main() diff --git a/gpu4pyscf/scf/uhf.py b/gpu4pyscf/scf/uhf.py index 17826721..2c7dbf08 100644 --- a/gpu4pyscf/scf/uhf.py +++ b/gpu4pyscf/scf/uhf.py @@ -70,7 +70,8 @@ def spin_square(mo, s=1): def get_fock(mf, h1e=None, s1e=None, vhf=None, dm=None, cycle=-1, diis=None, diis_start_cycle=None, level_shift_factor=None, damp_factor=None): if dm is None: dm = mf.make_rdm1() - if h1e is None: h1e = cupy.asarray(mf.get_hcore()) + if h1e is None: h1e = mf.get_hcore() + if s1e is None: s1e = mf.get_ovlp() if vhf is None: vhf = mf.get_veff(mf.mol, dm) if not isinstance(s1e, cupy.ndarray): s1e = cupy.asarray(s1e) if not isinstance(dm, cupy.ndarray): dm = cupy.asarray(dm) @@ -150,6 +151,36 @@ def energy_elec(mf, dm=None, h1e=None, vhf=None): logger.debug(mf, 'E1 = %s Ecoul = %s', e1, e_coul.real) return e_elec, e_coul +def canonicalize(mf, mo_coeff, mo_occ, fock=None): + '''Canonicalization diagonalizes the UHF Fock matrix within occupied, + virtual subspaces separatedly (without change occupancy). 
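    A minimal usage sketch, assuming a converged gpu4pyscf UHF object mf; the
    alpha and beta channels are diagonalized independently:

        mo_e, mo = mf.canonicalize(mf.mo_coeff, mf.mo_occ)
        # mo_e and mo stack the alpha and beta blocks along the first axis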
+ ''' + mo_occ = cupy.asarray(mo_occ) + assert mo_occ.ndim == 2 + if fock is None: + dm = mf.make_rdm1(mo_coeff, mo_occ) + fock = mf.get_fock(dm=dm) + occidxa = mo_occ[0] == 1 + occidxb = mo_occ[1] == 1 + viridxa = mo_occ[0] == 0 + viridxb = mo_occ[1] == 0 + + def eig_(fock, mo_coeff, idx, es, cs): + if cupy.any(idx) > 0: + orb = mo_coeff[:,idx] + f1 = orb.conj().T.dot(fock).dot(orb) + e, c = cupy.linalg.eigh(f1) + es[idx] = e + cs[:,idx] = cupy.dot(orb, c) + + mo = cupy.empty_like(mo_coeff) + mo_e = cupy.empty(mo_occ.shape) + eig_(fock[0], mo_coeff[0], occidxa, mo_e[0], mo[0]) + eig_(fock[0], mo_coeff[0], viridxa, mo_e[0], mo[0]) + eig_(fock[1], mo_coeff[1], occidxb, mo_e[1], mo[1]) + eig_(fock[1], mo_coeff[1], viridxb, mo_e[1], mo[1]) + return mo_e, mo + class UHF(hf.SCF): from gpu4pyscf.lib.utils import to_gpu, device @@ -195,6 +226,7 @@ def get_grad(self, mo_coeff, mo_occ, fock=None): fock = self.get_hcore(self.mol) + self.get_veff(self.mol, dm1) return get_grad(mo_coeff, mo_occ, fock) + make_asym_dm = NotImplemented make_rdm2 = NotImplemented energy_elec = energy_elec get_init_guess = hf.return_cupy_array(uhf.UHF.get_init_guess) @@ -204,15 +236,6 @@ def get_grad(self, mo_coeff, mo_occ, fock=None): init_guess_by_mod_huckel = uhf.UHF.init_guess_by_mod_huckel init_guess_by_1e = uhf.UHF.init_guess_by_1e init_guess_by_chkfile = uhf.UHF.init_guess_by_chkfile - - analyze = NotImplemented - mulliken_pop = NotImplemented - mulliken_spin_pop = NotImplemented - mulliken_meta = NotImplemented - mulliken_meta_spin = NotImplemented - canonicalize = NotImplemented - det_ovlp = NotImplemented - make_asym_dm = NotImplemented _finalize = uhf.UHF._finalize conv_tol_cpscf = 1e-4 @@ -225,9 +248,9 @@ def get_grad(self, mo_coeff, mo_occ, fock=None): density_fit = hf.RHF.density_fit energy_tot = hf.RHF.energy_tot energy_elec = energy_elec + canonicalize = canonicalize make_rdm2 = NotImplemented - newton = NotImplemented x2c = x2c1e = sfx2c1e = NotImplemented to_rhf = NotImplemented to_uhf = NotImplemented @@ -236,7 +259,6 @@ def get_grad(self, mo_coeff, mo_occ, fock=None): to_uks = NotImplemented to_gks = NotImplemented to_ks = NotImplemented - canonicalize = NotImplemented # TODO: Enable followings after testing analyze = NotImplemented stability = NotImplemented @@ -290,6 +312,10 @@ def nuc_grad_method(self): from gpu4pyscf.grad import uhf return uhf.Gradients(self) + def newton(self): + from gpu4pyscf.scf.soscf import newton + return newton(self) + def to_cpu(self): from gpu4pyscf.lib import utils mf = uhf.UHF(self.mol) diff --git a/gpu4pyscf/scf/uhf_symm.py b/gpu4pyscf/scf/uhf_symm.py new file mode 100644 index 00000000..b1785a60 --- /dev/null +++ b/gpu4pyscf/scf/uhf_symm.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python +# +# Copyright 2024 The PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
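The uhf.py hunk above restores canonicalize and adds a newton() entry point to the GPU UHF class. A minimal usage sketch, following the to_gpu() pattern of the tests in this PR; the water-cation geometry and the post-SOSCF canonicalization call are illustrative, not part of the patch:

from pyscf import gto, scf

mol = gto.M(
    atom='O 0 0 0; H 0 0.757 0.587; H 0 -0.757 0.587',
    basis='6-31g', charge=1, spin=1)

mf = scf.UHF(mol).to_gpu()
mf.max_cycle = 1           # crude starting point, as in the tests above
mf.conv_check = False
mf.kernel()

nr = mf.newton()           # second-order (SOSCF) solver enabled by this PR
e_tot = nr.kernel()

# Re-diagonalize the converged Fock matrix within the occupied and virtual
# blocks of each spin; the occupancies are left unchanged.
mo_e, mo = mf.canonicalize(nr.mo_coeff, nr.mo_occ)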
+ +from gpu4pyscf.scf.uhf import UHF + +SymAdaptedUHF = UHF diff --git a/gpu4pyscf/solvent/grad/pcm.py b/gpu4pyscf/solvent/grad/pcm.py index 1fce56f8..1df748e6 100644 --- a/gpu4pyscf/solvent/grad/pcm.py +++ b/gpu4pyscf/solvent/grad/pcm.py @@ -243,10 +243,10 @@ def grad_qv(pcmobj, dm): dvj, _ = int3c2e.get_int3c2e_ip_jk(intopt, 0, 'ip1', q_sym, None, dm_cart) dq, _ = int3c2e.get_int3c2e_ip_jk(intopt, 0, 'ip2', q_sym, None, dm_cart) - cart_ao_idx = intopt.cart_ao_idx - rev_cart_ao_idx = numpy.argsort(cart_ao_idx) - dvj = dvj[:,rev_cart_ao_idx] - + if not mol.cart: + dvj = dvj @ intopt.cart2sph + dvj = intopt.unsort_orbitals(dvj, axis=[1]) + aoslice = intopt.mol.aoslice_by_atom() dq = cupy.asarray([cupy.sum(dq[:,p0:p1], axis=1) for p0,p1 in gridslice]) dvj= 2.0 * cupy.asarray([cupy.sum(dvj[:,p0:p1], axis=1) for p0,p1 in aoslice[:,2:]]) diff --git a/gpu4pyscf/solvent/tests/test_pcm_hessian.py b/gpu4pyscf/solvent/tests/test_pcm_hessian.py index 967d25f6..1060f3d4 100644 --- a/gpu4pyscf/solvent/tests/test_pcm_hessian.py +++ b/gpu4pyscf/solvent/tests/test_pcm_hessian.py @@ -128,9 +128,19 @@ def test_to_gpu(self): hess_gpu = hessobj.kernel() assert np.linalg.norm(hess_cpu - hess_gpu) < 1e-8 ''' + mol = gto.Mole() + mol.atom = ''' +O 0.0000000000 -0.0000000000 0.1174000000 +H -0.7570000000 -0.0000000000 -0.4696000000 +H 0.7570000000 0.0000000000 -0.4696000000 + ''' + mol.basis = 'sto-3g' + mol.output = '/dev/null' + mol.build(verbose=0) mf = pyscf.dft.RKS(mol, xc='b3lyp').density_fit().PCM() mf.conv_tol = 1e-12 mf.conv_tol_cpscf = 1e-7 + mf.grids.atom_grid = (50,194) mf.kernel() hessobj = mf.Hessian() hess_cpu = hessobj.kernel() @@ -148,9 +158,19 @@ def test_to_cpu(self): e_cpu = mf.kernel() assert abs(e_cpu - e_gpu) < 1e-8 ''' + mol = gto.Mole() + mol.atom = ''' +O 0.0000000000 -0.0000000000 0.1174000000 +H -0.7570000000 -0.0000000000 -0.4696000000 +H 0.7570000000 0.0000000000 -0.4696000000 + ''' + mol.basis = 'sto-3g' + mol.output = '/dev/null' + mol.build(verbose=0) mf = dft.RKS(mol, xc='b3lyp').density_fit().PCM() mf.conv_tol = 1e-12 mf.conv_tol_cpscf = 1e-7 + mf.grids.atom_grid = (50,194) mf.kernel() hessobj = mf.Hessian() hess_gpu = hessobj.kernel() diff --git a/gpu4pyscf/tdscf/__init__.py b/gpu4pyscf/tdscf/__init__.py new file mode 100644 index 00000000..552cccee --- /dev/null +++ b/gpu4pyscf/tdscf/__init__.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + + +from gpu4pyscf.tdscf import rhf +from gpu4pyscf.tdscf import uhf +from gpu4pyscf.tdscf import rks +from gpu4pyscf.tdscf import uks diff --git a/gpu4pyscf/tdscf/_uhf_resp_sf.py b/gpu4pyscf/tdscf/_uhf_resp_sf.py new file mode 100644 index 00000000..4ea074dc --- /dev/null +++ b/gpu4pyscf/tdscf/_uhf_resp_sf.py @@ -0,0 +1,217 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. 
+# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + + +# TODO: merge this function into scf._response_functions.py + +import functools +import numpy as np +import cupy as cp +from pyscf import lib +from pyscf.lib import logger +from pyscf.dft import numint2c, xc_deriv +from gpu4pyscf.scf import hf, uhf +from gpu4pyscf.dft.numint import _scale_ao, _tau_dot, eval_rho, eval_rho2 +from gpu4pyscf.lib.cupy_helper import transpose_sum, add_sparse, contract + +def gen_uhf_response_sf(mf, mo_coeff=None, mo_occ=None, hermi=0, + collinear='mcol', collinear_samples=200): + '''Generate a function that computes the product of the spin-flip UKS + response function with UKS density matrices. + ''' + assert isinstance(mf, (uhf.UHF)) + if mo_coeff is None: mo_coeff = mf.mo_coeff + if mo_occ is None: mo_occ = mf.mo_occ + mol = mf.mol + assert hermi == 0 + + if isinstance(mf, hf.KohnShamDFT): + if mf.do_nlc(): + logger.warn(mf, 'NLC functional found in DFT object. Its second ' + 'derivative is not available. Its contribution is ' + 'not included in the response function.') + + ni = mf._numint + omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, mol.spin) + hybrid = ni.libxc.is_hybrid_xc(mf.xc) + + if collinear in ('ncol', 'mcol'): + fxc = cache_xc_kernel_sf(ni, mol, mf.grids, mf.xc, mo_coeff, mo_occ, + collinear_samples)[2] + dm0 = None + + def vind(dm1): + if collinear in ('ncol', 'mcol'): + v1 = nr_uks_fxc_sf(ni, mol, mf.grids, mf.xc, dm0, dm1, 0, hermi, + None, None, fxc) + else: + v1 = cp.zeros_like(dm1) + if hybrid: + # The Coulomb (J) term vanishes in the spin-flip part.
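+ # Only exchange contributes: the spin-flip transition density is the alpha-beta block of the density matrix, + # which carries no charge density, so there is no Coulomb response. The branches below implement the usual + # range-separated decomposition K = hyb*K_full + (alpha-hyb)*K_LR(omega), which reduces to hyb*K_SR(omega) + # when alpha == 0 and to alpha*K_LR(omega) when hyb == 0.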
+ if omega == 0: + vk = mf.get_k(mol, dm1, hermi) * hyb + elif alpha == 0: # LR=0, only SR exchange + vk = mf.get_k(mol, dm1, hermi, omega=-omega) * hyb + elif hyb == 0: # SR=0, only LR exchange + vk = mf.get_k(mol, dm1, hermi, omega=omega) * alpha + else: # SR and LR exchange with different ratios + vk = mf.get_k(mol, dm1, hermi) * hyb + vk += mf.get_k(mol, dm1, hermi, omega=omega) * (alpha-hyb) + v1 -= vk + return v1 + return vind + + else: # HF + def vind(dm1): + vk = mf.get_k(mol, dm1, hermi) + return -vk + return vind + +# This function is copied from pyscf.dft.numint2c.py +def __mcfun_fn_eval_xc(ni, xc_code, xctype, rho, deriv): + evfk = ni.eval_xc_eff(xc_code, rho, deriv=deriv, xctype=xctype) + evfk = list(evfk) + for order in range(1, deriv+1): + if evfk[order] is not None: + evfk[order] = xc_deriv.ud2ts(evfk[order]) + return evfk + +# Adapted from pyscf.dft.numint2c.mcfun_eval_xc_adapter +def mcfun_eval_xc_adapter_sf(ni, xc_code, collinear_samples): + '''Wrapper to generate the eval_xc function required by mcfun + ''' + + try: + import mcfun + except ImportError: + raise ImportError('This feature requires the mcfun library.\n' + 'Try installing mcfun with `pip install mcfun`') + + ni = numint2c.NumInt2C() + ni.collinear = 'mcol' + ni.collinear_samples = collinear_samples + xctype = ni._xc_type(xc_code) + fn_eval_xc = functools.partial(__mcfun_fn_eval_xc, ni, xc_code, xctype) + nproc = lib.num_threads() + + def eval_xc_eff(xc_code, rho, deriv=1, omega=None, xctype=None, verbose=None): + res = mcfun.eval_xc_eff_sf( + fn_eval_xc, rho.get(), deriv, + collinear_samples=collinear_samples, workers=nproc) + return [x if x is None else cp.asarray(x) for x in res] + return eval_xc_eff + +def cache_xc_kernel_sf(ni, mol, grids, xc_code, mo_coeff, mo_occ, + collinear_samples): + '''Compute the fxc_sf, which can be used in SF-TDDFT/TDA + ''' + xctype = ni._xc_type(xc_code) + if xctype == 'GGA': + ao_deriv = 1 + elif xctype == 'MGGA': + ao_deriv = 1 + else: + ao_deriv = 0 + assert isinstance(mo_coeff, cp.ndarray) + assert mo_coeff.ndim == 3 + + nao = mo_coeff[0].shape[0] + rhoa = [] + rhob = [] + + with_lapl = False + opt = getattr(ni, 'gdftopt', None) + if opt is None or mol not in [opt.mol, opt._sorted_mol]: + ni.build(mol, grids.coords) + opt = ni.gdftopt + _sorted_mol = opt._sorted_mol + mo_coeff = opt.sort_orbitals(mo_coeff, axis=[1]) + + for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv): + rhoa_slice = eval_rho2(_sorted_mol, ao_mask, mo_coeff[0,idx,:], + mo_occ[0], None, xctype, with_lapl) + rhob_slice = eval_rho2(_sorted_mol, ao_mask, mo_coeff[1,idx,:], + mo_occ[1], None, xctype, with_lapl) + rhoa.append(rhoa_slice) + rhob.append(rhob_slice) + rho_ab = (cp.hstack(rhoa), cp.hstack(rhob)) + rho_z = cp.array([rho_ab[0]+rho_ab[1], + rho_ab[0]-rho_ab[1]]) + eval_xc_eff = mcfun_eval_xc_adapter_sf(ni, xc_code, collinear_samples) + vxc, fxc = eval_xc_eff(xc_code, rho_z, deriv=2, xctype=xctype)[1:3] + return rho_ab, vxc, fxc + +def nr_uks_fxc_sf(ni, mol, grids, xc_code, dm0, dms, relativity=0, hermi=0, + rho0=None, vxc=None, fxc=None): + if fxc is None: + raise RuntimeError('fxc was not initialized') + assert hermi == 0 + assert dms.dtype == np.double + + xctype = ni._xc_type(xc_code) + opt = getattr(ni, 'gdftopt', None) + if opt is None or mol not in [opt.mol, opt._sorted_mol]: + ni.build(mol, grids.coords) + opt = ni.gdftopt + mol = None + _sorted_mol = opt._sorted_mol + nao, nao0 = opt.coeff.shape + dm_shape = dms.shape + + dms = cp.asarray(dms).reshape(-1,nao0,nao0)
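+ # gpu4pyscf evaluates AOs in an internally sorted shell order (the sort_orbitals/unsort_orbitals pair used + # throughout this PR); permute the input density matrices into that order here and map vmat back to the + # original AO ordering before returning.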
+ dms = opt.sort_orbitals(dms, axis=[1,2]) + + nset = len(dms) + vmat = cp.zeros((nset, nao, nao)) + + if xctype == 'LDA': + ao_deriv = 0 + elif xctype == 'GGA': + ao_deriv = 1 + elif xctype == 'MGGA': + ao_deriv = 1 + else: + raise RuntimeError(f'Unknown xctype {xctype}') + p0 = p1 = 0 + for ao, mask, weights, coords in ni.block_loop(_sorted_mol, grids, nao, ao_deriv): + p0, p1 = p1, p1+len(weights) + # precompute fxc_w; the factor 2.0 accounts for xx + yy + fxc_w = fxc[:,:,p0:p1] * weights * 2. + + for i in range(nset): + rho1 = eval_rho(_sorted_mol, ao, dms[i,mask[:,None],mask], + xctype=xctype, hermi=hermi) + if xctype == 'LDA': + wv = rho1 * fxc_w[0,0] + vtmp = ao.dot(_scale_ao(ao, wv).T) + elif xctype == 'GGA': + wv = contract('bg,abg->ag', rho1, fxc_w) + wv[0] *= .5 # for transpose_sum at the end + vtmp = ao[0].dot(_scale_ao(ao, wv).T) + elif xctype == 'MGGA': + wv = contract('bg,abg->ag', rho1, fxc_w) + wv[[0,4]] *= .5 # for transpose_sum at the end + vtmp = ao[0].dot(_scale_ao(ao[:4], wv[:4]).T) + vtmp += _tau_dot(ao, ao, wv[4]) + add_sparse(vmat[i], vtmp, mask) + + vmat = opt.unsort_orbitals(vmat, axis=[1,2]) + if xctype != 'LDA': + transpose_sum(vmat) + if len(dm_shape) == 2: + vmat = vmat[0] + return vmat diff --git a/gpu4pyscf/tdscf/rhf.py b/gpu4pyscf/tdscf/rhf.py new file mode 100644 index 00000000..9e33b6e8 --- /dev/null +++ b/gpu4pyscf/tdscf/rhf.py @@ -0,0 +1,368 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + + +import numpy as np +import cupy as cp +import scipy.linalg +from pyscf import gto +from pyscf import lib +from pyscf.tdscf import rhf as tdhf_cpu +from pyscf.tdscf._lr_eig import eigh as lr_eigh, eig as lr_eig +from gpu4pyscf import scf +from gpu4pyscf.lib.cupy_helper import contract, tag_array +from gpu4pyscf.lib import utils +from gpu4pyscf.lib import logger +from gpu4pyscf.scf import _response_functions # noqa +from pyscf import __config__ + +REAL_EIG_THRESHOLD = tdhf_cpu.REAL_EIG_THRESHOLD +#OUTPUT_THRESHOLD = tdhf_cpu.OUTPUT_THRESHOLD +OUTPUT_THRESHOLD = getattr(__config__, 'tdscf_rhf_get_nto_threshold', 0.3) + +__all__ = [ + 'TDA', 'CIS', 'TDHF', 'TDRHF', 'TDBase' +] + + +def gen_tda_operation(mf, fock_ao=None, singlet=True, wfnsym=None): + '''Generate function to compute A x + ''' + assert fock_ao is None + assert isinstance(mf, scf.hf.SCF) + assert wfnsym is None + mo_coeff = mf.mo_coeff + assert mo_coeff.dtype == cp.float64 + mo_energy = mf.mo_energy + mo_occ = mf.mo_occ + occidx = mo_occ == 2 + viridx = mo_occ == 0 + orbv = mo_coeff[:,viridx] + orbo = mo_coeff[:,occidx] + orbo2 = orbo * 2.
# *2 for double occupancy + + e_ia = hdiag = mo_energy[viridx] - mo_energy[occidx,None] + hdiag = hdiag.ravel().get() + vresp = mf.gen_response(singlet=singlet, hermi=0) + nocc, nvir = e_ia.shape + + def vind(zs): + zs = cp.asarray(zs).reshape(-1,nocc,nvir) + mo1 = contract('xov,pv->xpo', zs, orbv) + dms = contract('xpo,qo->xpq', mo1, orbo2.conj()) + dms = tag_array(dms, mo1=mo1, occ_coeff=orbo) + v1ao = vresp(dms) + v1mo = contract('xpq,qo->xpo', v1ao, orbo) + v1mo = contract('xpo,pv->xov', v1mo, orbv.conj()) + v1mo += zs * e_ia + return v1mo.reshape(v1mo.shape[0],-1).get() + + return vind, hdiag + + +class TDBase(lib.StreamObject): + to_gpu = utils.to_gpu + device = utils.device + to_cpu = utils.to_cpu + + conv_tol = tdhf_cpu.TDBase.conv_tol + nstates = tdhf_cpu.TDBase.nstates + singlet = tdhf_cpu.TDBase.singlet + lindep = tdhf_cpu.TDBase.lindep + level_shift = tdhf_cpu.TDBase.level_shift + max_cycle = tdhf_cpu.TDBase.max_cycle + positive_eig_threshold = tdhf_cpu.TDBase.positive_eig_threshold + deg_eia_thresh = tdhf_cpu.TDBase.deg_eia_thresh + + _keys = tdhf_cpu.TDBase._keys + + __init__ = tdhf_cpu.TDBase.__init__ + + nroots = tdhf_cpu.TDBase.nroots + e_tot = tdhf_cpu.TDBase.e_tot + dump_flags = tdhf_cpu.TDBase.dump_flags + check_sanity = tdhf_cpu.TDBase.check_sanity + reset = tdhf_cpu.TDBase.reset + _finalize = tdhf_cpu.TDBase._finalize + + gen_vind = NotImplemented + get_ab = NotImplemented + get_precond = tdhf_cpu.TDBase.get_precond + + nuc_grad_method = NotImplemented + as_scanner = tdhf_cpu.as_scanner + + oscillator_strength = tdhf_cpu.oscillator_strength + transition_dipole = tdhf_cpu.transition_dipole + transition_quadrupole = tdhf_cpu.transition_quadrupole + transition_octupole = tdhf_cpu.transition_octupole + transition_velocity_dipole = tdhf_cpu.transition_velocity_dipole + transition_velocity_quadrupole = tdhf_cpu.transition_velocity_quadrupole + transition_velocity_octupole = tdhf_cpu.transition_velocity_octupole + transition_magnetic_dipole = tdhf_cpu.transition_magnetic_dipole + transition_magnetic_quadrupole = tdhf_cpu.transition_magnetic_quadrupole + + def analyze(self, verbose=None): + self.to_cpu().analyze(verbose) + return self + + def get_nto(self, state=1, threshold=OUTPUT_THRESHOLD, verbose=None): + ''' + Natural transition orbital analysis. + + Returns: + A list (weights, NTOs). NTOs are natural orbitals represented in AO + basis. The first N_occ NTOs are occupied NTOs and the rest are virtual + NTOs. 
Weights and NTOs are stored as numpy arrays. + ''' + return self.to_cpu().get_nto(state, threshold, verbose) + + # needed by transition dipoles + def _contract_multipole(tdobj, ints, hermi=True, xy=None): + '''ints is the integral tensor of a spin-independent operator''' + if xy is None: xy = tdobj.xy + nstates = len(xy) + pol_shape = ints.shape[:-2] + nao = ints.shape[-1] + + if not tdobj.singlet: + return np.zeros((nstates,) + pol_shape) + + mo_coeff = tdobj._scf.mo_coeff + mo_occ = tdobj._scf.mo_occ + orbo = mo_coeff[:,mo_occ==2] + orbv = mo_coeff[:,mo_occ==0] + if isinstance(orbo, cp.ndarray): + orbo = orbo.get() + orbv = orbv.get() + + # Incompatible with old numpy versions + #ints = np.einsum('...pq,pi,qj->...ij', ints, orbo.conj(), orbv) + ints = lib.einsum('xpq,pi,qj->xij', ints.reshape(-1,nao,nao), orbo.conj(), orbv) + pol = np.array([np.einsum('xij,ij->x', ints, x) * 2 for x,y in xy]) + if isinstance(xy[0][1], np.ndarray): + if hermi: + pol += [np.einsum('xij,ij->x', ints, y) * 2 for x,y in xy] + else: # anti-Hermitian + pol -= [np.einsum('xij,ij->x', ints, y) * 2 for x,y in xy] + pol = pol.reshape((nstates,)+pol_shape) + return pol + +class TDA(TDBase): + __doc__ = tdhf_cpu.TDA.__doc__ + + def gen_vind(self, mf=None): + '''Generate function to compute Ax''' + if mf is None: + mf = self._scf + return gen_tda_operation(mf, singlet=self.singlet) + + def init_guess(self, mf=None, nstates=None, wfnsym=None, return_symmetry=False): + ''' + Generate initial guess for TDA + + Kwargs: + nstates : int + The number of initial guess vectors. + ''' + if mf is None: mf = self._scf + if nstates is None: nstates = self.nstates + assert wfnsym is None + assert not return_symmetry + + mo_energy = mf.mo_energy + mo_occ = mf.mo_occ + if isinstance(mo_energy, cp.ndarray): + mo_energy = mo_energy.get() + mo_occ = mo_occ.get() + occidx = mo_occ == 2 + viridx = mo_occ == 0 + e_ia = (mo_energy[viridx] - mo_energy[occidx,None]).ravel() + nov = e_ia.size + nstates = min(nstates, nov) + + # Find the nstates-th lowest energy gap + e_threshold = float(np.partition(e_ia, nstates-1)[nstates-1]) + e_threshold += self.deg_eia_thresh + + idx = np.where(e_ia <= e_threshold)[0] + x0 = np.zeros((idx.size, nov)) + for i, j in enumerate(idx): + x0[i, j] = 1 # Koopmans' excitations + + return x0 + + def kernel(self, x0=None, nstates=None): + '''TDA diagonalization solver + ''' + log = logger.new_logger(self) + cpu0 = log.init_timer() + self.check_sanity() + self.dump_flags() + if nstates is None: + nstates = self.nstates + else: + self.nstates = nstates + mol = self.mol + + vind, hdiag = self.gen_vind(self._scf) + precond = self.get_precond(hdiag) + + def pickeig(w, v, nroots, envs): + idx = np.where(w > self.positive_eig_threshold)[0] + return w[idx], v[:,idx], idx + + x0sym = None + if x0 is None: + x0 = self.init_guess() + + self.converged, self.e, x1 = lr_eigh( + vind, x0, precond, tol_residual=self.conv_tol, lindep=self.lindep, + nroots=nstates, x0sym=x0sym, pick=pickeig, max_cycle=self.max_cycle, + max_memory=self.max_memory, verbose=log) + + nocc = mol.nelectron // 2 + nmo = self._scf.mo_occ.size + nvir = nmo - nocc + # 1/sqrt(2) because self.x is for alpha excitation and 2(X^+*X) = 1 + self.xy = [(xi.reshape(nocc,nvir) * .5**.5, 0) for xi in x1] + log.timer('TDA', *cpu0) + self._finalize() + return self.e, self.xy + +CIS = TDA + + +def gen_tdhf_operation(mf, fock_ao=None, singlet=True, wfnsym=None): + '''Generate function to compute + + [ A B ][X] + [-B* -A*][Y] + ''' + assert fock_ao is None + assert isinstance(mf,
scf.hf.SCF) + mo_coeff = mf.mo_coeff + assert mo_coeff.dtype == cp.float64 + mo_energy = mf.mo_energy + mo_occ = mf.mo_occ + occidx = mo_occ == 2 + viridx = mo_occ == 0 + orbv = mo_coeff[:,viridx] + orbo = mo_coeff[:,occidx] + + e_ia = hdiag = mo_energy[viridx] - mo_energy[occidx,None] + hdiag = cp.hstack((hdiag.ravel(), -hdiag.ravel())).get() + vresp = mf.gen_response(singlet=singlet, hermi=0) + nocc, nvir = e_ia.shape + + def vind(xys): + xys = cp.asarray(xys).reshape(-1,2,nocc,nvir) + nz = len(xys) + xs, ys = xys.transpose(1,0,2,3) + # *2 for double occupancy + tmp = contract('xov,pv->xpo', xs, orbv*2) + dms = contract('xpo,qo->xpq', tmp, orbo.conj()) + tmp = contract('xov,qv->xoq', ys, orbv.conj()*2) + dms+= contract('xoq,po->xpq', tmp, orbo) + v1ao = vresp(dms) # = Xjb + Yjb + v1_top = contract('xpq,qo->xpo', v1ao, orbo) + v1_top = contract('xpo,pv->xov', v1_top, orbv) + v1_bot = contract('xpq,po->xoq', v1ao, orbo) + v1_bot = contract('xoq,qv->xov', v1_bot, orbv) + v1_top += xs * e_ia # AX + v1_bot += ys * e_ia # (A*)Y + hx = cp.hstack((v1_top.reshape(nz,-1), -v1_bot.reshape(nz,-1))) + return hx.get() + + return vind, hdiag + + +class TDHF(TDBase): + __doc__ = tdhf_cpu.TDHF.__doc__ + + @lib.with_doc(gen_tdhf_operation.__doc__) + def gen_vind(self, mf=None): + if mf is None: + mf = self._scf + return gen_tdhf_operation(mf, singlet=self.singlet) + + def init_guess(self, mf=None, nstates=None, wfnsym=None, return_symmetry=False): + x0 = TDA.init_guess(self, mf, nstates, wfnsym, return_symmetry) + y0 = np.zeros_like(x0) + return np.hstack([x0, y0]) + + def kernel(self, x0=None, nstates=None): + '''TDHF diagonalization with non-Hermitian eigenvalue solver + ''' + log = logger.new_logger(self) + cpu0 = log.init_timer() + self.check_sanity() + self.dump_flags() + if nstates is None: + nstates = self.nstates + else: + self.nstates = nstates + mol = self.mol + + vind, hdiag = self.gen_vind(self._scf) + precond = self.get_precond(hdiag) + + # handle single kpt PBC SCF + if getattr(self._scf, 'kpt', None) is not None: + from pyscf.pbc.lib.kpts_helper import gamma_point + real_system = (gamma_point(self._scf.kpt) and + self._scf.mo_coeff[0].dtype == np.double) + else: + real_system = True + + # We only need positive eigenvalues + def pickeig(w, v, nroots, envs): + realidx = np.where((abs(w.imag) < REAL_EIG_THRESHOLD) & + (w.real > self.positive_eig_threshold))[0] + # If the complex eigenvalue has small imaginary part, both the + # real part and the imaginary part of the eigenvector can + # approximately be used as the "real" eigen solutions. 
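+ # The full TDHF matrix [[A, B], [-B*, -A*]] is non-Hermitian, so its spectrum comes in (w, -w) pairs and + # the iterative solver may deliver slightly complex roots; only real, positive eigenvalues are retained.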
+ return lib.linalg_helper._eigs_cmplx2real(w, v, realidx, real_system) + + x0sym = None + if x0 is None: + x0 = self.init_guess() + + self.converged, w, x1 = lr_eig( + vind, x0, precond, tol_residual=self.conv_tol, lindep=self.lindep, + nroots=nstates, x0sym=x0sym, pick=pickeig, max_cycle=self.max_cycle, + max_memory=self.max_memory, verbose=log) + + nocc = mol.nelectron // 2 + nmo = self._scf.mo_occ.size + nvir = nmo - nocc + self.e = w + def norm_xy(z): + x, y = z.reshape(2,nocc,nvir) + norm = lib.norm(x)**2 - lib.norm(y)**2 + norm = np.sqrt(.5/norm) # normalize to 0.5 for alpha spin + return x*norm, y*norm + self.xy = [norm_xy(z) for z in x1] + + log.timer('TDDFT', *cpu0) + self._finalize() + return self.e, self.xy + +TDRHF = TDHF + +scf.hf.RHF.TDA = lib.class_as_method(TDA) +scf.hf.RHF.TDHF = lib.class_as_method(TDHF) diff --git a/gpu4pyscf/tdscf/rks.py b/gpu4pyscf/tdscf/rks.py new file mode 100644 index 00000000..41971614 --- /dev/null +++ b/gpu4pyscf/tdscf/rks.py @@ -0,0 +1,155 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import numpy as np +import cupy as cp +from pyscf import lib +from pyscf.tdscf._lr_eig import eigh as lr_eigh +from gpu4pyscf.dft.rks import KohnShamDFT +from gpu4pyscf.lib.cupy_helper import contract, tag_array, transpose_sum +from gpu4pyscf.lib import logger +from gpu4pyscf.tdscf import rhf as tdhf_gpu +from gpu4pyscf import dft + +__all__ = [ + 'TDA', 'TDDFT', 'TDRKS', 'CasidaTDDFT', 'TDDFTNoHybrid', +] + +TDA = tdhf_gpu.TDA +TDDFT = tdhf_gpu.TDHF +TDRKS = TDDFT + +class CasidaTDDFT(TDDFT): + '''Solve the Casida TDDFT formula (A-B)(A+B)(X+Y) = (X+Y)w^2 + ''' + + init_guess = TDA.init_guess + + def gen_vind(self, mf=None): + if mf is None: + mf = self._scf + singlet = self.singlet + mo_coeff = mf.mo_coeff + assert mo_coeff.dtype == cp.double + mo_energy = mf.mo_energy + mo_occ = mf.mo_occ + occidx = mo_occ == 2 + viridx = mo_occ == 0 + orbv = mo_coeff[:,viridx] + orbo = mo_coeff[:,occidx] + + e_ia = mo_energy[viridx] - mo_energy[occidx,None] + d_ia = e_ia ** .5 + ed_ia = e_ia * d_ia + hdiag = e_ia.ravel() ** 2 + hdiag = hdiag.get() + vresp = mf.gen_response(singlet=singlet, hermi=1) + nocc, nvir = e_ia.shape + + def vind(zs): + zs = cp.asarray(zs).reshape(-1,nocc,nvir) + # *2 for double occupancy + mo1 = contract('xov,pv->xpo', zs*(d_ia*2), orbv) + dms = contract('xpo,qo->xpq', mo1, orbo) + # +cc for A+B and K_{ai,jb} in A == K_{ai,bj} in B + dms = transpose_sum(dms) + dms = tag_array(dms, mo1=mo1, occ_coeff=orbo) + v1ao = vresp(dms) + v1mo = contract('xpq,qo->xpo', v1ao, orbo) + v1mo = contract('xpo,pv->xov', v1mo, orbv) + v1mo += zs * ed_ia + v1mo *= d_ia + return v1mo.reshape(v1mo.shape[0],-1).get() + + return vind, hdiag + + def kernel(self, x0=None, nstates=None): + '''TDDFT diagonalization solver + ''' + log = logger.new_logger(self) + cpu0 = log.init_timer() + mf = self._scf + if 
mf._numint.libxc.is_hybrid_xc(mf.xc): + raise RuntimeError('%s cannot be used with hybrid functional' + % self.__class__) + self.check_sanity() + self.dump_flags() + if nstates is None: + nstates = self.nstates + else: + self.nstates = nstates + + vind, hdiag = self.gen_vind(self._scf) + precond = self.get_precond(hdiag) + + def pickeig(w, v, nroots, envs): + idx = np.where(w > self.positive_eig_threshold)[0] + return w[idx], v[:,idx], idx + + x0sym = None + if x0 is None: + x0 = self.init_guess() + + self.converged, w2, x1 = lr_eigh( + vind, x0, precond, tol_residual=self.conv_tol, lindep=self.lindep, + nroots=nstates, x0sym=x0sym, pick=pickeig, max_cycle=self.max_cycle, + max_memory=self.max_memory, verbose=log) + + mo_energy = self._scf.mo_energy + mo_occ = self._scf.mo_occ + occidx = mo_occ == 2 + viridx = mo_occ == 0 + e_ia = mo_energy[viridx] - mo_energy[occidx,None] + e_ia = e_ia**.5 + if isinstance(e_ia, cp.ndarray): + e_ia = e_ia.get() + + def norm_xy(w, z): + zp = e_ia * z.reshape(e_ia.shape) + zm = w/e_ia * z.reshape(e_ia.shape) + x = (zp + zm) * .5 + y = (zp - zm) * .5 + norm = lib.norm(x)**2 - lib.norm(y)**2 + norm = (.5/norm)**.5 # normalize to 0.5 for alpha spin + return (x*norm, y*norm) + + idx = np.where(w2 > self.positive_eig_threshold)[0] + self.e = w2[idx]**.5 + self.xy = [norm_xy(self.e[i], x1[i]) for i in idx] + log.timer('TDDFT', *cpu0) + self._finalize() + return self.e, self.xy + + def nuc_grad_method(self): + from pyscf.grad import tdrks + return tdrks.Gradients(self) + +TDDFTNoHybrid = CasidaTDDFT + +def tddft(mf): + '''Driver to create TDDFT or CasidaTDDFT object''' + if mf._numint.libxc.is_hybrid_xc(mf.xc): + return TDDFT(mf) + else: + return CasidaTDDFT(mf) + +dft.rks.RKS.TDA = lib.class_as_method(TDA) +dft.rks.RKS.TDHF = None +#dft.rks.RKS.TDDFT = lib.class_as_method(TDDFT) +dft.rks.RKS.TDDFTNoHybrid = lib.class_as_method(TDDFTNoHybrid) +dft.rks.RKS.CasidaTDDFT = lib.class_as_method(CasidaTDDFT) +dft.rks.RKS.TDDFT = tddft diff --git a/gpu4pyscf/tdscf/tests/test_sftddft.py b/gpu4pyscf/tdscf/tests/test_sftddft.py new file mode 100644 index 00000000..0358fb3a --- /dev/null +++ b/gpu4pyscf/tdscf/tests/test_sftddft.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import unittest +import numpy as np +import cupy as cp +from pyscf import lib, gto, scf +from gpu4pyscf import tdscf +try: + import mcfun +except ImportError: + mcfun = None + +class KnownValues(unittest.TestCase): + @classmethod + def setUpClass(cls): + mol = gto.Mole() + mol.verbose = 5 + mol.output = '/dev/null' + mol.atom = ''' + O 0. 0. 0. + H 0. -0.757 0.587 + H 0. 0.757 0.587''' + mol.spin = 2 + mol.basis = '631g' + cls.mol = mol.build() + cls.mf = mol.UHF().to_gpu().run() + + @classmethod + def tearDownClass(cls): + cls.mol.stdout.close() + + def test_tda(self): + mf = self.mf + # sftddft not available in pyscf main branch. 
References are created + # using the sftda module from pyscf-forge + ref = [ 0.46644071, 0.55755649, 1.05310518] + td = mf.SFTDA().run(extype=0, conv_tol=1e-7) + self.assertAlmostEqual(abs(td.e - ref).max(), 0, 6) + + ref = [-0.21574567, 0.00270390, 0.03143914] + td = mf.SFTDA().run(extype=1, conv_tol=1e-7) + self.assertAlmostEqual(abs(td.e - ref).max(), 0, 6) + + @unittest.skipIf(mcfun is None, 'MCfun not available') + def test_mcol_b3lyp_tda(self): + mf = self.mf + # sftddft not available in pyscf main branch. References are created + # using the sftda module from pyscf-forge + ref = [ 0.45941171, 0.57799552, 1.06629265] + td = mf.SFTDA().run(collinear='mcol', extype=0, conv_tol=1e-7) + self.assertAlmostEqual(abs(td.e - ref).max(), 0, 6) + + ref = [-0.29629139, 0.00067017, 0.01956306] + td = mf.SFTDA().run(collinear='mcol', extype=1, conv_tol=1e-7) + self.assertAlmostEqual(abs(td.e - ref).max(), 0, 6) + + @unittest.skip('Numerical issues encountered in non-hermitian diagonalization') + def test_tdhf(self): + mf = self.mf + ref = [1.74385401, 9.38227395, 14.90168875] + td = mf.SFTDHF().run(extype=0, conv_tol=1e-7) + self.assertAlmostEqual(abs(td.e - ref).max(), 0, 6) + + ref = [0.41701647, 9.59644331, 22.99972711] + td = mf.SFTDHF().run(extype=1, conv_tol=1e-7) + self.assertAlmostEqual(abs(td.e - ref).max(), 0, 6) + +if __name__ == "__main__": + print("Full Tests for spin-flip-TDA and spin-flip-TDDFT") + unittest.main() diff --git a/gpu4pyscf/tdscf/tests/test_tdrhf.py b/gpu4pyscf/tdscf/tests/test_tdrhf.py new file mode 100644 index 00000000..3ebc0372 --- /dev/null +++ b/gpu4pyscf/tdscf/tests/test_tdrhf.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import unittest +import numpy as np +import cupy as cp +from pyscf import lib, gto, scf +from gpu4pyscf import tdscf + +class KnownValues(unittest.TestCase): + @classmethod + def setUpClass(cls): + mol = gto.Mole() + mol.verbose = 7 + mol.output = '/dev/null' + mol.atom = [ + ['H' , (0. , 0. , .917)], + ['F' , (0. , 0. 
, 0.)], ] + mol.basis = '631g' + mol.symmetry = True + cls.mol = mol.build() + cls.mf = mf = scf.RHF(mol).to_gpu().run() + cls.df_mf = mf.density_fit().run() + cls.nstates = 5 # make sure first 3 states are converged + + @classmethod + def tearDownClass(cls): + cls.mol.stdout.close() + + def test_tda_singlet(self): + mf = self.mf + nstates = self.nstates + td = mf.TDA().set(nstates=nstates) + assert td.device == 'gpu' + e = td.kernel()[0] + ref = [11.9027511, 11.9027511, 16.8603101] + self.assertAlmostEqual(abs(e[:len(ref)] * 27.2114 - ref).max(), 0, 5) + dip = td.transition_dipole() + self.assertAlmostEqual(lib.fp(np.linalg.norm(dip, axis=1)), -0.65616659, 5) + + df_mf = self.df_mf + td = df_mf.TDA().set(nstates=nstates) + e = td.kernel()[0] + ref = td.to_cpu().kernel()[0][:3] + self.assertAlmostEqual(abs(e[:len(ref)] - ref).max(), 0, 7) + dip = td.transition_dipole() + self.assertAlmostEqual(lib.fp(np.linalg.norm(dip, axis=1)), -0.65618093, 5) + + def test_tda_triplet(self): + mf = self.mf + nstates = self.nstates + td = mf.TDA().set(nstates=nstates) + assert td.device == 'gpu' + td.singlet = False + e = td.kernel()[0] + ref = [11.0174650, 11.0174650, 13.1694960] + self.assertAlmostEqual(abs(e[:len(ref)] * 27.2114 - ref).max(), 0, 5) + dip = td.transition_dipole() + self.assertAlmostEqual(abs(dip).max(), 0, 8) + + df_mf = self.df_mf + td = df_mf.TDA().set(nstates=nstates) + td.singlet = False + e = td.kernel()[0] + ref = td.to_cpu().kernel()[0][:3] + self.assertAlmostEqual(abs(e[:len(ref)] - ref).max(), 0, 7) + dip = td.transition_dipole() + self.assertAlmostEqual(abs(dip).max(), 0, 8) + + def test_tdhf_singlet(self): + mf = self.mf + nstates = self.nstates + td = mf.TDHF().set(nstates=nstates) + assert td.device == 'gpu' + e = td.kernel()[0] + ref = [11.8348584, 11.8348584, 16.6630381] + self.assertAlmostEqual(abs(e[:len(ref)] * 27.2114 - ref).max(), 0, 5) + dip = td.transition_dipole() + self.assertAlmostEqual(lib.fp(np.linalg.norm(dip, axis=1)), -0.64009191, 5) + + df_mf = self.df_mf + td = df_mf.TDHF().set(nstates=nstates) + e = td.kernel()[0] + ref = td.to_cpu().kernel()[0][:3] + self.assertAlmostEqual(abs(e[:len(ref)] - ref).max(), 0, 7) + dip = td.transition_dipole() + self.assertAlmostEqual(lib.fp(np.linalg.norm(dip, axis=1)), -0.64011895, 5) + + def test_tdhf_triplet(self): + mf = self.mf + nstates = self.nstates + td = mf.TDHF().set(nstates=nstates) + assert td.device == 'gpu' + td.singlet = False + e = td.kernel()[0] + ref = [10.8919091, 10.8919091, 12.6343507] + self.assertAlmostEqual(abs(e[:len(ref)] * 27.2114 - ref).max(), 0, 5) + dip = td.transition_dipole() + self.assertAlmostEqual(abs(dip).max(), 0, 8) + + df_mf = self.df_mf + td = df_mf.TDHF().set(nstates=nstates) + td.singlet = False + e = td.kernel()[0] + ref = td.to_cpu().kernel()[0][:3] + self.assertAlmostEqual(abs(e[:len(ref)] - ref).max(), 0, 7) + dip = td.transition_dipole() + self.assertAlmostEqual(abs(dip).max(), 0, 8) + + def test_tda_vind(self): + mf = self.mf + nocc = self.mol.nelectron // 2 + nmo = mf.mo_energy.size + nvir = nmo - nocc + zs = np.random.rand(3,nocc,nvir) + ref = mf.to_cpu().TDA().set(singlet=False).gen_vind()[0](zs) + dat = mf.TDA().set(singlet=False).gen_vind()[0](cp.asarray(zs)) + self.assertAlmostEqual(abs(ref - dat).max(), 0, 9) + + df_mf = self.df_mf + ref = df_mf.to_cpu().TDA().set(singlet=True).gen_vind()[0](zs) + dat = df_mf.TDA().set(singlet=True).gen_vind()[0](cp.asarray(zs)) + self.assertAlmostEqual(abs(ref - dat).max(), 0, 9) + + def test_tdhf_vind(self): + mf = self.mf + nocc = 
self.mol.nelectron // 2 + nmo = mf.mo_energy.size + nvir = nmo - nocc + zs = np.random.rand(3,2,nocc,nvir) + ref = mf.to_cpu().TDHF().set(singlet=True).gen_vind()[0](zs) + dat = mf.TDHF().set(singlet=True).gen_vind()[0](cp.asarray(zs)) + self.assertAlmostEqual(abs(ref - dat).max(), 0, 9) + + df_mf = self.df_mf + ref = df_mf.to_cpu().TDHF().set(singlet=False).gen_vind()[0](zs) + dat = df_mf.TDHF().set(singlet=False).gen_vind()[0](cp.asarray(zs)) + self.assertAlmostEqual(abs(ref - dat).max(), 0, 9) + +if __name__ == "__main__": + print("Full Tests for rhf-TDA and rhf-TDHF") + unittest.main() diff --git a/gpu4pyscf/tdscf/tests/test_tdrks.py b/gpu4pyscf/tdscf/tests/test_tdrks.py new file mode 100644 index 00000000..c113c1bd --- /dev/null +++ b/gpu4pyscf/tdscf/tests/test_tdrks.py @@ -0,0 +1,282 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import unittest +import numpy as np +import cupy as cp +from pyscf import lib, gto +from gpu4pyscf import tdscf + +class KnownValues(unittest.TestCase): + @classmethod + def setUpClass(cls): + mol = gto.Mole() + mol.verbose = 5 + mol.output = '/dev/null' + mol.atom = [ + ['H' , (0. , 0. , .917)], + ['F' , (0. , 0. 
, 0.)], ] + mol.basis = '631g' + cls.mol = mol.build() + + cls.mf = mf = mol.RHF().to_gpu().run() + cls.td_hf = mf.TDHF().run(conv_tol=1e-6) + + mf_lda = mol.RKS().to_gpu().density_fit() + mf_lda.xc = 'lda, vwn' + mf_lda.grids.prune = None + mf_lda.cphf_grids = mf_lda.grids + cls.mf_lda = mf_lda.run(conv_tol=1e-10) + + mf_bp86 = mol.RKS().to_gpu().density_fit() + mf_bp86.xc = 'b88,p86' + mf_bp86.grids.prune = None + mf_bp86.cphf_grids = mf_bp86.grids + cls.mf_bp86 = mf_bp86.run(conv_tol=1e-10) + + mf_b3lyp = mol.RKS().to_gpu().density_fit() + mf_b3lyp.xc = 'b3lyp5' + mf_b3lyp.grids.prune = None + mf_b3lyp.cphf_grids = mf_b3lyp.grids + cls.mf_b3lyp = mf_b3lyp.run(conv_tol=1e-10) + + mf_m06l = mol.RKS().to_gpu().density_fit() + mf_m06l.xc = 'm06l' + mf_m06l.cphf_grids = mf_m06l.grids + cls.mf_m06l = mf_m06l.run(conv_tol=1e-10) + + @classmethod + def tearDownClass(cls): + cls.mol.stdout.close() + + def test_nohbrid_lda(self): + mf_lda = self.mf_lda + td = mf_lda.CasidaTDDFT() + assert td.device == 'gpu' + es = td.kernel(nstates=5)[0] + ref = td.to_cpu().kernel(nstates=5)[0] + self.assertAlmostEqual(abs(es - ref).max(), 0, 5) + self.assertAlmostEqual(lib.fp(es), -1.5103950945691957, 5) + + def test_nohbrid_b88p86(self): + mf_bp86 = self.mf_bp86 + td = mf_bp86.CasidaTDDFT() + assert td.device == 'gpu' + es = td.kernel(nstates=5)[0] + ref = td.to_cpu().kernel()[0] + self.assertAlmostEqual(abs(es - ref).max(), 0, 8) + self.assertAlmostEqual(lib.fp(es), -1.4869180666784665, 6) + + def test_tddft_lda(self): + mf_lda = self.mf_lda + td = mf_lda.TDDFT() + assert td.device == 'gpu' + es = td.kernel(nstates=5)[0] + ref = td.to_cpu().kernel(nstates=5)[0] + self.assertAlmostEqual(abs(es - ref).max(), 0, 8) + self.assertAlmostEqual(lib.fp(es), -1.5103950945691957, 6) + + def test_tddft_b88p86(self): + mf_bp86 = self.mf_bp86 + td = mf_bp86.TDDFT() + assert td.device == 'gpu' + td.conv_tol = 1e-5 + es = td.kernel(nstates=5)[0] + ref = td.to_cpu().kernel(nstates=5)[0] + self.assertAlmostEqual(abs(es - ref).max(), 0, 8) + self.assertAlmostEqual(lib.fp(es), -1.4869180666784665, 6) + + def test_tddft_b3lyp(self): + mf_b3lyp = self.mf_b3lyp + td = mf_b3lyp.TDDFT() + assert td.device == 'gpu' + es = td.kernel(nstates=5)[0] + ref = td.to_cpu().kernel(nstates=5)[0] + self.assertAlmostEqual(abs(es - ref).max(), 0, 8) + self.assertAlmostEqual(lib.fp(es), -1.5175884245769546, 6) + + def test_tddft_camb3lyp(self): + mol = self.mol + mf = mol.RKS(xc='camb3lyp').run() + mf.cphf_grids = mf.grids + td = mf.TDDFT().to_gpu() + assert td.device == 'gpu' + td.conv_tol = 1e-5 + es = td.kernel(nstates=4)[0] + e_ref = td.to_cpu().kernel(nstates=4)[0] + self.assertAlmostEqual(abs(es[:3]-e_ref[:3]).max(), 0, 8) + self.assertAlmostEqual(lib.fp(es[:3]*27.2114), 9.00540521503348, 6) + + def test_tda_b3lypg(self): + mol = self.mol + mf = mol.RKS() + mf.xc = 'b3lypg' + mf.grids.prune = None + mf.cphf_grids = mf.grids + mf.scf() + td = mf.TDA().to_gpu() + assert td.device == 'gpu' + es = td.kernel(nstates=5)[0] + ref = td.to_cpu().kernel(nstates=5)[0] + self.assertAlmostEqual(abs(es - ref).max(), 0, 8) + self.assertAlmostEqual(lib.fp(es), -1.520888995669812, 6) + + def test_tda_lda(self): + mf_lda = self.mf_lda + td = mf_lda.TDA() + assert td.device == 'gpu' + es = td.kernel(nstates=5)[0] + ref = td.to_cpu().kernel(nstates=5)[0] + self.assertAlmostEqual(abs(es - ref).max(), 0, 8) + self.assertAlmostEqual(lib.fp(es), -1.5141057378565799, 6) + + def test_tda_b3lyp_triplet(self): + mf_b3lyp = self.mf_b3lyp + td = mf_b3lyp.TDA() + assert 
td.device == 'gpu' + td.singlet = False + es = td.kernel(nstates=5)[0] + ref = td.to_cpu().kernel(nstates=5)[0] + self.assertAlmostEqual(abs(es - ref).max(), 0, 8) + self.assertAlmostEqual(lib.fp(es), -1.4707787881198082, 6) + td.analyze() + + def test_tda_lda_triplet(self): + mf_lda = self.mf_lda + td = mf_lda.TDA() + assert td.device == 'gpu' + td.singlet = False + es = td.kernel(nstates=6)[0] + ref = td.to_cpu().kernel(nstates=6)[0] + self.assertAlmostEqual(abs(es - ref).max(), 0, 8) + self.assertAlmostEqual(lib.fp(es[[0,1,2,4,5]]), -1.4695846533898422, 6) + + def test_tddft_b88p86_triplet(self): + mf_bp86 = self.mf_bp86 + td = mf_bp86.TDDFT() + assert td.device == 'gpu' + td.singlet = False + es = td.kernel(nstates=5)[0] + ref = td.to_cpu().kernel(nstates=5)[0] + self.assertAlmostEqual(abs(es - ref).max(), 0, 8) + self.assertAlmostEqual(lib.fp(es), -1.4412243124430528, 6) + + def test_tda_rsh(self): + mol = gto.M(atom='H 0 0 0.6; H 0 0 0', basis = "6-31g") + mf = mol.RKS() + mf.xc = 'wb97' + mf.kernel() + mf.cphf_grids = mf.grids + td = mf.TDA().to_gpu() + assert td.device == 'gpu' + e_td = td.set(nstates=5).kernel()[0] + ref = td.to_cpu().kernel(nstates=5)[0] + self.assertAlmostEqual(abs(e_td - ref).max(), 0, 8) + self.assertAlmostEqual(lib.fp(e_td), 0.3953917940299652, 6) + + def test_tda_m06l_singlet(self): + mf_m06l = self.mf_m06l + td = mf_m06l.TDA() + assert td.device == 'gpu' + es = td.kernel(nstates=5)[0] + ref = td.to_cpu().kernel(nstates=5)[0] + self.assertAlmostEqual(abs(es - ref).max(), 0, 8) + self.assertAlmostEqual(lib.fp(es), -1.5620823865741496, 6) + + def test_analyze(self): + td_hf = self.td_hf + assert td_hf.device == 'gpu' + f = td_hf.oscillator_strength(gauge='length') + self.assertAlmostEqual(lib.fp(f), -0.13908774016795605, 5) + f = td_hf.oscillator_strength(gauge='velocity', order=2) + self.assertAlmostEqual(lib.fp(f), -0.096991134490587522, 5) + + note_args = [] + def temp_logger_note(rec, msg, *args): + note_args.append(args) + with lib.temporary_env(lib.logger.Logger, note=temp_logger_note): + td_hf.analyze() + ref = [(), + (1, 11.834865910142547, 104.76181013351982, 0.01075359074556743), + (2, 11.834865910142618, 104.76181013351919, 0.010753590745567499), + (3, 16.66308427853695, 74.40651170629978, 0.3740302871966713)] + self.assertAlmostEqual(abs(np.hstack(ref) - + np.hstack(note_args)).max(), 0, 3) + + self.assertEqual(td_hf.nroots, td_hf.nstates) + mf = self.mf + self.assertAlmostEqual(lib.fp(td_hf.e_tot-mf.e_tot), 0.41508325757603637, 5) + + def test_scanner(self): + mol = self.mol + td_hf = self.td_hf + td_scan = td_hf.as_scanner().as_scanner() + td_scan.nroots = 3 + td_scan(mol) + self.assertAlmostEqual(lib.fp(td_scan.e), 0.41508325757603637, 5) + + def test_transition_multipoles(self): + td_hf = self.td_hf + self.assertAlmostEqual(abs(lib.fp(td_hf.transition_dipole() [2])), 0.39833021312014988, 4) + self.assertAlmostEqual(abs(lib.fp(td_hf.transition_quadrupole() [2])), 0.14862776196563565, 4) + self.assertAlmostEqual(abs(lib.fp(td_hf.transition_octupole() [2])), 2.79058994496489410, 4) + self.assertAlmostEqual(abs(lib.fp(td_hf.transition_velocity_dipole() [2])), 0.24021409469918567, 4) + self.assertAlmostEqual(abs(lib.fp(td_hf.transition_magnetic_dipole() [2])), 0 , 4) + self.assertAlmostEqual(abs(lib.fp(td_hf.transition_magnetic_quadrupole()[2])), 0.16558596265719450, 4) + + def test_reset(self): + mol1 = gto.M(atom='C') + mol = self.mol + td = mol.RHF().newton().TDHF().to_gpu() + assert td.device == 'gpu' + td.reset(mol1) + self.assertTrue(td.mol is 
mol1) + self.assertTrue(td._scf.mol is mol1) + + def test_tda_vind(self): + mf = self.mf_bp86 + nocc = self.mol.nelectron // 2 + nmo = mf.mo_energy.size + nvir = nmo - nocc + zs = np.random.rand(3,nocc,nvir) + ref = mf.to_cpu().TDA().set(singlet=False).gen_vind()[0](zs) + dat = mf.TDA().set(singlet=False).gen_vind()[0](cp.asarray(zs)) + self.assertAlmostEqual(abs(ref - dat).max(), 0, 9) + + def test_tddft_vind(self): + mf = self.mf_b3lyp + nocc = self.mol.nelectron // 2 + nmo = mf.mo_energy.size + nvir = nmo - nocc + zs = np.random.rand(3,2,nocc,nvir) + ref = mf.to_cpu().TDDFT().set(singlet=True).gen_vind()[0](zs) + dat = mf.TDDFT().set(singlet=True).gen_vind()[0](cp.asarray(zs)) + self.assertAlmostEqual(abs(ref - dat).max(), 0, 9) + + def test_casida_tddft_vind(self): + mf = self.mf_lda + nocc = self.mol.nelectron // 2 + nmo = mf.mo_energy.size + nvir = nmo - nocc + zs = np.random.rand(3,nocc,nvir) + ref = mf.to_cpu().CasidaTDDFT().set().gen_vind()[0](zs) + dat = mf.CasidaTDDFT().set().gen_vind()[0](cp.asarray(zs)) + self.assertAlmostEqual(abs(ref - dat).max(), 0, 9) + +if __name__ == "__main__": + print("Full Tests for TD-RKS") + unittest.main() diff --git a/gpu4pyscf/tdscf/tests/test_tduhf.py b/gpu4pyscf/tdscf/tests/test_tduhf.py new file mode 100644 index 00000000..2b6c2df9 --- /dev/null +++ b/gpu4pyscf/tdscf/tests/test_tduhf.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import unittest +import numpy as np +import cupy as cp +from pyscf import lib, gto, scf +from gpu4pyscf import tdscf + +class KnownValues(unittest.TestCase): + @classmethod + def setUpClass(cls): + mol = gto.Mole() + mol.verbose = 0 + mol.atom = [ + ['H' , (0. , 0. , .917)], + ['F' , (0. , 0. , 0.)], ] + mol.basis = '631g' + # FIXME: mo_coeff of uhf_symm.SymAdaptedUHF not converted to cupy arrays + mol.symmetry = True + cls.mol = mol.build() + cls.mf = scf.UHF(mol).density_fit().run(conv_tol=1e-10).to_gpu() + + mol1 = gto.Mole() + mol1.verbose = 7 + mol1.output = '/dev/null' + mol1.atom = [ + ['H' , (0. , 0. , .917)], + ['F' , (0. , 0. 
, 0.)], ] + mol1.basis = '631g' + mol1.spin = 2 + cls.mol1 = mol1.build() + cls.mf1 = scf.UHF(mol1).run(conv_tol=1e-10).to_gpu() + + @classmethod + def tearDownClass(cls): + cls.mol1.stdout.close() + + def test_tda(self): + mf = self.mf + td = mf.TDA() + assert td.device == 'gpu' + td.nstates = 5 + e = td.kernel()[0] + ref = [11.0179839, 11.0179839, 11.9031214, 11.9031214, 13.1701375] + self.assertAlmostEqual(abs(e * 27.2114 - ref).max(), 0, 4) + ref = td.to_cpu().kernel()[0] + self.assertAlmostEqual(abs(e - ref).max(), 0, 4) + + def test_tdhf(self): + mf = self.mf + td = mf.TDHF() + assert td.device == 'gpu' + td.nstates = 5 + td.conv_tol = 1e-5 + e = td.kernel()[0] + ref = [10.8924334, 10.8924334, 11.8352278, 11.8352278, 12.6350840] + self.assertAlmostEqual(abs(e * 27.2114 - ref).max(), 0, 4) + ref = td.to_cpu().kernel()[0] + self.assertAlmostEqual(abs(e - ref).max(), 0, 4) + + def test_tda1(self): + mf1 = self.mf1 + td = mf1.TDA() + assert td.device == 'gpu' + td.nstates = 5 + e = td.kernel()[0] + ref = [ 3.3211349, 18.5597821, 21.0147390, 21.6150240, 25.0938938] + self.assertAlmostEqual(abs(e * 27.2114 - ref).max(), 0, 4) + ref = td.to_cpu().kernel()[0] + self.assertAlmostEqual(abs(e - ref).max(), 0, 4) + + def test_tdhf1(self): + mf1 = self.mf1 + td = mf1.TDHF() + assert td.device == 'gpu' + td.nstates = 4 + e = td.kernel()[0] + ref = [ 3.3126683, 18.4954862, 20.8493515, 21.5480882,] + self.assertAlmostEqual(abs(e * 27.2114 - ref).max(), 0, 4) + ref = td.to_cpu().kernel()[0] + self.assertAlmostEqual(abs(e - ref).max(), 0, 4) + + def test_tda_vind(self): + mf = self.mf1 + nocca, noccb = mf.nelec + nmo = mf.mo_energy[0].size + nvira = nmo - nocca + nvirb = nmo - noccb + zs = np.random.rand(3,nocca*nvira+noccb*nvirb) + ref = mf.to_cpu().TDA().set().gen_vind()[0](zs) + dat = mf.TDA().set().gen_vind()[0](cp.asarray(zs)) + self.assertAlmostEqual(abs(ref - dat).max(), 0, 9) + + def test_tdhf_vind(self): + mf = self.mf1 + nocca, noccb = mf.nelec + nmo = mf.mo_energy[0].size + nvira = nmo - nocca + nvirb = nmo - noccb + zs = np.random.rand(3,2,nocca*nvira+noccb*nvirb) + ref = mf.to_cpu().TDHF().set().gen_vind()[0](zs) + dat = mf.TDHF().set().gen_vind()[0](cp.asarray(zs)) + self.assertAlmostEqual(abs(ref - dat).max(), 0, 9) + +if __name__ == "__main__": + print("Full Tests for uhf-TDA and uhf-TDHF") + unittest.main() diff --git a/gpu4pyscf/tdscf/tests/test_tduks.py b/gpu4pyscf/tdscf/tests/test_tduks.py new file mode 100644 index 00000000..598e4156 --- /dev/null +++ b/gpu4pyscf/tdscf/tests/test_tduks.py @@ -0,0 +1,220 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import unittest +import numpy as np +import cupy as cp +from pyscf import lib, gto +from gpu4pyscf import tdscf + +class KnownValues(unittest.TestCase): + @classmethod + def setUpClass(cls): + mol = gto.Mole() + mol.verbose = 5 + mol.output = '/dev/null' + mol.atom = ''' + O 0. 0. 
0. + H 0. -0.757 0.587 + H 0. 0.757 0.587''' + mol.spin = 2 + mol.basis = '631g' + cls.mol = mol.build() + + mol1 = gto.Mole() + mol1.verbose = 0 + mol1.atom = ''' + O 0. 0. 0. + H 0. -0.757 0.587 + H 0. 0.757 0.587''' + mol1.basis = '631g' + cls.mol1 = mol1.build() + + cls.mf_uhf = mf_uhf = mol.UHF().to_gpu().run() + cls.td_hf = mf_uhf.TDHF().run(conv_tol=1e-6) + + mf_lda = mol.UKS().set(xc='lda', conv_tol=1e-12).to_gpu() + mf_lda.grids.prune = None + mf_lda.cphf_grids = mf_lda.grids + cls.mf_lda = mf_lda.density_fit().run() + + mf_bp86 = mol.UKS().set(xc='b88,p86', conv_tol=1e-12).to_gpu() + mf_bp86.grids.prune = None + mf_bp86.cphf_grids = mf_bp86.grids + cls.mf_bp86 = mf_bp86.density_fit().run() + + mf_b3lyp = mol.UKS().set(xc='b3lyp5', conv_tol=1e-12).to_gpu() + mf_b3lyp.grids.prune = None + mf_b3lyp.cphf_grids = mf_b3lyp.grids + cls.mf_b3lyp = mf_b3lyp.density_fit().run() + + mf_m06l = mol.UKS().to_gpu().density_fit().run(xc='m06l') + mf_m06l.cphf_grids = mf_m06l.grids + cls.mf_m06l = mf_m06l + + @classmethod + def tearDownClass(cls): + cls.mol.stdout.close() + + def test_nohybrid_lda(self): + mf_lda = self.mf_lda + td = mf_lda.CasidaTDDFT() + assert td.device == 'gpu' + es = td.kernel(nstates=4)[0] + e_ref = td.to_cpu().kernel(nstates=4)[0] + self.assertAlmostEqual(abs(es[:3]-e_ref[:3]).max(), 0, 8) + self.assertAlmostEqual(lib.fp(es[:3]), 0.0476763425122965, 6) + + mol1 = self.mol1 + mf = mol1.UKS().run(xc='lda, vwn_rpa').run() + mf.cphf_grids = mf.grids + td = mf.CasidaTDDFT().to_gpu() + assert td.device == 'gpu' + td.nstates = 5 + es = td.kernel()[0] + ref = td.to_cpu().kernel()[0] + self.assertAlmostEqual(abs(es - ref).max(), 0, 8) + + def test_nohybrid_b88p86(self): + mf_bp86 = self.mf_bp86 + td = mf_bp86.CasidaTDDFT() + assert td.device == 'gpu' + es = td.kernel(nstates=4)[0] + e_ref = td.to_cpu().kernel(nstates=4)[0] + self.assertAlmostEqual(abs(es[:3]-e_ref[:3]).max(), 0, 8) + self.assertAlmostEqual(lib.fp(es[:3]), 0.05383891686210346, 6) + + def test_tddft_lda(self): + mf_lda = self.mf_lda + td = mf_lda.TDDFT() + assert td.device == 'gpu' + es = td.kernel(nstates=4)[0] + ref = td.to_cpu().kernel(nstates=4)[0] + self.assertAlmostEqual(abs(es - ref).max(), 0, 8) + self.assertAlmostEqual(lib.fp(es[:3]), 0.0476763425122965, 6) + + def test_tddft_b88p86(self): + mf_bp86 = self.mf_bp86 + td = mf_bp86.TDDFT() + assert td.device == 'gpu' + es = td.kernel(nstates=5)[0] + ref = td.to_cpu().kernel(nstates=5)[0] + self.assertAlmostEqual(abs(es - ref).max(), 0, 8) + self.assertAlmostEqual(lib.fp(es[:3]), 0.05383891686259823, 6) + + mol1 = self.mol1 + mf = mol1.UKS().run(xc='b88,p86').run() + mf.cphf_grids = mf.grids + td = mf.TDDFT().to_gpu() + assert td.device == 'gpu' + es = td.kernel(nstates=5)[0] + ref = td.to_cpu().kernel(nstates=5)[0] + self.assertAlmostEqual(abs(es - ref).max(), 0, 8) + + def test_tddft_b3lyp(self): + mf_b3lyp = self.mf_b3lyp + td = mf_b3lyp.TDDFT() + assert td.device == 'gpu' + es = td.kernel(nstates=4)[0] + ref = td.to_cpu().kernel(nstates=4)[0] + self.assertAlmostEqual(abs(es - ref).max(), 0, 8) + self.assertAlmostEqual(lib.fp(es[:3]), 0.047793873508724743, 6) + + def test_tddft_camb3lyp(self): + mol1 = self.mol1 + mf = mol1.UKS(xc='camb3lyp').run() + mf.cphf_grids = mf.grids + td = mf.TDDFT().to_gpu() + assert td.device == 'gpu' + es = td.kernel(nstates=4)[0] + e_ref = td.to_cpu().kernel(nstates=4)[0] + self.assertAlmostEqual(abs(es[:3]-e_ref[:3]).max(), 0, 8) + self.assertAlmostEqual(lib.fp(es[:3]), 0.2827429269753051, 6) + + def test_tda_b3lyp(self): + 
mf_b3lyp = self.mf_b3lyp + td = mf_b3lyp.TDA() + assert td.device == 'gpu' + es = td.kernel(nstates=4)[0] + ref = td.to_cpu().kernel(nstates=4)[0] + self.assertAlmostEqual(abs(es - ref).max(), 0, 8) + self.assertAlmostEqual(lib.fp(es[:3]), 0.052638024165134974, 6) + + def test_tda_lda(self): + mf_lda = self.mf_lda + td = mf_lda.TDA() + assert td.device == 'gpu' + es = td.kernel(nstates=5)[0] + ref = td.to_cpu().kernel(nstates=5)[0] + self.assertAlmostEqual(abs(es - ref).max(), 0, 8) + self.assertAlmostEqual(lib.fp(es[:3]), 0.05368082550881462, 6) + + mol1 = self.mol1 + mf = mol1.UKS().run(xc='lda,vwn').run() + mf.cphf_grids = mf.grids + td = mf.TDA().to_gpu() + assert td.device == 'gpu' + td.nstates = 5 + es = td.kernel()[0] + ref = td.to_cpu().kernel()[0] + self.assertAlmostEqual(abs(es - ref).max(), 0, 8) + + def test_tda_m06l(self): + mf_m06l = self.mf_m06l + td = mf_m06l.TDA() + assert td.device == 'gpu' + es = td.kernel(nstates=5)[0] + ref = td.to_cpu().kernel(nstates=5)[0] + self.assertAlmostEqual(abs(es - ref[:5]).max(), 0, 8) + self.assertAlmostEqual(lib.fp(es), -0.7530329968766932, 6) + + def test_tda_vind(self): + mf = self.mf_bp86 + nocca, noccb = mf.nelec + nmo = mf.mo_energy[0].size + nvira = nmo - nocca + nvirb = nmo - noccb + zs = np.random.rand(3,nocca*nvira+noccb*nvirb) + ref = mf.to_cpu().TDA().gen_vind()[0](zs) + dat = mf.TDA().gen_vind()[0](cp.asarray(zs)) + self.assertAlmostEqual(abs(ref - dat).max(), 0, 9) + + def test_tddft_vind(self): + mf = self.mf_b3lyp + nocca, noccb = mf.nelec + nmo = mf.mo_energy[0].size + nvira = nmo - nocca + nvirb = nmo - noccb + zs = np.random.rand(3,2,nocca*nvira+noccb*nvirb) + ref = mf.to_cpu().TDDFT().gen_vind()[0](zs) + dat = mf.TDDFT().gen_vind()[0](cp.asarray(zs)) + self.assertAlmostEqual(abs(ref - dat).max(), 0, 9) + + def test_casida_tddft_vind(self): + mf = self.mf_lda + nocca, noccb = mf.nelec + nmo = mf.mo_energy[0].size + nvira = nmo - nocca + nvirb = nmo - noccb + zs = np.random.rand(3,nocca*nvira+noccb*nvirb) + ref = mf.to_cpu().CasidaTDDFT().gen_vind()[0](zs) + dat = mf.CasidaTDDFT().gen_vind()[0](cp.asarray(zs)) + self.assertAlmostEqual(abs(ref - dat).max(), 0, 9) + +if __name__ == "__main__": + print("Full Tests for TD-UKS") + unittest.main() diff --git a/gpu4pyscf/tdscf/uhf.py b/gpu4pyscf/tdscf/uhf.py new file mode 100644 index 00000000..27cc0850 --- /dev/null +++ b/gpu4pyscf/tdscf/uhf.py @@ -0,0 +1,785 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
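Before the implementation, a short orientation sketch: this module is driven like its CPU counterpart pyscf.tdscf.uhf. The molecule, basis, and state counts below mirror the tests in this PR and are illustrative only:

from pyscf import gto, scf

mol = gto.M(atom='H 0 0 0.917; F 0 0 0', basis='631g', spin=2)
mf = scf.UHF(mol).run(conv_tol=1e-10).to_gpu()

td = mf.TDA()              # spin-conserved TDA on the GPU
td.nstates = 5
e_tda, xy_tda = td.kernel()

td = mf.TDHF()             # full TDHF; a non-Hermitian eigenproblem
td.nstates = 4
e_tdhf, xy_tdhf = td.kernel()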
+ + +import numpy as np +import cupy as cp +from pyscf import lib +from pyscf.tdscf import uhf as tdhf_cpu +from pyscf.data.nist import HARTREE2EV, HARTREE2WAVENUMBER +from pyscf.tdscf._lr_eig import eigh as lr_eigh, eig as lr_eig +from gpu4pyscf import scf +from gpu4pyscf.lib import logger +from gpu4pyscf.lib.cupy_helper import contract, tag_array +from gpu4pyscf.tdscf._uhf_resp_sf import gen_uhf_response_sf +from gpu4pyscf.tdscf import rhf as tdhf_gpu +from gpu4pyscf.dft import KohnShamDFT +from pyscf import __config__ + +__all__ = [ + 'TDA', 'CIS', 'TDHF', 'TDUHF', 'TDBase' +] + +REAL_EIG_THRESHOLD = tdhf_cpu.REAL_EIG_THRESHOLD + +def gen_tda_operation(mf, fock_ao=None, wfnsym=None): + '''A x + ''' + assert fock_ao is None + assert isinstance(mf, scf.hf.SCF) + assert wfnsym is None + if isinstance(mf.mo_coeff, (tuple, list)): + # The to_gpu() in pyscf is not able to convert SymAdaptedUHF.mo_coeff. + # In this case, mf.mo_coeff has the type (NPArrayWithTag, NPArrayWithTag). + # cp.asarray() for this object leads to an error in + # cupy._core.core._array_from_nested_sequence + mo_coeff = cp.asarray(mf.mo_coeff[0]), cp.asarray(mf.mo_coeff[1]) + else: + mo_coeff = cp.asarray(mf.mo_coeff) + assert mo_coeff[0].dtype == cp.float64 + mo_energy = cp.asarray(mf.mo_energy) + mo_occ = cp.asarray(mf.mo_occ) + nao, nmo = mo_coeff[0].shape + occidxa = mo_occ[0] > 0 + occidxb = mo_occ[1] > 0 + viridxa = mo_occ[0] ==0 + viridxb = mo_occ[1] ==0 + orboa = mo_coeff[0][:,occidxa] + orbob = mo_coeff[1][:,occidxb] + orbva = mo_coeff[0][:,viridxa] + orbvb = mo_coeff[1][:,viridxb] + + e_ia_a = mo_energy[0][viridxa] - mo_energy[0][occidxa,None] + e_ia_b = mo_energy[1][viridxb] - mo_energy[1][occidxb,None] + e_ia = cp.hstack((e_ia_a.reshape(-1), e_ia_b.reshape(-1))) + hdiag = e_ia.get() + nocca, nvira = e_ia_a.shape + noccb, nvirb = e_ia_b.shape + + vresp = mf.gen_response(hermi=0) + + def vind(zs): + nz = len(zs) + zs = cp.asarray(zs) + za = zs[:,:nocca*nvira].reshape(nz,nocca,nvira) + zb = zs[:,nocca*nvira:].reshape(nz,noccb,nvirb) + mo1a = contract('xov,pv->xpo', za, orbva) + dmsa = contract('xpo,qo->xpq', mo1a, orboa.conj()) + mo1b = contract('xov,pv->xpo', zb, orbvb) + dmsb = contract('xpo,qo->xpq', mo1b, orbob.conj()) + dms = cp.asarray((dmsa, dmsb)) + dms = tag_array(dms, mo1=[mo1a,mo1b], occ_coeff=[orboa,orbob]) + v1ao = vresp(dms) + v1a = contract('xpq,qo->xpo', v1ao[0], orboa) + v1a = contract('xpo,pv->xov', v1a, orbva.conj()) + v1b = contract('xpq,qo->xpo', v1ao[1], orbob) + v1b = contract('xpo,pv->xov', v1b, orbvb.conj()) + v1a += za * e_ia_a + v1b += zb * e_ia_b + hx = cp.hstack((v1a.reshape(nz,-1), v1b.reshape(nz,-1))) + return hx.get() + + return vind, hdiag + + +class TDBase(tdhf_gpu.TDBase): + def _contract_multipole(tdobj, ints, hermi=True, xy=None): + if xy is None: xy = tdobj.xy + mo_coeff = tdobj._scf.mo_coeff + mo_occ = tdobj._scf.mo_occ + orbo_a = mo_coeff[0][:,mo_occ[0]==1] + orbv_a = mo_coeff[0][:,mo_occ[0]==0] + orbo_b = mo_coeff[1][:,mo_occ[1]==1] + orbv_b = mo_coeff[1][:,mo_occ[1]==0] + if isinstance(orbo_a, cp.ndarray): + orbo_a = orbo_a.get() + orbv_a = orbv_a.get() + orbo_b = orbo_b.get() + orbv_b = orbv_b.get() + + ints_a = np.einsum('...pq,pi,qj->...ij', ints, orbo_a.conj(), orbv_a) + ints_b = np.einsum('...pq,pi,qj->...ij', ints, orbo_b.conj(), orbv_b) + pol = [(np.einsum('...ij,ij->...', ints_a, x[0]) + + np.einsum('...ij,ij->...', ints_b, x[1])) for x,y in xy] + pol = np.array(pol) + y = xy[0][1] + if isinstance(y[0], np.ndarray): + pol_y = [(np.einsum('...ij,ij->...', ints_a, 
y[0]) + + np.einsum('...ij,ij->...', ints_b, y[1])) for x,y in xy] + if hermi: + pol += pol_y + else: # anti-Hermitian + pol -= pol_y + return pol + + +class TDA(TDBase): + __doc__ = tdhf_gpu.TDA.__doc__ + + singlet = None + + def gen_vind(self, mf=None): + '''Generate function to compute Ax''' + if mf is None: + mf = self._scf + return gen_tda_operation(mf) + + def init_guess(self, mf=None, nstates=None, wfnsym=None, return_symmetry=False): + if mf is None: mf = self._scf + if nstates is None: nstates = self.nstates + assert wfnsym is None + assert not return_symmetry + + mo_energy_a, mo_energy_b = mf.mo_energy + mo_occ_a, mo_occ_b = mf.mo_occ + if isinstance(mo_energy_a, cp.ndarray): + mo_energy_a = mo_energy_a.get() + mo_energy_b = mo_energy_b.get() + if isinstance(mo_occ_a, cp.ndarray): + mo_occ_a = mo_occ_a.get() + mo_occ_b = mo_occ_b.get() + occidxa = mo_occ_a > 0 + occidxb = mo_occ_b > 0 + viridxa = mo_occ_a == 0 + viridxb = mo_occ_b == 0 + e_ia_a = mo_energy_a[viridxa] - mo_energy_a[occidxa,None] + e_ia_b = mo_energy_b[viridxb] - mo_energy_b[occidxb,None] + nov = e_ia_a.size + e_ia_b.size + nstates = min(nstates, nov) + + e_ia = np.append(e_ia_a.ravel(), e_ia_b.ravel()) + # Find the nstates-th lowest energy gap + e_threshold = np.partition(e_ia, nstates-1)[nstates-1] + e_threshold += self.deg_eia_thresh + + idx = np.where(e_ia <= e_threshold)[0] + x0 = np.zeros((idx.size, nov)) + for i, j in enumerate(idx): + x0[i, j] = 1 + return x0 + + def kernel(self, x0=None, nstates=None): + '''TDA diagonalization solver + ''' + log = logger.new_logger(self) + cpu0 = (logger.process_clock(), logger.perf_counter()) + self.check_sanity() + self.dump_flags() + if nstates is None: + nstates = self.nstates + else: + self.nstates = nstates + + vind, hdiag = self.gen_vind(self._scf) + precond = self.get_precond(hdiag) + + def pickeig(w, v, nroots, envs): + idx = np.where(w > self.positive_eig_threshold)[0] + return w[idx], v[:,idx], idx + + x0sym = None + if x0 is None: + x0 = self.init_guess() + + self.converged, self.e, x1 = lr_eigh( + vind, x0, precond, tol_residual=self.conv_tol, lindep=self.lindep, + nroots=nstates, x0sym=x0sym, pick=pickeig, max_cycle=self.max_cycle, + max_memory=self.max_memory, verbose=log) + + nmo = self._scf.mo_occ[0].size + nocca, noccb = self._scf.nelec + nvira = nmo - nocca + nvirb = nmo - noccb + self.xy = [((xi[:nocca*nvira].reshape(nocca,nvira), # X_alpha + xi[nocca*nvira:].reshape(noccb,nvirb)), # X_beta + (0, 0)) # (Y_alpha, Y_beta) + for xi in x1] + + log.timer('TDA', *cpu0) + self._finalize() + return self.e, self.xy + +CIS = TDA + +class SpinFlipTDA(TDBase): + ''' + Attributes: + extype : int (0 or 1) + Spin flip up: extype=0. Spin flip down: extype=1. + collinear : str + collinear schemes, can be + 'col': collinear, by default + 'ncol': non-collinear + 'mcol': multi-collinear + collinear_samples : int + Integration samples for the multi-collinear treatment + ''' + + extype = getattr(__config__, 'tdscf_uhf_SFTDA_extype', 1) + collinear = getattr(__config__, 'tdscf_uhf_SFTDA_collinear', 'col') + collinear_samples = getattr(__config__, 'tdscf_uhf_SFTDA_collinear_samples', 200) + + _keys = {'extype', 'collinear', 'collinear_samples'} + + def gen_vind(self): + '''Generate function to compute A*x for spin-flip TDDFT case. + ''' + mf = self._scf + assert isinstance(mf, scf.hf.SCF) + if isinstance(mf.mo_coeff, (tuple, list)): + # The to_gpu() in pyscf is not able to convert SymAdaptedUHF.mo_coeff.
+ # In this case, mf.mo_coeff has the type (NPArrayWithTag, NPArrayWithTag). + # cp.asarray() for this object leads to an error in + # cupy._core.core._array_from_nested_sequence + mo_coeff = cp.asarray(mf.mo_coeff[0]), cp.asarray(mf.mo_coeff[1]) + else: + mo_coeff = cp.asarray(mf.mo_coeff) + assert mo_coeff[0].dtype == cp.float64 + mo_energy = cp.asarray(mf.mo_energy) + mo_occ = cp.asarray(mf.mo_occ) + nao, nmo = mo_coeff[0].shape + + extype = self.extype + if extype == 0: + occidxb = mo_occ[1] > 0 + viridxa = mo_occ[0] ==0 + orbob = mo_coeff[1][:,occidxb] + orbva = mo_coeff[0][:,viridxa] + orbov = (orbob, orbva) + e_ia = mo_energy[0][viridxa] - mo_energy[1][occidxb,None] + hdiag = e_ia.ravel().get() + + elif extype == 1: + occidxa = mo_occ[0] > 0 + viridxb = mo_occ[1] ==0 + orboa = mo_coeff[0][:,occidxa] + orbvb = mo_coeff[1][:,viridxb] + orbov = (orboa, orbvb) + e_ia = mo_energy[1][viridxb] - mo_energy[0][occidxa,None] + hdiag = e_ia.ravel().get() + + vresp = gen_uhf_response_sf( + mf, hermi=0, collinear=self.collinear, + collinear_samples=self.collinear_samples) + + def vind(zs): + zs = cp.asarray(zs).reshape(-1, *e_ia.shape) + orbo, orbv = orbov + mo1 = contract('xov,pv->xpo', zs, orbv) + dms = contract('xpo,qo->xpq', mo1, orbo.conj()) + dms = tag_array(dms, mo1=mo1, occ_coeff=orbo) + v1ao = vresp(dms) + v1mo = contract('xpq,qo->xpo', v1ao, orbo) + v1mo = contract('xpo,pv->xov', v1mo, orbv.conj()) + v1mo += zs * e_ia + return v1mo.reshape(len(v1mo), -1).get() + + return vind, hdiag + + def _init_guess(self, mf, nstates): + mo_energy_a, mo_energy_b = mf.mo_energy + mo_occ_a, mo_occ_b = mf.mo_occ + if isinstance(mo_energy_a, cp.ndarray): + mo_energy_a = mo_energy_a.get() + mo_energy_b = mo_energy_b.get() + if isinstance(mo_occ_a, cp.ndarray): + mo_occ_a = mo_occ_a.get() + mo_occ_b = mo_occ_b.get() + + if self.extype == 0: + occidxb = mo_occ_b > 0 + viridxa = mo_occ_a ==0 + e_ia = mo_energy_a[viridxa] - mo_energy_b[occidxb,None] + + elif self.extype == 1: + occidxa = mo_occ_a > 0 + viridxb = mo_occ_b ==0 + e_ia = mo_energy_b[viridxb] - mo_energy_a[occidxa,None] + + e_ia = e_ia.ravel() + nov = e_ia.size + nstates = min(nstates, nov) + e_threshold = np.partition(e_ia, nstates-1)[nstates-1] + idx = np.where(e_ia <= e_threshold)[0] + nstates = idx.size + e = e_ia[idx] + idx = idx[np.argsort(e)] + x0 = np.zeros((nstates, nov)) + for i, j in enumerate(idx): + x0[i, j] = 1 + return np.sort(e), x0.reshape(nstates, *e_ia.shape) + + def init_guess(self, mf=None, nstates=None, wfnsym=None): + if mf is None: mf = self._scf + if nstates is None: nstates = self.nstates + x0 = self._init_guess(mf, nstates)[1] + return x0.reshape(len(x0), -1) + + def dump_flags(self, verbose=None): + TDBase.dump_flags(self, verbose) + logger.info(self, 'extype = %s', self.extype) + logger.info(self, 'collinear = %s', self.collinear) + if self.collinear == 'mcol': + logger.info(self, 'collinear_samples = %s', self.collinear_samples) + return self + + def check_sanity(self): + TDBase.check_sanity(self) + assert self.extype in (0, 1) + assert self.collinear in ('col', 'ncol', 'mcol') + return self + + def kernel(self, x0=None, nstates=None): + '''Spin-flip TDA diagonalization solver + ''' + log = logger.new_logger(self) + cpu0 = log.init_timer() + self.check_sanity() + self.dump_flags() + if nstates is None: + nstates = self.nstates + else: + self.nstates = nstates + + if self.collinear == 'col' and isinstance(self._scf, KohnShamDFT): + mf = self._scf + ni = mf._numint + if not ni.libxc.is_hybrid_xc(mf.xc): + 
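# For a collinear ('col') kernel without HF exchange the spin-flip coupling block vanishes, so the SF-TDA excitation energies reduce to the bare orbital-energy differences produced by _init_guess: +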
self.converged = True + self.e, xs = self._init_guess(mf, nstates) + self.xy = [(x, 0) for x in xs] + return self.e, self.xy + + x0sym = None + if x0 is None: + x0 = self.init_guess() + + # Keep all eigenvalues as SF-TDDFT allows triplet to singlet + # "dexcitation" + def all_eigs(w, v, nroots, envs): + return w, v, np.arange(w.size) + + vind, hdiag = self.gen_vind() + precond = self.get_precond(hdiag) + + self.converged, self.e, x1 = lr_eigh( + vind, x0, precond, tol_residual=self.conv_tol, lindep=self.lindep, + nroots=nstates, x0sym=x0sym, pick=all_eigs, max_cycle=self.max_cycle, + max_memory=self.max_memory, verbose=log) + + nmo = self._scf.mo_occ[0].size + nocca, noccb = self._scf.nelec + nvira = nmo - nocca + nvirb = nmo - noccb + + if self.extype == 0: + self.xy = [(xi.reshape(noccb,nvira), 0) for xi in x1] + elif self.extype == 1: + self.xy = [(xi.reshape(nocca,nvirb), 0) for xi in x1] + log.timer('SpinFlipTDA', *cpu0) + self._finalize() + return self.e, self.xy + + +def gen_tdhf_operation(mf, fock_ao=None, singlet=True, wfnsym=None): + '''Generate function to compute + + [ A B ][X] + [-B* -A*][Y] + ''' + assert fock_ao is None + assert isinstance(mf, scf.hf.SCF) + if isinstance(mf.mo_coeff, (tuple, list)): + # The to_gpu() in pyscf is not able to convert SymAdaptedUHF.mo_coeff. + # In this case, mf.mo_coeff has the type (NPArrayWithTag, NPArrayWithTag). + # cp.asarray() for this object leads to an error in + # cupy._core.core._array_from_nested_sequence + mo_coeff = cp.asarray(mf.mo_coeff[0]), cp.asarray(mf.mo_coeff[1]) + else: + mo_coeff = cp.asarray(mf.mo_coeff) + assert mo_coeff[0].dtype == cp.float64 + mo_energy = cp.asarray(mf.mo_energy) + mo_occ = cp.asarray(mf.mo_occ) + occidxa = mo_occ[0] > 0 + occidxb = mo_occ[1] > 0 + viridxa = mo_occ[0] == 0 + viridxb = mo_occ[1] == 0 + orboa = mo_coeff[0][:,occidxa] + orbob = mo_coeff[1][:,occidxb] + orbva = mo_coeff[0][:,viridxa] + orbvb = mo_coeff[1][:,viridxb] + + e_ia_a = mo_energy[0][viridxa] - mo_energy[0][occidxa,None] + e_ia_b = mo_energy[1][viridxb] - mo_energy[1][occidxb,None] + e_ia = hdiag = cp.hstack((e_ia_a.ravel(), e_ia_b.ravel())) + hdiag = cp.hstack((hdiag, -hdiag)).get() + nocca, nvira = e_ia_a.shape + noccb, nvirb = e_ia_b.shape + + vresp = mf.gen_response(hermi=0) + + def vind(xys): + nz = len(xys) + xys = cp.asarray(xys).reshape(nz,2,-1) + xs, ys = xys.transpose(1,0,2) + xa = xs[:,:nocca*nvira].reshape(nz,nocca,nvira) + xb = xs[:,nocca*nvira:].reshape(nz,noccb,nvirb) + ya = ys[:,:nocca*nvira].reshape(nz,nocca,nvira) + yb = ys[:,nocca*nvira:].reshape(nz,noccb,nvirb) + tmp = contract('xov,pv->xpo', xa, orbva) + dmsa = contract('xpo,qo->xpq', tmp, orboa.conj()) + tmp = contract('xov,pv->xpo', xb, orbvb) + dmsb = contract('xpo,qo->xpq', tmp, orbob.conj()) + tmp = contract('xov,qv->xoq', ya, orbva.conj()) + dmsa+= contract('xoq,po->xpq', tmp, orboa) + tmp = contract('xov,qv->xoq', yb, orbvb.conj()) + dmsb+= contract('xoq,po->xpq', tmp, orbob) + v1ao = vresp(cp.asarray((dmsa,dmsb))) + v1a_top = contract('xpq,qo->xpo', v1ao[0], orboa) + v1a_top = contract('xpo,pv->xov', v1a_top, orbva.conj()) + v1b_top = contract('xpq,qo->xpo', v1ao[1], orbob) + v1b_top = contract('xpo,pv->xov', v1b_top, orbvb.conj()) + v1a_bot = contract('xpq,po->xoq', v1ao[0], orboa.conj()) + v1a_bot = contract('xoq,qv->xov', v1a_bot, orbva) + v1b_bot = contract('xpq,po->xoq', v1ao[1], orbob.conj()) + v1b_bot = contract('xoq,qv->xov', v1b_bot, orbvb) + + v1_top = xs * e_ia + v1_bot = ys * e_ia + v1_top[:,:nocca*nvira] += v1a_top.reshape(nz,-1) +
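# The remaining alpha/beta blocks are accumulated below; cp.hstack((v1_top, -v1_bot)) then applies the sign structure of the [[A, B], [-B*, -A*]] supermatrix solved by the non-Hermitian eigensolver. +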
v1_bot[:,:nocca*nvira] += v1a_bot.reshape(nz,-1) + v1_top[:,nocca*nvira:] += v1b_top.reshape(nz,-1) + v1_bot[:,nocca*nvira:] += v1b_bot.reshape(nz,-1) + hx = cp.hstack((v1_top, -v1_bot)) + return hx.get() + + return vind, hdiag + + +class TDHF(TDBase): + + singlet = None + + @lib.with_doc(gen_tdhf_operation.__doc__) + def gen_vind(self, mf=None): + if mf is None: + mf = self._scf + return gen_tdhf_operation(mf, singlet=self.singlet) + + def init_guess(self, mf=None, nstates=None, wfnsym=None, return_symmetry=False): + x0 = TDA.init_guess(self, mf, nstates, wfnsym, return_symmetry) + y0 = np.zeros_like(x0) + return np.hstack([x0, y0]) + + def kernel(self, x0=None, nstates=None): + '''TDHF diagonalization with non-Hermitian eigenvalue solver + ''' + log = logger.new_logger(self) + cpu0 = log.init_timer() + self.check_sanity() + self.dump_flags() + if nstates is None: + nstates = self.nstates + else: + self.nstates = nstates + + vind, hdiag = self.gen_vind(self._scf) + precond = self.get_precond(hdiag) + + # handle single kpt PBC SCF + if getattr(self._scf, 'kpt', None) is not None: + from pyscf.pbc.lib.kpts_helper import gamma_point + real_system = (gamma_point(self._scf.kpt) and + self._scf.mo_coeff[0].dtype == np.double) + else: + real_system = True + + # We only need positive eigenvalues + def pickeig(w, v, nroots, envs): + realidx = np.where((abs(w.imag) < REAL_EIG_THRESHOLD) & + (w.real > self.positive_eig_threshold))[0] + return lib.linalg_helper._eigs_cmplx2real(w, v, realidx, real_system) + + x0sym = None + if x0 is None: + x0 = self.init_guess() + + self.converged, w, x1 = lr_eig( + vind, x0, precond, tol_residual=self.conv_tol, lindep=self.lindep, + nroots=nstates, x0sym=x0sym, pick=pickeig, max_cycle=self.max_cycle, + max_memory=self.max_memory, verbose=log) + + nmo = self._scf.mo_occ[0].size + nocca, noccb = self._scf.nelec + nvira = nmo - nocca + nvirb = nmo - noccb + e = [] + xy = [] + for i, z in enumerate(x1): + x, y = z.reshape(2,-1) + norm = lib.norm(x)**2 - lib.norm(y)**2 + if norm > 0: + norm = norm**-.5 + e.append(w[i]) + xy.append(((x[:nocca*nvira].reshape(nocca,nvira) * norm, # X_alpha + x[nocca*nvira:].reshape(noccb,nvirb) * norm), # X_beta + (y[:nocca*nvira].reshape(nocca,nvira) * norm, # Y_alpha + y[nocca*nvira:].reshape(noccb,nvirb) * norm)))# Y_beta + self.e = np.array(e) + self.xy = xy + + log.timer('TDDFT', *cpu0) + self._finalize() + return self.e, self.xy + +TDUHF = TDHF + +class SpinFlipTDHF(TDBase): + + extype = SpinFlipTDA.extype + collinear = SpinFlipTDA.collinear + collinear_samples = SpinFlipTDA.collinear_samples + + _keys = {'extype', 'collinear', 'collinear_samples'} + + def gen_vind(self): + '''Generate function to compute A*x for spin-flip TDDFT case. + ''' + mf = self._scf + assert isinstance(mf, scf.hf.SCF) + if isinstance(mf.mo_coeff, (tuple, list)): + # The to_gpu() in pyscf is not able to convert SymAdaptedUHF.mo_coeff. + # In this case, mf.mo_coeff has the type (NPArrayWithTag, NPArrayWithTag). 
+ # cp.asarray() for this object leads to an error in + # cupy._core.core._array_from_nested_sequence + mo_coeff = cp.asarray(mf.mo_coeff[0]), cp.asarray(mf.mo_coeff[1]) + else: + mo_coeff = cp.asarray(mf.mo_coeff) + assert mo_coeff[0].dtype == cp.float64 + mo_energy = cp.asarray(mf.mo_energy) + mo_occ = cp.asarray(mf.mo_occ) + nao, nmo = mo_coeff[0].shape + + occidxa = mo_occ[0] > 0 + occidxb = mo_occ[1] > 0 + viridxa = mo_occ[0] ==0 + viridxb = mo_occ[1] ==0 + orboa = mo_coeff[0][:,occidxa] + orbob = mo_coeff[1][:,occidxb] + orbva = mo_coeff[0][:,viridxa] + orbvb = mo_coeff[1][:,viridxb] + e_ia_b2a = mo_energy[0][viridxa] - mo_energy[1][occidxb,None] + e_ia_a2b = mo_energy[1][viridxb] - mo_energy[0][occidxa,None] + nocca, nvirb = e_ia_a2b.shape + noccb, nvira = e_ia_b2a.shape + + extype = self.extype + if extype == 0: + hdiag = cp.hstack([e_ia_b2a.ravel(), -e_ia_a2b.ravel()]).get() + else: + hdiag = cp.hstack([e_ia_a2b.ravel(), -e_ia_b2a.ravel()]).get() + + vresp = gen_uhf_response_sf( + mf, hermi=0, collinear=self.collinear, + collinear_samples=self.collinear_samples) + + def vind(zs): + nz = len(zs) + zs = cp.asarray(zs).reshape(nz, -1) + if extype == 0: + zs_b2a = zs[:,:noccb*nvira].reshape(nz,noccb,nvira) + zs_a2b = zs[:,noccb*nvira:].reshape(nz,nocca,nvirb) + dm_b2a = contract('xov,pv->xpo', zs_b2a, orbva) + dm_b2a = contract('xpo,qo->xpq', dm_b2a, orbob.conj()) + dm_a2b = contract('xov,qv->xoq', zs_a2b, orbvb.conj()) + dm_a2b = contract('xoq,po->xpq', dm_a2b, orboa) + else: + zs_a2b = zs[:,:nocca*nvirb].reshape(nz,nocca,nvirb) + zs_b2a = zs[:,nocca*nvirb:].reshape(nz,noccb,nvira) + dm_b2a = contract('xov,pv->xpo', zs_b2a, orbva) + dm_b2a = contract('xpo,qo->xpq', dm_b2a, orbob.conj()) + dm_a2b = contract('xov,qv->xoq', zs_a2b, orbvb.conj()) + dm_a2b = contract('xoq,po->xpq', dm_a2b, orboa) + + ''' + # The slow way to compute individual terms in + # [A B] [X] + # [B* A*] [Y] + dms = cp.vstack([dm_b2a, dm_a2b]) + v1ao = vresp(dms) + v1ao_b2a, v1ao_a2b = v1ao[:nz], v1ao[nz:] + if extype == 0: + # A*X = (aI||Jb) * z_b2a = -(ab|IJ) * z_b2a + v1A_b2a = contract('xpq,qo->xpo', v1ao_b2a, orbob) + v1A_b2a = contract('xpo,pv->xov', v1A_b2a, orbva.conj()) + # (A*)*Y = (iA||Bj) * z_a2b = -(ij|BA) * z_a2b + v1A_a2b = contract('xpq,po->xoq', v1ao_a2b, orboa.conj()) + v1A_a2b = contract('xoq,qv->xov', v1A_a2b, orbvb) + # B*Y = (aI||Bj) * z_a2b = -(aj|BI) * z_a2b + v1B_b2a = contract('xpq,qo->xpo', v1ao_a2b, orbob) + v1B_b2a = contract('xpo,pv->xov', v1B_b2a, orbva.conj()) + # (B*)*X = (iA||Jb) * z_b2a = -(ib|JA) * z_b2a + v1B_a2b = contract('xpq,po->xoq', v1ao_b2a, orboa.conj()) + v1B_a2b = contract('xoq,qv->xov', v1B_a2b, orbvb) + # add the orbital energy difference in A matrix. 
+ v1_top = v1A_b2a + v1B_b2a + zs_b2a * e_ia_b2a + v1_bot = v1B_a2b + v1A_a2b + zs_a2b * e_ia_a2b + hx = cp.hstack([v1_top.reshape(nz,-1), -v1_bot.reshape(nz,-1)]) + else: + # A*X = (Ai||jB) * z_a2b = -(AB|ij) * z_a2b + v1A_a2b = contract('xpq,qo->xpo', v1ao_a2b, orboa) + v1A_a2b = contract('xpo,pv->xov', v1A_a2b, orbvb.conj()) + # (A*)*Y = (Ia||bJ) * z_b2a = -(IJ|ba) * z_b2a + v1A_b2a = contract('xpq,po->xoq', v1ao_b2a, orbob.conj()) + v1A_b2a = contract('xoq,qv->xov', v1A_b2a, orbva) + # B*Y = (Ai||bJ) * z_b2a = -(AJ|bi) * z_b2a + v1B_a2b = contract('xpq,qo->xpo', v1ao_b2a, orboa) + v1B_a2b = contract('xpo,pv->xov', v1B_a2b, orbvb.conj()) + # (B*)*X = (Ia||jB) * z_a2b = -(IB|ja) * z_a2b + v1B_b2a = contract('xpq,po->xoq', v1ao_a2b, orbob.conj()) + v1B_b2a = contract('xoq,qv->xov', v1B_b2a, orbva) + # add the orbital energy difference in A matrix. + v1_top = v1A_a2b + v1B_a2b + zs_a2b * e_ia_a2b + v1_bot = v1B_b2a + v1A_b2a + zs_b2a * e_ia_b2a + hx = cp.hstack([v1_top.reshape(nz,-1), -v1_bot.reshape(nz,-1)]) + ''' + + # [A B] [X] + # [B* A*] [Y] + # is simplified to + dms = dm_b2a + dm_a2b + v1ao = vresp(dms) + if extype == 0: + # v1_top = A*X+B*Y + # A*X = (aI||Jb) * z_b2a = -(ab|JI) * z_b2a + # B*Y = (aI||Bj) * z_a2b = -(aj|BI) * z_a2b + v1_top = contract('xpq,qo->xpo', v1ao, orbob) + v1_top = contract('xpo,pv->xov', v1_top, orbva.conj()) + # (A*)*Y = (iA||Bj) * z_a2b = -(ij|BA) * z_a2b + # (B*)*X = (iA||Jb) * z_b2a = -(ib|JA) * z_b2a + # v1_bot = (B*)*X + (A*)*Y + v1_bot = contract('xpq,po->xoq', v1ao, orboa.conj()) + v1_bot = contract('xoq,qv->xov', v1_bot, orbvb) + # add the orbital energy difference in A matrix. + v1_top += zs_b2a * e_ia_b2a + v1_bot += zs_a2b * e_ia_a2b + else: + # v1_top = A*X+B*Y + # A*X = (Ai||jB) * z_a2b = -(AB|ji) * z_a2b + # B*Y = (Ai||bJ) * z_b2a = -(AJ|bi) * z_b2a + v1_top = contract('xpq,qo->xpo', v1ao, orboa) + v1_top = contract('xpo,pv->xov', v1_top, orbvb.conj()) + # v1_bot = (B*)*X + (A*)*Y + # (A*)*Y = (Ia||bJ) * z_b2a = -(IJ|ba) * z_b2a + # (B*)*X = (Ia||jB) * z_a2b = -(IB|ja) * z_a2b + v1_bot = contract('xpq,po->xoq', v1ao, orbob.conj()) + v1_bot = contract('xoq,qv->xov', v1_bot, orbva) + # add the orbital energy difference in A matrix. 
+ v1_top += zs_a2b * e_ia_a2b + v1_bot += zs_b2a * e_ia_b2a + hx = cp.hstack([v1_top.reshape(nz,-1), -v1_bot.reshape(nz,-1)]) + return hx.get() + + return vind, hdiag + + _init_guess = SpinFlipTDA._init_guess + + def init_guess(self, mf=None, nstates=None, wfnsym=None): + if mf is None: mf = self._scf + if nstates is None: nstates = self.nstates + x0 = self._init_guess(mf, nstates)[1] + nx = len(x0) + nmo = mf.mo_occ[0].size + nocca, noccb = mf.nelec + nvira = nmo - nocca + nvirb = nmo - noccb + if self.extype == 0: + y0 = np.zeros((nx, nocca*nvirb)) + else: + y0 = np.zeros((nx, noccb*nvira)) + return np.hstack([x0.reshape(nx,-1), y0]) + + dump_flags = SpinFlipTDA.dump_flags + check_sanity = SpinFlipTDA.check_sanity + + def kernel(self, x0=None, nstates=None): + '''Spin-flip TDDFT diagonalization solver + ''' + # TODO: Enable this feature after updating the TDDFT davidson algorithm + # in pyscf main branch + raise RuntimeError('Numerical issues in lr_eig') + log = logger.new_logger(self) + cpu0 = log.init_timer() + self.check_sanity() + self.dump_flags() + if nstates is None: + nstates = self.nstates + else: + self.nstates = nstates + + if self.collinear == 'col' and isinstance(self._scf, KohnShamDFT): + raise NotImplementedError + + x0sym = None + if x0 is None: + x0 = self.init_guess() + + real_system = self._scf.mo_coeff[0].dtype == np.float64 + def pickeig(w, v, nroots, envs): + realidx = np.where((abs(w.imag) < REAL_EIG_THRESHOLD) & + (w.real > self.positive_eig_threshold))[0] + return lib.linalg_helper._eigs_cmplx2real(w, v, realidx, real_system) + + vind, hdiag = self.gen_vind() + precond = self.get_precond(hdiag) + + self.converged, self.e, x1 = lr_eig( + vind, x0, precond, tol_residual=self.conv_tol, lindep=self.lindep, + nroots=nstates, x0sym=x0sym, pick=pickeig, max_cycle=self.max_cycle, + max_memory=self.max_memory, verbose=log) + + nmo = self._scf.mo_occ[0].size + nocca, noccb = self._scf.nelec + nvira = nmo - nocca + nvirb = nmo - noccb + + if self.extype == 0: + def norm_xy(z): + x = z[:noccb*nvira].reshape(noccb,nvira) + y = z[noccb*nvira:].reshape(nocca,nvirb) + norm = lib.norm(x)**2 - lib.norm(y)**2 + #assert norm > 0 + norm = abs(norm) ** -.5 + return x*norm, y*norm + elif self.extype == 1: + def norm_xy(z): + x = z[:nocca*nvirb].reshape(nocca,nvirb) + y = z[nocca*nvirb:].reshape(noccb,nvira) + norm = lib.norm(x)**2 - lib.norm(y)**2 + #assert norm > 0 + norm = abs(norm) ** -.5 + return x*norm, y*norm + + self.xy = [norm_xy(z) for z in x1] + log.timer('SpinFlipTDDFT', *cpu0) + self._finalize() + return self.e, self.xy + +scf.uhf.UHF.TDA = lib.class_as_method(TDA) +scf.uhf.UHF.TDHF = lib.class_as_method(TDHF) +scf.uhf.UHF.SFTDA = lib.class_as_method(SpinFlipTDA) +scf.uhf.UHF.SFTDHF = lib.class_as_method(SpinFlipTDHF) diff --git a/gpu4pyscf/tdscf/uks.py b/gpu4pyscf/tdscf/uks.py new file mode 100644 index 00000000..23646332 --- /dev/null +++ b/gpu4pyscf/tdscf/uks.py @@ -0,0 +1,196 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details.
+# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import numpy as np +import cupy as cp +from pyscf import symm +from pyscf import lib +from pyscf.tdscf._lr_eig import eigh as lr_eigh +from gpu4pyscf.dft.rks import KohnShamDFT +from gpu4pyscf.lib.cupy_helper import contract, tag_array, transpose_sum +from gpu4pyscf.lib import logger +from gpu4pyscf.tdscf import uhf as tdhf_gpu +from gpu4pyscf import dft + +__all__ = [ + 'TDA', 'TDDFT', 'TDUKS', 'CasidaTDDFT', 'TDDFTNoHybrid', +] + +TDA = tdhf_gpu.TDA +TDDFT = tdhf_gpu.TDHF +TDUKS = TDDFT +SpinFlipTDA = tdhf_gpu.SpinFlipTDA +SpinFlipTDDFT = tdhf_gpu.SpinFlipTDHF + +class CasidaTDDFT(TDDFT): + '''Solve the Casida TDDFT formula (A-B)(A+B)(X+Y) = (X+Y)w^2 + ''' + + init_guess = TDA.init_guess + + def gen_vind(self, mf=None): + if mf is None: + mf = self._scf + if isinstance(mf.mo_coeff, (tuple, list)): + # The to_gpu() in pyscf is not able to convert SymAdaptedUHF.mo_coeff. + # In this case, mf.mo_coeff has the type (NPArrayWithTag, NPArrayWithTag). + # cp.asarray() for this object leads to an error in + # cupy._core.core._array_from_nested_sequence + mo_coeff = cp.asarray(mf.mo_coeff[0]), cp.asarray(mf.mo_coeff[1]) + else: + mo_coeff = cp.asarray(mf.mo_coeff) + assert mo_coeff[0].dtype == cp.float64 + mo_energy = cp.asarray(mf.mo_energy) + mo_occ = cp.asarray(mf.mo_occ) + occidxa = mo_occ[0] > 0 + occidxb = mo_occ[1] > 0 + viridxa = mo_occ[0] == 0 + viridxb = mo_occ[1] == 0 + orboa = mo_coeff[0][:,occidxa] + orbob = mo_coeff[1][:,occidxb] + orbva = mo_coeff[0][:,viridxa] + orbvb = mo_coeff[1][:,viridxb] + + e_ia_a = mo_energy[0][viridxa] - mo_energy[0][occidxa,None] + e_ia_b = mo_energy[1][viridxb] - mo_energy[1][occidxb,None] + e_ia = cp.hstack((e_ia_a.ravel(), e_ia_b.ravel())) + d_ia = e_ia**.5 + ed_ia = e_ia * d_ia + hdiag = e_ia ** 2 + hdiag = hdiag.get() + vresp = mf.gen_response(mo_coeff, mo_occ, hermi=1) + nocca, nvira = e_ia_a.shape + noccb, nvirb = e_ia_b.shape + + def vind(zs): + assert zs.dtype == np.float64 + nz = len(zs) + zs = cp.asarray(zs).reshape(nz,-1) + dmsa = (zs[:,:nocca*nvira] * d_ia[:nocca*nvira]).reshape(nz,nocca,nvira) + dmsb = (zs[:,nocca*nvira:] * d_ia[nocca*nvira:]).reshape(nz,noccb,nvirb) + mo1a = contract('xov,pv->xpo', dmsa, orbva) + dmsa = contract('xpo,qo->xpq', mo1a, orboa) + mo1b = contract('xov,pv->xpo', dmsb, orbvb) + dmsb = contract('xpo,qo->xpq', mo1b, orbob) + dmsa = transpose_sum(dmsa) + dmsb = transpose_sum(dmsb) + dms = cp.asarray((dmsa, dmsb)) + dms = tag_array(dms, mo1=[mo1a,mo1b], occ_coeff=[orboa,orbob]) + v1ao = vresp(dms) + v1a = contract('xpq,qo->xpo', v1ao[0], orboa) + v1a = contract('xpo,pv->xov', v1a, orbva) + v1b = contract('xpq,qo->xpo', v1ao[1], orbob) + v1b = contract('xpo,pv->xov', v1b, orbvb) + hx = cp.hstack((v1a.reshape(nz,-1), v1b.reshape(nz,-1))) + hx += ed_ia * zs + hx *= d_ia + return hx.get() + + return vind, hdiag + + def kernel(self, x0=None, nstates=None): + '''TDDFT diagonalization solver + ''' + log = logger.new_logger(self) + cpu0 = log.init_timer() + mf = self._scf + if mf._numint.libxc.is_hybrid_xc(mf.xc): + raise RuntimeError('%s cannot be used with hybrid functional' + % self.__class__) + self.check_sanity() + self.dump_flags() + if nstates is None: + nstates = self.nstates + else: + self.nstates = nstates + + vind, hdiag = self.gen_vind(self._scf) + precond = self.get_precond(hdiag) + + def pickeig(w, v, nroots, envs): + idx = np.where(w > self.positive_eig_threshold)[0] + return w[idx], v[:,idx], idx 
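+ # lr_eigh below returns eigenvalues w2 = w**2 of the symmetrized Casida problem (A-B)**0.5 (A+B) (A-B)**0.5 Z = w**2 Z; X and Y are recovered from the eigenvectors via the e_ia**0.5 scaling in the loop that follows.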
+ + x0sym = None + if x0 is None: + x0 = self.init_guess() + + self.converged, w2, x1 = lr_eigh( + vind, x0, precond, tol_residual=self.conv_tol, lindep=self.lindep, + nroots=nstates, x0sym=x0sym, pick=pickeig, max_cycle=self.max_cycle, + max_memory=self.max_memory, verbose=log) + + mo_energy = self._scf.mo_energy + mo_occ = self._scf.mo_occ + occidxa = mo_occ[0] > 0 + occidxb = mo_occ[1] > 0 + viridxa = mo_occ[0] == 0 + viridxb = mo_occ[1] == 0 + e_ia_a = mo_energy[0][viridxa] - mo_energy[0][occidxa,None] + e_ia_b = mo_energy[1][viridxb] - mo_energy[1][occidxb,None] + nocca, nvira = e_ia_a.shape + noccb, nvirb = e_ia_b.shape + if isinstance(mo_energy, cp.ndarray): + e_ia = cp.hstack((e_ia_a.reshape(-1), e_ia_b.reshape(-1))) + e_ia = e_ia**.5 + e_ia = e_ia.get() + else: + e_ia = np.hstack((e_ia_a.reshape(-1), e_ia_b.reshape(-1))) + e_ia = e_ia**.5 + + e = [] + xy = [] + for i, z in enumerate(x1): + if w2[i] < self.positive_eig_threshold: + continue + w = w2[i] ** .5 + zp = e_ia * z + zm = w/e_ia * z + x = (zp + zm) * .5 + y = (zp - zm) * .5 + norm = lib.norm(x)**2 - lib.norm(y)**2 + if norm > 0: + norm = norm**-.5 + e.append(w) + xy.append(((x[:nocca*nvira].reshape(nocca,nvira) * norm, # X_alpha + x[nocca*nvira:].reshape(noccb,nvirb) * norm), # X_beta + (y[:nocca*nvira].reshape(nocca,nvira) * norm, # Y_alpha + y[nocca*nvira:].reshape(noccb,nvirb) * norm)))# Y_beta + self.e = np.array(e) + self.xy = xy + + log.timer('TDDFT', *cpu0) + self._finalize() + return self.e, self.xy + +TDDFTNoHybrid = CasidaTDDFT + +def tddft(mf): + '''Driver to create TDDFT or CasidaTDDFT object''' + if mf._numint.libxc.is_hybrid_xc(mf.xc): + return TDDFT(mf) + else: + return CasidaTDDFT(mf) + +dft.uks.UKS.TDA = lib.class_as_method(TDA) +dft.uks.UKS.TDHF = None +#dft.uks.UKS.TDDFT = lib.class_as_method(TDDFT) +dft.uks.UKS.TDDFTNoHybrid = lib.class_as_method(TDDFTNoHybrid) +dft.uks.UKS.CasidaTDDFT = lib.class_as_method(CasidaTDDFT) +dft.uks.UKS.TDDFT = tddft +dft.uks.UKS.SFTDA = lib.class_as_method(SpinFlipTDA) +dft.uks.UKS.SFTDDFT = lib.class_as_method(SpinFlipTDDFT) diff --git a/gpu4pyscf/tests/test_dft.py b/gpu4pyscf/tests/test_dft.py index dc3156cf..4546da4e 100644 --- a/gpu4pyscf/tests/test_dft.py +++ b/gpu4pyscf/tests/test_dft.py @@ -13,12 +13,16 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . 
+import unittest import numpy as np import pyscf import pytest import cupy +from gpu4pyscf.dft import rks, uks -atom = ''' +def setUpModule(): + global mol + atom = ''' C -0.07551087 1.68127663 -0.10745193 O 1.33621755 1.87147409 -0.39326987 C 1.67074668 2.95729545 0.49387976 @@ -41,112 +45,116 @@ H -3.93210821 0.28874990 -1.89865997 ''' -mol = pyscf.M(atom=atom, basis='def2-tzvpp', max_memory=32000, cart=0) -mol.output = '/dev/null' -mol.build() -mol.verbose = 1 - -@pytest.mark.smoke -def test_b3lyp_with_d3bj(): - print('-------- DFRKS with D3(BJ) -------') - from gpu4pyscf.dft import rks - mf = rks.RKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit') - mf.grids.atom_grid = (99,590) - mf.conv_tol = 1e-10 - mf.conv_tol_cpscf = 1e-8 - mf.disp = 'd3bj' - e_dft = mf.kernel() - assert np.abs(e_dft - -685.0326965348272) < 1e-7 - - g = mf.nuc_grad_method().kernel() - assert np.abs(cupy.linalg.norm(g) - 0.17498362161082373) < 1e-5 - - h = mf.Hessian().kernel() - assert np.abs(cupy.linalg.norm(h) - 3.7684319231335377) < 1e-4 - -@pytest.mark.smoke -def test_b3lyp_d3bj(): - print('-------- DFRKS with D3(BJ) -------') - from gpu4pyscf.dft import rks - mf = rks.RKS(mol, xc='b3lyp-d3bj').density_fit(auxbasis='def2-tzvpp-jkfit') - mf.grids.atom_grid = (99,590) - mf.conv_tol = 1e-10 - mf.conv_tol_cpscf = 1e-8 - e_dft = mf.kernel() - assert np.abs(e_dft - -685.0326965348272) < 1e-7 - - g = mf.nuc_grad_method().kernel() - assert np.abs(cupy.linalg.norm(g) - 0.17498362161082373) < 1e-5 - - h = mf.Hessian().kernel() - assert np.abs(cupy.linalg.norm(h) - 3.7684319231335377) < 1e-4 - -@pytest.mark.smoke -def test_DFUKS(): - print('------- DFUKS with D3(BJ) -------') - from gpu4pyscf.dft import uks - mf = uks.UKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit') - mf.grids.atom_grid = (99,590) - mf.conv_tol = 1e-10 - mf.conv_tol_cpscf = 1e-8 - mf.disp = 'd3bj' - e_dft = mf.kernel() - assert np.abs(e_dft - -685.0326965349493) < 1e-7 - - g = mf.nuc_grad_method().kernel() - assert np.abs(cupy.linalg.norm(g) - 0.17498264516108836) < 1e-5 - - h = mf.Hessian().kernel() - assert np.abs(cupy.linalg.norm(h) - 3.768429871470736) < 1e-4 - -@pytest.mark.smoke -def test_RKS(): - print('-------- RKS with D3(BJ) -------') - from gpu4pyscf.dft import rks - mf = rks.RKS(mol, xc='b3lyp') - mf.grids.atom_grid = (99,590) - mf.conv_tol = 1e-12 - mf.disp = 'd3bj' - e_dft = mf.kernel() - assert np.abs(e_dft - -685.0325611822375) < 1e-7 - - g = mf.nuc_grad_method().kernel() - assert np.abs(cupy.linalg.norm(g) - 0.1750368231223345) < 1e-6 - -@pytest.mark.smoke -def test_DFRKS_with_SMD(): - print('----- DFRKS with SMD -----') - from gpu4pyscf.dft import rks - mf = rks.RKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit') - mf = mf.SMD() - mf.grids.atom_grid = (99,590) - mf.conv_tol = 1e-10 - mf.conv_tol_cpscf = 1e-8 - mf.disp = 'd3bj' - e_dft = mf.kernel() - assert np.abs(e_dft - -685.0578838805443) < 1e-7 - - g = mf.nuc_grad_method().kernel() - assert np.abs(cupy.linalg.norm(g) - 0.16804945458657145) < 1e-5 - - h = mf.Hessian().kernel() - assert np.abs(cupy.linalg.norm(h) - 3.741783814494321) < 1e-4 - -@pytest.mark.smoke -def test_DFUKS_with_SMD(): - print('------- DFUKS with SMD ---------') - from gpu4pyscf.dft import uks - mf = uks.UKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit') - mf = mf.SMD() - mf.grids.atom_grid = (99,590) - mf.conv_tol = 1e-10 - mf.conv_tol_cpscf = 1e-8 - mf.disp = 'd3bj' - e_dft = mf.kernel() - assert np.abs(e_dft - -685.05788388063) < 1e-7 - - g = 
mf.nuc_grad_method().kernel() - assert np.abs(cupy.linalg.norm(g) - 0.1680496465773684) < 1e-5 - - h = mf.Hessian().kernel() - assert np.abs(cupy.linalg.norm(h) - 3.7417788481647563) < 1e-4 + mol = pyscf.M(atom=atom, basis='def2-tzvpp', max_memory=32000, cart=0) + mol.output = '/dev/null' + mol.build() + mol.verbose = 1 + +def tearDownModule(): + global mol + mol.stdout.close() + del mol + +class KnownValues(unittest.TestCase): + @pytest.mark.smoke + def test_b3lyp_with_d3bj(self): + print('-------- DFRKS with D3(BJ) -------') + mf = rks.RKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit') + mf.grids.atom_grid = (99,590) + mf.conv_tol = 1e-10 + mf.conv_tol_cpscf = 1e-8 + mf.disp = 'd3bj' + e_dft = mf.kernel() + assert np.abs(e_dft - -685.0326965348272) < 1e-7 + + g = mf.nuc_grad_method().kernel() + assert np.abs(cupy.linalg.norm(g) - 0.17498362161082373) < 1e-5 + + h = mf.Hessian().kernel() + assert np.abs(cupy.linalg.norm(h) - 3.7684319231335377) < 1e-4 + + @pytest.mark.smoke + def test_b3lyp_d3bj(self): + print('-------- DFRKS with D3(BJ) -------') + mf = rks.RKS(mol, xc='b3lyp-d3bj').density_fit(auxbasis='def2-tzvpp-jkfit') + mf.grids.atom_grid = (99,590) + mf.conv_tol = 1e-10 + mf.conv_tol_cpscf = 1e-8 + e_dft = mf.kernel() + assert np.abs(e_dft - -685.0326965348272) < 1e-7 + + g = mf.nuc_grad_method().kernel() + assert np.abs(cupy.linalg.norm(g) - 0.17498362161082373) < 1e-5 + + h = mf.Hessian().kernel() + assert np.abs(cupy.linalg.norm(h) - 3.7684319231335377) < 1e-4 + + @pytest.mark.smoke + def test_DFUKS(self): + print('------- DFUKS with D3(BJ) -------') + mf = uks.UKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit') + mf.grids.atom_grid = (99,590) + mf.conv_tol = 1e-10 + mf.conv_tol_cpscf = 1e-8 + mf.disp = 'd3bj' + e_dft = mf.kernel() + assert np.abs(e_dft - -685.0326965349493) < 1e-7 + + g = mf.nuc_grad_method().kernel() + assert np.abs(cupy.linalg.norm(g) - 0.17498264516108836) < 1e-5 + + h = mf.Hessian().kernel() + assert np.abs(cupy.linalg.norm(h) - 3.768429871470736) < 1e-4 + + @pytest.mark.smoke + def test_RKS(self): + print('-------- RKS with D3(BJ) -------') + mf = rks.RKS(mol, xc='b3lyp') + mf.grids.atom_grid = (99,590) + mf.conv_tol = 1e-12 + mf.disp = 'd3bj' + e_dft = mf.kernel() + assert np.abs(e_dft - -685.0325611822375) < 1e-7 + + g = mf.nuc_grad_method().kernel() + assert np.abs(cupy.linalg.norm(g) - 0.1750368231223345) < 1e-6 + + @pytest.mark.smoke + def test_DFRKS_with_SMD(self): + print('----- DFRKS with SMD -----') + mf = rks.RKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit') + mf = mf.SMD() + mf.grids.atom_grid = (99,590) + mf.conv_tol = 1e-10 + mf.conv_tol_cpscf = 1e-8 + mf.disp = 'd3bj' + e_dft = mf.kernel() + assert np.abs(e_dft - -685.0578838805443) < 1e-7 + + g = mf.nuc_grad_method().kernel() + assert np.abs(cupy.linalg.norm(g) - 0.16905807654571403) < 1e-5 + + h = mf.Hessian().kernel() + assert np.abs(cupy.linalg.norm(h) - 3.743840896534178) < 1e-4 + + @pytest.mark.smoke + def test_DFUKS_with_SMD(self): + print('------- DFUKS with SMD ---------') + mf = uks.UKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit') + mf = mf.SMD() + mf.grids.atom_grid = (99,590) + mf.conv_tol = 1e-10 + mf.conv_tol_cpscf = 1e-8 + mf.disp = 'd3bj' + e_dft = mf.kernel() + assert np.abs(e_dft - -685.05788388063) < 1e-7 + + g = mf.nuc_grad_method().kernel() + assert np.abs(cupy.linalg.norm(g) - 0.1690582751813457) < 1e-5 + + h = mf.Hessian().kernel() + assert np.abs(cupy.linalg.norm(h) - 3.743858482519822) < 1e-4 + +if __name__ == "__main__": 
+ print("Full Smoke Tests") + unittest.main()
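
A closing note on the spin-flip entry points added in gpu4pyscf/tdscf/uhf.py and registered in gpu4pyscf/tdscf/uks.py: SpinFlipTDHF.kernel is deliberately disabled for now (it raises RuntimeError pending the lr_eig update noted in its TODO), so SpinFlipTDA (SFTDA) is the usable solver in this patch. A minimal, untested sketch of the intended API; the molecule, functional, and attribute settings are illustrative:

    import pyscf
    mol = pyscf.M(atom='O 0 0 0; O 0 0 1.21', basis='631g', spin=2)
    mf = mol.UKS(xc='b3lyp').to_gpu().run()
    td = mf.SFTDA()           # SpinFlipTDA, registered on dft.uks.UKS above
    td.extype = 0             # 0: spin-flip up, 1: spin-flip down (default)
    td.collinear = 'mcol'     # 'col' with a pure functional short-circuits to
                              # bare orbital-energy differences; 'mcol' samples
                              # the multi-collinear kernel (collinear_samples=200)
    es = td.kernel(nstates=4)[0]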