diff --git a/gpu4pyscf/gto/moleintor.py b/gpu4pyscf/gto/moleintor.py
index 0a880c0c..fb060d9a 100644
--- a/gpu4pyscf/gto/moleintor.py
+++ b/gpu4pyscf/gto/moleintor.py
@@ -24,6 +24,7 @@
 from gpu4pyscf.scf.int4c2e import BasisProdCache
 from gpu4pyscf.df.int3c2e import sort_mol, _split_l_ctr_groups, get_pairing
 from gpu4pyscf.gto.mole import basis_seg_contraction
+from gpu4pyscf.__config__ import _num_devices, _streams
 
 BLKSIZE = 128
 
@@ -31,32 +32,18 @@
 libgint = load_library('libgint')
 
 class VHFOpt(_vhf.VHFOpt):
-    def __init__(self, mol, intor, prescreen='CVHFnoscreen',
+    def __init__(self, mol, intor='int2e', prescreen='CVHFnoscreen',
                  qcondname='CVHFsetnr_direct_scf', dmcondname=None):
-        # use local basis_seg_contraction for efficiency
-        # TODO: switch _mol and mol
-        self.mol = basis_seg_contraction(mol,allow_replica=True)
-        self._mol = mol
-
-        '''
-        # Note mol._bas will be sorted in .build() method. VHFOpt should be
-        # initialized after mol._bas updated.
-        '''
-        self.nao = self.mol.nao
+        self.mol = mol
+        self._sorted_mol = None
 
         self._intor = intor
         self._prescreen = prescreen
         self._qcondname = qcondname
         self._dmcondname = dmcondname
 
-        self.bpcache = None
-
-        self.cart_ao_idx = None
-        self.sph_ao_idx = None
-
         self.cart_ao_loc = []
         self.sph_ao_loc = []
-        self.cart2sph = None
 
         self.angular = None
 
@@ -69,8 +56,8 @@ def __init__(self, mol, intor, prescreen='CVHFnoscreen',
 
     def clear(self):
         _vhf.VHFOpt.__del__(self)
-        if self.bpcache is not None:
-            libgvhf.GINTdel_basis_prod(ctypes.byref(self.bpcache))
+        for n, bpcache in self._bpcache.items():
+            libgvhf.GINTdel_basis_prod(ctypes.byref(bpcache))
         return self
 
     def __del__(self):
@@ -79,20 +66,21 @@ def __del__(self):
         except AttributeError:
             pass
 
-    def build(self, cutoff=1e-14, group_size=None, diag_block_with_triu=False, aosym=False):
-        _mol = self._mol
-        mol = self.mol
+    def build(self, cutoff=1e-13, group_size=BLKSIZE, diag_block_with_triu=False, aosym=True):
+        original_mol = self.mol
+        mol = basis_seg_contraction(original_mol, allow_replica=True)
 
-        log = logger.new_logger(_mol, _mol.verbose)
+        log = logger.new_logger(original_mol, original_mol.verbose)
         cput0 = log.init_timer()
-        sorted_mol, sorted_idx, uniq_l_ctr, l_ctr_counts = sort_mol(mol, log=log)
-        self.sorted_mol = sorted_mol
+        _sorted_mol, sorted_idx, uniq_l_ctr, l_ctr_counts = sort_mol(mol, log=log)
+        self._sorted_mol = _sorted_mol
+
         if group_size is not None :
             uniq_l_ctr, l_ctr_counts = _split_l_ctr_groups(uniq_l_ctr, l_ctr_counts, group_size)
-        self.nctr = len(uniq_l_ctr)
+        self.l_ctr_counts = l_ctr_counts
 
         # Initialize vhfopt after reordering mol._bas
-        _vhf.VHFOpt.__init__(self, sorted_mol, self._intor, self._prescreen,
+        _vhf.VHFOpt.__init__(self, _sorted_mol, self._intor, self._prescreen,
                              self._qcondname, self._dmcondname)
         self.direct_scf_tol = cutoff
 
@@ -104,39 +92,22 @@ def build(self, cutoff=1e-14, group_size=None, diag_block_with_triu=False, aosym
             l_ctr_offsets, l_ctr_offsets, q_cond,
             diag_block_with_triu=diag_block_with_triu, aosym=aosym)
         self.log_qs = log_qs.copy()
-        cput1 = log.timer_debug1('Get pairing', *cput1)
+        cput1 = log.timer_debug1('Get AO pairing', *cput1)
 
         # contraction coefficient for ao basis
-        cart_ao_loc = sorted_mol.ao_loc_nr(cart=True)
-        sph_ao_loc = sorted_mol.ao_loc_nr(cart=False)
+        cart_ao_loc = _sorted_mol.ao_loc_nr(cart=True)
+        sph_ao_loc = _sorted_mol.ao_loc_nr(cart=False)
         self.cart_ao_loc = [cart_ao_loc[cp] for cp in l_ctr_offsets]
         self.sph_ao_loc = [sph_ao_loc[cp] for cp in l_ctr_offsets]
         self.angular = [l[0] for l in uniq_l_ctr]
 
-        cart_ao_loc = mol.ao_loc_nr(cart=True)
-        sph_ao_loc = mol.ao_loc_nr(cart=False)
-        nao = sph_ao_loc[-1]
-        ao_idx = np.array_split(np.arange(nao), sph_ao_loc[1:-1])
-        self.sph_ao_idx = np.hstack([ao_idx[i] for i in sorted_idx])
-
-        # cartesian ao index
-        nao = cart_ao_loc[-1]
-        ao_idx = np.array_split(np.arange(nao), cart_ao_loc[1:-1])
-        self.cart_ao_idx = np.hstack([ao_idx[i] for i in sorted_idx])
-        self.cart2sph = block_c2s_diag(self.angular, l_ctr_counts)
-        cput1 = log.timer_debug1('AO cart2sph coeff', *cput1)
-
-        if _mol.cart:
-            ncart = cart_ao_loc[-1]
-            inv_idx = np.argsort(self.cart_ao_idx, kind='stable').astype(np.int32)
-            self.coeff = cp.eye(ncart)[:,inv_idx]
-        else:
-            inv_idx = np.argsort(self.sph_ao_idx, kind='stable').astype(np.int32)
-            self.coeff = self.cart2sph[:, inv_idx]
-        cput1 = log.timer_debug1('AO cart2sph coeff', *cput1)
+        # Sorted AO indices
+        ao_loc = mol.ao_loc_nr(cart=original_mol.cart)
+        ao_idx = np.array_split(np.arange(original_mol.nao), ao_loc[1:-1])
+        self._ao_idx = np.hstack([ao_idx[i] for i in sorted_idx])
+        cput1 = log.timer_debug1('AO indices', *cput1)
 
-        ao_loc = sorted_mol.ao_loc_nr(cart=True)
-        cput1 = log.timer_debug1('Get AO pairs', *cput1)
+        ao_loc = cart_ao_loc
 
         self.pair2bra = pair2bra
         self.pair2ket = pair2ket
@@ -156,16 +127,20 @@ def get_n_hermite_density_of_angular_pair(l):
         n_density_per_angular_pair = (bas_pairs_locs[1:] - bas_pairs_locs[:-1]) * n_density_per_pair
         self.density_offset = np.append(0, np.cumsum(n_density_per_angular_pair)).astype(np.int32)
 
-        self.bpcache = ctypes.POINTER(BasisProdCache)()
-        scale_shellpair_diag = 1.0
-        libgint.GINTinit_basis_prod(
-            ctypes.byref(self.bpcache), ctypes.c_double(scale_shellpair_diag),
-            ao_loc.ctypes.data_as(ctypes.c_void_p),
-            bas_pair2shls.ctypes.data_as(ctypes.c_void_p),
-            bas_pairs_locs.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(ncptype),
-            sorted_mol._atm.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(sorted_mol.natm),
-            sorted_mol._bas.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(sorted_mol.nbas),
-            sorted_mol._env.ctypes.data_as(ctypes.c_void_p))
+        self._bpcache = {}
+        for n in range(_num_devices):
+            with cp.cuda.Device(n), _streams[n]:
+                bpcache = ctypes.POINTER(BasisProdCache)()
+                scale_shellpair_diag = 1.0
+                libgint.GINTinit_basis_prod(
+                    ctypes.byref(bpcache), ctypes.c_double(scale_shellpair_diag),
+                    ao_loc.ctypes.data_as(ctypes.c_void_p),
+                    bas_pair2shls.ctypes.data_as(ctypes.c_void_p),
+                    bas_pairs_locs.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(ncptype),
+                    _sorted_mol._atm.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(_sorted_mol.natm),
+                    _sorted_mol._bas.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(_sorted_mol.nbas),
+                    _sorted_mol._env.ctypes.data_as(ctypes.c_void_p))
+                self._bpcache[n] = bpcache
 
         cput1 = log.timer_debug1('Initialize GPU cache', *cput1)
         ncptype = len(self.log_qs)
@@ -176,22 +151,44 @@ def get_n_hermite_density_of_angular_pair(l):
             nl = int(round(np.sqrt(ncptype)))
             self.cp_idx, self.cp_jdx = np.unravel_index(np.arange(ncptype), (nl, nl))
 
-        if _mol.cart:
+        if original_mol.cart:
             self.ao_loc = self.cart_ao_loc
-            self.ao_idx = self.cart_ao_idx
         else:
             self.ao_loc = self.sph_ao_loc
-            self.ao_idx = self.sph_ao_idx
+
+    def sort_orbitals(self, mat, axis=[]):
+        ''' Transform given axis of a matrix into sorted AO,
+        and transform given auxiliary axis of a matrix into sorted auxiliary AO
+        '''
+        idx = self._ao_idx
+        shape_ones = (1,) * mat.ndim
+        fancy_index = []
+        for dim, n in enumerate(mat.shape):
+            if dim in axis:
+                assert n == len(idx)
+                indices = idx
+            else:
+                indices = np.arange(n)
+            idx_shape = shape_ones[:dim] + (-1,) + shape_ones[dim+1:]
+            fancy_index.append(indices.reshape(idx_shape))
+        return mat[tuple(fancy_index)]
+
+    @property
+    def bpcache(self):
+        device_id = cp.cuda.Device().id
+        bpcache = self._bpcache[device_id]
+        return bpcache
+
+    @property
+    def cart2sph(self):
+        return block_c2s_diag(self.angular, self.l_ctr_counts)
 # end of class VHFOpt
 
 
-def get_int3c1e(mol, grids, direct_scf_tol):
+def get_int3c1e(mol, grids, intopt):
     omega = mol.omega
     assert omega >= 0.0, "Short-range one electron integrals with GPU acceleration is not implemented."
 
-    intopt = VHFOpt(mol, 'int2e')
-    intopt.build(direct_scf_tol, diag_block_with_triu=True, aosym=True, group_size=BLKSIZE)
-
     nao = mol.nao
     ngrids = grids.shape[0]
     total_double_number = ngrids * nao * nao
@@ -221,7 +218,7 @@ def get_int3c1e(mol, grids, direct_scf_tol):
             lj = intopt.angular[cpj]
 
             stream = cp.cuda.get_current_stream()
-            nao_cart = intopt.mol.nao
+            nao_cart = intopt._sorted_mol.nao
 
             log_q_ij = intopt.log_qs[cp_ij_id]
 
@@ -265,7 +262,7 @@ def get_int3c1e(mol, grids, direct_scf_tol):
 
         row, col = np.tril_indices(nao)
         int3c_grid_slice[:, row, col] = int3c_grid_slice[:, col, row]
-        ao_idx = np.argsort(intopt.ao_idx)
+        ao_idx = np.argsort(intopt._ao_idx)
         grid_idx = np.arange(ngrids_of_split)
         int3c_grid_slice = int3c_grid_slice[np.ix_(grid_idx, ao_idx, ao_idx)]
 
@@ -273,13 +270,10 @@ def get_int3c1e(mol, grids, direct_scf_tol):
 
     return int3c
 
-def get_int3c1e_charge_contracted(mol, grids, charges, direct_scf_tol):
+def get_int3c1e_charge_contracted(mol, grids, charges, intopt):
     omega = mol.omega
     assert omega >= 0.0, "Short-range one electron integrals with GPU acceleration is not implemented."
 
-    intopt = VHFOpt(mol, 'int2e')
-    intopt.build(direct_scf_tol, diag_block_with_triu=True, aosym=True, group_size=BLKSIZE)
-
     nao = mol.nao
 
     assert charges.ndim == 1 and charges.shape[0] == grids.shape[0]
@@ -296,7 +290,7 @@ def get_int3c1e_charge_contracted(mol, grids, charges, direct_scf_tol):
         lj = intopt.angular[cpj]
 
         stream = cp.cuda.get_current_stream()
-        nao_cart = intopt.mol.nao
+        nao_cart = intopt._sorted_mol.nao
 
         log_q_ij = intopt.log_qs[cp_ij_id]
 
@@ -312,8 +306,8 @@ def get_int3c1e_charge_contracted(mol, grids, charges, direct_scf_tol):
         strides = np.array([ni, ni*nj], dtype=np.int32)
 
         ngrids = grids.shape[0]
-        # n_charge_sum_per_thread = 1 means every thread processes one pair and one grid
-        # n_charge_sum_per_thread = ngrids or larger number gaurantees one thread processes one pair and all grid points
+        # n_charge_sum_per_thread = 1 # means every thread processes one pair and one grid
+        # n_charge_sum_per_thread = ngrids # or larger number gaurantees one thread processes one pair and all grid points
         n_charge_sum_per_thread = 10
 
         int1e_angular_slice = cp.zeros([j1-j0, i1-i0], order='C')
@@ -346,26 +340,30 @@ def get_int3c1e_charge_contracted(mol, grids, charges, direct_scf_tol):
 
     row, col = np.tril_indices(nao)
     int1e[row, col] = int1e[col, row]
-    ao_idx = np.argsort(intopt.ao_idx)
+    ao_idx = np.argsort(intopt._ao_idx)
     int1e = int1e[np.ix_(ao_idx, ao_idx)]
 
-    return cp.asnumpy(int1e)
+    return int1e
 
-def get_int3c1e_density_contracted(mol, grids, dm, direct_scf_tol):
+def get_int3c1e_density_contracted(mol, grids, dm, intopt):
     omega = mol.omega
     assert omega >= 0.0, "Short-range one electron integrals with GPU acceleration is not implemented."
 
     dm = cp.asarray(dm)
+    if dm.ndim == 3:
+        if dm.shape[0] > 2:
+            print("Warning: There are more than two density matrices to contract with one electron integrals, "
+                  "it's not from an unrestricted calculation, and we're unsure about your purpose. "
+                  "We sum the density matrices up, please check if that's expected.")
+        dm = cp.einsum("ijk->jk", dm)
+
     assert dm.ndim == 2
     assert dm.shape[0] == dm.shape[1] and dm.shape[0] == mol.nao
 
-    intopt = VHFOpt(mol, 'int2e')
-    intopt.build(direct_scf_tol, diag_block_with_triu=False, aosym=True, group_size=BLKSIZE)
-
-    nao_cart = intopt.mol.nao
+    nao_cart = intopt._sorted_mol.nao
     ngrids = grids.shape[0]
 
-    dm = dm[np.ix_(intopt.ao_idx, intopt.ao_idx)] # intopt.ao_idx is in spherical basis
+    dm = intopt.sort_orbitals(dm, [0,1])
     if not mol.cart:
         cart2sph_transformation_matrix = intopt.cart2sph
         # TODO: This part is inefficient (O(N^3)), should be changed to the O(N^2) algorithm
@@ -374,9 +372,9 @@ def get_int3c1e_density_contracted(mol, grids, dm, direct_scf_tol):
 
     dm = cp.asnumpy(dm)
 
-    ao_loc_sorted_order = intopt.sorted_mol.ao_loc_nr(cart = True)
+    ao_loc_sorted_order = intopt._sorted_mol.ao_loc_nr(cart = True)
     l_ij = intopt.l_ij.T.flatten()
-    bas_coords = intopt.sorted_mol.atom_coords()[intopt.sorted_mol._bas[:, ATOM_OF]].flatten()
+    bas_coords = intopt._sorted_mol.atom_coords()[intopt._sorted_mol._bas[:, ATOM_OF]].flatten()
 
     n_total_hermite_density = intopt.density_offset[-1]
     dm_pair_ordered = np.zeros(n_total_hermite_density)
@@ -412,8 +410,8 @@ def get_int3c1e_density_contracted(mol, grids, dm, direct_scf_tol):
             nbins = 1
             bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32)
 
-            # n_pair_sum_per_thread = 1 means every thread processes one pair and one grid
-            # n_pair_sum_per_thread = nao_cart or larger number gaurantees one thread processes one grid and all pairs of the same type
+            # n_pair_sum_per_thread = 1 # means every thread processes one pair and one grid
+            # n_pair_sum_per_thread = nao_cart # or larger number gaurantees one thread processes one grid and all pairs of the same type
             n_pair_sum_per_thread = nao_cart
 
             err = libgint.GINTfill_int3c1e_density_contracted(
@@ -433,19 +431,28 @@ def get_int3c1e_density_contracted(mol, grids, dm, direct_scf_tol):
             if err != 0:
                 raise RuntimeError('GINTfill_int3c1e_density_contracted failed')
 
-    return cp.asnumpy(int3c_density_contracted)
+    return int3c_density_contracted
 
-def intor(mol, intor, grids, dm=None, charges=None, direct_scf_tol=1e-13):
-    assert intor == 'int1e_grids' and grids is not None
+def intor(mol, intor, grids, dm=None, charges=None, direct_scf_tol=1e-13, intopt=None):
+    assert intor == 'int1e_grids'
+    assert grids is not None
     assert dm is None or charges is None, \
         "Are you sure you want to contract the one electron integrals with both charge and density? " + \
         "If so, pass in density, obtain the result with n_charge and contract with the charges yourself."
 
+    if intopt is None:
+        intopt = VHFOpt(mol)
+        intopt.build(direct_scf_tol)
+    else:
+        assert isinstance(intopt, VHFOpt), \
+            f"Please make sure intopt is a {VHFOpt.__module__}.{VHFOpt.__name__} object."
+        assert hasattr(intopt, "density_offset"), "Please call build() function for VHFOpt object first."
+
     if dm is None and charges is None:
-        return get_int3c1e(mol, grids, direct_scf_tol)
+        return get_int3c1e(mol, grids, intopt)
     elif dm is not None:
-        return get_int3c1e_density_contracted(mol, grids, dm, direct_scf_tol)
+        return get_int3c1e_density_contracted(mol, grids, dm, intopt)
     elif charges is not None:
-        return get_int3c1e_charge_contracted(mol, grids, charges, direct_scf_tol)
+        return get_int3c1e_charge_contracted(mol, grids, charges, intopt)
     else:
         raise ValueError(f"Logic error in {__file__} {__name__}")
diff --git a/gpu4pyscf/gto/tests/test_int1e_grids.py b/gpu4pyscf/gto/tests/test_int1e_grids.py
index 56ee9065..53d7dbad 100644
--- a/gpu4pyscf/gto/tests/test_int1e_grids.py
+++ b/gpu4pyscf/gto/tests/test_int1e_grids.py
@@ -15,12 +15,13 @@
 
 import unittest
 import numpy as np
+import cupy as cp
 import pyscf
 from pyscf import lib
 from gpu4pyscf.gto.moleintor import intor
 
 def setUpModule():
-    global mol_sph, mol_cart, grid_points, integral_threshold, density_contraction_threshold
+    global mol_sph, mol_cart, grid_points, integral_threshold, density_contraction_threshold, charge_contraction_threshold
     atom = '''
 O	0.0000	0.7375	-0.0528
 O	0.0000	-0.7375	-0.0528
@@ -45,8 +46,9 @@ def setUpModule():
     grid_points = lib.cartesian_prod([xs, ys, zs])
 
     # All of the following thresholds bound the max value of the corresponding matrix / tensor.
-    integral_threshold = 1e-10
-    density_contraction_threshold = 1e-8
+    integral_threshold = 1e-12
+    density_contraction_threshold = 1e-10
+    charge_contraction_threshold = 1e-12
 
 def tearDownModule():
     global mol_sph, mol_cart, grid_points
@@ -56,17 +58,17 @@ def tearDownModule():
 
 class KnownValues(unittest.TestCase):
     '''
-    known values are obtained by Q-Chem
+    Values are compared to PySCF CPU intor() function
     '''
     def test_int1e_grids_full_tensor_cart(self):
         ref_int1e = mol_cart.intor('int1e_grids', grids=grid_points)
         test_int1e = intor(mol_cart, 'int1e_grids', grid_points)
-        assert np.abs(ref_int1e - test_int1e).max() < integral_threshold
+        np.testing.assert_allclose(ref_int1e, test_int1e, atol = integral_threshold)
 
     def test_int1e_grids_full_tensor_sph(self):
         ref_int1e = mol_sph.intor('int1e_grids', grids=grid_points)
         test_int1e = intor(mol_sph, 'int1e_grids', grid_points)
-        assert np.abs(ref_int1e - test_int1e).max() < integral_threshold
+        np.testing.assert_allclose(ref_int1e, test_int1e, atol = integral_threshold)
 
     def test_int1e_grids_density_contracted_cart_symmetric(self):
         np.random.seed(12345)
@@ -74,7 +76,8 @@ def test_int1e_grids_density_contracted_cart_symmetric(self):
         dm = 0.5 * (dm + dm.T)
         ref_int1e_dot_D = np.einsum('pij,ij->p', mol_cart.intor('int1e_grids', grids=grid_points), dm)
         test_int1e_dot_D = intor(mol_cart, 'int1e_grids', grid_points, dm = dm)
-        assert np.abs(ref_int1e_dot_D - test_int1e_dot_D).max() < density_contraction_threshold
+        assert isinstance(test_int1e_dot_D, cp.ndarray)
+        cp.testing.assert_allclose(ref_int1e_dot_D, test_int1e_dot_D, atol = density_contraction_threshold)
 
     def test_int1e_grids_density_contracted_sph_symmetric(self):
         np.random.seed(12346)
@@ -82,35 +85,42 @@ def test_int1e_grids_density_contracted_sph_symmetric(self):
         dm = 0.5 * (dm + dm.T)
         ref_int1e_dot_D = np.einsum('pij,ij->p', mol_sph.intor('int1e_grids', grids=grid_points), dm)
         test_int1e_dot_D = intor(mol_sph, 'int1e_grids', grid_points, dm = dm)
-        assert np.abs(ref_int1e_dot_D - test_int1e_dot_D).max() < density_contraction_threshold
+        assert isinstance(test_int1e_dot_D, cp.ndarray)
+        cp.testing.assert_allclose(ref_int1e_dot_D, test_int1e_dot_D, atol = density_contraction_threshold)
 
     def test_int1e_grids_density_contracted_cart_asymmetric(self):
         np.random.seed(12347)
         dm = np.random.uniform(-2.0, 2.0, (mol_cart.nao, mol_cart.nao))
         ref_int1e_dot_D = np.einsum('pij,ij->p', mol_cart.intor('int1e_grids', grids=grid_points), dm)
         test_int1e_dot_D = intor(mol_cart, 'int1e_grids', grid_points, dm = dm)
-        assert np.abs(ref_int1e_dot_D - test_int1e_dot_D).max() < density_contraction_threshold
+        assert isinstance(test_int1e_dot_D, cp.ndarray)
+        cp.testing.assert_allclose(ref_int1e_dot_D, test_int1e_dot_D, atol = density_contraction_threshold)
 
     def test_int1e_grids_density_contracted_sph_asymmetric(self):
         np.random.seed(12348)
         dm = np.random.uniform(-2.0, 2.0, (mol_sph.nao, mol_sph.nao))
         ref_int1e_dot_D = np.einsum('pij,ij->p', mol_sph.intor('int1e_grids', grids=grid_points), dm)
         test_int1e_dot_D = intor(mol_sph, 'int1e_grids', grid_points, dm = dm)
-        assert np.abs(ref_int1e_dot_D - test_int1e_dot_D).max() < density_contraction_threshold
+        assert isinstance(test_int1e_dot_D, cp.ndarray)
+        cp.testing.assert_allclose(ref_int1e_dot_D, test_int1e_dot_D, atol = density_contraction_threshold)
 
     def test_int1e_grids_charge_contracted_cart(self):
         np.random.seed(12347)
         charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0])
         ref_int1e_dot_q = np.einsum('pij,p->ij', mol_cart.intor('int1e_grids', grids=grid_points), charges)
         test_int1e_dot_q = intor(mol_cart, 'int1e_grids', grid_points, charges = charges)
-        assert np.abs(ref_int1e_dot_q - test_int1e_dot_q).max() < density_contraction_threshold
+        assert isinstance(test_int1e_dot_q, cp.ndarray)
+        cp.testing.assert_allclose(ref_int1e_dot_q, test_int1e_dot_q, atol = charge_contraction_threshold)
 
     def test_int1e_grids_charge_contracted_sph(self):
         np.random.seed(12348)
         charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0])
         ref_int1e_dot_q = np.einsum('pij,p->ij', mol_sph.intor('int1e_grids', grids=grid_points), charges)
         test_int1e_dot_q = intor(mol_sph, 'int1e_grids', grid_points, charges = charges)
-        assert np.abs(ref_int1e_dot_q - test_int1e_dot_q).max() < density_contraction_threshold
+        assert isinstance(test_int1e_dot_q, cp.ndarray)
+        cp.testing.assert_allclose(ref_int1e_dot_q, test_int1e_dot_q, atol = charge_contraction_threshold)
+
+    # Range separated integrals
 
     def test_int1e_grids_full_tensor_omega(self):
         omega = 0.8
@@ -119,7 +129,7 @@ def test_int1e_grids_full_tensor_omega(self):
 
         ref_int1e = mol_sph_omega.intor('int1e_grids', grids=grid_points)
         test_int1e = intor(mol_sph_omega, 'int1e_grids', grid_points)
-        assert np.abs(ref_int1e - test_int1e).max() < integral_threshold
+        np.testing.assert_allclose(ref_int1e, test_int1e, atol = integral_threshold)
 
     def test_int1e_grids_density_contracted_omega(self):
         np.random.seed(12349)
@@ -131,7 +141,8 @@ def test_int1e_grids_density_contracted_omega(self):
 
         ref_int1e_dot_D = np.einsum('pij,ij->p', mol_sph_omega.intor('int1e_grids', grids=grid_points), dm)
         test_int1e_dot_D = intor(mol_sph_omega, 'int1e_grids', grid_points, dm = dm)
-        assert np.abs(ref_int1e_dot_D - test_int1e_dot_D).max() < integral_threshold
+        assert isinstance(test_int1e_dot_D, cp.ndarray)
+        cp.testing.assert_allclose(ref_int1e_dot_D, test_int1e_dot_D, atol = density_contraction_threshold)
 
     def test_int1e_grids_charge_contracted_omega(self):
         np.random.seed(12349)
@@ -143,7 +154,8 @@ def test_int1e_grids_charge_contracted_omega(self):
 
         ref_int1e_dot_q = np.einsum('pij,p->ij', mol_sph_omega.intor('int1e_grids', grids=grid_points), charges)
         test_int1e_dot_q = intor(mol_sph_omega, 'int1e_grids', grid_points, charges = charges)
-        assert np.abs(ref_int1e_dot_q - test_int1e_dot_q).max() < integral_threshold
+        assert isinstance(test_int1e_dot_q, cp.ndarray)
+        cp.testing.assert_allclose(ref_int1e_dot_q, test_int1e_dot_q, atol = charge_contraction_threshold)
 
 if __name__ == "__main__":
     print("Full Tests for One Electron Coulomb Integrals")
diff --git a/gpu4pyscf/hessian/rks.py b/gpu4pyscf/hessian/rks.py
index 66daf960..96e57279 100644
--- a/gpu4pyscf/hessian/rks.py
+++ b/gpu4pyscf/hessian/rks.py
@@ -374,8 +374,8 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
                 vmat_dm[ia][:,:,mask] += contract('yjg,xjg->xyj', ao_mask[1:4], aow)
             ao_dm0 = aow = None
             t1 = log.timer_debug2('integration', *t1)
+        vmat_dm = opt.unsort_orbitals(vmat_dm, axis=[3])
         for ia in range(_sorted_mol.natm):
-            vmat_dm[ia][:,:,opt._ao_idx] = vmat_dm[ia]
             p0, p1 = aoslices[ia][2:]
             vmat_dm[ia] += contract('xypq,pq->xyp', ipip[:,:,:,p0:p1], dm0[:,p0:p1])
     elif xctype == 'GGA':
@@ -411,8 +411,8 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
                 vmat_dm[ia][:,:,mask] += vmat_dm_tmp
             ao_dm0 = aow = None
             t1 = log.timer_debug2('integration', *t1)
+        vmat_dm = opt.unsort_orbitals(vmat_dm, axis=[3])
         for ia in range(_sorted_mol.natm):
-            vmat_dm[ia][:,:,opt._ao_idx] = vmat_dm[ia]
             p0, p1 = aoslices[ia][2:]
             vmat_dm[ia] += contract('xypq,pq->xyp', ipip[:,:,:,p0:p1], dm0[:,p0:p1])
             vmat_dm[ia] += contract('yxqp,pq->xyp', ipip[:,:,p0:p1], dm0[:,p0:p1])
@@ -478,8 +478,8 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
 
                 vmat_dm[ia][:,:,mask] += vmat_dm_tmp
             t1 = log.timer_debug2('integration', *t1)
+        vmat_dm = opt.unsort_orbitals(vmat_dm, axis=[3])
         for ia in range(_sorted_mol.natm):
-            vmat_dm[ia][:,:,opt._ao_idx] = vmat_dm[ia]
             p0, p1 = aoslices[ia][2:]
             vmat_dm[ia] += contract('xypq,pq->xyp', ipip[:,:,:,p0:p1], dm0[:,p0:p1])
             vmat_dm[ia] += contract('yxqp,pq->xyp', ipip[:,:,p0:p1], dm0[:,p0:p1])
diff --git a/gpu4pyscf/hessian/uks.py b/gpu4pyscf/hessian/uks.py
index d7537cc4..d42c8f76 100644
--- a/gpu4pyscf/hessian/uks.py
+++ b/gpu4pyscf/hessian/uks.py
@@ -446,10 +446,10 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
                 vmatb_dm[ia][:,:,mask] += contract('yjg,xjg->xyj', ao_mask[1:4], aow)
             ao_dm0a = ao_dm0b = aow = None
             t1 = log.timer_debug2('integration', *t1)
+        vmata_dm = opt.unsort_orbitals(vmata_dm, axis=[3])
+        vmatb_dm = opt.unsort_orbitals(vmatb_dm, axis=[3])
         for ia in range(_sorted_mol.natm):
             p0, p1 = aoslices[ia][2:]
-            vmata_dm[ia][:,:,opt._ao_idx] = vmata_dm[ia]
-            vmatb_dm[ia][:,:,opt._ao_idx] = vmatb_dm[ia]
             vmata_dm[ia] += contract('xypq,pq->xyp', ipipa[:,:,:,p0:p1], dm0a[:,p0:p1])
             vmatb_dm[ia] += contract('xypq,pq->xyp', ipipb[:,:,:,p0:p1], dm0b[:,p0:p1])
     elif xctype == 'GGA':
@@ -503,9 +503,9 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
                 vmatb_dm[ia][:,:,mask] += vmatb_dm_tmp
             ao_dm0a = ao_dm0b = aow = None
             t1 = log.timer_debug2('integration', *t1)
+        vmata_dm = opt.unsort_orbitals(vmata_dm, axis=[3])
+        vmatb_dm = opt.unsort_orbitals(vmatb_dm, axis=[3])
         for ia in range(_sorted_mol.natm):
-            vmata_dm[ia][:,:,opt._ao_idx] = vmata_dm[ia]
-            vmatb_dm[ia][:,:,opt._ao_idx] = vmatb_dm[ia]
             p0, p1 = aoslices[ia][2:]
             vmata_dm[ia] += contract('xypq,pq->xyp', ipipa[:,:,:,p0:p1], dm0a[:,p0:p1])
             vmata_dm[ia] += contract('yxqp,pq->xyp', ipipa[:,:,p0:p1], dm0a[:,p0:p1])
@@ -618,9 +618,9 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
                 vmata_dm[ia][:,:,mask] += vmata_dm_tmp
                 vmatb_dm[ia][:,:,mask] += vmatb_dm_tmp
             t1 = log.timer_debug2('integration', *t1)
+        vmata_dm = opt.unsort_orbitals(vmata_dm, axis=[3])
+        vmatb_dm = opt.unsort_orbitals(vmatb_dm, axis=[3])
         for ia in range(_sorted_mol.natm):
-            vmata_dm[ia][:,:,opt._ao_idx] = vmata_dm[ia]
-            vmatb_dm[ia][:,:,opt._ao_idx] = vmatb_dm[ia]
             p0, p1 = aoslices[ia][2:]
             vmata_dm[ia] += contract('xypq,pq->xyp', ipipa[:,:,:,p0:p1], dm0a[:,p0:p1])
             vmata_dm[ia] += contract('yxqp,pq->xyp', ipipa[:,:,p0:p1], dm0a[:,p0:p1])
diff --git a/gpu4pyscf/pop/esp.py b/gpu4pyscf/pop/esp.py
index 4934e4e2..1b1f0f6f 100644
--- a/gpu4pyscf/pop/esp.py
+++ b/gpu4pyscf/pop/esp.py
@@ -23,7 +23,7 @@
 import cupy
 from pyscf import gto
 from pyscf.data import radii
-from gpu4pyscf.df import int3c2e
+from gpu4pyscf.gto.moleintor import intor
 from gpu4pyscf.lib.cupy_helper import dist_matrix
 
 #modified_Bondi = radii.VDW.copy()
@@ -137,10 +137,7 @@ def build_ab(mol, dm,
     A[:natm, :natm] = rinv.dot(rinv.T)
     
     # For right hand side B
-    auxmol = gto.fakemol_for_charges(surface_points)
-    intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e')
-    intopt.build(1e-14, diag_block_with_triu=False, aosym=True, group_size=256)
-    v_grids_e = 2.0*int3c2e.get_j_int3c2e_pass1(intopt, dm, sort_j=False)
+    v_grids_e = intor(mol, 'int1e_grids', surface_points, dm=dm, direct_scf_tol=1e-14)
     v_grids_n = cupy.dot(charges, rinv)
     
     B = cupy.empty([dim])
diff --git a/gpu4pyscf/qmmm/chelpg.py b/gpu4pyscf/qmmm/chelpg.py
index 11d2687c..12c27552 100644
--- a/gpu4pyscf/qmmm/chelpg.py
+++ b/gpu4pyscf/qmmm/chelpg.py
@@ -13,163 +13,16 @@
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
-import pyscf
-import time
 import cupy
 import numpy as np
 import scipy
-import ctypes
-from pyscf import lib, gto
-from pyscf.scf import _vhf
-from gpu4pyscf.df import int3c2e
-from gpu4pyscf.scf.int4c2e import BasisProdCache, libgint, libgvhf
-from gpu4pyscf.lib.cupy_helper import load_library, block_c2s_diag
+from gpu4pyscf.gto.moleintor import intor, VHFOpt
 from gpu4pyscf.lib import logger
-from gpu4pyscf.__config__ import _num_devices, _streams
 
 from pyscf.data import radii
 modified_Bondi = radii.VDW.copy()
 modified_Bondi[1] = 1.1/radii.BOHR      # modified version
 
-# TODO: replace int3c2e.get_j_int3c2e_pass1 with int1e_grids
-def _build_VHFOpt(intopt, cutoff=1e-14, group_size=None,
-            group_size_aux=None, diag_block_with_triu=False, aosym=False):
-    '''
-    Implement the similar functionality as VHFOpt.build, 
-    but without transformation for auxiliary basis.
-    '''
-    sorted_mol, sorted_idx, uniq_l_ctr, l_ctr_counts = int3c2e.sort_mol(
-        intopt.mol)
-    if group_size is not None:
-        uniq_l_ctr, l_ctr_counts = int3c2e._split_l_ctr_groups(
-            uniq_l_ctr, l_ctr_counts, group_size)
-    intopt.l_ctr_counts = l_ctr_counts
-
-    # sort fake mol
-    fake_mol = int3c2e.make_fake_mol()
-    _, _, fake_uniq_l_ctr, fake_l_ctr_counts = int3c2e.sort_mol(fake_mol)
-
-    # sort auxiliary mol
-    sorted_auxmol, _, aux_uniq_l_ctr, aux_l_ctr_counts = int3c2e.sort_mol(
-        intopt.auxmol)
-    if group_size_aux is not None:
-        aux_uniq_l_ctr, aux_l_ctr_counts = int3c2e._split_l_ctr_groups(
-            aux_uniq_l_ctr, aux_l_ctr_counts, group_size_aux)
-    intopt.aux_l_ctr_counts = aux_l_ctr_counts
-    
-    tmp_mol = gto.mole.conc_mol(fake_mol, sorted_auxmol)
-    tot_mol = gto.mole.conc_mol(sorted_mol, tmp_mol)
-
-    # Initialize vhfopt after reordering mol._bas
-    _vhf.VHFOpt.__init__(intopt, sorted_mol, intopt._intor, intopt._prescreen,
-                            intopt._qcondname, intopt._dmcondname)
-    intopt.direct_scf_tol = cutoff
-
-    # TODO: is it more accurate to filter with overlap_cond (or exp_cond)?
-    q_cond = intopt.get_q_cond()
-    l_ctr_offsets = np.append(0, np.cumsum(l_ctr_counts))
-    log_qs, pair2bra, pair2ket = int3c2e.get_pairing(
-        l_ctr_offsets, l_ctr_offsets, q_cond,
-        diag_block_with_triu=diag_block_with_triu, aosym=aosym)
-    intopt.log_qs = log_qs.copy()
-
-    # contraction coefficient for ao basis
-    cart_ao_loc = sorted_mol.ao_loc_nr(cart=True)
-    sph_ao_loc = sorted_mol.ao_loc_nr(cart=False)
-    intopt.cart_ao_loc = [cart_ao_loc[cp] for cp in l_ctr_offsets]
-    intopt.sph_ao_loc = [sph_ao_loc[cp] for cp in l_ctr_offsets]
-    intopt.angular = [l[0] for l in uniq_l_ctr]
-
-    cart_ao_loc = intopt.mol.ao_loc_nr(cart=True)
-    sph_ao_loc = intopt.mol.ao_loc_nr(cart=False)
-    nao = sph_ao_loc[-1]
-    ao_idx = np.array_split(np.arange(nao), sph_ao_loc[1:-1])
-    intopt.sph_ao_idx = np.hstack([ao_idx[i] for i in sorted_idx])
-
-    # cartesian ao index
-    nao = cart_ao_loc[-1]
-    ao_idx = np.array_split(np.arange(nao), cart_ao_loc[1:-1])
-    intopt.cart_ao_idx = np.hstack([ao_idx[i] for i in sorted_idx])
-    ncart = cart_ao_loc[-1]
-
-    # pairing auxiliary basis with fake basis set
-    fake_l_ctr_offsets = np.append(0, np.cumsum(fake_l_ctr_counts))
-    fake_l_ctr_offsets += l_ctr_offsets[-1]
-
-    aux_l_ctr_offsets = np.append(0, np.cumsum(aux_l_ctr_counts))
-
-    # contraction coefficient for auxiliary basis
-    cart_aux_loc = sorted_auxmol.ao_loc_nr(cart=True)
-    sph_aux_loc = sorted_auxmol.ao_loc_nr(cart=False)
-    intopt.cart_aux_loc = [cart_aux_loc[cp] for cp in aux_l_ctr_offsets]
-    intopt.sph_aux_loc = [sph_aux_loc[cp] for cp in aux_l_ctr_offsets]
-    intopt.aux_angular = [l[0] for l in aux_uniq_l_ctr]
-
-    cart_aux_loc = intopt.auxmol.ao_loc_nr(cart=True)
-    sph_aux_loc = intopt.auxmol.ao_loc_nr(cart=False)
-    ncart = cart_aux_loc[-1]
-    # inv_idx = np.argsort(intopt.sph_aux_idx, kind='stable').astype(np.int32)
-    aux_l_ctr_offsets += fake_l_ctr_offsets[-1]
-
-    # hardcoded for grids
-    aux_pair2bra = [np.arange(aux_l_ctr_offsets[0], aux_l_ctr_offsets[-1])]
-    aux_pair2ket = [np.ones(ncart) * fake_l_ctr_offsets[0]]
-    aux_log_qs = [np.ones(ncart)]
-
-    intopt.aux_log_qs = aux_log_qs.copy()
-    pair2bra += aux_pair2bra
-    pair2ket += aux_pair2ket
-
-    uniq_l_ctr = np.concatenate(
-        [uniq_l_ctr, fake_uniq_l_ctr, aux_uniq_l_ctr])
-    l_ctr_offsets = np.concatenate([
-        l_ctr_offsets,
-        fake_l_ctr_offsets[1:],
-        aux_l_ctr_offsets[1:]])
-
-    bas_pair2shls = np.hstack(
-        pair2bra + pair2ket).astype(np.int32).reshape(2, -1)
-    bas_pairs_locs = np.append(0, np.cumsum(
-        [x.size for x in pair2bra])).astype(np.int32)
-    log_qs = log_qs + aux_log_qs
-    ao_loc = tot_mol.ao_loc_nr(cart=True)
-    ncptype = len(log_qs)
-
-    intopt._bpcache = {}
-    for n in range(_num_devices):
-        with cupy.cuda.Device(n), _streams[n]:
-            bpcache = ctypes.POINTER(BasisProdCache)()
-            scale_shellpair_diag = 1.
-            
-            libgint.GINTinit_basis_prod(
-                ctypes.byref(bpcache), 
-                ctypes.c_double(scale_shellpair_diag),
-                ao_loc.ctypes.data_as(ctypes.c_void_p),
-                bas_pair2shls.ctypes.data_as(ctypes.c_void_p),
-                bas_pairs_locs.ctypes.data_as(ctypes.c_void_p), 
-                ctypes.c_int(ncptype),
-                tot_mol._atm.ctypes.data_as(ctypes.c_void_p), 
-                ctypes.c_int(tot_mol.natm),
-                tot_mol._bas.ctypes.data_as(ctypes.c_void_p), 
-                ctypes.c_int(tot_mol.nbas),
-                tot_mol._env.ctypes.data_as(ctypes.c_void_p))
-            intopt._bpcache[n] = bpcache
-    intopt.bas_pairs_locs = bas_pairs_locs
-    ncptype = len(intopt.log_qs)
-    if aosym:
-        intopt.cp_idx, intopt.cp_jdx = np.tril_indices(ncptype)
-    else:
-        nl = int(round(np.sqrt(ncptype)))
-        intopt.cp_idx, intopt.cp_jdx = np.unravel_index(
-            np.arange(ncptype), (nl, nl))
-
-    intopt._sorted_mol = sorted_mol
-    intopt._sorted_auxmol = sorted_auxmol
-    if intopt.mol.cart:
-        intopt._ao_idx = intopt.cart_ao_idx
-    else:
-        intopt._ao_idx = intopt.sph_ao_idx
-
 def eval_chelpg_layer_gpu(mf, deltaR=0.3, Rhead=2.8, ifqchem=True, Rvdw=modified_Bondi, verbose=None):
     """Cal chelpg charge
 
@@ -254,17 +107,12 @@ def tau_f(R, Rcut, Roff):
     nbatch = 256*256
 
     # assert nbatch < ngrids
-    fmol = pyscf.gto.fakemol_for_charges(gridcoords[:nbatch])
-    intopt = int3c2e.VHFOpt(mf.mol, fmol, 'int2e')
+    intopt = VHFOpt(mf.mol)
+    intopt.build(cutoff=1e-14)
     for ibatch in range(0, ngrids, nbatch):
         max_grid = min(ibatch+nbatch, ngrids)
-        num_grids = max_grid - ibatch
-        ptr = intopt.auxmol._atm[:num_grids, gto.PTR_COORD]
-        intopt.auxmol._env[np.vstack(
-            (ptr, ptr+1, ptr+2)).T] = gridcoords[ibatch:max_grid]
-        _build_VHFOpt(intopt, 1e-14, diag_block_with_triu=False, aosym=True)
-        potential_real[ibatch:max_grid] -= 2.0 * \
-            int3c2e.get_j_int3c2e_pass1(intopt, dm, sort_j=False)[:num_grids]
+        potential_real[ibatch:max_grid] -= \
+            intor(mf.mol, 'int1e_grids', gridcoords[ibatch:max_grid], dm=dm, intopt=intopt)
 
     w = cupy.array(w)
     r_pX_potential_omega = r_pX_potential*w