From 7af3fff0f26f33fa7dd88379b4e15d1ae6d171c4 Mon Sep 17 00:00:00 2001
From: "xiaojie.wu" <xiaojie.wu@bytedance.com>
Date: Wed, 22 Nov 2023 00:47:58 +0800
Subject: [PATCH] aggressive config and take_last2d

---
 gpu4pyscf/__config__.py               |  6 ++--
 gpu4pyscf/df/df.py                    |  6 ++--
 gpu4pyscf/df/grad/rhf.py              |  6 ++--
 gpu4pyscf/df/hessian/rhf.py           | 49 +++++++++++++--------------
 gpu4pyscf/df/int3c2e.py               |  6 ++--
 gpu4pyscf/df/tests/test_df_hessian.py |  1 +
 gpu4pyscf/scf/cphf.py                 |  2 +-
 7 files changed, 38 insertions(+), 38 deletions(-)

diff --git a/gpu4pyscf/__config__.py b/gpu4pyscf/__config__.py
index 4d81bc03..0a478cc2 100644
--- a/gpu4pyscf/__config__.py
+++ b/gpu4pyscf/__config__.py
@@ -20,9 +20,9 @@
     number_of_threads = 1024 * 80
 # such as A30-24GB
 elif props['totalGlobalMem'] >= 16 * GB:
-    min_ao_blksize = 64
-    min_grid_blksize = 64*64
-    ao_aligned = 16
+    min_ao_blksize = 128
+    min_grid_blksize = 128*128
+    ao_aligned = 32
     grid_aligned = 128
     mem_fraction = 0.9
     number_of_threads = 1024 * 80
diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py
index da59f9da..76a14f28 100644
--- a/gpu4pyscf/df/df.py
+++ b/gpu4pyscf/df/df.py
@@ -22,7 +22,7 @@
 from pyscf import lib
 from pyscf.df import df, addons
 from gpu4pyscf.lib.cupy_helper import (
-    cholesky, tag_array, get_avail_mem, cart2sph)
+    cholesky, tag_array, get_avail_mem, cart2sph, take_last2d)
 from gpu4pyscf.df import int3c2e, df_jk
 from gpu4pyscf.lib import logger
 from gpu4pyscf import __config__
@@ -77,13 +77,13 @@ def build(self, direct_scf_tol=1e-14, omega=None):
                 j2c_cpu = auxmol.intor('int2c2e', hermi=1)
         else:
             j2c_cpu = auxmol.intor('int2c2e', hermi=1)
-        j2c = cupy.asarray(j2c_cpu)
+        j2c = cupy.asarray(j2c_cpu, order='C')
         t0 = log.timer_debug1('2c2e', *t0)
         intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e')
         intopt.build(direct_scf_tol, diag_block_with_triu=False, aosym=True, group_size=256)
         log.timer_debug1('prepare intopt', *t0)
         self.j2c = j2c.copy()
-        j2c = j2c[cupy.ix_(intopt.sph_aux_idx, intopt.sph_aux_idx)]
+        j2c = take_last2d(j2c, intopt.sph_aux_idx)
         try:
             self.cd_low = cholesky(j2c)
             self.cd_low = tag_array(self.cd_low, tag='cd')
diff --git a/gpu4pyscf/df/grad/rhf.py b/gpu4pyscf/df/grad/rhf.py
index febdee40..69231337 100644
--- a/gpu4pyscf/df/grad/rhf.py
+++ b/gpu4pyscf/df/grad/rhf.py
@@ -20,7 +20,7 @@
 from pyscf.df.grad import rhf
 from pyscf import lib, scf, gto
 from gpu4pyscf.df import int3c2e
-from gpu4pyscf.lib.cupy_helper import print_mem_info, tag_array, unpack_tril, contract, load_library
+from gpu4pyscf.lib.cupy_helper import print_mem_info, tag_array, unpack_tril, contract, load_library, take_last2d
 from gpu4pyscf.grad.rhf import grad_elec
 from gpu4pyscf import __config__
 from gpu4pyscf.lib import logger
@@ -59,7 +59,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
     mo_coeff = cupy.asarray(mf_grad.base.mo_coeff)
     mo_occ = cupy.asarray(mf_grad.base.mo_occ)
     sph_ao_idx = intopt.sph_ao_idx
-    dm = dm0[numpy.ix_(sph_ao_idx, sph_ao_idx)]
+    dm = take_last2d(dm0, sph_ao_idx)
     orbo = contract('pi,i->pi', mo_coeff[:,mo_occ>0], numpy.sqrt(mo_occ[mo_occ>0]))
     orbo = orbo[sph_ao_idx, :]
     nocc = orbo.shape[-1]
@@ -119,7 +119,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
             #rhok = solve_triangular(low_t, rhok, lower=False)
             rhok = solve_triangular(low_t, rhok.reshape(naux, -1), lower=False, overwrite_b=True).reshape(naux, nocc, nocc)
         tmp = contract('pij,qij->pq', rhok, rhok)
-        tmp = tmp[cupy.ix_(rev_aux_idx, rev_aux_idx)]
+        tmp = take_last2d(tmp, rev_aux_idx)
         vkaux = -contract('xpq,pq->xp', int2c_e1, tmp)
         vkaux_2c = cupy.array([-vkaux[:,p0:p1].sum(axis=1) for p0, p1 in auxslices[:,2:]])
         vkaux = tmp = None
diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py
index 2a06692f..710963a9 100644
--- a/gpu4pyscf/df/hessian/rhf.py
+++ b/gpu4pyscf/df/hessian/rhf.py
@@ -38,7 +38,7 @@
 import numpy as np
 from pyscf import lib, df
 from gpu4pyscf.hessian import rhf as rhf_hess
-from gpu4pyscf.lib.cupy_helper import contract, tag_array, release_gpu_stack, print_mem_info
+from gpu4pyscf.lib.cupy_helper import contract, tag_array, release_gpu_stack, print_mem_info, take_last2d
 from gpu4pyscf.df import int3c2e
 from gpu4pyscf.lib import logger
 
@@ -65,7 +65,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     if mo_coeff is None:  mo_coeff = mf.mo_coeff
     if atmlst is None: atmlst = range(mol.natm)
 
-    mo_coeff = cupy.asarray(mo_coeff)
+    mo_coeff = cupy.asarray(mo_coeff, order='C')
     nao, nmo = mo_coeff.shape
     mocc = mo_coeff[:,mo_occ>0]
     mocc_2 = cupy.einsum('pi,i->pi', mocc, mo_occ[mo_occ>0]**.5)
@@ -77,8 +77,8 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     #      overlap matrix contributions
     # ------------------------------------
     s1aa, s1ab, _ = rhf_hess.get_ovlp(mol)
-    s1aa = cupy.asarray(s1aa)
-    s1ab = cupy.asarray(s1ab)
+    s1aa = cupy.asarray(s1aa, order='C')
+    s1ab = cupy.asarray(s1ab, order='C')
 
     auxmol = df.addons.make_auxmol(mol, auxbasis=mf.with_df.auxbasis)
     naux = auxmol.nao
@@ -101,15 +101,15 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     sph_aux_idx = intopt.sph_aux_idx
 
     mocc_2 = mocc_2[sph_ao_idx, :]
-    dm0 = dm0[cupy.ix_(sph_ao_idx, sph_ao_idx)]
+    dm0 = take_last2d(dm0, sph_ao_idx)
     dm0_tag = tag_array(dm0, occ_coeff=mocc_2)
 
-    int2c = cupy.asarray(int2c)
-    int2c = int2c[cupy.ix_(sph_aux_idx, sph_aux_idx)]
+    int2c = cupy.asarray(int2c, order='C')
+    int2c = take_last2d(int2c, sph_aux_idx)
     int2c_inv = cupy.linalg.pinv(int2c, rcond=1e-12)
 
-    int2c_ip1 = cupy.asarray(int2c_ip1)
-    int2c_ip1 = int2c_ip1[cupy.ix_(np.arange(3), sph_aux_idx, sph_aux_idx)]
+    int2c_ip1 = cupy.asarray(int2c_ip1, order='C')
+    int2c_ip1 = take_last2d(int2c_ip1, sph_aux_idx)
     int2c_ip1_inv = contract('yqp,pr->yqr', int2c_ip1, int2c_inv)
 
     hj_ao_ao = cupy.zeros([nao,nao,3,3])
@@ -223,14 +223,13 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
 
     # int2c contributions
     if hessobj.auxbasis_response > 1:
-        aux_aux_9 = np.ix_(np.arange(9), sph_aux_idx, sph_aux_idx)
         if omega and omega > 1e-10:
             with auxmol.with_range_coulomb(omega):
                 int2c_ipip1 = auxmol.intor('int2c2e_ipip1', aosym='s1')
         else:
             int2c_ipip1 = auxmol.intor('int2c2e_ipip1', aosym='s1')
-        int2c_ipip1 = cupy.asarray(int2c_ipip1)
-        int2c_ipip1 = int2c_ipip1[aux_aux_9]
+        int2c_ipip1 = cupy.asarray(int2c_ipip1, order='C')
+        int2c_ipip1 = take_last2d(int2c_ipip1, sph_aux_idx)
         rhoj2c_P = contract('xpq,q->xp', int2c_ipip1, rhoj0_P)
         # (00|0)(2|0)(0|00)
         hj_aux_diag -= cupy.einsum('p,xp->px', rhoj0_P, rhoj2c_P).reshape(-1,3,3)
@@ -244,13 +243,13 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
                 int2c_ip1ip2 = auxmol.intor('int2c2e_ip1ip2', aosym='s1')
         else:
             int2c_ip1ip2 = auxmol.intor('int2c2e_ip1ip2', aosym='s1')
-        int2c_ip1ip2 = cupy.asarray(int2c_ip1ip2)
-        int2c_ip1ip2 = int2c_ip1ip2[aux_aux_9]
+        int2c_ip1ip2 = cupy.asarray(int2c_ip1ip2, order='C')
+        int2c_ip1ip2 = take_last2d(int2c_ip1ip2, sph_aux_idx)
         hj_aux_aux = -.5 * cupy.einsum('p,xpq,q->pqx', rhoj0_P, int2c_ip1ip2, rhoj0_P).reshape(naux, naux,3,3)
         if with_k:
             hk_aux_aux = -.5 * contract('xpq,pq->pqx', int2c_ip1ip2, rho2c_0).reshape(naux,naux,3,3)
         t1 = log.timer_debug1('intermediate variables with int2c_*', *t1)
-        int2c_ip1ip2 = aux_aux_9 = None
+        int2c_ip1ip2 = None
 
     cupy.get_default_memory_pool().free_all_blocks()
     release_gpu_stack()
@@ -402,8 +401,8 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
         atmlst = range(mol.natm)
     # FIXME
     with_k = True
-    mo_coeff = cupy.asarray(mo_coeff)
-    mo_occ = cupy.asarray(mo_occ)
+    mo_coeff = cupy.asarray(mo_coeff, order='C')
+    mo_occ = cupy.asarray(mo_occ, order='C')
 
     mf = hessobj.base
     #auxmol = hessobj.base.with_df.auxmol
@@ -420,7 +419,7 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
             int2c = auxmol.intor('int2c2e', aosym='s1')
     else:
         int2c = auxmol.intor('int2c2e', aosym='s1')
-    int2c = cupy.asarray(int2c)
+    int2c = cupy.asarray(int2c, order='C')
     # ======================= sorted AO begin ======================================
     intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e')
     intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size_aux=BLKSIZE, group_size=BLKSIZE)
@@ -430,10 +429,10 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
 
     mocc = mocc[sph_ao_idx, :]
     mo_coeff = mo_coeff[sph_ao_idx,:]
-    dm0 = dm0[cupy.ix_(sph_ao_idx, sph_ao_idx)]
+    dm0 = take_last2d(dm0, sph_ao_idx)
     dm0_tag = tag_array(dm0, occ_coeff=mocc)
 
-    int2c = int2c[cupy.ix_(sph_aux_idx, sph_aux_idx)]
+    int2c = take_last2d(int2c, sph_aux_idx)
     int2c_inv = cupy.linalg.pinv(int2c, rcond=1e-12)
 
     wj, wk_Pl_, wk_P__ = int3c2e.get_int3c2e_wjk(mol, auxmol, dm0_tag, omega=omega)
@@ -451,8 +450,8 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
 
     # int3c_ip1 contributions
     vj1_buf, vk1_buf, vj1_ao, vk1_ao = int3c2e.get_int3c2e_ip1_vjk(intopt, rhoj0, rhok0_Pl_, dm0_tag, aoslices, omega=omega)
-    vj1_buf = vj1_buf[cupy.ix_(numpy.arange(3), rev_ao_idx, rev_ao_idx)]
-    vk1_buf = vk1_buf[cupy.ix_(numpy.arange(3), rev_ao_idx, rev_ao_idx)]
+    vj1_buf = take_last2d(vj1_buf, rev_ao_idx)
+    vk1_buf = take_last2d(vk1_buf, rev_ao_idx)
 
     vj1_int3c_ip1 = -contract('nxiq,ip->nxpq', vj1_ao, mo_coeff)
     vk1_int3c_ip1 = -contract('nxiq,ip->nxpq', vk1_ao, mo_coeff)
@@ -472,8 +471,8 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
                 int2c_ip1 = auxmol.intor('int2c2e_ip1', aosym='s1')
         else:
             int2c_ip1 = auxmol.intor('int2c2e_ip1', aosym='s1')
-        int2c_ip1 = cupy.asarray(int2c_ip1)
-        int2c_ip1 = int2c_ip1[cupy.ix_(np.arange(3), sph_aux_idx, sph_aux_idx)]
+        int2c_ip1 = cupy.asarray(int2c_ip1, order='C')
+        int2c_ip1 = take_last2d(int2c_ip1, sph_aux_idx)
 
         wj0_10 = contract('xpq,q->xp', int2c_ip1, rhoj0)
         wk0_10_P__ = contract('xqp,pro->xqro', int2c_ip1, rhok0_P__)
@@ -527,7 +526,7 @@ def _ao2mo(mat):
             vk1_ao[:,:,p0:p1] -= vk1_buf[:,p0:p1,:].transpose(0,2,1)
 
         h1 = hcore_deriv(ia)
-        h1 = _ao2mo(cupy.asarray(h1))
+        h1 = _ao2mo(cupy.asarray(h1, order='C'))
         vj1 = vj1_int3c[ia] + _ao2mo(vj1_ao)
         if with_k:
             vk1 = vk1_int3c[ia] + _ao2mo(vk1_ao)
diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py
index a542a7fb..dda6cd50 100644
--- a/gpu4pyscf/df/int3c2e.py
+++ b/gpu4pyscf/df/int3c2e.py
@@ -22,7 +22,7 @@
 from pyscf.scf import _vhf
 from gpu4pyscf.scf.hf import BasisProdCache, _make_s_index_offsets
 from gpu4pyscf.lib.cupy_helper import (
-    block_c2s_diag, cart2sph, block_diag, contract, load_library, c2s_l, get_avail_mem, print_mem_info)
+    block_c2s_diag, cart2sph, block_diag, contract, load_library, c2s_l, get_avail_mem, print_mem_info, take_last2d)
 from gpu4pyscf.lib import logger
 
 LMAX_ON_GPU = 8
@@ -1183,8 +1183,8 @@ def get_dh1e(mol, dm0):
     fakemol = gto.fakemol_for_charges(coords)
     intopt = VHFOpt(mol, fakemol, 'int2e')
     intopt.build(1e-14, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE)
-    dm0_sorted = dm0[cupy.ix_(intopt.sph_ao_idx, intopt.sph_ao_idx)]
-
+    #dm0_sorted = dm0[cupy.ix_(intopt.sph_ao_idx, intopt.sph_ao_idx)]
+    dm0_sorted = take_last2d(dm0, intopt.sph_ao_idx)
     dh1e = cupy.zeros([natm,3])
     for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ip1'):
         dh1e[k0:k1,:3] += cupy.einsum('xkji,ij->kx', int3c_blk, dm0_sorted[i0:i1,j0:j1])
diff --git a/gpu4pyscf/df/tests/test_df_hessian.py b/gpu4pyscf/df/tests/test_df_hessian.py
index 04dde225..18b55c80 100644
--- a/gpu4pyscf/df/tests/test_df_hessian.py
+++ b/gpu4pyscf/df/tests/test_df_hessian.py
@@ -39,6 +39,7 @@ def setUpModule():
 
 def tearDownModule():
     global mol
+    mol.stdout.close()
     del mol
 
 def _make_rhf():
diff --git a/gpu4pyscf/scf/cphf.py b/gpu4pyscf/scf/cphf.py
index fdd45453..52b4a84f 100644
--- a/gpu4pyscf/scf/cphf.py
+++ b/gpu4pyscf/scf/cphf.py
@@ -28,7 +28,7 @@
 from gpu4pyscf.lib import logger
 
 def solve(fvind, mo_energy, mo_occ, h1, s1=None,
-          max_cycle=50, tol=1e-9, hermi=False, verbose=logger.WARN):
+          max_cycle=50, tol=1e-7, hermi=False, verbose=logger.WARN):
     '''
     Args:
         fvind : function