From 7af3fff0f26f33fa7dd88379b4e15d1ae6d171c4 Mon Sep 17 00:00:00 2001 From: "xiaojie.wu" Date: Wed, 22 Nov 2023 00:47:58 +0800 Subject: [PATCH] aggressive config and take_last2d --- gpu4pyscf/__config__.py | 6 ++-- gpu4pyscf/df/df.py | 6 ++-- gpu4pyscf/df/grad/rhf.py | 6 ++-- gpu4pyscf/df/hessian/rhf.py | 49 +++++++++++++-------------- gpu4pyscf/df/int3c2e.py | 6 ++-- gpu4pyscf/df/tests/test_df_hessian.py | 1 + gpu4pyscf/scf/cphf.py | 2 +- 7 files changed, 38 insertions(+), 38 deletions(-) diff --git a/gpu4pyscf/__config__.py b/gpu4pyscf/__config__.py index 4d81bc03..0a478cc2 100644 --- a/gpu4pyscf/__config__.py +++ b/gpu4pyscf/__config__.py @@ -20,9 +20,9 @@ number_of_threads = 1024 * 80 # such as A30-24GB elif props['totalGlobalMem'] >= 16 * GB: - min_ao_blksize = 64 - min_grid_blksize = 64*64 - ao_aligned = 16 + min_ao_blksize = 128 + min_grid_blksize = 128*128 + ao_aligned = 32 grid_aligned = 128 mem_fraction = 0.9 number_of_threads = 1024 * 80 diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py index da59f9da..76a14f28 100644 --- a/gpu4pyscf/df/df.py +++ b/gpu4pyscf/df/df.py @@ -22,7 +22,7 @@ from pyscf import lib from pyscf.df import df, addons from gpu4pyscf.lib.cupy_helper import ( - cholesky, tag_array, get_avail_mem, cart2sph) + cholesky, tag_array, get_avail_mem, cart2sph, take_last2d) from gpu4pyscf.df import int3c2e, df_jk from gpu4pyscf.lib import logger from gpu4pyscf import __config__ @@ -77,13 +77,13 @@ def build(self, direct_scf_tol=1e-14, omega=None): j2c_cpu = auxmol.intor('int2c2e', hermi=1) else: j2c_cpu = auxmol.intor('int2c2e', hermi=1) - j2c = cupy.asarray(j2c_cpu) + j2c = cupy.asarray(j2c_cpu, order='C') t0 = log.timer_debug1('2c2e', *t0) intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e') intopt.build(direct_scf_tol, diag_block_with_triu=False, aosym=True, group_size=256) log.timer_debug1('prepare intopt', *t0) self.j2c = j2c.copy() - j2c = j2c[cupy.ix_(intopt.sph_aux_idx, intopt.sph_aux_idx)] + j2c = take_last2d(j2c, intopt.sph_aux_idx) try: self.cd_low = cholesky(j2c) self.cd_low = tag_array(self.cd_low, tag='cd') diff --git a/gpu4pyscf/df/grad/rhf.py b/gpu4pyscf/df/grad/rhf.py index febdee40..69231337 100644 --- a/gpu4pyscf/df/grad/rhf.py +++ b/gpu4pyscf/df/grad/rhf.py @@ -20,7 +20,7 @@ from pyscf.df.grad import rhf from pyscf import lib, scf, gto from gpu4pyscf.df import int3c2e -from gpu4pyscf.lib.cupy_helper import print_mem_info, tag_array, unpack_tril, contract, load_library +from gpu4pyscf.lib.cupy_helper import print_mem_info, tag_array, unpack_tril, contract, load_library, take_last2d from gpu4pyscf.grad.rhf import grad_elec from gpu4pyscf import __config__ from gpu4pyscf.lib import logger @@ -59,7 +59,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega mo_coeff = cupy.asarray(mf_grad.base.mo_coeff) mo_occ = cupy.asarray(mf_grad.base.mo_occ) sph_ao_idx = intopt.sph_ao_idx - dm = dm0[numpy.ix_(sph_ao_idx, sph_ao_idx)] + dm = take_last2d(dm0, sph_ao_idx) orbo = contract('pi,i->pi', mo_coeff[:,mo_occ>0], numpy.sqrt(mo_occ[mo_occ>0])) orbo = orbo[sph_ao_idx, :] nocc = orbo.shape[-1] @@ -119,7 +119,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega #rhok = solve_triangular(low_t, rhok, lower=False) rhok = solve_triangular(low_t, rhok.reshape(naux, -1), lower=False, overwrite_b=True).reshape(naux, nocc, nocc) tmp = contract('pij,qij->pq', rhok, rhok) - tmp = tmp[cupy.ix_(rev_aux_idx, rev_aux_idx)] + tmp = take_last2d(tmp, rev_aux_idx) vkaux = -contract('xpq,pq->xp', int2c_e1, tmp) vkaux_2c = cupy.array([-vkaux[:,p0:p1].sum(axis=1) for p0, p1 in auxslices[:,2:]]) vkaux = tmp = None diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py index 2a06692f..710963a9 100644 --- a/gpu4pyscf/df/hessian/rhf.py +++ b/gpu4pyscf/df/hessian/rhf.py @@ -38,7 +38,7 @@ import numpy as np from pyscf import lib, df from gpu4pyscf.hessian import rhf as rhf_hess -from gpu4pyscf.lib.cupy_helper import contract, tag_array, release_gpu_stack, print_mem_info +from gpu4pyscf.lib.cupy_helper import contract, tag_array, release_gpu_stack, print_mem_info, take_last2d from gpu4pyscf.df import int3c2e from gpu4pyscf.lib import logger @@ -65,7 +65,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, if mo_coeff is None: mo_coeff = mf.mo_coeff if atmlst is None: atmlst = range(mol.natm) - mo_coeff = cupy.asarray(mo_coeff) + mo_coeff = cupy.asarray(mo_coeff, order='C') nao, nmo = mo_coeff.shape mocc = mo_coeff[:,mo_occ>0] mocc_2 = cupy.einsum('pi,i->pi', mocc, mo_occ[mo_occ>0]**.5) @@ -77,8 +77,8 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, # overlap matrix contributions # ------------------------------------ s1aa, s1ab, _ = rhf_hess.get_ovlp(mol) - s1aa = cupy.asarray(s1aa) - s1ab = cupy.asarray(s1ab) + s1aa = cupy.asarray(s1aa, order='C') + s1ab = cupy.asarray(s1ab, order='C') auxmol = df.addons.make_auxmol(mol, auxbasis=mf.with_df.auxbasis) naux = auxmol.nao @@ -101,15 +101,15 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, sph_aux_idx = intopt.sph_aux_idx mocc_2 = mocc_2[sph_ao_idx, :] - dm0 = dm0[cupy.ix_(sph_ao_idx, sph_ao_idx)] + dm0 = take_last2d(dm0, sph_ao_idx) dm0_tag = tag_array(dm0, occ_coeff=mocc_2) - int2c = cupy.asarray(int2c) - int2c = int2c[cupy.ix_(sph_aux_idx, sph_aux_idx)] + int2c = cupy.asarray(int2c, order='C') + int2c = take_last2d(int2c, sph_aux_idx) int2c_inv = cupy.linalg.pinv(int2c, rcond=1e-12) - int2c_ip1 = cupy.asarray(int2c_ip1) - int2c_ip1 = int2c_ip1[cupy.ix_(np.arange(3), sph_aux_idx, sph_aux_idx)] + int2c_ip1 = cupy.asarray(int2c_ip1, order='C') + int2c_ip1 = take_last2d(int2c_ip1, sph_aux_idx) int2c_ip1_inv = contract('yqp,pr->yqr', int2c_ip1, int2c_inv) hj_ao_ao = cupy.zeros([nao,nao,3,3]) @@ -223,14 +223,13 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, # int2c contributions if hessobj.auxbasis_response > 1: - aux_aux_9 = np.ix_(np.arange(9), sph_aux_idx, sph_aux_idx) if omega and omega > 1e-10: with auxmol.with_range_coulomb(omega): int2c_ipip1 = auxmol.intor('int2c2e_ipip1', aosym='s1') else: int2c_ipip1 = auxmol.intor('int2c2e_ipip1', aosym='s1') - int2c_ipip1 = cupy.asarray(int2c_ipip1) - int2c_ipip1 = int2c_ipip1[aux_aux_9] + int2c_ipip1 = cupy.asarray(int2c_ipip1, order='C') + int2c_ipip1 = take_last2d(int2c_ipip1, sph_aux_idx) rhoj2c_P = contract('xpq,q->xp', int2c_ipip1, rhoj0_P) # (00|0)(2|0)(0|00) hj_aux_diag -= cupy.einsum('p,xp->px', rhoj0_P, rhoj2c_P).reshape(-1,3,3) @@ -244,13 +243,13 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, int2c_ip1ip2 = auxmol.intor('int2c2e_ip1ip2', aosym='s1') else: int2c_ip1ip2 = auxmol.intor('int2c2e_ip1ip2', aosym='s1') - int2c_ip1ip2 = cupy.asarray(int2c_ip1ip2) - int2c_ip1ip2 = int2c_ip1ip2[aux_aux_9] + int2c_ip1ip2 = cupy.asarray(int2c_ip1ip2, order='C') + int2c_ip1ip2 = take_last2d(int2c_ip1ip2, sph_aux_idx) hj_aux_aux = -.5 * cupy.einsum('p,xpq,q->pqx', rhoj0_P, int2c_ip1ip2, rhoj0_P).reshape(naux, naux,3,3) if with_k: hk_aux_aux = -.5 * contract('xpq,pq->pqx', int2c_ip1ip2, rho2c_0).reshape(naux,naux,3,3) t1 = log.timer_debug1('intermediate variables with int2c_*', *t1) - int2c_ip1ip2 = aux_aux_9 = None + int2c_ip1ip2 = None cupy.get_default_memory_pool().free_all_blocks() release_gpu_stack() @@ -402,8 +401,8 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, atmlst = range(mol.natm) # FIXME with_k = True - mo_coeff = cupy.asarray(mo_coeff) - mo_occ = cupy.asarray(mo_occ) + mo_coeff = cupy.asarray(mo_coeff, order='C') + mo_occ = cupy.asarray(mo_occ, order='C') mf = hessobj.base #auxmol = hessobj.base.with_df.auxmol @@ -420,7 +419,7 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, int2c = auxmol.intor('int2c2e', aosym='s1') else: int2c = auxmol.intor('int2c2e', aosym='s1') - int2c = cupy.asarray(int2c) + int2c = cupy.asarray(int2c, order='C') # ======================= sorted AO begin ====================================== intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e') intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size_aux=BLKSIZE, group_size=BLKSIZE) @@ -430,10 +429,10 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, mocc = mocc[sph_ao_idx, :] mo_coeff = mo_coeff[sph_ao_idx,:] - dm0 = dm0[cupy.ix_(sph_ao_idx, sph_ao_idx)] + dm0 = take_last2d(dm0, sph_ao_idx) dm0_tag = tag_array(dm0, occ_coeff=mocc) - int2c = int2c[cupy.ix_(sph_aux_idx, sph_aux_idx)] + int2c = take_last2d(int2c, sph_aux_idx) int2c_inv = cupy.linalg.pinv(int2c, rcond=1e-12) wj, wk_Pl_, wk_P__ = int3c2e.get_int3c2e_wjk(mol, auxmol, dm0_tag, omega=omega) @@ -451,8 +450,8 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, # int3c_ip1 contributions vj1_buf, vk1_buf, vj1_ao, vk1_ao = int3c2e.get_int3c2e_ip1_vjk(intopt, rhoj0, rhok0_Pl_, dm0_tag, aoslices, omega=omega) - vj1_buf = vj1_buf[cupy.ix_(numpy.arange(3), rev_ao_idx, rev_ao_idx)] - vk1_buf = vk1_buf[cupy.ix_(numpy.arange(3), rev_ao_idx, rev_ao_idx)] + vj1_buf = take_last2d(vj1_buf, rev_ao_idx) + vk1_buf = take_last2d(vk1_buf, rev_ao_idx) vj1_int3c_ip1 = -contract('nxiq,ip->nxpq', vj1_ao, mo_coeff) vk1_int3c_ip1 = -contract('nxiq,ip->nxpq', vk1_ao, mo_coeff) @@ -472,8 +471,8 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, int2c_ip1 = auxmol.intor('int2c2e_ip1', aosym='s1') else: int2c_ip1 = auxmol.intor('int2c2e_ip1', aosym='s1') - int2c_ip1 = cupy.asarray(int2c_ip1) - int2c_ip1 = int2c_ip1[cupy.ix_(np.arange(3), sph_aux_idx, sph_aux_idx)] + int2c_ip1 = cupy.asarray(int2c_ip1, order='C') + int2c_ip1 = take_last2d(int2c_ip1, sph_aux_idx) wj0_10 = contract('xpq,q->xp', int2c_ip1, rhoj0) wk0_10_P__ = contract('xqp,pro->xqro', int2c_ip1, rhok0_P__) @@ -527,7 +526,7 @@ def _ao2mo(mat): vk1_ao[:,:,p0:p1] -= vk1_buf[:,p0:p1,:].transpose(0,2,1) h1 = hcore_deriv(ia) - h1 = _ao2mo(cupy.asarray(h1)) + h1 = _ao2mo(cupy.asarray(h1, order='C')) vj1 = vj1_int3c[ia] + _ao2mo(vj1_ao) if with_k: vk1 = vk1_int3c[ia] + _ao2mo(vk1_ao) diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py index a542a7fb..dda6cd50 100644 --- a/gpu4pyscf/df/int3c2e.py +++ b/gpu4pyscf/df/int3c2e.py @@ -22,7 +22,7 @@ from pyscf.scf import _vhf from gpu4pyscf.scf.hf import BasisProdCache, _make_s_index_offsets from gpu4pyscf.lib.cupy_helper import ( - block_c2s_diag, cart2sph, block_diag, contract, load_library, c2s_l, get_avail_mem, print_mem_info) + block_c2s_diag, cart2sph, block_diag, contract, load_library, c2s_l, get_avail_mem, print_mem_info, take_last2d) from gpu4pyscf.lib import logger LMAX_ON_GPU = 8 @@ -1183,8 +1183,8 @@ def get_dh1e(mol, dm0): fakemol = gto.fakemol_for_charges(coords) intopt = VHFOpt(mol, fakemol, 'int2e') intopt.build(1e-14, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE) - dm0_sorted = dm0[cupy.ix_(intopt.sph_ao_idx, intopt.sph_ao_idx)] - + #dm0_sorted = dm0[cupy.ix_(intopt.sph_ao_idx, intopt.sph_ao_idx)] + dm0_sorted = take_last2d(dm0, intopt.sph_ao_idx) dh1e = cupy.zeros([natm,3]) for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ip1'): dh1e[k0:k1,:3] += cupy.einsum('xkji,ij->kx', int3c_blk, dm0_sorted[i0:i1,j0:j1]) diff --git a/gpu4pyscf/df/tests/test_df_hessian.py b/gpu4pyscf/df/tests/test_df_hessian.py index 04dde225..18b55c80 100644 --- a/gpu4pyscf/df/tests/test_df_hessian.py +++ b/gpu4pyscf/df/tests/test_df_hessian.py @@ -39,6 +39,7 @@ def setUpModule(): def tearDownModule(): global mol + mol.stdout.close() del mol def _make_rhf(): diff --git a/gpu4pyscf/scf/cphf.py b/gpu4pyscf/scf/cphf.py index fdd45453..52b4a84f 100644 --- a/gpu4pyscf/scf/cphf.py +++ b/gpu4pyscf/scf/cphf.py @@ -28,7 +28,7 @@ from gpu4pyscf.lib import logger def solve(fvind, mo_energy, mo_occ, h1, s1=None, - max_cycle=50, tol=1e-9, hermi=False, verbose=logger.WARN): + max_cycle=50, tol=1e-7, hermi=False, verbose=logger.WARN): ''' Args: fvind : function