Skip to content

Commit

Permalink
optimize hessian & gpu timer
Browse files Browse the repository at this point in the history
  • Loading branch information
wxj6000 committed Nov 3, 2023
1 parent ac37238 commit d153533
Show file tree
Hide file tree
Showing 24 changed files with 815 additions and 267 deletions.
4 changes: 2 additions & 2 deletions examples/00-h2o.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
H 0.7570000000 0.0000000000 -0.4696000000
'''

xc='LDA'
xc='B3LYP'
bas='def2-tzvpp'
auxbasis='def2-tzvpp-jkfit'
scf_tol = 1e-10
Expand All @@ -34,7 +34,7 @@

mol = pyscf.M(atom=atom, basis=bas, max_memory=32000)

mol.verbose = 4
mol.verbose = 6
mf_GPU = rks.RKS(mol, xc=xc).density_fit(auxbasis=auxbasis)
mf_GPU.grids.level = grids_level
mf_GPU.conv_tol = scf_tol
Expand Down
2 changes: 1 addition & 1 deletion examples/dft_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
basis=bas,
max_memory=32000)
# set verbose >= 6 for debugging timer
mol.verbose = 6
mol.verbose = 1

mf_df = rks.RKS(mol, xc=args.xc).density_fit(auxbasis=args.auxbasis)
if args.solvent:
Expand Down
4 changes: 2 additions & 2 deletions gpu4pyscf/df/df.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,8 @@ def build(self, direct_scf_tol=1e-14, omega=None):
idx = np.arange(nao)
self.diag_idx = cupy.asarray(idx*(idx+1)//2+idx)

t0 = (logger.process_clock(), logger.perf_counter())
log = logger.new_logger(mol, mol.verbose)
t0 = log.init_timer()
if auxmol is None:
self.auxmol = auxmol = addons.make_auxmol(mol, self.auxbasis)

Expand Down Expand Up @@ -217,7 +217,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False):
nq = len(intopt.log_qs)
for cp_ij_id, _ in enumerate(intopt.log_qs):
if len(intopt.ao_pairs_row[cp_ij_id]) == 0: continue
t1 = (logger.process_clock(), logger.perf_counter())
t1 = log.init_timer()
cpi = intopt.cp_idx[cp_ij_id]
cpj = intopt.cp_jdx[cp_ij_id]
li = intopt.angular[cpi]
Expand Down
4 changes: 2 additions & 2 deletions gpu4pyscf/df/df_jk.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@
import cupy
import numpy
from pyscf import lib, scf, __config__
from pyscf.lib import logger
from pyscf.scf import dhf
from pyscf.df import df_jk, addons
from gpu4pyscf.lib import logger
from gpu4pyscf.lib.cupy_helper import contract, take_last2d, transpose_sum, load_library, get_avail_mem
from gpu4pyscf.dft import rks, numint
from gpu4pyscf.scf import hf
Expand Down Expand Up @@ -250,7 +250,7 @@ def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e-
nao = dms_tag.shape[-1]
dms = dms_tag.reshape([-1,nao,nao])
nset = dms.shape[0]
t0 = (logger.process_clock(), logger.perf_counter())
t0 = log.init_timer()
if dfobj._cderi is None:
log.debug('CDERI not found, build...')
dfobj.build(direct_scf_tol=direct_scf_tol, omega=omega)
Expand Down
4 changes: 2 additions & 2 deletions gpu4pyscf/df/grad/rhf.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,12 @@
import cupy
from cupyx.scipy.linalg import solve_triangular
from pyscf.df.grad import rhf
from pyscf.lib import logger
from pyscf import lib, scf, gto
from gpu4pyscf.df import int3c2e
from gpu4pyscf.lib.cupy_helper import print_mem_info, tag_array, unpack_tril, contract, load_library
from gpu4pyscf.grad.rhf import grad_elec
from gpu4pyscf import __config__
from gpu4pyscf.lib import logger

libcupy_helper = load_library('libcupy_helper')

Expand Down Expand Up @@ -154,7 +154,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
vkaux = cupy.zeros((3,naux_cart))
cupy.get_default_memory_pool().free_all_blocks()
for cp_kl_id in range(len(intopt.aux_log_qs)):
t1 = (logger.process_clock(), logger.perf_counter())
t1 = log.init_timer()
k0, k1 = intopt.cart_aux_loc[cp_kl_id], intopt.cart_aux_loc[cp_kl_id+1]
assert k1-k0 <= block_size
if with_j:
Expand Down
2 changes: 1 addition & 1 deletion gpu4pyscf/df/grad/rks.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def get_veff(ks_grad, mol=None, dm=None):
'''
if mol is None: mol = ks_grad.mol
if dm is None: dm = ks_grad.base.make_rdm1()
t0 = (logger.process_clock(), logger.perf_counter())
t0 = logger.init_timer(ks_grad)

mf = ks_grad.base
ni = mf._numint
Expand Down
32 changes: 17 additions & 15 deletions gpu4pyscf/df/hessian/rhf.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
'''Partial derivative
'''
log = logger.new_logger(hessobj, verbose)
time0 = t1 = (logger.process_clock(), logger.perf_counter())
time0 = t1 = log.init_timer()

mol = hessobj.mol
mf = hessobj.base
Expand Down Expand Up @@ -393,12 +393,14 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
verbose=None, with_k=True, omega=None):
log = logger.new_logger(hessobj, verbose)
t0 = (logger.process_clock(), logger.perf_counter())
t0 = log.init_timer()
mol = hessobj.mol
if atmlst is None:
atmlst = range(mol.natm)
# FIXME
with_k = True
mo_coeff = cupy.asarray(mo_coeff)
mo_occ = cupy.asarray(mo_occ)

mf = hessobj.base
#auxmol = hessobj.base.with_df.auxmol
Expand Down Expand Up @@ -441,16 +443,16 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
rhok0_Pl_ = np.empty_like(wk_Pl_)
for p0, p1 in lib.prange(0,nao,64):
wk_tmp = cupy.asarray(wk_Pl_[:,p0:p1])
rhok0_Pl_[:,p0:p1] = cupy.einsum('pq,qio->pio', int2c_inv, wk_tmp).get()
rhok0_Pl_[:,p0:p1] = contract('pq,qio->pio', int2c_inv, wk_tmp).get()
wj = wk_Pl_ = wk_P__ = int2c_inv = int2c = None

# int3c_ip1 contributions
vj1_buf, vk1_buf, vj1_ao, vk1_ao = int3c2e.get_int3c2e_ip1_vjk(intopt, rhoj0, rhok0_Pl_, dm0_tag, aoslices, omega=omega)
vj1_buf = vj1_buf[cupy.ix_(numpy.arange(3), rev_ao_idx, rev_ao_idx)]
vk1_buf = vk1_buf[cupy.ix_(numpy.arange(3), rev_ao_idx, rev_ao_idx)]

vj1_int3c_ip1 = -cupy.einsum('nxiq,ip->nxpq', vj1_ao, mo_coeff)
vk1_int3c_ip1 = -cupy.einsum('nxiq,ip->nxpq', vk1_ao, mo_coeff)
vj1_int3c_ip1 = -contract('nxiq,ip->nxpq', vj1_ao, mo_coeff)
vk1_int3c_ip1 = -contract('nxiq,ip->nxpq', vk1_ao, mo_coeff)
vj1_ao = vk1_ao = None
t0 = log.timer_debug1('Fock matrix due to int3c2e_ip1', *t0)

Expand All @@ -475,15 +477,15 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,

for p0, p1 in lib.prange(0,nao,64):
rhok_tmp = cupy.asarray(rhok0_Pl_[:,p0:p1])
vj1_tmp = cupy.einsum('pio,xp->xpio', rhok_tmp, wj0_10)
vj1_tmp = contract('pio,xp->xpio', rhok_tmp, wj0_10)

wk0_10_Pl_ = cupy.einsum('xqp,pio->xqio', int2c_ip1, rhok_tmp)
vj1_tmp += cupy.einsum('xpio,p->xpio', wk0_10_Pl_, rhoj0)
vj1_int3c_ip2[:,:,p0:p1] += cupy.einsum('xpio,pa->axio', vj1_tmp, aux2atom)
wk0_10_Pl_ = contract('xqp,pio->xqio', int2c_ip1, rhok_tmp)
vj1_tmp += contract('xpio,p->xpio', wk0_10_Pl_, rhoj0)
vj1_int3c_ip2[:,:,p0:p1] += contract('xpio,pa->axio', vj1_tmp, aux2atom)
if with_k:
vk1_tmp = 2.0 * cupy.einsum('xpio,pro->xpir', wk0_10_Pl_, rhok0_P__)
vk1_tmp += 2.0 * cupy.einsum('xpro,pir->xpio', wk0_10_P__, rhok_tmp)
vk1_int3c_ip2[:,:,p0:p1] += cupy.einsum('xpio,pa->axio', vk1_tmp, aux2atom)
vk1_tmp = 2.0 * contract('xpio,pro->xpir', wk0_10_Pl_, rhok0_P__)
vk1_tmp += 2.0 * contract('xpro,pir->xpio', wk0_10_P__, rhok_tmp)
vk1_int3c_ip2[:,:,p0:p1] += contract('xpio,pa->axio', vk1_tmp, aux2atom)
wj0_10 = wk0_10_P__ = rhok0_P__ = int2c_ip1 = None
vj1_tmp = vk1_tmp = wk0_10_Pl_ = rhoj0 = rhok0_Pl_ = None
aux2atom = None
Expand All @@ -498,8 +500,8 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,

# ========================== sorted AO end ================================
def _ao2mo(mat):
tmp = cupy.einsum('xij,jo->xio', mat, mocc)
return cupy.einsum('xik,ip->xpk', tmp, mo_coeff)
tmp = contract('xij,jo->xio', mat, mocc)
return contract('xik,ip->xpk', tmp, mo_coeff)

vj1_int3c = vj1_int3c_ip1 + vj1_int3c_ip2
vj1_int3c_ip1 = vj1_int3c_ip2 = None
Expand All @@ -522,7 +524,7 @@ def _ao2mo(mat):
vk1_ao[:,:,p0:p1] -= vk1_buf[:,p0:p1,:].transpose(0,2,1)

h1 = hcore_deriv(ia)
h1 = _ao2mo(h1)
h1 = _ao2mo(cupy.asarray(h1))
vj1 = vj1_int3c[ia] + _ao2mo(vj1_ao)
if with_k:
vk1 = vk1_int3c[ia] + _ao2mo(vk1_ao)
Expand Down
2 changes: 1 addition & 1 deletion gpu4pyscf/df/hessian/rks.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
atmlst=None, max_memory=4000, verbose=None):
log = logger.new_logger(hessobj, verbose)
time0 = t1 = (logger.process_clock(), logger.perf_counter())
time0 = t1 = log.init_timer()

mol = hessobj.mol
mf = hessobj.base
Expand Down
28 changes: 12 additions & 16 deletions gpu4pyscf/df/int3c2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ def build(self, cutoff=1e-14, group_size=None,
a tot_mol is created with concatenating [mol, fake_mol, aux_mol]
we will pair (ao,ao) and (aux,1) separately.
'''
cput0 = (logger.process_clock(), logger.perf_counter())
cput0 = logger.init_timer(self.mol)
sorted_mol, sorted_idx, uniq_l_ctr, l_ctr_counts = sort_mol(self.mol)
if group_size is not None :
uniq_l_ctr, l_ctr_counts = _split_l_ctr_groups(uniq_l_ctr, l_ctr_counts, group_size)
Expand Down Expand Up @@ -314,6 +314,7 @@ def build(self, cutoff=1e-14, group_size=None,
tot_mol._atm.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(tot_mol.natm),
tot_mol._bas.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(tot_mol.nbas),
tot_mol._env.ctypes.data_as(ctypes.c_void_p))

cput1 = logger.timer_debug1(tot_mol, 'Initialize GPU cache', *cput1)
self.bas_pairs_locs = bas_pairs_locs
ncptype = len(self.log_qs)
Expand Down Expand Up @@ -745,29 +746,24 @@ def get_int3c2e_ip1_vjk(intopt, rhoj, rhok, dm0_tag, aoslices, with_k=True, omeg
vj1_buf[:,i0:i1,j0:j1] += contract('xpji,p->xij', int3c_blk, rhoj[k0:k1])
# initialize intermediate variables
if count % ncp_ij == 0:
rhoj0 = cupy.zeros([3,k1-k0,nao_sph])
rhok_tmp = cupy.asarray(rhok[k0:k1])
vj1_ao = cupy.zeros([3,nao_sph,nao_sph,nocc])
if with_k:
rhok0_slice = contract('pio,Jo->piJ', rhok_tmp, orbo) * 2
rhok0 = contract('pli,lo->poi', rhok0_slice, orbo)
int3c_ip1_occ = cupy.zeros([3,k1-k0,nao_sph,nocc])
vk1_ao = cupy.zeros([3,nao_sph,nao_sph,nocc])

# contraction
rhoj0[:,:,i0:i1] += contract('xpji,ij->xpi', int3c_blk, dm0_tag[i0:i1,j0:j1])
rhoj0 = contract('xpji,ij->xpi', int3c_blk, dm0_tag[i0:i1,j0:j1])
vj1_ao = contract('pJo,xpi->xiJo', rhok_tmp, rhoj0)
vj1 += 2.0*contract('xiJo,ia->axJo', vj1_ao, ao2atom[i0:i1])

if with_k:
int3c_ip1_occ[:,:,i0:i1] += contract('xpji,jo->xpio', int3c_blk, orbo[j0:j1])
vk1_ao[:,i0:i1,j0:j1] += contract('xpji,poi->xijo', int3c_blk, rhok0[:,:,i0:i1])
vk1_buf[:,i0:i1] += contract('xpji,plj->xil', int3c_blk, rhok0_slice[:,:,j0:j1])

# reduction
if (count+1) % ncp_ij == 0:
vj1_ao += contract('pjo,xpi->xijo', rhok_tmp, rhoj0)
vj1 += 2.0*contract('xiko,ia->axko', vj1_ao, ao2atom)
if with_k:
vk1_ao += contract('xpio,pki->xiko', int3c_ip1_occ, rhok0_slice)
vk1 += contract('xiko,ia->axko', vk1_ao, ao2atom)
vk1_ao = contract('xpji,poi->xijo', int3c_blk, rhok0[:,:,i0:i1])
vk1[:,:,j0:j1] += contract('xijo,ia->axjo', vk1_ao, ao2atom[i0:i1])

int3c_ip1_occ = contract('xpji,jo->xpio', int3c_blk, orbo[j0:j1])
vk1_ao = contract('xpio,pJi->xiJo', int3c_ip1_occ, rhok0_slice[:,:,i0:i1])
vk1 += contract('xiJo,ia->axJo', vk1_ao, ao2atom[i0:i1])
count += 1

return vj1_buf, vk1_buf, vj1, vk1
Expand Down
35 changes: 18 additions & 17 deletions gpu4pyscf/dft/libxc.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
import cupy
from pyscf import dft

libxc = np.ctypeslib.load_library(
_libxc = np.ctypeslib.load_library(
'libxc', os.path.abspath(os.path.join(__file__, '..', '..', 'lib', 'deps', 'lib')))

def _check_arrays(current_arrays, fields, factor, required):
Expand All @@ -45,31 +45,32 @@ class _xcfun(ctypes.Structure):
pass

_xc_func_p = ctypes.POINTER(_xcfun)
libxc.xc_func_alloc.restype = _xc_func_p
libxc.xc_func_init.argtypes = (_xc_func_p, ctypes.c_int, ctypes.c_int)
libxc.xc_func_end.argtypes = (_xc_func_p, )
libxc.xc_func_free.argtypes = (_xc_func_p, )
_libxc.xc_func_alloc.restype = _xc_func_p
_libxc.xc_func_init.argtypes = (_xc_func_p, ctypes.c_int, ctypes.c_int)
_libxc.xc_func_end.argtypes = (_xc_func_p, )
_libxc.xc_func_free.argtypes = (_xc_func_p, )

class XCfun:
def __init__(self, xc, spin):
assert spin == 'unpolarized'
self._spin = 1
self.xc_func = libxc.xc_func_alloc()
self.xc_func = _libxc.xc_func_alloc()
if isinstance(xc, str):
self.func_id = libxc.xc_functional_get_number(ctypes.c_char_p(xc.encode()))
self.func_id = _libxc.xc_functional_get_number(ctypes.c_char_p(xc.encode()))
else:
self.func_id = xc
ret = libxc.xc_func_init(self.xc_func, self.func_id, self._spin)
ret = _libxc.xc_func_init(self.xc_func, self.func_id, self._spin)
if ret != 0:
raise RuntimeError('failed to initialize xc fun')
self._family = dft.libxc.xc_type(xc)

def __del__(self):
if self.xc_func is None:
return
libxc.xc_func_end(self.xc_func)
libxc.xc_func_free(self.xc_func)

# TODO: deallocate xc func
#_libxc.xc_func_end(self.xc_func)
#_libxc.xc_func_free(self.xc_func)

def needs_laplacian(self):
return dft.libxc.needs_laplacian(self.func_id)

Expand All @@ -85,7 +86,7 @@ def compute(self, inp, output=None, do_exc=True, do_vxc=True, do_fxc=False, do_k
npoints = int(inp["rho"].size / self._spin)
if (inp["rho"].size % self._spin):
raise ValueError("Rho input has an invalid shape, must be divisible by %d" % self._spin)

# Find the right compute function
args = [self.xc_func, ctypes.c_size_t(npoints)]
if self._family == 'LDA':
Expand Down Expand Up @@ -114,7 +115,7 @@ def compute(self, inp, output=None, do_exc=True, do_vxc=True, do_fxc=False, do_k
if(isinstance(arg, cupy.ndarray)):
arg = ctypes.cast(arg.data.ptr, ctypes.c_void_p)
cuda_args.append(arg)
libxc.xc_lda(*cuda_args)
_libxc.xc_lda(*cuda_args)
elif self._family == 'GGA':
input_labels = ["rho", "sigma"]
input_num_args = 2
Expand All @@ -141,7 +142,7 @@ def compute(self, inp, output=None, do_exc=True, do_vxc=True, do_fxc=False, do_k
if(isinstance(arg, cupy.ndarray)):
arg = ctypes.cast(arg.data.ptr, ctypes.c_void_p)
cuda_args.append(arg)
libxc.xc_gga(*cuda_args)
_libxc.xc_gga(*cuda_args)

elif self._family == 'MGGA':
# Build input args
Expand Down Expand Up @@ -178,7 +179,7 @@ def compute(self, inp, output=None, do_exc=True, do_vxc=True, do_fxc=False, do_k
output = _check_arrays(output, output_labels[5:15], npoints, do_fxc)
output = _check_arrays(output, output_labels[15:35], npoints, do_kxc)
output = _check_arrays(output, output_labels[35:70], npoints, do_lxc)

args.extend([ inp[x] for x in input_labels])
if not self.needs_laplacian():
args.insert(-1, cupy.empty((1))) # Add none ptr to laplacian
Expand All @@ -189,10 +190,10 @@ def compute(self, inp, output=None, do_exc=True, do_vxc=True, do_fxc=False, do_k
if(isinstance(arg, cupy.ndarray)):
arg = ctypes.cast(arg.data.ptr, ctypes.c_void_p)
cuda_args.append(arg)
libxc.xc_mgga(*cuda_args)
_libxc.xc_mgga(*cuda_args)
else:
raise KeyError("Functional kind not recognized!")

return {k: v for k, v in zip(output_labels, args[2+input_num_args:]) if v is not None}


Loading

0 comments on commit d153533

Please sign in to comment.