diff --git a/gpu4pyscf/df/df_jk.py b/gpu4pyscf/df/df_jk.py index 83d94acf..8863a89a 100644 --- a/gpu4pyscf/df/df_jk.py +++ b/gpu4pyscf/df/df_jk.py @@ -61,7 +61,7 @@ def build_df(): ni = mf._numint rks.initialize_grids(mf, mf.mol, dm0) ni.build(mf.mol, mf.grids.coords) - mf._numint.xcfuns = numint._init_xcfuns(mf.xc) + mf._numint.xcfuns = numint._init_xcfuns(mf.xc, dm0.ndim==3) dm0 = cupy.asarray(dm0) return diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py index 710963a9..fd283d82 100644 --- a/gpu4pyscf/df/hessian/rhf.py +++ b/gpu4pyscf/df/hessian/rhf.py @@ -68,11 +68,9 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, mo_coeff = cupy.asarray(mo_coeff, order='C') nao, nmo = mo_coeff.shape mocc = mo_coeff[:,mo_occ>0] - mocc_2 = cupy.einsum('pi,i->pi', mocc, mo_occ[mo_occ>0]**.5) + mocc_2 = mocc * mo_occ[mo_occ>0]**.5 dm0 = cupy.dot(mocc, mocc.T) * 2 - hcore_deriv = hessobj.hcore_generator(mol) - # ------------------------------------ # overlap matrix contributions # ------------------------------------ @@ -107,10 +105,10 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, int2c = cupy.asarray(int2c, order='C') int2c = take_last2d(int2c, sph_aux_idx) int2c_inv = cupy.linalg.pinv(int2c, rcond=1e-12) + int2c = None int2c_ip1 = cupy.asarray(int2c_ip1, order='C') int2c_ip1 = take_last2d(int2c_ip1, sph_aux_idx) - int2c_ip1_inv = contract('yqp,pr->yqr', int2c_ip1, int2c_inv) hj_ao_ao = cupy.zeros([nao,nao,3,3]) hk_ao_ao = cupy.zeros([nao,nao,3,3]) @@ -143,10 +141,10 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, hj_ao_aux -= contract('q,iqxy->iqxy', rhoj0_P, wj1_01) # (10|0)(0|1)(0|00) wj1_01 = None + int2c_ip1_inv = contract('yqp,pr->yqr', int2c_ip1, int2c_inv) if with_k: if hessobj.auxbasis_response: wk1_P__ = contract('ypq,qor->ypor', int2c_ip1, rhok0_P__) - int2c_ip1_inv = cupy.asarray(int2c_ip1_inv) for i0, i1 in lib.prange(0,nao,64): wk1_Pko_islice = cupy.asarray(wk1_Pko[i0:i1]) @@ -232,7 +230,8 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, int2c_ipip1 = take_last2d(int2c_ipip1, sph_aux_idx) rhoj2c_P = contract('xpq,q->xp', int2c_ipip1, rhoj0_P) # (00|0)(2|0)(0|00) - hj_aux_diag -= cupy.einsum('p,xp->px', rhoj0_P, rhoj2c_P).reshape(-1,3,3) + # p,xp->px + hj_aux_diag -= (rhoj0_P*rhoj2c_P).T.reshape(-1,3,3) if with_k: rho2c_0 = contract('pij,qji->pq', rhok0_P__, rhok0_P__) hk_aux_diag -= .5 * contract('pq,xpq->px', rho2c_0, int2c_ipip1).reshape(-1,3,3) @@ -245,7 +244,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, int2c_ip1ip2 = auxmol.intor('int2c2e_ip1ip2', aosym='s1') int2c_ip1ip2 = cupy.asarray(int2c_ip1ip2, order='C') int2c_ip1ip2 = take_last2d(int2c_ip1ip2, sph_aux_idx) - hj_aux_aux = -.5 * cupy.einsum('p,xpq,q->pqx', rhoj0_P, int2c_ip1ip2, rhoj0_P).reshape(naux, naux,3,3) + hj_aux_aux = -.5 * contract('p,xpq->pqx', rhoj0_P, int2c_ip1ip2*rhoj0_P).reshape(naux, naux,3,3) if with_k: hk_aux_aux = -.5 * contract('xpq,pq->pqx', int2c_ip1ip2, rho2c_0).reshape(naux,naux,3,3) t1 = log.timer_debug1('intermediate variables with int2c_*', *t1) @@ -309,7 +308,9 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, #======================================== sort AO end =========================================== # Energy weighted density matrix - dme0 = cupy.einsum('pi,qi,i->pq', mocc, mocc, mo_energy[mo_occ>0]) * 2.0 + # pi,qi,i->pq + dme0 = cupy.dot(mocc, (mocc * mo_energy[mo_occ>0] * 2).T) + hcore_deriv = hessobj.hcore_generator(mol) # ----------------------------------------- # collecting all # ----------------------------------------- @@ -318,18 +319,18 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, ek = cupy.zeros([len(atmlst),len(atmlst),3,3]) for i0, ia in enumerate(atmlst): shl0, shl1, p0, p1 = aoslices[ia] - e1[i0,i0] -= cupy.einsum('xypq,pq->xy', s1aa[:,:,p0:p1], dme0[p0:p1]) * 2.0 + e1[i0,i0] -= contract('xypq,pq->xy', s1aa[:,:,p0:p1], dme0[p0:p1]) * 2.0 ej[i0,i0] += cupy.sum(hj_ao_diag[p0:p1,:,:], axis=0) if with_k: ek[i0,i0] += cupy.sum(hk_ao_diag[p0:p1,:,:], axis=0) for j0, ja in enumerate(atmlst[:i0+1]): q0, q1 = aoslices[ja][2:] ej[i0,j0] += cupy.sum(hj_ao_ao[p0:p1,q0:q1], axis=[0,1]) - e1[i0,j0] -= 2.0 * cupy.einsum('xypq,pq->xy', s1ab[:,:,p0:p1,q0:q1], dme0[p0:p1,q0:q1]) + e1[i0,j0] -= 2.0 * contract('xypq,pq->xy', s1ab[:,:,p0:p1,q0:q1], dme0[p0:p1,q0:q1]) if with_k: ek[i0,j0] += cupy.sum(hk_ao_ao[p0:p1,q0:q1], axis=[0,1]) h1ao = hcore_deriv(ia, ja) - e1[i0,j0] += cupy.einsum('xypq,pq->xy', h1ao, dm0) + e1[i0,j0] += contract('xypq,pq->xy', h1ao, dm0) # # The first order RI basis response # @@ -425,7 +426,6 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size_aux=BLKSIZE, group_size=BLKSIZE) sph_ao_idx = intopt.sph_ao_idx sph_aux_idx = intopt.sph_aux_idx - rev_ao_idx = np.argsort(intopt.sph_ao_idx) mocc = mocc[sph_ao_idx, :] mo_coeff = mo_coeff[sph_ao_idx,:] @@ -434,11 +434,13 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, int2c = take_last2d(int2c, sph_aux_idx) int2c_inv = cupy.linalg.pinv(int2c, rcond=1e-12) + int2c = None wj, wk_Pl_, wk_P__ = int3c2e.get_int3c2e_wjk(mol, auxmol, dm0_tag, omega=omega) rhoj0 = contract('pq,q->p', int2c_inv, wj) if with_k: rhok0_P__ = contract('pq,qij->pij', int2c_inv, wk_P__) + wj = wk_P__ = None if isinstance(wk_Pl_, cupy.ndarray): rhok0_Pl_ = contract('pq,qio->pio', int2c_inv, wk_Pl_) else: @@ -446,10 +448,13 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, for p0, p1 in lib.prange(0,nao,64): wk_tmp = cupy.asarray(wk_Pl_[:,p0:p1]) rhok0_Pl_[:,p0:p1] = contract('pq,qio->pio', int2c_inv, wk_tmp).get() - wj = wk_Pl_ = wk_P__ = int2c_inv = int2c = None + wk_tmp = None + wk_Pl_ = int2c_inv = None # int3c_ip1 contributions + cupy.get_default_memory_pool().free_all_blocks() vj1_buf, vk1_buf, vj1_ao, vk1_ao = int3c2e.get_int3c2e_ip1_vjk(intopt, rhoj0, rhok0_Pl_, dm0_tag, aoslices, omega=omega) + rev_ao_idx = np.argsort(sph_ao_idx) vj1_buf = take_last2d(vj1_buf, rev_ao_idx) vk1_buf = take_last2d(vk1_buf, rev_ao_idx) @@ -489,7 +494,7 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, vk1_tmp += 2.0 * contract('xpro,pir->xpio', wk0_10_P__, rhok_tmp) vk1_int3c_ip2[:,:,p0:p1] += contract('xpio,pa->axio', vk1_tmp, aux2atom) wj0_10 = wk0_10_P__ = rhok0_P__ = int2c_ip1 = None - vj1_tmp = vk1_tmp = wk0_10_Pl_ = rhoj0 = rhok0_Pl_ = None + rhok_tmp = vj1_tmp = vk1_tmp = wk0_10_Pl_ = rhoj0 = rhok0_Pl_ = None aux2atom = None vj1_int3c_ip2 = contract('nxiq,ip->nxpq', vj1_int3c_ip2, mo_coeff) diff --git a/gpu4pyscf/dft/libxc.py b/gpu4pyscf/dft/libxc.py index 5ed63b48..c1d174bd 100644 --- a/gpu4pyscf/dft/libxc.py +++ b/gpu4pyscf/dft/libxc.py @@ -20,11 +20,12 @@ import ctypes import cupy from pyscf import dft +from gpu4pyscf.dft.libxc_structs import xc_func_type _libxc = np.ctypeslib.load_library( 'libxc', os.path.abspath(os.path.join(__file__, '..', '..', 'lib', 'deps', 'lib'))) -def _check_arrays(current_arrays, fields, factor, required): +def _check_arrays(current_arrays, fields, sizes, factor, required): """ A specialized function built to construct and check the sizes of arrays given to the LibXCFunctional class. """ @@ -35,7 +36,8 @@ def _check_arrays(current_arrays, fields, factor, required): for label in fields: if required: - current_arrays[label] = cupy.zeros((factor, 1)) + size = sizes[label] + current_arrays[label] = cupy.zeros((factor, size), dtype=np.float64) else: current_arrays[label] = None # cupy.empty((1)) @@ -44,7 +46,7 @@ def _check_arrays(current_arrays, fields, factor, required): class _xcfun(ctypes.Structure): pass -_xc_func_p = ctypes.POINTER(_xcfun) +_xc_func_p = ctypes.POINTER(xc_func_type) _libxc.xc_func_alloc.restype = _xc_func_p _libxc.xc_func_init.argtypes = (_xc_func_p, ctypes.c_int, ctypes.c_int) _libxc.xc_func_end.argtypes = (_xc_func_p, ) @@ -52,8 +54,7 @@ class _xcfun(ctypes.Structure): class XCfun: def __init__(self, xc, spin): - assert spin == 'unpolarized' - self._spin = 1 + self._spin = 1 if spin == 'unpolarized' else 2 self.xc_func = _libxc.xc_func_alloc() if isinstance(xc, str): self.func_id = _libxc.xc_functional_get_number(ctypes.c_char_p(xc.encode())) @@ -64,6 +65,10 @@ def __init__(self, xc, spin): raise RuntimeError('failed to initialize xc fun') self._family = dft.libxc.xc_type(xc) + self.xc_func_sizes = {} + for attr in dir(self.xc_func.contents.dim): + if "_" not in attr: + self.xc_func_sizes[attr] = getattr(self.xc_func.contents.dim, attr) def __del__(self): if self.xc_func is None: return @@ -89,6 +94,7 @@ def compute(self, inp, output=None, do_exc=True, do_vxc=True, do_fxc=False, do_k # Find the right compute function args = [self.xc_func, ctypes.c_size_t(npoints)] + xc_func_sizes = self.xc_func_sizes if self._family == 'LDA': input_labels = ["rho"] input_num_args = 1 @@ -102,11 +108,11 @@ def compute(self, inp, output=None, do_exc=True, do_vxc=True, do_fxc=False, do_k ] # Build input args - output = _check_arrays(output, output_labels[0:1], npoints, do_exc) - output = _check_arrays(output, output_labels[1:2], npoints, do_vxc) - output = _check_arrays(output, output_labels[2:3], npoints, do_fxc) - output = _check_arrays(output, output_labels[3:4], npoints, do_kxc) - output = _check_arrays(output, output_labels[4:5], npoints, do_lxc) + output = _check_arrays(output, output_labels[0:1], xc_func_sizes, npoints, do_exc) + output = _check_arrays(output, output_labels[1:2], xc_func_sizes, npoints, do_vxc) + output = _check_arrays(output, output_labels[2:3], xc_func_sizes, npoints, do_fxc) + output = _check_arrays(output, output_labels[3:4], xc_func_sizes, npoints, do_kxc) + output = _check_arrays(output, output_labels[4:5], xc_func_sizes, npoints, do_lxc) args.extend([ inp[x] for x in input_labels]) args.extend([output[x] for x in output_labels]) @@ -129,11 +135,11 @@ def compute(self, inp, output=None, do_exc=True, do_vxc=True, do_fxc=False, do_k ] # Build input args - output = _check_arrays(output, output_labels[0:1], npoints, do_exc) - output = _check_arrays(output, output_labels[1:3], npoints, do_vxc) - output = _check_arrays(output, output_labels[3:6], npoints, do_fxc) - output = _check_arrays(output, output_labels[6:10], npoints, do_kxc) - output = _check_arrays(output, output_labels[10:15], npoints, do_lxc) + output = _check_arrays(output, output_labels[0:1], xc_func_sizes, npoints, do_exc) + output = _check_arrays(output, output_labels[1:3], xc_func_sizes, npoints, do_vxc) + output = _check_arrays(output, output_labels[3:6], xc_func_sizes, npoints, do_fxc) + output = _check_arrays(output, output_labels[6:10], xc_func_sizes, npoints, do_kxc) + output = _check_arrays(output, output_labels[10:15], xc_func_sizes, npoints, do_lxc) args.extend([ inp[x] for x in input_labels]) args.extend([output[x] for x in output_labels]) @@ -174,11 +180,11 @@ def compute(self, inp, output=None, do_exc=True, do_vxc=True, do_fxc=False, do_k ] # Build input args - output = _check_arrays(output, output_labels[0:1], npoints, do_exc) - output = _check_arrays(output, output_labels[1:5], npoints, do_vxc) - output = _check_arrays(output, output_labels[5:15], npoints, do_fxc) - output = _check_arrays(output, output_labels[15:35], npoints, do_kxc) - output = _check_arrays(output, output_labels[35:70], npoints, do_lxc) + output = _check_arrays(output, output_labels[0:1], xc_func_sizes, npoints, do_exc) + output = _check_arrays(output, output_labels[1:5], xc_func_sizes, npoints, do_vxc) + output = _check_arrays(output, output_labels[5:15], xc_func_sizes, npoints, do_fxc) + output = _check_arrays(output, output_labels[15:35], xc_func_sizes, npoints, do_kxc) + output = _check_arrays(output, output_labels[35:70], xc_func_sizes, npoints, do_lxc) args.extend([ inp[x] for x in input_labels]) if not self.needs_laplacian(): diff --git a/gpu4pyscf/dft/numint.py b/gpu4pyscf/dft/numint.py index 6ca2a9f8..237091d4 100644 --- a/gpu4pyscf/dft/numint.py +++ b/gpu4pyscf/dft/numint.py @@ -520,7 +520,7 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, pass else: raise NotImplementedError(f'numint.nr_rks for functional {xc_code}') - #nelec[i] += den.sum() + nelec[i] += den.sum() excsum[i] += cupy.dot(den, exc)[0] t1 = log.timer_debug1('integration', *t1) @@ -549,84 +549,95 @@ def nr_uks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, ni.build(mol, grids.coords) opt = ni.gdftopt + mo_coeff = getattr(dms, 'mo_coeff', None) + mo_occ = getattr(dms,'mo_occ', None) + + mol = opt.mol coeff = cupy.asarray(opt.coeff) nao, nao0 = coeff.shape dma, dmb = dms dm_shape = dma.shape dma = cupy.asarray(dma).reshape(-1,nao0,nao0) dmb = cupy.asarray(dmb).reshape(-1,nao0,nao0) - dma = [cupy.einsum('pi,ij,qj->pq', coeff, dm, coeff) for dm in dma] - dmb = [cupy.einsum('pi,ij,qj->pq', coeff, dm, coeff) for dm in dmb] + dma = [coeff @ dm @ coeff.T for dm in dma] + dmb = [coeff @ dm @ coeff.T for dm in dmb] nset = len(dma) + if mo_coeff is not None: + mo_coeff = coeff @ mo_coeff + nelec = np.zeros((2,nset)) excsum = np.zeros(nset) vmata = cupy.zeros((nset, nao, nao)) vmatb = cupy.zeros((nset, nao, nao)) - with opt.gdft_envs_cache(): - mem_avail = get_avail_mem() - if xctype == 'LDA': - ao_deriv = 0 - else: - ao_deriv = 1 - cupy.cuda.runtime.deviceSetLimit(0x00, 128) - comp = (ao_deriv+1)*(ao_deriv+2)*(ao_deriv+3)//6 - block_size = int((mem_avail*.2/8/(comp+1)/nao - nao*2)/ ALIGNED) * ALIGNED - log.debug1('Available GPU mem %f Mb, block_size %d', mem_avail/1e6, block_size) - block_size = min(block_size, MIN_BLK_SIZE) - if block_size < ALIGNED: - raise RuntimeError('Not enough GPU memory') + release_gpu_stack() + if xctype == 'LDA': + ao_deriv = 0 + else: + ao_deriv = 1 - ngrids = grids.weights.size - for p0, p1 in lib.prange(0, ngrids, block_size): - ao = eval_ao(ni, opt.mol, grids.coords[p0:p1], ao_deriv) - weight = grids.weights[p0:p1] - for i in range(nset): - rho_a = eval_rho(opt.mol, ao, dma[i], xctype=xctype, hermi=1) - rho_b = eval_rho(opt.mol, ao, dmb[i], xctype=xctype, hermi=1) - rho = (rho_a, rho_b) - exc, vxc = ni.eval_xc_eff(xc_code, rho, deriv=1, xctype=xctype)[:2] - if xctype == 'LDA': - den_a = rho_a * weight - den_b = rho_b * weight - wv = vxc[:,0] * weight - vmata[i] += ao.T.dot(_scale_ao(ao, wv[0])) - vmatb[i] += ao.T.dot(_scale_ao(ao, wv[1])) - elif xctype == 'GGA': - den_a = rho_a[0] * weight - den_b = rho_b[0] * weight - wv = vxc * weight - wv[:,0] *= .5 - vmata[i] += ao[0].T.dot(_scale_ao(ao, wv[0])) - vmatb[i] += ao[0].T.dot(_scale_ao(ao, wv[1])) - elif xctype == 'NLC': - raise NotImplementedError('NLC') - elif xctype == 'MGGA': - den_a = rho_a[0] * weight - den_b = rho_b[0] * weight - wv = vxc * weight - wv[:,[0, 4]] *= .5 - vmata[i] += ao[0].T.dot(_scale_ao(ao[:4], wv[0,:4])) - vmatb[i] += ao[0].T.dot(_scale_ao(ao[:4], wv[1,:4])) - vmata[i] += _tau_dot(ao, ao, wv[0,4]) - vmatb[i] += _tau_dot(ao, ao, wv[1,4]) - elif xctype == 'HF': - pass - else: - raise NotImplementedError(f'numint.nr_uks for functional {xc_code}') - nelec[0,i] += den_a.sum() - nelec[1,i] += den_b.sum() - excsum[i] += np.dot(den_a, exc) - excsum[i] += np.dot(den_b, exc) - ao = None + for ao_mask, idx, weight, _ in ni.block_loop(mol, grids, nao, ao_deriv): + for i in range(nset): + t0 = log.init_timer() + if mo_coeff is None: + rho_a = eval_rho(mol, ao_mask, dma[i][np.ix_(idx,idx)], xctype=xctype, hermi=1) + rho_b = eval_rho(mol, ao_mask, dmb[i][np.ix_(idx,idx)], xctype=xctype, hermi=1) + else: + mo_coeff_mask = mo_coeff[:, idx,:] + rho_a = eval_rho2(mol, ao_mask, mo_coeff_mask[0], mo_occ[0], None, xctype) + rho_b = eval_rho2(mol, ao_mask, mo_coeff_mask[1], mo_occ[1], None, xctype) - vmata = [cupy.einsum('pi,pq,qj->ij', coeff, v, coeff).get() for v in vmata] - vmatb = [cupy.einsum('pi,pq,qj->ij', coeff, v, coeff).get() for v in vmatb] + rho = cupy.stack([rho_a, rho_b], axis=0) + exc, vxc = ni.eval_xc_eff(xc_code, rho, deriv=1, xctype=xctype)[:2] + t1 = log.timer_debug1('eval vxc', *t0) + if xctype == 'LDA': + den_a = rho_a * weight + den_b = rho_b * weight + wv = vxc[:,0] * weight + va = ao_mask.dot(_scale_ao(ao_mask, wv[0]).T) + vb = ao_mask.dot(_scale_ao(ao_mask, wv[1]).T) + add_sparse(vmata[i], va, idx) + add_sparse(vmatb[i], vb, idx) + + elif xctype == 'GGA': + den_a = rho_a[0] * weight + den_b = rho_b[0] * weight + wv = vxc * weight + wv[:,0] *= .5 + va = ao_mask[0].dot(_scale_ao(ao_mask, wv[0]).T) + vb = ao_mask[0].dot(_scale_ao(ao_mask, wv[1]).T) + add_sparse(vmata[i], va, idx) + add_sparse(vmatb[i], vb, idx) + elif xctype == 'NLC': + raise NotImplementedError('NLC') + elif xctype == 'MGGA': + den_a = rho_a[0] * weight + den_b = rho_b[0] * weight + wv = vxc * weight + wv[:,[0, 4]] *= .5 + va = ao_mask[0].dot(_scale_ao(ao_mask[:4], wv[0,:4]).T) + vb = ao_mask[0].dot(_scale_ao(ao_mask[:4], wv[1,:4]).T) + va += _tau_dot(ao_mask, ao_mask, wv[0,4]) + vb += _tau_dot(ao_mask, ao_mask, wv[1,4]) + add_sparse(vmata[i], va, idx) + add_sparse(vmatb[i], vb, idx) + elif xctype == 'HF': + pass + else: + raise NotImplementedError(f'numint.nr_uks for functional {xc_code}') + nelec[0,i] += den_a.sum() + nelec[1,i] += den_b.sum() + excsum[i] += cupy.dot(den_a, exc[:,0]) + excsum[i] += cupy.dot(den_b, exc[:,0]) + t1 = log.timer_debug1('integration', *t1) + + vmata = [coeff.T @ v @ coeff for v in vmata] + vmatb = [coeff.T @ v @ coeff for v in vmatb] if xctype != 'LDA': for i in range(nset): - lib.transpose_sum(vmata[i], inplace=True) - lib.transpose_sum(vmatb[i], inplace=True) + vmata[i] = vmata[i] + vmata[i].T + vmatb[i] = vmatb[i] + vmatb[i].T if FREE_CUPY_CACHE: dma = dmb = None @@ -637,7 +648,7 @@ def nr_uks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, excsum = excsum[0] vmata = vmata[0] vmatb = vmatb[0] - vmat = np.asarray([vmata, vmatb]) + vmat = cupy.asarray([vmata, vmatb]) return nelec, excsum, vmat @@ -730,7 +741,6 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi= fxc_w = fxc[:,:,p0:p1] * weights wv = contract('axg,xyg->ayg', rho1, fxc_w) - for i in range(nset): if xctype == 'LDA': vmat_tmp = ao.dot(_scale_ao(ao, wv[i]).T) @@ -794,65 +804,104 @@ def nr_uks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi= nao, nao0 = coeff.shape dma, dmb = dms dm_shape = dma.shape + # AO basis -> gdftopt AO basis + with_mocc = hasattr(dms, 'mo1') + if with_mocc: + mo1a = contract('nio,pi->npo', dma.mo1, coeff) * 2.0**0.5 + mo1b = contract('nio,pi->npo', dmb.mo1, coeff) * 2.0**0.5 + occ_coeff_a = contract('io,pi->po', dma.occ_coeff, coeff) * 2.0**0.5 + occ_coeff_b = contract('io,pi->po', dmb.occ_coeff, coeff) * 2.0**0.5 + dma = cupy.asarray(dma).reshape(-1,nao0,nao0) dmb = cupy.asarray(dmb).reshape(-1,nao0,nao0) - dma = [cupy.einsum('pi,ij,qj->pq', coeff, dm, coeff) for dm in dma] - dmb = [cupy.einsum('pi,ij,qj->pq', coeff, dm, coeff) for dm in dmb] + dma = contract('nij,qj->niq', dma, coeff) + dma = contract('pi,niq->npq', coeff, dma) + dmb = contract('nij,qj->niq', dmb, coeff) + dmb = contract('pi,niq->npq', coeff, dmb) + nset = len(dma) vmata = cupy.zeros((nset, nao, nao)) vmatb = cupy.zeros((nset, nao, nao)) - with opt.gdft_envs_cache(): - mem_avail = cupy.cuda.runtime.memGetInfo()[0] - if xctype == 'LDA': - block_size = int((mem_avail*.7/8/3/nao - nao*2)/ ALIGNED) * ALIGNED + if xctype == 'LDA': + ao_deriv = 0 + else: + ao_deriv = 1 + p0 = 0 + p1 = 0 + for ao, mask, weights, coords in ni.block_loop(opt.mol, grids, nao, ao_deriv): + t0 = log.init_timer() + p0, p1 = p1, p1+len(weights) + # precompute molecular orbitals + if with_mocc: + occ_coeff_a_mask = occ_coeff_a[mask] + occ_coeff_b_mask = occ_coeff_b[mask] + if xctype == 'LDA': + c0_a = _dot_ao_dm(mol, ao, occ_coeff_a_mask, None, None, None) + c0_b = _dot_ao_dm(mol, ao, occ_coeff_b_mask, None, None, None) + elif xctype == "GGA": + c0_a = contract('nig,io->nog', ao, occ_coeff_a_mask) + c0_b = contract('nig,io->nog', ao, occ_coeff_b_mask) + else: # mgga + c0_a = contract('nig,io->nog', ao, occ_coeff_a_mask) + c0_b = contract('nig,io->nog', ao, occ_coeff_b_mask) + + if with_mocc: + rho1a = eval_rho4(opt.mol, ao, c0_a, mo1a[:,mask], xctype=xctype, with_lapl=False) + rho1b = eval_rho4(opt.mol, ao, c0_b, mo1b[:,mask], xctype=xctype, with_lapl=False) else: - block_size = int((mem_avail*.7/8/8/nao - nao*2)/ ALIGNED) * ALIGNED - log.debug1('Available GPU mem %f Mb, block_size %d', mem_avail/1e6, block_size) - if block_size < ALIGNED: - raise RuntimeError('Not enough GPU memory') + # slow version + rho1a = [] + rho1b = [] + for i in range(nset): + rho_tmp = eval_rho(opt.mol, ao, dma[i][np.ix_(mask,mask)], xctype=xctype, hermi=hermi, with_lapl=False) + rho1a.append(rho_tmp) + rho_tmp = eval_rho(opt.mol, ao, dmb[i][np.ix_(mask,mask)], xctype=xctype, hermi=hermi, with_lapl=False) + rho1b.append(rho_tmp) + rho1a = cupy.stack(rho1a, axis=0) + rho1b = cupy.stack(rho1b, axis=0) + rho1 = cupy.stack([rho1a, rho1b], axis=0) + t0 = log.timer_debug1('rho', *t0) + # precompute fxc_w if xctype == 'LDA': - ao_deriv = 0 + fxc_w = fxc[:,0,:,0,p0:p1] * weights else: - ao_deriv = 1 + fxc_w = fxc[:,:,:,:,p0:p1] * weights - ngrids = grids.weights.size - for p0, p1 in lib.prange(0, ngrids, block_size): - ao = eval_ao(ni, opt.mol, grids.coords[p0:p1], ao_deriv) - weight = grids.weights[p0:p1] - for i in range(nset): - rho1a = eval_rho(opt.mol, ao, dma[i], xctype=xctype, hermi=hermi) - rho1b = eval_rho(opt.mol, ao, dmb[i], xctype=xctype, hermi=hermi) - rho1 = np.asarray([rho1a, rho1b]) - if xctype == 'LDA': - wv = np.einsum('ag,abg->bg', rho1, fxc[:,0,:,0,p0:p1]) * weight - vmata[i] += ao.T.dot(_scale_ao(ao, wv[0])) - vmatb[i] += ao.T.dot(_scale_ao(ao, wv[1])) - elif xctype == 'GGA': - wv = np.einsum('axg,axbyg->byg', rho1, fxc[:,:,:,:,p0:p1]) * weight - wv[:,0] *= .5 - vmata[i] += ao[0].T.dot(_scale_ao(ao, wv[0])) - vmatb[i] += ao[0].T.dot(_scale_ao(ao, wv[1])) - elif xctype == 'NLC': - raise NotImplementedError('NLC') - else: - wv = np.einsum('axg,axbyg->byg', rho1, fxc[:,:,:,:,p0:p1]) * weight - wv[:,[0, 4]] *= .5 - vmata[i] += ao[0].T.dot(_scale_ao(ao[:4], wv[0,:4])) - vmatb[i] += ao[0].T.dot(_scale_ao(ao[:4], wv[1,:4])) - vmata[i] += _tau_dot(ao, ao, wv[0,4]) - vmatb[i] += _tau_dot(ao, ao, wv[1,4]) - ao = None - - vmata = [cupy.einsum('pi,pq,qj->ij', coeff, v, coeff).get() for v in vmata] - vmatb = [cupy.einsum('pi,pq,qj->ij', coeff, v, coeff).get() for v in vmatb] + for i in range(nset): + if xctype == 'LDA': + wv = contract('ag,abg->bg', rho1[:,i], fxc_w) + va = ao.dot(_scale_ao(ao, wv[0]).T) + vb = ao.dot(_scale_ao(ao, wv[1]).T) + add_sparse(vmata[i], va, mask) + add_sparse(vmatb[i], vb, mask) + elif xctype == 'GGA': + wv = contract('axg,axbyg->byg', rho1[:,i], fxc_w) + wv[:,0] *= .5 + va = ao[0].dot(_scale_ao(ao, wv[0]).T) + vb = ao[0].dot(_scale_ao(ao, wv[1]).T) + add_sparse(vmata[i], va, mask) + add_sparse(vmatb[i], vb, mask) + elif xctype == 'NLC': + raise NotImplementedError('NLC') + else: + wv = contract('axg,axbyg->byg', rho1[:,i], fxc_w) + wv[:,[0, 4]] *= .5 + va = ao[0].dot(_scale_ao(ao[:4], wv[0,:4]).T) + vb = ao[0].dot(_scale_ao(ao[:4], wv[1,:4]).T) + va += _tau_dot(ao, ao, wv[0,4]) + vb += _tau_dot(ao, ao, wv[1,4]) + add_sparse(vmata[i], va, mask) + add_sparse(vmatb[i], vb, mask) + vmata = [coeff.T @ v @ coeff for v in vmata] + vmatb = [coeff.T @ v @ coeff for v in vmatb] if xctype != 'LDA': # For real orbitals, K_{ia,bj} = K_{ia,jb}. It simplifies real fxc_jb # [(\nabla mu) nu + mu (\nabla nu)] * fxc_jb = ((\nabla mu) nu f_jb) + h.c. for i in range(nset): - lib.transpose_sum(vmata[i], inplace=True) - lib.transpose_sum(vmatb[i], inplace=True) + vmata[i] = vmata[i] + vmata[i].T + vmatb[i] = vmatb[i] + vmatb[i].T if FREE_CUPY_CACHE: dma = dmb = None @@ -861,7 +910,7 @@ def nr_uks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi= if len(dm_shape) == 2: vmata = vmata[0] vmatb = vmatb[0] - vmat = np.asarray([vmata, vmatb]) + vmat = cupy.asarray([vmata, vmatb]) return vmat def nr_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, @@ -942,6 +991,7 @@ def nr_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, def cache_xc_kernel(ni, mol, grids, xc_code, mo_coeff, mo_occ, spin=0, max_memory=2000): + log = logger.new_logger(mol, mol.verbose) xctype = ni._xc_type(xc_code) if xctype == 'GGA': ao_deriv = 1 @@ -957,76 +1007,80 @@ def cache_xc_kernel(ni, mol, grids, xc_code, mo_coeff, mo_occ, spin=0, ni.build(mol, grids.coords) opt = ni.gdftopt + mol = opt.mol coeff = cupy.asarray(opt.coeff) - ngrids = grids.weights.size - comp = (ao_deriv+1)*(ao_deriv+2)*(ao_deriv+3)//6 nao = coeff.shape[0] - - def make_rdm1(mo_coeff, mo_occ): - orbo = coeff.dot(mo_coeff[:,mo_occ>0]) - dm = (orbo*mo_occ[mo_occ>0]).dot(orbo.T) - return dm - - with opt.gdft_envs_cache(): - mem_avail = get_avail_mem() - block_size = int((mem_avail*.7/8/(comp+1)/nao - nao*2)/ ALIGNED) * ALIGNED - logger.debug1(mol, 'Available GPU mem %f Mb, block_size %d', mem_avail/1e6, block_size) - if block_size < ALIGNED: - raise RuntimeError('Not enough GPU memory') - - if spin == 0: - dm = make_rdm1(mo_coeff, mo_occ) - rho = [] - for p0, p1 in lib.prange(0, ngrids, block_size): - ao = eval_ao(ni, opt.mol, grids.coords[p0:p1], ao_deriv) - rho.append(eval_rho(opt.mol, ao, dm, xctype=xctype, hermi=1)) - ao = None - rho = cupy.hstack(rho) - else: - dma = make_rdm1(mo_coeff[0], mo_occ[0]) - dmb = make_rdm1(mo_coeff[1], mo_occ[1]) - rhoa = [] - rhob = [] - for p0, p1 in lib.prange(0, ngrids, block_size): - ao = eval_ao(ni, opt.mol, grids.coords[p0:p1], ao_deriv) - rhoa.append(eval_rho(opt.mol, ao, dma, xctype=xctype, hermi=1)) - rhob.append(eval_rho(opt.mol, ao, dmb, xctype=xctype, hermi=1)) - ao = None - rho = (cupy.hstack(rhoa), cupy.hstack(rhob)) - - if FREE_CUPY_CACHE: - dm = dma = dmb = None - cupy.get_default_memory_pool().free_all_blocks() - + if spin == 0: + mo_coeff = coeff @ mo_coeff + rho = [] + t0 = log.init_timer() + for ao_mask, idx, weight, _ in ni.block_loop(mol, grids, nao, ao_deriv): + mo_coeff_mask = mo_coeff[idx,:] + rho_slice = eval_rho2(mol, ao_mask, mo_coeff_mask, mo_occ, None, xctype) + rho.append(rho_slice) + t0 = log.timer_debug1('eval rho in fxc', *t0) + rho = cupy.hstack(rho) + else: + mo_coeff = contract('ip,npj->nij', coeff, cupy.asarray(mo_coeff)) + rhoa = [] + rhob = [] + t0 = log.init_timer() + for ao_mask, idx, weight, _ in ni.block_loop(mol, grids, nao, ao_deriv): + mo_coeff_mask = mo_coeff[:,idx,:] + rhoa_slice = eval_rho2(mol, ao_mask, mo_coeff_mask[0], mo_occ[0], None, xctype) + rhob_slice = eval_rho2(mol, ao_mask, mo_coeff_mask[1], mo_occ[1], None, xctype) + rhoa.append(rhoa_slice) + rhob.append(rhob_slice) + t0 = log.timer_debug1('eval rho in fxc', *t0) + #rho = (cupy.hstack(rhoa), cupy.hstack(rhob)) + rho = cupy.stack([cupy.hstack(rhoa), cupy.hstack(rhob)], axis=0) vxc, fxc = ni.eval_xc_eff(xc_code, rho, deriv=2, xctype=xctype)[1:3] return rho, vxc, fxc +@cupy.fuse() +def batch_square(a): + return a[0]**2 + a[1]**2 + a[2]**2 + def eval_xc_eff(ni, xc_code, rho, deriv=1, omega=None, xctype=None, verbose=None): ''' Different from PySCF, this function employ cuda version libxc ''' - if omega is None: omega = ni.omega - if xctype is None: xctype = ni._xc_type(xc_code) - if ni.xcfuns is None: ni.xcfuns = _init_xcfuns(xc_code) - if xctype == 'LDA': spin_polarized = rho.ndim >= 2 else: spin_polarized = rho.ndim == 3 - if spin_polarized: - raise NotImplementedError() - inp = {} - if xctype == 'LDA': - inp['rho'] = rho - if xctype == 'GGA': - inp['rho'] = rho[0] - inp['sigma'] = rho[1]*rho[1] + rho[2]*rho[2] + rho[3]*rho[3] - if xctype == 'MGGA': - inp['rho'] = rho[0] - inp['sigma'] = rho[1]*rho[1] + rho[2]*rho[2] + rho[3]*rho[3] - inp['tau'] = rho[-1] # can be 4 (without laplacian) or 5 (with laplacian) + if omega is None: omega = ni.omega + if xctype is None: xctype = ni._xc_type(xc_code) + if ni.xcfuns is None: ni.xcfuns = _init_xcfuns(xc_code, spin_polarized) + inp = {} + if not spin_polarized: + if xctype == 'LDA': + inp['rho'] = rho + if xctype == 'GGA': + inp['rho'] = rho[0] + inp['sigma'] = batch_square(rho[1:4]) + if xctype == 'MGGA': + inp['rho'] = rho[0] + inp['sigma'] = batch_square(rho[1:4]) + inp['tau'] = rho[-1] # can be 4 (without laplacian) or 5 (with laplacian) + else: + if xctype == 'LDA': + inp['rho'] = cupy.stack([rho[0], rho[1]], axis=1) + if xctype == 'GGA': + inp['rho'] = cupy.stack([rho[0,0], rho[1,0]], axis=1) + sigma0 = batch_square(rho[0,1:4]) + sigma1 = rho[0,1]*rho[1,1] + rho[0,2]*rho[1,2] + rho[0,3]*rho[1,3] + sigma2 = batch_square(rho[1,1:4]) + inp['sigma'] = cupy.stack([sigma0, sigma1, sigma2], axis=1) + if xctype == 'MGGA': + inp['rho'] = cupy.stack([rho[0,0], rho[1,0]], axis=1) + sigma0 = batch_square(rho[0,1:4]) + sigma1 = rho[0,1]*rho[1,1] + rho[0,2]*rho[1,2] + rho[0,3]*rho[1,3] + sigma2 = batch_square(rho[1,1:4]) + inp['sigma'] = cupy.stack([sigma0, sigma1, sigma2], axis=1) + inp['tau'] = cupy.stack([rho[0,-1], rho[1,-1]], axis=1) # can be 4 (without laplacian) or 5 (with laplacian) do_vxc = True do_fxc = deriv > 1 do_kxc = deriv > 2 @@ -1054,26 +1108,35 @@ def eval_xc_eff(ni, xc_code, rho, deriv=1, omega=None, xctype=None, verbose=None kxc = None exc = ret_full["zk"] - vxc = [ret_full[label] for label in vxc_labels if label in ret_full] - if do_fxc: - fxc = [ret_full[label] for label in fxc_labels if label in ret_full] - if do_kxc: - kxc = [ret_full[label] for label in kxc_labels if label in ret_full] + if not spin_polarized: + vxc = [ret_full[label] for label in vxc_labels if label in ret_full] + if do_fxc: + fxc = [ret_full[label] for label in fxc_labels if label in ret_full] + if do_kxc: + kxc = [ret_full[label] for label in kxc_labels if label in ret_full] + else: + vxc = [ret_full[label] for label in vxc_labels if label in ret_full] + if do_fxc: + fxc = [ret_full[label] for label in fxc_labels if label in ret_full] + if do_kxc: + kxc = [ret_full[label] for label in kxc_labels if label in ret_full] if do_kxc: - kxc = xc_deriv.transform_kxc(rho, fxc, kxc, xctype) + kxc = xc_deriv.transform_kxc(rho, fxc, kxc, xctype, spin_polarized) if do_fxc: - fxc = xc_deriv.transform_fxc(rho, vxc, fxc, xctype) - vxc = xc_deriv.transform_vxc(rho, vxc, xctype) - + fxc = xc_deriv.transform_fxc(rho, vxc, fxc, xctype, spin_polarized) + vxc = xc_deriv.transform_vxc(rho, vxc, xctype, spin_polarized) return exc, vxc, fxc, kxc -def _init_xcfuns(xc_code): +def _init_xcfuns(xc_code, spin): xc_upper = xc_code.upper() xc_names = dft.libxc.parse_xc(xc_upper)[1:][0] - + if spin: + spin_polarized = 'polarized' + else: + spin_polarized = 'unpolarized' xcfuns = [] for xc, w in xc_names: - xcfun = libxc.XCfun(xc, 'unpolarized') + xcfun = libxc.XCfun(xc, spin_polarized) xcfuns.append((xcfun,w)) if dft.libxc.needs_laplacian(xcfun.func_id): raise NotImplementedError() @@ -1235,7 +1298,7 @@ def _contract_rho(bra, ket, rho=None): if err != 0: raise RuntimeError('CUDA Error') else: - rho = cupy.einsum('ig,ig->g', bra, ket) + rho = contract('ig,ig->g', bra, ket) return rho def _contract_rho1(bra, ket, rho=None): @@ -1376,13 +1439,13 @@ def _tau_dot_sparse(bra, ket, wv, nbins, screen_index, ao_loc, def _scale_ao(ao, wv, out=None): if wv.ndim == 1: if ao.flags.f_contiguous: - return cupy.einsum('ip,p->ip', ao, wv) + return ao * wv nvar = 1 nao, ngrids = ao.shape assert wv.size == ngrids else: if ao[0].flags.f_contiguous: - return cupy.einsum('nip,np->ip', ao, wv) + return contract('nip,np->ip', ao, wv) nvar, nao, ngrids = ao.shape assert wv.shape == (nvar, ngrids) diff --git a/gpu4pyscf/dft/tests/test_numint.py b/gpu4pyscf/dft/tests/test_numint.py index 0862976e..35c329fe 100644 --- a/gpu4pyscf/dft/tests/test_numint.py +++ b/gpu4pyscf/dft/tests/test_numint.py @@ -19,35 +19,39 @@ import numpy as np import pyscf import cupy -from pyscf import lib +from pyscf import lib, scf from pyscf.dft import Grids from pyscf.dft.numint import NumInt as pyscf_numint from gpu4pyscf.dft.numint import NumInt from gpu4pyscf import dft mol = pyscf.M( - atom=''' -C -0.65830719, 0.61123287, -0.00800148 -C 0.73685281, 0.61123287, -0.00800148 -''', - basis='ccpvtz', - spin=None, + atom = ''' +O 0.000000 0.000000 0.117790 +H 0.000000 0.755453 -0.471161 +H 0.000000 -0.755453 -0.471161''', + basis = 'ccpvdz', + charge = 1, + spin = 1 # = 2S = spin_up - spin_down ) + mol.verbose=1 np.random.seed(2) nao = mol.nao -dm = np.random.random((2,nao,nao)) -dm1 = dm + dm.transpose(0,2,1) -mo_coeff = np.random.rand(nao, nao) -mo_occ = (np.random.rand(nao) > .5).astype(np.double) -dm0 = (mo_coeff*mo_occ).dot(mo_coeff.T) +mf = scf.UHF(mol) +mf.kernel() +dm1 = mf.make_rdm1().copy() +dm = dm1 +mo_coeff = mf.mo_coeff +mo_occ = mf.mo_occ +dm0 = (mo_coeff[0]*mo_occ[0]).dot(mo_coeff[0].T) grids_cpu = Grids(mol) -grids_cpu.level = 5 +grids_cpu.level = 1 grids_cpu.build() grids_gpu = Grids(mol) -grids_gpu.level = 5 +grids_gpu.level = 1 grids_gpu.build() grids_gpu.weights = cupy.asarray(grids_gpu.weights) @@ -66,16 +70,18 @@ class KnownValues(unittest.TestCase): def _check_vxc(self, method, xc): ni = NumInt(xc=xc) fn = getattr(ni, method) - n, e, v = fn(mol, grids_gpu, xc, dm0, hermi=1) - v = [i.get() for i in v] + n, e, v = fn(mol, grids_gpu, xc, dm1, hermi=1) + v = [x.get() for x in v] ni_pyscf = pyscf_numint() fn = getattr(ni_pyscf, method) - nref, eref, vref = fn(mol, grids_cpu, xc, dm0, hermi=1) + nref, eref, vref = fn(mol, grids_cpu, xc, dm1, hermi=1) - cupy.allclose(e, eref) - cupy.allclose(n, nref) - cupy.allclose(v, vref) + v = cupy.asarray(v) + vref = cupy.asarray(vref) + assert cupy.allclose(e, eref) + assert cupy.allclose(n, nref) + assert cupy.allclose(v, vref) def _check_rks_fxc(self, xc, hermi=1): if hermi == 1: @@ -84,7 +90,7 @@ def _check_rks_fxc(self, xc, hermi=1): t1 = dm spin = 0 ni_pyscf = pyscf_numint() - rho, vxc, fxc = ni_pyscf.cache_xc_kernel(mol, grids_cpu, xc, mo_coeff, mo_occ, spin) + rho, vxc, fxc = ni_pyscf.cache_xc_kernel(mol, grids_cpu, xc, mo_coeff[0], mo_occ[0], spin) vref = ni_pyscf.nr_rks_fxc( mol, grids_cpu, xc, dm0=dm0, dms=t1, rho0=rho, vxc=vxc, fxc=fxc, hermi=hermi) @@ -92,7 +98,7 @@ def _check_rks_fxc(self, xc, hermi=1): vxc0 = vxc.copy() fxc0 = fxc.copy() ni = NumInt() - rho, vxc, fxc = ni.cache_xc_kernel(mol, grids_gpu, xc, cupy.asarray(mo_coeff), cupy.asarray(mo_occ), spin) + rho, vxc, fxc = ni.cache_xc_kernel(mol, grids_gpu, xc, cupy.asarray(mo_coeff[0]), cupy.asarray(mo_occ[0]), spin) v = ni.nr_rks_fxc(mol, grids_gpu, xc, dms=t1, fxc=fxc, hermi=hermi) if xc == MGGA_M06: assert cupy.allclose(rho[[0,1,2,3,5]], rho0[[0,1,2,3,5]]) @@ -107,18 +113,18 @@ def _check_rks_fxc_st(self, xc, fpref): ni = NumInt() spin = 1 rho, vxc, fxc = ni.cache_xc_kernel( - mol, grids_gpu, xc, [mo_coeff]*2, [mo_occ*.5]*2, spin) + mol, grids_gpu, xc, mo_coeff, mo_occ, spin) v = ni.nr_rks_fxc_st(mol, grids_gpu, xc, dms_alpha=dm, fxc=fxc) self.assertAlmostEqual(lib.fp(v), fpref, 12) ni_pyscf = pyscf_numint() rho, vxc, fxc = ni_pyscf.cache_xc_kernel( - mol, grids_cpu, xc, [mo_coeff]*2, [mo_occ*.5]*2, spin) + mol, grids_cpu, xc, mo_coeff, mo_occ, spin) vref = ni_pyscf.nr_rks_fxc_st( mol, grids_cpu, xc, dm0=dm0, dms_alpha=dm, rho0=rho, vxc=vxc, fxc=fxc) self.assertAlmostEqual(abs(v - vref).max(), 0, 12) - def _check_uks_fxc(self, xc, fpref, hermi=1): + def _check_uks_fxc(self, xc, hermi=1): if hermi == 1: t1 = dm1 else: @@ -126,17 +132,21 @@ def _check_uks_fxc(self, xc, fpref, hermi=1): ni = NumInt() spin = 1 rho, vxc, fxc = ni.cache_xc_kernel( - mol, grids_gpu, xc, [mo_coeff]*2, [mo_occ, 1-mo_occ], spin) + mol, grids_gpu, xc, cupy.asarray(mo_coeff), cupy.asarray(mo_occ), spin) v = ni.nr_uks_fxc(mol, grids_gpu, xc, dms=t1, fxc=fxc, hermi=hermi) - self.assertAlmostEqual(lib.fp(v), fpref, 12) ni = ni.to_cpu() dm0 = mo_coeff.dot(mo_coeff.T) - rho, vxc, fxc = ni.cache_xc_kernel( - mol, grids_cpu, xc, [mo_coeff]*2, [mo_occ, 1-mo_occ], spin) - vref = ni.nr_uks_fxc( - mol, grids_cpu, xc, dm0=dm0, dms=t1, rho0=rho, vxc=vxc, fxc=fxc, hermi=hermi) - self.assertAlmostEqual(abs(v - vref).max(), 0, 12) + rho_ref, vxc_ref, fxc_ref = ni.cache_xc_kernel( + mol, grids_cpu, xc, mo_coeff, mo_occ, spin) + v_ref = ni.nr_uks_fxc( + mol, grids_cpu, xc, dm0=dm0, dms=t1, rho0=rho_ref, vxc=vxc_ref, fxc=fxc_ref, hermi=hermi) + vxc_ref = np.asarray(vxc_ref) + rho_ref = np.asarray(rho_ref) + assert cupy.linalg.norm(rho - cupy.asarray(rho_ref)) < 1e-6 * cupy.linalg.norm(rho) + assert cupy.linalg.norm(vxc - cupy.asarray(vxc_ref)) < 1e-6 * cupy.linalg.norm(vxc) + assert cupy.linalg.norm(fxc - cupy.asarray(fxc_ref)) < 1e-6 * cupy.linalg.norm(fxc) + assert cupy.linalg.norm(v - cupy.asarray(v_ref)) < 1e-6 * cupy.linalg.norm(v) def test_rks_lda(self): self._check_vxc('nr_rks', LDA) @@ -148,16 +158,14 @@ def test_rks_mgga(self): self._check_vxc('nr_rks', MGGA_M06) # Not implemented yet - ''' def test_uks_lda(self): - self._check_vxc('nr_uks', 'lda', -6.362059440515177) + self._check_vxc('nr_uks', LDA)#'lda', -6.362059440515177) def test_uks_gga(self): - self._check_vxc('nr_uks', 'pbe', -6.732546841646528) + self._check_vxc('nr_uks', GGA_PBE)#'pbe', -6.732546841646528) def test_uks_mgga(self): - self._check_vxc('nr_uks', 'm06', 83.5606316500255) - ''' + self._check_vxc('nr_uks', MGGA_M06)#'m06', 83.5606316500255) def test_rks_fxc_lda(self): self._check_rks_fxc(LDA, hermi=1) @@ -168,19 +176,17 @@ def test_rks_fxc_gga(self): def test_rks_fxc_mgga(self): self._check_rks_fxc(MGGA_M06, hermi=1) - ''' def test_uks_fxc_lda(self): - self._check_uks_fxc('lda', -0.1125902447294953, hermi=1) - self._check_uks_fxc('lda', -0.05629512236474782, hermi=0) + self._check_uks_fxc(LDA, hermi=1) def test_uks_fxc_gga(self): - self._check_uks_fxc('pbe', -0.0752237471800169, hermi=1) - self._check_uks_fxc('pbe', -0.03761187359000853, hermi=0) + self._check_uks_fxc(GGA_PBE, hermi=1) def test_uks_fxc_mgga(self): - self._check_uks_fxc('m06', 0.7005336565753997, hermi=1) - self._check_uks_fxc('m06', 0.35026682828770006, hermi=0) + self._check_uks_fxc(MGGA_M06, hermi=1) + # Not implemented yet + ''' def test_rks_fxc_st_lda(self): self._check_rks_fxc_st('lda', -0.06358425564270553) diff --git a/gpu4pyscf/dft/xc_deriv.py b/gpu4pyscf/dft/xc_deriv.py index b6e7b7ec..402027b8 100644 --- a/gpu4pyscf/dft/xc_deriv.py +++ b/gpu4pyscf/dft/xc_deriv.py @@ -16,9 +16,10 @@ ''' Transform XC functional derivatives between different representations ''' - -import cupy import numpy as np +import cupy +from pyscf.dft.xc_deriv import _stack_fg, _stack_frr, _stack_fgg +from gpu4pyscf.lib.cupy_helper import contract def transform_vxc(rho, vxc, xctype, spin=0): r''' @@ -27,7 +28,10 @@ def transform_vxc(rho, vxc, xctype, spin=0): LDA : [2,1,N] GGA : [2,4,N] MGGA: [2,5,N] - * spin unpolarized is not implemented + * spin unpolarized + LDA : [1,N] + GGA : [4,N] + MGGA: [5,N] ''' rho = cupy.asarray(rho, order='C') if xctype == 'GGA': @@ -48,7 +52,15 @@ def transform_vxc(rho, vxc, xctype, spin=0): ngrids = rho.shape[-1] if spin == 1: - raise NotImplementedError() + if order == 0: + vp = fr.reshape(2, nvar, ngrids) + else: + vp = cupy.empty((2, nvar, ngrids)) + vp[:,0] = fr + #vp[:,1:4] = _stack_fg(fg, rho=rho) + vp[:,1:4] = contract('abg,bxg->axg', _stack_fg(fg), rho[:,1:4]) + if order > 1: + vp[:,4] = ft else: if order == 0: vp = fr.reshape(nvar, ngrids) @@ -89,7 +101,41 @@ def transform_fxc(rho, vxc, fxc, xctype, spin=0): ngrids = rho.shape[-1] if spin == 1: - raise NotImplementedError() + if order == 0: + vp = _stack_frr(frr).reshape(2,nvar, 2,nvar, ngrids).transpose(1,3,0,2,4) + else: + vp = cupy.empty((2,nvar, 2,nvar, ngrids)).transpose(1,3,0,2,4) + vp[0,0] = _stack_frr(frr) + i3 = np.arange(3) + qgg = _stack_fgg(fgg) + qgg = cupy.einsum('abcdg,axg->xbcdg', qgg, rho[:,1:4]) + qgg = cupy.einsum('xbcdg,cyg->xybdg', qgg, rho[:,1:4]) + #qgg = _stack_fgg(fgg, rho=rho).transpose(1,3,0,2,4) + qgg[i3,i3] += _stack_fg(fg) + vp[1:4,1:4] = qgg + + frg = frg.reshape(2,3,ngrids) + qrg = _stack_fg(frg, axis=1) + qrg = cupy.einsum('rabg,axg->xrbg', qrg, rho[:,1:4]) + #qrg = _stack_fg(frg, axis=1, rho=rho).transpose(2,0,1,3) + vp[0,1:4] = qrg + vp[1:4,0] = qrg.transpose(0,2,1,3) + + if order > 1: + fgt = fgt.reshape(3,2,ngrids) + qgt = _stack_fg(fgt, axis=0) + qgt = cupy.einsum('abrg,axg->xbrg', qgt, rho[:,1:4]) + # qgt = _stack_fg(fgt, axis=0, rho=rho).transpose(1,0,2,3) + vp[1:4,4] = qgt + vp[4,1:4] = qgt.transpose(0,2,1,3) + + qrt = frt.reshape(2,2,ngrids) + vp[0,4] = qrt + vp[4,0] = qrt.transpose(1,0,2) + + vp[4,4] = _stack_frr(ftt) + + vp = vp.transpose(2,0,3,1,4) else: if order == 0: diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py index 190a118d..b6421424 100644 --- a/gpu4pyscf/hessian/rhf.py +++ b/gpu4pyscf/hessian/rhf.py @@ -97,7 +97,7 @@ def hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, #dm1 = cupy.einsum('ypi,qi->ypq', mo1[ja], mocc) #de2_gpu[i0,j0] += cupy.einsum('xpq,ypq->xy', h1ao[ia], dm1) * 4 de2[i0,j0] += contract('xpi,ypi->xy', h1ao[ia], mo1[ja]) * 4 - dm1 = cupy.einsum('ypi,qi,i->ypq', mo1[ja], mocc, mo_energy[mo_occ>0]) + dm1 = contract('ypi,qi->ypq', mo1[ja], mocc*mo_energy[mo_occ>0]) de2[i0,j0] -= contract('xpq,ypq->xy', s1mo, dm1) * 4 de2[i0,j0] -= contract('xpq,ypq->xy', s1oo, mo_e1[ja]) * 2 for j0 in range(i0): @@ -176,19 +176,19 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, vk1 = vk1.reshape(3,3,nao,nao) t1 = log.timer_debug1('contracting int2e_ipvip1 for atom %d'%ia, *t1) - ej[i0,i0] += cupy.einsum('xypq,pq->xy', vj1_diag[:,:,p0:p1], dm0[p0:p1])*2 - ek[i0,i0] += cupy.einsum('xypq,pq->xy', vk1_diag[:,:,p0:p1], dm0[p0:p1]) - e1[i0,i0] -= cupy.einsum('xypq,pq->xy', s1aa[:,:,p0:p1], dme0[p0:p1])*2 + ej[i0,i0] += contract('xypq,pq->xy', vj1_diag[:,:,p0:p1], dm0[p0:p1])*2 + ek[i0,i0] += contract('xypq,pq->xy', vk1_diag[:,:,p0:p1], dm0[p0:p1]) + e1[i0,i0] -= contract('xypq,pq->xy', s1aa[:,:,p0:p1], dme0[p0:p1])*2 for j0, ja in enumerate(atmlst[:i0+1]): q0, q1 = aoslices[ja][2:] # *2 for +c.c. - ej[i0,j0] += cupy.einsum('xypq,pq->xy', vj1[:,:,q0:q1], dm0[q0:q1])*4 - ek[i0,j0] += cupy.einsum('xypq,pq->xy', vk1[:,:,q0:q1], dm0[q0:q1]) - e1[i0,j0] -= cupy.einsum('xypq,pq->xy', s1ab[:,:,p0:p1,q0:q1], dme0[p0:p1,q0:q1])*2 + ej[i0,j0] += contract('xypq,pq->xy', vj1[:,:,q0:q1], dm0[q0:q1])*4 + ek[i0,j0] += contract('xypq,pq->xy', vk1[:,:,q0:q1], dm0[q0:q1]) + e1[i0,j0] -= contract('xypq,pq->xy', s1ab[:,:,p0:p1,q0:q1], dme0[p0:p1,q0:q1])*2 h1ao = hcore_deriv(ia, ja) - e1[i0,j0] += cupy.einsum('xypq,pq->xy', h1ao, dm0) + e1[i0,j0] += contract('xypq,pq->xy', h1ao, dm0) for j0 in range(i0): e1[j0,i0] = e1[i0,j0].T diff --git a/gpu4pyscf/hessian/rks.py b/gpu4pyscf/hessian/rks.py index c9157cc9..b6cb4f0a 100644 --- a/gpu4pyscf/hessian/rks.py +++ b/gpu4pyscf/hessian/rks.py @@ -294,8 +294,9 @@ def contract_(mat, ao, aoidx, wv, mask): vmat = vmat[[0,1,2, 1,3,4, 2,4,5]] - vmat = cupy.einsum('pi,npq,qj->nij', coeff, vmat, coeff) + vmat = contract('npq,qj->npj', vmat, coeff) + vmat = contract('pi,npj->nij', coeff, vmat) return vmat.reshape(3,3,nao_sph,nao_sph) def _make_dR_rho1(ao, ao_dm0, atm_id, aoslices, xctype): @@ -584,7 +585,7 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory): for ia in range(mol.natm): p0, p1 = aoslices[ia][2:] # First order density = rho1 * 2. *2 is not applied because + c.c. in the end - rho1 = cupy.einsum('xig,ig->xg', ao[1:,p0:p1,:], ao_dm0[p0:p1,:]) + rho1 = contract('xig,ig->xg', ao[1:,p0:p1,:], ao_dm0[p0:p1,:]) wv = wf * rho1 aow = [numint._scale_ao(ao[0], wv[i]) for i in range(3)] vmat[ia] += rks_grad._d1_dot_(aow, ao[0].T) diff --git a/gpu4pyscf/lib/cupy_helper.py b/gpu4pyscf/lib/cupy_helper.py index 337d2c6a..f52f68f7 100644 --- a/gpu4pyscf/lib/cupy_helper.py +++ b/gpu4pyscf/lib/cupy_helper.py @@ -437,7 +437,7 @@ def krylov(aop, b, x0=None, tol=1e-10, max_cycle=30, dot=cupy.dot, ax = cupy.asarray(ax) nd = cycle + 1 - h = cupy.einsum('in,jn->ij', xs, ax) + h = cupy.dot(xs, ax.T) # Add the contribution of I in (1+a) h += cupy.diag(cupy.asarray(innerprod[:nd])) @@ -447,7 +447,7 @@ def krylov(aop, b, x0=None, tol=1e-10, max_cycle=30, dot=cupy.dot, g[0] = innerprod[0] else: ng = min(nd, nroots) - g[:ng, :nroots] += cupy.einsum('in,jn->ij', xs[:ng], b[:nroots]) + g[:ng, :nroots] += cupy.dot(xs[:ng], b[:nroots].T) ''' # Restore the first nroots vectors, which are array b or b-(1+a)x0 for i in range(min(nd, nroots)): @@ -492,4 +492,4 @@ def _qr(xs, dot, lindep=1e-14): return qs[:nv], cupy.linalg.inv(rmat[:nv,:nv]) def _gen_x0(v, xs): - return cupy.einsum('nk,nj->kj', v, xs) + return cupy.dot(v.T, xs) diff --git a/gpu4pyscf/scf/hf.py b/gpu4pyscf/scf/hf.py index 8bcfd65e..9ac52cfe 100644 --- a/gpu4pyscf/scf/hf.py +++ b/gpu4pyscf/scf/hf.py @@ -55,7 +55,7 @@ def get_jk(mol, dm, hermi=1, vhfopt=None, with_j=True, with_k=True, omega=None, nao, nao0 = coeff.shape dm0 = dm dms = cupy.asarray(dm0.reshape(-1,nao0,nao0)) - dms = [cupy.einsum('pi,ij,qj->pq', coeff, x, coeff) for x in dms] + dms = [coeff @ x @ coeff.T for x in dms] if dm0.ndim == 2: dms = cupy.asarray(dms[0], order='C').reshape(1,nao,nao) else: @@ -180,16 +180,18 @@ def get_jk(mol, dm, hermi=1, vhfopt=None, with_j=True, with_k=True, omega=None, vj_ao = [] #vj = [cupy.einsum('pi,pq,qj->ij', coeff, x, coeff) for x in vj] for x in vj: - x = cupy.einsum('pi,pq->iq', coeff, x) - x = cupy.einsum('iq,qj->ij', x, coeff) + #x = cupy.einsum('pi,pq->iq', coeff, x) + #x = cupy.einsum('iq,qj->ij', x, coeff) + x = coeff.T @ x @ coeff vj_ao.append(2.0*(x + x.T)) vj = vj_ao if with_k: vk_ao = [] for x in vk: - x = cupy.einsum('pi,pq->iq', coeff, x) - x = cupy.einsum('iq,qj->ij', x, coeff) + #x = cupy.einsum('pi,pq->iq', coeff, x) + #x = cupy.einsum('iq,qj->ij', x, coeff) + x = coeff.T @ x @ coeff vk_ao.append(x + x.T) vk = vk_ao diff --git a/gpu4pyscf/solvent/grad/pcm.py b/gpu4pyscf/solvent/grad/pcm.py index c1bdaded..be469508 100644 --- a/gpu4pyscf/solvent/grad/pcm.py +++ b/gpu4pyscf/solvent/grad/pcm.py @@ -270,7 +270,7 @@ def contract_ket(a, B, c): vK_1_D = vK_1.dot(D) vK_1_Dv = vK_1_D * v_grids - de_dR += 0.5*fac * cupy.einsum('j,xjn->nx', vK_1_Dv, dA) + de_dR += 0.5*fac * contract('j,xjn->nx', vK_1_Dv, dA) de_dS0 = 0.5*contract_ket(vK_1, dS, q) de_dS0 -= 0.5*contract_bra(vK_1, dS, q)