diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml index 5a0d1bd1..7f33e0dd 100644 --- a/.github/workflows/unittest.yml +++ b/.github/workflows/unittest.yml @@ -24,7 +24,7 @@ jobs: pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple python3 -m pip install --upgrade pip pip3 install flake8 pytest coverage pytest-cov pyscf-dispersion - pip3 install "pyscf>2.5" + pip3 install pyscf --upgrade pip3 install numpy --upgrade pip3 install h5py --upgrade pip3 install gpu4pyscf-libxc-cuda12x --upgrade diff --git a/README.md b/README.md index dbe59a0e..6eeb2e82 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,7 @@ Features - Density fitting scheme and direct SCF scheme; - SCF, analytical Gradient, and analytical Hessian calculations for Hartree-Fock and DFT; - LDA, GGA, mGGA, hybrid, and range-separated functionals via [libXC](https://gitlab.com/libxc/libxc/-/tree/master/); +- Spin-conserved and spin-flip TDA and TDDFT for excited states; - Geometry optimization and transition state search via [geomeTRIC](https://geometric.readthedocs.io/en/latest/); - Dispersion corrections via [DFTD3](https://github.com/dftd3/simple-dftd3) and [DFTD4](https://github.com/dftd4/dftd4); - Nonlocal functional correction (vv10) for SCF and gradient; diff --git a/examples/00-h2o.py b/examples/00-h2o.py index 2bf6c993..5ed2b6d1 100644 --- a/examples/00-h2o.py +++ b/examples/00-h2o.py @@ -60,6 +60,7 @@ # Compute Hessian h = mf_GPU.Hessian() h.auxbasis_response = 2 # 0: no aux contribution, 1: some contributions, 2: all +mf_GPU.cphf_grids.atom_grid = (50,194) # customize grids for solving CPSCF equation, SG1 by default h_dft = h.kernel() # harmonic analysis diff --git a/examples/24-cp_bsse.py b/examples/24-cp_bsse.py new file mode 100644 index 00000000..4ac8dc10 --- /dev/null +++ b/examples/24-cp_bsse.py @@ -0,0 +1,67 @@ +# Copyright 2024 The GPU4PySCF Authors. All Rights Reserved. + # +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see .
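+ +# The counterpoise (CP) scheme of Boys and Bernardi removes the basis set superposition error (BSSE): +# each monomer energy is recomputed in the full dimer basis by padding the monomer with the partner's +# ghost atoms ('X-' prefix), which carry basis functions but no nuclear charge or electrons, so that +# E_int(CP) = E(AB) - E(A; AB basis) - E(B; AB basis)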
+ +#################################################### +# Example of interaction energy with counterpoise correction +#################################################### + +import pyscf +from gpu4pyscf.dft import rks + +atom_A = [ +('O', (0.000000, 0.000000, 0.000000)), +('H', (0.000000, 0.757160, 0.586260)), +('H', (0.000000, -0.757160, 0.586260)) +] + +atom_B = [ +('O', (0.000000, 0.000000, 2.913530)), +('H', (0.000000, 0.757160, 3.499790)), +('H', (0.000000, -0.757160, 3.499790)) +] + +atom_AB = atom_A + atom_B + +mol_A = pyscf.M(atom=atom_A, basis='cc-pVDZ').build() +mol_B = pyscf.M(atom=atom_B, basis='cc-pVDZ').build() +mol_AB = pyscf.M(atom=atom_AB, basis='cc-pVDZ').build() + +# Monomer A in the dimer basis +mol_A_ghost = mol_A.copy() +ghost_atoms_B = mol_B.atom +mol_A_ghost.atom.extend([('X-' + atom[0], atom[1]) for atom in ghost_atoms_B]) +mol_A_ghost.build() + +# Monomer B in the dimer basis +mol_B_ghost = mol_B.copy() +ghost_atoms_A = mol_A.atom +mol_B_ghost.atom.extend([('X-' + atom[0], atom[1]) for atom in ghost_atoms_A]) +mol_B_ghost.build() + +def solve_dft(mol, xc='b3lyp'): + mf = rks.RKS(mol, xc=xc).density_fit() + mf.grids.atom_grid = (99,590) + return mf.kernel() + +E_AB = solve_dft(mol_AB) +E_A = solve_dft(mol_A) +E_B = solve_dft(mol_B) +interaction_energy_no_bsse = E_AB - (E_A + E_B) +print(f"Interaction Energy without BSSE Correction: {interaction_energy_no_bsse:.6f} Hartree") + +E_A_ghost = solve_dft(mol_A_ghost) +E_B_ghost = solve_dft(mol_B_ghost) +interaction_energy_bsse = E_AB - (E_A_ghost + E_B_ghost) +print(f"Interaction Energy with BSSE Correction: {interaction_energy_bsse:.6f} Hartree") diff --git a/gpu4pyscf/__config__.py b/gpu4pyscf/__config__.py index 5ecab3d4..73e90830 100644 --- a/gpu4pyscf/__config__.py +++ b/gpu4pyscf/__config__.py @@ -2,37 +2,16 @@ props = cupy.cuda.runtime.getDeviceProperties(0) GB = 1024*1024*1024 -# such as A100-80G -if props['totalGlobalMem'] >= 64 * GB: - min_ao_blksize = 128 - min_grid_blksize = 128*128 - ao_aligned = 32 - grid_aligned = 256 - mem_fraction = 0.9 - number_of_threads = 2048 * 108 -# such as V100-32G -elif props['totalGlobalMem'] >= 32 * GB: - min_ao_blksize = 128 - min_grid_blksize = 128*128 - ao_aligned = 32 - grid_aligned = 256 - mem_fraction = 0.9 - number_of_threads = 1024 * 80 -# such as A30-24GB -elif props['totalGlobalMem'] >= 16 * GB: - min_ao_blksize = 128 - min_grid_blksize = 128*128 - ao_aligned = 32 - grid_aligned = 256 - mem_fraction = 0.9 - number_of_threads = 1024 * 80 -# other gaming cards -else: +min_ao_blksize = 128 +min_grid_blksize = 128*128 +ao_aligned = 32 +grid_aligned = 256 + +# Use smaller blksize for old gaming GPUs +if props['totalGlobalMem'] < 16 * GB: min_ao_blksize = 64 min_grid_blksize = 64*64 - ao_aligned = 32 - grid_aligned = 128 - mem_fraction = 0.9 - number_of_threads = 1024 * 80 +# Use 90% of the global memory for CuPy memory pool +mem_fraction = 0.9 cupy.get_default_memory_pool().set_limit(fraction=mem_fraction) diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py index e8f66422..2ef8680a 100644 --- a/gpu4pyscf/df/df.py +++ b/gpu4pyscf/df/df.py @@ -91,7 +91,7 @@ def build(self, direct_scf_tol=1e-14, omega=None): log.timer_debug1('prepare intopt', *t0) self.j2c = j2c.copy() - j2c = take_last2d(j2c, intopt.aux_ao_idx) + j2c = intopt.sort_orbitals(j2c, aux_axis=[0,1]) try: self.cd_low = cholesky(j2c) self.cd_low = tag_array(self.cd_low, tag='cd') @@ -108,6 +108,7 @@ def build(self, direct_scf_tol=1e-14, omega=None): self._cderi = cholesky_eri_gpu(intopt, mol, auxmol,
self.cd_low, omega=omega) log.timer_debug1('cholesky_eri', *t0) self.intopt = intopt + return self def get_jk(self, dm, hermi=1, with_j=True, with_k=True, direct_scf_tol=getattr(__config__, 'scf_hf_SCF_direct_scf_tol', 1e-13), diff --git a/gpu4pyscf/df/df_jk.py b/gpu4pyscf/df/df_jk.py index 5a271903..ed181f62 100644 --- a/gpu4pyscf/df/df_jk.py +++ b/gpu4pyscf/df/df_jk.py @@ -1,17 +1,18 @@ -#!/usr/bin/env python -# Copyright 2014-2019 The PySCF Developers. All Rights Reserved. # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. # -# http://www.apache.org/licenses/LICENSE-2.0 +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . # # Author: Qiming Sun # Modified by Xiaojie Wu @@ -242,7 +243,7 @@ def to_cpu(self): obj = self.undo_df().to_cpu().density_fit() return utils.to_cpu(self, obj) -def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e-14, omega=None): +def get_jk(dfobj, dms_tag, hermi=0, with_j=True, with_k=True, direct_scf_tol=1e-14, omega=None): ''' get jk with density fitting outputs and input are on the same device @@ -268,31 +269,37 @@ def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e- assert nao == dfobj.nao vj = vk = None - ao_idx = dfobj.intopt.ao_idx - dms = take_last2d(dms, ao_idx) + intopt = dfobj.intopt + dms = intopt.sort_orbitals(dms, axis=[1,2]) dms_shape = dms.shape - rows = dfobj.intopt.cderi_row - cols = dfobj.intopt.cderi_col - + rows = intopt.cderi_row + cols = intopt.cderi_col + if with_j: dm_sparse = dms[:,rows,cols] - dm_sparse[:, dfobj.intopt.cderi_diag] *= .5 + if hermi == 0: + dm_sparse += dms[:,cols,rows] + else: + dm_sparse *= 2 + dm_sparse[:, intopt.cderi_diag] *= .5 if with_k: vk = cupy.zeros_like(dms) - + # SCF K matrix with occ if getattr(dms_tag, 'mo_coeff', None) is not None: + assert hermi == 1 mo_occ = dms_tag.mo_occ mo_coeff = dms_tag.mo_coeff nmo = mo_occ.shape[-1] mo_coeff = mo_coeff.reshape(-1,nao,nmo) mo_occ = mo_occ.reshape(-1,nmo) + mo_coeff = intopt.sort_orbitals(mo_coeff, axis=[1]) nocc = 0 occ_coeff = [0]*nset for i in range(nset): occ_idx = mo_occ[i] > 0 - occ_coeff[i] = mo_coeff[i][:,occ_idx][ao_idx] * mo_occ[i][occ_idx]**0.5 + occ_coeff[i] = mo_coeff[i][:,occ_idx] * mo_occ[i][occ_idx]**0.5 nocc += mo_occ[i].sum() blksize = dfobj.get_blksize(extra=nao*nocc) if with_j: @@ -300,7 +307,7 @@ def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e- for cderi, cderi_sparse in dfobj.loop(blksize=blksize, 
unpack=with_k): # leading dimension is 1 if with_j: - rhoj = 2.0*dm_sparse.dot(cderi_sparse) + rhoj = dm_sparse.dot(cderi_sparse) vj_packed += cupy.dot(rhoj, cderi_sparse.T) cderi_sparse = rhoj = None for i in range(nset): @@ -316,18 +323,18 @@ def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e- vj[:,rows,cols] = vj_packed vj[:,cols,rows] = vj_packed - # CP-HF K matrix elif hasattr(dms_tag, 'mo1'): + # K matrix in CP-HF or TDDFT occ_coeffs = dms_tag.occ_coeff mo1s = dms_tag.mo1 - mo_occ = dms_tag.mo_occ - if not isinstance(occ_coeffs, list): - occ_coeffs = [occ_coeffs * 2.0] # For restricted - if not isinstance(mo1s, list): + if not isinstance(occ_coeffs, (tuple, list)): + # *2 for double occupancy in RHF/RKS + occ_coeffs = [occ_coeffs * 2.0] + if not isinstance(mo1s, (tuple, list)): mo1s = [mo1s] - occ_coeffs = [occ_coeff[ao_idx] for occ_coeff in occ_coeffs] - mo1s = [mo1[:,ao_idx] for mo1 in mo1s] + occ_coeffs = [intopt.sort_orbitals(occ_coeff, axis=[0]) for occ_coeff in occ_coeffs] + mo1s = [intopt.sort_orbitals(mo1, axis=[1]) for mo1 in mo1s] if with_j: vj_sparse = cupy.zeros_like(dm_sparse) @@ -336,7 +343,7 @@ def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e- blksize = dfobj.get_blksize(extra=2*nao*nocc) for cderi, cderi_sparse in dfobj.loop(blksize=blksize, unpack=with_k): if with_j: - rhoj = 2.0*dm_sparse.dot(cderi_sparse) + rhoj = dm_sparse.dot(cderi_sparse) vj_sparse += cupy.dot(rhoj, cderi_sparse.T) rhoj = None cderi_sparse = None @@ -346,8 +353,8 @@ def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e- rhok = contract('Lij,jk->Lki', cderi, occ_coeff).reshape([-1,nao]) for i in range(mo1.shape[0]): rhok1 = contract('Lij,jk->Lki', cderi, mo1[i]).reshape([-1,nao]) - #contract('Lki,Lkj->ij', rhok, rhok1, alpha=1.0, beta=1.0, out=vk[iset]) - vk[iset] += cupy.dot(rhok.T, rhok1) + #contract('Lki,Lkj->ij', rhok1, rhok, alpha=1.0, beta=1.0, out=vk[iset]) + vk[iset] += cupy.dot(rhok1.T, rhok) iset += 1 mo1 = rhok1 = rhok = None cderi = None @@ -356,7 +363,7 @@ def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e- vj = cupy.zeros(dms_shape) vj[:,rows,cols] = vj_sparse vj[:,cols,rows] = vj_sparse - if with_k: + if with_k and hermi: transpose_sum(vk) vj_sparse = None # general K matrix with density matrix @@ -366,25 +373,24 @@ def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e- blksize = dfobj.get_blksize() for cderi, cderi_sparse in dfobj.loop(blksize=blksize, unpack=with_k): if with_j: - rhoj = 2.0*dm_sparse.dot(cderi_sparse) + rhoj = dm_sparse.dot(cderi_sparse) vj_sparse += cupy.dot(rhoj, cderi_sparse.T) if with_k: for k in range(nset): rhok = contract('Lij,jk->Lki', cderi, dms[k]).reshape([-1,nao]) - #vk[k] += contract('Lki,Lkj->ij', cderi, rhok) - vk[k] += cupy.dot(cderi.reshape([-1,nao]).T, rhok) + #vk[k] += contract('Lki,Lkj->ij', rhok, cderi) + vk[k] += cupy.dot(rhok.T, cderi.reshape([-1,nao])) if with_j: vj = cupy.zeros(dms_shape) vj[:,rows,cols] = vj_sparse vj[:,cols,rows] = vj_sparse rhok = None - rev_ao_idx = dfobj.intopt.rev_ao_idx if with_j: - vj = take_last2d(vj, rev_ao_idx) + vj = intopt.unsort_orbitals(vj, axis=[1,2]) vj = vj.reshape(out_shape) if with_k: - vk = take_last2d(vk, rev_ao_idx) + vk = intopt.unsort_orbitals(vk, axis=[1,2]) vk = vk.reshape(out_shape) t1 = log.timer_debug1('vj and vk', *t1) if out_cupy: diff --git a/gpu4pyscf/df/grad/rhf.py b/gpu4pyscf/df/grad/rhf.py index 05a09639..15645846 100644 --- 
a/gpu4pyscf/df/grad/rhf.py +++ b/gpu4pyscf/df/grad/rhf.py @@ -17,7 +17,7 @@ import numpy import cupy from cupyx.scipy.linalg import solve_triangular -from pyscf import scf +from pyscf import scf, gto from gpu4pyscf.df import int3c2e, df from gpu4pyscf.lib.cupy_helper import (print_mem_info, tag_array, unpack_tril, contract, load_library, take_last2d, cholesky) @@ -88,11 +88,11 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega raise NotImplementedError() mo_coeff = cupy.asarray(mf_grad.base.mo_coeff) mo_occ = cupy.asarray(mf_grad.base.mo_occ) - ao_idx = intopt.ao_idx - dm = take_last2d(dm0, ao_idx) + dm = intopt.sort_orbitals(dm0, axis=[0,1]) orbo = mo_coeff[:,mo_occ>0] * mo_occ[mo_occ>0] ** 0.5 - orbo = orbo[ao_idx, :] + mo_coeff = None + orbo = intopt.sort_orbitals(orbo, axis=[0]) nocc = orbo.shape[-1] # (L|ij) -> rhoj: (L), rhok: (L|oo) @@ -126,8 +126,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega else: int2c_e1 = auxmol.intor('int2c2e_ip1') int2c_e1 = cupy.asarray(int2c_e1) - aux_ao_idx = intopt.aux_ao_idx - rev_aux_idx = numpy.argsort(aux_ao_idx) + auxslices = auxmol.aoslice_by_atom() aux_cart2sph = intopt.aux_cart2sph low_t = low.T.copy() @@ -141,7 +140,8 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega rhoj_cart = contract('pq,q->p', aux_cart2sph, rhoj) else: rhoj_cart = rhoj - rhoj = rhoj[rev_aux_idx] + + rhoj = intopt.unsort_orbitals(rhoj, aux_axis=[0]) tmp = contract('xpq,q->xp', int2c_e1, rhoj) vjaux = -contract('xp,p->xp', tmp, rhoj) vjaux_2c = cupy.array([-vjaux[:,p0:p1].sum(axis=1) for p0, p1 in auxslices[:,2:]]) @@ -153,7 +153,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega #rhok = solve_triangular(low_t, rhok, lower=False) rhok = solve_triangular(low_t, rhok.reshape(naux, -1), lower=False, overwrite_b=True).reshape(naux, nocc, nocc) tmp = contract('pij,qij->pq', rhok, rhok) - tmp = take_last2d(tmp, rev_aux_idx) + tmp = intopt.unsort_orbitals(tmp, aux_axis=[0,1]) vkaux = -contract('xpq,pq->xp', int2c_e1, tmp) vkaux_2c = cupy.array([-vkaux[:,p0:p1].sum(axis=1) for p0, p1 in auxslices[:,2:]]) vkaux = tmp = None @@ -166,26 +166,25 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega t0 = log.timer_debug1('rhoj and rhok', *t0) int2c_e1 = None - nao_cart = intopt.mol.nao + nao_cart = intopt._sorted_mol.nao block_size = with_df.get_blksize(nao=nao_cart) intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e') intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size_aux=block_size)#, group_size=block_size) - if not intopt._mol.cart: + dm_cart = dm + orbo_cart = orbo + if not mol.cart: # sph2cart for ao cart2sph = intopt.cart2sph orbo_cart = cart2sph @ orbo dm_cart = cart2sph @ dm @ cart2sph.T - else: - dm_cart = dm - orbo_cart = orbo - dm = orbo = None + dm = orbo = None vj = vk = rhoj_tmp = rhok_tmp = None vjaux = vkaux = None - naux_cart = intopt.auxmol.nao + naux_cart = intopt._sorted_auxmol.nao if with_j: vj = cupy.zeros((3,nao_cart), order='C') vjaux = cupy.zeros((3,naux_cart)) @@ -193,8 +192,8 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega vk = cupy.zeros((3,nao_cart), order='C') vkaux = cupy.zeros((3,naux_cart)) cupy.get_default_memory_pool().free_all_blocks() + t1 = log.init_timer() for cp_kl_id in range(len(intopt.aux_log_qs)): - t1 = log.init_timer() k0, k1 = intopt.cart_aux_loc[cp_kl_id], intopt.cart_aux_loc[cp_kl_id+1] assert k1-k0 <= block_size 
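+ # each pass handles one block [k0:k1) of cartesian auxiliary functions; the contractions below accumulate the int3c2e-derivative contributions to vj/vk (AO part) and vjaux/vkaux (auxiliary part) for this block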
if with_j: @@ -233,33 +232,36 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega rhoj_tmp = rhok_tmp = vj_tmp = vk_tmp = None t1 = log.timer_debug1(f'calculate {cp_kl_id:3d} / {len(intopt.aux_log_qs):3d}, {k1-k0:3d} slices', *t1) - - # vj and vk are still in cartesian - cart_ao_idx = intopt.cart_ao_idx - rev_cart_ao_idx = numpy.argsort(cart_ao_idx) - aoslices = intopt.mol.aoslice_by_atom() + + # NOTE: vj and vk are still in cartesian + _sorted_mol = intopt._sorted_mol + natm = _sorted_mol.natm + ao2atom = numpy.zeros([nao_cart, natm]) + ao_loc = _sorted_mol.ao_loc + for ibas, iatm in enumerate(_sorted_mol._bas[:,gto.ATOM_OF]): + ao2atom[ao_loc[ibas]:ao_loc[ibas+1],iatm] = 1 + ao2atom = cupy.asarray(ao2atom) if with_j: - vj = vj[:, rev_cart_ao_idx] - vj = [-vj[:,p0:p1].sum(axis=1) for p0, p1 in aoslices[:,2:]] - vj = cupy.asarray(vj) + vj = -ao2atom.T @ vj.T if with_k: - vk = vk[:, rev_cart_ao_idx] - vk = [-vk[:,p0:p1].sum(axis=1) for p0, p1 in aoslices[:,2:]] - vk = cupy.asarray(vk) + vk = -ao2atom.T @ vk.T t0 = log.timer_debug1('(di,j|P) and (i,j|dP)', *t0) - cart_aux_idx = intopt.cart_aux_idx - rev_cart_aux_idx = numpy.argsort(cart_aux_idx) - auxslices = intopt.auxmol.aoslice_by_atom() + _sorted_auxmol = intopt._sorted_auxmol + natm = _sorted_auxmol.natm + aux2atom = numpy.zeros([naux_cart, natm]) + ao_loc = _sorted_auxmol.ao_loc + for ibas, iatm in enumerate(_sorted_auxmol._bas[:,gto.ATOM_OF]): + aux2atom[ao_loc[ibas]:ao_loc[ibas+1],iatm] = 1 + aux2atom = cupy.asarray(aux2atom) if with_j: - vjaux = vjaux[:, rev_cart_aux_idx] - vjaux_3c = cupy.asarray([-vjaux[:,p0:p1].sum(axis=1) for p0, p1 in auxslices[:,2:]]) - vjaux = vjaux_2c + vjaux_3c + vjaux_3c = aux2atom.T @ vjaux.T + vjaux = vjaux_2c - vjaux_3c if with_k: - vkaux = vkaux[:, rev_cart_aux_idx] - vkaux_3c = cupy.asarray([-vkaux[:,p0:p1].sum(axis=1) for p0, p1 in auxslices[:,2:]]) - vkaux = vkaux_2c + vkaux_3c + vkaux_3c = aux2atom.T @ vkaux.T + vkaux = vkaux_2c - vkaux_3c + return vj, vk, vjaux, vkaux @@ -303,4 +305,4 @@ def extra_force(self, atom_id, envs): else: return 0 -Grad = Gradients +Grad = Gradients \ No newline at end of file diff --git a/gpu4pyscf/df/grad/uhf.py b/gpu4pyscf/df/grad/uhf.py index c19cc3d6..5dcb7c23 100644 --- a/gpu4pyscf/df/grad/uhf.py +++ b/gpu4pyscf/df/grad/uhf.py @@ -17,7 +17,7 @@ import cupy import copy from cupyx.scipy.linalg import solve_triangular -from pyscf import scf +from pyscf import scf, gto from gpu4pyscf.df import int3c2e from gpu4pyscf.lib.cupy_helper import tag_array, contract, load_library, take_last2d from gpu4pyscf.grad import uhf as uhf_grad @@ -68,13 +68,14 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega mo_coeff = cupy.asarray(mf_grad.base.mo_coeff) if mo_occ is None: mo_occ = cupy.asarray(mf_grad.base.mo_occ) - ao_idx = intopt.ao_idx - dm = take_last2d(dm0, ao_idx) + + dm = intopt.sort_orbitals(dm0, axis=[0,1]) if dm2 is not None: - dm2_tmp = take_last2d(dm2, ao_idx) + dm2_tmp = intopt.sort_orbitals(dm2, axis=[0,1]) + # (L|ij) -> rhoj: (L), rhok: (L|oo) orbo = mo_coeff[:,mo_occ>0] * mo_occ[mo_occ>0] ** 0.5 - orbo = orbo[ao_idx, :] + orbo = intopt.sort_orbitals(orbo, axis=[0]) nocc = orbo.shape[-1] # (L|ij) -> rhoj: (L), rhok: (L|oo) @@ -115,8 +116,6 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega else: int2c_e1 = auxmol.intor('int2c2e_ip1') int2c_e1 = cupy.asarray(int2c_e1) - aux_ao_idx = intopt.aux_ao_idx - rev_aux_idx = np.argsort(aux_ao_idx) auxslices = 
auxmol.aoslice_by_atom() aux_cart2sph = intopt.aux_cart2sph low_t = low.T.copy() @@ -133,11 +132,11 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega rhoj_cart = contract('pq,q->p', aux_cart2sph, rhoj) else: rhoj_cart = rhoj - - rhoj = rhoj[rev_aux_idx] + rhoj = intopt.unsort_orbitals(rhoj, aux_axis=[0]) if dm2 is not None: - rhoj2 = rhoj2[rev_aux_idx] + rhoj2 = intopt.unsort_orbitals(rhoj2, aux_axis=[0]) + tmp = contract('xpq,q->xp', int2c_e1, rhoj) if dm2 is not None: vjaux = -contract('xp,p->xp', tmp, rhoj2) @@ -151,7 +150,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega elif low.tag == 'cd': rhok = solve_triangular(low_t, rhok.reshape(naux, -1), lower=False, overwrite_b=True).reshape(naux, nocc, nocc) tmp = contract('pij,qij->pq', rhok, rhok) - tmp = take_last2d(tmp, rev_aux_idx) + tmp = intopt.unsort_orbitals(tmp, aux_axis=[0,1]) vkaux = -contract('xpq,pq->xp', int2c_e1, tmp) vkaux_2c = cupy.array([-vkaux[:,p0:p1].sum(axis=1) for p0, p1 in auxslices[:,2:]]) vkaux = tmp = None @@ -164,33 +163,34 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega t0 = log.timer_debug1('rhoj and rhok', *t0) int2c_e1 = None - nao_cart = intopt.mol.nao + nao_cart = intopt._sorted_mol.nao block_size = with_df.get_blksize(nao=nao_cart) intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e') intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size_aux=block_size)#, group_size=block_size) - if not intopt._mol.cart: + + if not mol.cart: # sph2cart for ao cart2sph = intopt.cart2sph orbo_cart = cart2sph @ orbo if dm2 is None: dm_cart = cart2sph @ dm @ cart2sph.T else: - dm2_tmp = take_last2d(dm2, ao_idx) + dm2_tmp = intopt.sort_orbitals(dm2, axis=[0,1]) dm_cart = cart2sph @ dm2_tmp @ cart2sph.T else: if dm2 is None: dm_cart = dm else: - dm_cart = take_last2d(dm2, ao_idx) + dm_cart = intopt.sort_orbitals(dm2, axis=[0,1]) orbo_cart = orbo dm = orbo = None vj = vk = rhoj_tmp = rhok_tmp = None vjaux = vkaux = None - naux_cart = intopt.auxmol.nao + naux_cart = intopt._sorted_auxmol.nao if with_j: vj = cupy.zeros((3,nao_cart), order='C') vjaux = cupy.zeros((3,naux_cart)) @@ -198,8 +198,8 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega vk = cupy.zeros((3,nao_cart), order='C') vkaux = cupy.zeros((3,naux_cart)) cupy.get_default_memory_pool().free_all_blocks() + t1 = log.init_timer() for cp_kl_id in range(len(intopt.aux_log_qs)): - t1 = log.init_timer() k0, k1 = intopt.cart_aux_loc[cp_kl_id], intopt.cart_aux_loc[cp_kl_id+1] assert k1-k0 <= block_size if with_j: @@ -239,32 +239,34 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega rhoj_tmp = rhok_tmp = vj_tmp = vk_tmp = None t1 = log.timer_debug1(f'calculate {cp_kl_id:3d} / {len(intopt.aux_log_qs):3d}, {k1-k0:3d} slices', *t1) - cart_ao_idx = intopt.cart_ao_idx - rev_cart_ao_idx = np.argsort(cart_ao_idx) - aoslices = intopt.mol.aoslice_by_atom() + # NOTE: vj and vk are still in cartesian + _sorted_mol = intopt._sorted_mol + natm = _sorted_mol.natm + ao2atom = np.zeros([nao_cart, natm]) + ao_loc = _sorted_mol.ao_loc + for ibas, iatm in enumerate(_sorted_mol._bas[:,gto.ATOM_OF]): + ao2atom[ao_loc[ibas]:ao_loc[ibas+1],iatm] = 1 + ao2atom = cupy.asarray(ao2atom) if with_j: - vj = vj[:, rev_cart_ao_idx] - vj = [-vj[:,p0:p1].sum(axis=1) for p0, p1 in aoslices[:,2:]] - vj = cupy.asarray(vj) + vj = -ao2atom.T @ vj.T if with_k: - vk = vk[:, rev_cart_ao_idx] - vk = [-vk[:,p0:p1].sum(axis=1) for p0, 
p1 in aoslices[:,2:]] - vk = cupy.asarray(vk) + vk = -ao2atom.T @ vk.T t0 = log.timer_debug1('(di,j|P) and (i,j|dP)', *t0) - cart_aux_idx = intopt.cart_aux_idx - rev_cart_aux_idx = np.argsort(cart_aux_idx) - auxslices = intopt.auxmol.aoslice_by_atom() - + _sorted_auxmol = intopt._sorted_auxmol + natm = _sorted_auxmol.natm + aux2atom = np.zeros([naux_cart, natm]) + ao_loc = _sorted_auxmol.ao_loc + for ibas, iatm in enumerate(_sorted_auxmol._bas[:,gto.ATOM_OF]): + aux2atom[ao_loc[ibas]:ao_loc[ibas+1],iatm] = 1 + aux2atom = cupy.asarray(aux2atom) if with_j: - vjaux = vjaux[:, rev_cart_aux_idx] - vjaux_3c = cupy.asarray([-vjaux[:,p0:p1].sum(axis=1) for p0, p1 in auxslices[:,2:]]) - vjaux = vjaux_2c + vjaux_3c + vjaux_3c = aux2atom.T @ vjaux.T + vjaux = vjaux_2c - vjaux_3c if with_k: - vkaux = vkaux[:, rev_cart_aux_idx] - vkaux_3c = cupy.asarray([-vkaux[:,p0:p1].sum(axis=1) for p0, p1 in auxslices[:,2:]]) - vkaux = vkaux_2c + vkaux_3c + vkaux_3c = aux2atom.T @ vkaux.T + vkaux = vkaux_2c - vkaux_3c return vj, vk, vjaux, vkaux class Gradients(uhf_grad.Gradients): diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py index cc669174..b09e41af 100644 --- a/gpu4pyscf/df/hessian/rhf.py +++ b/gpu4pyscf/df/hessian/rhf.py @@ -96,19 +96,17 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, # ================================ sorted AO begin =============================================== intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e') intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE) - ao_idx = intopt.ao_idx - aux_ao_idx = intopt.aux_ao_idx - naux = len(aux_ao_idx) - mocc_2 = mocc_2[ao_idx, :] - dm0 = take_last2d(dm0, ao_idx) + naux = auxmol.nao #len(aux_ao_idx) + mocc_2 = intopt.sort_orbitals(mocc_2, axis=[0]) + dm0 = intopt.sort_orbitals(dm0, axis=[0,1]) dm0_tag = tag_array(dm0, occ_coeff=mocc_2) int2c = cupy.asarray(int2c, order='C') - int2c = take_last2d(int2c, aux_ao_idx) + int2c = intopt.sort_orbitals(int2c, aux_axis=[0,1]) solve_j2c = _gen_metric_solver(int2c) int2c_ip1 = cupy.asarray(int2c_ip1, order='C') - int2c_ip1 = take_last2d(int2c_ip1, aux_ao_idx) + int2c_ip1 = intopt.sort_orbitals(int2c_ip1, aux_axis=[1,2]) hj_ao_ao = cupy.zeros([nao,nao,3,3]) hk_ao_ao = cupy.zeros([nao,nao,3,3]) @@ -255,7 +253,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, else: int2c_ipip1 = auxmol.intor('int2c2e_ipip1', aosym='s1') int2c_ipip1 = cupy.asarray(int2c_ipip1, order='C') - int2c_ipip1 = take_last2d(int2c_ipip1, aux_ao_idx) + int2c_ipip1 = intopt.sort_orbitals(int2c_ipip1, aux_axis=[1,2]) rhoj2c_P = contract('xpq,q->xp', int2c_ipip1, rhoj0_P) # (00|0)(2|0)(0|00) # p,xp->px @@ -271,7 +269,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, else: int2c_ip1ip2 = auxmol.intor('int2c2e_ip1ip2', aosym='s1') int2c_ip1ip2 = cupy.asarray(int2c_ip1ip2, order='C') - int2c_ip1ip2 = take_last2d(int2c_ip1ip2, aux_ao_idx) + int2c_ip1ip2 = intopt.sort_orbitals(int2c_ip1ip2, aux_axis=[1,2]) hj_aux_aux = -.5 * contract('p,xpq->pqx', rhoj0_P, int2c_ip1ip2*rhoj0_P).reshape(naux, naux,3,3) if with_k: hk_aux_aux = -.5 * contract('xpq,pq->pqx', int2c_ip1ip2, rho2c_0).reshape(naux,naux,3,3) @@ -329,29 +327,22 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, rho2c_10= int2c_ip1_inv = None t1 = log.timer_debug1('contract int2c_*', *t1) - ao_idx = np.argsort(intopt.ao_idx) - aux_idx = np.argsort(intopt.aux_ao_idx) - rev_ao_ao = cupy.ix_(ao_idx, 
ao_idx) - dm0 = dm0[rev_ao_ao] - hj_ao_diag = hj_ao_diag[ao_idx] - hj_ao_ao = hj_ao_ao[rev_ao_ao] + dm0 = intopt.unsort_orbitals(dm0, axis=[0,1]) + hj_ao_diag = intopt.unsort_orbitals(hj_ao_diag, axis=[0]) + hj_ao_ao = intopt.unsort_orbitals(hj_ao_ao, axis=[0,1]) if hessobj.auxbasis_response: - rev_ao_aux = cupy.ix_(ao_idx, aux_idx) - hj_ao_aux = hj_ao_aux[rev_ao_aux] + hj_ao_aux = intopt.unsort_orbitals(hj_ao_aux, axis=[0], aux_axis=[1]) if hessobj.auxbasis_response > 1: - rev_aux_aux = cupy.ix_(aux_idx, aux_idx) - hj_aux_diag = hj_aux_diag[aux_idx] - hj_aux_aux = hj_aux_aux[rev_aux_aux] - + hj_aux_diag = intopt.unsort_orbitals(hj_aux_diag, aux_axis=[0]) + hj_aux_aux = intopt.unsort_orbitals(hj_aux_aux, aux_axis=[0,1]) if with_k: - hk_ao_diag = hk_ao_diag[ao_idx] - hk_ao_ao = hk_ao_ao[rev_ao_ao] + hk_ao_diag = intopt.unsort_orbitals(hk_ao_diag, axis=[0]) + hk_ao_ao = intopt.unsort_orbitals(hk_ao_ao, axis=[0,1]) if hessobj.auxbasis_response: - hk_ao_aux = hk_ao_aux[rev_ao_aux] + hk_ao_aux = intopt.unsort_orbitals(hk_ao_aux, axis=[0], aux_axis=[1]) if hessobj.auxbasis_response > 1: - hk_aux_diag = hk_aux_diag[aux_idx] - hk_aux_aux = hk_aux_aux[rev_aux_aux] - + hk_aux_diag = intopt.unsort_orbitals(hk_aux_diag, aux_axis=[0]) + hk_aux_aux = intopt.unsort_orbitals(hk_aux_aux, aux_axis=[0,1]) #======================================== sort AO end =========================================== # Energy weighted density matrix # pi,qi,i->pq @@ -460,7 +451,6 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, mo_occ = cupy.asarray(mo_occ, order='C') mf = hessobj.base - #auxmol = hessobj.base.with_df.auxmol auxmol = df.addons.make_auxmol(mol, auxbasis=mf.with_df.auxbasis) aoslices = mol.aoslice_by_atom() auxslices = auxmol.aoslice_by_atom() @@ -486,16 +476,14 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, aosym=False, group_size_aux=BLKSIZE, group_size=BLKSIZE) - ao_idx = intopt.ao_idx - aux_ao_idx = intopt.aux_ao_idx - naux = len(aux_ao_idx) - mocc = mocc[ao_idx, :] + naux = auxmol.nao + mocc = intopt.sort_orbitals(mocc, axis=[0]) nocc = mocc.shape[1] - mo_coeff = mo_coeff[ao_idx,:] - dm0 = take_last2d(dm0, ao_idx) + mo_coeff = intopt.sort_orbitals(mo_coeff, axis=[0]) + dm0 = intopt.sort_orbitals(dm0, axis=[0,1]) dm0_tag = tag_array(dm0, occ_coeff=mocc) - - int2c = take_last2d(int2c, aux_ao_idx) + + int2c = intopt.sort_orbitals(int2c, aux_axis=[0,1]) solve_j2c = _gen_metric_solver(int2c) wj, wk_Pl_ = int3c2e.get_int3c2e_wjk(mol, auxmol, dm0_tag, omega=omega) rhoj0 = solve_j2c(wj) @@ -530,7 +518,7 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, else: int2c_ip1 = auxmol.intor('int2c2e_ip1', aosym='s1') int2c_ip1 = cupy.asarray(int2c_ip1, order='C') - int2c_ip1 = take_last2d(int2c_ip1, aux_ao_idx) + int2c_ip1 = intopt.sort_orbitals(int2c_ip1, aux_axis=[1,2]) # Generate rhok0_P__ if isinstance(rhok0_Pl_, cupy.ndarray): @@ -583,17 +571,17 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, rhoj0 = rhok0_Pl_ = None vk1_ao *= 2.0 vk1_buf *= 2.0 - rev_ao_idx = np.argsort(ao_idx) - vj1_buf = take_last2d(vj1_buf, rev_ao_idx) - vk1_buf = take_last2d(vk1_buf, rev_ao_idx) + + vj1_buf = intopt.unsort_orbitals(vj1_buf, axis=[1,2]) + vk1_buf = intopt.unsort_orbitals(vk1_buf, axis=[1,2]) vj1_int3c_ip1 = -contract('nxiq,ip->nxpq', vj1_ao, mo_coeff) vk1_int3c_ip1 = -contract('nxiq,ip->nxpq', vk1_ao, mo_coeff) vj1_ao = vk1_ao = None t0 = log.timer_debug1('Fock matrix due to int3c2e_ip1', *t0) - mocc = mocc[rev_ao_idx] - mo_coeff = 
mo_coeff[rev_ao_idx] + mocc = intopt.unsort_orbitals(mocc, axis=[0]) + mo_coeff = intopt.unsort_orbitals(mo_coeff, axis=[0]) release_gpu_stack() # ========================== sorted AO end ================================ diff --git a/gpu4pyscf/df/hessian/rks.py b/gpu4pyscf/df/hessian/rks.py index 468a0add..014142fa 100644 --- a/gpu4pyscf/df/hessian/rks.py +++ b/gpu4pyscf/df/hessian/rks.py @@ -55,7 +55,7 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, raise NotImplementedError omega, alpha, hyb = mf._numint.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin) - with_k = abs(hyb) > 1e-10 + with_k = mf._numint.libxc.is_hybrid_xc(mf.xc) de2, ej, ek = df_rhf_hess._partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ, atmlst, max_memory, verbose, with_k=with_k) @@ -98,11 +98,12 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): mem_now = lib.current_memory()[0] max_memory = max(2000, mf.max_memory*.9-mem_now) h1mo = rks_hess._get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory) + with_k = ni.libxc.is_hybrid_xc(mf.xc) for ia, h1, vj1, vk1 in df_rhf_hess._gen_jk(hessobj, mo_coeff, mo_occ, chkfile, - atmlst, verbose, abs(hyb) > 1e-10): + atmlst, verbose, with_k): h1mo[ia] += h1 + vj1 - if abs(hyb) > 1e-10 or abs(alpha-hyb) > 1e-10: + if with_k: h1mo[ia] -= .5 * hyb * vk1 if abs(omega) > 1e-10 and abs(alpha-hyb) > 1e-10: for ia, h1, vj1_lr, vk1_lr in df_rhf_hess._gen_jk(hessobj, mo_coeff, mo_occ, chkfile, diff --git a/gpu4pyscf/df/hessian/tests/test_df_uks_hessian.py b/gpu4pyscf/df/hessian/tests/test_df_uks_hessian.py index 490608e6..c39f0172 100644 --- a/gpu4pyscf/df/hessian/tests/test_df_uks_hessian.py +++ b/gpu4pyscf/df/hessian/tests/test_df_uks_hessian.py @@ -83,6 +83,7 @@ def test_df_gga(self): mf = mf.to_gpu() hessobj = mf.Hessian() + hessobj.base.cphf_grids = hessobj.base.grids hess_gpu = hessobj.kernel() assert numpy.linalg.norm(hess_cpu - hess_gpu) < 1e-5 @@ -98,9 +99,11 @@ def test_df_mgga(self): mf = mf.to_gpu() hessobj = mf.Hessian() + hessobj.base.cphf_grids = hessobj.base.grids hess_gpu = hessobj.kernel() assert numpy.linalg.norm(hess_cpu - hess_gpu) < 1e-5 if __name__ == "__main__": print("Full Tests for DF UKS Hessian") - unittest.main() \ No newline at end of file + unittest.main() + \ No newline at end of file diff --git a/gpu4pyscf/df/hessian/uhf.py b/gpu4pyscf/df/hessian/uhf.py index a66b6557..71a6c7dc 100644 --- a/gpu4pyscf/df/hessian/uhf.py +++ b/gpu4pyscf/df/hessian/uhf.py @@ -100,23 +100,23 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, # ================================ sorted AO begin =============================================== intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e') intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE) - ao_idx = intopt.ao_idx - aux_ao_idx = intopt.aux_ao_idx - mocca = mocca[ao_idx, :] - moccb = moccb[ao_idx, :] - dm0a = take_last2d(dm0a, ao_idx) - dm0b = take_last2d(dm0b, ao_idx) + mocca = intopt.sort_orbitals(mocca, axis=[0]) + moccb = intopt.sort_orbitals(moccb, axis=[0]) + dm0a = intopt.sort_orbitals(dm0a, axis=[0,1]) + dm0b = intopt.sort_orbitals(dm0b, axis=[0,1]) + dm0a_tag = tag_array(dm0a, occ_coeff=mocca) dm0b_tag = tag_array(dm0b, occ_coeff=moccb) int2c = cupy.asarray(int2c, order='C') - int2c = take_last2d(int2c, aux_ao_idx) + int2c = intopt.sort_orbitals(int2c, aux_axis=[0,1]) + int2c_inv = pinv(int2c, lindep=LINEAR_DEP_THR) solve_j2c = _gen_metric_solver(int2c) int2c = None int2c_ip1 = 
cupy.asarray(int2c_ip1, order='C') - int2c_ip1 = take_last2d(int2c_ip1, aux_ao_idx) + int2c_ip1 = intopt.sort_orbitals(int2c_ip1, aux_axis=[1,2]) hj_ao_ao = cupy.zeros([nao,nao,3,3]) hk_ao_ao = cupy.zeros([nao,nao,3,3]) @@ -272,7 +272,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, else: int2c_ipip1 = auxmol.intor('int2c2e_ipip1', aosym='s1') int2c_ipip1 = cupy.asarray(int2c_ipip1, order='C') - int2c_ipip1 = take_last2d(int2c_ipip1, aux_ao_idx) + int2c_ipip1 = intopt.sort_orbitals(int2c_ipip1, aux_axis=[1,2]) rhoj2c_P = contract('xpq,q->xp', int2c_ipip1, rhoj0_P) # (00|0)(2|0)(0|00) # p,xp->px @@ -289,7 +289,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, else: int2c_ip1ip2 = auxmol.intor('int2c2e_ip1ip2', aosym='s1') int2c_ip1ip2 = cupy.asarray(int2c_ip1ip2, order='C') - int2c_ip1ip2 = take_last2d(int2c_ip1ip2, aux_ao_idx) + int2c_ip1ip2 = intopt.sort_orbitals(int2c_ip1ip2, aux_axis=[1,2]) hj_aux_aux = -.5 * contract('p,xpq->pqx', rhoj0_P, int2c_ip1ip2*rhoj0_P).reshape(naux, naux,3,3) if with_k: hk_aux_aux = -.5 * contract('xpq,pq->pqx', int2c_ip1ip2, rho2c_0).reshape(naux,naux,3,3) @@ -349,32 +349,23 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, rho2c_10= int2c_ip1_inv = None t1 = log.timer_debug1('contract int2c_*', *t1) - ao_idx = np.argsort(intopt.ao_idx) - aux_idx = np.argsort(intopt.aux_ao_idx) - rev_ao_ao = cupy.ix_(ao_idx, ao_idx) - #dm0 = dm0[rev_ao_ao] - hj_ao_diag = hj_ao_diag[ao_idx] - hj_ao_ao = hj_ao_ao[rev_ao_ao] + hj_ao_diag = intopt.unsort_orbitals(hj_ao_diag, axis=[0]) + hj_ao_ao = intopt.unsort_orbitals(hj_ao_ao, axis=[0,1]) if hessobj.auxbasis_response: - rev_ao_aux = cupy.ix_(ao_idx, aux_idx) - hj_ao_aux = hj_ao_aux[rev_ao_aux] + hj_ao_aux = intopt.unsort_orbitals(hj_ao_aux, axis=[0], aux_axis=[1]) if hessobj.auxbasis_response > 1: - rev_aux_aux = cupy.ix_(aux_idx, aux_idx) - hj_aux_diag = hj_aux_diag[aux_idx] - hj_aux_aux = hj_aux_aux[rev_aux_aux] - + hj_aux_diag = intopt.unsort_orbitals(hj_aux_diag, aux_axis=[0]) + hj_aux_aux = intopt.unsort_orbitals(hj_aux_aux, aux_axis=[0,1]) if with_k: - hk_ao_diag = hk_ao_diag[ao_idx] - hk_ao_ao = hk_ao_ao[rev_ao_ao] + hk_ao_diag = intopt.unsort_orbitals(hk_ao_diag, axis=[0]) + hk_ao_ao = intopt.unsort_orbitals(hk_ao_ao, axis=[0,1]) if hessobj.auxbasis_response: - hk_ao_aux = hk_ao_aux[rev_ao_aux] + hk_ao_aux = intopt.unsort_orbitals(hk_ao_aux, axis=[0], aux_axis=[1]) if hessobj.auxbasis_response > 1: - hk_aux_diag = hk_aux_diag[aux_idx] - hk_aux_aux = hk_aux_aux[rev_aux_aux] - - mocca = mocca[ao_idx] - moccb = moccb[ao_idx] - + hk_aux_diag = intopt.unsort_orbitals(hk_aux_diag, aux_axis=[0]) + hk_aux_aux = intopt.unsort_orbitals(hk_aux_aux, aux_axis=[0,1]) + mocca = intopt.unsort_orbitals(mocca, axis=[0]) + moccb = intopt.unsort_orbitals(moccb, axis=[0]) #======================================== sort AO end =========================================== # Energy weighted density matrix # pi,qi,i->pq @@ -517,17 +508,15 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, aosym=False, group_size_aux=BLKSIZE, group_size=BLKSIZE) - ao_idx = intopt.ao_idx - aux_ao_idx = intopt.aux_ao_idx - - mocca = mocca[ao_idx, :] - moccb = moccb[ao_idx, :] - mo_coeff = mo_coeff[:, ao_idx,:] - dm0a = take_last2d(dm0a, ao_idx) - dm0b = take_last2d(dm0b, ao_idx) + + mocca = intopt.sort_orbitals(mocca, axis=[0]) + moccb = intopt.sort_orbitals(moccb, axis=[0]) + mo_coeff = intopt.sort_orbitals(mo_coeff, axis=[1]) + dm0a = 
intopt.sort_orbitals(dm0a, axis=[0,1]) + dm0b = intopt.sort_orbitals(dm0b, axis=[0,1]) dm0 = dm0a + dm0b - int2c = take_last2d(int2c, aux_ao_idx) + int2c = intopt.sort_orbitals(int2c, aux_axis=[0,1]) solve_j2c = _gen_metric_solver(int2c) int2c = None @@ -567,10 +556,10 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, vj1_buf, vk1a_buf, vj1a_ao, vk1a_ao = fn(intopt, rhoj0, rhok0a_Pl_, dm0_tag, aoslices, omega=omega) dm0_tag = tag_array(dm0, occ_coeff=moccb) vj1_buf, vk1b_buf, vj1b_ao, vk1b_ao = fn(intopt, rhoj0, rhok0b_Pl_, dm0_tag, aoslices, omega=omega) - rev_ao_idx = np.argsort(ao_idx) - vj1_buf = take_last2d(vj1_buf, rev_ao_idx) - vk1a_buf = take_last2d(vk1a_buf, rev_ao_idx) - vk1b_buf = take_last2d(vk1b_buf, rev_ao_idx) + + vj1_buf = intopt.unsort_orbitals(vj1_buf, axis=[1,2]) + vk1a_buf = intopt.unsort_orbitals(vk1a_buf, axis=[1,2]) + vk1b_buf = intopt.unsort_orbitals(vk1b_buf, axis=[1,2]) vj1a_int3c = -contract('nxiq,ip->nxpq', vj1a_ao, mo_coeff[0]) vj1b_int3c = -contract('nxiq,ip->nxpq', vj1b_ao, mo_coeff[1]) @@ -597,13 +586,13 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, else: int2c_ip1 = auxmol.intor('int2c2e_ip1', aosym='s1') int2c_ip1 = cupy.asarray(int2c_ip1, order='C') - int2c_ip1 = take_last2d(int2c_ip1, aux_ao_idx) + int2c_ip1 = intopt.sort_orbitals(int2c_ip1, aux_axis=[1,2]) # generate rhok0_P__ if isinstance(rhok0a_Pl_, cupy.ndarray): rhok0a_P__ = contract('pio,ir->pro', rhok0a_Pl_, mocca) else: - naux = len(aux_ao_idx) + naux = auxmol.nao nocc = mocca.shape[1] rhok0a_P__ = cupy.empty([naux,nocc,nocc]) for p0, p1 in lib.prange(0,naux,64): @@ -615,7 +604,7 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, if isinstance(rhok0b_Pl_, cupy.ndarray): rhok0b_P__ = contract('pio,ir->pro', rhok0b_Pl_, moccb) else: - naux = len(aux_ao_idx) + naux = auxmol.nao nocc = moccb.shape[1] rhok0b_P__ = cupy.empty([naux,nocc,nocc]) for p0, p1 in lib.prange(0,naux,64): @@ -670,9 +659,9 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, vk1a_int3c_ip2 = vk1b_int3c_ip2 = None t0 = log.timer_debug1('Fock matrix due to int3c2e_ip2', *t0) - mocca = mocca[rev_ao_idx] - moccb = moccb[rev_ao_idx] - mo_coeff = mo_coeff[:,rev_ao_idx] + mocca = intopt.unsort_orbitals(mocca, axis=[0]) + moccb = intopt.unsort_orbitals(moccb, axis=[0]) + mo_coeff = intopt.unsort_orbitals(mo_coeff, axis=[1]) release_gpu_stack() # ========================== sorted AO end ================================ diff --git a/gpu4pyscf/df/hessian/uks.py b/gpu4pyscf/df/hessian/uks.py index 9ab957be..3a4dbd52 100644 --- a/gpu4pyscf/df/hessian/uks.py +++ b/gpu4pyscf/df/hessian/uks.py @@ -57,7 +57,7 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, raise NotImplementedError omega, alpha, hyb = mf._numint.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin) - with_k = abs(hyb) > 1e-10 + with_k = mf._numint.libxc.is_hybrid_xc(mf.xc) de2, ej, ek = df_uhf_hess._partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ, atmlst, max_memory, verbose, with_k=with_k) @@ -103,13 +103,14 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): mem_now = lib.current_memory()[0] max_memory = max(2000, mf.max_memory*.9-mem_now) h1moa, h1mob = uks_hess._get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory) + with_k = ni.libxc.is_hybrid_xc(mf.xc) for ia, h1, vj1, vk1 in df_uhf_hess._gen_jk(hessobj, mo_coeff, mo_occ, chkfile, - atmlst, verbose, abs(hyb) > 1e-10): + atmlst, verbose, with_k): h1moa[ia] += h1[0] + vj1[0] h1mob[ia] += h1[1] 
+ vj1[1] - if abs(hyb) > 1e-10 or abs(alpha-hyb) > 1e-10: + if with_k: vk1a, vk1b = vk1 h1moa[ia] -= hyb * vk1a h1mob[ia] -= hyb * vk1b diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py index f2aa0a5a..834c587c 100644 --- a/gpu4pyscf/df/int3c2e.py +++ b/gpu4pyscf/df/int3c2e.py @@ -64,19 +64,13 @@ def make_fake_mol(): class VHFOpt(_vhf.VHFOpt): def __init__(self, mol, auxmol, intor, prescreen='CVHFnoscreen', qcondname='CVHFsetnr_direct_scf', dmcondname=None): - # use local basis_seg_contraction for efficiency - # TODO: switch _mol and mol - self.mol = basis_seg_contraction(mol,allow_replica=True) - self.auxmol = basis_seg_contraction(auxmol, allow_replica=True) - self._mol = mol - self._auxmol = auxmol + self.mol = mol # original mol + self.auxmol = auxmol # original auxiliary mol + self._sorted_mol = None # sorted mol + self._sorted_auxmol = None # sorted auxiliary mol - ''' - # Note mol._bas will be sorted in .build() method. VHFOpt should be - # initialized after mol._bas updated. - ''' - self.nao = self.mol.nao - self.naux = self.auxmol.nao + self._ao_idx = None + self._aux_ao_idx = None self._intor = intor self._prescreen = prescreen @@ -85,11 +79,6 @@ def __init__(self, mol, auxmol, intor, prescreen='CVHFnoscreen', self.bpcache = None - self.cart_ao_idx = None - self.sph_ao_idx = None - self.cart_aux_idx = None - self.sph_aux_idx = None - self.cart_ao_loc = [] self.cart_aux_loc = [] self.sph_ao_loc = [] @@ -128,14 +117,16 @@ def build(self, cutoff=1e-14, group_size=None, a tot_mol is created with concatenating [mol, fake_mol, aux_mol] we will pair (ao,ao) and (aux,1) separately. ''' - _mol = self._mol - _auxmol = self._auxmol - mol = self.mol - auxmol = self.auxmol + _mol = self.mol + _auxmol = self.auxmol + mol = basis_seg_contraction(_mol,allow_replica=True) + auxmol = basis_seg_contraction(_auxmol, allow_replica=True) + log = logger.new_logger(_mol, _mol.verbose) cput0 = log.init_timer() - sorted_mol, sorted_idx, uniq_l_ctr, l_ctr_counts = sort_mol(mol, log=log) + _sorted_mol, sorted_idx, uniq_l_ctr, l_ctr_counts = sort_mol(mol, log=log) + if group_size is not None : uniq_l_ctr, l_ctr_counts = _split_l_ctr_groups(uniq_l_ctr, l_ctr_counts, group_size) self.nctr = len(uniq_l_ctr) @@ -145,16 +136,16 @@ def build(self, cutoff=1e-14, group_size=None, _, _, fake_uniq_l_ctr, fake_l_ctr_counts = sort_mol(fake_mol, log=log) # sort auxiliary mol - sorted_auxmol, sorted_aux_idx, aux_uniq_l_ctr, aux_l_ctr_counts = sort_mol(auxmol, log=log) + _sorted_auxmol, sorted_aux_idx, aux_uniq_l_ctr, aux_l_ctr_counts = sort_mol(auxmol, log=log) if group_size_aux is not None: aux_uniq_l_ctr, aux_l_ctr_counts = _split_l_ctr_groups(aux_uniq_l_ctr, aux_l_ctr_counts, group_size_aux) - - tot_mol = sorted_mol + fake_mol + sorted_auxmol - tot_mol.cart = True - self.tot_mol = tot_mol + + _tot_mol = _sorted_mol + fake_mol + _sorted_auxmol + _tot_mol.cart = True + self._tot_mol = _tot_mol # Initialize vhfopt after reordering mol._bas - _vhf.VHFOpt.__init__(self, sorted_mol, self._intor, self._prescreen, + _vhf.VHFOpt.__init__(self, _sorted_mol, self._intor, self._prescreen, self._qcondname, self._dmcondname) self.direct_scf_tol = cutoff @@ -169,32 +160,19 @@ def build(self, cutoff=1e-14, group_size=None, cput1 = log.timer_debug1('Get pairing', *cput1) # contraction coefficient for ao basis - cart_ao_loc = sorted_mol.ao_loc_nr(cart=True) - sph_ao_loc = sorted_mol.ao_loc_nr(cart=False) + cart_ao_loc = _sorted_mol.ao_loc_nr(cart=True) + sph_ao_loc = _sorted_mol.ao_loc_nr(cart=False) self.cart_ao_loc = 
[cart_ao_loc[cp] for cp in l_ctr_offsets] self.sph_ao_loc = [sph_ao_loc[cp] for cp in l_ctr_offsets] self.angular = [l[0] for l in uniq_l_ctr] - cart_ao_loc = mol.ao_loc_nr(cart=True) - sph_ao_loc = mol.ao_loc_nr(cart=False) - nao = sph_ao_loc[-1] - ao_idx = np.array_split(np.arange(nao), sph_ao_loc[1:-1]) - self.sph_ao_idx = np.hstack([ao_idx[i] for i in sorted_idx]) + # Sorted AO indices + ao_loc = mol.ao_loc_nr(cart=_mol.cart) + ao_idx = np.array_split(np.arange(_mol.nao), ao_loc[1:-1]) + self._ao_idx = np.hstack([ao_idx[i] for i in sorted_idx]) # cartesian ao index - nao = cart_ao_loc[-1] - ao_idx = np.array_split(np.arange(nao), cart_ao_loc[1:-1]) - self.cart_ao_idx = np.hstack([ao_idx[i] for i in sorted_idx]) - ncart = cart_ao_loc[-1] - nsph = sph_ao_loc[-1] - self.cart2sph = block_c2s_diag(ncart, nsph, self.angular, l_ctr_counts) - - if _mol.cart: - inv_idx = np.argsort(self.cart_ao_idx, kind='stable').astype(np.int32) - self.coeff = cupy.eye(ncart)[:,inv_idx] - else: - inv_idx = np.argsort(self.sph_ao_idx, kind='stable').astype(np.int32) - self.coeff = self.cart2sph[:, inv_idx] + self.cart2sph = block_c2s_diag(self.angular, l_ctr_counts) cput1 = log.timer_debug1('AO cart2sph coeff', *cput1) # pairing auxiliary basis with fake basis set @@ -203,36 +181,22 @@ def build(self, cutoff=1e-14, group_size=None, aux_l_ctr_offsets = np.append(0, np.cumsum(aux_l_ctr_counts)) # contraction coefficient for auxiliary basis - cart_aux_loc = sorted_auxmol.ao_loc_nr(cart=True) - sph_aux_loc = sorted_auxmol.ao_loc_nr(cart=False) + cart_aux_loc = _sorted_auxmol.ao_loc_nr(cart=True) + sph_aux_loc = _sorted_auxmol.ao_loc_nr(cart=False) self.cart_aux_loc = [cart_aux_loc[cp] for cp in aux_l_ctr_offsets] self.sph_aux_loc = [sph_aux_loc[cp] for cp in aux_l_ctr_offsets] self.aux_angular = [l[0] for l in aux_uniq_l_ctr] - cart_aux_loc = self.auxmol.ao_loc_nr(cart=True) - sph_aux_loc = self.auxmol.ao_loc_nr(cart=False) - naux = sph_aux_loc[-1] - ao_idx = np.array_split(np.arange(naux), sph_aux_loc[1:-1]) - self.sph_aux_idx = np.hstack([ao_idx[i] for i in sorted_aux_idx]) + aux_loc = _auxmol.ao_loc_nr(cart=_auxmol.cart) + ao_idx = np.array_split(np.arange(_auxmol.nao), aux_loc[1:-1]) + self._aux_ao_idx = np.hstack([ao_idx[i] for i in sorted_aux_idx]) # cartesian aux index - naux = cart_aux_loc[-1] - ao_idx = np.array_split(np.arange(naux), cart_aux_loc[1:-1]) - self.cart_aux_idx = np.hstack([ao_idx[i] for i in sorted_aux_idx]) - ncart = cart_aux_loc[-1] - nsph = sph_aux_loc[-1] - self.aux_cart2sph = block_c2s_diag(ncart, nsph, self.aux_angular, aux_l_ctr_counts) - - if _auxmol.cart: - inv_idx = np.argsort(self.cart_aux_idx, kind='stable').astype(np.int32) - self.aux_coeff = cupy.eye(ncart)[:,inv_idx] - else: - inv_idx = np.argsort(self.sph_aux_idx, kind='stable').astype(np.int32) - self.aux_coeff = self.aux_cart2sph[:, inv_idx] + self.aux_cart2sph = block_c2s_diag(self.aux_angular, aux_l_ctr_counts) aux_l_ctr_offsets += fake_l_ctr_offsets[-1] cput1 = log.timer_debug1('aux cart2sph coeff', *cput1) - ao_loc = sorted_mol.ao_loc_nr(cart=_mol.cart) + ao_loc = _sorted_mol.ao_loc_nr(cart=_mol.cart) self.ao_pairs_row, self.ao_pairs_col = get_ao_pairs(pair2bra, pair2ket, ao_loc) cderi_row = cupy.hstack(self.ao_pairs_row) cderi_col = cupy.hstack(self.ao_pairs_col) @@ -268,7 +232,7 @@ def build(self, cutoff=1e-14, group_size=None, bas_pair2shls = np.hstack(pair2bra + pair2ket).astype(np.int32).reshape(2,-1) bas_pairs_locs = np.append(0, np.cumsum([x.size for x in pair2bra])).astype(np.int32) log_qs = log_qs + aux_log_qs 
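+ # ao_loc indexes the concatenated _tot_mol (= _sorted_mol + fake_mol + _sorted_auxmol, forced cartesian) built above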
- ao_loc = tot_mol.ao_loc_nr(cart=True) + ao_loc = _tot_mol.ao_loc_nr(cart=True) ncptype = len(log_qs) self.bpcache = ctypes.POINTER(BasisProdCache)() @@ -278,9 +242,9 @@ def build(self, cutoff=1e-14, group_size=None, ao_loc.ctypes.data_as(ctypes.c_void_p), bas_pair2shls.ctypes.data_as(ctypes.c_void_p), bas_pairs_locs.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(ncptype), - tot_mol._atm.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(tot_mol.natm), - tot_mol._bas.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(tot_mol.nbas), - tot_mol._env.ctypes.data_as(ctypes.c_void_p)) + _tot_mol._atm.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(_tot_mol.natm), + _tot_mol._bas.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(_tot_mol.nbas), + _tot_mol._env.ctypes.data_as(ctypes.c_void_p)) cput1 = log.timer_debug1('Initialize GPU cache', *cput1) self.bas_pairs_locs = bas_pairs_locs @@ -294,25 +258,79 @@ def build(self, cutoff=1e-14, group_size=None, if _mol.cart: self.ao_loc = self.cart_ao_loc - self.ao_idx = self.cart_ao_idx else: self.ao_loc = self.sph_ao_loc - self.ao_idx = self.sph_ao_idx if _auxmol.cart: self.aux_ao_loc = self.cart_aux_loc - self.aux_ao_idx = self.cart_aux_idx else: self.aux_ao_loc = self.sph_aux_loc - self.aux_ao_idx = self.sph_aux_idx - self.rev_ao_idx = np.argsort(self.ao_idx, kind='stable').astype(np.int32) - self.ao_idx = cupy.array(self.ao_idx) - self.cart_ao_idx = cupy.array(self.cart_ao_idx) - self.sph_ao_idx = cupy.array(self.sph_ao_idx) - self.aux_ao_idx = cupy.array(self.aux_ao_idx) - self.cart_aux_idx = cupy.array(self.cart_aux_idx) - self.sph_aux_idx = cupy.array(self.sph_aux_idx) - self.rev_ao_idx = cupy.array(self.rev_ao_idx) + self._sorted_mol = _sorted_mol + self._sorted_auxmol = _sorted_auxmol + + def sort_orbitals(self, mat, axis=[], aux_axis=[]): + ''' Transform given axis of a matrix into sorted AO, + and transform given auxiliary axis of a matrix into sorted auxiliary AO + ''' + idx = self._ao_idx + aux_idx = self._aux_ao_idx + shape_ones = (1,) * mat.ndim + fancy_index = [] + for dim, n in enumerate(mat.shape): + if dim in axis: + assert n == len(idx) + indices = idx + elif dim in aux_axis: + assert n == len(aux_idx) + indices = aux_idx + else: + indices = np.arange(n) + idx_shape = shape_ones[:dim] + (-1,) + shape_ones[dim+1:] + fancy_index.append(indices.reshape(idx_shape)) + return mat[tuple(fancy_index)] + + def unsort_orbitals(self, sorted_mat, axis=[], aux_axis=[]): + ''' Transform given axis of a matrix into original AO, + and transform given auxiliary axis of a matrix into original auxiliary AO + ''' + idx = self._ao_idx + aux_idx = self._aux_ao_idx + shape_ones = (1,) * sorted_mat.ndim + fancy_index = [] + for dim, n in enumerate(sorted_mat.shape): + if dim in axis: + assert n == len(idx) + indices = idx + elif dim in aux_axis: + assert n == len(aux_idx) + indices = aux_idx + else: + indices = np.arange(n) + idx_shape = shape_ones[:dim] + (-1,) + shape_ones[dim+1:] + fancy_index.append(indices.reshape(idx_shape)) + mat = cupy.empty_like(sorted_mat) + mat[tuple(fancy_index)] = sorted_mat + return mat + + @property + def coeff(self): + nao = self.mol.nao + if self.mol.cart: + coeff = cupy.eye(nao) + self._coeff = self.unsort_orbitals(coeff, axis=[1]) + else: + self._coeff = self.unsort_orbitals(self.cart2sph, axis=[1]) + return self._coeff + + @property + def aux_coeff(self): + naux = self.auxmol.nao + if self.auxmol.cart: + coeff = cupy.eye(naux) + self._aux_coeff = self.unsort_orbitals(coeff, aux_axis=[1]) + else: + self._aux_coeff = 
self.unsort_orbitals(self.aux_cart2sph, aux_axis=[1]) + return self._aux_coeff def get_int3c2e_wjk(mol, auxmol, dm0_tag, thred=1e-12, omega=None, with_k=True): log = logger.new_logger(mol, mol.verbose) @@ -351,7 +369,7 @@ def get_int3c2e_wjk(mol, auxmol, dm0_tag, thred=1e-12, omega=None, with_k=True): li = intopt.angular[cpi] lj = intopt.angular[cpj] int3c_blk = get_int3c2e_slice(intopt, cp_ij_id, cp_kl_id, omega=omega) - if not intopt._mol.cart: + if not intopt.mol.cart: int3c_blk = cart2sph(int3c_blk, axis=1, ang=lj) int3c_blk = cart2sph(int3c_blk, axis=2, ang=li) i0, i1 = intopt.ao_loc[cpi], intopt.ao_loc[cpi+1] @@ -378,7 +396,7 @@ def get_int3c2e_ip_jk(intopt, cp_aux_id, ip_type, rhoj, rhok, dm, omega=None): ''' fn = getattr(libgvhf, 'GINTbuild_int3c2e_' + ip_type + '_jk') if omega is None: omega = 0.0 - nao = intopt.mol.nao + nao = intopt._sorted_mol.nao n_dm = 1 cp_kl_id = cp_aux_id + len(intopt.log_qs) @@ -451,19 +469,19 @@ def loop_int3c2e_general(intopt, ip_type='', omega=None, stream=None): if omega is None: omega = 0.0 if stream is None: stream = cupy.cuda.get_current_stream() - nao = intopt.mol.nao - naux = intopt.auxmol.nao + nao = intopt._sorted_mol.nao + naux = intopt._sorted_auxmol.nao norb = nao + naux + 1 ao_loc = intopt.ao_loc aux_ao_loc = intopt.aux_ao_loc comp = 3**order - lmax = intopt.mol._bas[:gto.ANG_OF].max() - aux_lmax = intopt.auxmol._bas[:gto.ANG_OF].max() + lmax = intopt._sorted_mol._bas[:gto.ANG_OF].max() + aux_lmax = intopt._sorted_auxmol._bas[:gto.ANG_OF].max() nroots = (lmax + aux_lmax + order)//2 + 1 if nroots > NROOT_ON_GPU: from pyscf.gto.moleintor import getints, make_cintopt - pmol = intopt.tot_mol + pmol = intopt._tot_mol intor = pmol._add_suffix('int3c2e_' + ip_type) opt = make_cintopt(pmol._atm, pmol._bas, pmol._env, intor) @@ -519,9 +537,9 @@ def loop_int3c2e_general(intopt, ip_type='', omega=None, stream=None): int3c_cpu = getints(intor, pmol._atm, pmol._bas, pmol._env, shls_slice, cintopt=opt).transpose([0,3,2,1]) int3c_blk = cupy.asarray(int3c_cpu) - if not intopt._auxmol.cart: + if not intopt.auxmol.cart: int3c_blk = cart2sph(int3c_blk, axis=1, ang=lk) - if not intopt._mol.cart: + if not intopt.mol.cart: int3c_blk = cart2sph(int3c_blk, axis=2, ang=lj) int3c_blk = cart2sph(int3c_blk, axis=3, ang=li) @@ -550,9 +568,9 @@ def loop_aux_jk(intopt, ip_type='', omega=None, stream=None): if omega is None: omega = 0.0 if stream is None: stream = cupy.cuda.get_current_stream() - nao = len(intopt.ao_idx) - nao_cart = intopt.mol.nao - naux_cart = intopt.auxmol.nao + nao = intopt.mol.nao + nao_cart = intopt._sorted_mol.nao + naux_cart = intopt._sorted_auxmol.nao norb_cart = nao_cart + naux_cart + 1 ao_loc = intopt.ao_loc aux_ao_loc = intopt.aux_ao_loc @@ -615,20 +633,20 @@ def loop_aux_jk(intopt, ip_type='', omega=None, stream=None): yield aux_id, ints_slices def get_ao2atom(intopt, aoslices): - ao_idx = intopt.ao_idx - ao2atom = cupy.zeros([len(ao_idx), len(aoslices)]) + nao = intopt.mol.nao + ao2atom = cupy.zeros([nao, len(aoslices)]) for ia, aoslice in enumerate(aoslices): _, _, p0, p1 = aoslice ao2atom[p0:p1,ia] = 1.0 - return ao2atom[ao_idx,:] + return intopt.sort_orbitals(ao2atom, axis=[0]) def get_aux2atom(intopt, auxslices): - aux_ao_idx = intopt.aux_ao_idx - aux2atom = cupy.zeros([len(aux_ao_idx), len(auxslices)]) + naux = intopt.auxmol.nao + aux2atom = cupy.zeros([naux, len(auxslices)]) for ia, auxslice in enumerate(auxslices): _, _, p0, p1 = auxslice aux2atom[p0:p1,ia] = 1.0 - return aux2atom[aux_ao_idx,:] + return intopt.sort_orbitals(aux2atom, 
aux_axis=[0]) def get_j_int3c2e_pass1(intopt, dm0, sort_j=True): ''' @@ -636,22 +654,24 @@ def get_j_int3c2e_pass1(intopt, dm0, sort_j=True): ''' n_dm = 1 - naux = intopt.cart_aux_loc[-1]#len(intopt.cart_aux_idx) - rhoj = cupy.zeros([naux]) + naux = intopt._sorted_auxmol.nao + coeff = intopt.coeff if dm0.ndim == 3: dm0 = dm0[0] + dm0[1] dm_cart = coeff @ dm0 @ coeff.T - + num_cp_ij = [len(log_qs) for log_qs in intopt.log_qs] num_cp_kl = [len(log_qs) for log_qs in intopt.aux_log_qs] bins_locs_ij = np.append(0, np.cumsum(num_cp_ij)).astype(np.int32) bins_locs_kl = np.append(0, np.cumsum(num_cp_kl)).astype(np.int32) - + ncp_ij = len(intopt.log_qs) ncp_kl = len(intopt.aux_log_qs) norb = dm_cart.shape[0] + + rhoj = cupy.zeros([naux]) err = libgvhf.GINTbuild_j_int3c2e_pass1( intopt.bpcache, ctypes.cast(dm_cart.data.ptr, ctypes.c_void_p), @@ -665,7 +685,7 @@ def get_j_int3c2e_pass1(intopt, dm0, sort_j=True): ctypes.c_int(ncp_kl)) if err != 0: raise RuntimeError('CUDA error in get_j_pass1') - + if sort_j: aux_coeff = intopt.aux_coeff rhoj = cupy.dot(rhoj, aux_coeff) @@ -676,8 +696,8 @@ def get_j_int3c2e_pass2(intopt, rhoj): get vj pass2 for int3c2e ''' n_dm = 1 - norb = len(intopt.cart_ao_idx) - naux = len(intopt.cart_aux_idx) + norb = intopt._sorted_mol.nao + naux = intopt._sorted_auxmol.nao vj = cupy.zeros([norb, norb]) num_cp_ij = [len(log_qs) for log_qs in intopt.log_qs] @@ -688,9 +708,10 @@ def get_j_int3c2e_pass2(intopt, rhoj): ncp_ij = len(intopt.log_qs) ncp_kl = len(intopt.aux_log_qs) - - aux_coeff = intopt.aux_coeff - rhoj = cupy.dot(aux_coeff, rhoj) + + rhoj = intopt.sort_orbitals(rhoj, aux_axis=[0]) + if not intopt.auxmol.cart: + rhoj = intopt.aux_cart2sph @ rhoj err = libgvhf.GINTbuild_j_int3c2e_pass2( intopt.bpcache, @@ -706,8 +727,11 @@ def get_j_int3c2e_pass2(intopt, rhoj): if err != 0: raise RuntimeError('CUDA error in get_j_pass2') - coeff = intopt.coeff - vj = coeff.T @ vj @ coeff + + if not intopt.mol.cart: + cart2sph = intopt.cart2sph + vj = cart2sph.T @ vj @ cart2sph + vj = intopt.unsort_orbitals(vj, axis=[0,1]) vj = vj + vj.T return vj @@ -719,7 +743,7 @@ def get_int3c2e_jk(mol, auxmol, dm0_tag, with_k=True, omega=None): intopt.build(1e-14, diag_block_with_triu=True, aosym=True, group_size=BLKSIZE, group_size_aux=BLKSIZE) if omega is None: omega = 0.0 - naux = len(intopt.aux_ao_idx) + naux = auxmol.nao orbo = cupy.asarray(dm0_tag.occ_coeff, order='C') nocc = orbo.shape[1] rhoj = cupy.empty([naux]) @@ -736,7 +760,7 @@ def get_int3c2e_jk(mol, auxmol, dm0_tag, with_k=True, omega=None): li = intopt.angular[cpi] lj = intopt.angular[cpj] int3c_blk = get_int3c2e_slice(intopt, cp_ij_id, cp_kl_id, omega=omega) - if not intopt._mol.cart: + if not intopt.mol.cart: int3c_blk = cart2sph(int3c_blk, axis=1, ang=lj) int3c_blk = cart2sph(int3c_blk, axis=2, ang=li) i0, i1 = intopt.ao_loc[cpi], intopt.ao_loc[cpi+1] @@ -761,8 +785,8 @@ def get_int3c2e_ip1_vjk(intopt, rhoj, rhok, dm0_tag, aoslices, with_k=True, omeg # vj and vk responses (due to int3c2e_ip1) to changes in atomic positions ''' ao2atom = get_ao2atom(intopt, aoslices) - natom = len(aoslices) - nao = len(intopt.ao_idx) + natom = intopt.mol.natm + nao = intopt.mol.nao orbo = cupy.asarray(dm0_tag.occ_coeff, order='C') nocc = orbo.shape[1] vj1_buf = cupy.zeros([3,nao,nao]) @@ -820,8 +844,8 @@ def get_int3c2e_ip2_vjk(intopt, rhoj, rhok, dm0_tag, auxslices, with_k=True, ome vj and vk responses (due to int3c2e_ip2) to changes in atomic positions ''' aux2atom = get_aux2atom(intopt, auxslices) - natom = len(auxslices) - nao = 
len(intopt.ao_idx) + natom = intopt.mol.natm + nao = intopt.mol.nao orbo = cupy.asarray(dm0_tag.occ_coeff, order='C') nocc = orbo.shape[1] vj1 = cupy.zeros([natom,3,nao,nocc]) @@ -863,8 +887,8 @@ def get_int3c2e_ip1_wjk(intopt, dm0_tag, with_k=True, omega=None): ''' get wj and wk for int3c2e_ip1 ''' - nao = len(intopt.ao_idx) - naux = len(intopt.aux_ao_idx) + nao = intopt.mol.nao + naux = intopt.auxmol.nao orbo = cupy.asarray(dm0_tag.occ_coeff, order='C') nocc = orbo.shape[1] @@ -903,7 +927,7 @@ def get_int3c2e_ip2_wjk(intopt, dm0_tag, with_k=True, omega=None): ''' get wj and wk for int3c2e_ip2 ''' - naux = len(intopt.aux_ao_idx) + naux = intopt.auxmol.nao orbo = cupy.asarray(dm0_tag.occ_coeff, order='C') nocc = orbo.shape[1] wj = cupy.zeros([naux,3]) @@ -918,12 +942,12 @@ def get_int3c2e_ipip1_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, omega=None): ''' get hj and hk with int3c2e_ipip1 ''' - nao_sph = dm0_tag.shape[0] + nao = dm0_tag.shape[0] orbo = cupy.asarray(dm0_tag.occ_coeff, order='C') - hj = cupy.zeros([nao_sph,9]) + hj = cupy.zeros([nao,9]) hk = None if with_k: - hk = cupy.zeros([nao_sph,9]) + hk = cupy.zeros([nao,9]) for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ipip1', omega=omega): tmp = contract('xpji,ij->xpi', int3c_blk, dm0_tag[i0:i1,j0:j1]) hj[i0:i1] += contract('xpi,p->ix', tmp, rhoj[k0:k1]) @@ -931,21 +955,21 @@ def get_int3c2e_ipip1_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, omega=None): rhok_tmp = contract('por,ir->pio', rhok[k0:k1], orbo[i0:i1]) rhok_tmp = contract('pio,jo->pij', rhok_tmp, orbo[j0:j1]) hk[i0:i1] += contract('xpji,pij->ix', int3c_blk, rhok_tmp) - hj = hj.reshape([nao_sph,3,3]) + hj = hj.reshape([nao,3,3]) if with_k: - hk = hk.reshape([nao_sph,3,3]) + hk = hk.reshape([nao,3,3]) return hj, hk def get_int3c2e_ipvip1_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, omega=None): ''' # get hj and hk with int3c2e_ipvip1 ''' - nao_sph = dm0_tag.shape[0] + nao = dm0_tag.shape[0] orbo = cupy.asarray(dm0_tag.occ_coeff, order='C') - hj = cupy.zeros([nao_sph,nao_sph,9]) + hj = cupy.zeros([nao,nao,9]) hk = None if with_k: - hk = cupy.zeros([nao_sph,nao_sph,9]) + hk = cupy.zeros([nao,nao,9]) for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ipvip1', omega=omega): tmp = contract('xpji,ij->xpij', int3c_blk, dm0_tag[i0:i1,j0:j1]) hj[i0:i1,j0:j1] += contract('xpij,p->ijx', tmp, rhoj[k0:k1]) @@ -953,22 +977,22 @@ def get_int3c2e_ipvip1_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, omega=None) rhok_tmp = contract('por,ir->pio', rhok[k0:k1], orbo[i0:i1]) rhok_tmp = contract('pio,jo->pji', rhok_tmp, orbo[j0:j1]) hk[i0:i1,j0:j1] += contract('xpji,pji->ijx', int3c_blk, rhok_tmp) - hj = hj.reshape([nao_sph,nao_sph,3,3]) + hj = hj.reshape([nao,nao,3,3]) if with_k: - hk = hk.reshape([nao_sph,nao_sph,3,3]) + hk = hk.reshape([nao,nao,3,3]) return hj, hk def get_int3c2e_ip1ip2_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, omega=None): ''' # get hj and hk with int3c2e_ip1ip2 ''' - nao_sph = dm0_tag.shape[0] - naux_sph = rhok.shape[0] + nao = dm0_tag.shape[0] + naux = rhok.shape[0] orbo = cupy.asarray(dm0_tag.occ_coeff, order='C') - hj = cupy.zeros([nao_sph,naux_sph,9]) + hj = cupy.zeros([nao,naux,9]) hk = None if with_k: - hk = cupy.zeros([nao_sph,naux_sph,9]) + hk = cupy.zeros([nao,naux,9]) for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ip1ip2', omega=omega): tmp = contract('xpji,ij->xpi', int3c_blk, dm0_tag[i0:i1,j0:j1]) hj[i0:i1,k0:k1] += contract('xpi,p->ipx', tmp, rhoj[k0:k1]) @@ -976,21 +1000,21 @@ def 
get_int3c2e_ip1ip2_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, omega=None) rhok_tmp = contract('por,ir->pio', rhok[k0:k1], orbo[i0:i1]) rhok_tmp = contract('pio,jo->pij', rhok_tmp, orbo[j0:j1]) hk[i0:i1,k0:k1] += contract('xpji,pij->ipx', int3c_blk, rhok_tmp) - hj = hj.reshape([nao_sph,naux_sph,3,3]) + hj = hj.reshape([nao,naux,3,3]) if with_k: - hk = hk.reshape([nao_sph,naux_sph,3,3]) + hk = hk.reshape([nao,naux,3,3]) return hj, hk def get_int3c2e_ipip2_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, omega=None): ''' # get hj and hk with int3c2e_ipip2 ''' - naux_sph = rhok.shape[0] + naux = rhok.shape[0] orbo = cupy.asarray(dm0_tag.occ_coeff, order='C') - hj = cupy.zeros([naux_sph,9]) + hj = cupy.zeros([naux,9]) hk = None if with_k: - hk = cupy.zeros([naux_sph,9]) + hk = cupy.zeros([naux,9]) for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ipip2', omega=omega): tmp = contract('xpji,ij->xp', int3c_blk, dm0_tag[i0:i1,j0:j1]) hj[k0:k1] += contract('xp,p->px', tmp, rhoj[k0:k1]) @@ -998,9 +1022,9 @@ def get_int3c2e_ipip2_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, omega=None): rhok_tmp = contract('por,jr->pjo', rhok[k0:k1], orbo[j0:j1]) rhok_tmp = contract('pjo,io->pji', rhok_tmp, orbo[i0:i1]) hk[k0:k1] += contract('xpji,pji->px', int3c_blk, rhok_tmp) - hj = hj.reshape([naux_sph,3,3]) + hj = hj.reshape([naux,3,3]) if with_k: - hk = hk.reshape([naux_sph,3,3]) + hk = hk.reshape([naux,3,3]) return hj, hk def get_hess_nuc_elec(mol, dm): @@ -1016,8 +1040,7 @@ def get_hess_nuc_elec(mol, dm): fakemol.stdout = mol.stdout intopt = VHFOpt(mol, fakemol, 'int2e') intopt.build(1e-14, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE) - ao_idx = intopt.ao_idx - dm = take_last2d(cupy.asarray(dm), ao_idx) + dm = intopt.sort_orbitals(cupy.asarray(dm), axis=[0,1]) natm = mol.natm nao = mol.nao @@ -1172,9 +1195,9 @@ def get_int3c2e_ip(mol, auxmol=None, ip_type=1, auxbasis='weigend+etb', direct_s if err != 0: raise RuntimeError("int3c2e_ip failed\n") - if not intopt._auxmol.cart: + if not intopt.auxmol.cart: int3c_blk = cart2sph(int3c_blk, axis=1, ang=lk) - if not intopt._mol.cart: + if not intopt.mol.cart: int3c_blk = cart2sph(int3c_blk, axis=2, ang=lj) int3c_blk = cart2sph(int3c_blk, axis=3, ang=li) @@ -1183,13 +1206,9 @@ def get_int3c2e_ip(mol, auxmol=None, ip_type=1, auxbasis='weigend+etb', direct_s k0, k1 = aux_ao_loc[aux_id], aux_ao_loc[aux_id+1] int3c[:, k0:k1, j0:j1, i0:i1] = int3c_blk - ao_idx = np.argsort(intopt.ao_idx) - aux_idx = np.argsort(intopt.aux_ao_idx) - int3c = int3c[cupy.ix_(np.arange(3), aux_idx, ao_idx, ao_idx)] - + int3c = intopt.unsort_orbitals(int3c, aux_axis=[1], axis=[2,3]) return int3c.transpose([0,3,2,1]) - def get_int3c2e_general(mol, auxmol=None, ip_type='', auxbasis='weigend+etb', direct_scf_tol=1e-13, omega=None, stream=None): ''' Generate full int3c2e type tensor on GPU @@ -1219,13 +1238,12 @@ def get_int3c2e_general(mol, auxmol=None, ip_type='', auxbasis='weigend+etb', di nroots = (lmax + aux_lmax + order)//2 + 1 if nroots > NROOT_ON_GPU: from pyscf.gto.moleintor import getints, make_cintopt - mol = intopt.mol - pmol = intopt.tot_mol + pmol = intopt._tot_mol intor = pmol._add_suffix('int3c2e_' + ip_type) opt = make_cintopt(pmol._atm, pmol._bas, pmol._env, intor) - nao_cart = intopt.mol.nao - naux_cart = intopt.auxmol.nao + nao_cart = intopt._sorted_mol.nao + naux_cart = intopt._sorted_auxmol.nao norb_cart = nao_cart + naux_cart + 1 ao_loc = intopt.ao_loc aux_ao_loc = intopt.aux_ao_loc @@ -1281,9 +1299,9 @@ def 
get_int3c2e_general(mol, auxmol=None, ip_type='', auxbasis='weigend+etb', di int3c_cpu = getints(intor, pmol._atm, pmol._bas, pmol._env, shls_slice, cintopt=opt).transpose([0,3,2,1]) int3c_blk = cupy.asarray(int3c_cpu) - if not intopt._auxmol.cart: + if not intopt.auxmol.cart: int3c_blk = cart2sph(int3c_blk, axis=1, ang=lk) - if not intopt._mol.cart: + if not intopt.mol.cart: int3c_blk = cart2sph(int3c_blk, axis=2, ang=lj) int3c_blk = cart2sph(int3c_blk, axis=3, ang=li) @@ -1293,10 +1311,7 @@ def get_int3c2e_general(mol, auxmol=None, ip_type='', auxbasis='weigend+etb', di int3c[:, k0:k1, j0:j1, i0:i1] = int3c_blk - ao_idx = np.argsort(intopt.ao_idx) - aux_idx = np.argsort(intopt.aux_ao_idx) - int3c = int3c[cupy.ix_(np.arange(comp), aux_idx, ao_idx, ao_idx)] - + int3c = intopt.unsort_orbitals(int3c, aux_axis=[1], axis=[2,3]) return int3c.transpose([0,3,2,1]) def get_dh1e(mol, dm0): @@ -1313,7 +1328,7 @@ def get_dh1e(mol, dm0): fakemol.stdout = mol.stdout intopt = VHFOpt(mol, fakemol, 'int2e') intopt.build(1e-14, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE) - dm0_sorted = take_last2d(dm0, intopt.ao_idx) + dm0_sorted = intopt.sort_orbitals(dm0, axis=[0,1]) dh1e = cupy.zeros([natm,3]) for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ip1'): dh1e[k0:k1,:3] += contract('xkji,ij->kx', int3c_blk, dm0_sorted[i0:i1,j0:j1]) @@ -1332,7 +1347,7 @@ def get_d2h1e(mol, dm0): d2h1e_offdiag = cupy.zeros([natm, nao, 9]) intopt = VHFOpt(mol, fakemol, 'int2e') intopt.build(1e-14, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE) - dm0_sorted = take_last2d(dm0, intopt.ao_idx) + dm0_sorted = intopt.sort_orbitals(dm0, axis=[0,1]) for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ipip1'): d2h1e_diag[k0:k1,:9] -= contract('xaji,ij->ax', int3c_blk, dm0_sorted[i0:i1,j0:j1]) d2h1e_offdiag[k0:k1,i0:i1,:9] += contract('xaji,ij->aix', int3c_blk, dm0_sorted[i0:i1,j0:j1]) @@ -1352,8 +1367,8 @@ def get_int3c2e_slice(intopt, cp_ij_id, cp_aux_id, cart=False, aosym=None, out=N ''' if stream is None: stream = cupy.cuda.get_current_stream() if omega is None: omega = 0.0 - nao_cart = intopt.mol.nao - naux_cart = intopt.auxmol.nao + nao_cart = intopt._sorted_mol.nao + naux_cart = intopt._sorted_auxmol.nao norb_cart = nao_cart + naux_cart + 1 cpi = intopt.cp_idx[cp_ij_id] @@ -1381,7 +1396,7 @@ def get_int3c2e_slice(intopt, cp_ij_id, cp_aux_id, cart=False, aosym=None, out=N # if possible, write the data into the given allocated space # otherwise, need a temporary space for cart2sph ''' - if out is None or (lk > 1 and not intopt._auxmol.cart): + if out is None or (lk > 1 and not intopt.auxmol.cart): int3c_blk = cupy.zeros([nk,nj,ni], order='C') strides = np.array([1, ni, ni*nj, 1], dtype=np.int32) else: @@ -1408,7 +1423,7 @@ def get_int3c2e_slice(intopt, cp_ij_id, cp_aux_id, cart=False, aosym=None, out=N raise RuntimeError('GINT_fill_int2e failed') # move this operation to j2c? 
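The intopt.unsort_orbitals(int3c, aux_axis=[1], axis=[2,3]) calls above replace the removed argsort-based fancy indexing. A minimal NumPy sketch of the equivalence on a 2-D slice, with idx standing in for the stored _ao_idx permutation (illustrative only, not part of the patch):

import numpy as np

idx = np.array([2, 0, 3, 1])      # forward permutation, as kept in intopt._ao_idx
mat = np.random.rand(4, 4)        # a block expressed in sorted-AO order
rev = np.argsort(idx)             # removed pattern: gather with the inverse permutation
ref = mat[np.ix_(rev, rev)]
out = np.empty_like(mat)          # new pattern: scatter with the forward permutation
out[np.ix_(idx, idx)] = mat
assert np.allclose(out, ref)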
- if lk > 1 and intopt._auxmol.cart == 0: + if lk > 1 and intopt.auxmol.cart == 0: int3c_blk = cart2sph(int3c_blk, axis=0, ang=lk, out=out) return int3c_blk @@ -1445,10 +1460,7 @@ def get_int3c2e(mol, auxmol=None, auxbasis='weigend+etb', direct_scf_tol=1e-13, int3c[:, j0:j1, i0:i1] = int3c_slice row, col = np.tril_indices(nao) int3c[:, row, col] = int3c[:, col, row] - ao_idx = np.argsort(intopt.ao_idx) - aux_id = np.argsort(intopt.aux_ao_idx) - int3c = int3c[np.ix_(aux_id, ao_idx, ao_idx)] - + int3c = intopt.unsort_orbitals(int3c, aux_axis=[0], axis=[1,2]) return int3c.transpose([2,1,0]) def sort_mol(mol0, cart=True, log=None): diff --git a/gpu4pyscf/df/tests/test_jk.py b/gpu4pyscf/df/tests/test_df_jk.py similarity index 54% rename from gpu4pyscf/df/tests/test_jk.py rename to gpu4pyscf/df/tests/test_df_jk.py index f353e529..6fb3f841 100644 --- a/gpu4pyscf/df/tests/test_jk.py +++ b/gpu4pyscf/df/tests/test_df_jk.py @@ -17,9 +17,10 @@ import numpy as np import cupy import pyscf -from pyscf import df +from pyscf import df, lib from gpu4pyscf import scf as gpu_scf from gpu4pyscf.df import int3c2e, df_jk +from gpu4pyscf.df.df import DF atom=''' Ti 0.0 0.0 0.0 @@ -31,18 +32,20 @@ bas='def2-tzvpp' def setUpModule(): - global mol, auxmol - mol = pyscf.M(atom=atom, basis=bas, max_memory=32000) - mol.output = '/dev/null' - mol.cart = True - mol.build() - mol.verbose = 1 + global mol, mol_sph, auxmol, auxmol_sph + mol = pyscf.M(atom=atom, basis=bas, output='/dev/null', cart=True, verbose=1) auxmol = df.addons.make_auxmol(mol, auxbasis='sto3g') + mol_sph = pyscf.M(atom=atom, basis=bas, output='/dev/null', cart=False, verbose=1) + auxmol_sph = df.addons.make_auxmol(mol_sph, auxbasis='sto3g') + def tearDownModule(): - global mol, auxmol + global mol, mol_sph, auxmol, auxmol_sph mol.stdout.close() - del mol, auxmol + mol_sph.stdout.close() + auxmol.stdout.close() + auxmol_sph.stdout.close() + del mol, auxmol, mol_sph, auxmol_sph class KnownValues(unittest.TestCase): @@ -51,7 +54,7 @@ def test_vj_incore(self): intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e') intopt.build(1e-14, diag_block_with_triu=False, aosym=True) cupy.random.seed(np.asarray(1, dtype=np.uint64)) - nao = len(intopt.ao_idx) + nao = intopt.mol.nao dm = cupy.random.rand(nao, nao) dm = dm + dm.T @@ -64,6 +67,25 @@ def test_vj_incore(self): vj_outcore = cupy.einsum('ijL,L->ij', int3c_gpu, rhoj_outcore) vj_incore = int3c2e.get_j_int3c2e_pass2(intopt, rhoj_incore) assert cupy.linalg.norm(vj_outcore - vj_incore) < 1e-5 + + def test_vj_sph_incore(self): + int3c_gpu = int3c2e.get_int3c2e(mol_sph, auxmol, aosym=True, direct_scf_tol=1e-14) + intopt = int3c2e.VHFOpt(mol_sph, auxmol, 'int2e') + intopt.build(1e-14, diag_block_with_triu=False, aosym=True) + cupy.random.seed(np.asarray(1, dtype=np.uint64)) + nao = intopt.mol.nao + dm = cupy.random.rand(nao, nao) + dm = dm + dm.T + + # pass 1 + rhoj_outcore = cupy.einsum('ijL,ij->L', int3c_gpu, dm) + rhoj_incore = 2.0*int3c2e.get_j_int3c2e_pass1(intopt, dm) + assert cupy.linalg.norm(rhoj_outcore - rhoj_incore) < 1e-8 + + # pass 2 + vj_outcore = cupy.einsum('ijL,L->ij', int3c_gpu, rhoj_outcore) + vj_incore = int3c2e.get_j_int3c2e_pass2(intopt, rhoj_incore) + assert cupy.linalg.norm(vj_outcore - vj_incore) < 1e-5 def test_j_outcore(self): cupy.random.seed(np.asarray(1, dtype=np.uint64)) @@ -72,10 +94,22 @@ def test_j_outcore(self): dm = dm + dm.T mf = gpu_scf.RHF(mol).density_fit() mf.kernel() - vj0, _ = mf.get_jk(dm=dm, with_j=True, with_k=False) + vj0, _ = mf.get_jk(dm=dm, with_j=True, with_k=False, hermi=1) 
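hermi=1 in the get_jk call above promises a symmetric density matrix, so the kernel may symmetrize intermediates; hermi=0 (exercised by test_jk_hermi0 below) must handle a general dm. A conventional 4-index reference for that case, sketched on a hypothetical small molecule (illustrative only):

import numpy as np
from pyscf import gto

mol = gto.M(atom='H 0 0 0; H 0 0 0.74', basis='sto-3g')
eri = mol.intor('int2e')                   # (ij|kl) in chemists' notation
dm = np.random.rand(mol.nao, mol.nao)      # deliberately non-symmetric (hermi=0)
vj = np.einsum('ijkl,lk->ij', eri, dm)     # J_ij = (ij|kl) dm_lk
vk = np.einsum('ijkl,jk->il', eri, dm)     # K_il = (ij|kl) dm_jk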
vj = df_jk.get_j(mf.with_df, dm) assert cupy.linalg.norm(vj - vj0) < 1e-4 + + def test_jk_hermi0(self): + dfobj = DF(mol, 'sto3g').build() + np.random.seed(3) + nao = mol.nao + dm = np.random.rand(nao, nao) + refj, refk = dfobj.to_cpu().get_jk(dm, hermi=0) + vj, vk = dfobj.get_jk(dm, hermi=0) + assert abs(vj - refj).max() < 1e-9 + assert abs(vk - refk).max() < 1e-9 + assert abs(lib.fp(vj) - 455.864593801164).max() < 1e-9 + assert abs(lib.fp(vk) - 37.7022369618297).max() < 1e-9 if __name__ == "__main__": print("Full Tests for DF JK") - unittest.main() \ No newline at end of file + unittest.main() diff --git a/gpu4pyscf/df/tests/test_df_rhf.py b/gpu4pyscf/df/tests/test_df_rhf.py index 3852c70b..abb2da46 100644 --- a/gpu4pyscf/df/tests/test_df_rhf.py +++ b/gpu4pyscf/df/tests/test_df_rhf.py @@ -31,15 +31,11 @@ def setUpModule(): global mol_sph, mol_cart - mol_sph = pyscf.M(atom=atom, basis=bas, max_memory=32000, cart=0) - mol_sph.output = '/dev/null' - mol_sph.build() - mol_sph.verbose = 1 + mol_sph = pyscf.M(atom=atom, basis=bas, cart=0, + symmetry=True, output='/dev/null', verbose=1) - mol_cart = pyscf.M(atom=atom, basis=bas, max_memory=32000, cart=1) - mol_cart.output = '/dev/null' - mol_cart.build() - mol_cart.verbose = 1 + mol_cart = pyscf.M(atom=atom, basis=bas, cart=1, + output='/dev/null', verbose=1) def tearDownModule(): global mol_sph, mol_cart diff --git a/gpu4pyscf/df/tests/test_df_rks.py b/gpu4pyscf/df/tests/test_df_rks.py index 4cd40701..1aa69944 100644 --- a/gpu4pyscf/df/tests/test_df_rks.py +++ b/gpu4pyscf/df/tests/test_df_rks.py @@ -31,15 +31,11 @@ def setUpModule(): global mol_sph, mol_cart - mol_sph = pyscf.M(atom=atom, basis=bas, max_memory=32000, cart=0) - mol_sph.output = '/dev/null' - mol_sph.build() - mol_sph.verbose = 1 - - mol_cart = pyscf.M(atom=atom, basis=bas, max_memory=32000, cart=1) - mol_cart.output = '/dev/null' - mol_cart.build() - mol_cart.verbose = 1 + mol_sph = pyscf.M(atom=atom, basis=bas, max_memory=32000, cart=0, + output='/dev/null', verbose=1) + + mol_cart = pyscf.M(atom=atom, basis=bas, max_memory=32000, cart=1, + output='/dev/null', verbose=1) def tearDownModule(): global mol_sph, mol_cart diff --git a/gpu4pyscf/df/tests/test_df_rks_grad.py b/gpu4pyscf/df/tests/test_df_rks_grad.py index ea382e66..a218630d 100644 --- a/gpu4pyscf/df/tests/test_df_rks_grad.py +++ b/gpu4pyscf/df/tests/test_df_rks_grad.py @@ -117,17 +117,17 @@ def _vs_cpu(mol, grid_response=False, xc=xc0, disp=disp0, tol=1e-9): assert abs(g_analy - ref).max() < tol class KnownValues(unittest.TestCase): - + def test_grad_with_grids_response(self): print("-----testing DF DFT gradient with grids response----") _check_grad(mol_sph, grid_response=True, xc='LDA', disp=None) _check_grad(mol_sph, grid_response=True, xc='B3LYP', disp=None) _check_grad(mol_sph, grid_response=True, xc='m06', disp=None, tol=1e-4) - + def test_grad_lda(self): print("-----LDA testing-------") _vs_cpu(mol_sph, xc='LDA', disp=None) - + def test_grad_gga(self): print('-----GGA testing-------') _vs_cpu(mol_sph, xc='PBE', disp=None) @@ -147,7 +147,7 @@ def test_grad_rsh(self): def test_grad_nlc(self): print('--------nlc testing-------------') _vs_cpu(mol_sph, xc='HYB_MGGA_XC_WB97M_V', disp=None, tol=1e-7) - + def test_grad_cart(self): print('------ Cart testing--------') _vs_cpu(mol_cart, xc='B3LYP', disp=None) @@ -163,7 +163,7 @@ def test_grad_d4(self): def test_grad_wb97m_d3bj(self): print('------ wB97m-d3bj --------') _vs_cpu(mol_sph, xc='wb97m-d3bj', tol=1e-8) - + if __name__ == "__main__": print("Full Tests for DF 
Gradient") unittest.main() diff --git a/gpu4pyscf/dft/__init__.py b/gpu4pyscf/dft/__init__.py index d1ae3570..c65e412d 100644 --- a/gpu4pyscf/dft/__init__.py +++ b/gpu4pyscf/dft/__init__.py @@ -1,9 +1,9 @@ from . import rks -from .rks import RKS +from .rks import RKS, KohnShamDFT from .uks import UKS from .gks import GKS from .roks import ROKS -from gpu4pyscf.dft.gen_grid import Grids +from .gen_grid import Grids def KS(mol, xc='LDA,VWN'): if mol.spin == 0: diff --git a/gpu4pyscf/dft/gks.py b/gpu4pyscf/dft/gks.py index dda28353..3f709733 100644 --- a/gpu4pyscf/dft/gks.py +++ b/gpu4pyscf/dft/gks.py @@ -26,6 +26,7 @@ class GKS(gks.GKS, GHF): def __init__(self, mol, xc='LDA,VWN'): raise NotImplementedError + reset = rks.RKS.reset energy_elec = rks.RKS.energy_elec get_veff = NotImplemented nuc_grad_method = NotImplemented diff --git a/gpu4pyscf/dft/libxc.py b/gpu4pyscf/dft/libxc.py index 8a07e3c3..850a879a 100644 --- a/gpu4pyscf/dft/libxc.py +++ b/gpu4pyscf/dft/libxc.py @@ -124,17 +124,18 @@ def _check_arrays(current_arrays, fields, sizes, factor, required): """ A specialized function built to construct and check the sizes of arrays given to the LibXCFunctional class. """ - # Nothing supplied so we build it out if current_arrays is None: current_arrays = {} + if not required: + for label in fields: + current_arrays[label] = None + return current_arrays + for label in fields: - if required: - size = sizes[label] - current_arrays[label] = cupy.empty((factor, size), dtype=np.float64) - else: - current_arrays[label] = None # cupy.empty((1)) + size = sizes[label] + current_arrays[label] = cupy.empty((factor, size), dtype=np.float64) return current_arrays @@ -150,6 +151,7 @@ class _xcfun(ctypes.Structure): class XCfun: def __init__(self, xc, spin): + self.spin = spin self._spin = 1 if spin == 'unpolarized' else 2 self.xc_func = _libxc.xc_func_alloc() if isinstance(xc, str): @@ -178,6 +180,9 @@ def needs_laplacian(self): rsh_coeff = dft.libxc.rsh_coeff def compute(self, inp, output=None, do_exc=True, do_vxc=True, do_fxc=False, do_kxc=False, do_lxc=False): + # TODO: turn to dft.libxc.eval_xc for do_kxc and do_lxc + assert not do_kxc + assert not do_lxc if isinstance(inp, cupy.ndarray): inp = {"rho": cupy.asarray(inp, dtype=cupy.double)} elif isinstance(inp, dict): @@ -207,12 +212,6 @@ def compute(self, inp, output=None, do_exc=True, do_vxc=True, do_fxc=False, do_k args.extend([ inp[x] for x in input_labels]) args.extend([output[x] for x in output_labels]) - cuda_args = [] - for arg in args: - if(isinstance(arg, cupy.ndarray)): - arg = ctypes.cast(arg.data.ptr, ctypes.c_void_p) - cuda_args.append(arg) - #_libxc.xc_lda(*cuda_args) out_params = xc_lda_out_params() buf_params = xc_lda_out_params() @@ -246,12 +245,6 @@ def compute(self, inp, output=None, do_exc=True, do_vxc=True, do_fxc=False, do_k args.extend([ inp[x] for x in input_labels]) args.extend([output[x] for x in output_labels]) - cuda_args = [] - for arg in args: - if(isinstance(arg, cupy.ndarray)): - arg = ctypes.cast(arg.data.ptr, ctypes.c_void_p) - cuda_args.append(arg) - #_libxc.xc_gga(*cuda_args) out_params = xc_gga_out_params() buf_params = xc_gga_out_params() @@ -295,12 +288,6 @@ def compute(self, inp, output=None, do_exc=True, do_vxc=True, do_fxc=False, do_k args.insert(-1, cupy.empty((1))) # Add none ptr to laplacian #args.insert(-1, cupy.zeros_like(inp['rho'])) args.extend([output[x] for x in output_labels]) - cuda_args = [] - for arg in args: - if(isinstance(arg, cupy.ndarray)): - arg = ctypes.cast(arg.data.ptr, ctypes.c_void_p) - 
cuda_args.append(arg) - #_libxc.xc_mgga(*cuda_args) out_params = xc_mgga_out_params() buf_params = xc_mgga_out_params() @@ -310,13 +297,14 @@ def compute(self, inp, output=None, do_exc=True, do_vxc=True, do_fxc=False, do_k setattr(buf_params, label, buf[label].data.ptr) setattr(out_params, label, output[label].data.ptr) stream = cupy.cuda.get_current_stream() + lapl = cupy.empty(1) err = libgdft.GDFT_xc_mgga( stream.ptr, self.xc_func, npoints, inp['rho'].data.ptr, inp['sigma'].data.ptr, - cupy.empty(1).data.ptr, + lapl.data.ptr, inp['tau'].data.ptr, ctypes.byref(out_params), ctypes.byref(buf_params) diff --git a/gpu4pyscf/dft/numint.py b/gpu4pyscf/dft/numint.py index b68d5368..c1bb1180 100644 --- a/gpu4pyscf/dft/numint.py +++ b/gpu4pyscf/dft/numint.py @@ -16,6 +16,7 @@ # along with this program. If not, see . import ctypes +from functools import lru_cache import contextlib import numpy as np import cupy @@ -25,7 +26,7 @@ from pyscf.gto.eval_gto import NBINS, CUTOFF, make_screen_index from gpu4pyscf.gto.mole import basis_seg_contraction from gpu4pyscf.lib.cupy_helper import ( - contract, get_avail_mem, load_library, add_sparse, release_gpu_stack, take_last2d, transpose_sum, + contract, get_avail_mem, load_library, add_sparse, release_gpu_stack, transpose_sum, grouped_dot, grouped_gemm) from gpu4pyscf.dft import xc_deriv, xc_alias, libxc from gpu4pyscf import __config__ @@ -41,7 +42,6 @@ # Should we release the cupy cache? FREE_CUPY_CACHE = False -MGGA_DENSITY_LAPL = False USE_SPARSITY = 2 # 0: no sparsity, 1: in-house GEMM, 2: sparse in AO direction libgdft = load_library('libgdft') @@ -52,23 +52,26 @@ libgdft.GDFTdot_ao_ao_sparse.restype = ctypes.c_int libgdft.GDFTdot_aow_ao_sparse.restype = ctypes.c_int -def eval_ao(ni, mol, coords, deriv=0, shls_slice=None, nao_slice=None, ao_loc_slice=None, - non0tab=None, out=None, verbose=None, ctr_offsets_slice=None): +def eval_ao(mol, coords, deriv=0, shls_slice=None, nao_slice=None, ao_loc_slice=None, + non0tab=None, out=None, verbose=None, ctr_offsets_slice=None, gdftopt=None, + transpose=True): ''' evaluate ao values for given coords and shell indices Kwargs: shls_slice : offsets of shell slices to be evaluated ao_loc_slice: offsets of ao slices to be evaluated ctr_offsets_slice: offsets of contraction patterns Returns: ao: comp x nao_slice x ngrids, ao is in C-contiguous. + comp x ngrids x nao_slice if transpose, to be compatible with PySCF.
''' - opt = getattr(ni, 'gdftopt', None) - with_opt = True - if opt is None or mol not in [opt.mol, opt._sorted_mol]: - ni.build(mol, coords) - opt = ni.gdftopt - with_opt = False - mol = None + if gdftopt is None: + opt = _GDFTOpt.from_mol(mol) + with opt.gdft_envs_cache(): + return eval_ao( + mol, coords, deriv, shls_slice, nao_slice, ao_loc_slice, + non0tab, out, verbose, ctr_offsets_slice, opt, transpose) + + opt = gdftopt _sorted_mol = opt._sorted_mol if shls_slice is None: @@ -78,6 +81,9 @@ def eval_ao(ni, mol, coords, deriv=0, shls_slice=None, nao_slice=None, ao_loc_sl ao_loc_slice = cupy.asarray(_sorted_mol.ao_loc_nr()) nao_slice = _sorted_mol.nao else: + assert ao_loc_slice is not None + assert nao_slice is not None + assert ctr_offsets_slice is not None ctr_offsets = opt.l_ctr_offsets nctr = ctr_offsets.size - 1 @@ -96,44 +102,34 @@ def eval_ao(ni, mol, coords, deriv=0, shls_slice=None, nao_slice=None, ao_loc_sl if out is None: out = cupy.empty((comp, nao_slice, ngrids), order='C') - if not with_opt: - # mol may be different to _GDFTOpt._sorted_mol. - # nao should be consistent with the _GDFTOpt._sorted_mol object - coeff = cupy.asarray(opt.coeff) - with opt.gdft_envs_cache(): - err = libgdft.GDFTeval_gto( - ctypes.cast(stream.ptr, ctypes.c_void_p), - ctypes.cast(out.data.ptr, ctypes.c_void_p), - ctypes.c_int(deriv), ctypes.c_int(_sorted_mol.cart), - ctypes.cast(coords.data.ptr, ctypes.c_void_p), ctypes.c_int(ngrids), - ctypes.cast(shls_slice.data.ptr, ctypes.c_void_p), - ctypes.cast(ao_loc_slice.data.ptr, ctypes.c_void_p), - ctypes.c_int(nao_slice), - ctr_offsets.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(nctr), - ctr_offsets_slice.ctypes.data_as(ctypes.c_void_p), - _sorted_mol._bas.ctypes.data_as(ctypes.c_void_p)) - out = contract('nig,ij->njg', out, coeff).transpose([0,2,1]) - else: - err = libgdft.GDFTeval_gto( - ctypes.cast(stream.ptr, ctypes.c_void_p), - ctypes.cast(out.data.ptr, ctypes.c_void_p), - ctypes.c_int(deriv), ctypes.c_int(_sorted_mol.cart), - ctypes.cast(coords.data.ptr, ctypes.c_void_p), ctypes.c_int(ngrids), - ctypes.cast(shls_slice.data.ptr, ctypes.c_void_p), - ctypes.cast(ao_loc_slice.data.ptr, ctypes.c_void_p), - ctypes.c_int(nao_slice), - ctr_offsets.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(nctr), - ctr_offsets_slice.ctypes.data_as(ctypes.c_void_p), - _sorted_mol._bas.ctypes.data_as(ctypes.c_void_p)) + err = libgdft.GDFTeval_gto( + ctypes.cast(stream.ptr, ctypes.c_void_p), + ctypes.cast(out.data.ptr, ctypes.c_void_p), + ctypes.c_int(deriv), ctypes.c_int(_sorted_mol.cart), + ctypes.cast(coords.data.ptr, ctypes.c_void_p), ctypes.c_int(ngrids), + ctypes.cast(shls_slice.data.ptr, ctypes.c_void_p), + ctypes.cast(ao_loc_slice.data.ptr, ctypes.c_void_p), + ctypes.c_int(nao_slice), + ctr_offsets.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(nctr), + ctr_offsets_slice.ctypes.data_as(ctypes.c_void_p), + _sorted_mol._bas.ctypes.data_as(ctypes.c_void_p)) + if err != 0: raise RuntimeError('CUDA Error in evaluating AO') + if mol is not _sorted_mol: + coeff = cupy.asarray(opt.coeff) + out = contract('nig,ij->njg', out, coeff) + + if transpose: + out = out.transpose(0,2,1) + if deriv == 0: out = out[0] return out def eval_rho(mol, ao, dm, non0tab=None, xctype='LDA', hermi=0, - with_lapl=True, verbose=None): + with_lapl=False, verbose=None): xctype = xctype.upper() if xctype in ('LDA', 'HF'): _, ngrids = ao.shape @@ -153,17 +149,13 @@ def eval_rho(mol, ao, dm, non0tab=None, xctype='LDA', hermi=0, if hermi: rho[1:4] *= 2 # *2 for + einsum('pi,ij,pj->p', ao[i], dm, 
ao[0]) else: - c0 = dm.dot(ao[0]) + c0 = dm.T.dot(ao[0]) for i in range(1, 4): rho[i] += _contract_rho(ao[i], c0) else: # meta-GGA - if with_lapl: - # rho[4] = \nabla^2 rho, rho[5] = 1/2 |nabla f|^2 - rho = cupy.empty((6,ngrids)) - tau_idx = 5 - else: - rho = cupy.empty((5,ngrids)) - tau_idx = 4 + assert not with_lapl + rho = cupy.empty((5,ngrids)) + tau_idx = 4 c0 = dm.dot(ao[0]) rho[0] = _contract_rho(c0, ao[0]) @@ -181,11 +173,11 @@ def eval_rho(mol, ao, dm, non0tab=None, xctype='LDA', hermi=0, return rho def eval_rho1(mol, ao, mo_coeff, mo_occ, non0tab=None, xctype='LDA', - with_lapl=True, verbose=None): + with_lapl=False, verbose=None): raise NotImplementedError def eval_rho2(mol, ao, mo_coeff, mo_occ, non0tab=None, xctype='LDA', - with_lapl=True, verbose=None, out=None): + with_lapl=False, verbose=None, out=None): xctype = xctype.upper() if xctype == 'LDA' or xctype == 'HF': _, ngrids = ao.shape @@ -205,40 +197,24 @@ def eval_rho2(mol, ao, mo_coeff, mo_occ, non0tab=None, xctype='LDA', _contract_rho(c0, c1, rho=rho[i]) rho[1:] *= 2 else: # meta-GGA - if with_lapl: - # rho[4] = \nabla^2 rho, rho[5] = 1/2 |nabla f|^2 - rho = cupy.empty((6,ngrids)) - tau_idx = 5 - else: - rho = cupy.empty((5,ngrids)) - tau_idx = 4 + assert not with_lapl + rho = cupy.empty((5,ngrids)) + tau_idx = 4 c0 = cupy.dot(cpos.T, ao[0]) _contract_rho(c0, c0, rho=rho[0]) - rho[tau_idx] = 0 for i in range(1, 4): c1 = cupy.dot(cpos.T, ao[i]) rho[i] = _contract_rho(c0, c1) rho[tau_idx] += _contract_rho(c1, c1) - if with_lapl: - if ao.shape[0] > 4: - XX, YY, ZZ = 4, 7, 9 - ao2 = ao[XX] + ao[YY] + ao[ZZ] - c1 = cupy.dot(cpos.T, ao2) - #:rho[4] = numpy.einsum('pi,pi->p', c0, c1) - rho[4] = _contract_rho(c0, c1) - rho[4] += rho[5] - rho[4] *= 2 - else: - rho[4] = 0 rho[1:4] *= 2 rho[tau_idx] *= .5 return rho def eval_rho3(mol, ao, c0, mo1, non0tab=None, xctype='LDA', - with_lapl=True, verbose=None): + with_lapl=False, verbose=None): xctype = xctype.upper() if xctype == 'LDA' or xctype == 'HF': _, ngrids = ao.shape @@ -261,15 +237,9 @@ def eval_rho3(mol, ao, c0, mo1, non0tab=None, xctype='LDA', rho[i] += _contract_rho(c0[0], c_0[i]) rho *= 2.0 else: # meta-GGA - # TODO: complete this - if with_lapl: - raise NotImplementedError("mGGA with lapl not implemented") - # rho[4] = \nabla^2 rho, rho[5] = 1/2 |nabla f|^2 - rho = cupy.empty((6,ngrids)) - tau_idx = 5 - else: - rho = cupy.empty((5,ngrids)) - tau_idx = 4 + assert not with_lapl + rho = cupy.empty((5,ngrids)) + tau_idx = 4 c_0 = contract('nig,io->nog', ao, cpos1) #:rho[0] = numpy.einsum('pi,pi->p', c0, c0) rho[0] = _contract_rho(c0[0], c_0[0]) @@ -281,27 +251,22 @@ def eval_rho3(mol, ao, c0, mo1, non0tab=None, xctype='LDA', rho[i]+= _contract_rho(c0[0], c_0[i]) rho[tau_idx] += _contract_rho(c_0[i], c0[i]) rho *= 2.0 - if with_lapl: - raise NotImplementedError("mGGA with lapl not implemented") - if ao.shape[0] > 4: - XX, YY, ZZ = 4, 7, 9 - ao2 = ao[XX] + ao[YY] + ao[ZZ] - c1 = _dot_ao_dm(mol, ao2, cpos1, non0tab, shls_slice, ao_loc) - #:rho[4] = numpy.einsum('pi,pi->p', c0, c1) - rho[4] = _contract_rho(c0, c1) - rho[4] += rho[5] - rho[4] *= 2 - else: - rho[4] = 0 rho[tau_idx] *= .5 return rho -def eval_rho4(mol, ao, c0, mo1, non0tab=None, xctype='LDA', - with_lapl=True, verbose=None): - ''' ao: nd x nao x ng - c0: nd x nocc x ng - mo1: na x nao x nocc +def eval_rho4(mol, ao, mo0, mo1, non0tab=None, xctype='LDA', hermi=0, + with_lapl=False, verbose=None): + '''Evaluate density using first order orbitals. 
This density is typically + derived from the non-symmetric density matrix (hermi=0) in TDDFT + dm[i] = mo0.dot(mo1[i].T) and the symmetric density matrix (hermi=1) in CPHF + dm[i] = mo0.dot(mo1[i].T) + mo1[i].dot(mo0.T) + + ao: nd x nao x ng + mo0: nao x nocc + mo1: na x nao x nocc ''' + log = logger.new_logger(mol, verbose) + t0 = log.init_timer() xctype = xctype.upper() if xctype == 'LDA' or xctype == 'HF': _, ngrids = ao.shape @@ -309,30 +274,32 @@ def eval_rho4(mol, ao, c0, mo1, non0tab=None, xctype='LDA', _, ngrids = ao[0].shape na = mo1.shape[0] - cpos1= mo1 if xctype == 'LDA' or xctype == 'HF': - c_0 = contract('aio,ig->aog', cpos1, ao)#cupy.dot(cpos1.T, ao) + c0 = mo0.T.dot(ao) + t1 = log.timer_debug2('eval occ_coeff', *t0) + c_0 = contract('aio,ig->aog', mo1, ao) rho = cupy.empty([na,ngrids]) for i in range(na): rho[i] = _contract_rho(c0, c_0[i]) - rho *= 2.0 elif xctype in ('GGA', 'NLC'): - log = logger.new_logger(mol, mol.verbose) - t0 = log.init_timer() - c_0 = contract('nig,aio->anog', ao, cpos1) - t0 = log.timer_debug2('ao * cpos', *t0) + c0 = contract('nig,io->nog', ao, mo0) + t1 = log.timer_debug2('eval occ_coeff', *t0) + c_0 = contract('nig,aio->anog', ao, mo1) + t1 = log.timer_debug2('ao * cpos', *t1) rho = cupy.empty([na, 4, ngrids]) for i in range(na): _contract_rho_gga(c0, c_0[i], rho=rho[i]) - t0 = log.timer_debug2('contract rho', *t0) else: # meta-GGA - if with_lapl: - raise NotImplementedError("mGGA with lapl not implemented") + assert not with_lapl rho = cupy.empty((na,5,ngrids)) - c_0 = contract('nig,aio->anog', ao, cpos1) + c0 = contract('nig,io->nog', ao, mo0) + c_0 = contract('nig,aio->anog', ao, mo1) for i in range(na): _contract_rho_mgga(c0, c_0[i], rho=rho[i]) - + if hermi: + # corresponding to the density of ao * mo1[i].dot(mo0.T) * ao + rho *= 2.
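A NumPy sketch of the LDA branch of eval_rho4 above, including the hermi doubling (shapes are illustrative; dm1[a] = mo0 @ mo1[a].T as in the docstring):

import numpy as np

nao, nocc, ng, na = 6, 2, 10, 3
ao = np.random.rand(nao, ng)
mo0 = np.random.rand(nao, nocc)
mo1 = np.random.rand(na, nao, nocc)
c0 = mo0.T @ ao                                  # nocc x ng
c_0 = np.einsum('aio,ig->aog', mo1, ao)
rho = np.einsum('og,aog->ag', c0, c_0)           # density of dm1[a] = mo0 @ mo1[a].T
dm1 = np.einsum('io,ajo->aij', mo0, mo1)
ref = np.einsum('ig,aij,jg->ag', ao, dm1 + dm1.transpose(0, 2, 1), ao)
assert np.allclose(2 * rho, ref)                 # hermi=1 doubles rho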
+ t0 = log.timer_debug2('contract rho', *t0) return rho def _vv10nlc(rho, coords, vvrho, vvweight, vvcoords, nlc_pars): @@ -435,7 +402,7 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, log = logger.new_logger(mol, verbose) xctype = ni._xc_type(xc_code) opt = getattr(ni, 'gdftopt', None) - if opt is None or mol not in [opt.mol, opt._sorted_mol]: + if opt is None: ni.build(mol, grids.coords) opt = ni.gdftopt @@ -443,17 +410,14 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, mo_occ = getattr(dms,'mo_occ', None) mol = None _sorted_mol = opt._sorted_mol - coeff = cupy.asarray(opt.coeff) - nao, nao0 = coeff.shape + nao, nao0 = opt.coeff.shape dms = cupy.asarray(dms) dm_shape = dms.shape - #dms = [coeff @ dm @ coeff.T for dm in dms.reshape(-1,nao0,nao0)] - dms = dms.reshape(-1,nao0,nao0) - dms = take_last2d(dms, opt.ao_idx) + dms = opt.sort_orbitals(dms.reshape(-1,nao0,nao0), axis=[1,2]) nset = len(dms) if mo_coeff is not None: - mo_coeff = mo_coeff[opt.ao_idx] + mo_coeff = opt.sort_orbitals(mo_coeff, axis=[0]) nelec = cupy.empty(nset) excsum = cupy.empty(nset) @@ -464,27 +428,24 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, ao_deriv = 0 else: ao_deriv = 1 - with_lapl = MGGA_DENSITY_LAPL ngrids = grids.weights.size if xctype == 'LDA': rho_tot = cupy.empty([nset,1,ngrids]) elif xctype == 'GGA': rho_tot = cupy.empty([nset,4,ngrids]) else: - if with_lapl: - rho_tot = cupy.empty([nset,6,ngrids]) - else: - rho_tot = cupy.empty([nset,5,ngrids]) + rho_tot = cupy.empty([nset,5,ngrids]) p0 = p1 = 0 t1 = t0 = log.init_timer() - for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv): + for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, + max_memory=max_memory): p1 = p0 + weight.size for i in range(nset): if mo_coeff is None: - rho_tot[i,:,p0:p1] = eval_rho(_sorted_mol, ao_mask, dms[i][np.ix_(idx,idx)], xctype=xctype, hermi=1, with_lapl=with_lapl) + rho_tot[i,:,p0:p1] = eval_rho(_sorted_mol, ao_mask, dms[i][idx[:,None],idx], xctype=xctype, hermi=1) else: mo_coeff_mask = mo_coeff[idx,:] - rho_tot[i,:,p0:p1] = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask, mo_occ, None, xctype, with_lapl) + rho_tot[i,:,p0:p1] = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask, mo_occ, None, xctype) p0 = p1 t1 = log.timer_debug2('eval rho slice', *t1) t0 = log.timer_debug1('eval rho', *t0) @@ -501,6 +462,7 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, excsum[i] = cupy.dot(den, exc[:,0]) wv.append(vxc * grids.weights) + # *.5 for v+v.conj().T at the end if xctype == 'GGA': wv[i][0] *= .5 if xctype == 'MGGA': @@ -512,7 +474,8 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, t1 = t0 p0 = p1 = 0 - for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv): + for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, + max_memory=max_memory): p1 = p0 + weight.size for i in range(nset): if xctype == 'LDA': @@ -535,8 +498,7 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, p0 = p1 t1 = log.timer_debug2('integration', *t1) t0 = log.timer_debug1('vxc integration', *t0) - rev_ao_idx = opt.rev_ao_idx - vmat = take_last2d(vmat, rev_ao_idx) + vmat = opt.unsort_orbitals(vmat, axis=[1,2]) if xctype != 'LDA': transpose_sum(vmat) @@ -553,7 +515,7 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, return nelec, excsum, vmat def eval_rho_group(mol, ao_group, mo_coeff_group, mo_occ, non0tab=None, xctype='LDA', - with_lapl=True, 
verbose=None, out=None): + with_lapl=False, verbose=None, out=None): groups = len(ao_group) xctype = xctype.upper() if xctype == 'LDA' or xctype == 'HF': @@ -600,6 +562,7 @@ def eval_rho_group(mol, ao_group, mo_coeff_group, mo_occ, non0tab=None, xctype=' rho[1:] *= 2 rho_group.append(rho) else: # meta-GGA + assert not with_lapl c0_group = [] cpos_group4 = [] ao_group4 = [] @@ -646,7 +609,7 @@ def nr_rks_group(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, log = logger.new_logger(mol, verbose) xctype = ni._xc_type(xc_code) opt = getattr(ni, 'gdftopt', None) - if opt is None or mol not in [opt.mol, opt._sorted_mol]: + if opt is None: ni.build(mol, grids.coords) opt = ni.gdftopt @@ -655,17 +618,14 @@ def nr_rks_group(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, mol = None _sorted_mol = opt._sorted_mol - coeff = cupy.asarray(opt.coeff) - nao, nao0 = coeff.shape + nao, nao0 = opt.coeff.shape dms = cupy.asarray(dms) dm_shape = dms.shape - #dms = [coeff @ dm @ coeff.T for dm in dms.reshape(-1,nao0,nao0)] - dms = dms.reshape(-1,nao0,nao0) - dms = take_last2d(dms, opt.ao_idx) + dms = opt.sort_orbitals(dms.reshape(-1,nao0,nao0), axis=[1,2]) nset = len(dms) if mo_coeff is not None: - mo_coeff = mo_coeff[opt.ao_idx] + mo_coeff = opt.sort_orbitals(mo_coeff, axis=[0]) nelec = cupy.zeros(nset) excsum = cupy.zeros(nset) @@ -676,27 +636,24 @@ def nr_rks_group(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, ao_deriv = 0 else: ao_deriv = 1 - with_lapl = MGGA_DENSITY_LAPL ngrids = grids.weights.size if xctype == 'LDA': rho_tot = cupy.empty([nset,1,ngrids]) elif xctype == 'GGA': rho_tot = cupy.empty([nset,4,ngrids]) else: - if with_lapl: - rho_tot = cupy.empty([nset,6,ngrids]) - else: - rho_tot = cupy.empty([nset,5,ngrids]) + rho_tot = cupy.empty([nset,5,ngrids]) p0 = p1 = 0 t1 = t0 = log.init_timer() - for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv): + for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, + max_memory=max_memory): p1 = p0 + weight.size for i in range(nset): if mo_coeff is None: - rho_tot[i,:,p0:p1] = eval_rho(_sorted_mol, ao_mask, dms[i][np.ix_(idx,idx)], xctype=xctype, hermi=1, with_lapl=with_lapl) + rho_tot[i,:,p0:p1] = eval_rho(_sorted_mol, ao_mask, dms[i][idx[:,None],idx], xctype=xctype, hermi=1) else: mo_coeff_mask = mo_coeff[idx,:] - rho_tot[i,:,p0:p1] = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask, mo_occ, None, xctype, with_lapl) + rho_tot[i,:,p0:p1] = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask, mo_occ, None, xctype) p0 = p1 t1 = log.timer_debug2('eval rho slice', *t1) t0 = log.timer_debug1('eval rho', *t0) @@ -772,8 +729,7 @@ def nr_rks_group(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, raise NotImplementedError(f'numint.nr_rks for functional {xc_code}') t1 = log.timer_debug2('integration', *t1) t0 = log.timer_debug1('vxc integration', *t0) - rev_ao_idx = opt.rev_ao_idx - vmat = take_last2d(vmat, rev_ao_idx) + vmat = opt.unsort_orbitals(vmat, axis=[1,2]) if xctype != 'LDA': transpose_sum(vmat) @@ -794,7 +750,7 @@ def nr_uks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, log = logger.new_logger(mol, verbose) xctype = ni._xc_type(xc_code) opt = getattr(ni, 'gdftopt', None) - if opt is None or mol not in [opt.mol, opt._sorted_mol]: + if opt is None: ni.build(mol, grids.coords) opt = ni.gdftopt @@ -802,18 +758,17 @@ def nr_uks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, mo_occ = getattr(dms,'mo_occ', None) mol = None _sorted_mol = opt._sorted_mol - coeff = cupy.asarray(opt.coeff) - 
nao, nao0 = coeff.shape + nao, nao0 = opt.coeff.shape dma, dmb = dms dm_shape = dma.shape dma = cupy.asarray(dma).reshape(-1,nao0,nao0) dmb = cupy.asarray(dmb).reshape(-1,nao0,nao0) - dma = [coeff @ dm @ coeff.T for dm in dma] - dmb = [coeff @ dm @ coeff.T for dm in dmb] + dma = opt.sort_orbitals(dma, axis=[1,2]) + dmb = opt.sort_orbitals(dmb, axis=[1,2]) nset = len(dma) if mo_coeff is not None: - mo_coeff = coeff @ mo_coeff + mo_coeff = opt.sort_orbitals(mo_coeff, axis=[1]) nelec = np.zeros((2,nset)) excsum = np.zeros(nset) @@ -825,18 +780,18 @@ def nr_uks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, ao_deriv = 0 else: ao_deriv = 1 - with_lapl = MGGA_DENSITY_LAPL - for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv): + for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, + max_memory=max_memory): for i in range(nset): t0 = log.init_timer() if mo_coeff is None: - rho_a = eval_rho(_sorted_mol, ao_mask, dma[i][np.ix_(idx,idx)], xctype=xctype, hermi=1, with_lapl=with_lapl) - rho_b = eval_rho(_sorted_mol, ao_mask, dmb[i][np.ix_(idx,idx)], xctype=xctype, hermi=1, with_lapl=with_lapl) + rho_a = eval_rho(_sorted_mol, ao_mask, dma[i][idx[:,None],idx], xctype=xctype, hermi=1) + rho_b = eval_rho(_sorted_mol, ao_mask, dmb[i][idx[:,None],idx], xctype=xctype, hermi=1) else: mo_coeff_mask = mo_coeff[:, idx,:] - rho_a = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask[0], mo_occ[0], None, xctype, with_lapl) - rho_b = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask[1], mo_occ[1], None, xctype, with_lapl) + rho_a = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask[0], mo_occ[0], None, xctype) + rho_b = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask[1], mo_occ[1], None, xctype) rho = cupy.stack([rho_a, rho_b], axis=0) exc, vxc = ni.eval_xc_eff(xc_code, rho, deriv=1, xctype=xctype)[:2] @@ -882,8 +837,8 @@ def nr_uks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, excsum[i] += cupy.dot(den_b, exc[:,0]) t1 = log.timer_debug1('integration', *t1) - vmata = [coeff.T @ v @ coeff for v in vmata] - vmatb = [coeff.T @ v @ coeff for v in vmatb] + vmata = opt.unsort_orbitals(vmata, axis=[1,2]) + vmatb = opt.unsort_orbitals(vmatb, axis=[1,2]) if xctype != 'LDA': for i in range(nset): vmata[i] = vmata[i] + vmata[i].T @@ -918,7 +873,6 @@ def get_rho(ni, mol, dm, grids, max_memory=2000, verbose=None): dm = coeff @ cupy.asarray(dm) @ coeff.T if mo_coeff is not None: mo_coeff = coeff @ mo_coeff - with_lapl = MGGA_DENSITY_LAPL mem_avail = get_avail_mem() blksize = mem_avail*.2/8/nao//ALIGNED * ALIGNED @@ -932,11 +886,11 @@ def get_rho(ni, mol, dm, grids, max_memory=2000, verbose=None): t1 = t0 = log.init_timer() for p0, p1 in lib.prange(0,ngrids,blksize): coords = grids.coords[p0:p1] - ao = eval_ao(ni, _sorted_mol, coords, 0) + ao = eval_ao(_sorted_mol, coords, 0, gdftopt=opt, transpose=False) if mo_coeff is None: - rho[p0:p1] = eval_rho(_sorted_mol, ao, dm, xctype='LDA', hermi=1, with_lapl=with_lapl) + rho[p0:p1] = eval_rho(_sorted_mol, ao, dm, xctype='LDA', hermi=1) else: - rho[p0:p1] = eval_rho2(_sorted_mol, ao, mo_coeff, mo_occ, None, 'LDA', with_lapl) + rho[p0:p1] = eval_rho2(_sorted_mol, ao, mo_coeff, mo_occ, None, 'LDA') t1 = log.timer_debug2('eval rho slice', *t1) t0 = log.timer_debug1('eval rho', *t0) @@ -957,16 +911,15 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi= opt = ni.gdftopt _sorted_mol = opt.mol - coeff = cupy.asarray(opt.coeff) - nao, nao0 = coeff.shape + nao, nao0 = opt.coeff.shape dms = cupy.asarray(dms) 
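The dms[i, mask[:,None], mask] pattern used in the loop below is broadcasting-based fancy indexing; it selects the same sub-block as the np.ix_ form it replaces. A quick check:

import numpy as np

a = np.arange(16).reshape(4, 4)
idx = np.array([3, 1])
assert np.array_equal(a[idx[:, None], idx], a[np.ix_(idx, idx)])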
dm_shape = dms.shape # AO basis -> gdftopt AO basis with_mocc = hasattr(dms, 'mo1') if with_mocc: - mo1 = dms.mo1[:,opt.ao_idx] * 2.0**0.5 - occ_coeff = dms.occ_coeff[opt.ao_idx] * 2.0**0.5 - dms = take_last2d(dms, opt.ao_idx) + mo1 = opt.sort_orbitals(dms.mo1, axis=[1]) + occ_coeff = opt.sort_orbitals(dms.occ_coeff, axis=[0]) * 2.0 + dms = opt.sort_orbitals(dms.reshape(-1,nao0,nao0), axis=[1,2]) nset = len(dms) vmat = cupy.zeros((nset, nao, nao)) @@ -974,29 +927,23 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi= ao_deriv = 0 else: ao_deriv = 1 - with_lapl = MGGA_DENSITY_LAPL p0 = 0 p1 = 0 t1 = t0 = log.init_timer() - for ao, mask, weights, coords in ni.block_loop(_sorted_mol, grids, nao, ao_deriv): + for ao, mask, weights, coords in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, + max_memory=max_memory): p0, p1 = p1, p1+len(weights) # precompute molecular orbitals if with_mocc: occ_coeff_mask = occ_coeff[mask] - if xctype == 'LDA': - c0 = _dot_ao_dm(_sorted_mol, ao, occ_coeff_mask, None, None, None) - elif xctype == "GGA": - c0 = contract('nig,io->nog', ao, occ_coeff_mask) - else: # mgga - c0 = contract('nig,io->nog', ao, occ_coeff_mask) - t1 = log.timer_debug2(f'eval occ_coeff, with mocc: {with_mocc}', *t1) - if with_mocc: - rho1 = eval_rho4(_sorted_mol, ao, c0, mo1[:,mask], xctype=xctype, with_lapl=False) + rho1 = eval_rho4(_sorted_mol, ao, occ_coeff_mask, mo1[:,mask], + xctype=xctype, hermi=hermi) else: # slow version rho1 = [] for i in range(nset): - rho_tmp = eval_rho(_sorted_mol, ao, dms[i][np.ix_(mask,mask)], xctype=xctype, hermi=hermi, with_lapl=with_lapl) + rho_tmp = eval_rho(_sorted_mol, ao, dms[i,mask[:,None],mask], + xctype=xctype, hermi=hermi) rho1.append(rho_tmp) rho1 = cupy.stack(rho1, axis=0) t1 = log.timer_debug2('eval rho', *t1) @@ -1012,12 +959,10 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi= for i in range(nset): if xctype == 'LDA': vmat_tmp = ao.dot(_scale_ao(ao, wv[i]).T) - add_sparse(vmat[i], vmat_tmp, mask) elif xctype == 'GGA': wv[i,0] *= .5 aow = _scale_ao(ao, wv[i]) vmat_tmp = aow.dot(ao[0].T) - add_sparse(vmat[i], vmat_tmp, mask) elif xctype == 'NLC': raise NotImplementedError('NLC') else: @@ -1025,13 +970,13 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi= wv[i,4] *= .5 vmat_tmp = ao[0].dot(_scale_ao(ao[:4], wv[i,:4]).T) vmat_tmp+= _tau_dot(ao, ao, wv[i,4]) - add_sparse(vmat[i], vmat_tmp, mask) + add_sparse(vmat[i], vmat_tmp, mask) t1 = log.timer_debug2('integration', *t1) - ao = c0 = rho1 = None + ao = rho1 = None t0 = log.timer_debug1('vxc', *t0) - vmat = take_last2d(vmat, opt.rev_ao_idx) + vmat = opt.unsort_orbitals(vmat, axis=[1,2]) if xctype != 'LDA': transpose_sum(vmat) @@ -1054,7 +999,8 @@ def nr_rks_fxc_st(ni, mol, grids, xc_code, dm0=None, dms_alpha=None, fxc = fxc[0,:,0] + fxc[0,:,1] else: fxc = fxc[0,:,0] - fxc[0,:,1] - return nr_rks_fxc(ni, mol, grids, xc_code, dm0, dms_alpha, hermi=0, fxc=fxc) + return nr_rks_fxc(ni, mol, grids, xc_code, dm0, dms_alpha, hermi=0, fxc=fxc, + max_memory=max_memory, verbose=verbose) def nr_uks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi=0, @@ -1069,8 +1015,7 @@ def nr_uks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi= opt = ni.gdftopt mol = None _sorted_mol = opt._sorted_mol - coeff = cupy.asarray(opt.coeff) - nao, nao0 = coeff.shape + nao, nao0 = opt.coeff.shape dma, dmb = dms dm_shape = dma.shape # AO basis -> gdftopt AO basis @@ -1078,17 +1023,15 @@ def 
nr_uks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi= if with_mocc: mo1a, mo1b = dms.mo1 occ_coeffa, occ_coeffb = dms.occ_coeff - mo1a = contract('nio,pi->npo', mo1a, coeff) - mo1b = contract('nio,pi->npo', mo1b, coeff) - occ_coeff_a = contract('io,pi->po', occ_coeffa, coeff) - occ_coeff_b = contract('io,pi->po', occ_coeffb, coeff) + mo1a = opt.sort_orbitals(mo1a, axis=[1]) + mo1b = opt.sort_orbitals(mo1b, axis=[1]) + occ_coeff_a = opt.sort_orbitals(occ_coeffa, axis=[0]) + occ_coeff_b = opt.sort_orbitals(occ_coeffb, axis=[0]) dma = cupy.asarray(dma).reshape(-1,nao0,nao0) dmb = cupy.asarray(dmb).reshape(-1,nao0,nao0) - dma = contract('nij,qj->niq', dma, coeff) - dma = contract('pi,niq->npq', coeff, dma) - dmb = contract('nij,qj->niq', dmb, coeff) - dmb = contract('pi,niq->npq', coeff, dmb) + dma = opt.sort_orbitals(dma, axis=[1,2]) + dmb = opt.sort_orbitals(dmb, axis=[1,2]) nset = len(dma) vmata = cupy.zeros((nset, nao, nao)) @@ -1096,84 +1039,65 @@ def nr_uks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi= if xctype == 'LDA': ao_deriv = 0 + nvar = 1 + elif xctype == 'GGA': + ao_deriv = 1 + nvar = 4 else: ao_deriv = 1 - with_lapl = MGGA_DENSITY_LAPL - p0 = 0 - p1 = 0 - for ao, mask, weights, coords in ni.block_loop(_sorted_mol, grids, nao, ao_deriv): + nvar = 5 + p0 = p1 = 0 + for ao, mask, weights, coords in ni.block_loop( + _sorted_mol, grids, nao, ao_deriv, max_memory=max_memory): t0 = log.init_timer() p0, p1 = p1, p1+len(weights) + # precompute fxc_w + fxc_w = fxc[:,:,:,:,p0:p1] * weights + # precompute molecular orbitals if with_mocc: occ_coeff_a_mask = occ_coeff_a[mask] occ_coeff_b_mask = occ_coeff_b[mask] - if xctype == 'LDA': - c0_a = _dot_ao_dm(_sorted_mol, ao, occ_coeff_a_mask, None, None, None) - c0_b = _dot_ao_dm(_sorted_mol, ao, occ_coeff_b_mask, None, None, None) - elif xctype == "GGA": - c0_a = contract('nig,io->nog', ao, occ_coeff_a_mask) - c0_b = contract('nig,io->nog', ao, occ_coeff_b_mask) - else: # mgga - c0_a = contract('nig,io->nog', ao, occ_coeff_a_mask) - c0_b = contract('nig,io->nog', ao, occ_coeff_b_mask) - - if with_mocc: - rho1a = eval_rho4(_sorted_mol, ao, c0_a, mo1a[:,mask], xctype=xctype, with_lapl=with_lapl) - rho1b = eval_rho4(_sorted_mol, ao, c0_b, mo1b[:,mask], xctype=xctype, with_lapl=with_lapl) - else: - # slow version - rho1a = [] - rho1b = [] + rho1a = eval_rho4(_sorted_mol, ao, occ_coeff_a_mask, mo1a[:,mask], + xctype=xctype, hermi=hermi) + rho1b = eval_rho4(_sorted_mol, ao, occ_coeff_b_mask, mo1b[:,mask], + xctype=xctype, hermi=hermi) + rho1 = cupy.stack([rho1a, rho1b]).reshape(2, nset, nvar, p1-p0) + else: # slow version + rho1 = cupy.empty((2, nset, nvar, p1-p0)) for i in range(nset): - rho_tmp = eval_rho(_sorted_mol, ao, dma[i][np.ix_(mask,mask)], xctype=xctype, hermi=hermi, with_lapl=with_lapl) - rho1a.append(rho_tmp) - rho_tmp = eval_rho(_sorted_mol, ao, dmb[i][np.ix_(mask,mask)], xctype=xctype, hermi=hermi, with_lapl=with_lapl) - rho1b.append(rho_tmp) - rho1a = cupy.stack(rho1a, axis=0) - rho1b = cupy.stack(rho1b, axis=0) - rho1 = cupy.stack([rho1a, rho1b], axis=0) + rho1[0,i] = eval_rho(_sorted_mol, ao, dma[i,mask[:,None],mask], + xctype=xctype, hermi=hermi) + rho1[1,i] = eval_rho(_sorted_mol, ao, dmb[i,mask[:,None],mask], + xctype=xctype, hermi=hermi) t0 = log.timer_debug1('rho', *t0) - # precompute fxc_w - if xctype == 'LDA': - fxc_w = fxc[:,0,:,0,p0:p1] * weights - else: - fxc_w = fxc[:,:,:,:,p0:p1] * weights - for i in range(nset): + wv = contract('axg,axbyg->byg', rho1[:,i], fxc_w) if xctype == 
'LDA': - wv = contract('ag,abg->bg', rho1[:,i], fxc_w) - va = ao.dot(_scale_ao(ao, wv[0]).T) - vb = ao.dot(_scale_ao(ao, wv[1]).T) - add_sparse(vmata[i], va, mask) - add_sparse(vmatb[i], vb, mask) + va = ao.dot(_scale_ao(ao, wv[0,0]).T) + vb = ao.dot(_scale_ao(ao, wv[1,0]).T) elif xctype == 'GGA': - wv = contract('axg,axbyg->byg', rho1[:,i], fxc_w) - wv[:,0] *= .5 + wv[:,0] *= .5 # for transpose_sum at the end va = ao[0].dot(_scale_ao(ao, wv[0]).T) vb = ao[0].dot(_scale_ao(ao, wv[1]).T) - add_sparse(vmata[i], va, mask) - add_sparse(vmatb[i], vb, mask) elif xctype == 'NLC': raise NotImplementedError('NLC') else: - wv = contract('axg,axbyg->byg', rho1[:,i], fxc_w) - wv[:,[0, 4]] *= .5 + wv[:,[0,4]] *= .5 # for transpose_sum at the end va = ao[0].dot(_scale_ao(ao[:4], wv[0,:4]).T) vb = ao[0].dot(_scale_ao(ao[:4], wv[1,:4]).T) va += _tau_dot(ao, ao, wv[0,4]) vb += _tau_dot(ao, ao, wv[1,4]) - add_sparse(vmata[i], va, mask) - add_sparse(vmatb[i], vb, mask) - vmata = [coeff.T @ v @ coeff for v in vmata] - vmatb = [coeff.T @ v @ coeff for v in vmatb] + add_sparse(vmata[i], va, mask) + add_sparse(vmatb[i], vb, mask) + vmata = opt.unsort_orbitals(vmata, axis=[1,2]) + vmatb = opt.unsort_orbitals(vmatb, axis=[1,2]) if xctype != 'LDA': # For real orbitals, K_{ia,bj} = K_{ia,jb}. It simplifies real fxc_jb # [(\nabla mu) nu + mu (\nabla nu)] * fxc_jb = ((\nabla mu) nu f_jb) + h.c. - for i in range(nset): - vmata[i] = vmata[i] + vmata[i].T - vmatb[i] = vmatb[i] + vmatb[i].T + transpose_sum(vmata) + transpose_sum(vmatb) if FREE_CUPY_CACHE: dma = dmb = None @@ -1228,23 +1152,22 @@ def nr_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, nao, nao0 = opt.coeff.shape mol = None _sorted_mol = opt._sorted_mol - coeff = cupy.asarray(opt.coeff) - dms = [coeff @ dm @ coeff.T for dm in dms.reshape(-1,nao0,nao0)] + + dms = dms.reshape(-1,nao0,nao0) assert len(dms) == 1 + dms = opt.sort_orbitals(dms, axis=[1,2]) if mo_coeff is not None: - mo_coeff = coeff @ mo_coeff - with_lapl = MGGA_DENSITY_LAPL + mo_coeff = opt.sort_orbitals(mo_coeff, axis=[0]) ao_deriv = 1 vvrho = [] for ao, idx, weight, coords \ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, max_memory=max_memory): - #rho = eval_rho(opt.mol, ao, dms[0][np.ix_(mask,mask)], xctype='GGA', hermi=1) if mo_coeff is None: - rho = eval_rho(_sorted_mol, ao, dms[0][np.ix_(idx,idx)], xctype='GGA', hermi=1, with_lapl=with_lapl) + rho = eval_rho(_sorted_mol, ao, dms[0][idx[:,None],idx], xctype='GGA', hermi=1) else: mo_coeff_mask = mo_coeff[idx,:] - rho = eval_rho2(_sorted_mol, ao, mo_coeff_mask, mo_occ, None, 'GGA', with_lapl) + rho = eval_rho2(_sorted_mol, ao, mo_coeff_mask, mo_occ, None, 'GGA') vvrho.append(rho) rho = cupy.hstack(vvrho) @@ -1277,7 +1200,7 @@ def nr_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, t1 = log.timer_debug1('integration', *t1) transpose_sum(vmat) - vmat = take_last2d(vmat, opt.rev_ao_idx) + vmat = opt.unsort_orbitals(vmat, axis=[0,1]) log.timer_debug1('eval vv10', *t0) return nelec, excsum, vmat @@ -1293,7 +1216,6 @@ def cache_xc_kernel(ni, mol, grids, xc_code, mo_coeff, mo_occ, spin=0, raise NotImplementedError('NLC') else: ao_deriv = 0 - with_lapl = MGGA_DENSITY_LAPL opt = getattr(ni, 'gdftopt', None) if opt is None or mol not in [opt.mol, opt._sorted_mol]: ni.build(mol, grids.coords) @@ -1301,28 +1223,34 @@ def cache_xc_kernel(ni, mol, grids, xc_code, mo_coeff, mo_occ, spin=0, mol = None _sorted_mol = opt._sorted_mol - coeff = cupy.asarray(opt.coeff) - nao = coeff.shape[0] - if spin == 0: - mo_coeff = coeff @ 
mo_coeff + mo_coeff = cupy.asarray(mo_coeff) + nao = opt.coeff.shape[0] + if mo_coeff.ndim == 2: # RHF + mo_coeff = opt.sort_orbitals(mo_coeff, axis=[0]) rho = [] t1 = t0 = log.init_timer() - for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv): + for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, + max_memory=max_memory): mo_coeff_mask = mo_coeff[idx,:] - rho_slice = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask, mo_occ, None, xctype, with_lapl) + rho_slice = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask, mo_occ, None, xctype) rho.append(rho_slice) t1 = log.timer_debug2('eval rho slice', *t1) rho = cupy.hstack(rho) + if spin == 1: # RKS with nr_rks_fxc_st + rho *= .5 + rho = cupy.repeat(rho[None], 2, axis=0) t0 = log.timer_debug1('eval rho in fxc', *t0) else: - mo_coeff = contract('ip,npj->nij', coeff, cupy.asarray(mo_coeff)) + assert spin == 1 + mo_coeff = opt.sort_orbitals(mo_coeff, axis=[1]) rhoa = [] rhob = [] t1 = t0 = log.init_timer() - for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv): + for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, + max_memory=max_memory): mo_coeff_mask = mo_coeff[:,idx,:] - rhoa_slice = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask[0], mo_occ[0], None, xctype, with_lapl) - rhob_slice = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask[1], mo_occ[1], None, xctype, with_lapl) + rhoa_slice = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask[0], mo_occ[0], None, xctype) + rhob_slice = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask[1], mo_occ[1], None, xctype) rhoa.append(rhoa_slice) rhob.append(rhob_slice) t1 = log.timer_debug2('eval rho in fxc', *t1) @@ -1348,7 +1276,8 @@ def eval_xc_eff(ni, xc_code, rho, deriv=1, omega=None, xctype=None, verbose=None if omega is None: omega = ni.omega if xctype is None: xctype = ni._xc_type(xc_code) - if ni.xcfuns is None: ni.xcfuns = _init_xcfuns(xc_code, spin_polarized) + + xcfuns = ni._init_xcfuns(xc_code, spin_polarized) inp = {} if not spin_polarized: @@ -1391,13 +1320,13 @@ def eval_xc_eff(ni, xc_code, rho, deriv=1, omega=None, xctype=None, verbose=None "v3sigma2lapl", "v3sigma2tau", "v3sigmalapl2", "v3sigmalapltau", "v3sigmatau2", "v3lapl3", "v3lapl2tau", "v3lapltau2", "v3tau3"] - if len(ni.xcfuns) == 1: - xcfun, _ = ni.xcfuns[0] + if len(xcfuns) == 1: + xcfun, _ = xcfuns[0] xc_res = xcfun.compute(inp, do_exc=True, do_vxc=do_vxc, do_fxc=do_fxc, do_kxc=do_kxc) ret_full = xc_res else: ret_full = {} - for xcfun, w in ni.xcfuns: + for xcfun, w in xcfuns: xc_res = xcfun.compute(inp, do_exc=True, do_vxc=do_vxc, do_fxc=do_fxc, do_kxc=do_kxc) for label in xc_res: if label in ret_full: @@ -1539,11 +1468,14 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000, pad, idx, non0shl_idx, ctr_offsets_slice, ao_loc_slice = ni.non0ao_idx[block_id, blksize, ngrids] ao_mask = eval_ao( - ni, _sorted_mol, coords, deriv, + _sorted_mol, coords, deriv, nao_slice=len(idx), shls_slice=non0shl_idx, ao_loc_slice=ao_loc_slice, - ctr_offsets_slice=ctr_offsets_slice) + ctr_offsets_slice=ctr_offsets_slice, + gdftopt=opt, + transpose=False + ) t1 = log.timer_debug2('evaluate ao slice', *t1) if pad > 0: @@ -1579,7 +1511,7 @@ def _grouped_block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000, raise RuntimeError('Not enough GPU memory') opt = getattr(ni, 'gdftopt', None) - if opt is None or mol not in [opt.mol, opt._sorted_mol]: + if opt is None: ni.build(mol, grids.coords) opt = ni.gdftopt @@ -1590,7 +1522,6 @@ def 
_grouped_block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000, total_used_bytes = 0 mem_limit = get_avail_mem() - mol = None _sorted_mol = opt._sorted_mol with opt.gdft_envs_cache(): block_id = 0 @@ -1605,11 +1536,14 @@ def _grouped_block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000, pad, idx, non0shl_idx, ctr_offsets_slice, ao_loc_slice = ni.non0ao_idx[block_id, blksize, ngrids] ao_mask = eval_ao( - ni, _sorted_mol, coords, deriv, + _sorted_mol, coords, deriv, nao_slice=len(idx), shls_slice=non0shl_idx, ao_loc_slice=ao_loc_slice, - ctr_offsets_slice=ctr_offsets_slice) + ctr_offsets_slice=ctr_offsets_slice, + gdftopt=opt, + transpose=False + ) if pad > 0: if deriv == 0: @@ -1660,7 +1594,7 @@ def _xc_type(self, xc_code): class NumInt(lib.StreamObject, LibXCMixin): from gpu4pyscf.lib.utils import to_gpu, device - _keys = {'screen_idx', 'xcfuns', 'gdftopt'} + _keys = {'screen_index', 'xcfuns', 'gdftopt', 'pair_mask', 'grid_blksize', 'non0ao_idx'} gdftopt = None pair_mask = None screen_index = None @@ -1700,14 +1634,27 @@ def build(self, mol, coords): # cannot patch this function eval_xc_eff = eval_xc_eff block_loop = _block_loop - eval_rho2 = eval_rho2 - eval_ao = eval_ao - #eval_rho2 = staticmethod(eval_rho2) + eval_ao = staticmethod(eval_ao) + eval_rho = staticmethod(eval_rho) + eval_rho2 = staticmethod(eval_rho2) def to_cpu(self): ni = numint.NumInt() return ni + @lru_cache(10) + def _init_xcfuns(self, xc_code, spin): + return _init_xcfuns(xc_code, spin) + + def reset(self): + self.gdftopt = None + self.pair_mask = None + self.screen_index = None + self.xcfuns = None + self.grid_blksize = None + self.non0ao_idx = {} + return self + def _make_pairs2shls_idx(pair_mask, l_bas_loc, hermi=0): if hermi: pair_mask = np.tril(pair_mask) @@ -1985,9 +1932,7 @@ def build(self, mol=None): coeff = np.vstack([coeff, np.zeros((paddings, coeff.shape[1]))]) pmol._decontracted = True self._sorted_mol = pmol - inv_idx = np.argsort(ao_idx, kind='stable').astype(np.int32) - self.ao_idx = cupy.asarray(ao_idx, dtype=np.int32) - self.rev_ao_idx = cupy.asarray(inv_idx, dtype=np.int32) + self._ao_idx = cupy.asarray(ao_idx, dtype=np.int32) self.coeff = coeff[ao_idx] self.l_ctr_offsets = np.append(0, np.cumsum(l_ctr_counts)).astype(np.int32) self.l_bas_offsets = np.append(0, np.cumsum(l_counts)).astype(np.int32) @@ -2014,5 +1959,40 @@ def gdft_envs_cache(self): finally: libgdft.GDFTdel_envs(ctypes.byref(self.envs_cache)) + def sort_orbitals(self, mat, axis=[]): + ''' Transform given axis of a matrix into sorted AO + ''' + idx = self._ao_idx + shape_ones = (1,) * mat.ndim + fancy_index = [] + for dim, n in enumerate(mat.shape): + if dim in axis: + assert n == len(idx) + indices = idx + else: + indices = np.arange(n) + idx_shape = shape_ones[:dim] + (-1,) + shape_ones[dim+1:] + fancy_index.append(indices.reshape(idx_shape)) + return mat[tuple(fancy_index)] + + def unsort_orbitals(self, sorted_mat, axis=[], out=None): + ''' Transform given axis of a matrix into original AO + ''' + idx = self._ao_idx + shape_ones = (1,) * sorted_mat.ndim + fancy_index = [] + for dim, n in enumerate(sorted_mat.shape): + if dim in axis: + assert n == len(idx) + indices = idx + else: + indices = np.arange(n) + idx_shape = shape_ones[:dim] + (-1,) + shape_ones[dim+1:] + fancy_index.append(indices.reshape(idx_shape)) + if out is None: + out = cupy.empty_like(sorted_mat) + out[tuple(fancy_index)] = sorted_mat + return out + class _GDFTEnvsCache(ctypes.Structure): pass diff --git a/gpu4pyscf/dft/rks.py b/gpu4pyscf/dft/rks.py 
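The sort_orbitals/unsort_orbitals pair defined above replaces the previous take_last2d/rev_ao_idx bookkeeping: sort_orbitals gathers the requested axes through the stored _ao_idx permutation, and unsort_orbitals scatters a sorted array back through the same index, so the inverse permutation no longer needs to be stored. A minimal standalone NumPy sketch of the round trip (illustrative only; idx stands in for opt._ao_idx):

import numpy as np

idx = np.array([2, 0, 1])            # stand-in for opt._ao_idx
m = np.arange(9.0).reshape(3, 3)

# sort_orbitals(m, axis=[0,1]): gather both axes through idx
sorted_m = m[idx[:, None], idx]

# unsort_orbitals(sorted_m, axis=[0,1]): scatter back through the same
# fancy index; no inverse permutation (rev_ao_idx) is required
restored = np.empty_like(sorted_m)
restored[idx[:, None], idx] = sorted_m
assert np.array_equal(restored, m)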
index fb3820b3..333034f8 100644 --- a/gpu4pyscf/dft/rks.py +++ b/gpu4pyscf/dft/rks.py @@ -25,15 +25,13 @@ from gpu4pyscf.lib import logger from gpu4pyscf.dft import numint, gen_grid from gpu4pyscf.scf import hf -from gpu4pyscf.lib.cupy_helper import load_library, tag_array +from gpu4pyscf.lib.cupy_helper import tag_array from pyscf import __config__ __all__ = [ - 'get_veff', 'RKS' + 'get_veff', 'RKS', 'KohnShamDFT', ] -libcupy_helper = load_library('libcupy_helper') - def prune_small_rho_grids_(ks, mol, dm, grids): rho = ks._numint.get_rho(mol, dm, grids, ks.max_memory, verbose=ks.verbose) @@ -134,16 +132,14 @@ def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1): if hermi == 2: # because rho = 0 n, exc, vxc = 0, 0, 0 else: - max_memory = ks.max_memory - lib.current_memory()[0] - n, exc, vxc = ni.nr_rks(mol, ks.grids, ks.xc, dm, max_memory=max_memory) + n, exc, vxc = ni.nr_rks(mol, ks.grids, ks.xc, dm) if ks.do_nlc(): if ni.libxc.is_nlc(ks.xc): xc = ks.xc else: assert ni.libxc.is_nlc(ks.nlc) xc = ks.nlc - n, enlc, vnlc = ni.nr_nlc_vxc(mol, ks.nlcgrids, xc, dm, - max_memory=max_memory) + n, enlc, vnlc = ni.nr_nlc_vxc(mol, ks.nlcgrids, xc, dm) exc += enlc vxc += vnlc @@ -151,8 +147,7 @@ def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1): t0 = logger.timer_debug1(ks, 'vxc tot', *t0) #enabling range-separated hybrids - omega, alpha, hyb = ni.rsh_and_hybrid_coeff(ks.xc, spin=mol.spin) - if abs(hyb) < 1e-10 and abs(alpha) < 1e-10: + if not ni.libxc.is_hybrid_xc(ks.xc): vk = None if (ks._eri is None and ks.direct_scf and getattr(vhf_last, 'vj', None) is not None): @@ -164,6 +159,7 @@ def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1): vxc += vj else: + omega, alpha, hyb = ni.rsh_and_hybrid_coeff(ks.xc, spin=mol.spin) if (ks._eri is None and ks.direct_scf and getattr(vhf_last, 'vk', None) is not None): ddm = cupy.asarray(dm) - cupy.asarray(dm_last) @@ -232,6 +228,16 @@ def energy_elec(ks, dm=None, h1e=None, vhf=None): # Inherit pyscf KohnShamDFT class since this is tested in the pyscf dispersion code class KohnShamDFT(rks.KohnShamDFT): + _keys = {'cphf_grids', *rks.KohnShamDFT._keys} + + to_rhf = NotImplemented + to_uhf = NotImplemented + to_ghf = NotImplemented + to_hf = NotImplemented + to_rks = NotImplemented + to_uks = NotImplemented + to_gks = NotImplemented + _keys = rks.KohnShamDFT._keys def __init__(self, xc='LDA,VWN'): @@ -245,6 +251,14 @@ def __init__(self, xc='LDA,VWN'): self.nlcgrids = gen_grid.Grids(self.mol) self.nlcgrids.level = getattr( __config__, 'dft_rks_RKS_nlcgrids_level', self.nlcgrids.level) + + # Default CPHF grids is SG1 grids + # Reference: + # https://gaussian.com/integral/?tabid=1#Integral_keyword__Grid_option + self.cphf_grids = gen_grid.Grids(self.mol) + self.cphf_grids.prune = gen_grid.sg1_prune + self.cphf_grids.atom_grid = (50,194) + # Use rho to filter grids self.small_rho_cutoff = getattr( __config__, 'dft_rks_RKS_small_rho_cutoff', 1e-7) @@ -261,7 +275,7 @@ def omega(self, v): def dump_flags(self, verbose=None): # TODO: add this later return - + reset = rks.KohnShamDFT.reset do_nlc = rks.KohnShamDFT.do_nlc @@ -285,7 +299,8 @@ def reset(self, mol=None): hf.SCF.reset(self, mol) self.grids.reset(mol) self.nlcgrids.reset(mol) - self._numint.gdftopt = None + self.cphf_grids.reset(mol) + self._numint.reset() return self def nuc_grad_method(self): diff --git a/gpu4pyscf/dft/tests/test_ao_values.py b/gpu4pyscf/dft/tests/test_ao_values.py index 86d52d6c..8a1a1457 100644 --- a/gpu4pyscf/dft/tests/test_ao_values.py +++ 
b/gpu4pyscf/dft/tests/test_ao_values.py @@ -55,40 +55,35 @@ def test_ao_sph_deriv0(self): coords = np.random.random((100,3)) ao = mol_sph.eval_gto('GTOval_sph_deriv0', coords) ao_cpu = cupy.asarray(ao) - ni = NumInt() - ao_gpu = numint.eval_ao(ni, mol_sph, coords, deriv=0) + ao_gpu = numint.eval_ao(mol_sph, coords, deriv=0) assert cupy.linalg.norm(ao_cpu - ao_gpu) < 1e-8 - + def test_ao_sph_deriv1(self): coords = np.random.random((100,3)) ao = mol_sph.eval_gto('GTOval_sph_deriv1', coords) ao_cpu = cupy.asarray(ao) - ni = NumInt() - ao_gpu = numint.eval_ao(ni, mol_sph, coords, deriv=1) + ao_gpu = numint.eval_ao(mol_sph, coords, deriv=1) assert cupy.linalg.norm(ao_cpu - ao_gpu) < 1e-8 def test_ao_sph_deriv2(self): coords = np.random.random((4,3)) ao = mol_sph.eval_gto('GTOval_sph_deriv2', coords) ao_cpu = cupy.asarray(ao) - ni = NumInt() - ao_gpu = numint.eval_ao(ni, mol_sph, coords, deriv=2) + ao_gpu = numint.eval_ao(mol_sph, coords, deriv=2) assert cupy.linalg.norm(ao_cpu - ao_gpu) < 1e-8 def test_ao_sph_deriv3(self): coords = np.random.random((100,3)) ao = mol_sph.eval_gto('GTOval_sph_deriv3', coords) ao_cpu = cupy.asarray(ao) - ni = NumInt() - ao_gpu = numint.eval_ao(ni, mol_sph, coords, deriv=3) + ao_gpu = numint.eval_ao(mol_sph, coords, deriv=3) assert cupy.linalg.norm(ao_cpu - ao_gpu) < 1e-8 def test_ao_sph_deriv4(self): coords = np.random.random((100,3)) ao = mol_sph.eval_gto('GTOval_sph_deriv4', coords) ao_cpu = cupy.asarray(ao) - ni = NumInt() - ao_gpu = numint.eval_ao(ni, mol_sph, coords, deriv=4) + ao_gpu = numint.eval_ao(mol_sph, coords, deriv=4) assert cupy.linalg.norm(ao_cpu - ao_gpu) < 1e-8 # cart mol @@ -96,24 +91,21 @@ def test_ao_cart_deriv0(self): coords = np.random.random((100,3)) ao = mol_cart.eval_gto('GTOval_cart_deriv0', coords) ao_cpu = cupy.asarray(ao) - ni = NumInt() - ao_gpu = numint.eval_ao(ni, mol_cart, coords, deriv=0) + ao_gpu = numint.eval_ao(mol_cart, coords, deriv=0) assert cupy.linalg.norm(ao_cpu - ao_gpu) < 1e-8 def test_ao_cart_deriv1(self): coords = np.random.random((100,3)) ao = mol_cart.eval_gto('GTOval_cart_deriv1', coords) ao_cpu = cupy.asarray(ao) - ni = NumInt() - ao_gpu = numint.eval_ao(ni, mol_cart, coords, deriv=1) + ao_gpu = numint.eval_ao(mol_cart, coords, deriv=1) assert cupy.linalg.norm(ao_cpu - ao_gpu) < 1e-8 def test_ao_cart_deriv2(self): coords = np.random.random((100,3)) ao = mol_cart.eval_gto('GTOval_cart_deriv2', coords) ao_cpu = cupy.asarray(ao) - ni = NumInt() - ao_gpu = numint.eval_ao(ni, mol_cart, coords, deriv=2) + ao_gpu = numint.eval_ao(mol_cart, coords, deriv=2) assert cupy.linalg.norm(ao_cpu - ao_gpu) < 1e-8 def test_ao_cart_deriv3(self): @@ -128,8 +120,7 @@ def test_ao_cart_deriv4(self): coords = np.random.random((100,3)) ao = mol_cart.eval_gto('GTOval_cart_deriv4', coords) ao_cpu = cupy.asarray(ao) - ni = NumInt() - ao_gpu = numint.eval_ao(ni, mol_cart, coords, deriv=4) + ao_gpu = numint.eval_ao(mol_cart, coords, deriv=4) assert cupy.linalg.norm(ao_cpu - ao_gpu) < 1e-8 if __name__ == "__main__": diff --git a/gpu4pyscf/dft/tests/test_libxc.py b/gpu4pyscf/dft/tests/test_libxc.py index 229f0854..80d305aa 100644 --- a/gpu4pyscf/dft/tests/test_libxc.py +++ b/gpu4pyscf/dft/tests/test_libxc.py @@ -47,8 +47,12 @@ def tearDownModule(): mol.stdout.close() del mol +def _diff(dat, ref): + d = dat - ref + return np.min((abs(d/(ref+1e-300)), abs(d)), axis=0) + class KnownValues(unittest.TestCase): - def _check_xc(self, xc): + def _check_xc(self, xc, spin=0, fxc_tol=1e-10, kxc_tol=1e-10): ni_cpu = numint_cpu() ni_gpu = numint_gpu() 
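A note on the _diff helper added to test_libxc.py above: for each element it keeps the smaller of the relative and the absolute error, so near-zero reference values (where the relative error diverges) are scored by absolute deviation while everything else is scored relatively; the 1e-300 guard avoids division by zero. A small self-contained illustration of that behavior:

import numpy as np

def _diff(dat, ref):
    d = dat - ref
    return np.min((abs(d/(ref+1e-300)), abs(d)), axis=0)

ref = np.array([0.0, 1e-14, 2.0])
dat = ref + 1e-12        # small absolute deviation everywhere
# A purely relative test would fail on the first two entries; the mixed
# metric falls back to the absolute error there.
assert _diff(dat, ref).max() < 1e-10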
xctype = ni_cpu._xc_type(xc) @@ -60,26 +64,42 @@ def _check_xc(self, xc): grids = Grids(mol).build() ao = ni_cpu.eval_ao(mol, grids.coords, ao_deriv) rho = ni_cpu.eval_rho(mol, ao, dm0, xctype=xctype) + if spin != 0: + rho = (rho, rho) exc_cpu, vxc_cpu, fxc_cpu, kxc_cpu = ni_cpu.eval_xc_eff(xc, rho, deriv=2, xctype=xctype) exc_gpu, vxc_gpu, fxc_gpu, kxc_gpu = ni_gpu.eval_xc_eff(xc, cupy.array(rho), deriv=2, xctype=xctype) - assert(np.linalg.norm((exc_gpu[:,0].get() - exc_cpu)) < 1e-10) - assert(np.linalg.norm((vxc_gpu.get() - vxc_cpu)) < 1e-10) + assert _diff(exc_gpu[:,0].get(), exc_cpu).max() < 1e-10 + assert _diff(vxc_gpu.get(), vxc_cpu).max() < 1e-10 if fxc_gpu is not None: - assert(np.linalg.norm((fxc_gpu.get() - fxc_cpu))/np.linalg.norm(fxc_cpu) < 1e-6) + assert _diff(fxc_gpu.get(), fxc_cpu).max() < fxc_tol if kxc_gpu is not None: - assert(np.linalg.norm(kxc_gpu.get() - kxc_cpu) < 1e-5) + assert _diff(kxc_gpu.get(), kxc_cpu).max() < kxc_tol def test_LDA(self): self._check_xc('LDA_C_VWN') def test_GGA(self): - self._check_xc('GGA_C_PBE') + self._check_xc('HYB_GGA_XC_B3LYP') + self._check_xc('GGA_X_B88', fxc_tol=1e-10) + self._check_xc('GGA_C_PBE', fxc_tol=1e-5) def test_mGGA(self): - self._check_xc('MGGA_C_M06') + self._check_xc('MGGA_C_M06', fxc_tol=1e-5) + + def test_u_LDA(self): + self._check_xc('LDA_C_VWN', spin=1) + + def test_u_GGA(self): + # large errors found in B88 for the spin polarized case + self._check_xc('HYB_GGA_XC_B3LYP', spin=1, fxc_tol=1e-3) + self._check_xc('GGA_X_B88', spin=1, fxc_tol=1e-1) + self._check_xc('GGA_C_PBE', spin=1, fxc_tol=1e-5) + + def test_u_mGGA(self): + self._check_xc('MGGA_C_M06', spin=1, fxc_tol=1e-5) if __name__ == "__main__": print("Full Tests for xc fun") - unittest.main() \ No newline at end of file + unittest.main() diff --git a/gpu4pyscf/dft/tests/test_numint.py b/gpu4pyscf/dft/tests/test_numint.py index ba34f63d..505df831 100644 --- a/gpu4pyscf/dft/tests/test_numint.py +++ b/gpu4pyscf/dft/tests/test_numint.py @@ -155,7 +155,7 @@ def test_rks_gga(self): def test_rks_mgga(self): self._check_vxc('nr_rks', MGGA_M06) - + def test_uks_lda(self): self._check_vxc('nr_uks', LDA)#'lda', -6.362059440515177) @@ -212,7 +212,25 @@ def test_vv10(self): v = dft.numint._vv10nlc(rho, coords, vvrho, vvweight, vvcoords, nlc_pars) self.assertAlmostEqual(lib.fp(v[0].get()), 0.15894647203764295, 8) self.assertAlmostEqual(lib.fp(v[1].get()), 0.20500922537924576, 8) - return + + def test_eval_rho(self): + np.random.seed(1) + dm = np.random.random(dm0.shape) + ni_gpu = NumInt() + ni_cpu = pyscf_numint() + for xctype in ('LDA', 'GGA', 'MGGA'): + deriv = 1 + if xctype == 'LDA': + deriv = 0 + ao_gpu = ni_gpu.eval_ao(mol, grids_gpu.coords, deriv=deriv, transpose=False) + ao_cpu = ni_cpu.eval_ao(mol, grids_cpu.coords, deriv=deriv) + rho = ni_gpu.eval_rho(mol, ao_gpu, dm, xctype=xctype, hermi=0, with_lapl=False) + ref = ni_cpu.eval_rho(mol, ao_cpu, dm, xctype=xctype, hermi=0, with_lapl=False) + self.assertAlmostEqual(abs(rho.get() - ref).max(), 0, 10) + + rho = ni_gpu.eval_rho(mol, ao_gpu, dm0, xctype=xctype, hermi=1, with_lapl=False) + ref = ni_cpu.eval_rho(mol, ao_cpu, dm0, xctype=xctype, hermi=1, with_lapl=False) + self.assertAlmostEqual(abs(rho.get() - ref).max(), 0, 10) if __name__ == "__main__": print("Full Tests for dft numint") diff --git a/gpu4pyscf/dft/uks.py b/gpu4pyscf/dft/uks.py index 398f8b81..7ccf20c7 100644 --- a/gpu4pyscf/dft/uks.py +++ b/gpu4pyscf/dft/uks.py @@ -133,7 +133,8 @@ def reset(self, mol=None): hf.SCF.reset(self, mol) self.grids.reset(mol) 
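The new test_eval_rho above checks the GPU eval_rho against PySCF's CPU reference for each functional rung; note the transpose=False flag to eval_ao, which, judging from the eval_ao changes elsewhere in this diff, keeps the AO values in the GPU-native AO-major layout consumed by the density kernels rather than transposing to the CPU-style grid-major layout. A hedged usage sketch, with mol and a symmetric density matrix dm0 assumed to exist:

import numpy as np
from gpu4pyscf.dft.numint import NumInt

ni = NumInt()
coords = np.random.random((64, 3))
# AO-major output; the transpose is only needed for CPU-style consumers
ao = ni.eval_ao(mol, coords, deriv=0, transpose=False)
rho = ni.eval_rho(mol, ao, dm0, xctype='LDA', hermi=1, with_lapl=False)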
self.nlcgrids.reset(mol) - self._numint.gdftopt = None + self.cphf_grids.reset(mol) + self._numint.reset() return self def nuc_grad_method(self): @@ -145,4 +146,4 @@ def to_cpu(self): mf = uks.UKS(self.mol, xc=self.xc) mf.disp = self.disp utils.to_cpu(self, mf) - return mf \ No newline at end of file + return mf diff --git a/gpu4pyscf/grad/rhf.py b/gpu4pyscf/grad/rhf.py index 7cc5e78d..70ab8240 100644 --- a/gpu4pyscf/grad/rhf.py +++ b/gpu4pyscf/grad/rhf.py @@ -256,8 +256,8 @@ def get_grad_hcore(mf_grad, mo_coeff=None, mo_occ=None): intopt = int3c2e.VHFOpt(mol, fakemol, 'int2e') intopt.build(1e-14, diag_block_with_triu=True, aosym=False, group_size=int3c2e.BLKSIZE, group_size_aux=int3c2e.BLKSIZE) - orbo_sorted = orbo[intopt.ao_idx] - mo_coeff_sorted = mo_coeff[intopt.ao_idx] + orbo_sorted = intopt.sort_orbitals(orbo, axis=[0]) + mo_coeff_sorted = intopt.sort_orbitals(mo_coeff, axis=[0]) for i0,i1,j0,j1,k0,k1,int3c_blk in int3c2e.loop_int3c2e_general(intopt, ip_type='ip1'): dh1e[k0:k1,:,j0:j1,:] += contract('xkji,io->kxjo', int3c_blk, orbo_sorted[i0:i1]) dh1e[k0:k1,:,i0:i1,:] += contract('xkji,jo->kxio', int3c_blk, orbo_sorted[j0:j1]) diff --git a/gpu4pyscf/grad/rks.py b/gpu4pyscf/grad/rks.py index 2ef4a6d8..1fd43ac0 100644 --- a/gpu4pyscf/grad/rks.py +++ b/gpu4pyscf/grad/rks.py @@ -135,9 +135,8 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, coeff = cupy.asarray(opt.coeff) nao, nao0 = coeff.shape dms = cupy.asarray(dms).reshape(-1,nao0,nao0) - dms = take_last2d(dms, opt.ao_idx) - mo_coeff = mo_coeff[opt.ao_idx] - + dms = opt.sort_orbitals(dms, axis=[1,2]) + mo_coeff = opt.sort_orbitals(mo_coeff, axis=[0]) nset = len(dms) assert nset == 1 vmat = cupy.zeros((nset,3,nao,nao)) @@ -179,8 +178,7 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, vtmp = _gga_grad_sum_(ao_mask, wv) vtmp += _tau_grad_dot_(ao_mask, wv[4]) add_sparse(vmat[idm], vtmp, idx) - #vmat = [cupy.einsum('pi,npq,qj->nij', coeff, v, coeff) for v in vmat] - vmat = take_last2d(vmat, opt.rev_ao_idx) + vmat = opt.unsort_orbitals(vmat, axis=[2,3]) exc = None if nset == 1: vmat = vmat[0] @@ -203,10 +201,9 @@ def get_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, _sorted_mol = opt._sorted_mol coeff = cupy.asarray(opt.coeff) nao, nao0 = coeff.shape - dms = cupy.asarray(dms) - dms = [coeff @ dm @ coeff.T - for dm in dms.reshape(-1,nao0,nao0)] - mo_coeff = coeff @ mo_coeff + dms = cupy.asarray(dms).reshape(-1,nao0,nao0) + dms = opt.sort_orbitals(dms, axis=[1,2]) + mo_coeff = opt.sort_orbitals(mo_coeff, axis=[0]) nset = len(dms) assert nset == 1 @@ -238,10 +235,7 @@ def get_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, vmat_tmp = _gga_grad_sum_(ao_mask, wv) add_sparse(vmat, vmat_tmp, mask) - #vmat = contract('npq,qj->npj', vmat, coeff) - #vmat = contract('pi,npj->nij', coeff, vmat) - rev_ao_idx = opt.rev_ao_idx - vmat = take_last2d(vmat, rev_ao_idx) + vmat = opt.unsort_orbitals(vmat, axis=[1,2]) exc = None # - sign because nabla_X = -nabla_x return exc, -vmat @@ -358,7 +352,7 @@ def get_vxc_full_response(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, for atm_id, (coords, weight, weight1) in enumerate(grids_response_cc(grids)): ngrids = weight.size for p0, p1 in lib.prange(0,ngrids,block_size): - ao = numint.eval_ao(ni, _sorted_mol, coords[p0:p1, :], ao_deriv) + ao = numint.eval_ao(_sorted_mol, coords[p0:p1, :], ao_deriv, gdftopt=opt, transpose=False) if xctype == 'LDA': rho = numint.eval_rho(_sorted_mol, ao[0], dms, @@ -409,7 +403,7 @@ def get_vxc_full_response(ni, 
mol, grids, xc_code, dms, relativity=0, hermi=1, #:vmat = cupy.einsum('pi,npq,qj->nij', coeff, vmat, coeff) vmat = sandwich_dot(vmat, coeff) - + # - sign because nabla_X = -nabla_x return excsum, -vmat @@ -424,7 +418,7 @@ def grids_response_cc(grids): atm_dist = gto.inter_distance(mol, atm_coords) atm_dist = cupy.asarray(atm_dist) atm_coords = cupy.asarray(atm_coords) - + def _radii_adjust(mol, atomic_radii): charges = mol.atom_charges() if grids.radii_adjust == radi.treutler_atomic_radii_adjust: diff --git a/gpu4pyscf/grad/uks.py b/gpu4pyscf/grad/uks.py index 32848381..32d18207 100644 --- a/gpu4pyscf/grad/uks.py +++ b/gpu4pyscf/grad/uks.py @@ -90,7 +90,7 @@ def get_veff(ks_grad, mol=None, dm=None, verbose=None): vxc_tmp[0] += vnlc vxc_tmp[1] += vnlc t0 = logger.timer(ks_grad, 'vxc', *t0) - + mo_coeff_alpha = mf.mo_coeff[0] mo_coeff_beta = mf.mo_coeff[1] occ_coeff0 = cupy.asarray(mo_coeff_alpha[:, mf.mo_occ[0]>0.5], order='C') @@ -139,9 +139,8 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, coeff = cupy.asarray(opt.coeff) nao, nao0 = coeff.shape dms = cupy.asarray(dms) - dms = take_last2d(dms, opt.ao_idx) - mo_coeff = mo_coeff[:, opt.ao_idx] - + dms = opt.sort_orbitals(dms, axis=[1,2]) + mo_coeff = opt.sort_orbitals(mo_coeff, axis=[1]) nset = len(dms) vmat = cupy.zeros((nset,3,nao,nao)) if xctype == 'LDA': @@ -193,7 +192,7 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, vtmp += rks_grad._tau_grad_dot_(ao_mask, wv[1,4]) add_sparse(vmat[1], vtmp, idx) - vmat = take_last2d(vmat, opt.rev_ao_idx) + vmat = opt.unsort_orbitals(vmat, axis=[2,3]) exc = None # - sign because nabla_X = -nabla_x @@ -216,8 +215,7 @@ def get_vxc_full_response(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, nao, nao0 = coeff.shape dms = cupy.asarray(dms) assert dms.ndim == 3 and dms.shape[0] == 2 - #:dms = cupy.einsum('pi,nij,qj->npq', coeff, dms, coeff) - dms = sandwich_dot(dms.reshape(-1,nao0,nao0), coeff.T) + dms = opt.sort_orbitals(dms.reshape(-1,nao0,nao0), axis=[1,2]) excsum = cupy.zeros((natm, 3)) vmat = cupy.zeros((2,3,nao,nao)) @@ -239,7 +237,7 @@ def get_vxc_full_response(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, for atm_id, (coords, weight, weight1) in enumerate(rks_grad.grids_response_cc(grids)): ngrids = weight.size for p0, p1 in lib.prange(0,ngrids,block_size): - ao = numint.eval_ao(ni, _sorted_mol, coords[p0:p1, :], ao_deriv) + ao = numint.eval_ao(_sorted_mol, coords[p0:p1, :], ao_deriv, gdftopt=opt, transpose=False) if xctype == 'LDA': rho_a = numint.eval_rho(_sorted_mol, ao[0], dms[0], xctype=xctype, hermi=1, with_lapl=False) @@ -304,9 +302,7 @@ def get_vxc_full_response(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, excsum[atm_id] += cupy.einsum('xij,ji->x', vtmp, dms[1]) * 2 rho = vxc = None - #:vmat = cupy.einsum('pi,snpq,qj->snij', coeff, vmat, coeff) - vmat = sandwich_dot(vmat.reshape(6,nao,nao), coeff).reshape(2,3,nao0,nao0) - + vmat = opt.unsort_orbitals(vmat, axis=[2,3]) # - sign because nabla_X = -nabla_x return excsum, -vmat @@ -326,8 +322,8 @@ def get_nlc_vxc(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, relativity=0, he _sorted_mol = opt._sorted_mol coeff = cupy.asarray(opt.coeff) nao, nao0 = coeff.shape - mo_coeff_0 = coeff @ mo_coeff[0] - mo_coeff_1 = coeff @ mo_coeff[1] + mo_coeff_0 = opt.sort_orbitals(mo_coeff[0], axis=[0]) + mo_coeff_1 = opt.sort_orbitals(mo_coeff[1], axis=[0]) nset = 1 assert nset == 1 @@ -361,8 +357,7 @@ def get_nlc_vxc(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, relativity=0, he vmat_tmp = 
rks_grad._gga_grad_sum_(ao_mask, wv) add_sparse(vmat, vmat_tmp, mask) - rev_ao_idx = opt.rev_ao_idx - vmat = take_last2d(vmat, rev_ao_idx) + vmat = opt.unsort_orbitals(vmat, axis=[1,2]) exc = None # - sign because nabla_X = -nabla_x return exc, -vmat diff --git a/gpu4pyscf/gto/mole.py b/gpu4pyscf/gto/mole.py index 83e3e323..01af5ca0 100644 --- a/gpu4pyscf/gto/mole.py +++ b/gpu4pyscf/gto/mole.py @@ -86,7 +86,7 @@ def basis_seg_contraction(mol, allow_replica=False): pmol.output = mol.output pmol.verbose = mol.verbose pmol.stdout = mol.stdout - pmol.cart = True + pmol.cart = True #mol.cart pmol._bas = np.asarray(np.vstack(_bas), dtype=np.int32) pmol._env = _env return pmol diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py index 41669c93..3d2545e2 100644 --- a/gpu4pyscf/hessian/rhf.py +++ b/gpu4pyscf/hessian/rhf.py @@ -561,15 +561,17 @@ def gen_vind(mf, mo_coeff, mo_occ): nao, nmo = mo_coeff.shape mocc = mo_coeff[:,mo_occ>0] nocc = mocc.shape[1] - vresp = mf.gen_response(mo_coeff, mo_occ, hermi=1) + mocc_2 = mocc * 2 + grids = getattr(mf, 'cphf_grids', None) + vresp = mf.gen_response(mo_coeff, mo_occ, hermi=1, grids=grids) def fx(mo1): mo1 = cupy.asarray(mo1) mo1 = mo1.reshape(-1,nmo,nocc) mo1_mo = contract('npo,ip->nio', mo1, mo_coeff) - #dm1 = contract('nio,jo->nij', 2.0*mo1_mo, mocc) + #dm1 = contract('nio,jo->nij', mo1_mo, mocc_2) #dm1 = dm1 + dm1.transpose(0,2,1) - dm1 = mo1_mo.dot(2.0*mocc.T) + dm1 = mo1_mo.dot(mocc_2.T) transpose_sum(dm1) dm1 = tag_array(dm1, mo1=mo1_mo, occ_coeff=mocc, mo_occ=mo_occ) v1 = vresp(dm1) diff --git a/gpu4pyscf/hessian/rks.py b/gpu4pyscf/hessian/rks.py index 942438f9..4f03da9e 100644 --- a/gpu4pyscf/hessian/rks.py +++ b/gpu4pyscf/hessian/rks.py @@ -52,7 +52,7 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, if mf.do_nlc(): raise NotImplementedError omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin) - with_k = abs(hyb) > 1e-10 + with_k = ni.libxc.is_hybrid_xc(mf.xc) de2, ej, ek = rhf_hess._partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ, atmlst, max_memory, verbose, with_k=with_k) @@ -103,7 +103,7 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): mf = hessobj.base ni = mf._numint omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin) - with_k = abs(hyb) > 1e-10 + with_k = ni.libxc.is_hybrid_xc(mf.xc) avail_mem -= 8 * h1mo.size slice_size = int(avail_mem*0.5) // (8*3*nao*nao) @@ -146,7 +146,6 @@ def _get_vxc_diag(hessobj, mo_coeff, mo_occ, max_memory): mo_occ = cupy.asarray(mo_occ) mo_coeff = cupy.asarray(mo_coeff) - nao_sph = mo_coeff.shape[0] ni = mf._numint xctype = ni._xc_type(mf.xc) shls_slice = (0, mol.nbas) @@ -157,8 +156,7 @@ def _get_vxc_diag(hessobj, mo_coeff, mo_occ, max_memory): ni.build(mol, grids.coords) opt = ni.gdftopt _sorted_mol = opt._sorted_mol - coeff = cupy.asarray(opt.coeff) - mo_coeff = coeff @ mo_coeff + mo_coeff = opt.sort_orbitals(mo_coeff, axis=[0]) nao = mo_coeff.shape[0] vmat = cupy.zeros((6,nao,nao)) @@ -251,9 +249,8 @@ def contract_(ao, aoidx, wv, mask): 1,3,4, 2,4,5]] - vmat = contract('npq,qj->npj', vmat, coeff) - vmat = contract('pi,npj->nij', coeff, vmat) - return vmat.reshape(3,3,nao_sph,nao_sph) + vmat = opt.unsort_orbitals(vmat, axis=[1,2]) + return vmat.reshape(3,3,nao,nao) def _make_dR_rho1(ao, ao_dm0, atm_id, aoslices, xctype): p0, p1 = aoslices[atm_id][2:] @@ -344,7 +341,7 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): _sorted_mol = opt._sorted_mol coeff = cupy.asarray(opt.coeff) dm0 = 
mf.make_rdm1(mo_coeff, mo_occ) - dm0_sorted = take_last2d(dm0, opt.ao_idx) + dm0_sorted = opt.sort_orbitals(dm0, axis=[0,1]) vmat_dm = cupy.zeros((_sorted_mol.natm,3,3,nao)) ipip = cupy.zeros((3,3,nao,nao)) if xctype == 'LDA': @@ -361,7 +358,7 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): wv = weight * vxc[0] aow = [numint._scale_ao(ao[i], wv) for i in range(1, 4)] _d1d2_dot_(ipip, mol, aow, ao[1:4], mask, ao_loc, False) - dm0_mask = dm0_sorted[numpy.ix_(mask, mask)] + dm0_mask = dm0_sorted[mask[:,None], mask] ao_dm_mask = contract('nig,ij->njg', ao_mask[:4], dm0_mask) ao_dm0 = numint._dot_ao_dm(mol, ao[0], dm0, mask, shls_slice, ao_loc) @@ -379,7 +376,7 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): ao_dm0 = aow = None t1 = log.timer_debug2('integration', *t1) for ia in range(_sorted_mol.natm): - vmat_dm[ia] = vmat_dm[ia][:,:,opt.rev_ao_idx] + vmat_dm[ia][:,:,opt._ao_idx] = vmat_dm[ia] p0, p1 = aoslices[ia][2:] vmat_dm[ia] += contract('xypq,pq->xyp', ipip[:,:,:,p0:p1], dm0[:,p0:p1]) elif xctype == 'GGA': @@ -399,7 +396,7 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): _d1d2_dot_(ipip, mol, aow, ao[1:4], mask, ao_loc, False) ao_dm0 = [numint._dot_ao_dm(mol, ao[i], dm0, mask, shls_slice, ao_loc) for i in range(4)] wf = weight * fxc - dm0_mask = dm0_sorted[numpy.ix_(mask, mask)] + dm0_mask = dm0_sorted[mask[:,None], mask] ao_dm_mask = contract('nig,ij->njg', ao_mask[:4], dm0_mask) vmat_dm_tmp = cupy.empty([3,3,nao_non0]) for ia in range(_sorted_mol.natm): @@ -416,7 +413,7 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): ao_dm0 = aow = None t1 = log.timer_debug2('integration', *t1) for ia in range(_sorted_mol.natm): - vmat_dm[ia] = vmat_dm[ia][:,:,opt.rev_ao_idx] + vmat_dm[ia][:,:,opt._ao_idx] = vmat_dm[ia] p0, p1 = aoslices[ia][2:] vmat_dm[ia] += contract('xypq,pq->xyp', ipip[:,:,:,p0:p1], dm0[:,p0:p1]) vmat_dm[ia] += contract('yxqp,pq->xyp', ipip[:,:,p0:p1], dm0[:,p0:p1]) @@ -444,7 +441,7 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): _d1d2_dot_(ipip, mol, [aow[0], aow[1], aow[2]], [ao[XX], ao[XY], ao[XZ]], mask, ao_loc, False) _d1d2_dot_(ipip, mol, [aow[1], aow[3], aow[4]], [ao[YX], ao[YY], ao[YZ]], mask, ao_loc, False) _d1d2_dot_(ipip, mol, [aow[2], aow[4], aow[5]], [ao[ZX], ao[ZY], ao[ZZ]], mask, ao_loc, False) - dm0_mask = dm0_sorted[numpy.ix_(mask, mask)] + dm0_mask = dm0_sorted[mask[:,None], mask] ao_dm0 = [numint._dot_ao_dm(mol, ao[i], dm0, mask, shls_slice, ao_loc) for i in range(4)] ao_dm_mask = contract('nig,ij->njg', ao_mask[:4], dm0_mask) wf = weight * fxc @@ -483,7 +480,7 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): vmat_dm[ia][:,:,mask] += vmat_dm_tmp t1 = log.timer_debug2('integration', *t1) for ia in range(_sorted_mol.natm): - vmat_dm[ia] = vmat_dm[ia][:,:,opt.rev_ao_idx] + vmat_dm[ia][:,:,opt._ao_idx] = vmat_dm[ia] p0, p1 = aoslices[ia][2:] vmat_dm[ia] += contract('xypq,pq->xyp', ipip[:,:,:,p0:p1], dm0[:,p0:p1]) vmat_dm[ia] += contract('yxqp,pq->xyp', ipip[:,:,p0:p1], dm0[:,p0:p1]) diff --git a/gpu4pyscf/hessian/tests/test_rks_hessian.py b/gpu4pyscf/hessian/tests/test_rks_hessian.py index bdc1b2f6..bbe272d3 100644 --- a/gpu4pyscf/hessian/tests/test_rks_hessian.py +++ b/gpu4pyscf/hessian/tests/test_rks_hessian.py @@ -70,7 +70,9 @@ def _check_vxc(method, xc='LDA'): def _vs_cpu(mf, tol=1e-7): mf.conv_tol_cpscf = 1e-8 ref = mf.Hessian().kernel() - e2_gpu = mf.Hessian().to_gpu().kernel() + hessobj = mf.Hessian().to_gpu() + hessobj.base.cphf_grids = hessobj.base.grids + e2_gpu = 
hessobj.kernel() assert abs(ref - e2_gpu).max() < tol class KnownValues(unittest.TestCase): diff --git a/gpu4pyscf/hessian/tests/test_uks_hessian.py b/gpu4pyscf/hessian/tests/test_uks_hessian.py index c9853579..76beb1e8 100644 --- a/gpu4pyscf/hessian/tests/test_uks_hessian.py +++ b/gpu4pyscf/hessian/tests/test_uks_hessian.py @@ -81,7 +81,9 @@ def _check_vxc(method, xc='LDA'): def _vs_cpu(mf, tol=1e-7): mf.conv_tol_cpscf = 1e-8 ref = mf.Hessian().kernel() - e2_gpu = mf.Hessian().to_gpu().kernel() + hessobj = mf.Hessian().to_gpu() + hessobj.base.cphf_grids = hessobj.base.grids + e2_gpu = hessobj.kernel() assert abs(ref - e2_gpu).max() < tol class KnownValues(unittest.TestCase): diff --git a/gpu4pyscf/hessian/uhf.py b/gpu4pyscf/hessian/uhf.py index a338dc59..76f9ae9f 100644 --- a/gpu4pyscf/hessian/uhf.py +++ b/gpu4pyscf/hessian/uhf.py @@ -324,7 +324,8 @@ def gen_vind(mf, mo_coeff, mo_occ): moccb = mo_coeff[1][:,mo_occ[1]>0] nocca = mocca.shape[1] noccb = moccb.shape[1] - vresp = mf.gen_response(mo_coeff, mo_occ, hermi=1) + grids = getattr(mf, 'cphf_grids', None) + vresp = mf.gen_response(mo_coeff, mo_occ, hermi=1, grids=grids) def fx(mo1): mo1 = cupy.asarray(mo1) diff --git a/gpu4pyscf/hessian/uks.py b/gpu4pyscf/hessian/uks.py index b4d9fc48..00c861b3 100644 --- a/gpu4pyscf/hessian/uks.py +++ b/gpu4pyscf/hessian/uks.py @@ -53,7 +53,7 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, if mf.nlc != '': raise NotImplementedError omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin) - with_k = abs(hyb) > 1e-10 + with_k = ni.libxc.is_hybrid_xc(mf.xc) de2, ej, ek = uhf_hess._partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ, atmlst, max_memory, verbose, with_k=with_k) @@ -112,7 +112,7 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): mf = hessobj.base ni = mf._numint omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin) - with_k = abs(hyb) > 1e-10 + with_k = ni.libxc.is_hybrid_xc(mf.xc) avail_mem -= 8 * (h1moa.size + h1mob.size) slice_size = int(avail_mem*0.5) // (8*3*nao*nao) @@ -183,8 +183,7 @@ def _get_vxc_diag(hessobj, mo_coeff, mo_occ, max_memory): opt = ni.gdftopt _sorted_mol = opt._sorted_mol - coeff = cupy.asarray(opt.coeff) - mo_coeff = contract('nij,pi->npj', mo_coeff, coeff) + mo_coeff = opt.sort_orbitals(mo_coeff, axis=[1]) nao = mo_coeff.shape[1] # TODO: check mol in opt? 
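Both Hessian tests above pin cphf_grids to the SCF grids before comparing with the CPU reference, because the GPU code now solves the CPSCF equations on a coarser SG-1-style (50,194) grid by default (see the rks.py hunk earlier in this diff) while the CPU implementation uses the SCF grids. A hedged sketch of the same override in user code (geometry and basis are placeholders):

import pyscf
from gpu4pyscf.dft import rks

mol = pyscf.M(atom='O 0 0 0; H 0 0.757 0.587; H 0 -0.757 0.587',
              basis='def2-svp')
mf = rks.RKS(mol, xc='b3lyp')
mf.kernel()

# Use the (finer) SCF grids for the CPHF step instead of the SG-1 default
mf.cphf_grids = mf.grids
h_dft = mf.Hessian().kernel()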
vmata = cupy.zeros((6,nao,nao)) @@ -304,10 +303,10 @@ def contract_(ao, aoidx, wv, mask): vmatb = vmatb[[0,1,2, 1,3,4, 2,4,5]] - vmata = contract('npq,qj->npj', vmata, coeff) - vmata = contract('pi,npj->nij', coeff, vmata).reshape(3,3,nao_sph,nao_sph) - vmatb = contract('npq,qj->npj', vmatb, coeff) - vmatb = contract('pi,npj->nij', coeff, vmatb).reshape(3,3,nao_sph,nao_sph) + vmata = opt.unsort_orbitals(vmata, axis=[1,2]) + vmata = vmata.reshape(3,3,nao_sph,nao_sph) + vmatb = opt.unsort_orbitals(vmatb, axis=[1,2]) + vmatb = vmatb.reshape(3,3,nao_sph,nao_sph) return vmata, vmatb def _make_dR_rho1(ao, ao_dm0, atm_id, aoslices, xctype): @@ -400,8 +399,8 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): coeff = cupy.asarray(opt.coeff) dm0a, dm0b = mf.make_rdm1(mo_coeff, mo_occ) - dm0a_sorted = take_last2d(dm0a, opt.ao_idx) - dm0b_sorted = take_last2d(dm0b, opt.ao_idx) + dm0a_sorted = opt.sort_orbitals(dm0a, axis=[0,1]) + dm0b_sorted = opt.sort_orbitals(dm0b, axis=[0,1]) vmata_dm = cupy.zeros((mol.natm,3,3,nao)) vmatb_dm = cupy.zeros((mol.natm,3,3,nao)) ipipa = cupy.zeros((3,3,nao,nao)) @@ -423,8 +422,8 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): _d1d2_dot_(ipipa, mol, aowa, ao[1:4], mask, ao_loc, False) aowb = [numint._scale_ao(ao[i], wv[1]) for i in range(1, 4)] _d1d2_dot_(ipipb, mol, aowb, ao[1:4], mask, ao_loc, False) - dm0a_mask = dm0a_sorted[numpy.ix_(mask, mask)] - dm0b_mask = dm0b_sorted[numpy.ix_(mask, mask)] + dm0a_mask = dm0a_sorted[mask[:,None], mask] + dm0b_mask = dm0b_sorted[mask[:,None], mask] ao_dma_mask = contract('nig,ij->njg', ao_mask[:4], dm0a_mask) ao_dmb_mask = contract('nig,ij->njg', ao_mask[:4], dm0b_mask) @@ -451,8 +450,8 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): t1 = log.timer_debug2('integration', *t1) for ia in range(_sorted_mol.natm): p0, p1 = aoslices[ia][2:] - vmata_dm[ia] = vmata_dm[ia][:,:,opt.rev_ao_idx] - vmatb_dm[ia] = vmatb_dm[ia][:,:,opt.rev_ao_idx] + vmata_dm[ia][:,:,opt._ao_idx] = vmata_dm[ia] + vmatb_dm[ia][:,:,opt._ao_idx] = vmatb_dm[ia] vmata_dm[ia] += contract('xypq,pq->xyp', ipipa[:,:,:,p0:p1], dm0a[:,p0:p1]) vmatb_dm[ia] += contract('xypq,pq->xyp', ipipb[:,:,:,p0:p1], dm0b[:,p0:p1]) elif xctype == 'GGA': @@ -476,8 +475,8 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): ao_dm0a = [numint._dot_ao_dm(mol, ao[i], dm0a, mask, shls_slice, ao_loc) for i in range(4)] ao_dm0b = [numint._dot_ao_dm(mol, ao[i], dm0b, mask, shls_slice, ao_loc) for i in range(4)] wf = weight * fxc - dm0a_mask = dm0a_sorted[numpy.ix_(mask, mask)] - dm0b_mask = dm0b_sorted[numpy.ix_(mask, mask)] + dm0a_mask = dm0a_sorted[mask[:,None], mask] + dm0b_mask = dm0b_sorted[mask[:,None], mask] ao_dma_mask = contract('nig,ij->njg', ao_mask[:4], dm0a_mask) ao_dmb_mask = contract('nig,ij->njg', ao_mask[:4], dm0b_mask) vmata_dm_tmp = cupy.empty([3,3,nao_non0]) @@ -507,8 +506,8 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): ao_dm0a = ao_dm0b = aow = None t1 = log.timer_debug2('integration', *t1) for ia in range(_sorted_mol.natm): - vmata_dm[ia] = vmata_dm[ia][:,:,opt.rev_ao_idx] - vmatb_dm[ia] = vmatb_dm[ia][:,:,opt.rev_ao_idx] + vmata_dm[ia][:,:,opt._ao_idx] = vmata_dm[ia] + vmatb_dm[ia][:,:,opt._ao_idx] = vmatb_dm[ia] p0, p1 = aoslices[ia][2:] vmata_dm[ia] += contract('xypq,pq->xyp', ipipa[:,:,:,p0:p1], dm0a[:,p0:p1]) vmata_dm[ia] += contract('yxqp,pq->xyp', ipipa[:,:,p0:p1], dm0a[:,p0:p1]) @@ -546,8 +545,8 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): _d1d2_dot_(ipipb, mol, [aow[1], aow[3], 
aow[4]], [ao[YX], ao[YY], ao[YZ]], mask, ao_loc, False) _d1d2_dot_(ipipb, mol, [aow[2], aow[4], aow[5]], [ao[ZX], ao[ZY], ao[ZZ]], mask, ao_loc, False) - dm0a_mask = dm0a_sorted[numpy.ix_(mask, mask)] - dm0b_mask = dm0b_sorted[numpy.ix_(mask, mask)] + dm0a_mask = dm0a_sorted[mask[:,None], mask] + dm0b_mask = dm0b_sorted[mask[:,None], mask] ao_dm0a = [numint._dot_ao_dm(mol, ao[i], dm0a, mask, shls_slice, ao_loc) for i in range(4)] ao_dm0b = [numint._dot_ao_dm(mol, ao[i], dm0b, mask, shls_slice, ao_loc) for i in range(4)] ao_dma_mask = contract('nig,ij->njg', ao_mask[:4], dm0a_mask) @@ -622,8 +621,8 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): vmatb_dm[ia][:,:,mask] += vmatb_dm_tmp t1 = log.timer_debug2('integration', *t1) for ia in range(_sorted_mol.natm): - vmata_dm[ia] = vmata_dm[ia][:,:,opt.rev_ao_idx] - vmatb_dm[ia] = vmatb_dm[ia][:,:,opt.rev_ao_idx] + vmata_dm[ia][:,:,opt._ao_idx] = vmata_dm[ia] + vmatb_dm[ia][:,:,opt._ao_idx] = vmatb_dm[ia] p0, p1 = aoslices[ia][2:] vmata_dm[ia] += contract('xypq,pq->xyp', ipipa[:,:,:,p0:p1], dm0a[:,p0:p1]) vmata_dm[ia] += contract('yxqp,pq->xyp', ipipa[:,:,p0:p1], dm0a[:,p0:p1]) diff --git a/gpu4pyscf/lib/CMakeLists.txt b/gpu4pyscf/lib/CMakeLists.txt index e5115f5b..4390407e 100644 --- a/gpu4pyscf/lib/CMakeLists.txt +++ b/gpu4pyscf/lib/CMakeLists.txt @@ -148,6 +148,7 @@ if(BUILD_SOLVENT) endif() add_subdirectory(gvhf-rys) +add_subdirectory(gvhf-md) option(BUILD_LIBXC "Using libxc for DFT" ON) if(BUILD_LIBXC) diff --git a/gpu4pyscf/lib/cupy_helper.py b/gpu4pyscf/lib/cupy_helper.py index a3b3b341..2edfd17e 100644 --- a/gpu4pyscf/lib/cupy_helper.py +++ b/gpu4pyscf/lib/cupy_helper.py @@ -226,9 +226,12 @@ def dist_matrix(x, y, out=None): raise RuntimeError('failed in calculating distance matrix') return out -def block_c2s_diag(ncart, nsph, angular, counts): +def block_c2s_diag(angular, counts): ''' - constract a cartesian to spherical transformation of n shells + Diagonal blocked cartesian to spherical transformation + Args: + angular (list): angular momentum type, e.g. 
[0,1,2,3] + counts (list): count of each angular momentum ''' if _data['c2s'] is None: c2s_data = cupy.concatenate([cupy.asarray(x.ravel()) for x in c2s_l]) @@ -246,7 +249,8 @@ def block_c2s_diag(ncart, nsph, angular, counts): offsets += [c2s_offset[l]] * count rows = cupy.hstack(rows) cols = cupy.hstack(cols) - + + ncart, nsph = int(rows[-1]), int(cols[-1]) cart2sph = cupy.zeros([ncart, nsph]) offsets = cupy.asarray(offsets, dtype='int32') @@ -358,11 +362,12 @@ def transpose_sum(a, stream=None): return a + a.transpose(0,2,1) ''' assert a.flags.c_contiguous - n = a.shape[-1] + out = a if a.ndim == 2: - a = a.reshape([-1,n,n]) + a = a[None] assert a.ndim == 3 - count = a.shape[0] + count, m, n = a.shape + assert m == n stream = cupy.cuda.get_current_stream() err = libcupy_helper.transpose_sum( ctypes.cast(stream.ptr, ctypes.c_void_p), @@ -372,7 +377,7 @@ def transpose_sum(a, stream=None): ) if err != 0: raise RuntimeError('failed in transpose_sum kernel') - return a + return out # for i > j of 2d mat, mat[j,i] = mat[i,j] def hermi_triu(mat, hermi=1, inplace=True): @@ -911,10 +916,11 @@ def sandwich_dot(a, c, out=None): a = a[None] counts = a.shape[0] m = c.shape[1] - out = cupy.empty((counts, m, m)) + dtype = np.result_type(a, c) + out = cupy.empty((counts, m, m), dtype=dtype) tmp = None for i in range(counts): - tmp = cupy.dot(c.T, a[i], out=tmp) + tmp = cupy.dot(c.conj().T, a[i], out=tmp) cupy.dot(tmp, c, out=out[i]) if a_ndim == 2: out = out[0] diff --git a/gpu4pyscf/lib/cusolver.py b/gpu4pyscf/lib/cusolver.py index 27fcb0b0..454567bd 100644 --- a/gpu4pyscf/lib/cusolver.py +++ b/gpu4pyscf/lib/cusolver.py @@ -66,22 +66,65 @@ ctypes.c_void_p # *devInfo ] +# https://docs.nvidia.com/cuda/cusolver/index.html#cusolverdn-t-sygvd +libcusolver.cusolverDnZhegvd_bufferSize.argtypes = [ + ctypes.c_void_p, # handle + ctypes.c_int, # itype + ctypes.c_int, # jobz + ctypes.c_int, # uplo + ctypes.c_int, # n + ctypes.c_void_p, # *A + ctypes.c_int, # lda + ctypes.c_void_p, # *B + ctypes.c_int, # ldb + ctypes.c_void_p, # *w + ctypes.c_void_p # *lwork +] + +libcusolver.cusolverDnZhegvd.argtypes = [ + ctypes.c_void_p, # handle + ctypes.c_int, # itype + ctypes.c_int, # jobz + ctypes.c_int, # uplo + ctypes.c_int, # n + ctypes.c_void_p, # *A + ctypes.c_int, # lda + ctypes.c_void_p, # *B + ctypes.c_int, # ldb + ctypes.c_void_p, # *w + ctypes.c_void_p, # *work + ctypes.c_int, # lwork + ctypes.c_void_p # *devInfo +] + def eigh(h, s): ''' solve generalized eigenvalue problem ''' + assert h.dtype == s.dtype + assert h.dtype in (np.float64, np.complex128) n = h.shape[0] w = cupy.zeros(n) - A = h.copy() - B = s.copy() + if h.dtype == np.complex128 and h.flags.c_contiguous: + # zhegvd requires the matrices in F-order. 
For hermitian matrices, + # .T.copy() is equivalent to .conj() + A = h.conj() + B = s.conj() + else: + A = h.copy() + B = s.copy() _handle = device.get_cusolver_handle() # TODO: reuse workspace - if n in _buffersize: - lwork = _buffersize[n] + if (h.dtype, n) in _buffersize: + lwork = _buffersize[h.dtype, n] else: - lwork = ctypes.c_int() - status = libcusolver.cusolverDnDsygvd_bufferSize( + lwork = ctypes.c_int(0) + if h.dtype == np.float64: + fn = libcusolver.cusolverDnDsygvd_bufferSize + else: + fn = libcusolver.cusolverDnZhegvd_bufferSize + status = fn( _handle, CUSOLVER_EIG_TYPE_1, CUSOLVER_EIG_MODE_VECTOR, @@ -98,10 +141,14 @@ def eigh(h, s): if status != 0: raise RuntimeError("failed in buffer size") - - work = cupy.empty(lwork) + + if h.dtype == np.float64: + fn = libcusolver.cusolverDnDsygvd + else: + fn = libcusolver.cusolverDnZhegvd + work = cupy.empty(lwork, dtype=h.dtype) devInfo = cupy.empty(1, dtype=np.int32) - status = libcusolver.cusolverDnDsygvd( + status = fn( _handle, CUSOLVER_EIG_TYPE_1, CUSOLVER_EIG_MODE_VECTOR, @@ -116,7 +163,7 @@ def eigh(h, s): lwork, devInfo.data.ptr ) - + if status != 0: raise RuntimeError("failed in eigh kernel") return w, A.T @@ -126,10 +173,14 @@ def cholesky(A): assert A.flags['C_CONTIGUOUS'] x = A.copy() handle = device.get_cusolver_handle() - potrf = cusolver.dpotrf - potrf_bufferSize = cusolver.dpotrf_bufferSize + if A.dtype == np.float64: + potrf = cusolver.dpotrf + potrf_bufferSize = cusolver.dpotrf_bufferSize + else: + potrf = cusolver.zpotrf + potrf_bufferSize = cusolver.zpotrf_bufferSize buffersize = potrf_bufferSize(handle, cublas.CUBLAS_FILL_MODE_UPPER, n, x.data.ptr, n) - workspace = cupy.empty(buffersize) + workspace = cupy.empty(buffersize, dtype=A.dtype) dev_info = cupy.empty(1, dtype=np.int32) potrf(handle, cublas.CUBLAS_FILL_MODE_UPPER, n, x.data.ptr, n, workspace.data.ptr, buffersize, dev_info.data.ptr) @@ -137,4 +188,4 @@ def cholesky(A): if dev_info[0] != 0: raise RuntimeError('failed to perform Cholesky Decomposition') cupy.linalg._util._tril(x,k=0) - return x \ No newline at end of file + return x diff --git a/gpu4pyscf/lib/cutensor.py b/gpu4pyscf/lib/cutensor.py index 07d35547..573e1777 100644 --- a/gpu4pyscf/lib/cutensor.py +++ b/gpu4pyscf/lib/cutensor.py @@ -42,20 +42,20 @@ def _auto_create_mode(array, mode): 'ndim mismatch: {} != {}'.format(array.ndim, mode.ndim)) return mode -def _create_tensor_descriptor(a): - handle = cutensor._get_handle() - key = (handle.ptr, a.dtype, tuple(a.shape), tuple(a.strides)) - # hard coded - alignment_req = 8 - if key not in _tensor_descriptors: - num_modes = a.ndim - extent = np.array(a.shape, dtype=np.int64) - stride = np.array(a.strides, dtype=np.int64) // a.itemsize - cutensor_dtype = cutensor._get_cutensor_dtype(a.dtype) - _tensor_descriptors[key] = cutensor.TensorDescriptor( - handle.ptr, num_modes, extent.ctypes.data, stride.ctypes.data, - cutensor_dtype, alignment_req=alignment_req) - return _tensor_descriptors[key] +#def _create_tensor_descriptor(a): +# handle = cutensor._get_handle() +# key = (handle.ptr, a.dtype, tuple(a.shape), tuple(a.strides)) +# # hard coded +# alignment_req = 8 +# if key not in _tensor_descriptors: +# num_modes = a.ndim +# extent = np.array(a.shape, dtype=np.int64) +# stride = np.array(a.strides, dtype=np.int64) // a.itemsize +# cutensor_dtype = cutensor._get_cutensor_dtype(a.dtype) +# _tensor_descriptors[key] = cutensor.TensorDescriptor( +# handle.ptr, num_modes, extent.ctypes.data, stride.ctypes.data, +# cutensor_dtype, alignment_req=alignment_req) +# 
return _tensor_descriptors[key] def contraction( pattern, a, b, alpha, beta, @@ -80,14 +80,14 @@ def contraction( mode_b = list(str_b) mode_c = list(str_c) - if(out is not None): - c = out - else: - c = cupy.empty([shape[k] for k in str_c], order='C') + if out is None: + dtype = np.result_type(a, b, alpha) + out = cupy.empty([shape[k] for k in str_c], order='C', dtype=dtype) + c = out - desc_a = _create_tensor_descriptor(a) - desc_b = _create_tensor_descriptor(b) - desc_c = _create_tensor_descriptor(c) + desc_a = cutensor.create_tensor_descriptor(a) + desc_b = cutensor.create_tensor_descriptor(b) + desc_c = cutensor.create_tensor_descriptor(c) mode_a = _auto_create_mode(a, mode_a) mode_b = _auto_create_mode(b, mode_b) diff --git a/gpu4pyscf/lib/gdft/contract_rho.cu b/gpu4pyscf/lib/gdft/contract_rho.cu index 5c6dbd1c..1f6a6939 100644 --- a/gpu4pyscf/lib/gdft/contract_rho.cu +++ b/gpu4pyscf/lib/gdft/contract_rho.cu @@ -56,6 +56,7 @@ void GDFTcontract_rho_kernel(double *rho, double *bra, double *ket, int ngrids, } } +// half of the GGA rho __global__ void GDFTcontract_rho4_kernel(double *rho, double *bra, double *ket, int ngrids, int nao, int count) { @@ -109,7 +110,7 @@ void GDFTcontract_rho_gga_kernel(double *rho, double *bra, double *ket, int ngri double v[4] = {0.0, 0.0, 0.0, 0.0}; if (active){ for (int ao_id = threadIdx.y; ao_id < nao; ao_id += BLKSIZEY) { - int ket_idx = grid_id + ao_id * Ngrids; + size_t ket_idx = grid_id + ao_id * Ngrids; double bra_tmp = bra[ket_idx]; double ket_tmp = ket[ket_idx]; @@ -143,7 +144,7 @@ void GDFTcontract_rho_gga_kernel(double *rho, double *bra, double *ket, int ngri if (blockDim.y >= 2 && iy < 1) buf[ixy] += buf[ixy + BLKSIZEX * 1]; __syncthreads(); if (iy == 0 && active) { - rho[grid_id + ngrids * i] = 2.0 * buf[ix]; + rho[grid_id + ngrids * i] = buf[ix]; } } } @@ -161,7 +162,7 @@ void GDFTcontract_rho_mgga_kernel(double *rho, double *bra, double *ket, int ngr double v[5] = {0.0, 0.0, 0.0, 0.0, 0.0}; if (active){ for (int ao_id = threadIdx.y; ao_id < nao; ao_id += BLKSIZEY) { - int ket_idx = grid_id + ao_id * Ngrids; + size_t ket_idx = grid_id + ao_id * Ngrids; double bra_tmp0 = bra[ket_idx]; double ket_tmp0 = ket[ket_idx]; @@ -207,7 +208,7 @@ void GDFTcontract_rho_mgga_kernel(double *rho, double *bra, double *ket, int ngr if (blockDim.y >= 2 && iy < 1) buf[ixy] += buf[ixy + BLKSIZEX * 1]; __syncthreads(); if (iy == 0 && active) { - rho[grid_id + ngrids * i] = 2.0 * buf[ix]; + rho[grid_id + ngrids * i] = buf[ix]; } } } @@ -358,4 +359,4 @@ int GDFTscale_ao(cudaStream_t stream, double *out, double *ket, double *wv, return 0; } -} \ No newline at end of file +} diff --git a/gpu4pyscf/lib/gdft/libxc.cu b/gpu4pyscf/lib/gdft/libxc.cu index 639eecc6..3eeb1b76 100644 --- a/gpu4pyscf/lib/gdft/libxc.cu +++ b/gpu4pyscf/lib/gdft/libxc.cu @@ -73,37 +73,121 @@ void _memset_lda(xc_lda_out_params *out, int order, int np, const xc_dimensions if(order >= 0) cudaMemset(out->zk, 0, sizeof(double)*np*dim->zk); if(order >= 1) cudaMemset(out->vrho, 0, sizeof(double)*np*dim->vrho); if(order >= 2) cudaMemset(out->v2rho2, 0, sizeof(double)*np*dim->v2rho2); + if(order >= 3) cudaMemset(out->v3rho3, 0, sizeof(double)*np*dim->v3rho3); + if(order >= 4) cudaMemset(out->v4rho4, 0, sizeof(double)*np*dim->v4rho4); } __host__ void _memset_gga(xc_gga_out_params *out, int order, int np, const xc_dimensions *dim){ if(order >= 0) cudaMemset(out->zk, 0, sizeof(double)*np*dim->zk); - if(order >= 1) cudaMemset(out->vrho, 0, sizeof(double)*np*dim->vrho); - if(order >= 1) cudaMemset(out->vsigma, 
0, sizeof(double)*np*dim->vsigma); // (sigma, lapl, tau) - if(order >= 2) cudaMemset(out->v2rho2, 0, sizeof(double)*np*dim->v2rho2); - if(order >= 2) cudaMemset(out->v2rhosigma, 0, sizeof(double)*np*dim->v2rhosigma); - if(order >= 2) cudaMemset(out->v2sigma2, 0, sizeof(double)*np*dim->v2sigma2); + if(order >= 1) { + cudaMemset(out->vrho, 0, sizeof(double)*np*dim->vrho); + cudaMemset(out->vsigma, 0, sizeof(double)*np*dim->vsigma); // (sigma, lapl, tau) + } + if(order >= 2) { + cudaMemset(out->v2rho2, 0, sizeof(double)*np*dim->v2rho2); + cudaMemset(out->v2rhosigma, 0, sizeof(double)*np*dim->v2rhosigma); + cudaMemset(out->v2sigma2, 0, sizeof(double)*np*dim->v2sigma2); + } + if(order >= 3) { + cudaMemset(out->v3rho3, 0, sizeof(double)*np*dim->v3rho3); + cudaMemset(out->v3rho2sigma, 0, sizeof(double)*np*dim->v3rho2sigma); + cudaMemset(out->v3rhosigma2, 0, sizeof(double)*np*dim->v3rhosigma2); + cudaMemset(out->v3sigma3, 0, sizeof(double)*np*dim->v3sigma3); + } + if(order >= 4) { + cudaMemset(out->v4rho4, 0, sizeof(double)*np*dim->v4rho4); + cudaMemset(out->v4rho3sigma, 0, sizeof(double)*np*dim->v4rho3sigma); + cudaMemset(out->v4rho2sigma2, 0, sizeof(double)*np*dim->v4rho2sigma2); + cudaMemset(out->v4rhosigma3, 0, sizeof(double)*np*dim->v4rhosigma3); + cudaMemset(out->v4sigma4, 0, sizeof(double)*np*dim->v4sigma4); + } } __host__ void _memset_mgga(xc_mgga_out_params *out, int order, int np, const xc_dimensions *dim){ if(order >= 0) cudaMemset(out->zk, 0, sizeof(double)*np*dim->zk); - if(order >= 1) cudaMemset(out->vrho, 0, sizeof(double)*np*dim->vrho); - if(order >= 1) cudaMemset(out->vsigma, 0, sizeof(double)*np*dim->vsigma); - if(order >= 1 && out->vlapl != NULL) cudaMemset(out->vlapl, 0, sizeof(double)*np*dim->vlapl); // (sigma, lapl, tau) - if(order >= 1) cudaMemset(out->vtau, 0, sizeof(double)*np*dim->vtau); + if(order >= 1) { + cudaMemset(out->vrho, 0, sizeof(double)*np*dim->vrho); + cudaMemset(out->vsigma, 0, sizeof(double)*np*dim->vsigma); + cudaMemset(out->vtau, 0, sizeof(double)*np*dim->vtau); + if(out->vlapl != NULL) cudaMemset(out->vlapl, 0, sizeof(double)*np*dim->vlapl); // (sigma, lapl, tau) + } - if(order >= 2) cudaMemset(out->v2rho2, 0, sizeof(double)*np*dim->v2rho2); - if(order >= 2) cudaMemset(out->v2rhosigma, 0, sizeof(double)*np*dim->v2rhosigma); - if(order >= 2 && out->v2rholapl != NULL) cudaMemset(out->v2rholapl, 0, sizeof(double)*np*dim->v2rholapl); - if(order >= 2) cudaMemset(out->v2rhotau, 0, sizeof(double)*np*dim->v2rhotau); - if(order >= 2) cudaMemset(out->v2sigma2, 0, sizeof(double)*np*dim->v2sigma2); - if(order >= 2 && out->v2sigmalapl != NULL) cudaMemset(out->v2sigmalapl, 0, sizeof(double)*np*dim->v2sigmalapl); - if(order >= 2) cudaMemset(out->v2sigmatau, 0, sizeof(double)*np*dim->v2sigmatau); - if(order >= 2 && out->v2lapl2 != NULL) cudaMemset(out->v2lapl2, 0, sizeof(double)*np*dim->v2lapl2); - if(order >= 2 && out->v2lapltau != NULL) cudaMemset(out->v2lapltau, 0, sizeof(double)*np*dim->v2lapltau); - if(order >= 2) cudaMemset(out->v2tau2, 0, sizeof(double)*np*dim->v2tau2); + if(order >= 2) { + cudaMemset(out->v2rho2, 0, sizeof(double)*np*dim->v2rho2); + cudaMemset(out->v2rhosigma, 0, sizeof(double)*np*dim->v2rhosigma); + cudaMemset(out->v2rhotau, 0, sizeof(double)*np*dim->v2rhotau); + cudaMemset(out->v2sigma2, 0, sizeof(double)*np*dim->v2sigma2); + cudaMemset(out->v2sigmatau, 0, sizeof(double)*np*dim->v2sigmatau); + cudaMemset(out->v2tau2, 0, sizeof(double)*np*dim->v2tau2); + if(out->v2rholapl != NULL) cudaMemset(out->v2rholapl, 0, sizeof(double)*np*dim->v2rholapl); 
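+ // Note: only the lapl-coupled derivative buffers carry NULL guards in
+ // this function; they appear to be allocated only when Laplacian
+ // derivatives are actually requested, while the rho/sigma/tau buffers
+ // are assumed always present.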
+ if(out->v2sigmalapl != NULL) cudaMemset(out->v2sigmalapl, 0, sizeof(double)*np*dim->v2sigmalapl); + if(out->v2lapl2 != NULL) cudaMemset(out->v2lapl2, 0, sizeof(double)*np*dim->v2lapl2); + if(out->v2lapltau != NULL) cudaMemset(out->v2lapltau, 0, sizeof(double)*np*dim->v2lapltau); + } + + if (order >= 3) { + cudaMemset(out->v3rho3 , 0, sizeof(double)*np*dim->v3rho3); + cudaMemset(out->v3rho2sigma , 0, sizeof(double)*np*dim->v3rho2sigma); + cudaMemset(out->v3rho2tau , 0, sizeof(double)*np*dim->v3rho2tau); + cudaMemset(out->v3rhosigma2 , 0, sizeof(double)*np*dim->v3rhosigma2); + cudaMemset(out->v3rhosigmatau , 0, sizeof(double)*np*dim->v3rhosigmatau); + cudaMemset(out->v3rhotau2 , 0, sizeof(double)*np*dim->v3rhotau2); + cudaMemset(out->v3sigma3 , 0, sizeof(double)*np*dim->v3sigma3); + cudaMemset(out->v3sigma2tau , 0, sizeof(double)*np*dim->v3sigma2tau); + cudaMemset(out->v3sigmatau2 , 0, sizeof(double)*np*dim->v3sigmatau2); + cudaMemset(out->v3tau3 , 0, sizeof(double)*np*dim->v3tau3); + if (out->v3rho2lapl != NULL) cudaMemset(out->v3rho2lapl , 0, sizeof(double)*np*dim->v3rho2lapl); + if (out->v3rhosigmalapl!= NULL) cudaMemset(out->v3rhosigmalapl, 0, sizeof(double)*np*dim->v3rhosigmalapl); + if (out->v3rholapl2 != NULL) cudaMemset(out->v3rholapl2 , 0, sizeof(double)*np*dim->v3rholapl2); + if (out->v3rholapltau != NULL) cudaMemset(out->v3rholapltau , 0, sizeof(double)*np*dim->v3rholapltau); + if (out->v3sigma2lapl != NULL) cudaMemset(out->v3sigma2lapl , 0, sizeof(double)*np*dim->v3sigma2lapl); + if (out->v3sigmalapl2 != NULL) cudaMemset(out->v3sigmalapl2 , 0, sizeof(double)*np*dim->v3sigmalapl2); + if (out->v3sigmalapltau!= NULL) cudaMemset(out->v3sigmalapltau, 0, sizeof(double)*np*dim->v3sigmalapltau); + if (out->v3lapl3 != NULL) cudaMemset(out->v3lapl3 , 0, sizeof(double)*np*dim->v3lapl3); + if (out->v3lapl2tau != NULL) cudaMemset(out->v3lapl2tau , 0, sizeof(double)*np*dim->v3lapl2tau); + if (out->v3lapltau2 != NULL) cudaMemset(out->v3lapltau2 , 0, sizeof(double)*np*dim->v3lapltau2); + } + + if (order >= 4) { + cudaMemset(out->v4rho4 , 0, sizeof(double)*np*dim->v4rho4); + cudaMemset(out->v4rho3sigma , 0, sizeof(double)*np*dim->v4rho3sigma); + cudaMemset(out->v4rho3tau , 0, sizeof(double)*np*dim->v4rho3tau); + cudaMemset(out->v4rho2sigma2 , 0, sizeof(double)*np*dim->v4rho2sigma2); + cudaMemset(out->v4rho2sigmatau , 0, sizeof(double)*np*dim->v4rho2sigmatau); + cudaMemset(out->v4rho2tau2 , 0, sizeof(double)*np*dim->v4rho2tau2); + cudaMemset(out->v4rhosigma3 , 0, sizeof(double)*np*dim->v4rhosigma3); + cudaMemset(out->v4rhosigma2tau , 0, sizeof(double)*np*dim->v4rhosigma2tau); + cudaMemset(out->v4rhosigmatau2 , 0, sizeof(double)*np*dim->v4rhosigmatau2); + cudaMemset(out->v4rhotau3 , 0, sizeof(double)*np*dim->v4rhotau3); + cudaMemset(out->v4sigma4 , 0, sizeof(double)*np*dim->v4sigma4); + cudaMemset(out->v4sigma3tau , 0, sizeof(double)*np*dim->v4sigma3tau); + cudaMemset(out->v4sigma2tau2 , 0, sizeof(double)*np*dim->v4sigma2tau2); + cudaMemset(out->v4sigmatau3 , 0, sizeof(double)*np*dim->v4sigmatau3); + cudaMemset(out->v4tau4 , 0, sizeof(double)*np*dim->v4tau4); + if (out->v4rho3lapl != NULL) cudaMemset(out->v4rho3lapl , 0, sizeof(double)*np*dim->v4rho3lapl); + if (out->v4rho2sigmalapl != NULL) cudaMemset(out->v4rho2sigmalapl , 0, sizeof(double)*np*dim->v4rho2sigmalapl); + if (out->v4rho2lapl2 != NULL) cudaMemset(out->v4rho2lapl2 , 0, sizeof(double)*np*dim->v4rho2lapl2); + if (out->v4rho2lapltau != NULL) cudaMemset(out->v4rho2lapltau , 0, sizeof(double)*np*dim->v4rho2lapltau); + if 
(out->v4rhosigma2lapl != NULL) cudaMemset(out->v4rhosigma2lapl , 0, sizeof(double)*np*dim->v4rhosigma2lapl); + if (out->v4rhosigmalapl2 != NULL) cudaMemset(out->v4rhosigmalapl2 , 0, sizeof(double)*np*dim->v4rhosigmalapl2); + if (out->v4rhosigmalapltau!= NULL) cudaMemset(out->v4rhosigmalapltau, 0, sizeof(double)*np*dim->v4rhosigmalapltau); + if (out->v4rholapl3 != NULL) cudaMemset(out->v4rholapl3 , 0, sizeof(double)*np*dim->v4rholapl3); + if (out->v4rholapl2tau != NULL) cudaMemset(out->v4rholapl2tau , 0, sizeof(double)*np*dim->v4rholapl2tau); + if (out->v4rholapltau2 != NULL) cudaMemset(out->v4rholapltau2 , 0, sizeof(double)*np*dim->v4rholapltau2); + if (out->v4sigma3lapl != NULL) cudaMemset(out->v4sigma3lapl , 0, sizeof(double)*np*dim->v4sigma3lapl); + if (out->v4sigma2lapl2 != NULL) cudaMemset(out->v4sigma2lapl2 , 0, sizeof(double)*np*dim->v4sigma2lapl2); + if (out->v4sigma2lapltau != NULL) cudaMemset(out->v4sigma2lapltau , 0, sizeof(double)*np*dim->v4sigma2lapltau); + if (out->v4sigmalapl3 != NULL) cudaMemset(out->v4sigmalapl3 , 0, sizeof(double)*np*dim->v4sigmalapl3); + if (out->v4sigmalapl2tau != NULL) cudaMemset(out->v4sigmalapl2tau , 0, sizeof(double)*np*dim->v4sigmalapl2tau); + if (out->v4sigmalapltau2 != NULL) cudaMemset(out->v4sigmalapltau2 , 0, sizeof(double)*np*dim->v4sigmalapltau2); + if (out->v4lapl4 != NULL) cudaMemset(out->v4lapl4 , 0, sizeof(double)*np*dim->v4lapl4); + if (out->v4lapl3tau != NULL) cudaMemset(out->v4lapl3tau , 0, sizeof(double)*np*dim->v4lapl3tau); + if (out->v4lapl2tau2 != NULL) cudaMemset(out->v4lapl2tau2 , 0, sizeof(double)*np*dim->v4lapl2tau2); + if (out->v4lapltau3 != NULL) cudaMemset(out->v4lapltau3 , 0, sizeof(double)*np*dim->v4lapltau3); + } } __host__ diff --git a/gpu4pyscf/lib/gvhf-md/CMakeLists.txt b/gpu4pyscf/lib/gvhf-md/CMakeLists.txt new file mode 100644 index 00000000..c241d1c2 --- /dev/null +++ b/gpu4pyscf/lib/gvhf-md/CMakeLists.txt @@ -0,0 +1,17 @@ +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --ptxas-options=-v")# -maxrregcount=128") + +add_library(gvhf_md SHARED + md_contract_j.cu md_j_driver.cu md_pairdata.c unrolled_md_j.cu +) + +#option(BUILD_SHARED_LIBS "build shared libraries" 1) +#option(ENABLE_STATIC "Enforce static library build" 0) +#if(ENABLE_STATIC) +# set(BUILD_SHARED_LIBS 0) +#endif() + +set_target_properties(gvhf_md PROPERTIES + LIBRARY_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR} + CUDA_SEPARABLE_COMPILATION ON) + +target_link_libraries(gvhf_md OpenMP::OpenMP_C) diff --git a/gpu4pyscf/lib/gvhf-md/md_contract_j.cu b/gpu4pyscf/lib/gvhf-md/md_contract_j.cu new file mode 100644 index 00000000..2d1b3a12 --- /dev/null +++ b/gpu4pyscf/lib/gvhf-md/md_contract_j.cu @@ -0,0 +1,467 @@ +#include +#include +#include +#include + +#include "gvhf-rys/vhf.cuh" +#include "gvhf-rys/gamma_inc.cu" + +#define TILEX 2 +#define TILEY 4 + +extern __constant__ uint16_t c_Rt_idx[]; +extern __constant__ uint16_t c_Rt_offsets[]; + +#define ADDR(l, t, u, v) \ + ((l+1)*(l+2)*(l+3)/6 - ((l)-(t)+1)*((l)-(t)+2)*((l)-(t)+3)/6 + \ + ((l)-(t)+1)*((l)-(t)+2)/2 - ((l)-(t)-(u)+1)*((l)-(t)-(u)+2)/2 + (v)) + +__device__ +static void iter_Rt_n(double *out, double *Rt, double rx, double ry, double rz, + int l, int sq_id, int nsq_per_block) +{ + uint16_t *p1 = c_Rt_idx + c_Rt_offsets[l]; + double *pout = out + nsq_per_block; + int k = 0; + for (int v = 0, i = 0; v < l; ++v) { + pout[sq_id+k*nsq_per_block] = rz * Rt[sq_id+i*nsq_per_block] + v * Rt[sq_id+p1[k]*nsq_per_block]; + ++k; ++i; + } + for (int u = 0, i = 0; u < l; ++u) { + for (int v = 0; v < l-u; ++v) { + 
pout[sq_id+k*nsq_per_block] = ry * Rt[sq_id+i*nsq_per_block] + u * Rt[sq_id+p1[k]*nsq_per_block]; + ++k; ++i; + } + } + //int nf3 = l*(l+1)*(l+2)/6; + //Fold3Index *fold3idx = c_i_in_fold3idx + (l-1)*nf3/4;; + //for (int i = 0; i < nf3; ++i) { + // Fold3Index f3i = fold3idx[i]; + // int t = f3i.x; + // pout[sq_id+(k+i)*nsq_per_block] = rx * Rt[sq_id+i*nsq_per_block] + // + t * Rt[sq_id+p1[k+i]*nsq_per_block]; + //} + for (int t = 0, i = 0; t < l; ++t) { + // corresponding to the nested loops + // for (u = 0; u < l-t; ++u) for (v = 0; v < l-t-u; ++v) + for (int uv = 0; uv < (l-t) * (l-t+1) / 2; ++uv) { + pout[sq_id+(k+i)*nsq_per_block] = rx * Rt[sq_id+i*nsq_per_block] + + t * Rt[sq_id+p1[k+i]*nsq_per_block]; + ++i; + } + } +} + +#if CUDA_VERSION >= 12040 +__global__ __maxnreg__(128) +#else +__global__ +#endif +void md_j_kernel(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds) +{ + int *pair_ij_mapping = bounds.tile_ij_mapping; + int *pair_kl_mapping = bounds.tile_kl_mapping; + int threadsx = blockDim.x; + int threadsy = blockDim.y; + int bsizex = threadsx * TILEX; + int bsizey = threadsy * TILEY; + int task_ij0 = blockIdx.x * bsizex; + int task_kl0 = blockIdx.y * bsizey; + int pair_ij0 = pair_ij_mapping[task_ij0]; + int pair_kl0 = pair_kl_mapping[task_kl0]; + float *q_cond = bounds.q_cond; + if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) { + return; + } + + int tx = threadIdx.x; + int ty = threadIdx.y; + int sq_id = tx + threadsx * ty; + int nsq_per_block = threadsx * threadsy; + int gout_id = threadIdx.z; + int gout_stride = blockDim.z; + int t_id = sq_id + nsq_per_block * gout_id; + int threads = nsq_per_block * gout_stride; + int li = bounds.li; + int lj = bounds.lj; + int lk = bounds.lk; + int ll = bounds.ll; + int lij = li + lj; + int lkl = lk + ll; + int order = lij + lkl; + int nf3ijkl = (order+1)*(order+2)*(order+3)/6; + int *bas = envs.bas; + int *dm_pair_loc = envs.ao_loc; + int nbas = envs.nbas; + double *env = envs.env; + double *dm = jk.dm; + double *vj = jk.vj; + int nf3ij = (lij+1)*(lij+2)*(lij+3)/6; + int nf3kl = (lkl+1)*(lkl+2)*(lkl+3)/6; + int ij_fold3idx_cum = lij*nf3ij/4; + int kl_fold3idx_cum = lkl*nf3kl/4; + Fold3Index *ij_fold3idx = c_i_in_fold3idx + ij_fold3idx_cum; + Fold3Index *kl_fold3idx = c_i_in_fold3idx + kl_fold3idx_cum; + + int npairs_ij = bounds.npairs_ij; + int npairs_kl = bounds.npairs_kl; + extern __shared__ double gamma_inc[]; + double *Rp_cache = gamma_inc + (order+1) * nsq_per_block; + double *Rq_cache = Rp_cache + bsizex*4; + double *vj_ij_cache = Rq_cache + bsizey*4; + double *vj_kl_cache = vj_ij_cache + nf3ij * bsizex; + + // zero out all cache; + for (int n = t_id; n < (bsizex*4 + bsizey*4 + nf3ij*bsizex + nf3kl*bsizey); n += threads) { + Rp_cache[n] = 0.; + } + __syncthreads(); + if (t_id < bsizex) { + int task_ij = blockIdx.x * bsizex + t_id; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int ish = pair_ij / nbas; + int jsh = pair_ij % nbas; + double ai = env[bas[ish*BAS_SLOTS+PTR_EXP]]; + double aj = env[bas[jsh*BAS_SLOTS+PTR_EXP]]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double aij = ai + aj; + double xij = (ai * ri[0] + aj * rj[0]) / aij; + double yij = (ai * ri[1] + aj * rj[1]) / aij; + double zij = (ai * ri[2] + aj * rj[2]) / aij; + Rp_cache[t_id+0*bsizex] = xij; + Rp_cache[t_id+1*bsizex] = yij; + Rp_cache[t_id+2*bsizex] = zij; + Rp_cache[t_id+3*bsizex] = aij; + } else { + Rp_cache[t_id+3*bsizex] = 1.; + } + } + if (t_id < bsizey) { 
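+ // Cache the Gaussian-product center Rq = (ak*rk + al*rl) / (ak + al) and
+ // the combined exponent akl for each kl pair of this block; padded slots
+ // get akl = 1 so 1/(aij*akl*sqrt(aij+akl)) stays finite.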
+ int task_kl = blockIdx.y * bsizey + t_id; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int ksh = pair_kl / nbas; + int lsh = pair_kl % nbas; + double ak = env[bas[ksh*BAS_SLOTS+PTR_EXP]]; + double al = env[bas[lsh*BAS_SLOTS+PTR_EXP]]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD]; + double akl = ak + al; + double xkl = (ak * rk[0] + al * rl[0]) / akl; + double ykl = (ak * rk[1] + al * rl[1]) / akl; + double zkl = (ak * rk[2] + al * rl[2]) / akl; + Rq_cache[t_id+0*bsizey] = xkl; + Rq_cache[t_id+1*bsizey] = ykl; + Rq_cache[t_id+2*bsizey] = zkl; + Rq_cache[t_id+3*bsizey] = akl; + } else { + Rq_cache[t_id+3*bsizey] = 1.; + } + } + //for (int n = ty+threadsy*gout_id; n < nf3ij*TILEX; n += threadsy*gout_stride) { + // int i = n / TILEX; + // int tile = n % TILEX; + // int task_ij = blockIdx.x * bsizex + tile * threadsx + tx; + // if (task_ij < npairs_ij) { + // int pair_ij = pair_ij_mapping[task_ij]; + // int dm_ij_pair0 = dm_pair_loc[pair_ij]; + // int sq_ij = tx + tile * threadsx; + // dm_ij_cache[sq_ij+i*bsizex] = dm[dm_ij_pair0+i]; + // } + //} + //for (int n = tx+threadsx*gout_id; n < nf3kl*TILEY; n += threadsx*gout_stride) { + // int i = n / TILEY; + // int tile = n % TILEY; + // int task_kl = blockIdx.y * bsizey + tile * threadsy + ty; + // if (task_kl < npairs_kl) { + // int pair_kl = pair_kl_mapping[task_kl]; + // int dm_kl_pair0 = dm_pair_loc[pair_kl]; + // int sq_kl = ty + tile * threadsy; + // dm_kl_cache[sq_kl+i*bsizey] = dm[dm_kl_pair0+i]; + // } + //} + __syncthreads(); + + for (int batch_ij = 0; batch_ij < TILEX; ++batch_ij) { + for (int batch_kl = 0; batch_kl < TILEY; ++batch_kl) { + int task_ij0 = blockIdx.x * bsizex + batch_ij * threadsx; + int task_kl0 = blockIdx.y * bsizey + batch_kl * threadsy; + if (task_ij0 >= npairs_ij || task_kl0 >= npairs_kl) { + continue; + } + int pair_ij0 = pair_ij_mapping[task_ij0]; + int pair_kl0 = pair_kl_mapping[task_kl0]; + if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) { + continue; + } + + int sq_ij = tx + batch_ij * threadsx; + int sq_kl = ty + batch_kl * threadsy; + int task_ij = task_ij0 + tx; + int task_kl = task_kl0 + ty; + double fac_sym = PI_FAC; + if (task_ij >= npairs_ij) { + task_ij = task_ij0; + fac_sym = 0.; + } + if (task_kl >= npairs_kl) { + task_kl = task_kl0; + fac_sym = 0.; + } + int pair_ij = pair_ij_mapping[task_ij]; + int pair_kl = pair_kl_mapping[task_kl]; + + int ish = pair_ij / nbas; + int jsh = pair_ij % nbas; + int ksh = pair_kl / nbas; + int lsh = pair_kl % nbas; + if (ish == jsh) fac_sym *= .5; + if (ksh == lsh) fac_sym *= .5; + if (pair_ij_mapping == pair_kl_mapping) { + if (task_ij == task_kl) fac_sym *= .5; + // TODO: skip certain blocks when task_ij < task_kl + if (task_ij < task_kl) fac_sym = 0.; + } + int dm_ij_pair0 = dm_pair_loc[pair_ij]; + int dm_kl_pair0 = dm_pair_loc[pair_kl]; + double *Rt, *buf; + if (gout_id == 0) { + double xij = Rp_cache[sq_ij+0*bsizex]; + double yij = Rp_cache[sq_ij+1*bsizex]; + double zij = Rp_cache[sq_ij+2*bsizex]; + double aij = Rp_cache[sq_ij+3*bsizex]; + double xkl = Rq_cache[sq_kl+0*bsizey]; + double ykl = Rq_cache[sq_kl+1*bsizey]; + double zkl = Rq_cache[sq_kl+2*bsizey]; + double akl = Rq_cache[sq_kl+3*bsizey]; + double fac = fac_sym / (aij*akl*sqrt(aij+akl)); + double xpq = xij - xkl; + double ypq = yij - ykl; + double zpq = zij - zkl; + double rr = xpq*xpq + ypq*ypq + zpq*zpq; + double theta = aij * akl / (aij + akl); + double theta_rr = theta * rr; + 
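+ // gamma_inc[m] receives the Boys function F_m(theta*rr); the loop below
+ // rescales to R^m_{000} = fac * (-2*theta)^m * F_m, the seed values of
+ // the Hermite integral recursion.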
eval_gamma_inc_fn(gamma_inc, theta_rr, order, sq_id, nsq_per_block); + double a2 = -2. * theta; + gamma_inc[sq_id] *= fac; + for (int i = 1; i <= order; i++) { + fac *= a2; + gamma_inc[sq_id+i*nsq_per_block] *= fac; + } + if (order % 2 == 0) { + Rt = vj_kl_cache + nf3kl*bsizey; + buf = Rt + nf3ijkl * nsq_per_block; + } else { + buf = vj_kl_cache + nf3kl*bsizey; + Rt = buf + nf3ijkl * nsq_per_block; + } + Rt[sq_id] = gamma_inc[sq_id+order*nsq_per_block]; + for (int n = 1; n <= order; ++n) { + // swap input and output + double *tmp = buf; + buf = Rt; + Rt = tmp; + Rt[sq_id] = gamma_inc[sq_id+(order-n)*nsq_per_block]; + switch (n) { + case 1: + Rt[sq_id+1*nsq_per_block] = zpq * buf[sq_id+0*nsq_per_block]; + Rt[sq_id+2*nsq_per_block] = ypq * buf[sq_id+0*nsq_per_block]; + Rt[sq_id+3*nsq_per_block] = xpq * buf[sq_id+0*nsq_per_block]; + break; + case 2: + Rt[sq_id+1*nsq_per_block] = zpq * buf[sq_id+0*nsq_per_block]; + Rt[sq_id+2*nsq_per_block] = zpq * buf[sq_id+1*nsq_per_block] + buf[sq_id+0*nsq_per_block]; + Rt[sq_id+3*nsq_per_block] = ypq * buf[sq_id+0*nsq_per_block]; + Rt[sq_id+4*nsq_per_block] = ypq * buf[sq_id+1*nsq_per_block]; + Rt[sq_id+5*nsq_per_block] = ypq * buf[sq_id+2*nsq_per_block] + buf[sq_id+0*nsq_per_block]; + Rt[sq_id+6*nsq_per_block] = xpq * buf[sq_id+0*nsq_per_block]; + Rt[sq_id+7*nsq_per_block] = xpq * buf[sq_id+1*nsq_per_block]; + Rt[sq_id+8*nsq_per_block] = xpq * buf[sq_id+2*nsq_per_block]; + Rt[sq_id+9*nsq_per_block] = xpq * buf[sq_id+3*nsq_per_block] + buf[sq_id+0*nsq_per_block]; + break; + case 3: + Rt[sq_id+1*nsq_per_block] = zpq * buf[sq_id+0*nsq_per_block]; + Rt[sq_id+2*nsq_per_block] = zpq * buf[sq_id+1*nsq_per_block] + buf[sq_id+0*nsq_per_block]; + Rt[sq_id+3*nsq_per_block] = zpq * buf[sq_id+2*nsq_per_block] + 2 * buf[sq_id+1*nsq_per_block]; + Rt[sq_id+4*nsq_per_block] = ypq * buf[sq_id+0*nsq_per_block]; + Rt[sq_id+5*nsq_per_block] = ypq * buf[sq_id+1*nsq_per_block]; + Rt[sq_id+6*nsq_per_block] = ypq * buf[sq_id+2*nsq_per_block]; + Rt[sq_id+7*nsq_per_block] = ypq * buf[sq_id+3*nsq_per_block] + buf[sq_id+0*nsq_per_block]; + Rt[sq_id+8*nsq_per_block] = ypq * buf[sq_id+4*nsq_per_block] + buf[sq_id+1*nsq_per_block]; + Rt[sq_id+9*nsq_per_block] = ypq * buf[sq_id+5*nsq_per_block] + 2 * buf[sq_id+3*nsq_per_block]; + Rt[sq_id+10*nsq_per_block] = xpq * buf[sq_id+0*nsq_per_block]; + Rt[sq_id+11*nsq_per_block] = xpq * buf[sq_id+1*nsq_per_block]; + Rt[sq_id+12*nsq_per_block] = xpq * buf[sq_id+2*nsq_per_block]; + Rt[sq_id+13*nsq_per_block] = xpq * buf[sq_id+3*nsq_per_block]; + Rt[sq_id+14*nsq_per_block] = xpq * buf[sq_id+4*nsq_per_block]; + Rt[sq_id+15*nsq_per_block] = xpq * buf[sq_id+5*nsq_per_block]; + Rt[sq_id+16*nsq_per_block] = xpq * buf[sq_id+6*nsq_per_block] + buf[sq_id+0*nsq_per_block]; + Rt[sq_id+17*nsq_per_block] = xpq * buf[sq_id+7*nsq_per_block] + buf[sq_id+1*nsq_per_block]; + Rt[sq_id+18*nsq_per_block] = xpq * buf[sq_id+8*nsq_per_block] + buf[sq_id+3*nsq_per_block]; + Rt[sq_id+19*nsq_per_block] = xpq * buf[sq_id+9*nsq_per_block] + 2 * buf[sq_id+6*nsq_per_block]; + break; + case 4: + Rt[sq_id+1*nsq_per_block] = zpq * buf[sq_id+0*nsq_per_block]; + Rt[sq_id+2*nsq_per_block] = zpq * buf[sq_id+1*nsq_per_block] + buf[sq_id+0*nsq_per_block]; + Rt[sq_id+3*nsq_per_block] = zpq * buf[sq_id+2*nsq_per_block] + 2 * buf[sq_id+1*nsq_per_block]; + Rt[sq_id+4*nsq_per_block] = zpq * buf[sq_id+3*nsq_per_block] + 3 * buf[sq_id+2*nsq_per_block]; + Rt[sq_id+5*nsq_per_block] = ypq * buf[sq_id+0*nsq_per_block]; + Rt[sq_id+6*nsq_per_block] = ypq * buf[sq_id+1*nsq_per_block]; + 
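+ // Cases 1-4 hard-code the order-n Hermite table in v-fastest ordering;
+ // orders above 4 fall back to the table-driven iter_Rt_n below.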
Rt[sq_id+7*nsq_per_block] = ypq * buf[sq_id+2*nsq_per_block]; + Rt[sq_id+8*nsq_per_block] = ypq * buf[sq_id+3*nsq_per_block]; + Rt[sq_id+9*nsq_per_block] = ypq * buf[sq_id+4*nsq_per_block] + buf[sq_id+0*nsq_per_block]; + Rt[sq_id+10*nsq_per_block] = ypq * buf[sq_id+5*nsq_per_block] + buf[sq_id+1*nsq_per_block]; + Rt[sq_id+11*nsq_per_block] = ypq * buf[sq_id+6*nsq_per_block] + buf[sq_id+2*nsq_per_block]; + Rt[sq_id+12*nsq_per_block] = ypq * buf[sq_id+7*nsq_per_block] + 2 * buf[sq_id+4*nsq_per_block]; + Rt[sq_id+13*nsq_per_block] = ypq * buf[sq_id+8*nsq_per_block] + 2 * buf[sq_id+5*nsq_per_block]; + Rt[sq_id+14*nsq_per_block] = ypq * buf[sq_id+9*nsq_per_block] + 3 * buf[sq_id+7*nsq_per_block]; + Rt[sq_id+15*nsq_per_block] = xpq * buf[sq_id+0*nsq_per_block]; + Rt[sq_id+16*nsq_per_block] = xpq * buf[sq_id+1*nsq_per_block]; + Rt[sq_id+17*nsq_per_block] = xpq * buf[sq_id+2*nsq_per_block]; + Rt[sq_id+18*nsq_per_block] = xpq * buf[sq_id+3*nsq_per_block]; + Rt[sq_id+19*nsq_per_block] = xpq * buf[sq_id+4*nsq_per_block]; + Rt[sq_id+20*nsq_per_block] = xpq * buf[sq_id+5*nsq_per_block]; + Rt[sq_id+21*nsq_per_block] = xpq * buf[sq_id+6*nsq_per_block]; + Rt[sq_id+22*nsq_per_block] = xpq * buf[sq_id+7*nsq_per_block]; + Rt[sq_id+23*nsq_per_block] = xpq * buf[sq_id+8*nsq_per_block]; + Rt[sq_id+24*nsq_per_block] = xpq * buf[sq_id+9*nsq_per_block]; + Rt[sq_id+25*nsq_per_block] = xpq * buf[sq_id+10*nsq_per_block] + buf[sq_id+0*nsq_per_block]; + Rt[sq_id+26*nsq_per_block] = xpq * buf[sq_id+11*nsq_per_block] + buf[sq_id+1*nsq_per_block]; + Rt[sq_id+27*nsq_per_block] = xpq * buf[sq_id+12*nsq_per_block] + buf[sq_id+2*nsq_per_block]; + Rt[sq_id+28*nsq_per_block] = xpq * buf[sq_id+13*nsq_per_block] + buf[sq_id+4*nsq_per_block]; + Rt[sq_id+29*nsq_per_block] = xpq * buf[sq_id+14*nsq_per_block] + buf[sq_id+5*nsq_per_block]; + Rt[sq_id+30*nsq_per_block] = xpq * buf[sq_id+15*nsq_per_block] + buf[sq_id+7*nsq_per_block]; + Rt[sq_id+31*nsq_per_block] = xpq * buf[sq_id+16*nsq_per_block] + 2 * buf[sq_id+10*nsq_per_block]; + Rt[sq_id+32*nsq_per_block] = xpq * buf[sq_id+17*nsq_per_block] + 2 * buf[sq_id+11*nsq_per_block]; + Rt[sq_id+33*nsq_per_block] = xpq * buf[sq_id+18*nsq_per_block] + 2 * buf[sq_id+13*nsq_per_block]; + Rt[sq_id+34*nsq_per_block] = xpq * buf[sq_id+19*nsq_per_block] + 3 * buf[sq_id+16*nsq_per_block]; + break; + default: iter_Rt_n(Rt, buf, xpq, ypq, zpq, n, sq_id, nsq_per_block); + } + } + } + + Rt = vj_kl_cache + nf3kl*bsizey; + double *vj_cache = Rt + nf3ijkl * nsq_per_block; + //for (k = 0, e = 0; e <= l1; ++e) { + //for (f = 0; f <= l1-e; ++f) { + //for (g = 0; g <= l1-e-f; ++g, ++k) { + // double rho_kl_val = rho_kl[k]; + // double jvec_kl_val = 0.; + // double fac = 1; + // if ((e + f + g) % 2 != 0) { + // fac = -1; + // } + // for (i = 0, t = 0; t <= l2; ++t) { + // for (u = 0; u <= l2-t; ++u) { + // for (v = 0; v <= l2-t-u; ++v, ++i) { + // s = fac * R[e+t,f+u,g+v] + // jvec_kl_val += s * rho_ij[i]; + // jvec_ij[i] += s * rho_kl_val; + // } } } + // jvec_kl[k] += jvec_kl_val; + //} } } + for (int k = gout_id; k < nf3kl+gout_id; k += gout_stride) { + __syncthreads(); + double vj_kl = 0.; + if (k < nf3kl) { + Fold3Index f3k = kl_fold3idx[k]; + int e = f3k.x; + int f = f3k.y; + int g = f3k.z; + double fac = 1.; + if ((e + f + g) % 2 != 0) { + fac = -1.; + } + for (int i = 0, t = 0; t <= lij; ++t) { + for (int u = 0; u <= lij-t; ++u) { + for (int v = 0; v <= lij-t-u; ++v, ++i) { + //double s = Rt[sq_id+ADDR(order,e+t,f+u,g+v)*nsq_per_block]; + int ix = order-e-t; + int xoffset = ix*(ix+1)*(ix+2)/6; + int iy 
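+ // Flat index of R_{e+t, f+u, g+v}: with ix = order-e-t and iy = ix-f-u,
+ // nf3ijkl - ix*(ix+1)*(ix+2)/6 - (iy+1)*(iy+2)/2 + (g+v) is algebraically
+ // identical to the ADDR(order, e+t, f+u, g+v) macro.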
= ix-f-u;
+ int i2y = (iy+1)*(iy+2)/2;
+ double s = Rt[sq_id+(nf3ijkl-xoffset-i2y+g+v)*nsq_per_block];
+ vj_kl += fac * s * dm[dm_ij_pair0+i];
+ } } }
+ //atomicAdd(vj+dm_kl_pair0+k, vj_kl);
+ }
+ vj_cache[t_id] = vj_kl;
+ for (int stride = threadsx/2; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (tx < stride) {
+ vj_cache[t_id] += vj_cache[t_id + stride];
+ }
+ }
+ __syncthreads();
+ if (tx == 0 && task_kl0+ty < npairs_kl) {
+ vj_kl_cache[sq_kl+k*bsizey] += vj_cache[t_id];
+ }
+ }
+
+ for (int i = gout_id; i < nf3ij+gout_id; i += gout_stride) {
+ __syncthreads();
+ double vj_ij = 0.;
+ if (i < nf3ij) {
+ Fold3Index f3i = ij_fold3idx[i];
+ int t = f3i.x;
+ int u = f3i.y;
+ int v = f3i.z;
+ for (int k = 0, e = 0; e <= lkl; ++e) {
+ for (int f = 0; f <= lkl-e; ++f) {
+ for (int g = 0; g <= lkl-e-f; ++g, ++k) {
+ //double s = Rt[sq_id+ADDR(order,e+t,f+u,g+v)*nsq_per_block];
+ int ix = order-e-t;
+ int xoffset = ix*(ix+1)*(ix+2)/6;
+ int iy = ix-f-u;
+ int i2y = (iy+1)*(iy+2)/2;
+ double s = Rt[sq_id+(nf3ijkl-xoffset-i2y+g+v)*nsq_per_block];
+ double d = dm[dm_kl_pair0+k];
+ if ((e + f + g) % 2 == 0) {
+ vj_ij += s * d;
+ } else {
+ vj_ij -= s * d;
+ }
+ } } }
+ //atomicAdd(vj+dm_ij_pair0+i, vj_ij);
+ }
+ vj_cache[t_id] = vj_ij;
+ for (int stride = threadsy/2; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[t_id] += vj_cache[t_id + stride*threadsx];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+i*bsizex] += vj_cache[t_id];
+ }
+ }
+ __syncthreads();
+ } }
+
+ for (int n = ty+threadsy*gout_id; n < nf3ij*TILEX; n += threadsy*gout_stride) {
+ int i = n / TILEX;
+ int tile = n % TILEX;
+ int task_ij = blockIdx.x * bsizex + tile * threadsx + tx;
+ if (task_ij < npairs_ij) {
+ int pair_ij = pair_ij_mapping[task_ij];
+ int dm_ij_pair0 = dm_pair_loc[pair_ij];
+ int sq_ij = tx + tile * threadsx;
+ atomicAdd(vj+dm_ij_pair0+i, vj_ij_cache[sq_ij+i*bsizex]);
+ }
+ }
+ for (int n = tx+threadsx*gout_id; n < nf3kl*TILEY; n += threadsx*gout_stride) {
+ int i = n / TILEY;
+ int tile = n % TILEY;
+ int task_kl = blockIdx.y * bsizey + tile * threadsy + ty;
+ if (task_kl < npairs_kl) {
+ int pair_kl = pair_kl_mapping[task_kl];
+ int dm_kl_pair0 = dm_pair_loc[pair_kl];
+ int sq_kl = ty + tile * threadsy;
+ atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*bsizey]);
+ }
+ }
+}
diff --git a/gpu4pyscf/lib/gvhf-md/md_j_driver.cu b/gpu4pyscf/lib/gvhf-md/md_j_driver.cu
new file mode 100644
index 00000000..e48407a6
--- /dev/null
+++ b/gpu4pyscf/lib/gvhf-md/md_j_driver.cu
@@ -0,0 +1,434 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include "gvhf-rys/vhf.cuh"
+
+#define TILEX 2
+#define TILEY 4
+
+__constant__ uint16_t c_Rt_idx[5967];
+__constant__ uint16_t c_Rt_offsets[19];
+__constant__ Fold2Index c_i_in_fold2idx[165];
+__constant__ Fold3Index c_i_in_fold3idx[495];
+
+
+extern __global__ void md_j_kernel(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds);
+int md_j_unrolled(RysIntEnvVars *envs, JKMatrix *jk, BoundsInfo *bounds,
+ int *scheme, int workers, double omega);
+void set_md_j_unrolled_shm_size();
+
+static uint16_t Rt_idx[] = {
+// l = 1
+0,0,0,
+// l = 2
+0,0,0,0,0,0,0,0,0,
+// l = 3
+0,0,1,0,0,0,0,1,3,0,0,0,0,0,0,0,1,3,6,
+// l = 4
+0,0,1,2,0,0,0,0,0,1,2,4,5,7,0,0,0,0,0,0,
+0,0,0,0,0,1,2,4,5,7,10,11,13,16,
+// l = 5
+0,0,1,2,3,0,0,0,0,0,0,1,2,3,5,6,7,9,10,12,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,3,5,
+6,7,9,10,12,15,16,17,19,20,22,25,26,28,31,
+// l = 6
+0,0,1,2,3,4,0,0,0,0,0,0,0,1,2,3,4,6,7,8,
+9,11,12,13,15,16,18,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,1,2,3,4,6,7,8,9,11,12,13, +15,16,18,21,22,23,24,26,27,28,30,31,33,36,37,38,40,41,43,46, +47,49,52, +// l = 7 +0,0,1,2,3,4,5,0,0,0,0,0,0,0,0,1,2,3,4,5, +7,8,9,10,11,13,14,15,16,18,19,20,22,23,25,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,1,2,3,4,5,7,8,9,10,11,13,14,15,16,18,19, +20,22,23,25,28,29,30,31,32,34,35,36,37,39,40,41,43,44,46,49, +50,51,52,54,55,56,58,59,61,64,65,66,68,69,71,74,75,77,80, +// l = 8 +0,0,1,2,3,4,5,6,0,0,0,0,0,0,0,0,0,1,2,3, +4,5,6,8,9,10,11,12,13,15,16,17,18,19,21,22,23,24,26,27, +28,30,31,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,1,2,3,4,5,6,8,9,10,11,12,13,15,16,17,18,19,21,22, +23,24,26,27,28,30,31,33,36,37,38,39,40,41,43,44,45,46,47,49, +50,51,52,54,55,56,58,59,61,64,65,66,67,68,70,71,72,73,75,76, +77,79,80,82,85,86,87,88,90,91,92,94,95,97,100,101,102,104,105,107, +110,111,113,116, +// l = 9 +0,0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0,0,0,1, +2,3,4,5,6,7,9,10,11,12,13,14,15,17,18,19,20,21,22,24, +25,26,27,28,30,31,32,33,35,36,37,39,40,42,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +1,2,3,4,5,6,7,9,10,11,12,13,14,15,17,18,19,20,21,22, +24,25,26,27,28,30,31,32,33,35,36,37,39,40,42,45,46,47,48,49, +50,51,53,54,55,56,57,58,60,61,62,63,64,66,67,68,69,71,72,73, +75,76,78,81,82,83,84,85,86,88,89,90,91,92,94,95,96,97,99,100, +101,103,104,106,109,110,111,112,113,115,116,117,118,120,121,122,124,125,127,130, +131,132,133,135,136,137,139,140,142,145,146,147,149,150,152,155,156,158,161, +// l = 10 +0,0,1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0,0,0, +0,1,2,3,4,5,6,7,8,10,11,12,13,14,15,16,17,19,20,21, +22,23,24,25,27,28,29,30,31,32,34,35,36,37,38,40,41,42,43,45, +46,47,49,50,52,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,1,2,3,4,5,6,7,8,10,11,12,13,14,15,16,17,19,20,21, +22,23,24,25,27,28,29,30,31,32,34,35,36,37,38,40,41,42,43,45, +46,47,49,50,52,55,56,57,58,59,60,61,62,64,65,66,67,68,69,70, +72,73,74,75,76,77,79,80,81,82,83,85,86,87,88,90,91,92,94,95, +97,100,101,102,103,104,105,106,108,109,110,111,112,113,115,116,117,118,119,121, +122,123,124,126,127,128,130,131,133,136,137,138,139,140,141,143,144,145,146,147, +149,150,151,152,154,155,156,158,159,161,164,165,166,167,168,170,171,172,173,175, +176,177,179,180,182,185,186,187,188,190,191,192,194,195,197,200,201,202,204,205, +207,210,211,213,216, +// l = 11 +0,0,1,2,3,4,5,6,7,8,9,0,0,0,0,0,0,0,0,0, +0,0,0,1,2,3,4,5,6,7,8,9,11,12,13,14,15,16,17,18, +19,21,22,23,24,25,26,27,28,30,31,32,33,34,35,36,38,39,40,41, +42,43,45,46,47,48,49,51,52,53,54,56,57,58,60,61,63,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,1,2,3,4,5,6,7,8,9,11,12,13,14,15,16,17, +18,19,21,22,23,24,25,26,27,28,30,31,32,33,34,35,36,38,39,40, +41,42,43,45,46,47,48,49,51,52,53,54,56,57,58,60,61,63,66,67, +68,69,70,71,72,73,74,76,77,78,79,80,81,82,83,85,86,87,88,89, +90,91,93,94,95,96,97,98,100,101,102,103,104,106,107,108,109,111,112,113, +115,116,118,121,122,123,124,125,126,127,128,130,131,132,133,134,135,136,138,139, +140,141,142,143,145,146,147,148,149,151,152,153,154,156,157,158,160,161,163,166, +167,168,169,170,171,172,174,175,176,177,178,179,181,182,183,184,185,187,188,189, +190,192,193,194,196,197,199,202,203,204,205,206,207,209,210,211,212,213,215,216, +217,218,220,221,222,224,225,227,230,231,232,233,234,236,237,238,239,241,242,243, 
+245,246,248,251,252,253,254,256,257,258,260,261,263,266,267,268,270,271,273,276, +277,279,282, +// l = 12 +0,0,1,2,3,4,5,6,7,8,9,10,0,0,0,0,0,0,0,0, +0,0,0,0,0,1,2,3,4,5,6,7,8,9,10,12,13,14,15,16, +17,18,19,20,21,23,24,25,26,27,28,29,30,31,33,34,35,36,37,38, +39,40,42,43,44,45,46,47,48,50,51,52,53,54,55,57,58,59,60,61, +63,64,65,66,68,69,70,72,73,75,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,1,2,3,4,5,6,7,8,9,10,12, +13,14,15,16,17,18,19,20,21,23,24,25,26,27,28,29,30,31,33,34, +35,36,37,38,39,40,42,43,44,45,46,47,48,50,51,52,53,54,55,57, +58,59,60,61,63,64,65,66,68,69,70,72,73,75,78,79,80,81,82,83, +84,85,86,87,89,90,91,92,93,94,95,96,97,99,100,101,102,103,104,105, +106,108,109,110,111,112,113,114,116,117,118,119,120,121,123,124,125,126,127,129, +130,131,132,134,135,136,138,139,141,144,145,146,147,148,149,150,151,152,154,155, +156,157,158,159,160,161,163,164,165,166,167,168,169,171,172,173,174,175,176,178, +179,180,181,182,184,185,186,187,189,190,191,193,194,196,199,200,201,202,203,204, +205,206,208,209,210,211,212,213,214,216,217,218,219,220,221,223,224,225,226,227, +229,230,231,232,234,235,236,238,239,241,244,245,246,247,248,249,250,252,253,254, +255,256,257,259,260,261,262,263,265,266,267,268,270,271,272,274,275,277,280,281, +282,283,284,285,287,288,289,290,291,293,294,295,296,298,299,300,302,303,305,308, +309,310,311,312,314,315,316,317,319,320,321,323,324,326,329,330,331,332,334,335, +336,338,339,341,344,345,346,348,349,351,354,355,357,360, +// l = 13 +0,0,1,2,3,4,5,6,7,8,9,10,11,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,1,2,3,4,5,6,7,8,9,10,11,13,14, +15,16,17,18,19,20,21,22,23,25,26,27,28,29,30,31,32,33,34,36, +37,38,39,40,41,42,43,44,46,47,48,49,50,51,52,53,55,56,57,58, +59,60,61,63,64,65,66,67,68,70,71,72,73,74,76,77,78,79,81,82, +83,85,86,88,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,3,4, +5,6,7,8,9,10,11,13,14,15,16,17,18,19,20,21,22,23,25,26, +27,28,29,30,31,32,33,34,36,37,38,39,40,41,42,43,44,46,47,48, +49,50,51,52,53,55,56,57,58,59,60,61,63,64,65,66,67,68,70,71, +72,73,74,76,77,78,79,81,82,83,85,86,88,91,92,93,94,95,96,97, +98,99,100,101,103,104,105,106,107,108,109,110,111,112,114,115,116,117,118,119, +120,121,122,124,125,126,127,128,129,130,131,133,134,135,136,137,138,139,141,142, +143,144,145,146,148,149,150,151,152,154,155,156,157,159,160,161,163,164,166,169, +170,171,172,173,174,175,176,177,178,180,181,182,183,184,185,186,187,188,190,191, +192,193,194,195,196,197,199,200,201,202,203,204,205,207,208,209,210,211,212,214, +215,216,217,218,220,221,222,223,225,226,227,229,230,232,235,236,237,238,239,240, +241,242,243,245,246,247,248,249,250,251,252,254,255,256,257,258,259,260,262,263, +264,265,266,267,269,270,271,272,273,275,276,277,278,280,281,282,284,285,287,290, +291,292,293,294,295,296,297,299,300,301,302,303,304,305,307,308,309,310,311,312, +314,315,316,317,318,320,321,322,323,325,326,327,329,330,332,335,336,337,338,339, +340,341,343,344,345,346,347,348,350,351,352,353,354,356,357,358,359,361,362,363, +365,366,368,371,372,373,374,375,376,378,379,380,381,382,384,385,386,387,389,390, +391,393,394,396,399,400,401,402,403,405,406,407,408,410,411,412,414,415,417,420, +421,422,423,425,426,427,429,430,432,435,436,437,439,440,442,445,446,448,451, +// l = 14 +0,0,1,2,3,4,5,6,7,8,9,10,11,12,0,0,0,0,0,0, 
+0,0,0,0,0,0,0,0,0,1,2,3,4,5,6,7,8,9,10,11, +12,14,15,16,17,18,19,20,21,22,23,24,25,27,28,29,30,31,32,33, +34,35,36,37,39,40,41,42,43,44,45,46,47,48,50,51,52,53,54,55, +56,57,58,60,61,62,63,64,65,66,67,69,70,71,72,73,74,75,77,78, +79,80,81,82,84,85,86,87,88,90,91,92,93,95,96,97,99,100,102,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,1,2,3,4,5,6,7,8,9,10,11,12,14,15,16, +17,18,19,20,21,22,23,24,25,27,28,29,30,31,32,33,34,35,36,37, +39,40,41,42,43,44,45,46,47,48,50,51,52,53,54,55,56,57,58,60, +61,62,63,64,65,66,67,69,70,71,72,73,74,75,77,78,79,80,81,82, +84,85,86,87,88,90,91,92,93,95,96,97,99,100,102,105,106,107,108,109, +110,111,112,113,114,115,116,118,119,120,121,122,123,124,125,126,127,128,130,131, +132,133,134,135,136,137,138,139,141,142,143,144,145,146,147,148,149,151,152,153, +154,155,156,157,158,160,161,162,163,164,165,166,168,169,170,171,172,173,175,176, +177,178,179,181,182,183,184,186,187,188,190,191,193,196,197,198,199,200,201,202, +203,204,205,206,208,209,210,211,212,213,214,215,216,217,219,220,221,222,223,224, +225,226,227,229,230,231,232,233,234,235,236,238,239,240,241,242,243,244,246,247, +248,249,250,251,253,254,255,256,257,259,260,261,262,264,265,266,268,269,271,274, +275,276,277,278,279,280,281,282,283,285,286,287,288,289,290,291,292,293,295,296, +297,298,299,300,301,302,304,305,306,307,308,309,310,312,313,314,315,316,317,319, +320,321,322,323,325,326,327,328,330,331,332,334,335,337,340,341,342,343,344,345, +346,347,348,350,351,352,353,354,355,356,357,359,360,361,362,363,364,365,367,368, +369,370,371,372,374,375,376,377,378,380,381,382,383,385,386,387,389,390,392,395, +396,397,398,399,400,401,402,404,405,406,407,408,409,410,412,413,414,415,416,417, +419,420,421,422,423,425,426,427,428,430,431,432,434,435,437,440,441,442,443,444, +445,446,448,449,450,451,452,453,455,456,457,458,459,461,462,463,464,466,467,468, +470,471,473,476,477,478,479,480,481,483,484,485,486,487,489,490,491,492,494,495, +496,498,499,501,504,505,506,507,508,510,511,512,513,515,516,517,519,520,522,525, +526,527,528,530,531,532,534,535,537,540,541,542,544,545,547,550,551,553,556, +// l = 15 +0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,1,2,3,4,5,6,7,8,9, +10,11,12,13,15,16,17,18,19,20,21,22,23,24,25,26,27,29,30,31, +32,33,34,35,36,37,38,39,40,42,43,44,45,46,47,48,49,50,51,52, +54,55,56,57,58,59,60,61,62,63,65,66,67,68,69,70,71,72,73,75, +76,77,78,79,80,81,82,84,85,86,87,88,89,90,92,93,94,95,96,97, +99,100,101,102,103,105,106,107,108,110,111,112,114,115,117,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,3,4, +5,6,7,8,9,10,11,12,13,15,16,17,18,19,20,21,22,23,24,25, +26,27,29,30,31,32,33,34,35,36,37,38,39,40,42,43,44,45,46,47, +48,49,50,51,52,54,55,56,57,58,59,60,61,62,63,65,66,67,68,69, +70,71,72,73,75,76,77,78,79,80,81,82,84,85,86,87,88,89,90,92, +93,94,95,96,97,99,100,101,102,103,105,106,107,108,110,111,112,114,115,117, +120,121,122,123,124,125,126,127,128,129,130,131,132,134,135,136,137,138,139,140, +141,142,143,144,145,147,148,149,150,151,152,153,154,155,156,157,159,160,161,162, +163,164,165,166,167,168,170,171,172,173,174,175,176,177,178,180,181,182,183,184, 
+185,186,187,189,190,191,192,193,194,195,197,198,199,200,201,202,204,205,206,207, +208,210,211,212,213,215,216,217,219,220,222,225,226,227,228,229,230,231,232,233, +234,235,236,238,239,240,241,242,243,244,245,246,247,248,250,251,252,253,254,255, +256,257,258,259,261,262,263,264,265,266,267,268,269,271,272,273,274,275,276,277, +278,280,281,282,283,284,285,286,288,289,290,291,292,293,295,296,297,298,299,301, +302,303,304,306,307,308,310,311,313,316,317,318,319,320,321,322,323,324,325,326, +328,329,330,331,332,333,334,335,336,337,339,340,341,342,343,344,345,346,347,349, +350,351,352,353,354,355,356,358,359,360,361,362,363,364,366,367,368,369,370,371, +373,374,375,376,377,379,380,381,382,384,385,386,388,389,391,394,395,396,397,398, +399,400,401,402,403,405,406,407,408,409,410,411,412,413,415,416,417,418,419,420, +421,422,424,425,426,427,428,429,430,432,433,434,435,436,437,439,440,441,442,443, +445,446,447,448,450,451,452,454,455,457,460,461,462,463,464,465,466,467,468,470, +471,472,473,474,475,476,477,479,480,481,482,483,484,485,487,488,489,490,491,492, +494,495,496,497,498,500,501,502,503,505,506,507,509,510,512,515,516,517,518,519, +520,521,522,524,525,526,527,528,529,530,532,533,534,535,536,537,539,540,541,542, +543,545,546,547,548,550,551,552,554,555,557,560,561,562,563,564,565,566,568,569, +570,571,572,573,575,576,577,578,579,581,582,583,584,586,587,588,590,591,593,596, +597,598,599,600,601,603,604,605,606,607,609,610,611,612,614,615,616,618,619,621, +624,625,626,627,628,630,631,632,633,635,636,637,639,640,642,645,646,647,648,650, +651,652,654,655,657,660,661,662,664,665,667,670,671,673,676, +// l = 16 +0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,3,4,5,6,7, +8,9,10,11,12,13,14,16,17,18,19,20,21,22,23,24,25,26,27,28, +29,31,32,33,34,35,36,37,38,39,40,41,42,43,45,46,47,48,49,50, +51,52,53,54,55,56,58,59,60,61,62,63,64,65,66,67,68,70,71,72, +73,74,75,76,77,78,79,81,82,83,84,85,86,87,88,89,91,92,93,94, +95,96,97,98,100,101,102,103,104,105,106,108,109,110,111,112,113,115,116,117, +118,119,121,122,123,124,126,127,128,130,131,133,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,1,2,3,4,5,6,7,8,9,10,11, +12,13,14,16,17,18,19,20,21,22,23,24,25,26,27,28,29,31,32,33, +34,35,36,37,38,39,40,41,42,43,45,46,47,48,49,50,51,52,53,54, +55,56,58,59,60,61,62,63,64,65,66,67,68,70,71,72,73,74,75,76, +77,78,79,81,82,83,84,85,86,87,88,89,91,92,93,94,95,96,97,98, +100,101,102,103,104,105,106,108,109,110,111,112,113,115,116,117,118,119,121,122, +123,124,126,127,128,130,131,133,136,137,138,139,140,141,142,143,144,145,146,147, +148,149,151,152,153,154,155,156,157,158,159,160,161,162,163,165,166,167,168,169, +170,171,172,173,174,175,176,178,179,180,181,182,183,184,185,186,187,188,190,191, +192,193,194,195,196,197,198,199,201,202,203,204,205,206,207,208,209,211,212,213, +214,215,216,217,218,220,221,222,223,224,225,226,228,229,230,231,232,233,235,236, +237,238,239,241,242,243,244,246,247,248,250,251,253,256,257,258,259,260,261,262, +263,264,265,266,267,268,270,271,272,273,274,275,276,277,278,279,280,281,283,284, +285,286,287,288,289,290,291,292,293,295,296,297,298,299,300,301,302,303,304,306, +307,308,309,310,311,312,313,314,316,317,318,319,320,321,322,323,325,326,327,328, 
+329,330,331,333,334,335,336,337,338,340,341,342,343,344,346,347,348,349,351,352, +353,355,356,358,361,362,363,364,365,366,367,368,369,370,371,372,374,375,376,377, +378,379,380,381,382,383,384,386,387,388,389,390,391,392,393,394,395,397,398,399, +400,401,402,403,404,405,407,408,409,410,411,412,413,414,416,417,418,419,420,421, +422,424,425,426,427,428,429,431,432,433,434,435,437,438,439,440,442,443,444,446, +447,449,452,453,454,455,456,457,458,459,460,461,462,464,465,466,467,468,469,470, +471,472,473,475,476,477,478,479,480,481,482,483,485,486,487,488,489,490,491,492, +494,495,496,497,498,499,500,502,503,504,505,506,507,509,510,511,512,513,515,516, +517,518,520,521,522,524,525,527,530,531,532,533,534,535,536,537,538,539,541,542, +543,544,545,546,547,548,549,551,552,553,554,555,556,557,558,560,561,562,563,564, +565,566,568,569,570,571,572,573,575,576,577,578,579,581,582,583,584,586,587,588, +590,591,593,596,597,598,599,600,601,602,603,604,606,607,608,609,610,611,612,613, +615,616,617,618,619,620,621,623,624,625,626,627,628,630,631,632,633,634,636,637, +638,639,641,642,643,645,646,648,651,652,653,654,655,656,657,658,660,661,662,663, +664,665,666,668,669,670,671,672,673,675,676,677,678,679,681,682,683,684,686,687, +688,690,691,693,696,697,698,699,700,701,702,704,705,706,707,708,709,711,712,713, +714,715,717,718,719,720,722,723,724,726,727,729,732,733,734,735,736,737,739,740, +741,742,743,745,746,747,748,750,751,752,754,755,757,760,761,762,763,764,766,767, +768,769,771,772,773,775,776,778,781,782,783,784,786,787,788,790,791,793,796,797, +798,800,801,803,806,807,809,812, +// l = 17 +0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,3,4,5, +6,7,8,9,10,11,12,13,14,15,17,18,19,20,21,22,23,24,25,26, +27,28,29,30,31,33,34,35,36,37,38,39,40,41,42,43,44,45,46,48, +49,50,51,52,53,54,55,56,57,58,59,60,62,63,64,65,66,67,68,69, +70,71,72,73,75,76,77,78,79,80,81,82,83,84,85,87,88,89,90,91, +92,93,94,95,96,98,99,100,101,102,103,104,105,106,108,109,110,111,112,113, +114,115,117,118,119,120,121,122,123,125,126,127,128,129,130,132,133,134,135,136, +138,139,140,141,143,144,145,147,148,150,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17, +18,19,20,21,22,23,24,25,26,27,28,29,30,31,33,34,35,36,37,38, +39,40,41,42,43,44,45,46,48,49,50,51,52,53,54,55,56,57,58,59, +60,62,63,64,65,66,67,68,69,70,71,72,73,75,76,77,78,79,80,81, +82,83,84,85,87,88,89,90,91,92,93,94,95,96,98,99,100,101,102,103, +104,105,106,108,109,110,111,112,113,114,115,117,118,119,120,121,122,123,125,126, +127,128,129,130,132,133,134,135,136,138,139,140,141,143,144,145,147,148,150,153, +154,155,156,157,158,159,160,161,162,163,164,165,166,167,169,170,171,172,173,174, +175,176,177,178,179,180,181,182,184,185,186,187,188,189,190,191,192,193,194,195, +196,198,199,200,201,202,203,204,205,206,207,208,209,211,212,213,214,215,216,217, +218,219,220,221,223,224,225,226,227,228,229,230,231,232,234,235,236,237,238,239, +240,241,242,244,245,246,247,248,249,250,251,253,254,255,256,257,258,259,261,262, +263,264,265,266,268,269,270,271,272,274,275,276,277,279,280,281,283,284,286,289, +290,291,292,293,294,295,296,297,298,299,300,301,302,304,305,306,307,308,309,310, 
+311,312,313,314,315,316,318,319,320,321,322,323,324,325,326,327,328,329,331,332, +333,334,335,336,337,338,339,340,341,343,344,345,346,347,348,349,350,351,352,354, +355,356,357,358,359,360,361,362,364,365,366,367,368,369,370,371,373,374,375,376, +377,378,379,381,382,383,384,385,386,388,389,390,391,392,394,395,396,397,399,400, +401,403,404,406,409,410,411,412,413,414,415,416,417,418,419,420,421,423,424,425, +426,427,428,429,430,431,432,433,434,436,437,438,439,440,441,442,443,444,445,446, +448,449,450,451,452,453,454,455,456,457,459,460,461,462,463,464,465,466,467,469, +470,471,472,473,474,475,476,478,479,480,481,482,483,484,486,487,488,489,490,491, +493,494,495,496,497,499,500,501,502,504,505,506,508,509,511,514,515,516,517,518, +519,520,521,522,523,524,525,527,528,529,530,531,532,533,534,535,536,537,539,540, +541,542,543,544,545,546,547,548,550,551,552,553,554,555,556,557,558,560,561,562, +563,564,565,566,567,569,570,571,572,573,574,575,577,578,579,580,581,582,584,585, +586,587,588,590,591,592,593,595,596,597,599,600,602,605,606,607,608,609,610,611, +612,613,614,615,617,618,619,620,621,622,623,624,625,626,628,629,630,631,632,633, +634,635,636,638,639,640,641,642,643,644,645,647,648,649,650,651,652,653,655,656, +657,658,659,660,662,663,664,665,666,668,669,670,671,673,674,675,677,678,680,683, +684,685,686,687,688,689,690,691,692,694,695,696,697,698,699,700,701,702,704,705, +706,707,708,709,710,711,713,714,715,716,717,718,719,721,722,723,724,725,726,728, +729,730,731,732,734,735,736,737,739,740,741,743,744,746,749,750,751,752,753,754, +755,756,757,759,760,761,762,763,764,765,766,768,769,770,771,772,773,774,776,777, +778,779,780,781,783,784,785,786,787,789,790,791,792,794,795,796,798,799,801,804, +805,806,807,808,809,810,811,813,814,815,816,817,818,819,821,822,823,824,825,826, +828,829,830,831,832,834,835,836,837,839,840,841,843,844,846,849,850,851,852,853, +854,855,857,858,859,860,861,862,864,865,866,867,868,870,871,872,873,875,876,877, +879,880,882,885,886,887,888,889,890,892,893,894,895,896,898,899,900,901,903,904, +905,907,908,910,913,914,915,916,917,919,920,921,922,924,925,926,928,929,931,934, +935,936,937,939,940,941,943,944,946,949,950,951,953,954,956,959,960,962,965, +}; + +// l*(l+1)*(l+2)*(l+3)//24 - l +static uint16_t Rt_idx_offsets[] = { +0,0,3,12,31,65,120,203,322,486,705,990,1353,1807,2366,3045,3860,4828,5967, +}; + +extern "C" { +int MD_build_j(double *vj, double *dm, int n_dm, int nao, + RysIntEnvVars envs, int *scheme, int *shls_slice, + int ntile_ij_pairs, int ntile_kl_pairs, + int *tile_ij_mapping, int *tile_kl_mapping, float *tile_q_cond, + float *q_cond, float *dm_cond, float cutoff, + uint32_t *batch_head, int workers, double omega, + int *atm, int natm, int *bas, int nbas, double *env) +{ + uint16_t ish0 = shls_slice[0]; + uint16_t jsh0 = shls_slice[2]; + uint16_t ksh0 = shls_slice[4]; + uint16_t lsh0 = shls_slice[6]; + uint8_t li = bas[ANG_OF + ish0*BAS_SLOTS]; + uint8_t lj = bas[ANG_OF + jsh0*BAS_SLOTS]; + uint8_t lk = bas[ANG_OF + ksh0*BAS_SLOTS]; + uint8_t ll = bas[ANG_OF + lsh0*BAS_SLOTS]; + uint8_t order = li + lj + lk + ll; + BoundsInfo bounds = {li, lj, lk, ll, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, + ntile_ij_pairs, ntile_kl_pairs, tile_ij_mapping, tile_kl_mapping, + q_cond, dm_cond, cutoff}; + + JKMatrix jk = {vj, NULL, dm, (uint16_t)n_dm}; + + if (!md_j_unrolled(&envs, &jk, &bounds, scheme, workers, omega)) { + int lij = li + lj; + int lkl = lk + ll; + int threads_ij = scheme[0]; + int threads_kl = scheme[1]; + int bsizex = threads_ij * TILEX; + int bsizey = 
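+ // Generic fallback launch: the dynamic shared-memory budget computed below
+ // covers the Boys table ((order+1) doubles per thread square), the Rp/Rq
+ // pair caches, the per-pair vj accumulators, and the double-buffered
+ // Hermite R table used by md_j_kernel.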
threads_kl * TILEY;
+ int nsq_per_block = threads_ij * threads_kl;
+ int gout_stride = scheme[2];
+ dim3 threads(threads_ij, threads_kl, gout_stride);
+ int nf3ij = (lij+1)*(lij+2)*(lij+3)/6;
+ int nf3kl = (lkl+1)*(lkl+2)*(lkl+3)/6;
+ int buflen = (order+1) * nsq_per_block
+ + bsizex * (4+nf3ij) + bsizey * (4+nf3kl)
+ + (order+1)*(order+2)*(order+3)/6 * nsq_per_block;
+ buflen += MAX(order*(order+1)*(order+2)/6, gout_stride) * nsq_per_block;
+ int blocks_ij = (ntile_ij_pairs + bsizex - 1) / bsizex;
+ int blocks_kl = (ntile_kl_pairs + bsizey - 1) / bsizey;
+ dim3 blocks(blocks_ij, blocks_kl);
+ md_j_kernel<<<blocks, threads, buflen*sizeof(double)>>>(envs, jk, bounds);
+ }
+ cudaError_t err = cudaGetLastError();
+ if (err != cudaSuccess) {
+ fprintf(stderr, "CUDA Error in MD_build_j: %s\n", cudaGetErrorString(err));
+ return 1;
+ }
+ return 0;
+}
+
+void init_mdj_constant(int shm_size)
+{
+ Fold2Index i_in_fold2idx[165];
+ Fold3Index i_in_fold3idx[495];
+ int n2 = 0;
+ int n3 = 0;
+ for (int l = 0; l <= LMAX*2; ++l) {
+ for (int i = 0, ijk = 0; i <= l; ++i) {
+ for (int j = 0; j <= l-i; ++j, ++n2) {
+ i_in_fold2idx[n2].x = i;
+ i_in_fold2idx[n2].y = j;
+ i_in_fold2idx[n2].fold3offset = ijk;
+ for (int k = 0; k <= l-i-j; ++k, ++n3, ++ijk) {
+ i_in_fold3idx[n3].x = i;
+ i_in_fold3idx[n3].y = j;
+ i_in_fold3idx[n3].z = k;
+ i_in_fold3idx[n3].fold2yz = (l+1)*(l+2)/2 - (l-j+1)*(l-j+2)/2 + k;
+ }
+ } }
+ }
+ cudaMemcpyToSymbol(c_Rt_idx, Rt_idx, sizeof(Rt_idx)); // reuse this buffer to store Rt1_idx
+ cudaMemcpyToSymbol(c_Rt_offsets, Rt_idx_offsets, sizeof(Rt_idx_offsets));
+ cudaMemcpyToSymbol(c_i_in_fold2idx, i_in_fold2idx, 165*sizeof(Fold2Index));
+ cudaMemcpyToSymbol(c_i_in_fold3idx, i_in_fold3idx, 495*sizeof(Fold3Index));
+ cudaFuncSetAttribute(md_j_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size);
+ set_md_j_unrolled_shm_size();
+}
+}
diff --git a/gpu4pyscf/lib/gvhf-md/md_pairdata.c b/gpu4pyscf/lib/gvhf-md/md_pairdata.c
new file mode 100644
index 00000000..8b8b7017
--- /dev/null
+++ b/gpu4pyscf/lib/gvhf-md/md_pairdata.c
@@ -0,0 +1,203 @@
+#include <stdlib.h>
+#include <math.h>
+#include "gvhf-rys/vhf.cuh"
+
+#define Ex_at(i,j,t) Ex[(i)*stride1+(j)*stride2+t]
+#define Ey_at(i,j,t) Ey[(i)*stride1+(j)*stride2+t]
+#define Ez_at(i,j,t) Ez[(i)*stride1+(j)*stride2+t]
+
+void get_E_cart_components(double *Ecart, int li, int lj, double ai, double aj,
+ double *Ra, double *Rb)
+{
+ double aij = ai + aj;
+ double xixj = Ra[0] - Rb[0];
+ double yiyj = Ra[1] - Rb[1];
+ double zizj = Ra[2] - Rb[2];
+ double theta_ij = ai * aj / aij;
+ double Kab = exp(-theta_ij * (xixj*xixj + yiyj*yiyj + zizj*zizj));
+ double Xp = (ai * Ra[0] + aj * Rb[0]) / aij;
+ double Yp = (ai * Ra[1] + aj * Rb[1]) / aij;
+ double Zp = (ai * Ra[2] + aj * Rb[2]) / aij;
+ double Xpa = Xp - Ra[0];
+ double Ypa = Yp - Ra[1];
+ double Zpa = Zp - Ra[2];
+ double Xpb = Xp - Rb[0];
+ double Ypb = Yp - Rb[1];
+ double Zpb = Zp - Rb[2];
+ int lij = li + lj;
+ int stride2 = lij+1;
+ int stride1 = (lj+1) * stride2;
+ int Ex_size = (li+1) * stride1;
+ double *Ex = Ecart;
+ double *Ey = Ex + Ex_size;
+ double *Ez = Ey + Ex_size;
+ int i, j, t;
+ double fac, fac1;
+
+ Ex_at(0,0,0) = 1.;
+ Ey_at(0,0,0) = 1.;
+ Ez_at(0,0,0) = Kab;
+ for (t = 1; t <= lij; t++) {
+ Ex_at(0,0,t) = 0.;
+ Ey_at(0,0,t) = 0.;
+ Ez_at(0,0,t) = 0.;
+ }
+
+ for (j = 1; j <= lj; j++) {
+ Ex_at(0,j,0) = Xpb * Ex_at(0,j-1,0) + Ex_at(0,j-1,1);
+ Ey_at(0,j,0) = Ypb * Ey_at(0,j-1,0) + Ey_at(0,j-1,1);
+ Ez_at(0,j,0) = Zpb * Ez_at(0,j-1,0) + Ez_at(0,j-1,1);
+ for (t = 1; t <= lij; t++) {
+ fac = j/(2*aij*t);
+ Ex_at(0,j,t) = fac *
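+ // Hermite expansion coefficients: for t > 0,
+ // E_t^{i,j} = (i*E_{t-1}^{i-1,j} + j*E_{t-1}^{i,j-1}) / (2*aij*t),
+ // and the t = 0 term follows E_0^{i,j} = Xpb*E_0^{i,j-1} + E_1^{i,j-1}.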
Ex_at(0,j-1,t-1); + Ey_at(0,j,t) = fac * Ey_at(0,j-1,t-1); + Ez_at(0,j,t) = fac * Ez_at(0,j-1,t-1); + } + } + + for (i = 1; i <= li; i++) { + Ex_at(i,0,0) = Xpa * Ex_at(i-1,0,0) + Ex_at(i-1,0,1); + Ey_at(i,0,0) = Ypa * Ey_at(i-1,0,0) + Ey_at(i-1,0,1); + Ez_at(i,0,0) = Zpa * Ez_at(i-1,0,0) + Ez_at(i-1,0,1); + for (t = 1; t <= lij; t++) { + fac = i/(2*aij*t); + Ex_at(i,0,t) = fac * Ex_at(i-1,0,t-1); + Ey_at(i,0,t) = fac * Ey_at(i-1,0,t-1); + Ez_at(i,0,t) = fac * Ez_at(i-1,0,t-1); + } + } + + for (i = 1; i <= li; i++) { + for (j = 1; j <= lj; j++) { + Ex_at(i,j,0) = Xpb * Ex_at(i,j-1,0) + Ex_at(i,j-1,1); + Ey_at(i,j,0) = Ypb * Ey_at(i,j-1,0) + Ey_at(i,j-1,1); + Ez_at(i,j,0) = Zpb * Ez_at(i,j-1,0) + Ez_at(i,j-1,1); + for (t = 1; t <= lij; t++) { + fac = i/(2*aij*t); + fac1 = j/(2*aij*t); + Ex_at(i,j,t) = fac*Ex_at(i-1,j,t-1) + fac1*Ex_at(i,j-1,t-1); + Ey_at(i,j,t) = fac*Ey_at(i-1,j,t-1) + fac1*Ey_at(i,j-1,t-1); + Ez_at(i,j,t) = fac*Ez_at(i-1,j,t-1) + fac1*Ez_at(i,j-1,t-1); + } + } + } +} + +// Shape of E tensor is [:li+lj,:li,:lj] +void get_E_tensor(double *Et, int li, int lj, double ai, double aj, + double *Ra, double *Rb, double *buf) +{ + get_E_cart_components(buf, li, lj, ai, aj, Ra, Rb); + int lij = li + lj; + int stride2 = lij+1; + int stride1 = (lj+1) * stride2; + int Ex_size = (li+1) * stride1; + double *Ex = buf; + double *Ey = Ex + Ex_size; + double *Ez = Ey + Ex_size; + int t, u, v, n; + int ix, iy, iz; + int jx, jy, jz; + + n = 0; + // products subject to t+u+v <= li+lj + for (t = 0; t <= lij; t++) { + for (u = 0; u <= lij-t; u++) { + for (v = 0; v <= lij-t-u; v++) { + for (ix = li; ix >= 0; ix--) { + for (iy = li-ix; iy >= 0; iy--) { + iz = li - ix - iy; + for (jx = lj; jx >= 0; jx--) { + for (jy = lj-jx; jy >= 0; jy--) { + jz = lj - jx - jy; + Et[n] = Ex_at(ix,jx,t) * Ey_at(iy,jy,u) * Ez_at(iz,jz,v); + n++; + } } + } } + } } } +} + +void Et_dot_dm(double *Et_dm, double *dm, int *ao_loc, int *pair_loc, + int *bas, int nbas, double *env) +{ + int l2 = 2*LMAX; + int Et_size = (l2+1)*(l2+2)*(l2+3)/6*NCART_MAX*NCART_MAX; + int Ex_size = (2*LMAX+1)*(LMAX+1)*(LMAX+1); + double *Et = malloc(sizeof(double) * (Et_size+3*Ex_size)); + double *buf = Et + Et_size; + + size_t nao = ao_loc[nbas]; + for (int ish = 0; ish < nbas; ish++) { + int li = bas[ish*BAS_SLOTS+ANG_OF]; + int i0 = ao_loc[ish]; + double ai = env[bas[ish*BAS_SLOTS+PTR_EXP]]; + double ci = env[bas[ish*BAS_SLOTS+PTR_COEFF]]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + for (int jsh = 0; jsh <= ish; jsh++) { + int lj = bas[jsh*BAS_SLOTS+ANG_OF]; + int j0 = ao_loc[jsh]; + double aj = env[bas[jsh*BAS_SLOTS+PTR_EXP]]; + double cj = env[bas[jsh*BAS_SLOTS+PTR_COEFF]]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double *rho = Et_dm + pair_loc[ish*nbas+jsh]; + int lij = li + lj; + int nfi = (li + 1) * (li + 2) / 2; + int nfj = (lj + 1) * (lj + 2) / 2; + int Et_len = (lij + 1) * (lij + 2) * (lij + 3) / 6; + get_E_tensor(Et, li, lj, ai, aj, ri, rj, buf); + double cc = ci * cj; + double *pdm = dm + j0*nao + i0; + for (int n = 0, t = 0; t < Et_len; t++) { + double rho_t = 0.; + for (int i = 0; i < nfi; i++) { + for (int j = 0; j < nfj; j++, n++) { + rho_t += Et[n] * cc * pdm[j*nao+i]; + } } + rho[t] = rho_t; + } + } + } + free(Et); +} + +void jengine_dot_Et(double *vj, double *jvec, int *ao_loc, int *pair_loc, + int *bas, int nbas, double *env) +{ + int l2 = 2*LMAX; + int Et_size = (l2+1)*(l2+2)*(l2+3)/6*NCART_MAX*NCART_MAX; + int Ex_size = (2*LMAX+1)*(LMAX+1)*(LMAX+1); + double *Et = malloc(sizeof(double) * 
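+ // jengine_dot_Et is the transpose of Et_dot_dm: it scatters the Hermite-space
+ // J vector back onto AO shell pairs, vj[i,j] += sum_t E_t^{ij} * jvec_ij[t],
+ // using the same scratch layout (Et tensor followed by the three Cartesian
+ // E tables).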
(Et_size+3*Ex_size)); + double *buf = Et + Et_size; + + size_t nao = ao_loc[nbas]; + for (int ish = 0; ish < nbas; ish++) { + int li = bas[ish*BAS_SLOTS+ANG_OF]; + int i0 = ao_loc[ish]; + double ai = env[bas[ish*BAS_SLOTS+PTR_EXP]]; + double ci = env[bas[ish*BAS_SLOTS+PTR_COEFF]]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + for (int jsh = 0; jsh <= ish; jsh++) { + int lj = bas[jsh*BAS_SLOTS+ANG_OF]; + int j0 = ao_loc[jsh]; + double aj = env[bas[jsh*BAS_SLOTS+PTR_EXP]]; + double cj = env[bas[jsh*BAS_SLOTS+PTR_COEFF]]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double *jvec_ij = jvec + pair_loc[ish*nbas+jsh]; + int lij = li + lj; + int nfi = (li + 1) * (li + 2) / 2; + int nfj = (lj + 1) * (lj + 2) / 2; + int Et_len = (lij + 1) * (lij + 2) * (lij + 3) / 6; + get_E_tensor(Et, li, lj, ai, aj, ri, rj, buf); + double cc = ci * cj; + double *pj = vj + i0*nao+j0; + for (int n = 0, t = 0; t < Et_len; t++) { + double fac = cc * jvec_ij[t]; + for (int i = 0; i < nfi; i++) { + for (int j = 0; j < nfj; j++, n++) { + pj[i*nao+j] += Et[n] * fac; + } } + } + } + } + free(Et); +} diff --git a/gpu4pyscf/lib/gvhf-md/unrolled_md_j.cu b/gpu4pyscf/lib/gvhf-md/unrolled_md_j.cu new file mode 100644 index 00000000..61a679f3 --- /dev/null +++ b/gpu4pyscf/lib/gvhf-md/unrolled_md_j.cu @@ -0,0 +1,5077 @@ +#include "gvhf-rys/vhf.cuh" +#include "gvhf-rys/gamma_inc_unrolled.cu" + + +// TILEX=16, TILEY=16, cache_dm=True +__global__ +void md_j_0_0(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds) +{ + int *pair_ij_mapping = bounds.tile_ij_mapping; + int *pair_kl_mapping = bounds.tile_kl_mapping; + int task_ij0 = blockIdx.x * 256; + int task_kl0 = blockIdx.y * 256; + int pair_ij0 = pair_ij_mapping[task_ij0]; + int pair_kl0 = pair_kl_mapping[task_kl0]; + float *q_cond = bounds.q_cond; + if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) { + return; + } + + int tx = threadIdx.x; + int ty = threadIdx.y; + int sq_id = tx + 16 * ty; + int *bas = envs.bas; + int *dm_pair_loc = envs.ao_loc; + int nbas = envs.nbas; + double *env = envs.env; + double *dm = jk.dm; + double *vj = jk.vj; + double vj_ij, vj_kl; + + int npairs_ij = bounds.npairs_ij; + int npairs_kl = bounds.npairs_kl; + extern __shared__ double gamma_inc[]; + double *Rp_cache = gamma_inc + 256; + double *Rq_cache = Rp_cache + 1024; + double *vj_ij_cache = Rq_cache + 1024; + double *vj_kl_cache = vj_ij_cache + 256; + double *vj_cache = vj_kl_cache + 256; + double *dm_ij_cache = vj_cache + 256; + double *dm_kl_cache = dm_ij_cache + 256; + // zero out all cache; + for (int n = sq_id; n < 3328; n += 256) { + Rp_cache[n] = 0.; + } + __syncthreads(); + + if (sq_id < 256) { + int task_ij = blockIdx.x * 256 + sq_id; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int ish = pair_ij / nbas; + int jsh = pair_ij % nbas; + double ai = env[bas[ish*BAS_SLOTS+PTR_EXP]]; + double aj = env[bas[jsh*BAS_SLOTS+PTR_EXP]]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double aij = ai + aj; + double xij = (ai * ri[0] + aj * rj[0]) / aij; + double yij = (ai * ri[1] + aj * rj[1]) / aij; + double zij = (ai * ri[2] + aj * rj[2]) / aij; + Rp_cache[sq_id+0] = xij; + Rp_cache[sq_id+256] = yij; + Rp_cache[sq_id+512] = zij; + Rp_cache[sq_id+768] = aij; + } else { + Rp_cache[sq_id+768] = 1.; + } + } + if (sq_id < 256) { + int task_kl = blockIdx.y * 256 + sq_id; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int ksh = pair_kl / nbas; + int lsh = pair_kl % 
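+ // Shell pairs are encoded as pair = ish*nbas + jsh (ksh*nbas + lsh on the
+ // kl side), so a divmod by nbas recovers the two shell indices.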
nbas; + double ak = env[bas[ksh*BAS_SLOTS+PTR_EXP]]; + double al = env[bas[lsh*BAS_SLOTS+PTR_EXP]]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD]; + double akl = ak + al; + double xkl = (ak * rk[0] + al * rl[0]) / akl; + double ykl = (ak * rk[1] + al * rl[1]) / akl; + double zkl = (ak * rk[2] + al * rl[2]) / akl; + Rq_cache[sq_id+0] = xkl; + Rq_cache[sq_id+256] = ykl; + Rq_cache[sq_id+512] = zkl; + Rq_cache[sq_id+768] = akl; + } else { + Rq_cache[sq_id+768] = 1.; + } + } + for (int n = ty; n < 16; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_ij = blockIdx.x * 256 + tile * 16 + tx; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int dm_ij_pair0 = dm_pair_loc[pair_ij]; + int sq_ij = tx + tile * 16; + dm_ij_cache[sq_ij+i*256] = dm[dm_ij_pair0+i]; + } + } + for (int n = tx; n < 16; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_kl = blockIdx.y * 256 + tile * 16 + ty; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int dm_kl_pair0 = dm_pair_loc[pair_kl]; + int sq_kl = ty + tile * 16; + atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*256]); + dm_kl_cache[sq_kl+i*256] = dm[dm_kl_pair0+i]; + } + } + __syncthreads(); + + for (int batch_ij = 0; batch_ij < 16; ++batch_ij) { + for (int batch_kl = 0; batch_kl < 16; ++batch_kl) { + int task_ij0 = blockIdx.x * 256 + batch_ij * 16; + int task_kl0 = blockIdx.y * 256 + batch_kl * 16; + if (task_ij0 >= npairs_ij || task_kl0 >= npairs_kl) { + continue; + } + int pair_ij0 = pair_ij_mapping[task_ij0]; + int pair_kl0 = pair_kl_mapping[task_kl0]; + if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) { + continue; + } + + int sq_ij = tx + batch_ij * 16; + int sq_kl = ty + batch_kl * 16; + int task_ij = task_ij0 + tx; + int task_kl = task_kl0 + ty; + double fac_sym = PI_FAC; + if (task_ij >= npairs_ij) { + task_ij = task_ij0; + fac_sym = 0.; + } + if (task_kl >= npairs_kl) { + task_kl = task_kl0; + fac_sym = 0.; + } + int pair_ij = pair_ij_mapping[task_ij]; + int pair_kl = pair_kl_mapping[task_kl]; + + int ish = pair_ij / nbas; + int jsh = pair_ij % nbas; + int ksh = pair_kl / nbas; + int lsh = pair_kl % nbas; + if (ish == jsh) fac_sym *= .5; + if (ksh == lsh) fac_sym *= .5; + if (pair_ij_mapping == pair_kl_mapping) { + if (task_ij == task_kl) fac_sym *= .5; + if (task_ij < task_kl) fac_sym = 0.; + } + double xij = Rp_cache[sq_ij+0]; + double yij = Rp_cache[sq_ij+256]; + double zij = Rp_cache[sq_ij+512]; + double aij = Rp_cache[sq_ij+768]; + double xkl = Rq_cache[sq_kl+0]; + double ykl = Rq_cache[sq_kl+256]; + double zkl = Rq_cache[sq_kl+512]; + double akl = Rq_cache[sq_kl+768]; + double fac = fac_sym / (aij*akl*sqrt(aij+akl)); + double xpq = xij - xkl; + double ypq = yij - ykl; + double zpq = zij - zkl; + double rr = xpq*xpq + ypq*ypq + zpq*zpq; + double theta = aij * akl / (aij + akl); + double theta_rr = theta * rr; + eval_gamma_inc_fn(gamma_inc, theta_rr, 0); + double a2 = -2. 
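+ // Order-0 class: only the Boys value F_0 enters, so the rescaling loop
+ // below (i running from 1 to 0) is empty and each pair contracts a single
+ // gamma_inc element.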
* theta; + gamma_inc[sq_id] *= fac; + for (int i = 1; i <= 0; i++) { + fac *= a2; + gamma_inc[sq_id+i*256] *= fac; + } + vj_kl = 0.; + vj_kl += gamma_inc[sq_id+0*256] * dm_ij_cache[sq_ij+0]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+0] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += gamma_inc[sq_id+0*256] * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+0] += vj_cache[sq_id]; + } + __syncthreads(); + } } + for (int n = ty; n < 16; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_ij = blockIdx.x * 256 + tile * 16 + tx; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int dm_ij_pair0 = dm_pair_loc[pair_ij]; + int sq_ij = tx + tile * 16; + atomicAdd(vj+dm_ij_pair0+i, vj_ij_cache[sq_ij+i*256]); + } + } + for (int n = tx; n < 16; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_kl = blockIdx.y * 256 + tile * 16 + ty; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int dm_kl_pair0 = dm_pair_loc[pair_kl]; + int sq_kl = ty + tile * 16; + atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*256]); + } + } +} + +// TILEX=16, TILEY=16, cache_dm=True +__global__ +void md_j_1_0(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds) +{ + int *pair_ij_mapping = bounds.tile_ij_mapping; + int *pair_kl_mapping = bounds.tile_kl_mapping; + int task_ij0 = blockIdx.x * 256; + int task_kl0 = blockIdx.y * 256; + int pair_ij0 = pair_ij_mapping[task_ij0]; + int pair_kl0 = pair_kl_mapping[task_kl0]; + float *q_cond = bounds.q_cond; + if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) { + return; + } + + int tx = threadIdx.x; + int ty = threadIdx.y; + int sq_id = tx + 16 * ty; + int *bas = envs.bas; + int *dm_pair_loc = envs.ao_loc; + int nbas = envs.nbas; + double *env = envs.env; + double *dm = jk.dm; + double *vj = jk.vj; + double vj_ij, vj_kl; + + int npairs_ij = bounds.npairs_ij; + int npairs_kl = bounds.npairs_kl; + extern __shared__ double gamma_inc[]; + double *Rp_cache = gamma_inc + 512; + double *Rq_cache = Rp_cache + 1024; + double *vj_ij_cache = Rq_cache + 1024; + double *vj_kl_cache = vj_ij_cache + 1024; + double *vj_cache = vj_kl_cache + 256; + double *dm_ij_cache = vj_cache + 256; + double *dm_kl_cache = dm_ij_cache + 1024; + // zero out all cache; + for (int n = sq_id; n < 4864; n += 256) { + Rp_cache[n] = 0.; + } + __syncthreads(); + + if (sq_id < 256) { + int task_ij = blockIdx.x * 256 + sq_id; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int ish = pair_ij / nbas; + int jsh = pair_ij % nbas; + double ai = env[bas[ish*BAS_SLOTS+PTR_EXP]]; + double aj = env[bas[jsh*BAS_SLOTS+PTR_EXP]]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double aij = ai + aj; + double xij = (ai * ri[0] + aj * rj[0]) / aij; + double yij = (ai * ri[1] + aj * rj[1]) / aij; + double zij = (ai * ri[2] + aj * rj[2]) / aij; + Rp_cache[sq_id+0] = xij; + Rp_cache[sq_id+256] = yij; + Rp_cache[sq_id+512] = zij; + Rp_cache[sq_id+768] = aij; + } else { + Rp_cache[sq_id+768] = 1.; + } + } + if (sq_id 
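+ // md_j_1_0 covers lij = 1, lkl = 0: four Hermite components per bra pair
+ // (R_000, R_001, R_010, R_100) against a single ket component.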
< 256) { + int task_kl = blockIdx.y * 256 + sq_id; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int ksh = pair_kl / nbas; + int lsh = pair_kl % nbas; + double ak = env[bas[ksh*BAS_SLOTS+PTR_EXP]]; + double al = env[bas[lsh*BAS_SLOTS+PTR_EXP]]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD]; + double akl = ak + al; + double xkl = (ak * rk[0] + al * rl[0]) / akl; + double ykl = (ak * rk[1] + al * rl[1]) / akl; + double zkl = (ak * rk[2] + al * rl[2]) / akl; + Rq_cache[sq_id+0] = xkl; + Rq_cache[sq_id+256] = ykl; + Rq_cache[sq_id+512] = zkl; + Rq_cache[sq_id+768] = akl; + } else { + Rq_cache[sq_id+768] = 1.; + } + } + for (int n = ty; n < 64; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_ij = blockIdx.x * 256 + tile * 16 + tx; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int dm_ij_pair0 = dm_pair_loc[pair_ij]; + int sq_ij = tx + tile * 16; + dm_ij_cache[sq_ij+i*256] = dm[dm_ij_pair0+i]; + } + } + for (int n = tx; n < 16; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_kl = blockIdx.y * 256 + tile * 16 + ty; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int dm_kl_pair0 = dm_pair_loc[pair_kl]; + int sq_kl = ty + tile * 16; + atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*256]); + dm_kl_cache[sq_kl+i*256] = dm[dm_kl_pair0+i]; + } + } + __syncthreads(); + + for (int batch_ij = 0; batch_ij < 16; ++batch_ij) { + for (int batch_kl = 0; batch_kl < 16; ++batch_kl) { + int task_ij0 = blockIdx.x * 256 + batch_ij * 16; + int task_kl0 = blockIdx.y * 256 + batch_kl * 16; + if (task_ij0 >= npairs_ij || task_kl0 >= npairs_kl) { + continue; + } + int pair_ij0 = pair_ij_mapping[task_ij0]; + int pair_kl0 = pair_kl_mapping[task_kl0]; + if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) { + continue; + } + + int sq_ij = tx + batch_ij * 16; + int sq_kl = ty + batch_kl * 16; + int task_ij = task_ij0 + tx; + int task_kl = task_kl0 + ty; + double fac_sym = PI_FAC; + if (task_ij >= npairs_ij) { + task_ij = task_ij0; + fac_sym = 0.; + } + if (task_kl >= npairs_kl) { + task_kl = task_kl0; + fac_sym = 0.; + } + int pair_ij = pair_ij_mapping[task_ij]; + int pair_kl = pair_kl_mapping[task_kl]; + + int ish = pair_ij / nbas; + int jsh = pair_ij % nbas; + int ksh = pair_kl / nbas; + int lsh = pair_kl % nbas; + if (ish == jsh) fac_sym *= .5; + if (ksh == lsh) fac_sym *= .5; + if (pair_ij_mapping == pair_kl_mapping) { + if (task_ij == task_kl) fac_sym *= .5; + if (task_ij < task_kl) fac_sym = 0.; + } + double xij = Rp_cache[sq_ij+0]; + double yij = Rp_cache[sq_ij+256]; + double zij = Rp_cache[sq_ij+512]; + double aij = Rp_cache[sq_ij+768]; + double xkl = Rq_cache[sq_kl+0]; + double ykl = Rq_cache[sq_kl+256]; + double zkl = Rq_cache[sq_kl+512]; + double akl = Rq_cache[sq_kl+768]; + double fac = fac_sym / (aij*akl*sqrt(aij+akl)); + double xpq = xij - xkl; + double ypq = yij - ykl; + double zpq = zij - zkl; + double rr = xpq*xpq + ypq*ypq + zpq*zpq; + double theta = aij * akl / (aij + akl); + double theta_rr = theta * rr; + eval_gamma_inc_fn(gamma_inc, theta_rr, 1); + double a2 = -2. 
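+ // First-order Hermite integrals are formed inline below:
+ // R_0_0_0_1 = zpq * gamma_inc[1], R_0_0_1_0 = ypq * gamma_inc[1],
+ // R_0_1_0_0 = xpq * gamma_inc[1], with gamma_inc[1] already rescaled by
+ // fac * (-2*theta).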
* theta; + gamma_inc[sq_id] *= fac; + for (int i = 1; i <= 1; i++) { + fac *= a2; + gamma_inc[sq_id+i*256] *= fac; + } + vj_kl = 0.; + vj_kl += gamma_inc[sq_id+0*256] * dm_ij_cache[sq_ij+0]; + double R_0_0_0_1 = zpq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_0_1 * dm_ij_cache[sq_ij+256]; + double R_0_0_1_0 = ypq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_1_0 * dm_ij_cache[sq_ij+512]; + double R_0_1_0_0 = xpq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_1_0_0 * dm_ij_cache[sq_ij+768]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+0] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += gamma_inc[sq_id+0*256] * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+0] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_0_1 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+256] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_1_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+512] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_0_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+768] += vj_cache[sq_id]; + } + __syncthreads(); + } } + for (int n = ty; n < 64; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_ij = blockIdx.x * 256 + tile * 16 + tx; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int dm_ij_pair0 = dm_pair_loc[pair_ij]; + int sq_ij = tx + tile * 16; + atomicAdd(vj+dm_ij_pair0+i, vj_ij_cache[sq_ij+i*256]); + } + } + for (int n = tx; n < 16; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_kl = blockIdx.y * 256 + tile * 16 + ty; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int dm_kl_pair0 = dm_pair_loc[pair_kl]; + int sq_kl = ty + tile * 16; + atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*256]); + } + } +} + +// TILEX=16, TILEY=16, cache_dm=True +__global__ +void md_j_1_1(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds) +{ + int *pair_ij_mapping = bounds.tile_ij_mapping; + int *pair_kl_mapping = bounds.tile_kl_mapping; + int task_ij0 = blockIdx.x * 256; + int task_kl0 = blockIdx.y * 256; + int pair_ij0 = pair_ij_mapping[task_ij0]; + int pair_kl0 = pair_kl_mapping[task_kl0]; + float *q_cond = bounds.q_cond; + if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) { + return; + } + + int tx = threadIdx.x; + int ty = threadIdx.y; + int sq_id = tx + 16 * ty; + int *bas = envs.bas; + int 
*dm_pair_loc = envs.ao_loc; + int nbas = envs.nbas; + double *env = envs.env; + double *dm = jk.dm; + double *vj = jk.vj; + double vj_ij, vj_kl; + + int npairs_ij = bounds.npairs_ij; + int npairs_kl = bounds.npairs_kl; + extern __shared__ double gamma_inc[]; + double *Rp_cache = gamma_inc + 768; + double *Rq_cache = Rp_cache + 1024; + double *vj_ij_cache = Rq_cache + 1024; + double *vj_kl_cache = vj_ij_cache + 1024; + double *vj_cache = vj_kl_cache + 1024; + double *dm_ij_cache = vj_cache + 256; + double *dm_kl_cache = dm_ij_cache + 1024; + // zero out all cache; + for (int n = sq_id; n < 6400; n += 256) { + Rp_cache[n] = 0.; + } + __syncthreads(); + + if (sq_id < 256) { + int task_ij = blockIdx.x * 256 + sq_id; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int ish = pair_ij / nbas; + int jsh = pair_ij % nbas; + double ai = env[bas[ish*BAS_SLOTS+PTR_EXP]]; + double aj = env[bas[jsh*BAS_SLOTS+PTR_EXP]]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double aij = ai + aj; + double xij = (ai * ri[0] + aj * rj[0]) / aij; + double yij = (ai * ri[1] + aj * rj[1]) / aij; + double zij = (ai * ri[2] + aj * rj[2]) / aij; + Rp_cache[sq_id+0] = xij; + Rp_cache[sq_id+256] = yij; + Rp_cache[sq_id+512] = zij; + Rp_cache[sq_id+768] = aij; + } else { + Rp_cache[sq_id+768] = 1.; + } + } + if (sq_id < 256) { + int task_kl = blockIdx.y * 256 + sq_id; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int ksh = pair_kl / nbas; + int lsh = pair_kl % nbas; + double ak = env[bas[ksh*BAS_SLOTS+PTR_EXP]]; + double al = env[bas[lsh*BAS_SLOTS+PTR_EXP]]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD]; + double akl = ak + al; + double xkl = (ak * rk[0] + al * rl[0]) / akl; + double ykl = (ak * rk[1] + al * rl[1]) / akl; + double zkl = (ak * rk[2] + al * rl[2]) / akl; + Rq_cache[sq_id+0] = xkl; + Rq_cache[sq_id+256] = ykl; + Rq_cache[sq_id+512] = zkl; + Rq_cache[sq_id+768] = akl; + } else { + Rq_cache[sq_id+768] = 1.; + } + } + for (int n = ty; n < 64; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_ij = blockIdx.x * 256 + tile * 16 + tx; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int dm_ij_pair0 = dm_pair_loc[pair_ij]; + int sq_ij = tx + tile * 16; + dm_ij_cache[sq_ij+i*256] = dm[dm_ij_pair0+i]; + } + } + for (int n = tx; n < 64; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_kl = blockIdx.y * 256 + tile * 16 + ty; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int dm_kl_pair0 = dm_pair_loc[pair_kl]; + int sq_kl = ty + tile * 16; + atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*256]); + dm_kl_cache[sq_kl+i*256] = dm[dm_kl_pair0+i]; + } + } + __syncthreads(); + + for (int batch_ij = 0; batch_ij < 16; ++batch_ij) { + for (int batch_kl = 0; batch_kl < 16; ++batch_kl) { + int task_ij0 = blockIdx.x * 256 + batch_ij * 16; + int task_kl0 = blockIdx.y * 256 + batch_kl * 16; + if (task_ij0 >= npairs_ij || task_kl0 >= npairs_kl) { + continue; + } + int pair_ij0 = pair_ij_mapping[task_ij0]; + int pair_kl0 = pair_kl_mapping[task_kl0]; + if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) { + continue; + } + + int sq_ij = tx + batch_ij * 16; + int sq_kl = ty + batch_kl * 16; + int task_ij = task_ij0 + tx; + int task_kl = task_kl0 + ty; + double fac_sym = PI_FAC; + if (task_ij >= npairs_ij) { + task_ij = task_ij0; + fac_sym = 0.; + } + if (task_kl >= npairs_kl) { + task_kl 
= task_kl0; + fac_sym = 0.; + } + int pair_ij = pair_ij_mapping[task_ij]; + int pair_kl = pair_kl_mapping[task_kl]; + + int ish = pair_ij / nbas; + int jsh = pair_ij % nbas; + int ksh = pair_kl / nbas; + int lsh = pair_kl % nbas; + if (ish == jsh) fac_sym *= .5; + if (ksh == lsh) fac_sym *= .5; + if (pair_ij_mapping == pair_kl_mapping) { + if (task_ij == task_kl) fac_sym *= .5; + if (task_ij < task_kl) fac_sym = 0.; + } + double xij = Rp_cache[sq_ij+0]; + double yij = Rp_cache[sq_ij+256]; + double zij = Rp_cache[sq_ij+512]; + double aij = Rp_cache[sq_ij+768]; + double xkl = Rq_cache[sq_kl+0]; + double ykl = Rq_cache[sq_kl+256]; + double zkl = Rq_cache[sq_kl+512]; + double akl = Rq_cache[sq_kl+768]; + double fac = fac_sym / (aij*akl*sqrt(aij+akl)); + double xpq = xij - xkl; + double ypq = yij - ykl; + double zpq = zij - zkl; + double rr = xpq*xpq + ypq*ypq + zpq*zpq; + double theta = aij * akl / (aij + akl); + double theta_rr = theta * rr; + eval_gamma_inc_fn(gamma_inc, theta_rr, 2); + double a2 = -2. * theta; + gamma_inc[sq_id] *= fac; + for (int i = 1; i <= 2; i++) { + fac *= a2; + gamma_inc[sq_id+i*256] *= fac; + } + vj_kl = 0.; + vj_kl += gamma_inc[sq_id+0*256] * dm_ij_cache[sq_ij+0]; + double R_0_0_0_1 = zpq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_0_1 * dm_ij_cache[sq_ij+256]; + double R_0_0_1_0 = ypq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_1_0 * dm_ij_cache[sq_ij+512]; + double R_0_1_0_0 = xpq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_1_0_0 * dm_ij_cache[sq_ij+768]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+0] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl -= R_0_0_0_1 * dm_ij_cache[sq_ij+0]; + double R_1_0_0_1 = zpq * gamma_inc[sq_id+2*256]; + double R_0_0_0_2 = zpq * R_1_0_0_1 + 1 * gamma_inc[sq_id+1*256]; + vj_kl -= R_0_0_0_2 * dm_ij_cache[sq_ij+256]; + double R_0_0_1_1 = ypq * R_1_0_0_1; + vj_kl -= R_0_0_1_1 * dm_ij_cache[sq_ij+512]; + double R_0_1_0_1 = xpq * R_1_0_0_1; + vj_kl -= R_0_1_0_1 * dm_ij_cache[sq_ij+768]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+256] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl -= R_0_0_1_0 * dm_ij_cache[sq_ij+0]; + vj_kl -= R_0_0_1_1 * dm_ij_cache[sq_ij+256]; + double R_1_0_1_0 = ypq * gamma_inc[sq_id+2*256]; + double R_0_0_2_0 = ypq * R_1_0_1_0 + 1 * gamma_inc[sq_id+1*256]; + vj_kl -= R_0_0_2_0 * dm_ij_cache[sq_ij+512]; + double R_0_1_1_0 = xpq * R_1_0_1_0; + vj_kl -= R_0_1_1_0 * dm_ij_cache[sq_ij+768]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+512] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl -= R_0_1_0_0 * dm_ij_cache[sq_ij+0]; + vj_kl -= R_0_1_0_1 * dm_ij_cache[sq_ij+256]; + vj_kl -= R_0_1_1_0 * dm_ij_cache[sq_ij+512]; + double R_1_1_0_0 = xpq * gamma_inc[sq_id+2*256]; + double R_0_2_0_0 = xpq * R_1_1_0_0 + 1 * gamma_inc[sq_id+1*256]; + vj_kl -= R_0_2_0_0 * dm_ij_cache[sq_ij+768]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 
0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+768] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += gamma_inc[sq_id+0*256] * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_0_1 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_0_1_0 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_1_0_0 * dm_kl_cache[sq_kl+768]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+0] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_0_1 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_0_2 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_0_1_1 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_1_0_1 * dm_kl_cache[sq_kl+768]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+256] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_1_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_1_1 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_0_2_0 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_1_1_0 * dm_kl_cache[sq_kl+768]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+512] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_0_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_1_0_1 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_1_1_0 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_2_0_0 * dm_kl_cache[sq_kl+768]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+768] += vj_cache[sq_id]; + } + __syncthreads(); + } } + for (int n = ty; n < 64; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_ij = blockIdx.x * 256 + tile * 16 + tx; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int dm_ij_pair0 = dm_pair_loc[pair_ij]; + int sq_ij = tx + tile * 16; + atomicAdd(vj+dm_ij_pair0+i, vj_ij_cache[sq_ij+i*256]); + } + } + for (int n = tx; n < 64; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_kl = blockIdx.y * 256 + tile * 16 + ty; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int dm_kl_pair0 = dm_pair_loc[pair_kl]; + int sq_kl = ty + tile * 16; + atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*256]); + } + } +} + +// TILEX=16, TILEY=8, cache_dm=True +__global__ +void md_j_1_2(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds) +{ + int *pair_ij_mapping = bounds.tile_ij_mapping; + int *pair_kl_mapping = bounds.tile_kl_mapping; + int task_ij0 = blockIdx.x * 256; + int task_kl0 = blockIdx.y * 128; + int pair_ij0 = pair_ij_mapping[task_ij0]; + int pair_kl0 = pair_kl_mapping[task_kl0]; + float *q_cond = bounds.q_cond; + if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) { + return; + } + + int tx = threadIdx.x; + int ty = threadIdx.y; + int sq_id = tx + 16 * ty; + int *bas = envs.bas; + int 
*dm_pair_loc = envs.ao_loc; + int nbas = envs.nbas; + double *env = envs.env; + double *dm = jk.dm; + double *vj = jk.vj; + double vj_ij, vj_kl; + + int npairs_ij = bounds.npairs_ij; + int npairs_kl = bounds.npairs_kl; + extern __shared__ double gamma_inc[]; + double *Rp_cache = gamma_inc + 1024; + double *Rq_cache = Rp_cache + 1024; + double *vj_ij_cache = Rq_cache + 512; + double *vj_kl_cache = vj_ij_cache + 1024; + double *vj_cache = vj_kl_cache + 1280; + double *dm_ij_cache = vj_cache + 256; + double *dm_kl_cache = dm_ij_cache + 1024; + // zero out all cache; + for (int n = sq_id; n < 6400; n += 256) { + Rp_cache[n] = 0.; + } + __syncthreads(); + + if (sq_id < 256) { + int task_ij = blockIdx.x * 256 + sq_id; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int ish = pair_ij / nbas; + int jsh = pair_ij % nbas; + double ai = env[bas[ish*BAS_SLOTS+PTR_EXP]]; + double aj = env[bas[jsh*BAS_SLOTS+PTR_EXP]]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double aij = ai + aj; + double xij = (ai * ri[0] + aj * rj[0]) / aij; + double yij = (ai * ri[1] + aj * rj[1]) / aij; + double zij = (ai * ri[2] + aj * rj[2]) / aij; + Rp_cache[sq_id+0] = xij; + Rp_cache[sq_id+256] = yij; + Rp_cache[sq_id+512] = zij; + Rp_cache[sq_id+768] = aij; + } else { + Rp_cache[sq_id+768] = 1.; + } + } + if (sq_id < 128) { + int task_kl = blockIdx.y * 128 + sq_id; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int ksh = pair_kl / nbas; + int lsh = pair_kl % nbas; + double ak = env[bas[ksh*BAS_SLOTS+PTR_EXP]]; + double al = env[bas[lsh*BAS_SLOTS+PTR_EXP]]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD]; + double akl = ak + al; + double xkl = (ak * rk[0] + al * rl[0]) / akl; + double ykl = (ak * rk[1] + al * rl[1]) / akl; + double zkl = (ak * rk[2] + al * rl[2]) / akl; + Rq_cache[sq_id+0] = xkl; + Rq_cache[sq_id+128] = ykl; + Rq_cache[sq_id+256] = zkl; + Rq_cache[sq_id+384] = akl; + } else { + Rq_cache[sq_id+384] = 1.; + } + } + for (int n = ty; n < 64; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_ij = blockIdx.x * 256 + tile * 16 + tx; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int dm_ij_pair0 = dm_pair_loc[pair_ij]; + int sq_ij = tx + tile * 16; + dm_ij_cache[sq_ij+i*256] = dm[dm_ij_pair0+i]; + } + } + for (int n = tx; n < 80; n += 16) { + int i = n / 8; + int tile = n % 8; + int task_kl = blockIdx.y * 128 + tile * 16 + ty; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int dm_kl_pair0 = dm_pair_loc[pair_kl]; + int sq_kl = ty + tile * 16; + atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*128]); + dm_kl_cache[sq_kl+i*128] = dm[dm_kl_pair0+i]; + } + } + __syncthreads(); + + for (int batch_ij = 0; batch_ij < 16; ++batch_ij) { + for (int batch_kl = 0; batch_kl < 8; ++batch_kl) { + int task_ij0 = blockIdx.x * 256 + batch_ij * 16; + int task_kl0 = blockIdx.y * 128 + batch_kl * 16; + if (task_ij0 >= npairs_ij || task_kl0 >= npairs_kl) { + continue; + } + int pair_ij0 = pair_ij_mapping[task_ij0]; + int pair_kl0 = pair_kl_mapping[task_kl0]; + if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) { + continue; + } + + int sq_ij = tx + batch_ij * 16; + int sq_kl = ty + batch_kl * 16; + int task_ij = task_ij0 + tx; + int task_kl = task_kl0 + ty; + double fac_sym = PI_FAC; + if (task_ij >= npairs_ij) { + task_ij = task_ij0; + fac_sym = 0.; + } + if (task_kl >= npairs_kl) { + task_kl = 
task_kl0; + fac_sym = 0.; + } + int pair_ij = pair_ij_mapping[task_ij]; + int pair_kl = pair_kl_mapping[task_kl]; + + int ish = pair_ij / nbas; + int jsh = pair_ij % nbas; + int ksh = pair_kl / nbas; + int lsh = pair_kl % nbas; + if (ish == jsh) fac_sym *= .5; + if (ksh == lsh) fac_sym *= .5; + if (pair_ij_mapping == pair_kl_mapping) { + if (task_ij == task_kl) fac_sym *= .5; + if (task_ij < task_kl) fac_sym = 0.; + } + double xij = Rp_cache[sq_ij+0]; + double yij = Rp_cache[sq_ij+256]; + double zij = Rp_cache[sq_ij+512]; + double aij = Rp_cache[sq_ij+768]; + double xkl = Rq_cache[sq_kl+0]; + double ykl = Rq_cache[sq_kl+128]; + double zkl = Rq_cache[sq_kl+256]; + double akl = Rq_cache[sq_kl+384]; + double fac = fac_sym / (aij*akl*sqrt(aij+akl)); + double xpq = xij - xkl; + double ypq = yij - ykl; + double zpq = zij - zkl; + double rr = xpq*xpq + ypq*ypq + zpq*zpq; + double theta = aij * akl / (aij + akl); + double theta_rr = theta * rr; + eval_gamma_inc_fn(gamma_inc, theta_rr, 3); + double a2 = -2. * theta; + gamma_inc[sq_id] *= fac; + for (int i = 1; i <= 3; i++) { + fac *= a2; + gamma_inc[sq_id+i*256] *= fac; + } + vj_kl = 0.; + vj_kl += gamma_inc[sq_id+0*256] * dm_ij_cache[sq_ij+0]; + double R_0_0_0_1 = zpq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_0_1 * dm_ij_cache[sq_ij+256]; + double R_0_0_1_0 = ypq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_1_0 * dm_ij_cache[sq_ij+512]; + double R_0_1_0_0 = xpq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_1_0_0 * dm_ij_cache[sq_ij+768]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+0] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl -= R_0_0_0_1 * dm_ij_cache[sq_ij+0]; + double R_1_0_0_1 = zpq * gamma_inc[sq_id+2*256]; + double R_0_0_0_2 = zpq * R_1_0_0_1 + 1 * gamma_inc[sq_id+1*256]; + vj_kl -= R_0_0_0_2 * dm_ij_cache[sq_ij+256]; + double R_0_0_1_1 = ypq * R_1_0_0_1; + vj_kl -= R_0_0_1_1 * dm_ij_cache[sq_ij+512]; + double R_0_1_0_1 = xpq * R_1_0_0_1; + vj_kl -= R_0_1_0_1 * dm_ij_cache[sq_ij+768]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+128] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl += R_0_0_0_2 * dm_ij_cache[sq_ij+0]; + double R_2_0_0_1 = zpq * gamma_inc[sq_id+3*256]; + double R_1_0_0_2 = zpq * R_2_0_0_1 + 1 * gamma_inc[sq_id+2*256]; + double R_0_0_0_3 = zpq * R_1_0_0_2 + 2 * R_1_0_0_1; + vj_kl += R_0_0_0_3 * dm_ij_cache[sq_ij+256]; + double R_0_0_1_2 = ypq * R_1_0_0_2; + vj_kl += R_0_0_1_2 * dm_ij_cache[sq_ij+512]; + double R_0_1_0_2 = xpq * R_1_0_0_2; + vj_kl += R_0_1_0_2 * dm_ij_cache[sq_ij+768]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+256] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl -= R_0_0_1_0 * dm_ij_cache[sq_ij+0]; + vj_kl -= R_0_0_1_1 * dm_ij_cache[sq_ij+256]; + double R_1_0_1_0 = ypq * gamma_inc[sq_id+2*256]; + double R_0_0_2_0 = ypq * R_1_0_1_0 + 1 * gamma_inc[sq_id+1*256]; + vj_kl -= R_0_0_2_0 * dm_ij_cache[sq_ij+512]; + double R_0_1_1_0 = xpq * 
R_1_0_1_0; + vj_kl -= R_0_1_1_0 * dm_ij_cache[sq_ij+768]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+384] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl += R_0_0_1_1 * dm_ij_cache[sq_ij+0]; + vj_kl += R_0_0_1_2 * dm_ij_cache[sq_ij+256]; + double R_1_0_1_1 = ypq * R_2_0_0_1; + double R_0_0_2_1 = ypq * R_1_0_1_1 + 1 * R_1_0_0_1; + vj_kl += R_0_0_2_1 * dm_ij_cache[sq_ij+512]; + double R_0_1_1_1 = xpq * R_1_0_1_1; + vj_kl += R_0_1_1_1 * dm_ij_cache[sq_ij+768]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+512] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl += R_0_0_2_0 * dm_ij_cache[sq_ij+0]; + vj_kl += R_0_0_2_1 * dm_ij_cache[sq_ij+256]; + double R_2_0_1_0 = ypq * gamma_inc[sq_id+3*256]; + double R_1_0_2_0 = ypq * R_2_0_1_0 + 1 * gamma_inc[sq_id+2*256]; + double R_0_0_3_0 = ypq * R_1_0_2_0 + 2 * R_1_0_1_0; + vj_kl += R_0_0_3_0 * dm_ij_cache[sq_ij+512]; + double R_0_1_2_0 = xpq * R_1_0_2_0; + vj_kl += R_0_1_2_0 * dm_ij_cache[sq_ij+768]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+640] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl -= R_0_1_0_0 * dm_ij_cache[sq_ij+0]; + vj_kl -= R_0_1_0_1 * dm_ij_cache[sq_ij+256]; + vj_kl -= R_0_1_1_0 * dm_ij_cache[sq_ij+512]; + double R_1_1_0_0 = xpq * gamma_inc[sq_id+2*256]; + double R_0_2_0_0 = xpq * R_1_1_0_0 + 1 * gamma_inc[sq_id+1*256]; + vj_kl -= R_0_2_0_0 * dm_ij_cache[sq_ij+768]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+768] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl += R_0_1_0_1 * dm_ij_cache[sq_ij+0]; + vj_kl += R_0_1_0_2 * dm_ij_cache[sq_ij+256]; + vj_kl += R_0_1_1_1 * dm_ij_cache[sq_ij+512]; + double R_1_1_0_1 = xpq * R_2_0_0_1; + double R_0_2_0_1 = xpq * R_1_1_0_1 + 1 * R_1_0_0_1; + vj_kl += R_0_2_0_1 * dm_ij_cache[sq_ij+768]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+896] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl += R_0_1_1_0 * dm_ij_cache[sq_ij+0]; + vj_kl += R_0_1_1_1 * dm_ij_cache[sq_ij+256]; + vj_kl += R_0_1_2_0 * dm_ij_cache[sq_ij+512]; + double R_1_1_1_0 = xpq * R_2_0_1_0; + double R_0_2_1_0 = xpq * R_1_1_1_0 + 1 * R_1_0_1_0; + vj_kl += R_0_2_1_0 * dm_ij_cache[sq_ij+768]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+1024] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl += R_0_2_0_0 
* dm_ij_cache[sq_ij+0]; + vj_kl += R_0_2_0_1 * dm_ij_cache[sq_ij+256]; + vj_kl += R_0_2_1_0 * dm_ij_cache[sq_ij+512]; + double R_2_1_0_0 = xpq * gamma_inc[sq_id+3*256]; + double R_1_2_0_0 = xpq * R_2_1_0_0 + 1 * gamma_inc[sq_id+2*256]; + double R_0_3_0_0 = xpq * R_1_2_0_0 + 2 * R_1_1_0_0; + vj_kl += R_0_3_0_0 * dm_ij_cache[sq_ij+768]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+1152] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += gamma_inc[sq_id+0*256] * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_0_1 * dm_kl_cache[sq_kl+128]; + vj_ij += R_0_0_0_2 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_0_1_0 * dm_kl_cache[sq_kl+384]; + vj_ij += R_0_0_1_1 * dm_kl_cache[sq_kl+512]; + vj_ij += R_0_0_2_0 * dm_kl_cache[sq_kl+640]; + vj_ij -= R_0_1_0_0 * dm_kl_cache[sq_kl+768]; + vj_ij += R_0_1_0_1 * dm_kl_cache[sq_kl+896]; + vj_ij += R_0_1_1_0 * dm_kl_cache[sq_kl+1024]; + vj_ij += R_0_2_0_0 * dm_kl_cache[sq_kl+1152]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+0] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_0_1 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_0_2 * dm_kl_cache[sq_kl+128]; + vj_ij += R_0_0_0_3 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_0_1_1 * dm_kl_cache[sq_kl+384]; + vj_ij += R_0_0_1_2 * dm_kl_cache[sq_kl+512]; + vj_ij += R_0_0_2_1 * dm_kl_cache[sq_kl+640]; + vj_ij -= R_0_1_0_1 * dm_kl_cache[sq_kl+768]; + vj_ij += R_0_1_0_2 * dm_kl_cache[sq_kl+896]; + vj_ij += R_0_1_1_1 * dm_kl_cache[sq_kl+1024]; + vj_ij += R_0_2_0_1 * dm_kl_cache[sq_kl+1152]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+256] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_1_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_1_1 * dm_kl_cache[sq_kl+128]; + vj_ij += R_0_0_1_2 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_0_2_0 * dm_kl_cache[sq_kl+384]; + vj_ij += R_0_0_2_1 * dm_kl_cache[sq_kl+512]; + vj_ij += R_0_0_3_0 * dm_kl_cache[sq_kl+640]; + vj_ij -= R_0_1_1_0 * dm_kl_cache[sq_kl+768]; + vj_ij += R_0_1_1_1 * dm_kl_cache[sq_kl+896]; + vj_ij += R_0_1_2_0 * dm_kl_cache[sq_kl+1024]; + vj_ij += R_0_2_1_0 * dm_kl_cache[sq_kl+1152]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+512] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_0_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_1_0_1 * dm_kl_cache[sq_kl+128]; + vj_ij += R_0_1_0_2 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_1_1_0 * dm_kl_cache[sq_kl+384]; + vj_ij += R_0_1_1_1 * dm_kl_cache[sq_kl+512]; + vj_ij += R_0_1_2_0 * dm_kl_cache[sq_kl+640]; + vj_ij -= R_0_2_0_0 * dm_kl_cache[sq_kl+768]; + vj_ij += R_0_2_0_1 * dm_kl_cache[sq_kl+896]; + vj_ij += R_0_2_1_0 * dm_kl_cache[sq_kl+1024]; + vj_ij += R_0_3_0_0 * dm_kl_cache[sq_kl+1152]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + 
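+ // reduce the 16 partial sums along ty with a shared-memory binary tree; after the loop the ty==0 row holds the full vj contribution for this ij pair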
for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+768] += vj_cache[sq_id]; + } + __syncthreads(); + } } + for (int n = ty; n < 64; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_ij = blockIdx.x * 256 + tile * 16 + tx; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int dm_ij_pair0 = dm_pair_loc[pair_ij]; + int sq_ij = tx + tile * 16; + atomicAdd(vj+dm_ij_pair0+i, vj_ij_cache[sq_ij+i*256]); + } + } + for (int n = tx; n < 80; n += 16) { + int i = n / 8; + int tile = n % 8; + int task_kl = blockIdx.y * 128 + tile * 16 + ty; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int dm_kl_pair0 = dm_pair_loc[pair_kl]; + int sq_kl = ty + tile * 16; + atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*128]); + } + } +} + +// TILEX=16, TILEY=16, cache_dm=True +__global__ +void md_j_2_0(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds) +{ + int *pair_ij_mapping = bounds.tile_ij_mapping; + int *pair_kl_mapping = bounds.tile_kl_mapping; + int task_ij0 = blockIdx.x * 256; + int task_kl0 = blockIdx.y * 256; + int pair_ij0 = pair_ij_mapping[task_ij0]; + int pair_kl0 = pair_kl_mapping[task_kl0]; + float *q_cond = bounds.q_cond; + if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) { + return; + } + + int tx = threadIdx.x; + int ty = threadIdx.y; + int sq_id = tx + 16 * ty; + int *bas = envs.bas; + int *dm_pair_loc = envs.ao_loc; + int nbas = envs.nbas; + double *env = envs.env; + double *dm = jk.dm; + double *vj = jk.vj; + double vj_ij, vj_kl; + + int npairs_ij = bounds.npairs_ij; + int npairs_kl = bounds.npairs_kl; + extern __shared__ double gamma_inc[]; + double *Rp_cache = gamma_inc + 768; + double *Rq_cache = Rp_cache + 1024; + double *vj_ij_cache = Rq_cache + 1024; + double *vj_kl_cache = vj_ij_cache + 2560; + double *vj_cache = vj_kl_cache + 256; + double *dm_ij_cache = vj_cache + 256; + double *dm_kl_cache = dm_ij_cache + 2560; + // zero out all cache; + for (int n = sq_id; n < 7936; n += 256) { + Rp_cache[n] = 0.; + } + __syncthreads(); + + if (sq_id < 256) { + int task_ij = blockIdx.x * 256 + sq_id; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int ish = pair_ij / nbas; + int jsh = pair_ij % nbas; + double ai = env[bas[ish*BAS_SLOTS+PTR_EXP]]; + double aj = env[bas[jsh*BAS_SLOTS+PTR_EXP]]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double aij = ai + aj; + double xij = (ai * ri[0] + aj * rj[0]) / aij; + double yij = (ai * ri[1] + aj * rj[1]) / aij; + double zij = (ai * ri[2] + aj * rj[2]) / aij; + Rp_cache[sq_id+0] = xij; + Rp_cache[sq_id+256] = yij; + Rp_cache[sq_id+512] = zij; + Rp_cache[sq_id+768] = aij; + } else { + Rp_cache[sq_id+768] = 1.; + } + } + if (sq_id < 256) { + int task_kl = blockIdx.y * 256 + sq_id; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int ksh = pair_kl / nbas; + int lsh = pair_kl % nbas; + double ak = env[bas[ksh*BAS_SLOTS+PTR_EXP]]; + double al = env[bas[lsh*BAS_SLOTS+PTR_EXP]]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD]; + double akl = ak + al; + double xkl = (ak * rk[0] + al * rl[0]) / akl; + double ykl = (ak * rk[1] + al * rl[1]) / akl; + double zkl = (ak * rk[2] + al * rl[2]) / akl; + Rq_cache[sq_id+0] = xkl; + Rq_cache[sq_id+256] = ykl; + 
Rq_cache[sq_id+512] = zkl; + Rq_cache[sq_id+768] = akl; + } else { + Rq_cache[sq_id+768] = 1.; + } + } + for (int n = ty; n < 160; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_ij = blockIdx.x * 256 + tile * 16 + tx; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int dm_ij_pair0 = dm_pair_loc[pair_ij]; + int sq_ij = tx + tile * 16; + dm_ij_cache[sq_ij+i*256] = dm[dm_ij_pair0+i]; + } + } + for (int n = tx; n < 16; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_kl = blockIdx.y * 256 + tile * 16 + ty; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int dm_kl_pair0 = dm_pair_loc[pair_kl]; + int sq_kl = ty + tile * 16; + atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*256]); + dm_kl_cache[sq_kl+i*256] = dm[dm_kl_pair0+i]; + } + } + __syncthreads(); + + for (int batch_ij = 0; batch_ij < 16; ++batch_ij) { + for (int batch_kl = 0; batch_kl < 16; ++batch_kl) { + int task_ij0 = blockIdx.x * 256 + batch_ij * 16; + int task_kl0 = blockIdx.y * 256 + batch_kl * 16; + if (task_ij0 >= npairs_ij || task_kl0 >= npairs_kl) { + continue; + } + int pair_ij0 = pair_ij_mapping[task_ij0]; + int pair_kl0 = pair_kl_mapping[task_kl0]; + if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) { + continue; + } + + int sq_ij = tx + batch_ij * 16; + int sq_kl = ty + batch_kl * 16; + int task_ij = task_ij0 + tx; + int task_kl = task_kl0 + ty; + double fac_sym = PI_FAC; + if (task_ij >= npairs_ij) { + task_ij = task_ij0; + fac_sym = 0.; + } + if (task_kl >= npairs_kl) { + task_kl = task_kl0; + fac_sym = 0.; + } + int pair_ij = pair_ij_mapping[task_ij]; + int pair_kl = pair_kl_mapping[task_kl]; + + int ish = pair_ij / nbas; + int jsh = pair_ij % nbas; + int ksh = pair_kl / nbas; + int lsh = pair_kl % nbas; + if (ish == jsh) fac_sym *= .5; + if (ksh == lsh) fac_sym *= .5; + if (pair_ij_mapping == pair_kl_mapping) { + if (task_ij == task_kl) fac_sym *= .5; + if (task_ij < task_kl) fac_sym = 0.; + } + double xij = Rp_cache[sq_ij+0]; + double yij = Rp_cache[sq_ij+256]; + double zij = Rp_cache[sq_ij+512]; + double aij = Rp_cache[sq_ij+768]; + double xkl = Rq_cache[sq_kl+0]; + double ykl = Rq_cache[sq_kl+256]; + double zkl = Rq_cache[sq_kl+512]; + double akl = Rq_cache[sq_kl+768]; + double fac = fac_sym / (aij*akl*sqrt(aij+akl)); + double xpq = xij - xkl; + double ypq = yij - ykl; + double zpq = zij - zkl; + double rr = xpq*xpq + ypq*ypq + zpq*zpq; + double theta = aij * akl / (aij + akl); + double theta_rr = theta * rr; + eval_gamma_inc_fn(gamma_inc, theta_rr, 2); + double a2 = -2. 
* theta; + gamma_inc[sq_id] *= fac; + for (int i = 1; i <= 2; i++) { + fac *= a2; + gamma_inc[sq_id+i*256] *= fac; + } + vj_kl = 0.; + vj_kl += gamma_inc[sq_id+0*256] * dm_ij_cache[sq_ij+0]; + double R_0_0_0_1 = zpq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_0_1 * dm_ij_cache[sq_ij+256]; + double R_1_0_0_1 = zpq * gamma_inc[sq_id+2*256]; + double R_0_0_0_2 = zpq * R_1_0_0_1 + 1 * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_0_2 * dm_ij_cache[sq_ij+512]; + double R_0_0_1_0 = ypq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_1_0 * dm_ij_cache[sq_ij+768]; + double R_0_0_1_1 = ypq * R_1_0_0_1; + vj_kl += R_0_0_1_1 * dm_ij_cache[sq_ij+1024]; + double R_1_0_1_0 = ypq * gamma_inc[sq_id+2*256]; + double R_0_0_2_0 = ypq * R_1_0_1_0 + 1 * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_2_0 * dm_ij_cache[sq_ij+1280]; + double R_0_1_0_0 = xpq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_1_0_0 * dm_ij_cache[sq_ij+1536]; + double R_0_1_0_1 = xpq * R_1_0_0_1; + vj_kl += R_0_1_0_1 * dm_ij_cache[sq_ij+1792]; + double R_0_1_1_0 = xpq * R_1_0_1_0; + vj_kl += R_0_1_1_0 * dm_ij_cache[sq_ij+2048]; + double R_1_1_0_0 = xpq * gamma_inc[sq_id+2*256]; + double R_0_2_0_0 = xpq * R_1_1_0_0 + 1 * gamma_inc[sq_id+1*256]; + vj_kl += R_0_2_0_0 * dm_ij_cache[sq_ij+2304]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+0] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += gamma_inc[sq_id+0*256] * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+0] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_0_1 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+256] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_0_2 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+512] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_1_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+768] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_1_1 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1024] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_2_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + 
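+ // the active thread set halves on each pass (stride = 8, 4, 2, 1 along ty)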
if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1280] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_0_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1536] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_0_1 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1792] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_1_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+2048] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_2_0_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+2304] += vj_cache[sq_id]; + } + __syncthreads(); + } } + for (int n = ty; n < 160; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_ij = blockIdx.x * 256 + tile * 16 + tx; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int dm_ij_pair0 = dm_pair_loc[pair_ij]; + int sq_ij = tx + tile * 16; + atomicAdd(vj+dm_ij_pair0+i, vj_ij_cache[sq_ij+i*256]); + } + } + for (int n = tx; n < 16; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_kl = blockIdx.y * 256 + tile * 16 + ty; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int dm_kl_pair0 = dm_pair_loc[pair_kl]; + int sq_kl = ty + tile * 16; + atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*256]); + } + } +} + +// TILEX=16, TILEY=8, cache_dm=True +__global__ +void md_j_2_1(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds) +{ + int *pair_ij_mapping = bounds.tile_ij_mapping; + int *pair_kl_mapping = bounds.tile_kl_mapping; + int task_ij0 = blockIdx.x * 256; + int task_kl0 = blockIdx.y * 128; + int pair_ij0 = pair_ij_mapping[task_ij0]; + int pair_kl0 = pair_kl_mapping[task_kl0]; + float *q_cond = bounds.q_cond; + if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) { + return; + } + + int tx = threadIdx.x; + int ty = threadIdx.y; + int sq_id = tx + 16 * ty; + int *bas = envs.bas; + int *dm_pair_loc = envs.ao_loc; + int nbas = envs.nbas; + double *env = envs.env; + double *dm = jk.dm; + double *vj = jk.vj; + double vj_ij, vj_kl; + + int npairs_ij = bounds.npairs_ij; + int npairs_kl = bounds.npairs_kl; + extern __shared__ double gamma_inc[]; + double *Rp_cache = gamma_inc + 1024; + double *Rq_cache = Rp_cache + 1024; + double *vj_ij_cache = Rq_cache + 512; + double *vj_kl_cache = vj_ij_cache + 2560; + double *vj_cache = vj_kl_cache + 512; + double *dm_ij_cache = vj_cache + 256; + double *dm_kl_cache = dm_ij_cache + 2560; + // zero out all cache; + for (int n = sq_id; n < 7936; n += 
256) { + Rp_cache[n] = 0.; + } + __syncthreads(); + + if (sq_id < 256) { + int task_ij = blockIdx.x * 256 + sq_id; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int ish = pair_ij / nbas; + int jsh = pair_ij % nbas; + double ai = env[bas[ish*BAS_SLOTS+PTR_EXP]]; + double aj = env[bas[jsh*BAS_SLOTS+PTR_EXP]]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double aij = ai + aj; + double xij = (ai * ri[0] + aj * rj[0]) / aij; + double yij = (ai * ri[1] + aj * rj[1]) / aij; + double zij = (ai * ri[2] + aj * rj[2]) / aij; + Rp_cache[sq_id+0] = xij; + Rp_cache[sq_id+256] = yij; + Rp_cache[sq_id+512] = zij; + Rp_cache[sq_id+768] = aij; + } else { + Rp_cache[sq_id+768] = 1.; + } + } + if (sq_id < 128) { + int task_kl = blockIdx.y * 128 + sq_id; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int ksh = pair_kl / nbas; + int lsh = pair_kl % nbas; + double ak = env[bas[ksh*BAS_SLOTS+PTR_EXP]]; + double al = env[bas[lsh*BAS_SLOTS+PTR_EXP]]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD]; + double akl = ak + al; + double xkl = (ak * rk[0] + al * rl[0]) / akl; + double ykl = (ak * rk[1] + al * rl[1]) / akl; + double zkl = (ak * rk[2] + al * rl[2]) / akl; + Rq_cache[sq_id+0] = xkl; + Rq_cache[sq_id+128] = ykl; + Rq_cache[sq_id+256] = zkl; + Rq_cache[sq_id+384] = akl; + } else { + Rq_cache[sq_id+384] = 1.; + } + } + for (int n = ty; n < 160; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_ij = blockIdx.x * 256 + tile * 16 + tx; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int dm_ij_pair0 = dm_pair_loc[pair_ij]; + int sq_ij = tx + tile * 16; + dm_ij_cache[sq_ij+i*256] = dm[dm_ij_pair0+i]; + } + } + for (int n = tx; n < 32; n += 16) { + int i = n / 8; + int tile = n % 8; + int task_kl = blockIdx.y * 128 + tile * 16 + ty; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int dm_kl_pair0 = dm_pair_loc[pair_kl]; + int sq_kl = ty + tile * 16; + atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*128]); + dm_kl_cache[sq_kl+i*128] = dm[dm_kl_pair0+i]; + } + } + __syncthreads(); + + for (int batch_ij = 0; batch_ij < 16; ++batch_ij) { + for (int batch_kl = 0; batch_kl < 8; ++batch_kl) { + int task_ij0 = blockIdx.x * 256 + batch_ij * 16; + int task_kl0 = blockIdx.y * 128 + batch_kl * 16; + if (task_ij0 >= npairs_ij || task_kl0 >= npairs_kl) { + continue; + } + int pair_ij0 = pair_ij_mapping[task_ij0]; + int pair_kl0 = pair_kl_mapping[task_kl0]; + if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) { + continue; + } + + int sq_ij = tx + batch_ij * 16; + int sq_kl = ty + batch_kl * 16; + int task_ij = task_ij0 + tx; + int task_kl = task_kl0 + ty; + double fac_sym = PI_FAC; + if (task_ij >= npairs_ij) { + task_ij = task_ij0; + fac_sym = 0.; + } + if (task_kl >= npairs_kl) { + task_kl = task_kl0; + fac_sym = 0.; + } + int pair_ij = pair_ij_mapping[task_ij]; + int pair_kl = pair_kl_mapping[task_kl]; + + int ish = pair_ij / nbas; + int jsh = pair_ij % nbas; + int ksh = pair_kl / nbas; + int lsh = pair_kl % nbas; + if (ish == jsh) fac_sym *= .5; + if (ksh == lsh) fac_sym *= .5; + if (pair_ij_mapping == pair_kl_mapping) { + if (task_ij == task_kl) fac_sym *= .5; + if (task_ij < task_kl) fac_sym = 0.; + } + double xij = Rp_cache[sq_ij+0]; + double yij = Rp_cache[sq_ij+256]; + double zij = Rp_cache[sq_ij+512]; + double aij = Rp_cache[sq_ij+768]; + double xkl = Rq_cache[sq_kl+0]; + double 
ykl = Rq_cache[sq_kl+128]; + double zkl = Rq_cache[sq_kl+256]; + double akl = Rq_cache[sq_kl+384]; + double fac = fac_sym / (aij*akl*sqrt(aij+akl)); + double xpq = xij - xkl; + double ypq = yij - ykl; + double zpq = zij - zkl; + double rr = xpq*xpq + ypq*ypq + zpq*zpq; + double theta = aij * akl / (aij + akl); + double theta_rr = theta * rr; + eval_gamma_inc_fn(gamma_inc, theta_rr, 3); + double a2 = -2. * theta; + gamma_inc[sq_id] *= fac; + for (int i = 1; i <= 3; i++) { + fac *= a2; + gamma_inc[sq_id+i*256] *= fac; + } + vj_kl = 0.; + vj_kl += gamma_inc[sq_id+0*256] * dm_ij_cache[sq_ij+0]; + double R_0_0_0_1 = zpq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_0_1 * dm_ij_cache[sq_ij+256]; + double R_1_0_0_1 = zpq * gamma_inc[sq_id+2*256]; + double R_0_0_0_2 = zpq * R_1_0_0_1 + 1 * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_0_2 * dm_ij_cache[sq_ij+512]; + double R_0_0_1_0 = ypq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_1_0 * dm_ij_cache[sq_ij+768]; + double R_0_0_1_1 = ypq * R_1_0_0_1; + vj_kl += R_0_0_1_1 * dm_ij_cache[sq_ij+1024]; + double R_1_0_1_0 = ypq * gamma_inc[sq_id+2*256]; + double R_0_0_2_0 = ypq * R_1_0_1_0 + 1 * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_2_0 * dm_ij_cache[sq_ij+1280]; + double R_0_1_0_0 = xpq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_1_0_0 * dm_ij_cache[sq_ij+1536]; + double R_0_1_0_1 = xpq * R_1_0_0_1; + vj_kl += R_0_1_0_1 * dm_ij_cache[sq_ij+1792]; + double R_0_1_1_0 = xpq * R_1_0_1_0; + vj_kl += R_0_1_1_0 * dm_ij_cache[sq_ij+2048]; + double R_1_1_0_0 = xpq * gamma_inc[sq_id+2*256]; + double R_0_2_0_0 = xpq * R_1_1_0_0 + 1 * gamma_inc[sq_id+1*256]; + vj_kl += R_0_2_0_0 * dm_ij_cache[sq_ij+2304]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+0] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl -= R_0_0_0_1 * dm_ij_cache[sq_ij+0]; + vj_kl -= R_0_0_0_2 * dm_ij_cache[sq_ij+256]; + double R_2_0_0_1 = zpq * gamma_inc[sq_id+3*256]; + double R_1_0_0_2 = zpq * R_2_0_0_1 + 1 * gamma_inc[sq_id+2*256]; + double R_0_0_0_3 = zpq * R_1_0_0_2 + 2 * R_1_0_0_1; + vj_kl -= R_0_0_0_3 * dm_ij_cache[sq_ij+512]; + vj_kl -= R_0_0_1_1 * dm_ij_cache[sq_ij+768]; + double R_0_0_1_2 = ypq * R_1_0_0_2; + vj_kl -= R_0_0_1_2 * dm_ij_cache[sq_ij+1024]; + double R_1_0_1_1 = ypq * R_2_0_0_1; + double R_0_0_2_1 = ypq * R_1_0_1_1 + 1 * R_1_0_0_1; + vj_kl -= R_0_0_2_1 * dm_ij_cache[sq_ij+1280]; + vj_kl -= R_0_1_0_1 * dm_ij_cache[sq_ij+1536]; + double R_0_1_0_2 = xpq * R_1_0_0_2; + vj_kl -= R_0_1_0_2 * dm_ij_cache[sq_ij+1792]; + double R_0_1_1_1 = xpq * R_1_0_1_1; + vj_kl -= R_0_1_1_1 * dm_ij_cache[sq_ij+2048]; + double R_1_1_0_1 = xpq * R_2_0_0_1; + double R_0_2_0_1 = xpq * R_1_1_0_1 + 1 * R_1_0_0_1; + vj_kl -= R_0_2_0_1 * dm_ij_cache[sq_ij+2304]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+128] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl -= R_0_0_1_0 * dm_ij_cache[sq_ij+0]; + vj_kl -= R_0_0_1_1 * dm_ij_cache[sq_ij+256]; + vj_kl -= R_0_0_1_2 * dm_ij_cache[sq_ij+512]; + vj_kl -= R_0_0_2_0 * dm_ij_cache[sq_ij+768]; + vj_kl -= R_0_0_2_1 * dm_ij_cache[sq_ij+1024]; + double R_2_0_1_0 = ypq * gamma_inc[sq_id+3*256]; + double R_1_0_2_0 = ypq * R_2_0_1_0 + 1 * 
gamma_inc[sq_id+2*256]; + double R_0_0_3_0 = ypq * R_1_0_2_0 + 2 * R_1_0_1_0; + vj_kl -= R_0_0_3_0 * dm_ij_cache[sq_ij+1280]; + vj_kl -= R_0_1_1_0 * dm_ij_cache[sq_ij+1536]; + vj_kl -= R_0_1_1_1 * dm_ij_cache[sq_ij+1792]; + double R_0_1_2_0 = xpq * R_1_0_2_0; + vj_kl -= R_0_1_2_0 * dm_ij_cache[sq_ij+2048]; + double R_1_1_1_0 = xpq * R_2_0_1_0; + double R_0_2_1_0 = xpq * R_1_1_1_0 + 1 * R_1_0_1_0; + vj_kl -= R_0_2_1_0 * dm_ij_cache[sq_ij+2304]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+256] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl -= R_0_1_0_0 * dm_ij_cache[sq_ij+0]; + vj_kl -= R_0_1_0_1 * dm_ij_cache[sq_ij+256]; + vj_kl -= R_0_1_0_2 * dm_ij_cache[sq_ij+512]; + vj_kl -= R_0_1_1_0 * dm_ij_cache[sq_ij+768]; + vj_kl -= R_0_1_1_1 * dm_ij_cache[sq_ij+1024]; + vj_kl -= R_0_1_2_0 * dm_ij_cache[sq_ij+1280]; + vj_kl -= R_0_2_0_0 * dm_ij_cache[sq_ij+1536]; + vj_kl -= R_0_2_0_1 * dm_ij_cache[sq_ij+1792]; + vj_kl -= R_0_2_1_0 * dm_ij_cache[sq_ij+2048]; + double R_2_1_0_0 = xpq * gamma_inc[sq_id+3*256]; + double R_1_2_0_0 = xpq * R_2_1_0_0 + 1 * gamma_inc[sq_id+2*256]; + double R_0_3_0_0 = xpq * R_1_2_0_0 + 2 * R_1_1_0_0; + vj_kl -= R_0_3_0_0 * dm_ij_cache[sq_ij+2304]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+384] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += gamma_inc[sq_id+0*256] * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_0_1 * dm_kl_cache[sq_kl+128]; + vj_ij -= R_0_0_1_0 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_1_0_0 * dm_kl_cache[sq_kl+384]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+0] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_0_1 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_0_2 * dm_kl_cache[sq_kl+128]; + vj_ij -= R_0_0_1_1 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_1_0_1 * dm_kl_cache[sq_kl+384]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+256] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_0_2 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_0_3 * dm_kl_cache[sq_kl+128]; + vj_ij -= R_0_0_1_2 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_1_0_2 * dm_kl_cache[sq_kl+384]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+512] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_1_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_1_1 * dm_kl_cache[sq_kl+128]; + vj_ij -= R_0_0_2_0 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_1_1_0 * dm_kl_cache[sq_kl+384]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + 
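+ // barrier first so every partial sum written in the previous pass is visible before folding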
__syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+768] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_1_1 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_1_2 * dm_kl_cache[sq_kl+128]; + vj_ij -= R_0_0_2_1 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_1_1_1 * dm_kl_cache[sq_kl+384]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1024] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_2_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_2_1 * dm_kl_cache[sq_kl+128]; + vj_ij -= R_0_0_3_0 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_1_2_0 * dm_kl_cache[sq_kl+384]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1280] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_0_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_1_0_1 * dm_kl_cache[sq_kl+128]; + vj_ij -= R_0_1_1_0 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_2_0_0 * dm_kl_cache[sq_kl+384]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1536] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_0_1 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_1_0_2 * dm_kl_cache[sq_kl+128]; + vj_ij -= R_0_1_1_1 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_2_0_1 * dm_kl_cache[sq_kl+384]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1792] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_1_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_1_1_1 * dm_kl_cache[sq_kl+128]; + vj_ij -= R_0_1_2_0 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_2_1_0 * dm_kl_cache[sq_kl+384]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+2048] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_2_0_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_2_0_1 * dm_kl_cache[sq_kl+128]; + vj_ij -= R_0_2_1_0 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_3_0_0 * dm_kl_cache[sq_kl+384]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+2304] += vj_cache[sq_id]; + } + __syncthreads(); + } } + for (int n = ty; n < 160; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_ij = blockIdx.x * 256 + tile * 16 + tx; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int dm_ij_pair0 = dm_pair_loc[pair_ij]; + int sq_ij = tx + tile * 16; + 
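+ // atomicAdd is required here: blocks assigned different kl tiles accumulate into the same vj entries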
atomicAdd(vj+dm_ij_pair0+i, vj_ij_cache[sq_ij+i*256]); + } + } + for (int n = tx; n < 32; n += 16) { + int i = n / 8; + int tile = n % 8; + int task_kl = blockIdx.y * 128 + tile * 16 + ty; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int dm_kl_pair0 = dm_pair_loc[pair_kl]; + int sq_kl = ty + tile * 16; + atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*128]); + } + } +} + +// TILEX=16, TILEY=4, cache_dm=True +__global__ +void md_j_2_2(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds) +{ + int *pair_ij_mapping = bounds.tile_ij_mapping; + int *pair_kl_mapping = bounds.tile_kl_mapping; + int task_ij0 = blockIdx.x * 256; + int task_kl0 = blockIdx.y * 64; + int pair_ij0 = pair_ij_mapping[task_ij0]; + int pair_kl0 = pair_kl_mapping[task_kl0]; + float *q_cond = bounds.q_cond; + if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) { + return; + } + + int tx = threadIdx.x; + int ty = threadIdx.y; + int sq_id = tx + 16 * ty; + int *bas = envs.bas; + int *dm_pair_loc = envs.ao_loc; + int nbas = envs.nbas; + double *env = envs.env; + double *dm = jk.dm; + double *vj = jk.vj; + double vj_ij, vj_kl; + + int npairs_ij = bounds.npairs_ij; + int npairs_kl = bounds.npairs_kl; + extern __shared__ double gamma_inc[]; + double *Rp_cache = gamma_inc + 1280; + double *Rq_cache = Rp_cache + 1024; + double *vj_ij_cache = Rq_cache + 256; + double *vj_kl_cache = vj_ij_cache + 2560; + double *vj_cache = vj_kl_cache + 640; + double *dm_ij_cache = vj_cache + 256; + double *dm_kl_cache = dm_ij_cache + 2560; + // zero out all cache; + for (int n = sq_id; n < 7936; n += 256) { + Rp_cache[n] = 0.; + } + __syncthreads(); + + if (sq_id < 256) { + int task_ij = blockIdx.x * 256 + sq_id; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int ish = pair_ij / nbas; + int jsh = pair_ij % nbas; + double ai = env[bas[ish*BAS_SLOTS+PTR_EXP]]; + double aj = env[bas[jsh*BAS_SLOTS+PTR_EXP]]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double aij = ai + aj; + double xij = (ai * ri[0] + aj * rj[0]) / aij; + double yij = (ai * ri[1] + aj * rj[1]) / aij; + double zij = (ai * ri[2] + aj * rj[2]) / aij; + Rp_cache[sq_id+0] = xij; + Rp_cache[sq_id+256] = yij; + Rp_cache[sq_id+512] = zij; + Rp_cache[sq_id+768] = aij; + } else { + Rp_cache[sq_id+768] = 1.; + } + } + if (sq_id < 64) { + int task_kl = blockIdx.y * 64 + sq_id; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int ksh = pair_kl / nbas; + int lsh = pair_kl % nbas; + double ak = env[bas[ksh*BAS_SLOTS+PTR_EXP]]; + double al = env[bas[lsh*BAS_SLOTS+PTR_EXP]]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD]; + double akl = ak + al; + double xkl = (ak * rk[0] + al * rl[0]) / akl; + double ykl = (ak * rk[1] + al * rl[1]) / akl; + double zkl = (ak * rk[2] + al * rl[2]) / akl; + Rq_cache[sq_id+0] = xkl; + Rq_cache[sq_id+64] = ykl; + Rq_cache[sq_id+128] = zkl; + Rq_cache[sq_id+192] = akl; + } else { + Rq_cache[sq_id+192] = 1.; + } + } + for (int n = ty; n < 160; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_ij = blockIdx.x * 256 + tile * 16 + tx; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int dm_ij_pair0 = dm_pair_loc[pair_ij]; + int sq_ij = tx + tile * 16; + dm_ij_cache[sq_ij+i*256] = dm[dm_ij_pair0+i]; + } + } + for (int n = tx; n < 40; n += 16) { + int i = n / 4; + int tile = n % 4; + int task_kl = blockIdx.y * 64 + tile * 16 + ty; + if 
(task_kl < npairs_kl) {
+            int pair_kl = pair_kl_mapping[task_kl];
+            int dm_kl_pair0 = dm_pair_loc[pair_kl];
+            int sq_kl = ty + tile * 16;
+            // stage the kl-pair density into shared memory
+            dm_kl_cache[sq_kl+i*64] = dm[dm_kl_pair0+i];
+        }
+    }
+    __syncthreads();
+
+    for (int batch_ij = 0; batch_ij < 16; ++batch_ij) {
+    for (int batch_kl = 0; batch_kl < 4; ++batch_kl) {
+        int task_ij0 = blockIdx.x * 256 + batch_ij * 16;
+        int task_kl0 = blockIdx.y * 64 + batch_kl * 16;
+        if (task_ij0 >= npairs_ij || task_kl0 >= npairs_kl) {
+            continue;
+        }
+        int pair_ij0 = pair_ij_mapping[task_ij0];
+        int pair_kl0 = pair_kl_mapping[task_kl0];
+        if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) {
+            continue;
+        }
+
+        int sq_ij = tx + batch_ij * 16;
+        int sq_kl = ty + batch_kl * 16;
+        int task_ij = task_ij0 + tx;
+        int task_kl = task_kl0 + ty;
+        double fac_sym = PI_FAC;
+        if (task_ij >= npairs_ij) {
+            task_ij = task_ij0;
+            fac_sym = 0.;
+        }
+        if (task_kl >= npairs_kl) {
+            task_kl = task_kl0;
+            fac_sym = 0.;
+        }
+        int pair_ij = pair_ij_mapping[task_ij];
+        int pair_kl = pair_kl_mapping[task_kl];
+
+        int ish = pair_ij / nbas;
+        int jsh = pair_ij % nbas;
+        int ksh = pair_kl / nbas;
+        int lsh = pair_kl % nbas;
+        if (ish == jsh) fac_sym *= .5;
+        if (ksh == lsh) fac_sym *= .5;
+        if (pair_ij_mapping == pair_kl_mapping) {
+            if (task_ij == task_kl) fac_sym *= .5;
+            if (task_ij < task_kl) fac_sym = 0.;
+        }
+        double xij = Rp_cache[sq_ij+0];
+        double yij = Rp_cache[sq_ij+256];
+        double zij = Rp_cache[sq_ij+512];
+        double aij = Rp_cache[sq_ij+768];
+        double xkl = Rq_cache[sq_kl+0];
+        double ykl = Rq_cache[sq_kl+64];
+        double zkl = Rq_cache[sq_kl+128];
+        double akl = Rq_cache[sq_kl+192];
+        double fac = fac_sym / (aij*akl*sqrt(aij+akl));
+        double xpq = xij - xkl;
+        double ypq = yij - ykl;
+        double zpq = zij - zkl;
+        double rr = xpq*xpq + ypq*ypq + zpq*zpq;
+        double theta = aij * akl / (aij + akl);
+        double theta_rr = theta * rr;
+        eval_gamma_inc_fn(gamma_inc, theta_rr, 4);
+        double a2 = -2.
* theta; + gamma_inc[sq_id] *= fac; + for (int i = 1; i <= 4; i++) { + fac *= a2; + gamma_inc[sq_id+i*256] *= fac; + } + vj_kl = 0.; + vj_kl += gamma_inc[sq_id+0*256] * dm_ij_cache[sq_ij+0]; + double R_0_0_0_1 = zpq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_0_1 * dm_ij_cache[sq_ij+256]; + double R_1_0_0_1 = zpq * gamma_inc[sq_id+2*256]; + double R_0_0_0_2 = zpq * R_1_0_0_1 + 1 * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_0_2 * dm_ij_cache[sq_ij+512]; + double R_0_0_1_0 = ypq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_1_0 * dm_ij_cache[sq_ij+768]; + double R_0_0_1_1 = ypq * R_1_0_0_1; + vj_kl += R_0_0_1_1 * dm_ij_cache[sq_ij+1024]; + double R_1_0_1_0 = ypq * gamma_inc[sq_id+2*256]; + double R_0_0_2_0 = ypq * R_1_0_1_0 + 1 * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_2_0 * dm_ij_cache[sq_ij+1280]; + double R_0_1_0_0 = xpq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_1_0_0 * dm_ij_cache[sq_ij+1536]; + double R_0_1_0_1 = xpq * R_1_0_0_1; + vj_kl += R_0_1_0_1 * dm_ij_cache[sq_ij+1792]; + double R_0_1_1_0 = xpq * R_1_0_1_0; + vj_kl += R_0_1_1_0 * dm_ij_cache[sq_ij+2048]; + double R_1_1_0_0 = xpq * gamma_inc[sq_id+2*256]; + double R_0_2_0_0 = xpq * R_1_1_0_0 + 1 * gamma_inc[sq_id+1*256]; + vj_kl += R_0_2_0_0 * dm_ij_cache[sq_ij+2304]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+0] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl -= R_0_0_0_1 * dm_ij_cache[sq_ij+0]; + vj_kl -= R_0_0_0_2 * dm_ij_cache[sq_ij+256]; + double R_2_0_0_1 = zpq * gamma_inc[sq_id+3*256]; + double R_1_0_0_2 = zpq * R_2_0_0_1 + 1 * gamma_inc[sq_id+2*256]; + double R_0_0_0_3 = zpq * R_1_0_0_2 + 2 * R_1_0_0_1; + vj_kl -= R_0_0_0_3 * dm_ij_cache[sq_ij+512]; + vj_kl -= R_0_0_1_1 * dm_ij_cache[sq_ij+768]; + double R_0_0_1_2 = ypq * R_1_0_0_2; + vj_kl -= R_0_0_1_2 * dm_ij_cache[sq_ij+1024]; + double R_1_0_1_1 = ypq * R_2_0_0_1; + double R_0_0_2_1 = ypq * R_1_0_1_1 + 1 * R_1_0_0_1; + vj_kl -= R_0_0_2_1 * dm_ij_cache[sq_ij+1280]; + vj_kl -= R_0_1_0_1 * dm_ij_cache[sq_ij+1536]; + double R_0_1_0_2 = xpq * R_1_0_0_2; + vj_kl -= R_0_1_0_2 * dm_ij_cache[sq_ij+1792]; + double R_0_1_1_1 = xpq * R_1_0_1_1; + vj_kl -= R_0_1_1_1 * dm_ij_cache[sq_ij+2048]; + double R_1_1_0_1 = xpq * R_2_0_0_1; + double R_0_2_0_1 = xpq * R_1_1_0_1 + 1 * R_1_0_0_1; + vj_kl -= R_0_2_0_1 * dm_ij_cache[sq_ij+2304]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+64] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl += R_0_0_0_2 * dm_ij_cache[sq_ij+0]; + vj_kl += R_0_0_0_3 * dm_ij_cache[sq_ij+256]; + double R_3_0_0_1 = zpq * gamma_inc[sq_id+4*256]; + double R_2_0_0_2 = zpq * R_3_0_0_1 + 1 * gamma_inc[sq_id+3*256]; + double R_1_0_0_3 = zpq * R_2_0_0_2 + 2 * R_2_0_0_1; + double R_0_0_0_4 = zpq * R_1_0_0_3 + 3 * R_1_0_0_2; + vj_kl += R_0_0_0_4 * dm_ij_cache[sq_ij+512]; + vj_kl += R_0_0_1_2 * dm_ij_cache[sq_ij+768]; + double R_0_0_1_3 = ypq * R_1_0_0_3; + vj_kl += R_0_0_1_3 * dm_ij_cache[sq_ij+1024]; + double R_1_0_1_2 = ypq * R_2_0_0_2; + double R_0_0_2_2 = ypq * R_1_0_1_2 + 1 * R_1_0_0_2; + vj_kl += R_0_0_2_2 * dm_ij_cache[sq_ij+1280]; + vj_kl += R_0_1_0_2 * dm_ij_cache[sq_ij+1536]; + double R_0_1_0_3 = xpq * R_1_0_0_3; + vj_kl += 
R_0_1_0_3 * dm_ij_cache[sq_ij+1792]; + double R_0_1_1_2 = xpq * R_1_0_1_2; + vj_kl += R_0_1_1_2 * dm_ij_cache[sq_ij+2048]; + double R_1_1_0_2 = xpq * R_2_0_0_2; + double R_0_2_0_2 = xpq * R_1_1_0_2 + 1 * R_1_0_0_2; + vj_kl += R_0_2_0_2 * dm_ij_cache[sq_ij+2304]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+128] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl -= R_0_0_1_0 * dm_ij_cache[sq_ij+0]; + vj_kl -= R_0_0_1_1 * dm_ij_cache[sq_ij+256]; + vj_kl -= R_0_0_1_2 * dm_ij_cache[sq_ij+512]; + vj_kl -= R_0_0_2_0 * dm_ij_cache[sq_ij+768]; + vj_kl -= R_0_0_2_1 * dm_ij_cache[sq_ij+1024]; + double R_2_0_1_0 = ypq * gamma_inc[sq_id+3*256]; + double R_1_0_2_0 = ypq * R_2_0_1_0 + 1 * gamma_inc[sq_id+2*256]; + double R_0_0_3_0 = ypq * R_1_0_2_0 + 2 * R_1_0_1_0; + vj_kl -= R_0_0_3_0 * dm_ij_cache[sq_ij+1280]; + vj_kl -= R_0_1_1_0 * dm_ij_cache[sq_ij+1536]; + vj_kl -= R_0_1_1_1 * dm_ij_cache[sq_ij+1792]; + double R_0_1_2_0 = xpq * R_1_0_2_0; + vj_kl -= R_0_1_2_0 * dm_ij_cache[sq_ij+2048]; + double R_1_1_1_0 = xpq * R_2_0_1_0; + double R_0_2_1_0 = xpq * R_1_1_1_0 + 1 * R_1_0_1_0; + vj_kl -= R_0_2_1_0 * dm_ij_cache[sq_ij+2304]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+192] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl += R_0_0_1_1 * dm_ij_cache[sq_ij+0]; + vj_kl += R_0_0_1_2 * dm_ij_cache[sq_ij+256]; + vj_kl += R_0_0_1_3 * dm_ij_cache[sq_ij+512]; + vj_kl += R_0_0_2_1 * dm_ij_cache[sq_ij+768]; + vj_kl += R_0_0_2_2 * dm_ij_cache[sq_ij+1024]; + double R_2_0_1_1 = ypq * R_3_0_0_1; + double R_1_0_2_1 = ypq * R_2_0_1_1 + 1 * R_2_0_0_1; + double R_0_0_3_1 = ypq * R_1_0_2_1 + 2 * R_1_0_1_1; + vj_kl += R_0_0_3_1 * dm_ij_cache[sq_ij+1280]; + vj_kl += R_0_1_1_1 * dm_ij_cache[sq_ij+1536]; + vj_kl += R_0_1_1_2 * dm_ij_cache[sq_ij+1792]; + double R_0_1_2_1 = xpq * R_1_0_2_1; + vj_kl += R_0_1_2_1 * dm_ij_cache[sq_ij+2048]; + double R_1_1_1_1 = xpq * R_2_0_1_1; + double R_0_2_1_1 = xpq * R_1_1_1_1 + 1 * R_1_0_1_1; + vj_kl += R_0_2_1_1 * dm_ij_cache[sq_ij+2304]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+256] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl += R_0_0_2_0 * dm_ij_cache[sq_ij+0]; + vj_kl += R_0_0_2_1 * dm_ij_cache[sq_ij+256]; + vj_kl += R_0_0_2_2 * dm_ij_cache[sq_ij+512]; + vj_kl += R_0_0_3_0 * dm_ij_cache[sq_ij+768]; + vj_kl += R_0_0_3_1 * dm_ij_cache[sq_ij+1024]; + double R_3_0_1_0 = ypq * gamma_inc[sq_id+4*256]; + double R_2_0_2_0 = ypq * R_3_0_1_0 + 1 * gamma_inc[sq_id+3*256]; + double R_1_0_3_0 = ypq * R_2_0_2_0 + 2 * R_2_0_1_0; + double R_0_0_4_0 = ypq * R_1_0_3_0 + 3 * R_1_0_2_0; + vj_kl += R_0_0_4_0 * dm_ij_cache[sq_ij+1280]; + vj_kl += R_0_1_2_0 * dm_ij_cache[sq_ij+1536]; + vj_kl += R_0_1_2_1 * dm_ij_cache[sq_ij+1792]; + double R_0_1_3_0 = xpq * R_1_0_3_0; + vj_kl += R_0_1_3_0 * dm_ij_cache[sq_ij+2048]; + double R_1_1_2_0 = xpq * R_2_0_2_0; + double R_0_2_2_0 = xpq * R_1_1_2_0 + 1 * R_1_0_2_0; + vj_kl += R_0_2_2_0 * 
dm_ij_cache[sq_ij+2304]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+320] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl -= R_0_1_0_0 * dm_ij_cache[sq_ij+0]; + vj_kl -= R_0_1_0_1 * dm_ij_cache[sq_ij+256]; + vj_kl -= R_0_1_0_2 * dm_ij_cache[sq_ij+512]; + vj_kl -= R_0_1_1_0 * dm_ij_cache[sq_ij+768]; + vj_kl -= R_0_1_1_1 * dm_ij_cache[sq_ij+1024]; + vj_kl -= R_0_1_2_0 * dm_ij_cache[sq_ij+1280]; + vj_kl -= R_0_2_0_0 * dm_ij_cache[sq_ij+1536]; + vj_kl -= R_0_2_0_1 * dm_ij_cache[sq_ij+1792]; + vj_kl -= R_0_2_1_0 * dm_ij_cache[sq_ij+2048]; + double R_2_1_0_0 = xpq * gamma_inc[sq_id+3*256]; + double R_1_2_0_0 = xpq * R_2_1_0_0 + 1 * gamma_inc[sq_id+2*256]; + double R_0_3_0_0 = xpq * R_1_2_0_0 + 2 * R_1_1_0_0; + vj_kl -= R_0_3_0_0 * dm_ij_cache[sq_ij+2304]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+384] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl += R_0_1_0_1 * dm_ij_cache[sq_ij+0]; + vj_kl += R_0_1_0_2 * dm_ij_cache[sq_ij+256]; + vj_kl += R_0_1_0_3 * dm_ij_cache[sq_ij+512]; + vj_kl += R_0_1_1_1 * dm_ij_cache[sq_ij+768]; + vj_kl += R_0_1_1_2 * dm_ij_cache[sq_ij+1024]; + vj_kl += R_0_1_2_1 * dm_ij_cache[sq_ij+1280]; + vj_kl += R_0_2_0_1 * dm_ij_cache[sq_ij+1536]; + vj_kl += R_0_2_0_2 * dm_ij_cache[sq_ij+1792]; + vj_kl += R_0_2_1_1 * dm_ij_cache[sq_ij+2048]; + double R_2_1_0_1 = xpq * R_3_0_0_1; + double R_1_2_0_1 = xpq * R_2_1_0_1 + 1 * R_2_0_0_1; + double R_0_3_0_1 = xpq * R_1_2_0_1 + 2 * R_1_1_0_1; + vj_kl += R_0_3_0_1 * dm_ij_cache[sq_ij+2304]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+448] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl += R_0_1_1_0 * dm_ij_cache[sq_ij+0]; + vj_kl += R_0_1_1_1 * dm_ij_cache[sq_ij+256]; + vj_kl += R_0_1_1_2 * dm_ij_cache[sq_ij+512]; + vj_kl += R_0_1_2_0 * dm_ij_cache[sq_ij+768]; + vj_kl += R_0_1_2_1 * dm_ij_cache[sq_ij+1024]; + vj_kl += R_0_1_3_0 * dm_ij_cache[sq_ij+1280]; + vj_kl += R_0_2_1_0 * dm_ij_cache[sq_ij+1536]; + vj_kl += R_0_2_1_1 * dm_ij_cache[sq_ij+1792]; + vj_kl += R_0_2_2_0 * dm_ij_cache[sq_ij+2048]; + double R_2_1_1_0 = xpq * R_3_0_1_0; + double R_1_2_1_0 = xpq * R_2_1_1_0 + 1 * R_2_0_1_0; + double R_0_3_1_0 = xpq * R_1_2_1_0 + 2 * R_1_1_1_0; + vj_kl += R_0_3_1_0 * dm_ij_cache[sq_ij+2304]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+512] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl += R_0_2_0_0 * dm_ij_cache[sq_ij+0]; + vj_kl += R_0_2_0_1 * dm_ij_cache[sq_ij+256]; + vj_kl += R_0_2_0_2 * dm_ij_cache[sq_ij+512]; + vj_kl += R_0_2_1_0 * dm_ij_cache[sq_ij+768]; + vj_kl += R_0_2_1_1 * dm_ij_cache[sq_ij+1024]; + vj_kl += R_0_2_2_0 * dm_ij_cache[sq_ij+1280]; + vj_kl += R_0_3_0_0 * dm_ij_cache[sq_ij+1536]; + vj_kl += R_0_3_0_1 * 
dm_ij_cache[sq_ij+1792]; + vj_kl += R_0_3_1_0 * dm_ij_cache[sq_ij+2048]; + double R_3_1_0_0 = xpq * gamma_inc[sq_id+4*256]; + double R_2_2_0_0 = xpq * R_3_1_0_0 + 1 * gamma_inc[sq_id+3*256]; + double R_1_3_0_0 = xpq * R_2_2_0_0 + 2 * R_2_1_0_0; + double R_0_4_0_0 = xpq * R_1_3_0_0 + 3 * R_1_2_0_0; + vj_kl += R_0_4_0_0 * dm_ij_cache[sq_ij+2304]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+576] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += gamma_inc[sq_id+0*256] * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_0_1 * dm_kl_cache[sq_kl+64]; + vj_ij += R_0_0_0_2 * dm_kl_cache[sq_kl+128]; + vj_ij -= R_0_0_1_0 * dm_kl_cache[sq_kl+192]; + vj_ij += R_0_0_1_1 * dm_kl_cache[sq_kl+256]; + vj_ij += R_0_0_2_0 * dm_kl_cache[sq_kl+320]; + vj_ij -= R_0_1_0_0 * dm_kl_cache[sq_kl+384]; + vj_ij += R_0_1_0_1 * dm_kl_cache[sq_kl+448]; + vj_ij += R_0_1_1_0 * dm_kl_cache[sq_kl+512]; + vj_ij += R_0_2_0_0 * dm_kl_cache[sq_kl+576]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+0] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_0_1 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_0_2 * dm_kl_cache[sq_kl+64]; + vj_ij += R_0_0_0_3 * dm_kl_cache[sq_kl+128]; + vj_ij -= R_0_0_1_1 * dm_kl_cache[sq_kl+192]; + vj_ij += R_0_0_1_2 * dm_kl_cache[sq_kl+256]; + vj_ij += R_0_0_2_1 * dm_kl_cache[sq_kl+320]; + vj_ij -= R_0_1_0_1 * dm_kl_cache[sq_kl+384]; + vj_ij += R_0_1_0_2 * dm_kl_cache[sq_kl+448]; + vj_ij += R_0_1_1_1 * dm_kl_cache[sq_kl+512]; + vj_ij += R_0_2_0_1 * dm_kl_cache[sq_kl+576]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+256] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_0_2 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_0_3 * dm_kl_cache[sq_kl+64]; + vj_ij += R_0_0_0_4 * dm_kl_cache[sq_kl+128]; + vj_ij -= R_0_0_1_2 * dm_kl_cache[sq_kl+192]; + vj_ij += R_0_0_1_3 * dm_kl_cache[sq_kl+256]; + vj_ij += R_0_0_2_2 * dm_kl_cache[sq_kl+320]; + vj_ij -= R_0_1_0_2 * dm_kl_cache[sq_kl+384]; + vj_ij += R_0_1_0_3 * dm_kl_cache[sq_kl+448]; + vj_ij += R_0_1_1_2 * dm_kl_cache[sq_kl+512]; + vj_ij += R_0_2_0_2 * dm_kl_cache[sq_kl+576]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+512] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_1_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_1_1 * dm_kl_cache[sq_kl+64]; + vj_ij += R_0_0_1_2 * dm_kl_cache[sq_kl+128]; + vj_ij -= R_0_0_2_0 * dm_kl_cache[sq_kl+192]; + vj_ij += R_0_0_2_1 * dm_kl_cache[sq_kl+256]; + vj_ij += R_0_0_3_0 * dm_kl_cache[sq_kl+320]; + vj_ij -= R_0_1_1_0 * dm_kl_cache[sq_kl+384]; + vj_ij += R_0_1_1_1 * dm_kl_cache[sq_kl+448]; + vj_ij += R_0_1_2_0 * dm_kl_cache[sq_kl+512]; + vj_ij += R_0_2_1_0 * dm_kl_cache[sq_kl+576]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for 
(int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+768] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_1_1 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_1_2 * dm_kl_cache[sq_kl+64]; + vj_ij += R_0_0_1_3 * dm_kl_cache[sq_kl+128]; + vj_ij -= R_0_0_2_1 * dm_kl_cache[sq_kl+192]; + vj_ij += R_0_0_2_2 * dm_kl_cache[sq_kl+256]; + vj_ij += R_0_0_3_1 * dm_kl_cache[sq_kl+320]; + vj_ij -= R_0_1_1_1 * dm_kl_cache[sq_kl+384]; + vj_ij += R_0_1_1_2 * dm_kl_cache[sq_kl+448]; + vj_ij += R_0_1_2_1 * dm_kl_cache[sq_kl+512]; + vj_ij += R_0_2_1_1 * dm_kl_cache[sq_kl+576]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1024] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_2_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_2_1 * dm_kl_cache[sq_kl+64]; + vj_ij += R_0_0_2_2 * dm_kl_cache[sq_kl+128]; + vj_ij -= R_0_0_3_0 * dm_kl_cache[sq_kl+192]; + vj_ij += R_0_0_3_1 * dm_kl_cache[sq_kl+256]; + vj_ij += R_0_0_4_0 * dm_kl_cache[sq_kl+320]; + vj_ij -= R_0_1_2_0 * dm_kl_cache[sq_kl+384]; + vj_ij += R_0_1_2_1 * dm_kl_cache[sq_kl+448]; + vj_ij += R_0_1_3_0 * dm_kl_cache[sq_kl+512]; + vj_ij += R_0_2_2_0 * dm_kl_cache[sq_kl+576]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1280] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_0_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_1_0_1 * dm_kl_cache[sq_kl+64]; + vj_ij += R_0_1_0_2 * dm_kl_cache[sq_kl+128]; + vj_ij -= R_0_1_1_0 * dm_kl_cache[sq_kl+192]; + vj_ij += R_0_1_1_1 * dm_kl_cache[sq_kl+256]; + vj_ij += R_0_1_2_0 * dm_kl_cache[sq_kl+320]; + vj_ij -= R_0_2_0_0 * dm_kl_cache[sq_kl+384]; + vj_ij += R_0_2_0_1 * dm_kl_cache[sq_kl+448]; + vj_ij += R_0_2_1_0 * dm_kl_cache[sq_kl+512]; + vj_ij += R_0_3_0_0 * dm_kl_cache[sq_kl+576]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1536] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_0_1 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_1_0_2 * dm_kl_cache[sq_kl+64]; + vj_ij += R_0_1_0_3 * dm_kl_cache[sq_kl+128]; + vj_ij -= R_0_1_1_1 * dm_kl_cache[sq_kl+192]; + vj_ij += R_0_1_1_2 * dm_kl_cache[sq_kl+256]; + vj_ij += R_0_1_2_1 * dm_kl_cache[sq_kl+320]; + vj_ij -= R_0_2_0_1 * dm_kl_cache[sq_kl+384]; + vj_ij += R_0_2_0_2 * dm_kl_cache[sq_kl+448]; + vj_ij += R_0_2_1_1 * dm_kl_cache[sq_kl+512]; + vj_ij += R_0_3_0_1 * dm_kl_cache[sq_kl+576]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1792] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_1_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_1_1_1 * dm_kl_cache[sq_kl+64]; + vj_ij += R_0_1_1_2 * 
dm_kl_cache[sq_kl+128]; + vj_ij -= R_0_1_2_0 * dm_kl_cache[sq_kl+192]; + vj_ij += R_0_1_2_1 * dm_kl_cache[sq_kl+256]; + vj_ij += R_0_1_3_0 * dm_kl_cache[sq_kl+320]; + vj_ij -= R_0_2_1_0 * dm_kl_cache[sq_kl+384]; + vj_ij += R_0_2_1_1 * dm_kl_cache[sq_kl+448]; + vj_ij += R_0_2_2_0 * dm_kl_cache[sq_kl+512]; + vj_ij += R_0_3_1_0 * dm_kl_cache[sq_kl+576]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+2048] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_2_0_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_2_0_1 * dm_kl_cache[sq_kl+64]; + vj_ij += R_0_2_0_2 * dm_kl_cache[sq_kl+128]; + vj_ij -= R_0_2_1_0 * dm_kl_cache[sq_kl+192]; + vj_ij += R_0_2_1_1 * dm_kl_cache[sq_kl+256]; + vj_ij += R_0_2_2_0 * dm_kl_cache[sq_kl+320]; + vj_ij -= R_0_3_0_0 * dm_kl_cache[sq_kl+384]; + vj_ij += R_0_3_0_1 * dm_kl_cache[sq_kl+448]; + vj_ij += R_0_3_1_0 * dm_kl_cache[sq_kl+512]; + vj_ij += R_0_4_0_0 * dm_kl_cache[sq_kl+576]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+2304] += vj_cache[sq_id]; + } + __syncthreads(); + } } + for (int n = ty; n < 160; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_ij = blockIdx.x * 256 + tile * 16 + tx; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int dm_ij_pair0 = dm_pair_loc[pair_ij]; + int sq_ij = tx + tile * 16; + atomicAdd(vj+dm_ij_pair0+i, vj_ij_cache[sq_ij+i*256]); + } + } + for (int n = tx; n < 40; n += 16) { + int i = n / 4; + int tile = n % 4; + int task_kl = blockIdx.y * 64 + tile * 16 + ty; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int dm_kl_pair0 = dm_pair_loc[pair_kl]; + int sq_kl = ty + tile * 16; + atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*64]); + } + } +} + +// TILEX=8, TILEY=16, cache_dm=True +__global__ +void md_j_3_0(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds) +{ + int *pair_ij_mapping = bounds.tile_ij_mapping; + int *pair_kl_mapping = bounds.tile_kl_mapping; + int task_ij0 = blockIdx.x * 128; + int task_kl0 = blockIdx.y * 256; + int pair_ij0 = pair_ij_mapping[task_ij0]; + int pair_kl0 = pair_kl_mapping[task_kl0]; + float *q_cond = bounds.q_cond; + if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) { + return; + } + + int tx = threadIdx.x; + int ty = threadIdx.y; + int sq_id = tx + 16 * ty; + int *bas = envs.bas; + int *dm_pair_loc = envs.ao_loc; + int nbas = envs.nbas; + double *env = envs.env; + double *dm = jk.dm; + double *vj = jk.vj; + double vj_ij, vj_kl; + + int npairs_ij = bounds.npairs_ij; + int npairs_kl = bounds.npairs_kl; + extern __shared__ double gamma_inc[]; + double *Rp_cache = gamma_inc + 1024; + double *Rq_cache = Rp_cache + 512; + double *vj_ij_cache = Rq_cache + 1024; + double *vj_kl_cache = vj_ij_cache + 2560; + double *vj_cache = vj_kl_cache + 256; + double *dm_ij_cache = vj_cache + 256; + double *dm_kl_cache = dm_ij_cache + 2560; + // zero out all cache; + for (int n = sq_id; n < 7424; n += 256) { + Rp_cache[n] = 0.; + } + __syncthreads(); + + if (sq_id < 128) { + int task_ij = blockIdx.x * 128 + sq_id; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int ish = pair_ij / 
nbas;
+            int jsh = pair_ij % nbas;
+            double ai = env[bas[ish*BAS_SLOTS+PTR_EXP]];
+            double aj = env[bas[jsh*BAS_SLOTS+PTR_EXP]];
+            double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+            double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+            double aij = ai + aj;
+            double xij = (ai * ri[0] + aj * rj[0]) / aij;
+            double yij = (ai * ri[1] + aj * rj[1]) / aij;
+            double zij = (ai * ri[2] + aj * rj[2]) / aij;
+            Rp_cache[sq_id+0] = xij;
+            Rp_cache[sq_id+128] = yij;
+            Rp_cache[sq_id+256] = zij;
+            Rp_cache[sq_id+384] = aij;
+        } else {
+            Rp_cache[sq_id+384] = 1.;
+        }
+    }
+    if (sq_id < 256) {
+        int task_kl = blockIdx.y * 256 + sq_id;
+        if (task_kl < npairs_kl) {
+            int pair_kl = pair_kl_mapping[task_kl];
+            int ksh = pair_kl / nbas;
+            int lsh = pair_kl % nbas;
+            double ak = env[bas[ksh*BAS_SLOTS+PTR_EXP]];
+            double al = env[bas[lsh*BAS_SLOTS+PTR_EXP]];
+            double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+            double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD];
+            double akl = ak + al;
+            double xkl = (ak * rk[0] + al * rl[0]) / akl;
+            double ykl = (ak * rk[1] + al * rl[1]) / akl;
+            double zkl = (ak * rk[2] + al * rl[2]) / akl;
+            Rq_cache[sq_id+0] = xkl;
+            Rq_cache[sq_id+256] = ykl;
+            Rq_cache[sq_id+512] = zkl;
+            Rq_cache[sq_id+768] = akl;
+        } else {
+            Rq_cache[sq_id+768] = 1.;
+        }
+    }
+    for (int n = ty; n < 160; n += 16) {
+        int i = n / 8;
+        int tile = n % 8;
+        int task_ij = blockIdx.x * 128 + tile * 16 + tx;
+        if (task_ij < npairs_ij) {
+            int pair_ij = pair_ij_mapping[task_ij];
+            int dm_ij_pair0 = dm_pair_loc[pair_ij];
+            int sq_ij = tx + tile * 16;
+            dm_ij_cache[sq_ij+i*128] = dm[dm_ij_pair0+i];
+        }
+    }
+    for (int n = tx; n < 16; n += 16) {
+        int i = n / 16;
+        int tile = n % 16;
+        int task_kl = blockIdx.y * 256 + tile * 16 + ty;
+        if (task_kl < npairs_kl) {
+            int pair_kl = pair_kl_mapping[task_kl];
+            int dm_kl_pair0 = dm_pair_loc[pair_kl];
+            int sq_kl = ty + tile * 16;
+            // stage the kl-pair density into shared memory
+            dm_kl_cache[sq_kl+i*256] = dm[dm_kl_pair0+i];
+        }
+    }
+    __syncthreads();
+
+    for (int batch_ij = 0; batch_ij < 8; ++batch_ij) {
+    for (int batch_kl = 0; batch_kl < 16; ++batch_kl) {
+        int task_ij0 = blockIdx.x * 128 + batch_ij * 16;
+        int task_kl0 = blockIdx.y * 256 + batch_kl * 16;
+        if (task_ij0 >= npairs_ij || task_kl0 >= npairs_kl) {
+            continue;
+        }
+        int pair_ij0 = pair_ij_mapping[task_ij0];
+        int pair_kl0 = pair_kl_mapping[task_kl0];
+        if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) {
+            continue;
+        }
+
+        int sq_ij = tx + batch_ij * 16;
+        int sq_kl = ty + batch_kl * 16;
+        int task_ij = task_ij0 + tx;
+        int task_kl = task_kl0 + ty;
+        double fac_sym = PI_FAC;
+        if (task_ij >= npairs_ij) {
+            task_ij = task_ij0;
+            fac_sym = 0.;
+        }
+        if (task_kl >= npairs_kl) {
+            task_kl = task_kl0;
+            fac_sym = 0.;
+        }
+        int pair_ij = pair_ij_mapping[task_ij];
+        int pair_kl = pair_kl_mapping[task_kl];
+
+        int ish = pair_ij / nbas;
+        int jsh = pair_ij % nbas;
+        int ksh = pair_kl / nbas;
+        int lsh = pair_kl % nbas;
+        if (ish == jsh) fac_sym *= .5;
+        if (ksh == lsh) fac_sym *= .5;
+        if (pair_ij_mapping == pair_kl_mapping) {
+            if (task_ij == task_kl) fac_sym *= .5;
+            if (task_ij < task_kl) fac_sym = 0.;
+        }
+        double xij = Rp_cache[sq_ij+0];
+        double yij = Rp_cache[sq_ij+128];
+        double zij = Rp_cache[sq_ij+256];
+        double aij = Rp_cache[sq_ij+384];
+        double xkl = Rq_cache[sq_kl+0];
+        double ykl = Rq_cache[sq_kl+256];
+        double zkl = Rq_cache[sq_kl+512];
+        double akl = Rq_cache[sq_kl+768];
+        double fac = fac_sym / (aij*akl*sqrt(aij+akl));
+        double xpq = xij - xkl;
+        double ypq = yij - ykl;
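+        // theta_rr below is the Boys-function argument theta * |P-Q|^2;
+        // eval_gamma_inc_fn evaluates the incomplete-gamma (Boys) values F_m,
+        // from which the McMurchie-Davidson recurrences build the Hermite
+        // integrals R_{t,u,v} contracted with the density caches.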
double zpq = zij - zkl; + double rr = xpq*xpq + ypq*ypq + zpq*zpq; + double theta = aij * akl / (aij + akl); + double theta_rr = theta * rr; + eval_gamma_inc_fn(gamma_inc, theta_rr, 3); + double a2 = -2. * theta; + gamma_inc[sq_id] *= fac; + for (int i = 1; i <= 3; i++) { + fac *= a2; + gamma_inc[sq_id+i*256] *= fac; + } + vj_kl = 0.; + vj_kl += gamma_inc[sq_id+0*256] * dm_ij_cache[sq_ij+0]; + double R_0_0_0_1 = zpq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_0_1 * dm_ij_cache[sq_ij+128]; + double R_1_0_0_1 = zpq * gamma_inc[sq_id+2*256]; + double R_0_0_0_2 = zpq * R_1_0_0_1 + 1 * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_0_2 * dm_ij_cache[sq_ij+256]; + double R_2_0_0_1 = zpq * gamma_inc[sq_id+3*256]; + double R_1_0_0_2 = zpq * R_2_0_0_1 + 1 * gamma_inc[sq_id+2*256]; + double R_0_0_0_3 = zpq * R_1_0_0_2 + 2 * R_1_0_0_1; + vj_kl += R_0_0_0_3 * dm_ij_cache[sq_ij+384]; + double R_0_0_1_0 = ypq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_1_0 * dm_ij_cache[sq_ij+512]; + double R_0_0_1_1 = ypq * R_1_0_0_1; + vj_kl += R_0_0_1_1 * dm_ij_cache[sq_ij+640]; + double R_0_0_1_2 = ypq * R_1_0_0_2; + vj_kl += R_0_0_1_2 * dm_ij_cache[sq_ij+768]; + double R_1_0_1_0 = ypq * gamma_inc[sq_id+2*256]; + double R_0_0_2_0 = ypq * R_1_0_1_0 + 1 * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_2_0 * dm_ij_cache[sq_ij+896]; + double R_1_0_1_1 = ypq * R_2_0_0_1; + double R_0_0_2_1 = ypq * R_1_0_1_1 + 1 * R_1_0_0_1; + vj_kl += R_0_0_2_1 * dm_ij_cache[sq_ij+1024]; + double R_2_0_1_0 = ypq * gamma_inc[sq_id+3*256]; + double R_1_0_2_0 = ypq * R_2_0_1_0 + 1 * gamma_inc[sq_id+2*256]; + double R_0_0_3_0 = ypq * R_1_0_2_0 + 2 * R_1_0_1_0; + vj_kl += R_0_0_3_0 * dm_ij_cache[sq_ij+1152]; + double R_0_1_0_0 = xpq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_1_0_0 * dm_ij_cache[sq_ij+1280]; + double R_0_1_0_1 = xpq * R_1_0_0_1; + vj_kl += R_0_1_0_1 * dm_ij_cache[sq_ij+1408]; + double R_0_1_0_2 = xpq * R_1_0_0_2; + vj_kl += R_0_1_0_2 * dm_ij_cache[sq_ij+1536]; + double R_0_1_1_0 = xpq * R_1_0_1_0; + vj_kl += R_0_1_1_0 * dm_ij_cache[sq_ij+1664]; + double R_0_1_1_1 = xpq * R_1_0_1_1; + vj_kl += R_0_1_1_1 * dm_ij_cache[sq_ij+1792]; + double R_0_1_2_0 = xpq * R_1_0_2_0; + vj_kl += R_0_1_2_0 * dm_ij_cache[sq_ij+1920]; + double R_1_1_0_0 = xpq * gamma_inc[sq_id+2*256]; + double R_0_2_0_0 = xpq * R_1_1_0_0 + 1 * gamma_inc[sq_id+1*256]; + vj_kl += R_0_2_0_0 * dm_ij_cache[sq_ij+2048]; + double R_1_1_0_1 = xpq * R_2_0_0_1; + double R_0_2_0_1 = xpq * R_1_1_0_1 + 1 * R_1_0_0_1; + vj_kl += R_0_2_0_1 * dm_ij_cache[sq_ij+2176]; + double R_1_1_1_0 = xpq * R_2_0_1_0; + double R_0_2_1_0 = xpq * R_1_1_1_0 + 1 * R_1_0_1_0; + vj_kl += R_0_2_1_0 * dm_ij_cache[sq_ij+2304]; + double R_2_1_0_0 = xpq * gamma_inc[sq_id+3*256]; + double R_1_2_0_0 = xpq * R_2_1_0_0 + 1 * gamma_inc[sq_id+2*256]; + double R_0_3_0_0 = xpq * R_1_2_0_0 + 2 * R_1_1_0_0; + vj_kl += R_0_3_0_0 * dm_ij_cache[sq_ij+2432]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+0] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += gamma_inc[sq_id+0*256] * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+0] += vj_cache[sq_id]; + } + vj_ij = 
0.; + vj_ij += R_0_0_0_1 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+128] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_0_2 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+256] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_0_3 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+384] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_1_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+512] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_1_1 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+640] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_1_2 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+768] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_2_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+896] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_2_1 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1024] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_3_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1152] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_0_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if 
(ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1280] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_0_1 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1408] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_0_2 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1536] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_1_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1664] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_1_1 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1792] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_2_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1920] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_2_0_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+2048] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_2_0_1 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+2176] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_2_1_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+2304] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_3_0_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+2432] += vj_cache[sq_id]; + } + __syncthreads(); + } } + for (int n = ty; n < 160; n += 16) { + int i = n / 8; + int tile = n % 8; + int task_ij = blockIdx.x * 128 + tile * 16 + tx; + if (task_ij < 
npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int dm_ij_pair0 = dm_pair_loc[pair_ij]; + int sq_ij = tx + tile * 16; + atomicAdd(vj+dm_ij_pair0+i, vj_ij_cache[sq_ij+i*128]); + } + } + for (int n = tx; n < 16; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_kl = blockIdx.y * 256 + tile * 16 + ty; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int dm_kl_pair0 = dm_pair_loc[pair_kl]; + int sq_kl = ty + tile * 16; + atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*256]); + } + } +} + +// TILEX=4, TILEY=16, cache_dm=True +__global__ +void md_j_3_1(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds) +{ + int *pair_ij_mapping = bounds.tile_ij_mapping; + int *pair_kl_mapping = bounds.tile_kl_mapping; + int task_ij0 = blockIdx.x * 64; + int task_kl0 = blockIdx.y * 256; + int pair_ij0 = pair_ij_mapping[task_ij0]; + int pair_kl0 = pair_kl_mapping[task_kl0]; + float *q_cond = bounds.q_cond; + if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) { + return; + } + + int tx = threadIdx.x; + int ty = threadIdx.y; + int sq_id = tx + 16 * ty; + int *bas = envs.bas; + int *dm_pair_loc = envs.ao_loc; + int nbas = envs.nbas; + double *env = envs.env; + double *dm = jk.dm; + double *vj = jk.vj; + double vj_ij, vj_kl; + + int npairs_ij = bounds.npairs_ij; + int npairs_kl = bounds.npairs_kl; + extern __shared__ double gamma_inc[]; + double *Rp_cache = gamma_inc + 1280; + double *Rq_cache = Rp_cache + 256; + double *vj_ij_cache = Rq_cache + 1024; + double *vj_kl_cache = vj_ij_cache + 1280; + double *vj_cache = vj_kl_cache + 1024; + double *dm_ij_cache = vj_cache + 256; + double *dm_kl_cache = dm_ij_cache + 1280; + // zero out all cache; + for (int n = sq_id; n < 6144; n += 256) { + Rp_cache[n] = 0.; + } + __syncthreads(); + + if (sq_id < 64) { + int task_ij = blockIdx.x * 64 + sq_id; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int ish = pair_ij / nbas; + int jsh = pair_ij % nbas; + double ai = env[bas[ish*BAS_SLOTS+PTR_EXP]]; + double aj = env[bas[jsh*BAS_SLOTS+PTR_EXP]]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double aij = ai + aj; + double xij = (ai * ri[0] + aj * rj[0]) / aij; + double yij = (ai * ri[1] + aj * rj[1]) / aij; + double zij = (ai * ri[2] + aj * rj[2]) / aij; + Rp_cache[sq_id+0] = xij; + Rp_cache[sq_id+64] = yij; + Rp_cache[sq_id+128] = zij; + Rp_cache[sq_id+192] = aij; + } else { + Rp_cache[sq_id+192] = 1.; + } + } + if (sq_id < 256) { + int task_kl = blockIdx.y * 256 + sq_id; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int ksh = pair_kl / nbas; + int lsh = pair_kl % nbas; + double ak = env[bas[ksh*BAS_SLOTS+PTR_EXP]]; + double al = env[bas[lsh*BAS_SLOTS+PTR_EXP]]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD]; + double akl = ak + al; + double xkl = (ak * rk[0] + al * rl[0]) / akl; + double ykl = (ak * rk[1] + al * rl[1]) / akl; + double zkl = (ak * rk[2] + al * rl[2]) / akl; + Rq_cache[sq_id+0] = xkl; + Rq_cache[sq_id+256] = ykl; + Rq_cache[sq_id+512] = zkl; + Rq_cache[sq_id+768] = akl; + } else { + Rq_cache[sq_id+768] = 1.; + } + } + for (int n = ty; n < 80; n += 16) { + int i = n / 4; + int tile = n % 4; + int task_ij = blockIdx.x * 64 + tile * 16 + tx; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int dm_ij_pair0 = dm_pair_loc[pair_ij]; + int sq_ij = tx + tile * 16; + dm_ij_cache[sq_ij+i*64] = dm[dm_ij_pair0+i]; + } + } + for 
(int n = tx; n < 64; n += 16) {
+        int i = n / 16;
+        int tile = n % 16;
+        int task_kl = blockIdx.y * 256 + tile * 16 + ty;
+        if (task_kl < npairs_kl) {
+            int pair_kl = pair_kl_mapping[task_kl];
+            int dm_kl_pair0 = dm_pair_loc[pair_kl];
+            int sq_kl = ty + tile * 16;
+            // stage the kl-pair density into shared memory
+            dm_kl_cache[sq_kl+i*256] = dm[dm_kl_pair0+i];
+        }
+    }
+    __syncthreads();
+
+    for (int batch_ij = 0; batch_ij < 4; ++batch_ij) {
+    for (int batch_kl = 0; batch_kl < 16; ++batch_kl) {
+        int task_ij0 = blockIdx.x * 64 + batch_ij * 16;
+        int task_kl0 = blockIdx.y * 256 + batch_kl * 16;
+        if (task_ij0 >= npairs_ij || task_kl0 >= npairs_kl) {
+            continue;
+        }
+        int pair_ij0 = pair_ij_mapping[task_ij0];
+        int pair_kl0 = pair_kl_mapping[task_kl0];
+        if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) {
+            continue;
+        }
+
+        int sq_ij = tx + batch_ij * 16;
+        int sq_kl = ty + batch_kl * 16;
+        int task_ij = task_ij0 + tx;
+        int task_kl = task_kl0 + ty;
+        double fac_sym = PI_FAC;
+        if (task_ij >= npairs_ij) {
+            task_ij = task_ij0;
+            fac_sym = 0.;
+        }
+        if (task_kl >= npairs_kl) {
+            task_kl = task_kl0;
+            fac_sym = 0.;
+        }
+        int pair_ij = pair_ij_mapping[task_ij];
+        int pair_kl = pair_kl_mapping[task_kl];
+
+        int ish = pair_ij / nbas;
+        int jsh = pair_ij % nbas;
+        int ksh = pair_kl / nbas;
+        int lsh = pair_kl % nbas;
+        if (ish == jsh) fac_sym *= .5;
+        if (ksh == lsh) fac_sym *= .5;
+        if (pair_ij_mapping == pair_kl_mapping) {
+            if (task_ij == task_kl) fac_sym *= .5;
+            if (task_ij < task_kl) fac_sym = 0.;
+        }
+        double xij = Rp_cache[sq_ij+0];
+        double yij = Rp_cache[sq_ij+64];
+        double zij = Rp_cache[sq_ij+128];
+        double aij = Rp_cache[sq_ij+192];
+        double xkl = Rq_cache[sq_kl+0];
+        double ykl = Rq_cache[sq_kl+256];
+        double zkl = Rq_cache[sq_kl+512];
+        double akl = Rq_cache[sq_kl+768];
+        double fac = fac_sym / (aij*akl*sqrt(aij+akl));
+        double xpq = xij - xkl;
+        double ypq = yij - ykl;
+        double zpq = zij - zkl;
+        double rr = xpq*xpq + ypq*ypq + zpq*zpq;
+        double theta = aij * akl / (aij + akl);
+        double theta_rr = theta * rr;
+        eval_gamma_inc_fn(gamma_inc, theta_rr, 4);
+        double a2 = -2.
* theta; + gamma_inc[sq_id] *= fac; + for (int i = 1; i <= 4; i++) { + fac *= a2; + gamma_inc[sq_id+i*256] *= fac; + } + vj_kl = 0.; + vj_kl += gamma_inc[sq_id+0*256] * dm_ij_cache[sq_ij+0]; + double R_0_0_0_1 = zpq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_0_1 * dm_ij_cache[sq_ij+64]; + double R_1_0_0_1 = zpq * gamma_inc[sq_id+2*256]; + double R_0_0_0_2 = zpq * R_1_0_0_1 + 1 * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_0_2 * dm_ij_cache[sq_ij+128]; + double R_2_0_0_1 = zpq * gamma_inc[sq_id+3*256]; + double R_1_0_0_2 = zpq * R_2_0_0_1 + 1 * gamma_inc[sq_id+2*256]; + double R_0_0_0_3 = zpq * R_1_0_0_2 + 2 * R_1_0_0_1; + vj_kl += R_0_0_0_3 * dm_ij_cache[sq_ij+192]; + double R_0_0_1_0 = ypq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_1_0 * dm_ij_cache[sq_ij+256]; + double R_0_0_1_1 = ypq * R_1_0_0_1; + vj_kl += R_0_0_1_1 * dm_ij_cache[sq_ij+320]; + double R_0_0_1_2 = ypq * R_1_0_0_2; + vj_kl += R_0_0_1_2 * dm_ij_cache[sq_ij+384]; + double R_1_0_1_0 = ypq * gamma_inc[sq_id+2*256]; + double R_0_0_2_0 = ypq * R_1_0_1_0 + 1 * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_2_0 * dm_ij_cache[sq_ij+448]; + double R_1_0_1_1 = ypq * R_2_0_0_1; + double R_0_0_2_1 = ypq * R_1_0_1_1 + 1 * R_1_0_0_1; + vj_kl += R_0_0_2_1 * dm_ij_cache[sq_ij+512]; + double R_2_0_1_0 = ypq * gamma_inc[sq_id+3*256]; + double R_1_0_2_0 = ypq * R_2_0_1_0 + 1 * gamma_inc[sq_id+2*256]; + double R_0_0_3_0 = ypq * R_1_0_2_0 + 2 * R_1_0_1_0; + vj_kl += R_0_0_3_0 * dm_ij_cache[sq_ij+576]; + double R_0_1_0_0 = xpq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_1_0_0 * dm_ij_cache[sq_ij+640]; + double R_0_1_0_1 = xpq * R_1_0_0_1; + vj_kl += R_0_1_0_1 * dm_ij_cache[sq_ij+704]; + double R_0_1_0_2 = xpq * R_1_0_0_2; + vj_kl += R_0_1_0_2 * dm_ij_cache[sq_ij+768]; + double R_0_1_1_0 = xpq * R_1_0_1_0; + vj_kl += R_0_1_1_0 * dm_ij_cache[sq_ij+832]; + double R_0_1_1_1 = xpq * R_1_0_1_1; + vj_kl += R_0_1_1_1 * dm_ij_cache[sq_ij+896]; + double R_0_1_2_0 = xpq * R_1_0_2_0; + vj_kl += R_0_1_2_0 * dm_ij_cache[sq_ij+960]; + double R_1_1_0_0 = xpq * gamma_inc[sq_id+2*256]; + double R_0_2_0_0 = xpq * R_1_1_0_0 + 1 * gamma_inc[sq_id+1*256]; + vj_kl += R_0_2_0_0 * dm_ij_cache[sq_ij+1024]; + double R_1_1_0_1 = xpq * R_2_0_0_1; + double R_0_2_0_1 = xpq * R_1_1_0_1 + 1 * R_1_0_0_1; + vj_kl += R_0_2_0_1 * dm_ij_cache[sq_ij+1088]; + double R_1_1_1_0 = xpq * R_2_0_1_0; + double R_0_2_1_0 = xpq * R_1_1_1_0 + 1 * R_1_0_1_0; + vj_kl += R_0_2_1_0 * dm_ij_cache[sq_ij+1152]; + double R_2_1_0_0 = xpq * gamma_inc[sq_id+3*256]; + double R_1_2_0_0 = xpq * R_2_1_0_0 + 1 * gamma_inc[sq_id+2*256]; + double R_0_3_0_0 = xpq * R_1_2_0_0 + 2 * R_1_1_0_0; + vj_kl += R_0_3_0_0 * dm_ij_cache[sq_ij+1216]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+0] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl -= R_0_0_0_1 * dm_ij_cache[sq_ij+0]; + vj_kl -= R_0_0_0_2 * dm_ij_cache[sq_ij+64]; + vj_kl -= R_0_0_0_3 * dm_ij_cache[sq_ij+128]; + double R_3_0_0_1 = zpq * gamma_inc[sq_id+4*256]; + double R_2_0_0_2 = zpq * R_3_0_0_1 + 1 * gamma_inc[sq_id+3*256]; + double R_1_0_0_3 = zpq * R_2_0_0_2 + 2 * R_2_0_0_1; + double R_0_0_0_4 = zpq * R_1_0_0_3 + 3 * R_1_0_0_2; + vj_kl -= R_0_0_0_4 * dm_ij_cache[sq_ij+192]; + vj_kl -= R_0_0_1_1 * dm_ij_cache[sq_ij+256]; + vj_kl -= R_0_0_1_2 * dm_ij_cache[sq_ij+320]; + double R_0_0_1_3 = ypq * R_1_0_0_3; + vj_kl -= R_0_0_1_3 * 
dm_ij_cache[sq_ij+384]; + vj_kl -= R_0_0_2_1 * dm_ij_cache[sq_ij+448]; + double R_1_0_1_2 = ypq * R_2_0_0_2; + double R_0_0_2_2 = ypq * R_1_0_1_2 + 1 * R_1_0_0_2; + vj_kl -= R_0_0_2_2 * dm_ij_cache[sq_ij+512]; + double R_2_0_1_1 = ypq * R_3_0_0_1; + double R_1_0_2_1 = ypq * R_2_0_1_1 + 1 * R_2_0_0_1; + double R_0_0_3_1 = ypq * R_1_0_2_1 + 2 * R_1_0_1_1; + vj_kl -= R_0_0_3_1 * dm_ij_cache[sq_ij+576]; + vj_kl -= R_0_1_0_1 * dm_ij_cache[sq_ij+640]; + vj_kl -= R_0_1_0_2 * dm_ij_cache[sq_ij+704]; + double R_0_1_0_3 = xpq * R_1_0_0_3; + vj_kl -= R_0_1_0_3 * dm_ij_cache[sq_ij+768]; + vj_kl -= R_0_1_1_1 * dm_ij_cache[sq_ij+832]; + double R_0_1_1_2 = xpq * R_1_0_1_2; + vj_kl -= R_0_1_1_2 * dm_ij_cache[sq_ij+896]; + double R_0_1_2_1 = xpq * R_1_0_2_1; + vj_kl -= R_0_1_2_1 * dm_ij_cache[sq_ij+960]; + vj_kl -= R_0_2_0_1 * dm_ij_cache[sq_ij+1024]; + double R_1_1_0_2 = xpq * R_2_0_0_2; + double R_0_2_0_2 = xpq * R_1_1_0_2 + 1 * R_1_0_0_2; + vj_kl -= R_0_2_0_2 * dm_ij_cache[sq_ij+1088]; + double R_1_1_1_1 = xpq * R_2_0_1_1; + double R_0_2_1_1 = xpq * R_1_1_1_1 + 1 * R_1_0_1_1; + vj_kl -= R_0_2_1_1 * dm_ij_cache[sq_ij+1152]; + double R_2_1_0_1 = xpq * R_3_0_0_1; + double R_1_2_0_1 = xpq * R_2_1_0_1 + 1 * R_2_0_0_1; + double R_0_3_0_1 = xpq * R_1_2_0_1 + 2 * R_1_1_0_1; + vj_kl -= R_0_3_0_1 * dm_ij_cache[sq_ij+1216]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+256] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl -= R_0_0_1_0 * dm_ij_cache[sq_ij+0]; + vj_kl -= R_0_0_1_1 * dm_ij_cache[sq_ij+64]; + vj_kl -= R_0_0_1_2 * dm_ij_cache[sq_ij+128]; + vj_kl -= R_0_0_1_3 * dm_ij_cache[sq_ij+192]; + vj_kl -= R_0_0_2_0 * dm_ij_cache[sq_ij+256]; + vj_kl -= R_0_0_2_1 * dm_ij_cache[sq_ij+320]; + vj_kl -= R_0_0_2_2 * dm_ij_cache[sq_ij+384]; + vj_kl -= R_0_0_3_0 * dm_ij_cache[sq_ij+448]; + vj_kl -= R_0_0_3_1 * dm_ij_cache[sq_ij+512]; + double R_3_0_1_0 = ypq * gamma_inc[sq_id+4*256]; + double R_2_0_2_0 = ypq * R_3_0_1_0 + 1 * gamma_inc[sq_id+3*256]; + double R_1_0_3_0 = ypq * R_2_0_2_0 + 2 * R_2_0_1_0; + double R_0_0_4_0 = ypq * R_1_0_3_0 + 3 * R_1_0_2_0; + vj_kl -= R_0_0_4_0 * dm_ij_cache[sq_ij+576]; + vj_kl -= R_0_1_1_0 * dm_ij_cache[sq_ij+640]; + vj_kl -= R_0_1_1_1 * dm_ij_cache[sq_ij+704]; + vj_kl -= R_0_1_1_2 * dm_ij_cache[sq_ij+768]; + vj_kl -= R_0_1_2_0 * dm_ij_cache[sq_ij+832]; + vj_kl -= R_0_1_2_1 * dm_ij_cache[sq_ij+896]; + double R_0_1_3_0 = xpq * R_1_0_3_0; + vj_kl -= R_0_1_3_0 * dm_ij_cache[sq_ij+960]; + vj_kl -= R_0_2_1_0 * dm_ij_cache[sq_ij+1024]; + vj_kl -= R_0_2_1_1 * dm_ij_cache[sq_ij+1088]; + double R_1_1_2_0 = xpq * R_2_0_2_0; + double R_0_2_2_0 = xpq * R_1_1_2_0 + 1 * R_1_0_2_0; + vj_kl -= R_0_2_2_0 * dm_ij_cache[sq_ij+1152]; + double R_2_1_1_0 = xpq * R_3_0_1_0; + double R_1_2_1_0 = xpq * R_2_1_1_0 + 1 * R_2_0_1_0; + double R_0_3_1_0 = xpq * R_1_2_1_0 + 2 * R_1_1_1_0; + vj_kl -= R_0_3_1_0 * dm_ij_cache[sq_ij+1216]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+512] += vj_cache[sq_id]; + } + vj_kl = 0.; + vj_kl -= R_0_1_0_0 * dm_ij_cache[sq_ij+0]; + vj_kl -= R_0_1_0_1 * dm_ij_cache[sq_ij+64]; + vj_kl -= R_0_1_0_2 * dm_ij_cache[sq_ij+128]; + vj_kl -= 
R_0_1_0_3 * dm_ij_cache[sq_ij+192]; + vj_kl -= R_0_1_1_0 * dm_ij_cache[sq_ij+256]; + vj_kl -= R_0_1_1_1 * dm_ij_cache[sq_ij+320]; + vj_kl -= R_0_1_1_2 * dm_ij_cache[sq_ij+384]; + vj_kl -= R_0_1_2_0 * dm_ij_cache[sq_ij+448]; + vj_kl -= R_0_1_2_1 * dm_ij_cache[sq_ij+512]; + vj_kl -= R_0_1_3_0 * dm_ij_cache[sq_ij+576]; + vj_kl -= R_0_2_0_0 * dm_ij_cache[sq_ij+640]; + vj_kl -= R_0_2_0_1 * dm_ij_cache[sq_ij+704]; + vj_kl -= R_0_2_0_2 * dm_ij_cache[sq_ij+768]; + vj_kl -= R_0_2_1_0 * dm_ij_cache[sq_ij+832]; + vj_kl -= R_0_2_1_1 * dm_ij_cache[sq_ij+896]; + vj_kl -= R_0_2_2_0 * dm_ij_cache[sq_ij+960]; + vj_kl -= R_0_3_0_0 * dm_ij_cache[sq_ij+1024]; + vj_kl -= R_0_3_0_1 * dm_ij_cache[sq_ij+1088]; + vj_kl -= R_0_3_1_0 * dm_ij_cache[sq_ij+1152]; + double R_3_1_0_0 = xpq * gamma_inc[sq_id+4*256]; + double R_2_2_0_0 = xpq * R_3_1_0_0 + 1 * gamma_inc[sq_id+3*256]; + double R_1_3_0_0 = xpq * R_2_2_0_0 + 2 * R_2_1_0_0; + double R_0_4_0_0 = xpq * R_1_3_0_0 + 3 * R_1_2_0_0; + vj_kl -= R_0_4_0_0 * dm_ij_cache[sq_ij+1216]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+768] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += gamma_inc[sq_id+0*256] * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_0_1 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_0_1_0 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_1_0_0 * dm_kl_cache[sq_kl+768]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+0] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_0_1 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_0_2 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_0_1_1 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_1_0_1 * dm_kl_cache[sq_kl+768]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+64] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_0_2 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_0_3 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_0_1_2 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_1_0_2 * dm_kl_cache[sq_kl+768]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+128] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_0_3 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_0_4 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_0_1_3 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_1_0_3 * dm_kl_cache[sq_kl+768]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+192] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_1_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_1_1 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_0_2_0 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_1_1_0 * dm_kl_cache[sq_kl+768]; 
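+            // Tree-reduce vj_ij over the 16 ty lanes: sq_id = tx + 16*ty, so the
+            // stride*16 offset walks one column of the 16x16 vj_cache tile.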
+ __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+256] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_1_1 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_1_2 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_0_2_1 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_1_1_1 * dm_kl_cache[sq_kl+768]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+320] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_1_2 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_1_3 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_0_2_2 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_1_1_2 * dm_kl_cache[sq_kl+768]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+384] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_2_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_2_1 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_0_3_0 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_1_2_0 * dm_kl_cache[sq_kl+768]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+448] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_2_1 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_2_2 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_0_3_1 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_1_2_1 * dm_kl_cache[sq_kl+768]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+512] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_3_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_0_3_1 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_0_4_0 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_1_3_0 * dm_kl_cache[sq_kl+768]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+576] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_0_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_1_0_1 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_1_1_0 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_2_0_0 * dm_kl_cache[sq_kl+768]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+640] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_0_1 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_1_0_2 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_1_1_1 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_2_0_1 * dm_kl_cache[sq_kl+768]; + 
__syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+704] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_0_2 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_1_0_3 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_1_1_2 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_2_0_2 * dm_kl_cache[sq_kl+768]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+768] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_1_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_1_1_1 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_1_2_0 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_2_1_0 * dm_kl_cache[sq_kl+768]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+832] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_1_1 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_1_1_2 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_1_2_1 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_2_1_1 * dm_kl_cache[sq_kl+768]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+896] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_2_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_1_2_1 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_1_3_0 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_2_2_0 * dm_kl_cache[sq_kl+768]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+960] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_2_0_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_2_0_1 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_2_1_0 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_3_0_0 * dm_kl_cache[sq_kl+768]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1024] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_2_0_1 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_2_0_2 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_2_1_1 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_3_0_1 * dm_kl_cache[sq_kl+768]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1088] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_2_1_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_2_1_1 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_2_2_0 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_3_1_0 * dm_kl_cache[sq_kl+768]; + 
__syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1152] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_3_0_0 * dm_kl_cache[sq_kl+0]; + vj_ij -= R_0_3_0_1 * dm_kl_cache[sq_kl+256]; + vj_ij -= R_0_3_1_0 * dm_kl_cache[sq_kl+512]; + vj_ij -= R_0_4_0_0 * dm_kl_cache[sq_kl+768]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1216] += vj_cache[sq_id]; + } + __syncthreads(); + } } + for (int n = ty; n < 80; n += 16) { + int i = n / 4; + int tile = n % 4; + int task_ij = blockIdx.x * 64 + tile * 16 + tx; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int dm_ij_pair0 = dm_pair_loc[pair_ij]; + int sq_ij = tx + tile * 16; + atomicAdd(vj+dm_ij_pair0+i, vj_ij_cache[sq_ij+i*64]); + } + } + for (int n = tx; n < 64; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_kl = blockIdx.y * 256 + tile * 16 + ty; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int dm_kl_pair0 = dm_pair_loc[pair_kl]; + int sq_kl = ty + tile * 16; + atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*256]); + } + } +} + +// TILEX=4, TILEY=16, cache_dm=True +__global__ +void md_j_4_0(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds) +{ + int *pair_ij_mapping = bounds.tile_ij_mapping; + int *pair_kl_mapping = bounds.tile_kl_mapping; + int task_ij0 = blockIdx.x * 64; + int task_kl0 = blockIdx.y * 256; + int pair_ij0 = pair_ij_mapping[task_ij0]; + int pair_kl0 = pair_kl_mapping[task_kl0]; + float *q_cond = bounds.q_cond; + if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) { + return; + } + + int tx = threadIdx.x; + int ty = threadIdx.y; + int sq_id = tx + 16 * ty; + int *bas = envs.bas; + int *dm_pair_loc = envs.ao_loc; + int nbas = envs.nbas; + double *env = envs.env; + double *dm = jk.dm; + double *vj = jk.vj; + double vj_ij, vj_kl; + + int npairs_ij = bounds.npairs_ij; + int npairs_kl = bounds.npairs_kl; + extern __shared__ double gamma_inc[]; + double *Rp_cache = gamma_inc + 1280; + double *Rq_cache = Rp_cache + 256; + double *vj_ij_cache = Rq_cache + 1024; + double *vj_kl_cache = vj_ij_cache + 2240; + double *vj_cache = vj_kl_cache + 256; + double *dm_ij_cache = vj_cache + 256; + double *dm_kl_cache = dm_ij_cache + 2240; + // zero out all cache; + for (int n = sq_id; n < 6528; n += 256) { + Rp_cache[n] = 0.; + } + __syncthreads(); + + if (sq_id < 64) { + int task_ij = blockIdx.x * 64 + sq_id; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int ish = pair_ij / nbas; + int jsh = pair_ij % nbas; + double ai = env[bas[ish*BAS_SLOTS+PTR_EXP]]; + double aj = env[bas[jsh*BAS_SLOTS+PTR_EXP]]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double aij = ai + aj; + double xij = (ai * ri[0] + aj * rj[0]) / aij; + double yij = (ai * ri[1] + aj * rj[1]) / aij; + double zij = (ai * ri[2] + aj * rj[2]) / aij; + Rp_cache[sq_id+0] = xij; + Rp_cache[sq_id+64] = yij; + Rp_cache[sq_id+128] = zij; + Rp_cache[sq_id+192] = aij; + } else { + Rp_cache[sq_id+192] = 1.; + } + } + if (sq_id < 256) { + int task_kl = blockIdx.y * 256 + sq_id; + if (task_kl 
< npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int ksh = pair_kl / nbas; + int lsh = pair_kl % nbas; + double ak = env[bas[ksh*BAS_SLOTS+PTR_EXP]]; + double al = env[bas[lsh*BAS_SLOTS+PTR_EXP]]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD]; + double akl = ak + al; + double xkl = (ak * rk[0] + al * rl[0]) / akl; + double ykl = (ak * rk[1] + al * rl[1]) / akl; + double zkl = (ak * rk[2] + al * rl[2]) / akl; + Rq_cache[sq_id+0] = xkl; + Rq_cache[sq_id+256] = ykl; + Rq_cache[sq_id+512] = zkl; + Rq_cache[sq_id+768] = akl; + } else { + Rq_cache[sq_id+768] = 1.; + } + } + for (int n = ty; n < 140; n += 16) { + int i = n / 4; + int tile = n % 4; + int task_ij = blockIdx.x * 64 + tile * 16 + tx; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int dm_ij_pair0 = dm_pair_loc[pair_ij]; + int sq_ij = tx + tile * 16; + dm_ij_cache[sq_ij+i*64] = dm[dm_ij_pair0+i]; + } + } + for (int n = tx; n < 16; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_kl = blockIdx.y * 256 + tile * 16 + ty; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int dm_kl_pair0 = dm_pair_loc[pair_kl]; + int sq_kl = ty + tile * 16; + dm_kl_cache[sq_kl+i*256] = dm[dm_kl_pair0+i]; + } + } + __syncthreads(); + + for (int batch_ij = 0; batch_ij < 4; ++batch_ij) { + for (int batch_kl = 0; batch_kl < 16; ++batch_kl) { + int task_ij0 = blockIdx.x * 64 + batch_ij * 16; + int task_kl0 = blockIdx.y * 256 + batch_kl * 16; + if (task_ij0 >= npairs_ij || task_kl0 >= npairs_kl) { + continue; + } + int pair_ij0 = pair_ij_mapping[task_ij0]; + int pair_kl0 = pair_kl_mapping[task_kl0]; + if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) { + continue; + } + + int sq_ij = tx + batch_ij * 16; + int sq_kl = ty + batch_kl * 16; + int task_ij = task_ij0 + tx; + int task_kl = task_kl0 + ty; + double fac_sym = PI_FAC; + if (task_ij >= npairs_ij) { + task_ij = task_ij0; + fac_sym = 0.; + } + if (task_kl >= npairs_kl) { + task_kl = task_kl0; + fac_sym = 0.; + } + int pair_ij = pair_ij_mapping[task_ij]; + int pair_kl = pair_kl_mapping[task_kl]; + + int ish = pair_ij / nbas; + int jsh = pair_ij % nbas; + int ksh = pair_kl / nbas; + int lsh = pair_kl % nbas; + if (ish == jsh) fac_sym *= .5; + if (ksh == lsh) fac_sym *= .5; + if (pair_ij_mapping == pair_kl_mapping) { + if (task_ij == task_kl) fac_sym *= .5; + if (task_ij < task_kl) fac_sym = 0.; + } + double xij = Rp_cache[sq_ij+0]; + double yij = Rp_cache[sq_ij+64]; + double zij = Rp_cache[sq_ij+128]; + double aij = Rp_cache[sq_ij+192]; + double xkl = Rq_cache[sq_kl+0]; + double ykl = Rq_cache[sq_kl+256]; + double zkl = Rq_cache[sq_kl+512]; + double akl = Rq_cache[sq_kl+768]; + double fac = fac_sym / (aij*akl*sqrt(aij+akl)); + double xpq = xij - xkl; + double ypq = yij - ykl; + double zpq = zij - zkl; + double rr = xpq*xpq + ypq*ypq + zpq*zpq; + double theta = aij * akl / (aij + akl); + double theta_rr = theta * rr; + eval_gamma_inc_fn(gamma_inc, theta_rr, 4); + double a2 = -2.
* theta; + gamma_inc[sq_id] *= fac; + for (int i = 1; i <= 4; i++) { + fac *= a2; + gamma_inc[sq_id+i*256] *= fac; + } + vj_kl = 0.; + vj_kl += gamma_inc[sq_id+0*256] * dm_ij_cache[sq_ij+0]; + double R_0_0_0_1 = zpq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_0_1 * dm_ij_cache[sq_ij+64]; + double R_1_0_0_1 = zpq * gamma_inc[sq_id+2*256]; + double R_0_0_0_2 = zpq * R_1_0_0_1 + 1 * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_0_2 * dm_ij_cache[sq_ij+128]; + double R_2_0_0_1 = zpq * gamma_inc[sq_id+3*256]; + double R_1_0_0_2 = zpq * R_2_0_0_1 + 1 * gamma_inc[sq_id+2*256]; + double R_0_0_0_3 = zpq * R_1_0_0_2 + 2 * R_1_0_0_1; + vj_kl += R_0_0_0_3 * dm_ij_cache[sq_ij+192]; + double R_3_0_0_1 = zpq * gamma_inc[sq_id+4*256]; + double R_2_0_0_2 = zpq * R_3_0_0_1 + 1 * gamma_inc[sq_id+3*256]; + double R_1_0_0_3 = zpq * R_2_0_0_2 + 2 * R_2_0_0_1; + double R_0_0_0_4 = zpq * R_1_0_0_3 + 3 * R_1_0_0_2; + vj_kl += R_0_0_0_4 * dm_ij_cache[sq_ij+256]; + double R_0_0_1_0 = ypq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_1_0 * dm_ij_cache[sq_ij+320]; + double R_0_0_1_1 = ypq * R_1_0_0_1; + vj_kl += R_0_0_1_1 * dm_ij_cache[sq_ij+384]; + double R_0_0_1_2 = ypq * R_1_0_0_2; + vj_kl += R_0_0_1_2 * dm_ij_cache[sq_ij+448]; + double R_0_0_1_3 = ypq * R_1_0_0_3; + vj_kl += R_0_0_1_3 * dm_ij_cache[sq_ij+512]; + double R_1_0_1_0 = ypq * gamma_inc[sq_id+2*256]; + double R_0_0_2_0 = ypq * R_1_0_1_0 + 1 * gamma_inc[sq_id+1*256]; + vj_kl += R_0_0_2_0 * dm_ij_cache[sq_ij+576]; + double R_1_0_1_1 = ypq * R_2_0_0_1; + double R_0_0_2_1 = ypq * R_1_0_1_1 + 1 * R_1_0_0_1; + vj_kl += R_0_0_2_1 * dm_ij_cache[sq_ij+640]; + double R_1_0_1_2 = ypq * R_2_0_0_2; + double R_0_0_2_2 = ypq * R_1_0_1_2 + 1 * R_1_0_0_2; + vj_kl += R_0_0_2_2 * dm_ij_cache[sq_ij+704]; + double R_2_0_1_0 = ypq * gamma_inc[sq_id+3*256]; + double R_1_0_2_0 = ypq * R_2_0_1_0 + 1 * gamma_inc[sq_id+2*256]; + double R_0_0_3_0 = ypq * R_1_0_2_0 + 2 * R_1_0_1_0; + vj_kl += R_0_0_3_0 * dm_ij_cache[sq_ij+768]; + double R_2_0_1_1 = ypq * R_3_0_0_1; + double R_1_0_2_1 = ypq * R_2_0_1_1 + 1 * R_2_0_0_1; + double R_0_0_3_1 = ypq * R_1_0_2_1 + 2 * R_1_0_1_1; + vj_kl += R_0_0_3_1 * dm_ij_cache[sq_ij+832]; + double R_3_0_1_0 = ypq * gamma_inc[sq_id+4*256]; + double R_2_0_2_0 = ypq * R_3_0_1_0 + 1 * gamma_inc[sq_id+3*256]; + double R_1_0_3_0 = ypq * R_2_0_2_0 + 2 * R_2_0_1_0; + double R_0_0_4_0 = ypq * R_1_0_3_0 + 3 * R_1_0_2_0; + vj_kl += R_0_0_4_0 * dm_ij_cache[sq_ij+896]; + double R_0_1_0_0 = xpq * gamma_inc[sq_id+1*256]; + vj_kl += R_0_1_0_0 * dm_ij_cache[sq_ij+960]; + double R_0_1_0_1 = xpq * R_1_0_0_1; + vj_kl += R_0_1_0_1 * dm_ij_cache[sq_ij+1024]; + double R_0_1_0_2 = xpq * R_1_0_0_2; + vj_kl += R_0_1_0_2 * dm_ij_cache[sq_ij+1088]; + double R_0_1_0_3 = xpq * R_1_0_0_3; + vj_kl += R_0_1_0_3 * dm_ij_cache[sq_ij+1152]; + double R_0_1_1_0 = xpq * R_1_0_1_0; + vj_kl += R_0_1_1_0 * dm_ij_cache[sq_ij+1216]; + double R_0_1_1_1 = xpq * R_1_0_1_1; + vj_kl += R_0_1_1_1 * dm_ij_cache[sq_ij+1280]; + double R_0_1_1_2 = xpq * R_1_0_1_2; + vj_kl += R_0_1_1_2 * dm_ij_cache[sq_ij+1344]; + double R_0_1_2_0 = xpq * R_1_0_2_0; + vj_kl += R_0_1_2_0 * dm_ij_cache[sq_ij+1408]; + double R_0_1_2_1 = xpq * R_1_0_2_1; + vj_kl += R_0_1_2_1 * dm_ij_cache[sq_ij+1472]; + double R_0_1_3_0 = xpq * R_1_0_3_0; + vj_kl += R_0_1_3_0 * dm_ij_cache[sq_ij+1536]; + double R_1_1_0_0 = xpq * gamma_inc[sq_id+2*256]; + double R_0_2_0_0 = xpq * R_1_1_0_0 + 1 * gamma_inc[sq_id+1*256]; + vj_kl += R_0_2_0_0 * dm_ij_cache[sq_ij+1600]; + double R_1_1_0_1 = xpq * R_2_0_0_1; + double R_0_2_0_1 = xpq * R_1_1_0_1 + 1 * 
R_1_0_0_1; + vj_kl += R_0_2_0_1 * dm_ij_cache[sq_ij+1664]; + double R_1_1_0_2 = xpq * R_2_0_0_2; + double R_0_2_0_2 = xpq * R_1_1_0_2 + 1 * R_1_0_0_2; + vj_kl += R_0_2_0_2 * dm_ij_cache[sq_ij+1728]; + double R_1_1_1_0 = xpq * R_2_0_1_0; + double R_0_2_1_0 = xpq * R_1_1_1_0 + 1 * R_1_0_1_0; + vj_kl += R_0_2_1_0 * dm_ij_cache[sq_ij+1792]; + double R_1_1_1_1 = xpq * R_2_0_1_1; + double R_0_2_1_1 = xpq * R_1_1_1_1 + 1 * R_1_0_1_1; + vj_kl += R_0_2_1_1 * dm_ij_cache[sq_ij+1856]; + double R_1_1_2_0 = xpq * R_2_0_2_0; + double R_0_2_2_0 = xpq * R_1_1_2_0 + 1 * R_1_0_2_0; + vj_kl += R_0_2_2_0 * dm_ij_cache[sq_ij+1920]; + double R_2_1_0_0 = xpq * gamma_inc[sq_id+3*256]; + double R_1_2_0_0 = xpq * R_2_1_0_0 + 1 * gamma_inc[sq_id+2*256]; + double R_0_3_0_0 = xpq * R_1_2_0_0 + 2 * R_1_1_0_0; + vj_kl += R_0_3_0_0 * dm_ij_cache[sq_ij+1984]; + double R_2_1_0_1 = xpq * R_3_0_0_1; + double R_1_2_0_1 = xpq * R_2_1_0_1 + 1 * R_2_0_0_1; + double R_0_3_0_1 = xpq * R_1_2_0_1 + 2 * R_1_1_0_1; + vj_kl += R_0_3_0_1 * dm_ij_cache[sq_ij+2048]; + double R_2_1_1_0 = xpq * R_3_0_1_0; + double R_1_2_1_0 = xpq * R_2_1_1_0 + 1 * R_2_0_1_0; + double R_0_3_1_0 = xpq * R_1_2_1_0 + 2 * R_1_1_1_0; + vj_kl += R_0_3_1_0 * dm_ij_cache[sq_ij+2112]; + double R_3_1_0_0 = xpq * gamma_inc[sq_id+4*256]; + double R_2_2_0_0 = xpq * R_3_1_0_0 + 1 * gamma_inc[sq_id+3*256]; + double R_1_3_0_0 = xpq * R_2_2_0_0 + 2 * R_2_1_0_0; + double R_0_4_0_0 = xpq * R_1_3_0_0 + 3 * R_1_2_0_0; + vj_kl += R_0_4_0_0 * dm_ij_cache[sq_ij+2176]; + __syncthreads(); + vj_cache[sq_id] = vj_kl; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (tx < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride]; + } + } + __syncthreads(); + if (tx == 0 && task_kl0+ty < npairs_kl) { + vj_kl_cache[sq_kl+0] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += gamma_inc[sq_id+0*256] * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+0] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_0_1 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+64] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_0_2 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+128] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_0_3 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+192] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_0_4 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty 
== 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+256] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_1_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+320] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_1_1 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+384] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_1_2 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+448] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_1_3 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+512] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_2_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+576] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_2_1 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+640] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_2_2 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+704] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_3_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+768] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_3_1 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+832] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_0_4_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < 
stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+896] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_0_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+960] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_0_1 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1024] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_0_2 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1088] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_0_3 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1152] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_1_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1216] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_1_1 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1280] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_1_2 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1344] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_2_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1408] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_2_1 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1472] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_1_3_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + 
vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1536] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_2_0_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1600] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_2_0_1 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1664] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_2_0_2 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1728] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_2_1_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1792] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_2_1_1 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1856] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_2_2_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1920] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_3_0_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+1984] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_3_0_1 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+2048] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_3_1_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + 
vj_ij_cache[sq_ij+2112] += vj_cache[sq_id]; + } + vj_ij = 0.; + vj_ij += R_0_4_0_0 * dm_kl_cache[sq_kl+0]; + __syncthreads(); + vj_cache[sq_id] = vj_ij; + for (int stride = 8; stride > 0; stride /= 2) { + __syncthreads(); + if (ty < stride) { + vj_cache[sq_id] += vj_cache[sq_id + stride*16]; + } + } + __syncthreads(); + if (ty == 0 && task_ij0+tx < npairs_ij) { + vj_ij_cache[sq_ij+2176] += vj_cache[sq_id]; + } + __syncthreads(); + } } + for (int n = ty; n < 140; n += 16) { + int i = n / 4; + int tile = n % 4; + int task_ij = blockIdx.x * 64 + tile * 16 + tx; + if (task_ij < npairs_ij) { + int pair_ij = pair_ij_mapping[task_ij]; + int dm_ij_pair0 = dm_pair_loc[pair_ij]; + int sq_ij = tx + tile * 16; + atomicAdd(vj+dm_ij_pair0+i, vj_ij_cache[sq_ij+i*64]); + } + } + for (int n = tx; n < 16; n += 16) { + int i = n / 16; + int tile = n % 16; + int task_kl = blockIdx.y * 256 + tile * 16 + ty; + if (task_kl < npairs_kl) { + int pair_kl = pair_kl_mapping[task_kl]; + int dm_kl_pair0 = dm_pair_loc[pair_kl]; + int sq_kl = ty + tile * 16; + atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*256]); + } + } +} + +int md_j_unrolled(RysIntEnvVars *envs, JKMatrix *jk, BoundsInfo *bounds, + int *scheme, int workers, double omega) +{ + int li = bounds->li; + int lj = bounds->lj; + int lk = bounds->lk; + int ll = bounds->ll; + int lij = li + lj; + int lkl = lk + ll; + dim3 threads(16, 16); + dim3 blocks; + int ijkl = lij*9 + lkl; + switch (ijkl) { + case 0: // lij=0, lkl=0 + blocks.x = (bounds->npairs_ij + 255) / 256; + blocks.y = (bounds->npairs_kl + 255) / 256; + md_j_0_0<<<blocks, threads, 3584*sizeof(double)>>>(*envs, *jk, *bounds); break; + case 9: // lij=1, lkl=0 + blocks.x = (bounds->npairs_ij + 255) / 256; + blocks.y = (bounds->npairs_kl + 255) / 256; + md_j_1_0<<<blocks, threads, 5376*sizeof(double)>>>(*envs, *jk, *bounds); break; + case 10: // lij=1, lkl=1 + blocks.x = (bounds->npairs_ij + 255) / 256; + blocks.y = (bounds->npairs_kl + 255) / 256; + md_j_1_1<<<blocks, threads, 7168*sizeof(double)>>>(*envs, *jk, *bounds); break; + case 11: // lij=1, lkl=2 + blocks.x = (bounds->npairs_ij + 255) / 256; + blocks.y = (bounds->npairs_kl + 127) / 128; + md_j_1_2<<<blocks, threads, 7424*sizeof(double)>>>(*envs, *jk, *bounds); break; + case 18: // lij=2, lkl=0 + blocks.x = (bounds->npairs_ij + 255) / 256; + blocks.y = (bounds->npairs_kl + 255) / 256; + md_j_2_0<<<blocks, threads, 8704*sizeof(double)>>>(*envs, *jk, *bounds); break; + case 19: // lij=2, lkl=1 + blocks.x = (bounds->npairs_ij + 255) / 256; + blocks.y = (bounds->npairs_kl + 127) / 128; + md_j_2_1<<<blocks, threads, 8960*sizeof(double)>>>(*envs, *jk, *bounds); break; + case 20: // lij=2, lkl=2 + blocks.x = (bounds->npairs_ij + 255) / 256; + blocks.y = (bounds->npairs_kl + 63) / 64; + md_j_2_2<<<blocks, threads, 9216*sizeof(double)>>>(*envs, *jk, *bounds); break; + case 27: // lij=3, lkl=0 + blocks.x = (bounds->npairs_ij + 127) / 128; + blocks.y = (bounds->npairs_kl + 255) / 256; + md_j_3_0<<<blocks, threads, 8448*sizeof(double)>>>(*envs, *jk, *bounds); break; + case 28: // lij=3, lkl=1 + blocks.x = (bounds->npairs_ij + 63) / 64; + blocks.y = (bounds->npairs_kl + 255) / 256; + md_j_3_1<<<blocks, threads, 7424*sizeof(double)>>>(*envs, *jk, *bounds); break; + case 36: // lij=4, lkl=0 + blocks.x = (bounds->npairs_ij + 63) / 64; + blocks.y = (bounds->npairs_kl + 255) / 256; + md_j_4_0<<<blocks, threads, 7808*sizeof(double)>>>(*envs, *jk, *bounds); break; + default: return 0; + } + return 1; +} + +void set_md_j_unrolled_shm_size() +{ + cudaFuncSetAttribute(md_j_0_0, cudaFuncAttributeMaxDynamicSharedMemorySize, 3584*sizeof(double)); + cudaFuncSetAttribute(md_j_1_0, cudaFuncAttributeMaxDynamicSharedMemorySize, 5376*sizeof(double)); + cudaFuncSetAttribute(md_j_1_1, cudaFuncAttributeMaxDynamicSharedMemorySize, 7168*sizeof(double)); + cudaFuncSetAttribute(md_j_1_2, cudaFuncAttributeMaxDynamicSharedMemorySize, 7424*sizeof(double)); +
cudaFuncSetAttribute(md_j_2_0, cudaFuncAttributeMaxDynamicSharedMemorySize, 8704*sizeof(double)); + cudaFuncSetAttribute(md_j_2_1, cudaFuncAttributeMaxDynamicSharedMemorySize, 8960*sizeof(double)); + cudaFuncSetAttribute(md_j_2_2, cudaFuncAttributeMaxDynamicSharedMemorySize, 9216*sizeof(double)); + cudaFuncSetAttribute(md_j_3_0, cudaFuncAttributeMaxDynamicSharedMemorySize, 8448*sizeof(double)); + cudaFuncSetAttribute(md_j_3_1, cudaFuncAttributeMaxDynamicSharedMemorySize, 7424*sizeof(double)); + cudaFuncSetAttribute(md_j_4_0, cudaFuncAttributeMaxDynamicSharedMemorySize, 7808*sizeof(double)); +} diff --git a/gpu4pyscf/lib/gvhf-rys/gamma_inc.cu b/gpu4pyscf/lib/gvhf-rys/gamma_inc.cu index 3953c63b..89758bfd 100644 --- a/gpu4pyscf/lib/gvhf-rys/gamma_inc.cu +++ b/gpu4pyscf/lib/gvhf-rys/gamma_inc.cu @@ -4,10 +4,8 @@ #define SQRTPIE4 .886226925452758013 __device__ -static void eval_gamma_inc_fn(double *f, double t, int m) +static void eval_gamma_inc_fn(double *f, double t, int m, int sq_id, int block_size) { - int sq_id = threadIdx.x; - int block_size = blockDim.x; if (t < EPS_FLOAT64) { f[sq_id] = 1.; for (int i = 1; i <= m; i++) { diff --git a/gpu4pyscf/lib/gvhf-rys/gamma_inc_unrolled.cu b/gpu4pyscf/lib/gvhf-rys/gamma_inc_unrolled.cu index 17c8c570..88ba3436 100644 --- a/gpu4pyscf/lib/gvhf-rys/gamma_inc_unrolled.cu +++ b/gpu4pyscf/lib/gvhf-rys/gamma_inc_unrolled.cu @@ -6,8 +6,8 @@ __device__ static void eval_gamma_inc_fn(double *f, double t, int m) { - int t_id = threadIdx.x + blockDim.x * threadIdx.y; - int block_size = blockDim.x * blockDim.y; + int t_id = threadIdx.x + blockDim.x * threadIdx.y + blockDim.x * blockDim.y * threadIdx.z; + int block_size = blockDim.x * blockDim.y * blockDim.z; if (t < EPS_FLOAT64) { f[t_id] = 1.; for (int i = 1; i <= m; i++) { diff --git a/gpu4pyscf/lib/tests/test_cusolver.py b/gpu4pyscf/lib/tests/test_cusolver.py index e69de29b..0f4941c7 100644 --- a/gpu4pyscf/lib/tests/test_cusolver.py +++ b/gpu4pyscf/lib/tests/test_cusolver.py @@ -0,0 +1,64 @@ +# Copyright 2024 The GPU4PySCF Authors. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
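A note on set_md_j_unrolled_shm_size above: kernels that declare an `extern __shared__` buffer larger than the default 48 KB per-block limit must opt in via cudaFuncSetAttribute before they can be launched, which is why each unrolled kernel registers its exact buffer size (e.g. 7808 doubles for md_j_4_0, matching its gamma_inc/Rp/Rq/vj/dm cache layout). The same opt-in is available from Python through CuPy; the following is a minimal sketch with a made-up kernel, not code from this patch:

    import cupy as cp
    import numpy as np

    code = r'''
    extern "C" __global__ void fill(double *out, int n) {
        extern __shared__ double buf[];   // dynamic shared memory, sized at launch
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) out[i] = 1.0;
    }
    '''
    kern = cp.RawKernel(code, 'fill')
    nbytes = 7808 * 8                            # 62464 bytes, above the 48 KB default
    kern.max_dynamic_shared_size_bytes = nbytes  # CuPy's wrapper around cudaFuncSetAttribute
    out = cp.empty(256, dtype=cp.float64)
    kern((1,), (256,), (out, np.int32(256)), shared_mem=nbytes)

Without the attribute assignment, a launch requesting more than 48 KB of dynamic shared memory fails even on GPUs whose hardware supports larger allocations.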
+ +import unittest +import numpy as np +import scipy.linalg +import cupy as cp +from gpu4pyscf.lib.cusolver import eigh, cholesky + +def test_eigh_real(): + np.random.seed(6) + n = 12 + a = np.random.rand(n, n) + a = a + a.T + b = np.random.rand(n, n) + b = b.dot(b.T) + ref = scipy.linalg.eigh(a, b) + e, c = eigh(cp.asarray(a), cp.asarray(b)) + assert abs(e.get() - ref[0]).max() < 1e-10 + ovlp = c.get().T.dot(b).dot(ref[1]) + assert abs(abs(ovlp) - np.eye(n)).max() < 1e-10 + +def test_eigh_cmplx(): + np.random.seed(6) + n = 12 + a = np.random.rand(n, n) + np.random.rand(n, n) * 1j + a = a + a.conj().T + b = np.random.rand(n, n) + np.random.rand(n, n) * 1j + b = b.dot(b.conj().T) + ref = scipy.linalg.eigh(a, b) + e, c = eigh(cp.asarray(a), cp.asarray(b)) + assert abs(e.get() - ref[0]).max() < 1e-10 + ovlp = c.get().conj().T.dot(b).dot(ref[1]) + assert abs(abs(ovlp) - np.eye(n)).max() < 1e-10 + +def test_cholesky_real(): + np.random.seed(6) + n = 12 + a = np.random.rand(n, n) + a = a.dot(a.T) + ref = np.linalg.cholesky(a) + x = cholesky(cp.asarray(a)) + assert abs(x.get() - ref).max() < 1e-12 + +def test_cholesky_cmplx(): + np.random.seed(6) + n = 12 + a = np.random.rand(n, n) + np.random.rand(n, n) * 1j + a = a.dot(a.conj().T) + ref = np.linalg.cholesky(a) + x = cholesky(cp.asarray(a)) + assert abs(x.get() - ref).max() < 1e-12 diff --git a/gpu4pyscf/lib/tests/test_cutensor.py b/gpu4pyscf/lib/tests/test_cutensor.py index ca338331..3e9ef1c4 100644 --- a/gpu4pyscf/lib/tests/test_cutensor.py +++ b/gpu4pyscf/lib/tests/test_cutensor.py @@ -38,6 +38,13 @@ def test_contract(self): c_contract = contract('lkji,jk->il', a, b[10:20,10:20]) assert cupy.linalg.norm(c_einsum - c_contract) < 1e-10 + def test_complex_valued(self): + a = cupy.random.rand(10,9,11) + cupy.random.rand(10,9,11)*1j + b = cupy.random.rand(11,7,13) + cupy.random.rand(11,7,13)*1j + c_einsum = cupy.einsum('ijk,ikl->jl', a[3:9,:,4:10], b[3:9,:6, 7:13]) + c_contract = contract('ijk,ikl->jl', a[3:9,:,4:10], b[3:9,:6, 7:13]) + assert cupy.linalg.norm(c_einsum - c_contract) < 1e-10 + def test_cache(self): a = cupy.random.rand(20,20,20,20) b = cupy.random.rand(20,20) @@ -52,4 +59,4 @@ def test_cache(self): if __name__ == "__main__": print("Full tests for cutensor module") - unittest.main() \ No newline at end of file + unittest.main() diff --git a/gpu4pyscf/mp/dfmp2.py b/gpu4pyscf/mp/dfmp2.py index 753b987c..d8c3b0c2 100644 --- a/gpu4pyscf/mp/dfmp2.py +++ b/gpu4pyscf/mp/dfmp2.py @@ -100,8 +100,7 @@ def loop_ao2mo(self, mo_coeff, nocc): mo_coeff = cupy.asarray(mo_coeff, order='C') Lov = None with_df = self.with_df - ao_idx = with_df.intopt.ao_idx - mo_coeff = mo_coeff[ao_idx] + mo_coeff = with_df.intopt.sort_orbitals(mo_coeff, axis=[0]) orbo = mo_coeff[:,:nocc] orbv = mo_coeff[:,nocc:] blksize = with_df.get_blksize() diff --git a/gpu4pyscf/mp/tests/test_mp2.py b/gpu4pyscf/mp/tests/test_mp2.py index 43142fd8..9cffad01 100644 --- a/gpu4pyscf/mp/tests/test_mp2.py +++ b/gpu4pyscf/mp/tests/test_mp2.py @@ -155,4 +155,4 @@ def test_to_gpu(self): if __name__ == "__main__": print("Full Tests for mp2") - unittest.main() \ No newline at end of file + unittest.main() diff --git a/gpu4pyscf/pbc/__init__.py b/gpu4pyscf/pbc/__init__.py new file mode 100644 index 00000000..f7ec6fe8 --- /dev/null +++ b/gpu4pyscf/pbc/__init__.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. 
+# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +from gpu4pyscf.pbc import scf +from gpu4pyscf.pbc import dft diff --git a/gpu4pyscf/pbc/df/__init__.py b/gpu4pyscf/pbc/df/__init__.py new file mode 100644 index 00000000..6b9e0c3f --- /dev/null +++ b/gpu4pyscf/pbc/df/__init__.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +from . import fft +#from . import aft +#from . import df +from .fft import FFTDF +#from .df import DF, GDF +#from .aft import AFTDF + +class DF: pass # Just a placeholder diff --git a/gpu4pyscf/pbc/df/fft.py b/gpu4pyscf/pbc/df/fft.py new file mode 100644 index 00000000..f84894ac --- /dev/null +++ b/gpu4pyscf/pbc/df/fft.py @@ -0,0 +1,272 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
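The fft.py module below implements the GPW (Gaussian and plane-wave) evaluation of local potentials: assemble the relevant density in reciprocal space, multiply by the Coulomb kernel 4*pi/G**2 with the divergent G=0 term removed, and inverse-FFT back onto the uniform real-space grid; get_nuc, get_pp, and the fft_jk routines all follow this pattern. A toy NumPy sketch of that reciprocal-space Poisson step, with an invented cubic cell and mesh:

    import numpy as np

    L, n = 10.0, 32                     # hypothetical cell edge (Bohr) and mesh size
    gx = 2 * np.pi * np.fft.fftfreq(n, d=L/n)   # 1D G components of the cubic cell
    G2 = gx[:,None,None]**2 + gx[None,:,None]**2 + gx[None,None,:]**2

    rhoR = np.random.rand(n, n, n)      # stand-in for a real-space density
    rhoG = np.fft.fftn(rhoR)
    coulG = np.divide(4*np.pi, G2, out=np.zeros_like(G2), where=G2 > 0)
    vR = np.fft.ifftn(coulG * rhoG).real   # the corresponding Coulomb potential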
+ +'''GPW method''' + +import numpy as np +import cupy as cp +from pyscf import gto +from pyscf import lib +from pyscf.pbc.df import fft as fft_cpu +from pyscf.pbc.df import aft as aft_cpu +from pyscf.pbc.df.aft import _check_kpts, ft_ao +from pyscf.pbc.gto import pseudo +from pyscf.pbc.lib.kpts_helper import is_zero +from gpu4pyscf.lib import logger, utils +from gpu4pyscf.pbc import tools +from gpu4pyscf.pbc.df import fft_jk + +__all__ = [ + 'get_nuc', 'get_pp', 'get_SI', 'FFTDF' +] + +def get_nuc(mydf, kpts=None): + from gpu4pyscf.pbc.dft import numint + kpts, is_single_kpt = _check_kpts(mydf, kpts) + cell = mydf.cell + assert cell.low_dim_ft_type != 'inf_vacuum' + assert cell.dimension > 1 + mesh = mydf.mesh + charge = cp.asarray(-cell.atom_charges()) + Gv = cell.get_Gv(mesh) + SI = get_SI(cell, mesh=mesh) + rhoG = charge.dot(SI) + + coulG = tools.get_coulG(cell, mesh=mesh, Gv=Gv) + vneG = rhoG * coulG + vneR = tools.ifft(vneG, mesh).real + + nkpts = len(kpts) + nao = cell.nao + if is_zero(kpts): + vne = cp.zeros((nkpts,nao,nao)) + else: + vne = cp.zeros((nkpts,nao,nao), dtype=np.complex128) + kpts = np.asarray(kpts) + ao_ks = numint.eval_ao_kpts(cell, mydf.grids.coords, kpts) + for k, ao in enumerate(ao_ks): + vne[k] += (ao.conj().T*vneR).dot(ao) + + if is_single_kpt: + vne = vne[0] + return vne + +def get_pp(mydf, kpts=None): + '''Get the periodic pseudopotential nuc-el AO matrix, with G=0 removed. + ''' + from gpu4pyscf.pbc.dft import numint + kpts, is_single_kpt = _check_kpts(mydf, kpts) + cell = mydf.cell + assert cell.low_dim_ft_type != 'inf_vacuum' + assert cell.dimension > 1 + mesh = mydf.mesh + Gv = cell.get_Gv(mesh) + SI = get_SI(cell, mesh=mesh) + vpplocG = pseudo.get_vlocG(cell, Gv) + vpplocG = -np.einsum('ij,ij->j', SI, vpplocG) + vpplocG = cp.asarray(vpplocG) + # vpploc evaluated in real-space + vpplocR = tools.ifft(vpplocG, mesh).real + + ngrids = len(vpplocG) + nkpts = len(kpts) + nao = cell.nao + if is_zero(kpts): + vpp = cp.zeros((nkpts,nao,nao)) + else: + vpp = cp.zeros((nkpts,nao,nao), dtype=np.complex128) + kpts = np.asarray(kpts) + ao_ks = numint.eval_ao_kpts(cell, mydf.grids.coords, kpts) + for k, ao in enumerate(ao_ks): + vpp[k] += (ao.conj().T*vpplocR).dot(ao) + + # vppnonloc evaluated in reciprocal space + fakemol = gto.Mole() + fakemol._atm = np.zeros((1,gto.ATM_SLOTS), dtype=np.int32) + fakemol._bas = np.zeros((1,gto.BAS_SLOTS), dtype=np.int32) + ptr = gto.PTR_ENV_START + fakemol._env = np.zeros(ptr+10) + fakemol._bas[0,gto.NPRIM_OF ] = 1 + fakemol._bas[0,gto.NCTR_OF ] = 1 + fakemol._bas[0,gto.PTR_EXP ] = ptr+3 + fakemol._bas[0,gto.PTR_COEFF] = ptr+4 + + # buf for SPG_lmi upto l=0..3 and nl=3 + buf = np.empty((48,ngrids), dtype=np.complex128) + def vppnl_by_k(kpt): + Gk = Gv + kpt + G_rad = lib.norm(Gk, axis=1) + aokG = ft_ao.ft_ao(cell, Gv, kpt=kpt) * (1/cell.vol)**.5 + vppnl = 0 + for ia in range(cell.natm): + symb = cell.atom_symbol(ia) + if symb not in cell._pseudo: + continue + pp = cell._pseudo[symb] + p1 = 0 + for l, proj in enumerate(pp[5:]): + rl, nl, hl = proj + if nl > 0: + fakemol._bas[0,gto.ANG_OF] = l + fakemol._env[ptr+3] = .5*rl**2 + fakemol._env[ptr+4] = rl**(l+1.5)*np.pi**1.25 + pYlm_part = fakemol.eval_gto('GTOval', Gk) + + p0, p1 = p1, p1+nl*(l*2+1) + # pYlm is real, SI[ia] is complex + pYlm = np.ndarray((nl,l*2+1,ngrids), dtype=np.complex128, buffer=buf[p0:p1]) + for k in range(nl): + qkl = pseudo.pp._qli(G_rad*rl, l, k) + pYlm[k] = pYlm_part.T * qkl + #:SPG_lmi = np.einsum('g,nmg->nmg', SI[ia].conj(), pYlm) + #:SPG_lm_aoG = 
np.einsum('nmg,gp->nmp', SPG_lmi, aokG) + #:tmp = np.einsum('ij,jmp->imp', hl, SPG_lm_aoG) + #:vppnl += np.einsum('imp,imq->pq', SPG_lm_aoG.conj(), tmp) + if p1 > 0: + SPG_lmi = buf[:p1] + SPG_lmi *= SI[ia].conj() + SPG_lm_aoGs = lib.zdot(SPG_lmi, aokG) + p1 = 0 + for l, proj in enumerate(pp[5:]): + rl, nl, hl = proj + if nl > 0: + p0, p1 = p1, p1+nl*(l*2+1) + hl = np.asarray(hl) + SPG_lm_aoG = SPG_lm_aoGs[p0:p1].reshape(nl,l*2+1,-1) + tmp = np.einsum('ij,jmp->imp', hl, SPG_lm_aoG) + vppnl += np.einsum('imp,imq->pq', SPG_lm_aoG.conj(), tmp) + return vppnl * (1./cell.vol) + + for k, kpt in enumerate(kpts): + vppnl = vppnl_by_k(kpt) + if is_zero(kpt): + vpp[k] += cp.asarray(vppnl.real) + else: + vpp[k] += cp.asarray(vppnl) + + if is_single_kpt: + vpp = vpp[0] + return vpp + +def get_SI(cell, Gv=None, mesh=None, atmlst=None): + '''Calculate the structure factor (0D, 1D, 2D, 3D) for all atoms; see MH (3.34). + + Args: + cell : instance of :class:`Cell` + + Gv : (N,3) array + G vectors + + atmlst : list of ints, optional + Indices of atoms for which the structure factors are computed. + + Returns: + SI : (natm, ngrids) ndarray, dtype=np.complex128 + The structure factor for each atom at each G-vector. + ''' + coords = cp.asarray(cell.atom_coords()) + if atmlst is not None: + coords = coords[np.asarray(atmlst)] + if Gv is None: + if mesh is None: + mesh = cell.mesh + basex, basey, basez = cell.get_Gv_weights(mesh)[1] + basex = cp.asarray(basex) + basey = cp.asarray(basey) + basez = cp.asarray(basez) + b = cp.asarray(cell.reciprocal_vectors()) + rb = coords.dot(b.T) + SIx = cp.exp(-1j*rb[:,0,None] * basex) + SIy = cp.exp(-1j*rb[:,1,None] * basey) + SIz = cp.exp(-1j*rb[:,2,None] * basez) + SI = SIx[:,:,None,None] * SIy[:,None,:,None] * SIz[:,None,None,:] + natm = coords.shape[0] + SI = SI.reshape(natm, -1) + else: + SI = cp.exp(-1j*coords.dot(cp.asarray(Gv).T)) + return SI + + +class FFTDF(lib.StreamObject): + '''Density expansion on plane waves (GPW method) + ''' + + blockdim = 240 + + _keys = fft_cpu.FFTDF._keys + + def __init__(self, cell, kpts=np.zeros((1,3))): + from gpu4pyscf.pbc.dft import gen_grid + from gpu4pyscf.pbc.dft import numint + self.cell = cell + self.stdout = cell.stdout + self.verbose = cell.verbose + self.max_memory = cell.max_memory + self.kpts = kpts + self.grids = gen_grid.UniformGrids(cell) + + # The following attributes are not input options. + # self.exxdiv has no effects. It was set in the get_k_kpts function to + # mimic the KRHF/KUHF object in the call to tools.get_coulG. 
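+ # (The exchange G=0 divergence itself is handled in fft_jk.get_k_kpts + # through its exxdiv argument, e.g. the 'ewald' probe-charge correction.)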
+ self.exxdiv = None + self._numint = numint.KNumInt() + self._rsh_df = {} # Range separated Coulomb DF objects + + mesh = fft_cpu.FFTDF.mesh + dump_flags = fft_cpu.FFTDF.dump_flags + check_sanity = fft_cpu.FFTDF.check_sanity + build = fft_cpu.FFTDF.build + reset = fft_cpu.FFTDF.reset + + aoR_loop = NotImplemented + + get_pp = get_pp + get_nuc = get_nuc + + def get_jk(self, dm, hermi=1, kpts=None, kpts_band=None, + with_j=True, with_k=True, omega=None, exxdiv=None): + if omega is not None: # J/K for RSH functionals + with self.range_coulomb(omega) as rsh_df: + return rsh_df.get_jk(dm, hermi, kpts, kpts_band, with_j, with_k, + omega=None, exxdiv=exxdiv) + + kpts, is_single_kpt = _check_kpts(self, kpts) + if is_single_kpt: + vj, vk = fft_jk.get_jk(self, dm, hermi, kpts[0], kpts_band, + with_j, with_k, exxdiv) + else: + vj = vk = None + if with_k: + vk = fft_jk.get_k_kpts(self, dm, hermi, kpts, kpts_band, exxdiv) + if with_j: + vj = fft_jk.get_j_kpts(self, dm, hermi, kpts, kpts_band) + return vj, vk + + get_eri = get_ao_eri = NotImplemented + ao2mo = get_mo_eri = NotImplemented + ao2mo_7d = NotImplemented + get_ao_pairs_G = get_ao_pairs = NotImplemented + get_mo_pairs_G = get_mo_pairs = NotImplemented + + range_coulomb = aft_cpu.AFTDF.range_coulomb + + to_gpu = utils.to_gpu + device = utils.device + + def to_cpu(self): + obj = utils.to_cpu(self) + return obj.reset() diff --git a/gpu4pyscf/pbc/df/fft_jk.py b/gpu4pyscf/pbc/df/fft_jk.py new file mode 100644 index 00000000..31e9a5d7 --- /dev/null +++ b/gpu4pyscf/pbc/df/fft_jk.py @@ -0,0 +1,346 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +''' +JK with GPW +''' + +import numpy as np +import cupy as cp +from pyscf import lib +from pyscf.pbc.lib.kpts_helper import is_zero, member +from pyscf.pbc.df.df_jk import _format_dms, _format_kpts_band, _format_jks +from gpu4pyscf.lib import logger +from gpu4pyscf.lib.cupy_helper import contract +from gpu4pyscf.pbc import tools + +__all__ = [ + 'get_j_kpts', 'get_k_kpts', 'get_jk', 'get_j', 'get_k', + 'get_j_e1_kpts', 'get_k_e1_kpts' +] + +def get_j_kpts(mydf, dm_kpts, hermi=1, kpts=np.zeros((1,3)), kpts_band=None): + '''Get the Coulomb (J) AO matrix at sampled k-points. + + Args: + dm_kpts : (nkpts, nao, nao) ndarray or a list of (nkpts,nao,nao) ndarray + Density matrix at each k-point. If a list of k-point DMs, e.g., + UHF alpha and beta DM, the alpha and beta DMs are contracted + separately. + kpts : (nkpts, 3) ndarray + + Kwargs: + kpts_band : (3,) ndarray or (*,3) ndarray + A list of arbitrary "band" k-points at which to evaluate the matrix.
+ + Returns: + vj : (nkpts, nao, nao) ndarray + or a list of vj if the input dm_kpts is a list of DMs + ''' + cell = mydf.cell + mesh = mydf.mesh + assert cell.low_dim_ft_type != 'inf_vacuum' + assert cell.dimension > 1 + + ni = mydf._numint + dm_kpts = cp.asarray(dm_kpts, order='C') + dms = _format_dms(dm_kpts, kpts) + nset, nkpts, nao = dms.shape[:3] + + coulG = tools.get_coulG(cell, mesh=mesh) + ngrids = len(coulG) + + if hermi == 1 or is_zero(kpts): + vR = cp.zeros((nset,ngrids)) + ao_ks = ni.eval_ao(cell, mydf.grids.coords, kpts) + for i in range(nset): + rhoR = ni.eval_rho(cell, ao_ks, dms[i], hermi=hermi).real + rhoG = tools.fft(rhoR, mesh) + vG = coulG * rhoG + vR[i] = tools.ifft(vG, mesh).real + else: + vR = cp.zeros((nset,ngrids), dtype=np.complex128) + ao_ks = ni.eval_ao(cell, mydf.grids.coords, kpts) + for i in range(nset): + rhoR = ni.eval_rho(cell, ao_ks, dms[i], hermi=hermi) + rhoG = tools.fft(rhoR, mesh) + vG = coulG * rhoG + vR[i] = tools.ifft(vG, mesh) + + vR *= cell.vol / ngrids + kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band + nband = len(kpts_band) + if is_zero(kpts_band): + vj_kpts = cp.zeros((nset,nband,nao,nao)) + else: + vj_kpts = cp.zeros((nset,nband,nao,nao), dtype=np.complex128) + + if input_band is not None: + ao_ks = ni.eval_ao(cell, mydf.grids.coords, kpts_band) + for k, ao in enumerate(ao_ks): + for i in range(nset): + aow = ao * vR[i,:,None] + vj_kpts[i,k] += ao.conj().T.dot(aow) + + return _format_jks(vj_kpts, dm_kpts, input_band, kpts) + +def get_k_kpts(mydf, dm_kpts, hermi=1, kpts=np.zeros((1,3)), kpts_band=None, + exxdiv=None): + '''Get the exchange (K) AO matrix at sampled k-points. + + Args: + dm_kpts : (nkpts, nao, nao) ndarray + Density matrix at each k-point + kpts : (nkpts, 3) ndarray + + Kwargs: + hermi : int + Whether K matrix is hermitian + + | 0 : not hermitian and not symmetric + | 1 : hermitian + + kpts_band : (3,) ndarray or (*,3) ndarray + A list of arbitrary "band" k-points at which to evaluate the matrix.
+ + Returns: + vk : (nkpts, nao, nao) ndarray + or a list of vk if the input dm_kpts is a list of DMs + ''' + cell = mydf.cell + mesh = mydf.mesh + assert cell.low_dim_ft_type != 'inf_vacuum' + assert cell.dimension > 1 + coords = mydf.grids.coords + ngrids = coords.shape[0] + + if getattr(dm_kpts, 'mo_coeff', None) is not None: + mo_coeff = dm_kpts.mo_coeff + mo_occ = dm_kpts.mo_occ + else: + mo_coeff = None + + ni = mydf._numint + kpts = np.asarray(kpts) + dm_kpts = cp.asarray(dm_kpts, order='C') + dms = _format_dms(dm_kpts, kpts) + nset, nkpts, nao = dms.shape[:3] + + weight = 1./nkpts * (cell.vol/ngrids) + + kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band + nband = len(kpts_band) + + if is_zero(kpts_band) and is_zero(kpts): + vk_kpts = cp.zeros((nset,nband,nao,nao), dtype=dms.dtype) + else: + vk_kpts = cp.zeros((nset,nband,nao,nao), dtype=np.complex128) + + ao2_kpts = ni.eval_ao(cell, coords, kpts=kpts) + if input_band is None: + ao1_kpts = ao2_kpts + else: + ao1_kpts = ni.eval_ao(cell, coords, kpts=kpts_band) + + if mo_coeff is not None and nset == 1: + mo2_kpts = [ + ao.dot(mo[:,occ>0] * occ[occ>0]**.5) + for occ, mo, ao in zip(mo_occ, mo_coeff, ao2_kpts)] + ao2_kpts = mo2_kpts + else: + mo2_kpts = None + + vR_dm = cp.empty((nset,nao,ngrids), dtype=vk_kpts.dtype) + blksize = 32 + + for k2, ao2 in enumerate(ao2_kpts): + ao2T = ao2.T + kpt2 = kpts[k2] + naoj = ao2.shape[1] + if mo2_kpts is None: + ao_dms = [dms[i,k2].dot(ao2T.conj()) for i in range(nset)] + else: + ao_dms = [ao2T.conj()] + + for k1, ao1 in enumerate(ao1_kpts): + ao1T = ao1.T + kpt1 = kpts_band[k1] + + # If we have an ewald exxdiv, we add the G=0 correction near the + # end of the function to bypass any discretization errors + # that arise from the FFT. + if exxdiv == 'ewald': + coulG = tools.get_coulG(cell, kpt2-kpt1, False, mydf, mesh) + else: + coulG = tools.get_coulG(cell, kpt2-kpt1, exxdiv, mydf, mesh) + if is_zero(kpt1-kpt2): + expmikr = cp.array(1.) + else: + expmikr = cp.exp(-1j * coords.dot(cp.asarray(kpt2-kpt1))) + + for p0, p1 in lib.prange(0, nao, blksize): + rho1 = contract('ig,jg->ijg', ao1T[p0:p1].conj()*expmikr, ao2T) + vG = tools.fft(rho1.reshape(-1,ngrids), mesh) + rho1 = None + vG *= coulG + vR = tools.ifft(vG, mesh).reshape(p1-p0,naoj,ngrids) + vG = None + if vk_kpts.dtype == np.double: + vR = vR.real + for i in range(nset): + vR_dm[i,p0:p1] = contract('ijg,jg->ig', vR, ao_dms[i]) + vR = None + vR_dm *= expmikr.conj() + + for i in range(nset): + vk_kpts[i,k1] += weight * vR_dm[i].dot(ao1) + + # Call _ewald_exxdiv_for_G0 to add the G=0 component back into vk_kpts. + # Note in the _ewald_exxdiv_for_G0 implementation, the G=0 treatments are + # different for 1D/2D and 3D systems. The special treatments for 1D and 2D + # can only be used with the AFTDF/GDF/MDF methods. In the FFTDF method, 1D, + # 2D and 3D systems should all use the ewald probe-charge correction. + if exxdiv == 'ewald': + vk_kpts = _ewald_exxdiv_for_G0(cell, kpts, dms, vk_kpts, kpts_band=kpts_band) + + return _format_jks(vk_kpts, dm_kpts, input_band, kpts) + +def get_jk(mydf, dm, hermi=1, kpt=np.zeros(3), kpts_band=None, + with_j=True, with_k=True, exxdiv=None): + '''Get the Coulomb (J) and exchange (K) AO matrices for the given density matrix.
+ + Args: + dm : ndarray or list of ndarrays + A density matrix or a list of density matrices + + Kwargs: + hermi : int + Whether the J, K matrices are hermitian + | 0 : not hermitian and not symmetric + | 1 : hermitian + | 2 : anti-hermitian + kpt : (3,) ndarray + The "inner" dummy k-point at which the DM was evaluated (or + sampled). + kpts_band : (3,) ndarray or (*,3) ndarray + The "outer" primary k-point at which J and K are evaluated. + + Returns: + The function returns one J and one K matrix, corresponding to the input + density matrix (both order and shape). + ''' + dm = cp.asarray(dm, order='C') + vj = vk = None + if with_j: + vj = get_j(mydf, dm, hermi, kpt, kpts_band) + if with_k: + vk = get_k(mydf, dm, hermi, kpt, kpts_band, exxdiv) + return vj, vk + +def get_j(mydf, dm, hermi=1, kpt=np.zeros(3), kpts_band=None): + '''Get the Coulomb (J) AO matrix for the given density matrix. + + Args: + dm : ndarray or list of ndarrays + A density matrix or a list of density matrices + + Kwargs: + hermi : int + Whether the J matrix is hermitian + | 0 : not hermitian and not symmetric + | 1 : hermitian + | 2 : anti-hermitian + kpt : (3,) ndarray + The "inner" dummy k-point at which the DM was evaluated (or + sampled). + kpts_band : (3,) ndarray or (*,3) ndarray + The "outer" primary k-point at which J is evaluated. + + Returns: + The function returns one J matrix, corresponding to the input + density matrix (both order and shape). + ''' + dm = cp.asarray(dm, order='C') + nao = dm.shape[-1] + dm_kpts = dm.reshape(-1,1,nao,nao) + vj = get_j_kpts(mydf, dm_kpts, hermi, kpt.reshape(1,3), kpts_band) + if kpts_band is None: + vj = vj[:,0,:,:] + if dm.ndim == 2: + vj = vj[0] + return vj + + +def get_k(mydf, dm, hermi=1, kpt=np.zeros(3), kpts_band=None, exxdiv=None): + '''Get the exchange (K) AO matrix for the given density matrix. + + Args: + dm : ndarray or list of ndarrays + A density matrix or a list of density matrices + + Kwargs: + hermi : int + Whether the K matrix is hermitian + | 0 : not hermitian and not symmetric + | 1 : hermitian + | 2 : anti-hermitian + kpt : (3,) ndarray + The "inner" dummy k-point at which the DM was evaluated (or + sampled). + kpts_band : (3,) ndarray or (*,3) ndarray + The "outer" primary k-point at which K is evaluated. + + Returns: + The function returns one K matrix, corresponding to the input + density matrix (both order and shape).
+ ''' + dm = cp.asarray(dm, order='C') + nao = dm.shape[-1] + dm_kpts = dm.reshape(-1,1,nao,nao) + vk = get_k_kpts(mydf, dm_kpts, hermi, kpt.reshape(1,3), kpts_band, exxdiv) + if kpts_band is None: + vk = vk[:,0,:,:] + if dm.ndim == 2: + vk = vk[0] + return vk + +get_j_e1_kpts = NotImplemented +get_k_e1_kpts = NotImplemented + +def _ewald_exxdiv_for_G0(cell, kpts, dms, vk, kpts_band=None): + from pyscf.pbc.tools.pbc import madelung + s = cp.asarray(cell.pbc_intor('int1e_ovlp', hermi=1, kpts=kpts)) + m = madelung(cell, kpts) + if kpts is None: + for i,dm in enumerate(dms): + vk[i] += m * s.dot(dm).dot(s) + elif np.shape(kpts) == (3,): + if kpts_band is None or is_zero(kpts_band-kpts): + for i,dm in enumerate(dms): + vk[i] += m * s.dot(dm).dot(s) + + elif kpts_band is None or np.array_equal(kpts, kpts_band): + for k in range(len(kpts)): + for i,dm in enumerate(dms): + vk[i,k] += m * s[k].dot(dm[k]).dot(s[k]) + else: + for k, kpt in enumerate(kpts): + for kp in member(kpt, kpts_band.reshape(-1,3)): + for i,dm in enumerate(dms): + vk[i,kp] += m * s[k].dot(dm[k]).dot(s[k]) + return vk diff --git a/gpu4pyscf/pbc/dft/__init__.py b/gpu4pyscf/pbc/dft/__init__.py new file mode 100644 index 00000000..1de0a907 --- /dev/null +++ b/gpu4pyscf/pbc/dft/__init__.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +'''Kohn-Sham DFT for periodic systems +''' + +from .gen_grid import UniformGrids, BeckeGrids +from . import rks +#from . import uks +#from . import krks +#from . import kuks +from .rks import KohnShamDFT + +RKS = rks.RKS +#UKS = uks.UKS +#KRKS = krks.KRKS +#KUKS = kuks.KUKS diff --git a/gpu4pyscf/pbc/dft/gen_grid.py b/gpu4pyscf/pbc/dft/gen_grid.py new file mode 100644 index 00000000..af1c40b4 --- /dev/null +++ b/gpu4pyscf/pbc/dft/gen_grid.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
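For reference, the single-k-point branch of `_ewald_exxdiv_for_G0` reduces to one rank-preserving update per density matrix; a sketch under the same assumptions, where `s` is the AO overlap and `m` the Madelung constant from `pyscf.pbc.tools.pbc.madelung`:

```python
import numpy as np

def add_ewald_g0(vk, dm, s, m):
    # Probe-charge correction: each K matrix gains m * S D S, restoring the
    # G=0 term that the exxdiv='ewald' branch deferred inside get_k_kpts.
    return vk + m * s.dot(dm).dot(s)
```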
+ +import ctypes +import numpy as np +import cupy as cp +from pyscf import lib +from pyscf.lib import logger +from pyscf.pbc.dft import gen_grid as gen_grid_cpu +from pyscf.pbc.gto.cell import get_uniform_grids +from gpu4pyscf.lib import utils + +class UniformGrids(lib.StreamObject): + '''Uniform Grid class.''' + + def __init__(self, cell): + self.cell = cell + self.stdout = cell.stdout + self.verbose = cell.verbose + self.mesh = cell.mesh + self.non0tab = None + self._coords = None + self._weights = None + + @property + def coords(self): + if self._coords is not None: + return self._coords + else: + return cp.asarray(get_uniform_grids(self.cell, self.mesh)) + @coords.setter + def coords(self, x): + self._coords = x + + @property + def weights(self): + if self._weights is not None: + return self._weights + else: + ngrids = np.prod(self.mesh) + weights = cp.empty(ngrids) + weights[:] = self.cell.vol / ngrids + return weights + @weights.setter + def weights(self, x): + self._weights = x + + @property + def size(self): + return np.prod(self.mesh) + + reset = gen_grid_cpu.UniformGrids.reset + build = gen_grid_cpu.UniformGrids.build + dump_flags = gen_grid_cpu.UniformGrids.dump_flags + kernel = gen_grid_cpu.UniformGrids.kernel + + to_gpu = utils.to_gpu + device = utils.device + + def to_cpu(self): + obj = utils.to_cpu(self) + return obj.reset() + +class BeckeGrids: + pass diff --git a/gpu4pyscf/pbc/dft/numint.py b/gpu4pyscf/pbc/dft/numint.py new file mode 100644 index 00000000..7ecf6202 --- /dev/null +++ b/gpu4pyscf/pbc/dft/numint.py @@ -0,0 +1,433 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import numpy as np +import cupy as cp +from pyscf import lib +from pyscf.pbc.lib.kpts_helper import is_zero +from pyscf.pbc.df.df_jk import _format_dms, _format_kpts_band, _format_jks +from pyscf.pbc.dft import numint as numint_cpu +from pyscf.dft.gen_grid import CUTOFF +from pyscf.pbc.lib.kpts import KPoints +from gpu4pyscf.dft import numint +from gpu4pyscf.lib.cupy_helper import transpose_sum, contract, get_avail_mem +from gpu4pyscf.lib import utils + +MIN_BLK_SIZE = numint.MIN_BLK_SIZE +ALIGNED = numint.ALIGNED + +def eval_ao(cell, coords, kpt=np.zeros(3), deriv=0, relativity=0, shls_slice=None, + non0tab=None, cutoff=None, out=None, verbose=None): + '''Collocate AO crystal orbitals (opt. gradients) on the real-space grid. + + Args: + cell : instance of :class:`Cell` + + coords : (nx*ny*nz, 3) ndarray + The real-space grid point coordinates. + + Kwargs: + kpt : (3,) ndarray + The k-point corresponding to the crystal AO. + deriv : int + AO derivative order. It affects the shape of the return array. + If deriv=0, the returned AO values are stored in a (N,nao) array. + Otherwise the AO values are stored in an array of shape (M,N,nao). 
+ Here N is the number of grids, nao is the number of AO functions, + M is the size associated to the derivative deriv. + + Returns: + aoR : ([4,] nx*ny*nz, nao=cell.nao_nr()) ndarray + The value of the AO crystal orbitals on the real-space grid by default. + If deriv=1, also contains the value of the orbitals gradient in the + x, y, and z directions. It can be either complex or float array, + depending on the kpt argument. If kpt is not given (gamma point), + aoR is a float array. + ''' + ao_kpts = eval_ao_kpts(cell, coords, np.reshape(kpt, (-1,3)), deriv) + return ao_kpts[0] + +def eval_ao_kpts(cell, coords, kpts=None, deriv=0, relativity=0, + shls_slice=None, non0tab=None, cutoff=None, out=None, verbose=None): + ''' + Returns: + ao_kpts: (nkpts, [comp], ngrids, nao) ndarray + AO values at each k-point + ''' + return [cp.asarray(ao) for ao in numint_cpu.eval_ao_kpts(cell, coords.get(), kpts, deriv)] + + +def eval_rho(cell, ao, dm, non0tab=None, xctype='LDA', hermi=0, with_lapl=False, + verbose=None): + '''Collocate the density (opt. gradients) on the real-space grid. + + Args: + cell : instance of :class:`Mole` or :class:`Cell` + + ao : ([4,] nx*ny*nz, nao=cell.nao_nr()) ndarray + The value of the AO crystal orbitals on the real-space grid by default. + If xctype='GGA', also contains the value of the gradient in the x, y, + and z directions. + + Returns: + rho : ([4,] nx*ny*nz) ndarray + The value of the density on the real-space grid. If xctype='GGA', + also contains the value of the gradient in the x, y, and z + directions. + + See Also: + pyscf.dft.numint.eval_rho + + ''' + if np.iscomplexobj(ao) or np.iscomplexobj(dm): + ngrids, nao = ao.shape[-2:] + ao_loc = cell.ao_loc_nr() + assert nao == ao_loc[-1] + dm = cp.asarray(dm, dtype=np.complex128) + + if hermi == 1: + def dot_bra(bra, aodm): + rho = contract('pi,pi->p', bra.real, aodm.real) + rho += contract('pi,pi->p', bra.imag, aodm.imag) + return rho + dtype = np.float64 + else: + def dot_bra(bra, aodm): + return contract('pi,pi->p', bra.conj(), aodm) + dtype = np.complex128 + + if xctype == 'LDA' or xctype == 'HF': + c0 = ao.dot(dm) + rho = dot_bra(ao, c0) + + elif xctype == 'GGA': + rho = cp.empty((4,ngrids), dtype=dtype) + c0 = ao[0].dot(dm) + rho[0] = dot_bra(ao[0], c0) + for i in range(1, 4): + rho[i] = dot_bra(ao[i], c0) + if hermi == 1: + rho[1:4] *= 2 + else: + c1 = ao[0].dot(dm.conj().T) + for i in range(1, 4): + rho[i] += dot_bra(c1, ao[i]) + + else: # MGGA + assert not with_lapl + rho = cp.empty((5,ngrids), dtype=dtype) + tau_idx = 4 + c0 = ao[0].dot(dm) + rho[0] = dot_bra(ao[0], c0) + rho[tau_idx] = 0 + for i in range(1, 4): + c1 = ao[i].dot(dm) + rho[tau_idx] += dot_bra(ao[i], c1) + rho[i] = dot_bra(ao[i], c0) + if hermi == 1: + rho[i] *= 2 + else: + rho[i] += dot_bra(ao[0], c1) + rho[tau_idx] *= .5 + else: + # real orbitals and real DM + # TODO: call numint.eval_rho. 
However, the structure of ao is not compatible + # rho = numint.eval_rho(cell, ao, dm, non0tab, xctype, hermi, with_lapl, verbose) + ngrids, nao = ao.shape[-2:] + ao_loc = cell.ao_loc_nr() + assert nao == ao_loc[-1] + + def dot_bra(bra, aodm): + return contract('pi,pi->p', bra, aodm) + + if xctype == 'LDA' or xctype == 'HF': + c0 = ao.dot(dm) + rho = dot_bra(ao, c0) + + elif xctype == 'GGA': + rho = cp.empty((4,ngrids)) + c0 = ao[0].dot(dm) + rho[0] = dot_bra(ao[0], c0) + for i in range(1, 4): + rho[i] = dot_bra(ao[i], c0) + if hermi == 1: + rho[1:4] *= 2 + else: + c1 = ao[0].dot(dm.T) + for i in range(1, 4): + rho[i] += dot_bra(c1, ao[i]) + + else: # MGGA + assert not with_lapl + rho = cp.empty((5,ngrids)) + tau_idx = 4 + c0 = ao[0].dot(dm) + rho[0] = dot_bra(ao[0], c0) + rho[tau_idx] = 0 + for i in range(1, 4): + c1 = ao[i].dot(dm) + rho[tau_idx] += dot_bra(ao[i], c1) + rho[i] = dot_bra(ao[i], c0) + if hermi == 1: + rho[i] *= 2 + else: + rho[i] += dot_bra(ao[0], c1) + rho[tau_idx] *= .5 + return rho + +nr_uks_vxc = nr_uks = NotImplemented +nr_nlc_vxc = NotImplemented +nr_rks_fxc = NotImplemented +nr_rks_fxc_st = NotImplemented +nr_uks_fxc = NotImplemented +cache_xc_kernel = NotImplemented +cache_xc_kernel1 = NotImplemented + + +def get_rho(ni, cell, dm, grids, kpts=np.zeros((1,3)), max_memory=2000): + '''Density in real space + ''' + assert dm.ndim == 2 or dm.shape[0] == 1 + rho = cp.empty(grids.size) + nao = cell.nao + p1 = 0 + for ao_k1, ao_k2, mask, weight, coords \ + in ni.block_loop(cell, grids, nao, 0, kpts, None, max_memory): + p0, p1 = p1, p1 + weight.size + rho[p0:p1] = ni.eval_rho(cell, ao_k1, dm, xctype='LDA', hermi=1) + return rho + +def _scale_ao(ao, wv, out=None): + # TODO: reuse gpu4pyscf.dft.numint._scale_ao + if wv.ndim == 1: + return ao * wv[:,None] + else: + return contract('ngi,ng->gi', ao, wv) + +def _tau_dot(bra, ket, wv): + '''1/2 ''' + # TODO: reuse gpu4pyscf.dft.numint._tau_dot + wv = .5 * wv + mat = bra[1].conj().T.dot(_scale_ao(ket[1], wv)) + mat += bra[2].conj().T.dot(_scale_ao(ket[2], wv)) + mat += bra[3].conj().T.dot(_scale_ao(ket[3], wv)) + return mat + +class NumInt(lib.StreamObject, numint.LibXCMixin): + '''Generalization of pyscf's NumInt class for a single k-point shift and + periodic images. 
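The hermi=1 shortcut used in both `eval_rho` branches is worth spelling out: for a symmetric density matrix, each gradient component needs only one contraction, doubled. A small NumPy sketch of the real GGA branch (hypothetical function name):

```python
import numpy as np

def eval_rho_gga(ao, dm):
    # ao: (4, ngrids, nao) AO values plus x,y,z gradients; dm: real symmetric
    ngrids = ao.shape[1]
    c0 = ao[0].dot(dm)                            # (ngrids, nao)
    rho = np.empty((4, ngrids))
    rho[0] = np.einsum('pi,pi->p', ao[0], c0)     # density
    for i in range(1, 4):
        # hermi=1: <d_mu|D|nu> + <mu|D|d_nu> collapses to 2*<d_mu|D|nu>
        rho[i] = 2 * np.einsum('pi,pi->p', ao[i], c0)
    return rho
```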
+ ''' + + get_vxc = nr_vxc = numint_cpu.NumInt.nr_vxc + + def nr_rks(self, cell, grids, xc_code, dms, relativity=0, hermi=1, + kpt=None, kpts_band=None, max_memory=2000, verbose=None): + if kpt is None: + kpt = np.zeros(3) + xctype = self._xc_type(xc_code) + if xctype == 'LDA': + ao_deriv = 0 + nvar = 1 + elif xctype == 'GGA': + ao_deriv = 1 + nvar = 4 + elif xctype == 'MGGA': + ao_deriv = 1 + nvar = 5 + elif xctype == 'HF': + return 0, 0, cp.zeros_like(dms) + else: + raise NotImplementedError(f'nr_rks for functional {xc_code}') + + dms = cp.asarray(dms) + dm_shape = dms.shape + nao = dm_shape[-1] + dms = dms.reshape(nao,nao) + ngrids = grids.size + + rho = cp.empty([nvar,ngrids]) + p0 = p1 = 0 + for ao_ks, weight, coords \ + in self.block_loop(cell, grids, ao_deriv, kpt=kpt): + p0, p1 = p1, p1 + weight.size + rho[:,p0:p1] = eval_rho(cell, ao_ks[0], dms, xctype=xctype, hermi=hermi) + + if xctype == 'LDA': + exc, vxc = self.eval_xc_eff(xc_code, rho[0], deriv=1, xctype=xctype)[:2] + else: + exc, vxc = self.eval_xc_eff(xc_code, rho, deriv=1, xctype=xctype)[:2] + den = rho[0] * grids.weights + nelec = den.sum() + excsum = cp.sum(den * exc[:,0]) + + wv = vxc * grids.weights + # *.5 for v+v.conj().T at the end + if xctype == 'GGA': + wv[0] *= .5 + elif xctype == 'MGGA': + wv[[0,4]] *= .5 + + kpts_band, input_band = _format_kpts_band(kpts_band, kpt), kpts_band + nband = len(kpts_band) + if is_zero(kpts_band): + vmat = cp.zeros((nband, nao, nao)) + else: + vmat = cp.zeros((nband, nao, nao), dtype=np.complex128) + v_hermi = 1 # the output matrix must be hermitian + p0 = p1 = 0 + for ao_ks, weight, coords \ + in self.block_loop(cell, grids, ao_deriv, kpts_band=kpts_band): + p0, p1 = p1, p1 + weight.size + for k, ao in enumerate(ao_ks): + if xctype == 'LDA': + aow = _scale_ao(ao, wv[0,p0:p1]) + vmat[k] += ao.conj().T.dot(aow) + elif xctype == 'GGA': + aow = _scale_ao(ao[:4], wv[:4,p0:p1]) + vmat[k] += ao[0].conj().T.dot(aow) + elif xctype == 'MGGA': + aow = _scale_ao(ao[:4], wv[:4,p0:p1]) + vmat[k] += ao[0].conj().T.dot(aow) + vmat[k] += _tau_dot(ao, ao, wv[4,p0:p1]) + + if v_hermi and xctype != 'LDA': + vmat = vmat + vmat.transpose(0, 2, 1).conj() + if input_band is None: + vmat = vmat[0] + return nelec, excsum, vmat + + def nr_uks(self, cell, grids, xc_code, dms, relativity=0, hermi=1, + kpt=None, kpts_band=None, max_memory=2000, verbose=None): + raise NotImplementedError + + def block_loop(self, cell, grids, deriv=0, kpt=None, kpts_band=None): + '''Define this macro to loop over grids by blocks. 
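The `wv[0] *= .5` scaling in `nr_rks` above pairs with the final `vmat + vmat†` symmetrization; a compact sketch of that convention for the GGA case (hypothetical helper, real orbitals assumed):

```python
import numpy as np

def gga_vxc_matrix(ao, wv):
    # ao: (4, ngrids, nao); wv: (4, ngrids) weighted XC potential, with the
    # rho-term already halved exactly as prepared in nr_rks above.
    aow = np.einsum('ngi,ng->gi', ao, wv)   # the _scale_ao pattern
    v = ao[0].T.dot(aow)
    return v + v.T                          # v + v^H restores the halved term
```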
+ ''' + nao = cell.nao + grids_coords = grids.coords + grids_weights = grids.weights + ngrids = grids_coords.shape[0] + comp = (deriv+1)*(deriv+2)*(deriv+3)//6 + + #cupy.get_default_memory_pool().free_all_blocks() + mem_avail = get_avail_mem() + blksize = int((mem_avail*.2/8/((comp+1)*nao))/ ALIGNED) * ALIGNED + blksize = min(blksize, MIN_BLK_SIZE) + if blksize < ALIGNED: + raise RuntimeError('Not enough GPU memory') + + if kpts_band is None: + if kpt is None: + kpts = np.zeros((1, 3)) + else: + kpts = np.reshape(kpt, (1, 3)) + elif kpt is None: + kpts = np.reshape(kpts_band, (-1, 3)) + else: + raise RuntimeError('Cannot produce AOs for kpt and kpts_band in the same run') + + for ip0, ip1 in lib.prange(0, ngrids, blksize): + coords = grids_coords[ip0:ip1] + weight = grids_weights[ip0:ip1] + ao_ks = eval_ao_kpts(cell, coords, kpts, deriv=deriv) + yield ao_ks, weight, coords + ao_ks = None + + eval_xc_eff = numint.eval_xc_eff + _init_xcfuns = numint.NumInt._init_xcfuns + + get_fxc = nr_fxc = numint_cpu.NumInt.nr_fxc + nr_rks_fxc = nr_rks_fxc + nr_uks_fxc = nr_uks_fxc + nr_rks_fxc_st = nr_rks_fxc_st + nr_nlc_vxc = nr_nlc_vxc + cache_xc_kernel = cache_xc_kernel + cache_xc_kernel1 = cache_xc_kernel1 + get_rho = get_rho + + eval_ao = staticmethod(eval_ao) + eval_rho = staticmethod(eval_rho) + eval_rho2 = NotImplemented + eval_rho1 = NotImplemented + + to_gpu = utils.to_gpu + device = utils.device + + def to_cpu(self): + return numint_cpu.NumInt() + +_NumInt = NumInt + + +class KNumInt(lib.StreamObject, numint.LibXCMixin): + '''Generalization of pyscf's NumInt class for k-point sampling and + periodic images. + ''' + def __init__(self, kpts=np.zeros((1,3))): + self.kpts = np.reshape(kpts, (-1,3)) + + eval_ao = staticmethod(eval_ao_kpts) + + make_mask = NotImplemented + + def eval_rho(self, cell, ao_kpts, dm_kpts, non0tab=None, xctype='LDA', + hermi=0, with_lapl=True, verbose=None): + '''Collocate the density (opt. gradients) on the real-space grid. + + Args: + cell : Mole or Cell object + ao_kpts : (nkpts, ngrids, nao) ndarray + AO values at each k-point + dm_kpts: (nkpts, nao, nao) ndarray + Density matrix at each k-point + + Returns: + rhoR : (ngrids,) ndarray + ''' + nkpts = len(ao_kpts) + rho_ks = [eval_rho(cell, ao_kpts[k], dm_kpts[k], non0tab, xctype, + hermi, with_lapl, verbose) + for k in range(nkpts)] + dtype = np.result_type(*rho_ks) + rho = cp.zeros(rho_ks[0].shape, dtype=dtype) + for k in range(nkpts): + rho += rho_ks[k] + rho *= 1./nkpts + return rho + + get_vxc = nr_vxc = numint_cpu.KNumInt.nr_vxc + eval_rho1 = NotImplemented + nr_rks = NotImplemented + nr_uks = NotImplemented + + block_loop = NotImplemented + eval_rho2 = NotImplemented + get_vxc = nr_vxc = numint_cpu.KNumInt.nr_vxc + nr_rks_fxc = nr_rks_fxc + nr_uks_fxc = nr_uks_fxc + nr_rks_fxc_st = nr_rks_fxc_st + cache_xc_kernel = cache_xc_kernel + cache_xc_kernel1 = cache_xc_kernel1 + get_rho = get_rho + + to_gpu = utils.to_gpu + device = utils.device + + def to_cpu(self): + return numint_cpu.KNumInt() + +_KNumInt = KNumInt diff --git a/gpu4pyscf/pbc/dft/rks.py b/gpu4pyscf/pbc/dft/rks.py new file mode 100644 index 00000000..f514b9e4 --- /dev/null +++ b/gpu4pyscf/pbc/dft/rks.py @@ -0,0 +1,228 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. 
+# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +''' +Non-relativistic Restricted Kohn-Sham for periodic systems at a single k-point +''' + + +import numpy as np +import cupy as cp +from pyscf import lib +from pyscf.pbc.dft import rks as ks_cpu +from pyscf.pbc.scf import khf +from pyscf.pbc.dft import multigrid +from gpu4pyscf.lib import logger, utils +from gpu4pyscf.dft import rks as mol_ks +from gpu4pyscf.pbc.scf import hf as pbchf +from gpu4pyscf.pbc.dft import gen_grid +from gpu4pyscf.pbc.dft import numint +from gpu4pyscf.lib.cupy_helper import contract, tag_array +from pyscf import __config__ + +__all__ = [ + 'get_veff', 'RKS', 'KohnShamDFT', +] + +def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1, + kpt=None, kpts_band=None): + '''Coulomb + XC functional + + .. note:: + This function will change the ks object. + + Args: + ks : an instance of :class:`RKS` + XC functional are controlled by ks.xc attribute. Attribute + ks.grids might be initialized. + dm : ndarray or list of ndarrays + A density matrix or a list of density matrices + + Returns: + matrix Veff = J + Vxc. Veff can be a list matrices, if the input + dm is a list of density matrices. + ''' + if cell is None: cell = ks.cell + if dm is None: dm = ks.make_rdm1() + if kpt is None: kpt = ks.kpt + t0 = logger.init_timer(ks) + + ni = ks._numint + hybrid = ni.libxc.is_hybrid_xc(ks.xc) + + if isinstance(ks.with_df, multigrid.MultiGridFFTDF): + if ks.do_nlc(): + raise NotImplementedError(f'MultiGrid for NLC functional {ks.xc} + {ks.nlc}') + + ground_state = (isinstance(dm, cp.ndarray) and dm.ndim == 2 + and kpts_band is None) + ks.initialize_grids(cell, dm, kpt, ground_state) + + if hermi == 2: # because rho = 0 + n, exc, vxc = 0, 0, 0 + else: + n, exc, vxc = ni.nr_rks(cell, ks.grids, ks.xc, dm, 0, hermi, + kpt, kpts_band) + logger.info(ks, 'nelec by numeric integration = %s', n) + if ks.do_nlc(): + if ni.libxc.is_nlc(ks.xc): + xc = ks.xc + else: + assert ni.libxc.is_nlc(ks.nlc) + xc = ks.nlc + n, enlc, vnlc = ni.nr_nlc_vxc(cell, ks.nlcgrids, xc, dm, 0, hermi, kpt) + exc += enlc + vxc += vnlc + logger.info(ks, 'nelec with nlc grids = %s', n) + t0 = logger.timer(ks, 'vxc', *t0) + + if not hybrid: + vj = ks.get_j(cell, dm, hermi, kpt, kpts_band) + vxc += vj + else: + omega, alpha, hyb = ni.rsh_and_hybrid_coeff(ks.xc, spin=cell.spin) + if omega == 0: + vj, vk = ks.get_jk(cell, dm, hermi, kpt, kpts_band) + vk *= hyb + elif alpha == 0: # LR=0, only SR exchange + vj = ks.get_j(cell, dm, hermi, kpt, kpts_band) + vk = ks.get_k(cell, dm, hermi, kpt, kpts_band, omega=-omega) + vk *= hyb + elif hyb == 0: # SR=0, only LR exchange + vj = ks.get_j(cell, dm, hermi, kpt, kpts_band) + vk = ks.get_k(cell, dm, hermi, kpt, kpts_band, omega=omega) + vk *= alpha + else: # SR and LR exchange with different ratios + vj, vk = ks.get_jk(cell, dm, hermi, kpt, kpts_band) + vk *= hyb + vklr = ks.get_k(cell, dm, hermi, kpt, kpts_band, omega=omega) + vklr *= (alpha - hyb) + vk += vklr + 
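The four branches above only decide how many K builds are required; in the general range-separated case the assembled exchange is one full-range K scaled by `hyb` plus one `omega`-screened K scaled by `alpha - hyb`. A hypothetical condensation:

```python
def rsh_exchange(get_k, dm, omega, alpha, hyb):
    # General RSH case; get_veff above special-cases alpha == 0 and hyb == 0
    # so that those functionals pay for a single K build only.
    vk = get_k(dm) * hyb
    if abs(omega) > 1e-10:
        vk += get_k(dm, omega=omega) * (alpha - hyb)
    return vk
```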
vxc += vj - vk * .5 + + if ground_state: + exc -= contract('ij,ji->', dm, vk).real * .5 * .5 + + if ground_state: + ecoul = contract('ij,ji->', dm, vj).real * .5 + else: + ecoul = None + + vxc = tag_array(vxc, ecoul=ecoul, exc=exc, vj=None, vk=None) + return vxc + +def prune_small_rho_grids_(ks, cell, dm, grids, kpts): + raise NotImplementedError + +def get_rho(mf, dm=None, grids=None, kpt=None): + if dm is None: dm = mf.make_rdm1() + if grids is None: grids = mf.grids + if kpt is None: kpt = mf.kpt + if dm[0].ndim == 2: # the UKS density matrix + dm = dm[0] + dm[1] + if isinstance(mf.with_df, multigrid.MultiGridFFTDF): + rho = mf.with_df.get_rho(dm, kpt) + else: + rho = mf._numint.get_rho(mf.cell, dm, grids, kpt, mf.max_memory) + return rho + + +class KohnShamDFT(mol_ks.KohnShamDFT): + '''PBC-KS''' + + _keys = ks_cpu.KohnShamDFT._keys + + def __init__(self, xc='LDA,VWN'): + self.xc = xc + self.grids = gen_grid.UniformGrids(self.cell) + self.nlc = '' + self.nlcgrids = gen_grid.UniformGrids(self.cell) + self.small_rho_cutoff = getattr( + __config__, 'dft_rks_RKS_small_rho_cutoff', 1e-7) + if isinstance(self, khf.KSCF): + self._numint = numint.KNumInt(self.kpts) + else: + self._numint = numint.NumInt() + + build = ks_cpu.KohnShamDFT.build + reset = ks_cpu.KohnShamDFT.reset + dump_flags = ks_cpu.KohnShamDFT.dump_flags + + get_veff = NotImplemented + get_rho = get_rho + + density_fit = NotImplemented + rs_density_fit = NotImplemented + + jk_method = NotImplemented + + to_rks = NotImplemented + to_uks = NotImplemented + to_gks = NotImplemented + to_hf = NotImplemented + + def initialize_grids(self, cell, dm, kpts, ground_state=True): + '''Initialize self.grids the first time call get_veff''' + if self.grids.coords is None: + t0 = (logger.process_clock(), logger.perf_counter()) + self.grids.build(with_non0tab=True) + if (isinstance(self.grids, gen_grid.BeckeGrids) and + self.small_rho_cutoff > 1e-20 and ground_state): + self.grids = prune_small_rho_grids_( + self, self.cell, dm, self.grids, kpts) + t0 = logger.timer(self, 'setting up grids', *t0) + is_nlc = self.do_nlc() + if is_nlc and self.nlcgrids.coords is None: + t0 = (logger.process_clock(), logger.perf_counter()) + self.nlcgrids.build(with_non0tab=True) + if (isinstance(self.grids, gen_grid.BeckeGrids) and + self.small_rho_cutoff > 1e-20 and ground_state): + self.nlcgrids = prune_small_rho_grids_( + self, self.cell, dm, self.nlcgrids, kpts) + t0 = logger.timer(self, 'setting up nlc grids', *t0) + return self + +# Update the KohnShamDFT label in pbc.scf.hf module +pbchf.KohnShamDFT = KohnShamDFT + + +class RKS(KohnShamDFT, pbchf.RHF): + '''RKS class adapted for PBCs. + + This is a literal duplication of the molecular RKS class with some `mol` + variables replaced by `cell`. 
+ ''' + + def __init__(self, cell, kpt=np.zeros(3), xc='LDA,VWN', exxdiv='ewald'): + pbchf.RHF.__init__(self, cell, kpt, exxdiv=exxdiv) + KohnShamDFT.__init__(self, xc) + + def dump_flags(self, verbose=None): + pbchf.RHF.dump_flags(self, verbose) + KohnShamDFT.dump_flags(self, verbose) + return self + + get_veff = get_veff + energy_elec = mol_ks.energy_elec + + to_gpu = utils.to_gpu + device = utils.device + + def to_cpu(self): + mf = ks_cpu.RKS(self.cell) + utils.to_cpu(self, out=mf) + return mf diff --git a/gpu4pyscf/pbc/dft/tests/test_pbc_rks.py b/gpu4pyscf/pbc/dft/tests/test_pbc_rks.py new file mode 100644 index 00000000..1489ee40 --- /dev/null +++ b/gpu4pyscf/pbc/dft/tests/test_pbc_rks.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import unittest +import tempfile +import numpy as np +from pyscf.pbc import gto as pbcgto +from gpu4pyscf.pbc import dft as pbcdft + + +class KnownValues(unittest.TestCase): + @classmethod + def setUpClass(cls): + global cell + L = 4 + n = 21 + cell = pbcgto.Cell() + cell.build(unit = 'B', + a = ((L,0,0),(0,L,0),(0,0,L)), + mesh = [n,n,n], + atom = [['He', (L/2.-.5,L/2.,L/2.-.5)], + ['He', (L/2. 
,L/2.,L/2.+.5)]], + basis = { 'He': [[0, (0.8, 1.0)], + [0, (1.0, 1.0)], + [0, (1.2, 1.0)]]}) + cls.cell = cell + + @classmethod + def tearDownClass(cls): + global cell + del cell + + def test_lda_fft(self): + mf = pbcdft.RKS(cell, xc='lda,vwn').run() + mf_ref = mf.to_cpu().run() + self.assertAlmostEqual(mf.e_tot, mf_ref.e_tot, 7) + + # test bands + np.random.seed(1) + kpts_band = np.random.random((2,3)) + e0, c0 = mf_ref.get_bands(kpts_band) + e1, c1 = mf.get_bands(kpts_band) + self.assertAlmostEqual(abs(e1[0].get() - e0[0]).max(), 0, 7) + self.assertAlmostEqual(abs(e1[1].get() - e0[1]).max(), 0, 7) + + def test_gga_fft(self): + mf = pbcdft.RKS(cell, xc='pbe0').run() + mf_ref = mf.to_cpu().run() + self.assertAlmostEqual(mf.e_tot, mf_ref.e_tot, 7) + + # test bands + np.random.seed(1) + kpts_band = np.random.random((2,3)) + e0, c0 = mf_ref.get_bands(kpts_band) + e1, c1 = mf.get_bands(kpts_band) + self.assertAlmostEqual(abs(e1[0].get() - e0[0]).max(), 0, 7) + self.assertAlmostEqual(abs(e1[1].get() - e0[1]).max(), 0, 7) + + def test_rsh_fft(self): + mf = pbcdft.RKS(cell, xc='camb3lyp').run() + mf_ref = mf.to_cpu().run() + self.assertAlmostEqual(mf.e_tot, mf_ref.e_tot, 7) + + # test bands + np.random.seed(1) + kpts_band = np.random.random((2,3)) + e0, c0 = mf_ref.get_bands(kpts_band) + e1, c1 = mf.get_bands(kpts_band) + self.assertAlmostEqual(abs(e1[0].get() - e0[0]).max(), 0, 7) + self.assertAlmostEqual(abs(e1[1].get() - e0[1]).max(), 0, 7) + + def test_lda_fft_with_kpt(self): + np.random.seed(1) + k = np.random.random(3) + mf = pbcdft.RKS(cell, xc='lda,vwn', kpt=k).run() + mf_ref = mf.to_cpu().run() + self.assertAlmostEqual(mf.e_tot, mf_ref.e_tot, 7) + + # test bands + np.random.seed(1) + kpts_band = np.random.random((2,3)) + e0, c0 = mf_ref.get_bands(kpts_band) + e1, c1 = mf.get_bands(kpts_band) + self.assertAlmostEqual(abs(e1[0].get() - e0[0]).max(), 0, 7) + self.assertAlmostEqual(abs(e1[1].get() - e0[1]).max(), 0, 7) + + def test_gga_fft_with_kpt(self): + np.random.seed(1) + k = np.random.random(3) + mf = pbcdft.RKS(cell, xc='pbe0', kpt=k).run() + mf_ref = mf.to_cpu().run() + self.assertAlmostEqual(mf.e_tot, mf_ref.e_tot, 7) + + # test bands + np.random.seed(1) + kpts_band = np.random.random((2,3)) + e0, c0 = mf_ref.get_bands(kpts_band) + e1, c1 = mf.get_bands(kpts_band) + self.assertAlmostEqual(abs(e1[0].get() - e0[0]).max(), 0, 7) + self.assertAlmostEqual(abs(e1[1].get() - e0[1]).max(), 0, 7) + + def test_rsh_fft_with_kpt(self): + np.random.seed(1) + k = np.random.random(3) + mf = pbcdft.RKS(cell, xc='camb3lyp', kpt=k).run(conv_tol=1e-8) + mf_ref = mf.to_cpu().run() + self.assertAlmostEqual(mf.e_tot, mf_ref.e_tot, 7) + + # test bands + np.random.seed(1) + kpts_band = np.random.random((2,3)) + e0, c0 = mf_ref.get_bands(kpts_band) + e1, c1 = mf.get_bands(kpts_band) + self.assertAlmostEqual(abs(e1[0].get() - e0[0]).max(), 0, 7) + self.assertAlmostEqual(abs(e1[1].get() - e0[1]).max(), 0, 7) + +if __name__ == '__main__': + print("Full Tests for pbc.dft.rks") + unittest.main() diff --git a/gpu4pyscf/pbc/scf/__init__.py b/gpu4pyscf/pbc/scf/__init__.py new file mode 100644 index 00000000..70ac5a5b --- /dev/null +++ b/gpu4pyscf/pbc/scf/__init__.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. 
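These tests double as usage documentation; a minimal end-to-end sketch modeled on the fixture above (small helium cell, FFT integrals, CPU cross-check):

```python
import numpy as np
from pyscf.pbc import gto as pbcgto
from gpu4pyscf.pbc import dft as pbcdft

cell = pbcgto.Cell()
cell.build(unit='B', a=np.eye(3) * 4, mesh=[21] * 3,
           atom=[['He', (1.5, 2.0, 1.5)], ['He', (2.0, 2.0, 2.5)]],
           basis={'He': [[0, (1.0, 1.0)]]})

mf = pbcdft.RKS(cell, xc='pbe0').run()    # SCF on the GPU
e_ref = mf.to_cpu().run().e_tot           # CPU reference, ~1e-7 Ha agreement
e_bands = mf.get_bands(np.random.random((2, 3)))[0]
```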
+# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +'''Hartree-Fock for periodic systems +''' + +from .import hf +#from . import uhf +#from . import khf +#from . import kuhf + +rhf = hf +#krhf = khf + +#UHF = uhf.UHF +RHF = rhf.RHF +#KRHF = krhf.KRHF +#KUHF = kuhf.KRHF diff --git a/gpu4pyscf/pbc/scf/hf.py b/gpu4pyscf/pbc/scf/hf.py new file mode 100644 index 00000000..83ad7b47 --- /dev/null +++ b/gpu4pyscf/pbc/scf/hf.py @@ -0,0 +1,268 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +''' +Hartree-Fock for periodic systems at a single k-point +''' + +import numpy as np +import cupy as cp +from pyscf import lib +from pyscf.pbc.scf import hf as hf_cpu +from gpu4pyscf.lib import logger, utils +from gpu4pyscf.lib.cupy_helper import return_cupy_array, contract +from gpu4pyscf.scf import hf as mol_hf +from gpu4pyscf.pbc import df + +__all__ = [ + 'RHF', 'SCF' +] + +def get_bands(mf, kpts_band, cell=None, dm=None, kpt=None): + '''Get energy bands at the given (arbitrary) 'band' k-points. 
+ + Returns: + mo_energy : (nmo,) ndarray or a list of (nmo,) ndarray + Bands energies E_n(k) + mo_coeff : (nao, nmo) ndarray or a list of (nao,nmo) ndarray + Band orbitals psi_n(k) + ''' + if cell is None: cell = mf.cell + if dm is None: dm = mf.make_rdm1() + if kpt is None: kpt = mf.kpt + + kpts_band = np.asarray(kpts_band) + single_kpt_band = (getattr(kpts_band, 'ndim', None) == 1) + kpts_band = kpts_band.reshape(-1,3) + + fock = mf.get_veff(cell, dm, kpt=kpt, kpts_band=kpts_band) + fock += mf.get_hcore(cell, kpts_band) + s1e = mf.get_ovlp(cell, kpts_band) + nkpts = len(kpts_band) + mo_energy = [] + mo_coeff = [] + for k in range(nkpts): + e, c = mf.eig(fock[k], s1e[k]) + mo_energy.append(e) + mo_coeff.append(c) + + if single_kpt_band: + mo_energy = mo_energy[0] + mo_coeff = mo_coeff[0] + return mo_energy, mo_coeff + +get_fock = mol_hf.get_fock +get_occ = mol_hf.get_occ +get_grad = mol_hf.get_grad +make_rdm1 = mol_hf.make_rdm1 +energy_elec = mol_hf.energy_elec + +def get_rho(mf, dm=None, grids=None, kpt=None): + '''Compute density in real space + ''' + from gpu4pyscf.pbc.dft import gen_grid + from gpu4pyscf.pbc.dft import numint + if dm is None: + dm = mf.make_rdm1() + if getattr(dm, 'ndim', None) != 2: # UHF + dm = dm[0] + dm[1] + if grids is None: + grids = gen_grid.UniformGrids(mf.cell) + if kpt is None: + kpt = mf.kpt + ni = numint.NumInt() + return ni.get_rho(mf.cell, dm, grids, kpt, mf.max_memory) + +class SCF(mol_hf.SCF): + '''SCF base class adapted for PBCs. + + Attributes: + kpt : (3,) ndarray + The AO k-point in Cartesian coordinates, in units of 1/Bohr. + + exxdiv : str + Exchange divergence treatment, can be one of + + | None : ignore G=0 contribution in exchange + | 'ewald' : Ewald probe charge correction [JCP 122, 234102 (2005); DOI:10.1063/1.1926272] + + with_df : density fitting object + Default is the instance of FFTDF class (GPW method). + ''' + + _keys = hf_cpu.SCF._keys + + def __init__(self, cell, kpt=np.zeros(3), exxdiv='ewald'): + if not cell._built: + cell.build() + mol_hf.SCF.__init__(self, cell) + self.with_df = df.FFTDF(cell) + # Range separation JK builder + self.rsjk = None + self.exxdiv = exxdiv + self.kpt = kpt + self.conv_tol = 1e-8 + if cell.precision: + self.conv_tol = max(cell.precision * 10, 1e-8) + + def check_sanity(self): + if (isinstance(self.exxdiv, str) and self.exxdiv.lower() != 'ewald' and + isinstance(self.with_df, df.DF)): + logger.warn(self, 'exxdiv %s is not supported in DF', self.exxdiv) + + if self.verbose >= logger.DEBUG: + super().check_sanity() + return self + + kpt = hf_cpu.SCF.kpt + kpts = hf_cpu.SCF.kpts + mol = hf_cpu.SCF.mol # required by the hf.kernel + + reset = hf_cpu.SCF.reset + build = hf_cpu.SCF.build + dump_flags = hf_cpu.SCF.dump_flags + + get_bands = get_bands + get_rho = get_rho + + get_ovlp = return_cupy_array(hf_cpu.SCF.get_ovlp) + + def get_hcore(self, cell=None, kpt=None): + if cell is None: cell = self.cell + if kpt is None: kpt = self.kpt + if cell.pseudo: + nuc = self.with_df.get_pp(kpt) + else: + nuc = self.with_df.get_nuc(kpt) + if len(cell._ecpbas) > 0: + raise NotImplementedError('ECP in PBC SCF') + return nuc + cp.asarray(cell.pbc_intor('int1e_kin', 1, 1, kpt)) + + def get_jk(self, cell=None, dm=None, hermi=1, kpt=None, kpts_band=None, + with_j=True, with_k=True, omega=None, **kwargs): + r'''Get Coulomb (J) and exchange (K) following :func:`scf.hf.RHF.get_jk_`. + for particular k-point (kpt). + + When kpts_band is given, the J, K matrices on kpts_band are evaluated. 
+ + J_{pq} = \sum_{rs} (pq|rs) dm[s,r] + K_{pq} = \sum_{rs} (pr|sq) dm[r,s] + + where r,s are orbitals on kpt. p and q are orbitals on kpts_band + if kpts_band is given otherwise p and q are orbitals on kpt. + ''' + if cell is None: cell = self.cell + if dm is None: dm = self.make_rdm1() + if kpt is None: kpt = self.kpt + + cpu0 = logger.init_timer(self) + dm = cp.asarray(dm) + nao = dm.shape[-1] + vj, vk = self.with_df.get_jk(dm.reshape(-1,nao,nao), hermi, kpt, kpts_band, + with_j, with_k, omega, exxdiv=self.exxdiv) + if with_j: + vj = _format_jks(vj, dm, kpts_band) + if with_k: + vk = _format_jks(vk, dm, kpts_band) + logger.timer(self, 'vj and vk', *cpu0) + return vj, vk + + def get_j(self, cell=None, dm=None, hermi=1, kpt=None, kpts_band=None, + omega=None): + r'''Compute J matrix for the given density matrix and k-point (kpt). + When kpts_band is given, the J matrices on kpts_band are evaluated. + + J_{pq} = \sum_{rs} (pq|rs) dm[s,r] + + where r,s are orbitals on kpt. p and q are orbitals on kpts_band + if kpts_band is given otherwise p and q are orbitals on kpt. + ''' + return self.get_jk(cell, dm, hermi, kpt, kpts_band, with_k=False, + omega=omega)[0] + + def get_k(self, cell=None, dm=None, hermi=1, kpt=None, kpts_band=None, + omega=None): + '''Compute K matrix for the given density matrix. + ''' + return self.get_jk(cell, dm, hermi, kpt, kpts_band, with_j=False, + omega=omega)[1] + + get_veff = hf_cpu.SCF.get_veff + energy_nuc = hf_cpu.SCF.energy_nuc + _finalize = hf_cpu.SCF._finalize + + def get_init_guess(self, cell=None, key='minao', s1e=None): + if cell is None: cell = self.cell + dm = mol_hf.SCF.get_init_guess(self, cell, key) + dm = normalize_dm_(self, dm, s1e) + return dm + + init_guess_by_1e = hf_cpu.SCF.init_guess_by_1e + init_guess_by_chkfile = hf_cpu.SCF.init_guess_by_chkfile + from_chk = hf_cpu.SCF.from_chk + dump_chk = hf_cpu.SCF.dump_chk + analyze = NotImplemented + mulliken_pop = NotImplemented + density_fit = NotImplemented + rs_density_fit = NotImplemented + x2c = x2c1e = sfx2c1e = NotImplemented + spin_square = NotImplemented + dip_moment = NotImplemented + + +class KohnShamDFT: + '''A mock DFT base class + + The base class is defined in the pbc.dft.rks module. This class can + be used to verify if an SCF object is an pbc-Hartree-Fock method or an + pbc-DFT method. It should be overwritten by the actual KohnShamDFT class + when loading dft module. + ''' + + +class RHF(SCF): + + to_gpu = utils.to_gpu + device = utils.device + + def to_cpu(self): + mf = hf_cpu.RHF(self.cell) + utils.to_cpu(self, out=mf) + return mf + +def _format_jks(vj, dm, kpts_band): + if kpts_band is None: + vj = vj.reshape(dm.shape) + elif kpts_band.ndim == 1: # a single k-point on bands + vj = vj.reshape(dm.shape) + elif getattr(dm, "ndim", 0) == 2: + vj = vj[0] + return vj + +def normalize_dm_(mf, dm, s1e=None): + ''' + Force density matrices integrated to the correct number of electrons. + ''' + cell = mf.cell + if s1e is None: + s1e = mf.get_ovlp(cell) + ne = contract('ij,ji->', dm, s1e).real + if abs(ne - cell.nelectron) > 0.01: + logger.debug(mf, 'Big errors in the electron number of initial guess ' + 'density matrix (Ne/cell = %g)!', ne) + dm *= cell.nelectron / ne + return dm diff --git a/gpu4pyscf/pbc/scf/tests/test_pbc_scf_hf.py b/gpu4pyscf/pbc/scf/tests/test_pbc_scf_hf.py new file mode 100644 index 00000000..ca11d5b0 --- /dev/null +++ b/gpu4pyscf/pbc/scf/tests/test_pbc_scf_hf.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. 
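`normalize_dm_` below guards the initial guess with nothing more than the trace relation Tr(D S) = N_elec; a standalone sketch of that check:

```python
from gpu4pyscf.lib.cupy_helper import contract

def normalized_guess(dm, s1e, nelectron):
    # Tr(D S) counts the electrons carried by the guess density matrix;
    # rescale when it drifts from the cell's electron count by > 0.01 e.
    ne = float(contract('ij,ji->', dm, s1e).real)
    if abs(ne - nelectron) > 0.01:
        dm = dm * (nelectron / ne)
    return dm
```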
All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import unittest +import numpy as np +import cupy as cp +from pyscf import lib +from pyscf.pbc.scf import hf as pbchf_cpu +from pyscf.pbc import gto as pbcgto +from gpu4pyscf.pbc import scf + +class KnownValues(unittest.TestCase): + @classmethod + def setUpClass(cls): + L = 4 + n = 21 + cell = pbcgto.Cell() + cell.build(unit = 'B', + verbose = 7, + output = '/dev/null', + a = ((L,0,0),(0,L,0),(0,0,L)), + mesh = [n,n,n], + atom = [['He', (L/2.-.5,L/2.,L/2.-.5)], + ['He', (L/2. ,L/2.,L/2.+.5)]], + basis = { 'He': [[0, (0.8, 1.0)], + [0, (1.0, 1.0)], + [0, (1.2, 1.0)]]}) + cls.cell = cell + + @classmethod + def tearDownClass(cls): + cls.cell.stdout.close() + + def test_rhf_exx_ewald(self): + cell = self.cell + mf = scf.RHF(cell, exxdiv='ewald').run() + self.assertAlmostEqual(mf.e_tot, -4.3511582284698633, 7) + self.assertTrue(mf.mo_coeff.dtype == np.double) + #kmf = scf.KRHF(cell, [[0,0,0]], exxdiv='ewald').run() + #self.assertAlmostEqual(mf.e_tot, kmf.e_tot, 8) + + # test bands + np.random.seed(1) + kpts_band = np.random.random((2,3)) + e1, c1 = mf.get_bands(kpts_band) + #e0, c0 = kmf.get_bands(kpts_band) + #self.assertAlmostEqual(abs(e0[0]-e1[0]).max(), 0, 7) + #self.assertAlmostEqual(abs(e0[1]-e1[1]).max(), 0, 7) + self.assertAlmostEqual(lib.fp(e1[0].get()), -6.2986775452228283, 6) + self.assertAlmostEqual(lib.fp(e1[1].get()), -7.6616273746782362, 6) + + def test_rhf_exx_ewald_with_kpt(self): + np.random.seed(1) + k = np.random.random(3) + cell = self.cell + mf = scf.RHF(cell, k, exxdiv='ewald') + e1 = mf.kernel() + self.assertAlmostEqual(e1, -4.2048655827967139, 7) + self.assertTrue(mf.mo_coeff.dtype == np.complex128) + + #kmf = scf.KRHF(cell, k, exxdiv='ewald') + #e0 = kmf.kernel() + #self.assertTrue(np.allclose(e0,e1)) + + # test bands + np.random.seed(1) + kpt_band = np.random.random(3) + e1, c1 = mf.get_bands(kpt_band) + #e0, c0 = kmf.get_bands(kpt_band) + #self.assertAlmostEqual(abs(e0-e1).max(), 0, 7) + self.assertAlmostEqual(lib.fp(e1.get()), -6.8312867098806249, 6) + + def test_rhf_exx_None(self): + cell = self.cell + mf = scf.RHF(cell, exxdiv=None) + e1 = mf.kernel() + self.assertAlmostEqual(e1, -2.9325094887283196, 7) + self.assertTrue(mf.mo_coeff.dtype == np.double) + + #mf = scf.KRHF(cell, [[0,0,0]], exxdiv=None) + #e0 = mf.kernel() + #self.assertTrue(np.allclose(e0,e1)) + + np.random.seed(1) + k = np.random.random(3) + mf = scf.RHF(cell, k, exxdiv=None) + mf.init_guess = 'hcore' + e1 = mf.kernel() + self.assertAlmostEqual(e1, -2.7862168430230341, 7) + self.assertTrue(mf.mo_coeff.dtype == np.complex128) + + #mf = scf.KRHF(cell, k, exxdiv=None) + #mf.init_guess = 'hcore' + #e0 = mf.kernel() + #self.assertTrue(np.allclose(e0,e1)) + + def test_jk(self): + cell = self.cell + nao = cell.nao + np.random.seed(2) + dm = np.random.random((2,nao,nao)) + .5j*np.random.random((2,nao,nao)) + dm = dm + dm.conj().transpose(0,2,1) + ref = 
pbchf_cpu.RHF(cell).get_jk(cell, dm) + + dm = cp.asarray(dm) + vj, vk = scf.RHF(cell).get_jk(cell, dm) + self.assertAlmostEqual(abs(vj.get() - ref[0]).max(), 0, 9) + self.assertAlmostEqual(abs(vk.get() - ref[1]).max(), 0, 9) + + +if __name__ == '__main__': + print("Full Tests for pbc.scf.hf") + unittest.main() diff --git a/gpu4pyscf/pbc/tools/__init__.py b/gpu4pyscf/pbc/tools/__init__.py new file mode 100644 index 00000000..12b67013 --- /dev/null +++ b/gpu4pyscf/pbc/tools/__init__.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +from .pbc import * diff --git a/gpu4pyscf/pbc/tools/pbc.py b/gpu4pyscf/pbc/tools/pbc.py new file mode 100644 index 00000000..c5fc91e8 --- /dev/null +++ b/gpu4pyscf/pbc/tools/pbc.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import numpy as np +import cupy as cp +from gpu4pyscf.lib.cupy_helper import return_cupy_array +from pyscf.pbc.tools.pbc import get_coulG + +get_coulG = return_cupy_array(get_coulG) + +def fft(f, mesh): + '''Perform the 3D FFT from real (R) to reciprocal (G) space. + + After FFT, (u, v, w) -> (j, k, l). + (jkl) is in the index order of Gv. + + FFT normalization factor is 1., as in MH and in `numpy.fft`. + + Args: + f : (nx*ny*nz,) ndarray + The function to be FFT'd, flattened to a 1D array corresponding + to the index order of :func:`cartesian_prod`. + mesh : (3,) ndarray of ints (= nx,ny,nz) + The number G-vectors along each direction. + + Returns: + (nx*ny*nz,) ndarray + The FFT 1D array in same index order as Gv (natural order of + numpy.fft). + + ''' + if f.size == 0: + return cp.zeros_like(f) + + f3d = cp.asarray(f).reshape(-1, *mesh) + assert (f3d.shape[0] == 1 or f[0].size == f3d[0].size) + g3d = cp.fft.fftn(f3d, axes=(1,2,3)) + ngrids = np.prod(mesh) + if f.ndim == 1 or (f.ndim == 3 and f.size == ngrids): + return g3d.ravel() + else: + return g3d.reshape(-1, ngrids) + +def ifft(g, mesh): + '''Perform the 3D inverse FFT from reciprocal (G) space to real (R) space. + + Inverse FFT normalization factor is 1./N, same as in `numpy.fft` but + **different** from MH (they use 1.). 
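The normalization convention stated in these docstrings matches `numpy.fft` (and `cupy.fft`): factor 1 on the forward transform, 1/N on the inverse. A quick roundtrip check:

```python
import numpy as np

mesh = (4, 4, 4)
f = np.random.rand(int(np.prod(mesh)))
g = np.fft.fftn(f.reshape(mesh)).ravel()        # forward: factor 1
f_back = np.fft.ifftn(g.reshape(mesh)).ravel()  # inverse: factor 1/N
assert np.allclose(f, f_back.real)              # fft/ifft are exact inverses
```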
+ + Args: + g : (nx*ny*nz,) ndarray + The function to be inverse FFT'd, flattened to a 1D array + corresponding to the index order of `span3`. + mesh : (3,) ndarray of ints (= nx,ny,nz) + The number G-vectors along each direction. + + Returns: + (nx*ny*nz,) ndarray + The inverse FFT 1D array in same index order as Gv (natural order + of numpy.fft). + + ''' + if g.size == 0: + return cp.zeros_like(g) + + g3d = cp.asarray(g).reshape(-1, *mesh) + assert (g3d.shape[0] == 1 or g[0].size == g3d[0].size) + f3d = cp.fft.ifftn(g3d, axes=(1,2,3)) + ngrids = np.prod(mesh) + if g.ndim == 1 or (g.ndim == 3 and g.size == ngrids): + return f3d.ravel() + else: + return f3d.reshape(-1, ngrids) + + +def fftk(f, mesh, expmikr): + r'''Perform the 3D FFT of a real-space function which is (periodic*e^{ikr}). + + fk(k+G) = \sum_r fk(r) e^{-i(k+G)r} = \sum_r [f(k)e^{-ikr}] e^{-iGr} + ''' + return fft(f*expmikr, mesh) + + +def ifftk(g, mesh, expikr): + r'''Perform the 3D inverse FFT of f(k+G) into a function which is (periodic*e^{ikr}). + + fk(r) = (1/Ng) \sum_G fk(k+G) e^{i(k+G)r} = (1/Ng) \sum_G [fk(k+G)e^{iGr}] e^{ikr} + ''' + return ifft(g, mesh) * expikr diff --git a/gpu4pyscf/properties/shielding.py b/gpu4pyscf/properties/shielding.py index 1ef5e844..ae98dc04 100644 --- a/gpu4pyscf/properties/shielding.py +++ b/gpu4pyscf/properties/shielding.py @@ -18,7 +18,7 @@ from pyscf.data import nist from pyscf.scf import _vhf, jk from gpu4pyscf.dft import numint -from gpu4pyscf.lib.cupy_helper import contract, take_last2d, add_sparse +from gpu4pyscf.lib.cupy_helper import contract, sandwich_dot, add_sparse from gpu4pyscf.scf import cphf def gen_vind(mf, mo_coeff, mo_occ): @@ -37,23 +37,20 @@ def gen_vind(mf, mo_coeff, mo_occ): mvir = mo_coeff[:, mo_occ == 0] nocc = mocc.shape[1] nvir = nmo - nocc - omega, alpha, hyb = mf._numint.rsh_and_hybrid_coeff( - mf.xc, spin=mf.mol.spin) + omega, alpha, hyb = mf._numint.rsh_and_hybrid_coeff(mf.xc, spin=mf.mol.spin) + # FIXME: check if hybrid + # FIXME: handle rsh def fx(mo1): mo1 = mo1.reshape(-1, nvir, nocc) # * the saving pattern mo1_mo_real = contract('nai,ua->nui', mo1, mvir) dm1 = 2*contract('nui,vi->nuv', mo1_mo_real, mocc.conj()) - dm1 -= dm1.transpose(0, 2, 1) + dm1 = dm1 - dm1.transpose(0, 2, 1) if hasattr(mf,'with_df'): - v1 = cupy.empty((3, nao, nao)) - for i in range(3): - v1[i] =+mf.get_jk(mf.mol, dm1[i], hermi=2, with_j=False)[1]*0.5*hyb + vk = mf.get_jk(mf.mol, dm1, hermi=2, with_j=False)[1] else: - v1 = np.empty((3, nao, nao)) - for i in range(3): - v1[i] = -jk.get_jk(mf.mol, dm1[i].get(), 'ijkl,jk->il')*0.5*hyb - v1 = cupy.array(v1) + vk = cupy.array(jk.get_jk(mf.mol, dm1.get(), ['ijkl,jk->il']*3)) + v1 = -.5*hyb * vk tmp = contract('nuv,vi->nui', v1, mocc) v1vo = contract('nui,ua->nai', tmp, mvir.conj()) @@ -68,7 +65,7 @@ def nr_rks(ni, mol, grids, xc_code, dms): mo_coeff = getattr(dms, 'mo_coeff', None) mo_occ = getattr(dms, 'mo_occ', None) nao = mo_coeff.shape[1] - + opt = getattr(ni, 'gdftopt', None) if opt is None: ni.build(mol, grids.coords) @@ -77,9 +74,8 @@ def nr_rks(ni, mol, grids, xc_code, dms): coeff = cupy.asarray(opt.coeff) nao, nao0 = coeff.shape - dms = cupy.asarray(dms).reshape(-1,nao0,nao0) - dms = take_last2d(dms, opt.ao_idx) - mo_coeff = mo_coeff[opt.ao_idx] + dms = sandwich_dot(cupy.asarray(dms).reshape(-1,nao0,nao0), coeff.T) + mo_coeff = coeff.dot(mo_coeff) vmat = cupy.zeros((3, nao, nao)) if xctype == 'LDA': @@ -100,7 +96,7 @@ def nr_rks(ni, mol, grids, xc_code, dms): vtmp = contract('pu,p,vp->uv', giao_aux[idirect], wv, ao) vtmp = 
cupy.ascontiguousarray(vtmp) add_sparse(vmat[idirect], vtmp, index) - + elif xctype == 'GGA': wv = vxc * weight giao = _sorted_mol.eval_gto('GTOval_ig', coords.get(), comp=3) @@ -133,7 +129,7 @@ def nr_rks(ni, mol, grids, xc_code, dms): ao = None - vmat = take_last2d(vmat, opt.rev_ao_idx) + vmat = sandwich_dot(vmat, coeff) if numint.FREE_CUPY_CACHE: dms = None @@ -164,8 +160,9 @@ def get_vxc(mf, dm0): vk = None vxc += vj else: - omega, alpha, hyb = mf._numint.rsh_and_hybrid_coeff( - mf.xc, spin=mf.mol.spin) + omega, alpha, hyb = mf._numint.rsh_and_hybrid_coeff(mf.xc, spin=mf.mol.spin) + # FIXME: check if hybrid + # FIXME: handle rsh vxc += vj - vk*hyb*0.5 return vxc @@ -211,19 +208,16 @@ def eval_shielding(mf): s1jk = -contract('xiq,qj->xij', tmp, mocc)*0.5 tmp = contract('nai,ua->nui', s1jk, mocc) s1jkdm1 = contract('nui,vi->nuv', tmp, mocc.conj())*2 - s1jkdm1 -= s1jkdm1.transpose(0, 2, 1) - omega, alpha, hyb = mf._numint.rsh_and_hybrid_coeff( - mf.xc, spin=mf.mol.spin) + s1jkdm1 = s1jkdm1 - s1jkdm1.transpose(0, 2, 1) + omega, alpha, hyb = mf._numint.rsh_and_hybrid_coeff(mf.xc, spin=mf.mol.spin) + # FIXME: check if hybrid + # FIXME: handle rsh + if hasattr(mf,'with_df'): - vk2 = cupy.empty((3, nao, nao)) - for i in range(3): - vk2[i] = +mf.get_jk(mf.mol, s1jkdm1[i], hermi=2, with_j=False)[1]*0.5*hyb - + vk = mf.get_jk(mf.mol, s1jkdm1, hermi=2, with_j=False)[1] else: - vk2 = np.empty((3, nao, nao)) - for i in range(3): - vk2[i] = -jk.get_jk(mf.mol, s1jkdm1[i].get(), 'ijkl,jk->il')*0.5*hyb - vk2 = cupy.array(vk2) + vk = cupy.array(jk.get_jk(mf.mol, s1jkdm1.get(), ['ijkl,jk->il']*3)) + vk2 = -.5*hyb * vk h1ao += vk2 tmp = contract('xuv,ua->xav', h1ao, mvir) veff_ai = contract('xav,vi->xai', tmp, mocc) diff --git a/gpu4pyscf/properties/tests/test_shielding.py b/gpu4pyscf/properties/tests/test_shielding.py index e2415c80..0bbe9c07 100644 --- a/gpu4pyscf/properties/tests/test_shielding.py +++ b/gpu4pyscf/properties/tests/test_shielding.py @@ -135,4 +135,4 @@ def test_rks_b3lyp_df(self): if __name__ == "__main__": print("Full Tests for nmr shielding constants") - unittest.main() \ No newline at end of file + unittest.main() diff --git a/gpu4pyscf/qmmm/chelpg.py b/gpu4pyscf/qmmm/chelpg.py index 874ab513..c2c8b056 100644 --- a/gpu4pyscf/qmmm/chelpg.py +++ b/gpu4pyscf/qmmm/chelpg.py @@ -48,7 +48,7 @@ def _build_VHFOpt(intopt, cutoff=1e-14, group_size=None, _, _, fake_uniq_l_ctr, fake_l_ctr_counts = int3c2e.sort_mol(fake_mol) # sort auxiliary mol - sorted_auxmol, sorted_aux_idx, aux_uniq_l_ctr, aux_l_ctr_counts = int3c2e.sort_mol( + sorted_auxmol, _, aux_uniq_l_ctr, aux_l_ctr_counts = int3c2e.sort_mol( intopt.auxmol) if group_size_aux is not None: aux_uniq_l_ctr, aux_l_ctr_counts = int3c2e._split_l_ctr_groups( @@ -88,10 +88,7 @@ def _build_VHFOpt(intopt, cutoff=1e-14, group_size=None, ao_idx = np.array_split(np.arange(nao), cart_ao_loc[1:-1]) intopt.cart_ao_idx = np.hstack([ao_idx[i] for i in sorted_idx]) ncart = cart_ao_loc[-1] - nsph = sph_ao_loc[-1] - intopt.cart2sph = block_c2s_diag(ncart, nsph, intopt.angular, l_ctr_counts) - inv_idx = np.argsort(intopt.sph_ao_idx, kind='stable').astype(np.int32) - intopt.coeff = intopt.cart2sph[:, inv_idx] + intopt.cart2sph = block_c2s_diag(intopt.angular, l_ctr_counts) # pairing auxiliary basis with fake basis set fake_l_ctr_offsets = np.append(0, np.cumsum(fake_l_ctr_counts)) @@ -109,7 +106,6 @@ def _build_VHFOpt(intopt, cutoff=1e-14, group_size=None, cart_aux_loc = intopt.auxmol.ao_loc_nr(cart=True) sph_aux_loc = intopt.auxmol.ao_loc_nr(cart=False) ncart = 
cart_aux_loc[-1] - nsph = sph_aux_loc[-1] # inv_idx = np.argsort(intopt.sph_aux_idx, kind='stable').astype(np.int32) aux_l_ctr_offsets += fake_l_ctr_offsets[-1] @@ -159,6 +155,13 @@ def _build_VHFOpt(intopt, cutoff=1e-14, group_size=None, intopt.cp_idx, intopt.cp_jdx = np.unravel_index( np.arange(ncptype), (nl, nl)) + intopt._sorted_mol = sorted_mol + intopt._sorted_auxmol = sorted_auxmol + if intopt.mol.cart: + intopt._ao_idx = intopt.cart_ao_idx + else: + intopt._ao_idx = intopt.sph_ao_idx + def eval_chelpg_layer_gpu(mf, deltaR=0.3, Rhead=2.8, ifqchem=True, Rvdw=modified_Bondi, verbose=None): """Cal chelpg charge diff --git a/gpu4pyscf/qmmm/pbc/itrf.py b/gpu4pyscf/qmmm/pbc/itrf.py index f704133e..986ae2f2 100644 --- a/gpu4pyscf/qmmm/pbc/itrf.py +++ b/gpu4pyscf/qmmm/pbc/itrf.py @@ -1026,7 +1026,8 @@ def calculate_h1e(self, h1_gpu): v = cp.zeros_like(g_qm) for i0,i1,j0,j1,k0,k1,j3c in int3c2e.loop_int3c2e_general(intopt, ip_type='ip1'): v[:,i0:i1,j0:j1] += contract('xkji,k->xij', j3c, charges[k0:k1]) - g_qm += cupy_helper.take_last2d(v, intopt.rev_ao_idx) + v = intopt.unsort_orbitals(v, axis=[1,2]) + g_qm += v #cupy_helper.take_last2d(v, intopt.rev_ao_idx) elif mm_mol.charge_model == 'point' and len(coords) != 0: max_memory = self.max_memory - lib.current_memory()[0] blksize = int(min(max_memory*1e6/8/nao**2/3, 200)) @@ -1079,7 +1080,7 @@ def grad_hcore_mm(self, dm, mol=None): intopt.build(self.base.direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size=int3c2e.BLKSIZE, group_size_aux=int3c2e.BLKSIZE) - dm_ = cupy_helper.take_last2d(dm, intopt.sph_ao_idx) + dm_ = intopt.sort_orbitals(dm, axis=[0,1]) for i0,i1,j0,j1,k0,k1,j3c in int3c2e.loop_int3c2e_general(intopt, ip_type='ip2'): j3c = contract('xkji,k->xkji', j3c, charges[k0:k1]) g_[k0:k1] += contract('xkji,ij->kx', j3c, dm_[i0:i1,j0:j1]) diff --git a/gpu4pyscf/scf/_response_functions.py b/gpu4pyscf/scf/_response_functions.py index 6677cf6f..b86b0514 100644 --- a/gpu4pyscf/scf/_response_functions.py +++ b/gpu4pyscf/scf/_response_functions.py @@ -19,7 +19,7 @@ from gpu4pyscf.scf import hf, uhf def _gen_rhf_response(mf, mo_coeff=None, mo_occ=None, - singlet=None, hermi=0, max_memory=None): + singlet=None, hermi=0, grids=None, max_memory=None): '''Generate a function to compute the product of RHF response function and RHF density matrices. @@ -31,24 +31,29 @@ def _gen_rhf_response(mf, mo_coeff=None, mo_occ=None, if mo_coeff is None: mo_coeff = mf.mo_coeff if mo_occ is None: mo_occ = mf.mo_occ mol = mf.mol + if isinstance(mf, hf.KohnShamDFT): + if grids is None: + grids = mf.grids + if grids and grids.coords is None: + grids.build(mol=mol, with_non0tab=False, sort_grids=True) ni = mf._numint ni.libxc.test_deriv_order(mf.xc, 2, raise_error=True) - if getattr(mf, 'nlc', '') != '': + if mf.do_nlc(): logger.warn(mf, 'NLC functional found in DFT object. Its second ' 'deriviative is not available. 
Its contribution is ' 'not included in the response function.') omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, mol.spin) - hybrid = abs(hyb) > 1e-10 + hybrid = ni.libxc.is_hybrid_xc(mf.xc) if singlet is None: # for ground state orbital hessian - rho0, vxc, fxc = ni.cache_xc_kernel(mol, mf.grids, mf.xc, - mo_coeff, mo_occ, 0) + spin = 0 else: - rho0, vxc, fxc = ni.cache_xc_kernel(mol, mf.grids, mf.xc, - [mo_coeff]*2, [mo_occ*.5]*2, spin=1) - dm0 = None #mf.make_rdm1(mo_coeff, mo_occ) + spin = 1 + rho0, vxc, fxc = ni.cache_xc_kernel( + mol, grids, mf.xc, mo_coeff, mo_occ, spin, max_memory=max_memory) + dm0 = None if singlet is None: # Without specify singlet, used in ground state orbital hessian @@ -57,9 +62,9 @@ def vind(dm1): if hermi == 2: v1 = cupy.zeros_like(dm1) else: - v1 = ni.nr_rks_fxc(mol, mf.grids, mf.xc, dm0, dm1, 0, hermi, + v1 = ni.nr_rks_fxc(mol, grids, mf.xc, dm0, dm1, 0, hermi, rho0, vxc, fxc, max_memory=max_memory) - if hybrid or abs(alpha) > 1e-10: + if hybrid: if hermi != 2: vj, vk = mf.get_jk(mol, dm1, hermi=hermi) vk *= hyb @@ -71,8 +76,45 @@ def vind(dm1): elif hermi != 2: v1 += mf.get_j(mol, dm1, hermi=hermi) return v1 - else: - raise NotImplementedError('only singlet response is supported!') + + elif singlet: + fxc *= .5 + def vind(dm1): + if hermi == 2: + v1 = cupy.zeros_like(dm1) + else: + # nr_rks_fxc_st requires alpha of dm1, dm1*.5 should be scaled + v1 = ni.nr_rks_fxc_st(mol, grids, mf.xc, dm0, dm1, 0, True, + rho0, vxc, fxc, max_memory=max_memory) + if hybrid: + if hermi != 2: + vj, vk = mf.get_jk(mol, dm1, hermi=hermi) + vk *= hyb + if abs(omega) > 1e-10: # For range separated Coulomb + vk += mf.get_k(mol, dm1, hermi, omega) * (alpha-hyb) + v1 += vj - .5 * vk + else: + v1 -= .5 * hyb * mf.get_k(mol, dm1, hermi=hermi) + elif hermi != 2: + v1 += mf.get_j(mol, dm1, hermi=hermi) + return v1 + + else: # triplet + fxc *= .5 + def vind(dm1): + if hermi == 2: + v1 = cupy.zeros_like(dm1) + else: + # nr_rks_fxc_st requires alpha of dm1, dm1*.5 should be scaled + v1 = ni.nr_rks_fxc_st(mol, grids, mf.xc, dm0, dm1, 0, False, + rho0, vxc, fxc, max_memory=max_memory) + if hybrid: + vk = mf.get_k(mol, dm1, hermi=hermi) + vk *= hyb + if abs(omega) > 1e-10: # For range separated Coulomb + vk += mf.get_k(mol, dm1, hermi, omega) * (alpha-hyb) + v1 += -.5 * vk + return v1 else: # HF if (singlet is None or singlet) and hermi != 2: @@ -87,7 +129,7 @@ def vind(dm1): def _gen_uhf_response(mf, mo_coeff=None, mo_occ=None, - with_j=True, hermi=0, max_memory=None): + with_j=True, hermi=0, grids=None, max_memory=None): '''Generate a function to compute the product of UHF response function and UHF density matrices. 
    '''
@@ -96,6 +138,10 @@ def _gen_uhf_response(mf, mo_coeff=None, mo_occ=None,
     if mo_occ is None: mo_occ = mf.mo_occ
     mol = mf.mol
     if isinstance(mf, hf.KohnShamDFT):
+        if grids is None:
+            grids = mf.grids
+        if grids and grids.coords is None:
+            grids.build(mol=mol, with_non0tab=False, sort_grids=True)
         ni = mf._numint
         ni.libxc.test_deriv_order(mf.xc, 2, raise_error=True)
         if mf.do_nlc():
@@ -105,19 +151,15 @@ def _gen_uhf_response(mf, mo_coeff=None, mo_occ=None,
         omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, mol.spin)
         hybrid = ni.libxc.is_hybrid_xc(mf.xc)
-        rho0, vxc, fxc = ni.cache_xc_kernel(mol, mf.grids, mf.xc,
+        rho0, vxc, fxc = ni.cache_xc_kernel(mol, grids, mf.xc,
                                             mo_coeff, mo_occ, 1)
         dm0 = None
-        if max_memory is None:
-            mem_now = lib.current_memory()[0]
-            max_memory = max(2000, mf.max_memory*.8-mem_now)
-
         def vind(dm1):
             if hermi == 2:
                 v1 = cupy.zeros_like(dm1)
             else:
-                v1 = ni.nr_uks_fxc(mol, mf.grids, mf.xc, dm0, dm1, 0, hermi,
+                v1 = ni.nr_uks_fxc(mol, grids, mf.xc, dm0, dm1, 0, hermi,
                                    rho0, vxc, fxc, max_memory=max_memory)
             if not hybrid:
                 if with_j:
diff --git a/gpu4pyscf/scf/hf.py b/gpu4pyscf/scf/hf.py
index b84d0a58..a069d89b 100644
--- a/gpu4pyscf/scf/hf.py
+++ b/gpu4pyscf/scf/hf.py
@@ -25,13 +25,14 @@ from pyscf.scf import hf
 from pyscf.scf import chkfile
 from gpu4pyscf import lib
+from gpu4pyscf.lib import utils
 from gpu4pyscf.lib.cupy_helper import eigh, tag_array, return_cupy_array, cond
 from gpu4pyscf.scf import diis, jk
 from gpu4pyscf.lib import logger

 __all__ = [
     'get_jk', 'get_occ', 'get_grad', 'damping', 'level_shift', 'get_fock',
-    'energy_elec', 'RHF'
+    'energy_elec', 'RHF', 'SCF'
 ]

 def get_jk(mol, dm, hermi=1, vhfopt=None, with_j=True, with_k=True, omega=None,
@@ -238,33 +239,13 @@ def _kernel(mf, conv_tol=1e-10, conv_tol_grad=None,
             scf_conv = True
             break

-    if(cycle == mf.max_cycle):
-        logger.warn("SCF failed to converge")
+    if (cycle + 1 == mf.max_cycle):
+        assert not scf_conv
+        logger.warn(mf, "SCF failed to converge")

     return scf_conv, e_tot, mo_energy, mo_coeff, mo_occ

-def _quad_moment(mf, mol=None, dm=None, unit='Debye-Ang'):
-    from pyscf.data import nist
-    if mol is None: mol = mf.mol
-    if dm is None: dm = mf.make_rdm1()
-    nao = mol.nao
-    with mol.with_common_orig((0,0,0)):
-        ao_quad = mol.intor_symmetric('int1e_rr').reshape(3,3,nao,nao)
-
-    el_quad = np.einsum('xyij,ji->xy', ao_quad, dm).real
-
-    # Nuclear contribution
-    charges = mol.atom_charges()
-    coords = mol.atom_coords()
-    nucl_quad = np.einsum('i,ix,iy->xy', charges, coords, coords)
-
-    mol_quad = nucl_quad - el_quad
-
-    if unit.upper() == 'DEBYE-ANG':
-        mol_quad *= nist.AU2DEBYE * nist.BOHR
-    return mol_quad
-
 def energy_tot(mf, dm=None, h1e=None, vhf=None):
     r'''Total Hartree-Fock energy, electronic part plus nuclear repulsion
     See :func:`scf.hf.energy_elec` for the electron part
@@ -310,6 +291,27 @@ def scf(mf, dm0=None, **kwargs):
         mf._finalize()
         return mf.e_tot

+def canonicalize(mf, mo_coeff, mo_occ, fock=None):
+    '''Canonicalization diagonalizes the Fock matrix within occupied, open,
+    virtual subspaces separately (without changing occupancy).
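    A minimal usage sketch, assuming a converged gpu4pyscf RHF object mf
    (after a second-order solve the orbitals are generally not Fock
    eigenvectors until canonicalized):

        dm = mf.make_rdm1(mf.mo_coeff, mf.mo_occ)
        fock = mf.get_fock(dm=dm)
        mo_e, mo = canonicalize(mf, mf.mo_coeff, mf.mo_occ, fock)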
+ ''' + if fock is None: + dm = mf.make_rdm1(mo_coeff, mo_occ) + fock = mf.get_fock(dm=dm) + coreidx = mo_occ == 2 + viridx = mo_occ == 0 + openidx = ~(coreidx | viridx) + mo = cupy.empty_like(mo_coeff) + mo_e = cupy.empty(mo_occ.size) + for idx in (coreidx, openidx, viridx): + if cupy.any(idx) > 0: + orb = mo_coeff[:,idx] + f1 = orb.conj().T.dot(fock).dot(orb) + e, c = cupy.linalg.eigh(f1) + mo[:,idx] = orb.dot(c) + mo_e[idx] = e + return mo_e, mo + def as_scanner(mf): if isinstance(mf, pyscf_lib.SinglePointScanner): return mf @@ -354,9 +356,10 @@ class SCF(pyscf_lib.StreamObject): conv_tol_grad = hf.SCF.conv_tol_grad max_cycle = hf.SCF.max_cycle init_guess = hf.SCF.init_guess + conv_tol_cpscf = 1e-4 disp = None - DIIS = hf.SCF.DIIS + DIIS = diis.SCF_DIIS diis = hf.SCF.diis diis_space = hf.SCF.diis_space diis_damp = hf.SCF.diis_damp @@ -410,9 +413,11 @@ def check_sanity(self): build = hf.SCF.build opt = NotImplemented dump_flags = hf.SCF.dump_flags - get_fock = hf.SCF.get_fock - get_occ = hf.SCF.get_occ - get_grad = hf.SCF.get_grad + get_hcore = return_cupy_array(hf.SCF.get_hcore) + get_ovlp = return_cupy_array(hf.SCF.get_ovlp) + get_fock = get_fock + get_occ = get_occ + get_grad = staticmethod(get_grad) dump_chk = hf.SCF.dump_chk init_guess_by_minao = hf.SCF.init_guess_by_minao init_guess_by_atom = hf.SCF.init_guess_by_atom @@ -421,41 +426,65 @@ def check_sanity(self): init_guess_by_1e = hf.SCF.init_guess_by_1e init_guess_by_chkfile = hf.SCF.init_guess_by_chkfile from_chk = hf.SCF.from_chk - get_init_guess = hf.SCF.get_init_guess - make_rdm1 = hf.SCF.make_rdm1 - make_rdm2 = hf.SCF.make_rdm2 - energy_elec = hf.SCF.energy_elec - energy_tot = hf.SCF.energy_tot + get_init_guess = return_cupy_array(hf.SCF.get_init_guess) + make_rdm1 = make_rdm1 + make_rdm2 = NotImplemented + energy_elec = energy_elec + energy_tot = energy_tot energy_nuc = hf.SCF.energy_nuc check_convergence = None _eigh = staticmethod(eigh) eig = hf.SCF.eig do_disp = hf.SCF.do_disp get_dispersion = hf.SCF.get_dispersion - - scf = hf.SCF.scf + kernel = scf = scf as_scanner = hf.SCF.as_scanner _finalize = hf.SCF._finalize init_direct_scf = hf.SCF.init_direct_scf - get_jk = hf.SCF.get_jk + get_jk = _get_jk get_j = hf.SCF.get_j get_k = hf.SCF.get_k - get_veff = hf.SCF.get_veff - analyze = hf.SCF.analyze + get_veff = NotImplemented mulliken_meta = hf.SCF.mulliken_meta pop = hf.SCF.pop - dip_moment = hf.SCF.dip_moment _is_mem_enough = NotImplemented density_fit = NotImplemented - sfx2c1e = NotImplemented - x2c1e = NotImplemented - x2c = NotImplemented newton = NotImplemented - remove_soscf = NotImplemented + x2c = x2c1e = sfx2c1e = NotImplemented stability = NotImplemented nuc_grad_method = NotImplemented update_ = NotImplemented + canonicalize = NotImplemented istype = hf.SCF.istype + to_rhf = NotImplemented + to_uhf = NotImplemented + to_ghf = NotImplemented + to_rks = NotImplemented + to_uks = NotImplemented + to_gks = NotImplemented + to_ks = NotImplemented + canonicalize = NotImplemented + mulliken_pop = NotImplemented + mulliken_meta = NotImplemented + + def dip_moment(self, mol=None, dm=None, unit='Debye', origin=None, + verbose=logger.NOTE): + if mol is None: mol = self.mol + if dm is None: dm = self.make_rdm1() + return hf.dip_moment(mol, dm.get(), unit, origin, verbose) + + def quad_moment(self, mol=None, dm=None, unit='DebyeAngstrom', origin=None, + verbose=logger.NOTE): + if mol is None: mol = self.mol + if dm is None: dm = self.make_rdm1() + return hf.quad_moment(mol, dm.get(), unit, origin, verbose) + + def 
remove_soscf(self): + lib.logger.warn('remove_soscf has no effect in current version') + return self + + def analyze(self, *args, **kwargs): + return self.to_cpu().analyze() def reset(self, mol=None): if mol is not None: @@ -469,7 +498,6 @@ class KohnShamDFT: A mock DFT base class, to be compatible with PySCF ''' -from gpu4pyscf.lib import utils class RHF(SCF): to_gpu = utils.to_gpu @@ -477,42 +505,8 @@ class RHF(SCF): _keys = {'e_disp', 'h1e', 's1e', 'e_mf', 'conv_tol_cpscf', 'disp_with_3body'} - conv_tol_cpscf = 1e-4 - DIIS = diis.SCF_DIIS - get_jk = _get_jk - _eigh = staticmethod(eigh) - make_rdm1 = make_rdm1 - energy_elec = energy_elec - get_fock = get_fock - get_occ = get_occ get_veff = get_veff - get_grad = staticmethod(get_grad) - quad_moment = _quad_moment - energy_tot = energy_tot - - get_hcore = return_cupy_array(hf.RHF.get_hcore) - get_ovlp = return_cupy_array(hf.RHF.get_ovlp) - get_init_guess = return_cupy_array(hf.RHF.get_init_guess) - init_direct_scf = NotImplemented - make_rdm2 = NotImplemented - newton = NotImplemented - x2c = x2c1e = sfx2c1e = NotImplemented - to_rhf = NotImplemented - to_uhf = NotImplemented - to_ghf = NotImplemented - to_rks = NotImplemented - to_uks = NotImplemented - to_gks = NotImplemented - to_ks = NotImplemented - canonicalize = NotImplemented - # TODO: Enable followings after testing - analyze = NotImplemented - stability = NotImplemented - mulliken_pop = NotImplemented - mulliken_meta = NotImplemented - - scf = scf - kernel = scf + canonicalize = canonicalize def check_sanity(self): mol = self.mol @@ -529,6 +523,10 @@ def density_fit(self, auxbasis=None, with_df=None, only_dfj=False): import gpu4pyscf.df.df_jk return gpu4pyscf.df.df_jk.density_fit(self, auxbasis, with_df, only_dfj) + def newton(self): + from gpu4pyscf.scf.soscf import newton + return newton(self) + def to_cpu(self): mf = hf.RHF(self.mol) utils.to_cpu(self, out=mf) diff --git a/gpu4pyscf/scf/hf_symm.py b/gpu4pyscf/scf/hf_symm.py new file mode 100644 index 00000000..486c02fd --- /dev/null +++ b/gpu4pyscf/scf/hf_symm.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python +# +# Copyright 2024 The PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
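# Hedged note: point-group symmetry adaptation is not implemented on GPU, so
# the SymAdapted* names in this module are plain aliases that keep imports
# written against pyscf.scf.hf_symm working.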
+ +from gpu4pyscf.scf.hf import RHF +from gpu4pyscf.scf.rohf import ROHF + +SymAdaptedRHF = RHF +SymAdaptedROHF = ROHF diff --git a/gpu4pyscf/scf/int2c2e.py b/gpu4pyscf/scf/int2c2e.py index 8ec1564d..0dbc8730 100644 --- a/gpu4pyscf/scf/int2c2e.py +++ b/gpu4pyscf/scf/int2c2e.py @@ -33,7 +33,7 @@ def get_int2c2e_sorted(mol, intopt=None, direct_scf_tol=1e-13, aosym=None, omega nao = mol.nao rows, cols = np.tril_indices(nao) - nao_cart = intopt.mol.nao + nao_cart = intopt._sorted_mol.nao norb_cart = nao_cart + 1 int2c = cupy.zeros([nao_cart, nao_cart], order='F') @@ -137,5 +137,5 @@ def get_int2c2e(mol, direct_scf_tol=1e-13): intopt = VHFOpt(mol, mol, 'int2e') intopt.build(direct_scf_tol, diag_block_with_triu=True, aosym=True) int2c = get_int2c2e_sorted(mol, intopt=intopt) - int2c = take_last2d(int2c, intopt.rev_ao_idx) + int2c = intopt.unsort_orbitals(int2c, axis=[0,1]) return int2c diff --git a/gpu4pyscf/scf/j_engine.py b/gpu4pyscf/scf/j_engine.py new file mode 100644 index 00000000..7ec884b8 --- /dev/null +++ b/gpu4pyscf/scf/j_engine.py @@ -0,0 +1,230 @@ +#!/usr/bin/env python +# +# Copyright 2024 The PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +''' +J engine using McMurchie-Davidson algorithm +''' + +import ctypes +import functools +import math +import numpy as np +import cupy as cp +import scipy.linalg +from pyscf import lib +from pyscf import __config__ +from gpu4pyscf.lib.cupy_helper import load_library, condense, sandwich_dot, transpose_sum +from gpu4pyscf.__config__ import props as gpu_specs +from gpu4pyscf.lib import logger +from gpu4pyscf.scf import jk +from gpu4pyscf.scf.jk import _make_j_engine_pair_locs, RysIntEnvVars + +__all__ = [ + 'get_j', +] + +PTR_BAS_COORD = 7 +LMAX = 4 +SHM_SIZE = getattr(__config__, 'GPU_SHM_SIZE', + int(gpu_specs['sharedMemPerBlockOptin']//9)*8) +THREADS = 256 + +libvhf_md = load_library('libgvhf_md') +libvhf_md.MD_build_j.restype = ctypes.c_int + +def get_j(mol, dm, hermi=1, vhfopt=None, omega=None, verbose=None): + '''Compute J matrix + ''' + log = logger.new_logger(mol, verbose) + cput0 = log.init_timer() + if vhfopt is None: + with mol.with_range_coulomb(omega): + vhfopt = _VHFOpt(mol).build() + if omega is None: + omega = mol.omega + + mol = vhfopt.mol + nbas = mol.nbas + nao, nao_orig = vhfopt.coeff.shape + dm = cp.asarray(dm, order='C') + dms = dm.reshape(-1,nao_orig,nao_orig) + n_dm = dms.shape[0] + assert n_dm == 1 + #:dms = cp.einsum('pi,nij,qj->npq', vhfopt.coeff, dms, vhfopt.coeff) + dms = sandwich_dot(dms, vhfopt.coeff.T) + dms = cp.asarray(dms, order='C') + if hermi != 1: + dms = transpose_sum(dms) + else: + dms *= 2. 
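# Clarifying comment (an assumption about intent): the J engine contracts only
# symmetry-unique shell pairs, so it needs the symmetrized density dm + dm.T;
# when the input is already hermitian, scaling by 2 is equivalent and cheaper.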
+ + ao_loc = mol.ao_loc + dm_cond = cp.log(condense('absmax', dms, ao_loc) + 1e-300).astype(np.float32) + log_max_dm = dm_cond.max() + log_cutoff = math.log(vhfopt.direct_scf_tol) + + dms = dms.get() + pair_loc = _make_j_engine_pair_locs(mol) + dm_xyz = np.zeros(pair_loc[-1]) + # Must use this modified _env to ensure the consistency with GPU kernel + # In this _env, normalization coefficients for s and p funcitons are scaled. + _env = vhfopt._mol_gpu[2].get() + libvhf_md.Et_dot_dm( + dm_xyz.ctypes, dms.ctypes, ao_loc.ctypes, pair_loc.ctypes, + mol._bas.ctypes, ctypes.c_int(mol.nbas), _env.ctypes) + dm_xyz = cp.asarray(dm_xyz) + vj_xyz = cp.zeros_like(dm_xyz) + + pair_loc_on_gpu = cp.asarray(pair_loc) + rys_envs = RysIntEnvVars( + mol.natm, mol.nbas, + vhfopt.rys_envs.atm, vhfopt.rys_envs.bas, vhfopt.rys_envs.env, + pair_loc_on_gpu.data.ptr, + ) + + libvhf_md.init_mdj_constant(ctypes.c_int(SHM_SIZE)) + + uniq_l_ctr = vhfopt.uniq_l_ctr + uniq_l = uniq_l_ctr[:,0] + l_ctr_bas_loc = vhfopt.l_ctr_offsets + l_symb = [lib.param.ANGULAR[i] for i in uniq_l] + n_groups = len(uniq_l_ctr) + tile_mappings = {} + workers = gpu_specs['multiProcessorCount'] + info = cp.empty(2, dtype=np.uint32) + + for i in range(n_groups): + for j in range(i+1): + ish0, ish1 = l_ctr_bas_loc[i], l_ctr_bas_loc[i+1] + jsh0, jsh1 = l_ctr_bas_loc[j], l_ctr_bas_loc[j+1] + ij_shls = (ish0, ish1, jsh0, jsh1) + sub_q = vhfopt.q_cond[ish0:ish1,jsh0:jsh1] + mask = sub_q > log_cutoff# - log_max_dm + if i == j: + mask = cp.tril(mask) + t_ij = (cp.arange(ish0, ish1, dtype=np.int32)[:,None] * nbas + + cp.arange(jsh0, jsh1, dtype=np.int32)) + idx = cp.argsort(sub_q[mask])[::-1] + tile_mappings[i,j] = t_ij[mask][idx] + t1 = t2 = log.timer_debug1('q_cond and dm_cond', *cput0) + + timing_collection = {} + kern_counts = 0 + kern = libvhf_md.MD_build_j + + for i in range(n_groups): + for j in range(i+1): + ij_shls = (l_ctr_bas_loc[i], l_ctr_bas_loc[i+1], + l_ctr_bas_loc[j], l_ctr_bas_loc[j+1]) + tile_ij_mapping = tile_mappings[i,j] + for k in range(i+1): + for l in range(k+1): + if i == k and j < l: continue + llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})' + kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1], + l_ctr_bas_loc[l], l_ctr_bas_loc[l+1]) + tile_kl_mapping = tile_mappings[k,l] + scheme = _md_j_engine_quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]]) + err = kern( + ctypes.cast(vj_xyz.data.ptr, ctypes.c_void_p), + ctypes.cast(dm_xyz.data.ptr, ctypes.c_void_p), + ctypes.c_int(n_dm), ctypes.c_int(nao), + rys_envs, (ctypes.c_int*3)(*scheme), + (ctypes.c_int*8)(*ij_shls, *kl_shls), + ctypes.c_int(tile_ij_mapping.size), + ctypes.c_int(tile_kl_mapping.size), + ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p), + ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p), + ctypes.cast(vhfopt.tile_q_cond.data.ptr, ctypes.c_void_p), + ctypes.cast(vhfopt.q_cond.data.ptr, ctypes.c_void_p), + lib.c_null_ptr(), + ctypes.c_float(log_cutoff-log_max_dm), + ctypes.cast(info.data.ptr, ctypes.c_void_p), + ctypes.c_int(workers), ctypes.c_double(omega), + mol._atm.ctypes, ctypes.c_int(mol.natm), + mol._bas.ctypes, ctypes.c_int(mol.nbas), _env.ctypes) + if err != 0: + raise RuntimeError(f'RYS_build_jk kernel for {llll} failed') + if log.verbose >= logger.DEBUG1: + ntasks = tile_ij_mapping.size * tile_kl_mapping.size + t1, t1p = log.timer_debug1(f'processing {llll}, tasks ~= {ntasks}', *t1), t1 + if llll not in timing_collection: + timing_collection[llll] = 0 + timing_collection[llll] += t1[1] - t1p[1] + kern_counts += 1 + + if log.verbose >= 
logger.DEBUG1: + log.debug1('kernel launches %d', kern_counts) + for llll, t in timing_collection.items(): + log.debug1('%s wall time %.2f', llll, t) + cp.cuda.Stream.null.synchronize() + log.timer_debug1('cuda kernel', *t2) + + vj_xyz = vj_xyz.get() + vj = np.zeros_like(dms) + libvhf_md.jengine_dot_Et( + vj.ctypes, vj_xyz.ctypes, ao_loc.ctypes, pair_loc.ctypes, + mol._bas.ctypes, ctypes.c_int(mol.nbas), _env.ctypes) + #:vj = cp.einsum('pi,npq,qj->nij', vhfopt.coeff, cp.asarray(vj), vhfopt.coeff) + vj = sandwich_dot(vj, vhfopt.coeff) + vj = transpose_sum(vj) + vj = vj.reshape(dm.shape) + log.timer('vj', *cput0) + return vj + +class _VHFOpt(jk._VHFOpt): + def __init__(self, mol, cutoff=1e-13): + self.mol, self.coeff = mol.decontract_basis(to_cart=True, aggregate=True) + self.direct_scf_tol = cutoff + self.uniq_l_ctr = None + self.l_ctr_offsets = None + self.q_cond = None + self.tile_q_cond = None + self.tile = 1 + +def _md_j_engine_quartets_scheme(mol, l_ctr_pattern, shm_size=SHM_SIZE): + ls = l_ctr_pattern[:,0] + li, lj, lk, ll = ls + order = li + lj + lk + ll + lij = li + lj + lkl = lk + ll + nf3ij = (lij+1)*(lij+2)*(lij+3)//6 + nf3kl = (lkl+1)*(lkl+2)*(lkl+3)//6 + unit = order+1 + (order+1)*(order+2)*(2*order+3)//6 + counts = shm_size // (unit*8) + if counts >= THREADS: + nsq = THREADS + else: + nsq = _nearest_power2(counts) + ij = _nearest_power2(int(nsq**.5)) + kl = nsq // ij + tilex, tiley = 2, 4 + cache_size = ij*tilex * (4+nf3ij) + kl*tiley * (4+nf3kl) + while (nsq * unit + cache_size) * 8 > shm_size: + nsq //= 2 + ij = _nearest_power2(int(nsq**.5)) + kl = nsq // ij + cache_size = ij*tilex * (4+nf3ij) + kl*tiley * (4+nf3kl) + gout_stride = THREADS // nsq + return ij, kl, gout_stride + +def _nearest_power2(n): + t = 0 + while n > 1: + n >>= 1 + t += 1 + return 2**t diff --git a/gpu4pyscf/scf/jk.py b/gpu4pyscf/scf/jk.py index 13b19277..939ba956 100644 --- a/gpu4pyscf/scf/jk.py +++ b/gpu4pyscf/scf/jk.py @@ -1,3 +1,24 @@ +#!/usr/bin/env python +# +# Copyright 2024 The PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +''' +Compute J/K matrices +''' + import ctypes import math import numpy as np diff --git a/gpu4pyscf/scf/rohf.py b/gpu4pyscf/scf/rohf.py index 9e80a93b..67153195 100644 --- a/gpu4pyscf/scf/rohf.py +++ b/gpu4pyscf/scf/rohf.py @@ -15,29 +15,76 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . +from functools import reduce import numpy as np import cupy -from pyscf.scf import rohf +from pyscf.scf import rohf as rohf_cpu from gpu4pyscf.scf import hf, uhf -from gpu4pyscf.lib.cupy_helper import tag_array +from gpu4pyscf.lib.cupy_helper import tag_array, contract -class ROHF(rohf.ROHF, hf.RHF): +def get_roothaan_fock(focka_fockb, dma_dmb, s): + '''Roothaan's effective fock. + Ref. 
http://www-theor.ch.cam.ac.uk/people/ross/thesis/node15.html
+
+    ======== ======== ====== =========
+    space     closed   open   virtual
+    ======== ======== ====== =========
+    closed      Fc       Fb      Fc
+    open        Fb       Fc      Fa
+    virtual     Fc       Fa      Fc
+    ======== ======== ====== =========
+
+    where Fc = (Fa + Fb) / 2
+
+    Returns:
+        Roothaan effective Fock matrix
+    '''
+    nao = s.shape[0]
+    focka, fockb = focka_fockb
+    dma, dmb = dma_dmb
+    fc = (focka + fockb) * .5
+# Projector for core, open-shell, and virtual
+    pc = cupy.dot(dmb, s)
+    po = cupy.dot(dma-dmb, s)
+    pv = cupy.eye(nao) - cupy.dot(dma, s)
+    fock = reduce(cupy.dot, (pc.conj().T, fc, pc)) * .5
+    fock += reduce(cupy.dot, (po.conj().T, fc, po)) * .5
+    fock += reduce(cupy.dot, (pv.conj().T, fc, pv)) * .5
+    fock += reduce(cupy.dot, (po.conj().T, fockb, pc))
+    fock += reduce(cupy.dot, (po.conj().T, focka, pv))
+    fock += reduce(cupy.dot, (pv.conj().T, fc, pc))
+    fock = fock + fock.conj().T
+    fock = tag_array(fock, focka=focka, fockb=fockb)
+    return fock
+
+def canonicalize(mf, mo_coeff, mo_occ, fock=None):
+    '''Canonicalization diagonalizes the Fock matrix within occupied, open,
+    virtual subspaces separately (without changing occupancy).
+    '''
+    if getattr(fock, 'focka', None) is None:
+        dm = mf.make_rdm1(mo_coeff, mo_occ)
+        fock = mf.get_fock(dm=dm)
+    mo_e, mo_coeff = hf.canonicalize(mf, mo_coeff, mo_occ, fock)
+    fa, fb = fock.focka, fock.fockb
+    mo_ea = contract('pi,pi->i', mo_coeff.conj(), fa.dot(mo_coeff)).real
+    mo_eb = contract('pi,pi->i', mo_coeff.conj(), fb.dot(mo_coeff)).real
+    mo_e = tag_array(mo_e, mo_ea=mo_ea, mo_eb=mo_eb)
+    return mo_e, mo_coeff
+
+class ROHF(hf.RHF):
     from gpu4pyscf.lib.utils import to_cpu, to_gpu, device

+    nelec = rohf_cpu.ROHF.nelec
     get_jk = hf._get_jk
-    _eigh = hf.RHF._eigh
+    _eigh = staticmethod(hf.eigh)
     scf = kernel = hf.RHF.kernel
     # FIXME: Needs more tests for get_fock and get_occ
-    get_fock = hf.return_cupy_array(rohf.ROHF.get_fock)
-    get_occ = hf.return_cupy_array(rohf.ROHF.get_occ)
+    get_occ = hf.return_cupy_array(rohf_cpu.ROHF.get_occ)
     get_hcore = hf.RHF.get_hcore
     get_ovlp = hf.RHF.get_ovlp
     get_init_guess = uhf.UHF.get_init_guess
-    make_rdm1 = hf.return_cupy_array(rohf.ROHF.make_rdm1)
     make_rdm2 = NotImplemented
-    dump_chk = NotImplemented
-    newton = NotImplemented
     x2c = x2c1e = sfx2c1e = NotImplemented
     to_rhf = NotImplemented
     to_uhf = NotImplemented
@@ -46,18 +93,83 @@ class ROHF(rohf.ROHF, hf.RHF):
     to_uks = NotImplemented
     to_gks = NotImplemented
     to_ks = NotImplemented
-    canonicalize = NotImplemented
     analyze = NotImplemented
     stability = NotImplemented
     mulliken_pop = NotImplemented
     mulliken_meta = NotImplemented
     nuc_grad_method = NotImplemented
+    canonicalize = canonicalize
+
+    def make_rdm1(self, mo_coeff, mo_occ, **kwargs):
+        '''One-particle density matrix. mo_occ is a 1D array with occupancies 0, 1, or 2.
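        An illustrative decomposition, assuming a doublet (one open shell):

            mo_occ  = [2, 2, 2, 2, 1, 0, ...]
            mo_occa = mo_occ > 0     # alpha occupations: [1, 1, 1, 1, 1, 0, ...]
            mo_occb = mo_occ == 2    # beta occupations:  [1, 1, 1, 1, 0, 0, ...]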
+ ''' + if isinstance(mo_occ, cupy.ndarray) and mo_occ.ndim == 1: + mo_occa = (mo_occ > 0).astype(np.double) + mo_occb = (mo_occ ==2).astype(np.double) + else: + mo_occa, mo_occb = mo_occ + dm_a = cupy.dot(mo_coeff*mo_occa, mo_coeff.conj().T) + dm_b = cupy.dot(mo_coeff*mo_occb, mo_coeff.conj().T) + return tag_array((dm_a, dm_b), mo_coeff=mo_coeff, mo_occ=mo_occ) + + def eig(self, fock, s): + e, c = self._eigh(fock, s) + if getattr(fock, 'focka', None) is not None: + mo_ea = contract('pi,pi->i', c.conj(), fock.focka.dot(c)).real + mo_eb = contract('pi,pi->i', c.conj(), fock.fockb.dot(c)).real + e = tag_array(e, mo_ea=mo_ea, mo_eb=mo_eb) + return e, c + + def energy_elec(self, dm=None, h1e=None, vhf=None): + if dm is None: dm = self.make_rdm1() + elif isinstance(dm, cupy.ndarray) and dm.ndim == 2: + dm = [dm*.5, dm*.5] + return uhf.energy_elec(self, dm, h1e, vhf) + + def get_fock(self, h1e=None, s1e=None, vhf=None, dm=None, cycle=-1, diis=None, + diis_start_cycle=None, level_shift_factor=None, damp_factor=None, + fock_last=None): + '''Build fock matrix based on Roothaan's effective fock. + See also :func:`get_roothaan_fock` + ''' + if h1e is None: h1e = self.get_hcore() + if s1e is None: s1e = self.get_ovlp() + if vhf is None: vhf = self.get_veff(self.mol, dm) + if dm is None: dm = self.make_rdm1() + if isinstance(dm, cupy.ndarray) and dm.ndim == 2: + dm = [dm*.5, dm*.5] +# To Get orbital energy in get_occ, we saved alpha and beta fock, because +# Roothaan effective Fock cannot provide correct orbital energy with `eig` +# TODO, check other treatment J. Chem. Phys. 133, 141102 + focka = h1e + vhf[0] + fockb = h1e + vhf[1] + f = get_roothaan_fock((focka,fockb), dm, s1e) + if cycle < 0 and diis is None: # Not inside the SCF iteration + return f + + if diis_start_cycle is None: + diis_start_cycle = self.diis_start_cycle + if level_shift_factor is None: + level_shift_factor = self.level_shift + if damp_factor is None: + damp_factor = self.damp + + dm_tot = dm[0] + dm[1] + if 0 <= cycle < diis_start_cycle-1 and abs(damp_factor) > 1e-4 and fock_last is not None: + raise NotImplementedError('ROHF Fock-damping') + if diis and cycle >= diis_start_cycle: + f = diis.update(s1e, dm_tot, f, self, h1e, vhf, f_prev=fock_last) + if abs(level_shift_factor) > 1e-4: + f = hf.level_shift(s1e, dm_tot*.5, f, level_shift_factor) + f = tag_array(f, focka=focka, fockb=fockb) + return f + def get_veff(self, mol=None, dm=None, dm_last=None, vhf_last=0, hermi=1): if mol is None: mol = self.mol if dm is None: dm = self.make_rdm1() if getattr(dm, 'ndim', 0) == 2: - dm = cupy.asarray((dm*.5,dm*.5)) + dm = cupy.stack((dm*.5,dm*.5)) if dm_last is None or not self.direct_scf: if getattr(dm, 'mo_coeff', None) is not None: @@ -74,3 +186,35 @@ def get_veff(self, mol=None, dm=None, dm_last=None, vhf_last=0, hermi=1): vhf = vj[0] + vj[1] - vk vhf += vhf_last return vhf + + def get_grad(self, mo_coeff, mo_occ, fock): + '''ROHF gradients is the off-diagonal block [co + cv + ov], where + [ cc co cv ] + [ oc oo ov ] + [ vc vo vv ] + ''' + occidxa = mo_occ > 0 + occidxb = mo_occ == 2 + viridxa = ~occidxa + viridxb = ~occidxb + uniq_var_a = viridxa.reshape(-1,1) & occidxa + uniq_var_b = viridxb.reshape(-1,1) & occidxb + + if getattr(fock, 'focka', None) is not None: + focka = fock.focka + fockb = fock.fockb + elif isinstance(fock, (tuple, list)) or getattr(fock, 'ndim', None) == 3: + focka, fockb = fock + else: + focka = fockb = fock + focka = mo_coeff.conj().T.dot(focka).dot(mo_coeff) + fockb = 
mo_coeff.conj().T.dot(fockb).dot(mo_coeff) + + g = cupy.zeros_like(focka) + g[uniq_var_a] = focka[uniq_var_a] + g[uniq_var_b] += fockb[uniq_var_b] + return g[uniq_var_a | uniq_var_b] + + def newton(self): + from gpu4pyscf.scf.soscf import newton + return newton(self) diff --git a/gpu4pyscf/scf/soscf.py b/gpu4pyscf/scf/soscf.py new file mode 100644 index 00000000..f64aa441 --- /dev/null +++ b/gpu4pyscf/scf/soscf.py @@ -0,0 +1,704 @@ +#!/usr/bin/env python +# +# Copyright 2024 The PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +''' +Second order SCF solver +''' + +import sys +import math +import numpy as np +import cupy as cp +import scipy.linalg +from cupyx.scipy.linalg import expm +from pyscf import lib +from pyscf.scf import chkfile +from pyscf.soscf import ciah +from pyscf.soscf.newton_ah import _CIAH_SOSCF as _SOSCF_cpu +from gpu4pyscf.lib import logger +from gpu4pyscf.scf import hf, rohf, uhf +from gpu4pyscf.lib.cupy_helper import transpose_sum, contract +from gpu4pyscf.lib import utils + +def gen_g_hop_rhf(mf, mo_coeff, mo_occ, fock_ao=None, h1e=None): + assert mo_coeff.dtype == np.float64 + occidx = cp.nonzero(mo_occ==2)[0] + viridx = cp.nonzero(mo_occ==0)[0] + orbo = mo_coeff[:,occidx] + orbv = mo_coeff[:,viridx] + nocc = orbo.shape[1] + nvir = orbv.shape[1] + + if fock_ao is None: + dm0 = mf.make_rdm1(mo_coeff, mo_occ) + fock_ao = mf.get_fock(h1e, dm=dm0) + fock = mo_coeff.conj().T.dot(fock_ao).dot(mo_coeff) + foo = fock[occidx[:,None],occidx] + fvv = fock[viridx[:,None],viridx] + + g = fock[viridx[:,None],occidx] * 2 + h_diag = (fvv.diagonal().real[:,None] - foo.diagonal().real) * 2 + + vind = mf.gen_response(mo_coeff, mo_occ, singlet=None, hermi=1) + + def h_op(x): + x = x.reshape(nvir,nocc) + x2 = contract('ps,sq->pq', fvv, x) + x2-= contract('ps,rp->rs', foo, x) + + # *2 for double occupancy + dm1 = orbv.dot(x*2).dot(orbo.conj().T) + dm1 = transpose_sum(dm1) + v1 = vind(dm1) + x2 += orbv.conj().T.dot(v1).dot(orbo) + return x2.ravel() * 2 + + return g.reshape(-1), h_op, h_diag.reshape(-1) + +def gen_g_hop_rohf(mf, mo_coeff, mo_occ, fock_ao=None, h1e=None): + if getattr(fock_ao, 'focka', None) is None: + dm0 = mf.make_rdm1(mo_coeff, mo_occ) + fock_ao = mf.get_fock(h1e, dm=dm0) + fock_ao = fock_ao.focka, fock_ao.fockb + mo_occa = occidxa = mo_occ > 0 + mo_occb = occidxb = mo_occ ==2 + ug, uh_op, uh_diag = gen_g_hop_uhf( + mf, (mo_coeff,)*2, (mo_occa,mo_occb), fock_ao, None) + + viridxa = ~occidxa + viridxb = ~occidxb + uniq_var_a = viridxa[:,None] & occidxa + uniq_var_b = viridxb[:,None] & occidxb + uniq_ab = uniq_var_a | uniq_var_b + nmo = mo_coeff.shape[-1] + nocca, noccb = mf.nelec + nvira = nmo - nocca + + def sum_ab(x): + x1 = cp.zeros((nmo,nmo), dtype=x.dtype) + x1[uniq_var_a] = x[:nvira*nocca] + x1[uniq_var_b] += x[nvira*nocca:] + return x1[uniq_ab] + + g = sum_ab(ug) + h_diag = sum_ab(uh_diag) + def h_op(x): + x1 = cp.zeros((nmo,nmo), dtype=x.dtype) + # unpack ROHF rotation 
parameters + x1[uniq_ab] = x + x1 = cp.hstack((x1[uniq_var_a],x1[uniq_var_b])) + return sum_ab(uh_op(x1)) + + return g, h_op, h_diag + +def gen_g_hop_uhf(mf, mo_coeff, mo_occ, fock_ao=None, h1e=None): + assert mo_coeff[0].dtype == np.float64 + occidxa = cp.nonzero(mo_occ[0] > 0)[0] + occidxb = cp.nonzero(mo_occ[1] > 0)[0] + viridxa = cp.nonzero(mo_occ[0] == 0)[0] + viridxb = cp.nonzero(mo_occ[1] == 0)[0] + orboa = mo_coeff[0][:,occidxa] + orbob = mo_coeff[1][:,occidxb] + orbva = mo_coeff[0][:,viridxa] + orbvb = mo_coeff[1][:,viridxb] + nmo = mo_occ[0].size + nocca, noccb = mf.nelec + nvira = nmo - nocca + nvirb = nmo - noccb + + if fock_ao is None: + dm0 = mf.make_rdm1(mo_coeff, mo_occ) + fock_ao = mf.get_fock(h1e, dm=dm0) + focka = mo_coeff[0].conj().T.dot(fock_ao[0]).dot(mo_coeff[0]) + fockb = mo_coeff[1].conj().T.dot(fock_ao[1]).dot(mo_coeff[1]) + fooa = focka[occidxa[:,None],occidxa] + fvva = focka[viridxa[:,None],viridxa] + foob = fockb[occidxb[:,None],occidxb] + fvvb = fockb[viridxb[:,None],viridxb] + + g = cp.hstack((focka[viridxa[:,None],occidxa].ravel(), + fockb[viridxb[:,None],occidxb].ravel())) + h_diaga = fvva.diagonal().real[:,None] - fooa.diagonal().real + h_diagb = fvvb.diagonal().real[:,None] - foob.diagonal().real + h_diag = cp.hstack((h_diaga.reshape(-1), h_diagb.reshape(-1))) + + vind = mf.gen_response(mo_coeff, mo_occ, hermi=1) + + def h_op(x): + x1a = x[:nvira*nocca].reshape(nvira,nocca) + x1b = x[nvira*nocca:].reshape(nvirb,noccb) + x2a = contract('pr,rq->pq', fvva, x1a) + x2a-= contract('sq,ps->pq', fooa, x1a) + x2b = contract('pr,rq->pq', fvvb, x1b) + x2b-= contract('sq,ps->pq', foob, x1b) + + d1a = orbva.dot(x1a).dot(orboa.conj().T) + d1b = orbvb.dot(x1b).dot(orbob.conj().T) + dm1 = cp.array([transpose_sum(d1a), + transpose_sum(d1b)]) + v1 = vind(dm1) + x2a += orbva.conj().T.dot(v1[0]).dot(orboa) + x2b += orbvb.conj().T.dot(v1[1]).dot(orbob) + return cp.hstack((x2a.ravel(), x2b.ravel())) + + return g, h_op, h_diag + + +def _rotate_orb_cc(mf, h1e, s1e, conv_tol_grad=None, verbose=None): + log = logger.new_logger(mf, verbose) + + if conv_tol_grad is None: + conv_tol_grad = (mf.conv_tol*.1)**.5 + #TODO: dynamically adjust max_stepsize, as done in mc1step.py + + def precond(x, e): + hdiagd = h_diag-(e-mf.ah_level_shift) + hdiagd[abs(hdiagd)<1e-8] = 1e-8 + x = x/hdiagd + return x + + t3m = log.init_timer() + u = g_kf = g_orb = norm_gorb = dxi = kfcount = jkcount = None + dm0 = vhf0 = None + g_op = lambda: g_orb + while True: + mo_coeff, mo_occ, dm0, vhf0, e_tot = (yield u, g_kf, kfcount, jkcount, dm0, vhf0) + fock_ao = mf.get_fock(h1e, s1e, vhf0, dm0) + + g_kf, h_op, h_diag = mf.gen_g_hop(mo_coeff, mo_occ, fock_ao) + norm_gkf = cp.linalg.norm(g_kf) + if g_orb is None: + log.debug(' |g|= %4.3g (keyframe)', norm_gkf) + kf_trust_region = mf.kf_trust_region + x0_guess = g_kf + else: + norm_dg = cp.linalg.norm(g_kf-g_orb) + log.debug(' |g|= %4.3g (keyframe), |g-correction|= %4.3g', + norm_gkf, norm_dg) + kf_trust_region = min(max(norm_gorb/(norm_dg+1e-9), mf.kf_trust_region), 10) + log.debug1('Set kf_trust_region = %g', kf_trust_region) + x0_guess = dxi + g_orb = g_kf + norm_gorb = norm_gkf + problem_size = g_orb.size + + ah_conv_tol = min(norm_gorb**2, mf.ah_conv_tol) + # increase the AH accuracy when approach convergence + ah_start_cycle = mf.ah_start_cycle + imic = 0 + dr = 0. + u = 1. 
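# Added bookkeeping notes (hedged): within one keyframe, dr accumulates the
# net orbital-rotation step and u the total rotation matrix; ikf counts micro
# iterations since the last keyframe, ihop counts augmented-Hessian (Davidson)
# iterations, and kfcount/jkcount track Fock-build work for logging.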
+ ukf = None + jkcount = 0 + kfcount = 0 + ikf = 0 + ihop = 0 + + for ah_end, ihop, w, dxi, hdxi, residual, seig \ + in _davidson_cc(h_op, g_op, precond, x0_guess, + tol=ah_conv_tol, max_cycle=mf.ah_max_cycle, + lindep=mf.ah_lindep, verbose=log): + norm_residual = cp.linalg.norm(residual) + ah_start_tol = min(norm_gorb*5, mf.ah_start_tol) + if (ah_end or ihop == mf.ah_max_cycle or # make sure to use the last step + ((norm_residual < ah_start_tol) and (ihop >= ah_start_cycle)) or + (seig < mf.ah_lindep)): + imic += 1 + dxmax = abs(dxi).max() + if ihop == problem_size: + log.debug1('... Hx=g fully converged for small systems') + elif dxmax > mf.max_stepsize: + scale = mf.max_stepsize / dxmax + log.debug1('... scale rotation size %g', scale) + dxi *= scale + hdxi *= scale + + dr = dr + dxi + g_orb = g_orb + hdxi + norm_dr = cp.linalg.norm(dr) + norm_gorb = cp.linalg.norm(g_orb) + norm_dxi = cp.linalg.norm(dxi) + log.debug(' imic %d(%d) |g|= %4.3g |dxi|= %4.3g ' + 'max(|x|)= %4.3g |dr|= %4.3g eig= %4.3g seig= %4.3g', + imic, ihop, norm_gorb, norm_dxi, + dxmax, norm_dr, w, seig) + + max_cycle = max(mf.max_cycle_inner, + mf.max_cycle_inner-int(math.log(norm_gkf+1e-9)*2)) + log.debug1('Set ah_start_tol %g, ah_start_cycle %d, max_cycle %d', + ah_start_tol, ah_start_cycle, max_cycle) + ikf += 1 + if imic > 3 and norm_gorb > norm_gkf*mf.ah_grad_trust_region: + g_orb = g_orb - hdxi + dr -= dxi + norm_gorb = cp.linalg.norm(g_orb) + log.debug('|g| >> keyframe, Restore previouse step') + break + + elif (imic >= max_cycle or norm_gorb < conv_tol_grad/mf.ah_grad_trust_region): + break + + elif (ikf > 2 and # avoid frequent keyframe + #TODO: replace it with keyframe_scheduler + (ikf >= max(mf.kf_interval, mf.kf_interval-math.log(norm_dr+1e-9)) or + # Insert keyframe if the keyframe and the estimated g_orb are too different + norm_gorb < norm_gkf/kf_trust_region)): + ikf = 0 + u = mf.update_rotate_matrix(dr, mo_occ, mo_coeff=mo_coeff) + if ukf is not None: + u = mf.rotate_mo(ukf, u) + ukf = u + dr[:] = 0 + mo1 = mf.rotate_mo(mo_coeff, u) + dm = mf.make_rdm1(mo1, mo_occ) + # use mf._scf.get_veff to avoid density-fit mf polluting get_veff + vhf0 = mf._scf.get_veff(mf._scf.mol, dm, dm_last=dm0, vhf_last=vhf0) + dm0 = dm + # Use API to compute fock instead of "fock=h1e+vhf0". This is because get_fock + # is the hook being overloaded in many places. + fock_ao = mf.get_fock(h1e, s1e, vhf0, dm0) + g_kf1 = mf.get_grad(mo1, mo_occ, fock_ao) + norm_gkf1 = cp.linalg.norm(g_kf1) + norm_dg = cp.linalg.norm(g_kf1-g_orb) + jkcount += 1 + kfcount += 1 + if log.verbose >= logger.DEBUG: + e_tot, e_last = mf._scf.energy_tot(dm, h1e, vhf0), e_tot + log.debug('Adjust keyframe g_orb to |g|= %4.3g ' + '|g-correction|=%4.3g E=%.12g dE=%.5g', + norm_gkf1, norm_dg, e_tot, e_tot-e_last) + + if (norm_dg < norm_gorb*mf.ah_grad_trust_region # kf not too diff + #or norm_gkf1 < norm_gkf # grad is decaying + # close to solution + or norm_gkf1 < conv_tol_grad*mf.ah_grad_trust_region): + kf_trust_region = min(max(norm_gorb/(norm_dg+1e-9), mf.kf_trust_region), 10) + log.debug1('Set kf_trust_region = %g', kf_trust_region) + g_orb = g_kf = g_kf1 + norm_gorb = norm_gkf = norm_gkf1 + else: + g_orb = g_orb - hdxi + dr -= dxi + norm_gorb = cp.linalg.norm(g_orb) + log.debug('Out of trust region. 
Restore previouse step') + break + + if ihop > 0: + u = mf.update_rotate_matrix(dr, mo_occ, mo_coeff=mo_coeff) + if ukf is not None: + u = mf.rotate_mo(ukf, u) + jkcount += ihop + 1 + log.debug(' tot inner=%d %d JK |g|= %4.3g |u-1|= %4.3g', + imic, jkcount, norm_gorb, cp.linalg.norm(dr)) + h_op = h_diag = None + t3m = log.timer('aug_hess in %d inner iters' % imic, *t3m) + +def _davidson_cc(h_op, g_op, precond, x0, tol=1e-10, xs=[], ax=[], + max_cycle=30, lindep=1e-14, verbose=logger.WARN): + if isinstance(verbose, logger.Logger): + log = verbose + else: + log = logger.Logger(sys.stdout, verbose) + + toloose = tol**.5 + # the first trial vector is (1,0,0,...), which is not included in xs + xs = list(xs) + ax = list(ax) + nx = len(xs) + + problem_size = x0.size + max_cycle = min(max_cycle, problem_size) + heff = np.zeros((max_cycle+nx+1,max_cycle+nx+1), dtype=x0.dtype) + ovlp = np.eye(max_cycle+nx+1, dtype=x0.dtype) + if nx == 0: + xs.append(x0) + ax.append(h_op(x0)) + else: + for i in range(1, nx+1): + for j in range(1, i+1): + heff[i,j] = xs[i-1].conj().dot(ax[j-1]) + ovlp[i,j] = xs[i-1].conj().dot(xs[j-1]) + heff[1:i,i] = heff[i,1:i].conj() + ovlp[1:i,i] = ovlp[i,1:i].conj() + + w_t = 0 + for istep in range(max_cycle): + g = g_op() + nx = len(xs) + for i in range(nx): + heff[i+1,0] = xs[i].conj().dot(g) + heff[nx,i+1] = xs[nx-1].conj().dot(ax[i]) + ovlp[nx,i+1] = xs[nx-1].conj().dot(xs[i]) + heff[0,:nx+1] = heff[:nx+1,0].conj() + heff[1:nx,nx] = heff[nx,1:nx].conj() + ovlp[1:nx,nx] = ovlp[nx,1:nx].conj() + nvec = nx + 1 + #s0 = scipy.linalg.eigh(ovlp[:nvec,:nvec])[0][0] + #if s0 < lindep: + # yield True, istep, w_t, xtrial, hx, dx, s0 + # break + wlast = w_t + xtrial, w_t, v_t, index, seig = \ + _regular_step(heff[:nvec,:nvec], ovlp[:nvec,:nvec], xs, + lindep, log) + s0 = seig[0] + hx = _dgemv(v_t[1:], ax) + # note g*v_t[0], as the first trial vector is (1,0,0,...) + dx = hx + g*v_t[0] - w_t * v_t[0]*xtrial + norm_dx = np.linalg.norm(dx) + log.debug1('... AH step %d index= %d |dx|= %.5g eig= %.5g v[0]= %.5g lindep= %.5g', + istep+1, index, norm_dx, w_t, v_t[0].real, s0) + hx *= 1/v_t[0] # == h_op(xtrial) + if ((abs(w_t-wlast) < tol and norm_dx < toloose) or + s0 < lindep or + istep+1 == problem_size): + # Avoid adding more trial vectors if hessian converged + yield True, istep+1, w_t, xtrial, hx, dx, s0 + if s0 < lindep or norm_dx < lindep:# or np.linalg.norm(xtrial) < lindep: + # stop the iteration because eigenvectors would be barely updated + break + else: + yield False, istep+1, w_t, xtrial, hx, dx, s0 + x0 = precond(dx, w_t) + xs.append(x0) + ax.append(h_op(x0)) + +def _regular_step(heff, ovlp, xs, lindep, log, root_id=0): + w, v, seig = lib.safe_eigh(heff, ovlp, lindep) + #if e[0] < -.1: + # sel = 0 + #else: + # There exists systems that the first eigenvalue of AH is -inf. + # Dynamically choosing the eigenvectors may be better. 
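# Added comment (hedged): select the lowest AH root whose eigenvector keeps a
# significant component (> 0.1) along the reference direction (1, 0, 0, ...);
# a root with v[0] ~ 0 predicts no meaningful orbital step.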
+ idx = np.nonzero(abs(v[0]) > 0.1)[0] + sel = idx[root_id] + log.debug1('CIAH eigen-sel %s', sel) + w_t = w[sel] + + if w_t < 1e-4: + try: + e, c = scipy.linalg.eigh(heff[1:,1:], ovlp[1:,1:]) + except scipy.linalg.LinAlgError: + e, c = lib.safe_eigh(heff[1:,1:], ovlp[1:,1:], lindep)[:2] + if np.any(e < -1e-5): + log.debug('Negative hessians found %s', e[e<0]) + + xtrial = _dgemv(v[1:,sel]/v[0,sel], xs) + return xtrial, w_t, v[:,sel], sel, seig + +def _dgemv(v, m): + vm = v[0] * m[0] + for i,vi in enumerate(v[1:]): + vm += vi * m[i+1] + return vm + + +def kernel(mf, mo_coeff=None, mo_occ=None, dm=None, + conv_tol=1e-10, conv_tol_grad=None, max_cycle=50, dump_chk=True, + callback=None, verbose=logger.NOTE): + log = logger.new_logger(mf, verbose) + cput0 = log.init_timer() + mol = mf._scf.mol + assert mol is mf.mol + + if conv_tol_grad is None: + conv_tol_grad = conv_tol**.5 + log.info('Set conv_tol_grad to %g', conv_tol_grad) + + # call mf._scf.get_hcore, mf._scf.get_ovlp because they might be overloaded + h1e = mf._scf.get_hcore(mol) + s1e = mf._scf.get_ovlp(mol) + + if mo_coeff is not None and mo_occ is not None: + dm = mf.make_rdm1(mo_coeff, mo_occ) + # call mf._scf.get_veff, to avoid "newton().density_fit()" polluting get_veff + vhf = mf._scf.get_veff(mol, dm) + fock = mf.get_fock(h1e, s1e, vhf, dm, level_shift_factor=0) + mo_energy, mo_tmp = mf.eig(fock, s1e) + mf.get_occ(mo_energy, mo_tmp) + mo_tmp = None + + else: + if dm is None: + dm = mf.get_init_guess(mol, mf.init_guess) + vhf = mf._scf.get_veff(mol, dm) + fock = mf.get_fock(h1e, s1e, vhf, dm, level_shift_factor=0) + mo_energy, mo_coeff = mf.eig(fock, s1e) + mo_occ = mf.get_occ(mo_energy, mo_coeff) + dm, dm_last = mf.make_rdm1(mo_coeff, mo_occ), dm + vhf = mf._scf.get_veff(mol, dm, dm_last=dm_last, vhf_last=vhf) + + # Save mo_coeff and mo_occ because they are needed by function rotate_mo + mf.mo_coeff, mf.mo_occ = mo_coeff, mo_occ + + e_tot = mf._scf.energy_tot(dm, h1e, vhf) + fock = mf.get_fock(h1e, s1e, vhf, dm, level_shift_factor=0) + log.info('Initial guess E= %.15g |g|= %g', e_tot, + cp.linalg.norm(mf._scf.get_grad(mo_coeff, mo_occ, fock))) + + if dump_chk and mf.chkfile: + chkfile.save_mol(mol, mf.chkfile) + + # Copy the integral file to soscf object to avoid the integrals being + # cached twice. + if mol is mf.mol and not getattr(mf, 'with_df', None): + mf._eri = mf._scf._eri + + rotaiter = _rotate_orb_cc(mf, h1e, s1e, conv_tol_grad, verbose=log) + next(rotaiter) # start the iterator + kftot = jktot = 0 + norm_gorb = 0. 
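# Added comment (hedged): _rotate_orb_cc is driven as a coroutine. Each macro
# cycle below sends (mo_coeff, mo_occ, dm, vhf, e_tot) into it and receives
# the rotation u together with the gradient, the keyframe/JK counters, and the
# latest density and veff.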
+    scf_conv = False
+    cput1 = log.timer('initializing second order scf', *cput0)
+
+    for imacro in range(max_cycle):
+        u, g_orb, kfcount, jkcount, dm_last, vhf = \
+                rotaiter.send((mo_coeff, mo_occ, dm, vhf, e_tot))
+        kftot += kfcount + 1
+        jktot += jkcount + 1
+
+        last_hf_e = e_tot
+        norm_gorb = cp.linalg.norm(g_orb)
+        mo_coeff = mf.rotate_mo(mo_coeff, u, log)
+        dm = mf.make_rdm1(mo_coeff, mo_occ)
+        vhf = mf._scf.get_veff(mol, dm, dm_last=dm_last, vhf_last=vhf)
+        fock = mf.get_fock(h1e, s1e, vhf, dm, level_shift_factor=0)
+        # NOTE: DO NOT change the initial guess mo_occ, mo_coeff
+        if mf.verbose >= logger.DEBUG:
+            mo_energy, mo_tmp = mf.eig(fock, s1e)
+            mf.get_occ(mo_energy, mo_tmp)
+        # call mf._scf.energy_tot for dft, because the (dft).get_veff step saved _exc in mf._scf
+        e_tot = mf._scf.energy_tot(dm, h1e, vhf)
+
+        log.info('macro= %d E= %.15g delta_E= %g |g|= %g %d KF %d JK',
+                 imacro, e_tot, e_tot-last_hf_e, norm_gorb,
+                 kfcount+1, jkcount)
+        cput1 = log.timer('cycle= %d'%(imacro+1), *cput1)
+
+        if callable(mf.check_convergence):
+            scf_conv = mf.check_convergence(locals())
+        elif abs(e_tot-last_hf_e) < conv_tol and norm_gorb < conv_tol_grad:
+            scf_conv = True
+
+        if dump_chk:
+            mf.dump_chk(locals())
+
+        if callable(callback):
+            callback(locals())
+
+        if scf_conv:
+            break
+
+    if callable(callback):
+        callback(locals())
+
+    rotaiter.close()
+    mo_energy, mo_coeff1 = mf._scf.canonicalize(mo_coeff, mo_occ, fock)
+    if mf.canonicalization:
+        log.info('Canonicalize SCF orbitals')
+        mo_coeff = mo_coeff1
+        if dump_chk:
+            mf.dump_chk(locals())
+    log.info('macro X = %d E=%.15g |g|= %g total %d KF %d JK',
+             imacro+1, e_tot, norm_gorb, kftot+1, jktot+1)
+
+    if cp.any(mo_occ==0):
+        homo = mo_energy[mo_occ>0].max()
+        lumo = mo_energy[mo_occ==0].min()
+        if homo > lumo:
+            log.warn('canonicalized orbital HOMO %s > LUMO %s ', homo, lumo)
+    return scf_conv, e_tot, mo_energy, mo_coeff, mo_occ
+
+# A tag to label the derived SCF class
+class _CIAH_SOSCF:
+    '''
+    Attributes for Newton solver:
+        max_cycle_inner : int
+            AH iterations within each macro iteration. Default is 10.
+        max_stepsize : float
+            The step size for orbital rotation. A small step size is preferred.
+            Default is 0.05.
+        canonicalization : bool
+            To control whether to canonicalize the orbitals optimized by the
+            Newton solver. Default is True.
+    '''
+
+    __name_mixin__ = 'SecondOrder'
+
+    max_cycle_inner = _SOSCF_cpu.max_cycle_inner
+    max_stepsize = _SOSCF_cpu.max_stepsize
+    canonicalization = _SOSCF_cpu.canonicalization
+
+    ah_start_tol = _SOSCF_cpu.ah_start_tol
+    ah_start_cycle = _SOSCF_cpu.ah_start_cycle
+    ah_level_shift = _SOSCF_cpu.ah_level_shift
+    ah_conv_tol = _SOSCF_cpu.ah_conv_tol
+    ah_lindep = _SOSCF_cpu.ah_lindep
+    ah_max_cycle = _SOSCF_cpu.ah_max_cycle
+    ah_grad_trust_region = _SOSCF_cpu.ah_grad_trust_region
+    kf_interval = _SOSCF_cpu.kf_interval
+    kf_trust_region = _SOSCF_cpu.kf_trust_region
+
+    _keys = _SOSCF_cpu._keys
+
+    to_gpu = utils.to_gpu
+    device = utils.device
+    to_cpu = utils.to_cpu
+
+    def __init__(self, mf):
+        self.__dict__.update(mf.__dict__)
+        self._scf = mf
+
+    def undo_soscf(self):
+        '''Remove the SOSCF Mixin'''
+        from gpu4pyscf.df.df_jk import _DFHF
+        if isinstance(self, _DFHF) and not isinstance(self._scf, _DFHF):
+            # where density fitting is only applied on the SOSCF hessian
+            mf = self.undo_df()
+        else:
+            mf = self
+        obj = lib.view(mf, lib.drop_class(mf.__class__, _CIAH_SOSCF))
+        del obj._scf
+        # When both self and self._scf are DF objects, they may be different df
+        # objects.
The DF object of the base scf object should be used. + if hasattr(self._scf, 'with_df'): + obj.with_df = self._scf.with_df + return obj + + undo_newton = undo_soscf + + def dump_flags(self, verbose=None): + log = logger.new_logger(self, verbose) + log.info('\n') + super().dump_flags(verbose) + log.info('******** %s Newton solver flags ********', self._scf.__class__) + log.info('max_cycle_inner = %d', self.max_cycle_inner) + log.info('max_stepsize = %g', self.max_stepsize) + log.info('ah_start_tol = %g', self.ah_start_tol) + log.info('ah_level_shift = %g', self.ah_level_shift) + log.info('ah_conv_tol = %g', self.ah_conv_tol) + log.info('ah_lindep = %g', self.ah_lindep) + log.info('ah_start_cycle = %d', self.ah_start_cycle) + log.info('ah_max_cycle = %d', self.ah_max_cycle) + log.info('ah_grad_trust_region = %g', self.ah_grad_trust_region) + log.info('kf_interval = %d', self.kf_interval) + log.info('kf_trust_region = %d', self.kf_trust_region) + log.info('canonicalization = %s', self.canonicalization) + return self + + build = _SOSCF_cpu.build + reset = _SOSCF_cpu.reset + + def kernel(self, mo_coeff=None, mo_occ=None, dm0=None): + if mo_coeff is None: mo_coeff = self.mo_coeff + if mo_occ is None: mo_occ = self.mo_occ + cput0 = logger.init_timer(self) + self.build(self.mol) + self.dump_flags() + + self.converged, self.e_tot, \ + self.mo_energy, self.mo_coeff, self.mo_occ = \ + kernel(self, mo_coeff, mo_occ, dm0, conv_tol=self.conv_tol, + conv_tol_grad=self.conv_tol_grad, + max_cycle=self.max_cycle, + callback=self.callback, verbose=self.verbose) + + logger.timer(self, 'Second order SCF', *cput0) + self._finalize() + return self.e_tot + + from_dm = _SOSCF_cpu.from_dm + + gen_g_hop = gen_g_hop_rhf + + def update_rotate_matrix(self, dx, mo_occ, u0=1, mo_coeff=None): + nmo = len(mo_occ) + x1 = cp.zeros((nmo,nmo), dtype=dx.dtype) + occidxa = mo_occ>0 + occidxb = mo_occ==2 + viridxa = ~occidxa + viridxb = ~occidxb + mask = (viridxa[:,None] & occidxa) | (viridxb[:,None] & occidxb) + x1[mask] = dx + dr = x1 - x1.conj().T + u = expm(dr) + if isinstance(u0, cp.ndarray): + u = u0.dot(u) + return u + + def rotate_mo(self, mo_coeff, u, log=None): + return mo_coeff.dot(u) + +class _SecondOrderROHF(_CIAH_SOSCF): + gen_g_hop = gen_g_hop_rohf + +class _SecondOrderUHF(_CIAH_SOSCF): + gen_g_hop = gen_g_hop_uhf + + def update_rotate_matrix(self, dx, mo_occ, u0=1, mo_coeff=None): + occidxa = mo_occ[0] > 0 + occidxb = mo_occ[1] > 0 + viridxa = ~occidxa + viridxb = ~occidxb + + nmo = len(occidxa) + dr = cp.zeros((2,nmo,nmo), dtype=dx.dtype) + uniq = cp.array((viridxa[:,None] & occidxa, + viridxb[:,None] & occidxb)) + dr[uniq] = dx + dr = dr - dr.conj().transpose(0,2,1) + + if isinstance(u0, int) and u0 == 1: + return cp.asarray((expm(dr[0]), expm(dr[1]))) + else: + return cp.asarray((u0[0].dot(expm(dr[0])), + u0[1].dot(expm(dr[1])))) + + def rotate_mo(self, mo_coeff, u, log=None): + mo = cp.asarray((mo_coeff[0].dot(u[0]), + mo_coeff[1].dot(u[1]))) + return mo + + def kernel(self, mo_coeff=None, mo_occ=None, dm0=None): + if isinstance(mo_coeff, cp.ndarray) and mo_coeff.ndim == 2: + mo_coeff = (mo_coeff, mo_coeff) + if isinstance(mo_occ, cp.ndarray) and mo_occ.ndim == 1: + mo_occ = (cp.asarray(mo_occ >0, dtype=np.float64), + cp.asarray(mo_occ==2, dtype=np.float64)) + return _CIAH_SOSCF.kernel(self, mo_coeff, mo_occ, dm0) + +class _SecondOrderRHF(_CIAH_SOSCF): + gen_g_hop = gen_g_hop_rhf + +def newton(mf): + if isinstance(mf, _CIAH_SOSCF): + return mf + + assert isinstance(mf, hf.SCF) + + if mf.istype('ROHF'): + cls = 
_SecondOrderROHF + elif mf.istype('UHF'): + cls = _SecondOrderUHF + elif mf.istype('GHF'): + raise NotImplementedError + elif mf.istype('RDHF'): + raise NotImplementedError + elif mf.istype('DHF'): + raise NotImplementedError + else: + cls = _SecondOrderRHF + return lib.set_class(cls(mf), (cls, mf.__class__)) diff --git a/gpu4pyscf/scf/tests/test_scf_j_engine.py b/gpu4pyscf/scf/tests/test_scf_j_engine.py new file mode 100644 index 00000000..19291e5a --- /dev/null +++ b/gpu4pyscf/scf/tests/test_scf_j_engine.py @@ -0,0 +1,45 @@ +# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import unittest +import numpy as np +import pyscf +from pyscf import lib +from gpu4pyscf.scf import j_engine +from pyscf.scf.hf import get_jk + +def test_j_engine(): + mol = pyscf.M( + atom = ''' + O 0.000 -0. 0.1174 + H -0.757 4. -0.4696 + H 0.757 4. -0.4696 + C 1. 1. 0. + H 4. 0. 3. + H 0. 1. .6 + ''', + basis='def2-tzvp', + unit='B',) + + np.random.seed(9) + nao = mol.nao + dm = np.random.rand(nao, nao) + dm = dm.dot(dm.T) + + vj = j_engine.get_j(mol, dm) + vj1 = vj.get() + ref = get_jk(mol, dm, with_k=False)[0] + assert abs(lib.fp(vj1) - -2327.4715195591784) < 1e-9 + assert abs(vj1 - ref).max() < 1e-9 diff --git a/gpu4pyscf/scf/tests/test_soscf.py b/gpu4pyscf/scf/tests/test_soscf.py new file mode 100644 index 00000000..b7fa3990 --- /dev/null +++ b/gpu4pyscf/scf/tests/test_soscf.py @@ -0,0 +1,224 @@ +# Copyright 2024 The GPU4PySCF Authors. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import unittest +import cupy as cp +from pyscf import gto +from gpu4pyscf import scf +from gpu4pyscf import dft + +def setUpModule(): + global h2o_z0, h2o_z1 + h2o_z0 = gto.M( + verbose = 5, + output = '/dev/null', + atom = [ + ["O" , (0. , 0. , 0.)], + [1 , (0. , -0.757 , 0.587)], + [1 , (0. , 0.757 , 0.587)] ], + basis = '6-31g') + + h2o_z1 = gto.M( + verbose = 5, + output = '/dev/null', + atom = [ + ["O" , (0. , 0. , 0.)], + [1 , (0. , -0.757 , 0.587)], + [1 , (0. 
, 0.757 , 0.587)] ], + basis = '6-31g', + charge = 1, + spin = 1,) + +def tearDownModule(): + global h2o_z0, h2o_z1 + h2o_z0.stdout.close() + h2o_z1.stdout.close() + del h2o_z0, h2o_z1 + +class KnownValues(unittest.TestCase): + def test_nr_rhf(self): + mf = scf.RHF(h2o_z0) + mf.max_cycle = 1 + mf.conv_check = False + mf.kernel() + nr = mf.newton() + nr.max_cycle = 2 + nr.conv_tol_grad = 1e-5 + self.assertAlmostEqual(nr.kernel(), -75.98394849812, 9) + + def test_nr_rohf(self): + mf = scf.ROHF(h2o_z1) + mf.max_cycle = 1 + mf.conv_check = False + mf.kernel() + nr = mf.newton() + nr.max_cycle = 20 + nr.conv_tol_grad = 1e-5 + self.assertAlmostEqual(nr.kernel(), -75.5783963795897, 9) + + def test_nr_uhf(self): + mf = scf.UHF(h2o_z1) + mf.max_cycle = 1 + mf.conv_check = False + mf.kernel() + nr = mf.newton() + nr.max_cycle = 2 + nr.conv_tol_grad = 1e-5 + self.assertAlmostEqual(nr.kernel(), -75.58051984397145, 9) + + def test_nr_rks_lda(self): + mf = dft.RKS(h2o_z0) + eref = mf.kernel() + mf.max_cycle = 1 + mf.conv_check = False + mf.kernel() + nr = mf.newton() + nr.max_cycle = 3 + nr.conv_tol_grad = 1e-5 + self.assertAlmostEqual(nr.kernel(), eref, 9) + + def test_nr_rks_rsh(self): + '''test range-separated Coulomb''' + mf = dft.RKS(h2o_z0) + mf.xc = 'wb97x' + eref = mf.kernel() + mf.max_cycle = 1 + mf.conv_check = False + mf.kernel() + nr = mf.newton() + nr.max_cycle = 3 + nr.conv_tol_grad = 1e-5 + self.assertAlmostEqual(nr.kernel(), eref, 9) + + def test_nr_rks(self): + mf = dft.RKS(h2o_z0) + mf.xc = 'b3lyp' + eref = mf.kernel() + mf.max_cycle = 1 + mf.conv_check = False + mf.kernel() + nr = mf.newton() + nr.max_cycle = 3 + nr.conv_tol_grad = 1e-5 + self.assertAlmostEqual(nr.kernel(), eref, 9) + + def test_rks_gen_g_hop(self): + mf = dft.RKS(h2o_z0) + mf.grids.build() + mf.xc = 'b3lyp' + nao = h2o_z0.nao_nr() + mo = cp.random.random((nao,nao)) + mo_occ = cp.zeros(nao) + mo_occ[:5] = 2 + nocc, nvir = 5, nao-5 + dm1 = cp.random.random(nvir*nocc) + nr = mf.newton() + g, hop, hdiag = nr.gen_g_hop(mo, mo_occ) + mf_cpu = mf.to_cpu().newton() + hop_ref = mf_cpu.gen_g_hop(mo.get(), mo_occ.get())[1] + self.assertAlmostEqual(abs(hop(dm1).get() - hop_ref(dm1.get())).max(), 0, 9) + + def test_nr_roks(self): + mf = dft.RKS(h2o_z1) + mf.xc = 'b3lyp' + eref = mf.kernel() + + mf.max_cycle = 1 + mf.conv_check = False + mf.kernel() + nr = mf.newton() + nr.max_cycle = 3 + nr.conv_tol_grad = 1e-5 + self.assertAlmostEqual(nr.kernel(), eref, 9) + + def test_nr_uks_lda(self): + mf = dft.UKS(h2o_z1) + eref = mf.kernel() + + mf.max_cycle = 1 + mf.conv_check = False + mf.kernel() + nr = mf.newton() + nr.max_cycle = 2 + nr.conv_tol_grad = 1e-5 + self.assertAlmostEqual(nr.kernel(), eref, 9) + + def test_nr_uks_rsh(self): + '''test range-separated Coulomb''' + mf = dft.UKS(h2o_z1) + mf.xc = 'wb97x' + eref = mf.kernel() + + mf.max_cycle = 1 + mf.conv_check = False + mf.kernel() + nr = mf.newton() + nr.max_cycle = 3 + nr.conv_tol_grad = 1e-5 + self.assertAlmostEqual(nr.kernel(), eref, 9) + + def test_nr_uks(self): + mf = dft.UKS(h2o_z1) + mf.xc = 'b3lyp' + eref = mf.kernel() + + mf.max_cycle = 1 + mf.conv_check = False + mf.kernel() + nr = mf.newton() + nr.max_cycle = 3 + nr.conv_tol_grad = 1e-5 + self.assertAlmostEqual(nr.kernel(), eref, 9) + + def test_uks_gen_g_hop(self): + mf = dft.UKS(h2o_z0) + mf.grids.build() + mf.xc = 'hse06' + nao = h2o_z0.nao_nr() + mo = cp.random.random((2, nao,nao)) + mo_occ = cp.zeros((2,nao)) + mo_occ[:,:5] = 1 + nocc, nvir = 5, nao-5 + dm1 = cp.random.random(nvir*nocc*2) + nr = mf.newton() + g, 
hop, hdiag = nr.gen_g_hop(mo, mo_occ) + mf_cpu = mf.to_cpu().newton() + hop_ref = mf_cpu.gen_g_hop(mo.get(), mo_occ.get())[1] + self.assertAlmostEqual(abs(hop(dm1).get() - hop_ref(dm1.get())).max(), 0, 9) + + def test_with_df(self): + mf = scf.RHF(h2o_z0).density_fit().newton().run() + self.assertTrue(mf._eri is None) + self.assertAlmostEqual(mf.e_tot, -75.983944727996, 9) + self.assertEqual(mf.__class__.__name__, 'SecondOrderDFRHF') + + mf = scf.RHF(h2o_z0).newton().density_fit().run() + self.assertTrue(mf._eri is None) + self.assertAlmostEqual(mf.e_tot, -75.9839484980661, 9) + mf = mf.undo_newton() + self.assertEqual(mf.__class__.__name__, 'RHF') + + def test_secondary_auxbasis(self): + mf_ref = scf.UHF(h2o_z0).run() + mf = scf.UHF(h2o_z0).newton().density_fit(auxbasis=[[0, [1., 1.]]]).run() + self.assertAlmostEqual(mf_ref.e_tot, mf.e_tot, 8) + + mf_ref = scf.UHF(h2o_z0).density_fit().run() + mf = scf.UHF(h2o_z0).density_fit().newton().density_fit(auxbasis=[[0, [1., 1.]]]).run() + self.assertAlmostEqual(mf_ref.e_tot, mf.e_tot, 8) + +if __name__ == "__main__": + print("Full Tests for Newton solver") + unittest.main() diff --git a/gpu4pyscf/scf/uhf.py b/gpu4pyscf/scf/uhf.py index 17826721..2c7dbf08 100644 --- a/gpu4pyscf/scf/uhf.py +++ b/gpu4pyscf/scf/uhf.py @@ -70,7 +70,8 @@ def spin_square(mo, s=1): def get_fock(mf, h1e=None, s1e=None, vhf=None, dm=None, cycle=-1, diis=None, diis_start_cycle=None, level_shift_factor=None, damp_factor=None): if dm is None: dm = mf.make_rdm1() - if h1e is None: h1e = cupy.asarray(mf.get_hcore()) + if h1e is None: h1e = mf.get_hcore() + if s1e is None: s1e = mf.get_ovlp() if vhf is None: vhf = mf.get_veff(mf.mol, dm) if not isinstance(s1e, cupy.ndarray): s1e = cupy.asarray(s1e) if not isinstance(dm, cupy.ndarray): dm = cupy.asarray(dm) @@ -150,6 +151,36 @@ def energy_elec(mf, dm=None, h1e=None, vhf=None): logger.debug(mf, 'E1 = %s Ecoul = %s', e1, e_coul.real) return e_elec, e_coul +def canonicalize(mf, mo_coeff, mo_occ, fock=None): + '''Canonicalization diagonalizes the UHF Fock matrix within occupied, + virtual subspaces separatedly (without change occupancy). 
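    A minimal usage sketch, assuming a converged gpu4pyscf UHF object mf; the
    alpha and beta channels are diagonalized independently:

        mo_e, mo = mf.canonicalize(mf.mo_coeff, mf.mo_occ)
        # mo_e and mo stack the alpha and beta blocks along the first axis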
+ ''' + mo_occ = cupy.asarray(mo_occ) + assert mo_occ.ndim == 2 + if fock is None: + dm = mf.make_rdm1(mo_coeff, mo_occ) + fock = mf.get_fock(dm=dm) + occidxa = mo_occ[0] == 1 + occidxb = mo_occ[1] == 1 + viridxa = mo_occ[0] == 0 + viridxb = mo_occ[1] == 0 + + def eig_(fock, mo_coeff, idx, es, cs): + if cupy.any(idx) > 0: + orb = mo_coeff[:,idx] + f1 = orb.conj().T.dot(fock).dot(orb) + e, c = cupy.linalg.eigh(f1) + es[idx] = e + cs[:,idx] = cupy.dot(orb, c) + + mo = cupy.empty_like(mo_coeff) + mo_e = cupy.empty(mo_occ.shape) + eig_(fock[0], mo_coeff[0], occidxa, mo_e[0], mo[0]) + eig_(fock[0], mo_coeff[0], viridxa, mo_e[0], mo[0]) + eig_(fock[1], mo_coeff[1], occidxb, mo_e[1], mo[1]) + eig_(fock[1], mo_coeff[1], viridxb, mo_e[1], mo[1]) + return mo_e, mo + class UHF(hf.SCF): from gpu4pyscf.lib.utils import to_gpu, device @@ -195,6 +226,7 @@ def get_grad(self, mo_coeff, mo_occ, fock=None): fock = self.get_hcore(self.mol) + self.get_veff(self.mol, dm1) return get_grad(mo_coeff, mo_occ, fock) + make_asym_dm = NotImplemented make_rdm2 = NotImplemented energy_elec = energy_elec get_init_guess = hf.return_cupy_array(uhf.UHF.get_init_guess) @@ -204,15 +236,6 @@ def get_grad(self, mo_coeff, mo_occ, fock=None): init_guess_by_mod_huckel = uhf.UHF.init_guess_by_mod_huckel init_guess_by_1e = uhf.UHF.init_guess_by_1e init_guess_by_chkfile = uhf.UHF.init_guess_by_chkfile - - analyze = NotImplemented - mulliken_pop = NotImplemented - mulliken_spin_pop = NotImplemented - mulliken_meta = NotImplemented - mulliken_meta_spin = NotImplemented - canonicalize = NotImplemented - det_ovlp = NotImplemented - make_asym_dm = NotImplemented _finalize = uhf.UHF._finalize conv_tol_cpscf = 1e-4 @@ -225,9 +248,9 @@ def get_grad(self, mo_coeff, mo_occ, fock=None): density_fit = hf.RHF.density_fit energy_tot = hf.RHF.energy_tot energy_elec = energy_elec + canonicalize = canonicalize make_rdm2 = NotImplemented - newton = NotImplemented x2c = x2c1e = sfx2c1e = NotImplemented to_rhf = NotImplemented to_uhf = NotImplemented @@ -236,7 +259,6 @@ def get_grad(self, mo_coeff, mo_occ, fock=None): to_uks = NotImplemented to_gks = NotImplemented to_ks = NotImplemented - canonicalize = NotImplemented # TODO: Enable followings after testing analyze = NotImplemented stability = NotImplemented @@ -290,6 +312,10 @@ def nuc_grad_method(self): from gpu4pyscf.grad import uhf return uhf.Gradients(self) + def newton(self): + from gpu4pyscf.scf.soscf import newton + return newton(self) + def to_cpu(self): from gpu4pyscf.lib import utils mf = uhf.UHF(self.mol) diff --git a/gpu4pyscf/scf/uhf_symm.py b/gpu4pyscf/scf/uhf_symm.py new file mode 100644 index 00000000..b1785a60 --- /dev/null +++ b/gpu4pyscf/scf/uhf_symm.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python +# +# Copyright 2024 The PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
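The uhf.py hunk above restores canonicalize and adds a newton() entry point to the GPU UHF class. A minimal usage sketch, following the to_gpu() pattern of the tests in this PR; the water-cation geometry and the post-SOSCF canonicalization call are illustrative, not part of the patch:

from pyscf import gto, scf

mol = gto.M(
    atom='O 0 0 0; H 0 0.757 0.587; H 0 -0.757 0.587',
    basis='6-31g', charge=1, spin=1)

mf = scf.UHF(mol).to_gpu()
mf.max_cycle = 1           # crude starting point, as in the tests above
mf.conv_check = False
mf.kernel()

nr = mf.newton()           # second-order (SOSCF) solver enabled by this PR
e_tot = nr.kernel()

# Re-diagonalize the converged Fock matrix within the occupied and virtual
# blocks of each spin; the occupancies are left unchanged.
mo_e, mo = mf.canonicalize(nr.mo_coeff, nr.mo_occ)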
+ +from gpu4pyscf.scf.uhf import UHF + +SymAdaptedUHF = UHF diff --git a/gpu4pyscf/solvent/grad/pcm.py b/gpu4pyscf/solvent/grad/pcm.py index 1fce56f8..1df748e6 100644 --- a/gpu4pyscf/solvent/grad/pcm.py +++ b/gpu4pyscf/solvent/grad/pcm.py @@ -243,10 +243,10 @@ def grad_qv(pcmobj, dm): dvj, _ = int3c2e.get_int3c2e_ip_jk(intopt, 0, 'ip1', q_sym, None, dm_cart) dq, _ = int3c2e.get_int3c2e_ip_jk(intopt, 0, 'ip2', q_sym, None, dm_cart) - cart_ao_idx = intopt.cart_ao_idx - rev_cart_ao_idx = numpy.argsort(cart_ao_idx) - dvj = dvj[:,rev_cart_ao_idx] - + if not mol.cart: + dvj = dvj @ intopt.cart2sph + dvj = intopt.unsort_orbitals(dvj, axis=[1]) + aoslice = intopt.mol.aoslice_by_atom() dq = cupy.asarray([cupy.sum(dq[:,p0:p1], axis=1) for p0,p1 in gridslice]) dvj= 2.0 * cupy.asarray([cupy.sum(dvj[:,p0:p1], axis=1) for p0,p1 in aoslice[:,2:]]) diff --git a/gpu4pyscf/solvent/tests/test_pcm_hessian.py b/gpu4pyscf/solvent/tests/test_pcm_hessian.py index 967d25f6..1060f3d4 100644 --- a/gpu4pyscf/solvent/tests/test_pcm_hessian.py +++ b/gpu4pyscf/solvent/tests/test_pcm_hessian.py @@ -128,9 +128,19 @@ def test_to_gpu(self): hess_gpu = hessobj.kernel() assert np.linalg.norm(hess_cpu - hess_gpu) < 1e-8 ''' + mol = gto.Mole() + mol.atom = ''' +O 0.0000000000 -0.0000000000 0.1174000000 +H -0.7570000000 -0.0000000000 -0.4696000000 +H 0.7570000000 0.0000000000 -0.4696000000 + ''' + mol.basis = 'sto-3g' + mol.output = '/dev/null' + mol.build(verbose=0) mf = pyscf.dft.RKS(mol, xc='b3lyp').density_fit().PCM() mf.conv_tol = 1e-12 mf.conv_tol_cpscf = 1e-7 + mf.grids.atom_grid = (50,194) mf.kernel() hessobj = mf.Hessian() hess_cpu = hessobj.kernel() @@ -148,9 +158,19 @@ def test_to_cpu(self): e_cpu = mf.kernel() assert abs(e_cpu - e_gpu) < 1e-8 ''' + mol = gto.Mole() + mol.atom = ''' +O 0.0000000000 -0.0000000000 0.1174000000 +H -0.7570000000 -0.0000000000 -0.4696000000 +H 0.7570000000 0.0000000000 -0.4696000000 + ''' + mol.basis = 'sto-3g' + mol.output = '/dev/null' + mol.build(verbose=0) mf = dft.RKS(mol, xc='b3lyp').density_fit().PCM() mf.conv_tol = 1e-12 mf.conv_tol_cpscf = 1e-7 + mf.grids.atom_grid = (50,194) mf.kernel() hessobj = mf.Hessian() hess_gpu = hessobj.kernel() diff --git a/gpu4pyscf/tdscf/__init__.py b/gpu4pyscf/tdscf/__init__.py new file mode 100644 index 00000000..552cccee --- /dev/null +++ b/gpu4pyscf/tdscf/__init__.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + + +from gpu4pyscf.tdscf import rhf +from gpu4pyscf.tdscf import uhf +from gpu4pyscf.tdscf import rks +from gpu4pyscf.tdscf import uks diff --git a/gpu4pyscf/tdscf/_uhf_resp_sf.py b/gpu4pyscf/tdscf/_uhf_resp_sf.py new file mode 100644 index 00000000..4ea074dc --- /dev/null +++ b/gpu4pyscf/tdscf/_uhf_resp_sf.py @@ -0,0 +1,217 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. 
+# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + + +# TODO: merge this function into scf._response_functions.py + +import functools +import numpy as np +import cupy as cp +from pyscf import lib +from pyscf.lib import logger +from pyscf.dft import numint2c, xc_deriv +from gpu4pyscf.scf import hf, uhf +from gpu4pyscf.dft.numint import _scale_ao, _tau_dot, eval_rho, eval_rho2 +from gpu4pyscf.lib.cupy_helper import transpose_sum, add_sparse, contract + +def gen_uhf_response_sf(mf, mo_coeff=None, mo_occ=None, hermi=0, + collinear='mcol', collinear_samples=200): + '''Generate a function that computes the product of the spin-flip UKS + response function with UKS density matrices. + ''' + assert isinstance(mf, (uhf.UHF)) + if mo_coeff is None: mo_coeff = mf.mo_coeff + if mo_occ is None: mo_occ = mf.mo_occ + mol = mf.mol + assert hermi == 0 + + if isinstance(mf, hf.KohnShamDFT): + if mf.do_nlc(): + logger.warn(mf, 'NLC functional found in DFT object. Its second ' + 'derivative is not available. Its contribution is ' + 'not included in the response function.') + + ni = mf._numint + omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, mol.spin) + hybrid = ni.libxc.is_hybrid_xc(mf.xc) + + if collinear in ('ncol', 'mcol'): + fxc = cache_xc_kernel_sf(ni, mol, mf.grids, mf.xc, mo_coeff, mo_occ, + collinear_samples)[2] + dm0 = None + + def vind(dm1): + if collinear in ('ncol', 'mcol'): + v1 = nr_uks_fxc_sf(ni, mol, mf.grids, mf.xc, dm0, dm1, 0, hermi, + None, None, fxc) + else: + v1 = cp.zeros_like(dm1) + if hybrid: + # The Coulomb (J) term vanishes in the spin-flip part.
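+ # Only exchange contributes: the spin-flip transition density is the alpha-beta block of the density matrix, + # which carries no charge density, so there is no Coulomb response. The branches below implement the usual + # range-separated decomposition K = hyb*K_full + (alpha-hyb)*K_LR(omega), which reduces to hyb*K_SR(omega) + # when alpha == 0 and to alpha*K_LR(omega) when hyb == 0.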
+ if omega == 0: + vk = mf.get_k(mol, dm1, hermi) * hyb + elif alpha == 0: # LR=0, only SR exchange + vk = mf.get_k(mol, dm1, hermi, omega=-omega) * hyb + elif hyb == 0: # SR=0, only LR exchange + vk = mf.get_k(mol, dm1, hermi, omega=omega) * alpha + else: # SR and LR exchange with different ratios + vk = mf.get_k(mol, dm1, hermi) * hyb + vk += mf.get_k(mol, dm1, hermi, omega=omega) * (alpha-hyb) + v1 -= vk + return v1 + return vind + + else: # HF + def vind(dm1): + vk = mf.get_k(mol, dm1, hermi) + return -vk + return vind + +# This function is copied from pyscf.dft.numint2c.py +def __mcfun_fn_eval_xc(ni, xc_code, xctype, rho, deriv): + evfk = ni.eval_xc_eff(xc_code, rho, deriv=deriv, xctype=xctype) + evfk = list(evfk) + for order in range(1, deriv+1): + if evfk[order] is not None: + evfk[order] = xc_deriv.ud2ts(evfk[order]) + return evfk + +# Adapted from pyscf.dft.numint2c.mcfun_eval_xc_adapter +def mcfun_eval_xc_adapter_sf(ni, xc_code, collinear_samples): + '''Wrapper to generate the eval_xc function required by mcfun + ''' + + try: + import mcfun + except ImportError: + raise ImportError('This feature requires the mcfun library.\n' + 'Try installing mcfun with `pip install mcfun`') + + ni = numint2c.NumInt2C() + ni.collinear = 'mcol' + ni.collinear_samples = collinear_samples + xctype = ni._xc_type(xc_code) + fn_eval_xc = functools.partial(__mcfun_fn_eval_xc, ni, xc_code, xctype) + nproc = lib.num_threads() + + def eval_xc_eff(xc_code, rho, deriv=1, omega=None, xctype=None, verbose=None): + res = mcfun.eval_xc_eff_sf( + fn_eval_xc, rho.get(), deriv, + collinear_samples=collinear_samples, workers=nproc) + return [x if x is None else cp.asarray(x) for x in res] + return eval_xc_eff + +def cache_xc_kernel_sf(ni, mol, grids, xc_code, mo_coeff, mo_occ, + collinear_samples): + '''Compute the fxc_sf, which can be used in SF-TDDFT/TDA + ''' + xctype = ni._xc_type(xc_code) + if xctype == 'GGA': + ao_deriv = 1 + elif xctype == 'MGGA': + ao_deriv = 1 + else: + ao_deriv = 0 + assert isinstance(mo_coeff, cp.ndarray) + assert mo_coeff.ndim == 3 + + nao = mo_coeff[0].shape[0] + rhoa = [] + rhob = [] + + with_lapl = False + opt = getattr(ni, 'gdftopt', None) + if opt is None or mol not in [opt.mol, opt._sorted_mol]: + ni.build(mol, grids.coords) + opt = ni.gdftopt + _sorted_mol = opt._sorted_mol + mo_coeff = opt.sort_orbitals(mo_coeff, axis=[1]) + + for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv): + rhoa_slice = eval_rho2(_sorted_mol, ao_mask, mo_coeff[0,idx,:], + mo_occ[0], None, xctype, with_lapl) + rhob_slice = eval_rho2(_sorted_mol, ao_mask, mo_coeff[1,idx,:], + mo_occ[1], None, xctype, with_lapl) + rhoa.append(rhoa_slice) + rhob.append(rhob_slice) + rho_ab = (cp.hstack(rhoa), cp.hstack(rhob)) + rho_z = cp.array([rho_ab[0]+rho_ab[1], + rho_ab[0]-rho_ab[1]]) + eval_xc_eff = mcfun_eval_xc_adapter_sf(ni, xc_code, collinear_samples) + vxc, fxc = eval_xc_eff(xc_code, rho_z, deriv=2, xctype=xctype)[1:3] + return rho_ab, vxc, fxc + +def nr_uks_fxc_sf(ni, mol, grids, xc_code, dm0, dms, relativity=0, hermi=0, + rho0=None, vxc=None, fxc=None): + if fxc is None: + raise RuntimeError('fxc was not initialized') + assert hermi == 0 + assert dms.dtype == np.double + + xctype = ni._xc_type(xc_code) + opt = getattr(ni, 'gdftopt', None) + if opt is None or mol not in [opt.mol, opt._sorted_mol]: + ni.build(mol, grids.coords) + opt = ni.gdftopt + mol = None + _sorted_mol = opt._sorted_mol + nao, nao0 = opt.coeff.shape + dm_shape = dms.shape + + dms = cp.asarray(dms).reshape(-1,nao0,nao0)
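+ # gpu4pyscf evaluates AOs in an internally sorted shell order (the sort_orbitals/unsort_orbitals pair used + # throughout this PR); permute the input density matrices into that order here and map vmat back to the + # original AO ordering before returning.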
+ dms = opt.sort_orbitals(dms, axis=[1,2]) + + nset = len(dms) + vmat = cp.zeros((nset, nao, nao)) + + if xctype == 'LDA': + ao_deriv = 0 + elif xctype == 'GGA': + ao_deriv = 1 + elif xctype == 'MGGA': + ao_deriv = 1 + else: + raise RuntimeError(f'Unknown xctype {xctype}') + p0 = p1 = 0 + for ao, mask, weights, coords in ni.block_loop(_sorted_mol, grids, nao, ao_deriv): + p0, p1 = p1, p1+len(weights) + # precompute fxc_w; the factor 2.0 accounts for xx + yy + fxc_w = fxc[:,:,p0:p1] * weights * 2. + + for i in range(nset): + rho1 = eval_rho(_sorted_mol, ao, dms[i,mask[:,None],mask], + xctype=xctype, hermi=hermi) + if xctype == 'LDA': + wv = rho1 * fxc_w[0,0] + vtmp = ao.dot(_scale_ao(ao, wv).T) + elif xctype == 'GGA': + wv = contract('bg,abg->ag', rho1, fxc_w) + wv[0] *= .5 # for transpose_sum at the end + vtmp = ao[0].dot(_scale_ao(ao, wv).T) + elif xctype == 'MGGA': + wv = contract('bg,abg->ag', rho1, fxc_w) + wv[[0,4]] *= .5 # for transpose_sum at the end + vtmp = ao[0].dot(_scale_ao(ao[:4], wv[:4]).T) + vtmp += _tau_dot(ao, ao, wv[4]) + add_sparse(vmat[i], vtmp, mask) + + vmat = opt.unsort_orbitals(vmat, axis=[1,2]) + if xctype != 'LDA': + transpose_sum(vmat) + if len(dm_shape) == 2: + vmat = vmat[0] + return vmat diff --git a/gpu4pyscf/tdscf/rhf.py b/gpu4pyscf/tdscf/rhf.py new file mode 100644 index 00000000..9e33b6e8 --- /dev/null +++ b/gpu4pyscf/tdscf/rhf.py @@ -0,0 +1,368 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + + +import numpy as np +import cupy as cp +import scipy.linalg +from pyscf import gto +from pyscf import lib +from pyscf.tdscf import rhf as tdhf_cpu +from pyscf.tdscf._lr_eig import eigh as lr_eigh, eig as lr_eig +from gpu4pyscf import scf +from gpu4pyscf.lib.cupy_helper import contract, tag_array +from gpu4pyscf.lib import utils +from gpu4pyscf.lib import logger +from gpu4pyscf.scf import _response_functions # noqa +from pyscf import __config__ + +REAL_EIG_THRESHOLD = tdhf_cpu.REAL_EIG_THRESHOLD +#OUTPUT_THRESHOLD = tdhf_cpu.OUTPUT_THRESHOLD +OUTPUT_THRESHOLD = getattr(__config__, 'tdscf_rhf_get_nto_threshold', 0.3) + +__all__ = [ + 'TDA', 'CIS', 'TDHF', 'TDRHF', 'TDBase' +] + + +def gen_tda_operation(mf, fock_ao=None, singlet=True, wfnsym=None): + '''Generate function to compute A x + ''' + assert fock_ao is None + assert isinstance(mf, scf.hf.SCF) + assert wfnsym is None + mo_coeff = mf.mo_coeff + assert mo_coeff.dtype == cp.float64 + mo_energy = mf.mo_energy + mo_occ = mf.mo_occ + occidx = mo_occ == 2 + viridx = mo_occ == 0 + orbv = mo_coeff[:,viridx] + orbo = mo_coeff[:,occidx] + orbo2 = orbo * 2.
# *2 for double occupancy + + e_ia = hdiag = mo_energy[viridx] - mo_energy[occidx,None] + hdiag = hdiag.ravel().get() + vresp = mf.gen_response(singlet=singlet, hermi=0) + nocc, nvir = e_ia.shape + + def vind(zs): + zs = cp.asarray(zs).reshape(-1,nocc,nvir) + mo1 = contract('xov,pv->xpo', zs, orbv) + dms = contract('xpo,qo->xpq', mo1, orbo2.conj()) + dms = tag_array(dms, mo1=mo1, occ_coeff=orbo) + v1ao = vresp(dms) + v1mo = contract('xpq,qo->xpo', v1ao, orbo) + v1mo = contract('xpo,pv->xov', v1mo, orbv.conj()) + v1mo += zs * e_ia + return v1mo.reshape(v1mo.shape[0],-1).get() + + return vind, hdiag + + +class TDBase(lib.StreamObject): + to_gpu = utils.to_gpu + device = utils.device + to_cpu = utils.to_cpu + + conv_tol = tdhf_cpu.TDBase.conv_tol + nstates = tdhf_cpu.TDBase.nstates + singlet = tdhf_cpu.TDBase.singlet + lindep = tdhf_cpu.TDBase.lindep + level_shift = tdhf_cpu.TDBase.level_shift + max_cycle = tdhf_cpu.TDBase.max_cycle + positive_eig_threshold = tdhf_cpu.TDBase.positive_eig_threshold + deg_eia_thresh = tdhf_cpu.TDBase.deg_eia_thresh + + _keys = tdhf_cpu.TDBase._keys + + __init__ = tdhf_cpu.TDBase.__init__ + + nroots = tdhf_cpu.TDBase.nroots + e_tot = tdhf_cpu.TDBase.e_tot + dump_flags = tdhf_cpu.TDBase.dump_flags + check_sanity = tdhf_cpu.TDBase.check_sanity + reset = tdhf_cpu.TDBase.reset + _finalize = tdhf_cpu.TDBase._finalize + + gen_vind = NotImplemented + get_ab = NotImplemented + get_precond = tdhf_cpu.TDBase.get_precond + + nuc_grad_method = NotImplemented + as_scanner = tdhf_cpu.as_scanner + + oscillator_strength = tdhf_cpu.oscillator_strength + transition_dipole = tdhf_cpu.transition_dipole + transition_quadrupole = tdhf_cpu.transition_quadrupole + transition_octupole = tdhf_cpu.transition_octupole + transition_velocity_dipole = tdhf_cpu.transition_velocity_dipole + transition_velocity_quadrupole = tdhf_cpu.transition_velocity_quadrupole + transition_velocity_octupole = tdhf_cpu.transition_velocity_octupole + transition_magnetic_dipole = tdhf_cpu.transition_magnetic_dipole + transition_magnetic_quadrupole = tdhf_cpu.transition_magnetic_quadrupole + + def analyze(self, verbose=None): + self.to_cpu().analyze(verbose) + return self + + def get_nto(self, state=1, threshold=OUTPUT_THRESHOLD, verbose=None): + ''' + Natural transition orbital analysis. + + Returns: + A list (weights, NTOs). NTOs are natural orbitals represented in AO + basis. The first N_occ NTOs are occupied NTOs and the rest are virtual + NTOs. 
Weights and NTOs are stored as numpy arrays. + ''' + return self.to_cpu().get_nto(state, threshold, verbose) + + # needed by transition dipoles + def _contract_multipole(tdobj, ints, hermi=True, xy=None): + '''ints is the integral tensor of a spin-independent operator''' + if xy is None: xy = tdobj.xy + nstates = len(xy) + pol_shape = ints.shape[:-2] + nao = ints.shape[-1] + + if not tdobj.singlet: + return np.zeros((nstates,) + pol_shape) + + mo_coeff = tdobj._scf.mo_coeff + mo_occ = tdobj._scf.mo_occ + orbo = mo_coeff[:,mo_occ==2] + orbv = mo_coeff[:,mo_occ==0] + if isinstance(orbo, cp.ndarray): + orbo = orbo.get() + orbv = orbv.get() + + # Incompatible with old numpy versions + #ints = np.einsum('...pq,pi,qj->...ij', ints, orbo.conj(), orbv) + ints = lib.einsum('xpq,pi,qj->xij', ints.reshape(-1,nao,nao), orbo.conj(), orbv) + pol = np.array([np.einsum('xij,ij->x', ints, x) * 2 for x,y in xy]) + if isinstance(xy[0][1], np.ndarray): + if hermi: + pol += [np.einsum('xij,ij->x', ints, y) * 2 for x,y in xy] + else: # anti-Hermitian + pol -= [np.einsum('xij,ij->x', ints, y) * 2 for x,y in xy] + pol = pol.reshape((nstates,)+pol_shape) + return pol + +class TDA(TDBase): + __doc__ = tdhf_cpu.TDA.__doc__ + + def gen_vind(self, mf=None): + '''Generate function to compute Ax''' + if mf is None: + mf = self._scf + return gen_tda_operation(mf, singlet=self.singlet) + + def init_guess(self, mf=None, nstates=None, wfnsym=None, return_symmetry=False): + ''' + Generate initial guess for TDA + + Kwargs: + nstates : int + The number of initial guess vectors. + ''' + if mf is None: mf = self._scf + if nstates is None: nstates = self.nstates + assert wfnsym is None + assert not return_symmetry + + mo_energy = mf.mo_energy + mo_occ = mf.mo_occ + if isinstance(mo_energy, cp.ndarray): + mo_energy = mo_energy.get() + mo_occ = mo_occ.get() + occidx = mo_occ == 2 + viridx = mo_occ == 0 + e_ia = (mo_energy[viridx] - mo_energy[occidx,None]).ravel() + nov = e_ia.size + nstates = min(nstates, nov) + + # Find the nstates-th lowest energy gap + e_threshold = float(np.partition(e_ia, nstates-1)[nstates-1]) + e_threshold += self.deg_eia_thresh + + idx = np.where(e_ia <= e_threshold)[0] + x0 = np.zeros((idx.size, nov)) + for i, j in enumerate(idx): + x0[i, j] = 1 # Koopmans' excitations + + return x0 + + def kernel(self, x0=None, nstates=None): + '''TDA diagonalization solver + ''' + log = logger.new_logger(self) + cpu0 = log.init_timer() + self.check_sanity() + self.dump_flags() + if nstates is None: + nstates = self.nstates + else: + self.nstates = nstates + mol = self.mol + + vind, hdiag = self.gen_vind(self._scf) + precond = self.get_precond(hdiag) + + def pickeig(w, v, nroots, envs): + idx = np.where(w > self.positive_eig_threshold)[0] + return w[idx], v[:,idx], idx + + x0sym = None + if x0 is None: + x0 = self.init_guess() + + self.converged, self.e, x1 = lr_eigh( + vind, x0, precond, tol_residual=self.conv_tol, lindep=self.lindep, + nroots=nstates, x0sym=x0sym, pick=pickeig, max_cycle=self.max_cycle, + max_memory=self.max_memory, verbose=log) + + nocc = mol.nelectron // 2 + nmo = self._scf.mo_occ.size + nvir = nmo - nocc + # 1/sqrt(2) because self.x is for alpha excitation and 2(X^+*X) = 1 + self.xy = [(xi.reshape(nocc,nvir) * .5**.5, 0) for xi in x1] + log.timer('TDA', *cpu0) + self._finalize() + return self.e, self.xy + +CIS = TDA + + +def gen_tdhf_operation(mf, fock_ao=None, singlet=True, wfnsym=None): + '''Generate function to compute + + [ A B ][X] + [-B* -A*][Y] + ''' + assert fock_ao is None + assert isinstance(mf,
scf.hf.SCF) + mo_coeff = mf.mo_coeff + assert mo_coeff.dtype == cp.float64 + mo_energy = mf.mo_energy + mo_occ = mf.mo_occ + occidx = mo_occ == 2 + viridx = mo_occ == 0 + orbv = mo_coeff[:,viridx] + orbo = mo_coeff[:,occidx] + + e_ia = hdiag = mo_energy[viridx] - mo_energy[occidx,None] + hdiag = cp.hstack((hdiag.ravel(), -hdiag.ravel())).get() + vresp = mf.gen_response(singlet=singlet, hermi=0) + nocc, nvir = e_ia.shape + + def vind(xys): + xys = cp.asarray(xys).reshape(-1,2,nocc,nvir) + nz = len(xys) + xs, ys = xys.transpose(1,0,2,3) + # *2 for double occupancy + tmp = contract('xov,pv->xpo', xs, orbv*2) + dms = contract('xpo,qo->xpq', tmp, orbo.conj()) + tmp = contract('xov,qv->xoq', ys, orbv.conj()*2) + dms+= contract('xoq,po->xpq', tmp, orbo) + v1ao = vresp(dms) # = Xjb + Yjb + v1_top = contract('xpq,qo->xpo', v1ao, orbo) + v1_top = contract('xpo,pv->xov', v1_top, orbv) + v1_bot = contract('xpq,po->xoq', v1ao, orbo) + v1_bot = contract('xoq,qv->xov', v1_bot, orbv) + v1_top += xs * e_ia # AX + v1_bot += ys * e_ia # (A*)Y + hx = cp.hstack((v1_top.reshape(nz,-1), -v1_bot.reshape(nz,-1))) + return hx.get() + + return vind, hdiag + + +class TDHF(TDBase): + __doc__ = tdhf_cpu.TDHF.__doc__ + + @lib.with_doc(gen_tdhf_operation.__doc__) + def gen_vind(self, mf=None): + if mf is None: + mf = self._scf + return gen_tdhf_operation(mf, singlet=self.singlet) + + def init_guess(self, mf=None, nstates=None, wfnsym=None, return_symmetry=False): + x0 = TDA.init_guess(self, mf, nstates, wfnsym, return_symmetry) + y0 = np.zeros_like(x0) + return np.hstack([x0, y0]) + + def kernel(self, x0=None, nstates=None): + '''TDHF diagonalization with non-Hermitian eigenvalue solver + ''' + log = logger.new_logger(self) + cpu0 = log.init_timer() + self.check_sanity() + self.dump_flags() + if nstates is None: + nstates = self.nstates + else: + self.nstates = nstates + mol = self.mol + + vind, hdiag = self.gen_vind(self._scf) + precond = self.get_precond(hdiag) + + # handle single kpt PBC SCF + if getattr(self._scf, 'kpt', None) is not None: + from pyscf.pbc.lib.kpts_helper import gamma_point + real_system = (gamma_point(self._scf.kpt) and + self._scf.mo_coeff[0].dtype == np.double) + else: + real_system = True + + # We only need positive eigenvalues + def pickeig(w, v, nroots, envs): + realidx = np.where((abs(w.imag) < REAL_EIG_THRESHOLD) & + (w.real > self.positive_eig_threshold))[0] + # If the complex eigenvalue has small imaginary part, both the + # real part and the imaginary part of the eigenvector can + # approximately be used as the "real" eigen solutions. 
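+ # The full TDHF matrix [[A, B], [-B*, -A*]] is non-Hermitian, so its spectrum comes in (w, -w) pairs and + # the iterative solver may deliver slightly complex roots; only real, positive eigenvalues are retained.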
+ return lib.linalg_helper._eigs_cmplx2real(w, v, realidx, real_system) + + x0sym = None + if x0 is None: + x0 = self.init_guess() + + self.converged, w, x1 = lr_eig( + vind, x0, precond, tol_residual=self.conv_tol, lindep=self.lindep, + nroots=nstates, x0sym=x0sym, pick=pickeig, max_cycle=self.max_cycle, + max_memory=self.max_memory, verbose=log) + + nocc = mol.nelectron // 2 + nmo = self._scf.mo_occ.size + nvir = nmo - nocc + self.e = w + def norm_xy(z): + x, y = z.reshape(2,nocc,nvir) + norm = lib.norm(x)**2 - lib.norm(y)**2 + norm = np.sqrt(.5/norm) # normalize to 0.5 for alpha spin + return x*norm, y*norm + self.xy = [norm_xy(z) for z in x1] + + log.timer('TDDFT', *cpu0) + self._finalize() + return self.e, self.xy + +TDRHF = TDHF + +scf.hf.RHF.TDA = lib.class_as_method(TDA) +scf.hf.RHF.TDHF = lib.class_as_method(TDHF) diff --git a/gpu4pyscf/tdscf/rks.py b/gpu4pyscf/tdscf/rks.py new file mode 100644 index 00000000..41971614 --- /dev/null +++ b/gpu4pyscf/tdscf/rks.py @@ -0,0 +1,155 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import numpy as np +import cupy as cp +from pyscf import lib +from pyscf.tdscf._lr_eig import eigh as lr_eigh +from gpu4pyscf.dft.rks import KohnShamDFT +from gpu4pyscf.lib.cupy_helper import contract, tag_array, transpose_sum +from gpu4pyscf.lib import logger +from gpu4pyscf.tdscf import rhf as tdhf_gpu +from gpu4pyscf import dft + +__all__ = [ + 'TDA', 'TDDFT', 'TDRKS', 'CasidaTDDFT', 'TDDFTNoHybrid', +] + +TDA = tdhf_gpu.TDA +TDDFT = tdhf_gpu.TDHF +TDRKS = TDDFT + +class CasidaTDDFT(TDDFT): + '''Solve the Casida TDDFT formula (A-B)(A+B)(X+Y) = (X+Y)w^2 + ''' + + init_guess = TDA.init_guess + + def gen_vind(self, mf=None): + if mf is None: + mf = self._scf + singlet = self.singlet + mo_coeff = mf.mo_coeff + assert mo_coeff.dtype == cp.double + mo_energy = mf.mo_energy + mo_occ = mf.mo_occ + occidx = mo_occ == 2 + viridx = mo_occ == 0 + orbv = mo_coeff[:,viridx] + orbo = mo_coeff[:,occidx] + + e_ia = mo_energy[viridx] - mo_energy[occidx,None] + d_ia = e_ia ** .5 + ed_ia = e_ia * d_ia + hdiag = e_ia.ravel() ** 2 + hdiag = hdiag.get() + vresp = mf.gen_response(singlet=singlet, hermi=1) + nocc, nvir = e_ia.shape + + def vind(zs): + zs = cp.asarray(zs).reshape(-1,nocc,nvir) + # *2 for double occupancy + mo1 = contract('xov,pv->xpo', zs*(d_ia*2), orbv) + dms = contract('xpo,qo->xpq', mo1, orbo) + # +cc for A+B and K_{ai,jb} in A == K_{ai,bj} in B + dms = transpose_sum(dms) + dms = tag_array(dms, mo1=mo1, occ_coeff=orbo) + v1ao = vresp(dms) + v1mo = contract('xpq,qo->xpo', v1ao, orbo) + v1mo = contract('xpo,pv->xov', v1mo, orbv) + v1mo += zs * ed_ia + v1mo *= d_ia + return v1mo.reshape(v1mo.shape[0],-1).get() + + return vind, hdiag + + def kernel(self, x0=None, nstates=None): + '''TDDFT diagonalization solver + ''' + log = logger.new_logger(self) + cpu0 = log.init_timer() + mf = self._scf + if 
mf._numint.libxc.is_hybrid_xc(mf.xc): + raise RuntimeError('%s cannot be used with hybrid functional' + % self.__class__) + self.check_sanity() + self.dump_flags() + if nstates is None: + nstates = self.nstates + else: + self.nstates = nstates + + vind, hdiag = self.gen_vind(self._scf) + precond = self.get_precond(hdiag) + + def pickeig(w, v, nroots, envs): + idx = np.where(w > self.positive_eig_threshold)[0] + return w[idx], v[:,idx], idx + + x0sym = None + if x0 is None: + x0 = self.init_guess() + + self.converged, w2, x1 = lr_eigh( + vind, x0, precond, tol_residual=self.conv_tol, lindep=self.lindep, + nroots=nstates, x0sym=x0sym, pick=pickeig, max_cycle=self.max_cycle, + max_memory=self.max_memory, verbose=log) + + mo_energy = self._scf.mo_energy + mo_occ = self._scf.mo_occ + occidx = mo_occ == 2 + viridx = mo_occ == 0 + e_ia = mo_energy[viridx] - mo_energy[occidx,None] + e_ia = e_ia**.5 + if isinstance(e_ia, cp.ndarray): + e_ia = e_ia.get() + + def norm_xy(w, z): + zp = e_ia * z.reshape(e_ia.shape) + zm = w/e_ia * z.reshape(e_ia.shape) + x = (zp + zm) * .5 + y = (zp - zm) * .5 + norm = lib.norm(x)**2 - lib.norm(y)**2 + norm = (.5/norm)**.5 # normalize to 0.5 for alpha spin + return (x*norm, y*norm) + + idx = np.where(w2 > self.positive_eig_threshold)[0] + self.e = w2[idx]**.5 + self.xy = [norm_xy(self.e[i], x1[i]) for i in idx] + log.timer('TDDFT', *cpu0) + self._finalize() + return self.e, self.xy + + def nuc_grad_method(self): + from pyscf.grad import tdrks + return tdrks.Gradients(self) + +TDDFTNoHybrid = CasidaTDDFT + +def tddft(mf): + '''Driver to create TDDFT or CasidaTDDFT object''' + if mf._numint.libxc.is_hybrid_xc(mf.xc): + return TDDFT(mf) + else: + return CasidaTDDFT(mf) + +dft.rks.RKS.TDA = lib.class_as_method(TDA) +dft.rks.RKS.TDHF = None +#dft.rks.RKS.TDDFT = lib.class_as_method(TDDFT) +dft.rks.RKS.TDDFTNoHybrid = lib.class_as_method(TDDFTNoHybrid) +dft.rks.RKS.CasidaTDDFT = lib.class_as_method(CasidaTDDFT) +dft.rks.RKS.TDDFT = tddft diff --git a/gpu4pyscf/tdscf/tests/test_sftddft.py b/gpu4pyscf/tdscf/tests/test_sftddft.py new file mode 100644 index 00000000..0358fb3a --- /dev/null +++ b/gpu4pyscf/tdscf/tests/test_sftddft.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import unittest +import numpy as np +import cupy as cp +from pyscf import lib, gto, scf +from gpu4pyscf import tdscf +try: + import mcfun +except ImportError: + mcfun = None + +class KnownValues(unittest.TestCase): + @classmethod + def setUpClass(cls): + mol = gto.Mole() + mol.verbose = 5 + mol.output = '/dev/null' + mol.atom = ''' + O 0. 0. 0. + H 0. -0.757 0.587 + H 0. 0.757 0.587''' + mol.spin = 2 + mol.basis = '631g' + cls.mol = mol.build() + cls.mf = mol.UHF().to_gpu().run() + + @classmethod + def tearDownClass(cls): + cls.mol.stdout.close() + + def test_tda(self): + mf = self.mf + # sftddft not available in pyscf main branch. 
References are created + # using the sftda module from pyscf-forge + ref = [ 0.46644071, 0.55755649, 1.05310518] + td = mf.SFTDA().run(extype=0, conv_tol=1e-7) + self.assertAlmostEqual(abs(td.e - ref).max(), 0, 6) + + ref = [-0.21574567, 0.00270390, 0.03143914] + td = mf.SFTDA().run(extype=1, conv_tol=1e-7) + self.assertAlmostEqual(abs(td.e - ref).max(), 0, 6) + + @unittest.skipIf(mcfun is None, 'MCfun not available') + def test_mcol_b3lyp_tda(self): + mf = self.mf + # sftddft not available in pyscf main branch. References are created + # using the sftda module from pyscf-forge + ref = [ 0.45941171, 0.57799552, 1.06629265] + td = mf.SFTDA().run(collinear='mcol', extype=0, conv_tol=1e-7) + self.assertAlmostEqual(abs(td.e - ref).max(), 0, 6) + + ref = [-0.29629139, 0.00067017, 0.01956306] + td = mf.SFTDA().run(collinear='mcol', extype=1, conv_tol=1e-7) + self.assertAlmostEqual(abs(td.e - ref).max(), 0, 6) + + @unittest.skip('Numerical issues encountered in non-hermitian diagonalization') + def test_tdhf(self): + mf = self.mf + ref = [1.74385401, 9.38227395, 14.90168875] + td = mf.SFTDHF().run(extype=0, conv_tol=1e-7) + self.assertAlmostEqual(abs(td.e - ref).max(), 0, 6) + + ref = [0.41701647, 9.59644331, 22.99972711] + td = mf.SFTDHF().run(extype=1, conv_tol=1e-7) + self.assertAlmostEqual(abs(td.e - ref).max(), 0, 6) + +if __name__ == "__main__": + print("Full Tests for spin-flip-TDA and spin-flip-TDDFT") + unittest.main() diff --git a/gpu4pyscf/tdscf/tests/test_tdrhf.py b/gpu4pyscf/tdscf/tests/test_tdrhf.py new file mode 100644 index 00000000..3ebc0372 --- /dev/null +++ b/gpu4pyscf/tdscf/tests/test_tdrhf.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import unittest +import numpy as np +import cupy as cp +from pyscf import lib, gto, scf +from gpu4pyscf import tdscf + +class KnownValues(unittest.TestCase): + @classmethod + def setUpClass(cls): + mol = gto.Mole() + mol.verbose = 7 + mol.output = '/dev/null' + mol.atom = [ + ['H' , (0. , 0. , .917)], + ['F' , (0. , 0. 
, 0.)], ] + mol.basis = '631g' + mol.symmetry = True + cls.mol = mol.build() + cls.mf = mf = scf.RHF(mol).to_gpu().run() + cls.df_mf = mf.density_fit().run() + cls.nstates = 5 # make sure first 3 states are converged + + @classmethod + def tearDownClass(cls): + cls.mol.stdout.close() + + def test_tda_singlet(self): + mf = self.mf + nstates = self.nstates + td = mf.TDA().set(nstates=nstates) + assert td.device == 'gpu' + e = td.kernel()[0] + ref = [11.9027511, 11.9027511, 16.8603101] + self.assertAlmostEqual(abs(e[:len(ref)] * 27.2114 - ref).max(), 0, 5) + dip = td.transition_dipole() + self.assertAlmostEqual(lib.fp(np.linalg.norm(dip, axis=1)), -0.65616659, 5) + + df_mf = self.df_mf + td = df_mf.TDA().set(nstates=nstates) + e = td.kernel()[0] + ref = td.to_cpu().kernel()[0][:3] + self.assertAlmostEqual(abs(e[:len(ref)] - ref).max(), 0, 7) + dip = td.transition_dipole() + self.assertAlmostEqual(lib.fp(np.linalg.norm(dip, axis=1)), -0.65618093, 5) + + def test_tda_triplet(self): + mf = self.mf + nstates = self.nstates + td = mf.TDA().set(nstates=nstates) + assert td.device == 'gpu' + td.singlet = False + e = td.kernel()[0] + ref = [11.0174650, 11.0174650, 13.1694960] + self.assertAlmostEqual(abs(e[:len(ref)] * 27.2114 - ref).max(), 0, 5) + dip = td.transition_dipole() + self.assertAlmostEqual(abs(dip).max(), 0, 8) + + df_mf = self.df_mf + td = df_mf.TDA().set(nstates=nstates) + td.singlet = False + e = td.kernel()[0] + ref = td.to_cpu().kernel()[0][:3] + self.assertAlmostEqual(abs(e[:len(ref)] - ref).max(), 0, 7) + dip = td.transition_dipole() + self.assertAlmostEqual(abs(dip).max(), 0, 8) + + def test_tdhf_singlet(self): + mf = self.mf + nstates = self.nstates + td = mf.TDHF().set(nstates=nstates) + assert td.device == 'gpu' + e = td.kernel()[0] + ref = [11.8348584, 11.8348584, 16.6630381] + self.assertAlmostEqual(abs(e[:len(ref)] * 27.2114 - ref).max(), 0, 5) + dip = td.transition_dipole() + self.assertAlmostEqual(lib.fp(np.linalg.norm(dip, axis=1)), -0.64009191, 5) + + df_mf = self.df_mf + td = df_mf.TDHF().set(nstates=nstates) + e = td.kernel()[0] + ref = td.to_cpu().kernel()[0][:3] + self.assertAlmostEqual(abs(e[:len(ref)] - ref).max(), 0, 7) + dip = td.transition_dipole() + self.assertAlmostEqual(lib.fp(np.linalg.norm(dip, axis=1)), -0.64011895, 5) + + def test_tdhf_triplet(self): + mf = self.mf + nstates = self.nstates + td = mf.TDHF().set(nstates=nstates) + assert td.device == 'gpu' + td.singlet = False + e = td.kernel()[0] + ref = [10.8919091, 10.8919091, 12.6343507] + self.assertAlmostEqual(abs(e[:len(ref)] * 27.2114 - ref).max(), 0, 5) + dip = td.transition_dipole() + self.assertAlmostEqual(abs(dip).max(), 0, 8) + + df_mf = self.df_mf + td = df_mf.TDHF().set(nstates=nstates) + td.singlet = False + e = td.kernel()[0] + ref = td.to_cpu().kernel()[0][:3] + self.assertAlmostEqual(abs(e[:len(ref)] - ref).max(), 0, 7) + dip = td.transition_dipole() + self.assertAlmostEqual(abs(dip).max(), 0, 8) + + def test_tda_vind(self): + mf = self.mf + nocc = self.mol.nelectron // 2 + nmo = mf.mo_energy.size + nvir = nmo - nocc + zs = np.random.rand(3,nocc,nvir) + ref = mf.to_cpu().TDA().set(singlet=False).gen_vind()[0](zs) + dat = mf.TDA().set(singlet=False).gen_vind()[0](cp.asarray(zs)) + self.assertAlmostEqual(abs(ref - dat).max(), 0, 9) + + df_mf = self.df_mf + ref = df_mf.to_cpu().TDA().set(singlet=True).gen_vind()[0](zs) + dat = df_mf.TDA().set(singlet=True).gen_vind()[0](cp.asarray(zs)) + self.assertAlmostEqual(abs(ref - dat).max(), 0, 9) + + def test_tdhf_vind(self): + mf = self.mf + nocc = 
self.mol.nelectron // 2 + nmo = mf.mo_energy.size + nvir = nmo - nocc + zs = np.random.rand(3,2,nocc,nvir) + ref = mf.to_cpu().TDHF().set(singlet=True).gen_vind()[0](zs) + dat = mf.TDHF().set(singlet=True).gen_vind()[0](cp.asarray(zs)) + self.assertAlmostEqual(abs(ref - dat).max(), 0, 9) + + df_mf = self.df_mf + ref = df_mf.to_cpu().TDHF().set(singlet=False).gen_vind()[0](zs) + dat = df_mf.TDHF().set(singlet=False).gen_vind()[0](cp.asarray(zs)) + self.assertAlmostEqual(abs(ref - dat).max(), 0, 9) + +if __name__ == "__main__": + print("Full Tests for rhf-TDA and rhf-TDHF") + unittest.main() diff --git a/gpu4pyscf/tdscf/tests/test_tdrks.py b/gpu4pyscf/tdscf/tests/test_tdrks.py new file mode 100644 index 00000000..c113c1bd --- /dev/null +++ b/gpu4pyscf/tdscf/tests/test_tdrks.py @@ -0,0 +1,282 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import unittest +import numpy as np +import cupy as cp +from pyscf import lib, gto +from gpu4pyscf import tdscf + +class KnownValues(unittest.TestCase): + @classmethod + def setUpClass(cls): + mol = gto.Mole() + mol.verbose = 5 + mol.output = '/dev/null' + mol.atom = [ + ['H' , (0. , 0. , .917)], + ['F' , (0. , 0. 
, 0.)], ] + mol.basis = '631g' + cls.mol = mol.build() + + cls.mf = mf = mol.RHF().to_gpu().run() + cls.td_hf = mf.TDHF().run(conv_tol=1e-6) + + mf_lda = mol.RKS().to_gpu().density_fit() + mf_lda.xc = 'lda, vwn' + mf_lda.grids.prune = None + mf_lda.cphf_grids = mf_lda.grids + cls.mf_lda = mf_lda.run(conv_tol=1e-10) + + mf_bp86 = mol.RKS().to_gpu().density_fit() + mf_bp86.xc = 'b88,p86' + mf_bp86.grids.prune = None + mf_bp86.cphf_grids = mf_bp86.grids + cls.mf_bp86 = mf_bp86.run(conv_tol=1e-10) + + mf_b3lyp = mol.RKS().to_gpu().density_fit() + mf_b3lyp.xc = 'b3lyp5' + mf_b3lyp.grids.prune = None + mf_b3lyp.cphf_grids = mf_b3lyp.grids + cls.mf_b3lyp = mf_b3lyp.run(conv_tol=1e-10) + + mf_m06l = mol.RKS().to_gpu().density_fit() + mf_m06l.xc = 'm06l' + mf_m06l.cphf_grids = mf_m06l.grids + cls.mf_m06l = mf_m06l.run(conv_tol=1e-10) + + @classmethod + def tearDownClass(cls): + cls.mol.stdout.close() + + def test_nohbrid_lda(self): + mf_lda = self.mf_lda + td = mf_lda.CasidaTDDFT() + assert td.device == 'gpu' + es = td.kernel(nstates=5)[0] + ref = td.to_cpu().kernel(nstates=5)[0] + self.assertAlmostEqual(abs(es - ref).max(), 0, 5) + self.assertAlmostEqual(lib.fp(es), -1.5103950945691957, 5) + + def test_nohbrid_b88p86(self): + mf_bp86 = self.mf_bp86 + td = mf_bp86.CasidaTDDFT() + assert td.device == 'gpu' + es = td.kernel(nstates=5)[0] + ref = td.to_cpu().kernel()[0] + self.assertAlmostEqual(abs(es - ref).max(), 0, 8) + self.assertAlmostEqual(lib.fp(es), -1.4869180666784665, 6) + + def test_tddft_lda(self): + mf_lda = self.mf_lda + td = mf_lda.TDDFT() + assert td.device == 'gpu' + es = td.kernel(nstates=5)[0] + ref = td.to_cpu().kernel(nstates=5)[0] + self.assertAlmostEqual(abs(es - ref).max(), 0, 8) + self.assertAlmostEqual(lib.fp(es), -1.5103950945691957, 6) + + def test_tddft_b88p86(self): + mf_bp86 = self.mf_bp86 + td = mf_bp86.TDDFT() + assert td.device == 'gpu' + td.conv_tol = 1e-5 + es = td.kernel(nstates=5)[0] + ref = td.to_cpu().kernel(nstates=5)[0] + self.assertAlmostEqual(abs(es - ref).max(), 0, 8) + self.assertAlmostEqual(lib.fp(es), -1.4869180666784665, 6) + + def test_tddft_b3lyp(self): + mf_b3lyp = self.mf_b3lyp + td = mf_b3lyp.TDDFT() + assert td.device == 'gpu' + es = td.kernel(nstates=5)[0] + ref = td.to_cpu().kernel(nstates=5)[0] + self.assertAlmostEqual(abs(es - ref).max(), 0, 8) + self.assertAlmostEqual(lib.fp(es), -1.5175884245769546, 6) + + def test_tddft_camb3lyp(self): + mol = self.mol + mf = mol.RKS(xc='camb3lyp').run() + mf.cphf_grids = mf.grids + td = mf.TDDFT().to_gpu() + assert td.device == 'gpu' + td.conv_tol = 1e-5 + es = td.kernel(nstates=4)[0] + e_ref = td.to_cpu().kernel(nstates=4)[0] + self.assertAlmostEqual(abs(es[:3]-e_ref[:3]).max(), 0, 8) + self.assertAlmostEqual(lib.fp(es[:3]*27.2114), 9.00540521503348, 6) + + def test_tda_b3lypg(self): + mol = self.mol + mf = mol.RKS() + mf.xc = 'b3lypg' + mf.grids.prune = None + mf.cphf_grids = mf.grids + mf.scf() + td = mf.TDA().to_gpu() + assert td.device == 'gpu' + es = td.kernel(nstates=5)[0] + ref = td.to_cpu().kernel(nstates=5)[0] + self.assertAlmostEqual(abs(es - ref).max(), 0, 8) + self.assertAlmostEqual(lib.fp(es), -1.520888995669812, 6) + + def test_tda_lda(self): + mf_lda = self.mf_lda + td = mf_lda.TDA() + assert td.device == 'gpu' + es = td.kernel(nstates=5)[0] + ref = td.to_cpu().kernel(nstates=5)[0] + self.assertAlmostEqual(abs(es - ref).max(), 0, 8) + self.assertAlmostEqual(lib.fp(es), -1.5141057378565799, 6) + + def test_tda_b3lyp_triplet(self): + mf_b3lyp = self.mf_b3lyp + td = mf_b3lyp.TDA() + assert 
td.device == 'gpu' + td.singlet = False + es = td.kernel(nstates=5)[0] + ref = td.to_cpu().kernel(nstates=5)[0] + self.assertAlmostEqual(abs(es - ref).max(), 0, 8) + self.assertAlmostEqual(lib.fp(es), -1.4707787881198082, 6) + td.analyze() + + def test_tda_lda_triplet(self): + mf_lda = self.mf_lda + td = mf_lda.TDA() + assert td.device == 'gpu' + td.singlet = False + es = td.kernel(nstates=6)[0] + ref = td.to_cpu().kernel(nstates=6)[0] + self.assertAlmostEqual(abs(es - ref).max(), 0, 8) + self.assertAlmostEqual(lib.fp(es[[0,1,2,4,5]]), -1.4695846533898422, 6) + + def test_tddft_b88p86_triplet(self): + mf_bp86 = self.mf_bp86 + td = mf_bp86.TDDFT() + assert td.device == 'gpu' + td.singlet = False + es = td.kernel(nstates=5)[0] + ref = td.to_cpu().kernel(nstates=5)[0] + self.assertAlmostEqual(abs(es - ref).max(), 0, 8) + self.assertAlmostEqual(lib.fp(es), -1.4412243124430528, 6) + + def test_tda_rsh(self): + mol = gto.M(atom='H 0 0 0.6; H 0 0 0', basis = "6-31g") + mf = mol.RKS() + mf.xc = 'wb97' + mf.kernel() + mf.cphf_grids = mf.grids + td = mf.TDA().to_gpu() + assert td.device == 'gpu' + e_td = td.set(nstates=5).kernel()[0] + ref = td.to_cpu().kernel(nstates=5)[0] + self.assertAlmostEqual(abs(e_td - ref).max(), 0, 8) + self.assertAlmostEqual(lib.fp(e_td), 0.3953917940299652, 6) + + def test_tda_m06l_singlet(self): + mf_m06l = self.mf_m06l + td = mf_m06l.TDA() + assert td.device == 'gpu' + es = td.kernel(nstates=5)[0] + ref = td.to_cpu().kernel(nstates=5)[0] + self.assertAlmostEqual(abs(es - ref).max(), 0, 8) + self.assertAlmostEqual(lib.fp(es), -1.5620823865741496, 6) + + def test_analyze(self): + td_hf = self.td_hf + assert td_hf.device == 'gpu' + f = td_hf.oscillator_strength(gauge='length') + self.assertAlmostEqual(lib.fp(f), -0.13908774016795605, 5) + f = td_hf.oscillator_strength(gauge='velocity', order=2) + self.assertAlmostEqual(lib.fp(f), -0.096991134490587522, 5) + + note_args = [] + def temp_logger_note(rec, msg, *args): + note_args.append(args) + with lib.temporary_env(lib.logger.Logger, note=temp_logger_note): + td_hf.analyze() + ref = [(), + (1, 11.834865910142547, 104.76181013351982, 0.01075359074556743), + (2, 11.834865910142618, 104.76181013351919, 0.010753590745567499), + (3, 16.66308427853695, 74.40651170629978, 0.3740302871966713)] + self.assertAlmostEqual(abs(np.hstack(ref) - + np.hstack(note_args)).max(), 0, 3) + + self.assertEqual(td_hf.nroots, td_hf.nstates) + mf = self.mf + self.assertAlmostEqual(lib.fp(td_hf.e_tot-mf.e_tot), 0.41508325757603637, 5) + + def test_scanner(self): + mol = self.mol + td_hf = self.td_hf + td_scan = td_hf.as_scanner().as_scanner() + td_scan.nroots = 3 + td_scan(mol) + self.assertAlmostEqual(lib.fp(td_scan.e), 0.41508325757603637, 5) + + def test_transition_multipoles(self): + td_hf = self.td_hf + self.assertAlmostEqual(abs(lib.fp(td_hf.transition_dipole() [2])), 0.39833021312014988, 4) + self.assertAlmostEqual(abs(lib.fp(td_hf.transition_quadrupole() [2])), 0.14862776196563565, 4) + self.assertAlmostEqual(abs(lib.fp(td_hf.transition_octupole() [2])), 2.79058994496489410, 4) + self.assertAlmostEqual(abs(lib.fp(td_hf.transition_velocity_dipole() [2])), 0.24021409469918567, 4) + self.assertAlmostEqual(abs(lib.fp(td_hf.transition_magnetic_dipole() [2])), 0 , 4) + self.assertAlmostEqual(abs(lib.fp(td_hf.transition_magnetic_quadrupole()[2])), 0.16558596265719450, 4) + + def test_reset(self): + mol1 = gto.M(atom='C') + mol = self.mol + td = mol.RHF().newton().TDHF().to_gpu() + assert td.device == 'gpu' + td.reset(mol1) + self.assertTrue(td.mol is 
mol1) + self.assertTrue(td._scf.mol is mol1) + + def test_tda_vind(self): + mf = self.mf_bp86 + nocc = self.mol.nelectron // 2 + nmo = mf.mo_energy.size + nvir = nmo - nocc + zs = np.random.rand(3,nocc,nvir) + ref = mf.to_cpu().TDA().set(singlet=False).gen_vind()[0](zs) + dat = mf.TDA().set(singlet=False).gen_vind()[0](cp.asarray(zs)) + self.assertAlmostEqual(abs(ref - dat).max(), 0, 9) + + def test_tddft_vind(self): + mf = self.mf_b3lyp + nocc = self.mol.nelectron // 2 + nmo = mf.mo_energy.size + nvir = nmo - nocc + zs = np.random.rand(3,2,nocc,nvir) + ref = mf.to_cpu().TDDFT().set(singlet=True).gen_vind()[0](zs) + dat = mf.TDDFT().set(singlet=True).gen_vind()[0](cp.asarray(zs)) + self.assertAlmostEqual(abs(ref - dat).max(), 0, 9) + + def test_casida_tddft_vind(self): + mf = self.mf_lda + nocc = self.mol.nelectron // 2 + nmo = mf.mo_energy.size + nvir = nmo - nocc + zs = np.random.rand(3,nocc,nvir) + ref = mf.to_cpu().CasidaTDDFT().set().gen_vind()[0](zs) + dat = mf.CasidaTDDFT().set().gen_vind()[0](cp.asarray(zs)) + self.assertAlmostEqual(abs(ref - dat).max(), 0, 9) + +if __name__ == "__main__": + print("Full Tests for TD-RKS") + unittest.main() diff --git a/gpu4pyscf/tdscf/tests/test_tduhf.py b/gpu4pyscf/tdscf/tests/test_tduhf.py new file mode 100644 index 00000000..2b6c2df9 --- /dev/null +++ b/gpu4pyscf/tdscf/tests/test_tduhf.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import unittest +import numpy as np +import cupy as cp +from pyscf import lib, gto, scf +from gpu4pyscf import tdscf + +class KnownValues(unittest.TestCase): + @classmethod + def setUpClass(cls): + mol = gto.Mole() + mol.verbose = 0 + mol.atom = [ + ['H' , (0. , 0. , .917)], + ['F' , (0. , 0. , 0.)], ] + mol.basis = '631g' + # FIXME: mo_coeff of uhf_symm.SymAdaptedUHF not converted to cupy arrays + mol.symmetry = True + cls.mol = mol.build() + cls.mf = scf.UHF(mol).density_fit().run(conv_tol=1e-10).to_gpu() + + mol1 = gto.Mole() + mol1.verbose = 7 + mol1.output = '/dev/null' + mol1.atom = [ + ['H' , (0. , 0. , .917)], + ['F' , (0. , 0. 
, 0.)], ] + mol1.basis = '631g' + mol1.spin = 2 + cls.mol1 = mol1.build() + cls.mf1 = scf.UHF(mol1).run(conv_tol=1e-10).to_gpu() + + @classmethod + def tearDownClass(cls): + cls.mol1.stdout.close() + + def test_tda(self): + mf = self.mf + td = mf.TDA() + assert td.device == 'gpu' + td.nstates = 5 + e = td.kernel()[0] + ref = [11.0179839, 11.0179839, 11.9031214, 11.9031214, 13.1701375] + self.assertAlmostEqual(abs(e * 27.2114 - ref).max(), 0, 4) + ref = td.to_cpu().kernel()[0] + self.assertAlmostEqual(abs(e - ref).max(), 0, 4) + + def test_tdhf(self): + mf = self.mf + td = mf.TDHF() + assert td.device == 'gpu' + td.nstates = 5 + td.conv_tol = 1e-5 + e = td.kernel()[0] + ref = [10.8924334, 10.8924334, 11.8352278, 11.8352278, 12.6350840] + self.assertAlmostEqual(abs(e * 27.2114 - ref).max(), 0, 4) + ref = td.to_cpu().kernel()[0] + self.assertAlmostEqual(abs(e - ref).max(), 0, 4) + + def test_tda1(self): + mf1 = self.mf1 + td = mf1.TDA() + assert td.device == 'gpu' + td.nstates = 5 + e = td.kernel()[0] + ref = [ 3.3211349, 18.5597821, 21.0147390, 21.6150240, 25.0938938] + self.assertAlmostEqual(abs(e * 27.2114 - ref).max(), 0, 4) + ref = td.to_cpu().kernel()[0] + self.assertAlmostEqual(abs(e - ref).max(), 0, 4) + + def test_tdhf1(self): + mf1 = self.mf1 + td = mf1.TDHF() + assert td.device == 'gpu' + td.nstates = 4 + e = td.kernel()[0] + ref = [ 3.3126683, 18.4954862, 20.8493515, 21.5480882,] + self.assertAlmostEqual(abs(e * 27.2114 - ref).max(), 0, 4) + ref = td.to_cpu().kernel()[0] + self.assertAlmostEqual(abs(e - ref).max(), 0, 4) + + def test_tda_vind(self): + mf = self.mf1 + nocca, noccb = mf.nelec + nmo = mf.mo_energy[0].size + nvira = nmo - nocca + nvirb = nmo - noccb + zs = np.random.rand(3,nocca*nvira+noccb*nvirb) + ref = mf.to_cpu().TDA().set().gen_vind()[0](zs) + dat = mf.TDA().set().gen_vind()[0](cp.asarray(zs)) + self.assertAlmostEqual(abs(ref - dat).max(), 0, 9) + + def test_tdhf_vind(self): + mf = self.mf1 + nocca, noccb = mf.nelec + nmo = mf.mo_energy[0].size + nvira = nmo - nocca + nvirb = nmo - noccb + zs = np.random.rand(3,2,nocca*nvira+noccb*nvirb) + ref = mf.to_cpu().TDHF().set().gen_vind()[0](zs) + dat = mf.TDHF().set().gen_vind()[0](cp.asarray(zs)) + self.assertAlmostEqual(abs(ref - dat).max(), 0, 9) + +if __name__ == "__main__": + print("Full Tests for uhf-TDA and uhf-TDHF") + unittest.main() diff --git a/gpu4pyscf/tdscf/tests/test_tduks.py b/gpu4pyscf/tdscf/tests/test_tduks.py new file mode 100644 index 00000000..598e4156 --- /dev/null +++ b/gpu4pyscf/tdscf/tests/test_tduks.py @@ -0,0 +1,220 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import unittest +import numpy as np +import cupy as cp +from pyscf import lib, gto +from gpu4pyscf import tdscf + +class KnownValues(unittest.TestCase): + @classmethod + def setUpClass(cls): + mol = gto.Mole() + mol.verbose = 5 + mol.output = '/dev/null' + mol.atom = ''' + O 0. 0. 
0. + H 0. -0.757 0.587 + H 0. 0.757 0.587''' + mol.spin = 2 + mol.basis = '631g' + cls.mol = mol.build() + + mol1 = gto.Mole() + mol1.verbose = 0 + mol1.atom = ''' + O 0. 0. 0. + H 0. -0.757 0.587 + H 0. 0.757 0.587''' + mol1.basis = '631g' + cls.mol1 = mol1.build() + + cls.mf_uhf = mf_uhf = mol.UHF().to_gpu().run() + cls.td_hf = mf_uhf.TDHF().run(conv_tol=1e-6) + + mf_lda = mol.UKS().set(xc='lda', conv_tol=1e-12).to_gpu() + mf_lda.grids.prune = None + mf_lda.cphf_grids = mf_lda.grids + cls.mf_lda = mf_lda.density_fit().run() + + mf_bp86 = mol.UKS().set(xc='b88,p86', conv_tol=1e-12).to_gpu() + mf_bp86.grids.prune = None + mf_bp86.cphf_grids = mf_bp86.grids + cls.mf_bp86 = mf_bp86.density_fit().run() + + mf_b3lyp = mol.UKS().set(xc='b3lyp5', conv_tol=1e-12).to_gpu() + mf_b3lyp.grids.prune = None + mf_b3lyp.cphf_grids = mf_b3lyp.grids + cls.mf_b3lyp = mf_b3lyp.density_fit().run() + + mf_m06l = mol.UKS().to_gpu().density_fit().run(xc='m06l') + mf_m06l.cphf_grids = mf_m06l.grids + cls.mf_m06l = mf_m06l + + @classmethod + def tearDownClass(cls): + cls.mol.stdout.close() + + def test_nohybrid_lda(self): + mf_lda = self.mf_lda + td = mf_lda.CasidaTDDFT() + assert td.device == 'gpu' + es = td.kernel(nstates=4)[0] + e_ref = td.to_cpu().kernel(nstates=4)[0] + self.assertAlmostEqual(abs(es[:3]-e_ref[:3]).max(), 0, 8) + self.assertAlmostEqual(lib.fp(es[:3]), 0.0476763425122965, 6) + + mol1 = self.mol1 + mf = mol1.UKS().run(xc='lda, vwn_rpa').run() + mf.cphf_grids = mf.grids + td = mf.CasidaTDDFT().to_gpu() + assert td.device == 'gpu' + td.nstates = 5 + es = td.kernel()[0] + ref = td.to_cpu().kernel()[0] + self.assertAlmostEqual(abs(es - ref).max(), 0, 8) + + def test_nohybrid_b88p86(self): + mf_bp86 = self.mf_bp86 + td = mf_bp86.CasidaTDDFT() + assert td.device == 'gpu' + es = td.kernel(nstates=4)[0] + e_ref = td.to_cpu().kernel(nstates=4)[0] + self.assertAlmostEqual(abs(es[:3]-e_ref[:3]).max(), 0, 8) + self.assertAlmostEqual(lib.fp(es[:3]), 0.05383891686210346, 6) + + def test_tddft_lda(self): + mf_lda = self.mf_lda + td = mf_lda.TDDFT() + assert td.device == 'gpu' + es = td.kernel(nstates=4)[0] + ref = td.to_cpu().kernel(nstates=4)[0] + self.assertAlmostEqual(abs(es - ref).max(), 0, 8) + self.assertAlmostEqual(lib.fp(es[:3]), 0.0476763425122965, 6) + + def test_tddft_b88p86(self): + mf_bp86 = self.mf_bp86 + td = mf_bp86.TDDFT() + assert td.device == 'gpu' + es = td.kernel(nstates=5)[0] + ref = td.to_cpu().kernel(nstates=5)[0] + self.assertAlmostEqual(abs(es - ref).max(), 0, 8) + self.assertAlmostEqual(lib.fp(es[:3]), 0.05383891686259823, 6) + + mol1 = self.mol1 + mf = mol1.UKS().run(xc='b88,p86').run() + mf.cphf_grids = mf.grids + td = mf.TDDFT().to_gpu() + assert td.device == 'gpu' + es = td.kernel(nstates=5)[0] + ref = td.to_cpu().kernel(nstates=5)[0] + self.assertAlmostEqual(abs(es - ref).max(), 0, 8) + + def test_tddft_b3lyp(self): + mf_b3lyp = self.mf_b3lyp + td = mf_b3lyp.TDDFT() + assert td.device == 'gpu' + es = td.kernel(nstates=4)[0] + ref = td.to_cpu().kernel(nstates=4)[0] + self.assertAlmostEqual(abs(es - ref).max(), 0, 8) + self.assertAlmostEqual(lib.fp(es[:3]), 0.047793873508724743, 6) + + def test_tddft_camb3lyp(self): + mol1 = self.mol1 + mf = mol1.UKS(xc='camb3lyp').run() + mf.cphf_grids = mf.grids + td = mf.TDDFT().to_gpu() + assert td.device == 'gpu' + es = td.kernel(nstates=4)[0] + e_ref = td.to_cpu().kernel(nstates=4)[0] + self.assertAlmostEqual(abs(es[:3]-e_ref[:3]).max(), 0, 8) + self.assertAlmostEqual(lib.fp(es[:3]), 0.2827429269753051, 6) + + def test_tda_b3lyp(self): + 
mf_b3lyp = self.mf_b3lyp + td = mf_b3lyp.TDA() + assert td.device == 'gpu' + es = td.kernel(nstates=4)[0] + ref = td.to_cpu().kernel(nstates=4)[0] + self.assertAlmostEqual(abs(es - ref).max(), 0, 8) + self.assertAlmostEqual(lib.fp(es[:3]), 0.052638024165134974, 6) + + def test_tda_lda(self): + mf_lda = self.mf_lda + td = mf_lda.TDA() + assert td.device == 'gpu' + es = td.kernel(nstates=5)[0] + ref = td.to_cpu().kernel(nstates=5)[0] + self.assertAlmostEqual(abs(es - ref).max(), 0, 8) + self.assertAlmostEqual(lib.fp(es[:3]), 0.05368082550881462, 6) + + mol1 = self.mol1 + mf = mol1.UKS().run(xc='lda,vwn').run() + mf.cphf_grids = mf.grids + td = mf.TDA().to_gpu() + assert td.device == 'gpu' + td.nstates = 5 + es = td.kernel()[0] + ref = td.to_cpu().kernel()[0] + self.assertAlmostEqual(abs(es - ref).max(), 0, 8) + + def test_tda_m06l(self): + mf_m06l = self.mf_m06l + td = mf_m06l.TDA() + assert td.device == 'gpu' + es = td.kernel(nstates=5)[0] + ref = td.to_cpu().kernel(nstates=5)[0] + self.assertAlmostEqual(abs(es - ref[:5]).max(), 0, 8) + self.assertAlmostEqual(lib.fp(es), -0.7530329968766932, 6) + + def test_tda_vind(self): + mf = self.mf_bp86 + nocca, noccb = mf.nelec + nmo = mf.mo_energy[0].size + nvira = nmo - nocca + nvirb = nmo - noccb + zs = np.random.rand(3,nocca*nvira+noccb*nvirb) + ref = mf.to_cpu().TDA().gen_vind()[0](zs) + dat = mf.TDA().gen_vind()[0](cp.asarray(zs)) + self.assertAlmostEqual(abs(ref - dat).max(), 0, 9) + + def test_tddft_vind(self): + mf = self.mf_b3lyp + nocca, noccb = mf.nelec + nmo = mf.mo_energy[0].size + nvira = nmo - nocca + nvirb = nmo - noccb + zs = np.random.rand(3,2,nocca*nvira+noccb*nvirb) + ref = mf.to_cpu().TDDFT().gen_vind()[0](zs) + dat = mf.TDDFT().gen_vind()[0](cp.asarray(zs)) + self.assertAlmostEqual(abs(ref - dat).max(), 0, 9) + + def test_casida_tddft_vind(self): + mf = self.mf_lda + nocca, noccb = mf.nelec + nmo = mf.mo_energy[0].size + nvira = nmo - nocca + nvirb = nmo - noccb + zs = np.random.rand(3,nocca*nvira+noccb*nvirb) + ref = mf.to_cpu().CasidaTDDFT().gen_vind()[0](zs) + dat = mf.CasidaTDDFT().gen_vind()[0](cp.asarray(zs)) + self.assertAlmostEqual(abs(ref - dat).max(), 0, 9) + +if __name__ == "__main__": + print("Full Tests for TD-UKS") + unittest.main() diff --git a/gpu4pyscf/tdscf/uhf.py b/gpu4pyscf/tdscf/uhf.py new file mode 100644 index 00000000..27cc0850 --- /dev/null +++ b/gpu4pyscf/tdscf/uhf.py @@ -0,0 +1,785 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
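Before the implementation, a short orientation sketch: this module is driven like its CPU counterpart pyscf.tdscf.uhf. The molecule, basis, and state counts below mirror the tests in this PR and are illustrative only:

from pyscf import gto, scf

mol = gto.M(atom='H 0 0 0.917; F 0 0 0', basis='631g', spin=2)
mf = scf.UHF(mol).run(conv_tol=1e-10).to_gpu()

td = mf.TDA()              # spin-conserved TDA on the GPU
td.nstates = 5
e_tda, xy_tda = td.kernel()

td = mf.TDHF()             # full TDHF; a non-Hermitian eigenproblem
td.nstates = 4
e_tdhf, xy_tdhf = td.kernel()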
+ + +import numpy as np +import cupy as cp +from pyscf import lib +from pyscf.tdscf import uhf as tdhf_cpu +from pyscf.data.nist import HARTREE2EV, HARTREE2WAVENUMBER +from pyscf.tdscf._lr_eig import eigh as lr_eigh, eig as lr_eig +from gpu4pyscf import scf +from gpu4pyscf.lib import logger +from gpu4pyscf.lib.cupy_helper import contract, tag_array +from gpu4pyscf.tdscf._uhf_resp_sf import gen_uhf_response_sf +from gpu4pyscf.tdscf import rhf as tdhf_gpu +from gpu4pyscf.dft import KohnShamDFT +from pyscf import __config__ + +__all__ = [ + 'TDA', 'CIS', 'TDHF', 'TDUHF', 'TDBase' +] + +REAL_EIG_THRESHOLD = tdhf_cpu.REAL_EIG_THRESHOLD + +def gen_tda_operation(mf, fock_ao=None, wfnsym=None): + '''A x + ''' + assert fock_ao is None + assert isinstance(mf, scf.hf.SCF) + assert wfnsym is None + if isinstance(mf.mo_coeff, (tuple, list)): + # The to_gpu() in pyscf is not able to convert SymAdaptedUHF.mo_coeff. + # In this case, mf.mo_coeff has the type (NPArrayWithTag, NPArrayWithTag). + # cp.asarray() for this object leads to an error in + # cupy._core.core._array_from_nested_sequence + mo_coeff = cp.asarray(mf.mo_coeff[0]), cp.asarray(mf.mo_coeff[1]) + else: + mo_coeff = cp.asarray(mf.mo_coeff) + assert mo_coeff[0].dtype == cp.float64 + mo_energy = cp.asarray(mf.mo_energy) + mo_occ = cp.asarray(mf.mo_occ) + nao, nmo = mo_coeff[0].shape + occidxa = mo_occ[0] > 0 + occidxb = mo_occ[1] > 0 + viridxa = mo_occ[0] ==0 + viridxb = mo_occ[1] ==0 + orboa = mo_coeff[0][:,occidxa] + orbob = mo_coeff[1][:,occidxb] + orbva = mo_coeff[0][:,viridxa] + orbvb = mo_coeff[1][:,viridxb] + + e_ia_a = mo_energy[0][viridxa] - mo_energy[0][occidxa,None] + e_ia_b = mo_energy[1][viridxb] - mo_energy[1][occidxb,None] + e_ia = cp.hstack((e_ia_a.reshape(-1), e_ia_b.reshape(-1))) + hdiag = e_ia.get() + nocca, nvira = e_ia_a.shape + noccb, nvirb = e_ia_b.shape + + vresp = mf.gen_response(hermi=0) + + def vind(zs): + nz = len(zs) + zs = cp.asarray(zs) + za = zs[:,:nocca*nvira].reshape(nz,nocca,nvira) + zb = zs[:,nocca*nvira:].reshape(nz,noccb,nvirb) + mo1a = contract('xov,pv->xpo', za, orbva) + dmsa = contract('xpo,qo->xpq', mo1a, orboa.conj()) + mo1b = contract('xov,pv->xpo', zb, orbvb) + dmsb = contract('xpo,qo->xpq', mo1b, orbob.conj()) + dms = cp.asarray((dmsa, dmsb)) + dms = tag_array(dms, mo1=[mo1a,mo1b], occ_coeff=[orboa,orbob]) + v1ao = vresp(dms) + v1a = contract('xpq,qo->xpo', v1ao[0], orboa) + v1a = contract('xpo,pv->xov', v1a, orbva.conj()) + v1b = contract('xpq,qo->xpo', v1ao[1], orbob) + v1b = contract('xpo,pv->xov', v1b, orbvb.conj()) + v1a += za * e_ia_a + v1b += zb * e_ia_b + hx = cp.hstack((v1a.reshape(nz,-1), v1b.reshape(nz,-1))) + return hx.get() + + return vind, hdiag + + +class TDBase(tdhf_gpu.TDBase): + def _contract_multipole(tdobj, ints, hermi=True, xy=None): + if xy is None: xy = tdobj.xy + mo_coeff = tdobj._scf.mo_coeff + mo_occ = tdobj._scf.mo_occ + orbo_a = mo_coeff[0][:,mo_occ[0]==1] + orbv_a = mo_coeff[0][:,mo_occ[0]==0] + orbo_b = mo_coeff[1][:,mo_occ[1]==1] + orbv_b = mo_coeff[1][:,mo_occ[1]==0] + if isinstance(orbo_a, cp.ndarray): + orbo_a = orbo_a.get() + orbv_a = orbv_a.get() + orbo_b = orbo_b.get() + orbv_b = orbv_b.get() + + ints_a = np.einsum('...pq,pi,qj->...ij', ints, orbo_a.conj(), orbv_a) + ints_b = np.einsum('...pq,pi,qj->...ij', ints, orbo_b.conj(), orbv_b) + pol = [(np.einsum('...ij,ij->...', ints_a, x[0]) + + np.einsum('...ij,ij->...', ints_b, x[1])) for x,y in xy] + pol = np.array(pol) + y = xy[0][1] + if isinstance(y[0], np.ndarray): + pol_y = [(np.einsum('...ij,ij->...', ints_a, 
y[0]) + + np.einsum('...ij,ij->...', ints_b, y[1])) for x,y in xy] + if hermi: + pol += pol_y + else: # anti-Hermitian + pol -= pol_y + return pol + + +class TDA(TDBase): + __doc__ = tdhf_gpu.TDA.__doc__ + + singlet = None + + def gen_vind(self, mf=None): + '''Generate function to compute Ax''' + if mf is None: + mf = self._scf + return gen_tda_operation(mf) + + def init_guess(self, mf=None, nstates=None, wfnsym=None, return_symmetry=False): + if mf is None: mf = self._scf + if nstates is None: nstates = self.nstates + assert wfnsym is None + assert not return_symmetry + + mo_energy_a, mo_energy_b = mf.mo_energy + mo_occ_a, mo_occ_b = mf.mo_occ + if isinstance(mo_energy_a, cp.ndarray): + mo_energy_a = mo_energy_a.get() + mo_energy_b = mo_energy_b.get() + if isinstance(mo_occ_a, cp.ndarray): + mo_occ_a = mo_occ_a.get() + mo_occ_b = mo_occ_b.get() + occidxa = mo_occ_a > 0 + occidxb = mo_occ_b > 0 + viridxa = mo_occ_a == 0 + viridxb = mo_occ_b == 0 + e_ia_a = mo_energy_a[viridxa] - mo_energy_a[occidxa,None] + e_ia_b = mo_energy_b[viridxb] - mo_energy_b[occidxb,None] + nov = e_ia_a.size + e_ia_b.size + nstates = min(nstates, nov) + + e_ia = np.append(e_ia_a.ravel(), e_ia_b.ravel()) + # Find the nstates-th lowest energy gap + e_threshold = np.partition(e_ia, nstates-1)[nstates-1] + e_threshold += self.deg_eia_thresh + + idx = np.where(e_ia <= e_threshold)[0] + x0 = np.zeros((idx.size, nov)) + for i, j in enumerate(idx): + x0[i, j] = 1 + return x0 + + def kernel(self, x0=None, nstates=None): + '''TDA diagonalization solver + ''' + log = logger.new_logger(self) + cpu0 = (logger.process_clock(), logger.perf_counter()) + self.check_sanity() + self.dump_flags() + if nstates is None: + nstates = self.nstates + else: + self.nstates = nstates + + vind, hdiag = self.gen_vind(self._scf) + precond = self.get_precond(hdiag) + + def pickeig(w, v, nroots, envs): + idx = np.where(w > self.positive_eig_threshold)[0] + return w[idx], v[:,idx], idx + + x0sym = None + if x0 is None: + x0 = self.init_guess() + + self.converged, self.e, x1 = lr_eigh( + vind, x0, precond, tol_residual=self.conv_tol, lindep=self.lindep, + nroots=nstates, x0sym=x0sym, pick=pickeig, max_cycle=self.max_cycle, + max_memory=self.max_memory, verbose=log) + + nmo = self._scf.mo_occ[0].size + nocca, noccb = self._scf.nelec + nvira = nmo - nocca + nvirb = nmo - noccb + self.xy = [((xi[:nocca*nvira].reshape(nocca,nvira), # X_alpha + xi[nocca*nvira:].reshape(noccb,nvirb)), # X_beta + (0, 0)) # (Y_alpha, Y_beta) + for xi in x1] + + log.timer('TDA', *cpu0) + self._finalize() + return self.e, self.xy + +CIS = TDA + +class SpinFlipTDA(TDBase): + ''' + Attributes: + extype : int (0 or 1) + Spin flip up: extype=0. Spin flip down: extype=1. + collinear : str + collinear schemes, can be + 'col': collinear, by default + 'ncol': non-collinear + 'mcol': multi-collinear + collinear_samples : int + Integration samples for the multi-collinear treatment + ''' + + extype = getattr(__config__, 'tdscf_uhf_SFTDA_extype', 1) + collinear = getattr(__config__, 'tdscf_uhf_SFTDA_collinear', 'col') + collinear_samples = getattr(__config__, 'tdscf_uhf_SFTDA_collinear_samples', 200) + + _keys = {'extype', 'collinear', 'collinear_samples'} + + def gen_vind(self): + '''Generate function to compute A*x for spin-flip TDDFT case. + ''' + mf = self._scf + assert isinstance(mf, scf.hf.SCF) + if isinstance(mf.mo_coeff, (tuple, list)): + # The to_gpu() in pyscf is not able to convert SymAdaptedUHF.mo_coeff.
+ # In this case, mf.mo_coeff has the type (NPArrayWithTag, NPArrayWithTag). + # cp.asarray() for this object leads to an error in + # cupy._core.core._array_from_nested_sequence + mo_coeff = cp.asarray(mf.mo_coeff[0]), cp.asarray(mf.mo_coeff[1]) + else: + mo_coeff = cp.asarray(mf.mo_coeff) + assert mo_coeff[0].dtype == cp.float64 + mo_energy = cp.asarray(mf.mo_energy) + mo_occ = cp.asarray(mf.mo_occ) + nao, nmo = mo_coeff[0].shape + + extype = self.extype + if extype == 0: + occidxb = mo_occ[1] > 0 + viridxa = mo_occ[0] ==0 + orbob = mo_coeff[1][:,occidxb] + orbva = mo_coeff[0][:,viridxa] + orbov = (orbob, orbva) + e_ia = mo_energy[0][viridxa] - mo_energy[1][occidxb,None] + hdiag = e_ia.ravel().get() + + elif extype == 1: + occidxa = mo_occ[0] > 0 + viridxb = mo_occ[1] ==0 + orboa = mo_coeff[0][:,occidxa] + orbvb = mo_coeff[1][:,viridxb] + orbov = (orboa, orbvb) + e_ia = mo_energy[1][viridxb] - mo_energy[0][occidxa,None] + hdiag = e_ia.ravel().get() + + vresp = gen_uhf_response_sf( + mf, hermi=0, collinear=self.collinear, + collinear_samples=self.collinear_samples) + + def vind(zs): + zs = cp.asarray(zs).reshape(-1, *e_ia.shape) + orbo, orbv = orbov + mo1 = contract('xov,pv->xpo', zs, orbv) + dms = contract('xpo,qo->xpq', mo1, orbo.conj()) + dms = tag_array(dms, mo1=mo1, occ_coeff=orbo) + v1ao = vresp(dms) + v1mo = contract('xpq,qo->xpo', v1ao, orbo) + v1mo = contract('xpo,pv->xov', v1mo, orbv.conj()) + v1mo += zs * e_ia + return v1mo.reshape(len(v1mo), -1).get() + + return vind, hdiag + + def _init_guess(self, mf, nstates): + mo_energy_a, mo_energy_b = mf.mo_energy + mo_occ_a, mo_occ_b = mf.mo_occ + if isinstance(mo_energy_a, cp.ndarray): + mo_energy_a = mo_energy_a.get() + mo_energy_b = mo_energy_b.get() + if isinstance(mo_occ_a, cp.ndarray): + mo_occ_a = mo_occ_a.get() + mo_occ_b = mo_occ_b.get() + + if self.extype == 0: + occidxb = mo_occ_b > 0 + viridxa = mo_occ_a ==0 + e_ia = mo_energy_a[viridxa] - mo_energy_b[occidxb,None] + + elif self.extype == 1: + occidxa = mo_occ_a > 0 + viridxb = mo_occ_b ==0 + e_ia = mo_energy_b[viridxb] - mo_energy_a[occidxa,None] + + e_ia = e_ia.ravel() + nov = e_ia.size + nstates = min(nstates, nov) + e_threshold = np.partition(e_ia, nstates-1)[nstates-1] + idx = np.where(e_ia <= e_threshold)[0] + nstates = idx.size + e = e_ia[idx] + idx = idx[np.argsort(e)] + x0 = np.zeros((nstates, nov)) + for i, j in enumerate(idx): + x0[i, j] = 1 + return np.sort(e), x0.reshape(nstates, *e_ia.shape) + + def init_guess(self, mf=None, nstates=None, wfnsym=None): + if mf is None: mf = self._scf + if nstates is None: nstates = self.nstates + x0 = self._init_guess(mf, nstates)[1] + return x0.reshape(len(x0), -1) + + def dump_flags(self, verbose=None): + TDBase.dump_flags(self, verbose) + logger.info(self, 'extype = %s', self.extype) + logger.info(self, 'collinear = %s', self.collinear) + if self.collinear == 'mcol': + logger.info(self, 'collinear_samples = %s', self.collinear_samples) + return self + + def check_sanity(self): + TDBase.check_sanity(self) + assert self.extype in (0, 1) + assert self.collinear in ('col', 'ncol', 'mcol') + return self + + def kernel(self, x0=None, nstates=None): + '''Spin-flip TDA diagonalization solver + ''' + log = logger.new_logger(self) + cpu0 = log.init_timer() + self.check_sanity() + self.dump_flags() + if nstates is None: + nstates = self.nstates + else: + self.nstates = nstates + + if self.collinear == 'col' and isinstance(self._scf, KohnShamDFT): + mf = self._scf + ni = mf._numint + if not ni.libxc.is_hybrid_xc(mf.xc): + 
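# For a collinear ('col') kernel without HF exchange the spin-flip coupling block vanishes, so the SF-TDA excitation energies reduce to the bare orbital-energy differences produced by _init_guess: +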
self.converged = True + self.e, xs = self._init_guess(mf, nstates) + self.xy = [(x, 0) for x in xs] + return self.e, self.xy + + x0sym = None + if x0 is None: + x0 = self.init_guess() + + # Keep all eigenvalues as SF-TDDFT allows triplet to singlet + # "dexcitation" + def all_eigs(w, v, nroots, envs): + return w, v, np.arange(w.size) + + vind, hdiag = self.gen_vind() + precond = self.get_precond(hdiag) + + self.converged, self.e, x1 = lr_eigh( + vind, x0, precond, tol_residual=self.conv_tol, lindep=self.lindep, + nroots=nstates, x0sym=x0sym, pick=all_eigs, max_cycle=self.max_cycle, + max_memory=self.max_memory, verbose=log) + + nmo = self._scf.mo_occ[0].size + nocca, noccb = self._scf.nelec + nvira = nmo - nocca + nvirb = nmo - noccb + + if self.extype == 0: + self.xy = [(xi.reshape(noccb,nvira), 0) for xi in x1] + elif self.extype == 1: + self.xy = [(xi.reshape(nocca,nvirb), 0) for xi in x1] + log.timer('SpinFlipTDA', *cpu0) + self._finalize() + return self.e, self.xy + + +def gen_tdhf_operation(mf, fock_ao=None, singlet=True, wfnsym=None): + '''Generate function to compute + + [ A B ][X] + [-B* -A*][Y] + ''' + assert fock_ao is None + assert isinstance(mf, scf.hf.SCF) + if isinstance(mf.mo_coeff, (tuple, list)): + # The to_gpu() in pyscf is not able to convert SymAdaptedUHF.mo_coeff. + # In this case, mf.mo_coeff has the type (NPArrayWithTag, NPArrayWithTag). + # cp.asarray() for this object leads to an error in + # cupy._core.core._array_from_nested_sequence + mo_coeff = cp.asarray(mf.mo_coeff[0]), cp.asarray(mf.mo_coeff[1]) + else: + mo_coeff = cp.asarray(mf.mo_coeff) + assert mo_coeff[0].dtype == cp.float64 + mo_energy = cp.asarray(mf.mo_energy) + mo_occ = cp.asarray(mf.mo_occ) + occidxa = mo_occ[0] > 0 + occidxb = mo_occ[1] > 0 + viridxa = mo_occ[0] == 0 + viridxb = mo_occ[1] == 0 + orboa = mo_coeff[0][:,occidxa] + orbob = mo_coeff[1][:,occidxb] + orbva = mo_coeff[0][:,viridxa] + orbvb = mo_coeff[1][:,viridxb] + + e_ia_a = mo_energy[0][viridxa] - mo_energy[0][occidxa,None] + e_ia_b = mo_energy[1][viridxb] - mo_energy[1][occidxb,None] + e_ia = hdiag = cp.hstack((e_ia_a.ravel(), e_ia_b.ravel())) + hdiag = cp.hstack((hdiag, -hdiag)).get() + nocca, nvira = e_ia_a.shape + noccb, nvirb = e_ia_b.shape + + vresp = mf.gen_response(hermi=0) + + def vind(xys): + nz = len(xys) + xys = cp.asarray(xys).reshape(nz,2,-1) + xs, ys = xys.transpose(1,0,2) + xa = xs[:,:nocca*nvira].reshape(nz,nocca,nvira) + xb = xs[:,nocca*nvira:].reshape(nz,noccb,nvirb) + ya = ys[:,:nocca*nvira].reshape(nz,nocca,nvira) + yb = ys[:,nocca*nvira:].reshape(nz,noccb,nvirb) + tmp = contract('xov,pv->xpo', xa, orbva) + dmsa = contract('xpo,qo->xpq', tmp, orboa.conj()) + tmp = contract('xov,pv->xpo', xb, orbvb) + dmsb = contract('xpo,qo->xpq', tmp, orbob.conj()) + tmp = contract('xov,qv->xoq', ya, orbva.conj()) + dmsa+= contract('xoq,po->xpq', tmp, orboa) + tmp = contract('xov,qv->xoq', yb, orbvb.conj()) + dmsb+= contract('xoq,po->xpq', tmp, orbob) + v1ao = vresp(cp.asarray((dmsa,dmsb))) + v1a_top = contract('xpq,qo->xpo', v1ao[0], orboa) + v1a_top = contract('xpo,pv->xov', v1a_top, orbva.conj()) + v1b_top = contract('xpq,qo->xpo', v1ao[1], orbob) + v1b_top = contract('xpo,pv->xov', v1b_top, orbvb.conj()) + v1a_bot = contract('xpq,po->xoq', v1ao[0], orboa.conj()) + v1a_bot = contract('xoq,qv->xov', v1a_bot, orbva) + v1b_bot = contract('xpq,po->xoq', v1ao[1], orbob.conj()) + v1b_bot = contract('xoq,qv->xov', v1b_bot, orbvb) + + v1_top = xs * e_ia + v1_bot = ys * e_ia + v1_top[:,:nocca*nvira] += v1a_top.reshape(nz,-1) +
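# The remaining alpha/beta blocks are accumulated below; cp.hstack((v1_top, -v1_bot)) then applies the sign structure of the [[A, B], [-B*, -A*]] supermatrix solved by the non-Hermitian eigensolver. +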
v1_bot[:,:nocca*nvira] += v1a_bot.reshape(nz,-1) + v1_top[:,nocca*nvira:] += v1b_top.reshape(nz,-1) + v1_bot[:,nocca*nvira:] += v1b_bot.reshape(nz,-1) + hx = cp.hstack((v1_top, -v1_bot)) + return hx.get() + + return vind, hdiag + + +class TDHF(TDBase): + + singlet = None + + @lib.with_doc(gen_tdhf_operation.__doc__) + def gen_vind(self, mf=None): + if mf is None: + mf = self._scf + return gen_tdhf_operation(mf, singlet=self.singlet) + + def init_guess(self, mf=None, nstates=None, wfnsym=None, return_symmetry=False): + x0 = TDA.init_guess(self, mf, nstates, wfnsym, return_symmetry) + y0 = np.zeros_like(x0) + return np.hstack([x0, y0]) + + def kernel(self, x0=None, nstates=None): + '''TDHF diagonalization with non-Hermitian eigenvalue solver + ''' + log = logger.new_logger(self) + cpu0 = log.init_timer() + self.check_sanity() + self.dump_flags() + if nstates is None: + nstates = self.nstates + else: + self.nstates = nstates + + vind, hdiag = self.gen_vind(self._scf) + precond = self.get_precond(hdiag) + + # handle single kpt PBC SCF + if getattr(self._scf, 'kpt', None) is not None: + from pyscf.pbc.lib.kpts_helper import gamma_point + real_system = (gamma_point(self._scf.kpt) and + self._scf.mo_coeff[0].dtype == np.double) + else: + real_system = True + + # We only need positive eigenvalues + def pickeig(w, v, nroots, envs): + realidx = np.where((abs(w.imag) < REAL_EIG_THRESHOLD) & + (w.real > self.positive_eig_threshold))[0] + return lib.linalg_helper._eigs_cmplx2real(w, v, realidx, real_system) + + x0sym = None + if x0 is None: + x0 = self.init_guess() + + self.converged, w, x1 = lr_eig( + vind, x0, precond, tol_residual=self.conv_tol, lindep=self.lindep, + nroots=nstates, x0sym=x0sym, pick=pickeig, max_cycle=self.max_cycle, + max_memory=self.max_memory, verbose=log) + + nmo = self._scf.mo_occ[0].size + nocca, noccb = self._scf.nelec + nvira = nmo - nocca + nvirb = nmo - noccb + e = [] + xy = [] + for i, z in enumerate(x1): + x, y = z.reshape(2,-1) + norm = lib.norm(x)**2 - lib.norm(y)**2 + if norm > 0: + norm = norm**-.5 + e.append(w[i]) + xy.append(((x[:nocca*nvira].reshape(nocca,nvira) * norm, # X_alpha + x[nocca*nvira:].reshape(noccb,nvirb) * norm), # X_beta + (y[:nocca*nvira].reshape(nocca,nvira) * norm, # Y_alpha + y[nocca*nvira:].reshape(noccb,nvirb) * norm)))# Y_beta + self.e = np.array(e) + self.xy = xy + + log.timer('TDDFT', *cpu0) + self._finalize() + return self.e, self.xy + +TDUHF = TDHF + +class SpinFlipTDHF(TDBase): + + extype = SpinFlipTDA.extype + collinear = SpinFlipTDA.collinear + collinear_samples = SpinFlipTDA.collinear_samples + + _keys = {'extype', 'collinear', 'collinear_samples'} + + def gen_vind(self): + '''Generate function to compute A*x for spin-flip TDDFT case. + ''' + mf = self._scf + assert isinstance(mf, scf.hf.SCF) + if isinstance(mf.mo_coeff, (tuple, list)): + # The to_gpu() in pyscf is not able to convert SymAdaptedUHF.mo_coeff. + # In this case, mf.mo_coeff has the type (NPArrayWithTag, NPArrayWithTag). 
+ # cp.asarray() for this object leads to an error in + # cupy._core.core._array_from_nested_sequence + mo_coeff = cp.asarray(mf.mo_coeff[0]), cp.asarray(mf.mo_coeff[1]) + else: + mo_coeff = cp.asarray(mf.mo_coeff) + assert mo_coeff[0].dtype == cp.float64 + mo_energy = cp.asarray(mf.mo_energy) + mo_occ = cp.asarray(mf.mo_occ) + nao, nmo = mo_coeff[0].shape + + occidxa = mo_occ[0] > 0 + occidxb = mo_occ[1] > 0 + viridxa = mo_occ[0] ==0 + viridxb = mo_occ[1] ==0 + orboa = mo_coeff[0][:,occidxa] + orbob = mo_coeff[1][:,occidxb] + orbva = mo_coeff[0][:,viridxa] + orbvb = mo_coeff[1][:,viridxb] + e_ia_b2a = mo_energy[0][viridxa] - mo_energy[1][occidxb,None] + e_ia_a2b = mo_energy[1][viridxb] - mo_energy[0][occidxa,None] + nocca, nvirb = e_ia_a2b.shape + noccb, nvira = e_ia_b2a.shape + + extype = self.extype + if extype == 0: + hdiag = cp.hstack([e_ia_b2a.ravel(), -e_ia_a2b.ravel()]).get() + else: + hdiag = cp.hstack([e_ia_a2b.ravel(), -e_ia_b2a.ravel()]).get() + + vresp = gen_uhf_response_sf( + mf, hermi=0, collinear=self.collinear, + collinear_samples=self.collinear_samples) + + def vind(zs): + nz = len(zs) + zs = cp.asarray(zs).reshape(nz, -1) + if extype == 0: + zs_b2a = zs[:,:noccb*nvira].reshape(nz,noccb,nvira) + zs_a2b = zs[:,noccb*nvira:].reshape(nz,nocca,nvirb) + dm_b2a = contract('xov,pv->xpo', zs_b2a, orbva) + dm_b2a = contract('xpo,qo->xpq', dm_b2a, orbob.conj()) + dm_a2b = contract('xov,qv->xoq', zs_a2b, orbvb.conj()) + dm_a2b = contract('xoq,po->xpq', dm_a2b, orboa) + else: + zs_a2b = zs[:,:nocca*nvirb].reshape(nz,nocca,nvirb) + zs_b2a = zs[:,nocca*nvirb:].reshape(nz,noccb,nvira) + dm_b2a = contract('xov,pv->xpo', zs_b2a, orbva) + dm_b2a = contract('xpo,qo->xpq', dm_b2a, orbob.conj()) + dm_a2b = contract('xov,qv->xoq', zs_a2b, orbvb.conj()) + dm_a2b = contract('xoq,po->xpq', dm_a2b, orboa) + + ''' + # The slow way to compute individual terms in + # [A B] [X] + # [B* A*] [Y] + dms = cp.vstack([dm_b2a, dm_a2b]) + v1ao = vresp(dms) + v1ao_b2a, v1ao_a2b = v1ao[:nz], v1ao[nz:] + if extype == 0: + # A*X = (aI||Jb) * z_b2a = -(ab|IJ) * z_b2a + v1A_b2a = contract('xpq,qo->xpo', v1ao_b2a, orbob) + v1A_b2a = contract('xpo,pv->xov', v1A_b2a, orbva.conj()) + # (A*)*Y = (iA||Bj) * z_a2b = -(ij|BA) * z_a2b + v1A_a2b = contract('xpq,po->xoq', v1ao_a2b, orboa.conj()) + v1A_a2b = contract('xoq,qv->xov', v1A_a2b, orbvb) + # B*Y = (aI||Bj) * z_a2b = -(aj|BI) * z_a2b + v1B_b2a = contract('xpq,qo->xpo', v1ao_a2b, orbob) + v1B_b2a = contract('xpo,pv->xov', v1B_b2a, orbva.conj()) + # (B*)*X = (iA||Jb) * z_b2a = -(ib|JA) * z_b2a + v1B_a2b = contract('xpq,po->xoq', v1ao_b2a, orboa.conj()) + v1B_a2b = contract('xoq,qv->xov', v1B_a2b, orbvb) + # add the orbital energy difference in A matrix. 
+ v1_top = v1A_b2a + v1B_b2a + zs_b2a * e_ia_b2a + v1_bot = v1B_a2b + v1A_a2b + zs_a2b * e_ia_a2b + hx = cp.hstack([v1_top.reshape(nz,-1), -v1_bot.reshape(nz,-1)]) + else: + # A*X = (Ai||jB) * z_a2b = -(AB|ij) * z_a2b + v1A_a2b = contract('xpq,qo->xpo', v1ao_a2b, orboa) + v1A_a2b = contract('xpo,pv->xov', v1A_a2b, orbvb.conj()) + # (A*)*Y = (Ia||bJ) * z_b2a = -(IJ|ba) * z_b2a + v1A_b2a = contract('xpq,po->xoq', v1ao_b2a, orbob.conj()) + v1A_b2a = contract('xoq,qv->xov', v1A_b2a, orbva) + # B*Y = (Ai||bJ) * z_b2a = -(AJ|bi) * z_b2a + v1B_a2b = contract('xpq,qo->xpo', v1ao_b2a, orboa) + v1B_a2b = contract('xpo,pv->xov', v1B_a2b, orbvb.conj()) + # (B*)*X = (Ia||jB) * z_a2b = -(IB|ja) * z_a2b + v1B_b2a = contract('xpq,po->xoq', v1ao_a2b, orbob.conj()) + v1B_b2a = contract('xoq,qv->xov', v1B_b2a, orbva) + # add the orbital energy difference in A matrix. + v1_top = v1A_a2b + v1B_a2b + zs_a2b * e_ia_a2b + v1_bot = v1B_b2a + v1A_b2a + zs_b2a * e_ia_b2a + hx = cp.hstack([v1_top.reshape(nz,-1), -v1_bot.reshape(nz,-1)]) + ''' + + # [A B] [X] + # [B* A*] [Y] + # is simplified to + dms = dm_b2a + dm_a2b + v1ao = vresp(dms) + if extype == 0: + # v1_top = A*X+B*Y + # A*X = (aI||Jb) * z_b2a = -(ab|JI) * z_b2a + # B*Y = (aI||Bj) * z_a2b = -(aj|BI) * z_a2b + v1_top = contract('xpq,qo->xpo', v1ao, orbob) + v1_top = contract('xpo,pv->xov', v1_top, orbva.conj()) + # (A*)*Y = (iA||Bj) * z_a2b = -(ij|BA) * z_a2b + # (B*)*X = (iA||Jb) * z_b2a = -(ib|JA) * z_b2a + # v1_bot = (B*)*X + (A*)*Y + v1_bot = contract('xpq,po->xoq', v1ao, orboa.conj()) + v1_bot = contract('xoq,qv->xov', v1_bot, orbvb) + # add the orbital energy difference in A matrix. + v1_top += zs_b2a * e_ia_b2a + v1_bot += zs_a2b * e_ia_a2b + else: + # v1_top = A*X+B*Y + # A*X = (Ai||jB) * z_a2b = -(AB|ji) * z_a2b + # B*Y = (Ai||bJ) * z_b2a = -(AJ|bi) * z_b2a + v1_top = contract('xpq,qo->xpo', v1ao, orboa) + v1_top = contract('xpo,pv->xov', v1_top, orbvb.conj()) + # v1_bot = (B*)*X + (A*)*Y + # (A*)*Y = (Ia||bJ) * z_b2a = -(IJ|ba) * z_b2a + # (B*)*X = (Ia||jB) * z_a2b = -(IB|ja) * z_a2b + v1_bot = contract('xpq,po->xoq', v1ao, orbob.conj()) + v1_bot = contract('xoq,qv->xov', v1_bot, orbva) + # add the orbital energy difference in A matrix. 
+ v1_top += zs_a2b * e_ia_a2b + v1_bot += zs_b2a * e_ia_b2a + hx = cp.hstack([v1_top.reshape(nz,-1), -v1_bot.reshape(nz,-1)]) + return hx.get() + + return vind, hdiag + + _init_guess = SpinFlipTDA._init_guess + + def init_guess(self, mf=None, nstates=None, wfnsym=None): + if mf is None: mf = self._scf + if nstates is None: nstates = self.nstates + x0 = self._init_guess(mf, nstates)[1] + nx = len(x0) + nmo = mf.mo_occ[0].size + nocca, noccb = mf.nelec + nvira = nmo - nocca + nvirb = nmo - noccb + if self.extype == 0: + y0 = np.zeros((nx, nocca*nvirb)) + else: + y0 = np.zeros((nx, noccb*nvira)) + return np.hstack([x0.reshape(nx,-1), y0]) + + dump_flags = SpinFlipTDA.dump_flags + check_sanity = SpinFlipTDA.check_sanity + + def kernel(self, x0=None, nstates=None): + '''Spin-flip TDDFT diagonalization solver + ''' + # TODO: Enable this feature after updating the TDDFT davidson algorithm + # in pyscf main branch + raise RuntimeError('Numerical issues in lr_eig') + log = logger.new_logger(self) + cpu0 = log.init_timer() + self.check_sanity() + self.dump_flags() + if nstates is None: + nstates = self.nstates + else: + self.nstates = nstates + + if self.collinear == 'col' and isinstance(self._scf, KohnShamDFT): + raise NotImplementedError + + x0sym = None + if x0 is None: + x0 = self.init_guess() + + real_system = self._scf.mo_coeff[0].dtype == np.float64 + def pickeig(w, v, nroots, envs): + realidx = np.where((abs(w.imag) < REAL_EIG_THRESHOLD) & + (w.real > self.positive_eig_threshold))[0] + return lib.linalg_helper._eigs_cmplx2real(w, v, realidx, real_system) + + vind, hdiag = self.gen_vind() + precond = self.get_precond(hdiag) + + self.converged, self.e, x1 = lr_eig( + vind, x0, precond, tol_residual=self.conv_tol, lindep=self.lindep, + nroots=nstates, x0sym=x0sym, pick=pickeig, max_cycle=self.max_cycle, + max_memory=self.max_memory, verbose=log) + + nmo = self._scf.mo_occ[0].size + nocca, noccb = self._scf.nelec + nvira = nmo - nocca + nvirb = nmo - noccb + + if self.extype == 0: + def norm_xy(z): + x = z[:noccb*nvira].reshape(noccb,nvira) + y = z[noccb*nvira:].reshape(nocca,nvirb) + norm = lib.norm(x)**2 - lib.norm(y)**2 + #assert norm > 0 + norm = abs(norm) ** -.5 + return x*norm, y*norm + elif self.extype == 1: + def norm_xy(z): + x = z[:nocca*nvirb].reshape(nocca,nvirb) + y = z[nocca*nvirb:].reshape(noccb,nvira) + norm = lib.norm(x)**2 - lib.norm(y)**2 + #assert norm > 0 + norm = abs(norm) ** -.5 + return x*norm, y*norm + + self.xy = [norm_xy(z) for z in x1] + log.timer('SpinFlipTDDFT', *cpu0) + self._finalize() + return self.e, self.xy + +scf.uhf.UHF.TDA = lib.class_as_method(TDA) +scf.uhf.UHF.TDHF = lib.class_as_method(TDHF) +scf.uhf.UHF.SFTDA = lib.class_as_method(SpinFlipTDA) +scf.uhf.UHF.SFTDHF = lib.class_as_method(SpinFlipTDHF) diff --git a/gpu4pyscf/tdscf/uks.py b/gpu4pyscf/tdscf/uks.py new file mode 100644 index 00000000..23646332 --- /dev/null +++ b/gpu4pyscf/tdscf/uks.py @@ -0,0 +1,196 @@ +#!/usr/bin/env python +# +# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details.
+# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import numpy as np +import cupy as cp +from pyscf import symm +from pyscf import lib +from pyscf.tdscf._lr_eig import eigh as lr_eigh +from gpu4pyscf.dft.rks import KohnShamDFT +from gpu4pyscf.lib.cupy_helper import contract, tag_array, transpose_sum +from gpu4pyscf.lib import logger +from gpu4pyscf.tdscf import uhf as tdhf_gpu +from gpu4pyscf import dft + +__all__ = [ + 'TDA', 'TDDFT', 'TDUKS', 'CasidaTDDFT', 'TDDFTNoHybrid', +] + +TDA = tdhf_gpu.TDA +TDDFT = tdhf_gpu.TDHF +TDUKS = TDDFT +SpinFlipTDA = tdhf_gpu.SpinFlipTDA +SpinFlipTDDFT = tdhf_gpu.SpinFlipTDHF + +class CasidaTDDFT(TDDFT): + '''Solve the Casida TDDFT formula (A-B)(A+B)(X+Y) = (X+Y)w^2 + ''' + + init_guess = TDA.init_guess + + def gen_vind(self, mf=None): + if mf is None: + mf = self._scf + if isinstance(mf.mo_coeff, (tuple, list)): + # The to_gpu() in pyscf is not able to convert SymAdaptedUHF.mo_coeff. + # In this case, mf.mo_coeff has the type (NPArrayWithTag, NPArrayWithTag). + # cp.asarray() for this object leads to an error in + # cupy._core.core._array_from_nested_sequence + mo_coeff = cp.asarray(mf.mo_coeff[0]), cp.asarray(mf.mo_coeff[1]) + else: + mo_coeff = cp.asarray(mf.mo_coeff) + assert mo_coeff[0].dtype == cp.float64 + mo_energy = cp.asarray(mf.mo_energy) + mo_occ = cp.asarray(mf.mo_occ) + occidxa = mo_occ[0] > 0 + occidxb = mo_occ[1] > 0 + viridxa = mo_occ[0] == 0 + viridxb = mo_occ[1] == 0 + orboa = mo_coeff[0][:,occidxa] + orbob = mo_coeff[1][:,occidxb] + orbva = mo_coeff[0][:,viridxa] + orbvb = mo_coeff[1][:,viridxb] + + e_ia_a = mo_energy[0][viridxa] - mo_energy[0][occidxa,None] + e_ia_b = mo_energy[1][viridxb] - mo_energy[1][occidxb,None] + e_ia = cp.hstack((e_ia_a.ravel(), e_ia_b.ravel())) + d_ia = e_ia**.5 + ed_ia = e_ia * d_ia + hdiag = e_ia ** 2 + hdiag = hdiag.get() + vresp = mf.gen_response(mo_coeff, mo_occ, hermi=1) + nocca, nvira = e_ia_a.shape + noccb, nvirb = e_ia_b.shape + + def vind(zs): + assert zs.dtype == np.float64 + nz = len(zs) + zs = cp.asarray(zs).reshape(nz,-1) + dmsa = (zs[:,:nocca*nvira] * d_ia[:nocca*nvira]).reshape(nz,nocca,nvira) + dmsb = (zs[:,nocca*nvira:] * d_ia[nocca*nvira:]).reshape(nz,noccb,nvirb) + mo1a = contract('xov,pv->xpo', dmsa, orbva) + dmsa = contract('xpo,qo->xpq', mo1a, orboa) + mo1b = contract('xov,pv->xpo', dmsb, orbvb) + dmsb = contract('xpo,qo->xpq', mo1b, orbob) + dmsa = transpose_sum(dmsa) + dmsb = transpose_sum(dmsb) + dms = cp.asarray((dmsa, dmsb)) + dms = tag_array(dms, mo1=[mo1a,mo1b], occ_coeff=[orboa,orbob]) + v1ao = vresp(dms) + v1a = contract('xpq,qo->xpo', v1ao[0], orboa) + v1a = contract('xpo,pv->xov', v1a, orbva) + v1b = contract('xpq,qo->xpo', v1ao[1], orbob) + v1b = contract('xpo,pv->xov', v1b, orbvb) + hx = cp.hstack((v1a.reshape(nz,-1), v1b.reshape(nz,-1))) + hx += ed_ia * zs + hx *= d_ia + return hx.get() + + return vind, hdiag + + def kernel(self, x0=None, nstates=None): + '''TDDFT diagonalization solver + ''' + log = logger.new_logger(self) + cpu0 = log.init_timer() + mf = self._scf + if mf._numint.libxc.is_hybrid_xc(mf.xc): + raise RuntimeError('%s cannot be used with hybrid functional' + % self.__class__) + self.check_sanity() + self.dump_flags() + if nstates is None: + nstates = self.nstates + else: + self.nstates = nstates + + vind, hdiag = self.gen_vind(self._scf) + precond = self.get_precond(hdiag) + + def pickeig(w, v, nroots, envs): + idx = np.where(w > self.positive_eig_threshold)[0] + return w[idx], v[:,idx], idx 
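+ # lr_eigh below returns eigenvalues w2 = w**2 of the symmetrized Casida problem (A-B)**0.5 (A+B) (A-B)**0.5 Z = w**2 Z; X and Y are recovered from the eigenvectors via the e_ia**0.5 scaling in the loop that follows.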
+ + x0sym = None + if x0 is None: + x0 = self.init_guess() + + self.converged, w2, x1 = lr_eigh( + vind, x0, precond, tol_residual=self.conv_tol, lindep=self.lindep, + nroots=nstates, x0sym=x0sym, pick=pickeig, max_cycle=self.max_cycle, + max_memory=self.max_memory, verbose=log) + + mo_energy = self._scf.mo_energy + mo_occ = self._scf.mo_occ + occidxa = mo_occ[0] > 0 + occidxb = mo_occ[1] > 0 + viridxa = mo_occ[0] == 0 + viridxb = mo_occ[1] == 0 + e_ia_a = mo_energy[0][viridxa] - mo_energy[0][occidxa,None] + e_ia_b = mo_energy[1][viridxb] - mo_energy[1][occidxb,None] + nocca, nvira = e_ia_a.shape + noccb, nvirb = e_ia_b.shape + if isinstance(mo_energy, cp.ndarray): + e_ia = cp.hstack((e_ia_a.reshape(-1), e_ia_b.reshape(-1))) + e_ia = e_ia**.5 + e_ia = e_ia.get() + else: + e_ia = np.hstack((e_ia_a.reshape(-1), e_ia_b.reshape(-1))) + e_ia = e_ia**.5 + + e = [] + xy = [] + for i, z in enumerate(x1): + if w2[i] < self.positive_eig_threshold: + continue + w = w2[i] ** .5 + zp = e_ia * z + zm = w/e_ia * z + x = (zp + zm) * .5 + y = (zp - zm) * .5 + norm = lib.norm(x)**2 - lib.norm(y)**2 + if norm > 0: + norm = norm**-.5 + e.append(w) + xy.append(((x[:nocca*nvira].reshape(nocca,nvira) * norm, # X_alpha + x[nocca*nvira:].reshape(noccb,nvirb) * norm), # X_beta + (y[:nocca*nvira].reshape(nocca,nvira) * norm, # Y_alpha + y[nocca*nvira:].reshape(noccb,nvirb) * norm)))# Y_beta + self.e = np.array(e) + self.xy = xy + + log.timer('TDDFT', *cpu0) + self._finalize() + return self.e, self.xy + +TDDFTNoHybrid = CasidaTDDFT + +def tddft(mf): + '''Driver to create TDDFT or CasidaTDDFT object''' + if mf._numint.libxc.is_hybrid_xc(mf.xc): + return TDDFT(mf) + else: + return CasidaTDDFT(mf) + +dft.uks.UKS.TDA = lib.class_as_method(TDA) +dft.uks.UKS.TDHF = None +#dft.uks.UKS.TDDFT = lib.class_as_method(TDDFT) +dft.uks.UKS.TDDFTNoHybrid = lib.class_as_method(TDDFTNoHybrid) +dft.uks.UKS.CasidaTDDFT = lib.class_as_method(CasidaTDDFT) +dft.uks.UKS.TDDFT = tddft +dft.uks.UKS.SFTDA = lib.class_as_method(SpinFlipTDA) +dft.uks.UKS.SFTDDFT = lib.class_as_method(SpinFlipTDDFT) diff --git a/gpu4pyscf/tests/test_dft.py b/gpu4pyscf/tests/test_dft.py index dc3156cf..4546da4e 100644 --- a/gpu4pyscf/tests/test_dft.py +++ b/gpu4pyscf/tests/test_dft.py @@ -13,12 +13,16 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . 
+import unittest import numpy as np import pyscf import pytest import cupy +from gpu4pyscf.dft import rks, uks -atom = ''' +def setUpModule(): + global mol + atom = ''' C -0.07551087 1.68127663 -0.10745193 O 1.33621755 1.87147409 -0.39326987 C 1.67074668 2.95729545 0.49387976 @@ -41,112 +45,116 @@ H -3.93210821 0.28874990 -1.89865997 ''' -mol = pyscf.M(atom=atom, basis='def2-tzvpp', max_memory=32000, cart=0) -mol.output = '/dev/null' -mol.build() -mol.verbose = 1 - -@pytest.mark.smoke -def test_b3lyp_with_d3bj(): - print('-------- DFRKS with D3(BJ) -------') - from gpu4pyscf.dft import rks - mf = rks.RKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit') - mf.grids.atom_grid = (99,590) - mf.conv_tol = 1e-10 - mf.conv_tol_cpscf = 1e-8 - mf.disp = 'd3bj' - e_dft = mf.kernel() - assert np.abs(e_dft - -685.0326965348272) < 1e-7 - - g = mf.nuc_grad_method().kernel() - assert np.abs(cupy.linalg.norm(g) - 0.17498362161082373) < 1e-5 - - h = mf.Hessian().kernel() - assert np.abs(cupy.linalg.norm(h) - 3.7684319231335377) < 1e-4 - -@pytest.mark.smoke -def test_b3lyp_d3bj(): - print('-------- DFRKS with D3(BJ) -------') - from gpu4pyscf.dft import rks - mf = rks.RKS(mol, xc='b3lyp-d3bj').density_fit(auxbasis='def2-tzvpp-jkfit') - mf.grids.atom_grid = (99,590) - mf.conv_tol = 1e-10 - mf.conv_tol_cpscf = 1e-8 - e_dft = mf.kernel() - assert np.abs(e_dft - -685.0326965348272) < 1e-7 - - g = mf.nuc_grad_method().kernel() - assert np.abs(cupy.linalg.norm(g) - 0.17498362161082373) < 1e-5 - - h = mf.Hessian().kernel() - assert np.abs(cupy.linalg.norm(h) - 3.7684319231335377) < 1e-4 - -@pytest.mark.smoke -def test_DFUKS(): - print('------- DFUKS with D3(BJ) -------') - from gpu4pyscf.dft import uks - mf = uks.UKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit') - mf.grids.atom_grid = (99,590) - mf.conv_tol = 1e-10 - mf.conv_tol_cpscf = 1e-8 - mf.disp = 'd3bj' - e_dft = mf.kernel() - assert np.abs(e_dft - -685.0326965349493) < 1e-7 - - g = mf.nuc_grad_method().kernel() - assert np.abs(cupy.linalg.norm(g) - 0.17498264516108836) < 1e-5 - - h = mf.Hessian().kernel() - assert np.abs(cupy.linalg.norm(h) - 3.768429871470736) < 1e-4 - -@pytest.mark.smoke -def test_RKS(): - print('-------- RKS with D3(BJ) -------') - from gpu4pyscf.dft import rks - mf = rks.RKS(mol, xc='b3lyp') - mf.grids.atom_grid = (99,590) - mf.conv_tol = 1e-12 - mf.disp = 'd3bj' - e_dft = mf.kernel() - assert np.abs(e_dft - -685.0325611822375) < 1e-7 - - g = mf.nuc_grad_method().kernel() - assert np.abs(cupy.linalg.norm(g) - 0.1750368231223345) < 1e-6 - -@pytest.mark.smoke -def test_DFRKS_with_SMD(): - print('----- DFRKS with SMD -----') - from gpu4pyscf.dft import rks - mf = rks.RKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit') - mf = mf.SMD() - mf.grids.atom_grid = (99,590) - mf.conv_tol = 1e-10 - mf.conv_tol_cpscf = 1e-8 - mf.disp = 'd3bj' - e_dft = mf.kernel() - assert np.abs(e_dft - -685.0578838805443) < 1e-7 - - g = mf.nuc_grad_method().kernel() - assert np.abs(cupy.linalg.norm(g) - 0.16804945458657145) < 1e-5 - - h = mf.Hessian().kernel() - assert np.abs(cupy.linalg.norm(h) - 3.741783814494321) < 1e-4 - -@pytest.mark.smoke -def test_DFUKS_with_SMD(): - print('------- DFUKS with SMD ---------') - from gpu4pyscf.dft import uks - mf = uks.UKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit') - mf = mf.SMD() - mf.grids.atom_grid = (99,590) - mf.conv_tol = 1e-10 - mf.conv_tol_cpscf = 1e-8 - mf.disp = 'd3bj' - e_dft = mf.kernel() - assert np.abs(e_dft - -685.05788388063) < 1e-7 - - g = 
mf.nuc_grad_method().kernel() - assert np.abs(cupy.linalg.norm(g) - 0.1680496465773684) < 1e-5 - - h = mf.Hessian().kernel() - assert np.abs(cupy.linalg.norm(h) - 3.7417788481647563) < 1e-4 + mol = pyscf.M(atom=atom, basis='def2-tzvpp', max_memory=32000, cart=0) + mol.output = '/dev/null' + mol.build() + mol.verbose = 1 + +def tearDownModule(): + global mol + mol.stdout.close() + del mol + +class KnownValues(unittest.TestCase): + @pytest.mark.smoke + def test_b3lyp_with_d3bj(self): + print('-------- DFRKS with D3(BJ) -------') + mf = rks.RKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit') + mf.grids.atom_grid = (99,590) + mf.conv_tol = 1e-10 + mf.conv_tol_cpscf = 1e-8 + mf.disp = 'd3bj' + e_dft = mf.kernel() + assert np.abs(e_dft - -685.0326965348272) < 1e-7 + + g = mf.nuc_grad_method().kernel() + assert np.abs(cupy.linalg.norm(g) - 0.17498362161082373) < 1e-5 + + h = mf.Hessian().kernel() + assert np.abs(cupy.linalg.norm(h) - 3.7684319231335377) < 1e-4 + + @pytest.mark.smoke + def test_b3lyp_d3bj(self): + print('-------- DFRKS with D3(BJ) -------') + mf = rks.RKS(mol, xc='b3lyp-d3bj').density_fit(auxbasis='def2-tzvpp-jkfit') + mf.grids.atom_grid = (99,590) + mf.conv_tol = 1e-10 + mf.conv_tol_cpscf = 1e-8 + e_dft = mf.kernel() + assert np.abs(e_dft - -685.0326965348272) < 1e-7 + + g = mf.nuc_grad_method().kernel() + assert np.abs(cupy.linalg.norm(g) - 0.17498362161082373) < 1e-5 + + h = mf.Hessian().kernel() + assert np.abs(cupy.linalg.norm(h) - 3.7684319231335377) < 1e-4 + + @pytest.mark.smoke + def test_DFUKS(self): + print('------- DFUKS with D3(BJ) -------') + mf = uks.UKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit') + mf.grids.atom_grid = (99,590) + mf.conv_tol = 1e-10 + mf.conv_tol_cpscf = 1e-8 + mf.disp = 'd3bj' + e_dft = mf.kernel() + assert np.abs(e_dft - -685.0326965349493) < 1e-7 + + g = mf.nuc_grad_method().kernel() + assert np.abs(cupy.linalg.norm(g) - 0.17498264516108836) < 1e-5 + + h = mf.Hessian().kernel() + assert np.abs(cupy.linalg.norm(h) - 3.768429871470736) < 1e-4 + + @pytest.mark.smoke + def test_RKS(self): + print('-------- RKS with D3(BJ) -------') + mf = rks.RKS(mol, xc='b3lyp') + mf.grids.atom_grid = (99,590) + mf.conv_tol = 1e-12 + mf.disp = 'd3bj' + e_dft = mf.kernel() + assert np.abs(e_dft - -685.0325611822375) < 1e-7 + + g = mf.nuc_grad_method().kernel() + assert np.abs(cupy.linalg.norm(g) - 0.1750368231223345) < 1e-6 + + @pytest.mark.smoke + def test_DFRKS_with_SMD(self): + print('----- DFRKS with SMD -----') + mf = rks.RKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit') + mf = mf.SMD() + mf.grids.atom_grid = (99,590) + mf.conv_tol = 1e-10 + mf.conv_tol_cpscf = 1e-8 + mf.disp = 'd3bj' + e_dft = mf.kernel() + assert np.abs(e_dft - -685.0578838805443) < 1e-7 + + g = mf.nuc_grad_method().kernel() + assert np.abs(cupy.linalg.norm(g) - 0.16905807654571403) < 1e-5 + + h = mf.Hessian().kernel() + assert np.abs(cupy.linalg.norm(h) - 3.743840896534178) < 1e-4 + + @pytest.mark.smoke + def test_DFUKS_with_SMD(self): + print('------- DFUKS with SMD ---------') + mf = uks.UKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit') + mf = mf.SMD() + mf.grids.atom_grid = (99,590) + mf.conv_tol = 1e-10 + mf.conv_tol_cpscf = 1e-8 + mf.disp = 'd3bj' + e_dft = mf.kernel() + assert np.abs(e_dft - -685.05788388063) < 1e-7 + + g = mf.nuc_grad_method().kernel() + assert np.abs(cupy.linalg.norm(g) - 0.1690582751813457) < 1e-5 + + h = mf.Hessian().kernel() + assert np.abs(cupy.linalg.norm(h) - 3.743858482519822) < 1e-4 + +if __name__ == "__main__": 
+ print("Full Smoke Tests") + unittest.main()
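
A closing note on the spin-flip entry points added in gpu4pyscf/tdscf/uhf.py and registered in gpu4pyscf/tdscf/uks.py: SpinFlipTDHF.kernel is deliberately disabled for now (it raises RuntimeError pending the lr_eig update noted in its TODO), so SpinFlipTDA (SFTDA) is the usable solver in this patch. A minimal, untested sketch of the intended API; the molecule, functional, and attribute settings are illustrative:

    import pyscf
    mol = pyscf.M(atom='O 0 0 0; O 0 0 1.21', basis='631g', spin=2)
    mf = mol.UKS(xc='b3lyp').to_gpu().run()
    td = mf.SFTDA()           # SpinFlipTDA, registered on dft.uks.UKS above
    td.extype = 0             # 0: spin-flip up, 1: spin-flip down (default)
    td.collinear = 'mcol'     # 'col' with a pure functional short-circuits to
                              # bare orbital-energy differences; 'mcol' samples
                              # the multi-collinear kernel (collinear_samples=200)
    es = td.kernel(nstates=4)[0]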