diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml
index 5a0d1bd1..7f33e0dd 100644
--- a/.github/workflows/unittest.yml
+++ b/.github/workflows/unittest.yml
@@ -24,7 +24,7 @@ jobs:
pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
python3 -m pip install --upgrade pip
pip3 install flake8 pytest coverage pytest-cov pyscf-dispersion
- pip3 install "pyscf>2.5"
+ pip3 install pyscf --upgrade
pip3 install numpy --upgrade
pip3 install h5py --upgrade
pip3 install gpu4pyscf-libxc-cuda12x --upgrade
diff --git a/README.md b/README.md
index dbe59a0e..6eeb2e82 100644
--- a/README.md
+++ b/README.md
@@ -43,6 +43,7 @@ Features
- Density fitting scheme and direct SCF scheme;
- SCF, analytical Gradient, and analytical Hessian calculations for Hartree-Fock and DFT;
- LDA, GGA, mGGA, hybrid, and range-separated functionals via [libXC](https://gitlab.com/libxc/libxc/-/tree/master/);
+- Spin-conserved and spin-flip TDA and TDDFT for excited states;
- Geometry optimization and transition state search via [geomeTRIC](https://geometric.readthedocs.io/en/latest/);
- Dispersion corrections via [DFTD3](https://github.com/dftd3/simple-dftd3) and [DFTD4](https://github.com/dftd4/dftd4);
- Nonlocal functional correction (vv10) for SCF and gradient;
diff --git a/examples/00-h2o.py b/examples/00-h2o.py
index 2bf6c993..5ed2b6d1 100644
--- a/examples/00-h2o.py
+++ b/examples/00-h2o.py
@@ -60,6 +60,7 @@
# Compute Hessian
h = mf_GPU.Hessian()
h.auxbasis_response = 2 # 0: no aux contribution, 1: some contributions, 2: all
+mf_GPU.cphf_grids.atom_grid = (50,194) # customize grids for solving the CPSCF equations; SG1 by default
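+# cphf_grids is a separate, usually coarser grid used only for the DFT response
+# term in the CPSCF step; a (50,194) grid is typically sufficient there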
h_dft = h.kernel()
# harmonic analysis
diff --git a/examples/24-cp_bsse.py b/examples/24-cp_bsse.py
new file mode 100644
index 00000000..4ac8dc10
--- /dev/null
+++ b/examples/24-cp_bsse.py
@@ -0,0 +1,67 @@
+# Copyright 2024 The GPU4PySCF Authors. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+####################################################
+# Example of interaction energy with counterpoise correction
+####################################################
+
+import pyscf
+from gpu4pyscf.dft import rks
+
+atom_A = [
+('O', (0.000000, 0.000000, 0.000000)),
+('H', (0.000000, 0.757160, 0.586260)),
+('H', (0.000000, -0.757160, 0.586260))
+]
+
+atom_B = [
+('O', (0.000000, 0.000000, 2.913530)),
+('H', (0.000000, 0.757160, 3.499790)),
+('H', (0.000000, -0.757160, 3.499790))
+]
+
+atom_AB = atom_A + atom_B
+
+mol_A = pyscf.M(atom=atom_A, basis='cc-pVDZ').build()
+mol_B = pyscf.M(atom=atom_B, basis='cc-pVDZ').build()
+mol_AB = pyscf.M(atom=atom_AB, basis='cc-pVDZ').build()
+
+# Monomer A in the dimer basis
+mol_A_ghost = mol_A.copy()
+ghost_atoms_B = mol_B.atom
+mol_A_ghost.atom.extend([('X-' + atom[0], atom[1]) for atom in ghost_atoms_B])
+mol_A_ghost.build()
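+# The 'X-' prefix marks ghost atoms in PySCF: B's basis functions are placed at
+# B's positions, but without nuclei or electrons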
+
+# Monomer B in the dimer basis
+mol_B_ghost = mol_B.copy()
+ghost_atoms_A = mol_A.atom
+mol_B_ghost.atom.extend([('X-' + atom[0], atom[1]) for atom in ghost_atoms_A])
+mol_B_ghost.build()
+
+def solve_dft(mol, xc='b3lyp'):
+ mf = rks.RKS(mol, xc=xc).density_fit()
+ mf.grids.atom_grid = (99,590)
+ return mf.kernel()
+
+E_AB = solve_dft(mol_AB)
+E_A = solve_dft(mol_A)
+E_B = solve_dft(mol_B)
+interaction_energy_no_bsse = E_AB - (E_A + E_B)
+print(f"Interaction Energy without BSSE Correction: {interaction_energy_no_bsse:.6f} Hartree")
+
+E_A_ghost = solve_dft(mol_A_ghost)
+E_B_ghost = solve_dft(mol_B_ghost)
+interaction_energy_bsse = E_AB - (E_A_ghost + E_B_ghost)
+print(f"Interaction Energy with BSSE Correction: {interaction_energy_bsse:.6f} Hartree")
diff --git a/gpu4pyscf/__config__.py b/gpu4pyscf/__config__.py
index 5ecab3d4..73e90830 100644
--- a/gpu4pyscf/__config__.py
+++ b/gpu4pyscf/__config__.py
@@ -2,37 +2,16 @@
props = cupy.cuda.runtime.getDeviceProperties(0)
GB = 1024*1024*1024
-# such as A100-80G
-if props['totalGlobalMem'] >= 64 * GB:
- min_ao_blksize = 128
- min_grid_blksize = 128*128
- ao_aligned = 32
- grid_aligned = 256
- mem_fraction = 0.9
- number_of_threads = 2048 * 108
-# such as V100-32G
-elif props['totalGlobalMem'] >= 32 * GB:
- min_ao_blksize = 128
- min_grid_blksize = 128*128
- ao_aligned = 32
- grid_aligned = 256
- mem_fraction = 0.9
- number_of_threads = 1024 * 80
-# such as A30-24GB
-elif props['totalGlobalMem'] >= 16 * GB:
- min_ao_blksize = 128
- min_grid_blksize = 128*128
- ao_aligned = 32
- grid_aligned = 256
- mem_fraction = 0.9
- number_of_threads = 1024 * 80
-# other gaming cards
-else:
+min_ao_blksize = 128
+min_grid_blksize = 128*128
+ao_aligned = 32
+grid_aligned = 256
+
+# Use smaller blksize for old gaming GPUs
+if props['totalGlobalMem'] < 16 * GB:
min_ao_blksize = 64
min_grid_blksize = 64*64
- ao_aligned = 32
- grid_aligned = 128
- mem_fraction = 0.9
- number_of_threads = 1024 * 80
+# Use 90% of the global memory for CuPy memory pool
+mem_fraction = 0.9
cupy.get_default_memory_pool().set_limit(fraction=mem_fraction)
diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py
index e8f66422..2ef8680a 100644
--- a/gpu4pyscf/df/df.py
+++ b/gpu4pyscf/df/df.py
@@ -91,7 +91,7 @@ def build(self, direct_scf_tol=1e-14, omega=None):
log.timer_debug1('prepare intopt', *t0)
self.j2c = j2c.copy()
- j2c = take_last2d(j2c, intopt.aux_ao_idx)
+ j2c = intopt.sort_orbitals(j2c, aux_axis=[0,1])
try:
self.cd_low = cholesky(j2c)
self.cd_low = tag_array(self.cd_low, tag='cd')
@@ -108,6 +108,7 @@ def build(self, direct_scf_tol=1e-14, omega=None):
self._cderi = cholesky_eri_gpu(intopt, mol, auxmol, self.cd_low, omega=omega)
log.timer_debug1('cholesky_eri', *t0)
self.intopt = intopt
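+ # return self so that build() calls can be chained, following pyscf convention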
+ return self
def get_jk(self, dm, hermi=1, with_j=True, with_k=True,
direct_scf_tol=getattr(__config__, 'scf_hf_SCF_direct_scf_tol', 1e-13),
diff --git a/gpu4pyscf/df/df_jk.py b/gpu4pyscf/df/df_jk.py
index 5a271903..ed181f62 100644
--- a/gpu4pyscf/df/df_jk.py
+++ b/gpu4pyscf/df/df_jk.py
@@ -1,17 +1,18 @@
-#!/usr/bin/env python
-# Copyright 2014-2019 The PySCF Developers. All Rights Reserved.
#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved.
#
-# http://www.apache.org/licenses/LICENSE-2.0
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
# Author: Qiming Sun
# Modified by Xiaojie Wu
@@ -242,7 +243,7 @@ def to_cpu(self):
obj = self.undo_df().to_cpu().density_fit()
return utils.to_cpu(self, obj)
-def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e-14, omega=None):
+def get_jk(dfobj, dms_tag, hermi=0, with_j=True, with_k=True, direct_scf_tol=1e-14, omega=None):
'''
get jk with density fitting
outputs and input are on the same device
@@ -268,31 +269,37 @@ def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e-
assert nao == dfobj.nao
vj = vk = None
- ao_idx = dfobj.intopt.ao_idx
- dms = take_last2d(dms, ao_idx)
+ intopt = dfobj.intopt
+ dms = intopt.sort_orbitals(dms, axis=[1,2])
dms_shape = dms.shape
- rows = dfobj.intopt.cderi_row
- cols = dfobj.intopt.cderi_col
-
+ rows = intopt.cderi_row
+ cols = intopt.cderi_col
+
if with_j:
dm_sparse = dms[:,rows,cols]
- dm_sparse[:, dfobj.intopt.cderi_diag] *= .5
+ if hermi == 0:
+ dm_sparse += dms[:,cols,rows]
+ else:
+ dm_sparse *= 2
+ dm_sparse[:, intopt.cderi_diag] *= .5
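+ # cderi stores only the lower triangle (rows/cols), so the packed density
+ # must carry dm[i,j]+dm[j,i] off the diagonal and dm[i,i] on it; rhoj is
+ # then sum_ij dm_ij (ij|L) with every pair counted once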
if with_k:
vk = cupy.zeros_like(dms)
-
+
# SCF K matrix with occ
if getattr(dms_tag, 'mo_coeff', None) is not None:
+ assert hermi == 1
mo_occ = dms_tag.mo_occ
mo_coeff = dms_tag.mo_coeff
nmo = mo_occ.shape[-1]
mo_coeff = mo_coeff.reshape(-1,nao,nmo)
mo_occ = mo_occ.reshape(-1,nmo)
+ mo_coeff = intopt.sort_orbitals(mo_coeff, axis=[1])
nocc = 0
occ_coeff = [0]*nset
for i in range(nset):
occ_idx = mo_occ[i] > 0
- occ_coeff[i] = mo_coeff[i][:,occ_idx][ao_idx] * mo_occ[i][occ_idx]**0.5
+ occ_coeff[i] = mo_coeff[i][:,occ_idx] * mo_occ[i][occ_idx]**0.5
nocc += mo_occ[i].sum()
blksize = dfobj.get_blksize(extra=nao*nocc)
if with_j:
@@ -300,7 +307,7 @@ def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e-
for cderi, cderi_sparse in dfobj.loop(blksize=blksize, unpack=with_k):
# leading dimension is 1
if with_j:
- rhoj = 2.0*dm_sparse.dot(cderi_sparse)
+ rhoj = dm_sparse.dot(cderi_sparse)
vj_packed += cupy.dot(rhoj, cderi_sparse.T)
cderi_sparse = rhoj = None
for i in range(nset):
@@ -316,18 +323,18 @@ def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e-
vj[:,rows,cols] = vj_packed
vj[:,cols,rows] = vj_packed
- # CP-HF K matrix
elif hasattr(dms_tag, 'mo1'):
+ # K matrix in CP-HF or TDDFT
occ_coeffs = dms_tag.occ_coeff
mo1s = dms_tag.mo1
- mo_occ = dms_tag.mo_occ
- if not isinstance(occ_coeffs, list):
- occ_coeffs = [occ_coeffs * 2.0] # For restricted
- if not isinstance(mo1s, list):
+ if not isinstance(occ_coeffs, (tuple, list)):
+ # *2 for double occupancy in RHF/RKS
+ occ_coeffs = [occ_coeffs * 2.0]
+ if not isinstance(mo1s, (tuple, list)):
mo1s = [mo1s]
- occ_coeffs = [occ_coeff[ao_idx] for occ_coeff in occ_coeffs]
- mo1s = [mo1[:,ao_idx] for mo1 in mo1s]
+ occ_coeffs = [intopt.sort_orbitals(occ_coeff, axis=[0]) for occ_coeff in occ_coeffs]
+ mo1s = [intopt.sort_orbitals(mo1, axis=[1]) for mo1 in mo1s]
if with_j:
vj_sparse = cupy.zeros_like(dm_sparse)
@@ -336,7 +343,7 @@ def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e-
blksize = dfobj.get_blksize(extra=2*nao*nocc)
for cderi, cderi_sparse in dfobj.loop(blksize=blksize, unpack=with_k):
if with_j:
- rhoj = 2.0*dm_sparse.dot(cderi_sparse)
+ rhoj = dm_sparse.dot(cderi_sparse)
vj_sparse += cupy.dot(rhoj, cderi_sparse.T)
rhoj = None
cderi_sparse = None
@@ -346,8 +353,8 @@ def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e-
rhok = contract('Lij,jk->Lki', cderi, occ_coeff).reshape([-1,nao])
for i in range(mo1.shape[0]):
rhok1 = contract('Lij,jk->Lki', cderi, mo1[i]).reshape([-1,nao])
- #contract('Lki,Lkj->ij', rhok, rhok1, alpha=1.0, beta=1.0, out=vk[iset])
- vk[iset] += cupy.dot(rhok.T, rhok1)
+ #contract('Lki,Lkj->ij', rhok1, rhok, alpha=1.0, beta=1.0, out=vk[iset])
+ vk[iset] += cupy.dot(rhok1.T, rhok)
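+ # order matters for non-hermitian response densities: this builds
+ # K[mo1 @ occ^T]; transpose_sum adds the transpose only when hermi == 1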
iset += 1
mo1 = rhok1 = rhok = None
cderi = None
@@ -356,7 +363,7 @@ def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e-
vj = cupy.zeros(dms_shape)
vj[:,rows,cols] = vj_sparse
vj[:,cols,rows] = vj_sparse
- if with_k:
+ if with_k and hermi:
transpose_sum(vk)
vj_sparse = None
# general K matrix with density matrix
@@ -366,25 +373,24 @@ def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e-
blksize = dfobj.get_blksize()
for cderi, cderi_sparse in dfobj.loop(blksize=blksize, unpack=with_k):
if with_j:
- rhoj = 2.0*dm_sparse.dot(cderi_sparse)
+ rhoj = dm_sparse.dot(cderi_sparse)
vj_sparse += cupy.dot(rhoj, cderi_sparse.T)
if with_k:
for k in range(nset):
rhok = contract('Lij,jk->Lki', cderi, dms[k]).reshape([-1,nao])
- #vk[k] += contract('Lki,Lkj->ij', cderi, rhok)
- vk[k] += cupy.dot(cderi.reshape([-1,nao]).T, rhok)
+ #vk[k] += contract('Lki,Lkj->ij', rhok, cderi)
+ vk[k] += cupy.dot(rhok.T, cderi.reshape([-1,nao]))
if with_j:
vj = cupy.zeros(dms_shape)
vj[:,rows,cols] = vj_sparse
vj[:,cols,rows] = vj_sparse
rhok = None
- rev_ao_idx = dfobj.intopt.rev_ao_idx
if with_j:
- vj = take_last2d(vj, rev_ao_idx)
+ vj = intopt.unsort_orbitals(vj, axis=[1,2])
vj = vj.reshape(out_shape)
if with_k:
- vk = take_last2d(vk, rev_ao_idx)
+ vk = intopt.unsort_orbitals(vk, axis=[1,2])
vk = vk.reshape(out_shape)
t1 = log.timer_debug1('vj and vk', *t1)
if out_cupy:
diff --git a/gpu4pyscf/df/grad/rhf.py b/gpu4pyscf/df/grad/rhf.py
index 05a09639..15645846 100644
--- a/gpu4pyscf/df/grad/rhf.py
+++ b/gpu4pyscf/df/grad/rhf.py
@@ -17,7 +17,7 @@
import numpy
import cupy
from cupyx.scipy.linalg import solve_triangular
-from pyscf import scf
+from pyscf import scf, gto
from gpu4pyscf.df import int3c2e, df
from gpu4pyscf.lib.cupy_helper import (print_mem_info, tag_array,
unpack_tril, contract, load_library, take_last2d, cholesky)
@@ -88,11 +88,11 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
raise NotImplementedError()
mo_coeff = cupy.asarray(mf_grad.base.mo_coeff)
mo_occ = cupy.asarray(mf_grad.base.mo_occ)
- ao_idx = intopt.ao_idx
- dm = take_last2d(dm0, ao_idx)
+ dm = intopt.sort_orbitals(dm0, axis=[0,1])
orbo = mo_coeff[:,mo_occ>0] * mo_occ[mo_occ>0] ** 0.5
- orbo = orbo[ao_idx, :]
+ mo_coeff = None
+ orbo = intopt.sort_orbitals(orbo, axis=[0])
nocc = orbo.shape[-1]
# (L|ij) -> rhoj: (L), rhok: (L|oo)
@@ -126,8 +126,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
else:
int2c_e1 = auxmol.intor('int2c2e_ip1')
int2c_e1 = cupy.asarray(int2c_e1)
- aux_ao_idx = intopt.aux_ao_idx
- rev_aux_idx = numpy.argsort(aux_ao_idx)
+
auxslices = auxmol.aoslice_by_atom()
aux_cart2sph = intopt.aux_cart2sph
low_t = low.T.copy()
@@ -141,7 +140,8 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
rhoj_cart = contract('pq,q->p', aux_cart2sph, rhoj)
else:
rhoj_cart = rhoj
- rhoj = rhoj[rev_aux_idx]
+
+ rhoj = intopt.unsort_orbitals(rhoj, aux_axis=[0])
tmp = contract('xpq,q->xp', int2c_e1, rhoj)
vjaux = -contract('xp,p->xp', tmp, rhoj)
vjaux_2c = cupy.array([-vjaux[:,p0:p1].sum(axis=1) for p0, p1 in auxslices[:,2:]])
@@ -153,7 +153,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
#rhok = solve_triangular(low_t, rhok, lower=False)
rhok = solve_triangular(low_t, rhok.reshape(naux, -1), lower=False, overwrite_b=True).reshape(naux, nocc, nocc)
tmp = contract('pij,qij->pq', rhok, rhok)
- tmp = take_last2d(tmp, rev_aux_idx)
+ tmp = intopt.unsort_orbitals(tmp, aux_axis=[0,1])
vkaux = -contract('xpq,pq->xp', int2c_e1, tmp)
vkaux_2c = cupy.array([-vkaux[:,p0:p1].sum(axis=1) for p0, p1 in auxslices[:,2:]])
vkaux = tmp = None
@@ -166,26 +166,25 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
t0 = log.timer_debug1('rhoj and rhok', *t0)
int2c_e1 = None
- nao_cart = intopt.mol.nao
+ nao_cart = intopt._sorted_mol.nao
block_size = with_df.get_blksize(nao=nao_cart)
intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e')
intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False,
group_size_aux=block_size)#, group_size=block_size)
- if not intopt._mol.cart:
+ dm_cart = dm
+ orbo_cart = orbo
+ if not mol.cart:
# sph2cart for ao
cart2sph = intopt.cart2sph
orbo_cart = cart2sph @ orbo
dm_cart = cart2sph @ dm @ cart2sph.T
- else:
- dm_cart = dm
- orbo_cart = orbo
- dm = orbo = None
+ dm = orbo = None
vj = vk = rhoj_tmp = rhok_tmp = None
vjaux = vkaux = None
- naux_cart = intopt.auxmol.nao
+ naux_cart = intopt._sorted_auxmol.nao
if with_j:
vj = cupy.zeros((3,nao_cart), order='C')
vjaux = cupy.zeros((3,naux_cart))
@@ -193,8 +192,8 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
vk = cupy.zeros((3,nao_cart), order='C')
vkaux = cupy.zeros((3,naux_cart))
cupy.get_default_memory_pool().free_all_blocks()
+ t1 = log.init_timer()
for cp_kl_id in range(len(intopt.aux_log_qs)):
- t1 = log.init_timer()
k0, k1 = intopt.cart_aux_loc[cp_kl_id], intopt.cart_aux_loc[cp_kl_id+1]
assert k1-k0 <= block_size
if with_j:
@@ -233,33 +232,36 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
rhoj_tmp = rhok_tmp = vj_tmp = vk_tmp = None
t1 = log.timer_debug1(f'calculate {cp_kl_id:3d} / {len(intopt.aux_log_qs):3d}, {k1-k0:3d} slices', *t1)
-
- # vj and vk are still in cartesian
- cart_ao_idx = intopt.cart_ao_idx
- rev_cart_ao_idx = numpy.argsort(cart_ao_idx)
- aoslices = intopt.mol.aoslice_by_atom()
+
+ # NOTE: vj and vk are still in cartesian
+ _sorted_mol = intopt._sorted_mol
+ natm = _sorted_mol.natm
+ ao2atom = numpy.zeros([nao_cart, natm])
+ ao_loc = _sorted_mol.ao_loc
+ for ibas, iatm in enumerate(_sorted_mol._bas[:,gto.ATOM_OF]):
+ ao2atom[ao_loc[ibas]:ao_loc[ibas+1],iatm] = 1
+ ao2atom = cupy.asarray(ao2atom)
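+ # ao2atom[mu, a] = 1 iff cartesian AO mu is centered on atom a; a single
+ # matmul then folds per-AO gradient contributions into per-atom ones,
+ # replacing the earlier argsort + aoslice summation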
if with_j:
- vj = vj[:, rev_cart_ao_idx]
- vj = [-vj[:,p0:p1].sum(axis=1) for p0, p1 in aoslices[:,2:]]
- vj = cupy.asarray(vj)
+ vj = -ao2atom.T @ vj.T
if with_k:
- vk = vk[:, rev_cart_ao_idx]
- vk = [-vk[:,p0:p1].sum(axis=1) for p0, p1 in aoslices[:,2:]]
- vk = cupy.asarray(vk)
+ vk = -ao2atom.T @ vk.T
t0 = log.timer_debug1('(di,j|P) and (i,j|dP)', *t0)
- cart_aux_idx = intopt.cart_aux_idx
- rev_cart_aux_idx = numpy.argsort(cart_aux_idx)
- auxslices = intopt.auxmol.aoslice_by_atom()
+ _sorted_auxmol = intopt._sorted_auxmol
+ natm = _sorted_auxmol.natm
+ aux2atom = numpy.zeros([naux_cart, natm])
+ ao_loc = _sorted_auxmol.ao_loc
+ for ibas, iatm in enumerate(_sorted_auxmol._bas[:,gto.ATOM_OF]):
+ aux2atom[ao_loc[ibas]:ao_loc[ibas+1],iatm] = 1
+ aux2atom = cupy.asarray(aux2atom)
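+ # same idea for the auxiliary basis: fold per-auxiliary-function
+ # contributions into per-atom ones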
if with_j:
- vjaux = vjaux[:, rev_cart_aux_idx]
- vjaux_3c = cupy.asarray([-vjaux[:,p0:p1].sum(axis=1) for p0, p1 in auxslices[:,2:]])
- vjaux = vjaux_2c + vjaux_3c
+ vjaux_3c = aux2atom.T @ vjaux.T
+ vjaux = vjaux_2c - vjaux_3c
if with_k:
- vkaux = vkaux[:, rev_cart_aux_idx]
- vkaux_3c = cupy.asarray([-vkaux[:,p0:p1].sum(axis=1) for p0, p1 in auxslices[:,2:]])
- vkaux = vkaux_2c + vkaux_3c
+ vkaux_3c = aux2atom.T @ vkaux.T
+ vkaux = vkaux_2c - vkaux_3c
+
return vj, vk, vjaux, vkaux
@@ -303,4 +305,4 @@ def extra_force(self, atom_id, envs):
else:
return 0
-Grad = Gradients
+Grad = Gradients
\ No newline at end of file
diff --git a/gpu4pyscf/df/grad/uhf.py b/gpu4pyscf/df/grad/uhf.py
index c19cc3d6..5dcb7c23 100644
--- a/gpu4pyscf/df/grad/uhf.py
+++ b/gpu4pyscf/df/grad/uhf.py
@@ -17,7 +17,7 @@
import cupy
import copy
from cupyx.scipy.linalg import solve_triangular
-from pyscf import scf
+from pyscf import scf, gto
from gpu4pyscf.df import int3c2e
from gpu4pyscf.lib.cupy_helper import tag_array, contract, load_library, take_last2d
from gpu4pyscf.grad import uhf as uhf_grad
@@ -68,13 +68,14 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
mo_coeff = cupy.asarray(mf_grad.base.mo_coeff)
if mo_occ is None:
mo_occ = cupy.asarray(mf_grad.base.mo_occ)
- ao_idx = intopt.ao_idx
- dm = take_last2d(dm0, ao_idx)
+
+ dm = intopt.sort_orbitals(dm0, axis=[0,1])
if dm2 is not None:
- dm2_tmp = take_last2d(dm2, ao_idx)
+ dm2_tmp = intopt.sort_orbitals(dm2, axis=[0,1])
+
# (L|ij) -> rhoj: (L), rhok: (L|oo)
orbo = mo_coeff[:,mo_occ>0] * mo_occ[mo_occ>0] ** 0.5
- orbo = orbo[ao_idx, :]
+ orbo = intopt.sort_orbitals(orbo, axis=[0])
nocc = orbo.shape[-1]
# (L|ij) -> rhoj: (L), rhok: (L|oo)
@@ -115,8 +116,6 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
else:
int2c_e1 = auxmol.intor('int2c2e_ip1')
int2c_e1 = cupy.asarray(int2c_e1)
- aux_ao_idx = intopt.aux_ao_idx
- rev_aux_idx = np.argsort(aux_ao_idx)
auxslices = auxmol.aoslice_by_atom()
aux_cart2sph = intopt.aux_cart2sph
low_t = low.T.copy()
@@ -133,11 +132,11 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
rhoj_cart = contract('pq,q->p', aux_cart2sph, rhoj)
else:
rhoj_cart = rhoj
-
- rhoj = rhoj[rev_aux_idx]
+ rhoj = intopt.unsort_orbitals(rhoj, aux_axis=[0])
if dm2 is not None:
- rhoj2 = rhoj2[rev_aux_idx]
+ rhoj2 = intopt.unsort_orbitals(rhoj2, aux_axis=[0])
+
tmp = contract('xpq,q->xp', int2c_e1, rhoj)
if dm2 is not None:
vjaux = -contract('xp,p->xp', tmp, rhoj2)
@@ -151,7 +150,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
elif low.tag == 'cd':
rhok = solve_triangular(low_t, rhok.reshape(naux, -1), lower=False, overwrite_b=True).reshape(naux, nocc, nocc)
tmp = contract('pij,qij->pq', rhok, rhok)
- tmp = take_last2d(tmp, rev_aux_idx)
+ tmp = intopt.unsort_orbitals(tmp, aux_axis=[0,1])
vkaux = -contract('xpq,pq->xp', int2c_e1, tmp)
vkaux_2c = cupy.array([-vkaux[:,p0:p1].sum(axis=1) for p0, p1 in auxslices[:,2:]])
vkaux = tmp = None
@@ -164,33 +163,34 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
t0 = log.timer_debug1('rhoj and rhok', *t0)
int2c_e1 = None
- nao_cart = intopt.mol.nao
+ nao_cart = intopt._sorted_mol.nao
block_size = with_df.get_blksize(nao=nao_cart)
intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e')
intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False,
group_size_aux=block_size)#, group_size=block_size)
- if not intopt._mol.cart:
+
+ if not mol.cart:
# sph2cart for ao
cart2sph = intopt.cart2sph
orbo_cart = cart2sph @ orbo
if dm2 is None:
dm_cart = cart2sph @ dm @ cart2sph.T
else:
- dm2_tmp = take_last2d(dm2, ao_idx)
+ dm2_tmp = intopt.sort_orbitals(dm2, axis=[0,1])
dm_cart = cart2sph @ dm2_tmp @ cart2sph.T
else:
if dm2 is None:
dm_cart = dm
else:
- dm_cart = take_last2d(dm2, ao_idx)
+ dm_cart = intopt.sort_orbitals(dm2, axis=[0,1])
orbo_cart = orbo
dm = orbo = None
vj = vk = rhoj_tmp = rhok_tmp = None
vjaux = vkaux = None
- naux_cart = intopt.auxmol.nao
+ naux_cart = intopt._sorted_auxmol.nao
if with_j:
vj = cupy.zeros((3,nao_cart), order='C')
vjaux = cupy.zeros((3,naux_cart))
@@ -198,8 +198,8 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
vk = cupy.zeros((3,nao_cart), order='C')
vkaux = cupy.zeros((3,naux_cart))
cupy.get_default_memory_pool().free_all_blocks()
+ t1 = log.init_timer()
for cp_kl_id in range(len(intopt.aux_log_qs)):
- t1 = log.init_timer()
k0, k1 = intopt.cart_aux_loc[cp_kl_id], intopt.cart_aux_loc[cp_kl_id+1]
assert k1-k0 <= block_size
if with_j:
@@ -239,32 +239,34 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
rhoj_tmp = rhok_tmp = vj_tmp = vk_tmp = None
t1 = log.timer_debug1(f'calculate {cp_kl_id:3d} / {len(intopt.aux_log_qs):3d}, {k1-k0:3d} slices', *t1)
- cart_ao_idx = intopt.cart_ao_idx
- rev_cart_ao_idx = np.argsort(cart_ao_idx)
- aoslices = intopt.mol.aoslice_by_atom()
+ # NOTE: vj and vk are still in cartesian
+ _sorted_mol = intopt._sorted_mol
+ natm = _sorted_mol.natm
+ ao2atom = np.zeros([nao_cart, natm])
+ ao_loc = _sorted_mol.ao_loc
+ for ibas, iatm in enumerate(_sorted_mol._bas[:,gto.ATOM_OF]):
+ ao2atom[ao_loc[ibas]:ao_loc[ibas+1],iatm] = 1
+ ao2atom = cupy.asarray(ao2atom)
if with_j:
- vj = vj[:, rev_cart_ao_idx]
- vj = [-vj[:,p0:p1].sum(axis=1) for p0, p1 in aoslices[:,2:]]
- vj = cupy.asarray(vj)
+ vj = -ao2atom.T @ vj.T
if with_k:
- vk = vk[:, rev_cart_ao_idx]
- vk = [-vk[:,p0:p1].sum(axis=1) for p0, p1 in aoslices[:,2:]]
- vk = cupy.asarray(vk)
+ vk = -ao2atom.T @ vk.T
t0 = log.timer_debug1('(di,j|P) and (i,j|dP)', *t0)
- cart_aux_idx = intopt.cart_aux_idx
- rev_cart_aux_idx = np.argsort(cart_aux_idx)
- auxslices = intopt.auxmol.aoslice_by_atom()
-
+ _sorted_auxmol = intopt._sorted_auxmol
+ natm = _sorted_auxmol.natm
+ aux2atom = np.zeros([naux_cart, natm])
+ ao_loc = _sorted_auxmol.ao_loc
+ for ibas, iatm in enumerate(_sorted_auxmol._bas[:,gto.ATOM_OF]):
+ aux2atom[ao_loc[ibas]:ao_loc[ibas+1],iatm] = 1
+ aux2atom = cupy.asarray(aux2atom)
if with_j:
- vjaux = vjaux[:, rev_cart_aux_idx]
- vjaux_3c = cupy.asarray([-vjaux[:,p0:p1].sum(axis=1) for p0, p1 in auxslices[:,2:]])
- vjaux = vjaux_2c + vjaux_3c
+ vjaux_3c = aux2atom.T @ vjaux.T
+ vjaux = vjaux_2c - vjaux_3c
if with_k:
- vkaux = vkaux[:, rev_cart_aux_idx]
- vkaux_3c = cupy.asarray([-vkaux[:,p0:p1].sum(axis=1) for p0, p1 in auxslices[:,2:]])
- vkaux = vkaux_2c + vkaux_3c
+ vkaux_3c = aux2atom.T @ vkaux.T
+ vkaux = vkaux_2c - vkaux_3c
return vj, vk, vjaux, vkaux
class Gradients(uhf_grad.Gradients):
diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py
index cc669174..b09e41af 100644
--- a/gpu4pyscf/df/hessian/rhf.py
+++ b/gpu4pyscf/df/hessian/rhf.py
@@ -96,19 +96,17 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
# ================================ sorted AO begin ===============================================
intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e')
intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE)
- ao_idx = intopt.ao_idx
- aux_ao_idx = intopt.aux_ao_idx
- naux = len(aux_ao_idx)
- mocc_2 = mocc_2[ao_idx, :]
- dm0 = take_last2d(dm0, ao_idx)
+ naux = auxmol.nao
+ mocc_2 = intopt.sort_orbitals(mocc_2, axis=[0])
+ dm0 = intopt.sort_orbitals(dm0, axis=[0,1])
dm0_tag = tag_array(dm0, occ_coeff=mocc_2)
int2c = cupy.asarray(int2c, order='C')
- int2c = take_last2d(int2c, aux_ao_idx)
+ int2c = intopt.sort_orbitals(int2c, aux_axis=[0,1])
solve_j2c = _gen_metric_solver(int2c)
int2c_ip1 = cupy.asarray(int2c_ip1, order='C')
- int2c_ip1 = take_last2d(int2c_ip1, aux_ao_idx)
+ int2c_ip1 = intopt.sort_orbitals(int2c_ip1, aux_axis=[1,2])
hj_ao_ao = cupy.zeros([nao,nao,3,3])
hk_ao_ao = cupy.zeros([nao,nao,3,3])
@@ -255,7 +253,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
else:
int2c_ipip1 = auxmol.intor('int2c2e_ipip1', aosym='s1')
int2c_ipip1 = cupy.asarray(int2c_ipip1, order='C')
- int2c_ipip1 = take_last2d(int2c_ipip1, aux_ao_idx)
+ int2c_ipip1 = intopt.sort_orbitals(int2c_ipip1, aux_axis=[1,2])
rhoj2c_P = contract('xpq,q->xp', int2c_ipip1, rhoj0_P)
# (00|0)(2|0)(0|00)
# p,xp->px
@@ -271,7 +269,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
else:
int2c_ip1ip2 = auxmol.intor('int2c2e_ip1ip2', aosym='s1')
int2c_ip1ip2 = cupy.asarray(int2c_ip1ip2, order='C')
- int2c_ip1ip2 = take_last2d(int2c_ip1ip2, aux_ao_idx)
+ int2c_ip1ip2 = intopt.sort_orbitals(int2c_ip1ip2, aux_axis=[1,2])
hj_aux_aux = -.5 * contract('p,xpq->pqx', rhoj0_P, int2c_ip1ip2*rhoj0_P).reshape(naux, naux,3,3)
if with_k:
hk_aux_aux = -.5 * contract('xpq,pq->pqx', int2c_ip1ip2, rho2c_0).reshape(naux,naux,3,3)
@@ -329,29 +327,22 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
rho2c_10= int2c_ip1_inv = None
t1 = log.timer_debug1('contract int2c_*', *t1)
- ao_idx = np.argsort(intopt.ao_idx)
- aux_idx = np.argsort(intopt.aux_ao_idx)
- rev_ao_ao = cupy.ix_(ao_idx, ao_idx)
- dm0 = dm0[rev_ao_ao]
- hj_ao_diag = hj_ao_diag[ao_idx]
- hj_ao_ao = hj_ao_ao[rev_ao_ao]
+ dm0 = intopt.unsort_orbitals(dm0, axis=[0,1])
+ hj_ao_diag = intopt.unsort_orbitals(hj_ao_diag, axis=[0])
+ hj_ao_ao = intopt.unsort_orbitals(hj_ao_ao, axis=[0,1])
if hessobj.auxbasis_response:
- rev_ao_aux = cupy.ix_(ao_idx, aux_idx)
- hj_ao_aux = hj_ao_aux[rev_ao_aux]
+ hj_ao_aux = intopt.unsort_orbitals(hj_ao_aux, axis=[0], aux_axis=[1])
if hessobj.auxbasis_response > 1:
- rev_aux_aux = cupy.ix_(aux_idx, aux_idx)
- hj_aux_diag = hj_aux_diag[aux_idx]
- hj_aux_aux = hj_aux_aux[rev_aux_aux]
-
+ hj_aux_diag = intopt.unsort_orbitals(hj_aux_diag, aux_axis=[0])
+ hj_aux_aux = intopt.unsort_orbitals(hj_aux_aux, aux_axis=[0,1])
if with_k:
- hk_ao_diag = hk_ao_diag[ao_idx]
- hk_ao_ao = hk_ao_ao[rev_ao_ao]
+ hk_ao_diag = intopt.unsort_orbitals(hk_ao_diag, axis=[0])
+ hk_ao_ao = intopt.unsort_orbitals(hk_ao_ao, axis=[0,1])
if hessobj.auxbasis_response:
- hk_ao_aux = hk_ao_aux[rev_ao_aux]
+ hk_ao_aux = intopt.unsort_orbitals(hk_ao_aux, axis=[0], aux_axis=[1])
if hessobj.auxbasis_response > 1:
- hk_aux_diag = hk_aux_diag[aux_idx]
- hk_aux_aux = hk_aux_aux[rev_aux_aux]
-
+ hk_aux_diag = intopt.unsort_orbitals(hk_aux_diag, aux_axis=[0])
+ hk_aux_aux = intopt.unsort_orbitals(hk_aux_aux, aux_axis=[0,1])
#======================================== sort AO end ===========================================
# Energy weighted density matrix
# pi,qi,i->pq
@@ -460,7 +451,6 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
mo_occ = cupy.asarray(mo_occ, order='C')
mf = hessobj.base
- #auxmol = hessobj.base.with_df.auxmol
auxmol = df.addons.make_auxmol(mol, auxbasis=mf.with_df.auxbasis)
aoslices = mol.aoslice_by_atom()
auxslices = auxmol.aoslice_by_atom()
@@ -486,16 +476,14 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
aosym=False,
group_size_aux=BLKSIZE,
group_size=BLKSIZE)
- ao_idx = intopt.ao_idx
- aux_ao_idx = intopt.aux_ao_idx
- naux = len(aux_ao_idx)
- mocc = mocc[ao_idx, :]
+ naux = auxmol.nao
+ mocc = intopt.sort_orbitals(mocc, axis=[0])
nocc = mocc.shape[1]
- mo_coeff = mo_coeff[ao_idx,:]
- dm0 = take_last2d(dm0, ao_idx)
+ mo_coeff = intopt.sort_orbitals(mo_coeff, axis=[0])
+ dm0 = intopt.sort_orbitals(dm0, axis=[0,1])
dm0_tag = tag_array(dm0, occ_coeff=mocc)
-
- int2c = take_last2d(int2c, aux_ao_idx)
+
+ int2c = intopt.sort_orbitals(int2c, aux_axis=[0,1])
solve_j2c = _gen_metric_solver(int2c)
wj, wk_Pl_ = int3c2e.get_int3c2e_wjk(mol, auxmol, dm0_tag, omega=omega)
rhoj0 = solve_j2c(wj)
@@ -530,7 +518,7 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
else:
int2c_ip1 = auxmol.intor('int2c2e_ip1', aosym='s1')
int2c_ip1 = cupy.asarray(int2c_ip1, order='C')
- int2c_ip1 = take_last2d(int2c_ip1, aux_ao_idx)
+ int2c_ip1 = intopt.sort_orbitals(int2c_ip1, aux_axis=[1,2])
# Generate rhok0_P__
if isinstance(rhok0_Pl_, cupy.ndarray):
@@ -583,17 +571,17 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
rhoj0 = rhok0_Pl_ = None
vk1_ao *= 2.0
vk1_buf *= 2.0
- rev_ao_idx = np.argsort(ao_idx)
- vj1_buf = take_last2d(vj1_buf, rev_ao_idx)
- vk1_buf = take_last2d(vk1_buf, rev_ao_idx)
+
+ vj1_buf = intopt.unsort_orbitals(vj1_buf, axis=[1,2])
+ vk1_buf = intopt.unsort_orbitals(vk1_buf, axis=[1,2])
vj1_int3c_ip1 = -contract('nxiq,ip->nxpq', vj1_ao, mo_coeff)
vk1_int3c_ip1 = -contract('nxiq,ip->nxpq', vk1_ao, mo_coeff)
vj1_ao = vk1_ao = None
t0 = log.timer_debug1('Fock matrix due to int3c2e_ip1', *t0)
- mocc = mocc[rev_ao_idx]
- mo_coeff = mo_coeff[rev_ao_idx]
+ mocc = intopt.unsort_orbitals(mocc, axis=[0])
+ mo_coeff = intopt.unsort_orbitals(mo_coeff, axis=[0])
release_gpu_stack()
# ========================== sorted AO end ================================
diff --git a/gpu4pyscf/df/hessian/rks.py b/gpu4pyscf/df/hessian/rks.py
index 468a0add..014142fa 100644
--- a/gpu4pyscf/df/hessian/rks.py
+++ b/gpu4pyscf/df/hessian/rks.py
@@ -55,7 +55,7 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
raise NotImplementedError
omega, alpha, hyb = mf._numint.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
- with_k = abs(hyb) > 1e-10
+ with_k = mf._numint.libxc.is_hybrid_xc(mf.xc)
de2, ej, ek = df_rhf_hess._partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ,
atmlst, max_memory, verbose,
with_k=with_k)
@@ -98,11 +98,12 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
mem_now = lib.current_memory()[0]
max_memory = max(2000, mf.max_memory*.9-mem_now)
h1mo = rks_hess._get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory)
+ with_k = ni.libxc.is_hybrid_xc(mf.xc)
for ia, h1, vj1, vk1 in df_rhf_hess._gen_jk(hessobj, mo_coeff, mo_occ, chkfile,
- atmlst, verbose, abs(hyb) > 1e-10):
+ atmlst, verbose, with_k):
h1mo[ia] += h1 + vj1
- if abs(hyb) > 1e-10 or abs(alpha-hyb) > 1e-10:
+ if with_k:
h1mo[ia] -= .5 * hyb * vk1
if abs(omega) > 1e-10 and abs(alpha-hyb) > 1e-10:
for ia, h1, vj1_lr, vk1_lr in df_rhf_hess._gen_jk(hessobj, mo_coeff, mo_occ, chkfile,
diff --git a/gpu4pyscf/df/hessian/tests/test_df_uks_hessian.py b/gpu4pyscf/df/hessian/tests/test_df_uks_hessian.py
index 490608e6..c39f0172 100644
--- a/gpu4pyscf/df/hessian/tests/test_df_uks_hessian.py
+++ b/gpu4pyscf/df/hessian/tests/test_df_uks_hessian.py
@@ -83,6 +83,7 @@ def test_df_gga(self):
mf = mf.to_gpu()
hessobj = mf.Hessian()
+ hessobj.base.cphf_grids = hessobj.base.grids
hess_gpu = hessobj.kernel()
assert numpy.linalg.norm(hess_cpu - hess_gpu) < 1e-5
@@ -98,9 +99,11 @@ def test_df_mgga(self):
mf = mf.to_gpu()
hessobj = mf.Hessian()
+ hessobj.base.cphf_grids = hessobj.base.grids
hess_gpu = hessobj.kernel()
assert numpy.linalg.norm(hess_cpu - hess_gpu) < 1e-5
if __name__ == "__main__":
print("Full Tests for DF UKS Hessian")
- unittest.main()
\ No newline at end of file
+ unittest.main()
+
\ No newline at end of file
diff --git a/gpu4pyscf/df/hessian/uhf.py b/gpu4pyscf/df/hessian/uhf.py
index a66b6557..71a6c7dc 100644
--- a/gpu4pyscf/df/hessian/uhf.py
+++ b/gpu4pyscf/df/hessian/uhf.py
@@ -100,23 +100,23 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
# ================================ sorted AO begin ===============================================
intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e')
intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE)
- ao_idx = intopt.ao_idx
- aux_ao_idx = intopt.aux_ao_idx
- mocca = mocca[ao_idx, :]
- moccb = moccb[ao_idx, :]
- dm0a = take_last2d(dm0a, ao_idx)
- dm0b = take_last2d(dm0b, ao_idx)
+ mocca = intopt.sort_orbitals(mocca, axis=[0])
+ moccb = intopt.sort_orbitals(moccb, axis=[0])
+ dm0a = intopt.sort_orbitals(dm0a, axis=[0,1])
+ dm0b = intopt.sort_orbitals(dm0b, axis=[0,1])
+
dm0a_tag = tag_array(dm0a, occ_coeff=mocca)
dm0b_tag = tag_array(dm0b, occ_coeff=moccb)
int2c = cupy.asarray(int2c, order='C')
- int2c = take_last2d(int2c, aux_ao_idx)
+ int2c = intopt.sort_orbitals(int2c, aux_axis=[0,1])
+
int2c_inv = pinv(int2c, lindep=LINEAR_DEP_THR)
solve_j2c = _gen_metric_solver(int2c)
int2c = None
int2c_ip1 = cupy.asarray(int2c_ip1, order='C')
- int2c_ip1 = take_last2d(int2c_ip1, aux_ao_idx)
+ int2c_ip1 = intopt.sort_orbitals(int2c_ip1, aux_axis=[1,2])
hj_ao_ao = cupy.zeros([nao,nao,3,3])
hk_ao_ao = cupy.zeros([nao,nao,3,3])
@@ -272,7 +272,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
else:
int2c_ipip1 = auxmol.intor('int2c2e_ipip1', aosym='s1')
int2c_ipip1 = cupy.asarray(int2c_ipip1, order='C')
- int2c_ipip1 = take_last2d(int2c_ipip1, aux_ao_idx)
+ int2c_ipip1 = intopt.sort_orbitals(int2c_ipip1, aux_axis=[1,2])
rhoj2c_P = contract('xpq,q->xp', int2c_ipip1, rhoj0_P)
# (00|0)(2|0)(0|00)
# p,xp->px
@@ -289,7 +289,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
else:
int2c_ip1ip2 = auxmol.intor('int2c2e_ip1ip2', aosym='s1')
int2c_ip1ip2 = cupy.asarray(int2c_ip1ip2, order='C')
- int2c_ip1ip2 = take_last2d(int2c_ip1ip2, aux_ao_idx)
+ int2c_ip1ip2 = intopt.sort_orbitals(int2c_ip1ip2, aux_axis=[1,2])
hj_aux_aux = -.5 * contract('p,xpq->pqx', rhoj0_P, int2c_ip1ip2*rhoj0_P).reshape(naux, naux,3,3)
if with_k:
hk_aux_aux = -.5 * contract('xpq,pq->pqx', int2c_ip1ip2, rho2c_0).reshape(naux,naux,3,3)
@@ -349,32 +349,23 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
rho2c_10= int2c_ip1_inv = None
t1 = log.timer_debug1('contract int2c_*', *t1)
- ao_idx = np.argsort(intopt.ao_idx)
- aux_idx = np.argsort(intopt.aux_ao_idx)
- rev_ao_ao = cupy.ix_(ao_idx, ao_idx)
- #dm0 = dm0[rev_ao_ao]
- hj_ao_diag = hj_ao_diag[ao_idx]
- hj_ao_ao = hj_ao_ao[rev_ao_ao]
+ hj_ao_diag = intopt.unsort_orbitals(hj_ao_diag, axis=[0])
+ hj_ao_ao = intopt.unsort_orbitals(hj_ao_ao, axis=[0,1])
if hessobj.auxbasis_response:
- rev_ao_aux = cupy.ix_(ao_idx, aux_idx)
- hj_ao_aux = hj_ao_aux[rev_ao_aux]
+ hj_ao_aux = intopt.unsort_orbitals(hj_ao_aux, axis=[0], aux_axis=[1])
if hessobj.auxbasis_response > 1:
- rev_aux_aux = cupy.ix_(aux_idx, aux_idx)
- hj_aux_diag = hj_aux_diag[aux_idx]
- hj_aux_aux = hj_aux_aux[rev_aux_aux]
-
+ hj_aux_diag = intopt.unsort_orbitals(hj_aux_diag, aux_axis=[0])
+ hj_aux_aux = intopt.unsort_orbitals(hj_aux_aux, aux_axis=[0,1])
if with_k:
- hk_ao_diag = hk_ao_diag[ao_idx]
- hk_ao_ao = hk_ao_ao[rev_ao_ao]
+ hk_ao_diag = intopt.unsort_orbitals(hk_ao_diag, axis=[0])
+ hk_ao_ao = intopt.unsort_orbitals(hk_ao_ao, axis=[0,1])
if hessobj.auxbasis_response:
- hk_ao_aux = hk_ao_aux[rev_ao_aux]
+ hk_ao_aux = intopt.unsort_orbitals(hk_ao_aux, axis=[0], aux_axis=[1])
if hessobj.auxbasis_response > 1:
- hk_aux_diag = hk_aux_diag[aux_idx]
- hk_aux_aux = hk_aux_aux[rev_aux_aux]
-
- mocca = mocca[ao_idx]
- moccb = moccb[ao_idx]
-
+ hk_aux_diag = intopt.unsort_orbitals(hk_aux_diag, aux_axis=[0])
+ hk_aux_aux = intopt.unsort_orbitals(hk_aux_aux, aux_axis=[0,1])
+ mocca = intopt.unsort_orbitals(mocca, axis=[0])
+ moccb = intopt.unsort_orbitals(moccb, axis=[0])
#======================================== sort AO end ===========================================
# Energy weighted density matrix
# pi,qi,i->pq
@@ -517,17 +508,15 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
aosym=False,
group_size_aux=BLKSIZE,
group_size=BLKSIZE)
- ao_idx = intopt.ao_idx
- aux_ao_idx = intopt.aux_ao_idx
-
- mocca = mocca[ao_idx, :]
- moccb = moccb[ao_idx, :]
- mo_coeff = mo_coeff[:, ao_idx,:]
- dm0a = take_last2d(dm0a, ao_idx)
- dm0b = take_last2d(dm0b, ao_idx)
+
+ mocca = intopt.sort_orbitals(mocca, axis=[0])
+ moccb = intopt.sort_orbitals(moccb, axis=[0])
+ mo_coeff = intopt.sort_orbitals(mo_coeff, axis=[1])
+ dm0a = intopt.sort_orbitals(dm0a, axis=[0,1])
+ dm0b = intopt.sort_orbitals(dm0b, axis=[0,1])
dm0 = dm0a + dm0b
- int2c = take_last2d(int2c, aux_ao_idx)
+ int2c = intopt.sort_orbitals(int2c, aux_axis=[0,1])
solve_j2c = _gen_metric_solver(int2c)
int2c = None
@@ -567,10 +556,10 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
vj1_buf, vk1a_buf, vj1a_ao, vk1a_ao = fn(intopt, rhoj0, rhok0a_Pl_, dm0_tag, aoslices, omega=omega)
dm0_tag = tag_array(dm0, occ_coeff=moccb)
vj1_buf, vk1b_buf, vj1b_ao, vk1b_ao = fn(intopt, rhoj0, rhok0b_Pl_, dm0_tag, aoslices, omega=omega)
- rev_ao_idx = np.argsort(ao_idx)
- vj1_buf = take_last2d(vj1_buf, rev_ao_idx)
- vk1a_buf = take_last2d(vk1a_buf, rev_ao_idx)
- vk1b_buf = take_last2d(vk1b_buf, rev_ao_idx)
+
+ vj1_buf = intopt.unsort_orbitals(vj1_buf, axis=[1,2])
+ vk1a_buf = intopt.unsort_orbitals(vk1a_buf, axis=[1,2])
+ vk1b_buf = intopt.unsort_orbitals(vk1b_buf, axis=[1,2])
vj1a_int3c = -contract('nxiq,ip->nxpq', vj1a_ao, mo_coeff[0])
vj1b_int3c = -contract('nxiq,ip->nxpq', vj1b_ao, mo_coeff[1])
@@ -597,13 +586,13 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
else:
int2c_ip1 = auxmol.intor('int2c2e_ip1', aosym='s1')
int2c_ip1 = cupy.asarray(int2c_ip1, order='C')
- int2c_ip1 = take_last2d(int2c_ip1, aux_ao_idx)
+ int2c_ip1 = intopt.sort_orbitals(int2c_ip1, aux_axis=[1,2])
# generate rhok0_P__
if isinstance(rhok0a_Pl_, cupy.ndarray):
rhok0a_P__ = contract('pio,ir->pro', rhok0a_Pl_, mocca)
else:
- naux = len(aux_ao_idx)
+ naux = auxmol.nao
nocc = mocca.shape[1]
rhok0a_P__ = cupy.empty([naux,nocc,nocc])
for p0, p1 in lib.prange(0,naux,64):
@@ -615,7 +604,7 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
if isinstance(rhok0b_Pl_, cupy.ndarray):
rhok0b_P__ = contract('pio,ir->pro', rhok0b_Pl_, moccb)
else:
- naux = len(aux_ao_idx)
+ naux = auxmol.nao
nocc = moccb.shape[1]
rhok0b_P__ = cupy.empty([naux,nocc,nocc])
for p0, p1 in lib.prange(0,naux,64):
@@ -670,9 +659,9 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
vk1a_int3c_ip2 = vk1b_int3c_ip2 = None
t0 = log.timer_debug1('Fock matrix due to int3c2e_ip2', *t0)
- mocca = mocca[rev_ao_idx]
- moccb = moccb[rev_ao_idx]
- mo_coeff = mo_coeff[:,rev_ao_idx]
+ mocca = intopt.unsort_orbitals(mocca, axis=[0])
+ moccb = intopt.unsort_orbitals(moccb, axis=[0])
+ mo_coeff = intopt.unsort_orbitals(mo_coeff, axis=[1])
release_gpu_stack()
# ========================== sorted AO end ================================
diff --git a/gpu4pyscf/df/hessian/uks.py b/gpu4pyscf/df/hessian/uks.py
index 9ab957be..3a4dbd52 100644
--- a/gpu4pyscf/df/hessian/uks.py
+++ b/gpu4pyscf/df/hessian/uks.py
@@ -57,7 +57,7 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
raise NotImplementedError
omega, alpha, hyb = mf._numint.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
- with_k = abs(hyb) > 1e-10
+ with_k = mf._numint.libxc.is_hybrid_xc(mf.xc)
de2, ej, ek = df_uhf_hess._partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ,
atmlst, max_memory, verbose,
with_k=with_k)
@@ -103,13 +103,14 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
mem_now = lib.current_memory()[0]
max_memory = max(2000, mf.max_memory*.9-mem_now)
h1moa, h1mob = uks_hess._get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory)
+ with_k = ni.libxc.is_hybrid_xc(mf.xc)
for ia, h1, vj1, vk1 in df_uhf_hess._gen_jk(hessobj, mo_coeff, mo_occ, chkfile,
- atmlst, verbose, abs(hyb) > 1e-10):
+ atmlst, verbose, with_k):
h1moa[ia] += h1[0] + vj1[0]
h1mob[ia] += h1[1] + vj1[1]
- if abs(hyb) > 1e-10 or abs(alpha-hyb) > 1e-10:
+ if with_k:
vk1a, vk1b = vk1
h1moa[ia] -= hyb * vk1a
h1mob[ia] -= hyb * vk1b
diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py
index f2aa0a5a..834c587c 100644
--- a/gpu4pyscf/df/int3c2e.py
+++ b/gpu4pyscf/df/int3c2e.py
@@ -64,19 +64,13 @@ def make_fake_mol():
class VHFOpt(_vhf.VHFOpt):
def __init__(self, mol, auxmol, intor, prescreen='CVHFnoscreen',
qcondname='CVHFsetnr_direct_scf', dmcondname=None):
- # use local basis_seg_contraction for efficiency
- # TODO: switch _mol and mol
- self.mol = basis_seg_contraction(mol,allow_replica=True)
- self.auxmol = basis_seg_contraction(auxmol, allow_replica=True)
- self._mol = mol
- self._auxmol = auxmol
+ self.mol = mol # original mol
+ self.auxmol = auxmol # original auxiliary mol
+ self._sorted_mol = None # sorted mol
+ self._sorted_auxmol = None # sorted auxiliary mol
- '''
- # Note mol._bas will be sorted in .build() method. VHFOpt should be
- # initialized after mol._bas updated.
- '''
- self.nao = self.mol.nao
- self.naux = self.auxmol.nao
+ self._ao_idx = None
+ self._aux_ao_idx = None
self._intor = intor
self._prescreen = prescreen
@@ -85,11 +79,6 @@ def __init__(self, mol, auxmol, intor, prescreen='CVHFnoscreen',
self.bpcache = None
- self.cart_ao_idx = None
- self.sph_ao_idx = None
- self.cart_aux_idx = None
- self.sph_aux_idx = None
-
self.cart_ao_loc = []
self.cart_aux_loc = []
self.sph_ao_loc = []
@@ -128,14 +117,16 @@ def build(self, cutoff=1e-14, group_size=None,
a tot_mol is created with concatenating [mol, fake_mol, aux_mol]
we will pair (ao,ao) and (aux,1) separately.
'''
- _mol = self._mol
- _auxmol = self._auxmol
- mol = self.mol
- auxmol = self.auxmol
+ _mol = self.mol
+ _auxmol = self.auxmol
+ mol = basis_seg_contraction(_mol, allow_replica=True)
+ auxmol = basis_seg_contraction(_auxmol, allow_replica=True)
+
log = logger.new_logger(_mol, _mol.verbose)
cput0 = log.init_timer()
- sorted_mol, sorted_idx, uniq_l_ctr, l_ctr_counts = sort_mol(mol, log=log)
+ _sorted_mol, sorted_idx, uniq_l_ctr, l_ctr_counts = sort_mol(mol, log=log)
+
if group_size is not None :
uniq_l_ctr, l_ctr_counts = _split_l_ctr_groups(uniq_l_ctr, l_ctr_counts, group_size)
self.nctr = len(uniq_l_ctr)
@@ -145,16 +136,16 @@ def build(self, cutoff=1e-14, group_size=None,
_, _, fake_uniq_l_ctr, fake_l_ctr_counts = sort_mol(fake_mol, log=log)
# sort auxiliary mol
- sorted_auxmol, sorted_aux_idx, aux_uniq_l_ctr, aux_l_ctr_counts = sort_mol(auxmol, log=log)
+ _sorted_auxmol, sorted_aux_idx, aux_uniq_l_ctr, aux_l_ctr_counts = sort_mol(auxmol, log=log)
if group_size_aux is not None:
aux_uniq_l_ctr, aux_l_ctr_counts = _split_l_ctr_groups(aux_uniq_l_ctr, aux_l_ctr_counts, group_size_aux)
-
- tot_mol = sorted_mol + fake_mol + sorted_auxmol
- tot_mol.cart = True
- self.tot_mol = tot_mol
+
+ _tot_mol = _sorted_mol + fake_mol + _sorted_auxmol
+ _tot_mol.cart = True
+ self._tot_mol = _tot_mol
# Initialize vhfopt after reordering mol._bas
- _vhf.VHFOpt.__init__(self, sorted_mol, self._intor, self._prescreen,
+ _vhf.VHFOpt.__init__(self, _sorted_mol, self._intor, self._prescreen,
self._qcondname, self._dmcondname)
self.direct_scf_tol = cutoff
@@ -169,32 +160,19 @@ def build(self, cutoff=1e-14, group_size=None,
cput1 = log.timer_debug1('Get pairing', *cput1)
# contraction coefficient for ao basis
- cart_ao_loc = sorted_mol.ao_loc_nr(cart=True)
- sph_ao_loc = sorted_mol.ao_loc_nr(cart=False)
+ cart_ao_loc = _sorted_mol.ao_loc_nr(cart=True)
+ sph_ao_loc = _sorted_mol.ao_loc_nr(cart=False)
self.cart_ao_loc = [cart_ao_loc[cp] for cp in l_ctr_offsets]
self.sph_ao_loc = [sph_ao_loc[cp] for cp in l_ctr_offsets]
self.angular = [l[0] for l in uniq_l_ctr]
- cart_ao_loc = mol.ao_loc_nr(cart=True)
- sph_ao_loc = mol.ao_loc_nr(cart=False)
- nao = sph_ao_loc[-1]
- ao_idx = np.array_split(np.arange(nao), sph_ao_loc[1:-1])
- self.sph_ao_idx = np.hstack([ao_idx[i] for i in sorted_idx])
+ # Sorted AO indices
+ ao_loc = mol.ao_loc_nr(cart=_mol.cart)
+ ao_idx = np.array_split(np.arange(_mol.nao), ao_loc[1:-1])
+ self._ao_idx = np.hstack([ao_idx[i] for i in sorted_idx])
# cartesian ao index
- nao = cart_ao_loc[-1]
- ao_idx = np.array_split(np.arange(nao), cart_ao_loc[1:-1])
- self.cart_ao_idx = np.hstack([ao_idx[i] for i in sorted_idx])
- ncart = cart_ao_loc[-1]
- nsph = sph_ao_loc[-1]
- self.cart2sph = block_c2s_diag(ncart, nsph, self.angular, l_ctr_counts)
-
- if _mol.cart:
- inv_idx = np.argsort(self.cart_ao_idx, kind='stable').astype(np.int32)
- self.coeff = cupy.eye(ncart)[:,inv_idx]
- else:
- inv_idx = np.argsort(self.sph_ao_idx, kind='stable').astype(np.int32)
- self.coeff = self.cart2sph[:, inv_idx]
+ self.cart2sph = block_c2s_diag(self.angular, l_ctr_counts)
cput1 = log.timer_debug1('AO cart2sph coeff', *cput1)
# pairing auxiliary basis with fake basis set
@@ -203,36 +181,22 @@ def build(self, cutoff=1e-14, group_size=None,
aux_l_ctr_offsets = np.append(0, np.cumsum(aux_l_ctr_counts))
# contraction coefficient for auxiliary basis
- cart_aux_loc = sorted_auxmol.ao_loc_nr(cart=True)
- sph_aux_loc = sorted_auxmol.ao_loc_nr(cart=False)
+ cart_aux_loc = _sorted_auxmol.ao_loc_nr(cart=True)
+ sph_aux_loc = _sorted_auxmol.ao_loc_nr(cart=False)
self.cart_aux_loc = [cart_aux_loc[cp] for cp in aux_l_ctr_offsets]
self.sph_aux_loc = [sph_aux_loc[cp] for cp in aux_l_ctr_offsets]
self.aux_angular = [l[0] for l in aux_uniq_l_ctr]
- cart_aux_loc = self.auxmol.ao_loc_nr(cart=True)
- sph_aux_loc = self.auxmol.ao_loc_nr(cart=False)
- naux = sph_aux_loc[-1]
- ao_idx = np.array_split(np.arange(naux), sph_aux_loc[1:-1])
- self.sph_aux_idx = np.hstack([ao_idx[i] for i in sorted_aux_idx])
+ aux_loc = _auxmol.ao_loc_nr(cart=_auxmol.cart)
+ ao_idx = np.array_split(np.arange(_auxmol.nao), aux_loc[1:-1])
+ self._aux_ao_idx = np.hstack([ao_idx[i] for i in sorted_aux_idx])
# cartesian aux index
- naux = cart_aux_loc[-1]
- ao_idx = np.array_split(np.arange(naux), cart_aux_loc[1:-1])
- self.cart_aux_idx = np.hstack([ao_idx[i] for i in sorted_aux_idx])
- ncart = cart_aux_loc[-1]
- nsph = sph_aux_loc[-1]
- self.aux_cart2sph = block_c2s_diag(ncart, nsph, self.aux_angular, aux_l_ctr_counts)
-
- if _auxmol.cart:
- inv_idx = np.argsort(self.cart_aux_idx, kind='stable').astype(np.int32)
- self.aux_coeff = cupy.eye(ncart)[:,inv_idx]
- else:
- inv_idx = np.argsort(self.sph_aux_idx, kind='stable').astype(np.int32)
- self.aux_coeff = self.aux_cart2sph[:, inv_idx]
+ self.aux_cart2sph = block_c2s_diag(self.aux_angular, aux_l_ctr_counts)
aux_l_ctr_offsets += fake_l_ctr_offsets[-1]
cput1 = log.timer_debug1('aux cart2sph coeff', *cput1)
- ao_loc = sorted_mol.ao_loc_nr(cart=_mol.cart)
+ ao_loc = _sorted_mol.ao_loc_nr(cart=_mol.cart)
self.ao_pairs_row, self.ao_pairs_col = get_ao_pairs(pair2bra, pair2ket, ao_loc)
cderi_row = cupy.hstack(self.ao_pairs_row)
cderi_col = cupy.hstack(self.ao_pairs_col)
@@ -268,7 +232,7 @@ def build(self, cutoff=1e-14, group_size=None,
bas_pair2shls = np.hstack(pair2bra + pair2ket).astype(np.int32).reshape(2,-1)
bas_pairs_locs = np.append(0, np.cumsum([x.size for x in pair2bra])).astype(np.int32)
log_qs = log_qs + aux_log_qs
- ao_loc = tot_mol.ao_loc_nr(cart=True)
+ ao_loc = _tot_mol.ao_loc_nr(cart=True)
ncptype = len(log_qs)
self.bpcache = ctypes.POINTER(BasisProdCache)()
@@ -278,9 +242,9 @@ def build(self, cutoff=1e-14, group_size=None,
ao_loc.ctypes.data_as(ctypes.c_void_p),
bas_pair2shls.ctypes.data_as(ctypes.c_void_p),
bas_pairs_locs.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(ncptype),
- tot_mol._atm.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(tot_mol.natm),
- tot_mol._bas.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(tot_mol.nbas),
- tot_mol._env.ctypes.data_as(ctypes.c_void_p))
+ _tot_mol._atm.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(_tot_mol.natm),
+ _tot_mol._bas.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(_tot_mol.nbas),
+ _tot_mol._env.ctypes.data_as(ctypes.c_void_p))
cput1 = log.timer_debug1('Initialize GPU cache', *cput1)
self.bas_pairs_locs = bas_pairs_locs
@@ -294,25 +258,79 @@ def build(self, cutoff=1e-14, group_size=None,
if _mol.cart:
self.ao_loc = self.cart_ao_loc
- self.ao_idx = self.cart_ao_idx
else:
self.ao_loc = self.sph_ao_loc
- self.ao_idx = self.sph_ao_idx
if _auxmol.cart:
self.aux_ao_loc = self.cart_aux_loc
- self.aux_ao_idx = self.cart_aux_idx
else:
self.aux_ao_loc = self.sph_aux_loc
- self.aux_ao_idx = self.sph_aux_idx
- self.rev_ao_idx = np.argsort(self.ao_idx, kind='stable').astype(np.int32)
- self.ao_idx = cupy.array(self.ao_idx)
- self.cart_ao_idx = cupy.array(self.cart_ao_idx)
- self.sph_ao_idx = cupy.array(self.sph_ao_idx)
- self.aux_ao_idx = cupy.array(self.aux_ao_idx)
- self.cart_aux_idx = cupy.array(self.cart_aux_idx)
- self.sph_aux_idx = cupy.array(self.sph_aux_idx)
- self.rev_ao_idx = cupy.array(self.rev_ao_idx)
+ self._sorted_mol = _sorted_mol
+ self._sorted_auxmol = _sorted_auxmol
+
+ def sort_orbitals(self, mat, axis=[], aux_axis=[]):
+ ''' Transform given axis of a matrix into sorted AO,
+ and transform given auxiliary axis of a matrix into sorted auxiliary AO
+ '''
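+ # e.g. sort_orbitals(dm, axis=[0,1]) reorders both AO axes of a density
+ # matrix; sort_orbitals(j2c, aux_axis=[0,1]) reorders the 2c2e metric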
+ idx = self._ao_idx
+ aux_idx = self._aux_ao_idx
+ shape_ones = (1,) * mat.ndim
+ fancy_index = []
+ for dim, n in enumerate(mat.shape):
+ if dim in axis:
+ assert n == len(idx)
+ indices = idx
+ elif dim in aux_axis:
+ assert n == len(aux_idx)
+ indices = aux_idx
+ else:
+ indices = np.arange(n)
+ idx_shape = shape_ones[:dim] + (-1,) + shape_ones[dim+1:]
+ fancy_index.append(indices.reshape(idx_shape))
+ return mat[tuple(fancy_index)]
+
+ def unsort_orbitals(self, sorted_mat, axis=[], aux_axis=[]):
+ ''' Transform given axis of a matrix back into the original AO order,
+ and transform given auxiliary axis back into the original auxiliary AO order
+ '''
+ idx = self._ao_idx
+ aux_idx = self._aux_ao_idx
+ shape_ones = (1,) * sorted_mat.ndim
+ fancy_index = []
+ for dim, n in enumerate(sorted_mat.shape):
+ if dim in axis:
+ assert n == len(idx)
+ indices = idx
+ elif dim in aux_axis:
+ assert n == len(aux_idx)
+ indices = aux_idx
+ else:
+ indices = np.arange(n)
+ idx_shape = shape_ones[:dim] + (-1,) + shape_ones[dim+1:]
+ fancy_index.append(indices.reshape(idx_shape))
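+ # scatter through the same fancy index, inverting the gather in sort_orbitals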
+ mat = cupy.empty_like(sorted_mat)
+ mat[tuple(fancy_index)] = sorted_mat
+ return mat
+
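+ # The transformation matrices below are rebuilt on demand from the sorted-AO
+ # indices; coeff satisfies dm_sorted_cart = coeff @ dm @ coeff.T for a dm
+ # given in the original AO order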
+ @property
+ def coeff(self):
+ nao = self.mol.nao
+ if self.mol.cart:
+ coeff = cupy.eye(nao)
+ self._coeff = self.unsort_orbitals(coeff, axis=[1])
+ else:
+ self._coeff = self.unsort_orbitals(self.cart2sph, axis=[1])
+ return self._coeff
+
+ @property
+ def aux_coeff(self):
+ naux = self.auxmol.nao
+ if self.auxmol.cart:
+ coeff = cupy.eye(naux)
+ self._aux_coeff = self.unsort_orbitals(coeff, aux_axis=[1])
+ else:
+ self._aux_coeff = self.unsort_orbitals(self.aux_cart2sph, aux_axis=[1])
+ return self._aux_coeff
def get_int3c2e_wjk(mol, auxmol, dm0_tag, thred=1e-12, omega=None, with_k=True):
log = logger.new_logger(mol, mol.verbose)
@@ -351,7 +369,7 @@ def get_int3c2e_wjk(mol, auxmol, dm0_tag, thred=1e-12, omega=None, with_k=True):
li = intopt.angular[cpi]
lj = intopt.angular[cpj]
int3c_blk = get_int3c2e_slice(intopt, cp_ij_id, cp_kl_id, omega=omega)
- if not intopt._mol.cart:
+ if not intopt.mol.cart:
int3c_blk = cart2sph(int3c_blk, axis=1, ang=lj)
int3c_blk = cart2sph(int3c_blk, axis=2, ang=li)
i0, i1 = intopt.ao_loc[cpi], intopt.ao_loc[cpi+1]
@@ -378,7 +396,7 @@ def get_int3c2e_ip_jk(intopt, cp_aux_id, ip_type, rhoj, rhok, dm, omega=None):
'''
fn = getattr(libgvhf, 'GINTbuild_int3c2e_' + ip_type + '_jk')
if omega is None: omega = 0.0
- nao = intopt.mol.nao
+ nao = intopt._sorted_mol.nao
n_dm = 1
cp_kl_id = cp_aux_id + len(intopt.log_qs)
@@ -451,19 +469,19 @@ def loop_int3c2e_general(intopt, ip_type='', omega=None, stream=None):
if omega is None: omega = 0.0
if stream is None: stream = cupy.cuda.get_current_stream()
- nao = intopt.mol.nao
- naux = intopt.auxmol.nao
+ nao = intopt._sorted_mol.nao
+ naux = intopt._sorted_auxmol.nao
norb = nao + naux + 1
ao_loc = intopt.ao_loc
aux_ao_loc = intopt.aux_ao_loc
comp = 3**order
- lmax = intopt.mol._bas[:gto.ANG_OF].max()
- aux_lmax = intopt.auxmol._bas[:gto.ANG_OF].max()
+ lmax = intopt._sorted_mol._bas[:gto.ANG_OF].max()
+ aux_lmax = intopt._sorted_auxmol._bas[:gto.ANG_OF].max()
nroots = (lmax + aux_lmax + order)//2 + 1
if nroots > NROOT_ON_GPU:
from pyscf.gto.moleintor import getints, make_cintopt
- pmol = intopt.tot_mol
+ pmol = intopt._tot_mol
intor = pmol._add_suffix('int3c2e_' + ip_type)
opt = make_cintopt(pmol._atm, pmol._bas, pmol._env, intor)
@@ -519,9 +537,9 @@ def loop_int3c2e_general(intopt, ip_type='', omega=None, stream=None):
int3c_cpu = getints(intor, pmol._atm, pmol._bas, pmol._env, shls_slice, cintopt=opt).transpose([0,3,2,1])
int3c_blk = cupy.asarray(int3c_cpu)
- if not intopt._auxmol.cart:
+ if not intopt.auxmol.cart:
int3c_blk = cart2sph(int3c_blk, axis=1, ang=lk)
- if not intopt._mol.cart:
+ if not intopt.mol.cart:
int3c_blk = cart2sph(int3c_blk, axis=2, ang=lj)
int3c_blk = cart2sph(int3c_blk, axis=3, ang=li)
@@ -550,9 +568,9 @@ def loop_aux_jk(intopt, ip_type='', omega=None, stream=None):
if omega is None: omega = 0.0
if stream is None: stream = cupy.cuda.get_current_stream()
- nao = len(intopt.ao_idx)
- nao_cart = intopt.mol.nao
- naux_cart = intopt.auxmol.nao
+ nao = intopt.mol.nao
+ nao_cart = intopt._sorted_mol.nao
+ naux_cart = intopt._sorted_auxmol.nao
norb_cart = nao_cart + naux_cart + 1
ao_loc = intopt.ao_loc
aux_ao_loc = intopt.aux_ao_loc
@@ -615,20 +633,20 @@ def loop_aux_jk(intopt, ip_type='', omega=None, stream=None):
yield aux_id, ints_slices
def get_ao2atom(intopt, aoslices):
- ao_idx = intopt.ao_idx
- ao2atom = cupy.zeros([len(ao_idx), len(aoslices)])
+ nao = intopt.mol.nao
+ ao2atom = cupy.zeros([nao, len(aoslices)])
for ia, aoslice in enumerate(aoslices):
_, _, p0, p1 = aoslice
ao2atom[p0:p1,ia] = 1.0
- return ao2atom[ao_idx,:]
+ return intopt.sort_orbitals(ao2atom, axis=[0])
def get_aux2atom(intopt, auxslices):
- aux_ao_idx = intopt.aux_ao_idx
- aux2atom = cupy.zeros([len(aux_ao_idx), len(auxslices)])
+ naux = intopt.auxmol.nao
+ aux2atom = cupy.zeros([naux, len(auxslices)])
for ia, auxslice in enumerate(auxslices):
_, _, p0, p1 = auxslice
aux2atom[p0:p1,ia] = 1.0
- return aux2atom[aux_ao_idx,:]
+ return intopt.sort_orbitals(aux2atom, aux_axis=[0])
def get_j_int3c2e_pass1(intopt, dm0, sort_j=True):
'''
@@ -636,22 +654,24 @@ def get_j_int3c2e_pass1(intopt, dm0, sort_j=True):
'''
n_dm = 1
- naux = intopt.cart_aux_loc[-1]#len(intopt.cart_aux_idx)
- rhoj = cupy.zeros([naux])
+ naux = intopt._sorted_auxmol.nao
+
coeff = intopt.coeff
if dm0.ndim == 3:
dm0 = dm0[0] + dm0[1]
dm_cart = coeff @ dm0 @ coeff.T
-
+
num_cp_ij = [len(log_qs) for log_qs in intopt.log_qs]
num_cp_kl = [len(log_qs) for log_qs in intopt.aux_log_qs]
bins_locs_ij = np.append(0, np.cumsum(num_cp_ij)).astype(np.int32)
bins_locs_kl = np.append(0, np.cumsum(num_cp_kl)).astype(np.int32)
-
+
ncp_ij = len(intopt.log_qs)
ncp_kl = len(intopt.aux_log_qs)
norb = dm_cart.shape[0]
+
+ rhoj = cupy.zeros([naux])
err = libgvhf.GINTbuild_j_int3c2e_pass1(
intopt.bpcache,
ctypes.cast(dm_cart.data.ptr, ctypes.c_void_p),
@@ -665,7 +685,7 @@ def get_j_int3c2e_pass1(intopt, dm0, sort_j=True):
ctypes.c_int(ncp_kl))
if err != 0:
raise RuntimeError('CUDA error in get_j_pass1')
-
+
if sort_j:
aux_coeff = intopt.aux_coeff
rhoj = cupy.dot(rhoj, aux_coeff)
@@ -676,8 +696,8 @@ def get_j_int3c2e_pass2(intopt, rhoj):
get vj pass2 for int3c2e
'''
n_dm = 1
- norb = len(intopt.cart_ao_idx)
- naux = len(intopt.cart_aux_idx)
+ norb = intopt._sorted_mol.nao
+ naux = intopt._sorted_auxmol.nao
vj = cupy.zeros([norb, norb])
num_cp_ij = [len(log_qs) for log_qs in intopt.log_qs]
@@ -688,9 +708,10 @@ def get_j_int3c2e_pass2(intopt, rhoj):
ncp_ij = len(intopt.log_qs)
ncp_kl = len(intopt.aux_log_qs)
-
- aux_coeff = intopt.aux_coeff
- rhoj = cupy.dot(aux_coeff, rhoj)
+
+ rhoj = intopt.sort_orbitals(rhoj, aux_axis=[0])
+ if not intopt.auxmol.cart:
+ rhoj = intopt.aux_cart2sph @ rhoj
err = libgvhf.GINTbuild_j_int3c2e_pass2(
intopt.bpcache,
@@ -706,8 +727,11 @@ def get_j_int3c2e_pass2(intopt, rhoj):
if err != 0:
raise RuntimeError('CUDA error in get_j_pass2')
- coeff = intopt.coeff
- vj = coeff.T @ vj @ coeff
+
+ if not intopt.mol.cart:
+ cart2sph = intopt.cart2sph
+ vj = cart2sph.T @ vj @ cart2sph
+ vj = intopt.unsort_orbitals(vj, axis=[0,1])
vj = vj + vj.T
return vj
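(The two passes above implement the standard density-fitted Coulomb build: pass 1 contracts the density matrix into auxiliary fit coefficients, pass 2 contracts those coefficients back into vj. A dense reference, a sketch mirroring the checks in test_df_jk.py, with int3c the assembled nao x nao x naux tensor:

    import cupy

    def vj_reference(int3c, dm):
        rhoj = cupy.einsum('ijL,ij->L', int3c, dm)    # pass 1: fit coefficients
        return cupy.einsum('ijL,L->ij', int3c, rhoj)  # pass 2: Coulomb matrix
)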
@@ -719,7 +743,7 @@ def get_int3c2e_jk(mol, auxmol, dm0_tag, with_k=True, omega=None):
intopt.build(1e-14, diag_block_with_triu=True, aosym=True, group_size=BLKSIZE, group_size_aux=BLKSIZE)
if omega is None: omega = 0.0
- naux = len(intopt.aux_ao_idx)
+ naux = auxmol.nao
orbo = cupy.asarray(dm0_tag.occ_coeff, order='C')
nocc = orbo.shape[1]
rhoj = cupy.empty([naux])
@@ -736,7 +760,7 @@ def get_int3c2e_jk(mol, auxmol, dm0_tag, with_k=True, omega=None):
li = intopt.angular[cpi]
lj = intopt.angular[cpj]
int3c_blk = get_int3c2e_slice(intopt, cp_ij_id, cp_kl_id, omega=omega)
- if not intopt._mol.cart:
+ if not intopt.mol.cart:
int3c_blk = cart2sph(int3c_blk, axis=1, ang=lj)
int3c_blk = cart2sph(int3c_blk, axis=2, ang=li)
i0, i1 = intopt.ao_loc[cpi], intopt.ao_loc[cpi+1]
@@ -761,8 +785,8 @@ def get_int3c2e_ip1_vjk(intopt, rhoj, rhok, dm0_tag, aoslices, with_k=True, omeg
# vj and vk responses (due to int3c2e_ip1) to changes in atomic positions
'''
ao2atom = get_ao2atom(intopt, aoslices)
- natom = len(aoslices)
- nao = len(intopt.ao_idx)
+ natom = intopt.mol.natm
+ nao = intopt.mol.nao
orbo = cupy.asarray(dm0_tag.occ_coeff, order='C')
nocc = orbo.shape[1]
vj1_buf = cupy.zeros([3,nao,nao])
@@ -820,8 +844,8 @@ def get_int3c2e_ip2_vjk(intopt, rhoj, rhok, dm0_tag, auxslices, with_k=True, ome
vj and vk responses (due to int3c2e_ip2) to changes in atomic positions
'''
aux2atom = get_aux2atom(intopt, auxslices)
- natom = len(auxslices)
- nao = len(intopt.ao_idx)
+ natom = intopt.mol.natm
+ nao = intopt.mol.nao
orbo = cupy.asarray(dm0_tag.occ_coeff, order='C')
nocc = orbo.shape[1]
vj1 = cupy.zeros([natom,3,nao,nocc])
@@ -863,8 +887,8 @@ def get_int3c2e_ip1_wjk(intopt, dm0_tag, with_k=True, omega=None):
'''
get wj and wk for int3c2e_ip1
'''
- nao = len(intopt.ao_idx)
- naux = len(intopt.aux_ao_idx)
+ nao = intopt.mol.nao
+ naux = intopt.auxmol.nao
orbo = cupy.asarray(dm0_tag.occ_coeff, order='C')
nocc = orbo.shape[1]
@@ -903,7 +927,7 @@ def get_int3c2e_ip2_wjk(intopt, dm0_tag, with_k=True, omega=None):
'''
get wj and wk for int3c2e_ip2
'''
- naux = len(intopt.aux_ao_idx)
+ naux = intopt.auxmol.nao
orbo = cupy.asarray(dm0_tag.occ_coeff, order='C')
nocc = orbo.shape[1]
wj = cupy.zeros([naux,3])
@@ -918,12 +942,12 @@ def get_int3c2e_ipip1_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, omega=None):
'''
get hj and hk with int3c2e_ipip1
'''
- nao_sph = dm0_tag.shape[0]
+ nao = dm0_tag.shape[0]
orbo = cupy.asarray(dm0_tag.occ_coeff, order='C')
- hj = cupy.zeros([nao_sph,9])
+ hj = cupy.zeros([nao,9])
hk = None
if with_k:
- hk = cupy.zeros([nao_sph,9])
+ hk = cupy.zeros([nao,9])
for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ipip1', omega=omega):
tmp = contract('xpji,ij->xpi', int3c_blk, dm0_tag[i0:i1,j0:j1])
hj[i0:i1] += contract('xpi,p->ix', tmp, rhoj[k0:k1])
@@ -931,21 +955,21 @@ def get_int3c2e_ipip1_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, omega=None):
rhok_tmp = contract('por,ir->pio', rhok[k0:k1], orbo[i0:i1])
rhok_tmp = contract('pio,jo->pij', rhok_tmp, orbo[j0:j1])
hk[i0:i1] += contract('xpji,pij->ix', int3c_blk, rhok_tmp)
- hj = hj.reshape([nao_sph,3,3])
+ hj = hj.reshape([nao,3,3])
if with_k:
- hk = hk.reshape([nao_sph,3,3])
+ hk = hk.reshape([nao,3,3])
return hj, hk
def get_int3c2e_ipvip1_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, omega=None):
'''
# get hj and hk with int3c2e_ipvip1
'''
- nao_sph = dm0_tag.shape[0]
+ nao = dm0_tag.shape[0]
orbo = cupy.asarray(dm0_tag.occ_coeff, order='C')
- hj = cupy.zeros([nao_sph,nao_sph,9])
+ hj = cupy.zeros([nao,nao,9])
hk = None
if with_k:
- hk = cupy.zeros([nao_sph,nao_sph,9])
+ hk = cupy.zeros([nao,nao,9])
for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ipvip1', omega=omega):
tmp = contract('xpji,ij->xpij', int3c_blk, dm0_tag[i0:i1,j0:j1])
hj[i0:i1,j0:j1] += contract('xpij,p->ijx', tmp, rhoj[k0:k1])
@@ -953,22 +977,22 @@ def get_int3c2e_ipvip1_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, omega=None)
rhok_tmp = contract('por,ir->pio', rhok[k0:k1], orbo[i0:i1])
rhok_tmp = contract('pio,jo->pji', rhok_tmp, orbo[j0:j1])
hk[i0:i1,j0:j1] += contract('xpji,pji->ijx', int3c_blk, rhok_tmp)
- hj = hj.reshape([nao_sph,nao_sph,3,3])
+ hj = hj.reshape([nao,nao,3,3])
if with_k:
- hk = hk.reshape([nao_sph,nao_sph,3,3])
+ hk = hk.reshape([nao,nao,3,3])
return hj, hk
def get_int3c2e_ip1ip2_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, omega=None):
'''
# get hj and hk with int3c2e_ip1ip2
'''
- nao_sph = dm0_tag.shape[0]
- naux_sph = rhok.shape[0]
+ nao = dm0_tag.shape[0]
+ naux = rhok.shape[0]
orbo = cupy.asarray(dm0_tag.occ_coeff, order='C')
- hj = cupy.zeros([nao_sph,naux_sph,9])
+ hj = cupy.zeros([nao,naux,9])
hk = None
if with_k:
- hk = cupy.zeros([nao_sph,naux_sph,9])
+ hk = cupy.zeros([nao,naux,9])
for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ip1ip2', omega=omega):
tmp = contract('xpji,ij->xpi', int3c_blk, dm0_tag[i0:i1,j0:j1])
hj[i0:i1,k0:k1] += contract('xpi,p->ipx', tmp, rhoj[k0:k1])
@@ -976,21 +1000,21 @@ def get_int3c2e_ip1ip2_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, omega=None)
rhok_tmp = contract('por,ir->pio', rhok[k0:k1], orbo[i0:i1])
rhok_tmp = contract('pio,jo->pij', rhok_tmp, orbo[j0:j1])
hk[i0:i1,k0:k1] += contract('xpji,pij->ipx', int3c_blk, rhok_tmp)
- hj = hj.reshape([nao_sph,naux_sph,3,3])
+ hj = hj.reshape([nao,naux,3,3])
if with_k:
- hk = hk.reshape([nao_sph,naux_sph,3,3])
+ hk = hk.reshape([nao,naux,3,3])
return hj, hk
def get_int3c2e_ipip2_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, omega=None):
'''
# get hj and hk with int3c2e_ipip2
'''
- naux_sph = rhok.shape[0]
+ naux = rhok.shape[0]
orbo = cupy.asarray(dm0_tag.occ_coeff, order='C')
- hj = cupy.zeros([naux_sph,9])
+ hj = cupy.zeros([naux,9])
hk = None
if with_k:
- hk = cupy.zeros([naux_sph,9])
+ hk = cupy.zeros([naux,9])
for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ipip2', omega=omega):
tmp = contract('xpji,ij->xp', int3c_blk, dm0_tag[i0:i1,j0:j1])
hj[k0:k1] += contract('xp,p->px', tmp, rhoj[k0:k1])
@@ -998,9 +1022,9 @@ def get_int3c2e_ipip2_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, omega=None):
rhok_tmp = contract('por,jr->pjo', rhok[k0:k1], orbo[j0:j1])
rhok_tmp = contract('pjo,io->pji', rhok_tmp, orbo[i0:i1])
hk[k0:k1] += contract('xpji,pji->px', int3c_blk, rhok_tmp)
- hj = hj.reshape([naux_sph,3,3])
+ hj = hj.reshape([naux,3,3])
if with_k:
- hk = hk.reshape([naux_sph,3,3])
+ hk = hk.reshape([naux,3,3])
return hj, hk
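(In the four hjk routines above, the trailing dimension of 9 packs the Cartesian derivative pairs, so the final reshape exposes a 3x3 second-derivative block per orbital/aux index. A tiny sketch, assuming row-major xx..zz packing as the reshape implies:

    import cupy
    h = cupy.arange(9.0).reshape(1, 9)    # one packed row of derivative components
    h33 = h.reshape(1, 3, 3)              # h33[0, a, b] ~ d2/(dR_a dR_b) component
)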
def get_hess_nuc_elec(mol, dm):
@@ -1016,8 +1040,7 @@ def get_hess_nuc_elec(mol, dm):
fakemol.stdout = mol.stdout
intopt = VHFOpt(mol, fakemol, 'int2e')
intopt.build(1e-14, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE)
- ao_idx = intopt.ao_idx
- dm = take_last2d(cupy.asarray(dm), ao_idx)
+ dm = intopt.sort_orbitals(cupy.asarray(dm), axis=[0,1])
natm = mol.natm
nao = mol.nao
@@ -1172,9 +1195,9 @@ def get_int3c2e_ip(mol, auxmol=None, ip_type=1, auxbasis='weigend+etb', direct_s
if err != 0:
raise RuntimeError("int3c2e_ip failed\n")
- if not intopt._auxmol.cart:
+ if not intopt.auxmol.cart:
int3c_blk = cart2sph(int3c_blk, axis=1, ang=lk)
- if not intopt._mol.cart:
+ if not intopt.mol.cart:
int3c_blk = cart2sph(int3c_blk, axis=2, ang=lj)
int3c_blk = cart2sph(int3c_blk, axis=3, ang=li)
@@ -1183,13 +1206,9 @@ def get_int3c2e_ip(mol, auxmol=None, ip_type=1, auxbasis='weigend+etb', direct_s
k0, k1 = aux_ao_loc[aux_id], aux_ao_loc[aux_id+1]
int3c[:, k0:k1, j0:j1, i0:i1] = int3c_blk
- ao_idx = np.argsort(intopt.ao_idx)
- aux_idx = np.argsort(intopt.aux_ao_idx)
- int3c = int3c[cupy.ix_(np.arange(3), aux_idx, ao_idx, ao_idx)]
-
+ int3c = intopt.unsort_orbitals(int3c, aux_axis=[1], axis=[2,3])
return int3c.transpose([0,3,2,1])
-
def get_int3c2e_general(mol, auxmol=None, ip_type='', auxbasis='weigend+etb', direct_scf_tol=1e-13, omega=None, stream=None):
'''
Generate full int3c2e type tensor on GPU
@@ -1219,13 +1238,12 @@ def get_int3c2e_general(mol, auxmol=None, ip_type='', auxbasis='weigend+etb', di
nroots = (lmax + aux_lmax + order)//2 + 1
if nroots > NROOT_ON_GPU:
from pyscf.gto.moleintor import getints, make_cintopt
- mol = intopt.mol
- pmol = intopt.tot_mol
+ pmol = intopt._tot_mol
intor = pmol._add_suffix('int3c2e_' + ip_type)
opt = make_cintopt(pmol._atm, pmol._bas, pmol._env, intor)
- nao_cart = intopt.mol.nao
- naux_cart = intopt.auxmol.nao
+ nao_cart = intopt._sorted_mol.nao
+ naux_cart = intopt._sorted_auxmol.nao
norb_cart = nao_cart + naux_cart + 1
ao_loc = intopt.ao_loc
aux_ao_loc = intopt.aux_ao_loc
@@ -1281,9 +1299,9 @@ def get_int3c2e_general(mol, auxmol=None, ip_type='', auxbasis='weigend+etb', di
int3c_cpu = getints(intor, pmol._atm, pmol._bas, pmol._env, shls_slice, cintopt=opt).transpose([0,3,2,1])
int3c_blk = cupy.asarray(int3c_cpu)
- if not intopt._auxmol.cart:
+ if not intopt.auxmol.cart:
int3c_blk = cart2sph(int3c_blk, axis=1, ang=lk)
- if not intopt._mol.cart:
+ if not intopt.mol.cart:
int3c_blk = cart2sph(int3c_blk, axis=2, ang=lj)
int3c_blk = cart2sph(int3c_blk, axis=3, ang=li)
@@ -1293,10 +1311,7 @@ def get_int3c2e_general(mol, auxmol=None, ip_type='', auxbasis='weigend+etb', di
int3c[:, k0:k1, j0:j1, i0:i1] = int3c_blk
- ao_idx = np.argsort(intopt.ao_idx)
- aux_idx = np.argsort(intopt.aux_ao_idx)
- int3c = int3c[cupy.ix_(np.arange(comp), aux_idx, ao_idx, ao_idx)]
-
+ int3c = intopt.unsort_orbitals(int3c, aux_axis=[1], axis=[2,3])
return int3c.transpose([0,3,2,1])
def get_dh1e(mol, dm0):
@@ -1313,7 +1328,7 @@ def get_dh1e(mol, dm0):
fakemol.stdout = mol.stdout
intopt = VHFOpt(mol, fakemol, 'int2e')
intopt.build(1e-14, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE)
- dm0_sorted = take_last2d(dm0, intopt.ao_idx)
+ dm0_sorted = intopt.sort_orbitals(dm0, axis=[0,1])
dh1e = cupy.zeros([natm,3])
for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ip1'):
dh1e[k0:k1,:3] += contract('xkji,ij->kx', int3c_blk, dm0_sorted[i0:i1,j0:j1])
@@ -1332,7 +1347,7 @@ def get_d2h1e(mol, dm0):
d2h1e_offdiag = cupy.zeros([natm, nao, 9])
intopt = VHFOpt(mol, fakemol, 'int2e')
intopt.build(1e-14, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE)
- dm0_sorted = take_last2d(dm0, intopt.ao_idx)
+ dm0_sorted = intopt.sort_orbitals(dm0, axis=[0,1])
for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ipip1'):
d2h1e_diag[k0:k1,:9] -= contract('xaji,ij->ax', int3c_blk, dm0_sorted[i0:i1,j0:j1])
d2h1e_offdiag[k0:k1,i0:i1,:9] += contract('xaji,ij->aix', int3c_blk, dm0_sorted[i0:i1,j0:j1])
@@ -1352,8 +1367,8 @@ def get_int3c2e_slice(intopt, cp_ij_id, cp_aux_id, cart=False, aosym=None, out=N
'''
if stream is None: stream = cupy.cuda.get_current_stream()
if omega is None: omega = 0.0
- nao_cart = intopt.mol.nao
- naux_cart = intopt.auxmol.nao
+ nao_cart = intopt._sorted_mol.nao
+ naux_cart = intopt._sorted_auxmol.nao
norb_cart = nao_cart + naux_cart + 1
cpi = intopt.cp_idx[cp_ij_id]
@@ -1381,7 +1396,7 @@ def get_int3c2e_slice(intopt, cp_ij_id, cp_aux_id, cart=False, aosym=None, out=N
# if possible, write the data into the given allocated space
# otherwise, need a temporary space for cart2sph
'''
- if out is None or (lk > 1 and not intopt._auxmol.cart):
+ if out is None or (lk > 1 and not intopt.auxmol.cart):
int3c_blk = cupy.zeros([nk,nj,ni], order='C')
strides = np.array([1, ni, ni*nj, 1], dtype=np.int32)
else:
@@ -1408,7 +1423,7 @@ def get_int3c2e_slice(intopt, cp_ij_id, cp_aux_id, cart=False, aosym=None, out=N
raise RuntimeError('GINT_fill_int2e failed')
# move this operation to j2c?
- if lk > 1 and intopt._auxmol.cart == 0:
+ if lk > 1 and intopt.auxmol.cart == 0:
int3c_blk = cart2sph(int3c_blk, axis=0, ang=lk, out=out)
return int3c_blk
@@ -1445,10 +1460,7 @@ def get_int3c2e(mol, auxmol=None, auxbasis='weigend+etb', direct_scf_tol=1e-13,
int3c[:, j0:j1, i0:i1] = int3c_slice
row, col = np.tril_indices(nao)
int3c[:, row, col] = int3c[:, col, row]
- ao_idx = np.argsort(intopt.ao_idx)
- aux_id = np.argsort(intopt.aux_ao_idx)
- int3c = int3c[np.ix_(aux_id, ao_idx, ao_idx)]
-
+ int3c = intopt.unsort_orbitals(int3c, aux_axis=[0], axis=[1,2])
return int3c.transpose([2,1,0])
def sort_mol(mol0, cart=True, log=None):
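(Throughout this file, explicit ao_idx/take_last2d index juggling is replaced by the VHFOpt sort_orbitals/unsort_orbitals pair, which must be exact inverses for the refactor to be safe. A minimal round-trip sketch, assuming the methods behave as used above:

    import pyscf
    import cupy
    from pyscf import df
    from gpu4pyscf.df import int3c2e

    mol = pyscf.M(atom='H 0 0 0; H 0 0 0.74', basis='cc-pVDZ')
    auxmol = df.addons.make_auxmol(mol, auxbasis='weigend')
    intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e')
    intopt.build(1e-14)
    dm = cupy.random.rand(mol.nao, mol.nao)
    dm_sorted = intopt.sort_orbitals(dm, axis=[0, 1])   # original -> sorted order
    assert cupy.allclose(intopt.unsort_orbitals(dm_sorted, axis=[0, 1]), dm)
)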
diff --git a/gpu4pyscf/df/tests/test_jk.py b/gpu4pyscf/df/tests/test_df_jk.py
similarity index 54%
rename from gpu4pyscf/df/tests/test_jk.py
rename to gpu4pyscf/df/tests/test_df_jk.py
index f353e529..6fb3f841 100644
--- a/gpu4pyscf/df/tests/test_jk.py
+++ b/gpu4pyscf/df/tests/test_df_jk.py
@@ -17,9 +17,10 @@
import numpy as np
import cupy
import pyscf
-from pyscf import df
+from pyscf import df, lib
from gpu4pyscf import scf as gpu_scf
from gpu4pyscf.df import int3c2e, df_jk
+from gpu4pyscf.df.df import DF
atom='''
Ti 0.0 0.0 0.0
@@ -31,18 +32,20 @@
bas='def2-tzvpp'
def setUpModule():
- global mol, auxmol
- mol = pyscf.M(atom=atom, basis=bas, max_memory=32000)
- mol.output = '/dev/null'
- mol.cart = True
- mol.build()
- mol.verbose = 1
+ global mol, mol_sph, auxmol, auxmol_sph
+ mol = pyscf.M(atom=atom, basis=bas, output='/dev/null', cart=True, verbose=1)
auxmol = df.addons.make_auxmol(mol, auxbasis='sto3g')
+ mol_sph = pyscf.M(atom=atom, basis=bas, output='/dev/null', cart=False, verbose=1)
+ auxmol_sph = df.addons.make_auxmol(mol_sph, auxbasis='sto3g')
+
def tearDownModule():
- global mol, auxmol
+ global mol, mol_sph, auxmol, auxmol_sph
mol.stdout.close()
- del mol, auxmol
+ mol_sph.stdout.close()
+ auxmol.stdout.close()
+ auxmol_sph.stdout.close()
+ del mol, auxmol, mol_sph, auxmol_sph
class KnownValues(unittest.TestCase):
@@ -51,7 +54,7 @@ def test_vj_incore(self):
intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e')
intopt.build(1e-14, diag_block_with_triu=False, aosym=True)
cupy.random.seed(np.asarray(1, dtype=np.uint64))
- nao = len(intopt.ao_idx)
+ nao = intopt.mol.nao
dm = cupy.random.rand(nao, nao)
dm = dm + dm.T
@@ -64,6 +67,25 @@ def test_vj_incore(self):
vj_outcore = cupy.einsum('ijL,L->ij', int3c_gpu, rhoj_outcore)
vj_incore = int3c2e.get_j_int3c2e_pass2(intopt, rhoj_incore)
assert cupy.linalg.norm(vj_outcore - vj_incore) < 1e-5
+
+ def test_vj_sph_incore(self):
+ int3c_gpu = int3c2e.get_int3c2e(mol_sph, auxmol, aosym=True, direct_scf_tol=1e-14)
+ intopt = int3c2e.VHFOpt(mol_sph, auxmol, 'int2e')
+ intopt.build(1e-14, diag_block_with_triu=False, aosym=True)
+ cupy.random.seed(np.asarray(1, dtype=np.uint64))
+ nao = intopt.mol.nao
+ dm = cupy.random.rand(nao, nao)
+ dm = dm + dm.T
+
+ # pass 1
+ rhoj_outcore = cupy.einsum('ijL,ij->L', int3c_gpu, dm)
+ rhoj_incore = 2.0*int3c2e.get_j_int3c2e_pass1(intopt, dm)
+ assert cupy.linalg.norm(rhoj_outcore - rhoj_incore) < 1e-8
+
+ # pass 2
+ vj_outcore = cupy.einsum('ijL,L->ij', int3c_gpu, rhoj_outcore)
+ vj_incore = int3c2e.get_j_int3c2e_pass2(intopt, rhoj_incore)
+ assert cupy.linalg.norm(vj_outcore - vj_incore) < 1e-5
def test_j_outcore(self):
cupy.random.seed(np.asarray(1, dtype=np.uint64))
@@ -72,10 +94,22 @@ def test_j_outcore(self):
dm = dm + dm.T
mf = gpu_scf.RHF(mol).density_fit()
mf.kernel()
- vj0, _ = mf.get_jk(dm=dm, with_j=True, with_k=False)
+ vj0, _ = mf.get_jk(dm=dm, with_j=True, with_k=False, hermi=1)
vj = df_jk.get_j(mf.with_df, dm)
assert cupy.linalg.norm(vj - vj0) < 1e-4
+
+ def test_jk_hermi0(self):
+ dfobj = DF(mol, 'sto3g').build()
+ np.random.seed(3)
+ nao = mol.nao
+ dm = np.random.rand(nao, nao)
+ refj, refk = dfobj.to_cpu().get_jk(dm, hermi=0)
+ vj, vk = dfobj.get_jk(dm, hermi=0)
+ assert abs(vj - refj).max() < 1e-9
+ assert abs(vk - refk).max() < 1e-9
+ assert abs(lib.fp(vj) - 455.864593801164).max() < 1e-9
+ assert abs(lib.fp(vk) - 37.7022369618297).max() < 1e-9
if __name__ == "__main__":
print("Full Tests for DF JK")
- unittest.main()
\ No newline at end of file
+ unittest.main()
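(The new test_jk_hermi0 exercises the hermi=0 code path end to end against the CPU implementation. The usage pattern it pins down, sketched here with a hypothetical small system:

    import numpy as np
    import pyscf
    from gpu4pyscf.df.df import DF

    mol = pyscf.M(atom='H 0 0 0; F 0 0 0.9', basis='def2-svp')
    dfobj = DF(mol, 'sto3g').build()
    dm = np.random.rand(mol.nao, mol.nao)     # deliberately non-hermitian
    vj, vk = dfobj.get_jk(dm, hermi=0)
    refj, refk = dfobj.to_cpu().get_jk(dm, hermi=0)
    assert abs(vj - refj).max() < 1e-9 and abs(vk - refk).max() < 1e-9
)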
diff --git a/gpu4pyscf/df/tests/test_df_rhf.py b/gpu4pyscf/df/tests/test_df_rhf.py
index 3852c70b..abb2da46 100644
--- a/gpu4pyscf/df/tests/test_df_rhf.py
+++ b/gpu4pyscf/df/tests/test_df_rhf.py
@@ -31,15 +31,11 @@
def setUpModule():
global mol_sph, mol_cart
- mol_sph = pyscf.M(atom=atom, basis=bas, max_memory=32000, cart=0)
- mol_sph.output = '/dev/null'
- mol_sph.build()
- mol_sph.verbose = 1
+ mol_sph = pyscf.M(atom=atom, basis=bas, cart=0,
+ symmetry=True, output='/dev/null', verbose=1)
- mol_cart = pyscf.M(atom=atom, basis=bas, max_memory=32000, cart=1)
- mol_cart.output = '/dev/null'
- mol_cart.build()
- mol_cart.verbose = 1
+ mol_cart = pyscf.M(atom=atom, basis=bas, cart=1,
+ output='/dev/null', verbose=1)
def tearDownModule():
global mol_sph, mol_cart
diff --git a/gpu4pyscf/df/tests/test_df_rks.py b/gpu4pyscf/df/tests/test_df_rks.py
index 4cd40701..1aa69944 100644
--- a/gpu4pyscf/df/tests/test_df_rks.py
+++ b/gpu4pyscf/df/tests/test_df_rks.py
@@ -31,15 +31,11 @@
def setUpModule():
global mol_sph, mol_cart
- mol_sph = pyscf.M(atom=atom, basis=bas, max_memory=32000, cart=0)
- mol_sph.output = '/dev/null'
- mol_sph.build()
- mol_sph.verbose = 1
-
- mol_cart = pyscf.M(atom=atom, basis=bas, max_memory=32000, cart=1)
- mol_cart.output = '/dev/null'
- mol_cart.build()
- mol_cart.verbose = 1
+ mol_sph = pyscf.M(atom=atom, basis=bas, max_memory=32000, cart=0,
+ output='/dev/null', verbose=1)
+
+ mol_cart = pyscf.M(atom=atom, basis=bas, max_memory=32000, cart=1,
+ output='/dev/null', verbose=1)
def tearDownModule():
global mol_sph, mol_cart
diff --git a/gpu4pyscf/df/tests/test_df_rks_grad.py b/gpu4pyscf/df/tests/test_df_rks_grad.py
index ea382e66..a218630d 100644
--- a/gpu4pyscf/df/tests/test_df_rks_grad.py
+++ b/gpu4pyscf/df/tests/test_df_rks_grad.py
@@ -117,17 +117,17 @@ def _vs_cpu(mol, grid_response=False, xc=xc0, disp=disp0, tol=1e-9):
assert abs(g_analy - ref).max() < tol
class KnownValues(unittest.TestCase):
-
+
def test_grad_with_grids_response(self):
print("-----testing DF DFT gradient with grids response----")
_check_grad(mol_sph, grid_response=True, xc='LDA', disp=None)
_check_grad(mol_sph, grid_response=True, xc='B3LYP', disp=None)
_check_grad(mol_sph, grid_response=True, xc='m06', disp=None, tol=1e-4)
-
+
def test_grad_lda(self):
print("-----LDA testing-------")
_vs_cpu(mol_sph, xc='LDA', disp=None)
-
+
def test_grad_gga(self):
print('-----GGA testing-------')
_vs_cpu(mol_sph, xc='PBE', disp=None)
@@ -147,7 +147,7 @@ def test_grad_rsh(self):
def test_grad_nlc(self):
print('--------nlc testing-------------')
_vs_cpu(mol_sph, xc='HYB_MGGA_XC_WB97M_V', disp=None, tol=1e-7)
-
+
def test_grad_cart(self):
print('------ Cart testing--------')
_vs_cpu(mol_cart, xc='B3LYP', disp=None)
@@ -163,7 +163,7 @@ def test_grad_d4(self):
def test_grad_wb97m_d3bj(self):
print('------ wB97m-d3bj --------')
_vs_cpu(mol_sph, xc='wb97m-d3bj', tol=1e-8)
-
+
if __name__ == "__main__":
print("Full Tests for DF Gradient")
unittest.main()
diff --git a/gpu4pyscf/dft/__init__.py b/gpu4pyscf/dft/__init__.py
index d1ae3570..c65e412d 100644
--- a/gpu4pyscf/dft/__init__.py
+++ b/gpu4pyscf/dft/__init__.py
@@ -1,9 +1,9 @@
from . import rks
-from .rks import RKS
+from .rks import RKS, KohnShamDFT
from .uks import UKS
from .gks import GKS
from .roks import ROKS
-from gpu4pyscf.dft.gen_grid import Grids
+from .gen_grid import Grids
def KS(mol, xc='LDA,VWN'):
if mol.spin == 0:
diff --git a/gpu4pyscf/dft/gks.py b/gpu4pyscf/dft/gks.py
index dda28353..3f709733 100644
--- a/gpu4pyscf/dft/gks.py
+++ b/gpu4pyscf/dft/gks.py
@@ -26,6 +26,7 @@ class GKS(gks.GKS, GHF):
def __init__(self, mol, xc='LDA,VWN'):
raise NotImplementedError
+ reset = rks.RKS.reset
energy_elec = rks.RKS.energy_elec
get_veff = NotImplemented
nuc_grad_method = NotImplemented
diff --git a/gpu4pyscf/dft/libxc.py b/gpu4pyscf/dft/libxc.py
index 8a07e3c3..850a879a 100644
--- a/gpu4pyscf/dft/libxc.py
+++ b/gpu4pyscf/dft/libxc.py
@@ -124,17 +124,18 @@ def _check_arrays(current_arrays, fields, sizes, factor, required):
"""
A specialized function built to construct and check the sizes of arrays given to the LibXCFunctional class.
"""
-
# Nothing supplied so we build it out
if current_arrays is None:
current_arrays = {}
+ if not required:
+ for label in fields:
+ current_arrays[label] = None
+ return current_arrays
+
for label in fields:
- if required:
- size = sizes[label]
- current_arrays[label] = cupy.empty((factor, size), dtype=np.float64)
- else:
- current_arrays[label] = None # cupy.empty((1))
+ size = sizes[label]
+ current_arrays[label] = cupy.empty((factor, size), dtype=np.float64)
return current_arrays
@@ -150,6 +151,7 @@ class _xcfun(ctypes.Structure):
class XCfun:
def __init__(self, xc, spin):
+ self.spin = spin
self._spin = 1 if spin == 'unpolarized' else 2
self.xc_func = _libxc.xc_func_alloc()
if isinstance(xc, str):
@@ -178,6 +180,9 @@ def needs_laplacian(self):
rsh_coeff = dft.libxc.rsh_coeff
def compute(self, inp, output=None, do_exc=True, do_vxc=True, do_fxc=False, do_kxc=False, do_lxc=False):
+ # TODO: fall back to dft.libxc.eval_xc for do_kxc and do_lxc
+ assert not do_kxc
+ assert not do_lxc
if isinstance(inp, cupy.ndarray):
inp = {"rho": cupy.asarray(inp, dtype=cupy.double)}
elif isinstance(inp, dict):
@@ -207,12 +212,6 @@ def compute(self, inp, output=None, do_exc=True, do_vxc=True, do_fxc=False, do_k
args.extend([ inp[x] for x in input_labels])
args.extend([output[x] for x in output_labels])
- cuda_args = []
- for arg in args:
- if(isinstance(arg, cupy.ndarray)):
- arg = ctypes.cast(arg.data.ptr, ctypes.c_void_p)
- cuda_args.append(arg)
- #_libxc.xc_lda(*cuda_args)
out_params = xc_lda_out_params()
buf_params = xc_lda_out_params()
@@ -246,12 +245,6 @@ def compute(self, inp, output=None, do_exc=True, do_vxc=True, do_fxc=False, do_k
args.extend([ inp[x] for x in input_labels])
args.extend([output[x] for x in output_labels])
- cuda_args = []
- for arg in args:
- if(isinstance(arg, cupy.ndarray)):
- arg = ctypes.cast(arg.data.ptr, ctypes.c_void_p)
- cuda_args.append(arg)
- #_libxc.xc_gga(*cuda_args)
out_params = xc_gga_out_params()
buf_params = xc_gga_out_params()
@@ -295,12 +288,6 @@ def compute(self, inp, output=None, do_exc=True, do_vxc=True, do_fxc=False, do_k
args.insert(-1, cupy.empty((1))) # Add none ptr to laplacian
#args.insert(-1, cupy.zeros_like(inp['rho']))
args.extend([output[x] for x in output_labels])
- cuda_args = []
- for arg in args:
- if(isinstance(arg, cupy.ndarray)):
- arg = ctypes.cast(arg.data.ptr, ctypes.c_void_p)
- cuda_args.append(arg)
- #_libxc.xc_mgga(*cuda_args)
out_params = xc_mgga_out_params()
buf_params = xc_mgga_out_params()
@@ -310,13 +297,14 @@ def compute(self, inp, output=None, do_exc=True, do_vxc=True, do_fxc=False, do_k
setattr(buf_params, label, buf[label].data.ptr)
setattr(out_params, label, output[label].data.ptr)
stream = cupy.cuda.get_current_stream()
+ lapl = cupy.empty(1)
err = libgdft.GDFT_xc_mgga(
stream.ptr,
self.xc_func,
npoints,
inp['rho'].data.ptr,
inp['sigma'].data.ptr,
- cupy.empty(1).data.ptr,
+ lapl.data.ptr,
inp['tau'].data.ptr,
ctypes.byref(out_params),
ctypes.byref(buf_params)
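(With the dead cuda_args plumbing removed, compute() drives the GPU libxc kernels directly. A usage sketch, assuming an LDA input passed as a plain density array and libxc-style output labels; both are inferred from the code above, not guaranteed by this patch:

    import cupy
    from gpu4pyscf.dft.libxc import XCfun

    fn = XCfun('LDA,VWN', 'unpolarized')
    rho = cupy.random.rand(128)                        # density on 128 grid points
    out = fn.compute({'rho': rho}, do_exc=True, do_vxc=True)
    exc, vrho = out['zk'], out['vrho']                 # labels follow libxc naming
)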
diff --git a/gpu4pyscf/dft/numint.py b/gpu4pyscf/dft/numint.py
index b68d5368..c1bb1180 100644
--- a/gpu4pyscf/dft/numint.py
+++ b/gpu4pyscf/dft/numint.py
@@ -16,6 +16,7 @@
# along with this program. If not, see .
import ctypes
+from functools import lru_cache
import contextlib
import numpy as np
import cupy
@@ -25,7 +26,7 @@
from pyscf.gto.eval_gto import NBINS, CUTOFF, make_screen_index
from gpu4pyscf.gto.mole import basis_seg_contraction
from gpu4pyscf.lib.cupy_helper import (
- contract, get_avail_mem, load_library, add_sparse, release_gpu_stack, take_last2d, transpose_sum,
+ contract, get_avail_mem, load_library, add_sparse, release_gpu_stack, transpose_sum,
grouped_dot, grouped_gemm)
from gpu4pyscf.dft import xc_deriv, xc_alias, libxc
from gpu4pyscf import __config__
@@ -41,7 +42,6 @@
# Should we release the cupy cache?
FREE_CUPY_CACHE = False
-MGGA_DENSITY_LAPL = False
USE_SPARSITY = 2 # 0: no sparsity, 1: in-house GEMM, 2: sparse in AO direction
libgdft = load_library('libgdft')
@@ -52,23 +52,26 @@
libgdft.GDFTdot_ao_ao_sparse.restype = ctypes.c_int
libgdft.GDFTdot_aow_ao_sparse.restype = ctypes.c_int
-def eval_ao(ni, mol, coords, deriv=0, shls_slice=None, nao_slice=None, ao_loc_slice=None,
- non0tab=None, out=None, verbose=None, ctr_offsets_slice=None):
+def eval_ao(mol, coords, deriv=0, shls_slice=None, nao_slice=None, ao_loc_slice=None,
+ non0tab=None, out=None, verbose=None, ctr_offsets_slice=None, gdftopt=None,
+ transpose=True):
''' evaluate ao values for given coords and shell indices
Kwargs:
shls_slice : offsets of shell slices to be evaluated
ao_loc_slice: offsets of ao slices to be evaluated
ctr_offsets_slice: offsets of contraction patterns
Returns:
- ao: comp x nao_slice x ngrids, ao is in C-contiguous
+ ao: comp x nao_slice x ngrids; ao is C-contiguous.
+ comp x ngrids x nao_slice if transpose=True, compatible with PySCF.
'''
- opt = getattr(ni, 'gdftopt', None)
- with_opt = True
- if opt is None or mol not in [opt.mol, opt._sorted_mol]:
- ni.build(mol, coords)
- opt = ni.gdftopt
- with_opt = False
- mol = None
+ if gdftopt is None:
+ opt = _GDFTOpt.from_mol(mol)
+ with opt.gdft_envs_cache():
+ return eval_ao(
+ mol, coords, deriv, shls_slice, nao_slice, ao_loc_slice,
+ non0tab, out, verbose, ctr_offsets_slice, opt, transpose)
+
+ opt = gdftopt
_sorted_mol = opt._sorted_mol
if shls_slice is None:
@@ -78,6 +81,9 @@ def eval_ao(ni, mol, coords, deriv=0, shls_slice=None, nao_slice=None, ao_loc_sl
ao_loc_slice = cupy.asarray(_sorted_mol.ao_loc_nr())
nao_slice = _sorted_mol.nao
else:
+ assert ao_loc_slice is not None
+ assert nao_slice is not None
+ assert ctr_offsets_slice is not None
ctr_offsets = opt.l_ctr_offsets
nctr = ctr_offsets.size - 1
@@ -96,44 +102,34 @@ def eval_ao(ni, mol, coords, deriv=0, shls_slice=None, nao_slice=None, ao_loc_sl
if out is None:
out = cupy.empty((comp, nao_slice, ngrids), order='C')
- if not with_opt:
- # mol may be different to _GDFTOpt._sorted_mol.
- # nao should be consistent with the _GDFTOpt._sorted_mol object
- coeff = cupy.asarray(opt.coeff)
- with opt.gdft_envs_cache():
- err = libgdft.GDFTeval_gto(
- ctypes.cast(stream.ptr, ctypes.c_void_p),
- ctypes.cast(out.data.ptr, ctypes.c_void_p),
- ctypes.c_int(deriv), ctypes.c_int(_sorted_mol.cart),
- ctypes.cast(coords.data.ptr, ctypes.c_void_p), ctypes.c_int(ngrids),
- ctypes.cast(shls_slice.data.ptr, ctypes.c_void_p),
- ctypes.cast(ao_loc_slice.data.ptr, ctypes.c_void_p),
- ctypes.c_int(nao_slice),
- ctr_offsets.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(nctr),
- ctr_offsets_slice.ctypes.data_as(ctypes.c_void_p),
- _sorted_mol._bas.ctypes.data_as(ctypes.c_void_p))
- out = contract('nig,ij->njg', out, coeff).transpose([0,2,1])
- else:
- err = libgdft.GDFTeval_gto(
- ctypes.cast(stream.ptr, ctypes.c_void_p),
- ctypes.cast(out.data.ptr, ctypes.c_void_p),
- ctypes.c_int(deriv), ctypes.c_int(_sorted_mol.cart),
- ctypes.cast(coords.data.ptr, ctypes.c_void_p), ctypes.c_int(ngrids),
- ctypes.cast(shls_slice.data.ptr, ctypes.c_void_p),
- ctypes.cast(ao_loc_slice.data.ptr, ctypes.c_void_p),
- ctypes.c_int(nao_slice),
- ctr_offsets.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(nctr),
- ctr_offsets_slice.ctypes.data_as(ctypes.c_void_p),
- _sorted_mol._bas.ctypes.data_as(ctypes.c_void_p))
+ err = libgdft.GDFTeval_gto(
+ ctypes.cast(stream.ptr, ctypes.c_void_p),
+ ctypes.cast(out.data.ptr, ctypes.c_void_p),
+ ctypes.c_int(deriv), ctypes.c_int(_sorted_mol.cart),
+ ctypes.cast(coords.data.ptr, ctypes.c_void_p), ctypes.c_int(ngrids),
+ ctypes.cast(shls_slice.data.ptr, ctypes.c_void_p),
+ ctypes.cast(ao_loc_slice.data.ptr, ctypes.c_void_p),
+ ctypes.c_int(nao_slice),
+ ctr_offsets.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(nctr),
+ ctr_offsets_slice.ctypes.data_as(ctypes.c_void_p),
+ _sorted_mol._bas.ctypes.data_as(ctypes.c_void_p))
+
if err != 0:
raise RuntimeError('CUDA Error in evaluating AO')
+ if mol is not _sorted_mol:
+ coeff = cupy.asarray(opt.coeff)
+ out = contract('nig,ij->njg', out, coeff)
+
+ if transpose:
+ out = out.transpose(0,2,1)
+
if deriv == 0:
out = out[0]
return out
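(eval_ao is now a module-level function taking the molecule first, with an optional prebuilt gdftopt and a transpose flag selecting the PySCF-compatible grid-major layout. A minimal call sketch:

    import pyscf
    import cupy
    from gpu4pyscf.dft import numint

    mol = pyscf.M(atom='He 0 0 0', basis='cc-pVDZ')
    coords = cupy.random.rand(64, 3)
    ao = numint.eval_ao(mol, coords, deriv=0)                     # (ngrids, nao)
    ao_t = numint.eval_ao(mol, coords, deriv=0, transpose=False)  # (nao, ngrids)
)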
def eval_rho(mol, ao, dm, non0tab=None, xctype='LDA', hermi=0,
- with_lapl=True, verbose=None):
+ with_lapl=False, verbose=None):
xctype = xctype.upper()
if xctype in ('LDA', 'HF'):
_, ngrids = ao.shape
@@ -153,17 +149,13 @@ def eval_rho(mol, ao, dm, non0tab=None, xctype='LDA', hermi=0,
if hermi:
rho[1:4] *= 2 # *2 for + einsum('pi,ij,pj->p', ao[i], dm, ao[0])
else:
- c0 = dm.dot(ao[0])
+ c0 = dm.T.dot(ao[0])
for i in range(1, 4):
rho[i] += _contract_rho(ao[i], c0)
else: # meta-GGA
- if with_lapl:
- # rho[4] = \nabla^2 rho, rho[5] = 1/2 |nabla f|^2
- rho = cupy.empty((6,ngrids))
- tau_idx = 5
- else:
- rho = cupy.empty((5,ngrids))
- tau_idx = 4
+ assert not with_lapl
+ rho = cupy.empty((5,ngrids))
+ tau_idx = 4
c0 = dm.dot(ao[0])
rho[0] = _contract_rho(c0, ao[0])
@@ -181,11 +173,11 @@ def eval_rho(mol, ao, dm, non0tab=None, xctype='LDA', hermi=0,
return rho
def eval_rho1(mol, ao, mo_coeff, mo_occ, non0tab=None, xctype='LDA',
- with_lapl=True, verbose=None):
+ with_lapl=False, verbose=None):
raise NotImplementedError
def eval_rho2(mol, ao, mo_coeff, mo_occ, non0tab=None, xctype='LDA',
- with_lapl=True, verbose=None, out=None):
+ with_lapl=False, verbose=None, out=None):
xctype = xctype.upper()
if xctype == 'LDA' or xctype == 'HF':
_, ngrids = ao.shape
@@ -205,40 +197,24 @@ def eval_rho2(mol, ao, mo_coeff, mo_occ, non0tab=None, xctype='LDA',
_contract_rho(c0, c1, rho=rho[i])
rho[1:] *= 2
else: # meta-GGA
- if with_lapl:
- # rho[4] = \nabla^2 rho, rho[5] = 1/2 |nabla f|^2
- rho = cupy.empty((6,ngrids))
- tau_idx = 5
- else:
- rho = cupy.empty((5,ngrids))
- tau_idx = 4
+ assert not with_lapl
+ rho = cupy.empty((5,ngrids))
+ tau_idx = 4
c0 = cupy.dot(cpos.T, ao[0])
_contract_rho(c0, c0, rho=rho[0])
-
rho[tau_idx] = 0
for i in range(1, 4):
c1 = cupy.dot(cpos.T, ao[i])
rho[i] = _contract_rho(c0, c1)
rho[tau_idx] += _contract_rho(c1, c1)
- if with_lapl:
- if ao.shape[0] > 4:
- XX, YY, ZZ = 4, 7, 9
- ao2 = ao[XX] + ao[YY] + ao[ZZ]
- c1 = cupy.dot(cpos.T, ao2)
- #:rho[4] = numpy.einsum('pi,pi->p', c0, c1)
- rho[4] = _contract_rho(c0, c1)
- rho[4] += rho[5]
- rho[4] *= 2
- else:
- rho[4] = 0
rho[1:4] *= 2
rho[tau_idx] *= .5
return rho
def eval_rho3(mol, ao, c0, mo1, non0tab=None, xctype='LDA',
- with_lapl=True, verbose=None):
+ with_lapl=False, verbose=None):
xctype = xctype.upper()
if xctype == 'LDA' or xctype == 'HF':
_, ngrids = ao.shape
@@ -261,15 +237,9 @@ def eval_rho3(mol, ao, c0, mo1, non0tab=None, xctype='LDA',
rho[i] += _contract_rho(c0[0], c_0[i])
rho *= 2.0
else: # meta-GGA
- # TODO: complete this
- if with_lapl:
- raise NotImplementedError("mGGA with lapl not implemented")
- # rho[4] = \nabla^2 rho, rho[5] = 1/2 |nabla f|^2
- rho = cupy.empty((6,ngrids))
- tau_idx = 5
- else:
- rho = cupy.empty((5,ngrids))
- tau_idx = 4
+ assert not with_lapl
+ rho = cupy.empty((5,ngrids))
+ tau_idx = 4
c_0 = contract('nig,io->nog', ao, cpos1)
#:rho[0] = numpy.einsum('pi,pi->p', c0, c0)
rho[0] = _contract_rho(c0[0], c_0[0])
@@ -281,27 +251,22 @@ def eval_rho3(mol, ao, c0, mo1, non0tab=None, xctype='LDA',
rho[i]+= _contract_rho(c0[0], c_0[i])
rho[tau_idx] += _contract_rho(c_0[i], c0[i])
rho *= 2.0
- if with_lapl:
- raise NotImplementedError("mGGA with lapl not implemented")
- if ao.shape[0] > 4:
- XX, YY, ZZ = 4, 7, 9
- ao2 = ao[XX] + ao[YY] + ao[ZZ]
- c1 = _dot_ao_dm(mol, ao2, cpos1, non0tab, shls_slice, ao_loc)
- #:rho[4] = numpy.einsum('pi,pi->p', c0, c1)
- rho[4] = _contract_rho(c0, c1)
- rho[4] += rho[5]
- rho[4] *= 2
- else:
- rho[4] = 0
rho[tau_idx] *= .5
return rho
-def eval_rho4(mol, ao, c0, mo1, non0tab=None, xctype='LDA',
- with_lapl=True, verbose=None):
- ''' ao: nd x nao x ng
- c0: nd x nocc x ng
- mo1: na x nao x nocc
+def eval_rho4(mol, ao, mo0, mo1, non0tab=None, xctype='LDA', hermi=0,
+ with_lapl=False, verbose=None):
+    '''Evaluate density using first-order orbitals. This density is typically
+    derived from the non-symmetric density matrix (hermi=0) in TDDFT,
+    dm[i] = mo0.dot(mo1[i].T), or from the symmetric density matrix (hermi=1)
+    in CPHF, dm[i] = mo0.dot(mo1[i].T) + mo1[i].dot(mo0.T).
+
+ ao: nd x nao x ng
+ mo0: nao x nocc
+ mo1: na x nao x nocc
'''
+ log = logger.new_logger(mol, verbose)
+ t0 = log.init_timer()
xctype = xctype.upper()
if xctype == 'LDA' or xctype == 'HF':
_, ngrids = ao.shape
@@ -309,30 +274,32 @@ def eval_rho4(mol, ao, c0, mo1, non0tab=None, xctype='LDA',
_, ngrids = ao[0].shape
na = mo1.shape[0]
- cpos1= mo1
if xctype == 'LDA' or xctype == 'HF':
- c_0 = contract('aio,ig->aog', cpos1, ao)#cupy.dot(cpos1.T, ao)
+ c0 = mo0.T.dot(ao)
+ t1 = log.timer_debug2('eval occ_coeff', *t0)
+ c_0 = contract('aio,ig->aog', mo1, ao)
rho = cupy.empty([na,ngrids])
for i in range(na):
rho[i] = _contract_rho(c0, c_0[i])
- rho *= 2.0
elif xctype in ('GGA', 'NLC'):
- log = logger.new_logger(mol, mol.verbose)
- t0 = log.init_timer()
- c_0 = contract('nig,aio->anog', ao, cpos1)
- t0 = log.timer_debug2('ao * cpos', *t0)
+ c0 = contract('nig,io->nog', ao, mo0)
+ t1 = log.timer_debug2('eval occ_coeff', *t0)
+ c_0 = contract('nig,aio->anog', ao, mo1)
+ t1 = log.timer_debug2('ao * cpos', *t1)
rho = cupy.empty([na, 4, ngrids])
for i in range(na):
_contract_rho_gga(c0, c_0[i], rho=rho[i])
- t0 = log.timer_debug2('contract rho', *t0)
else: # meta-GGA
- if with_lapl:
- raise NotImplementedError("mGGA with lapl not implemented")
+ assert not with_lapl
rho = cupy.empty((na,5,ngrids))
- c_0 = contract('nig,aio->anog', ao, cpos1)
+ c0 = contract('nig,io->nog', ao, mo0)
+ c_0 = contract('nig,aio->anog', ao, mo1)
for i in range(na):
_contract_rho_mgga(c0, c_0[i], rho=rho[i])
-
+ if hermi:
+ # *2 accounts for the density of ao * mo1[i].dot(mo0.T) * ao (the dm.T term)
+ rho *= 2.
+ t0 = log.timer_debug2('contract rho', *t0)
return rho
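(A dense consistency sketch for the hermi semantics documented above, LDA case; dm is constructed exactly as in the docstring and mol is consulted only for logging:

    import cupy
    import pyscf
    from gpu4pyscf.dft import numint

    nao, nocc, na, ng = 8, 3, 2, 32
    ao = cupy.random.rand(nao, ng)
    mo0 = cupy.random.rand(nao, nocc)
    mo1 = cupy.random.rand(na, nao, nocc)
    mol = pyscf.M(atom='He 0 0 0', basis='sto-3g')
    rho = numint.eval_rho4(mol, ao, mo0, mo1, xctype='LDA', hermi=0)
    dm0 = mo0.dot(mo1[0].T)                 # the docstring's hermi=0 density matrix
    rho_ref = cupy.einsum('ig,ij,jg->g', ao, dm0, ao)
    assert cupy.allclose(rho[0], rho_ref)
)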
def _vv10nlc(rho, coords, vvrho, vvweight, vvcoords, nlc_pars):
@@ -435,7 +402,7 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
log = logger.new_logger(mol, verbose)
xctype = ni._xc_type(xc_code)
opt = getattr(ni, 'gdftopt', None)
- if opt is None or mol not in [opt.mol, opt._sorted_mol]:
+ if opt is None:
ni.build(mol, grids.coords)
opt = ni.gdftopt
@@ -443,17 +410,14 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
mo_occ = getattr(dms,'mo_occ', None)
mol = None
_sorted_mol = opt._sorted_mol
- coeff = cupy.asarray(opt.coeff)
- nao, nao0 = coeff.shape
+ nao, nao0 = opt.coeff.shape
dms = cupy.asarray(dms)
dm_shape = dms.shape
- #dms = [coeff @ dm @ coeff.T for dm in dms.reshape(-1,nao0,nao0)]
- dms = dms.reshape(-1,nao0,nao0)
- dms = take_last2d(dms, opt.ao_idx)
+ dms = opt.sort_orbitals(dms.reshape(-1,nao0,nao0), axis=[1,2])
nset = len(dms)
if mo_coeff is not None:
- mo_coeff = mo_coeff[opt.ao_idx]
+ mo_coeff = opt.sort_orbitals(mo_coeff, axis=[0])
nelec = cupy.empty(nset)
excsum = cupy.empty(nset)
@@ -464,27 +428,24 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
ao_deriv = 0
else:
ao_deriv = 1
- with_lapl = MGGA_DENSITY_LAPL
ngrids = grids.weights.size
if xctype == 'LDA':
rho_tot = cupy.empty([nset,1,ngrids])
elif xctype == 'GGA':
rho_tot = cupy.empty([nset,4,ngrids])
else:
- if with_lapl:
- rho_tot = cupy.empty([nset,6,ngrids])
- else:
- rho_tot = cupy.empty([nset,5,ngrids])
+ rho_tot = cupy.empty([nset,5,ngrids])
p0 = p1 = 0
t1 = t0 = log.init_timer()
- for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv):
+ for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv,
+ max_memory=max_memory):
p1 = p0 + weight.size
for i in range(nset):
if mo_coeff is None:
- rho_tot[i,:,p0:p1] = eval_rho(_sorted_mol, ao_mask, dms[i][np.ix_(idx,idx)], xctype=xctype, hermi=1, with_lapl=with_lapl)
+ rho_tot[i,:,p0:p1] = eval_rho(_sorted_mol, ao_mask, dms[i][idx[:,None],idx], xctype=xctype, hermi=1)
else:
mo_coeff_mask = mo_coeff[idx,:]
- rho_tot[i,:,p0:p1] = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask, mo_occ, None, xctype, with_lapl)
+ rho_tot[i,:,p0:p1] = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask, mo_occ, None, xctype)
p0 = p1
t1 = log.timer_debug2('eval rho slice', *t1)
t0 = log.timer_debug1('eval rho', *t0)
@@ -501,6 +462,7 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
excsum[i] = cupy.dot(den, exc[:,0])
wv.append(vxc * grids.weights)
+ # *.5 for v+v.conj().T at the end
if xctype == 'GGA':
wv[i][0] *= .5
if xctype == 'MGGA':
@@ -512,7 +474,8 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
t1 = t0
p0 = p1 = 0
- for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv):
+ for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv,
+ max_memory=max_memory):
p1 = p0 + weight.size
for i in range(nset):
if xctype == 'LDA':
@@ -535,8 +498,7 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
p0 = p1
t1 = log.timer_debug2('integration', *t1)
t0 = log.timer_debug1('vxc integration', *t0)
- rev_ao_idx = opt.rev_ao_idx
- vmat = take_last2d(vmat, rev_ao_idx)
+ vmat = opt.unsort_orbitals(vmat, axis=[1,2])
if xctype != 'LDA':
transpose_sum(vmat)
@@ -553,7 +515,7 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
return nelec, excsum, vmat
def eval_rho_group(mol, ao_group, mo_coeff_group, mo_occ, non0tab=None, xctype='LDA',
- with_lapl=True, verbose=None, out=None):
+ with_lapl=False, verbose=None, out=None):
groups = len(ao_group)
xctype = xctype.upper()
if xctype == 'LDA' or xctype == 'HF':
@@ -600,6 +562,7 @@ def eval_rho_group(mol, ao_group, mo_coeff_group, mo_occ, non0tab=None, xctype='
rho[1:] *= 2
rho_group.append(rho)
else: # meta-GGA
+ assert not with_lapl
c0_group = []
cpos_group4 = []
ao_group4 = []
@@ -646,7 +609,7 @@ def nr_rks_group(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
log = logger.new_logger(mol, verbose)
xctype = ni._xc_type(xc_code)
opt = getattr(ni, 'gdftopt', None)
- if opt is None or mol not in [opt.mol, opt._sorted_mol]:
+ if opt is None:
ni.build(mol, grids.coords)
opt = ni.gdftopt
@@ -655,17 +618,14 @@ def nr_rks_group(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
mol = None
_sorted_mol = opt._sorted_mol
- coeff = cupy.asarray(opt.coeff)
- nao, nao0 = coeff.shape
+ nao, nao0 = opt.coeff.shape
dms = cupy.asarray(dms)
dm_shape = dms.shape
- #dms = [coeff @ dm @ coeff.T for dm in dms.reshape(-1,nao0,nao0)]
- dms = dms.reshape(-1,nao0,nao0)
- dms = take_last2d(dms, opt.ao_idx)
+ dms = opt.sort_orbitals(dms.reshape(-1,nao0,nao0), axis=[1,2])
nset = len(dms)
if mo_coeff is not None:
- mo_coeff = mo_coeff[opt.ao_idx]
+ mo_coeff = opt.sort_orbitals(mo_coeff, axis=[0])
nelec = cupy.zeros(nset)
excsum = cupy.zeros(nset)
@@ -676,27 +636,24 @@ def nr_rks_group(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
ao_deriv = 0
else:
ao_deriv = 1
- with_lapl = MGGA_DENSITY_LAPL
ngrids = grids.weights.size
if xctype == 'LDA':
rho_tot = cupy.empty([nset,1,ngrids])
elif xctype == 'GGA':
rho_tot = cupy.empty([nset,4,ngrids])
else:
- if with_lapl:
- rho_tot = cupy.empty([nset,6,ngrids])
- else:
- rho_tot = cupy.empty([nset,5,ngrids])
+ rho_tot = cupy.empty([nset,5,ngrids])
p0 = p1 = 0
t1 = t0 = log.init_timer()
- for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv):
+ for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv,
+ max_memory=max_memory):
p1 = p0 + weight.size
for i in range(nset):
if mo_coeff is None:
- rho_tot[i,:,p0:p1] = eval_rho(_sorted_mol, ao_mask, dms[i][np.ix_(idx,idx)], xctype=xctype, hermi=1, with_lapl=with_lapl)
+ rho_tot[i,:,p0:p1] = eval_rho(_sorted_mol, ao_mask, dms[i][idx[:,None],idx], xctype=xctype, hermi=1)
else:
mo_coeff_mask = mo_coeff[idx,:]
- rho_tot[i,:,p0:p1] = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask, mo_occ, None, xctype, with_lapl)
+ rho_tot[i,:,p0:p1] = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask, mo_occ, None, xctype)
p0 = p1
t1 = log.timer_debug2('eval rho slice', *t1)
t0 = log.timer_debug1('eval rho', *t0)
@@ -772,8 +729,7 @@ def nr_rks_group(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
raise NotImplementedError(f'numint.nr_rks for functional {xc_code}')
t1 = log.timer_debug2('integration', *t1)
t0 = log.timer_debug1('vxc integration', *t0)
- rev_ao_idx = opt.rev_ao_idx
- vmat = take_last2d(vmat, rev_ao_idx)
+ vmat = opt.unsort_orbitals(vmat, axis=[1,2])
if xctype != 'LDA':
transpose_sum(vmat)
@@ -794,7 +750,7 @@ def nr_uks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
log = logger.new_logger(mol, verbose)
xctype = ni._xc_type(xc_code)
opt = getattr(ni, 'gdftopt', None)
- if opt is None or mol not in [opt.mol, opt._sorted_mol]:
+ if opt is None:
ni.build(mol, grids.coords)
opt = ni.gdftopt
@@ -802,18 +758,17 @@ def nr_uks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
mo_occ = getattr(dms,'mo_occ', None)
mol = None
_sorted_mol = opt._sorted_mol
- coeff = cupy.asarray(opt.coeff)
- nao, nao0 = coeff.shape
+ nao, nao0 = opt.coeff.shape
dma, dmb = dms
dm_shape = dma.shape
dma = cupy.asarray(dma).reshape(-1,nao0,nao0)
dmb = cupy.asarray(dmb).reshape(-1,nao0,nao0)
- dma = [coeff @ dm @ coeff.T for dm in dma]
- dmb = [coeff @ dm @ coeff.T for dm in dmb]
+ dma = opt.sort_orbitals(dma, axis=[1,2])
+ dmb = opt.sort_orbitals(dmb, axis=[1,2])
nset = len(dma)
if mo_coeff is not None:
- mo_coeff = coeff @ mo_coeff
+ mo_coeff = opt.sort_orbitals(mo_coeff, axis=[1])
nelec = np.zeros((2,nset))
excsum = np.zeros(nset)
@@ -825,18 +780,18 @@ def nr_uks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
ao_deriv = 0
else:
ao_deriv = 1
- with_lapl = MGGA_DENSITY_LAPL
- for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv):
+ for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv,
+ max_memory=max_memory):
for i in range(nset):
t0 = log.init_timer()
if mo_coeff is None:
- rho_a = eval_rho(_sorted_mol, ao_mask, dma[i][np.ix_(idx,idx)], xctype=xctype, hermi=1, with_lapl=with_lapl)
- rho_b = eval_rho(_sorted_mol, ao_mask, dmb[i][np.ix_(idx,idx)], xctype=xctype, hermi=1, with_lapl=with_lapl)
+ rho_a = eval_rho(_sorted_mol, ao_mask, dma[i][idx[:,None],idx], xctype=xctype, hermi=1)
+ rho_b = eval_rho(_sorted_mol, ao_mask, dmb[i][idx[:,None],idx], xctype=xctype, hermi=1)
else:
mo_coeff_mask = mo_coeff[:, idx,:]
- rho_a = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask[0], mo_occ[0], None, xctype, with_lapl)
- rho_b = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask[1], mo_occ[1], None, xctype, with_lapl)
+ rho_a = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask[0], mo_occ[0], None, xctype)
+ rho_b = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask[1], mo_occ[1], None, xctype)
rho = cupy.stack([rho_a, rho_b], axis=0)
exc, vxc = ni.eval_xc_eff(xc_code, rho, deriv=1, xctype=xctype)[:2]
@@ -882,8 +837,8 @@ def nr_uks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
excsum[i] += cupy.dot(den_b, exc[:,0])
t1 = log.timer_debug1('integration', *t1)
- vmata = [coeff.T @ v @ coeff for v in vmata]
- vmatb = [coeff.T @ v @ coeff for v in vmatb]
+ vmata = opt.unsort_orbitals(vmata, axis=[1,2])
+ vmatb = opt.unsort_orbitals(vmatb, axis=[1,2])
if xctype != 'LDA':
for i in range(nset):
vmata[i] = vmata[i] + vmata[i].T
@@ -918,7 +873,6 @@ def get_rho(ni, mol, dm, grids, max_memory=2000, verbose=None):
dm = coeff @ cupy.asarray(dm) @ coeff.T
if mo_coeff is not None:
mo_coeff = coeff @ mo_coeff
- with_lapl = MGGA_DENSITY_LAPL
mem_avail = get_avail_mem()
blksize = mem_avail*.2/8/nao//ALIGNED * ALIGNED
@@ -932,11 +886,11 @@ def get_rho(ni, mol, dm, grids, max_memory=2000, verbose=None):
t1 = t0 = log.init_timer()
for p0, p1 in lib.prange(0,ngrids,blksize):
coords = grids.coords[p0:p1]
- ao = eval_ao(ni, _sorted_mol, coords, 0)
+ ao = eval_ao(_sorted_mol, coords, 0, gdftopt=opt, transpose=False)
if mo_coeff is None:
- rho[p0:p1] = eval_rho(_sorted_mol, ao, dm, xctype='LDA', hermi=1, with_lapl=with_lapl)
+ rho[p0:p1] = eval_rho(_sorted_mol, ao, dm, xctype='LDA', hermi=1)
else:
- rho[p0:p1] = eval_rho2(_sorted_mol, ao, mo_coeff, mo_occ, None, 'LDA', with_lapl)
+ rho[p0:p1] = eval_rho2(_sorted_mol, ao, mo_coeff, mo_occ, None, 'LDA')
t1 = log.timer_debug2('eval rho slice', *t1)
t0 = log.timer_debug1('eval rho', *t0)
@@ -957,16 +911,15 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi=
opt = ni.gdftopt
_sorted_mol = opt.mol
- coeff = cupy.asarray(opt.coeff)
- nao, nao0 = coeff.shape
+ nao, nao0 = opt.coeff.shape
dms = cupy.asarray(dms)
dm_shape = dms.shape
# AO basis -> gdftopt AO basis
with_mocc = hasattr(dms, 'mo1')
if with_mocc:
- mo1 = dms.mo1[:,opt.ao_idx] * 2.0**0.5
- occ_coeff = dms.occ_coeff[opt.ao_idx] * 2.0**0.5
- dms = take_last2d(dms, opt.ao_idx)
+ mo1 = opt.sort_orbitals(dms.mo1, axis=[1])
+ occ_coeff = opt.sort_orbitals(dms.occ_coeff, axis=[0]) * 2.0
+ dms = opt.sort_orbitals(dms.reshape(-1,nao0,nao0), axis=[1,2])
nset = len(dms)
vmat = cupy.zeros((nset, nao, nao))
@@ -974,29 +927,23 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi=
ao_deriv = 0
else:
ao_deriv = 1
- with_lapl = MGGA_DENSITY_LAPL
p0 = 0
p1 = 0
t1 = t0 = log.init_timer()
- for ao, mask, weights, coords in ni.block_loop(_sorted_mol, grids, nao, ao_deriv):
+ for ao, mask, weights, coords in ni.block_loop(_sorted_mol, grids, nao, ao_deriv,
+ max_memory=max_memory):
p0, p1 = p1, p1+len(weights)
# precompute molecular orbitals
if with_mocc:
occ_coeff_mask = occ_coeff[mask]
- if xctype == 'LDA':
- c0 = _dot_ao_dm(_sorted_mol, ao, occ_coeff_mask, None, None, None)
- elif xctype == "GGA":
- c0 = contract('nig,io->nog', ao, occ_coeff_mask)
- else: # mgga
- c0 = contract('nig,io->nog', ao, occ_coeff_mask)
- t1 = log.timer_debug2(f'eval occ_coeff, with mocc: {with_mocc}', *t1)
- if with_mocc:
- rho1 = eval_rho4(_sorted_mol, ao, c0, mo1[:,mask], xctype=xctype, with_lapl=False)
+ rho1 = eval_rho4(_sorted_mol, ao, occ_coeff_mask, mo1[:,mask],
+ xctype=xctype, hermi=hermi)
else:
# slow version
rho1 = []
for i in range(nset):
- rho_tmp = eval_rho(_sorted_mol, ao, dms[i][np.ix_(mask,mask)], xctype=xctype, hermi=hermi, with_lapl=with_lapl)
+ rho_tmp = eval_rho(_sorted_mol, ao, dms[i,mask[:,None],mask],
+ xctype=xctype, hermi=hermi)
rho1.append(rho_tmp)
rho1 = cupy.stack(rho1, axis=0)
t1 = log.timer_debug2('eval rho', *t1)
@@ -1012,12 +959,10 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi=
for i in range(nset):
if xctype == 'LDA':
vmat_tmp = ao.dot(_scale_ao(ao, wv[i]).T)
- add_sparse(vmat[i], vmat_tmp, mask)
elif xctype == 'GGA':
wv[i,0] *= .5
aow = _scale_ao(ao, wv[i])
vmat_tmp = aow.dot(ao[0].T)
- add_sparse(vmat[i], vmat_tmp, mask)
elif xctype == 'NLC':
raise NotImplementedError('NLC')
else:
@@ -1025,13 +970,13 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi=
wv[i,4] *= .5
vmat_tmp = ao[0].dot(_scale_ao(ao[:4], wv[i,:4]).T)
vmat_tmp+= _tau_dot(ao, ao, wv[i,4])
- add_sparse(vmat[i], vmat_tmp, mask)
+ add_sparse(vmat[i], vmat_tmp, mask)
t1 = log.timer_debug2('integration', *t1)
- ao = c0 = rho1 = None
+ ao = rho1 = None
t0 = log.timer_debug1('vxc', *t0)
- vmat = take_last2d(vmat, opt.rev_ao_idx)
+ vmat = opt.unsort_orbitals(vmat, axis=[1,2])
if xctype != 'LDA':
transpose_sum(vmat)
@@ -1054,7 +999,8 @@ def nr_rks_fxc_st(ni, mol, grids, xc_code, dm0=None, dms_alpha=None,
fxc = fxc[0,:,0] + fxc[0,:,1]
else:
fxc = fxc[0,:,0] - fxc[0,:,1]
- return nr_rks_fxc(ni, mol, grids, xc_code, dm0, dms_alpha, hermi=0, fxc=fxc)
+ return nr_rks_fxc(ni, mol, grids, xc_code, dm0, dms_alpha, hermi=0, fxc=fxc,
+ max_memory=max_memory, verbose=verbose)
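(nr_rks_fxc_st now forwards max_memory and verbose; the singlet/triplet combination itself is unchanged. For a closed-shell reference the spin-resolved kernel has f_aa = f_bb, so the spin-adapted kernels are the sum and difference of the diagonal and off-diagonal spin blocks. A toy illustration of the indexing only:

    import numpy as np

    nvar, ng = 1, 16                            # toy LDA-like kernel
    fxc = np.random.rand(2, nvar, 2, nvar, ng)  # (spin, var, spin, var, grid)
    f_singlet = fxc[0, :, 0] + fxc[0, :, 1]     # spin-conserved response
    f_triplet = fxc[0, :, 0] - fxc[0, :, 1]     # spin-flip response
)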
def nr_uks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi=0,
@@ -1069,8 +1015,7 @@ def nr_uks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi=
opt = ni.gdftopt
mol = None
_sorted_mol = opt._sorted_mol
- coeff = cupy.asarray(opt.coeff)
- nao, nao0 = coeff.shape
+ nao, nao0 = opt.coeff.shape
dma, dmb = dms
dm_shape = dma.shape
# AO basis -> gdftopt AO basis
@@ -1078,17 +1023,15 @@ def nr_uks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi=
if with_mocc:
mo1a, mo1b = dms.mo1
occ_coeffa, occ_coeffb = dms.occ_coeff
- mo1a = contract('nio,pi->npo', mo1a, coeff)
- mo1b = contract('nio,pi->npo', mo1b, coeff)
- occ_coeff_a = contract('io,pi->po', occ_coeffa, coeff)
- occ_coeff_b = contract('io,pi->po', occ_coeffb, coeff)
+ mo1a = opt.sort_orbitals(mo1a, axis=[1])
+ mo1b = opt.sort_orbitals(mo1b, axis=[1])
+ occ_coeff_a = opt.sort_orbitals(occ_coeffa, axis=[0])
+ occ_coeff_b = opt.sort_orbitals(occ_coeffb, axis=[0])
dma = cupy.asarray(dma).reshape(-1,nao0,nao0)
dmb = cupy.asarray(dmb).reshape(-1,nao0,nao0)
- dma = contract('nij,qj->niq', dma, coeff)
- dma = contract('pi,niq->npq', coeff, dma)
- dmb = contract('nij,qj->niq', dmb, coeff)
- dmb = contract('pi,niq->npq', coeff, dmb)
+ dma = opt.sort_orbitals(dma, axis=[1,2])
+ dmb = opt.sort_orbitals(dmb, axis=[1,2])
nset = len(dma)
vmata = cupy.zeros((nset, nao, nao))
@@ -1096,84 +1039,65 @@ def nr_uks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi=
if xctype == 'LDA':
ao_deriv = 0
+ nvar = 1
+ elif xctype == 'GGA':
+ ao_deriv = 1
+ nvar = 4
else:
ao_deriv = 1
- with_lapl = MGGA_DENSITY_LAPL
- p0 = 0
- p1 = 0
- for ao, mask, weights, coords in ni.block_loop(_sorted_mol, grids, nao, ao_deriv):
+ nvar = 5
+ p0 = p1 = 0
+ for ao, mask, weights, coords in ni.block_loop(
+ _sorted_mol, grids, nao, ao_deriv, max_memory=max_memory):
t0 = log.init_timer()
p0, p1 = p1, p1+len(weights)
+ # precompute fxc_w
+ fxc_w = fxc[:,:,:,:,p0:p1] * weights
+
# precompute molecular orbitals
if with_mocc:
occ_coeff_a_mask = occ_coeff_a[mask]
occ_coeff_b_mask = occ_coeff_b[mask]
- if xctype == 'LDA':
- c0_a = _dot_ao_dm(_sorted_mol, ao, occ_coeff_a_mask, None, None, None)
- c0_b = _dot_ao_dm(_sorted_mol, ao, occ_coeff_b_mask, None, None, None)
- elif xctype == "GGA":
- c0_a = contract('nig,io->nog', ao, occ_coeff_a_mask)
- c0_b = contract('nig,io->nog', ao, occ_coeff_b_mask)
- else: # mgga
- c0_a = contract('nig,io->nog', ao, occ_coeff_a_mask)
- c0_b = contract('nig,io->nog', ao, occ_coeff_b_mask)
-
- if with_mocc:
- rho1a = eval_rho4(_sorted_mol, ao, c0_a, mo1a[:,mask], xctype=xctype, with_lapl=with_lapl)
- rho1b = eval_rho4(_sorted_mol, ao, c0_b, mo1b[:,mask], xctype=xctype, with_lapl=with_lapl)
- else:
- # slow version
- rho1a = []
- rho1b = []
+ rho1a = eval_rho4(_sorted_mol, ao, occ_coeff_a_mask, mo1a[:,mask],
+ xctype=xctype, hermi=hermi)
+ rho1b = eval_rho4(_sorted_mol, ao, occ_coeff_b_mask, mo1b[:,mask],
+ xctype=xctype, hermi=hermi)
+ rho1 = cupy.stack([rho1a, rho1b]).reshape(2, nset, nvar, p1-p0)
+ else: # slow version
+ rho1 = cupy.empty((2, nset, nvar, p1-p0))
for i in range(nset):
- rho_tmp = eval_rho(_sorted_mol, ao, dma[i][np.ix_(mask,mask)], xctype=xctype, hermi=hermi, with_lapl=with_lapl)
- rho1a.append(rho_tmp)
- rho_tmp = eval_rho(_sorted_mol, ao, dmb[i][np.ix_(mask,mask)], xctype=xctype, hermi=hermi, with_lapl=with_lapl)
- rho1b.append(rho_tmp)
- rho1a = cupy.stack(rho1a, axis=0)
- rho1b = cupy.stack(rho1b, axis=0)
- rho1 = cupy.stack([rho1a, rho1b], axis=0)
+ rho1[0,i] = eval_rho(_sorted_mol, ao, dma[i,mask[:,None],mask],
+ xctype=xctype, hermi=hermi)
+ rho1[1,i] = eval_rho(_sorted_mol, ao, dmb[i,mask[:,None],mask],
+ xctype=xctype, hermi=hermi)
t0 = log.timer_debug1('rho', *t0)
- # precompute fxc_w
- if xctype == 'LDA':
- fxc_w = fxc[:,0,:,0,p0:p1] * weights
- else:
- fxc_w = fxc[:,:,:,:,p0:p1] * weights
-
for i in range(nset):
+ wv = contract('axg,axbyg->byg', rho1[:,i], fxc_w)
if xctype == 'LDA':
- wv = contract('ag,abg->bg', rho1[:,i], fxc_w)
- va = ao.dot(_scale_ao(ao, wv[0]).T)
- vb = ao.dot(_scale_ao(ao, wv[1]).T)
- add_sparse(vmata[i], va, mask)
- add_sparse(vmatb[i], vb, mask)
+ va = ao.dot(_scale_ao(ao, wv[0,0]).T)
+ vb = ao.dot(_scale_ao(ao, wv[1,0]).T)
elif xctype == 'GGA':
- wv = contract('axg,axbyg->byg', rho1[:,i], fxc_w)
- wv[:,0] *= .5
+ wv[:,0] *= .5 # for transpose_sum at the end
va = ao[0].dot(_scale_ao(ao, wv[0]).T)
vb = ao[0].dot(_scale_ao(ao, wv[1]).T)
- add_sparse(vmata[i], va, mask)
- add_sparse(vmatb[i], vb, mask)
elif xctype == 'NLC':
raise NotImplementedError('NLC')
else:
- wv = contract('axg,axbyg->byg', rho1[:,i], fxc_w)
- wv[:,[0, 4]] *= .5
+ wv[:,[0,4]] *= .5 # for transpose_sum at the end
va = ao[0].dot(_scale_ao(ao[:4], wv[0,:4]).T)
vb = ao[0].dot(_scale_ao(ao[:4], wv[1,:4]).T)
va += _tau_dot(ao, ao, wv[0,4])
vb += _tau_dot(ao, ao, wv[1,4])
- add_sparse(vmata[i], va, mask)
- add_sparse(vmatb[i], vb, mask)
- vmata = [coeff.T @ v @ coeff for v in vmata]
- vmatb = [coeff.T @ v @ coeff for v in vmatb]
+ add_sparse(vmata[i], va, mask)
+ add_sparse(vmatb[i], vb, mask)
+ vmata = opt.unsort_orbitals(vmata, axis=[1,2])
+ vmatb = opt.unsort_orbitals(vmatb, axis=[1,2])
if xctype != 'LDA':
# For real orbitals, K_{ia,bj} = K_{ia,jb}. It simplifies real fxc_jb
# [(\nabla mu) nu + mu (\nabla nu)] * fxc_jb = ((\nabla mu) nu f_jb) + h.c.
- for i in range(nset):
- vmata[i] = vmata[i] + vmata[i].T
- vmatb[i] = vmatb[i] + vmatb[i].T
+ transpose_sum(vmata)
+ transpose_sum(vmatb)
if FREE_CUPY_CACHE:
dma = dmb = None
@@ -1228,23 +1152,22 @@ def nr_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
nao, nao0 = opt.coeff.shape
mol = None
_sorted_mol = opt._sorted_mol
- coeff = cupy.asarray(opt.coeff)
- dms = [coeff @ dm @ coeff.T for dm in dms.reshape(-1,nao0,nao0)]
+
+ dms = dms.reshape(-1,nao0,nao0)
assert len(dms) == 1
+ dms = opt.sort_orbitals(dms, axis=[1,2])
if mo_coeff is not None:
- mo_coeff = coeff @ mo_coeff
- with_lapl = MGGA_DENSITY_LAPL
+ mo_coeff = opt.sort_orbitals(mo_coeff, axis=[0])
ao_deriv = 1
vvrho = []
for ao, idx, weight, coords \
in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, max_memory=max_memory):
- #rho = eval_rho(opt.mol, ao, dms[0][np.ix_(mask,mask)], xctype='GGA', hermi=1)
if mo_coeff is None:
- rho = eval_rho(_sorted_mol, ao, dms[0][np.ix_(idx,idx)], xctype='GGA', hermi=1, with_lapl=with_lapl)
+ rho = eval_rho(_sorted_mol, ao, dms[0][idx[:,None],idx], xctype='GGA', hermi=1)
else:
mo_coeff_mask = mo_coeff[idx,:]
- rho = eval_rho2(_sorted_mol, ao, mo_coeff_mask, mo_occ, None, 'GGA', with_lapl)
+ rho = eval_rho2(_sorted_mol, ao, mo_coeff_mask, mo_occ, None, 'GGA')
vvrho.append(rho)
rho = cupy.hstack(vvrho)
@@ -1277,7 +1200,7 @@ def nr_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
t1 = log.timer_debug1('integration', *t1)
transpose_sum(vmat)
- vmat = take_last2d(vmat, opt.rev_ao_idx)
+ vmat = opt.unsort_orbitals(vmat, axis=[0,1])
log.timer_debug1('eval vv10', *t0)
return nelec, excsum, vmat
@@ -1293,7 +1216,6 @@ def cache_xc_kernel(ni, mol, grids, xc_code, mo_coeff, mo_occ, spin=0,
raise NotImplementedError('NLC')
else:
ao_deriv = 0
- with_lapl = MGGA_DENSITY_LAPL
opt = getattr(ni, 'gdftopt', None)
if opt is None or mol not in [opt.mol, opt._sorted_mol]:
ni.build(mol, grids.coords)
@@ -1301,28 +1223,34 @@ def cache_xc_kernel(ni, mol, grids, xc_code, mo_coeff, mo_occ, spin=0,
mol = None
_sorted_mol = opt._sorted_mol
- coeff = cupy.asarray(opt.coeff)
- nao = coeff.shape[0]
- if spin == 0:
- mo_coeff = coeff @ mo_coeff
+ mo_coeff = cupy.asarray(mo_coeff)
+ nao = opt.coeff.shape[0]
+ if mo_coeff.ndim == 2: # RHF
+ mo_coeff = opt.sort_orbitals(mo_coeff, axis=[0])
rho = []
t1 = t0 = log.init_timer()
- for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv):
+ for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv,
+ max_memory=max_memory):
mo_coeff_mask = mo_coeff[idx,:]
- rho_slice = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask, mo_occ, None, xctype, with_lapl)
+ rho_slice = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask, mo_occ, None, xctype)
rho.append(rho_slice)
t1 = log.timer_debug2('eval rho slice', *t1)
rho = cupy.hstack(rho)
+ if spin == 1: # RKS with nr_rks_fxc_st
+ rho *= .5
+ rho = cupy.repeat(rho[None], 2, axis=0)
t0 = log.timer_debug1('eval rho in fxc', *t0)
else:
- mo_coeff = contract('ip,npj->nij', coeff, cupy.asarray(mo_coeff))
+ assert spin == 1
+ mo_coeff = opt.sort_orbitals(mo_coeff, axis=[1])
rhoa = []
rhob = []
t1 = t0 = log.init_timer()
- for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv):
+ for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv,
+ max_memory=max_memory):
mo_coeff_mask = mo_coeff[:,idx,:]
- rhoa_slice = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask[0], mo_occ[0], None, xctype, with_lapl)
- rhob_slice = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask[1], mo_occ[1], None, xctype, with_lapl)
+ rhoa_slice = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask[0], mo_occ[0], None, xctype)
+ rhob_slice = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask[1], mo_occ[1], None, xctype)
rhoa.append(rhoa_slice)
rhob.append(rhob_slice)
t1 = log.timer_debug2('eval rho in fxc', *t1)
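
For spin=1 with a closed-shell reference (the nr_rks_fxc_st path), the cached
closed-shell density is halved and duplicated into an (alpha, beta) pair, as the
hunk above shows. A minimal numpy sketch of that repackaging (shapes are
illustrative):

    import numpy as np

    rho = np.random.rand(4, 100)                     # closed-shell GGA rho on 100 points
    rho_ab = np.repeat((rho * .5)[None], 2, axis=0)  # (2, 4, ngrids), rho_a == rho_b
    assert np.allclose(rho_ab.sum(axis=0), rho)
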
@@ -1348,7 +1276,8 @@ def eval_xc_eff(ni, xc_code, rho, deriv=1, omega=None, xctype=None, verbose=None
if omega is None: omega = ni.omega
if xctype is None: xctype = ni._xc_type(xc_code)
- if ni.xcfuns is None: ni.xcfuns = _init_xcfuns(xc_code, spin_polarized)
+
+ xcfuns = ni._init_xcfuns(xc_code, spin_polarized)
inp = {}
if not spin_polarized:
@@ -1391,13 +1320,13 @@ def eval_xc_eff(ni, xc_code, rho, deriv=1, omega=None, xctype=None, verbose=None
"v3sigma2lapl", "v3sigma2tau",
"v3sigmalapl2", "v3sigmalapltau", "v3sigmatau2",
"v3lapl3", "v3lapl2tau", "v3lapltau2", "v3tau3"]
- if len(ni.xcfuns) == 1:
- xcfun, _ = ni.xcfuns[0]
+ if len(xcfuns) == 1:
+ xcfun, _ = xcfuns[0]
xc_res = xcfun.compute(inp, do_exc=True, do_vxc=do_vxc, do_fxc=do_fxc, do_kxc=do_kxc)
ret_full = xc_res
else:
ret_full = {}
- for xcfun, w in ni.xcfuns:
+ for xcfun, w in xcfuns:
xc_res = xcfun.compute(inp, do_exc=True, do_vxc=do_vxc, do_fxc=do_fxc, do_kxc=do_kxc)
for label in xc_res:
if label in ret_full:
@@ -1539,11 +1468,14 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000,
pad, idx, non0shl_idx, ctr_offsets_slice, ao_loc_slice = ni.non0ao_idx[block_id, blksize, ngrids]
ao_mask = eval_ao(
- ni, _sorted_mol, coords, deriv,
+ _sorted_mol, coords, deriv,
nao_slice=len(idx),
shls_slice=non0shl_idx,
ao_loc_slice=ao_loc_slice,
- ctr_offsets_slice=ctr_offsets_slice)
+ ctr_offsets_slice=ctr_offsets_slice,
+ gdftopt=opt,
+ transpose=False
+ )
t1 = log.timer_debug2('evaluate ao slice', *t1)
if pad > 0:
@@ -1579,7 +1511,7 @@ def _grouped_block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000,
raise RuntimeError('Not enough GPU memory')
opt = getattr(ni, 'gdftopt', None)
- if opt is None or mol not in [opt.mol, opt._sorted_mol]:
+ if opt is None:
ni.build(mol, grids.coords)
opt = ni.gdftopt
@@ -1590,7 +1522,6 @@ def _grouped_block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000,
total_used_bytes = 0
mem_limit = get_avail_mem()
- mol = None
_sorted_mol = opt._sorted_mol
with opt.gdft_envs_cache():
block_id = 0
@@ -1605,11 +1536,14 @@ def _grouped_block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000,
pad, idx, non0shl_idx, ctr_offsets_slice, ao_loc_slice = ni.non0ao_idx[block_id, blksize, ngrids]
ao_mask = eval_ao(
- ni, _sorted_mol, coords, deriv,
+ _sorted_mol, coords, deriv,
nao_slice=len(idx),
shls_slice=non0shl_idx,
ao_loc_slice=ao_loc_slice,
- ctr_offsets_slice=ctr_offsets_slice)
+ ctr_offsets_slice=ctr_offsets_slice,
+ gdftopt=opt,
+ transpose=False
+ )
if pad > 0:
if deriv == 0:
@@ -1660,7 +1594,7 @@ def _xc_type(self, xc_code):
class NumInt(lib.StreamObject, LibXCMixin):
from gpu4pyscf.lib.utils import to_gpu, device
- _keys = {'screen_idx', 'xcfuns', 'gdftopt'}
+ _keys = {'screen_index', 'xcfuns', 'gdftopt', 'pair_mask', 'grid_blksize', 'non0ao_idx'}
gdftopt = None
pair_mask = None
screen_index = None
@@ -1700,14 +1634,27 @@ def build(self, mol, coords):
# cannot patch this function
eval_xc_eff = eval_xc_eff
block_loop = _block_loop
- eval_rho2 = eval_rho2
- eval_ao = eval_ao
- #eval_rho2 = staticmethod(eval_rho2)
+ eval_ao = staticmethod(eval_ao)
+ eval_rho = staticmethod(eval_rho)
+ eval_rho2 = staticmethod(eval_rho2)
def to_cpu(self):
ni = numint.NumInt()
return ni
+ @lru_cache(10)
+ def _init_xcfuns(self, xc_code, spin):
+ return _init_xcfuns(xc_code, spin)
+
+ def reset(self):
+ self.gdftopt = None
+ self.pair_mask = None
+ self.screen_index = None
+ self.xcfuns = None
+ self.grid_blksize = None
+ self.non0ao_idx = {}
+ return self
+
def _make_pairs2shls_idx(pair_mask, l_bas_loc, hermi=0):
if hermi:
pair_mask = np.tril(pair_mask)
@@ -1985,9 +1932,7 @@ def build(self, mol=None):
coeff = np.vstack([coeff, np.zeros((paddings, coeff.shape[1]))])
pmol._decontracted = True
self._sorted_mol = pmol
- inv_idx = np.argsort(ao_idx, kind='stable').astype(np.int32)
- self.ao_idx = cupy.asarray(ao_idx, dtype=np.int32)
- self.rev_ao_idx = cupy.asarray(inv_idx, dtype=np.int32)
+ self._ao_idx = cupy.asarray(ao_idx, dtype=np.int32)
self.coeff = coeff[ao_idx]
self.l_ctr_offsets = np.append(0, np.cumsum(l_ctr_counts)).astype(np.int32)
self.l_bas_offsets = np.append(0, np.cumsum(l_counts)).astype(np.int32)
@@ -2014,5 +1959,40 @@ def gdft_envs_cache(self):
finally:
libgdft.GDFTdel_envs(ctypes.byref(self.envs_cache))
+ def sort_orbitals(self, mat, axis=[]):
+ ''' Permute the given axes of a matrix into the sorted AO order
+ '''
+ idx = self._ao_idx
+ shape_ones = (1,) * mat.ndim
+ fancy_index = []
+ for dim, n in enumerate(mat.shape):
+ if dim in axis:
+ assert n == len(idx)
+ indices = idx
+ else:
+ indices = np.arange(n)
+ idx_shape = shape_ones[:dim] + (-1,) + shape_ones[dim+1:]
+ fancy_index.append(indices.reshape(idx_shape))
+ return mat[tuple(fancy_index)]
+
+ def unsort_orbitals(self, sorted_mat, axis=[], out=None):
+ ''' Scatter the given axes of a sorted matrix back into the original AO order
+ '''
+ idx = self._ao_idx
+ shape_ones = (1,) * sorted_mat.ndim
+ fancy_index = []
+ for dim, n in enumerate(sorted_mat.shape):
+ if dim in axis:
+ assert n == len(idx)
+ indices = idx
+ else:
+ indices = np.arange(n)
+ idx_shape = shape_ones[:dim] + (-1,) + shape_ones[dim+1:]
+ fancy_index.append(indices.reshape(idx_shape))
+ if out is None:
+ out = cupy.empty_like(sorted_mat)
+ out[tuple(fancy_index)] = sorted_mat
+ return out
+
class _GDFTEnvsCache(ctypes.Structure):
pass
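
The new sort_orbitals/unsort_orbitals helpers replace the take_last2d/rev_ao_idx
pattern with a gather/scatter through a single permutation. A numpy sketch of the
round trip (the permutation and matrix are made up; the real code runs on cupy):

    import numpy as np

    def sort_orbitals(mat, idx, axis=()):
        # gather: out[..., i, ...] = mat[..., idx[i], ...] on the selected axes
        fancy = []
        for dim, n in enumerate(mat.shape):
            indices = idx if dim in axis else np.arange(n)
            shape = [1] * mat.ndim
            shape[dim] = -1
            fancy.append(indices.reshape(shape))
        return mat[tuple(fancy)]

    def unsort_orbitals(sorted_mat, idx, axis=()):
        # scatter: out[..., idx[i], ...] = sorted_mat[..., i, ...]
        out = np.empty_like(sorted_mat)
        fancy = []
        for dim, n in enumerate(sorted_mat.shape):
            indices = idx if dim in axis else np.arange(n)
            shape = [1] * sorted_mat.ndim
            shape[dim] = -1
            fancy.append(indices.reshape(shape))
        out[tuple(fancy)] = sorted_mat
        return out

    idx = np.array([2, 0, 3, 1])        # hypothetical AO permutation
    dm = np.arange(16.).reshape(4, 4)
    dm_sorted = sort_orbitals(dm, idx, axis=(0, 1))
    assert np.allclose(unsort_orbitals(dm_sorted, idx, axis=(0, 1)), dm)
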
diff --git a/gpu4pyscf/dft/rks.py b/gpu4pyscf/dft/rks.py
index fb3820b3..333034f8 100644
--- a/gpu4pyscf/dft/rks.py
+++ b/gpu4pyscf/dft/rks.py
@@ -25,15 +25,13 @@
from gpu4pyscf.lib import logger
from gpu4pyscf.dft import numint, gen_grid
from gpu4pyscf.scf import hf
-from gpu4pyscf.lib.cupy_helper import load_library, tag_array
+from gpu4pyscf.lib.cupy_helper import tag_array
from pyscf import __config__
__all__ = [
- 'get_veff', 'RKS'
+ 'get_veff', 'RKS', 'KohnShamDFT',
]
-libcupy_helper = load_library('libcupy_helper')
-
def prune_small_rho_grids_(ks, mol, dm, grids):
rho = ks._numint.get_rho(mol, dm, grids, ks.max_memory, verbose=ks.verbose)
@@ -134,16 +132,14 @@ def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
if hermi == 2: # because rho = 0
n, exc, vxc = 0, 0, 0
else:
- max_memory = ks.max_memory - lib.current_memory()[0]
- n, exc, vxc = ni.nr_rks(mol, ks.grids, ks.xc, dm, max_memory=max_memory)
+ n, exc, vxc = ni.nr_rks(mol, ks.grids, ks.xc, dm)
if ks.do_nlc():
if ni.libxc.is_nlc(ks.xc):
xc = ks.xc
else:
assert ni.libxc.is_nlc(ks.nlc)
xc = ks.nlc
- n, enlc, vnlc = ni.nr_nlc_vxc(mol, ks.nlcgrids, xc, dm,
- max_memory=max_memory)
+ n, enlc, vnlc = ni.nr_nlc_vxc(mol, ks.nlcgrids, xc, dm)
exc += enlc
vxc += vnlc
@@ -151,8 +147,7 @@ def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
t0 = logger.timer_debug1(ks, 'vxc tot', *t0)
#enabling range-separated hybrids
- omega, alpha, hyb = ni.rsh_and_hybrid_coeff(ks.xc, spin=mol.spin)
- if abs(hyb) < 1e-10 and abs(alpha) < 1e-10:
+ if not ni.libxc.is_hybrid_xc(ks.xc):
vk = None
if (ks._eri is None and ks.direct_scf and
getattr(vhf_last, 'vj', None) is not None):
@@ -164,6 +159,7 @@ def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
vxc += vj
else:
+ omega, alpha, hyb = ni.rsh_and_hybrid_coeff(ks.xc, spin=mol.spin)
if (ks._eri is None and ks.direct_scf and
getattr(vhf_last, 'vk', None) is not None):
ddm = cupy.asarray(dm) - cupy.asarray(dm_last)
@@ -232,6 +228,16 @@ def energy_elec(ks, dm=None, h1e=None, vhf=None):
# Inherit pyscf KohnShamDFT class since this is tested in the pyscf dispersion code
class KohnShamDFT(rks.KohnShamDFT):
+ _keys = {'cphf_grids', *rks.KohnShamDFT._keys}
+
+ to_rhf = NotImplemented
+ to_uhf = NotImplemented
+ to_ghf = NotImplemented
+ to_hf = NotImplemented
+ to_rks = NotImplemented
+ to_uks = NotImplemented
+ to_gks = NotImplemented
+
- _keys = rks.KohnShamDFT._keys
def __init__(self, xc='LDA,VWN'):
@@ -245,6 +251,14 @@ def __init__(self, xc='LDA,VWN'):
self.nlcgrids = gen_grid.Grids(self.mol)
self.nlcgrids.level = getattr(
__config__, 'dft_rks_RKS_nlcgrids_level', self.nlcgrids.level)
+
+ # Default CPHF grids is SG1 grids
+ # Reference:
+ # https://gaussian.com/integral/?tabid=1#Integral_keyword__Grid_option
+ self.cphf_grids = gen_grid.Grids(self.mol)
+ self.cphf_grids.prune = gen_grid.sg1_prune
+ self.cphf_grids.atom_grid = (50,194)
+
# Use rho to filter grids
self.small_rho_cutoff = getattr(
__config__, 'dft_rks_RKS_small_rho_cutoff', 1e-7)
@@ -261,7 +275,7 @@ def omega(self, v):
def dump_flags(self, verbose=None):
# TODO: add this later
return
-
+
reset = rks.KohnShamDFT.reset
do_nlc = rks.KohnShamDFT.do_nlc
@@ -285,7 +299,8 @@ def reset(self, mol=None):
hf.SCF.reset(self, mol)
self.grids.reset(mol)
self.nlcgrids.reset(mol)
- self._numint.gdftopt = None
+ self.cphf_grids.reset(mol)
+ self._numint.reset()
return self
def nuc_grad_method(self):
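
With the new cphf_grids attribute the CPSCF equations default to pruned SG1
grids, and they can be coarsened or replaced per calculation. A sketch (molecule
and functional are arbitrary examples):

    import pyscf
    from gpu4pyscf.dft import rks

    mol = pyscf.M(atom='O 0 0 0; H 0 0.757 0.587; H 0 -0.757 0.587',
                  basis='def2-svp')
    mf = rks.RKS(mol, xc='b3lyp')
    mf.kernel()
    mf.cphf_grids.atom_grid = (50, 194)   # (radial, angular) points per atom
    h = mf.Hessian().kernel()
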
diff --git a/gpu4pyscf/dft/tests/test_ao_values.py b/gpu4pyscf/dft/tests/test_ao_values.py
index 86d52d6c..8a1a1457 100644
--- a/gpu4pyscf/dft/tests/test_ao_values.py
+++ b/gpu4pyscf/dft/tests/test_ao_values.py
@@ -55,40 +55,35 @@ def test_ao_sph_deriv0(self):
coords = np.random.random((100,3))
ao = mol_sph.eval_gto('GTOval_sph_deriv0', coords)
ao_cpu = cupy.asarray(ao)
- ni = NumInt()
- ao_gpu = numint.eval_ao(ni, mol_sph, coords, deriv=0)
+ ao_gpu = numint.eval_ao(mol_sph, coords, deriv=0)
assert cupy.linalg.norm(ao_cpu - ao_gpu) < 1e-8
-
+
def test_ao_sph_deriv1(self):
coords = np.random.random((100,3))
ao = mol_sph.eval_gto('GTOval_sph_deriv1', coords)
ao_cpu = cupy.asarray(ao)
- ni = NumInt()
- ao_gpu = numint.eval_ao(ni, mol_sph, coords, deriv=1)
+ ao_gpu = numint.eval_ao(mol_sph, coords, deriv=1)
assert cupy.linalg.norm(ao_cpu - ao_gpu) < 1e-8
def test_ao_sph_deriv2(self):
coords = np.random.random((4,3))
ao = mol_sph.eval_gto('GTOval_sph_deriv2', coords)
ao_cpu = cupy.asarray(ao)
- ni = NumInt()
- ao_gpu = numint.eval_ao(ni, mol_sph, coords, deriv=2)
+ ao_gpu = numint.eval_ao(mol_sph, coords, deriv=2)
assert cupy.linalg.norm(ao_cpu - ao_gpu) < 1e-8
def test_ao_sph_deriv3(self):
coords = np.random.random((100,3))
ao = mol_sph.eval_gto('GTOval_sph_deriv3', coords)
ao_cpu = cupy.asarray(ao)
- ni = NumInt()
- ao_gpu = numint.eval_ao(ni, mol_sph, coords, deriv=3)
+ ao_gpu = numint.eval_ao(mol_sph, coords, deriv=3)
assert cupy.linalg.norm(ao_cpu - ao_gpu) < 1e-8
def test_ao_sph_deriv4(self):
coords = np.random.random((100,3))
ao = mol_sph.eval_gto('GTOval_sph_deriv4', coords)
ao_cpu = cupy.asarray(ao)
- ni = NumInt()
- ao_gpu = numint.eval_ao(ni, mol_sph, coords, deriv=4)
+ ao_gpu = numint.eval_ao(mol_sph, coords, deriv=4)
assert cupy.linalg.norm(ao_cpu - ao_gpu) < 1e-8
# cart mol
@@ -96,24 +91,21 @@ def test_ao_cart_deriv0(self):
coords = np.random.random((100,3))
ao = mol_cart.eval_gto('GTOval_cart_deriv0', coords)
ao_cpu = cupy.asarray(ao)
- ni = NumInt()
- ao_gpu = numint.eval_ao(ni, mol_cart, coords, deriv=0)
+ ao_gpu = numint.eval_ao(mol_cart, coords, deriv=0)
assert cupy.linalg.norm(ao_cpu - ao_gpu) < 1e-8
def test_ao_cart_deriv1(self):
coords = np.random.random((100,3))
ao = mol_cart.eval_gto('GTOval_cart_deriv1', coords)
ao_cpu = cupy.asarray(ao)
- ni = NumInt()
- ao_gpu = numint.eval_ao(ni, mol_cart, coords, deriv=1)
+ ao_gpu = numint.eval_ao(mol_cart, coords, deriv=1)
assert cupy.linalg.norm(ao_cpu - ao_gpu) < 1e-8
def test_ao_cart_deriv2(self):
coords = np.random.random((100,3))
ao = mol_cart.eval_gto('GTOval_cart_deriv2', coords)
ao_cpu = cupy.asarray(ao)
- ni = NumInt()
- ao_gpu = numint.eval_ao(ni, mol_cart, coords, deriv=2)
+ ao_gpu = numint.eval_ao(mol_cart, coords, deriv=2)
assert cupy.linalg.norm(ao_cpu - ao_gpu) < 1e-8
def test_ao_cart_deriv3(self):
@@ -128,8 +120,7 @@ def test_ao_cart_deriv4(self):
coords = np.random.random((100,3))
ao = mol_cart.eval_gto('GTOval_cart_deriv4', coords)
ao_cpu = cupy.asarray(ao)
- ni = NumInt()
- ao_gpu = numint.eval_ao(ni, mol_cart, coords, deriv=4)
+ ao_gpu = numint.eval_ao(mol_cart, coords, deriv=4)
assert cupy.linalg.norm(ao_cpu - ao_gpu) < 1e-8
if __name__ == "__main__":
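
eval_ao is now a standalone staticmethod, so the tests above no longer construct
a NumInt instance. A minimal call (random coordinates, any built molecule):

    import numpy as np
    import pyscf
    from gpu4pyscf.dft import numint

    mol = pyscf.M(atom='He 0 0 0', basis='cc-pvdz')
    coords = np.random.random((100, 3))
    ao = numint.eval_ao(mol, coords, deriv=1)   # no NumInt instance needed
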
diff --git a/gpu4pyscf/dft/tests/test_libxc.py b/gpu4pyscf/dft/tests/test_libxc.py
index 229f0854..80d305aa 100644
--- a/gpu4pyscf/dft/tests/test_libxc.py
+++ b/gpu4pyscf/dft/tests/test_libxc.py
@@ -47,8 +47,12 @@ def tearDownModule():
mol.stdout.close()
del mol
+def _diff(dat, ref):
+ d = dat - ref
+ return np.min((abs(d/(ref+1e-300)), abs(d)), axis=0)
+
class KnownValues(unittest.TestCase):
- def _check_xc(self, xc):
+ def _check_xc(self, xc, spin=0, fxc_tol=1e-10, kxc_tol=1e-10):
ni_cpu = numint_cpu()
ni_gpu = numint_gpu()
xctype = ni_cpu._xc_type(xc)
@@ -60,26 +64,42 @@ def _check_xc(self, xc):
grids = Grids(mol).build()
ao = ni_cpu.eval_ao(mol, grids.coords, ao_deriv)
rho = ni_cpu.eval_rho(mol, ao, dm0, xctype=xctype)
+ if spin != 0:
+ rho = (rho, rho)
exc_cpu, vxc_cpu, fxc_cpu, kxc_cpu = ni_cpu.eval_xc_eff(xc, rho, deriv=2, xctype=xctype)
exc_gpu, vxc_gpu, fxc_gpu, kxc_gpu = ni_gpu.eval_xc_eff(xc, cupy.array(rho), deriv=2, xctype=xctype)
- assert(np.linalg.norm((exc_gpu[:,0].get() - exc_cpu)) < 1e-10)
- assert(np.linalg.norm((vxc_gpu.get() - vxc_cpu)) < 1e-10)
+ assert _diff(exc_gpu[:,0].get(), exc_cpu).max() < 1e-10
+ assert _diff(vxc_gpu.get(), vxc_cpu).max() < 1e-10
if fxc_gpu is not None:
- assert(np.linalg.norm((fxc_gpu.get() - fxc_cpu))/np.linalg.norm(fxc_cpu) < 1e-6)
+ assert _diff(fxc_gpu.get(), fxc_cpu).max() < fxc_tol
if kxc_gpu is not None:
- assert(np.linalg.norm(kxc_gpu.get() - kxc_cpu) < 1e-5)
+ assert _diff(kxc_gpu.get(), kxc_cpu).max() < kxc_tol
def test_LDA(self):
self._check_xc('LDA_C_VWN')
def test_GGA(self):
- self._check_xc('GGA_C_PBE')
+ self._check_xc('HYB_GGA_XC_B3LYP')
+ self._check_xc('GGA_X_B88', fxc_tol=1e-10)
+ self._check_xc('GGA_C_PBE', fxc_tol=1e-5)
def test_mGGA(self):
- self._check_xc('MGGA_C_M06')
+ self._check_xc('MGGA_C_M06', fxc_tol=1e-5)
+
+ def test_u_LDA(self):
+ self._check_xc('LDA_C_VWN', spin=1)
+
+ def test_u_GGA(self):
+ # large errors found in B88 for the spin polarized case
+ self._check_xc('HYB_GGA_XC_B3LYP', spin=1, fxc_tol=1e-3)
+ self._check_xc('GGA_X_B88', spin=1, fxc_tol=1e-1)
+ self._check_xc('GGA_C_PBE', spin=1, fxc_tol=1e-5)
+
+ def test_u_mGGA(self):
+ self._check_xc('MGGA_C_M06', spin=1, fxc_tol=1e-5)
if __name__ == "__main__":
print("Full Tests for xc fun")
- unittest.main()
\ No newline at end of file
+ unittest.main()
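
The new _diff helper compares GPU and CPU values by taking, per element, the
smaller of the relative and the absolute error, so near-zero reference values no
longer blow up the relative error. A small numpy check:

    import numpy as np

    def _diff(dat, ref):
        d = dat - ref
        return np.min((abs(d / (ref + 1e-300)), abs(d)), axis=0)

    ref = np.array([1e-12, 2.0])
    dat = ref + np.array([1e-13, 1e-6])
    print(_diff(dat, ref))   # ~[1e-13, 5e-7]: absolute wins near 0, relative elsewhere
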
diff --git a/gpu4pyscf/dft/tests/test_numint.py b/gpu4pyscf/dft/tests/test_numint.py
index ba34f63d..505df831 100644
--- a/gpu4pyscf/dft/tests/test_numint.py
+++ b/gpu4pyscf/dft/tests/test_numint.py
@@ -155,7 +155,7 @@ def test_rks_gga(self):
def test_rks_mgga(self):
self._check_vxc('nr_rks', MGGA_M06)
-
+
def test_uks_lda(self):
self._check_vxc('nr_uks', LDA)#'lda', -6.362059440515177)
@@ -212,7 +212,25 @@ def test_vv10(self):
v = dft.numint._vv10nlc(rho, coords, vvrho, vvweight, vvcoords, nlc_pars)
self.assertAlmostEqual(lib.fp(v[0].get()), 0.15894647203764295, 8)
self.assertAlmostEqual(lib.fp(v[1].get()), 0.20500922537924576, 8)
- return
+
+ def test_eval_rho(self):
+ np.random.seed(1)
+ dm = np.random.random(dm0.shape)
+ ni_gpu = NumInt()
+ ni_cpu = pyscf_numint()
+ for xctype in ('LDA', 'GGA', 'MGGA'):
+ deriv = 1
+ if xctype == 'LDA':
+ deriv = 0
+ ao_gpu = ni_gpu.eval_ao(mol, grids_gpu.coords, deriv=deriv, transpose=False)
+ ao_cpu = ni_cpu.eval_ao(mol, grids_cpu.coords, deriv=deriv)
+ rho = ni_gpu.eval_rho(mol, ao_gpu, dm, xctype=xctype, hermi=0, with_lapl=False)
+ ref = ni_cpu.eval_rho(mol, ao_cpu, dm, xctype=xctype, hermi=0, with_lapl=False)
+ self.assertAlmostEqual(abs(rho.get() - ref).max(), 0, 10)
+
+ rho = ni_gpu.eval_rho(mol, ao_gpu, dm0, xctype=xctype, hermi=1, with_lapl=False)
+ ref = ni_cpu.eval_rho(mol, ao_cpu, dm0, xctype=xctype, hermi=1, with_lapl=False)
+ self.assertAlmostEqual(abs(rho.get() - ref).max(), 0, 10)
if __name__ == "__main__":
print("Full Tests for dft numint")
diff --git a/gpu4pyscf/dft/uks.py b/gpu4pyscf/dft/uks.py
index 398f8b81..7ccf20c7 100644
--- a/gpu4pyscf/dft/uks.py
+++ b/gpu4pyscf/dft/uks.py
@@ -133,7 +133,8 @@ def reset(self, mol=None):
hf.SCF.reset(self, mol)
self.grids.reset(mol)
self.nlcgrids.reset(mol)
- self._numint.gdftopt = None
+ self.cphf_grids.reset(mol)
+ self._numint.reset()
return self
def nuc_grad_method(self):
@@ -145,4 +146,4 @@ def to_cpu(self):
mf = uks.UKS(self.mol, xc=self.xc)
mf.disp = self.disp
utils.to_cpu(self, mf)
- return mf
\ No newline at end of file
+ return mf
diff --git a/gpu4pyscf/grad/rhf.py b/gpu4pyscf/grad/rhf.py
index 7cc5e78d..70ab8240 100644
--- a/gpu4pyscf/grad/rhf.py
+++ b/gpu4pyscf/grad/rhf.py
@@ -256,8 +256,8 @@ def get_grad_hcore(mf_grad, mo_coeff=None, mo_occ=None):
intopt = int3c2e.VHFOpt(mol, fakemol, 'int2e')
intopt.build(1e-14, diag_block_with_triu=True, aosym=False,
group_size=int3c2e.BLKSIZE, group_size_aux=int3c2e.BLKSIZE)
- orbo_sorted = orbo[intopt.ao_idx]
- mo_coeff_sorted = mo_coeff[intopt.ao_idx]
+ orbo_sorted = intopt.sort_orbitals(orbo, axis=[0])
+ mo_coeff_sorted = intopt.sort_orbitals(mo_coeff, axis=[0])
for i0,i1,j0,j1,k0,k1,int3c_blk in int3c2e.loop_int3c2e_general(intopt, ip_type='ip1'):
dh1e[k0:k1,:,j0:j1,:] += contract('xkji,io->kxjo', int3c_blk, orbo_sorted[i0:i1])
dh1e[k0:k1,:,i0:i1,:] += contract('xkji,jo->kxio', int3c_blk, orbo_sorted[j0:j1])
diff --git a/gpu4pyscf/grad/rks.py b/gpu4pyscf/grad/rks.py
index 2ef4a6d8..1fd43ac0 100644
--- a/gpu4pyscf/grad/rks.py
+++ b/gpu4pyscf/grad/rks.py
@@ -135,9 +135,8 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
coeff = cupy.asarray(opt.coeff)
nao, nao0 = coeff.shape
dms = cupy.asarray(dms).reshape(-1,nao0,nao0)
- dms = take_last2d(dms, opt.ao_idx)
- mo_coeff = mo_coeff[opt.ao_idx]
-
+ dms = opt.sort_orbitals(dms, axis=[1,2])
+ mo_coeff = opt.sort_orbitals(mo_coeff, axis=[0])
nset = len(dms)
assert nset == 1
vmat = cupy.zeros((nset,3,nao,nao))
@@ -179,8 +178,7 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
vtmp = _gga_grad_sum_(ao_mask, wv)
vtmp += _tau_grad_dot_(ao_mask, wv[4])
add_sparse(vmat[idm], vtmp, idx)
- #vmat = [cupy.einsum('pi,npq,qj->nij', coeff, v, coeff) for v in vmat]
- vmat = take_last2d(vmat, opt.rev_ao_idx)
+ vmat = opt.unsort_orbitals(vmat, axis=[2,3])
exc = None
if nset == 1:
vmat = vmat[0]
@@ -203,10 +201,9 @@ def get_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
_sorted_mol = opt._sorted_mol
coeff = cupy.asarray(opt.coeff)
nao, nao0 = coeff.shape
- dms = cupy.asarray(dms)
- dms = [coeff @ dm @ coeff.T
- for dm in dms.reshape(-1,nao0,nao0)]
- mo_coeff = coeff @ mo_coeff
+ dms = cupy.asarray(dms).reshape(-1,nao0,nao0)
+ dms = opt.sort_orbitals(dms, axis=[1,2])
+ mo_coeff = opt.sort_orbitals(mo_coeff, axis=[0])
nset = len(dms)
assert nset == 1
@@ -238,10 +235,7 @@ def get_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
vmat_tmp = _gga_grad_sum_(ao_mask, wv)
add_sparse(vmat, vmat_tmp, mask)
- #vmat = contract('npq,qj->npj', vmat, coeff)
- #vmat = contract('pi,npj->nij', coeff, vmat)
- rev_ao_idx = opt.rev_ao_idx
- vmat = take_last2d(vmat, rev_ao_idx)
+ vmat = opt.unsort_orbitals(vmat, axis=[1,2])
exc = None
# - sign because nabla_X = -nabla_x
return exc, -vmat
@@ -358,7 +352,7 @@ def get_vxc_full_response(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
for atm_id, (coords, weight, weight1) in enumerate(grids_response_cc(grids)):
ngrids = weight.size
for p0, p1 in lib.prange(0,ngrids,block_size):
- ao = numint.eval_ao(ni, _sorted_mol, coords[p0:p1, :], ao_deriv)
+ ao = numint.eval_ao(_sorted_mol, coords[p0:p1, :], ao_deriv, gdftopt=opt, transpose=False)
if xctype == 'LDA':
rho = numint.eval_rho(_sorted_mol, ao[0], dms,
@@ -409,7 +403,7 @@ def get_vxc_full_response(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
#:vmat = cupy.einsum('pi,npq,qj->nij', coeff, vmat, coeff)
vmat = sandwich_dot(vmat, coeff)
-
+
# - sign because nabla_X = -nabla_x
return excsum, -vmat
@@ -424,7 +418,7 @@ def grids_response_cc(grids):
atm_dist = gto.inter_distance(mol, atm_coords)
atm_dist = cupy.asarray(atm_dist)
atm_coords = cupy.asarray(atm_coords)
-
+
def _radii_adjust(mol, atomic_radii):
charges = mol.atom_charges()
if grids.radii_adjust == radi.treutler_atomic_radii_adjust:
diff --git a/gpu4pyscf/grad/uks.py b/gpu4pyscf/grad/uks.py
index 32848381..32d18207 100644
--- a/gpu4pyscf/grad/uks.py
+++ b/gpu4pyscf/grad/uks.py
@@ -90,7 +90,7 @@ def get_veff(ks_grad, mol=None, dm=None, verbose=None):
vxc_tmp[0] += vnlc
vxc_tmp[1] += vnlc
t0 = logger.timer(ks_grad, 'vxc', *t0)
-
+
mo_coeff_alpha = mf.mo_coeff[0]
mo_coeff_beta = mf.mo_coeff[1]
occ_coeff0 = cupy.asarray(mo_coeff_alpha[:, mf.mo_occ[0]>0.5], order='C')
@@ -139,9 +139,8 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
coeff = cupy.asarray(opt.coeff)
nao, nao0 = coeff.shape
dms = cupy.asarray(dms)
- dms = take_last2d(dms, opt.ao_idx)
- mo_coeff = mo_coeff[:, opt.ao_idx]
-
+ dms = opt.sort_orbitals(dms, axis=[1,2])
+ mo_coeff = opt.sort_orbitals(mo_coeff, axis=[1])
nset = len(dms)
vmat = cupy.zeros((nset,3,nao,nao))
if xctype == 'LDA':
@@ -193,7 +192,7 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
vtmp += rks_grad._tau_grad_dot_(ao_mask, wv[1,4])
add_sparse(vmat[1], vtmp, idx)
- vmat = take_last2d(vmat, opt.rev_ao_idx)
+ vmat = opt.unsort_orbitals(vmat, axis=[2,3])
exc = None
# - sign because nabla_X = -nabla_x
@@ -216,8 +215,7 @@ def get_vxc_full_response(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
nao, nao0 = coeff.shape
dms = cupy.asarray(dms)
assert dms.ndim == 3 and dms.shape[0] == 2
- #:dms = cupy.einsum('pi,nij,qj->npq', coeff, dms, coeff)
- dms = sandwich_dot(dms.reshape(-1,nao0,nao0), coeff.T)
+ dms = opt.sort_orbitals(dms.reshape(-1,nao0,nao0), axis=[1,2])
excsum = cupy.zeros((natm, 3))
vmat = cupy.zeros((2,3,nao,nao))
@@ -239,7 +237,7 @@ def get_vxc_full_response(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
for atm_id, (coords, weight, weight1) in enumerate(rks_grad.grids_response_cc(grids)):
ngrids = weight.size
for p0, p1 in lib.prange(0,ngrids,block_size):
- ao = numint.eval_ao(ni, _sorted_mol, coords[p0:p1, :], ao_deriv)
+ ao = numint.eval_ao(_sorted_mol, coords[p0:p1, :], ao_deriv, gdftopt=opt, transpose=False)
if xctype == 'LDA':
rho_a = numint.eval_rho(_sorted_mol, ao[0], dms[0],
xctype=xctype, hermi=1, with_lapl=False)
@@ -304,9 +302,7 @@ def get_vxc_full_response(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
excsum[atm_id] += cupy.einsum('xij,ji->x', vtmp, dms[1]) * 2
rho = vxc = None
- #:vmat = cupy.einsum('pi,snpq,qj->snij', coeff, vmat, coeff)
- vmat = sandwich_dot(vmat.reshape(6,nao,nao), coeff).reshape(2,3,nao0,nao0)
-
+ vmat = opt.unsort_orbitals(vmat, axis=[2,3])
# - sign because nabla_X = -nabla_x
return excsum, -vmat
@@ -326,8 +322,8 @@ def get_nlc_vxc(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, relativity=0, he
_sorted_mol = opt._sorted_mol
coeff = cupy.asarray(opt.coeff)
nao, nao0 = coeff.shape
- mo_coeff_0 = coeff @ mo_coeff[0]
- mo_coeff_1 = coeff @ mo_coeff[1]
+ mo_coeff_0 = opt.sort_orbitals(mo_coeff[0], axis=[0])
+ mo_coeff_1 = opt.sort_orbitals(mo_coeff[1], axis=[0])
nset = 1
assert nset == 1
@@ -361,8 +357,7 @@ def get_nlc_vxc(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, relativity=0, he
vmat_tmp = rks_grad._gga_grad_sum_(ao_mask, wv)
add_sparse(vmat, vmat_tmp, mask)
- rev_ao_idx = opt.rev_ao_idx
- vmat = take_last2d(vmat, rev_ao_idx)
+ vmat = opt.unsort_orbitals(vmat, axis=[1,2])
exc = None
# - sign because nabla_X = -nabla_x
return exc, -vmat
diff --git a/gpu4pyscf/gto/mole.py b/gpu4pyscf/gto/mole.py
index 83e3e323..01af5ca0 100644
--- a/gpu4pyscf/gto/mole.py
+++ b/gpu4pyscf/gto/mole.py
@@ -86,7 +86,7 @@ def basis_seg_contraction(mol, allow_replica=False):
pmol.output = mol.output
pmol.verbose = mol.verbose
pmol.stdout = mol.stdout
- pmol.cart = True
+ pmol.cart = True # decontraction works in Cartesian AOs; mol.cart is recovered through the c2s coeff
pmol._bas = np.asarray(np.vstack(_bas), dtype=np.int32)
pmol._env = _env
return pmol
diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py
index 41669c93..3d2545e2 100644
--- a/gpu4pyscf/hessian/rhf.py
+++ b/gpu4pyscf/hessian/rhf.py
@@ -561,15 +561,17 @@ def gen_vind(mf, mo_coeff, mo_occ):
nao, nmo = mo_coeff.shape
mocc = mo_coeff[:,mo_occ>0]
nocc = mocc.shape[1]
- vresp = mf.gen_response(mo_coeff, mo_occ, hermi=1)
+ mocc_2 = mocc * 2
+ grids = getattr(mf, 'cphf_grids', None)
+ vresp = mf.gen_response(mo_coeff, mo_occ, hermi=1, grids=grids)
def fx(mo1):
mo1 = cupy.asarray(mo1)
mo1 = mo1.reshape(-1,nmo,nocc)
mo1_mo = contract('npo,ip->nio', mo1, mo_coeff)
- #dm1 = contract('nio,jo->nij', 2.0*mo1_mo, mocc)
+ #dm1 = contract('nio,jo->nij', mo1_mo, mocc_2)
#dm1 = dm1 + dm1.transpose(0,2,1)
- dm1 = mo1_mo.dot(2.0*mocc.T)
+ dm1 = mo1_mo.dot(mocc_2.T)
transpose_sum(dm1)
dm1 = tag_array(dm1, mo1=mo1_mo, occ_coeff=mocc, mo_occ=mo_occ)
v1 = vresp(dm1)
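
The fx closure builds the first-order RHF trial density dm1 = 2 mo1_ao occ^T +
h.c.; precomputing mocc_2 = 2*mocc just hoists the constant factor out of the
CPHF loop. A numpy sketch of the same contraction (shapes are illustrative):

    import numpy as np

    nao = nmo = 10; nocc, nset = 3, 2
    mo_coeff = np.random.rand(nao, nmo)
    mocc = mo_coeff[:, :nocc]
    mo1 = np.random.rand(nset, nmo, nocc)

    mo1_ao = np.einsum('npo,ip->nio', mo1, mo_coeff)   # back-transform to AO basis
    dm1 = np.einsum('nio,jo->nij', mo1_ao, 2.0 * mocc)
    dm1 = dm1 + dm1.transpose(0, 2, 1)                 # what transpose_sum does in place
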
diff --git a/gpu4pyscf/hessian/rks.py b/gpu4pyscf/hessian/rks.py
index 942438f9..4f03da9e 100644
--- a/gpu4pyscf/hessian/rks.py
+++ b/gpu4pyscf/hessian/rks.py
@@ -52,7 +52,7 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
if mf.do_nlc():
raise NotImplementedError
omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
- with_k = abs(hyb) > 1e-10
+ with_k = ni.libxc.is_hybrid_xc(mf.xc)
de2, ej, ek = rhf_hess._partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ,
atmlst, max_memory, verbose,
with_k=with_k)
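
Hybrid detection now asks libxc directly instead of thresholding the hybrid
coefficient, which also catches range-separated functionals whose short-range
exchange fraction is zero. A quick check against the CPU libxc interface
(functional names are just examples):

    from pyscf.dft import libxc

    assert libxc.is_hybrid_xc('b3lyp')
    assert libxc.is_hybrid_xc('camb3lyp')   # range-separated hybrids count too
    assert not libxc.is_hybrid_xc('pbe')
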
@@ -103,7 +103,7 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
mf = hessobj.base
ni = mf._numint
omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
- with_k = abs(hyb) > 1e-10
+ with_k = ni.libxc.is_hybrid_xc(mf.xc)
avail_mem -= 8 * h1mo.size
slice_size = int(avail_mem*0.5) // (8*3*nao*nao)
@@ -146,7 +146,6 @@ def _get_vxc_diag(hessobj, mo_coeff, mo_occ, max_memory):
mo_occ = cupy.asarray(mo_occ)
mo_coeff = cupy.asarray(mo_coeff)
- nao_sph = mo_coeff.shape[0]
ni = mf._numint
xctype = ni._xc_type(mf.xc)
shls_slice = (0, mol.nbas)
@@ -157,8 +156,7 @@ def _get_vxc_diag(hessobj, mo_coeff, mo_occ, max_memory):
ni.build(mol, grids.coords)
opt = ni.gdftopt
_sorted_mol = opt._sorted_mol
- coeff = cupy.asarray(opt.coeff)
- mo_coeff = coeff @ mo_coeff
+ mo_coeff = opt.sort_orbitals(mo_coeff, axis=[0])
nao = mo_coeff.shape[0]
vmat = cupy.zeros((6,nao,nao))
@@ -251,9 +249,8 @@ def contract_(ao, aoidx, wv, mask):
1,3,4,
2,4,5]]
- vmat = contract('npq,qj->npj', vmat, coeff)
- vmat = contract('pi,npj->nij', coeff, vmat)
- return vmat.reshape(3,3,nao_sph,nao_sph)
+ vmat = opt.unsort_orbitals(vmat, axis=[1,2])
+ return vmat.reshape(3,3,nao,nao)
def _make_dR_rho1(ao, ao_dm0, atm_id, aoslices, xctype):
p0, p1 = aoslices[atm_id][2:]
@@ -344,7 +341,7 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
_sorted_mol = opt._sorted_mol
coeff = cupy.asarray(opt.coeff)
dm0 = mf.make_rdm1(mo_coeff, mo_occ)
- dm0_sorted = take_last2d(dm0, opt.ao_idx)
+ dm0_sorted = opt.sort_orbitals(dm0, axis=[0,1])
vmat_dm = cupy.zeros((_sorted_mol.natm,3,3,nao))
ipip = cupy.zeros((3,3,nao,nao))
if xctype == 'LDA':
@@ -361,7 +358,7 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
wv = weight * vxc[0]
aow = [numint._scale_ao(ao[i], wv) for i in range(1, 4)]
_d1d2_dot_(ipip, mol, aow, ao[1:4], mask, ao_loc, False)
- dm0_mask = dm0_sorted[numpy.ix_(mask, mask)]
+ dm0_mask = dm0_sorted[mask[:,None], mask]
ao_dm_mask = contract('nig,ij->njg', ao_mask[:4], dm0_mask)
ao_dm0 = numint._dot_ao_dm(mol, ao[0], dm0, mask, shls_slice, ao_loc)
@@ -379,7 +376,7 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
ao_dm0 = aow = None
t1 = log.timer_debug2('integration', *t1)
for ia in range(_sorted_mol.natm):
- vmat_dm[ia] = vmat_dm[ia][:,:,opt.rev_ao_idx]
+ vmat_dm[ia][:,:,opt._ao_idx] = vmat_dm[ia]
p0, p1 = aoslices[ia][2:]
vmat_dm[ia] += contract('xypq,pq->xyp', ipip[:,:,:,p0:p1], dm0[:,p0:p1])
elif xctype == 'GGA':
@@ -399,7 +396,7 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
_d1d2_dot_(ipip, mol, aow, ao[1:4], mask, ao_loc, False)
ao_dm0 = [numint._dot_ao_dm(mol, ao[i], dm0, mask, shls_slice, ao_loc) for i in range(4)]
wf = weight * fxc
- dm0_mask = dm0_sorted[numpy.ix_(mask, mask)]
+ dm0_mask = dm0_sorted[mask[:,None], mask]
ao_dm_mask = contract('nig,ij->njg', ao_mask[:4], dm0_mask)
vmat_dm_tmp = cupy.empty([3,3,nao_non0])
for ia in range(_sorted_mol.natm):
@@ -416,7 +413,7 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
ao_dm0 = aow = None
t1 = log.timer_debug2('integration', *t1)
for ia in range(_sorted_mol.natm):
- vmat_dm[ia] = vmat_dm[ia][:,:,opt.rev_ao_idx]
+ vmat_dm[ia][:,:,opt._ao_idx] = vmat_dm[ia]
p0, p1 = aoslices[ia][2:]
vmat_dm[ia] += contract('xypq,pq->xyp', ipip[:,:,:,p0:p1], dm0[:,p0:p1])
vmat_dm[ia] += contract('yxqp,pq->xyp', ipip[:,:,p0:p1], dm0[:,p0:p1])
@@ -444,7 +441,7 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
_d1d2_dot_(ipip, mol, [aow[0], aow[1], aow[2]], [ao[XX], ao[XY], ao[XZ]], mask, ao_loc, False)
_d1d2_dot_(ipip, mol, [aow[1], aow[3], aow[4]], [ao[YX], ao[YY], ao[YZ]], mask, ao_loc, False)
_d1d2_dot_(ipip, mol, [aow[2], aow[4], aow[5]], [ao[ZX], ao[ZY], ao[ZZ]], mask, ao_loc, False)
- dm0_mask = dm0_sorted[numpy.ix_(mask, mask)]
+ dm0_mask = dm0_sorted[mask[:,None], mask]
ao_dm0 = [numint._dot_ao_dm(mol, ao[i], dm0, mask, shls_slice, ao_loc) for i in range(4)]
ao_dm_mask = contract('nig,ij->njg', ao_mask[:4], dm0_mask)
wf = weight * fxc
@@ -483,7 +480,7 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
vmat_dm[ia][:,:,mask] += vmat_dm_tmp
t1 = log.timer_debug2('integration', *t1)
for ia in range(_sorted_mol.natm):
- vmat_dm[ia] = vmat_dm[ia][:,:,opt.rev_ao_idx]
+ vmat_dm[ia][:,:,opt._ao_idx] = vmat_dm[ia]
p0, p1 = aoslices[ia][2:]
vmat_dm[ia] += contract('xypq,pq->xyp', ipip[:,:,:,p0:p1], dm0[:,p0:p1])
vmat_dm[ia] += contract('yxqp,pq->xyp', ipip[:,:,p0:p1], dm0[:,p0:p1])
diff --git a/gpu4pyscf/hessian/tests/test_rks_hessian.py b/gpu4pyscf/hessian/tests/test_rks_hessian.py
index bdc1b2f6..bbe272d3 100644
--- a/gpu4pyscf/hessian/tests/test_rks_hessian.py
+++ b/gpu4pyscf/hessian/tests/test_rks_hessian.py
@@ -70,7 +70,9 @@ def _check_vxc(method, xc='LDA'):
def _vs_cpu(mf, tol=1e-7):
mf.conv_tol_cpscf = 1e-8
ref = mf.Hessian().kernel()
- e2_gpu = mf.Hessian().to_gpu().kernel()
+ hessobj = mf.Hessian().to_gpu()
+ hessobj.base.cphf_grids = hessobj.base.grids
+ e2_gpu = hessobj.kernel()
assert abs(ref - e2_gpu).max() < tol
class KnownValues(unittest.TestCase):
diff --git a/gpu4pyscf/hessian/tests/test_uks_hessian.py b/gpu4pyscf/hessian/tests/test_uks_hessian.py
index c9853579..76beb1e8 100644
--- a/gpu4pyscf/hessian/tests/test_uks_hessian.py
+++ b/gpu4pyscf/hessian/tests/test_uks_hessian.py
@@ -81,7 +81,9 @@ def _check_vxc(method, xc='LDA'):
def _vs_cpu(mf, tol=1e-7):
mf.conv_tol_cpscf = 1e-8
ref = mf.Hessian().kernel()
- e2_gpu = mf.Hessian().to_gpu().kernel()
+ hessobj = mf.Hessian().to_gpu()
+ hessobj.base.cphf_grids = hessobj.base.grids
+ e2_gpu = hessobj.kernel()
assert abs(ref - e2_gpu).max() < tol
class KnownValues(unittest.TestCase):
diff --git a/gpu4pyscf/hessian/uhf.py b/gpu4pyscf/hessian/uhf.py
index a338dc59..76f9ae9f 100644
--- a/gpu4pyscf/hessian/uhf.py
+++ b/gpu4pyscf/hessian/uhf.py
@@ -324,7 +324,8 @@ def gen_vind(mf, mo_coeff, mo_occ):
moccb = mo_coeff[1][:,mo_occ[1]>0]
nocca = mocca.shape[1]
noccb = moccb.shape[1]
- vresp = mf.gen_response(mo_coeff, mo_occ, hermi=1)
+ grids = getattr(mf, 'cphf_grids', None)
+ vresp = mf.gen_response(mo_coeff, mo_occ, hermi=1, grids=grids)
def fx(mo1):
mo1 = cupy.asarray(mo1)
diff --git a/gpu4pyscf/hessian/uks.py b/gpu4pyscf/hessian/uks.py
index b4d9fc48..00c861b3 100644
--- a/gpu4pyscf/hessian/uks.py
+++ b/gpu4pyscf/hessian/uks.py
@@ -53,7 +53,7 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
if mf.nlc != '':
raise NotImplementedError
omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
- with_k = abs(hyb) > 1e-10
+ with_k = ni.libxc.is_hybrid_xc(mf.xc)
de2, ej, ek = uhf_hess._partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ,
atmlst, max_memory, verbose,
with_k=with_k)
@@ -112,7 +112,7 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
mf = hessobj.base
ni = mf._numint
omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
- with_k = abs(hyb) > 1e-10
+ with_k = ni.libxc.is_hybrid_xc(mf.xc)
avail_mem -= 8 * (h1moa.size + h1mob.size)
slice_size = int(avail_mem*0.5) // (8*3*nao*nao)
@@ -183,8 +183,7 @@ def _get_vxc_diag(hessobj, mo_coeff, mo_occ, max_memory):
opt = ni.gdftopt
_sorted_mol = opt._sorted_mol
- coeff = cupy.asarray(opt.coeff)
- mo_coeff = contract('nij,pi->npj', mo_coeff, coeff)
+ mo_coeff = opt.sort_orbitals(mo_coeff, axis=[1])
nao = mo_coeff.shape[1]
# TODO: check mol in opt?
vmata = cupy.zeros((6,nao,nao))
@@ -304,10 +303,10 @@ def contract_(ao, aoidx, wv, mask):
vmatb = vmatb[[0,1,2,
1,3,4,
2,4,5]]
- vmata = contract('npq,qj->npj', vmata, coeff)
- vmata = contract('pi,npj->nij', coeff, vmata).reshape(3,3,nao_sph,nao_sph)
- vmatb = contract('npq,qj->npj', vmatb, coeff)
- vmatb = contract('pi,npj->nij', coeff, vmatb).reshape(3,3,nao_sph,nao_sph)
+ vmata = opt.unsort_orbitals(vmata, axis=[1,2])
+ vmata = vmata.reshape(3,3,nao_sph,nao_sph)
+ vmatb = opt.unsort_orbitals(vmatb, axis=[1,2])
+ vmatb = vmatb.reshape(3,3,nao_sph,nao_sph)
return vmata, vmatb
def _make_dR_rho1(ao, ao_dm0, atm_id, aoslices, xctype):
@@ -400,8 +399,8 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
coeff = cupy.asarray(opt.coeff)
dm0a, dm0b = mf.make_rdm1(mo_coeff, mo_occ)
- dm0a_sorted = take_last2d(dm0a, opt.ao_idx)
- dm0b_sorted = take_last2d(dm0b, opt.ao_idx)
+ dm0a_sorted = opt.sort_orbitals(dm0a, axis=[0,1])
+ dm0b_sorted = opt.sort_orbitals(dm0b, axis=[0,1])
vmata_dm = cupy.zeros((mol.natm,3,3,nao))
vmatb_dm = cupy.zeros((mol.natm,3,3,nao))
ipipa = cupy.zeros((3,3,nao,nao))
@@ -423,8 +422,8 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
_d1d2_dot_(ipipa, mol, aowa, ao[1:4], mask, ao_loc, False)
aowb = [numint._scale_ao(ao[i], wv[1]) for i in range(1, 4)]
_d1d2_dot_(ipipb, mol, aowb, ao[1:4], mask, ao_loc, False)
- dm0a_mask = dm0a_sorted[numpy.ix_(mask, mask)]
- dm0b_mask = dm0b_sorted[numpy.ix_(mask, mask)]
+ dm0a_mask = dm0a_sorted[mask[:,None], mask]
+ dm0b_mask = dm0b_sorted[mask[:,None], mask]
ao_dma_mask = contract('nig,ij->njg', ao_mask[:4], dm0a_mask)
ao_dmb_mask = contract('nig,ij->njg', ao_mask[:4], dm0b_mask)
@@ -451,8 +450,8 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
t1 = log.timer_debug2('integration', *t1)
for ia in range(_sorted_mol.natm):
p0, p1 = aoslices[ia][2:]
- vmata_dm[ia] = vmata_dm[ia][:,:,opt.rev_ao_idx]
- vmatb_dm[ia] = vmatb_dm[ia][:,:,opt.rev_ao_idx]
+ vmata_dm[ia][:,:,opt._ao_idx] = vmata_dm[ia]
+ vmatb_dm[ia][:,:,opt._ao_idx] = vmatb_dm[ia]
vmata_dm[ia] += contract('xypq,pq->xyp', ipipa[:,:,:,p0:p1], dm0a[:,p0:p1])
vmatb_dm[ia] += contract('xypq,pq->xyp', ipipb[:,:,:,p0:p1], dm0b[:,p0:p1])
elif xctype == 'GGA':
@@ -476,8 +475,8 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
ao_dm0a = [numint._dot_ao_dm(mol, ao[i], dm0a, mask, shls_slice, ao_loc) for i in range(4)]
ao_dm0b = [numint._dot_ao_dm(mol, ao[i], dm0b, mask, shls_slice, ao_loc) for i in range(4)]
wf = weight * fxc
- dm0a_mask = dm0a_sorted[numpy.ix_(mask, mask)]
- dm0b_mask = dm0b_sorted[numpy.ix_(mask, mask)]
+ dm0a_mask = dm0a_sorted[mask[:,None], mask]
+ dm0b_mask = dm0b_sorted[mask[:,None], mask]
ao_dma_mask = contract('nig,ij->njg', ao_mask[:4], dm0a_mask)
ao_dmb_mask = contract('nig,ij->njg', ao_mask[:4], dm0b_mask)
vmata_dm_tmp = cupy.empty([3,3,nao_non0])
@@ -507,8 +506,8 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
ao_dm0a = ao_dm0b = aow = None
t1 = log.timer_debug2('integration', *t1)
for ia in range(_sorted_mol.natm):
- vmata_dm[ia] = vmata_dm[ia][:,:,opt.rev_ao_idx]
- vmatb_dm[ia] = vmatb_dm[ia][:,:,opt.rev_ao_idx]
+ vmata_dm[ia][:,:,opt._ao_idx] = vmata_dm[ia]
+ vmatb_dm[ia][:,:,opt._ao_idx] = vmatb_dm[ia]
p0, p1 = aoslices[ia][2:]
vmata_dm[ia] += contract('xypq,pq->xyp', ipipa[:,:,:,p0:p1], dm0a[:,p0:p1])
vmata_dm[ia] += contract('yxqp,pq->xyp', ipipa[:,:,p0:p1], dm0a[:,p0:p1])
@@ -546,8 +545,8 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
_d1d2_dot_(ipipb, mol, [aow[1], aow[3], aow[4]], [ao[YX], ao[YY], ao[YZ]], mask, ao_loc, False)
_d1d2_dot_(ipipb, mol, [aow[2], aow[4], aow[5]], [ao[ZX], ao[ZY], ao[ZZ]], mask, ao_loc, False)
- dm0a_mask = dm0a_sorted[numpy.ix_(mask, mask)]
- dm0b_mask = dm0b_sorted[numpy.ix_(mask, mask)]
+ dm0a_mask = dm0a_sorted[mask[:,None], mask]
+ dm0b_mask = dm0b_sorted[mask[:,None], mask]
ao_dm0a = [numint._dot_ao_dm(mol, ao[i], dm0a, mask, shls_slice, ao_loc) for i in range(4)]
ao_dm0b = [numint._dot_ao_dm(mol, ao[i], dm0b, mask, shls_slice, ao_loc) for i in range(4)]
ao_dma_mask = contract('nig,ij->njg', ao_mask[:4], dm0a_mask)
@@ -622,8 +621,8 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
vmatb_dm[ia][:,:,mask] += vmatb_dm_tmp
t1 = log.timer_debug2('integration', *t1)
for ia in range(_sorted_mol.natm):
- vmata_dm[ia] = vmata_dm[ia][:,:,opt.rev_ao_idx]
- vmatb_dm[ia] = vmatb_dm[ia][:,:,opt.rev_ao_idx]
+ vmata_dm[ia][:,:,opt._ao_idx] = vmata_dm[ia]
+ vmatb_dm[ia][:,:,opt._ao_idx] = vmatb_dm[ia]
p0, p1 = aoslices[ia][2:]
vmata_dm[ia] += contract('xypq,pq->xyp', ipipa[:,:,:,p0:p1], dm0a[:,p0:p1])
vmata_dm[ia] += contract('yxqp,pq->xyp', ipipa[:,:,p0:p1], dm0a[:,p0:p1])
diff --git a/gpu4pyscf/lib/CMakeLists.txt b/gpu4pyscf/lib/CMakeLists.txt
index e5115f5b..4390407e 100644
--- a/gpu4pyscf/lib/CMakeLists.txt
+++ b/gpu4pyscf/lib/CMakeLists.txt
@@ -148,6 +148,7 @@ if(BUILD_SOLVENT)
endif()
add_subdirectory(gvhf-rys)
+add_subdirectory(gvhf-md)
option(BUILD_LIBXC "Using libxc for DFT" ON)
if(BUILD_LIBXC)
diff --git a/gpu4pyscf/lib/cupy_helper.py b/gpu4pyscf/lib/cupy_helper.py
index a3b3b341..2edfd17e 100644
--- a/gpu4pyscf/lib/cupy_helper.py
+++ b/gpu4pyscf/lib/cupy_helper.py
@@ -226,9 +226,12 @@ def dist_matrix(x, y, out=None):
raise RuntimeError('failed in calculating distance matrix')
return out
-def block_c2s_diag(ncart, nsph, angular, counts):
+def block_c2s_diag(angular, counts):
'''
- constract a cartesian to spherical transformation of n shells
+ Assemble the block-diagonal Cartesian-to-spherical transformation matrix
+ Args:
+ angular (list): angular momenta of the shell blocks, e.g. [0,1,2,3]
+ counts (list): number of shells for each angular momentum
'''
if _data['c2s'] is None:
c2s_data = cupy.concatenate([cupy.asarray(x.ravel()) for x in c2s_l])
@@ -246,7 +249,8 @@ def block_c2s_diag(ncart, nsph, angular, counts):
offsets += [c2s_offset[l]] * count
rows = cupy.hstack(rows)
cols = cupy.hstack(cols)
-
+
+ ncart, nsph = int(rows[-1]), int(cols[-1])
cart2sph = cupy.zeros([ncart, nsph])
offsets = cupy.asarray(offsets, dtype='int32')
@@ -358,11 +362,12 @@ def transpose_sum(a, stream=None):
return a + a.transpose(0,2,1)
'''
assert a.flags.c_contiguous
- n = a.shape[-1]
+ out = a
if a.ndim == 2:
- a = a.reshape([-1,n,n])
+ a = a[None]
assert a.ndim == 3
- count = a.shape[0]
+ count, m, n = a.shape
+ assert m == n
stream = cupy.cuda.get_current_stream()
err = libcupy_helper.transpose_sum(
ctypes.cast(stream.ptr, ctypes.c_void_p),
@@ -372,7 +377,7 @@ def transpose_sum(a, stream=None):
)
if err != 0:
raise RuntimeError('failed in transpose_sum kernel')
- return a
+ return out
# for i > j of 2d mat, mat[j,i] = mat[i,j]
def hermi_triu(mat, hermi=1, inplace=True):
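
transpose_sum now symmetrizes in place and returns the caller's array even for
2D input (the earlier version returned the internally reshaped 3D view). A numpy
model of the contract (the CUDA kernel performs the same batched A <- A + A^T):

    import numpy as np

    def transpose_sum(a):
        out = a                      # keep a handle on the caller's array
        if a.ndim == 2:
            a = a[None]              # view; writes still land in `out`
        a[:] = a + a.transpose(0, 2, 1)
        return out

    m = np.arange(4.).reshape(2, 2)
    assert transpose_sum(m) is m and np.allclose(m, m.T)
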
@@ -911,10 +916,11 @@ def sandwich_dot(a, c, out=None):
a = a[None]
counts = a.shape[0]
m = c.shape[1]
- out = cupy.empty((counts, m, m))
+ dtype = np.result_type(a, c)
+ out = cupy.empty((counts, m, m), dtype=dtype)
tmp = None
for i in range(counts):
- tmp = cupy.dot(c.T, a[i], out=tmp)
+ tmp = cupy.dot(c.conj().T, a[i], out=tmp)
cupy.dot(tmp, c, out=out[i])
if a_ndim == 2:
out = out[0]
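
sandwich_dot now conjugates the left factor and promotes the output dtype, i.e.
out = c^H . a . c batched over the leading axis. A numpy equivalent (shapes are
arbitrary):

    import numpy as np

    def sandwich_dot(a, c):
        a = np.asarray(a)
        if a.ndim == 2:
            a = a[None]
        # out[n] = c^H @ a[n] @ c, with numpy-style dtype promotion
        return np.einsum('pi,npq,qj->nij', c.conj(), a, c)

    a = np.random.rand(3, 4, 4) + 1j * np.random.rand(3, 4, 4)
    c = np.random.rand(4, 2)
    out = sandwich_dot(a, c)    # (3, 2, 2), complex128
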
diff --git a/gpu4pyscf/lib/cusolver.py b/gpu4pyscf/lib/cusolver.py
index 27fcb0b0..454567bd 100644
--- a/gpu4pyscf/lib/cusolver.py
+++ b/gpu4pyscf/lib/cusolver.py
@@ -66,22 +66,65 @@
ctypes.c_void_p # *devInfo
]
+# https://docs.nvidia.com/cuda/cusolver/index.html#cusolverdn-t-sygvd
+libcusolver.cusolverDnZhegvd_bufferSize.argtypes = [
+ ctypes.c_void_p, # handle
+ ctypes.c_int, # itype
+ ctypes.c_int, # jobz
+ ctypes.c_int, # uplo
+ ctypes.c_int, # n
+ ctypes.c_void_p, # *A
+ ctypes.c_int, # lda
+ ctypes.c_void_p, # *B
+ ctypes.c_int, # ldb
+ ctypes.c_void_p, # *w
+ ctypes.c_void_p # *lwork
+]
+
+libcusolver.cusolverDnZhegvd.argtypes = [
+ ctypes.c_void_p, # handle
+ ctypes.c_int, # itype
+ ctypes.c_int, # jobz
+ ctypes.c_int, # uplo
+ ctypes.c_int, # n
+ ctypes.c_void_p, # *A
+ ctypes.c_int, # lda
+ ctypes.c_void_p, # *B
+ ctypes.c_int, # ldb
+ ctypes.c_void_p, # *w
+ ctypes.c_void_p, # *work
+ ctypes.c_int, # lwork
+ ctypes.c_void_p # *devInfo
+]
+
def eigh(h, s):
'''
solve generalized eigenvalue problem
'''
+ assert h.dtype == s.dtype
+ assert h.dtype in (np.float64, np.complex128)
n = h.shape[0]
w = cupy.zeros(n)
- A = h.copy()
- B = s.copy()
+ if h.dtype == np.complex128 and h.flags.c_contiguous:
+ # zhegvd expects F-ordered matrices. For a Hermitian matrix,
+ # .conj() is equivalent to .T.copy(), so no explicit transpose is needed.
+ A = h.conj()
+ B = s.conj()
+ else:
+ A = h.copy()
+ B = s.copy()
_handle = device.get_cusolver_handle()
# TODO: reuse workspace
- if n in _buffersize:
- lwork = _buffersize[n]
+ if (h.dtype, n) in _buffersize:
+ lwork = _buffersize[h.dtype, n]
else:
- lwork = ctypes.c_int()
- status = libcusolver.cusolverDnDsygvd_bufferSize(
+ lwork = ctypes.c_int(0)
+ if h.dtype == np.float64:
+ fn = libcusolver.cusolverDnDsygvd_bufferSize
+ else:
+ fn = libcusolver.cusolverDnZhegvd_bufferSize
+ status = fn(
_handle,
CUSOLVER_EIG_TYPE_1,
CUSOLVER_EIG_MODE_VECTOR,
@@ -98,10 +141,14 @@ def eigh(h, s):
if status != 0:
raise RuntimeError("failed in buffer size")
-
- work = cupy.empty(lwork)
+
+ if h.dtype == np.float64:
+ fn = libcusolver.cusolverDnDsygvd
+ else:
+ fn = libcusolver.cusolverDnZhegvd
+ work = cupy.empty(lwork, dtype=h.dtype)
devInfo = cupy.empty(1, dtype=np.int32)
- status = libcusolver.cusolverDnDsygvd(
+ status = fn(
_handle,
CUSOLVER_EIG_TYPE_1,
CUSOLVER_EIG_MODE_VECTOR,
@@ -116,7 +163,7 @@ def eigh(h, s):
lwork,
devInfo.data.ptr
)
-
+
if status != 0:
raise RuntimeError("failed in eigh kernel")
return w, A.T
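
With the zhegvd branch the generalized eigensolver now accepts complex128
Hermitian pencils. A cross-check against scipy (matrix sizes are arbitrary):

    import numpy as np
    import scipy.linalg

    n = 6
    h = np.random.rand(n, n) + 1j * np.random.rand(n, n)
    h = h + h.conj().T                      # Hermitian H
    s = np.random.rand(n, n) + 1j * np.random.rand(n, n)
    s = s @ s.conj().T + n * np.eye(n)      # positive-definite S
    w_ref, v_ref = scipy.linalg.eigh(h, s)  # reference for H v = w S v
    # GPU side: from gpu4pyscf.lib.cusolver import eigh
    # w, v = eigh(cupy.asarray(h), cupy.asarray(s))
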
@@ -126,10 +173,14 @@ def cholesky(A):
assert A.flags['C_CONTIGUOUS']
x = A.copy()
handle = device.get_cusolver_handle()
- potrf = cusolver.dpotrf
- potrf_bufferSize = cusolver.dpotrf_bufferSize
+ if A.dtype == np.float64:
+ potrf = cusolver.dpotrf
+ potrf_bufferSize = cusolver.dpotrf_bufferSize
+ else:
+ potrf = cusolver.zpotrf
+ potrf_bufferSize = cusolver.zpotrf_bufferSize
buffersize = potrf_bufferSize(handle, cublas.CUBLAS_FILL_MODE_UPPER, n, x.data.ptr, n)
- workspace = cupy.empty(buffersize)
+ workspace = cupy.empty(buffersize, dtype=A.dtype)
dev_info = cupy.empty(1, dtype=np.int32)
potrf(handle, cublas.CUBLAS_FILL_MODE_UPPER, n, x.data.ptr, n,
workspace.data.ptr, buffersize, dev_info.data.ptr)
@@ -137,4 +188,4 @@ def cholesky(A):
if dev_info[0] != 0:
raise RuntimeError('failed to perform Cholesky Decomposition')
cupy.linalg._util._tril(x,k=0)
- return x
\ No newline at end of file
+ return x
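
cholesky likewise dispatches on dtype (dpotrf vs zpotrf) and sizes the workspace
with the matrix dtype. A reference computation to compare against (the input is
a made-up Hermitian positive-definite matrix):

    import numpy as np

    a = np.random.rand(5, 5) + 1j * np.random.rand(5, 5)
    a = a @ a.conj().T + 5 * np.eye(5)   # Hermitian positive definite
    l_ref = np.linalg.cholesky(a)        # lower-triangular reference factor
    # GPU side: from gpu4pyscf.lib.cusolver import cholesky
    # l = cholesky(cupy.asarray(a))
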
diff --git a/gpu4pyscf/lib/cutensor.py b/gpu4pyscf/lib/cutensor.py
index 07d35547..573e1777 100644
--- a/gpu4pyscf/lib/cutensor.py
+++ b/gpu4pyscf/lib/cutensor.py
@@ -42,20 +42,20 @@ def _auto_create_mode(array, mode):
'ndim mismatch: {} != {}'.format(array.ndim, mode.ndim))
return mode
-def _create_tensor_descriptor(a):
- handle = cutensor._get_handle()
- key = (handle.ptr, a.dtype, tuple(a.shape), tuple(a.strides))
- # hard coded
- alignment_req = 8
- if key not in _tensor_descriptors:
- num_modes = a.ndim
- extent = np.array(a.shape, dtype=np.int64)
- stride = np.array(a.strides, dtype=np.int64) // a.itemsize
- cutensor_dtype = cutensor._get_cutensor_dtype(a.dtype)
- _tensor_descriptors[key] = cutensor.TensorDescriptor(
- handle.ptr, num_modes, extent.ctypes.data, stride.ctypes.data,
- cutensor_dtype, alignment_req=alignment_req)
- return _tensor_descriptors[key]
+#def _create_tensor_descriptor(a):
+# handle = cutensor._get_handle()
+# key = (handle.ptr, a.dtype, tuple(a.shape), tuple(a.strides))
+# # hard coded
+# alignment_req = 8
+# if key not in _tensor_descriptors:
+# num_modes = a.ndim
+# extent = np.array(a.shape, dtype=np.int64)
+# stride = np.array(a.strides, dtype=np.int64) // a.itemsize
+# cutensor_dtype = cutensor._get_cutensor_dtype(a.dtype)
+# _tensor_descriptors[key] = cutensor.TensorDescriptor(
+# handle.ptr, num_modes, extent.ctypes.data, stride.ctypes.data,
+# cutensor_dtype, alignment_req=alignment_req)
+# return _tensor_descriptors[key]
def contraction(
pattern, a, b, alpha, beta,
@@ -80,14 +80,14 @@ def contraction(
mode_b = list(str_b)
mode_c = list(str_c)
- if(out is not None):
- c = out
- else:
- c = cupy.empty([shape[k] for k in str_c], order='C')
+ if out is None:
+ dtype = np.result_type(a, b, alpha)
+ out = cupy.empty([shape[k] for k in str_c], order='C', dtype=dtype)
+ c = out
- desc_a = _create_tensor_descriptor(a)
- desc_b = _create_tensor_descriptor(b)
- desc_c = _create_tensor_descriptor(c)
+ desc_a = cutensor.create_tensor_descriptor(a)
+ desc_b = cutensor.create_tensor_descriptor(b)
+ desc_c = cutensor.create_tensor_descriptor(c)
mode_a = _auto_create_mode(a, mode_a)
mode_b = _auto_create_mode(b, mode_b)
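
When no output buffer is supplied, contraction now allocates it with numpy-style
promotion over (a, b, alpha) instead of assuming float64. The rule it follows:

    import numpy as np

    a = np.empty((3, 4))                  # float64
    b = np.empty((4, 5), dtype=np.complex128)
    print(np.result_type(a, b, 1.0))      # complex128: the allocated out dtype
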
diff --git a/gpu4pyscf/lib/gdft/contract_rho.cu b/gpu4pyscf/lib/gdft/contract_rho.cu
index 5c6dbd1c..1f6a6939 100644
--- a/gpu4pyscf/lib/gdft/contract_rho.cu
+++ b/gpu4pyscf/lib/gdft/contract_rho.cu
@@ -56,6 +56,7 @@ void GDFTcontract_rho_kernel(double *rho, double *bra, double *ket, int ngrids,
}
}
+// computes half of the GGA rho; the caller restores the factor of two
__global__
void GDFTcontract_rho4_kernel(double *rho, double *bra, double *ket, int ngrids, int nao, int count)
{
@@ -109,7 +110,7 @@ void GDFTcontract_rho_gga_kernel(double *rho, double *bra, double *ket, int ngri
double v[4] = {0.0, 0.0, 0.0, 0.0};
if (active){
for (int ao_id = threadIdx.y; ao_id < nao; ao_id += BLKSIZEY) {
- int ket_idx = grid_id + ao_id * Ngrids;
+ size_t ket_idx = grid_id + ao_id * Ngrids;
double bra_tmp = bra[ket_idx];
double ket_tmp = ket[ket_idx];
@@ -143,7 +144,7 @@ void GDFTcontract_rho_gga_kernel(double *rho, double *bra, double *ket, int ngri
if (blockDim.y >= 2 && iy < 1) buf[ixy] += buf[ixy + BLKSIZEX * 1]; __syncthreads();
if (iy == 0 && active) {
- rho[grid_id + ngrids * i] = 2.0 * buf[ix];
+ rho[grid_id + ngrids * i] = buf[ix];
}
}
}
@@ -161,7 +162,7 @@ void GDFTcontract_rho_mgga_kernel(double *rho, double *bra, double *ket, int ngr
double v[5] = {0.0, 0.0, 0.0, 0.0, 0.0};
if (active){
for (int ao_id = threadIdx.y; ao_id < nao; ao_id += BLKSIZEY) {
- int ket_idx = grid_id + ao_id * Ngrids;
+ size_t ket_idx = grid_id + ao_id * Ngrids;
double bra_tmp0 = bra[ket_idx];
double ket_tmp0 = ket[ket_idx];
@@ -207,7 +208,7 @@ void GDFTcontract_rho_mgga_kernel(double *rho, double *bra, double *ket, int ngr
if (blockDim.y >= 2 && iy < 1) buf[ixy] += buf[ixy + BLKSIZEX * 1]; __syncthreads();
if (iy == 0 && active) {
- rho[grid_id + ngrids * i] = 2.0 * buf[ix];
+ rho[grid_id + ngrids * i] = buf[ix];
}
}
}
@@ -358,4 +359,4 @@ int GDFTscale_ao(cudaStream_t stream, double *out, double *ket, double *wv,
return 0;
}
-}
\ No newline at end of file
+}
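
The int -> size_t change in the rho kernels matters for large systems: the flat
index grid_id + ao_id * Ngrids can exceed the 32-bit signed range well within
realistic sizes (the dropped 2.0 factors, meanwhile, shift the doubling to the
callers, per the "half of the GGA rho" note above). A quick bound:

    nao, ngrids = 4000, 1_000_000
    max_idx = (nao - 1) * ngrids + (ngrids - 1)
    assert max_idx > 2**31 - 1    # ~4.0e9 overflows a 32-bit int index
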
diff --git a/gpu4pyscf/lib/gdft/libxc.cu b/gpu4pyscf/lib/gdft/libxc.cu
index 639eecc6..3eeb1b76 100644
--- a/gpu4pyscf/lib/gdft/libxc.cu
+++ b/gpu4pyscf/lib/gdft/libxc.cu
@@ -73,37 +73,121 @@ void _memset_lda(xc_lda_out_params *out, int order, int np, const xc_dimensions
if(order >= 0) cudaMemset(out->zk, 0, sizeof(double)*np*dim->zk);
if(order >= 1) cudaMemset(out->vrho, 0, sizeof(double)*np*dim->vrho);
if(order >= 2) cudaMemset(out->v2rho2, 0, sizeof(double)*np*dim->v2rho2);
+ if(order >= 3) cudaMemset(out->v3rho3, 0, sizeof(double)*np*dim->v3rho3);
+ if(order >= 4) cudaMemset(out->v4rho4, 0, sizeof(double)*np*dim->v4rho4);
}
__host__
void _memset_gga(xc_gga_out_params *out, int order, int np, const xc_dimensions *dim){
if(order >= 0) cudaMemset(out->zk, 0, sizeof(double)*np*dim->zk);
- if(order >= 1) cudaMemset(out->vrho, 0, sizeof(double)*np*dim->vrho);
- if(order >= 1) cudaMemset(out->vsigma, 0, sizeof(double)*np*dim->vsigma); // (sigma, lapl, tau)
- if(order >= 2) cudaMemset(out->v2rho2, 0, sizeof(double)*np*dim->v2rho2);
- if(order >= 2) cudaMemset(out->v2rhosigma, 0, sizeof(double)*np*dim->v2rhosigma);
- if(order >= 2) cudaMemset(out->v2sigma2, 0, sizeof(double)*np*dim->v2sigma2);
+ if(order >= 1) {
+ cudaMemset(out->vrho, 0, sizeof(double)*np*dim->vrho);
+ cudaMemset(out->vsigma, 0, sizeof(double)*np*dim->vsigma); // (sigma, lapl, tau)
+ }
+ if(order >= 2) {
+ cudaMemset(out->v2rho2, 0, sizeof(double)*np*dim->v2rho2);
+ cudaMemset(out->v2rhosigma, 0, sizeof(double)*np*dim->v2rhosigma);
+ cudaMemset(out->v2sigma2, 0, sizeof(double)*np*dim->v2sigma2);
+ }
+ if(order >= 3) {
+ cudaMemset(out->v3rho3, 0, sizeof(double)*np*dim->v3rho3);
+ cudaMemset(out->v3rho2sigma, 0, sizeof(double)*np*dim->v3rho2sigma);
+ cudaMemset(out->v3rhosigma2, 0, sizeof(double)*np*dim->v3rhosigma2);
+ cudaMemset(out->v3sigma3, 0, sizeof(double)*np*dim->v3sigma3);
+ }
+ if(order >= 4) {
+ cudaMemset(out->v4rho4, 0, sizeof(double)*np*dim->v4rho4);
+ cudaMemset(out->v4rho3sigma, 0, sizeof(double)*np*dim->v4rho3sigma);
+ cudaMemset(out->v4rho2sigma2, 0, sizeof(double)*np*dim->v4rho2sigma2);
+ cudaMemset(out->v4rhosigma3, 0, sizeof(double)*np*dim->v4rhosigma3);
+ cudaMemset(out->v4sigma4, 0, sizeof(double)*np*dim->v4sigma4);
+ }
}
__host__
void _memset_mgga(xc_mgga_out_params *out, int order, int np, const xc_dimensions *dim){
if(order >= 0) cudaMemset(out->zk, 0, sizeof(double)*np*dim->zk);
- if(order >= 1) cudaMemset(out->vrho, 0, sizeof(double)*np*dim->vrho);
- if(order >= 1) cudaMemset(out->vsigma, 0, sizeof(double)*np*dim->vsigma);
- if(order >= 1 && out->vlapl != NULL) cudaMemset(out->vlapl, 0, sizeof(double)*np*dim->vlapl); // (sigma, lapl, tau)
- if(order >= 1) cudaMemset(out->vtau, 0, sizeof(double)*np*dim->vtau);
+ if(order >= 1) {
+ cudaMemset(out->vrho, 0, sizeof(double)*np*dim->vrho);
+ cudaMemset(out->vsigma, 0, sizeof(double)*np*dim->vsigma);
+ cudaMemset(out->vtau, 0, sizeof(double)*np*dim->vtau);
+ if(out->vlapl != NULL) cudaMemset(out->vlapl, 0, sizeof(double)*np*dim->vlapl); // (sigma, lapl, tau)
+ }
- if(order >= 2) cudaMemset(out->v2rho2, 0, sizeof(double)*np*dim->v2rho2);
- if(order >= 2) cudaMemset(out->v2rhosigma, 0, sizeof(double)*np*dim->v2rhosigma);
- if(order >= 2 && out->v2rholapl != NULL) cudaMemset(out->v2rholapl, 0, sizeof(double)*np*dim->v2rholapl);
- if(order >= 2) cudaMemset(out->v2rhotau, 0, sizeof(double)*np*dim->v2rhotau);
- if(order >= 2) cudaMemset(out->v2sigma2, 0, sizeof(double)*np*dim->v2sigma2);
- if(order >= 2 && out->v2sigmalapl != NULL) cudaMemset(out->v2sigmalapl, 0, sizeof(double)*np*dim->v2sigmalapl);
- if(order >= 2) cudaMemset(out->v2sigmatau, 0, sizeof(double)*np*dim->v2sigmatau);
- if(order >= 2 && out->v2lapl2 != NULL) cudaMemset(out->v2lapl2, 0, sizeof(double)*np*dim->v2lapl2);
- if(order >= 2 && out->v2lapltau != NULL) cudaMemset(out->v2lapltau, 0, sizeof(double)*np*dim->v2lapltau);
- if(order >= 2) cudaMemset(out->v2tau2, 0, sizeof(double)*np*dim->v2tau2);
+ if(order >= 2) {
+ cudaMemset(out->v2rho2, 0, sizeof(double)*np*dim->v2rho2);
+ cudaMemset(out->v2rhosigma, 0, sizeof(double)*np*dim->v2rhosigma);
+ cudaMemset(out->v2rhotau, 0, sizeof(double)*np*dim->v2rhotau);
+ cudaMemset(out->v2sigma2, 0, sizeof(double)*np*dim->v2sigma2);
+ cudaMemset(out->v2sigmatau, 0, sizeof(double)*np*dim->v2sigmatau);
+ cudaMemset(out->v2tau2, 0, sizeof(double)*np*dim->v2tau2);
+ if(out->v2rholapl != NULL) cudaMemset(out->v2rholapl, 0, sizeof(double)*np*dim->v2rholapl);
+ if(out->v2sigmalapl != NULL) cudaMemset(out->v2sigmalapl, 0, sizeof(double)*np*dim->v2sigmalapl);
+ if(out->v2lapl2 != NULL) cudaMemset(out->v2lapl2, 0, sizeof(double)*np*dim->v2lapl2);
+ if(out->v2lapltau != NULL) cudaMemset(out->v2lapltau, 0, sizeof(double)*np*dim->v2lapltau);
+ }
+
+ if (order >= 3) {
+ cudaMemset(out->v3rho3 , 0, sizeof(double)*np*dim->v3rho3);
+ cudaMemset(out->v3rho2sigma , 0, sizeof(double)*np*dim->v3rho2sigma);
+ cudaMemset(out->v3rho2tau , 0, sizeof(double)*np*dim->v3rho2tau);
+ cudaMemset(out->v3rhosigma2 , 0, sizeof(double)*np*dim->v3rhosigma2);
+ cudaMemset(out->v3rhosigmatau , 0, sizeof(double)*np*dim->v3rhosigmatau);
+ cudaMemset(out->v3rhotau2 , 0, sizeof(double)*np*dim->v3rhotau2);
+ cudaMemset(out->v3sigma3 , 0, sizeof(double)*np*dim->v3sigma3);
+ cudaMemset(out->v3sigma2tau , 0, sizeof(double)*np*dim->v3sigma2tau);
+ cudaMemset(out->v3sigmatau2 , 0, sizeof(double)*np*dim->v3sigmatau2);
+ cudaMemset(out->v3tau3 , 0, sizeof(double)*np*dim->v3tau3);
+ if (out->v3rho2lapl != NULL) cudaMemset(out->v3rho2lapl , 0, sizeof(double)*np*dim->v3rho2lapl);
+ if (out->v3rhosigmalapl!= NULL) cudaMemset(out->v3rhosigmalapl, 0, sizeof(double)*np*dim->v3rhosigmalapl);
+ if (out->v3rholapl2 != NULL) cudaMemset(out->v3rholapl2 , 0, sizeof(double)*np*dim->v3rholapl2);
+ if (out->v3rholapltau != NULL) cudaMemset(out->v3rholapltau , 0, sizeof(double)*np*dim->v3rholapltau);
+ if (out->v3sigma2lapl != NULL) cudaMemset(out->v3sigma2lapl , 0, sizeof(double)*np*dim->v3sigma2lapl);
+ if (out->v3sigmalapl2 != NULL) cudaMemset(out->v3sigmalapl2 , 0, sizeof(double)*np*dim->v3sigmalapl2);
+ if (out->v3sigmalapltau!= NULL) cudaMemset(out->v3sigmalapltau, 0, sizeof(double)*np*dim->v3sigmalapltau);
+ if (out->v3lapl3 != NULL) cudaMemset(out->v3lapl3 , 0, sizeof(double)*np*dim->v3lapl3);
+ if (out->v3lapl2tau != NULL) cudaMemset(out->v3lapl2tau , 0, sizeof(double)*np*dim->v3lapl2tau);
+ if (out->v3lapltau2 != NULL) cudaMemset(out->v3lapltau2 , 0, sizeof(double)*np*dim->v3lapltau2);
+ }
+
+ if (order >= 4) {
+ cudaMemset(out->v4rho4 , 0, sizeof(double)*np*dim->v4rho4);
+ cudaMemset(out->v4rho3sigma , 0, sizeof(double)*np*dim->v4rho3sigma);
+ cudaMemset(out->v4rho3tau , 0, sizeof(double)*np*dim->v4rho3tau);
+ cudaMemset(out->v4rho2sigma2 , 0, sizeof(double)*np*dim->v4rho2sigma2);
+ cudaMemset(out->v4rho2sigmatau , 0, sizeof(double)*np*dim->v4rho2sigmatau);
+ cudaMemset(out->v4rho2tau2 , 0, sizeof(double)*np*dim->v4rho2tau2);
+ cudaMemset(out->v4rhosigma3 , 0, sizeof(double)*np*dim->v4rhosigma3);
+ cudaMemset(out->v4rhosigma2tau , 0, sizeof(double)*np*dim->v4rhosigma2tau);
+ cudaMemset(out->v4rhosigmatau2 , 0, sizeof(double)*np*dim->v4rhosigmatau2);
+ cudaMemset(out->v4rhotau3 , 0, sizeof(double)*np*dim->v4rhotau3);
+ cudaMemset(out->v4sigma4 , 0, sizeof(double)*np*dim->v4sigma4);
+ cudaMemset(out->v4sigma3tau , 0, sizeof(double)*np*dim->v4sigma3tau);
+ cudaMemset(out->v4sigma2tau2 , 0, sizeof(double)*np*dim->v4sigma2tau2);
+ cudaMemset(out->v4sigmatau3 , 0, sizeof(double)*np*dim->v4sigmatau3);
+ cudaMemset(out->v4tau4 , 0, sizeof(double)*np*dim->v4tau4);
+ if (out->v4rho3lapl != NULL) cudaMemset(out->v4rho3lapl , 0, sizeof(double)*np*dim->v4rho3lapl);
+ if (out->v4rho2sigmalapl != NULL) cudaMemset(out->v4rho2sigmalapl , 0, sizeof(double)*np*dim->v4rho2sigmalapl);
+ if (out->v4rho2lapl2 != NULL) cudaMemset(out->v4rho2lapl2 , 0, sizeof(double)*np*dim->v4rho2lapl2);
+ if (out->v4rho2lapltau != NULL) cudaMemset(out->v4rho2lapltau , 0, sizeof(double)*np*dim->v4rho2lapltau);
+ if (out->v4rhosigma2lapl != NULL) cudaMemset(out->v4rhosigma2lapl , 0, sizeof(double)*np*dim->v4rhosigma2lapl);
+ if (out->v4rhosigmalapl2 != NULL) cudaMemset(out->v4rhosigmalapl2 , 0, sizeof(double)*np*dim->v4rhosigmalapl2);
+ if (out->v4rhosigmalapltau!= NULL) cudaMemset(out->v4rhosigmalapltau, 0, sizeof(double)*np*dim->v4rhosigmalapltau);
+ if (out->v4rholapl3 != NULL) cudaMemset(out->v4rholapl3 , 0, sizeof(double)*np*dim->v4rholapl3);
+ if (out->v4rholapl2tau != NULL) cudaMemset(out->v4rholapl2tau , 0, sizeof(double)*np*dim->v4rholapl2tau);
+ if (out->v4rholapltau2 != NULL) cudaMemset(out->v4rholapltau2 , 0, sizeof(double)*np*dim->v4rholapltau2);
+ if (out->v4sigma3lapl != NULL) cudaMemset(out->v4sigma3lapl , 0, sizeof(double)*np*dim->v4sigma3lapl);
+ if (out->v4sigma2lapl2 != NULL) cudaMemset(out->v4sigma2lapl2 , 0, sizeof(double)*np*dim->v4sigma2lapl2);
+ if (out->v4sigma2lapltau != NULL) cudaMemset(out->v4sigma2lapltau , 0, sizeof(double)*np*dim->v4sigma2lapltau);
+ if (out->v4sigmalapl3 != NULL) cudaMemset(out->v4sigmalapl3 , 0, sizeof(double)*np*dim->v4sigmalapl3);
+ if (out->v4sigmalapl2tau != NULL) cudaMemset(out->v4sigmalapl2tau , 0, sizeof(double)*np*dim->v4sigmalapl2tau);
+ if (out->v4sigmalapltau2 != NULL) cudaMemset(out->v4sigmalapltau2 , 0, sizeof(double)*np*dim->v4sigmalapltau2);
+ if (out->v4lapl4 != NULL) cudaMemset(out->v4lapl4 , 0, sizeof(double)*np*dim->v4lapl4);
+ if (out->v4lapl3tau != NULL) cudaMemset(out->v4lapl3tau , 0, sizeof(double)*np*dim->v4lapl3tau);
+ if (out->v4lapl2tau2 != NULL) cudaMemset(out->v4lapl2tau2 , 0, sizeof(double)*np*dim->v4lapl2tau2);
+ if (out->v4lapltau3 != NULL) cudaMemset(out->v4lapltau3 , 0, sizeof(double)*np*dim->v4lapltau3);
+ }
}
__host__
diff --git a/gpu4pyscf/lib/gvhf-md/CMakeLists.txt b/gpu4pyscf/lib/gvhf-md/CMakeLists.txt
new file mode 100644
index 00000000..c241d1c2
--- /dev/null
+++ b/gpu4pyscf/lib/gvhf-md/CMakeLists.txt
@@ -0,0 +1,17 @@
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --ptxas-options=-v") # -maxrregcount=128
+
+add_library(gvhf_md SHARED
+ md_contract_j.cu md_j_driver.cu md_pairdata.c unrolled_md_j.cu
+)
+
+#option(BUILD_SHARED_LIBS "build shared libraries" 1)
+#option(ENABLE_STATIC "Enforce static library build" 0)
+#if(ENABLE_STATIC)
+# set(BUILD_SHARED_LIBS 0)
+#endif()
+
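+# Separable compilation is needed because md_contract_j.cu references the
+# __constant__ tables defined in md_j_driver.cu, which in turn launches
+# md_j_kernel defined in md_contract_j.cu.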
+set_target_properties(gvhf_md PROPERTIES
+ LIBRARY_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}
+ CUDA_SEPARABLE_COMPILATION ON)
+
+target_link_libraries(gvhf_md OpenMP::OpenMP_C)
diff --git a/gpu4pyscf/lib/gvhf-md/md_contract_j.cu b/gpu4pyscf/lib/gvhf-md/md_contract_j.cu
new file mode 100644
index 00000000..2d1b3a12
--- /dev/null
+++ b/gpu4pyscf/lib/gvhf-md/md_contract_j.cu
@@ -0,0 +1,467 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <stdio.h>
+#include <stdint.h>
+
+#include "gvhf-rys/vhf.cuh"
+#include "gvhf-rys/gamma_inc.cu"
+
+#define TILEX 2
+#define TILEY 4
+
+extern __constant__ uint16_t c_Rt_idx[];
+extern __constant__ uint16_t c_Rt_offsets[];
+
+#define ADDR(l, t, u, v) \
+ ((l+1)*(l+2)*(l+3)/6 - ((l)-(t)+1)*((l)-(t)+2)*((l)-(t)+3)/6 + \
+ ((l)-(t)+1)*((l)-(t)+2)/2 - ((l)-(t)-(u)+1)*((l)-(t)-(u)+2)/2 + (v))
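+// ADDR maps a Hermite index (t,u,v), t+u+v <= l, to its position in the
+// flattened R array, v running fastest, then u, then t. For l = 2:
+// ADDR(2,0,0,0)=0, ADDR(2,0,0,1)=1, ADDR(2,0,0,2)=2, ADDR(2,0,1,0)=3, ...,
+// ADDR(2,1,0,0)=6, ADDR(2,2,0,0)=9, matching the case-2 layout below.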
+
+__device__
+static void iter_Rt_n(double *out, double *Rt, double rx, double ry, double rz,
+ int l, int sq_id, int nsq_per_block)
+{
+ uint16_t *p1 = c_Rt_idx + c_Rt_offsets[l];
+ double *pout = out + nsq_per_block;
+ int k = 0;
+ for (int v = 0, i = 0; v < l; ++v) {
+ pout[sq_id+k*nsq_per_block] = rz * Rt[sq_id+i*nsq_per_block] + v * Rt[sq_id+p1[k]*nsq_per_block];
+ ++k; ++i;
+ }
+ for (int u = 0, i = 0; u < l; ++u) {
+ for (int v = 0; v < l-u; ++v) {
+ pout[sq_id+k*nsq_per_block] = ry * Rt[sq_id+i*nsq_per_block] + u * Rt[sq_id+p1[k]*nsq_per_block];
+ ++k; ++i;
+ }
+ }
+ //int nf3 = l*(l+1)*(l+2)/6;
+ //Fold3Index *fold3idx = c_i_in_fold3idx + (l-1)*nf3/4;
+ //for (int i = 0; i < nf3; ++i) {
+ // Fold3Index f3i = fold3idx[i];
+ // int t = f3i.x;
+ // pout[sq_id+(k+i)*nsq_per_block] = rx * Rt[sq_id+i*nsq_per_block]
+ // + t * Rt[sq_id+p1[k+i]*nsq_per_block];
+ //}
+ for (int t = 0, i = 0; t < l; ++t) {
+ // corresponding to the nested loops
+ // for (u = 0; u < l-t; ++u) for (v = 0; v < l-t-u; ++v)
+ for (int uv = 0; uv < (l-t) * (l-t+1) / 2; ++uv) {
+ pout[sq_id+(k+i)*nsq_per_block] = rx * Rt[sq_id+i*nsq_per_block]
+ + t * Rt[sq_id+p1[k+i]*nsq_per_block];
+ ++i;
+ }
+ }
+}
+
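+// Both iter_Rt_n above and the unrolled switch cases in md_j_kernel below
+// build the Hermite integrals R^(n)_{t,u,v} by the McMurchie-Davidson
+// recurrence, e.g. R^(n)_{t+1,u,v} = xpq * R^(n+1)_{t,u,v}
+// + t * R^(n+1)_{t-1,u,v} (and likewise for u/ypq and v/zpq). c_Rt_idx
+// stores the flat offset of the R_{t-1,u,v} term; where the integer factor
+// (t, u or v) is zero that term drops out, which is why the tables in
+// md_j_driver.cu contain long runs of zeros.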
+#if CUDA_VERSION >= 12040
+__global__ __maxnreg__(128)
+#else
+__global__
+#endif
+void md_j_kernel(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds)
+{
+ int *pair_ij_mapping = bounds.tile_ij_mapping;
+ int *pair_kl_mapping = bounds.tile_kl_mapping;
+ int threadsx = blockDim.x;
+ int threadsy = blockDim.y;
+ int bsizex = threadsx * TILEX;
+ int bsizey = threadsy * TILEY;
+ int task_ij0 = blockIdx.x * bsizex;
+ int task_kl0 = blockIdx.y * bsizey;
+ int pair_ij0 = pair_ij_mapping[task_ij0];
+ int pair_kl0 = pair_kl_mapping[task_kl0];
+ float *q_cond = bounds.q_cond;
+ if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) {
+ return;
+ }
+
+ int tx = threadIdx.x;
+ int ty = threadIdx.y;
+ int sq_id = tx + threadsx * ty;
+ int nsq_per_block = threadsx * threadsy;
+ int gout_id = threadIdx.z;
+ int gout_stride = blockDim.z;
+ int t_id = sq_id + nsq_per_block * gout_id;
+ int threads = nsq_per_block * gout_stride;
+ int li = bounds.li;
+ int lj = bounds.lj;
+ int lk = bounds.lk;
+ int ll = bounds.ll;
+ int lij = li + lj;
+ int lkl = lk + ll;
+ int order = lij + lkl;
+ int nf3ijkl = (order+1)*(order+2)*(order+3)/6;
+ int *bas = envs.bas;
+ int *dm_pair_loc = envs.ao_loc;
+ int nbas = envs.nbas;
+ double *env = envs.env;
+ double *dm = jk.dm;
+ double *vj = jk.vj;
+ int nf3ij = (lij+1)*(lij+2)*(lij+3)/6;
+ int nf3kl = (lkl+1)*(lkl+2)*(lkl+3)/6;
+ int ij_fold3idx_cum = lij*nf3ij/4;
+ int kl_fold3idx_cum = lkl*nf3kl/4;
+ Fold3Index *ij_fold3idx = c_i_in_fold3idx + ij_fold3idx_cum;
+ Fold3Index *kl_fold3idx = c_i_in_fold3idx + kl_fold3idx_cum;
+
+ int npairs_ij = bounds.npairs_ij;
+ int npairs_kl = bounds.npairs_kl;
+ extern __shared__ double gamma_inc[];
+ double *Rp_cache = gamma_inc + (order+1) * nsq_per_block;
+ double *Rq_cache = Rp_cache + bsizex*4;
+ double *vj_ij_cache = Rq_cache + bsizey*4;
+ double *vj_kl_cache = vj_ij_cache + nf3ij * bsizex;
+
+ // zero out all cache;
+ for (int n = t_id; n < (bsizex*4 + bsizey*4 + nf3ij*bsizex + nf3kl*bsizey); n += threads) {
+ Rp_cache[n] = 0.;
+ }
+ __syncthreads();
+ if (t_id < bsizex) {
+ int task_ij = blockIdx.x * bsizex + t_id;
+ if (task_ij < npairs_ij) {
+ int pair_ij = pair_ij_mapping[task_ij];
+ int ish = pair_ij / nbas;
+ int jsh = pair_ij % nbas;
+ double ai = env[bas[ish*BAS_SLOTS+PTR_EXP]];
+ double aj = env[bas[jsh*BAS_SLOTS+PTR_EXP]];
+ double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+ double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+ double aij = ai + aj;
+ double xij = (ai * ri[0] + aj * rj[0]) / aij;
+ double yij = (ai * ri[1] + aj * rj[1]) / aij;
+ double zij = (ai * ri[2] + aj * rj[2]) / aij;
+ Rp_cache[t_id+0*bsizex] = xij;
+ Rp_cache[t_id+1*bsizex] = yij;
+ Rp_cache[t_id+2*bsizex] = zij;
+ Rp_cache[t_id+3*bsizex] = aij;
+ } else {
+ Rp_cache[t_id+3*bsizex] = 1.;
+ }
+ }
+ if (t_id < bsizey) {
+ int task_kl = blockIdx.y * bsizey + t_id;
+ if (task_kl < npairs_kl) {
+ int pair_kl = pair_kl_mapping[task_kl];
+ int ksh = pair_kl / nbas;
+ int lsh = pair_kl % nbas;
+ double ak = env[bas[ksh*BAS_SLOTS+PTR_EXP]];
+ double al = env[bas[lsh*BAS_SLOTS+PTR_EXP]];
+ double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+ double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD];
+ double akl = ak + al;
+ double xkl = (ak * rk[0] + al * rl[0]) / akl;
+ double ykl = (ak * rk[1] + al * rl[1]) / akl;
+ double zkl = (ak * rk[2] + al * rl[2]) / akl;
+ Rq_cache[t_id+0*bsizey] = xkl;
+ Rq_cache[t_id+1*bsizey] = ykl;
+ Rq_cache[t_id+2*bsizey] = zkl;
+ Rq_cache[t_id+3*bsizey] = akl;
+ } else {
+ Rq_cache[t_id+3*bsizey] = 1.;
+ }
+ }
+ //for (int n = ty+threadsy*gout_id; n < nf3ij*TILEX; n += threadsy*gout_stride) {
+ // int i = n / TILEX;
+ // int tile = n % TILEX;
+ // int task_ij = blockIdx.x * bsizex + tile * threadsx + tx;
+ // if (task_ij < npairs_ij) {
+ // int pair_ij = pair_ij_mapping[task_ij];
+ // int dm_ij_pair0 = dm_pair_loc[pair_ij];
+ // int sq_ij = tx + tile * threadsx;
+ // dm_ij_cache[sq_ij+i*bsizex] = dm[dm_ij_pair0+i];
+ // }
+ //}
+ //for (int n = tx+threadsx*gout_id; n < nf3kl*TILEY; n += threadsx*gout_stride) {
+ // int i = n / TILEY;
+ // int tile = n % TILEY;
+ // int task_kl = blockIdx.y * bsizey + tile * threadsy + ty;
+ // if (task_kl < npairs_kl) {
+ // int pair_kl = pair_kl_mapping[task_kl];
+ // int dm_kl_pair0 = dm_pair_loc[pair_kl];
+ // int sq_kl = ty + tile * threadsy;
+ // dm_kl_cache[sq_kl+i*bsizey] = dm[dm_kl_pair0+i];
+ // }
+ //}
+ __syncthreads();
+
+ for (int batch_ij = 0; batch_ij < TILEX; ++batch_ij) {
+ for (int batch_kl = 0; batch_kl < TILEY; ++batch_kl) {
+ int task_ij0 = blockIdx.x * bsizex + batch_ij * threadsx;
+ int task_kl0 = blockIdx.y * bsizey + batch_kl * threadsy;
+ if (task_ij0 >= npairs_ij || task_kl0 >= npairs_kl) {
+ continue;
+ }
+ int pair_ij0 = pair_ij_mapping[task_ij0];
+ int pair_kl0 = pair_kl_mapping[task_kl0];
+ if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) {
+ continue;
+ }
+
+ int sq_ij = tx + batch_ij * threadsx;
+ int sq_kl = ty + batch_kl * threadsy;
+ int task_ij = task_ij0 + tx;
+ int task_kl = task_kl0 + ty;
+ double fac_sym = PI_FAC;
+ if (task_ij >= npairs_ij) {
+ task_ij = task_ij0;
+ fac_sym = 0.;
+ }
+ if (task_kl >= npairs_kl) {
+ task_kl = task_kl0;
+ fac_sym = 0.;
+ }
+ int pair_ij = pair_ij_mapping[task_ij];
+ int pair_kl = pair_kl_mapping[task_kl];
+
+ int ish = pair_ij / nbas;
+ int jsh = pair_ij % nbas;
+ int ksh = pair_kl / nbas;
+ int lsh = pair_kl % nbas;
+ if (ish == jsh) fac_sym *= .5;
+ if (ksh == lsh) fac_sym *= .5;
+ if (pair_ij_mapping == pair_kl_mapping) {
+ if (task_ij == task_kl) fac_sym *= .5;
+ // TODO: skip certain blocks when task_ij < task_kl
+ if (task_ij < task_kl) fac_sym = 0.;
+ }
+ int dm_ij_pair0 = dm_pair_loc[pair_ij];
+ int dm_kl_pair0 = dm_pair_loc[pair_kl];
+ double *Rt, *buf;
+ if (gout_id == 0) {
+ double xij = Rp_cache[sq_ij+0*bsizex];
+ double yij = Rp_cache[sq_ij+1*bsizex];
+ double zij = Rp_cache[sq_ij+2*bsizex];
+ double aij = Rp_cache[sq_ij+3*bsizex];
+ double xkl = Rq_cache[sq_kl+0*bsizey];
+ double ykl = Rq_cache[sq_kl+1*bsizey];
+ double zkl = Rq_cache[sq_kl+2*bsizey];
+ double akl = Rq_cache[sq_kl+3*bsizey];
+ double fac = fac_sym / (aij*akl*sqrt(aij+akl));
+ double xpq = xij - xkl;
+ double ypq = yij - ykl;
+ double zpq = zij - zkl;
+ double rr = xpq*xpq + ypq*ypq + zpq*zpq;
+ double theta = aij * akl / (aij + akl);
+ double theta_rr = theta * rr;
+ eval_gamma_inc_fn(gamma_inc, theta_rr, order, sq_id, nsq_per_block);
+ double a2 = -2. * theta;
+ gamma_inc[sq_id] *= fac;
+ for (int i = 1; i <= order; i++) {
+ fac *= a2;
+ gamma_inc[sq_id+i*nsq_per_block] *= fac;
+ }
+ if (order % 2 == 0) {
+ Rt = vj_kl_cache + nf3kl*bsizey;
+ buf = Rt + nf3ijkl * nsq_per_block;
+ } else {
+ buf = vj_kl_cache + nf3kl*bsizey;
+ Rt = buf + nf3ijkl * nsq_per_block;
+ }
+ Rt[sq_id] = gamma_inc[sq_id+order*nsq_per_block];
+ for (int n = 1; n <= order; ++n) {
+ // swap input and output
+ double *tmp = buf;
+ buf = Rt;
+ Rt = tmp;
+ Rt[sq_id] = gamma_inc[sq_id+(order-n)*nsq_per_block];
+ switch (n) {
+ case 1:
+ Rt[sq_id+1*nsq_per_block] = zpq * buf[sq_id+0*nsq_per_block];
+ Rt[sq_id+2*nsq_per_block] = ypq * buf[sq_id+0*nsq_per_block];
+ Rt[sq_id+3*nsq_per_block] = xpq * buf[sq_id+0*nsq_per_block];
+ break;
+ case 2:
+ Rt[sq_id+1*nsq_per_block] = zpq * buf[sq_id+0*nsq_per_block];
+ Rt[sq_id+2*nsq_per_block] = zpq * buf[sq_id+1*nsq_per_block] + buf[sq_id+0*nsq_per_block];
+ Rt[sq_id+3*nsq_per_block] = ypq * buf[sq_id+0*nsq_per_block];
+ Rt[sq_id+4*nsq_per_block] = ypq * buf[sq_id+1*nsq_per_block];
+ Rt[sq_id+5*nsq_per_block] = ypq * buf[sq_id+2*nsq_per_block] + buf[sq_id+0*nsq_per_block];
+ Rt[sq_id+6*nsq_per_block] = xpq * buf[sq_id+0*nsq_per_block];
+ Rt[sq_id+7*nsq_per_block] = xpq * buf[sq_id+1*nsq_per_block];
+ Rt[sq_id+8*nsq_per_block] = xpq * buf[sq_id+2*nsq_per_block];
+ Rt[sq_id+9*nsq_per_block] = xpq * buf[sq_id+3*nsq_per_block] + buf[sq_id+0*nsq_per_block];
+ break;
+ case 3:
+ Rt[sq_id+1*nsq_per_block] = zpq * buf[sq_id+0*nsq_per_block];
+ Rt[sq_id+2*nsq_per_block] = zpq * buf[sq_id+1*nsq_per_block] + buf[sq_id+0*nsq_per_block];
+ Rt[sq_id+3*nsq_per_block] = zpq * buf[sq_id+2*nsq_per_block] + 2 * buf[sq_id+1*nsq_per_block];
+ Rt[sq_id+4*nsq_per_block] = ypq * buf[sq_id+0*nsq_per_block];
+ Rt[sq_id+5*nsq_per_block] = ypq * buf[sq_id+1*nsq_per_block];
+ Rt[sq_id+6*nsq_per_block] = ypq * buf[sq_id+2*nsq_per_block];
+ Rt[sq_id+7*nsq_per_block] = ypq * buf[sq_id+3*nsq_per_block] + buf[sq_id+0*nsq_per_block];
+ Rt[sq_id+8*nsq_per_block] = ypq * buf[sq_id+4*nsq_per_block] + buf[sq_id+1*nsq_per_block];
+ Rt[sq_id+9*nsq_per_block] = ypq * buf[sq_id+5*nsq_per_block] + 2 * buf[sq_id+3*nsq_per_block];
+ Rt[sq_id+10*nsq_per_block] = xpq * buf[sq_id+0*nsq_per_block];
+ Rt[sq_id+11*nsq_per_block] = xpq * buf[sq_id+1*nsq_per_block];
+ Rt[sq_id+12*nsq_per_block] = xpq * buf[sq_id+2*nsq_per_block];
+ Rt[sq_id+13*nsq_per_block] = xpq * buf[sq_id+3*nsq_per_block];
+ Rt[sq_id+14*nsq_per_block] = xpq * buf[sq_id+4*nsq_per_block];
+ Rt[sq_id+15*nsq_per_block] = xpq * buf[sq_id+5*nsq_per_block];
+ Rt[sq_id+16*nsq_per_block] = xpq * buf[sq_id+6*nsq_per_block] + buf[sq_id+0*nsq_per_block];
+ Rt[sq_id+17*nsq_per_block] = xpq * buf[sq_id+7*nsq_per_block] + buf[sq_id+1*nsq_per_block];
+ Rt[sq_id+18*nsq_per_block] = xpq * buf[sq_id+8*nsq_per_block] + buf[sq_id+3*nsq_per_block];
+ Rt[sq_id+19*nsq_per_block] = xpq * buf[sq_id+9*nsq_per_block] + 2 * buf[sq_id+6*nsq_per_block];
+ break;
+ case 4:
+ Rt[sq_id+1*nsq_per_block] = zpq * buf[sq_id+0*nsq_per_block];
+ Rt[sq_id+2*nsq_per_block] = zpq * buf[sq_id+1*nsq_per_block] + buf[sq_id+0*nsq_per_block];
+ Rt[sq_id+3*nsq_per_block] = zpq * buf[sq_id+2*nsq_per_block] + 2 * buf[sq_id+1*nsq_per_block];
+ Rt[sq_id+4*nsq_per_block] = zpq * buf[sq_id+3*nsq_per_block] + 3 * buf[sq_id+2*nsq_per_block];
+ Rt[sq_id+5*nsq_per_block] = ypq * buf[sq_id+0*nsq_per_block];
+ Rt[sq_id+6*nsq_per_block] = ypq * buf[sq_id+1*nsq_per_block];
+ Rt[sq_id+7*nsq_per_block] = ypq * buf[sq_id+2*nsq_per_block];
+ Rt[sq_id+8*nsq_per_block] = ypq * buf[sq_id+3*nsq_per_block];
+ Rt[sq_id+9*nsq_per_block] = ypq * buf[sq_id+4*nsq_per_block] + buf[sq_id+0*nsq_per_block];
+ Rt[sq_id+10*nsq_per_block] = ypq * buf[sq_id+5*nsq_per_block] + buf[sq_id+1*nsq_per_block];
+ Rt[sq_id+11*nsq_per_block] = ypq * buf[sq_id+6*nsq_per_block] + buf[sq_id+2*nsq_per_block];
+ Rt[sq_id+12*nsq_per_block] = ypq * buf[sq_id+7*nsq_per_block] + 2 * buf[sq_id+4*nsq_per_block];
+ Rt[sq_id+13*nsq_per_block] = ypq * buf[sq_id+8*nsq_per_block] + 2 * buf[sq_id+5*nsq_per_block];
+ Rt[sq_id+14*nsq_per_block] = ypq * buf[sq_id+9*nsq_per_block] + 3 * buf[sq_id+7*nsq_per_block];
+ Rt[sq_id+15*nsq_per_block] = xpq * buf[sq_id+0*nsq_per_block];
+ Rt[sq_id+16*nsq_per_block] = xpq * buf[sq_id+1*nsq_per_block];
+ Rt[sq_id+17*nsq_per_block] = xpq * buf[sq_id+2*nsq_per_block];
+ Rt[sq_id+18*nsq_per_block] = xpq * buf[sq_id+3*nsq_per_block];
+ Rt[sq_id+19*nsq_per_block] = xpq * buf[sq_id+4*nsq_per_block];
+ Rt[sq_id+20*nsq_per_block] = xpq * buf[sq_id+5*nsq_per_block];
+ Rt[sq_id+21*nsq_per_block] = xpq * buf[sq_id+6*nsq_per_block];
+ Rt[sq_id+22*nsq_per_block] = xpq * buf[sq_id+7*nsq_per_block];
+ Rt[sq_id+23*nsq_per_block] = xpq * buf[sq_id+8*nsq_per_block];
+ Rt[sq_id+24*nsq_per_block] = xpq * buf[sq_id+9*nsq_per_block];
+ Rt[sq_id+25*nsq_per_block] = xpq * buf[sq_id+10*nsq_per_block] + buf[sq_id+0*nsq_per_block];
+ Rt[sq_id+26*nsq_per_block] = xpq * buf[sq_id+11*nsq_per_block] + buf[sq_id+1*nsq_per_block];
+ Rt[sq_id+27*nsq_per_block] = xpq * buf[sq_id+12*nsq_per_block] + buf[sq_id+2*nsq_per_block];
+ Rt[sq_id+28*nsq_per_block] = xpq * buf[sq_id+13*nsq_per_block] + buf[sq_id+4*nsq_per_block];
+ Rt[sq_id+29*nsq_per_block] = xpq * buf[sq_id+14*nsq_per_block] + buf[sq_id+5*nsq_per_block];
+ Rt[sq_id+30*nsq_per_block] = xpq * buf[sq_id+15*nsq_per_block] + buf[sq_id+7*nsq_per_block];
+ Rt[sq_id+31*nsq_per_block] = xpq * buf[sq_id+16*nsq_per_block] + 2 * buf[sq_id+10*nsq_per_block];
+ Rt[sq_id+32*nsq_per_block] = xpq * buf[sq_id+17*nsq_per_block] + 2 * buf[sq_id+11*nsq_per_block];
+ Rt[sq_id+33*nsq_per_block] = xpq * buf[sq_id+18*nsq_per_block] + 2 * buf[sq_id+13*nsq_per_block];
+ Rt[sq_id+34*nsq_per_block] = xpq * buf[sq_id+19*nsq_per_block] + 3 * buf[sq_id+16*nsq_per_block];
+ break;
+ default: iter_Rt_n(Rt, buf, xpq, ypq, zpq, n, sq_id, nsq_per_block);
+ }
+ }
+ }
+
+ Rt = vj_kl_cache + nf3kl*bsizey;
+ double *vj_cache = Rt + nf3ijkl * nsq_per_block;
+ //for (k = 0, e = 0; e <= l1; ++e) {
+ //for (f = 0; f <= l1-e; ++f) {
+ //for (g = 0; g <= l1-e-f; ++g, ++k) {
+ // double rho_kl_val = rho_kl[k];
+ // double jvec_kl_val = 0.;
+ // double fac = 1;
+ // if ((e + f + g) % 2 != 0) {
+ // fac = -1;
+ // }
+ // for (i = 0, t = 0; t <= l2; ++t) {
+ // for (u = 0; u <= l2-t; ++u) {
+ // for (v = 0; v <= l2-t-u; ++v, ++i) {
+ // s = fac * R[e+t,f+u,g+v]
+ // jvec_kl_val += s * rho_ij[i];
+ // jvec_ij[i] += s * rho_kl_val;
+ // } } }
+ // jvec_kl[k] += jvec_kl_val;
+ //} } }
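+ // The k loop is padded to nf3kl+gout_id so that every thread in the
+ // z-dimension makes the same number of passes and reaches the
+ // __syncthreads() calls together; the nf3ij loop below is padded the
+ // same way.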
+ for (int k = gout_id; k < nf3kl+gout_id; k += gout_stride) {
+ __syncthreads();
+ double vj_kl = 0.;
+ if (k < nf3kl) {
+ Fold3Index f3k = kl_fold3idx[k];
+ int e = f3k.x;
+ int f = f3k.y;
+ int g = f3k.z;
+ double fac = 1.;
+ if ((e + f + g) % 2 != 0) {
+ fac = -1.;
+ }
+ for (int i = 0, t = 0; t <= lij; ++t) {
+ for (int u = 0; u <= lij-t; ++u) {
+ for (int v = 0; v <= lij-t-u; ++v, ++i) {
+ //double s = Rt[sq_id+ADDR(order,e+t,f+u,g+v)*nsq_per_block];
+ int ix = order-e-t;
+ int xoffset = ix*(ix+1)*(ix+2)/6;
+ int iy = ix-f-u;
+ int i2y = (iy+1)*(iy+2)/2;
+ double s = Rt[sq_id+(nf3ijkl-xoffset-i2y+g+v)*nsq_per_block];
+ vj_kl += fac * s * dm[dm_ij_pair0+i];
+ } } }
+ //atomicAdd(vj+dm_kl_pair0+k, vj_kl);
+ }
+ vj_cache[t_id] = vj_kl;
+ for (int stride = threadsx/2; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (tx < stride) {
+ vj_cache[t_id] += vj_cache[t_id + stride];
+ }
+ }
+ __syncthreads();
+ if (tx == 0 && task_kl0+ty < npairs_kl) {
+ vj_kl_cache[sq_kl+k*bsizey] += vj_cache[t_id];
+ }
+ }
+
+ for (int i = gout_id; i < nf3ij+gout_id; i += gout_stride) {
+ __syncthreads();
+ double vj_ij = 0.;
+ if (i < nf3ij) {
+ Fold3Index f3i = ij_fold3idx[i];
+ int t = f3i.x;
+ int u = f3i.y;
+ int v = f3i.z;
+ for (int k = 0, e = 0; e <= lkl; ++e) {
+ for (int f = 0; f <= lkl-e; ++f) {
+ for (int g = 0; g <= lkl-e-f; ++g, ++k) {
+ //double s = Rt[sq_id+ADDR(order,e+t,f+u,g+v)*nsq_per_block];
+ int ix = order-e-t;
+ int xoffset = ix*(ix+1)*(ix+2)/6;
+ int iy = ix-f-u;
+ int i2y = (iy+1)*(iy+2)/2;
+ double s = Rt[sq_id+(nf3ijkl-xoffset-i2y+g+v)*nsq_per_block];
+ double d = dm[dm_kl_pair0+k];
+ if ((e + f + g) % 2 == 0) {
+ vj_ij += s * d;
+ } else {
+ vj_ij -= s * d;
+ }
+ } } }
+ //atomicAdd(vj+dm_ij_pair0+i, vj_ij);
+ }
+ vj_cache[t_id] = vj_ij;
+ for (int stride = threadsy/2; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[t_id] += vj_cache[t_id + stride*threadsx];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+i*bsizex] += vj_cache[t_id];
+ }
+ }
+ __syncthreads();
+ } }
+
+ for (int n = ty+threadsy*gout_id; n < nf3ij*TILEX; n += threadsy*gout_stride) {
+ int i = n / TILEX;
+ int tile = n % TILEX;
+ int task_ij = blockIdx.x * bsizex + tile * threadsx + tx;
+ if (task_ij < npairs_ij) {
+ int pair_ij = pair_ij_mapping[task_ij];
+ int dm_ij_pair0 = dm_pair_loc[pair_ij];
+ int sq_ij = tx + tile * threadsx;
+ atomicAdd(vj+dm_ij_pair0+i, vj_ij_cache[sq_ij+i*bsizex]);
+ }
+ }
+ for (int n = tx+threadsx*gout_id; n < nf3kl*TILEY; n += threadsx*gout_stride) {
+ int i = n / TILEY;
+ int tile = n % TILEY;
+ int task_kl = blockIdx.y * bsizey + tile * threadsy + ty;
+ if (task_kl < npairs_kl) {
+ int pair_kl = pair_kl_mapping[task_kl];
+ int dm_kl_pair0 = dm_pair_loc[pair_kl];
+ int sq_kl = ty + tile * threadsy;
+ atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*bsizey]);
+ }
+ }
+}
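The scalar pseudocode quoted in the comments of md_j_kernel is the core J-engine contraction. As a reading aid, here is a minimal CPU sketch of the same contraction using a plain (unstrided) R array; the names `addr` and `jengine_contract_ref` are hypothetical and not part of the patch.

```c
/* CPU sketch of the J-engine contraction performed by md_j_kernel
 * (hypothetical names). R holds the Hermite integrals R_{t,u,v} of order
 * lij+lkl in the ADDR() layout; rho_* are the Et-contracted density
 * blocks of the bra and ket shell pairs. */
static int addr(int l, int t, int u, int v)
{
    return (l+1)*(l+2)*(l+3)/6 - (l-t+1)*(l-t+2)*(l-t+3)/6
         + (l-t+1)*(l-t+2)/2 - (l-t-u+1)*(l-t-u+2)/2 + v;
}

void jengine_contract_ref(int lij, int lkl, const double *R,
                          const double *rho_ij, const double *rho_kl,
                          double *jvec_ij, double *jvec_kl)
{
    int order = lij + lkl;
    for (int k = 0, e = 0; e <= lkl; ++e)
    for (int f = 0; f <= lkl-e; ++f)
    for (int g = 0; g <= lkl-e-f; ++g, ++k) {
        /* ket Hermite components enter with a (-1)^(e+f+g) phase */
        double fac = ((e + f + g) % 2 == 0) ? 1. : -1.;
        double vj_kl = 0.;
        for (int i = 0, t = 0; t <= lij; ++t)
        for (int u = 0; u <= lij-t; ++u)
        for (int v = 0; v <= lij-t-u; ++v, ++i) {
            double s = fac * R[addr(order, e+t, f+u, g+v)];
            vj_kl      += s * rho_ij[i];
            jvec_ij[i] += s * rho_kl[k];
        }
        jvec_kl[k] += vj_kl;
    }
}
```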
diff --git a/gpu4pyscf/lib/gvhf-md/md_j_driver.cu b/gpu4pyscf/lib/gvhf-md/md_j_driver.cu
new file mode 100644
index 00000000..e48407a6
--- /dev/null
+++ b/gpu4pyscf/lib/gvhf-md/md_j_driver.cu
@@ -0,0 +1,434 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+
+#include "gvhf-rys/vhf.cuh"
+
+#define TILEX 2
+#define TILEY 4
+
+__constant__ uint16_t c_Rt_idx[5967];
+__constant__ uint16_t c_Rt_offsets[19];
+__constant__ Fold2Index c_i_in_fold2idx[165];
+__constant__ Fold3Index c_i_in_fold3idx[495];
+
+
+extern __global__ void md_j_kernel(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds);
+int md_j_unrolled(RysIntEnvVars *envs, JKMatrix *jk, BoundsInfo *bounds,
+ int *scheme, int workers, double omega);
+void set_md_j_unrolled_shm_size();
+
+static uint16_t Rt_idx[] = {
+// l = 1
+0,0,0,
+// l = 2
+0,0,0,0,0,0,0,0,0,
+// l = 3
+0,0,1,0,0,0,0,1,3,0,0,0,0,0,0,0,1,3,6,
+// l = 4
+0,0,1,2,0,0,0,0,0,1,2,4,5,7,0,0,0,0,0,0,
+0,0,0,0,0,1,2,4,5,7,10,11,13,16,
+// l = 5
+0,0,1,2,3,0,0,0,0,0,0,1,2,3,5,6,7,9,10,12,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,3,5,
+6,7,9,10,12,15,16,17,19,20,22,25,26,28,31,
+// l = 6
+0,0,1,2,3,4,0,0,0,0,0,0,0,1,2,3,4,6,7,8,
+9,11,12,13,15,16,18,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,1,2,3,4,6,7,8,9,11,12,13,
+15,16,18,21,22,23,24,26,27,28,30,31,33,36,37,38,40,41,43,46,
+47,49,52,
+// l = 7
+0,0,1,2,3,4,5,0,0,0,0,0,0,0,0,1,2,3,4,5,
+7,8,9,10,11,13,14,15,16,18,19,20,22,23,25,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,1,2,3,4,5,7,8,9,10,11,13,14,15,16,18,19,
+20,22,23,25,28,29,30,31,32,34,35,36,37,39,40,41,43,44,46,49,
+50,51,52,54,55,56,58,59,61,64,65,66,68,69,71,74,75,77,80,
+// l = 8
+0,0,1,2,3,4,5,6,0,0,0,0,0,0,0,0,0,1,2,3,
+4,5,6,8,9,10,11,12,13,15,16,17,18,19,21,22,23,24,26,27,
+28,30,31,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,1,2,3,4,5,6,8,9,10,11,12,13,15,16,17,18,19,21,22,
+23,24,26,27,28,30,31,33,36,37,38,39,40,41,43,44,45,46,47,49,
+50,51,52,54,55,56,58,59,61,64,65,66,67,68,70,71,72,73,75,76,
+77,79,80,82,85,86,87,88,90,91,92,94,95,97,100,101,102,104,105,107,
+110,111,113,116,
+// l = 9
+0,0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0,0,0,1,
+2,3,4,5,6,7,9,10,11,12,13,14,15,17,18,19,20,21,22,24,
+25,26,27,28,30,31,32,33,35,36,37,39,40,42,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+1,2,3,4,5,6,7,9,10,11,12,13,14,15,17,18,19,20,21,22,
+24,25,26,27,28,30,31,32,33,35,36,37,39,40,42,45,46,47,48,49,
+50,51,53,54,55,56,57,58,60,61,62,63,64,66,67,68,69,71,72,73,
+75,76,78,81,82,83,84,85,86,88,89,90,91,92,94,95,96,97,99,100,
+101,103,104,106,109,110,111,112,113,115,116,117,118,120,121,122,124,125,127,130,
+131,132,133,135,136,137,139,140,142,145,146,147,149,150,152,155,156,158,161,
+// l = 10
+0,0,1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0,0,0,
+0,1,2,3,4,5,6,7,8,10,11,12,13,14,15,16,17,19,20,21,
+22,23,24,25,27,28,29,30,31,32,34,35,36,37,38,40,41,42,43,45,
+46,47,49,50,52,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,1,2,3,4,5,6,7,8,10,11,12,13,14,15,16,17,19,20,21,
+22,23,24,25,27,28,29,30,31,32,34,35,36,37,38,40,41,42,43,45,
+46,47,49,50,52,55,56,57,58,59,60,61,62,64,65,66,67,68,69,70,
+72,73,74,75,76,77,79,80,81,82,83,85,86,87,88,90,91,92,94,95,
+97,100,101,102,103,104,105,106,108,109,110,111,112,113,115,116,117,118,119,121,
+122,123,124,126,127,128,130,131,133,136,137,138,139,140,141,143,144,145,146,147,
+149,150,151,152,154,155,156,158,159,161,164,165,166,167,168,170,171,172,173,175,
+176,177,179,180,182,185,186,187,188,190,191,192,194,195,197,200,201,202,204,205,
+207,210,211,213,216,
+// l = 11
+0,0,1,2,3,4,5,6,7,8,9,0,0,0,0,0,0,0,0,0,
+0,0,0,1,2,3,4,5,6,7,8,9,11,12,13,14,15,16,17,18,
+19,21,22,23,24,25,26,27,28,30,31,32,33,34,35,36,38,39,40,41,
+42,43,45,46,47,48,49,51,52,53,54,56,57,58,60,61,63,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,1,2,3,4,5,6,7,8,9,11,12,13,14,15,16,17,
+18,19,21,22,23,24,25,26,27,28,30,31,32,33,34,35,36,38,39,40,
+41,42,43,45,46,47,48,49,51,52,53,54,56,57,58,60,61,63,66,67,
+68,69,70,71,72,73,74,76,77,78,79,80,81,82,83,85,86,87,88,89,
+90,91,93,94,95,96,97,98,100,101,102,103,104,106,107,108,109,111,112,113,
+115,116,118,121,122,123,124,125,126,127,128,130,131,132,133,134,135,136,138,139,
+140,141,142,143,145,146,147,148,149,151,152,153,154,156,157,158,160,161,163,166,
+167,168,169,170,171,172,174,175,176,177,178,179,181,182,183,184,185,187,188,189,
+190,192,193,194,196,197,199,202,203,204,205,206,207,209,210,211,212,213,215,216,
+217,218,220,221,222,224,225,227,230,231,232,233,234,236,237,238,239,241,242,243,
+245,246,248,251,252,253,254,256,257,258,260,261,263,266,267,268,270,271,273,276,
+277,279,282,
+// l = 12
+0,0,1,2,3,4,5,6,7,8,9,10,0,0,0,0,0,0,0,0,
+0,0,0,0,0,1,2,3,4,5,6,7,8,9,10,12,13,14,15,16,
+17,18,19,20,21,23,24,25,26,27,28,29,30,31,33,34,35,36,37,38,
+39,40,42,43,44,45,46,47,48,50,51,52,53,54,55,57,58,59,60,61,
+63,64,65,66,68,69,70,72,73,75,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,1,2,3,4,5,6,7,8,9,10,12,
+13,14,15,16,17,18,19,20,21,23,24,25,26,27,28,29,30,31,33,34,
+35,36,37,38,39,40,42,43,44,45,46,47,48,50,51,52,53,54,55,57,
+58,59,60,61,63,64,65,66,68,69,70,72,73,75,78,79,80,81,82,83,
+84,85,86,87,89,90,91,92,93,94,95,96,97,99,100,101,102,103,104,105,
+106,108,109,110,111,112,113,114,116,117,118,119,120,121,123,124,125,126,127,129,
+130,131,132,134,135,136,138,139,141,144,145,146,147,148,149,150,151,152,154,155,
+156,157,158,159,160,161,163,164,165,166,167,168,169,171,172,173,174,175,176,178,
+179,180,181,182,184,185,186,187,189,190,191,193,194,196,199,200,201,202,203,204,
+205,206,208,209,210,211,212,213,214,216,217,218,219,220,221,223,224,225,226,227,
+229,230,231,232,234,235,236,238,239,241,244,245,246,247,248,249,250,252,253,254,
+255,256,257,259,260,261,262,263,265,266,267,268,270,271,272,274,275,277,280,281,
+282,283,284,285,287,288,289,290,291,293,294,295,296,298,299,300,302,303,305,308,
+309,310,311,312,314,315,316,317,319,320,321,323,324,326,329,330,331,332,334,335,
+336,338,339,341,344,345,346,348,349,351,354,355,357,360,
+// l = 13
+0,0,1,2,3,4,5,6,7,8,9,10,11,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,1,2,3,4,5,6,7,8,9,10,11,13,14,
+15,16,17,18,19,20,21,22,23,25,26,27,28,29,30,31,32,33,34,36,
+37,38,39,40,41,42,43,44,46,47,48,49,50,51,52,53,55,56,57,58,
+59,60,61,63,64,65,66,67,68,70,71,72,73,74,76,77,78,79,81,82,
+83,85,86,88,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,3,4,
+5,6,7,8,9,10,11,13,14,15,16,17,18,19,20,21,22,23,25,26,
+27,28,29,30,31,32,33,34,36,37,38,39,40,41,42,43,44,46,47,48,
+49,50,51,52,53,55,56,57,58,59,60,61,63,64,65,66,67,68,70,71,
+72,73,74,76,77,78,79,81,82,83,85,86,88,91,92,93,94,95,96,97,
+98,99,100,101,103,104,105,106,107,108,109,110,111,112,114,115,116,117,118,119,
+120,121,122,124,125,126,127,128,129,130,131,133,134,135,136,137,138,139,141,142,
+143,144,145,146,148,149,150,151,152,154,155,156,157,159,160,161,163,164,166,169,
+170,171,172,173,174,175,176,177,178,180,181,182,183,184,185,186,187,188,190,191,
+192,193,194,195,196,197,199,200,201,202,203,204,205,207,208,209,210,211,212,214,
+215,216,217,218,220,221,222,223,225,226,227,229,230,232,235,236,237,238,239,240,
+241,242,243,245,246,247,248,249,250,251,252,254,255,256,257,258,259,260,262,263,
+264,265,266,267,269,270,271,272,273,275,276,277,278,280,281,282,284,285,287,290,
+291,292,293,294,295,296,297,299,300,301,302,303,304,305,307,308,309,310,311,312,
+314,315,316,317,318,320,321,322,323,325,326,327,329,330,332,335,336,337,338,339,
+340,341,343,344,345,346,347,348,350,351,352,353,354,356,357,358,359,361,362,363,
+365,366,368,371,372,373,374,375,376,378,379,380,381,382,384,385,386,387,389,390,
+391,393,394,396,399,400,401,402,403,405,406,407,408,410,411,412,414,415,417,420,
+421,422,423,425,426,427,429,430,432,435,436,437,439,440,442,445,446,448,451,
+// l = 14
+0,0,1,2,3,4,5,6,7,8,9,10,11,12,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,1,2,3,4,5,6,7,8,9,10,11,
+12,14,15,16,17,18,19,20,21,22,23,24,25,27,28,29,30,31,32,33,
+34,35,36,37,39,40,41,42,43,44,45,46,47,48,50,51,52,53,54,55,
+56,57,58,60,61,62,63,64,65,66,67,69,70,71,72,73,74,75,77,78,
+79,80,81,82,84,85,86,87,88,90,91,92,93,95,96,97,99,100,102,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,1,2,3,4,5,6,7,8,9,10,11,12,14,15,16,
+17,18,19,20,21,22,23,24,25,27,28,29,30,31,32,33,34,35,36,37,
+39,40,41,42,43,44,45,46,47,48,50,51,52,53,54,55,56,57,58,60,
+61,62,63,64,65,66,67,69,70,71,72,73,74,75,77,78,79,80,81,82,
+84,85,86,87,88,90,91,92,93,95,96,97,99,100,102,105,106,107,108,109,
+110,111,112,113,114,115,116,118,119,120,121,122,123,124,125,126,127,128,130,131,
+132,133,134,135,136,137,138,139,141,142,143,144,145,146,147,148,149,151,152,153,
+154,155,156,157,158,160,161,162,163,164,165,166,168,169,170,171,172,173,175,176,
+177,178,179,181,182,183,184,186,187,188,190,191,193,196,197,198,199,200,201,202,
+203,204,205,206,208,209,210,211,212,213,214,215,216,217,219,220,221,222,223,224,
+225,226,227,229,230,231,232,233,234,235,236,238,239,240,241,242,243,244,246,247,
+248,249,250,251,253,254,255,256,257,259,260,261,262,264,265,266,268,269,271,274,
+275,276,277,278,279,280,281,282,283,285,286,287,288,289,290,291,292,293,295,296,
+297,298,299,300,301,302,304,305,306,307,308,309,310,312,313,314,315,316,317,319,
+320,321,322,323,325,326,327,328,330,331,332,334,335,337,340,341,342,343,344,345,
+346,347,348,350,351,352,353,354,355,356,357,359,360,361,362,363,364,365,367,368,
+369,370,371,372,374,375,376,377,378,380,381,382,383,385,386,387,389,390,392,395,
+396,397,398,399,400,401,402,404,405,406,407,408,409,410,412,413,414,415,416,417,
+419,420,421,422,423,425,426,427,428,430,431,432,434,435,437,440,441,442,443,444,
+445,446,448,449,450,451,452,453,455,456,457,458,459,461,462,463,464,466,467,468,
+470,471,473,476,477,478,479,480,481,483,484,485,486,487,489,490,491,492,494,495,
+496,498,499,501,504,505,506,507,508,510,511,512,513,515,516,517,519,520,522,525,
+526,527,528,530,531,532,534,535,537,540,541,542,544,545,547,550,551,553,556,
+// l = 15
+0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,1,2,3,4,5,6,7,8,9,
+10,11,12,13,15,16,17,18,19,20,21,22,23,24,25,26,27,29,30,31,
+32,33,34,35,36,37,38,39,40,42,43,44,45,46,47,48,49,50,51,52,
+54,55,56,57,58,59,60,61,62,63,65,66,67,68,69,70,71,72,73,75,
+76,77,78,79,80,81,82,84,85,86,87,88,89,90,92,93,94,95,96,97,
+99,100,101,102,103,105,106,107,108,110,111,112,114,115,117,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,3,4,
+5,6,7,8,9,10,11,12,13,15,16,17,18,19,20,21,22,23,24,25,
+26,27,29,30,31,32,33,34,35,36,37,38,39,40,42,43,44,45,46,47,
+48,49,50,51,52,54,55,56,57,58,59,60,61,62,63,65,66,67,68,69,
+70,71,72,73,75,76,77,78,79,80,81,82,84,85,86,87,88,89,90,92,
+93,94,95,96,97,99,100,101,102,103,105,106,107,108,110,111,112,114,115,117,
+120,121,122,123,124,125,126,127,128,129,130,131,132,134,135,136,137,138,139,140,
+141,142,143,144,145,147,148,149,150,151,152,153,154,155,156,157,159,160,161,162,
+163,164,165,166,167,168,170,171,172,173,174,175,176,177,178,180,181,182,183,184,
+185,186,187,189,190,191,192,193,194,195,197,198,199,200,201,202,204,205,206,207,
+208,210,211,212,213,215,216,217,219,220,222,225,226,227,228,229,230,231,232,233,
+234,235,236,238,239,240,241,242,243,244,245,246,247,248,250,251,252,253,254,255,
+256,257,258,259,261,262,263,264,265,266,267,268,269,271,272,273,274,275,276,277,
+278,280,281,282,283,284,285,286,288,289,290,291,292,293,295,296,297,298,299,301,
+302,303,304,306,307,308,310,311,313,316,317,318,319,320,321,322,323,324,325,326,
+328,329,330,331,332,333,334,335,336,337,339,340,341,342,343,344,345,346,347,349,
+350,351,352,353,354,355,356,358,359,360,361,362,363,364,366,367,368,369,370,371,
+373,374,375,376,377,379,380,381,382,384,385,386,388,389,391,394,395,396,397,398,
+399,400,401,402,403,405,406,407,408,409,410,411,412,413,415,416,417,418,419,420,
+421,422,424,425,426,427,428,429,430,432,433,434,435,436,437,439,440,441,442,443,
+445,446,447,448,450,451,452,454,455,457,460,461,462,463,464,465,466,467,468,470,
+471,472,473,474,475,476,477,479,480,481,482,483,484,485,487,488,489,490,491,492,
+494,495,496,497,498,500,501,502,503,505,506,507,509,510,512,515,516,517,518,519,
+520,521,522,524,525,526,527,528,529,530,532,533,534,535,536,537,539,540,541,542,
+543,545,546,547,548,550,551,552,554,555,557,560,561,562,563,564,565,566,568,569,
+570,571,572,573,575,576,577,578,579,581,582,583,584,586,587,588,590,591,593,596,
+597,598,599,600,601,603,604,605,606,607,609,610,611,612,614,615,616,618,619,621,
+624,625,626,627,628,630,631,632,633,635,636,637,639,640,642,645,646,647,648,650,
+651,652,654,655,657,660,661,662,664,665,667,670,671,673,676,
+// l = 16
+0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,3,4,5,6,7,
+8,9,10,11,12,13,14,16,17,18,19,20,21,22,23,24,25,26,27,28,
+29,31,32,33,34,35,36,37,38,39,40,41,42,43,45,46,47,48,49,50,
+51,52,53,54,55,56,58,59,60,61,62,63,64,65,66,67,68,70,71,72,
+73,74,75,76,77,78,79,81,82,83,84,85,86,87,88,89,91,92,93,94,
+95,96,97,98,100,101,102,103,104,105,106,108,109,110,111,112,113,115,116,117,
+118,119,121,122,123,124,126,127,128,130,131,133,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,1,2,3,4,5,6,7,8,9,10,11,
+12,13,14,16,17,18,19,20,21,22,23,24,25,26,27,28,29,31,32,33,
+34,35,36,37,38,39,40,41,42,43,45,46,47,48,49,50,51,52,53,54,
+55,56,58,59,60,61,62,63,64,65,66,67,68,70,71,72,73,74,75,76,
+77,78,79,81,82,83,84,85,86,87,88,89,91,92,93,94,95,96,97,98,
+100,101,102,103,104,105,106,108,109,110,111,112,113,115,116,117,118,119,121,122,
+123,124,126,127,128,130,131,133,136,137,138,139,140,141,142,143,144,145,146,147,
+148,149,151,152,153,154,155,156,157,158,159,160,161,162,163,165,166,167,168,169,
+170,171,172,173,174,175,176,178,179,180,181,182,183,184,185,186,187,188,190,191,
+192,193,194,195,196,197,198,199,201,202,203,204,205,206,207,208,209,211,212,213,
+214,215,216,217,218,220,221,222,223,224,225,226,228,229,230,231,232,233,235,236,
+237,238,239,241,242,243,244,246,247,248,250,251,253,256,257,258,259,260,261,262,
+263,264,265,266,267,268,270,271,272,273,274,275,276,277,278,279,280,281,283,284,
+285,286,287,288,289,290,291,292,293,295,296,297,298,299,300,301,302,303,304,306,
+307,308,309,310,311,312,313,314,316,317,318,319,320,321,322,323,325,326,327,328,
+329,330,331,333,334,335,336,337,338,340,341,342,343,344,346,347,348,349,351,352,
+353,355,356,358,361,362,363,364,365,366,367,368,369,370,371,372,374,375,376,377,
+378,379,380,381,382,383,384,386,387,388,389,390,391,392,393,394,395,397,398,399,
+400,401,402,403,404,405,407,408,409,410,411,412,413,414,416,417,418,419,420,421,
+422,424,425,426,427,428,429,431,432,433,434,435,437,438,439,440,442,443,444,446,
+447,449,452,453,454,455,456,457,458,459,460,461,462,464,465,466,467,468,469,470,
+471,472,473,475,476,477,478,479,480,481,482,483,485,486,487,488,489,490,491,492,
+494,495,496,497,498,499,500,502,503,504,505,506,507,509,510,511,512,513,515,516,
+517,518,520,521,522,524,525,527,530,531,532,533,534,535,536,537,538,539,541,542,
+543,544,545,546,547,548,549,551,552,553,554,555,556,557,558,560,561,562,563,564,
+565,566,568,569,570,571,572,573,575,576,577,578,579,581,582,583,584,586,587,588,
+590,591,593,596,597,598,599,600,601,602,603,604,606,607,608,609,610,611,612,613,
+615,616,617,618,619,620,621,623,624,625,626,627,628,630,631,632,633,634,636,637,
+638,639,641,642,643,645,646,648,651,652,653,654,655,656,657,658,660,661,662,663,
+664,665,666,668,669,670,671,672,673,675,676,677,678,679,681,682,683,684,686,687,
+688,690,691,693,696,697,698,699,700,701,702,704,705,706,707,708,709,711,712,713,
+714,715,717,718,719,720,722,723,724,726,727,729,732,733,734,735,736,737,739,740,
+741,742,743,745,746,747,748,750,751,752,754,755,757,760,761,762,763,764,766,767,
+768,769,771,772,773,775,776,778,781,782,783,784,786,787,788,790,791,793,796,797,
+798,800,801,803,806,807,809,812,
+// l = 17
+0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,3,4,5,
+6,7,8,9,10,11,12,13,14,15,17,18,19,20,21,22,23,24,25,26,
+27,28,29,30,31,33,34,35,36,37,38,39,40,41,42,43,44,45,46,48,
+49,50,51,52,53,54,55,56,57,58,59,60,62,63,64,65,66,67,68,69,
+70,71,72,73,75,76,77,78,79,80,81,82,83,84,85,87,88,89,90,91,
+92,93,94,95,96,98,99,100,101,102,103,104,105,106,108,109,110,111,112,113,
+114,115,117,118,119,120,121,122,123,125,126,127,128,129,130,132,133,134,135,136,
+138,139,140,141,143,144,145,147,148,150,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,
+18,19,20,21,22,23,24,25,26,27,28,29,30,31,33,34,35,36,37,38,
+39,40,41,42,43,44,45,46,48,49,50,51,52,53,54,55,56,57,58,59,
+60,62,63,64,65,66,67,68,69,70,71,72,73,75,76,77,78,79,80,81,
+82,83,84,85,87,88,89,90,91,92,93,94,95,96,98,99,100,101,102,103,
+104,105,106,108,109,110,111,112,113,114,115,117,118,119,120,121,122,123,125,126,
+127,128,129,130,132,133,134,135,136,138,139,140,141,143,144,145,147,148,150,153,
+154,155,156,157,158,159,160,161,162,163,164,165,166,167,169,170,171,172,173,174,
+175,176,177,178,179,180,181,182,184,185,186,187,188,189,190,191,192,193,194,195,
+196,198,199,200,201,202,203,204,205,206,207,208,209,211,212,213,214,215,216,217,
+218,219,220,221,223,224,225,226,227,228,229,230,231,232,234,235,236,237,238,239,
+240,241,242,244,245,246,247,248,249,250,251,253,254,255,256,257,258,259,261,262,
+263,264,265,266,268,269,270,271,272,274,275,276,277,279,280,281,283,284,286,289,
+290,291,292,293,294,295,296,297,298,299,300,301,302,304,305,306,307,308,309,310,
+311,312,313,314,315,316,318,319,320,321,322,323,324,325,326,327,328,329,331,332,
+333,334,335,336,337,338,339,340,341,343,344,345,346,347,348,349,350,351,352,354,
+355,356,357,358,359,360,361,362,364,365,366,367,368,369,370,371,373,374,375,376,
+377,378,379,381,382,383,384,385,386,388,389,390,391,392,394,395,396,397,399,400,
+401,403,404,406,409,410,411,412,413,414,415,416,417,418,419,420,421,423,424,425,
+426,427,428,429,430,431,432,433,434,436,437,438,439,440,441,442,443,444,445,446,
+448,449,450,451,452,453,454,455,456,457,459,460,461,462,463,464,465,466,467,469,
+470,471,472,473,474,475,476,478,479,480,481,482,483,484,486,487,488,489,490,491,
+493,494,495,496,497,499,500,501,502,504,505,506,508,509,511,514,515,516,517,518,
+519,520,521,522,523,524,525,527,528,529,530,531,532,533,534,535,536,537,539,540,
+541,542,543,544,545,546,547,548,550,551,552,553,554,555,556,557,558,560,561,562,
+563,564,565,566,567,569,570,571,572,573,574,575,577,578,579,580,581,582,584,585,
+586,587,588,590,591,592,593,595,596,597,599,600,602,605,606,607,608,609,610,611,
+612,613,614,615,617,618,619,620,621,622,623,624,625,626,628,629,630,631,632,633,
+634,635,636,638,639,640,641,642,643,644,645,647,648,649,650,651,652,653,655,656,
+657,658,659,660,662,663,664,665,666,668,669,670,671,673,674,675,677,678,680,683,
+684,685,686,687,688,689,690,691,692,694,695,696,697,698,699,700,701,702,704,705,
+706,707,708,709,710,711,713,714,715,716,717,718,719,721,722,723,724,725,726,728,
+729,730,731,732,734,735,736,737,739,740,741,743,744,746,749,750,751,752,753,754,
+755,756,757,759,760,761,762,763,764,765,766,768,769,770,771,772,773,774,776,777,
+778,779,780,781,783,784,785,786,787,789,790,791,792,794,795,796,798,799,801,804,
+805,806,807,808,809,810,811,813,814,815,816,817,818,819,821,822,823,824,825,826,
+828,829,830,831,832,834,835,836,837,839,840,841,843,844,846,849,850,851,852,853,
+854,855,857,858,859,860,861,862,864,865,866,867,868,870,871,872,873,875,876,877,
+879,880,882,885,886,887,888,889,890,892,893,894,895,896,898,899,900,901,903,904,
+905,907,908,910,913,914,915,916,917,919,920,921,922,924,925,926,928,929,931,934,
+935,936,937,939,940,941,943,944,946,949,950,951,953,954,956,959,960,962,965,
+};
+
+// offset of each l block in Rt_idx: l*(l+1)*(l+2)*(l+3)/24 - l
+static uint16_t Rt_idx_offsets[] = {
+0,0,3,12,31,65,120,203,322,486,705,990,1353,1807,2366,3045,3860,4828,5967,
+};
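+// e.g. the l = 2 entries start at offset 3 (= 2*3*4*5/24 - 2) and the
+// l = 3 entries at offset 12 (= 3*4*5*6/24 - 3), matching the table above.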
+
+extern "C" {
+int MD_build_j(double *vj, double *dm, int n_dm, int nao,
+ RysIntEnvVars envs, int *scheme, int *shls_slice,
+ int ntile_ij_pairs, int ntile_kl_pairs,
+ int *tile_ij_mapping, int *tile_kl_mapping, float *tile_q_cond,
+ float *q_cond, float *dm_cond, float cutoff,
+ uint32_t *batch_head, int workers, double omega,
+ int *atm, int natm, int *bas, int nbas, double *env)
+{
+ uint16_t ish0 = shls_slice[0];
+ uint16_t jsh0 = shls_slice[2];
+ uint16_t ksh0 = shls_slice[4];
+ uint16_t lsh0 = shls_slice[6];
+ uint8_t li = bas[ANG_OF + ish0*BAS_SLOTS];
+ uint8_t lj = bas[ANG_OF + jsh0*BAS_SLOTS];
+ uint8_t lk = bas[ANG_OF + ksh0*BAS_SLOTS];
+ uint8_t ll = bas[ANG_OF + lsh0*BAS_SLOTS];
+ uint8_t order = li + lj + lk + ll;
+ BoundsInfo bounds = {li, lj, lk, ll,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ ntile_ij_pairs, ntile_kl_pairs, tile_ij_mapping, tile_kl_mapping,
+ q_cond, dm_cond, cutoff};
+
+ JKMatrix jk = {vj, NULL, dm, (uint16_t)n_dm};
+
+ if (!md_j_unrolled(&envs, &jk, &bounds, scheme, workers, omega)) {
+ int lij = li + lj;
+ int lkl = lk + ll;
+ int threads_ij = scheme[0];
+ int threads_kl = scheme[1];
+ int bsizex = threads_ij * TILEX;
+ int bsizey = threads_kl * TILEY;
+ int nsq_per_block = threads_ij * threads_kl;
+ int gout_stride = scheme[2];
+ dim3 threads(threads_ij, threads_kl, gout_stride);
+ int nf3ij = (lij+1)*(lij+2)*(lij+3)/6;
+ int nf3kl = (lkl+1)*(lkl+2)*(lkl+3)/6;
+ int buflen = (order+1) * nsq_per_block
+ + bsizex * (4+nf3ij) + bsizey * (4+nf3kl)
+ + (order+1)*(order+2)*(order+3)/6 * nsq_per_block;
+ buflen += MAX(order*(order+1)*(order+2)/6, gout_stride) * nsq_per_block;
+ int blocks_ij = (ntile_ij_pairs + bsizex - 1) / bsizex;
+ int blocks_kl = (ntile_kl_pairs + bsizey - 1) / bsizey;
+ dim3 blocks(blocks_ij, blocks_kl);
+ md_j_kernel<<<blocks, threads, buflen*sizeof(double)>>>(envs, jk, bounds);
+ }
+ cudaError_t err = cudaGetLastError();
+ if (err != cudaSuccess) {
+ fprintf(stderr, "CUDA Error in MD_build_j: %s\n", cudaGetErrorString(err));
+ return 1;
+ }
+ return 0;
+}
+
+void init_mdj_constant(int shm_size)
+{
+ Fold2Index i_in_fold2idx[165];
+ Fold3Index i_in_fold3idx[495];
+ int n2 = 0;
+ int n3 = 0;
+ for (int l = 0; l <= LMAX*2; ++l) {
+ for (int i = 0, ijk = 0; i <= l; ++i) {
+ for (int j = 0; j <= l-i; ++j, ++n2) {
+ i_in_fold2idx[n2].x = i;
+ i_in_fold2idx[n2].y = j;
+ i_in_fold2idx[n2].fold3offset = ijk;
+ for (int k = 0; k <= l-i-j; ++k, ++n3, ++ijk) {
+ i_in_fold3idx[n3].x = i;
+ i_in_fold3idx[n3].y = j;
+ i_in_fold3idx[n3].z = k;
+ i_in_fold3idx[n3].fold2yz = (l+1)*(l+2)/2 - (l-j+1)*(l-j+2)/2 + k;
+ }
+ } }
+ }
+ cudaMemcpyToSymbol(c_Rt_idx, Rt_idx, sizeof(Rt_idx)); // this buffer is also reused to store Rt1_idx
+ cudaMemcpyToSymbol(c_Rt_offsets, Rt_idx_offsets, sizeof(Rt_idx_offsets));
+ cudaMemcpyToSymbol(c_i_in_fold2idx, i_in_fold2idx, 165*sizeof(Fold2Index));
+ cudaMemcpyToSymbol(c_i_in_fold3idx, i_in_fold3idx, 495*sizeof(Fold3Index));
+ cudaFuncSetAttribute(md_j_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size);
+ set_md_j_unrolled_shm_size();
+}
+}
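The buflen arithmetic in MD_build_j fixes the dynamic shared-memory request per block. A host-side sketch of the same computation, with assumed illustrative inputs (li = lj = lk = ll = 1, scheme {16, 16, 1}), shows why init_mdj_constant must raise cudaFuncAttributeMaxDynamicSharedMemorySize above the default 48 KB:

```c
/* Host-side sketch of md_j_kernel's shared-memory budget; mirrors the
 * buflen arithmetic in MD_build_j with illustrative inputs. */
#include <stdio.h>

int main(void)
{
    int li = 1, lj = 1, lk = 1, ll = 1;          /* p-shell quartet */
    int threads_ij = 16, threads_kl = 16, gout_stride = 1;
    int lij = li + lj, lkl = lk + ll, order = lij + lkl;
    int bsizex = threads_ij * 2;                 /* TILEX = 2 */
    int bsizey = threads_kl * 4;                 /* TILEY = 4 */
    int nsq = threads_ij * threads_kl;
    int nf3ij = (lij+1)*(lij+2)*(lij+3)/6;       /* 10 */
    int nf3kl = (lkl+1)*(lkl+2)*(lkl+3)/6;       /* 10 */
    int buflen = (order+1) * nsq                 /* gamma_inc */
               + bsizex * (4+nf3ij)              /* Rp + vj_ij caches */
               + bsizey * (4+nf3kl)              /* Rq + vj_kl caches */
               + (order+1)*(order+2)*(order+3)/6 * nsq;  /* Rt */
    int scratch = order*(order+1)*(order+2)/6;   /* buf for the recurrence */
    buflen += (scratch > gout_stride ? scratch : gout_stride) * nsq;
    /* 16704 doubles = 133632 bytes here, well above the default 48 KB */
    printf("dynamic shared memory: %zu bytes\n", sizeof(double) * buflen);
    return 0;
}
```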
diff --git a/gpu4pyscf/lib/gvhf-md/md_pairdata.c b/gpu4pyscf/lib/gvhf-md/md_pairdata.c
new file mode 100644
index 00000000..8b8b7017
--- /dev/null
+++ b/gpu4pyscf/lib/gvhf-md/md_pairdata.c
@@ -0,0 +1,203 @@
+#include <stdlib.h>
+#include <math.h>
+#include "gvhf-rys/vhf.cuh"
+
+#define Ex_at(i,j,t) Ex[(i)*stride1+(j)*stride2+t]
+#define Ey_at(i,j,t) Ey[(i)*stride1+(j)*stride2+t]
+#define Ez_at(i,j,t) Ez[(i)*stride1+(j)*stride2+t]
+
+void get_E_cart_components(double *Ecart, int li, int lj, double ai, double aj,
+ double *Ra, double *Rb)
+{
+ double aij = ai + aj;
+ double xixj = Ra[0] - Rb[0];
+ double yiyj = Ra[1] - Rb[1];
+ double zizj = Ra[2] - Rb[2];
+ double theta_ij = ai * aj / aij;
+ double Kab = exp(-theta_ij * (xixj*xixj + yiyj*yiyj + zizj*zizj));
+ double Xp = (ai * Ra[0] + aj * Rb[0]) / aij;
+ double Yp = (ai * Ra[1] + aj * Rb[1]) / aij;
+ double Zp = (ai * Ra[2] + aj * Rb[2]) / aij;
+ double Xpa = Xp - Ra[0];
+ double Ypa = Yp - Ra[1];
+ double Zpa = Zp - Ra[2];
+ double Xpb = Xp - Rb[0];
+ double Ypb = Yp - Rb[1];
+ double Zpb = Zp - Rb[2];
+ int lij = li + lj;
+ int stride2 = lij+1;
+ int stride1 = (lj+1) * stride2;
+ int Ex_size = (li+1) * stride1;
+ double *Ex = Ecart;
+ double *Ey = Ex + Ex_size;
+ double *Ez = Ey + Ex_size;
+ int i, j, t;
+ double fac, fac1;
+
+ Ex_at(0,0,0) = 1.;
+ Ey_at(0,0,0) = 1.;
+ Ez_at(0,0,0) = Kab;
+ for (t = 1; t <= lij; t++) {
+ Ex_at(0,0,t) = 0.;
+ Ey_at(0,0,t) = 0.;
+ Ez_at(0,0,t) = 0.;
+ }
+
+ for (j = 1; j <= lj; j++) {
+ Ex_at(0,j,0) = Xpb * Ex_at(0,j-1,0) + Ex_at(0,j-1,1);
+ Ey_at(0,j,0) = Ypb * Ey_at(0,j-1,0) + Ey_at(0,j-1,1);
+ Ez_at(0,j,0) = Zpb * Ez_at(0,j-1,0) + Ez_at(0,j-1,1);
+ for (t = 1; t <= lij; t++) {
+ fac = j/(2*aij*t);
+ Ex_at(0,j,t) = fac * Ex_at(0,j-1,t-1);
+ Ey_at(0,j,t) = fac * Ey_at(0,j-1,t-1);
+ Ez_at(0,j,t) = fac * Ez_at(0,j-1,t-1);
+ }
+ }
+
+ for (i = 1; i <= li; i++) {
+ Ex_at(i,0,0) = Xpa * Ex_at(i-1,0,0) + Ex_at(i-1,0,1);
+ Ey_at(i,0,0) = Ypa * Ey_at(i-1,0,0) + Ey_at(i-1,0,1);
+ Ez_at(i,0,0) = Zpa * Ez_at(i-1,0,0) + Ez_at(i-1,0,1);
+ for (t = 1; t <= lij; t++) {
+ fac = i/(2*aij*t);
+ Ex_at(i,0,t) = fac * Ex_at(i-1,0,t-1);
+ Ey_at(i,0,t) = fac * Ey_at(i-1,0,t-1);
+ Ez_at(i,0,t) = fac * Ez_at(i-1,0,t-1);
+ }
+ }
+
+ for (i = 1; i <= li; i++) {
+ for (j = 1; j <= lj; j++) {
+ Ex_at(i,j,0) = Xpb * Ex_at(i,j-1,0) + Ex_at(i,j-1,1);
+ Ey_at(i,j,0) = Ypb * Ey_at(i,j-1,0) + Ey_at(i,j-1,1);
+ Ez_at(i,j,0) = Zpb * Ez_at(i,j-1,0) + Ez_at(i,j-1,1);
+ for (t = 1; t <= lij; t++) {
+ fac = i/(2*aij*t);
+ fac1 = j/(2*aij*t);
+ Ex_at(i,j,t) = fac*Ex_at(i-1,j,t-1) + fac1*Ex_at(i,j-1,t-1);
+ Ey_at(i,j,t) = fac*Ey_at(i-1,j,t-1) + fac1*Ey_at(i,j-1,t-1);
+ Ez_at(i,j,t) = fac*Ez_at(i-1,j,t-1) + fac1*Ez_at(i,j-1,t-1);
+ }
+ }
+ }
+}
+
+// Et is (t,u,v)-major with t+u+v <= li+lj; each (t,u,v) block holds nfi*nfj Cartesian pair entries
+void get_E_tensor(double *Et, int li, int lj, double ai, double aj,
+ double *Ra, double *Rb, double *buf)
+{
+ get_E_cart_components(buf, li, lj, ai, aj, Ra, Rb);
+ int lij = li + lj;
+ int stride2 = lij+1;
+ int stride1 = (lj+1) * stride2;
+ int Ex_size = (li+1) * stride1;
+ double *Ex = buf;
+ double *Ey = Ex + Ex_size;
+ double *Ez = Ey + Ex_size;
+ int t, u, v, n;
+ int ix, iy, iz;
+ int jx, jy, jz;
+
+ n = 0;
+ // products subject to t+u+v <= li+lj
+ for (t = 0; t <= lij; t++) {
+ for (u = 0; u <= lij-t; u++) {
+ for (v = 0; v <= lij-t-u; v++) {
+ for (ix = li; ix >= 0; ix--) {
+ for (iy = li-ix; iy >= 0; iy--) {
+ iz = li - ix - iy;
+ for (jx = lj; jx >= 0; jx--) {
+ for (jy = lj-jx; jy >= 0; jy--) {
+ jz = lj - jx - jy;
+ Et[n] = Ex_at(ix,jx,t) * Ey_at(iy,jy,u) * Ez_at(iz,jz,v);
+ n++;
+ } }
+ } }
+ } } }
+}
+
+void Et_dot_dm(double *Et_dm, double *dm, int *ao_loc, int *pair_loc,
+ int *bas, int nbas, double *env)
+{
+ int l2 = 2*LMAX;
+ int Et_size = (l2+1)*(l2+2)*(l2+3)/6*NCART_MAX*NCART_MAX;
+ int Ex_size = (2*LMAX+1)*(LMAX+1)*(LMAX+1);
+ double *Et = malloc(sizeof(double) * (Et_size+3*Ex_size));
+ double *buf = Et + Et_size;
+
+ size_t nao = ao_loc[nbas];
+ for (int ish = 0; ish < nbas; ish++) {
+ int li = bas[ish*BAS_SLOTS+ANG_OF];
+ int i0 = ao_loc[ish];
+ double ai = env[bas[ish*BAS_SLOTS+PTR_EXP]];
+ double ci = env[bas[ish*BAS_SLOTS+PTR_COEFF]];
+ double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+ for (int jsh = 0; jsh <= ish; jsh++) {
+ int lj = bas[jsh*BAS_SLOTS+ANG_OF];
+ int j0 = ao_loc[jsh];
+ double aj = env[bas[jsh*BAS_SLOTS+PTR_EXP]];
+ double cj = env[bas[jsh*BAS_SLOTS+PTR_COEFF]];
+ double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+ double *rho = Et_dm + pair_loc[ish*nbas+jsh];
+ int lij = li + lj;
+ int nfi = (li + 1) * (li + 2) / 2;
+ int nfj = (lj + 1) * (lj + 2) / 2;
+ int Et_len = (lij + 1) * (lij + 2) * (lij + 3) / 6;
+ get_E_tensor(Et, li, lj, ai, aj, ri, rj, buf);
+ double cc = ci * cj;
+ double *pdm = dm + j0*nao + i0;
+ for (int n = 0, t = 0; t < Et_len; t++) {
+ double rho_t = 0.;
+ for (int i = 0; i < nfi; i++) {
+ for (int j = 0; j < nfj; j++, n++) {
+ rho_t += Et[n] * cc * pdm[j*nao+i];
+ } }
+ rho[t] = rho_t;
+ }
+ }
+ }
+ free(Et);
+}
+
+void jengine_dot_Et(double *vj, double *jvec, int *ao_loc, int *pair_loc,
+ int *bas, int nbas, double *env)
+{
+ int l2 = 2*LMAX;
+ int Et_size = (l2+1)*(l2+2)*(l2+3)/6*NCART_MAX*NCART_MAX;
+ int Ex_size = (2*LMAX+1)*(LMAX+1)*(LMAX+1);
+ double *Et = malloc(sizeof(double) * (Et_size+3*Ex_size));
+ double *buf = Et + Et_size;
+
+ size_t nao = ao_loc[nbas];
+ for (int ish = 0; ish < nbas; ish++) {
+ int li = bas[ish*BAS_SLOTS+ANG_OF];
+ int i0 = ao_loc[ish];
+ double ai = env[bas[ish*BAS_SLOTS+PTR_EXP]];
+ double ci = env[bas[ish*BAS_SLOTS+PTR_COEFF]];
+ double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+ for (int jsh = 0; jsh <= ish; jsh++) {
+ int lj = bas[jsh*BAS_SLOTS+ANG_OF];
+ int j0 = ao_loc[jsh];
+ double aj = env[bas[jsh*BAS_SLOTS+PTR_EXP]];
+ double cj = env[bas[jsh*BAS_SLOTS+PTR_COEFF]];
+ double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+ double *jvec_ij = jvec + pair_loc[ish*nbas+jsh];
+ int lij = li + lj;
+ int nfi = (li + 1) * (li + 2) / 2;
+ int nfj = (lj + 1) * (lj + 2) / 2;
+ int Et_len = (lij + 1) * (lij + 2) * (lij + 3) / 6;
+ get_E_tensor(Et, li, lj, ai, aj, ri, rj, buf);
+ double cc = ci * cj;
+ double *pj = vj + i0*nao+j0;
+ for (int n = 0, t = 0; t < Et_len; t++) {
+ double fac = cc * jvec_ij[t];
+ for (int i = 0; i < nfi; i++) {
+ for (int j = 0; j < nfj; j++, n++) {
+ pj[i*nao+j] += Et[n] * fac;
+ } }
+ }
+ }
+ }
+ free(Et);
+}
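As a quick sanity check on get_E_cart_components: for li = lj = 0 the buffer holds one Ex, Ey, Ez entry each, with Ez carrying the Gaussian-product prefactor Kab, so the overlap of two unnormalized s Gaussians is (pi/aij)^(3/2) * Ex * Ey * Ez. A minimal sketch with illustrative exponents and centers (the driver itself is hypothetical; link against md_pairdata.c):

```c
#include <math.h>
#include <stdio.h>

/* prototype of the routine defined in md_pairdata.c */
void get_E_cart_components(double *Ecart, int li, int lj, double ai, double aj,
                           double *Ra, double *Rb);

int main(void)
{
    double Ra[3] = {0., 0., 0.};
    double Rb[3] = {0., 0., 1.2};        /* illustrative geometry (Bohr) */
    double ai = 0.8, aj = 1.1;           /* illustrative exponents */
    double aij = ai + aj;
    double E[3];                         /* li = lj = 0: one Ex, Ey, Ez each */
    get_E_cart_components(E, 0, 0, ai, aj, Ra, Rb);
    double s = pow(M_PI / aij, 1.5) * E[0] * E[1] * E[2];
    printf("unnormalized <s|s> overlap: %.10f\n", s);
    return 0;
}
```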
diff --git a/gpu4pyscf/lib/gvhf-md/unrolled_md_j.cu b/gpu4pyscf/lib/gvhf-md/unrolled_md_j.cu
new file mode 100644
index 00000000..61a679f3
--- /dev/null
+++ b/gpu4pyscf/lib/gvhf-md/unrolled_md_j.cu
@@ -0,0 +1,5077 @@
+#include "gvhf-rys/vhf.cuh"
+#include "gvhf-rys/gamma_inc_unrolled.cu"
+
+
+// TILEX=16, TILEY=16, cache_dm=True
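+// (ss|ss) class: order 0, one Hermite/density component per shell pair,
+// and only the zeroth-order gamma_inc value is required.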
+__global__
+void md_j_0_0(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds)
+{
+ int *pair_ij_mapping = bounds.tile_ij_mapping;
+ int *pair_kl_mapping = bounds.tile_kl_mapping;
+ int task_ij0 = blockIdx.x * 256;
+ int task_kl0 = blockIdx.y * 256;
+ int pair_ij0 = pair_ij_mapping[task_ij0];
+ int pair_kl0 = pair_kl_mapping[task_kl0];
+ float *q_cond = bounds.q_cond;
+ if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) {
+ return;
+ }
+
+ int tx = threadIdx.x;
+ int ty = threadIdx.y;
+ int sq_id = tx + 16 * ty;
+ int *bas = envs.bas;
+ int *dm_pair_loc = envs.ao_loc;
+ int nbas = envs.nbas;
+ double *env = envs.env;
+ double *dm = jk.dm;
+ double *vj = jk.vj;
+ double vj_ij, vj_kl;
+
+ int npairs_ij = bounds.npairs_ij;
+ int npairs_kl = bounds.npairs_kl;
+ extern __shared__ double gamma_inc[];
+ double *Rp_cache = gamma_inc + 256;
+ double *Rq_cache = Rp_cache + 1024;
+ double *vj_ij_cache = Rq_cache + 1024;
+ double *vj_kl_cache = vj_ij_cache + 256;
+ double *vj_cache = vj_kl_cache + 256;
+ double *dm_ij_cache = vj_cache + 256;
+ double *dm_kl_cache = dm_ij_cache + 256;
+ // zero out all cache;
+ for (int n = sq_id; n < 3328; n += 256) {
+ Rp_cache[n] = 0.;
+ }
+ __syncthreads();
+
+ if (sq_id < 256) {
+ int task_ij = blockIdx.x * 256 + sq_id;
+ if (task_ij < npairs_ij) {
+ int pair_ij = pair_ij_mapping[task_ij];
+ int ish = pair_ij / nbas;
+ int jsh = pair_ij % nbas;
+ double ai = env[bas[ish*BAS_SLOTS+PTR_EXP]];
+ double aj = env[bas[jsh*BAS_SLOTS+PTR_EXP]];
+ double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+ double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+ double aij = ai + aj;
+ double xij = (ai * ri[0] + aj * rj[0]) / aij;
+ double yij = (ai * ri[1] + aj * rj[1]) / aij;
+ double zij = (ai * ri[2] + aj * rj[2]) / aij;
+ Rp_cache[sq_id+0] = xij;
+ Rp_cache[sq_id+256] = yij;
+ Rp_cache[sq_id+512] = zij;
+ Rp_cache[sq_id+768] = aij;
+ } else {
+ Rp_cache[sq_id+768] = 1.;
+ }
+ }
+ if (sq_id < 256) {
+ int task_kl = blockIdx.y * 256 + sq_id;
+ if (task_kl < npairs_kl) {
+ int pair_kl = pair_kl_mapping[task_kl];
+ int ksh = pair_kl / nbas;
+ int lsh = pair_kl % nbas;
+ double ak = env[bas[ksh*BAS_SLOTS+PTR_EXP]];
+ double al = env[bas[lsh*BAS_SLOTS+PTR_EXP]];
+ double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+ double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD];
+ double akl = ak + al;
+ double xkl = (ak * rk[0] + al * rl[0]) / akl;
+ double ykl = (ak * rk[1] + al * rl[1]) / akl;
+ double zkl = (ak * rk[2] + al * rl[2]) / akl;
+ Rq_cache[sq_id+0] = xkl;
+ Rq_cache[sq_id+256] = ykl;
+ Rq_cache[sq_id+512] = zkl;
+ Rq_cache[sq_id+768] = akl;
+ } else {
+ Rq_cache[sq_id+768] = 1.;
+ }
+ }
+ for (int n = ty; n < 16; n += 16) {
+ int i = n / 16;
+ int tile = n % 16;
+ int task_ij = blockIdx.x * 256 + tile * 16 + tx;
+ if (task_ij < npairs_ij) {
+ int pair_ij = pair_ij_mapping[task_ij];
+ int dm_ij_pair0 = dm_pair_loc[pair_ij];
+ int sq_ij = tx + tile * 16;
+ dm_ij_cache[sq_ij+i*256] = dm[dm_ij_pair0+i];
+ }
+ }
+ for (int n = tx; n < 16; n += 16) {
+ int i = n / 16;
+ int tile = n % 16;
+ int task_kl = blockIdx.y * 256 + tile * 16 + ty;
+ if (task_kl < npairs_kl) {
+ int pair_kl = pair_kl_mapping[task_kl];
+ int dm_kl_pair0 = dm_pair_loc[pair_kl];
+ int sq_kl = ty + tile * 16;
+ dm_kl_cache[sq_kl+i*256] = dm[dm_kl_pair0+i];
+ }
+ }
+ __syncthreads();
+
+ for (int batch_ij = 0; batch_ij < 16; ++batch_ij) {
+ for (int batch_kl = 0; batch_kl < 16; ++batch_kl) {
+ int task_ij0 = blockIdx.x * 256 + batch_ij * 16;
+ int task_kl0 = blockIdx.y * 256 + batch_kl * 16;
+ if (task_ij0 >= npairs_ij || task_kl0 >= npairs_kl) {
+ continue;
+ }
+ int pair_ij0 = pair_ij_mapping[task_ij0];
+ int pair_kl0 = pair_kl_mapping[task_kl0];
+ if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) {
+ continue;
+ }
+
+ int sq_ij = tx + batch_ij * 16;
+ int sq_kl = ty + batch_kl * 16;
+ int task_ij = task_ij0 + tx;
+ int task_kl = task_kl0 + ty;
+ double fac_sym = PI_FAC;
+ if (task_ij >= npairs_ij) {
+ task_ij = task_ij0;
+ fac_sym = 0.;
+ }
+ if (task_kl >= npairs_kl) {
+ task_kl = task_kl0;
+ fac_sym = 0.;
+ }
+ int pair_ij = pair_ij_mapping[task_ij];
+ int pair_kl = pair_kl_mapping[task_kl];
+
+ int ish = pair_ij / nbas;
+ int jsh = pair_ij % nbas;
+ int ksh = pair_kl / nbas;
+ int lsh = pair_kl % nbas;
+ if (ish == jsh) fac_sym *= .5;
+ if (ksh == lsh) fac_sym *= .5;
+ if (pair_ij_mapping == pair_kl_mapping) {
+ if (task_ij == task_kl) fac_sym *= .5;
+ if (task_ij < task_kl) fac_sym = 0.;
+ }
+ double xij = Rp_cache[sq_ij+0];
+ double yij = Rp_cache[sq_ij+256];
+ double zij = Rp_cache[sq_ij+512];
+ double aij = Rp_cache[sq_ij+768];
+ double xkl = Rq_cache[sq_kl+0];
+ double ykl = Rq_cache[sq_kl+256];
+ double zkl = Rq_cache[sq_kl+512];
+ double akl = Rq_cache[sq_kl+768];
+ double fac = fac_sym / (aij*akl*sqrt(aij+akl));
+ double xpq = xij - xkl;
+ double ypq = yij - ykl;
+ double zpq = zij - zkl;
+ double rr = xpq*xpq + ypq*ypq + zpq*zpq;
+ double theta = aij * akl / (aij + akl);
+ double theta_rr = theta * rr;
+ eval_gamma_inc_fn(gamma_inc, theta_rr, 0);
+ gamma_inc[sq_id] *= fac;
+ vj_kl = 0.;
+ vj_kl += gamma_inc[sq_id+0*256] * dm_ij_cache[sq_ij+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_kl;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (tx < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride];
+ }
+ }
+ __syncthreads();
+ if (tx == 0 && task_kl0+ty < npairs_kl) {
+ vj_kl_cache[sq_kl+0] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += gamma_inc[sq_id+0*256] * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+0] += vj_cache[sq_id];
+ }
+ __syncthreads();
+ } }
+ for (int n = ty; n < 16; n += 16) {
+ int i = n / 16;
+ int tile = n % 16;
+ int task_ij = blockIdx.x * 256 + tile * 16 + tx;
+ if (task_ij < npairs_ij) {
+ int pair_ij = pair_ij_mapping[task_ij];
+ int dm_ij_pair0 = dm_pair_loc[pair_ij];
+ int sq_ij = tx + tile * 16;
+ atomicAdd(vj+dm_ij_pair0+i, vj_ij_cache[sq_ij+i*256]);
+ }
+ }
+ for (int n = tx; n < 16; n += 16) {
+ int i = n / 16;
+ int tile = n % 16;
+ int task_kl = blockIdx.y * 256 + tile * 16 + ty;
+ if (task_kl < npairs_kl) {
+ int pair_kl = pair_kl_mapping[task_kl];
+ int dm_kl_pair0 = dm_pair_loc[pair_kl];
+ int sq_kl = ty + tile * 16;
+ atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*256]);
+ }
+ }
+}
+
+// TILEX=16, TILEY=16, cache_dm=True
+__global__
+void md_j_1_0(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds)
+{
+ int *pair_ij_mapping = bounds.tile_ij_mapping;
+ int *pair_kl_mapping = bounds.tile_kl_mapping;
+ int task_ij0 = blockIdx.x * 256;
+ int task_kl0 = blockIdx.y * 256;
+ int pair_ij0 = pair_ij_mapping[task_ij0];
+ int pair_kl0 = pair_kl_mapping[task_kl0];
+ float *q_cond = bounds.q_cond;
+ if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) {
+ return;
+ }
+
+ int tx = threadIdx.x;
+ int ty = threadIdx.y;
+ int sq_id = tx + 16 * ty;
+ int *bas = envs.bas;
+ int *dm_pair_loc = envs.ao_loc;
+ int nbas = envs.nbas;
+ double *env = envs.env;
+ double *dm = jk.dm;
+ double *vj = jk.vj;
+ double vj_ij, vj_kl;
+
+ int npairs_ij = bounds.npairs_ij;
+ int npairs_kl = bounds.npairs_kl;
+ extern __shared__ double gamma_inc[];
+ double *Rp_cache = gamma_inc + 512;
+ double *Rq_cache = Rp_cache + 1024;
+ double *vj_ij_cache = Rq_cache + 1024;
+ double *vj_kl_cache = vj_ij_cache + 1024;
+ double *vj_cache = vj_kl_cache + 256;
+ double *dm_ij_cache = vj_cache + 256;
+ double *dm_kl_cache = dm_ij_cache + 1024;
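+    // shared-memory layout (doubles): gamma_inc[2*256] | Rp[4*256] |
+    // Rq[4*256] | vj_ij[4*256] | vj_kl[256] | vj_cache[256] | dm_ij[4*256] |
+    // dm_kl[256]; vj_ij/vj_kl are flushed to global vj via atomicAdd in the
+    // epilogue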
+    // zero out all shared-memory caches (Rp/Rq, vj accumulators, dm blocks)
+ for (int n = sq_id; n < 4864; n += 256) {
+ Rp_cache[n] = 0.;
+ }
+ __syncthreads();
+
+ if (sq_id < 256) {
+ int task_ij = blockIdx.x * 256 + sq_id;
+ if (task_ij < npairs_ij) {
+ int pair_ij = pair_ij_mapping[task_ij];
+ int ish = pair_ij / nbas;
+ int jsh = pair_ij % nbas;
+ double ai = env[bas[ish*BAS_SLOTS+PTR_EXP]];
+ double aj = env[bas[jsh*BAS_SLOTS+PTR_EXP]];
+ double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+ double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+ double aij = ai + aj;
+ double xij = (ai * ri[0] + aj * rj[0]) / aij;
+ double yij = (ai * ri[1] + aj * rj[1]) / aij;
+ double zij = (ai * ri[2] + aj * rj[2]) / aij;
+ Rp_cache[sq_id+0] = xij;
+ Rp_cache[sq_id+256] = yij;
+ Rp_cache[sq_id+512] = zij;
+ Rp_cache[sq_id+768] = aij;
+ } else {
+ Rp_cache[sq_id+768] = 1.;
+ }
+ }
+ if (sq_id < 256) {
+ int task_kl = blockIdx.y * 256 + sq_id;
+ if (task_kl < npairs_kl) {
+ int pair_kl = pair_kl_mapping[task_kl];
+ int ksh = pair_kl / nbas;
+ int lsh = pair_kl % nbas;
+ double ak = env[bas[ksh*BAS_SLOTS+PTR_EXP]];
+ double al = env[bas[lsh*BAS_SLOTS+PTR_EXP]];
+ double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+ double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD];
+ double akl = ak + al;
+ double xkl = (ak * rk[0] + al * rl[0]) / akl;
+ double ykl = (ak * rk[1] + al * rl[1]) / akl;
+ double zkl = (ak * rk[2] + al * rl[2]) / akl;
+ Rq_cache[sq_id+0] = xkl;
+ Rq_cache[sq_id+256] = ykl;
+ Rq_cache[sq_id+512] = zkl;
+ Rq_cache[sq_id+768] = akl;
+ } else {
+ Rq_cache[sq_id+768] = 1.;
+ }
+ }
+ for (int n = ty; n < 64; n += 16) {
+ int i = n / 16;
+ int tile = n % 16;
+ int task_ij = blockIdx.x * 256 + tile * 16 + tx;
+ if (task_ij < npairs_ij) {
+ int pair_ij = pair_ij_mapping[task_ij];
+ int dm_ij_pair0 = dm_pair_loc[pair_ij];
+ int sq_ij = tx + tile * 16;
+ dm_ij_cache[sq_ij+i*256] = dm[dm_ij_pair0+i];
+ }
+ }
+ for (int n = tx; n < 16; n += 16) {
+ int i = n / 16;
+ int tile = n % 16;
+ int task_kl = blockIdx.y * 256 + tile * 16 + ty;
+ if (task_kl < npairs_kl) {
+ int pair_kl = pair_kl_mapping[task_kl];
+ int dm_kl_pair0 = dm_pair_loc[pair_kl];
+ int sq_kl = ty + tile * 16;
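+            // vj_kl_cache was zeroed just above, so this atomicAdd is a
+            // no-op; it looks like generator residue mirroring the final
+            // flush (the same pattern recurs in the kernels below)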
+ atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*256]);
+ dm_kl_cache[sq_kl+i*256] = dm[dm_kl_pair0+i];
+ }
+ }
+ __syncthreads();
+
+ for (int batch_ij = 0; batch_ij < 16; ++batch_ij) {
+ for (int batch_kl = 0; batch_kl < 16; ++batch_kl) {
+ int task_ij0 = blockIdx.x * 256 + batch_ij * 16;
+ int task_kl0 = blockIdx.y * 256 + batch_kl * 16;
+ if (task_ij0 >= npairs_ij || task_kl0 >= npairs_kl) {
+ continue;
+ }
+ int pair_ij0 = pair_ij_mapping[task_ij0];
+ int pair_kl0 = pair_kl_mapping[task_kl0];
+ if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) {
+ continue;
+ }
+
+ int sq_ij = tx + batch_ij * 16;
+ int sq_kl = ty + batch_kl * 16;
+ int task_ij = task_ij0 + tx;
+ int task_kl = task_kl0 + ty;
+ double fac_sym = PI_FAC;
+ if (task_ij >= npairs_ij) {
+ task_ij = task_ij0;
+ fac_sym = 0.;
+ }
+ if (task_kl >= npairs_kl) {
+ task_kl = task_kl0;
+ fac_sym = 0.;
+ }
+ int pair_ij = pair_ij_mapping[task_ij];
+ int pair_kl = pair_kl_mapping[task_kl];
+
+ int ish = pair_ij / nbas;
+ int jsh = pair_ij % nbas;
+ int ksh = pair_kl / nbas;
+ int lsh = pair_kl % nbas;
+ if (ish == jsh) fac_sym *= .5;
+ if (ksh == lsh) fac_sym *= .5;
+ if (pair_ij_mapping == pair_kl_mapping) {
+ if (task_ij == task_kl) fac_sym *= .5;
+ if (task_ij < task_kl) fac_sym = 0.;
+ }
+ double xij = Rp_cache[sq_ij+0];
+ double yij = Rp_cache[sq_ij+256];
+ double zij = Rp_cache[sq_ij+512];
+ double aij = Rp_cache[sq_ij+768];
+ double xkl = Rq_cache[sq_kl+0];
+ double ykl = Rq_cache[sq_kl+256];
+ double zkl = Rq_cache[sq_kl+512];
+ double akl = Rq_cache[sq_kl+768];
+ double fac = fac_sym / (aij*akl*sqrt(aij+akl));
+ double xpq = xij - xkl;
+ double ypq = yij - ykl;
+ double zpq = zij - zkl;
+ double rr = xpq*xpq + ypq*ypq + zpq*zpq;
+ double theta = aij * akl / (aij + akl);
+ double theta_rr = theta * rr;
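+        // incomplete gamma (Boys-like) values F_m(theta*rr) for m = 0..1;
+        // the loop below rescales gamma_inc[m] to fac * (-2*theta)^m * F_m,
+        // i.e. the R^m_{0,0,0} seeds of the recursion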
+ eval_gamma_inc_fn(gamma_inc, theta_rr, 1);
+ double a2 = -2. * theta;
+ gamma_inc[sq_id] *= fac;
+ for (int i = 1; i <= 1; i++) {
+ fac *= a2;
+ gamma_inc[sq_id+i*256] *= fac;
+ }
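+        // contract the R tensors with the ij-side density block in shared
+        // memory; each thread owns one (ij, kl) task pair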
+ vj_kl = 0.;
+ vj_kl += gamma_inc[sq_id+0*256] * dm_ij_cache[sq_ij+0];
+ double R_0_0_0_1 = zpq * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_0_0_1 * dm_ij_cache[sq_ij+256];
+ double R_0_0_1_0 = ypq * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_0_1_0 * dm_ij_cache[sq_ij+512];
+ double R_0_1_0_0 = xpq * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_1_0_0 * dm_ij_cache[sq_ij+768];
+ __syncthreads();
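+        // tree reduction over the 16 tx lanes; each ty row owns one kl task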
+ vj_cache[sq_id] = vj_kl;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (tx < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride];
+ }
+ }
+ __syncthreads();
+ if (tx == 0 && task_kl0+ty < npairs_kl) {
+ vj_kl_cache[sq_kl+0] += vj_cache[sq_id];
+ }
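+        // mirror contraction: fold the kl-side density into vj_ij and
+        // reduce over the 16 ty lanes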
+ vj_ij = 0.;
+ vj_ij += gamma_inc[sq_id+0*256] * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+0] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_0_1 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+256] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_1_0 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+512] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_1_0_0 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+768] += vj_cache[sq_id];
+ }
+ __syncthreads();
+    }
+    }
+ for (int n = ty; n < 64; n += 16) {
+ int i = n / 16;
+ int tile = n % 16;
+ int task_ij = blockIdx.x * 256 + tile * 16 + tx;
+ if (task_ij < npairs_ij) {
+ int pair_ij = pair_ij_mapping[task_ij];
+ int dm_ij_pair0 = dm_pair_loc[pair_ij];
+ int sq_ij = tx + tile * 16;
+ atomicAdd(vj+dm_ij_pair0+i, vj_ij_cache[sq_ij+i*256]);
+ }
+ }
+ for (int n = tx; n < 16; n += 16) {
+ int i = n / 16;
+ int tile = n % 16;
+ int task_kl = blockIdx.y * 256 + tile * 16 + ty;
+ if (task_kl < npairs_kl) {
+ int pair_kl = pair_kl_mapping[task_kl];
+ int dm_kl_pair0 = dm_pair_loc[pair_kl];
+ int sq_kl = ty + tile * 16;
+ atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*256]);
+ }
+ }
+}
+
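+// md_j_1_1: 4 Hermite components on each side. Contracting from the ket side
+// flips the sign of odd-order R terms (the ket-side Hermite indices pick up
+// a factor (-1)^{t+u+v}), hence the -= accumulations below.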
+// TILEX=16, TILEY=16, cache_dm=True
+__global__
+void md_j_1_1(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds)
+{
+ int *pair_ij_mapping = bounds.tile_ij_mapping;
+ int *pair_kl_mapping = bounds.tile_kl_mapping;
+ int task_ij0 = blockIdx.x * 256;
+ int task_kl0 = blockIdx.y * 256;
+ int pair_ij0 = pair_ij_mapping[task_ij0];
+ int pair_kl0 = pair_kl_mapping[task_kl0];
+ float *q_cond = bounds.q_cond;
+ if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) {
+ return;
+ }
+
+ int tx = threadIdx.x;
+ int ty = threadIdx.y;
+ int sq_id = tx + 16 * ty;
+ int *bas = envs.bas;
+ int *dm_pair_loc = envs.ao_loc;
+ int nbas = envs.nbas;
+ double *env = envs.env;
+ double *dm = jk.dm;
+ double *vj = jk.vj;
+ double vj_ij, vj_kl;
+
+ int npairs_ij = bounds.npairs_ij;
+ int npairs_kl = bounds.npairs_kl;
+ extern __shared__ double gamma_inc[];
+ double *Rp_cache = gamma_inc + 768;
+ double *Rq_cache = Rp_cache + 1024;
+ double *vj_ij_cache = Rq_cache + 1024;
+ double *vj_kl_cache = vj_ij_cache + 1024;
+ double *vj_cache = vj_kl_cache + 1024;
+ double *dm_ij_cache = vj_cache + 256;
+ double *dm_kl_cache = dm_ij_cache + 1024;
+    // zero out all shared-memory caches (Rp/Rq, vj accumulators, dm blocks)
+ for (int n = sq_id; n < 6400; n += 256) {
+ Rp_cache[n] = 0.;
+ }
+ __syncthreads();
+
+ if (sq_id < 256) {
+ int task_ij = blockIdx.x * 256 + sq_id;
+ if (task_ij < npairs_ij) {
+ int pair_ij = pair_ij_mapping[task_ij];
+ int ish = pair_ij / nbas;
+ int jsh = pair_ij % nbas;
+ double ai = env[bas[ish*BAS_SLOTS+PTR_EXP]];
+ double aj = env[bas[jsh*BAS_SLOTS+PTR_EXP]];
+ double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+ double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+ double aij = ai + aj;
+ double xij = (ai * ri[0] + aj * rj[0]) / aij;
+ double yij = (ai * ri[1] + aj * rj[1]) / aij;
+ double zij = (ai * ri[2] + aj * rj[2]) / aij;
+ Rp_cache[sq_id+0] = xij;
+ Rp_cache[sq_id+256] = yij;
+ Rp_cache[sq_id+512] = zij;
+ Rp_cache[sq_id+768] = aij;
+ } else {
+ Rp_cache[sq_id+768] = 1.;
+ }
+ }
+ if (sq_id < 256) {
+ int task_kl = blockIdx.y * 256 + sq_id;
+ if (task_kl < npairs_kl) {
+ int pair_kl = pair_kl_mapping[task_kl];
+ int ksh = pair_kl / nbas;
+ int lsh = pair_kl % nbas;
+ double ak = env[bas[ksh*BAS_SLOTS+PTR_EXP]];
+ double al = env[bas[lsh*BAS_SLOTS+PTR_EXP]];
+ double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+ double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD];
+ double akl = ak + al;
+ double xkl = (ak * rk[0] + al * rl[0]) / akl;
+ double ykl = (ak * rk[1] + al * rl[1]) / akl;
+ double zkl = (ak * rk[2] + al * rl[2]) / akl;
+ Rq_cache[sq_id+0] = xkl;
+ Rq_cache[sq_id+256] = ykl;
+ Rq_cache[sq_id+512] = zkl;
+ Rq_cache[sq_id+768] = akl;
+ } else {
+ Rq_cache[sq_id+768] = 1.;
+ }
+ }
+ for (int n = ty; n < 64; n += 16) {
+ int i = n / 16;
+ int tile = n % 16;
+ int task_ij = blockIdx.x * 256 + tile * 16 + tx;
+ if (task_ij < npairs_ij) {
+ int pair_ij = pair_ij_mapping[task_ij];
+ int dm_ij_pair0 = dm_pair_loc[pair_ij];
+ int sq_ij = tx + tile * 16;
+ dm_ij_cache[sq_ij+i*256] = dm[dm_ij_pair0+i];
+ }
+ }
+ for (int n = tx; n < 64; n += 16) {
+ int i = n / 16;
+ int tile = n % 16;
+ int task_kl = blockIdx.y * 256 + tile * 16 + ty;
+ if (task_kl < npairs_kl) {
+ int pair_kl = pair_kl_mapping[task_kl];
+ int dm_kl_pair0 = dm_pair_loc[pair_kl];
+ int sq_kl = ty + tile * 16;
+ atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*256]);
+ dm_kl_cache[sq_kl+i*256] = dm[dm_kl_pair0+i];
+ }
+ }
+ __syncthreads();
+
+ for (int batch_ij = 0; batch_ij < 16; ++batch_ij) {
+ for (int batch_kl = 0; batch_kl < 16; ++batch_kl) {
+ int task_ij0 = blockIdx.x * 256 + batch_ij * 16;
+ int task_kl0 = blockIdx.y * 256 + batch_kl * 16;
+ if (task_ij0 >= npairs_ij || task_kl0 >= npairs_kl) {
+ continue;
+ }
+ int pair_ij0 = pair_ij_mapping[task_ij0];
+ int pair_kl0 = pair_kl_mapping[task_kl0];
+ if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) {
+ continue;
+ }
+
+ int sq_ij = tx + batch_ij * 16;
+ int sq_kl = ty + batch_kl * 16;
+ int task_ij = task_ij0 + tx;
+ int task_kl = task_kl0 + ty;
+ double fac_sym = PI_FAC;
+ if (task_ij >= npairs_ij) {
+ task_ij = task_ij0;
+ fac_sym = 0.;
+ }
+ if (task_kl >= npairs_kl) {
+ task_kl = task_kl0;
+ fac_sym = 0.;
+ }
+ int pair_ij = pair_ij_mapping[task_ij];
+ int pair_kl = pair_kl_mapping[task_kl];
+
+ int ish = pair_ij / nbas;
+ int jsh = pair_ij % nbas;
+ int ksh = pair_kl / nbas;
+ int lsh = pair_kl % nbas;
+ if (ish == jsh) fac_sym *= .5;
+ if (ksh == lsh) fac_sym *= .5;
+ if (pair_ij_mapping == pair_kl_mapping) {
+ if (task_ij == task_kl) fac_sym *= .5;
+ if (task_ij < task_kl) fac_sym = 0.;
+ }
+ double xij = Rp_cache[sq_ij+0];
+ double yij = Rp_cache[sq_ij+256];
+ double zij = Rp_cache[sq_ij+512];
+ double aij = Rp_cache[sq_ij+768];
+ double xkl = Rq_cache[sq_kl+0];
+ double ykl = Rq_cache[sq_kl+256];
+ double zkl = Rq_cache[sq_kl+512];
+ double akl = Rq_cache[sq_kl+768];
+ double fac = fac_sym / (aij*akl*sqrt(aij+akl));
+ double xpq = xij - xkl;
+ double ypq = yij - ykl;
+ double zpq = zij - zkl;
+ double rr = xpq*xpq + ypq*ypq + zpq*zpq;
+ double theta = aij * akl / (aij + akl);
+ double theta_rr = theta * rr;
+ eval_gamma_inc_fn(gamma_inc, theta_rr, 2);
+ double a2 = -2. * theta;
+ gamma_inc[sq_id] *= fac;
+ for (int i = 1; i <= 2; i++) {
+ fac *= a2;
+ gamma_inc[sq_id+i*256] *= fac;
+ }
+ vj_kl = 0.;
+ vj_kl += gamma_inc[sq_id+0*256] * dm_ij_cache[sq_ij+0];
+ double R_0_0_0_1 = zpq * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_0_0_1 * dm_ij_cache[sq_ij+256];
+ double R_0_0_1_0 = ypq * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_0_1_0 * dm_ij_cache[sq_ij+512];
+ double R_0_1_0_0 = xpq * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_1_0_0 * dm_ij_cache[sq_ij+768];
+ __syncthreads();
+ vj_cache[sq_id] = vj_kl;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (tx < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride];
+ }
+ }
+ __syncthreads();
+ if (tx == 0 && task_kl0+ty < npairs_kl) {
+ vj_kl_cache[sq_kl+0] += vj_cache[sq_id];
+ }
+ vj_kl = 0.;
+ vj_kl -= R_0_0_0_1 * dm_ij_cache[sq_ij+0];
+ double R_1_0_0_1 = zpq * gamma_inc[sq_id+2*256];
+ double R_0_0_0_2 = zpq * R_1_0_0_1 + 1 * gamma_inc[sq_id+1*256];
+ vj_kl -= R_0_0_0_2 * dm_ij_cache[sq_ij+256];
+ double R_0_0_1_1 = ypq * R_1_0_0_1;
+ vj_kl -= R_0_0_1_1 * dm_ij_cache[sq_ij+512];
+ double R_0_1_0_1 = xpq * R_1_0_0_1;
+ vj_kl -= R_0_1_0_1 * dm_ij_cache[sq_ij+768];
+ __syncthreads();
+ vj_cache[sq_id] = vj_kl;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (tx < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride];
+ }
+ }
+ __syncthreads();
+ if (tx == 0 && task_kl0+ty < npairs_kl) {
+ vj_kl_cache[sq_kl+256] += vj_cache[sq_id];
+ }
+ vj_kl = 0.;
+ vj_kl -= R_0_0_1_0 * dm_ij_cache[sq_ij+0];
+ vj_kl -= R_0_0_1_1 * dm_ij_cache[sq_ij+256];
+ double R_1_0_1_0 = ypq * gamma_inc[sq_id+2*256];
+ double R_0_0_2_0 = ypq * R_1_0_1_0 + 1 * gamma_inc[sq_id+1*256];
+ vj_kl -= R_0_0_2_0 * dm_ij_cache[sq_ij+512];
+ double R_0_1_1_0 = xpq * R_1_0_1_0;
+ vj_kl -= R_0_1_1_0 * dm_ij_cache[sq_ij+768];
+ __syncthreads();
+ vj_cache[sq_id] = vj_kl;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (tx < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride];
+ }
+ }
+ __syncthreads();
+ if (tx == 0 && task_kl0+ty < npairs_kl) {
+ vj_kl_cache[sq_kl+512] += vj_cache[sq_id];
+ }
+ vj_kl = 0.;
+ vj_kl -= R_0_1_0_0 * dm_ij_cache[sq_ij+0];
+ vj_kl -= R_0_1_0_1 * dm_ij_cache[sq_ij+256];
+ vj_kl -= R_0_1_1_0 * dm_ij_cache[sq_ij+512];
+ double R_1_1_0_0 = xpq * gamma_inc[sq_id+2*256];
+ double R_0_2_0_0 = xpq * R_1_1_0_0 + 1 * gamma_inc[sq_id+1*256];
+ vj_kl -= R_0_2_0_0 * dm_ij_cache[sq_ij+768];
+ __syncthreads();
+ vj_cache[sq_id] = vj_kl;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (tx < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride];
+ }
+ }
+ __syncthreads();
+ if (tx == 0 && task_kl0+ty < npairs_kl) {
+ vj_kl_cache[sq_kl+768] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += gamma_inc[sq_id+0*256] * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_0_0_1 * dm_kl_cache[sq_kl+256];
+ vj_ij -= R_0_0_1_0 * dm_kl_cache[sq_kl+512];
+ vj_ij -= R_0_1_0_0 * dm_kl_cache[sq_kl+768];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+0] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_0_1 * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_0_0_2 * dm_kl_cache[sq_kl+256];
+ vj_ij -= R_0_0_1_1 * dm_kl_cache[sq_kl+512];
+ vj_ij -= R_0_1_0_1 * dm_kl_cache[sq_kl+768];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+256] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_1_0 * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_0_1_1 * dm_kl_cache[sq_kl+256];
+ vj_ij -= R_0_0_2_0 * dm_kl_cache[sq_kl+512];
+ vj_ij -= R_0_1_1_0 * dm_kl_cache[sq_kl+768];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+512] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_1_0_0 * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_1_0_1 * dm_kl_cache[sq_kl+256];
+ vj_ij -= R_0_1_1_0 * dm_kl_cache[sq_kl+512];
+ vj_ij -= R_0_2_0_0 * dm_kl_cache[sq_kl+768];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+768] += vj_cache[sq_id];
+ }
+ __syncthreads();
+    }
+    }
+ for (int n = ty; n < 64; n += 16) {
+ int i = n / 16;
+ int tile = n % 16;
+ int task_ij = blockIdx.x * 256 + tile * 16 + tx;
+ if (task_ij < npairs_ij) {
+ int pair_ij = pair_ij_mapping[task_ij];
+ int dm_ij_pair0 = dm_pair_loc[pair_ij];
+ int sq_ij = tx + tile * 16;
+ atomicAdd(vj+dm_ij_pair0+i, vj_ij_cache[sq_ij+i*256]);
+ }
+ }
+ for (int n = tx; n < 64; n += 16) {
+ int i = n / 16;
+ int tile = n % 16;
+ int task_kl = blockIdx.y * 256 + tile * 16 + ty;
+ if (task_kl < npairs_kl) {
+ int pair_kl = pair_kl_mapping[task_kl];
+ int dm_kl_pair0 = dm_pair_loc[pair_kl];
+ int sq_kl = ty + tile * 16;
+ atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*256]);
+ }
+ }
+}
+
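+// md_j_1_2: TILEY drops to 8, so the kl side holds 128 tasks per block and
+// all kl-side strides become 128; the l=2 ket carries 10 Hermite components
+// (offsets 0..1152 in vj_kl_cache / dm_kl_cache).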
+// TILEX=16, TILEY=8, cache_dm=True
+__global__
+void md_j_1_2(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds)
+{
+ int *pair_ij_mapping = bounds.tile_ij_mapping;
+ int *pair_kl_mapping = bounds.tile_kl_mapping;
+ int task_ij0 = blockIdx.x * 256;
+ int task_kl0 = blockIdx.y * 128;
+ int pair_ij0 = pair_ij_mapping[task_ij0];
+ int pair_kl0 = pair_kl_mapping[task_kl0];
+ float *q_cond = bounds.q_cond;
+ if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) {
+ return;
+ }
+
+ int tx = threadIdx.x;
+ int ty = threadIdx.y;
+ int sq_id = tx + 16 * ty;
+ int *bas = envs.bas;
+ int *dm_pair_loc = envs.ao_loc;
+ int nbas = envs.nbas;
+ double *env = envs.env;
+ double *dm = jk.dm;
+ double *vj = jk.vj;
+ double vj_ij, vj_kl;
+
+ int npairs_ij = bounds.npairs_ij;
+ int npairs_kl = bounds.npairs_kl;
+ extern __shared__ double gamma_inc[];
+ double *Rp_cache = gamma_inc + 1024;
+ double *Rq_cache = Rp_cache + 1024;
+ double *vj_ij_cache = Rq_cache + 512;
+ double *vj_kl_cache = vj_ij_cache + 1024;
+ double *vj_cache = vj_kl_cache + 1280;
+ double *dm_ij_cache = vj_cache + 256;
+ double *dm_kl_cache = dm_ij_cache + 1024;
+    // zero out all shared-memory caches (Rp/Rq, vj accumulators, dm blocks)
+ for (int n = sq_id; n < 6400; n += 256) {
+ Rp_cache[n] = 0.;
+ }
+ __syncthreads();
+
+ if (sq_id < 256) {
+ int task_ij = blockIdx.x * 256 + sq_id;
+ if (task_ij < npairs_ij) {
+ int pair_ij = pair_ij_mapping[task_ij];
+ int ish = pair_ij / nbas;
+ int jsh = pair_ij % nbas;
+ double ai = env[bas[ish*BAS_SLOTS+PTR_EXP]];
+ double aj = env[bas[jsh*BAS_SLOTS+PTR_EXP]];
+ double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+ double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+ double aij = ai + aj;
+ double xij = (ai * ri[0] + aj * rj[0]) / aij;
+ double yij = (ai * ri[1] + aj * rj[1]) / aij;
+ double zij = (ai * ri[2] + aj * rj[2]) / aij;
+ Rp_cache[sq_id+0] = xij;
+ Rp_cache[sq_id+256] = yij;
+ Rp_cache[sq_id+512] = zij;
+ Rp_cache[sq_id+768] = aij;
+ } else {
+ Rp_cache[sq_id+768] = 1.;
+ }
+ }
+ if (sq_id < 128) {
+ int task_kl = blockIdx.y * 128 + sq_id;
+ if (task_kl < npairs_kl) {
+ int pair_kl = pair_kl_mapping[task_kl];
+ int ksh = pair_kl / nbas;
+ int lsh = pair_kl % nbas;
+ double ak = env[bas[ksh*BAS_SLOTS+PTR_EXP]];
+ double al = env[bas[lsh*BAS_SLOTS+PTR_EXP]];
+ double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+ double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD];
+ double akl = ak + al;
+ double xkl = (ak * rk[0] + al * rl[0]) / akl;
+ double ykl = (ak * rk[1] + al * rl[1]) / akl;
+ double zkl = (ak * rk[2] + al * rl[2]) / akl;
+ Rq_cache[sq_id+0] = xkl;
+ Rq_cache[sq_id+128] = ykl;
+ Rq_cache[sq_id+256] = zkl;
+ Rq_cache[sq_id+384] = akl;
+ } else {
+ Rq_cache[sq_id+384] = 1.;
+ }
+ }
+ for (int n = ty; n < 64; n += 16) {
+ int i = n / 16;
+ int tile = n % 16;
+ int task_ij = blockIdx.x * 256 + tile * 16 + tx;
+ if (task_ij < npairs_ij) {
+ int pair_ij = pair_ij_mapping[task_ij];
+ int dm_ij_pair0 = dm_pair_loc[pair_ij];
+ int sq_ij = tx + tile * 16;
+ dm_ij_cache[sq_ij+i*256] = dm[dm_ij_pair0+i];
+ }
+ }
+ for (int n = tx; n < 80; n += 16) {
+ int i = n / 8;
+ int tile = n % 8;
+ int task_kl = blockIdx.y * 128 + tile * 16 + ty;
+ if (task_kl < npairs_kl) {
+ int pair_kl = pair_kl_mapping[task_kl];
+ int dm_kl_pair0 = dm_pair_loc[pair_kl];
+ int sq_kl = ty + tile * 16;
+ atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*128]);
+ dm_kl_cache[sq_kl+i*128] = dm[dm_kl_pair0+i];
+ }
+ }
+ __syncthreads();
+
+ for (int batch_ij = 0; batch_ij < 16; ++batch_ij) {
+ for (int batch_kl = 0; batch_kl < 8; ++batch_kl) {
+ int task_ij0 = blockIdx.x * 256 + batch_ij * 16;
+ int task_kl0 = blockIdx.y * 128 + batch_kl * 16;
+ if (task_ij0 >= npairs_ij || task_kl0 >= npairs_kl) {
+ continue;
+ }
+ int pair_ij0 = pair_ij_mapping[task_ij0];
+ int pair_kl0 = pair_kl_mapping[task_kl0];
+ if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) {
+ continue;
+ }
+
+ int sq_ij = tx + batch_ij * 16;
+ int sq_kl = ty + batch_kl * 16;
+ int task_ij = task_ij0 + tx;
+ int task_kl = task_kl0 + ty;
+ double fac_sym = PI_FAC;
+ if (task_ij >= npairs_ij) {
+ task_ij = task_ij0;
+ fac_sym = 0.;
+ }
+ if (task_kl >= npairs_kl) {
+ task_kl = task_kl0;
+ fac_sym = 0.;
+ }
+ int pair_ij = pair_ij_mapping[task_ij];
+ int pair_kl = pair_kl_mapping[task_kl];
+
+ int ish = pair_ij / nbas;
+ int jsh = pair_ij % nbas;
+ int ksh = pair_kl / nbas;
+ int lsh = pair_kl % nbas;
+ if (ish == jsh) fac_sym *= .5;
+ if (ksh == lsh) fac_sym *= .5;
+ if (pair_ij_mapping == pair_kl_mapping) {
+ if (task_ij == task_kl) fac_sym *= .5;
+ if (task_ij < task_kl) fac_sym = 0.;
+ }
+ double xij = Rp_cache[sq_ij+0];
+ double yij = Rp_cache[sq_ij+256];
+ double zij = Rp_cache[sq_ij+512];
+ double aij = Rp_cache[sq_ij+768];
+ double xkl = Rq_cache[sq_kl+0];
+ double ykl = Rq_cache[sq_kl+128];
+ double zkl = Rq_cache[sq_kl+256];
+ double akl = Rq_cache[sq_kl+384];
+ double fac = fac_sym / (aij*akl*sqrt(aij+akl));
+ double xpq = xij - xkl;
+ double ypq = yij - ykl;
+ double zpq = zij - zkl;
+ double rr = xpq*xpq + ypq*ypq + zpq*zpq;
+ double theta = aij * akl / (aij + akl);
+ double theta_rr = theta * rr;
+ eval_gamma_inc_fn(gamma_inc, theta_rr, 3);
+ double a2 = -2. * theta;
+ gamma_inc[sq_id] *= fac;
+ for (int i = 1; i <= 3; i++) {
+ fac *= a2;
+ gamma_inc[sq_id+i*256] *= fac;
+ }
+ vj_kl = 0.;
+ vj_kl += gamma_inc[sq_id+0*256] * dm_ij_cache[sq_ij+0];
+ double R_0_0_0_1 = zpq * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_0_0_1 * dm_ij_cache[sq_ij+256];
+ double R_0_0_1_0 = ypq * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_0_1_0 * dm_ij_cache[sq_ij+512];
+ double R_0_1_0_0 = xpq * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_1_0_0 * dm_ij_cache[sq_ij+768];
+ __syncthreads();
+ vj_cache[sq_id] = vj_kl;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (tx < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride];
+ }
+ }
+ __syncthreads();
+ if (tx == 0 && task_kl0+ty < npairs_kl) {
+ vj_kl_cache[sq_kl+0] += vj_cache[sq_id];
+ }
+ vj_kl = 0.;
+ vj_kl -= R_0_0_0_1 * dm_ij_cache[sq_ij+0];
+ double R_1_0_0_1 = zpq * gamma_inc[sq_id+2*256];
+ double R_0_0_0_2 = zpq * R_1_0_0_1 + 1 * gamma_inc[sq_id+1*256];
+ vj_kl -= R_0_0_0_2 * dm_ij_cache[sq_ij+256];
+ double R_0_0_1_1 = ypq * R_1_0_0_1;
+ vj_kl -= R_0_0_1_1 * dm_ij_cache[sq_ij+512];
+ double R_0_1_0_1 = xpq * R_1_0_0_1;
+ vj_kl -= R_0_1_0_1 * dm_ij_cache[sq_ij+768];
+ __syncthreads();
+ vj_cache[sq_id] = vj_kl;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (tx < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride];
+ }
+ }
+ __syncthreads();
+ if (tx == 0 && task_kl0+ty < npairs_kl) {
+ vj_kl_cache[sq_kl+128] += vj_cache[sq_id];
+ }
+ vj_kl = 0.;
+ vj_kl += R_0_0_0_2 * dm_ij_cache[sq_ij+0];
+ double R_2_0_0_1 = zpq * gamma_inc[sq_id+3*256];
+ double R_1_0_0_2 = zpq * R_2_0_0_1 + 1 * gamma_inc[sq_id+2*256];
+ double R_0_0_0_3 = zpq * R_1_0_0_2 + 2 * R_1_0_0_1;
+ vj_kl += R_0_0_0_3 * dm_ij_cache[sq_ij+256];
+ double R_0_0_1_2 = ypq * R_1_0_0_2;
+ vj_kl += R_0_0_1_2 * dm_ij_cache[sq_ij+512];
+ double R_0_1_0_2 = xpq * R_1_0_0_2;
+ vj_kl += R_0_1_0_2 * dm_ij_cache[sq_ij+768];
+ __syncthreads();
+ vj_cache[sq_id] = vj_kl;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (tx < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride];
+ }
+ }
+ __syncthreads();
+ if (tx == 0 && task_kl0+ty < npairs_kl) {
+ vj_kl_cache[sq_kl+256] += vj_cache[sq_id];
+ }
+ vj_kl = 0.;
+ vj_kl -= R_0_0_1_0 * dm_ij_cache[sq_ij+0];
+ vj_kl -= R_0_0_1_1 * dm_ij_cache[sq_ij+256];
+ double R_1_0_1_0 = ypq * gamma_inc[sq_id+2*256];
+ double R_0_0_2_0 = ypq * R_1_0_1_0 + 1 * gamma_inc[sq_id+1*256];
+ vj_kl -= R_0_0_2_0 * dm_ij_cache[sq_ij+512];
+ double R_0_1_1_0 = xpq * R_1_0_1_0;
+ vj_kl -= R_0_1_1_0 * dm_ij_cache[sq_ij+768];
+ __syncthreads();
+ vj_cache[sq_id] = vj_kl;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (tx < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride];
+ }
+ }
+ __syncthreads();
+ if (tx == 0 && task_kl0+ty < npairs_kl) {
+ vj_kl_cache[sq_kl+384] += vj_cache[sq_id];
+ }
+ vj_kl = 0.;
+ vj_kl += R_0_0_1_1 * dm_ij_cache[sq_ij+0];
+ vj_kl += R_0_0_1_2 * dm_ij_cache[sq_ij+256];
+ double R_1_0_1_1 = ypq * R_2_0_0_1;
+ double R_0_0_2_1 = ypq * R_1_0_1_1 + 1 * R_1_0_0_1;
+ vj_kl += R_0_0_2_1 * dm_ij_cache[sq_ij+512];
+ double R_0_1_1_1 = xpq * R_1_0_1_1;
+ vj_kl += R_0_1_1_1 * dm_ij_cache[sq_ij+768];
+ __syncthreads();
+ vj_cache[sq_id] = vj_kl;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (tx < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride];
+ }
+ }
+ __syncthreads();
+ if (tx == 0 && task_kl0+ty < npairs_kl) {
+ vj_kl_cache[sq_kl+512] += vj_cache[sq_id];
+ }
+ vj_kl = 0.;
+ vj_kl += R_0_0_2_0 * dm_ij_cache[sq_ij+0];
+ vj_kl += R_0_0_2_1 * dm_ij_cache[sq_ij+256];
+ double R_2_0_1_0 = ypq * gamma_inc[sq_id+3*256];
+ double R_1_0_2_0 = ypq * R_2_0_1_0 + 1 * gamma_inc[sq_id+2*256];
+ double R_0_0_3_0 = ypq * R_1_0_2_0 + 2 * R_1_0_1_0;
+ vj_kl += R_0_0_3_0 * dm_ij_cache[sq_ij+512];
+ double R_0_1_2_0 = xpq * R_1_0_2_0;
+ vj_kl += R_0_1_2_0 * dm_ij_cache[sq_ij+768];
+ __syncthreads();
+ vj_cache[sq_id] = vj_kl;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (tx < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride];
+ }
+ }
+ __syncthreads();
+ if (tx == 0 && task_kl0+ty < npairs_kl) {
+ vj_kl_cache[sq_kl+640] += vj_cache[sq_id];
+ }
+ vj_kl = 0.;
+ vj_kl -= R_0_1_0_0 * dm_ij_cache[sq_ij+0];
+ vj_kl -= R_0_1_0_1 * dm_ij_cache[sq_ij+256];
+ vj_kl -= R_0_1_1_0 * dm_ij_cache[sq_ij+512];
+ double R_1_1_0_0 = xpq * gamma_inc[sq_id+2*256];
+ double R_0_2_0_0 = xpq * R_1_1_0_0 + 1 * gamma_inc[sq_id+1*256];
+ vj_kl -= R_0_2_0_0 * dm_ij_cache[sq_ij+768];
+ __syncthreads();
+ vj_cache[sq_id] = vj_kl;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (tx < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride];
+ }
+ }
+ __syncthreads();
+ if (tx == 0 && task_kl0+ty < npairs_kl) {
+ vj_kl_cache[sq_kl+768] += vj_cache[sq_id];
+ }
+ vj_kl = 0.;
+ vj_kl += R_0_1_0_1 * dm_ij_cache[sq_ij+0];
+ vj_kl += R_0_1_0_2 * dm_ij_cache[sq_ij+256];
+ vj_kl += R_0_1_1_1 * dm_ij_cache[sq_ij+512];
+ double R_1_1_0_1 = xpq * R_2_0_0_1;
+ double R_0_2_0_1 = xpq * R_1_1_0_1 + 1 * R_1_0_0_1;
+ vj_kl += R_0_2_0_1 * dm_ij_cache[sq_ij+768];
+ __syncthreads();
+ vj_cache[sq_id] = vj_kl;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (tx < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride];
+ }
+ }
+ __syncthreads();
+ if (tx == 0 && task_kl0+ty < npairs_kl) {
+ vj_kl_cache[sq_kl+896] += vj_cache[sq_id];
+ }
+ vj_kl = 0.;
+ vj_kl += R_0_1_1_0 * dm_ij_cache[sq_ij+0];
+ vj_kl += R_0_1_1_1 * dm_ij_cache[sq_ij+256];
+ vj_kl += R_0_1_2_0 * dm_ij_cache[sq_ij+512];
+ double R_1_1_1_0 = xpq * R_2_0_1_0;
+ double R_0_2_1_0 = xpq * R_1_1_1_0 + 1 * R_1_0_1_0;
+ vj_kl += R_0_2_1_0 * dm_ij_cache[sq_ij+768];
+ __syncthreads();
+ vj_cache[sq_id] = vj_kl;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (tx < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride];
+ }
+ }
+ __syncthreads();
+ if (tx == 0 && task_kl0+ty < npairs_kl) {
+ vj_kl_cache[sq_kl+1024] += vj_cache[sq_id];
+ }
+ vj_kl = 0.;
+ vj_kl += R_0_2_0_0 * dm_ij_cache[sq_ij+0];
+ vj_kl += R_0_2_0_1 * dm_ij_cache[sq_ij+256];
+ vj_kl += R_0_2_1_0 * dm_ij_cache[sq_ij+512];
+ double R_2_1_0_0 = xpq * gamma_inc[sq_id+3*256];
+ double R_1_2_0_0 = xpq * R_2_1_0_0 + 1 * gamma_inc[sq_id+2*256];
+ double R_0_3_0_0 = xpq * R_1_2_0_0 + 2 * R_1_1_0_0;
+ vj_kl += R_0_3_0_0 * dm_ij_cache[sq_ij+768];
+ __syncthreads();
+ vj_cache[sq_id] = vj_kl;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (tx < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride];
+ }
+ }
+ __syncthreads();
+ if (tx == 0 && task_kl0+ty < npairs_kl) {
+ vj_kl_cache[sq_kl+1152] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += gamma_inc[sq_id+0*256] * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_0_0_1 * dm_kl_cache[sq_kl+128];
+ vj_ij += R_0_0_0_2 * dm_kl_cache[sq_kl+256];
+ vj_ij -= R_0_0_1_0 * dm_kl_cache[sq_kl+384];
+ vj_ij += R_0_0_1_1 * dm_kl_cache[sq_kl+512];
+ vj_ij += R_0_0_2_0 * dm_kl_cache[sq_kl+640];
+ vj_ij -= R_0_1_0_0 * dm_kl_cache[sq_kl+768];
+ vj_ij += R_0_1_0_1 * dm_kl_cache[sq_kl+896];
+ vj_ij += R_0_1_1_0 * dm_kl_cache[sq_kl+1024];
+ vj_ij += R_0_2_0_0 * dm_kl_cache[sq_kl+1152];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+0] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_0_1 * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_0_0_2 * dm_kl_cache[sq_kl+128];
+ vj_ij += R_0_0_0_3 * dm_kl_cache[sq_kl+256];
+ vj_ij -= R_0_0_1_1 * dm_kl_cache[sq_kl+384];
+ vj_ij += R_0_0_1_2 * dm_kl_cache[sq_kl+512];
+ vj_ij += R_0_0_2_1 * dm_kl_cache[sq_kl+640];
+ vj_ij -= R_0_1_0_1 * dm_kl_cache[sq_kl+768];
+ vj_ij += R_0_1_0_2 * dm_kl_cache[sq_kl+896];
+ vj_ij += R_0_1_1_1 * dm_kl_cache[sq_kl+1024];
+ vj_ij += R_0_2_0_1 * dm_kl_cache[sq_kl+1152];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+256] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_1_0 * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_0_1_1 * dm_kl_cache[sq_kl+128];
+ vj_ij += R_0_0_1_2 * dm_kl_cache[sq_kl+256];
+ vj_ij -= R_0_0_2_0 * dm_kl_cache[sq_kl+384];
+ vj_ij += R_0_0_2_1 * dm_kl_cache[sq_kl+512];
+ vj_ij += R_0_0_3_0 * dm_kl_cache[sq_kl+640];
+ vj_ij -= R_0_1_1_0 * dm_kl_cache[sq_kl+768];
+ vj_ij += R_0_1_1_1 * dm_kl_cache[sq_kl+896];
+ vj_ij += R_0_1_2_0 * dm_kl_cache[sq_kl+1024];
+ vj_ij += R_0_2_1_0 * dm_kl_cache[sq_kl+1152];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+512] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_1_0_0 * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_1_0_1 * dm_kl_cache[sq_kl+128];
+ vj_ij += R_0_1_0_2 * dm_kl_cache[sq_kl+256];
+ vj_ij -= R_0_1_1_0 * dm_kl_cache[sq_kl+384];
+ vj_ij += R_0_1_1_1 * dm_kl_cache[sq_kl+512];
+ vj_ij += R_0_1_2_0 * dm_kl_cache[sq_kl+640];
+ vj_ij -= R_0_2_0_0 * dm_kl_cache[sq_kl+768];
+ vj_ij += R_0_2_0_1 * dm_kl_cache[sq_kl+896];
+ vj_ij += R_0_2_1_0 * dm_kl_cache[sq_kl+1024];
+ vj_ij += R_0_3_0_0 * dm_kl_cache[sq_kl+1152];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+768] += vj_cache[sq_id];
+ }
+ __syncthreads();
+    }
+    }
+ for (int n = ty; n < 64; n += 16) {
+ int i = n / 16;
+ int tile = n % 16;
+ int task_ij = blockIdx.x * 256 + tile * 16 + tx;
+ if (task_ij < npairs_ij) {
+ int pair_ij = pair_ij_mapping[task_ij];
+ int dm_ij_pair0 = dm_pair_loc[pair_ij];
+ int sq_ij = tx + tile * 16;
+ atomicAdd(vj+dm_ij_pair0+i, vj_ij_cache[sq_ij+i*256]);
+ }
+ }
+ for (int n = tx; n < 80; n += 16) {
+ int i = n / 8;
+ int tile = n % 8;
+ int task_kl = blockIdx.y * 128 + tile * 16 + ty;
+ if (task_kl < npairs_kl) {
+ int pair_kl = pair_kl_mapping[task_kl];
+ int dm_kl_pair0 = dm_pair_loc[pair_kl];
+ int sq_kl = ty + tile * 16;
+ atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*128]);
+ }
+ }
+}
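+// md_j_2_0: l=2 bra with 10 Hermite components (dm_ij_cache offsets 0..2304)
+// against a single s-type ket component; TILEX=TILEY=16.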
+
+// TILEX=16, TILEY=16, cache_dm=True
+__global__
+void md_j_2_0(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds)
+{
+ int *pair_ij_mapping = bounds.tile_ij_mapping;
+ int *pair_kl_mapping = bounds.tile_kl_mapping;
+ int task_ij0 = blockIdx.x * 256;
+ int task_kl0 = blockIdx.y * 256;
+ int pair_ij0 = pair_ij_mapping[task_ij0];
+ int pair_kl0 = pair_kl_mapping[task_kl0];
+ float *q_cond = bounds.q_cond;
+ if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) {
+ return;
+ }
+
+ int tx = threadIdx.x;
+ int ty = threadIdx.y;
+ int sq_id = tx + 16 * ty;
+ int *bas = envs.bas;
+ int *dm_pair_loc = envs.ao_loc;
+ int nbas = envs.nbas;
+ double *env = envs.env;
+ double *dm = jk.dm;
+ double *vj = jk.vj;
+ double vj_ij, vj_kl;
+
+ int npairs_ij = bounds.npairs_ij;
+ int npairs_kl = bounds.npairs_kl;
+ extern __shared__ double gamma_inc[];
+ double *Rp_cache = gamma_inc + 768;
+ double *Rq_cache = Rp_cache + 1024;
+ double *vj_ij_cache = Rq_cache + 1024;
+ double *vj_kl_cache = vj_ij_cache + 2560;
+ double *vj_cache = vj_kl_cache + 256;
+ double *dm_ij_cache = vj_cache + 256;
+ double *dm_kl_cache = dm_ij_cache + 2560;
+    // zero out all shared-memory caches (Rp/Rq, vj accumulators, dm blocks)
+ for (int n = sq_id; n < 7936; n += 256) {
+ Rp_cache[n] = 0.;
+ }
+ __syncthreads();
+
+ if (sq_id < 256) {
+ int task_ij = blockIdx.x * 256 + sq_id;
+ if (task_ij < npairs_ij) {
+ int pair_ij = pair_ij_mapping[task_ij];
+ int ish = pair_ij / nbas;
+ int jsh = pair_ij % nbas;
+ double ai = env[bas[ish*BAS_SLOTS+PTR_EXP]];
+ double aj = env[bas[jsh*BAS_SLOTS+PTR_EXP]];
+ double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+ double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+ double aij = ai + aj;
+ double xij = (ai * ri[0] + aj * rj[0]) / aij;
+ double yij = (ai * ri[1] + aj * rj[1]) / aij;
+ double zij = (ai * ri[2] + aj * rj[2]) / aij;
+ Rp_cache[sq_id+0] = xij;
+ Rp_cache[sq_id+256] = yij;
+ Rp_cache[sq_id+512] = zij;
+ Rp_cache[sq_id+768] = aij;
+ } else {
+ Rp_cache[sq_id+768] = 1.;
+ }
+ }
+ if (sq_id < 256) {
+ int task_kl = blockIdx.y * 256 + sq_id;
+ if (task_kl < npairs_kl) {
+ int pair_kl = pair_kl_mapping[task_kl];
+ int ksh = pair_kl / nbas;
+ int lsh = pair_kl % nbas;
+ double ak = env[bas[ksh*BAS_SLOTS+PTR_EXP]];
+ double al = env[bas[lsh*BAS_SLOTS+PTR_EXP]];
+ double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+ double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD];
+ double akl = ak + al;
+ double xkl = (ak * rk[0] + al * rl[0]) / akl;
+ double ykl = (ak * rk[1] + al * rl[1]) / akl;
+ double zkl = (ak * rk[2] + al * rl[2]) / akl;
+ Rq_cache[sq_id+0] = xkl;
+ Rq_cache[sq_id+256] = ykl;
+ Rq_cache[sq_id+512] = zkl;
+ Rq_cache[sq_id+768] = akl;
+ } else {
+ Rq_cache[sq_id+768] = 1.;
+ }
+ }
+ for (int n = ty; n < 160; n += 16) {
+ int i = n / 16;
+ int tile = n % 16;
+ int task_ij = blockIdx.x * 256 + tile * 16 + tx;
+ if (task_ij < npairs_ij) {
+ int pair_ij = pair_ij_mapping[task_ij];
+ int dm_ij_pair0 = dm_pair_loc[pair_ij];
+ int sq_ij = tx + tile * 16;
+ dm_ij_cache[sq_ij+i*256] = dm[dm_ij_pair0+i];
+ }
+ }
+ for (int n = tx; n < 16; n += 16) {
+ int i = n / 16;
+ int tile = n % 16;
+ int task_kl = blockIdx.y * 256 + tile * 16 + ty;
+ if (task_kl < npairs_kl) {
+ int pair_kl = pair_kl_mapping[task_kl];
+ int dm_kl_pair0 = dm_pair_loc[pair_kl];
+ int sq_kl = ty + tile * 16;
+ atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*256]);
+ dm_kl_cache[sq_kl+i*256] = dm[dm_kl_pair0+i];
+ }
+ }
+ __syncthreads();
+
+ for (int batch_ij = 0; batch_ij < 16; ++batch_ij) {
+ for (int batch_kl = 0; batch_kl < 16; ++batch_kl) {
+ int task_ij0 = blockIdx.x * 256 + batch_ij * 16;
+ int task_kl0 = blockIdx.y * 256 + batch_kl * 16;
+ if (task_ij0 >= npairs_ij || task_kl0 >= npairs_kl) {
+ continue;
+ }
+ int pair_ij0 = pair_ij_mapping[task_ij0];
+ int pair_kl0 = pair_kl_mapping[task_kl0];
+ if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) {
+ continue;
+ }
+
+ int sq_ij = tx + batch_ij * 16;
+ int sq_kl = ty + batch_kl * 16;
+ int task_ij = task_ij0 + tx;
+ int task_kl = task_kl0 + ty;
+ double fac_sym = PI_FAC;
+ if (task_ij >= npairs_ij) {
+ task_ij = task_ij0;
+ fac_sym = 0.;
+ }
+ if (task_kl >= npairs_kl) {
+ task_kl = task_kl0;
+ fac_sym = 0.;
+ }
+ int pair_ij = pair_ij_mapping[task_ij];
+ int pair_kl = pair_kl_mapping[task_kl];
+
+ int ish = pair_ij / nbas;
+ int jsh = pair_ij % nbas;
+ int ksh = pair_kl / nbas;
+ int lsh = pair_kl % nbas;
+ if (ish == jsh) fac_sym *= .5;
+ if (ksh == lsh) fac_sym *= .5;
+ if (pair_ij_mapping == pair_kl_mapping) {
+ if (task_ij == task_kl) fac_sym *= .5;
+ if (task_ij < task_kl) fac_sym = 0.;
+ }
+ double xij = Rp_cache[sq_ij+0];
+ double yij = Rp_cache[sq_ij+256];
+ double zij = Rp_cache[sq_ij+512];
+ double aij = Rp_cache[sq_ij+768];
+ double xkl = Rq_cache[sq_kl+0];
+ double ykl = Rq_cache[sq_kl+256];
+ double zkl = Rq_cache[sq_kl+512];
+ double akl = Rq_cache[sq_kl+768];
+ double fac = fac_sym / (aij*akl*sqrt(aij+akl));
+ double xpq = xij - xkl;
+ double ypq = yij - ykl;
+ double zpq = zij - zkl;
+ double rr = xpq*xpq + ypq*ypq + zpq*zpq;
+ double theta = aij * akl / (aij + akl);
+ double theta_rr = theta * rr;
+ eval_gamma_inc_fn(gamma_inc, theta_rr, 2);
+ double a2 = -2. * theta;
+ gamma_inc[sq_id] *= fac;
+ for (int i = 1; i <= 2; i++) {
+ fac *= a2;
+ gamma_inc[sq_id+i*256] *= fac;
+ }
+ vj_kl = 0.;
+ vj_kl += gamma_inc[sq_id+0*256] * dm_ij_cache[sq_ij+0];
+ double R_0_0_0_1 = zpq * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_0_0_1 * dm_ij_cache[sq_ij+256];
+ double R_1_0_0_1 = zpq * gamma_inc[sq_id+2*256];
+ double R_0_0_0_2 = zpq * R_1_0_0_1 + 1 * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_0_0_2 * dm_ij_cache[sq_ij+512];
+ double R_0_0_1_0 = ypq * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_0_1_0 * dm_ij_cache[sq_ij+768];
+ double R_0_0_1_1 = ypq * R_1_0_0_1;
+ vj_kl += R_0_0_1_1 * dm_ij_cache[sq_ij+1024];
+ double R_1_0_1_0 = ypq * gamma_inc[sq_id+2*256];
+ double R_0_0_2_0 = ypq * R_1_0_1_0 + 1 * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_0_2_0 * dm_ij_cache[sq_ij+1280];
+ double R_0_1_0_0 = xpq * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_1_0_0 * dm_ij_cache[sq_ij+1536];
+ double R_0_1_0_1 = xpq * R_1_0_0_1;
+ vj_kl += R_0_1_0_1 * dm_ij_cache[sq_ij+1792];
+ double R_0_1_1_0 = xpq * R_1_0_1_0;
+ vj_kl += R_0_1_1_0 * dm_ij_cache[sq_ij+2048];
+ double R_1_1_0_0 = xpq * gamma_inc[sq_id+2*256];
+ double R_0_2_0_0 = xpq * R_1_1_0_0 + 1 * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_2_0_0 * dm_ij_cache[sq_ij+2304];
+ __syncthreads();
+ vj_cache[sq_id] = vj_kl;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (tx < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride];
+ }
+ }
+ __syncthreads();
+ if (tx == 0 && task_kl0+ty < npairs_kl) {
+ vj_kl_cache[sq_kl+0] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += gamma_inc[sq_id+0*256] * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+0] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_0_1 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+256] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_0_2 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+512] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_1_0 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+768] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_1_1 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+1024] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_2_0 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+1280] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_1_0_0 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+1536] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_1_0_1 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+1792] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_1_1_0 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+2048] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_2_0_0 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+2304] += vj_cache[sq_id];
+ }
+ __syncthreads();
+    }
+    }
+ for (int n = ty; n < 160; n += 16) {
+ int i = n / 16;
+ int tile = n % 16;
+ int task_ij = blockIdx.x * 256 + tile * 16 + tx;
+ if (task_ij < npairs_ij) {
+ int pair_ij = pair_ij_mapping[task_ij];
+ int dm_ij_pair0 = dm_pair_loc[pair_ij];
+ int sq_ij = tx + tile * 16;
+ atomicAdd(vj+dm_ij_pair0+i, vj_ij_cache[sq_ij+i*256]);
+ }
+ }
+ for (int n = tx; n < 16; n += 16) {
+ int i = n / 16;
+ int tile = n % 16;
+ int task_kl = blockIdx.y * 256 + tile * 16 + ty;
+ if (task_kl < npairs_kl) {
+ int pair_kl = pair_kl_mapping[task_kl];
+ int dm_kl_pair0 = dm_pair_loc[pair_kl];
+ int sq_kl = ty + tile * 16;
+ atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*256]);
+ }
+ }
+}
+
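+// md_j_2_1: (l_ij, l_kl) = (2, 1); 10 bra components against 4 ket
+// components, with TILEY=8 (kl-side strides of 128).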
+// TILEX=16, TILEY=8, cache_dm=True
+__global__
+void md_j_2_1(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds)
+{
+ int *pair_ij_mapping = bounds.tile_ij_mapping;
+ int *pair_kl_mapping = bounds.tile_kl_mapping;
+ int task_ij0 = blockIdx.x * 256;
+ int task_kl0 = blockIdx.y * 128;
+ int pair_ij0 = pair_ij_mapping[task_ij0];
+ int pair_kl0 = pair_kl_mapping[task_kl0];
+ float *q_cond = bounds.q_cond;
+ if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) {
+ return;
+ }
+
+ int tx = threadIdx.x;
+ int ty = threadIdx.y;
+ int sq_id = tx + 16 * ty;
+ int *bas = envs.bas;
+ int *dm_pair_loc = envs.ao_loc;
+ int nbas = envs.nbas;
+ double *env = envs.env;
+ double *dm = jk.dm;
+ double *vj = jk.vj;
+ double vj_ij, vj_kl;
+
+ int npairs_ij = bounds.npairs_ij;
+ int npairs_kl = bounds.npairs_kl;
+ extern __shared__ double gamma_inc[];
+ double *Rp_cache = gamma_inc + 1024;
+ double *Rq_cache = Rp_cache + 1024;
+ double *vj_ij_cache = Rq_cache + 512;
+ double *vj_kl_cache = vj_ij_cache + 2560;
+ double *vj_cache = vj_kl_cache + 512;
+ double *dm_ij_cache = vj_cache + 256;
+ double *dm_kl_cache = dm_ij_cache + 2560;
+    // zero out all shared-memory caches (Rp/Rq, vj accumulators, dm blocks)
+ for (int n = sq_id; n < 7936; n += 256) {
+ Rp_cache[n] = 0.;
+ }
+ __syncthreads();
+
+ if (sq_id < 256) {
+ int task_ij = blockIdx.x * 256 + sq_id;
+ if (task_ij < npairs_ij) {
+ int pair_ij = pair_ij_mapping[task_ij];
+ int ish = pair_ij / nbas;
+ int jsh = pair_ij % nbas;
+ double ai = env[bas[ish*BAS_SLOTS+PTR_EXP]];
+ double aj = env[bas[jsh*BAS_SLOTS+PTR_EXP]];
+ double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+ double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+ double aij = ai + aj;
+ double xij = (ai * ri[0] + aj * rj[0]) / aij;
+ double yij = (ai * ri[1] + aj * rj[1]) / aij;
+ double zij = (ai * ri[2] + aj * rj[2]) / aij;
+ Rp_cache[sq_id+0] = xij;
+ Rp_cache[sq_id+256] = yij;
+ Rp_cache[sq_id+512] = zij;
+ Rp_cache[sq_id+768] = aij;
+ } else {
+ Rp_cache[sq_id+768] = 1.;
+ }
+ }
+ if (sq_id < 128) {
+ int task_kl = blockIdx.y * 128 + sq_id;
+ if (task_kl < npairs_kl) {
+ int pair_kl = pair_kl_mapping[task_kl];
+ int ksh = pair_kl / nbas;
+ int lsh = pair_kl % nbas;
+ double ak = env[bas[ksh*BAS_SLOTS+PTR_EXP]];
+ double al = env[bas[lsh*BAS_SLOTS+PTR_EXP]];
+ double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+ double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD];
+ double akl = ak + al;
+ double xkl = (ak * rk[0] + al * rl[0]) / akl;
+ double ykl = (ak * rk[1] + al * rl[1]) / akl;
+ double zkl = (ak * rk[2] + al * rl[2]) / akl;
+ Rq_cache[sq_id+0] = xkl;
+ Rq_cache[sq_id+128] = ykl;
+ Rq_cache[sq_id+256] = zkl;
+ Rq_cache[sq_id+384] = akl;
+ } else {
+ Rq_cache[sq_id+384] = 1.;
+ }
+ }
+ for (int n = ty; n < 160; n += 16) {
+ int i = n / 16;
+ int tile = n % 16;
+ int task_ij = blockIdx.x * 256 + tile * 16 + tx;
+ if (task_ij < npairs_ij) {
+ int pair_ij = pair_ij_mapping[task_ij];
+ int dm_ij_pair0 = dm_pair_loc[pair_ij];
+ int sq_ij = tx + tile * 16;
+ dm_ij_cache[sq_ij+i*256] = dm[dm_ij_pair0+i];
+ }
+ }
+ for (int n = tx; n < 32; n += 16) {
+ int i = n / 8;
+ int tile = n % 8;
+ int task_kl = blockIdx.y * 128 + tile * 16 + ty;
+ if (task_kl < npairs_kl) {
+ int pair_kl = pair_kl_mapping[task_kl];
+ int dm_kl_pair0 = dm_pair_loc[pair_kl];
+ int sq_kl = ty + tile * 16;
+ atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*128]);
+ dm_kl_cache[sq_kl+i*128] = dm[dm_kl_pair0+i];
+ }
+ }
+ __syncthreads();
+
+ for (int batch_ij = 0; batch_ij < 16; ++batch_ij) {
+ for (int batch_kl = 0; batch_kl < 8; ++batch_kl) {
+ int task_ij0 = blockIdx.x * 256 + batch_ij * 16;
+ int task_kl0 = blockIdx.y * 128 + batch_kl * 16;
+ if (task_ij0 >= npairs_ij || task_kl0 >= npairs_kl) {
+ continue;
+ }
+ int pair_ij0 = pair_ij_mapping[task_ij0];
+ int pair_kl0 = pair_kl_mapping[task_kl0];
+ if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) {
+ continue;
+ }
+
+ int sq_ij = tx + batch_ij * 16;
+ int sq_kl = ty + batch_kl * 16;
+ int task_ij = task_ij0 + tx;
+ int task_kl = task_kl0 + ty;
+ double fac_sym = PI_FAC;
+ if (task_ij >= npairs_ij) {
+ task_ij = task_ij0;
+ fac_sym = 0.;
+ }
+ if (task_kl >= npairs_kl) {
+ task_kl = task_kl0;
+ fac_sym = 0.;
+ }
+ int pair_ij = pair_ij_mapping[task_ij];
+ int pair_kl = pair_kl_mapping[task_kl];
+
+ int ish = pair_ij / nbas;
+ int jsh = pair_ij % nbas;
+ int ksh = pair_kl / nbas;
+ int lsh = pair_kl % nbas;
+ if (ish == jsh) fac_sym *= .5;
+ if (ksh == lsh) fac_sym *= .5;
+ if (pair_ij_mapping == pair_kl_mapping) {
+ if (task_ij == task_kl) fac_sym *= .5;
+ if (task_ij < task_kl) fac_sym = 0.;
+ }
+ double xij = Rp_cache[sq_ij+0];
+ double yij = Rp_cache[sq_ij+256];
+ double zij = Rp_cache[sq_ij+512];
+ double aij = Rp_cache[sq_ij+768];
+ double xkl = Rq_cache[sq_kl+0];
+ double ykl = Rq_cache[sq_kl+128];
+ double zkl = Rq_cache[sq_kl+256];
+ double akl = Rq_cache[sq_kl+384];
+ double fac = fac_sym / (aij*akl*sqrt(aij+akl));
+ double xpq = xij - xkl;
+ double ypq = yij - ykl;
+ double zpq = zij - zkl;
+ double rr = xpq*xpq + ypq*ypq + zpq*zpq;
+ double theta = aij * akl / (aij + akl);
+ double theta_rr = theta * rr;
+ eval_gamma_inc_fn(gamma_inc, theta_rr, 3);
+ double a2 = -2. * theta;
+ gamma_inc[sq_id] *= fac;
+ for (int i = 1; i <= 3; i++) {
+ fac *= a2;
+ gamma_inc[sq_id+i*256] *= fac;
+ }
+ vj_kl = 0.;
+ vj_kl += gamma_inc[sq_id+0*256] * dm_ij_cache[sq_ij+0];
+ double R_0_0_0_1 = zpq * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_0_0_1 * dm_ij_cache[sq_ij+256];
+ double R_1_0_0_1 = zpq * gamma_inc[sq_id+2*256];
+ double R_0_0_0_2 = zpq * R_1_0_0_1 + 1 * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_0_0_2 * dm_ij_cache[sq_ij+512];
+ double R_0_0_1_0 = ypq * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_0_1_0 * dm_ij_cache[sq_ij+768];
+ double R_0_0_1_1 = ypq * R_1_0_0_1;
+ vj_kl += R_0_0_1_1 * dm_ij_cache[sq_ij+1024];
+ double R_1_0_1_0 = ypq * gamma_inc[sq_id+2*256];
+ double R_0_0_2_0 = ypq * R_1_0_1_0 + 1 * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_0_2_0 * dm_ij_cache[sq_ij+1280];
+ double R_0_1_0_0 = xpq * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_1_0_0 * dm_ij_cache[sq_ij+1536];
+ double R_0_1_0_1 = xpq * R_1_0_0_1;
+ vj_kl += R_0_1_0_1 * dm_ij_cache[sq_ij+1792];
+ double R_0_1_1_0 = xpq * R_1_0_1_0;
+ vj_kl += R_0_1_1_0 * dm_ij_cache[sq_ij+2048];
+ double R_1_1_0_0 = xpq * gamma_inc[sq_id+2*256];
+ double R_0_2_0_0 = xpq * R_1_1_0_0 + 1 * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_2_0_0 * dm_ij_cache[sq_ij+2304];
+ __syncthreads();
+ vj_cache[sq_id] = vj_kl;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (tx < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride];
+ }
+ }
+ __syncthreads();
+ if (tx == 0 && task_kl0+ty < npairs_kl) {
+ vj_kl_cache[sq_kl+0] += vj_cache[sq_id];
+ }
+ vj_kl = 0.;
+ vj_kl -= R_0_0_0_1 * dm_ij_cache[sq_ij+0];
+ vj_kl -= R_0_0_0_2 * dm_ij_cache[sq_ij+256];
+ double R_2_0_0_1 = zpq * gamma_inc[sq_id+3*256];
+ double R_1_0_0_2 = zpq * R_2_0_0_1 + 1 * gamma_inc[sq_id+2*256];
+ double R_0_0_0_3 = zpq * R_1_0_0_2 + 2 * R_1_0_0_1;
+ vj_kl -= R_0_0_0_3 * dm_ij_cache[sq_ij+512];
+ vj_kl -= R_0_0_1_1 * dm_ij_cache[sq_ij+768];
+ double R_0_0_1_2 = ypq * R_1_0_0_2;
+ vj_kl -= R_0_0_1_2 * dm_ij_cache[sq_ij+1024];
+ double R_1_0_1_1 = ypq * R_2_0_0_1;
+ double R_0_0_2_1 = ypq * R_1_0_1_1 + 1 * R_1_0_0_1;
+ vj_kl -= R_0_0_2_1 * dm_ij_cache[sq_ij+1280];
+ vj_kl -= R_0_1_0_1 * dm_ij_cache[sq_ij+1536];
+ double R_0_1_0_2 = xpq * R_1_0_0_2;
+ vj_kl -= R_0_1_0_2 * dm_ij_cache[sq_ij+1792];
+ double R_0_1_1_1 = xpq * R_1_0_1_1;
+ vj_kl -= R_0_1_1_1 * dm_ij_cache[sq_ij+2048];
+ double R_1_1_0_1 = xpq * R_2_0_0_1;
+ double R_0_2_0_1 = xpq * R_1_1_0_1 + 1 * R_1_0_0_1;
+ vj_kl -= R_0_2_0_1 * dm_ij_cache[sq_ij+2304];
+ __syncthreads();
+ vj_cache[sq_id] = vj_kl;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (tx < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride];
+ }
+ }
+ __syncthreads();
+ if (tx == 0 && task_kl0+ty < npairs_kl) {
+ vj_kl_cache[sq_kl+128] += vj_cache[sq_id];
+ }
+ vj_kl = 0.;
+ vj_kl -= R_0_0_1_0 * dm_ij_cache[sq_ij+0];
+ vj_kl -= R_0_0_1_1 * dm_ij_cache[sq_ij+256];
+ vj_kl -= R_0_0_1_2 * dm_ij_cache[sq_ij+512];
+ vj_kl -= R_0_0_2_0 * dm_ij_cache[sq_ij+768];
+ vj_kl -= R_0_0_2_1 * dm_ij_cache[sq_ij+1024];
+ double R_2_0_1_0 = ypq * gamma_inc[sq_id+3*256];
+ double R_1_0_2_0 = ypq * R_2_0_1_0 + 1 * gamma_inc[sq_id+2*256];
+ double R_0_0_3_0 = ypq * R_1_0_2_0 + 2 * R_1_0_1_0;
+ vj_kl -= R_0_0_3_0 * dm_ij_cache[sq_ij+1280];
+ vj_kl -= R_0_1_1_0 * dm_ij_cache[sq_ij+1536];
+ vj_kl -= R_0_1_1_1 * dm_ij_cache[sq_ij+1792];
+ double R_0_1_2_0 = xpq * R_1_0_2_0;
+ vj_kl -= R_0_1_2_0 * dm_ij_cache[sq_ij+2048];
+ double R_1_1_1_0 = xpq * R_2_0_1_0;
+ double R_0_2_1_0 = xpq * R_1_1_1_0 + 1 * R_1_0_1_0;
+ vj_kl -= R_0_2_1_0 * dm_ij_cache[sq_ij+2304];
+ __syncthreads();
+ vj_cache[sq_id] = vj_kl;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (tx < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride];
+ }
+ }
+ __syncthreads();
+ if (tx == 0 && task_kl0+ty < npairs_kl) {
+ vj_kl_cache[sq_kl+256] += vj_cache[sq_id];
+ }
+ vj_kl = 0.;
+ vj_kl -= R_0_1_0_0 * dm_ij_cache[sq_ij+0];
+ vj_kl -= R_0_1_0_1 * dm_ij_cache[sq_ij+256];
+ vj_kl -= R_0_1_0_2 * dm_ij_cache[sq_ij+512];
+ vj_kl -= R_0_1_1_0 * dm_ij_cache[sq_ij+768];
+ vj_kl -= R_0_1_1_1 * dm_ij_cache[sq_ij+1024];
+ vj_kl -= R_0_1_2_0 * dm_ij_cache[sq_ij+1280];
+ vj_kl -= R_0_2_0_0 * dm_ij_cache[sq_ij+1536];
+ vj_kl -= R_0_2_0_1 * dm_ij_cache[sq_ij+1792];
+ vj_kl -= R_0_2_1_0 * dm_ij_cache[sq_ij+2048];
+ double R_2_1_0_0 = xpq * gamma_inc[sq_id+3*256];
+ double R_1_2_0_0 = xpq * R_2_1_0_0 + 1 * gamma_inc[sq_id+2*256];
+ double R_0_3_0_0 = xpq * R_1_2_0_0 + 2 * R_1_1_0_0;
+ vj_kl -= R_0_3_0_0 * dm_ij_cache[sq_ij+2304];
+ __syncthreads();
+ vj_cache[sq_id] = vj_kl;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (tx < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride];
+ }
+ }
+ __syncthreads();
+ if (tx == 0 && task_kl0+ty < npairs_kl) {
+ vj_kl_cache[sq_kl+384] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += gamma_inc[sq_id+0*256] * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_0_0_1 * dm_kl_cache[sq_kl+128];
+ vj_ij -= R_0_0_1_0 * dm_kl_cache[sq_kl+256];
+ vj_ij -= R_0_1_0_0 * dm_kl_cache[sq_kl+384];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+0] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_0_1 * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_0_0_2 * dm_kl_cache[sq_kl+128];
+ vj_ij -= R_0_0_1_1 * dm_kl_cache[sq_kl+256];
+ vj_ij -= R_0_1_0_1 * dm_kl_cache[sq_kl+384];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+256] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_0_2 * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_0_0_3 * dm_kl_cache[sq_kl+128];
+ vj_ij -= R_0_0_1_2 * dm_kl_cache[sq_kl+256];
+ vj_ij -= R_0_1_0_2 * dm_kl_cache[sq_kl+384];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+512] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_1_0 * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_0_1_1 * dm_kl_cache[sq_kl+128];
+ vj_ij -= R_0_0_2_0 * dm_kl_cache[sq_kl+256];
+ vj_ij -= R_0_1_1_0 * dm_kl_cache[sq_kl+384];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+768] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_1_1 * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_0_1_2 * dm_kl_cache[sq_kl+128];
+ vj_ij -= R_0_0_2_1 * dm_kl_cache[sq_kl+256];
+ vj_ij -= R_0_1_1_1 * dm_kl_cache[sq_kl+384];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+1024] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_2_0 * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_0_2_1 * dm_kl_cache[sq_kl+128];
+ vj_ij -= R_0_0_3_0 * dm_kl_cache[sq_kl+256];
+ vj_ij -= R_0_1_2_0 * dm_kl_cache[sq_kl+384];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+1280] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_1_0_0 * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_1_0_1 * dm_kl_cache[sq_kl+128];
+ vj_ij -= R_0_1_1_0 * dm_kl_cache[sq_kl+256];
+ vj_ij -= R_0_2_0_0 * dm_kl_cache[sq_kl+384];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+1536] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_1_0_1 * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_1_0_2 * dm_kl_cache[sq_kl+128];
+ vj_ij -= R_0_1_1_1 * dm_kl_cache[sq_kl+256];
+ vj_ij -= R_0_2_0_1 * dm_kl_cache[sq_kl+384];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+1792] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_1_1_0 * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_1_1_1 * dm_kl_cache[sq_kl+128];
+ vj_ij -= R_0_1_2_0 * dm_kl_cache[sq_kl+256];
+ vj_ij -= R_0_2_1_0 * dm_kl_cache[sq_kl+384];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+2048] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_2_0_0 * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_2_0_1 * dm_kl_cache[sq_kl+128];
+ vj_ij -= R_0_2_1_0 * dm_kl_cache[sq_kl+256];
+ vj_ij -= R_0_3_0_0 * dm_kl_cache[sq_kl+384];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+2304] += vj_cache[sq_id];
+ }
+ __syncthreads();
+ } }
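+    // Epilogue: scatter the per-block partial J tiles into global memory.
+    // dm_pair_loc gives the offset of each pair's Hermite-component block in
+    // vj, and atomicAdd resolves races with other blocks updating the same
+    // pairs.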
+ for (int n = ty; n < 160; n += 16) {
+ int i = n / 16;
+ int tile = n % 16;
+ int task_ij = blockIdx.x * 256 + tile * 16 + tx;
+ if (task_ij < npairs_ij) {
+ int pair_ij = pair_ij_mapping[task_ij];
+ int dm_ij_pair0 = dm_pair_loc[pair_ij];
+ int sq_ij = tx + tile * 16;
+ atomicAdd(vj+dm_ij_pair0+i, vj_ij_cache[sq_ij+i*256]);
+ }
+ }
+ for (int n = tx; n < 32; n += 16) {
+ int i = n / 8;
+ int tile = n % 8;
+ int task_kl = blockIdx.y * 128 + tile * 16 + ty;
+ if (task_kl < npairs_kl) {
+ int pair_kl = pair_kl_mapping[task_kl];
+ int dm_kl_pair0 = dm_pair_loc[pair_kl];
+ int sq_kl = ty + tile * 16;
+ atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*128]);
+ }
+ }
+}
+
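+// md_j_2_2 follows the same template as the kernels above; assuming the
+// md_j_{lij}_{lkl} naming convention, it contracts an (lij = 2) pair list
+// against an (lkl = 2) pair list, so both sides carry 10 Hermite components
+// and the R tensor is required up to total order 4.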
+// TILEX=16, TILEY=4, cache_dm=True
+__global__
+void md_j_2_2(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds)
+{
+ int *pair_ij_mapping = bounds.tile_ij_mapping;
+ int *pair_kl_mapping = bounds.tile_kl_mapping;
+ int task_ij0 = blockIdx.x * 256;
+ int task_kl0 = blockIdx.y * 64;
+ int pair_ij0 = pair_ij_mapping[task_ij0];
+ int pair_kl0 = pair_kl_mapping[task_kl0];
+ float *q_cond = bounds.q_cond;
+ if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) {
+ return;
+ }
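+    // Block-level screening: q_cond is assumed to hold log-scale
+    // Schwarz-type bounds per shell pair, with the tile mappings sorted by
+    // decreasing bound, so if the leading ij tile combined with the leading
+    // kl tile falls below the cutoff the whole block can return immediately.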
+
+ int tx = threadIdx.x;
+ int ty = threadIdx.y;
+ int sq_id = tx + 16 * ty;
+ int *bas = envs.bas;
+ int *dm_pair_loc = envs.ao_loc;
+ int nbas = envs.nbas;
+ double *env = envs.env;
+ double *dm = jk.dm;
+ double *vj = jk.vj;
+ double vj_ij, vj_kl;
+
+ int npairs_ij = bounds.npairs_ij;
+ int npairs_kl = bounds.npairs_kl;
+ extern __shared__ double gamma_inc[];
+ double *Rp_cache = gamma_inc + 1280;
+ double *Rq_cache = Rp_cache + 1024;
+ double *vj_ij_cache = Rq_cache + 256;
+ double *vj_kl_cache = vj_ij_cache + 2560;
+ double *vj_cache = vj_kl_cache + 640;
+ double *dm_ij_cache = vj_cache + 256;
+ double *dm_kl_cache = dm_ij_cache + 2560;
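+    // Dynamic shared-memory layout, in doubles: gamma_inc 5*256 = 1280,
+    // Rp 4*256 = 1024, Rq 4*64 = 256, vj_ij 10*256 = 2560, vj_kl 10*64 = 640,
+    // vj_cache 256, dm_ij 10*256 = 2560, dm_kl 10*64 = 640; 9216 doubles
+    // (72 KB) in total, which the launcher is assumed to request.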
+    // Zero out every shared-memory cache that follows gamma_inc (Rp/Rq, vj and dm tiles).
+ for (int n = sq_id; n < 7936; n += 256) {
+ Rp_cache[n] = 0.;
+ }
+ __syncthreads();
+
+ if (sq_id < 256) {
+ int task_ij = blockIdx.x * 256 + sq_id;
+ if (task_ij < npairs_ij) {
+ int pair_ij = pair_ij_mapping[task_ij];
+ int ish = pair_ij / nbas;
+ int jsh = pair_ij % nbas;
+ double ai = env[bas[ish*BAS_SLOTS+PTR_EXP]];
+ double aj = env[bas[jsh*BAS_SLOTS+PTR_EXP]];
+ double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+ double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+ double aij = ai + aj;
+ double xij = (ai * ri[0] + aj * rj[0]) / aij;
+ double yij = (ai * ri[1] + aj * rj[1]) / aij;
+ double zij = (ai * ri[2] + aj * rj[2]) / aij;
+ Rp_cache[sq_id+0] = xij;
+ Rp_cache[sq_id+256] = yij;
+ Rp_cache[sq_id+512] = zij;
+ Rp_cache[sq_id+768] = aij;
+ } else {
+ Rp_cache[sq_id+768] = 1.;
+ }
+ }
+ if (sq_id < 64) {
+ int task_kl = blockIdx.y * 64 + sq_id;
+ if (task_kl < npairs_kl) {
+ int pair_kl = pair_kl_mapping[task_kl];
+ int ksh = pair_kl / nbas;
+ int lsh = pair_kl % nbas;
+ double ak = env[bas[ksh*BAS_SLOTS+PTR_EXP]];
+ double al = env[bas[lsh*BAS_SLOTS+PTR_EXP]];
+ double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+ double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD];
+ double akl = ak + al;
+ double xkl = (ak * rk[0] + al * rl[0]) / akl;
+ double ykl = (ak * rk[1] + al * rl[1]) / akl;
+ double zkl = (ak * rk[2] + al * rl[2]) / akl;
+ Rq_cache[sq_id+0] = xkl;
+ Rq_cache[sq_id+64] = ykl;
+ Rq_cache[sq_id+128] = zkl;
+ Rq_cache[sq_id+192] = akl;
+ } else {
+ Rq_cache[sq_id+192] = 1.;
+ }
+ }
+ for (int n = ty; n < 160; n += 16) {
+ int i = n / 16;
+ int tile = n % 16;
+ int task_ij = blockIdx.x * 256 + tile * 16 + tx;
+ if (task_ij < npairs_ij) {
+ int pair_ij = pair_ij_mapping[task_ij];
+ int dm_ij_pair0 = dm_pair_loc[pair_ij];
+ int sq_ij = tx + tile * 16;
+ dm_ij_cache[sq_ij+i*256] = dm[dm_ij_pair0+i];
+ }
+ }
+ for (int n = tx; n < 40; n += 16) {
+ int i = n / 4;
+ int tile = n % 4;
+ int task_kl = blockIdx.y * 64 + tile * 16 + ty;
+ if (task_kl < npairs_kl) {
+ int pair_kl = pair_kl_mapping[task_kl];
+ int dm_kl_pair0 = dm_pair_loc[pair_kl];
+ int sq_kl = ty + tile * 16;
+ dm_kl_cache[sq_kl+i*64] = dm[dm_kl_pair0+i];
+ }
+ }
+ __syncthreads();
+
+ for (int batch_ij = 0; batch_ij < 16; ++batch_ij) {
+ for (int batch_kl = 0; batch_kl < 4; ++batch_kl) {
+ int task_ij0 = blockIdx.x * 256 + batch_ij * 16;
+ int task_kl0 = blockIdx.y * 64 + batch_kl * 16;
+ if (task_ij0 >= npairs_ij || task_kl0 >= npairs_kl) {
+ continue;
+ }
+ int pair_ij0 = pair_ij_mapping[task_ij0];
+ int pair_kl0 = pair_kl_mapping[task_kl0];
+ if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) {
+ continue;
+ }
+
+ int sq_ij = tx + batch_ij * 16;
+ int sq_kl = ty + batch_kl * 16;
+ int task_ij = task_ij0 + tx;
+ int task_kl = task_kl0 + ty;
+ double fac_sym = PI_FAC;
+ if (task_ij >= npairs_ij) {
+ task_ij = task_ij0;
+ fac_sym = 0.;
+ }
+ if (task_kl >= npairs_kl) {
+ task_kl = task_kl0;
+ fac_sym = 0.;
+ }
+ int pair_ij = pair_ij_mapping[task_ij];
+ int pair_kl = pair_kl_mapping[task_kl];
+
+ int ish = pair_ij / nbas;
+ int jsh = pair_ij % nbas;
+ int ksh = pair_kl / nbas;
+ int lsh = pair_kl % nbas;
+ if (ish == jsh) fac_sym *= .5;
+ if (ksh == lsh) fac_sym *= .5;
+ if (pair_ij_mapping == pair_kl_mapping) {
+ if (task_ij == task_kl) fac_sym *= .5;
+ if (task_ij < task_kl) fac_sym = 0.;
+ }
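+        // fac_sym absorbs the permutational symmetry of the integral: PI_FAC
+        // is taken to be the common 2*pi^2.5 Coulomb prefactor, halved once
+        // for i == j, once for k == l, and halved (or zeroed on the lower
+        // triangle) when the ij and kl lists coincide, so every quartet is
+        // counted exactly once. Out-of-range lanes were redirected to task0
+        // with fac_sym = 0 above and thus contribute nothing.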
+ double xij = Rp_cache[sq_ij+0];
+ double yij = Rp_cache[sq_ij+256];
+ double zij = Rp_cache[sq_ij+512];
+ double aij = Rp_cache[sq_ij+768];
+ double xkl = Rq_cache[sq_kl+0];
+ double ykl = Rq_cache[sq_kl+64];
+ double zkl = Rq_cache[sq_kl+128];
+ double akl = Rq_cache[sq_kl+192];
+ double fac = fac_sym / (aij*akl*sqrt(aij+akl));
+ double xpq = xij - xkl;
+ double ypq = yij - ykl;
+ double zpq = zij - zkl;
+ double rr = xpq*xpq + ypq*ypq + zpq*zpq;
+ double theta = aij * akl / (aij + akl);
+ double theta_rr = theta * rr;
+ eval_gamma_inc_fn(gamma_inc, theta_rr, 4);
+ double a2 = -2. * theta;
+ gamma_inc[sq_id] *= fac;
+ for (int i = 1; i <= 4; i++) {
+ fac *= a2;
+ gamma_inc[sq_id+i*256] *= fac;
+ }
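+        // Base case of the R recurrence: assuming eval_gamma_inc_fn fills
+        // gamma_inc[m] with F_m(theta*rr) for m = 0..4, this loop rescales it
+        // to fac * (-2*theta)^m * F_m(theta*rr), i.e. R_{m,0,0,0} including
+        // the 1/(aij*akl*sqrt(aij+akl)) prefactor folded into fac.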
+ vj_kl = 0.;
+ vj_kl += gamma_inc[sq_id+0*256] * dm_ij_cache[sq_ij+0];
+ double R_0_0_0_1 = zpq * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_0_0_1 * dm_ij_cache[sq_ij+256];
+ double R_1_0_0_1 = zpq * gamma_inc[sq_id+2*256];
+ double R_0_0_0_2 = zpq * R_1_0_0_1 + 1 * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_0_0_2 * dm_ij_cache[sq_ij+512];
+ double R_0_0_1_0 = ypq * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_0_1_0 * dm_ij_cache[sq_ij+768];
+ double R_0_0_1_1 = ypq * R_1_0_0_1;
+ vj_kl += R_0_0_1_1 * dm_ij_cache[sq_ij+1024];
+ double R_1_0_1_0 = ypq * gamma_inc[sq_id+2*256];
+ double R_0_0_2_0 = ypq * R_1_0_1_0 + 1 * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_0_2_0 * dm_ij_cache[sq_ij+1280];
+ double R_0_1_0_0 = xpq * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_1_0_0 * dm_ij_cache[sq_ij+1536];
+ double R_0_1_0_1 = xpq * R_1_0_0_1;
+ vj_kl += R_0_1_0_1 * dm_ij_cache[sq_ij+1792];
+ double R_0_1_1_0 = xpq * R_1_0_1_0;
+ vj_kl += R_0_1_1_0 * dm_ij_cache[sq_ij+2048];
+ double R_1_1_0_0 = xpq * gamma_inc[sq_id+2*256];
+ double R_0_2_0_0 = xpq * R_1_1_0_0 + 1 * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_2_0_0 * dm_ij_cache[sq_ij+2304];
+ __syncthreads();
+ vj_cache[sq_id] = vj_kl;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (tx < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride];
+ }
+ }
+ __syncthreads();
+ if (tx == 0 && task_kl0+ty < npairs_kl) {
+ vj_kl_cache[sq_kl+0] += vj_cache[sq_id];
+ }
+ vj_kl = 0.;
+ vj_kl -= R_0_0_0_1 * dm_ij_cache[sq_ij+0];
+ vj_kl -= R_0_0_0_2 * dm_ij_cache[sq_ij+256];
+ double R_2_0_0_1 = zpq * gamma_inc[sq_id+3*256];
+ double R_1_0_0_2 = zpq * R_2_0_0_1 + 1 * gamma_inc[sq_id+2*256];
+ double R_0_0_0_3 = zpq * R_1_0_0_2 + 2 * R_1_0_0_1;
+ vj_kl -= R_0_0_0_3 * dm_ij_cache[sq_ij+512];
+ vj_kl -= R_0_0_1_1 * dm_ij_cache[sq_ij+768];
+ double R_0_0_1_2 = ypq * R_1_0_0_2;
+ vj_kl -= R_0_0_1_2 * dm_ij_cache[sq_ij+1024];
+ double R_1_0_1_1 = ypq * R_2_0_0_1;
+ double R_0_0_2_1 = ypq * R_1_0_1_1 + 1 * R_1_0_0_1;
+ vj_kl -= R_0_0_2_1 * dm_ij_cache[sq_ij+1280];
+ vj_kl -= R_0_1_0_1 * dm_ij_cache[sq_ij+1536];
+ double R_0_1_0_2 = xpq * R_1_0_0_2;
+ vj_kl -= R_0_1_0_2 * dm_ij_cache[sq_ij+1792];
+ double R_0_1_1_1 = xpq * R_1_0_1_1;
+ vj_kl -= R_0_1_1_1 * dm_ij_cache[sq_ij+2048];
+ double R_1_1_0_1 = xpq * R_2_0_0_1;
+ double R_0_2_0_1 = xpq * R_1_1_0_1 + 1 * R_1_0_0_1;
+ vj_kl -= R_0_2_0_1 * dm_ij_cache[sq_ij+2304];
+ __syncthreads();
+ vj_cache[sq_id] = vj_kl;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (tx < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride];
+ }
+ }
+ __syncthreads();
+ if (tx == 0 && task_kl0+ty < npairs_kl) {
+ vj_kl_cache[sq_kl+64] += vj_cache[sq_id];
+ }
+ vj_kl = 0.;
+ vj_kl += R_0_0_0_2 * dm_ij_cache[sq_ij+0];
+ vj_kl += R_0_0_0_3 * dm_ij_cache[sq_ij+256];
+ double R_3_0_0_1 = zpq * gamma_inc[sq_id+4*256];
+ double R_2_0_0_2 = zpq * R_3_0_0_1 + 1 * gamma_inc[sq_id+3*256];
+ double R_1_0_0_3 = zpq * R_2_0_0_2 + 2 * R_2_0_0_1;
+ double R_0_0_0_4 = zpq * R_1_0_0_3 + 3 * R_1_0_0_2;
+ vj_kl += R_0_0_0_4 * dm_ij_cache[sq_ij+512];
+ vj_kl += R_0_0_1_2 * dm_ij_cache[sq_ij+768];
+ double R_0_0_1_3 = ypq * R_1_0_0_3;
+ vj_kl += R_0_0_1_3 * dm_ij_cache[sq_ij+1024];
+ double R_1_0_1_2 = ypq * R_2_0_0_2;
+ double R_0_0_2_2 = ypq * R_1_0_1_2 + 1 * R_1_0_0_2;
+ vj_kl += R_0_0_2_2 * dm_ij_cache[sq_ij+1280];
+ vj_kl += R_0_1_0_2 * dm_ij_cache[sq_ij+1536];
+ double R_0_1_0_3 = xpq * R_1_0_0_3;
+ vj_kl += R_0_1_0_3 * dm_ij_cache[sq_ij+1792];
+ double R_0_1_1_2 = xpq * R_1_0_1_2;
+ vj_kl += R_0_1_1_2 * dm_ij_cache[sq_ij+2048];
+ double R_1_1_0_2 = xpq * R_2_0_0_2;
+ double R_0_2_0_2 = xpq * R_1_1_0_2 + 1 * R_1_0_0_2;
+ vj_kl += R_0_2_0_2 * dm_ij_cache[sq_ij+2304];
+ __syncthreads();
+ vj_cache[sq_id] = vj_kl;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (tx < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride];
+ }
+ }
+ __syncthreads();
+ if (tx == 0 && task_kl0+ty < npairs_kl) {
+ vj_kl_cache[sq_kl+128] += vj_cache[sq_id];
+ }
+ vj_kl = 0.;
+ vj_kl -= R_0_0_1_0 * dm_ij_cache[sq_ij+0];
+ vj_kl -= R_0_0_1_1 * dm_ij_cache[sq_ij+256];
+ vj_kl -= R_0_0_1_2 * dm_ij_cache[sq_ij+512];
+ vj_kl -= R_0_0_2_0 * dm_ij_cache[sq_ij+768];
+ vj_kl -= R_0_0_2_1 * dm_ij_cache[sq_ij+1024];
+ double R_2_0_1_0 = ypq * gamma_inc[sq_id+3*256];
+ double R_1_0_2_0 = ypq * R_2_0_1_0 + 1 * gamma_inc[sq_id+2*256];
+ double R_0_0_3_0 = ypq * R_1_0_2_0 + 2 * R_1_0_1_0;
+ vj_kl -= R_0_0_3_0 * dm_ij_cache[sq_ij+1280];
+ vj_kl -= R_0_1_1_0 * dm_ij_cache[sq_ij+1536];
+ vj_kl -= R_0_1_1_1 * dm_ij_cache[sq_ij+1792];
+ double R_0_1_2_0 = xpq * R_1_0_2_0;
+ vj_kl -= R_0_1_2_0 * dm_ij_cache[sq_ij+2048];
+ double R_1_1_1_0 = xpq * R_2_0_1_0;
+ double R_0_2_1_0 = xpq * R_1_1_1_0 + 1 * R_1_0_1_0;
+ vj_kl -= R_0_2_1_0 * dm_ij_cache[sq_ij+2304];
+ __syncthreads();
+ vj_cache[sq_id] = vj_kl;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (tx < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride];
+ }
+ }
+ __syncthreads();
+ if (tx == 0 && task_kl0+ty < npairs_kl) {
+ vj_kl_cache[sq_kl+192] += vj_cache[sq_id];
+ }
+ vj_kl = 0.;
+ vj_kl += R_0_0_1_1 * dm_ij_cache[sq_ij+0];
+ vj_kl += R_0_0_1_2 * dm_ij_cache[sq_ij+256];
+ vj_kl += R_0_0_1_3 * dm_ij_cache[sq_ij+512];
+ vj_kl += R_0_0_2_1 * dm_ij_cache[sq_ij+768];
+ vj_kl += R_0_0_2_2 * dm_ij_cache[sq_ij+1024];
+ double R_2_0_1_1 = ypq * R_3_0_0_1;
+ double R_1_0_2_1 = ypq * R_2_0_1_1 + 1 * R_2_0_0_1;
+ double R_0_0_3_1 = ypq * R_1_0_2_1 + 2 * R_1_0_1_1;
+ vj_kl += R_0_0_3_1 * dm_ij_cache[sq_ij+1280];
+ vj_kl += R_0_1_1_1 * dm_ij_cache[sq_ij+1536];
+ vj_kl += R_0_1_1_2 * dm_ij_cache[sq_ij+1792];
+ double R_0_1_2_1 = xpq * R_1_0_2_1;
+ vj_kl += R_0_1_2_1 * dm_ij_cache[sq_ij+2048];
+ double R_1_1_1_1 = xpq * R_2_0_1_1;
+ double R_0_2_1_1 = xpq * R_1_1_1_1 + 1 * R_1_0_1_1;
+ vj_kl += R_0_2_1_1 * dm_ij_cache[sq_ij+2304];
+ __syncthreads();
+ vj_cache[sq_id] = vj_kl;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (tx < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride];
+ }
+ }
+ __syncthreads();
+ if (tx == 0 && task_kl0+ty < npairs_kl) {
+ vj_kl_cache[sq_kl+256] += vj_cache[sq_id];
+ }
+ vj_kl = 0.;
+ vj_kl += R_0_0_2_0 * dm_ij_cache[sq_ij+0];
+ vj_kl += R_0_0_2_1 * dm_ij_cache[sq_ij+256];
+ vj_kl += R_0_0_2_2 * dm_ij_cache[sq_ij+512];
+ vj_kl += R_0_0_3_0 * dm_ij_cache[sq_ij+768];
+ vj_kl += R_0_0_3_1 * dm_ij_cache[sq_ij+1024];
+ double R_3_0_1_0 = ypq * gamma_inc[sq_id+4*256];
+ double R_2_0_2_0 = ypq * R_3_0_1_0 + 1 * gamma_inc[sq_id+3*256];
+ double R_1_0_3_0 = ypq * R_2_0_2_0 + 2 * R_2_0_1_0;
+ double R_0_0_4_0 = ypq * R_1_0_3_0 + 3 * R_1_0_2_0;
+ vj_kl += R_0_0_4_0 * dm_ij_cache[sq_ij+1280];
+ vj_kl += R_0_1_2_0 * dm_ij_cache[sq_ij+1536];
+ vj_kl += R_0_1_2_1 * dm_ij_cache[sq_ij+1792];
+ double R_0_1_3_0 = xpq * R_1_0_3_0;
+ vj_kl += R_0_1_3_0 * dm_ij_cache[sq_ij+2048];
+ double R_1_1_2_0 = xpq * R_2_0_2_0;
+ double R_0_2_2_0 = xpq * R_1_1_2_0 + 1 * R_1_0_2_0;
+ vj_kl += R_0_2_2_0 * dm_ij_cache[sq_ij+2304];
+ __syncthreads();
+ vj_cache[sq_id] = vj_kl;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (tx < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride];
+ }
+ }
+ __syncthreads();
+ if (tx == 0 && task_kl0+ty < npairs_kl) {
+ vj_kl_cache[sq_kl+320] += vj_cache[sq_id];
+ }
+ vj_kl = 0.;
+ vj_kl -= R_0_1_0_0 * dm_ij_cache[sq_ij+0];
+ vj_kl -= R_0_1_0_1 * dm_ij_cache[sq_ij+256];
+ vj_kl -= R_0_1_0_2 * dm_ij_cache[sq_ij+512];
+ vj_kl -= R_0_1_1_0 * dm_ij_cache[sq_ij+768];
+ vj_kl -= R_0_1_1_1 * dm_ij_cache[sq_ij+1024];
+ vj_kl -= R_0_1_2_0 * dm_ij_cache[sq_ij+1280];
+ vj_kl -= R_0_2_0_0 * dm_ij_cache[sq_ij+1536];
+ vj_kl -= R_0_2_0_1 * dm_ij_cache[sq_ij+1792];
+ vj_kl -= R_0_2_1_0 * dm_ij_cache[sq_ij+2048];
+ double R_2_1_0_0 = xpq * gamma_inc[sq_id+3*256];
+ double R_1_2_0_0 = xpq * R_2_1_0_0 + 1 * gamma_inc[sq_id+2*256];
+ double R_0_3_0_0 = xpq * R_1_2_0_0 + 2 * R_1_1_0_0;
+ vj_kl -= R_0_3_0_0 * dm_ij_cache[sq_ij+2304];
+ __syncthreads();
+ vj_cache[sq_id] = vj_kl;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (tx < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride];
+ }
+ }
+ __syncthreads();
+ if (tx == 0 && task_kl0+ty < npairs_kl) {
+ vj_kl_cache[sq_kl+384] += vj_cache[sq_id];
+ }
+ vj_kl = 0.;
+ vj_kl += R_0_1_0_1 * dm_ij_cache[sq_ij+0];
+ vj_kl += R_0_1_0_2 * dm_ij_cache[sq_ij+256];
+ vj_kl += R_0_1_0_3 * dm_ij_cache[sq_ij+512];
+ vj_kl += R_0_1_1_1 * dm_ij_cache[sq_ij+768];
+ vj_kl += R_0_1_1_2 * dm_ij_cache[sq_ij+1024];
+ vj_kl += R_0_1_2_1 * dm_ij_cache[sq_ij+1280];
+ vj_kl += R_0_2_0_1 * dm_ij_cache[sq_ij+1536];
+ vj_kl += R_0_2_0_2 * dm_ij_cache[sq_ij+1792];
+ vj_kl += R_0_2_1_1 * dm_ij_cache[sq_ij+2048];
+ double R_2_1_0_1 = xpq * R_3_0_0_1;
+ double R_1_2_0_1 = xpq * R_2_1_0_1 + 1 * R_2_0_0_1;
+ double R_0_3_0_1 = xpq * R_1_2_0_1 + 2 * R_1_1_0_1;
+ vj_kl += R_0_3_0_1 * dm_ij_cache[sq_ij+2304];
+ __syncthreads();
+ vj_cache[sq_id] = vj_kl;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (tx < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride];
+ }
+ }
+ __syncthreads();
+ if (tx == 0 && task_kl0+ty < npairs_kl) {
+ vj_kl_cache[sq_kl+448] += vj_cache[sq_id];
+ }
+ vj_kl = 0.;
+ vj_kl += R_0_1_1_0 * dm_ij_cache[sq_ij+0];
+ vj_kl += R_0_1_1_1 * dm_ij_cache[sq_ij+256];
+ vj_kl += R_0_1_1_2 * dm_ij_cache[sq_ij+512];
+ vj_kl += R_0_1_2_0 * dm_ij_cache[sq_ij+768];
+ vj_kl += R_0_1_2_1 * dm_ij_cache[sq_ij+1024];
+ vj_kl += R_0_1_3_0 * dm_ij_cache[sq_ij+1280];
+ vj_kl += R_0_2_1_0 * dm_ij_cache[sq_ij+1536];
+ vj_kl += R_0_2_1_1 * dm_ij_cache[sq_ij+1792];
+ vj_kl += R_0_2_2_0 * dm_ij_cache[sq_ij+2048];
+ double R_2_1_1_0 = xpq * R_3_0_1_0;
+ double R_1_2_1_0 = xpq * R_2_1_1_0 + 1 * R_2_0_1_0;
+ double R_0_3_1_0 = xpq * R_1_2_1_0 + 2 * R_1_1_1_0;
+ vj_kl += R_0_3_1_0 * dm_ij_cache[sq_ij+2304];
+ __syncthreads();
+ vj_cache[sq_id] = vj_kl;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (tx < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride];
+ }
+ }
+ __syncthreads();
+ if (tx == 0 && task_kl0+ty < npairs_kl) {
+ vj_kl_cache[sq_kl+512] += vj_cache[sq_id];
+ }
+ vj_kl = 0.;
+ vj_kl += R_0_2_0_0 * dm_ij_cache[sq_ij+0];
+ vj_kl += R_0_2_0_1 * dm_ij_cache[sq_ij+256];
+ vj_kl += R_0_2_0_2 * dm_ij_cache[sq_ij+512];
+ vj_kl += R_0_2_1_0 * dm_ij_cache[sq_ij+768];
+ vj_kl += R_0_2_1_1 * dm_ij_cache[sq_ij+1024];
+ vj_kl += R_0_2_2_0 * dm_ij_cache[sq_ij+1280];
+ vj_kl += R_0_3_0_0 * dm_ij_cache[sq_ij+1536];
+ vj_kl += R_0_3_0_1 * dm_ij_cache[sq_ij+1792];
+ vj_kl += R_0_3_1_0 * dm_ij_cache[sq_ij+2048];
+ double R_3_1_0_0 = xpq * gamma_inc[sq_id+4*256];
+ double R_2_2_0_0 = xpq * R_3_1_0_0 + 1 * gamma_inc[sq_id+3*256];
+ double R_1_3_0_0 = xpq * R_2_2_0_0 + 2 * R_2_1_0_0;
+ double R_0_4_0_0 = xpq * R_1_3_0_0 + 3 * R_1_2_0_0;
+ vj_kl += R_0_4_0_0 * dm_ij_cache[sq_ij+2304];
+ __syncthreads();
+ vj_cache[sq_id] = vj_kl;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (tx < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride];
+ }
+ }
+ __syncthreads();
+ if (tx == 0 && task_kl0+ty < npairs_kl) {
+ vj_kl_cache[sq_kl+576] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += gamma_inc[sq_id+0*256] * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_0_0_1 * dm_kl_cache[sq_kl+64];
+ vj_ij += R_0_0_0_2 * dm_kl_cache[sq_kl+128];
+ vj_ij -= R_0_0_1_0 * dm_kl_cache[sq_kl+192];
+ vj_ij += R_0_0_1_1 * dm_kl_cache[sq_kl+256];
+ vj_ij += R_0_0_2_0 * dm_kl_cache[sq_kl+320];
+ vj_ij -= R_0_1_0_0 * dm_kl_cache[sq_kl+384];
+ vj_ij += R_0_1_0_1 * dm_kl_cache[sq_kl+448];
+ vj_ij += R_0_1_1_0 * dm_kl_cache[sq_kl+512];
+ vj_ij += R_0_2_0_0 * dm_kl_cache[sq_kl+576];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+0] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_0_1 * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_0_0_2 * dm_kl_cache[sq_kl+64];
+ vj_ij += R_0_0_0_3 * dm_kl_cache[sq_kl+128];
+ vj_ij -= R_0_0_1_1 * dm_kl_cache[sq_kl+192];
+ vj_ij += R_0_0_1_2 * dm_kl_cache[sq_kl+256];
+ vj_ij += R_0_0_2_1 * dm_kl_cache[sq_kl+320];
+ vj_ij -= R_0_1_0_1 * dm_kl_cache[sq_kl+384];
+ vj_ij += R_0_1_0_2 * dm_kl_cache[sq_kl+448];
+ vj_ij += R_0_1_1_1 * dm_kl_cache[sq_kl+512];
+ vj_ij += R_0_2_0_1 * dm_kl_cache[sq_kl+576];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+256] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_0_2 * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_0_0_3 * dm_kl_cache[sq_kl+64];
+ vj_ij += R_0_0_0_4 * dm_kl_cache[sq_kl+128];
+ vj_ij -= R_0_0_1_2 * dm_kl_cache[sq_kl+192];
+ vj_ij += R_0_0_1_3 * dm_kl_cache[sq_kl+256];
+ vj_ij += R_0_0_2_2 * dm_kl_cache[sq_kl+320];
+ vj_ij -= R_0_1_0_2 * dm_kl_cache[sq_kl+384];
+ vj_ij += R_0_1_0_3 * dm_kl_cache[sq_kl+448];
+ vj_ij += R_0_1_1_2 * dm_kl_cache[sq_kl+512];
+ vj_ij += R_0_2_0_2 * dm_kl_cache[sq_kl+576];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+512] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_1_0 * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_0_1_1 * dm_kl_cache[sq_kl+64];
+ vj_ij += R_0_0_1_2 * dm_kl_cache[sq_kl+128];
+ vj_ij -= R_0_0_2_0 * dm_kl_cache[sq_kl+192];
+ vj_ij += R_0_0_2_1 * dm_kl_cache[sq_kl+256];
+ vj_ij += R_0_0_3_0 * dm_kl_cache[sq_kl+320];
+ vj_ij -= R_0_1_1_0 * dm_kl_cache[sq_kl+384];
+ vj_ij += R_0_1_1_1 * dm_kl_cache[sq_kl+448];
+ vj_ij += R_0_1_2_0 * dm_kl_cache[sq_kl+512];
+ vj_ij += R_0_2_1_0 * dm_kl_cache[sq_kl+576];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+768] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_1_1 * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_0_1_2 * dm_kl_cache[sq_kl+64];
+ vj_ij += R_0_0_1_3 * dm_kl_cache[sq_kl+128];
+ vj_ij -= R_0_0_2_1 * dm_kl_cache[sq_kl+192];
+ vj_ij += R_0_0_2_2 * dm_kl_cache[sq_kl+256];
+ vj_ij += R_0_0_3_1 * dm_kl_cache[sq_kl+320];
+ vj_ij -= R_0_1_1_1 * dm_kl_cache[sq_kl+384];
+ vj_ij += R_0_1_1_2 * dm_kl_cache[sq_kl+448];
+ vj_ij += R_0_1_2_1 * dm_kl_cache[sq_kl+512];
+ vj_ij += R_0_2_1_1 * dm_kl_cache[sq_kl+576];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+1024] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_2_0 * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_0_2_1 * dm_kl_cache[sq_kl+64];
+ vj_ij += R_0_0_2_2 * dm_kl_cache[sq_kl+128];
+ vj_ij -= R_0_0_3_0 * dm_kl_cache[sq_kl+192];
+ vj_ij += R_0_0_3_1 * dm_kl_cache[sq_kl+256];
+ vj_ij += R_0_0_4_0 * dm_kl_cache[sq_kl+320];
+ vj_ij -= R_0_1_2_0 * dm_kl_cache[sq_kl+384];
+ vj_ij += R_0_1_2_1 * dm_kl_cache[sq_kl+448];
+ vj_ij += R_0_1_3_0 * dm_kl_cache[sq_kl+512];
+ vj_ij += R_0_2_2_0 * dm_kl_cache[sq_kl+576];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+1280] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_1_0_0 * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_1_0_1 * dm_kl_cache[sq_kl+64];
+ vj_ij += R_0_1_0_2 * dm_kl_cache[sq_kl+128];
+ vj_ij -= R_0_1_1_0 * dm_kl_cache[sq_kl+192];
+ vj_ij += R_0_1_1_1 * dm_kl_cache[sq_kl+256];
+ vj_ij += R_0_1_2_0 * dm_kl_cache[sq_kl+320];
+ vj_ij -= R_0_2_0_0 * dm_kl_cache[sq_kl+384];
+ vj_ij += R_0_2_0_1 * dm_kl_cache[sq_kl+448];
+ vj_ij += R_0_2_1_0 * dm_kl_cache[sq_kl+512];
+ vj_ij += R_0_3_0_0 * dm_kl_cache[sq_kl+576];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+1536] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_1_0_1 * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_1_0_2 * dm_kl_cache[sq_kl+64];
+ vj_ij += R_0_1_0_3 * dm_kl_cache[sq_kl+128];
+ vj_ij -= R_0_1_1_1 * dm_kl_cache[sq_kl+192];
+ vj_ij += R_0_1_1_2 * dm_kl_cache[sq_kl+256];
+ vj_ij += R_0_1_2_1 * dm_kl_cache[sq_kl+320];
+ vj_ij -= R_0_2_0_1 * dm_kl_cache[sq_kl+384];
+ vj_ij += R_0_2_0_2 * dm_kl_cache[sq_kl+448];
+ vj_ij += R_0_2_1_1 * dm_kl_cache[sq_kl+512];
+ vj_ij += R_0_3_0_1 * dm_kl_cache[sq_kl+576];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+1792] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_1_1_0 * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_1_1_1 * dm_kl_cache[sq_kl+64];
+ vj_ij += R_0_1_1_2 * dm_kl_cache[sq_kl+128];
+ vj_ij -= R_0_1_2_0 * dm_kl_cache[sq_kl+192];
+ vj_ij += R_0_1_2_1 * dm_kl_cache[sq_kl+256];
+ vj_ij += R_0_1_3_0 * dm_kl_cache[sq_kl+320];
+ vj_ij -= R_0_2_1_0 * dm_kl_cache[sq_kl+384];
+ vj_ij += R_0_2_1_1 * dm_kl_cache[sq_kl+448];
+ vj_ij += R_0_2_2_0 * dm_kl_cache[sq_kl+512];
+ vj_ij += R_0_3_1_0 * dm_kl_cache[sq_kl+576];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+2048] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_2_0_0 * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_2_0_1 * dm_kl_cache[sq_kl+64];
+ vj_ij += R_0_2_0_2 * dm_kl_cache[sq_kl+128];
+ vj_ij -= R_0_2_1_0 * dm_kl_cache[sq_kl+192];
+ vj_ij += R_0_2_1_1 * dm_kl_cache[sq_kl+256];
+ vj_ij += R_0_2_2_0 * dm_kl_cache[sq_kl+320];
+ vj_ij -= R_0_3_0_0 * dm_kl_cache[sq_kl+384];
+ vj_ij += R_0_3_0_1 * dm_kl_cache[sq_kl+448];
+ vj_ij += R_0_3_1_0 * dm_kl_cache[sq_kl+512];
+ vj_ij += R_0_4_0_0 * dm_kl_cache[sq_kl+576];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+2304] += vj_cache[sq_id];
+ }
+ __syncthreads();
+ } }
+ for (int n = ty; n < 160; n += 16) {
+ int i = n / 16;
+ int tile = n % 16;
+ int task_ij = blockIdx.x * 256 + tile * 16 + tx;
+ if (task_ij < npairs_ij) {
+ int pair_ij = pair_ij_mapping[task_ij];
+ int dm_ij_pair0 = dm_pair_loc[pair_ij];
+ int sq_ij = tx + tile * 16;
+ atomicAdd(vj+dm_ij_pair0+i, vj_ij_cache[sq_ij+i*256]);
+ }
+ }
+ for (int n = tx; n < 40; n += 16) {
+ int i = n / 4;
+ int tile = n % 4;
+ int task_kl = blockIdx.y * 64 + tile * 16 + ty;
+ if (task_kl < npairs_kl) {
+ int pair_kl = pair_kl_mapping[task_kl];
+ int dm_kl_pair0 = dm_pair_loc[pair_kl];
+ int sq_kl = ty + tile * 16;
+ atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*64]);
+ }
+ }
+}
+
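+// md_j_3_0 specializes the template to lkl = 0: the kl side reduces to the
+// single Hermite component dm_kl_cache[sq_kl+0], so each of the 20 vj_ij
+// accumulations for the l = 3 ij side is a single product and gamma_inc is
+// only needed up to order 3.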
+// TILEX=8, TILEY=16, cache_dm=True
+__global__
+void md_j_3_0(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds)
+{
+ int *pair_ij_mapping = bounds.tile_ij_mapping;
+ int *pair_kl_mapping = bounds.tile_kl_mapping;
+ int task_ij0 = blockIdx.x * 128;
+ int task_kl0 = blockIdx.y * 256;
+ int pair_ij0 = pair_ij_mapping[task_ij0];
+ int pair_kl0 = pair_kl_mapping[task_kl0];
+ float *q_cond = bounds.q_cond;
+ if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) {
+ return;
+ }
+
+ int tx = threadIdx.x;
+ int ty = threadIdx.y;
+ int sq_id = tx + 16 * ty;
+ int *bas = envs.bas;
+ int *dm_pair_loc = envs.ao_loc;
+ int nbas = envs.nbas;
+ double *env = envs.env;
+ double *dm = jk.dm;
+ double *vj = jk.vj;
+ double vj_ij, vj_kl;
+
+ int npairs_ij = bounds.npairs_ij;
+ int npairs_kl = bounds.npairs_kl;
+ extern __shared__ double gamma_inc[];
+ double *Rp_cache = gamma_inc + 1024;
+ double *Rq_cache = Rp_cache + 512;
+ double *vj_ij_cache = Rq_cache + 1024;
+ double *vj_kl_cache = vj_ij_cache + 2560;
+ double *vj_cache = vj_kl_cache + 256;
+ double *dm_ij_cache = vj_cache + 256;
+ double *dm_kl_cache = dm_ij_cache + 2560;
+    // Zero out every shared-memory cache that follows gamma_inc (Rp/Rq, vj and dm tiles).
+ for (int n = sq_id; n < 7424; n += 256) {
+ Rp_cache[n] = 0.;
+ }
+ __syncthreads();
+
+ if (sq_id < 128) {
+ int task_ij = blockIdx.x * 128 + sq_id;
+ if (task_ij < npairs_ij) {
+ int pair_ij = pair_ij_mapping[task_ij];
+ int ish = pair_ij / nbas;
+ int jsh = pair_ij % nbas;
+ double ai = env[bas[ish*BAS_SLOTS+PTR_EXP]];
+ double aj = env[bas[jsh*BAS_SLOTS+PTR_EXP]];
+ double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+ double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+ double aij = ai + aj;
+ double xij = (ai * ri[0] + aj * rj[0]) / aij;
+ double yij = (ai * ri[1] + aj * rj[1]) / aij;
+ double zij = (ai * ri[2] + aj * rj[2]) / aij;
+ Rp_cache[sq_id+0] = xij;
+ Rp_cache[sq_id+128] = yij;
+ Rp_cache[sq_id+256] = zij;
+ Rp_cache[sq_id+384] = aij;
+ } else {
+ Rp_cache[sq_id+384] = 1.;
+ }
+ }
+ if (sq_id < 256) {
+ int task_kl = blockIdx.y * 256 + sq_id;
+ if (task_kl < npairs_kl) {
+ int pair_kl = pair_kl_mapping[task_kl];
+ int ksh = pair_kl / nbas;
+ int lsh = pair_kl % nbas;
+ double ak = env[bas[ksh*BAS_SLOTS+PTR_EXP]];
+ double al = env[bas[lsh*BAS_SLOTS+PTR_EXP]];
+ double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+ double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD];
+ double akl = ak + al;
+ double xkl = (ak * rk[0] + al * rl[0]) / akl;
+ double ykl = (ak * rk[1] + al * rl[1]) / akl;
+ double zkl = (ak * rk[2] + al * rl[2]) / akl;
+ Rq_cache[sq_id+0] = xkl;
+ Rq_cache[sq_id+256] = ykl;
+ Rq_cache[sq_id+512] = zkl;
+ Rq_cache[sq_id+768] = akl;
+ } else {
+ Rq_cache[sq_id+768] = 1.;
+ }
+ }
+ for (int n = ty; n < 160; n += 16) {
+ int i = n / 8;
+ int tile = n % 8;
+ int task_ij = blockIdx.x * 128 + tile * 16 + tx;
+ if (task_ij < npairs_ij) {
+ int pair_ij = pair_ij_mapping[task_ij];
+ int dm_ij_pair0 = dm_pair_loc[pair_ij];
+ int sq_ij = tx + tile * 16;
+ dm_ij_cache[sq_ij+i*128] = dm[dm_ij_pair0+i];
+ }
+ }
+ for (int n = tx; n < 16; n += 16) {
+ int i = n / 16;
+ int tile = n % 16;
+ int task_kl = blockIdx.y * 256 + tile * 16 + ty;
+ if (task_kl < npairs_kl) {
+ int pair_kl = pair_kl_mapping[task_kl];
+ int dm_kl_pair0 = dm_pair_loc[pair_kl];
+ int sq_kl = ty + tile * 16;
+ dm_kl_cache[sq_kl+i*256] = dm[dm_kl_pair0+i];
+ }
+ }
+ __syncthreads();
+
+ for (int batch_ij = 0; batch_ij < 8; ++batch_ij) {
+ for (int batch_kl = 0; batch_kl < 16; ++batch_kl) {
+ int task_ij0 = blockIdx.x * 128 + batch_ij * 16;
+ int task_kl0 = blockIdx.y * 256 + batch_kl * 16;
+ if (task_ij0 >= npairs_ij || task_kl0 >= npairs_kl) {
+ continue;
+ }
+ int pair_ij0 = pair_ij_mapping[task_ij0];
+ int pair_kl0 = pair_kl_mapping[task_kl0];
+ if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) {
+ continue;
+ }
+
+ int sq_ij = tx + batch_ij * 16;
+ int sq_kl = ty + batch_kl * 16;
+ int task_ij = task_ij0 + tx;
+ int task_kl = task_kl0 + ty;
+ double fac_sym = PI_FAC;
+ if (task_ij >= npairs_ij) {
+ task_ij = task_ij0;
+ fac_sym = 0.;
+ }
+ if (task_kl >= npairs_kl) {
+ task_kl = task_kl0;
+ fac_sym = 0.;
+ }
+ int pair_ij = pair_ij_mapping[task_ij];
+ int pair_kl = pair_kl_mapping[task_kl];
+
+ int ish = pair_ij / nbas;
+ int jsh = pair_ij % nbas;
+ int ksh = pair_kl / nbas;
+ int lsh = pair_kl % nbas;
+ if (ish == jsh) fac_sym *= .5;
+ if (ksh == lsh) fac_sym *= .5;
+ if (pair_ij_mapping == pair_kl_mapping) {
+ if (task_ij == task_kl) fac_sym *= .5;
+ if (task_ij < task_kl) fac_sym = 0.;
+ }
+ double xij = Rp_cache[sq_ij+0];
+ double yij = Rp_cache[sq_ij+128];
+ double zij = Rp_cache[sq_ij+256];
+ double aij = Rp_cache[sq_ij+384];
+ double xkl = Rq_cache[sq_kl+0];
+ double ykl = Rq_cache[sq_kl+256];
+ double zkl = Rq_cache[sq_kl+512];
+ double akl = Rq_cache[sq_kl+768];
+ double fac = fac_sym / (aij*akl*sqrt(aij+akl));
+ double xpq = xij - xkl;
+ double ypq = yij - ykl;
+ double zpq = zij - zkl;
+ double rr = xpq*xpq + ypq*ypq + zpq*zpq;
+ double theta = aij * akl / (aij + akl);
+ double theta_rr = theta * rr;
+ eval_gamma_inc_fn(gamma_inc, theta_rr, 3);
+ double a2 = -2. * theta;
+ gamma_inc[sq_id] *= fac;
+ for (int i = 1; i <= 3; i++) {
+ fac *= a2;
+ gamma_inc[sq_id+i*256] *= fac;
+ }
+ vj_kl = 0.;
+ vj_kl += gamma_inc[sq_id+0*256] * dm_ij_cache[sq_ij+0];
+ double R_0_0_0_1 = zpq * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_0_0_1 * dm_ij_cache[sq_ij+128];
+ double R_1_0_0_1 = zpq * gamma_inc[sq_id+2*256];
+ double R_0_0_0_2 = zpq * R_1_0_0_1 + 1 * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_0_0_2 * dm_ij_cache[sq_ij+256];
+ double R_2_0_0_1 = zpq * gamma_inc[sq_id+3*256];
+ double R_1_0_0_2 = zpq * R_2_0_0_1 + 1 * gamma_inc[sq_id+2*256];
+ double R_0_0_0_3 = zpq * R_1_0_0_2 + 2 * R_1_0_0_1;
+ vj_kl += R_0_0_0_3 * dm_ij_cache[sq_ij+384];
+ double R_0_0_1_0 = ypq * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_0_1_0 * dm_ij_cache[sq_ij+512];
+ double R_0_0_1_1 = ypq * R_1_0_0_1;
+ vj_kl += R_0_0_1_1 * dm_ij_cache[sq_ij+640];
+ double R_0_0_1_2 = ypq * R_1_0_0_2;
+ vj_kl += R_0_0_1_2 * dm_ij_cache[sq_ij+768];
+ double R_1_0_1_0 = ypq * gamma_inc[sq_id+2*256];
+ double R_0_0_2_0 = ypq * R_1_0_1_0 + 1 * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_0_2_0 * dm_ij_cache[sq_ij+896];
+ double R_1_0_1_1 = ypq * R_2_0_0_1;
+ double R_0_0_2_1 = ypq * R_1_0_1_1 + 1 * R_1_0_0_1;
+ vj_kl += R_0_0_2_1 * dm_ij_cache[sq_ij+1024];
+ double R_2_0_1_0 = ypq * gamma_inc[sq_id+3*256];
+ double R_1_0_2_0 = ypq * R_2_0_1_0 + 1 * gamma_inc[sq_id+2*256];
+ double R_0_0_3_0 = ypq * R_1_0_2_0 + 2 * R_1_0_1_0;
+ vj_kl += R_0_0_3_0 * dm_ij_cache[sq_ij+1152];
+ double R_0_1_0_0 = xpq * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_1_0_0 * dm_ij_cache[sq_ij+1280];
+ double R_0_1_0_1 = xpq * R_1_0_0_1;
+ vj_kl += R_0_1_0_1 * dm_ij_cache[sq_ij+1408];
+ double R_0_1_0_2 = xpq * R_1_0_0_2;
+ vj_kl += R_0_1_0_2 * dm_ij_cache[sq_ij+1536];
+ double R_0_1_1_0 = xpq * R_1_0_1_0;
+ vj_kl += R_0_1_1_0 * dm_ij_cache[sq_ij+1664];
+ double R_0_1_1_1 = xpq * R_1_0_1_1;
+ vj_kl += R_0_1_1_1 * dm_ij_cache[sq_ij+1792];
+ double R_0_1_2_0 = xpq * R_1_0_2_0;
+ vj_kl += R_0_1_2_0 * dm_ij_cache[sq_ij+1920];
+ double R_1_1_0_0 = xpq * gamma_inc[sq_id+2*256];
+ double R_0_2_0_0 = xpq * R_1_1_0_0 + 1 * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_2_0_0 * dm_ij_cache[sq_ij+2048];
+ double R_1_1_0_1 = xpq * R_2_0_0_1;
+ double R_0_2_0_1 = xpq * R_1_1_0_1 + 1 * R_1_0_0_1;
+ vj_kl += R_0_2_0_1 * dm_ij_cache[sq_ij+2176];
+ double R_1_1_1_0 = xpq * R_2_0_1_0;
+ double R_0_2_1_0 = xpq * R_1_1_1_0 + 1 * R_1_0_1_0;
+ vj_kl += R_0_2_1_0 * dm_ij_cache[sq_ij+2304];
+ double R_2_1_0_0 = xpq * gamma_inc[sq_id+3*256];
+ double R_1_2_0_0 = xpq * R_2_1_0_0 + 1 * gamma_inc[sq_id+2*256];
+ double R_0_3_0_0 = xpq * R_1_2_0_0 + 2 * R_1_1_0_0;
+ vj_kl += R_0_3_0_0 * dm_ij_cache[sq_ij+2432];
+ __syncthreads();
+ vj_cache[sq_id] = vj_kl;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (tx < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride];
+ }
+ }
+ __syncthreads();
+ if (tx == 0 && task_kl0+ty < npairs_kl) {
+ vj_kl_cache[sq_kl+0] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += gamma_inc[sq_id+0*256] * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+0] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_0_1 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+128] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_0_2 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+256] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_0_3 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+384] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_1_0 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+512] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_1_1 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+640] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_1_2 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+768] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_2_0 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+896] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_2_1 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+1024] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_3_0 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+1152] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_1_0_0 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+1280] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_1_0_1 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+1408] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_1_0_2 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+1536] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_1_1_0 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+1664] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_1_1_1 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+1792] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_1_2_0 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+1920] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_2_0_0 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+2048] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_2_0_1 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+2176] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_2_1_0 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+2304] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_3_0_0 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+2432] += vj_cache[sq_id];
+ }
+ __syncthreads();
+ } }
+ for (int n = ty; n < 160; n += 16) {
+ int i = n / 8;
+ int tile = n % 8;
+ int task_ij = blockIdx.x * 128 + tile * 16 + tx;
+ if (task_ij < npairs_ij) {
+ int pair_ij = pair_ij_mapping[task_ij];
+ int dm_ij_pair0 = dm_pair_loc[pair_ij];
+ int sq_ij = tx + tile * 16;
+ atomicAdd(vj+dm_ij_pair0+i, vj_ij_cache[sq_ij+i*128]);
+ }
+ }
+ for (int n = tx; n < 16; n += 16) {
+ int i = n / 16;
+ int tile = n % 16;
+ int task_kl = blockIdx.y * 256 + tile * 16 + ty;
+ if (task_kl < npairs_kl) {
+ int pair_kl = pair_kl_mapping[task_kl];
+ int dm_kl_pair0 = dm_pair_loc[pair_kl];
+ int sq_kl = ty + tile * 16;
+ atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*256]);
+ }
+ }
+}
+
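+// md_j_3_1 pairs the 20-component (lij = 3) side with a 4-component
+// (lkl = 1) side; the three odd-parity kl components pick up the minus sign
+// noted earlier, so its vj_kl blocks are predominantly subtractions.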
+// TILEX=4, TILEY=16, cache_dm=True
+__global__
+void md_j_3_1(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds)
+{
+ int *pair_ij_mapping = bounds.tile_ij_mapping;
+ int *pair_kl_mapping = bounds.tile_kl_mapping;
+ int task_ij0 = blockIdx.x * 64;
+ int task_kl0 = blockIdx.y * 256;
+ int pair_ij0 = pair_ij_mapping[task_ij0];
+ int pair_kl0 = pair_kl_mapping[task_kl0];
+ float *q_cond = bounds.q_cond;
+ if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) {
+ return;
+ }
+
+ int tx = threadIdx.x;
+ int ty = threadIdx.y;
+ int sq_id = tx + 16 * ty;
+ int *bas = envs.bas;
+ int *dm_pair_loc = envs.ao_loc;
+ int nbas = envs.nbas;
+ double *env = envs.env;
+ double *dm = jk.dm;
+ double *vj = jk.vj;
+ double vj_ij, vj_kl;
+
+ int npairs_ij = bounds.npairs_ij;
+ int npairs_kl = bounds.npairs_kl;
+ extern __shared__ double gamma_inc[];
+ double *Rp_cache = gamma_inc + 1280;
+ double *Rq_cache = Rp_cache + 256;
+ double *vj_ij_cache = Rq_cache + 1024;
+ double *vj_kl_cache = vj_ij_cache + 1280;
+ double *vj_cache = vj_kl_cache + 1024;
+ double *dm_ij_cache = vj_cache + 256;
+ double *dm_kl_cache = dm_ij_cache + 1280;
+    // Zero out every shared-memory cache that follows gamma_inc (Rp/Rq, vj and dm tiles).
+ for (int n = sq_id; n < 6144; n += 256) {
+ Rp_cache[n] = 0.;
+ }
+ __syncthreads();
+
+ if (sq_id < 64) {
+ int task_ij = blockIdx.x * 64 + sq_id;
+ if (task_ij < npairs_ij) {
+ int pair_ij = pair_ij_mapping[task_ij];
+ int ish = pair_ij / nbas;
+ int jsh = pair_ij % nbas;
+ double ai = env[bas[ish*BAS_SLOTS+PTR_EXP]];
+ double aj = env[bas[jsh*BAS_SLOTS+PTR_EXP]];
+ double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+ double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+ double aij = ai + aj;
+ double xij = (ai * ri[0] + aj * rj[0]) / aij;
+ double yij = (ai * ri[1] + aj * rj[1]) / aij;
+ double zij = (ai * ri[2] + aj * rj[2]) / aij;
+ Rp_cache[sq_id+0] = xij;
+ Rp_cache[sq_id+64] = yij;
+ Rp_cache[sq_id+128] = zij;
+ Rp_cache[sq_id+192] = aij;
+ } else {
+ Rp_cache[sq_id+192] = 1.;
+ }
+ }
+ if (sq_id < 256) {
+ int task_kl = blockIdx.y * 256 + sq_id;
+ if (task_kl < npairs_kl) {
+ int pair_kl = pair_kl_mapping[task_kl];
+ int ksh = pair_kl / nbas;
+ int lsh = pair_kl % nbas;
+ double ak = env[bas[ksh*BAS_SLOTS+PTR_EXP]];
+ double al = env[bas[lsh*BAS_SLOTS+PTR_EXP]];
+ double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+ double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD];
+ double akl = ak + al;
+ double xkl = (ak * rk[0] + al * rl[0]) / akl;
+ double ykl = (ak * rk[1] + al * rl[1]) / akl;
+ double zkl = (ak * rk[2] + al * rl[2]) / akl;
+ Rq_cache[sq_id+0] = xkl;
+ Rq_cache[sq_id+256] = ykl;
+ Rq_cache[sq_id+512] = zkl;
+ Rq_cache[sq_id+768] = akl;
+ } else {
+ Rq_cache[sq_id+768] = 1.;
+ }
+ }
+ for (int n = ty; n < 80; n += 16) {
+ int i = n / 4;
+ int tile = n % 4;
+ int task_ij = blockIdx.x * 64 + tile * 16 + tx;
+ if (task_ij < npairs_ij) {
+ int pair_ij = pair_ij_mapping[task_ij];
+ int dm_ij_pair0 = dm_pair_loc[pair_ij];
+ int sq_ij = tx + tile * 16;
+ dm_ij_cache[sq_ij+i*64] = dm[dm_ij_pair0+i];
+ }
+ }
+ for (int n = tx; n < 64; n += 16) {
+ int i = n / 16;
+ int tile = n % 16;
+ int task_kl = blockIdx.y * 256 + tile * 16 + ty;
+ if (task_kl < npairs_kl) {
+ int pair_kl = pair_kl_mapping[task_kl];
+ int dm_kl_pair0 = dm_pair_loc[pair_kl];
+ int sq_kl = ty + tile * 16;
+ dm_kl_cache[sq_kl+i*256] = dm[dm_kl_pair0+i];
+ }
+ }
+ __syncthreads();
+
+ for (int batch_ij = 0; batch_ij < 4; ++batch_ij) {
+ for (int batch_kl = 0; batch_kl < 16; ++batch_kl) {
+ int task_ij0 = blockIdx.x * 64 + batch_ij * 16;
+ int task_kl0 = blockIdx.y * 256 + batch_kl * 16;
+ if (task_ij0 >= npairs_ij || task_kl0 >= npairs_kl) {
+ continue;
+ }
+ int pair_ij0 = pair_ij_mapping[task_ij0];
+ int pair_kl0 = pair_kl_mapping[task_kl0];
+ if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) {
+ continue;
+ }
+
+ int sq_ij = tx + batch_ij * 16;
+ int sq_kl = ty + batch_kl * 16;
+ int task_ij = task_ij0 + tx;
+ int task_kl = task_kl0 + ty;
+ double fac_sym = PI_FAC;
+ if (task_ij >= npairs_ij) {
+ task_ij = task_ij0;
+ fac_sym = 0.;
+ }
+ if (task_kl >= npairs_kl) {
+ task_kl = task_kl0;
+ fac_sym = 0.;
+ }
+ int pair_ij = pair_ij_mapping[task_ij];
+ int pair_kl = pair_kl_mapping[task_kl];
+
+ int ish = pair_ij / nbas;
+ int jsh = pair_ij % nbas;
+ int ksh = pair_kl / nbas;
+ int lsh = pair_kl % nbas;
+ if (ish == jsh) fac_sym *= .5;
+ if (ksh == lsh) fac_sym *= .5;
+ if (pair_ij_mapping == pair_kl_mapping) {
+ if (task_ij == task_kl) fac_sym *= .5;
+ if (task_ij < task_kl) fac_sym = 0.;
+ }
+ double xij = Rp_cache[sq_ij+0];
+ double yij = Rp_cache[sq_ij+64];
+ double zij = Rp_cache[sq_ij+128];
+ double aij = Rp_cache[sq_ij+192];
+ double xkl = Rq_cache[sq_kl+0];
+ double ykl = Rq_cache[sq_kl+256];
+ double zkl = Rq_cache[sq_kl+512];
+ double akl = Rq_cache[sq_kl+768];
+ double fac = fac_sym / (aij*akl*sqrt(aij+akl));
+ double xpq = xij - xkl;
+ double ypq = yij - ykl;
+ double zpq = zij - zkl;
+ double rr = xpq*xpq + ypq*ypq + zpq*zpq;
+ double theta = aij * akl / (aij + akl);
+ double theta_rr = theta * rr;
+ eval_gamma_inc_fn(gamma_inc, theta_rr, 4);
+ double a2 = -2. * theta;
+ gamma_inc[sq_id] *= fac;
+ for (int i = 1; i <= 4; i++) {
+ fac *= a2;
+ gamma_inc[sq_id+i*256] *= fac;
+ }
+ vj_kl = 0.;
+ vj_kl += gamma_inc[sq_id+0*256] * dm_ij_cache[sq_ij+0];
+ double R_0_0_0_1 = zpq * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_0_0_1 * dm_ij_cache[sq_ij+64];
+ double R_1_0_0_1 = zpq * gamma_inc[sq_id+2*256];
+ double R_0_0_0_2 = zpq * R_1_0_0_1 + 1 * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_0_0_2 * dm_ij_cache[sq_ij+128];
+ double R_2_0_0_1 = zpq * gamma_inc[sq_id+3*256];
+ double R_1_0_0_2 = zpq * R_2_0_0_1 + 1 * gamma_inc[sq_id+2*256];
+ double R_0_0_0_3 = zpq * R_1_0_0_2 + 2 * R_1_0_0_1;
+ vj_kl += R_0_0_0_3 * dm_ij_cache[sq_ij+192];
+ double R_0_0_1_0 = ypq * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_0_1_0 * dm_ij_cache[sq_ij+256];
+ double R_0_0_1_1 = ypq * R_1_0_0_1;
+ vj_kl += R_0_0_1_1 * dm_ij_cache[sq_ij+320];
+ double R_0_0_1_2 = ypq * R_1_0_0_2;
+ vj_kl += R_0_0_1_2 * dm_ij_cache[sq_ij+384];
+ double R_1_0_1_0 = ypq * gamma_inc[sq_id+2*256];
+ double R_0_0_2_0 = ypq * R_1_0_1_0 + 1 * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_0_2_0 * dm_ij_cache[sq_ij+448];
+ double R_1_0_1_1 = ypq * R_2_0_0_1;
+ double R_0_0_2_1 = ypq * R_1_0_1_1 + 1 * R_1_0_0_1;
+ vj_kl += R_0_0_2_1 * dm_ij_cache[sq_ij+512];
+ double R_2_0_1_0 = ypq * gamma_inc[sq_id+3*256];
+ double R_1_0_2_0 = ypq * R_2_0_1_0 + 1 * gamma_inc[sq_id+2*256];
+ double R_0_0_3_0 = ypq * R_1_0_2_0 + 2 * R_1_0_1_0;
+ vj_kl += R_0_0_3_0 * dm_ij_cache[sq_ij+576];
+ double R_0_1_0_0 = xpq * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_1_0_0 * dm_ij_cache[sq_ij+640];
+ double R_0_1_0_1 = xpq * R_1_0_0_1;
+ vj_kl += R_0_1_0_1 * dm_ij_cache[sq_ij+704];
+ double R_0_1_0_2 = xpq * R_1_0_0_2;
+ vj_kl += R_0_1_0_2 * dm_ij_cache[sq_ij+768];
+ double R_0_1_1_0 = xpq * R_1_0_1_0;
+ vj_kl += R_0_1_1_0 * dm_ij_cache[sq_ij+832];
+ double R_0_1_1_1 = xpq * R_1_0_1_1;
+ vj_kl += R_0_1_1_1 * dm_ij_cache[sq_ij+896];
+ double R_0_1_2_0 = xpq * R_1_0_2_0;
+ vj_kl += R_0_1_2_0 * dm_ij_cache[sq_ij+960];
+ double R_1_1_0_0 = xpq * gamma_inc[sq_id+2*256];
+ double R_0_2_0_0 = xpq * R_1_1_0_0 + 1 * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_2_0_0 * dm_ij_cache[sq_ij+1024];
+ double R_1_1_0_1 = xpq * R_2_0_0_1;
+ double R_0_2_0_1 = xpq * R_1_1_0_1 + 1 * R_1_0_0_1;
+ vj_kl += R_0_2_0_1 * dm_ij_cache[sq_ij+1088];
+ double R_1_1_1_0 = xpq * R_2_0_1_0;
+ double R_0_2_1_0 = xpq * R_1_1_1_0 + 1 * R_1_0_1_0;
+ vj_kl += R_0_2_1_0 * dm_ij_cache[sq_ij+1152];
+ double R_2_1_0_0 = xpq * gamma_inc[sq_id+3*256];
+ double R_1_2_0_0 = xpq * R_2_1_0_0 + 1 * gamma_inc[sq_id+2*256];
+ double R_0_3_0_0 = xpq * R_1_2_0_0 + 2 * R_1_1_0_0;
+ vj_kl += R_0_3_0_0 * dm_ij_cache[sq_ij+1216];
+ __syncthreads();
+ vj_cache[sq_id] = vj_kl;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (tx < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride];
+ }
+ }
+ __syncthreads();
+ if (tx == 0 && task_kl0+ty < npairs_kl) {
+ vj_kl_cache[sq_kl+0] += vj_cache[sq_id];
+ }
+ vj_kl = 0.;
+ vj_kl -= R_0_0_0_1 * dm_ij_cache[sq_ij+0];
+ vj_kl -= R_0_0_0_2 * dm_ij_cache[sq_ij+64];
+ vj_kl -= R_0_0_0_3 * dm_ij_cache[sq_ij+128];
+ double R_3_0_0_1 = zpq * gamma_inc[sq_id+4*256];
+ double R_2_0_0_2 = zpq * R_3_0_0_1 + 1 * gamma_inc[sq_id+3*256];
+ double R_1_0_0_3 = zpq * R_2_0_0_2 + 2 * R_2_0_0_1;
+ double R_0_0_0_4 = zpq * R_1_0_0_3 + 3 * R_1_0_0_2;
+ vj_kl -= R_0_0_0_4 * dm_ij_cache[sq_ij+192];
+ vj_kl -= R_0_0_1_1 * dm_ij_cache[sq_ij+256];
+ vj_kl -= R_0_0_1_2 * dm_ij_cache[sq_ij+320];
+ double R_0_0_1_3 = ypq * R_1_0_0_3;
+ vj_kl -= R_0_0_1_3 * dm_ij_cache[sq_ij+384];
+ vj_kl -= R_0_0_2_1 * dm_ij_cache[sq_ij+448];
+ double R_1_0_1_2 = ypq * R_2_0_0_2;
+ double R_0_0_2_2 = ypq * R_1_0_1_2 + 1 * R_1_0_0_2;
+ vj_kl -= R_0_0_2_2 * dm_ij_cache[sq_ij+512];
+ double R_2_0_1_1 = ypq * R_3_0_0_1;
+ double R_1_0_2_1 = ypq * R_2_0_1_1 + 1 * R_2_0_0_1;
+ double R_0_0_3_1 = ypq * R_1_0_2_1 + 2 * R_1_0_1_1;
+ vj_kl -= R_0_0_3_1 * dm_ij_cache[sq_ij+576];
+ vj_kl -= R_0_1_0_1 * dm_ij_cache[sq_ij+640];
+ vj_kl -= R_0_1_0_2 * dm_ij_cache[sq_ij+704];
+ double R_0_1_0_3 = xpq * R_1_0_0_3;
+ vj_kl -= R_0_1_0_3 * dm_ij_cache[sq_ij+768];
+ vj_kl -= R_0_1_1_1 * dm_ij_cache[sq_ij+832];
+ double R_0_1_1_2 = xpq * R_1_0_1_2;
+ vj_kl -= R_0_1_1_2 * dm_ij_cache[sq_ij+896];
+ double R_0_1_2_1 = xpq * R_1_0_2_1;
+ vj_kl -= R_0_1_2_1 * dm_ij_cache[sq_ij+960];
+ vj_kl -= R_0_2_0_1 * dm_ij_cache[sq_ij+1024];
+ double R_1_1_0_2 = xpq * R_2_0_0_2;
+ double R_0_2_0_2 = xpq * R_1_1_0_2 + 1 * R_1_0_0_2;
+ vj_kl -= R_0_2_0_2 * dm_ij_cache[sq_ij+1088];
+ double R_1_1_1_1 = xpq * R_2_0_1_1;
+ double R_0_2_1_1 = xpq * R_1_1_1_1 + 1 * R_1_0_1_1;
+ vj_kl -= R_0_2_1_1 * dm_ij_cache[sq_ij+1152];
+ double R_2_1_0_1 = xpq * R_3_0_0_1;
+ double R_1_2_0_1 = xpq * R_2_1_0_1 + 1 * R_2_0_0_1;
+ double R_0_3_0_1 = xpq * R_1_2_0_1 + 2 * R_1_1_0_1;
+ vj_kl -= R_0_3_0_1 * dm_ij_cache[sq_ij+1216];
+ __syncthreads();
+ vj_cache[sq_id] = vj_kl;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (tx < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride];
+ }
+ }
+ __syncthreads();
+ if (tx == 0 && task_kl0+ty < npairs_kl) {
+ vj_kl_cache[sq_kl+256] += vj_cache[sq_id];
+ }
+ vj_kl = 0.;
+ vj_kl -= R_0_0_1_0 * dm_ij_cache[sq_ij+0];
+ vj_kl -= R_0_0_1_1 * dm_ij_cache[sq_ij+64];
+ vj_kl -= R_0_0_1_2 * dm_ij_cache[sq_ij+128];
+ vj_kl -= R_0_0_1_3 * dm_ij_cache[sq_ij+192];
+ vj_kl -= R_0_0_2_0 * dm_ij_cache[sq_ij+256];
+ vj_kl -= R_0_0_2_1 * dm_ij_cache[sq_ij+320];
+ vj_kl -= R_0_0_2_2 * dm_ij_cache[sq_ij+384];
+ vj_kl -= R_0_0_3_0 * dm_ij_cache[sq_ij+448];
+ vj_kl -= R_0_0_3_1 * dm_ij_cache[sq_ij+512];
+ double R_3_0_1_0 = ypq * gamma_inc[sq_id+4*256];
+ double R_2_0_2_0 = ypq * R_3_0_1_0 + 1 * gamma_inc[sq_id+3*256];
+ double R_1_0_3_0 = ypq * R_2_0_2_0 + 2 * R_2_0_1_0;
+ double R_0_0_4_0 = ypq * R_1_0_3_0 + 3 * R_1_0_2_0;
+ vj_kl -= R_0_0_4_0 * dm_ij_cache[sq_ij+576];
+ vj_kl -= R_0_1_1_0 * dm_ij_cache[sq_ij+640];
+ vj_kl -= R_0_1_1_1 * dm_ij_cache[sq_ij+704];
+ vj_kl -= R_0_1_1_2 * dm_ij_cache[sq_ij+768];
+ vj_kl -= R_0_1_2_0 * dm_ij_cache[sq_ij+832];
+ vj_kl -= R_0_1_2_1 * dm_ij_cache[sq_ij+896];
+ double R_0_1_3_0 = xpq * R_1_0_3_0;
+ vj_kl -= R_0_1_3_0 * dm_ij_cache[sq_ij+960];
+ vj_kl -= R_0_2_1_0 * dm_ij_cache[sq_ij+1024];
+ vj_kl -= R_0_2_1_1 * dm_ij_cache[sq_ij+1088];
+ double R_1_1_2_0 = xpq * R_2_0_2_0;
+ double R_0_2_2_0 = xpq * R_1_1_2_0 + 1 * R_1_0_2_0;
+ vj_kl -= R_0_2_2_0 * dm_ij_cache[sq_ij+1152];
+ double R_2_1_1_0 = xpq * R_3_0_1_0;
+ double R_1_2_1_0 = xpq * R_2_1_1_0 + 1 * R_2_0_1_0;
+ double R_0_3_1_0 = xpq * R_1_2_1_0 + 2 * R_1_1_1_0;
+ vj_kl -= R_0_3_1_0 * dm_ij_cache[sq_ij+1216];
+ __syncthreads();
+ vj_cache[sq_id] = vj_kl;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (tx < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride];
+ }
+ }
+ __syncthreads();
+ if (tx == 0 && task_kl0+ty < npairs_kl) {
+ vj_kl_cache[sq_kl+512] += vj_cache[sq_id];
+ }
+ vj_kl = 0.;
+ vj_kl -= R_0_1_0_0 * dm_ij_cache[sq_ij+0];
+ vj_kl -= R_0_1_0_1 * dm_ij_cache[sq_ij+64];
+ vj_kl -= R_0_1_0_2 * dm_ij_cache[sq_ij+128];
+ vj_kl -= R_0_1_0_3 * dm_ij_cache[sq_ij+192];
+ vj_kl -= R_0_1_1_0 * dm_ij_cache[sq_ij+256];
+ vj_kl -= R_0_1_1_1 * dm_ij_cache[sq_ij+320];
+ vj_kl -= R_0_1_1_2 * dm_ij_cache[sq_ij+384];
+ vj_kl -= R_0_1_2_0 * dm_ij_cache[sq_ij+448];
+ vj_kl -= R_0_1_2_1 * dm_ij_cache[sq_ij+512];
+ vj_kl -= R_0_1_3_0 * dm_ij_cache[sq_ij+576];
+ vj_kl -= R_0_2_0_0 * dm_ij_cache[sq_ij+640];
+ vj_kl -= R_0_2_0_1 * dm_ij_cache[sq_ij+704];
+ vj_kl -= R_0_2_0_2 * dm_ij_cache[sq_ij+768];
+ vj_kl -= R_0_2_1_0 * dm_ij_cache[sq_ij+832];
+ vj_kl -= R_0_2_1_1 * dm_ij_cache[sq_ij+896];
+ vj_kl -= R_0_2_2_0 * dm_ij_cache[sq_ij+960];
+ vj_kl -= R_0_3_0_0 * dm_ij_cache[sq_ij+1024];
+ vj_kl -= R_0_3_0_1 * dm_ij_cache[sq_ij+1088];
+ vj_kl -= R_0_3_1_0 * dm_ij_cache[sq_ij+1152];
+ double R_3_1_0_0 = xpq * gamma_inc[sq_id+4*256];
+ double R_2_2_0_0 = xpq * R_3_1_0_0 + 1 * gamma_inc[sq_id+3*256];
+ double R_1_3_0_0 = xpq * R_2_2_0_0 + 2 * R_2_1_0_0;
+ double R_0_4_0_0 = xpq * R_1_3_0_0 + 3 * R_1_2_0_0;
+ vj_kl -= R_0_4_0_0 * dm_ij_cache[sq_ij+1216];
+ __syncthreads();
+ vj_cache[sq_id] = vj_kl;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (tx < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride];
+ }
+ }
+ __syncthreads();
+ if (tx == 0 && task_kl0+ty < npairs_kl) {
+ vj_kl_cache[sq_kl+768] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += gamma_inc[sq_id+0*256] * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_0_0_1 * dm_kl_cache[sq_kl+256];
+ vj_ij -= R_0_0_1_0 * dm_kl_cache[sq_kl+512];
+ vj_ij -= R_0_1_0_0 * dm_kl_cache[sq_kl+768];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
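+            // tree-reduce the 16 ty lanes (the kl tile) into lane ty == 0 for this ij pair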
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+0] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_0_1 * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_0_0_2 * dm_kl_cache[sq_kl+256];
+ vj_ij -= R_0_0_1_1 * dm_kl_cache[sq_kl+512];
+ vj_ij -= R_0_1_0_1 * dm_kl_cache[sq_kl+768];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+64] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_0_2 * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_0_0_3 * dm_kl_cache[sq_kl+256];
+ vj_ij -= R_0_0_1_2 * dm_kl_cache[sq_kl+512];
+ vj_ij -= R_0_1_0_2 * dm_kl_cache[sq_kl+768];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+128] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_0_3 * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_0_0_4 * dm_kl_cache[sq_kl+256];
+ vj_ij -= R_0_0_1_3 * dm_kl_cache[sq_kl+512];
+ vj_ij -= R_0_1_0_3 * dm_kl_cache[sq_kl+768];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+192] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_1_0 * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_0_1_1 * dm_kl_cache[sq_kl+256];
+ vj_ij -= R_0_0_2_0 * dm_kl_cache[sq_kl+512];
+ vj_ij -= R_0_1_1_0 * dm_kl_cache[sq_kl+768];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+256] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_1_1 * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_0_1_2 * dm_kl_cache[sq_kl+256];
+ vj_ij -= R_0_0_2_1 * dm_kl_cache[sq_kl+512];
+ vj_ij -= R_0_1_1_1 * dm_kl_cache[sq_kl+768];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+320] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_1_2 * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_0_1_3 * dm_kl_cache[sq_kl+256];
+ vj_ij -= R_0_0_2_2 * dm_kl_cache[sq_kl+512];
+ vj_ij -= R_0_1_1_2 * dm_kl_cache[sq_kl+768];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+384] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_2_0 * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_0_2_1 * dm_kl_cache[sq_kl+256];
+ vj_ij -= R_0_0_3_0 * dm_kl_cache[sq_kl+512];
+ vj_ij -= R_0_1_2_0 * dm_kl_cache[sq_kl+768];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+448] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_2_1 * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_0_2_2 * dm_kl_cache[sq_kl+256];
+ vj_ij -= R_0_0_3_1 * dm_kl_cache[sq_kl+512];
+ vj_ij -= R_0_1_2_1 * dm_kl_cache[sq_kl+768];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+512] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_3_0 * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_0_3_1 * dm_kl_cache[sq_kl+256];
+ vj_ij -= R_0_0_4_0 * dm_kl_cache[sq_kl+512];
+ vj_ij -= R_0_1_3_0 * dm_kl_cache[sq_kl+768];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+576] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_1_0_0 * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_1_0_1 * dm_kl_cache[sq_kl+256];
+ vj_ij -= R_0_1_1_0 * dm_kl_cache[sq_kl+512];
+ vj_ij -= R_0_2_0_0 * dm_kl_cache[sq_kl+768];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+640] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_1_0_1 * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_1_0_2 * dm_kl_cache[sq_kl+256];
+ vj_ij -= R_0_1_1_1 * dm_kl_cache[sq_kl+512];
+ vj_ij -= R_0_2_0_1 * dm_kl_cache[sq_kl+768];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+704] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_1_0_2 * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_1_0_3 * dm_kl_cache[sq_kl+256];
+ vj_ij -= R_0_1_1_2 * dm_kl_cache[sq_kl+512];
+ vj_ij -= R_0_2_0_2 * dm_kl_cache[sq_kl+768];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+768] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_1_1_0 * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_1_1_1 * dm_kl_cache[sq_kl+256];
+ vj_ij -= R_0_1_2_0 * dm_kl_cache[sq_kl+512];
+ vj_ij -= R_0_2_1_0 * dm_kl_cache[sq_kl+768];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+832] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_1_1_1 * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_1_1_2 * dm_kl_cache[sq_kl+256];
+ vj_ij -= R_0_1_2_1 * dm_kl_cache[sq_kl+512];
+ vj_ij -= R_0_2_1_1 * dm_kl_cache[sq_kl+768];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+896] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_1_2_0 * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_1_2_1 * dm_kl_cache[sq_kl+256];
+ vj_ij -= R_0_1_3_0 * dm_kl_cache[sq_kl+512];
+ vj_ij -= R_0_2_2_0 * dm_kl_cache[sq_kl+768];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+960] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_2_0_0 * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_2_0_1 * dm_kl_cache[sq_kl+256];
+ vj_ij -= R_0_2_1_0 * dm_kl_cache[sq_kl+512];
+ vj_ij -= R_0_3_0_0 * dm_kl_cache[sq_kl+768];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+1024] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_2_0_1 * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_2_0_2 * dm_kl_cache[sq_kl+256];
+ vj_ij -= R_0_2_1_1 * dm_kl_cache[sq_kl+512];
+ vj_ij -= R_0_3_0_1 * dm_kl_cache[sq_kl+768];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+1088] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_2_1_0 * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_2_1_1 * dm_kl_cache[sq_kl+256];
+ vj_ij -= R_0_2_2_0 * dm_kl_cache[sq_kl+512];
+ vj_ij -= R_0_3_1_0 * dm_kl_cache[sq_kl+768];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+1152] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_3_0_0 * dm_kl_cache[sq_kl+0];
+ vj_ij -= R_0_3_0_1 * dm_kl_cache[sq_kl+256];
+ vj_ij -= R_0_3_1_0 * dm_kl_cache[sq_kl+512];
+ vj_ij -= R_0_4_0_0 * dm_kl_cache[sq_kl+768];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+1216] += vj_cache[sq_id];
+ }
+ __syncthreads();
+ } }
+ for (int n = ty; n < 80; n += 16) {
+ int i = n / 4;
+ int tile = n % 4;
+ int task_ij = blockIdx.x * 64 + tile * 16 + tx;
+ if (task_ij < npairs_ij) {
+ int pair_ij = pair_ij_mapping[task_ij];
+ int dm_ij_pair0 = dm_pair_loc[pair_ij];
+ int sq_ij = tx + tile * 16;
+ atomicAdd(vj+dm_ij_pair0+i, vj_ij_cache[sq_ij+i*64]);
+ }
+ }
+ for (int n = tx; n < 64; n += 16) {
+ int i = n / 16;
+ int tile = n % 16;
+ int task_kl = blockIdx.y * 256 + tile * 16 + ty;
+ if (task_kl < npairs_kl) {
+ int pair_kl = pair_kl_mapping[task_kl];
+ int dm_kl_pair0 = dm_pair_loc[pair_kl];
+ int sq_kl = ty + tile * 16;
+ atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*256]);
+ }
+ }
+}
+
+// TILEX=4, TILEY=16, cache_dm=True
+__global__
+void md_j_4_0(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds)
+{
+ int *pair_ij_mapping = bounds.tile_ij_mapping;
+ int *pair_kl_mapping = bounds.tile_kl_mapping;
+ int task_ij0 = blockIdx.x * 64;
+ int task_kl0 = blockIdx.y * 256;
+ int pair_ij0 = pair_ij_mapping[task_ij0];
+ int pair_kl0 = pair_kl_mapping[task_kl0];
+ float *q_cond = bounds.q_cond;
+ if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) {
+ return;
+ }
+
+ int tx = threadIdx.x;
+ int ty = threadIdx.y;
+ int sq_id = tx + 16 * ty;
+ int *bas = envs.bas;
+ int *dm_pair_loc = envs.ao_loc;
+ int nbas = envs.nbas;
+ double *env = envs.env;
+ double *dm = jk.dm;
+ double *vj = jk.vj;
+ double vj_ij, vj_kl;
+
+ int npairs_ij = bounds.npairs_ij;
+ int npairs_kl = bounds.npairs_kl;
+ extern __shared__ double gamma_inc[];
+ double *Rp_cache = gamma_inc + 1280;
+ double *Rq_cache = Rp_cache + 256;
+ double *vj_ij_cache = Rq_cache + 1024;
+ double *vj_kl_cache = vj_ij_cache + 2240;
+ double *vj_cache = vj_kl_cache + 256;
+ double *dm_ij_cache = vj_cache + 256;
+ double *dm_kl_cache = dm_ij_cache + 2240;
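+    // shared-memory layout (doubles): gamma_inc 1280, Rp 256, Rq 1024, vj_ij 2240,
+    // vj_kl 256, reduction 256, dm_ij 2240, dm_kl 256 -- 7808 total, as registered below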
+    // zero out the 6528 doubles of cache that follow gamma_inc
+ for (int n = sq_id; n < 6528; n += 256) {
+ Rp_cache[n] = 0.;
+ }
+ __syncthreads();
+
+ if (sq_id < 64) {
+ int task_ij = blockIdx.x * 64 + sq_id;
+ if (task_ij < npairs_ij) {
+ int pair_ij = pair_ij_mapping[task_ij];
+ int ish = pair_ij / nbas;
+ int jsh = pair_ij % nbas;
+ double ai = env[bas[ish*BAS_SLOTS+PTR_EXP]];
+ double aj = env[bas[jsh*BAS_SLOTS+PTR_EXP]];
+ double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+ double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+ double aij = ai + aj;
+ double xij = (ai * ri[0] + aj * rj[0]) / aij;
+ double yij = (ai * ri[1] + aj * rj[1]) / aij;
+ double zij = (ai * ri[2] + aj * rj[2]) / aij;
+ Rp_cache[sq_id+0] = xij;
+ Rp_cache[sq_id+64] = yij;
+ Rp_cache[sq_id+128] = zij;
+ Rp_cache[sq_id+192] = aij;
+ } else {
+ Rp_cache[sq_id+192] = 1.;
+ }
+ }
+ if (sq_id < 256) {
+ int task_kl = blockIdx.y * 256 + sq_id;
+ if (task_kl < npairs_kl) {
+ int pair_kl = pair_kl_mapping[task_kl];
+ int ksh = pair_kl / nbas;
+ int lsh = pair_kl % nbas;
+ double ak = env[bas[ksh*BAS_SLOTS+PTR_EXP]];
+ double al = env[bas[lsh*BAS_SLOTS+PTR_EXP]];
+ double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+ double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD];
+ double akl = ak + al;
+ double xkl = (ak * rk[0] + al * rl[0]) / akl;
+ double ykl = (ak * rk[1] + al * rl[1]) / akl;
+ double zkl = (ak * rk[2] + al * rl[2]) / akl;
+ Rq_cache[sq_id+0] = xkl;
+ Rq_cache[sq_id+256] = ykl;
+ Rq_cache[sq_id+512] = zkl;
+ Rq_cache[sq_id+768] = akl;
+ } else {
+ Rq_cache[sq_id+768] = 1.;
+ }
+ }
+ for (int n = ty; n < 140; n += 16) {
+ int i = n / 4;
+ int tile = n % 4;
+ int task_ij = blockIdx.x * 64 + tile * 16 + tx;
+ if (task_ij < npairs_ij) {
+ int pair_ij = pair_ij_mapping[task_ij];
+ int dm_ij_pair0 = dm_pair_loc[pair_ij];
+ int sq_ij = tx + tile * 16;
+ dm_ij_cache[sq_ij+i*64] = dm[dm_ij_pair0+i];
+ }
+ }
+ for (int n = tx; n < 16; n += 16) {
+ int i = n / 16;
+ int tile = n % 16;
+ int task_kl = blockIdx.y * 256 + tile * 16 + ty;
+ if (task_kl < npairs_kl) {
+ int pair_kl = pair_kl_mapping[task_kl];
+ int dm_kl_pair0 = dm_pair_loc[pair_kl];
+ int sq_kl = ty + tile * 16;
+ dm_kl_cache[sq_kl+i*256] = dm[dm_kl_pair0+i];
+ }
+ }
+ __syncthreads();
+
+ for (int batch_ij = 0; batch_ij < 4; ++batch_ij) {
+ for (int batch_kl = 0; batch_kl < 16; ++batch_kl) {
+ int task_ij0 = blockIdx.x * 64 + batch_ij * 16;
+ int task_kl0 = blockIdx.y * 256 + batch_kl * 16;
+ if (task_ij0 >= npairs_ij || task_kl0 >= npairs_kl) {
+ continue;
+ }
+ int pair_ij0 = pair_ij_mapping[task_ij0];
+ int pair_kl0 = pair_kl_mapping[task_kl0];
+ if (q_cond[pair_ij0] + q_cond[pair_kl0] < bounds.cutoff) {
+ continue;
+ }
+
+ int sq_ij = tx + batch_ij * 16;
+ int sq_kl = ty + batch_kl * 16;
+ int task_ij = task_ij0 + tx;
+ int task_kl = task_kl0 + ty;
+ double fac_sym = PI_FAC;
+ if (task_ij >= npairs_ij) {
+ task_ij = task_ij0;
+ fac_sym = 0.;
+ }
+ if (task_kl >= npairs_kl) {
+ task_kl = task_kl0;
+ fac_sym = 0.;
+ }
+ int pair_ij = pair_ij_mapping[task_ij];
+ int pair_kl = pair_kl_mapping[task_kl];
+
+ int ish = pair_ij / nbas;
+ int jsh = pair_ij % nbas;
+ int ksh = pair_kl / nbas;
+ int lsh = pair_kl % nbas;
+ if (ish == jsh) fac_sym *= .5;
+ if (ksh == lsh) fac_sym *= .5;
+ if (pair_ij_mapping == pair_kl_mapping) {
+ if (task_ij == task_kl) fac_sym *= .5;
+ if (task_ij < task_kl) fac_sym = 0.;
+ }
+ double xij = Rp_cache[sq_ij+0];
+ double yij = Rp_cache[sq_ij+64];
+ double zij = Rp_cache[sq_ij+128];
+ double aij = Rp_cache[sq_ij+192];
+ double xkl = Rq_cache[sq_kl+0];
+ double ykl = Rq_cache[sq_kl+256];
+ double zkl = Rq_cache[sq_kl+512];
+ double akl = Rq_cache[sq_kl+768];
+ double fac = fac_sym / (aij*akl*sqrt(aij+akl));
+ double xpq = xij - xkl;
+ double ypq = yij - ykl;
+ double zpq = zij - zkl;
+ double rr = xpq*xpq + ypq*ypq + zpq*zpq;
+ double theta = aij * akl / (aij + akl);
+ double theta_rr = theta * rr;
+ eval_gamma_inc_fn(gamma_inc, theta_rr, 4);
+ double a2 = -2. * theta;
+ gamma_inc[sq_id] *= fac;
+ for (int i = 1; i <= 4; i++) {
+ fac *= a2;
+ gamma_inc[sq_id+i*256] *= fac;
+ }
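+            // after scaling, gamma_inc[m] = fac * (-2*theta)^m * F_m(theta*rr); these are
+            // the order-m seeds of the unrolled R_{t,u,v} recurrences that follow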
+ vj_kl = 0.;
+ vj_kl += gamma_inc[sq_id+0*256] * dm_ij_cache[sq_ij+0];
+ double R_0_0_0_1 = zpq * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_0_0_1 * dm_ij_cache[sq_ij+64];
+ double R_1_0_0_1 = zpq * gamma_inc[sq_id+2*256];
+ double R_0_0_0_2 = zpq * R_1_0_0_1 + 1 * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_0_0_2 * dm_ij_cache[sq_ij+128];
+ double R_2_0_0_1 = zpq * gamma_inc[sq_id+3*256];
+ double R_1_0_0_2 = zpq * R_2_0_0_1 + 1 * gamma_inc[sq_id+2*256];
+ double R_0_0_0_3 = zpq * R_1_0_0_2 + 2 * R_1_0_0_1;
+ vj_kl += R_0_0_0_3 * dm_ij_cache[sq_ij+192];
+ double R_3_0_0_1 = zpq * gamma_inc[sq_id+4*256];
+ double R_2_0_0_2 = zpq * R_3_0_0_1 + 1 * gamma_inc[sq_id+3*256];
+ double R_1_0_0_3 = zpq * R_2_0_0_2 + 2 * R_2_0_0_1;
+ double R_0_0_0_4 = zpq * R_1_0_0_3 + 3 * R_1_0_0_2;
+ vj_kl += R_0_0_0_4 * dm_ij_cache[sq_ij+256];
+ double R_0_0_1_0 = ypq * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_0_1_0 * dm_ij_cache[sq_ij+320];
+ double R_0_0_1_1 = ypq * R_1_0_0_1;
+ vj_kl += R_0_0_1_1 * dm_ij_cache[sq_ij+384];
+ double R_0_0_1_2 = ypq * R_1_0_0_2;
+ vj_kl += R_0_0_1_2 * dm_ij_cache[sq_ij+448];
+ double R_0_0_1_3 = ypq * R_1_0_0_3;
+ vj_kl += R_0_0_1_3 * dm_ij_cache[sq_ij+512];
+ double R_1_0_1_0 = ypq * gamma_inc[sq_id+2*256];
+ double R_0_0_2_0 = ypq * R_1_0_1_0 + 1 * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_0_2_0 * dm_ij_cache[sq_ij+576];
+ double R_1_0_1_1 = ypq * R_2_0_0_1;
+ double R_0_0_2_1 = ypq * R_1_0_1_1 + 1 * R_1_0_0_1;
+ vj_kl += R_0_0_2_1 * dm_ij_cache[sq_ij+640];
+ double R_1_0_1_2 = ypq * R_2_0_0_2;
+ double R_0_0_2_2 = ypq * R_1_0_1_2 + 1 * R_1_0_0_2;
+ vj_kl += R_0_0_2_2 * dm_ij_cache[sq_ij+704];
+ double R_2_0_1_0 = ypq * gamma_inc[sq_id+3*256];
+ double R_1_0_2_0 = ypq * R_2_0_1_0 + 1 * gamma_inc[sq_id+2*256];
+ double R_0_0_3_0 = ypq * R_1_0_2_0 + 2 * R_1_0_1_0;
+ vj_kl += R_0_0_3_0 * dm_ij_cache[sq_ij+768];
+ double R_2_0_1_1 = ypq * R_3_0_0_1;
+ double R_1_0_2_1 = ypq * R_2_0_1_1 + 1 * R_2_0_0_1;
+ double R_0_0_3_1 = ypq * R_1_0_2_1 + 2 * R_1_0_1_1;
+ vj_kl += R_0_0_3_1 * dm_ij_cache[sq_ij+832];
+ double R_3_0_1_0 = ypq * gamma_inc[sq_id+4*256];
+ double R_2_0_2_0 = ypq * R_3_0_1_0 + 1 * gamma_inc[sq_id+3*256];
+ double R_1_0_3_0 = ypq * R_2_0_2_0 + 2 * R_2_0_1_0;
+ double R_0_0_4_0 = ypq * R_1_0_3_0 + 3 * R_1_0_2_0;
+ vj_kl += R_0_0_4_0 * dm_ij_cache[sq_ij+896];
+ double R_0_1_0_0 = xpq * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_1_0_0 * dm_ij_cache[sq_ij+960];
+ double R_0_1_0_1 = xpq * R_1_0_0_1;
+ vj_kl += R_0_1_0_1 * dm_ij_cache[sq_ij+1024];
+ double R_0_1_0_2 = xpq * R_1_0_0_2;
+ vj_kl += R_0_1_0_2 * dm_ij_cache[sq_ij+1088];
+ double R_0_1_0_3 = xpq * R_1_0_0_3;
+ vj_kl += R_0_1_0_3 * dm_ij_cache[sq_ij+1152];
+ double R_0_1_1_0 = xpq * R_1_0_1_0;
+ vj_kl += R_0_1_1_0 * dm_ij_cache[sq_ij+1216];
+ double R_0_1_1_1 = xpq * R_1_0_1_1;
+ vj_kl += R_0_1_1_1 * dm_ij_cache[sq_ij+1280];
+ double R_0_1_1_2 = xpq * R_1_0_1_2;
+ vj_kl += R_0_1_1_2 * dm_ij_cache[sq_ij+1344];
+ double R_0_1_2_0 = xpq * R_1_0_2_0;
+ vj_kl += R_0_1_2_0 * dm_ij_cache[sq_ij+1408];
+ double R_0_1_2_1 = xpq * R_1_0_2_1;
+ vj_kl += R_0_1_2_1 * dm_ij_cache[sq_ij+1472];
+ double R_0_1_3_0 = xpq * R_1_0_3_0;
+ vj_kl += R_0_1_3_0 * dm_ij_cache[sq_ij+1536];
+ double R_1_1_0_0 = xpq * gamma_inc[sq_id+2*256];
+ double R_0_2_0_0 = xpq * R_1_1_0_0 + 1 * gamma_inc[sq_id+1*256];
+ vj_kl += R_0_2_0_0 * dm_ij_cache[sq_ij+1600];
+ double R_1_1_0_1 = xpq * R_2_0_0_1;
+ double R_0_2_0_1 = xpq * R_1_1_0_1 + 1 * R_1_0_0_1;
+ vj_kl += R_0_2_0_1 * dm_ij_cache[sq_ij+1664];
+ double R_1_1_0_2 = xpq * R_2_0_0_2;
+ double R_0_2_0_2 = xpq * R_1_1_0_2 + 1 * R_1_0_0_2;
+ vj_kl += R_0_2_0_2 * dm_ij_cache[sq_ij+1728];
+ double R_1_1_1_0 = xpq * R_2_0_1_0;
+ double R_0_2_1_0 = xpq * R_1_1_1_0 + 1 * R_1_0_1_0;
+ vj_kl += R_0_2_1_0 * dm_ij_cache[sq_ij+1792];
+ double R_1_1_1_1 = xpq * R_2_0_1_1;
+ double R_0_2_1_1 = xpq * R_1_1_1_1 + 1 * R_1_0_1_1;
+ vj_kl += R_0_2_1_1 * dm_ij_cache[sq_ij+1856];
+ double R_1_1_2_0 = xpq * R_2_0_2_0;
+ double R_0_2_2_0 = xpq * R_1_1_2_0 + 1 * R_1_0_2_0;
+ vj_kl += R_0_2_2_0 * dm_ij_cache[sq_ij+1920];
+ double R_2_1_0_0 = xpq * gamma_inc[sq_id+3*256];
+ double R_1_2_0_0 = xpq * R_2_1_0_0 + 1 * gamma_inc[sq_id+2*256];
+ double R_0_3_0_0 = xpq * R_1_2_0_0 + 2 * R_1_1_0_0;
+ vj_kl += R_0_3_0_0 * dm_ij_cache[sq_ij+1984];
+ double R_2_1_0_1 = xpq * R_3_0_0_1;
+ double R_1_2_0_1 = xpq * R_2_1_0_1 + 1 * R_2_0_0_1;
+ double R_0_3_0_1 = xpq * R_1_2_0_1 + 2 * R_1_1_0_1;
+ vj_kl += R_0_3_0_1 * dm_ij_cache[sq_ij+2048];
+ double R_2_1_1_0 = xpq * R_3_0_1_0;
+ double R_1_2_1_0 = xpq * R_2_1_1_0 + 1 * R_2_0_1_0;
+ double R_0_3_1_0 = xpq * R_1_2_1_0 + 2 * R_1_1_1_0;
+ vj_kl += R_0_3_1_0 * dm_ij_cache[sq_ij+2112];
+ double R_3_1_0_0 = xpq * gamma_inc[sq_id+4*256];
+ double R_2_2_0_0 = xpq * R_3_1_0_0 + 1 * gamma_inc[sq_id+3*256];
+ double R_1_3_0_0 = xpq * R_2_2_0_0 + 2 * R_2_1_0_0;
+ double R_0_4_0_0 = xpq * R_1_3_0_0 + 3 * R_1_2_0_0;
+ vj_kl += R_0_4_0_0 * dm_ij_cache[sq_ij+2176];
+ __syncthreads();
+ vj_cache[sq_id] = vj_kl;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (tx < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride];
+ }
+ }
+ __syncthreads();
+ if (tx == 0 && task_kl0+ty < npairs_kl) {
+ vj_kl_cache[sq_kl+0] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += gamma_inc[sq_id+0*256] * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+0] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_0_1 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+64] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_0_2 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+128] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_0_3 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+192] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_0_4 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+256] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_1_0 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+320] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_1_1 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+384] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_1_2 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+448] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_1_3 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+512] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_2_0 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+576] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_2_1 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+640] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_2_2 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+704] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_3_0 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+768] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_3_1 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+832] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_0_4_0 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+896] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_1_0_0 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+960] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_1_0_1 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+1024] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_1_0_2 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+1088] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_1_0_3 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+1152] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_1_1_0 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+1216] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_1_1_1 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+1280] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_1_1_2 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+1344] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_1_2_0 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+1408] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_1_2_1 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+1472] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_1_3_0 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+1536] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_2_0_0 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+1600] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_2_0_1 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+1664] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_2_0_2 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+1728] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_2_1_0 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+1792] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_2_1_1 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+1856] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_2_2_0 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+1920] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_3_0_0 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+1984] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_3_0_1 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+2048] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_3_1_0 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+2112] += vj_cache[sq_id];
+ }
+ vj_ij = 0.;
+ vj_ij += R_0_4_0_0 * dm_kl_cache[sq_kl+0];
+ __syncthreads();
+ vj_cache[sq_id] = vj_ij;
+ for (int stride = 8; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (ty < stride) {
+ vj_cache[sq_id] += vj_cache[sq_id + stride*16];
+ }
+ }
+ __syncthreads();
+ if (ty == 0 && task_ij0+tx < npairs_ij) {
+ vj_ij_cache[sq_ij+2176] += vj_cache[sq_id];
+ }
+ __syncthreads();
+ } }
+ for (int n = ty; n < 140; n += 16) {
+ int i = n / 4;
+ int tile = n % 4;
+ int task_ij = blockIdx.x * 64 + tile * 16 + tx;
+ if (task_ij < npairs_ij) {
+ int pair_ij = pair_ij_mapping[task_ij];
+ int dm_ij_pair0 = dm_pair_loc[pair_ij];
+ int sq_ij = tx + tile * 16;
+ atomicAdd(vj+dm_ij_pair0+i, vj_ij_cache[sq_ij+i*64]);
+ }
+ }
+ for (int n = tx; n < 16; n += 16) {
+ int i = n / 16;
+ int tile = n % 16;
+ int task_kl = blockIdx.y * 256 + tile * 16 + ty;
+ if (task_kl < npairs_kl) {
+ int pair_kl = pair_kl_mapping[task_kl];
+ int dm_kl_pair0 = dm_pair_loc[pair_kl];
+ int sq_kl = ty + tile * 16;
+ atomicAdd(vj+dm_kl_pair0+i, vj_kl_cache[sq_kl+i*256]);
+ }
+ }
+}
+
+int md_j_unrolled(RysIntEnvVars *envs, JKMatrix *jk, BoundsInfo *bounds,
+ int *scheme, int workers, double omega)
+{
+ int li = bounds->li;
+ int lj = bounds->lj;
+ int lk = bounds->lk;
+ int ll = bounds->ll;
+ int lij = li + lj;
+ int lkl = lk + ll;
+ dim3 threads(16, 16);
+ dim3 blocks;
+ int ijkl = lij*9 + lkl;
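+    // pack (lij, lkl) into a single switch key; the factor 9 keeps it unique for lkl <= 8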
+ switch (ijkl) {
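+    // each launch requests the dynamic shared memory registered in set_md_j_unrolled_shm_size()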
+ case 0: // lij=0, lkl=0
+ blocks.x = (bounds->npairs_ij + 255) / 256;
+ blocks.y = (bounds->npairs_kl + 255) / 256;
+        md_j_0_0<<<blocks, threads, 3584*sizeof(double)>>>(*envs, *jk, *bounds); break;
+ case 9: // lij=1, lkl=0
+ blocks.x = (bounds->npairs_ij + 255) / 256;
+ blocks.y = (bounds->npairs_kl + 255) / 256;
+        md_j_1_0<<<blocks, threads, 5376*sizeof(double)>>>(*envs, *jk, *bounds); break;
+ case 10: // lij=1, lkl=1
+ blocks.x = (bounds->npairs_ij + 255) / 256;
+ blocks.y = (bounds->npairs_kl + 255) / 256;
+        md_j_1_1<<<blocks, threads, 7168*sizeof(double)>>>(*envs, *jk, *bounds); break;
+ case 11: // lij=1, lkl=2
+ blocks.x = (bounds->npairs_ij + 255) / 256;
+ blocks.y = (bounds->npairs_kl + 127) / 128;
+        md_j_1_2<<<blocks, threads, 7424*sizeof(double)>>>(*envs, *jk, *bounds); break;
+ case 18: // lij=2, lkl=0
+ blocks.x = (bounds->npairs_ij + 255) / 256;
+ blocks.y = (bounds->npairs_kl + 255) / 256;
+        md_j_2_0<<<blocks, threads, 8704*sizeof(double)>>>(*envs, *jk, *bounds); break;
+ case 19: // lij=2, lkl=1
+ blocks.x = (bounds->npairs_ij + 255) / 256;
+ blocks.y = (bounds->npairs_kl + 127) / 128;
+        md_j_2_1<<<blocks, threads, 8960*sizeof(double)>>>(*envs, *jk, *bounds); break;
+ case 20: // lij=2, lkl=2
+ blocks.x = (bounds->npairs_ij + 255) / 256;
+ blocks.y = (bounds->npairs_kl + 63) / 64;
+        md_j_2_2<<<blocks, threads, 9216*sizeof(double)>>>(*envs, *jk, *bounds); break;
+ case 27: // lij=3, lkl=0
+ blocks.x = (bounds->npairs_ij + 127) / 128;
+ blocks.y = (bounds->npairs_kl + 255) / 256;
+        md_j_3_0<<<blocks, threads, 8448*sizeof(double)>>>(*envs, *jk, *bounds); break;
+ case 28: // lij=3, lkl=1
+ blocks.x = (bounds->npairs_ij + 63) / 64;
+ blocks.y = (bounds->npairs_kl + 255) / 256;
+        md_j_3_1<<<blocks, threads, 7424*sizeof(double)>>>(*envs, *jk, *bounds); break;
+ case 36: // lij=4, lkl=0
+ blocks.x = (bounds->npairs_ij + 63) / 64;
+ blocks.y = (bounds->npairs_kl + 255) / 256;
+        md_j_4_0<<<blocks, threads, 7808*sizeof(double)>>>(*envs, *jk, *bounds); break;
+ default: return 0;
+ }
+ return 1;
+}
+
+void set_md_j_unrolled_shm_size()
+{
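+    // opt these kernels in to more dynamic shared memory than the default 48 KB per block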
+ cudaFuncSetAttribute(md_j_0_0, cudaFuncAttributeMaxDynamicSharedMemorySize, 3584*sizeof(double));
+ cudaFuncSetAttribute(md_j_1_0, cudaFuncAttributeMaxDynamicSharedMemorySize, 5376*sizeof(double));
+ cudaFuncSetAttribute(md_j_1_1, cudaFuncAttributeMaxDynamicSharedMemorySize, 7168*sizeof(double));
+ cudaFuncSetAttribute(md_j_1_2, cudaFuncAttributeMaxDynamicSharedMemorySize, 7424*sizeof(double));
+ cudaFuncSetAttribute(md_j_2_0, cudaFuncAttributeMaxDynamicSharedMemorySize, 8704*sizeof(double));
+ cudaFuncSetAttribute(md_j_2_1, cudaFuncAttributeMaxDynamicSharedMemorySize, 8960*sizeof(double));
+ cudaFuncSetAttribute(md_j_2_2, cudaFuncAttributeMaxDynamicSharedMemorySize, 9216*sizeof(double));
+ cudaFuncSetAttribute(md_j_3_0, cudaFuncAttributeMaxDynamicSharedMemorySize, 8448*sizeof(double));
+ cudaFuncSetAttribute(md_j_3_1, cudaFuncAttributeMaxDynamicSharedMemorySize, 7424*sizeof(double));
+ cudaFuncSetAttribute(md_j_4_0, cudaFuncAttributeMaxDynamicSharedMemorySize, 7808*sizeof(double));
+}
diff --git a/gpu4pyscf/lib/gvhf-rys/gamma_inc.cu b/gpu4pyscf/lib/gvhf-rys/gamma_inc.cu
index 3953c63b..89758bfd 100644
--- a/gpu4pyscf/lib/gvhf-rys/gamma_inc.cu
+++ b/gpu4pyscf/lib/gvhf-rys/gamma_inc.cu
@@ -4,10 +4,8 @@
#define SQRTPIE4 .886226925452758013
__device__
-static void eval_gamma_inc_fn(double *f, double t, int m)
+static void eval_gamma_inc_fn(double *f, double t, int m, int sq_id, int block_size)
{
- int sq_id = threadIdx.x;
- int block_size = blockDim.x;
if (t < EPS_FLOAT64) {
f[sq_id] = 1.;
for (int i = 1; i <= m; i++) {
diff --git a/gpu4pyscf/lib/gvhf-rys/gamma_inc_unrolled.cu b/gpu4pyscf/lib/gvhf-rys/gamma_inc_unrolled.cu
index 17c8c570..88ba3436 100644
--- a/gpu4pyscf/lib/gvhf-rys/gamma_inc_unrolled.cu
+++ b/gpu4pyscf/lib/gvhf-rys/gamma_inc_unrolled.cu
@@ -6,8 +6,8 @@
__device__
static void eval_gamma_inc_fn(double *f, double t, int m)
{
- int t_id = threadIdx.x + blockDim.x * threadIdx.y;
- int block_size = blockDim.x * blockDim.y;
+ int t_id = threadIdx.x + blockDim.x * threadIdx.y + blockDim.x * blockDim.y * threadIdx.z;
+ int block_size = blockDim.x * blockDim.y * blockDim.z;
if (t < EPS_FLOAT64) {
f[t_id] = 1.;
for (int i = 1; i <= m; i++) {
diff --git a/gpu4pyscf/lib/tests/test_cusolver.py b/gpu4pyscf/lib/tests/test_cusolver.py
index e69de29b..0f4941c7 100644
--- a/gpu4pyscf/lib/tests/test_cusolver.py
+++ b/gpu4pyscf/lib/tests/test_cusolver.py
@@ -0,0 +1,64 @@
+# Copyright 2024 The GPU4PySCF Authors. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import unittest
+import numpy as np
+import scipy.linalg
+import cupy as cp
+from gpu4pyscf.lib.cusolver import eigh, cholesky
+
+def test_eigh_real():
+ np.random.seed(6)
+ n = 12
+ a = np.random.rand(n, n)
+ a = a + a.T
+ b = np.random.rand(n, n)
+ b = b.dot(b.T)
+ ref = scipy.linalg.eigh(a, b)
+ e, c = eigh(cp.asarray(a), cp.asarray(b))
+ assert abs(e.get() - ref[0]).max() < 1e-10
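+    # generalized eigenvectors are b-orthonormal, so c^T b c_ref is the identity up to signs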
+ ovlp = c.get().T.dot(b).dot(ref[1])
+ assert abs(abs(ovlp) - np.eye(n)).max() < 1e-10
+
+def test_eigh_cmplx():
+ np.random.seed(6)
+ n = 12
+ a = np.random.rand(n, n) + np.random.rand(n, n) * 1j
+ a = a + a.conj().T
+ b = np.random.rand(n, n) + np.random.rand(n, n) * 1j
+ b = b.dot(b.conj().T)
+ ref = scipy.linalg.eigh(a, b)
+ e, c = eigh(cp.asarray(a), cp.asarray(b))
+ assert abs(e.get() - ref[0]).max() < 1e-10
+ ovlp = c.get().conj().T.dot(b).dot(ref[1])
+ assert abs(abs(ovlp) - np.eye(n)).max() < 1e-10
+
+def test_cholesky_real():
+ np.random.seed(6)
+ n = 12
+ a = np.random.rand(n, n)
+ a = a.dot(a.T)
+ ref = np.linalg.cholesky(a)
+ x = cholesky(cp.asarray(a))
+ assert abs(x.get() - ref).max() < 1e-12
+
+def test_cholesky_cmplx():
+ np.random.seed(6)
+ n = 12
+ a = np.random.rand(n, n) + np.random.rand(n, n) * 1j
+ a = a.dot(a.conj().T)
+ ref = np.linalg.cholesky(a)
+ x = cholesky(cp.asarray(a))
+ assert abs(x.get() - ref).max() < 1e-12
diff --git a/gpu4pyscf/lib/tests/test_cutensor.py b/gpu4pyscf/lib/tests/test_cutensor.py
index ca338331..3e9ef1c4 100644
--- a/gpu4pyscf/lib/tests/test_cutensor.py
+++ b/gpu4pyscf/lib/tests/test_cutensor.py
@@ -38,6 +38,13 @@ def test_contract(self):
c_contract = contract('lkji,jk->il', a, b[10:20,10:20])
assert cupy.linalg.norm(c_einsum - c_contract) < 1e-10
+ def test_complex_valued(self):
+ a = cupy.random.rand(10,9,11) + cupy.random.rand(10,9,11)*1j
+ b = cupy.random.rand(11,7,13) + cupy.random.rand(11,7,13)*1j
+ c_einsum = cupy.einsum('ijk,ikl->jl', a[3:9,:,4:10], b[3:9,:6, 7:13])
+ c_contract = contract('ijk,ikl->jl', a[3:9,:,4:10], b[3:9,:6, 7:13])
+ assert cupy.linalg.norm(c_einsum - c_contract) < 1e-10
+
def test_cache(self):
a = cupy.random.rand(20,20,20,20)
b = cupy.random.rand(20,20)
@@ -52,4 +59,4 @@ def test_cache(self):
if __name__ == "__main__":
print("Full tests for cutensor module")
- unittest.main()
\ No newline at end of file
+ unittest.main()
diff --git a/gpu4pyscf/mp/dfmp2.py b/gpu4pyscf/mp/dfmp2.py
index 753b987c..d8c3b0c2 100644
--- a/gpu4pyscf/mp/dfmp2.py
+++ b/gpu4pyscf/mp/dfmp2.py
@@ -100,8 +100,7 @@ def loop_ao2mo(self, mo_coeff, nocc):
mo_coeff = cupy.asarray(mo_coeff, order='C')
Lov = None
with_df = self.with_df
- ao_idx = with_df.intopt.ao_idx
- mo_coeff = mo_coeff[ao_idx]
+ mo_coeff = with_df.intopt.sort_orbitals(mo_coeff, axis=[0])
orbo = mo_coeff[:,:nocc]
orbv = mo_coeff[:,nocc:]
blksize = with_df.get_blksize()
diff --git a/gpu4pyscf/mp/tests/test_mp2.py b/gpu4pyscf/mp/tests/test_mp2.py
index 43142fd8..9cffad01 100644
--- a/gpu4pyscf/mp/tests/test_mp2.py
+++ b/gpu4pyscf/mp/tests/test_mp2.py
@@ -155,4 +155,4 @@ def test_to_gpu(self):
if __name__ == "__main__":
print("Full Tests for mp2")
- unittest.main()
\ No newline at end of file
+ unittest.main()
diff --git a/gpu4pyscf/pbc/__init__.py b/gpu4pyscf/pbc/__init__.py
new file mode 100644
index 00000000..f7ec6fe8
--- /dev/null
+++ b/gpu4pyscf/pbc/__init__.py
@@ -0,0 +1,19 @@
+#!/usr/bin/env python
+#
+# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+from gpu4pyscf.pbc import scf
+from gpu4pyscf.pbc import dft
diff --git a/gpu4pyscf/pbc/df/__init__.py b/gpu4pyscf/pbc/df/__init__.py
new file mode 100644
index 00000000..6b9e0c3f
--- /dev/null
+++ b/gpu4pyscf/pbc/df/__init__.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python
+#
+# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+from . import fft
+#from . import aft
+#from . import df
+from .fft import FFTDF
+#from .df import DF, GDF
+#from .aft import AFTDF
+
+class DF: pass # Just a placeholder
diff --git a/gpu4pyscf/pbc/df/fft.py b/gpu4pyscf/pbc/df/fft.py
new file mode 100644
index 00000000..f84894ac
--- /dev/null
+++ b/gpu4pyscf/pbc/df/fft.py
@@ -0,0 +1,272 @@
+#!/usr/bin/env python
+#
+# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+'''GPW method'''
+
+import numpy as np
+import cupy as cp
+from pyscf import gto
+from pyscf import lib
+from pyscf.pbc.df import fft as fft_cpu
+from pyscf.pbc.df import aft as aft_cpu
+from pyscf.pbc.df.aft import _check_kpts, ft_ao
+from pyscf.pbc.gto import pseudo
+from pyscf.pbc.lib.kpts_helper import is_zero
+from gpu4pyscf.lib import logger, utils
+from gpu4pyscf.pbc import tools
+from gpu4pyscf.pbc.df import fft_jk
+
+__all__ = [
+ 'get_nuc', 'get_pp', 'get_SI', 'FFTDF'
+]
+
+def get_nuc(mydf, kpts=None):
+ from gpu4pyscf.pbc.dft import numint
+ kpts, is_single_kpt = _check_kpts(mydf, kpts)
+ cell = mydf.cell
+ assert cell.low_dim_ft_type != 'inf_vacuum'
+ assert cell.dimension > 1
+ mesh = mydf.mesh
+ charge = cp.asarray(-cell.atom_charges())
+ Gv = cell.get_Gv(mesh)
+ SI = get_SI(cell, mesh=mesh)
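+    # reciprocal-space density of the nuclear point charges: rho(G) = sum_A (-Z_A) * SI_A(G)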
+ rhoG = charge.dot(SI)
+
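+    # Hartree potential of that charge distribution: v(G) = coulG(G) * rho(G),
+    # with get_coulG supplying the 4*pi/|G|^2 Coulomb kernel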
+ coulG = tools.get_coulG(cell, mesh=mesh, Gv=Gv)
+ vneG = rhoG * coulG
+ vneR = tools.ifft(vneG, mesh).real
+
+ nkpts = len(kpts)
+ nao = cell.nao
+ if is_zero(kpts):
+ vne = cp.zeros((nkpts,nao,nao))
+ else:
+ vne = cp.zeros((nkpts,nao,nao), dtype=np.complex128)
+ kpts = np.asarray(kpts)
+ ao_ks = numint.eval_ao_kpts(cell, mydf.grids.coords, kpts)
+ for k, ao in enumerate(ao_ks):
+ vne[k] += (ao.conj().T*vneR).dot(ao)
+
+ if is_single_kpt:
+ vne = vne[0]
+ return vne
+
+def get_pp(mydf, kpts=None):
+ '''Get the periodic pseudopotential nuc-el AO matrix, with G=0 removed.
+ '''
+ from gpu4pyscf.pbc.dft import numint
+ kpts, is_single_kpt = _check_kpts(mydf, kpts)
+ cell = mydf.cell
+ assert cell.low_dim_ft_type != 'inf_vacuum'
+ assert cell.dimension > 1
+ mesh = mydf.mesh
+ Gv = cell.get_Gv(mesh)
+ SI = get_SI(cell, mesh=mesh)
+ vpplocG = pseudo.get_vlocG(cell, Gv)
+ vpplocG = -np.einsum('ij,ij->j', SI, vpplocG)
+ vpplocG = cp.asarray(vpplocG)
+ # vpploc evaluated in real-space
+ vpplocR = tools.ifft(vpplocG, mesh).real
+
+ ngrids = len(vpplocG)
+ nkpts = len(kpts)
+ nao = cell.nao
+ if is_zero(kpts):
+ vpp = cp.zeros((nkpts,nao,nao))
+ else:
+ vpp = cp.zeros((nkpts,nao,nao), dtype=np.complex128)
+ kpts = np.asarray(kpts)
+ ao_ks = numint.eval_ao_kpts(cell, mydf.grids.coords, kpts)
+ for k, ao in enumerate(ao_ks):
+ vpp[k] += (ao.conj().T*vpplocR).dot(ao)
+
+ # vppnonloc evaluated in reciprocal space
+ fakemol = gto.Mole()
+ fakemol._atm = np.zeros((1,gto.ATM_SLOTS), dtype=np.int32)
+ fakemol._bas = np.zeros((1,gto.BAS_SLOTS), dtype=np.int32)
+ ptr = gto.PTR_ENV_START
+ fakemol._env = np.zeros(ptr+10)
+ fakemol._bas[0,gto.NPRIM_OF ] = 1
+ fakemol._bas[0,gto.NCTR_OF ] = 1
+ fakemol._bas[0,gto.PTR_EXP ] = ptr+3
+ fakemol._bas[0,gto.PTR_COEFF] = ptr+4
+
+    # buf for SPG_lmi up to l=0..3 and nl=3
+ buf = np.empty((48,ngrids), dtype=np.complex128)
+ def vppnl_by_k(kpt):
+ Gk = Gv + kpt
+ G_rad = lib.norm(Gk, axis=1)
+ aokG = ft_ao.ft_ao(cell, Gv, kpt=kpt) * (1/cell.vol)**.5
+ vppnl = 0
+ for ia in range(cell.natm):
+ symb = cell.atom_symbol(ia)
+ if symb not in cell._pseudo:
+ continue
+ pp = cell._pseudo[symb]
+ p1 = 0
+ for l, proj in enumerate(pp[5:]):
+ rl, nl, hl = proj
+ if nl > 0:
+ fakemol._bas[0,gto.ANG_OF] = l
+ fakemol._env[ptr+3] = .5*rl**2
+ fakemol._env[ptr+4] = rl**(l+1.5)*np.pi**1.25
+ pYlm_part = fakemol.eval_gto('GTOval', Gk)
+
+ p0, p1 = p1, p1+nl*(l*2+1)
+ # pYlm is real, SI[ia] is complex
+ pYlm = np.ndarray((nl,l*2+1,ngrids), dtype=np.complex128, buffer=buf[p0:p1])
+ for k in range(nl):
+ qkl = pseudo.pp._qli(G_rad*rl, l, k)
+ pYlm[k] = pYlm_part.T * qkl
+ #:SPG_lmi = np.einsum('g,nmg->nmg', SI[ia].conj(), pYlm)
+ #:SPG_lm_aoG = np.einsum('nmg,gp->nmp', SPG_lmi, aokG)
+ #:tmp = np.einsum('ij,jmp->imp', hl, SPG_lm_aoG)
+ #:vppnl += np.einsum('imp,imq->pq', SPG_lm_aoG.conj(), tmp)
+ if p1 > 0:
+ SPG_lmi = buf[:p1]
+ SPG_lmi *= SI[ia].conj()
+ SPG_lm_aoGs = lib.zdot(SPG_lmi, aokG)
+ p1 = 0
+ for l, proj in enumerate(pp[5:]):
+ rl, nl, hl = proj
+ if nl > 0:
+ p0, p1 = p1, p1+nl*(l*2+1)
+ hl = np.asarray(hl)
+ SPG_lm_aoG = SPG_lm_aoGs[p0:p1].reshape(nl,l*2+1,-1)
+ tmp = np.einsum('ij,jmp->imp', hl, SPG_lm_aoG)
+ vppnl += np.einsum('imp,imq->pq', SPG_lm_aoG.conj(), tmp)
+ return vppnl * (1./cell.vol)
+
+ for k, kpt in enumerate(kpts):
+ vppnl = vppnl_by_k(kpt)
+ if is_zero(kpt):
+ vpp[k] += cp.asarray(vppnl.real)
+ else:
+ vpp[k] += cp.asarray(vppnl)
+
+ if is_single_kpt:
+ vpp = vpp[0]
+ return vpp
+
+def get_SI(cell, Gv=None, mesh=None, atmlst=None):
+ '''Calculate the structure factor (0D, 1D, 2D, 3D) for all atoms; see MH (3.34).
+
+ Args:
+ cell : instance of :class:`Cell`
+
+ Gv : (N,3) array
+ G vectors
+
+ atmlst : list of ints, optional
+ Indices of atoms for which the structure factors are computed.
+
+ Returns:
+ SI : (natm, ngrids) ndarray, dtype=np.complex128
+ The structure factor for each atom at each G-vector.
+ '''
+ coords = cp.asarray(cell.atom_coords())
+ if atmlst is not None:
+ coords = coords[np.asarray(atmlst)]
+ if Gv is None:
+ if mesh is None:
+ mesh = cell.mesh
+ basex, basey, basez = cell.get_Gv_weights(mesh)[1]
+ basex = cp.asarray(basex)
+ basey = cp.asarray(basey)
+ basez = cp.asarray(basez)
+ b = cp.asarray(cell.reciprocal_vectors())
+ rb = coords.dot(b.T)
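+        # exp(-i G.r) factorizes along the three reciprocal axes on a full uniform mesh,
+        # so the (natm, ngrids) structure factor is assembled from three 1D factors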
+ SIx = cp.exp(-1j*rb[:,0,None] * basex)
+ SIy = cp.exp(-1j*rb[:,1,None] * basey)
+ SIz = cp.exp(-1j*rb[:,2,None] * basez)
+ SI = SIx[:,:,None,None] * SIy[:,None,:,None] * SIz[:,None,None,:]
+ natm = coords.shape[0]
+ SI = SI.reshape(natm, -1)
+ else:
+ SI = cp.exp(-1j*coords.dot(cp.asarray(Gv).T))
+ return SI
+
+
+class FFTDF(lib.StreamObject):
+ '''Density expansion on plane waves (GPW method)
+ '''
+
+ blockdim = 240
+
+ _keys = fft_cpu.FFTDF._keys
+
+ def __init__(self, cell, kpts=np.zeros((1,3))):
+ from gpu4pyscf.pbc.dft import gen_grid
+ from gpu4pyscf.pbc.dft import numint
+ self.cell = cell
+ self.stdout = cell.stdout
+ self.verbose = cell.verbose
+ self.max_memory = cell.max_memory
+ self.kpts = kpts
+ self.grids = gen_grid.UniformGrids(cell)
+
+ # The following attributes are not input options.
+        # self.exxdiv has no effect here; it is set in the get_k_kpts function
+        # to mimic the KRHF/KUHF object in the call to tools.get_coulG.
+ self.exxdiv = None
+ self._numint = numint.KNumInt()
+ self._rsh_df = {} # Range separated Coulomb DF objects
+
+ mesh = fft_cpu.FFTDF.mesh
+ dump_flags = fft_cpu.FFTDF.dump_flags
+ check_sanity = fft_cpu.FFTDF.check_sanity
+ build = fft_cpu.FFTDF.build
+ reset = fft_cpu.FFTDF.reset
+
+ aoR_loop = NotImplemented
+
+ get_pp = get_pp
+ get_nuc = get_nuc
+
+ def get_jk(self, dm, hermi=1, kpts=None, kpts_band=None,
+ with_j=True, with_k=True, omega=None, exxdiv=None):
+ if omega is not None: # J/K for RSH functionals
+ with self.range_coulomb(omega) as rsh_df:
+ return rsh_df.get_jk(dm, hermi, kpts, kpts_band, with_j, with_k,
+ omega=None, exxdiv=exxdiv)
+
+ kpts, is_single_kpt = _check_kpts(self, kpts)
+ if is_single_kpt:
+ vj, vk = fft_jk.get_jk(self, dm, hermi, kpts[0], kpts_band,
+ with_j, with_k, exxdiv)
+ else:
+ vj = vk = None
+ if with_k:
+ vk = fft_jk.get_k_kpts(self, dm, hermi, kpts, kpts_band, exxdiv)
+ if with_j:
+ vj = fft_jk.get_j_kpts(self, dm, hermi, kpts, kpts_band)
+ return vj, vk
+
+ get_eri = get_ao_eri = NotImplemented
+ ao2mo = get_mo_eri = NotImplemented
+ ao2mo_7d = NotImplemented
+ get_ao_pairs_G = get_ao_pairs = NotImplemented
+ get_mo_pairs_G = get_mo_pairs = NotImplemented
+
+ range_coulomb = aft_cpu.AFTDF.range_coulomb
+
+ to_gpu = utils.to_gpu
+ device = utils.device
+
+ def to_cpu(self):
+ obj = utils.to_cpu(self)
+ return obj.reset()
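
A minimal usage sketch for the FFTDF class added above (illustrative, not part of the
patch): the cell settings are ordinary pyscf.pbc inputs, and the identity density
matrix is only a stand-in to exercise the get_jk entry point.

    import numpy as np
    import cupy as cp
    from pyscf.pbc import gto as pbcgto
    from gpu4pyscf.pbc.df.fft import FFTDF

    cell = pbcgto.Cell()
    cell.atom = 'He 0 0 0'
    cell.a = np.eye(3) * 3.0          # cubic lattice vectors
    cell.basis = 'gth-szv'
    cell.pseudo = 'gth-pade'
    cell.build()

    df = FFTDF(cell)
    vpp = df.get_pp()                 # GPW pseudopotential matrix at Gamma, as a cupy array
    dm = cp.eye(cell.nao)             # stand-in density matrix
    vj, vk = df.get_jk(dm)            # single-k-point path through fft_jk.get_jk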
diff --git a/gpu4pyscf/pbc/df/fft_jk.py b/gpu4pyscf/pbc/df/fft_jk.py
new file mode 100644
index 00000000..31e9a5d7
--- /dev/null
+++ b/gpu4pyscf/pbc/df/fft_jk.py
@@ -0,0 +1,346 @@
+#!/usr/bin/env python
+#
+# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+'''
+JK with GPW
+'''
+
+import numpy as np
+import cupy as cp
+from pyscf import lib
+from pyscf.pbc.lib.kpts_helper import is_zero, member
+from pyscf.pbc.df.df_jk import _format_dms, _format_kpts_band, _format_jks
+from gpu4pyscf.lib import logger
+from gpu4pyscf.lib.cupy_helper import contract
+from gpu4pyscf.pbc import tools
+
+__all__ = [
+ 'get_j_kpts', 'get_k_kpts', 'get_jk', 'get_j', 'get_k',
+ 'get_j_e1_kpts', 'get_k_e1_kpts'
+]
+
+def get_j_kpts(mydf, dm_kpts, hermi=1, kpts=np.zeros((1,3)), kpts_band=None):
+ '''Get the Coulomb (J) AO matrix at sampled k-points.
+
+ Args:
+ dm_kpts : (nkpts, nao, nao) ndarray or a list of (nkpts,nao,nao) ndarray
+            Density matrix at each k-point. If a list of k-point DMs, e.g.,
+            UHF alpha and beta DMs, the alpha and beta DMs are contracted
+ separately.
+ kpts : (nkpts, 3) ndarray
+
+ Kwargs:
+ kpts_band : (3,) ndarray or (*,3) ndarray
+ A list of arbitrary "band" k-points at which to evaluate the matrix.
+
+ Returns:
+ vj : (nkpts, nao, nao) ndarray
+ or list of vj if the input dm_kpts is a list of DMs
+ '''
+ cell = mydf.cell
+ mesh = mydf.mesh
+ assert cell.low_dim_ft_type != 'inf_vacuum'
+ assert cell.dimension > 1
+
+ ni = mydf._numint
+ dm_kpts = cp.asarray(dm_kpts, order='C')
+ dms = _format_dms(dm_kpts, kpts)
+ nset, nkpts, nao = dms.shape[:3]
+
+ coulG = tools.get_coulG(cell, mesh=mesh)
+ ngrids = len(coulG)
+
+ if hermi == 1 or is_zero(kpts):
+ vR = cp.zeros((nset,ngrids))
+ ao_ks = ni.eval_ao(cell, mydf.grids.coords, kpts)
+ for i in range(nset):
+ rhoR = ni.eval_rho(cell, ao_ks, dms[i], hermi=hermi).real
+ rhoG = tools.fft(rhoR, mesh)
+ vG = coulG * rhoG
+ vR[i] = tools.ifft(vG, mesh).real
+ else:
+ vR = cp.zeros((nset,ngrids), dtype=np.complex128)
+ ao_ks = ni.eval_ao(cell, mydf.grids.coords, kpts)
+ for i in range(nset):
+ rhoR = ni.eval_rho(cell, ao_ks, dms[i], hermi=hermi)
+ rhoG = tools.fft(rhoR, mesh)
+ vG = coulG * rhoG
+ vR[i] = tools.ifft(vG, mesh)
+
+ vR *= cell.vol / ngrids
+ kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band
+ nband = len(kpts_band)
+ if is_zero(kpts_band):
+ vj_kpts = cp.zeros((nset,nband,nao,nao))
+ else:
+ vj_kpts = cp.zeros((nset,nband,nao,nao), dtype=np.complex128)
+
+ if input_band is not None:
+ ao_ks = ni.eval_ao(cell, mydf.grids.coords, kpts_band)
+ for k, ao in enumerate(ao_ks):
+ for i in range(nset):
+ aow = ao * vR[i,:,None]
+ vj_kpts[i,k] += ao.conj().T.dot(aow)
+
+ return _format_jks(vj_kpts, dm_kpts, input_band, kpts)
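+
+# The J build above follows the standard GPW recipe (a sketch; coulG includes
+# the cell-dimension-dependent G=0 treatment):
+#   rho(r)   = (1/nkpts) sum_k sum_ij dm[k][i,j] ao_i(r) ao_j*(r)
+#   rho(G)   = FFT[rho(r)]
+#   v(G)     = coulG(G) * rho(G),   coulG = 4*pi/|G|^2 for 3D cells
+#   v(r)     = IFFT[v(G)]
+#   vj[k]_ij = (vol/ngrids) sum_r ao_i*(r) v(r) ao_j(r)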
+
+def get_k_kpts(mydf, dm_kpts, hermi=1, kpts=np.zeros((1,3)), kpts_band=None,
+ exxdiv=None):
+ '''Get the exchange (K) AO matrices at sampled k-points.
+
+ Args:
+ dm_kpts : (nkpts, nao, nao) ndarray
+ Density matrix at each k-point
+ kpts : (nkpts, 3) ndarray
+
+ Kwargs:
+ hermi : int
+ Whether K matrix is hermitian
+
+ | 0 : not hermitian and not symmetric
+ | 1 : hermitian
+
+ kpts_band : (3,) ndarray or (*,3) ndarray
+ A list of arbitrary "band" k-points at which to evaluate the matrix.
+
+ Returns:
+ vk : (nkpts, nao, nao) ndarray
+ or list of vk if the input dm_kpts is a list of DMs
+ '''
+ cell = mydf.cell
+ mesh = mydf.mesh
+ assert cell.low_dim_ft_type != 'inf_vacuum'
+ assert cell.dimension > 1
+ coords = mydf.grids.coords
+ ngrids = coords.shape[0]
+
+ if getattr(dm_kpts, 'mo_coeff', None) is not None:
+ mo_coeff = dm_kpts.mo_coeff
+ mo_occ = dm_kpts.mo_occ
+ else:
+ mo_coeff = None
+
+ ni = mydf._numint
+ kpts = np.asarray(kpts)
+ dm_kpts = cp.asarray(dm_kpts, order='C')
+ dms = _format_dms(dm_kpts, kpts)
+ nset, nkpts, nao = dms.shape[:3]
+
+ weight = 1./nkpts * (cell.vol/ngrids)
+
+ kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band
+ nband = len(kpts_band)
+
+ if is_zero(kpts_band) and is_zero(kpts):
+ vk_kpts = cp.zeros((nset,nband,nao,nao), dtype=dms.dtype)
+ else:
+ vk_kpts = cp.zeros((nset,nband,nao,nao), dtype=np.complex128)
+
+ ao2_kpts = ni.eval_ao(cell, coords, kpts=kpts)
+ if input_band is None:
+ ao1_kpts = ao2_kpts
+ else:
+ ao1_kpts = ni.eval_ao(cell, coords, kpts=kpts_band)
+
+ if mo_coeff is not None and nset == 1:
+ mo2_kpts = [
+ ao.dot(mo[:,occ>0] * occ[occ>0]**.5)
+ for occ, mo, ao in zip(mo_occ, mo_coeff, ao2_kpts)]
+ ao2_kpts = mo2_kpts
+ else:
+ mo2_kpts = None
+
+ vR_dm = cp.empty((nset,nao,ngrids), dtype=vk_kpts.dtype)
+ blksize = 32
+
+ for k2, ao2 in enumerate(ao2_kpts):
+ ao2T = ao2.T
+ kpt2 = kpts[k2]
+ naoj = ao2.shape[1]
+ if mo2_kpts is None:
+ ao_dms = [dms[i,k2].dot(ao2T.conj()) for i in range(nset)]
+ else:
+ ao_dms = [ao2T.conj()]
+
+ for k1, ao1 in enumerate(ao1_kpts):
+ ao1T = ao1.T
+ kpt1 = kpts_band[k1]
+
+ # If we have an ewald exxdiv, we add the G=0 correction near the
+ # end of the function to bypass any discretization errors
+ # that arise from the FFT.
+ if exxdiv == 'ewald':
+ coulG = tools.get_coulG(cell, kpt2-kpt1, False, mydf, mesh)
+ else:
+ coulG = tools.get_coulG(cell, kpt2-kpt1, exxdiv, mydf, mesh)
+ if is_zero(kpt1-kpt2):
+ expmikr = cp.array(1.)
+ else:
+ expmikr = cp.exp(-1j * coords.dot(cp.asarray(kpt2-kpt1)))
+
+ for p0, p1 in lib.prange(0, nao, blksize):
+ rho1 = contract('ig,jg->ijg', ao1T[p0:p1].conj()*expmikr, ao2T)
+ vG = tools.fft(rho1.reshape(-1,ngrids), mesh)
+ rho1 = None
+ vG *= coulG
+ vR = tools.ifft(vG, mesh).reshape(p1-p0,naoj,ngrids)
+ vG = None
+ if vk_kpts.dtype == np.double:
+ vR = vR.real
+ for i in range(nset):
+ vR_dm[i,p0:p1] = contract('ijg,jg->ig', vR, ao_dms[i])
+ vR = None
+ vR_dm *= expmikr.conj()
+
+ for i in range(nset):
+ vk_kpts[i,k1] += weight * vR_dm[i].dot(ao1)
+
+ # _ewald_exxdiv_for_G0 adds the G=0 component back to vk_kpts. Note that
+ # in the _ewald_exxdiv_for_G0 implementation the G=0 treatments differ
+ # between 1D/2D and 3D systems. The special 1D/2D treatments can only be
+ # used with the AFTDF/GDF/MDF methods; with FFTDF, 1D, 2D and 3D systems
+ # all use the Ewald probe charge correction.
+ if exxdiv == 'ewald':
+ vk_kpts = _ewald_exxdiv_for_G0(cell, kpts, dms, vk_kpts, kpts_band=kpts_band)
+
+ return _format_jks(vk_kpts, dm_kpts, input_band, kpts)
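+
+# Sketch of the exchange algorithm above: for every k-point pair (k1, k2) the
+# AO pair densities rho_ij(r) = ao_i*(r) e^{-i(k2-k1)r} ao_j(r) are convolved
+# with the Coulomb kernel on the FFT mesh; the first AO index is blocked
+# (blksize rows at a time) to keep the (blksize, nao, ngrids) intermediates
+# within GPU memory.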
+
+def get_jk(mydf, dm, hermi=1, kpt=np.zeros(3), kpts_band=None,
+ with_j=True, with_k=True, exxdiv=None):
+ '''Get the Coulomb (J) and exchange (K) AO matrices for the given density matrix.
+
+ Args:
+ dm : ndarray or list of ndarrays
+ A density matrix or a list of density matrices
+
+ Kwargs:
+ hermi : int
+ Whether the J, K matrices are hermitian
+ | 0 : not hermitian and not symmetric
+ | 1 : hermitian
+ | 2 : anti-hermitian
+ kpt : (3,) ndarray
+ The "inner" dummy k-point at which the DM was evaluated (or
+ sampled).
+ kpts_band : (3,) ndarray or (*,3) ndarray
+ The "outer" primary k-point at which J and K are evaluated.
+
+ Returns:
+ The function returns one J and one K matrix, corresponding to the input
+ density matrix (both order and shape).
+ '''
+ dm = cp.asarray(dm, order='C')
+ vj = vk = None
+ if with_j:
+ vj = get_j(mydf, dm, hermi, kpt, kpts_band)
+ if with_k:
+ vk = get_k(mydf, dm, hermi, kpt, kpts_band, exxdiv)
+ return vj, vk
+
+def get_j(mydf, dm, hermi=1, kpt=np.zeros(3), kpts_band=None):
+ '''Get the Coulomb (J) AO matrix for the given density matrix.
+
+ Args:
+ dm : ndarray or list of ndarrays
+ A density matrix or a list of density matrices
+
+ Kwargs:
+ hermi : int
+ Whether the J matrix is hermitian
+ | 0 : not hermitian and not symmetric
+ | 1 : hermitian
+ | 2 : anti-hermitian
+ kpt : (3,) ndarray
+ The "inner" dummy k-point at which the DM was evaluated (or
+ sampled).
+ kpts_band : (3,) ndarray or (*,3) ndarray
+ The "outer" primary k-point at which J and K are evaluated.
+
+ Returns:
+ The function returns one J matrix, corresponding to the input
+ density matrix (both order and shape).
+ '''
+ dm = cp.asarray(dm, order='C')
+ nao = dm.shape[-1]
+ dm_kpts = dm.reshape(-1,1,nao,nao)
+ vj = get_j_kpts(mydf, dm_kpts, hermi, kpt.reshape(1,3), kpts_band)
+ if kpts_band is None:
+ vj = vj[:,0,:,:]
+ if dm.ndim == 2:
+ vj = vj[0]
+ return vj
+
+
+def get_k(mydf, dm, hermi=1, kpt=np.zeros(3), kpts_band=None, exxdiv=None):
+ '''Get the exchange (K) AO matrix for the given density matrix.
+
+ Args:
+ dm : ndarray or list of ndarrays
+ A density matrix or a list of density matrices
+
+ Kwargs:
+ hermi : int
+ Whether the K matrix is hermitian
+ | 0 : not hermitian and not symmetric
+ | 1 : hermitian
+ | 2 : anti-hermitian
+ kpt : (3,) ndarray
+ The "inner" dummy k-point at which the DM was evaluated (or
+ sampled).
+ kpts_band : (3,) ndarray or (*,3) ndarray
+ The "outer" primary k-point at which J and K are evaluated.
+
+ Returns:
+ The function returns one K matrix, corresponding to the input
+ density matrix (both order and shape).
+ '''
+ dm = cp.asarray(dm, order='C')
+ nao = dm.shape[-1]
+ dm_kpts = dm.reshape(-1,1,nao,nao)
+ vk = get_k_kpts(mydf, dm_kpts, hermi, kpt.reshape(1,3), kpts_band, exxdiv)
+ if kpts_band is None:
+ vk = vk[:,0,:,:]
+ if dm.ndim == 2:
+ vk = vk[0]
+ return vk
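+
+# The single k-point wrappers above simply promote dm to the (nset, 1, nao,
+# nao) k-point layout expected by the k-point kernels and strip the k axis
+# from the result again.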
+
+get_j_e1_kpts = NotImplemented
+get_k_e1_kpts = NotImplemented
+
+def _ewald_exxdiv_for_G0(cell, kpts, dms, vk, kpts_band=None):
+ from pyscf.pbc.tools.pbc import madelung
+ s = cp.asarray(cell.pbc_intor('int1e_ovlp', hermi=1, kpts=kpts))
+ m = madelung(cell, kpts)
+ if kpts is None:
+ for i,dm in enumerate(dms):
+ vk[i] += m * s.dot(dm).dot(s)
+ elif np.shape(kpts) == (3,):
+ if kpts_band is None or is_zero(kpts_band-kpts):
+ for i,dm in enumerate(dms):
+ vk[i] += m * s.dot(dm).dot(s)
+
+ elif kpts_band is None or np.array_equal(kpts, kpts_band):
+ for k in range(len(kpts)):
+ for i,dm in enumerate(dms):
+ vk[i,k] += m * s[k].dot(dm[k]).dot(s[k])
+ else:
+ for k, kpt in enumerate(kpts):
+ for kp in member(kpt, kpts_band.reshape(-1,3)):
+ for i,dm in enumerate(dms):
+ vk[i,kp] += m * s[k].dot(dm[k]).dot(s[k])
+ return vk
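+
+# The correction applied above amounts to, for every k-point,
+#   vk[k] += madelung(cell, kpts) * S[k] . dm[k] . S[k]
+# i.e. the omitted G=0 exchange contribution is restored through the Madelung
+# constant of the Ewald probe charge.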
diff --git a/gpu4pyscf/pbc/dft/__init__.py b/gpu4pyscf/pbc/dft/__init__.py
new file mode 100644
index 00000000..1de0a907
--- /dev/null
+++ b/gpu4pyscf/pbc/dft/__init__.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python
+#
+# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+'''Kohn-Sham DFT for periodic systems
+'''
+
+from .gen_grid import UniformGrids, BeckeGrids
+from . import rks
+#from . import uks
+#from . import krks
+#from . import kuks
+from .rks import KohnShamDFT
+
+RKS = rks.RKS
+#UKS = uks.UKS
+#KRKS = krks.KRKS
+#KUKS = kuks.KUKS
diff --git a/gpu4pyscf/pbc/dft/gen_grid.py b/gpu4pyscf/pbc/dft/gen_grid.py
new file mode 100644
index 00000000..af1c40b4
--- /dev/null
+++ b/gpu4pyscf/pbc/dft/gen_grid.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python
+#
+# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import ctypes
+import numpy as np
+import cupy as cp
+from pyscf import lib
+from pyscf.lib import logger
+from pyscf.pbc.dft import gen_grid as gen_grid_cpu
+from pyscf.pbc.gto.cell import get_uniform_grids
+from gpu4pyscf.lib import utils
+
+class UniformGrids(lib.StreamObject):
+ '''Uniform Grid class.'''
+
+ def __init__(self, cell):
+ self.cell = cell
+ self.stdout = cell.stdout
+ self.verbose = cell.verbose
+ self.mesh = cell.mesh
+ self.non0tab = None
+ self._coords = None
+ self._weights = None
+
+ @property
+ def coords(self):
+ if self._coords is not None:
+ return self._coords
+ else:
+ return cp.asarray(get_uniform_grids(self.cell, self.mesh))
+ @coords.setter
+ def coords(self, x):
+ self._coords = x
+
+ @property
+ def weights(self):
+ if self._weights is not None:
+ return self._weights
+ else:
+ ngrids = np.prod(self.mesh)
+ weights = cp.empty(ngrids)
+ weights[:] = self.cell.vol / ngrids
+ return weights
+ @weights.setter
+ def weights(self, x):
+ self._weights = x
+
+ @property
+ def size(self):
+ return np.prod(self.mesh)
+
+ reset = gen_grid_cpu.UniformGrids.reset
+ build = gen_grid_cpu.UniformGrids.build
+ dump_flags = gen_grid_cpu.UniformGrids.dump_flags
+ kernel = gen_grid_cpu.UniformGrids.kernel
+
+ to_gpu = utils.to_gpu
+ device = utils.device
+
+ def to_cpu(self):
+ obj = utils.to_cpu(self)
+ return obj.reset()
+
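+# Usage sketch: uniform grids are fully determined by cell.mesh, so every
+# grid point carries the same weight cell.vol/ngrids.
+#
+#   grids = UniformGrids(cell)   # cell: a built pbcgto.Cell
+#   assert grids.size == np.prod(cell.mesh)
+#   r = grids.coords             # (ngrids, 3) cupy array
+#   w = grids.weights            # constant, cell.vol / grids.size
+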
+class BeckeGrids:
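+ # Atom-centered Becke grids for PBC are not implemented on the GPU yet;
+ # this placeholder keeps isinstance checks (e.g. in rks.initialize_grids)
+ # working.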
+ pass
diff --git a/gpu4pyscf/pbc/dft/numint.py b/gpu4pyscf/pbc/dft/numint.py
new file mode 100644
index 00000000..7ecf6202
--- /dev/null
+++ b/gpu4pyscf/pbc/dft/numint.py
@@ -0,0 +1,433 @@
+#!/usr/bin/env python
+#
+# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import numpy as np
+import cupy as cp
+from pyscf import lib
+from pyscf.pbc.lib.kpts_helper import is_zero
+from pyscf.pbc.df.df_jk import _format_dms, _format_kpts_band, _format_jks
+from pyscf.pbc.dft import numint as numint_cpu
+from pyscf.dft.gen_grid import CUTOFF
+from pyscf.pbc.lib.kpts import KPoints
+from gpu4pyscf.dft import numint
+from gpu4pyscf.lib.cupy_helper import transpose_sum, contract, get_avail_mem
+from gpu4pyscf.lib import utils
+
+MIN_BLK_SIZE = numint.MIN_BLK_SIZE
+ALIGNED = numint.ALIGNED
+
+def eval_ao(cell, coords, kpt=np.zeros(3), deriv=0, relativity=0, shls_slice=None,
+ non0tab=None, cutoff=None, out=None, verbose=None):
+ '''Collocate AO crystal orbitals (opt. gradients) on the real-space grid.
+
+ Args:
+ cell : instance of :class:`Cell`
+
+ coords : (nx*ny*nz, 3) ndarray
+ The real-space grid point coordinates.
+
+ Kwargs:
+ kpt : (3,) ndarray
+ The k-point corresponding to the crystal AO.
+ deriv : int
+ AO derivative order. It affects the shape of the return array.
+ If deriv=0, the returned AO values are stored in a (N,nao) array.
+ Otherwise the AO values are stored in an array of shape (M,N,nao).
+ Here N is the number of grids, nao is the number of AO functions,
+ M is the size associated to the derivative deriv.
+
+ Returns:
+ aoR : ([4,] nx*ny*nz, nao=cell.nao_nr()) ndarray
+ The value of the AO crystal orbitals on the real-space grid by default.
+ If deriv=1, also contains the value of the orbitals gradient in the
+ x, y, and z directions. It can be either complex or float array,
+ depending on the kpt argument. If kpt is not given (gamma point),
+ aoR is a float array.
+ '''
+ ao_kpts = eval_ao_kpts(cell, coords, np.reshape(kpt, (-1,3)), deriv)
+ return ao_kpts[0]
+
+def eval_ao_kpts(cell, coords, kpts=None, deriv=0, relativity=0,
+ shls_slice=None, non0tab=None, cutoff=None, out=None, verbose=None):
+ '''
+ Returns:
+ ao_kpts: (nkpts, [comp], ngrids, nao) ndarray
+ AO values at each k-point
+ '''
+ return [cp.asarray(ao) for ao in numint_cpu.eval_ao_kpts(cell, coords.get(), kpts, deriv)]
+
+
+def eval_rho(cell, ao, dm, non0tab=None, xctype='LDA', hermi=0, with_lapl=False,
+ verbose=None):
+ '''Collocate the density (opt. gradients) on the real-space grid.
+
+ Args:
+ cell : instance of :class:`Mole` or :class:`Cell`
+
+ ao : ([4,] nx*ny*nz, nao=cell.nao_nr()) ndarray
+ The value of the AO crystal orbitals on the real-space grid by default.
+ If xctype='GGA', also contains the value of the gradient in the x, y,
+ and z directions.
+
+ Returns:
+ rho : ([4,] nx*ny*nz) ndarray
+ The value of the density on the real-space grid. If xctype='GGA',
+ also contains the value of the gradient in the x, y, and z
+ directions.
+
+ See Also:
+ pyscf.dft.numint.eval_rho
+
+ '''
+ if np.iscomplexobj(ao) or np.iscomplexobj(dm):
+ ngrids, nao = ao.shape[-2:]
+ ao_loc = cell.ao_loc_nr()
+ assert nao == ao_loc[-1]
+ dm = cp.asarray(dm, dtype=np.complex128)
+
+ if hermi == 1:
+ def dot_bra(bra, aodm):
+ rho = contract('pi,pi->p', bra.real, aodm.real)
+ rho += contract('pi,pi->p', bra.imag, aodm.imag)
+ return rho
+ dtype = np.float64
+ else:
+ def dot_bra(bra, aodm):
+ return contract('pi,pi->p', bra.conj(), aodm)
+ dtype = np.complex128
+
+ if xctype == 'LDA' or xctype == 'HF':
+ c0 = ao.dot(dm)
+ rho = dot_bra(ao, c0)
+
+ elif xctype == 'GGA':
+ rho = cp.empty((4,ngrids), dtype=dtype)
+ c0 = ao[0].dot(dm)
+ rho[0] = dot_bra(ao[0], c0)
+ for i in range(1, 4):
+ rho[i] = dot_bra(ao[i], c0)
+ if hermi == 1:
+ rho[1:4] *= 2
+ else:
+ c1 = ao[0].dot(dm.conj().T)
+ for i in range(1, 4):
+ rho[i] += dot_bra(c1, ao[i])
+
+ else: # MGGA
+ assert not with_lapl
+ rho = cp.empty((5,ngrids), dtype=dtype)
+ tau_idx = 4
+ c0 = ao[0].dot(dm)
+ rho[0] = dot_bra(ao[0], c0)
+ rho[tau_idx] = 0
+ for i in range(1, 4):
+ c1 = ao[i].dot(dm)
+ rho[tau_idx] += dot_bra(ao[i], c1)
+ rho[i] = dot_bra(ao[i], c0)
+ if hermi == 1:
+ rho[i] *= 2
+ else:
+ rho[i] += dot_bra(ao[0], c1)
+ rho[tau_idx] *= .5
+ else:
+ # real orbitals and real DM
+ # TODO: call numint.eval_rho. However, the structure of ao is not compatible
+ # rho = numint.eval_rho(cell, ao, dm, non0tab, xctype, hermi, with_lapl, verbose)
+ ngrids, nao = ao.shape[-2:]
+ ao_loc = cell.ao_loc_nr()
+ assert nao == ao_loc[-1]
+
+ def dot_bra(bra, aodm):
+ return contract('pi,pi->p', bra, aodm)
+
+ if xctype == 'LDA' or xctype == 'HF':
+ c0 = ao.dot(dm)
+ rho = dot_bra(ao, c0)
+
+ elif xctype == 'GGA':
+ rho = cp.empty((4,ngrids))
+ c0 = ao[0].dot(dm)
+ rho[0] = dot_bra(ao[0], c0)
+ for i in range(1, 4):
+ rho[i] = dot_bra(ao[i], c0)
+ if hermi == 1:
+ rho[1:4] *= 2
+ else:
+ c1 = ao[0].dot(dm.T)
+ for i in range(1, 4):
+ rho[i] += dot_bra(c1, ao[i])
+
+ else: # MGGA
+ assert not with_lapl
+ rho = cp.empty((5,ngrids))
+ tau_idx = 4
+ c0 = ao[0].dot(dm)
+ rho[0] = dot_bra(ao[0], c0)
+ rho[tau_idx] = 0
+ for i in range(1, 4):
+ c1 = ao[i].dot(dm)
+ rho[tau_idx] += dot_bra(ao[i], c1)
+ rho[i] = dot_bra(ao[i], c0)
+ if hermi == 1:
+ rho[i] *= 2
+ else:
+ rho[i] += dot_bra(ao[0], c1)
+ rho[tau_idx] *= .5
+ return rho
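+
+# Typical collocation pipeline (sketch, gamma point, LDA):
+#
+#   ao = eval_ao(cell, grids.coords)        # (ngrids, nao)
+#   rho = eval_rho(cell, ao, dm, hermi=1)   # (ngrids,)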
+
+nr_uks_vxc = nr_uks = NotImplemented
+nr_nlc_vxc = NotImplemented
+nr_rks_fxc = NotImplemented
+nr_rks_fxc_st = NotImplemented
+nr_uks_fxc = NotImplemented
+cache_xc_kernel = NotImplemented
+cache_xc_kernel1 = NotImplemented
+
+
+def get_rho(ni, cell, dm, grids, kpts=np.zeros((1,3)), max_memory=2000):
+ '''Density in real space
+ '''
+ assert dm.ndim == 2 or dm.shape[0] == 1
+ nao = dm.shape[-1]
+ dm = cp.asarray(dm).reshape(nao, nao)
+ rho = cp.empty(grids.size)
+ p1 = 0
+ # block_loop in this module yields (ao_ks, weight, coords); take the AOs
+ # of the single k-point and contract with the density matrix block-wise.
+ kpt = np.reshape(kpts, (-1,3))[0]
+ for ao_ks, weight, coords in ni.block_loop(cell, grids, deriv=0, kpt=kpt):
+ p0, p1 = p1, p1 + weight.size
+ rho[p0:p1] = ni.eval_rho(cell, ao_ks[0], dm, xctype='LDA', hermi=1)
+ return rho
+
+def _scale_ao(ao, wv, out=None):
+ # TODO: reuse gpu4pyscf.dft.numint._scale_ao
+ if wv.ndim == 1:
+ return ao * wv[:,None]
+ else:
+ return contract('ngi,ng->gi', ao, wv)
+
+def _tau_dot(bra, ket, wv):
+ '''1/2 <nabla i| v | nabla j>'''
+ # TODO: reuse gpu4pyscf.dft.numint._tau_dot
+ wv = .5 * wv
+ mat = bra[1].conj().T.dot(_scale_ao(ket[1], wv))
+ mat += bra[2].conj().T.dot(_scale_ao(ket[2], wv))
+ mat += bra[3].conj().T.dot(_scale_ao(ket[3], wv))
+ return mat
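+
+# i.e. mat_ij = 1/2 sum_g wv(g) grad(bra_i)*(g) . grad(ket_j)(g), using the
+# x, y, z AO-derivative components stored in rows 1:4 of bra and ket.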
+
+class NumInt(lib.StreamObject, numint.LibXCMixin):
+ '''Generalization of pyscf's NumInt class for a single k-point shift and
+ periodic images.
+ '''
+
+ get_vxc = nr_vxc = numint_cpu.NumInt.nr_vxc
+
+ def nr_rks(self, cell, grids, xc_code, dms, relativity=0, hermi=1,
+ kpt=None, kpts_band=None, max_memory=2000, verbose=None):
+ if kpt is None:
+ kpt = np.zeros(3)
+ xctype = self._xc_type(xc_code)
+ if xctype == 'LDA':
+ ao_deriv = 0
+ nvar = 1
+ elif xctype == 'GGA':
+ ao_deriv = 1
+ nvar = 4
+ elif xctype == 'MGGA':
+ ao_deriv = 1
+ nvar = 5
+ elif xctype == 'HF':
+ return 0, 0, cp.zeros_like(dms)
+ else:
+ raise NotImplementedError(f'nr_rks for functional {xc_code}')
+
+ dms = cp.asarray(dms)
+ dm_shape = dms.shape
+ nao = dm_shape[-1]
+ dms = dms.reshape(nao,nao)
+ ngrids = grids.size
+
+ rho = cp.empty([nvar,ngrids])
+ p0 = p1 = 0
+ for ao_ks, weight, coords \
+ in self.block_loop(cell, grids, ao_deriv, kpt=kpt):
+ p0, p1 = p1, p1 + weight.size
+ rho[:,p0:p1] = eval_rho(cell, ao_ks[0], dms, xctype=xctype, hermi=hermi)
+
+ if xctype == 'LDA':
+ exc, vxc = self.eval_xc_eff(xc_code, rho[0], deriv=1, xctype=xctype)[:2]
+ else:
+ exc, vxc = self.eval_xc_eff(xc_code, rho, deriv=1, xctype=xctype)[:2]
+ den = rho[0] * grids.weights
+ nelec = den.sum()
+ excsum = cp.sum(den * exc[:,0])
+
+ wv = vxc * grids.weights
+ # *.5 for v+v.conj().T at the end
+ if xctype == 'GGA':
+ wv[0] *= .5
+ elif xctype == 'MGGA':
+ wv[[0,4]] *= .5
+
+ kpts_band, input_band = _format_kpts_band(kpts_band, kpt), kpts_band
+ nband = len(kpts_band)
+ if is_zero(kpts_band):
+ vmat = cp.zeros((nband, nao, nao))
+ else:
+ vmat = cp.zeros((nband, nao, nao), dtype=np.complex128)
+ v_hermi = 1 # the output matrix must be hermitian
+ p0 = p1 = 0
+ for ao_ks, weight, coords \
+ in self.block_loop(cell, grids, ao_deriv, kpts_band=kpts_band):
+ p0, p1 = p1, p1 + weight.size
+ for k, ao in enumerate(ao_ks):
+ if xctype == 'LDA':
+ aow = _scale_ao(ao, wv[0,p0:p1])
+ vmat[k] += ao.conj().T.dot(aow)
+ elif xctype == 'GGA':
+ aow = _scale_ao(ao[:4], wv[:4,p0:p1])
+ vmat[k] += ao[0].conj().T.dot(aow)
+ elif xctype == 'MGGA':
+ aow = _scale_ao(ao[:4], wv[:4,p0:p1])
+ vmat[k] += ao[0].conj().T.dot(aow)
+ vmat[k] += _tau_dot(ao, ao, wv[4,p0:p1])
+
+ if v_hermi and xctype != 'LDA':
+ vmat = vmat + vmat.transpose(0, 2, 1).conj()
+ if input_band is None:
+ vmat = vmat[0]
+ return nelec, excsum, vmat
+
+ def nr_uks(self, cell, grids, xc_code, dms, relativity=0, hermi=1,
+ kpt=None, kpts_band=None, max_memory=2000, verbose=None):
+ raise NotImplementedError
+
+ def block_loop(self, cell, grids, deriv=0, kpt=None, kpts_band=None):
+ '''Define this macro to loop over grids by blocks.
+ '''
+ nao = cell.nao
+ grids_coords = grids.coords
+ grids_weights = grids.weights
+ ngrids = grids_coords.shape[0]
+ comp = (deriv+1)*(deriv+2)*(deriv+3)//6
+
+ #cupy.get_default_memory_pool().free_all_blocks()
+ mem_avail = get_avail_mem()
+ blksize = int((mem_avail*.2/8/((comp+1)*nao))/ ALIGNED) * ALIGNED
+ blksize = min(blksize, MIN_BLK_SIZE)
+ if blksize < ALIGNED:
+ raise RuntimeError('Not enough GPU memory')
+
+ if kpts_band is None:
+ if kpt is None:
+ kpts = np.zeros((1, 3))
+ else:
+ kpts = np.reshape(kpt, (1, 3))
+ elif kpt is None:
+ kpts = np.reshape(kpts_band, (-1, 3))
+ else:
+ raise RuntimeError('Cannot produce AOs for kpt and kpts_band in the same run')
+
+ for ip0, ip1 in lib.prange(0, ngrids, blksize):
+ coords = grids_coords[ip0:ip1]
+ weight = grids_weights[ip0:ip1]
+ ao_ks = eval_ao_kpts(cell, coords, kpts, deriv=deriv)
+ yield ao_ks, weight, coords
+ ao_ks = None
+
+ eval_xc_eff = numint.eval_xc_eff
+ _init_xcfuns = numint.NumInt._init_xcfuns
+
+ get_fxc = nr_fxc = numint_cpu.NumInt.nr_fxc
+ nr_rks_fxc = nr_rks_fxc
+ nr_uks_fxc = nr_uks_fxc
+ nr_rks_fxc_st = nr_rks_fxc_st
+ nr_nlc_vxc = nr_nlc_vxc
+ cache_xc_kernel = cache_xc_kernel
+ cache_xc_kernel1 = cache_xc_kernel1
+ get_rho = get_rho
+
+ eval_ao = staticmethod(eval_ao)
+ eval_rho = staticmethod(eval_rho)
+ eval_rho2 = NotImplemented
+ eval_rho1 = NotImplemented
+
+ to_gpu = utils.to_gpu
+ device = utils.device
+
+ def to_cpu(self):
+ return numint_cpu.NumInt()
+
+_NumInt = NumInt
+
+
+class KNumInt(lib.StreamObject, numint.LibXCMixin):
+ '''Generalization of pyscf's NumInt class for k-point sampling and
+ periodic images.
+ '''
+ def __init__(self, kpts=np.zeros((1,3))):
+ self.kpts = np.reshape(kpts, (-1,3))
+
+ eval_ao = staticmethod(eval_ao_kpts)
+
+ make_mask = NotImplemented
+
+ def eval_rho(self, cell, ao_kpts, dm_kpts, non0tab=None, xctype='LDA',
+ hermi=0, with_lapl=True, verbose=None):
+ '''Collocate the density (opt. gradients) on the real-space grid.
+
+ Args:
+ cell : Mole or Cell object
+ ao_kpts : (nkpts, ngrids, nao) ndarray
+ AO values at each k-point
+ dm_kpts: (nkpts, nao, nao) ndarray
+ Density matrix at each k-point
+
+ Returns:
+ rhoR : (ngrids,) ndarray
+ '''
+ nkpts = len(ao_kpts)
+ rho_ks = [eval_rho(cell, ao_kpts[k], dm_kpts[k], non0tab, xctype,
+ hermi, with_lapl, verbose)
+ for k in range(nkpts)]
+ dtype = np.result_type(*rho_ks)
+ rho = cp.zeros(rho_ks[0].shape, dtype=dtype)
+ for k in range(nkpts):
+ rho += rho_ks[k]
+ rho *= 1./nkpts
+ return rho
+
+ get_vxc = nr_vxc = numint_cpu.KNumInt.nr_vxc
+ eval_rho1 = NotImplemented
+ nr_rks = NotImplemented
+ nr_uks = NotImplemented
+
+ block_loop = NotImplemented
+ eval_rho2 = NotImplemented
+ nr_rks_fxc = nr_rks_fxc
+ nr_uks_fxc = nr_uks_fxc
+ nr_rks_fxc_st = nr_rks_fxc_st
+ cache_xc_kernel = cache_xc_kernel
+ cache_xc_kernel1 = cache_xc_kernel1
+ get_rho = get_rho
+
+ to_gpu = utils.to_gpu
+ device = utils.device
+
+ def to_cpu(self):
+ return numint_cpu.KNumInt()
+
+_KNumInt = KNumInt
diff --git a/gpu4pyscf/pbc/dft/rks.py b/gpu4pyscf/pbc/dft/rks.py
new file mode 100644
index 00000000..f514b9e4
--- /dev/null
+++ b/gpu4pyscf/pbc/dft/rks.py
@@ -0,0 +1,228 @@
+#!/usr/bin/env python
+#
+# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+'''
+Non-relativistic Restricted Kohn-Sham for periodic systems at a single k-point
+'''
+
+
+import numpy as np
+import cupy as cp
+from pyscf import lib
+from pyscf.pbc.dft import rks as ks_cpu
+from pyscf.pbc.scf import khf
+from pyscf.pbc.dft import multigrid
+from gpu4pyscf.lib import logger, utils
+from gpu4pyscf.dft import rks as mol_ks
+from gpu4pyscf.pbc.scf import hf as pbchf
+from gpu4pyscf.pbc.dft import gen_grid
+from gpu4pyscf.pbc.dft import numint
+from gpu4pyscf.lib.cupy_helper import contract, tag_array
+from pyscf import __config__
+
+__all__ = [
+ 'get_veff', 'RKS', 'KohnShamDFT',
+]
+
+def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
+ kpt=None, kpts_band=None):
+ '''Coulomb + XC functional
+
+ .. note::
+ This function will change the ks object.
+
+ Args:
+ ks : an instance of :class:`RKS`
+ The XC functional is controlled by the ks.xc attribute. The
+ ks.grids attribute might be initialized.
+ dm : ndarray or list of ndarrays
+ A density matrix or a list of density matrices
+
+ Returns:
+ matrix Veff = J + Vxc. Veff can be a list of matrices if the input
+ dm is a list of density matrices.
+ '''
+ if cell is None: cell = ks.cell
+ if dm is None: dm = ks.make_rdm1()
+ if kpt is None: kpt = ks.kpt
+ t0 = logger.init_timer(ks)
+
+ ni = ks._numint
+ hybrid = ni.libxc.is_hybrid_xc(ks.xc)
+
+ if isinstance(ks.with_df, multigrid.MultiGridFFTDF):
+ if ks.do_nlc():
+ raise NotImplementedError(f'MultiGrid for NLC functional {ks.xc} + {ks.nlc}')
+
+ ground_state = (isinstance(dm, cp.ndarray) and dm.ndim == 2
+ and kpts_band is None)
+ ks.initialize_grids(cell, dm, kpt, ground_state)
+
+ if hermi == 2: # because rho = 0
+ n, exc, vxc = 0, 0, 0
+ else:
+ n, exc, vxc = ni.nr_rks(cell, ks.grids, ks.xc, dm, 0, hermi,
+ kpt, kpts_band)
+ logger.info(ks, 'nelec by numeric integration = %s', n)
+ if ks.do_nlc():
+ if ni.libxc.is_nlc(ks.xc):
+ xc = ks.xc
+ else:
+ assert ni.libxc.is_nlc(ks.nlc)
+ xc = ks.nlc
+ n, enlc, vnlc = ni.nr_nlc_vxc(cell, ks.nlcgrids, xc, dm, 0, hermi, kpt)
+ exc += enlc
+ vxc += vnlc
+ logger.info(ks, 'nelec with nlc grids = %s', n)
+ t0 = logger.timer(ks, 'vxc', *t0)
+
+ if not hybrid:
+ vj = ks.get_j(cell, dm, hermi, kpt, kpts_band)
+ vxc += vj
+ else:
+ omega, alpha, hyb = ni.rsh_and_hybrid_coeff(ks.xc, spin=cell.spin)
+ if omega == 0:
+ vj, vk = ks.get_jk(cell, dm, hermi, kpt, kpts_band)
+ vk *= hyb
+ elif alpha == 0: # LR=0, only SR exchange
+ vj = ks.get_j(cell, dm, hermi, kpt, kpts_band)
+ vk = ks.get_k(cell, dm, hermi, kpt, kpts_band, omega=-omega)
+ vk *= hyb
+ elif hyb == 0: # SR=0, only LR exchange
+ vj = ks.get_j(cell, dm, hermi, kpt, kpts_band)
+ vk = ks.get_k(cell, dm, hermi, kpt, kpts_band, omega=omega)
+ vk *= alpha
+ else: # SR and LR exchange with different ratios
+ vj, vk = ks.get_jk(cell, dm, hermi, kpt, kpts_band)
+ vk *= hyb
+ vklr = ks.get_k(cell, dm, hermi, kpt, kpts_band, omega=omega)
+ vklr *= (alpha - hyb)
+ vk += vklr
+ vxc += vj - vk * .5
+
+ if ground_state:
+ exc -= contract('ij,ji->', dm, vk).real * .5 * .5
+
+ if ground_state:
+ ecoul = contract('ij,ji->', dm, vj).real * .5
+ else:
+ ecoul = None
+
+ vxc = tag_array(vxc, ecoul=ecoul, exc=exc, vj=None, vk=None)
+ return vxc
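+
+# The hybrid/RSH branches above implement, with (omega, alpha, hyb) taken
+# from rsh_and_hybrid_coeff (a sketch):
+#   veff = Vxc + J - 0.5 * [hyb * K + (alpha - hyb) * K(omega)]
+# where K(omega) is the exchange matrix built with the long-range attenuated
+# Coulomb kernel.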
+
+def prune_small_rho_grids_(ks, cell, dm, grids, kpts):
+ raise NotImplementedError
+
+def get_rho(mf, dm=None, grids=None, kpt=None):
+ if dm is None: dm = mf.make_rdm1()
+ if grids is None: grids = mf.grids
+ if kpt is None: kpt = mf.kpt
+ if dm[0].ndim == 2: # the UKS density matrix
+ dm = dm[0] + dm[1]
+ if isinstance(mf.with_df, multigrid.MultiGridFFTDF):
+ rho = mf.with_df.get_rho(dm, kpt)
+ else:
+ rho = mf._numint.get_rho(mf.cell, dm, grids, kpt, mf.max_memory)
+ return rho
+
+
+class KohnShamDFT(mol_ks.KohnShamDFT):
+ '''PBC-KS'''
+
+ _keys = ks_cpu.KohnShamDFT._keys
+
+ def __init__(self, xc='LDA,VWN'):
+ self.xc = xc
+ self.grids = gen_grid.UniformGrids(self.cell)
+ self.nlc = ''
+ self.nlcgrids = gen_grid.UniformGrids(self.cell)
+ self.small_rho_cutoff = getattr(
+ __config__, 'dft_rks_RKS_small_rho_cutoff', 1e-7)
+ if isinstance(self, khf.KSCF):
+ self._numint = numint.KNumInt(self.kpts)
+ else:
+ self._numint = numint.NumInt()
+
+ build = ks_cpu.KohnShamDFT.build
+ reset = ks_cpu.KohnShamDFT.reset
+ dump_flags = ks_cpu.KohnShamDFT.dump_flags
+
+ get_veff = NotImplemented
+ get_rho = get_rho
+
+ density_fit = NotImplemented
+ rs_density_fit = NotImplemented
+
+ jk_method = NotImplemented
+
+ to_rks = NotImplemented
+ to_uks = NotImplemented
+ to_gks = NotImplemented
+ to_hf = NotImplemented
+
+ def initialize_grids(self, cell, dm, kpts, ground_state=True):
+ '''Initialize self.grids the first time call get_veff'''
+ if self.grids.coords is None:
+ t0 = (logger.process_clock(), logger.perf_counter())
+ self.grids.build(with_non0tab=True)
+ if (isinstance(self.grids, gen_grid.BeckeGrids) and
+ self.small_rho_cutoff > 1e-20 and ground_state):
+ self.grids = prune_small_rho_grids_(
+ self, self.cell, dm, self.grids, kpts)
+ t0 = logger.timer(self, 'setting up grids', *t0)
+ is_nlc = self.do_nlc()
+ if is_nlc and self.nlcgrids.coords is None:
+ t0 = (logger.process_clock(), logger.perf_counter())
+ self.nlcgrids.build(with_non0tab=True)
+ if (isinstance(self.grids, gen_grid.BeckeGrids) and
+ self.small_rho_cutoff > 1e-20 and ground_state):
+ self.nlcgrids = prune_small_rho_grids_(
+ self, self.cell, dm, self.nlcgrids, kpts)
+ t0 = logger.timer(self, 'setting up nlc grids', *t0)
+ return self
+
+# Update the KohnShamDFT label in pbc.scf.hf module
+pbchf.KohnShamDFT = KohnShamDFT
+
+
+class RKS(KohnShamDFT, pbchf.RHF):
+ '''RKS class adapted for PBCs.
+
+ This is a literal duplication of the molecular RKS class with some `mol`
+ variables replaced by `cell`.
+ '''
+
+ def __init__(self, cell, kpt=np.zeros(3), xc='LDA,VWN', exxdiv='ewald'):
+ pbchf.RHF.__init__(self, cell, kpt, exxdiv=exxdiv)
+ KohnShamDFT.__init__(self, xc)
+
+ def dump_flags(self, verbose=None):
+ pbchf.RHF.dump_flags(self, verbose)
+ KohnShamDFT.dump_flags(self, verbose)
+ return self
+
+ get_veff = get_veff
+ energy_elec = mol_ks.energy_elec
+
+ to_gpu = utils.to_gpu
+ device = utils.device
+
+ def to_cpu(self):
+ mf = ks_cpu.RKS(self.cell)
+ utils.to_cpu(self, out=mf)
+ return mf
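+
+# Minimal usage sketch (FFTDF/GPW backend, gamma point; the cell below is
+# hypothetical):
+#
+#   import numpy as np
+#   from pyscf.pbc import gto
+#   from gpu4pyscf.pbc.dft import rks
+#
+#   cell = gto.M(atom='He 0 0 0', basis='ccpvdz', a=np.eye(3) * 4.0)
+#   mf = rks.RKS(cell, xc='pbe')
+#   e_tot = mf.kernel()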
diff --git a/gpu4pyscf/pbc/dft/tests/test_pbc_rks.py b/gpu4pyscf/pbc/dft/tests/test_pbc_rks.py
new file mode 100644
index 00000000..1489ee40
--- /dev/null
+++ b/gpu4pyscf/pbc/dft/tests/test_pbc_rks.py
@@ -0,0 +1,133 @@
+#!/usr/bin/env python
+#
+# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import unittest
+import tempfile
+import numpy as np
+from pyscf.pbc import gto as pbcgto
+from gpu4pyscf.pbc import dft as pbcdft
+
+
+class KnownValues(unittest.TestCase):
+ @classmethod
+ def setUpClass(cls):
+ global cell
+ L = 4
+ n = 21
+ cell = pbcgto.Cell()
+ cell.build(unit = 'B',
+ a = ((L,0,0),(0,L,0),(0,0,L)),
+ mesh = [n,n,n],
+ atom = [['He', (L/2.-.5,L/2.,L/2.-.5)],
+ ['He', (L/2. ,L/2.,L/2.+.5)]],
+ basis = { 'He': [[0, (0.8, 1.0)],
+ [0, (1.0, 1.0)],
+ [0, (1.2, 1.0)]]})
+ cls.cell = cell
+
+ @classmethod
+ def tearDownClass(cls):
+ global cell
+ del cell
+
+ def test_lda_fft(self):
+ mf = pbcdft.RKS(cell, xc='lda,vwn').run()
+ mf_ref = mf.to_cpu().run()
+ self.assertAlmostEqual(mf.e_tot, mf_ref.e_tot, 7)
+
+ # test bands
+ np.random.seed(1)
+ kpts_band = np.random.random((2,3))
+ e0, c0 = mf_ref.get_bands(kpts_band)
+ e1, c1 = mf.get_bands(kpts_band)
+ self.assertAlmostEqual(abs(e1[0].get() - e0[0]).max(), 0, 7)
+ self.assertAlmostEqual(abs(e1[1].get() - e0[1]).max(), 0, 7)
+
+ def test_gga_fft(self):
+ mf = pbcdft.RKS(cell, xc='pbe0').run()
+ mf_ref = mf.to_cpu().run()
+ self.assertAlmostEqual(mf.e_tot, mf_ref.e_tot, 7)
+
+ # test bands
+ np.random.seed(1)
+ kpts_band = np.random.random((2,3))
+ e0, c0 = mf_ref.get_bands(kpts_band)
+ e1, c1 = mf.get_bands(kpts_band)
+ self.assertAlmostEqual(abs(e1[0].get() - e0[0]).max(), 0, 7)
+ self.assertAlmostEqual(abs(e1[1].get() - e0[1]).max(), 0, 7)
+
+ def test_rsh_fft(self):
+ mf = pbcdft.RKS(cell, xc='camb3lyp').run()
+ mf_ref = mf.to_cpu().run()
+ self.assertAlmostEqual(mf.e_tot, mf_ref.e_tot, 7)
+
+ # test bands
+ np.random.seed(1)
+ kpts_band = np.random.random((2,3))
+ e0, c0 = mf_ref.get_bands(kpts_band)
+ e1, c1 = mf.get_bands(kpts_band)
+ self.assertAlmostEqual(abs(e1[0].get() - e0[0]).max(), 0, 7)
+ self.assertAlmostEqual(abs(e1[1].get() - e0[1]).max(), 0, 7)
+
+ def test_lda_fft_with_kpt(self):
+ np.random.seed(1)
+ k = np.random.random(3)
+ mf = pbcdft.RKS(cell, xc='lda,vwn', kpt=k).run()
+ mf_ref = mf.to_cpu().run()
+ self.assertAlmostEqual(mf.e_tot, mf_ref.e_tot, 7)
+
+ # test bands
+ np.random.seed(1)
+ kpts_band = np.random.random((2,3))
+ e0, c0 = mf_ref.get_bands(kpts_band)
+ e1, c1 = mf.get_bands(kpts_band)
+ self.assertAlmostEqual(abs(e1[0].get() - e0[0]).max(), 0, 7)
+ self.assertAlmostEqual(abs(e1[1].get() - e0[1]).max(), 0, 7)
+
+ def test_gga_fft_with_kpt(self):
+ np.random.seed(1)
+ k = np.random.random(3)
+ mf = pbcdft.RKS(cell, xc='pbe0', kpt=k).run()
+ mf_ref = mf.to_cpu().run()
+ self.assertAlmostEqual(mf.e_tot, mf_ref.e_tot, 7)
+
+ # test bands
+ np.random.seed(1)
+ kpts_band = np.random.random((2,3))
+ e0, c0 = mf_ref.get_bands(kpts_band)
+ e1, c1 = mf.get_bands(kpts_band)
+ self.assertAlmostEqual(abs(e1[0].get() - e0[0]).max(), 0, 7)
+ self.assertAlmostEqual(abs(e1[1].get() - e0[1]).max(), 0, 7)
+
+ def test_rsh_fft_with_kpt(self):
+ np.random.seed(1)
+ k = np.random.random(3)
+ mf = pbcdft.RKS(cell, xc='camb3lyp', kpt=k).run(conv_tol=1e-8)
+ mf_ref = mf.to_cpu().run()
+ self.assertAlmostEqual(mf.e_tot, mf_ref.e_tot, 7)
+
+ # test bands
+ np.random.seed(1)
+ kpts_band = np.random.random((2,3))
+ e0, c0 = mf_ref.get_bands(kpts_band)
+ e1, c1 = mf.get_bands(kpts_band)
+ self.assertAlmostEqual(abs(e1[0].get() - e0[0]).max(), 0, 7)
+ self.assertAlmostEqual(abs(e1[1].get() - e0[1]).max(), 0, 7)
+
+if __name__ == '__main__':
+ print("Full Tests for pbc.dft.rks")
+ unittest.main()
diff --git a/gpu4pyscf/pbc/scf/__init__.py b/gpu4pyscf/pbc/scf/__init__.py
new file mode 100644
index 00000000..70ac5a5b
--- /dev/null
+++ b/gpu4pyscf/pbc/scf/__init__.py
@@ -0,0 +1,32 @@
+#!/usr/bin/env python
+#
+# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+'''Hartree-Fock for periodic systems
+'''
+
+from . import hf
+#from . import uhf
+#from . import khf
+#from . import kuhf
+
+rhf = hf
+#krhf = khf
+
+#UHF = uhf.UHF
+RHF = rhf.RHF
+#KRHF = krhf.KRHF
+#KUHF = kuhf.KUHF
diff --git a/gpu4pyscf/pbc/scf/hf.py b/gpu4pyscf/pbc/scf/hf.py
new file mode 100644
index 00000000..83ad7b47
--- /dev/null
+++ b/gpu4pyscf/pbc/scf/hf.py
@@ -0,0 +1,268 @@
+#!/usr/bin/env python
+#
+# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+'''
+Hartree-Fock for periodic systems at a single k-point
+'''
+
+import numpy as np
+import cupy as cp
+from pyscf import lib
+from pyscf.pbc.scf import hf as hf_cpu
+from gpu4pyscf.lib import logger, utils
+from gpu4pyscf.lib.cupy_helper import return_cupy_array, contract
+from gpu4pyscf.scf import hf as mol_hf
+from gpu4pyscf.pbc import df
+
+__all__ = [
+ 'RHF', 'SCF'
+]
+
+def get_bands(mf, kpts_band, cell=None, dm=None, kpt=None):
+ '''Get energy bands at the given (arbitrary) 'band' k-points.
+
+ Returns:
+ mo_energy : (nmo,) ndarray or a list of (nmo,) ndarray
+ Bands energies E_n(k)
+ mo_coeff : (nao, nmo) ndarray or a list of (nao,nmo) ndarray
+ Band orbitals psi_n(k)
+ '''
+ if cell is None: cell = mf.cell
+ if dm is None: dm = mf.make_rdm1()
+ if kpt is None: kpt = mf.kpt
+
+ kpts_band = np.asarray(kpts_band)
+ single_kpt_band = (getattr(kpts_band, 'ndim', None) == 1)
+ kpts_band = kpts_band.reshape(-1,3)
+
+ fock = mf.get_veff(cell, dm, kpt=kpt, kpts_band=kpts_band)
+ fock += mf.get_hcore(cell, kpts_band)
+ s1e = mf.get_ovlp(cell, kpts_band)
+ nkpts = len(kpts_band)
+ mo_energy = []
+ mo_coeff = []
+ for k in range(nkpts):
+ e, c = mf.eig(fock[k], s1e[k])
+ mo_energy.append(e)
+ mo_coeff.append(c)
+
+ if single_kpt_band:
+ mo_energy = mo_energy[0]
+ mo_coeff = mo_coeff[0]
+ return mo_energy, mo_coeff
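+
+# Band-structure sketch: after converging the SCF at mf.kpt, the Fock and
+# overlap matrices are rebuilt at arbitrary band k-points and diagonalized:
+#
+#   kpts_band = cell.make_kpts([8, 1, 1])   # any band path or mesh
+#   e_kn, c_kn = mf.get_bands(kpts_band)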
+
+get_fock = mol_hf.get_fock
+get_occ = mol_hf.get_occ
+get_grad = mol_hf.get_grad
+make_rdm1 = mol_hf.make_rdm1
+energy_elec = mol_hf.energy_elec
+
+def get_rho(mf, dm=None, grids=None, kpt=None):
+ '''Compute density in real space
+ '''
+ from gpu4pyscf.pbc.dft import gen_grid
+ from gpu4pyscf.pbc.dft import numint
+ if dm is None:
+ dm = mf.make_rdm1()
+ if getattr(dm, 'ndim', None) != 2: # UHF
+ dm = dm[0] + dm[1]
+ if grids is None:
+ grids = gen_grid.UniformGrids(mf.cell)
+ if kpt is None:
+ kpt = mf.kpt
+ ni = numint.NumInt()
+ return ni.get_rho(mf.cell, dm, grids, kpt, mf.max_memory)
+
+class SCF(mol_hf.SCF):
+ '''SCF base class adapted for PBCs.
+
+ Attributes:
+ kpt : (3,) ndarray
+ The AO k-point in Cartesian coordinates, in units of 1/Bohr.
+
+ exxdiv : str
+ Exchange divergence treatment, can be one of
+
+ | None : ignore G=0 contribution in exchange
+ | 'ewald' : Ewald probe charge correction [JCP 122, 234102 (2005); DOI:10.1063/1.1926272]
+
+ with_df : density fitting object
+ Default is an instance of the FFTDF class (GPW method).
+ '''
+
+ _keys = hf_cpu.SCF._keys
+
+ def __init__(self, cell, kpt=np.zeros(3), exxdiv='ewald'):
+ if not cell._built:
+ cell.build()
+ mol_hf.SCF.__init__(self, cell)
+ self.with_df = df.FFTDF(cell)
+ # Range separation JK builder
+ self.rsjk = None
+ self.exxdiv = exxdiv
+ self.kpt = kpt
+ self.conv_tol = 1e-8
+ if cell.precision:
+ self.conv_tol = max(cell.precision * 10, 1e-8)
+
+ def check_sanity(self):
+ if (isinstance(self.exxdiv, str) and self.exxdiv.lower() != 'ewald' and
+ isinstance(self.with_df, df.DF)):
+ logger.warn(self, 'exxdiv %s is not supported in DF', self.exxdiv)
+
+ if self.verbose >= logger.DEBUG:
+ super().check_sanity()
+ return self
+
+ kpt = hf_cpu.SCF.kpt
+ kpts = hf_cpu.SCF.kpts
+ mol = hf_cpu.SCF.mol # required by the hf.kernel
+
+ reset = hf_cpu.SCF.reset
+ build = hf_cpu.SCF.build
+ dump_flags = hf_cpu.SCF.dump_flags
+
+ get_bands = get_bands
+ get_rho = get_rho
+
+ get_ovlp = return_cupy_array(hf_cpu.SCF.get_ovlp)
+
+ def get_hcore(self, cell=None, kpt=None):
+ if cell is None: cell = self.cell
+ if kpt is None: kpt = self.kpt
+ if cell.pseudo:
+ nuc = self.with_df.get_pp(kpt)
+ else:
+ nuc = self.with_df.get_nuc(kpt)
+ if len(cell._ecpbas) > 0:
+ raise NotImplementedError('ECP in PBC SCF')
+ return nuc + cp.asarray(cell.pbc_intor('int1e_kin', 1, 1, kpt))
+
+ def get_jk(self, cell=None, dm=None, hermi=1, kpt=None, kpts_band=None,
+ with_j=True, with_k=True, omega=None, **kwargs):
+ r'''Get the Coulomb (J) and exchange (K) matrices following
+ :func:`scf.hf.RHF.get_jk_` for the given k-point (kpt).
+
+ When kpts_band is given, the J, K matrices on kpts_band are evaluated.
+
+ J_{pq} = \sum_{rs} (pq|rs) dm[s,r]
+ K_{pq} = \sum_{rs} (pr|sq) dm[r,s]
+
+ where r,s are orbitals on kpt; p and q are orbitals on kpts_band
+ if kpts_band is given, otherwise on kpt.
+ '''
+ if cell is None: cell = self.cell
+ if dm is None: dm = self.make_rdm1()
+ if kpt is None: kpt = self.kpt
+
+ cpu0 = logger.init_timer(self)
+ dm = cp.asarray(dm)
+ nao = dm.shape[-1]
+ vj, vk = self.with_df.get_jk(dm.reshape(-1,nao,nao), hermi, kpt, kpts_band,
+ with_j, with_k, omega, exxdiv=self.exxdiv)
+ if with_j:
+ vj = _format_jks(vj, dm, kpts_band)
+ if with_k:
+ vk = _format_jks(vk, dm, kpts_band)
+ logger.timer(self, 'vj and vk', *cpu0)
+ return vj, vk
+
+ def get_j(self, cell=None, dm=None, hermi=1, kpt=None, kpts_band=None,
+ omega=None):
+ r'''Compute J matrix for the given density matrix and k-point (kpt).
+ When kpts_band is given, the J matrices on kpts_band are evaluated.
+
+ J_{pq} = \sum_{rs} (pq|rs) dm[s,r]
+
+ where r,s are orbitals on kpt; p and q are orbitals on kpts_band
+ if kpts_band is given, otherwise on kpt.
+ '''
+ return self.get_jk(cell, dm, hermi, kpt, kpts_band, with_k=False,
+ omega=omega)[0]
+
+ def get_k(self, cell=None, dm=None, hermi=1, kpt=None, kpts_band=None,
+ omega=None):
+ '''Compute K matrix for the given density matrix.
+ '''
+ return self.get_jk(cell, dm, hermi, kpt, kpts_band, with_j=False,
+ omega=omega)[1]
+
+ get_veff = hf_cpu.SCF.get_veff
+ energy_nuc = hf_cpu.SCF.energy_nuc
+ _finalize = hf_cpu.SCF._finalize
+
+ def get_init_guess(self, cell=None, key='minao', s1e=None):
+ if cell is None: cell = self.cell
+ dm = mol_hf.SCF.get_init_guess(self, cell, key)
+ dm = normalize_dm_(self, dm, s1e)
+ return dm
+
+ init_guess_by_1e = hf_cpu.SCF.init_guess_by_1e
+ init_guess_by_chkfile = hf_cpu.SCF.init_guess_by_chkfile
+ from_chk = hf_cpu.SCF.from_chk
+ dump_chk = hf_cpu.SCF.dump_chk
+ analyze = NotImplemented
+ mulliken_pop = NotImplemented
+ density_fit = NotImplemented
+ rs_density_fit = NotImplemented
+ x2c = x2c1e = sfx2c1e = NotImplemented
+ spin_square = NotImplemented
+ dip_moment = NotImplemented
+
+
+class KohnShamDFT:
+ '''A mock DFT base class
+
+ The actual base class is defined in the pbc.dft.rks module. This class
+ can be used to check whether an SCF object is a PBC Hartree-Fock or a
+ PBC DFT method. It is overwritten by the actual KohnShamDFT class when
+ the dft module is loaded.
+ '''
+
+
+class RHF(SCF):
+
+ to_gpu = utils.to_gpu
+ device = utils.device
+
+ def to_cpu(self):
+ mf = hf_cpu.RHF(self.cell)
+ utils.to_cpu(self, out=mf)
+ return mf
+
+def _format_jks(vj, dm, kpts_band):
+ if kpts_band is None:
+ vj = vj.reshape(dm.shape)
+ elif kpts_band.ndim == 1: # a single k-point on bands
+ vj = vj.reshape(dm.shape)
+ elif getattr(dm, "ndim", 0) == 2:
+ vj = vj[0]
+ return vj
+
+def normalize_dm_(mf, dm, s1e=None):
+ '''
+ Force the density matrix to integrate to the correct number of electrons.
+ '''
+ cell = mf.cell
+ if s1e is None:
+ s1e = mf.get_ovlp(cell)
+ ne = contract('ij,ji->', dm, s1e).real
+ if abs(ne - cell.nelectron) > 0.01:
+ logger.debug(mf, 'Large error in the electron number of the initial '
+ 'guess density matrix (Ne/cell = %g)!', ne)
+ dm *= cell.nelectron / ne
+ return dm
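+
+# Usage sketch for this module:
+#
+#   from gpu4pyscf.pbc import scf
+#   mf = scf.RHF(cell, exxdiv='ewald')   # cell: a built pbcgto.Cell
+#   e_tot = mf.kernel()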
diff --git a/gpu4pyscf/pbc/scf/tests/test_pbc_scf_hf.py b/gpu4pyscf/pbc/scf/tests/test_pbc_scf_hf.py
new file mode 100644
index 00000000..ca11d5b0
--- /dev/null
+++ b/gpu4pyscf/pbc/scf/tests/test_pbc_scf_hf.py
@@ -0,0 +1,127 @@
+#!/usr/bin/env python
+#
+# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import unittest
+import numpy as np
+import cupy as cp
+from pyscf import lib
+from pyscf.pbc.scf import hf as pbchf_cpu
+from pyscf.pbc import gto as pbcgto
+from gpu4pyscf.pbc import scf
+
+class KnownValues(unittest.TestCase):
+ @classmethod
+ def setUpClass(cls):
+ L = 4
+ n = 21
+ cell = pbcgto.Cell()
+ cell.build(unit = 'B',
+ verbose = 7,
+ output = '/dev/null',
+ a = ((L,0,0),(0,L,0),(0,0,L)),
+ mesh = [n,n,n],
+ atom = [['He', (L/2.-.5,L/2.,L/2.-.5)],
+ ['He', (L/2. ,L/2.,L/2.+.5)]],
+ basis = { 'He': [[0, (0.8, 1.0)],
+ [0, (1.0, 1.0)],
+ [0, (1.2, 1.0)]]})
+ cls.cell = cell
+
+ @classmethod
+ def tearDownClass(cls):
+ cls.cell.stdout.close()
+
+ def test_rhf_exx_ewald(self):
+ cell = self.cell
+ mf = scf.RHF(cell, exxdiv='ewald').run()
+ self.assertAlmostEqual(mf.e_tot, -4.3511582284698633, 7)
+ self.assertTrue(mf.mo_coeff.dtype == np.double)
+ #kmf = scf.KRHF(cell, [[0,0,0]], exxdiv='ewald').run()
+ #self.assertAlmostEqual(mf.e_tot, kmf.e_tot, 8)
+
+ # test bands
+ np.random.seed(1)
+ kpts_band = np.random.random((2,3))
+ e1, c1 = mf.get_bands(kpts_band)
+ #e0, c0 = kmf.get_bands(kpts_band)
+ #self.assertAlmostEqual(abs(e0[0]-e1[0]).max(), 0, 7)
+ #self.assertAlmostEqual(abs(e0[1]-e1[1]).max(), 0, 7)
+ self.assertAlmostEqual(lib.fp(e1[0].get()), -6.2986775452228283, 6)
+ self.assertAlmostEqual(lib.fp(e1[1].get()), -7.6616273746782362, 6)
+
+ def test_rhf_exx_ewald_with_kpt(self):
+ np.random.seed(1)
+ k = np.random.random(3)
+ cell = self.cell
+ mf = scf.RHF(cell, k, exxdiv='ewald')
+ e1 = mf.kernel()
+ self.assertAlmostEqual(e1, -4.2048655827967139, 7)
+ self.assertTrue(mf.mo_coeff.dtype == np.complex128)
+
+ #kmf = scf.KRHF(cell, k, exxdiv='ewald')
+ #e0 = kmf.kernel()
+ #self.assertTrue(np.allclose(e0,e1))
+
+ # test bands
+ np.random.seed(1)
+ kpt_band = np.random.random(3)
+ e1, c1 = mf.get_bands(kpt_band)
+ #e0, c0 = kmf.get_bands(kpt_band)
+ #self.assertAlmostEqual(abs(e0-e1).max(), 0, 7)
+ self.assertAlmostEqual(lib.fp(e1.get()), -6.8312867098806249, 6)
+
+ def test_rhf_exx_None(self):
+ cell = self.cell
+ mf = scf.RHF(cell, exxdiv=None)
+ e1 = mf.kernel()
+ self.assertAlmostEqual(e1, -2.9325094887283196, 7)
+ self.assertTrue(mf.mo_coeff.dtype == np.double)
+
+ #mf = scf.KRHF(cell, [[0,0,0]], exxdiv=None)
+ #e0 = mf.kernel()
+ #self.assertTrue(np.allclose(e0,e1))
+
+ np.random.seed(1)
+ k = np.random.random(3)
+ mf = scf.RHF(cell, k, exxdiv=None)
+ mf.init_guess = 'hcore'
+ e1 = mf.kernel()
+ self.assertAlmostEqual(e1, -2.7862168430230341, 7)
+ self.assertTrue(mf.mo_coeff.dtype == np.complex128)
+
+ #mf = scf.KRHF(cell, k, exxdiv=None)
+ #mf.init_guess = 'hcore'
+ #e0 = mf.kernel()
+ #self.assertTrue(np.allclose(e0,e1))
+
+ def test_jk(self):
+ cell = self.cell
+ nao = cell.nao
+ np.random.seed(2)
+ dm = np.random.random((2,nao,nao)) + .5j*np.random.random((2,nao,nao))
+ dm = dm + dm.conj().transpose(0,2,1)
+ ref = pbchf_cpu.RHF(cell).get_jk(cell, dm)
+
+ dm = cp.asarray(dm)
+ vj, vk = scf.RHF(cell).get_jk(cell, dm)
+ self.assertAlmostEqual(abs(vj.get() - ref[0]).max(), 0, 9)
+ self.assertAlmostEqual(abs(vk.get() - ref[1]).max(), 0, 9)
+
+
+if __name__ == '__main__':
+ print("Full Tests for pbc.scf.hf")
+ unittest.main()
diff --git a/gpu4pyscf/pbc/tools/__init__.py b/gpu4pyscf/pbc/tools/__init__.py
new file mode 100644
index 00000000..12b67013
--- /dev/null
+++ b/gpu4pyscf/pbc/tools/__init__.py
@@ -0,0 +1,18 @@
+#!/usr/bin/env python
+#
+# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+from .pbc import *
diff --git a/gpu4pyscf/pbc/tools/pbc.py b/gpu4pyscf/pbc/tools/pbc.py
new file mode 100644
index 00000000..c5fc91e8
--- /dev/null
+++ b/gpu4pyscf/pbc/tools/pbc.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python
+#
+# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import numpy as np
+import cupy as cp
+from gpu4pyscf.lib.cupy_helper import return_cupy_array
+from pyscf.pbc.tools.pbc import get_coulG
+
+get_coulG = return_cupy_array(get_coulG)
+
+def fft(f, mesh):
+ '''Perform the 3D FFT from real (R) to reciprocal (G) space.
+
+ After FFT, (u, v, w) -> (j, k, l).
+ (jkl) is in the index order of Gv.
+
+ FFT normalization factor is 1., as in MH and in `numpy.fft`.
+
+ Args:
+ f : (nx*ny*nz,) ndarray
+ The function to be FFT'd, flattened to a 1D array corresponding
+ to the index order of :func:`cartesian_prod`.
+ mesh : (3,) ndarray of ints (= nx,ny,nz)
+ The number of G-vectors along each direction.
+
+ Returns:
+ (nx*ny*nz,) ndarray
+ The FFT 1D array in same index order as Gv (natural order of
+ numpy.fft).
+
+ '''
+ if f.size == 0:
+ return cp.zeros_like(f)
+
+ f3d = cp.asarray(f).reshape(-1, *mesh)
+ assert (f3d.shape[0] == 1 or f[0].size == f3d[0].size)
+ g3d = cp.fft.fftn(f3d, axes=(1,2,3))
+ ngrids = np.prod(mesh)
+ if f.ndim == 1 or (f.ndim == 3 and f.size == ngrids):
+ return g3d.ravel()
+ else:
+ return g3d.reshape(-1, ngrids)
+
+def ifft(g, mesh):
+ '''Perform the 3D inverse FFT from reciprocal (G) space to real (R) space.
+
+ Inverse FFT normalization factor is 1./N, same as in `numpy.fft` but
+ **different** from MH (they use 1.).
+
+ Args:
+ g : (nx*ny*nz,) ndarray
+ The function to be inverse FFT'd, flattened to a 1D array
+ corresponding to the index order of `span3`.
+ mesh : (3,) ndarray of ints (= nx,ny,nz)
+ The number of G-vectors along each direction.
+
+ Returns:
+ (nx*ny*nz,) ndarray
+ The inverse FFT 1D array in same index order as Gv (natural order
+ of numpy.fft).
+
+ '''
+ if g.size == 0:
+ return cp.zeros_like(g)
+
+ g3d = cp.asarray(g).reshape(-1, *mesh)
+ assert (g3d.shape[0] == 1 or g[0].size == g3d[0].size)
+ f3d = cp.fft.ifftn(g3d, axes=(1,2,3))
+ ngrids = np.prod(mesh)
+ if g.ndim == 1 or (g.ndim == 3 and g.size == ngrids):
+ return f3d.ravel()
+ else:
+ return f3d.reshape(-1, ngrids)
+
+
+def fftk(f, mesh, expmikr):
+ r'''Perform the 3D FFT of a real-space function which is (periodic*e^{ikr}).
+
+ fk(k+G) = \sum_r fk(r) e^{-i(k+G)r} = \sum_r [fk(r)e^{-ikr}] e^{-iGr}
+ '''
+ return fft(f*expmikr, mesh)
+
+
+def ifftk(g, mesh, expikr):
+ r'''Perform the 3D inverse FFT of f(k+G) into a function which is (periodic*e^{ikr}).
+
+ fk(r) = (1/Ng) \sum_G fk(k+G) e^{i(k+G)r} = (1/Ng) \sum_G [fk(k+G)e^{iGr}] e^{ikr}
+ '''
+ return ifft(g, mesh) * expikr
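+
+# Normalization check (sketch, for a given mesh = [nx, ny, nz]): with fft
+# carrying factor 1 and ifft 1/N, the two transforms are mutual inverses on
+# the mesh:
+#
+#   f = cp.random.rand(int(np.prod(mesh)))
+#   assert cp.allclose(ifft(fft(f, mesh), mesh).real, f)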
diff --git a/gpu4pyscf/properties/shielding.py b/gpu4pyscf/properties/shielding.py
index 1ef5e844..ae98dc04 100644
--- a/gpu4pyscf/properties/shielding.py
+++ b/gpu4pyscf/properties/shielding.py
@@ -18,7 +18,7 @@
from pyscf.data import nist
from pyscf.scf import _vhf, jk
from gpu4pyscf.dft import numint
-from gpu4pyscf.lib.cupy_helper import contract, take_last2d, add_sparse
+from gpu4pyscf.lib.cupy_helper import contract, sandwich_dot, add_sparse
from gpu4pyscf.scf import cphf
def gen_vind(mf, mo_coeff, mo_occ):
@@ -37,23 +37,20 @@ def gen_vind(mf, mo_coeff, mo_occ):
mvir = mo_coeff[:, mo_occ == 0]
nocc = mocc.shape[1]
nvir = nmo - nocc
- omega, alpha, hyb = mf._numint.rsh_and_hybrid_coeff(
- mf.xc, spin=mf.mol.spin)
+ omega, alpha, hyb = mf._numint.rsh_and_hybrid_coeff(mf.xc, spin=mf.mol.spin)
+ # FIXME: check if hybrid
+ # FIXME: handle rsh
def fx(mo1):
mo1 = mo1.reshape(-1, nvir, nocc) # * the saving pattern
mo1_mo_real = contract('nai,ua->nui', mo1, mvir)
dm1 = 2*contract('nui,vi->nuv', mo1_mo_real, mocc.conj())
- dm1 -= dm1.transpose(0, 2, 1)
+ dm1 = dm1 - dm1.transpose(0, 2, 1)
if hasattr(mf,'with_df'):
- v1 = cupy.empty((3, nao, nao))
- for i in range(3):
- v1[i] =+mf.get_jk(mf.mol, dm1[i], hermi=2, with_j=False)[1]*0.5*hyb
+ vk = mf.get_jk(mf.mol, dm1, hermi=2, with_j=False)[1]
else:
- v1 = np.empty((3, nao, nao))
- for i in range(3):
- v1[i] = -jk.get_jk(mf.mol, dm1[i].get(), 'ijkl,jk->il')*0.5*hyb
- v1 = cupy.array(v1)
+ vk = cupy.array(jk.get_jk(mf.mol, dm1.get(), ['ijkl,jk->il']*3))
+ v1 = -.5*hyb * vk
tmp = contract('nuv,vi->nui', v1, mocc)
v1vo = contract('nui,ua->nai', tmp, mvir.conj())
@@ -68,7 +65,7 @@ def nr_rks(ni, mol, grids, xc_code, dms):
mo_coeff = getattr(dms, 'mo_coeff', None)
mo_occ = getattr(dms, 'mo_occ', None)
nao = mo_coeff.shape[1]
-
+
opt = getattr(ni, 'gdftopt', None)
if opt is None:
ni.build(mol, grids.coords)
@@ -77,9 +74,8 @@ def nr_rks(ni, mol, grids, xc_code, dms):
coeff = cupy.asarray(opt.coeff)
nao, nao0 = coeff.shape
- dms = cupy.asarray(dms).reshape(-1,nao0,nao0)
- dms = take_last2d(dms, opt.ao_idx)
- mo_coeff = mo_coeff[opt.ao_idx]
+ dms = sandwich_dot(cupy.asarray(dms).reshape(-1,nao0,nao0), coeff.T)
+ mo_coeff = coeff.dot(mo_coeff)
vmat = cupy.zeros((3, nao, nao))
if xctype == 'LDA':
@@ -100,7 +96,7 @@ def nr_rks(ni, mol, grids, xc_code, dms):
vtmp = contract('pu,p,vp->uv', giao_aux[idirect], wv, ao)
vtmp = cupy.ascontiguousarray(vtmp)
add_sparse(vmat[idirect], vtmp, index)
-
+
elif xctype == 'GGA':
wv = vxc * weight
giao = _sorted_mol.eval_gto('GTOval_ig', coords.get(), comp=3)
@@ -133,7 +129,7 @@ def nr_rks(ni, mol, grids, xc_code, dms):
ao = None
- vmat = take_last2d(vmat, opt.rev_ao_idx)
+ vmat = sandwich_dot(vmat, coeff)
if numint.FREE_CUPY_CACHE:
dms = None
@@ -164,8 +160,9 @@ def get_vxc(mf, dm0):
vk = None
vxc += vj
else:
- omega, alpha, hyb = mf._numint.rsh_and_hybrid_coeff(
- mf.xc, spin=mf.mol.spin)
+ omega, alpha, hyb = mf._numint.rsh_and_hybrid_coeff(mf.xc, spin=mf.mol.spin)
+ # FIXME: check if hybrid
+ # FIXME: handle rsh
vxc += vj - vk*hyb*0.5
return vxc
@@ -211,19 +208,16 @@ def eval_shielding(mf):
s1jk = -contract('xiq,qj->xij', tmp, mocc)*0.5
tmp = contract('nai,ua->nui', s1jk, mocc)
s1jkdm1 = contract('nui,vi->nuv', tmp, mocc.conj())*2
- s1jkdm1 -= s1jkdm1.transpose(0, 2, 1)
- omega, alpha, hyb = mf._numint.rsh_and_hybrid_coeff(
- mf.xc, spin=mf.mol.spin)
+ s1jkdm1 = s1jkdm1 - s1jkdm1.transpose(0, 2, 1)
+ omega, alpha, hyb = mf._numint.rsh_and_hybrid_coeff(mf.xc, spin=mf.mol.spin)
+ # FIXME: check if hybrid
+ # FIXME: handle rsh
+
if hasattr(mf,'with_df'):
- vk2 = cupy.empty((3, nao, nao))
- for i in range(3):
- vk2[i] = +mf.get_jk(mf.mol, s1jkdm1[i], hermi=2, with_j=False)[1]*0.5*hyb
-
+ vk = mf.get_jk(mf.mol, s1jkdm1, hermi=2, with_j=False)[1]
else:
- vk2 = np.empty((3, nao, nao))
- for i in range(3):
- vk2[i] = -jk.get_jk(mf.mol, s1jkdm1[i].get(), 'ijkl,jk->il')*0.5*hyb
- vk2 = cupy.array(vk2)
+ vk = cupy.array(jk.get_jk(mf.mol, s1jkdm1.get(), ['ijkl,jk->il']*3))
+ vk2 = -.5*hyb * vk
h1ao += vk2
tmp = contract('xuv,ua->xav', h1ao, mvir)
veff_ai = contract('xav,vi->xai', tmp, mocc)
diff --git a/gpu4pyscf/properties/tests/test_shielding.py b/gpu4pyscf/properties/tests/test_shielding.py
index e2415c80..0bbe9c07 100644
--- a/gpu4pyscf/properties/tests/test_shielding.py
+++ b/gpu4pyscf/properties/tests/test_shielding.py
@@ -135,4 +135,4 @@ def test_rks_b3lyp_df(self):
if __name__ == "__main__":
print("Full Tests for nmr shielding constants")
- unittest.main()
\ No newline at end of file
+ unittest.main()
diff --git a/gpu4pyscf/qmmm/chelpg.py b/gpu4pyscf/qmmm/chelpg.py
index 874ab513..c2c8b056 100644
--- a/gpu4pyscf/qmmm/chelpg.py
+++ b/gpu4pyscf/qmmm/chelpg.py
@@ -48,7 +48,7 @@ def _build_VHFOpt(intopt, cutoff=1e-14, group_size=None,
_, _, fake_uniq_l_ctr, fake_l_ctr_counts = int3c2e.sort_mol(fake_mol)
# sort auxiliary mol
- sorted_auxmol, sorted_aux_idx, aux_uniq_l_ctr, aux_l_ctr_counts = int3c2e.sort_mol(
+ sorted_auxmol, _, aux_uniq_l_ctr, aux_l_ctr_counts = int3c2e.sort_mol(
intopt.auxmol)
if group_size_aux is not None:
aux_uniq_l_ctr, aux_l_ctr_counts = int3c2e._split_l_ctr_groups(
@@ -88,10 +88,7 @@ def _build_VHFOpt(intopt, cutoff=1e-14, group_size=None,
ao_idx = np.array_split(np.arange(nao), cart_ao_loc[1:-1])
intopt.cart_ao_idx = np.hstack([ao_idx[i] for i in sorted_idx])
ncart = cart_ao_loc[-1]
- nsph = sph_ao_loc[-1]
- intopt.cart2sph = block_c2s_diag(ncart, nsph, intopt.angular, l_ctr_counts)
- inv_idx = np.argsort(intopt.sph_ao_idx, kind='stable').astype(np.int32)
- intopt.coeff = intopt.cart2sph[:, inv_idx]
+ intopt.cart2sph = block_c2s_diag(intopt.angular, l_ctr_counts)
# pairing auxiliary basis with fake basis set
fake_l_ctr_offsets = np.append(0, np.cumsum(fake_l_ctr_counts))
@@ -109,7 +106,6 @@ def _build_VHFOpt(intopt, cutoff=1e-14, group_size=None,
cart_aux_loc = intopt.auxmol.ao_loc_nr(cart=True)
sph_aux_loc = intopt.auxmol.ao_loc_nr(cart=False)
ncart = cart_aux_loc[-1]
- nsph = sph_aux_loc[-1]
# inv_idx = np.argsort(intopt.sph_aux_idx, kind='stable').astype(np.int32)
aux_l_ctr_offsets += fake_l_ctr_offsets[-1]
@@ -159,6 +155,13 @@ def _build_VHFOpt(intopt, cutoff=1e-14, group_size=None,
intopt.cp_idx, intopt.cp_jdx = np.unravel_index(
np.arange(ncptype), (nl, nl))
+ intopt._sorted_mol = sorted_mol
+ intopt._sorted_auxmol = sorted_auxmol
+ if intopt.mol.cart:
+ intopt._ao_idx = intopt.cart_ao_idx
+ else:
+ intopt._ao_idx = intopt.sph_ao_idx
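+    # _ao_idx records the AO ordering of the sorted Mole so that callers can
+    # map between the original and sorted AO bases (cf. the sort_orbitals/
+    # unsort_orbitals helpers used elsewhere in this change).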
+
def eval_chelpg_layer_gpu(mf, deltaR=0.3, Rhead=2.8, ifqchem=True, Rvdw=modified_Bondi, verbose=None):
"""Cal chelpg charge
diff --git a/gpu4pyscf/qmmm/pbc/itrf.py b/gpu4pyscf/qmmm/pbc/itrf.py
index f704133e..986ae2f2 100644
--- a/gpu4pyscf/qmmm/pbc/itrf.py
+++ b/gpu4pyscf/qmmm/pbc/itrf.py
@@ -1026,7 +1026,8 @@ def calculate_h1e(self, h1_gpu):
v = cp.zeros_like(g_qm)
for i0,i1,j0,j1,k0,k1,j3c in int3c2e.loop_int3c2e_general(intopt, ip_type='ip1'):
v[:,i0:i1,j0:j1] += contract('xkji,k->xij', j3c, charges[k0:k1])
- g_qm += cupy_helper.take_last2d(v, intopt.rev_ao_idx)
+ v = intopt.unsort_orbitals(v, axis=[1,2])
+            g_qm += v
elif mm_mol.charge_model == 'point' and len(coords) != 0:
max_memory = self.max_memory - lib.current_memory()[0]
blksize = int(min(max_memory*1e6/8/nao**2/3, 200))
@@ -1079,7 +1080,7 @@ def grad_hcore_mm(self, dm, mol=None):
intopt.build(self.base.direct_scf_tol, diag_block_with_triu=True, aosym=False,
group_size=int3c2e.BLKSIZE, group_size_aux=int3c2e.BLKSIZE)
- dm_ = cupy_helper.take_last2d(dm, intopt.sph_ao_idx)
+ dm_ = intopt.sort_orbitals(dm, axis=[0,1])
for i0,i1,j0,j1,k0,k1,j3c in int3c2e.loop_int3c2e_general(intopt, ip_type='ip2'):
j3c = contract('xkji,k->xkji', j3c, charges[k0:k1])
g_[k0:k1] += contract('xkji,ij->kx', j3c, dm_[i0:i1,j0:j1])
diff --git a/gpu4pyscf/scf/_response_functions.py b/gpu4pyscf/scf/_response_functions.py
index 6677cf6f..b86b0514 100644
--- a/gpu4pyscf/scf/_response_functions.py
+++ b/gpu4pyscf/scf/_response_functions.py
@@ -19,7 +19,7 @@
from gpu4pyscf.scf import hf, uhf
def _gen_rhf_response(mf, mo_coeff=None, mo_occ=None,
- singlet=None, hermi=0, max_memory=None):
+ singlet=None, hermi=0, grids=None, max_memory=None):
'''Generate a function to compute the product of RHF response function and
RHF density matrices.
@@ -31,24 +31,29 @@ def _gen_rhf_response(mf, mo_coeff=None, mo_occ=None,
if mo_coeff is None: mo_coeff = mf.mo_coeff
if mo_occ is None: mo_occ = mf.mo_occ
mol = mf.mol
+
if isinstance(mf, hf.KohnShamDFT):
+ if grids is None:
+ grids = mf.grids
+ if grids and grids.coords is None:
+ grids.build(mol=mol, with_non0tab=False, sort_grids=True)
ni = mf._numint
ni.libxc.test_deriv_order(mf.xc, 2, raise_error=True)
- if getattr(mf, 'nlc', '') != '':
+ if mf.do_nlc():
logger.warn(mf, 'NLC functional found in DFT object. Its second '
'deriviative is not available. Its contribution is '
'not included in the response function.')
omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, mol.spin)
- hybrid = abs(hyb) > 1e-10
+ hybrid = ni.libxc.is_hybrid_xc(mf.xc)
if singlet is None:
# for ground state orbital hessian
- rho0, vxc, fxc = ni.cache_xc_kernel(mol, mf.grids, mf.xc,
- mo_coeff, mo_occ, 0)
+ spin = 0
else:
- rho0, vxc, fxc = ni.cache_xc_kernel(mol, mf.grids, mf.xc,
- [mo_coeff]*2, [mo_occ*.5]*2, spin=1)
- dm0 = None #mf.make_rdm1(mo_coeff, mo_occ)
+ spin = 1
+ rho0, vxc, fxc = ni.cache_xc_kernel(
+ mol, grids, mf.xc, mo_coeff, mo_occ, spin, max_memory=max_memory)
+ dm0 = None
if singlet is None:
# Without specify singlet, used in ground state orbital hessian
@@ -57,9 +62,9 @@ def vind(dm1):
if hermi == 2:
v1 = cupy.zeros_like(dm1)
else:
- v1 = ni.nr_rks_fxc(mol, mf.grids, mf.xc, dm0, dm1, 0, hermi,
+ v1 = ni.nr_rks_fxc(mol, grids, mf.xc, dm0, dm1, 0, hermi,
rho0, vxc, fxc, max_memory=max_memory)
- if hybrid or abs(alpha) > 1e-10:
+ if hybrid:
if hermi != 2:
vj, vk = mf.get_jk(mol, dm1, hermi=hermi)
vk *= hyb
@@ -71,8 +76,45 @@ def vind(dm1):
elif hermi != 2:
v1 += mf.get_j(mol, dm1, hermi=hermi)
return v1
- else:
- raise NotImplementedError('only singlet response is supported!')
+
+ elif singlet:
+ fxc *= .5
+ def vind(dm1):
+ if hermi == 2:
+ v1 = cupy.zeros_like(dm1)
+ else:
+                # nr_rks_fxc_st expects the alpha component of dm1; the .5
+                # factor is folded into fxc above instead of scaling dm1
+ v1 = ni.nr_rks_fxc_st(mol, grids, mf.xc, dm0, dm1, 0, True,
+ rho0, vxc, fxc, max_memory=max_memory)
+ if hybrid:
+ if hermi != 2:
+ vj, vk = mf.get_jk(mol, dm1, hermi=hermi)
+ vk *= hyb
+ if abs(omega) > 1e-10: # For range separated Coulomb
+ vk += mf.get_k(mol, dm1, hermi, omega) * (alpha-hyb)
+ v1 += vj - .5 * vk
+ else:
+ v1 -= .5 * hyb * mf.get_k(mol, dm1, hermi=hermi)
+ elif hermi != 2:
+ v1 += mf.get_j(mol, dm1, hermi=hermi)
+ return v1
+
+ else: # triplet
+ fxc *= .5
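+        # Only the exchange term contributes beyond fxc for triplet
+        # perturbations: the Coulomb kernel is spin-independent and cancels
+        # between the spin channels, so vind below adds no vj term.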
+ def vind(dm1):
+ if hermi == 2:
+ v1 = cupy.zeros_like(dm1)
+ else:
+                # nr_rks_fxc_st expects the alpha component of dm1; the .5
+                # factor is folded into fxc above instead of scaling dm1
+ v1 = ni.nr_rks_fxc_st(mol, grids, mf.xc, dm0, dm1, 0, False,
+ rho0, vxc, fxc, max_memory=max_memory)
+ if hybrid:
+ vk = mf.get_k(mol, dm1, hermi=hermi)
+ vk *= hyb
+ if abs(omega) > 1e-10: # For range separated Coulomb
+ vk += mf.get_k(mol, dm1, hermi, omega) * (alpha-hyb)
+ v1 += -.5 * vk
+ return v1
else: # HF
if (singlet is None or singlet) and hermi != 2:
@@ -87,7 +129,7 @@ def vind(dm1):
def _gen_uhf_response(mf, mo_coeff=None, mo_occ=None,
- with_j=True, hermi=0, max_memory=None):
+ with_j=True, hermi=0, grids=None, max_memory=None):
'''Generate a function to compute the product of UHF response function and
UHF density matrices.
'''
@@ -96,6 +138,10 @@ def _gen_uhf_response(mf, mo_coeff=None, mo_occ=None,
if mo_occ is None: mo_occ = mf.mo_occ
mol = mf.mol
if isinstance(mf, hf.KohnShamDFT):
+ if grids is None:
+ grids = mf.grids
+ if grids and grids.coords is None:
+ grids.build(mol=mol, with_non0tab=False, sort_grids=True)
ni = mf._numint
ni.libxc.test_deriv_order(mf.xc, 2, raise_error=True)
if mf.do_nlc():
@@ -105,19 +151,15 @@ def _gen_uhf_response(mf, mo_coeff=None, mo_occ=None,
omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, mol.spin)
hybrid = ni.libxc.is_hybrid_xc(mf.xc)
- rho0, vxc, fxc = ni.cache_xc_kernel(mol, mf.grids, mf.xc,
+ rho0, vxc, fxc = ni.cache_xc_kernel(mol, grids, mf.xc,
mo_coeff, mo_occ, 1)
dm0 = None
- if max_memory is None:
- mem_now = lib.current_memory()[0]
- max_memory = max(2000, mf.max_memory*.8-mem_now)
-
def vind(dm1):
if hermi == 2:
v1 = cupy.zeros_like(dm1)
else:
- v1 = ni.nr_uks_fxc(mol, mf.grids, mf.xc, dm0, dm1, 0, hermi,
+ v1 = ni.nr_uks_fxc(mol, grids, mf.xc, dm0, dm1, 0, hermi,
rho0, vxc, fxc, max_memory=max_memory)
if not hybrid:
if with_j:
diff --git a/gpu4pyscf/scf/hf.py b/gpu4pyscf/scf/hf.py
index b84d0a58..a069d89b 100644
--- a/gpu4pyscf/scf/hf.py
+++ b/gpu4pyscf/scf/hf.py
@@ -25,13 +25,14 @@
from pyscf.scf import hf
from pyscf.scf import chkfile
from gpu4pyscf import lib
+from gpu4pyscf.lib import utils
from gpu4pyscf.lib.cupy_helper import eigh, tag_array, return_cupy_array, cond
from gpu4pyscf.scf import diis, jk
from gpu4pyscf.lib import logger
__all__ = [
'get_jk', 'get_occ', 'get_grad', 'damping', 'level_shift', 'get_fock',
- 'energy_elec', 'RHF'
+ 'energy_elec', 'RHF', 'SCF'
]
def get_jk(mol, dm, hermi=1, vhfopt=None, with_j=True, with_k=True, omega=None,
@@ -238,33 +239,13 @@ def _kernel(mf, conv_tol=1e-10, conv_tol_grad=None,
scf_conv = True
break
- if(cycle == mf.max_cycle):
- logger.warn("SCF failed to converge")
+ if (cycle + 1 == mf.max_cycle):
+ assert not scf_conv
+ logger.warn(mf, "SCF failed to converge")
return scf_conv, e_tot, mo_energy, mo_coeff, mo_occ
-def _quad_moment(mf, mol=None, dm=None, unit='Debye-Ang'):
- from pyscf.data import nist
- if mol is None: mol = mf.mol
- if dm is None: dm = mf.make_rdm1()
- nao = mol.nao
- with mol.with_common_orig((0,0,0)):
- ao_quad = mol.intor_symmetric('int1e_rr').reshape(3,3,nao,nao)
-
- el_quad = np.einsum('xyij,ji->xy', ao_quad, dm).real
-
- # Nuclear contribution
- charges = mol.atom_charges()
- coords = mol.atom_coords()
- nucl_quad = np.einsum('i,ix,iy->xy', charges, coords, coords)
-
- mol_quad = nucl_quad - el_quad
-
- if unit.upper() == 'DEBYE-ANG':
- mol_quad *= nist.AU2DEBYE * nist.BOHR
- return mol_quad
-
def energy_tot(mf, dm=None, h1e=None, vhf=None):
r'''Total Hartree-Fock energy, electronic part plus nuclear repulstion
See :func:`scf.hf.energy_elec` for the electron part
@@ -310,6 +291,27 @@ def scf(mf, dm0=None, **kwargs):
mf._finalize()
return mf.e_tot
+def canonicalize(mf, mo_coeff, mo_occ, fock=None):
+ '''Canonicalization diagonalizes the Fock matrix within occupied, open,
+    virtual subspaces separately (without changing the occupancies).
+ '''
+ if fock is None:
+ dm = mf.make_rdm1(mo_coeff, mo_occ)
+ fock = mf.get_fock(dm=dm)
+ coreidx = mo_occ == 2
+ viridx = mo_occ == 0
+ openidx = ~(coreidx | viridx)
+ mo = cupy.empty_like(mo_coeff)
+ mo_e = cupy.empty(mo_occ.size)
+ for idx in (coreidx, openidx, viridx):
+ if cupy.any(idx) > 0:
+ orb = mo_coeff[:,idx]
+ f1 = orb.conj().T.dot(fock).dot(orb)
+ e, c = cupy.linalg.eigh(f1)
+ mo[:,idx] = orb.dot(c)
+ mo_e[idx] = e
+ return mo_e, mo
+
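+# Usage sketch (illustrative): after convergence,
+#   mo_e, mo = canonicalize(mf, mf.mo_coeff, mf.mo_occ)
+# re-diagonalizes the converged Fock matrix block-wise; occupancies and the
+# occupied/open/virtual partition are left untouched.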
def as_scanner(mf):
if isinstance(mf, pyscf_lib.SinglePointScanner):
return mf
@@ -354,9 +356,10 @@ class SCF(pyscf_lib.StreamObject):
conv_tol_grad = hf.SCF.conv_tol_grad
max_cycle = hf.SCF.max_cycle
init_guess = hf.SCF.init_guess
+ conv_tol_cpscf = 1e-4
disp = None
- DIIS = hf.SCF.DIIS
+ DIIS = diis.SCF_DIIS
diis = hf.SCF.diis
diis_space = hf.SCF.diis_space
diis_damp = hf.SCF.diis_damp
@@ -410,9 +413,11 @@ def check_sanity(self):
build = hf.SCF.build
opt = NotImplemented
dump_flags = hf.SCF.dump_flags
- get_fock = hf.SCF.get_fock
- get_occ = hf.SCF.get_occ
- get_grad = hf.SCF.get_grad
+ get_hcore = return_cupy_array(hf.SCF.get_hcore)
+ get_ovlp = return_cupy_array(hf.SCF.get_ovlp)
+ get_fock = get_fock
+ get_occ = get_occ
+ get_grad = staticmethod(get_grad)
dump_chk = hf.SCF.dump_chk
init_guess_by_minao = hf.SCF.init_guess_by_minao
init_guess_by_atom = hf.SCF.init_guess_by_atom
@@ -421,41 +426,65 @@ def check_sanity(self):
init_guess_by_1e = hf.SCF.init_guess_by_1e
init_guess_by_chkfile = hf.SCF.init_guess_by_chkfile
from_chk = hf.SCF.from_chk
- get_init_guess = hf.SCF.get_init_guess
- make_rdm1 = hf.SCF.make_rdm1
- make_rdm2 = hf.SCF.make_rdm2
- energy_elec = hf.SCF.energy_elec
- energy_tot = hf.SCF.energy_tot
+ get_init_guess = return_cupy_array(hf.SCF.get_init_guess)
+ make_rdm1 = make_rdm1
+ make_rdm2 = NotImplemented
+ energy_elec = energy_elec
+ energy_tot = energy_tot
energy_nuc = hf.SCF.energy_nuc
check_convergence = None
_eigh = staticmethod(eigh)
eig = hf.SCF.eig
do_disp = hf.SCF.do_disp
get_dispersion = hf.SCF.get_dispersion
-
- scf = hf.SCF.scf
+ kernel = scf = scf
as_scanner = hf.SCF.as_scanner
_finalize = hf.SCF._finalize
init_direct_scf = hf.SCF.init_direct_scf
- get_jk = hf.SCF.get_jk
+ get_jk = _get_jk
get_j = hf.SCF.get_j
get_k = hf.SCF.get_k
- get_veff = hf.SCF.get_veff
- analyze = hf.SCF.analyze
+ get_veff = NotImplemented
mulliken_meta = hf.SCF.mulliken_meta
pop = hf.SCF.pop
- dip_moment = hf.SCF.dip_moment
_is_mem_enough = NotImplemented
density_fit = NotImplemented
- sfx2c1e = NotImplemented
- x2c1e = NotImplemented
- x2c = NotImplemented
newton = NotImplemented
- remove_soscf = NotImplemented
+ x2c = x2c1e = sfx2c1e = NotImplemented
stability = NotImplemented
nuc_grad_method = NotImplemented
update_ = NotImplemented
+ canonicalize = NotImplemented
istype = hf.SCF.istype
+ to_rhf = NotImplemented
+ to_uhf = NotImplemented
+ to_ghf = NotImplemented
+ to_rks = NotImplemented
+ to_uks = NotImplemented
+ to_gks = NotImplemented
+ to_ks = NotImplemented
+ canonicalize = NotImplemented
+ mulliken_pop = NotImplemented
+ mulliken_meta = NotImplemented
+
+ def dip_moment(self, mol=None, dm=None, unit='Debye', origin=None,
+ verbose=logger.NOTE):
+ if mol is None: mol = self.mol
+ if dm is None: dm = self.make_rdm1()
+ return hf.dip_moment(mol, dm.get(), unit, origin, verbose)
+
+ def quad_moment(self, mol=None, dm=None, unit='DebyeAngstrom', origin=None,
+ verbose=logger.NOTE):
+ if mol is None: mol = self.mol
+ if dm is None: dm = self.make_rdm1()
+ return hf.quad_moment(mol, dm.get(), unit, origin, verbose)
+
+    def remove_soscf(self):
+        logger.warn(self, 'remove_soscf has no effect in the current version')
+        return self
+
+ def analyze(self, *args, **kwargs):
+ return self.to_cpu().analyze()
def reset(self, mol=None):
if mol is not None:
@@ -469,7 +498,6 @@ class KohnShamDFT:
A mock DFT base class, to be compatible with PySCF
'''
-from gpu4pyscf.lib import utils
class RHF(SCF):
to_gpu = utils.to_gpu
@@ -477,42 +505,8 @@ class RHF(SCF):
_keys = {'e_disp', 'h1e', 's1e', 'e_mf', 'conv_tol_cpscf', 'disp_with_3body'}
- conv_tol_cpscf = 1e-4
- DIIS = diis.SCF_DIIS
- get_jk = _get_jk
- _eigh = staticmethod(eigh)
- make_rdm1 = make_rdm1
- energy_elec = energy_elec
- get_fock = get_fock
- get_occ = get_occ
get_veff = get_veff
- get_grad = staticmethod(get_grad)
- quad_moment = _quad_moment
- energy_tot = energy_tot
-
- get_hcore = return_cupy_array(hf.RHF.get_hcore)
- get_ovlp = return_cupy_array(hf.RHF.get_ovlp)
- get_init_guess = return_cupy_array(hf.RHF.get_init_guess)
- init_direct_scf = NotImplemented
- make_rdm2 = NotImplemented
- newton = NotImplemented
- x2c = x2c1e = sfx2c1e = NotImplemented
- to_rhf = NotImplemented
- to_uhf = NotImplemented
- to_ghf = NotImplemented
- to_rks = NotImplemented
- to_uks = NotImplemented
- to_gks = NotImplemented
- to_ks = NotImplemented
- canonicalize = NotImplemented
- # TODO: Enable followings after testing
- analyze = NotImplemented
- stability = NotImplemented
- mulliken_pop = NotImplemented
- mulliken_meta = NotImplemented
-
- scf = scf
- kernel = scf
+ canonicalize = canonicalize
def check_sanity(self):
mol = self.mol
@@ -529,6 +523,10 @@ def density_fit(self, auxbasis=None, with_df=None, only_dfj=False):
import gpu4pyscf.df.df_jk
return gpu4pyscf.df.df_jk.density_fit(self, auxbasis, with_df, only_dfj)
+ def newton(self):
+ from gpu4pyscf.scf.soscf import newton
+ return newton(self)
+
def to_cpu(self):
mf = hf.RHF(self.mol)
utils.to_cpu(self, out=mf)
diff --git a/gpu4pyscf/scf/hf_symm.py b/gpu4pyscf/scf/hf_symm.py
new file mode 100644
index 00000000..486c02fd
--- /dev/null
+++ b/gpu4pyscf/scf/hf_symm.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python
+#
+# Copyright 2024 The PySCF Developers. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+
+from gpu4pyscf.scf.hf import RHF
+from gpu4pyscf.scf.rohf import ROHF
+
+SymAdaptedRHF = RHF
+SymAdaptedROHF = ROHF
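+
+# GPU4PySCF does not implement point-group symmetry adaptation; the aliases
+# above map the symmetry-adapted classes onto the plain RHF/ROHF drivers,
+# presumably so that code paths expecting scf.hf_symm classes keep working.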
diff --git a/gpu4pyscf/scf/int2c2e.py b/gpu4pyscf/scf/int2c2e.py
index 8ec1564d..0dbc8730 100644
--- a/gpu4pyscf/scf/int2c2e.py
+++ b/gpu4pyscf/scf/int2c2e.py
@@ -33,7 +33,7 @@ def get_int2c2e_sorted(mol, intopt=None, direct_scf_tol=1e-13, aosym=None, omega
nao = mol.nao
rows, cols = np.tril_indices(nao)
- nao_cart = intopt.mol.nao
+ nao_cart = intopt._sorted_mol.nao
norb_cart = nao_cart + 1
int2c = cupy.zeros([nao_cart, nao_cart], order='F')
@@ -137,5 +137,5 @@ def get_int2c2e(mol, direct_scf_tol=1e-13):
intopt = VHFOpt(mol, mol, 'int2e')
intopt.build(direct_scf_tol, diag_block_with_triu=True, aosym=True)
int2c = get_int2c2e_sorted(mol, intopt=intopt)
- int2c = take_last2d(int2c, intopt.rev_ao_idx)
+ int2c = intopt.unsort_orbitals(int2c, axis=[0,1])
return int2c
diff --git a/gpu4pyscf/scf/j_engine.py b/gpu4pyscf/scf/j_engine.py
new file mode 100644
index 00000000..7ec884b8
--- /dev/null
+++ b/gpu4pyscf/scf/j_engine.py
@@ -0,0 +1,230 @@
+#!/usr/bin/env python
+#
+# Copyright 2024 The PySCF Developers. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+
+'''
+J engine using the McMurchie-Davidson algorithm
+'''
+
+import ctypes
+import functools
+import math
+import numpy as np
+import cupy as cp
+import scipy.linalg
+from pyscf import lib
+from pyscf import __config__
+from gpu4pyscf.lib.cupy_helper import load_library, condense, sandwich_dot, transpose_sum
+from gpu4pyscf.__config__ import props as gpu_specs
+from gpu4pyscf.lib import logger
+from gpu4pyscf.scf import jk
+from gpu4pyscf.scf.jk import _make_j_engine_pair_locs, RysIntEnvVars
+
+__all__ = [
+ 'get_j',
+]
+
+PTR_BAS_COORD = 7
+LMAX = 4
+SHM_SIZE = getattr(__config__, 'GPU_SHM_SIZE',
+ int(gpu_specs['sharedMemPerBlockOptin']//9)*8)
+THREADS = 256
+
+libvhf_md = load_library('libgvhf_md')
+libvhf_md.MD_build_j.restype = ctypes.c_int
+
+def get_j(mol, dm, hermi=1, vhfopt=None, omega=None, verbose=None):
+ '''Compute J matrix
+ '''
+ log = logger.new_logger(mol, verbose)
+ cput0 = log.init_timer()
+ if vhfopt is None:
+ with mol.with_range_coulomb(omega):
+ vhfopt = _VHFOpt(mol).build()
+ if omega is None:
+ omega = mol.omega
+
+ mol = vhfopt.mol
+ nbas = mol.nbas
+ nao, nao_orig = vhfopt.coeff.shape
+ dm = cp.asarray(dm, order='C')
+ dms = dm.reshape(-1,nao_orig,nao_orig)
+ n_dm = dms.shape[0]
+ assert n_dm == 1
+ #:dms = cp.einsum('pi,nij,qj->npq', vhfopt.coeff, dms, vhfopt.coeff)
+ dms = sandwich_dot(dms, vhfopt.coeff.T)
+ dms = cp.asarray(dms, order='C')
+ if hermi != 1:
+ dms = transpose_sum(dms)
+ else:
+ dms *= 2.
+
+ ao_loc = mol.ao_loc
+ dm_cond = cp.log(condense('absmax', dms, ao_loc) + 1e-300).astype(np.float32)
+ log_max_dm = dm_cond.max()
+ log_cutoff = math.log(vhfopt.direct_scf_tol)
+
+ dms = dms.get()
+ pair_loc = _make_j_engine_pair_locs(mol)
+ dm_xyz = np.zeros(pair_loc[-1])
+    # Must use this modified _env to ensure consistency with the GPU kernel.
+    # In this _env, normalization coefficients for s and p functions are scaled.
+ _env = vhfopt._mol_gpu[2].get()
+ libvhf_md.Et_dot_dm(
+ dm_xyz.ctypes, dms.ctypes, ao_loc.ctypes, pair_loc.ctypes,
+ mol._bas.ctypes, ctypes.c_int(mol.nbas), _env.ctypes)
+ dm_xyz = cp.asarray(dm_xyz)
+ vj_xyz = cp.zeros_like(dm_xyz)
+
+ pair_loc_on_gpu = cp.asarray(pair_loc)
+ rys_envs = RysIntEnvVars(
+ mol.natm, mol.nbas,
+ vhfopt.rys_envs.atm, vhfopt.rys_envs.bas, vhfopt.rys_envs.env,
+ pair_loc_on_gpu.data.ptr,
+ )
+
+ libvhf_md.init_mdj_constant(ctypes.c_int(SHM_SIZE))
+
+ uniq_l_ctr = vhfopt.uniq_l_ctr
+ uniq_l = uniq_l_ctr[:,0]
+ l_ctr_bas_loc = vhfopt.l_ctr_offsets
+ l_symb = [lib.param.ANGULAR[i] for i in uniq_l]
+ n_groups = len(uniq_l_ctr)
+ tile_mappings = {}
+ workers = gpu_specs['multiProcessorCount']
+ info = cp.empty(2, dtype=np.uint32)
+
+ for i in range(n_groups):
+ for j in range(i+1):
+ ish0, ish1 = l_ctr_bas_loc[i], l_ctr_bas_loc[i+1]
+ jsh0, jsh1 = l_ctr_bas_loc[j], l_ctr_bas_loc[j+1]
+ ij_shls = (ish0, ish1, jsh0, jsh1)
+ sub_q = vhfopt.q_cond[ish0:ish1,jsh0:jsh1]
+            mask = sub_q > log_cutoff  # - log_max_dm
+ if i == j:
+ mask = cp.tril(mask)
+ t_ij = (cp.arange(ish0, ish1, dtype=np.int32)[:,None] * nbas +
+ cp.arange(jsh0, jsh1, dtype=np.int32))
+ idx = cp.argsort(sub_q[mask])[::-1]
+ tile_mappings[i,j] = t_ij[mask][idx]
+ t1 = t2 = log.timer_debug1('q_cond and dm_cond', *cput0)
+
+ timing_collection = {}
+ kern_counts = 0
+ kern = libvhf_md.MD_build_j
+
+ for i in range(n_groups):
+ for j in range(i+1):
+ ij_shls = (l_ctr_bas_loc[i], l_ctr_bas_loc[i+1],
+ l_ctr_bas_loc[j], l_ctr_bas_loc[j+1])
+ tile_ij_mapping = tile_mappings[i,j]
+ for k in range(i+1):
+ for l in range(k+1):
+ if i == k and j < l: continue
+ llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})'
+ kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1],
+ l_ctr_bas_loc[l], l_ctr_bas_loc[l+1])
+ tile_kl_mapping = tile_mappings[k,l]
+ scheme = _md_j_engine_quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]])
+ err = kern(
+ ctypes.cast(vj_xyz.data.ptr, ctypes.c_void_p),
+ ctypes.cast(dm_xyz.data.ptr, ctypes.c_void_p),
+ ctypes.c_int(n_dm), ctypes.c_int(nao),
+ rys_envs, (ctypes.c_int*3)(*scheme),
+ (ctypes.c_int*8)(*ij_shls, *kl_shls),
+ ctypes.c_int(tile_ij_mapping.size),
+ ctypes.c_int(tile_kl_mapping.size),
+ ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p),
+ ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p),
+ ctypes.cast(vhfopt.tile_q_cond.data.ptr, ctypes.c_void_p),
+ ctypes.cast(vhfopt.q_cond.data.ptr, ctypes.c_void_p),
+ lib.c_null_ptr(),
+ ctypes.c_float(log_cutoff-log_max_dm),
+ ctypes.cast(info.data.ptr, ctypes.c_void_p),
+ ctypes.c_int(workers), ctypes.c_double(omega),
+ mol._atm.ctypes, ctypes.c_int(mol.natm),
+ mol._bas.ctypes, ctypes.c_int(mol.nbas), _env.ctypes)
+ if err != 0:
+ raise RuntimeError(f'RYS_build_jk kernel for {llll} failed')
+ if log.verbose >= logger.DEBUG1:
+ ntasks = tile_ij_mapping.size * tile_kl_mapping.size
+ t1, t1p = log.timer_debug1(f'processing {llll}, tasks ~= {ntasks}', *t1), t1
+ if llll not in timing_collection:
+ timing_collection[llll] = 0
+ timing_collection[llll] += t1[1] - t1p[1]
+ kern_counts += 1
+
+ if log.verbose >= logger.DEBUG1:
+ log.debug1('kernel launches %d', kern_counts)
+ for llll, t in timing_collection.items():
+ log.debug1('%s wall time %.2f', llll, t)
+ cp.cuda.Stream.null.synchronize()
+ log.timer_debug1('cuda kernel', *t2)
+
+ vj_xyz = vj_xyz.get()
+ vj = np.zeros_like(dms)
+ libvhf_md.jengine_dot_Et(
+ vj.ctypes, vj_xyz.ctypes, ao_loc.ctypes, pair_loc.ctypes,
+ mol._bas.ctypes, ctypes.c_int(mol.nbas), _env.ctypes)
+ #:vj = cp.einsum('pi,npq,qj->nij', vhfopt.coeff, cp.asarray(vj), vhfopt.coeff)
+ vj = sandwich_dot(vj, vhfopt.coeff)
+ vj = transpose_sum(vj)
+ vj = vj.reshape(dm.shape)
+ log.timer('vj', *cput0)
+ return vj
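+
+# Minimal usage sketch (illustrative; assumes a built pyscf Mole `mol` and a
+# symmetric density matrix `dm`):
+#   from gpu4pyscf.scf import j_engine
+#   vj = j_engine.get_j(mol, dm)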
+
+class _VHFOpt(jk._VHFOpt):
+ def __init__(self, mol, cutoff=1e-13):
+ self.mol, self.coeff = mol.decontract_basis(to_cart=True, aggregate=True)
+ self.direct_scf_tol = cutoff
+ self.uniq_l_ctr = None
+ self.l_ctr_offsets = None
+ self.q_cond = None
+ self.tile_q_cond = None
+ self.tile = 1
+
+def _md_j_engine_quartets_scheme(mol, l_ctr_pattern, shm_size=SHM_SIZE):
+ ls = l_ctr_pattern[:,0]
+ li, lj, lk, ll = ls
+ order = li + lj + lk + ll
+ lij = li + lj
+ lkl = lk + ll
+ nf3ij = (lij+1)*(lij+2)*(lij+3)//6
+ nf3kl = (lkl+1)*(lkl+2)*(lkl+3)//6
+ unit = order+1 + (order+1)*(order+2)*(2*order+3)//6
+ counts = shm_size // (unit*8)
+ if counts >= THREADS:
+ nsq = THREADS
+ else:
+ nsq = _nearest_power2(counts)
+ ij = _nearest_power2(int(nsq**.5))
+ kl = nsq // ij
+ tilex, tiley = 2, 4
+ cache_size = ij*tilex * (4+nf3ij) + kl*tiley * (4+nf3kl)
+ while (nsq * unit + cache_size) * 8 > shm_size:
+ nsq //= 2
+ ij = _nearest_power2(int(nsq**.5))
+ kl = nsq // ij
+ cache_size = ij*tilex * (4+nf3ij) + kl*tiley * (4+nf3kl)
+ gout_stride = THREADS // nsq
+ return ij, kl, gout_stride
+
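+# _nearest_power2 rounds down to the nearest power of two (5 -> 4, 8 -> 8);
+# it is used above to keep the thread tiling at power-of-two sizes.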
+def _nearest_power2(n):
+ t = 0
+ while n > 1:
+ n >>= 1
+ t += 1
+ return 2**t
diff --git a/gpu4pyscf/scf/jk.py b/gpu4pyscf/scf/jk.py
index 13b19277..939ba956 100644
--- a/gpu4pyscf/scf/jk.py
+++ b/gpu4pyscf/scf/jk.py
@@ -1,3 +1,24 @@
+#!/usr/bin/env python
+#
+# Copyright 2024 The PySCF Developers. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+
+'''
+Compute J/K matrices
+'''
+
import ctypes
import math
import numpy as np
diff --git a/gpu4pyscf/scf/rohf.py b/gpu4pyscf/scf/rohf.py
index 9e80a93b..67153195 100644
--- a/gpu4pyscf/scf/rohf.py
+++ b/gpu4pyscf/scf/rohf.py
@@ -15,29 +15,76 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see .
+from functools import reduce
import numpy as np
import cupy
-from pyscf.scf import rohf
+from pyscf.scf import rohf as rohf_cpu
from gpu4pyscf.scf import hf, uhf
-from gpu4pyscf.lib.cupy_helper import tag_array
+from gpu4pyscf.lib.cupy_helper import tag_array, contract
-class ROHF(rohf.ROHF, hf.RHF):
+def get_roothaan_fock(focka_fockb, dma_dmb, s):
+ '''Roothaan's effective fock.
+ Ref. http://www-theor.ch.cam.ac.uk/people/ross/thesis/node15.html
+
+ ======== ======== ====== =========
+ space closed open virtual
+ ======== ======== ====== =========
+ closed Fc Fb Fc
+ open Fb Fc Fa
+ virtual Fc Fa Fc
+ ======== ======== ====== =========
+
+ where Fc = (Fa + Fb) / 2
+
+ Returns:
+ Roothaan effective Fock matrix
+ '''
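+    # pc, po and pv below project onto the closed, open and virtual spaces;
+    # the six dot-product terms realize the table above block by block.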
+ nao = s.shape[0]
+ focka, fockb = focka_fockb
+ dma, dmb = dma_dmb
+ fc = (focka + fockb) * .5
+# Projector for core, open-shell, and virtual
+ pc = cupy.dot(dmb, s)
+ po = cupy.dot(dma-dmb, s)
+ pv = cupy.eye(nao) - cupy.dot(dma, s)
+ fock = reduce(cupy.dot, (pc.conj().T, fc, pc)) * .5
+ fock += reduce(cupy.dot, (po.conj().T, fc, po)) * .5
+ fock += reduce(cupy.dot, (pv.conj().T, fc, pv)) * .5
+ fock += reduce(cupy.dot, (po.conj().T, fockb, pc))
+ fock += reduce(cupy.dot, (po.conj().T, focka, pv))
+ fock += reduce(cupy.dot, (pv.conj().T, fc, pc))
+ fock = fock + fock.conj().T
+ fock = tag_array(fock, focka=focka, fockb=fockb)
+ return fock
+
+def canonicalize(mf, mo_coeff, mo_occ, fock=None):
+ '''Canonicalization diagonalizes the Fock matrix within occupied, open,
+    virtual subspaces separately (without changing the occupancies).
+ '''
+ if getattr(fock, 'focka', None) is None:
+ dm = mf.make_rdm1(mo_coeff, mo_occ)
+ fock = mf.get_fock(dm=dm)
+ mo_e, mo_coeff = hf.canonicalize(mf, mo_coeff, mo_occ, fock)
+ fa, fb = fock.focka, fock.fockb
+ mo_ea = contract('pi,pi->i', mo_coeff.conj(), fa.dot(mo_coeff)).real
+ mo_eb = contract('pi,pi->i', mo_coeff.conj(), fb.dot(mo_coeff)).real
+ mo_e = tag_array(mo_e, mo_ea=mo_ea, mo_eb=mo_eb)
+ return mo_e, mo_coeff
+
+class ROHF(hf.RHF):
from gpu4pyscf.lib.utils import to_cpu, to_gpu, device
+ nelec = rohf_cpu.ROHF.nelec
get_jk = hf._get_jk
- _eigh = hf.RHF._eigh
+ _eigh = staticmethod(hf.eigh)
scf = kernel = hf.RHF.kernel
# FIXME: Needs more tests for get_fock and get_occ
- get_fock = hf.return_cupy_array(rohf.ROHF.get_fock)
- get_occ = hf.return_cupy_array(rohf.ROHF.get_occ)
+ get_occ = hf.return_cupy_array(rohf_cpu.ROHF.get_occ)
get_hcore = hf.RHF.get_hcore
get_ovlp = hf.RHF.get_ovlp
get_init_guess = uhf.UHF.get_init_guess
- make_rdm1 = hf.return_cupy_array(rohf.ROHF.make_rdm1)
make_rdm2 = NotImplemented
- dump_chk = NotImplemented
- newton = NotImplemented
x2c = x2c1e = sfx2c1e = NotImplemented
to_rhf = NotImplemented
to_uhf = NotImplemented
@@ -46,18 +93,83 @@ class ROHF(rohf.ROHF, hf.RHF):
to_uks = NotImplemented
to_gks = NotImplemented
to_ks = NotImplemented
- canonicalize = NotImplemented
analyze = NotImplemented
stability = NotImplemented
mulliken_pop = NotImplemented
mulliken_meta = NotImplemented
nuc_grad_method = NotImplemented
+ canonicalize = canonicalize
+
+ def make_rdm1(self, mo_coeff, mo_occ, **kwargs):
+ '''One-particle density matrix. mo_occ is a 1D array, with occupancy 1 or 2.
+ '''
+ if isinstance(mo_occ, cupy.ndarray) and mo_occ.ndim == 1:
+ mo_occa = (mo_occ > 0).astype(np.double)
+ mo_occb = (mo_occ ==2).astype(np.double)
+ else:
+ mo_occa, mo_occb = mo_occ
+ dm_a = cupy.dot(mo_coeff*mo_occa, mo_coeff.conj().T)
+ dm_b = cupy.dot(mo_coeff*mo_occb, mo_coeff.conj().T)
+ return tag_array((dm_a, dm_b), mo_coeff=mo_coeff, mo_occ=mo_occ)
+
+ def eig(self, fock, s):
+ e, c = self._eigh(fock, s)
+ if getattr(fock, 'focka', None) is not None:
+ mo_ea = contract('pi,pi->i', c.conj(), fock.focka.dot(c)).real
+ mo_eb = contract('pi,pi->i', c.conj(), fock.fockb.dot(c)).real
+ e = tag_array(e, mo_ea=mo_ea, mo_eb=mo_eb)
+ return e, c
+
+ def energy_elec(self, dm=None, h1e=None, vhf=None):
+ if dm is None: dm = self.make_rdm1()
+ elif isinstance(dm, cupy.ndarray) and dm.ndim == 2:
+ dm = [dm*.5, dm*.5]
+ return uhf.energy_elec(self, dm, h1e, vhf)
+
+ def get_fock(self, h1e=None, s1e=None, vhf=None, dm=None, cycle=-1, diis=None,
+ diis_start_cycle=None, level_shift_factor=None, damp_factor=None,
+ fock_last=None):
+        '''Build the Fock matrix based on Roothaan's effective Fock.
+ See also :func:`get_roothaan_fock`
+ '''
+ if h1e is None: h1e = self.get_hcore()
+ if s1e is None: s1e = self.get_ovlp()
+ if vhf is None: vhf = self.get_veff(self.mol, dm)
+ if dm is None: dm = self.make_rdm1()
+ if isinstance(dm, cupy.ndarray) and dm.ndim == 2:
+ dm = [dm*.5, dm*.5]
+# To get orbital energies in get_occ, we save the alpha and beta Fock
+# matrices, because the Roothaan effective Fock cannot provide correct
+# orbital energies through `eig`.
+# TODO: check the alternative treatment in J. Chem. Phys. 133, 141102
+ focka = h1e + vhf[0]
+ fockb = h1e + vhf[1]
+ f = get_roothaan_fock((focka,fockb), dm, s1e)
+ if cycle < 0 and diis is None: # Not inside the SCF iteration
+ return f
+
+ if diis_start_cycle is None:
+ diis_start_cycle = self.diis_start_cycle
+ if level_shift_factor is None:
+ level_shift_factor = self.level_shift
+ if damp_factor is None:
+ damp_factor = self.damp
+
+ dm_tot = dm[0] + dm[1]
+ if 0 <= cycle < diis_start_cycle-1 and abs(damp_factor) > 1e-4 and fock_last is not None:
+ raise NotImplementedError('ROHF Fock-damping')
+ if diis and cycle >= diis_start_cycle:
+ f = diis.update(s1e, dm_tot, f, self, h1e, vhf, f_prev=fock_last)
+ if abs(level_shift_factor) > 1e-4:
+ f = hf.level_shift(s1e, dm_tot*.5, f, level_shift_factor)
+ f = tag_array(f, focka=focka, fockb=fockb)
+ return f
+
def get_veff(self, mol=None, dm=None, dm_last=None, vhf_last=0, hermi=1):
if mol is None: mol = self.mol
if dm is None: dm = self.make_rdm1()
if getattr(dm, 'ndim', 0) == 2:
- dm = cupy.asarray((dm*.5,dm*.5))
+ dm = cupy.stack((dm*.5,dm*.5))
if dm_last is None or not self.direct_scf:
if getattr(dm, 'mo_coeff', None) is not None:
@@ -74,3 +186,35 @@ def get_veff(self, mol=None, dm=None, dm_last=None, vhf_last=0, hermi=1):
vhf = vj[0] + vj[1] - vk
vhf += vhf_last
return vhf
+
+ def get_grad(self, mo_coeff, mo_occ, fock):
+        '''The ROHF gradient is the off-diagonal block [co + cv + ov], where
+ [ cc co cv ]
+ [ oc oo ov ]
+ [ vc vo vv ]
+ '''
+ occidxa = mo_occ > 0
+ occidxb = mo_occ == 2
+ viridxa = ~occidxa
+ viridxb = ~occidxb
+ uniq_var_a = viridxa.reshape(-1,1) & occidxa
+ uniq_var_b = viridxb.reshape(-1,1) & occidxb
+
+ if getattr(fock, 'focka', None) is not None:
+ focka = fock.focka
+ fockb = fock.fockb
+ elif isinstance(fock, (tuple, list)) or getattr(fock, 'ndim', None) == 3:
+ focka, fockb = fock
+ else:
+ focka = fockb = fock
+ focka = mo_coeff.conj().T.dot(focka).dot(mo_coeff)
+ fockb = mo_coeff.conj().T.dot(fockb).dot(mo_coeff)
+
+ g = cupy.zeros_like(focka)
+ g[uniq_var_a] = focka[uniq_var_a]
+ g[uniq_var_b] += fockb[uniq_var_b]
+ return g[uniq_var_a | uniq_var_b]
+
+ def newton(self):
+ from gpu4pyscf.scf.soscf import newton
+ return newton(self)
diff --git a/gpu4pyscf/scf/soscf.py b/gpu4pyscf/scf/soscf.py
new file mode 100644
index 00000000..f64aa441
--- /dev/null
+++ b/gpu4pyscf/scf/soscf.py
@@ -0,0 +1,704 @@
+#!/usr/bin/env python
+#
+# Copyright 2024 The PySCF Developers. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+
+'''
+Second-order SCF solver
+'''
+
+import sys
+import math
+import numpy as np
+import cupy as cp
+import scipy.linalg
+from cupyx.scipy.linalg import expm
+from pyscf import lib
+from pyscf.scf import chkfile
+from pyscf.soscf import ciah
+from pyscf.soscf.newton_ah import _CIAH_SOSCF as _SOSCF_cpu
+from gpu4pyscf.lib import logger
+from gpu4pyscf.scf import hf, rohf, uhf
+from gpu4pyscf.lib.cupy_helper import transpose_sum, contract
+from gpu4pyscf.lib import utils
+
+def gen_g_hop_rhf(mf, mo_coeff, mo_occ, fock_ao=None, h1e=None):
+ assert mo_coeff.dtype == np.float64
+ occidx = cp.nonzero(mo_occ==2)[0]
+ viridx = cp.nonzero(mo_occ==0)[0]
+ orbo = mo_coeff[:,occidx]
+ orbv = mo_coeff[:,viridx]
+ nocc = orbo.shape[1]
+ nvir = orbv.shape[1]
+
+ if fock_ao is None:
+ dm0 = mf.make_rdm1(mo_coeff, mo_occ)
+ fock_ao = mf.get_fock(h1e, dm=dm0)
+ fock = mo_coeff.conj().T.dot(fock_ao).dot(mo_coeff)
+ foo = fock[occidx[:,None],occidx]
+ fvv = fock[viridx[:,None],viridx]
+
+ g = fock[viridx[:,None],occidx] * 2
+ h_diag = (fvv.diagonal().real[:,None] - foo.diagonal().real) * 2
+
+ vind = mf.gen_response(mo_coeff, mo_occ, singlet=None, hermi=1)
+
+ def h_op(x):
+ x = x.reshape(nvir,nocc)
+ x2 = contract('ps,sq->pq', fvv, x)
+ x2-= contract('ps,rp->rs', foo, x)
+
+ # *2 for double occupancy
+ dm1 = orbv.dot(x*2).dot(orbo.conj().T)
+ dm1 = transpose_sum(dm1)
+ v1 = vind(dm1)
+ x2 += orbv.conj().T.dot(v1).dot(orbo)
+ return x2.ravel() * 2
+
+ return g.reshape(-1), h_op, h_diag.reshape(-1)
+
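+# The gen_g_hop_* functions return the orbital gradient g (nvir*nocc entries
+# per spin), h_op(x) applying the orbital Hessian to a flattened rotation x,
+# and the Hessian diagonal h_diag, which serves as the preconditioner in the
+# Davidson iterations driven by _rotate_orb_cc below.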
+def gen_g_hop_rohf(mf, mo_coeff, mo_occ, fock_ao=None, h1e=None):
+ if getattr(fock_ao, 'focka', None) is None:
+ dm0 = mf.make_rdm1(mo_coeff, mo_occ)
+ fock_ao = mf.get_fock(h1e, dm=dm0)
+ fock_ao = fock_ao.focka, fock_ao.fockb
+ mo_occa = occidxa = mo_occ > 0
+ mo_occb = occidxb = mo_occ ==2
+ ug, uh_op, uh_diag = gen_g_hop_uhf(
+ mf, (mo_coeff,)*2, (mo_occa,mo_occb), fock_ao, None)
+
+ viridxa = ~occidxa
+ viridxb = ~occidxb
+ uniq_var_a = viridxa[:,None] & occidxa
+ uniq_var_b = viridxb[:,None] & occidxb
+ uniq_ab = uniq_var_a | uniq_var_b
+ nmo = mo_coeff.shape[-1]
+ nocca, noccb = mf.nelec
+ nvira = nmo - nocca
+
+ def sum_ab(x):
+ x1 = cp.zeros((nmo,nmo), dtype=x.dtype)
+ x1[uniq_var_a] = x[:nvira*nocca]
+ x1[uniq_var_b] += x[nvira*nocca:]
+ return x1[uniq_ab]
+
+ g = sum_ab(ug)
+ h_diag = sum_ab(uh_diag)
+ def h_op(x):
+ x1 = cp.zeros((nmo,nmo), dtype=x.dtype)
+ # unpack ROHF rotation parameters
+ x1[uniq_ab] = x
+ x1 = cp.hstack((x1[uniq_var_a],x1[uniq_var_b]))
+ return sum_ab(uh_op(x1))
+
+ return g, h_op, h_diag
+
+def gen_g_hop_uhf(mf, mo_coeff, mo_occ, fock_ao=None, h1e=None):
+ assert mo_coeff[0].dtype == np.float64
+ occidxa = cp.nonzero(mo_occ[0] > 0)[0]
+ occidxb = cp.nonzero(mo_occ[1] > 0)[0]
+ viridxa = cp.nonzero(mo_occ[0] == 0)[0]
+ viridxb = cp.nonzero(mo_occ[1] == 0)[0]
+ orboa = mo_coeff[0][:,occidxa]
+ orbob = mo_coeff[1][:,occidxb]
+ orbva = mo_coeff[0][:,viridxa]
+ orbvb = mo_coeff[1][:,viridxb]
+ nmo = mo_occ[0].size
+ nocca, noccb = mf.nelec
+ nvira = nmo - nocca
+ nvirb = nmo - noccb
+
+ if fock_ao is None:
+ dm0 = mf.make_rdm1(mo_coeff, mo_occ)
+ fock_ao = mf.get_fock(h1e, dm=dm0)
+ focka = mo_coeff[0].conj().T.dot(fock_ao[0]).dot(mo_coeff[0])
+ fockb = mo_coeff[1].conj().T.dot(fock_ao[1]).dot(mo_coeff[1])
+ fooa = focka[occidxa[:,None],occidxa]
+ fvva = focka[viridxa[:,None],viridxa]
+ foob = fockb[occidxb[:,None],occidxb]
+ fvvb = fockb[viridxb[:,None],viridxb]
+
+ g = cp.hstack((focka[viridxa[:,None],occidxa].ravel(),
+ fockb[viridxb[:,None],occidxb].ravel()))
+ h_diaga = fvva.diagonal().real[:,None] - fooa.diagonal().real
+ h_diagb = fvvb.diagonal().real[:,None] - foob.diagonal().real
+ h_diag = cp.hstack((h_diaga.reshape(-1), h_diagb.reshape(-1)))
+
+ vind = mf.gen_response(mo_coeff, mo_occ, hermi=1)
+
+ def h_op(x):
+ x1a = x[:nvira*nocca].reshape(nvira,nocca)
+ x1b = x[nvira*nocca:].reshape(nvirb,noccb)
+ x2a = contract('pr,rq->pq', fvva, x1a)
+ x2a-= contract('sq,ps->pq', fooa, x1a)
+ x2b = contract('pr,rq->pq', fvvb, x1b)
+ x2b-= contract('sq,ps->pq', foob, x1b)
+
+ d1a = orbva.dot(x1a).dot(orboa.conj().T)
+ d1b = orbvb.dot(x1b).dot(orbob.conj().T)
+ dm1 = cp.array([transpose_sum(d1a),
+ transpose_sum(d1b)])
+ v1 = vind(dm1)
+ x2a += orbva.conj().T.dot(v1[0]).dot(orboa)
+ x2b += orbvb.conj().T.dot(v1[1]).dot(orbob)
+ return cp.hstack((x2a.ravel(), x2b.ravel()))
+
+ return g, h_op, h_diag
+
+
+def _rotate_orb_cc(mf, h1e, s1e, conv_tol_grad=None, verbose=None):
+ log = logger.new_logger(mf, verbose)
+
+ if conv_tol_grad is None:
+ conv_tol_grad = (mf.conv_tol*.1)**.5
+ #TODO: dynamically adjust max_stepsize, as done in mc1step.py
+
+ def precond(x, e):
+ hdiagd = h_diag-(e-mf.ah_level_shift)
+ hdiagd[abs(hdiagd)<1e-8] = 1e-8
+ x = x/hdiagd
+ return x
+
+ t3m = log.init_timer()
+ u = g_kf = g_orb = norm_gorb = dxi = kfcount = jkcount = None
+ dm0 = vhf0 = None
+ g_op = lambda: g_orb
+ while True:
+ mo_coeff, mo_occ, dm0, vhf0, e_tot = (yield u, g_kf, kfcount, jkcount, dm0, vhf0)
+ fock_ao = mf.get_fock(h1e, s1e, vhf0, dm0)
+
+ g_kf, h_op, h_diag = mf.gen_g_hop(mo_coeff, mo_occ, fock_ao)
+ norm_gkf = cp.linalg.norm(g_kf)
+ if g_orb is None:
+ log.debug(' |g|= %4.3g (keyframe)', norm_gkf)
+ kf_trust_region = mf.kf_trust_region
+ x0_guess = g_kf
+ else:
+ norm_dg = cp.linalg.norm(g_kf-g_orb)
+ log.debug(' |g|= %4.3g (keyframe), |g-correction|= %4.3g',
+ norm_gkf, norm_dg)
+ kf_trust_region = min(max(norm_gorb/(norm_dg+1e-9), mf.kf_trust_region), 10)
+ log.debug1('Set kf_trust_region = %g', kf_trust_region)
+ x0_guess = dxi
+ g_orb = g_kf
+ norm_gorb = norm_gkf
+ problem_size = g_orb.size
+
+ ah_conv_tol = min(norm_gorb**2, mf.ah_conv_tol)
+        # increase the AH accuracy when approaching convergence
+ ah_start_cycle = mf.ah_start_cycle
+ imic = 0
+ dr = 0.
+ u = 1.
+ ukf = None
+ jkcount = 0
+ kfcount = 0
+ ikf = 0
+ ihop = 0
+
+ for ah_end, ihop, w, dxi, hdxi, residual, seig \
+ in _davidson_cc(h_op, g_op, precond, x0_guess,
+ tol=ah_conv_tol, max_cycle=mf.ah_max_cycle,
+ lindep=mf.ah_lindep, verbose=log):
+ norm_residual = cp.linalg.norm(residual)
+ ah_start_tol = min(norm_gorb*5, mf.ah_start_tol)
+ if (ah_end or ihop == mf.ah_max_cycle or # make sure to use the last step
+ ((norm_residual < ah_start_tol) and (ihop >= ah_start_cycle)) or
+ (seig < mf.ah_lindep)):
+ imic += 1
+ dxmax = abs(dxi).max()
+ if ihop == problem_size:
+ log.debug1('... Hx=g fully converged for small systems')
+ elif dxmax > mf.max_stepsize:
+ scale = mf.max_stepsize / dxmax
+ log.debug1('... scale rotation size %g', scale)
+ dxi *= scale
+ hdxi *= scale
+
+ dr = dr + dxi
+ g_orb = g_orb + hdxi
+ norm_dr = cp.linalg.norm(dr)
+ norm_gorb = cp.linalg.norm(g_orb)
+ norm_dxi = cp.linalg.norm(dxi)
+ log.debug(' imic %d(%d) |g|= %4.3g |dxi|= %4.3g '
+ 'max(|x|)= %4.3g |dr|= %4.3g eig= %4.3g seig= %4.3g',
+ imic, ihop, norm_gorb, norm_dxi,
+ dxmax, norm_dr, w, seig)
+
+ max_cycle = max(mf.max_cycle_inner,
+ mf.max_cycle_inner-int(math.log(norm_gkf+1e-9)*2))
+ log.debug1('Set ah_start_tol %g, ah_start_cycle %d, max_cycle %d',
+ ah_start_tol, ah_start_cycle, max_cycle)
+ ikf += 1
+ if imic > 3 and norm_gorb > norm_gkf*mf.ah_grad_trust_region:
+ g_orb = g_orb - hdxi
+ dr -= dxi
+ norm_gorb = cp.linalg.norm(g_orb)
+                log.debug('|g| >> keyframe, Restore previous step')
+ break
+
+ elif (imic >= max_cycle or norm_gorb < conv_tol_grad/mf.ah_grad_trust_region):
+ break
+
+ elif (ikf > 2 and # avoid frequent keyframe
+ #TODO: replace it with keyframe_scheduler
+ (ikf >= max(mf.kf_interval, mf.kf_interval-math.log(norm_dr+1e-9)) or
+ # Insert keyframe if the keyframe and the estimated g_orb are too different
+ norm_gorb < norm_gkf/kf_trust_region)):
+ ikf = 0
+ u = mf.update_rotate_matrix(dr, mo_occ, mo_coeff=mo_coeff)
+ if ukf is not None:
+ u = mf.rotate_mo(ukf, u)
+ ukf = u
+ dr[:] = 0
+ mo1 = mf.rotate_mo(mo_coeff, u)
+ dm = mf.make_rdm1(mo1, mo_occ)
+ # use mf._scf.get_veff to avoid density-fit mf polluting get_veff
+ vhf0 = mf._scf.get_veff(mf._scf.mol, dm, dm_last=dm0, vhf_last=vhf0)
+ dm0 = dm
+ # Use API to compute fock instead of "fock=h1e+vhf0". This is because get_fock
+ # is the hook being overloaded in many places.
+ fock_ao = mf.get_fock(h1e, s1e, vhf0, dm0)
+ g_kf1 = mf.get_grad(mo1, mo_occ, fock_ao)
+ norm_gkf1 = cp.linalg.norm(g_kf1)
+ norm_dg = cp.linalg.norm(g_kf1-g_orb)
+ jkcount += 1
+ kfcount += 1
+ if log.verbose >= logger.DEBUG:
+ e_tot, e_last = mf._scf.energy_tot(dm, h1e, vhf0), e_tot
+ log.debug('Adjust keyframe g_orb to |g|= %4.3g '
+ '|g-correction|=%4.3g E=%.12g dE=%.5g',
+ norm_gkf1, norm_dg, e_tot, e_tot-e_last)
+
+ if (norm_dg < norm_gorb*mf.ah_grad_trust_region # kf not too diff
+ #or norm_gkf1 < norm_gkf # grad is decaying
+ # close to solution
+ or norm_gkf1 < conv_tol_grad*mf.ah_grad_trust_region):
+ kf_trust_region = min(max(norm_gorb/(norm_dg+1e-9), mf.kf_trust_region), 10)
+ log.debug1('Set kf_trust_region = %g', kf_trust_region)
+ g_orb = g_kf = g_kf1
+ norm_gorb = norm_gkf = norm_gkf1
+ else:
+ g_orb = g_orb - hdxi
+ dr -= dxi
+ norm_gorb = cp.linalg.norm(g_orb)
+                log.debug('Out of trust region. Restore previous step')
+ break
+
+ if ihop > 0:
+ u = mf.update_rotate_matrix(dr, mo_occ, mo_coeff=mo_coeff)
+ if ukf is not None:
+ u = mf.rotate_mo(ukf, u)
+ jkcount += ihop + 1
+ log.debug(' tot inner=%d %d JK |g|= %4.3g |u-1|= %4.3g',
+ imic, jkcount, norm_gorb, cp.linalg.norm(dr))
+ h_op = h_diag = None
+ t3m = log.timer('aug_hess in %d inner iters' % imic, *t3m)
+
+def _davidson_cc(h_op, g_op, precond, x0, tol=1e-10, xs=[], ax=[],
+ max_cycle=30, lindep=1e-14, verbose=logger.WARN):
+ if isinstance(verbose, logger.Logger):
+ log = verbose
+ else:
+ log = logger.Logger(sys.stdout, verbose)
+
+ toloose = tol**.5
+ # the first trial vector is (1,0,0,...), which is not included in xs
+ xs = list(xs)
+ ax = list(ax)
+ nx = len(xs)
+
+ problem_size = x0.size
+ max_cycle = min(max_cycle, problem_size)
+ heff = np.zeros((max_cycle+nx+1,max_cycle+nx+1), dtype=x0.dtype)
+ ovlp = np.eye(max_cycle+nx+1, dtype=x0.dtype)
+ if nx == 0:
+ xs.append(x0)
+ ax.append(h_op(x0))
+ else:
+ for i in range(1, nx+1):
+ for j in range(1, i+1):
+ heff[i,j] = xs[i-1].conj().dot(ax[j-1])
+ ovlp[i,j] = xs[i-1].conj().dot(xs[j-1])
+ heff[1:i,i] = heff[i,1:i].conj()
+ ovlp[1:i,i] = ovlp[i,1:i].conj()
+
+ w_t = 0
+ for istep in range(max_cycle):
+ g = g_op()
+ nx = len(xs)
+ for i in range(nx):
+ heff[i+1,0] = xs[i].conj().dot(g)
+ heff[nx,i+1] = xs[nx-1].conj().dot(ax[i])
+ ovlp[nx,i+1] = xs[nx-1].conj().dot(xs[i])
+ heff[0,:nx+1] = heff[:nx+1,0].conj()
+ heff[1:nx,nx] = heff[nx,1:nx].conj()
+ ovlp[1:nx,nx] = ovlp[nx,1:nx].conj()
+ nvec = nx + 1
+ #s0 = scipy.linalg.eigh(ovlp[:nvec,:nvec])[0][0]
+ #if s0 < lindep:
+ # yield True, istep, w_t, xtrial, hx, dx, s0
+ # break
+ wlast = w_t
+ xtrial, w_t, v_t, index, seig = \
+ _regular_step(heff[:nvec,:nvec], ovlp[:nvec,:nvec], xs,
+ lindep, log)
+ s0 = seig[0]
+ hx = _dgemv(v_t[1:], ax)
+ # note g*v_t[0], as the first trial vector is (1,0,0,...)
+ dx = hx + g*v_t[0] - w_t * v_t[0]*xtrial
+ norm_dx = np.linalg.norm(dx)
+ log.debug1('... AH step %d index= %d |dx|= %.5g eig= %.5g v[0]= %.5g lindep= %.5g',
+ istep+1, index, norm_dx, w_t, v_t[0].real, s0)
+ hx *= 1/v_t[0] # == h_op(xtrial)
+ if ((abs(w_t-wlast) < tol and norm_dx < toloose) or
+ s0 < lindep or
+ istep+1 == problem_size):
+ # Avoid adding more trial vectors if hessian converged
+ yield True, istep+1, w_t, xtrial, hx, dx, s0
+ if s0 < lindep or norm_dx < lindep:# or np.linalg.norm(xtrial) < lindep:
+ # stop the iteration because eigenvectors would be barely updated
+ break
+ else:
+ yield False, istep+1, w_t, xtrial, hx, dx, s0
+ x0 = precond(dx, w_t)
+ xs.append(x0)
+ ax.append(h_op(x0))
+
+def _regular_step(heff, ovlp, xs, lindep, log, root_id=0):
+ w, v, seig = lib.safe_eigh(heff, ovlp, lindep)
+ #if e[0] < -.1:
+ # sel = 0
+ #else:
+    # There exist systems where the first eigenvalue of AH is -inf.
+    # Dynamically choosing the eigenvectors may be better.
+ idx = np.nonzero(abs(v[0]) > 0.1)[0]
+ sel = idx[root_id]
+ log.debug1('CIAH eigen-sel %s', sel)
+ w_t = w[sel]
+
+ if w_t < 1e-4:
+ try:
+ e, c = scipy.linalg.eigh(heff[1:,1:], ovlp[1:,1:])
+ except scipy.linalg.LinAlgError:
+ e, c = lib.safe_eigh(heff[1:,1:], ovlp[1:,1:], lindep)[:2]
+ if np.any(e < -1e-5):
+ log.debug('Negative hessians found %s', e[e<0])
+
+ xtrial = _dgemv(v[1:,sel]/v[0,sel], xs)
+ return xtrial, w_t, v[:,sel], sel, seig
+
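+# _dgemv forms the linear combination sum_i v[i]*m[i] of the stored trial
+# vectors, i.e. a subspace matrix-vector product with the basis kept as a list.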
+def _dgemv(v, m):
+ vm = v[0] * m[0]
+ for i,vi in enumerate(v[1:]):
+ vm += vi * m[i+1]
+ return vm
+
+
+def kernel(mf, mo_coeff=None, mo_occ=None, dm=None,
+ conv_tol=1e-10, conv_tol_grad=None, max_cycle=50, dump_chk=True,
+ callback=None, verbose=logger.NOTE):
+ log = logger.new_logger(mf, verbose)
+ cput0 = log.init_timer()
+ mol = mf._scf.mol
+ assert mol is mf.mol
+
+ if conv_tol_grad is None:
+ conv_tol_grad = conv_tol**.5
+ log.info('Set conv_tol_grad to %g', conv_tol_grad)
+
+ # call mf._scf.get_hcore, mf._scf.get_ovlp because they might be overloaded
+ h1e = mf._scf.get_hcore(mol)
+ s1e = mf._scf.get_ovlp(mol)
+
+ if mo_coeff is not None and mo_occ is not None:
+ dm = mf.make_rdm1(mo_coeff, mo_occ)
+ # call mf._scf.get_veff, to avoid "newton().density_fit()" polluting get_veff
+ vhf = mf._scf.get_veff(mol, dm)
+ fock = mf.get_fock(h1e, s1e, vhf, dm, level_shift_factor=0)
+ mo_energy, mo_tmp = mf.eig(fock, s1e)
+ mf.get_occ(mo_energy, mo_tmp)
+ mo_tmp = None
+
+ else:
+ if dm is None:
+ dm = mf.get_init_guess(mol, mf.init_guess)
+ vhf = mf._scf.get_veff(mol, dm)
+ fock = mf.get_fock(h1e, s1e, vhf, dm, level_shift_factor=0)
+ mo_energy, mo_coeff = mf.eig(fock, s1e)
+ mo_occ = mf.get_occ(mo_energy, mo_coeff)
+ dm, dm_last = mf.make_rdm1(mo_coeff, mo_occ), dm
+ vhf = mf._scf.get_veff(mol, dm, dm_last=dm_last, vhf_last=vhf)
+
+ # Save mo_coeff and mo_occ because they are needed by function rotate_mo
+ mf.mo_coeff, mf.mo_occ = mo_coeff, mo_occ
+
+ e_tot = mf._scf.energy_tot(dm, h1e, vhf)
+ fock = mf.get_fock(h1e, s1e, vhf, dm, level_shift_factor=0)
+ log.info('Initial guess E= %.15g |g|= %g', e_tot,
+ cp.linalg.norm(mf._scf.get_grad(mo_coeff, mo_occ, fock)))
+
+ if dump_chk and mf.chkfile:
+ chkfile.save_mol(mol, mf.chkfile)
+
+ # Copy the integral file to soscf object to avoid the integrals being
+ # cached twice.
+ if mol is mf.mol and not getattr(mf, 'with_df', None):
+ mf._eri = mf._scf._eri
+
+ rotaiter = _rotate_orb_cc(mf, h1e, s1e, conv_tol_grad, verbose=log)
+ next(rotaiter) # start the iterator
+ kftot = jktot = 0
+ norm_gorb = 0.
+ scf_conv = False
+ cput1 = log.timer('initializing second order scf', *cput0)
+
+ for imacro in range(max_cycle):
+ u, g_orb, kfcount, jkcount, dm_last, vhf = \
+ rotaiter.send((mo_coeff, mo_occ, dm, vhf, e_tot))
+ kftot += kfcount + 1
+ jktot += jkcount + 1
+
+ last_hf_e = e_tot
+ norm_gorb = cp.linalg.norm(g_orb)
+ mo_coeff = mf.rotate_mo(mo_coeff, u, log)
+ dm = mf.make_rdm1(mo_coeff, mo_occ)
+ vhf = mf._scf.get_veff(mol, dm, dm_last=dm_last, vhf_last=vhf)
+ fock = mf.get_fock(h1e, s1e, vhf, dm, level_shift_factor=0)
+ # NOTE: DO NOT change the initial guess mo_occ, mo_coeff
+ if mf.verbose >= logger.DEBUG:
+ mo_energy, mo_tmp = mf.eig(fock, s1e)
+ mf.get_occ(mo_energy, mo_tmp)
+ # call mf._scf.energy_tot for dft, because the (dft).get_veff step saved _exc in mf._scf
+ e_tot = mf._scf.energy_tot(dm, h1e, vhf)
+
+ log.info('macro= %d E= %.15g delta_E= %g |g|= %g %d KF %d JK',
+ imacro, e_tot, e_tot-last_hf_e, norm_gorb,
+ kfcount+1, jkcount)
+ cput1 = log.timer('cycle= %d'%(imacro+1), *cput1)
+
+ if callable(mf.check_convergence):
+ scf_conv = mf.check_convergence(locals())
+ elif abs(e_tot-last_hf_e) < conv_tol and norm_gorb < conv_tol_grad:
+ scf_conv = True
+
+ if dump_chk:
+ mf.dump_chk(locals())
+
+ if callable(callback):
+ callback(locals())
+
+ if scf_conv:
+ break
+
+ if callable(callback):
+ callback(locals())
+
+ rotaiter.close()
+ mo_energy, mo_coeff1 = mf._scf.canonicalize(mo_coeff, mo_occ, fock)
+ if mf.canonicalization:
+ log.info('Canonicalize SCF orbitals')
+ mo_coeff = mo_coeff1
+ if dump_chk:
+ mf.dump_chk(locals())
+ log.info('macro X = %d E=%.15g |g|= %g total %d KF %d JK',
+ imacro+1, e_tot, norm_gorb, kftot+1, jktot+1)
+
+ if cp.any(mo_occ==0):
+ homo = mo_energy[mo_occ>0].max()
+ lumo = mo_energy[mo_occ==0].min()
+ if homo > lumo:
+ log.warn('canonicalized orbital HOMO %s > LUMO %s ', homo, lumo)
+ return scf_conv, e_tot, mo_energy, mo_coeff, mo_occ
+
+# A tag to label the derived SCF class
+class _CIAH_SOSCF:
+ '''
+ Attributes for Newton solver:
+    max_cycle_inner : int
+        AH iterations within each macro iteration. Default is 10.
+    max_stepsize : float
+        The step size for orbital rotation. A small step is preferred.
+        Default is 0.05.
+    canonicalization : bool
+        Whether to canonicalize the orbitals optimized by the Newton
+        solver. Default is True.
+ '''
+
+ __name_mixin__ = 'SecondOrder'
+
+ max_cycle_inner = _SOSCF_cpu.max_cycle_inner
+ max_stepsize = _SOSCF_cpu.max_stepsize
+ canonicalization = _SOSCF_cpu.canonicalization
+
+ ah_start_tol = _SOSCF_cpu.ah_start_tol
+ ah_start_cycle = _SOSCF_cpu.ah_start_cycle
+ ah_level_shift = _SOSCF_cpu.ah_level_shift
+ ah_conv_tol = _SOSCF_cpu.ah_conv_tol
+ ah_lindep = _SOSCF_cpu.ah_lindep
+ ah_max_cycle = _SOSCF_cpu.ah_max_cycle
+ ah_grad_trust_region = _SOSCF_cpu.ah_grad_trust_region
+ kf_interval = _SOSCF_cpu.kf_interval
+ kf_trust_region = _SOSCF_cpu.kf_trust_region
+
+ _keys = _SOSCF_cpu._keys
+
+ to_gpu = utils.to_gpu
+ device = utils.device
+ to_cpu = utils.to_cpu
+
+ def __init__(self, mf):
+ self.__dict__.update(mf.__dict__)
+ self._scf = mf
+
+ def undo_soscf(self):
+ '''Remove the SOSCF Mixin'''
+ from gpu4pyscf.df.df_jk import _DFHF
+ if isinstance(self, _DFHF) and not isinstance(self._scf, _DFHF):
+ # where density fitting is only applied on the SOSCF hessian
+ mf = self.undo_df()
+ else:
+ mf = self
+ obj = lib.view(mf, lib.drop_class(mf.__class__, _CIAH_SOSCF))
+ del obj._scf
+ # When both self and self._scf are DF objects, they may be different df
+ # objects. The DF object of the base scf object should be used.
+ if hasattr(self._scf, 'with_df'):
+ obj.with_df = self._scf.with_df
+ return obj
+
+ undo_newton = undo_soscf
+
+ def dump_flags(self, verbose=None):
+ log = logger.new_logger(self, verbose)
+ log.info('\n')
+ super().dump_flags(verbose)
+ log.info('******** %s Newton solver flags ********', self._scf.__class__)
+ log.info('max_cycle_inner = %d', self.max_cycle_inner)
+ log.info('max_stepsize = %g', self.max_stepsize)
+ log.info('ah_start_tol = %g', self.ah_start_tol)
+ log.info('ah_level_shift = %g', self.ah_level_shift)
+ log.info('ah_conv_tol = %g', self.ah_conv_tol)
+ log.info('ah_lindep = %g', self.ah_lindep)
+ log.info('ah_start_cycle = %d', self.ah_start_cycle)
+ log.info('ah_max_cycle = %d', self.ah_max_cycle)
+ log.info('ah_grad_trust_region = %g', self.ah_grad_trust_region)
+ log.info('kf_interval = %d', self.kf_interval)
+ log.info('kf_trust_region = %d', self.kf_trust_region)
+ log.info('canonicalization = %s', self.canonicalization)
+ return self
+
+ build = _SOSCF_cpu.build
+ reset = _SOSCF_cpu.reset
+
+ def kernel(self, mo_coeff=None, mo_occ=None, dm0=None):
+ if mo_coeff is None: mo_coeff = self.mo_coeff
+ if mo_occ is None: mo_occ = self.mo_occ
+ cput0 = logger.init_timer(self)
+ self.build(self.mol)
+ self.dump_flags()
+
+ self.converged, self.e_tot, \
+ self.mo_energy, self.mo_coeff, self.mo_occ = \
+ kernel(self, mo_coeff, mo_occ, dm0, conv_tol=self.conv_tol,
+ conv_tol_grad=self.conv_tol_grad,
+ max_cycle=self.max_cycle,
+ callback=self.callback, verbose=self.verbose)
+
+ logger.timer(self, 'Second order SCF', *cput0)
+ self._finalize()
+ return self.e_tot
+
+ from_dm = _SOSCF_cpu.from_dm
+
+ gen_g_hop = gen_g_hop_rhf
+
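+    # update_rotate_matrix packs the rotation parameters dx into an
+    # antisymmetric matrix and exponentiates it, u = expm(x1 - x1^T), yielding
+    # a unitary orbital rotation; u0 chains rotations across keyframes.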
+ def update_rotate_matrix(self, dx, mo_occ, u0=1, mo_coeff=None):
+ nmo = len(mo_occ)
+ x1 = cp.zeros((nmo,nmo), dtype=dx.dtype)
+ occidxa = mo_occ>0
+ occidxb = mo_occ==2
+ viridxa = ~occidxa
+ viridxb = ~occidxb
+ mask = (viridxa[:,None] & occidxa) | (viridxb[:,None] & occidxb)
+ x1[mask] = dx
+ dr = x1 - x1.conj().T
+ u = expm(dr)
+ if isinstance(u0, cp.ndarray):
+ u = u0.dot(u)
+ return u
+
+ def rotate_mo(self, mo_coeff, u, log=None):
+ return mo_coeff.dot(u)
+
+class _SecondOrderROHF(_CIAH_SOSCF):
+ gen_g_hop = gen_g_hop_rohf
+
+class _SecondOrderUHF(_CIAH_SOSCF):
+ gen_g_hop = gen_g_hop_uhf
+
+ def update_rotate_matrix(self, dx, mo_occ, u0=1, mo_coeff=None):
+ occidxa = mo_occ[0] > 0
+ occidxb = mo_occ[1] > 0
+ viridxa = ~occidxa
+ viridxb = ~occidxb
+
+ nmo = len(occidxa)
+ dr = cp.zeros((2,nmo,nmo), dtype=dx.dtype)
+ uniq = cp.array((viridxa[:,None] & occidxa,
+ viridxb[:,None] & occidxb))
+ dr[uniq] = dx
+ dr = dr - dr.conj().transpose(0,2,1)
+
+ if isinstance(u0, int) and u0 == 1:
+ return cp.asarray((expm(dr[0]), expm(dr[1])))
+ else:
+ return cp.asarray((u0[0].dot(expm(dr[0])),
+ u0[1].dot(expm(dr[1]))))
+
+ def rotate_mo(self, mo_coeff, u, log=None):
+ mo = cp.asarray((mo_coeff[0].dot(u[0]),
+ mo_coeff[1].dot(u[1])))
+ return mo
+
+ def kernel(self, mo_coeff=None, mo_occ=None, dm0=None):
+ if isinstance(mo_coeff, cp.ndarray) and mo_coeff.ndim == 2:
+ mo_coeff = (mo_coeff, mo_coeff)
+ if isinstance(mo_occ, cp.ndarray) and mo_occ.ndim == 1:
+ mo_occ = (cp.asarray(mo_occ >0, dtype=np.float64),
+ cp.asarray(mo_occ==2, dtype=np.float64))
+ return _CIAH_SOSCF.kernel(self, mo_coeff, mo_occ, dm0)
+
+class _SecondOrderRHF(_CIAH_SOSCF):
+ gen_g_hop = gen_g_hop_rhf
+
+def newton(mf):
+ if isinstance(mf, _CIAH_SOSCF):
+ return mf
+
+ assert isinstance(mf, hf.SCF)
+
+ if mf.istype('ROHF'):
+ cls = _SecondOrderROHF
+ elif mf.istype('UHF'):
+ cls = _SecondOrderUHF
+ elif mf.istype('GHF'):
+ raise NotImplementedError
+ elif mf.istype('RDHF'):
+ raise NotImplementedError
+ elif mf.istype('DHF'):
+ raise NotImplementedError
+ else:
+ cls = _SecondOrderRHF
+ return lib.set_class(cls(mf), (cls, mf.__class__))
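+
+# Illustrative usage (a sketch, not part of this module): wrap a mean-field
+# object in the second-order solver and converge it with CIAH Newton steps.
+#
+#     import pyscf
+#     from gpu4pyscf import scf
+#     mol = pyscf.M(atom='O 0 0 0; H 0 -0.757 0.587; H 0 0.757 0.587',
+#                   basis='6-31g')
+#     mf = scf.RHF(mol).newton()   # returns a _SecondOrderRHF view
+#     e_tot = mf.kernel()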
diff --git a/gpu4pyscf/scf/tests/test_scf_j_engine.py b/gpu4pyscf/scf/tests/test_scf_j_engine.py
new file mode 100644
index 00000000..19291e5a
--- /dev/null
+++ b/gpu4pyscf/scf/tests/test_scf_j_engine.py
@@ -0,0 +1,45 @@
+# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import unittest
+import numpy as np
+import pyscf
+from pyscf import lib
+from gpu4pyscf.scf import j_engine
+from pyscf.scf.hf import get_jk
+
+def test_j_engine():
+ mol = pyscf.M(
+ atom = '''
+ O 0.000 -0. 0.1174
+ H -0.757 4. -0.4696
+ H 0.757 4. -0.4696
+ C 1. 1. 0.
+ H 4. 0. 3.
+ H 0. 1. .6
+ ''',
+ basis='def2-tzvp',
+ unit='B',)
+
+ np.random.seed(9)
+ nao = mol.nao
+ dm = np.random.rand(nao, nao)
+ dm = dm.dot(dm.T)
+
+ vj = j_engine.get_j(mol, dm)
+ vj1 = vj.get()
+ ref = get_jk(mol, dm, with_k=False)[0]
+ assert abs(lib.fp(vj1) - -2327.4715195591784) < 1e-9
+ assert abs(vj1 - ref).max() < 1e-9
diff --git a/gpu4pyscf/scf/tests/test_soscf.py b/gpu4pyscf/scf/tests/test_soscf.py
new file mode 100644
index 00000000..b7fa3990
--- /dev/null
+++ b/gpu4pyscf/scf/tests/test_soscf.py
@@ -0,0 +1,224 @@
+# Copyright 2024 The GPU4PySCF Authors. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import unittest
+import cupy as cp
+from pyscf import gto
+from gpu4pyscf import scf
+from gpu4pyscf import dft
+
+def setUpModule():
+ global h2o_z0, h2o_z1
+ h2o_z0 = gto.M(
+ verbose = 5,
+ output = '/dev/null',
+ atom = [
+ ["O" , (0. , 0. , 0.)],
+ [1 , (0. , -0.757 , 0.587)],
+ [1 , (0. , 0.757 , 0.587)] ],
+ basis = '6-31g')
+
+ h2o_z1 = gto.M(
+ verbose = 5,
+ output = '/dev/null',
+ atom = [
+ ["O" , (0. , 0. , 0.)],
+ [1 , (0. , -0.757 , 0.587)],
+ [1 , (0. , 0.757 , 0.587)] ],
+ basis = '6-31g',
+ charge = 1,
+ spin = 1,)
+
+def tearDownModule():
+ global h2o_z0, h2o_z1
+ h2o_z0.stdout.close()
+ h2o_z1.stdout.close()
+ del h2o_z0, h2o_z1
+
+class KnownValues(unittest.TestCase):
+ def test_nr_rhf(self):
+ mf = scf.RHF(h2o_z0)
+ mf.max_cycle = 1
+ mf.conv_check = False
+ mf.kernel()
+ nr = mf.newton()
+ nr.max_cycle = 2
+ nr.conv_tol_grad = 1e-5
+ self.assertAlmostEqual(nr.kernel(), -75.98394849812, 9)
+
+ def test_nr_rohf(self):
+ mf = scf.ROHF(h2o_z1)
+ mf.max_cycle = 1
+ mf.conv_check = False
+ mf.kernel()
+ nr = mf.newton()
+ nr.max_cycle = 20
+ nr.conv_tol_grad = 1e-5
+ self.assertAlmostEqual(nr.kernel(), -75.5783963795897, 9)
+
+ def test_nr_uhf(self):
+ mf = scf.UHF(h2o_z1)
+ mf.max_cycle = 1
+ mf.conv_check = False
+ mf.kernel()
+ nr = mf.newton()
+ nr.max_cycle = 2
+ nr.conv_tol_grad = 1e-5
+ self.assertAlmostEqual(nr.kernel(), -75.58051984397145, 9)
+
+ def test_nr_rks_lda(self):
+ mf = dft.RKS(h2o_z0)
+ eref = mf.kernel()
+ mf.max_cycle = 1
+ mf.conv_check = False
+ mf.kernel()
+ nr = mf.newton()
+ nr.max_cycle = 3
+ nr.conv_tol_grad = 1e-5
+ self.assertAlmostEqual(nr.kernel(), eref, 9)
+
+ def test_nr_rks_rsh(self):
+ '''test range-separated Coulomb'''
+ mf = dft.RKS(h2o_z0)
+ mf.xc = 'wb97x'
+ eref = mf.kernel()
+ mf.max_cycle = 1
+ mf.conv_check = False
+ mf.kernel()
+ nr = mf.newton()
+ nr.max_cycle = 3
+ nr.conv_tol_grad = 1e-5
+ self.assertAlmostEqual(nr.kernel(), eref, 9)
+
+ def test_nr_rks(self):
+ mf = dft.RKS(h2o_z0)
+ mf.xc = 'b3lyp'
+ eref = mf.kernel()
+ mf.max_cycle = 1
+ mf.conv_check = False
+ mf.kernel()
+ nr = mf.newton()
+ nr.max_cycle = 3
+ nr.conv_tol_grad = 1e-5
+ self.assertAlmostEqual(nr.kernel(), eref, 9)
+
+ def test_rks_gen_g_hop(self):
+ mf = dft.RKS(h2o_z0)
+ mf.grids.build()
+ mf.xc = 'b3lyp'
+ nao = h2o_z0.nao_nr()
+ mo = cp.random.random((nao,nao))
+ mo_occ = cp.zeros(nao)
+ mo_occ[:5] = 2
+ nocc, nvir = 5, nao-5
+ dm1 = cp.random.random(nvir*nocc)
+ nr = mf.newton()
+ g, hop, hdiag = nr.gen_g_hop(mo, mo_occ)
+ mf_cpu = mf.to_cpu().newton()
+ hop_ref = mf_cpu.gen_g_hop(mo.get(), mo_occ.get())[1]
+ self.assertAlmostEqual(abs(hop(dm1).get() - hop_ref(dm1.get())).max(), 0, 9)
+
+ def test_nr_roks(self):
+ mf = dft.RKS(h2o_z1)
+ mf.xc = 'b3lyp'
+ eref = mf.kernel()
+
+ mf.max_cycle = 1
+ mf.conv_check = False
+ mf.kernel()
+ nr = mf.newton()
+ nr.max_cycle = 3
+ nr.conv_tol_grad = 1e-5
+ self.assertAlmostEqual(nr.kernel(), eref, 9)
+
+ def test_nr_uks_lda(self):
+ mf = dft.UKS(h2o_z1)
+ eref = mf.kernel()
+
+ mf.max_cycle = 1
+ mf.conv_check = False
+ mf.kernel()
+ nr = mf.newton()
+ nr.max_cycle = 2
+ nr.conv_tol_grad = 1e-5
+ self.assertAlmostEqual(nr.kernel(), eref, 9)
+
+ def test_nr_uks_rsh(self):
+ '''test range-separated Coulomb'''
+ mf = dft.UKS(h2o_z1)
+ mf.xc = 'wb97x'
+ eref = mf.kernel()
+
+ mf.max_cycle = 1
+ mf.conv_check = False
+ mf.kernel()
+ nr = mf.newton()
+ nr.max_cycle = 3
+ nr.conv_tol_grad = 1e-5
+ self.assertAlmostEqual(nr.kernel(), eref, 9)
+
+ def test_nr_uks(self):
+ mf = dft.UKS(h2o_z1)
+ mf.xc = 'b3lyp'
+ eref = mf.kernel()
+
+ mf.max_cycle = 1
+ mf.conv_check = False
+ mf.kernel()
+ nr = mf.newton()
+ nr.max_cycle = 3
+ nr.conv_tol_grad = 1e-5
+ self.assertAlmostEqual(nr.kernel(), eref, 9)
+
+ def test_uks_gen_g_hop(self):
+ mf = dft.UKS(h2o_z0)
+ mf.grids.build()
+ mf.xc = 'hse06'
+ nao = h2o_z0.nao_nr()
+ mo = cp.random.random((2, nao,nao))
+ mo_occ = cp.zeros((2,nao))
+ mo_occ[:,:5] = 1
+ nocc, nvir = 5, nao-5
+ dm1 = cp.random.random(nvir*nocc*2)
+ nr = mf.newton()
+ g, hop, hdiag = nr.gen_g_hop(mo, mo_occ)
+ mf_cpu = mf.to_cpu().newton()
+ hop_ref = mf_cpu.gen_g_hop(mo.get(), mo_occ.get())[1]
+ self.assertAlmostEqual(abs(hop(dm1).get() - hop_ref(dm1.get())).max(), 0, 9)
+
+ def test_with_df(self):
+ mf = scf.RHF(h2o_z0).density_fit().newton().run()
+ self.assertTrue(mf._eri is None)
+ self.assertAlmostEqual(mf.e_tot, -75.983944727996, 9)
+ self.assertEqual(mf.__class__.__name__, 'SecondOrderDFRHF')
+
+ mf = scf.RHF(h2o_z0).newton().density_fit().run()
+ self.assertTrue(mf._eri is None)
+ self.assertAlmostEqual(mf.e_tot, -75.9839484980661, 9)
+ mf = mf.undo_newton()
+ self.assertEqual(mf.__class__.__name__, 'RHF')
+
+ def test_secondary_auxbasis(self):
+ mf_ref = scf.UHF(h2o_z0).run()
+ mf = scf.UHF(h2o_z0).newton().density_fit(auxbasis=[[0, [1., 1.]]]).run()
+ self.assertAlmostEqual(mf_ref.e_tot, mf.e_tot, 8)
+
+ mf_ref = scf.UHF(h2o_z0).density_fit().run()
+ mf = scf.UHF(h2o_z0).density_fit().newton().density_fit(auxbasis=[[0, [1., 1.]]]).run()
+ self.assertAlmostEqual(mf_ref.e_tot, mf.e_tot, 8)
+
+if __name__ == "__main__":
+ print("Full Tests for Newton solver")
+ unittest.main()
diff --git a/gpu4pyscf/scf/uhf.py b/gpu4pyscf/scf/uhf.py
index 17826721..2c7dbf08 100644
--- a/gpu4pyscf/scf/uhf.py
+++ b/gpu4pyscf/scf/uhf.py
@@ -70,7 +70,8 @@ def spin_square(mo, s=1):
def get_fock(mf, h1e=None, s1e=None, vhf=None, dm=None, cycle=-1, diis=None,
diis_start_cycle=None, level_shift_factor=None, damp_factor=None):
if dm is None: dm = mf.make_rdm1()
- if h1e is None: h1e = cupy.asarray(mf.get_hcore())
+ if h1e is None: h1e = mf.get_hcore()
+ if s1e is None: s1e = mf.get_ovlp()
if vhf is None: vhf = mf.get_veff(mf.mol, dm)
if not isinstance(s1e, cupy.ndarray): s1e = cupy.asarray(s1e)
if not isinstance(dm, cupy.ndarray): dm = cupy.asarray(dm)
@@ -150,6 +151,36 @@ def energy_elec(mf, dm=None, h1e=None, vhf=None):
logger.debug(mf, 'E1 = %s Ecoul = %s', e1, e_coul.real)
return e_elec, e_coul
+def canonicalize(mf, mo_coeff, mo_occ, fock=None):
+    '''Canonicalization diagonalizes the UHF Fock matrix within the occupied
+    and virtual subspaces separately (without changing the occupancy).
+ '''
+ mo_occ = cupy.asarray(mo_occ)
+ assert mo_occ.ndim == 2
+ if fock is None:
+ dm = mf.make_rdm1(mo_coeff, mo_occ)
+ fock = mf.get_fock(dm=dm)
+ occidxa = mo_occ[0] == 1
+ occidxb = mo_occ[1] == 1
+ viridxa = mo_occ[0] == 0
+ viridxb = mo_occ[1] == 0
+
+ def eig_(fock, mo_coeff, idx, es, cs):
+        if cupy.any(idx):
+ orb = mo_coeff[:,idx]
+ f1 = orb.conj().T.dot(fock).dot(orb)
+ e, c = cupy.linalg.eigh(f1)
+ es[idx] = e
+ cs[:,idx] = cupy.dot(orb, c)
+
+ mo = cupy.empty_like(mo_coeff)
+ mo_e = cupy.empty(mo_occ.shape)
+ eig_(fock[0], mo_coeff[0], occidxa, mo_e[0], mo[0])
+ eig_(fock[0], mo_coeff[0], viridxa, mo_e[0], mo[0])
+ eig_(fock[1], mo_coeff[1], occidxb, mo_e[1], mo[1])
+ eig_(fock[1], mo_coeff[1], viridxb, mo_e[1], mo[1])
+ return mo_e, mo
+
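+# Illustrative check (a sketch; assumes a converged gpu4pyscf UHF object
+# named mf, not part of this module): canonicalization rotates the orbitals
+# only within each subspace, so the density matrix is left invariant.
+#
+#     mo_e, mo = canonicalize(mf, mf.mo_coeff, mf.mo_occ)
+#     dm0 = mf.make_rdm1(mf.mo_coeff, mf.mo_occ)
+#     dm1 = mf.make_rdm1(mo, mf.mo_occ)
+#     assert abs(dm0 - dm1).max() < 1e-10
+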
class UHF(hf.SCF):
from gpu4pyscf.lib.utils import to_gpu, device
@@ -195,6 +226,7 @@ def get_grad(self, mo_coeff, mo_occ, fock=None):
fock = self.get_hcore(self.mol) + self.get_veff(self.mol, dm1)
return get_grad(mo_coeff, mo_occ, fock)
+ make_asym_dm = NotImplemented
make_rdm2 = NotImplemented
energy_elec = energy_elec
get_init_guess = hf.return_cupy_array(uhf.UHF.get_init_guess)
@@ -204,15 +236,6 @@ def get_grad(self, mo_coeff, mo_occ, fock=None):
init_guess_by_mod_huckel = uhf.UHF.init_guess_by_mod_huckel
init_guess_by_1e = uhf.UHF.init_guess_by_1e
init_guess_by_chkfile = uhf.UHF.init_guess_by_chkfile
-
- analyze = NotImplemented
- mulliken_pop = NotImplemented
- mulliken_spin_pop = NotImplemented
- mulliken_meta = NotImplemented
- mulliken_meta_spin = NotImplemented
- canonicalize = NotImplemented
- det_ovlp = NotImplemented
- make_asym_dm = NotImplemented
_finalize = uhf.UHF._finalize
conv_tol_cpscf = 1e-4
@@ -225,9 +248,9 @@ def get_grad(self, mo_coeff, mo_occ, fock=None):
density_fit = hf.RHF.density_fit
energy_tot = hf.RHF.energy_tot
energy_elec = energy_elec
+ canonicalize = canonicalize
make_rdm2 = NotImplemented
- newton = NotImplemented
x2c = x2c1e = sfx2c1e = NotImplemented
to_rhf = NotImplemented
to_uhf = NotImplemented
@@ -236,7 +259,6 @@ def get_grad(self, mo_coeff, mo_occ, fock=None):
to_uks = NotImplemented
to_gks = NotImplemented
to_ks = NotImplemented
- canonicalize = NotImplemented
# TODO: Enable followings after testing
analyze = NotImplemented
stability = NotImplemented
@@ -290,6 +312,10 @@ def nuc_grad_method(self):
from gpu4pyscf.grad import uhf
return uhf.Gradients(self)
+ def newton(self):
+ from gpu4pyscf.scf.soscf import newton
+ return newton(self)
+
def to_cpu(self):
from gpu4pyscf.lib import utils
mf = uhf.UHF(self.mol)
diff --git a/gpu4pyscf/scf/uhf_symm.py b/gpu4pyscf/scf/uhf_symm.py
new file mode 100644
index 00000000..b1785a60
--- /dev/null
+++ b/gpu4pyscf/scf/uhf_symm.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python
+#
+# Copyright 2024 The PySCF Developers. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+from gpu4pyscf.scf.uhf import UHF
+
+SymAdaptedUHF = UHF
diff --git a/gpu4pyscf/solvent/grad/pcm.py b/gpu4pyscf/solvent/grad/pcm.py
index 1fce56f8..1df748e6 100644
--- a/gpu4pyscf/solvent/grad/pcm.py
+++ b/gpu4pyscf/solvent/grad/pcm.py
@@ -243,10 +243,10 @@ def grad_qv(pcmobj, dm):
dvj, _ = int3c2e.get_int3c2e_ip_jk(intopt, 0, 'ip1', q_sym, None, dm_cart)
dq, _ = int3c2e.get_int3c2e_ip_jk(intopt, 0, 'ip2', q_sym, None, dm_cart)
- cart_ao_idx = intopt.cart_ao_idx
- rev_cart_ao_idx = numpy.argsort(cart_ao_idx)
- dvj = dvj[:,rev_cart_ao_idx]
-
+ if not mol.cart:
+ dvj = dvj @ intopt.cart2sph
+ dvj = intopt.unsort_orbitals(dvj, axis=[1])
+
aoslice = intopt.mol.aoslice_by_atom()
dq = cupy.asarray([cupy.sum(dq[:,p0:p1], axis=1) for p0,p1 in gridslice])
dvj= 2.0 * cupy.asarray([cupy.sum(dvj[:,p0:p1], axis=1) for p0,p1 in aoslice[:,2:]])
diff --git a/gpu4pyscf/solvent/tests/test_pcm_hessian.py b/gpu4pyscf/solvent/tests/test_pcm_hessian.py
index 967d25f6..1060f3d4 100644
--- a/gpu4pyscf/solvent/tests/test_pcm_hessian.py
+++ b/gpu4pyscf/solvent/tests/test_pcm_hessian.py
@@ -128,9 +128,19 @@ def test_to_gpu(self):
hess_gpu = hessobj.kernel()
assert np.linalg.norm(hess_cpu - hess_gpu) < 1e-8
'''
+ mol = gto.Mole()
+ mol.atom = '''
+O 0.0000000000 -0.0000000000 0.1174000000
+H -0.7570000000 -0.0000000000 -0.4696000000
+H 0.7570000000 0.0000000000 -0.4696000000
+ '''
+ mol.basis = 'sto-3g'
+ mol.output = '/dev/null'
+ mol.build(verbose=0)
mf = pyscf.dft.RKS(mol, xc='b3lyp').density_fit().PCM()
mf.conv_tol = 1e-12
mf.conv_tol_cpscf = 1e-7
+ mf.grids.atom_grid = (50,194)
mf.kernel()
hessobj = mf.Hessian()
hess_cpu = hessobj.kernel()
@@ -148,9 +158,19 @@ def test_to_cpu(self):
e_cpu = mf.kernel()
assert abs(e_cpu - e_gpu) < 1e-8
'''
+ mol = gto.Mole()
+ mol.atom = '''
+O 0.0000000000 -0.0000000000 0.1174000000
+H -0.7570000000 -0.0000000000 -0.4696000000
+H 0.7570000000 0.0000000000 -0.4696000000
+ '''
+ mol.basis = 'sto-3g'
+ mol.output = '/dev/null'
+ mol.build(verbose=0)
mf = dft.RKS(mol, xc='b3lyp').density_fit().PCM()
mf.conv_tol = 1e-12
mf.conv_tol_cpscf = 1e-7
+ mf.grids.atom_grid = (50,194)
mf.kernel()
hessobj = mf.Hessian()
hess_gpu = hessobj.kernel()
diff --git a/gpu4pyscf/tdscf/__init__.py b/gpu4pyscf/tdscf/__init__.py
new file mode 100644
index 00000000..552cccee
--- /dev/null
+++ b/gpu4pyscf/tdscf/__init__.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python
+#
+# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+
+from gpu4pyscf.tdscf import rhf
+from gpu4pyscf.tdscf import uhf
+from gpu4pyscf.tdscf import rks
+from gpu4pyscf.tdscf import uks
diff --git a/gpu4pyscf/tdscf/_uhf_resp_sf.py b/gpu4pyscf/tdscf/_uhf_resp_sf.py
new file mode 100644
index 00000000..4ea074dc
--- /dev/null
+++ b/gpu4pyscf/tdscf/_uhf_resp_sf.py
@@ -0,0 +1,217 @@
+#!/usr/bin/env python
+#
+# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+
+# TODO: merge this function into scf._response_functions.py
+
+import functools
+import numpy as np
+import cupy as cp
+from pyscf import lib
+from pyscf.lib import logger
+from pyscf.dft import numint2c, xc_deriv
+from gpu4pyscf.scf import hf, uhf
+from gpu4pyscf.dft.numint import _scale_ao, _tau_dot, eval_rho, eval_rho2
+from gpu4pyscf.lib.cupy_helper import transpose_sum, add_sparse, contract
+
+def gen_uhf_response_sf(mf, mo_coeff=None, mo_occ=None, hermi=0,
+ collinear='mcol', collinear_samples=200):
+    '''Generate a function to compute the product of the spin-flip UKS
+    response function with UKS density matrices.
+ '''
+ assert isinstance(mf, (uhf.UHF))
+ if mo_coeff is None: mo_coeff = mf.mo_coeff
+ if mo_occ is None: mo_occ = mf.mo_occ
+ mol = mf.mol
+ assert hermi == 0
+
+ if isinstance(mf, hf.KohnShamDFT):
+ if mf.do_nlc():
+ logger.warn(mf, 'NLC functional found in DFT object. Its second '
+                        'derivative is not available. Its contribution is '
+ 'not included in the response function.')
+
+ ni = mf._numint
+ omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, mol.spin)
+ hybrid = ni.libxc.is_hybrid_xc(mf.xc)
+
+ if collinear in ('ncol', 'mcol'):
+ fxc = cache_xc_kernel_sf(ni, mol, mf.grids, mf.xc, mo_coeff, mo_occ,
+ collinear_samples)[2]
+ dm0 = None
+
+ def vind(dm1):
+ if collinear in ('ncol', 'mcol'):
+ v1 = nr_uks_fxc_sf(ni, mol, mf.grids, mf.xc, dm0, dm1, 0, hermi,
+ None, None, fxc)
+ else:
+ v1 = cp.zeros_like(dm1)
+ if hybrid:
+                # the Coulomb term vanishes in the spin-flip channel (J = 0)
+ if omega == 0:
+ vk = mf.get_k(mol, dm1, hermi) * hyb
+ elif alpha == 0: # LR=0, only SR exchange
+ vk = mf.get_k(mol, dm1, hermi, omega=-omega) * hyb
+ elif hyb == 0: # SR=0, only LR exchange
+ vk = mf.get_k(mol, dm1, hermi, omega=omega) * alpha
+ else: # SR and LR exchange with different ratios
+ vk = mf.get_k(mol, dm1, hermi) * hyb
+ vk += mf.get_k(mol, dm1, hermi, omega=omega) * (alpha-hyb)
+ v1 -= vk
+ return v1
+ return vind
+
+    else:  # HF: exchange-only response
+ def vind(dm1):
+ vk = mf.get_k(mol, dm1, hermi)
+ return -vk
+ return vind
+
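+# Illustrative usage (a sketch; assumes a converged gpu4pyscf UKS object
+# named mf, not part of this module): the returned vind maps trial density
+# matrices to spin-flip response potentials, the building block of the
+# SF-TDA matrix-vector product.
+#
+#     import cupy as cp
+#     vind = gen_uhf_response_sf(mf, hermi=0)
+#     nao = mf.mol.nao
+#     dm1 = cp.random.rand(1, nao, nao)   # one non-Hermitian trial dm
+#     v1 = vind(dm1)                      # same shape as dm1
+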
+# This function is copied from pyscf.dft.numint2c.py
+def __mcfun_fn_eval_xc(ni, xc_code, xctype, rho, deriv):
+ evfk = ni.eval_xc_eff(xc_code, rho, deriv=deriv, xctype=xctype)
+ evfk = list(evfk)
+ for order in range(1, deriv+1):
+ if evfk[order] is not None:
+ evfk[order] = xc_deriv.ud2ts(evfk[order])
+ return evfk
+
+# Edited based on pyscf.dft.numint2c.mcfun_eval_xc_adapter
+def mcfun_eval_xc_adapter_sf(ni, xc_code, collinear_samples):
+ '''Wrapper to generate the eval_xc function required by mcfun
+ '''
+
+ try:
+ import mcfun
+ except ImportError:
+        raise ImportError('This feature requires the mcfun library.\n'
+                          'Try installing mcfun with `pip install mcfun`')
+
+ ni = numint2c.NumInt2C()
+ ni.collinear = 'mcol'
+ ni.collinear_samples = collinear_samples
+ xctype = ni._xc_type(xc_code)
+ fn_eval_xc = functools.partial(__mcfun_fn_eval_xc, ni, xc_code, xctype)
+ nproc = lib.num_threads()
+
+ def eval_xc_eff(xc_code, rho, deriv=1, omega=None, xctype=None, verbose=None):
+ res = mcfun.eval_xc_eff_sf(
+ fn_eval_xc, rho.get(), deriv,
+ collinear_samples=collinear_samples, workers=nproc)
+ return [x if x is None else cp.asarray(x) for x in res]
+ return eval_xc_eff
+
+def cache_xc_kernel_sf(ni, mol, grids, xc_code, mo_coeff, mo_occ,
+ collinear_samples):
+ '''Compute the fxc_sf, which can be used in SF-TDDFT/TDA
+ '''
+ xctype = ni._xc_type(xc_code)
+ if xctype == 'GGA':
+ ao_deriv = 1
+ elif xctype == 'MGGA':
+ ao_deriv = 1
+ else:
+ ao_deriv = 0
+ assert isinstance(mo_coeff, cp.ndarray)
+ assert mo_coeff.ndim == 3
+
+ nao = mo_coeff[0].shape[0]
+ rhoa = []
+ rhob = []
+
+ with_lapl = False
+ opt = getattr(ni, 'gdftopt', None)
+ if opt is None or mol not in [opt.mol, opt._sorted_mol]:
+ ni.build(mol, grids.coords)
+ opt = ni.gdftopt
+ _sorted_mol = opt._sorted_mol
+ mo_coeff = opt.sort_orbitals(mo_coeff, axis=[1])
+
+ for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv):
+ rhoa_slice = eval_rho2(_sorted_mol, ao_mask, mo_coeff[0,idx,:],
+ mo_occ[0], None, xctype, with_lapl)
+ rhob_slice = eval_rho2(_sorted_mol, ao_mask, mo_coeff[1,idx,:],
+ mo_occ[1], None, xctype, with_lapl)
+ rhoa.append(rhoa_slice)
+ rhob.append(rhob_slice)
+ rho_ab = (cp.hstack(rhoa), cp.hstack(rhob))
+ rho_z = cp.array([rho_ab[0]+rho_ab[1],
+ rho_ab[0]-rho_ab[1]])
+ eval_xc_eff = mcfun_eval_xc_adapter_sf(ni, xc_code, collinear_samples)
+ vxc, fxc = eval_xc_eff(xc_code, rho_z, deriv=2, xctype=xctype)[1:3]
+ return rho_ab, vxc, fxc
+
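+# The (total, spin) representation built above is the standard transform of
+# the spin densities:
+#
+#     rho_z[0] = rho_a + rho_b    # charge density
+#     rho_z[1] = rho_a - rho_b    # spin density (z component)
+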
+def nr_uks_fxc_sf(ni, mol, grids, xc_code, dm0, dms, relativity=0, hermi=0,
+ rho0=None, vxc=None, fxc=None):
+ if fxc is None:
+ raise RuntimeError('fxc was not initialized')
+ assert hermi == 0
+ assert dms.dtype == np.double
+
+ xctype = ni._xc_type(xc_code)
+ opt = getattr(ni, 'gdftopt', None)
+ if opt is None or mol not in [opt.mol, opt._sorted_mol]:
+ ni.build(mol, grids.coords)
+ opt = ni.gdftopt
+ mol = None
+ _sorted_mol = opt._sorted_mol
+ nao, nao0 = opt.coeff.shape
+ dm_shape = dms.shape
+
+ dms = cp.asarray(dms).reshape(-1,nao0,nao0)
+ dms = opt.sort_orbitals(dms, axis=[1,2])
+
+ nset = len(dms)
+ vmat = cp.zeros((nset, nao, nao))
+
+ if xctype == 'LDA':
+ ao_deriv = 0
+ elif xctype == 'GGA':
+ ao_deriv = 1
+ elif xctype == 'MGGA':
+ ao_deriv = 1
+ else:
+ raise RuntimeError(f'Unknown xctype {xctype}')
+ p0 = p1 = 0
+ for ao, mask, weights, coords in ni.block_loop(_sorted_mol, grids, nao, ao_deriv):
+ p0, p1 = p1, p1+len(weights)
+        # precompute fxc_w; the factor 2.0 accounts for the xx + yy components
+ fxc_w = fxc[:,:,p0:p1] * weights * 2.
+
+ for i in range(nset):
+ rho1 = eval_rho(_sorted_mol, ao, dms[i,mask[:,None],mask],
+ xctype=xctype, hermi=hermi)
+ if xctype == 'LDA':
+ wv = rho1 * fxc_w[0,0]
+ vtmp = ao.dot(_scale_ao(ao, wv).T)
+ elif xctype == 'GGA':
+ wv = contract('bg,abg->ag', rho1, fxc_w)
+ wv[0] *= .5 # for transpose_sum at the end
+ vtmp = ao[0].dot(_scale_ao(ao, wv).T)
+ elif xctype == 'MGGA':
+ wv = contract('bg,abg->ag', rho1, fxc_w)
+ wv[[0,4]] *= .5 # for transpose_sum at the end
+ vtmp = ao[0].dot(_scale_ao(ao[:4], wv[:4]).T)
+ vtmp += _tau_dot(ao, ao, wv[4])
+ add_sparse(vmat[i], vtmp, mask)
+
+ vmat = opt.unsort_orbitals(vmat, axis=[1,2])
+ if xctype != 'LDA':
+ transpose_sum(vmat)
+ if len(dm_shape) == 2:
+ vmat = vmat[0]
+ return vmat
diff --git a/gpu4pyscf/tdscf/rhf.py b/gpu4pyscf/tdscf/rhf.py
new file mode 100644
index 00000000..9e33b6e8
--- /dev/null
+++ b/gpu4pyscf/tdscf/rhf.py
@@ -0,0 +1,368 @@
+#!/usr/bin/env python
+#
+# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+
+import numpy as np
+import cupy as cp
+import scipy.linalg
+from pyscf import gto
+from pyscf import lib
+from pyscf.tdscf import rhf as tdhf_cpu
+from pyscf.tdscf._lr_eig import eigh as lr_eigh, eig as lr_eig
+from gpu4pyscf import scf
+from gpu4pyscf.lib.cupy_helper import contract, tag_array
+from gpu4pyscf.lib import utils
+from gpu4pyscf.lib import logger
+from gpu4pyscf.scf import _response_functions # noqa
+from pyscf import __config__
+
+REAL_EIG_THRESHOLD = tdhf_cpu.REAL_EIG_THRESHOLD
+#OUTPUT_THRESHOLD = tdhf_cpu.OUTPUT_THRESHOLD
+OUTPUT_THRESHOLD = getattr(__config__, 'tdscf_rhf_get_nto_threshold', 0.3)
+
+__all__ = [
+ 'TDA', 'CIS', 'TDHF', 'TDRHF', 'TDBase'
+]
+
+
+def gen_tda_operation(mf, fock_ao=None, singlet=True, wfnsym=None):
+ '''Generate function to compute A x
+ '''
+ assert fock_ao is None
+ assert isinstance(mf, scf.hf.SCF)
+ assert wfnsym is None
+ mo_coeff = mf.mo_coeff
+ assert mo_coeff.dtype == cp.float64
+ mo_energy = mf.mo_energy
+ mo_occ = mf.mo_occ
+ occidx = mo_occ == 2
+ viridx = mo_occ == 0
+ orbv = mo_coeff[:,viridx]
+ orbo = mo_coeff[:,occidx]
+ orbo2 = orbo * 2. # *2 for double occupancy
+
+ e_ia = hdiag = mo_energy[viridx] - mo_energy[occidx,None]
+ hdiag = hdiag.ravel().get()
+ vresp = mf.gen_response(singlet=singlet, hermi=0)
+ nocc, nvir = e_ia.shape
+
+ def vind(zs):
+ zs = cp.asarray(zs).reshape(-1,nocc,nvir)
+ mo1 = contract('xov,pv->xpo', zs, orbv)
+ dms = contract('xpo,qo->xpq', mo1, orbo2.conj())
+ dms = tag_array(dms, mo1=mo1, occ_coeff=orbo)
+ v1ao = vresp(dms)
+ v1mo = contract('xpq,qo->xpo', v1ao, orbo)
+ v1mo = contract('xpo,pv->xov', v1mo, orbv.conj())
+ v1mo += zs * e_ia
+ return v1mo.reshape(v1mo.shape[0],-1).get()
+
+ return vind, hdiag
+
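+# The action implemented by vind above is, in index form,
+#
+#     (A X)_{ia} = (e_a - e_i) X_{ia} + sum_{jb} [2(ia|jb) - (ij|ab)] X_{jb}
+#
+# for singlets, with the two-electron part evaluated through gen_response.
+# A minimal numerical sketch (illustrative; assumes a converged RHF mf):
+#
+#     import numpy as np
+#     vind, hdiag = gen_tda_operation(mf, singlet=True)
+#     x0 = np.zeros_like(hdiag)
+#     x0[0] = 1.
+#     ax = vind(x0[None])   # one A*x product, evaluated on the GPU
+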
+
+class TDBase(lib.StreamObject):
+ to_gpu = utils.to_gpu
+ device = utils.device
+ to_cpu = utils.to_cpu
+
+ conv_tol = tdhf_cpu.TDBase.conv_tol
+ nstates = tdhf_cpu.TDBase.nstates
+ singlet = tdhf_cpu.TDBase.singlet
+ lindep = tdhf_cpu.TDBase.lindep
+ level_shift = tdhf_cpu.TDBase.level_shift
+ max_cycle = tdhf_cpu.TDBase.max_cycle
+ positive_eig_threshold = tdhf_cpu.TDBase.positive_eig_threshold
+ deg_eia_thresh = tdhf_cpu.TDBase.deg_eia_thresh
+
+ _keys = tdhf_cpu.TDBase._keys
+
+ __init__ = tdhf_cpu.TDBase.__init__
+
+ nroots = tdhf_cpu.TDBase.nroots
+ e_tot = tdhf_cpu.TDBase.e_tot
+ dump_flags = tdhf_cpu.TDBase.dump_flags
+ check_sanity = tdhf_cpu.TDBase.check_sanity
+ reset = tdhf_cpu.TDBase.reset
+ _finalize = tdhf_cpu.TDBase._finalize
+
+ gen_vind = NotImplemented
+ get_ab = NotImplemented
+ get_precond = tdhf_cpu.TDBase.get_precond
+
+ nuc_grad_method = NotImplemented
+ as_scanner = tdhf_cpu.as_scanner
+
+ oscillator_strength = tdhf_cpu.oscillator_strength
+ transition_dipole = tdhf_cpu.transition_dipole
+ transition_quadrupole = tdhf_cpu.transition_quadrupole
+ transition_octupole = tdhf_cpu.transition_octupole
+ transition_velocity_dipole = tdhf_cpu.transition_velocity_dipole
+ transition_velocity_quadrupole = tdhf_cpu.transition_velocity_quadrupole
+ transition_velocity_octupole = tdhf_cpu.transition_velocity_octupole
+ transition_magnetic_dipole = tdhf_cpu.transition_magnetic_dipole
+ transition_magnetic_quadrupole = tdhf_cpu.transition_magnetic_quadrupole
+
+ def analyze(self, verbose=None):
+ self.to_cpu().analyze(verbose)
+ return self
+
+ def get_nto(self, state=1, threshold=OUTPUT_THRESHOLD, verbose=None):
+ '''
+ Natural transition orbital analysis.
+
+ Returns:
+ A list (weights, NTOs). NTOs are natural orbitals represented in AO
+ basis. The first N_occ NTOs are occupied NTOs and the rest are virtual
+            NTOs. Weights and NTOs are both stored as numpy arrays.
+ '''
+ return self.to_cpu().get_nto(state, threshold, verbose)
+
+ # needed by transition dipoles
+ def _contract_multipole(tdobj, ints, hermi=True, xy=None):
+ '''ints is the integral tensor of a spin-independent operator'''
+ if xy is None: xy = tdobj.xy
+ nstates = len(xy)
+ pol_shape = ints.shape[:-2]
+ nao = ints.shape[-1]
+
+ if not tdobj.singlet:
+ return np.zeros((nstates,) + pol_shape)
+
+ mo_coeff = tdobj._scf.mo_coeff
+ mo_occ = tdobj._scf.mo_occ
+ orbo = mo_coeff[:,mo_occ==2]
+ orbv = mo_coeff[:,mo_occ==0]
+ if isinstance(orbo, cp.ndarray):
+ orbo = orbo.get()
+ orbv = orbv.get()
+
+        # Incompatible with old numpy versions
+ #ints = np.einsum('...pq,pi,qj->...ij', ints, orbo.conj(), orbv)
+ ints = lib.einsum('xpq,pi,qj->xij', ints.reshape(-1,nao,nao), orbo.conj(), orbv)
+ pol = np.array([np.einsum('xij,ij->x', ints, x) * 2 for x,y in xy])
+ if isinstance(xy[0][1], np.ndarray):
+ if hermi:
+ pol += [np.einsum('xij,ij->x', ints, y) * 2 for x,y in xy]
+ else: # anti-Hermitian
+ pol -= [np.einsum('xij,ij->x', ints, y) * 2 for x,y in xy]
+ pol = pol.reshape((nstates,)+pol_shape)
+ return pol
+
+class TDA(TDBase):
+ __doc__ = tdhf_cpu.TDA.__doc__
+
+ def gen_vind(self, mf=None):
+ '''Generate function to compute Ax'''
+ if mf is None:
+ mf = self._scf
+ return gen_tda_operation(mf, singlet=self.singlet)
+
+ def init_guess(self, mf=None, nstates=None, wfnsym=None, return_symmetry=False):
+ '''
+ Generate initial guess for TDA
+
+ Kwargs:
+ nstates : int
+ The number of initial guess vectors.
+ '''
+ if mf is None: mf = self._scf
+ if nstates is None: nstates = self.nstates
+ assert wfnsym is None
+ assert not return_symmetry
+
+ mo_energy = mf.mo_energy
+ mo_occ = mf.mo_occ
+ if isinstance(mo_energy, cp.ndarray):
+ mo_energy = mo_energy.get()
+ mo_occ = mo_occ.get()
+ occidx = mo_occ == 2
+ viridx = mo_occ == 0
+ e_ia = (mo_energy[viridx] - mo_energy[occidx,None]).ravel()
+ nov = e_ia.size
+ nstates = min(nstates, nov)
+
+ # Find the nstates-th lowest energy gap
+ e_threshold = float(np.partition(e_ia, nstates-1)[nstates-1])
+ e_threshold += self.deg_eia_thresh
+
+ idx = np.where(e_ia <= e_threshold)[0]
+ x0 = np.zeros((idx.size, nov))
+ for i, j in enumerate(idx):
+ x0[i, j] = 1 # Koopmans' excitations
+
+ return x0
+
+ def kernel(self, x0=None, nstates=None):
+ '''TDA diagonalization solver
+ '''
+ log = logger.new_logger(self)
+ cpu0 = log.init_timer()
+ self.check_sanity()
+ self.dump_flags()
+ if nstates is None:
+ nstates = self.nstates
+ else:
+ self.nstates = nstates
+ mol = self.mol
+
+ vind, hdiag = self.gen_vind(self._scf)
+ precond = self.get_precond(hdiag)
+
+ def pickeig(w, v, nroots, envs):
+ idx = np.where(w > self.positive_eig_threshold)[0]
+ return w[idx], v[:,idx], idx
+
+ x0sym = None
+ if x0 is None:
+ x0 = self.init_guess()
+
+ self.converged, self.e, x1 = lr_eigh(
+ vind, x0, precond, tol_residual=self.conv_tol, lindep=self.lindep,
+ nroots=nstates, x0sym=x0sym, pick=pickeig, max_cycle=self.max_cycle,
+ max_memory=self.max_memory, verbose=log)
+
+ nocc = mol.nelectron // 2
+ nmo = self._scf.mo_occ.size
+ nvir = nmo - nocc
+        # 1/sqrt(2) because only the alpha excitation is stored and 2(X^+ X) = 1
+ self.xy = [(xi.reshape(nocc,nvir) * .5**.5, 0) for xi in x1]
+ log.timer('TDA', *cpu0)
+ self._finalize()
+ return self.e, self.xy
+
+CIS = TDA
+
+
+def gen_tdhf_operation(mf, fock_ao=None, singlet=True, wfnsym=None):
+ '''Generate function to compute
+
+ [ A B ][X]
+ [-B* -A*][Y]
+ '''
+ assert fock_ao is None
+ assert isinstance(mf, scf.hf.SCF)
+ mo_coeff = mf.mo_coeff
+ assert mo_coeff.dtype == cp.float64
+ mo_energy = mf.mo_energy
+ mo_occ = mf.mo_occ
+ occidx = mo_occ == 2
+ viridx = mo_occ == 0
+ orbv = mo_coeff[:,viridx]
+ orbo = mo_coeff[:,occidx]
+
+ e_ia = hdiag = mo_energy[viridx] - mo_energy[occidx,None]
+ hdiag = cp.hstack((hdiag.ravel(), -hdiag.ravel())).get()
+ vresp = mf.gen_response(singlet=singlet, hermi=0)
+ nocc, nvir = e_ia.shape
+
+ def vind(xys):
+ xys = cp.asarray(xys).reshape(-1,2,nocc,nvir)
+ nz = len(xys)
+ xs, ys = xys.transpose(1,0,2,3)
+ # *2 for double occupancy
+ tmp = contract('xov,pv->xpo', xs, orbv*2)
+ dms = contract('xpo,qo->xpq', tmp, orbo.conj())
+ tmp = contract('xov,qv->xoq', ys, orbv.conj()*2)
+ dms+= contract('xoq,po->xpq', tmp, orbo)
+        v1ao = vresp(dms)  # response of the combined X+Y transition density
+ v1_top = contract('xpq,qo->xpo', v1ao, orbo)
+ v1_top = contract('xpo,pv->xov', v1_top, orbv)
+ v1_bot = contract('xpq,po->xoq', v1ao, orbo)
+ v1_bot = contract('xoq,qv->xov', v1_bot, orbv)
+ v1_top += xs * e_ia # AX
+ v1_bot += ys * e_ia # (A*)Y
+ hx = cp.hstack((v1_top.reshape(nz,-1), -v1_bot.reshape(nz,-1)))
+ return hx.get()
+
+ return vind, hdiag
+
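+# vind above applies the full TDHF (RPA) supermatrix
+#
+#     [ A   B ] [X]       [X]
+#     [-B* -A*] [Y] = w * [Y]
+#
+# without building A or B explicitly: a single response call on the combined
+# X/Y transition density yields both the top and bottom blocks.
+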
+
+class TDHF(TDBase):
+ __doc__ = tdhf_cpu.TDHF.__doc__
+
+ @lib.with_doc(gen_tdhf_operation.__doc__)
+ def gen_vind(self, mf=None):
+ if mf is None:
+ mf = self._scf
+ return gen_tdhf_operation(mf, singlet=self.singlet)
+
+ def init_guess(self, mf=None, nstates=None, wfnsym=None, return_symmetry=False):
+ x0 = TDA.init_guess(self, mf, nstates, wfnsym, return_symmetry)
+ y0 = np.zeros_like(x0)
+ return np.hstack([x0, y0])
+
+ def kernel(self, x0=None, nstates=None):
+ '''TDHF diagonalization with non-Hermitian eigenvalue solver
+ '''
+ log = logger.new_logger(self)
+ cpu0 = log.init_timer()
+ self.check_sanity()
+ self.dump_flags()
+ if nstates is None:
+ nstates = self.nstates
+ else:
+ self.nstates = nstates
+ mol = self.mol
+
+ vind, hdiag = self.gen_vind(self._scf)
+ precond = self.get_precond(hdiag)
+
+ # handle single kpt PBC SCF
+ if getattr(self._scf, 'kpt', None) is not None:
+ from pyscf.pbc.lib.kpts_helper import gamma_point
+ real_system = (gamma_point(self._scf.kpt) and
+ self._scf.mo_coeff[0].dtype == np.double)
+ else:
+ real_system = True
+
+ # We only need positive eigenvalues
+ def pickeig(w, v, nroots, envs):
+ realidx = np.where((abs(w.imag) < REAL_EIG_THRESHOLD) &
+ (w.real > self.positive_eig_threshold))[0]
+ # If the complex eigenvalue has small imaginary part, both the
+ # real part and the imaginary part of the eigenvector can
+ # approximately be used as the "real" eigen solutions.
+ return lib.linalg_helper._eigs_cmplx2real(w, v, realidx, real_system)
+
+ x0sym = None
+ if x0 is None:
+ x0 = self.init_guess()
+
+ self.converged, w, x1 = lr_eig(
+ vind, x0, precond, tol_residual=self.conv_tol, lindep=self.lindep,
+ nroots=nstates, x0sym=x0sym, pick=pickeig, max_cycle=self.max_cycle,
+ max_memory=self.max_memory, verbose=log)
+
+ nocc = mol.nelectron // 2
+ nmo = self._scf.mo_occ.size
+ nvir = nmo - nocc
+ self.e = w
+ def norm_xy(z):
+ x, y = z.reshape(2,nocc,nvir)
+ norm = lib.norm(x)**2 - lib.norm(y)**2
+ norm = np.sqrt(.5/norm) # normalize to 0.5 for alpha spin
+ return x*norm, y*norm
+ self.xy = [norm_xy(z) for z in x1]
+
+ log.timer('TDDFT', *cpu0)
+ self._finalize()
+ return self.e, self.xy
+
+TDRHF = TDHF
+
+scf.hf.RHF.TDA = lib.class_as_method(TDA)
+scf.hf.RHF.TDHF = lib.class_as_method(TDHF)
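+
+# Illustrative usage (a sketch, not part of this module):
+#
+#     import pyscf
+#     from gpu4pyscf import scf
+#     mol = pyscf.M(atom='H 0 0 0; F 0 0 0.917', basis='6-31g')
+#     mf = scf.RHF(mol).run()
+#     td = mf.TDA()                 # attached above via lib.class_as_method
+#     e, xy = td.kernel(nstates=3)  # lowest three singlet excitations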
diff --git a/gpu4pyscf/tdscf/rks.py b/gpu4pyscf/tdscf/rks.py
new file mode 100644
index 00000000..41971614
--- /dev/null
+++ b/gpu4pyscf/tdscf/rks.py
@@ -0,0 +1,155 @@
+#!/usr/bin/env python
+#
+# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import numpy as np
+import cupy as cp
+from pyscf import lib
+from pyscf.tdscf._lr_eig import eigh as lr_eigh
+from gpu4pyscf.dft.rks import KohnShamDFT
+from gpu4pyscf.lib.cupy_helper import contract, tag_array, transpose_sum
+from gpu4pyscf.lib import logger
+from gpu4pyscf.tdscf import rhf as tdhf_gpu
+from gpu4pyscf import dft
+
+__all__ = [
+ 'TDA', 'TDDFT', 'TDRKS', 'CasidaTDDFT', 'TDDFTNoHybrid',
+]
+
+TDA = tdhf_gpu.TDA
+TDDFT = tdhf_gpu.TDHF
+TDRKS = TDDFT
+
+class CasidaTDDFT(TDDFT):
+    '''Solve the Casida TDDFT equation (A-B)(A+B)(X+Y) = w^2 (X+Y)
+ '''
+
+ init_guess = TDA.init_guess
+
+ def gen_vind(self, mf=None):
+ if mf is None:
+ mf = self._scf
+ singlet = self.singlet
+ mo_coeff = mf.mo_coeff
+ assert mo_coeff.dtype == cp.double
+ mo_energy = mf.mo_energy
+ mo_occ = mf.mo_occ
+ occidx = mo_occ == 2
+ viridx = mo_occ == 0
+ orbv = mo_coeff[:,viridx]
+ orbo = mo_coeff[:,occidx]
+
+ e_ia = mo_energy[viridx] - mo_energy[occidx,None]
+ d_ia = e_ia ** .5
+ ed_ia = e_ia * d_ia
+ hdiag = e_ia.ravel() ** 2
+ hdiag = hdiag.get()
+ vresp = mf.gen_response(singlet=singlet, hermi=1)
+ nocc, nvir = e_ia.shape
+
+ def vind(zs):
+ zs = cp.asarray(zs).reshape(-1,nocc,nvir)
+ # *2 for double occupancy
+ mo1 = contract('xov,pv->xpo', zs*(d_ia*2), orbv)
+ dms = contract('xpo,qo->xpq', mo1, orbo)
+ # +cc for A+B and K_{ai,jb} in A == K_{ai,bj} in B
+ dms = transpose_sum(dms)
+ dms = tag_array(dms, mo1=mo1, occ_coeff=orbo)
+ v1ao = vresp(dms)
+ v1mo = contract('xpq,qo->xpo', v1ao, orbo)
+ v1mo = contract('xpo,pv->xov', v1mo, orbv)
+ v1mo += zs * ed_ia
+ v1mo *= d_ia
+ return v1mo.reshape(v1mo.shape[0],-1).get()
+
+ return vind, hdiag
+
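+    # vind above realizes the Casida eigenproblem in Hermitian form: for
+    # functionals with no exact exchange, A - B = D = diag(e_a - e_i), so
+    #
+    #     D^{1/2} (A+B) D^{1/2} Z = w^2 Z,   Z = D^{-1/2} (X+Y)
+    #
+    # which halves the problem dimension relative to the full TDDFT solver.
+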
+ def kernel(self, x0=None, nstates=None):
+ '''TDDFT diagonalization solver
+ '''
+ log = logger.new_logger(self)
+ cpu0 = log.init_timer()
+ mf = self._scf
+ if mf._numint.libxc.is_hybrid_xc(mf.xc):
+            raise RuntimeError('%s cannot be used with hybrid functionals'
+ % self.__class__)
+ self.check_sanity()
+ self.dump_flags()
+ if nstates is None:
+ nstates = self.nstates
+ else:
+ self.nstates = nstates
+
+ vind, hdiag = self.gen_vind(self._scf)
+ precond = self.get_precond(hdiag)
+
+ def pickeig(w, v, nroots, envs):
+ idx = np.where(w > self.positive_eig_threshold)[0]
+ return w[idx], v[:,idx], idx
+
+ x0sym = None
+ if x0 is None:
+ x0 = self.init_guess()
+
+ self.converged, w2, x1 = lr_eigh(
+ vind, x0, precond, tol_residual=self.conv_tol, lindep=self.lindep,
+ nroots=nstates, x0sym=x0sym, pick=pickeig, max_cycle=self.max_cycle,
+ max_memory=self.max_memory, verbose=log)
+
+ mo_energy = self._scf.mo_energy
+ mo_occ = self._scf.mo_occ
+ occidx = mo_occ == 2
+ viridx = mo_occ == 0
+ e_ia = mo_energy[viridx] - mo_energy[occidx,None]
+ e_ia = e_ia**.5
+ if isinstance(e_ia, cp.ndarray):
+ e_ia = e_ia.get()
+
+ def norm_xy(w, z):
+ zp = e_ia * z.reshape(e_ia.shape)
+ zm = w/e_ia * z.reshape(e_ia.shape)
+ x = (zp + zm) * .5
+ y = (zp - zm) * .5
+ norm = lib.norm(x)**2 - lib.norm(y)**2
+ norm = (.5/norm)**.5 # normalize to 0.5 for alpha spin
+ return (x*norm, y*norm)
+
+ idx = np.where(w2 > self.positive_eig_threshold)[0]
+ self.e = w2[idx]**.5
+ self.xy = [norm_xy(self.e[i], x1[i]) for i in idx]
+ log.timer('TDDFT', *cpu0)
+ self._finalize()
+ return self.e, self.xy
+
+ def nuc_grad_method(self):
+ from pyscf.grad import tdrks
+ return tdrks.Gradients(self)
+
+TDDFTNoHybrid = CasidaTDDFT
+
+def tddft(mf):
+ '''Driver to create TDDFT or CasidaTDDFT object'''
+ if mf._numint.libxc.is_hybrid_xc(mf.xc):
+ return TDDFT(mf)
+ else:
+ return CasidaTDDFT(mf)
+
+dft.rks.RKS.TDA = lib.class_as_method(TDA)
+dft.rks.RKS.TDHF = None
+#dft.rks.RKS.TDDFT = lib.class_as_method(TDDFT)
+dft.rks.RKS.TDDFTNoHybrid = lib.class_as_method(TDDFTNoHybrid)
+dft.rks.RKS.CasidaTDDFT = lib.class_as_method(CasidaTDDFT)
+dft.rks.RKS.TDDFT = tddft
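+
+# Illustrative usage of the driver (a sketch; assumes a pyscf Mole named mol,
+# not part of this module):
+#
+#     from gpu4pyscf import dft
+#     mf = dft.RKS(mol, xc='pbe').run()     # pure functional
+#     td = tddft(mf)                        # dispatches to CasidaTDDFT
+#     mf = dft.RKS(mol, xc='b3lyp').run()   # hybrid functional
+#     td = tddft(mf)                        # dispatches to the full TDDFT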
diff --git a/gpu4pyscf/tdscf/tests/test_sftddft.py b/gpu4pyscf/tdscf/tests/test_sftddft.py
new file mode 100644
index 00000000..0358fb3a
--- /dev/null
+++ b/gpu4pyscf/tdscf/tests/test_sftddft.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python
+#
+# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import unittest
+import numpy as np
+import cupy as cp
+from pyscf import lib, gto, scf
+from gpu4pyscf import tdscf
+try:
+ import mcfun
+except ImportError:
+ mcfun = None
+
+class KnownValues(unittest.TestCase):
+ @classmethod
+ def setUpClass(cls):
+ mol = gto.Mole()
+ mol.verbose = 5
+ mol.output = '/dev/null'
+ mol.atom = '''
+ O 0. 0. 0.
+ H 0. -0.757 0.587
+ H 0. 0.757 0.587'''
+ mol.spin = 2
+ mol.basis = '631g'
+ cls.mol = mol.build()
+ cls.mf = mol.UHF().to_gpu().run()
+
+ @classmethod
+ def tearDownClass(cls):
+ cls.mol.stdout.close()
+
+ def test_tda(self):
+ mf = self.mf
+ # sftddft not available in pyscf main branch. References are created
+ # using the sftda module from pyscf-forge
+ ref = [ 0.46644071, 0.55755649, 1.05310518]
+ td = mf.SFTDA().run(extype=0, conv_tol=1e-7)
+ self.assertAlmostEqual(abs(td.e - ref).max(), 0, 6)
+
+ ref = [-0.21574567, 0.00270390, 0.03143914]
+ td = mf.SFTDA().run(extype=1, conv_tol=1e-7)
+ self.assertAlmostEqual(abs(td.e - ref).max(), 0, 6)
+
+ @unittest.skipIf(mcfun is None, 'MCfun not available')
+ def test_mcol_b3lyp_tda(self):
+ mf = self.mf
+ # sftddft not available in pyscf main branch. References are created
+ # using the sftda module from pyscf-forge
+ ref = [ 0.45941171, 0.57799552, 1.06629265]
+ td = mf.SFTDA().run(collinear='mcol', extype=0, conv_tol=1e-7)
+ self.assertAlmostEqual(abs(td.e - ref).max(), 0, 6)
+
+ ref = [-0.29629139, 0.00067017, 0.01956306]
+ td = mf.SFTDA().run(collinear='mcol', extype=1, conv_tol=1e-7)
+ self.assertAlmostEqual(abs(td.e - ref).max(), 0, 6)
+
+ @unittest.skip('Numerical issues encountered in non-hermitian diagonalization')
+ def test_tdhf(self):
+ mf = self.mf
+ ref = [1.74385401, 9.38227395, 14.90168875]
+ td = mf.SFTDHF().run(extype=0, conv_tol=1e-7)
+ self.assertAlmostEqual(abs(td.e - ref).max(), 0, 6)
+
+ ref = [0.41701647, 9.59644331, 22.99972711]
+ td = mf.SFTDHF().run(extype=1, conv_tol=1e-7)
+ self.assertAlmostEqual(abs(td.e - ref).max(), 0, 6)
+
+if __name__ == "__main__":
+ print("Full Tests for spin-flip-TDA and spin-flip-TDDFT")
+ unittest.main()
diff --git a/gpu4pyscf/tdscf/tests/test_tdrhf.py b/gpu4pyscf/tdscf/tests/test_tdrhf.py
new file mode 100644
index 00000000..3ebc0372
--- /dev/null
+++ b/gpu4pyscf/tdscf/tests/test_tdrhf.py
@@ -0,0 +1,156 @@
+#!/usr/bin/env python
+#
+# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import unittest
+import numpy as np
+import cupy as cp
+from pyscf import lib, gto, scf
+from gpu4pyscf import tdscf
+
+class KnownValues(unittest.TestCase):
+ @classmethod
+ def setUpClass(cls):
+ mol = gto.Mole()
+ mol.verbose = 7
+ mol.output = '/dev/null'
+ mol.atom = [
+ ['H' , (0. , 0. , .917)],
+ ['F' , (0. , 0. , 0.)], ]
+ mol.basis = '631g'
+ mol.symmetry = True
+ cls.mol = mol.build()
+ cls.mf = mf = scf.RHF(mol).to_gpu().run()
+ cls.df_mf = mf.density_fit().run()
+ cls.nstates = 5 # make sure first 3 states are converged
+
+ @classmethod
+ def tearDownClass(cls):
+ cls.mol.stdout.close()
+
+ def test_tda_singlet(self):
+ mf = self.mf
+ nstates = self.nstates
+ td = mf.TDA().set(nstates=nstates)
+ assert td.device == 'gpu'
+ e = td.kernel()[0]
+ ref = [11.9027511, 11.9027511, 16.8603101]
+ self.assertAlmostEqual(abs(e[:len(ref)] * 27.2114 - ref).max(), 0, 5)
+ dip = td.transition_dipole()
+ self.assertAlmostEqual(lib.fp(np.linalg.norm(dip, axis=1)), -0.65616659, 5)
+
+ df_mf = self.df_mf
+ td = df_mf.TDA().set(nstates=nstates)
+ e = td.kernel()[0]
+ ref = td.to_cpu().kernel()[0][:3]
+ self.assertAlmostEqual(abs(e[:len(ref)] - ref).max(), 0, 7)
+ dip = td.transition_dipole()
+ self.assertAlmostEqual(lib.fp(np.linalg.norm(dip, axis=1)), -0.65618093, 5)
+
+ def test_tda_triplet(self):
+ mf = self.mf
+ nstates = self.nstates
+ td = mf.TDA().set(nstates=nstates)
+ assert td.device == 'gpu'
+ td.singlet = False
+ e = td.kernel()[0]
+ ref = [11.0174650, 11.0174650, 13.1694960]
+ self.assertAlmostEqual(abs(e[:len(ref)] * 27.2114 - ref).max(), 0, 5)
+ dip = td.transition_dipole()
+ self.assertAlmostEqual(abs(dip).max(), 0, 8)
+
+ df_mf = self.df_mf
+ td = df_mf.TDA().set(nstates=nstates)
+ td.singlet = False
+ e = td.kernel()[0]
+ ref = td.to_cpu().kernel()[0][:3]
+ self.assertAlmostEqual(abs(e[:len(ref)] - ref).max(), 0, 7)
+ dip = td.transition_dipole()
+ self.assertAlmostEqual(abs(dip).max(), 0, 8)
+
+ def test_tdhf_singlet(self):
+ mf = self.mf
+ nstates = self.nstates
+ td = mf.TDHF().set(nstates=nstates)
+ assert td.device == 'gpu'
+ e = td.kernel()[0]
+ ref = [11.8348584, 11.8348584, 16.6630381]
+ self.assertAlmostEqual(abs(e[:len(ref)] * 27.2114 - ref).max(), 0, 5)
+ dip = td.transition_dipole()
+ self.assertAlmostEqual(lib.fp(np.linalg.norm(dip, axis=1)), -0.64009191, 5)
+
+ df_mf = self.df_mf
+ td = df_mf.TDHF().set(nstates=nstates)
+ e = td.kernel()[0]
+ ref = td.to_cpu().kernel()[0][:3]
+ self.assertAlmostEqual(abs(e[:len(ref)] - ref).max(), 0, 7)
+ dip = td.transition_dipole()
+ self.assertAlmostEqual(lib.fp(np.linalg.norm(dip, axis=1)), -0.64011895, 5)
+
+ def test_tdhf_triplet(self):
+ mf = self.mf
+ nstates = self.nstates
+ td = mf.TDHF().set(nstates=nstates)
+ assert td.device == 'gpu'
+ td.singlet = False
+ e = td.kernel()[0]
+ ref = [10.8919091, 10.8919091, 12.6343507]
+ self.assertAlmostEqual(abs(e[:len(ref)] * 27.2114 - ref).max(), 0, 5)
+ dip = td.transition_dipole()
+ self.assertAlmostEqual(abs(dip).max(), 0, 8)
+
+ df_mf = self.df_mf
+ td = df_mf.TDHF().set(nstates=nstates)
+ td.singlet = False
+ e = td.kernel()[0]
+ ref = td.to_cpu().kernel()[0][:3]
+ self.assertAlmostEqual(abs(e[:len(ref)] - ref).max(), 0, 7)
+ dip = td.transition_dipole()
+ self.assertAlmostEqual(abs(dip).max(), 0, 8)
+
+ def test_tda_vind(self):
+ mf = self.mf
+ nocc = self.mol.nelectron // 2
+ nmo = mf.mo_energy.size
+ nvir = nmo - nocc
+ zs = np.random.rand(3,nocc,nvir)
+ ref = mf.to_cpu().TDA().set(singlet=False).gen_vind()[0](zs)
+ dat = mf.TDA().set(singlet=False).gen_vind()[0](cp.asarray(zs))
+ self.assertAlmostEqual(abs(ref - dat).max(), 0, 9)
+
+ df_mf = self.df_mf
+ ref = df_mf.to_cpu().TDA().set(singlet=True).gen_vind()[0](zs)
+ dat = df_mf.TDA().set(singlet=True).gen_vind()[0](cp.asarray(zs))
+ self.assertAlmostEqual(abs(ref - dat).max(), 0, 9)
+
+ def test_tdhf_vind(self):
+ mf = self.mf
+ nocc = self.mol.nelectron // 2
+ nmo = mf.mo_energy.size
+ nvir = nmo - nocc
+ zs = np.random.rand(3,2,nocc,nvir)
+ ref = mf.to_cpu().TDHF().set(singlet=True).gen_vind()[0](zs)
+ dat = mf.TDHF().set(singlet=True).gen_vind()[0](cp.asarray(zs))
+ self.assertAlmostEqual(abs(ref - dat).max(), 0, 9)
+
+ df_mf = self.df_mf
+ ref = df_mf.to_cpu().TDHF().set(singlet=False).gen_vind()[0](zs)
+ dat = df_mf.TDHF().set(singlet=False).gen_vind()[0](cp.asarray(zs))
+ self.assertAlmostEqual(abs(ref - dat).max(), 0, 9)
+
+if __name__ == "__main__":
+ print("Full Tests for rhf-TDA and rhf-TDHF")
+ unittest.main()
diff --git a/gpu4pyscf/tdscf/tests/test_tdrks.py b/gpu4pyscf/tdscf/tests/test_tdrks.py
new file mode 100644
index 00000000..c113c1bd
--- /dev/null
+++ b/gpu4pyscf/tdscf/tests/test_tdrks.py
@@ -0,0 +1,282 @@
+#!/usr/bin/env python
+#
+# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import unittest
+import numpy as np
+import cupy as cp
+from pyscf import lib, gto
+from gpu4pyscf import tdscf
+
+class KnownValues(unittest.TestCase):
+ @classmethod
+ def setUpClass(cls):
+ mol = gto.Mole()
+ mol.verbose = 5
+ mol.output = '/dev/null'
+ mol.atom = [
+ ['H' , (0. , 0. , .917)],
+ ['F' , (0. , 0. , 0.)], ]
+ mol.basis = '631g'
+ cls.mol = mol.build()
+
+ cls.mf = mf = mol.RHF().to_gpu().run()
+ cls.td_hf = mf.TDHF().run(conv_tol=1e-6)
+
+ mf_lda = mol.RKS().to_gpu().density_fit()
+ mf_lda.xc = 'lda, vwn'
+ mf_lda.grids.prune = None
+ mf_lda.cphf_grids = mf_lda.grids
+ cls.mf_lda = mf_lda.run(conv_tol=1e-10)
+
+ mf_bp86 = mol.RKS().to_gpu().density_fit()
+ mf_bp86.xc = 'b88,p86'
+ mf_bp86.grids.prune = None
+ mf_bp86.cphf_grids = mf_bp86.grids
+ cls.mf_bp86 = mf_bp86.run(conv_tol=1e-10)
+
+ mf_b3lyp = mol.RKS().to_gpu().density_fit()
+ mf_b3lyp.xc = 'b3lyp5'
+ mf_b3lyp.grids.prune = None
+ mf_b3lyp.cphf_grids = mf_b3lyp.grids
+ cls.mf_b3lyp = mf_b3lyp.run(conv_tol=1e-10)
+
+ mf_m06l = mol.RKS().to_gpu().density_fit()
+ mf_m06l.xc = 'm06l'
+ mf_m06l.cphf_grids = mf_m06l.grids
+ cls.mf_m06l = mf_m06l.run(conv_tol=1e-10)
+
+ @classmethod
+ def tearDownClass(cls):
+ cls.mol.stdout.close()
+
+ def test_nohbrid_lda(self):
+ mf_lda = self.mf_lda
+ td = mf_lda.CasidaTDDFT()
+ assert td.device == 'gpu'
+ es = td.kernel(nstates=5)[0]
+ ref = td.to_cpu().kernel(nstates=5)[0]
+ self.assertAlmostEqual(abs(es - ref).max(), 0, 5)
+ self.assertAlmostEqual(lib.fp(es), -1.5103950945691957, 5)
+
+ def test_nohbrid_b88p86(self):
+ mf_bp86 = self.mf_bp86
+ td = mf_bp86.CasidaTDDFT()
+ assert td.device == 'gpu'
+ es = td.kernel(nstates=5)[0]
+ ref = td.to_cpu().kernel()[0]
+ self.assertAlmostEqual(abs(es - ref).max(), 0, 8)
+ self.assertAlmostEqual(lib.fp(es), -1.4869180666784665, 6)
+
+ def test_tddft_lda(self):
+ mf_lda = self.mf_lda
+ td = mf_lda.TDDFT()
+ assert td.device == 'gpu'
+ es = td.kernel(nstates=5)[0]
+ ref = td.to_cpu().kernel(nstates=5)[0]
+ self.assertAlmostEqual(abs(es - ref).max(), 0, 8)
+ self.assertAlmostEqual(lib.fp(es), -1.5103950945691957, 6)
+
+ def test_tddft_b88p86(self):
+ mf_bp86 = self.mf_bp86
+ td = mf_bp86.TDDFT()
+ assert td.device == 'gpu'
+ td.conv_tol = 1e-5
+ es = td.kernel(nstates=5)[0]
+ ref = td.to_cpu().kernel(nstates=5)[0]
+ self.assertAlmostEqual(abs(es - ref).max(), 0, 8)
+ self.assertAlmostEqual(lib.fp(es), -1.4869180666784665, 6)
+
+ def test_tddft_b3lyp(self):
+ mf_b3lyp = self.mf_b3lyp
+ td = mf_b3lyp.TDDFT()
+ assert td.device == 'gpu'
+ es = td.kernel(nstates=5)[0]
+ ref = td.to_cpu().kernel(nstates=5)[0]
+ self.assertAlmostEqual(abs(es - ref).max(), 0, 8)
+ self.assertAlmostEqual(lib.fp(es), -1.5175884245769546, 6)
+
+ def test_tddft_camb3lyp(self):
+ mol = self.mol
+ mf = mol.RKS(xc='camb3lyp').run()
+ mf.cphf_grids = mf.grids
+ td = mf.TDDFT().to_gpu()
+ assert td.device == 'gpu'
+ td.conv_tol = 1e-5
+ es = td.kernel(nstates=4)[0]
+ e_ref = td.to_cpu().kernel(nstates=4)[0]
+ self.assertAlmostEqual(abs(es[:3]-e_ref[:3]).max(), 0, 8)
+ self.assertAlmostEqual(lib.fp(es[:3]*27.2114), 9.00540521503348, 6)
+
+ def test_tda_b3lypg(self):
+ mol = self.mol
+ mf = mol.RKS()
+ mf.xc = 'b3lypg'
+ mf.grids.prune = None
+ mf.cphf_grids = mf.grids
+ mf.scf()
+ td = mf.TDA().to_gpu()
+ assert td.device == 'gpu'
+ es = td.kernel(nstates=5)[0]
+ ref = td.to_cpu().kernel(nstates=5)[0]
+ self.assertAlmostEqual(abs(es - ref).max(), 0, 8)
+ self.assertAlmostEqual(lib.fp(es), -1.520888995669812, 6)
+
+ def test_tda_lda(self):
+ mf_lda = self.mf_lda
+ td = mf_lda.TDA()
+ assert td.device == 'gpu'
+ es = td.kernel(nstates=5)[0]
+ ref = td.to_cpu().kernel(nstates=5)[0]
+ self.assertAlmostEqual(abs(es - ref).max(), 0, 8)
+ self.assertAlmostEqual(lib.fp(es), -1.5141057378565799, 6)
+
+ def test_tda_b3lyp_triplet(self):
+ mf_b3lyp = self.mf_b3lyp
+ td = mf_b3lyp.TDA()
+ assert td.device == 'gpu'
+ td.singlet = False
+ es = td.kernel(nstates=5)[0]
+ ref = td.to_cpu().kernel(nstates=5)[0]
+ self.assertAlmostEqual(abs(es - ref).max(), 0, 8)
+ self.assertAlmostEqual(lib.fp(es), -1.4707787881198082, 6)
+ td.analyze()
+
+ def test_tda_lda_triplet(self):
+ mf_lda = self.mf_lda
+ td = mf_lda.TDA()
+ assert td.device == 'gpu'
+ td.singlet = False
+ es = td.kernel(nstates=6)[0]
+ ref = td.to_cpu().kernel(nstates=6)[0]
+ self.assertAlmostEqual(abs(es - ref).max(), 0, 8)
+ self.assertAlmostEqual(lib.fp(es[[0,1,2,4,5]]), -1.4695846533898422, 6)
+
+ def test_tddft_b88p86_triplet(self):
+ mf_bp86 = self.mf_bp86
+ td = mf_bp86.TDDFT()
+ assert td.device == 'gpu'
+ td.singlet = False
+ es = td.kernel(nstates=5)[0]
+ ref = td.to_cpu().kernel(nstates=5)[0]
+ self.assertAlmostEqual(abs(es - ref).max(), 0, 8)
+ self.assertAlmostEqual(lib.fp(es), -1.4412243124430528, 6)
+
+ def test_tda_rsh(self):
+ mol = gto.M(atom='H 0 0 0.6; H 0 0 0', basis = "6-31g")
+ mf = mol.RKS()
+ mf.xc = 'wb97'
+ mf.kernel()
+ mf.cphf_grids = mf.grids
+ td = mf.TDA().to_gpu()
+ assert td.device == 'gpu'
+ e_td = td.set(nstates=5).kernel()[0]
+ ref = td.to_cpu().kernel(nstates=5)[0]
+ self.assertAlmostEqual(abs(e_td - ref).max(), 0, 8)
+ self.assertAlmostEqual(lib.fp(e_td), 0.3953917940299652, 6)
+
+ def test_tda_m06l_singlet(self):
+ mf_m06l = self.mf_m06l
+ td = mf_m06l.TDA()
+ assert td.device == 'gpu'
+ es = td.kernel(nstates=5)[0]
+ ref = td.to_cpu().kernel(nstates=5)[0]
+ self.assertAlmostEqual(abs(es - ref).max(), 0, 8)
+ self.assertAlmostEqual(lib.fp(es), -1.5620823865741496, 6)
+
+ def test_analyze(self):
+ td_hf = self.td_hf
+ assert td_hf.device == 'gpu'
+ f = td_hf.oscillator_strength(gauge='length')
+ self.assertAlmostEqual(lib.fp(f), -0.13908774016795605, 5)
+ f = td_hf.oscillator_strength(gauge='velocity', order=2)
+ self.assertAlmostEqual(lib.fp(f), -0.096991134490587522, 5)
+
+ note_args = []
+ def temp_logger_note(rec, msg, *args):
+ note_args.append(args)
+ with lib.temporary_env(lib.logger.Logger, note=temp_logger_note):
+ td_hf.analyze()
+ ref = [(),
+ (1, 11.834865910142547, 104.76181013351982, 0.01075359074556743),
+ (2, 11.834865910142618, 104.76181013351919, 0.010753590745567499),
+ (3, 16.66308427853695, 74.40651170629978, 0.3740302871966713)]
+ self.assertAlmostEqual(abs(np.hstack(ref) -
+ np.hstack(note_args)).max(), 0, 3)
+
+ self.assertEqual(td_hf.nroots, td_hf.nstates)
+ mf = self.mf
+ self.assertAlmostEqual(lib.fp(td_hf.e_tot-mf.e_tot), 0.41508325757603637, 5)
+
+ def test_scanner(self):
+ mol = self.mol
+ td_hf = self.td_hf
+ td_scan = td_hf.as_scanner().as_scanner()
+ td_scan.nroots = 3
+ td_scan(mol)
+ self.assertAlmostEqual(lib.fp(td_scan.e), 0.41508325757603637, 5)
+
+ def test_transition_multipoles(self):
+ td_hf = self.td_hf
+ self.assertAlmostEqual(abs(lib.fp(td_hf.transition_dipole() [2])), 0.39833021312014988, 4)
+ self.assertAlmostEqual(abs(lib.fp(td_hf.transition_quadrupole() [2])), 0.14862776196563565, 4)
+ self.assertAlmostEqual(abs(lib.fp(td_hf.transition_octupole() [2])), 2.79058994496489410, 4)
+ self.assertAlmostEqual(abs(lib.fp(td_hf.transition_velocity_dipole() [2])), 0.24021409469918567, 4)
+ self.assertAlmostEqual(abs(lib.fp(td_hf.transition_magnetic_dipole() [2])), 0 , 4)
+ self.assertAlmostEqual(abs(lib.fp(td_hf.transition_magnetic_quadrupole()[2])), 0.16558596265719450, 4)
+
+ def test_reset(self):
+ mol1 = gto.M(atom='C')
+ mol = self.mol
+ td = mol.RHF().newton().TDHF().to_gpu()
+ assert td.device == 'gpu'
+ td.reset(mol1)
+ self.assertTrue(td.mol is mol1)
+ self.assertTrue(td._scf.mol is mol1)
+
+ def test_tda_vind(self):
+ mf = self.mf_bp86
+ nocc = self.mol.nelectron // 2
+ nmo = mf.mo_energy.size
+ nvir = nmo - nocc
+ zs = np.random.rand(3,nocc,nvir)
+ ref = mf.to_cpu().TDA().set(singlet=False).gen_vind()[0](zs)
+ dat = mf.TDA().set(singlet=False).gen_vind()[0](cp.asarray(zs))
+ self.assertAlmostEqual(abs(ref - dat).max(), 0, 9)
+
+ def test_tddft_vind(self):
+ mf = self.mf_b3lyp
+ nocc = self.mol.nelectron // 2
+ nmo = mf.mo_energy.size
+ nvir = nmo - nocc
+ zs = np.random.rand(3,2,nocc,nvir)
+ ref = mf.to_cpu().TDDFT().set(singlet=True).gen_vind()[0](zs)
+ dat = mf.TDDFT().set(singlet=True).gen_vind()[0](cp.asarray(zs))
+ self.assertAlmostEqual(abs(ref - dat).max(), 0, 9)
+
+ def test_casida_tddft_vind(self):
+ mf = self.mf_lda
+ nocc = self.mol.nelectron // 2
+ nmo = mf.mo_energy.size
+ nvir = nmo - nocc
+ zs = np.random.rand(3,nocc,nvir)
+ ref = mf.to_cpu().CasidaTDDFT().set().gen_vind()[0](zs)
+ dat = mf.CasidaTDDFT().set().gen_vind()[0](cp.asarray(zs))
+ self.assertAlmostEqual(abs(ref - dat).max(), 0, 9)
+
+if __name__ == "__main__":
+ print("Full Tests for TD-RKS")
+ unittest.main()
diff --git a/gpu4pyscf/tdscf/tests/test_tduhf.py b/gpu4pyscf/tdscf/tests/test_tduhf.py
new file mode 100644
index 00000000..2b6c2df9
--- /dev/null
+++ b/gpu4pyscf/tdscf/tests/test_tduhf.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python
+#
+# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import unittest
+import numpy as np
+import cupy as cp
+from pyscf import lib, gto, scf
+from gpu4pyscf import tdscf
+
+class KnownValues(unittest.TestCase):
+ @classmethod
+ def setUpClass(cls):
+ mol = gto.Mole()
+ mol.verbose = 0
+ mol.atom = [
+ ['H' , (0. , 0. , .917)],
+ ['F' , (0. , 0. , 0.)], ]
+ mol.basis = '631g'
+ # FIXME: mo_coeff of uhf_symm.SymAdaptedUHF not converted to cupy arrays
+ mol.symmetry = True
+ cls.mol = mol.build()
+ cls.mf = scf.UHF(mol).density_fit().run(conv_tol=1e-10).to_gpu()
+
+ mol1 = gto.Mole()
+ mol1.verbose = 7
+ mol1.output = '/dev/null'
+ mol1.atom = [
+ ['H' , (0. , 0. , .917)],
+ ['F' , (0. , 0. , 0.)], ]
+ mol1.basis = '631g'
+ mol1.spin = 2
+ cls.mol1 = mol1.build()
+ cls.mf1 = scf.UHF(mol1).run(conv_tol=1e-10).to_gpu()
+
+ @classmethod
+ def tearDownClass(cls):
+ cls.mol1.stdout.close()
+
+ def test_tda(self):
+ mf = self.mf
+ td = mf.TDA()
+ assert td.device == 'gpu'
+ td.nstates = 5
+ e = td.kernel()[0]
+ ref = [11.0179839, 11.0179839, 11.9031214, 11.9031214, 13.1701375]
+ self.assertAlmostEqual(abs(e * 27.2114 - ref).max(), 0, 4)
+ ref = td.to_cpu().kernel()[0]
+ self.assertAlmostEqual(abs(e - ref).max(), 0, 4)
+
+ def test_tdhf(self):
+ mf = self.mf
+ td = mf.TDHF()
+ assert td.device == 'gpu'
+ td.nstates = 5
+ td.conv_tol = 1e-5
+ e = td.kernel()[0]
+ ref = [10.8924334, 10.8924334, 11.8352278, 11.8352278, 12.6350840]
+ self.assertAlmostEqual(abs(e * 27.2114 - ref).max(), 0, 4)
+ ref = td.to_cpu().kernel()[0]
+ self.assertAlmostEqual(abs(e - ref).max(), 0, 4)
+
+ def test_tda1(self):
+ mf1 = self.mf1
+ td = mf1.TDA()
+ assert td.device == 'gpu'
+ td.nstates = 5
+ e = td.kernel()[0]
+ ref = [ 3.3211349, 18.5597821, 21.0147390, 21.6150240, 25.0938938]
+ self.assertAlmostEqual(abs(e * 27.2114 - ref).max(), 0, 4)
+ ref = td.to_cpu().kernel()[0]
+ self.assertAlmostEqual(abs(e - ref).max(), 0, 4)
+
+ def test_tdhf1(self):
+ mf1 = self.mf1
+ td = mf1.TDHF()
+ assert td.device == 'gpu'
+ td.nstates = 4
+ e = td.kernel()[0]
+ ref = [ 3.3126683, 18.4954862, 20.8493515, 21.5480882,]
+ self.assertAlmostEqual(abs(e * 27.2114 - ref).max(), 0, 4)
+ ref = td.to_cpu().kernel()[0]
+ self.assertAlmostEqual(abs(e - ref).max(), 0, 4)
+
+ def test_tda_vind(self):
+ mf = self.mf1
+ nocca, noccb = mf.nelec
+ nmo = mf.mo_energy[0].size
+ nvira = nmo - nocca
+ nvirb = nmo - noccb
+ zs = np.random.rand(3,nocca*nvira+noccb*nvirb)
+ ref = mf.to_cpu().TDA().set().gen_vind()[0](zs)
+ dat = mf.TDA().set().gen_vind()[0](cp.asarray(zs))
+ self.assertAlmostEqual(abs(ref - dat).max(), 0, 9)
+
+ def test_tdhf_vind(self):
+ mf = self.mf1
+ nocca, noccb = mf.nelec
+ nmo = mf.mo_energy[0].size
+ nvira = nmo - nocca
+ nvirb = nmo - noccb
+ zs = np.random.rand(3,2,nocca*nvira+noccb*nvirb)
+ ref = mf.to_cpu().TDHF().set().gen_vind()[0](zs)
+ dat = mf.TDHF().set().gen_vind()[0](cp.asarray(zs))
+ self.assertAlmostEqual(abs(ref - dat).max(), 0, 9)
+
+if __name__ == "__main__":
+ print("Full Tests for uhf-TDA and uhf-TDHF")
+ unittest.main()
diff --git a/gpu4pyscf/tdscf/tests/test_tduks.py b/gpu4pyscf/tdscf/tests/test_tduks.py
new file mode 100644
index 00000000..598e4156
--- /dev/null
+++ b/gpu4pyscf/tdscf/tests/test_tduks.py
@@ -0,0 +1,220 @@
+#!/usr/bin/env python
+#
+# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import unittest
+import numpy as np
+import cupy as cp
+from pyscf import lib, gto
+from gpu4pyscf import tdscf
+
+class KnownValues(unittest.TestCase):
+ @classmethod
+ def setUpClass(cls):
+ mol = gto.Mole()
+ mol.verbose = 5
+ mol.output = '/dev/null'
+ mol.atom = '''
+ O 0. 0. 0.
+ H 0. -0.757 0.587
+ H 0. 0.757 0.587'''
+ mol.spin = 2
+ mol.basis = '631g'
+ cls.mol = mol.build()
+
+ mol1 = gto.Mole()
+ mol1.verbose = 0
+ mol1.atom = '''
+ O 0. 0. 0.
+ H 0. -0.757 0.587
+ H 0. 0.757 0.587'''
+ mol1.basis = '631g'
+ cls.mol1 = mol1.build()
+
+ cls.mf_uhf = mf_uhf = mol.UHF().to_gpu().run()
+ cls.td_hf = mf_uhf.TDHF().run(conv_tol=1e-6)
+
+ mf_lda = mol.UKS().set(xc='lda', conv_tol=1e-12).to_gpu()
+ mf_lda.grids.prune = None
+ mf_lda.cphf_grids = mf_lda.grids
+ cls.mf_lda = mf_lda.density_fit().run()
+
+ mf_bp86 = mol.UKS().set(xc='b88,p86', conv_tol=1e-12).to_gpu()
+ mf_bp86.grids.prune = None
+ mf_bp86.cphf_grids = mf_bp86.grids
+ cls.mf_bp86 = mf_bp86.density_fit().run()
+
+ mf_b3lyp = mol.UKS().set(xc='b3lyp5', conv_tol=1e-12).to_gpu()
+ mf_b3lyp.grids.prune = None
+ mf_b3lyp.cphf_grids = mf_b3lyp.grids
+ cls.mf_b3lyp = mf_b3lyp.density_fit().run()
+
+ mf_m06l = mol.UKS().to_gpu().density_fit().run(xc='m06l')
+ mf_m06l.cphf_grids = mf_m06l.grids
+ cls.mf_m06l = mf_m06l
+
+ @classmethod
+ def tearDownClass(cls):
+ cls.mol.stdout.close()
+
+ def test_nohybrid_lda(self):
+ mf_lda = self.mf_lda
+ td = mf_lda.CasidaTDDFT()
+ assert td.device == 'gpu'
+ es = td.kernel(nstates=4)[0]
+ e_ref = td.to_cpu().kernel(nstates=4)[0]
+ self.assertAlmostEqual(abs(es[:3]-e_ref[:3]).max(), 0, 8)
+ self.assertAlmostEqual(lib.fp(es[:3]), 0.0476763425122965, 6)
+
+ mol1 = self.mol1
+ mf = mol1.UKS().run(xc='lda, vwn_rpa')
+ mf.cphf_grids = mf.grids
+ td = mf.CasidaTDDFT().to_gpu()
+ assert td.device == 'gpu'
+ td.nstates = 5
+ es = td.kernel()[0]
+ ref = td.to_cpu().kernel()[0]
+ self.assertAlmostEqual(abs(es - ref).max(), 0, 8)
+
+ def test_nohybrid_b88p86(self):
+ mf_bp86 = self.mf_bp86
+ td = mf_bp86.CasidaTDDFT()
+ assert td.device == 'gpu'
+ es = td.kernel(nstates=4)[0]
+ e_ref = td.to_cpu().kernel(nstates=4)[0]
+ self.assertAlmostEqual(abs(es[:3]-e_ref[:3]).max(), 0, 8)
+ self.assertAlmostEqual(lib.fp(es[:3]), 0.05383891686210346, 6)
+
+ def test_tddft_lda(self):
+ mf_lda = self.mf_lda
+ td = mf_lda.TDDFT()
+ assert td.device == 'gpu'
+ es = td.kernel(nstates=4)[0]
+ ref = td.to_cpu().kernel(nstates=4)[0]
+ self.assertAlmostEqual(abs(es - ref).max(), 0, 8)
+ self.assertAlmostEqual(lib.fp(es[:3]), 0.0476763425122965, 6)
+
+ def test_tddft_b88p86(self):
+ mf_bp86 = self.mf_bp86
+ td = mf_bp86.TDDFT()
+ assert td.device == 'gpu'
+ es = td.kernel(nstates=5)[0]
+ ref = td.to_cpu().kernel(nstates=5)[0]
+ self.assertAlmostEqual(abs(es - ref).max(), 0, 8)
+ self.assertAlmostEqual(lib.fp(es[:3]), 0.05383891686259823, 6)
+
+ mol1 = self.mol1
+ mf = mol1.UKS().run(xc='b88,p86')
+ mf.cphf_grids = mf.grids
+ td = mf.TDDFT().to_gpu()
+ assert td.device == 'gpu'
+ es = td.kernel(nstates=5)[0]
+ ref = td.to_cpu().kernel(nstates=5)[0]
+ self.assertAlmostEqual(abs(es - ref).max(), 0, 8)
+
+ def test_tddft_b3lyp(self):
+ mf_b3lyp = self.mf_b3lyp
+ td = mf_b3lyp.TDDFT()
+ assert td.device == 'gpu'
+ es = td.kernel(nstates=4)[0]
+ ref = td.to_cpu().kernel(nstates=4)[0]
+ self.assertAlmostEqual(abs(es - ref).max(), 0, 8)
+ self.assertAlmostEqual(lib.fp(es[:3]), 0.047793873508724743, 6)
+
+ def test_tddft_camb3lyp(self):
+ mol1 = self.mol1
+ mf = mol1.UKS(xc='camb3lyp').run()
+ mf.cphf_grids = mf.grids
+ td = mf.TDDFT().to_gpu()
+ assert td.device == 'gpu'
+ es = td.kernel(nstates=4)[0]
+ e_ref = td.to_cpu().kernel(nstates=4)[0]
+ self.assertAlmostEqual(abs(es[:3]-e_ref[:3]).max(), 0, 8)
+ self.assertAlmostEqual(lib.fp(es[:3]), 0.2827429269753051, 6)
+
+ def test_tda_b3lyp(self):
+ mf_b3lyp = self.mf_b3lyp
+ td = mf_b3lyp.TDA()
+ assert td.device == 'gpu'
+ es = td.kernel(nstates=4)[0]
+ ref = td.to_cpu().kernel(nstates=4)[0]
+ self.assertAlmostEqual(abs(es - ref).max(), 0, 8)
+ self.assertAlmostEqual(lib.fp(es[:3]), 0.052638024165134974, 6)
+
+ def test_tda_lda(self):
+ mf_lda = self.mf_lda
+ td = mf_lda.TDA()
+ assert td.device == 'gpu'
+ es = td.kernel(nstates=5)[0]
+ ref = td.to_cpu().kernel(nstates=5)[0]
+ self.assertAlmostEqual(abs(es - ref).max(), 0, 8)
+ self.assertAlmostEqual(lib.fp(es[:3]), 0.05368082550881462, 6)
+
+ mol1 = self.mol1
+ mf = mol1.UKS().run(xc='lda,vwn')
+ mf.cphf_grids = mf.grids
+ td = mf.TDA().to_gpu()
+ assert td.device == 'gpu'
+ td.nstates = 5
+ es = td.kernel()[0]
+ ref = td.to_cpu().kernel()[0]
+ self.assertAlmostEqual(abs(es - ref).max(), 0, 8)
+
+ def test_tda_m06l(self):
+ mf_m06l = self.mf_m06l
+ td = mf_m06l.TDA()
+ assert td.device == 'gpu'
+ es = td.kernel(nstates=5)[0]
+ ref = td.to_cpu().kernel(nstates=5)[0]
+ self.assertAlmostEqual(abs(es - ref[:5]).max(), 0, 8)
+ self.assertAlmostEqual(lib.fp(es), -0.7530329968766932, 6)
+
+ def test_tda_vind(self):
+ mf = self.mf_bp86
+ nocca, noccb = mf.nelec
+ nmo = mf.mo_energy[0].size
+ nvira = nmo - nocca
+ nvirb = nmo - noccb
+ zs = np.random.rand(3,nocca*nvira+noccb*nvirb)
+ ref = mf.to_cpu().TDA().gen_vind()[0](zs)
+ dat = mf.TDA().gen_vind()[0](cp.asarray(zs))
+ self.assertAlmostEqual(abs(ref - dat).max(), 0, 9)
+
+ def test_tddft_vind(self):
+ mf = self.mf_b3lyp
+ nocca, noccb = mf.nelec
+ nmo = mf.mo_energy[0].size
+ nvira = nmo - nocca
+ nvirb = nmo - noccb
+ zs = np.random.rand(3,2,nocca*nvira+noccb*nvirb)
+ ref = mf.to_cpu().TDDFT().gen_vind()[0](zs)
+ dat = mf.TDDFT().gen_vind()[0](cp.asarray(zs))
+ self.assertAlmostEqual(abs(ref - dat).max(), 0, 9)
+
+ def test_casida_tddft_vind(self):
+ mf = self.mf_lda
+ nocca, noccb = mf.nelec
+ nmo = mf.mo_energy[0].size
+ nvira = nmo - nocca
+ nvirb = nmo - noccb
+ zs = np.random.rand(3,nocca*nvira+noccb*nvirb)
+ ref = mf.to_cpu().CasidaTDDFT().gen_vind()[0](zs)
+ dat = mf.CasidaTDDFT().gen_vind()[0](cp.asarray(zs))
+ self.assertAlmostEqual(abs(ref - dat).max(), 0, 9)
+
+if __name__ == "__main__":
+ print("Full Tests for TD-UKS")
+ unittest.main()
diff --git a/gpu4pyscf/tdscf/uhf.py b/gpu4pyscf/tdscf/uhf.py
new file mode 100644
index 00000000..27cc0850
--- /dev/null
+++ b/gpu4pyscf/tdscf/uhf.py
@@ -0,0 +1,785 @@
+#!/usr/bin/env python
+#
+# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+
+import numpy as np
+import cupy as cp
+from pyscf import lib
+from pyscf.tdscf import uhf as tdhf_cpu
+from pyscf.data.nist import HARTREE2EV, HARTREE2WAVENUMBER
+from pyscf.tdscf._lr_eig import eigh as lr_eigh, eig as lr_eig
+from gpu4pyscf import scf
+from gpu4pyscf.lib import logger
+from gpu4pyscf.lib.cupy_helper import contract, tag_array
+from gpu4pyscf.tdscf._uhf_resp_sf import gen_uhf_response_sf
+from gpu4pyscf.tdscf import rhf as tdhf_gpu
+from gpu4pyscf.dft import KohnShamDFT
+from pyscf import __config__
+
+__all__ = [
+ 'TDA', 'CIS', 'TDHF', 'TDUHF', 'TDBase'
+]
+
+REAL_EIG_THRESHOLD = tdhf_cpu.REAL_EIG_THRESHOLD
+
+def gen_tda_operation(mf, fock_ao=None, wfnsym=None):
+ '''A x
+ '''
+ assert fock_ao is None
+ assert isinstance(mf, scf.hf.SCF)
+ assert wfnsym is None
+ if isinstance(mf.mo_coeff, (tuple, list)):
+ # The to_gpu() in pyscf is not able to convert SymAdaptedUHF.mo_coeff.
+ # In this case, mf.mo_coeff has the type (NPArrayWithTag, NPArrayWithTag).
+ # cp.asarray() for this object leads to an error in
+ # cupy._core.core._array_from_nested_sequence
+ mo_coeff = cp.asarray(mf.mo_coeff[0]), cp.asarray(mf.mo_coeff[1])
+ else:
+ mo_coeff = cp.asarray(mf.mo_coeff)
+ assert mo_coeff[0].dtype == cp.float64
+ mo_energy = cp.asarray(mf.mo_energy)
+ mo_occ = cp.asarray(mf.mo_occ)
+ nao, nmo = mo_coeff[0].shape
+ occidxa = mo_occ[0] > 0
+ occidxb = mo_occ[1] > 0
+ viridxa = mo_occ[0] == 0
+ viridxb = mo_occ[1] == 0
+ orboa = mo_coeff[0][:,occidxa]
+ orbob = mo_coeff[1][:,occidxb]
+ orbva = mo_coeff[0][:,viridxa]
+ orbvb = mo_coeff[1][:,viridxb]
+
+ e_ia_a = mo_energy[0][viridxa] - mo_energy[0][occidxa,None]
+ e_ia_b = mo_energy[1][viridxb] - mo_energy[1][occidxb,None]
+ e_ia = cp.hstack((e_ia_a.reshape(-1), e_ia_b.reshape(-1)))
+ hdiag = e_ia.get()
+ nocca, nvira = e_ia_a.shape
+ noccb, nvirb = e_ia_b.shape
+
+ vresp = mf.gen_response(hermi=0)
+
+ def vind(zs):
+ nz = len(zs)
+ zs = cp.asarray(zs)
+ za = zs[:,:nocca*nvira].reshape(nz,nocca,nvira)
+ zb = zs[:,nocca*nvira:].reshape(nz,noccb,nvirb)
+ mo1a = contract('xov,pv->xpo', za, orbva)
+ dmsa = contract('xpo,qo->xpq', mo1a, orboa.conj())
+ mo1b = contract('xov,pv->xpo', zb, orbvb)
+ dmsb = contract('xpo,qo->xpq', mo1b, orbob.conj())
+ dms = cp.asarray((dmsa, dmsb))
+ dms = tag_array(dms, mo1=[mo1a,mo1b], occ_coeff=[orboa,orbob])
+ v1ao = vresp(dms)
+ v1a = contract('xpq,qo->xpo', v1ao[0], orboa)
+ v1a = contract('xpo,pv->xov', v1a, orbva.conj())
+ v1b = contract('xpq,qo->xpo', v1ao[1], orbob)
+ v1b = contract('xpo,pv->xov', v1b, orbvb.conj())
+ v1a += za * e_ia_a
+ v1b += zb * e_ia_b
+ hx = cp.hstack((v1a.reshape(nz,-1), v1b.reshape(nz,-1)))
+ return hx.get()
+
+ return vind, hdiag
+
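+# How the (vind, hdiag) pair above is consumed (an illustrative sketch;
+# TDA.kernel below does the same with full bookkeeping and convergence checks):
+#
+#     vind, hdiag = gen_tda_operation(mf)
+#     x0 = ...  # unit vectors on the smallest orbital-energy gaps in hdiag
+#     converged, e, x1 = lr_eigh(vind, x0, precond, nroots=nstates)
+#
+# vind maps a batch of flattened (occ x vir) amplitude vectors to A.x, while
+# hdiag supplies both the preconditioner and the initial-guess selection.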
+
+class TDBase(tdhf_gpu.TDBase):
+ def _contract_multipole(tdobj, ints, hermi=True, xy=None):
+ if xy is None: xy = tdobj.xy
+ mo_coeff = tdobj._scf.mo_coeff
+ mo_occ = tdobj._scf.mo_occ
+ orbo_a = mo_coeff[0][:,mo_occ[0]==1]
+ orbv_a = mo_coeff[0][:,mo_occ[0]==0]
+ orbo_b = mo_coeff[1][:,mo_occ[1]==1]
+ orbv_b = mo_coeff[1][:,mo_occ[1]==0]
+ if isinstance(orbo_a, cp.ndarray):
+ orbo_a = orbo_a.get()
+ orbv_a = orbv_a.get()
+ orbo_b = orbo_b.get()
+ orbv_b = orbv_b.get()
+
+ ints_a = np.einsum('...pq,pi,qj->...ij', ints, orbo_a.conj(), orbv_a)
+ ints_b = np.einsum('...pq,pi,qj->...ij', ints, orbo_b.conj(), orbv_b)
+ pol = [(np.einsum('...ij,ij->...', ints_a, x[0]) +
+ np.einsum('...ij,ij->...', ints_b, x[1])) for x,y in xy]
+ pol = np.array(pol)
+ y = xy[0][1]
+ if isinstance(y[0], np.ndarray):
+ pol_y = [(np.einsum('...ij,ij->...', ints_a, y[0]) +
+ np.einsum('...ij,ij->...', ints_b, y[1])) for x,y in xy]
+ if hermi:
+ pol += pol_y
+ else: # anti-Hermitian
+ pol -= pol_y
+ return pol
+
+
+class TDA(TDBase):
+ __doc__ = tdhf_gpu.TDA.__doc__
+
+ singlet = None
+
+ def gen_vind(self, mf=None):
+ '''Generate function to compute Ax'''
+ if mf is None:
+ mf = self._scf
+ return gen_tda_operation(mf)
+
+ def init_guess(self, mf=None, nstates=None, wfnsym=None, return_symmetry=False):
+ if mf is None: mf = self._scf
+ if nstates is None: nstates = self.nstates
+ assert wfnsym is None
+ assert not return_symmetry
+
+ mo_energy_a, mo_energy_b = mf.mo_energy
+ mo_occ_a, mo_occ_b = mf.mo_occ
+ if isinstance(mo_energy_a, cp.ndarray):
+ mo_energy_a = mo_energy_a.get()
+ mo_energy_b = mo_energy_b.get()
+ if isinstance(mo_occ_a, cp.ndarray):
+ mo_occ_a = mo_occ_a.get()
+ mo_occ_b = mo_occ_b.get()
+ occidxa = mo_occ_a > 0
+ occidxb = mo_occ_b > 0
+ viridxa = mo_occ_a == 0
+ viridxb = mo_occ_b == 0
+ e_ia_a = mo_energy_a[viridxa] - mo_energy_a[occidxa,None]
+ e_ia_b = mo_energy_b[viridxb] - mo_energy_b[occidxb,None]
+ nov = e_ia_a.size + e_ia_b.size
+ nstates = min(nstates, nov)
+
+ e_ia = np.append(e_ia_a.ravel(), e_ia_b.ravel())
+ # Find the nstates-th lowest energy gap
+ e_threshold = np.partition(e_ia, nstates-1)[nstates-1]
+ e_threshold += self.deg_eia_thresh
+
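+ # Include every gap within deg_eia_thresh of the cutoff so that all
+ # members of a near-degenerate set enter the initial subspace together.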
+ idx = np.where(e_ia <= e_threshold)[0]
+ x0 = np.zeros((idx.size, nov))
+ for i, j in enumerate(idx):
+ x0[i, j] = 1
+ return x0
+
+ def kernel(self, x0=None, nstates=None):
+ '''TDA diagonalization solver
+ '''
+ log = logger.new_logger(self)
+ cpu0 = (logger.process_clock(), logger.perf_counter())
+ self.check_sanity()
+ self.dump_flags()
+ if nstates is None:
+ nstates = self.nstates
+ else:
+ self.nstates = nstates
+
+ vind, hdiag = self.gen_vind(self._scf)
+ precond = self.get_precond(hdiag)
+
+ def pickeig(w, v, nroots, envs):
+ idx = np.where(w > self.positive_eig_threshold)[0]
+ return w[idx], v[:,idx], idx
+
+ x0sym = None
+ if x0 is None:
+ x0 = self.init_guess()
+
+ self.converged, self.e, x1 = lr_eigh(
+ vind, x0, precond, tol_residual=self.conv_tol, lindep=self.lindep,
+ nroots=nstates, x0sym=x0sym, pick=pickeig, max_cycle=self.max_cycle,
+ max_memory=self.max_memory, verbose=log)
+
+ nmo = self._scf.mo_occ[0].size
+ nocca, noccb = self._scf.nelec
+ nvira = nmo - nocca
+ nvirb = nmo - noccb
+ self.xy = [((xi[:nocca*nvira].reshape(nocca,nvira), # X_alpha
+ xi[nocca*nvira:].reshape(noccb,nvirb)), # X_beta
+ (0, 0)) # (Y_alpha, Y_beta)
+ for xi in x1]
+
+ log.timer('TDA', *cpu0)
+ self._finalize()
+ return self.e, self.xy
+
+CIS = TDA
+
+class SpinFlipTDA(TDBase):
+ '''
+ Attributes:
+ extype : int (0 or 1)
+ Spin flip up: extype=0. Spin flip down: extype=1.
+ collinear : str
+ collinear schemes, can be
+ 'col': collinear, by default
+ 'ncol': non-collinear
+ 'mcol': multi-collinear
+ collinear_samples : int
+ Integration samples for the multi-collinear treatment
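+
+ Example (an illustrative sketch, not a doctest; assumes a converged
+ open-shell UKS object ``mf``):
+
+ >>> td = mf.SFTDA() # registered on UHF/UKS objects at the end of this module
+ >>> td.extype = 0 # flip-up excitations
+ >>> td.collinear = 'mcol' # multi-collinear xc treatment
+ >>> es = td.kernel(nstates=3)[0]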
+ '''
+
+ extype = getattr(__config__, 'tdscf_uhf_SFTDA_extype', 1)
+ collinear = getattr(__config__, 'tdscf_uhf_SFTDA_collinear', 'col')
+ collinear_samples = getattr(__config__, 'tdscf_uhf_SFTDA_collinear_samples', 200)
+
+ _keys = {'extype', 'collinear', 'collinear_samples'}
+
+ def gen_vind(self):
+ '''Generate function to compute A*x for spin-flip TDDFT case.
+ '''
+ mf = self._scf
+ assert isinstance(mf, scf.hf.SCF)
+ if isinstance(mf.mo_coeff, (tuple, list)):
+ # The to_gpu() in pyscf is not able to convert SymAdaptedUHF.mo_coeff.
+ # In this case, mf.mo_coeff has the type (NPArrayWithTag, NPArrayWithTag).
+ # cp.asarray() for this object leads to an error in
+ # cupy._core.core._array_from_nested_sequence
+ mo_coeff = cp.asarray(mf.mo_coeff[0]), cp.asarray(mf.mo_coeff[1])
+ else:
+ mo_coeff = cp.asarray(mf.mo_coeff)
+ assert mo_coeff[0].dtype == cp.float64
+ mo_energy = cp.asarray(mf.mo_energy)
+ mo_occ = cp.asarray(mf.mo_occ)
+ nao, nmo = mo_coeff[0].shape
+
+ extype = self.extype
+ if extype == 0:
+ occidxb = mo_occ[1] > 0
+ viridxa = mo_occ[0] == 0
+ orbob = mo_coeff[1][:,occidxb]
+ orbva = mo_coeff[0][:,viridxa]
+ orbov = (orbob, orbva)
+ e_ia = mo_energy[0][viridxa] - mo_energy[1][occidxb,None]
+ hdiag = e_ia.ravel().get()
+
+ elif extype == 1:
+ occidxa = mo_occ[0] > 0
+ viridxb = mo_occ[1] == 0
+ orboa = mo_coeff[0][:,occidxa]
+ orbvb = mo_coeff[1][:,viridxb]
+ orbov = (orboa, orbvb)
+ e_ia = mo_energy[1][viridxb] - mo_energy[0][occidxa,None]
+ hdiag = e_ia.ravel().get()
+
+ vresp = gen_uhf_response_sf(
+ mf, hermi=0, collinear=self.collinear,
+ collinear_samples=self.collinear_samples)
+
+ def vind(zs):
+ zs = cp.asarray(zs).reshape(-1, *e_ia.shape)
+ orbo, orbv = orbov
+ mo1 = contract('xov,pv->xpo', zs, orbv)
+ dms = contract('xpo,qo->xpq', mo1, orbo.conj())
+ dms = tag_array(dms, mo1=mo1, occ_coeff=orbo)
+ v1ao = vresp(dms)
+ v1mo = contract('xpq,qo->xpo', v1ao, orbo)
+ v1mo = contract('xpo,pv->xov', v1mo, orbv.conj())
+ v1mo += zs * e_ia
+ return v1mo.reshape(len(v1mo), -1).get()
+
+ return vind, hdiag
+
+ def _init_guess(self, mf, nstates):
+ mo_energy_a, mo_energy_b = mf.mo_energy
+ mo_occ_a, mo_occ_b = mf.mo_occ
+ if isinstance(mo_energy_a, cp.ndarray):
+ mo_energy_a = mo_energy_a.get()
+ mo_energy_b = mo_energy_b.get()
+ if isinstance(mo_occ_a, cp.ndarray):
+ mo_occ_a = mo_occ_a.get()
+ mo_occ_b = mo_occ_b.get()
+
+ if self.extype == 0:
+ occidxb = mo_occ_b > 0
+ viridxa = mo_occ_a == 0
+ e_ia = mo_energy_a[viridxa] - mo_energy_b[occidxb,None]
+
+ elif self.extype == 1:
+ occidxa = mo_occ_a > 0
+ viridxb = mo_occ_b == 0
+ e_ia = mo_energy_b[viridxb] - mo_energy_a[occidxa,None]
+
+ e_ia = e_ia.ravel()
+ nov = e_ia.size
+ nstates = min(nstates, nov)
+ e_threshold = np.partition(e_ia, nstates-1)[nstates-1]
+ idx = np.where(e_ia <= e_threshold)[0]
+ nstates = idx.size
+ e = e_ia[idx]
+ idx = idx[np.argsort(e)]
+ x0 = np.zeros((nstates, nov))
+ for i, j in enumerate(idx):
+ x0[i, j] = 1
+ return np.sort(e), x0.reshape(nstates, *e_ia.shape)
+
+ def init_guess(self, mf=None, nstates=None, wfnsym=None):
+ if mf is None: mf = self._scf
+ if nstates is None: nstates = self.nstates
+ x0 = self._init_guess(mf, nstates)[1]
+ return x0.reshape(len(x0), -1)
+
+ def dump_flags(self, verbose=None):
+ TDBase.dump_flags(self, verbose)
+ logger.info(self, 'extype = %s', self.extype)
+ logger.info(self, 'collinear = %s', self.collinear)
+ if self.collinear == 'mcol':
+ logger.info(self, 'collinear_samples = %s', self.collinear_samples)
+ return self
+
+ def check_sanity(self):
+ TDBase.check_sanity(self)
+ assert self.extype in (0, 1)
+ assert self.collinear in ('col', 'ncol', 'mcol')
+ return self
+
+ def kernel(self, x0=None, nstates=None):
+ '''Spin-flip TDA diagonalization solver
+ '''
+ log = logger.new_logger(self)
+ cpu0 = log.init_timer()
+ self.check_sanity()
+ self.dump_flags()
+ if nstates is None:
+ nstates = self.nstates
+ else:
+ self.nstates = nstates
+
+ if self.collinear == 'col' and isinstance(self._scf, KohnShamDFT):
+ mf = self._scf
+ ni = mf._numint
+ if not ni.libxc.is_hybrid_xc(mf.xc):
+ self.converged = True
+ self.e, xs = self._init_guess(mf, self.nstates)
+ self.xy = [(x, 0) for x in xs]
+ return self.e, self.xy
+
+ x0sym = None
+ if x0 is None:
+ x0 = self.init_guess()
+
+ # Keep all eigenvalues as SF-TDDFT allows triplet to singlet
+ # "dexcitation"
+ def all_eigs(w, v, nroots, envs):
+ return w, v, np.arange(w.size)
+
+ vind, hdiag = self.gen_vind()
+ precond = self.get_precond(hdiag)
+
+ self.converged, self.e, x1 = lr_eigh(
+ vind, x0, precond, tol_residual=self.conv_tol, lindep=self.lindep,
+ nroots=nstates, x0sym=x0sym, pick=all_eigs, max_cycle=self.max_cycle,
+ max_memory=self.max_memory, verbose=log)
+
+ nmo = self._scf.mo_occ[0].size
+ nocca, noccb = self._scf.nelec
+ nvira = nmo - nocca
+ nvirb = nmo - noccb
+
+ if self.extype == 0:
+ self.xy = [(xi.reshape(noccb,nvira), 0) for xi in x1]
+ elif self.extype == 1:
+ self.xy = [(xi.reshape(nocca,nvirb), 0) for xi in x1]
+ log.timer('SpinFlipTDA', *cpu0)
+ self._finalize()
+ return self.e, self.xy
+
+
+def gen_tdhf_operation(mf, fock_ao=None, singlet=True, wfnsym=None):
+ '''Generate function to compute
+
+ [ A B ][X]
+ [-B* -A*][Y]
+ '''
+ assert fock_ao is None
+ assert isinstance(mf, scf.hf.SCF)
+ if isinstance(mf.mo_coeff, (tuple, list)):
+ # The to_gpu() in pyscf is not able to convert SymAdaptedUHF.mo_coeff.
+ # In this case, mf.mo_coeff has the type (NPArrayWithTag, NPArrayWithTag).
+ # cp.asarray() for this object leads to an error in
+ # cupy._core.core._array_from_nested_sequence
+ mo_coeff = cp.asarray(mf.mo_coeff[0]), cp.asarray(mf.mo_coeff[1])
+ else:
+ mo_coeff = cp.asarray(mf.mo_coeff)
+ assert mo_coeff[0].dtype == cp.float64
+ mo_energy = cp.asarray(mf.mo_energy)
+ mo_occ = cp.asarray(mf.mo_occ)
+ occidxa = mo_occ[0] > 0
+ occidxb = mo_occ[1] > 0
+ viridxa = mo_occ[0] == 0
+ viridxb = mo_occ[1] == 0
+ orboa = mo_coeff[0][:,occidxa]
+ orbob = mo_coeff[1][:,occidxb]
+ orbva = mo_coeff[0][:,viridxa]
+ orbvb = mo_coeff[1][:,viridxb]
+
+ e_ia_a = mo_energy[0][viridxa] - mo_energy[0][occidxa,None]
+ e_ia_b = mo_energy[1][viridxb] - mo_energy[1][occidxb,None]
+ e_ia = hdiag = cp.hstack((e_ia_a.ravel(), e_ia_b.ravel()))
+ hdiag = cp.hstack((hdiag, -hdiag)).get()
+ nocca, nvira = e_ia_a.shape
+ noccb, nvirb = e_ia_b.shape
+
+ vresp = mf.gen_response(hermi=0)
+
+ def vind(xys):
+ nz = len(xys)
+ xys = cp.asarray(xys).reshape(nz,2,-1)
+ xs, ys = xys.transpose(1,0,2)
+ xa = xs[:,:nocca*nvira].reshape(nz,nocca,nvira)
+ xb = xs[:,nocca*nvira:].reshape(nz,noccb,nvirb)
+ ya = ys[:,:nocca*nvira].reshape(nz,nocca,nvira)
+ yb = ys[:,nocca*nvira:].reshape(nz,noccb,nvirb)
+ tmp = contract('xov,pv->xpo', xa, orbva)
+ dmsa = contract('xpo,qo->xpq', tmp, orboa.conj())
+ tmp = contract('xov,pv->xpo', xb, orbvb)
+ dmsb = contract('xpo,qo->xpq', tmp, orbob.conj())
+ tmp = contract('xov,qv->xoq', ya, orbva.conj())
+ dmsa+= contract('xoq,po->xpq', tmp, orboa)
+ tmp = contract('xov,qv->xoq', yb, orbvb.conj())
+ dmsb+= contract('xoq,po->xpq', tmp, orbob)
+ v1ao = vresp(cp.asarray((dmsa,dmsb)))
+ v1a_top = contract('xpq,qo->xpo', v1ao[0], orboa)
+ v1a_top = contract('xpo,pv->xov', v1a_top, orbva.conj())
+ v1b_top = contract('xpq,qo->xpo', v1ao[1], orbob)
+ v1b_top = contract('xpo,pv->xov', v1b_top, orbvb.conj())
+ v1a_bot = contract('xpq,po->xoq', v1ao[0], orboa.conj())
+ v1a_bot = contract('xoq,qv->xov', v1a_bot, orbva)
+ v1b_bot = contract('xpq,po->xoq', v1ao[1], orbob.conj())
+ v1b_bot = contract('xoq,qv->xov', v1b_bot, orbvb)
+
+ v1_top = xs * e_ia
+ v1_bot = ys * e_ia
+ v1_top[:,:nocca*nvira] += v1a_top.reshape(nz,-1)
+ v1_bot[:,:nocca*nvira] += v1a_bot.reshape(nz,-1)
+ v1_top[:,nocca*nvira:] += v1b_top.reshape(nz,-1)
+ v1_bot[:,nocca*nvira:] += v1b_bot.reshape(nz,-1)
+ hx = cp.hstack((v1_top, -v1_bot))
+ return hx.get()
+
+ return vind, hdiag
+
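+# Packing note for vind above: each trial vector is the concatenation
+# [X_alpha, X_beta, Y_alpha, Y_beta] flattened to length
+# 2*(nocca*nvira + noccb*nvirb). The first half of the returned product is
+# A.X + B.Y and the second half is -(B*.X + A*.Y), matching the matrix
+# structure in the docstring and the non-Hermitian eigenvalue problem
+# solved by lr_eig in TDHF.kernel below.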
+
+class TDHF(TDBase):
+
+ singlet = None
+
+ @lib.with_doc(gen_tdhf_operation.__doc__)
+ def gen_vind(self, mf=None):
+ if mf is None:
+ mf = self._scf
+ return gen_tdhf_operation(mf, singlet=self.singlet)
+
+ def init_guess(self, mf=None, nstates=None, wfnsym=None, return_symmetry=False):
+ x0 = TDA.init_guess(self, mf, nstates, wfnsym, return_symmetry)
+ y0 = np.zeros_like(x0)
+ return np.hstack([x0, y0])
+
+ def kernel(self, x0=None, nstates=None):
+ '''TDHF diagonalization with non-Hermitian eigenvalue solver
+ '''
+ log = logger.new_logger(self)
+ cpu0 = log.init_timer()
+ self.check_sanity()
+ self.dump_flags()
+ if nstates is None:
+ nstates = self.nstates
+ else:
+ self.nstates = nstates
+
+ vind, hdiag = self.gen_vind(self._scf)
+ precond = self.get_precond(hdiag)
+
+ # handle single kpt PBC SCF
+ if getattr(self._scf, 'kpt', None) is not None:
+ from pyscf.pbc.lib.kpts_helper import gamma_point
+ real_system = (gamma_point(self._scf.kpt) and
+ self._scf.mo_coeff[0].dtype == np.double)
+ else:
+ real_system = True
+
+ # We only need positive eigenvalues
+ def pickeig(w, v, nroots, envs):
+ realidx = np.where((abs(w.imag) < REAL_EIG_THRESHOLD) &
+ (w.real > self.positive_eig_threshold))[0]
+ return lib.linalg_helper._eigs_cmplx2real(w, v, realidx, real_system)
+
+ x0sym = None
+ if x0 is None:
+ x0 = self.init_guess()
+
+ self.converged, w, x1 = lr_eig(
+ vind, x0, precond, tol_residual=self.conv_tol, lindep=self.lindep,
+ nroots=nstates, x0sym=x0sym, pick=pickeig, max_cycle=self.max_cycle,
+ max_memory=self.max_memory, verbose=log)
+
+ nmo = self._scf.mo_occ[0].size
+ nocca, noccb = self._scf.nelec
+ nvira = nmo - nocca
+ nvirb = nmo - noccb
+ e = []
+ xy = []
+ for i, z in enumerate(x1):
+ x, y = z.reshape(2,-1)
+ norm = lib.norm(x)**2 - lib.norm(y)**2
+ if norm > 0:
+ norm = norm**-.5
+ e.append(w[i])
+ xy.append(((x[:nocca*nvira].reshape(nocca,nvira) * norm, # X_alpha
+ x[nocca*nvira:].reshape(noccb,nvirb) * norm), # X_beta
+ (y[:nocca*nvira].reshape(nocca,nvira) * norm, # Y_alpha
+ y[nocca*nvira:].reshape(noccb,nvirb) * norm)))# Y_beta
+ self.e = np.array(e)
+ self.xy = xy
+
+ log.timer('TDDFT', *cpu0)
+ self._finalize()
+ return self.e, self.xy
+
+TDUHF = TDHF
+
+class SpinFlipTDHF(TDBase):
+
+ extype = SpinFlipTDA.extype
+ collinear = SpinFlipTDA.collinear
+ collinear_samples = SpinFlipTDA.collinear_samples
+
+ _keys = {'extype', 'collinear', 'collinear_samples'}
+
+ def gen_vind(self):
+ '''Generate function to compute A*x for spin-flip TDDFT case.
+ '''
+ mf = self._scf
+ assert isinstance(mf, scf.hf.SCF)
+ if isinstance(mf.mo_coeff, (tuple, list)):
+ # The to_gpu() in pyscf is not able to convert SymAdaptedUHF.mo_coeff.
+ # In this case, mf.mo_coeff has the type (NPArrayWithTag, NPArrayWithTag).
+ # cp.asarray() for this object leads to an error in
+ # cupy._core.core._array_from_nested_sequence
+ mo_coeff = cp.asarray(mf.mo_coeff[0]), cp.asarray(mf.mo_coeff[1])
+ else:
+ mo_coeff = cp.asarray(mf.mo_coeff)
+ assert mo_coeff[0].dtype == cp.float64
+ mo_energy = cp.asarray(mf.mo_energy)
+ mo_occ = cp.asarray(mf.mo_occ)
+ nao, nmo = mo_coeff[0].shape
+
+ occidxa = mo_occ[0] > 0
+ occidxb = mo_occ[1] > 0
+ viridxa = mo_occ[0] == 0
+ viridxb = mo_occ[1] == 0
+ orboa = mo_coeff[0][:,occidxa]
+ orbob = mo_coeff[1][:,occidxb]
+ orbva = mo_coeff[0][:,viridxa]
+ orbvb = mo_coeff[1][:,viridxb]
+ e_ia_b2a = mo_energy[0][viridxa] - mo_energy[1][occidxb,None]
+ e_ia_a2b = mo_energy[1][viridxb] - mo_energy[0][occidxa,None]
+ nocca, nvirb = e_ia_a2b.shape
+ noccb, nvira = e_ia_b2a.shape
+
+ extype = self.extype
+ if extype == 0:
+ hdiag = cp.hstack([e_ia_b2a.ravel(), -e_ia_a2b.ravel()]).get()
+ else:
+ hdiag = cp.hstack([e_ia_a2b.ravel(), -e_ia_b2a.ravel()]).get()
+
+ vresp = gen_uhf_response_sf(
+ mf, hermi=0, collinear=self.collinear,
+ collinear_samples=self.collinear_samples)
+
+ def vind(zs):
+ nz = len(zs)
+ zs = cp.asarray(zs).reshape(nz, -1)
+ if extype == 0:
+ zs_b2a = zs[:,:noccb*nvira].reshape(nz,noccb,nvira)
+ zs_a2b = zs[:,noccb*nvira:].reshape(nz,nocca,nvirb)
+ dm_b2a = contract('xov,pv->xpo', zs_b2a, orbva)
+ dm_b2a = contract('xpo,qo->xpq', dm_b2a, orbob.conj())
+ dm_a2b = contract('xov,qv->xoq', zs_a2b, orbvb.conj())
+ dm_a2b = contract('xoq,po->xpq', dm_a2b, orboa)
+ else:
+ zs_a2b = zs[:,:nocca*nvirb].reshape(nz,nocca,nvirb)
+ zs_b2a = zs[:,nocca*nvirb:].reshape(nz,noccb,nvira)
+ dm_b2a = contract('xov,pv->xpo', zs_b2a, orbva)
+ dm_b2a = contract('xpo,qo->xpq', dm_b2a, orbob.conj())
+ dm_a2b = contract('xov,qv->xoq', zs_a2b, orbvb.conj())
+ dm_a2b = contract('xoq,po->xpq', dm_a2b, orboa)
+
+ '''
+ # The slow way to compute individual terms in
+ # [A B] [X]
+ # [B* A*] [Y]
+ dms = cp.vstack([dm_b2a, dm_a2b])
+ v1ao = vresp(dms)
+ v1ao_b2a, v1ao_a2b = v1ao[:nz], v1ao[nz:]
+ if extype == 0:
+ # A*X = (aI||Jb) * z_b2a = -(ab|IJ) * z_b2a
+ v1A_b2a = contract('xpq,qo->xpo', v1ao_b2a, orbob)
+ v1A_b2a = contract('xpo,pv->xov', v1A_b2a, orbva.conj())
+ # (A*)*Y = (iA||Bj) * z_a2b = -(ij|BA) * z_a2b
+ v1A_a2b = contract('xpq,po->xoq', v1ao_a2b, orboa.conj())
+ v1A_a2b = contract('xoq,qv->xov', v1A_a2b, orbvb)
+ # B*Y = (aI||Bj) * z_a2b = -(aj|BI) * z_a2b
+ v1B_b2a = contract('xpq,qo->xpo', v1ao_a2b, orbob)
+ v1B_b2a = contract('xpo,pv->xov', v1B_b2a, orbva.conj())
+ # (B*)*X = (iA||Jb) * z_b2a = -(ib|JA) * z_b2a
+ v1B_a2b = contract('xpq,po->xoq', v1ao_b2a, orboa.conj())
+ v1B_a2b = contract('xoq,qv->xov', v1B_a2b, orbvb)
+ # add the orbital energy difference in A matrix.
+ v1_top = v1A_b2a + v1B_b2a + zs_b2a * e_ia_b2a
+ v1_bot = v1B_a2b + v1A_a2b + zs_a2b * e_ia_a2b
+ hx = cp.hstack([v1_top.reshape(nz,-1), -v1_bot.reshape(nz,-1)])
+ else:
+ # A*X = (Ai||jB) * z_a2b = -(AB|ij) * z_a2b
+ v1A_a2b = contract('xpq,qo->xpo', v1ao_a2b, orboa)
+ v1A_a2b = contract('xpo,pv->xov', v1A_a2b, orbvb.conj())
+ # (A*)*Y = (Ia||bJ) * z_b2a = -(IJ|ba) * z_b2a
+ v1A_b2a = contract('xpq,po->xoq', v1ao_b2a, orbob.conj())
+ v1A_b2a = contract('xoq,qv->xov', v1A_b2a, orbva)
+ # B*Y = (Ai||bJ) * z_b2a = -(AJ|bi) * z_b2a
+ v1B_a2b = contract('xpq,qo->xpo', v1ao_b2a, orboa)
+ v1B_a2b = contract('xpo,pv->xov', v1B_a2b, orbvb.conj())
+ # (B*)*X = (Ia||jB) * z_a2b = -(IB|ja) * z_a2b
+ v1B_b2a = contract('xpq,po->xoq', v1ao_a2b, orbob.conj())
+ v1B_b2a = contract('xoq,qv->xov', v1B_b2a, orbva)
+ # add the orbital energy difference in A matrix.
+ v1_top = v1A_a2b + v1B_a2b + zs_a2b * e_ia_a2b
+ v1_bot = v1B_b2a + v1A_b2a + zs_b2a * e_ia_b2a
+ hx = cp.hstack([v1_top.reshape(nz,-1), -v1_bot.reshape(nz,-1)])
+ '''
+
+ # [A B] [X]
+ # [B* A*] [Y]
+ # is simplified to
+ dms = dm_b2a + dm_a2b
+ v1ao = vresp(dms)
+ if extype == 0:
+ # v1_top = A*X+B*Y
+ # A*X = (aI||Jb) * z_b2a = -(ab|JI) * z_b2a
+ # B*Y = (aI||Bj) * z_a2b = -(aj|BI) * z_a2b
+ v1_top = contract('xpq,qo->xpo', v1ao, orbob)
+ v1_top = contract('xpo,pv->xov', v1_top, orbva.conj())
+ # (A*)*Y = (iA||Bj) * z_a2b = -(ij|BA) * z_a2b
+ # (B*)*X = (iA||Jb) * z_b2a = -(ib|JA) * z_b2a
+ # v1_bot = (B*)*X + (A*)*Y
+ v1_bot = contract('xpq,po->xoq', v1ao, orboa.conj())
+ v1_bot = contract('xoq,qv->xov', v1_bot, orbvb)
+ # add the orbital energy difference in A matrix.
+ v1_top += zs_b2a * e_ia_b2a
+ v1_bot += zs_a2b * e_ia_a2b
+ else:
+ # v1_top = A*X+B*Y
+ # A*X = (Ai||jB) * z_a2b = -(AB|ji) * z_a2b
+ # B*Y = (Ai||bJ) * z_b2a = -(AJ|bi) * z_b2a
+ v1_top = contract('xpq,qo->xpo', v1ao, orboa)
+ v1_top = contract('xpo,pv->xov', v1_top, orbvb.conj())
+ # v1_bot = (B*)*X + (A*)*Y
+ # (A*)*Y = (Ia||bJ) * z_b2a = -(IJ|ba) * z_b2a
+ # (B*)*X = (Ia||jB) * z_a2b = -(IB|ja) * z_a2b
+ v1_bot = contract('xpq,po->xoq', v1ao, orbob.conj())
+ v1_bot = contract('xoq,qv->xov', v1_bot, orbva)
+ # add the orbital energy difference in A matrix.
+ v1_top += zs_a2b * e_ia_a2b
+ v1_bot += zs_b2a * e_ia_b2a
+ hx = cp.hstack([v1_top.reshape(nz,-1), -v1_bot.reshape(nz,-1)])
+ return hx.get()
+
+ return vind, hdiag
+
+ _init_guess = SpinFlipTDA._init_guess
+
+ def init_guess(self, mf=None, nstates=None, wfnsym=None):
+ if mf is None: mf = self._scf
+ if nstates is None: nstates = self.nstates
+ x0 = self._init_guess(mf, nstates)[1]
+ nx = len(x0)
+ nmo = mf.mo_occ[0].size
+ nocca, noccb = mf.nelec
+ nvira = nmo - nocca
+ nvirb = nmo - noccb
+ if self.extype == 0:
+ y0 = np.zeros((nx, nocca*nvirb))
+ else:
+ y0 = np.zeros((nx, noccb*nvira))
+ return np.hstack([x0.reshape(nx,-1), y0])
+
+ dump_flags = SpinFlipTDA.dump_flags
+ check_sanity = SpinFlipTDA.check_sanity
+
+ def kernel(self, x0=None, nstates=None):
+ '''Spin-flip TDA diagonalization solver
+ '''
+ # TODO: Enable this feature after updating the TDDFT davidson algorithm
+ # in pyscf main branch
+ raise RuntimeError('Numerical issues in lr_eig')
+ log = logger.new_logger(self)
+ cpu0 = log.init_timer()
+ self.check_sanity()
+ self.dump_flags()
+ if nstates is None:
+ nstates = self.nstates
+ else:
+ self.nstates = nstates
+
+ if self.collinear == 'col' and isinstance(self._scf, KohnShamDFT):
+ raise NotImplementedError
+
+ x0sym = None
+ if x0 is None:
+ x0 = self.init_guess()
+
+ real_system = self._scf.mo_coeff[0].dtype == np.float64
+ def pickeig(w, v, nroots, envs):
+ realidx = np.where((abs(w.imag) < REAL_EIG_THRESHOLD) &
+ (w.real > self.positive_eig_threshold))[0]
+ return lib.linalg_helper._eigs_cmplx2real(w, v, realidx, real_system)
+
+ vind, hdiag = self.gen_vind()
+ precond = self.get_precond(hdiag)
+
+ self.converged, self.e, x1 = lr_eig(
+ vind, x0, precond, tol_residual=self.conv_tol, lindep=self.lindep,
+ nroots=nstates, x0sym=x0sym, pick=pickeig, max_cycle=self.max_cycle,
+ max_memory=self.max_memory, verbose=log)
+
+ nmo = self._scf.mo_occ[0].size
+ nocca, noccb = self._scf.nelec
+ nvira = nmo - nocca
+ nvirb = nmo - noccb
+
+ if self.extype == 0:
+ def norm_xy(z):
+ x = z[:noccb*nvira].reshape(noccb,nvira)
+ y = z[noccb*nvira:].reshape(nocca,nvirb)
+ norm = lib.norm(x)**2 - lib.norm(y)**2
+ #assert norm > 0
+ norm = abs(norm) ** -.5
+ return x*norm, y*norm
+ elif self.extype == 1:
+ def norm_xy(z):
+ x = z[:nocca*nvirb].reshape(nocca,nvirb)
+ y = z[nocca*nvirb:].reshape(noccb,nvira)
+ norm = lib.norm(x)**2 - lib.norm(y)**2
+ #assert norm > 0
+ norm = abs(norm) ** -.5
+ return x*norm, y*norm
+
+ self.xy = [norm_xy(z) for z in x1]
+ log.timer('SpinFlipTDDFT', *cpu0)
+ self._finalize()
+ return self.e, self.xy
+
+scf.uhf.UHF.TDA = lib.class_as_method(TDA)
+scf.uhf.UHF.TDHF = lib.class_as_method(TDHF)
+scf.uhf.UHF.SFTDA = lib.class_as_method(SpinFlipTDA)
+scf.uhf.UHF.SFTDHF = lib.class_as_method(SpinFlipTDHF)
diff --git a/gpu4pyscf/tdscf/uks.py b/gpu4pyscf/tdscf/uks.py
new file mode 100644
index 00000000..23646332
--- /dev/null
+++ b/gpu4pyscf/tdscf/uks.py
@@ -0,0 +1,196 @@
+#!/usr/bin/env python
+#
+# Copyright 2024 The GPU4PySCF Developers. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import numpy as np
+import cupy as cp
+from pyscf import symm
+from pyscf import lib
+from pyscf.tdscf._lr_eig import eigh as lr_eigh
+from gpu4pyscf.dft.rks import KohnShamDFT
+from gpu4pyscf.lib.cupy_helper import contract, tag_array, transpose_sum
+from gpu4pyscf.lib import logger
+from gpu4pyscf.tdscf import uhf as tdhf_gpu
+from gpu4pyscf import dft
+
+__all__ = [
+ 'TDA', 'TDDFT', 'TDUKS', 'CasidaTDDFT', 'TDDFTNoHybrid',
+]
+
+TDA = tdhf_gpu.TDA
+TDDFT = tdhf_gpu.TDHF
+TDUKS = TDDFT
+SpinFlipTDA = tdhf_gpu.SpinFlipTDA
+SpinFlipTDDFT = tdhf_gpu.SpinFlipTDHF
+
+class CasidaTDDFT(TDDFT):
+ '''Solve the Casida TDDFT formula (A-B)(A+B)(X+Y) = (X+Y)w^2
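+
+ For functionals without exact exchange, (A-B) is diagonal with the
+ orbital-energy differences e_ia. Substituting Z = (A-B)^(-1/2) (X+Y)
+ and d = e_ia^(1/2) turns the equation into the Hermitian problem
+ d (A+B) d Z = w^2 Z, which is exactly the product gen_vind builds below.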
+ '''
+
+ init_guess = TDA.init_guess
+
+ def gen_vind(self, mf=None):
+ if mf is None:
+ mf = self._scf
+ if isinstance(mf.mo_coeff, (tuple, list)):
+ # The to_gpu() in pyscf is not able to convert SymAdaptedUHF.mo_coeff.
+ # In this case, mf.mo_coeff has the type (NPArrayWithTag, NPArrayWithTag).
+ # cp.asarray() for this object leads to an error in
+ # cupy._core.core._array_from_nested_sequence
+ mo_coeff = cp.asarray(mf.mo_coeff[0]), cp.asarray(mf.mo_coeff[1])
+ else:
+ mo_coeff = cp.asarray(mf.mo_coeff)
+ assert mo_coeff[0].dtype == cp.float64
+ mo_energy = cp.asarray(mf.mo_energy)
+ mo_occ = cp.asarray(mf.mo_occ)
+ occidxa = mo_occ[0] > 0
+ occidxb = mo_occ[1] > 0
+ viridxa = mo_occ[0] == 0
+ viridxb = mo_occ[1] == 0
+ orboa = mo_coeff[0][:,occidxa]
+ orbob = mo_coeff[1][:,occidxb]
+ orbva = mo_coeff[0][:,viridxa]
+ orbvb = mo_coeff[1][:,viridxb]
+
+ e_ia_a = mo_energy[0][viridxa] - mo_energy[0][occidxa,None]
+ e_ia_b = mo_energy[1][viridxb] - mo_energy[1][occidxb,None]
+ e_ia = cp.hstack((e_ia_a.ravel(), e_ia_b.ravel()))
+ d_ia = e_ia**.5
+ ed_ia = e_ia * d_ia
+ hdiag = e_ia ** 2
+ hdiag = hdiag.get()
+ vresp = mf.gen_response(mo_coeff, mo_occ, hermi=1)
+ nocca, nvira = e_ia_a.shape
+ noccb, nvirb = e_ia_b.shape
+
+ def vind(zs):
+ assert zs.dtype == np.float64
+ nz = len(zs)
+ zs = cp.asarray(zs).reshape(nz,-1)
+ dmsa = (zs[:,:nocca*nvira] * d_ia[:nocca*nvira]).reshape(nz,nocca,nvira)
+ dmsb = (zs[:,nocca*nvira:] * d_ia[nocca*nvira:]).reshape(nz,noccb,nvirb)
+ mo1a = contract('xov,pv->xpo', dmsa, orbva)
+ dmsa = contract('xpo,qo->xpq', mo1a, orboa)
+ mo1b = contract('xov,pv->xpo', dmsb, orbvb)
+ dmsb = contract('xpo,qo->xpq', mo1b, orbob)
+ dmsa = transpose_sum(dmsa)
+ dmsb = transpose_sum(dmsb)
+ dms = cp.asarray((dmsa, dmsb))
+ dms = tag_array(dms, mo1=[mo1a,mo1b], occ_coeff=[orboa,orbob])
+ v1ao = vresp(dms)
+ v1a = contract('xpq,qo->xpo', v1ao[0], orboa)
+ v1a = contract('xpo,pv->xov', v1a, orbva)
+ v1b = contract('xpq,qo->xpo', v1ao[1], orbob)
+ v1b = contract('xpo,pv->xov', v1b, orbvb)
+ hx = cp.hstack((v1a.reshape(nz,-1), v1b.reshape(nz,-1)))
+ hx += ed_ia * zs
+ hx *= d_ia
+ return hx.get()
+
+ return vind, hdiag
+
+ def kernel(self, x0=None, nstates=None):
+ '''TDDFT diagonalization solver
+ '''
+ log = logger.new_logger(self)
+ cpu0 = log.init_timer()
+ mf = self._scf
+ if mf._numint.libxc.is_hybrid_xc(mf.xc):
+ raise RuntimeError('%s cannot be used with hybrid functional'
+ % self.__class__)
+ self.check_sanity()
+ self.dump_flags()
+ if nstates is None:
+ nstates = self.nstates
+ else:
+ self.nstates = nstates
+
+ vind, hdiag = self.gen_vind(self._scf)
+ precond = self.get_precond(hdiag)
+
+ def pickeig(w, v, nroots, envs):
+ idx = np.where(w > self.positive_eig_threshold)[0]
+ return w[idx], v[:,idx], idx
+
+ x0sym = None
+ if x0 is None:
+ x0 = self.init_guess()
+
+ self.converged, w2, x1 = lr_eigh(
+ vind, x0, precond, tol_residual=self.conv_tol, lindep=self.lindep,
+ nroots=nstates, x0sym=x0sym, pick=pickeig, max_cycle=self.max_cycle,
+ max_memory=self.max_memory, verbose=log)
+
+ mo_energy = self._scf.mo_energy
+ mo_occ = self._scf.mo_occ
+ occidxa = mo_occ[0] > 0
+ occidxb = mo_occ[1] > 0
+ viridxa = mo_occ[0] == 0
+ viridxb = mo_occ[1] == 0
+ e_ia_a = mo_energy[0][viridxa] - mo_energy[0][occidxa,None]
+ e_ia_b = mo_energy[1][viridxb] - mo_energy[1][occidxb,None]
+ nocca, nvira = e_ia_a.shape
+ noccb, nvirb = e_ia_b.shape
+ if isinstance(mo_energy, cp.ndarray):
+ e_ia = cp.hstack((e_ia_a.reshape(-1), e_ia_b.reshape(-1)))
+ e_ia = e_ia**.5
+ e_ia = e_ia.get()
+ else:
+ e_ia = np.hstack((e_ia_a.reshape(-1), e_ia_b.reshape(-1)))
+ e_ia = e_ia**.5
+
+ e = []
+ xy = []
+ for i, z in enumerate(x1):
+ if w2[i] < self.positive_eig_threshold:
+ continue
+ w = w2[i] ** .5
+ zp = e_ia * z
+ zm = w/e_ia * z
+ x = (zp + zm) * .5
+ y = (zp - zm) * .5
+ norm = lib.norm(x)**2 - lib.norm(y)**2
+ if norm > 0:
+ norm = norm**-.5
+ e.append(w)
+ xy.append(((x[:nocca*nvira].reshape(nocca,nvira) * norm, # X_alpha
+ x[nocca*nvira:].reshape(noccb,nvirb) * norm), # X_beta
+ (y[:nocca*nvira].reshape(nocca,nvira) * norm, # Y_alpha
+ y[nocca*nvira:].reshape(noccb,nvirb) * norm)))# Y_beta
+ self.e = np.array(e)
+ self.xy = xy
+
+ log.timer('TDDFT', *cpu0)
+ self._finalize()
+ return self.e, self.xy
+
+TDDFTNoHybrid = CasidaTDDFT
+
+def tddft(mf):
+ '''Driver to create TDDFT or CasidaTDDFT object'''
+ if mf._numint.libxc.is_hybrid_xc(mf.xc):
+ return TDDFT(mf)
+ else:
+ return CasidaTDDFT(mf)
+
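+# Usage sketch (illustrative; assumes a converged UKS object `mf`): with the
+# registrations below, `mf.TDDFT()` dispatches through this driver, returning
+# CasidaTDDFT for pure functionals and the full TDDFT object otherwise:
+#
+#     td = mf.TDDFT()
+#     es = td.kernel(nstates=5)[0]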
+dft.uks.UKS.TDA = lib.class_as_method(TDA)
+dft.uks.UKS.TDHF = None
+#dft.uks.UKS.TDDFT = lib.class_as_method(TDDFT)
+dft.uks.UKS.TDDFTNoHybrid = lib.class_as_method(TDDFTNoHybrid)
+dft.uks.UKS.CasidaTDDFT = lib.class_as_method(CasidaTDDFT)
+dft.uks.UKS.TDDFT = tddft
+dft.uks.UKS.SFTDA = lib.class_as_method(SpinFlipTDA)
+dft.uks.UKS.SFTDDFT = lib.class_as_method(SpinFlipTDDFT)
diff --git a/gpu4pyscf/tests/test_dft.py b/gpu4pyscf/tests/test_dft.py
index dc3156cf..4546da4e 100644
--- a/gpu4pyscf/tests/test_dft.py
+++ b/gpu4pyscf/tests/test_dft.py
@@ -13,12 +13,16 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
+import unittest
import numpy as np
import pyscf
import pytest
import cupy
+from gpu4pyscf.dft import rks, uks
-atom = '''
+def setUpModule():
+ global mol
+ atom = '''
C -0.07551087 1.68127663 -0.10745193
O 1.33621755 1.87147409 -0.39326987
C 1.67074668 2.95729545 0.49387976
@@ -41,112 +45,116 @@
H -3.93210821 0.28874990 -1.89865997
'''
-mol = pyscf.M(atom=atom, basis='def2-tzvpp', max_memory=32000, cart=0)
-mol.output = '/dev/null'
-mol.build()
-mol.verbose = 1
-
-@pytest.mark.smoke
-def test_b3lyp_with_d3bj():
- print('-------- DFRKS with D3(BJ) -------')
- from gpu4pyscf.dft import rks
- mf = rks.RKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit')
- mf.grids.atom_grid = (99,590)
- mf.conv_tol = 1e-10
- mf.conv_tol_cpscf = 1e-8
- mf.disp = 'd3bj'
- e_dft = mf.kernel()
- assert np.abs(e_dft - -685.0326965348272) < 1e-7
-
- g = mf.nuc_grad_method().kernel()
- assert np.abs(cupy.linalg.norm(g) - 0.17498362161082373) < 1e-5
-
- h = mf.Hessian().kernel()
- assert np.abs(cupy.linalg.norm(h) - 3.7684319231335377) < 1e-4
-
-@pytest.mark.smoke
-def test_b3lyp_d3bj():
- print('-------- DFRKS with D3(BJ) -------')
- from gpu4pyscf.dft import rks
- mf = rks.RKS(mol, xc='b3lyp-d3bj').density_fit(auxbasis='def2-tzvpp-jkfit')
- mf.grids.atom_grid = (99,590)
- mf.conv_tol = 1e-10
- mf.conv_tol_cpscf = 1e-8
- e_dft = mf.kernel()
- assert np.abs(e_dft - -685.0326965348272) < 1e-7
-
- g = mf.nuc_grad_method().kernel()
- assert np.abs(cupy.linalg.norm(g) - 0.17498362161082373) < 1e-5
-
- h = mf.Hessian().kernel()
- assert np.abs(cupy.linalg.norm(h) - 3.7684319231335377) < 1e-4
-
-@pytest.mark.smoke
-def test_DFUKS():
- print('------- DFUKS with D3(BJ) -------')
- from gpu4pyscf.dft import uks
- mf = uks.UKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit')
- mf.grids.atom_grid = (99,590)
- mf.conv_tol = 1e-10
- mf.conv_tol_cpscf = 1e-8
- mf.disp = 'd3bj'
- e_dft = mf.kernel()
- assert np.abs(e_dft - -685.0326965349493) < 1e-7
-
- g = mf.nuc_grad_method().kernel()
- assert np.abs(cupy.linalg.norm(g) - 0.17498264516108836) < 1e-5
-
- h = mf.Hessian().kernel()
- assert np.abs(cupy.linalg.norm(h) - 3.768429871470736) < 1e-4
-
-@pytest.mark.smoke
-def test_RKS():
- print('-------- RKS with D3(BJ) -------')
- from gpu4pyscf.dft import rks
- mf = rks.RKS(mol, xc='b3lyp')
- mf.grids.atom_grid = (99,590)
- mf.conv_tol = 1e-12
- mf.disp = 'd3bj'
- e_dft = mf.kernel()
- assert np.abs(e_dft - -685.0325611822375) < 1e-7
-
- g = mf.nuc_grad_method().kernel()
- assert np.abs(cupy.linalg.norm(g) - 0.1750368231223345) < 1e-6
-
-@pytest.mark.smoke
-def test_DFRKS_with_SMD():
- print('----- DFRKS with SMD -----')
- from gpu4pyscf.dft import rks
- mf = rks.RKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit')
- mf = mf.SMD()
- mf.grids.atom_grid = (99,590)
- mf.conv_tol = 1e-10
- mf.conv_tol_cpscf = 1e-8
- mf.disp = 'd3bj'
- e_dft = mf.kernel()
- assert np.abs(e_dft - -685.0578838805443) < 1e-7
-
- g = mf.nuc_grad_method().kernel()
- assert np.abs(cupy.linalg.norm(g) - 0.16804945458657145) < 1e-5
-
- h = mf.Hessian().kernel()
- assert np.abs(cupy.linalg.norm(h) - 3.741783814494321) < 1e-4
-
-@pytest.mark.smoke
-def test_DFUKS_with_SMD():
- print('------- DFUKS with SMD ---------')
- from gpu4pyscf.dft import uks
- mf = uks.UKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit')
- mf = mf.SMD()
- mf.grids.atom_grid = (99,590)
- mf.conv_tol = 1e-10
- mf.conv_tol_cpscf = 1e-8
- mf.disp = 'd3bj'
- e_dft = mf.kernel()
- assert np.abs(e_dft - -685.05788388063) < 1e-7
-
- g = mf.nuc_grad_method().kernel()
- assert np.abs(cupy.linalg.norm(g) - 0.1680496465773684) < 1e-5
-
- h = mf.Hessian().kernel()
- assert np.abs(cupy.linalg.norm(h) - 3.7417788481647563) < 1e-4
+ mol = pyscf.M(atom=atom, basis='def2-tzvpp', max_memory=32000, cart=0)
+ mol.output = '/dev/null'
+ mol.build()
+ mol.verbose = 1
+
+def tearDownModule():
+ global mol
+ mol.stdout.close()
+ del mol
+
+class KnownValues(unittest.TestCase):
+ @pytest.mark.smoke
+ def test_b3lyp_with_d3bj(self):
+ print('-------- DFRKS with D3(BJ) -------')
+ mf = rks.RKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit')
+ mf.grids.atom_grid = (99,590)
+ mf.conv_tol = 1e-10
+ mf.conv_tol_cpscf = 1e-8
+ mf.disp = 'd3bj'
+ e_dft = mf.kernel()
+ assert np.abs(e_dft - -685.0326965348272) < 1e-7
+
+ g = mf.nuc_grad_method().kernel()
+ assert np.abs(cupy.linalg.norm(g) - 0.17498362161082373) < 1e-5
+
+ h = mf.Hessian().kernel()
+ assert np.abs(cupy.linalg.norm(h) - 3.7684319231335377) < 1e-4
+
+ @pytest.mark.smoke
+ def test_b3lyp_d3bj(self):
+ print('-------- DFRKS with D3(BJ) -------')
+ mf = rks.RKS(mol, xc='b3lyp-d3bj').density_fit(auxbasis='def2-tzvpp-jkfit')
+ mf.grids.atom_grid = (99,590)
+ mf.conv_tol = 1e-10
+ mf.conv_tol_cpscf = 1e-8
+ e_dft = mf.kernel()
+ assert np.abs(e_dft - -685.0326965348272) < 1e-7
+
+ g = mf.nuc_grad_method().kernel()
+ assert np.abs(cupy.linalg.norm(g) - 0.17498362161082373) < 1e-5
+
+ h = mf.Hessian().kernel()
+ assert np.abs(cupy.linalg.norm(h) - 3.7684319231335377) < 1e-4
+
+ @pytest.mark.smoke
+ def test_DFUKS(self):
+ print('------- DFUKS with D3(BJ) -------')
+ mf = uks.UKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit')
+ mf.grids.atom_grid = (99,590)
+ mf.conv_tol = 1e-10
+ mf.conv_tol_cpscf = 1e-8
+ mf.disp = 'd3bj'
+ e_dft = mf.kernel()
+ assert np.abs(e_dft - -685.0326965349493) < 1e-7
+
+ g = mf.nuc_grad_method().kernel()
+ assert np.abs(cupy.linalg.norm(g) - 0.17498264516108836) < 1e-5
+
+ h = mf.Hessian().kernel()
+ assert np.abs(cupy.linalg.norm(h) - 3.768429871470736) < 1e-4
+
+ @pytest.mark.smoke
+ def test_RKS(self):
+ print('-------- RKS with D3(BJ) -------')
+ mf = rks.RKS(mol, xc='b3lyp')
+ mf.grids.atom_grid = (99,590)
+ mf.conv_tol = 1e-12
+ mf.disp = 'd3bj'
+ e_dft = mf.kernel()
+ assert np.abs(e_dft - -685.0325611822375) < 1e-7
+
+ g = mf.nuc_grad_method().kernel()
+ assert np.abs(cupy.linalg.norm(g) - 0.1750368231223345) < 1e-6
+
+ @pytest.mark.smoke
+ def test_DFRKS_with_SMD(self):
+ print('----- DFRKS with SMD -----')
+ mf = rks.RKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit')
+ mf = mf.SMD()
+ mf.grids.atom_grid = (99,590)
+ mf.conv_tol = 1e-10
+ mf.conv_tol_cpscf = 1e-8
+ mf.disp = 'd3bj'
+ e_dft = mf.kernel()
+ assert np.abs(e_dft - -685.0578838805443) < 1e-7
+
+ g = mf.nuc_grad_method().kernel()
+ assert np.abs(cupy.linalg.norm(g) - 0.16905807654571403) < 1e-5
+
+ h = mf.Hessian().kernel()
+ assert np.abs(cupy.linalg.norm(h) - 3.743840896534178) < 1e-4
+
+ @pytest.mark.smoke
+ def test_DFUKS_with_SMD(self):
+ print('------- DFUKS with SMD ---------')
+ mf = uks.UKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit')
+ mf = mf.SMD()
+ mf.grids.atom_grid = (99,590)
+ mf.conv_tol = 1e-10
+ mf.conv_tol_cpscf = 1e-8
+ mf.disp = 'd3bj'
+ e_dft = mf.kernel()
+ assert np.abs(e_dft - -685.05788388063) < 1e-7
+
+ g = mf.nuc_grad_method().kernel()
+ assert np.abs(cupy.linalg.norm(g) - 0.1690582751813457) < 1e-5
+
+ h = mf.Hessian().kernel()
+ assert np.abs(cupy.linalg.norm(h) - 3.743858482519822) < 1e-4
+
+if __name__ == "__main__":
+ print("Full Smoke Tests")
+ unittest.main()