Skip to content

Commit

Permalink
int1e first derivative with density and charge contraction working, n…
Browse files Browse the repository at this point in the history
…ot optimized
  • Loading branch information
henryw7 committed Dec 12, 2024
1 parent ed0ef9e commit a33940b
Show file tree
Hide file tree
Showing 9 changed files with 824 additions and 19 deletions.
18 changes: 9 additions & 9 deletions gpu4pyscf/gto/int3c1e.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@

BLKSIZE = 128

libgvhf = load_library('libgvhf')
libgint = load_library('libgint')

class VHFOpt(_vhf.VHFOpt):
Expand Down Expand Up @@ -58,7 +57,7 @@ def __init__(self, mol, intor='int2e', prescreen='CVHFnoscreen',
def clear(self):
_vhf.VHFOpt.__del__(self)
for n, bpcache in self._bpcache.items():
libgvhf.GINTdel_basis_prod(ctypes.byref(bpcache))
libgint.GINTdel_basis_prod(ctypes.byref(bpcache))
return self

def __del__(self):
Expand Down Expand Up @@ -296,7 +295,7 @@ def get_int3c1e_charge_contracted(mol, grids, charge_exponents, charges, intopt)
if charge_exponents is not None:
charge_exponents = cp.asarray(charge_exponents, order='C')

int1e = cp.zeros([mol.nao, mol.nao], order='C')
int1e_charge_contracted = cp.zeros([mol.nao, mol.nao], order='C')
for cp_ij_id, _ in enumerate(intopt.log_qs):
cpi = intopt.cp_idx[cp_ij_id]
cpj = intopt.cp_jdx[cp_ij_id]
Expand Down Expand Up @@ -355,14 +354,14 @@ def get_int3c1e_charge_contracted(mol, grids, charge_exponents, charges, intopt)
int1e_angular_slice = cart2sph(int1e_angular_slice, axis=0, ang=lj)
int1e_angular_slice = cart2sph(int1e_angular_slice, axis=1, ang=li)

int1e[j0:j1, i0:i1] = int1e_angular_slice
int1e_charge_contracted[j0:j1, i0:i1] = int1e_angular_slice

row, col = np.tril_indices(nao)
int1e[row, col] = int1e[col, row]
int1e_charge_contracted[row, col] = int1e_charge_contracted[col, row]
ao_idx = np.argsort(intopt._ao_idx)
int1e = int1e[np.ix_(ao_idx, ao_idx)]
int1e_charge_contracted = int1e_charge_contracted[np.ix_(ao_idx, ao_idx)]

return int1e
return int1e_charge_contracted

def get_int3c1e_density_contracted(mol, grids, charge_exponents, dm, intopt):
omega = mol.omega
Expand Down Expand Up @@ -397,15 +396,16 @@ def get_int3c1e_density_contracted(mol, grids, charge_exponents, dm, intopt):

n_total_hermite_density = intopt.density_offset[-1]
dm_pair_ordered = np.zeros(n_total_hermite_density)
libgvhf.GINTinit_J_density_rys_preprocess(dm.ctypes.data_as(ctypes.c_void_p),
libgint.GINTinit_J_density_rys_preprocess(dm.ctypes.data_as(ctypes.c_void_p),
dm_pair_ordered.ctypes.data_as(ctypes.c_void_p),
ctypes.c_int(1), ctypes.c_int(nao_cart), ctypes.c_int(len(intopt.bas_pairs_locs) - 1),
intopt.bas_pair2shls.ctypes.data_as(ctypes.c_void_p),
intopt.bas_pairs_locs.ctypes.data_as(ctypes.c_void_p),
l_ij.ctypes.data_as(ctypes.c_void_p),
intopt.density_offset.ctypes.data_as(ctypes.c_void_p),
ao_loc_sorted_order.ctypes.data_as(ctypes.c_void_p),
bas_coords.ctypes.data_as(ctypes.c_void_p))
bas_coords.ctypes.data_as(ctypes.c_void_p),
ctypes.c_bool(True))

dm_pair_ordered = cp.asarray(dm_pair_ordered)

Expand Down
189 changes: 187 additions & 2 deletions gpu4pyscf/gto/int3c1e_ip.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,12 @@
import cupy as cp
import numpy as np

from pyscf.gto import ATOM_OF
from pyscf.lib import c_null_ptr
from gpu4pyscf.lib.cupy_helper import load_library, cart2sph, get_avail_mem


libgint = load_library('libgint')


def get_int3c1e_ip(mol, grids, charge_exponents, intopt):
omega = mol.omega
assert omega >= 0.0, "Short-range one electron integrals with GPU acceleration is not implemented."
Expand Down Expand Up @@ -122,3 +121,189 @@ def get_int3c1e_ip(mol, grids, charge_exponents, intopt):
int3c_grid_slice[5, :, :, :].get(out = int3c_ip2[2, i_grid_split : i_grid_split + ngrids_of_split, :, :])

return int3c_ip1, int3c_ip2

def get_int3c1e_ip1_charge_contracted(mol, grids, charge_exponents, charges, intopt):
omega = mol.omega
assert omega >= 0.0, "Short-range one electron integrals with GPU acceleration is not implemented."

nao = mol.nao

charges = cp.asarray(charges).reshape([-1, 1], order='C')
grids = cp.concatenate([grids, charges], axis=1)

int1e_charge_contracted = cp.zeros([3, mol.nao, mol.nao], order='C')
for cp_ij_id, _ in enumerate(intopt.log_qs):
cpi = intopt.cp_idx[cp_ij_id]
cpj = intopt.cp_jdx[cp_ij_id]
li = intopt.angular[cpi]
lj = intopt.angular[cpj]

stream = cp.cuda.get_current_stream()
nao_cart = intopt._sorted_mol.nao

log_q_ij = intopt.log_qs[cp_ij_id]

nbins = 1
bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32)

i0, i1 = intopt.cart_ao_loc[cpi], intopt.cart_ao_loc[cpi+1]
j0, j1 = intopt.cart_ao_loc[cpj], intopt.cart_ao_loc[cpj+1]
ni = i1 - i0
nj = j1 - j0

ao_offsets = np.array([i0, j0], dtype=np.int32)
strides = np.array([ni, ni*nj], dtype=np.int32)

charge_exponents_pointer = c_null_ptr()
if charge_exponents is not None:
charge_exponents_pointer = charge_exponents.data.ptr

ngrids = grids.shape[0]
# n_charge_sum_per_thread = 1 # means every thread processes one pair and one grid
# n_charge_sum_per_thread = ngrids # or larger number gaurantees one thread processes one pair and all grid points
n_charge_sum_per_thread = 10

int1e_angular_slice = cp.zeros([3, j1-j0, i1-i0], order='C')

err = libgint.GINTfill_int3c1e_ip1_charge_contracted(
ctypes.cast(stream.ptr, ctypes.c_void_p),
intopt.bpcache,
ctypes.cast(grids.data.ptr, ctypes.c_void_p),
ctypes.cast(charge_exponents_pointer, ctypes.c_void_p),
ctypes.c_int(ngrids),
ctypes.cast(int1e_angular_slice.data.ptr, ctypes.c_void_p),
ctypes.c_int(nao_cart),
strides.ctypes.data_as(ctypes.c_void_p),
ao_offsets.ctypes.data_as(ctypes.c_void_p),
bins_locs_ij.ctypes.data_as(ctypes.c_void_p),
ctypes.c_int(nbins),
ctypes.c_int(cp_ij_id),
ctypes.c_double(omega),
ctypes.c_int(n_charge_sum_per_thread))

if err != 0:
raise RuntimeError('GINTfill_int3c1e_charge_contracted failed')

i0, i1 = intopt.ao_loc[cpi], intopt.ao_loc[cpi+1]
j0, j1 = intopt.ao_loc[cpj], intopt.ao_loc[cpj+1]
if not mol.cart:
int1e_angular_slice = cart2sph(int1e_angular_slice, axis=1, ang=lj)
int1e_angular_slice = cart2sph(int1e_angular_slice, axis=2, ang=li)

int1e_charge_contracted[:, j0:j1, i0:i1] = int1e_angular_slice

ao_idx = np.argsort(intopt._ao_idx)
derivative_idx = np.arange(3)
int1e_charge_contracted = int1e_charge_contracted[np.ix_(derivative_idx, ao_idx, ao_idx)]

return int1e_charge_contracted

def get_int3c1e_ip2_density_contracted(mol, grids, charge_exponents, dm, intopt):
omega = mol.omega
assert omega >= 0.0, "Short-range one electron integrals with GPU acceleration is not implemented."

nao_cart = intopt._sorted_mol.nao
ngrids = grids.shape[0]

dm = intopt.sort_orbitals(dm, [0,1])
if not mol.cart:
cart2sph_transformation_matrix = intopt.cart2sph
# TODO: This part is inefficient (O(N^3)), should be changed to the O(N^2) algorithm
dm = cart2sph_transformation_matrix @ dm @ cart2sph_transformation_matrix.T
dm = dm.flatten(order='F') # Column major order matches (i + j * n_ao) access pattern in the C function

dm = cp.asnumpy(dm)

ao_loc_sorted_order = intopt._sorted_mol.ao_loc_nr(cart = True)
l_ij = intopt.l_ij.T.flatten()
bas_coords = intopt._sorted_mol.atom_coords()[intopt._sorted_mol._bas[:, ATOM_OF]].flatten()

n_total_hermite_density = intopt.density_offset[-1]
dm_pair_ordered = np.zeros(n_total_hermite_density)
libgint.GINTinit_J_density_rys_preprocess(dm.ctypes.data_as(ctypes.c_void_p),
dm_pair_ordered.ctypes.data_as(ctypes.c_void_p),
ctypes.c_int(1), ctypes.c_int(nao_cart), ctypes.c_int(len(intopt.bas_pairs_locs) - 1),
intopt.bas_pair2shls.ctypes.data_as(ctypes.c_void_p),
intopt.bas_pairs_locs.ctypes.data_as(ctypes.c_void_p),
l_ij.ctypes.data_as(ctypes.c_void_p),
intopt.density_offset.ctypes.data_as(ctypes.c_void_p),
ao_loc_sorted_order.ctypes.data_as(ctypes.c_void_p),
bas_coords.ctypes.data_as(ctypes.c_void_p),
ctypes.c_bool(False))

dm_pair_ordered = cp.asarray(dm_pair_ordered)

n_threads_per_block_1d = 16
n_max_blocks_per_grid_1d = 65535
n_max_threads_1d = n_threads_per_block_1d * n_max_blocks_per_grid_1d
n_grid_split = int(np.ceil(ngrids / n_max_threads_1d))
if (n_grid_split > 100):
print(f"Grid dimension = {ngrids} is too large, more than 100 kernels for one electron integral will be launched.")
ngrids_per_split = (ngrids + n_grid_split - 1) // n_grid_split

int3c_density_contracted = cp.zeros([3, ngrids], order='C')

for i_grid_split in range(0, ngrids, ngrids_per_split):
ngrids_of_split = np.min([ngrids_per_split, ngrids - i_grid_split])
for cp_ij_id, _ in enumerate(intopt.log_qs):
stream = cp.cuda.get_current_stream()

log_q_ij = intopt.log_qs[cp_ij_id]

nbins = 1
bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32)

charge_exponents_pointer = c_null_ptr()
if charge_exponents is not None:
charge_exponents_pointer = charge_exponents[i_grid_split : i_grid_split + ngrids_of_split].data.ptr

# n_pair_sum_per_thread = 1 # means every thread processes one pair and one grid
# n_pair_sum_per_thread = nao_cart # or larger number gaurantees one thread processes one grid and all pairs of the same type
n_pair_sum_per_thread = nao_cart

err = libgint.GINTfill_int3c1e_ip2_density_contracted(
ctypes.cast(stream.ptr, ctypes.c_void_p),
intopt.bpcache,
ctypes.cast(grids[i_grid_split : i_grid_split + ngrids_of_split, :].data.ptr, ctypes.c_void_p),
ctypes.cast(charge_exponents_pointer, ctypes.c_void_p),
ctypes.c_int(ngrids_of_split),
ctypes.cast(dm_pair_ordered.data.ptr, ctypes.c_void_p),
intopt.density_offset.ctypes.data_as(ctypes.c_void_p),
ctypes.cast(int3c_density_contracted[:, i_grid_split : i_grid_split + ngrids_of_split].data.ptr, ctypes.c_void_p),
bins_locs_ij.ctypes.data_as(ctypes.c_void_p),
ctypes.c_int(nbins),
ctypes.c_int(cp_ij_id),
ctypes.c_double(omega),
ctypes.c_int(n_pair_sum_per_thread))

if err != 0:
raise RuntimeError('GINTfill_int3c1e_density_contracted failed')

return int3c_density_contracted

def get_int3c1e_ip_contracted(mol, grids, charge_exponents, dm, charges, intopt):
dm = cp.asarray(dm)
if dm.ndim == 3:
if dm.shape[0] > 2:
print("Warning: There are more than two density matrices to contract with one electron integrals, "
"it's not from an unrestricted calculation, and we're unsure about your purpose. "
"We sum the density matrices up, please check if that's expected.")
dm = cp.einsum("ijk->jk", dm)

assert dm.ndim == 2
assert dm.shape[0] == dm.shape[1] and dm.shape[0] == mol.nao

grids = cp.asarray(grids, order='C')
if charge_exponents is not None:
charge_exponents = cp.asarray(charge_exponents, order='C')

assert charges.ndim == 1 and charges.shape[0] == grids.shape[0]
charges = cp.asarray(charges)

int3c_ip2 = get_int3c1e_ip2_density_contracted(mol, grids, charge_exponents, dm, intopt)
int3c_ip2 = int3c_ip2 * charges

int3c_ip1 = get_int3c1e_ip1_charge_contracted(mol, grids, charge_exponents, charges, intopt)
int3c_ip1 = cp.einsum('xji,ij->xi', int3c_ip1, dm)

return int3c_ip1, int3c_ip2
6 changes: 4 additions & 2 deletions gpu4pyscf/gto/moleintor.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import numpy as np

from gpu4pyscf.gto.int3c1e import VHFOpt, get_int3c1e, get_int3c1e_density_contracted, get_int3c1e_charge_contracted
from gpu4pyscf.gto.int3c1e_ip import get_int3c1e_ip
from gpu4pyscf.gto.int3c1e_ip import get_int3c1e_ip, get_int3c1e_ip_contracted

def intor(mol, intor, grids, charge_exponents=None, dm=None, charges=None, direct_scf_tol=1e-13, intopt=None):
assert grids is not None
Expand Down Expand Up @@ -52,6 +52,8 @@ def intor(mol, intor, grids, charge_exponents=None, dm=None, charges=None, direc
if dm is None and charges is None:
return get_int3c1e_ip(mol, grids, charge_exponents, intopt)
else:
raise NotImplementedError()
assert dm is not None
assert charges is not None
return get_int3c1e_ip_contracted(mol, grids, charge_exponents, dm, charges, intopt)
else:
raise NotImplementedError(f"GPU intor {intor} is not implemented.")
Loading

0 comments on commit a33940b

Please sign in to comment.