From e003ef5ff631505eaf4e98b98c2499650b0eb375 Mon Sep 17 00:00:00 2001
From: Henry Wang
Date: Thu, 23 Jan 2025 02:01:33 +0000
Subject: [PATCH] PCM Hessian: optimize electron Hessian, remove
 nao*nao*ngrids memory requirement

---
 gpu4pyscf/gto/int3c1e_ipip.py                 | 410 +++++++++++
 gpu4pyscf/lib/gint/CMakeLists.txt             |   1 +
 gpu4pyscf/lib/gint/g3c1e_ipip.cu              | 635 ++++++++++++++++++
 gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ipip.cu | 361 ++++++++++
 gpu4pyscf/solvent/hessian/pcm.py              |  62 +-
 5 files changed, 1441 insertions(+), 28 deletions(-)
 create mode 100644 gpu4pyscf/gto/int3c1e_ipip.py
 create mode 100644 gpu4pyscf/lib/gint/g3c1e_ipip.cu
 create mode 100644 gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ipip.cu

diff --git a/gpu4pyscf/gto/int3c1e_ipip.py b/gpu4pyscf/gto/int3c1e_ipip.py
new file mode 100644
index 00000000..1c2abf5c
--- /dev/null
+++ b/gpu4pyscf/gto/int3c1e_ipip.py
@@ -0,0 +1,410 @@
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import ctypes
+import cupy as cp
+import numpy as np
+from pyscf import lib
+from pyscf.gto import ATOM_OF
+from pyscf.lib import c_null_ptr
+from gpu4pyscf.lib.cupy_helper import load_library, cart2sph, get_avail_mem
+from gpu4pyscf.gto.int3c1e import VHFOpt
+
+libgint = load_library('libgint')
+
+def get_int3c1e_ipip1_charge_contracted(mol, grids, charge_exponents, charges, intopt):
+    omega = mol.omega
+    assert omega >= 0.0, "Short-range one electron integrals with GPU acceleration are not implemented."
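+    # NOTE: this routine returns the charge-contracted second derivative of the
+    # one-electron grid integrals with respect to the bra shell center A,
+    #     out[p, q, mu, nu] = sum_g charges[g] * d^2 <mu|1/|r - r_g||nu> / dA_p dA_q
+    # as a (3, 3, nao, nao) tensor. The kernel fills only the upper triangle in
+    # (p, q); the lower triangle is copied in by symmetry after the kernel call.
+    # Minimal usage sketch (the names mol, grid_coords, q and dm are hypothetical):
+    #     v2 = int1e_grids_ipip1(mol, grid_coords, charges=q)  # (3, 3, nao, nao)
+    #     d2e = cp.einsum('pqij,ij->pq', v2, dm)               # 3x3 Hessian block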
+
+    grids = cp.asarray(grids, order='C')
+    if charge_exponents is not None:
+        charge_exponents = cp.asarray(charge_exponents, order='C')
+
+    assert charges.ndim == 1 and charges.shape[0] == grids.shape[0]
+    charges = cp.asarray(charges).astype(np.float64)
+
+    charges = charges.reshape([-1, 1], order='C')
+    grids = cp.concatenate([grids, charges], axis=1)
+
+    int1e_charge_contracted = cp.empty([3, 3, mol.nao, mol.nao], order='C')
+    for cp_ij_id, _ in enumerate(intopt.log_qs):
+        cpi = intopt.cp_idx[cp_ij_id]
+        cpj = intopt.cp_jdx[cp_ij_id]
+        li = intopt.angular[cpi]
+        lj = intopt.angular[cpj]
+
+        stream = cp.cuda.get_current_stream()
+
+        log_q_ij = intopt.log_qs[cp_ij_id]
+
+        nbins = 1
+        bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32)
+
+        i0, i1 = intopt.cart_ao_loc[cpi], intopt.cart_ao_loc[cpi+1]
+        j0, j1 = intopt.cart_ao_loc[cpj], intopt.cart_ao_loc[cpj+1]
+        ni = i1 - i0
+        nj = j1 - j0
+
+        ao_offsets = np.array([i0, j0], dtype=np.int32)
+        strides = np.array([ni, ni*nj], dtype=np.int32)
+
+        charge_exponents_pointer = c_null_ptr()
+        if charge_exponents is not None:
+            charge_exponents_pointer = charge_exponents.data.ptr
+
+        ngrids = grids.shape[0]
+        # n_charge_sum_per_thread = 1 # means every thread processes one pair and one grid
+        # n_charge_sum_per_thread = ngrids # or larger number guarantees one thread processes one pair and all grid points
+        n_charge_sum_per_thread = 10
+
+        int1e_angular_slice = cp.zeros([3, 3, j1-j0, i1-i0], order='C')
+
+        err = libgint.GINTfill_int3c1e_ipip1_charge_contracted(
+            ctypes.cast(stream.ptr, ctypes.c_void_p),
+            intopt.bpcache,
+            ctypes.cast(grids.data.ptr, ctypes.c_void_p),
+            ctypes.cast(charge_exponents_pointer, ctypes.c_void_p),
+            ctypes.c_int(ngrids),
+            ctypes.cast(int1e_angular_slice.data.ptr, ctypes.c_void_p),
+            strides.ctypes.data_as(ctypes.c_void_p),
+            ao_offsets.ctypes.data_as(ctypes.c_void_p),
+            bins_locs_ij.ctypes.data_as(ctypes.c_void_p),
+            ctypes.c_int(nbins),
+            ctypes.c_int(cp_ij_id),
+            ctypes.c_double(omega),
+            ctypes.c_int(n_charge_sum_per_thread))
+
+        if err != 0:
+            raise RuntimeError('GINTfill_int3c1e_ipip1_charge_contracted failed')
+
+        int1e_angular_slice[1,0] = int1e_angular_slice[0,1]
+        int1e_angular_slice[2,0] = int1e_angular_slice[0,2]
+        int1e_angular_slice[2,1] = int1e_angular_slice[1,2]
+
+        i0, i1 = intopt.ao_loc[cpi], intopt.ao_loc[cpi+1]
+        j0, j1 = intopt.ao_loc[cpj], intopt.ao_loc[cpj+1]
+        if not mol.cart:
+            int1e_angular_slice = cart2sph(int1e_angular_slice, axis=2, ang=lj)
+            int1e_angular_slice = cart2sph(int1e_angular_slice, axis=3, ang=li)
+
+        int1e_charge_contracted[:, :, i0:i1, j0:j1] = int1e_angular_slice.transpose(0,1,3,2)
+
+    return intopt.unsort_orbitals(int1e_charge_contracted, axis=[2,3])
+
+def get_int3c1e_ipvip1_charge_contracted(mol, grids, charge_exponents, charges, intopt):
+    omega = mol.omega
+    assert omega >= 0.0, "Short-range one electron integrals with GPU acceleration are not implemented."
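+    # NOTE: this variant differentiates once with respect to the bra center A and
+    # once with respect to the ket center B,
+    #     out[p, q, mu, nu] = sum_g charges[g] * d^2 <mu|1/|r - r_g||nu> / dA_p dB_q
+    # All nine (p, q) components are distinct here, so unlike the ipip1 case no
+    # symmetrization step is needed after the kernel call.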
+
+    grids = cp.asarray(grids, order='C')
+    if charge_exponents is not None:
+        charge_exponents = cp.asarray(charge_exponents, order='C')
+
+    assert charges.ndim == 1 and charges.shape[0] == grids.shape[0]
+    charges = cp.asarray(charges).astype(np.float64)
+
+    charges = charges.reshape([-1, 1], order='C')
+    grids = cp.concatenate([grids, charges], axis=1)
+
+    int1e_charge_contracted = cp.empty([3, 3, mol.nao, mol.nao], order='C')
+    for cp_ij_id, _ in enumerate(intopt.log_qs):
+        cpi = intopt.cp_idx[cp_ij_id]
+        cpj = intopt.cp_jdx[cp_ij_id]
+        li = intopt.angular[cpi]
+        lj = intopt.angular[cpj]
+
+        stream = cp.cuda.get_current_stream()
+
+        log_q_ij = intopt.log_qs[cp_ij_id]
+
+        nbins = 1
+        bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32)
+
+        i0, i1 = intopt.cart_ao_loc[cpi], intopt.cart_ao_loc[cpi+1]
+        j0, j1 = intopt.cart_ao_loc[cpj], intopt.cart_ao_loc[cpj+1]
+        ni = i1 - i0
+        nj = j1 - j0
+
+        ao_offsets = np.array([i0, j0], dtype=np.int32)
+        strides = np.array([ni, ni*nj], dtype=np.int32)
+
+        charge_exponents_pointer = c_null_ptr()
+        if charge_exponents is not None:
+            charge_exponents_pointer = charge_exponents.data.ptr
+
+        ngrids = grids.shape[0]
+        # n_charge_sum_per_thread = 1 # means every thread processes one pair and one grid
+        # n_charge_sum_per_thread = ngrids # or larger number guarantees one thread processes one pair and all grid points
+        n_charge_sum_per_thread = 10
+
+        int1e_angular_slice = cp.zeros([3, 3, j1-j0, i1-i0], order='C')
+
+        err = libgint.GINTfill_int3c1e_ipvip1_charge_contracted(
+            ctypes.cast(stream.ptr, ctypes.c_void_p),
+            intopt.bpcache,
+            ctypes.cast(grids.data.ptr, ctypes.c_void_p),
+            ctypes.cast(charge_exponents_pointer, ctypes.c_void_p),
+            ctypes.c_int(ngrids),
+            ctypes.cast(int1e_angular_slice.data.ptr, ctypes.c_void_p),
+            strides.ctypes.data_as(ctypes.c_void_p),
+            ao_offsets.ctypes.data_as(ctypes.c_void_p),
+            bins_locs_ij.ctypes.data_as(ctypes.c_void_p),
+            ctypes.c_int(nbins),
+            ctypes.c_int(cp_ij_id),
+            ctypes.c_double(omega),
+            ctypes.c_int(n_charge_sum_per_thread))
+
+        if err != 0:
+            raise RuntimeError('GINTfill_int3c1e_ipvip1_charge_contracted failed')
+
+        i0, i1 = intopt.ao_loc[cpi], intopt.ao_loc[cpi+1]
+        j0, j1 = intopt.ao_loc[cpj], intopt.ao_loc[cpj+1]
+        if not mol.cart:
+            int1e_angular_slice = cart2sph(int1e_angular_slice, axis=2, ang=lj)
+            int1e_angular_slice = cart2sph(int1e_angular_slice, axis=3, ang=li)
+
+        int1e_charge_contracted[:, :, i0:i1, j0:j1] = int1e_angular_slice.transpose(0,1,3,2)
+
+    return intopt.unsort_orbitals(int1e_charge_contracted, axis=[2,3])
+
+def get_int3c1e_ip1ip2_charge_contracted(mol, grids, charge_exponents, charges, intopt):
+    omega = mol.omega
+    assert omega >= 0.0, "Short-range one electron integrals with GPU acceleration are not implemented."
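+    # NOTE: here one derivative acts on the bra center A and one on the grid point
+    # position r_g,
+    #     out[p, q, mu, nu] = sum_g charges[g] * d^2 <mu|1/|r - r_g||nu> / dA_p d(r_g)_q
+    # As in the ipvip1 case, all nine (p, q) components are distinct.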
+
+    grids = cp.asarray(grids, order='C')
+    if charge_exponents is not None:
+        charge_exponents = cp.asarray(charge_exponents, order='C')
+
+    assert charges.ndim == 1 and charges.shape[0] == grids.shape[0]
+    charges = cp.asarray(charges).astype(np.float64)
+
+    charges = charges.reshape([-1, 1], order='C')
+    grids = cp.concatenate([grids, charges], axis=1)
+
+    int1e_charge_contracted = cp.empty([3, 3, mol.nao, mol.nao], order='C')
+    for cp_ij_id, _ in enumerate(intopt.log_qs):
+        cpi = intopt.cp_idx[cp_ij_id]
+        cpj = intopt.cp_jdx[cp_ij_id]
+        li = intopt.angular[cpi]
+        lj = intopt.angular[cpj]
+
+        stream = cp.cuda.get_current_stream()
+
+        log_q_ij = intopt.log_qs[cp_ij_id]
+
+        nbins = 1
+        bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32)
+
+        i0, i1 = intopt.cart_ao_loc[cpi], intopt.cart_ao_loc[cpi+1]
+        j0, j1 = intopt.cart_ao_loc[cpj], intopt.cart_ao_loc[cpj+1]
+        ni = i1 - i0
+        nj = j1 - j0
+
+        ao_offsets = np.array([i0, j0], dtype=np.int32)
+        strides = np.array([ni, ni*nj], dtype=np.int32)
+
+        charge_exponents_pointer = c_null_ptr()
+        if charge_exponents is not None:
+            charge_exponents_pointer = charge_exponents.data.ptr
+
+        ngrids = grids.shape[0]
+        # n_charge_sum_per_thread = 1 # means every thread processes one pair and one grid
+        # n_charge_sum_per_thread = ngrids # or larger number guarantees one thread processes one pair and all grid points
+        n_charge_sum_per_thread = 10
+
+        int1e_angular_slice = cp.zeros([3, 3, j1-j0, i1-i0], order='C')
+
+        err = libgint.GINTfill_int3c1e_ip1ip2_charge_contracted(
+            ctypes.cast(stream.ptr, ctypes.c_void_p),
+            intopt.bpcache,
+            ctypes.cast(grids.data.ptr, ctypes.c_void_p),
+            ctypes.cast(charge_exponents_pointer, ctypes.c_void_p),
+            ctypes.c_int(ngrids),
+            ctypes.cast(int1e_angular_slice.data.ptr, ctypes.c_void_p),
+            strides.ctypes.data_as(ctypes.c_void_p),
+            ao_offsets.ctypes.data_as(ctypes.c_void_p),
+            bins_locs_ij.ctypes.data_as(ctypes.c_void_p),
+            ctypes.c_int(nbins),
+            ctypes.c_int(cp_ij_id),
+            ctypes.c_double(omega),
+            ctypes.c_int(n_charge_sum_per_thread))
+
+        if err != 0:
+            raise RuntimeError('GINTfill_int3c1e_ip1ip2_charge_contracted failed')
+
+        i0, i1 = intopt.ao_loc[cpi], intopt.ao_loc[cpi+1]
+        j0, j1 = intopt.ao_loc[cpj], intopt.ao_loc[cpj+1]
+        if not mol.cart:
+            int1e_angular_slice = cart2sph(int1e_angular_slice, axis=2, ang=lj)
+            int1e_angular_slice = cart2sph(int1e_angular_slice, axis=3, ang=li)
+
+        int1e_charge_contracted[:, :, i0:i1, j0:j1] = int1e_angular_slice.transpose(0,1,3,2)
+
+    return intopt.unsort_orbitals(int1e_charge_contracted, axis=[2,3])
+
+def get_int3c1e_ipip2_density_contracted(mol, grids, charge_exponents, dm, intopt):
+    omega = mol.omega
+    assert omega >= 0.0, "Short-range one electron integrals with GPU acceleration are not implemented."
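+    # NOTE: both derivatives act on the grid point position r_g, and the AO indices
+    # are contracted with the density matrix up front,
+    #     out[p, q, g] = sum_{mu,nu} dm[mu, nu] * d^2 <mu|1/|r - r_g||nu> / d(r_g)_p d(r_g)_q
+    # so the result is a (3, 3, ngrids) tensor. Contracting over AOs inside the
+    # kernel is what removes the nao*nao*ngrids intermediate mentioned in the
+    # commit message. Only the upper triangle in (p, q) is computed; the lower
+    # triangle is filled in by symmetry at the end of this function.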
+
+    nao_cart = intopt._sorted_mol.nao
+    ngrids = grids.shape[0]
+
+    grids = cp.asarray(grids, order='C')
+    if charge_exponents is not None:
+        charge_exponents = cp.asarray(charge_exponents, order='C')
+
+    dm = cp.asarray(dm)
+    assert dm.ndim == 2
+    assert dm.shape[0] == dm.shape[1] and dm.shape[0] == mol.nao
+
+    dm = intopt.sort_orbitals(dm, [0,1])
+    if not mol.cart:
+        cart2sph_transformation_matrix = intopt.cart2sph
+        # TODO: This part is inefficient (O(N^3)), should be changed to the O(N^2) algorithm
+        dm = cart2sph_transformation_matrix @ dm @ cart2sph_transformation_matrix.T
+    dm = dm.flatten(order='F') # Column major order matches (i + j * n_ao) access pattern in the C function
+
+    dm = cp.asnumpy(dm)
+
+    ao_loc_sorted_order = intopt._sorted_mol.ao_loc_nr(cart = True)
+    l_ij = intopt.l_ij.T.flatten()
+    bas_coords = intopt._sorted_mol.atom_coords()[intopt._sorted_mol._bas[:, ATOM_OF]].flatten()
+
+    n_total_hermite_density = intopt.density_offset[-1]
+    dm_pair_ordered = np.empty(n_total_hermite_density)
+    libgint.GINTinit_J_density_rys_preprocess(dm.ctypes.data_as(ctypes.c_void_p),
+                                              dm_pair_ordered.ctypes.data_as(ctypes.c_void_p),
+                                              ctypes.c_int(1), ctypes.c_int(nao_cart),
+                                              ctypes.c_int(len(intopt.bas_pairs_locs) - 1),
+                                              intopt.bas_pair2shls.ctypes.data_as(ctypes.c_void_p),
+                                              intopt.bas_pairs_locs.ctypes.data_as(ctypes.c_void_p),
+                                              l_ij.ctypes.data_as(ctypes.c_void_p),
+                                              intopt.density_offset.ctypes.data_as(ctypes.c_void_p),
+                                              ao_loc_sorted_order.ctypes.data_as(ctypes.c_void_p),
+                                              bas_coords.ctypes.data_as(ctypes.c_void_p),
+                                              ctypes.c_bool(False))
+
+    dm_pair_ordered = cp.asarray(dm_pair_ordered)
+
+    n_threads_per_block_1d = 16
+    n_max_blocks_per_grid_1d = 65535
+    n_max_threads_1d = n_threads_per_block_1d * n_max_blocks_per_grid_1d
+    n_grid_split = int(np.ceil(ngrids / n_max_threads_1d))
+    if (n_grid_split > 100):
+        print(f"ngrids = {ngrids} is too large; more than 100 kernel launches are required for the one electron integrals.")
+    ngrids_per_split = (ngrids + n_grid_split - 1) // n_grid_split
+
+    int3c_density_contracted = cp.zeros([3, 3, ngrids], order='C')
+
+    for p0, p1 in lib.prange(0, ngrids, ngrids_per_split):
+        for cp_ij_id, _ in enumerate(intopt.log_qs):
+            stream = cp.cuda.get_current_stream()
+
+            log_q_ij = intopt.log_qs[cp_ij_id]
+
+            nbins = 1
+            bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32)
+
+            charge_exponents_pointer = c_null_ptr()
+            if charge_exponents is not None:
+                exponents_slice = charge_exponents[p0:p1]
+                charge_exponents_pointer = exponents_slice.data.ptr
+            grids_slice = grids[p0:p1]
+
+            # n_pair_sum_per_thread = 1 # means every thread processes one pair and one grid
+            # n_pair_sum_per_thread = nao_cart # or larger number guarantees one thread processes one grid and all pairs of the same type
+            n_pair_sum_per_thread = nao_cart
+
+            err = libgint.GINTfill_int3c1e_ipip2_density_contracted(
+                ctypes.cast(stream.ptr, ctypes.c_void_p),
+                intopt.bpcache,
+                ctypes.cast(grids_slice.data.ptr, ctypes.c_void_p),
+                ctypes.cast(charge_exponents_pointer, ctypes.c_void_p),
+                ctypes.c_int(p1-p0),
+                ctypes.cast(dm_pair_ordered.data.ptr, ctypes.c_void_p),
+                intopt.density_offset.ctypes.data_as(ctypes.c_void_p),
+                ctypes.cast(int3c_density_contracted[:, p0:p1].data.ptr, ctypes.c_void_p),
+                bins_locs_ij.ctypes.data_as(ctypes.c_void_p),
+                ctypes.c_int(nbins),
+                ctypes.c_int(cp_ij_id),
+                ctypes.c_double(omega),
+                ctypes.c_int(n_pair_sum_per_thread))
+
+            if err != 0:
+                raise RuntimeError('GINTfill_int3c1e_ipip2_density_contracted failed')
+
+    int3c_density_contracted[1,0] = int3c_density_contracted[0,1]
+    int3c_density_contracted[2,0] = int3c_density_contracted[0,2]
+    int3c_density_contracted[2,1] = int3c_density_contracted[1,2]
+
+    return int3c_density_contracted
+
+def int1e_grids_ipip1(mol, grids, charge_exponents=None, charges=None, direct_scf_tol=1e-13, intopt=None):
+    assert grids is not None
+    assert charges is not None
+
+    if intopt is None:
+        intopt = VHFOpt(mol)
+        intopt.build(direct_scf_tol, aosym=False)
+    else:
+        assert isinstance(intopt, VHFOpt), \
+            f"Please make sure intopt is a {VHFOpt.__module__}.{VHFOpt.__name__} object."
+        assert hasattr(intopt, "density_offset"), "Please call the build() function of the VHFOpt object first."
+        assert not intopt.aosym
+
+    return get_int3c1e_ipip1_charge_contracted(mol, grids, charge_exponents, charges, intopt)
+
+def int1e_grids_ipvip1(mol, grids, charge_exponents=None, charges=None, direct_scf_tol=1e-13, intopt=None):
+    assert grids is not None
+    assert charges is not None
+
+    if intopt is None:
+        intopt = VHFOpt(mol)
+        intopt.build(direct_scf_tol, aosym=False)
+    else:
+        assert isinstance(intopt, VHFOpt), \
+            f"Please make sure intopt is a {VHFOpt.__module__}.{VHFOpt.__name__} object."
+        assert hasattr(intopt, "density_offset"), "Please call the build() function of the VHFOpt object first."
+        assert not intopt.aosym
+
+    return get_int3c1e_ipvip1_charge_contracted(mol, grids, charge_exponents, charges, intopt)
+
+def int1e_grids_ip1ip2(mol, grids, charge_exponents=None, charges=None, direct_scf_tol=1e-13, intopt=None):
+    assert grids is not None
+    assert charges is not None
+
+    if intopt is None:
+        intopt = VHFOpt(mol)
+        intopt.build(direct_scf_tol, aosym=False)
+    else:
+        assert isinstance(intopt, VHFOpt), \
+            f"Please make sure intopt is a {VHFOpt.__module__}.{VHFOpt.__name__} object."
+        assert hasattr(intopt, "density_offset"), "Please call the build() function of the VHFOpt object first."
+        assert not intopt.aosym
+
+    return get_int3c1e_ip1ip2_charge_contracted(mol, grids, charge_exponents, charges, intopt)
+
+def int1e_grids_ipip2(mol, grids, charge_exponents=None, dm=None, direct_scf_tol=1e-13, intopt=None):
+    assert grids is not None
+    assert dm is not None
+
+    if intopt is None:
+        intopt = VHFOpt(mol)
+        intopt.build(direct_scf_tol, aosym=False)
+    else:
+        assert isinstance(intopt, VHFOpt), \
+            f"Please make sure intopt is a {VHFOpt.__module__}.{VHFOpt.__name__} object."
+        assert hasattr(intopt, "density_offset"), "Please call the build() function of the VHFOpt object first."
+        assert not intopt.aosym
+
+    return get_int3c1e_ipip2_density_contracted(mol, grids, charge_exponents, dm, intopt)
diff --git a/gpu4pyscf/lib/gint/CMakeLists.txt b/gpu4pyscf/lib/gint/CMakeLists.txt
index 464efed6..7647c2c3 100644
--- a/gpu4pyscf/lib/gint/CMakeLists.txt
+++ b/gpu4pyscf/lib/gint/CMakeLists.txt
@@ -26,6 +26,7 @@ add_library(gint SHARED
   nr_fill_ao_ints.cu
   nr_fill_ao_int3c1e.cu
   nr_fill_ao_int3c1e_ip.cu
+  nr_fill_ao_int3c1e_ipip.cu
   nr_fill_ao_int3c2e.cu
   nr_fill_ao_int3c2e_ip1.cu
   nr_fill_ao_int3c2e_ip2.cu
diff --git a/gpu4pyscf/lib/gint/g3c1e_ipip.cu b/gpu4pyscf/lib/gint/g3c1e_ipip.cu
new file mode 100644
index 00000000..87ebb270
--- /dev/null
+++ b/gpu4pyscf/lib/gint/g3c1e_ipip.cu
@@ -0,0 +1,635 @@
+/*
+ * Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "gint.h"
+
+template <int NROOTS>
+__device__
+static void GINTwrite_int3c1e_ipip1_charge_contracted(const double* g, double* local_output, const double minus_two_a, const double prefactor, const int i_l, const int j_l)
+{
+    const int *idx = c_idx;
+    const int *idy = c_idx + TOT_NF;
+    const int *idz = c_idx + TOT_NF * 2;
+
+    const int g_size = NROOTS * (i_l + 2 + 1) * (j_l + 1);
+    const double* __restrict__ gx = g;
+    const double* __restrict__ gy = g + g_size;
+    const double* __restrict__ gz = g + g_size * 2;
+
+    const int n_density_elements_i = (i_l + 1) * (i_l + 2) / 2;
+    const int n_density_elements_j = (j_l + 1) * (j_l + 2) / 2;
+    const int n_density_elements_ij = n_density_elements_i * n_density_elements_j;
+    for (int j = 0; j < n_density_elements_j; j++) {
+        for (int i = 0; i < n_density_elements_i; i++) {
+            const int loc_j = c_l_locs[j_l] + j;
+            const int loc_i = c_l_locs[i_l] + i;
+            const int ix = idx[loc_i];
+            const int iy = idy[loc_i];
+            const int iz = idz[loc_i];
+            const int jx = idx[loc_j];
+            const int jy = idy[loc_j];
+            const int jz = idz[loc_j];
+            const int gx_offset = ix + jx * (i_l + 2 + 1);
+            const int gy_offset = iy + jy * (i_l + 2 + 1);
+            const int gz_offset = iz + jz * (i_l + 2 + 1);
+
+            double d2eri_dAxdAx = 0;
+            double d2eri_dAxdAy = 0;
+            double d2eri_dAxdAz = 0;
+            double d2eri_dAydAy = 0;
+            double d2eri_dAydAz = 0;
+            double d2eri_dAzdAz = 0;
+#pragma unroll
+            for (int i_root = 0; i_root < NROOTS; i_root++) {
+                const double gx_minus_2 = (ix >= 2 ? gx[(gx_offset - 2) * NROOTS + i_root] : 0);
+                const double gy_minus_2 = (iy >= 2 ? gy[(gy_offset - 2) * NROOTS + i_root] : 0);
+                const double gz_minus_2 = (iz >= 2 ? gz[(gz_offset - 2) * NROOTS + i_root] : 0);
+                const double gx_minus_1 = (ix >= 1 ? gx[(gx_offset - 1) * NROOTS + i_root] : 0);
+                const double gy_minus_1 = (iy >= 1 ? gy[(gy_offset - 1) * NROOTS + i_root] : 0);
+                const double gz_minus_1 = (iz >= 1 ? gz[(gz_offset - 1) * NROOTS + i_root] : 0);
+                const double gx_0 = gx[gx_offset * NROOTS + i_root];
+                const double gy_0 = gy[gy_offset * NROOTS + i_root];
+                const double gz_0 = gz[gz_offset * NROOTS + i_root];
+                const double gx_1 = gx[(gx_offset + 1) * NROOTS + i_root];
+                const double gy_1 = gy[(gy_offset + 1) * NROOTS + i_root];
+                const double gz_1 = gz[(gz_offset + 1) * NROOTS + i_root];
+                const double gx_2 = gx[(gx_offset + 2) * NROOTS + i_root];
+                const double gy_2 = gy[(gy_offset + 2) * NROOTS + i_root];
+                const double gz_2 = gz[(gz_offset + 2) * NROOTS + i_root];
+                const double dgx_dAx = ix * gx_minus_1 + minus_two_a * gx_1;
+                const double dgy_dAy = iy * gy_minus_1 + minus_two_a * gy_1;
+                const double dgz_dAz = iz * gz_minus_1 + minus_two_a * gz_1;
+                const double d2gx_dAx2 = ix * (ix - 1) * gx_minus_2 + minus_two_a * (2 * ix + 1) * gx_0 + minus_two_a * minus_two_a * gx_2;
+                const double d2gy_dAy2 = iy * (iy - 1) * gy_minus_2 + minus_two_a * (2 * iy + 1) * gy_0 + minus_two_a * minus_two_a * gy_2;
+                const double d2gz_dAz2 = iz * (iz - 1) * gz_minus_2 + minus_two_a * (2 * iz + 1) * gz_0 + minus_two_a * minus_two_a * gz_2;
+                d2eri_dAxdAx += d2gx_dAx2 * gy_0 * gz_0;
+                d2eri_dAxdAy += dgx_dAx * dgy_dAy * gz_0;
+                d2eri_dAxdAz += dgx_dAx * gy_0 * dgz_dAz;
+                d2eri_dAydAy += gx_0 * d2gy_dAy2 * gz_0;
+                d2eri_dAydAz += gx_0 * dgy_dAy * dgz_dAz;
+                d2eri_dAzdAz += gx_0 * gy_0 * d2gz_dAz2;
+            }
+            local_output[i + j * n_density_elements_i + 0 * n_density_elements_ij] += d2eri_dAxdAx * prefactor;
+            local_output[i + j * n_density_elements_i + 1 * n_density_elements_ij] += d2eri_dAxdAy * prefactor;
+            local_output[i + j * n_density_elements_i + 2 * n_density_elements_ij] += d2eri_dAxdAz * prefactor;
+            local_output[i + j * n_density_elements_i + 3 * n_density_elements_ij] += d2eri_dAydAy * prefactor;
+            local_output[i + j * n_density_elements_i + 4 * n_density_elements_ij] += d2eri_dAydAz * prefactor;
+            local_output[i + j * n_density_elements_i + 5 * n_density_elements_ij] += d2eri_dAzdAz * prefactor;
+        }
+    }
+}
+
+template <int NROOTS, int GSIZE_INT3C_1E>
+__global__
+static void GINTfill_int3c1e_ipip1_charge_contracted_kernel_general(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij,
+                                                                    const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j,
+                                                                    const double omega, const double* grid_points, const double* charge_exponents)
+{
+    const int ntasks_ij = offsets.ntasks_ij;
+    const int ngrids = offsets.ntasks_kl;
+    const int task_ij = blockIdx.x * blockDim.x + threadIdx.x;
+    if (task_ij >= ntasks_ij) {
+        return;
+    }
+
+    const int bas_ij = offsets.bas_ij + task_ij;
+    const int prim_ij = offsets.primitive_ij + task_ij * nprim_ij;
+    const int* bas_pair2bra = c_bpcache.bas_pair2bra;
+    const int* bas_pair2ket = c_bpcache.bas_pair2ket;
+    const int ish = bas_pair2bra[bas_ij];
+    const int jsh = bas_pair2ket[bas_ij];
+    const double* __restrict__ a_exponents = c_bpcache.a1;
+
+    constexpr int l_sum_max = (NROOTS - 1) * 2 + 1;
+    constexpr int l_i_max_density_elements = (l_sum_max + 1) / 2;
+    constexpr int l_j_max_density_elements = l_sum_max - l_i_max_density_elements;
+    double output_cache[(l_i_max_density_elements + 1) * (l_i_max_density_elements + 2) / 2
+                        * (l_j_max_density_elements + 1) * (l_j_max_density_elements + 2) / 2
+                        * 6] { 0.0 };
+
+    for (int task_grid = blockIdx.y * blockDim.y + threadIdx.y; task_grid < ngrids; task_grid += gridDim.y * blockDim.y) {
+        const double* grid_point = grid_points + task_grid * 4;
+        const double charge = grid_point[3];
+        const double charge_exponent = (charge_exponents != NULL) ? charge_exponents[task_grid] : 0.0;
+
+        double g[GSIZE_INT3C_1E];
+
+        for (int ij = prim_ij; ij < prim_ij+nprim_ij; ++ij) {
+            GINT_g1e<NROOTS>(g, grid_point, ish, jsh, ij, i_l + 2, j_l, charge_exponent, omega);
+            const double minus_two_a = -2.0 * a_exponents[ij];
+            GINTwrite_int3c1e_ipip1_charge_contracted<NROOTS>(g, output_cache, minus_two_a, charge, i_l, j_l);
+        }
+    }
+
+    const int* ao_loc = c_bpcache.ao_loc;
+
+    const int i0 = ao_loc[ish] - ao_offsets_i;
+    const int j0 = ao_loc[jsh] - ao_offsets_j;
+    const int n_density_elements_i = (i_l + 1) * (i_l + 2) / 2;
+    const int n_density_elements_j = (j_l + 1) * (j_l + 2) / 2;
+    const int n_density_elements_ij = n_density_elements_i * n_density_elements_j;
+    for (int j = 0; j < n_density_elements_j; j++) {
+        for (int i = 0; i < n_density_elements_i; i++) {
+            const double d2eri_dAxdAx = output_cache[i + j * n_density_elements_i + 0 * n_density_elements_ij];
+            const double d2eri_dAxdAy = output_cache[i + j * n_density_elements_i + 1 * n_density_elements_ij];
+            const double d2eri_dAxdAz = output_cache[i + j * n_density_elements_i + 2 * n_density_elements_ij];
+            const double d2eri_dAydAy = output_cache[i + j * n_density_elements_i + 3 * n_density_elements_ij];
+            const double d2eri_dAydAz = output_cache[i + j * n_density_elements_i + 4 * n_density_elements_ij];
+            const double d2eri_dAzdAz = output_cache[i + j * n_density_elements_i + 5 * n_density_elements_ij];
+            atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 0 * stride_ij), d2eri_dAxdAx);
+            atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 1 * stride_ij), d2eri_dAxdAy);
+            atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 2 * stride_ij), d2eri_dAxdAz);
+            atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 4 * stride_ij), d2eri_dAydAy);
+            atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 5 * stride_ij), d2eri_dAydAz);
+            atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 8 * stride_ij), d2eri_dAzdAz);
+        }
+    }
+}
+
+template <int NROOTS>
+__device__
+static void GINTwrite_int3c1e_ipvip1_charge_contracted(const double* g, double* local_output, const double minus_two_a, const double minus_two_b, const double prefactor, const int i_l, const int j_l)
+{
+    const int *idx = c_idx;
+    const int *idy = c_idx + TOT_NF;
+    const int *idz = c_idx + TOT_NF * 2;
+
+    const int g_size = NROOTS * (i_l + 1 + 1) * (j_l + 1 + 1);
+    const double* __restrict__ gx = g;
+    const double* __restrict__ gy = g + g_size;
+    const double* __restrict__ gz = g + g_size * 2;
+
+    const int n_density_elements_i = (i_l + 1) * (i_l + 2) / 2;
+    const int n_density_elements_j = (j_l + 1) * (j_l + 2) / 2;
+    const int n_density_elements_ij = n_density_elements_i * n_density_elements_j;
+    for (int j = 0; j < n_density_elements_j; j++) {
+        for (int i = 0; i < n_density_elements_i; i++) {
+            const int loc_j = c_l_locs[j_l] + j;
+            const int loc_i = c_l_locs[i_l] + i;
+            const int ix = idx[loc_i];
+            const int iy = idy[loc_i];
+            const int iz = idz[loc_i];
+            const int jx = idx[loc_j];
+            const int jy = idy[loc_j];
+            const int jz = idz[loc_j];
+            const int j_offset = i_l + 1 + 1;
+
+            double d2eri_dAxdBx = 0;
+            double d2eri_dAxdBy = 0;
+            double d2eri_dAxdBz = 0;
+            double d2eri_dAydBx = 0;
+            double d2eri_dAydBy = 0;
+            double d2eri_dAydBz = 0;
+            double d2eri_dAzdBx = 0;
+            double d2eri_dAzdBy = 0;
+            double d2eri_dAzdBz = 0;
+#pragma unroll
+            for (int i_root = 0; i_root < NROOTS; i_root++) {
+                const double gx_i_minus_1_j_minus_1 = ix * jx * (ix >= 1 && jx >= 1 ? 
gx[(ix - 1 + (jx - 1) * j_offset) * NROOTS + i_root] : 0); + const double gy_i_minus_1_j_minus_1 = iy * jy * (iy >= 1 && jy >= 1 ? gy[(iy - 1 + (jy - 1) * j_offset) * NROOTS + i_root] : 0); + const double gz_i_minus_1_j_minus_1 = iz * jz * (iz >= 1 && jz >= 1 ? gz[(iz - 1 + (jz - 1) * j_offset) * NROOTS + i_root] : 0); + const double gx_i_minus_1_j_1 = ix * minus_two_b * (ix >= 1 ? gx[(ix - 1 + (jx + 1) * j_offset) * NROOTS + i_root] : 0); + const double gy_i_minus_1_j_1 = iy * minus_two_b * (iy >= 1 ? gy[(iy - 1 + (jy + 1) * j_offset) * NROOTS + i_root] : 0); + const double gz_i_minus_1_j_1 = iz * minus_two_b * (iz >= 1 ? gz[(iz - 1 + (jz + 1) * j_offset) * NROOTS + i_root] : 0); + const double gx_i_1_j_minus_1 = jx * minus_two_a * (jx >= 1 ? gx[(ix + 1 + (jx - 1) * j_offset) * NROOTS + i_root] : 0); + const double gy_i_1_j_minus_1 = jy * minus_two_a * (jy >= 1 ? gy[(iy + 1 + (jy - 1) * j_offset) * NROOTS + i_root] : 0); + const double gz_i_1_j_minus_1 = jz * minus_two_a * (jz >= 1 ? gz[(iz + 1 + (jz - 1) * j_offset) * NROOTS + i_root] : 0); + const double gx_i_1_j_1 = minus_two_a * minus_two_b * gx[(ix + 1 + (jx + 1) * j_offset) * NROOTS + i_root]; + const double gy_i_1_j_1 = minus_two_a * minus_two_b * gy[(iy + 1 + (jy + 1) * j_offset) * NROOTS + i_root]; + const double gz_i_1_j_1 = minus_two_a * minus_two_b * gz[(iz + 1 + (jz + 1) * j_offset) * NROOTS + i_root]; + const double gx_0 = gx[(ix + jx * j_offset) * NROOTS + i_root]; + const double gy_0 = gy[(iy + jy * j_offset) * NROOTS + i_root]; + const double gz_0 = gz[(iz + jz * j_offset) * NROOTS + i_root]; + const double gx_i_1_j_0 = minus_two_a * gx[(ix + 1 + jx * j_offset) * NROOTS + i_root]; + const double gy_i_1_j_0 = minus_two_a * gy[(iy + 1 + jy * j_offset) * NROOTS + i_root]; + const double gz_i_1_j_0 = minus_two_a * gz[(iz + 1 + jz * j_offset) * NROOTS + i_root]; + const double gx_i_minus_1_j_0 = ix * (ix >= 1 ? gx[(ix - 1 + jx * j_offset) * NROOTS + i_root] : 0); + const double gy_i_minus_1_j_0 = iy * (iy >= 1 ? gy[(iy - 1 + jy * j_offset) * NROOTS + i_root] : 0); + const double gz_i_minus_1_j_0 = iz * (iz >= 1 ? gz[(iz - 1 + jz * j_offset) * NROOTS + i_root] : 0); + const double gx_i_0_j_1 = minus_two_b * gx[(ix + (jx + 1) * j_offset) * NROOTS + i_root]; + const double gy_i_0_j_1 = minus_two_b * gy[(iy + (jy + 1) * j_offset) * NROOTS + i_root]; + const double gz_i_0_j_1 = minus_two_b * gz[(iz + (jz + 1) * j_offset) * NROOTS + i_root]; + const double gx_i_0_j_minus_1 = jx * (jx >= 1 ? gx[(ix + (jx - 1) * j_offset) * NROOTS + i_root] : 0); + const double gy_i_0_j_minus_1 = jy * (jy >= 1 ? gy[(iy + (jy - 1) * j_offset) * NROOTS + i_root] : 0); + const double gz_i_0_j_minus_1 = jz * (jz >= 1 ? 
gz[(iz + (jz - 1) * j_offset) * NROOTS + i_root] : 0);
+
+                d2eri_dAxdBx += (gx_i_minus_1_j_minus_1 + gx_i_minus_1_j_1 + gx_i_1_j_minus_1 + gx_i_1_j_1) * gy_0 * gz_0;
+                d2eri_dAxdBy += (gx_i_minus_1_j_0 + gx_i_1_j_0) * (gy_i_0_j_minus_1 + gy_i_0_j_1) * gz_0;
+                d2eri_dAxdBz += (gx_i_minus_1_j_0 + gx_i_1_j_0) * gy_0 * (gz_i_0_j_minus_1 + gz_i_0_j_1);
+                d2eri_dAydBx += (gx_i_0_j_minus_1 + gx_i_0_j_1) * (gy_i_minus_1_j_0 + gy_i_1_j_0) * gz_0;
+                d2eri_dAydBy += gx_0 * (gy_i_minus_1_j_minus_1 + gy_i_minus_1_j_1 + gy_i_1_j_minus_1 + gy_i_1_j_1) * gz_0;
+                d2eri_dAydBz += gx_0 * (gy_i_minus_1_j_0 + gy_i_1_j_0) * (gz_i_0_j_minus_1 + gz_i_0_j_1);
+                d2eri_dAzdBx += (gx_i_0_j_minus_1 + gx_i_0_j_1) * gy_0 * (gz_i_minus_1_j_0 + gz_i_1_j_0);
+                d2eri_dAzdBy += gx_0 * (gy_i_0_j_minus_1 + gy_i_0_j_1) * (gz_i_minus_1_j_0 + gz_i_1_j_0);
+                d2eri_dAzdBz += gx_0 * gy_0 * (gz_i_minus_1_j_minus_1 + gz_i_minus_1_j_1 + gz_i_1_j_minus_1 + gz_i_1_j_1);
+            }
+            local_output[i + j * n_density_elements_i + 0 * n_density_elements_ij] += d2eri_dAxdBx * prefactor;
+            local_output[i + j * n_density_elements_i + 1 * n_density_elements_ij] += d2eri_dAxdBy * prefactor;
+            local_output[i + j * n_density_elements_i + 2 * n_density_elements_ij] += d2eri_dAxdBz * prefactor;
+            local_output[i + j * n_density_elements_i + 3 * n_density_elements_ij] += d2eri_dAydBx * prefactor;
+            local_output[i + j * n_density_elements_i + 4 * n_density_elements_ij] += d2eri_dAydBy * prefactor;
+            local_output[i + j * n_density_elements_i + 5 * n_density_elements_ij] += d2eri_dAydBz * prefactor;
+            local_output[i + j * n_density_elements_i + 6 * n_density_elements_ij] += d2eri_dAzdBx * prefactor;
+            local_output[i + j * n_density_elements_i + 7 * n_density_elements_ij] += d2eri_dAzdBy * prefactor;
+            local_output[i + j * n_density_elements_i + 8 * n_density_elements_ij] += d2eri_dAzdBz * prefactor;
+        }
+    }
+}
+
+template <int NROOTS, int GSIZE_INT3C_1E>
+__global__
+static void GINTfill_int3c1e_ipvip1_charge_contracted_kernel_general(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij,
+                                                                     const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j,
+                                                                     const double omega, const double* grid_points, const double* charge_exponents)
+{
+    const int ntasks_ij = offsets.ntasks_ij;
+    const int ngrids = offsets.ntasks_kl;
+    const int task_ij = blockIdx.x * blockDim.x + threadIdx.x;
+    if (task_ij >= ntasks_ij) {
+        return;
+    }
+
+    const int bas_ij = offsets.bas_ij + task_ij;
+    const int prim_ij = offsets.primitive_ij + task_ij * nprim_ij;
+    const int* bas_pair2bra = c_bpcache.bas_pair2bra;
+    const int* bas_pair2ket = c_bpcache.bas_pair2ket;
+    const int ish = bas_pair2bra[bas_ij];
+    const int jsh = bas_pair2ket[bas_ij];
+    const double* __restrict__ a_exponents = c_bpcache.a1;
+    const double* __restrict__ b_exponents = c_bpcache.a2;
+
+    constexpr int l_sum_max = (NROOTS - 1) * 2 + 1;
+    constexpr int l_i_max_density_elements = (l_sum_max + 1) / 2;
+    constexpr int l_j_max_density_elements = l_sum_max - l_i_max_density_elements;
+    double output_cache[(l_i_max_density_elements + 1) * (l_i_max_density_elements + 2) / 2
+                        * (l_j_max_density_elements + 1) * (l_j_max_density_elements + 2) / 2
+                        * 9] { 0.0 };
+
+    for (int task_grid = blockIdx.y * blockDim.y + threadIdx.y; task_grid < ngrids; task_grid += gridDim.y * blockDim.y) {
+        const double* grid_point = grid_points + task_grid * 4;
+        const double charge = grid_point[3];
+        const double charge_exponent = (charge_exponents != NULL) ? charge_exponents[task_grid] : 0.0;
+
+        double g[GSIZE_INT3C_1E];
+
+        for (int ij = prim_ij; ij < prim_ij+nprim_ij; ++ij) {
+            GINT_g1e<NROOTS>(g, grid_point, ish, jsh, ij, i_l + 1, j_l + 1, charge_exponent, omega);
+            const double minus_two_a = -2.0 * a_exponents[ij];
+            const double minus_two_b = -2.0 * b_exponents[ij];
+            GINTwrite_int3c1e_ipvip1_charge_contracted<NROOTS>(g, output_cache, minus_two_a, minus_two_b, charge, i_l, j_l);
+        }
+    }
+
+    const int* ao_loc = c_bpcache.ao_loc;
+
+    const int i0 = ao_loc[ish] - ao_offsets_i;
+    const int j0 = ao_loc[jsh] - ao_offsets_j;
+    const int n_density_elements_i = (i_l + 1) * (i_l + 2) / 2;
+    const int n_density_elements_j = (j_l + 1) * (j_l + 2) / 2;
+    const int n_density_elements_ij = n_density_elements_i * n_density_elements_j;
+    for (int j = 0; j < n_density_elements_j; j++) {
+        for (int i = 0; i < n_density_elements_i; i++) {
+            const double d2eri_dAxdBx = output_cache[i + j * n_density_elements_i + 0 * n_density_elements_ij];
+            const double d2eri_dAxdBy = output_cache[i + j * n_density_elements_i + 1 * n_density_elements_ij];
+            const double d2eri_dAxdBz = output_cache[i + j * n_density_elements_i + 2 * n_density_elements_ij];
+            const double d2eri_dAydBx = output_cache[i + j * n_density_elements_i + 3 * n_density_elements_ij];
+            const double d2eri_dAydBy = output_cache[i + j * n_density_elements_i + 4 * n_density_elements_ij];
+            const double d2eri_dAydBz = output_cache[i + j * n_density_elements_i + 5 * n_density_elements_ij];
+            const double d2eri_dAzdBx = output_cache[i + j * n_density_elements_i + 6 * n_density_elements_ij];
+            const double d2eri_dAzdBy = output_cache[i + j * n_density_elements_i + 7 * n_density_elements_ij];
+            const double d2eri_dAzdBz = output_cache[i + j * n_density_elements_i + 8 * n_density_elements_ij];
+            atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 0 * stride_ij), d2eri_dAxdBx);
+            atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 1 * stride_ij), d2eri_dAxdBy);
+            atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 2 * stride_ij), d2eri_dAxdBz);
+            atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 3 * stride_ij), d2eri_dAydBx);
+            atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 4 * stride_ij), d2eri_dAydBy);
+            atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 5 * stride_ij), d2eri_dAydBz);
+            atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 6 * stride_ij), d2eri_dAzdBx);
+            atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 7 * stride_ij), d2eri_dAzdBy);
+            atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 8 * stride_ij), d2eri_dAzdBz);
+        }
+    }
+}
+
+template <int NROOTS>
+__device__
+static void GINTwrite_int3c1e_ip1ip2_charge_contracted(const double* g, double* local_output, const double minus_two_a, const double* u2, const double* AC, const double prefactor, const int i_l, const int j_l)
+{
+    const int *idx = c_idx;
+    const int *idy = c_idx + TOT_NF;
+    const int *idz = c_idx + TOT_NF * 2;
+
+    const int g_size = NROOTS * (i_l + 2 + 1) * (j_l + 1);
+    const double* __restrict__ gx = g;
+    const double* __restrict__ gy = g + g_size;
+    const double* __restrict__ gz = g + g_size * 2;
+
+    const double ACx = AC[0];
+    const double ACy = AC[1];
+    const double ACz = AC[2];
+
+    const int n_density_elements_i = (i_l + 1) * (i_l + 2) / 2;
+    const int n_density_elements_j = (j_l + 1) * (j_l + 2) / 2;
+    const int n_density_elements_ij = n_density_elements_i * n_density_elements_j;
+    for (int j = 0; j < n_density_elements_j; j++) {
+        for (int i = 0; i < n_density_elements_i; i++) {
+            const int loc_j = c_l_locs[j_l] + j;
+            const int loc_i = 
c_l_locs[i_l] + i; + const int ix = idx[loc_i]; + const int iy = idy[loc_i]; + const int iz = idz[loc_i]; + const int jx = idx[loc_j]; + const int jy = idy[loc_j]; + const int jz = idz[loc_j]; + const int gx_offset = ix + jx * (i_l + 2 + 1); + const int gy_offset = iy + jy * (i_l + 2 + 1); + const int gz_offset = iz + jz * (i_l + 2 + 1); + + double d2eri_dAxdCx = 0; + double d2eri_dAxdCy = 0; + double d2eri_dAxdCz = 0; + double d2eri_dAydCx = 0; + double d2eri_dAydCy = 0; + double d2eri_dAydCz = 0; + double d2eri_dAzdCx = 0; + double d2eri_dAzdCy = 0; + double d2eri_dAzdCz = 0; +#pragma unroll + for (int i_root = 0; i_root < NROOTS; i_root++) { + const double gx_minus_1 = (ix >= 1 ? gx[(gx_offset - 1) * NROOTS + i_root] : 0); + const double gy_minus_1 = (iy >= 1 ? gy[(gy_offset - 1) * NROOTS + i_root] : 0); + const double gz_minus_1 = (iz >= 1 ? gz[(gz_offset - 1) * NROOTS + i_root] : 0); + const double gx_0 = gx[gx_offset * NROOTS + i_root]; + const double gy_0 = gy[gy_offset * NROOTS + i_root]; + const double gz_0 = gz[gz_offset * NROOTS + i_root]; + const double gx_1 = gx[(gx_offset + 1) * NROOTS + i_root]; + const double gy_1 = gy[(gy_offset + 1) * NROOTS + i_root]; + const double gz_1 = gz[(gz_offset + 1) * NROOTS + i_root]; + const double gx_2 = gx[(gx_offset + 2) * NROOTS + i_root]; + const double gy_2 = gy[(gy_offset + 2) * NROOTS + i_root]; + const double gz_2 = gz[(gz_offset + 2) * NROOTS + i_root]; + + const double two_u2 = 2.0 * u2[i_root]; + const double dgx_dAx = ix * gx_minus_1 + minus_two_a * gx_1; + const double dgy_dAy = iy * gy_minus_1 + minus_two_a * gy_1; + const double dgz_dAz = iz * gz_minus_1 + minus_two_a * gz_1; + const double dgx_dCx = two_u2 * (ACx * gx_0 + gx_1); + const double dgy_dCy = two_u2 * (ACy * gy_0 + gy_1); + const double dgz_dCz = two_u2 * (ACz * gz_0 + gz_1); + const double d2gx_dAxdCx = two_u2 * (ix * ACx * gx_minus_1 + ix * gx_0 + minus_two_a * ACx * gx_1 + minus_two_a * gx_2); + const double d2gy_dAydCy = two_u2 * (iy * ACy * gy_minus_1 + iy * gy_0 + minus_two_a * ACy * gy_1 + minus_two_a * gy_2); + const double d2gz_dAzdCz = two_u2 * (iz * ACz * gz_minus_1 + iz * gz_0 + minus_two_a * ACz * gz_1 + minus_two_a * gz_2); + + d2eri_dAxdCx += - d2gx_dAxdCx * gy_0 * gz_0; + d2eri_dAxdCy += - dgx_dAx * dgy_dCy * gz_0; + d2eri_dAxdCz += - dgx_dAx * gy_0 * dgz_dCz; + d2eri_dAydCx += - dgx_dCx * dgy_dAy * gz_0; + d2eri_dAydCy += - gx_0 * d2gy_dAydCy * gz_0; + d2eri_dAydCz += - gx_0 * dgy_dAy * dgz_dCz; + d2eri_dAzdCx += - dgx_dCx * gy_0 * dgz_dAz; + d2eri_dAzdCy += - gx_0 * dgy_dCy * dgz_dAz; + d2eri_dAzdCz += - gx_0 * gy_0 * d2gz_dAzdCz; + } + local_output[i + j * n_density_elements_i + 0 * n_density_elements_ij] += d2eri_dAxdCx * prefactor; + local_output[i + j * n_density_elements_i + 1 * n_density_elements_ij] += d2eri_dAxdCy * prefactor; + local_output[i + j * n_density_elements_i + 2 * n_density_elements_ij] += d2eri_dAxdCz * prefactor; + local_output[i + j * n_density_elements_i + 3 * n_density_elements_ij] += d2eri_dAydCx * prefactor; + local_output[i + j * n_density_elements_i + 4 * n_density_elements_ij] += d2eri_dAydCy * prefactor; + local_output[i + j * n_density_elements_i + 5 * n_density_elements_ij] += d2eri_dAydCz * prefactor; + local_output[i + j * n_density_elements_i + 6 * n_density_elements_ij] += d2eri_dAzdCx * prefactor; + local_output[i + j * n_density_elements_i + 7 * n_density_elements_ij] += d2eri_dAzdCy * prefactor; + local_output[i + j * n_density_elements_i + 8 * n_density_elements_ij] += d2eri_dAzdCz * prefactor; + } + } +} + 
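+/*
+ * NOTE: the writer functions above all follow from two 1D derivative rules for
+ * the Rys intermediates g_x (and likewise g_y, g_z), restated here for reference:
+ *     d g_x(i, j) / dA_x = i * g_x(i-1, j) - 2 * a * g_x(i+1, j)
+ *     d g_x(i, j) / dC_x = 2 * u^2 * ( (A_x - C_x) * g_x(i, j) + g_x(i+1, j) )
+ * where a is the bra exponent and u2 holds the root-dependent factor saved by
+ * GINT_g1e_save_u2. Applying these rules twice yields the second-derivative
+ * expressions accumulated above; the overall minus sign on the d2eri_dA*dC*
+ * terms matches the sign convention expected by the Python wrapper.
+ */
+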
+template <int NROOTS, int GSIZE_INT3C_1E>
+__global__
+static void GINTfill_int3c1e_ip1ip2_charge_contracted_kernel_general(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij,
+                                                                     const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j,
+                                                                     const double omega, const double* grid_points, const double* charge_exponents)
+{
+    const int ntasks_ij = offsets.ntasks_ij;
+    const int ngrids = offsets.ntasks_kl;
+    const int task_ij = blockIdx.x * blockDim.x + threadIdx.x;
+    if (task_ij >= ntasks_ij) {
+        return;
+    }
+
+    const int bas_ij = offsets.bas_ij + task_ij;
+    const int prim_ij = offsets.primitive_ij + task_ij * nprim_ij;
+    const int* bas_pair2bra = c_bpcache.bas_pair2bra;
+    const int* bas_pair2ket = c_bpcache.bas_pair2ket;
+    const int ish = bas_pair2bra[bas_ij];
+    const int jsh = bas_pair2ket[bas_ij];
+    const double* __restrict__ a_exponents = c_bpcache.a1;
+
+    const int nbas = c_bpcache.nbas;
+    const double* __restrict__ bas_x = c_bpcache.bas_coords;
+    const double* __restrict__ bas_y = bas_x + nbas;
+    const double* __restrict__ bas_z = bas_y + nbas;
+    const double Ax = bas_x[ish];
+    const double Ay = bas_y[ish];
+    const double Az = bas_z[ish];
+
+    constexpr int l_sum_max = (NROOTS - 1) * 2 + 1;
+    constexpr int l_i_max_density_elements = (l_sum_max + 1) / 2;
+    constexpr int l_j_max_density_elements = l_sum_max - l_i_max_density_elements;
+    double output_cache[(l_i_max_density_elements + 1) * (l_i_max_density_elements + 2) / 2
+                        * (l_j_max_density_elements + 1) * (l_j_max_density_elements + 2) / 2
+                        * 9] { 0.0 };
+
+    for (int task_grid = blockIdx.y * blockDim.y + threadIdx.y; task_grid < ngrids; task_grid += gridDim.y * blockDim.y) {
+        const double* grid_point = grid_points + task_grid * 4;
+        const double Cx = grid_point[0];
+        const double Cy = grid_point[1];
+        const double Cz = grid_point[2];
+        const double charge = grid_point[3];
+        const double charge_exponent = (charge_exponents != NULL) ? charge_exponents[task_grid] : 0.0;
+
+        const double AC[3] { Ax - Cx, Ay - Cy, Az - Cz };
+
+        double g[GSIZE_INT3C_1E];
+        double u2[NROOTS];
+
+        for (int ij = prim_ij; ij < prim_ij+nprim_ij; ++ij) {
+            GINT_g1e_save_u2<NROOTS>(g, u2, grid_point, ish, jsh, ij, i_l + 2, j_l, charge_exponent, omega);
+            const double minus_two_a = -2.0 * a_exponents[ij];
+            GINTwrite_int3c1e_ip1ip2_charge_contracted<NROOTS>(g, output_cache, minus_two_a, u2, AC, charge, i_l, j_l);
+        }
+    }
+
+    const int* ao_loc = c_bpcache.ao_loc;
+
+    const int i0 = ao_loc[ish] - ao_offsets_i;
+    const int j0 = ao_loc[jsh] - ao_offsets_j;
+    const int n_density_elements_i = (i_l + 1) * (i_l + 2) / 2;
+    const int n_density_elements_j = (j_l + 1) * (j_l + 2) / 2;
+    const int n_density_elements_ij = n_density_elements_i * n_density_elements_j;
+    for (int j = 0; j < n_density_elements_j; j++) {
+        for (int i = 0; i < n_density_elements_i; i++) {
+            const double d2eri_dAxdCx = output_cache[i + j * n_density_elements_i + 0 * n_density_elements_ij];
+            const double d2eri_dAxdCy = output_cache[i + j * n_density_elements_i + 1 * n_density_elements_ij];
+            const double d2eri_dAxdCz = output_cache[i + j * n_density_elements_i + 2 * n_density_elements_ij];
+            const double d2eri_dAydCx = output_cache[i + j * n_density_elements_i + 3 * n_density_elements_ij];
+            const double d2eri_dAydCy = output_cache[i + j * n_density_elements_i + 4 * n_density_elements_ij];
+            const double d2eri_dAydCz = output_cache[i + j * n_density_elements_i + 5 * n_density_elements_ij];
+            const double d2eri_dAzdCx = output_cache[i + j * n_density_elements_i + 6 * n_density_elements_ij];
+            const double d2eri_dAzdCy = output_cache[i + j * n_density_elements_i + 7 * n_density_elements_ij];
+            const double d2eri_dAzdCz = output_cache[i + j * n_density_elements_i + 8 * n_density_elements_ij];
+            atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 0 * stride_ij), d2eri_dAxdCx);
+            atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 1 * stride_ij), d2eri_dAxdCy);
+            atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 2 * stride_ij), d2eri_dAxdCz);
+            atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 3 * stride_ij), d2eri_dAydCx);
+            atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 4 * stride_ij), d2eri_dAydCy);
+            atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 5 * stride_ij), d2eri_dAydCz);
+            atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 6 * stride_ij), d2eri_dAzdCx);
+            atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 7 * stride_ij), d2eri_dAzdCy);
+            atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 8 * stride_ij), d2eri_dAzdCz);
+        }
+    }
+}
+
+template <int L_SUM>
+__global__
+static void GINTfill_int3c1e_ipip2_density_contracted_kernel_general(double* output, const double* density, const HermiteDensityOffsets hermite_density_offsets,
+                                                                     const BasisProdOffsets offsets, const int nprim_ij,
+                                                                     const double omega, const double* grid_points, const double* charge_exponents)
+{
+    constexpr int NROOTS = (L_SUM + 2) / 2 + 1;
+
+    const int ntasks_ij = offsets.ntasks_ij;
+    const int ngrids = offsets.ntasks_kl;
+    const int task_grid = blockIdx.y * blockDim.y + threadIdx.y;
+    if (task_grid >= ngrids) {
+        return;
+    }
+
+    const double* grid_point = grid_points + task_grid * 3;
+    const double Cx = grid_point[0];
+    const double Cy = grid_point[1];
+    const double Cz = grid_point[2];
+    const double charge_exponent = (charge_exponents != NULL) ? charge_exponents[task_grid] : 0.0;
+
+    double d2eri_dCxdCx_pair_sum = 0.0;
+    double d2eri_dCxdCy_pair_sum = 0.0;
+    double d2eri_dCxdCz_pair_sum = 0.0;
+    double d2eri_dCydCy_pair_sum = 0.0;
+    double d2eri_dCydCz_pair_sum = 0.0;
+    double d2eri_dCzdCz_pair_sum = 0.0;
+    for (int task_ij = blockIdx.x * blockDim.x + threadIdx.x; task_ij < ntasks_ij; task_ij += gridDim.x * blockDim.x) {
+
+        const int bas_ij = offsets.bas_ij + task_ij;
+        const int prim_ij = offsets.primitive_ij + task_ij * nprim_ij;
+        const int* bas_pair2bra = c_bpcache.bas_pair2bra;
+        // const int* bas_pair2ket = c_bpcache.bas_pair2ket;
+        const int ish = bas_pair2bra[bas_ij];
+        // const int jsh = bas_pair2ket[bas_ij];
+        const int nbas = c_bpcache.nbas;
+        const double* __restrict__ bas_x = c_bpcache.bas_coords;
+        const double* __restrict__ bas_y = bas_x + nbas;
+        const double* __restrict__ bas_z = bas_y + nbas;
+        const double Ax = bas_x[ish];
+        const double Ay = bas_y[ish];
+        const double Az = bas_z[ish];
+
+        const double ACx = Ax - Cx;
+        const double ACy = Ay - Cy;
+        const double ACz = Az - Cz;
+
+        double D_hermite[(L_SUM + 1) * (L_SUM + 2) * (L_SUM + 3) / 6];
+#pragma unroll
+        for (int i_t = 0; i_t < (L_SUM + 1) * (L_SUM + 2) * (L_SUM + 3) / 6; i_t++) {
+            D_hermite[i_t] = density[bas_ij - hermite_density_offsets.pair_offset_of_angular_pair + hermite_density_offsets.density_offset_of_angular_pair + i_t * hermite_density_offsets.n_pair_of_angular_pair];
+        }
+
+        double d2eri_dCxdCx = 0.0;
+        double d2eri_dCxdCy = 0.0;
+        double d2eri_dCxdCz = 0.0;
+        double d2eri_dCydCy = 0.0;
+        double d2eri_dCydCz = 0.0;
+        double d2eri_dCzdCz = 0.0;
+        for (int ij = prim_ij; ij < prim_ij+nprim_ij; ++ij) {
+            double g[NROOTS * (L_SUM + 2 + 1) * 3];
+            double u2[NROOTS];
+            GINT_g1e_without_hrr_save_u2<NROOTS>(g, u2, Cx, Cy, Cz, ish, ij, charge_exponent, omega);
+
+            const double* __restrict__ gx = g;
+            const double* __restrict__ gy = g + NROOTS * (L_SUM + 2 + 1);
+            const double* __restrict__ gz = g + NROOTS * (L_SUM + 2 + 1) * 2;
+
+#pragma unroll
+            for (int i_x = 0, i_t = 0; i_x <= L_SUM; i_x++) {
+#pragma unroll
+                for (int i_y = 0; i_x + i_y <= L_SUM; i_y++) {
+#pragma unroll
+                    for (int i_z = 0; i_x + i_y + i_z <= L_SUM; i_z++, i_t++) {
+                        double d2eri_dCxdCx_per_hermite = 0.0;
+                        double d2eri_dCxdCy_per_hermite = 0.0;
+                        double d2eri_dCxdCz_per_hermite = 0.0;
+                        double d2eri_dCydCy_per_hermite = 0.0;
+                        double d2eri_dCydCz_per_hermite = 0.0;
+                        double d2eri_dCzdCz_per_hermite = 0.0;
+#pragma unroll
+                        for (int i_root = 0; i_root < NROOTS; i_root++) {
+                            const double gx_0 = gx[i_root + NROOTS * i_x];
+                            const double gy_0 = gy[i_root + NROOTS * i_y];
+                            const double gz_0 = gz[i_root + NROOTS * i_z];
+                            const double gx_1 = gx[i_root + NROOTS * (i_x + 1)];
+                            const double gy_1 = gy[i_root + NROOTS * (i_y + 1)];
+                            const double gz_1 = gz[i_root + NROOTS * (i_z + 1)];
+                            const double gx_2 = gx[i_root + NROOTS * (i_x + 2)];
+                            const double gy_2 = gy[i_root + NROOTS * (i_y + 2)];
+                            const double gz_2 = gz[i_root + NROOTS * (i_z + 2)];
+                            const double two_u2 = 2.0 * u2[i_root];
+                            const double dgx_dCx = two_u2 * (gx_1 + ACx * gx_0);
+                            const double dgy_dCy = two_u2 * (gy_1 + ACy * gy_0);
+                            const double dgz_dCz = two_u2 * (gz_1 + ACz * gz_0);
+                            const double d2gx_dCx2 = two_u2 * (-gx_0 + two_u2 * (gx_2 + ACx * gx_1 * 2 + ACx * ACx * gx_0));
+                            const double d2gy_dCy2 = two_u2 * (-gy_0 + two_u2 * (gy_2 + ACy * gy_1 * 2 + ACy * ACy * gy_0));
+                            const double d2gz_dCz2 = two_u2 * (-gz_0 + two_u2 * (gz_2 + ACz * gz_1 * 2 + ACz * ACz * gz_0));
+                            d2eri_dCxdCx_per_hermite += d2gx_dCx2 * gy_0 * gz_0;
+                            d2eri_dCxdCy_per_hermite += dgx_dCx * dgy_dCy * gz_0;
+                            d2eri_dCxdCz_per_hermite += dgx_dCx * gy_0 * dgz_dCz;
+                            d2eri_dCydCy_per_hermite += gx_0 * d2gy_dCy2 * gz_0;
+                            d2eri_dCydCz_per_hermite += gx_0 * dgy_dCy * dgz_dCz;
+                            d2eri_dCzdCz_per_hermite += gx_0 * gy_0 * d2gz_dCz2;
+                        }
+                        const double D_t = D_hermite[i_t];
+                        d2eri_dCxdCx += d2eri_dCxdCx_per_hermite * D_t;
+                        d2eri_dCxdCy += d2eri_dCxdCy_per_hermite * D_t;
+                        d2eri_dCxdCz += d2eri_dCxdCz_per_hermite * D_t;
+                        d2eri_dCydCy += d2eri_dCydCy_per_hermite * D_t;
+                        d2eri_dCydCz += d2eri_dCydCz_per_hermite * D_t;
+                        d2eri_dCzdCz += d2eri_dCzdCz_per_hermite * D_t;
+                    }
+                }
+            }
+        }
+        d2eri_dCxdCx_pair_sum += d2eri_dCxdCx;
+        d2eri_dCxdCy_pair_sum += d2eri_dCxdCy;
+        d2eri_dCxdCz_pair_sum += d2eri_dCxdCz;
+        d2eri_dCydCy_pair_sum += d2eri_dCydCy;
+        d2eri_dCydCz_pair_sum += d2eri_dCydCz;
+        d2eri_dCzdCz_pair_sum += d2eri_dCzdCz;
+    }
+    atomicAdd(output + task_grid + ngrids * 0, d2eri_dCxdCx_pair_sum);
+    atomicAdd(output + task_grid + ngrids * 1, d2eri_dCxdCy_pair_sum);
+    atomicAdd(output + task_grid + ngrids * 2, d2eri_dCxdCz_pair_sum);
+    atomicAdd(output + task_grid + ngrids * 4, d2eri_dCydCy_pair_sum);
+    atomicAdd(output + task_grid + ngrids * 5, d2eri_dCydCz_pair_sum);
+    atomicAdd(output + task_grid + ngrids * 8, d2eri_dCzdCz_pair_sum);
+}
diff --git a/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ipip.cu b/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ipip.cu
new file mode 100644
index 00000000..4f3a3dee
--- /dev/null
+++ b/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ipip.cu
@@ -0,0 +1,361 @@
+/*
+ * Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include +#include +#include +#include +#include + +#include "gint.h" +#include "gint1e.h" +#include "cuda_alloc.cuh" +#include "cint2e.cuh" + +#include "rys_roots.cu" +#include "g1e.cu" +#include "g3c1e_ipip.cu" + +static int GINTfill_int3c1e_ipip1_charge_contracted_tasks(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij, + const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j, + const double omega, const double* grid_points, const double* charge_exponents, + const int n_charge_sum_per_thread, const cudaStream_t stream) +{ + const int ntasks_ij = offsets.ntasks_ij; + const int ngrids = (offsets.ntasks_kl + n_charge_sum_per_thread - 1) / n_charge_sum_per_thread; + + const dim3 threads(THREADSX, THREADSY); + const dim3 blocks((ntasks_ij+THREADSX-1)/THREADSX, (ngrids+THREADSY-1)/THREADSY); + const int nrys_roots = (i_l + j_l + 2) / 2 + 1; + switch (nrys_roots) { + case 2: GINTfill_int3c1e_ipip1_charge_contracted_kernel_general<2, GSIZE5_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 3: GINTfill_int3c1e_ipip1_charge_contracted_kernel_general<3, GSIZE6_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 4: GINTfill_int3c1e_ipip1_charge_contracted_kernel_general<4, GSIZE4_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 5: GINTfill_int3c1e_ipip1_charge_contracted_kernel_general<5, GSIZE5_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 6: GINTfill_int3c1e_ipip1_charge_contracted_kernel_general<6, GSIZE6_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + default: + fprintf(stderr, "nrys_roots = %d out of range\n", nrys_roots); + return 1; + } + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA Error in %s: %s\n", __func__, cudaGetErrorString(err)); + return 1; + } + return 0; +} + +static int GINTfill_int3c1e_ipvip1_charge_contracted_tasks(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij, + const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j, + const double omega, const double* grid_points, const double* charge_exponents, + const int n_charge_sum_per_thread, const cudaStream_t stream) +{ + const int ntasks_ij = offsets.ntasks_ij; + const int ngrids = (offsets.ntasks_kl + n_charge_sum_per_thread - 1) / n_charge_sum_per_thread; + + const dim3 threads(THREADSX, THREADSY); + const dim3 blocks((ntasks_ij+THREADSX-1)/THREADSX, (ngrids+THREADSY-1)/THREADSY); + const int nrys_roots = (i_l + j_l + 2) / 2 + 1; + switch (nrys_roots) { + case 2: GINTfill_int3c1e_ipvip1_charge_contracted_kernel_general<2, GSIZE5_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 3: GINTfill_int3c1e_ipvip1_charge_contracted_kernel_general<3, GSIZE6_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 4: 
GINTfill_int3c1e_ipvip1_charge_contracted_kernel_general<4, GSIZE4_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 5: GINTfill_int3c1e_ipvip1_charge_contracted_kernel_general<5, GSIZE5_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 6: GINTfill_int3c1e_ipvip1_charge_contracted_kernel_general<6, GSIZE6_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + default: + fprintf(stderr, "nrys_roots = %d out of range\n", nrys_roots); + return 1; + } + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA Error in %s: %s\n", __func__, cudaGetErrorString(err)); + return 1; + } + return 0; +} + +static int GINTfill_int3c1e_ip1ip2_charge_contracted_tasks(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij, + const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j, + const double omega, const double* grid_points, const double* charge_exponents, + const int n_charge_sum_per_thread, const cudaStream_t stream) +{ + const int ntasks_ij = offsets.ntasks_ij; + const int ngrids = (offsets.ntasks_kl + n_charge_sum_per_thread - 1) / n_charge_sum_per_thread; + + const dim3 threads(THREADSX, THREADSY); + const dim3 blocks((ntasks_ij+THREADSX-1)/THREADSX, (ngrids+THREADSY-1)/THREADSY); + const int nrys_roots = (i_l + j_l + 2) / 2 + 1; + switch (nrys_roots) { + case 2: GINTfill_int3c1e_ip1ip2_charge_contracted_kernel_general<2, GSIZE5_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 3: GINTfill_int3c1e_ip1ip2_charge_contracted_kernel_general<3, GSIZE6_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 4: GINTfill_int3c1e_ip1ip2_charge_contracted_kernel_general<4, GSIZE4_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 5: GINTfill_int3c1e_ip1ip2_charge_contracted_kernel_general<5, GSIZE5_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 6: GINTfill_int3c1e_ip1ip2_charge_contracted_kernel_general<6, GSIZE6_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + default: + fprintf(stderr, "nrys_roots = %d out of range\n", nrys_roots); + return 1; + } + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA Error in %s: %s\n", __func__, cudaGetErrorString(err)); + return 1; + } + return 0; +} + +static int GINTfill_int3c1e_ipip2_density_contracted_tasks(double* output, const double* density, const HermiteDensityOffsets hermite_density_offsets, + const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij, + const double omega, const double* grid_points, const double* charge_exponents, + const int n_pair_sum_per_thread, const cudaStream_t stream) +{ + const int ntasks_ij = (offsets.ntasks_ij + n_pair_sum_per_thread - 1) / n_pair_sum_per_thread; + 
+static int GINTfill_int3c1e_ipip2_density_contracted_tasks(double* output, const double* density, const HermiteDensityOffsets hermite_density_offsets,
+                                                           const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij,
+                                                           const double omega, const double* grid_points, const double* charge_exponents,
+                                                           const int n_pair_sum_per_thread, const cudaStream_t stream)
+{
+    const int ntasks_ij = (offsets.ntasks_ij + n_pair_sum_per_thread - 1) / n_pair_sum_per_thread;
+    const int ngrids = offsets.ntasks_kl;
+
+    const dim3 threads(THREADSX, THREADSY);
+    const dim3 blocks((ntasks_ij+THREADSX-1)/THREADSX, (ngrids+THREADSY-1)/THREADSY);
+    switch (i_l + j_l) {
+        case 0: GINTfill_int3c1e_ipip2_density_contracted_kernel_general< 0> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break;
+        case 1: GINTfill_int3c1e_ipip2_density_contracted_kernel_general< 1> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break;
+        case 2: GINTfill_int3c1e_ipip2_density_contracted_kernel_general< 2> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break;
+        case 3: GINTfill_int3c1e_ipip2_density_contracted_kernel_general< 3> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break;
+        case 4: GINTfill_int3c1e_ipip2_density_contracted_kernel_general< 4> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break;
+        case 5: GINTfill_int3c1e_ipip2_density_contracted_kernel_general< 5> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break;
+        case 6: GINTfill_int3c1e_ipip2_density_contracted_kernel_general< 6> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break;
+        case 7: GINTfill_int3c1e_ipip2_density_contracted_kernel_general< 7> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break;
+        case 8: GINTfill_int3c1e_ipip2_density_contracted_kernel_general< 8> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break;
+        // Up to g + g = 8 now
+        default:
+            fprintf(stderr, "i_l + j_l = %d out of range\n", i_l + j_l);
+            return 1;
+    }
+
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        fprintf(stderr, "CUDA Error in %s: %s\n", __func__, cudaGetErrorString(err));
+        return 1;
+    }
+    return 0;
+}
+
+extern "C" {
+int GINTfill_int3c1e_ipip1_charge_contracted(const cudaStream_t stream, const BasisProdCache* bpcache,
+                                             const double* grid_points, const double* charge_exponents, const int ngrids,
+                                             double* integral_charge_contracted,
+                                             const int* strides, const int* ao_offsets,
+                                             const int* bins_locs_ij, const int nbins,
+                                             const int cp_ij_id, const double omega, const int n_charge_sum_per_thread)
+{
+    const ContractionProdType *cp_ij = bpcache->cptype + cp_ij_id;
+    const int i_l = cp_ij->l_bra;
+    const int j_l = cp_ij->l_ket;
+    const int nrys_roots = (i_l + j_l + 2) / 2 + 1;
+    const int nprim_ij = cp_ij->nprim_12;
+
+    if (nrys_roots > MAX_NROOTS_INT3C_1E + 2) {
+        fprintf(stderr, "nrys_roots = %d too high\n", nrys_roots);
+        return 2;
+    }
+
+    checkCudaErrors(cudaMemcpyToSymbol(c_bpcache, bpcache, sizeof(BasisProdCache)));
+
+    const int* bas_pairs_locs = bpcache->bas_pairs_locs;
+    const int* primitive_pairs_locs = bpcache->primitive_pairs_locs;
+    for (int ij_bin = 0; ij_bin < nbins; ij_bin++) {
+        const int bas_ij0 = bins_locs_ij[ij_bin];
+        const int bas_ij1 = bins_locs_ij[ij_bin + 1];
+        const int ntasks_ij = bas_ij1 - bas_ij0;
+        if (ntasks_ij <= 0) {
+            continue;
+        }
+
+        BasisProdOffsets offsets;
+        offsets.ntasks_ij = ntasks_ij;
+        offsets.ntasks_kl = ngrids;
+        offsets.bas_ij = bas_pairs_locs[cp_ij_id] + bas_ij0;
+        offsets.bas_kl = -1;
+        offsets.primitive_ij = primitive_pairs_locs[cp_ij_id] + bas_ij0
* nprim_ij; + offsets.primitive_kl = -1; + + const int err = GINTfill_int3c1e_ipip1_charge_contracted_tasks(integral_charge_contracted, offsets, i_l, j_l, nprim_ij, + strides[0], strides[1], ao_offsets[0], ao_offsets[1], + omega, grid_points, charge_exponents, n_charge_sum_per_thread, stream); + + if (err != 0) { + return err; + } + } + + return 0; +} + +int GINTfill_int3c1e_ipvip1_charge_contracted(const cudaStream_t stream, const BasisProdCache* bpcache, + const double* grid_points, const double* charge_exponents, const int ngrids, + double* integral_charge_contracted, + const int* strides, const int* ao_offsets, + const int* bins_locs_ij, const int nbins, + const int cp_ij_id, const double omega, const int n_charge_sum_per_thread) +{ + const ContractionProdType *cp_ij = bpcache->cptype + cp_ij_id; + const int i_l = cp_ij->l_bra; + const int j_l = cp_ij->l_ket; + const int nrys_roots = (i_l + j_l + 2) / 2 + 1; + const int nprim_ij = cp_ij->nprim_12; + + if (nrys_roots > MAX_NROOTS_INT3C_1E + 2) { + fprintf(stderr, "nrys_roots = %d too high\n", nrys_roots); + return 2; + } + + checkCudaErrors(cudaMemcpyToSymbol(c_bpcache, bpcache, sizeof(BasisProdCache))); + + const int* bas_pairs_locs = bpcache->bas_pairs_locs; + const int* primitive_pairs_locs = bpcache->primitive_pairs_locs; + for (int ij_bin = 0; ij_bin < nbins; ij_bin++) { + const int bas_ij0 = bins_locs_ij[ij_bin]; + const int bas_ij1 = bins_locs_ij[ij_bin + 1]; + const int ntasks_ij = bas_ij1 - bas_ij0; + if (ntasks_ij <= 0) { + continue; + } + + BasisProdOffsets offsets; + offsets.ntasks_ij = ntasks_ij; + offsets.ntasks_kl = ngrids; + offsets.bas_ij = bas_pairs_locs[cp_ij_id] + bas_ij0; + offsets.bas_kl = -1; + offsets.primitive_ij = primitive_pairs_locs[cp_ij_id] + bas_ij0 * nprim_ij; + offsets.primitive_kl = -1; + + const int err = GINTfill_int3c1e_ipvip1_charge_contracted_tasks(integral_charge_contracted, offsets, i_l, j_l, nprim_ij, + strides[0], strides[1], ao_offsets[0], ao_offsets[1], + omega, grid_points, charge_exponents, n_charge_sum_per_thread, stream); + + if (err != 0) { + return err; + } + } + + return 0; +} + +int GINTfill_int3c1e_ip1ip2_charge_contracted(const cudaStream_t stream, const BasisProdCache* bpcache, + const double* grid_points, const double* charge_exponents, const int ngrids, + double* integral_charge_contracted, + const int* strides, const int* ao_offsets, + const int* bins_locs_ij, const int nbins, + const int cp_ij_id, const double omega, const int n_charge_sum_per_thread) +{ + const ContractionProdType *cp_ij = bpcache->cptype + cp_ij_id; + const int i_l = cp_ij->l_bra; + const int j_l = cp_ij->l_ket; + const int nrys_roots = (i_l + j_l + 2) / 2 + 1; + const int nprim_ij = cp_ij->nprim_12; + + if (nrys_roots > MAX_NROOTS_INT3C_1E + 2) { + fprintf(stderr, "nrys_roots = %d too high\n", nrys_roots); + return 2; + } + + checkCudaErrors(cudaMemcpyToSymbol(c_bpcache, bpcache, sizeof(BasisProdCache))); + + const int* bas_pairs_locs = bpcache->bas_pairs_locs; + const int* primitive_pairs_locs = bpcache->primitive_pairs_locs; + for (int ij_bin = 0; ij_bin < nbins; ij_bin++) { + const int bas_ij0 = bins_locs_ij[ij_bin]; + const int bas_ij1 = bins_locs_ij[ij_bin + 1]; + const int ntasks_ij = bas_ij1 - bas_ij0; + if (ntasks_ij <= 0) { + continue; + } + + BasisProdOffsets offsets; + offsets.ntasks_ij = ntasks_ij; + offsets.ntasks_kl = ngrids; + offsets.bas_ij = bas_pairs_locs[cp_ij_id] + bas_ij0; + offsets.bas_kl = -1; + offsets.primitive_ij = primitive_pairs_locs[cp_ij_id] + bas_ij0 * nprim_ij; + 
offsets.primitive_kl = -1; + + const int err = GINTfill_int3c1e_ip1ip2_charge_contracted_tasks(integral_charge_contracted, offsets, i_l, j_l, nprim_ij, + strides[0], strides[1], ao_offsets[0], ao_offsets[1], + omega, grid_points, charge_exponents, n_charge_sum_per_thread, stream); + + if (err != 0) { + return err; + } + } + + return 0; +} + +int GINTfill_int3c1e_ipip2_density_contracted(const cudaStream_t stream, const BasisProdCache* bpcache, + const double* grid_points, const double* charge_exponents, const int ngrids, + const double* dm_pair_ordered, const int* density_offset, + double* integral_density_contracted, + const int* bins_locs_ij, const int nbins, + const int cp_ij_id, const double omega, const int n_pair_sum_per_thread) +{ + const ContractionProdType *cp_ij = bpcache->cptype + cp_ij_id; + const int i_l = cp_ij->l_bra; + const int j_l = cp_ij->l_ket; + const int nrys_roots = (i_l + j_l + 2) / 2 + 1; + const int nprim_ij = cp_ij->nprim_12; + + if (nrys_roots > MAX_NROOTS_INT3C_1E + 2) { + fprintf(stderr, "nrys_roots = %d too high\n", nrys_roots); + return 2; + } + + checkCudaErrors(cudaMemcpyToSymbol(c_bpcache, bpcache, sizeof(BasisProdCache))); + + const int* bas_pairs_locs = bpcache->bas_pairs_locs; + const int* primitive_pairs_locs = bpcache->primitive_pairs_locs; + for (int ij_bin = 0; ij_bin < nbins; ij_bin++) { + const int bas_ij0 = bins_locs_ij[ij_bin]; + const int bas_ij1 = bins_locs_ij[ij_bin + 1]; + const int ntasks_ij = bas_ij1 - bas_ij0; + if (ntasks_ij <= 0) { + continue; + } + + BasisProdOffsets offsets; + offsets.ntasks_ij = ntasks_ij; + offsets.ntasks_kl = ngrids; + offsets.bas_ij = bas_pairs_locs[cp_ij_id] + bas_ij0; + offsets.bas_kl = -1; + offsets.primitive_ij = primitive_pairs_locs[cp_ij_id] + bas_ij0 * nprim_ij; + offsets.primitive_kl = -1; + + HermiteDensityOffsets hermite_density_offsets; + hermite_density_offsets.density_offset_of_angular_pair = density_offset[cp_ij_id]; + hermite_density_offsets.pair_offset_of_angular_pair = bas_pairs_locs[cp_ij_id]; + hermite_density_offsets.n_pair_of_angular_pair = bas_pairs_locs[cp_ij_id + 1] - bas_pairs_locs[cp_ij_id]; + + const int err = GINTfill_int3c1e_ipip2_density_contracted_tasks(integral_density_contracted, dm_pair_ordered, hermite_density_offsets, + offsets, i_l, j_l, nprim_ij, + omega, grid_points, charge_exponents, n_pair_sum_per_thread, stream); + + if (err != 0) { + return err; + } + } + + return 0; +} +} diff --git a/gpu4pyscf/solvent/hessian/pcm.py b/gpu4pyscf/solvent/hessian/pcm.py index 93fa1a38..719f0e02 100644 --- a/gpu4pyscf/solvent/hessian/pcm.py +++ b/gpu4pyscf/solvent/hessian/pcm.py @@ -28,6 +28,7 @@ from gpu4pyscf.lib import logger from gpu4pyscf.hessian.jk import _ao2mo from gpu4pyscf.gto.int3c1e_ip import int1e_grids_ip1, int1e_grids_ip2 +from gpu4pyscf.gto.int3c1e_ipip import int1e_grids_ipip1, int1e_grids_ipvip1, int1e_grids_ipip2, int1e_grids_ip1ip2 from gpu4pyscf.gto import int3c1e from gpu4pyscf.gto.int3c1e import int1e_grids from pyscf import lib as pyscf_lib @@ -221,7 +222,6 @@ def hess_nuc(pcmobj, dm, verbose=None): return d2e def hess_qv(pcmobj, dm, verbose=None): - raise NotImplementedError("This implementation requires 9 * nao * nao * ngrids of GPU memory") if not pcmobj._intermediates: pcmobj.build() dm_cache = pcmobj._intermediates.get('dm', None) @@ -244,23 +244,28 @@ def hess_qv(pcmobj, dm, verbose=None): aoslice = mol.aoslice_by_atom() aoslice = numpy.array(aoslice) - fakemol = gto.fakemol_for_charges(grid_coords.get(), expnt=charge_exp.get()**2) - intopt = 
int3c2e.VHFOpt(mol, fakemol, 'int2e') - intopt.build(1e-14, diag_block_with_triu=True, aosym=False) + intopt_derivative = int3c1e.VHFOpt(mol) + intopt_derivative.build(cutoff = 1e-14, aosym = False) + + # fakemol = gto.fakemol_for_charges(grid_coords.get(), expnt=charge_exp.get()**2) + # intopt = int3c2e.VHFOpt(mol, fakemol, 'int2e') + # intopt.build(1e-14, diag_block_with_triu=True, aosym=False) d2e_from_d2I = cupy.zeros([mol.natm, mol.natm, 3, 3]) - d2I_dA2 = int3c2e.get_int3c2e_general(mol, fakemol, ip_type='ipip1', direct_scf_tol=1e-14) - d2I_dA2 = cupy.einsum('dijq,q->dij', d2I_dA2, q_sym) - d2I_dA2 = d2I_dA2.reshape([3, 3, nao, nao]) + # d2I_dA2 = int3c2e.get_int3c2e_general(mol, fakemol, ip_type='ipip1', direct_scf_tol=1e-14) + # d2I_dA2 = cupy.einsum('dijq,q->dij', d2I_dA2, q_sym) + # d2I_dA2 = d2I_dA2.reshape([3, 3, nao, nao]) + d2I_dA2 = int1e_grids_ipip1(mol, grid_coords, charges = q_sym, intopt = intopt_derivative, charge_exponents = charge_exp**2) for i_atom in range(mol.natm): p0,p1 = aoslice[i_atom, 2:] d2e_from_d2I[i_atom, i_atom, :, :] += cupy.einsum('ij,dDij->dD', dm[p0:p1, :], d2I_dA2[:, :, p0:p1, :]) d2e_from_d2I[i_atom, i_atom, :, :] += cupy.einsum('ij,dDij->dD', dm[:, p0:p1], d2I_dA2[:, :, p0:p1, :].transpose(0,1,3,2)) - d2I_dAdB = int3c2e.get_int3c2e_general(mol, fakemol, ip_type='ipvip1', direct_scf_tol=1e-14) - d2I_dAdB = cupy.einsum('dijq,q->dij', d2I_dAdB, q_sym) - d2I_dAdB = d2I_dAdB.reshape([3, 3, nao, nao]) + # d2I_dAdB = int3c2e.get_int3c2e_general(mol, fakemol, ip_type='ipvip1', direct_scf_tol=1e-14) + # d2I_dAdB = cupy.einsum('dijq,q->dij', d2I_dAdB, q_sym) + # d2I_dAdB = d2I_dAdB.reshape([3, 3, nao, nao]) + d2I_dAdB = int1e_grids_ipvip1(mol, grid_coords, charges = q_sym, intopt = intopt_derivative, charge_exponents = charge_exp**2) for i_atom in range(mol.natm): pi0,pi1 = aoslice[i_atom, 2:] for j_atom in range(mol.natm): @@ -268,28 +273,29 @@ def hess_qv(pcmobj, dm, verbose=None): d2e_from_d2I[i_atom, j_atom, :, :] += cupy.einsum('ij,dDij->dD', dm[pi0:pi1, pj0:pj1], d2I_dAdB[:, :, pi0:pi1, pj0:pj1]) d2e_from_d2I[i_atom, j_atom, :, :] += cupy.einsum('ij,dDij->dD', dm[pj0:pj1, pi0:pi1], d2I_dAdB[:, :, pi0:pi1, pj0:pj1].transpose(0,1,3,2)) - d2I_dAdC = int3c2e.get_int3c2e_general(mol, fakemol, ip_type='ip1ip2', direct_scf_tol=1e-14) - d2I_dAdC = d2I_dAdC.reshape([3, 3, nao, nao, ngrids]) - for i_atom in range(mol.natm): - p0,p1 = aoslice[i_atom, 2:] - for j_atom in range(mol.natm): - g0,g1 = gridslice[j_atom] - d2e_from_d2I[i_atom, j_atom, :, :] += cupy.einsum('ij,dDijq,q->dD', dm[p0:p1, :], d2I_dAdC[:, :, p0:p1, :, g0:g1], q_sym[g0:g1]) - d2e_from_d2I[i_atom, j_atom, :, :] += cupy.einsum('ij,dDijq,q->dD', dm[:, p0:p1], d2I_dAdC[:, :, p0:p1, :, g0:g1].transpose(0,1,3,2,4), q_sym[g0:g1]) - - d2e_from_d2I[j_atom, i_atom, :, :] += cupy.einsum('ij,dDijq,q->dD', dm[p0:p1, :], d2I_dAdC[:, :, p0:p1, :, g0:g1].transpose(1,0,2,3,4), q_sym[g0:g1]) - d2e_from_d2I[j_atom, i_atom, :, :] += cupy.einsum('ij,dDijq,q->dD', dm[:, p0:p1], d2I_dAdC[:, :, p0:p1, :, g0:g1].transpose(1,0,3,2,4), q_sym[g0:g1]) - - d2I_dC2 = int3c2e.get_int3c2e_general(mol, fakemol, ip_type='ipip2', direct_scf_tol=1e-14) - d2I_dC2 = cupy.einsum('dijq,ij->dq', d2I_dC2, dm) - d2I_dC2 = d2I_dC2.reshape([3, 3, ngrids]) + for j_atom in range(mol.natm): + g0,g1 = gridslice[j_atom] + # d2I_dAdC = int3c2e.get_int3c2e_general(mol, fakemol, ip_type='ip1ip2', direct_scf_tol=1e-14) + # d2I_dAdC = cupy.einsum('dijq,q->dij', d2I_dAdC[:, :, :, g0:g1], q_sym[g0:g1]) + # d2I_dAdC = d2I_dAdC.reshape([3, 3, nao, nao]) 
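+        # Evaluating ip1ip2 only on j_atom's grid slice keeps the contracted
+        # batch at [3, 3, nao, nao]; the commented-out route above needed the
+        # full [3, 3, nao, nao, ngrids] tensor at once.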
+ d2I_dAdC = int1e_grids_ip1ip2(mol, grid_coords[g0:g1, :], charges = q_sym[g0:g1], intopt = intopt_derivative, charge_exponents = charge_exp[g0:g1]**2) + + for i_atom in range(mol.natm): + p0,p1 = aoslice[i_atom, 2:] + d2e_from_d2I[i_atom, j_atom, :, :] += cupy.einsum('ij,dDij->dD', dm[p0:p1, :], d2I_dAdC[:, :, p0:p1, :]) + d2e_from_d2I[i_atom, j_atom, :, :] += cupy.einsum('ij,dDij->dD', dm[:, p0:p1], d2I_dAdC[:, :, p0:p1, :].transpose(0,1,3,2)) + + d2e_from_d2I[j_atom, i_atom, :, :] += cupy.einsum('ij,dDij->dD', dm[p0:p1, :], d2I_dAdC[:, :, p0:p1, :].transpose(1,0,2,3)) + d2e_from_d2I[j_atom, i_atom, :, :] += cupy.einsum('ij,dDij->dD', dm[:, p0:p1], d2I_dAdC[:, :, p0:p1, :].transpose(1,0,3,2)) + + # d2I_dC2 = int3c2e.get_int3c2e_general(mol, fakemol, ip_type='ipip2', direct_scf_tol=1e-14) + # d2I_dC2 = cupy.einsum('dijq,ij->dq', d2I_dC2, dm) + # d2I_dC2 = d2I_dC2.reshape([3, 3, ngrids]) + d2I_dC2 = int1e_grids_ipip2(mol, grid_coords, dm = dm, intopt = intopt_derivative, charge_exponents = charge_exp**2) for i_atom in range(mol.natm): g0,g1 = gridslice[i_atom] d2e_from_d2I[i_atom, i_atom, :, :] += cupy.einsum('dDq,q->dD', d2I_dC2[:, :, g0:g1], q_sym[g0:g1]) - intopt_derivative = int3c1e.VHFOpt(mol) - intopt_derivative.build(cutoff = 1e-14, aosym = False) - dqdx = get_dqsym_dx(pcmobj, dm, range(mol.natm), intopt_derivative) d2e_from_dIdq = numpy.zeros([mol.natm, mol.natm, 3, 3])
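For reference, a minimal sketch of driving the new charge-contracted second-derivative integrals directly (not part of the patch: the molecule, grid points, and charges are made-up test values, and charge_exponents is omitted on the assumption that the wrapper defaults it for point charges):

# Minimal usage sketch (illustrative only, not part of this patch).
import numpy as np
from pyscf import gto
from gpu4pyscf.gto import int3c1e
from gpu4pyscf.gto.int3c1e_ipip import int1e_grids_ipip1

mol = gto.M(atom='O 0 0 0; H 0 0 1; H 0 1 0', basis='def2-svp')
grids = np.random.rand(50, 3)     # made-up probe points (Bohr)
charges = np.random.rand(50)      # made-up point charges

intopt = int3c1e.VHFOpt(mol)
intopt.build(cutoff=1e-14, aosym=False)

# Returns d2I/dA2 already contracted with the charges: shape [3, 3, nao, nao].
# The [3, 3, nao, nao, ngrids] intermediate of the old fakemol/int3c2e route
# never materializes.
d2I_dA2 = int1e_grids_ipip1(mol, grids, charges=charges, intopt=intopt)
assert d2I_dA2.shape == (3, 3, mol.nao, mol.nao)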