diff --git a/gpu4pyscf/gto/moleintor.py b/gpu4pyscf/gto/moleintor.py index fb060d9a..7dea1a6b 100644 --- a/gpu4pyscf/gto/moleintor.py +++ b/gpu4pyscf/gto/moleintor.py @@ -19,6 +19,7 @@ from pyscf.scf import _vhf from pyscf.gto import ATOM_OF +from pyscf.lib import c_null_ptr from gpu4pyscf.lib.cupy_helper import load_library, cart2sph, block_c2s_diag, get_avail_mem from gpu4pyscf.lib import logger from gpu4pyscf.scf.int4c2e import BasisProdCache @@ -185,7 +186,7 @@ def cart2sph(self): # end of class VHFOpt -def get_int3c1e(mol, grids, intopt): +def get_int3c1e(mol, grids, charge_exponents, intopt): omega = mol.omega assert omega >= 0.0, "Short-range one electron integrals with GPU acceleration is not implemented." @@ -207,6 +208,8 @@ def get_int3c1e(mol, grids, intopt): # int3c = np.zeros([ngrids, nao, nao], order='C') # Using unpinned (pageable) memory, each memcpy is much slower, but there's no initialization time grids = cp.asarray(grids, order='C') + if charge_exponents is not None: + charge_exponents = cp.asarray(charge_exponents, order='C') for i_grid_split in range(0, ngrids, ngrids_per_split): ngrids_of_split = np.min([ngrids_per_split, ngrids - i_grid_split]) @@ -235,10 +238,15 @@ def get_int3c1e(mol, grids, intopt): int3c_angular_slice = cp.zeros([ngrids_of_split, j1-j0, i1-i0], order='C') + charge_exponents_pointer = c_null_ptr() + if charge_exponents is not None: + charge_exponents_pointer = charge_exponents[i_grid_split : i_grid_split + ngrids_of_split].data.ptr + err = libgint.GINTfill_int3c1e( ctypes.cast(stream.ptr, ctypes.c_void_p), intopt.bpcache, ctypes.cast(grids[i_grid_split : i_grid_split + ngrids_of_split, :].data.ptr, ctypes.c_void_p), + ctypes.cast(charge_exponents_pointer, ctypes.c_void_p), ctypes.c_int(ngrids_of_split), ctypes.cast(int3c_angular_slice.data.ptr, ctypes.c_void_p), ctypes.c_int(nao_cart), @@ -270,7 +278,7 @@ def get_int3c1e(mol, grids, intopt): return int3c -def get_int3c1e_charge_contracted(mol, grids, charges, intopt): +def get_int3c1e_charge_contracted(mol, grids, charge_exponents, charges, intopt): omega = mol.omega assert omega >= 0.0, "Short-range one electron integrals with GPU acceleration is not implemented." @@ -281,6 +289,8 @@ def get_int3c1e_charge_contracted(mol, grids, charges, intopt): grids = cp.asarray(grids, order='C') charges = cp.asarray(charges).reshape([-1, 1], order='C') grids = cp.concatenate([grids, charges], axis=1) + if charge_exponents is not None: + charge_exponents = cp.asarray(charge_exponents, order='C') int1e = cp.zeros([mol.nao, mol.nao], order='C') for cp_ij_id, _ in enumerate(intopt.log_qs): @@ -305,6 +315,10 @@ def get_int3c1e_charge_contracted(mol, grids, charges, intopt): ao_offsets = np.array([i0, j0], dtype=np.int32) strides = np.array([ni, ni*nj], dtype=np.int32) + charge_exponents_pointer = c_null_ptr() + if charge_exponents is not None: + charge_exponents_pointer = charge_exponents.data.ptr + ngrids = grids.shape[0] # n_charge_sum_per_thread = 1 # means every thread processes one pair and one grid # n_charge_sum_per_thread = ngrids # or larger number gaurantees one thread processes one pair and all grid points @@ -316,6 +330,7 @@ def get_int3c1e_charge_contracted(mol, grids, charges, intopt): ctypes.cast(stream.ptr, ctypes.c_void_p), intopt.bpcache, ctypes.cast(grids.data.ptr, ctypes.c_void_p), + ctypes.cast(charge_exponents_pointer, ctypes.c_void_p), ctypes.c_int(ngrids), ctypes.cast(int1e_angular_slice.data.ptr, ctypes.c_void_p), ctypes.c_int(nao_cart), @@ -345,7 +360,7 @@ def get_int3c1e_charge_contracted(mol, grids, charges, intopt): return int1e -def get_int3c1e_density_contracted(mol, grids, dm, intopt): +def get_int3c1e_density_contracted(mol, grids, charge_exponents, dm, intopt): omega = mol.omega assert omega >= 0.0, "Short-range one electron integrals with GPU acceleration is not implemented." @@ -389,8 +404,6 @@ def get_int3c1e_density_contracted(mol, grids, dm, intopt): bas_coords.ctypes.data_as(ctypes.c_void_p)) dm_pair_ordered = cp.asarray(dm_pair_ordered) - grids = cp.asarray(grids, order='C') - int3c_density_contracted = cp.zeros(ngrids) n_threads_per_block_1d = 16 n_max_blocks_per_grid_1d = 65535 @@ -400,6 +413,12 @@ def get_int3c1e_density_contracted(mol, grids, dm, intopt): print(f"Grid dimension = {ngrids} is too large, more than 100 kernels for one electron integral will be launched.") ngrids_per_split = (ngrids + n_grid_split - 1) // n_grid_split + grids = cp.asarray(grids, order='C') + if charge_exponents is not None: + charge_exponents = cp.asarray(charge_exponents, order='C') + + int3c_density_contracted = cp.zeros(ngrids) + for i_grid_split in range(0, ngrids, ngrids_per_split): ngrids_of_split = np.min([ngrids_per_split, ngrids - i_grid_split]) for cp_ij_id, _ in enumerate(intopt.log_qs): @@ -410,6 +429,10 @@ def get_int3c1e_density_contracted(mol, grids, dm, intopt): nbins = 1 bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32) + charge_exponents_pointer = c_null_ptr() + if charge_exponents is not None: + charge_exponents_pointer = charge_exponents[i_grid_split : i_grid_split + ngrids_of_split].data.ptr + # n_pair_sum_per_thread = 1 # means every thread processes one pair and one grid # n_pair_sum_per_thread = nao_cart # or larger number gaurantees one thread processes one grid and all pairs of the same type n_pair_sum_per_thread = nao_cart @@ -418,6 +441,7 @@ def get_int3c1e_density_contracted(mol, grids, dm, intopt): ctypes.cast(stream.ptr, ctypes.c_void_p), intopt.bpcache, ctypes.cast(grids[i_grid_split : i_grid_split + ngrids_of_split, :].data.ptr, ctypes.c_void_p), + ctypes.cast(charge_exponents_pointer, ctypes.c_void_p), ctypes.c_int(ngrids_of_split), ctypes.cast(dm_pair_ordered.data.ptr, ctypes.c_void_p), intopt.density_offset.ctypes.data_as(ctypes.c_void_p), @@ -433,7 +457,7 @@ def get_int3c1e_density_contracted(mol, grids, dm, intopt): return int3c_density_contracted -def intor(mol, intor, grids, dm=None, charges=None, direct_scf_tol=1e-13, intopt=None): +def intor(mol, intor, grids, charge_exponents=None, dm=None, charges=None, direct_scf_tol=1e-13, intopt=None): assert intor == 'int1e_grids' assert grids is not None assert dm is None or charges is None, \ @@ -449,10 +473,10 @@ def intor(mol, intor, grids, dm=None, charges=None, direct_scf_tol=1e-13, intopt assert hasattr(intopt, "density_offset"), "Please call build() function for VHFOpt object first." if dm is None and charges is None: - return get_int3c1e(mol, grids, intopt) + return get_int3c1e(mol, grids, charge_exponents, intopt) elif dm is not None: - return get_int3c1e_density_contracted(mol, grids, dm, intopt) + return get_int3c1e_density_contracted(mol, grids, charge_exponents, dm, intopt) elif charges is not None: - return get_int3c1e_charge_contracted(mol, grids, charges, intopt) + return get_int3c1e_charge_contracted(mol, grids, charge_exponents, charges, intopt) else: raise ValueError(f"Logic error in {__file__} {__name__}") diff --git a/gpu4pyscf/gto/tests/test_int1e_grids.py b/gpu4pyscf/gto/tests/test_int1e_grids.py index 53d7dbad..30795c2c 100644 --- a/gpu4pyscf/gto/tests/test_int1e_grids.py +++ b/gpu4pyscf/gto/tests/test_int1e_grids.py @@ -17,7 +17,7 @@ import numpy as np import cupy as cp import pyscf -from pyscf import lib +from pyscf import lib, gto, df from gpu4pyscf.gto.moleintor import intor def setUpModule(): @@ -157,6 +157,110 @@ def test_int1e_grids_charge_contracted_omega(self): assert isinstance(test_int1e_dot_q, cp.ndarray) cp.testing.assert_allclose(ref_int1e_dot_q, test_int1e_dot_q, atol = charge_contraction_threshold) + # Gaussian charges + + def test_int1e_grids_full_tensor_guassian_charge(self): + np.random.seed(12351) + charge_exponents = np.random.uniform(0.5, 1.0, grid_points.shape[0]) + + int3c2e = mol_sph._add_suffix('int3c2e') + cintopt = gto.moleintor.make_cintopt(mol_sph._atm, mol_sph._bas, mol_sph._env, int3c2e) + fakemol = gto.fakemol_for_charges(grid_points, expnt=charge_exponents) + ref_int1e = df.incore.aux_e2(mol_sph, fakemol, intor=int3c2e, aosym='s1', cintopt=cintopt) + ref_int1e = ref_int1e.transpose((2,0,1)) + + test_int1e = intor(mol_sph, 'int1e_grids', grid_points, charge_exponents = charge_exponents) + np.testing.assert_allclose(ref_int1e, test_int1e, atol = integral_threshold) + + def test_int1e_grids_density_contracted_guassian_charge(self): + np.random.seed(12351) + charge_exponents = np.random.uniform(0.5, 1.0, grid_points.shape[0]) + dm = np.random.uniform(-2.0, 2.0, (mol_sph.nao, mol_sph.nao)) + + int3c2e = mol_sph._add_suffix('int3c2e') + cintopt = gto.moleintor.make_cintopt(mol_sph._atm, mol_sph._bas, mol_sph._env, int3c2e) + fakemol = gto.fakemol_for_charges(grid_points, expnt=charge_exponents) + ref_int1e = df.incore.aux_e2(mol_sph, fakemol, intor=int3c2e, aosym='s1', cintopt=cintopt) + ref_int1e = ref_int1e.transpose((2,0,1)) + + ref_int1e_dot_D = np.einsum('pij,ij->p', ref_int1e, dm) + test_int1e_dot_D = intor(mol_sph, 'int1e_grids', grid_points, dm = dm, charge_exponents = charge_exponents) + assert isinstance(test_int1e_dot_D, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dot_D, test_int1e_dot_D, atol = density_contraction_threshold) + + def test_int1e_grids_charge_contracted_guassian_charge(self): + np.random.seed(12351) + charge_exponents = np.random.uniform(0.5, 1.0, grid_points.shape[0]) + charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0]) + + int3c2e = mol_sph._add_suffix('int3c2e') + cintopt = gto.moleintor.make_cintopt(mol_sph._atm, mol_sph._bas, mol_sph._env, int3c2e) + fakemol = gto.fakemol_for_charges(grid_points, expnt=charge_exponents) + ref_int1e = df.incore.aux_e2(mol_sph, fakemol, intor=int3c2e, aosym='s1', cintopt=cintopt) + ref_int1e = ref_int1e.transpose((2,0,1)) + + ref_int1e_dot_q = np.einsum('pij,p->ij', ref_int1e, charges) + test_int1e_dot_q = intor(mol_sph, 'int1e_grids', grid_points, charges = charges, charge_exponents = charge_exponents) + assert isinstance(test_int1e_dot_q, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dot_q, test_int1e_dot_q, atol = charge_contraction_threshold) + + def test_int1e_grids_full_tensor_guassian_charge_omega(self): + np.random.seed(12351) + charge_exponents = np.random.uniform(0.5, 1.0, grid_points.shape[0]) + + omega = 0.8 + mol_sph_omega = mol_sph.copy() + mol_sph_omega.set_range_coulomb(omega) + + int3c2e = mol_sph_omega._add_suffix('int3c2e') + cintopt = gto.moleintor.make_cintopt(mol_sph_omega._atm, mol_sph_omega._bas, mol_sph_omega._env, int3c2e) + fakemol = gto.fakemol_for_charges(grid_points, expnt=charge_exponents) + ref_int1e = df.incore.aux_e2(mol_sph_omega, fakemol, intor=int3c2e, aosym='s1', cintopt=cintopt) + ref_int1e = ref_int1e.transpose((2,0,1)) + + test_int1e = intor(mol_sph_omega, 'int1e_grids', grid_points, charge_exponents = charge_exponents) + np.testing.assert_allclose(ref_int1e, test_int1e, atol = integral_threshold) + + def test_int1e_grids_density_contracted_guassian_charge_omega(self): + np.random.seed(12351) + charge_exponents = np.random.uniform(0.5, 1.0, grid_points.shape[0]) + dm = np.random.uniform(-2.0, 2.0, (mol_sph.nao, mol_sph.nao)) + + omega = 1.2 + mol_sph_omega = mol_sph.copy() + mol_sph_omega.set_range_coulomb(omega) + + int3c2e = mol_sph_omega._add_suffix('int3c2e') + cintopt = gto.moleintor.make_cintopt(mol_sph_omega._atm, mol_sph_omega._bas, mol_sph_omega._env, int3c2e) + fakemol = gto.fakemol_for_charges(grid_points, expnt=charge_exponents) + ref_int1e = df.incore.aux_e2(mol_sph_omega, fakemol, intor=int3c2e, aosym='s1', cintopt=cintopt) + ref_int1e = ref_int1e.transpose((2,0,1)) + + ref_int1e_dot_D = np.einsum('pij,ij->p', ref_int1e, dm) + test_int1e_dot_D = intor(mol_sph_omega, 'int1e_grids', grid_points, dm = dm, charge_exponents = charge_exponents) + assert isinstance(test_int1e_dot_D, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dot_D, test_int1e_dot_D, atol = density_contraction_threshold) + + def test_int1e_grids_charge_contracted_guassian_charge_omega(self): + np.random.seed(12351) + charge_exponents = np.random.uniform(0.5, 1.0, grid_points.shape[0]) + charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0]) + + omega = 1.2 + mol_sph_omega = mol_sph.copy() + mol_sph_omega.set_range_coulomb(omega) + + int3c2e = mol_sph_omega._add_suffix('int3c2e') + cintopt = gto.moleintor.make_cintopt(mol_sph_omega._atm, mol_sph_omega._bas, mol_sph_omega._env, int3c2e) + fakemol = gto.fakemol_for_charges(grid_points, expnt=charge_exponents) + ref_int1e = df.incore.aux_e2(mol_sph_omega, fakemol, intor=int3c2e, aosym='s1', cintopt=cintopt) + ref_int1e = ref_int1e.transpose((2,0,1)) + + ref_int1e_dot_q = np.einsum('pij,p->ij', ref_int1e, charges) + test_int1e_dot_q = intor(mol_sph_omega, 'int1e_grids', grid_points, charges = charges, charge_exponents = charge_exponents) + assert isinstance(test_int1e_dot_q, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dot_q, test_int1e_dot_q, atol = charge_contraction_threshold) + if __name__ == "__main__": print("Full Tests for One Electron Coulomb Integrals") unittest.main() diff --git a/gpu4pyscf/lib/gint/g1e.cu b/gpu4pyscf/lib/gint/g1e.cu index 63176525..dfad65e5 100644 --- a/gpu4pyscf/lib/gint/g1e.cu +++ b/gpu4pyscf/lib/gint/g1e.cu @@ -20,8 +20,9 @@ // This function assumes i_l >= j_l template __device__ -static void GINTg1e(double* __restrict__ g, const double* __restrict__ grid_point, const int ish, const int jsh, const int prim_ij, - const int i_l, const int j_l, const double omega) +static void GINTg1e(double* __restrict__ g, const double* __restrict__ grid_point, + const int ish, const int jsh, const int prim_ij, + const int i_l, const int j_l, const double charge_exponent, const double omega) { const double* __restrict__ a12 = c_bpcache.a12; const double* __restrict__ e12 = c_bpcache.e12; @@ -40,12 +41,16 @@ static void GINTg1e(double* __restrict__ g, const double* __restrict__ grid_poin const double PCx = Px - Cx; const double PCy = Py - Cy; const double PCz = Pz - Cz; + double a0 = aij; - const double theta = omega > 0.0 ? omega * omega / (omega * omega + aij) : 1.0; - const double sqrt_theta = omega > 0.0 ? omega / sqrt(omega * omega + aij) : 1.0; + const double q_over_p_plus_q = charge_exponent > 0.0 ? charge_exponent / (aij + charge_exponent) : 1.0; + const double sqrt_q_over_p_plus_q = charge_exponent > 0.0 ? sqrt(q_over_p_plus_q) : 1.0; + a0 *= q_over_p_plus_q; + const double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0; + const double sqrt_theta = omega > 0.0 ? sqrt(theta) : 1.0; a0 *= theta; - const double prefactor = 2.0 * M_PI / aij * eij * sqrt_theta; + const double prefactor = 2.0 * M_PI / aij * eij * sqrt_theta * sqrt_q_over_p_plus_q; const double boys_input = a0 * (PCx * PCx + PCy * PCy + PCz * PCz); double uw[NROOTS * 2]; GINTrys_root(boys_input, uw); @@ -76,11 +81,11 @@ static void GINTg1e(double* __restrict__ g, const double* __restrict__ grid_poin gz[i_root] = w[i_root]; const double u2 = a0 * u[i_root]; - const double t2 = u2 / (u2 + aij); - const double b10 = 0.5 / aij * (1.0 - t2); - const double c00x = PAx - t2 * PCx; - const double c00y = PAy - t2 * PCy; - const double c00z = PAz - t2 * PCz; + const double qt2_over_p_plus_q = u2 / (u2 + aij * q_over_p_plus_q) * q_over_p_plus_q; + const double b10 = 0.5 / aij * (1.0 - qt2_over_p_plus_q); + const double c00x = PAx - qt2_over_p_plus_q * PCx; + const double c00y = PAy - qt2_over_p_plus_q * PCy; + const double c00z = PAz - qt2_over_p_plus_q * PCz; if (i_l + j_l > 0) { double s0x = gx[i_root]; // i - 1 @@ -132,7 +137,7 @@ static void GINTg1e(double* __restrict__ g, const double* __restrict__ grid_poin template __device__ static void GINT_g1e_without_hrr(double* __restrict__ g, const double grid_x, const double grid_y, const double grid_z, - const int ish, const int prim_ij, const int l, const double omega) + const int ish, const int prim_ij, const int l, const double charge_exponent, const double omega) { const double* __restrict__ a12 = c_bpcache.a12; const double* __restrict__ e12 = c_bpcache.e12; @@ -148,12 +153,16 @@ static void GINT_g1e_without_hrr(double* __restrict__ g, const double grid_x, co const double PCx = Px - grid_x; const double PCy = Py - grid_y; const double PCz = Pz - grid_z; + double a0 = aij; - const double theta = omega > 0.0 ? omega * omega / (omega * omega + aij) : 1.0; - const double sqrt_theta = omega > 0.0 ? omega / sqrt(omega * omega + aij) : 1.0; + const double q_over_p_plus_q = charge_exponent > 0.0 ? charge_exponent / (aij + charge_exponent) : 1.0; + const double sqrt_q_over_p_plus_q = charge_exponent > 0.0 ? sqrt(q_over_p_plus_q) : 1.0; + a0 *= q_over_p_plus_q; + const double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0; + const double sqrt_theta = omega > 0.0 ? sqrt(theta) : 1.0; a0 *= theta; - const double prefactor = 2.0 * M_PI / aij * eij * sqrt_theta; + const double prefactor = 2.0 * M_PI / aij * eij * sqrt_theta * sqrt_q_over_p_plus_q; const double boys_input = a0 * (PCx * PCx + PCy * PCy + PCz * PCz); double uw[NROOTS * 2]; GINTrys_root(boys_input, uw); @@ -184,11 +193,11 @@ static void GINT_g1e_without_hrr(double* __restrict__ g, const double grid_x, co gz[i_root] = w[i_root]; const double u2 = a0 * u[i_root]; - const double t2 = u2 / (u2 + aij); - const double b10 = 0.5 / aij * (1.0 - t2); - const double c00x = PAx - t2 * PCx; - const double c00y = PAy - t2 * PCy; - const double c00z = PAz - t2 * PCz; + const double qt2_over_p_plus_q = u2 / (u2 + aij * q_over_p_plus_q) * q_over_p_plus_q; + const double b10 = 0.5 / aij * (1.0 - qt2_over_p_plus_q); + const double c00x = PAx - qt2_over_p_plus_q * PCx; + const double c00y = PAy - qt2_over_p_plus_q * PCy; + const double c00z = PAz - qt2_over_p_plus_q * PCz; if (l > 0) { double s0x = gx[i_root]; // i - 1 diff --git a/gpu4pyscf/lib/gint/g1e_root_123.cu b/gpu4pyscf/lib/gint/g1e_root_123.cu index 638e795e..aa1642c2 100644 --- a/gpu4pyscf/lib/gint/g1e_root_123.cu +++ b/gpu4pyscf/lib/gint/g1e_root_123.cu @@ -20,7 +20,7 @@ __global__ static void GINTfill_int3c1e_kernel00(double* output, const BasisProdOffsets offsets, const int nprim_ij, const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j, - const double omega, const double* grid_points) + const double omega, const double* grid_points, const double* charge_exponents) { const int ntasks_ij = offsets.ntasks_ij; const int ngrids = offsets.ntasks_kl; @@ -47,6 +47,7 @@ static void GINTfill_int3c1e_kernel00(double* output, const BasisProdOffsets off const double Cx = grid_point[0]; const double Cy = grid_point[1]; const double Cz = grid_point[2]; + const double charge_exponent = (charge_exponents != NULL) ? charge_exponents[task_grid] : 0.0; double eri = 0; for (int ij = prim_ij; ij < prim_ij + nprim_ij; ij++) { @@ -59,11 +60,14 @@ static void GINTfill_int3c1e_kernel00(double* output, const BasisProdOffsets off const double PCy = Py - Cy; const double PCz = Pz - Cz; double a0 = aij; - const double theta = omega > 0.0 ? omega * omega / (omega * omega + aij) : 1.0; - const double sqrt_theta = omega > 0.0 ? omega / sqrt(omega * omega + aij) : 1.0; + const double q_over_p_plus_q = charge_exponent > 0.0 ? charge_exponent / (aij + charge_exponent) : 1.0; + const double sqrt_q_over_p_plus_q = charge_exponent > 0.0 ? sqrt(q_over_p_plus_q) : 1.0; + a0 *= q_over_p_plus_q; + const double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0; + const double sqrt_theta = omega > 0.0 ? sqrt(theta) : 1.0; a0 *= theta; - const double prefactor = 2.0 * M_PI / aij * eij * sqrt_theta; + const double prefactor = 2.0 * M_PI / aij * eij * sqrt_theta * sqrt_q_over_p_plus_q; const double boys_input = a0 * (PCx * PCx + PCy * PCy + PCz * PCz); double eri_per_primitive = prefactor; if (boys_input > 3.e-7) { @@ -83,7 +87,7 @@ static void GINTfill_int3c1e_kernel00(double* output, const BasisProdOffsets off __global__ static void GINTfill_int3c1e_charge_contracted_kernel00(double* output, const BasisProdOffsets offsets, const int nprim_ij, const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j, - const double omega, const double* grid_points) + const double omega, const double* grid_points, const double* charge_exponents) { const int ntasks_ij = offsets.ntasks_ij; const int ngrids = offsets.ntasks_kl; @@ -111,6 +115,7 @@ static void GINTfill_int3c1e_charge_contracted_kernel00(double* output, const Ba const double Cx = grid_point[0]; const double Cy = grid_point[1]; const double Cz = grid_point[2]; + const double charge_exponent = (charge_exponents != NULL) ? charge_exponents[task_grid] : 0.0; double eri_per_grid = 0; for (int ij = prim_ij; ij < prim_ij + nprim_ij; ij++) { @@ -123,11 +128,14 @@ static void GINTfill_int3c1e_charge_contracted_kernel00(double* output, const Ba const double PCy = Py - Cy; const double PCz = Pz - Cz; double a0 = aij; - const double theta = omega > 0.0 ? omega * omega / (omega * omega + aij) : 1.0; - const double sqrt_theta = omega > 0.0 ? omega / sqrt(omega * omega + aij) : 1.0; + const double q_over_p_plus_q = charge_exponent > 0.0 ? charge_exponent / (aij + charge_exponent) : 1.0; + const double sqrt_q_over_p_plus_q = charge_exponent > 0.0 ? sqrt(q_over_p_plus_q) : 1.0; + a0 *= q_over_p_plus_q; + const double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0; + const double sqrt_theta = omega > 0.0 ? sqrt(theta) : 1.0; a0 *= theta; - const double prefactor = 2.0 * M_PI / aij * eij * sqrt_theta; + const double prefactor = 2.0 * M_PI / aij * eij * sqrt_theta * sqrt_q_over_p_plus_q; const double boys_input = a0 * (PCx * PCx + PCy * PCy + PCz * PCz); double eri_per_primitive = prefactor; if (boys_input > 3.e-7) { @@ -152,7 +160,7 @@ static void GINTfill_int3c1e_charge_contracted_kernel00(double* output, const Ba __global__ static void GINTfill_int3c1e_density_contracted_kernel00(double* output, const double* density, const HermiteDensityOffsets hermite_density_offsets, const BasisProdOffsets offsets, const int nprim_ij, - const double omega, const double* grid_points) + const double omega, const double* grid_points, const double* charge_exponents) { const int ntasks_ij = offsets.ntasks_ij; const int ngrids = offsets.ntasks_kl; @@ -165,6 +173,7 @@ static void GINTfill_int3c1e_density_contracted_kernel00(double* output, const d const double Cx = grid_point[0]; const double Cy = grid_point[1]; const double Cz = grid_point[2]; + const double charge_exponent = (charge_exponents != NULL) ? charge_exponents[task_grid] : 0.0; double eri_pair_sum = 0.0; for (int task_ij = blockIdx.x * blockDim.x + threadIdx.x; task_ij < ntasks_ij; task_ij += gridDim.x * blockDim.x) { @@ -192,11 +201,14 @@ static void GINTfill_int3c1e_density_contracted_kernel00(double* output, const d const double PCy = Py - Cy; const double PCz = Pz - Cz; double a0 = aij; - const double theta = omega > 0.0 ? omega * omega / (omega * omega + aij) : 1.0; - const double sqrt_theta = omega > 0.0 ? omega / sqrt(omega * omega + aij) : 1.0; + const double q_over_p_plus_q = charge_exponent > 0.0 ? charge_exponent / (aij + charge_exponent) : 1.0; + const double sqrt_q_over_p_plus_q = charge_exponent > 0.0 ? sqrt(q_over_p_plus_q) : 1.0; + a0 *= q_over_p_plus_q; + const double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0; + const double sqrt_theta = omega > 0.0 ? sqrt(theta) : 1.0; a0 *= theta; - const double prefactor = 2.0 * M_PI / aij * eij * sqrt_theta; + const double prefactor = 2.0 * M_PI / aij * eij * sqrt_theta * sqrt_q_over_p_plus_q; const double boys_input = a0 * (PCx * PCx + PCy * PCy + PCz * PCz); double eri_per_primitive = prefactor; if (boys_input > 3.e-7) { @@ -216,7 +228,7 @@ static void GINTfill_int3c1e_density_contracted_kernel00(double* output, const d __global__ static void GINTfill_int3c1e_kernel10(double* output, const BasisProdOffsets offsets, const int nprim_ij, const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j, - const double omega, const double* grid_points) + const double omega, const double* grid_points, const double* charge_exponents) { const int ntasks_ij = offsets.ntasks_ij; const int ngrids = offsets.ntasks_kl; @@ -251,6 +263,7 @@ static void GINTfill_int3c1e_kernel10(double* output, const BasisProdOffsets off const double Cx = grid_point[0]; const double Cy = grid_point[1]; const double Cz = grid_point[2]; + const double charge_exponent = (charge_exponents != NULL) ? charge_exponents[task_grid] : 0.0; double eri_x = 0; double eri_y = 0; @@ -267,13 +280,16 @@ static void GINTfill_int3c1e_kernel10(double* output, const BasisProdOffsets off const double PAx = Px - Ax; const double PAy = Py - Ay; const double PAz = Pz - Az; - double a0 = aij; const double one_over_two_p = 0.5 / aij; - const double theta = omega > 0.0 ? omega * omega / (omega * omega + aij) : 1.0; - const double sqrt_theta = omega > 0.0 ? omega / sqrt(omega * omega + aij) : 1.0; + double a0 = aij; + const double q_over_p_plus_q = charge_exponent > 0.0 ? charge_exponent / (aij + charge_exponent) : 1.0; + const double sqrt_q_over_p_plus_q = charge_exponent > 0.0 ? sqrt(q_over_p_plus_q) : 1.0; + a0 *= q_over_p_plus_q; + const double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0; + const double sqrt_theta = omega > 0.0 ? sqrt(theta) : 1.0; a0 *= theta; - const double prefactor = 2.0 * M_PI / aij * eij * sqrt_theta; + const double prefactor = 2.0 * M_PI / aij * eij * sqrt_theta * sqrt_q_over_p_plus_q; const double boys_input = a0 * (PCx * PCx + PCy * PCy + PCz * PCz); double eri_per_primitive_x = prefactor; double eri_per_primitive_y = prefactor; @@ -302,7 +318,7 @@ static void GINTfill_int3c1e_kernel10(double* output, const BasisProdOffsets off __global__ static void GINTfill_int3c1e_charge_contracted_kernel10(double* output, const BasisProdOffsets offsets, const int nprim_ij, const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j, - const double omega, const double* grid_points) + const double omega, const double* grid_points, const double* charge_exponents) { const int ntasks_ij = offsets.ntasks_ij; const int ngrids = offsets.ntasks_kl; @@ -340,6 +356,7 @@ static void GINTfill_int3c1e_charge_contracted_kernel10(double* output, const Ba const double Cx = grid_point[0]; const double Cy = grid_point[1]; const double Cz = grid_point[2]; + const double charge_exponent = (charge_exponents != NULL) ? charge_exponents[task_grid] : 0.0; double eri_per_grid_x = 0; double eri_per_grid_y = 0; @@ -356,13 +373,16 @@ static void GINTfill_int3c1e_charge_contracted_kernel10(double* output, const Ba const double PAx = Px - Ax; const double PAy = Py - Ay; const double PAz = Pz - Az; - double a0 = aij; const double one_over_two_p = 0.5 / aij; - const double theta = omega > 0.0 ? omega * omega / (omega * omega + aij) : 1.0; - const double sqrt_theta = omega > 0.0 ? omega / sqrt(omega * omega + aij) : 1.0; + double a0 = aij; + const double q_over_p_plus_q = charge_exponent > 0.0 ? charge_exponent / (aij + charge_exponent) : 1.0; + const double sqrt_q_over_p_plus_q = charge_exponent > 0.0 ? sqrt(q_over_p_plus_q) : 1.0; + a0 *= q_over_p_plus_q; + const double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0; + const double sqrt_theta = omega > 0.0 ? sqrt(theta) : 1.0; a0 *= theta; - const double prefactor = 2.0 * M_PI / aij * eij * sqrt_theta; + const double prefactor = 2.0 * M_PI / aij * eij * sqrt_theta * sqrt_q_over_p_plus_q; const double boys_input = a0 * (PCx * PCx + PCy * PCy + PCz * PCz); double eri_per_primitive_x = prefactor; double eri_per_primitive_y = prefactor; @@ -397,7 +417,7 @@ static void GINTfill_int3c1e_charge_contracted_kernel10(double* output, const Ba __global__ static void GINTfill_int3c1e_density_contracted_kernel10(double* output, const double* density, const HermiteDensityOffsets hermite_density_offsets, const BasisProdOffsets offsets, const int nprim_ij, - const double omega, const double* grid_points) + const double omega, const double* grid_points, const double* charge_exponents) { const int ntasks_ij = offsets.ntasks_ij; const int ngrids = offsets.ntasks_kl; @@ -410,6 +430,7 @@ static void GINTfill_int3c1e_density_contracted_kernel10(double* output, const d const double Cx = grid_point[0]; const double Cy = grid_point[1]; const double Cz = grid_point[2]; + const double charge_exponent = (charge_exponents != NULL) ? charge_exponents[task_grid] : 0.0; double eri_pair_sum = 0.0; for (int task_ij = blockIdx.x * blockDim.x + threadIdx.x; task_ij < ntasks_ij; task_ij += gridDim.x * blockDim.x) { @@ -449,13 +470,16 @@ static void GINTfill_int3c1e_density_contracted_kernel10(double* output, const d const double PAx = Px - Ax; const double PAy = Py - Ay; const double PAz = Pz - Az; - double a0 = aij; const double one_over_two_p = 0.5 / aij; - const double theta = omega > 0.0 ? omega * omega / (omega * omega + aij) : 1.0; - const double sqrt_theta = omega > 0.0 ? omega / sqrt(omega * omega + aij) : 1.0; + double a0 = aij; + const double q_over_p_plus_q = charge_exponent > 0.0 ? charge_exponent / (aij + charge_exponent) : 1.0; + const double sqrt_q_over_p_plus_q = charge_exponent > 0.0 ? sqrt(q_over_p_plus_q) : 1.0; + a0 *= q_over_p_plus_q; + const double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0; + const double sqrt_theta = omega > 0.0 ? sqrt(theta) : 1.0; a0 *= theta; - const double prefactor = 2.0 * M_PI / aij * eij * sqrt_theta; + const double prefactor = 2.0 * M_PI / aij * eij * sqrt_theta * sqrt_q_over_p_plus_q; const double boys_input = a0 * (PCx * PCx + PCy * PCy + PCz * PCz); double eri_per_primitive_x = prefactor; double eri_per_primitive_y = prefactor; diff --git a/gpu4pyscf/lib/gint/g3c1e.cu b/gpu4pyscf/lib/gint/g3c1e.cu index be456196..af3b6147 100644 --- a/gpu4pyscf/lib/gint/g3c1e.cu +++ b/gpu4pyscf/lib/gint/g3c1e.cu @@ -64,7 +64,7 @@ template __global__ void GINTfill_int3c1e_kernel_general(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij, const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j, - const double omega, const double* grid_points) + const double omega, const double* grid_points, const double* charge_exponents) { const int ntasks_ij = offsets.ntasks_ij; const int ngrids = offsets.ntasks_kl; @@ -82,11 +82,12 @@ void GINTfill_int3c1e_kernel_general(double* output, const BasisProdOffsets offs const int jsh = bas_pair2ket[bas_ij]; const double* grid_point = grid_points + task_grid * 3; + const double charge_exponent = (charge_exponents != NULL) ? charge_exponents[task_grid] : 0.0; double g[GSIZE_INT3C_1E]; for (int ij = prim_ij; ij < prim_ij+nprim_ij; ++ij) { - GINTg1e(g, grid_point, ish, jsh, ij, i_l, j_l, omega); + GINTg1e(g, grid_point, ish, jsh, ij, i_l, j_l, charge_exponent, omega); GINTwrite_int3c1e(g, output, ish, jsh, task_grid, i_l, j_l, stride_j, stride_ij, ao_offsets_i, ao_offsets_j); } } @@ -132,7 +133,7 @@ template __global__ void GINTfill_int3c1e_charge_contracted_kernel_general(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij, const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j, - const double omega, const double* grid_points) + const double omega, const double* grid_points, const double* charge_exponents) { const int ntasks_ij = offsets.ntasks_ij; const int ngrids = offsets.ntasks_kl; @@ -157,11 +158,12 @@ void GINTfill_int3c1e_charge_contracted_kernel_general(double* output, const Bas for (int task_grid = blockIdx.y * blockDim.y + threadIdx.y; task_grid < ngrids; task_grid += gridDim.y * blockDim.y) { const double* grid_point = grid_points + task_grid * 4; const double charge = grid_point[3]; + const double charge_exponent = (charge_exponents != NULL) ? charge_exponents[task_grid] : 0.0; double g[GSIZE_INT3C_1E]; for (int ij = prim_ij; ij < prim_ij+nprim_ij; ++ij) { - GINTg1e(g, grid_point, ish, jsh, ij, i_l, j_l, omega); + GINTg1e(g, grid_point, ish, jsh, ij, i_l, j_l, charge_exponent, omega); GINTwrite_int3c1e_charge_contracted(g, output_cache, charge, i_l, j_l); } } @@ -180,9 +182,9 @@ void GINTfill_int3c1e_charge_contracted_kernel_general(double* output, const Bas template __global__ -void GINT_int3c1e_density_contracted_kernel_general(double* output, const double* density, const HermiteDensityOffsets hermite_density_offsets, - const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij, - const double omega, const double* grid_points) +void GINTfill_int3c1e_density_contracted_kernel_general(double* output, const double* density, const HermiteDensityOffsets hermite_density_offsets, + const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij, + const double omega, const double* grid_points, const double* charge_exponents) { const int ntasks_ij = offsets.ntasks_ij; const int ngrids = offsets.ntasks_kl; @@ -195,6 +197,7 @@ void GINT_int3c1e_density_contracted_kernel_general(double* output, const double const double Cx = grid_point[0]; const double Cy = grid_point[1]; const double Cz = grid_point[2]; + const double charge_exponent = (charge_exponents != NULL) ? charge_exponents[task_grid] : 0.0; double eri_with_density_pair_sum = 0.0; for (int task_ij = blockIdx.x * blockDim.x + threadIdx.x; task_ij < ntasks_ij; task_ij += gridDim.x * blockDim.x) { @@ -215,7 +218,7 @@ void GINT_int3c1e_density_contracted_kernel_general(double* output, const double double eri_with_density_per_pair = 0.0; for (int ij = prim_ij; ij < prim_ij+nprim_ij; ++ij) { double g[NROOTS * (l_max + 1) * 3]; - GINT_g1e_without_hrr(g, Cx, Cy, Cz, ish, ij, l, omega); + GINT_g1e_without_hrr(g, Cx, Cy, Cz, ish, ij, l, charge_exponent, omega); double eri_with_density_per_primitive = 0.0; for (int i_x = 0, i_t = 0; i_x <= l; i_x++) { diff --git a/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e.cu b/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e.cu index f27ea6f3..cd9c143b 100644 --- a/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e.cu +++ b/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e.cu @@ -41,25 +41,9 @@ #define GSIZE4_INT3C_1E 240 #define GSIZE5_INT3C_1E 450 -/* -static void CINTcart_comp(int16_t *nx, int16_t *ny, int16_t *nz, int lmax) -{ - int inc = 0; - for (int16_t lx = lmax; lx >= 0; lx--) { - for (int16_t ly = lmax - lx; ly >= 0; ly--) { - int16_t lz = lmax - lx - ly; - nx[inc] = lx; - ny[inc] = ly; - nz[inc] = lz; - inc++; - } - } -} -*/ - static int GINTfill_int3c1e_tasks(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij, const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j, - const double omega, const double* grid_points, const cudaStream_t stream) + const double omega, const double* grid_points, const double* charge_exponents, const cudaStream_t stream) { const int nrys_roots = (i_l + j_l) / 2 + 1; const int ntasks_ij = offsets.ntasks_ij; @@ -72,16 +56,16 @@ static int GINTfill_int3c1e_tasks(double* output, const BasisProdOffsets offsets case 1: type_ijkl = (i_l << 2) | j_l; switch (type_ijkl) { - case (0<<2)|0: GINTfill_int3c1e_kernel00<<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points); break; - case (1<<2)|0: GINTfill_int3c1e_kernel10<<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points); break; + case (0<<2)|0: GINTfill_int3c1e_kernel00<<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case (1<<2)|0: GINTfill_int3c1e_kernel10<<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; default: fprintf(stderr, "roots=1 type_ijkl %d\n", type_ijkl); } break; - case 2: GINTfill_int3c1e_kernel_general<2, GSIZE2_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points); break; - case 3: GINTfill_int3c1e_kernel_general<3, GSIZE3_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points); break; - case 4: GINTfill_int3c1e_kernel_general<4, GSIZE4_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points); break; - case 5: GINTfill_int3c1e_kernel_general<5, GSIZE5_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points); break; + case 2: GINTfill_int3c1e_kernel_general<2, GSIZE2_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 3: GINTfill_int3c1e_kernel_general<3, GSIZE3_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 4: GINTfill_int3c1e_kernel_general<4, GSIZE4_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 5: GINTfill_int3c1e_kernel_general<5, GSIZE5_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; default: fprintf(stderr, "rys roots %d\n", nrys_roots); return 1; @@ -97,7 +81,8 @@ static int GINTfill_int3c1e_tasks(double* output, const BasisProdOffsets offsets static int GINTfill_int3c1e_charge_contracted_tasks(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij, const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j, - const double omega, const double* grid_points, const cudaStream_t stream, const int n_charge_sum_per_thread) + const double omega, const double* grid_points, const double* charge_exponents, + const int n_charge_sum_per_thread, const cudaStream_t stream) { const int nrys_roots = (i_l + j_l) / 2 + 1; const int ntasks_ij = offsets.ntasks_ij; @@ -110,16 +95,16 @@ static int GINTfill_int3c1e_charge_contracted_tasks(double* output, const BasisP case 1: type_ijkl = (i_l << 2) | j_l; switch (type_ijkl) { - case (0<<2)|0: GINTfill_int3c1e_charge_contracted_kernel00<<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points); break; - case (1<<2)|0: GINTfill_int3c1e_charge_contracted_kernel10<<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points); break; + case (0<<2)|0: GINTfill_int3c1e_charge_contracted_kernel00<<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case (1<<2)|0: GINTfill_int3c1e_charge_contracted_kernel10<<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; default: fprintf(stderr, "roots=1 type_ijkl %d\n", type_ijkl); } break; - case 2: GINTfill_int3c1e_charge_contracted_kernel_general<2, GSIZE2_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points); break; - case 3: GINTfill_int3c1e_charge_contracted_kernel_general<3, GSIZE3_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points); break; - case 4: GINTfill_int3c1e_charge_contracted_kernel_general<4, GSIZE4_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points); break; - case 5: GINTfill_int3c1e_charge_contracted_kernel_general<5, GSIZE5_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points); break; + case 2: GINTfill_int3c1e_charge_contracted_kernel_general<2, GSIZE2_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 3: GINTfill_int3c1e_charge_contracted_kernel_general<3, GSIZE3_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 4: GINTfill_int3c1e_charge_contracted_kernel_general<4, GSIZE4_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 5: GINTfill_int3c1e_charge_contracted_kernel_general<5, GSIZE5_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; default: fprintf(stderr, "rys roots %d\n", nrys_roots); return 1; @@ -135,7 +120,8 @@ static int GINTfill_int3c1e_charge_contracted_tasks(double* output, const BasisP static int GINTfill_int3c1e_density_contracted_tasks(double* output, const double* density, const HermiteDensityOffsets hermite_density_offsets, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij, - const double omega, const double* grid_points, const int n_pair_sum_per_thread, const cudaStream_t stream) + const double omega, const double* grid_points, const double* charge_exponents, + const int n_pair_sum_per_thread, const cudaStream_t stream) { const int nrys_roots = (i_l + j_l) / 2 + 1; const int ntasks_ij = (offsets.ntasks_ij + n_pair_sum_per_thread - 1) / n_pair_sum_per_thread; @@ -148,16 +134,16 @@ static int GINTfill_int3c1e_density_contracted_tasks(double* output, const doubl case 1: type_ijkl = (i_l << 2) | j_l; switch (type_ijkl) { - case (0<<2)|0: GINTfill_int3c1e_density_contracted_kernel00<<>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points); break; - case (1<<2)|0: GINTfill_int3c1e_density_contracted_kernel10<<>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points); break; + case (0<<2)|0: GINTfill_int3c1e_density_contracted_kernel00<<>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break; + case (1<<2)|0: GINTfill_int3c1e_density_contracted_kernel10<<>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break; default: fprintf(stderr, "roots=1 type_ijkl %d\n", type_ijkl); } break; - case 2: GINT_int3c1e_density_contracted_kernel_general<2> <<>>(output, density, hermite_density_offsets, offsets, i_l, j_l, nprim_ij, omega, grid_points); break; - case 3: GINT_int3c1e_density_contracted_kernel_general<3> <<>>(output, density, hermite_density_offsets, offsets, i_l, j_l, nprim_ij, omega, grid_points); break; - case 4: GINT_int3c1e_density_contracted_kernel_general<4> <<>>(output, density, hermite_density_offsets, offsets, i_l, j_l, nprim_ij, omega, grid_points); break; - case 5: GINT_int3c1e_density_contracted_kernel_general<5> <<>>(output, density, hermite_density_offsets, offsets, i_l, j_l, nprim_ij, omega, grid_points); break; + case 2: GINTfill_int3c1e_density_contracted_kernel_general<2> <<>>(output, density, hermite_density_offsets, offsets, i_l, j_l, nprim_ij, omega, grid_points, charge_exponents); break; + case 3: GINTfill_int3c1e_density_contracted_kernel_general<3> <<>>(output, density, hermite_density_offsets, offsets, i_l, j_l, nprim_ij, omega, grid_points, charge_exponents); break; + case 4: GINTfill_int3c1e_density_contracted_kernel_general<4> <<>>(output, density, hermite_density_offsets, offsets, i_l, j_l, nprim_ij, omega, grid_points, charge_exponents); break; + case 5: GINTfill_int3c1e_density_contracted_kernel_general<5> <<>>(output, density, hermite_density_offsets, offsets, i_l, j_l, nprim_ij, omega, grid_points, charge_exponents); break; default: fprintf(stderr, "rys roots %d\n", nrys_roots); return 1; @@ -173,7 +159,7 @@ static int GINTfill_int3c1e_density_contracted_tasks(double* output, const doubl extern "C" { int GINTfill_int3c1e(const cudaStream_t stream, const BasisProdCache* bpcache, - const double* grid_points, const int ngrids, + const double* grid_points, const double* charge_exponents, const int ngrids, double* integrals, const int nao, const int* strides, const int* ao_offsets, const int* bins_locs_ij, int nbins, @@ -212,7 +198,7 @@ int GINTfill_int3c1e(const cudaStream_t stream, const BasisProdCache* bpcache, const int err = GINTfill_int3c1e_tasks(integrals, offsets, i_l, j_l, nprim_ij, strides[0], strides[1], ao_offsets[0], ao_offsets[1], - omega, grid_points, stream); + omega, grid_points, charge_exponents, stream); if (err != 0) { return err; @@ -223,7 +209,7 @@ int GINTfill_int3c1e(const cudaStream_t stream, const BasisProdCache* bpcache, } int GINTfill_int3c1e_charge_contracted(const cudaStream_t stream, const BasisProdCache* bpcache, - const double* grid_points, const int ngrids, + const double* grid_points, const double* charge_exponents, const int ngrids, double* integral_charge_contracted, const int nao, const int* strides, const int* ao_offsets, const int* bins_locs_ij, int nbins, @@ -262,7 +248,7 @@ int GINTfill_int3c1e_charge_contracted(const cudaStream_t stream, const BasisPro const int err = GINTfill_int3c1e_charge_contracted_tasks(integral_charge_contracted, offsets, i_l, j_l, nprim_ij, strides[0], strides[1], ao_offsets[0], ao_offsets[1], - omega, grid_points, stream, n_charge_sum_per_thread); + omega, grid_points, charge_exponents, n_charge_sum_per_thread, stream); if (err != 0) { return err; @@ -273,7 +259,7 @@ int GINTfill_int3c1e_charge_contracted(const cudaStream_t stream, const BasisPro } int GINTfill_int3c1e_density_contracted(const cudaStream_t stream, const BasisProdCache* bpcache, - const double* grid_points, const int ngrids, + const double* grid_points, const double* charge_exponents, const int ngrids, const double* dm_pair_ordered, const int* density_offset, double* integral_density_contracted, const int* bins_locs_ij, int nbins, @@ -317,7 +303,7 @@ int GINTfill_int3c1e_density_contracted(const cudaStream_t stream, const BasisPr const int err = GINTfill_int3c1e_density_contracted_tasks(integral_density_contracted, dm_pair_ordered, hermite_density_offsets, offsets, i_l, j_l, nprim_ij, - omega, grid_points, n_pair_sum_per_thread, stream); + omega, grid_points, charge_exponents, n_pair_sum_per_thread, stream); if (err != 0) { return err;