From bb3db3b3e061641a6a3cf87e871c16be5afec791 Mon Sep 17 00:00:00 2001 From: Henry Wang Date: Thu, 12 Dec 2024 08:41:18 +0800 Subject: [PATCH] Special case for int1e ip contracted version --- gpu4pyscf/lib/gint/g1e_ip_root_1.cu | 168 ++++++++++++++++++++ gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ip.cu | 36 ++--- 2 files changed, 184 insertions(+), 20 deletions(-) diff --git a/gpu4pyscf/lib/gint/g1e_ip_root_1.cu b/gpu4pyscf/lib/gint/g1e_ip_root_1.cu index ebbe5449..cd080024 100644 --- a/gpu4pyscf/lib/gint/g1e_ip_root_1.cu +++ b/gpu4pyscf/lib/gint/g1e_ip_root_1.cu @@ -114,3 +114,171 @@ static void GINTfill_int3c1e_ip_kernel00(double* output, const BasisProdOffsets output[i0 + j0 * stride_j + task_grid * stride_ij + 4 * stride_ij * ngrids] = deri_dCy; output[i0 + j0 * stride_j + task_grid * stride_ij + 5 * stride_ij * ngrids] = deri_dCz; } + +__global__ +static void GINTfill_int3c1e_ip1_charge_contracted_kernel00(double* output, const BasisProdOffsets offsets, const int nprim_ij, + const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j, + const double omega, const double* grid_points, const double* charge_exponents) +{ + const int ntasks_ij = offsets.ntasks_ij; + const int ngrids = offsets.ntasks_kl; + const int task_ij = blockIdx.x * blockDim.x + threadIdx.x; + if (task_ij >= ntasks_ij) { + return; + } + + const int bas_ij = offsets.bas_ij + task_ij; + const int prim_ij = offsets.primitive_ij + task_ij * nprim_ij; + const int* bas_pair2bra = c_bpcache.bas_pair2bra; + const int* bas_pair2ket = c_bpcache.bas_pair2ket; + const int ish = bas_pair2bra[bas_ij]; + const int jsh = bas_pair2ket[bas_ij]; + + const double* __restrict__ a12 = c_bpcache.a12; + const double* __restrict__ e12 = c_bpcache.e12; + const double* __restrict__ x12 = c_bpcache.x12; + const double* __restrict__ y12 = c_bpcache.y12; + const double* __restrict__ z12 = c_bpcache.z12; + + const double* __restrict__ a_exponents = c_bpcache.a1; + const int nbas = c_bpcache.nbas; + const double* __restrict__ bas_x = c_bpcache.bas_coords; + const double* __restrict__ bas_y = bas_x + nbas; + const double* __restrict__ bas_z = bas_y + nbas; + const double Ax = bas_x[ish]; + const double Ay = bas_y[ish]; + const double Az = bas_z[ish]; + + double deri_dAx_grid_sum = 0; + double deri_dAy_grid_sum = 0; + double deri_dAz_grid_sum = 0; + for (int task_grid = blockIdx.y * blockDim.y + threadIdx.y; task_grid < ngrids; task_grid += gridDim.y * blockDim.y) { + const double* grid_point = grid_points + task_grid * 4; + const double Cx = grid_point[0]; + const double Cy = grid_point[1]; + const double Cz = grid_point[2]; + const double charge_exponent = (charge_exponents != NULL) ? charge_exponents[task_grid] : 0.0; + + double deri_dAx_per_grid = 0; + double deri_dAy_per_grid = 0; + double deri_dAz_per_grid = 0; + for (int ij = prim_ij; ij < prim_ij + nprim_ij; ij++) { + const double aij = a12[ij]; + const double eij = e12[ij]; + const double Px = x12[ij]; + const double Py = y12[ij]; + const double Pz = z12[ij]; + const double PCx = Px - Cx; + const double PCy = Py - Cy; + const double PCz = Pz - Cz; + const double PAx = Px - Ax; + const double PAy = Py - Ay; + const double PAz = Pz - Az; + const double minus_two_a = -2.0 * a_exponents[ij]; + const double one_over_two_p = 0.5 / aij; + double a0 = aij; + const double q_over_p_plus_q = charge_exponent > 0.0 ? charge_exponent / (aij + charge_exponent) : 1.0; + const double sqrt_q_over_p_plus_q = charge_exponent > 0.0 ? sqrt(q_over_p_plus_q) : 1.0; + a0 *= q_over_p_plus_q; + const double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0; + const double sqrt_theta = omega > 0.0 ? sqrt(theta) : 1.0; + a0 *= theta; + + const double prefactor = 2.0 * M_PI / aij * eij * sqrt_theta * sqrt_q_over_p_plus_q; + const double boys_input = a0 * (PCx * PCx + PCy * PCy + PCz * PCz); + if (boys_input > 3.e-7) { + const double sqrt_boys_input = sqrt(boys_input); + const double R000_0 = SQRTPIE4 / sqrt_boys_input * erf(sqrt_boys_input); + const double R000_1 = -a0 * (R000_0 - exp(-boys_input)) / boys_input; + deri_dAx_per_grid += prefactor * minus_two_a * (PAx * R000_0 + one_over_two_p * R000_1 * PCx); + deri_dAy_per_grid += prefactor * minus_two_a * (PAy * R000_0 + one_over_two_p * R000_1 * PCy); + deri_dAz_per_grid += prefactor * minus_two_a * (PAz * R000_0 + one_over_two_p * R000_1 * PCz); + } + } + + const double charge = grid_point[3]; + deri_dAx_grid_sum += deri_dAx_per_grid * charge; + deri_dAy_grid_sum += deri_dAy_per_grid * charge; + deri_dAz_grid_sum += deri_dAz_per_grid * charge; + } + + const int* ao_loc = c_bpcache.ao_loc; + const int i0 = ao_loc[ish] - ao_offsets_i; + const int j0 = ao_loc[jsh] - ao_offsets_j; + atomicAdd(output + (i0 + j0 * stride_j + 0 * stride_ij), deri_dAx_grid_sum); + atomicAdd(output + (i0 + j0 * stride_j + 1 * stride_ij), deri_dAy_grid_sum); + atomicAdd(output + (i0 + j0 * stride_j + 2 * stride_ij), deri_dAz_grid_sum); +} + +__global__ +static void GINTfill_int3c1e_ip2_density_contracted_kernel00(double* output, const double* density, const HermiteDensityOffsets hermite_density_offsets, + const BasisProdOffsets offsets, const int nprim_ij, + const double omega, const double* grid_points, const double* charge_exponents) +{ + const int ntasks_ij = offsets.ntasks_ij; + const int ngrids = offsets.ntasks_kl; + const int task_grid = blockIdx.y * blockDim.y + threadIdx.y; + if (task_grid >= ngrids) { + return; + } + + const double* grid_point = grid_points + task_grid * 3; + const double Cx = grid_point[0]; + const double Cy = grid_point[1]; + const double Cz = grid_point[2]; + const double charge_exponent = (charge_exponents != NULL) ? charge_exponents[task_grid] : 0.0; + + double deri_dCx_pair_sum = 0.0; + double deri_dCy_pair_sum = 0.0; + double deri_dCz_pair_sum = 0.0; + for (int task_ij = blockIdx.x * blockDim.x + threadIdx.x; task_ij < ntasks_ij; task_ij += gridDim.x * blockDim.x) { + const int bas_ij = offsets.bas_ij + task_ij; + const int prim_ij = offsets.primitive_ij + task_ij * nprim_ij; + + const double* __restrict__ a12 = c_bpcache.a12; + const double* __restrict__ e12 = c_bpcache.e12; + const double* __restrict__ x12 = c_bpcache.x12; + const double* __restrict__ y12 = c_bpcache.y12; + const double* __restrict__ z12 = c_bpcache.z12; + + double deri_dCx_per_pair = 0; + double deri_dCy_per_pair = 0; + double deri_dCz_per_pair = 0; + for (int ij = prim_ij; ij < prim_ij + nprim_ij; ij++) { + const double aij = a12[ij]; + const double eij = e12[ij]; + const double Px = x12[ij]; + const double Py = y12[ij]; + const double Pz = z12[ij]; + const double PCx = Px - Cx; + const double PCy = Py - Cy; + const double PCz = Pz - Cz; + double a0 = aij; + const double q_over_p_plus_q = charge_exponent > 0.0 ? charge_exponent / (aij + charge_exponent) : 1.0; + const double sqrt_q_over_p_plus_q = charge_exponent > 0.0 ? sqrt(q_over_p_plus_q) : 1.0; + a0 *= q_over_p_plus_q; + const double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0; + const double sqrt_theta = omega > 0.0 ? sqrt(theta) : 1.0; + a0 *= theta; + + const double prefactor = 2.0 * M_PI / aij * eij * sqrt_theta * sqrt_q_over_p_plus_q; + const double boys_input = a0 * (PCx * PCx + PCy * PCy + PCz * PCz); + if (boys_input > 3.e-7) { + const double sqrt_boys_input = sqrt(boys_input); + const double R000_0 = SQRTPIE4 / sqrt_boys_input * erf(sqrt_boys_input); + const double R000_1 = -a0 * (R000_0 - exp(-boys_input)) / boys_input; + deri_dCx_per_pair += prefactor * R000_1 * PCx; + deri_dCy_per_pair += prefactor * R000_1 * PCy; + deri_dCz_per_pair += prefactor * R000_1 * PCz; + } + } + + const double D = density[bas_ij - hermite_density_offsets.pair_offset_of_angular_pair + hermite_density_offsets.density_offset_of_angular_pair]; + deri_dCx_pair_sum += deri_dCx_per_pair * D; + deri_dCy_pair_sum += deri_dCy_per_pair * D; + deri_dCz_pair_sum += deri_dCz_per_pair * D; + } + atomicAdd(output + task_grid + 0 * ngrids, deri_dCx_pair_sum); + atomicAdd(output + task_grid + 1 * ngrids, deri_dCy_pair_sum); + atomicAdd(output + task_grid + 2 * ngrids, deri_dCz_pair_sum); +} diff --git a/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ip.cu b/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ip.cu index a670b4fd..45d8f6d3 100644 --- a/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ip.cu +++ b/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ip.cu @@ -80,16 +80,14 @@ static int GINTfill_int3c1e_ip1_charge_contracted_tasks(double* output, const Ba const dim3 blocks((ntasks_ij+THREADSX-1)/THREADSX, (ngrids+THREADSY-1)/THREADSY); int type_ijkl; switch (nrys_roots) { - // case 1: - // type_ijkl = (i_l << 2) | j_l; - // switch (type_ijkl) { - // case (0<<2)|0: GINTfill_int3c1e_ip1_charge_contracted_kernel00<<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; - // case (1<<2)|0: GINTfill_int3c1e_ip1_charge_contracted_kernel10<<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; - // default: - // fprintf(stderr, "roots=1 type_ijkl %d\n", type_ijkl); - // } - // break; - case 1: GINTfill_int3c1e_ip1_charge_contracted_kernel_general<1, GSIZE1_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 1: + type_ijkl = (i_l + 1) * 10 + j_l; + switch (type_ijkl) { + case 10: GINTfill_int3c1e_ip1_charge_contracted_kernel00<<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + default: + fprintf(stderr, "roots=1 type_ijkl %d\n", type_ijkl); + } + break; case 2: GINTfill_int3c1e_ip1_charge_contracted_kernel_general<2, GSIZE2_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; case 3: GINTfill_int3c1e_ip1_charge_contracted_kernel_general<3, GSIZE3_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; case 4: GINTfill_int3c1e_ip1_charge_contracted_kernel_general<4, GSIZE4_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; @@ -120,16 +118,14 @@ static int GINTfill_int3c1e_ip2_density_contracted_tasks(double* output, const d const dim3 blocks((ntasks_ij+THREADSX-1)/THREADSX, (ngrids+THREADSY-1)/THREADSY); int type_ijkl; switch (nrys_roots) { - // case 1: - // type_ijkl = (i_l << 2) | j_l; - // switch (type_ijkl) { - // case (0<<2)|0: GINTfill_int3c1e_ip2_density_contracted_kernel00<<>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break; - // case (1<<2)|0: GINTfill_int3c1e_ip2_density_contracted_kernel10<<>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break; - // default: - // fprintf(stderr, "roots=1 type_ijkl %d\n", type_ijkl); - // } - // break; - case 1: GINTfill_int3c1e_ip2_density_contracted_kernel_general<1> <<>>(output, density, hermite_density_offsets, offsets, i_l, j_l, nprim_ij, omega, grid_points, charge_exponents); break; + case 1: + type_ijkl = (i_l + 1) * 10 + j_l; + switch (type_ijkl) { + case 10: GINTfill_int3c1e_ip2_density_contracted_kernel00<<>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break; + default: + fprintf(stderr, "roots=1 type_ijkl %d\n", type_ijkl); + } + break; case 2: GINTfill_int3c1e_ip2_density_contracted_kernel_general<2> <<>>(output, density, hermite_density_offsets, offsets, i_l, j_l, nprim_ij, omega, grid_points, charge_exponents); break; case 3: GINTfill_int3c1e_ip2_density_contracted_kernel_general<3> <<>>(output, density, hermite_density_offsets, offsets, i_l, j_l, nprim_ij, omega, grid_points, charge_exponents); break; case 4: GINTfill_int3c1e_ip2_density_contracted_kernel_general<4> <<>>(output, density, hermite_density_offsets, offsets, i_l, j_l, nprim_ij, omega, grid_points, charge_exponents); break;