Add ss special case for int1e derivative

bytedance · Dec 11, 2024 · ed0ef9e · ed0ef9e
1 parent 06bf584
commit ed0ef9e
Show file tree

Hide file tree

Showing 4 changed files with 126 additions and 12 deletions.
diff --git a/gpu4pyscf/lib/gint/g1e_ip_root_1.cu b/gpu4pyscf/lib/gint/g1e_ip_root_1.cu
@@ -0,0 +1,116 @@
+/* Copyright 2024 The GPU4PySCF Authors. All Rights Reserved.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <math.h>
+#include "cint2e.cuh"
+
+__global__
+static void GINTfill_int3c1e_ip_kernel00(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij,
+                                         const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j,
+                                         const double omega, const double* grid_points, const double* charge_exponents)
+{
+    const int ntasks_ij = offsets.ntasks_ij;
+    const int ngrids = offsets.ntasks_kl;
+    const int task_ij = blockIdx.x * blockDim.x + threadIdx.x;
+    const int task_grid = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (task_ij >= ntasks_ij || task_grid >= ngrids) {
+        return;
+    }
+    const int bas_ij = offsets.bas_ij + task_ij;
+    const int prim_ij = offsets.primitive_ij + task_ij * nprim_ij;
+    const int* bas_pair2bra = c_bpcache.bas_pair2bra;
+    const int* bas_pair2ket = c_bpcache.bas_pair2ket;
+    const int ish = bas_pair2bra[bas_ij];
+    const int jsh = bas_pair2ket[bas_ij];
+
+    const double* __restrict__ a12 = c_bpcache.a12;
+    const double* __restrict__ e12 = c_bpcache.e12;
+    const double* __restrict__ x12 = c_bpcache.x12;
+    const double* __restrict__ y12 = c_bpcache.y12;
+    const double* __restrict__ z12 = c_bpcache.z12;
+
+    const double* __restrict__ a_exponents = c_bpcache.a1;
+    const int nbas = c_bpcache.nbas;
+    const double* __restrict__ bas_x = c_bpcache.bas_coords;
+    const double* __restrict__ bas_y = bas_x + nbas;
+    const double* __restrict__ bas_z = bas_y + nbas;
+    const double Ax = bas_x[ish];
+    const double Ay = bas_y[ish];
+    const double Az = bas_z[ish];
+
+    const double* grid_point = grid_points + task_grid * 3;
+    const double Cx = grid_point[0];
+    const double Cy = grid_point[1];
+    const double Cz = grid_point[2];
+    const double charge_exponent = (charge_exponents != NULL) ? charge_exponents[task_grid] : 0.0;
+
+    double deri_dAx = 0;
+    double deri_dAy = 0;
+    double deri_dAz = 0;
+    double deri_dCx = 0;
+    double deri_dCy = 0;
+    double deri_dCz = 0;
+    for (int ij = prim_ij; ij < prim_ij + nprim_ij; ij++) {
+        const double aij = a12[ij];
+        const double eij = e12[ij];
+        const double Px  = x12[ij];
+        const double Py  = y12[ij];
+        const double Pz  = z12[ij];
+        const double PCx = Px - Cx;
+        const double PCy = Py - Cy;
+        const double PCz = Pz - Cz;
+        const double PAx = Px - Ax;
+        const double PAy = Py - Ay;
+        const double PAz = Pz - Az;
+        const double minus_two_a = -2.0 * a_exponents[ij];
+        const double one_over_two_p = 0.5 / aij;
+        double a0 = aij;
+        const double q_over_p_plus_q = charge_exponent > 0.0 ? charge_exponent / (aij + charge_exponent) : 1.0;
+        const double sqrt_q_over_p_plus_q = charge_exponent > 0.0 ? sqrt(q_over_p_plus_q) : 1.0;
+        a0 *= q_over_p_plus_q;
+        const double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0;
+        const double sqrt_theta = omega > 0.0 ? sqrt(theta) : 1.0;
+        a0 *= theta;
+
+        const double prefactor = 2.0 * M_PI / aij * eij * sqrt_theta * sqrt_q_over_p_plus_q;
+        const double boys_input = a0 * (PCx * PCx + PCy * PCy + PCz * PCz);
+        if (boys_input > 3.e-7) {
+            const double sqrt_boys_input = sqrt(boys_input);
+            const double R000_0 = SQRTPIE4 / sqrt_boys_input * erf(sqrt_boys_input);
+            const double R000_1 = -a0 * (R000_0 - exp(-boys_input)) / boys_input;
+            const double R100_0 = R000_1 * PCx;
+            const double R010_0 = R000_1 * PCy;
+            const double R001_0 = R000_1 * PCz;
+            deri_dAx += prefactor * minus_two_a * (PAx * R000_0 + one_over_two_p * R100_0);
+            deri_dAy += prefactor * minus_two_a * (PAy * R000_0 + one_over_two_p * R010_0);
+            deri_dAz += prefactor * minus_two_a * (PAz * R000_0 + one_over_two_p * R001_0);
+            deri_dCx += prefactor * R100_0;
+            deri_dCy += prefactor * R010_0;
+            deri_dCz += prefactor * R001_0;
+        }
+    }
+
+    const int* ao_loc = c_bpcache.ao_loc;
+    const int i0 = ao_loc[ish] - ao_offsets_i;
+    const int j0 = ao_loc[jsh] - ao_offsets_j;
+    output[i0 + j0 * stride_j + task_grid * stride_ij + 0 * stride_ij * ngrids] = deri_dAx;
+    output[i0 + j0 * stride_j + task_grid * stride_ij + 1 * stride_ij * ngrids] = deri_dAy;
+    output[i0 + j0 * stride_j + task_grid * stride_ij + 2 * stride_ij * ngrids] = deri_dAz;
+    output[i0 + j0 * stride_j + task_grid * stride_ij + 3 * stride_ij * ngrids] = deri_dCx;
+    output[i0 + j0 * stride_j + task_grid * stride_ij + 4 * stride_ij * ngrids] = deri_dCy;
+    output[i0 + j0 * stride_j + task_grid * stride_ij + 5 * stride_ij * ngrids] = deri_dCz;
+}
diff --git a/gpu4pyscf/lib/gint/g1e_root_123.cu → gpu4pyscf/lib/gint/g1e_root_1.cu b/gpu4pyscf/lib/gint/g1e_root_123.cu → gpu4pyscf/lib/gint/g1e_root_1.cu
diff --git a/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e.cu b/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e.cu
@@ -27,7 +27,7 @@
 
 #include "rys_roots.cu"
 #include "g1e.cu"
-#include "g1e_root_123.cu"
+#include "g1e_root_1.cu"
 #include "g3c1e.cu"
 
 static int GINTfill_int3c1e_tasks(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij,

diff --git a/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ip.cu b/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ip.cu
@@ -27,7 +27,7 @@
 
 #include "rys_roots.cu"
 #include "g1e.cu"
-// #include "g1e_root_123.cu"
+#include "g1e_ip_root_1.cu"
 #include "g3c1e_ip.cu"
 
 static int GINTfill_int3c1e_ip_tasks(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij,
@@ -42,16 +42,14 @@ static int GINTfill_int3c1e_ip_tasks(double* output, const BasisProdOffsets offs
     const dim3 blocks((ntasks_ij+THREADSX-1)/THREADSX, (ngrids+THREADSY-1)/THREADSY);
     int type_ijkl;
     switch (nrys_roots) {
-    // case 1:
-    //     type_ijkl = (i_l << 2) | j_l;
-    //     switch (type_ijkl) {
-    //     case (0<<2)|0: GINTfill_int3c1e_kernel00<<<blocks, threads, 0, stream>>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break;
-    //     case (1<<2)|0: GINTfill_int3c1e_kernel10<<<blocks, threads, 0, stream>>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break;
-    //     default:
-    //         fprintf(stderr, "roots=1 type_ijkl %d\n", type_ijkl);
-    //     }
-    //     break;
-    case 1: GINTfill_int3c1e_ip_kernel_general<1, GSIZE1_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break;
+    case 1:
+        type_ijkl = (i_l + 1) * 10 + j_l;
+        switch (type_ijkl) {
+        case 10: GINTfill_int3c1e_ip_kernel00<<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break;
+        default:
+            fprintf(stderr, "roots=1 type_ijkl %d\n", type_ijkl);
+        }
+        break;
     case 2: GINTfill_int3c1e_ip_kernel_general<2, GSIZE2_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break;
     case 3: GINTfill_int3c1e_ip_kernel_general<3, GSIZE3_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break;
     case 4: GINTfill_int3c1e_ip_kernel_general<4, GSIZE4_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break;