diff --git a/gpu4pyscf/lib/gint/g1e_ip_root_1.cu b/gpu4pyscf/lib/gint/g1e_ip_root_1.cu
index ebbe5449..cd080024 100644
--- a/gpu4pyscf/lib/gint/g1e_ip_root_1.cu
+++ b/gpu4pyscf/lib/gint/g1e_ip_root_1.cu
@@ -114,3 +114,171 @@ static void GINTfill_int3c1e_ip_kernel00(double* output, const BasisProdOffsets
     output[i0 + j0 * stride_j + task_grid * stride_ij + 4 * stride_ij * ngrids] = deri_dCy;
     output[i0 + j0 * stride_j + task_grid * stride_ij + 5 * stride_ij * ngrids] = deri_dCz;
 }
+
+__global__
+static void GINTfill_int3c1e_ip1_charge_contracted_kernel00(double* output, const BasisProdOffsets offsets, const int nprim_ij,
+                                                            const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j,
+                                                            const double omega, const double* grid_points, const double* charge_exponents)
+{
+    const int ntasks_ij = offsets.ntasks_ij;
+    const int ngrids = offsets.ntasks_kl;
+    const int task_ij = blockIdx.x * blockDim.x + threadIdx.x;
+    if (task_ij >= ntasks_ij) {
+        return;
+    }
+
+    const int bas_ij = offsets.bas_ij + task_ij;
+    const int prim_ij = offsets.primitive_ij + task_ij * nprim_ij;
+    const int* bas_pair2bra = c_bpcache.bas_pair2bra;
+    const int* bas_pair2ket = c_bpcache.bas_pair2ket;
+    const int ish = bas_pair2bra[bas_ij];
+    const int jsh = bas_pair2ket[bas_ij];
+
+    const double* __restrict__ a12 = c_bpcache.a12;
+    const double* __restrict__ e12 = c_bpcache.e12;
+    const double* __restrict__ x12 = c_bpcache.x12;
+    const double* __restrict__ y12 = c_bpcache.y12;
+    const double* __restrict__ z12 = c_bpcache.z12;
+
+    const double* __restrict__ a_exponents = c_bpcache.a1;
+    const int nbas = c_bpcache.nbas;
+    const double* __restrict__ bas_x = c_bpcache.bas_coords;
+    const double* __restrict__ bas_y = bas_x + nbas;
+    const double* __restrict__ bas_z = bas_y + nbas;
+    const double Ax = bas_x[ish];
+    const double Ay = bas_y[ish];
+    const double Az = bas_z[ish];
+
+    double deri_dAx_grid_sum = 0;
+    double deri_dAy_grid_sum = 0;
+    double deri_dAz_grid_sum = 0;
+    for (int task_grid = blockIdx.y * blockDim.y + threadIdx.y; task_grid < ngrids; task_grid += gridDim.y * blockDim.y) {
+        const double* grid_point = grid_points + task_grid * 4;
+        const double Cx = grid_point[0];
+        const double Cy = grid_point[1];
+        const double Cz = grid_point[2];
+        const double charge_exponent = (charge_exponents != NULL) ? charge_exponents[task_grid] : 0.0;
+
+        double deri_dAx_per_grid = 0;
+        double deri_dAy_per_grid = 0;
+        double deri_dAz_per_grid = 0;
+        for (int ij = prim_ij; ij < prim_ij + nprim_ij; ij++) {
+            const double aij = a12[ij];
+            const double eij = e12[ij];
+            const double Px  = x12[ij];
+            const double Py  = y12[ij];
+            const double Pz  = z12[ij];
+            const double PCx = Px - Cx;
+            const double PCy = Py - Cy;
+            const double PCz = Pz - Cz;
+            const double PAx = Px - Ax;
+            const double PAy = Py - Ay;
+            const double PAz = Pz - Az;
+            const double minus_two_a = -2.0 * a_exponents[ij];
+            const double one_over_two_p = 0.5 / aij;
+            double a0 = aij;
+            const double q_over_p_plus_q = charge_exponent > 0.0 ? charge_exponent / (aij + charge_exponent) : 1.0;
+            const double sqrt_q_over_p_plus_q = charge_exponent > 0.0 ? sqrt(q_over_p_plus_q) : 1.0;
+            a0 *= q_over_p_plus_q;
+            const double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0;
+            const double sqrt_theta = omega > 0.0 ? sqrt(theta) : 1.0;
+            a0 *= theta;
+
+            const double prefactor = 2.0 * M_PI / aij * eij * sqrt_theta * sqrt_q_over_p_plus_q;
+            const double boys_input = a0 * (PCx * PCx + PCy * PCy + PCz * PCz);
+            if (boys_input > 3.e-7) {
+                const double sqrt_boys_input = sqrt(boys_input);
+                const double R000_0 = SQRTPIE4 / sqrt_boys_input * erf(sqrt_boys_input);
+                const double R000_1 = -a0 * (R000_0 - exp(-boys_input)) / boys_input;
+                deri_dAx_per_grid += prefactor * minus_two_a * (PAx * R000_0 + one_over_two_p * R000_1 * PCx);
+                deri_dAy_per_grid += prefactor * minus_two_a * (PAy * R000_0 + one_over_two_p * R000_1 * PCy);
+                deri_dAz_per_grid += prefactor * minus_two_a * (PAz * R000_0 + one_over_two_p * R000_1 * PCz);
+            }
+        }
+
+        const double charge = grid_point[3];
+        deri_dAx_grid_sum += deri_dAx_per_grid * charge;
+        deri_dAy_grid_sum += deri_dAy_per_grid * charge;
+        deri_dAz_grid_sum += deri_dAz_per_grid * charge;
+    }
+
+    const int* ao_loc = c_bpcache.ao_loc;
+    const int i0 = ao_loc[ish] - ao_offsets_i;
+    const int j0 = ao_loc[jsh] - ao_offsets_j;
+    atomicAdd(output + (i0 + j0 * stride_j + 0 * stride_ij), deri_dAx_grid_sum);
+    atomicAdd(output + (i0 + j0 * stride_j + 1 * stride_ij), deri_dAy_grid_sum);
+    atomicAdd(output + (i0 + j0 * stride_j + 2 * stride_ij), deri_dAz_grid_sum);
+}
+
+__global__
+static void GINTfill_int3c1e_ip2_density_contracted_kernel00(double* output, const double* density, const HermiteDensityOffsets hermite_density_offsets,
+                                                             const BasisProdOffsets offsets, const int nprim_ij,
+                                                             const double omega, const double* grid_points, const double* charge_exponents)
+{
+    const int ntasks_ij = offsets.ntasks_ij;
+    const int ngrids = offsets.ntasks_kl;
+    const int task_grid = blockIdx.y * blockDim.y + threadIdx.y;
+    if (task_grid >= ngrids) {
+        return;
+    }
+
+    const double* grid_point = grid_points + task_grid * 3;
+    const double Cx = grid_point[0];
+    const double Cy = grid_point[1];
+    const double Cz = grid_point[2];
+    const double charge_exponent = (charge_exponents != NULL) ? charge_exponents[task_grid] : 0.0;
+
+    double deri_dCx_pair_sum = 0.0;
+    double deri_dCy_pair_sum = 0.0;
+    double deri_dCz_pair_sum = 0.0;
+    for (int task_ij = blockIdx.x * blockDim.x + threadIdx.x; task_ij < ntasks_ij; task_ij += gridDim.x * blockDim.x) {
+        const int bas_ij = offsets.bas_ij + task_ij;
+        const int prim_ij = offsets.primitive_ij + task_ij * nprim_ij;
+
+        const double* __restrict__ a12 = c_bpcache.a12;
+        const double* __restrict__ e12 = c_bpcache.e12;
+        const double* __restrict__ x12 = c_bpcache.x12;
+        const double* __restrict__ y12 = c_bpcache.y12;
+        const double* __restrict__ z12 = c_bpcache.z12;
+
+        double deri_dCx_per_pair = 0;
+        double deri_dCy_per_pair = 0;
+        double deri_dCz_per_pair = 0;
+        for (int ij = prim_ij; ij < prim_ij + nprim_ij; ij++) {
+            const double aij = a12[ij];
+            const double eij = e12[ij];
+            const double Px  = x12[ij];
+            const double Py  = y12[ij];
+            const double Pz  = z12[ij];
+            const double PCx = Px - Cx;
+            const double PCy = Py - Cy;
+            const double PCz = Pz - Cz;
+            double a0 = aij;
+            const double q_over_p_plus_q = charge_exponent > 0.0 ? charge_exponent / (aij + charge_exponent) : 1.0;
+            const double sqrt_q_over_p_plus_q = charge_exponent > 0.0 ? sqrt(q_over_p_plus_q) : 1.0;
+            a0 *= q_over_p_plus_q;
+            const double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0;
+            const double sqrt_theta = omega > 0.0 ? sqrt(theta) : 1.0;
+            a0 *= theta;
+
+            const double prefactor = 2.0 * M_PI / aij * eij * sqrt_theta * sqrt_q_over_p_plus_q;
+            const double boys_input = a0 * (PCx * PCx + PCy * PCy + PCz * PCz);
+            if (boys_input > 3.e-7) {
+                const double sqrt_boys_input = sqrt(boys_input);
+                const double R000_0 = SQRTPIE4 / sqrt_boys_input * erf(sqrt_boys_input);
+                const double R000_1 = -a0 * (R000_0 - exp(-boys_input)) / boys_input;
+                deri_dCx_per_pair += prefactor * R000_1 * PCx;
+                deri_dCy_per_pair += prefactor * R000_1 * PCy;
+                deri_dCz_per_pair += prefactor * R000_1 * PCz;
+            }
+        }
+
+        const double D = density[bas_ij - hermite_density_offsets.pair_offset_of_angular_pair + hermite_density_offsets.density_offset_of_angular_pair];
+        deri_dCx_pair_sum += deri_dCx_per_pair * D;
+        deri_dCy_pair_sum += deri_dCy_per_pair * D;
+        deri_dCz_pair_sum += deri_dCz_per_pair * D;
+    }
+    atomicAdd(output + task_grid + 0 * ngrids, deri_dCx_pair_sum);
+    atomicAdd(output + task_grid + 1 * ngrids, deri_dCy_pair_sum);
+    atomicAdd(output + task_grid + 2 * ngrids, deri_dCz_pair_sum);
+}
diff --git a/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ip.cu b/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ip.cu
index a670b4fd..45d8f6d3 100644
--- a/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ip.cu
+++ b/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ip.cu
@@ -80,16 +80,14 @@ static int GINTfill_int3c1e_ip1_charge_contracted_tasks(double* output, const Ba
     const dim3 blocks((ntasks_ij+THREADSX-1)/THREADSX, (ngrids+THREADSY-1)/THREADSY);
     int type_ijkl;
     switch (nrys_roots) {
-    // case 1:
-    //     type_ijkl = (i_l << 2) | j_l;
-    //     switch (type_ijkl) {
-    //     case (0<<2)|0: GINTfill_int3c1e_ip1_charge_contracted_kernel00<<<blocks, threads, 0, stream>>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break;
-    //     case (1<<2)|0: GINTfill_int3c1e_ip1_charge_contracted_kernel10<<<blocks, threads, 0, stream>>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break;
-    //     default:
-    //         fprintf(stderr, "roots=1 type_ijkl %d\n", type_ijkl);
-    //     }
-    //     break;
-    case 1: GINTfill_int3c1e_ip1_charge_contracted_kernel_general<1, GSIZE1_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break;
+    case 1:
+        type_ijkl = (i_l + 1) * 10 + j_l;
+        switch (type_ijkl) {
+        case 10: GINTfill_int3c1e_ip1_charge_contracted_kernel00<<<blocks, threads, 0, stream>>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break;
+        default:
+            fprintf(stderr, "roots=1 type_ijkl %d\n", type_ijkl);
+        }
+        break;
     case 2: GINTfill_int3c1e_ip1_charge_contracted_kernel_general<2, GSIZE2_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break;
     case 3: GINTfill_int3c1e_ip1_charge_contracted_kernel_general<3, GSIZE3_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break;
     case 4: GINTfill_int3c1e_ip1_charge_contracted_kernel_general<4, GSIZE4_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break;
@@ -120,16 +118,14 @@ static int GINTfill_int3c1e_ip2_density_contracted_tasks(double* output, const d
     const dim3 blocks((ntasks_ij+THREADSX-1)/THREADSX, (ngrids+THREADSY-1)/THREADSY);
     int type_ijkl;
     switch (nrys_roots) {
-    // case 1:
-    //     type_ijkl = (i_l << 2) | j_l;
-    //     switch (type_ijkl) {
-    //     case (0<<2)|0: GINTfill_int3c1e_ip2_density_contracted_kernel00<<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break;
-    //     case (1<<2)|0: GINTfill_int3c1e_ip2_density_contracted_kernel10<<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break;
-    //     default:
-    //         fprintf(stderr, "roots=1 type_ijkl %d\n", type_ijkl);
-    //     }
-    //     break;
-    case 1: GINTfill_int3c1e_ip2_density_contracted_kernel_general<1> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, i_l, j_l, nprim_ij, omega, grid_points, charge_exponents); break;
+    case 1:
+        type_ijkl = (i_l + 1) * 10 + j_l;
+        switch (type_ijkl) {
+        case 10: GINTfill_int3c1e_ip2_density_contracted_kernel00<<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break;
+        default:
+            fprintf(stderr, "roots=1 type_ijkl %d\n", type_ijkl);
+        }
+        break;
     case 2: GINTfill_int3c1e_ip2_density_contracted_kernel_general<2> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, i_l, j_l, nprim_ij, omega, grid_points, charge_exponents); break;
     case 3: GINTfill_int3c1e_ip2_density_contracted_kernel_general<3> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, i_l, j_l, nprim_ij, omega, grid_points, charge_exponents); break;
     case 4: GINTfill_int3c1e_ip2_density_contracted_kernel_general<4> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, i_l, j_l, nprim_ij, omega, grid_points, charge_exponents); break;