diff --git a/gpu4pyscf/lib/gint/g3c1e.cu b/gpu4pyscf/lib/gint/g3c1e.cu index f8ee07c7..4bf449d7 100644 --- a/gpu4pyscf/lib/gint/g3c1e.cu +++ b/gpu4pyscf/lib/gint/g3c1e.cu @@ -92,6 +92,98 @@ static void GINTfill_int3c1e_kernel_general(double* output, const BasisProdOffse } } +template +__device__ +static void GINTwrite_int3c1e_charge_contracted(const double* g, double* local_output, const double prefactor) +{ + constexpr int NROOTS = (LI + LJ) / 2 + 1; + + const int *idx = c_idx; + const int *idy = c_idx + TOT_NF; + const int *idz = c_idx + TOT_NF * 2; + + const int g_size = NROOTS * (LI + 1) * (LJ + 1); + const double* __restrict__ gx = g; + const double* __restrict__ gy = g + g_size; + const double* __restrict__ gz = g + g_size * 2; + + constexpr int n_density_elements_i = (LI + 1) * (LI + 2) / 2; + constexpr int n_density_elements_j = (LJ + 1) * (LJ + 2) / 2; +#pragma unroll + for (int j = 0; j < n_density_elements_j; j++) { +#pragma unroll + for (int i = 0; i < n_density_elements_i; i++) { + const int loc_j = c_l_locs[LJ] + j; + const int loc_i = c_l_locs[LI] + i; + + const int ix = (idx[loc_i] + idx[loc_j] * (LI + 1)) * NROOTS; + const int iy = (idy[loc_i] + idy[loc_j] * (LI + 1)) * NROOTS; + const int iz = (idz[loc_i] + idz[loc_j] * (LI + 1)) * NROOTS; + + double eri = 0; +#pragma unroll + for (int i_root = 0; i_root < NROOTS; i_root++) { + eri += gx[ix + i_root] * gy[iy + i_root] * gz[iz + i_root]; + } + local_output[i + j * n_density_elements_i] += eri * prefactor; + } + } +} + +template +__global__ +static void GINTfill_int3c1e_charge_contracted_kernel_expanded(double* output, const BasisProdOffsets offsets, const int nprim_ij, + const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j, + const double omega, const double* grid_points, const double* charge_exponents) +{ + constexpr int L_SUM = LI + LJ; + constexpr int NROOTS = L_SUM / 2 + 1; + + const int ntasks_ij = offsets.ntasks_ij; + const int ngrids = offsets.ntasks_kl; + const int task_ij = blockIdx.x * blockDim.x + threadIdx.x; + if (task_ij >= ntasks_ij) { + return; + } + + const int bas_ij = offsets.bas_ij + task_ij; + const int prim_ij = offsets.primitive_ij + task_ij * nprim_ij; + const int* bas_pair2bra = c_bpcache.bas_pair2bra; + const int* bas_pair2ket = c_bpcache.bas_pair2ket; + const int ish = bas_pair2bra[bas_ij]; + const int jsh = bas_pair2ket[bas_ij]; + + constexpr int n_density_elements_i = (LI + 1) * (LI + 2) / 2; + constexpr int n_density_elements_j = (LJ + 1) * (LJ + 2) / 2; + double output_cache[n_density_elements_i * n_density_elements_j] { 0.0 }; + + for (int task_grid = blockIdx.y * blockDim.y + threadIdx.y; task_grid < ngrids; task_grid += gridDim.y * blockDim.y) { + const double* grid_point = grid_points + task_grid * 4; + const double charge = grid_point[3]; + const double charge_exponent = (charge_exponents != NULL) ? charge_exponents[task_grid] : 0.0; + + double g[3 * NROOTS * (LI + 1) * (LJ + 1)]; + + for (int ij = prim_ij; ij < prim_ij+nprim_ij; ++ij) { + GINT_g1e(g, grid_point, ish, jsh, ij, LI, LJ, charge_exponent, omega); + GINTwrite_int3c1e_charge_contracted(g, output_cache, charge); + } + } + + const int* ao_loc = c_bpcache.ao_loc; + + const int i0 = ao_loc[ish] - ao_offsets_i; + const int j0 = ao_loc[jsh] - ao_offsets_j; +#pragma unroll + for (int j = 0; j < n_density_elements_j; j++) { +#pragma unroll + for (int i = 0; i < n_density_elements_i; i++) { + const double eri_grid_sum = output_cache[i + j * n_density_elements_i]; + atomicAdd(output + ((i + i0) + (j + j0) * stride_j), eri_grid_sum); + } + } +} + template __device__ static void GINTwrite_int3c1e_charge_contracted(const double* g, double* local_output, const double prefactor, const int i_l, const int j_l) @@ -110,13 +202,9 @@ static void GINTwrite_int3c1e_charge_contracted(const double* g, double* local_o const int loc_j = c_l_locs[j_l] + j; const int loc_i = c_l_locs[i_l] + i; - int ix = idx[loc_i] + idx[loc_j] * (i_l + 1); - int iy = idy[loc_i] + idy[loc_j] * (i_l + 1); - int iz = idz[loc_i] + idz[loc_j] * (i_l + 1); - - ix = ix * NROOTS; - iy = iy * NROOTS; - iz = iz * NROOTS; + const int ix = (idx[loc_i] + idx[loc_j] * (i_l + 1)) * NROOTS; + const int iy = (idy[loc_i] + idy[loc_j] * (i_l + 1)) * NROOTS; + const int iz = (idz[loc_i] + idz[loc_j] * (i_l + 1)) * NROOTS; double eri = 0; #pragma unroll diff --git a/gpu4pyscf/lib/gint/g3c1e_ip.cu b/gpu4pyscf/lib/gint/g3c1e_ip.cu index 85831dfb..ede4b125 100644 --- a/gpu4pyscf/lib/gint/g3c1e_ip.cu +++ b/gpu4pyscf/lib/gint/g3c1e_ip.cu @@ -136,6 +136,123 @@ static void GINTfill_int3c1e_ip_kernel_general(double* output, const BasisProdOf } } +template +__device__ +static void GINTwrite_int3c1e_ip1_charge_contracted(const double* g, double* local_output, const double minus_two_a, const double prefactor) +{ + constexpr int NROOTS = (LI + LJ + 1) / 2 + 1; + + const int *idx = c_idx; + const int *idy = c_idx + TOT_NF; + const int *idz = c_idx + TOT_NF * 2; + + const int g_size = NROOTS * (LI + 1 + 1) * (LJ + 1); + const double* __restrict__ gx = g; + const double* __restrict__ gy = g + g_size; + const double* __restrict__ gz = g + g_size * 2; + + constexpr int n_density_elements_i = (LI + 1) * (LI + 2) / 2; + constexpr int n_density_elements_j = (LJ + 1) * (LJ + 2) / 2; + constexpr int n_density_elements_ij = n_density_elements_i * n_density_elements_j; +#pragma unroll + for (int j = 0; j < n_density_elements_j; j++) { +#pragma unroll + for (int i = 0; i < n_density_elements_i; i++) { + const int loc_j = c_l_locs[LJ] + j; + const int loc_i = c_l_locs[LI] + i; + const int ix = idx[loc_i]; + const int iy = idy[loc_i]; + const int iz = idz[loc_i]; + const int jx = idx[loc_j]; + const int jy = idy[loc_j]; + const int jz = idz[loc_j]; + const int gx_offset = ix + jx * (LI + 1 + 1); + const int gy_offset = iy + jy * (LI + 1 + 1); + const int gz_offset = iz + jz * (LI + 1 + 1); + + double deri_dAx = 0; + double deri_dAy = 0; + double deri_dAz = 0; +#pragma unroll + for (int i_root = 0; i_root < NROOTS; i_root++) { + const double gx_0 = gx[gx_offset * NROOTS + i_root]; + const double gy_0 = gy[gy_offset * NROOTS + i_root]; + const double gz_0 = gz[gz_offset * NROOTS + i_root]; + const double dgx_dAx = (ix > 0 ? ix * gx[(gx_offset - 1) * NROOTS + i_root] : 0) + minus_two_a * gx[(gx_offset + 1) * NROOTS + i_root]; + const double dgy_dAy = (iy > 0 ? iy * gy[(gy_offset - 1) * NROOTS + i_root] : 0) + minus_two_a * gy[(gy_offset + 1) * NROOTS + i_root]; + const double dgz_dAz = (iz > 0 ? iz * gz[(gz_offset - 1) * NROOTS + i_root] : 0) + minus_two_a * gz[(gz_offset + 1) * NROOTS + i_root]; + deri_dAx += dgx_dAx * gy_0 * gz_0; + deri_dAy += gx_0 * dgy_dAy * gz_0; + deri_dAz += gx_0 * gy_0 * dgz_dAz; + } + local_output[i + j * n_density_elements_i + 0 * n_density_elements_ij] += deri_dAx * prefactor; + local_output[i + j * n_density_elements_i + 1 * n_density_elements_ij] += deri_dAy * prefactor; + local_output[i + j * n_density_elements_i + 2 * n_density_elements_ij] += deri_dAz * prefactor; + } + } +} + +template +__global__ +static void GINTfill_int3c1e_ip1_charge_contracted_kernel_expanded(double* output, const BasisProdOffsets offsets, const int nprim_ij, + const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j, + const double omega, const double* grid_points, const double* charge_exponents) +{ + constexpr int L_SUM = LI + LJ; + constexpr int NROOTS = (L_SUM + 1) / 2 + 1; + + const int ntasks_ij = offsets.ntasks_ij; + const int ngrids = offsets.ntasks_kl; + const int task_ij = blockIdx.x * blockDim.x + threadIdx.x; + if (task_ij >= ntasks_ij) { + return; + } + + const int bas_ij = offsets.bas_ij + task_ij; + const int prim_ij = offsets.primitive_ij + task_ij * nprim_ij; + const int* bas_pair2bra = c_bpcache.bas_pair2bra; + const int* bas_pair2ket = c_bpcache.bas_pair2ket; + const int ish = bas_pair2bra[bas_ij]; + const int jsh = bas_pair2ket[bas_ij]; + const double* __restrict__ a_exponents = c_bpcache.a1; + + constexpr int n_density_elements_i = (LI + 1) * (LI + 2) / 2; + constexpr int n_density_elements_j = (LJ + 1) * (LJ + 2) / 2; + double output_cache[n_density_elements_i * n_density_elements_j * 3] { 0.0 }; + + for (int task_grid = blockIdx.y * blockDim.y + threadIdx.y; task_grid < ngrids; task_grid += gridDim.y * blockDim.y) { + const double* grid_point = grid_points + task_grid * 4; + const double charge = grid_point[3]; + const double charge_exponent = (charge_exponents != NULL) ? charge_exponents[task_grid] : 0.0; + + double g[3 * NROOTS * (LI + 1 + 1) * (LJ + 1)]; + + for (int ij = prim_ij; ij < prim_ij+nprim_ij; ++ij) { + GINT_g1e(g, grid_point, ish, jsh, ij, LI + 1, LJ, charge_exponent, omega); + const double minus_two_a = -2.0 * a_exponents[ij]; + GINTwrite_int3c1e_ip1_charge_contracted(g, output_cache, minus_two_a, charge); + } + } + + const int* ao_loc = c_bpcache.ao_loc; + + const int i0 = ao_loc[ish] - ao_offsets_i; + const int j0 = ao_loc[jsh] - ao_offsets_j; + constexpr int n_density_elements_ij = n_density_elements_i * n_density_elements_j; +#pragma unroll + for (int j = 0; j < n_density_elements_j; j++) { +#pragma unroll + for (int i = 0; i < n_density_elements_i; i++) { + const double deri_dAx = output_cache[i + j * n_density_elements_i + 0 * n_density_elements_ij]; + const double deri_dAy = output_cache[i + j * n_density_elements_i + 1 * n_density_elements_ij]; + const double deri_dAz = output_cache[i + j * n_density_elements_i + 2 * n_density_elements_ij]; + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 0 * stride_ij), deri_dAx); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 1 * stride_ij), deri_dAy); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 2 * stride_ij), deri_dAz); + } + } +} + template __device__ static void GINTwrite_int3c1e_ip1_charge_contracted(const double* g, double* local_output, const double minus_two_a, const double prefactor, const int i_l, const int j_l) diff --git a/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e.cu b/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e.cu index e2586764..87eeb87e 100644 --- a/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e.cu +++ b/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e.cu @@ -73,30 +73,33 @@ static int GINTfill_int3c1e_charge_contracted_tasks(double* output, const BasisP const double omega, const double* grid_points, const double* charge_exponents, const int n_charge_sum_per_thread, const cudaStream_t stream) { - const int nrys_roots = (i_l + j_l) / 2 + 1; const int ntasks_ij = offsets.ntasks_ij; const int ngrids = (offsets.ntasks_kl + n_charge_sum_per_thread - 1) / n_charge_sum_per_thread; const dim3 threads(THREADSX, THREADSY); const dim3 blocks((ntasks_ij+THREADSX-1)/THREADSX, (ngrids+THREADSY-1)/THREADSY); - int type_ijkl; - switch (nrys_roots) { - case 1: - type_ijkl = (i_l << 2) | j_l; - switch (type_ijkl) { - case (0<<2)|0: GINTfill_int3c1e_charge_contracted_kernel00<<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; - case (1<<2)|0: GINTfill_int3c1e_charge_contracted_kernel10<<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + const int type_ij = i_l * 10 + j_l; + switch (type_ij) { + case 00: GINTfill_int3c1e_charge_contracted_kernel00<<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 10: GINTfill_int3c1e_charge_contracted_kernel10<<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 11: GINTfill_int3c1e_charge_contracted_kernel_expanded<1, 1> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 20: GINTfill_int3c1e_charge_contracted_kernel_expanded<2, 0> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 21: GINTfill_int3c1e_charge_contracted_kernel_expanded<2, 1> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 22: GINTfill_int3c1e_charge_contracted_kernel_expanded<2, 2> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 30: GINTfill_int3c1e_charge_contracted_kernel_expanded<3, 0> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 31: GINTfill_int3c1e_charge_contracted_kernel_expanded<3, 1> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 32: GINTfill_int3c1e_charge_contracted_kernel_expanded<3, 2> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 40: GINTfill_int3c1e_charge_contracted_kernel_expanded<4, 0> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 41: GINTfill_int3c1e_charge_contracted_kernel_expanded<4, 1> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + default: + const int nrys_roots = (i_l + j_l) / 2 + 1; + switch (nrys_roots) { + case 4: GINTfill_int3c1e_charge_contracted_kernel_general<4, GSIZE4_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 5: GINTfill_int3c1e_charge_contracted_kernel_general<5, GSIZE5_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; default: - fprintf(stderr, "roots=1 type_ijkl %d\n", type_ijkl); + fprintf(stderr, "type_ij = %d, nrys_roots = %d out of range\n", type_ij, nrys_roots); + return 1; } - break; - case 2: GINTfill_int3c1e_charge_contracted_kernel_general<2, GSIZE2_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; - case 3: GINTfill_int3c1e_charge_contracted_kernel_general<3, GSIZE3_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; - case 4: GINTfill_int3c1e_charge_contracted_kernel_general<4, GSIZE4_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; - case 5: GINTfill_int3c1e_charge_contracted_kernel_general<5, GSIZE5_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; - default: - fprintf(stderr, "rys roots %d\n", nrys_roots); - return 1; } cudaError_t err = cudaGetLastError(); @@ -127,8 +130,7 @@ static int GINTfill_int3c1e_density_contracted_tasks(double* output, const doubl case 6: GINTfill_int3c1e_density_contracted_kernel_general< 6> <<>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break; case 7: GINTfill_int3c1e_density_contracted_kernel_general< 7> <<>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break; case 8: GINTfill_int3c1e_density_contracted_kernel_general< 8> <<>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break; - case 9: GINTfill_int3c1e_density_contracted_kernel_general< 9> <<>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break; - case 10: GINTfill_int3c1e_density_contracted_kernel_general<10> <<>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break; + // Up to g + g = 8 now default: fprintf(stderr, "i_l + j_l = %d out of range\n", i_l + j_l); return 1; diff --git a/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ip.cu b/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ip.cu index cd5b0abc..095cf8c5 100644 --- a/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ip.cu +++ b/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ip.cu @@ -72,29 +72,37 @@ static int GINTfill_int3c1e_ip1_charge_contracted_tasks(double* output, const Ba const double omega, const double* grid_points, const double* charge_exponents, const int n_charge_sum_per_thread, const cudaStream_t stream) { - const int nrys_roots = (i_l + j_l + 1) / 2 + 1; const int ntasks_ij = offsets.ntasks_ij; const int ngrids = (offsets.ntasks_kl + n_charge_sum_per_thread - 1) / n_charge_sum_per_thread; const dim3 threads(THREADSX, THREADSY); const dim3 blocks((ntasks_ij+THREADSX-1)/THREADSX, (ngrids+THREADSY-1)/THREADSY); - int type_ijkl; - switch (nrys_roots) { - case 1: - type_ijkl = (i_l + 1) * 10 + j_l; - switch (type_ijkl) { - case 10: GINTfill_int3c1e_ip1_charge_contracted_kernel00<<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + const int type_ij = i_l * 10 + j_l; + switch (type_ij) { + case 00: GINTfill_int3c1e_ip1_charge_contracted_kernel00<<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 01: GINTfill_int3c1e_ip1_charge_contracted_kernel_expanded<0, 1> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 02: GINTfill_int3c1e_ip1_charge_contracted_kernel_expanded<0, 2> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 03: GINTfill_int3c1e_ip1_charge_contracted_kernel_expanded<0, 3> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 04: GINTfill_int3c1e_ip1_charge_contracted_kernel_expanded<0, 4> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 10: GINTfill_int3c1e_ip1_charge_contracted_kernel_expanded<1, 0> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 11: GINTfill_int3c1e_ip1_charge_contracted_kernel_expanded<1, 1> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 12: GINTfill_int3c1e_ip1_charge_contracted_kernel_expanded<1, 2> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 13: GINTfill_int3c1e_ip1_charge_contracted_kernel_expanded<1, 3> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 20: GINTfill_int3c1e_ip1_charge_contracted_kernel_expanded<2, 0> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 21: GINTfill_int3c1e_ip1_charge_contracted_kernel_expanded<2, 1> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 22: GINTfill_int3c1e_ip1_charge_contracted_kernel_expanded<2, 2> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 30: GINTfill_int3c1e_ip1_charge_contracted_kernel_expanded<3, 0> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 31: GINTfill_int3c1e_ip1_charge_contracted_kernel_expanded<3, 1> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 40: GINTfill_int3c1e_ip1_charge_contracted_kernel_expanded<4, 0> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + default: + const int nrys_roots = (i_l + j_l + 1) / 2 + 1; + switch (nrys_roots) { + case 4: GINTfill_int3c1e_ip1_charge_contracted_kernel_general<4, GSIZE4_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 5: GINTfill_int3c1e_ip1_charge_contracted_kernel_general<5, GSIZE5_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; default: - fprintf(stderr, "roots=1 type_ijkl %d\n", type_ijkl); + fprintf(stderr, "type_ij = %d, nrys_roots = %d out of range\n", type_ij, nrys_roots); + return 1; } - break; - case 2: GINTfill_int3c1e_ip1_charge_contracted_kernel_general<2, GSIZE2_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; - case 3: GINTfill_int3c1e_ip1_charge_contracted_kernel_general<3, GSIZE3_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; - case 4: GINTfill_int3c1e_ip1_charge_contracted_kernel_general<4, GSIZE4_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; - case 5: GINTfill_int3c1e_ip1_charge_contracted_kernel_general<5, GSIZE5_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; - default: - fprintf(stderr, "rys roots %d\n", nrys_roots); - return 1; } cudaError_t err = cudaGetLastError(); @@ -125,8 +133,7 @@ static int GINTfill_int3c1e_ip2_density_contracted_tasks(double* output, const d case 6: GINTfill_int3c1e_ip2_density_contracted_kernel_general< 6> <<>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break; case 7: GINTfill_int3c1e_ip2_density_contracted_kernel_general< 7> <<>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break; case 8: GINTfill_int3c1e_ip2_density_contracted_kernel_general< 8> <<>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break; - case 9: GINTfill_int3c1e_ip2_density_contracted_kernel_general< 9> <<>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break; - case 10: GINTfill_int3c1e_ip2_density_contracted_kernel_general<10> <<>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break; + // Up to g + g = 8 now default: fprintf(stderr, "i_l + j_l = %d out of range\n", i_l + j_l); return 1;