Skip to content

Commit

Permalink
Recover a performance improvement
Browse files Browse the repository at this point in the history
  • Loading branch information
henryw7 committed Dec 4, 2024
1 parent b0ff950 commit 0859567
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 43 deletions.
4 changes: 2 additions & 2 deletions gpu4pyscf/gto/moleintor.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,8 @@ def get_int3c1e_charge_contracted(mol, grids, charge_exponents, charges, intopt)
assert charges.ndim == 1 and charges.shape[0] == grids.shape[0]

grids = cp.asarray(grids, order='C')
charges = cp.asarray(charges)
charges = cp.asarray(charges).reshape([-1, 1], order='C')
grids = cp.concatenate([grids, charges], axis=1)
if charge_exponents is not None:
charge_exponents = cp.asarray(charge_exponents, order='C')

Expand Down Expand Up @@ -329,7 +330,6 @@ def get_int3c1e_charge_contracted(mol, grids, charge_exponents, charges, intopt)
ctypes.cast(stream.ptr, ctypes.c_void_p),
intopt.bpcache,
ctypes.cast(grids.data.ptr, ctypes.c_void_p),
ctypes.cast(charges.data.ptr, ctypes.c_void_p),
ctypes.cast(charge_exponents_pointer, ctypes.c_void_p),
ctypes.c_int(ngrids),
ctypes.cast(int1e_angular_slice.data.ptr, ctypes.c_void_p),
Expand Down
12 changes: 6 additions & 6 deletions gpu4pyscf/lib/gint/g1e_root_123.cu
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ static void GINTfill_int3c1e_kernel00(double* output, const BasisProdOffsets off
__global__
static void GINTfill_int3c1e_charge_contracted_kernel00(double* output, const BasisProdOffsets offsets, const int nprim_ij,
const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j,
const double omega, const double* grid_points, const double* charges, const double* charge_exponents)
const double omega, const double* grid_points, const double* charge_exponents)
{
const int ntasks_ij = offsets.ntasks_ij;
const int ngrids = offsets.ntasks_kl;
Expand All @@ -111,7 +111,7 @@ static void GINTfill_int3c1e_charge_contracted_kernel00(double* output, const Ba

double eri_grid_sum = 0.0;
for (int task_grid = blockIdx.y * blockDim.y + threadIdx.y; task_grid < ngrids; task_grid += gridDim.y * blockDim.y) {
const double* grid_point = grid_points + task_grid * 3;
const double* grid_point = grid_points + task_grid * 4;
const double Cx = grid_point[0];
const double Cy = grid_point[1];
const double Cz = grid_point[2];
Expand Down Expand Up @@ -146,7 +146,7 @@ static void GINTfill_int3c1e_charge_contracted_kernel00(double* output, const Ba
eri_per_grid += eri_per_primitive;
}

const double charge = charges[task_grid];
const double charge = grid_point[3];
eri_grid_sum += eri_per_grid * charge;
}

Expand Down Expand Up @@ -318,7 +318,7 @@ static void GINTfill_int3c1e_kernel10(double* output, const BasisProdOffsets off
__global__
static void GINTfill_int3c1e_charge_contracted_kernel10(double* output, const BasisProdOffsets offsets, const int nprim_ij,
const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j,
const double omega, const double* grid_points, const double* charges, const double* charge_exponents)
const double omega, const double* grid_points, const double* charge_exponents)
{
const int ntasks_ij = offsets.ntasks_ij;
const int ngrids = offsets.ntasks_kl;
Expand Down Expand Up @@ -352,7 +352,7 @@ static void GINTfill_int3c1e_charge_contracted_kernel10(double* output, const Ba
double eri_grid_sum_y = 0;
double eri_grid_sum_z = 0;
for (int task_grid = blockIdx.y * blockDim.y + threadIdx.y; task_grid < ngrids; task_grid += gridDim.y * blockDim.y) {
const double* grid_point = grid_points + task_grid * 3;
const double* grid_point = grid_points + task_grid * 4;
const double Cx = grid_point[0];
const double Cy = grid_point[1];
const double Cz = grid_point[2];
Expand Down Expand Up @@ -400,7 +400,7 @@ static void GINTfill_int3c1e_charge_contracted_kernel10(double* output, const Ba
eri_per_grid_z += eri_per_primitive_z;
}

const double charge = charges[task_grid];
const double charge = grid_point[3];
eri_grid_sum_x += eri_per_grid_x * charge;
eri_grid_sum_y += eri_per_grid_y * charge;
eri_grid_sum_z += eri_per_grid_z * charge;
Expand Down
12 changes: 6 additions & 6 deletions gpu4pyscf/lib/gint/g3c1e.cu
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ template <int NROOTS, int GSIZE_INT3C_1E>
__global__
void GINTfill_int3c1e_charge_contracted_kernel_general(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij,
const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j,
const double omega, const double* grid_points, const double* charges, const double* charge_exponents)
const double omega, const double* grid_points, const double* charge_exponents)
{
const int ntasks_ij = offsets.ntasks_ij;
const int ngrids = offsets.ntasks_kl;
Expand All @@ -156,8 +156,8 @@ void GINTfill_int3c1e_charge_contracted_kernel_general(double* output, const Bas
* (l_j_max_density_elements + 1) * (l_j_max_density_elements + 2) / 2] { 0.0 };

for (int task_grid = blockIdx.y * blockDim.y + threadIdx.y; task_grid < ngrids; task_grid += gridDim.y * blockDim.y) {
const double* grid_point = grid_points + task_grid * 3;
const double charge = charges[task_grid];
const double* grid_point = grid_points + task_grid * 4;
const double charge = grid_point[3];
const double charge_exponent = (charge_exponents != NULL) ? charge_exponents[task_grid] : 0.0;

double g[GSIZE_INT3C_1E];
Expand All @@ -182,9 +182,9 @@ void GINTfill_int3c1e_charge_contracted_kernel_general(double* output, const Bas

template <int NROOTS>
__global__
void GINT_int3c1e_density_contracted_kernel_general(double* output, const double* density, const HermiteDensityOffsets hermite_density_offsets,
const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij,
const double omega, const double* grid_points, const double* charge_exponents)
void GINTfill_int3c1e_density_contracted_kernel_general(double* output, const double* density, const HermiteDensityOffsets hermite_density_offsets,
const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij,
const double omega, const double* grid_points, const double* charge_exponents)
{
const int ntasks_ij = offsets.ntasks_ij;
const int ngrids = offsets.ntasks_kl;
Expand Down
42 changes: 13 additions & 29 deletions gpu4pyscf/lib/gint/nr_fill_ao_int3c1e.cu
Original file line number Diff line number Diff line change
Expand Up @@ -41,22 +41,6 @@
#define GSIZE4_INT3C_1E 240
#define GSIZE5_INT3C_1E 450

/*
static void CINTcart_comp(int16_t *nx, int16_t *ny, int16_t *nz, int lmax)
{
int inc = 0;
for (int16_t lx = lmax; lx >= 0; lx--) {
for (int16_t ly = lmax - lx; ly >= 0; ly--) {
int16_t lz = lmax - lx - ly;
nx[inc] = lx;
ny[inc] = ly;
nz[inc] = lz;
inc++;
}
}
}
*/

static int GINTfill_int3c1e_tasks(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij,
const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j,
const double omega, const double* grid_points, const double* charge_exponents, const cudaStream_t stream)
Expand Down Expand Up @@ -97,7 +81,7 @@ static int GINTfill_int3c1e_tasks(double* output, const BasisProdOffsets offsets

static int GINTfill_int3c1e_charge_contracted_tasks(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij,
const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j,
const double omega, const double* grid_points, const double* charges, const double* charge_exponents,
const double omega, const double* grid_points, const double* charge_exponents,
const int n_charge_sum_per_thread, const cudaStream_t stream)
{
const int nrys_roots = (i_l + j_l) / 2 + 1;
Expand All @@ -111,16 +95,16 @@ static int GINTfill_int3c1e_charge_contracted_tasks(double* output, const BasisP
case 1:
type_ijkl = (i_l << 2) | j_l;
switch (type_ijkl) {
case (0<<2)|0: GINTfill_int3c1e_charge_contracted_kernel00<<<blocks, threads, 0, stream>>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charges, charge_exponents); break;
case (1<<2)|0: GINTfill_int3c1e_charge_contracted_kernel10<<<blocks, threads, 0, stream>>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charges, charge_exponents); break;
case (0<<2)|0: GINTfill_int3c1e_charge_contracted_kernel00<<<blocks, threads, 0, stream>>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break;
case (1<<2)|0: GINTfill_int3c1e_charge_contracted_kernel10<<<blocks, threads, 0, stream>>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break;
default:
fprintf(stderr, "roots=1 type_ijkl %d\n", type_ijkl);
}
break;
case 2: GINTfill_int3c1e_charge_contracted_kernel_general<2, GSIZE2_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charges, charge_exponents); break;
case 3: GINTfill_int3c1e_charge_contracted_kernel_general<3, GSIZE3_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charges, charge_exponents); break;
case 4: GINTfill_int3c1e_charge_contracted_kernel_general<4, GSIZE4_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charges, charge_exponents); break;
case 5: GINTfill_int3c1e_charge_contracted_kernel_general<5, GSIZE5_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charges, charge_exponents); break;
case 2: GINTfill_int3c1e_charge_contracted_kernel_general<2, GSIZE2_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break;
case 3: GINTfill_int3c1e_charge_contracted_kernel_general<3, GSIZE3_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break;
case 4: GINTfill_int3c1e_charge_contracted_kernel_general<4, GSIZE4_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break;
case 5: GINTfill_int3c1e_charge_contracted_kernel_general<5, GSIZE5_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break;
default:
fprintf(stderr, "rys roots %d\n", nrys_roots);
return 1;
Expand Down Expand Up @@ -156,10 +140,10 @@ static int GINTfill_int3c1e_density_contracted_tasks(double* output, const doubl
fprintf(stderr, "roots=1 type_ijkl %d\n", type_ijkl);
}
break;
case 2: GINT_int3c1e_density_contracted_kernel_general<2> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, i_l, j_l, nprim_ij, omega, grid_points, charge_exponents); break;
case 3: GINT_int3c1e_density_contracted_kernel_general<3> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, i_l, j_l, nprim_ij, omega, grid_points, charge_exponents); break;
case 4: GINT_int3c1e_density_contracted_kernel_general<4> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, i_l, j_l, nprim_ij, omega, grid_points, charge_exponents); break;
case 5: GINT_int3c1e_density_contracted_kernel_general<5> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, i_l, j_l, nprim_ij, omega, grid_points, charge_exponents); break;
case 2: GINTfill_int3c1e_density_contracted_kernel_general<2> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, i_l, j_l, nprim_ij, omega, grid_points, charge_exponents); break;
case 3: GINTfill_int3c1e_density_contracted_kernel_general<3> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, i_l, j_l, nprim_ij, omega, grid_points, charge_exponents); break;
case 4: GINTfill_int3c1e_density_contracted_kernel_general<4> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, i_l, j_l, nprim_ij, omega, grid_points, charge_exponents); break;
case 5: GINTfill_int3c1e_density_contracted_kernel_general<5> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, i_l, j_l, nprim_ij, omega, grid_points, charge_exponents); break;
default:
fprintf(stderr, "rys roots %d\n", nrys_roots);
return 1;
Expand Down Expand Up @@ -225,7 +209,7 @@ int GINTfill_int3c1e(const cudaStream_t stream, const BasisProdCache* bpcache,
}

int GINTfill_int3c1e_charge_contracted(const cudaStream_t stream, const BasisProdCache* bpcache,
const double* grid_points, const double* charges, const double* charge_exponents, const int ngrids,
const double* grid_points, const double* charge_exponents, const int ngrids,
double* integral_charge_contracted, const int nao,
const int* strides, const int* ao_offsets,
const int* bins_locs_ij, int nbins,
Expand Down Expand Up @@ -264,7 +248,7 @@ int GINTfill_int3c1e_charge_contracted(const cudaStream_t stream, const BasisPro

const int err = GINTfill_int3c1e_charge_contracted_tasks(integral_charge_contracted, offsets, i_l, j_l, nprim_ij,
strides[0], strides[1], ao_offsets[0], ao_offsets[1],
omega, grid_points, charges, charge_exponents, n_charge_sum_per_thread, stream);
omega, grid_points, charge_exponents, n_charge_sum_per_thread, stream);

if (err != 0) {
return err;
Expand Down

0 comments on commit 0859567

Please sign in to comment.