Minor optimization in Rys integrals (pyscf#262)

* some minor optimization * ruff * ruff * merge master * bugfix * remove temp changes * remove comments
bytedance · Dec 4, 2024 · d4dce92 · d4dce92
1 parent 8a897f3
commit d4dce92
Show file tree

Hide file tree

Showing 46 changed files with 2,450 additions and 84,886 deletions.
diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py
@@ -114,6 +114,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
 
     #  int3c contributions
     wj, wk_P__ = int3c2e.get_int3c2e_jk(mol, auxmol, dm0_tag, omega=omega)
+    t1 = log.timer_debug1('intermediate variables with int3c2e', *t1)
     rhoj0_P = solve_j2c(wj)
     rhok0_P__ = solve_j2c(wk_P__)
     wj = wk_P__ = None
@@ -242,7 +243,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
             hk_aux_diag = .5*hk
         hj = hk = None
         t1 = log.timer_debug1('intermediate variables with int3c2e_ipip2', *t1)
-
+    
     # int2c contributions
     if hessobj.auxbasis_response > 1:
         if omega and omega > 1e-10:

diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py
@@ -495,7 +495,6 @@ def loop_int3c2e_general(intopt, ip_type='', omega=None, stream=None):
         opt = make_cintopt(pmol._atm, pmol._bas, pmol._env, intor)
 
     nbins = 1
-
     for aux_id, log_q_kl in enumerate(intopt.aux_log_qs):
         cp_kl_id = aux_id + len(intopt.log_qs)
         lk = intopt.aux_angular[aux_id]
@@ -518,7 +517,6 @@ def loop_int3c2e_general(intopt, ip_type='', omega=None, stream=None):
 
             ao_offsets = np.array([i0,j0,nao+1+k0,nao], dtype=np.int32)
             strides = np.array([1, ni, ni*nj, ni*nj*nk], dtype=np.int32)
-
             # Use GPU kernels for low-angular momentum
             if (li + lj + lk + order)//2 + 1 < NROOT_ON_GPU:
                 int3c_blk = cupy.zeros([comp, nk, nj, ni], order='C', dtype=np.float64)
@@ -545,13 +543,11 @@ def loop_int3c2e_general(intopt, ip_type='', omega=None, stream=None):
                 shls_slice = np.array([ishl0, ishl1, jshl0, jshl1, kshl0, kshl1], dtype=np.int64)
                 int3c_cpu = getints(intor, pmol._atm, pmol._bas, pmol._env, shls_slice, cintopt=opt).transpose([0,3,2,1])
                 int3c_blk = cupy.asarray(int3c_cpu)
-
             if not intopt.auxmol.cart:
                 int3c_blk = cart2sph(int3c_blk, axis=1, ang=lk)
             if not intopt.mol.cart:
                 int3c_blk = cart2sph(int3c_blk, axis=2, ang=lj)
                 int3c_blk = cart2sph(int3c_blk, axis=3, ang=li)
-
             i0, i1 = ao_loc[cpi], ao_loc[cpi+1]
             j0, j1 = ao_loc[cpj], ao_loc[cpj+1]
             k0, k1 = aux_ao_loc[aux_id], aux_ao_loc[aux_id+1]
@@ -657,10 +653,12 @@ def get_aux2atom(intopt, auxslices):
         aux2atom[p0:p1,ia] = 1.0
     return intopt.sort_orbitals(aux2atom, aux_axis=[0])
 
-def get_j_int3c2e_pass1(intopt, dm0, sort_j=True):
+def get_j_int3c2e_pass1(intopt, dm0, sort_j=True, stream=None):
     '''
     get rhoj pass1 for int3c2e
     '''
+    if stream is None: stream = cupy.cuda.get_current_stream()
+
     n_dm = 1
 
     naux = intopt._sorted_auxmol.nao
@@ -681,7 +679,9 @@ def get_j_int3c2e_pass1(intopt, dm0, sort_j=True):
     norb = dm_cart.shape[0]
 
     rhoj = cupy.zeros([naux])
+
     err = libgvhf.GINTbuild_j_int3c2e_pass1(
+        ctypes.cast(stream.ptr, ctypes.c_void_p),
         intopt.bpcache,
         ctypes.cast(dm_cart.data.ptr, ctypes.c_void_p),
         ctypes.cast(rhoj.data.ptr, ctypes.c_void_p),
@@ -700,10 +700,12 @@ def get_j_int3c2e_pass1(intopt, dm0, sort_j=True):
         rhoj = cupy.dot(rhoj, aux_coeff)
     return rhoj
 
-def get_j_int3c2e_pass2(intopt, rhoj):
+def get_j_int3c2e_pass2(intopt, rhoj, stream=None):
     '''
     get vj pass2 for int3c2e
     '''
+    if stream is None: stream = cupy.cuda.get_current_stream()
+
     n_dm = 1
     norb = intopt._sorted_mol.nao
     naux = intopt._sorted_auxmol.nao
@@ -723,6 +725,7 @@ def get_j_int3c2e_pass2(intopt, rhoj):
         rhoj = intopt.aux_cart2sph @ rhoj
 
     err = libgvhf.GINTbuild_j_int3c2e_pass2(
+        ctypes.cast(stream.ptr, ctypes.c_void_p),
         intopt.bpcache,
         ctypes.cast(vj.data.ptr, ctypes.c_void_p),
         ctypes.cast(rhoj.data.ptr, ctypes.c_void_p),

diff --git a/gpu4pyscf/df/tests/test_df_hessian.py b/gpu4pyscf/df/tests/test_df_hessian.py
@@ -240,6 +240,7 @@ def test_hessian_rks_D3(self):
         hobj = mf.Hessian()
         hobj.set(auxbasis_response=2)
         h = hobj.kernel()
+        print(np.linalg.norm(h))
         _check_dft_hessian(mf, h, ix=0,iy=0)
 
     def test_hessian_rks_D4(self):

diff --git a/gpu4pyscf/gto/tests/test_int1e_grids.py b/gpu4pyscf/gto/tests/test_int1e_grids.py
@@ -45,8 +45,8 @@ def setUpModule():
     grid_points = lib.cartesian_prod([xs, ys, zs])
 
     # All of the following thresholds bound the max value of the corresponding matrix / tensor.
-    integral_threshold = 1e-12
-    density_contraction_threshold = 1e-10
+    integral_threshold = 1e-10
+    density_contraction_threshold = 1e-8
 
 def tearDownModule():
     global mol_sph, mol_cart, grid_points

diff --git a/gpu4pyscf/lib/gint/cint2e.cuh b/gpu4pyscf/lib/gint/cint2e.cuh
@@ -23,10 +23,7 @@
 
 //extern __constant__ GINTEnvVars c_envs;
 extern __constant__ BasisProdCache c_bpcache;
-extern __constant__ int16_t c_idx4c[NFffff*3];
+//extern __constant__ int16_t c_idx4c[NFffff*3];
 
-/*
-__constant__ GINTEnvVars c_envs;
-__constant__ BasisProdCache c_bpcache;
-__constant__ int16_t c_idx4c[NFffff*3];
-*/
+extern __constant__ int c_idx[TOT_NF*3];
+extern __constant__ int c_l_locs[GPU_LMAX+2];
diff --git a/gpu4pyscf/lib/gint/constant.cu b/gpu4pyscf/lib/gint/constant.cu
@@ -18,6 +18,20 @@
 
 //__constant__ GINTEnvVars c_envs;
 __constant__ BasisProdCache c_bpcache;
-__constant__ int16_t c_idx4c[NFffff*3];
-__constant__ int c_idx[TOT_NF*3]; 
-__constant__ int c_l_locs[GPU_LMAX+1];
+//__constant__ int16_t c_idx4c[NFffff*3];
+
+// Generated with GINTinit_index1d_xyz
+__constant__ int c_idx[TOT_NF*3] = {
+    0, 1, 0, 0, 2, 1, 1, 0, 0, 0, 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 4, 3, 3, 
+    2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 5, 4, 4, 3, 3, 3, 2, 2, 2, 2, 1, 
+    1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 6, 5, 5, 4, 4, 4, 3, 3, 3, 3, 2, 2, 2, 
+    2, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 2, 
+    1, 0, 0, 1, 0, 2, 1, 0, 3, 2, 1, 0, 0, 1, 0, 2, 1, 0, 3, 2, 1, 0, 4, 
+    3, 2, 1, 0, 0, 1, 0, 2, 1, 0, 3, 2, 1, 0, 4, 3, 2, 1, 0, 5, 4, 3, 2, 
+    1, 0, 0, 1, 0, 2, 1, 0, 3, 2, 1, 0, 4, 3, 2, 1, 0, 5, 4, 3, 2, 1, 0, 
+    6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 2, 0, 0, 1, 0, 1, 2, 
+    0, 1, 2, 3, 0, 0, 1, 0, 1, 2, 0, 1, 2, 3, 0, 1, 2, 3, 4, 0, 0, 1, 0, 
+    1, 2, 0, 1, 2, 3, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 5, 0, 0, 1, 0, 1, 2, 
+    0, 1, 2, 3, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 6};
+
+__constant__ int c_l_locs[GPU_LMAX+2] = {0, 1, 4, 10, 20, 35, 56, 84};
diff --git a/gpu4pyscf/lib/gint/g2e.c b/gpu4pyscf/lib/gint/g2e.c
@@ -73,8 +73,8 @@ void GINTinit_EnvVars(GINTEnvVars *envs,
         envs->nrys_roots = nroots;
         envs->fac = fac;
 
-        int ibase = 1;//i_l >= j_l; //li_ceil >= lj_ceil;
-        int kbase = 1;//k_l >= l_l; //lk_ceil >= ll_ceil;
+        int ibase = 1;//li_ceil >= lj_ceil;
+        int kbase = 1;//lk_ceil >= ll_ceil;
         envs->ibase = ibase;
         envs->kbase = kbase;