
CI on multi-GPU runner (pyscf#285)
* disable j-engine kernels requiring large shared memory

* use independent logger

* update labels for runners

* workflow syntax

* assert isinstance(verbose,int)
wxj6000 authored Dec 16, 2024
1 parent bf03d85 commit 031089d
Showing 10 changed files with 137 additions and 93 deletions.
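Note: the last two commit-message bullets ("use independent logger" and "assert isinstance(verbose,int)") correspond to one recurring change in the diffs below: every per-device worker pins its GPU and builds its own logger inside the device context, guarding that `verbose` is a plain int rather than a shared Logger object. A minimal runnable sketch of that pattern, assuming the `gpu4pyscf.lib.logger` API used in the diffs; `do_device_work`, `_worker`, and `run_on_all_devices` are illustrative stand-ins, not code from this commit, and the real workers also enter a per-device CUDA stream:

```python
from concurrent.futures import ThreadPoolExecutor

import cupy
from gpu4pyscf.lib import logger

def do_device_work(mol):
    # Hypothetical stand-in for the real per-device kernel.
    return cupy.zeros(1)

def _worker(mol, device_id=0):
    with cupy.cuda.Device(device_id):
        # The new guard: verbose must be a plain int, so each thread
        # constructs its own independent Logger instead of sharing one.
        assert isinstance(mol.verbose, int)
        log = logger.new_logger(mol, mol.verbose)
        t0 = log.init_timer()
        out = do_device_work(mol)
        log.timer_debug1(f'task on Device {device_id}', *t0)
        return out

def run_on_all_devices(mol, num_devices):
    # One thread per GPU, as in the ThreadPoolExecutor loops below.
    with ThreadPoolExecutor(max_workers=num_devices) as executor:
        futures = [executor.submit(_worker, mol, d) for d in range(num_devices)]
        return [f.result() for f in futures]
```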
31 changes: 28 additions & 3 deletions .github/workflows/unittest.yml
@@ -13,10 +13,35 @@ permissions:
contents: read

jobs:
- build:
-   runs-on: self-hosted
+ single-gpu:
+   runs-on: [self-hosted, Linux, X64, v100]
steps:
- uses: actions/checkout@v3
- name: Install dependencies
run: |
pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
python3 -m pip install --upgrade pip
pip3 install flake8 pytest coverage pytest-cov pyscf-dispersion
pip3 install pyscf --upgrade
pip3 install git+https://github.com/pyscf/properties --upgrade
pip3 install numpy --upgrade
pip3 install h5py --upgrade
pip3 install gpu4pyscf-libxc-cuda12x --upgrade
pip3 install cupy-cuda12x --upgrade
git config --global core.compression 9
- name: Build GPU4PySCF
run: |
export CUDA_HOME=/usr/local/cuda
export CMAKE_CONFIGURE_ARGS="-DBUILD_LIBXC=OFF -DCUDA_ARCHITECTURES=70-real -DBUILD_CUTLASS=ON"
sh build.sh
- name: Test with pytest
run: |
echo $GITHUB_WORKSPACE
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
pytest -m "not smoke" --cov=$GITHUB_WORKSPACE
multi-gpu:
runs-on: [self-hosted, Linux, X64, 2T4]
steps:
- uses: actions/checkout@v3
- name: Install dependencies
35 changes: 18 additions & 17 deletions gpu4pyscf/df/df.py
@@ -86,7 +86,7 @@ def build(self, direct_scf_tol=1e-14, omega=None):
j2c = cupy.asarray(j2c_cpu, order='C')
t0 = log.timer_debug1('2c2e', *t0)
intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e')
intopt.build(direct_scf_tol, diag_block_with_triu=False, aosym=True,
group_size=GROUP_SIZE, group_size_aux=GROUP_SIZE)
log.timer_debug1('prepare intopt', *t0)
self.j2c = j2c.copy()
@@ -105,7 +105,7 @@ def build(self, direct_scf_tol=1e-14, omega=None):
naux = self.naux = self.cd_low.shape[1]
log.debug('size of aux basis %d', naux)

self._cderi = cholesky_eri_gpu(intopt, mol, auxmol, self.cd_low,
omega=omega, use_gpu_memory=self.use_gpu_memory)
log.timer_debug1('cholesky_eri', *t0)
self.intopt = intopt
Expand Down Expand Up @@ -144,8 +144,8 @@ def get_blksize(self, extra=0, nao=None):
return blksize

def loop(self, blksize=None, unpack=True):
''' loop over cderi for the current device
and unpack the CDERI in (Lij) format
'''
device_id = cupy.cuda.Device().id
cderi_sparse = self._cderi[device_id]
Expand Down Expand Up @@ -177,10 +177,10 @@ def loop(self, blksize=None, unpack=True):
yield buf2, buf.T
if isinstance(cderi_sparse, np.ndarray):
cupy.cuda.Device().synchronize()

if buf_prefetch is not None:
buf = buf_prefetch

def reset(self, mol=None):
'''Reset mol and clean up relevant attributes for scanner mode'''
if mol is not None:
@@ -198,7 +198,7 @@ def reset(self, mol=None):
get_ao_eri = get_eri = NotImplemented
get_mo_eri = ao2mo = NotImplemented

def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
omega=None, sr_only=False, use_gpu_memory=True):
'''
Returns:
@@ -210,13 +210,13 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,

# Available memory on Device 0.
avail_mem = get_avail_mem()

if use_gpu_memory:
# CDERI will be equally distributed to the devices
# Other devices usually have more memory available than Device 0
# CDERI will use up to 40% of the available memory
use_gpu_memory = naux * npairs * 8 < 0.4 * avail_mem * _num_devices

if use_gpu_memory:
log.debug("Saving CDERI on GPU")
else:
@@ -235,7 +235,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
_cderi[device_id] = cderi_blk

npairs_per_ctr = [len(intopt.ao_pairs_row[cp_ij_id]) for cp_ij_id in range(len(intopt.log_qs))]

npairs_per_ctr = np.array(npairs_per_ctr)
total_task_list = np.argsort(npairs_per_ctr)
task_list_per_device = []
@@ -253,13 +253,13 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
future = executor.submit(_cderi_task, intopt, cd_low_f, task_list, _cderi,
omega=omega, sr_only=sr_only, device_id=device_id)
futures.append(future)

for future in futures:
future.result()

if not use_gpu_memory:
cupy.cuda.Device().synchronize()

return _cderi

def _cderi_task(intopt, cd_low, task_list, _cderi, omega=None, sr_only=False, device_id=0):
@@ -273,6 +273,7 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, omega=None, sr_only=False, device_id=0):
pairs_loc = np.append(0, np.cumsum(npairs))
blksize = (naux + _num_devices - 1) // _num_devices
with cupy.cuda.Device(device_id), _streams[device_id]:
+ assert isinstance(mol.verbose, int)
log = logger.new_logger(mol, mol.verbose)
t1 = log.init_timer()
cd_low_tag = cd_low.tag
Expand Down Expand Up @@ -320,7 +321,7 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, omega=None, sr_only=False, de

row = intopt.ao_pairs_row[cp_ij_id] - i0
col = intopt.ao_pairs_col[cp_ij_id] - j0

ints_slices_f = cupy.empty([naoaux,len(row)], order='F')
ints_slices_f[:] = ints_slices[:,col,row]
ints_slices = None
@@ -330,12 +331,12 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, omega=None, sr_only=False, device_id=0):
elif cd_low_tag == 'cd':
cderi_block = solve_triangular(cd_low, ints_slices_f, lower=True, overwrite_b=True)
else:
- RuntimeError('Tag is not found in lower triangular matrix.')
+ raise RuntimeError('Tag is not found in lower triangular matrix.')
t1 = log.timer_debug1(f'solve {cp_ij_id} / {nq} on Device {device_id}', *t1)

# TODO:
# 1) async data transfer
# 2) auxiliary basis in the last dimension

# if CDERI is saved on CPU
ij0 = pairs_loc[cp_ij_id]
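The hunk at @@ -253,13 above shows how `cholesky_eri_gpu` spreads its integral work over GPUs: tasks are sorted by AO-pair count via `np.argsort(npairs_per_ctr)` and dealt out to devices, one executor thread per device. The strided round-robin split in this sketch is an assumption (the diff is cut off right after `task_list_per_device = []`), but it matches the sorted setup that is visible; the task sizes in the example are made up:

```python
import numpy as np

def distribute_tasks(task_sizes, num_devices):
    """Deal tasks out in ascending-size order, one device at a time,
    so every device receives a mix of small and large tasks."""
    order = np.argsort(task_sizes)      # mirrors np.argsort(npairs_per_ctr)
    return [order[d::num_devices] for d in range(num_devices)]

# Hypothetical AO-pair counts for eight shell-pair tasks:
sizes = np.array([120, 7, 356, 42, 980, 15, 233, 61])
for dev, tasks in enumerate(distribute_tasks(sizes, 2)):
    print(f"Device {dev}: tasks {tasks.tolist()}, {int(sizes[tasks].sum())} pairs")
```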
11 changes: 7 additions & 4 deletions gpu4pyscf/df/df_jk.py
@@ -249,6 +249,7 @@ def _jk_task_with_mo(dfobj, dms, mo_coeff, mo_occ,
''' Calculate J and K matrices on single GPU
'''
with cupy.cuda.Device(device_id), _streams[device_id]:
+ assert isinstance(dfobj.verbose, int)
log = logger.new_logger(dfobj.mol, dfobj.verbose)
t0 = log.init_timer()
dms = cupy.asarray(dms)
@@ -313,6 +314,7 @@ def _jk_task_with_mo1(dfobj, dms, mo1s, occ_coeffs,
'''
vj = vk = None
with cupy.cuda.Device(device_id), _streams[device_id]:
+ assert isinstance(dfobj.verbose, int)
log = logger.new_logger(dfobj.mol, dfobj.verbose)
t0 = log.init_timer()
dms = cupy.asarray(dms)
@@ -373,6 +375,7 @@ def _jk_task_with_dm(dfobj, dms, with_j=True, with_k=True, hermi=0, device_id=0):
''' Calculate J and K matrices with density matrix
'''
with cupy.cuda.Device(device_id), _streams[device_id]:
+ assert isinstance(dfobj.verbose, int)
log = logger.new_logger(dfobj.mol, dfobj.verbose)
t0 = log.init_timer()
dms = cupy.asarray(dms)
@@ -404,7 +407,7 @@ def _jk_task_with_dm(dfobj, dms, with_j=True, with_k=True, hermi=0, device_id=0):
for k in range(nset):
rhok = contract('Lij,jk->Lki', cderi, dms[k]).reshape([-1,nao])
#vk[k] += contract('Lki,Lkj->ij', rhok, cderi)
vk[k] += cupy.dot(rhok.T, cderi.reshape([-1,nao]))
if with_j:
vj = cupy.zeros(dms_shape)
vj[:,rows,cols] = vj_sparse
@@ -437,7 +440,7 @@ def get_jk(dfobj, dms_tag, hermi=0, with_j=True, with_k=True, direct_scf_tol=1e-

assert nao == dfobj.nao
intopt = dfobj.intopt

nao = dms_tag.shape[-1]
dms = dms_tag.reshape([-1,nao,nao])
intopt = dfobj.intopt
@@ -456,7 +459,7 @@ def get_jk(dfobj, dms_tag, hermi=0, with_j=True, with_k=True, direct_scf_tol=1e-
with ThreadPoolExecutor(max_workers=_num_devices) as executor:
for device_id in range(_num_devices):
future = executor.submit(
_jk_task_with_mo,
dfobj, dms, mo_coeff, mo_occ,
hermi=hermi, device_id=device_id,
with_j=with_j, with_k=with_k)
@@ -477,7 +480,7 @@ def get_jk(dfobj, dms_tag, hermi=0, with_j=True, with_k=True, direct_scf_tol=1e-
with ThreadPoolExecutor(max_workers=_num_devices) as executor:
for device_id in range(_num_devices):
future = executor.submit(
_jk_task_with_mo1,
dfobj, dms, mo1s, occ_coeffs,
hermi=hermi, device_id=device_id,
with_j=with_j, with_k=with_k)
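Both `_jk_task_with_dm` above and `_jk_task` in grad/jk.py below work with a packed density matrix: only the unique lower-triangle pairs from `intopt.cderi_row`/`intopt.cderi_col` are kept, and the diagonal entries are scaled by 0.5 so that a single sum over unique pairs reproduces the full symmetric contraction. A small self-contained check of that identity; the names are illustrative, and the off-diagonal factor of two is written out explicitly here, whereas the library applies the corresponding factor elsewhere in the pipeline:

```python
import numpy as np

nao = 4
rng = np.random.default_rng(0)
dm = rng.standard_normal((nao, nao))
dm = dm + dm.T                         # symmetric density matrix
ints = rng.standard_normal((nao, nao))
ints = ints + ints.T                   # one aux component of (ij|L), symmetric in ij

rows, cols = np.tril_indices(nao)      # unique pairs, as in intopt.cderi_row/col
dm_sparse = 2.0 * dm[rows, cols]       # off-diagonal pairs count twice...
dm_sparse[rows == cols] *= 0.5         # ...the diagonal only once

full = np.einsum('ij,ij->', dm, ints)  # full symmetric contraction
packed = dm_sparse @ ints[rows, cols]  # single pass over unique pairs
assert np.allclose(full, packed)
```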
9 changes: 5 additions & 4 deletions gpu4pyscf/df/grad/jk.py
@@ -25,6 +25,7 @@ def _jk_task(with_df, dm, orbo, with_j=True, with_k=True, device_id=0):
rhoj = rhok = None
with cupy.cuda.Device(device_id), _streams[device_id]:
log = logger.new_logger(with_df.mol, with_df.verbose)
+ assert isinstance(with_df.verbose, int)
t0 = log.init_timer()
dm = cupy.asarray(dm)
orbo = cupy.asarray(orbo)
@@ -34,7 +35,7 @@ def _jk_task(with_df, dm, orbo, with_j=True, with_k=True, device_id=0):
cols = with_df.intopt.cderi_col
dm_sparse = dm[rows, cols]
dm_sparse[with_df.intopt.cderi_diag] *= .5

blksize = with_df.get_blksize()
if with_j:
rhoj = cupy.empty([naux_slice])
Expand Down Expand Up @@ -65,18 +66,18 @@ def get_rhoj_rhok(with_df, dm, orbo, with_j=True, with_k=True):
_jk_task, with_df, dm, orbo,
with_j=with_j, with_k=with_k, device_id=device_id)
futures.append(future)

rhoj_total = []
rhok_total = []
for future in futures:
rhoj, rhok = future.result()
rhoj_total.append(rhoj)
rhok_total.append(rhok)

rhoj = rhok = None
if with_j:
rhoj = concatenate(rhoj_total)
if with_k:
rhok = concatenate(rhok_total)

return rhoj, rhok
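`get_rhoj_rhok` collects one future per device and concatenates the per-device results: each device computes only a contiguous slice of the auxiliary dimension (`naux_slice`), mirroring the `blksize = (naux + _num_devices - 1) // _num_devices` split seen in `_cderi_task`. A schematic of that slice-and-concatenate reduction; numpy stands in for cupy so the sketch runs anywhere, and the per-slice work is a placeholder:

```python
import numpy as np

def rho_on_device(arr, device_id, num_devices):
    # Each device owns one contiguous block of the auxiliary axis.
    naux = arr.shape[0]
    blksize = (naux + num_devices - 1) // num_devices
    p0 = device_id * blksize
    p1 = min(p0 + blksize, naux)
    return arr[p0:p1] * 2.0   # placeholder for the real per-slice contraction

naux, num_devices = 10, 3
rho_full = np.arange(naux, dtype=float)
pieces = [rho_on_device(rho_full, d, num_devices) for d in range(num_devices)]
rho = np.concatenate(pieces)  # same reduction as concatenate(rhoj_total)
assert np.allclose(rho, rho_full * 2.0)
```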
