
CI on multi-GPU runner (pyscf#285)
* disable j-engine kernels requiring large shared memory

* use independent logger

* update labels for runners

* workflow syntax

* assert isinstance(verbose,int)
wxj6000 authored Dec 16, 2024
1 parent bf03d85 commit 031089d
Showing 10 changed files with 137 additions and 93 deletions.
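Note: the last two commit-message bullets ("use independent logger" and "assert isinstance(verbose,int)") correspond to one recurring change in the diffs below: every per-device worker pins its GPU and builds its own logger inside the device context, guarding that `verbose` is a plain int rather than a shared Logger object. A minimal runnable sketch of that pattern, assuming the `gpu4pyscf.lib.logger` API used in the diffs; `do_device_work`, `_worker`, and `run_on_all_devices` are illustrative stand-ins, not code from this commit, and the real workers also enter a per-device CUDA stream:

```python
from concurrent.futures import ThreadPoolExecutor

import cupy
from gpu4pyscf.lib import logger

def do_device_work(mol):
    # Hypothetical stand-in for the real per-device kernel.
    return cupy.zeros(1)

def _worker(mol, device_id=0):
    with cupy.cuda.Device(device_id):
        # The new guard: verbose must be a plain int, so each thread
        # constructs its own independent Logger instead of sharing one.
        assert isinstance(mol.verbose, int)
        log = logger.new_logger(mol, mol.verbose)
        t0 = log.init_timer()
        out = do_device_work(mol)
        log.timer_debug1(f'task on Device {device_id}', *t0)
        return out

def run_on_all_devices(mol, num_devices):
    # One thread per GPU, as in the ThreadPoolExecutor loops below.
    with ThreadPoolExecutor(max_workers=num_devices) as executor:
        futures = [executor.submit(_worker, mol, d) for d in range(num_devices)]
        return [f.result() for f in futures]
```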
31 changes: 28 additions & 3 deletions .github/workflows/unittest.yml
@@ -13,10 +13,35 @@ permissions:
contents: read

jobs:
- build:
-   runs-on: self-hosted
+ single-gpu:
+   runs-on: [self-hosted, Linux, X64, v100]
steps:
- uses: actions/checkout@v3
- name: Install dependencies
run: |
pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
python3 -m pip install --upgrade pip
pip3 install flake8 pytest coverage pytest-cov pyscf-dispersion
pip3 install pyscf --upgrade
pip3 install git+https://github.com/pyscf/properties --upgrade
pip3 install numpy --upgrade
pip3 install h5py --upgrade
pip3 install gpu4pyscf-libxc-cuda12x --upgrade
pip3 install cupy-cuda12x --upgrade
git config --global core.compression 9
- name: Build GPU4PySCF
run: |
export CUDA_HOME=/usr/local/cuda
export CMAKE_CONFIGURE_ARGS="-DBUILD_LIBXC=OFF -DCUDA_ARCHITECTURES=70-real -DBUILD_CUTLASS=ON"
sh build.sh
- name: Test with pytest
run: |
echo $GITHUB_WORKSPACE
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
pytest -m "not smoke" --cov=$GITHUB_WORKSPACE
multi-gpu:
runs-on: [self-hosted, Linux, X64, 2T4]
steps:
- uses: actions/checkout@v3
- name: Install dependencies
35 changes: 18 additions & 17 deletions gpu4pyscf/df/df.py
@@ -86,7 +86,7 @@ def build(self, direct_scf_tol=1e-14, omega=None):
j2c = cupy.asarray(j2c_cpu, order='C')
t0 = log.timer_debug1('2c2e', *t0)
intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e')
intopt.build(direct_scf_tol, diag_block_with_triu=False, aosym=True,
group_size=GROUP_SIZE, group_size_aux=GROUP_SIZE)
log.timer_debug1('prepare intopt', *t0)
self.j2c = j2c.copy()
@@ -105,7 +105,7 @@ def build(self, direct_scf_tol=1e-14, omega=None):
naux = self.naux = self.cd_low.shape[1]
log.debug('size of aux basis %d', naux)

self._cderi = cholesky_eri_gpu(intopt, mol, auxmol, self.cd_low,
omega=omega, use_gpu_memory=self.use_gpu_memory)
log.timer_debug1('cholesky_eri', *t0)
self.intopt = intopt
Expand Down Expand Up @@ -144,8 +144,8 @@ def get_blksize(self, extra=0, nao=None):
return blksize

def loop(self, blksize=None, unpack=True):
''' loop over cderi for the current device
and unpack the CDERI in (Lij) format
'''
device_id = cupy.cuda.Device().id
cderi_sparse = self._cderi[device_id]
Expand Down Expand Up @@ -177,10 +177,10 @@ def loop(self, blksize=None, unpack=True):
yield buf2, buf.T
if isinstance(cderi_sparse, np.ndarray):
cupy.cuda.Device().synchronize()

if buf_prefetch is not None:
buf = buf_prefetch

def reset(self, mol=None):
'''Reset mol and clean up relevant attributes for scanner mode'''
if mol is not None:
@@ -198,7 +198,7 @@ def reset(self, mol=None):
get_ao_eri = get_eri = NotImplemented
get_mo_eri = ao2mo = NotImplemented

def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
omega=None, sr_only=False, use_gpu_memory=True):
'''
Returns:
@@ -210,13 +210,13 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,

# Available memory on Device 0.
avail_mem = get_avail_mem()

if use_gpu_memory:
# CDERI will be equally distributed to the devices
# Other devices usually have more memory available than Device 0
# CDERI will use up to 40% of the available memory
use_gpu_memory = naux * npairs * 8 < 0.4 * avail_mem * _num_devices

if use_gpu_memory:
log.debug("Saving CDERI on GPU")
else:
@@ -235,7 +235,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
_cderi[device_id] = cderi_blk

npairs_per_ctr = [len(intopt.ao_pairs_row[cp_ij_id]) for cp_ij_id in range(len(intopt.log_qs))]

npairs_per_ctr = np.array(npairs_per_ctr)
total_task_list = np.argsort(npairs_per_ctr)
task_list_per_device = []
@@ -253,13 +253,13 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
future = executor.submit(_cderi_task, intopt, cd_low_f, task_list, _cderi,
omega=omega, sr_only=sr_only, device_id=device_id)
futures.append(future)

for future in futures:
future.result()

if not use_gpu_memory:
cupy.cuda.Device().synchronize()

return _cderi

def _cderi_task(intopt, cd_low, task_list, _cderi, omega=None, sr_only=False, device_id=0):
@@ -273,6 +273,7 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, omega=None, sr_only=False, device_id=0):
pairs_loc = np.append(0, np.cumsum(npairs))
blksize = (naux + _num_devices - 1) // _num_devices
with cupy.cuda.Device(device_id), _streams[device_id]:
+ assert isinstance(mol.verbose, int)
log = logger.new_logger(mol, mol.verbose)
t1 = log.init_timer()
cd_low_tag = cd_low.tag
Expand Down Expand Up @@ -320,7 +321,7 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, omega=None, sr_only=False, de

row = intopt.ao_pairs_row[cp_ij_id] - i0
col = intopt.ao_pairs_col[cp_ij_id] - j0

ints_slices_f = cupy.empty([naoaux,len(row)], order='F')
ints_slices_f[:] = ints_slices[:,col,row]
ints_slices = None
@@ -330,12 +331,12 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, omega=None, sr_only=False, device_id=0):
elif cd_low_tag == 'cd':
cderi_block = solve_triangular(cd_low, ints_slices_f, lower=True, overwrite_b=True)
else:
- RuntimeError('Tag is not found in lower triangular matrix.')
+ raise RuntimeError('Tag is not found in lower triangular matrix.')
t1 = log.timer_debug1(f'solve {cp_ij_id} / {nq} on Device {device_id}', *t1)

# TODO:
# 1) async data transfer
# 2) auxiliary basis in the last dimension

# if CDERI is saved on CPU
ij0 = pairs_loc[cp_ij_id]
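The hunk at @@ -253,13 above shows how `cholesky_eri_gpu` spreads its integral work over GPUs: tasks are sorted by AO-pair count via `np.argsort(npairs_per_ctr)` and dealt out to devices, one executor thread per device. The strided round-robin split in this sketch is an assumption (the diff is cut off right after `task_list_per_device = []`), but it matches the sorted setup that is visible; the task sizes in the example are made up:

```python
import numpy as np

def distribute_tasks(task_sizes, num_devices):
    """Deal tasks out in ascending-size order, one device at a time,
    so every device receives a mix of small and large tasks."""
    order = np.argsort(task_sizes)      # mirrors np.argsort(npairs_per_ctr)
    return [order[d::num_devices] for d in range(num_devices)]

# Hypothetical AO-pair counts for eight shell-pair tasks:
sizes = np.array([120, 7, 356, 42, 980, 15, 233, 61])
for dev, tasks in enumerate(distribute_tasks(sizes, 2)):
    print(f"Device {dev}: tasks {tasks.tolist()}, {int(sizes[tasks].sum())} pairs")
```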
11 changes: 7 additions & 4 deletions gpu4pyscf/df/df_jk.py
@@ -249,6 +249,7 @@ def _jk_task_with_mo(dfobj, dms, mo_coeff, mo_occ,
''' Calculate J and K matrices on single GPU
'''
with cupy.cuda.Device(device_id), _streams[device_id]:
+ assert isinstance(dfobj.verbose, int)
log = logger.new_logger(dfobj.mol, dfobj.verbose)
t0 = log.init_timer()
dms = cupy.asarray(dms)
@@ -313,6 +314,7 @@ def _jk_task_with_mo1(dfobj, dms, mo1s, occ_coeffs,
'''
vj = vk = None
with cupy.cuda.Device(device_id), _streams[device_id]:
+ assert isinstance(dfobj.verbose, int)
log = logger.new_logger(dfobj.mol, dfobj.verbose)
t0 = log.init_timer()
dms = cupy.asarray(dms)
@@ -373,6 +375,7 @@ def _jk_task_with_dm(dfobj, dms, with_j=True, with_k=True, hermi=0, device_id=0):
''' Calculate J and K matrices with density matrix
'''
with cupy.cuda.Device(device_id), _streams[device_id]:
+ assert isinstance(dfobj.verbose, int)
log = logger.new_logger(dfobj.mol, dfobj.verbose)
t0 = log.init_timer()
dms = cupy.asarray(dms)
@@ -404,7 +407,7 @@ def _jk_task_with_dm(dfobj, dms, with_j=True, with_k=True, hermi=0, device_id=0):
for k in range(nset):
rhok = contract('Lij,jk->Lki', cderi, dms[k]).reshape([-1,nao])
#vk[k] += contract('Lki,Lkj->ij', rhok, cderi)
vk[k] += cupy.dot(rhok.T, cderi.reshape([-1,nao]))
if with_j:
vj = cupy.zeros(dms_shape)
vj[:,rows,cols] = vj_sparse
@@ -437,7 +440,7 @@ def get_jk(dfobj, dms_tag, hermi=0, with_j=True, with_k=True, direct_scf_tol=1e-

assert nao == dfobj.nao
intopt = dfobj.intopt

nao = dms_tag.shape[-1]
dms = dms_tag.reshape([-1,nao,nao])
intopt = dfobj.intopt
@@ -456,7 +459,7 @@ def get_jk(dfobj, dms_tag, hermi=0, with_j=True, with_k=True, direct_scf_tol=1e-
with ThreadPoolExecutor(max_workers=_num_devices) as executor:
for device_id in range(_num_devices):
future = executor.submit(
_jk_task_with_mo,
dfobj, dms, mo_coeff, mo_occ,
hermi=hermi, device_id=device_id,
with_j=with_j, with_k=with_k)
@@ -477,7 +480,7 @@ def get_jk(dfobj, dms_tag, hermi=0, with_j=True, with_k=True, direct_scf_tol=1e-
with ThreadPoolExecutor(max_workers=_num_devices) as executor:
for device_id in range(_num_devices):
future = executor.submit(
_jk_task_with_mo1,
dfobj, dms, mo1s, occ_coeffs,
hermi=hermi, device_id=device_id,
with_j=with_j, with_k=with_k)
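Both `_jk_task_with_dm` above and `_jk_task` in grad/jk.py below work with a packed density matrix: only the unique lower-triangle pairs from `intopt.cderi_row`/`intopt.cderi_col` are kept, and the diagonal entries are scaled by 0.5 so that a single sum over unique pairs reproduces the full symmetric contraction. A small self-contained check of that identity; the names are illustrative, and the off-diagonal factor of two is written out explicitly here, whereas the library applies the corresponding factor elsewhere in the pipeline:

```python
import numpy as np

nao = 4
rng = np.random.default_rng(0)
dm = rng.standard_normal((nao, nao))
dm = dm + dm.T                         # symmetric density matrix
ints = rng.standard_normal((nao, nao))
ints = ints + ints.T                   # one aux component of (ij|L), symmetric in ij

rows, cols = np.tril_indices(nao)      # unique pairs, as in intopt.cderi_row/col
dm_sparse = 2.0 * dm[rows, cols]       # off-diagonal pairs count twice...
dm_sparse[rows == cols] *= 0.5         # ...the diagonal only once

full = np.einsum('ij,ij->', dm, ints)  # full symmetric contraction
packed = dm_sparse @ ints[rows, cols]  # single pass over unique pairs
assert np.allclose(full, packed)
```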
9 changes: 5 additions & 4 deletions gpu4pyscf/df/grad/jk.py
@@ -25,6 +25,7 @@ def _jk_task(with_df, dm, orbo, with_j=True, with_k=True, device_id=0):
rhoj = rhok = None
with cupy.cuda.Device(device_id), _streams[device_id]:
log = logger.new_logger(with_df.mol, with_df.verbose)
+ assert isinstance(with_df.verbose, int)
t0 = log.init_timer()
dm = cupy.asarray(dm)
orbo = cupy.asarray(orbo)
@@ -34,7 +35,7 @@ def _jk_task(with_df, dm, orbo, with_j=True, with_k=True, device_id=0):
cols = with_df.intopt.cderi_col
dm_sparse = dm[rows, cols]
dm_sparse[with_df.intopt.cderi_diag] *= .5

blksize = with_df.get_blksize()
if with_j:
rhoj = cupy.empty([naux_slice])
Expand Down Expand Up @@ -65,18 +66,18 @@ def get_rhoj_rhok(with_df, dm, orbo, with_j=True, with_k=True):
_jk_task, with_df, dm, orbo,
with_j=with_j, with_k=with_k, device_id=device_id)
futures.append(future)

rhoj_total = []
rhok_total = []
for future in futures:
rhoj, rhok = future.result()
rhoj_total.append(rhoj)
rhok_total.append(rhok)

rhoj = rhok = None
if with_j:
rhoj = concatenate(rhoj_total)
if with_k:
rhok = concatenate(rhok_total)

return rhoj, rhok
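`get_rhoj_rhok` collects one future per device and concatenates the per-device results: each device computes only a contiguous slice of the auxiliary dimension (`naux_slice`), mirroring the `blksize = (naux + _num_devices - 1) // _num_devices` split seen in `_cderi_task`. A schematic of that slice-and-concatenate reduction; numpy stands in for cupy so the sketch runs anywhere, and the per-slice work is a placeholder:

```python
import numpy as np

def rho_on_device(arr, device_id, num_devices):
    # Each device owns one contiguous block of the auxiliary axis.
    naux = arr.shape[0]
    blksize = (naux + num_devices - 1) // num_devices
    p0 = device_id * blksize
    p1 = min(p0 + blksize, naux)
    return arr[p0:p1] * 2.0   # placeholder for the real per-slice contraction

naux, num_devices = 10, 3
rho_full = np.arange(naux, dtype=float)
pieces = [rho_on_device(rho_full, d, num_devices) for d in range(num_devices)]
rho = np.concatenate(pieces)  # same reduction as concatenate(rhoj_total)
assert np.allclose(rho, rho_full * 2.0)
```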
