Merge branch 'master' into feat/gpudavidson

bytedance · Jan 17, 2025 · 86283f6 · 86283f6
2 parents f107a07 + 1233861
commit 86283f6
Show file tree

Hide file tree

Showing 82 changed files with 6,616 additions and 2,058 deletions.
diff --git a/.github/workflows/nightly_build.yml b/.github/workflows/nightly_build.yml
@@ -14,7 +14,7 @@ permissions:
 jobs:
   build:
 
-    runs-on: self-hosted
+    runs-on: [self-hosted, Linux, X64, v100]
 
     steps:
     - uses: actions/checkout@v3
@@ -23,6 +23,7 @@ jobs:
         pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
         python3 -m pip install --upgrade pip
         pip3 install flake8 pytest coverage pytest-cov pyscf-dispersion
+        pip3 install pytest-benchmark
         pip3 install pyscf --upgrade
         pip3 install numpy --upgrade
         pip3 install scipy --upgrade
@@ -35,8 +36,13 @@ jobs:
         export PATH=${CUDA_HOME}/bin:${PATH}
         export LD_LIBRARY_PATH=${CUDA_HOME}/lib64:$LD_LIBRARY_PATH
         sh build.sh
-    - name: Smoke Test
+    - name: Test RKS
       run: |
         echo $GITHUB_WORKSPACE
         export PYTHONPATH="${PYTHONPATH}:$(pwd)"
-        pytest --durations=0
+        pytest gpu4pyscf/tests/test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=v1.3.0_rks_1v100 --benchmark-storage=gpu4pyscf/tests/benchmark_results/
+    - name: Test UKS
+      run: |
+        echo $GITHUB_WORKSPACE
+        export PYTHONPATH="${PYTHONPATH}:$(pwd)"
+        pytest gpu4pyscf/tests/test_benchmark_uks.py -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=v1.3.0_uks_1v100 --benchmark-storage=gpu4pyscf/tests/benchmark_results/
diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml
@@ -21,6 +21,7 @@ jobs:
       run: |
         pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
         python3 -m pip install --upgrade pip
+        pip3 install pytest-benchmark
         pip3 install flake8 pytest coverage pytest-cov pyscf-dispersion
         pip3 install pyscf --upgrade
         pip3 install git+https://github.com/pyscf/properties --upgrade
@@ -38,7 +39,7 @@ jobs:
       run: |
         echo $GITHUB_WORKSPACE
         export PYTHONPATH="${PYTHONPATH}:$(pwd)"
-        pytest -m "not smoke" --cov=$GITHUB_WORKSPACE
+        pytest -m "not benchmark" --cov=$GITHUB_WORKSPACE
 
   multi-gpu:
     runs-on: [self-hosted, Linux, X64, 2T4]
@@ -48,6 +49,7 @@ jobs:
       run: |
         pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
         python3 -m pip install --upgrade pip
+        pip3 install pytest-benchmark
         pip3 install flake8 pytest coverage pytest-cov pyscf-dispersion
         pip3 install pyscf --upgrade
         pip3 install git+https://github.com/pyscf/properties --upgrade
@@ -65,4 +67,4 @@ jobs:
       run: |
         echo $GITHUB_WORKSPACE
         export PYTHONPATH="${PYTHONPATH}:$(pwd)"
-        pytest -m "not smoke" --cov=$GITHUB_WORKSPACE
+        pytest -m "not benchmark" --cov=$GITHUB_WORKSPACE
diff --git a/.gitignore b/.gitignore
@@ -6,6 +6,7 @@
 **/build
 **/launch_logs
 **/deps
+**/.benchmarks
 core
 **tmp*
 *.egg-info/

diff --git a/CHANGELOG b/CHANGELOG
@@ -0,0 +1,98 @@
+v1.3.0 (2025-01-07)
+-------------------
+* New Features
+  - PBC analytical Fourier transform on GPU
+* Improvements
+  - Optimized computation efficiency and memory footprint for density fitting Hessian
+  - Support pickle serialization for most classes (SCF, DF, PCM, etc.)
+  - Efficiency of moving CuPy arrays between GPU cards
+
+
+v1.2.1 (2024-12-20)
+-------------------
+* New Features
+  - Change the license from GPL v3.0 to Apache 2.0
+  - Multi-GPU support for SCF, Gradients, and Hessian computation using AO-direct algorithm
+  - Add PBC HF and DFT with k-points, UHF/UKS, and density fitting
+* Improvements
+  - Change the default conv_tol_cpscf = 1e-3 / batch of atoms to conv_tol_cpscf = 1e-6 / atom
+  - Fix numerical instability in complex-valued TDHF diagonalization
+  - Improve PCM and QMMM with int1e_grids kernel
+  - Support non-symmetric int3c2e integral
+  - Optimize Hessian calculation with direct SCF
+  - Improve the numerical stability of int3c2e for point charge
+  - Add CI workflow for multi-GPU
+* Fixes
+  - Fix non-contiguous array error in p2p transfer between GPUs.
+  - Fix bugs in NMR calculations
+
+
+v1.2.0 (2024-12-09)
+-------------------
+* New Features
+  - Spin-conserved TDA and TDDFT methods
+  - Spin-flip TDA method.
+  - J-engine using McMurchie-Davidson integral algorithm
+  - Support multi-GPU density fitting energy, gradients and Hessian computation.
+  - Second order SCF solver
+* Improvements
+  - Support non-hermitian density matrix in J/K builder
+  - Secondary grids for CPHF solver
+  - 3-center integral computation efficiency for gradients and hessian
+  - One-electron Coulomb integrals against point charges and Gaussian charge distributions on grids.
+  - Automatically apply SCF initial guess from existing wavefunction
+
+
+v1.1.0 (2024-10-29)
+-------------------
+* New Features
+  - Add esp charge and resp charge by @wxj6000 in #208
+  - New Rys kernel by @sunqm in #221
+  - Optimize nuclear gradients using new Rys kernel by @sunqm in #224
+  - GPU kernel for analytical hessian by @sunqm in #227
+  - Add QM/MM by @MoleOrbitalHybridAnalyst in #218
+* Improvements
+  - Improved compatibility with pyscf 2.7.0 by @wxj6000 in #216
+  - Add skipping SCF cycles by @kvkarandashev in #229
+  - Skip building gint, gvhf, ... when building libxc by @wxj6000 in #210
+* Bugfix
+  - Typo in build_wheels.sh by @wxj6000 in #209
+  - Typo in dft_driver.py by @wxj6000 in #220
+  - Bugfix: cusolver error when specifying gpu by @wxj6000 in #213
+  - Bugfix: error in int2c2e by @wxj6000 in #212
+  - Bugfix: inconsistent gradient with CPU. Improved to_cpu, uks gradient, and grid_response by @wxj6000 in #230
+  - Bugfix: recompute int3c2e in DF UHF by @wxj6000 in #226
+* New Contributors
+  - @MoleOrbitalHybridAnalyst made their first contribution in #218
+  - @kvkarandashev made their first contribution in #229
+
+
+v1.0.2 (2024-09-03)
+-------------------
+* Bugfix: append data in h5 file by @wxj6000 in #200
+* Support customized CHELPG radii by @wxj6000 in #202
+* Add cupy installation guide for developer installation instructions by @henryw7 in #204
+* Bugfix: save density when spin unrestricted by @wxj6000 in #205
+* Add chkfile support for pysisyphus by @henryw7 in #203
+
+
+v1.0.1 (2024-08-24)
+-------------------
+* Bugfix in rks.reset by @wxj6000 in #191. The bug leads to the failure of geometry optimization with direct SCF (#190)
+* Bugfix when CUDA unified memory is disabled. Removed CUDA unified memory in libxc, and reduced the overhead in calling libxc by @wxj6000 in #180, #189
+* Bugfix and Improvement in opt_driver by @wxj6000 in #187 #197
+* Support SMD in opt_driver and dft driver by @liuyu-chem1996 in #196
+* Support thermo calculation in dft_driver by @liuyu-chem1996 in #192
+
+
+v1.0.0 (2024-07-23)
+-------------------
+Released features:
+* Density fitting scheme and direct SCF scheme
+* SCF, analytical gradient, and analytical Hessian calculations for Hartree-Fock and DFT
+* Spin-conserved and spin-flip TDA and TDDFT for excited states
+* Nonlocal functional correction (vv10) for SCF and gradient
+* PCM models, SMD model, their analytical gradients, and semi-analytical Hessian matrix
+* Unrestricted Hartree-Fock and unrestricted DFT, gradient, and Hessian
+* MP2/DF-MP2 and CCSD (experimental)
+* Polarizability, IR, and NMR shielding (experimental)
diff --git a/benchmarks/cupy_helper/benchmark_memory_copy.py b/benchmarks/cupy_helper/benchmark_memory_copy.py
@@ -0,0 +1,128 @@
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+'''
+Benchmark different ways of transferring data between pinned host memory
+and device memory, between two devices, and of reducing arrays across devices.
+
+Each case is timed with cupyx.profiler.benchmark and reported as the mean
+GPU time together with the effective bandwidth.
+'''
+
+import numpy as np
+import cupy as cp
+from cupyx import profiler
+from gpu4pyscf.lib.cupy_helper import copy_array, reduce_to_device
+
+N_REPEAT = 20
+N_WARMUP = 3
+
+def report(label, func, args, nbytes):
+    '''Benchmark func(*args) and print its timing and effective bandwidth.
+
+    label:  text printed in front of the mean GPU time
+    func:   callable to be timed
+    args:   tuple of positional arguments passed to func
+    nbytes: number of bytes moved by one call, used for the bandwidth
+    '''
+    perf = profiler.benchmark(func, args, n_repeat=N_REPEAT, n_warmup=N_WARMUP)
+    t_kernel = perf.gpu_times.mean()
+    bandwidth = nbytes / t_kernel / 1e9
+    print(label, t_kernel)
+    print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")
+
+# Host array backed by pinned memory
+host_array = cp.cuda.alloc_pinned_memory(512*512*512 * 8)
+big_host_data = np.ndarray(512**3, dtype=cp.float64, buffer=host_array)
+big_host_data = big_host_data.reshape(512,512,512)
+# Assign rather than accumulate: the buffer is not initialized by this script
+big_host_data[:] = np.random.rand(512,512,512)
+
+# Device array
+big_device_data = cp.empty_like(big_host_data)
+
+# Create views on both arrays
+host_view = big_host_data[:, 128:]  # Non-contiguous view on the host
+device_view = big_device_data[:, 128:]  # Non-contiguous view on the device
+
+print("Host View Shape:", host_view.shape)
+print("Device View Shape:", device_view.shape)
+
+print("------ Benchmark host to device transfer ----------")
+# Arguments are passed as (source, destination), like the cupy baseline below
+report('Using custom function', copy_array, (host_view, device_view), host_view.nbytes)
+
+def cupy_copy_h2d(c, out):
+    out[:] = cp.asarray(c)
+    return out
+report('Using cupy function', cupy_copy_h2d, (host_view, device_view), host_view.nbytes)
+
+print("------- Benchmark device to host transfer ---------")
+report('Using custom function', copy_array, (device_view, host_view), host_view.nbytes)
+
+def cupy_copy_d2h(c, out):
+    out[:] = c.get()
+    return out
+report('Using cupy function', cupy_copy_d2h, (device_view, host_view), host_view.nbytes)
+
+print("-------- Benchmark device to device transfer (non-contiguous) ---------")
+
+# The benchmarks below place arrays on device 0 and device 1
+_num_devices = cp.cuda.runtime.getDeviceCount()
+if _num_devices < 2:
+    raise SystemExit('The remaining benchmarks require at least 2 GPUs')
+
+with cp.cuda.Device(0):
+    a = cp.random.rand(512,512,512)
+    device0_view = a[:,128:]
+with cp.cuda.Device(1):
+    b = cp.random.rand(512,512,512)
+    device1_view = b[:,128:]
+report('Using custom function', copy_array, (device0_view, device1_view), device0_view.nbytes)
+
+assert np.linalg.norm(device0_view.get() - device1_view.get()) < 1e-10
+
+def cupy_copy_d2d(c, out):
+    # Reference implementation: stage the data through host memory
+    with cp.cuda.Device(out.device):
+        out[:] = cp.asarray(c.get())
+    return out
+report('Using cupy function', cupy_copy_d2d, (device0_view, device1_view), device0_view.nbytes)
+
+print("-------- Benchmark device to device transfer (contiguous) ---------")
+# The full arrays are copied here, so the bandwidth is based on a.nbytes
+report('Using custom function', copy_array, (a, b), a.nbytes)
+
+def cupy_copy_contiguous(a, b):
+    b[:] = a
+report('Cupy copy contiguous array', cupy_copy_contiguous, (a, b), a.nbytes)
+
+def cupy_asarray_contiguous(a, b):
+    # NOTE(review): the result is only bound to the local name b, the caller's
+    # array is left untouched -- confirm that this is the intended baseline
+    with cp.cuda.Device(b.device):
+        b = cp.asarray(a)
+report('Cupy set contiguous array', cupy_asarray_contiguous, (a, b), a.nbytes)
+
+assert np.linalg.norm(a.get() - b.get()) < 1e-10
+
+
+print('----------- Benchmark reduction across devices ------ ')
+a_dist = []
+for device_id in range(_num_devices):
+    with cp.cuda.Device(device_id):
+        a_dist.append(cp.random.rand(512,512,512))
+
+report('Using reduce_to_device', reduce_to_device, (a_dist,), a_dist[0].nbytes * _num_devices)
diff --git a/examples/00-h2o.py b/examples/00-h2o.py
@@ -36,12 +36,12 @@
     atom=atom,                         # water molecule
     basis='def2-tzvpp',                # basis set
     output='./pyscf.log',              # save log file
-    verbose=6                         # control the level of print info
+    verbose=6                          # control the level of print info
     )
 
 mf_GPU = rks.RKS(                      # restricted Kohn-Sham DFT
     mol,                               # pyscf.gto.object
-    xc='b3lyp'                        # xc funtionals, such as pbe0, wb97m-v, tpss,
+    xc='b3lyp'                         # xc functionals, such as pbe0, wb97m-v, tpss,
     ).density_fit()                    # density fitting
 
 mf_GPU.grids.atom_grid = (99,590)      # (99,590) lebedev grids, (75,302) is often enough
@@ -51,7 +51,7 @@
 
 # Compute Energy
 e_dft = mf_GPU.kernel()
-print(f"total energy = {e_dft}") # -76.26736519501688
+print(f"total energy = {e_dft}")       # -76.46668196729536
 
 # Compute Gradient
 g = mf_GPU.nuc_grad_method()

diff --git a/examples/02-h2o_geomopt.py b/examples/02-h2o_geomopt.py
@@ -43,4 +43,4 @@ def callback(envs):
 mol_eq = optimize(mf_GPU, maxsteps=20, callback=callback)
 print("Optimized coordinate:")
 print(mol_eq.atom_coords())
-print('geometry optimization took', time.time() - start_time, 's')
+print('Geometry optimization took', time.time() - start_time, 's')
diff --git a/examples/04-h2o_esp.py b/examples/04-h2o_esp.py
@@ -21,6 +21,7 @@
 import numpy as np
 from pyscf import gto
 from gpu4pyscf.dft import rks
+from gpu4pyscf.gto.int3c1e import int1e_grids
 
 atom ='''
 O       0.0000000000    -0.0000000000     0.1174000000
@@ -33,10 +34,8 @@
 mf.kernel()
 dm = mf.make_rdm1()  # compute one-electron density matrix
 
-# Use default mesh grids
-coords = mf.grids.coords.get()
+# Use default Lebedev grids
+coords = mf.grids.coords
 
-# The efficiency can be improved if needed
-from pyscf import df
-fakemol = gto.fakemol_for_charges(coords)
-v = np.einsum('ijp,ij->p', df.incore.aux_e2(mol, fakemol), dm)
+# Calculate electrostatic potential
+v = int1e_grids(mol, coords, dm=dm) # performing 'ijp,ij->p' efficiently
diff --git a/examples/05-h2o_multipole_moment.py b/examples/05-h2o_multipole_moment.py
@@ -32,10 +32,10 @@
 mf.kernel()
 dm = mf.make_rdm1()
 
-dip = mf.dip_moment(unit='DEBYE', dm=dm.get())
+dip = mf.dip_moment(unit='DEBYE', dm=dm)
 print('dipole moment:')
 print(dip)
 
-quad = mf.quad_moment(unit='DEBYE-ANG', dm=dm.get())
+quad = mf.quad_moment(unit='DEBYE-ANG', dm=dm)
 print('quadrupole moment:')
 print(quad)
diff --git a/examples/14-pcm_solvent.py b/examples/14-pcm_solvent.py
@@ -31,9 +31,9 @@
 mf = rks.RKS(mol, xc='HYB_GGA_XC_B3LYP').density_fit()
 mf = mf.PCM()
 mf.grids.atom_grid = (99,590)
-mf.with_solvent.lebedev_order = 29 # 302 Lebedev grids
-mf.with_solvent.method = 'IEF-PCM'
-mf.with_solvent.eps = 78.3553
+mf.with_solvent.lebedev_order = 29  # 302 Lebedev grids
+mf.with_solvent.method = 'IEF-PCM'   # Can be C-PCM, SS(V)PE, COSMO
+mf.with_solvent.eps = 78.3553        # Dielectric constant
 mf.kernel()
 
 gradobj = mf.nuc_grad_method()