From ca953ed24d000a4e3f3e266309b4d3dfb2b0c3cd Mon Sep 17 00:00:00 2001 From: Qiming Sun Date: Mon, 16 Oct 2023 14:44:30 -0700 Subject: [PATCH 01/19] cuda12 for libxc wheel --- .github/workflows/libxc_wheel.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/libxc_wheel.yml b/.github/workflows/libxc_wheel.yml index 13540ec3..a6f8f73f 100644 --- a/.github/workflows/libxc_wheel.yml +++ b/.github/workflows/libxc_wheel.yml @@ -6,15 +6,18 @@ on: jobs: release-pypi-linux: runs-on: ubuntu-latest - env: - img: wxj6000/manylinux2014:cuda118 + strategy: + matrix: + cuda-version: + - cuda118 + - cuda121 steps: - name: Checkout uses: actions/checkout@v3 - name: Build wheels run: | docker run --rm -v ${{ github.workspace }}:/gpu4pyscf:rw \ - ${{ env.img }} \ + wxj6000/manylinux2014:${{ matrix.cuda-version }} \ bash -exc 'sh /gpu4pyscf/builder/build_libxc.sh' - name: List available wheels run: | From 6a686897c81894b932eab82c8603b9a700544867 Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Mon, 16 Oct 2023 15:05:59 -0700 Subject: [PATCH 02/19] support various einsum functions (#44) * remove cutensor, cublas, cusolver in wheels * fixed UHF in __init__.py * support various einsum * correct for linter --- examples/13-einsum_engine.py | 34 ++++++++++++++++++++ gpu4pyscf/df/df_jk.py | 6 ++-- gpu4pyscf/df/grad/rhf.py | 1 - gpu4pyscf/lib/cupy_helper.py | 33 +++++++++---------- gpu4pyscf/lib/cutensor.py | 61 +++++++++++++++++++++++++----------- 5 files changed, 96 insertions(+), 39 deletions(-) create mode 100644 examples/13-einsum_engine.py diff --git a/examples/13-einsum_engine.py b/examples/13-einsum_engine.py new file mode 100644 index 00000000..f3954222 --- /dev/null +++ b/examples/13-einsum_engine.py @@ -0,0 +1,34 @@ +# gpu4pyscf is a plugin to use Nvidia GPU in PySCF package +# +# Copyright (C) 2022 Qiming Sun +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License 
as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import os +os.environ['CONTRACT_ENGINE'] = 'opt_einsum' # 'cupy', 'cuquantum' + +import pyscf +from gpu4pyscf.dft import rks + +atom =''' +O 0.0000000000 -0.0000000000 0.1174000000 +H -0.7570000000 -0.0000000000 -0.4696000000 +H 0.7570000000 0.0000000000 -0.4696000000 +''' + +mol = pyscf.M(atom=atom, basis='def2-tzvpp') +mf = rks.RKS(mol, xc='LDA').density_fit() + +e_dft = mf.kernel() # compute total energy +print(f"total energy = {e_dft}") \ No newline at end of file diff --git a/gpu4pyscf/df/df_jk.py b/gpu4pyscf/df/df_jk.py index fdd857c5..d52dfff2 100644 --- a/gpu4pyscf/df/df_jk.py +++ b/gpu4pyscf/df/df_jk.py @@ -162,7 +162,7 @@ def get_veff(self, mol=None, dm=None, dm_last=None, vhf_last=0, hermi=1): ''' if mol is None: mol = self.mol if dm is None: dm = self.make_rdm1() - + # for DFT if mf_class == rks.RKS: return rks.get_veff(self, dm=dm) @@ -248,7 +248,7 @@ def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e- outputs and input are on the same device TODO: separate into three cases: j only, k only, j and k ''' - + log = logger.new_logger(dfobj.mol, dfobj.verbose) out_shape = dms_tag.shape out_cupy = isinstance(dms_tag, cupy.ndarray) @@ -290,7 +290,7 @@ def get_j(cderi_sparse): vj_tmp[:,cols,rows] = vj_sparse vj_sparse = None return vj_tmp - + # SCF K matrix with occ if nset == 1 and hasattr(dms_tag, 'occ_coeff'): occ_coeff = cupy.asarray(dms_tag.occ_coeff[ao_idx, :], order='C') diff --git a/gpu4pyscf/df/grad/rhf.py b/gpu4pyscf/df/grad/rhf.py index 
0598418c..292d9309 100644 --- a/gpu4pyscf/df/grad/rhf.py +++ b/gpu4pyscf/df/grad/rhf.py @@ -16,7 +16,6 @@ import numpy import cupy -import pyscf from cupyx.scipy.linalg import solve_triangular from pyscf.df.grad import rhf from pyscf.lib import logger diff --git a/gpu4pyscf/lib/cupy_helper.py b/gpu4pyscf/lib/cupy_helper.py index 412e74d5..9f2cef59 100644 --- a/gpu4pyscf/lib/cupy_helper.py +++ b/gpu4pyscf/lib/cupy_helper.py @@ -24,7 +24,8 @@ from gpu4pyscf.gto import mole from gpu4pyscf.lib.cutensor import contract from gpu4pyscf.lib.cusolver import eigh, cholesky #NOQA -LMAX_ON_GPU = 8 + +LMAX_ON_GPU = 6 DSOLVE_LINDEP = 1e-15 c2s_l = mole.get_cart2sph(lmax=LMAX_ON_GPU) @@ -64,7 +65,7 @@ def print_mem_info(): #mem_stack = stack_size_per_thread GB = 1024 * 1024 * 1024 print(f'mem_avail: {mem_avail/GB:.3f} GB, total_mem: {total_mem/GB:.3f} GB, used_mem: {used_mem/GB:.3f} GB,mem_limt: {mem_limit/GB:.3f} GB') - + def get_avail_mem(): mempool = cupy.get_default_memory_pool() used_mem = mempool.used_bytes() @@ -83,7 +84,7 @@ def device2host_2d(a_cpu, a_gpu, stream=None): libcupy_helper.async_d2h_2d( ctypes.cast(stream.ptr, ctypes.c_void_p), a_cpu.ctypes.data_as(ctypes.c_void_p), - ctypes.c_int(a_cpu.strides[0]), + ctypes.c_int(a_cpu.strides[0]), ctypes.cast(a_gpu.data.ptr, ctypes.c_void_p), ctypes.c_int(a_gpu.strides[0]), ctypes.c_int(a_gpu.shape[0]), @@ -146,7 +147,7 @@ def add_sparse(a, b, indices): ''' n = a.shape[0] m = b.shape[0] - + err = libcupy_helper.add_sparse( ctypes.cast(a.data.ptr, ctypes.c_void_p), ctypes.cast(b.data.ptr, ctypes.c_void_p), @@ -205,7 +206,7 @@ def block_diag(blocks, out=None): rows = np.cumsum(np.asarray([0] + [x.shape[0] for x in blocks])) cols = np.cumsum(np.asarray([0] + [x.shape[1] for x in blocks])) offsets = np.cumsum(np.asarray([0] + [x.shape[0]*x.shape[1] for x in blocks])) - + m, n = rows[-1], cols[-1] if out is None: out = cupy.zeros([m, n]) rows = cupy.asarray(rows, dtype='int32') @@ -227,7 +228,7 @@ def block_diag(blocks, 
out=None): if err != 0: raise RuntimeError('failed in block_diag kernel') return out - + def take_last2d(a, indices, out=None): ''' reorder the last 2 dimensions with 'indices', the first n-2 indices do not change @@ -303,7 +304,7 @@ def cart2sph(t, axis=0, ang=1, out=None): ''' transform 'axis' of a tensor from cartesian basis into spherical basis ''' - if(ang <= 1): + if(ang <= 1): if(out is not None): out[:] = t return t size = list(t.shape) @@ -314,9 +315,9 @@ def cart2sph(t, axis=0, ang=1, out=None): i0 = max(1, np.prod(size[:axis])) i3 = max(1, np.prod(size[axis+1:])) out_shape = size[:axis] + [nli*li_size[1]] + size[axis+1:] - + t_cart = t.reshape([i0*nli, li_size[0], i3]) - if(out is not None): + if(out is not None): out = out.reshape([i0*nli, li_size[1], i3]) t_sph = contract('min,ip->mpn', t_cart, c2s, out=out) return t_sph.reshape(out_shape) @@ -364,7 +365,7 @@ def krylov(aop, b, x0=None, tol=1e-10, max_cycle=30, dot=cupy.dot, if not (isinstance(b, cupy.ndarray) and b.ndim == 1): b = cupy.asarray(b) - + if x0 is None: x1 = b else: @@ -402,7 +403,7 @@ def krylov(aop, b, x0=None, tol=1e-10, max_cycle=30, dot=cupy.dot, ax.extend(axt) if callable(callback): callback(cycle, xs, ax) - + x1 = axt.copy() for i in range(len(xs)): xsi = cupy.asarray(xs[i]) @@ -419,22 +420,22 @@ def krylov(aop, b, x0=None, tol=1e-10, max_cycle=30, dot=cupy.dot, idx.append(i) innerprod.append(innerprod1) log.debug('krylov cycle %d r = %g', cycle, max_innerprod**.5) - + if max_innerprod < lindep or max_innerprod < tol**2: break x1 = x1[idx] - + xs = cupy.asarray(xs) ax = cupy.asarray(ax) nd = cycle + 1 h = cupy.einsum('in,jn->ij', xs, ax) - + # Add the contribution of I in (1+a) h += cupy.diag(cupy.asarray(innerprod[:nd])) g = cupy.zeros((nd,nroots), dtype=x1.dtype) - + if b.ndim == 1: g[0] = innerprod[0] else: @@ -447,7 +448,7 @@ def krylov(aop, b, x0=None, tol=1e-10, max_cycle=30, dot=cupy.dot, for j in range(nroots): g[i,j] = cupy.dot(xsi.conj(), b[j]) ''' - + c = 
cupy.linalg.solve(h, g) x = _gen_x0(c, cupy.asarray(xs)) if b.ndim == 1: diff --git a/gpu4pyscf/lib/cutensor.py b/gpu4pyscf/lib/cutensor.py index c7703005..c590b7f6 100644 --- a/gpu4pyscf/lib/cutensor.py +++ b/gpu4pyscf/lib/cutensor.py @@ -13,9 +13,6 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . - -import os -import ctypes import numpy as np import cupy from cupy._environment import _preload_libs @@ -31,9 +28,6 @@ except Exception: continue -if libcutensor is None: - print('cannot find cutensor') - _handle = Handle() _modes = {} _contraction_descriptors = {} @@ -50,7 +44,7 @@ def _create_mode_with_cache(mode): else: raise TypeError('Cannot create tensor mode: {}'.format(type(x))) key = tuple(integer_mode) - + if key in _modes: mode = _modes[key] else: @@ -70,11 +64,11 @@ def create_contraction_descriptor(handle, desc_a.ptr, mode_a.data, alignment_req_A, desc_b.ptr, mode_b.data, alignment_req_B, desc_c.ptr, mode_c.data, alignment_req_C) - + if key in _contraction_descriptors: desc = _contraction_descriptors[key] return desc - + desc = cutensor_backend.ContractionDescriptor() cutensor_backend.initContractionDescriptor( handle, @@ -99,11 +93,11 @@ def contraction(pattern, a, b, alpha, beta, out=None): key = str_a + str_b val = list(a.shape) + list(b.shape) shape = {k:v for k, v in zip(key, val)} - + mode_a = list(str_a) mode_b = list(str_b) mode_c = list(str_c) - + if(out is not None): c = out else: @@ -126,7 +120,7 @@ def contraction(pattern, a, b, alpha, beta, out=None): except Exception: ws_size = cutensor_backend.contractionGetWorkspaceSize(_handle, desc, find, cutensor_backend.WORKSPACE_MIN) ws = cupy.empty(ws_size, dtype=np.int8) - + plan = cutensor_backend.ContractionPlan() cutensor_backend.initContractionPlan(_handle, plan, desc, find, ws_size) alpha = np.asarray(alpha) @@ -137,11 +131,40 @@ def contraction(pattern, a, b, alpha, beta, out=None): ws.data.ptr, ws_size) return out -def 
contract(pattern, a, b, alpha=1.0, beta=0.0, out=None): - ''' - a wrapper for general tensor contraction - pattern has to be a standard einsum notation - ''' - c = contraction(pattern, a, b, alpha, beta, out=out) +import os +if 'CONTRACT_ENGINE' in os.environ: + contract_engine = os.environ['CONTRACT_ENGINE'] +else: + contract_engine = None - return c \ No newline at end of file +if libcutensor is None: + contract_engine = 'cupy' + +# override the 'contract' function if einsum is customized or cutensor is not found +if contract_engine is not None: + einsum = None + if contract_engine == 'opt_einsum': + import opt_einsum + einsum = opt_einsum.contract + elif contract_engine == 'cuquantum': + from cuquantum import contract as einsum + elif contract_engine == 'cupy': + einsum = cupy.einsum + else: + raise RuntimeError('unknown tensor contraction engine.') + + import warnings + warnings.warn(f'using {contract_engine} as the tensor contraction engine.') + def contract(pattern, a, b, alpha=1.0, beta=0.0, out=None): + if out is None: + return cupy.asarray(einsum(pattern, a, b), order='C') + else: + out[:] = alpha*einsum(pattern, a, b) + beta*out + return cupy.asarray(out, order='C') +else: + def contract(pattern, a, b, alpha=1.0, beta=0.0, out=None): + ''' + a wrapper for general tensor contraction + pattern has to be a standard einsum notation + ''' + return contraction(pattern, a, b, alpha, beta, out=out) From 036f9bca6120783a1ae118556ef98340d2a9fe0c Mon Sep 17 00:00:00 2001 From: Qiming Sun Date: Mon, 16 Oct 2023 15:20:03 -0700 Subject: [PATCH 03/19] fix libxc wheel --- .github/workflows/libxc_wheel.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/libxc_wheel.yml b/.github/workflows/libxc_wheel.yml index a6f8f73f..7e54b6c5 100644 --- a/.github/workflows/libxc_wheel.yml +++ b/.github/workflows/libxc_wheel.yml @@ -7,6 +7,7 @@ jobs: release-pypi-linux: runs-on: ubuntu-latest strategy: + fail-fast: false matrix: cuda-version: - cuda118 From 
0abb83485e4c7241b318e9566dbe3d4f842e43ef Mon Sep 17 00:00:00 2001 From: Qiming Sun Date: Mon, 16 Oct 2023 16:01:02 -0700 Subject: [PATCH 04/19] Remove unrelated packages from wheel --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index fd675dfe..f72cad48 100755 --- a/setup.py +++ b/setup.py @@ -113,7 +113,7 @@ def initialize_with_default_plat_name(self): package_dir={'gpu4pyscf': 'gpu4pyscf'}, # packages are under directory pyscf # include *.so *.dat files. They are now placed in MANIFEST.in include_package_data=True, # include everything in source control - packages=[*find_namespace_packages('.'), 'gpu4pyscf', 'gpu4pyscf.lib'], + packages=['gpu4pyscf', 'gpu4pyscf.lib'], tests_require=[ "pytest==7.2.0", "pytest-cov==4.0.0", From 9eab37c9fc7bec9174b97209cdd7b7644aded3c2 Mon Sep 17 00:00:00 2001 From: Qiming Sun Date: Mon, 16 Oct 2023 16:05:53 -0700 Subject: [PATCH 05/19] Fix packages in setup.py --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index f72cad48..fd5efdab 100755 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ import subprocess import re -from setuptools import setup, find_packages, Extension, find_namespace_packages +from setuptools import setup, find_packages, Extension from setuptools.command.build_py import build_py from distutils.util import get_platform @@ -113,7 +113,7 @@ def initialize_with_default_plat_name(self): package_dir={'gpu4pyscf': 'gpu4pyscf'}, # packages are under directory pyscf # include *.so *.dat files. 
They are now placed in MANIFEST.in include_package_data=True, # include everything in source control - packages=['gpu4pyscf', 'gpu4pyscf.lib'], + packages=find_packages(exclude=['*test*', '*examples*', '*docker*']), tests_require=[ "pytest==7.2.0", "pytest-cov==4.0.0", From 741423f75caae0e89f1746df14908d436cf651ad Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Tue, 17 Oct 2023 20:03:30 -0700 Subject: [PATCH 06/19] accelerate solvent models with GPU (#48) * numpy -> cupy for solvent * for linter * remove grad switch from pcm.py * passed flake8 * solvent integrals on GPU * flake8 --- benchmarks/scf/generate_tables.ipynb | 108 ++---- examples/14-pcm_solvent.py | 38 ++ examples/dft_driver.py | 18 +- gpu4pyscf/__init__.py | 2 +- gpu4pyscf/df/int3c2e.py | 4 +- gpu4pyscf/grad/rks.py | 3 +- gpu4pyscf/solvent/__init__.py | 37 ++ gpu4pyscf/solvent/_attach_solvent.py | 130 +++++++ gpu4pyscf/solvent/grad/pcm.py | 218 ++++++----- gpu4pyscf/solvent/pcm.py | 470 +++++------------------ gpu4pyscf/solvent/tests/test_pcm.py | 13 +- gpu4pyscf/solvent/tests/test_pcm_grad.py | 39 +- 12 files changed, 485 insertions(+), 595 deletions(-) create mode 100644 examples/14-pcm_solvent.py create mode 100644 gpu4pyscf/solvent/__init__.py create mode 100644 gpu4pyscf/solvent/_attach_solvent.py diff --git a/benchmarks/scf/generate_tables.ipynb b/benchmarks/scf/generate_tables.ipynb index e49ae7cc..c3eb0dd2 100644 --- a/benchmarks/scf/generate_tables.ipynb +++ b/benchmarks/scf/generate_tables.ipynb @@ -2,12 +2,8 @@ "cells": [ { "cell_type": "code", - "execution_count": 8, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "execution_count": 10, + "metadata": {}, "outputs": [ { "data": { @@ -15,7 +11,7 @@ "''" ] }, - "execution_count": 7, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -34,12 +30,8 @@ }, { "cell_type": "code", - "execution_count": 22, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "execution_count": 12, + "metadata": 
{}, "outputs": [ { "data": { @@ -47,7 +39,7 @@ "''" ] }, - "execution_count": 21, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" }, @@ -57,15 +49,16 @@ "text": [ "| mol | natm | LDA | PBE | B3LYP | M06 | wB97m-v |\n", "|------:|-------:|-------:|-------:|--------:|-------:|----------:|\n", - "| 2 | 3 | 0.22 | 0.32 | 0.13 | 0.24 | 0.69 |\n", - "| 3 | 15 | 0.81 | 1.35 | 1.45 | 1.8 | 4.85 |\n", - "| 4 | 30 | 1.83 | 2.76 | 4.1 | 6.66 | 7.61 |\n", - "| 5 | 60 | 2.88 | 3.71 | 7.24 | 8.36 | 9.44 |\n", - "| 6 | 96 | 4.27 | 4.48 | 7.73 | 10 | 9.79 |\n", - "| 7 | 141 | 3.94 | 4.07 | 8.57 | 10.39 | 9.39 |\n", - "| 8 | 228 | nan | nan | nan | nan | nan |\n", - "| 9 | 300 | nan | nan | nan | nan | nan |\n", - "| 10 | 417 | nan | nan | nan | nan | nan |\n" + "| 2 | 3 | 0.22 | 0.32 | 0.27 | 0.25 | 0.69 |\n", + "| 3 | 15 | 0.68 | 0.25 | 1.58 | 2.61 | 4.84 |\n", + "| 4 | 30 | 1.59 | 2.63 | 4.09 | 6.93 | 8.17 |\n", + "| 5 | 60 | 2.86 | 3.64 | 7.15 | 8.44 | 9.44 |\n", + "| 6 | 96 | 4.34 | 4.39 | 7.75 | 10.58 | 9.87 |\n", + "| 7 | 141 | 4.07 | 4.1 | 8.87 | 10.47 | 10.13 |\n", + "| 8 | 228 | 4.34 | 4.58 | 9.39 | 10.48 | 9.36 |\n", + "| 9 | 300 | 5.05 | 5.21 | 9.35 | 11.36 | nan |\n", + "| 10 | 417 | 4.91 | nan | nan | nan | nan |\n", + "| 10 | nan | nan | nan | nan | nan | nan |\n" ] } ], @@ -104,12 +97,8 @@ }, { "cell_type": "code", - "execution_count": 24, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "execution_count": 14, + "metadata": {}, "outputs": [ { "data": { @@ -117,7 +106,7 @@ "''" ] }, - "execution_count": 23, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" }, @@ -127,15 +116,16 @@ "text": [ "| mol | natm | LDA | PBE | B3LYP | M06 | wB97m-v |\n", "|------:|-------:|-------:|-------:|--------:|-------:|----------:|\n", - "| 2 | 3 | 0.85 | 0.84 | 0.77 | 0.74 | 0.57 |\n", - "| 3 | 15 | 0.56 | 0.89 | 1.44 | 1.57 | 1.52 |\n", - "| 4 | 30 | 0.59 | 1.03 | 2.13 | 2.08 | 1.9 |\n", - "| 5 | 60 | 0.53 | 0.87 | 2.45 
| 2.35 | 1.73 |\n", - "| 6 | 96 | 0.61 | 0.87 | 2.43 | 2.37 | 1.59 |\n", - "| 7 | 141 | 0.92 | 1.08 | 2.61 | 2.62 | 1.53 |\n", - "| 8 | 228 | nan | nan | nan | nan | nan |\n", - "| 9 | 300 | nan | nan | nan | nan | nan |\n", - "| 10 | 417 | nan | nan | nan | nan | nan |\n" + "| 2 | 3 | 0.82 | 0.89 | 0.75 | 0.82 | 0.6 |\n", + "| 3 | 15 | 0.39 | 0.19 | 1.46 | 1.52 | 1.47 |\n", + "| 4 | 30 | 0.56 | 1.04 | 2.07 | 2.25 | 1.89 |\n", + "| 5 | 60 | 0.54 | 0.87 | 2.42 | 2.4 | 1.77 |\n", + "| 6 | 96 | 0.6 | 0.87 | 2.36 | 2.51 | 1.53 |\n", + "| 7 | 141 | 0.93 | 1.1 | 2.61 | 2.59 | 1.55 |\n", + "| 8 | 228 | 1.92 | 1.9 | 3.37 | 3.39 | 1.83 |\n", + "| 9 | 300 | 2.26 | 2.02 | 3.06 | 3.59 | nan |\n", + "| 10 | 417 | 2.46 | nan | nan | nan | nan |\n", + "| 10 | nan | nan | nan | nan | nan | nan |\n" ] } ], @@ -146,66 +136,42 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [], "source": [] } @@ -223,10 +189,10 @@ }, "file_extension": ".py", "mimetype": "text/x-python", - "name": "Python3 with MLSQL", + "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "0.1" 
+ "version": "3.9.2" }, "orig_nbformat": 4 }, diff --git a/examples/14-pcm_solvent.py b/examples/14-pcm_solvent.py new file mode 100644 index 00000000..3def0ee4 --- /dev/null +++ b/examples/14-pcm_solvent.py @@ -0,0 +1,38 @@ +# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import pyscf +from pyscf import lib +from gpu4pyscf.dft import rks +lib.num_threads(8) + +atom =''' +O 0.0000000000 -0.0000000000 0.1174000000 +H -0.7570000000 -0.0000000000 -0.4696000000 +H 0.7570000000 0.0000000000 -0.4696000000 +''' +mol = pyscf.M(atom=atom, basis='def2-tzvpp', verbose=4) + +mf = rks.RKS(mol, xc='HYB_GGA_XC_B3LYP').density_fit() +mf = mf.PCM() +mf.grids.atom_grid = (99,590) +mf.with_solvent.lebedev_order = 29 # 302 Lebedev grids +mf.with_solvent.method = 'IEF-PCM' +mf.with_solvent.eps = 78.3553 +mf.kernel() + +g = mf.nuc_grad_method() +g.auxbasis_response = True +f = g.kernel() diff --git a/examples/dft_driver.py b/examples/dft_driver.py index 0a7073cf..65ca2ad5 100644 --- a/examples/dft_driver.py +++ b/examples/dft_driver.py @@ -23,26 +23,28 @@ import argparse parser = argparse.ArgumentParser(description='Run DFT with GPU4PySCF for molecules') -parser.add_argument("--input", type=str, default='benzene/coord') -parser.add_argument("--basis", type=str, default='def2-tzvpp') -parser.add_argument("--auxbasis", type=str, default='def2-tzvpp-jkfit') 
+parser.add_argument("--input", type=str, default='benzene/coord') +parser.add_argument("--basis", type=str, default='def2-tzvpp') +parser.add_argument("--auxbasis", type=str, default='def2-tzvpp-jkfit') +parser.add_argument("--solvent", type=bool, default=False) args = parser.parse_args() start_time = time.time() bas = args.basis mol = pyscf.M( - atom=args.input, - basis=bas, + atom=args.input, + basis=bas, max_memory=32000) # set verbose >= 6 for debugging timer -mol.verbose = 6 -print(mol.nao) +mol.verbose = 4 mf_df = rks.RKS(mol, xc='HYB_GGA_XC_B3LYP').density_fit(auxbasis=args.auxbasis) +if args.solvent: + mf_df = mf_df.PCM() mf_df.grids.atom_grid = (99,590) mf_df.kernel() + print('compute time for energy: {}s'.format((time.time() - start_time))) -exit() start_time = time.time() g = mf_df.nuc_grad_method() g.auxbasis_response = True diff --git a/gpu4pyscf/__init__.py b/gpu4pyscf/__init__.py index 4d7323eb..a52a096a 100644 --- a/gpu4pyscf/__init__.py +++ b/gpu4pyscf/__init__.py @@ -1,2 +1,2 @@ from . 
import lib, grad, hessian, solvent, scf, dft -__version__ = '0.6.1' +__version__ = '0.6.1' \ No newline at end of file diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py index 9f01ac3b..7995c6d8 100644 --- a/gpu4pyscf/df/int3c2e.py +++ b/gpu4pyscf/df/int3c2e.py @@ -1097,7 +1097,7 @@ def get_dh1e(mol, dm0): intopt = VHFOpt(mol, fakemol, 'int2e') intopt.build(1e-14, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE) dm0_sorted = dm0[cupy.ix_(intopt.sph_ao_idx, intopt.sph_ao_idx)] - + dh1e = cupy.zeros([natm,3]) for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ip1'): dh1e[k0:k1,:3] += cupy.einsum('xkji,ij->kx', int3c_blk, dm0_sorted[i0:i1,j0:j1]) @@ -1120,7 +1120,7 @@ def get_int3c2e_slice(intopt, cp_ij_id, cp_aux_id, aosym=None, out=None, omega=N log_q_ij = intopt.log_qs[cp_ij_id] log_q_kl = intopt.aux_log_qs[cp_aux_id] - + nbins = 1 bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32) bins_locs_kl = np.array([0, len(log_q_kl)], dtype=np.int32) diff --git a/gpu4pyscf/grad/rks.py b/gpu4pyscf/grad/rks.py index 8695927c..276e15b0 100644 --- a/gpu4pyscf/grad/rks.py +++ b/gpu4pyscf/grad/rks.py @@ -501,8 +501,7 @@ def get_du(ia, ib): # JCP 98, 5612 (1993); (B10) yield coords, w0, w1 class Gradients(rhf_grad.Gradients, pyscf.grad.rks.Gradients): - device = 'gpu' - get_veff = patch_cpu_kernel(pyscf.grad.rks.Gradients.get_veff)(_get_veff) + from gpu4pyscf.lib.utils import to_cpu, to_gpu, device def get_dispersion(self): if self.base.disp[:2].upper() == 'D3': diff --git a/gpu4pyscf/solvent/__init__.py b/gpu4pyscf/solvent/__init__.py new file mode 100644 index 00000000..157e7129 --- /dev/null +++ b/gpu4pyscf/solvent/__init__.py @@ -0,0 +1,37 @@ +# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved. 
+# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +from gpu4pyscf.solvent import pcm + +def PCM(method_or_mol, solvent_obj=None, dm=None): + '''Initialize PCM model. + + Examples: + + >>> mf = PCM(scf.RHF(mol)) + >>> mf.kernel() + >>> sol = PCM(mol) + >>> mc = PCM(CASCI(mf, 6, 6), sol) + >>> mc.kernel() + ''' + from pyscf import gto + from pyscf import scf + + if isinstance(method_or_mol, gto.mole.Mole): + return pcm.PCM(method_or_mol) + elif isinstance(method_or_mol, scf.hf.SCF): + return pcm.pcm_for_scf(method_or_mol, solvent_obj, dm) + else: + raise NotImplementedError('PCM model only support SCF') diff --git a/gpu4pyscf/solvent/_attach_solvent.py b/gpu4pyscf/solvent/_attach_solvent.py new file mode 100644 index 00000000..da556302 --- /dev/null +++ b/gpu4pyscf/solvent/_attach_solvent.py @@ -0,0 +1,130 @@ +# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +from pyscf import lib +from pyscf.lib import logger +from pyscf.solvent._attach_solvent import _Solvation +from gpu4pyscf.lib.cupy_helper import tag_array +from gpu4pyscf import scf + +# NOTE: copied from pyscf, different from the latest version + +def _for_scf(mf, solvent_obj, dm=None): + '''Add solvent model to SCF (HF and DFT) method. + + Kwargs: + dm : if given, solvent does not respond to the change of density + matrix. A frozen ddCOSMO potential is added to the results. + ''' + if isinstance(mf, _Solvation): + mf.with_solvent = solvent_obj + return mf + + oldMF = mf.__class__ + + if dm is not None: + solvent_obj.e, solvent_obj.v = solvent_obj.kernel(dm) + solvent_obj.frozen = True + + class SCFWithSolvent(_Solvation, oldMF): + def __init__(self, mf, solvent): + self.__dict__.update(mf.__dict__) + self.with_solvent = solvent + self._keys.update(['with_solvent']) + + def dump_flags(self, verbose=None): + oldMF.dump_flags(self, verbose) + self.with_solvent.check_sanity() + self.with_solvent.dump_flags(verbose) + return self + + def reset(self, mol=None): + self.with_solvent.reset(mol) + return oldMF.reset(self, mol) + + # Note v_solvent should not be added to get_hcore for scf methods. + # get_hcore is overloaded by many post-HF methods. Modifying + # SCF.get_hcore may lead error. + + def get_veff(self, mol=None, dm=None, *args, **kwargs): + vhf = oldMF.get_veff(self, mol, dm, *args, **kwargs) + with_solvent = self.with_solvent + if not with_solvent.frozen: + with_solvent.e, with_solvent.v = with_solvent.kernel(dm) + e_solvent, v_solvent = with_solvent.e, with_solvent.v + + # NOTE: v_solvent should not be added to vhf in this place. This is + # because vhf is used as the reference for direct_scf in the next + # iteration. If v_solvent is added here, it may break direct SCF. 
+ return tag_array(vhf, e_solvent=e_solvent, v_solvent=v_solvent) + + def get_fock(self, h1e=None, s1e=None, vhf=None, dm=None, cycle=-1, + diis=None, diis_start_cycle=None, + level_shift_factor=None, damp_factor=None): + # DIIS was called inside oldMF.get_fock. v_solvent, as a function of + # dm, should be extrapolated as well. To enable it, v_solvent has to be + # added to the fock matrix before DIIS was called. + if getattr(vhf, 'v_solvent', None) is None: + vhf = self.get_veff(self.mol, dm) + return oldMF.get_fock(self, h1e, s1e, vhf+vhf.v_solvent, dm, cycle, diis, + diis_start_cycle, level_shift_factor, damp_factor) + + def energy_elec(self, dm=None, h1e=None, vhf=None): + if dm is None: + dm = self.make_rdm1() + if getattr(vhf, 'e_solvent', None) is None: + vhf = self.get_veff(self.mol, dm) + e_tot, e_coul = oldMF.energy_elec(self, dm, h1e, vhf) + e_tot += vhf.e_solvent + self.scf_summary['e_solvent'] = vhf.e_solvent.real + logger.debug(self, 'Solvent Energy = %.15g', vhf.e_solvent) + return e_tot, e_coul + + def nuc_grad_method(self): + grad_method = oldMF.nuc_grad_method(self) + return self.with_solvent.nuc_grad_method(grad_method) + + Gradients = nuc_grad_method + + def gen_response(self, *args, **kwargs): + vind = oldMF.gen_response(self, *args, **kwargs) + is_uhf = isinstance(self, scf.uhf.UHF) + # singlet=None is orbital hessian or CPHF type response function + singlet = kwargs.get('singlet', True) + singlet = singlet or singlet is None + def vind_with_solvent(dm1): + v = vind(dm1) + if self.with_solvent.equilibrium_solvation: + if is_uhf: + v_solvent = self.with_solvent._B_dot_x(dm1) + v += v_solvent[0] + v_solvent[1] + elif singlet: + v += self.with_solvent._B_dot_x(dm1) + return v + return vind_with_solvent + + def stability(self, *args, **kwargs): + # When computing orbital hessian, the second order derivatives of + # solvent energy needs to be computed. It is enabled by + # the attribute equilibrium_solvation in gen_response method. 
+ # If solvent was frozen, its contribution is treated as the + # external potential. The response of solvent does not need to + # be considered in stability analysis. + with lib.temporary_env(self.with_solvent, + equilibrium_solvation=not self.with_solvent.frozen): + return oldMF.stability(self, *args, **kwargs) + + mf1 = SCFWithSolvent(mf, solvent_obj) + return mf1 \ No newline at end of file diff --git a/gpu4pyscf/solvent/grad/pcm.py b/gpu4pyscf/solvent/grad/pcm.py index a1f2b689..7c9d0047 100644 --- a/gpu4pyscf/solvent/grad/pcm.py +++ b/gpu4pyscf/solvent/grad/pcm.py @@ -19,24 +19,20 @@ # pylint: disable=C0103 import numpy -import scipy -import ctypes +import cupy +from cupyx import scipy from pyscf import lib from pyscf.lib import logger from pyscf import gto, df -from pyscf.dft import gen_grid -from pyscf.data import radii -from pyscf.solvent import ddcosmo -from pyscf.solvent import _attach_solvent - -from gpu4pyscf.solvent import pcm +from pyscf.grad import rhf as rhf_grad from gpu4pyscf.solvent.pcm import PI, switch_h +from gpu4pyscf.df import int3c2e libdft = lib.load_library('libdft') def grad_switch_h(x): ''' first derivative of h(x)''' - dy = 30.0*x**2 - 60.0*x**3 + 30.0*x**4 + dy = 30.0*x**2 - 60.0*x**3 + 30.0*x**4 dy[x<0] = 0.0 dy[x>1] = 0.0 return dy @@ -62,15 +58,15 @@ def get_dF_dA(surface): ngrids = grid_coords.shape[0] natom = atom_coords.shape[0] - dF = numpy.zeros([ngrids, natom, 3]) - dA = numpy.zeros([ngrids, natom, 3]) - + dF = cupy.zeros([ngrids, natom, 3]) + dA = cupy.zeros([ngrids, natom, 3]) + for ia in range(atom_coords.shape[0]): p0,p1 = surface['gslice_by_atom'][ia] coords = grid_coords[p0:p1] p1 = p0 + coords.shape[0] - ri_rJ = numpy.expand_dims(coords, axis=1) - atom_coords - riJ = numpy.linalg.norm(ri_rJ, axis=-1) + ri_rJ = cupy.expand_dims(coords, axis=1) - atom_coords + riJ = cupy.linalg.norm(ri_rJ, axis=-1) diJ = (riJ - R_in_J) / R_sw_J diJ[:,ia] = 1.0 diJ[diJ < 1e-8] = 0.0 @@ -79,25 +75,25 @@ def get_dF_dA(surface): fiJ = 
switch_h(diJ) dfiJ = grad_switch_h(diJ) / (fiJ * riJ * R_sw_J) - dfiJ = numpy.expand_dims(dfiJ, axis=-1) * ri_rJ + dfiJ = cupy.expand_dims(dfiJ, axis=-1) * ri_rJ Fi = switch_fun[p0:p1] Ai = area[p0:p1] - + # grids response - Fi = numpy.expand_dims(Fi, axis=-1) - Ai = numpy.expand_dims(Ai, axis=-1) - dFi_grid = numpy.sum(dfiJ, axis=1) - + Fi = cupy.expand_dims(Fi, axis=-1) + Ai = cupy.expand_dims(Ai, axis=-1) + dFi_grid = cupy.sum(dfiJ, axis=1) + dF[p0:p1,ia,:] += Fi * dFi_grid dA[p0:p1,ia,:] += Ai * dFi_grid # atom response - Fi = numpy.expand_dims(Fi, axis=-2) - Ai = numpy.expand_dims(Ai, axis=-2) + Fi = cupy.expand_dims(Fi, axis=-2) + Ai = cupy.expand_dims(Ai, axis=-2) dF[p0:p1,:,:] -= Fi * dfiJ dA[p0:p1,:,:] -= Ai * dfiJ - + return dF, dA def get_dD_dS(surface, dF, with_S=True, with_D=False): @@ -110,35 +106,35 @@ def get_dD_dS(surface, dF, with_S=True, with_D=False): norm_vec = surface['norm_vec'] switch_fun = surface['switch_fun'] - xi_i, xi_j = numpy.meshgrid(exponents, exponents, indexing='ij') + xi_i, xi_j = cupy.meshgrid(exponents, exponents, indexing='ij') xi_ij = xi_i * xi_j / (xi_i**2 + xi_j**2)**0.5 - ri_rj = numpy.expand_dims(grid_coords, axis=1) - grid_coords - rij = numpy.linalg.norm(ri_rj, axis=-1) + ri_rj = cupy.expand_dims(grid_coords, axis=1) - grid_coords + rij = cupy.linalg.norm(ri_rj, axis=-1) xi_r_ij = xi_ij * rij - numpy.fill_diagonal(rij, 1) - - dS_dr = -(scipy.special.erf(xi_r_ij) - 2.0*xi_r_ij/PI**0.5*numpy.exp(-xi_r_ij**2))/rij**2 - numpy.fill_diagonal(dS_dr, 0) - - dS_dr= numpy.expand_dims(dS_dr, axis=-1) - drij = ri_rj/numpy.expand_dims(rij, axis=-1) + cupy.fill_diagonal(rij, 1) + + dS_dr = -(scipy.special.erf(xi_r_ij) - 2.0*xi_r_ij/PI**0.5*cupy.exp(-xi_r_ij**2))/rij**2 + cupy.fill_diagonal(dS_dr, 0) + + dS_dr= cupy.expand_dims(dS_dr, axis=-1) + drij = ri_rj/cupy.expand_dims(rij, axis=-1) dS = dS_dr * drij dD = None if with_D: - nj_rij = numpy.sum(ri_rj * norm_vec, axis=-1) - dD_dri = 4.0*xi_r_ij**2 * xi_ij / PI**0.5 * 
numpy.exp(-xi_r_ij**2) * nj_rij / rij**3 - numpy.fill_diagonal(dD_dri, 0.0) - - rij = numpy.expand_dims(rij, axis=-1) - nj_rij = numpy.expand_dims(nj_rij, axis=-1) - nj = numpy.expand_dims(norm_vec, axis=0) - dD_dri = numpy.expand_dims(dD_dri, axis=-1) - + nj_rij = cupy.sum(ri_rj * norm_vec, axis=-1) + dD_dri = 4.0*xi_r_ij**2 * xi_ij / PI**0.5 * cupy.exp(-xi_r_ij**2) * nj_rij / rij**3 + cupy.fill_diagonal(dD_dri, 0.0) + + rij = cupy.expand_dims(rij, axis=-1) + nj_rij = cupy.expand_dims(nj_rij, axis=-1) + nj = cupy.expand_dims(norm_vec, axis=0) + dD_dri = cupy.expand_dims(dD_dri, axis=-1) + dD = dD_dri * drij + dS_dr * (-nj/rij + 3.0*nj_rij/rij**2 * drij) dSii_dF = -exponents * (2.0/PI)**0.5 / switch_fun**2 - dSii = numpy.expand_dims(dSii_dF, axis=(1,2)) * dF + dSii = cupy.expand_dims(dSii_dF, axis=(1,2)) * dF return dD, dS, dSii @@ -148,8 +144,7 @@ def grad_kernel(pcmobj, dm): v^T* d(K^-1 R)v = v^T*K^-1(dR - dK K^-1R)v = v^T K^-1(dR - dK q) ''' mol = pcmobj.mol - nao = mol.nao - aoslice = mol.aoslice_by_atom() + gridslice = pcmobj.surface['gslice_by_atom'] grid_coords = pcmobj.surface['grid_coords'] exponents = pcmobj.surface['charge_exp'] @@ -161,53 +156,53 @@ def grad_kernel(pcmobj, dm): q = pcmobj._intermediates['q'] q_sym = pcmobj._intermediates['q_sym'] - vK_1 = numpy.linalg.solve(K.T, v_grids) + vK_1 = cupy.linalg.solve(K.T, v_grids) # ----------------- potential response ----------------------- - max_memory = pcmobj.max_memory - lib.current_memory()[0] - blksize = int(max(max_memory*.9e6/8/nao**2, 400)) - ngrids = grid_coords.shape[0] atom_coords = mol.atom_coords(unit='B') - dvj = numpy.zeros([nao,3]) - dq = numpy.zeros([ngrids,3]) - for p0, p1 in lib.prange(0, ngrids, blksize): - fakemol = gto.fakemol_for_charges(grid_coords[p0:p1], expnt=exponents**2) - # charge response - v_nj_ip1 = df.incore.aux_e2(mol, fakemol, intor='int3c2e_ip1', aosym='s1', comp=3) - vj = numpy.einsum('xijn,n->xij', v_nj_ip1, q_sym) - dvj += numpy.einsum('xij,ij->ix', vj, dm) - dvj 
+= numpy.einsum('xij,ji->ix', vj, dm) - - # electronic potential response - v_nj_ip2 = df.incore.aux_e2(mol, fakemol, intor='int3c2e_ip2', aosym='s1', comp=3) - dq_slice = numpy.einsum('xijn,ij->nx', v_nj_ip2, dm) - dq[p0:p1] = numpy.einsum('nx,n->nx', dq_slice, q_sym[p0:p1]) - - de = numpy.zeros_like(atom_coords) - de += numpy.asarray([numpy.sum(dq[p0:p1], axis=0) for p0,p1 in gridslice]) - de += numpy.asarray([numpy.sum(dvj[p0:p1], axis=0) for p0,p1 in aoslice[:,2:]]) - + intopt = pcmobj.intopt + intopt.clear() + # rebuild with aosym + intopt.build(1e-14, diag_block_with_triu=True, aosym=False) + coeff = intopt.coeff + dm_cart = cupy.einsum('pi,ij,qj->pq', coeff, dm, coeff) + + dvj, _ = int3c2e.get_int3c2e_ip_jk(intopt, 0, 'ip1', q_sym, None, dm_cart) + dq, _ = int3c2e.get_int3c2e_ip_jk(intopt, 0, 'ip2', q_sym, None, dm_cart) + + cart_ao_idx = intopt.cart_ao_idx + rev_cart_ao_idx = numpy.argsort(cart_ao_idx) + dvj = dvj[:,rev_cart_ao_idx] + + aoslice = intopt.mol.aoslice_by_atom() + dq = cupy.asarray([cupy.sum(dq[:,p0:p1], axis=1) for p0,p1 in gridslice]) + dvj= 2.0 * cupy.asarray([cupy.sum(dvj[:,p0:p1], axis=1) for p0,p1 in aoslice[:,2:]]) + de = dq + dvj + atom_charges = mol.atom_charges() fakemol_nuc = gto.fakemol_for_charges(atom_coords) - + fakemol = gto.fakemol_for_charges(grid_coords.get(), expnt=exponents.get()**2) + # nuclei response int2c2e_ip1 = mol._add_suffix('int2c2e_ip1') v_ng_ip1 = gto.mole.intor_cross(int2c2e_ip1, fakemol_nuc, fakemol) - dv_g = numpy.einsum('g,xng->nx', q_sym, v_ng_ip1) - de -= numpy.einsum('nx,n->nx', dv_g, atom_charges) + v_ng_ip1 = cupy.asarray(v_ng_ip1) + dv_g = cupy.einsum('g,xng->nx', q_sym, v_ng_ip1) + de -= cupy.einsum('nx,n->nx', dv_g, atom_charges) # nuclei potential response int2c2e_ip2 = mol._add_suffix('int2c2e_ip2') v_ng_ip2 = gto.mole.intor_cross(int2c2e_ip2, fakemol_nuc, fakemol) - dv_g = numpy.einsum('n,xng->gx', atom_charges, v_ng_ip2) - dv_g = numpy.einsum('gx,g->gx', dv_g, q_sym) - de -= 
numpy.asarray([numpy.sum(dv_g[p0:p1], axis=0) for p0,p1 in gridslice]) - + v_ng_ip2 = cupy.asarray(v_ng_ip2) + dv_g = cupy.einsum('n,xng->gx', atom_charges, v_ng_ip2) + dv_g = cupy.einsum('gx,g->gx', dv_g, q_sym) + de -= cupy.asarray([cupy.sum(dv_g[p0:p1], axis=0) for p0,p1 in gridslice]) + ## --------------- response from stiffness matrices ---------------- gridslice = pcmobj.surface['gslice_by_atom'] dF, dA = get_dF_dA(pcmobj.surface) - + with_D = pcmobj.method.upper() == 'IEF-PCM' or pcmobj.method.upper() == 'SS(V)PE' dD, dS, dSii = get_dD_dS(pcmobj.surface, dF, with_D=with_D, with_S=True) @@ -215,57 +210,57 @@ def grad_kernel(pcmobj, dm): DA = D*A epsilon = pcmobj.eps - + #de_dF = v0 * -dSii_dF * q #de += 0.5*numpy.einsum('i,inx->nx', de_dF, dF) # dQ = v^T K^-1 (dR - dK K^-1 R) v if pcmobj.method.upper() == 'C-PCM' or pcmobj.method.upper() == 'COSMO': # dR = 0, dK = dS - de_dS = numpy.einsum('i,ijx,j->ix', vK_1, dS, q) - de -= numpy.asarray([numpy.sum(de_dS[p0:p1], axis=0) for p0,p1, in gridslice]) - de -= 0.5*numpy.einsum('i,ijx,i->jx', vK_1, dSii, q) - + de_dS = cupy.einsum('i,ijx,j->ix', vK_1, dS, q) + de -= cupy.asarray([cupy.sum(de_dS[p0:p1], axis=0) for p0,p1, in gridslice]) + de -= 0.5*cupy.einsum('i,ijx,i->jx', vK_1, dSii, q) + elif pcmobj.method.upper() == 'IEF-PCM' or pcmobj.method.upper() == 'SS(V)PE': # IEF-PCM and SS(V)PE formally are the same in gradient calculation - # dR = f_eps/(2*pi) * (dD*A + D*dA), + # dR = f_eps/(2*pi) * (dD*A + D*dA), # dK = dS - f_eps/(2*pi) * (dD*A*S + D*dA*S + D*A*dS) f_epsilon = (epsilon - 1.0)/(epsilon + 1.0) fac = f_epsilon/(2.0*PI) Av = A*v_grids - de_dR = 0.5*fac * numpy.einsum('i,ijx,j->ix', vK_1, dD, Av) - de_dR -= 0.5*fac * numpy.einsum('i,ijx,j->jx', vK_1, dD, Av) - de_dR = numpy.asarray([numpy.sum(de_dR[p0:p1], axis=0) for p0,p1 in gridslice]) - de_dR += 0.5*fac * numpy.einsum('i,ij,jnx,j->nx', vK_1, D, dA, v_grids) - - de_dS0 = 0.5*numpy.einsum('i,ijx,j->ix', vK_1, dS, q) - de_dS0 -= 
0.5*numpy.einsum('i,ijx,j->jx', vK_1, dS, q) - de_dS0 = numpy.asarray([numpy.sum(de_dS0[p0:p1], axis=0) for p0,p1 in gridslice]) - de_dS0 += 0.5*numpy.einsum('i,inx,i->nx', vK_1, dSii, q) - - vK_1_DA = numpy.dot(vK_1, DA) - de_dS1 = 0.5*numpy.einsum('j,jkx,k->jx', vK_1_DA, dS, q) - de_dS1 -= 0.5*numpy.einsum('j,jkx,k->kx', vK_1_DA, dS, q) - de_dS1 = numpy.asarray([numpy.sum(de_dS1[p0:p1], axis=0) for p0,p1 in gridslice]) - de_dS1 += 0.5*numpy.einsum('j,jnx,j->nx', vK_1_DA, dSii, q) - - Sq = numpy.dot(S,q) + de_dR = 0.5*fac * cupy.einsum('i,ijx,j->ix', vK_1, dD, Av) + de_dR -= 0.5*fac * cupy.einsum('i,ijx,j->jx', vK_1, dD, Av) + de_dR = cupy.asarray([cupy.sum(de_dR[p0:p1], axis=0) for p0,p1 in gridslice]) + de_dR += 0.5*fac * cupy.einsum('i,ij,jnx,j->nx', vK_1, D, dA, v_grids) + + de_dS0 = 0.5*cupy.einsum('i,ijx,j->ix', vK_1, dS, q) + de_dS0 -= 0.5*cupy.einsum('i,ijx,j->jx', vK_1, dS, q) + de_dS0 = cupy.asarray([cupy.sum(de_dS0[p0:p1], axis=0) for p0,p1 in gridslice]) + de_dS0 += 0.5*cupy.einsum('i,inx,i->nx', vK_1, dSii, q) + + vK_1_DA = cupy.dot(vK_1, DA) + de_dS1 = 0.5*cupy.einsum('j,jkx,k->jx', vK_1_DA, dS, q) + de_dS1 -= 0.5*cupy.einsum('j,jkx,k->kx', vK_1_DA, dS, q) + de_dS1 = cupy.asarray([cupy.sum(de_dS1[p0:p1], axis=0) for p0,p1 in gridslice]) + de_dS1 += 0.5*cupy.einsum('j,jnx,j->nx', vK_1_DA, dSii, q) + + Sq = cupy.dot(S,q) ASq = A*Sq - de_dD = 0.5*numpy.einsum('i,ijx,j->ix', vK_1, dD, ASq) - de_dD -= 0.5*numpy.einsum('i,ijx,j->jx', vK_1, dD, ASq) - de_dD = numpy.asarray([numpy.sum(de_dD[p0:p1], axis=0) for p0,p1 in gridslice]) + de_dD = 0.5*cupy.einsum('i,ijx,j->ix', vK_1, dD, ASq) + de_dD -= 0.5*cupy.einsum('i,ijx,j->jx', vK_1, dD, ASq) + de_dD = cupy.asarray([cupy.sum(de_dD[p0:p1], axis=0) for p0,p1 in gridslice]) - vK_1_D = numpy.dot(vK_1, D) - de_dA = 0.5*numpy.einsum('j,jnx,j->nx', vK_1_D, dA, Sq) + vK_1_D = cupy.dot(vK_1, D) + de_dA = 0.5*cupy.einsum('j,jnx,j->nx', vK_1_D, dA, Sq) de_dK = de_dS0 - fac * (de_dD + de_dA + de_dS1) de += de_dR - de_dK 
else: raise RuntimeError(f"Unknown implicit solvent model: {pcmobj.method}") - - return de - -def make_grad_object(mf, grad_method): + + return de.get() + +def make_grad_object(grad_method): ''' return solvent gradient object ''' @@ -281,15 +276,16 @@ def kernel(self, *args, dm=None, atmlst=None, **kwargs): dm = kwargs.pop('dm', None) if dm is None: dm = self.base.make_rdm1(ao_repr=True) - + self.de_solvent = grad_kernel(self.base.with_solvent, dm) self.de_solute = grad_method_class.kernel(self, *args, **kwargs) self.de = self.de_solute + self.de_solvent - + if self.verbose >= logger.NOTE: logger.note(self, '--------------- %s (+%s) gradients ---------------', self.base.__class__.__name__, self.base.with_solvent.__class__.__name__) + rhf_grad._write(self, self.mol, self.de, self.atmlst) logger.note(self, '----------------------------------------------') return self.de @@ -300,4 +296,4 @@ def _finalize(self): return WithSolventGrad(grad_method) -pcm.PCM.nuc_grad_method = make_grad_object \ No newline at end of file +#pcm.PCM.nuc_grad_method = make_grad_object \ No newline at end of file diff --git a/gpu4pyscf/solvent/pcm.py b/gpu4pyscf/solvent/pcm.py index aaed5922..b7a1d181 100644 --- a/gpu4pyscf/solvent/pcm.py +++ b/gpu4pyscf/solvent/pcm.py @@ -17,17 +17,18 @@ PCM family solvent model ''' # pylint: disable=C0103 - -import numpy -import scipy import ctypes +import numpy +import cupy +import cupyx.scipy as scipy from pyscf import lib from pyscf.lib import logger from pyscf import gto, df from pyscf.dft import gen_grid from pyscf.data import radii from pyscf.solvent import ddcosmo -from pyscf.solvent import _attach_solvent +from gpu4pyscf.solvent import _attach_solvent +from gpu4pyscf.df import int3c2e libdft = lib.load_library('libdft') @@ -37,13 +38,9 @@ def pcm_for_scf(mf, solvent_obj=None, dm=None): solvent_obj = PCM(mf.mol) return _attach_solvent._for_scf(mf, solvent_obj, dm) - -# Inject ddPCM to other methods -from pyscf import scf -from pyscf import mcscf 
-from pyscf import mp, ci, cc -from pyscf import tdscf -scf.hf.SCF.PCM = scf.hf.SCF.PCM = pcm_for_scf +# Inject PCM to SCF, TODO: add it to other methods later +from gpu4pyscf import scf +scf.hf.RHF.PCM = scf.hf.RHF.PCM = pcm_for_scf # TABLE II, J. Chem. Phys. 122, 194110 (2005) XI = { @@ -85,7 +82,7 @@ def pcm_for_scf(mf, solvent_obj=None, dm=None): def switch_h(x): ''' - switching function (eq. 3.19) + switching function (eq. 3.19) J. Chem. Phys. 133, 244111 (2010) notice the typo in the paper ''' @@ -94,33 +91,20 @@ def switch_h(x): y[x>1] = 1.0 return y -def grad_switch_h(x): - ''' first derivative of h(x)''' - dy = 30.0*x**2 - 60.0*x**3 + 30.0*x**4 - dy[x<0] = 0.0 - dy[x>1] = 0.0 - return dy - -def gradgrad_switch_h(x): - ''' 2nd derivative of h(x) ''' - ddy = 60.0*x - 180.0*x**2 + 120*x**3 - ddy[x<0] = 0.0 - ddy[x>1] = 0.0 - return ddy - def gen_surface(mol, ng=302, vdw_scale=1.2): '''J. Phys. Chem. A 1999, 103, 11060-11079''' unit_sphere = numpy.empty((ng,4)) libdft.MakeAngularGrid(unit_sphere.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(ng)) + unit_sphere = cupy.asarray(unit_sphere) - atom_coords = mol.atom_coords(unit='B') + atom_coords = cupy.asarray(mol.atom_coords(unit='B')) charges = mol.atom_charges() - N_J = ng * numpy.ones(mol.natm) - R_J = numpy.asarray([vdw_scale*Bondi[chg] for chg in charges]) + N_J = ng * cupy.ones(mol.natm) + R_J = cupy.asarray([vdw_scale*Bondi[chg] for chg in charges]) R_sw_J = R_J * (14.0 / N_J)**0.5 alpha_J = 1.0/2.0 + R_J/R_sw_J - ((R_J/R_sw_J)**2 - 1.0/28)**0.5 R_in_J = R_J - alpha_J * R_sw_J - + grid_coords = [] weights = [] charge_exp = [] @@ -134,16 +118,17 @@ def gen_surface(mol, ng=302, vdw_scale=1.2): symb = mol.atom_symbol(ia) chg = gto.charge(symb) r_vdw = vdw_scale*Bondi[chg] - + atom_grid = r_vdw * unit_sphere[:,:3] + atom_coords[ia,:] - riJ = scipy.spatial.distance.cdist(atom_grid[:,:3], atom_coords) + #riJ = scipy.spatial.distance.cdist(atom_grid[:,:3], atom_coords) + riJ = cupy.sum((atom_grid[:,None,:] - 
atom_coords[None,:,:])**2, axis=2)**0.5 diJ = (riJ - R_in_J) / R_sw_J diJ[:,ia] = 1.0 - diJ[diJ < 1e-8] = 0.0 + diJ[diJ<1e-8] = 0.0 fiJ = switch_h(diJ) - + w = unit_sphere[:,3] * 4.0 * PI - swf = numpy.prod(fiJ, axis=1) + swf = cupy.prod(fiJ, axis=1) idx = w*swf > 1e-16 p0, p1 = p1, p1+sum(idx) @@ -154,17 +139,17 @@ def gen_surface(mol, ng=302, vdw_scale=1.2): norm_vec.append(unit_sphere[idx,:3]) xi = XI[ng] / (r_vdw * w[idx]**0.5) charge_exp.append(xi) - R_vdw.append(numpy.ones(sum(idx)) * r_vdw) + R_vdw.append(cupy.ones(idx.sum().get()) * r_vdw) area.append(w[idx]*r_vdw**2*swf[idx]) - - grid_coords = numpy.vstack(grid_coords) - norm_vec = numpy.vstack(norm_vec) - weights = numpy.concatenate(weights) - charge_exp = numpy.concatenate(charge_exp) - switch_fun = numpy.concatenate(switch_fun) - area = numpy.concatenate(area) - R_vdw = numpy.concatenate(R_vdw) - + + grid_coords = cupy.vstack(grid_coords) + norm_vec = cupy.vstack(norm_vec) + weights = cupy.concatenate(weights) + charge_exp = cupy.concatenate(charge_exp) + switch_fun = cupy.concatenate(switch_fun) + area = cupy.concatenate(area) + R_vdw = cupy.concatenate(R_vdw) + surface = { 'ng': ng, 'gslice_by_atom': gslice_by_atom, @@ -191,58 +176,6 @@ def get_F_A(surface): A = weights*R_vdw**2*switch_fun return switch_fun, A -def get_dF_dA(surface): - ''' - J. Chem. Phys. 
133, 244111 (2010), Appendix C - ''' - - atom_coords = surface['atom_coords'] - grid_coords = surface['grid_coords'] - switch_fun = surface['switch_fun'] - area = surface['area'] - R_in_J = surface['R_in_J'] - R_sw_J = surface['R_sw_J'] - - ngrids = grid_coords.shape[0] - natom = atom_coords.shape[0] - dF = numpy.zeros([ngrids, natom, 3]) - dA = numpy.zeros([ngrids, natom, 3]) - - for ia in range(atom_coords.shape[0]): - p0,p1 = surface['gslice_by_atom'][ia] - coords = grid_coords[p0:p1] - p1 = p0 + coords.shape[0] - ri_rJ = numpy.expand_dims(coords, axis=1) - atom_coords - riJ = numpy.linalg.norm(ri_rJ, axis=-1) - diJ = (riJ - R_in_J) / R_sw_J - diJ[:,ia] = 1.0 - diJ[diJ < 1e-8] = 0.0 - ri_rJ[:,ia,:] = 0.0 - ri_rJ[diJ < 1e-8] = 0.0 - - fiJ = switch_h(diJ) - dfiJ = grad_switch_h(diJ) / (fiJ * riJ * R_sw_J) - dfiJ = numpy.expand_dims(dfiJ, axis=-1) * ri_rJ - - Fi = switch_fun[p0:p1] - Ai = area[p0:p1] - - # grids response - Fi = numpy.expand_dims(Fi, axis=-1) - Ai = numpy.expand_dims(Ai, axis=-1) - dFi_grid = numpy.sum(dfiJ, axis=1) - - dF[p0:p1,ia,:] += Fi * dFi_grid - dA[p0:p1,ia,:] += Ai * dFi_grid - - # atom response - Fi = numpy.expand_dims(Fi, axis=-2) - Ai = numpy.expand_dims(Ai, axis=-2) - dF[p0:p1,:,:] -= Fi * dfiJ - dA[p0:p1,:,:] -= Ai * dfiJ - - return dF, dA - def get_D_S(surface, with_S=True, with_D=False): ''' generate D and S matrix in J. Chem. Phys. 
133, 244111 (2010) @@ -254,223 +187,24 @@ def get_D_S(surface, with_S=True, with_D=False): norm_vec = surface['norm_vec'] R_vdw = surface['R_vdw'] - xi_i, xi_j = numpy.meshgrid(charge_exp, charge_exp, indexing='ij') + xi_i, xi_j = cupy.meshgrid(charge_exp, charge_exp, indexing='ij') xi_ij = xi_i * xi_j / (xi_i**2 + xi_j**2)**0.5 - rij = scipy.spatial.distance.cdist(grid_coords, grid_coords) + #rij = scipy.spatial.distance.cdist(grid_coords, grid_coords) + rij = cupy.sum((grid_coords[:,None,:] - grid_coords[None,:,:])**2, axis=2)**0.5 xi_r_ij = xi_ij * rij - numpy.fill_diagonal(rij, 1) + cupy.fill_diagonal(rij, 1) S = scipy.special.erf(xi_r_ij) / rij - numpy.fill_diagonal(S, charge_exp * (2.0 / PI)**0.5 / switch_fun) - + cupy.fill_diagonal(S, charge_exp * (2.0 / PI)**0.5 / switch_fun) + D = None if with_D: - drij = numpy.expand_dims(grid_coords, axis=1) - grid_coords - nrij = numpy.sum(drij * norm_vec, axis=-1) - - D = S*nrij/rij**2 -2.0*xi_r_ij/PI**0.5*numpy.exp(-xi_r_ij**2)*nrij/rij**3 - numpy.fill_diagonal(D, -charge_exp * (2.0 / PI)**0.5 / (2.0 * R_vdw)) + drij = cupy.expand_dims(grid_coords, axis=1) - grid_coords + nrij = cupy.sum(drij * norm_vec, axis=-1) - return D, S + D = S*nrij/rij**2 -2.0*xi_r_ij/PI**0.5*cupy.exp(-xi_r_ij**2)*nrij/rij**3 + cupy.fill_diagonal(D, -charge_exp * (2.0 / PI)**0.5 / (2.0 * R_vdw)) -def get_dD_dS(surface, dF, with_S=True, with_D=False): - ''' - derivative of D and S w.r.t grids, partial_i D_ij = -partial_j D_ij - S is symmetric, D is not - ''' - grid_coords = surface['grid_coords'] - exponents = surface['charge_exp'] - norm_vec = surface['norm_vec'] - switch_fun = surface['switch_fun'] - - xi_i, xi_j = numpy.meshgrid(exponents, exponents, indexing='ij') - xi_ij = xi_i * xi_j / (xi_i**2 + xi_j**2)**0.5 - ri_rj = numpy.expand_dims(grid_coords, axis=1) - grid_coords - rij = numpy.linalg.norm(ri_rj, axis=-1) - xi_r_ij = xi_ij * rij - numpy.fill_diagonal(rij, 1) - - dS_dr = -(scipy.special.erf(xi_r_ij) - 
2.0*xi_r_ij/PI**0.5*numpy.exp(-xi_r_ij**2))/rij**2 - numpy.fill_diagonal(dS_dr, 0) - - dS_dr= numpy.expand_dims(dS_dr, axis=-1) - drij = ri_rj/numpy.expand_dims(rij, axis=-1) - dS = dS_dr * drij - - dD = None - if with_D: - nj_rij = numpy.sum(ri_rj * norm_vec, axis=-1) - dD_dri = 4.0*xi_r_ij**2 * xi_ij / PI**0.5 * numpy.exp(-xi_r_ij**2) * nj_rij / rij**3 - numpy.fill_diagonal(dD_dri, 0.0) - - rij = numpy.expand_dims(rij, axis=-1) - nj_rij = numpy.expand_dims(nj_rij, axis=-1) - nj = numpy.expand_dims(norm_vec, axis=0) - dD_dri = numpy.expand_dims(dD_dri, axis=-1) - - dD = dD_dri * drij + dS_dr * (-nj/rij + 3.0*nj_rij/rij**2 * drij) - - dSii_dF = -exponents * (2.0/PI)**0.5 / switch_fun**2 - dSii = numpy.expand_dims(dSii_dF, axis=(1,2)) * dF - - return dD, dS, dSii - -def grad_kernel(pcmobj, dm): - ''' - dE = 0.5*v* d(K^-1 R) *v + q*dv - v^T* d(K^-1 R)v = v^T*K^-1(dR - dK K^-1R)v = v^T K^-1(dR - dK q) - ''' - mol = pcmobj.mol - nao = mol.nao - aoslice = mol.aoslice_by_atom() - gridslice = pcmobj.surface['gslice_by_atom'] - grid_coords = pcmobj.surface['grid_coords'] - exponents = pcmobj.surface['charge_exp'] - v_grids = pcmobj._intermediates['v_grids'] - A = pcmobj._intermediates['A'] - D = pcmobj._intermediates['D'] - S = pcmobj._intermediates['S'] - K = pcmobj._intermediates['K'] - q = pcmobj._intermediates['q'] - q_sym = pcmobj._intermediates['q_sym'] - - vK_1 = numpy.linalg.solve(K.T, v_grids) - - # ----------------- potential response ----------------------- - max_memory = pcmobj.max_memory - lib.current_memory()[0] - blksize = int(max(max_memory*.9e6/8/nao**2, 400)) - ngrids = grid_coords.shape[0] - atom_coords = mol.atom_coords(unit='B') - - dvj = numpy.zeros([nao,3]) - dq = numpy.zeros([ngrids,3]) - for p0, p1 in lib.prange(0, ngrids, blksize): - fakemol = gto.fakemol_for_charges(grid_coords[p0:p1], expnt=exponents**2) - # charge response - v_nj_ip1 = df.incore.aux_e2(mol, fakemol, intor='int3c2e_ip1', aosym='s1', comp=3) - vj = numpy.einsum('xijn,n->xij', 
v_nj_ip1, q_sym) - dvj += numpy.einsum('xij,ij->ix', vj, dm) - dvj += numpy.einsum('xij,ji->ix', vj, dm) - - # electronic potential response - v_nj_ip2 = df.incore.aux_e2(mol, fakemol, intor='int3c2e_ip2', aosym='s1', comp=3) - dq_slice = numpy.einsum('xijn,ij->nx', v_nj_ip2, dm) - dq[p0:p1] = numpy.einsum('nx,n->nx', dq_slice, q_sym[p0:p1]) - - de = numpy.zeros_like(atom_coords) - de += numpy.asarray([numpy.sum(dq[p0:p1], axis=0) for p0,p1 in gridslice]) - de += numpy.asarray([numpy.sum(dvj[p0:p1], axis=0) for p0,p1 in aoslice[:,2:]]) - - atom_charges = mol.atom_charges() - fakemol_nuc = gto.fakemol_for_charges(atom_coords) - - # nuclei response - int2c2e_ip1 = mol._add_suffix('int2c2e_ip1') - v_ng_ip1 = gto.mole.intor_cross(int2c2e_ip1, fakemol_nuc, fakemol) - dv_g = numpy.einsum('g,xng->nx', q_sym, v_ng_ip1) - de -= numpy.einsum('nx,n->nx', dv_g, atom_charges) - - # nuclei potential response - int2c2e_ip2 = mol._add_suffix('int2c2e_ip2') - v_ng_ip2 = gto.mole.intor_cross(int2c2e_ip2, fakemol_nuc, fakemol) - dv_g = numpy.einsum('n,xng->gx', atom_charges, v_ng_ip2) - dv_g = numpy.einsum('gx,g->gx', dv_g, q_sym) - de -= numpy.asarray([numpy.sum(dv_g[p0:p1], axis=0) for p0,p1 in gridslice]) - - ## --------------- response from stiffness matrices ---------------- - gridslice = pcmobj.surface['gslice_by_atom'] - dF, dA = get_dF_dA(pcmobj.surface) - - with_D = pcmobj.method.upper() == 'IEF-PCM' or pcmobj.method.upper() == 'SS(V)PE' - dD, dS, dSii = get_dD_dS(pcmobj.surface, dF, with_D=with_D, with_S=True) - - if pcmobj.method.upper() == 'IEF-PCM' or pcmobj.method.upper() == 'SS(V)PE': - DA = D*A - - epsilon = pcmobj.eps - - #de_dF = v0 * -dSii_dF * q - #de += 0.5*numpy.einsum('i,inx->nx', de_dF, dF) - # dQ = v^T K^-1 (dR - dK K^-1 R) v - if pcmobj.method.upper() == 'C-PCM' or pcmobj.method.upper() == 'COSMO': - # dR = 0, dK = dS - de_dS = numpy.einsum('i,ijx,j->ix', vK_1, dS, q) - de -= numpy.asarray([numpy.sum(de_dS[p0:p1], axis=0) for p0,p1, in gridslice]) - de -= 
0.5*numpy.einsum('i,ijx,i->jx', vK_1, dSii, q) - - elif pcmobj.method.upper() == 'IEF-PCM' or pcmobj.method.upper() == 'SS(V)PE': - # IEF-PCM and SS(V)PE formally are the same in gradient calculation - # dR = f_eps/(2*pi) * (dD*A + D*dA), - # dK = dS - f_eps/(2*pi) * (dD*A*S + D*dA*S + D*A*dS) - f_epsilon = (epsilon - 1.0)/(epsilon + 1.0) - fac = f_epsilon/(2.0*PI) - - Av = A*v_grids - de_dR = 0.5*fac * numpy.einsum('i,ijx,j->ix', vK_1, dD, Av) - de_dR -= 0.5*fac * numpy.einsum('i,ijx,j->jx', vK_1, dD, Av) - de_dR = numpy.asarray([numpy.sum(de_dR[p0:p1], axis=0) for p0,p1 in gridslice]) - de_dR += 0.5*fac * numpy.einsum('i,ij,jnx,j->nx', vK_1, D, dA, v_grids) - - de_dS0 = 0.5*numpy.einsum('i,ijx,j->ix', vK_1, dS, q) - de_dS0 -= 0.5*numpy.einsum('i,ijx,j->jx', vK_1, dS, q) - de_dS0 = numpy.asarray([numpy.sum(de_dS0[p0:p1], axis=0) for p0,p1 in gridslice]) - de_dS0 += 0.5*numpy.einsum('i,inx,i->nx', vK_1, dSii, q) - - vK_1_DA = numpy.dot(vK_1, DA) - de_dS1 = 0.5*numpy.einsum('j,jkx,k->jx', vK_1_DA, dS, q) - de_dS1 -= 0.5*numpy.einsum('j,jkx,k->kx', vK_1_DA, dS, q) - de_dS1 = numpy.asarray([numpy.sum(de_dS1[p0:p1], axis=0) for p0,p1 in gridslice]) - de_dS1 += 0.5*numpy.einsum('j,jnx,j->nx', vK_1_DA, dSii, q) - - Sq = numpy.dot(S,q) - ASq = A*Sq - de_dD = 0.5*numpy.einsum('i,ijx,j->ix', vK_1, dD, ASq) - de_dD -= 0.5*numpy.einsum('i,ijx,j->jx', vK_1, dD, ASq) - de_dD = numpy.asarray([numpy.sum(de_dD[p0:p1], axis=0) for p0,p1 in gridslice]) - - vK_1_D = numpy.dot(vK_1, D) - de_dA = 0.5*numpy.einsum('j,jnx,j->nx', vK_1_D, dA, Sq) - - de_dK = de_dS0 - fac * (de_dD + de_dA + de_dS1) - de += de_dR - de_dK - else: - raise RuntimeError(f"Unknown implicit solvent model: {pcmobj.method}") - - return de - -def make_grad_object(grad_method): - ''' - return solvent gradient object - ''' - grad_method_class = grad_method.__class__ - class WithSolventGrad(grad_method_class): - def __init__(self, grad_method): - self.__dict__.update(grad_method.__dict__) - self.de_solvent = None - 
self.de_solute = None - self._keys = self._keys.union(['de_solvent', 'de_solute']) - - def kernel(self, *args, dm=None, atmlst=None, **kwargs): - dm = kwargs.pop('dm', None) - if dm is None: - dm = self.base.make_rdm1(ao_repr=True) - - self.de_solvent = grad_kernel(self.base.with_solvent, dm) - self.de_solute = grad_method_class.kernel(self, *args, **kwargs) - self.de = self.de_solute + self.de_solvent - - if self.verbose >= logger.NOTE: - logger.note(self, '--------------- %s (+%s) gradients ---------------', - self.base.__class__.__name__, - self.base.with_solvent.__class__.__name__) - logger.note(self, '----------------------------------------------') - return self.de - - def _finalize(self): - # disable _finalize. It is called in grad_method.kernel method - # where self.de was not yet initialized. - pass - - return WithSolventGrad(grad_method) + return D, S class PCM(ddcosmo.DDCOSMO): def __init__(self, mol): @@ -481,10 +215,7 @@ def __init__(self, mol): self._intermediates = {} def dump_flags(self, verbose=None): - logger.info(self, '******** %s (In testing) ********', self.__class__) - logger.warn(self, 'ddPCM is an experimental feature. 
It is ' - 'still in testing.\nFeatures and APIs may be changed ' - 'in the future.') + logger.info(self, '******** %s ********', self.__class__) logger.info(self, 'lebedev_order = %s (%d grids per sphere)', self.lebedev_order, gen_grid.LEBEDEV_ORDER[self.lebedev_order]) logger.info(self, 'lmax = %s' , self.lmax) @@ -502,48 +233,65 @@ def build(self, ng=None): vdw_scale = self.vdw_scale self.radii_table = vdw_scale * Bondi mol = self.mol - if ng is None: + if ng is None: ng = gen_grid.LEBEDEV_ORDER[self.lebedev_order] - + self.surface = gen_surface(mol, ng=ng, vdw_scale=vdw_scale) self._intermediates = {} F, A = get_F_A(self.surface) D, S = get_D_S(self.surface, with_S=True, with_D=True) - + epsilon = self.eps if self.method.upper() == 'C-PCM': f_epsilon = (epsilon-1.)/epsilon K = S - R = -f_epsilon * numpy.eye(K.shape[0]) + R = -f_epsilon * cupy.eye(K.shape[0]) elif self.method.upper() == 'COSMO': f_epsilon = (epsilon - 1.0)/(epsilon + 1.0/2.0) K = S - R = -f_epsilon * numpy.eye(K.shape[0]) + R = -f_epsilon * cupy.eye(K.shape[0]) elif self.method.upper() == 'IEF-PCM': f_epsilon = (epsilon - 1.0)/(epsilon + 1.0) DA = D*A - DAS = numpy.dot(DA, S) + DAS = cupy.dot(DA, S) K = S - f_epsilon/(2.0*PI) * DAS - R = -f_epsilon * (numpy.eye(K.shape[0]) - 1.0/(2.0*PI)*DA) + R = -f_epsilon * (cupy.eye(K.shape[0]) - 1.0/(2.0*PI)*DA) elif self.method.upper() == 'SS(V)PE': f_epsilon = (epsilon - 1.0)/(epsilon + 1.0) DA = D*A - DAS = numpy.dot(DA, S) + DAS = cupy.dot(DA, S) K = S - f_epsilon/(4.0*PI) * (DAS + DAS.T) - R = -f_epsilon * (numpy.eye(K.shape[0]) - 1.0/(2.0*PI)*DA) + R = -f_epsilon * (cupy.eye(K.shape[0]) - 1.0/(2.0*PI)*DA) else: raise RuntimeError(f"Unknown implicit solvent model: {self.method}") intermediates = { - 'S': S, - 'D': D, - 'A': A, - 'K': K, - 'R': R, + 'S': cupy.asarray(S), + 'D': cupy.asarray(D), + 'A': cupy.asarray(A), + 'K': cupy.asarray(K), + 'R': cupy.asarray(R), 'f_epsilon': f_epsilon } self._intermediates.update(intermediates) + charge_exp = 
self.surface['charge_exp'] + grid_coords = self.surface['grid_coords'] + atom_coords = mol.atom_coords(unit='B') + atom_charges = mol.atom_charges() + + # Move this to GPU + auxmol = gto.fakemol_for_charges(grid_coords.get(), expnt=charge_exp.get()**2) + intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e') + intopt.build(1e-14, diag_block_with_triu=False, aosym=True, group_size=256) + self.intopt = intopt + + int2c2e = mol._add_suffix('int2c2e') + fakemol_nuc = gto.fakemol_for_charges(atom_coords) + v_ng = gto.mole.intor_cross(int2c2e, fakemol_nuc, auxmol) + v_grids_n = numpy.dot(atom_charges, v_ng) + self.v_grids_n = cupy.asarray(v_grids_n) + def _get_vind(self, dms): if not self._intermediates or self.grids.coords is None: self.build() @@ -553,16 +301,16 @@ def _get_vind(self, dms): K = self._intermediates['K'] R = self._intermediates['R'] - v_grids = self._get_v(self.surface, dms) - b = numpy.dot(R, v_grids) - q = numpy.linalg.solve(K, b) + v_grids = self._get_v(dms) + b = cupy.dot(R, v_grids) + q = cupy.linalg.solve(K, b) - vK_1 = numpy.linalg.solve(K.T, v_grids) - q_sym = (q + numpy.dot(R.T, vK_1))/2.0 + vK_1 = cupy.linalg.solve(K.T, v_grids) + q_sym = (q + cupy.dot(R.T, vK_1))/2.0 vmat = self._get_vmat(q_sym) - epcm = 0.5 * numpy.dot(q_sym, v_grids) - + epcm = 0.5 * cupy.dot(q_sym, v_grids) + self._intermediates['K'] = K self._intermediates['R'] = R self._intermediates['q'] = q @@ -571,54 +319,26 @@ def _get_vind(self, dms): return epcm, vmat - def _get_v(self, surface, dms): + def _get_v(self, dms): ''' return electrostatic potential on surface ''' - mol = self.mol - nao = dms.shape[-1] - atom_coords = mol.atom_coords(unit='B') - atom_charges = mol.atom_charges() - grid_coords = surface['grid_coords'] - exponents = surface['charge_exp'] - - max_memory = self.max_memory - lib.current_memory()[0] - blksize = int(max(max_memory*.9e6/8/nao**2, 400)) - ngrids = grid_coords.shape[0] - int3c2e = mol._add_suffix('int3c2e') - cintopt = gto.moleintor.make_cintopt(mol._atm, 
mol._bas, mol._env, int3c2e) - v_grids_e = numpy.empty(ngrids) - for p0, p1 in lib.prange(0, ngrids, blksize): - fakemol = gto.fakemol_for_charges(grid_coords[p0:p1], expnt=exponents**2) - v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e, aosym='s1', cintopt=cintopt) - v_grids_e[p0:p1] = numpy.einsum('ijL,ij->L',v_nj, dms[0]) - - int2c2e = mol._add_suffix('int2c2e') - - fakemol_nuc = gto.fakemol_for_charges(atom_coords) - v_ng = gto.mole.intor_cross(int2c2e, fakemol_nuc, fakemol) - v_grids_n = numpy.dot(atom_charges, v_ng) - - v_grids = v_grids_n - v_grids_e + v_grids_e = 2.0*int3c2e.get_j_int3c2e_pass1(self.intopt, dms[0]) + v_grids = self.v_grids_n - v_grids_e return v_grids def _get_vmat(self, q): - mol = self.mol - nao = mol.nao - grid_coords = self.surface['grid_coords'] - exponents = self.surface['charge_exp'] - max_memory = self.max_memory - lib.current_memory()[0] - blksize = int(max(max_memory*.9e6/8/nao**2, 400)) - ngrids = grid_coords.shape[0] - int3c2e = mol._add_suffix('int3c2e') - cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e) - vmat = numpy.zeros([nao,nao]) - for p0, p1 in lib.prange(0, ngrids, blksize): - fakemol = gto.fakemol_for_charges(grid_coords[p0:p1], expnt=exponents**2) - v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e, aosym='s1', cintopt=cintopt) - vmat += -numpy.einsum('ijL,L->ij', v_nj, q[p0:p1]) - return vmat - + return -int3c2e.get_j_int3c2e_pass2(self.intopt, q) + def nuc_grad_method(self, grad_method): - return make_grad_object(grad_method) + from gpu4pyscf.solvent.grad import pcm as pcm_grad + if self.frozen: + raise RuntimeError('Frozen solvent model is not supported') + from gpu4pyscf import scf + if isinstance(grad_method.base, scf.hf.RHF): + return pcm_grad.make_grad_object(grad_method) + else: + raise RuntimeError('Only SCF gradient is supported') + def Hessian(self): + raise NotImplementedError('not implemented yet') diff --git a/gpu4pyscf/solvent/tests/test_pcm.py 
b/gpu4pyscf/solvent/tests/test_pcm.py index f0087a1a..2a78decc 100644 --- a/gpu4pyscf/solvent/tests/test_pcm.py +++ b/gpu4pyscf/solvent/tests/test_pcm.py @@ -15,13 +15,14 @@ import unittest import numpy -from pyscf import scf, gto, df -from gpu4pyscf.solvent import pcm +from pyscf import gto, df +from gpu4pyscf import scf +from gpu4pyscf.solvent import pcm def setUpModule(): global mol, epsilon, lebedev_order mol = gto.Mole() - mol.atom = ''' + mol.atom = ''' O 0.0000000000 -0.0000000000 0.1174000000 H -0.7570000000 -0.0000000000 -0.4696000000 H 0.7570000000 0.0000000000 -0.4696000000 @@ -59,7 +60,7 @@ def test_COSMO(self): e_tot = mf.kernel() print(f"Energy error in COSMO: {numpy.abs(e_tot - -74.96900351922464)}") assert numpy.abs(e_tot - -74.96900351922464) < 1e-9 - + def test_IEFPCM(self): cm = pcm.PCM(mol) cm.eps = epsilon @@ -70,7 +71,7 @@ def test_IEFPCM(self): e_tot = mf.kernel() print(f"Energy error in IEF-PCM: {numpy.abs(e_tot - -74.9690111344)}") assert numpy.abs(e_tot - -74.9690111344) < 1e-9 - + def test_SSVPE(self): cm = pcm.PCM(mol) cm.eps = epsilon @@ -81,7 +82,7 @@ def test_SSVPE(self): e_tot = mf.kernel() print(f"Energy error in SS(V)PE: {numpy.abs(e_tot - -74.9689577454)}") assert numpy.abs(e_tot - -74.9689577454) < 1e-9 - + if __name__ == "__main__": print("Full Tests for PCMs") unittest.main() \ No newline at end of file diff --git a/gpu4pyscf/solvent/tests/test_pcm_grad.py b/gpu4pyscf/solvent/tests/test_pcm_grad.py index 732aa5d2..677aa285 100644 --- a/gpu4pyscf/solvent/tests/test_pcm_grad.py +++ b/gpu4pyscf/solvent/tests/test_pcm_grad.py @@ -15,14 +15,15 @@ import unittest import numpy -from pyscf import scf, gto, df -from gpu4pyscf.solvent import pcm +from pyscf import gto +from gpu4pyscf import scf +from gpu4pyscf.solvent import pcm from gpu4pyscf.solvent.grad import pcm as pcm_grad def setUpModule(): global mol, epsilon, lebedev_order mol = gto.Mole() - mol.atom = ''' + mol.atom = ''' O 0.0000000000 -0.0000000000 0.1174000000 H 
-0.7570000000 -0.0000000000 -0.4696000000 H 0.7570000000 0.0000000000 -0.4696000000 @@ -44,10 +45,10 @@ def test_dA_dF(self): cm.lebedev_order = 3 cm.method = 'IEF-PCM' cm.build() - + dF, dA = pcm_grad.get_dF_dA(cm.surface) dD, dS, dSii = pcm_grad.get_dD_dS(cm.surface, dF, with_S=True, with_D=True) - + def get_FADS(mol): mol.build() cm = pcm.PCM(mol) @@ -59,7 +60,7 @@ def get_FADS(mol): D = cm._intermediates['D'] S = cm._intermediates['S'] return F, A, D, S - + eps = 1e-5 for ia in range(mol.natm): p0,p1 = cm.surface['gslice_by_atom'][ia] @@ -84,7 +85,7 @@ def get_FADS(mol): assert numpy.linalg.norm(dF0 - dF[:,ia,j]) < 1e-8 assert numpy.linalg.norm(dA0 - dA[:,ia,j]) < 1e-8 - + # the diagonal entries are calcualted separately assert numpy.linalg.norm(dSii[:,ia,j] - numpy.diag(dS0)) < 1e-8 numpy.fill_diagonal(dS0, 0) @@ -93,12 +94,12 @@ def get_FADS(mol): dS_ia[p0:p1] = dS[p0:p1,:,j] dS_ia[:,p0:p1] -= dS[:,p0:p1,j] assert numpy.linalg.norm(dS0 - dS_ia) < 1e-8 - + dD_ia = numpy.zeros_like(dD0) dD_ia[p0:p1] = dD[p0:p1,:,j] dD_ia[:,p0:p1] -= dD[:,p0:p1,j] assert numpy.linalg.norm(dD0 - dD_ia) < 1e-8 - + def test_grad_CPCM(self): cm = pcm.PCM(mol) cm.eps = epsilon @@ -117,10 +118,10 @@ def test_grad_CPCM(self): [0.49773047433563E-15, -0.12128126037559E-15, -0.58936988992306E-01], [0.22810111996954E-01, -0.68951901317025E-17, 0.29468494708267E-01], [-0.22810111996957E-01, 0.12949813945902E-15, 0.29468494708266E-01]]) - + print(f"Gradient error in CPCM: {numpy.linalg.norm(g0 - grad)}") assert numpy.linalg.norm(g0 - grad) < 1e-9 - + def test_grad_COSMO(self): cm = pcm.PCM(mol) cm.eps = epsilon @@ -134,15 +135,15 @@ def test_grad_COSMO(self): g = mf.nuc_grad_method() grad = g.kernel() - + g0 = numpy.asarray( [[-1.33560836e-16, 8.70874355e-17, -5.89638726e-02], [ 2.28202396e-02, 2.63784344e-17, 2.94819363e-02], [-2.28202396e-02, -1.08799896e-16, 2.94819363e-02]]) - + print(f"Gradient error in COSMO: {numpy.linalg.norm(g0 - grad)}") assert numpy.linalg.norm(g0 - grad) < 1e-9 - 
+ def test_grad_IEFPCM(self): cm = pcm.PCM(mol) cm.eps = epsilon @@ -153,17 +154,17 @@ def test_grad_IEFPCM(self): mf.verbose = 0 mf.conv_tol = 1e-12 e_tot = mf.kernel() - + g = mf.nuc_grad_method() grad = g.kernel() - + g0 = numpy.asarray([ [0.18357915015649E-14, 0.14192681822347E-15, -0.58988087999658E-01], [0.22822709179063E-01, -0.10002010417168E-15, 0.29494044211805E-01], [-0.22822709179066E-01, -0.31051364515588E-16, 0.29494044211806E-01]]) print(f"Gradient error in IEFPCM: {numpy.linalg.norm(g0 - grad)}") assert numpy.linalg.norm(g0 - grad) < 1e-9 - + def test_grad_SSVPE(self): cm = pcm.PCM(mol) cm.eps = epsilon @@ -177,14 +178,14 @@ def test_grad_SSVPE(self): g = mf.nuc_grad_method() grad = g.kernel() - + g0 = numpy.asarray([ [0.76104817971710E-15, 0.11185701540547E-15, -0.58909172879217E-01], [0.22862990009767E-01, -0.13861633974903E-15, 0.29454586651678E-01], [-0.22862990009769E-01, 0.34988765678591E-16, 0.29454586651679E-01]]) print(f"Gradient error in SS(V)PE: {numpy.linalg.norm(g0 - grad)}") assert numpy.linalg.norm(g0 - grad) < 1e-9 - + if __name__ == "__main__": print("Full Tests for Gradient of PCMs") unittest.main() \ No newline at end of file From 98741874341b546d881d9e6143d435fe86596baf Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Tue, 17 Oct 2023 20:05:40 -0700 Subject: [PATCH 07/19] Hotfix 0.6.1 (#49) * numpy -> cupy for solvent * for linter * remove grad switch from pcm.py * passed flake8 * solvent integrals on GPU * flake8 * compatible with pyscf-2.4.0 --- gpu4pyscf/__init__.py | 2 +- gpu4pyscf/dft/gen_grid.py | 129 ++++---------------------- gpu4pyscf/grad/rks.py | 38 ++++---- gpu4pyscf/grad/tests/test_rks_grad.py | 27 +++--- 4 files changed, 53 insertions(+), 143 deletions(-) diff --git a/gpu4pyscf/__init__.py b/gpu4pyscf/__init__.py index a52a096a..143af69c 100644 --- a/gpu4pyscf/__init__.py +++ b/gpu4pyscf/__init__.py @@ -1,2 +1,2 @@ from .
import lib, grad, hessian, solvent, scf, dft -__version__ = '0.6.1' \ No newline at end of file +__version__ = '0.6.2' diff --git a/gpu4pyscf/dft/gen_grid.py b/gpu4pyscf/dft/gen_grid.py index 8e70d549..ea76e2f8 100644 --- a/gpu4pyscf/dft/gen_grid.py +++ b/gpu4pyscf/dft/gen_grid.py @@ -31,7 +31,6 @@ import cupy from pyscf import lib from pyscf.lib import logger -#from pyscf.dft import radi from pyscf import gto from pyscf.gto.eval_gto import BLKSIZE, NBINS, CUTOFF, make_screen_index from pyscf import __config__ @@ -262,7 +261,7 @@ def gen_atomic_grids(mol, atom_grid={}, radi_method=radi.gauss_chebyshev, vol.append(cupy.einsum('i,j->ji', rad_weight[idx], grid[:,3]).ravel()) atom_grids_tab[symb] = (cupy.vstack(coords), cupy.hstack(vol)) - + return atom_grids_tab def get_partition(mol, atom_grids_tab, @@ -291,26 +290,6 @@ def get_partition(mol, atom_grids_tab, (radii_adjust is radi.treutler_atomic_radii_adjust or radii_adjust is radi.becke_atomic_radii_adjust or f_radii_adjust is None)): - ''' - if f_radii_adjust is None: - p_radii_table = lib.c_null_ptr() - else: - f_radii_table = numpy.asarray([f_radii_adjust(i, j, 0) - for i in range(mol.natm) - for j in range(mol.natm)]) - p_radii_table = f_radii_table.ctypes.data_as(ctypes.c_void_p) - - def gen_grid_partition0(coords): - coords = numpy.asarray(coords, order='F') - ngrids = coords.shape[0] - pbecke = numpy.empty((mol.natm,ngrids)) - libdft.VXCgen_grid(pbecke.ctypes.data_as(ctypes.c_void_p), - coords.ctypes.data_as(ctypes.c_void_p), - atm_coords.ctypes.data_as(ctypes.c_void_p), - p_radii_table, - ctypes.c_int(mol.natm), ctypes.c_int(ngrids)) - return pbecke - ''' def gen_grid_partition(coords): grid_dist = cupy.linalg.norm(coords[None,:,:] - atm_coords[:,None,:], axis=-1) r12 = grid_dist[:,None,:] - grid_dist[None,:,:] @@ -426,7 +405,8 @@ def _load_conf(mod, name, default): else: return var -class Grids(lib.StreamObject): +from pyscf.dft import gen_grid +class Grids(gen_grid.Grids): '''DFT mesh grids Attributes for 
Grids: @@ -501,30 +481,6 @@ class Grids(lib.StreamObject): alignment = ALIGNMENT_UNIT cutoff = CUTOFF - - def __init__(self, mol): - self.mol = mol - self.stdout = mol.stdout - self.verbose = mol.verbose - self.symmetry = mol.symmetry - self.atom_grid = {} - -################################################## -# don't modify the following attributes, they are not input options - self.non0tab = None - # Integral screen index ~= NBINS + log(ao). - # screen_index > 0 for non-zero AOs - self.screen_index = None - self.coords = None - self.weights = None - self._keys = set(self.__dict__.keys()).update([ - 'atomic_radii', 'radii_adjust', 'radi_method', 'becke_scheme', - 'prune', 'level', 'alignment', 'cutoff', - ]) - - @property - def size(self): - return getattr(self.weights, 'size', 0) def __setattr__(self, key, val): if key in ('atom_grid', 'atomic_radii', 'radii_adjust', 'radi_method', @@ -532,20 +488,6 @@ def __setattr__(self, key, val): self.reset() super(Grids, self).__setattr__(key, val) - def dump_flags(self, verbose=None): - logger.info(self, 'radial grids: %s', self.radi_method.__doc__) - logger.info(self, 'becke partition: %s', self.becke_scheme.__doc__) - logger.info(self, 'pruning grids: %s', self.prune) - logger.info(self, 'grids dens level: %d', self.level) - logger.info(self, 'symmetrized grids: %s', self.symmetry) - if self.radii_adjust is not None: - logger.info(self, 'atomic radii adjust function: %s', - self.radii_adjust) - logger.debug2(self, 'atomic_radii : %s', self.atomic_radii) - if self.atom_grid: - logger.info(self, 'User specified grid scheme %s', str(self.atom_grid)) - return self - def build(self, mol=None, with_non0tab=False, sort_grids=True, **kwargs): if mol is None: mol = self.mol if self.verbose >= logger.WARN: @@ -564,10 +506,10 @@ def build(self, mol=None, with_non0tab=False, sort_grids=True, **kwargs): padding = _padding_size(self.size, self.alignment) logger.debug(self, 'Padding %d grids', padding) if padding > 0: - self.coords = 
numpy.vstack( + # cupy.vstack and cupy.hstack convert numpy array into cupy array first + self.coords = cupy.vstack( [self.coords, numpy.repeat([[1e4]*3], padding, axis=0)]) - self.weights = numpy.hstack([self.weights, numpy.zeros(padding)]) - + self.weights = cupy.hstack([self.weights, numpy.zeros(padding)]) if with_non0tab: self.non0tab = self.make_mask(mol, self.coords) self.screen_index = self.non0tab @@ -612,62 +554,27 @@ def prune_by_density_(self, rho, threshold=0): return self mol = self.mol - n = numpy.dot(rho, self.weights) + n = cupy.dot(rho, self.weights) if abs(n-mol.nelectron) < NELEC_ERROR_TOL*n: rho *= self.weights idx = abs(rho) > threshold / self.weights.size logger.debug(self, 'Drop grids %d', - self.weights.size - numpy.count_nonzero(idx)) - self.coords = numpy.asarray(self.coords [idx], order='C') - self.weights = numpy.asarray(self.weights[idx], order='C') + self.weights.size - cupy.count_nonzero(idx)) + self.coords = cupy.asarray(self.coords [idx], order='C') + self.weights = cupy.asarray(self.weights[idx], order='C') if self.alignment > 1: padding = _padding_size(self.size, self.alignment) logger.debug(self, 'prune_by_density_: %d padding grids', padding) if padding > 0: - self.coords = numpy.vstack( - [self.coords, numpy.repeat([[1e4]*3], padding, axis=0)]) - self.weights = numpy.hstack([self.weights, numpy.zeros(padding)]) + self.coords = cupy.vstack( + [self.coords, cupy.repeat([[1e4]*3], padding, axis=0)]) + self.weights = cupy.hstack([self.weights, cupy.zeros(padding)]) self.non0tab = self.make_mask(mol, self.coords) self.screen_index = self.non0tab return self - -def _default_rad(nuc, level=3): - '''Number of radial grids ''' - tab = numpy.array( (2 , 10, 18, 36, 54, 86, 118)) - period = (nuc > tab).sum() - return RAD_GRIDS[level,period] -# Period 1 2 3 4 5 6 7 # level -RAD_GRIDS = numpy.array((( 10, 15, 20, 30, 35, 40, 50), # 0 - ( 30, 40, 50, 60, 65, 70, 75), # 1 - ( 40, 60, 65, 75, 80, 85, 90), # 2 - ( 50, 75, 80, 90, 95,100,105), # 
3 - ( 60, 90, 95,105,110,115,120), # 4 - ( 70,105,110,120,125,130,135), # 5 - ( 80,120,125,135,140,145,150), # 6 - ( 90,135,140,150,155,160,165), # 7 - (100,150,155,165,170,175,180), # 8 - (200,200,200,200,200,200,200),)) # 9 - -def _default_ang(nuc, level=3): - '''Order of angular grids. See LEBEDEV_ORDER for the mapping of - the order and the number of angular grids''' - tab = numpy.array( (2 , 10, 18, 36, 54, 86, 118)) - period = (nuc > tab).sum() - return LEBEDEV_ORDER[ANG_ORDER[level,period]] -# Period 1 2 3 4 5 6 7 # level -ANG_ORDER = numpy.array(((11, 15, 17, 17, 17, 17, 17 ), # 0 - (17, 23, 23, 23, 23, 23, 23 ), # 1 - (23, 29, 29, 29, 29, 29, 29 ), # 2 - (29, 29, 35, 35, 35, 35, 35 ), # 3 - (35, 41, 41, 41, 41, 41, 41 ), # 4 - (41, 47, 47, 47, 47, 47, 47 ), # 5 - (47, 53, 53, 53, 53, 53, 53 ), # 6 - (53, 59, 59, 59, 59, 59, 59 ), # 7 - (59, 59, 59, 59, 59, 59, 59 ), # 8 - (65, 65, 65, 65, 65, 65, 65 ),)) # 9 - -def _padding_size(ngrids, alignment): - if alignment <= 1: - return 0 - return (ngrids + alignment - 1) // alignment * alignment - ngrids +_default_rad = gen_grid._default_rad +RAD_GRIDS = gen_grid.RAD_GRIDS +_default_ang = gen_grid._default_ang +ANG_ORDER = gen_grid.ANG_ORDER +_padding_size = gen_grid._padding_size diff --git a/gpu4pyscf/grad/rks.py b/gpu4pyscf/grad/rks.py index 276e15b0..008baa40 100644 --- a/gpu4pyscf/grad/rks.py +++ b/gpu4pyscf/grad/rks.py @@ -50,7 +50,7 @@ def _get_veff(ks_grad, mol=None, dm=None): grids = ks_grad.grids else: grids = mf.grids - + if grids.coords is None: grids.build(sort_grids=True) @@ -89,7 +89,7 @@ def _get_veff(ks_grad, mol=None, dm=None): occ_coeff = cupy.asarray(mf.mo_coeff[:, mf.mo_occ>0.5], order='C') tmp = contract('nij,jk->nik', vxc, occ_coeff) vxc = 2.0*contract('nik,ik->ni', tmp, occ_coeff) - + aoslices = mol.aoslice_by_atom() vxc = [vxc[:,p0:p1].sum(axis=1) for p0, p1 in aoslices[:,2:]] vxc = cupy.asarray(vxc) @@ -116,7 +116,7 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, opt = 
ni.gdftopt mo_occ = cupy.asarray(dms.mo_occ) mo_coeff = cupy.asarray(dms.mo_coeff) - + coeff = cupy.asarray(opt.coeff) nao, nao0 = coeff.shape dms = cupy.asarray(dms) @@ -124,7 +124,7 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, for dm in dms.reshape(-1,nao0,nao0)] mo_coeff = coeff @ mo_coeff nset = len(dms) - + with opt.gdft_envs_cache(): if xctype == 'LDA': ao_deriv = 1 @@ -136,10 +136,10 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, block_size = int((mem_avail*.4/8/(comp+1)/nao - 3*nao*2)/ ALIGNED) * ALIGNED block_size = min(block_size, MIN_BLK_SIZE) log.debug1('Available GPU mem %f Mb, block_size %d', mem_avail/1e6, block_size) - + if block_size < ALIGNED: raise RuntimeError('Not enough GPU memory') - + vmat = cupy.zeros((nset,3,nao,nao)) if xctype == 'LDA': ao_deriv = 1 @@ -207,7 +207,7 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, exc = None if nset == 1: vmat = vmat[0] - + # - sign because nabla_X = -nabla_x return exc, -vmat @@ -221,7 +221,7 @@ def get_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, mo_occ = cupy.asarray(dms.mo_occ) mo_coeff = cupy.asarray(dms.mo_coeff) - + coeff = cupy.asarray(opt.coeff) nao, nao0 = coeff.shape dms = cupy.asarray(dms) @@ -255,9 +255,9 @@ def get_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, wv = vv_vxc[:,p0:p1] * weight wv[0] *= .5 # *.5 because vmat + vmat.T at the end vmat += _gga_grad_sum_(ao, wv) - + vmat = cupy.einsum('pi,npq,qj->nij', coeff, vmat, coeff) - + exc = None # - sign because nabla_X = -nabla_x return exc, -vmat @@ -288,7 +288,7 @@ def _d1_dot_(ao1, ao2): vmat1 = cupy.dot(ao1[1], ao2) vmat2 = cupy.dot(ao1[2], ao2) return cupy.stack([vmat0,vmat1,vmat2]) - + def _gga_grad_sum_(ao, wv): #:aow = numpy.einsum('npi,np->pi', ao[:4], wv[:4]) aow = numint._scale_ao(ao[:4], wv[:4]) @@ -296,7 +296,7 @@ def _gga_grad_sum_(ao, wv): aow = _make_dR_dao_w(ao, wv[:4]) vmat += _d1_dot_(aow, ao[0].T) return vmat - + # XX, XY, XZ = 
4, 5, 6 # YX, YY, YZ = 5, 7, 8 # ZX, ZY, ZZ = 6, 8, 9 @@ -342,10 +342,10 @@ def get_vxc_full_response(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, block_size = int((mem_avail*.4/8/(comp+1)/nao - 3*nao*2)/ ALIGNED) * ALIGNED block_size = min(block_size, MIN_BLK_SIZE) log.debug1('Available GPU mem %f Mb, block_size %d', mem_avail/1e6, block_size) - + if block_size < ALIGNED: raise RuntimeError('Not enough GPU memory') - + for atm_id, (coords, weight, weight1) in enumerate(grids_response_cc(grids)): ngrids = weight.size for p0, p1 in lib.prange(0,ngrids,block_size): @@ -371,7 +371,7 @@ def get_vxc_full_response(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, wv = weight[p0:p1] * vxc wv[0] *= .5 wv[4] *= .5 # for the factor 1/2 in tau - + vmat += _gga_grad_sum_(ao, wv) vmat += _tau_grad_dot_(ao, wv[4]) @@ -502,7 +502,9 @@ def get_du(ia, ib): # JCP 98, 5612 (1993); (B10) class Gradients(rhf_grad.Gradients, pyscf.grad.rks.Gradients): from gpu4pyscf.lib.utils import to_cpu, to_gpu, device - + + get_veff = _get_veff + def get_dispersion(self): if self.base.disp[:2].upper() == 'D3': from pyscf import lib @@ -511,12 +513,12 @@ def get_dispersion(self): d3 = disp.DFTD3Dispersion(self.mol, xc=self.base.xc, version=self.base.disp) _, g_d3 = d3.kernel() return g_d3 - + if self.base.disp[:2].upper() == 'D4': from pyscf.data.elements import charge atoms = numpy.array([ charge(a[0]) for a in self.mol._atom]) coords = self.mol.atom_coords() - + from pyscf import lib with lib.with_omp_threads(1): from dftd4.interface import DampingParam, DispersionModel diff --git a/gpu4pyscf/grad/tests/test_rks_grad.py b/gpu4pyscf/grad/tests/test_rks_grad.py index 04d7c7c8..59f36f14 100644 --- a/gpu4pyscf/grad/tests/test_rks_grad.py +++ b/gpu4pyscf/grad/tests/test_rks_grad.py @@ -40,7 +40,7 @@ def tearDownModule(): global mol mol.stdout.close() del mol - + def _check_grad(grid_response=False, xc='B3LYP', disp='d3bj', tol=1e-6): mf = rks.RKS(mol, xc=xc) mf.direct_scf_tol = 1e-14 @@ 
-50,11 +50,10 @@ def _check_grad(grid_response=False, xc='B3LYP', disp='d3bj', tol=1e-6): if mf._numint.libxc.is_nlc(mf.xc): mf.nlcgrids.level = nlcgrids_level mf.kernel() - cpu_gradient = pyscf.grad.RKS(mf) cpu_gradient.grid_response = grid_response g_cpu = cpu_gradient.kernel() - + # TODO: use to_gpu functionality mf.__class__ = gpu4pyscf.dft.rks.RKS @@ -63,49 +62,51 @@ def _check_grad(grid_response=False, xc='B3LYP', disp='d3bj', tol=1e-6): mf.grids.level = grids_level mf.grids.prune = None mf.grids.small_rho_cutoff = 1e-30 + mf.grids.build() if mf._numint.libxc.is_nlc(mf.xc): mf.nlcgrids = gpu4pyscf.dft.gen_grid.Grids(mol) mf.nlcgrids.level = nlcgrids_level - + mf.nlcgrids.build() + gpu_gradient = gpu4pyscf.grad.RKS(mf) gpu_gradient.grid_response = grid_response g_gpu = gpu_gradient.kernel() assert(cupy.linalg.norm(g_cpu - g_gpu) < tol) class KnownValues(unittest.TestCase): - + def test_grad_with_grids_response(self): print("-----testing DFT gradient with grids response----") _check_grad(grid_response=True, tol=1e-5) - + def test_grad_without_grids_response(self): print('-----testing DFT gradient without grids response----') _check_grad(grid_response=False, tol=1e-5) - + def test_grad_lda(self): print("-----LDA testing-------") _check_grad(xc='LDA', disp=None, tol=1e-5) - + def test_grad_gga(self): print('-----GGA testing-------') _check_grad(xc='PBE', disp=None, tol=1e-5) - + def test_grad_hybrid(self): print('------hybrid GGA testing--------') _check_grad(xc='B3LYP', disp=None, tol=1e-5) - + def test_grad_mgga(self): print('-------mGGA testing-------------') _check_grad(xc='m06', disp=None, tol=1e-4) - + def test_grad_rsh(self): print('--------RSH testing-------------') _check_grad(xc='wb97', disp=None, tol=1e-4) - + def test_grad_nlc(self): print('--------nlc testing-------------') _check_grad(xc='HYB_MGGA_XC_WB97M_V', disp=None, tol=1e-5) - + if __name__ == "__main__": print("Full Tests for Gradient") unittest.main() From 
4b1e36de4204b012f7cba6c2137fa5e55299b91b Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Tue, 17 Oct 2023 23:31:53 -0700 Subject: [PATCH 08/19] added __init__.py files (#50) --- MANIFEST.in | 3 ++- gpu4pyscf/__init__.py | 2 +- gpu4pyscf/df/__init__.py | 15 +++++++++++++++ gpu4pyscf/df/cderi.py | 6 +++--- gpu4pyscf/df/grad/__init__.py | 19 +++++++++++++++++++ gpu4pyscf/df/hessian/__init__.py | 19 +++++++++++++++++++ gpu4pyscf/gto/__init__.py | 14 ++++++++++++++ gpu4pyscf/lib/__init__.py | 15 +++++++++++++++ 8 files changed, 88 insertions(+), 5 deletions(-) create mode 100644 gpu4pyscf/df/__init__.py create mode 100644 gpu4pyscf/df/grad/__init__.py create mode 100644 gpu4pyscf/df/hessian/__init__.py create mode 100644 gpu4pyscf/gto/__init__.py diff --git a/MANIFEST.in b/MANIFEST.in index bb38fb88..c53a5f21 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -2,13 +2,14 @@ include MANIFEST.in include README.md setup.py CHANGELOG AUTHORS LICENSE NOTICE global-exclude *.py[cod] +global-exclude *~ #global-exclude *.cu #global-exclude *.h #global-exclude *.c #global-exclude *.cuh #global-exclude *.sh -prune */__pycache__ +prune */__pycache__ recursive-exclude */__pycache__ * prune gpu4pyscf/lib/build diff --git a/gpu4pyscf/__init__.py b/gpu4pyscf/__init__.py index 143af69c..7fb02e52 100644 --- a/gpu4pyscf/__init__.py +++ b/gpu4pyscf/__init__.py @@ -1,2 +1,2 @@ from . import lib, grad, hessian, solvent, scf, dft -__version__ = '0.6.2' +__version__ = '0.6.3' diff --git a/gpu4pyscf/df/__init__.py b/gpu4pyscf/df/__init__.py new file mode 100644 index 00000000..6716c097 --- /dev/null +++ b/gpu4pyscf/df/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + diff --git a/gpu4pyscf/df/cderi.py b/gpu4pyscf/df/cderi.py index 07ee360e..c1ae59c7 100644 --- a/gpu4pyscf/df/cderi.py +++ b/gpu4pyscf/df/cderi.py @@ -40,7 +40,7 @@ def __init__(self, nao, naux, nblocks) -> None: ctypes.c_int(nblocks), ctypes.c_int(nao)) return - + def __del__(self): self.row = [] self.col = [] @@ -57,8 +57,8 @@ def add_block(self, data, rows, cols): assert rows.dtype == cupy.int64 and cols.dtype == cupy.int64 nij = len(rows) err = libcupy_helper.add_block( - ctypes.byref(self.handle), - ctypes.c_int(nij), + ctypes.byref(self.handle), + ctypes.c_int(nij), ctypes.c_int(self.naux), ctypes.cast(rows.data.ptr, ctypes.c_void_p), ctypes.cast(cols.data.ptr, ctypes.c_void_p), diff --git a/gpu4pyscf/df/grad/__init__.py b/gpu4pyscf/df/grad/__init__.py new file mode 100644 index 00000000..22c672e3 --- /dev/null +++ b/gpu4pyscf/df/grad/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +from .
import rhf, rks + +RHF = rhf.Gradients +RKS = rks.Gradients \ No newline at end of file diff --git a/gpu4pyscf/df/hessian/__init__.py b/gpu4pyscf/df/hessian/__init__.py new file mode 100644 index 00000000..2b55ed12 --- /dev/null +++ b/gpu4pyscf/df/hessian/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +from . import rhf, rks + +RHF = rhf.Hessian +RKS = rks.Hessian \ No newline at end of file diff --git a/gpu4pyscf/gto/__init__.py b/gpu4pyscf/gto/__init__.py new file mode 100644 index 00000000..25a4587e --- /dev/null +++ b/gpu4pyscf/gto/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>.
diff --git a/gpu4pyscf/lib/__init__.py b/gpu4pyscf/lib/__init__.py index 8ef56b43..147324d9 100644 --- a/gpu4pyscf/lib/__init__.py +++ b/gpu4pyscf/lib/__init__.py @@ -1,3 +1,18 @@ +# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + import os import numpy from gpu4pyscf.lib import diis From d22e8823e85a5e354cb4087c28a2ea9aacc16441 Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Tue, 17 Oct 2023 23:59:27 -0700 Subject: [PATCH 09/19] Create __init__.py --- gpu4pyscf/solvent/grad/__init__.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 gpu4pyscf/solvent/grad/__init__.py diff --git a/gpu4pyscf/solvent/grad/__init__.py b/gpu4pyscf/solvent/grad/__init__.py new file mode 100644 index 00000000..25a4587e --- /dev/null +++ b/gpu4pyscf/solvent/grad/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details.
+# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. From 3b7b0917f1dd7ab0053083b19982342b3c080b11 Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Wed, 18 Oct 2023 00:18:29 -0700 Subject: [PATCH 10/19] Update README.md --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 0020f873..58b87b32 100644 --- a/README.md +++ b/README.md @@ -47,9 +47,10 @@ Features - SCF, analytical Gradient, and analytical Hessian calculations for Hartree-Fock and DFT; - LDA, GGA, mGGA, hybrid, and range-separated functionals via [libXC](https://gitlab.com/libxc/libxc/-/tree/master/); - Geometry optimization and transition state search via [geomeTRIC](https://geometric.readthedocs.io/en/latest/); -- Dispersion corrections via [DFT3](https://github.com/dftd3/simple-dftd3) and [DFT4](https://github.com/dftd4/dftd4); +- Dispersion corrections via [DFTD3](https://github.com/dftd3/simple-dftd3) and [DFTD4](https://github.com/dftd4/dftd4); - Nonlocal functional correction (vv10) for SCF and gradient; - ECP is supported and calculated on CPU; +- PCM solvent models and their analytical gradients; Limitations -------- From 7c343e4711341e0c50f958bbaadca2285ce7daf2 Mon Sep 17 00:00:00 2001 From: Qiming Sun Date: Wed, 18 Oct 2023 12:43:56 -0700 Subject: [PATCH 11/19] Refactor _DFHF class.
Add tests for to_cpu * Undefined variables * Update df_jk.py --------- Co-authored-by: Xiaojie Wu --- gpu4pyscf/df/df.py | 1 + gpu4pyscf/df/df_jk.py | 278 ++++++++++++++++-------------- gpu4pyscf/df/tests/test_df_scf.py | 25 ++- gpu4pyscf/dft/gks.py | 3 +- gpu4pyscf/dft/rks.py | 51 +++--- gpu4pyscf/lib/utils.py | 4 +- gpu4pyscf/scf/hf.py | 21 ++- gpu4pyscf/scf/tests/test_scf.py | 17 +- 8 files changed, 234 insertions(+), 166 deletions(-) diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py index 75e0753a..cc199ca2 100644 --- a/gpu4pyscf/df/df.py +++ b/gpu4pyscf/df/df.py @@ -47,6 +47,7 @@ def __init__(self, mol, auxbasis=None): def to_cpu(self): from gpu4pyscf.lib.utils import to_cpu obj = to_cpu(self) + del obj.intopt, obj.cd_low, obj.nao, obj.naux return obj.reset() def build(self, direct_scf_tol=1e-14, omega=None): diff --git a/gpu4pyscf/df/df_jk.py b/gpu4pyscf/df/df_jk.py index d52dfff2..c46bb999 100644 --- a/gpu4pyscf/df/df_jk.py +++ b/gpu4pyscf/df/df_jk.py @@ -96,8 +96,6 @@ def _density_fit(mf, auxbasis=None, with_df=None, only_dfj=False): with_df.verbose = mf.verbose with_df.auxbasis = auxbasis - mf_class = mf.__class__ - if isinstance(mf, df_jk._DFHF): if mf.with_df is None: mf.with_df = with_df @@ -108,139 +106,153 @@ def _density_fit(mf, auxbasis=None, with_df=None, only_dfj=False): mf.only_dfj = only_dfj return mf - class DensityFitting(df_jk._DFHF, mf_class): - __doc__ = ''' - Density fitting SCF class - Attributes for density-fitting SCF: - auxbasis : str or basis dict - Same format to the input attribute mol.basis. - The default basis 'weigend+etb' means weigend-coulomb-fit basis - for light elements and even-tempered basis for heavy elements. - with_df : DF object - Set mf.with_df = None to switch off density fitting mode. - See also the documents of class %s for other SCF attributes. 
- ''' % mf_class - - from gpu4pyscf.lib.utils import to_cpu, to_gpu, device - - def __init__(self, mf, dfobj, only_dfj): - self.__dict__.update(mf.__dict__) - self._eri = None - self.rhoj = None - self.rhok = None - self.direct_scf = False - self.with_df = dfobj - self.only_dfj = only_dfj - self._keys = self._keys.union(['with_df', 'only_dfj']) - - init_workflow = init_workflow - - def reset(self, mol=None): - self.with_df.reset(mol) - return mf_class.reset(self, mol) - - def get_jk(self, mol=None, dm=None, hermi=1, with_j=True, with_k=True, - omega=None): - if dm is None: dm = self.make_rdm1() - if self.with_df and self.only_dfj: - vj = vk = None - if with_j: - vj, vk = self.with_df.get_jk(dm, hermi, True, False, - self.direct_scf_tol, omega) - if with_k: - vk = mf_class.get_jk(self, mol, dm, hermi, False, True, omega)[1] - elif self.with_df: - vj, vk = self.with_df.get_jk(dm, hermi, with_j, with_k, - self.direct_scf_tol, omega) - else: - vj, vk = mf_class.get_jk(self, mol, dm, hermi, with_j, with_k, omega) - return vj, vk - - def get_veff(self, mol=None, dm=None, dm_last=None, vhf_last=0, hermi=1): - ''' - effective potential - ''' - if mol is None: mol = self.mol - if dm is None: dm = self.make_rdm1() - - # for DFT - if mf_class == rks.RKS: - return rks.get_veff(self, dm=dm) - - if self.direct_scf: - ddm = cupy.asarray(dm) - dm_last - vj, vk = self.get_jk(mol, ddm, hermi=hermi) - return vhf_last + vj - vk * .5 - else: - vj, vk = self.get_jk(mol, dm, hermi=hermi) - return vj - vk * .5 - - def energy_elec(self, dm=None, h1e=None, vhf=None): - ''' - electronic energy - ''' - if dm is None: dm = self.make_rdm1() - if h1e is None: h1e = self.get_hcore() - if vhf is None: vhf = self.get_veff(self.mol, dm) - # for DFT - if mf_class == rks.RKS: - e1 = cupy.sum(h1e*dm) - ecoul = self.ecoul - exc = self.exc - e2 = ecoul + exc - #logger.debug(self, f'E1 = {e1}, Ecoul = {ecoul}, Exc = {exc}') - return e1+e2, e2 - - e1 = cupy.einsum('ij,ji->', h1e, dm).real - e_coul = 
cupy.einsum('ij,ji->', vhf, dm).real * .5 - self.scf_summary['e1'] = e1 - self.scf_summary['e2'] = e_coul - #logger.debug(self, 'E1 = %s E_coul = %s', e1, e_coul) - return e1+e_coul, e_coul - - def energy_tot(self, dm, h1e, vhf=None): - ''' - compute tot energy - ''' - nuc = self.energy_nuc() - e_tot = self.energy_elec(dm, h1e, vhf)[0] + nuc - self.scf_summary['nuc'] = nuc.real - return e_tot - - def nuc_grad_method(self): - if mf_class == rks.RKS: - from gpu4pyscf.df.grad import rks as rks_grad - return rks_grad.Gradients(self) - if mf_class == hf.RHF: - from gpu4pyscf.df.grad import rhf as rhf_grad - return rhf_grad.Gradients(self) - raise NotImplementedError() - - - def Hessian(self): - from gpu4pyscf.df.hessian import rhf, rks - if isinstance(self, scf.rhf.RHF): - if isinstance(self, scf.hf.KohnShamDFT): - return rks.Hessian(self) - else: - return rhf.Hessian(self) - else: - raise NotImplementedError + dfmf = _DFHF(mf, with_df, only_dfj) + return lib.set_class(dfmf, (_DFHF, mf.__class__)) - # for pyscf 1.0, 1.1 compatibility - @property - def _cderi(self): - naux = self.with_df.get_naoaux() - return next(self.with_df.loop(blksize=naux)) - @_cderi.setter - def _cderi(self, x): - self.with_df._cderi = x +class _DFHF(df_jk._DFHF): + ''' + Density fitting SCF class + Attributes for density-fitting SCF: + auxbasis : str or basis dict + Same format to the input attribute mol.basis. + The default basis 'weigend+etb' means weigend-coulomb-fit basis + for light elements and even-tempered basis for heavy elements. + with_df : DF object + Set mf.with_df = None to switch off density fitting mode. 
+ ''' - @property - def auxbasis(self): - return getattr(self.with_df, 'auxbasis', None) + from gpu4pyscf.lib.utils import to_gpu, device + + def __init__(self, mf, dfobj, only_dfj): + self.__dict__.update(mf.__dict__) + self._eri = None + self.rhoj = None + self.rhok = None + self.direct_scf = False + self.with_df = dfobj + self.only_dfj = only_dfj + self._keys = self._keys.union(['with_df', 'only_dfj']) + + def undo_df(self): + '''Remove the DFHF Mixin''' + obj = lib.view(self, lib.drop_class(self.__class__, _DFHF)) + del obj.rhoj, obj.rhok, obj.with_df, obj.only_dfj + return obj + + def reset(self, mol=None): + self.with_df.reset(mol) + return super().reset(mol) + + init_workflow = init_workflow + + def get_jk(self, mol=None, dm=None, hermi=1, with_j=True, with_k=True, + omega=None): + if dm is None: dm = self.make_rdm1() + if self.with_df and self.only_dfj: + vj = vk = None + if with_j: + vj, vk = self.with_df.get_jk(dm, hermi, True, False, + self.direct_scf_tol, omega) + if with_k: + vk = super().get_jk(mol, dm, hermi, False, True, omega)[1] + elif self.with_df: + vj, vk = self.with_df.get_jk(dm, hermi, with_j, with_k, + self.direct_scf_tol, omega) + else: + vj, vk = super().get_jk(mol, dm, hermi, with_j, with_k, omega) + return vj, vk - return DensityFitting(mf, with_df, only_dfj) + def nuc_grad_method(self): + if isinstance(self, rks.RKS): + from gpu4pyscf.df.grad import rks as rks_grad + return rks_grad.Gradients(self) + if isinstance(self, hf.RHF): + from gpu4pyscf.df.grad import rhf as rhf_grad + return rhf_grad.Gradients(self) + raise NotImplementedError() + + def Hessian(self): + from pyscf.dft.rks import KohnShamDFT + from gpu4pyscf.df.hessian import rhf, rks + if isinstance(self, scf.rhf.RHF): + if isinstance(self, KohnShamDFT): + return rks.Hessian(self) + else: + return rhf.Hessian(self) + else: + raise NotImplementedError + + @property + def auxbasis(self): + return getattr(self.with_df, 'auxbasis', None) + + def get_veff(self, mol=None, dm=None, 
dm_last=None, vhf_last=0, hermi=1): + ''' + effective potential + ''' + if mol is None: mol = self.mol + if dm is None: dm = self.make_rdm1() + + # for DFT + if super() == rks.RKS: + return rks.get_veff(self, dm=dm) + + if self.direct_scf: + ddm = cupy.asarray(dm) - dm_last + vj, vk = self.get_jk(mol, ddm, hermi=hermi) + return vhf_last + vj - vk * .5 + else: + vj, vk = self.get_jk(mol, dm, hermi=hermi) + return vj - vk * .5 + + def energy_elec(self, dm=None, h1e=None, vhf=None): + ''' + electronic energy + ''' + if dm is None: dm = self.make_rdm1() + if h1e is None: h1e = self.get_hcore() + if vhf is None: vhf = self.get_veff(self.mol, dm) + # for DFT + if super() == rks.RKS: + e1 = cupy.sum(h1e*dm) + ecoul = self.ecoul + exc = self.exc + e2 = ecoul + exc + #logger.debug(self, f'E1 = {e1}, Ecoul = {ecoul}, Exc = {exc}') + return e1+e2, e2 + + e1 = cupy.einsum('ij,ji->', h1e, dm).real + e_coul = cupy.einsum('ij,ji->', vhf, dm).real * .5 + self.scf_summary['e1'] = e1 + self.scf_summary['e2'] = e_coul + #logger.debug(self, 'E1 = %s E_coul = %s', e1, e_coul) + return e1+e_coul, e_coul + + def energy_tot(self, dm, h1e, vhf=None): + ''' + compute tot energy + ''' + nuc = self.energy_nuc() + e_tot = self.energy_elec(dm, h1e, vhf)[0] + nuc + self.scf_summary['nuc'] = nuc.real + return e_tot + + + def to_cpu(self): + obj = self.undo_df().to_cpu().density_fit() + keys = dir(obj) + obj.__dict__.update(self.__dict__) + + for key in set(dir(self)).difference(keys): + delattr(obj, key) + + for key in keys: + val = getattr(obj, key) + if isinstance(val, cupy.ndarray): + setattr(obj, key, cupy.asnumpy(val)) + elif hasattr(val, 'to_cpu'): + setattr(obj, key, val.to_cpu()) + return obj def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e-14, omega=None): ''' @@ -387,4 +399,4 @@ def get_j(dfobj, dm, hermi=1, direct_scf_tol=1e-13): vj = int3c2e.get_j_int3c2e_pass2(intopt, rhoj) return vj -density_fit = _density_fit \ No newline at end of file +density_fit = 
_density_fit diff --git a/gpu4pyscf/df/tests/test_df_scf.py b/gpu4pyscf/df/tests/test_df_scf.py index ba959b69..1b042096 100644 --- a/gpu4pyscf/df/tests/test_df_scf.py +++ b/gpu4pyscf/df/tests/test_df_scf.py @@ -17,7 +17,9 @@ import numpy as np import pyscf from pyscf import lib +from pyscf.df import df_jk as cpu_df_jk from gpu4pyscf import scf +from gpu4pyscf.df import df_jk from gpu4pyscf.dft import rks lib.num_threads(8) @@ -37,7 +39,7 @@ def setUpModule(): mol.output = '/dev/null' mol.build() mol.verbose = 1 - + def tearDownModule(): global mol mol.stdout.close() @@ -57,7 +59,7 @@ def test_rhf(self): mf = scf.RHF(mol).density_fit(auxbasis='def2-tzvpp-jkfit') e_tot = mf.kernel() assert np.allclose(e_tot, -76.0624582299) - + def test_rks_lda(self): print('------- LDA ----------------') e_tot = run_dft("LDA_X,LDA_C_VWN") @@ -67,17 +69,17 @@ def test_rks_pbe(self): print('------- PBE ----------------') e_tot = run_dft('PBE') assert np.allclose(e_tot, -76.3800181250) - + def test_rks_b3lyp(self): print('-------- B3LYP -------------') e_tot = run_dft('B3LYP') assert np.allclose(e_tot, -76.4666493796) - + def test_rks_m06(self): print('--------- M06 --------------') e_tot = run_dft("M06") assert np.allclose(e_tot, -76.4265841359) - + def test_rks_wb97(self): print('-------- wB97 --------------') e_tot = run_dft("HYB_GGA_XC_WB97") @@ -88,6 +90,19 @@ def test_rks_wb97(self): e_tot = run_dft("HYB_MGGA_XC_WB97M_V") assert np.allclose(e_tot, -76.4334567297) + def test_to_cpu(self): + mf = scf.RHF(mol).density_fit().to_cpu() + assert isinstance(mf, cpu_df_jk._DFHF) + mf = mf.to_gpu() + assert isinstance(mf, df_jk._DFHF) + + mf = rks.RKS(mol).density_fit().to_cpu() + assert isinstance(mf, cpu_df_jk._DFHF) + assert 'gpu' not in mf.grids.__module__ + mf = mf.to_gpu() + assert isinstance(mf, df_jk._DFHF) + assert 'gpu' in mf.grids.__module__ + if __name__ == "__main__": print("Full Tests for SCF") unittest.main() diff --git a/gpu4pyscf/dft/gks.py b/gpu4pyscf/dft/gks.py index 
53992f0e..a94e31f5 100644 --- a/gpu4pyscf/dft/gks.py +++ b/gpu4pyscf/dft/gks.py @@ -23,8 +23,7 @@ class GKS(gks.GKS): from gpu4pyscf.lib.utils import to_cpu, to_gpu, device def __init__(self, mol, xc='LDA,VWN'): - super().__init__(mol, xc) - self._numint = numint.NumInt() + raise NotImplementedError get_jk = GHF.get_jk _eigh = GHF._eigh diff --git a/gpu4pyscf/dft/rks.py b/gpu4pyscf/dft/rks.py index 8c33ee95..31e67cc8 100644 --- a/gpu4pyscf/dft/rks.py +++ b/gpu4pyscf/dft/rks.py @@ -195,11 +195,38 @@ def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1): else: ecoul = None t0 = logger.timer_debug1(ks, 'jk total', *t0) - ks.ecoul = ecoul - ks.exc = exc vxc = tag_array(vxc, ecoul=ecoul, exc=exc, vj=vj, vk=vk) return vxc +def energy_elec(ks, dm=None, h1e=None, vhf=None): + r'''Electronic part of RKS energy. + + Note this function has side effects which cause mf.scf_summary updated. + + Args: + ks : an instance of DFT class + + dm : 2D ndarray + one-partical density matrix + h1e : 2D ndarray + Core hamiltonian + + Returns: + RKS electronic energy and the 2-electron contribution + ''' + if dm is None: dm = ks.make_rdm1() + if h1e is None: h1e = ks.get_hcore() + if vhf is None: vhf = ks.get_veff(ks.mol, dm) + e1 = cupy.einsum('ij,ji->', h1e, dm).real + ecoul = vhf.ecoul.real + exc = vhf.exc.real + e2 = ecoul + exc + ks.scf_summary['e1'] = e1 + ks.scf_summary['coul'] = ecoul + ks.scf_summary['exc'] = exc + logger.debug(ks, 'E1 = %s Ecoul = %s Exc = %s', e1, ecoul, exc) + return e1+e2, e2 + class RKS(scf.hf.RHF, rks.RKS): from gpu4pyscf.lib.utils import to_cpu, to_gpu, device @@ -241,27 +268,11 @@ def reset(self, mol=None): self._numint.gdftopt = None return self - def energy_elec(self, dm=None, h1e=None, vhf=None): - if dm is None: dm = self.make_rdm1() - if h1e is None: h1e = self.get_hcore() - if vhf is None: vhf = self.get_veff(self.mol, dm) - - e1 = cupy.sum(h1e*dm) - ecoul = self.ecoul - exc = self.exc - e2 = ecoul + exc - return e1+e2, e2 - - def 
energy_tot(self, dm, h1e, vhf=None): - nuc = self.energy_nuc() - e_tot = self.energy_elec(dm, h1e, vhf)[0] + nuc - self.scf_summary['nuc'] = nuc.real - return e_tot - def nuc_grad_method(self): from gpu4pyscf.grad import rks as rks_grad return rks_grad.Gradients(self) - + + energy_elec = energy_elec get_jk = RHF.get_jk get_veff = get_veff _eigh = RHF._eigh diff --git a/gpu4pyscf/lib/utils.py b/gpu4pyscf/lib/utils.py index 4489fab3..a5cf6187 100644 --- a/gpu4pyscf/lib/utils.py +++ b/gpu4pyscf/lib/utils.py @@ -38,10 +38,10 @@ def to_cpu(method): break method = method.view(pyscf_cls) - keys = set() + keys = [] for cls in pyscf_cls.__mro__[:-1]: if hasattr(cls, '_keys'): - keys.update(cls._keys) + keys.extend(cls._keys) if keys: keys = set(keys).intersection(method.__dict__) diff --git a/gpu4pyscf/scf/hf.py b/gpu4pyscf/scf/hf.py index a3fb3417..1d58a02e 100644 --- a/gpu4pyscf/scf/hf.py +++ b/gpu4pyscf/scf/hf.py @@ -301,7 +301,7 @@ def get_occ(mf, mo_energy=None, mo_coeff=None): return mo_occ def get_veff(mf, mol=None, dm=None, dm_last=None, vhf_last=None, hermi=1, vhfopt=None): - if dm_last is None: + if dm_last is None or not mf.direct_scf: vj, vk = mf.get_jk(mol, cupy.asarray(dm), hermi) return vj - vk * .5 else: @@ -351,6 +351,20 @@ def get_fock(mf, h1e=None, s1e=None, vhf=None, dm=None, cycle=-1, diis=None, f = level_shift(s1e, dm*.5, f, level_shift_factor) return f +def energy_elec(self, dm=None, h1e=None, vhf=None): + ''' + electronic energy + ''' + if dm is None: dm = self.make_rdm1() + if h1e is None: h1e = self.get_hcore() + if vhf is None: vhf = self.get_veff(self.mol, dm) + e1 = cupy.einsum('ij,ji->', h1e, dm).real + e_coul = cupy.einsum('ij,ji->', vhf, dm).real * .5 + self.scf_summary['e1'] = e1 + self.scf_summary['e2'] = e_coul + logger.debug(self, 'E1 = %s E_coul = %s', e1, e_coul) + return e1+e_coul, e_coul + def _kernel(mf, conv_tol=1e-10, conv_tol_grad=None, dump_chk=True, dm0=None, callback=None, conv_check=True, **kwargs): conv_tol = mf.conv_tol @@ 
-371,7 +385,7 @@ def _kernel(mf, conv_tol=1e-10, conv_tol_grad=None, mo_occ = cupy.asarray(dm0.mo_occ) occ_coeff = cupy.asarray(mo_coeff[:,mo_occ>0]) dm = tag_array(dm, occ_coeff=occ_coeff, mo_occ=mo_occ, mo_coeff=mo_coeff) - + # use optimized workflow if possible if hasattr(mf, 'init_workflow'): mf.init_workflow(dm0=dm) @@ -552,6 +566,7 @@ class RHF(hf.RHF): #_eigh = staticmethod(_eigh) _eigh = _eigh make_rdm1 = make_rdm1 + energy_elec = energy_elec get_fock = get_fock get_occ = get_occ get_veff = get_veff @@ -595,7 +610,7 @@ def reset(self, mol=None): def nuc_grad_method(self): from gpu4pyscf.grad import rhf return rhf.Gradients(self) - + def density_fit(self, auxbasis=None, with_df=None, only_dfj=False): import gpu4pyscf.df.df_jk return gpu4pyscf.df.df_jk.density_fit(self, auxbasis, with_df, only_dfj) diff --git a/gpu4pyscf/scf/tests/test_scf.py b/gpu4pyscf/scf/tests/test_scf.py index c5b32ace..3dd94806 100644 --- a/gpu4pyscf/scf/tests/test_scf.py +++ b/gpu4pyscf/scf/tests/test_scf.py @@ -18,6 +18,8 @@ import cupy import pyscf from pyscf import lib +from pyscf import scf as cpu_scf +from pyscf import dft as cpu_dft from gpu4pyscf import scf from gpu4pyscf.dft import rks @@ -47,6 +49,19 @@ def test_rhf(self): e_tot = mf.kernel() assert np.allclose(e_tot, -76.0667232412) + def test_to_cpu(self): + mf = scf.RHF(mol).to_cpu() + assert isinstance(mf, cpu_scf.RHF) + mf = mf.to_gpu() + assert isinstance(mf, scf.RHF) + + mf = rks.RKS(mol).to_cpu() + assert isinstance(mf, cpu_dft.rks.RKS) + assert 'gpu' not in mf.grids.__module__ + mf = mf.to_gpu() + assert isinstance(mf, rks.RKS) + assert 'gpu' in mf.grids.__module__ + if __name__ == "__main__": print("Full Tests for SCF") - unittest.main() \ No newline at end of file + unittest.main() From 5f880a250ad3e67c40df6b0fd5f29038396f63ea Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Tue, 24 Oct 2023 15:54:19 -0700 Subject: [PATCH 12/19] Optimize hessian intermediate variables (#51) * numpy -> cupy for solvent * for linter * 
remove grad switch from pcm.py * passed flake8 * solvent integrals on GPU * flake8 * compatiable with pyscf-2.4.0 * added solvent * fixed issues for to_cpu * store intermeidate variable on CPU * cupy.einsum -> contract --- benchmarks/df/dft_driver.py | 13 +- benchmarks/df/run_gpu4pyscf.sh | 2 +- examples/00-h2o.py | 4 +- examples/dft_driver.py | 20 +-- examples/sp.in | 20 --- gpu4pyscf/df/df.py | 8 +- gpu4pyscf/df/df_jk.py | 38 ++---- gpu4pyscf/df/hessian/rhf.py | 146 ++++++++++++--------- gpu4pyscf/df/int3c2e.py | 168 ++++++++++++++++--------- gpu4pyscf/df/tests/test_df_scf.py | 16 ++- gpu4pyscf/hessian/rhf.py | 3 +- gpu4pyscf/lib/cupy_helper.py | 3 +- gpu4pyscf/lib/gdft/nr_eval_gto.cu | 38 +++--- gpu4pyscf/lib/gdft/nr_numint_sparse.cu | 6 +- gpu4pyscf/lib/gdft/vv10.cu | 26 ++-- gpu4pyscf/lib/utils.py | 1 - gpu4pyscf/scf/tests/test_scf.py | 23 ++-- gpu4pyscf/solvent/grad/pcm.py | 1 - gpu4pyscf/solvent/pcm.py | 2 +- 19 files changed, 290 insertions(+), 248 deletions(-) delete mode 100644 examples/sp.in diff --git a/benchmarks/df/dft_driver.py b/benchmarks/df/dft_driver.py index b2682fec..ea979df8 100644 --- a/benchmarks/df/dft_driver.py +++ b/benchmarks/df/dft_driver.py @@ -16,6 +16,8 @@ parser.add_argument('--input_path', type=str, default='./') parser.add_argument('--output_path', type=str, default='./') parser.add_argument('--with_hessian', type=bool, default=False) +parser.add_argument('--solvent', type=str, default='') + args = parser.parse_args() bas = args.basis verbose = args.verbose @@ -39,13 +41,18 @@ output_file = 'PySCF-16-cores-CPU.csv' output_file = args.output_path + output_file -def run_dft(filename): +def run_dft(filename): mol = pyscf.M(atom=filename, basis=bas, max_memory=64000) - start_time = time.time() + start_time = time.time() # set verbose >= 6 for debugging timer mol.verbose = 4 #verbose mol.max_memory = 40000 mf = rks.RKS(mol, xc=xc).density_fit(auxbasis='def2-universal-jkfit') + if args.solvent: + mf = mf.PCM() + mf.lebedev_order = 29 + 
mf.method = 'IEF-PCM' + mf.grids.atom_grid = (99,590) mf.chkfile = None prep_time = time.time() - start_time @@ -75,7 +82,7 @@ def run_dft(filename): # calculate hessian if args.device == 'GPU': cupy.get_default_memory_pool().free_all_blocks() - + hess_time = -1 if args.with_hessian: try: diff --git a/benchmarks/df/run_gpu4pyscf.sh b/benchmarks/df/run_gpu4pyscf.sh index c3c5dc6b..c50cfceb 100644 --- a/benchmarks/df/run_gpu4pyscf.sh +++ b/benchmarks/df/run_gpu4pyscf.sh @@ -3,7 +3,7 @@ DIR="./organic/xc" [ ! -d "$DIR" ] && mkdir -p "$DIR" for xc in LDA PBE B3LYP M06 wB97m-v -do +do python3 dft_driver.py --input_path ../molecules/organic/ --output_path ./organic/xc/$xc/ --xc $xc done exit diff --git a/examples/00-h2o.py b/examples/00-h2o.py index 622a8194..7f17e62d 100644 --- a/examples/00-h2o.py +++ b/examples/00-h2o.py @@ -18,7 +18,7 @@ from gpu4pyscf.dft import rks lib.num_threads(8) -atom =''' +atom =''' O 0.0000000000 -0.0000000000 0.1174000000 H -0.7570000000 -0.0000000000 -0.4696000000 H 0.7570000000 0.0000000000 -0.4696000000 @@ -34,7 +34,7 @@ mol = pyscf.M(atom=atom, basis=bas, max_memory=32000) -mol.verbose = 1 +mol.verbose = 4 mf_GPU = rks.RKS(mol, xc=xc).density_fit(auxbasis=auxbasis) mf_GPU.grids.level = grids_level mf_GPU.conv_tol = scf_tol diff --git a/examples/dft_driver.py b/examples/dft_driver.py index 65ca2ad5..3b68d665 100644 --- a/examples/dft_driver.py +++ b/examples/dft_driver.py @@ -15,17 +15,16 @@ import pyscf import time +import argparse from pyscf import lib - from gpu4pyscf.dft import rks lib.num_threads(8) -import argparse - parser = argparse.ArgumentParser(description='Run DFT with GPU4PySCF for molecules') parser.add_argument("--input", type=str, default='benzene/coord') parser.add_argument("--basis", type=str, default='def2-tzvpp') parser.add_argument("--auxbasis", type=str, default='def2-tzvpp-jkfit') +parser.add_argument("--xc", type=str, default='B3LYP') parser.add_argument("--solvent", type=bool, default=False) args = 
parser.parse_args() @@ -36,23 +35,28 @@ basis=bas, max_memory=32000) # set verbose >= 6 for debugging timer -mol.verbose = 4 +mol.verbose = 6 -mf_df = rks.RKS(mol, xc='HYB_GGA_XC_B3LYP').density_fit(auxbasis=args.auxbasis) +mf_df = rks.RKS(mol, xc=args.xc).density_fit(auxbasis=args.auxbasis) if args.solvent: mf_df = mf_df.PCM() + mf_df.lebedev_order = 29 + mf_df.method = 'IEF-PCM' mf_df.grids.atom_grid = (99,590) mf_df.kernel() +scf_time = time.time() - start_time +print(f'compute time for energy: {scf_time:.3f} s') -print('compute time for energy: {}s'.format((time.time() - start_time))) start_time = time.time() g = mf_df.nuc_grad_method() g.auxbasis_response = True f = g.kernel() -print('compute time for gradient: {}s'.format((time.time() - start_time))) +grad_time = time.time() - start_time +print(f'compute time for gradient: {grad_time:.3f} s') start_time = time.time() h = mf_df.Hessian() h.auxbasis_response = 2 h_dft = h.kernel() -print('compute time for hessian: {}s'.format((time.time() - start_time))) +hess_time = time.time() - start_time +print(f'compute time for hessian: {hess_time:.3f} s') diff --git a/examples/sp.in b/examples/sp.in deleted file mode 100644 index aba5a01a..00000000 --- a/examples/sp.in +++ /dev/null @@ -1,20 +0,0 @@ -$molecule -0 1 -O 0.0000000000 -0.0000000000 0.1174000000 -H -0.7570000000 -0.0000000000 -0.4696000000 -H 0.7570000000 0.0000000000 -0.4696000000 -$end - -$rem -JOBTYPE sp -METHOD B3LYP -DFT_D D3_BJ -BASIS def2-tzvpp -SCF_CONVERGENCE 10 -THRESH 14 -RI_J TRUE -RI_K TRUE -AUX_BASIS RIJK-def2-tzvpp -PURECART 1111 -$end - diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py index cc199ca2..28998230 100644 --- a/gpu4pyscf/df/df.py +++ b/gpu4pyscf/df/df.py @@ -132,7 +132,7 @@ def get_blksize(self, extra=0, nao=None): raise RuntimeError("Not enough GPU memory") return blksize - + def loop(self, blksize=None, unpack=True): ''' loop over all cderi and unpack @@ -208,12 +208,10 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, 
omega=None, sr_only=False): else: use_gpu_memory = False if(not use_gpu_memory): - import warnings - warnings.warn("Not enough GPU memory") + log.debug("Not enough GPU memory") # TODO: async allocate memory mem = cupy.cuda.alloc_pinned_memory(naux * npair * 8) cderi = np.ndarray([naux, npair], dtype=np.float64, order='C', buffer=mem) - data_stream = cupy.cuda.stream.Stream(non_blocking=False) count = 0 nq = len(intopt.log_qs) @@ -260,7 +258,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False): if cpi == cpj: ints_slices = ints_slices + ints_slices.transpose([0,2,1]) ints_slices = ints_slices[:,col,row] - + if cd_low.tag == 'eig': cderi_block = cupy.dot(cd_low.T, ints_slices) ints_slices = None diff --git a/gpu4pyscf/df/df_jk.py b/gpu4pyscf/df/df_jk.py index c46bb999..7f6c3933 100644 --- a/gpu4pyscf/df/df_jk.py +++ b/gpu4pyscf/df/df_jk.py @@ -121,7 +121,7 @@ class _DFHF(df_jk._DFHF): Set mf.with_df = None to switch off density fitting mode. ''' - from gpu4pyscf.lib.utils import to_gpu, device + from gpu4pyscf.lib.utils import to_cpu, to_gpu, device def __init__(self, mf, dfobj, only_dfj): self.__dict__.update(mf.__dict__) @@ -131,7 +131,7 @@ def __init__(self, mf, dfobj, only_dfj): self.direct_scf = False self.with_df = dfobj self.only_dfj = only_dfj - self._keys = self._keys.union(['with_df', 'only_dfj']) + self._keys = mf._keys.union(['with_df', 'only_dfj']) def undo_df(self): '''Remove the DFHF Mixin''' @@ -185,7 +185,7 @@ def Hessian(self): @property def auxbasis(self): return getattr(self.with_df, 'auxbasis', None) - + def get_veff(self, mol=None, dm=None, dm_last=None, vhf_last=0, hermi=1): ''' effective potential @@ -194,7 +194,7 @@ def get_veff(self, mol=None, dm=None, dm_last=None, vhf_last=0, hermi=1): if dm is None: dm = self.make_rdm1() # for DFT - if super() == rks.RKS: + if isinstance(self, scf.hf.KohnShamDFT): return rks.get_veff(self, dm=dm) if self.direct_scf: @@ -205,29 +205,6 @@ def get_veff(self, mol=None, dm=None, 
dm_last=None, vhf_last=0, hermi=1): vj, vk = self.get_jk(mol, dm, hermi=hermi) return vj - vk * .5 - def energy_elec(self, dm=None, h1e=None, vhf=None): - ''' - electronic energy - ''' - if dm is None: dm = self.make_rdm1() - if h1e is None: h1e = self.get_hcore() - if vhf is None: vhf = self.get_veff(self.mol, dm) - # for DFT - if super() == rks.RKS: - e1 = cupy.sum(h1e*dm) - ecoul = self.ecoul - exc = self.exc - e2 = ecoul + exc - #logger.debug(self, f'E1 = {e1}, Ecoul = {ecoul}, Exc = {exc}') - return e1+e2, e2 - - e1 = cupy.einsum('ij,ji->', h1e, dm).real - e_coul = cupy.einsum('ij,ji->', vhf, dm).real * .5 - self.scf_summary['e1'] = e1 - self.scf_summary['e2'] = e_coul - #logger.debug(self, 'E1 = %s E_coul = %s', e1, e_coul) - return e1+e_coul, e_coul - def energy_tot(self, dm, h1e, vhf=None): ''' compute tot energy @@ -237,13 +214,13 @@ def energy_tot(self, dm, h1e, vhf=None): self.scf_summary['nuc'] = nuc.real return e_tot - + ''' def to_cpu(self): obj = self.undo_df().to_cpu().density_fit() keys = dir(obj) obj.__dict__.update(self.__dict__) - for key in set(dir(self)).difference(keys): + print(key) delattr(obj, key) for key in keys: @@ -253,6 +230,7 @@ def to_cpu(self): elif hasattr(val, 'to_cpu'): setattr(obj, key, val.to_cpu()) return obj + ''' def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e-14, omega=None): ''' @@ -274,7 +252,7 @@ def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e- nset = dms.shape[0] t0 = (logger.process_clock(), logger.perf_counter()) if dfobj._cderi is None: - log.warn('CDERI not found, build...') + log.debug('CDERI not found, build...') dfobj.build(direct_scf_tol=direct_scf_tol, omega=omega) assert nao == dfobj.nao diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py index 335f01ca..dfcccddb 100644 --- a/gpu4pyscf/df/hessian/rhf.py +++ b/gpu4pyscf/df/hessian/rhf.py @@ -38,7 +38,7 @@ import numpy as np from pyscf import lib, df from gpu4pyscf.hessian import 
rhf as rhf_hess -from gpu4pyscf.lib.cupy_helper import contract, tag_array, release_gpu_stack +from gpu4pyscf.lib.cupy_helper import contract, tag_array, release_gpu_stack, print_mem_info from gpu4pyscf.df import int3c2e from gpu4pyscf.lib import logger @@ -94,7 +94,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, # ================================ sorted AO begin =============================================== intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e') - intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size_aux=128, group_size=128) + intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size=64, group_size_aux=32) sph_ao_idx = intopt.sph_ao_idx sph_aux_idx = intopt.sph_aux_idx @@ -117,11 +117,10 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, hk_ao_aux = cupy.zeros([nao,naux,3,3]) # int3c contributions - wj, wk_Pl_ = int3c2e.get_int3c2e_wjk(mol, auxmol, dm0_tag, omega=omega) + wj, _, wk_P__ = int3c2e.get_int3c2e_wjk(mol, auxmol, dm0_tag, omega=omega) rhoj0_P = contract('pq,q->p', int2c_inv, wj) - wk_P__ = contract('Lio,ir->Lro', wk_Pl_, mocc_2) rhok0_P__ = contract('pq,qij->pij', int2c_inv, wk_P__) - wj = wk_P__ = wk_Pl_ = None + wj = wk_P__ = None t1 = log.timer_debug1('intermediate variables with int3c2e', *t1) # int3c_ip2 contributions @@ -143,34 +142,46 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, wj1_01 = None if with_k: - for p0, p1 in lib.prange(0,naux,64): - rhok1_Pko = contract('pq,iqox->pxio', int2c_inv[p0:p1], wk1_Pko) - # (10|0)(0|10) without response of RI basis - vk2_ip1_ip1 = cupy.einsum('ipox,pyko->kixy', wk1_Pko[:,p0:p1], rhok1_Pko) - hk_ao_ao += cupy.einsum('kixy,ki->ikxy', vk2_ip1_ip1, dm0) - vk2_ip1_ip1 = None - # (10|0)(0|01) without response of RI basis - bra = cupy.einsum('pyko,io->ikpy', rhok1_Pko, mocc_2) - ket = cupy.einsum('ipox,ko->ipkx', wk1_Pko[:,p0:p1], mocc_2) - hk_ao_ao += 
cupy.einsum('ikpy,ipkx->ikxy', bra, ket) - bra = ket = None + if hessobj.auxbasis_response: + wk1_P__ = contract('ypq,qor->ypor', int2c_ip1, rhok0_P__) + int2c_ip1_inv = cupy.asarray(int2c_ip1_inv) + + for i0, i1 in lib.prange(0,nao,64): + wk1_Pko_islice = cupy.asarray(wk1_Pko[i0:i1]) + rhok1_Pko = contract('pq,iqox->ipox', int2c_inv, wk1_Pko_islice) + for k0, k1 in lib.prange(0,nao,64): + wk1_Pko_kslice = cupy.asarray(wk1_Pko[k0:k1]) + + # (10|0)(0|10) without response of RI basis + vk2_ip1_ip1 = contract('ipox,kpoy->ikxy', rhok1_Pko, wk1_Pko_kslice) + hk_ao_ao[i0:i1,k0:k1] += contract('ikxy,ik->ikxy', vk2_ip1_ip1, dm0[i0:i1,k0:k1]) + vk2_ip1_ip1 = None + + # (10|0)(0|01) without response of RI basis + bra = contract('ipox,ko->ipkx', rhok1_Pko, mocc_2[k0:k1]) + ket = contract('kpoy,io->kpiy', wk1_Pko_kslice, mocc_2[i0:i1]) + hk_ao_ao[i0:i1,k0:k1] += contract('ipkx,kpiy->ikxy', bra, ket) + bra = ket = None + wk1_Pko_kslice = None if hessobj.auxbasis_response: # (10|0)(1|00) - wk_ip2_Ipo = cupy.einsum('porx,io->ipxr', wk_ip2_P__[p0:p1], mocc_2) - hk_ao_aux[:,p0:p1] += cupy.einsum('pxio,ipyo->ipxy', rhok1_Pko, wk_ip2_Ipo) + wk_ip2_Ipo = contract('porx,io->iprx', wk_ip2_P__, mocc_2[i0:i1]) + hk_ao_aux[i0:i1] += contract('ipox,ipoy->ipxy', rhok1_Pko, wk_ip2_Ipo) wk_ip2_Ipo = None + # (10|0)(1|0)(0|00) - wk1_P__ = cupy.einsum('ypq,qor->ypor', int2c_ip1[:,p0:p1], rhok0_P__) - wk1_P_I = cupy.einsum('ypor,ir->ypoi', wk1_P__, mocc_2) - hk_ao_aux[:,p0:p1] -= cupy.einsum('pxio,ypoi->ipxy', rhok1_Pko, wk1_P_I) - wk1_P_I = wk1_P__ = None + wk1_P_I = contract('ypor,ir->ipoy', wk1_P__, mocc_2[i0:i1]) + hk_ao_aux[i0:i1] -= contract("ipox,ipoy->ipxy", rhok1_Pko, wk1_P_I) + wk1_P_I = rhok1_Pko = None + # (10|0)(0|1)(0|00) - int2c_tmp = cupy.asarray(int2c_ip1_inv[:,p0:p1], order='C') - wk1_I = contract('yqp,ipox->qxyio', int2c_tmp, wk1_Pko) - rhok0_tmp = cupy.einsum('qor,ir->qoi', rhok0_P__[p0:p1], mocc_2) - hk_ao_aux[:,p0:p1] -= cupy.einsum('qoi,qxyio->iqxy', rhok0_tmp, wk1_I) + 
wk1_I = contract('yqp,ipox->iqoxy', int2c_ip1_inv, wk1_Pko_islice) + rhok0_tmp = contract('qor,ir->iqo', rhok0_P__, mocc_2[i0:i1]) + hk_ao_aux[i0:i1] -= contract('iqo,iqoxy->iqxy', rhok0_tmp, wk1_I) wk1_I = rhok0_tmp = None - wk1_Pko = rhok1_Pko = int2c_tmp = None + wk1_Pko_islice = None + wk1_P__ = None + wk1_Pko = None t1 = log.timer_debug1('intermediate variables with int3c2e_ip1', *t1) cupy.get_default_memory_pool().free_all_blocks() @@ -184,6 +195,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, hj, hk = int3c2e.get_int3c2e_ipvip1_hjk(intopt, rhoj0_P, rhok0_P__, dm0_tag, omega=omega) hj_ao_ao += 2.0*hj hk_ao_ao += hk + hj = hk = None t1 = log.timer_debug1('intermediate variables with int3c2e_ipvip1', *t1) # int3c_ip1ip2 contributions @@ -192,6 +204,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, hj, hk = int3c2e.get_int3c2e_ip1ip2_hjk(intopt, rhoj0_P, rhok0_P__, dm0_tag, omega=omega) hj_ao_aux += hj hk_ao_aux += hk + hj = hk = None t1 = log.timer_debug1('intermediate variables with int3c2e_ip1ip2', *t1) # int3c_ipip2 contributions @@ -200,11 +213,12 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, hj, hk = int3c2e.get_int3c2e_ipip2_hjk(intopt, rhoj0_P, rhok0_P__, dm0_tag, omega=omega) hj_aux_diag = hj hk_aux_diag = .5*hk + hj = hk = None t1 = log.timer_debug1('intermediate variables with int3c2e_ipip2', *t1) # int2c contributions if hessobj.auxbasis_response > 1: - aux_aux_9 = cupy.ix_(np.arange(9), sph_aux_idx, sph_aux_idx) + aux_aux_9 = np.ix_(np.arange(9), sph_aux_idx, sph_aux_idx) if omega and omega > 1e-10: with auxmol.with_range_coulomb(omega): int2c_ipip1 = auxmol.intor('int2c2e_ipip1', aosym='s1') @@ -212,12 +226,12 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, int2c_ipip1 = auxmol.intor('int2c2e_ipip1', aosym='s1') int2c_ipip1 = cupy.asarray(int2c_ipip1) int2c_ipip1 = int2c_ipip1[aux_aux_9] - rhoj2c_P = cupy.einsum('xpq,q->xp', 
int2c_ipip1, rhoj0_P) + rhoj2c_P = contract('xpq,q->xp', int2c_ipip1, rhoj0_P) # (00|0)(2|0)(0|00) hj_aux_diag -= cupy.einsum('p,xp->px', rhoj0_P, rhoj2c_P).reshape(-1,3,3) if with_k: - rho2c_0 = cupy.einsum('pij,qji->pq', rhok0_P__, rhok0_P__) - hk_aux_diag -= .5 * cupy.einsum('pq,xpq->px', rho2c_0, int2c_ipip1).reshape(-1,3,3) + rho2c_0 = contract('pij,qji->pq', rhok0_P__, rhok0_P__) + hk_aux_diag -= .5 * contract('pq,xpq->px', rho2c_0, int2c_ipip1).reshape(-1,3,3) int2c_ipip1 = None if omega and omega > 1e-10: @@ -229,39 +243,41 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, int2c_ip1ip2 = int2c_ip1ip2[aux_aux_9] hj_aux_aux = -.5 * cupy.einsum('p,xpq,q->pqx', rhoj0_P, int2c_ip1ip2, rhoj0_P).reshape(naux, naux,3,3) if with_k: - hk_aux_aux = -.5 * cupy.einsum('xpq,pq->pqx', int2c_ip1ip2, rho2c_0).reshape(naux,naux,3,3) + hk_aux_aux = -.5 * contract('xpq,pq->pqx', int2c_ip1ip2, rho2c_0).reshape(naux,naux,3,3) t1 = log.timer_debug1('intermediate variables with int2c_*', *t1) int2c_ip1ip2 = aux_aux_9 = None + cupy.get_default_memory_pool().free_all_blocks() + release_gpu_stack() # aux-aux pair if hessobj.auxbasis_response > 1: - wj0_10 = cupy.einsum('ypq,p->ypq', int2c_ip1, rhoj0_P) - rhoj1 = cupy.einsum('px,pq->xpq', wj_ip2, int2c_inv) # (0|0)(1|00) - rhoj0_01 = cupy.einsum('xp,pq->xpq', wj0_01, int2c_inv) # (0|1)(0|00) - rhoj0_10 = cupy.einsum('p,xpq->xpq', rhoj0_P, int2c_ip1_inv) # (1|0)(0|00) - - hj_aux_aux += .5 * cupy.einsum('xpr,yqr->pqxy', rhoj0_10, wj0_10) # (00|0)(1|0), (0|1)(0|00) - hj_aux_aux -= cupy.einsum('xpq,yq->pqxy', rhoj1, wj0_01) # (00|1), (1|0)(0|00) - hj_aux_aux += .5 * cupy.einsum('xpq,qy->pqxy', rhoj1, wj_ip2) # (00|1), (1|00) - hj_aux_aux -= cupy.einsum('xpr,yqr->pqxy', rhoj1, wj0_10) # (00|1), (0|1)(0|00) - hj_aux_aux += .5 * cupy.einsum('xpq,yq->pqxy', rhoj0_01, wj0_01) # (00|0)(0|1), (1|0)(0|00) - hj_aux_aux += cupy.einsum('xpq,yq->pqxy', rhoj0_10, wj0_01) # (00|0)(1|0), (1|0)(0|00) + wj0_10 = 
contract('ypq,p->ypq', int2c_ip1, rhoj0_P) + rhoj1 = contract('px,pq->xpq', wj_ip2, int2c_inv) # (0|0)(1|00) + rhoj0_01 = contract('xp,pq->xpq', wj0_01, int2c_inv) # (0|1)(0|00) + rhoj0_10 = contract('p,xpq->xpq', rhoj0_P, int2c_ip1_inv) # (1|0)(0|00) + + hj_aux_aux += .5 * contract('xpr,yqr->pqxy', rhoj0_10, wj0_10) # (00|0)(1|0), (0|1)(0|00) + hj_aux_aux -= contract('xpq,yq->pqxy', rhoj1, wj0_01) # (00|1), (1|0)(0|00) + hj_aux_aux += .5 * contract('xpq,qy->pqxy', rhoj1, wj_ip2) # (00|1), (1|00) + hj_aux_aux -= contract('xpr,yqr->pqxy', rhoj1, wj0_10) # (00|1), (0|1)(0|00) + hj_aux_aux += .5 * contract('xpq,yq->pqxy', rhoj0_01, wj0_01) # (00|0)(0|1), (1|0)(0|00) + hj_aux_aux += contract('xpq,yq->pqxy', rhoj0_10, wj0_01) # (00|0)(1|0), (1|0)(0|00) wj0_01 = wj0_10 = rhoj1 = rhoj0_01 = rhoj0_10 = rhoj0_P = wj_ip2 = None if with_k: - rho2c_10 = cupy.einsum('rijx,qij->rqx', wk_ip2_P__, rhok0_P__) - rho2c_11 = cupy.einsum('pijx,qijy->pqxy', wk_ip2_P__, wk_ip2_P__) - rho2c0_10 = cupy.einsum('xpq,qr->xpr', int2c_ip1, rho2c_0) # (00|0)(0|1)_(0|00) - rho2c1_10 = cupy.einsum('xpr,qry->pqxy', int2c_ip1, rho2c_10) # (00|1)_(1|0)(0|00) - rho2c0_11 = cupy.einsum('xpr,yqr->pqxy', rho2c0_10, int2c_ip1) # (00|0)(0|1)_(1|0)(0|00) - int2c_ip_ip = cupy.einsum('xpr,ysr->xyps', int2c_ip1_inv, int2c_ip1) # (0|1)(0|0)(1|0) - - hk_aux_aux += .5 * cupy.einsum('xypq,pq->pqxy', int2c_ip_ip, rho2c_0) # (00|0)(1|0)(0|1)(0|00) - hk_aux_aux += .5 * cupy.einsum('pqxy,pq->pqxy', rho2c0_11, int2c_inv) # (00|0)(0|1)(1|0)(0|00) - hk_aux_aux += cupy.einsum('xpq,yqp->pqxy', int2c_ip1_inv, rho2c0_10) # (00|0)(1|0)(1|0)(0|00) - hk_aux_aux -= cupy.einsum('pqxy,pq->pqxy', rho2c1_10, int2c_inv) # (00|1)(1|0)(0|00) - hk_aux_aux -= cupy.einsum('pqx,yqp->pqxy', rho2c_10, int2c_ip1_inv) # (00|1)(0|1)(0|00) - hk_aux_aux += .5 * cupy.einsum('pqxy,pq->pqxy', rho2c_11, int2c_inv) # (00|1)(1|00) + rho2c_10 = contract('rijx,qij->rqx', wk_ip2_P__, rhok0_P__) + rho2c_11 = contract('pijx,qijy->pqxy', wk_ip2_P__, 
wk_ip2_P__) + rho2c0_10 = contract('xpq,qr->xpr', int2c_ip1, rho2c_0) # (00|0)(0|1)_(0|00) + rho2c1_10 = contract('xpr,qry->pqxy', int2c_ip1, rho2c_10) # (00|1)_(1|0)(0|00) + rho2c0_11 = contract('xpr,yqr->pqxy', rho2c0_10, int2c_ip1) # (00|0)(0|1)_(1|0)(0|00) + int2c_ip_ip = contract('xpr,ysr->xyps', int2c_ip1_inv, int2c_ip1) # (0|1)(0|0)(1|0) + + hk_aux_aux += .5 * contract('xypq,pq->pqxy', int2c_ip_ip, rho2c_0) # (00|0)(1|0)(0|1)(0|00) + hk_aux_aux += .5 * contract('pqxy,pq->pqxy', rho2c0_11, int2c_inv) # (00|0)(0|1)(1|0)(0|00) + hk_aux_aux += contract('xpq,yqp->pqxy', int2c_ip1_inv, rho2c0_10) # (00|0)(1|0)(1|0)(0|00) + hk_aux_aux -= contract('pqxy,pq->pqxy', rho2c1_10, int2c_inv) # (00|1)(1|0)(0|00) + hk_aux_aux -= contract('pqx,yqp->pqxy', rho2c_10, int2c_ip1_inv) # (00|1)(0|1)(0|00) + hk_aux_aux += .5 * contract('pqxy,pq->pqxy', rho2c_11, int2c_inv) # (00|1)(1|00) rho2c_0 = rho2c_10 = rho2c_11 = rho2c0_10 = rho2c1_10 = rho2c0_11 = int2c_ip_ip = None wk_ip2_P__ = int2c_ip1_inv = None ao_idx = np.argsort(intopt.sph_ao_idx) @@ -413,12 +429,17 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, int2c = int2c[cupy.ix_(sph_aux_idx, sph_aux_idx)] int2c_inv = cupy.linalg.pinv(int2c, rcond=1e-12) - wj, wk_Pl_ = int3c2e.get_int3c2e_wjk(mol, auxmol, dm0_tag, omega=omega) - wk_P__ = contract('pio,ir->pro', wk_Pl_, mocc) + wj, wk_Pl_, wk_P__ = int3c2e.get_int3c2e_wjk(mol, auxmol, dm0_tag, omega=omega) rhoj0 = contract('pq,q->p', int2c_inv, wj) - rhok0_Pl_ = contract('pq,qio->pio', int2c_inv, wk_Pl_) if with_k: rhok0_P__ = contract('pq,qij->pij', int2c_inv, wk_P__) + if isinstance(wk_Pl_, cupy.ndarray): + rhok0_Pl_ = contract('pq,qio->pio', int2c_inv, wk_Pl_) + else: + rhok0_Pl_ = np.empty_like(wk_Pl_) + for p0, p1 in lib.prange(0,nao,64): + wk_tmp = cupy.asarray(wk_Pl_[:,p0:p1]) + rhok0_Pl_[:,p0:p1] = cupy.einsum('pq,qio->pio', int2c_inv, wk_tmp).get() wj = wk_Pl_ = wk_P__ = int2c_inv = int2c = None # int3c_ip1 contributions @@ -451,14 +472,15 @@ def 
_gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, wk0_10_P__ = contract('xqp,pro->xqro', int2c_ip1, rhok0_P__) for p0, p1 in lib.prange(0,nao,64): - vj1_tmp = cupy.einsum('pio,xp->xpio', rhok0_Pl_[:,p0:p1], wj0_10) + rhok_tmp = cupy.asarray(rhok0_Pl_[:,p0:p1]) + vj1_tmp = cupy.einsum('pio,xp->xpio', rhok_tmp, wj0_10) - wk0_10_Pl_ = cupy.einsum('xqp,pio->xqio', int2c_ip1, rhok0_Pl_[:,p0:p1]) + wk0_10_Pl_ = cupy.einsum('xqp,pio->xqio', int2c_ip1, rhok_tmp) vj1_tmp += cupy.einsum('xpio,p->xpio', wk0_10_Pl_, rhoj0) vj1_int3c_ip2[:,:,p0:p1] += cupy.einsum('xpio,pa->axio', vj1_tmp, aux2atom) if with_k: vk1_tmp = 2.0 * cupy.einsum('xpio,pro->xpir', wk0_10_Pl_, rhok0_P__) - vk1_tmp += 2.0 * cupy.einsum('xpro,pir->xpio', wk0_10_P__, rhok0_Pl_[:,p0:p1]) + vk1_tmp += 2.0 * cupy.einsum('xpro,pir->xpio', wk0_10_P__, rhok_tmp) vk1_int3c_ip2[:,:,p0:p1] += cupy.einsum('xpio,pa->axio', vk1_tmp, aux2atom) wj0_10 = wk0_10_P__ = rhok0_P__ = int2c_ip1 = None vj1_tmp = vk1_tmp = wk0_10_Pl_ = rhoj0 = rhok0_Pl_ = None diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py index 7995c6d8..36268e08 100644 --- a/gpu4pyscf/df/int3c2e.py +++ b/gpu4pyscf/df/int3c2e.py @@ -21,7 +21,8 @@ from pyscf import gto, df, lib from pyscf.scf import _vhf from gpu4pyscf.scf.hf import BasisProdCache, _make_s_index_offsets -from gpu4pyscf.lib.cupy_helper import block_c2s_diag, cart2sph, block_diag, contract, load_library, c2s_l +from gpu4pyscf.lib.cupy_helper import ( + block_c2s_diag, cart2sph, block_diag, contract, load_library, c2s_l, get_avail_mem, print_mem_info) from gpu4pyscf.lib import logger LMAX_ON_GPU = 8 @@ -316,13 +317,14 @@ def build(self, cutoff=1e-14, group_size=None, cput1 = logger.timer_debug1(tot_mol, 'Initialize GPU cache', *cput1) self.bas_pairs_locs = bas_pairs_locs ncptype = len(self.log_qs) + self.aosym = aosym if aosym: self.cp_idx, self.cp_jdx = np.tril_indices(ncptype) else: nl = int(round(np.sqrt(ncptype))) self.cp_idx, self.cp_jdx = 
np.unravel_index(np.arange(ncptype), (nl, nl)) -def get_int3c2e_wjk(mol, auxmol, dm0_tag, thred=1e-12, omega=None): +def get_int3c2e_wjk(mol, auxmol, dm0_tag, thred=1e-12, omega=None, with_k=True): intopt = VHFOpt(mol, auxmol, 'int2e') intopt.build(thred, diag_block_with_triu=True, aosym=True, group_size_aux=64) orbo = dm0_tag.occ_coeff @@ -331,7 +333,25 @@ def get_int3c2e_wjk(mol, auxmol, dm0_tag, thred=1e-12, omega=None): nocc = orbo.shape[1] row, col = np.tril_indices(nao) wj = cupy.zeros([naux]) - wk = cupy.zeros([naux,nao,nocc]) + if with_k: + wk_P__ = cupy.zeros([naux, nocc, nocc]) # assuming naux*nocc*nocc < max_gpu_memory + else: + wk_P__ = None + avail_mem = get_avail_mem() + use_gpu_memory = True + if naux*nao*nocc*8 < 0.4*avail_mem: + try: + wk = cupy.zeros([naux,nao,nocc]) + except Exception: + use_gpu_memory = False + else: + use_gpu_memory = False + + if not use_gpu_memory: + mem = cupy.cuda.alloc_pinned_memory(naux*nao*nocc*8) + wk = np.ndarray([naux,nao,nocc], dtype=np.float64, order='C', buffer=mem) + + # TODO: async data transfer for cp_kl_id, _ in enumerate(intopt.aux_log_qs): k0 = intopt.sph_aux_loc[cp_kl_id] k1 = intopt.sph_aux_loc[cp_kl_id+1] @@ -347,10 +367,17 @@ def get_int3c2e_wjk(mol, auxmol, dm0_tag, thred=1e-12, omega=None): i0, i1 = intopt.sph_ao_loc[cpi], intopt.sph_ao_loc[cpi+1] j0, j1 = intopt.sph_ao_loc[cpj], intopt.sph_ao_loc[cpj+1] ints_slices[:,j0:j1,i0:i1] = int3c_blk + ints_slices[:, row, col] = ints_slices[:, col, row] wj[k0:k1] = contract('Lij,ij->L', ints_slices, dm0_tag) - wk[k0:k1] = contract('Lij,jo->Lio', ints_slices, orbo) - return wj, wk + if with_k: + wk_tmp = contract('Lij,jo->Lio', ints_slices, orbo) + wk_P__[k0:k1] = contract('Lio,ir->Lro', wk_tmp, orbo) + if isinstance(wk, cupy.ndarray): + wk[k0:k1] = contract('Lij,jo->Lio', ints_slices, orbo) + else: + wk[k0:k1] = contract('Lij,jo->Lio', ints_slices, orbo).get() + return wj, wk, wk_P__ def get_int3c2e_ip_jk(intopt, cp_aux_id, ip_type, rhoj, rhok, dm, omega=None): 
''' @@ -688,12 +715,12 @@ def get_int3c2e_jk(intopt, dm0_tag, with_k=True, omega=None): i0, i1 = intopt.sph_ao_loc[cpi], intopt.sph_ao_loc[cpi+1] j0, j1 = intopt.sph_ao_loc[cpj], intopt.sph_ao_loc[cpj+1] ints_slices[:,j0:j1,i0:i1] = int3c_blk - if cpi != cpj: + if cpi != cpj and intopt.aosym: ints_slices[:,i0:i1,j0:j1] = int3c_blk.transpose([0,2,1]) - rhoj[k0:k1] += cupy.einsum('pji,ij->p', ints_slices, dm0_tag) - rhok_tmp = cupy.einsum('pji,jo->poi', ints_slices, orbo) - rhok[k0:k1] += cupy.einsum('poi,ir->por', rhok_tmp, orbo) + rhoj[k0:k1] += contract('pji,ij->p', ints_slices, dm0_tag) + rhok_tmp = contract('pji,jo->poi', ints_slices, orbo) + rhok[k0:k1] += contract('poi,ir->por', rhok_tmp, orbo) return rhoj, rhok @@ -713,23 +740,24 @@ def get_int3c2e_ip1_vjk(intopt, rhoj, rhok, dm0_tag, aoslices, with_k=True, omeg for aux_id, int3c_blk in loop_aux_jk(intopt, ip_type='ip1', omega=omega): k0, k1 = intopt.sph_aux_loc[aux_id], intopt.sph_aux_loc[aux_id+1] - vj1_buf += cupy.einsum('xpji,p->xij', int3c_blk, rhoj[k0:k1]) + vj1_buf += contract('xpji,p->xij', int3c_blk, rhoj[k0:k1]) + rhok_tmp = cupy.asarray(rhok[k0:k1]) if with_k: - rhok0_slice = cupy.einsum('pio,Jo->piJ', rhok[k0:k1], orbo) * 2 - vk1_buf += cupy.einsum('xpji,plj->xil', int3c_blk, rhok0_slice) + rhok0_slice = contract('pio,Jo->piJ', rhok_tmp, orbo) * 2 + vk1_buf += contract('xpji,plj->xil', int3c_blk, rhok0_slice) - rhoj0 = cupy.einsum('xpji,ij->xpi', int3c_blk, dm0_tag) - vj1_ao = cupy.einsum('pjo,xpi->xijo', rhok[k0:k1], rhoj0) - vj1 += 2.0*cupy.einsum('xiko,ia->axko', vj1_ao, ao2atom) + rhoj0 = contract('xpji,ij->xpi', int3c_blk, dm0_tag) + vj1_ao = contract('pjo,xpi->xijo', rhok_tmp, rhoj0) + vj1 += 2.0*contract('xiko,ia->axko', vj1_ao, ao2atom) if with_k: - int3c_ip1_occ = cupy.einsum('xpji,jo->xpio', int3c_blk, orbo) - vk1_ao = cupy.einsum('xpio,pki->xiko', int3c_ip1_occ, rhok0_slice) - vk1 += cupy.einsum('xiko,ia->axko', vk1_ao, ao2atom) + int3c_ip1_occ = contract('xpji,jo->xpio', int3c_blk, 
orbo) + vk1_ao = contract('xpio,pki->xiko', int3c_ip1_occ, rhok0_slice) + vk1 += contract('xiko,ia->axko', vk1_ao, ao2atom) - rhok0 = cupy.einsum('pli,lo->poi', rhok0_slice, orbo) - vk1_ao = cupy.einsum('xpji,poi->xijo', int3c_blk, rhok0) - vk1 += cupy.einsum('xiko,ia->axko', vk1_ao, ao2atom) + rhok0 = contract('pli,lo->poi', rhok0_slice, orbo) + vk1_ao = contract('xpji,poi->xijo', int3c_blk, rhok0) + vk1 += contract('xiko,ia->axko', vk1_ao, ao2atom) return vj1_buf, vk1_buf, vj1, vk1 def get_int3c2e_ip2_vjk(intopt, rhoj, rhok, dm0_tag, auxslices, with_k=True, omega=None): @@ -745,21 +773,22 @@ def get_int3c2e_ip2_vjk(intopt, rhoj, rhok, dm0_tag, auxslices, with_k=True, ome vk1 = cupy.zeros([natom,3,nao_sph,nocc]) for aux_id, int3c_blk in loop_aux_jk(intopt, ip_type='ip2', omega=omega): k0, k1 = intopt.sph_aux_loc[aux_id], intopt.sph_aux_loc[aux_id+1] - wj2 = cupy.einsum('xpji,ji->xp', int3c_blk, dm0_tag) - wk2_P__ = cupy.einsum('xpji,jo->xpio', int3c_blk, orbo) + wj2 = contract('xpji,ji->xp', int3c_blk, dm0_tag) + wk2_P__ = contract('xpji,jo->xpio', int3c_blk, orbo) - vj1_tmp = -cupy.einsum('pio,xp->xpio', rhok[k0:k1], wj2) - vj1_tmp -= cupy.einsum('xpio,p->xpio', wk2_P__, rhoj[k0:k1]) + rhok_tmp = cupy.asarray(rhok[k0:k1]) + vj1_tmp = -contract('pio,xp->xpio', rhok_tmp, wj2) + vj1_tmp -= contract('xpio,p->xpio', wk2_P__, rhoj[k0:k1]) - vj1 += cupy.einsum('xpio,pa->axio', vj1_tmp, aux2atom[k0:k1]) + vj1 += contract('xpio,pa->axio', vj1_tmp, aux2atom[k0:k1]) if with_k: - rhok0_slice = cupy.einsum('pio,jo->pij', rhok[k0:k1], orbo) - vk1_tmp = -cupy.einsum('xpjo,pij->xpio', wk2_P__, rhok0_slice) * 2 + rhok0_slice = contract('pio,jo->pij', rhok_tmp, orbo) + vk1_tmp = -contract('xpjo,pij->xpio', wk2_P__, rhok0_slice) * 2 - rhok0_oo = cupy.einsum('pio,ir->pro', rhok[k0:k1], orbo) - vk1_tmp -= cupy.einsum('xpio,pro->xpir', wk2_P__, rhok0_oo) * 2 + rhok0_oo = contract('pio,ir->pro', rhok_tmp, orbo) + vk1_tmp -= contract('xpio,pro->xpir', wk2_P__, rhok0_oo) * 2 - vk1 += 
cupy.einsum('xpir,pa->axir', vk1_tmp, aux2atom[k0:k1]) + vk1 += contract('xpir,pa->axir', vk1_tmp, aux2atom[k0:k1]) wj2 = wk2_P__ = rhok0_slice = rhok0_oo = None return vj1, vk1 @@ -772,11 +801,29 @@ def get_int3c2e_ip1_wjk(intopt, dm0_tag, with_k=True, omega=None): orbo = cupy.asarray(dm0_tag.occ_coeff, order='C') nocc = orbo.shape[1] wj = cupy.empty([nao_sph,naux_sph,3]) - wk = cupy.empty([nao_sph,naux_sph,nocc,3]) + avail_mem = get_avail_mem() + use_gpu_memory = True + if nao_sph*naux_sph*nocc*3*8 < 0.4*avail_mem: + try: + wk = cupy.empty([nao_sph,naux_sph,nocc,3]) + except Exception: + use_gpu_memory = False + else: + use_gpu_memory = False + + if not use_gpu_memory: + mem = cupy.cuda.alloc_pinned_memory(nao_sph*naux_sph*nocc*3*8) + wk = np.ndarray([nao_sph,naux_sph,nocc,3], dtype=np.float64, order='C', buffer=mem) + + # TODO: async data transfer for aux_id, int3c_blk in loop_aux_jk(intopt, ip_type='ip1', omega=omega): k0, k1 = intopt.sph_aux_loc[aux_id], intopt.sph_aux_loc[aux_id+1] - wj[:,k0:k1] = cupy.einsum('xpji,ij->ipx', int3c_blk, dm0_tag) - wk[:,k0:k1] = cupy.einsum('xpji,jo->ipox', int3c_blk, orbo) + wj[:,k0:k1] = contract('xpji,ij->ipx', int3c_blk, dm0_tag) + wk_tmp = contract('xpji,jo->ipox', int3c_blk, orbo) + if use_gpu_memory: + wk[:,k0:k1] = wk_tmp + else: + wk[:,k0:k1] = wk_tmp.get() return wj, wk def get_int3c2e_ip2_wjk(intopt, dm0_tag, with_k=True, omega=None): @@ -786,13 +833,12 @@ def get_int3c2e_ip2_wjk(intopt, dm0_tag, with_k=True, omega=None): naux_sph = len(intopt.sph_aux_idx) orbo = cupy.asarray(dm0_tag.occ_coeff, order='C') nocc = orbo.shape[1] - wj = cupy.empty([naux_sph,3]) - wk = cupy.empty([naux_sph,nocc,nocc,3]) - for aux_id, int3c_blk in loop_aux_jk(intopt, ip_type='ip2', omega=omega): - k0, k1 = intopt.sph_aux_loc[aux_id], intopt.sph_aux_loc[aux_id+1] - wj[k0:k1] = cupy.einsum('xpji,ij->px', int3c_blk, dm0_tag) - tmp = cupy.einsum('xpji,jo->piox', int3c_blk, orbo) - wk[k0:k1] = cupy.einsum('piox,ir->prox', tmp, orbo) + wj = 
cupy.zeros([naux_sph,3]) + wk = cupy.zeros([naux_sph,nocc,nocc,3]) + for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ip2', omega=omega): + wj[k0:k1] += contract('xpji,ji->px', int3c_blk, dm0_tag[j0:j1,i0:i1]) + tmp = contract('xpji,jo->piox', int3c_blk, orbo[j0:j1]) + wk[k0:k1] += contract('piox,ir->prox', tmp, orbo[i0:i1]) return wj, wk def get_int3c2e_ipip1_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, omega=None): @@ -804,11 +850,11 @@ def get_int3c2e_ipip1_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, omega=None): hj = cupy.zeros([nao_sph,9]) hk = cupy.zeros([nao_sph,9]) for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ipip1', omega=omega): - rhok_tmp = cupy.einsum('por,ir->pio', rhok[k0:k1], orbo[i0:i1]) - rhok_tmp = cupy.einsum('pio,jo->pij', rhok_tmp, orbo[j0:j1]) - tmp = cupy.einsum('xpji,ij->xpi', int3c_blk, dm0_tag[i0:i1,j0:j1]) - hj[i0:i1] += cupy.einsum('xpi,p->ix', tmp, rhoj[k0:k1]) - hk[i0:i1] += cupy.einsum('xpji,pij->ix', int3c_blk, rhok_tmp) + rhok_tmp = contract('por,ir->pio', rhok[k0:k1], orbo[i0:i1]) + rhok_tmp = contract('pio,jo->pij', rhok_tmp, orbo[j0:j1]) + tmp = contract('xpji,ij->xpi', int3c_blk, dm0_tag[i0:i1,j0:j1]) + hj[i0:i1] += contract('xpi,p->ix', tmp, rhoj[k0:k1]) + hk[i0:i1] += contract('xpji,pij->ix', int3c_blk, rhok_tmp) hj = hj.reshape([nao_sph,3,3]) hk = hk.reshape([nao_sph,3,3]) return hj, hk @@ -822,11 +868,11 @@ def get_int3c2e_ipvip1_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, omega=None) hj = cupy.zeros([nao_sph,nao_sph,9]) hk = cupy.zeros([nao_sph,nao_sph,9]) for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ipvip1', omega=omega): - rhok_tmp = cupy.einsum('por,ir->pio', rhok[k0:k1], orbo[i0:i1]) - rhok_tmp = cupy.einsum('pio,jo->pji', rhok_tmp, orbo[j0:j1]) - tmp = cupy.einsum('xpji,ij->xpij', int3c_blk, dm0_tag[i0:i1,j0:j1]) - hj[i0:i1,j0:j1] += cupy.einsum('xpij,p->ijx', tmp, rhoj[k0:k1]) - hk[i0:i1,j0:j1] += cupy.einsum('xpji,pji->ijx', 
int3c_blk, rhok_tmp) + rhok_tmp = contract('por,ir->pio', rhok[k0:k1], orbo[i0:i1]) + rhok_tmp = contract('pio,jo->pji', rhok_tmp, orbo[j0:j1]) + tmp = contract('xpji,ij->xpij', int3c_blk, dm0_tag[i0:i1,j0:j1]) + hj[i0:i1,j0:j1] += contract('xpij,p->ijx', tmp, rhoj[k0:k1]) + hk[i0:i1,j0:j1] += contract('xpji,pji->ijx', int3c_blk, rhok_tmp) hj = hj.reshape([nao_sph,nao_sph,3,3]) hk = hk.reshape([nao_sph,nao_sph,3,3]) return hj, hk @@ -841,11 +887,11 @@ def get_int3c2e_ip1ip2_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, omega=None) hj = cupy.zeros([nao_sph,naux_sph,9]) hk = cupy.zeros([nao_sph,naux_sph,9]) for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ip1ip2', omega=omega): - rhok_tmp = cupy.einsum('por,ir->pio', rhok[k0:k1], orbo[i0:i1]) - rhok_tmp = cupy.einsum('pio,jo->pij', rhok_tmp, orbo[j0:j1]) - tmp = cupy.einsum('xpji,ij->xpi', int3c_blk, dm0_tag[i0:i1,j0:j1]) - hj[i0:i1,k0:k1] += cupy.einsum('xpi,p->ipx', tmp, rhoj[k0:k1]) - hk[i0:i1,k0:k1] += cupy.einsum('xpji,pij->ipx', int3c_blk, rhok_tmp) + rhok_tmp = contract('por,ir->pio', rhok[k0:k1], orbo[i0:i1]) + rhok_tmp = contract('pio,jo->pij', rhok_tmp, orbo[j0:j1]) + tmp = contract('xpji,ij->xpi', int3c_blk, dm0_tag[i0:i1,j0:j1]) + hj[i0:i1,k0:k1] += contract('xpi,p->ipx', tmp, rhoj[k0:k1]) + hk[i0:i1,k0:k1] += contract('xpji,pij->ipx', int3c_blk, rhok_tmp) hj = hj.reshape([nao_sph,naux_sph,3,3]) hk = hk.reshape([nao_sph,naux_sph,3,3]) return hj, hk @@ -859,11 +905,11 @@ def get_int3c2e_ipip2_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, omega=None): hj = cupy.zeros([naux_sph,9]) hk = cupy.zeros([naux_sph,9]) for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ipip2', omega=omega): - rhok_tmp = cupy.einsum('por,jr->pjo', rhok[k0:k1], orbo[j0:j1]) - rhok_tmp = cupy.einsum('pjo,io->pji', rhok_tmp, orbo[i0:i1]) - tmp = cupy.einsum('xpji,ij->xp', int3c_blk, dm0_tag[i0:i1,j0:j1]) - hj[k0:k1] += cupy.einsum('xp,p->px', tmp, rhoj[k0:k1]) - hk[k0:k1] += 
cupy.einsum('xpji,pji->px', int3c_blk, rhok_tmp) + rhok_tmp = contract('por,jr->pjo', rhok[k0:k1], orbo[j0:j1]) + rhok_tmp = contract('pjo,io->pji', rhok_tmp, orbo[i0:i1]) + tmp = contract('xpji,ij->xp', int3c_blk, dm0_tag[i0:i1,j0:j1]) + hj[k0:k1] += contract('xp,p->px', tmp, rhoj[k0:k1]) + hk[k0:k1] += contract('xpji,pji->px', int3c_blk, rhok_tmp) hj = hj.reshape([naux_sph,3,3]) hk = hk.reshape([naux_sph,3,3]) return hj, hk diff --git a/gpu4pyscf/df/tests/test_df_scf.py b/gpu4pyscf/df/tests/test_df_scf.py index 1b042096..8c686b63 100644 --- a/gpu4pyscf/df/tests/test_df_scf.py +++ b/gpu4pyscf/df/tests/test_df_scf.py @@ -93,15 +93,19 @@ def test_rks_wb97(self): def test_to_cpu(self): mf = scf.RHF(mol).density_fit().to_cpu() assert isinstance(mf, cpu_df_jk._DFHF) - mf = mf.to_gpu() - assert isinstance(mf, df_jk._DFHF) + # TODO: coming soon + #mf = mf.to_gpu() + #assert isinstance(mf, df_jk._DFHF) mf = rks.RKS(mol).density_fit().to_cpu() assert isinstance(mf, cpu_df_jk._DFHF) - assert 'gpu' not in mf.grids.__module__ - mf = mf.to_gpu() - assert isinstance(mf, df_jk._DFHF) - assert 'gpu' in mf.grids.__module__ + # grids are still not df._key + #assert 'gpu' not in mf.grids.__module__ + + # TODO: coming soon + #mf = mf.to_gpu() + #assert isinstance(mf, df_jk._DFHF) + #assert 'gpu' in mf.grids.__module__ if __name__ == "__main__": print("Full Tests for SCF") diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py index 6f642889..26c441cf 100644 --- a/gpu4pyscf/hessian/rhf.py +++ b/gpu4pyscf/hessian/rhf.py @@ -36,7 +36,7 @@ # import pyscf.grad.rhf to activate nuc_grad_method method from pyscf.grad import rhf # noqa from gpu4pyscf.scf import cphf -from gpu4pyscf.lib.cupy_helper import contract, tag_array +from gpu4pyscf.lib.cupy_helper import contract, tag_array, print_mem_info from gpu4pyscf.lib import logger def hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, @@ -55,7 +55,6 @@ def hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, 
mo_energy = cupy.asarray(mo_energy) mo_occ = cupy.asarray(mo_occ) mo_coeff = cupy.asarray(mo_coeff) - de2 = hessobj.partial_hess_elec(mo_energy, mo_coeff, mo_occ, atmlst, max_memory, log) if h1ao is None: diff --git a/gpu4pyscf/lib/cupy_helper.py b/gpu4pyscf/lib/cupy_helper.py index 9f2cef59..2285b9dd 100644 --- a/gpu4pyscf/lib/cupy_helper.py +++ b/gpu4pyscf/lib/cupy_helper.py @@ -57,7 +57,6 @@ def print_mem_info(): cupy.get_default_memory_pool().free_all_blocks() cupy.get_default_pinned_memory_pool().free_all_blocks() mem_avail = cupy.cuda.runtime.memGetInfo()[0] - print(cupy.cuda.runtime.memGetInfo()) total_mem = mempool.total_bytes() used_mem = mempool.used_bytes() mem_limit = mempool.get_limit() @@ -322,7 +321,7 @@ def cart2sph(t, axis=0, ang=1, out=None): t_sph = contract('min,ip->mpn', t_cart, c2s, out=out) return t_sph.reshape(out_shape) -# a copy with modification from +# a copy with modification from # https://github.com/pyscf/pyscf/blob/9219058ac0a1bcdd8058166cad0fb9127b82e9bf/pyscf/lib/linalg_helper.py#L1536 def krylov(aop, b, x0=None, tol=1e-10, max_cycle=30, dot=cupy.dot, lindep=DSOLVE_LINDEP, callback=None, hermi=False, diff --git a/gpu4pyscf/lib/gdft/nr_eval_gto.cu b/gpu4pyscf/lib/gdft/nr_eval_gto.cu index 34a2783a..b87ca434 100644 --- a/gpu4pyscf/lib/gdft/nr_eval_gto.cu +++ b/gpu4pyscf/lib/gdft/nr_eval_gto.cu @@ -40,7 +40,7 @@ static void _nabla1(double *fx1, double *fy1, double *fz1, fx1[0] = a2*fx0[1]; fy1[0] = a2*fy0[1]; fz1[0] = a2*fz0[1]; - + for (i = 1; i <= ANG; i++) { fx1[i] = i*fx0[i-1] + a2*fx0[i+1]; fy1[i] = i*fy0[i-1] + a2*fy0[i+1]; @@ -174,11 +174,11 @@ static void _cart_kernel_deriv0(BasOffsets offsets) double xpows[LMAX]; double ypows[LMAX]; double zpows[LMAX]; - + xpows[0] = 1.0; ypows[0] = 1.0; zpows[0] = 1.0; - + for(lx = 1; lx <= ANG ; lx++){ xpows[lx] = xpows[lx-1] * rx; ypows[lx] = ypows[lx-1] * ry; @@ -342,7 +342,7 @@ static void _cart_kernel_deriv1(BasOffsets offsets) gtoz[7*ngrids+grid_id] = az * ry * ry * rz + byy; 
gtoz[8*ngrids+grid_id] = az * ry * rz * rz + 2 * byz; gtoz[9*ngrids+grid_id] = az * rz * rz * rz + 3 * bzz; - } + } // There is a bug in the comment. // Using a general formulation. // FIXME later @@ -390,7 +390,7 @@ static void _cart_kernel_deriv1(BasOffsets offsets) gtox[12*ngrids+grid_id] = ax * ry * ry * rz * rz; gtox[13*ngrids+grid_id] = ax * ry * rz * rz * rz; gtox[14*ngrids+grid_id] = ax * rz * rz * rz * rz; - gtoy[ grid_id] = ay * rx * rx * rx * rx; + gtoy[ grid_id] = ay * rx * rx * rx * rx; gtoy[1 *ngrids+grid_id] = ay * rx * rx * rx * ry + bxxx; gtoy[2 *ngrids+grid_id] = ay * rx * rx * rx * rz; gtoy[3 *ngrids+grid_id] = ay * rx * rx * ry * ry + 2 * bxxy; @@ -404,11 +404,11 @@ static void _cart_kernel_deriv1(BasOffsets offsets) gtoy[11*ngrids+grid_id] = ay * ry * ry * ry * rz + 3 * byyz; gtoy[12*ngrids+grid_id] = ay * ry * ry * rz * rz + 2 * byzz; gtoy[13*ngrids+grid_id] = ay * ry * rz * rz * rz + bzzz; - gtoy[14*ngrids+grid_id] = ay * rz * rz * rz * rz; + gtoy[14*ngrids+grid_id] = ay * rz * rz * rz * rz; gtoz[ grid_id] = az * rx * rx * rx * rx; gtoz[1 *ngrids+grid_id] = az * rx * rx * rx * ry; gtoz[2 *ngrids+grid_id] = az * rx * rx * rx * rz + bxxx; - gtoz[3 *ngrids+grid_id] = az * rx * rx * ry * ry; + gtoz[3 *ngrids+grid_id] = az * rx * rx * ry * ry; gtoz[4 *ngrids+grid_id] = az * rx * rx * ry * rz + bxxy; gtoz[5 *ngrids+grid_id] = az * rx * rx * rz * rz + 2 * bxxz; gtoz[6 *ngrids+grid_id] = az * rx * ry * ry * ry; @@ -477,7 +477,7 @@ static void _cart_kernel_deriv2(BasOffsets offsets) double* __restrict__ gtoyy = offsets.data + (nao * 7 + i0) * ngrids; double* __restrict__ gtoyz = offsets.data + (nao * 8 + i0) * ngrids; double* __restrict__ gtozz = offsets.data + (nao * 9 + i0) * ngrids; - + double *atom_coordx = c_envs.atom_coordx; double *atom_coordy = c_envs.atom_coordx + natm; double *atom_coordz = c_envs.atom_coordx + natm * 2; @@ -494,7 +494,7 @@ static void _cart_kernel_deriv2(BasOffsets offsets) double fx0[16], fy0[16], fz0[16]; double fx1[16], 
fy1[16], fz1[16]; double fx2[16], fy2[16], fz2[16]; - + fx0[0] = 1.0; fy0[0] = 1.0; fz0[0] = 1.0; for (int lx = 1; lx <= ANG+2; lx++){ fx0[lx] = fx0[lx-1] * rx; @@ -587,7 +587,7 @@ static void _cart_kernel_deriv3(BasOffsets offsets) fy0[lx] = fy0[lx-1] * ry; fz0[lx] = fz0[lx-1] * rz; } - + for (int ip = 0; ip < offsets.nprim; ++ip) { double ce = coeffs[ip] * exp(-exps[ip] * rr) * offsets.fac; _nabla1(fx1, fy1, fz1, fx0, fy0, fz0, exps[ip]); @@ -701,7 +701,7 @@ static void _cart_kernel_deriv4(BasOffsets offsets) fy0[lx] = fy0[lx-1] * ry; fz0[lx] = fz0[lx-1] * rz; } - + for (int ip = 0; ip < offsets.nprim; ++ip) { double ce = coeffs[ip] * exp(-exps[ip] * rr) * offsets.fac; _nabla1(fx1, fy1, fz1, fx0, fy0, fz0, exps[ip]); @@ -1085,7 +1085,7 @@ static void _sph_kernel_deriv1(BasOffsets offsets) gtox[6 *ngrids+grid_id] = 2.838524087272680054 * g5 + 0.473087347878780009 * g10 - 0.473087347878780002 * g0 - 2.838524087272680050 * g12; gtox[7 *ngrids+grid_id] = 1.770130769779930531 * g2 - 5.310392309339791590 * g7 ; gtox[8 *ngrids+grid_id] = 0.625835735449176134 * g0 - 3.755014412695056800 * g3 + 0.625835735449176134 * g10; - g0 = ay * rx * rx * rx * rx; + g0 = ay * rx * rx * rx * rx; g1 = ay * rx * rx * rx * ry + bxxx; g2 = ay * rx * rx * rx * rz; g3 = ay * rx * rx * ry * ry + 2 * bxxy; @@ -1099,7 +1099,7 @@ static void _sph_kernel_deriv1(BasOffsets offsets) g11 = ay * ry * ry * ry * rz + 3 * byyz; g12 = ay * ry * ry * rz * rz + 2 * byzz; g13 = ay * ry * rz * rz * rz + bzzz; - g14 = ay * rz * rz * rz * rz; + g14 = ay * rz * rz * rz * rz; gtoy[ grid_id] = 2.503342941796704538 * g1 - 2.503342941796704530 * g6 ; gtoy[1 *ngrids+grid_id] = 5.310392309339791593 * g4 - 1.770130769779930530 * g11; gtoy[2 *ngrids+grid_id] = 5.677048174545360108 * g8 - 0.946174695757560014 * g1 - 0.946174695757560014 * g6 ; @@ -1112,7 +1112,7 @@ static void _sph_kernel_deriv1(BasOffsets offsets) g0 = az * rx * rx * rx * rx; g1 = az * rx * rx * rx * ry; g2 = az * rx * rx * rx * rz + bxxx; - g3 = az * 
rx * rx * ry * ry; + g3 = az * rx * rx * ry * ry; g4 = az * rx * rx * ry * rz + bxxy; g5 = az * rx * rx * rz * rz + 2 * bxxz; g6 = az * rx * ry * ry * ry; @@ -1162,7 +1162,7 @@ static void _sph_kernel_deriv2(BasOffsets offsets) double* __restrict__ gtoyy = offsets.data + (nao * 7 + i0) * ngrids; double* __restrict__ gtoyz = offsets.data + (nao * 8 + i0) * ngrids; double* __restrict__ gtozz = offsets.data + (nao * 9 + i0) * ngrids; - + double *atom_coordx = c_envs.atom_coordx; double *atom_coordy = c_envs.atom_coordx + natm; double *atom_coordz = c_envs.atom_coordx + natm * 2; @@ -1179,7 +1179,7 @@ static void _sph_kernel_deriv2(BasOffsets offsets) double fx0[16], fy0[16], fz0[16]; double fx1[16], fy1[16], fz1[16]; double fx2[16], fy2[16], fz2[16]; - + fx0[0] = 1.0; fy0[0] = 1.0; fz0[0] = 1.0; for (int lx = 1; lx <= ANG+2; lx++){ fx0[lx] = fx0[lx-1] * rx; @@ -1267,14 +1267,14 @@ static void _sph_kernel_deriv3(BasOffsets offsets) fy0[lx] = fy0[lx-1] * ry; fz0[lx] = fz0[lx-1] * rz; } - + double g[GTO_MAX_CART]; for (int ip = 0; ip < offsets.nprim; ++ip) { double ce = coeffs[ip] * exp(-exps[ip] * rr) * offsets.fac; _nabla1(fx1, fy1, fz1, fx0, fy0, fz0, exps[ip]); _nabla1(fx2, fy2, fz2, fx1, fy1, fz1, exps[ip]); _nabla1(fx3, fy3, fz3, fx2, fy2, fz2, exps[ip]); - + _cart_gto(g, ce, fx0, fy0, fz0); _cart2sph(g, gto, ngrids, grid_id); _cart_gto(g, ce, fx1, fy0, fz0); _cart2sph(g, gtox, ngrids, grid_id); _cart_gto(g, ce, fx0, fy1, fz0); _cart2sph(g, gtoy, ngrids, grid_id); @@ -1506,7 +1506,7 @@ int GDFTeval_gto(cudaStream_t stream, double *ao, int deriv, int cart, for (int bucket = 0; bucket < nbuckets; ++bucket) { int ish = bas_loc[bucket]; int l = bas[ANG_OF+ish*BAS_SLOTS]; - + offsets.bas_off = ish; offsets.nprim = bas[NPRIM_OF+ish*BAS_SLOTS]; offsets.fac = CINTcommon_fac_sp(l); diff --git a/gpu4pyscf/lib/gdft/nr_numint_sparse.cu b/gpu4pyscf/lib/gdft/nr_numint_sparse.cu index 7b7455b9..4ec57c76 100644 --- a/gpu4pyscf/lib/gdft/nr_numint_sparse.cu +++ 
b/gpu4pyscf/lib/gdft/nr_numint_sparse.cu @@ -13,7 +13,7 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ - + #include #include #include @@ -206,13 +206,13 @@ static void _dot_aow_ao(double *out, double *bra, double *ket, double *wv, __shared__ double s_bra[THREADSXY]; __shared__ double s_ket[THREADSXY]; - + int grid_blk; for (grid_blk = 0; grid_blk < ngrids/THREADSX; grid_blk++) { int grid0 = grid_blk * THREADSX; uint8_t si = screen_index[grid_blk*bas_blocks+ish4]; uint8_t sj = screen_index[grid_blk*bas_blocks+jsh4]; - //printf("%d %d %d %d %d ***", si, sj, nbins, grid_blk*bas_blocks+ish4, grid_blk*bas_blocks+jsh4); + //printf("%d %d %d %d %d ***", si, sj, nbins, grid_blk*bas_blocks+ish4, grid_blk*bas_blocks+jsh4); if (si + sj >= 0) { //nbins) { int grid_id = grid0 + txy; for (int n = 0; n < THREADSY; n++) { diff --git a/gpu4pyscf/lib/gdft/vv10.cu b/gpu4pyscf/lib/gdft/vv10.cu index 51a1bd6f..df26dffe 100644 --- a/gpu4pyscf/lib/gdft/vv10.cu +++ b/gpu4pyscf/lib/gdft/vv10.cu @@ -49,34 +49,34 @@ static void vv10_kernel(double *Fvec, double *Uvec, double *Wvec, double F = 0.0; double U = 0.0; double W = 0.0; - + double *xj = vvcoords; double *yj = vvcoords + vvngrids; double *zj = vvcoords + 2*vvngrids; __shared__ double3 xj_t[THREADS]; __shared__ double3 kp_t[THREADS]; - + const int tx = threadIdx.x; for (int j = 0; j < vvngrids; j+=blockDim.x) { int idx = j + threadIdx.x; xj_t[tx] = {xj[idx], yj[idx], zj[idx]}; - kp_t[tx] = {Kp[idx], W0p[idx], RpW[idx]}; - + kp_t[tx] = {Kp[idx], W0p[idx], RpW[idx]}; + __syncthreads(); for (int l = 0, M = min(THREADS, vvngrids - j); l < M; ++l){ double3 xj_tmp = xj_t[l]; double pjx = xj_tmp.x; double pjy = xj_tmp.y; double pjz = xj_tmp.z; - + // about 23 operations for each pair double DX = pjx - xi; - double DY = pjy - yi; + double DY = pjy - yi; double DZ = pjz - zi; double R2 = DX*DX + DY*DY + DZ*DZ; - + double3 kp_tmp = kp_t[l]; double Kpj = kp_tmp.x; double W0pj = 
kp_tmp.y; @@ -87,7 +87,7 @@ static void vv10_kernel(double *Fvec, double *Uvec, double *Wvec, double gt = g + gp; double ggt = g * gt; double T = RpWj / (gp*ggt); - + F += T; T *= (g + gt)/ggt; U += T; @@ -126,7 +126,7 @@ static void vv10_grad_kernel(double *Fvec, double *vvcoords, double *coords, double *xj = vvcoords; double *yj = vvcoords + vvngrids; double *zj = vvcoords + 2*vvngrids; - + __shared__ double3 xj_t[THREADS]; __shared__ double3 kp_t[THREADS]; @@ -135,7 +135,7 @@ static void vv10_grad_kernel(double *Fvec, double *vvcoords, double *coords, int idx = j + threadIdx.x; xj_t[tx] = {xj[idx], yj[idx], zj[idx]}; - kp_t[tx] = {Kp[idx], W0p[idx], RpW[idx]}; + kp_t[tx] = {Kp[idx], W0p[idx], RpW[idx]}; __syncthreads(); for (int l = 0, M = min(THREADS, vvngrids - j); l < M; ++l){ @@ -143,10 +143,10 @@ static void vv10_grad_kernel(double *Fvec, double *vvcoords, double *coords, double pjx = xj_tmp.x; double pjy = xj_tmp.y; double pjz = xj_tmp.z; - + // about 23 operations for each pair double DX = pjx - xi; - double DY = pjy - yi; + double DY = pjy - yi; double DZ = pjz - zi; double R2 = DX*DX + DY*DY + DZ*DZ; @@ -154,7 +154,7 @@ static void vv10_grad_kernel(double *Fvec, double *vvcoords, double *coords, double Kpj = kp_tmp.x; double W0pj = kp_tmp.y; double RpWj = kp_tmp.z; - + double gp = R2*W0pj + Kpj; double g = R2*W0i + Ki; double gt = g + gp; diff --git a/gpu4pyscf/lib/utils.py b/gpu4pyscf/lib/utils.py index a5cf6187..5ce6613e 100644 --- a/gpu4pyscf/lib/utils.py +++ b/gpu4pyscf/lib/utils.py @@ -37,7 +37,6 @@ def to_cpu(method): if 'gpu4pyscf' not in pyscf_cls.__module__: break method = method.view(pyscf_cls) - keys = [] for cls in pyscf_cls.__mro__[:-1]: if hasattr(cls, '_keys'): diff --git a/gpu4pyscf/scf/tests/test_scf.py b/gpu4pyscf/scf/tests/test_scf.py index 3dd94806..29adecd4 100644 --- a/gpu4pyscf/scf/tests/test_scf.py +++ b/gpu4pyscf/scf/tests/test_scf.py @@ -31,11 +31,16 @@ H 0.7570000000 0.0000000000 -0.4696000000 ''' bas='def2-qzvpp' -mol = 
pyscf.M(atom=atom, basis=bas, max_memory=32000) -mol.verbose = 4 +def setUpModule(): + global mol + mol = pyscf.M(atom=atom, basis=bas, max_memory=32000) + mol.output = '/dev/null' + mol.verbose = 0 + mol.build() def tearDownModule(): global mol + mol.stdout.close() del mol class KnownValues(unittest.TestCase): @@ -51,16 +56,18 @@ def test_rhf(self): def test_to_cpu(self): mf = scf.RHF(mol).to_cpu() - assert isinstance(mf, cpu_scf.RHF) - mf = mf.to_gpu() - assert isinstance(mf, scf.RHF) + assert isinstance(mf, cpu_scf.hf.RHF) + # coming soon + #mf = mf.to_gpu() + #assert isinstance(mf, scf.RHF) mf = rks.RKS(mol).to_cpu() assert isinstance(mf, cpu_dft.rks.RKS) assert 'gpu' not in mf.grids.__module__ - mf = mf.to_gpu() - assert isinstance(mf, rks.RKS) - assert 'gpu' in mf.grids.__module__ + # coming soon + # mf = mf.to_gpu() + # assert isinstance(mf, rks.RKS) + #assert 'gpu' in mf.grids.__module__ if __name__ == "__main__": print("Full Tests for SCF") diff --git a/gpu4pyscf/solvent/grad/pcm.py b/gpu4pyscf/solvent/grad/pcm.py index 7c9d0047..4df17558 100644 --- a/gpu4pyscf/solvent/grad/pcm.py +++ b/gpu4pyscf/solvent/grad/pcm.py @@ -296,4 +296,3 @@ def _finalize(self): return WithSolventGrad(grad_method) -#pcm.PCM.nuc_grad_method = make_grad_object \ No newline at end of file diff --git a/gpu4pyscf/solvent/pcm.py b/gpu4pyscf/solvent/pcm.py index b7a1d181..fd3b36e4 100644 --- a/gpu4pyscf/solvent/pcm.py +++ b/gpu4pyscf/solvent/pcm.py @@ -293,7 +293,7 @@ def build(self, ng=None): self.v_grids_n = cupy.asarray(v_grids_n) def _get_vind(self, dms): - if not self._intermediates or self.grids.coords is None: + if not self._intermediates: self.build() nao = dms.shape[-1] From 2fff5f3deffe1241dcd9ab34ff0fa809054a21c4 Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Tue, 24 Oct 2023 15:55:35 -0700 Subject: [PATCH 13/19] Update __init__.py --- gpu4pyscf/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu4pyscf/__init__.py b/gpu4pyscf/__init__.py index 
7fb02e52..ca735418 100644 --- a/gpu4pyscf/__init__.py +++ b/gpu4pyscf/__init__.py @@ -1,2 +1,2 @@ from . import lib, grad, hessian, solvent, scf, dft -__version__ = '0.6.3' +__version__ = '0.6.4' From 891ef8180372884cea39ff964a0942e14a34ad8d Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Tue, 24 Oct 2023 21:46:30 -0700 Subject: [PATCH 14/19] Add chelpg charges in qmmm folder. (#1) (#52) * Add chelpg charges in qmmm folder. * Update chelpg.py * Update chelpg.py * Add unit test for chelpg, and compare with Qchem * Add an example to calculate chelpg Co-authored-by: puzhichen <147788878+puzhichen@users.noreply.github.com> --- examples/15-chelpg.py | 39 +++ gpu4pyscf/qmmm/__init__.py | 16 ++ gpu4pyscf/qmmm/chelpg.py | 371 +++++++++++++++++++++++++++++ gpu4pyscf/qmmm/test/test_chelpg.py | 85 +++++++ 4 files changed, 511 insertions(+) create mode 100644 examples/15-chelpg.py create mode 100644 gpu4pyscf/qmmm/__init__.py create mode 100644 gpu4pyscf/qmmm/chelpg.py create mode 100644 gpu4pyscf/qmmm/test/test_chelpg.py diff --git a/examples/15-chelpg.py b/examples/15-chelpg.py new file mode 100644 index 00000000..f102741b --- /dev/null +++ b/examples/15-chelpg.py @@ -0,0 +1,39 @@ +# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+ +from pyscf import gto +from gpu4pyscf.dft import rks +from gpu4pyscf.qmmm import chelpg + + +mol = gto.Mole() +mol.verbose = 0 +mol.output = None +mol.atom = [ + [1 , (1. , 0. , 0.000)], + [1 , (0. , 1. , 0.000)], + [1 , (0. , -1.517 , 1.177)], + [1 , (0. , 1.517 , 1.177)] ] +mol.basis = '631g' +mol.unit = 'B' +mol.build() +mol.verbose = 6 + +xc = 'b3lyp' +mf = rks.RKS(mol, xc=xc) +mf.grids.level = 5 +mf.kernel() +q = chelpg.eval_chelpg_layer_gpu(mf) +print(q) # [ 0.04402311 0.11333945 -0.25767919 0.10031663] \ No newline at end of file diff --git a/gpu4pyscf/qmmm/__init__.py b/gpu4pyscf/qmmm/__init__.py new file mode 100644 index 00000000..b165366a --- /dev/null +++ b/gpu4pyscf/qmmm/__init__.py @@ -0,0 +1,16 @@ +# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +from gpu4pyscf.qmmm import chelpg \ No newline at end of file diff --git a/gpu4pyscf/qmmm/chelpg.py b/gpu4pyscf/qmmm/chelpg.py new file mode 100644 index 00000000..3851acc0 --- /dev/null +++ b/gpu4pyscf/qmmm/chelpg.py @@ -0,0 +1,371 @@ +# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import pyscf +import time +import cupy +import numpy as np +import scipy +import ctypes +from pyscf import lib, gto +from pyscf.scf import _vhf +from gpu4pyscf.df import int3c2e +from gpu4pyscf.scf.hf import BasisProdCache +from gpu4pyscf.lib.cupy_helper import load_library, block_c2s_diag +libgint = load_library('libgint') +libgvhf = load_library('libgvhf') +lib.num_threads(8) + + +def get_j_int3c2e_pass1(intopt, dm0): + ''' + get rhoj pass1 for int3c2e + ''' + n_dm = 1 + + naux = intopt.naux + rhoj = cupy.zeros([naux]) + coeff = intopt.coeff + dm_cart = cupy.einsum('pi,ij,qj->pq', coeff, dm0, coeff) + + num_cp_ij = [len(log_qs) for log_qs in intopt.log_qs] + num_cp_kl = [len(log_qs) for log_qs in intopt.aux_log_qs] + + bins_locs_ij = np.append(0, np.cumsum(num_cp_ij)).astype(np.int32) + bins_locs_kl = np.append(0, np.cumsum(num_cp_kl)).astype(np.int32) + + ncp_ij = len(intopt.log_qs) + ncp_kl = len(intopt.aux_log_qs) + norb = dm_cart.shape[0] + err = libgvhf.GINTbuild_j_int3c2e_pass1( + intopt.bpcache, + ctypes.cast(dm_cart.data.ptr, ctypes.c_void_p), + ctypes.cast(rhoj.data.ptr, ctypes.c_void_p), + ctypes.c_int(norb), + ctypes.c_int(naux), + ctypes.c_int(n_dm), + bins_locs_ij.ctypes.data_as(ctypes.c_void_p), + bins_locs_kl.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(ncp_ij), + ctypes.c_int(ncp_kl)) + if err != 0: + raise RuntimeError('CUDA error in get_j_pass1') + return rhoj + + +class VHFOpt(_vhf.VHFOpt): + def __init__(self, mol, auxmol, intor, prescreen='CVHFnoscreen', + qcondname='CVHFsetnr_direct_scf', dmcondname=None): + # use local basis_seg_contraction for efficiency + self.mol = 
int3c2e.basis_seg_contraction(mol, allow_replica=True) + self.auxmol = int3c2e.basis_seg_contraction(auxmol, allow_replica=True) + ''' + # Note mol._bas will be sorted in .build() method. VHFOpt should be + # initialized after mol._bas updated. + ''' + self.nao = self.mol.nao + self.naux = self.auxmol.nao + + self._intor = intor + self._prescreen = prescreen + self._qcondname = qcondname + self._dmcondname = dmcondname + + self.bpcache = None + + self.sorted_auxmol = None + self.sorted_mol = None + + self.cart_ao_idx = None + self.sph_ao_idx = None + self.cart_aux_idx = None + self.sph_aux_idx = None + + self.cart_ao_loc = [] + self.cart_aux_loc = [] + self.sph_ao_loc = [] + self.sph_aux_loc = [] + + self.cart2sph = None + self.aux_cart2sph = None + + self.angular = None + self.aux_angular = None + + self.cp_idx = None + self.cp_jdx = None + + self.log_qs = None + self.aux_log_qs = None + + def clear(self): + _vhf.VHFOpt.__del__(self) + libgvhf.GINTdel_basis_prod(ctypes.byref(self.bpcache)) + return self + + def __del__(self): + try: + self.clear() + except AttributeError: + pass + + def build(self, cutoff=1e-14, group_size=None, + group_size_aux=None, diag_block_with_triu=False, aosym=False): + ''' + int3c2e is based on int2e with (ao,ao|aux,1) + a tot_mol is created with concatenating [mol, fake_mol, aux_mol] + we will pair (ao,ao) and (aux,1) separately. 
+ ''' + sorted_mol, sorted_idx, uniq_l_ctr, l_ctr_counts = int3c2e.sort_mol( + self.mol) + if group_size is not None: + uniq_l_ctr, l_ctr_counts = int3c2e._split_l_ctr_groups( + uniq_l_ctr, l_ctr_counts, group_size) + self.sorted_mol = sorted_mol + + # sort fake mol + fake_mol = int3c2e.make_fake_mol() + _, _, fake_uniq_l_ctr, fake_l_ctr_counts = int3c2e.sort_mol(fake_mol) + + # sort auxiliary mol + sorted_auxmol, sorted_aux_idx, aux_uniq_l_ctr, aux_l_ctr_counts = int3c2e.sort_mol( + self.auxmol) + if group_size_aux is not None: + aux_uniq_l_ctr, aux_l_ctr_counts = int3c2e._split_l_ctr_groups( + aux_uniq_l_ctr, aux_l_ctr_counts, group_size_aux) + self.sorted_auxmol = sorted_auxmol + tmp_mol = gto.mole.conc_mol(fake_mol, sorted_auxmol) + tot_mol = gto.mole.conc_mol(sorted_mol, tmp_mol) + + # Initialize vhfopt after reordering mol._bas + _vhf.VHFOpt.__init__(self, sorted_mol, self._intor, self._prescreen, + self._qcondname, self._dmcondname) + self.direct_scf_tol = cutoff + + # TODO: is it more accurate to filter with overlap_cond (or exp_cond)? 
+ q_cond = self.get_q_cond() + l_ctr_offsets = np.append(0, np.cumsum(l_ctr_counts)) + log_qs, pair2bra, pair2ket = int3c2e.get_pairing( + l_ctr_offsets, l_ctr_offsets, q_cond, + diag_block_with_triu=diag_block_with_triu, aosym=aosym) + self.log_qs = log_qs.copy() + + # contraction coefficient for ao basis + cart_ao_loc = self.sorted_mol.ao_loc_nr(cart=True) + sph_ao_loc = self.sorted_mol.ao_loc_nr(cart=False) + self.cart_ao_loc = [cart_ao_loc[cp] for cp in l_ctr_offsets] + self.sph_ao_loc = [sph_ao_loc[cp] for cp in l_ctr_offsets] + self.angular = [l[0] for l in uniq_l_ctr] + + cart_ao_loc = self.mol.ao_loc_nr(cart=True) + sph_ao_loc = self.mol.ao_loc_nr(cart=False) + nao = sph_ao_loc[-1] + ao_idx = np.array_split(np.arange(nao), sph_ao_loc[1:-1]) + self.sph_ao_idx = np.hstack([ao_idx[i] for i in sorted_idx]) + + # cartesian ao index + nao = cart_ao_loc[-1] + ao_idx = np.array_split(np.arange(nao), cart_ao_loc[1:-1]) + self.cart_ao_idx = np.hstack([ao_idx[i] for i in sorted_idx]) + ncart = cart_ao_loc[-1] + nsph = sph_ao_loc[-1] + self.cart2sph = block_c2s_diag(ncart, nsph, self.angular, l_ctr_counts) + inv_idx = np.argsort(self.sph_ao_idx, kind='stable').astype(np.int32) + self.coeff = self.cart2sph[:, inv_idx] + + # pairing auxiliary basis with fake basis set + fake_l_ctr_offsets = np.append(0, np.cumsum(fake_l_ctr_counts)) + fake_l_ctr_offsets += l_ctr_offsets[-1] + + aux_l_ctr_offsets = np.append(0, np.cumsum(aux_l_ctr_counts)) + + # contraction coefficient for auxiliary basis + cart_aux_loc = self.sorted_auxmol.ao_loc_nr(cart=True) + sph_aux_loc = self.sorted_auxmol.ao_loc_nr(cart=False) + self.cart_aux_loc = [cart_aux_loc[cp] for cp in aux_l_ctr_offsets] + self.sph_aux_loc = [sph_aux_loc[cp] for cp in aux_l_ctr_offsets] + self.aux_angular = [l[0] for l in aux_uniq_l_ctr] + + cart_aux_loc = self.auxmol.ao_loc_nr(cart=True) + sph_aux_loc = self.auxmol.ao_loc_nr(cart=False) + ncart = cart_aux_loc[-1] + nsph = sph_aux_loc[-1] + # inv_idx = 
np.argsort(self.sph_aux_idx, kind='stable').astype(np.int32) + aux_l_ctr_offsets += fake_l_ctr_offsets[-1] + + # hardcoded for grids + aux_pair2bra = [np.arange(aux_l_ctr_offsets[0], aux_l_ctr_offsets[-1])] + aux_pair2ket = [np.ones(ncart) * fake_l_ctr_offsets[0]] + aux_log_qs = [np.ones(ncart)] + + self.aux_log_qs = aux_log_qs.copy() + pair2bra += aux_pair2bra + pair2ket += aux_pair2ket + + uniq_l_ctr = np.concatenate( + [uniq_l_ctr, fake_uniq_l_ctr, aux_uniq_l_ctr]) + l_ctr_offsets = np.concatenate([ + l_ctr_offsets, + fake_l_ctr_offsets[1:], + aux_l_ctr_offsets[1:]]) + + bas_pair2shls = np.hstack( + pair2bra + pair2ket).astype(np.int32).reshape(2, -1) + bas_pairs_locs = np.append(0, np.cumsum( + [x.size for x in pair2bra])).astype(np.int32) + log_qs = log_qs + aux_log_qs + ao_loc = tot_mol.ao_loc_nr(cart=True) + ncptype = len(log_qs) + + self.bpcache = ctypes.POINTER(BasisProdCache)() + if diag_block_with_triu: + scale_shellpair_diag = 1. + else: + scale_shellpair_diag = 0.5 + libgint.GINTinit_basis_prod( + ctypes.byref(self.bpcache), ctypes.c_double(scale_shellpair_diag), + ao_loc.ctypes.data_as(ctypes.c_void_p), + bas_pair2shls.ctypes.data_as(ctypes.c_void_p), + bas_pairs_locs.ctypes.data_as( + ctypes.c_void_p), ctypes.c_int(ncptype), + tot_mol._atm.ctypes.data_as( + ctypes.c_void_p), ctypes.c_int(tot_mol.natm), + tot_mol._bas.ctypes.data_as( + ctypes.c_void_p), ctypes.c_int(tot_mol.nbas), + tot_mol._env.ctypes.data_as(ctypes.c_void_p)) + self.bas_pairs_locs = bas_pairs_locs + ncptype = len(self.log_qs) + if aosym: + self.cp_idx, self.cp_jdx = np.tril_indices(ncptype) + else: + nl = int(round(np.sqrt(ncptype))) + self.cp_idx, self.cp_jdx = np.unravel_index( + np.arange(ncptype), (nl, nl)) + + +def eval_chelpg_layer_gpu(mf, deltaR=0.3, Rhead=2.8, ifqchem=True): + """Cal chelpg charge + + Args: + mf: mean field object in pyscf + deltaR (float, optional): the intervel in the cube. Defaults to 0.3. + Rhead (float, optional): the head length. Defaults to 3.0. 
+ ifqchem (bool, optional): whether use the modification in qchem. Defaults to True. + + Returns: + numpy.array: charges + """ + t0 = time.process_time() + t0w = time.time() + BOHR = 0.52917721092 # Angstroms + atomcoords = mf.mol.atom_coords(unit='B') + dm = cupy.array(mf.make_rdm1()) + RVDW_bondi = {1: 1.1/BOHR, 2: 1.40/BOHR, + 3: 1.82/BOHR, 6: 1.70/BOHR, 7: 1.55/BOHR, 8: 1.52/BOHR, 9: 1.47/BOHR, 10: 1.54/BOHR, + 11: 2.27/BOHR, 12: 1.73/BOHR, 14: 2.10/BOHR, 15: 1.80/BOHR, 16: 1.80/BOHR, 17: 1.75/BOHR, 18: 1.88/BOHR, + 19: 2.75/BOHR, 35: 1.85/BOHR} + + Roff = Rhead/BOHR + Deltar = 0.1 + + # smoothing function + def tau_f(R, Rcut, Roff): + return (R - Rcut)**2 * (3*Roff - Rcut - 2*R) / (Roff - Rcut)**3 + + Rshort = np.array([RVDW_bondi[iatom] for iatom in mf.mol._atm[:, 0]]) + idxxmin = np.argmin(atomcoords[:, 0] - Rshort) + idxxmax = np.argmax(atomcoords[:, 0] + Rshort) + idxymin = np.argmin(atomcoords[:, 1] - Rshort) + idxymax = np.argmax(atomcoords[:, 1] + Rshort) + idxzmin = np.argmin(atomcoords[:, 2] - Rshort) + idxzmax = np.argmax(atomcoords[:, 2] + Rshort) + atomtypes = np.array(mf.mol._atm[:, 0]) + # Generate the grids in the cube + xmin = atomcoords[:, 0].min() - Rhead/BOHR - RVDW_bondi[atomtypes[idxxmin]] + xmax = atomcoords[:, 0].max() + Rhead/BOHR + RVDW_bondi[atomtypes[idxxmax]] + ymin = atomcoords[:, 1].min() - Rhead/BOHR - RVDW_bondi[atomtypes[idxymin]] + ymax = atomcoords[:, 1].max() + Rhead/BOHR + RVDW_bondi[atomtypes[idxymax]] + zmin = atomcoords[:, 2].min() - Rhead/BOHR - RVDW_bondi[atomtypes[idxzmin]] + zmax = atomcoords[:, 2].max() + Rhead/BOHR + RVDW_bondi[atomtypes[idxzmax]] + x = np.arange(xmin, xmax, deltaR/BOHR) + y = np.arange(ymin, ymax, deltaR/BOHR) + z = np.arange(zmin, zmax, deltaR/BOHR) + gridcoords = np.meshgrid(x, y, z) + gridcoords = np.vstack(list(map(np.ravel, gridcoords))).T + + # [natom, ngrids] distance between an atom and a grid + r_pX = scipy.spatial.distance.cdist(atomcoords, gridcoords) + # delete the grids in the vdw 
surface and out the Rhead surface. + # the minimum distance to any atom + Rkmin = (r_pX - np.expand_dims(Rshort, axis=1)).min(axis=0) + Ron = Rshort + Deltar + Rlong = Roff - Deltar + AJk = np.ones(r_pX.shape) # the short-range weight + idx = r_pX < np.expand_dims(Rshort, axis=1) + AJk[idx] = 0 + if ifqchem: + idx2 = (r_pX < np.expand_dims(Ron, axis=1)) * \ + (r_pX >= np.expand_dims(Rshort, axis=1)) + AJk[idx2] = tau_f(r_pX, np.expand_dims(Rshort, axis=1), + np.expand_dims(Ron, axis=1))[idx2] + wLR = 1 - tau_f(Rkmin, Rlong, Roff) # the long-range weight + idx1 = Rkmin < Rlong + idx2 = Rkmin > Roff + wLR[idx1] = 1 + wLR[idx2] = 0 + else: + wLR = np.ones(r_pX.shape[-1]) # the long-range weight + idx = Rkmin > Roff + wLR[idx] = 0 + w = wLR*np.prod(AJk, axis=0) # weight for a specific poing + idx = w <= 1.0E-14 + w = np.delete(w, idx) + r_pX = np.delete(r_pX, idx, axis=1) + gridcoords = np.delete(gridcoords, idx, axis=0) + + ngrids = gridcoords.shape[0] + r_pX = cupy.array(r_pX) + r_pX_potential = 1/r_pX + potential_real = cupy.dot(cupy.array( + mf.mol.atom_charges()), r_pX_potential) + nbatch = 256*256 + + # assert nbatch < ngrids + fmol = pyscf.gto.fakemol_for_charges(gridcoords[:nbatch]) + intopt = VHFOpt(mf.mol, fmol, 'int2e') + for ibatch in range(0, ngrids, nbatch): + max_grid = min(ibatch+nbatch, ngrids) + num_grids = max_grid - ibatch + ptr = intopt.auxmol._atm[:num_grids, gto.PTR_COORD] + intopt.auxmol._env[np.vstack( + (ptr, ptr+1, ptr+2)).T] = gridcoords[ibatch:max_grid] + intopt.build(1e-14, diag_block_with_triu=False, aosym=True) + potential_real[ibatch:max_grid] -= 2.0 * \ + get_j_int3c2e_pass1(intopt, dm)[:num_grids] + + w = cupy.array(w) + r_pX_potential_omega = r_pX_potential*w + GXA = r_pX_potential_omega@r_pX_potential.T + eX = r_pX_potential_omega@potential_real + GXA_inv = cupy.linalg.inv(GXA) + g = GXA_inv@eX + alpha = (g.sum() - mf.mol.charge)/(GXA_inv.sum()) + q = g - alpha*GXA_inv@cupy.ones((mf.mol.natm)) + t6 = time.process_time() + t6w = 
time.time() + print("Total cpu time: ", t6 - t0) + print("Total wall time: ", t6w - t0w) + return q + diff --git a/gpu4pyscf/qmmm/test/test_chelpg.py b/gpu4pyscf/qmmm/test/test_chelpg.py new file mode 100644 index 00000000..836ce15e --- /dev/null +++ b/gpu4pyscf/qmmm/test/test_chelpg.py @@ -0,0 +1,85 @@ +# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import unittest +import numpy as np +import pyscf +from pyscf import lib +from gpu4pyscf.dft import rks +from gpu4pyscf.qmmm import chelpg + +lib.num_threads(8) + +atom = ''' +O 0.0000000000 -0.0000000000 0.1174000000 +H -0.7570000000 -0.0000000000 -0.4696000000 +H 0.7570000000 0.0000000000 -0.4696000000 +''' + +bas='def2tzvpp' +grids_level = 5 + +def setUpModule(): + global mol + mol = pyscf.M(atom=atom, basis=bas, max_memory=32000) + mol.output = '/dev/null' + mol.build() + mol.verbose = 1 + +def tearDownModule(): + global mol + mol.stdout.close() + del mol + +def run_dft_chelpg(xc, deltaR): + mf = rks.RKS(mol, xc=xc) + mf.grids.level = grids_level + e_dft = mf.kernel() + q = chelpg.eval_chelpg_layer_gpu(mf, deltaR=deltaR) + return e_dft, q + + +class KnownValues(unittest.TestCase): + ''' + known values are obtained by Q-Chem + $rem + JOBTYP SP + METHOD b3lyp + BASIS def2-tzvpp + XC_GRID 000099000590 + CHELPG_DX 2 + CHELPG TRUE + SCF_CONVERGENCE 10 + $end + + Ground-State ChElPG Net Atomic 
Charges + + Atom Charge (a.u.) + ---------------------------------------- + 1 O -0.712558 + 2 H 0.356292 + 3 H 0.356266 + ---------------------------------------- + ''' + def test_rks_b3lyp(self): + print('-------- B3LYP -------------') + e_tot, q = run_dft_chelpg('B3LYP', 0.1) + assert np.allclose(e_tot, -76.4666495181) + assert np.allclose(q, np.array([-0.712558, 0.356292, 0.356266])) + + +if __name__ == "__main__": + print("Full Tests for SCF") + unittest.main() \ No newline at end of file From 79041b420b6e77b86ea82edc97789a849ad3af0b Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Fri, 27 Oct 2023 17:53:53 -0700 Subject: [PATCH 15/19] Optimize hessian 2 (#53) * numpy -> cupy for solvent * for linter * remove grad switch from pcm.py * passed flake8 * solvent integrals on GPU * flake8 * compatiable with pyscf-2.4.0 * added solvent * fixed issues for to_cpu * store intermeidate variable on CPU * cupy.einsum -> contract * optimized dft integration for gradient and hessian * remove lprof * fixed a bug in nlc * precompute fxc_x --- gpu4pyscf/df/df_jk.py | 4 +- gpu4pyscf/df/grad/rhf.py | 2 +- gpu4pyscf/df/grad/rks.py | 2 +- gpu4pyscf/df/hessian/rhf.py | 6 +- gpu4pyscf/df/hessian/rks.py | 1 + gpu4pyscf/df/int3c2e.py | 119 +++++++++++++-------- gpu4pyscf/df/tests/test_df_grad.py | 28 ++--- gpu4pyscf/df/tests/test_df_scf.py | 39 +++++-- gpu4pyscf/dft/numint.py | 154 +++++++++++++++------------ gpu4pyscf/dft/rks.py | 8 +- gpu4pyscf/grad/rks.py | 161 ++++++++++++++--------------- gpu4pyscf/hessian/rhf.py | 26 ++--- gpu4pyscf/hessian/rks.py | 124 ++++++++++++---------- setup.py | 2 +- 14 files changed, 377 insertions(+), 299 deletions(-) diff --git a/gpu4pyscf/df/df_jk.py b/gpu4pyscf/df/df_jk.py index 7f6c3933..e27f16f4 100644 --- a/gpu4pyscf/df/df_jk.py +++ b/gpu4pyscf/df/df_jk.py @@ -310,8 +310,8 @@ def get_j(cderi_sparse): rhok = contract('Lij,jk->Lki', cderi, occ_coeff) for i in range(mo1.shape[0]): rhok1 = contract('Lij,jk->Lki', cderi, mo1[i]) - vk[i] += 
contract('Lki,Lkj->ij', rhok, rhok1) - #contract('Lki,Lkj->ij', rhok, rhok1, alpha=1.0, beta=1.0, out=vk[i]) + #vk[i] += contract('Lki,Lkj->ij', rhok, rhok1) + contract('Lki,Lkj->ij', rhok, rhok1, alpha=1.0, beta=1.0, out=vk[i]) occ_coeff = rhok1 = rhok = mo1 = None if with_k: vk = vk + vk.transpose(0,2,1) diff --git a/gpu4pyscf/df/grad/rhf.py b/gpu4pyscf/df/grad/rhf.py index 292d9309..84f5ed23 100644 --- a/gpu4pyscf/df/grad/rhf.py +++ b/gpu4pyscf/df/grad/rhf.py @@ -34,7 +34,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega if mol is None: mol = mf_grad.mol #TODO: dm has to be the SCF density matrix in this version. dm should be # extended to any 1-particle density matrix - + if(dm0 is None): dm0 = mf_grad.base.make_rdm1() mf = mf_grad.base if omega is None: diff --git a/gpu4pyscf/df/grad/rks.py b/gpu4pyscf/df/grad/rks.py index 99a8ccf4..2ef88e86 100644 --- a/gpu4pyscf/df/grad/rks.py +++ b/gpu4pyscf/df/grad/rks.py @@ -129,7 +129,7 @@ def get_j(self, mol=None, dm=None, hermi=0, omega=None): def get_k(self, mol=None, dm=None, hermi=0, omega=None): _, vk, _, vkaux = self.get_jk(mol, dm, with_j=False, omega=omega) return vk, vkaux - + def extra_force(self, atom_id, envs): if self.auxbasis_response: return envs['dvhf'].aux[atom_id] diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py index dfcccddb..46cb38ec 100644 --- a/gpu4pyscf/df/hessian/rhf.py +++ b/gpu4pyscf/df/hessian/rhf.py @@ -42,6 +42,8 @@ from gpu4pyscf.df import int3c2e from gpu4pyscf.lib import logger +BLKSIZE = 128 + def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None, max_memory=4000, verbose=None): e1, ej, ek = _partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ, @@ -94,7 +96,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, # ================================ sorted AO begin =============================================== intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e') - 
intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size=64, group_size_aux=32) + intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE) sph_ao_idx = intopt.sph_ao_idx sph_aux_idx = intopt.sph_aux_idx @@ -416,7 +418,7 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, int2c = cupy.asarray(int2c) # ======================= sorted AO begin ====================================== intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e') - intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size_aux=64, group_size=64) + intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size_aux=BLKSIZE, group_size=BLKSIZE) sph_ao_idx = intopt.sph_ao_idx sph_aux_idx = intopt.sph_aux_idx rev_ao_idx = np.argsort(intopt.sph_ao_idx) diff --git a/gpu4pyscf/df/hessian/rks.py b/gpu4pyscf/df/hessian/rks.py index a90b93e7..d8986f1f 100644 --- a/gpu4pyscf/df/hessian/rks.py +++ b/gpu4pyscf/df/hessian/rks.py @@ -65,6 +65,7 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, de2 -= (alpha - hyb) * ek_lr max_memory = None + t1 = log.timer_debug1('computing ej, ek', *t1) veff_diag = rks_hess._get_vxc_diag(hessobj, mo_coeff, mo_occ, max_memory) t1 = log.timer_debug1('computing veff_diag', *t1) aoslices = mol.aoslice_by_atom() diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py index 36268e08..fc755a27 100644 --- a/gpu4pyscf/df/int3c2e.py +++ b/gpu4pyscf/df/int3c2e.py @@ -517,6 +517,7 @@ def loop_int3c2e_general(intopt, ip_type='', omega=None, stream=None): def loop_aux_jk(intopt, ip_type='', omega=None, stream=None): ''' + **** deprecated ********** loop over all int3c2e blocks - outer loop for k - inner loop for ij pair @@ -738,26 +739,37 @@ def get_int3c2e_ip1_vjk(intopt, rhoj, rhok, dm0_tag, aoslices, with_k=True, omeg vj1 = cupy.zeros([natom,3,nao_sph,nocc]) vk1 = cupy.zeros([natom,3,nao_sph,nocc]) - for aux_id, 
int3c_blk in loop_aux_jk(intopt, ip_type='ip1', omega=omega): - k0, k1 = intopt.sph_aux_loc[aux_id], intopt.sph_aux_loc[aux_id+1] - vj1_buf += contract('xpji,p->xij', int3c_blk, rhoj[k0:k1]) - - rhok_tmp = cupy.asarray(rhok[k0:k1]) - if with_k: - rhok0_slice = contract('pio,Jo->piJ', rhok_tmp, orbo) * 2 - vk1_buf += contract('xpji,plj->xil', int3c_blk, rhok0_slice) - - rhoj0 = contract('xpji,ij->xpi', int3c_blk, dm0_tag) - vj1_ao = contract('pjo,xpi->xijo', rhok_tmp, rhoj0) - vj1 += 2.0*contract('xiko,ia->axko', vj1_ao, ao2atom) + ncp_ij = len(intopt.log_qs) + count = 0 + for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ip1', omega=omega): + vj1_buf[:,i0:i1,j0:j1] += contract('xpji,p->xij', int3c_blk, rhoj[k0:k1]) + # initialize intermediate variables + if count % ncp_ij == 0: + rhoj0 = cupy.zeros([3,k1-k0,nao_sph]) + rhok_tmp = cupy.asarray(rhok[k0:k1]) + vj1_ao = cupy.zeros([3,nao_sph,nao_sph,nocc]) + if with_k: + rhok0_slice = contract('pio,Jo->piJ', rhok_tmp, orbo) * 2 + rhok0 = contract('pli,lo->poi', rhok0_slice, orbo) + int3c_ip1_occ = cupy.zeros([3,k1-k0,nao_sph,nocc]) + vk1_ao = cupy.zeros([3,nao_sph,nao_sph,nocc]) + + # contraction + rhoj0[:,:,i0:i1] += contract('xpji,ij->xpi', int3c_blk, dm0_tag[i0:i1,j0:j1]) if with_k: - int3c_ip1_occ = contract('xpji,jo->xpio', int3c_blk, orbo) - vk1_ao = contract('xpio,pki->xiko', int3c_ip1_occ, rhok0_slice) - vk1 += contract('xiko,ia->axko', vk1_ao, ao2atom) + int3c_ip1_occ[:,:,i0:i1] += contract('xpji,jo->xpio', int3c_blk, orbo[j0:j1]) + vk1_ao[:,i0:i1,j0:j1] += contract('xpji,poi->xijo', int3c_blk, rhok0[:,:,i0:i1]) + vk1_buf[:,i0:i1] += contract('xpji,plj->xil', int3c_blk, rhok0_slice[:,:,j0:j1]) + + # reduction + if (count+1) % ncp_ij == 0: + vj1_ao += contract('pjo,xpi->xijo', rhok_tmp, rhoj0) + vj1 += 2.0*contract('xiko,ia->axko', vj1_ao, ao2atom) + if with_k: + vk1_ao += contract('xpio,pki->xiko', int3c_ip1_occ, rhok0_slice) + vk1 += contract('xiko,ia->axko', vk1_ao, ao2atom) + count += 
1 - rhok0 = contract('pli,lo->poi', rhok0_slice, orbo) - vk1_ao = contract('xpji,poi->xijo', int3c_blk, rhok0) - vk1 += contract('xiko,ia->axko', vk1_ao, ao2atom) return vj1_buf, vk1_buf, vj1, vk1 def get_int3c2e_ip2_vjk(intopt, rhoj, rhok, dm0_tag, auxslices, with_k=True, omega=None): @@ -771,25 +783,36 @@ def get_int3c2e_ip2_vjk(intopt, rhoj, rhok, dm0_tag, auxslices, with_k=True, ome nocc = orbo.shape[1] vj1 = cupy.zeros([natom,3,nao_sph,nocc]) vk1 = cupy.zeros([natom,3,nao_sph,nocc]) - for aux_id, int3c_blk in loop_aux_jk(intopt, ip_type='ip2', omega=omega): - k0, k1 = intopt.sph_aux_loc[aux_id], intopt.sph_aux_loc[aux_id+1] - wj2 = contract('xpji,ji->xp', int3c_blk, dm0_tag) - wk2_P__ = contract('xpji,jo->xpio', int3c_blk, orbo) - - rhok_tmp = cupy.asarray(rhok[k0:k1]) - vj1_tmp = -contract('pio,xp->xpio', rhok_tmp, wj2) - vj1_tmp -= contract('xpio,p->xpio', wk2_P__, rhoj[k0:k1]) - - vj1 += contract('xpio,pa->axio', vj1_tmp, aux2atom[k0:k1]) - if with_k: - rhok0_slice = contract('pio,jo->pij', rhok_tmp, orbo) - vk1_tmp = -contract('xpjo,pij->xpio', wk2_P__, rhok0_slice) * 2 - rhok0_oo = contract('pio,ir->pro', rhok_tmp, orbo) - vk1_tmp -= contract('xpio,pro->xpir', wk2_P__, rhok0_oo) * 2 - - vk1 += contract('xpir,pa->axir', vk1_tmp, aux2atom[k0:k1]) - wj2 = wk2_P__ = rhok0_slice = rhok0_oo = None + ncp_ij = len(intopt.log_qs) + count = 0 + for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ip2', omega=omega): + # initialize intermediate variables + if count % ncp_ij == 0: + wj2 = cupy.zeros([3,k1-k0]) + wk2_P__ = cupy.zeros([3,k1-k0,nao_sph,nocc]) + + # contraction + wj2 += contract('xpji,ji->xp', int3c_blk, dm0_tag[j0:j1,i0:i1]) + wk2_P__[:,:,i0:i1] += contract('xpji,jo->xpio', int3c_blk, orbo[j0:j1]) + + # reduction + if (count+1) % ncp_ij == 0: + rhok_tmp = cupy.asarray(rhok[k0:k1]) + vj1_tmp = -contract('pio,xp->xpio', rhok_tmp, wj2) + vj1_tmp -= contract('xpio,p->xpio', wk2_P__, rhoj[k0:k1]) + + vj1 += contract('xpio,pa->axio', 
vj1_tmp, aux2atom[k0:k1]) + if with_k: + rhok0_slice = contract('pio,jo->pij', rhok_tmp, orbo) + vk1_tmp = -contract('xpjo,pij->xpio', wk2_P__, rhok0_slice) * 2 + + rhok0_oo = contract('pio,ir->pro', rhok_tmp, orbo) + vk1_tmp -= contract('xpio,pro->xpir', wk2_P__, rhok0_oo) * 2 + + vk1 += contract('xpir,pa->axir', vk1_tmp, aux2atom[k0:k1]) + wj2 = wk2_P__ = rhok0_slice = rhok0_oo = None + count += 1 return vj1, vk1 def get_int3c2e_ip1_wjk(intopt, dm0_tag, with_k=True, omega=None): @@ -800,7 +823,8 @@ def get_int3c2e_ip1_wjk(intopt, dm0_tag, with_k=True, omega=None): naux_sph = len(intopt.sph_aux_idx) orbo = cupy.asarray(dm0_tag.occ_coeff, order='C') nocc = orbo.shape[1] - wj = cupy.empty([nao_sph,naux_sph,3]) + + wj = cupy.zeros([nao_sph,naux_sph,3]) avail_mem = get_avail_mem() use_gpu_memory = True if nao_sph*naux_sph*nocc*3*8 < 0.4*avail_mem: @@ -816,14 +840,19 @@ def get_int3c2e_ip1_wjk(intopt, dm0_tag, with_k=True, omega=None): wk = np.ndarray([nao_sph,naux_sph,nocc,3], dtype=np.float64, order='C', buffer=mem) # TODO: async data transfer - for aux_id, int3c_blk in loop_aux_jk(intopt, ip_type='ip1', omega=omega): - k0, k1 = intopt.sph_aux_loc[aux_id], intopt.sph_aux_loc[aux_id+1] - wj[:,k0:k1] = contract('xpji,ij->ipx', int3c_blk, dm0_tag) - wk_tmp = contract('xpji,jo->ipox', int3c_blk, orbo) - if use_gpu_memory: - wk[:,k0:k1] = wk_tmp - else: - wk[:,k0:k1] = wk_tmp.get() + ncp_ij = len(intopt.log_qs) + count = 0 + for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ip1', omega=omega): + if count % ncp_ij == 0: + wk_tmp = cupy.zeros([nao_sph, k1-k0, nocc, 3]) + wj[i0:i1,k0:k1] += contract('xpji,ij->ipx', int3c_blk, dm0_tag[i0:i1,j0:j1]) + wk_tmp[i0:i1,:] += contract('xpji,jo->ipox', int3c_blk, orbo[j0:j1]) + if (count+1) % ncp_ij == 0: + if use_gpu_memory: + wk[:,k0:k1] = wk_tmp + else: + wk[:,k0:k1] = wk_tmp.get() + count += 1 return wj, wk def get_int3c2e_ip2_wjk(intopt, dm0_tag, with_k=True, omega=None): diff --git 
a/gpu4pyscf/df/tests/test_df_grad.py b/gpu4pyscf/df/tests/test_df_grad.py index ade19e87..9156c2c0 100644 --- a/gpu4pyscf/df/tests/test_df_grad.py +++ b/gpu4pyscf/df/tests/test_df_grad.py @@ -40,25 +40,29 @@ auxbasis0='def2-tzvpp-jkfit' disp0='d3bj' grids_level = 6 +nlcgrids_level = 3 def setUpModule(): global mol mol = pyscf.M(atom=atom, basis=bas0, max_memory=32000) mol.output = '/dev/null' mol.build() mol.verbose = 1 - + eps = 1.0/1024 def tearDownModule(): global mol mol.stdout.close() del mol - + def _check_grad(grid_response=False, xc=xc0, disp=disp0, tol=1e-6): mf = rks.RKS(mol, xc=xc, disp=disp).density_fit(auxbasis=auxbasis0) mf.grids.level = grids_level - mf.conv_tol = 1e-12 + mf.nlcgrids.level = nlcgrids_level + mf.conv_tol = 1e-10 + mf.verbose = 1 e_tot = mf.kernel() + g = mf.nuc_grad_method() g.auxbasis_response = True g.grid_response = grid_response @@ -94,39 +98,39 @@ def _check_grad(grid_response=False, xc=xc0, disp=disp0, tol=1e-6): assert(cupy.linalg.norm(g_analy - grad_fd) < tol) class KnownValues(unittest.TestCase): - + def test_grad_with_grids_response(self): print("-----testing DF DFT gradient with grids response----") _check_grad(grid_response=True) - + def test_grad_without_grids_response(self): print('-----testing DF DFT gradient without grids response----') _check_grad(grid_response=False) - + def test_grad_lda(self): print("-----LDA testing-------") _check_grad(xc='LDA', disp=None, tol=1e-6) - + def test_grad_gga(self): print('-----GGA testing-------') _check_grad(xc='PBE', disp=None, tol=1e-6) - + def test_grad_hybrid(self): print('------hybrid GGA testing--------') _check_grad(xc='B3LYP', disp=None, tol=1e-6) - + def test_grad_mgga(self): print('-------mGGA testing-------------') _check_grad(xc='m06', disp=None, tol=1e-4) - + def test_grad_rsh(self): print('--------RSH testing-------------') _check_grad(xc='wb97', disp=None, tol=1e-4) - + def test_grad_nlc(self): print('--------nlc testing-------------') 
_check_grad(xc='HYB_MGGA_XC_WB97M_V', disp=None, tol=1e-6) - + if __name__ == "__main__": print("Full Tests for DF Gradient") unittest.main() diff --git a/gpu4pyscf/df/tests/test_df_scf.py b/gpu4pyscf/df/tests/test_df_scf.py index 8c686b63..eef66e16 100644 --- a/gpu4pyscf/df/tests/test_df_scf.py +++ b/gpu4pyscf/df/tests/test_df_scf.py @@ -47,7 +47,9 @@ def tearDownModule(): def run_dft(xc): mf = rks.RKS(mol, xc=xc).density_fit(auxbasis='def2-tzvpp-jkfit') - mf.grids.level = grids_level + mf.grids.atom_grid = (99,590) + mf.nlcgrids.atom_grid = (50,194) + mf.conv_tol = 1e-10 e_dft = mf.kernel() return e_dft @@ -56,39 +58,54 @@ class KnownValues(unittest.TestCase): known values are obtained by Q-Chem ''' def test_rhf(self): + print('------- HF -----------------') mf = scf.RHF(mol).density_fit(auxbasis='def2-tzvpp-jkfit') e_tot = mf.kernel() - assert np.allclose(e_tot, -76.0624582299) + e_qchem = -76.0624582299 + print(f'diff from qchem {e_tot - e_qchem}') + assert np.allclose(e_tot, e_qchem) def test_rks_lda(self): print('------- LDA ----------------') - e_tot = run_dft("LDA_X,LDA_C_VWN") - assert np.allclose(e_tot, -75.9046407209) + e_tot = run_dft("LDA,VWN5") + e_qchem = -75.9046407209 + print(f'diff from qchem {e_tot - e_qchem}') + assert np.allclose(e_tot, e_qchem) def test_rks_pbe(self): print('------- PBE ----------------') e_tot = run_dft('PBE') - assert np.allclose(e_tot, -76.3800181250) + e_qchem = -76.3800181250 + print(f'diff from qchem {e_tot - e_qchem}') + assert np.allclose(e_tot, e_qchem) def test_rks_b3lyp(self): print('-------- B3LYP -------------') e_tot = run_dft('B3LYP') - assert np.allclose(e_tot, -76.4666493796) + e_qchem = -76.4666493796 + print(f'diff from qchem {e_tot - e_qchem}') + assert np.allclose(e_tot, e_qchem) def test_rks_m06(self): print('--------- M06 --------------') e_tot = run_dft("M06") - assert np.allclose(e_tot, -76.4265841359) + e_qchem = -76.4265841359 + print(f'diff from qchem {e_tot - e_qchem}') + assert np.allclose(e_tot, 
e_qchem) def test_rks_wb97(self): print('-------- wB97 --------------') e_tot = run_dft("HYB_GGA_XC_WB97") - assert np.allclose(e_tot, -76.4486277053) + e_qchem = -76.4486277053 + print(f'diff from qchem {e_tot - e_qchem}') + assert np.allclose(e_tot, e_qchem) - def test_rks_wb97(self): - print('-------- wB97 --------------') + def test_rks_wb97m_v(self): + print('-------- wB97m-v --------------') e_tot = run_dft("HYB_MGGA_XC_WB97M_V") - assert np.allclose(e_tot, -76.4334567297) + e_qchem = -76.4334567297 + print(f'diff from qchem {e_tot - e_qchem}') + assert np.allclose(e_tot, e_qchem) def test_to_cpu(self): mf = scf.RHF(mol).density_fit().to_cpu() diff --git a/gpu4pyscf/dft/numint.py b/gpu4pyscf/dft/numint.py index 2f4b3811..cc9c0e6b 100644 --- a/gpu4pyscf/dft/numint.py +++ b/gpu4pyscf/dft/numint.py @@ -160,7 +160,7 @@ def eval_rho2(mol, ao, mo_coeff, mo_occ, non0tab=None, xctype='LDA', shls_slice = (0, mol.nbas) ao_loc = mol.ao_loc_nr() - + #cpos = cupy.einsum('ij,j->ij', mo_coeff[:,mo_occ>0], cupy.sqrt(mo_occ[mo_occ>0])) cpos = mo_coeff[:,mo_occ>0] * cupy.sqrt(mo_occ[mo_occ>0]) if xctype == 'LDA' or xctype == 'HF': @@ -228,10 +228,10 @@ def eval_rho3(mol, ao, c0, mo1, non0tab=None, xctype='LDA', elif xctype in ('GGA', 'NLC'): rho = cupy.empty((4,ngrids)) c_0 = contract('nig,io->nog', ao, cpos1) - rho[0] = _contract_rho(c0[0], c_0[0]) + _contract_rho(c0[0], c_0[0], rho=rho[0]) for i in range(1, 4): - rho[i] = _contract_rho(c_0[0], c0[i]) - rho[i]+= _contract_rho(c0[0], c_0[i]) + _contract_rho(c_0[0], c0[i], rho=rho[i]) + rho[i] += _contract_rho(c0[0], c_0[i]) rho *= 2.0 else: # meta-GGA # TODO: complete this @@ -382,14 +382,15 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, dm_shape = dms.shape dms = [coeff @ dm @ coeff.T for dm in dms.reshape(-1,nao0,nao0)] nset = len(dms) - ao_loc = mol.ao_loc_nr() - - if mo_coeff is not None: + + if mo_coeff is not None: mo_coeff = coeff @ mo_coeff nelec = cupy.zeros(nset) excsum = cupy.zeros(nset) vmat = 
cupy.zeros((nset, nao, nao)) + ''' + ao_loc = mol.ao_loc_nr() if USE_SPARSITY == 1: nbins = NBINS * 2 - int(NBINS * np.log(ni.cutoff) / np.log(grids.cutoff)) pair2shls, pairs_locs = _make_pairs2shls_idx(ni.pair_mask, opt.l_bas_offsets, hermi) @@ -398,89 +399,69 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, opt.l_bas_offsets) else: pair2shls_full, pairs_locs_full = pair2shls, pairs_locs - + ''' release_gpu_stack() if xctype == 'LDA': ao_deriv = 0 else: ao_deriv = 1 - - block_id = 0 - for ao, sindex, weight, _ in ni.block_loop(mol, grids, nao, ao_deriv, blksize=ni.grid_blksize): - if ni.grid_blksize is None: - ni.grid_blksize = weight.shape[0] - - # cache ao indices - if block_id not in ni.non0ao_idx: - t0 = (logger.process_clock(), logger.perf_counter()) - if xctype == 'LDA': - mask = cupy.any(cupy.abs(ao) > AO_THRESHOLD, axis=[1]) - idx = cupy.argwhere(mask).astype(np.int32)[:,0] - ao_mask = ao[idx,:] - else: - mask = cupy.any(cupy.abs(ao) > AO_THRESHOLD, axis=[0,2]) - idx = cupy.argwhere(mask).astype(np.int32)[:,0] - ao_mask = ao[:,idx,:] - ni.non0ao_idx[block_id] = idx - t1 = log.timer_debug1('initialize ao sparsity', *t0) - else: - idx = ni.non0ao_idx[block_id] - if xctype == 'LDA': - ao_mask = ao[idx,:] - else: - ao_mask = ao[:,idx,:] - block_id += 1 + + for ao_mask, idx, weight, _ in ni.block_loop(mol, grids, nao, ao_deriv): for i in range(nset): t0 = (logger.process_clock(), logger.perf_counter()) #rho = eval_rho(opt.mol, ao, dms[i], xctype=xctype, hermi=1) #rho = _make_rho(ao, dms[i], xctype=xctype) if mo_coeff is None: - rho = eval_rho(mol, ao, dms[i], xctype=xctype, hermi=1) + rho = eval_rho(mol, ao_mask, dms[i][np.ix_(idx,idx)], xctype=xctype, hermi=1) else: mo_coeff_mask = mo_coeff[idx,:] rho = eval_rho2(mol, ao_mask, mo_coeff_mask, mo_occ, None, xctype) - t1 = log.timer_debug1('eval rho', *t0) exc, vxc = ni.eval_xc_eff(xc_code, rho, deriv=1, xctype=xctype)[:2] vxc = cupy.asarray(vxc, order='C') exc = cupy.asarray(exc, order='C') - 
t1 = log.timer_debug1('eval vxc', *t1) + t1 = log.timer_debug1('eval vxc', *t0) if xctype == 'LDA': den = rho * weight wv = weight * vxc[0] + ''' if USE_SPARSITY == 0: vmat[i] += ao.dot(_scale_ao(ao, wv).T) elif USE_SPARSITY == 1: _dot_ao_ao_sparse(ao, ao, wv, nbins, sindex, ao_loc, pair2shls_full, pairs_locs_full, vmat[i]) - elif USE_SPARSITY == 2: + ''' + if USE_SPARSITY == 2: aow = _scale_ao(ao_mask, wv) # vmat[i][cupy.ix_(mask, mask)] += ao_mask.dot(aow.T) add_sparse(vmat[i], ao_mask.dot(aow.T), idx) else: - raise NotImplementedError('Not implemented yet') + raise NotImplementedError(f'USE_SPARSITY = {USE_SPARSITY} is not implemented') elif xctype == 'GGA': den = rho[0] * weight wv = vxc * weight wv[0] *= .5 + ''' if USE_SPARSITY == 0: vmat[i] += ao[0].dot(_scale_ao(ao, wv).T) elif USE_SPARSITY == 1: aow = _scale_ao(ao, wv) _dot_ao_ao_sparse(ao[0], aow, None, nbins, sindex, ao_loc, pair2shls_full, pairs_locs_full, vmat[i]) - elif USE_SPARSITY == 2: + ''' + if USE_SPARSITY == 2: aow = _scale_ao(ao_mask, wv) #vmat[i][cupy.ix_(mask, mask)] += ao_mask[0].dot(aow.T) add_sparse(vmat[i], ao_mask[0].dot(aow.T), idx) else: - raise NotImplementedError('Not implemented yet') + raise NotImplementedError(f'USE_SPARSITY = {USE_SPARSITY} is not implemented') elif xctype == 'NLC': raise NotImplementedError('NLC') elif xctype == 'MGGA': den = rho[0] * weight wv = vxc * weight wv[[0, 4]] *= .5 # *.5 for v+v.T + ''' if USE_SPARSITY == 0: aow = _scale_ao(ao[:4], wv[:4]) vmat[i] += ao[0].dot(aow.T) @@ -490,12 +471,15 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, pair2shls_full, pairs_locs_full, vmat[i]) _tau_dot_sparse(ao, ao, wv[4], nbins, sindex, ao_loc, pair2shls_full, pairs_locs_full, vmat[i]) - else: + ''' + if USE_SPARSITY == 2: aow = _scale_ao(ao_mask, wv[:4]) vtmp = ao_mask[0].dot(aow.T) vtmp+= _tau_dot(ao_mask, ao_mask, wv[4]) #vmat[i][cupy.ix_(mask, mask)] += vtmp add_sparse(vmat[i], vtmp, idx) + else: + raise NotImplementedError(f'USE_SPARSITY = 
{USE_SPARSITY} is not implemented') elif xctype == 'HF': pass else: @@ -503,7 +487,6 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, #nelec[i] += den.sum() excsum[i] += cupy.dot(den, exc)[0] t1 = log.timer_debug1('integration', *t1) - ao = None vmat = contract('pi,npq->niq', coeff, vmat) vmat = contract('qj,niq->nij', coeff, vmat) @@ -666,8 +649,8 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi= # AO basis -> gdftopt AO basis with_mocc = hasattr(dms, 'mo1') if with_mocc: - mo1 = cupy.einsum('nio,pi->npo', dms.mo1, coeff) * 2.0**0.5 - occ_coeff = cupy.einsum('io,pi->po', dms.occ_coeff, coeff) * 2.0**0.5 + mo1 = contract('nio,pi->npo', dms.mo1, coeff) * 2.0**0.5 + occ_coeff = contract('io,pi->po', dms.occ_coeff, coeff) * 2.0**0.5 dms = contract('nij,qj->niq', dms, coeff) dms = contract('pi,niq->npq', coeff, dms) nset = len(dms) @@ -682,43 +665,52 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi= t0 = (logger.process_clock(), logger.perf_counter()) for ao, mask, weights, coords in ni.block_loop(opt.mol, grids, nao, ao_deriv): p0, p1 = p1, p1+len(weights) - # precompute the first half + # precompute molecular orbitals if with_mocc: + occ_coeff_mask = occ_coeff[mask] if xctype == 'LDA': - c0 = _dot_ao_dm(mol, ao, occ_coeff, None, None, None) + c0 = _dot_ao_dm(mol, ao, occ_coeff_mask, None, None, None) elif xctype == "GGA": c0 = cupy.empty([4,occ_coeff.shape[1],p1-p0]) for i in range(4): - c0[i] = _dot_ao_dm(mol, ao[i], occ_coeff, None, None, None) + c0[i] = _dot_ao_dm(mol, ao[i], occ_coeff_mask, None, None, None) else: # mgga c0 = cupy.empty([4,occ_coeff.shape[1],p1-p0]) for i in range(4): - c0[i] = _dot_ao_dm(mol, ao[i], occ_coeff, None, None, None) - # loop the second half + c0[i] = _dot_ao_dm(mol, ao[i], occ_coeff_mask, None, None, None) + # precompute fxc_w + if xctype == 'LDA': + fxc_w = fxc[0,0,p0:p1] * weights + else: + fxc_w = fxc[:,:,p0:p1] * weights + # loop perturbed 
molecular orbitals for i in range(nset): if with_mocc: - rho1 = eval_rho3(opt.mol, ao, c0, mo1[i], xctype=xctype, with_lapl=False) + rho1 = eval_rho3(opt.mol, ao, c0, mo1[i][mask], xctype=xctype, with_lapl=False) else: - rho1 = eval_rho(opt.mol, ao, dms[i], xctype=xctype, hermi=hermi, with_lapl=False) + rho1 = eval_rho(opt.mol, ao, dms[i][np.ix_(mask,mask)], xctype=xctype, hermi=hermi, with_lapl=False) if xctype == 'LDA': - wv = rho1 * fxc[0,0,p0:p1] * weights - vmat[i] += ao.dot(_scale_ao(ao, wv).T) + wv = rho1 * fxc_w + vmat_tmp = ao.dot(_scale_ao(ao, wv).T) + add_sparse(vmat[i], vmat_tmp, mask) elif xctype == 'GGA': - wv = cupy.einsum('xg,xyg->yg', rho1, fxc[:,:,p0:p1]) * weights + wv = cupy.einsum('xg,xyg->yg', rho1, fxc_w) wv[0] *= .5 - vmat[i] += ao[0].dot(_scale_ao(ao, wv).T) + vmat_tmp = ao[0].dot(_scale_ao(ao, wv).T) + add_sparse(vmat[i], vmat_tmp, mask) elif xctype == 'NLC': raise NotImplementedError('NLC') else: - wv = cupy.einsum('xg,xyg->yg', rho1, fxc[:,:,p0:p1]) * weights + wv = cupy.einsum('xg,xyg->yg', rho1, fxc_w) wv[[0, 4]] *= .5 - vmat[i] += ao[0].dot(_scale_ao(ao[:4], wv[:4]).T) - vmat[i] += _tau_dot(ao, ao, wv[4]) + vmat_tmp = ao[0].dot(_scale_ao(ao[:4], wv[:4]).T) + vmat_tmp+= _tau_dot(ao, ao, wv[4]) + add_sparse(vmat[i], vmat_tmp, mask) t0 = log.timer_debug1('vxc', *t0) ao = c0 = rho1 = None - vmat = cupy.einsum('pi,npq->niq', coeff, vmat) - vmat = cupy.einsum('qj,niq->nij', coeff, vmat) + vmat = contract('pi,npq->niq', coeff, vmat) + vmat = contract('qj,niq->nij', coeff, vmat) if xctype != 'LDA': #transpose_sum(vmat) vmat = vmat + vmat.transpose([0,2,1]) @@ -872,7 +864,7 @@ def nr_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, vvrho = [] for ao, mask, weight, coords \ in ni.block_loop(mol, grids, nao, ao_deriv, max_memory=max_memory): - rho = eval_rho(opt.mol, ao, dms[0], xctype='GGA', hermi=1) + rho = eval_rho(opt.mol, ao, dms[0][np.ix_(mask,mask)], xctype='GGA', hermi=1) vvrho.append(rho) rho = cupy.hstack(vvrho) exc = 0 @@ 
-895,7 +887,9 @@ def nr_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, wv = vv_vxc[:,p0:p1] * weight wv[0] *= .5 aow = _scale_ao(ao, wv) - vmat += ao[0].dot(aow.T) + #vmat += ao[0].dot(aow.T) + add_sparse(vmat, ao[0].dot(aow.T), mask) + vmat = vmat + vmat.T vmat = contract('pi,pq->iq', coeff, vmat) vmat = contract('qj,iq->ij', coeff, vmat) @@ -1058,7 +1052,7 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000, if blksize is None: cupy.get_default_memory_pool().free_all_blocks() mem_avail = get_avail_mem() - blksize = int((mem_avail*.2/8/((comp+1)*nao + extra) - nao*2)/ ALIGNED) * ALIGNED + blksize = int((mem_avail*.2/8/((comp+1)*nao + extra))/ ALIGNED) * ALIGNED blksize = min(blksize, MIN_BLK_SIZE) log.debug1('Available GPU mem %f Mb, block_size %d', mem_avail/1e6, blksize) if blksize < ALIGNED: @@ -1071,14 +1065,37 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000, mol = opt.mol with opt.gdft_envs_cache(): + block_id = 0 for ip0, ip1 in lib.prange(0, ngrids, blksize): coords = grids.coords[ip0:ip1] weight = grids.weights[ip0:ip1] - sindex = None#ni.screen_index[ip0//GRID_BLKSIZE:] + #sindex = ni.screen_index[ip0//GRID_BLKSIZE:] t0 = (logger.process_clock(), logger.perf_counter()) ao = eval_ao(ni, mol, coords, deriv) log.timer_debug1('eval ao', *t0) - yield ao, sindex, weight, coords + + # cache ao indices + if (deriv, block_id, blksize, ngrids) not in ni.non0ao_idx: + t0 = (logger.process_clock(), logger.perf_counter()) + if deriv == 0: + mask = cupy.any(cupy.abs(ao) > AO_THRESHOLD, axis=[1]) + idx = cupy.argwhere(mask).astype(np.int32)[:,0] + ao_mask = ao[idx,:] + else: + mask = cupy.any(cupy.abs(ao) > AO_THRESHOLD, axis=[0,2]) + idx = cupy.argwhere(mask).astype(np.int32)[:,0] + ao_mask = ao[:,idx,:] + ni.non0ao_idx[deriv, block_id, blksize, ngrids] = idx + log.timer_debug1('initialize ao sparsity', *t0) + else: + idx = ni.non0ao_idx[deriv, block_id, blksize, ngrids] + if deriv == 0: + ao_mask = ao[idx,:] + 
else: + ao_mask = ao[:,idx,:] + block_id += 1 + log.timer_debug1('eval rho', *t0) + yield ao_mask, idx, weight, coords class NumInt(numint.NumInt): from gpu4pyscf.lib.utils import to_cpu, to_gpu, device @@ -1149,11 +1166,12 @@ def _make_pairs2shls_idx(pair_mask, l_bas_loc, hermi=0): pair2bra + pair2ket).astype(np.int32).reshape(2,-1) return bas_pair2shls, bas_pairs_locs -def _contract_rho(bra, ket): +def _contract_rho(bra, ket, rho=None): if bra.flags.c_contiguous and ket.flags.c_contiguous: assert bra.shape == ket.shape nao, ngrids = bra.shape - rho = cupy.empty(ngrids) + if rho is None: + rho = cupy.empty(ngrids) stream = cupy.cuda.get_current_stream() err = libgdft.GDFTcontract_rho( ctypes.cast(stream.ptr, ctypes.c_void_p), diff --git a/gpu4pyscf/dft/rks.py b/gpu4pyscf/dft/rks.py index 31e67cc8..ea12f511 100644 --- a/gpu4pyscf/dft/rks.py +++ b/gpu4pyscf/dft/rks.py @@ -79,7 +79,6 @@ def initialize_grids(ks, mol=None, dm=None): # Filter grids the first time setup grids ks.grids = prune_small_rho_grids_(ks, ks.mol, dm, ks.grids) t0 = logger.timer_debug1(ks, 'setting up grids', *t0) - is_nlc = ks.nlc or ks._numint.libxc.is_nlc(ks.xc) if is_nlc and ks.nlcgrids.coords is None: if ks.nlcgrids.coords is None: @@ -235,7 +234,14 @@ def __init__(self, mol, xc='LDA,VWN', disp=None): self._numint = numint.NumInt(xc=xc) self.disp = disp self.screen_tol = 1e-14 + + grids_level = self.grids.level self.grids = gen_grid.Grids(mol) + self.grids.level = grids_level + + nlcgrids_level = self.nlcgrids.level + self.nlcgrids = gen_grid.Grids(mol) + self.nlcgrids.level = nlcgrids_level def get_dispersion(self): if self.disp is None: diff --git a/gpu4pyscf/grad/rks.py b/gpu4pyscf/grad/rks.py index 008baa40..f22e8184 100644 --- a/gpu4pyscf/grad/rks.py +++ b/gpu4pyscf/grad/rks.py @@ -108,7 +108,6 @@ def _get_veff(ks_grad, mol=None, dm=None): def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, max_memory=2000, verbose=None): - log = logger.new_logger(mol, verbose) xctype = 
ni._xc_type(xc_code) opt = getattr(ni, 'gdftopt', None) if opt is None: @@ -116,93 +115,79 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, opt = ni.gdftopt mo_occ = cupy.asarray(dms.mo_occ) mo_coeff = cupy.asarray(dms.mo_coeff) - coeff = cupy.asarray(opt.coeff) nao, nao0 = coeff.shape dms = cupy.asarray(dms) dms = [cupy.einsum('pi,ij,qj->pq', coeff, dm, coeff) for dm in dms.reshape(-1,nao0,nao0)] mo_coeff = coeff @ mo_coeff - nset = len(dms) - - with opt.gdft_envs_cache(): - if xctype == 'LDA': - ao_deriv = 1 - else: - ao_deriv = 2 - mem_avail = get_avail_mem() - comp = (ao_deriv+1)*(ao_deriv+2)*(ao_deriv+3)//6 - block_size = int((mem_avail*.4/8/(comp+1)/nao - 3*nao*2)/ ALIGNED) * ALIGNED - block_size = min(block_size, MIN_BLK_SIZE) - log.debug1('Available GPU mem %f Mb, block_size %d', mem_avail/1e6, block_size) + nset = len(dms) + assert nset == 1 - if block_size < ALIGNED: - raise RuntimeError('Not enough GPU memory') + if xctype == 'LDA': + ao_deriv = 1 + else: + ao_deriv = 2 + + vmat = cupy.zeros((nset,3,nao,nao)) + if xctype == 'LDA': + ao_deriv = 1 + for ao_mask, idx, weight, _ in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory): + for idm in range(nset): + mo_coeff_mask = mo_coeff[idx,:] + rho = numint.eval_rho2(opt.mol, ao_mask[0], mo_coeff_mask, mo_occ, None, xctype) + vxc = ni.eval_xc_eff(xc_code, rho, 1, xctype=xctype)[1] + wv = weight * vxc[0] + aow = numint._scale_ao(ao_mask[0], wv) + vtmp = _d1_dot_(ao_mask[1:4], aow.T) + #idx = cupy.ix_(mask, mask) + #vmat[idm][0][idx] += vtmp[0] + #vmat[idm][1][idx] += vtmp[1] + #vmat[idm][2][idx] += vtmp[2] + add_sparse(vmat[idm][0], vtmp[0], idx) + add_sparse(vmat[idm][1], vtmp[1], idx) + add_sparse(vmat[idm][2], vtmp[2], idx) + elif xctype == 'GGA': + ao_deriv = 2 + for ao_mask, idx, weight, _ in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory): + for idm in range(nset): + mo_coeff_mask = mo_coeff[idx,:] + rho = numint.eval_rho2(opt.mol, ao_mask[:4], mo_coeff_mask, mo_occ, 
None, xctype) + vxc = ni.eval_xc_eff(xc_code, rho, 1, xctype=xctype)[1] + wv = weight * vxc + wv[0] *= .5 + vtmp = _gga_grad_sum_(ao_mask, wv) + #idx = cupy.ix_(mask, mask) + #vmat[idm][0][idx] += vtmp[0] + #vmat[idm][1][idx] += vtmp[1] + #vmat[idm][2][idx] += vtmp[2] + add_sparse(vmat[idm][0], vtmp[0], idx) + add_sparse(vmat[idm][1], vtmp[1], idx) + add_sparse(vmat[idm][2], vtmp[2], idx) + elif xctype == 'NLC': + raise NotImplementedError('NLC') + + elif xctype == 'MGGA': + ao_deriv = 2 + for ao_mask, idx, weight, _ in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory): + for idm in range(nset): + mo_coeff_mask = mo_coeff[idx,:] + rho = numint.eval_rho2(opt.mol, ao_mask[:10], mo_coeff_mask, mo_occ, None, xctype, with_lapl=False) + vxc = ni.eval_xc_eff(xc_code, rho, 1, xctype=xctype)[1] + wv = weight * vxc + wv[0] *= .5 + wv[4] *= .5 # for the factor 1/2 in tau + vtmp = _gga_grad_sum_(ao_mask, wv) + vtmp += _tau_grad_dot_(ao_mask, wv[4]) + #idx = cupy.ix_(mask, mask) + #vmat[idm][0][idx] += vtmp[0] + #vmat[idm][1][idx] += vtmp[1] + #vmat[idm][2][idx] += vtmp[2] + add_sparse(vmat[idm][0], vtmp[0], idx) + add_sparse(vmat[idm][1], vtmp[1], idx) + add_sparse(vmat[idm][2], vtmp[2], idx) - vmat = cupy.zeros((nset,3,nao,nao)) - if xctype == 'LDA': - ao_deriv = 1 - for ao, _, weight, _ in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory): - for idm in range(nset): - rho = numint.eval_rho2(opt.mol, ao[0], mo_coeff, mo_occ, None, xctype) - vxc = ni.eval_xc_eff(xc_code, rho, 1, xctype=xctype)[1] - wv = weight * vxc[0] - mask = cupy.any(cupy.abs(ao) > AO_THRESHOLD, axis=[0,2]) - idx = cupy.argwhere(mask).astype(numpy.int32)[:,0] - ao_mask = ao[:,idx,:] - aow = numint._scale_ao(ao_mask[0], wv) - vtmp = _d1_dot_(ao_mask[1:4], aow.T) - #idx = cupy.ix_(mask, mask) - #vmat[idm][0][idx] += vtmp[0] - #vmat[idm][1][idx] += vtmp[1] - #vmat[idm][2][idx] += vtmp[2] - add_sparse(vmat[idm][0], vtmp[0], idx) - add_sparse(vmat[idm][1], vtmp[1], idx) - add_sparse(vmat[idm][2], 
vtmp[2], idx) - elif xctype == 'GGA': - ao_deriv = 2 - for ao, _, weight, _ in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory): - for idm in range(nset): - rho = numint.eval_rho2(opt.mol, ao[:4], mo_coeff, mo_occ, None, xctype) - vxc = ni.eval_xc_eff(xc_code, rho, 1, xctype=xctype)[1] - wv = weight * vxc - wv[0] *= .5 - mask = cupy.any(cupy.abs(ao) > AO_THRESHOLD, axis=[0,2]) - idx = cupy.argwhere(mask).astype(numpy.int32)[:,0] - ao_mask = ao[:,idx,:] - vtmp = _gga_grad_sum_(ao_mask, wv) - #idx = cupy.ix_(mask, mask) - #vmat[idm][0][idx] += vtmp[0] - #vmat[idm][1][idx] += vtmp[1] - #vmat[idm][2][idx] += vtmp[2] - add_sparse(vmat[idm][0], vtmp[0], idx) - add_sparse(vmat[idm][1], vtmp[1], idx) - add_sparse(vmat[idm][2], vtmp[2], idx) - elif xctype == 'NLC': - raise NotImplementedError('NLC') - - elif xctype == 'MGGA': - ao_deriv = 2 - for ao, _, weight, _ in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory): - for idm in range(nset): - rho = numint.eval_rho2(opt.mol, ao[:10], mo_coeff, mo_occ, None, xctype) - vxc = ni.eval_xc_eff(xc_code, rho, 1, xctype=xctype)[1] - wv = weight * vxc - wv[0] *= .5 - wv[4] *= .5 # for the factor 1/2 in tau - mask = cupy.any(cupy.abs(ao) > AO_THRESHOLD, axis=[0,2]) - idx = cupy.argwhere(mask).astype(numpy.int32)[:,0] - ao_mask = ao[:,idx,:] - vtmp = _gga_grad_sum_(ao_mask, wv) - vtmp += _tau_grad_dot_(ao_mask, wv[4]) - #idx = cupy.ix_(mask, mask) - #vmat[idm][0][idx] += vtmp[0] - #vmat[idm][1][idx] += vtmp[1] - #vmat[idm][2][idx] += vtmp[2] - add_sparse(vmat[idm][0], vtmp[0], idx) - add_sparse(vmat[idm][1], vtmp[1], idx) - add_sparse(vmat[idm][2], vtmp[2], idx) vmat = [cupy.einsum('pi,npq,qj->nij', coeff, v, coeff) for v in vmat] exc = None if nset == 1: @@ -222,6 +207,7 @@ def get_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, mo_occ = cupy.asarray(dms.mo_occ) mo_coeff = cupy.asarray(dms.mo_coeff) + mol = opt.mol coeff = cupy.asarray(opt.coeff) nao, nao0 = coeff.shape dms = cupy.asarray(dms) @@ -238,26 
+224,31 @@ def get_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, ao_deriv = 2 vvrho = [] - for ao, mask, weight, coords \ + for ao_mask, mask, weight, coords \ in ni.block_loop(mol, grids, nao, ao_deriv, max_memory=max_memory): - rho = numint.eval_rho2(opt.mol, ao[:4], mo_coeff, mo_occ, None, xctype) + mo_coeff_mask = mo_coeff[mask] + rho = numint.eval_rho2(mol, ao_mask[:4], mo_coeff_mask, mo_occ, None, xctype, with_lapl=False) vvrho.append(rho) rho = cupy.hstack(vvrho) + vxc = numint._vv10nlc(rho, grids.coords, rho, grids.weights, grids.coords, nlc_pars)[1] vv_vxc = xc_deriv.transform_vxc(rho, vxc, 'GGA', spin=0) vmat = cupy.zeros((3,nao,nao)) p1 = 0 - for ao, mask, weight, coords \ + for ao_mask, mask, weight, coords \ in ni.block_loop(mol, grids, nao, ao_deriv, max_memory): p0, p1 = p1, p1 + weight.size wv = vv_vxc[:,p0:p1] * weight wv[0] *= .5 # *.5 because vmat + vmat.T at the end - vmat += _gga_grad_sum_(ao, wv) - - vmat = cupy.einsum('pi,npq,qj->nij', coeff, vmat, coeff) + vmat_tmp = _gga_grad_sum_(ao_mask, wv) + add_sparse(vmat[0], vmat_tmp[0], mask) + add_sparse(vmat[1], vmat_tmp[1], mask) + add_sparse(vmat[2], vmat_tmp[2], mask) + vmat = contract('npq,qj->npj', vmat, coeff) + vmat = contract('pi,npj->nij', coeff, vmat) exc = None # - sign because nabla_X = -nabla_x return exc, -vmat diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py index 26c441cf..190a118d 100644 --- a/gpu4pyscf/hessian/rhf.py +++ b/gpu4pyscf/hessian/rhf.py @@ -84,11 +84,11 @@ def hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, s1ao[:,p0:p1] += s1a[:,p0:p1] s1ao[:,:,p0:p1] += s1a[:,p0:p1].transpose(0,2,1) - tmp = cupy.einsum('xpq,pi->xiq', s1ao, mocc) - s1oo = cupy.einsum('xiq,qj->xij', tmp, mocc) + tmp = contract('xpq,pi->xiq', s1ao, mocc) + s1oo = contract('xiq,qj->xij', tmp, mocc) #s1oo = cupy.einsum('xpq,pi,qj->xij', s1ao, mocc, mocc) - s1mo = cupy.einsum('xij,ip->xpj', s1ao, mo_coeff) + s1mo = contract('xij,ip->xpj', s1ao, mo_coeff) for j0 
in range(i0+1): ja = atmlst[j0] @@ -96,10 +96,10 @@ def hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, # *2 for double occupancy, *2 for +c.c. #dm1 = cupy.einsum('ypi,qi->ypq', mo1[ja], mocc) #de2_gpu[i0,j0] += cupy.einsum('xpq,ypq->xy', h1ao[ia], dm1) * 4 - de2[i0,j0] += cupy.einsum('xpi,ypi->xy', h1ao[ia], mo1[ja]) * 4 + de2[i0,j0] += contract('xpi,ypi->xy', h1ao[ia], mo1[ja]) * 4 dm1 = cupy.einsum('ypi,qi,i->ypq', mo1[ja], mocc, mo_energy[mo_occ>0]) - de2[i0,j0] -= cupy.einsum('xpq,ypq->xy', s1mo, dm1) * 4 - de2[i0,j0] -= cupy.einsum('xpq,ypq->xy', s1oo, mo_e1[ja]) * 2 + de2[i0,j0] -= contract('xpq,ypq->xy', s1mo, dm1) * 4 + de2[i0,j0] -= contract('xpq,ypq->xy', s1oo, mo_e1[ja]) * 2 for j0 in range(i0): de2[j0,i0] = de2[i0,j0].T @@ -325,11 +325,11 @@ def solve_mo1(mf, mo_energy, mo_coeff, mo_occ, h1mo, s1a = cupy.asarray(s1a) def _ao2mo(mat): - tmp = cupy.einsum('xij,jo->xio', mat, mocc) - return cupy.einsum('xik,ip->xpk', tmp, mo_coeff) + tmp = contract('xij,jo->xio', mat, mocc) + return contract('xik,ip->xpk', tmp, mo_coeff) cupy.get_default_memory_pool().free_all_blocks() # TODO: calculate blksize dynamically - blksize = 10 + blksize = 8 mo1s = [None] * mol.natm e1s = [None] * mol.natm aoslices = mol.aoslice_by_atom() @@ -371,13 +371,13 @@ def gen_vind(mf, mo_coeff, mo_occ): def fx(mo1): mo1 = cupy.asarray(mo1) mo1 = mo1.reshape(-1,nmo,nocc) - mo1_mo = cupy.einsum('npo,ip->nio', mo1, mo_coeff) - dm1 = cupy.einsum('nio,jo->nij', 2.0*mo1_mo, mocc) + mo1_mo = contract('npo,ip->nio', mo1, mo_coeff) + dm1 = contract('nio,jo->nij', 2.0*mo1_mo, mocc) dm1 = dm1 + dm1.transpose(0,2,1) dm1 = tag_array(dm1, mo1=mo1_mo, occ_coeff=mocc, mo_occ=mo_occ) v1 = vresp(dm1) - tmp = cupy.einsum('nij,jo->nio', v1, mocc) - v1vo = cupy.einsum('nio,ip->npo', tmp, mo_coeff) + tmp = contract('nij,jo->nio', v1, mocc) + v1vo = contract('nio,ip->npo', tmp, mo_coeff) return v1vo return fx diff --git a/gpu4pyscf/hessian/rks.py b/gpu4pyscf/hessian/rks.py index 
19041963..d3898d42 100644 --- a/gpu4pyscf/hessian/rks.py +++ b/gpu4pyscf/hessian/rks.py @@ -27,7 +27,7 @@ from gpu4pyscf.hessian import rhf as rhf_hess from gpu4pyscf.grad import rks as rks_grad from gpu4pyscf.dft import numint -from gpu4pyscf.lib.cupy_helper import contract +from gpu4pyscf.lib.cupy_helper import contract, add_sparse from gpu4pyscf.lib import logger # import pyscf.grad.rks to activate nuc_grad_method method @@ -97,10 +97,10 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, veff -= (alpha-hyb)*.5 * vk1.transpose(0,2,1).reshape(3,3,nao,nao) t1 = log.timer_debug1('range-separated int2e_ipvip1 for atom %d'%ia, *t1) vk1 = vk2 = None - de2[i0,i0] += cupy.einsum('xypq,pq->xy', veff_diag[:,:,p0:p1], dm0[p0:p1])*2 + de2[i0,i0] += contract('xypq,pq->xy', veff_diag[:,:,p0:p1], dm0[p0:p1])*2 for j0, ja in enumerate(atmlst[:i0+1]): q0, q1 = aoslices[ja][2:] - de2[i0,j0] += cupy.einsum('xypq,pq->xy', veff[:,:,q0:q1], dm0[q0:q1])*2 + de2[i0,j0] += contract('xypq,pq->xy', veff[:,:,q0:q1], dm0[q0:q1])*2 for j0 in range(i0): de2[j0,i0] = de2[i0,j0].T @@ -213,12 +213,14 @@ def _get_vxc_diag(hessobj, mo_coeff, mo_occ, max_memory): ao_deriv = 2 for ao, mask, weight, coords \ in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory): - rho = numint.eval_rho2(opt.mol, ao[0], mo_coeff, mo_occ, mask, xctype) + mo_coeff_mask = mo_coeff[mask,:] + rho = numint.eval_rho2(opt.mol, ao[0], mo_coeff_mask, mo_occ, mask, xctype) vxc = ni.eval_xc_eff(mf.xc, rho, 1, xctype=xctype)[1] wv = weight * vxc[0] aow = numint._scale_ao(ao[0], wv) for i in range(6): - vmat[i] += numint._dot_ao_ao(mol, ao[i+4], aow, mask, shls_slice, ao_loc) + vmat_tmp = numint._dot_ao_ao(mol, ao[i+4], aow, mask, shls_slice, ao_loc) + add_sparse(vmat[i], vmat_tmp, mask) aow = None elif xctype == 'GGA': @@ -226,20 +228,22 @@ def contract_(mat, ao, aoidx, wv, mask): aow = numint._scale_ao(ao[aoidx[0]], wv[1]) aow+= numint._scale_ao(ao[aoidx[1]], wv[2]) aow+= 
numint._scale_ao(ao[aoidx[2]], wv[3]) - mat += numint._dot_ao_ao(mol, aow, ao[0], mask, shls_slice, ao_loc) + mat_tmp = numint._dot_ao_ao(mol, aow, ao[0], mask, shls_slice, ao_loc) + add_sparse(mat, mat_tmp, mask) ao_deriv = 3 for ao, mask, weight, coords \ in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory): - rho = numint.eval_rho2(opt.mol, ao[:4], mo_coeff, mo_occ, mask, xctype) + mo_coeff_mask = mo_coeff[mask,:] + rho = numint.eval_rho2(opt.mol, ao[:4], mo_coeff_mask, mo_occ, mask, xctype) vxc = ni.eval_xc_eff(mf.xc, rho, 1, xctype=xctype)[1] wv = weight * vxc #:aow = numpy.einsum('npi,np->pi', ao[:4], wv[:4]) aow = numint._scale_ao(ao[:4], wv[:4]) for i in range(6): - vmat[i] += numint._dot_ao_ao(mol, ao[i+4], aow, mask, shls_slice, ao_loc) - + vmat_tmp = numint._dot_ao_ao(mol, ao[i+4], aow, mask, shls_slice, ao_loc) + add_sparse(vmat[i], vmat_tmp, mask) contract_(vmat[0], ao, [XXX,XXY,XXZ], wv, mask) contract_(vmat[1], ao, [XXY,XYY,XYZ], wv, mask) contract_(vmat[2], ao, [XXZ,XYZ,XZZ], wv, mask) @@ -253,19 +257,22 @@ def contract_(mat, ao, aoidx, wv, mask): aow = numint._scale_ao(ao[aoidx[0]], wv[1]) aow+= numint._scale_ao(ao[aoidx[1]], wv[2]) aow+= numint._scale_ao(ao[aoidx[2]], wv[3]) - mat += numint._dot_ao_ao(mol, aow, ao[0], mask, shls_slice, ao_loc) + mat_tmp = numint._dot_ao_ao(mol, aow, ao[0], mask, shls_slice, ao_loc) + add_sparse(mat, mat_tmp, mask) ao_deriv = 3 for ao, mask, weight, coords \ in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory): - rho = numint.eval_rho2(opt.mol, ao[:10], mo_coeff, mo_occ, mask, xctype) + mo_coeff_mask = mo_coeff[mask,:] + rho = numint.eval_rho2(opt.mol, ao[:10], mo_coeff_mask, mo_occ, mask, xctype) vxc = ni.eval_xc_eff(mf.xc, rho, 1, xctype=xctype)[1] wv = weight * vxc wv[4] *= .5 # for the factor 1/2 in tau #:aow = numpy.einsum('npi,np->pi', ao[:4], wv[:4]) aow = numint._scale_ao(ao[:4], wv[:4]) for i in range(6): - vmat[i] += numint._dot_ao_ao(mol, ao[i+4], aow, mask, shls_slice, ao_loc) + vmat_tmp = 
numint._dot_ao_ao(mol, ao[i+4], aow, mask, shls_slice, ao_loc) + add_sparse(vmat[i], vmat_tmp, mask) contract_(vmat[0], ao, [XXX,XXY,XXZ], wv, mask) contract_(vmat[1], ao, [XXY,XYY,XYZ], wv, mask) @@ -276,12 +283,14 @@ def contract_(mat, ao, aoidx, wv, mask): aow = [numint._scale_ao(ao[i], wv[4]) for i in range(1, 4)] for i, j in enumerate([XXX, XXY, XXZ, XYY, XYZ, XZZ]): - vmat[i] += numint._dot_ao_ao(mol, ao[j], aow[0], mask, shls_slice, ao_loc) + vmat_tmp = numint._dot_ao_ao(mol, ao[j], aow[0], mask, shls_slice, ao_loc) + add_sparse(vmat[i], vmat_tmp, mask) for i, j in enumerate([XXY, XYY, XYZ, YYY, YYZ, YZZ]): - vmat[i] += numint._dot_ao_ao(mol, ao[j], aow[1], mask, shls_slice, ao_loc) + vmat_tmp = numint._dot_ao_ao(mol, ao[j], aow[1], mask, shls_slice, ao_loc) + add_sparse(vmat[i], vmat_tmp, mask) for i, j in enumerate([XXZ, XYZ, XZZ, YYZ, YZZ, ZZZ]): - vmat[i] += numint._dot_ao_ao(mol, ao[j], aow[2], mask, shls_slice, ao_loc) - + vmat_tmp = numint._dot_ao_ao(mol, ao[j], aow[2], mask, shls_slice, ao_loc) + add_sparse(vmat[i], vmat_tmp, mask) vmat = vmat[[0,1,2, 1,3,4, 2,4,5]] @@ -290,49 +299,45 @@ def contract_(mat, ao, aoidx, wv, mask): return vmat.reshape(3,3,nao_sph,nao_sph) def _make_dR_rho1(ao, ao_dm0, atm_id, aoslices, xctype): - # TODO: hard coded - ao = ao.transpose([0,2,1]) - ao_dm0 = [x.T for x in ao_dm0] - p0, p1 = aoslices[atm_id][2:] - ngrids = ao[0].shape[0] + ngrids = ao[0].shape[1] if xctype == 'GGA': rho1 = cupy.zeros((3,4,ngrids)) elif xctype == 'MGGA': rho1 = cupy.zeros((3,5,ngrids)) - ao_dm0_x = ao_dm0[1][:,p0:p1] - ao_dm0_y = ao_dm0[2][:,p0:p1] - ao_dm0_z = ao_dm0[3][:,p0:p1] + ao_dm0_x = ao_dm0[1][p0:p1] + ao_dm0_y = ao_dm0[2][p0:p1] + ao_dm0_z = ao_dm0[3][p0:p1] # (d_X \nabla mu) dot \nalba nu DM_{mu,nu} - rho1[0,4] += cupy.einsum('pi,pi->p', ao[XX,:,p0:p1], ao_dm0_x) - rho1[0,4] += cupy.einsum('pi,pi->p', ao[XY,:,p0:p1], ao_dm0_y) - rho1[0,4] += cupy.einsum('pi,pi->p', ao[XZ,:,p0:p1], ao_dm0_z) - rho1[1,4] += cupy.einsum('pi,pi->p', 
ao[YX,:,p0:p1], ao_dm0_x) - rho1[1,4] += cupy.einsum('pi,pi->p', ao[YY,:,p0:p1], ao_dm0_y) - rho1[1,4] += cupy.einsum('pi,pi->p', ao[YZ,:,p0:p1], ao_dm0_z) - rho1[2,4] += cupy.einsum('pi,pi->p', ao[ZX,:,p0:p1], ao_dm0_x) - rho1[2,4] += cupy.einsum('pi,pi->p', ao[ZY,:,p0:p1], ao_dm0_y) - rho1[2,4] += cupy.einsum('pi,pi->p', ao[ZZ,:,p0:p1], ao_dm0_z) + rho1[0,4] += cupy.einsum('ip,ip->p', ao[XX,p0:p1], ao_dm0_x) + rho1[0,4] += cupy.einsum('ip,ip->p', ao[XY,p0:p1], ao_dm0_y) + rho1[0,4] += cupy.einsum('ip,ip->p', ao[XZ,p0:p1], ao_dm0_z) + rho1[1,4] += cupy.einsum('ip,ip->p', ao[YX,p0:p1], ao_dm0_x) + rho1[1,4] += cupy.einsum('ip,ip->p', ao[YY,p0:p1], ao_dm0_y) + rho1[1,4] += cupy.einsum('ip,ip->p', ao[YZ,p0:p1], ao_dm0_z) + rho1[2,4] += cupy.einsum('ip,ip->p', ao[ZX,p0:p1], ao_dm0_x) + rho1[2,4] += cupy.einsum('ip,ip->p', ao[ZY,p0:p1], ao_dm0_y) + rho1[2,4] += cupy.einsum('ip,ip->p', ao[ZZ,p0:p1], ao_dm0_z) rho1[:,4] *= .5 else: raise RuntimeError - ao_dm0_0 = ao_dm0[0][:,p0:p1] + ao_dm0_0 = ao_dm0[0][p0:p1] # (d_X \nabla_x mu) nu DM_{mu,nu} - rho1[:,0] = cupy.einsum('xpi,pi->xp', ao[1:4,:,p0:p1], ao_dm0_0) - rho1[0,1]+= cupy.einsum('pi,pi->p', ao[XX,:,p0:p1], ao_dm0_0) - rho1[0,2]+= cupy.einsum('pi,pi->p', ao[XY,:,p0:p1], ao_dm0_0) - rho1[0,3]+= cupy.einsum('pi,pi->p', ao[XZ,:,p0:p1], ao_dm0_0) - rho1[1,1]+= cupy.einsum('pi,pi->p', ao[YX,:,p0:p1], ao_dm0_0) - rho1[1,2]+= cupy.einsum('pi,pi->p', ao[YY,:,p0:p1], ao_dm0_0) - rho1[1,3]+= cupy.einsum('pi,pi->p', ao[YZ,:,p0:p1], ao_dm0_0) - rho1[2,1]+= cupy.einsum('pi,pi->p', ao[ZX,:,p0:p1], ao_dm0_0) - rho1[2,2]+= cupy.einsum('pi,pi->p', ao[ZY,:,p0:p1], ao_dm0_0) - rho1[2,3]+= cupy.einsum('pi,pi->p', ao[ZZ,:,p0:p1], ao_dm0_0) + rho1[:,0] = cupy.einsum('xip,ip->xp', ao[1:4,p0:p1], ao_dm0_0) + rho1[0,1]+= cupy.einsum('ip,ip->p', ao[XX,p0:p1], ao_dm0_0) + rho1[0,2]+= cupy.einsum('ip,ip->p', ao[XY,p0:p1], ao_dm0_0) + rho1[0,3]+= cupy.einsum('ip,ip->p', ao[XZ,p0:p1], ao_dm0_0) + rho1[1,1]+= cupy.einsum('ip,ip->p', 
ao[YX,p0:p1], ao_dm0_0) + rho1[1,2]+= cupy.einsum('ip,ip->p', ao[YY,p0:p1], ao_dm0_0) + rho1[1,3]+= cupy.einsum('ip,ip->p', ao[YZ,p0:p1], ao_dm0_0) + rho1[2,1]+= cupy.einsum('ip,ip->p', ao[ZX,p0:p1], ao_dm0_0) + rho1[2,2]+= cupy.einsum('ip,ip->p', ao[ZY,p0:p1], ao_dm0_0) + rho1[2,3]+= cupy.einsum('ip,ip->p', ao[ZZ,p0:p1], ao_dm0_0) # (d_X mu) (\nabla_x nu) DM_{mu,nu} - rho1[:,1] += cupy.einsum('xpi,pi->xp', ao[1:4,:,p0:p1], ao_dm0[1][:,p0:p1]) - rho1[:,2] += cupy.einsum('xpi,pi->xp', ao[1:4,:,p0:p1], ao_dm0[2][:,p0:p1]) - rho1[:,3] += cupy.einsum('xpi,pi->xp', ao[1:4,:,p0:p1], ao_dm0[3][:,p0:p1]) + rho1[:,1] += cupy.einsum('xip,ip->xp', ao[1:4,p0:p1], ao_dm0[1][p0:p1]) + rho1[:,2] += cupy.einsum('xip,ip->xp', ao[1:4,p0:p1], ao_dm0[2][p0:p1]) + rho1[:,3] += cupy.einsum('xip,ip->xp', ao[1:4,p0:p1], ao_dm0[3][p0:p1]) # *2 for |mu> DM njp', ao, coeff) + ao = contract('nip,ij->njp', ao, coeff[mask]) rho = numint.eval_rho2(opt.mol, ao[0], mo_coeff, mo_occ, mask, xctype) vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3] wv = weight * vxc[0] @@ -396,7 +406,7 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): for ia in range(mol.natm): p0, p1 = aoslices[ia][2:] # *2 for \nabla|ket> in rho1 - rho1 = cupy.einsum('xig,ig->xg', ao[1:,p0:p1,:], ao_dm0[p0:p1,:]) * 2 + rho1 = contract('xig,ig->xg', ao[1:,p0:p1,:], ao_dm0[p0:p1,:]) * 2 # aow ~ rho1 ~ d/dR1 wv = wf * rho1 aow = [numint._scale_ao(ao[0], wv[i]) for i in range(3)] @@ -413,7 +423,7 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): for ao, mask, weight, coords \ in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory, extra=5*comp*nao): # TODO: improve efficiency - ao = contract('nip,ij->njp', ao, coeff) + ao = contract('nip,ij->njp', ao, coeff[mask]) rho = numint.eval_rho2(opt.mol, ao[:4], mo_coeff, mo_occ, mask, xctype) vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3] wv = weight * vxc @@ -445,7 +455,7 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): 
ao_deriv = 2 for ao, mask, weight, coords \ in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory): - ao = contract('nip,ij->njp', ao, coeff) + ao = contract('nip,ij->njp', ao, coeff[mask]) rho = numint.eval_rho2(opt.mol, ao[:10], mo_coeff, mo_occ, mask, xctype) vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3] wv = weight * vxc @@ -523,7 +533,7 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory): ao_deriv = 1 for ao, mask, weight, coords \ in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory): - ao = contract('nip,ij->njp', ao, coeff) + ao = contract('nip,ij->njp', ao, coeff[mask]) rho = numint.eval_rho2(opt.mol, ao[0], mo_coeff, mo_occ, mask, xctype) vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3] wv = weight * vxc[0] @@ -546,7 +556,7 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory): for ao, mask, weight, coords \ in ni.block_loop(mol, grids, nao, ao_deriv, max_memory): # TODO: improve efficiency - ao = contract('nip,ij->njp', ao, coeff) + ao = contract('nip,ij->njp', ao, coeff[mask]) rho = numint.eval_rho2(mol, ao[:4], mo_coeff, mo_occ, mask, xctype) vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3] wv = weight * vxc @@ -570,7 +580,7 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory): ao_deriv = 2 for ao, mask, weight, coords \ in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory): - ao = contract('nip,ij->njp', ao, coeff) + ao = contract('nip,ij->njp', ao, coeff[mask]) rho = numint.eval_rho2(opt.mol, ao[:10], mo_coeff, mo_occ, mask, xctype) vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3] wv = weight * vxc @@ -583,7 +593,7 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory): wf = weight * fxc for ia in range(mol.natm): dR_rho1 = _make_dR_rho1(ao, ao_dm0, ia, aoslices, xctype) - wv = cupy.einsum('xyg,sxg->syg', wf, dR_rho1) + wv = contract('xyg,sxg->syg', wf, dR_rho1) wv[:,0] *= .5 wv[:,4] *= .25 aow = [numint._scale_ao(ao[:4], wv[i,:4]) for i in range(3)] 
@@ -597,8 +607,8 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory): p0, p1 = aoslices[ia][2:] vmat[ia,:,p0:p1] += v_ip[:,p0:p1] vmat[ia] = -vmat[ia] - vmat[ia].transpose(0,2,1) - vmat = cupy.einsum("kxij,jq->kxiq", vmat, mocc) - vmat = cupy.einsum("kxiq,ip->kxpq", vmat, mo_coeff) + vmat = contract("kxij,jq->kxiq", vmat, mocc) + vmat = contract("kxiq,ip->kxpq", vmat, mo_coeff) return vmat diff --git a/setup.py b/setup.py index fd5efdab..57372a31 100755 --- a/setup.py +++ b/setup.py @@ -122,7 +122,7 @@ def initialize_with_default_plat_name(self): ], cmdclass={'build_py': CMakeBuildPy}, install_requires=[ - 'pyscf>=2.3.0', + 'pyscf>=2.4.0', f'cupy-cuda{CUDA_VERSION}>=12.0', 'dftd3==0.7.0', 'dftd4==3.5.0', From 8635711970a899ea108ecf8b47ff1d0e15d9646c Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Fri, 27 Oct 2023 17:54:43 -0700 Subject: [PATCH 16/19] Update __init__.py --- gpu4pyscf/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu4pyscf/__init__.py b/gpu4pyscf/__init__.py index ca735418..681a2e88 100644 --- a/gpu4pyscf/__init__.py +++ b/gpu4pyscf/__init__.py @@ -1,2 +1,2 @@ from . 
import lib, grad, hessian, solvent, scf, dft -__version__ = '0.6.4' +__version__ = '0.6.5' From be1aef0080bca6ec09836884964b400a3d988ba8 Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Thu, 2 Nov 2023 21:02:25 -0700 Subject: [PATCH 17/19] Optimize hessian 3 (#54) * numpy -> cupy for solvent * for linter * remove grad switch from pcm.py * passed flake8 * solvent integrals on GPU * flake8 * compatiable with pyscf-2.4.0 * added solvent * fixed issues for to_cpu * store intermeidate variable on CPU * cupy.einsum -> contract * optimized dft integration for gradient and hessian * remove lprof * fixed a bug in nlc * precompute fxc_x * optimize hessian & gpu timer * remove scale_ao --- examples/00-h2o.py | 4 +- examples/dft_driver.py | 2 +- gpu4pyscf/df/df.py | 4 +- gpu4pyscf/df/df_jk.py | 4 +- gpu4pyscf/df/grad/rhf.py | 4 +- gpu4pyscf/df/grad/rks.py | 2 +- gpu4pyscf/df/hessian/rhf.py | 32 +-- gpu4pyscf/df/hessian/rks.py | 2 +- gpu4pyscf/df/int3c2e.py | 28 +-- gpu4pyscf/dft/libxc.py | 35 +-- gpu4pyscf/dft/numint.py | 184 ++++++++++++--- gpu4pyscf/dft/rks.py | 6 +- gpu4pyscf/grad/rhf.py | 40 ++-- gpu4pyscf/grad/rks.py | 68 +++--- gpu4pyscf/hessian/rks.py | 142 +++++++---- gpu4pyscf/lib/cupy_helper.py | 18 +- gpu4pyscf/lib/cupy_helper/add_sparse.cu | 13 +- gpu4pyscf/lib/cutensor.py | 27 ++- gpu4pyscf/lib/gdft/CMakeLists.txt | 2 +- gpu4pyscf/lib/gdft/contract_rho.cu | 299 +++++++++++++++++++++++- gpu4pyscf/lib/gdft/nr_eval_gto.cu | 27 --- gpu4pyscf/lib/logger.py | 51 +++- gpu4pyscf/scf/hf.py | 14 +- 23 files changed, 748 insertions(+), 260 deletions(-) diff --git a/examples/00-h2o.py b/examples/00-h2o.py index 7f17e62d..7df44258 100644 --- a/examples/00-h2o.py +++ b/examples/00-h2o.py @@ -24,7 +24,7 @@ H 0.7570000000 0.0000000000 -0.4696000000 ''' -xc='LDA' +xc='B3LYP' bas='def2-tzvpp' auxbasis='def2-tzvpp-jkfit' scf_tol = 1e-10 @@ -34,7 +34,7 @@ mol = pyscf.M(atom=atom, basis=bas, max_memory=32000) -mol.verbose = 4 +mol.verbose = 6 mf_GPU = rks.RKS(mol, 
xc=xc).density_fit(auxbasis=auxbasis) mf_GPU.grids.level = grids_level mf_GPU.conv_tol = scf_tol diff --git a/examples/dft_driver.py b/examples/dft_driver.py index 3b68d665..12628086 100644 --- a/examples/dft_driver.py +++ b/examples/dft_driver.py @@ -35,7 +35,7 @@ basis=bas, max_memory=32000) # set verbose >= 6 for debugging timer -mol.verbose = 6 +mol.verbose = 1 mf_df = rks.RKS(mol, xc=args.xc).density_fit(auxbasis=args.auxbasis) if args.solvent: diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py index 28998230..da59f9da 100644 --- a/gpu4pyscf/df/df.py +++ b/gpu4pyscf/df/df.py @@ -67,8 +67,8 @@ def build(self, direct_scf_tol=1e-14, omega=None): idx = np.arange(nao) self.diag_idx = cupy.asarray(idx*(idx+1)//2+idx) - t0 = (logger.process_clock(), logger.perf_counter()) log = logger.new_logger(mol, mol.verbose) + t0 = log.init_timer() if auxmol is None: self.auxmol = auxmol = addons.make_auxmol(mol, self.auxbasis) @@ -217,7 +217,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False): nq = len(intopt.log_qs) for cp_ij_id, _ in enumerate(intopt.log_qs): if len(intopt.ao_pairs_row[cp_ij_id]) == 0: continue - t1 = (logger.process_clock(), logger.perf_counter()) + t1 = log.init_timer() cpi = intopt.cp_idx[cp_ij_id] cpj = intopt.cp_jdx[cp_ij_id] li = intopt.angular[cpi] diff --git a/gpu4pyscf/df/df_jk.py b/gpu4pyscf/df/df_jk.py index e27f16f4..76f4bed0 100644 --- a/gpu4pyscf/df/df_jk.py +++ b/gpu4pyscf/df/df_jk.py @@ -20,9 +20,9 @@ import cupy import numpy from pyscf import lib, scf, __config__ -from pyscf.lib import logger from pyscf.scf import dhf from pyscf.df import df_jk, addons +from gpu4pyscf.lib import logger from gpu4pyscf.lib.cupy_helper import contract, take_last2d, transpose_sum, load_library, get_avail_mem from gpu4pyscf.dft import rks, numint from gpu4pyscf.scf import hf @@ -250,7 +250,7 @@ def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e- nao = dms_tag.shape[-1] dms = dms_tag.reshape([-1,nao,nao]) 
nset = dms.shape[0] - t0 = (logger.process_clock(), logger.perf_counter()) + t0 = log.init_timer() if dfobj._cderi is None: log.debug('CDERI not found, build...') dfobj.build(direct_scf_tol=direct_scf_tol, omega=omega) diff --git a/gpu4pyscf/df/grad/rhf.py b/gpu4pyscf/df/grad/rhf.py index 84f5ed23..febdee40 100644 --- a/gpu4pyscf/df/grad/rhf.py +++ b/gpu4pyscf/df/grad/rhf.py @@ -18,12 +18,12 @@ import cupy from cupyx.scipy.linalg import solve_triangular from pyscf.df.grad import rhf -from pyscf.lib import logger from pyscf import lib, scf, gto from gpu4pyscf.df import int3c2e from gpu4pyscf.lib.cupy_helper import print_mem_info, tag_array, unpack_tril, contract, load_library from gpu4pyscf.grad.rhf import grad_elec from gpu4pyscf import __config__ +from gpu4pyscf.lib import logger libcupy_helper = load_library('libcupy_helper') @@ -154,7 +154,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega vkaux = cupy.zeros((3,naux_cart)) cupy.get_default_memory_pool().free_all_blocks() for cp_kl_id in range(len(intopt.aux_log_qs)): - t1 = (logger.process_clock(), logger.perf_counter()) + t1 = log.init_timer() k0, k1 = intopt.cart_aux_loc[cp_kl_id], intopt.cart_aux_loc[cp_kl_id+1] assert k1-k0 <= block_size if with_j: diff --git a/gpu4pyscf/df/grad/rks.py b/gpu4pyscf/df/grad/rks.py index 2ef88e86..4708c4d7 100644 --- a/gpu4pyscf/df/grad/rks.py +++ b/gpu4pyscf/df/grad/rks.py @@ -29,7 +29,7 @@ def get_veff(ks_grad, mol=None, dm=None): ''' if mol is None: mol = ks_grad.mol if dm is None: dm = ks_grad.base.make_rdm1() - t0 = (logger.process_clock(), logger.perf_counter()) + t0 = logger.init_timer(ks_grad) mf = ks_grad.base ni = mf._numint diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py index 46cb38ec..1141de1f 100644 --- a/gpu4pyscf/df/hessian/rhf.py +++ b/gpu4pyscf/df/hessian/rhf.py @@ -55,7 +55,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, '''Partial derivative ''' log = 
logger.new_logger(hessobj, verbose) - time0 = t1 = (logger.process_clock(), logger.perf_counter()) + time0 = t1 = log.init_timer() mol = hessobj.mol mf = hessobj.base @@ -393,12 +393,14 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None, with_k=True, omega=None): log = logger.new_logger(hessobj, verbose) - t0 = (logger.process_clock(), logger.perf_counter()) + t0 = log.init_timer() mol = hessobj.mol if atmlst is None: atmlst = range(mol.natm) # FIXME with_k = True + mo_coeff = cupy.asarray(mo_coeff) + mo_occ = cupy.asarray(mo_occ) mf = hessobj.base #auxmol = hessobj.base.with_df.auxmol @@ -441,7 +443,7 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, rhok0_Pl_ = np.empty_like(wk_Pl_) for p0, p1 in lib.prange(0,nao,64): wk_tmp = cupy.asarray(wk_Pl_[:,p0:p1]) - rhok0_Pl_[:,p0:p1] = cupy.einsum('pq,qio->pio', int2c_inv, wk_tmp).get() + rhok0_Pl_[:,p0:p1] = contract('pq,qio->pio', int2c_inv, wk_tmp).get() wj = wk_Pl_ = wk_P__ = int2c_inv = int2c = None # int3c_ip1 contributions @@ -449,8 +451,8 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, vj1_buf = vj1_buf[cupy.ix_(numpy.arange(3), rev_ao_idx, rev_ao_idx)] vk1_buf = vk1_buf[cupy.ix_(numpy.arange(3), rev_ao_idx, rev_ao_idx)] - vj1_int3c_ip1 = -cupy.einsum('nxiq,ip->nxpq', vj1_ao, mo_coeff) - vk1_int3c_ip1 = -cupy.einsum('nxiq,ip->nxpq', vk1_ao, mo_coeff) + vj1_int3c_ip1 = -contract('nxiq,ip->nxpq', vj1_ao, mo_coeff) + vk1_int3c_ip1 = -contract('nxiq,ip->nxpq', vk1_ao, mo_coeff) vj1_ao = vk1_ao = None t0 = log.timer_debug1('Fock matrix due to int3c2e_ip1', *t0) @@ -475,15 +477,15 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, for p0, p1 in lib.prange(0,nao,64): rhok_tmp = cupy.asarray(rhok0_Pl_[:,p0:p1]) - vj1_tmp = cupy.einsum('pio,xp->xpio', rhok_tmp, wj0_10) + vj1_tmp = contract('pio,xp->xpio', rhok_tmp, wj0_10) - wk0_10_Pl_ = 
cupy.einsum('xqp,pio->xqio', int2c_ip1, rhok_tmp) - vj1_tmp += cupy.einsum('xpio,p->xpio', wk0_10_Pl_, rhoj0) - vj1_int3c_ip2[:,:,p0:p1] += cupy.einsum('xpio,pa->axio', vj1_tmp, aux2atom) + wk0_10_Pl_ = contract('xqp,pio->xqio', int2c_ip1, rhok_tmp) + vj1_tmp += contract('xpio,p->xpio', wk0_10_Pl_, rhoj0) + vj1_int3c_ip2[:,:,p0:p1] += contract('xpio,pa->axio', vj1_tmp, aux2atom) if with_k: - vk1_tmp = 2.0 * cupy.einsum('xpio,pro->xpir', wk0_10_Pl_, rhok0_P__) - vk1_tmp += 2.0 * cupy.einsum('xpro,pir->xpio', wk0_10_P__, rhok_tmp) - vk1_int3c_ip2[:,:,p0:p1] += cupy.einsum('xpio,pa->axio', vk1_tmp, aux2atom) + vk1_tmp = 2.0 * contract('xpio,pro->xpir', wk0_10_Pl_, rhok0_P__) + vk1_tmp += 2.0 * contract('xpro,pir->xpio', wk0_10_P__, rhok_tmp) + vk1_int3c_ip2[:,:,p0:p1] += contract('xpio,pa->axio', vk1_tmp, aux2atom) wj0_10 = wk0_10_P__ = rhok0_P__ = int2c_ip1 = None vj1_tmp = vk1_tmp = wk0_10_Pl_ = rhoj0 = rhok0_Pl_ = None aux2atom = None @@ -498,8 +500,8 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, # ========================== sorted AO end ================================ def _ao2mo(mat): - tmp = cupy.einsum('xij,jo->xio', mat, mocc) - return cupy.einsum('xik,ip->xpk', tmp, mo_coeff) + tmp = contract('xij,jo->xio', mat, mocc) + return contract('xik,ip->xpk', tmp, mo_coeff) vj1_int3c = vj1_int3c_ip1 + vj1_int3c_ip2 vj1_int3c_ip1 = vj1_int3c_ip2 = None @@ -522,7 +524,7 @@ def _ao2mo(mat): vk1_ao[:,:,p0:p1] -= vk1_buf[:,p0:p1,:].transpose(0,2,1) h1 = hcore_deriv(ia) - h1 = _ao2mo(h1) + h1 = _ao2mo(cupy.asarray(h1)) vj1 = vj1_int3c[ia] + _ao2mo(vj1_ao) if with_k: vk1 = vk1_int3c[ia] + _ao2mo(vk1_ao) diff --git a/gpu4pyscf/df/hessian/rks.py b/gpu4pyscf/df/hessian/rks.py index d8986f1f..b7432314 100644 --- a/gpu4pyscf/df/hessian/rks.py +++ b/gpu4pyscf/df/hessian/rks.py @@ -37,7 +37,7 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None, max_memory=4000, verbose=None): log = logger.new_logger(hessobj, verbose) - 
time0 = t1 = (logger.process_clock(), logger.perf_counter()) + time0 = t1 = log.init_timer() mol = hessobj.mol mf = hessobj.base diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py index fc755a27..f3a43442 100644 --- a/gpu4pyscf/df/int3c2e.py +++ b/gpu4pyscf/df/int3c2e.py @@ -184,7 +184,7 @@ def build(self, cutoff=1e-14, group_size=None, a tot_mol is created with concatenating [mol, fake_mol, aux_mol] we will pair (ao,ao) and (aux,1) separately. ''' - cput0 = (logger.process_clock(), logger.perf_counter()) + cput0 = logger.init_timer(self.mol) sorted_mol, sorted_idx, uniq_l_ctr, l_ctr_counts = sort_mol(self.mol) if group_size is not None : uniq_l_ctr, l_ctr_counts = _split_l_ctr_groups(uniq_l_ctr, l_ctr_counts, group_size) @@ -314,6 +314,7 @@ def build(self, cutoff=1e-14, group_size=None, tot_mol._atm.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(tot_mol.natm), tot_mol._bas.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(tot_mol.nbas), tot_mol._env.ctypes.data_as(ctypes.c_void_p)) + cput1 = logger.timer_debug1(tot_mol, 'Initialize GPU cache', *cput1) self.bas_pairs_locs = bas_pairs_locs ncptype = len(self.log_qs) @@ -745,29 +746,24 @@ def get_int3c2e_ip1_vjk(intopt, rhoj, rhok, dm0_tag, aoslices, with_k=True, omeg vj1_buf[:,i0:i1,j0:j1] += contract('xpji,p->xij', int3c_blk, rhoj[k0:k1]) # initialize intermediate variables if count % ncp_ij == 0: - rhoj0 = cupy.zeros([3,k1-k0,nao_sph]) rhok_tmp = cupy.asarray(rhok[k0:k1]) - vj1_ao = cupy.zeros([3,nao_sph,nao_sph,nocc]) if with_k: rhok0_slice = contract('pio,Jo->piJ', rhok_tmp, orbo) * 2 rhok0 = contract('pli,lo->poi', rhok0_slice, orbo) - int3c_ip1_occ = cupy.zeros([3,k1-k0,nao_sph,nocc]) - vk1_ao = cupy.zeros([3,nao_sph,nao_sph,nocc]) - # contraction - rhoj0[:,:,i0:i1] += contract('xpji,ij->xpi', int3c_blk, dm0_tag[i0:i1,j0:j1]) + rhoj0 = contract('xpji,ij->xpi', int3c_blk, dm0_tag[i0:i1,j0:j1]) + vj1_ao = contract('pJo,xpi->xiJo', rhok_tmp, rhoj0) + vj1 += 2.0*contract('xiJo,ia->axJo', vj1_ao, 
ao2atom[i0:i1]) + if with_k: - int3c_ip1_occ[:,:,i0:i1] += contract('xpji,jo->xpio', int3c_blk, orbo[j0:j1]) - vk1_ao[:,i0:i1,j0:j1] += contract('xpji,poi->xijo', int3c_blk, rhok0[:,:,i0:i1]) vk1_buf[:,i0:i1] += contract('xpji,plj->xil', int3c_blk, rhok0_slice[:,:,j0:j1]) - # reduction - if (count+1) % ncp_ij == 0: - vj1_ao += contract('pjo,xpi->xijo', rhok_tmp, rhoj0) - vj1 += 2.0*contract('xiko,ia->axko', vj1_ao, ao2atom) - if with_k: - vk1_ao += contract('xpio,pki->xiko', int3c_ip1_occ, rhok0_slice) - vk1 += contract('xiko,ia->axko', vk1_ao, ao2atom) + vk1_ao = contract('xpji,poi->xijo', int3c_blk, rhok0[:,:,i0:i1]) + vk1[:,:,j0:j1] += contract('xijo,ia->axjo', vk1_ao, ao2atom[i0:i1]) + + int3c_ip1_occ = contract('xpji,jo->xpio', int3c_blk, orbo[j0:j1]) + vk1_ao = contract('xpio,pJi->xiJo', int3c_ip1_occ, rhok0_slice[:,:,i0:i1]) + vk1 += contract('xiJo,ia->axJo', vk1_ao, ao2atom[i0:i1]) count += 1 return vj1_buf, vk1_buf, vj1, vk1 diff --git a/gpu4pyscf/dft/libxc.py b/gpu4pyscf/dft/libxc.py index 65c05a77..5ed63b48 100644 --- a/gpu4pyscf/dft/libxc.py +++ b/gpu4pyscf/dft/libxc.py @@ -21,7 +21,7 @@ import cupy from pyscf import dft -libxc = np.ctypeslib.load_library( +_libxc = np.ctypeslib.load_library( 'libxc', os.path.abspath(os.path.join(__file__, '..', '..', 'lib', 'deps', 'lib'))) def _check_arrays(current_arrays, fields, factor, required): @@ -45,21 +45,21 @@ class _xcfun(ctypes.Structure): pass _xc_func_p = ctypes.POINTER(_xcfun) -libxc.xc_func_alloc.restype = _xc_func_p -libxc.xc_func_init.argtypes = (_xc_func_p, ctypes.c_int, ctypes.c_int) -libxc.xc_func_end.argtypes = (_xc_func_p, ) -libxc.xc_func_free.argtypes = (_xc_func_p, ) +_libxc.xc_func_alloc.restype = _xc_func_p +_libxc.xc_func_init.argtypes = (_xc_func_p, ctypes.c_int, ctypes.c_int) +_libxc.xc_func_end.argtypes = (_xc_func_p, ) +_libxc.xc_func_free.argtypes = (_xc_func_p, ) class XCfun: def __init__(self, xc, spin): assert spin == 'unpolarized' self._spin = 1 - self.xc_func = 
libxc.xc_func_alloc() + self.xc_func = _libxc.xc_func_alloc() if isinstance(xc, str): - self.func_id = libxc.xc_functional_get_number(ctypes.c_char_p(xc.encode())) + self.func_id = _libxc.xc_functional_get_number(ctypes.c_char_p(xc.encode())) else: self.func_id = xc - ret = libxc.xc_func_init(self.xc_func, self.func_id, self._spin) + ret = _libxc.xc_func_init(self.xc_func, self.func_id, self._spin) if ret != 0: raise RuntimeError('failed to initialize xc fun') self._family = dft.libxc.xc_type(xc) @@ -67,9 +67,10 @@ def __init__(self, xc, spin): def __del__(self): if self.xc_func is None: return - libxc.xc_func_end(self.xc_func) - libxc.xc_func_free(self.xc_func) - + # TODO: deallocate xc func + #_libxc.xc_func_end(self.xc_func) + #_libxc.xc_func_free(self.xc_func) + def needs_laplacian(self): return dft.libxc.needs_laplacian(self.func_id) @@ -85,7 +86,7 @@ def compute(self, inp, output=None, do_exc=True, do_vxc=True, do_fxc=False, do_k npoints = int(inp["rho"].size / self._spin) if (inp["rho"].size % self._spin): raise ValueError("Rho input has an invalid shape, must be divisible by %d" % self._spin) - + # Find the right compute function args = [self.xc_func, ctypes.c_size_t(npoints)] if self._family == 'LDA': @@ -114,7 +115,7 @@ def compute(self, inp, output=None, do_exc=True, do_vxc=True, do_fxc=False, do_k if(isinstance(arg, cupy.ndarray)): arg = ctypes.cast(arg.data.ptr, ctypes.c_void_p) cuda_args.append(arg) - libxc.xc_lda(*cuda_args) + _libxc.xc_lda(*cuda_args) elif self._family == 'GGA': input_labels = ["rho", "sigma"] input_num_args = 2 @@ -141,7 +142,7 @@ def compute(self, inp, output=None, do_exc=True, do_vxc=True, do_fxc=False, do_k if(isinstance(arg, cupy.ndarray)): arg = ctypes.cast(arg.data.ptr, ctypes.c_void_p) cuda_args.append(arg) - libxc.xc_gga(*cuda_args) + _libxc.xc_gga(*cuda_args) elif self._family == 'MGGA': # Build input args @@ -178,7 +179,7 @@ def compute(self, inp, output=None, do_exc=True, do_vxc=True, do_fxc=False, do_k output = 
_check_arrays(output, output_labels[5:15], npoints, do_fxc) output = _check_arrays(output, output_labels[15:35], npoints, do_kxc) output = _check_arrays(output, output_labels[35:70], npoints, do_lxc) - + args.extend([ inp[x] for x in input_labels]) if not self.needs_laplacian(): args.insert(-1, cupy.empty((1))) # Add none ptr to laplacian @@ -189,10 +190,10 @@ def compute(self, inp, output=None, do_exc=True, do_vxc=True, do_fxc=False, do_k if(isinstance(arg, cupy.ndarray)): arg = ctypes.cast(arg.data.ptr, ctypes.c_void_p) cuda_args.append(arg) - libxc.xc_mgga(*cuda_args) + _libxc.xc_mgga(*cuda_args) else: raise KeyError("Functional kind not recognized!") - + return {k: v for k, v in zip(output_labels, args[2+input_num_args:]) if v is not None} diff --git a/gpu4pyscf/dft/numint.py b/gpu4pyscf/dft/numint.py index cc9c0e6b..6ca2a9f8 100644 --- a/gpu4pyscf/dft/numint.py +++ b/gpu4pyscf/dft/numint.py @@ -21,13 +21,13 @@ import cupy from pyscf import gto, lib, dft -from pyscf.lib import logger from pyscf.dft import numint from pyscf.gto.eval_gto import NBINS, CUTOFF, make_screen_index from gpu4pyscf.scf.hf import basis_seg_contraction from gpu4pyscf.lib.cupy_helper import contract, get_avail_mem, load_library, add_sparse, release_gpu_stack from gpu4pyscf.dft import xc_deriv, xc_alias, libxc from gpu4pyscf import __config__ +from gpu4pyscf.lib import logger LMAX_ON_GPU = 6 BAS_ALIGNED = 4 @@ -35,6 +35,7 @@ MIN_BLK_SIZE = getattr(__config__, 'min_grid_blksize', 64*64) ALIGNED = getattr(__config__, 'grid_aligned', 16*16) AO_THRESHOLD = 1e-12 +AO_ALIGNMENT = 32 # Should we release the cupy cache? 
FREE_CUPY_CACHE = False @@ -269,6 +270,42 @@ def eval_rho3(mol, ao, c0, mo1, non0tab=None, xctype='LDA', rho[tau_idx] *= .5 return rho +def eval_rho4(mol, ao, c0, mo1, non0tab=None, xctype='LDA', + with_lapl=True, verbose=None): + ''' ao: nd x nao x ng + c0: nd x nocc x ng + mo1: na x nao x nocc + ''' + xctype = xctype.upper() + if xctype == 'LDA' or xctype == 'HF': + _, ngrids = ao.shape + else: + _, ngrids = ao[0].shape + + na = mo1.shape[0] + cpos1= mo1 + if xctype == 'LDA' or xctype == 'HF': + c_0 = contract('aio,ig->aog', cpos1, ao)#cupy.dot(cpos1.T, ao) + rho = cupy.empty([na,ngrids]) + for i in range(na): + rho[i] = _contract_rho(c0, c_0[i]) + rho *= 2.0 + elif xctype in ('GGA', 'NLC'): + c_0 = contract('nig,aio->anog', ao, cpos1) + rho = cupy.empty([na, 4, ngrids]) + for i in range(na): + _contract_rho_gga(c0, c_0[i], rho=rho[i]) + + else: # meta-GGA + if with_lapl: + raise NotImplementedError("mGGA with lapl not implemented") + rho = cupy.empty((na,5,ngrids)) + c_0 = contract('nig,aio->anog', ao, cpos1) + for i in range(na): + _contract_rho_mgga(c0, c_0[i], rho=rho[i]) + + return rho + def _vv10nlc(rho, coords, vvrho, vvweight, vvcoords, nlc_pars): thresh=1e-8 @@ -408,15 +445,14 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, for ao_mask, idx, weight, _ in ni.block_loop(mol, grids, nao, ao_deriv): for i in range(nset): - t0 = (logger.process_clock(), logger.perf_counter()) - #rho = eval_rho(opt.mol, ao, dms[i], xctype=xctype, hermi=1) - #rho = _make_rho(ao, dms[i], xctype=xctype) + t0 = log.init_timer() if mo_coeff is None: rho = eval_rho(mol, ao_mask, dms[i][np.ix_(idx,idx)], xctype=xctype, hermi=1) else: mo_coeff_mask = mo_coeff[idx,:] rho = eval_rho2(mol, ao_mask, mo_coeff_mask, mo_occ, None, xctype) + t1 = log.timer_debug1('eval rho', *t0) exc, vxc = ni.eval_xc_eff(xc_code, rho, deriv=1, xctype=xctype)[:2] vxc = cupy.asarray(vxc, order='C') exc = cupy.asarray(exc, order='C') @@ -662,8 +698,8 @@ def nr_rks_fxc(ni, mol, grids, xc_code, 
dm0=None, dms=None, relativity=0, hermi= ao_deriv = 1 p0 = 0 p1 = 0 - t0 = (logger.process_clock(), logger.perf_counter()) for ao, mask, weights, coords in ni.block_loop(opt.mol, grids, nao, ao_deriv): + t0 = log.init_timer() p0, p1 = p1, p1+len(weights) # precompute molecular orbitals if with_mocc: @@ -671,44 +707,51 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi= if xctype == 'LDA': c0 = _dot_ao_dm(mol, ao, occ_coeff_mask, None, None, None) elif xctype == "GGA": - c0 = cupy.empty([4,occ_coeff.shape[1],p1-p0]) - for i in range(4): - c0[i] = _dot_ao_dm(mol, ao[i], occ_coeff_mask, None, None, None) + c0 = contract('nig,io->nog', ao, occ_coeff_mask) else: # mgga - c0 = cupy.empty([4,occ_coeff.shape[1],p1-p0]) - for i in range(4): - c0[i] = _dot_ao_dm(mol, ao[i], occ_coeff_mask, None, None, None) + c0 = contract('nig,io->nog', ao, occ_coeff_mask) + + if with_mocc: + rho1 = eval_rho4(opt.mol, ao, c0, mo1[:,mask], xctype=xctype, with_lapl=False) + else: + # slow version + rho1 = [] + for i in range(nset): + rho_tmp = eval_rho(opt.mol, ao, dms[i][np.ix_(mask,mask)], xctype=xctype, hermi=hermi, with_lapl=False) + rho1.append(rho_tmp) + rho1 = cupy.stack(rho1, axis=0) + t0 = log.timer_debug1('rho', *t0) + # precompute fxc_w if xctype == 'LDA': fxc_w = fxc[0,0,p0:p1] * weights + wv = rho1 * fxc_w else: fxc_w = fxc[:,:,p0:p1] * weights - # loop perturbed molecular orbitals - for i in range(nset): - if with_mocc: - rho1 = eval_rho3(opt.mol, ao, c0, mo1[i][mask], xctype=xctype, with_lapl=False) - else: - rho1 = eval_rho(opt.mol, ao, dms[i][np.ix_(mask,mask)], xctype=xctype, hermi=hermi, with_lapl=False) + wv = contract('axg,xyg->ayg', rho1, fxc_w) + + for i in range(nset): if xctype == 'LDA': - wv = rho1 * fxc_w - vmat_tmp = ao.dot(_scale_ao(ao, wv).T) + vmat_tmp = ao.dot(_scale_ao(ao, wv[i]).T) add_sparse(vmat[i], vmat_tmp, mask) elif xctype == 'GGA': - wv = cupy.einsum('xg,xyg->yg', rho1, fxc_w) - wv[0] *= .5 - vmat_tmp = 
ao[0].dot(_scale_ao(ao, wv).T) + wv[i,0] *= .5 + aow = _scale_ao(ao, wv[i]) + vmat_tmp = aow.dot(ao[0].T) add_sparse(vmat[i], vmat_tmp, mask) elif xctype == 'NLC': raise NotImplementedError('NLC') else: - wv = cupy.einsum('xg,xyg->yg', rho1, fxc_w) - wv[[0, 4]] *= .5 - vmat_tmp = ao[0].dot(_scale_ao(ao[:4], wv[:4]).T) - vmat_tmp+= _tau_dot(ao, ao, wv[4]) + wv[i,0] *= .5 + wv[i,4] *= .5 + vmat_tmp = ao[0].dot(_scale_ao(ao[:4], wv[i,:4]).T) + vmat_tmp+= _tau_dot(ao, ao, wv[i,4]) add_sparse(vmat[i], vmat_tmp, mask) + t0 = log.timer_debug1('vxc', *t0) ao = c0 = rho1 = None + vmat = contract('pi,npq->niq', coeff, vmat) vmat = contract('qj,niq->nij', coeff, vmat) if xctype != 'LDA': @@ -723,6 +766,7 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi= return cupy.asarray(vmat) + def nr_rks_fxc_st(ni, mol, grids, xc_code, dm0=None, dms_alpha=None, relativity=0, singlet=True, rho0=None, vxc=None, fxc=None, max_memory=2000, verbose=None): @@ -851,7 +895,7 @@ def nr_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, 2D array of shape (nao,nao) where nao is the number of AO functions. ''' log = logger.new_logger(mol, verbose) - t0 = (logger.process_clock(), logger.perf_counter()) + t0 = log.init_timer() opt = getattr(ni, 'gdftopt', None) if opt is None: ni.build(mol, grids.coords) @@ -1040,6 +1084,7 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000, ''' Define this macro to loop over grids by blocks. 
Sparsity is not implemented yet + sorted_ao: by default ao_value is sorted for GPU ''' if grids.coords is None: grids.build(with_non0tab=True) @@ -1050,7 +1095,7 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000, log = logger.new_logger(mol, mol.verbose) if blksize is None: - cupy.get_default_memory_pool().free_all_blocks() + #cupy.get_default_memory_pool().free_all_blocks() mem_avail = get_avail_mem() blksize = int((mem_avail*.2/8/((comp+1)*nao + extra))/ ALIGNED) * ALIGNED blksize = min(blksize, MIN_BLK_SIZE) @@ -1070,23 +1115,31 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000, coords = grids.coords[ip0:ip1] weight = grids.weights[ip0:ip1] #sindex = ni.screen_index[ip0//GRID_BLKSIZE:] - t0 = (logger.process_clock(), logger.perf_counter()) + t0 = log.init_timer() ao = eval_ao(ni, mol, coords, deriv) - log.timer_debug1('eval ao', *t0) + t0 = log.timer_debug1('eval ao', *t0) # cache ao indices if (deriv, block_id, blksize, ngrids) not in ni.non0ao_idx: - t0 = (logger.process_clock(), logger.perf_counter()) + t0 = log.init_timer() if deriv == 0: mask = cupy.any(cupy.abs(ao) > AO_THRESHOLD, axis=[1]) - idx = cupy.argwhere(mask).astype(np.int32)[:,0] + all_idx = cupy.arange(ao.shape[0], dtype=np.int32) + idx = all_idx[mask] + pad = (len(idx) + AO_ALIGNMENT - 1) // AO_ALIGNMENT * AO_ALIGNMENT - len(idx) + zero_idx = all_idx[~mask][:pad] + idx = cupy.hstack([idx, zero_idx]) ao_mask = ao[idx,:] else: mask = cupy.any(cupy.abs(ao) > AO_THRESHOLD, axis=[0,2]) - idx = cupy.argwhere(mask).astype(np.int32)[:,0] + all_idx = cupy.arange(ao.shape[1], dtype=np.int32) + idx = all_idx[mask] + pad = (len(idx) + AO_ALIGNMENT - 1) // AO_ALIGNMENT * AO_ALIGNMENT - len(idx) + zero_idx = all_idx[~mask][:pad] + idx = cupy.hstack([idx, zero_idx]) ao_mask = ao[:,idx,:] ni.non0ao_idx[deriv, block_id, blksize, ngrids] = idx - log.timer_debug1('initialize ao sparsity', *t0) + log.timer_debug1('init ao sparsity', *t0) else: idx = ni.non0ao_idx[deriv, 
block_id, blksize, ngrids] if deriv == 0: @@ -1094,7 +1147,7 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000, else: ao_mask = ao[:,idx,:] block_id += 1 - log.timer_debug1('eval rho', *t0) + log.timer_debug1('extract sparse ao', *t0) yield ao_mask, idx, weight, coords class NumInt(numint.NumInt): @@ -1185,6 +1238,63 @@ def _contract_rho(bra, ket, rho=None): rho = cupy.einsum('ig,ig->g', bra, ket) return rho +def _contract_rho1(bra, ket, rho=None): + ''' xip,ip->xp + ''' + if bra.ndim == 2: + bra = cupy.expand_dims(bra, axis=0) + nvar, nao, ngrids = bra.shape + if rho is None: + rho = cupy.empty([nvar, ngrids]) + + for i in range(nvar): + stream = cupy.cuda.get_current_stream() + err = libgdft.GDFTcontract_rho( + ctypes.cast(stream.ptr, ctypes.c_void_p), + ctypes.cast(rho[i].data.ptr, ctypes.c_void_p), + ctypes.cast(bra[i].data.ptr, ctypes.c_void_p), + ctypes.cast(ket.data.ptr, ctypes.c_void_p), + ctypes.c_int(ngrids), ctypes.c_int(nao)) + if err != 0: + raise RuntimeError('CUDA Error') + return rho + +def _contract_rho_gga(bra, ket, rho=None): + ''' ig,nig->ng + ''' + n, nao, ngrids = bra.shape + assert n == 4 + if rho is None: + rho = cupy.empty([4,ngrids]) + stream = cupy.cuda.get_current_stream() + err = libgdft.GDFTcontract_rho_gga( + ctypes.cast(stream.ptr, ctypes.c_void_p), + ctypes.cast(rho.data.ptr, ctypes.c_void_p), + ctypes.cast(bra.data.ptr, ctypes.c_void_p), + ctypes.cast(ket.data.ptr, ctypes.c_void_p), + ctypes.c_int(ngrids), ctypes.c_int(nao)) + if err != 0: + raise RuntimeError('CUDA Error') + return rho + +def _contract_rho_mgga(bra, ket, rho=None): + ''' nig,nig->ng + ''' + n, nao, ngrids = bra.shape + assert n == 4 + if rho is None: + rho = cupy.empty([5,ngrids]) + stream = cupy.cuda.get_current_stream() + err = libgdft.GDFTcontract_rho_mgga( + ctypes.cast(stream.ptr, ctypes.c_void_p), + ctypes.cast(rho.data.ptr, ctypes.c_void_p), + ctypes.cast(bra.data.ptr, ctypes.c_void_p), + ctypes.cast(ket.data.ptr, ctypes.c_void_p), + 
ctypes.c_int(ngrids), ctypes.c_int(nao)) + if err != 0: + raise RuntimeError('CUDA Error') + return rho + def _dot_ao_dm(mol, ao, dm, non0tab, shls_slice, ao_loc, out=None): return cupy.dot(dm.T, ao) @@ -1272,7 +1382,7 @@ def _scale_ao(ao, wv, out=None): assert wv.size == ngrids else: if ao[0].flags.f_contiguous: - return contract('nip,np->ip', ao, wv) + return cupy.einsum('nip,np->ip', ao, wv) nvar, nao, ngrids = ao.shape assert wv.shape == (nvar, ngrids) @@ -1365,6 +1475,8 @@ def build(self, mol=None): coeff = np.vstack([coeff, np.zeros((paddings, coeff.shape[1]))]) pmol._decontracted = True self.mol = pmol + inv_idx = np.argsort(ao_idx, kind='stable').astype(np.int32) + self.rev_ao_idx = cupy.asarray(inv_idx) self.coeff = coeff[ao_idx] self.l_ctr_offsets = np.append(0, np.cumsum(l_ctr_counts)).astype(np.int32) self.l_bas_offsets = np.append(0, np.cumsum(l_counts)).astype(np.int32) diff --git a/gpu4pyscf/dft/rks.py b/gpu4pyscf/dft/rks.py index ea12f511..b4fd72b0 100644 --- a/gpu4pyscf/dft/rks.py +++ b/gpu4pyscf/dft/rks.py @@ -68,7 +68,7 @@ def initialize_grids(ks, mol=None, dm=None): # Initialize self.grids the first time call get_veff if mol is None: mol = ks.mol if ks.grids.coords is None: - t0 = (logger.process_clock(), logger.perf_counter()) + t0 = logger.init_timer(ks) ks.grids.build() #ks.grids.build(with_non0tab=True) ks.grids.weights = cupy.asarray(ks.grids.weights) @@ -82,7 +82,7 @@ def initialize_grids(ks, mol=None, dm=None): is_nlc = ks.nlc or ks._numint.libxc.is_nlc(ks.xc) if is_nlc and ks.nlcgrids.coords is None: if ks.nlcgrids.coords is None: - t0 = (logger.process_clock(), logger.perf_counter()) + t0 = logger.init_timer(ks) #ks.nlcgrids.build(with_non0tab=True) ks.nlcgrids.build() ks.nlcgrids.weights = cupy.asarray(ks.nlcgrids.weights) @@ -124,7 +124,7 @@ def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1): if mol is None: mol = ks.mol if dm is None: dm = ks.make_rdm1() - t0 = (logger.process_clock(), logger.perf_counter()) + t0 = 
logger.init_timer(ks) if ks.grids.coords is None: ks.grids.ao_values = None initialize_grids(ks, mol, dm) diff --git a/gpu4pyscf/grad/rhf.py b/gpu4pyscf/grad/rhf.py index 59a3aeac..16a3b53e 100644 --- a/gpu4pyscf/grad/rhf.py +++ b/gpu4pyscf/grad/rhf.py @@ -19,12 +19,12 @@ import cupy import numpy from pyscf import lib, gto -from pyscf.lib import logger from pyscf.grad import rhf from gpu4pyscf.lib.cupy_helper import load_library from gpu4pyscf.scf.hf import _VHFOpt -from gpu4pyscf.lib.cupy_helper import tag_array +from gpu4pyscf.lib.cupy_helper import tag_array, contract from gpu4pyscf.df import int3c2e #TODO: move int3c2e to out of df +from gpu4pyscf.lib import logger LMAX_ON_GPU = 3 FREE_CUPY_CACHE = True @@ -255,8 +255,8 @@ def get_jk(mol, dm, hermi=1, vhfopt=None, with_j=True, with_k=True, omega=None, if atmlst is None: atmlst = range(mol.natm) - cput0 = (logger.process_clock(), logger.perf_counter()) log = logger.new_logger(mol, verbose) + cput0 = log.init_timer() if hermi != 1: raise NotImplementedError('JK-builder only supports hermitian density matrix') if omega is None: @@ -328,7 +328,7 @@ def get_jk(mol, dm, hermi=1, vhfopt=None, with_j=True, with_k=True, omega=None, dm_shl = cupy.asarray(np.log(dm_shl)) nshls = dm_shl.shape[0] t0 = time.perf_counter() - + if hermi != 1: dm_ctr_cond = (dm_ctr_cond + dm_ctr_cond.T) * .5 fn = libgvhf.GINTget_veff_ip1 @@ -347,7 +347,7 @@ def get_jk(mol, dm, hermi=1, vhfopt=None, with_j=True, with_k=True, omega=None, ll = vhfopt.uniq_l_ctr[cpl,0] if lk > LMAX_ON_GPU or ll > LMAX_ON_GPU or log_q_kl.size == 0: continue - + # TODO: determine cutoff based on the relevant maximum value of dm blocks? 
sub_dm_cond = max(dm_ctr_cond[cpi,cpj], dm_ctr_cond[cpk,cpl], dm_ctr_cond[cpi,cpk], dm_ctr_cond[cpj,cpk], @@ -416,8 +416,6 @@ def get_jk(mol, dm, hermi=1, vhfopt=None, with_j=True, with_k=True, omega=None, coeff = dms = None cupy.get_default_memory_pool().free_all_blocks() - #if vj is not None: vj_per_atom = vj_per_atom.T - #if vk is not None: vk_per_atom = vk_per_atom.T if out_cupy: return vj_per_atom, vk_per_atom else: @@ -427,8 +425,8 @@ def get_jk(mol, dm, hermi=1, vhfopt=None, with_j=True, with_k=True, omega=None, def _get_jk(gradient_object, mol=None, dm=None, hermi=1, with_j=True, with_k=True, omega=None): mf = gradient_object.base - cput0 = (logger.process_clock(), logger.perf_counter()) log = logger.new_logger(gradient_object) + cput0 = log.init_timer() log.debug3('apply get_grad_jk on gpu') if hasattr(mf, '_opt_gpu'): vhfopt = mf._opt_gpu @@ -457,7 +455,7 @@ def get_dh1e_ecp(mol, dm): for ia in ecp_atoms: with mol.with_rinv_at_nucleus(ia): ecp = mol.intor('ECPscalar_iprinv', comp=3) - dh1e_ecp[ia] = cupy.einsum('xij,ij->x', ecp, dm) + dh1e_ecp[ia] = contract('xij,ij->x', cupy.asarray(ecp), dm) return 2.0 * dh1e_ecp def grad_nuc(mf_grad, atmlst=None): @@ -489,11 +487,11 @@ def grad_elec(mf_grad, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None): atmlst = range(mol.natm) aoslices = mol.aoslice_by_atom() - t0 = (logger.process_clock(), logger.perf_counter()) if mo_energy is None: mo_energy = mf.mo_energy if mo_occ is None: mo_occ = mf.mo_occ if mo_coeff is None: mo_coeff = mf.mo_coeff log = logger.Logger(mf_grad.stdout, mf_grad.verbose) + t0 = log.init_timer() mo_energy = cupy.asarray(mo_energy) mo_occ = cupy.asarray(mo_occ) @@ -515,9 +513,9 @@ def calculate_h1e(h1_gpu, s1_gpu): with lib.call_in_background(calculate_h1e) as calculate_hs: calculate_hs(h1, s1) # (i | \nabla hcore | j) - t3 = log.timer_debug1("get_dh1e", *t0) + t3 = log.init_timer() dh1e = int3c2e.get_dh1e(mol, dm0) - + t4 = log.timer_debug1("get_dh1e", *t3) if mol.has_ecp(): dh1e += 
get_dh1e_ecp(mol, dm0) @@ -527,20 +525,20 @@ def calculate_h1e(h1_gpu, s1_gpu): log.debug('Computing Gradients of NR-HF Coulomb repulsion') dm0 = tag_array(dm0, mo_coeff=mo_coeff, mo_occ=mo_occ) - + extra_force = cupy.zeros((len(atmlst),3)) for k, ia in enumerate(atmlst): extra_force[k] += mf_grad.extra_force(ia, locals()) - + t2 = log.timer_debug1('gradients of 2e part', *t1) - - dh = cupy.einsum('xij,ij->xi', h1, dm0) - ds = cupy.einsum('xij,ij->xi', s1, dme0) + + dh = contract('xij,ij->xi', h1, dm0) + ds = contract('xij,ij->xi', s1, dme0) delec = 2.0*(dh - ds) - + delec = cupy.asarray([cupy.sum(delec[:, p0:p1], axis=1) for p0, p1 in aoslices[:,2:]]) de = 2.0 * dvhf + dh1e + delec + extra_force - + if(hasattr(mf, 'disp') and mf.disp is not None): g_disp = mf_grad.get_dispersion() mf_grad.grad_disp = g_disp @@ -565,13 +563,13 @@ class Gradients(rhf.Gradients): def get_j(self, mol=None, dm=None, hermi=0, omega=None): vj, _ = self.get_jk(mol, dm, with_k=False, omega=omega) return vj - + def get_k(self, mol=None, dm=None, hermi=0, omega=None): _, vk = self.get_jk(mol, dm, with_j=False, omega=omega) return vk def extra_force(self, atom_id, envs): - ''' + ''' grid response is implemented get_veff ''' return 0 \ No newline at end of file diff --git a/gpu4pyscf/grad/rks.py b/gpu4pyscf/grad/rks.py index f22e8184..316478cc 100644 --- a/gpu4pyscf/grad/rks.py +++ b/gpu4pyscf/grad/rks.py @@ -17,6 +17,7 @@ # Modified by Xiaojie Wu '''Non-relativistic RKS analytical nuclear gradients''' +import ctypes import numpy import cupy import pyscf @@ -27,12 +28,15 @@ from gpu4pyscf.grad import rhf as rhf_grad from gpu4pyscf.dft import numint, xc_deriv, rks from gpu4pyscf.dft.numint import _GDFTOpt, AO_THRESHOLD -from gpu4pyscf.lib.cupy_helper import contract, get_avail_mem, add_sparse, tag_array +from gpu4pyscf.lib.cupy_helper import contract, get_avail_mem, add_sparse, tag_array, load_library from pyscf import __config__ MIN_BLK_SIZE = getattr(__config__, 'min_grid_blksize', 128*128) 
ALIGNED = getattr(__config__, 'grid_aligned', 16*16) +libgdft = load_library('libgdft') +libgdft.GDFT_make_dR_dao_w.restype = ctypes.c_int + def _get_veff(ks_grad, mol=None, dm=None): ''' First order derivative of DFT effective potential matrix (wrt electron coordinates) @@ -124,7 +128,6 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, nset = len(dms) assert nset == 1 - if xctype == 'LDA': ao_deriv = 1 else: @@ -141,13 +144,7 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, wv = weight * vxc[0] aow = numint._scale_ao(ao_mask[0], wv) vtmp = _d1_dot_(ao_mask[1:4], aow.T) - #idx = cupy.ix_(mask, mask) - #vmat[idm][0][idx] += vtmp[0] - #vmat[idm][1][idx] += vtmp[1] - #vmat[idm][2][idx] += vtmp[2] - add_sparse(vmat[idm][0], vtmp[0], idx) - add_sparse(vmat[idm][1], vtmp[1], idx) - add_sparse(vmat[idm][2], vtmp[2], idx) + add_sparse(vmat[idm], vtmp, idx) elif xctype == 'GGA': ao_deriv = 2 for ao_mask, idx, weight, _ in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory): @@ -158,13 +155,7 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, wv = weight * vxc wv[0] *= .5 vtmp = _gga_grad_sum_(ao_mask, wv) - #idx = cupy.ix_(mask, mask) - #vmat[idm][0][idx] += vtmp[0] - #vmat[idm][1][idx] += vtmp[1] - #vmat[idm][2][idx] += vtmp[2] - add_sparse(vmat[idm][0], vtmp[0], idx) - add_sparse(vmat[idm][1], vtmp[1], idx) - add_sparse(vmat[idm][2], vtmp[2], idx) + add_sparse(vmat[idm], vtmp, idx) elif xctype == 'NLC': raise NotImplementedError('NLC') @@ -180,14 +171,7 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, wv[4] *= .5 # for the factor 1/2 in tau vtmp = _gga_grad_sum_(ao_mask, wv) vtmp += _tau_grad_dot_(ao_mask, wv[4]) - #idx = cupy.ix_(mask, mask) - #vmat[idm][0][idx] += vtmp[0] - #vmat[idm][1][idx] += vtmp[1] - #vmat[idm][2][idx] += vtmp[2] - add_sparse(vmat[idm][0], vtmp[0], idx) - add_sparse(vmat[idm][1], vtmp[1], idx) - add_sparse(vmat[idm][2], vtmp[2], idx) - + add_sparse(vmat[idm], vtmp, idx) 
vmat = [cupy.einsum('pi,npq,qj->nij', coeff, v, coeff) for v in vmat] exc = None if nset == 1: @@ -243,9 +227,7 @@ def get_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, wv = vv_vxc[:,p0:p1] * weight wv[0] *= .5 # *.5 because vmat + vmat.T at the end vmat_tmp = _gga_grad_sum_(ao_mask, wv) - add_sparse(vmat[0], vmat_tmp[0], mask) - add_sparse(vmat[1], vmat_tmp[1], mask) - add_sparse(vmat[2], vmat_tmp[2], mask) + add_sparse(vmat, vmat_tmp, mask) vmat = contract('npq,qj->npj', vmat, coeff) vmat = contract('pi,npj->nij', coeff, vmat) @@ -255,6 +237,7 @@ def get_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, def _make_dR_dao_w(ao, wv): #:aow = numpy.einsum('npi,p->npi', ao[1:4], wv[0]) + ''' aow = [ numint._scale_ao(ao[1], wv[0]), # dX nabla_x numint._scale_ao(ao[2], wv[0]), # dX nabla_y @@ -272,13 +255,34 @@ def _make_dR_dao_w(ao, wv): aow[2] += numint._scale_ao(ao[6], wv[1]) # dZ nabla_x aow[2] += numint._scale_ao(ao[8], wv[2]) # dZ nabla_y aow[2] += numint._scale_ao(ao[9], wv[3]) # dZ nabla_z + ''' + assert ao.flags.c_contiguous + assert wv.flags.c_contiguous + + _, nao, ngrids = ao.shape + aow = cupy.empty([3,nao,ngrids]) + stream = cupy.cuda.get_current_stream() + err = libgdft.GDFT_make_dR_dao_w( + ctypes.cast(stream.ptr, ctypes.c_void_p), + ctypes.cast(aow.data.ptr, ctypes.c_void_p), + ctypes.cast(ao.data.ptr, ctypes.c_void_p), + ctypes.cast(wv.data.ptr, ctypes.c_void_p), + ctypes.c_int(ngrids), ctypes.c_int(nao)) + if err != 0: + raise RuntimeError('CUDA Error') return aow -def _d1_dot_(ao1, ao2): - vmat0 = cupy.dot(ao1[0], ao2) - vmat1 = cupy.dot(ao1[1], ao2) - vmat2 = cupy.dot(ao1[2], ao2) - return cupy.stack([vmat0,vmat1,vmat2]) +def _d1_dot_(ao1, ao2, out=None): + if out is None: + vmat0 = cupy.dot(ao1[0], ao2) + vmat1 = cupy.dot(ao1[1], ao2) + vmat2 = cupy.dot(ao1[2], ao2) + return cupy.stack([vmat0,vmat1,vmat2]) + else: + cupy.dot(ao1[0], ao2, out=out[0]) + cupy.dot(ao1[1], ao2, out=out[1]) + cupy.dot(ao1[2], ao2, out=out[2]) + 
return out def _gga_grad_sum_(ao, wv): #:aow = numpy.einsum('npi,np->pi', ao[:4], wv[:4]) diff --git a/gpu4pyscf/hessian/rks.py b/gpu4pyscf/hessian/rks.py index d3898d42..82269257 100644 --- a/gpu4pyscf/hessian/rks.py +++ b/gpu4pyscf/hessian/rks.py @@ -309,35 +309,35 @@ def _make_dR_rho1(ao, ao_dm0, atm_id, aoslices, xctype): ao_dm0_y = ao_dm0[2][p0:p1] ao_dm0_z = ao_dm0[3][p0:p1] # (d_X \nabla mu) dot \nalba nu DM_{mu,nu} - rho1[0,4] += cupy.einsum('ip,ip->p', ao[XX,p0:p1], ao_dm0_x) - rho1[0,4] += cupy.einsum('ip,ip->p', ao[XY,p0:p1], ao_dm0_y) - rho1[0,4] += cupy.einsum('ip,ip->p', ao[XZ,p0:p1], ao_dm0_z) - rho1[1,4] += cupy.einsum('ip,ip->p', ao[YX,p0:p1], ao_dm0_x) - rho1[1,4] += cupy.einsum('ip,ip->p', ao[YY,p0:p1], ao_dm0_y) - rho1[1,4] += cupy.einsum('ip,ip->p', ao[YZ,p0:p1], ao_dm0_z) - rho1[2,4] += cupy.einsum('ip,ip->p', ao[ZX,p0:p1], ao_dm0_x) - rho1[2,4] += cupy.einsum('ip,ip->p', ao[ZY,p0:p1], ao_dm0_y) - rho1[2,4] += cupy.einsum('ip,ip->p', ao[ZZ,p0:p1], ao_dm0_z) + rho1[0,4] += numint._contract_rho(ao[XX,p0:p1], ao_dm0_x) + rho1[0,4] += numint._contract_rho(ao[XY,p0:p1], ao_dm0_y) + rho1[0,4] += numint._contract_rho(ao[XZ,p0:p1], ao_dm0_z) + rho1[1,4] += numint._contract_rho(ao[YX,p0:p1], ao_dm0_x) + rho1[1,4] += numint._contract_rho(ao[YY,p0:p1], ao_dm0_y) + rho1[1,4] += numint._contract_rho(ao[YZ,p0:p1], ao_dm0_z) + rho1[2,4] += numint._contract_rho(ao[ZX,p0:p1], ao_dm0_x) + rho1[2,4] += numint._contract_rho(ao[ZY,p0:p1], ao_dm0_y) + rho1[2,4] += numint._contract_rho(ao[ZZ,p0:p1], ao_dm0_z) rho1[:,4] *= .5 else: raise RuntimeError ao_dm0_0 = ao_dm0[0][p0:p1] # (d_X \nabla_x mu) nu DM_{mu,nu} - rho1[:,0] = cupy.einsum('xip,ip->xp', ao[1:4,p0:p1], ao_dm0_0) - rho1[0,1]+= cupy.einsum('ip,ip->p', ao[XX,p0:p1], ao_dm0_0) - rho1[0,2]+= cupy.einsum('ip,ip->p', ao[XY,p0:p1], ao_dm0_0) - rho1[0,3]+= cupy.einsum('ip,ip->p', ao[XZ,p0:p1], ao_dm0_0) - rho1[1,1]+= cupy.einsum('ip,ip->p', ao[YX,p0:p1], ao_dm0_0) - rho1[1,2]+= cupy.einsum('ip,ip->p', 
ao[YY,p0:p1], ao_dm0_0) - rho1[1,3]+= cupy.einsum('ip,ip->p', ao[YZ,p0:p1], ao_dm0_0) - rho1[2,1]+= cupy.einsum('ip,ip->p', ao[ZX,p0:p1], ao_dm0_0) - rho1[2,2]+= cupy.einsum('ip,ip->p', ao[ZY,p0:p1], ao_dm0_0) - rho1[2,3]+= cupy.einsum('ip,ip->p', ao[ZZ,p0:p1], ao_dm0_0) + rho1[:,0] = numint._contract_rho1(ao[1:4,p0:p1], ao_dm0_0) + rho1[0,1]+= numint._contract_rho(ao[XX,p0:p1], ao_dm0_0) + rho1[0,2]+= numint._contract_rho(ao[XY,p0:p1], ao_dm0_0) + rho1[0,3]+= numint._contract_rho(ao[XZ,p0:p1], ao_dm0_0) + rho1[1,1]+= numint._contract_rho(ao[YX,p0:p1], ao_dm0_0) + rho1[1,2]+= numint._contract_rho(ao[YY,p0:p1], ao_dm0_0) + rho1[1,3]+= numint._contract_rho(ao[YZ,p0:p1], ao_dm0_0) + rho1[2,1]+= numint._contract_rho(ao[ZX,p0:p1], ao_dm0_0) + rho1[2,2]+= numint._contract_rho(ao[ZY,p0:p1], ao_dm0_0) + rho1[2,3]+= numint._contract_rho(ao[ZZ,p0:p1], ao_dm0_0) # (d_X mu) (\nabla_x nu) DM_{mu,nu} - rho1[:,1] += cupy.einsum('xip,ip->xp', ao[1:4,p0:p1], ao_dm0[1][p0:p1]) - rho1[:,2] += cupy.einsum('xip,ip->xp', ao[1:4,p0:p1], ao_dm0[2][p0:p1]) - rho1[:,3] += cupy.einsum('xip,ip->xp', ao[1:4,p0:p1], ao_dm0[3][p0:p1]) + rho1[:,1] += numint._contract_rho1(ao[1:4,p0:p1], ao_dm0[1][p0:p1]) + rho1[:,2] += numint._contract_rho1(ao[1:4,p0:p1], ao_dm0[2][p0:p1]) + rho1[:,3] += numint._contract_rho1(ao[1:4,p0:p1], ao_dm0[3][p0:p1]) # *2 for |mu> DM njp', ao, coeff[mask]) + t0 = log.init_timer() + nao_non0 = len(mask) + ao = contract('nip,ij->njp', ao_mask, coeff[mask]) rho = numint.eval_rho2(opt.mol, ao[0], mo_coeff, mo_occ, mask, xctype) + t0 = log.timer_debug1('eval rho', *t0) vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3] + t0 = log.timer_debug1('eval vxc', *t0) wv = weight * vxc[0] aow = [numint._scale_ao(ao[i], wv) for i in range(1, 4)] _d1d2_dot_(ipip, mol, aow, ao[1:4], mask, ao_loc, False) @@ -409,10 +413,14 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): rho1 = contract('xig,ig->xg', ao[1:,p0:p1,:], ao_dm0[p0:p1,:]) * 2 # aow ~ rho1 ~ d/dR1 wv = wf 
* rho1 - aow = [numint._scale_ao(ao[0], wv[i]) for i in range(3)] - _d1d2_dot_(vmat[ia], mol, ao[1:4], aow, mask, ao_loc, False) + vmat_tmp = cupy.zeros([3,3,nao_non0,nao_non0]) + aow = [numint._scale_ao(ao_mask[0], wv[i]) for i in range(3)] + _d1d2_dot_(vmat_tmp, mol, ao_mask[1:4], aow, mask, ao_loc, False) + vmat_tmp = contract('pi,xypq->xyiq', coeff[mask], vmat_tmp) + vmat_tmp = contract('qj,xyiq->xyij', coeff[mask], vmat_tmp) + vmat[ia] += vmat_tmp ao_dm0 = aow = None - + t0 = log.timer_debug1('integration', *t0) for ia in range(mol.natm): p0, p1 = aoslices[ia][2:] vmat[ia,:,:,:,p0:p1] += ipip[:,:,:,p0:p1] @@ -420,29 +428,44 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): elif xctype == 'GGA': ao_deriv = 2 comp = (ao_deriv+1)*(ao_deriv+2)*(ao_deriv+3)//6 - for ao, mask, weight, coords \ + for ao_mask, mask, weight, coords \ in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory, extra=5*comp*nao): - # TODO: improve efficiency - ao = contract('nip,ij->njp', ao, coeff[mask]) + t0 = log.init_timer() + nao_non0 = len(mask) + ao = contract('nip,ij->njp', ao_mask, coeff[mask]) rho = numint.eval_rho2(opt.mol, ao[:4], mo_coeff, mo_occ, mask, xctype) + t0 = log.timer_debug1('eval rho', *t0) vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3] + t0 = log.timer_debug1('eval vxc', *t0) wv = weight * vxc wv[0] *= .5 aow = rks_grad._make_dR_dao_w(ao, wv) _d1d2_dot_(ipip, mol, aow, ao[1:4], mask, ao_loc, False) ao_dm0 = [numint._dot_ao_dm(mol, ao[i], dm0, mask, shls_slice, ao_loc) for i in range(4)] wf = weight * fxc + for ia in range(mol.natm): dR_rho1 = _make_dR_rho1(ao, ao_dm0, ia, aoslices, xctype) wv = contract('xyg,sxg->syg', wf, dR_rho1) wv[:,0] *= .5 + ''' for i in range(3): aow = rks_grad._make_dR_dao_w(ao, wv[i]) vmat[ia,i] += rks_grad._d1_dot_(aow, ao[0].T) aow = [numint._scale_ao(ao[:4], wv[i,:4]) for i in range(3)] _d1d2_dot_(vmat[ia], mol, ao[1:4], aow, mask, ao_loc, False) + ''' + vmat_tmp = cupy.empty([3,3,nao_non0,nao_non0]) + for i 
in range(3): + aow = rks_grad._make_dR_dao_w(ao_mask, wv[i]) + rks_grad._d1_dot_(aow, ao_mask[0].T, out=vmat_tmp[i]) + aow = [numint._scale_ao(ao_mask[:4], wv[i,:4]) for i in range(3)] + _d1d2_dot_(vmat_tmp, mol, ao_mask[1:4], aow, mask, ao_loc, False) + vmat_tmp = contract('pi,xypq->xyiq', coeff[mask], vmat_tmp) + vmat_tmp = contract('qj,xyiq->xyij', coeff[mask], vmat_tmp) + vmat[ia] += vmat_tmp ao_dm0 = aow = None - + t0 = log.timer_debug1('integration', *t0) for ia in range(mol.natm): p0, p1 = aoslices[ia][2:] vmat[ia,:,:,:,p0:p1] += ipip[:,:,:,p0:p1] @@ -453,11 +476,15 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): YX, YY, YZ = 5, 7, 8 ZX, ZY, ZZ = 6, 8, 9 ao_deriv = 2 - for ao, mask, weight, coords \ + for ao_mask, mask, weight, coords \ in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory): - ao = contract('nip,ij->njp', ao, coeff[mask]) + t0 = log.init_timer() + nao_non0 = len(mask) + ao = contract('nip,ij->njp', ao_mask, coeff[mask]) rho = numint.eval_rho2(opt.mol, ao[:10], mo_coeff, mo_occ, mask, xctype) + t0 = log.timer_debug1('eval rho', *t0) vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3] + t0 = log.timer_debug1('eval vxc', *t0) wv = weight * vxc wv[0] *= .5 wv[4] *= .25 @@ -476,20 +503,28 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): wv = contract('xyg,sxg->syg', wf, dR_rho1) wv[:,0] *= .5 wv[:,4] *= .5 # for the factor 1/2 in tau + ''' for i in range(3): aow = rks_grad._make_dR_dao_w(ao, wv[i]) vmat[ia,i] += rks_grad._d1_dot_(aow, ao[0].T) - - aow = [numint._scale_ao(ao[:4], wv[i,:4]) for i in range(3)] - _d1d2_dot_(vmat[ia], mol, ao[1:4], aow, mask, ao_loc, False) - - aow = [numint._scale_ao(ao[1], wv[i,4]) for i in range(3)] - _d1d2_dot_(vmat[ia], mol, [ao[XX], ao[XY], ao[XZ]], aow, mask, ao_loc, False) - aow = [numint._scale_ao(ao[2], wv[i,4]) for i in range(3)] - _d1d2_dot_(vmat[ia], mol, [ao[YX], ao[YY], ao[YZ]], aow, mask, ao_loc, False) - aow = [numint._scale_ao(ao[3], wv[i,4]) for i in 
range(3)] - _d1d2_dot_(vmat[ia], mol, [ao[ZX], ao[ZY], ao[ZZ]], aow, mask, ao_loc, False) - + ''' + vmat_tmp = cupy.empty([3,3,nao_non0,nao_non0]) + for i in range(3): + aow = rks_grad._make_dR_dao_w(ao_mask, wv[i]) + rks_grad._d1_dot_(aow, ao_mask[0].T, out=vmat_tmp[i]) + aow = [numint._scale_ao(ao_mask[:4], wv[i,:4]) for i in range(3)] + _d1d2_dot_(vmat_tmp, mol, ao_mask[1:4], aow, mask, ao_loc, False) + + aow = [numint._scale_ao(ao_mask[1], wv[i,4]) for i in range(3)] + _d1d2_dot_(vmat_tmp, mol, [ao_mask[XX], ao_mask[XY], ao_mask[XZ]], aow, mask, ao_loc, False) + aow = [numint._scale_ao(ao_mask[2], wv[i,4]) for i in range(3)] + _d1d2_dot_(vmat_tmp, mol, [ao_mask[YX], ao_mask[YY], ao_mask[YZ]], aow, mask, ao_loc, False) + aow = [numint._scale_ao(ao_mask[3], wv[i,4]) for i in range(3)] + _d1d2_dot_(vmat_tmp, mol, [ao_mask[ZX], ao_mask[ZY], ao_mask[ZZ]], aow, mask, ao_loc, False) + vmat_tmp = contract('pi,xypq->xyiq', coeff[mask], vmat_tmp) + vmat_tmp = contract('qj,xyiq->xyij', coeff[mask], vmat_tmp) + vmat[ia] += vmat_tmp + t0 = log.timer_debug1('integration', *t0) for ia in range(mol.natm): p0, p1 = aoslices[ia][2:] vmat[ia,:,:,:,p0:p1] += ipip[:,:,:,p0:p1] @@ -500,6 +535,7 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory): mol = hessobj.mol mf = hessobj.base + log = logger.new_logger(mol, mol.verbose) if hessobj.grids is not None: grids = hessobj.grids else: @@ -533,9 +569,12 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory): ao_deriv = 1 for ao, mask, weight, coords \ in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory): + t0 = log.init_timer() ao = contract('nip,ij->njp', ao, coeff[mask]) rho = numint.eval_rho2(opt.mol, ao[0], mo_coeff, mo_occ, mask, xctype) + t0 = log.timer_debug1('eval rho', *t0) vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3] + t0 = log.timer_debug1('eval vxc', *t0) wv = weight * vxc[0] aow = numint._scale_ao(ao[0], wv) v_ip += 
rks_grad._d1_dot_(ao[1:4], aow.T) @@ -550,15 +589,17 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory): aow = [numint._scale_ao(ao[0], wv[i]) for i in range(3)] vmat[ia] += rks_grad._d1_dot_(aow, ao[0].T) ao_dm0 = aow = None - + t0 = log.timer_debug1('integration', *t0) elif xctype == 'GGA': ao_deriv = 2 for ao, mask, weight, coords \ in ni.block_loop(mol, grids, nao, ao_deriv, max_memory): - # TODO: improve efficiency + t0 = log.init_timer() ao = contract('nip,ij->njp', ao, coeff[mask]) rho = numint.eval_rho2(mol, ao[:4], mo_coeff, mo_occ, mask, xctype) + t0 = log.timer_debug1('eval rho', *t0) vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3] + t0 = log.timer_debug1('eval vxc', *t0) wv = weight * vxc wv[0] *= .5 v_ip += rks_grad._gga_grad_sum_(ao, wv) @@ -572,17 +613,20 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory): wv[:,0] *= .5 aow = [numint._scale_ao(ao[:4], wv[i,:4]) for i in range(3)] vmat[ia] += rks_grad._d1_dot_(aow, ao[0].T) + t0 = log.timer_debug1('integration', *t0) ao_dm0 = aow = None - # TODO: debug and test elif xctype == 'MGGA': if grids.level < 5: logger.warn(mol, 'MGGA Hessian is sensitive to dft grids.') ao_deriv = 2 for ao, mask, weight, coords \ in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory): + t0 = log.init_timer() ao = contract('nip,ij->njp', ao, coeff[mask]) rho = numint.eval_rho2(opt.mol, ao[:10], mo_coeff, mo_occ, mask, xctype) + t0 = log.timer_debug1('eval rho', *t0) vxc, fxc = ni.eval_xc_eff(mf.xc, rho, 2, xctype=xctype)[1:3] + t0 = log.timer_debug1('eval vxc', *t0) wv = weight * vxc wv[0] *= .5 wv[4] *= .5 # for the factor 1/2 in tau @@ -602,7 +646,7 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory): aow = [numint._scale_ao(ao[j], wv[i,4]) for i in range(3)] vmat[ia] += rks_grad._d1_dot_(aow, ao[j].T) ao_dm0 = aow = None - + t0 = log.timer_debug1('integration', *t0) for ia in range(mol.natm): p0, p1 = aoslices[ia][2:] vmat[ia,:,p0:p1] += v_ip[:,p0:p1] diff --git 
a/gpu4pyscf/lib/cupy_helper.py b/gpu4pyscf/lib/cupy_helper.py index 2285b9dd..337d2c6a 100644 --- a/gpu4pyscf/lib/cupy_helper.py +++ b/gpu4pyscf/lib/cupy_helper.py @@ -142,17 +142,25 @@ def unpack_sparse(cderi_sparse, row, col, p0, p1, nao, out=None, stream=None): def add_sparse(a, b, indices): ''' - a[np.ix_(indices, indices)] += b + a[:,...,:np.ix_(indices, indices)] += b ''' - n = a.shape[0] - m = b.shape[0] - + assert a.flags.c_contiguous + assert b.flags.c_contiguous + n = a.shape[-1] + m = b.shape[-1] + if a.ndim > 2: + count = np.prod(a.shape[:-2]) + elif a.ndim == 2: + count = 1 + else: + raise RuntimeError('add_sparse only supports 2d or 3d tensor') err = libcupy_helper.add_sparse( ctypes.cast(a.data.ptr, ctypes.c_void_p), ctypes.cast(b.data.ptr, ctypes.c_void_p), ctypes.cast(indices.data.ptr, ctypes.c_void_p), ctypes.c_int(n), - ctypes.c_int(m) + ctypes.c_int(m), + ctypes.c_int(count) ) if err != 0: raise RecursionError('failed in sparse_add2d') diff --git a/gpu4pyscf/lib/cupy_helper/add_sparse.cu b/gpu4pyscf/lib/cupy_helper/add_sparse.cu index eddbf92a..d8033015 100644 --- a/gpu4pyscf/lib/cupy_helper/add_sparse.cu +++ b/gpu4pyscf/lib/cupy_helper/add_sparse.cu @@ -22,8 +22,8 @@ #define THREADS 32 #define BLOCK_DIM 32 -__global__ -void _add_sparse(double *a, double *b, int *indices, int n, int m) +__global__ +void _add_sparse(double *a, double *b, int *indices, int n, int m, int count) { int row = blockIdx.x * BLOCK_DIM + threadIdx.x; int col = blockIdx.y * BLOCK_DIM + threadIdx.y; @@ -32,17 +32,18 @@ void _add_sparse(double *a, double *b, int *indices, int n, int m) } int idx_a = indices[row] * n + indices[col]; int idx_b = row * m + col; - - a[idx_a] += b[idx_b]; + for (int i = 0; i < count; i++){ + a[idx_a + i*n*n] += b[idx_b + i*m*m]; + } } extern "C" { __host__ -int add_sparse(double *a, double *b, int *indices, int n, int m){ +int add_sparse(double *a, double *b, int *indices, int n, int m, int count){ int ntile = (m + THREADS - 1) / THREADS; dim3 
threads(THREADS, THREADS); dim3 blocks(ntile, ntile); - _add_sparse<<>>(a, b, indices, n, m); + _add_sparse<<>>(a, b, indices, n, m, count); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { return 1; diff --git a/gpu4pyscf/lib/cutensor.py b/gpu4pyscf/lib/cutensor.py index c590b7f6..aca3b082 100644 --- a/gpu4pyscf/lib/cutensor.py +++ b/gpu4pyscf/lib/cutensor.py @@ -19,6 +19,7 @@ from cupyx import cutensor from cupy_backends.cuda.libs import cutensor as cutensor_backend from cupy_backends.cuda.libs.cutensor import Handle +from gpu4pyscf.lib import logger libcutensor = None for lib_path in _preload_libs['cutensor']: @@ -31,6 +32,8 @@ _handle = Handle() _modes = {} _contraction_descriptors = {} +_contraction_plans = {} +_contraction_finds = {} cutensor_backend.init(_handle) @@ -82,10 +85,25 @@ def create_contraction_descriptor(handle, return desc def create_contraction_find(handle, algo=cutensor_backend.ALGO_DEFAULT): - find = cutensor_backend.ContractionFind() - cutensor_backend.initContractionFind(handle, find, algo) + key = (handle.ptr, algo) + if key in _contraction_finds: + find = _contraction_finds[key] + else: + find = cutensor_backend.ContractionFind() + cutensor_backend.initContractionFind(handle, find, algo) + _contraction_finds[key] = find return find +def create_contraction_plan(handle, desc, find, ws_size): + key = (handle.ptr, desc.ptr, find.ptr, ws_size) + if key in _contraction_plans: + plan = _contraction_plans[key] + else: + plan = cutensor_backend.ContractionPlan() + cutensor_backend.initContractionPlan(handle, plan, desc, find, ws_size) + _contraction_plans[key] = plan + return plan + def contraction(pattern, a, b, alpha, beta, out=None): pattern = pattern.replace(" ", "") str_a, rest = pattern.split(',') @@ -121,14 +139,15 @@ def contraction(pattern, a, b, alpha, beta, out=None): ws_size = cutensor_backend.contractionGetWorkspaceSize(_handle, desc, find, cutensor_backend.WORKSPACE_MIN) ws = cupy.empty(ws_size, dtype=np.int8) - plan = 
cutensor_backend.ContractionPlan() - cutensor_backend.initContractionPlan(_handle, plan, desc, find, ws_size) + plan = create_contraction_plan(_handle, desc, find, ws_size) alpha = np.asarray(alpha) beta = np.asarray(beta) + cutensor_backend.contraction(_handle, plan, alpha.ctypes.data, a.data.ptr, b.data.ptr, beta.ctypes.data, c.data.ptr, out.data.ptr, ws.data.ptr, ws_size) + return out import os diff --git a/gpu4pyscf/lib/gdft/CMakeLists.txt b/gpu4pyscf/lib/gdft/CMakeLists.txt index fc32786c..9aef39b0 100644 --- a/gpu4pyscf/lib/gdft/CMakeLists.txt +++ b/gpu4pyscf/lib/gdft/CMakeLists.txt @@ -15,7 +15,7 @@ #set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -arch=sm_80 --ptxas-options=-v") -add_library(gdft SHARED +add_library(gdft SHARED nr_eval_gto.cu contract_rho.cu gen_grids.cu diff --git a/gpu4pyscf/lib/gdft/contract_rho.cu b/gpu4pyscf/lib/gdft/contract_rho.cu index 4d1dc0b2..1957928e 100644 --- a/gpu4pyscf/lib/gdft/contract_rho.cu +++ b/gpu4pyscf/lib/gdft/contract_rho.cu @@ -17,6 +17,12 @@ * along with this program. If not, see . */ +#include +#include +#include +#include +#include +#include #include "contract_rho.cuh" // TODO: improve this? 
__global__ @@ -26,20 +32,20 @@ void GDFTcontract_rho_kernel(double *rho, double *bra, double *ket, int ngrids, const bool active = grid_id < ngrids; size_t Ngrids = ngrids; - int ao_id; double v = 0; if (active){ - for (ao_id = threadIdx.y; ao_id < nao; ao_id += BLKSIZEY) { - v += bra[grid_id + ao_id * Ngrids] * ket[grid_id + ao_id * Ngrids]; + for (int ao_id = threadIdx.y; ao_id < nao; ao_id += BLKSIZEY) { + int ket_idx = grid_id + ao_id * Ngrids; + v += bra[ket_idx] * ket[ket_idx]; } } - + __shared__ double buf[BLKSIZEX*(BLKSIZEY+1)]; int ix = threadIdx.x; int iy = threadIdx.y; int ixy = ix + BLKSIZEX * iy; buf[ixy] = v; __syncthreads(); - // assume block dim = 32 x 32 + if (blockDim.y >= 32 && iy < 16) buf[ixy] += buf[ixy + BLKSIZEX * 16]; __syncthreads(); if (blockDim.y >= 16 && iy < 8) buf[ixy] += buf[ixy + BLKSIZEX * 8]; __syncthreads(); if (blockDim.y >= 8 && iy < 4) buf[ixy] += buf[ixy + BLKSIZEX * 4]; __syncthreads(); @@ -51,6 +57,162 @@ void GDFTcontract_rho_kernel(double *rho, double *bra, double *ket, int ngrids, } } +__global__ +void GDFTcontract_rho4_kernel(double *rho, double *bra, double *ket, int ngrids, int nao, int count) +{ + int grid_id = blockIdx.x * blockDim.x + threadIdx.x; + const bool active = grid_id < ngrids; + size_t ket_stride = nao * ngrids; + size_t rho_stride = count * ngrids; + + __shared__ double buf[BLKSIZEX*(BLKSIZEY+1)]; + + for (int ia = 0; ia < count; ia++){ + double v[4] = {0.0, 0.0, 0.0, 0.0}; + if (active){ + for (int ao_id = threadIdx.y; ao_id < nao; ao_id += BLKSIZEY) { + int ket_idx = grid_id + ao_id * ngrids; + double bra_tmp = bra[ket_idx + ia * ket_stride]; + v[0] += bra_tmp * ket[0*ket_stride + ket_idx]; + v[1] += bra_tmp * ket[1*ket_stride + ket_idx]; + v[2] += bra_tmp * ket[2*ket_stride + ket_idx]; + v[3] += bra_tmp * ket[3*ket_stride + ket_idx]; + } + } + + int ix = threadIdx.x; + int iy = threadIdx.y; + int ixy = ix + BLKSIZEX * iy; + for (int i = 0; i < 4; i++){ + buf[ixy] = v[i]; __syncthreads(); + if 
(blockDim.y >= 32 && iy < 16) buf[ixy] += buf[ixy + BLKSIZEX * 16]; __syncthreads(); + if (blockDim.y >= 16 && iy < 8) buf[ixy] += buf[ixy + BLKSIZEX * 8]; __syncthreads(); + if (blockDim.y >= 8 && iy < 4) buf[ixy] += buf[ixy + BLKSIZEX * 4]; __syncthreads(); + if (blockDim.y >= 4 && iy < 2) buf[ixy] += buf[ixy + BLKSIZEX * 2]; __syncthreads(); + if (blockDim.y >= 2 && iy < 1) buf[ixy] += buf[ixy + BLKSIZEX * 1]; __syncthreads(); + + if (iy == 0 && active) { + rho[grid_id + ia * ngrids + rho_stride * i] = buf[ix]; + } + } + } +} + +__global__ +void GDFTcontract_rho_gga_kernel(double *rho, double *bra, double *ket, int ngrids, int nao) +{ + int grid_id = blockIdx.x * blockDim.x + threadIdx.x; + const bool active = grid_id < ngrids; + + size_t Ngrids = ngrids; + size_t ket_stride = nao * ngrids; + + double v[4] = {0.0, 0.0, 0.0, 0.0}; + if (active){ + for (int ao_id = threadIdx.y; ao_id < nao; ao_id += BLKSIZEY) { + int ket_idx = grid_id + ao_id * Ngrids; + double bra_tmp = bra[ket_idx]; + double ket_tmp = ket[ket_idx]; + + v[0] += bra_tmp * ket_tmp; + + ket_idx += ket_stride; + v[1] += bra_tmp * ket[ket_idx]; + v[1] += ket_tmp * bra[ket_idx]; + + ket_idx += ket_stride; + v[2] += bra_tmp * ket[ket_idx]; + v[2] += ket_tmp * bra[ket_idx]; + + ket_idx += ket_stride; + v[3] += bra_tmp * ket[ket_idx]; + v[3] += ket_tmp * bra[ket_idx]; + } + } + + __shared__ double buf[BLKSIZEX*(BLKSIZEY+1)]; + int ix = threadIdx.x; + int iy = threadIdx.y; + int ixy = ix + BLKSIZEX * iy; + + for (int i = 0; i < 4; i++){ + buf[ixy] = v[i]; __syncthreads(); + if (blockDim.y >= 32 && iy < 16) buf[ixy] += buf[ixy + BLKSIZEX * 16]; __syncthreads(); + if (blockDim.y >= 16 && iy < 8) buf[ixy] += buf[ixy + BLKSIZEX * 8]; __syncthreads(); + if (blockDim.y >= 8 && iy < 4) buf[ixy] += buf[ixy + BLKSIZEX * 4]; __syncthreads(); + if (blockDim.y >= 4 && iy < 2) buf[ixy] += buf[ixy + BLKSIZEX * 2]; __syncthreads(); + if (blockDim.y >= 2 && iy < 1) buf[ixy] += buf[ixy + BLKSIZEX * 1]; __syncthreads(); + + 
if (iy == 0 && active) { + rho[grid_id + ngrids * i] = 2.0 * buf[ix]; + } + } +} + + +__global__ +void GDFTcontract_rho_mgga_kernel(double *rho, double *bra, double *ket, int ngrids, int nao) +{ + int grid_id = blockIdx.x * blockDim.x + threadIdx.x; + const bool active = grid_id < ngrids; + + size_t Ngrids = ngrids; + size_t ket_stride = nao * ngrids; + + double v[5] = {0.0, 0.0, 0.0, 0.0, 0.0}; + if (active){ + for (int ao_id = threadIdx.y; ao_id < nao; ao_id += BLKSIZEY) { + int ket_idx = grid_id + ao_id * Ngrids; + double bra_tmp0 = bra[ket_idx]; + double ket_tmp0 = ket[ket_idx]; + + v[0] += bra_tmp0 * ket_tmp0; + + ket_idx += ket_stride; + double bra_tmp1 = bra[ket_idx]; + double ket_tmp1 = ket[ket_idx]; + v[1] += bra_tmp0 * ket_tmp1; + v[1] += ket_tmp0 * bra_tmp1; + v[4] += bra_tmp1 * ket_tmp1; + + ket_idx += ket_stride; + bra_tmp1 = bra[ket_idx]; + ket_tmp1 = ket[ket_idx]; + v[2] += bra_tmp0 * ket_tmp1; + v[2] += ket_tmp0 * bra_tmp1; + v[4] += bra_tmp1 * ket_tmp1; + + ket_idx += ket_stride; + bra_tmp1 = bra[ket_idx]; + ket_tmp1 = ket[ket_idx]; + v[3] += bra_tmp0 * ket_tmp1; + v[3] += ket_tmp0 * bra_tmp1; + v[4] += bra_tmp1 * ket_tmp1; + + } + } + + v[4] *= 0.5; + + __shared__ double buf[BLKSIZEX*(BLKSIZEY+1)]; + int ix = threadIdx.x; + int iy = threadIdx.y; + int ixy = ix + BLKSIZEX * iy; + + for (int i = 0; i < 5; i++){ + buf[ixy] = v[i]; __syncthreads(); + if (blockDim.y >= 32 && iy < 16) buf[ixy] += buf[ixy + BLKSIZEX * 16]; __syncthreads(); + if (blockDim.y >= 16 && iy < 8) buf[ixy] += buf[ixy + BLKSIZEX * 8]; __syncthreads(); + if (blockDim.y >= 8 && iy < 4) buf[ixy] += buf[ixy + BLKSIZEX * 4]; __syncthreads(); + if (blockDim.y >= 4 && iy < 2) buf[ixy] += buf[ixy + BLKSIZEX * 2]; __syncthreads(); + if (blockDim.y >= 2 && iy < 1) buf[ixy] += buf[ixy + BLKSIZEX * 1]; __syncthreads(); + + if (iy == 0 && active) { + rho[grid_id + ngrids * i] = 2.0 * buf[ix]; + } + } +} + __global__ void GDFTscale_ao_kernel(double *out, double *ket, double *wv, int ngrids, 
int nao, int nvar) @@ -71,3 +233,130 @@ void GDFTscale_ao_kernel(double *out, double *ket, double *wv, } out[ixy] = val; } + +__global__ +void GDFT_make_dR_dao_w_kernel(double *out, double *ket, double *wv, + int ngrids, int nao) +{ + int grid_id = blockIdx.x * blockDim.x + threadIdx.x; + int ao_id = blockIdx.y * blockDim.y + threadIdx.y; + if (grid_id >= ngrids || ao_id >= nao) { + return; + } + + size_t Ngrids = ngrids; + size_t Nag = nao * Ngrids; + size_t ixy = grid_id + ao_id * Ngrids; + + double wv0 = wv[grid_id + ngrids * 0]; + double wv1 = wv[grid_id + ngrids * 1]; + double wv2 = wv[grid_id + ngrids * 2]; + double wv3 = wv[grid_id + ngrids * 3]; + + double ket5 = ket[ixy + Nag * 5]; + double ket6 = ket[ixy + Nag * 6]; + double val; + val = ket[ixy + Nag * 1] * wv0; + val+= ket[ixy + Nag * 4] * wv1; + val+= ket5 * wv2; + val+= ket6 * wv3; + out[ixy + Nag * 0] = val; + + double ket8 = ket[ixy + Nag * 8]; + val = ket[ixy + Nag * 2] * wv0; + val+= ket5 * wv1; + val+= ket[ixy + Nag * 7] * wv2; + val+= ket8 * wv3; + out[ixy + Nag * 1] = val; + + val = ket[ixy + Nag * 3] * wv0; + val+= ket6 * wv1; + val+= ket8 * wv2; + val+= ket[ixy + Nag * 9] * wv3; + out[ixy + Nag * 2] = val; +} + + +extern "C"{ +__host__ +int GDFTcontract_rho(cudaStream_t stream, double *rho, double *bra, double *ket, int ngrids, int nao) +{ + dim3 threads(BLKSIZEX, BLKSIZEY); + dim3 blocks((ngrids+BLKSIZEX-1)/BLKSIZEX); + GDFTcontract_rho_kernel<<>>(rho, bra, ket, ngrids, nao); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA Error of GDFTcontract_rho: %s\n", cudaGetErrorString(err)); + return 1; + } + return 0; +} + +int GDFTcontract_rho4(cudaStream_t stream, double *rho, double *bra, double *ket, int ngrids, int nao, int count) +{ + dim3 threads(BLKSIZEX, BLKSIZEY); + dim3 blocks((ngrids+BLKSIZEX-1)/BLKSIZEX); + GDFTcontract_rho4_kernel<<>>(rho, bra, ket, ngrids, nao, count); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + 
fprintf(stderr, "CUDA Error of GDFTcontract_rho: %s\n", cudaGetErrorString(err)); + return 1; + } + return 0; +} + +int GDFTcontract_rho_gga(cudaStream_t stream, double *rho, double *bra, double *ket, int ngrids, int nao) +{ + dim3 threads(BLKSIZEX, BLKSIZEY); + dim3 blocks((ngrids+BLKSIZEX-1)/BLKSIZEX); + GDFTcontract_rho_gga_kernel<<>>(rho, bra, ket, ngrids, nao); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA Error of GDFTcontract_rho_gga: %s\n", cudaGetErrorString(err)); + return 1; + } + return 0; +} + +int GDFTcontract_rho_mgga(cudaStream_t stream, double *rho, double *bra, double *ket, int ngrids, int nao) +{ + dim3 threads(BLKSIZEX, BLKSIZEY); + dim3 blocks((ngrids+BLKSIZEX-1)/BLKSIZEX); + GDFTcontract_rho_mgga_kernel<<>>(rho, bra, ket, ngrids, nao); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA Error of GDFTcontract_rho_mgga: %s\n", cudaGetErrorString(err)); + return 1; + } + return 0; +} + +int GDFT_make_dR_dao_w(cudaStream_t stream, double *out, double *ket, double *wv, + int ngrids, int nao) +{ + dim3 threads(BLKSIZEX, BLKSIZEY); + dim3 blocks((ngrids+BLKSIZEX-1)/BLKSIZEX, (nao+BLKSIZEY-1)/BLKSIZEY); + GDFT_make_dR_dao_w_kernel<<>>(out, ket, wv, ngrids, nao); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA Error of GDFT_make_dR_dao_w: %s\n", cudaGetErrorString(err)); + return 1; + } + return 0; +} + +int GDFTscale_ao(cudaStream_t stream, double *out, double *ket, double *wv, + int ngrids, int nao, int nvar) +{ + dim3 threads(BLKSIZEX, BLKSIZEY); + dim3 blocks((ngrids+BLKSIZEX-1)/BLKSIZEX, (nao+BLKSIZEY-1)/BLKSIZEY); + GDFTscale_ao_kernel<<>>(out, ket, wv, ngrids, nao, nvar); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA Error of GDFTscale_ao: %s\n", cudaGetErrorString(err)); + return 1; + } + return 0; +} + +} \ No newline at end of file diff --git 
a/gpu4pyscf/lib/gdft/nr_eval_gto.cu b/gpu4pyscf/lib/gdft/nr_eval_gto.cu index b87ca434..da59b1c9 100644 --- a/gpu4pyscf/lib/gdft/nr_eval_gto.cu +++ b/gpu4pyscf/lib/gdft/nr_eval_gto.cu @@ -1640,31 +1640,4 @@ int GDFTeval_gto(cudaStream_t stream, double *ao, int deriv, int cart, //FREE(d_grids); return 0; } - -int GDFTcontract_rho(cudaStream_t stream, double *rho, double *bra, double *ket, int ngrids, int nao) -{ - dim3 threads(BLKSIZEX, BLKSIZEY); - dim3 blocks((ngrids+BLKSIZEX-1)/BLKSIZEX); - GDFTcontract_rho_kernel<<>>(rho, bra, ket, ngrids, nao); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - fprintf(stderr, "CUDA Error of GDFTcontract_rho: %s\n", cudaGetErrorString(err)); - return 1; - } - return 0; -} - -int GDFTscale_ao(cudaStream_t stream, double *out, double *ket, double *wv, - int ngrids, int nao, int nvar) -{ - dim3 threads(BLKSIZEX, BLKSIZEY); - dim3 blocks((ngrids+BLKSIZEX-1)/BLKSIZEX, (nao+BLKSIZEY-1)/BLKSIZEY); - GDFTscale_ao_kernel<<>>(out, ket, wv, ngrids, nao, nvar); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - fprintf(stderr, "CUDA Error of GDFTscale_ao: %s\n", cudaGetErrorString(err)); - return 1; - } - return 0; -} } diff --git a/gpu4pyscf/lib/logger.py b/gpu4pyscf/lib/logger.py index 58c3f45f..60497816 100644 --- a/gpu4pyscf/lib/logger.py +++ b/gpu4pyscf/lib/logger.py @@ -25,6 +25,8 @@ WARN = lib.logger.WARN DEBUG = lib.logger.DEBUG DEBUG1= lib.logger.DEBUG1 +TIMER_LEVEL = lib.logger.TIMER_LEVEL +flush = lib.logger.flush if sys.version_info < (3, 0): process_clock = time.clock @@ -33,27 +35,66 @@ process_clock = time.process_time perf_counter = time.perf_counter -def _timer_debug1(rec, msg, cpu0=None, wall0=None, sync=True): + +def init_timer(rec): + if rec.verbose >= TIMER_LEVEL: + e0 = cupy.cuda.Event() + e0.record() + return (process_clock(), perf_counter(), e0) + elif rec.verbose >= DEBUG: + return (process_clock(), perf_counter()) + else: + return process_clock(), + +def timer(rec, msg, 
cpu0=None, wall0=None, gpu0=None): + if cpu0 is None: + cpu0 = rec._t0 + if wall0 and gpu0: + rec._t0, rec._w0, rec._e0 = process_clock(), perf_counter(), cupy.cuda.Event() + if rec.verbose >= TIMER_LEVEL: + rec._e0.record() + rec._e0.synchronize() + flush(rec, ' CPU time for %20s %9.2f sec, wall time %9.2f sec, GPU time for %9.2f ms' + % (msg, rec._t0-cpu0, rec._w0-wall0, cupy.cuda.get_elapsed_time(gpu0,rec._e0))) + return rec._t0, rec._w0, rec._e0 + elif wall0: + rec._t0, rec._w0 = process_clock(), perf_counter() + if rec.verbose >= TIMER_LEVEL: + flush(rec, ' CPU time for %20s %9.2f sec, wall time %9.2f sec' + % (msg, rec._t0-cpu0, rec._w0-wall0)) + return rec._t0, rec._w0 + else: + rec._t0 = process_clock() + if rec.verbose >= TIMER_LEVEL: + flush(rec, ' CPU time for %20s %9.2f sec' % (msg, rec._t0-cpu0)) + return rec._t0, + +def _timer_debug1(rec, msg, cpu0=None, wall0=None, gpu0=None, sync=True): if rec.verbose >= DEBUG1: - if(sync): cupy.cuda.stream.get_current_stream().synchronize() - return timer(rec, msg, cpu0, wall0) + return timer(rec, msg, cpu0, wall0, gpu0) + elif wall0 and gpu0: + rec._t0, rec._w0, rec._e0 = process_clock(), perf_counter(), cupy.cuda.Event() + rec._e0.record() + return rec._t0, rec._w0, rec._e0 elif wall0: rec._t0, rec._w0 = process_clock(), perf_counter() return rec._t0, rec._w0 else: rec._t0 = process_clock() - return rec._t0 + return rec._t0, info = lib.logger.info debug = lib.logger.debug debug1 = lib.logger.debug1 -timer = lib.logger.timer +debug2 = lib.logger.debug2 timer_debug1 = _timer_debug1 class Logger(lib.logger.Logger): def __init__(self, stdout=sys.stdout, verbose=NOTE): super().__init__(stdout=stdout, verbose=verbose) timer_debug1 = _timer_debug1 + timer = timer + init_timer = init_timer def new_logger(rec=None, verbose=None): '''Create and return a :class:`Logger` object diff --git a/gpu4pyscf/scf/hf.py b/gpu4pyscf/scf/hf.py index 1d58a02e..8e1a9855 100644 --- a/gpu4pyscf/scf/hf.py +++ b/gpu4pyscf/scf/hf.py @@ -25,11 
+25,11 @@ from functools import reduce from pyscf import gto from pyscf import lib as pyscf_lib -from pyscf.lib import logger from pyscf.scf import hf, jk, _vhf from gpu4pyscf import lib from gpu4pyscf.lib.cupy_helper import eigh, load_library, tag_array from gpu4pyscf.scf import diis +from gpu4pyscf.lib import logger LMAX_ON_GPU = 4 FREE_CUPY_CACHE = True @@ -40,8 +40,8 @@ def get_jk(mol, dm, hermi=1, vhfopt=None, with_j=True, with_k=True, omega=None, verbose=None): '''Compute J, K matrices with CPU-GPU hybrid algorithm ''' - cput0 = (logger.process_clock(), logger.perf_counter()) log = logger.new_logger(mol, verbose) + cput0 = log.init_timer() if hermi != 1: raise NotImplementedError('JK-builder only supports hermitian density matrix') if omega is None: @@ -253,8 +253,8 @@ def _get_jk(mf, mol=None, dm=None, hermi=1, with_j=True, with_k=True, if omega is not None: assert omega >= 0 - cput0 = (logger.process_clock(), logger.perf_counter()) log = logger.new_logger(mf) + cput0 = log.init_timer() log.debug3('apply get_jk on gpu') if omega is None: if hasattr(mf, '_opt_gpu'): @@ -369,9 +369,9 @@ def _kernel(mf, conv_tol=1e-10, conv_tol_grad=None, dump_chk=True, dm0=None, callback=None, conv_check=True, **kwargs): conv_tol = mf.conv_tol mol = mf.mol - t0 = (logger.process_clock(), logger.perf_counter()) verbose = mf.verbose log = logger.new_logger(mol, verbose) + t0 = log.init_timer() if(conv_tol_grad is None): conv_tol_grad = conv_tol**.5 logger.info(mf, 'Set gradient conv threshold to %g', conv_tol_grad) @@ -415,7 +415,7 @@ def _kernel(mf, conv_tol=1e-10, conv_tol_grad=None, t_beg = time.time() for cycle in range(mf.max_cycle): - t0 = (logger.process_clock(), logger.perf_counter()) + t0 = log.init_timer() dm_last = dm last_hf_e = e_tot @@ -575,7 +575,7 @@ class RHF(hf.RHF): quad_moment = _quad_moment def scf(self, dm0=None, **kwargs): - cput0 = (logger.process_clock(), logger.perf_counter()) + cput0 = logger.init_timer(self) self.dump_flags() self.build(self.mol) @@ 
-630,8 +630,8 @@ def __init__(self, mol, intor, prescreen='CVHFnoscreen', self._dmcondname = dmcondname def build(self, cutoff=1e-13, group_size=None, diag_block_with_triu=False): - cput0 = (logger.process_clock(), logger.perf_counter()) mol = self.mol + cput0 = logger.init_timer(mol) # Sort basis according to angular momentum and contraction patterns so # as to group the basis functions to blocks in GPU kernel. l_ctrs = mol._bas[:,[gto.ANG_OF, gto.NPRIM_OF]] From 564b3fe5dcd7258609f2f2b4346b60e1a7c785f4 Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Sat, 4 Nov 2023 14:18:21 -0700 Subject: [PATCH 18/19] Update README.md --- README.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 58b87b32..b7100771 100644 --- a/README.md +++ b/README.md @@ -4,40 +4,40 @@ Installation -------- For **CUDA 11.x** -``` +```sh pip3 install gpu4pyscf-cuda11x ``` and install cutensor -``` +```sh python -m cupyx.tools.install_library --cuda 11.x --library cutensor ``` For **CUDA 12.x** -``` +```sh pip3 install gpu4pyscf-cuda12x ``` and install cutensor -``` +```sh python -m cupyx.tools.install_library --cuda 12.x --library cutensor ``` Compilation -------- The package provides ```dockerfiles/compile/Dockerfile``` for creating the CUDA environment. One can compile the package with -``` +```sh sh build.sh ``` This script will automatically download LibXC, and compile it with CUDA. The script will also build the wheel for installation. The compilation can take more than 5 mins. 
Then, one can either install the wheel with -``` +```sh cd output pip3 install gpu4pyscf-* ``` or simply add it to ```PYTHONPATH``` -``` +```sh export PYTHONPATH="${PYTHONPATH}:/your-local-path/gpu4pyscf" ``` Then install cutensor for acceleration -``` +```sh python -m cupyx.tools.install_library --cuda 11.x --library cutensor ``` @@ -64,7 +64,7 @@ Limitations Examples -------- -``` +```python import pyscf from gpu4pyscf.dft import rks From e110e9a2ec0c61399b647182cef41d1e3aeb2c95 Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Wed, 8 Nov 2023 18:20:10 -0800 Subject: [PATCH 19/19] Add chelpg charges in qmmm folder. (#1) (#56) * Add chelpg charges in qmmm folder. * Update chelpg.py * Update chelpg.py * Add unit test for chelpg, and compare with Qchem * Add an example to calculate chelpg Co-authored-by: puzhichen <147788878+puzhichen@users.noreply.github.com>