From 1dff11de63ccccbe632e47d7920af0e30a9306c0 Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Mon, 1 Jan 2024 22:59:54 -0800 Subject: [PATCH] Improve eval gto (#72) * fixed a bug in screen_index * added unit test for to_gpu * new grids group scheme * use grid_aligned in gpu4pyscf.__config__ * fixed a bug in eval_ao --- examples/dft_driver.py | 5 +- gpu4pyscf/__config__.py | 4 +- gpu4pyscf/df/df.py | 1 - gpu4pyscf/df/df_jk.py | 47 ++++++--- gpu4pyscf/dft/gen_grid.py | 26 +++-- gpu4pyscf/dft/numint.py | 12 ++- gpu4pyscf/dft/tests/test_ao_values.py | 2 +- gpu4pyscf/lib/gdft/gen_grids.cu | 59 ++++++++--- gpu4pyscf/lib/gdft/nr_eval_gto.cu | 143 +++++++++++--------------- 9 files changed, 168 insertions(+), 131 deletions(-) diff --git a/examples/dft_driver.py b/examples/dft_driver.py index 676658d0..2b2c8cd6 100644 --- a/examples/dft_driver.py +++ b/examples/dft_driver.py @@ -35,10 +35,10 @@ max_memory=32000) # set verbose >= 6 for debugging timer -mol.verbose = 6 +mol.verbose = 4 mf_df = rks.RKS(mol, xc=args.xc).density_fit(auxbasis=args.auxbasis) -mf_df.verbose = 6 +mf_df.verbose = 4 if args.solvent: mf_df = mf_df.PCM() @@ -52,6 +52,7 @@ mf_df.direct_scf_tol = 1e-14 mf_df.direct_scf = 1e-14 mf_df.conv_tol = 1e-10 +mf_df.chkfile = None e_tot = mf_df.kernel() scf_time = time.time() - start_time print(f'compute time for energy: {scf_time:.3f} s') diff --git a/gpu4pyscf/__config__.py b/gpu4pyscf/__config__.py index bce8e79a..987ca87c 100644 --- a/gpu4pyscf/__config__.py +++ b/gpu4pyscf/__config__.py @@ -4,8 +4,8 @@ GB = 1024*1024*1024 # such as A100-80G if props['totalGlobalMem'] >= 64 * GB: - min_ao_blksize = 128 - min_grid_blksize = 256*256#128*128 + min_ao_blksize = 256 + min_grid_blksize = 256*256 ao_aligned = 32 grid_aligned = 128 mem_fraction = 0.9 diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py index 35d79447..0b40d26e 100644 --- a/gpu4pyscf/df/df.py +++ b/gpu4pyscf/df/df.py @@ -279,7 +279,6 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False): with data_stream: for i in range(naux): cderi_block[i].get(out=cderi[i,ij0:ij1]) - t1 = log.timer_debug1(f'solve {cp_ij_id} / {nq}', *t1) cupy.cuda.Device().synchronize() diff --git a/gpu4pyscf/df/df_jk.py b/gpu4pyscf/df/df_jk.py index 8863a89a..23a46850 100644 --- a/gpu4pyscf/df/df_jk.py +++ b/gpu4pyscf/df/df_jk.py @@ -19,6 +19,7 @@ import copy import cupy import numpy +from cupy import cublas from pyscf import lib, scf, __config__ from pyscf.scf import dhf from pyscf.df import df_jk, addons @@ -264,42 +265,44 @@ def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e- dms = take_last2d(dms, ao_idx) t1 = log.timer_debug1('init jk', *t0) + rows = dfobj.intopt.cderi_row + cols = dfobj.intopt.cderi_col if with_j: - rows = dfobj.intopt.cderi_row - cols = dfobj.intopt.cderi_col dm_sparse = dms[:,rows,cols] dm_sparse[:, dfobj.intopt.cderi_diag] *= .5 vj = cupy.zeros_like(dms) - vj_tmp = cupy.zeros_like(dms) if with_k: vk = cupy.zeros_like(dms) - def get_j(cderi_sparse): - rhoj = 2.0*dm_sparse.dot(cderi_sparse) - vj_sparse = cupy.dot(rhoj, cderi_sparse.T) - vj_tmp[:,rows,cols] = vj_sparse - vj_tmp[:,cols,rows] = vj_sparse - vj_sparse = None - return vj_tmp - # SCF K matrix with occ if nset == 1 and hasattr(dms_tag, 'occ_coeff'): occ_coeff = cupy.asarray(dms_tag.occ_coeff[ao_idx, :], order='C') nocc = occ_coeff.shape[1] blksize = dfobj.get_blksize(extra=nao*nocc) + if with_j: + vj_packed = cupy.zeros_like(dm_sparse) + for cderi, cderi_sparse in dfobj.loop(blksize=blksize, unpack=with_k): # leading dimension is 1 if with_j: - vj += get_j(cderi_sparse) + rhoj = 2.0*dm_sparse.dot(cderi_sparse) + vj_packed += cupy.dot(rhoj, cderi_sparse.T) if with_k: rhok = contract('Lij,jk->Lki', cderi, occ_coeff) #vk[0] += contract('Lki,Lkj->ij', rhok, rhok) - contract('Lki,Lkj->ij', rhok, rhok, alpha=1.0, beta=1.0, out=vk[0]) + cublas.syrk('T', rhok.reshape([-1,nao]), out=vk[0], alpha=1.0, beta=1.0, lower=True) + if with_j: + vj[:,rows,cols] = vj_packed + vj[:,cols,rows] = vj_packed if with_k: + vk[0][numpy.diag_indices(nao)] *= 0.5 + transpose_sum(vk) vk *= 2.0 # CP-HF K matrix elif hasattr(dms_tag, 'mo1'): + if with_j: + vj_sparse = cupy.zeros_like(dm_sparse) mo1 = dms_tag.mo1[:,ao_idx,:] nocc = mo1.shape[2] # 2.0 due to rhok and rhok1, put it here for symmetry @@ -307,7 +310,9 @@ def get_j(cderi_sparse): blksize = dfobj.get_blksize(extra=2*nao*nocc) for cderi, cderi_sparse in dfobj.loop(blksize=blksize, unpack=with_k): if with_j: - vj += get_j(cderi_sparse) + #vj += get_j(cderi_sparse) + rhoj = 2.0*dm_sparse.dot(cderi_sparse) + vj_sparse += cupy.dot(rhoj, cderi_sparse.T) if with_k: rhok = contract('Lij,jk->Lki', cderi, occ_coeff) for i in range(mo1.shape[0]): @@ -315,18 +320,28 @@ def get_j(cderi_sparse): #vk[i] += contract('Lki,Lkj->ij', rhok, rhok1) contract('Lki,Lkj->ij', rhok, rhok1, alpha=1.0, beta=1.0, out=vk[i]) occ_coeff = rhok1 = rhok = mo1 = None + if with_j: + vj[:,rows,cols] = vj_sparse + vj[:,cols,rows] = vj_sparse if with_k: - vk = vk + vk.transpose(0,2,1) + #vk = vk + vk.transpose(0,2,1) + transpose_sum(vk) # general K matrix with density matrix else: + if with_j: + vj_sparse = cupy.zeros_like(dm_sparse) blksize = dfobj.get_blksize() for cderi, cderi_sparse in dfobj.loop(blksize=blksize, unpack=with_k): if with_j: - vj += get_j(cderi_sparse) + rhoj = 2.0*dm_sparse.dot(cderi_sparse) + vj_sparse += cupy.dot(rhoj, cderi_sparse.T) if with_k: for k in range(nset): rhok = contract('Lij,jk->Lki', cderi, dms[k]) vk[k] += contract('Lki,Lkj->ij', cderi, rhok) + if with_j: + vj[:,rows,cols] = vj_sparse + vj[:,cols,rows] = vj_sparse rhok = None rev_ao_idx = dfobj.intopt.rev_ao_idx diff --git a/gpu4pyscf/dft/gen_grid.py b/gpu4pyscf/dft/gen_grid.py index 9d1979d2..40c6947a 100644 --- a/gpu4pyscf/dft/gen_grid.py +++ b/gpu4pyscf/dft/gen_grid.py @@ -186,16 +186,17 @@ def gen_grids_partition(atm_coords, coords, a): natm = atm_coords.shape[0] ngrids = coords.shape[0] assert ngrids < 65535 * 16 - x_i = cupy.expand_dims(atm_coords, axis=1) - x_g = cupy.expand_dims(coords, axis=0) - squared_diff = (x_i - x_g)**2 - dist_ig = cupy.sum(squared_diff, axis=2)**0.5 + #x_i = cupy.expand_dims(atm_coords, axis=1) + #x_g = cupy.expand_dims(coords, axis=0) + #squared_diff = (x_i - x_g)**2 + #dist_ig = cupy.sum(squared_diff, axis=2)**0.5 - x_j = cupy.expand_dims(atm_coords, axis=0) - squared_diff = (x_i - x_j)**2 - dist_ij = cupy.sum(squared_diff, axis=2)**0.5 + #x_j = cupy.expand_dims(atm_coords, axis=0) + #squared_diff = (x_i - x_j)**2 + #dist_ij = cupy.sum(squared_diff, axis=2)**0.5 pbecke = cupy.ones([natm, ngrids], order='C') + ''' err = libgdft.GDFTgen_grid_partition( ctypes.cast(stream.ptr, ctypes.c_void_p), ctypes.cast(pbecke.data.ptr, ctypes.c_void_p), @@ -205,6 +206,17 @@ def gen_grids_partition(atm_coords, coords, a): ctypes.c_int(ngrids), ctypes.c_int(natm) ) + ''' + atm_coords = cupy.asarray(atm_coords, order='F') + err = libgdft.GDFTgen_grid_partition( + ctypes.cast(stream.ptr, ctypes.c_void_p), + ctypes.cast(pbecke.data.ptr, ctypes.c_void_p), + ctypes.cast(coords.data.ptr, ctypes.c_void_p), + ctypes.cast(atm_coords.data.ptr, ctypes.c_void_p), + ctypes.cast(a.data.ptr, ctypes.c_void_p), + ctypes.c_int(ngrids), + ctypes.c_int(natm) + ) if err != 0: raise RuntimeError('CUDA Error') return pbecke diff --git a/gpu4pyscf/dft/numint.py b/gpu4pyscf/dft/numint.py index 7f17c472..f7758f1c 100644 --- a/gpu4pyscf/dft/numint.py +++ b/gpu4pyscf/dft/numint.py @@ -185,7 +185,8 @@ def eval_rho2(mol, ao, mo_coeff, mo_occ, non0tab=None, xctype='LDA', ao_loc = mol.ao_loc_nr() #cpos = cupy.einsum('ij,j->ij', mo_coeff[:,mo_occ>0], cupy.sqrt(mo_occ[mo_occ>0])) - cpos = mo_coeff[:,mo_occ>0] * cupy.sqrt(mo_occ[mo_occ>0]) + #cpos = mo_coeff[:,mo_occ>0] * cupy.sqrt(mo_occ[mo_occ>0]) + cpos = (mo_coeff * mo_occ**0.5)[:,mo_occ>0] if xctype == 'LDA' or xctype == 'HF': c0 = _dot_ao_dm(mol, ao, cpos, non0tab, shls_slice, ao_loc) #:rho = numpy.einsum('pi,pi->p', c0, c0) @@ -194,11 +195,12 @@ def eval_rho2(mol, ao, mo_coeff, mo_occ, non0tab=None, xctype='LDA', rho = cupy.empty((4,ngrids)) c0 = _dot_ao_dm(mol, ao[0], cpos, non0tab, shls_slice, ao_loc) #:rho[0] = numpy.einsum('pi,pi->p', c0, c0) - rho[0] = _contract_rho(c0, c0) + _contract_rho(c0, c0, rho=rho[0]) for i in range(1, 4): c1 = _dot_ao_dm(mol, ao[i], cpos, non0tab, shls_slice, ao_loc) #:rho[i] = numpy.einsum('pi,pi->p', c0, c1) * 2 # *2 for +c.c. - rho[i] = _contract_rho(c0, c1) * 2 + _contract_rho(c0, c1, rho=rho[i]) + rho[i] *= 2 else: # meta-GGA if with_lapl: # rho[4] = \nabla^2 rho, rho[5] = 1/2 |nabla f|^2 @@ -209,7 +211,7 @@ def eval_rho2(mol, ao, mo_coeff, mo_occ, non0tab=None, xctype='LDA', tau_idx = 4 c0 = _dot_ao_dm(mol, ao[0], cpos, non0tab, shls_slice, ao_loc) #:rho[0] = numpy.einsum('pi,pi->p', c0, c0) - rho[0] = _contract_rho(c0, c0) + _contract_rho(c0, c0, rho=rho[0]) rho[tau_idx] = 0 for i in range(1, 4): @@ -1212,7 +1214,7 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000, sorted_ao: by default ao_value is sorted for GPU ''' if grids.coords is None: - grids.build(with_non0tab=True) + grids.build(with_non0tab=False, sort_grids=True) if nao is None: nao = mol.nao ngrids = grids.coords.shape[0] diff --git a/gpu4pyscf/dft/tests/test_ao_values.py b/gpu4pyscf/dft/tests/test_ao_values.py index 3bb183fe..78ae6c98 100644 --- a/gpu4pyscf/dft/tests/test_ao_values.py +++ b/gpu4pyscf/dft/tests/test_ao_values.py @@ -30,7 +30,7 @@ def setUpModule(): ''' mol_sph = pyscf.M( atom=atom, - basis='ccpvdz', + basis='ccpvqz', spin=None, cart = 0, output = '/dev/null') diff --git a/gpu4pyscf/lib/gdft/gen_grids.cu b/gpu4pyscf/lib/gdft/gen_grids.cu index 27b9eca5..2c484ae3 100644 --- a/gpu4pyscf/lib/gdft/gen_grids.cu +++ b/gpu4pyscf/lib/gdft/gen_grids.cu @@ -22,28 +22,59 @@ #define NATOM_PER_BLOCK 128 __global__ -void GDFTgen_grid_kernel(double *pbecke, const double *dist_ig, const double *dist_ij, - const double *a, int ngrids, int natm) +void GDFTgen_grid_kernel(double *pbecke, const double *coords, const double *atm_coords, const double *a, +int ngrids, int natm) { int grid_id = blockIdx.x * blockDim.x + threadIdx.x; const bool active = grid_id < ngrids; - - __shared__ double dij_smem[NATOM_PER_BLOCK]; + double xg = 0.0; + double yg = 0.0; + double zg = 0.0; + if(active){ + xg = coords[3*grid_id+0]; + yg = coords[3*grid_id+1]; + zg = coords[3*grid_id+2]; + } + __shared__ double xj[NATOM_PER_BLOCK]; + __shared__ double yj[NATOM_PER_BLOCK]; + __shared__ double zj[NATOM_PER_BLOCK]; __shared__ double a_smem[NATOM_PER_BLOCK]; + __shared__ double dij_smem[NATOM_PER_BLOCK]; + const int tx = threadIdx.x; for (int atom_i = 0; atom_i < natm; atom_i++){ + double xi = atm_coords[atom_i]; + double yi = atm_coords[atom_i + natm]; + double zi = atm_coords[atom_i + 2*natm]; + double becke = 2.0; - double dig = 0.0; + double dx, dy, dz, dig; if (active){ // distance between grids and atom i - dig = dist_ig[atom_i * ngrids + grid_id]; + dx = xg - xi; + dy = yg - yi; + dz = zg - zi; + dig = norm3d(dx, dy, dz); } for (int j = 0; j < natm; j+=blockDim.x){ int atom_idx = j + tx; if (atom_idx < natm){ + double xj_t = atm_coords[atom_idx]; + double yj_t = atm_coords[atom_idx + natm]; + double zj_t = atm_coords[atom_idx + 2*natm]; + // distance between atom i and atom j - dij_smem[tx] = dist_ij[atom_i * natm + atom_idx]; + dx = xi - xj_t; + dy = yi - yj_t; + dz = zi - zj_t; + double dij = norm3d(dx, dy, dz); + + // distance between atom i and atom j + dij_smem[tx] = dij; + xj[tx] = xj_t; + yj[tx] = yj_t; + zj[tx] = zj_t; a_smem[tx] = a[atom_i * natm + atom_idx]; } __syncthreads(); @@ -51,10 +82,11 @@ void GDFTgen_grid_kernel(double *pbecke, const double *dist_ig, const double *di for (int l = 0, M = min(NATOM_PER_BLOCK, natm-j); l < M; ++l){ int atom_j = j + l; // distance between grids and atom j - double djg = 0; - if (active){ - djg = dist_ig[atom_j * ngrids + grid_id]; - } + dx = xg - xj[l]; + dy = yg - yj[l]; + dz = zg - zj[l]; + double djg = norm3d(dx, dy, dz); + double dij = dij_smem[l]; double aij = a_smem[l]; double g = (atom_i == atom_j) ? 0.0 : (dig - djg) / dij; @@ -122,12 +154,11 @@ void GDFTgroup_grids_kernel(int* group_ids, const double* atom_coords, const dou extern "C"{ __host__ int GDFTgen_grid_partition(cudaStream_t stream, double *pbecke, - const double *dist_ig, const double *dist_ij, - const double *a, int ngrids, int natm) +const double *coords, const double *atm_coords, const double *a, int ngrids, int natm) { dim3 threads(NATOM_PER_BLOCK); dim3 blocks((ngrids+NATOM_PER_BLOCK-1)/NATOM_PER_BLOCK); - GDFTgen_grid_kernel<<>>(pbecke, dist_ig, dist_ij, a, ngrids, natm); + GDFTgen_grid_kernel<<>>(pbecke, coords, atm_coords, a, ngrids, natm); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess){ fprintf(stderr, "CUDA Error of gen grids: %s\n", cudaGetErrorString(err)); diff --git a/gpu4pyscf/lib/gdft/nr_eval_gto.cu b/gpu4pyscf/lib/gdft/nr_eval_gto.cu index c64af7fc..0295c0c0 100644 --- a/gpu4pyscf/lib/gdft/nr_eval_gto.cu +++ b/gpu4pyscf/lib/gdft/nr_eval_gto.cu @@ -298,9 +298,6 @@ static void _cart_kernel_deriv1(BasOffsets offsets) gtoz[1*ngrids+grid_id] = az * ry; gtoz[2*ngrids+grid_id] = az * rz + ce; }else if (ANG == 2) { - double bx = ce * rx; - double by = ce * ry; - double bz = ce * rz; gto [ grid_id] = ce * rx * rx; gto [1*ngrids+grid_id] = ce * rx * ry; gto [2*ngrids+grid_id] = ce * rx * rz; @@ -308,33 +305,27 @@ static void _cart_kernel_deriv1(BasOffsets offsets) gto [4*ngrids+grid_id] = ce * ry * rz; gto [5*ngrids+grid_id] = ce * rz * rz; double ax = ce_2a * rx; - gtox[ grid_id] = ax * rx * rx + 2 * bx; - gtox[1*ngrids+grid_id] = ax * rx * ry + by; - gtox[2*ngrids+grid_id] = ax * rx * rz + bz; + gtox[ grid_id] = (ax * rx + 2 * ce) * rx; + gtox[1*ngrids+grid_id] = (ax * rx + ce) * ry; + gtox[2*ngrids+grid_id] = (ax * rx + ce) * rz; gtox[3*ngrids+grid_id] = ax * ry * ry; gtox[4*ngrids+grid_id] = ax * ry * rz; gtox[5*ngrids+grid_id] = ax * rz * rz; double ay = ce_2a * ry; gtoy[ grid_id] = ay * rx * rx; - gtoy[1*ngrids+grid_id] = ay * rx * ry + bx; + gtoy[1*ngrids+grid_id] = (ay * ry + ce) * rx; gtoy[2*ngrids+grid_id] = ay * rx * rz; - gtoy[3*ngrids+grid_id] = ay * ry * ry + 2 * by; - gtoy[4*ngrids+grid_id] = ay * ry * rz + bz; + gtoy[3*ngrids+grid_id] = (ay * ry + 2 * ce) * ry; + gtoy[4*ngrids+grid_id] = (ay * ry + ce) * rz; gtoy[5*ngrids+grid_id] = ay * rz * rz; double az = ce_2a * rz; gtoz[ grid_id] = az * rx * rx; gtoz[1*ngrids+grid_id] = az * rx * ry; - gtoz[2*ngrids+grid_id] = az * rx * rz + bx; + gtoz[2*ngrids+grid_id] = (az * rz + ce) * rx; gtoz[3*ngrids+grid_id] = az * ry * ry; - gtoz[4*ngrids+grid_id] = az * ry * rz + by; - gtoz[5*ngrids+grid_id] = az * rz * rz + 2 * bz; + gtoz[4*ngrids+grid_id] = (az * rz + ce) * ry; + gtoz[5*ngrids+grid_id] = (az * rz + 2 * ce) * rz; } else if (ANG == 3) { - double bxx = ce * rx * rx; - double bxy = ce * rx * ry; - double bxz = ce * rx * rz; - double byy = ce * ry * ry; - double byz = ce * ry * rz; - double bzz = ce * rz * rz; gto [ grid_id] = ce * rx * rx * rx; gto [1*ngrids+grid_id] = ce * rx * rx * ry; gto [2*ngrids+grid_id] = ce * rx * rx * rz; @@ -346,43 +337,40 @@ static void _cart_kernel_deriv1(BasOffsets offsets) gto [8*ngrids+grid_id] = ce * ry * rz * rz; gto [9*ngrids+grid_id] = ce * rz * rz * rz; double ax = ce_2a * rx; - gtox[ grid_id] = ax * rx * rx * rx + 3 * bxx; - gtox[1*ngrids+grid_id] = ax * rx * rx * ry + 2 * bxy; - gtox[2*ngrids+grid_id] = ax * rx * rx * rz + 2 * bxz; - gtox[3*ngrids+grid_id] = ax * rx * ry * ry + byy; - gtox[4*ngrids+grid_id] = ax * rx * ry * rz + byz; - gtox[5*ngrids+grid_id] = ax * rx * rz * rz + bzz; + gtox[ grid_id] = (ax * rx + 3 * ce) * rx * rx; + gtox[1*ngrids+grid_id] = (ax * rx + 2 * ce) * rx * ry; + gtox[2*ngrids+grid_id] = (ax * rx + 2 * ce) * rx * rz; + gtox[3*ngrids+grid_id] = (ax * rx + ce) * ry * ry; + gtox[4*ngrids+grid_id] = (ax * rx + ce) * ry * rz; + gtox[5*ngrids+grid_id] = (ax * rx + ce) * rz * rz; gtox[6*ngrids+grid_id] = ax * ry * ry * ry; gtox[7*ngrids+grid_id] = ax * ry * ry * rz; gtox[8*ngrids+grid_id] = ax * ry * rz * rz; gtox[9*ngrids+grid_id] = ax * rz * rz * rz; double ay = ce_2a * ry; gtoy[ grid_id] = ay * rx * rx * rx; - gtoy[1*ngrids+grid_id] = ay * rx * rx * ry + bxx; + gtoy[1*ngrids+grid_id] = (ay * ry + ce) * rx * rx; gtoy[2*ngrids+grid_id] = ay * rx * rx * rz; - gtoy[3*ngrids+grid_id] = ay * rx * ry * ry + 2 * bxy; - gtoy[4*ngrids+grid_id] = ay * rx * ry * rz + bxz; + gtoy[3*ngrids+grid_id] = (ay * ry + 2 * ce) * rx * ry; + gtoy[4*ngrids+grid_id] = (ay * ry + ce) * rx * rz; gtoy[5*ngrids+grid_id] = ay * rx * rz * rz; - gtoy[6*ngrids+grid_id] = ay * ry * ry * ry + 3 * byy; - gtoy[7*ngrids+grid_id] = ay * ry * ry * rz + 2 * byz; - gtoy[8*ngrids+grid_id] = ay * ry * rz * rz + bzz; + gtoy[6*ngrids+grid_id] = (ay * ry + 3 * ce) * ry * ry; + gtoy[7*ngrids+grid_id] = (ay * ry + 2 * ce) * ry * rz; + gtoy[8*ngrids+grid_id] = (ay * ry + ce) * rz * rz; gtoy[9*ngrids+grid_id] = ay * rz * rz * rz; double az = ce_2a * rz; gtoz[ grid_id] = az * rx * rx * rx; gtoz[1*ngrids+grid_id] = az * rx * rx * ry; - gtoz[2*ngrids+grid_id] = az * rx * rx * rz + bxx; + gtoz[2*ngrids+grid_id] = (az * rz + ce) * rx * rx; gtoz[3*ngrids+grid_id] = az * rx * ry * ry; - gtoz[4*ngrids+grid_id] = az * rx * ry * rz + bxy; - gtoz[5*ngrids+grid_id] = az * rx * rz * rz + 2 * bxz; + gtoz[4*ngrids+grid_id] = (az * rz + ce) * rx * ry; + gtoz[5*ngrids+grid_id] = (az * rz + 2 * ce) * rx * rz; gtoz[6*ngrids+grid_id] = az * ry * ry * ry; - gtoz[7*ngrids+grid_id] = az * ry * ry * rz + byy; - gtoz[8*ngrids+grid_id] = az * ry * rz * rz + 2 * byz; - gtoz[9*ngrids+grid_id] = az * rz * rz * rz + 3 * bzz; + gtoz[7*ngrids+grid_id] = (az * rz + ce) * ry * ry; + gtoz[8*ngrids+grid_id] = (az * rz + 2 * ce) * ry * rz; + gtoz[9*ngrids+grid_id] = (az * rz + 3 * ce) * rz * rz; } - // There is a bug in the comment. - // Using a general formulation. - // FIXME later - /*else if (ANG == 4) { + else if (ANG == 4) { double ax = ce_2a * rx; double ay = ce_2a * ry; double az = ce_2a * rz; @@ -417,8 +405,8 @@ static void _cart_kernel_deriv1(BasOffsets offsets) gtox[3 *ngrids+grid_id] = ax * rx * rx * ry * ry + 2 * bxyy; gtox[4 *ngrids+grid_id] = ax * rx * rx * ry * rz + 2 * bxyz; gtox[5 *ngrids+grid_id] = ax * rx * rx * rz * rz + 2 * bxzz; - gtox[6 *ngrids+grid_id] = ax * rx * ry * ry * ry + byzz; - gtox[7 *ngrids+grid_id] = ax * rx * ry * ry * rz + byzz; + gtox[6 *ngrids+grid_id] = ax * rx * ry * ry * ry + byyy; + gtox[7 *ngrids+grid_id] = ax * rx * ry * ry * rz + byyz; gtox[8 *ngrids+grid_id] = ax * rx * ry * rz * rz + byzz; gtox[9 *ngrids+grid_id] = ax * rx * rz * rz * rz + bzzz; gtox[10*ngrids+grid_id] = ax * ry * ry * ry * ry; @@ -457,7 +445,6 @@ static void _cart_kernel_deriv1(BasOffsets offsets) gtoz[13*ngrids+grid_id] = az * ry * rz * rz * rz + 3 * byzz; gtoz[14*ngrids+grid_id] = az * rz * rz * rz * rz + 4 * bzzz; } - */ else{ double fx0[16], fy0[16], fz0[16]; @@ -1119,26 +1106,16 @@ static void _sph_kernel_deriv1(BasOffsets offsets) gto[8 *ngrids+grid_id] = 0.625835735449176134 * (g0 + g10) - 3.755014412695056800 * g3; double ax = ce_2a * rx; - double bxxx = ce * rx * rx * rx; - double bxxy = ce * rx * rx * ry; - double bxxz = ce * rx * rx * rz; - double bxyy = ce * rx * ry * ry; - double bxyz = ce * rx * ry * rz; - double bxzz = ce * rx * rz * rz; - double byyy = ce * ry * ry * ry; - double byyz = ce * ry * ry * rz; - double byzz = ce * ry * rz * rz; - double bzzz = ce * rz * rz * rz; - g0 = ax * rx * rx * rx * rx + 4 * bxxx; - g1 = ax * rx * rx * rx * ry + 3 * bxxy; - g2 = ax * rx * rx * rx * rz + 3 * bxxz; - g3 = ax * rx * rx * ry * ry + 2 * bxyy; - g4 = ax * rx * rx * ry * rz + 2 * bxyz; - g5 = ax * rx * rx * rz * rz + 2 * bxzz; - g6 = ax * rx * ry * ry * ry + byzz; - g7 = ax * rx * ry * ry * rz + byzz; - g8 = ax * rx * ry * rz * rz + byzz; - g9 = ax * rx * rz * rz * rz + bzzz; + g0 = (ax * rx + 4 * ce) * rx * rx * rx; + g1 = (ax * rx + 3 * ce) * rx * rx * ry; + g2 = (ax * rx + 3 * ce) * rx * rx * rz; + g3 = (ax * rx + 2 * ce) * rx * ry * ry; + g4 = (ax * rx + 2 * ce) * rx * ry * rz; + g5 = (ax * rx + 2 * ce) * rx * rz * rz; + g6 = (ax * rx + ce) * ry * ry * ry; + g7 = (ax * rx + ce) * ry * ry * rz; + g8 = (ax * rx + ce) * ry * rz * rz; + g9 = (ax * rx + ce) * rz * rz * rz; g10 = ax * ry * ry * ry * ry; g11 = ax * ry * ry * ry * rz; g12 = ax * ry * ry * rz * rz; @@ -1156,19 +1133,19 @@ static void _sph_kernel_deriv1(BasOffsets offsets) double ay = ce_2a * ry; g0 = ay * rx * rx * rx * rx; - g1 = ay * rx * rx * rx * ry + bxxx; + g1 = (ay * ry + ce) * rx * rx * rx; g2 = ay * rx * rx * rx * rz; - g3 = ay * rx * rx * ry * ry + 2 * bxxy; - g4 = ay * rx * rx * ry * rz + bxxz; + g3 = (ay * ry + 2 * ce) * rx * rx * ry; + g4 = (ay * ry + ce) * rx * rx * rz; g5 = ay * rx * rx * rz * rz; - g6 = ay * rx * ry * ry * ry + 3 * bxyy; - g7 = ay * rx * ry * ry * rz + 2 * bxyz; - g8 = ay * rx * ry * rz * rz + bxzz; + g6 = (ay * ry + 3 * ce) * rx * ry * ry; + g7 = (ay * ry + 2 * ce) * rx * ry * rz; + g8 = (ay * ry + ce) * rx * rz * rz; g9 = ay * rx * rz * rz * rz; - g10 = ay * ry * ry * ry * ry + 4 * byyy; - g11 = ay * ry * ry * ry * rz + 3 * byyz; - g12 = ay * ry * ry * rz * rz + 2 * byzz; - g13 = ay * ry * rz * rz * rz + bzzz; + g10 = (ay * ry + 4 * ce) * ry * ry * ry; + g11 = (ay * ry + 3 * ce) * ry * ry * rz; + g12 = (ay * ry + 2 * ce) * ry * rz * rz; + g13 = (ay * ry + ce) * rz * rz * rz; g14 = ay * rz * rz * rz * rz; gtoy[ grid_id] = 2.503342941796704538 * g1 - 2.503342941796704530 * g6 ; gtoy[1 *ngrids+grid_id] = 5.310392309339791593 * g4 - 1.770130769779930530 * g11; @@ -1183,19 +1160,19 @@ static void _sph_kernel_deriv1(BasOffsets offsets) double az = ce_2a * rz; g0 = az * rx * rx * rx * rx; g1 = az * rx * rx * rx * ry; - g2 = az * rx * rx * rx * rz + bxxx; + g2 = (az * rz + ce) * rx * rx * rx; g3 = az * rx * rx * ry * ry; - g4 = az * rx * rx * ry * rz + bxxy; - g5 = az * rx * rx * rz * rz + 2 * bxxz; + g4 = (az * rz + ce) * rx * rx * ry; + g5 = (az * rz + 2 * ce) * rx * rx * rz; g6 = az * rx * ry * ry * ry; - g7 = az * rx * ry * ry * rz + bxyy; - g8 = az * rx * ry * rz * rz + 2 * bxyz; - g9 = az * rx * rz * rz * rz + 3 * bxzz; + g7 = (az * rz + ce) * rx * ry * ry; + g8 = (az * rz + 2 * ce) * rx * ry * rz; + g9 = (az * rz + 3 * ce) * rx * rz * rz; g10 = az * ry * ry * ry * ry; - g11 = az * ry * ry * ry * rz + byyy; - g12 = az * ry * ry * rz * rz + 2 * byyz; - g13 = az * ry * rz * rz * rz + 3 * byzz; - g14 = az * rz * rz * rz * rz + 4 * bzzz; + g11 = (az * rz + ce) * ry * ry * ry; + g12 = (az * rz + 2 * ce) * ry * ry * rz; + g13 = (az * rz + 3 * ce) * ry * rz * rz; + g14 = (az * rz + 4 * ce) * rz * rz * rz; gtoz[ grid_id] = 2.503342941796704538 * g1 - 2.503342941796704530 * g6 ; gtoz[1 *ngrids+grid_id] = 5.310392309339791593 * g4 - 1.770130769779930530 * g11; gtoz[2 *ngrids+grid_id] = 5.677048174545360108 * g8 - 0.946174695757560014 * g1 - 0.946174695757560014 * g6 ;