Skip to content

Commit

Permalink
Merge pull request #912 from omarkahmed/omarkahmed/openmp-intel-gpu
Browse files Browse the repository at this point in the history
Add Intel Xe Max Support for the CCSD module
  • Loading branch information
nwchemgit authored Nov 28, 2023
2 parents dc7962f + a3fc85f commit 3c1b014
Show file tree
Hide file tree
Showing 8 changed files with 1,270 additions and 32 deletions.
23 changes: 20 additions & 3 deletions src/ccsd/GNUmakefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

include ../config/makefile.h

ifdef USE_IMAX_OPENMP_TRPDRV
SUBDIRS += module
endif

OBJ_OPTIMIZE = \
ccden_driver.o \
ccden_interm2.o \
Expand Down Expand Up @@ -93,17 +97,16 @@ endif
ccsd_trpdrv_bgp2.F \
ccsd_trpdrv_offload.F \
ccsd_trpdrv_openacc.F \
ccsd_trpdrv_openmp_imax.F \
moints_trp.F

ifeq ($(TARGET),BGP)
OBJ_OPTIMIZE += ccsd_trpdrv_bgp2.o ccsd_tengy_bgp2.o ccsd_tengy_bgp.o
USES_BLAS += ccsd_trpdrv_bgp2.F
LIB_DEFINES += -DBGP
endif

ifdef USE_MIC_TRPDRV
OBJ_OPTIMIZE += ccsd_trpdrv_offload.o
USES_BLAS += ccsd_trpdrv_offload.F
LIB_DEFINES += -DUSE_MIC_TRPDRV
endif
ifeq ($(_FC),xlf)
Expand All @@ -119,9 +122,23 @@ ifeq ($(HAVE_SET_GA_PROPERTY),Y)
LIB_DEFINES += -DHAVE_SET_GA_PROPERTY
endif

ifdef USE_IMAX_OPENMP_TRPDRV

OBJ_OPTIMIZE += ccsd_trpdrv_openmp_imax.o

OBJ_OPTIMIZE += ccsd_trpdrv_omp_reduce_f.o

FOPTIONS += -O3 -fiopenmp -fopenmp-targets=spir64="-mllvm -vpo-paropt-opt-data-sharing-for-reduction=false -mllvm -vpo-paropt-atomic-free-reduction-par-global=false" -switch offload_modvars -mllvm -vpo-paropt-atomic-free-reduction-slm=true -qmkl -DMKL_ILP64 -I"${MKLROOT}/include" -I ${NWCHEM_TOP}/src/ccsd/module -mllvm -vpo-paropt-dispatch-codegen-version=1 -switch -use-host-usm-for-implicit-reduction-map

COPTIONS:=$(filter-out -fopenmp,$(COPTIONS))
COPTIONS:=$(filter-out -O1,$(COPTIONS))

COPTIONS += -O3 -fiopenmp -fopenmp-targets=spir64="-mllvm -vpo-paropt-opt-data-sharing-for-reduction=false -mllvm -vpo-paropt-atomic-free-reduction-par-global=false" -mllvm -vpo-paropt-atomic-free-reduction-slm=true -qmkl -DMKL_ILP64 -I"${MKLROOT}/include" -mllvm -vpo-paropt-dispatch-codegen-version=1

endif

ifdef USE_OPENACC_TRPDRV
OBJ_OPTIMIZE += ccsd_trpdrv_openacc.o
USES_BLAS += ccsd_trpdrv_openacc.F
FOPTIONS += -DUSE_OPENACC_TRPDRV
ifeq ($(_FC),pgf90)
FOPTIONS += -Mextend -acc -cuda -cudalib=cublas
Expand Down
44 changes: 42 additions & 2 deletions src/ccsd/aoccsd2.F
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@
subroutine aoccsd(basis,ncor,nocc,nvir,ndel,nact,nbf,maxit,
& convi,iprt,cmo,eorb,blen,
& g_ncoul, g_nexch, RefEner,
$ CC_Theory, rtdb, mxvec, geom, Tol2e, occd, oconverged)
$ CC_Theory, rtdb, mxvec, geom, Tol2e, occd,
& oconverged)
#if defined(USE_IMAX_OPENMP_TRPDRV)
use omp_lib, only: omp_interop_kind, omp_interop_none
#endif
implicit none
#include "errquit.fh"
C $Id$
Expand All @@ -16,6 +20,7 @@ subroutine aoccsd(basis,ncor,nocc,nvir,ndel,nact,nbf,maxit,
logical oconverged, occd, use_trpdrv_nb
logical use_trpdrv_omp, use_trpdrv_bgp2
logical use_trpdrv_omp_mp
logical use_trpdrv_openmp_imax
logical use_trpdrv_openacc
logical use_trpdrv_offload
c
Expand All @@ -31,7 +36,10 @@ subroutine aoccsd(basis,ncor,nocc,nvir,ndel,nact,nbf,maxit,
#include "msgids.fh"
#include "ccsdps.fh"
c
#if defined(USE_IMAX_OPENMP_TRPDRV)
c ccsd
integer(kind = omp_interop_kind) :: dummy_obj = omp_interop_none
#endif
Integer i
logical stat
integer nsh,maxbfsh,max2e,mem2
Expand Down Expand Up @@ -721,6 +729,9 @@ subroutine ccsd_iterdrv2(rtdb,basis,nsh,ncor,nocc,nvir,nact,nbf,
if (.not. rtdb_get(rtdb, 'ccsd:use_trpdrv_openacc', mt_log, 1,
1 use_trpdrv_openacc))
2 use_trpdrv_openacc=.false.
if (.not. rtdb_get(rtdb, 'ccsd:use_trpdrv_openmp_imax', mt_log,
1 1, use_trpdrv_openmp_imax))
2 use_trpdrv_openmp_imax=.false.
if (.not. rtdb_get(rtdb, 'ccsd:use_trpdrv_offload', mt_log, 1,
1 use_trpdrv_offload))
2 use_trpdrv_offload=.false.
Expand Down Expand Up @@ -955,6 +966,20 @@ subroutine ccsd_iterdrv2(rtdb,basis,nsh,ncor,nocc,nvir,nact,nbf,
& blen, cmo, ncor, nocc, nvir, ndel, Tol2E)
c print *,'call trpdrv ',nvpass
call ga_sync()
#if defined(USE_IMAX_OPENMP_TRPDRV)
! Dummy parallel construct
!$omp parallel num_threads(8)
!$omp end parallel

! Dummy target construct
!$omp target
!$omp end target

! Dummy interop object
!$omp interop init(prefer_type("sycl"),targetsync: dummy_obj)
!$omp interop destroy(dummy_obj)
#endif

tx(2)=tcgtime()
c
if (use_trpdrv_omp) then
Expand Down Expand Up @@ -992,8 +1017,23 @@ subroutine ccsd_iterdrv2(rtdb,basis,nsh,ncor,nocc,nvir,nact,nbf,
call ccsd_trpdrv_openacc(dbl_mb(k_t1),eorb,
$ g_objo,g_objv,g_coul,g_exch,ncor,nocc,nvir,iprt,
$ empt(1),empt(2),oseg_lo,oseg_hi,kchunk)
!!#else
!! call errquit('aoccsd: trpdrv_openacc disabled ',0,0)
!!#endif
! use_trpdrv_openmp_imax
else if (use_trpdrv_openmp_imax) then
#elif defined(USE_IMAX_OPENMP_TRPDRV)
if (iam.eq.0.and.oprint) then
write(luout,1818) nvpass,util_wallsec()
call util_flush(luout)
endif
1818 format(' commencing triples evaluation - OpenMP',
I 'MAX GPU version',i8,' at ',f20.2,' secs')
call ccsd_trpdrv_offload_xe(dbl_mb(k_t1),eorb,
$ g_objo,g_objv,g_coul,g_exch,ncor,nocc,nvir,iprt,
$ empt(1),empt(2),oseg_lo,oseg_hi,kchunk)
#else
call errquit('aoccsd: trpdrv_openacc disabled ',0,0)
call errquit('aoccsd: trpdrv_openmp_gpu disabled ',0,0)
#endif
c
elseif (use_trpdrv_omp_mp) then
Expand Down
121 changes: 121 additions & 0 deletions src/ccsd/ccsd_trpdrv_omp_reduce_f.F
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
subroutine ccsd_trpdrv_omp_fbody_reduce_new (f1n, f1t, f2n, f2t,
& f3n, f3t, f4n, f4t,
& eorb,
& ncor, nocc, nvir,
& emp4, emp5,
& i, k,
& eaijk,
& dintc1, dintx1, t1v1,
& dintc2, dintx2, t1v2)

! Purpose:
!   Accumulate into emp4 and emp5 the sums, over every index pair
!   (b,c) with b,c = 1..nvir, of products built from the eight
!   nvir-by-nvir work arrays f1n..f4t, the vectors t1v1/t1v2,
!   dintc1/dintc2, dintx1/dintx2, and an energy denominator.
!   The double loop runs inside an OpenMP target region with a
!   sum reduction.
!
! Arguments:
!   f1n..f4n, f1t..f4t (nvir,nvir)  work arrays.  Declared
!                intent(inout) but only read in this routine.
!                Each is read at both (b,c) and (c,b).
!   eorb(*)      only elements ncor+nocc+1 .. ncor+nocc+nvir are
!                read (presumably orbital energies - TODO confirm
!                against the caller).
!   ncor, nocc   used only as the offset ncor+nocc into eorb.
!   nvir         extent of both loops and of all array arguments.
!   emp4, emp5   running totals, updated in place (presumably the
!                two triples energy contributions - TODO confirm).
!   i, k         only compared with each other: the "k" partial
!                sums are added only when i differs from k.
!   eaijk        scalar added to the eorb pair in the denominator.
!   dintc1, dintx1, t1v1 (nvir)  vectors used in the emp5 "i" sum.
!   dintc2, dintx2, t1v2 (nvir)  vectors used in the emp5 "k" sum.
!
! NOTE(review): no entity of omp_lib or iso_fortran_env is
! referenced in this routine; the two use statements look
! unneeded - confirm before removing.
use omp_lib
use iso_fortran_env
implicit none

double precision, intent(inout) :: emp4, emp5
double precision, intent(inout) :: f1n(nvir,nvir), f1t(nvir,nvir)
double precision, intent(inout) :: f2n(nvir,nvir), f2t(nvir,nvir)
double precision, intent(inout) :: f3n(nvir,nvir), f3t(nvir,nvir)
double precision, intent(inout) :: f4n(nvir,nvir), f4t(nvir,nvir)
double precision, intent(in) :: eorb(*)
double precision, intent(in) :: eaijk
double precision, intent(in) :: dintc1(nvir), dintx1(nvir)
double precision, intent(in) :: dintc2(nvir), dintx2(nvir)
double precision, intent(in) :: t1v1(nvir), t1v2(nvir)
integer, intent(in) :: ncor, nocc, nvir
integer, intent(in) :: i, k

! Partial sums for this call, plus per-iteration scalar copies of
! the array elements used inside the loop body.
double precision :: emp4i,emp5i,emp4k,emp5k, denom
double precision :: f1nbc,f1tbc,f1ncb,f1tcb
double precision :: f2nbc,f2tbc,f2ncb,f2tcb
double precision :: f3nbc,f3tbc,f3ncb,f3tcb
double precision :: f4nbc,f4tbc,f4ncb,f4tcb
double precision :: t1v1b,t1v2b,dintx1c,dintx2c,dintc1c,dintc2c
integer :: b,c

! The reduction variables start from zero on every call.  The
! literal 0.0 is default real, but zero converts exactly to
! double precision.
emp4i = 0.0
emp5i = 0.0
emp4k = 0.0
emp5k = 0.0

! Offloaded, collapsed (b,c) loop nest with a "+" reduction on the
! four partial sums; every scalar temporary is private.
! There are no map clauses here: the arrays are either mapped
! implicitly or are already present in a device data environment
! set up by the caller - TODO confirm which.
! NOTE(review): the clause lines below continue the directive;
! check that they carry the continuation form required by the
! source form in use (fixed form expects the sentinel with a
! continuation character in column 6, e.g. "!$omp&").
!$omp target teams distribute parallel do collapse(2)
& reduction(+:emp5i,emp4i,emp5k,emp4k)
& private(f1nbc,f1tbc,f1ncb,f1tcb,f2nbc,f2tbc,f2ncb,f2tcb)
& private(f3nbc,f3tbc,f3ncb,f3tcb,f4nbc,f4tbc,f4ncb,f4tcb)
& private(t1v1b,t1v2b,dintx1c,dintx2c,dintc1c,dintc2c)
& private(denom) firstprivate(eaijk,nvir,ncor,nocc)
do b=1,nvir
do c=1,nvir
! Negative reciprocal of the denominator for this (b,c) pair.
! -1.0 is a default-real literal, exactly representable in
! double precision.
denom=-1.0/( eorb(ncor+nocc+b)+eorb(ncor+nocc+c)+eaijk )

! Load each work array once at (b,c) and once at the transposed
! position (c,b); the formulas below use only these scalars.
f1nbc = f1n(b,c);
f1tbc = f1t(b,c);
f1ncb = f1n(c,b);
f1tcb = f1t(c,b);

f2nbc = f2n(b,c);
f2tbc = f2t(b,c);
f2ncb = f2n(c,b);
f2tcb = f2t(c,b);

f3nbc = f3n(b,c);
f3tbc = f3t(b,c);
f3ncb = f3n(c,b);
f3tcb = f3t(c,b);

f4nbc = f4n(b,c);
f4tbc = f4t(b,c);
f4ncb = f4n(c,b);
f4tcb = f4t(c,b);

! The t1v vectors are indexed by b, the dint vectors by c.
t1v1b = t1v1(b);
t1v2b = t1v2(b);

dintx1c = dintx1(c);
dintx2c = dintx2(c);
dintc1c = dintc1(c);
dintc2c = dintc2(c);

! emp4 "i" term.
emp4i = emp4i
& + (denom * (f1tbc+f1ncb+f2tcb+f3nbc+f4ncb)
& * (f1tbc-f2tbc*2-f3tbc*2+f4tbc)
& - denom * (f1nbc+f1tcb+f2ncb+f3ncb)
& * (f1tbc*2-f2tbc-f3tbc+f4tbc*2)
& + denom * 3 * (f1nbc*(f1nbc+f3ncb+f4tcb*2)
& + f2nbc*f2tcb+f3nbc*f4tbc))

! emp4 "k" term: the emp4i formula with every "n" array swapped
! for its "t" counterpart and vice versa.
emp4k = emp4k
& + (denom * (f1nbc+f1tcb+f2ncb+f3tbc+f4tcb)
& * (f1nbc-f2nbc*2-f3nbc*2+f4nbc)
& - denom * (f1tbc+f1ncb+f2tcb+f3tcb)
& * (f1nbc*2-f2nbc-f3nbc+f4nbc*2)
& + denom * 3 * (f1tbc*(f1tbc+f3tcb+f4ncb*2)
& + f2tbc*f2ncb+f3tbc*f4nbc))

! emp5 "i" term, weighted by t1v1(b) and dintx1(c) / dintc1(c).
emp5i = emp5i
& + (denom * t1v1b * dintx1c
& * (f1tbc+f2nbc+f4ncb
& - (f3tbc+f4nbc+f2ncb+f1nbc+f2tbc+f3ncb)*2
& + (f3nbc+f4tbc+f1ncb)*4)
& + denom * t1v1b * dintc1c
& * (f1nbc+f4nbc+f1tcb -(f2nbc+f3nbc+f2tcb)*2))

! emp5 "k" term: the emp5i formula with "n" and "t" swapped and
! t1v2 / dintx2 / dintc2 in place of t1v1 / dintx1 / dintc1.
emp5k = emp5k
& + (denom * t1v2b * dintx2c
& * (f1nbc+f2tbc+f4tcb
& - (f3nbc+f4tbc+f2tcb +f1tbc+f2nbc+f3tcb)*2
& + (f3tbc+f4nbc+f1tcb)*4)
& + denom * t1v2b * dintc2c
& * (f1tbc+f4tbc+f1ncb -(f2tbc+f3tbc+f2ncb)*2))
enddo
enddo
!$omp end target teams distribute parallel do

! Fold the partial sums into the caller's totals.  The "i" sums
! are always added; the "k" sums only when i and k differ.
emp4 = emp4 + emp4i
emp5 = emp5 + emp5i
if (i.ne.k) then
emp4 = emp4 + emp4k
emp5 = emp5 + emp5k
end if ! (i.ne.k)

end
Loading

0 comments on commit 3c1b014

Please sign in to comment.