From 03d554f3b67517aac5024452d3ac9dc89019f2b2 Mon Sep 17 00:00:00 2001 From: edoapra Date: Tue, 21 Nov 2023 21:52:02 -0800 Subject: [PATCH 1/5] removed misleading ifxold & addded USE_IMAX_OPENMP_TRPDRV --- src/config/makefile.h | 29 +++-------------------------- 1 file changed, 3 insertions(+), 26 deletions(-) diff --git a/src/config/makefile.h b/src/config/makefile.h index da6a50855c..ee6a9e8a9b 100644 --- a/src/config/makefile.h +++ b/src/config/makefile.h @@ -2358,32 +2358,6 @@ ifneq ($(TARGET),LINUX) endif # support for Intel(R) Fortran compiler - ifeq ($(_FC),ifxold) - DEFINES += -DIFCV8 -DIFCLINUX - FOPTIONS += -fpp -align - ifneq ($(V),1) - FOPTIONS += -Qoption,fpp,-w0 - endif - FOPTIMIZE = -g -O3 -fimf-arch-consistency=true - ifdef USE_I4FLAGS - else - FOPTIONS += -i8 - endif - ifdef USE_OPENMP - FOPTIONS += -fopenmp - ifdef USE_OFFLOAD - FOPTIONS += -fopenmp-targets=spirv64 - endif - endif - ifdef OPENBLAS_USES_OPENMP - LDOPTIONS += -fopenmp - endif - ifdef IFX_DEBUG - # debugging remove at some point - FOPTIONS += -std95 -what - endif - FDEBUG = $(FOPTIMIZE) - endif # support for traditional Intel(R) Fortran compiler @@ -2467,6 +2441,9 @@ ifneq ($(TARGET),LINUX) FOPTIONS += -fiopenmp ifdef USE_OFFLOAD FOPTIONS += -fopenmp-targets=spirv64 + ifdef USE_IMAX_OPENMP_TRPDRV + DEFINES += -DUSE_IMAX_OPENMP_TRPDRV + endif endif else FOPTIONS += -qopenmp From 05f5cb5cfb4284955ac2f12365c4d851e714db40 Mon Sep 17 00:00:00 2001 From: Omar Khalil Ahmed Date: Tue, 21 Nov 2023 22:04:12 -0800 Subject: [PATCH 2/5] Add Intel Xe GPU support through OpenMP Offload --- src/ccsd/GNUmakefile | 23 +- src/ccsd/aoccsd2.F | 44 +- src/ccsd/ccsd_trpdrv_omp_reduce_f.F | 121 +++ src/ccsd/ccsd_trpdrv_openmp_imax.F | 1044 ++++++++++++++++++++++ src/ccsd/module/GNUmakefile | 32 + src/ccsd/module/ccsd_trpdrv_mkl_module.F | 5 + 6 files changed, 1264 insertions(+), 5 deletions(-) create mode 100644 src/ccsd/ccsd_trpdrv_omp_reduce_f.F create mode 100644 src/ccsd/ccsd_trpdrv_openmp_imax.F create mode 100644 src/ccsd/module/GNUmakefile create mode 100644 src/ccsd/module/ccsd_trpdrv_mkl_module.F diff --git a/src/ccsd/GNUmakefile b/src/ccsd/GNUmakefile index 775d30eb4e..681f040e71 100644 --- a/src/ccsd/GNUmakefile +++ b/src/ccsd/GNUmakefile @@ -2,6 +2,10 @@ include ../config/makefile.h +ifdef USE_IMAX_OPENMP_TRPDRV +SUBDIRS += module +endif + OBJ_OPTIMIZE = \ ccden_driver.o \ ccden_interm2.o \ @@ -90,9 +94,6 @@ endif aoccsd2.F \ ccsd_fsig1.F \ ccsd_fsig2.F \ - ccsd_trpdrv_bgp2.F \ - ccsd_trpdrv_offload.F \ - ccsd_trpdrv_openacc.F \ moints_trp.F ifeq ($(TARGET),BGP) @@ -119,6 +120,22 @@ ifeq ($(HAVE_SET_GA_PROPERTY),Y) LIB_DEFINES += -DHAVE_SET_GA_PROPERTY endif +ifdef USE_IMAX_OPENMP_TRPDRV + + OBJ_OPTIMIZE += ccsd_trpdrv_openmp_imax.o + USES_BLAS += ccsd_trpdrv_openmp_imax.F + + OBJ_OPTIMIZE += ccsd_trpdrv_omp_reduce_f.o + + FOPTIONS += -O3 -fiopenmp -fopenmp-targets=spir64="-mllvm -vpo-paropt-opt-data-sharing-for-reduction=false -mllvm -vpo-paropt-atomic-free-reduction-par-global=false" -switch offload_modvars -mllvm -vpo-paropt-atomic-free-reduction-slm=true -qmkl -DMKL_ILP64 -I"${MKLROOT}/include" -I ${NWCHEM_TOP}/src/ccsd/module -mllvm -vpo-paropt-dispatch-codegen-version=1 -switch -use-host-usm-for-implicit-reduction-map + + COPTIONS:=$(filter-out -fopenmp,$(COPTIONS)) + COPTIONS:=$(filter-out -O1,$(COPTIONS)) + + COPTIONS += -O3 -fiopenmp -fopenmp-targets=spir64="-mllvm -vpo-paropt-opt-data-sharing-for-reduction=false -mllvm -vpo-paropt-atomic-free-reduction-par-global=false" -mllvm -vpo-paropt-atomic-free-reduction-slm=true -qmkl -DMKL_ILP64 -I"${MKLROOT}/include" -mllvm -vpo-paropt-dispatch-codegen-version=1 + +endif + ifdef USE_OPENACC_TRPDRV OBJ_OPTIMIZE += ccsd_trpdrv_openacc.o USES_BLAS += ccsd_trpdrv_openacc.F diff --git a/src/ccsd/aoccsd2.F b/src/ccsd/aoccsd2.F index cb2a3c48c2..ec436e385d 100644 --- a/src/ccsd/aoccsd2.F +++ b/src/ccsd/aoccsd2.F @@ -2,7 +2,11 @@ subroutine aoccsd(basis,ncor,nocc,nvir,ndel,nact,nbf,maxit, & convi,iprt,cmo,eorb,blen, & g_ncoul, g_nexch, RefEner, - $ CC_Theory, rtdb, mxvec, geom, Tol2e, occd, oconverged) + $ CC_Theory, rtdb, mxvec, geom, Tol2e, occd, + & oconverged) +#if defined(USE_IMAX_OPENMP_TRPDRV) + use omp_lib, only: omp_interop_kind, omp_interop_none +#endif implicit none #include "errquit.fh" C $Id$ @@ -16,6 +20,7 @@ subroutine aoccsd(basis,ncor,nocc,nvir,ndel,nact,nbf,maxit, logical oconverged, occd, use_trpdrv_nb logical use_trpdrv_omp, use_trpdrv_bgp2 logical use_trpdrv_omp_mp + logical use_trpdrv_openmp_imax logical use_trpdrv_openacc logical use_trpdrv_offload c @@ -31,7 +36,10 @@ subroutine aoccsd(basis,ncor,nocc,nvir,ndel,nact,nbf,maxit, #include "msgids.fh" #include "ccsdps.fh" c +#if defined(USE_IMAX_OPENMP_TRPDRV) c ccsd + integer(kind = omp_interop_kind) :: dummy_obj = omp_interop_none +#endif Integer i logical stat integer nsh,maxbfsh,max2e,mem2 @@ -721,6 +729,9 @@ subroutine ccsd_iterdrv2(rtdb,basis,nsh,ncor,nocc,nvir,nact,nbf, if (.not. rtdb_get(rtdb, 'ccsd:use_trpdrv_openacc', mt_log, 1, 1 use_trpdrv_openacc)) 2 use_trpdrv_openacc=.false. + if (.not. rtdb_get(rtdb, 'ccsd:use_trpdrv_openmp_imax', mt_log, + 1 1, use_trpdrv_openmp_imax)) + 2 use_trpdrv_openmp_imax=.false. if (.not. rtdb_get(rtdb, 'ccsd:use_trpdrv_offload', mt_log, 1, 1 use_trpdrv_offload)) 2 use_trpdrv_offload=.false. @@ -955,6 +966,20 @@ subroutine ccsd_iterdrv2(rtdb,basis,nsh,ncor,nocc,nvir,nact,nbf, & blen, cmo, ncor, nocc, nvir, ndel, Tol2E) c print *,'call trpdrv ',nvpass call ga_sync() +#if defined(USE_IMAX_OPENMP_TRPDRV) +! Dummy parallel construct + !$omp parallel num_threads(8) + !$omp end parallel + +! Dummy target construct + !$omp target + !$omp end target + +! Dummy interop object + !$omp interop init(prefer_type("sycl"),targetsync: dummy_obj) + !$omp interop destroy(dummy_obj) +#endif + tx(2)=tcgtime() c if (use_trpdrv_omp) then @@ -992,8 +1017,23 @@ subroutine ccsd_iterdrv2(rtdb,basis,nsh,ncor,nocc,nvir,nact,nbf, call ccsd_trpdrv_openacc(dbl_mb(k_t1),eorb, $ g_objo,g_objv,g_coul,g_exch,ncor,nocc,nvir,iprt, $ empt(1),empt(2),oseg_lo,oseg_hi,kchunk) +!!#else +!! call errquit('aoccsd: trpdrv_openacc disabled ',0,0) +!!#endif +! use_trpdrv_openmp_imax + else if (use_trpdrv_openmp_imax) then +#elif defined(USE_IMAX_OPENMP_TRPDRV) + if (iam.eq.0.and.oprint) then + write(luout,1818) nvpass,util_wallsec() + call util_flush(luout) + endif + 1818 format(' commencing triples evaluation - OpenMP', + I 'MAX GPU version',i8,' at ',f20.2,' secs') + call ccsd_trpdrv_offload_xe(dbl_mb(k_t1),eorb, + $ g_objo,g_objv,g_coul,g_exch,ncor,nocc,nvir,iprt, + $ empt(1),empt(2),oseg_lo,oseg_hi,kchunk) #else - call errquit('aoccsd: trpdrv_openacc disabled ',0,0) + call errquit('aoccsd: trpdrv_openmp_gpu disabled ',0,0) #endif c elseif (use_trpdrv_omp_mp) then diff --git a/src/ccsd/ccsd_trpdrv_omp_reduce_f.F b/src/ccsd/ccsd_trpdrv_omp_reduce_f.F new file mode 100644 index 0000000000..f33d51cd77 --- /dev/null +++ b/src/ccsd/ccsd_trpdrv_omp_reduce_f.F @@ -0,0 +1,121 @@ + subroutine ccsd_trpdrv_omp_fbody_reduce_new (f1n, f1t, f2n, f2t, + & f3n, f3t, f4n, f4t, + & eorb, + & ncor, nocc, nvir, + & emp4, emp5, + & i, k, + & eaijk, + & dintc1, dintx1, t1v1, + & dintc2, dintx2, t1v2) + + use omp_lib + use iso_fortran_env + implicit none + + double precision, intent(inout) :: emp4, emp5 + double precision, intent(inout) :: f1n(nvir,nvir), f1t(nvir,nvir) + double precision, intent(inout) :: f2n(nvir,nvir), f2t(nvir,nvir) + double precision, intent(inout) :: f3n(nvir,nvir), f3t(nvir,nvir) + double precision, intent(inout) :: f4n(nvir,nvir), f4t(nvir,nvir) + double precision, intent(in) :: eorb(*) + double precision, intent(in) :: eaijk + double precision, intent(in) :: dintc1(nvir), dintx1(nvir) + double precision, intent(in) :: dintc2(nvir), dintx2(nvir) + double precision, intent(in) :: t1v1(nvir), t1v2(nvir) + integer, intent(in) :: ncor, nocc, nvir + integer, intent(in) :: i, k + + double precision :: emp4i,emp5i,emp4k,emp5k, denom + double precision :: f1nbc,f1tbc,f1ncb,f1tcb + double precision :: f2nbc,f2tbc,f2ncb,f2tcb + double precision :: f3nbc,f3tbc,f3ncb,f3tcb + double precision :: f4nbc,f4tbc,f4ncb,f4tcb + double precision :: t1v1b,t1v2b,dintx1c,dintx2c,dintc1c,dintc2c + integer :: b,c + + emp4i = 0.0 + emp5i = 0.0 + emp4k = 0.0 + emp5k = 0.0 + + !$omp target teams distribute parallel do collapse(2) + & reduction(+:emp5i,emp4i,emp5k,emp4k) + & private(f1nbc,f1tbc,f1ncb,f1tcb,f2nbc,f2tbc,f2ncb,f2tcb) + & private(f3nbc,f3tbc,f3ncb,f3tcb,f4nbc,f4tbc,f4ncb,f4tcb) + & private(t1v1b,t1v2b,dintx1c,dintx2c,dintc1c,dintc2c) + & private(denom) firstprivate(eaijk,nvir,ncor,nocc) + do b=1,nvir + do c=1,nvir + denom=-1.0/( eorb(ncor+nocc+b)+eorb(ncor+nocc+c)+eaijk ) + + f1nbc = f1n(b,c); + f1tbc = f1t(b,c); + f1ncb = f1n(c,b); + f1tcb = f1t(c,b); + + f2nbc = f2n(b,c); + f2tbc = f2t(b,c); + f2ncb = f2n(c,b); + f2tcb = f2t(c,b); + + f3nbc = f3n(b,c); + f3tbc = f3t(b,c); + f3ncb = f3n(c,b); + f3tcb = f3t(c,b); + + f4nbc = f4n(b,c); + f4tbc = f4t(b,c); + f4ncb = f4n(c,b); + f4tcb = f4t(c,b); + + t1v1b = t1v1(b); + t1v2b = t1v2(b); + + dintx1c = dintx1(c); + dintx2c = dintx2(c); + dintc1c = dintc1(c); + dintc2c = dintc2(c); + + emp4i = emp4i + & + (denom * (f1tbc+f1ncb+f2tcb+f3nbc+f4ncb) + & * (f1tbc-f2tbc*2-f3tbc*2+f4tbc) + & - denom * (f1nbc+f1tcb+f2ncb+f3ncb) + & * (f1tbc*2-f2tbc-f3tbc+f4tbc*2) + & + denom * 3 * (f1nbc*(f1nbc+f3ncb+f4tcb*2) + & + f2nbc*f2tcb+f3nbc*f4tbc)) + + emp4k = emp4k + & + (denom * (f1nbc+f1tcb+f2ncb+f3tbc+f4tcb) + & * (f1nbc-f2nbc*2-f3nbc*2+f4nbc) + & - denom * (f1tbc+f1ncb+f2tcb+f3tcb) + & * (f1nbc*2-f2nbc-f3nbc+f4nbc*2) + & + denom * 3 * (f1tbc*(f1tbc+f3tcb+f4ncb*2) + & + f2tbc*f2ncb+f3tbc*f4nbc)) + + emp5i = emp5i + & + (denom * t1v1b * dintx1c + & * (f1tbc+f2nbc+f4ncb + & - (f3tbc+f4nbc+f2ncb+f1nbc+f2tbc+f3ncb)*2 + & + (f3nbc+f4tbc+f1ncb)*4) + & + denom * t1v1b * dintc1c + & * (f1nbc+f4nbc+f1tcb -(f2nbc+f3nbc+f2tcb)*2)) + + emp5k = emp5k + & + (denom * t1v2b * dintx2c + & * (f1nbc+f2tbc+f4tcb + & - (f3nbc+f4tbc+f2tcb +f1tbc+f2nbc+f3tcb)*2 + & + (f3tbc+f4nbc+f1tcb)*4) + & + denom * t1v2b * dintc2c + & * (f1tbc+f4tbc+f1ncb -(f2tbc+f3tbc+f2ncb)*2)) + enddo + enddo + !$omp end target teams distribute parallel do + + emp4 = emp4 + emp4i + emp5 = emp5 + emp5i + if (i.ne.k) then + emp4 = emp4 + emp4k + emp5 = emp5 + emp5k + end if ! (i.ne.k) + + end diff --git a/src/ccsd/ccsd_trpdrv_openmp_imax.F b/src/ccsd/ccsd_trpdrv_openmp_imax.F new file mode 100644 index 0000000000..fdd34d3cd4 --- /dev/null +++ b/src/ccsd/ccsd_trpdrv_openmp_imax.F @@ -0,0 +1,1044 @@ +!#define USE_CPU_REDUCTION + + module reduction_data + real(8) :: emp4i = 0.0 + real(8) :: emp4k = 0.0 + real(8) :: emp5i = 0.0 + real(8) :: emp5k = 0.0 + !$omp declare target to(emp4i,emp4k,emp5i,emp5ki) + end module reduction_data + + + subroutine ccsd_trpdrv_offload_xe(t1,xeorb, + & g_objo,g_objv,g_coul,g_exch, + & ncor,nocc,nvir,iprt,emp4,emp5, + & oseg_lo,oseg_hi, kchunk) + use iso_fortran_env +! use cudafor +! use cublas + use omp_lib + use onemkl_blas_omp_offload_ilp64 + use reduction_data + implicit none +! +#include "errquit.fh" +#include "global.fh" +#include "ccsd_len.fh" +#include "ccsdps.fh" +#include "util.fh" +#include "msgids.fh" +#include "yflop.fh" +! + double precision, intent(inout) :: emp4,emp5 + double precision, intent(in) :: t1(*) + double precision, intent(in) :: xeorb(*) + integer, intent(in) :: ncor,nocc,nvir + integer, intent(in) :: iprt + integer, intent(in) :: g_objo,g_objv,g_coul,g_exch + integer, intent(in) :: oseg_lo,oseg_hi, kchunk + +! double precision, allocatable, device :: eorb(:) +! double precision, allocatable, device :: f1n(:,:) +! double precision, allocatable, device :: f2n(:,:) +! double precision, allocatable, device :: f3n(:,:) +! double precision, allocatable, device :: f4n(:,:) +! double precision, allocatable, device :: f1t(:,:) +! double precision, allocatable, device :: f2t(:,:) +! double precision, allocatable, device :: f3t(:,:) +! double precision, allocatable, device :: f4t(:,:) + double precision, allocatable :: eorb(:) + double precision, allocatable :: f1n(:,:) + double precision, allocatable :: f2n(:,:) + double precision, allocatable :: f3n(:,:) + double precision, allocatable :: f4n(:,:) + double precision, allocatable :: f1t(:,:) + double precision, allocatable :: f2t(:,:) + double precision, allocatable :: f3t(:,:) + double precision, allocatable :: f4t(:,:) + +! double precision, allocatable, pinned :: Tij(:), Tkj(:) +! double precision, allocatable, pinned :: Tia(:), Tka(:) +! double precision, allocatable, pinned :: Xia(:), Xka(:) +! double precision, allocatable, pinned :: Jia(:), Jka(:) +! double precision, allocatable, pinned :: Jij(:), Jkj(:) +! double precision, allocatable, pinned :: Kia(:), Kka(:) +! double precision, allocatable, pinned :: Kij(:), Kkj(:) +! double precision, allocatable, pinned :: Dja(:), Djka(:), Djia(:) + double precision, allocatable :: Tij(:), Tkj(:) + double precision, allocatable :: Tia(:), Tka(:) + double precision, allocatable :: Xia(:), Xka(:) + double precision, allocatable :: Jia(:), Jka(:) + double precision, allocatable :: Jij(:), Jkj(:) + double precision, allocatable :: Kia(:), Kka(:) + double precision, allocatable :: Kij(:), Kkj(:) + double precision, allocatable :: Dja(:), Djka(:), Djia(:) + +! double precision, allocatable, device :: xTij(:), xTkj(:) +! double precision, allocatable, device :: xTia(:), xTka(:) +! double precision, allocatable, device :: xXia(:), xXka(:) +! double precision, allocatable, device :: xJia(:), xJka(:) +! double precision, allocatable, device :: xJij(:), xJkj(:) +! double precision, allocatable, device :: xKia(:), xKka(:) +! double precision, allocatable, device :: xKij(:), xKkj(:) +! used to make inline threaded tengy correct - for now +! it is correct that dint[cx]1 are paired with t1v2 and vice versa +! in the inlined tengy loops. see ccsd_tengy in ccsd_trpdrv.F for +! verification of the i-k and k-i pairing of these. +! double precision, allocatable, device :: dintc1(:),dintc2(:) +! double precision, allocatable, device :: dintx1(:),dintx2(:) +! double precision, allocatable, device :: t1v1(:),t1v2(:) + double precision, allocatable :: dintc1(:),dintc2(:) + double precision, allocatable :: dintx1(:),dintx2(:) + double precision, allocatable :: t1v1(:),t1v2(:) + integer :: alloc_error, err +! + !double precision :: emp4i,emp5i,emp4k,emp5k + !double precision, device :: eaijk + double precision :: eaijk + double precision :: denom + integer :: inode,next,nodes,me + integer :: a,b,c,i,j,k,akold,av,nbf + integer :: klo, khi + integer nxtask + external nxtask + integer :: dgemm_flops, tengy_flops + double precision agg_flops +! +! Dependencies (global array, local array, handle): +! +! These are waited on first +! +! g_objv, Dja, nbh_objv1 +! g_objv, Djka(1+(k-klo)*nvir), nbh_objv4(k) +! g_objv, Djia, nbh_objv5 +! +! These are waited on later +! +! g_objv, Tka, nbh_objv2 +! g_objv, Xka, nbh_objv3 +! g_objv, Tia, nbh_objv6 +! g_objv, Xia, nbh_objv7 +! g_objo, Tkj, nbh_objo1 +! g_objo, Jkj, nbh_objo2 +! g_objo, Kkj, nbh_objo3 +! g_objo, Tij, nbh_objo4 +! g_objo, Jij, nbh_objo5 +! g_objo, Kij, nbh_objo6 +! g_exch, Kka, nbh_exch1 +! g_exch, Kia, nbh_exch2 +! g_coul, Jka, nbh_coul1 +! g_coul, Jia, nbh_coul2 +! +! non-blocking handles +! + integer nbh_objv1,nbh_objv2,nbh_objv3 + integer nbh_objv5,nbh_objv6,nbh_objv7 + integer nbh_objv4(nocc) +! + integer nbh_objo1,nbh_objo2,nbh_objo3 + integer nbh_objo4,nbh_objo5,nbh_objo6 +! + integer nbh_exch1,nbh_exch2,nbh_coul1,nbh_coul2 + integer n_progr,pct_progr + parameter(n_progr=20) + logical i_progr(n_progr+1) + logical got_ak + integer thread_num + +! OpenMP interop objects + integer(kind = omp_interop_kind) :: obj0 = omp_interop_none + integer(kind = omp_interop_kind) :: obj1 = omp_interop_none + integer(kind = omp_interop_kind) :: obj2 = omp_interop_none + integer(kind = omp_interop_kind) :: obj3 = omp_interop_none + integer(kind = omp_interop_kind) :: obj4 = omp_interop_none + integer(kind = omp_interop_kind) :: obj5 = omp_interop_none + integer(kind = omp_interop_kind) :: obj6 = omp_interop_none + integer(kind = omp_interop_kind) :: obj7 = omp_interop_none + integer(kind = omp_interop_kind) :: obj_lev0 = omp_interop_none + +! timers + double precision :: tt0, tt1, tc0, tc1 + double precision :: t_dgemm0, t_dgemm1, t_dgemm_total + double precision :: t_red0, t_red1, t_red_total + +#if 0 + !$omp interop init(targetsync:obj0) + !$omp interop init(targetsync:obj1) + !$omp interop init(targetsync:obj2) + !$omp interop init(targetsync:obj3) + !$omp interop init(targetsync:obj4) + !$omp interop init(targetsync:obj5) + !$omp interop init(targetsync:obj6) + !$omp interop init(targetsync:obj7) +#endif + +#if 0 + !$omp interop init(prefer_type("level_zero"),targetsync: obj0) + !$omp interop init(prefer_type("level_zero"),targetsync: obj1) + !$omp interop init(prefer_type("level_zero"),targetsync: obj2) + !$omp interop init(prefer_type("level_zero"),targetsync: obj3) + !$omp interop init(prefer_type("level_zero"),targetsync: obj4) + !$omp interop init(prefer_type("level_zero"),targetsync: obj5) + !$omp interop init(prefer_type("level_zero"),targetsync: obj6) + !$omp interop init(prefer_type("level_zero"),targetsync: obj7) +#endif + +#if 1 + !$omp interop init(prefer_type("sycl"),targetsync: obj0) + !$omp interop init(prefer_type("sycl"),targetsync: obj1) + !$omp interop init(prefer_type("sycl"),targetsync: obj2) + !$omp interop init(prefer_type("sycl"),targetsync: obj3) + !$omp interop init(prefer_type("sycl"),targetsync: obj4) + !$omp interop init(prefer_type("sycl"),targetsync: obj5) + !$omp interop init(prefer_type("sycl"),targetsync: obj6) + !$omp interop init(prefer_type("sycl"),targetsync: obj7) + + !$omp interop init(prefer_type("level_zero"),targetsync: obj_lev0) +#endif + + t_dgemm_total = 0.0 + t_red_total = 0.0 + +! + nodes = ga_nnodes() + me = ga_nodeid() +! + if (me.eq.0) then + write(6,99) + endif +! 99 format(2x,'Using Fortran OpenACC+CUBLAS in CCSD(T)') + 99 format(2x,'Using Fortran OpenMP+MKL in CCSD(T)') + agg_flops = 0 +! + tt0 = util_wallsec() +! ! setup CUDA streams +! do shi=1,8 +! err = cudaStreamCreate(stream(shi)) +! if (err.ne.0) call errquit('cudaStreamCreate',err,UNKNOWN_ERR) +! err = cublasCreate(handle(shi)) +! if (err.ne.0) call errquit('cublasCreate',err,UNKNOWN_ERR) +! err = cublasSetStream(handle(shi), stream(shi)) +! if (err.ne.0) call errquit('cublasSetStream',err,UNKNOWN_ERR) +! end do +! +! device-only temp arrays +! produced by DGEMM, consumed by TENGY +! +! allocate( f1n(1:nvir,1:nvir), f1t(1:nvir,1:nvir), +! & f2n(1:nvir,1:nvir), f2t(1:nvir,1:nvir), +! & f3n(1:nvir,1:nvir), f3t(1:nvir,1:nvir), +! & f4n(1:nvir,1:nvir), f4t(1:nvir,1:nvir), +! & stat=alloc_error) +! if (alloc_error.ne.0) call errquit('f[1234][tn]',8,MA_ERR) + +#if 1 + !$omp allocate allocator(omp_target_device_mem_alloc) + allocate( f1n(1:nvir,1:nvir) ) + !$omp allocate allocator(omp_target_device_mem_alloc) + allocate( f1t(1:nvir,1:nvir) ) + !$omp allocate allocator(omp_target_device_mem_alloc) + allocate( f2n(1:nvir,1:nvir) ) + !$omp allocate allocator(omp_target_device_mem_alloc) + allocate( f2t(1:nvir,1:nvir) ) + !$omp allocate allocator(omp_target_device_mem_alloc) + allocate( f3n(1:nvir,1:nvir) ) + !$omp allocate allocator(omp_target_device_mem_alloc) + allocate( f3t(1:nvir,1:nvir) ) + !$omp allocate allocator(omp_target_device_mem_alloc) + allocate( f4n(1:nvir,1:nvir) ) + !$omp allocate allocator(omp_target_device_mem_alloc) + allocate( f4t(1:nvir,1:nvir) ) +#else + !$omp allocate allocator(omp_target_host_mem_alloc) + allocate( f1n(1:nvir,1:nvir) ) + !$omp allocate allocator(omp_target_host_mem_alloc) + allocate( f1t(1:nvir,1:nvir) ) + !$omp allocate allocator(omp_target_host_mem_alloc) + allocate( f2n(1:nvir,1:nvir) ) + !$omp allocate allocator(omp_target_host_mem_alloc) + allocate( f2t(1:nvir,1:nvir) ) + !$omp allocate allocator(omp_target_host_mem_alloc) + allocate( f3n(1:nvir,1:nvir) ) + !$omp allocate allocator(omp_target_host_mem_alloc) + allocate( f3t(1:nvir,1:nvir) ) + !$omp allocate allocator(omp_target_host_mem_alloc) + allocate( f4n(1:nvir,1:nvir) ) + !$omp allocate allocator(omp_target_host_mem_alloc) + allocate( f4t(1:nvir,1:nvir) ) +#endif + +! +! device-only copy of input eorb +! + nbf = ncor + nocc + nvir + !$omp allocate allocator(omp_target_host_mem_alloc) + allocate( eorb(1:nbf) ) + ! allocate( eorb(1:nbf), stat=alloc_error) + ! if (alloc_error.ne.0) call errquit('eorb',10,MA_ERR) + eorb(1:nbf) = xeorb(1:nbf) +! +! for TENGY +! +! allocate( dintc1(1:nvir), dintc2(1:nvir), +! & dintx1(1:nvir), dintx2(1:nvir), +! & t1v1(1:nvir), t1v2(1:nvir), stat=alloc_error) +! if (alloc_error.ne.0) call errquit('CXT1 temps',6,MA_ERR) + + !$omp allocate allocator(omp_target_host_mem_alloc) + allocate( dintc1(1:nvir) ) + !$omp allocate allocator(omp_target_host_mem_alloc) + allocate( dintc2(1:nvir) ) + !$omp allocate allocator(omp_target_host_mem_alloc) + allocate( dintx1(1:nvir) ) + !$omp allocate allocator(omp_target_host_mem_alloc) + allocate( dintx2(1:nvir) ) + !$omp allocate allocator(omp_target_host_mem_alloc) + allocate( t1v1(1:nvir) ) + !$omp allocate allocator(omp_target_host_mem_alloc) + allocate( t1v2(1:nvir) ) + +! +! H/D arrays, produced by GA Get, consumed by DGEMM +! +! allocate( Tij(1:lnvv), Tkj(1:kchunk*lnvv), +! & Tia(1:lnov*nocc), Tka(1:kchunk*lnov), +! & Xia(1:lnov*nocc), Xka(1:kchunk*lnov), +! & Jia(1:lnvv), Jka(1:kchunk*lnvv), +! & Kia(1:lnvv), Kka(1:kchunk*lnvv), +! & Jij(1:lnov*nocc), Jkj(1:kchunk*lnov), +! & Kij(1:lnov*nocc), Kkj(1:kchunk*lnov), +! & Dja(1:lnov), Djka(1:nvir*kchunk), +! & Djia(1:nvir*nocc), stat=alloc_error) +! if (alloc_error.ne.0) call errquit('TKJKD alloc',1,MA_ERR) + + !$omp allocate allocator(omp_target_host_mem_alloc) + allocate( Tij(1:lnvv) ) + !$omp allocate allocator(omp_target_host_mem_alloc) + allocate( Tkj(1:kchunk*lnvv) ) + !$omp allocate allocator(omp_target_host_mem_alloc) + allocate( Tia(1:lnov*nocc) ) + !$omp allocate allocator(omp_target_host_mem_alloc) + allocate( Tka(1:kchunk*lnov) ) + !$omp allocate allocator(omp_target_host_mem_alloc) + allocate( Xia(1:lnov*nocc) ) + !$omp allocate allocator(omp_target_host_mem_alloc) + allocate( Xka(1:kchunk*lnov) ) + !$omp allocate allocator(omp_target_host_mem_alloc) + allocate( Jia(1:lnvv) ) + !$omp allocate allocator(omp_target_host_mem_alloc) + allocate( Jka(1:kchunk*lnvv) ) + !$omp allocate allocator(omp_target_host_mem_alloc) + allocate( Kia(1:lnvv) ) + !$omp allocate allocator(omp_target_host_mem_alloc) + allocate( Kka(1:kchunk*lnvv) ) + !$omp allocate allocator(omp_target_host_mem_alloc) + allocate( Jij(1:lnov*nocc) ) + !$omp allocate allocator(omp_target_host_mem_alloc) + allocate( Jkj(1:kchunk*lnov) ) + !$omp allocate allocator(omp_target_host_mem_alloc) + allocate( Kij(1:lnov*nocc) ) + !$omp allocate allocator(omp_target_host_mem_alloc) + allocate( Kkj(1:kchunk*lnov) ) + !$omp allocate allocator(omp_target_host_mem_alloc) + allocate( Dja(1:lnov) ) + !$omp allocate allocator(omp_target_host_mem_alloc) + allocate( Djka(1:nvir*kchunk) ) + !$omp allocate allocator(omp_target_host_mem_alloc) + allocate(Djia(1:nvir*nocc) ) + + tt1 = util_wallsec() + if (me.eq.0) then + write(6,500) tt1-tt0 + 500 format('CU+MEM init took ',e15.5,' seconds') + endif +! +! call ga_sync() ! ga_sync called just before trpdrv in aoccsd2 +! + if (occsdps) then + call pstat_on(ps_trpdrv) + else + call qenter('trpdrv',0) + endif + do klo=1,n_progr+1 + i_progr(klo)=.true. + enddo + inode=-1 + next=nxtask(nodes, 1) + +!$omp target data + & map(to:eorb) + & map(to:dintc1,dintx1,t1v1,dintc2,dintx2,t1v2) + & map(to:Jia,Tkj,Tia,Kkj,Kia,Tka,Kij) + & map(to:Xka,Jij,Xia,Jkj,Jka,Tij,Kka) + + do klo = 1, nocc, kchunk + akold=0 + khi = min(nocc, klo+kchunk-1) + do a=oseg_lo,oseg_hi + av=a-ncor-nocc + do j=1,nocc + inode=inode+1 + if (inode.eq.next)then + + call ga_nbget(g_objv,1+(j-1)*lnov,j*lnov,av,av,Dja, + & lnov,nbh_objv1) + do k = klo, khi + call ga_nbget(g_objv,1+(j-1)*nvir+(k-1)*lnov, + & j*nvir+(k-1)*lnov,av,av, + & Djka(1+(k-klo)*nvir),nvir,nbh_objv4(k)) + enddo + call ga_nbget(g_objo,(klo-1)*lnvv+1,khi*lnvv,j,j,Tkj, + & (khi-klo+1)*lnvv,nbh_objo1) + call ga_nbget(g_objo,lnovv+(klo-1)*lnov+1, + & lnovv+khi*lnov,j,j,Jkj, + & (khi-klo+1)*lnov,nbh_objo2) + call ga_nbget(g_objo,lnovv+lnoov+(klo-1)*lnov+1, + & lnovv+lnoov+khi*lnov,j,j,Kkj, + & (khi-klo+1)*lnov,nbh_objo3) + if (akold .ne. a) then + akold = a + call ga_nbget(g_coul,1,lnvv,(a-oseg_lo)*nocc+klo, + & (a-oseg_lo)*nocc+khi,Jka,lnvv,nbh_coul1) + call ga_nbget(g_exch,1,lnvv,(a-oseg_lo)*nocc+klo, + & (a-oseg_lo)*nocc+khi,Kka,lnvv,nbh_exch1) + call ga_nbget(g_objv,1+lnoov+(klo-1)*lnov, + & lnoov+khi*lnov,av,av,Tka,(khi-klo+1)*lnov, + & nbh_objv2) + call ga_nbget(g_objv,1+2*lnoov+(klo-1)*lnov, + & 2*lnoov+khi*lnov,av,av,Xka,(khi-klo+1)*lnov, + & nbh_objv3) + got_ak=.true. + else + got_ak=.false. + endif + + emp4i = 0.0d0 + emp5i = 0.0d0 + emp4k = 0.0d0 + emp5k = 0.0d0 + + do i=1,nocc + + call ga_nbget(g_objv,1+(j-1)*nvir+(i-1)*lnov, + & j*nvir+(i-1)*lnov,av,av,Djia,nvir,nbh_objv5) + call ga_nbget(g_objo,(i-1)*lnvv+1,i*lnvv,j,j,Tij, + & lnvv,nbh_objo4) + call ga_nbget(g_objo,lnovv+(i-1)*lnov+1, + & lnovv+i*lnov,j,j,Jij,lnov,nbh_objo5) + call ga_nbget(g_objo,lnovv+lnoov+(i-1)*lnov+1, + & lnovv+lnoov+i*lnov,j,j,Kij,lnov,nbh_objo6) + call ga_nbget(g_coul,1,lnvv,(a-oseg_lo)*nocc+i, + & (a-oseg_lo)*nocc+i,Jia,lnvv,nbh_coul2) + call ga_nbget(g_exch,1,lnvv,(a-oseg_lo)*nocc+i, + & (a-oseg_lo)*nocc+i,Kia,lnvv,nbh_exch2) + call ga_nbget(g_objv,1+lnoov+(i-1)*lnov, + & lnoov+i*lnov,av,av,Tia,lnov,nbh_objv6) + call ga_nbget(g_objv,1+2*lnoov+(i-1)*lnov, + & 2*lnoov+i*lnov,av,av,Xia,lnov,nbh_objv7) + + if (occsdps) then + call pstat_on(ps_accwait) + else + call qenter('accwait',0) + endif +!$acc wait(9) +!$acc wait(10) + if (occsdps) then + call pstat_off(ps_accwait) + else + call qexit('accwait',0) + endif + + t1v2(:) = t1((i-1)*nvir+1:i*nvir) + if(i.eq.1) then + call ga_nbwait(nbh_objv1) ! Dja + endif + dintc1(:) = Dja(1+(i-1)*nvir:i*nvir) + call ga_nbwait(nbh_objv5) ! Djia + dintx1(:) = Djia(1:nvir) +!$omp target update to (t1v2, dintc1, dintx1) + + do k=klo,min(khi,i) + + if (occsdps) then + call pstat_on(ps_accwait) + else + call qenter('accwait',0) + endif +!$acc wait(9) +!$acc wait(10) + if (occsdps) then + call pstat_off(ps_accwait) + else + call qexit('accwait',0) + endif + + t1v1(:) = t1((k-1)*nvir+1:k*nvir) + dintc2(:) = Dja(1+(k-1)*nvir:k*nvir) + if(i.eq.1) then + call ga_nbwait(nbh_objv4(k)) ! Djka + endif + dintx2(:) = Djka(1+(k-klo)*nvir:(k-klo+1)*nvir) +!$omp target update to (t1v1, dintc2, dintx2) +! +! These are the input dependencies for the DGEMM calls below. +! We wait on all of them here because GA is not even remotely thread-safe. +! All of these are independent of k, so we wait on them only +! at the first trip of the loop. +! + if (k.eq.klo) then + if (occsdps) then + call pstat_on(ps_gpumove) + else + call qenter('gpumove',0) + endif + call ga_nbwait(nbh_coul2) + !!xJia = Jia + !err = cudaMemcpyAsync(xJia,Jia,size(Jia),stream(1)) + !if (err.ne.0) then + ! call errquit('cudaMemcpyAsync',err,UNKNOWN_ERR) + !endif +!!$omp target update to (Jia) + call ga_nbwait(nbh_objv6) + !!xTia = Tia + !err = cudaMemcpyAsync(xTia,Tia,size(Tia),stream(1)) + !if (err.ne.0) then + ! call errquit('cudaMemcpyAsync',err,UNKNOWN_ERR) + !endif +!!$omp target update to (Tia) + call ga_nbwait(nbh_exch2) + !!xKia = Kia + !err = cudaMemcpyAsync(xKia,Kia,size(Kia),stream(2)) + !if (err.ne.0) then + ! call errquit('cudaMemcpyAsync',err,UNKNOWN_ERR) + !endif +!!$omp target update to (Kia) + call ga_nbwait(nbh_objv7) + !!xXia = Xia + !err = cudaMemcpyAsync(xXia,Xia,size(Xia),stream(2)) + !if (err.ne.0) then + ! call errquit('cudaMemcpyAsync',err,UNKNOWN_ERR) + !endif +!!$omp target update to (Xia) +!$omp target update to (Jia,Tia,Kia,Xia) + if (i.eq.1) then + call ga_nbwait(nbh_objo1) + !!xTkj = Tkj + !err = cudaMemcpyAsync(xTkj,Tkj,size(Tkj),stream(1)) + !if (err.ne.0) then + ! call errquit('cudaMemcpyAsync',err,UNKNOWN_ERR) + !endif +!!$omp target update to (Tkj) + call ga_nbwait(nbh_objo3) + !!xKkj = Kkj + !err = cudaMemcpyAsync(xKkj,Kkj,size(Kkj),stream(1)) + !if (err.ne.0) then + ! call errquit('cudaMemcpyAsync',err,UNKNOWN_ERR) + !endif +!!$omp target update to (Kkj) + call ga_nbwait(nbh_objo2) + !!xJkj = Jkj + !err = cudaMemcpyAsync(xJkj,Jkj,size(Jkj),stream(3)) + !if (err.ne.0) then + ! call errquit('cudaMemcpyAsync',err,UNKNOWN_ERR) + !endif +!!$omp target update to (Jkj) +!$omp target update to (Tkj,Kkj,Jkj) + + if (got_ak) then + call ga_nbwait(nbh_coul1) + !!xJka = Jka + !err = cudaMemcpyAsync(xJka,Jka,size(Jka),stream(5)) + !if (err.ne.0) then + ! call errquit('cudaMemcpyAsync',err,UNKNOWN_ERR) + !endif +!!$omp target update to (Jka) + call ga_nbwait(nbh_objv2) + !!xTka = Tka + !err = cudaMemcpyAsync(xTka,Tka,size(Tka),stream(5)) + !if (err.ne.0) then + ! call errquit('cudaMemcpyAsync',err,UNKNOWN_ERR) + !endif +!!$omp target update to (Tka) + call ga_nbwait(nbh_exch1) + !!xKka = Kka + !err = cudaMemcpyAsync(xKka,Kka,size(Kka),stream(6)) + !if (err.ne.0) then + ! call errquit('cudaMemcpyAsync',err,UNKNOWN_ERR) + !endif +!!$omp target update to (Kka) + call ga_nbwait(nbh_objv3) + !!xXka = Xka + !err = cudaMemcpyAsync(xXka,Xka,size(Xka),stream(6)) + !if (err.ne.0) then + ! call errquit('cudaMemcpyAsync',err,UNKNOWN_ERR) + !endif +!!$omp target update to (Xka) +!$omp target update to (Jka,Tka,Kka,Xka) + endif ! got_ak + endif ! i==1 + call ga_nbwait(nbh_objo4) + !!xTij = Tij + !err = cudaMemcpyAsync(xTij,Tij,size(Tij),stream(5)) + !if (err.ne.0) then + ! call errquit('cudaMemcpyAsync',err,UNKNOWN_ERR) + !endif +!!$omp target update to (Tij) + call ga_nbwait(nbh_objo6) + !!xKij = Kij + !err = cudaMemcpyAsync(xKij,Kij,size(Kij),stream(5)) ! and 6 + !if (err.ne.0) then + ! call errquit('cudaMemcpyAsync',err,UNKNOWN_ERR) + !endif +!!$omp target update to (Kij) + call ga_nbwait(nbh_objo5) + !!xJij = Jij + !err = cudaMemcpyAsync(xJij,Jij,size(Jij),stream(7)) ! and 8 + !if (err.ne.0) then + ! call errquit('cudaMemcpyAsync',err,UNKNOWN_ERR) + !endif +!!$omp target update to (Jij) +!$omp target update to (Tij,Kij,Jij) + + ! arrays and thus copies contribute to more than one CUBLAS call + ! but the copies on streams 1:4 and 5:8 are separable. + !do shi=1,4 + ! err = cudaStreamSynchronize(stream(shi)) + ! if (err.ne.0) then + ! call errquit('cudaStreamSync',err,UNKNOWN_ERR) + ! endif + !enddo + if (occsdps) then + call pstat_off(ps_gpumove) + else + call qexit('gpumove',0) + endif + endif ! k==klo + + if (occsdps) then + call pstat_on(ps_doxxx) + else + call qenter('doxxx',0) + endif + + if (occsdps) then + call pstat_on(ps_accwait) + else + call qenter('accwait',0) + endif +!$acc wait(9) +!$acc wait(10) + if (occsdps) then + call pstat_off(ps_accwait) + else + call qexit('accwait',0) + endif + + + tc0 = util_wallsec() + t_dgemm0 = util_wallsec() + + !$omp parallel sections num_threads(8) + + !$omp section + !$omp dispatch interop(obj0) nowait + call dgemm('n','t',nvir,nvir,nvir,1.0d0, + 1 Jia,nvir,Tkj(1+(k-klo)*lnvv),nvir,0.0d0, + 2 f1n,nvir) + + !$omp interop use(obj0) nowait + + !$omp dispatch interop(obj0) nowait + call dgemm('n','n',nvir,nvir,nocc,-1.0d0, + 1 Tia,nvir,Kkj(1+(k-klo)*lnov),nocc,1.0d0, + 2 f1n,nvir) + + !$omp section + !$omp dispatch interop(obj1) nowait + call dgemm('n','t',nvir,nvir,nvir,1.0d0, + 1 Kia,nvir,Tkj(1+(k-klo)*lnvv),nvir,0.0d0, + 2 f2n,nvir) + + !$omp interop use(obj1) nowait + + !$omp dispatch interop(obj1) nowait + call dgemm('n','n',nvir,nvir,nocc,-1.0d0, + 1 Xia,nvir,Kkj(1+(k-klo)*lnov),nocc,1.0d0, + 2 f2n,nvir) + + !$omp section + !$omp dispatch interop(obj2) nowait + call dgemm('n','n',nvir,nvir,nvir,1.0d0, + 1 Jia,nvir,Tkj(1+(k-klo)*lnvv),nvir,0.0d0, + 2 f3n,nvir) + + !$omp interop use(obj2) nowait + + !$omp dispatch interop(obj2) nowait + call dgemm('n','n',nvir,nvir,nocc,-1.0d0, + 1 Tia,nvir,Jkj(1+(k-klo)*lnov),nocc,1.0d0, + 2 f3n,nvir) + + !$omp section + !$omp dispatch interop(obj3) nowait + call dgemm('n','n',nvir,nvir,nvir,1.0d0, + 1 Kia,nvir,Tkj(1+(k-klo)*lnvv),nvir,0.0d0, + 2 f4n,nvir) + + !$omp interop use(obj3) nowait + + !$omp dispatch interop(obj3) nowait + call dgemm('n','n',nvir,nvir,nocc,-1.0d0, + 1 Xia,nvir,Jkj(1+(k-klo)*lnov),nocc,1.0d0, + 2 f4n,nvir) + + !$omp section + !$omp dispatch interop(obj4) nowait + call dgemm('n','t',nvir,nvir,nvir,1.0d0, + 1 Jka(1+(k-klo)*lnvv),nvir,Tij,nvir,0.0d0, + 2 f1t,nvir) + + !$omp interop use(obj4) nowait + + !$omp dispatch interop(obj4) nowait + call dgemm('n','n',nvir,nvir,nocc,-1.0d0, + 1 Tka(1+(k-klo)*lnov),nvir,Kij,nocc,1.0d0, + 2 f1t,nvir) + + !$omp section + !$omp dispatch interop(obj5) nowait + call dgemm('n','t',nvir,nvir,nvir,1.0d0, + 1 Kka(1+(k-klo)*lnvv),nvir,Tij,nvir,0.0d0, + 2 f2t,nvir) + + !$omp interop use(obj5) nowait + + !$omp dispatch interop(obj5) nowait + call dgemm('n','n',nvir,nvir,nocc,-1.0d0, + 1 Xka(1+(k-klo)*lnov),nvir,Kij,nocc,1.0d0, + 2 f2t,nvir) + + !$omp section + !$omp dispatch interop(obj6) nowait + call dgemm('n','n',nvir,nvir,nvir,1.0d0, + 1 Jka(1+(k-klo)*lnvv),nvir,Tij,nvir,0.0d0, + 2 f3t,nvir) + + !$omp interop use(obj6) nowait + + !$omp dispatch interop(obj6) nowait + call dgemm('n','n',nvir,nvir,nocc,-1.0d0, + 1 Tka(1+(k-klo)*lnov),nvir,Jij,nocc,1.0d0, + 2 f3t,nvir) + + !$omp section + !$omp dispatch interop(obj7) nowait + call dgemm('n','n',nvir,nvir,nvir,1.0d0, + 1 Kka(1+(k-klo)*lnvv),nvir,Tij,nvir,0.0d0, + 2 f4t,nvir) + + !$omp interop use(obj7) nowait + + !$omp dispatch interop(obj7) nowait + call dgemm('n','n',nvir,nvir,nocc,-1.0d0, + 1 Xka(1+(k-klo)*lnov),nvir,Jij,nocc,1.0d0, + 2 f4t,nvir) + !$omp end parallel sections + +#if 0 + !$omp interop use(obj0) +#endif + +#if 1 + ! "omp interop use(obj_lev0)" generates a device barrier + ! on level0 queue. This behavior is specific to Intel and + ! works because of the mapping of level0 queues to + ! hardware + ! queues -- only one hardware queue is used. + ! + ! For both Sycl and Level0 queues: + ! interop use(obj) nowait == device barrier + ! interop use(obj) == device barrier + EventHostSynchronize on host + + !$omp interop use(obj_lev0) nowait +#endif + + t_dgemm1 = util_wallsec() + t_dgemm_total = t_dgemm_total + (t_dgemm1 - t_dgemm0) + + ! 8 pairs of DGEMM w/ VVV and VVO cost, 2 for FMA + dgemm_flops = 8*nvir*nvir*(nocc+nvir)*2 + agg_flops = agg_flops + dgemm_flops + + if (occsdps) then + call pstat_off(ps_doxxx) + call pstat_on(ps_tengy) + else + call qexit('doxxx',0) + call qenter('tengy',0) + endif + + eaijk=eorb(a) - ( eorb(ncor+i) + & +eorb(ncor+j) + & +eorb(ncor+k) ) + +#ifdef USE_YFLOP + flops_ycount = flops_ycount + nvir*nvir*( + & 3 + 2*( + & 12 + + & 11 + + & 11 ) + + & 2*27 ) +#endif + + t_red0 = util_wallsec() + +#ifdef USE_CPU_REDUCTION + +!$omp target data map(to:f1n,f1t,f2n,f2t,f3n,f3t,f4n,f4t) +!$omp target data map(tofrom: emp4i,emp5i) map(to: eaijk) +!$omp target teams distribute parallel do + do b=1,nvir + do c=1,nvir + denom=-1.0d0/( eorb(ncor+nocc+b) + & +eorb(ncor+nocc+c)+eaijk ) + emp4i=emp4i+denom* + & (f1t(b,c)+f1n(c,b)+f2t(c,b)+f3n(b,c)+f4n(c,b))* + & (f1t(b,c)-2*f2t(b,c)-2*f3t(b,c)+f4t(b,c)) + & -denom* + & (f1n(b,c)+f1t(c,b)+f2n(c,b)+f3n(c,b))* + & (2*f1t(b,c)-f2t(b,c)-f3t(b,c)+2*f4t(b,c)) + & +3*denom*( + & f1n(b,c)*(f1n(b,c)+f3n(c,b)+2*f4t(c,b))+ + & f2n(b,c)*f2t(c,b)+f3n(b,c)*f4t(b,c)) + emp5i=emp5i+denom*t1v1(b)*dintx1(c)* + & ( f1t(b,c)+f2n(b,c)+f4n(c,b) + & -2*(f3t(b,c)+f4n(b,c)+f2n(c,b)+ + & f1n(b,c)+f2t(b,c)+f3n(c,b)) + & +4*(f3n(b,c)+f4t(b,c)+f1n(c,b))) + & +denom*t1v1(b)*dintc1(c)* + & ( f1n(b,c)+f4n(b,c)+f1t(c,b) + & -2*(f2n(b,c)+f3n(b,c)+f2t(c,b))) + end do + end do +!$omp end target teams distribute parallel do +!$omp end target data + + if (i.ne.k) then +!$omp target data map(tofrom: emp4k,emp5k) map(to: eaijk) +!$omp target teams distribute parallel do + do b=1,nvir + do c=1,nvir + denom=-1.0d0/( eorb(ncor+nocc+b) + & +eorb(ncor+nocc+c)+eaijk ) + emp4k=emp4k+denom* + & (f1n(b,c)+f1t(c,b)+f2n(c,b)+f3t(b,c)+f4t(c,b))* + & (f1n(b,c)-2*f2n(b,c)-2*f3n(b,c)+f4n(b,c)) + & -denom* + & (f1t(b,c)+f1n(c,b)+f2t(c,b)+f3t(c,b))* + & (2*f1n(b,c)-f2n(b,c)-f3n(b,c)+2*f4n(b,c)) + & +3*denom*( + & f1t(b,c)*(f1t(b,c)+f3t(c,b)+2*f4n(c,b))+ + & f2t(b,c)*f2n(c,b)+f3t(b,c)*f4n(b,c)) + emp5k=emp5k+denom*t1v2(b)*dintx2(c)* + & ( f1n(b,c)+f2t(b,c)+f4t(c,b) + & -2*(f3n(b,c)+f4t(b,c)+f2t(c,b)+ + & f1t(b,c)+f2n(b,c)+f3t(c,b)) + & +4*(f3t(b,c)+f4n(b,c)+f1t(c,b))) + & +denom*t1v2(b)*dintc2(c)* + & ( f1t(b,c)+f4t(b,c)+f1n(c,b) + & -2*(f2t(b,c)+f3t(b,c)+f2n(c,b))) + end do + end do +!$omp end target teams distribute parallel do +!$omp end target data + end if ! (i.ne.k) +!$omp end target data + + emp4 = emp4 + emp4i + emp5 = emp5 + emp5i + emp4 = emp4 + emp4k + emp5 = emp5 + emp5k +#else + +#if 0 +!use two separate calls to C implementation +!$omp target update to(emp4i,emp5i) + call ccsd_trpdrv_omp_reduce_01(f1n, f1t, f2n, f2t, + & f3n, f3t, f4n, f4t, + & eorb, + & ncor, nocc, nvir, +! & emp4i, emp5i, + & eaijk, + & dintc1, dintx1, t1v1) +!$omp target update from(emp4i,emp5i) + + if (i.ne.k) then +!$omp target update to(emp4k,emp5k) + call ccsd_trpdrv_omp_reduce_02(f1n, f1t, f2n, f2t, + & f3n, f3t, f4n, f4t, + & eorb, + & ncor, nocc, nvir, +! & emp4k, emp5k, + & eaijk, + & dintc2, dintx2, t1v2) +!$omp target update from(emp4k,emp5k) + end if ! (i.ne.k) + + emp4 = emp4 + emp4i + emp5 = emp5 + emp5i + emp4 = emp4 + emp4k + emp5 = emp5 + emp5k +#endif + + call ccsd_trpdrv_omp_fbody_reduce_new (f1n, f1t, f2n, f2t, + & f3n, f3t, f4n, f4t, + & eorb, + & ncor, nocc, nvir, + & emp4, emp5, + & i, k, + & eaijk, + & dintc1, dintx1, t1v1, + & dintc2, dintx2, t1v2) + +#endif + + t_red1 = util_wallsec() + t_red_total = t_red_total + (t_red1 - t_red0) + + tc1 = util_wallsec() + + tengy_flops = nvir*nvir*( 3 + 2*( 12 + 11 + 11 ) + 2*27 ) + agg_flops = agg_flops + tengy_flops + + if (occsdps) then + call pstat_off(ps_tengy) + else + call qexit('tengy',0) + endif + + end do ! k + end do ! i + + if (occsdps) then + call pstat_on(ps_accwait) + else + call qenter('accwait',0) + endif +!$acc wait(9) +!$acc wait(10) + if (occsdps) then + call pstat_off(ps_accwait) + else + call qexit('accwait',0) + endif + + if (iprt.gt.50)then + write(6,1234)me,a,j,emp4,emp5 + 1234 format(' me aijk',3i5,2e15.5) + end if + next=nxtask(nodes, 1) + + if(me.eq.0) then + pct_progr=(a-(ncor+nocc)+((klo-1)/kchunk)*nvir)*n_progr/ + & ((nocc/kchunk)*nvir)+1 + if(i_progr(pct_progr)) then + i_progr(pct_progr)=.false. + + write(6,4321) ' ccsd(t): done ', + & a-(ncor+nocc)+((klo-1)/kchunk)*nvir, + & ' out of ',(nocc/kchunk)*nvir, + & ' progress: ', + & ((a-(ncor+nocc)+((klo-1)/kchunk)*nvir)*100)/ + & ((nocc/kchunk)*nvir), + & '%, Gflops=',1e-9*(dgemm_flops+tengy_flops)/(tc1-tc0), + & ' at ',(util_wallsec()-tt0),' secs', + & ', Time for Dgemms =', (t_dgemm1 - t_dgemm0), ' secs', + & ', Time for Reduction =', (t_red1 - t_red0), ' secs' + + call util_flush(6) + 4321 format(a,i8,a,i8,a,i3,a,1pg11.4,a,0pf10.1,a, + & a, 0pf10.5, a, a, 0pf10.5, a) + + endif + endif + end if + end do + end do + end do + + print *, " " + print *, "- - - - - - - - - - - - - - - - - - - - - - - - - - -" + write(6,4331) + & 'TOTAL Time for Dgemms =', t_dgemm_total, ' secs; ', + & 'TOTAL Time for Reduction =', t_red_total, ' secs' + print *, "- - - - - - - - - - - - - - - - - -- - - - - - - - - -" + print *, " " + + call util_flush(6) + 4331 format(a, 0pf10.5, a, a, 0pf10.5, a) + +! end mapping of all data below +!$omp end target data + + !$omp interop destroy(obj0) + !$omp interop destroy(obj1) + !$omp interop destroy(obj2) + !$omp interop destroy(obj3) + !$omp interop destroy(obj4) + !$omp interop destroy(obj5) + !$omp interop destroy(obj6) + !$omp interop destroy(obj7) + !$omp interop destroy(obj_lev0) + + call ga_sync() + next=nxtask(-nodes, 1) + tt1=util_wallsec() + call ga_dgop(msg_cc_diis1,agg_flops,1, '+') + if(me.eq.0) then + write(6,4322) ' ccsd(t): 100% done, Aggregate Gflops=', + & 1e-9*agg_flops/(tt1-tt0),' in ',(tt1-tt0),' secs' + 4322 format(a,1pg11.4,a,0pf10.1,a) + call util_flush(6) + endif + call ga_sync() + if (occsdps) then + call pstat_off(ps_trpdrv) + else + call qexit('trpdrv',0) + endif +! + tt0 = util_wallsec() + deallocate( f1n, f1t, f2n, f2t, f3n, f3t, f4n, f4t, + & stat=alloc_error) + if (alloc_error.ne.0) call errquit('free f[1234][tn]',8,MA_ERR) + + deallocate( eorb, dintc1, dintx1, t1v1, dintc2, dintx2, t1v2, + & stat=alloc_error) + if (alloc_error.ne.0) call errquit('free CXT1 temps',6,MA_ERR) + + deallocate( Tij, Tkj, Tia, Tka, Xia, Xka, + & Jia, Jka, Kia, Kka, Jij, Jkj, Kij, Kkj, + & Dja, Djka, Djia, stat=alloc_error) + if (alloc_error.ne.0) call errquit('free TKJKD',1,MA_ERR) +! deallocate( xTij, xTkj, xTia, xTka, xXia, xXka, +! & xJia, xJka, xKia, xKka, xJij, xJkj, xKij, xKkj, +! & stat=alloc_error) +! if (alloc_error.ne.0) call errquit('free TKJKD GPU',1,MA_ERR) +! +! CUDA stuff +! + !do shi=1,8 + ! err = cublasDestroy(handle(shi)) + ! if (err.ne.0) call errquit('cublasDestroy',err,UNKNOWN_ERR) + ! err = cudaStreamDestroy(stream(shi)) + ! if (err.ne.0) call errquit('cudaStreamDestroy',err,UNKNOWN_ERR) + !end do +! + tt1 = util_wallsec() + if (me.eq.0) then + write(6,501) tt1-tt0 + 501 format('CU+MEM free took ',e15.5,' seconds') + endif +! + end diff --git a/src/ccsd/module/GNUmakefile b/src/ccsd/module/GNUmakefile new file mode 100644 index 0000000000..fba37698bc --- /dev/null +++ b/src/ccsd/module/GNUmakefile @@ -0,0 +1,32 @@ +# $Id$ + +include ../../config/makefile.h + +ifdef USE_IMAX_OPENMP_TRPDRV + OBJ_OPTIMIZE = \ + + OBJ_OPTIMIZE += ccsd_trpdrv_mkl_module.o + + + FOPTIONS += -fiopenmp -fopenmp-targets=spir64="-mllvm -vpo-paropt-enable-64bit-opencl-atomics=true -mllvm -vpo-paropt-opt-data-sharing-for-reduction=false" -qmkl -DMKL_ILP64 -I"${MKLROOT}/include" -fpp -fixed -free + +#OBJ_OPTIMIZE += ccsd_trpdrv_mkl_module.o +#.SECONDARY: ccsd_trpdrv_mkl_module.o +#ccsd_trpdrv_mkl_module.o: ccsd_trpdrv_mkl_module.F +# $(FC) $(MK_MODULE_FLAG) -c $< -o $@ + +#-include $(ccsd_trpdrv_mkl_module_) +#.PRECIOUS: ccsd_trpdrv_mkl_module.F +# $(FC) $(MK_MODULE_FLAG) -c $< -o $@ +#USES_BLAS += ccsd_trpdrv_mkl_module.F +#ccsd_trpdrv_mkl_module.o : ccsd_trpdrv_mkl_module.F +# $(FC) $(MK_MODULE_FLAG) -c $< -o $@ + +endif + + LIB_DEFINES +=-DUSE_F90INTERFACE + + LIBRARY = libccsd.a + +include ../../config/makelib.h + diff --git a/src/ccsd/module/ccsd_trpdrv_mkl_module.F b/src/ccsd/module/ccsd_trpdrv_mkl_module.F new file mode 100644 index 0000000000..1771b0fee9 --- /dev/null +++ b/src/ccsd/module/ccsd_trpdrv_mkl_module.F @@ -0,0 +1,5 @@ +include "mkl_omp_offload.f90" + + module ccsd_mklmodule + + end module ccsd_mklmodule From b7dbf4994f85941b470a92b69813dd69c6671eb6 Mon Sep 17 00:00:00 2001 From: Omar Khalil Ahmed Date: Mon, 13 Nov 2023 15:42:50 -0800 Subject: [PATCH 3/5] Ensure OMP Offload output is under DEBUG ifdef --- src/util/util_getenv.F | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/util/util_getenv.F b/src/util/util_getenv.F index 8f7820c07c..42f30818dd 100644 --- a/src/util/util_getenv.F +++ b/src/util/util_getenv.F @@ -111,7 +111,9 @@ integer function offload_device() span = offload_span() #if USE_OPENMP ndev = omp_get_num_devices() +#ifdef DEBUG write(6,*) ' omp_get_num_devices() ',ndev +#endif offload_device = mod(ga_nodeid() / span, ndev) call omp_set_default_device(offload_device) #else From 2291962e8df8517365890f2ba61981c1df964374 Mon Sep 17 00:00:00 2001 From: edoapra Date: Wed, 22 Nov 2023 10:00:00 -0800 Subject: [PATCH 4/5] one single definition for USES_BLAS --- src/ccsd/GNUmakefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ccsd/GNUmakefile b/src/ccsd/GNUmakefile index 681f040e71..056dc4521e 100644 --- a/src/ccsd/GNUmakefile +++ b/src/ccsd/GNUmakefile @@ -94,17 +94,19 @@ endif aoccsd2.F \ ccsd_fsig1.F \ ccsd_fsig2.F \ + ccsd_trpdrv_bgp2.F \ + ccsd_trpdrv_offload.F \ + ccsd_trpdrv_openacc.F \ + ccsd_trpdrv_openmp_imax.F \ moints_trp.F ifeq ($(TARGET),BGP) OBJ_OPTIMIZE += ccsd_trpdrv_bgp2.o ccsd_tengy_bgp2.o ccsd_tengy_bgp.o - USES_BLAS += ccsd_trpdrv_bgp2.F LIB_DEFINES += -DBGP endif ifdef USE_MIC_TRPDRV OBJ_OPTIMIZE += ccsd_trpdrv_offload.o - USES_BLAS += ccsd_trpdrv_offload.F LIB_DEFINES += -DUSE_MIC_TRPDRV endif ifeq ($(_FC),xlf) @@ -123,7 +125,6 @@ endif ifdef USE_IMAX_OPENMP_TRPDRV OBJ_OPTIMIZE += ccsd_trpdrv_openmp_imax.o - USES_BLAS += ccsd_trpdrv_openmp_imax.F OBJ_OPTIMIZE += ccsd_trpdrv_omp_reduce_f.o @@ -138,7 +139,6 @@ endif ifdef USE_OPENACC_TRPDRV OBJ_OPTIMIZE += ccsd_trpdrv_openacc.o - USES_BLAS += ccsd_trpdrv_openacc.F FOPTIONS += -DUSE_OPENACC_TRPDRV ifeq ($(_FC),pgf90) FOPTIONS += -Mextend -acc -cuda -cudalib=cublas From a3fc85f7443cfae0dfe71460f33343bce48e86d4 Mon Sep 17 00:00:00 2001 From: Omar Khalil Ahmed Date: Mon, 27 Nov 2023 05:43:47 -0800 Subject: [PATCH 5/5] Fixup to build for Intel Xe GPU OpenMP support --- src/config/makefile.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/config/makefile.h b/src/config/makefile.h index ee6a9e8a9b..4bdd71006a 100644 --- a/src/config/makefile.h +++ b/src/config/makefile.h @@ -2440,7 +2440,7 @@ ifneq ($(TARGET),LINUX) ifdef USE_IFX FOPTIONS += -fiopenmp ifdef USE_OFFLOAD - FOPTIONS += -fopenmp-targets=spirv64 + FOPTIONS += -fopenmp-targets=spir64 ifdef USE_IMAX_OPENMP_TRPDRV DEFINES += -DUSE_IMAX_OPENMP_TRPDRV endif