Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TCE CCSD(T) CUDA logic update #968

Merged
merged 5 commits into from
Oct 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 23 additions & 118 deletions src/tce/ccsd_t/ccsd_t_gpu.F
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,10 @@ SUBROUTINE ccsd_t_gpu(d_t1,k_t1_offset,d_t2,k_t2_offset,

integer size,i
integer g_energy
integer g_total_d, g_total_s
integer nxtask
integer next
integer nprocs
integer count
integer armci_master
c - T1/X1 LOCALIZATION -------------------
integer l_t1_local,k_t1_local
integer size_t1
Expand All @@ -42,42 +40,28 @@ SUBROUTINE ccsd_t_gpu(d_t1,k_t1_offset,d_t2,k_t2_offset,
double precision energy1,energy2,energy2_t
double precision factor
double precision factor_l(1)
double precision energy_l(2),total_d(1),total_s(1),total_all_d
double precision total_all_s
double precision energy_l(2),total_d(1),total_s(1)
external nxtask
c Wenjing
c for getting device information
external integer armci_master
external device_init
double precision time1
ckbn -2
c double precision sum_s,sum_d
c NEW...
integer has_GPU
external check_device
logical nodezero
c static int device_id=-1
c
c - T1/X1 LOCALIZATION ----------
c opening l_t1_local and l_x1_local
c NEW...

cTCE_CUDA
integer icuda
integer cuda_device_number
cuda_device_number = 0

ckbn -2
nodezero=(ga_nodeid().eq.0)
ckbn sum_s = 0.0d0
ckbn sum_d = 0.0d0
has_GPU = check_device(icuda)
if (has_GPU.eq.1) then
call device_init(icuda,cuda_device_number)
if(cuda_device_number .eq. 30 ) call errquit("cuda",30,INPUT_ERR)
if (cuda_device_number .eq. 30) then
call errquit("cuda",30,INPUT_ERR)
endif
endif
if (nodezero) then
write(*,'(A,I3,A)') "Using ",icuda," device per node"
endif
if(nodezero)
+ write(*,'(A,I3,A)') "Using ",icuda, " device per node"
if (nodezero) call util_flush(LuOut)

ckbn In a large cluster it is better to get and broadcast
Expand All @@ -86,21 +70,17 @@ SUBROUTINE ccsd_t_gpu(d_t1,k_t1_offset,d_t2,k_t2_offset,
1 call errquit('t1_local',1,MA_ERR)
call ma_zero(dbl_mb(k_t1_local),size_t1)
c copy d_t1 ==> l_t1_local
! copied pattern from cr-eomccsd_t/cr_eomccsd_t.F
#if 1
call util_mygabcast(d_t1,size_t1,1,dbl_mb(k_t1_local),size_t1)
#else
call ga_get(d_t1,1,size_t1,1,1,dbl_mb(k_t1_local),size_t1)
#endif
c -------------------------------
c


c if (.not.ga_create(mt_dbl,1,1,'total_d',1,1,g_total_d))
c 1 call errquit('ccsd_t: GA problem',0,GA_ERR)
c if (.not.ga_create(mt_dbl,1,1,'total_s',1,1,g_total_s))
c 1 call errquit('ccsd_t: GA problem',0,GA_ERR)

nprocs = GA_NNODES()
count = 0
next = nxtask(nprocs,1)
c total_all_d = 0.0d0
c total_all_s = 0.0d0

energy1=0.0d0
energy2=0.0d0
Expand All @@ -110,8 +90,6 @@ SUBROUTINE ccsd_t_gpu(d_t1,k_t1_offset,d_t2,k_t2_offset,
do t_h1b = 1,noab
do t_h2b = t_h1b,noab
do t_h3b = t_h2b,noab
ccx if (next.eq.count) then


if (int_mb(k_spin+t_p4b-1)
1 +int_mb(k_spin+t_p5b-1)
Expand Down Expand Up @@ -141,45 +119,28 @@ SUBROUTINE ccsd_t_gpu(d_t1,k_t1_offset,d_t2,k_t2_offset,
3 * int_mb(k_range+t_h1b-1)
4 * int_mb(k_range+t_h2b-1)
5 * int_mb(k_range+t_h3b-1)
time1 = - util_wallsec()
if (.not.MA_PUSH_GET(mt_dbl,size,'(T) singles',l_singles,
1 k_singles)) call errquit('ccsd_t: MA error',1,MA_ERR)

if (.not.MA_PUSH_GET(mt_dbl,size,'(T) doubles',l_doubles,
1 k_doubles)) call errquit('ccsd_t: MA error',2,MA_ERR)
time1=time1+ util_wallsec()
c write (*,*) 'time for MA_PUSH_GET ', time1
ccx do i = 1, size
ccx dbl_mb(k_singles+i-1) = 0.0d0
ccx dbl_mb(k_doubles+i-1) = 0.0d0
ccx enddo
c zeroing ---
time1 = - util_wallsec()
call dfill(size, 0.0d0, dbl_mb(k_singles), 1)

if (.not.MA_PUSH_GET(mt_dbl,size,'(T) singles',
& l_singles,k_singles)) then
call errquit('ccsd_t_gpu: MA error - singles',size,MA_ERR)
endif
if (.not.MA_PUSH_GET(mt_dbl,size,'(T) doubles',
& l_doubles,k_doubles)) then
call errquit('ccsd_t_gpu: MA error - doubles',size,MA_ERR)
endif
call dfill(size, 0.0d0, dbl_mb(k_singles), 1)
call dfill(size, 0.0d0, dbl_mb(k_doubles), 1)

time1=time1+ util_wallsec()
c write (*,*) 'time for dfill MA_PUSH_GET ', time1
call dfill(size, 0.0d0, dbl_mb(k_doubles), 1)
c -----------
c call device_init()
c device_me = get_device_id()
c NEW..
c init GPU mem
has_GPU = check_device(icuda)
if (has_GPU.eq.1) then
call initmemmodule()
endif

c NEW
has_GPU = check_device(icuda)
call ccsd_t_singles_gpu(dbl_mb(k_singles),
1 k_t1_local,d_v2,k_t1_offset,
2 k_v2_offset,t_h1b,t_h2b,t_h3b,t_p4b,t_p5b,t_p6b,2,
3 has_GPU)

c device_me = get_device_id()
c if (device_me<NUM_DEVICE) then
call ccsd_t_doubles_gpu(dbl_mb(k_doubles),d_t2,d_v2,
+ k_t2_offset,
1 k_v2_offset,t_h1b,t_h2b,t_h3b,t_p4b,t_p5b,t_p6b,2,
Expand All @@ -189,11 +150,6 @@ SUBROUTINE ccsd_t_gpu(d_t1,k_t1_offset,d_t2,k_t2_offset,
c 1 k_v2_offset,t_h1b,t_h2b,t_h3b,t_p4b,t_p5b,t_p6b,2, 0)
c endif

ckbn do i=1,size
ckbn sum_s = sum_s + dbl_mb(k_singles+i-1)
ckbn sum_d = sum_d + dbl_mb(k_doubles+i-1)
ckbn enddo

if (restricted) then
factor = 2.0d0
else
Expand All @@ -210,26 +166,16 @@ SUBROUTINE ccsd_t_gpu(d_t1,k_t1_offset,d_t2,k_t2_offset,
factor = factor / 2.0d0
endif
i = 0
c write (*,*) 't_p4 is ', int_mb(k_offset+t_p4b-1),
c 1 int_mb(k_offset+t_p5b-1),int_mb(k_offset+t_p6b-1),
c 2 int_mb(k_offset+t_h1b-1),int_mb(k_offset+t_h2b-1),
c 3 int_mb(k_offset+t_h3b-1)


c NEW
c device_me = get_device_id()
has_GPU = check_device(icuda)
if (has_GPU.eq.0) then
c CPU process

do t_p4 = 1, int_mb(k_range+t_p4b-1)
do t_p5 = 1, int_mb(k_range+t_p5b-1)
do t_p6 = 1, int_mb(k_range+t_p6b-1)
do t_h1 = 1, int_mb(k_range+t_h1b-1)
do t_h2 = 1, int_mb(k_range+t_h2b-1)
do t_h3 = 1, int_mb(k_range+t_h3b-1)
i = i + 1
c energy1 = energy1 + dbl_mb(k_singles+i-1)

energy1 = energy1 + factor * dbl_mb(k_doubles+i-1)
1 * dbl_mb(k_doubles+i-1)
2 / (-dbl_mb(k_evl_sorted+int_mb(k_offset+t_p4b-1)+t_p4-1)
Expand All @@ -238,9 +184,6 @@ SUBROUTINE ccsd_t_gpu(d_t1,k_t1_offset,d_t2,k_t2_offset,
5 +dbl_mb(k_evl_sorted+int_mb(k_offset+t_h1b-1)+t_h1-1)
6 +dbl_mb(k_evl_sorted+int_mb(k_offset+t_h2b-1)+t_h2-1)
7 +dbl_mb(k_evl_sorted+int_mb(k_offset+t_h3b-1)+t_h3-1))
c energy2_t = factor * dbl_mb(k_doubles+i-1)
c energy2_t = dbl_mb(k_doubles+i-1)
c energy2 = energy2 + factor * dbl_mb(k_doubles+i-1)
energy2 = energy2 + factor * dbl_mb(k_doubles+i-1)
1 * (dbl_mb(k_singles+i-1) + dbl_mb(k_doubles+i-1))
2 / (-dbl_mb(k_evl_sorted+int_mb(k_offset+t_p4b-1)+t_p4-1)
Expand All @@ -249,21 +192,12 @@ SUBROUTINE ccsd_t_gpu(d_t1,k_t1_offset,d_t2,k_t2_offset,
5 +dbl_mb(k_evl_sorted+int_mb(k_offset+t_h1b-1)+t_h1-1)
6 +dbl_mb(k_evl_sorted+int_mb(k_offset+t_h2b-1)+t_h2-1)
7 +dbl_mb(k_evl_sorted+int_mb(k_offset+t_h3b-1)+t_h3-1))
c write (*,*) 'CPU energy ', energy2, energy2_t
c energy2 = energy2+energy2_t
c total_all_d = total_all_d + dbl_mb(k_doubles+1)
c dbl_mb(k_evl_sorted+int_mb
c &(k_offset+t_p4b-1)+t_p4-1)
c dbl_mb(k_doubles+i-1)
c total_all_s = total_all_s + dbl_mb(k_singles+i-1)
enddo
enddo
enddo
enddo
enddo
enddo
c total_all_d = total_all_d + dbl_mb(k_doubles+1)
c total_all_s = total_all_s + dbl_mb(k_singles+1)

else
c GPU process
Expand All @@ -279,22 +213,13 @@ SUBROUTINE ccsd_t_gpu(d_t1,k_t1_offset,d_t2,k_t2_offset,
8 int_mb(k_range+t_h3b-1),int_mb(k_range+t_p4b-1),
9 int_mb(k_range+t_p5b-1),int_mb(k_range+t_p6b-1),
1 dbl_mb(k_doubles),dbl_mb(k_singles),total_d,total_s)
c write (*,*) 'F paras ',dbl_mb(k_doubles),dbl_mb(k_singles+1),
c 1 dbl_mb(k_evl_sorted+int_mb(k_offset+t_h1b-1)),factor_l(1)
c write (*,*) 'GPU energy ', energy1, energy_l(1), energy2,
c 1 energy_l(2)
c write (*,*) 'double and single', total_d(1), total_s(1)
c total_all_d = total_all_d + total_d(1)
c total_all_s = total_all_s + total_s(1)
energy1 = energy1 + energy_l(1)
energy2 = energy2 + energy_l(2)
c release GPU memory
call dev_release()
call finalizememmodule()
c endif
endif

c
if (.not.MA_POP_STACK(l_doubles))
1 call errquit('ccsd_t',3,MA_ERR)

Expand All @@ -308,35 +233,15 @@ SUBROUTINE ccsd_t_gpu(d_t1,k_t1_offset,d_t2,k_t2_offset,
endif
endif
endif
ccx next = nxtask(nprocs,1)
ccx endif
ccx count = count + 1
enddo
enddo
enddo
enddo
enddo
enddo
next = nxtask(-nprocs,1)
#if 0
if (.not.ga_create(mt_dbl,1,1,'perturbative',1,1,g_energy))
1 call errquit('ccsd_t: GA problem',0,GA_ERR)
call ga_zero(g_energy)
call ga_acc(g_energy,1,1,1,1,energy1,1,1.0d0)
call ga_sync()
call ga_get(g_energy,1,1,1,1,energy1,1)
call ga_zero(g_energy)
call ga_acc(g_energy,1,1,1,1,energy2,1,1.0d0)
call ga_sync()
call ga_get(g_energy,1,1,1,1,energy2,1)
if (.not.ga_destroy(g_energy))
1 call errquit('ccsd_t: GA problem',1,GA_ERR)
#else
call ga_dgop(1975,energy1,1,'+')
call ga_dgop(1976,energy2,1,'+')
#endif



c - T1/X1 LOCALIZATION ------
if(.not.MA_POP_STACK(l_t1_local))
Expand Down
59 changes: 25 additions & 34 deletions src/tce/tce_energy.F
Original file line number Diff line number Diff line change
Expand Up @@ -242,9 +242,7 @@ logical function tce_energy(rtdb,excitedfragment)
external scf
integer i
integer j
#if defined(TCE_CUDA) || defined(TCE_HIP)
integer icuda
#endif
!#ifdef EACCSD
logical eaccsd
!#endif
Expand Down Expand Up @@ -3342,48 +3340,37 @@ logical function tce_energy(rtdb,excitedfragment)
if (nodezero) write(LuOut,*) 'CCSD(T)'
if (do_pt) perturbative='(t)'
!
cpu = - util_cpusec()
wall = - util_wallsec()
#ifdef USE_PSTAT
c
c profiling
c
! profiling
call ccsd_t_pstat_init(rtdb)
call ccsd_t_fdist_init()
#endif
cpu = - util_cpusec()
wall = - util_wallsec()
if (restart_ccsd_t.eq.0) then
if (.not.rtdb_get(rtdb,'tce:cuda',mt_int,1,icuda)) icuda = 0
if (icuda .ne. 0) then
#if defined(TCE_CUDA) || defined(TCE_HIP)
if(.not.rtdb_get(rtdb,'tce:cuda',mt_int,1,icuda)) then
if (nodezero) write(LuOut,*) 'Using plain CCSD(T) code'
if (nodezero) call util_flush(LuOut)
call ccsd_t(d_t1,k_t1_offset,d_t2,k_t2_offset,
1 d_v2,k_v2_offset,pt3_1,pt3_2,size_t1)
else
if (nodezero) write(LuOut,*) 'Using CUDA CCSD(T) code'
!kbn call errquit('tce_energy: reading cuda',icuda,CALC_ERR)
if (nodezero) call util_flush(LuOut)
call ccsd_t_gpu(d_t1,k_t1_offset,d_t2,k_t2_offset,
1 d_v2,k_v2_offset,pt3_1,pt3_2,size_t1,icuda)
end if
if (nodezero) write(LuOut,*) 'Using CUDA CCSD(T) code'
if (nodezero) call util_flush(LuOut)
call ccsd_t_gpu(d_t1,k_t1_offset,d_t2,k_t2_offset,
& d_v2,k_v2_offset,pt3_1,pt3_2,size_t1,
& icuda)
#else
call errquit('tce_energy: ccsd_t_gpu requested'//
& 'but not compiled',iter,CALC_ERR)
#endif
else if (slice) then
if (nodezero) write(LuOut,*) 'Using sliced CCSD(T) code'
call ccsd_t_6dts(d_t1,k_t1_offset,d_t2,k_t2_offset,
& d_v2,k_v2_offset,pt3_1,pt3_2,size_t1,
& xmem)
else
if (nodezero) write(LuOut,*) 'Using plain CCSD(T) code'
if (nodezero) call util_flush(LuOut)
if(.not.slice) then
call ccsd_t(d_t1,k_t1_offset,d_t2,k_t2_offset,
1 d_v2,k_v2_offset,pt3_1,pt3_2,size_t1)
else
if (nodezero) write(LuOut,*) 'Using sliced CCSD(T) code'
call ccsd_t_6dts(d_t1,k_t1_offset,d_t2,k_t2_offset,
1 d_v2,k_v2_offset,pt3_1,pt3_2,size_t1,xmem)
end if
#endif
#ifdef USE_PSTAT
c
c cleanup profiling
c
call ccsd_t_pstat_print
#endif

& d_v2,k_v2_offset,pt3_1,pt3_2,size_t1)
end if
elseif (restart_ccsd_t.gt.0) then
pt3_1 = 0.0d0
call ccsd_t_restart(rtdb,d_t1,k_t1_offset,
Expand All @@ -3392,6 +3379,10 @@ logical function tce_energy(rtdb,excitedfragment)
endif
cpu = cpu + util_cpusec()
wall = wall + util_wallsec()
#ifdef USE_PSTAT
! cleanup profiling
call ccsd_t_pstat_print
#endif
if (nodezero) then
write(LuOut,*)
if (restart_ccsd_t.eq.0) then
Expand Down