Skip to content

Commit

Permalink
remove all the debugging/dead code and make the MA error message better
Browse files Browse the repository at this point in the history
Signed-off-by: Jeff Hammond <[email protected]>
  • Loading branch information
jeffhammond committed Oct 23, 2024
1 parent ab9107d commit 0468583
Showing 1 changed file with 18 additions and 118 deletions.
136 changes: 18 additions & 118 deletions src/tce/ccsd_t/ccsd_t_gpu.F
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,10 @@ SUBROUTINE ccsd_t_gpu(d_t1,k_t1_offset,d_t2,k_t2_offset,

integer size,i
integer g_energy
integer g_total_d, g_total_s
integer nxtask
integer next
integer nprocs
integer count
integer armci_master
c - T1/X1 LOCALIZATION -------------------
integer l_t1_local,k_t1_local
integer size_t1
Expand All @@ -42,42 +40,28 @@ SUBROUTINE ccsd_t_gpu(d_t1,k_t1_offset,d_t2,k_t2_offset,
double precision energy1,energy2,energy2_t
double precision factor
double precision factor_l(1)
double precision energy_l(2),total_d(1),total_s(1),total_all_d
double precision total_all_s
double precision energy_l(2),total_d(1),total_s(1)
external nxtask
c Wenjing
c for getting device information
external integer armci_master
external device_init
double precision time1
ckbn -2
c double precision sum_s,sum_d
c NEW...
integer has_GPU
external check_device
logical nodezero
c static int device_id=-1
c
c - T1/X1 LOCALIZATION ----------
c opening l_t1_local and l_x1_local
c NEW...

cTCE_CUDA
integer icuda
integer cuda_device_number
cuda_device_number = 0

ckbn -2
nodezero=(ga_nodeid().eq.0)
ckbn sum_s = 0.0d0
ckbn sum_d = 0.0d0
has_GPU = check_device(icuda)
if (has_GPU.eq.1) then
call device_init(icuda,cuda_device_number)
if(cuda_device_number .eq. 30 ) call errquit("cuda",30,INPUT_ERR)
if (cuda_device_number .eq. 30) then
call errquit("cuda",30,INPUT_ERR)
endif
endif
if (nodezero) then
write(*,'(A,I3,A)') "Using ",icuda," device per node"
endif
if(nodezero)
+ write(*,'(A,I3,A)') "Using ",icuda, " device per node"
if (nodezero) call util_flush(LuOut)

ckbn In a large cluster it is better to get and broadcast
Expand All @@ -89,18 +73,9 @@ SUBROUTINE ccsd_t_gpu(d_t1,k_t1_offset,d_t2,k_t2_offset,
call ga_get(d_t1,1,size_t1,1,1,dbl_mb(k_t1_local),size_t1)
c -------------------------------
c


c if (.not.ga_create(mt_dbl,1,1,'total_d',1,1,g_total_d))
c 1 call errquit('ccsd_t: GA problem',0,GA_ERR)
c if (.not.ga_create(mt_dbl,1,1,'total_s',1,1,g_total_s))
c 1 call errquit('ccsd_t: GA problem',0,GA_ERR)

nprocs = GA_NNODES()
count = 0
next = nxtask(nprocs,1)
c total_all_d = 0.0d0
c total_all_s = 0.0d0

energy1=0.0d0
energy2=0.0d0
Expand All @@ -110,8 +85,6 @@ SUBROUTINE ccsd_t_gpu(d_t1,k_t1_offset,d_t2,k_t2_offset,
do t_h1b = 1,noab
do t_h2b = t_h1b,noab
do t_h3b = t_h2b,noab
ccx if (next.eq.count) then


if (int_mb(k_spin+t_p4b-1)
1 +int_mb(k_spin+t_p5b-1)
Expand Down Expand Up @@ -141,45 +114,28 @@ SUBROUTINE ccsd_t_gpu(d_t1,k_t1_offset,d_t2,k_t2_offset,
3 * int_mb(k_range+t_h1b-1)
4 * int_mb(k_range+t_h2b-1)
5 * int_mb(k_range+t_h3b-1)
time1 = - util_wallsec()
if (.not.MA_PUSH_GET(mt_dbl,size,'(T) singles',l_singles,
1 k_singles)) call errquit('ccsd_t: MA error',1,MA_ERR)

if (.not.MA_PUSH_GET(mt_dbl,size,'(T) doubles',l_doubles,
1 k_doubles)) call errquit('ccsd_t: MA error',2,MA_ERR)
time1=time1+ util_wallsec()
c write (*,*) 'time for MA_PUSH_GET ', time1
ccx do i = 1, size
ccx dbl_mb(k_singles+i-1) = 0.0d0
ccx dbl_mb(k_doubles+i-1) = 0.0d0
ccx enddo
c zeroing ---
time1 = - util_wallsec()
call dfill(size, 0.0d0, dbl_mb(k_singles), 1)

if (.not.MA_PUSH_GET(mt_dbl,size,'(T) singles',
& l_singles,k_singles)) then
call errquit('ccsd_t_gpu: MA error - singles',size,MA_ERR)
endif
if (.not.MA_PUSH_GET(mt_dbl,size,'(T) doubles',
& l_doubles,k_doubles)) then
call errquit('ccsd_t_gpu: MA error - doubles',size,MA_ERR)
endif
call dfill(size, 0.0d0, dbl_mb(k_singles), 1)
call dfill(size, 0.0d0, dbl_mb(k_doubles), 1)

time1=time1+ util_wallsec()
c write (*,*) 'time for dfill MA_PUSH_GET ', time1
call dfill(size, 0.0d0, dbl_mb(k_doubles), 1)
c -----------
c call device_init()
c device_me = get_device_id()
c NEW..
c init GPU mem
has_GPU = check_device(icuda)
if (has_GPU.eq.1) then
call initmemmodule()
endif

c NEW
has_GPU = check_device(icuda)
call ccsd_t_singles_gpu(dbl_mb(k_singles),
1 k_t1_local,d_v2,k_t1_offset,
2 k_v2_offset,t_h1b,t_h2b,t_h3b,t_p4b,t_p5b,t_p6b,2,
3 has_GPU)

c device_me = get_device_id()
c if (device_me<NUM_DEVICE) then
call ccsd_t_doubles_gpu(dbl_mb(k_doubles),d_t2,d_v2,
+ k_t2_offset,
1 k_v2_offset,t_h1b,t_h2b,t_h3b,t_p4b,t_p5b,t_p6b,2,
Expand All @@ -189,11 +145,6 @@ SUBROUTINE ccsd_t_gpu(d_t1,k_t1_offset,d_t2,k_t2_offset,
c 1 k_v2_offset,t_h1b,t_h2b,t_h3b,t_p4b,t_p5b,t_p6b,2, 0)
c endif

ckbn do i=1,size
ckbn sum_s = sum_s + dbl_mb(k_singles+i-1)
ckbn sum_d = sum_d + dbl_mb(k_doubles+i-1)
ckbn enddo

if (restricted) then
factor = 2.0d0
else
Expand All @@ -210,26 +161,16 @@ SUBROUTINE ccsd_t_gpu(d_t1,k_t1_offset,d_t2,k_t2_offset,
factor = factor / 2.0d0
endif
i = 0
c write (*,*) 't_p4 is ', int_mb(k_offset+t_p4b-1),
c 1 int_mb(k_offset+t_p5b-1),int_mb(k_offset+t_p6b-1),
c 2 int_mb(k_offset+t_h1b-1),int_mb(k_offset+t_h2b-1),
c 3 int_mb(k_offset+t_h3b-1)


c NEW
c device_me = get_device_id()
has_GPU = check_device(icuda)
if (has_GPU.eq.0) then
c CPU process

do t_p4 = 1, int_mb(k_range+t_p4b-1)
do t_p5 = 1, int_mb(k_range+t_p5b-1)
do t_p6 = 1, int_mb(k_range+t_p6b-1)
do t_h1 = 1, int_mb(k_range+t_h1b-1)
do t_h2 = 1, int_mb(k_range+t_h2b-1)
do t_h3 = 1, int_mb(k_range+t_h3b-1)
i = i + 1
c energy1 = energy1 + dbl_mb(k_singles+i-1)

energy1 = energy1 + factor * dbl_mb(k_doubles+i-1)
1 * dbl_mb(k_doubles+i-1)
2 / (-dbl_mb(k_evl_sorted+int_mb(k_offset+t_p4b-1)+t_p4-1)
Expand All @@ -238,9 +179,6 @@ SUBROUTINE ccsd_t_gpu(d_t1,k_t1_offset,d_t2,k_t2_offset,
5 +dbl_mb(k_evl_sorted+int_mb(k_offset+t_h1b-1)+t_h1-1)
6 +dbl_mb(k_evl_sorted+int_mb(k_offset+t_h2b-1)+t_h2-1)
7 +dbl_mb(k_evl_sorted+int_mb(k_offset+t_h3b-1)+t_h3-1))
c energy2_t = factor * dbl_mb(k_doubles+i-1)
c energy2_t = dbl_mb(k_doubles+i-1)
c energy2 = energy2 + factor * dbl_mb(k_doubles+i-1)
energy2 = energy2 + factor * dbl_mb(k_doubles+i-1)
1 * (dbl_mb(k_singles+i-1) + dbl_mb(k_doubles+i-1))
2 / (-dbl_mb(k_evl_sorted+int_mb(k_offset+t_p4b-1)+t_p4-1)
Expand All @@ -249,21 +187,12 @@ SUBROUTINE ccsd_t_gpu(d_t1,k_t1_offset,d_t2,k_t2_offset,
5 +dbl_mb(k_evl_sorted+int_mb(k_offset+t_h1b-1)+t_h1-1)
6 +dbl_mb(k_evl_sorted+int_mb(k_offset+t_h2b-1)+t_h2-1)
7 +dbl_mb(k_evl_sorted+int_mb(k_offset+t_h3b-1)+t_h3-1))
c write (*,*) 'CPU energy ', energy2, energy2_t
c energy2 = energy2+energy2_t
c total_all_d = total_all_d + dbl_mb(k_doubles+1)
c dbl_mb(k_evl_sorted+int_mb
c &(k_offset+t_p4b-1)+t_p4-1)
c dbl_mb(k_doubles+i-1)
c total_all_s = total_all_s + dbl_mb(k_singles+i-1)
enddo
enddo
enddo
enddo
enddo
enddo
c total_all_d = total_all_d + dbl_mb(k_doubles+1)
c total_all_s = total_all_s + dbl_mb(k_singles+1)

else
c GPU process
Expand All @@ -279,22 +208,13 @@ SUBROUTINE ccsd_t_gpu(d_t1,k_t1_offset,d_t2,k_t2_offset,
8 int_mb(k_range+t_h3b-1),int_mb(k_range+t_p4b-1),
9 int_mb(k_range+t_p5b-1),int_mb(k_range+t_p6b-1),
1 dbl_mb(k_doubles),dbl_mb(k_singles),total_d,total_s)
c write (*,*) 'F paras ',dbl_mb(k_doubles),dbl_mb(k_singles+1),
c 1 dbl_mb(k_evl_sorted+int_mb(k_offset+t_h1b-1)),factor_l(1)
c write (*,*) 'GPU energy ', energy1, energy_l(1), energy2,
c 1 energy_l(2)
c write (*,*) 'double and single', total_d(1), total_s(1)
c total_all_d = total_all_d + total_d(1)
c total_all_s = total_all_s + total_s(1)
energy1 = energy1 + energy_l(1)
energy2 = energy2 + energy_l(2)
c release GPU memory
call dev_release()
call finalizememmodule()
c endif
endif

c
if (.not.MA_POP_STACK(l_doubles))
1 call errquit('ccsd_t',3,MA_ERR)

Expand All @@ -308,35 +228,15 @@ SUBROUTINE ccsd_t_gpu(d_t1,k_t1_offset,d_t2,k_t2_offset,
endif
endif
endif
ccx next = nxtask(nprocs,1)
ccx endif
ccx count = count + 1
enddo
enddo
enddo
enddo
enddo
enddo
next = nxtask(-nprocs,1)
#if 0
if (.not.ga_create(mt_dbl,1,1,'perturbative',1,1,g_energy))
1 call errquit('ccsd_t: GA problem',0,GA_ERR)
call ga_zero(g_energy)
call ga_acc(g_energy,1,1,1,1,energy1,1,1.0d0)
call ga_sync()
call ga_get(g_energy,1,1,1,1,energy1,1)
call ga_zero(g_energy)
call ga_acc(g_energy,1,1,1,1,energy2,1,1.0d0)
call ga_sync()
call ga_get(g_energy,1,1,1,1,energy2,1)
if (.not.ga_destroy(g_energy))
1 call errquit('ccsd_t: GA problem',1,GA_ERR)
#else
call ga_dgop(1975,energy1,1,'+')
call ga_dgop(1976,energy2,1,'+')
#endif



c - T1/X1 LOCALIZATION ------
if(.not.MA_POP_STACK(l_t1_local))
Expand Down

0 comments on commit 0468583

Please sign in to comment.