diff --git a/src/tce/ccsd_t/ccsd_t_gpu.F b/src/tce/ccsd_t/ccsd_t_gpu.F index 8207589789..b2be49f2a8 100644 --- a/src/tce/ccsd_t/ccsd_t_gpu.F +++ b/src/tce/ccsd_t/ccsd_t_gpu.F @@ -28,12 +28,10 @@ SUBROUTINE ccsd_t_gpu(d_t1,k_t1_offset,d_t2,k_t2_offset, integer size,i integer g_energy - integer g_total_d, g_total_s integer nxtask integer next integer nprocs integer count - integer armci_master c - T1/X1 LOCALIZATION ------------------- integer l_t1_local,k_t1_local integer size_t1 @@ -42,42 +40,28 @@ SUBROUTINE ccsd_t_gpu(d_t1,k_t1_offset,d_t2,k_t2_offset, double precision energy1,energy2,energy2_t double precision factor double precision factor_l(1) - double precision energy_l(2),total_d(1),total_s(1),total_all_d - double precision total_all_s + double precision energy_l(2),total_d(1),total_s(1) external nxtask -c Wenjing -c for getting device information - external integer armci_master external device_init - double precision time1 -ckbn -2 -c double precision sum_s,sum_d -c NEW... integer has_GPU external check_device logical nodezero -c static int device_id=-1 -c -c - T1/X1 LOCALIZATION ---------- -c opening l_t1_local and l_x1_local -c NEW... -cTCE_CUDA integer icuda integer cuda_device_number cuda_device_number = 0 -ckbn -2 nodezero=(ga_nodeid().eq.0) -ckbn sum_s = 0.0d0 -ckbn sum_d = 0.0d0 has_GPU = check_device(icuda) if (has_GPU.eq.1) then call device_init(icuda,cuda_device_number) - if(cuda_device_number .eq. 30 ) call errquit("cuda",30,INPUT_ERR) + if (cuda_device_number .eq. 30) then + call errquit("cuda",30,INPUT_ERR) + endif + endif + if (nodezero) then + write(*,'(A,I3,A)') "Using ",icuda," device per node" endif - if(nodezero) - + write(*,'(A,I3,A)') "Using ",icuda, " device per node" if (nodezero) call util_flush(LuOut) ckbn In a large cluster it is better to get and broadcast @@ -86,21 +70,17 @@ SUBROUTINE ccsd_t_gpu(d_t1,k_t1_offset,d_t2,k_t2_offset, 1 call errquit('t1_local',1,MA_ERR) call ma_zero(dbl_mb(k_t1_local),size_t1) c copy d_t1 ==> l_t1_local +! copied pattern from cr-eomccsd_t/cr_eomccsd_t.F +#if 1 + call util_mygabcast(d_t1,size_t1,1,dbl_mb(k_t1_local),size_t1) +#else call ga_get(d_t1,1,size_t1,1,1,dbl_mb(k_t1_local),size_t1) +#endif c ------------------------------- c - - -c if (.not.ga_create(mt_dbl,1,1,'total_d',1,1,g_total_d)) -c 1 call errquit('ccsd_t: GA problem',0,GA_ERR) -c if (.not.ga_create(mt_dbl,1,1,'total_s',1,1,g_total_s)) -c 1 call errquit('ccsd_t: GA problem',0,GA_ERR) - nprocs = GA_NNODES() count = 0 next = nxtask(nprocs,1) -c total_all_d = 0.0d0 -c total_all_s = 0.0d0 energy1=0.0d0 energy2=0.0d0 @@ -110,8 +90,6 @@ SUBROUTINE ccsd_t_gpu(d_t1,k_t1_offset,d_t2,k_t2_offset, do t_h1b = 1,noab do t_h2b = t_h1b,noab do t_h3b = t_h2b,noab -ccx if (next.eq.count) then - if (int_mb(k_spin+t_p4b-1) 1 +int_mb(k_spin+t_p5b-1) @@ -141,45 +119,28 @@ SUBROUTINE ccsd_t_gpu(d_t1,k_t1_offset,d_t2,k_t2_offset, 3 * int_mb(k_range+t_h1b-1) 4 * int_mb(k_range+t_h2b-1) 5 * int_mb(k_range+t_h3b-1) - time1 = - util_wallsec() - if (.not.MA_PUSH_GET(mt_dbl,size,'(T) singles',l_singles, - 1 k_singles)) call errquit('ccsd_t: MA error',1,MA_ERR) - - if (.not.MA_PUSH_GET(mt_dbl,size,'(T) doubles',l_doubles, - 1 k_doubles)) call errquit('ccsd_t: MA error',2,MA_ERR) - time1=time1+ util_wallsec() -c write (*,*) 'time for MA_PUSH_GET ', time1 -ccx do i = 1, size -ccx dbl_mb(k_singles+i-1) = 0.0d0 -ccx dbl_mb(k_doubles+i-1) = 0.0d0 -ccx enddo -c zeroing --- - time1 = - util_wallsec() - call dfill(size, 0.0d0, dbl_mb(k_singles), 1) - + if (.not.MA_PUSH_GET(mt_dbl,size,'(T) singles', + & l_singles,k_singles)) then + call errquit('ccsd_t_gpu: MA error - singles',size,MA_ERR) + endif + if (.not.MA_PUSH_GET(mt_dbl,size,'(T) doubles', + & l_doubles,k_doubles)) then + call errquit('ccsd_t_gpu: MA error - doubles',size,MA_ERR) + endif + call dfill(size, 0.0d0, dbl_mb(k_singles), 1) + call dfill(size, 0.0d0, dbl_mb(k_doubles), 1) - time1=time1+ util_wallsec() -c write (*,*) 'time for dfill MA_PUSH_GET ', time1 - call dfill(size, 0.0d0, dbl_mb(k_doubles), 1) -c ----------- -c call device_init() -c device_me = get_device_id() -c NEW.. -c init GPU mem has_GPU = check_device(icuda) if (has_GPU.eq.1) then call initmemmodule() endif -c NEW has_GPU = check_device(icuda) call ccsd_t_singles_gpu(dbl_mb(k_singles), 1 k_t1_local,d_v2,k_t1_offset, 2 k_v2_offset,t_h1b,t_h2b,t_h3b,t_p4b,t_p5b,t_p6b,2, 3 has_GPU) -c device_me = get_device_id() -c if (device_me