diff --git a/src/cuda-sim/cuda-sim.cc b/src/cuda-sim/cuda-sim.cc index f7bb9ccf8..4e8bb4433 100644 --- a/src/cuda-sim/cuda-sim.cc +++ b/src/cuda-sim/cuda-sim.cc @@ -2180,9 +2180,8 @@ void gpgpu_cuda_ptx_sim_main_func( kernel_info_t &kernel, bool openCL ) //we excute the kernel one CTA (Block) at the time, as synchronization functions work block wise while(!kernel.no_more_ctas_to_run()){ - unsigned temp=kernel.get_next_cta_id_single(); + unsigned cta_id=kernel.get_next_cta_id_single(); - if(cp_op==0 || (cp_op==1 && cta_launchedgetShaderCoreConfig()->warp_size ); - cta.execute(cp_count,temp); + cta.execute(cp_count,cta_id); #if (CUDART_VERSION >= 5000) launch_all_device_kernels(); #endif - } - else - { - kernel.increment_cta_id(); - } - cta_launched++; + } + + kernel.increment_cta_id(); + cta_launched++; } diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 007ad4234..011091093 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -1696,6 +1696,7 @@ void ldst_unit::L1_latency_queue_cycle() assert( !read_sent ); l1_latency_queue[0] = NULL; if ( mf_next->get_inst().is_load() ) { + bool insn_completed = false; for ( unsigned r=0; r < MAX_OUTPUT_VALUES; r++) if (mf_next->get_inst().out[r] > 0) { @@ -1705,9 +1706,12 @@ void ldst_unit::L1_latency_queue_cycle() { m_pending_writes[mf_next->get_inst().warp_id()].erase(mf_next->get_inst().out[r]); m_scoreboard->releaseRegister(mf_next->get_inst().warp_id(),mf_next->get_inst().out[r]); - m_core->warp_inst_complete(mf_next->get_inst()); + insn_completed = true; } } + + if (insn_completed) + m_core->warp_inst_complete(mf_next->get_inst()); } //For write hit in WB policy