From eb6fc7519544ab369445fb87d3864b0eb1565b5c Mon Sep 17 00:00:00 2001
From: Serina Tan <serinatan@cs.toronto.edu>
Date: Thu, 8 Aug 2019 15:14:23 -0400
Subject: [PATCH 1/2] Bug fix: over-counting completed instructions for vector
 loads

---
 src/gpgpu-sim/shader.cc | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc
index 007ad4234..011091093 100644
--- a/src/gpgpu-sim/shader.cc
+++ b/src/gpgpu-sim/shader.cc
@@ -1696,6 +1696,7 @@ void ldst_unit::L1_latency_queue_cycle()
 			   assert( !read_sent );
 			   l1_latency_queue[0] = NULL;
 			   if ( mf_next->get_inst().is_load() ) {
+				   bool insn_completed = false;
 				   for ( unsigned r=0; r < MAX_OUTPUT_VALUES; r++)
 					   if (mf_next->get_inst().out[r] > 0)
 					   {
@@ -1705,9 +1706,12 @@ void ldst_unit::L1_latency_queue_cycle()
 						   {
 							m_pending_writes[mf_next->get_inst().warp_id()].erase(mf_next->get_inst().out[r]);
 							m_scoreboard->releaseRegister(mf_next->get_inst().warp_id(),mf_next->get_inst().out[r]);
-							m_core->warp_inst_complete(mf_next->get_inst());
+							insn_completed = true;
 						   }
 					   }
+
+				   if (insn_completed)
+					   m_core->warp_inst_complete(mf_next->get_inst());
 			   }
 
 			   //For write hit in WB policy

From 79dd57a59865ee44eab621dbc0246ccbd6b84447 Mon Sep 17 00:00:00 2001
From: Serina Tan <serinatan@cs.toronto.edu>
Date: Thu, 22 Aug 2019 19:36:52 -0400
Subject: [PATCH 2/2] Bug fix: cta id should be incremented in func sim whether
 or not checkpoint is enabled

---
 src/cuda-sim/cuda-sim.cc | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/src/cuda-sim/cuda-sim.cc b/src/cuda-sim/cuda-sim.cc
index f7bb9ccf8..4e8bb4433 100644
--- a/src/cuda-sim/cuda-sim.cc
+++ b/src/cuda-sim/cuda-sim.cc
@@ -2180,9 +2180,8 @@ void gpgpu_cuda_ptx_sim_main_func( kernel_info_t &kernel, bool openCL )
 
     //we excute the kernel one CTA (Block) at the time, as synchronization functions work block wise
     while(!kernel.no_more_ctas_to_run()){
-        unsigned temp=kernel.get_next_cta_id_single();
+        unsigned cta_id=kernel.get_next_cta_id_single();
         
-
         if(cp_op==0 || (cp_op==1 && cta_launched<cp_cta_resume && kernel.get_uid()==cp_kernel) || kernel.get_uid()< cp_kernel) // just fro testing
         {
            functionalCoreSim cta(
@@ -2190,17 +2189,15 @@ void gpgpu_cuda_ptx_sim_main_func( kernel_info_t &kernel, bool openCL )
                g_the_gpu,
                g_the_gpu->getShaderCoreConfig()->warp_size
            );
-           cta.execute(cp_count,temp);
+           cta.execute(cp_count,cta_id);
 
             #if (CUDART_VERSION >= 5000)
             	launch_all_device_kernels();
             #endif
-         }
-         else
-         {
-            kernel.increment_cta_id();
-         }
-    cta_launched++;
+        }
+
+        kernel.increment_cta_id();
+        cta_launched++;
     }