lpc: initial version of likely convergence SIMT Stack #273

Open · wants to merge 5 commits into base: master
1 change: 1 addition & 0 deletions README_Likely_Convergence.md
@@ -0,0 +1 @@
This is an implementation of the likely-convergence SIMT stack by Zhaosiying12138.
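For context: a baseline PDOM SIMT stack only reconverges divergent threads at the branch's immediate post-dominator, so threads that leave a loop early (for example via the break in the test kernel in this PR) sit idle until every other thread reaches that point. This patch additionally records a "likely" reconvergence PC per stack entry so that paths which happen to hit it are merged back early. Below is a minimal conceptual sketch, not the simulator code itself; the types and names are simplified and only the merge check mirrors the patch.

#include <cstdint>
#include <vector>

struct StackEntrySketch {
  uint32_t pc;
  uint32_t rpc;          // guaranteed reconvergence PC (immediate post-dominator)
  uint32_t likely_rpc;   // profiled likely reconvergence PC, or -1u if none
  uint32_t lpos;         // index of the stack entry that waits at likely_rpc
  uint64_t active_mask;  // one bit per thread in the warp
};

// Called once per divergent path after a branch executes.
void merge_or_push(std::vector<StackEntrySketch> &stack,
                   const StackEntrySketch &top,
                   uint32_t next_pc, uint64_t path_mask) {
  if (next_pc == top.rpc) {
    return;                                    // reached the IPDOM: fully reconverged
  }
  if (next_pc == top.likely_rpc) {
    stack[top.lpos].active_mask |= path_mask;  // early merge at the likely point
    return;
  }
  stack.push_back({next_pc, top.rpc, top.likely_rpc, top.lpos, path_mask});
}

In the patch itself these fields appear as m_likely_rpc and m_lpos in simt_stack_entry, and the merge check lives in simt_stack::update(); see the diffs below.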
100 changes: 100 additions & 0 deletions configs/tested-cfgs/SM2_GTX480/a.1.sm_30.ptx
@@ -0,0 +1,100 @@








.version 6.1
.target sm_30
.address_size 64



.visible .entry _Z9vectorAddPiS_(
.param .u64 _Z9vectorAddPiS__param_0,
.param .u64 _Z9vectorAddPiS__param_1
)
{
.reg .pred %p<7>;
.reg .b32 %r<18>;
.reg .b64 %rd<18>;


ld.param.u64 %rd1, [_Z9vectorAddPiS__param_0];
ld.param.u64 %rd2, [_Z9vectorAddPiS__param_1];
mov.u32 %r5, 0;
mov.u32 %r1, %tid.x;
mov.u32 %r17, %r5;

BB0_1:  // loop header: while (i < K)
mov.u32 %r2, %r17;
setp.lt.s32 %p1, %r2, 32;
not.pred %p2, %p1;
@%p2 bra BB0_8;
bra.uni BB0_2;

BB0_2:  // X = data[idx]; test X % (2 + i) == 0
cvt.s64.s32 %rd3, %r1;
shl.b64 %rd4, %rd3, 2;
add.s64 %rd5, %rd1, %rd4;
ld.u32 %r3, [%rd5];
add.s32 %r6, %r2, 2;
rem.s32 %r7, %r3, %r6;
setp.eq.s32 %p3, %r7, 0;
not.pred %p4, %p3;
@%p4 bra BB0_4;
bra.uni BB0_3;

BB0_3:  // result[idx] += X
cvt.s64.s32 %rd12, %r1;
shl.b64 %rd13, %rd12, 2;
add.s64 %rd14, %rd2, %rd13;
ld.u32 %r13, [%rd14];
add.s32 %r14, %r13, %r3;
st.u32 [%rd14], %r14;
bra.uni BB0_7;

BB0_4:  // test X == 31
setp.eq.s32 %p5, %r3, 31;
not.pred %p6, %p5;
@%p6 bra BB0_6;
bra.uni BB0_5;

BB0_5:  // result[idx] += 2 * X; break out of the loop
mul.lo.s32 %r8, %r3, 2;
cvt.s64.s32 %rd6, %r1;
shl.b64 %rd7, %rd6, 2;
add.s64 %rd8, %rd2, %rd7;
ld.u32 %r9, [%rd8];
add.s32 %r10, %r9, %r8;
st.u32 [%rd8], %r10;
bra.uni BB0_9;

BB0_6:  // (empty) fall through to the loop latch

BB0_7:  // i++; data[idx]++; back to the loop header
add.s32 %r4, %r2, 1;
cvt.s64.s32 %rd15, %r1;
shl.b64 %rd16, %rd15, 2;
add.s64 %rd17, %rd1, %rd16;
ld.u32 %r15, [%rd17];
add.s32 %r16, %r15, 1;
st.u32 [%rd17], %r16;
mov.u32 %r17, %r4;
bra.uni BB0_1;

BB0_8:  // (empty) normal loop exit

BB0_9:  // result[idx] *= 2; return
cvt.s64.s32 %rd9, %r1;
shl.b64 %rd10, %rd9, 2;
add.s64 %rd11, %rd2, %rd10;
ld.u32 %r11, [%rd11];
mul.lo.s32 %r12, %r11, 2;
st.u32 [%rd11], %r12;
ret;
}


5 changes: 5 additions & 0 deletions configs/tested-cfgs/SM2_GTX480/a.1.sm_30.ptxas
@@ -0,0 +1,5 @@
ptxas info : 0 bytes gmem
ptxas info : Compiling entry function '_Z9vectorAddPiS_' for 'sm_30'
ptxas info : Function properties for _Z9vectorAddPiS_
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 11 registers, 336 bytes cmem[0]
2,155 changes: 2,155 additions & 0 deletions configs/tested-cfgs/SM2_GTX480/baseline_pdom_1.log

Large diffs are not rendered by default.

3,745 changes: 3,745 additions & 0 deletions configs/tested-cfgs/SM2_GTX480/baseline_pdom_2.log

Large diffs are not rendered by default.

155,119 changes: 155,119 additions & 0 deletions configs/tested-cfgs/SM2_GTX480/baseline_pdom_32.log

Large diffs are not rendered by default.

1,978 changes: 1,978 additions & 0 deletions configs/tested-cfgs/SM2_GTX480/lc_pdom_1.log

Large diffs are not rendered by default.

2,614 changes: 2,614 additions & 0 deletions configs/tested-cfgs/SM2_GTX480/lc_pdom_2.log

Large diffs are not rendered by default.

21,549 changes: 21,549 additions & 0 deletions configs/tested-cfgs/SM2_GTX480/lc_pdom_32.log

Large diffs are not rendered by default.

122 changes: 122 additions & 0 deletions configs/tested-cfgs/SM2_GTX480/test_LCPDOM.cu
@@ -0,0 +1,122 @@
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include <stdio.h>

// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>

/**
* CUDA Kernel Device code
*/
__global__ void vectorAdd(int *data, int *result) {

int idx = threadIdx.x;
int i = 0, K = 32;
while (i < K) {
int X = data[idx];
if (X % (2 + i) == 0) {
result[idx] += X;
} else if (X == 31) {
result[idx] += 2 * X;
break;
}
i++;
data[idx]++;
}
result[idx] *= 2;
}

/**
* Host main routine
*/
int main(void) {
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;

// Print the vector length to be used, and compute its size
int numElements = 32;
size_t size = numElements * sizeof(int);
printf("[Vector addition of %d elements]\n", numElements);

int *h_data = (int *)malloc(size);
int *h_result = (int *)malloc(size);

// Initialize the host input vectors
for (int i = 0; i < numElements; ++i) {
h_data[i] = i;
}

int *d_data = NULL;
err = cudaMalloc((void **)&d_data, size);
int *d_result = NULL;
err = cudaMalloc((void **)&d_result, size);

printf("Copy input data from the host memory to the CUDA device\n");
err = cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice);

// Launch the Vector Add CUDA Kernel
int threadsPerBlock = 32;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid,
threadsPerBlock);
vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_data, d_result);
err = cudaGetLastError();

if (err != cudaSuccess) {
fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n",
cudaGetErrorString(err));
exit(EXIT_FAILURE);
}

// Check Result
printf("[ZSY_APP] Check Result\n");
cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost);
cudaMemcpy(h_result, d_result, size, cudaMemcpyDeviceToHost);
printf("[ZSY_APP] data: ");
for (int i = 0; i < numElements; ++i) {
printf("%d ", h_data[i]);
}
printf("\n");

printf("[ZSY_APP] result: ");
for (int i = 0; i < numElements; ++i) {
printf("%d ", h_result[i]);
}
printf("\n");

// Free device global memory
err = cudaFree(d_data);
err = cudaFree(d_result);

// Free host memory
free(h_data);
free(h_result);

printf("Done\n");
return 0;
}
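For reference, the expected device output can be reproduced on the host by mirroring the per-thread loop. The helper below is not part of the PR, and it assumes result[] starts at zero, which the test relies on even though d_result is only cudaMalloc'ed and never cleared.

// Hypothetical host-side reference for vectorAdd(); one iteration of the outer
// loop corresponds to one thread of the 32-thread block.
static void vectorAddReference(const int *init_data, int *ref_result, int n) {
  for (int idx = 0; idx < n; ++idx) {
    int d = init_data[idx];   // data[idx] before the kernel ran (here: idx)
    int r = 0;                // assumes result[idx] started at 0
    for (int i = 0; i < 32; ++i) {
      int X = d;
      if (X % (2 + i) == 0) {
        r += X;
      } else if (X == 31) {
        r += 2 * X;
        break;                // mirrors the break in the kernel
      }
      ++d;                    // data[idx]++ at the end of each iteration
    }
    ref_result[idx] = 2 * r;  // final result[idx] *= 2
  }
}

Comparing ref_result against h_result after the device-to-host copies gives a quick pass/fail check that is independent of the SIMT-stack changes.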
51 changes: 47 additions & 4 deletions src/abstract_hardware_model.cc
@@ -999,14 +999,24 @@ void simt_stack::print(FILE *fout) const {
fprintf(fout, "%c", (stack_entry.m_active_mask.test(j) ? '1' : '0'));
fprintf(fout, " pc: 0x%03x", stack_entry.m_pc);
if (stack_entry.m_recvg_pc == (unsigned)-1) {
fprintf(fout, " rp: ---- tp: %s cd: %2u ",
fprintf(fout, " rpc: ---- tp: %s cd: %2u ",
(stack_entry.m_type == STACK_ENTRY_TYPE_CALL ? "C" : "N"),
stack_entry.m_calldepth);
} else {
fprintf(fout, " rp: %4u tp: %s cd: %2u ", stack_entry.m_recvg_pc,
fprintf(fout, " rpc: 0x%03x tp: %s cd: %2u ", stack_entry.m_recvg_pc,
(stack_entry.m_type == STACK_ENTRY_TYPE_CALL ? "C" : "N"),
stack_entry.m_calldepth);
}
if (stack_entry.m_likely_rpc == (unsigned)-1) {
fprintf(fout, " lpc: ----- ");
} else {
fprintf(fout, " lpc: 0x%03x ", stack_entry.m_likely_rpc);
}
if (stack_entry.m_lpos == (unsigned)-1) {
fprintf(fout, " lpos: -- ");
} else {
fprintf(fout, " lpos: %2u ", stack_entry.m_lpos);
}
if (stack_entry.m_branch_div_cycle != 0) {
fprintf(fout, " bd@%6u ", (unsigned)stack_entry.m_branch_div_cycle);
} else {
@@ -1039,6 +1049,8 @@ void simt_stack::update(simt_mask_t &thread_done, addr_vector_t &next_pc,

simt_mask_t top_active_mask = m_stack.back().m_active_mask;
address_type top_recvg_pc = m_stack.back().m_recvg_pc;
address_type top_likely_rpc = m_stack.back().m_likely_rpc;
unsigned int top_lpos = m_stack.back().m_lpos;
address_type top_pc =
m_stack.back().m_pc; // the pc of the instruction just executed
stack_entry_type top_type = m_stack.back().m_type;
@@ -1048,6 +1060,8 @@ void simt_stack::update(simt_mask_t &thread_done, addr_vector_t &next_pc,
const address_type null_pc = -1;
bool warp_diverged = false;
address_type new_recvg_pc = null_pc;
address_type new_likely_rpc = null_pc;
unsigned int new_lpos = -1;
unsigned num_divergent_paths = 0;

std::map<address_type, simt_mask_t> divergent_paths;
@@ -1133,8 +1147,13 @@ void simt_stack::update(simt_mask_t &thread_done, addr_vector_t &next_pc,
// discard the new entry if its PC matches with reconvergence PC
// that automatically reconverges the entry
// If the top stack entry is CALL, dont reconverge.
if (tmp_next_pc == top_recvg_pc && (top_type != STACK_ENTRY_TYPE_CALL))
if (tmp_next_pc == top_recvg_pc && (top_type != STACK_ENTRY_TYPE_CALL)) {
continue;
} else if (tmp_next_pc == top_likely_rpc && (top_type != STACK_ENTRY_TYPE_CALL)) {
m_stack[top_lpos].m_active_mask |= tmp_active_mask;
printf("[ZSY_LPC] Reached LPC: merging path into entry at lpc = 0x%x\n", m_stack[top_lpos].m_pc);
continue;
}

// this new entry is not converging
// if this entry does not include thread from the warp, divergence occurs
@@ -1150,19 +1169,41 @@ void simt_stack::update(simt_mask_t &thread_done, addr_vector_t &next_pc,

m_stack.push_back(simt_stack_entry());
}
// the likely convergence pc of branch 0x90 is 0x138 according to profiling.
if (top_pc == 0x90) {
// suppose new_likely_rpc is found from a map generated by profiling, e.g.
// new_likely_rpc = find(profile_likerpc_map, 0x90);
new_likely_rpc = 0x138;
new_lpos = m_stack.size() - 1;
m_stack.back().m_pc = new_likely_rpc;
m_stack.back().m_recvg_pc = new_recvg_pc;
m_stack.back().m_active_mask.reset();
m_stack.back().m_branch_div_cycle =
m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle;

m_stack.push_back(simt_stack_entry());
}
}

// discard the new entry if its PC matches with reconvergence PC
if (warp_diverged && tmp_next_pc == new_recvg_pc) continue;
if (warp_diverged && tmp_next_pc == new_recvg_pc) {
// TODO: This may be buggy; maybe I should pop twice.
assert(top_pc != 0x90);
continue;
}

// update the current top of pdom stack
m_stack.back().m_pc = tmp_next_pc;
m_stack.back().m_active_mask = tmp_active_mask;
if (warp_diverged) {
m_stack.back().m_calldepth = 0;
m_stack.back().m_recvg_pc = new_recvg_pc;
m_stack.back().m_likely_rpc = new_likely_rpc;
m_stack.back().m_lpos = new_lpos;
} else {
m_stack.back().m_recvg_pc = top_recvg_pc;
m_stack.back().m_likely_rpc = top_likely_rpc;
m_stack.back().m_lpos = top_lpos;
}

m_stack.push_back(simt_stack_entry());
@@ -1173,6 +1214,8 @@ void simt_stack::update(simt_mask_t &thread_done, addr_vector_t &next_pc,
if (warp_diverged) {
m_gpu->gpgpu_ctx->stats->ptx_file_line_stats_add_warp_divergence(top_pc, 1);
}
printf("[ZSY]\n");
GPGPU_Context()->the_gpgpusim->g_the_gpu->dump_pipeline((0x40|0x4|0x1), 0, 0);
}

void core_t::execute_warp_inst_t(warp_inst_t &inst, unsigned warpId) {
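The update() hunks above hard-code the branch PC (0x90) and its likely reconvergence PC (0x138) for this one test kernel, and the in-code comment suggests the pair should eventually come from profiling. One possible shape for that lookup, as a sketch only: the map, its contents, and the helper are assumptions, not code in this PR.

#include <map>

typedef unsigned long long lpc_addr_t;  // stand-in for the simulator's address type

// Filled by an offline or prior-run profiling pass: branch PC -> likely RPC.
static std::map<lpc_addr_t, lpc_addr_t> profile_likely_rpc_map = {
    {0x90, 0x138},
};

static lpc_addr_t lookup_likely_rpc(lpc_addr_t branch_pc) {
  std::map<lpc_addr_t, lpc_addr_t>::const_iterator it =
      profile_likely_rpc_map.find(branch_pc);
  return (it == profile_likely_rpc_map.end()) ? (lpc_addr_t)-1 : it->second;
}

A call such as new_likely_rpc = lookup_likely_rpc(top_pc); would then replace the hard-coded test, and returning -1 for unprofiled branches keeps the existing null_pc handling intact.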
7 changes: 5 additions & 2 deletions src/abstract_hardware_model.h
@@ -158,7 +158,6 @@ enum _memory_op_t { no_memory_op = 0, memory_load, memory_store };
#include <stdlib.h>
#include <algorithm>
#include <bitset>
#include <deque>
#include <list>
#include <map>
#include <vector>
@@ -425,18 +424,22 @@ class simt_stack {
unsigned int m_calldepth;
simt_mask_t m_active_mask;
address_type m_recvg_pc;
address_type m_likely_rpc;
unsigned int m_lpos;
unsigned long long m_branch_div_cycle;
stack_entry_type m_type;
simt_stack_entry()
: m_pc(-1),
m_calldepth(0),
m_active_mask(),
m_recvg_pc(-1),
m_likely_rpc(-1),
m_lpos(-1),
m_branch_div_cycle(0),
m_type(STACK_ENTRY_TYPE_NORMAL){};
};

std::deque<simt_stack_entry> m_stack;
std::vector<simt_stack_entry> m_stack;

class gpgpu_sim *m_gpu;
};
2 changes: 2 additions & 0 deletions src/gpgpu-sim/gpu-sim.cc
@@ -2027,6 +2027,7 @@ void gpgpu_sim::dump_pipeline(int mask, int s, int m) const {
break;
}
}
#if 0
if (mask & 0x10000) {
for (unsigned i = 0; i < m_memory_config->m_n_mem; i++) {
if (m != -1) {
@@ -2041,6 +2042,7 @@
}
}
}
#endif
fflush(stdout);
}
