lpc: initial version of likely convergence SIMT Stack #273

Open · wants to merge 5 commits into base: master
1 change: 1 addition & 0 deletions README_Likely_Convergence.md
@@ -0,0 +1 @@
This is an implementation of the likely-convergence SIMT stack by Zhaosiying12138.
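For context: a baseline PDOM SIMT stack only reconverges divergent threads at the branch's immediate post-dominator, so threads that leave a loop early (for example via the break in the test kernel in this PR) sit idle until every other thread reaches that point. This patch additionally records a "likely" reconvergence PC per stack entry so that paths which happen to hit it are merged back early. Below is a minimal conceptual sketch, not the simulator code itself; the types and names are simplified and only the merge check mirrors the patch.

#include <cstdint>
#include <vector>

struct StackEntrySketch {
  uint32_t pc;
  uint32_t rpc;          // guaranteed reconvergence PC (immediate post-dominator)
  uint32_t likely_rpc;   // profiled likely reconvergence PC, or -1u if none
  uint32_t lpos;         // index of the stack entry that waits at likely_rpc
  uint64_t active_mask;  // one bit per thread in the warp
};

// Called once per divergent path after a branch executes.
void merge_or_push(std::vector<StackEntrySketch> &stack,
                   const StackEntrySketch &top,
                   uint32_t next_pc, uint64_t path_mask) {
  if (next_pc == top.rpc) {
    return;                                    // reached the IPDOM: fully reconverged
  }
  if (next_pc == top.likely_rpc) {
    stack[top.lpos].active_mask |= path_mask;  // early merge at the likely point
    return;
  }
  stack.push_back({next_pc, top.rpc, top.likely_rpc, top.lpos, path_mask});
}

In the patch itself these fields appear as m_likely_rpc and m_lpos in simt_stack_entry, and the merge check lives in simt_stack::update(); see the diffs below.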
100 changes: 100 additions & 0 deletions configs/tested-cfgs/SM2_GTX480/a.1.sm_30.ptx
@@ -0,0 +1,100 @@








.version 6.1
.target sm_30
.address_size 64



.visible .entry _Z9vectorAddPiS_(
.param .u64 _Z9vectorAddPiS__param_0,
.param .u64 _Z9vectorAddPiS__param_1
)
{
.reg .pred %p<7>;
.reg .b32 %r<18>;
.reg .b64 %rd<18>;


ld.param.u64 %rd1, [_Z9vectorAddPiS__param_0];
ld.param.u64 %rd2, [_Z9vectorAddPiS__param_1];
mov.u32 %r5, 0;
mov.u32 %r1, %tid.x;
mov.u32 %r17, %r5;

BB0_1:  // loop header: while (i < K)
mov.u32 %r2, %r17;
setp.lt.s32 %p1, %r2, 32;
not.pred %p2, %p1;
@%p2 bra BB0_8;
bra.uni BB0_2;

BB0_2:  // X = data[idx]; test X % (2 + i) == 0
cvt.s64.s32 %rd3, %r1;
shl.b64 %rd4, %rd3, 2;
add.s64 %rd5, %rd1, %rd4;
ld.u32 %r3, [%rd5];
add.s32 %r6, %r2, 2;
rem.s32 %r7, %r3, %r6;
setp.eq.s32 %p3, %r7, 0;
not.pred %p4, %p3;
@%p4 bra BB0_4;
bra.uni BB0_3;

BB0_3:  // result[idx] += X
cvt.s64.s32 %rd12, %r1;
shl.b64 %rd13, %rd12, 2;
add.s64 %rd14, %rd2, %rd13;
ld.u32 %r13, [%rd14];
add.s32 %r14, %r13, %r3;
st.u32 [%rd14], %r14;
bra.uni BB0_7;

BB0_4:  // test X == 31
setp.eq.s32 %p5, %r3, 31;
not.pred %p6, %p5;
@%p6 bra BB0_6;
bra.uni BB0_5;

BB0_5:  // result[idx] += 2 * X; break out of the loop
mul.lo.s32 %r8, %r3, 2;
cvt.s64.s32 %rd6, %r1;
shl.b64 %rd7, %rd6, 2;
add.s64 %rd8, %rd2, %rd7;
ld.u32 %r9, [%rd8];
add.s32 %r10, %r9, %r8;
st.u32 [%rd8], %r10;
bra.uni BB0_9;

BB0_6:  // (empty) fall through to the loop latch

BB0_7:  // i++; data[idx]++; back to the loop header
add.s32 %r4, %r2, 1;
cvt.s64.s32 %rd15, %r1;
shl.b64 %rd16, %rd15, 2;
add.s64 %rd17, %rd1, %rd16;
ld.u32 %r15, [%rd17];
add.s32 %r16, %r15, 1;
st.u32 [%rd17], %r16;
mov.u32 %r17, %r4;
bra.uni BB0_1;

BB0_8:  // (empty) normal loop exit

BB0_9:  // result[idx] *= 2; return
cvt.s64.s32 %rd9, %r1;
shl.b64 %rd10, %rd9, 2;
add.s64 %rd11, %rd2, %rd10;
ld.u32 %r11, [%rd11];
mul.lo.s32 %r12, %r11, 2;
st.u32 [%rd11], %r12;
ret;
}


5 changes: 5 additions & 0 deletions configs/tested-cfgs/SM2_GTX480/a.1.sm_30.ptxas
@@ -0,0 +1,5 @@
ptxas info : 0 bytes gmem
ptxas info : Compiling entry function '_Z9vectorAddPiS_' for 'sm_30'
ptxas info : Function properties for _Z9vectorAddPiS_
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 11 registers, 336 bytes cmem[0]
2,155 changes: 2,155 additions & 0 deletions configs/tested-cfgs/SM2_GTX480/baseline_pdom_1.log

Large diffs are not rendered by default.

3,745 changes: 3,745 additions & 0 deletions configs/tested-cfgs/SM2_GTX480/baseline_pdom_2.log

Large diffs are not rendered by default.

155,119 changes: 155,119 additions & 0 deletions configs/tested-cfgs/SM2_GTX480/baseline_pdom_32.log

Large diffs are not rendered by default.

1,978 changes: 1,978 additions & 0 deletions configs/tested-cfgs/SM2_GTX480/lc_pdom_1.log

Large diffs are not rendered by default.

2,614 changes: 2,614 additions & 0 deletions configs/tested-cfgs/SM2_GTX480/lc_pdom_2.log

Large diffs are not rendered by default.

21,549 changes: 21,549 additions & 0 deletions configs/tested-cfgs/SM2_GTX480/lc_pdom_32.log

Large diffs are not rendered by default.

122 changes: 122 additions & 0 deletions configs/tested-cfgs/SM2_GTX480/test_LCPDOM.cu
@@ -0,0 +1,122 @@
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include <stdio.h>

// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>

/**
* CUDA Kernel Device code
*/
__global__ void vectorAdd(int *data, int *result) {

int idx = threadIdx.x;
int i = 0, K = 32;
while (i < K) {
int X = data[idx];
if (X % (2 + i) == 0) {
result[idx] += X;
} else if (X == 31) {
result[idx] += 2 * X;
break;
}
i++;
data[idx]++;
}
result[idx] *= 2;
}

/**
* Host main routine
*/
int main(void) {
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;

// Print the vector length to be used, and compute its size
int numElements = 32;
size_t size = numElements * sizeof(int);
printf("[Vector addition of %d elements]\n", numElements);

int *h_data = (int *)malloc(size);
int *h_result = (int *)malloc(size);

// Initialize the host input vectors
for (int i = 0; i < numElements; ++i) {
h_data[i] = i;
}

int *d_data = NULL;
err = cudaMalloc((void **)&d_data, size);
int *d_result = NULL;
err = cudaMalloc((void **)&d_result, size);

printf("Copy input data from the host memory to the CUDA device\n");
err = cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice);

// Launch the Vector Add CUDA Kernel
int threadsPerBlock = 32;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid,
threadsPerBlock);
vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_data, d_result);
err = cudaGetLastError();

if (err != cudaSuccess) {
fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n",
cudaGetErrorString(err));
exit(EXIT_FAILURE);
}

// Check Result
printf("[ZSY_APP] Check Result\n");
cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost);
cudaMemcpy(h_result, d_result, size, cudaMemcpyDeviceToHost);
printf("[ZSY_APP] data: ");
for (int i = 0; i < numElements; ++i) {
printf("%d ", h_data[i]);
}
printf("\n");

printf("[ZSY_APP] result: ");
for (int i = 0; i < numElements; ++i) {
printf("%d ", h_result[i]);
}
printf("\n");

// Free device global memory
err = cudaFree(d_data);
err = cudaFree(d_result);

// Free host memory
free(h_data);
free(h_result);

printf("Done\n");
return 0;
}
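For reference, the expected device output can be reproduced on the host by mirroring the per-thread loop. The helper below is not part of the PR, and it assumes result[] starts at zero, which the test relies on even though d_result is only cudaMalloc'ed and never cleared.

// Hypothetical host-side reference for vectorAdd(); one iteration of the outer
// loop corresponds to one thread of the 32-thread block.
static void vectorAddReference(const int *init_data, int *ref_result, int n) {
  for (int idx = 0; idx < n; ++idx) {
    int d = init_data[idx];   // data[idx] before the kernel ran (here: idx)
    int r = 0;                // assumes result[idx] started at 0
    for (int i = 0; i < 32; ++i) {
      int X = d;
      if (X % (2 + i) == 0) {
        r += X;
      } else if (X == 31) {
        r += 2 * X;
        break;                // mirrors the break in the kernel
      }
      ++d;                    // data[idx]++ at the end of each iteration
    }
    ref_result[idx] = 2 * r;  // final result[idx] *= 2
  }
}

Comparing ref_result against h_result after the device-to-host copies gives a quick pass/fail check that is independent of the SIMT-stack changes.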
51 changes: 47 additions & 4 deletions src/abstract_hardware_model.cc
@@ -999,14 +999,24 @@ void simt_stack::print(FILE *fout) const {
fprintf(fout, "%c", (stack_entry.m_active_mask.test(j) ? '1' : '0'));
fprintf(fout, " pc: 0x%03x", stack_entry.m_pc);
if (stack_entry.m_recvg_pc == (unsigned)-1) {
fprintf(fout, " rp: ---- tp: %s cd: %2u ",
fprintf(fout, " rpc: ---- tp: %s cd: %2u ",
(stack_entry.m_type == STACK_ENTRY_TYPE_CALL ? "C" : "N"),
stack_entry.m_calldepth);
} else {
fprintf(fout, " rp: %4u tp: %s cd: %2u ", stack_entry.m_recvg_pc,
fprintf(fout, " rpc: 0x%03x tp: %s cd: %2u ", stack_entry.m_recvg_pc,
(stack_entry.m_type == STACK_ENTRY_TYPE_CALL ? "C" : "N"),
stack_entry.m_calldepth);
}
if (stack_entry.m_likely_rpc == (unsigned)-1) {
fprintf(fout, " lpc: ----- ");
} else {
fprintf(fout, " lpc: 0x%03x ", stack_entry.m_likely_rpc);
}
if (stack_entry.m_lpos == (unsigned)-1) {
fprintf(fout, " lpos: -- ");
} else {
fprintf(fout, " lpos: %2u ", stack_entry.m_lpos);
}
if (stack_entry.m_branch_div_cycle != 0) {
fprintf(fout, " bd@%6u ", (unsigned)stack_entry.m_branch_div_cycle);
} else {
@@ -1039,6 +1049,8 @@ void simt_stack::update(simt_mask_t &thread_done, addr_vector_t &next_pc,

simt_mask_t top_active_mask = m_stack.back().m_active_mask;
address_type top_recvg_pc = m_stack.back().m_recvg_pc;
address_type top_likely_rpc = m_stack.back().m_likely_rpc;
unsigned int top_lpos = m_stack.back().m_lpos;
address_type top_pc =
m_stack.back().m_pc; // the pc of the instruction just executed
stack_entry_type top_type = m_stack.back().m_type;
@@ -1048,6 +1060,8 @@ void simt_stack::update(simt_mask_t &thread_done, addr_vector_t &next_pc,
const address_type null_pc = -1;
bool warp_diverged = false;
address_type new_recvg_pc = null_pc;
address_type new_likely_rpc = null_pc;
unsigned int new_lpos = -1;
unsigned num_divergent_paths = 0;

std::map<address_type, simt_mask_t> divergent_paths;
@@ -1133,8 +1147,13 @@ void simt_stack::update(simt_mask_t &thread_done, addr_vector_t &next_pc,
// discard the new entry if its PC matches with reconvergence PC
// that automatically reconverges the entry
// If the top stack entry is CALL, dont reconverge.
if (tmp_next_pc == top_recvg_pc && (top_type != STACK_ENTRY_TYPE_CALL))
if (tmp_next_pc == top_recvg_pc && (top_type != STACK_ENTRY_TYPE_CALL)) {
continue;
} else if (tmp_next_pc == top_likely_rpc && (top_type != STACK_ENTRY_TYPE_CALL)) {
m_stack[top_lpos].m_active_mask |= tmp_active_mask;
printf("[ZSY_LPC] Reached LPC: merging path into entry at lpc = 0x%x\n", m_stack[top_lpos].m_pc);
continue;
}

// this new entry is not converging
// if this entry does not include thread from the warp, divergence occurs
@@ -1150,19 +1169,41 @@ void simt_stack::update(simt_mask_t &thread_done, addr_vector_t &next_pc,

m_stack.push_back(simt_stack_entry());
}
// the likely convergence pc of branch 0x90 is 0x138 according to profiling.
if (top_pc == 0x90) {
// suppose new_likely_rpc is found from a map generated by profiling, e.g.
// new_likely_rpc = find(profile_likerpc_map, 0x90);
new_likely_rpc = 0x138;
new_lpos = m_stack.size() - 1;
m_stack.back().m_pc = new_likely_rpc;
m_stack.back().m_recvg_pc = new_recvg_pc;
m_stack.back().m_active_mask.reset();
m_stack.back().m_branch_div_cycle =
m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle;

m_stack.push_back(simt_stack_entry());
}
}

// discard the new entry if its PC matches with reconvergence PC
if (warp_diverged && tmp_next_pc == new_recvg_pc) continue;
if (warp_diverged && tmp_next_pc == new_recvg_pc) {
// TODO: This may be buggy; maybe I should pop twice.
assert(top_pc != 0x90);
continue;
}

// update the current top of pdom stack
m_stack.back().m_pc = tmp_next_pc;
m_stack.back().m_active_mask = tmp_active_mask;
if (warp_diverged) {
m_stack.back().m_calldepth = 0;
m_stack.back().m_recvg_pc = new_recvg_pc;
m_stack.back().m_likely_rpc = new_likely_rpc;
m_stack.back().m_lpos = new_lpos;
} else {
m_stack.back().m_recvg_pc = top_recvg_pc;
m_stack.back().m_likely_rpc = top_likely_rpc;
m_stack.back().m_lpos = top_lpos;
}

m_stack.push_back(simt_stack_entry());
@@ -1173,6 +1214,8 @@ void simt_stack::update(simt_mask_t &thread_done, addr_vector_t &next_pc,
if (warp_diverged) {
m_gpu->gpgpu_ctx->stats->ptx_file_line_stats_add_warp_divergence(top_pc, 1);
}
printf("[ZSY]\n");
GPGPU_Context()->the_gpgpusim->g_the_gpu->dump_pipeline((0x40|0x4|0x1), 0, 0);
}

void core_t::execute_warp_inst_t(warp_inst_t &inst, unsigned warpId) {
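The update() hunks above hard-code the branch PC (0x90) and its likely reconvergence PC (0x138) for this one test kernel, and the in-code comment suggests the pair should eventually come from profiling. One possible shape for that lookup, as a sketch only: the map, its contents, and the helper are assumptions, not code in this PR.

#include <map>

typedef unsigned long long lpc_addr_t;  // stand-in for the simulator's address type

// Filled by an offline or prior-run profiling pass: branch PC -> likely RPC.
static std::map<lpc_addr_t, lpc_addr_t> profile_likely_rpc_map = {
    {0x90, 0x138},
};

static lpc_addr_t lookup_likely_rpc(lpc_addr_t branch_pc) {
  std::map<lpc_addr_t, lpc_addr_t>::const_iterator it =
      profile_likely_rpc_map.find(branch_pc);
  return (it == profile_likely_rpc_map.end()) ? (lpc_addr_t)-1 : it->second;
}

A call such as new_likely_rpc = lookup_likely_rpc(top_pc); would then replace the hard-coded test, and returning -1 for unprofiled branches keeps the existing null_pc handling intact.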
7 changes: 5 additions & 2 deletions src/abstract_hardware_model.h
@@ -158,7 +158,6 @@ enum _memory_op_t { no_memory_op = 0, memory_load, memory_store };
#include <stdlib.h>
#include <algorithm>
#include <bitset>
#include <deque>
#include <list>
#include <map>
#include <vector>
@@ -425,18 +424,22 @@ class simt_stack {
unsigned int m_calldepth;
simt_mask_t m_active_mask;
address_type m_recvg_pc;
address_type m_likely_rpc;
unsigned int m_lpos;
unsigned long long m_branch_div_cycle;
stack_entry_type m_type;
simt_stack_entry()
: m_pc(-1),
m_calldepth(0),
m_active_mask(),
m_recvg_pc(-1),
m_likely_rpc(-1),
m_lpos(-1),
m_branch_div_cycle(0),
m_type(STACK_ENTRY_TYPE_NORMAL){};
};

std::deque<simt_stack_entry> m_stack;
std::vector<simt_stack_entry> m_stack;

class gpgpu_sim *m_gpu;
};
2 changes: 2 additions & 0 deletions src/gpgpu-sim/gpu-sim.cc
@@ -2027,6 +2027,7 @@ void gpgpu_sim::dump_pipeline(int mask, int s, int m) const {
break;
}
}
#if 0
if (mask & 0x10000) {
for (unsigned i = 0; i < m_memory_config->m_n_mem; i++) {
if (m != -1) {
@@ -2041,6 +2042,7 @@
}
}
}
#endif
fflush(stdout);
}
