-
Notifications
You must be signed in to change notification settings - Fork 1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[QST] Question about "global_load" code in "cutlass/arch/memory.h" #1212
Comments
I believe the CUTLASS version is for something like initialising. Check the following global loading code snippet: __device__
void global_load(uint4 &D, void const *ptr, bool pred_guard) {
uint4 &data = reinterpret_cast<uint4 &>(D);
asm volatile(
"{\n"
" .reg .pred p;\n"
" setp.ne.b32 p, %5, 0;\n"
" mov.b32 %0, %6;\n"
" mov.b32 %1, %7;\n"
" mov.b32 %2, %8;\n"
" mov.b32 %3, %9;\n"
" @p ld.global.lu.v4.u32 {%0, %1, %2, %3}, [%4];\n"
"}\n"
: "=r"(data.x), "=r"(data.y), "=r"(data.z), "=r"(data.w)
: "l"(ptr), "r"((int) pred_guard), "r"(data.x), "r"(data.y), "r"(data.z), "r"(data.w));
}
__device__
void global_load_simple(uint4 &D, void const *ptr, bool pred_guard) {
uint4 &data = reinterpret_cast<uint4 &>(D);
asm volatile(
"{\n"
" .reg .pred p;\n"
" setp.ne.b32 p, %5, 0;\n"
" @p ld.global.lu.v4.u32 {%0, %1, %2, %3}, [%4];\n"
"}\n"
: "=r"(data.x), "=r"(data.y), "=r"(data.z), "=r"(data.w)
: "l"(ptr), "r"((int) pred_guard));
}
Then check the following kernels: __global__ void
kernel(const uint4 *in, const int *__restrict__ inX1, const int *__restrict__ inY1,
const int *__restrict__ inZ1,
const int *__restrict__ inW1, const int *__restrict__ inX2, const int *__restrict__ inY2,
const int *__restrict__ inZ2,
const int *__restrict__ inW2, const bool *pred_guard,
uint4 *out) {
auto tid = threadIdx.x + blockIdx.x * blockDim.x;
uint4 D;
D.x = inX1[tid];
D.y = inY1[tid];
D.z = inZ1[tid];
D.w = inW1[tid];
global_load(D, in + tid, pred_guard[tid]);
D.x *= inX2[tid];
D.y *= inY2[tid];
D.z *= inZ2[tid];
D.w *= inW2[tid];
out[tid] = D;
}
__global__ void
kernel2(const uint4 *in, const int *__restrict__ inX1, const int *__restrict__ inY1,
const int *__restrict__ inZ1,
const int *__restrict__ inW1, const int *__restrict__ inX2, const int *__restrict__ inY2,
const int *__restrict__ inZ2,
const int *__restrict__ inW2, const bool *pred_guard,
uint4 *out) {
auto tid = threadIdx.x + blockIdx.x * blockDim.x;
uint4 D;
global_load(D, in + tid, pred_guard[tid]);
D.x *= inX2[tid];
D.y *= inY2[tid];
D.z *= inZ2[tid];
D.w *= inW2[tid];
out[tid] = D;
}
__global__ void
kernel_simple(const uint4 *in, const int *__restrict__ inX1, const int *__restrict__ inY1,
const int *__restrict__ inZ1,
const int *__restrict__ inW1, const int *__restrict__ inX2, const int *__restrict__ inY2,
const int *__restrict__ inZ2,
const int *__restrict__ inW2, const bool *pred_guard,
uint4 *out) {
auto tid = threadIdx.x + blockIdx.x * blockDim.x;
uint4 D;
D.x = inX1[tid];
D.y = inY1[tid];
D.z = inZ1[tid];
D.w = inW1[tid];
global_load_simple(D, in + tid, pred_guard[tid]);
D.x *= inX2[tid];
D.y *= inY2[tid];
D.z *= inZ2[tid];
D.w *= inW2[tid];
out[tid] = D;
}
__global__ void
kernel_simple2(const uint4 *in, const int *__restrict__ inX1, const int *__restrict__ inY1,
const int *__restrict__ inZ1,
const int *__restrict__ inW1, const int *__restrict__ inX2, const int *__restrict__ inY2,
const int *__restrict__ inZ2,
const int *__restrict__ inW2, const bool *pred_guard,
uint4 *out) {
auto tid = threadIdx.x + blockIdx.x * blockDim.x;
uint4 D;
global_load_simple(D, in + tid, pred_guard[tid]);
D.x *= inX2[tid];
D.y *= inY2[tid];
D.z *= inZ2[tid];
D.w *= inW2[tid];
out[tid] = D;
}
Check the assembly code on Fatbin elf code:
================
arch = sm_86
code version = [1,7]
host = linux
compile_size = 64bit
code for sm_86
Function : kernel_simple2
.headerflags @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM86 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM86)"
/*0000*/ MOV R1, c[0x0][0x28] ; /* 0x00000a0000017a02 */
/* 0x000fc40000000f00 */
/*0010*/ S2R R0, SR_TID.X ; /* 0x0000000000007919 */
/* 0x000e220000002100 */
/*0020*/ ULDC.64 UR4, c[0x0][0x118] ; /* 0x0000460000047ab9 */
/* 0x000fc60000000a00 */
/*0030*/ S2R R3, SR_CTAID.X ; /* 0x0000000000037919 */
/* 0x000e240000002500 */
/*0040*/ IMAD R0, R3, c[0x0][0x0], R0 ; /* 0x0000000003007a24 */
/* 0x001fca00078e0200 */
/*0050*/ IADD3 R16, P0, R0, c[0x0][0x1a8], RZ ; /* 0x00006a0000107a10 */
/* 0x000fc80007f1e0ff */
/*0060*/ IADD3.X R17, RZ, c[0x0][0x1ac], RZ, P0, !PT ; /* 0x00006b00ff117a10 */
/* 0x000fca00007fe4ff */
/*0070*/ LDG.E.S8 R16, [R16.64] ; /* 0x0000000410107981 */
/* 0x000ea2000c1e1300 */
/*0080*/ MOV R15, 0x4 ; /* 0x00000004000f7802 */
/* 0x000fe40000000f00 */
/*0090*/ MOV R21, 0x10 ; /* 0x0000001000157802 */
/* 0x000fc60000000f00 */
/*00a0*/ IMAD.WIDE.U32 R8, R0, R15, c[0x0][0x188] ; /* 0x0000620000087625 */
/* 0x000fc800078e000f */
/*00b0*/ IMAD.WIDE.U32 R10, R0.reuse, R15.reuse, c[0x0][0x190] ; /* 0x00006400000a7625 */
/* 0x0c0fe400078e000f */
/*00c0*/ LDG.E.CONSTANT R9, [R8.64] ; /* 0x0000000408097981 */
/* 0x000ee4000c1e9900 */
/*00d0*/ IMAD.WIDE.U32 R12, R0.reuse, R15, c[0x0][0x198] ; /* 0x00006600000c7625 */
/* 0x040fe400078e000f */
/*00e0*/ LDG.E.CONSTANT R10, [R10.64] ; /* 0x000000040a0a7981 */
/* 0x000f24000c1e9900 */
/*00f0*/ IMAD.WIDE.U32 R2, R0.reuse, R21, c[0x0][0x160] ; /* 0x0000580000027625 */
/* 0x040fe400078e0015 */
/*0100*/ LDG.E.CONSTANT R13, [R12.64] ; /* 0x000000040c0d7981 */
/* 0x000f64000c1e9900 */
/*0110*/ IMAD.WIDE.U32 R14, R0, R15, c[0x0][0x1a0] ; /* 0x00006800000e7625 */
/* 0x000fcc00078e000f */
/*0120*/ LDG.E.CONSTANT R14, [R14.64] ; /* 0x000000040e0e7981 */
/* 0x000ee2000c1e9900 */
/*0130*/ ISETP.NE.U32.AND P0, PT, R16, RZ, PT ; /* 0x000000ff1000720c */
/* 0x004fda0003f05070 */
/*0140*/ @P0 LDG.E.LU.128 R4, [R2.64] ; /* 0x0000000402040981 */
/* 0x000ee4000c3e1d00 */
/*0150*/ IMAD R16, R4, R9, RZ ; /* 0x0000000904107224 */
/* 0x008fe400078e02ff */
/*0160*/ IMAD R17, R5, R10, RZ ; /* 0x0000000a05117224 */
/* 0x010fe400078e02ff */
/*0170*/ IMAD R18, R6, R13, RZ ; /* 0x0000000d06127224 */
/* 0x020fe400078e02ff */
/*0180*/ IMAD R19, R7, R14, RZ ; /* 0x0000000e07137224 */
/* 0x000fe400078e02ff */
/*0190*/ IMAD.WIDE.U32 R4, R0, R21, c[0x0][0x1b0] ; /* 0x00006c0000047625 */
/* 0x000fca00078e0015 */
/*01a0*/ STG.E.128 [R4.64], R16 ; /* 0x0000001004007986 */
/* 0x000fe2000c101d04 */
/*01b0*/ EXIT ; /* 0x000000000000794d */
/* 0x000fea0003800000 */
/*01c0*/ BRA 0x1c0; /* 0xfffffff000007947 */
/* 0x000fc0000383ffff */
/*01d0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*01e0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*01f0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0200*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0210*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0220*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0230*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0240*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0250*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0260*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0270*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
..........
Function : kernel_simple
.headerflags @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM86 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM86)"
/*0000*/ MOV R1, c[0x0][0x28] ; /* 0x00000a0000017a02 */
/* 0x000fc40000000f00 */
/*0010*/ S2R R0, SR_TID.X ; /* 0x0000000000007919 */
/* 0x000e220000002100 */
/*0020*/ ULDC.64 UR4, c[0x0][0x118] ; /* 0x0000460000047ab9 */
/* 0x000fc60000000a00 */
/*0030*/ S2R R3, SR_CTAID.X ; /* 0x0000000000037919 */
/* 0x000e240000002500 */
/*0040*/ IMAD R0, R3, c[0x0][0x0], R0 ; /* 0x0000000003007a24 */
/* 0x001fca00078e0200 */
/*0050*/ IADD3 R16, P0, R0, c[0x0][0x1a8], RZ ; /* 0x00006a0000107a10 */
/* 0x000fc80007f1e0ff */
/*0060*/ IADD3.X R17, RZ, c[0x0][0x1ac], RZ, P0, !PT ; /* 0x00006b00ff117a10 */
/* 0x000fca00007fe4ff */
/*0070*/ LDG.E.S8 R16, [R16.64] ; /* 0x0000000410107981 */
/* 0x000ea2000c1e1300 */
/*0080*/ MOV R15, 0x4 ; /* 0x00000004000f7802 */
/* 0x000fe40000000f00 */
/*0090*/ MOV R21, 0x10 ; /* 0x0000001000157802 */
/* 0x000fc60000000f00 */
/*00a0*/ IMAD.WIDE.U32 R8, R0, R15, c[0x0][0x188] ; /* 0x0000620000087625 */
/* 0x000fc800078e000f */
/*00b0*/ IMAD.WIDE.U32 R10, R0.reuse, R15.reuse, c[0x0][0x190] ; /* 0x00006400000a7625 */
/* 0x0c0fe400078e000f */
/*00c0*/ LDG.E.CONSTANT R9, [R8.64] ; /* 0x0000000408097981 */
/* 0x000ee4000c1e9900 */
/*00d0*/ IMAD.WIDE.U32 R12, R0.reuse, R15, c[0x0][0x198] ; /* 0x00006600000c7625 */
/* 0x040fe400078e000f */
/*00e0*/ LDG.E.CONSTANT R10, [R10.64] ; /* 0x000000040a0a7981 */
/* 0x000f24000c1e9900 */
/*00f0*/ IMAD.WIDE.U32 R2, R0.reuse, R21, c[0x0][0x160] ; /* 0x0000580000027625 */
/* 0x040fe400078e0015 */
/*0100*/ LDG.E.CONSTANT R13, [R12.64] ; /* 0x000000040c0d7981 */
/* 0x000f64000c1e9900 */
/*0110*/ IMAD.WIDE.U32 R14, R0, R15, c[0x0][0x1a0] ; /* 0x00006800000e7625 */
/* 0x000fcc00078e000f */
/*0120*/ LDG.E.CONSTANT R14, [R14.64] ; /* 0x000000040e0e7981 */
/* 0x000ee2000c1e9900 */
/*0130*/ ISETP.NE.U32.AND P0, PT, R16, RZ, PT ; /* 0x000000ff1000720c */
/* 0x004fda0003f05070 */
/*0140*/ @P0 LDG.E.LU.128 R4, [R2.64] ; /* 0x0000000402040981 */
/* 0x000ee4000c3e1d00 */
/*0150*/ IMAD R16, R4, R9, RZ ; /* 0x0000000904107224 */
/* 0x008fe400078e02ff */
/*0160*/ IMAD R17, R5, R10, RZ ; /* 0x0000000a05117224 */
/* 0x010fe400078e02ff */
/*0170*/ IMAD R18, R6, R13, RZ ; /* 0x0000000d06127224 */
/* 0x020fe400078e02ff */
/*0180*/ IMAD R19, R7, R14, RZ ; /* 0x0000000e07137224 */
/* 0x000fe400078e02ff */
/*0190*/ IMAD.WIDE.U32 R4, R0, R21, c[0x0][0x1b0] ; /* 0x00006c0000047625 */
/* 0x000fca00078e0015 */
/*01a0*/ STG.E.128 [R4.64], R16 ; /* 0x0000001004007986 */
/* 0x000fe2000c101d04 */
/*01b0*/ EXIT ; /* 0x000000000000794d */
/* 0x000fea0003800000 */
/*01c0*/ BRA 0x1c0; /* 0xfffffff000007947 */
/* 0x000fc0000383ffff */
/*01d0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*01e0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*01f0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0200*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0210*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0220*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0230*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0240*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0250*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0260*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0270*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
..........
Function : kernel2
.headerflags @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM86 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM86)"
/*0000*/ MOV R1, c[0x0][0x28] ; /* 0x00000a0000017a02 */
/* 0x000fc40000000f00 */
/*0010*/ S2R R0, SR_TID.X ; /* 0x0000000000007919 */
/* 0x000e220000002100 */
/*0020*/ ULDC.64 UR4, c[0x0][0x118] ; /* 0x0000460000047ab9 */
/* 0x000fc60000000a00 */
/*0030*/ S2R R3, SR_CTAID.X ; /* 0x0000000000037919 */
/* 0x000e240000002500 */
/*0040*/ IMAD R0, R3, c[0x0][0x0], R0 ; /* 0x0000000003007a24 */
/* 0x001fca00078e0200 */
/*0050*/ IADD3 R8, P0, R0, c[0x0][0x1a8], RZ ; /* 0x00006a0000087a10 */
/* 0x000fc80007f1e0ff */
/*0060*/ IADD3.X R9, RZ, c[0x0][0x1ac], RZ, P0, !PT ; /* 0x00006b00ff097a10 */
/* 0x000fca00007fe4ff */
/*0070*/ LDG.E.S8 R8, [R8.64] ; /* 0x0000000408087981 */
/* 0x000ea2000c1e1300 */
/*0080*/ MOV R7, 0x4 ; /* 0x0000000400077802 */
/* 0x000fe40000000f00 */
/*0090*/ MOV R21, 0x10 ; /* 0x0000001000157802 */
/* 0x000fc60000000f00 */
/*00a0*/ IMAD.WIDE.U32 R10, R0, R7, c[0x0][0x190] ; /* 0x00006400000a7625 */
/* 0x000fc800078e0007 */
/*00b0*/ IMAD.WIDE.U32 R14, R0.reuse, R7, c[0x0][0x1a0] ; /* 0x00006800000e7625 */
/* 0x040fe400078e0007 */
/*00c0*/ LDG.E.CONSTANT R10, [R10.64] ; /* 0x000000040a0a7981 */
/* 0x000ee4000c1e9900 */
/*00d0*/ IMAD.WIDE.U32 R2, R0.reuse, R21, c[0x0][0x160] ; /* 0x0000580000027625 */
/* 0x040fe400078e0015 */
/*00e0*/ LDG.E.CONSTANT R14, [R14.64] ; /* 0x000000040e0e7981 */
/* 0x000f24000c1e9900 */
/*00f0*/ IMAD.WIDE.U32 R12, R0, R7, c[0x0][0x198] ; /* 0x00006600000c7625 */
/* 0x000fcc00078e0007 */
/*0100*/ LDG.E.CONSTANT R13, [R12.64] ; /* 0x000000040c0d7981 */
/* 0x000f62000c1e9900 */
/*0110*/ ISETP.NE.U32.AND P0, PT, R8, RZ, PT ; /* 0x000000ff0800720c */
/* 0x004fe20003f05070 */
/*0120*/ IMAD.WIDE.U32 R8, R0, R7, c[0x0][0x188] ; /* 0x0000620000087625 */
/* 0x000fcc00078e0007 */
/*0130*/ LDG.E.CONSTANT R9, [R8.64] ; /* 0x0000000408097981 */
/* 0x000eac000c1e9900 */
/*0140*/ @P0 LDG.E.LU.128 R4, [R2.64] ; /* 0x0000000402040981 */
/* 0x000ee4000c3e1d00 */
/*0150*/ IMAD R17, R10, R5, RZ ; /* 0x000000050a117224 */
/* 0x008fe400078e02ff */
/*0160*/ IMAD R16, R9, R4, RZ ; /* 0x0000000409107224 */
/* 0x004fc400078e02ff */
/*0170*/ IMAD R19, R14, R7, RZ ; /* 0x000000070e137224 */
/* 0x010fe400078e02ff */
/*0180*/ IMAD R18, R13, R6, RZ ; /* 0x000000060d127224 */
/* 0x020fe400078e02ff */
/*0190*/ IMAD.WIDE.U32 R4, R0, R21, c[0x0][0x1b0] ; /* 0x00006c0000047625 */
/* 0x000fca00078e0015 */
/*01a0*/ STG.E.128 [R4.64], R16 ; /* 0x0000001004007986 */
/* 0x000fe2000c101d04 */
/*01b0*/ EXIT ; /* 0x000000000000794d */
/* 0x000fea0003800000 */
/*01c0*/ BRA 0x1c0; /* 0xfffffff000007947 */
/* 0x000fc0000383ffff */
/*01d0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*01e0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*01f0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0200*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0210*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0220*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0230*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0240*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0250*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0260*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0270*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
..........
Function : kernel
.headerflags @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM86 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM86)"
/*0000*/ MOV R1, c[0x0][0x28] ; /* 0x00000a0000017a02 */
/* 0x000fc40000000f00 */
/*0010*/ S2R R0, SR_TID.X ; /* 0x0000000000007919 */
/* 0x000e220000002100 */
/*0020*/ ULDC.64 UR4, c[0x0][0x118] ; /* 0x0000460000047ab9 */
/* 0x000fe20000000a00 */
/*0030*/ MOV R23, 0x4 ; /* 0x0000000400177802 */
/* 0x000fe40000000f00 */
/*0040*/ S2R R3, SR_CTAID.X ; /* 0x0000000000037919 */
/* 0x000e240000002500 */
/*0050*/ IMAD R0, R3, c[0x0][0x0], R0 ; /* 0x0000000003007a24 */
/* 0x001fca00078e0200 */
/*0060*/ IADD3 R14, P0, R0, c[0x0][0x1a8], RZ ; /* 0x00006a00000e7a10 */
/* 0x000fc80007f1e0ff */
/*0070*/ IADD3.X R15, RZ, c[0x0][0x1ac], RZ, P0, !PT ; /* 0x00006b00ff0f7a10 */
/* 0x000fca00007fe4ff */
/*0080*/ LDG.E.S8 R14, [R14.64] ; /* 0x000000040e0e7981 */
/* 0x000ea2000c1e1300 */
/*0090*/ IMAD.WIDE.U32 R2, R0, R23, c[0x0][0x168] ; /* 0x00005a0000027625 */
/* 0x000fc800078e0017 */
/*00a0*/ IMAD.WIDE.U32 R8, R0.reuse, R23.reuse, c[0x0][0x170] ; /* 0x00005c0000087625 */
/* 0x0c0fe200078e0017 */
/*00b0*/ LDG.E.CONSTANT R4, [R2.64] ; /* 0x0000000402047981 */
/* 0x0000e6000c1e9900 */
/*00c0*/ IMAD.WIDE.U32 R10, R0.reuse, R23.reuse, c[0x0][0x178] ; /* 0x00005e00000a7625 */
/* 0x0c0fe200078e0017 */
/*00d0*/ LDG.E.CONSTANT R5, [R8.64] ; /* 0x0000000408057981 */
/* 0x000ee6000c1e9900 */
/*00e0*/ IMAD.WIDE.U32 R12, R0.reuse, R23.reuse, c[0x0][0x180] ; /* 0x00006000000c7625 */
/* 0x0c0fe200078e0017 */
/*00f0*/ LDG.E.CONSTANT R6, [R10.64] ; /* 0x000000040a067981 */
/* 0x000ee8000c1e9900 */
/*0100*/ LDG.E.CONSTANT R7, [R12.64] ; /* 0x000000040c077981 */
/* 0x000ee2000c1e9900 */
/*0110*/ MOV R25, 0x10 ; /* 0x0000001000197802 */
/* 0x000fe20000000f00 */
/*0120*/ IMAD.WIDE.U32 R16, R0, R23, c[0x0][0x188] ; /* 0x0000620000107625 */
/* 0x000fc800078e0017 */
/*0130*/ IMAD.WIDE.U32 R18, R0.reuse, R23.reuse, c[0x0][0x190] ; /* 0x0000640000127625 */
/* 0x0c0fe400078e0017 */
/*0140*/ LDG.E.CONSTANT R17, [R16.64] ; /* 0x0000000410117981 */
/* 0x000f24000c1e9900 */
/*0150*/ IMAD.WIDE.U32 R20, R0.reuse, R23.reuse, c[0x0][0x198] ; /* 0x0000660000147625 */
/* 0x0c0fe400078e0017 */
/*0160*/ LDG.E.CONSTANT R18, [R18.64] ; /* 0x0000000412127981 */
/* 0x000f64000c1e9900 */
/*0170*/ IMAD.WIDE.U32 R22, R0, R23, c[0x0][0x1a0] ; /* 0x0000680000167625 */
/* 0x000fe400078e0017 */
/*0180*/ LDG.E.CONSTANT R21, [R20.64] ; /* 0x0000000414157981 */
/* 0x000f28000c1e9900 */
/*0190*/ LDG.E.CONSTANT R22, [R22.64] ; /* 0x0000000416167981 */
/* 0x000f22000c1e9900 */
/*01a0*/ ISETP.NE.U32.AND P0, PT, R14, RZ, PT ; /* 0x000000ff0e00720c */
/* 0x004fe20003f05070 */
/*01b0*/ IMAD.WIDE.U32 R14, R0, R25, c[0x0][0x160] ; /* 0x00005800000e7625 */
/* 0x000fd800078e0019 */
/*01c0*/ @P0 LDG.E.LU.128 R4, [R14.64] ; /* 0x000000040e040981 */
/* 0x008f22000c3e1d00 */
/*01d0*/ IMAD.WIDE.U32 R2, R0, R25, c[0x0][0x1b0] ; /* 0x00006c0000027625 */
/* 0x001fc800078e0019 */
/*01e0*/ IMAD R4, R17, R4, RZ ; /* 0x0000000411047224 */
/* 0x010fe400078e02ff */
/*01f0*/ IMAD R5, R18, R5, RZ ; /* 0x0000000512057224 */
/* 0x020fe400078e02ff */
/*0200*/ IMAD R6, R21, R6, RZ ; /* 0x0000000615067224 */
/* 0x000fe400078e02ff */
/*0210*/ IMAD R7, R22, R7, RZ ; /* 0x0000000716077224 */
/* 0x000fca00078e02ff */
/*0220*/ STG.E.128 [R2.64], R4 ; /* 0x0000000402007986 */
/* 0x000fe2000c101d04 */
/*0230*/ EXIT ; /* 0x000000000000794d */
/* 0x000fea0003800000 */
/*0240*/ BRA 0x240; /* 0xfffffff000007947 */
/* 0x000fc0000383ffff */
/*0250*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0260*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0270*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0280*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0290*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*02a0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*02b0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*02c0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*02d0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*02e0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*02f0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
.......... You will find that |
Also check this: CUDA PTX ISA
I am not sure what is the exact meaning of "initialized to zero", we can write more testing code for this. But anyway, the assembly code told us that |
@bmmcq has your issue been resolved? |
This issue has been labeled |
Closing due to inactivity |
The "global_load" function is used to copy data in global memory to a local array(maybe in registers). For examples:
The code above in "memory.h" seems to load the target data to registers(%6,%7,%8,%9) first, and then move the value of registers(%6,%7,%8,%9) to registers(%0, %1, %2, %3). Then load the source data(ptr) in global memory to registers(%0, %1, %2, %3), and write these values to the target data at last.
My question is what are the registers(%6,%7,%8,%9) and these "mov" instructions used for? It seems that I can load source data to registers(%0,%1,%2, %3) directly like this:
What are the considerations behind this?
The text was updated successfully, but these errors were encountered: