Skip to content

Commit

Permalink
finish
Browse files Browse the repository at this point in the history
  • Loading branch information
Tony-Tan committed Jun 20, 2018
1 parent 368f89c commit ee69d9d
Show file tree
Hide file tree
Showing 24 changed files with 810 additions and 16 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
/build/
/cmake-build-debug/
.DS_Store
.vscode
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,6 @@ __global__ void reduceGmem(int * g_idata,int * g_odata,unsigned int n)
//convert global data pointer to the local pointer of this block
int *idata = g_idata + blockIdx.x*blockDim.x;

__syncthreads();
//in-place reduction in global memory
if(blockDim.x>=1024 && tid <512)
idata[tid]+=idata[tid+512];
Expand Down
31 changes: 26 additions & 5 deletions 26_transform_shared_memory/transform_shared_memory.cu
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
#include <cuda_runtime.h>
#include <stdio.h>
#include "freshman.h"
#define BDIMX 32
#define BDIMY 16
#define IPAD 1
#define BDIMX 8
#define BDIMY 8
#define IPAD 2
//cpu transform
void transformMatrix2D_CPU(float * in,float * out,int nx,int ny)
{
Expand All @@ -15,6 +15,16 @@ void transformMatrix2D_CPU(float * in,float * out,int nx,int ny)
}
}
}
// Element-wise copy kernel launched on a 2D grid: the thread at column `col`
// and row `row` copies in[row*nx+col] to the same offset in out.
//   in, out : source and destination buffers, indexed with a row length of nx
//   nx, ny  : extents used for the bounds check
// Threads whose (col,row) lies outside nx x ny return without touching memory.
__global__ void warmup(float * in,float * out,int nx,int ny)
{
    const int col=blockIdx.x*blockDim.x+threadIdx.x;
    const int row=blockIdx.y*blockDim.y+threadIdx.y;
    // Guard clause: out-of-range threads leave before any index is formed.
    if (col>=nx || row>=ny)
    {
        return;
    }
    const int offset=row*nx+col;
    out[offset]=in[offset];
}
__global__ void copyRow(float * in,float * out,int nx,int ny)
{
int ix=threadIdx.x+blockDim.x*blockIdx.x;
Expand Down Expand Up @@ -127,6 +137,7 @@ __global__ void transformSmemUnrollPad(float * in,float* out,int nx,int ny)
tile[row_idx+BDIMX]=in[transform_in_idx+BDIMX];
__syncthreads();
unsigned int col_idx=icol*(blockDim.x*2+IPAD)+irow;
out[transform_out_idx]=tile[col_idx];
out[transform_out_idx+ny*BDIMX]=tile[col_idx+BDIMX];

}
Expand Down Expand Up @@ -159,6 +170,7 @@ int main(int argc,char** argv)

//Malloc
float* A_host=(float*)malloc(nBytes);
float* B_host_cpu=(float*)malloc(nBytes);
float* B_host=(float*)malloc(nBytes);
initialData(A_host,nxy);

Expand All @@ -175,7 +187,7 @@ int main(int argc,char** argv)

// cpu compute
double iStart=cpuSecond();
transformMatrix2D_CPU(A_host,B_host,nx,ny);
transformMatrix2D_CPU(A_host,B_host_cpu,nx,ny);
double iElaps=cpuSecond()-iStart;
printf("CPU Execution Time elapsed %f sec\n",iElaps);

Expand All @@ -184,23 +196,31 @@ int main(int argc,char** argv)
dim3 grid((nx-1)/block.x+1,(ny-1)/block.y+1);
dim3 block_1(dimx,dimy);
dim3 grid_1((nx-1)/(block_1.x*2)+1,(ny-1)/block_1.y+1);
//warmup
warmup<<<grid,block>>>(A_dev,B_dev,nx,ny);
CHECK(cudaDeviceSynchronize());
iStart=cpuSecond();
switch(transform_kernel)
{
case 0:
copyRow<<<grid,block>>>(A_dev,B_dev,nx,ny);
printf("copyRow ");
break;
case 1:
transformNaiveRow<<<grid,block>>>(A_dev,B_dev,nx,ny);
printf("transformNaiveRow ");
break;
case 2:
transformSmem<<<grid,block>>>(A_dev,B_dev,nx,ny);
printf("transformSmem ");
break;
case 3:
transformSmemPad<<<grid,block>>>(A_dev,B_dev,nx,ny);
printf("transformSmemPad ");
break;
case 4:
transformSmemUnrollPad<<<grid_1,block_1>>>(A_dev,B_dev,nx,ny);
printf("transformSmemUnrollPad ");
break;
default:
break;
Expand All @@ -209,12 +229,13 @@ int main(int argc,char** argv)
iElaps=cpuSecond()-iStart;
printf(" Time elapsed %f sec\n",iElaps);
CHECK(cudaMemcpy(B_host,B_dev,nBytes,cudaMemcpyDeviceToHost));
checkResult(B_host,B_host,nxy);
checkResult(B_host,B_host_cpu,nxy);

cudaFree(A_dev);
cudaFree(B_dev);
free(A_host);
free(B_host);
free(B_host_cpu);
cudaDeviceReset();
return 0;
}
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ __global__ void stencil_1d(float * in,float * out)
//printf("%d:GPU :%lf,\n",idx,temp);
}
//read only
__global__ void stencil_1d(float * in,float * out,const float* __restrict__ dcoef)
__global__ void stencil_1d_readonly(float * in,float * out,const float* __restrict__ dcoef)
{
__shared__ float smem[BDIM+2*TEMP_RADIO_SIZE];
int idx=threadIdx.x+blockDim.x*blockIdx.x;
Expand Down Expand Up @@ -123,18 +123,18 @@ int main(int argc,char** argv)
stencil_1d<<<grid,block>>>(in_dev,out_dev);
CHECK(cudaDeviceSynchronize());
iElaps=cpuSecond()-iStart;
printf(" Time elapsed %f sec\n",iElaps);
printf("stencil_1d Time elapsed %f sec\n",iElaps);
CHECK(cudaMemcpy(out_gpu,out_dev,nBytes,cudaMemcpyDeviceToHost));
checkResult(out_cpu,out_gpu,nxy);
CHECK(cudaMemset(out_dev,0,nBytes));
// stencil 1d read only
float * dcoef_ro;
CHECK(cudaMalloc((void**)&dcoef_ro,TEMP_RADIO_SIZE * sizeof(float)));
CHECK(cudaMemcpy(dcoef_ro,templ_,TEMP_RADIO_SIZE * sizeof(float),cudaMemcpyHostToDevice));
stencil_1d<<<grid,block>>>(in_dev,out_dev,dcoef_ro);
stencil_1d_readonly<<<grid,block>>>(in_dev,out_dev,dcoef_ro);
CHECK(cudaDeviceSynchronize());
iElaps=cpuSecond()-iStart;
printf(" Time elapsed %f sec\n",iElaps);
printf("stencil_1d_readonly Time elapsed %f sec\n",iElaps);
CHECK(cudaMemcpy(out_gpu,out_dev,nBytes,cudaMemcpyDeviceToHost));
checkResult(out_cpu,out_gpu,nxy);

Expand Down
2 changes: 1 addition & 1 deletion 28_shfl_test/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
add_executable(shfl_test shfl_test.cu)
add_executable(shfl_test shfl_test.cu)
1 change: 1 addition & 0 deletions 30_stream/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
add_executable(stream stream.cu)
70 changes: 70 additions & 0 deletions 30_stream/stream.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#include <cuda_runtime.h>
#include <stdio.h>
#include "freshman.h"

#define N 300000
// Busy-loop kernel with no parameters and no outputs: adds
// tan(0.1)*tan(0.1) to a local accumulator N times. Nothing is stored to
// memory, so the accumulated value is dropped when the kernel returns.
__global__ void kernel_1()
{
    double acc=0.0;
    int step=0;
    while(step<N)
    {
        acc+=tan(0.1)*tan(0.1);
        ++step;
    }
}
// Busy-loop kernel with no parameters and no outputs: adds
// tan(0.1)*tan(0.1) to a local accumulator N times. Nothing is stored to
// memory, so the accumulated value is dropped when the kernel returns.
__global__ void kernel_2()
{
    double acc=0.0;
    int step=0;
    while(step<N)
    {
        acc+=tan(0.1)*tan(0.1);
        ++step;
    }
}
// Busy-loop kernel with no parameters and no outputs: adds
// tan(0.1)*tan(0.1) to a local accumulator N times. Nothing is stored to
// memory, so the accumulated value is dropped when the kernel returns.
__global__ void kernel_3()
{
    double acc=0.0;
    int step=0;
    while(step<N)
    {
        acc+=tan(0.1)*tan(0.1);
        ++step;
    }
}
// Busy-loop kernel with no parameters and no outputs: adds
// tan(0.1)*tan(0.1) to a local accumulator N times. Nothing is stored to
// memory, so the accumulated value is dropped when the kernel returns.
__global__ void kernel_4()
{
    double acc=0.0;
    int step=0;
    while(step<N)
    {
        acc+=tan(0.1)*tan(0.1);
        ++step;
    }
}
// Launches kernel_1..kernel_4 back to back into each of n_stream (16) CUDA
// streams, one thread per launch, and prints the time between two events
// recorded on stream 0 immediately before and after the launch loop.
//
// Every CUDA runtime call is routed through CHECK (from freshman.h) so that a
// failing call is reported instead of being silently ignored.
// Returns 0 on success, 1 if the host array of stream handles cannot be
// allocated.
int main()
{
    // Set before the first CUDA runtime call made by this program.
    // A failure here is reported but is not fatal.
    if(setenv("CUDA_DEVICE_MAX_CONNECTIONS","32",1)!=0)
    {
        fprintf(stderr,"warning: could not set CUDA_DEVICE_MAX_CONNECTIONS\n");
    }
    int dev = 0;
    CHECK(cudaSetDevice(dev));
    int n_stream=16;
    cudaStream_t *stream=(cudaStream_t*)malloc(n_stream*sizeof(cudaStream_t));
    if(stream==NULL)
    {
        fprintf(stderr,"failed to allocate %d stream handles\n",n_stream);
        return 1;
    }
    for(int i=0;i<n_stream;i++)
    {
        CHECK(cudaStreamCreate(&stream[i]));
    }
    // Single-thread launches: one block of one thread per kernel.
    dim3 block(1);
    dim3 grid(1);
    cudaEvent_t start,stop;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));
    CHECK(cudaEventRecord(start,0));
    for(int i=0;i<n_stream;i++)
    {
        kernel_1<<<grid,block,0,stream[i]>>>();
        kernel_2<<<grid,block,0,stream[i]>>>();
        kernel_3<<<grid,block,0,stream[i]>>>();
        kernel_4<<<grid,block,0,stream[i]>>>();
        // Kernel launches return no status; pick up launch errors here.
        CHECK(cudaGetLastError());
    }
    CHECK(cudaEventRecord(stop,0));
    // Block the host until the stop event has completed before reading timings.
    CHECK(cudaEventSynchronize(stop));
    float elapsed_time;
    CHECK(cudaEventElapsedTime(&elapsed_time,start,stop));
    printf("elapsed time:%f ms\n",elapsed_time);

    for(int i=0;i<n_stream;i++)
    {
        CHECK(cudaStreamDestroy(stream[i]));
    }
    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop));
    free(stream);
    CHECK(cudaDeviceReset());
    return 0;
}

67 changes: 67 additions & 0 deletions 31_stream_omp/stream_omp.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#include <cuda_runtime.h>
#include <stdio.h>
#include "freshman.h"
#include <omp.h>
#define N 300000
// Parameterless busy-loop kernel: runs N iterations, each adding
// tan(0.1)*tan(0.1) to a local running total. The total is never stored
// anywhere, so the kernel has no observable output.
__global__ void kernel_1()
{
    double total=0.0;
    int remaining=N;
    while(remaining>0)
    {
        total+=tan(0.1)*tan(0.1);
        --remaining;
    }
}
// Parameterless busy-loop kernel: runs N iterations, each adding
// tan(0.1)*tan(0.1) to a local running total. The total is never stored
// anywhere, so the kernel has no observable output.
__global__ void kernel_2()
{
    double total=0.0;
    int remaining=N;
    while(remaining>0)
    {
        total+=tan(0.1)*tan(0.1);
        --remaining;
    }
}
// Parameterless busy-loop kernel: runs N iterations, each adding
// tan(0.1)*tan(0.1) to a local running total. The total is never stored
// anywhere, so the kernel has no observable output.
__global__ void kernel_3()
{
    double total=0.0;
    int remaining=N;
    while(remaining>0)
    {
        total+=tan(0.1)*tan(0.1);
        --remaining;
    }
}
// Parameterless busy-loop kernel: runs N iterations, each adding
// tan(0.1)*tan(0.1) to a local running total. The total is never stored
// anywhere, so the kernel has no observable output.
__global__ void kernel_4()
{
    double total=0.0;
    int remaining=N;
    while(remaining>0)
    {
        total+=tan(0.1)*tan(0.1);
        --remaining;
    }
}
// Creates n_stream (4) CUDA streams and opens an OpenMP parallel region with
// the thread count set to n_stream. Each OpenMP thread uses its own thread
// number as the stream index and launches kernel_1..kernel_4 into that stream.
// The time between two events recorded on stream 0 around the parallel region
// is printed in milliseconds.
//
// Every CUDA runtime call is routed through CHECK (from freshman.h) so that a
// failing call is reported instead of being silently ignored.
// Returns 0 on success, 1 if the host array of stream handles cannot be
// allocated.
int main()
{
    int n_stream=4;
    cudaStream_t *stream=(cudaStream_t*)malloc(n_stream*sizeof(cudaStream_t));
    if(stream==NULL)
    {
        fprintf(stderr,"failed to allocate %d stream handles\n",n_stream);
        return 1;
    }
    for(int i=0;i<n_stream;i++)
    {
        CHECK(cudaStreamCreate(&stream[i]));
    }
    // Single-thread launches: one block of one thread per kernel.
    dim3 block(1);
    dim3 grid(1);
    cudaEvent_t start,stop;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));
    CHECK(cudaEventRecord(start,0));
    omp_set_num_threads(n_stream);
#pragma omp parallel
    {
        // The thread number doubles as the stream index; a stream whose index
        // has no matching thread receives no kernels.
        int i=omp_get_thread_num();
        kernel_1<<<grid,block,0,stream[i]>>>();
        kernel_2<<<grid,block,0,stream[i]>>>();
        kernel_3<<<grid,block,0,stream[i]>>>();
        kernel_4<<<grid,block,0,stream[i]>>>();
        // Kernel launches return no status; check on the launching thread.
        CHECK(cudaGetLastError());
    }
    CHECK(cudaEventRecord(stop,0));
    // Block the host until the stop event has completed before reading timings.
    CHECK(cudaEventSynchronize(stop));
    float elapsed_time;
    CHECK(cudaEventElapsedTime(&elapsed_time,start,stop));
    printf("elapsed time:%f ms\n",elapsed_time);

    for(int i=0;i<n_stream;i++)
    {
        CHECK(cudaStreamDestroy(stream[i]));
    }
    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop));
    free(stream);
    return 0;
}
1 change: 1 addition & 0 deletions 32_stream_resource/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
add_executable(stream_resource stream_resource.cu)
65 changes: 65 additions & 0 deletions 32_stream_resource/stream_resource.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#include <cuda_runtime.h>
#include <stdio.h>
#include "freshman.h"
#define N 100
// Busy-loop kernel taking no arguments: every thread adds
// tan(0.1)*tan(0.1) to its own local sum N times and then returns without
// storing the result anywhere.
__global__ void kernel_1()
{
    double partial=0.0;
    int iter=0;
    while(iter!=N)
    {
        partial+=tan(0.1)*tan(0.1);
        iter++;
    }
}
// Busy-loop kernel taking no arguments: every thread adds
// tan(0.1)*tan(0.1) to its own local sum N times and then returns without
// storing the result anywhere.
__global__ void kernel_2()
{
    double partial=0.0;
    int iter=0;
    while(iter!=N)
    {
        partial+=tan(0.1)*tan(0.1);
        iter++;
    }
}
// Busy-loop kernel taking no arguments: every thread adds
// tan(0.1)*tan(0.1) to its own local sum N times and then returns without
// storing the result anywhere.
__global__ void kernel_3()
{
    double partial=0.0;
    int iter=0;
    while(iter!=N)
    {
        partial+=tan(0.1)*tan(0.1);
        iter++;
    }
}
// Busy-loop kernel taking no arguments: every thread adds
// tan(0.1)*tan(0.1) to its own local sum N times and then returns without
// storing the result anywhere.
__global__ void kernel_4()
{
    double partial=0.0;
    int iter=0;
    while(iter!=N)
    {
        partial+=tan(0.1)*tan(0.1);
        iter++;
    }
}
// Launches kernel_1..kernel_4 into each of n_stream (4) CUDA streams using
// 32 blocks of 16x32 (512) threads per launch, then prints the time between
// two events recorded (with the default stream argument) just before and
// after the launch loop.
//
// Every CUDA runtime call is routed through CHECK (from freshman.h) so that a
// failing call is reported instead of being silently ignored.
// Returns 0 on success, 1 if the host array of stream handles cannot be
// allocated.
int main()
{
    //setenv("CUDA_DEVICE_MAX_CONNECTIONS","32",1);
    int n_stream=4;
    cudaStream_t *stream=(cudaStream_t*)malloc(n_stream*sizeof(cudaStream_t));
    if(stream==NULL)
    {
        fprintf(stderr,"failed to allocate %d stream handles\n",n_stream);
        return 1;
    }
    for(int i=0;i<n_stream;i++)
    {
        CHECK(cudaStreamCreate(&stream[i]));
    }
    dim3 block(16,32);
    dim3 grid(32);
    cudaEvent_t start,stop;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));
    CHECK(cudaEventRecord(start));
    for(int i=0;i<n_stream;i++)
    {
        kernel_1<<<grid,block,0,stream[i]>>>();
        kernel_2<<<grid,block,0,stream[i]>>>();
        kernel_3<<<grid,block,0,stream[i]>>>();
        kernel_4<<<grid,block,0,stream[i]>>>();
        // Kernel launches return no status; pick up launch errors here.
        CHECK(cudaGetLastError());
    }
    CHECK(cudaEventRecord(stop));
    // Block the host until the stop event has completed before reading timings.
    CHECK(cudaEventSynchronize(stop));
    float elapsed_time;
    CHECK(cudaEventElapsedTime(&elapsed_time,start,stop));
    printf("elapsed time:%f ms\n",elapsed_time);

    for(int i=0;i<n_stream;i++)
    {
        CHECK(cudaStreamDestroy(stream[i]));
    }
    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop));
    free(stream);
    return 0;
}
1 change: 1 addition & 0 deletions 33_stream_block/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
add_executable(stream_block stream_block.cu)
Loading

0 comments on commit ee69d9d

Please sign in to comment.