Skip to content

Commit

Permalink
27,28
Browse files Browse the repository at this point in the history
  • Loading branch information
Tony-Tan committed May 29, 2018
1 parent 7514146 commit 5c264f1
Show file tree
Hide file tree
Showing 15 changed files with 1,278 additions and 2 deletions.
1 change: 1 addition & 0 deletions 22_transform_matrix2D/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
add_executable(transform_matrix2D transform_matrix2D.cu)
204 changes: 204 additions & 0 deletions 22_transform_matrix2D/transform_matrix2D.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
#include <cuda_runtime.h>
#include <stdio.h>
#include "freshman.h"
// CPU reference transpose.
// Reads MatA as a row-major matrix with ny rows and nx columns and writes its
// transpose (nx rows, ny columns, row-major) into MatB.
//   MatA : input buffer, at least nx*ny floats
//   MatB : output buffer, at least nx*ny floats (must not alias MatA)
//   nx   : number of columns of MatA
//   ny   : number of rows of MatA
void transformMatrix2D_CPU(float * MatA,float * MatB,int nx,int ny)
{
    for(int j=0;j<ny;j++)
    {
        for(int i=0;i<nx;i++)
        {
            // A row of the transposed matrix holds ny elements, so the output
            // row stride is ny; nx is only equivalent when the matrix is square.
            MatB[i*ny+j]=MatA[j*nx+i];
        }
    }
}
// Element-wise copy MatA -> MatB using the offset col + row*nx, so threads
// with consecutive x indices touch consecutive elements.
// Expects a 2D grid of 2D blocks; threads outside the nx-by-ny extent exit
// without touching memory.
__global__ void copyRow(float * MatA,float * MatB,int nx,int ny)
{
    const int col=blockIdx.x*blockDim.x+threadIdx.x;
    const int row=blockIdx.y*blockDim.y+threadIdx.y;
    if (col>=nx || row>=ny)
    {
        return;
    }
    const int offset=row*nx+col;
    MatB[offset]=MatA[offset];
}
// Element-wise copy MatA -> MatB using the offset col*ny + row, so threads
// with consecutive x indices touch elements that are ny floats apart.
// Expects a 2D grid of 2D blocks; threads outside the nx-by-ny extent exit
// without touching memory.
__global__ void copyCol(float * MatA,float * MatB,int nx,int ny)
{
    const int col=blockIdx.x*blockDim.x+threadIdx.x;
    const int row=blockIdx.y*blockDim.y+threadIdx.y;
    if (col>=nx || row>=ny)
    {
        return;
    }
    const int offset=col*ny+row;
    MatB[offset]=MatA[offset];
}
// Naive transpose: each thread reads MatA at col + row*nx (consecutive x
// threads read consecutive elements) and writes MatB at col*ny + row
// (consecutive x threads write ny floats apart).
// Expects a 2D grid of 2D blocks; out-of-range threads do nothing.
__global__ void transformNaiveRow(float * MatA,float * MatB,int nx,int ny)
{
    const int col=blockIdx.x*blockDim.x+threadIdx.x;
    const int row=blockIdx.y*blockDim.y+threadIdx.y;
    if (col>=nx || row>=ny)
    {
        return;
    }
    const int src=row*nx+col;
    const int dst=col*ny+row;
    MatB[dst]=MatA[src];
}
// Naive transpose with the access pattern swapped relative to
// transformNaiveRow: each thread reads MatA at col*ny + row (consecutive x
// threads read ny floats apart) and writes MatB at col + row*nx (consecutive
// x threads write consecutive elements).
// Expects a 2D grid of 2D blocks; out-of-range threads do nothing.
__global__ void transformNaiveCol(float * MatA,float * MatB,int nx,int ny)
{
    const int col=blockIdx.x*blockDim.x+threadIdx.x;
    const int row=blockIdx.y*blockDim.y+threadIdx.y;
    if (col>=nx || row>=ny)
    {
        return;
    }
    const int src=col*ny+row;
    const int dst=row*nx+col;
    MatB[dst]=MatA[src];
}
// Transpose with 4-way unrolling along x: each thread handles up to four
// elements of one row, spaced blockDim.x columns apart. Reads from MatA use
// the offset ix + iy*nx, writes to MatB use ix*ny + iy.
// Launch with a 2D grid whose x extent covers nx in steps of 4*blockDim.x,
// i.e. gridDim.x >= ceil(nx / (4*blockDim.x)).
__global__ void transformNaiveRowUnroll(float * MatA,float * MatB,int nx,int ny)
{
    int ix=threadIdx.x+blockDim.x*blockIdx.x*4;
    int iy=threadIdx.y+blockDim.y*blockIdx.y;
    if (iy>=ny)
    {
        return;
    }
    int idx_row=ix+iy*nx;
    int idx_col=ix*ny+iy;
    #pragma unroll
    for(int k=0;k<4;k++)
    {
        int step=k*blockDim.x;
        // Guard every unrolled element separately: when nx is not a multiple
        // of 4*blockDim.x the later columns can fall outside the matrix even
        // though the first one is in range.
        if (ix+step<nx)
        {
            MatB[idx_col+ny*step]=MatA[idx_row+step];
        }
    }
}
// Transpose with 4-way unrolling along x and the access pattern swapped
// relative to transformNaiveRowUnroll: reads from MatA use the offset
// ix*ny + iy, writes to MatB use ix + iy*nx. Each thread handles up to four
// elements spaced blockDim.x columns apart.
// Launch with a 2D grid whose x extent covers nx in steps of 4*blockDim.x,
// i.e. gridDim.x >= ceil(nx / (4*blockDim.x)).
__global__ void transformNaiveColUnroll(float * MatA,float * MatB,int nx,int ny)
{
    int ix=threadIdx.x+blockDim.x*blockIdx.x*4;
    int iy=threadIdx.y+blockDim.y*blockIdx.y;
    if (iy>=ny)
    {
        return;
    }
    int idx_row=ix+iy*nx;
    int idx_col=ix*ny+iy;
    #pragma unroll
    for(int k=0;k<4;k++)
    {
        int step=k*blockDim.x;
        // Guard every unrolled element separately: when nx is not a multiple
        // of 4*blockDim.x the later columns can fall outside the matrix even
        // though the first one is in range.
        if (ix+step<nx)
        {
            MatB[idx_row+step]=MatA[idx_col+ny*step];
        }
    }
}
// Transpose with remapped block coordinates: the tile row is taken from
// blockIdx.x and the tile column is (blockIdx.x + blockIdx.y) % gridDim.x.
// Reads from MatA use col + row*nx, writes to MatB use col*ny + row.
// NOTE: the remapping reaches every tile exactly once only for a square grid
// (gridDim.x == gridDim.y); with other grid shapes some tiles are skipped.
__global__ void transformNaiveRowDiagonal(float * MatA,float * MatB,int nx,int ny)
{
    const int tile_row=blockIdx.x;
    const int tile_col=(blockIdx.x+blockIdx.y)%gridDim.x;
    const int col=blockDim.x*tile_col+threadIdx.x;
    const int row=blockDim.y*tile_row+threadIdx.y;
    if (col>=nx || row>=ny)
    {
        return;
    }
    const int src=row*nx+col;
    const int dst=col*ny+row;
    MatB[dst]=MatA[src];
}
// Transpose with remapped block coordinates: the tile row is taken from
// blockIdx.x and the tile column is (blockIdx.x + blockIdx.y) % gridDim.x.
// Reads from MatA use col*ny + row, writes to MatB use col + row*nx.
// NOTE: the remapping reaches every tile exactly once only for a square grid
// (gridDim.x == gridDim.y); with other grid shapes some tiles are skipped.
__global__ void transformNaiveColDiagonal(float * MatA,float * MatB,int nx,int ny)
{
    const int tile_row=blockIdx.x;
    const int tile_col=(blockIdx.x+blockIdx.y)%gridDim.x;
    const int col=blockDim.x*tile_col+threadIdx.x;
    const int row=blockDim.y*tile_row+threadIdx.y;
    if (col>=nx || row>=ny)
    {
        return;
    }
    const int src=col*ny+row;
    const int dst=row*nx+col;
    MatB[dst]=MatA[src];
}



// Benchmark driver for the copy / transpose kernels in this file.
//
// Usage: transform_matrix2D [kernel]
//        transform_matrix2D kernel dimx dimy
//   kernel : 0 copyRow, 1 copyCol, 2 transformNaiveRow, 3 transformNaiveCol,
//            4 transformNaiveRowUnroll, 5 transformNaiveColUnroll,
//            6 transformNaiveRowDiagonal, 7 transformNaiveColDiagonal
//   dimx, dimy : thread-block dimensions (default 32 x 32)
//
// Runs the CPU transpose as a reference, times the selected kernel and then
// compares the device output with the expected host data.
int main(int argc,char** argv)
{
    printf("strating...\n");
    initDevice(0);
    int nx=1<<12;
    int ny=1<<12;
    int dimx=32;
    int dimy=32;
    int nxy=nx*ny;
    size_t nBytes=(size_t)nxy*sizeof(float);
    int transform_kernel=0;
    if(argc==2)
        transform_kernel=atoi(argv[1]);
    if(argc>=4)
    {
        transform_kernel=atoi(argv[1]);
        dimx=atoi(argv[2]);
        dimy=atoi(argv[3]);
    }
    // A zero or negative block dimension would divide by zero in the grid
    // computation below.
    if(dimx<=0 || dimy<=0)
    {
        fprintf(stderr,"block dimensions must be positive (got %d x %d)\n",dimx,dimy);
        return 1;
    }

    //Malloc
    float* A_host=(float*)malloc(nBytes);
    float* B_host=(float*)malloc(nBytes);      // CPU reference transpose
    float* B_from_gpu=(float*)malloc(nBytes);  // device result copied back
    if(A_host==NULL || B_host==NULL || B_from_gpu==NULL)
    {
        fprintf(stderr,"host allocation of %zu bytes failed\n",nBytes);
        free(A_host);
        free(B_host);
        free(B_from_gpu);
        return 1;
    }
    initialData(A_host,nxy);

    //cudaMalloc
    float *A_dev=NULL;
    float *B_dev=NULL;
    CHECK(cudaMalloc((void**)&A_dev,nBytes));
    CHECK(cudaMalloc((void**)&B_dev,nBytes));

    CHECK(cudaMemcpy(A_dev,A_host,nBytes,cudaMemcpyHostToDevice));
    CHECK(cudaMemset(B_dev,0,nBytes));

    // cpu compute
    double iStart=cpuSecond();
    transformMatrix2D_CPU(A_host,B_host,nx,ny);
    double iElaps=cpuSecond()-iStart;
    printf("CPU Execution Time elapsed %f sec\n",iElaps);

    // 2d block and 2d grid
    dim3 block(dimx,dimy);
    dim3 grid((nx-1)/block.x+1,(ny-1)/block.y+1);
    // The unrolled kernels process four columns per thread, so their grid is
    // four times narrower in x.
    dim3 block_1(dimx,dimy);
    dim3 grid_1((nx-1)/(block_1.x*4)+1,(ny-1)/block_1.y+1);
    int launched=1;
    iStart=cpuSecond();
    switch(transform_kernel)
    {
        case 0:
            copyRow<<<grid,block>>>(A_dev,B_dev,nx,ny);
            break;
        case 1:
            copyCol<<<grid,block>>>(A_dev,B_dev,nx,ny);
            break;
        case 2:
            transformNaiveRow<<<grid,block>>>(A_dev,B_dev,nx,ny);
            break;
        case 3:
            transformNaiveCol<<<grid,block>>>(A_dev,B_dev,nx,ny);
            break;
        case 4:
            transformNaiveRowUnroll<<<grid_1,block_1>>>(A_dev,B_dev,nx,ny);
            break;
        case 5:
            transformNaiveColUnroll<<<grid_1,block_1>>>(A_dev,B_dev,nx,ny);
            break;
        case 6:
            transformNaiveRowDiagonal<<<grid,block>>>(A_dev,B_dev,nx,ny);
            break;
        case 7:
            transformNaiveColDiagonal<<<grid,block>>>(A_dev,B_dev,nx,ny);
            break;
        default:
            launched=0;
            break;
    }
    // Launch-configuration errors (e.g. too many threads per block) are only
    // reported through cudaGetLastError, not by the launch itself.
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());
    iElaps=cpuSecond()-iStart;
    printf(" Time elapsed %f sec\n",iElaps);
    CHECK(cudaMemcpy(B_from_gpu,B_dev,nBytes,cudaMemcpyDeviceToHost));
    if(launched)
    {
        // Kernels 0 and 1 are plain copies, so they are expected to reproduce
        // the input; all other kernels are compared with the CPU transpose.
        float* expected=(transform_kernel<=1)?A_host:B_host;
        checkResult(expected,B_from_gpu,nxy);
    }
    else
    {
        printf("unknown kernel %d, nothing launched\n",transform_kernel);
    }

    CHECK(cudaFree(A_dev));
    CHECK(cudaFree(B_dev));
    free(A_host);
    free(B_host);
    free(B_from_gpu);
    cudaDeviceReset();
    return 0;
}
1 change: 1 addition & 0 deletions 23_sum_array_uniform_memory/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
add_executable(sum_arrays_uniform_memory sum_arrays_uniform_memory.cu)
67 changes: 67 additions & 0 deletions 23_sum_array_uniform_memory/sum_arrays_uniform_memory.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#include <cuda_runtime.h>
#include <stdio.h>
#include "freshman.h"



// Host reference: res[i] = a[i] + b[i] for every i in [0, size).
//   a, b : input arrays with at least size elements
//   res  : output array with at least size elements
//   size : number of elements to process (any non-negative value)
void sumArrays(float * a,float * b,float * res,const int size)
{
    // Main body, manually unrolled four elements at a time. The bound stops
    // at the last full group so no element at or past size is touched.
    int i=0;
    for(;i+3<size;i+=4)
    {
        res[i]=a[i]+b[i];
        res[i+1]=a[i+1]+b[i+1];
        res[i+2]=a[i+2]+b[i+2];
        res[i+3]=a[i+3]+b[i+3];
    }
    // Tail: the remaining 0-3 elements when size is not a multiple of 4.
    for(;i<size;i++)
    {
        res[i]=a[i]+b[i];
    }
}
// Element-wise vector add: one thread per element, res[i] = a[i] + b[i].
// Expects a 1D launch; threads whose global index is N or larger exit
// without touching memory.
__global__ void sumArraysGPU(float*a,float*b,float*res,int N)
{
    const int tid=threadIdx.x+blockDim.x*blockIdx.x;
    if(tid>=N)
    {
        return;
    }
    res[tid]=a[tid]+b[tid];
}
// Vector-add example using managed memory (cudaMallocManaged): the same
// pointers are initialised on the host, consumed by the kernel and then read
// back on the host without explicit cudaMemcpy calls.
// The device result is compared with a host-side reference sum.
int main(int argc,char **argv)
{
    // set up device
    initDevice(0);

    int nElem=1<<24;
    printf("Vector size:%d\n",nElem);
    size_t nByte=sizeof(float)*nElem;
    float *res_h=(float*)malloc(nByte);
    if(res_h==NULL)
    {
        fprintf(stderr,"host allocation of %zu bytes failed\n",nByte);
        return 1;
    }
    memset(res_h,0,nByte);

    float *a_d,*b_d,*res_d;
    CHECK(cudaMallocManaged((float**)&a_d,nByte));
    CHECK(cudaMallocManaged((float**)&b_d,nByte));
    CHECK(cudaMallocManaged((float**)&res_d,nByte));

    initialData(a_d,nElem);
    initialData(b_d,nElem);

    dim3 block(512);
    dim3 grid((nElem-1)/block.x+1);

    double iStart,iElaps;
    iStart=cpuSecond();
    sumArraysGPU<<<grid,block>>>(a_d,b_d,res_d,nElem);
    // Launch-configuration errors are only reported through cudaGetLastError.
    CHECK(cudaGetLastError());
    // The kernel must have finished before the host reads the managed buffers.
    CHECK(cudaDeviceSynchronize());
    iElaps=cpuSecond()-iStart;
    printf("Execution configuration<<<%d,%d>>> Time elapsed %f sec\n",grid.x,block.x,iElaps);

    // Host reference must add the same two operands as the kernel (a + b).
    sumArrays(a_d,b_d,res_h,nElem);

    checkResult(res_h,res_d,nElem);
    CHECK(cudaFree(a_d));
    CHECK(cudaFree(b_d));
    CHECK(cudaFree(res_d));

    free(res_h);

    return 0;
}
1 change: 1 addition & 0 deletions 24_shared_memory_read_data/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
add_executable(shared_memory_read_data shared_memory_read_data.cu)
Loading

0 comments on commit 5c264f1

Please sign in to comment.