-
Notifications
You must be signed in to change notification settings - Fork 458
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
15 changed files
with
1,278 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
# Builds the transform_matrix2D executable from the single source transform_matrix2D.cu.
add_executable(transform_matrix2D transform_matrix2D.cu) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,204 @@ | ||
#include <cuda_runtime.h> | ||
#include <stdio.h> | ||
#include "freshman.h" | ||
//cpu transform | ||
/*
 * CPU reference transpose.
 *
 * MatA : input, read as ny rows of nx floats (element (row j, col i) at j*nx+i).
 * MatB : output, written as nx rows of ny floats (element (row i, col j) at i*ny+j).
 * nx   : number of columns of MatA.
 * ny   : number of rows of MatA.
 *
 * Both buffers must hold at least nx*ny floats.
 */
void transformMatrix2D_CPU(float * MatA,float * MatB,int nx,int ny)
{
    for(int j=0;j<ny;j++)
    {
        for(int i=0;i<nx;i++)
        {
            // The transposed matrix has ny elements per row, so its row
            // stride is ny; using nx here is only correct when nx==ny and
            // writes out of bounds otherwise.
            MatB[i*ny+j]=MatA[j*nx+i];
        }
    }
}
/*
 * Element-wise copy of MatA into MatB.
 * Each thread handles one element; the x thread coordinate is the fastest
 * varying part of the linear offset (offset = y*nx + x).
 * Expects a 2D launch whose x/y extents cover nx and ny respectively.
 */
__global__ void copyRow(float * MatA,float * MatB,int nx,int ny)
{
    const int col=blockIdx.x*blockDim.x+threadIdx.x;
    const int row=blockIdx.y*blockDim.y+threadIdx.y;
    // Threads outside the nx-by-ny extent have nothing to do.
    if (col>=nx || row>=ny)
    {
        return;
    }
    const int offset=row*nx+col;
    MatB[offset]=MatA[offset];
}
/*
 * Element-wise copy of MatA into MatB.
 * Each thread handles one element; the y thread coordinate is the fastest
 * varying part of the linear offset (offset = x*ny + y).
 * Expects a 2D launch whose x/y extents cover nx and ny respectively.
 */
__global__ void copyCol(float * MatA,float * MatB,int nx,int ny)
{
    const int col=blockIdx.x*blockDim.x+threadIdx.x;
    const int row=blockIdx.y*blockDim.y+threadIdx.y;
    // Threads outside the nx-by-ny extent have nothing to do.
    if (col>=nx || row>=ny)
    {
        return;
    }
    const int offset=col*ny+row;
    MatB[offset]=MatA[offset];
}
/*
 * Transpose, one element per thread: reads MatA at y*nx+x and writes MatB
 * at x*ny+y.
 * Expects a 2D launch whose x/y extents cover nx and ny respectively.
 */
__global__ void transformNaiveRow(float * MatA,float * MatB,int nx,int ny)
{
    const int col=blockIdx.x*blockDim.x+threadIdx.x;
    const int row=blockIdx.y*blockDim.y+threadIdx.y;
    // Threads outside the nx-by-ny extent have nothing to do.
    if (col>=nx || row>=ny)
    {
        return;
    }
    const int src=row*nx+col;
    const int dst=col*ny+row;
    MatB[dst]=MatA[src];
}
/*
 * Transpose, one element per thread: reads MatA at x*ny+y and writes MatB
 * at y*nx+x.
 * Expects a 2D launch whose x/y extents cover nx and ny respectively.
 */
__global__ void transformNaiveCol(float * MatA,float * MatB,int nx,int ny)
{
    const int col=blockIdx.x*blockDim.x+threadIdx.x;
    const int row=blockIdx.y*blockDim.y+threadIdx.y;
    // Threads outside the nx-by-ny extent have nothing to do.
    if (col>=nx || row>=ny)
    {
        return;
    }
    const int src=col*ny+row;
    const int dst=row*nx+col;
    MatB[dst]=MatA[src];
}
/*
 * Transpose with each thread handling up to four elements spaced blockDim.x
 * apart along x: reads MatA at y*nx+x and writes MatB at x*ny+y.
 *
 * Launch layout: 2D blocks; each block covers 4*blockDim.x columns, so the
 * grid x extent should be ceil(nx / (4*blockDim.x)) and the y extent
 * ceil(ny / blockDim.y).
 */
__global__ void transformNaiveRowUnroll(float * MatA,float * MatB,int nx,int ny)
{
    int ix=threadIdx.x+blockDim.x*blockIdx.x*4;
    int iy=threadIdx.y+blockDim.y*blockIdx.y;
    if (iy>=ny)
    {
        return;
    }
    int step=blockDim.x;
    #pragma unroll
    for(int k=0;k<4;k++)
    {
        int x=ix+k*step;
        // Every unrolled element needs its own bound check: checking only
        // the first column lets the other three run past nx when nx is not
        // a multiple of 4*blockDim.x.
        if (x<nx)
        {
            MatB[x*ny+iy]=MatA[x+iy*nx];
        }
    }
}
/*
 * Transpose with each thread handling up to four elements spaced blockDim.x
 * apart along x: reads MatA at x*ny+y and writes MatB at y*nx+x.
 *
 * Launch layout: 2D blocks; each block covers 4*blockDim.x columns, so the
 * grid x extent should be ceil(nx / (4*blockDim.x)) and the y extent
 * ceil(ny / blockDim.y).
 */
__global__ void transformNaiveColUnroll(float * MatA,float * MatB,int nx,int ny)
{
    int ix=threadIdx.x+blockDim.x*blockIdx.x*4;
    int iy=threadIdx.y+blockDim.y*blockIdx.y;
    if (iy>=ny)
    {
        return;
    }
    int step=blockDim.x;
    #pragma unroll
    for(int k=0;k<4;k++)
    {
        int x=ix+k*step;
        // Every unrolled element needs its own bound check: checking only
        // the first column lets the other three run past nx when nx is not
        // a multiple of 4*blockDim.x.
        if (x<nx)
        {
            MatB[x+iy*nx]=MatA[x*ny+iy];
        }
    }
}
/*
 * Transpose, one element per thread, with remapped block coordinates:
 * the block row is taken from blockIdx.x and the block column from
 * (blockIdx.x+blockIdx.y) modulo gridDim.x. Reads MatA at y*nx+x and writes
 * MatB at x*ny+y.
 *
 * NOTE(review): the block row only ranges over gridDim.x values, so this
 * remapping appears to assume gridDim.x == gridDim.y - confirm with callers.
 */
__global__ void transformNaiveRowDiagonal(float * MatA,float * MatB,int nx,int ny)
{
    const int tile_row=blockIdx.x;
    const int tile_col=(blockIdx.x+blockIdx.y)%gridDim.x;
    const int col=tile_col*blockDim.x+threadIdx.x;
    const int row=tile_row*blockDim.y+threadIdx.y;
    // Threads outside the nx-by-ny extent have nothing to do.
    if (col>=nx || row>=ny)
    {
        return;
    }
    const int src=row*nx+col;
    const int dst=col*ny+row;
    MatB[dst]=MatA[src];
}
/*
 * Transpose, one element per thread, with remapped block coordinates:
 * the block row is taken from blockIdx.x and the block column from
 * (blockIdx.x+blockIdx.y) modulo gridDim.x. Reads MatA at x*ny+y and writes
 * MatB at y*nx+x.
 *
 * NOTE(review): the block row only ranges over gridDim.x values, so this
 * remapping appears to assume gridDim.x == gridDim.y - confirm with callers.
 */
__global__ void transformNaiveColDiagonal(float * MatA,float * MatB,int nx,int ny)
{
    const int tile_row=blockIdx.x;
    const int tile_col=(blockIdx.x+blockIdx.y)%gridDim.x;
    const int col=tile_col*blockDim.x+threadIdx.x;
    const int row=tile_row*blockDim.y+threadIdx.y;
    // Threads outside the nx-by-ny extent have nothing to do.
    if (col>=nx || row>=ny)
    {
        return;
    }
    const int src=col*ny+row;
    const int dst=row*nx+col;
    MatB[dst]=MatA[src];
}
|
||
|
||
|
||
/*
 * Benchmark driver for the copy / transpose kernels.
 *
 * Usage: transform_matrix2D [kernel [dimx dimy]]
 *   kernel : 0 copyRow, 1 copyCol, 2 transformNaiveRow, 3 transformNaiveCol,
 *            4 transformNaiveRowUnroll, 5 transformNaiveColUnroll,
 *            6 transformNaiveRowDiagonal, 7 transformNaiveColDiagonal
 *   dimx, dimy : thread-block dimensions (default 32 x 32)
 *
 * Runs the CPU transpose as a reference, times the selected kernel, and
 * compares the device output with the expected host data.
 * Returns 0 on normal completion and 1 on invalid arguments or host
 * allocation failure.
 */
int main(int argc,char** argv)
{
    printf("starting...\n");
    initDevice(0);
    int nx=1<<12;
    int ny=1<<12;
    int dimx=32;
    int dimy=32;
    int nxy=nx*ny;
    int nBytes=nxy*sizeof(float);
    int transform_kernel=0;
    // The kernel selector is honoured whenever it is present, including the
    // three-argument form that previously ignored it.
    if(argc>=2)
        transform_kernel=atoi(argv[1]);
    if(argc>=4)
    {
        dimx=atoi(argv[2]);
        dimy=atoi(argv[3]);
    }
    // The grid size is computed by dividing by the block dimensions, so a
    // zero or negative value must be rejected up front.
    if(dimx<=0 || dimy<=0)
    {
        fprintf(stderr,"block dimensions must be positive (got %d x %d)\n",dimx,dimy);
        return 1;
    }

    //Malloc
    float* A_host=(float*)malloc(nBytes);
    float* B_host=(float*)malloc(nBytes);
    // Separate buffer for the device result so the CPU reference in B_host
    // is not overwritten before the comparison.
    float* B_from_gpu=(float*)malloc(nBytes);
    if(A_host==NULL || B_host==NULL || B_from_gpu==NULL)
    {
        fprintf(stderr,"host allocation of %d bytes failed\n",nBytes);
        free(A_host);
        free(B_host);
        free(B_from_gpu);
        return 1;
    }
    initialData(A_host,nxy);

    //cudaMalloc
    float *A_dev=NULL;
    float *B_dev=NULL;
    CHECK(cudaMalloc((void**)&A_dev,nBytes));
    CHECK(cudaMalloc((void**)&B_dev,nBytes));

    CHECK(cudaMemcpy(A_dev,A_host,nBytes,cudaMemcpyHostToDevice));
    CHECK(cudaMemset(B_dev,0,nBytes));

    // cpu compute
    double iStart=cpuSecond();
    transformMatrix2D_CPU(A_host,B_host,nx,ny);
    double iElaps=cpuSecond()-iStart;
    printf("CPU Execution Time elapsed %f sec\n",iElaps);

    // 2d block and 2d grid
    dim3 block(dimx,dimy);
    dim3 grid((nx-1)/block.x+1,(ny-1)/block.y+1);
    // The unrolled kernels cover four block-widths of columns per block.
    dim3 block_1(dimx,dimy);
    dim3 grid_1((nx-1)/(block_1.x*4)+1,(ny-1)/block_1.y+1);
    bool launched=true;
    iStart=cpuSecond();
    switch(transform_kernel)
    {
        case 0:
            copyRow<<<grid,block>>>(A_dev,B_dev,nx,ny);
            break;
        case 1:
            copyCol<<<grid,block>>>(A_dev,B_dev,nx,ny);
            break;
        case 2:
            transformNaiveRow<<<grid,block>>>(A_dev,B_dev,nx,ny);
            break;
        case 3:
            transformNaiveCol<<<grid,block>>>(A_dev,B_dev,nx,ny);
            break;
        case 4:
            transformNaiveRowUnroll<<<grid_1,block_1>>>(A_dev,B_dev,nx,ny);
            break;
        case 5:
            transformNaiveColUnroll<<<grid_1,block_1>>>(A_dev,B_dev,nx,ny);
            break;
        case 6:
            transformNaiveRowDiagonal<<<grid,block>>>(A_dev,B_dev,nx,ny);
            break;
        case 7:
            transformNaiveColDiagonal<<<grid,block>>>(A_dev,B_dev,nx,ny);
            break;
        default:
            launched=false;
            fprintf(stderr,"unknown kernel id %d, nothing launched\n",transform_kernel);
            break;
    }
    // Launch-configuration errors are only reported through
    // cudaGetLastError(); execution errors surface at the synchronize.
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());
    iElaps=cpuSecond()-iStart;
    printf(" Time elapsed %f sec\n",iElaps);
    if(launched)
    {
        CHECK(cudaMemcpy(B_from_gpu,B_dev,nBytes,cudaMemcpyDeviceToHost));
        // Kernels 0 and 1 are plain copies, so their expected output is the
        // input itself; every other kernel is checked against the CPU
        // transpose.
        float* expected=(transform_kernel<=1)?A_host:B_host;
        checkResult(expected,B_from_gpu,nxy);
    }

    CHECK(cudaFree(A_dev));
    CHECK(cudaFree(B_dev));
    free(A_host);
    free(B_host);
    free(B_from_gpu);
    cudaDeviceReset();
    return 0;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
# Builds the sum_arrays_uniform_memory executable from the single source sum_arrays_uniform_memory.cu.
add_executable(sum_arrays_uniform_memory sum_arrays_uniform_memory.cu) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
#include <cuda_runtime.h> | ||
#include <stdio.h> | ||
#include "freshman.h" | ||
|
||
|
||
|
||
/*
 * CPU reference: res[i] = a[i] + b[i] for i in [0, size).
 *
 * The main loop is unrolled by four; a scalar tail loop handles the
 * remaining 0-3 elements so no access goes past size when size is not a
 * multiple of four. A size of zero (or less) leaves res untouched.
 */
void sumArrays(float * a,float * b,float * res,const int size)
{
    // Largest multiple of four that does not exceed size.
    const int blocked=size-(size%4);
    int i=0;
    for(;i<blocked;i+=4)
    {
        res[i]=a[i]+b[i];
        res[i+1]=a[i+1]+b[i+1];
        res[i+2]=a[i+2]+b[i+2];
        res[i+3]=a[i+3]+b[i+3];
    }
    for(;i<size;i++)
    {
        res[i]=a[i]+b[i];
    }
}
/*
 * Element-wise sum, one element per thread: res[i] = a[i] + b[i].
 * Uses only the x dimension of the launch; threads whose index is N or
 * greater do nothing.
 */
__global__ void sumArraysGPU(float*a,float*b,float*res,int N)
{
    const int gid=threadIdx.x+blockDim.x*blockIdx.x;
    if(gid>=N)
    {
        return;
    }
    res[gid]=a[gid]+b[gid];
}
/*
 * Vector-add example using managed allocations (cudaMallocManaged).
 *
 * Fills two managed arrays on the host, times sumArraysGPU over them,
 * computes the same sum on the CPU into a malloc'd buffer, and compares the
 * two results with checkResult.
 * Returns 0 on normal completion and 1 if the host buffer cannot be
 * allocated.
 */
int main(int argc,char **argv)
{
    // set up device
    initDevice(0);

    int nElem=1<<24;
    printf("Vector size:%d\n",nElem);
    int nByte=sizeof(float)*nElem;
    float *res_h=(float*)malloc(nByte);
    if(res_h==NULL)
    {
        fprintf(stderr,"host allocation of %d bytes failed\n",nByte);
        return 1;
    }
    memset(res_h,0,nByte);

    float *a_d,*b_d,*res_d;
    CHECK(cudaMallocManaged((float**)&a_d,nByte));
    CHECK(cudaMallocManaged((float**)&b_d,nByte));
    CHECK(cudaMallocManaged((float**)&res_d,nByte));

    // Managed pointers are written directly from host code here; no explicit
    // host-to-device copy is issued.
    initialData(a_d,nElem);
    initialData(b_d,nElem);

    dim3 block(512);
    dim3 grid((nElem-1)/block.x+1);

    double iStart,iElaps;
    iStart=cpuSecond();
    sumArraysGPU<<<grid,block>>>(a_d,b_d,res_d,nElem);
    // Launch-configuration errors are only reported through
    // cudaGetLastError(); execution errors surface at the synchronize, which
    // must also complete before the host touches the managed buffers again.
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());
    iElaps=cpuSecond()-iStart;
    printf("Execution configuration<<<%d,%d>>> Time elapsed %f sec\n",grid.x,block.x,iElaps);

    // The reference must add the same operands as the kernel (a + b).
    sumArrays(a_d,b_d,res_h,nElem);

    checkResult(res_h,res_d,nElem);
    CHECK(cudaFree(a_d));
    CHECK(cudaFree(b_d));
    CHECK(cudaFree(res_d));

    free(res_h);

    return 0;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
# Builds the shared_memory_read_data executable from the single source shared_memory_read_data.cu.
add_executable(shared_memory_read_data shared_memory_read_data.cu) |
Oops, something went wrong.