Skip to content

Commit

Permalink
sum matrix
Browse files Browse the repository at this point in the history
  • Loading branch information
Tony-Tan committed Jan 28, 2018
1 parent e50b2c8 commit e246169
Show file tree
Hide file tree
Showing 28 changed files with 3,328 additions and 24 deletions.
25 changes: 2 additions & 23 deletions 3_sum_arrays/sum_arrays.cu
Original file line number Diff line number Diff line change
@@ -1,29 +1,8 @@
#include <cuda_runtime.h>
#include <stdio.h>
#include "freshman.h"
/*
 * Compare a host reference array against a GPU result, element by element.
 *
 * hostRef : reference values computed on the host
 * gpuRef  : values copied back from the device
 * N       : number of elements to compare
 *
 * Prints a mismatch report for the first element whose absolute difference
 * exceeds epsilon and returns immediately; prints the success message only
 * when every element matched.
 */
void checkResult(float * hostRef,float * gpuRef,const int N)
{
    const double epsilon=1.0E-8;
    for(int i=0;i<N;i++)
    {
        // Take the magnitude by hand: an unqualified abs() can resolve to the
        // integer overload and silently truncate the difference to 0.
        float diff=hostRef[i]-gpuRef[i];
        if(diff<0.0f)
        {
            diff=-diff;
        }
        if(diff>epsilon)
        {
            printf("Results don\'t match!");
            printf("%f(hostRef[%d] )!= %f(gpuRef[%d])\n",hostRef[i],i,gpuRef[i],i);
            // Return (not break) so the success message below is not printed
            // after a reported mismatch.
            return;
        }
    }
    printf("Check result success!\n");
}
/*
 * Fill ip[0..size) with pseudo-random floats.
 *
 * Each value is the low byte of rand() divided by 10, i.e. in [0.0, 25.5].
 * The generator is reseeded from the current time on every call.
 */
void initialData(float* ip,int size)
{
    time_t now;
    srand((unsigned )time(&now));

    int k=0;
    while(k<size)
    {
        const int lowByte=rand()&0xff;   // keep only the lowest 8 bits
        ip[k]=(float)lowByte/10.0f;
        ++k;
    }
}


void sumArrays(float * a,float * b,float * res,const int size)
{
for(int i=0;i<size;i+=4)
Expand Down
1 change: 1 addition & 0 deletions 4_sum_arrays_timer/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Build the sum_arrays_timer executable from its single CUDA source file.
add_executable(sum_arrays_timer sum_arrays_timer.cu)
72 changes: 72 additions & 0 deletions 4_sum_arrays_timer/sum_arrays_timer.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#include <cuda_runtime.h>
#include <stdio.h>
#include "freshman.h"



/*
 * Host reference implementation: res[i] = a[i] + b[i] for i in [0, size).
 *
 * The main loop is manually unrolled by four; a scalar tail loop handles the
 * remaining 0-3 elements so that sizes which are not a multiple of four do
 * not read or write past the end of the arrays.
 */
void sumArrays(float * a,float * b,float * res,const int size)
{
    // Largest multiple of 4 that does not exceed size.
    const int unrolledEnd=size-(size%4);
    int i=0;
    for(;i<unrolledEnd;i+=4)
    {
        res[i]=a[i]+b[i];
        res[i+1]=a[i+1]+b[i+1];
        res[i+2]=a[i+2]+b[i+2];
        res[i+3]=a[i+3]+b[i+3];
    }
    // Tail: elements left over after the unrolled part.
    for(;i<size;i++)
    {
        res[i]=a[i]+b[i];
    }
}
/*
 * Element-wise vector add on the device: res[i] = a[i] + b[i].
 *
 * Each thread handles one element, selected by its global 1D index
 * (blockIdx.x * blockDim.x + threadIdx.x), so the kernel also works when the
 * launch uses more than one block. For a single-block launch the index
 * reduces to threadIdx.x.
 *
 * There is no length parameter and therefore no bounds check: the caller must
 * launch exactly as many threads (gridDim.x * blockDim.x) as there are
 * elements in a, b and res.
 */
__global__ void sumArraysGPU(float*a,float*b,float*res)
{
    int i=blockIdx.x*blockDim.x+threadIdx.x;
    res[i]=a[i]+b[i];
}
/*
 * Adds two 32-element float vectors on the GPU, times the kernel with the
 * host timer cpuSecond(), and checks the result against the host
 * implementation sumArrays().
 *
 * Returns 0 on success, 1 if a host allocation fails. CUDA API failures are
 * handled by the CHECK macro (defined outside this file).
 */
int main(int argc,char **argv)
{
    // set up device
    initDevice(0);

    int nElem=32;
    printf("Vector size:%d\n",nElem);
    int nByte=sizeof(float)*nElem;
    float *a_h=(float*)malloc(nByte);
    float *b_h=(float*)malloc(nByte);
    float *res_h=(float*)malloc(nByte);
    float *res_from_gpu_h=(float*)malloc(nByte);
    if(a_h==NULL||b_h==NULL||res_h==NULL||res_from_gpu_h==NULL)
    {
        fprintf(stderr,"Host memory allocation failed\n");
        // free(NULL) is a no-op, so releasing all four is safe.
        free(a_h);
        free(b_h);
        free(res_h);
        free(res_from_gpu_h);
        return 1;
    }
    memset(res_h,0,nByte);
    memset(res_from_gpu_h,0,nByte);

    float *a_d,*b_d,*res_d;
    CHECK(cudaMalloc((float**)&a_d,nByte));
    CHECK(cudaMalloc((float**)&b_d,nByte));
    CHECK(cudaMalloc((float**)&res_d,nByte));

    initialData(a_h,nElem);
    initialData(b_h,nElem);

    CHECK(cudaMemcpy(a_d,a_h,nByte,cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(b_d,b_h,nByte,cudaMemcpyHostToDevice));

    dim3 block(nElem);
    dim3 grid(nElem/block.x);

    //timer
    double iStart,iElaps;
    iStart=cpuSecond();
    sumArraysGPU<<<grid,block>>>(a_d,b_d,res_d);
    // Launch-configuration errors are only reported through cudaGetLastError().
    CHECK(cudaGetLastError());
    // Kernel launches are asynchronous: wait for completion before stopping
    // the timer, otherwise only the launch overhead is measured.
    CHECK(cudaDeviceSynchronize());
    iElaps=cpuSecond()-iStart;
    // Launch syntax is <<<grid,block>>>, so print grid first, then block.
    printf("Execution configuration<<<%d,%d>>> Time elapsed %f sec\n",grid.x,block.x,iElaps);

    CHECK(cudaMemcpy(res_from_gpu_h,res_d,nByte,cudaMemcpyDeviceToHost));
    sumArrays(a_h,b_h,res_h,nElem);

    checkResult(res_h,res_from_gpu_h,nElem);
    CHECK(cudaFree(a_d));
    CHECK(cudaFree(b_d));
    CHECK(cudaFree(res_d));

    free(a_h);
    free(b_h);
    free(res_h);
    free(res_from_gpu_h);

    return 0;
}
1 change: 1 addition & 0 deletions 5_thread_index/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Build the thread_index executable from its single CUDA source file.
add_executable(thread_index thread_index.cu)
43 changes: 43 additions & 0 deletions 5_thread_index/thread_index.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#include <cuda_runtime.h>
#include <stdio.h>
#include "freshman.h"

/*
 * Debug kernel: every thread prints its thread index, block index, 2D
 * coordinate (ix, iy), the flattened index iy*nx+ix, and the value of A at
 * that index.
 *
 * A  : device array holding at least nx*ny floats
 * nx : number of columns (row stride of A)
 * ny : number of rows
 *
 * Threads whose coordinate falls outside the nx-by-ny extent return without
 * reading A, so a grid rounded up past the matrix edge is safe.
 */
__global__ void printThreadIndex(float *A,const int nx,const int ny)
{
    int ix=threadIdx.x+blockIdx.x*blockDim.x;
    int iy=threadIdx.y+blockIdx.y*blockDim.y;
    if(ix>=nx||iy>=ny)
    {
        return;
    }
    unsigned int idx=iy*nx+ix;
    // A[idx] is a float: it must be printed with %f (it is promoted to double
    // in the variadic call); %d would reinterpret the bits and print garbage.
    printf("thread_id(%d,%d) block_id(%d,%d) coordinate(%d,%d)"
           "global index %2u ival %f\n",threadIdx.x,threadIdx.y,
           blockIdx.x,blockIdx.y,ix,iy,idx,A[idx]);
}
/*
 * Builds an 8x6 float matrix on the host, copies it to the device and
 * launches printThreadIndex with 4x2 blocks so each thread reports the
 * element it maps to.
 *
 * Returns 0 on success, 1 if the host allocation fails. CUDA API failures are
 * handled by the CHECK macro (defined outside this file).
 */
int main(int argc,char** argv)
{
    initDevice(0);
    int nx=8,ny=6;
    int nxy=nx*ny;
    int nBytes=nxy*sizeof(float);

    //Malloc
    float* A_host=(float*)malloc(nBytes);
    if(A_host==NULL)
    {
        fprintf(stderr,"Host memory allocation failed\n");
        return 1;
    }
    initialData(A_host,nxy);
    printMatrix(A_host,nx,ny);

    //cudaMalloc
    float *A_dev=NULL;
    CHECK(cudaMalloc((void**)&A_dev,nBytes));

    CHECK(cudaMemcpy(A_dev,A_host,nBytes,cudaMemcpyHostToDevice));

    dim3 block(4,2);
    // Ceiling division so the grid covers the whole matrix.
    dim3 grid((nx-1)/block.x+1,(ny-1)/block.y+1);

    printThreadIndex<<<grid,block>>>(A_dev,nx,ny);
    // Launch-configuration errors are only reported through cudaGetLastError().
    CHECK(cudaGetLastError());

    // Waiting for the kernel also flushes the device-side printf buffer.
    CHECK(cudaDeviceSynchronize());
    CHECK(cudaFree(A_dev));
    free(A_host);

    CHECK(cudaDeviceReset());
    return 0;
}
1 change: 1 addition & 0 deletions 6_sum_matrix/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Build the sum_matrix executable from its single CUDA source file.
add_executable(sum_matrix sum_matrix.cu)
87 changes: 87 additions & 0 deletions 6_sum_matrix/sum_matrix.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
#include <cuda_runtime.h>
#include <stdio.h>
#include "freshman.h"
/*
 * Host reference for the matrix sum: MatC = MatA + MatB.
 *
 * The matrices are stored contiguously, one row of nx elements after
 * another, for ny rows.
 */
void sumMatrix2D_CPU(float * MatA,float * MatB,float * MatC,int nx,int ny)
{
    // Offset of the first element of the current row.
    size_t rowStart=0;
    for(int row=0;row<ny;++row)
    {
        const float *srcA=MatA+rowStart;
        const float *srcB=MatB+rowStart;
        float *dst=MatC+rowStart;
        for(int col=0;col<nx;++col)
        {
            dst[col]=srcA[col]+srcB[col];
        }
        rowStart+=nx;
    }
}
/*
 * Device matrix sum: MatC = MatA + MatB, one element per thread.
 *
 * Expects a 2D launch in which x covers the nx columns and y covers the ny
 * rows. The matrices are stored contiguously with a row stride of nx
 * elements. Threads outside the nx-by-ny extent do nothing.
 */
__global__ void sumMatrix2D(float * MatA,float * MatB,float * MatC,int nx,int ny)
{
    int ix=threadIdx.x+blockDim.x*blockIdx.x;
    int iy=threadIdx.y+blockDim.y*blockIdx.y;
    // Row stride is the row length nx, not the row count ny; using ny only
    // happens to work when the matrix is square.
    int idx=ix+iy*nx;
    if (ix<nx && iy<ny)
    {
        MatC[idx]=MatA[idx]+MatB[idx];
    }
}
/*
 * Sums two 4096x4096 float matrices on the GPU and on the CPU, prints the
 * elapsed time of each, and compares the results.
 *
 * Returns 0 on success, 1 if a host allocation fails. CUDA API failures are
 * handled by the CHECK macro (defined outside this file).
 */
int main(int argc,char** argv)
{
    printf("strating...\n");
    initDevice(0);
    int nx=1<<12;
    int ny=1<<12;
    int nxy=nx*ny;
    int nBytes=nxy*sizeof(float);

    //Malloc
    float* A_host=(float*)malloc(nBytes);
    float* B_host=(float*)malloc(nBytes);
    float* C_host=(float*)malloc(nBytes);
    float* C_from_gpu=(float*)malloc(nBytes);
    if(A_host==NULL||B_host==NULL||C_host==NULL||C_from_gpu==NULL)
    {
        fprintf(stderr,"Host memory allocation failed\n");
        // free(NULL) is a no-op, so releasing all four is safe.
        free(A_host);
        free(B_host);
        free(C_host);
        free(C_from_gpu);
        return 1;
    }
    initialData(A_host,nxy);
    initialData(B_host,nxy);

    //cudaMalloc
    float *A_dev=NULL;
    float *B_dev=NULL;
    float *C_dev=NULL;
    CHECK(cudaMalloc((void**)&A_dev,nBytes));
    CHECK(cudaMalloc((void**)&B_dev,nBytes));
    CHECK(cudaMalloc((void**)&C_dev,nBytes));


    CHECK(cudaMemcpy(A_dev,A_host,nBytes,cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(B_dev,B_host,nBytes,cudaMemcpyHostToDevice));

    int dimx=32;
    int dimy=32;
    dim3 block(dimx,dimy);
    // Ceiling division so the grid covers the whole matrix.
    dim3 grid((nx-1)/block.x+1,(ny-1)/block.y+1);
    double iStart=cpuSecond();
    sumMatrix2D<<<grid,block>>>(A_dev,B_dev,C_dev,nx,ny);
    // Launch-configuration errors are only reported through cudaGetLastError().
    CHECK(cudaGetLastError());

    // The launch is asynchronous; wait for it before stopping the timer.
    CHECK(cudaDeviceSynchronize());
    double iElaps=cpuSecond()-iStart;
    printf("GPU Execution configuration<<<(%d,%d),(%d,%d)>>> Time elapsed %f sec\n",
           grid.x,grid.y,block.x,block.y,iElaps);
    CHECK(cudaMemcpy(C_from_gpu,C_dev,nBytes,cudaMemcpyDeviceToHost));
    iStart=cpuSecond();
    sumMatrix2D_CPU(A_host,B_host,C_host,nx,ny);
    iElaps=cpuSecond()-iStart;
    printf("CPU Execution Time elapsed %f sec\n",iElaps);
    checkResult(C_host,C_from_gpu,nxy);
    CHECK(cudaFree(A_dev));
    CHECK(cudaFree(B_dev));
    CHECK(cudaFree(C_dev));
    free(A_host);
    free(B_host);
    free(C_host);
    free(C_from_gpu);
    CHECK(cudaDeviceReset());
    return 0;
}
5 changes: 4 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
# Top-level build script: declares the project and adds one subdirectory per example.
# NOTE(review): two minimum-version declarations appear back to back (3.10 then 3.9);
# this looks like the old and new line of the diff shown together - confirm which one the file keeps.
cmake_minimum_required(VERSION 3.10 FATAL_ERROR)
cmake_minimum_required(VERSION 3.9 FATAL_ERROR)
# Enable the C++, C and CUDA languages for the project.
Project(CUDA_Freshman CXX C CUDA)
# Add ./include to the header search path for all targets.
include_directories(./include)
# One subdirectory per example program.
add_subdirectory(0_hello_world)
add_subdirectory(1_check_dimension)
add_subdirectory(2_grid_block)
add_subdirectory(3_sum_arrays)
add_subdirectory(4_sum_arrays_timer)
add_subdirectory(5_thread_index)
add_subdirectory(6_sum_matrix)
Loading

0 comments on commit e246169

Please sign in to comment.