sum matrix

Tony-Tan · Jan 28, 2018 · e246169 · e246169
1 parent e50b2c8
commit e246169
Show file tree

Hide file tree

Showing 28 changed files with 3,328 additions and 24 deletions.
diff --git a/3_sum_arrays/sum_arrays.cu b/3_sum_arrays/sum_arrays.cu
@@ -1,29 +1,8 @@
 #include <cuda_runtime.h>
 #include <stdio.h>
 #include "freshman.h"
-void checkResult(float * hostRef,float * gpuRef,const int N)
-{
-  double epsilon=1.0E-8;
-  for(int i=0;i<N;i++)
-  {
-    if(abs(hostRef[i]-gpuRef[i])>epsilon)
-    {
-      printf("Results don\'t match!");
-      printf("%f(hostRef[%d] )!= %f(gpuRef[%d])",hostRef[i],i,gpuRef[i],i);
-      break;
-    }
-  }
-  printf("Check result success!\n");
-}
-void initialData(float* ip,int size)
-{
-  time_t t;
-  srand((unsigned )time(&t));
-  for(int i=0;i<size;i++)
-  {
-    ip[i]=(float)(rand()&0xff)/10.0f;
-  }
-}
+
+
 void sumArrays(float * a,float * b,float * res,const int size)
 {
   for(int i=0;i<size;i+=4)

diff --git a/4_sum_arrays_timer/CMakeLists.txt b/4_sum_arrays_timer/CMakeLists.txt
@@ -0,0 +1 @@
+add_executable(sum_arrays_timer sum_arrays_timer.cu)
diff --git a/4_sum_arrays_timer/sum_arrays_timer.cu b/4_sum_arrays_timer/sum_arrays_timer.cu
@@ -0,0 +1,72 @@
+#include <cuda_runtime.h>
+#include <stdio.h>
+#include "freshman.h"
+
+
+
+// Host reference implementation: res[i] = a[i] + b[i] for every i in [0,size).
+//   a, b  - input arrays holding at least `size` floats
+//   res   - output array holding at least `size` floats
+//   size  - number of elements to add
+// The main loop is manually unrolled by four. A scalar tail loop handles the
+// remaining 0-3 elements, so a size that is not a multiple of four never reads
+// or writes past the end of the arrays.
+void sumArrays(float * a,float * b,float * res,const int size)
+{
+  int i=0;
+  // Unrolled part: only entered while a full group of four elements remains.
+  for(;i<=size-4;i+=4)
+  {
+    res[i]=a[i]+b[i];
+    res[i+1]=a[i+1]+b[i+1];
+    res[i+2]=a[i+2]+b[i+2];
+    res[i+3]=a[i+3]+b[i+3];
+  }
+  // Tail: whatever is left after the last complete group of four.
+  for(;i<size;i++)
+  {
+    res[i]=a[i]+b[i];
+  }
+}
+// Kernel: each thread adds one pair of elements and stores the sum.
+// The element is selected by threadIdx.x only; blockIdx is not consulted and
+// there is no length check.
+// NOTE(review): this appears to require a single-block launch with exactly one
+// thread per element - confirm against the caller's launch configuration.
+__global__ void sumArraysGPU(float*a,float*b,float*res)
+{
+  const int tid=threadIdx.x;
+  const float sum=a[tid]+b[tid];
+  res[tid]=sum;
+}
+// Adds two 32-element vectors on the GPU, times the kernel with the host-side
+// cpuSecond() timer, and compares the result with the CPU implementation.
+int main(int argc,char **argv)
+{
+  // set up device
+  initDevice(0);
+
+  int nElem=32;
+  printf("Vector size:%d\n",nElem);
+  int nByte=sizeof(float)*nElem;
+  float *a_h=(float*)malloc(nByte);
+  float *b_h=(float*)malloc(nByte);
+  float *res_h=(float*)malloc(nByte);
+  float *res_from_gpu_h=(float*)malloc(nByte);
+  memset(res_h,0,nByte);
+  memset(res_from_gpu_h,0,nByte);
+
+  float *a_d,*b_d,*res_d;
+  CHECK(cudaMalloc((float**)&a_d,nByte));
+  CHECK(cudaMalloc((float**)&b_d,nByte));
+  CHECK(cudaMalloc((float**)&res_d,nByte));
+
+  initialData(a_h,nElem);
+  initialData(b_h,nElem);
+
+  CHECK(cudaMemcpy(a_d,a_h,nByte,cudaMemcpyHostToDevice));
+  CHECK(cudaMemcpy(b_d,b_h,nByte,cudaMemcpyHostToDevice));
+
+  // One block holding all elements: the kernel indexes by threadIdx.x only.
+  dim3 block(nElem);
+  dim3 grid(nElem/block.x);
+
+  //timer
+  double iStart,iElaps;
+  iStart=cpuSecond();
+  sumArraysGPU<<<grid,block>>>(a_d,b_d,res_d);
+  // A kernel launch returns no status; configuration errors are reported
+  // through cudaGetLastError().
+  CHECK(cudaGetLastError());
+  // The launch is asynchronous: wait for the kernel so the measured interval
+  // covers its execution and not just the launch call.
+  CHECK(cudaDeviceSynchronize());
+  iElaps=cpuSecond()-iStart;
+  // Arguments are grid first, block second, matching the <<<grid,block>>> label.
+  printf("Execution configuration<<<%d,%d>>> Time elapsed %f sec\n",grid.x,block.x,iElaps);
+
+  CHECK(cudaMemcpy(res_from_gpu_h,res_d,nByte,cudaMemcpyDeviceToHost));
+  sumArrays(a_h,b_h,res_h,nElem);
+
+  checkResult(res_h,res_from_gpu_h,nElem);
+  CHECK(cudaFree(a_d));
+  CHECK(cudaFree(b_d));
+  CHECK(cudaFree(res_d));
+
+  free(a_h);
+  free(b_h);
+  free(res_h);
+  free(res_from_gpu_h);
+
+  return 0;
+}
diff --git a/5_thread_index/CMakeLists.txt b/5_thread_index/CMakeLists.txt
@@ -0,0 +1 @@
+add_executable(thread_index thread_index.cu)
diff --git a/5_thread_index/thread_index.cu b/5_thread_index/thread_index.cu
@@ -0,0 +1,43 @@
+#include <cuda_runtime.h>
+#include <stdio.h>
+#include "freshman.h"
+
+// Debug kernel: every in-range thread prints its thread index, block index,
+// 2D coordinate (ix,iy), the flattened index iy*nx+ix and the element of A
+// stored at that index.
+//   A  - device array of floats, read at iy*nx+ix
+//   nx - row width used to flatten the 2D coordinate
+//   ny - number of rows; together with nx it bounds the valid coordinates
+__global__ void printThreadIndex(float *A,const int nx,const int ny)
+{
+  int ix=threadIdx.x+blockIdx.x*blockDim.x;
+  int iy=threadIdx.y+blockIdx.y*blockDim.y;
+  // The grid may be rounded up past nx-by-ny; such threads have no element to
+  // read and must not touch A.
+  if(ix>=nx || iy>=ny)
+  {
+    return;
+  }
+  unsigned int idx=iy*nx+ix;
+  // A holds floats, so the value is printed with %f (an integer conversion
+  // would misinterpret the argument); idx is unsigned, hence %u.
+  printf("thread_id(%d,%d) block_id(%d,%d) coordinate(%d,%d) "
+          "global index %2u ival %f\n",threadIdx.x,threadIdx.y,
+          blockIdx.x,blockIdx.y,ix,iy,idx,A[idx]);
+}
+// Fills an 8x6 float matrix on the host, copies it to the device and launches
+// printThreadIndex so each thread reports its indices and its element.
+int main(int argc,char** argv)
+{
+  initDevice(0);
+  int nx=8,ny=6;
+  int nxy=nx*ny;
+  int nBytes=nxy*sizeof(float);
+
+  //Malloc
+  float* A_host=(float*)malloc(nBytes);
+  initialData(A_host,nxy);
+  printMatrix(A_host,nx,ny);
+
+  //cudaMalloc
+  float *A_dev=NULL;
+  CHECK(cudaMalloc((void**)&A_dev,nBytes));
+
+  // Checked like every other runtime call: a failed copy would otherwise make
+  // the kernel print whatever the fresh allocation happens to contain.
+  CHECK(cudaMemcpy(A_dev,A_host,nBytes,cudaMemcpyHostToDevice));
+
+  // Ceil-division so the grid covers the matrix even when the block size does
+  // not divide nx or ny evenly.
+  dim3 block(4,2);
+  dim3 grid((nx-1)/block.x+1,(ny-1)/block.y+1);
+
+  printThreadIndex<<<grid,block>>>(A_dev,nx,ny);
+  // A kernel launch returns no status; configuration errors are reported
+  // through cudaGetLastError().
+  CHECK(cudaGetLastError());
+
+  // Wait for the kernel to finish before releasing the buffer it reads.
+  CHECK(cudaDeviceSynchronize());
+  CHECK(cudaFree(A_dev));
+  free(A_host);
+
+  CHECK(cudaDeviceReset());
+  return 0;
+}
diff --git a/6_sum_matrix/CMakeLists.txt b/6_sum_matrix/CMakeLists.txt
@@ -0,0 +1 @@
+add_executable(sum_matrix sum_matrix.cu)
diff --git a/6_sum_matrix/sum_matrix.cu b/6_sum_matrix/sum_matrix.cu
@@ -0,0 +1,87 @@
+#include <cuda_runtime.h>
+#include <stdio.h>
+#include "freshman.h"
+// Host reference: MatC = MatA + MatB, element by element.
+// The three buffers are walked as ny consecutive rows of nx floats each.
+//   MatA, MatB - input matrices
+//   MatC       - output matrix
+//   nx         - number of elements per row
+//   ny         - number of rows
+void sumMatrix2D_CPU(float * MatA,float * MatB,float * MatC,int nx,int ny)
+{
+  float * rowA=MatA;
+  float * rowB=MatB;
+  float * rowC=MatC;
+  // After each row, advance all three row pointers by one row width.
+  for(int row=0;row<ny;++row,rowA+=nx,rowB+=nx,rowC+=nx)
+  {
+    for(int col=0;col<nx;++col)
+    {
+      rowC[col]=rowA[col]+rowB[col];
+    }
+  }
+}
+// Kernel: MatC = MatA + MatB, one element per thread on a 2D launch.
+//   MatA, MatB - input matrices, ny rows of nx floats stored row after row
+//   MatC       - output matrix with the same layout
+//   nx         - number of elements per row (row stride)
+//   ny         - number of rows
+// Threads whose (ix,iy) lies outside nx-by-ny do nothing, so the grid may be
+// rounded up past the matrix size.
+__global__ void sumMatrix2D(float * MatA,float * MatB,float * MatC,int nx,int ny)
+{
+    int ix=threadIdx.x+blockDim.x*blockIdx.x;
+    int iy=threadIdx.y+blockDim.y*blockIdx.y;
+    // A row is nx elements long, so the row index is scaled by nx. Scaling by
+    // ny is only correct for square matrices and disagrees with the CPU
+    // reference whenever nx != ny.
+    int idx=ix+iy*nx;
+    if (ix<nx && iy<ny)
+    {
+      MatC[idx]=MatA[idx]+MatB[idx];
+    }
+}
+// Adds two 4096x4096 float matrices on the GPU and on the CPU, prints the time
+// of each run measured with cpuSecond(), and compares the two results.
+int main(int argc,char** argv)
+{
+  printf("starting...\n");
+  initDevice(0);
+  int nx=1<<12;
+  int ny=1<<12;
+  int nxy=nx*ny;
+  int nBytes=nxy*sizeof(float);
+
+  //Malloc
+  float* A_host=(float*)malloc(nBytes);
+  float* B_host=(float*)malloc(nBytes);
+  float* C_host=(float*)malloc(nBytes);
+  float* C_from_gpu=(float*)malloc(nBytes);
+  initialData(A_host,nxy);
+  initialData(B_host,nxy);
+
+  //cudaMalloc
+  float *A_dev=NULL;
+  float *B_dev=NULL;
+  float *C_dev=NULL;
+  CHECK(cudaMalloc((void**)&A_dev,nBytes));
+  CHECK(cudaMalloc((void**)&B_dev,nBytes));
+  CHECK(cudaMalloc((void**)&C_dev,nBytes));
+
+
+  CHECK(cudaMemcpy(A_dev,A_host,nBytes,cudaMemcpyHostToDevice));
+  CHECK(cudaMemcpy(B_dev,B_host,nBytes,cudaMemcpyHostToDevice));
+
+  // 32x32 threads per block; ceil-division so the grid covers the whole
+  // matrix even when the block size does not divide nx or ny evenly.
+  int dimx=32;
+  int dimy=32;
+  dim3 block(dimx,dimy);
+  dim3 grid((nx-1)/block.x+1,(ny-1)/block.y+1);
+  double iStart=cpuSecond();
+  sumMatrix2D<<<grid,block>>>(A_dev,B_dev,C_dev,nx,ny);
+  // A kernel launch returns no status; configuration errors are reported
+  // through cudaGetLastError().
+  CHECK(cudaGetLastError());
+
+  // The launch is asynchronous: wait for the kernel before stopping the timer.
+  CHECK(cudaDeviceSynchronize());
+  double iElaps=cpuSecond()-iStart;
+  printf("GPU Execution configuration<<<(%d,%d),(%d,%d)>>> Time elapsed %f sec\n",
+        grid.x,grid.y,block.x,block.y,iElaps);
+  // Checked so a failed copy is reported here instead of surfacing later as a
+  // confusing result mismatch.
+  CHECK(cudaMemcpy(C_from_gpu,C_dev,nBytes,cudaMemcpyDeviceToHost));
+  iStart=cpuSecond();
+  sumMatrix2D_CPU(A_host,B_host,C_host,nx,ny);
+  iElaps=cpuSecond()-iStart;
+  printf("CPU Execution Time elapsed %f sec\n",iElaps);
+  checkResult(C_host,C_from_gpu,nxy);
+  CHECK(cudaFree(A_dev));
+  CHECK(cudaFree(B_dev));
+  CHECK(cudaFree(C_dev));
+  free(A_host);
+  free(B_host);
+  free(C_host);
+  free(C_from_gpu);
+  CHECK(cudaDeviceReset());
+  return 0;
+}
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,7 +1,10 @@
-cmake_minimum_required(VERSION 3.10 FATAL_ERROR)
+cmake_minimum_required(VERSION 3.9 FATAL_ERROR)
 Project(CUDA_Freshman CXX C CUDA)
 include_directories(./include)
 add_subdirectory(0_hello_world)
 add_subdirectory(1_check_dimension)
 add_subdirectory(2_grid_block)
 add_subdirectory(3_sum_arrays)
+add_subdirectory(4_sum_arrays_timer)
+add_subdirectory(5_thread_index)
+add_subdirectory(6_sum_matrix)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		add_executable(sum_arrays_timer sum_arrays_timer.cu)