1,2,3

Tony-Tan · Jan 22, 2018 · e50b2c8 · e50b2c8
1 parent d824450
commit e50b2c8
Show file tree

Hide file tree

Showing 10 changed files with 153 additions and 2 deletions.
diff --git a/1_0_hello_world/CMakeLists.txt → 0_hello_world/CMakeLists.txt b/1_0_hello_world/CMakeLists.txt → 0_hello_world/CMakeLists.txt
diff --git a/1_0_hello_world/hello_world.cu → 0_hello_world/hello_world.cu b/1_0_hello_world/hello_world.cu → 0_hello_world/hello_world.cu
@@ -3,7 +3,7 @@ __global__ void hello_world(void)
 {
   printf("GPU: Hello world!\n");
 }
-int main()
+int main(int argc,char **argv)
 {
   printf("CPU: Hello world!\n");
   hello_world<<<1,10>>>();

diff --git a/1_check_dimension/CMakeLists.txt b/1_check_dimension/CMakeLists.txt
@@ -0,0 +1 @@
+add_executable(check_dimension check_dimension.cu)
diff --git a/1_check_dimension/check_dimension.cu b/1_check_dimension/check_dimension.cu
@@ -0,0 +1,20 @@
+#include <cuda_runtime.h>
+#include <stdio.h>
+// checkIndex - debug kernel.
+// Every thread prints its own threadIdx and blockIdx together with the
+// blockDim and gridDim it was launched with. Takes no arguments and touches
+// no memory; all output goes through device-side printf.
+__global__ void checkIndex(void)
+{
+  // Adjacent string literals are concatenated by the compiler into a single
+  // format string; the two spaces before "gridDim" are intentional.
+  printf("threadIdx:(%d,%d,%d) blockIdx:(%d,%d,%d) blockDim:(%d,%d,%d)"
+         "  gridDim(%d,%d,%d)\n",
+         threadIdx.x,threadIdx.y,threadIdx.z,
+         blockIdx.x,blockIdx.y,blockIdx.z,
+         blockDim.x,blockDim.y,blockDim.z,
+         gridDim.x,gridDim.y,gridDim.z);
+}
+// Entry point of the check_dimension sample.
+// Builds a 1-D launch configuration covering nElem elements, prints it from
+// the host, then launches checkIndex so that every thread reports the same
+// configuration as seen from the device.
+// Returns 0 on success, 1 when the launch or the kernel execution failed.
+int main(int argc,char **argv)
+{
+  int nElem=6;
+  dim3 block(3);
+  // Ceiling division: enough blocks to cover nElem even when it is not a
+  // multiple of block.x.
+  dim3 grid((nElem+block.x-1)/block.x);
+  // dim3 members are unsigned int, hence %u.
+  printf("grid.x %u grid.y %u grid.z %u\n",grid.x,grid.y,grid.z);
+  printf("block.x %u block.y %u block.z %u\n",block.x,block.y,block.z);
+  checkIndex<<<grid,block>>>();
+  // A kernel launch returns no status of its own: cudaGetLastError() reports
+  // an invalid launch, cudaDeviceSynchronize() waits for the kernel and
+  // reports errors raised while it ran.
+  cudaError_t status=cudaGetLastError();
+  if(status==cudaSuccess)
+  {
+    status=cudaDeviceSynchronize();
+  }
+  if(status!=cudaSuccess)
+  {
+    printf("ERROR: checkIndex failed: %s\n",cudaGetErrorString(status));
+    cudaDeviceReset();
+    return 1;
+  }
+  cudaDeviceReset();
+  return 0;
+}
diff --git a/2_grid_block/CMakeLists.txt b/2_grid_block/CMakeLists.txt
@@ -0,0 +1 @@
+add_executable(grid_block grid_block.cu)
diff --git a/2_grid_block/grid_block.cu b/2_grid_block/grid_block.cu
@@ -0,0 +1,24 @@
+#include <cuda_runtime.h>
+#include <stdio.h>
+// Entry point of the grid_block sample.
+// For a fixed element count, prints how many blocks are needed for each of
+// several block sizes (1024, 512, 256, 128). No kernel is launched.
+int main(int argc,char ** argv)
+{
+  const int nElem=1024;
+  const unsigned int blockSizes[]={1024,512,256,128};
+  const int nConfigs=(int)(sizeof(blockSizes)/sizeof(blockSizes[0]));
+
+  dim3 block(blockSizes[0]);
+  dim3 grid(1);
+  for(int cfg=0;cfg<nConfigs;cfg++)
+  {
+    block.x=blockSizes[cfg];
+    // Ceiling division written as (n-1)/b+1.
+    grid.x=(nElem-1)/block.x+1;
+    printf("grid.x %d block.x %d\n",grid.x,block.x);
+  }
+
+  cudaDeviceReset();
+  return 0;
+}
diff --git a/3_sum_arrays/CMakeLists.txt b/3_sum_arrays/CMakeLists.txt
@@ -0,0 +1 @@
+add_executable(sum_arrays sum_arrays.cu)
diff --git a/3_sum_arrays/sum_arrays.cu b/3_sum_arrays/sum_arrays.cu
@@ -0,0 +1,87 @@
+#include <cuda_runtime.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include "freshman.h"
+// checkResult - compare a host-computed reference against a GPU result.
+//   hostRef : reference values computed on the CPU (N elements)
+//   gpuRef  : values copied back from the device (N elements)
+//   N       : number of elements to compare
+// Prints the first mismatching element and returns immediately, or prints a
+// success message when every element agrees within epsilon.
+void checkResult(float * hostRef,float * gpuRef,const int N)
+{
+  const double epsilon=1.0E-8;
+  for(int i=0;i<N;i++)
+  {
+    // fabs on double avoids the integer abs() overload, which would truncate
+    // the difference. The negated <= also treats a NaN difference as a
+    // mismatch.
+    const double diff=fabs((double)hostRef[i]-(double)gpuRef[i]);
+    if(!(diff<=epsilon))
+    {
+      printf("Results don\'t match!\n");
+      printf("%f(hostRef[%d] )!= %f(gpuRef[%d])\n",hostRef[i],i,gpuRef[i],i);
+      // Return rather than break, so the success message below is not
+      // printed after a failure.
+      return;
+    }
+  }
+  printf("Check result success!\n");
+}
+// initialData - fill ip[0..size-1] with pseudo-random values.
+//   ip   : destination buffer (host memory), at least `size` floats
+//   size : number of elements to write
+// Each value is (rand() & 0xff) / 10, i.e. in the range [0.0, 25.5].
+void initialData(float* ip,int size)
+{
+  // Seed the generator only on the first call. Re-seeding with time() on
+  // every call restarts the same sequence whenever two calls happen within
+  // the same second, which made consecutive buffers identical.
+  static int seeded=0;
+  if(!seeded)
+  {
+    srand((unsigned)time(NULL));
+    seeded=1;
+  }
+  for(int i=0;i<size;i++)
+  {
+    ip[i]=(float)(rand()&0xff)/10.0f;
+  }
+}
+// sumArrays - host reference implementation of element-wise addition.
+//   a, b : input arrays, at least `size` floats each
+//   res  : output array, at least `size` floats; res[i] = a[i] + b[i]
+//   size : number of elements; any value is accepted (<= 0 does nothing)
+void sumArrays(float * a,float * b,float * res,const int size)
+{
+  // Largest multiple of 4 that does not exceed size (0 when size <= 0).
+  const int unrolledEnd=(size>0)?(size-size%4):0;
+  int i=0;
+  // Main loop, manually unrolled four times.
+  for(;i<unrolledEnd;i+=4)
+  {
+    res[i]=a[i]+b[i];
+    res[i+1]=a[i+1]+b[i+1];
+    res[i+2]=a[i+2]+b[i+2];
+    res[i+3]=a[i+3]+b[i+3];
+  }
+  // Tail: the remaining 0..3 elements. Without this the unrolled loop would
+  // read and write past the end when size is not a multiple of 4.
+  for(;i<size;i++)
+  {
+    res[i]=a[i]+b[i];
+  }
+}
+// sumArraysGPU - element-wise addition on the device: res[i] = a[i] + b[i].
+//   a, b : input arrays in device memory
+//   res  : output array in device memory
+// Launch layout: 1-D grid of 1-D blocks, one thread per element.
+// Precondition: the kernel receives no element count and performs no bounds
+// check, so gridDim.x * blockDim.x must not exceed the length of the arrays.
+__global__ void sumArraysGPU(float*a,float*b,float*res)
+{
+  // Global index. With a single block this equals threadIdx.x; with several
+  // blocks each block now handles its own slice instead of every block
+  // rewriting the first blockDim.x elements.
+  int i=blockIdx.x*blockDim.x+threadIdx.x;
+  res[i]=a[i]+b[i];
+}
+// Entry point of the sum_arrays sample.
+// Adds two random vectors on the GPU and on the CPU, then compares the two
+// results with checkResult. Returns 0 on completion, 1 if a host allocation
+// fails; any failing CUDA call terminates the program through CHECK.
+int main(int argc,char **argv)
+{
+  int dev = 0;
+  CHECK(cudaSetDevice(dev));
+
+  int nElem=32;
+  printf("Vector size:%d\n",nElem);
+  int nByte=sizeof(float)*nElem;
+
+  // Host buffers: two inputs, the CPU reference and the copy of the GPU result.
+  float *a_h=(float*)malloc(nByte);
+  float *b_h=(float*)malloc(nByte);
+  float *res_h=(float*)malloc(nByte);
+  float *res_from_gpu_h=(float*)malloc(nByte);
+  if(a_h==NULL||b_h==NULL||res_h==NULL||res_from_gpu_h==NULL)
+  {
+    printf("ERROR: host allocation of %d bytes failed\n",nByte);
+    // free(NULL) is a no-op, so releasing all four pointers is safe.
+    free(a_h);
+    free(b_h);
+    free(res_h);
+    free(res_from_gpu_h);
+    return 1;
+  }
+  memset(res_h,0,nByte);
+  memset(res_from_gpu_h,0,nByte);
+
+  // Device buffers.
+  float *a_d,*b_d,*res_d;
+  CHECK(cudaMalloc((float**)&a_d,nByte));
+  CHECK(cudaMalloc((float**)&b_d,nByte));
+  CHECK(cudaMalloc((float**)&res_d,nByte));
+
+  initialData(a_h,nElem);
+  initialData(b_h,nElem);
+
+  CHECK(cudaMemcpy(a_d,a_h,nByte,cudaMemcpyHostToDevice));
+  CHECK(cudaMemcpy(b_d,b_h,nByte,cudaMemcpyHostToDevice));
+
+  // One block holding all nElem threads, so grid.x evaluates to 1.
+  dim3 block(nElem);
+  dim3 grid(nElem/block.x);
+  sumArraysGPU<<<grid,block>>>(a_d,b_d,res_d);
+  // A launch returns no status itself; an invalid configuration is only
+  // visible through cudaGetLastError().
+  CHECK(cudaGetLastError());
+  // Printed in launch-syntax order: <<<grid,block>>>.
+  printf("Execution configuration<<<%u,%u>>>\n",grid.x,block.x);
+
+  // Blocking copy on the default stream: it completes only after the kernel
+  // has finished, and surfaces errors raised while the kernel ran.
+  CHECK(cudaMemcpy(res_from_gpu_h,res_d,nByte,cudaMemcpyDeviceToHost));
+  sumArrays(a_h,b_h,res_h,nElem);
+
+  checkResult(res_h,res_from_gpu_h,nElem);
+  CHECK(cudaFree(a_d));
+  CHECK(cudaFree(b_d));
+  CHECK(cudaFree(res_d));
+
+  free(a_h);
+  free(b_h);
+  free(res_h);
+  free(res_from_gpu_h);
+
+  return 0;
+}
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,3 +1,7 @@
 cmake_minimum_required(VERSION 3.10 FATAL_ERROR)
 Project(CUDA_Freshman CXX C CUDA)
-add_subdirectory(1_0_hello_world)
+include_directories(./include)
+add_subdirectory(0_hello_world)
+add_subdirectory(1_check_dimension)
+add_subdirectory(2_grid_block)
+add_subdirectory(3_sum_arrays)
diff --git a/include/freshman.h b/include/freshman.h
@@ -0,0 +1,13 @@
+#ifndef FRESHMAN_H
+#define FRESHMAN_H
+// The macro below uses printf, exit and the CUDA runtime error API; include
+// them here so this header can be used on its own.
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda_runtime.h>
+// CHECK(call) - evaluate a CUDA runtime call exactly once and abort on failure.
+//   call : an expression yielding cudaError_t
+// On any result other than cudaSuccess, prints the source location, the
+// numeric error code and cudaGetErrorString()'s description, then exit(1).
+// Wrapped in do { ... } while(0) so that `CHECK(x);` is a single statement
+// and stays correct inside an unbraced if/else; the trailing semicolon is
+// required. The local name is deliberately unusual so it cannot collide with
+// an identifier used inside `call`.
+#define CHECK(call)\
+do\
+{\
+  const cudaError_t freshman_check_err_=(call);\
+  if(freshman_check_err_!=cudaSuccess)\
+  {\
+      printf("ERROR: %s:%d,",__FILE__,__LINE__);\
+      printf("code:%d,reason:%s\n",(int)freshman_check_err_,cudaGetErrorString(freshman_check_err_));\
+      exit(1);\
+  }\
+}while(0)
+#endif//FRESHMAN_H
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		add_executable(check_dimension check_dimension.cu)