memory

Tony-Tan · May 2, 2018 · 1ddb872 · 1ddb872
1 parent 4b8efa8
commit 1ddb872
Show file tree

Hide file tree

Showing 9 changed files with 258 additions and 1 deletion.
diff --git a/14_global_variable/CMakeLists.txt b/14_global_variable/CMakeLists.txt
@@ -0,0 +1 @@
+add_executable(global_variable global_variable.cu)
diff --git a/14_global_variable/global_variable.cu b/14_global_variable/global_variable.cu
@@ -0,0 +1,19 @@
+#include <cuda_runtime.h>
+#include <stdio.h>
+// Single float that lives in device global memory. Host code in this file
+// reaches it only through cudaMemcpyToSymbol / cudaMemcpyFromSymbol.
+__device__ float devData;
+
+// Prints the current value of devData from the device, then adds 2 to it.
+// main launches this kernel as <<<1,1>>>; the update of devData is a plain
+// (non-atomic) read-modify-write, so launching more than one thread would
+// race on it.
+__global__ void checkGlobalVariable()
+{
+    printf("Device: The value of the global variable is %f\n",devData);
+    // Float literal: keeps the addition in single precision instead of
+    // promoting devData to double and converting the sum back to float.
+    devData+=2.0f;
+}
+// Reports a failed CUDA runtime call on stderr.
+// Returns true when err is cudaSuccess, false otherwise.
+static bool cudaSucceeded(cudaError_t err,const char *what)
+{
+    if(err==cudaSuccess)
+        return true;
+    fprintf(stderr,"%s failed: %s\n",what,cudaGetErrorString(err));
+    return false;
+}
+
+// Writes 3.14 into the device symbol devData, runs checkGlobalVariable once,
+// reads the symbol back and prints the value before and after.
+// Returns EXIT_SUCCESS, or EXIT_FAILURE as soon as any CUDA call reports an
+// error (previously every return code was ignored).
+int main()
+{
+    float value=3.14f;
+    if(!cudaSucceeded(cudaMemcpyToSymbol(devData,&value,sizeof(float)),"cudaMemcpyToSymbol"))
+        return EXIT_FAILURE;
+    printf("Host: copy %f to the global variable\n",value);
+    checkGlobalVariable<<<1,1>>>();
+    // A launch has no return value; configuration errors are only visible
+    // through cudaGetLastError().
+    if(!cudaSucceeded(cudaGetLastError(),"checkGlobalVariable launch"))
+        return EXIT_FAILURE;
+    // Blocking copy on the default stream: it completes after the kernel and
+    // also reports any error raised while the kernel was running.
+    if(!cudaSucceeded(cudaMemcpyFromSymbol(&value,devData,sizeof(float)),"cudaMemcpyFromSymbol"))
+        return EXIT_FAILURE;
+    printf("Host: the value changed by the kernel to %f \n",value);
+    if(!cudaSucceeded(cudaDeviceReset(),"cudaDeviceReset"))
+        return EXIT_FAILURE;
+    return EXIT_SUCCESS;
+}
diff --git a/15_pine_memory/CMakeLists.txt b/15_pine_memory/CMakeLists.txt
@@ -0,0 +1 @@
+add_executable(pine_memory pine_memory.cu)
diff --git a/15_pine_memory/pine_memory.cu b/15_pine_memory/pine_memory.cu
@@ -0,0 +1,67 @@
+#include <cuda_runtime.h>
+#include <stdio.h>
+#include "freshman.h"
+
+
+// Host reference implementation: res[i] = a[i] + b[i] for every i in [0, size).
+// The main loop is manually unrolled by four; a scalar tail loop handles the
+// size % 4 leftover elements, so no index >= size is read or written even
+// when size is not a multiple of four. A size <= 0 does nothing.
+void sumArrays(float * a,float * b,float * res,const int size)
+{
+  const int unrolledEnd=size-size%4;
+  int i=0;
+  for(;i<unrolledEnd;i+=4)
+  {
+    res[i]=a[i]+b[i];
+    res[i+1]=a[i+1]+b[i+1];
+    res[i+2]=a[i+2]+b[i+2];
+    res[i+3]=a[i+3]+b[i+3];
+  }
+  for(;i<size;++i)
+    res[i]=a[i]+b[i];
+}
+// Element-wise vector add on the device: each thread handles exactly one
+// element, selected by its flat 1D global index.
+// There is no bounds guard, so the launch must cover exactly the number of
+// valid elements (gridDim.x * blockDim.x == element count).
+__global__ void sumArraysGPU(float*a,float*b,float*res)
+{
+  const int idx=threadIdx.x+blockDim.x*blockIdx.x;
+  const float lhs=a[idx];
+  const float rhs=b[idx];
+  res[idx]=lhs+rhs;
+}
+// Pinned (page-locked) host memory demo.
+// Fills two pageable host vectors, copies them into pinned host buffers, runs
+// sumArraysGPU directly on the pinned buffers, copies the result back and
+// compares it against the CPU reference from sumArrays.
+// argc/argv are unused. Returns 0 on success, 1 if a host allocation fails.
+int main(int argc,char **argv)
+{
+  int dev = 0;
+  CHECK(cudaSetDevice(dev));
+
+  int nElem=1<<14;
+  printf("Vector size:%d\n",nElem);
+  int nByte=sizeof(float)*nElem;
+  float *a_h=(float*)malloc(nByte);
+  float *b_h=(float*)malloc(nByte);
+  float *res_h=(float*)malloc(nByte);
+  float *res_from_gpu_h=(float*)malloc(nByte);
+  if(!a_h||!b_h||!res_h||!res_from_gpu_h)
+  {
+    fprintf(stderr,"host malloc of %d bytes failed\n",nByte);
+    return 1;
+  }
+  memset(res_h,0,nByte);
+  memset(res_from_gpu_h,0,nByte);
+
+  // Despite the _d suffix these are pinned HOST buffers (cudaMallocHost),
+  // not device allocations.
+  // NOTE(review): passing them straight to the kernel relies on unified
+  // virtual addressing making pinned host memory device-accessible - confirm
+  // for the target platform.
+  float *a_d,*b_d,*res_d;
+  // pinned memory malloc
+  CHECK(cudaMallocHost((float**)&a_d,nByte));
+  CHECK(cudaMallocHost((float**)&b_d,nByte));
+  CHECK(cudaMallocHost((float**)&res_d,nByte));
+
+  initialData(a_h,nElem);
+  initialData(b_h,nElem);
+
+  // Both ends of these copies are host memory. Declaring a direction that
+  // does not match the pointers is undefined for cudaMemcpy, so let the
+  // runtime infer the direction from the pointer values instead.
+  CHECK(cudaMemcpy(a_d,a_h,nByte,cudaMemcpyDefault));
+  CHECK(cudaMemcpy(b_d,b_h,nByte,cudaMemcpyDefault));
+
+  dim3 block(1024);
+  dim3 grid(nElem/block.x);
+  sumArraysGPU<<<grid,block>>>(a_d,b_d,res_d);
+  // Launch-configuration errors are only reported through cudaGetLastError.
+  CHECK(cudaGetLastError());
+  printf("Execution configuration<<<%d,%d>>>\n",grid.x,block.x);
+
+  // Make sure the kernel has finished writing res_d before the host-side
+  // copy reads it, and surface any error raised during execution.
+  CHECK(cudaDeviceSynchronize());
+  CHECK(cudaMemcpy(res_from_gpu_h,res_d,nByte,cudaMemcpyDefault));
+  sumArrays(a_h,b_h,res_h,nElem);
+
+  checkResult(res_h,res_from_gpu_h,nElem);
+  CHECK(cudaFreeHost(a_d));
+  CHECK(cudaFreeHost(b_d));
+  CHECK(cudaFreeHost(res_d));
+
+  free(a_h);
+  free(b_h);
+  free(res_h);
+  free(res_from_gpu_h);
+
+  return 0;
+}
diff --git a/16_zero_copy_memory/CMakeLists.txt b/16_zero_copy_memory/CMakeLists.txt
@@ -0,0 +1 @@
+add_executable(zero_copy_memory zero_copy_memory.cu)
diff --git a/16_zero_copy_memory/zero_copy_memory.cu b/16_zero_copy_memory/zero_copy_memory.cu
@@ -0,0 +1,103 @@
+#include <cuda_runtime.h>
+#include <stdio.h>
+#include "freshman.h"
+
+
+// Host reference implementation: res[i] = a[i] + b[i] for every i in [0, size).
+// The main loop is manually unrolled by four; a scalar tail loop handles the
+// size % 4 leftover elements, so no index >= size is read or written even
+// when size is not a multiple of four (main derives size from argv, so sizes
+// of 1 or 2 are reachable). A size <= 0 does nothing.
+void sumArrays(float * a,float * b,float * res,const int size)
+{
+  const int unrolledEnd=size-size%4;
+  int i=0;
+  for(;i<unrolledEnd;i+=4)
+  {
+    res[i]=a[i]+b[i];
+    res[i+1]=a[i+1]+b[i+1];
+    res[i+2]=a[i+2]+b[i+2];
+    res[i+3]=a[i+3]+b[i+3];
+  }
+  for(;i<size;++i)
+    res[i]=a[i]+b[i];
+}
+// Device vector add, one element per thread, addressed by the flat 1D
+// global thread index.
+// The kernel performs no bounds check: callers must launch exactly as many
+// threads as there are elements in a, b and res.
+__global__ void sumArraysGPU(float*a,float*b,float*res)
+{
+  const int gid=threadIdx.x+blockDim.x*blockIdx.x;
+  const float x=a[gid];
+  const float y=b[gid];
+  res[gid]=x+y;
+}
+// Zero-copy vs. explicit-copy comparison.
+// Usage: zero_copy_memory [power]   -> vectors hold 1<<power floats (default 10).
+// Path 1 maps pinned host buffers into the device address space and lets the
+// kernel read them in place; path 2 uses ordinary device buffers filled with
+// cudaMemcpy. Each path is timed with cpuSecond() and its GPU result is
+// compared against a CPU reference.
+// Returns 0 on success, 1 on a bad argument or failed host allocation.
+// NOTE(review): the timing labels say "ms" while the helper is named
+// cpuSecond - confirm the unit against freshman.h.
+int main(int argc,char **argv)
+{
+  int dev = 0;
+  CHECK(cudaSetDevice(dev));
+  int power=10;
+  if(argc>=2)
+    power=atoi(argv[1]);
+  // Both 1<<power and the byte count 4<<power are held in int; anything
+  // outside [0,28] would be an undefined shift or an overflow.
+  if(power<0||power>28)
+  {
+    fprintf(stderr,"power must be in [0,28], got %d\n",power);
+    return 1;
+  }
+  int nElem=1<<power;
+  printf("Vector size:%d\n",nElem);
+  int nByte=sizeof(float)*nElem;
+  // Allocated exactly once (a second malloc used to overwrite and leak it).
+  float *res_from_gpu_h=(float*)malloc(nByte);
+  float *res_h=(float*)malloc(nByte);
+  if(!res_h||!res_from_gpu_h)
+  {
+    fprintf(stderr,"host malloc of %d bytes failed\n",nByte);
+    return 1;
+  }
+  memset(res_h,0,nByte);
+  memset(res_from_gpu_h,0,nByte);
+
+  float *a_host,*b_host,*res_d;
+  double iStart,iElaps;
+  // The kernel has no bounds guard, so the launch must cover nElem exactly.
+  // nElem is a power of two: shrink the block for small vectors so that
+  // grid.x is never 0 and grid.x*block.x == nElem.
+  dim3 block(nElem<1024?nElem:1024);
+  dim3 grid(nElem/block.x);
+  float *a_dev,*b_dev;
+  CHECK(cudaHostAlloc((float**)&a_host,nByte,cudaHostAllocMapped));
+  CHECK(cudaHostAlloc((float**)&b_host,nByte,cudaHostAllocMapped));
+  CHECK(cudaMalloc((float**)&res_d,nByte));
+  initialData(a_host,nElem);
+  initialData(b_host,nElem);
+
+ //=============================================================//
+  iStart = cpuSecond();
+  CHECK(cudaHostGetDevicePointer((void**)&a_dev,(void*) a_host,0));
+  CHECK(cudaHostGetDevicePointer((void**)&b_dev,(void*) b_host,0));
+  sumArraysGPU<<<grid,block>>>(a_dev,b_dev,res_d);
+  CHECK(cudaGetLastError());
+  CHECK(cudaMemcpy(res_from_gpu_h,res_d,nByte,cudaMemcpyDeviceToHost));
+  iElaps = cpuSecond() - iStart;
+ //=============================================================//
+  printf("zero copy memory elapsed %lf ms \n", iElaps);
+  printf("Execution configuration<<<%d,%d>>>\n",grid.x,block.x);
+//-----------------------normal memory---------------------------
+  float *a_h_n=(float*)malloc(nByte);
+  float *b_h_n=(float*)malloc(nByte);
+  float *res_h_n=(float*)malloc(nByte);
+  float *res_from_gpu_h_n=(float*)malloc(nByte);
+  if(!a_h_n||!b_h_n||!res_h_n||!res_from_gpu_h_n)
+  {
+    fprintf(stderr,"host malloc of %d bytes failed\n",nByte);
+    return 1;
+  }
+  memset(res_h_n,0,nByte);
+  memset(res_from_gpu_h_n,0,nByte);
+
+  float *a_d_n,*b_d_n,*res_d_n;
+  CHECK(cudaMalloc((float**)&a_d_n,nByte));
+  CHECK(cudaMalloc((float**)&b_d_n,nByte));
+  CHECK(cudaMalloc((float**)&res_d_n,nByte));
+
+  initialData(a_h_n,nElem);
+  initialData(b_h_n,nElem);
+//=============================================================//
+  iStart = cpuSecond();
+  CHECK(cudaMemcpy(a_d_n,a_h_n,nByte,cudaMemcpyHostToDevice));
+  CHECK(cudaMemcpy(b_d_n,b_h_n,nByte,cudaMemcpyHostToDevice));
+  sumArraysGPU<<<grid,block>>>(a_d_n,b_d_n,res_d_n);
+  CHECK(cudaGetLastError());
+  // Copy back THIS path's result (res_d_n); the old code re-read res_d from
+  // the zero-copy run, so this path's output was never fetched.
+  CHECK(cudaMemcpy(res_from_gpu_h_n,res_d_n,nByte,cudaMemcpyDeviceToHost));
+  iElaps = cpuSecond() - iStart;
+//=============================================================//
+  printf("device memory elapsed %lf ms \n", iElaps);
+  printf("Execution configuration<<<%d,%d>>>\n",grid.x,block.x);
+//--------------------------------------------------------------------
+
+  // Verify the zero-copy path against its own inputs.
+  sumArrays(a_host,b_host,res_h,nElem);
+  checkResult(res_h,res_from_gpu_h,nElem);
+  // Verify the device-memory path as well, using the previously unused
+  // res_h_n buffer for its CPU reference.
+  sumArrays(a_h_n,b_h_n,res_h_n,nElem);
+  checkResult(res_h_n,res_from_gpu_h_n,nElem);
+
+  CHECK(cudaFreeHost(a_host));
+  CHECK(cudaFreeHost(b_host));
+  CHECK(cudaFree(res_d));
+  free(res_h);
+  free(res_from_gpu_h);
+
+  CHECK(cudaFree(a_d_n));
+  CHECK(cudaFree(b_d_n));
+  CHECK(cudaFree(res_d_n));
+
+  free(a_h_n);
+  free(b_h_n);
+  free(res_h_n);
+  free(res_from_gpu_h_n);
+  return 0;
+}
diff --git a/17_UVA/CMakeLists.txt b/17_UVA/CMakeLists.txt
@@ -0,0 +1 @@
+add_executable(UVA UVA.cu)
diff --git a/17_UVA/UVA.cu b/17_UVA/UVA.cu
@@ -0,0 +1,60 @@
+#include <cuda_runtime.h>
+#include <stdio.h>
+#include "freshman.h"
+
+
+// Host reference implementation: res[i] = a[i] + b[i] for every i in [0, size).
+// The main loop is manually unrolled by four; a scalar tail loop handles the
+// size % 4 leftover elements, so no index >= size is read or written even
+// when size is not a multiple of four. A size <= 0 does nothing.
+void sumArrays(float * a,float * b,float * res,const int size)
+{
+  const int unrolledEnd=size-size%4;
+  int i=0;
+  for(;i<unrolledEnd;i+=4)
+  {
+    res[i]=a[i]+b[i];
+    res[i+1]=a[i+1]+b[i+1];
+    res[i+2]=a[i+2]+b[i+2];
+    res[i+3]=a[i+3]+b[i+3];
+  }
+  for(;i<size;++i)
+    res[i]=a[i]+b[i];
+}
+// Adds two float vectors on the GPU; every thread produces the single output
+// element at its flat 1D global index.
+// No range check is made, so the grid must contain exactly one thread per
+// element of a, b and res.
+__global__ void sumArraysGPU(float*a,float*b,float*res)
+{
+  const int pos=threadIdx.x+blockDim.x*blockIdx.x;
+  const float left=a[pos];
+  const float right=b[pos];
+  res[pos]=left+right;
+}
+// Unified virtual addressing demo.
+// Allocates mapped pinned host buffers and passes the HOST pointers straight
+// to sumArraysGPU (no cudaHostGetDevicePointer call), then compares the GPU
+// result with the CPU reference from sumArrays.
+// argc/argv are unused. Returns 0 on success, 1 if a host allocation fails.
+int main(int argc,char **argv)
+{
+  int dev = 0;
+  CHECK(cudaSetDevice(dev));
+
+  int nElem=1<<14;
+  printf("Vector size:%d\n",nElem);
+  int nByte=sizeof(float)*nElem;
+  // Allocated exactly once (a second malloc used to overwrite and leak it).
+  float *res_from_gpu_h=(float*)malloc(nByte);
+  float *res_h=(float*)malloc(nByte);
+  if(!res_h||!res_from_gpu_h)
+  {
+    fprintf(stderr,"host malloc of %d bytes failed\n",nByte);
+    return 1;
+  }
+  memset(res_h,0,nByte);
+  memset(res_from_gpu_h,0,nByte);
+
+  float *a_host,*b_host,*res_d;
+  CHECK(cudaHostAlloc((float**)&a_host,nByte,cudaHostAllocMapped));
+  CHECK(cudaHostAlloc((float**)&b_host,nByte,cudaHostAllocMapped));
+  CHECK(cudaMalloc((float**)&res_d,nByte));
+
+  initialData(a_host,nElem);
+  initialData(b_host,nElem);
+
+  dim3 block(1024);
+  dim3 grid(nElem/block.x);
+  // Host pointers are handed to the kernel directly.
+  // NOTE(review): this relies on the platform supporting unified virtual
+  // addressing - confirm for the target device.
+  sumArraysGPU<<<grid,block>>>(a_host,b_host,res_d);
+  // Launch-configuration errors are only reported through cudaGetLastError.
+  CHECK(cudaGetLastError());
+  printf("Execution configuration<<<%d,%d>>>\n",grid.x,block.x);
+
+  CHECK(cudaMemcpy(res_from_gpu_h,res_d,nByte,cudaMemcpyDeviceToHost));
+  sumArrays(a_host,b_host,res_h,nElem);
+
+  checkResult(res_h,res_from_gpu_h,nElem);
+  CHECK(cudaFreeHost(a_host));
+  CHECK(cudaFreeHost(b_host));
+  CHECK(cudaFree(res_d));
+
+  free(res_h);
+  free(res_from_gpu_h);
+
+  return 0;
+}
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -14,4 +14,8 @@ add_subdirectory(8_divergence)
 add_subdirectory(9_sum_matrix2D)
 add_subdirectory(10_reduceInteger)
 add_subdirectory(11_simple_sum_matrix2D)
-add_subdirectory(12_reduce_unrolling)
+add_subdirectory(12_reduce_unrolling)
+add_subdirectory(14_global_variable)
+add_subdirectory(15_pine_memory)
+add_subdirectory(16_zero_copy_memory)
+add_subdirectory(17_UVA)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		add_executable(global_variable global_variable.cu)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		add_executable(zero_copy_memory zero_copy_memory.cu)