Skip to content

Commit

Permalink
memory
Browse files Browse the repository at this point in the history
  • Loading branch information
Tony-Tan committed May 2, 2018
1 parent 4b8efa8 commit 1ddb872
Show file tree
Hide file tree
Showing 9 changed files with 258 additions and 1 deletion.
1 change: 1 addition & 0 deletions 14_global_variable/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Build the global_variable demo executable from global_variable.cu.
add_executable(global_variable global_variable.cu)
19 changes: 19 additions & 0 deletions 14_global_variable/global_variable.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#include <cuda_runtime.h>
#include <stdio.h>
// A single float declared with __device__; the host reads and writes it by
// symbol (cudaMemcpyToSymbol / cudaMemcpyFromSymbol) rather than by pointer.
__device__ float devData;

// Prints the current value of devData from device code, then adds 2 to it.
// The update is a plain, non-atomic read-modify-write, so the kernel is only
// race-free when launched with a single thread (<<<1,1>>>).
__global__ void checkGlobalVariable()
{
    printf("Device: The value of the global variable is %f\n",devData);
    // Float literal: a bare 2.0 is a double and would promote the addition
    // to double precision before converting back to float.
    devData+=2.0f;
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when status is cudaSuccess, false otherwise.
static bool cudaCallOk(cudaError_t status,const char *what)
{
    if(status==cudaSuccess)
        return true;
    fprintf(stderr,"CUDA error in %s: %s\n",what,cudaGetErrorString(status));
    return false;
}

// Round-trips a float through the __device__ variable devData:
// writes it from the host, lets the kernel print and modify it, then reads
// the modified value back. Returns EXIT_FAILURE if any CUDA call fails.
int main()
{
    float value=3.14f;
    if(!cudaCallOk(cudaMemcpyToSymbol(devData,&value,sizeof(float)),"cudaMemcpyToSymbol"))
        return EXIT_FAILURE;
    printf("Host: copy %f to the global variable\n",value);

    checkGlobalVariable<<<1,1>>>();
    // A kernel launch returns nothing; configuration errors are only
    // visible through cudaGetLastError().
    if(!cudaCallOk(cudaGetLastError(),"checkGlobalVariable launch"))
        return EXIT_FAILURE;

    // This copy also reports errors raised while the kernel was running.
    if(!cudaCallOk(cudaMemcpyFromSymbol(&value,devData,sizeof(float)),"cudaMemcpyFromSymbol"))
        return EXIT_FAILURE;
    printf("Host: the value changed by the kernel to %f \n",value);

    if(!cudaCallOk(cudaDeviceReset(),"cudaDeviceReset"))
        return EXIT_FAILURE;
    return EXIT_SUCCESS;
}
1 change: 1 addition & 0 deletions 15_pine_memory/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Build the pine_memory demo executable from pine_memory.cu.
add_executable(pine_memory pine_memory.cu)
67 changes: 67 additions & 0 deletions 15_pine_memory/pine_memory.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#include <cuda_runtime.h>
#include <stdio.h>
#include "freshman.h"


// Host reference implementation of element-wise vector addition.
//   a, b : input arrays, each holding at least `size` floats
//   res  : output array of at least `size` floats; res[i] = a[i] + b[i]
//   size : number of elements to process; values <= 0 do nothing
// The bulk of the work is unrolled four elements per iteration; a scalar
// tail loop handles the remainder so sizes that are not a multiple of four
// never touch memory past the end of the arrays.
void sumArrays(float * a,float * b,float * res,const int size)
{
    // Largest multiple of 4 that does not exceed size.
    const int unrolledEnd=size/4*4;
    int i=0;
    for(;i<unrolledEnd;i+=4)
    {
        res[i]=a[i]+b[i];
        res[i+1]=a[i+1]+b[i+1];
        res[i+2]=a[i+2]+b[i+2];
        res[i+3]=a[i+3]+b[i+3];
    }
    // Remaining 0..3 elements.
    for(;i<size;++i)
    {
        res[i]=a[i]+b[i];
    }
}
// Element-wise vector addition on the GPU: each thread computes one element,
// res[idx] = a[idx] + b[idx], where idx is the thread's flat 1D global index.
// The kernel performs no bounds check, so the launch must supply no more
// threads (gridDim.x * blockDim.x) than there are elements in the arrays.
__global__ void sumArraysGPU(float*a,float*b,float*res)
{
    const int idx=threadIdx.x+blockDim.x*blockIdx.x;
    const float lhs=a[idx];
    const float rhs=b[idx];
    res[idx]=lhs+rhs;
}
// Pinned-memory demo: adds two vectors with sumArraysGPU while the kernel's
// operands and result live in page-locked host memory (cudaMallocHost), then
// verifies the result against the host reference sumArrays.
// The kernel dereferences the pinned host pointers directly, so this program
// relies on the pinned allocations being addressable from the device.
int main(int argc,char **argv)
{
    (void)argc;
    (void)argv;
    int dev = 0;
    CHECK(cudaSetDevice(dev));

    int nElem=1<<14;
    printf("Vector size:%d\n",nElem);
    int nByte=sizeof(float)*nElem;
    float *a_h=(float*)malloc(nByte);
    float *b_h=(float*)malloc(nByte);
    float *res_h=(float*)malloc(nByte);
    float *res_from_gpu_h=(float*)malloc(nByte);
    if(a_h==NULL||b_h==NULL||res_h==NULL||res_from_gpu_h==NULL)
    {
        fprintf(stderr,"Host memory allocation failed\n");
        free(a_h);
        free(b_h);
        free(res_h);
        free(res_from_gpu_h);
        return 1;
    }
    memset(res_h,0,nByte);
    memset(res_from_gpu_h,0,nByte);

    float *a_d,*b_d,*res_d;
    // Pinned (page-locked) host memory allocation.
    CHECK(cudaMallocHost((float**)&a_d,nByte));
    CHECK(cudaMallocHost((float**)&b_d,nByte));
    CHECK(cudaMallocHost((float**)&res_d,nByte));

    initialData(a_h,nElem);
    initialData(b_h,nElem);

    // Source and destination are both host buffers, so a HostToDevice kind
    // would not match the pointers; let the runtime infer the direction.
    CHECK(cudaMemcpy(a_d,a_h,nByte,cudaMemcpyDefault));
    CHECK(cudaMemcpy(b_d,b_h,nByte,cudaMemcpyDefault));

    // nElem (1<<14) is an exact multiple of the block size, so the grid
    // covers the data with no surplus threads (the kernel has no bounds check).
    dim3 block(1024);
    dim3 grid(nElem/block.x);
    sumArraysGPU<<<grid,block>>>(a_d,b_d,res_d);
    // Launch-configuration errors are only reported via cudaGetLastError().
    CHECK(cudaGetLastError());
    // The kernel writes straight into host memory; wait for it to finish
    // before the host reads res_d.
    CHECK(cudaDeviceSynchronize());
    printf("Execution configuration<<<%d,%d>>>\n",grid.x,block.x);

    CHECK(cudaMemcpy(res_from_gpu_h,res_d,nByte,cudaMemcpyDefault));
    sumArrays(a_h,b_h,res_h,nElem);

    checkResult(res_h,res_from_gpu_h,nElem);
    CHECK(cudaFreeHost(a_d));
    CHECK(cudaFreeHost(b_d));
    CHECK(cudaFreeHost(res_d));

    free(a_h);
    free(b_h);
    free(res_h);
    free(res_from_gpu_h);

    return 0;
}
1 change: 1 addition & 0 deletions 16_zero_copy_memory/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Build the zero_copy_memory demo executable from zero_copy_memory.cu.
add_executable(zero_copy_memory zero_copy_memory.cu)
103 changes: 103 additions & 0 deletions 16_zero_copy_memory/zero_copy_memory.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
#include <cuda_runtime.h>
#include <stdio.h>
#include "freshman.h"


// CPU reference for the vector sum: res[i] = a[i] + b[i] for 0 <= i < size.
//   a, b : inputs with at least `size` elements each
//   res  : output with room for at least `size` elements
//   size : element count; nothing is written when size <= 0
// Groups of four elements are processed per iteration of the main loop; the
// leftover elements (size % 4) are handled one by one so that no index at or
// beyond `size` is ever read or written.
void sumArrays(float * a,float * b,float * res,const int size)
{
    const int tail=size%4;
    const int body=size-tail;   // multiple of 4 covered by the unrolled loop
    for(int i=0;i<body;i+=4)
    {
        res[i]=a[i]+b[i];
        res[i+1]=a[i+1]+b[i+1];
        res[i+2]=a[i+2]+b[i+2];
        res[i+3]=a[i+3]+b[i+3];
    }
    for(int i=body;i<size;++i)
    {
        res[i]=a[i]+b[i];
    }
}
// GPU vector sum, one element per thread: res[gid] = a[gid] + b[gid].
// gid is the flat global thread index of a 1D launch. Because there is no
// range check here, callers must launch exactly as many threads as there are
// valid elements (or fewer).
__global__ void sumArraysGPU(float*a,float*b,float*res)
{
    const int gid=blockDim.x*blockIdx.x+threadIdx.x;
    const float sum=a[gid]+b[gid];
    res[gid]=sum;
}
// Zero-copy vs. device-memory demo.
// Runs the same vector addition twice and times each variant with cpuSecond():
//   1. inputs in mapped, page-locked host memory (cudaHostAllocMapped) that the
//      kernel reads through device pointers from cudaHostGetDevicePointer;
//   2. inputs copied into ordinary device memory with cudaMemcpy.
// Both GPU results are verified against the host reference sumArrays.
// Optional argv[1]: exponent `power`, the vector holds 2^power floats
// (default 10, accepted range 2..28).
int main(int argc,char **argv)
{
    int dev = 0;
    CHECK(cudaSetDevice(dev));

    // Zero-copy access needs a device that can map host memory.
    cudaDeviceProp prop;
    CHECK(cudaGetDeviceProperties(&prop,dev));
    if(!prop.canMapHostMemory)
    {
        fprintf(stderr,"Device %d cannot map host memory\n",dev);
        return 1;
    }

    int power=10;
    if(argc>=2)
        power=atoi(argv[1]);
    // Upper bound keeps nByte (4 * 2^power) inside int range; lower bound
    // keeps nElem a multiple of 4, since sumArrays walks the data four
    // elements at a time.
    if(power<2||power>28)
    {
        fprintf(stderr,"power must be in [2,28], got %d\n",power);
        return 1;
    }
    int nElem=1<<power;
    printf("Vector size:%d\n",nElem);
    int nByte=sizeof(float)*nElem;
    float *res_from_gpu_h=(float*)malloc(nByte);
    float *res_h=(float*)malloc(nByte);
    if(res_from_gpu_h==NULL||res_h==NULL)
    {
        fprintf(stderr,"Host memory allocation failed\n");
        free(res_from_gpu_h);
        free(res_h);
        return 1;
    }
    memset(res_h,0,nByte);
    memset(res_from_gpu_h,0,nByte);

    float *a_host,*b_host,*res_d;
    double iStart,iElaps;
    // The kernel has no bounds check, so the launch must cover nElem exactly.
    // nElem is a power of two: shrink the block for small vectors so that
    // block.x always divides nElem and the grid is never empty.
    dim3 block(nElem<1024?nElem:1024);
    dim3 grid(nElem/block.x);
    float *a_dev,*b_dev;
    CHECK(cudaHostAlloc((float**)&a_host,nByte,cudaHostAllocMapped));
    CHECK(cudaHostAlloc((float**)&b_host,nByte,cudaHostAllocMapped));
    CHECK(cudaMalloc((float**)&res_d,nByte));
    initialData(a_host,nElem);
    initialData(b_host,nElem);

    //=============================================================//
    iStart = cpuSecond();
    CHECK(cudaHostGetDevicePointer((void**)&a_dev,(void*) a_host,0));
    CHECK(cudaHostGetDevicePointer((void**)&b_dev,(void*) b_host,0));
    sumArraysGPU<<<grid,block>>>(a_dev,b_dev,res_d);
    CHECK(cudaGetLastError());
    CHECK(cudaMemcpy(res_from_gpu_h,res_d,nByte,cudaMemcpyDeviceToHost));
    iElaps = cpuSecond() - iStart;
    //=============================================================//
    // NOTE(review): the label says "ms" but the name cpuSecond suggests the
    // value is in seconds - confirm against freshman.h.
    printf("zero copy memory elapsed %lf ms \n", iElaps);
    printf("Execution configuration<<<%d,%d>>>\n",grid.x,block.x);
    //-----------------------normal memory---------------------------
    float *a_h_n=(float*)malloc(nByte);
    float *b_h_n=(float*)malloc(nByte);
    float *res_h_n=(float*)malloc(nByte);
    float *res_from_gpu_h_n=(float*)malloc(nByte);
    if(a_h_n==NULL||b_h_n==NULL||res_h_n==NULL||res_from_gpu_h_n==NULL)
    {
        fprintf(stderr,"Host memory allocation failed\n");
        return 1;
    }
    memset(res_h_n,0,nByte);
    memset(res_from_gpu_h_n,0,nByte);

    float *a_d_n,*b_d_n,*res_d_n;
    CHECK(cudaMalloc((float**)&a_d_n,nByte));
    CHECK(cudaMalloc((float**)&b_d_n,nByte));
    CHECK(cudaMalloc((float**)&res_d_n,nByte));

    initialData(a_h_n,nElem);
    initialData(b_h_n,nElem);
    //=============================================================//
    iStart = cpuSecond();
    CHECK(cudaMemcpy(a_d_n,a_h_n,nByte,cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(b_d_n,b_h_n,nByte,cudaMemcpyHostToDevice));
    sumArraysGPU<<<grid,block>>>(a_d_n,b_d_n,res_d_n);
    CHECK(cudaGetLastError());
    // Fetch THIS run's result (res_d_n), not the zero-copy run's res_d.
    CHECK(cudaMemcpy(res_from_gpu_h_n,res_d_n,nByte,cudaMemcpyDeviceToHost));
    iElaps = cpuSecond() - iStart;
    //=============================================================//
    printf("device memory elapsed %lf ms \n", iElaps);
    printf("Execution configuration<<<%d,%d>>>\n",grid.x,block.x);
    //--------------------------------------------------------------------

    // Verify the zero-copy run.
    sumArrays(a_host,b_host,res_h,nElem);
    checkResult(res_h,res_from_gpu_h,nElem);
    // Verify the device-memory run against its own inputs.
    sumArrays(a_h_n,b_h_n,res_h_n,nElem);
    checkResult(res_h_n,res_from_gpu_h_n,nElem);

    CHECK(cudaFreeHost(a_host));
    CHECK(cudaFreeHost(b_host));
    CHECK(cudaFree(res_d));
    free(res_h);
    free(res_from_gpu_h);

    CHECK(cudaFree(a_d_n));
    CHECK(cudaFree(b_d_n));
    CHECK(cudaFree(res_d_n));

    free(a_h_n);
    free(b_h_n);
    free(res_h_n);
    free(res_from_gpu_h_n);
    return 0;
}
1 change: 1 addition & 0 deletions 17_UVA/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Build the UVA demo executable from UVA.cu.
add_executable(UVA UVA.cu)
60 changes: 60 additions & 0 deletions 17_UVA/UVA.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#include <cuda_runtime.h>
#include <stdio.h>
#include "freshman.h"


// Host-side reference: writes a[i] + b[i] into res[i] for every i in [0, size).
//   a, b : source arrays (at least `size` floats each)
//   res  : destination array (at least `size` floats)
//   size : number of elements; a non-positive size is a no-op
// The loop is manually unrolled by four. Any remainder (size not divisible by
// four) is finished by a plain loop, so the function never indexes at or past
// `size`.
void sumArrays(float * a,float * b,float * res,const int size)
{
    int i=0;
    // Unrolled part: runs while a full group of four elements remains.
    for(;i+3<size;i+=4)
    {
        res[i]=a[i]+b[i];
        res[i+1]=a[i+1]+b[i+1];
        res[i+2]=a[i+2]+b[i+2];
        res[i+3]=a[i+3]+b[i+3];
    }
    // Scalar part: at most three trailing elements.
    for(;i<size;++i)
    {
        res[i]=a[i]+b[i];
    }
}
// Adds two float vectors on the device. Thread t (flat index across a 1D
// grid of 1D blocks) stores a[t] + b[t] into res[t].
// No guard against out-of-range indices is present, so the total number of
// launched threads must not exceed the number of elements.
__global__ void sumArraysGPU(float*a,float*b,float*res)
{
    const int t=blockIdx.x*blockDim.x+threadIdx.x;
    float *dst=res+t;
    *dst=a[t]+b[t];
}
// UVA demo: the input vectors are allocated as mapped, page-locked host
// memory (cudaHostAllocMapped) and the host pointers are handed to the kernel
// as-is, without calling cudaHostGetDevicePointer. That only works when host
// and device share one address space. The result is written to ordinary
// device memory, copied back, and checked against the host reference
// sumArrays.
int main(int argc,char **argv)
{
    (void)argc;
    (void)argv;
    int dev = 0;
    CHECK(cudaSetDevice(dev));

    int nElem=1<<14;
    printf("Vector size:%d\n",nElem);
    int nByte=sizeof(float)*nElem;
    // Each host result buffer is allocated exactly once.
    float *res_from_gpu_h=(float*)malloc(nByte);
    float *res_h=(float*)malloc(nByte);
    if(res_from_gpu_h==NULL||res_h==NULL)
    {
        fprintf(stderr,"Host memory allocation failed\n");
        free(res_from_gpu_h);
        free(res_h);
        return 1;
    }
    memset(res_h,0,nByte);
    memset(res_from_gpu_h,0,nByte);

    float *a_host,*b_host,*res_d;
    CHECK(cudaHostAlloc((float**)&a_host,nByte,cudaHostAllocMapped));
    CHECK(cudaHostAlloc((float**)&b_host,nByte,cudaHostAllocMapped));
    CHECK(cudaMalloc((float**)&res_d,nByte));

    initialData(a_host,nElem);
    initialData(b_host,nElem);

    // nElem (1<<14) is an exact multiple of the block size, so the grid
    // covers the data with no surplus threads (the kernel has no bounds check).
    dim3 block(1024);
    dim3 grid(nElem/block.x);
    // Host pointers are passed directly to the kernel.
    sumArraysGPU<<<grid,block>>>(a_host,b_host,res_d);
    // Launch-configuration errors are only reported via cudaGetLastError().
    CHECK(cudaGetLastError());
    printf("Execution configuration<<<%d,%d>>>\n",grid.x,block.x);

    CHECK(cudaMemcpy(res_from_gpu_h,res_d,nByte,cudaMemcpyDeviceToHost));
    sumArrays(a_host,b_host,res_h,nElem);

    checkResult(res_h,res_from_gpu_h,nElem);
    CHECK(cudaFreeHost(a_host));
    CHECK(cudaFreeHost(b_host));
    CHECK(cudaFree(res_d));

    free(res_h);
    free(res_from_gpu_h);

    return 0;
}
6 changes: 5 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,8 @@ add_subdirectory(8_divergence)
add_subdirectory(9_sum_matrix2D)
add_subdirectory(10_reduceInteger)
add_subdirectory(11_simple_sum_matrix2D)
add_subdirectory(12_reduce_unrolling)
add_subdirectory(12_reduce_unrolling)
add_subdirectory(14_global_variable)
add_subdirectory(15_pine_memory)
add_subdirectory(16_zero_copy_memory)
add_subdirectory(17_UVA)

0 comments on commit 1ddb872

Please sign in to comment.