-
Notifications
You must be signed in to change notification settings - Fork 458
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
28 changed files
with
3,328 additions
and
24 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
add_executable(sum_arrays_timer sum_arrays_timer.cu) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
#include <cuda_runtime.h> | ||
#include <stdio.h> | ||
#include "freshman.h" | ||
|
||
|
||
|
||
/*
 * Host reference implementation of element-wise vector addition:
 *   res[i] = a[i] + b[i]  for every i in [0, size).
 *
 * a, b  : input arrays, each holding at least `size` floats.
 * res   : output array, holding at least `size` floats.
 * size  : number of elements to process; values <= 0 do nothing.
 *
 * The main loop is manually unrolled by four. A scalar tail loop handles the
 * remaining 0-3 elements so that sizes which are not a multiple of four never
 * touch memory past the end of the arrays.
 */
void sumArrays(float * a,float * b,float * res,const int size)
{
  // Largest multiple of 4 that does not exceed size; the unrolled loop stops here.
  const int unrolledEnd=size-(size%4);
  int i=0;
  for(;i<unrolledEnd;i+=4)
  {
    res[i]=a[i]+b[i];
    res[i+1]=a[i+1]+b[i+1];
    res[i+2]=a[i+2]+b[i+2];
    res[i+3]=a[i+3]+b[i+3];
  }
  // Tail: the 0-3 elements left over after the unrolled part.
  for(;i<size;i++)
  {
    res[i]=a[i]+b[i];
  }
}
/*
 * Kernel: element-wise vector addition, one element per thread.
 *   res[i] = a[i] + b[i]
 *
 * Launch layout: 1D grid of 1D blocks. Each thread handles the element at its
 * global index (blockIdx.x*blockDim.x+threadIdx.x), so multi-block launches
 * cover distinct elements instead of every block repeating the first
 * blockDim.x entries.
 *
 * Precondition: no element count is passed, so no bounds check is possible.
 * The caller must launch exactly as many threads as there are elements in
 * a, b and res.
 */
__global__ void sumArraysGPU(float*a,float*b,float*res)
{
  int i=blockIdx.x*blockDim.x+threadIdx.x;
  res[i]=a[i]+b[i];
}
/*
 * Adds two small vectors on the GPU, times the kernel, and compares the
 * result with the host implementation sumArrays().
 *
 * initDevice, initialData, cpuSecond, checkResult and CHECK are not defined
 * in this file; presumably they come from "freshman.h" (confirm there).
 *
 * Returns 0 on success, 1 if a host allocation fails.
 */
int main(int argc,char **argv)
{
  // set up device
  initDevice(0);

  int nElem=32;
  printf("Vector size:%d\n",nElem);
  int nByte=sizeof(float)*nElem;

  // host buffers: two inputs, the CPU result and the result copied back from the GPU
  float *a_h=(float*)malloc(nByte);
  float *b_h=(float*)malloc(nByte);
  float *res_h=(float*)malloc(nByte);
  float *res_from_gpu_h=(float*)malloc(nByte);
  if(a_h==NULL||b_h==NULL||res_h==NULL||res_from_gpu_h==NULL)
  {
    fprintf(stderr,"host memory allocation failed\n");
    // free(NULL) is a no-op, so releasing all four is safe
    free(a_h);
    free(b_h);
    free(res_h);
    free(res_from_gpu_h);
    return 1;
  }
  memset(res_h,0,nByte);
  memset(res_from_gpu_h,0,nByte);

  // device buffers
  float *a_d,*b_d,*res_d;
  CHECK(cudaMalloc((float**)&a_d,nByte));
  CHECK(cudaMalloc((float**)&b_d,nByte));
  CHECK(cudaMalloc((float**)&res_d,nByte));

  initialData(a_h,nElem);
  initialData(b_h,nElem);

  CHECK(cudaMemcpy(a_d,a_h,nByte,cudaMemcpyHostToDevice));
  CHECK(cudaMemcpy(b_d,b_h,nByte,cudaMemcpyHostToDevice));

  // one block holding all nElem threads, hence a grid of a single block
  dim3 block(nElem);
  dim3 grid(nElem/block.x);

  //timer
  double iStart,iElaps;
  iStart=cpuSecond();
  sumArraysGPU<<<grid,block>>>(a_d,b_d,res_d);
  // A launch reports configuration errors only through cudaGetLastError().
  CHECK(cudaGetLastError());
  // Kernel launches are asynchronous: wait for completion before stopping the
  // clock, otherwise only the launch overhead would be measured.
  CHECK(cudaDeviceSynchronize());
  iElaps=cpuSecond()-iStart;
  // Arguments follow the <<<grid,block>>> order shown in the message.
  printf("Execution configuration<<<%d,%d>>> Time elapsed %f sec\n",grid.x,block.x,iElaps);

  CHECK(cudaMemcpy(res_from_gpu_h,res_d,nByte,cudaMemcpyDeviceToHost));
  sumArrays(a_h,b_h,res_h,nElem);

  checkResult(res_h,res_from_gpu_h,nElem);
  CHECK(cudaFree(a_d));
  CHECK(cudaFree(b_d));
  CHECK(cudaFree(res_d));

  free(a_h);
  free(b_h);
  free(res_h);
  free(res_from_gpu_h);

  return 0;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
add_executable(thread_index thread_index.cu) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
#include <cuda_runtime.h> | ||
#include <stdio.h> | ||
#include "freshman.h" | ||
|
||
/*
 * Kernel: each thread prints its thread index, block index, 2D coordinate,
 * the linear index derived from that coordinate, and the matrix value stored
 * at that index.
 *
 * A      : device pointer to a matrix of nx*ny floats, indexed as iy*nx+ix.
 * nx, ny : matrix extent in x and y.
 *
 * Launch layout: 2D grid of 2D blocks covering at least nx by ny threads.
 * Threads whose coordinate lies outside the matrix return without touching A,
 * so grids that overshoot the matrix size are safe.
 */
__global__ void printThreadIndex(float *A,const int nx,const int ny)
{
  int ix=threadIdx.x+blockIdx.x*blockDim.x;
  int iy=threadIdx.y+blockIdx.y*blockDim.y;
  // Guard against the grid tail when nx/ny are not multiples of the block size.
  if(ix>=nx || iy>=ny)
  {
    return;
  }
  unsigned int idx=iy*nx+ix;
  // A[idx] is a float (promoted to double in the variadic call), so it must be
  // printed with %f; an integer conversion would print garbage.
  printf("thread_id(%d,%d) block_id(%d,%d) coordinate(%d,%d)"
          "global index %2d ival %f\n",threadIdx.x,threadIdx.y,
          blockIdx.x,blockIdx.y,ix,iy,idx,A[idx]);
}
/*
 * Fills an 8x6 float matrix on the host, copies it to the device and launches
 * printThreadIndex so that every thread reports its indices and the element
 * it maps to.
 *
 * initDevice, initialData, printMatrix and CHECK are not defined in this
 * file; presumably they come from "freshman.h" (confirm there).
 *
 * Returns 0 on success, 1 if the host allocation fails.
 */
int main(int argc,char** argv)
{
  initDevice(0);
  int nx=8,ny=6;
  int nxy=nx*ny;
  int nBytes=nxy*sizeof(float);

  //Malloc
  float* A_host=(float*)malloc(nBytes);
  if(A_host==NULL)
  {
    fprintf(stderr,"host memory allocation failed\n");
    return 1;
  }
  initialData(A_host,nxy);
  printMatrix(A_host,nx,ny);

  //cudaMalloc
  float *A_dev=NULL;
  CHECK(cudaMalloc((void**)&A_dev,nBytes));

  // Checked like every other runtime call: a failed copy would otherwise make
  // the kernel print uninitialized device memory.
  CHECK(cudaMemcpy(A_dev,A_host,nBytes,cudaMemcpyHostToDevice));

  // Ceil-division so the grid covers the whole matrix for any nx, ny.
  dim3 block(4,2);
  dim3 grid((nx-1)/block.x+1,(ny-1)/block.y+1);

  printThreadIndex<<<grid,block>>>(A_dev,nx,ny);
  // Launch-configuration errors only surface through cudaGetLastError().
  CHECK(cudaGetLastError());

  // Wait for the kernel so its device-side printf output is produced before
  // the buffers are released.
  CHECK(cudaDeviceSynchronize());
  CHECK(cudaFree(A_dev));
  free(A_host);

  CHECK(cudaDeviceReset());
  return 0;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
add_executable(sum_matrix sum_matrix.cu) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
#include <cuda_runtime.h> | ||
#include <stdio.h> | ||
#include "freshman.h" | ||
/*
 * Host reference for 2D matrix addition: MatC = MatA + MatB.
 *
 * The matrices are stored as contiguous rows of nx floats, ny rows in total.
 * Rows are visited from first to last, and elements within a row from left
 * to right.
 */
void sumMatrix2D_CPU(float * MatA,float * MatB,float * MatC,int nx,int ny)
{
  // Offset of the first element of the current row inside each matrix.
  long long rowStart=0;
  int row=0;
  while(row<ny)
  {
    float * srcA=MatA+rowStart;
    float * srcB=MatB+rowStart;
    float * dst=MatC+rowStart;
    for(int col=0;col<nx;++col)
    {
      dst[col]=srcA[col]+srcB[col];
    }
    rowStart+=nx;
    ++row;
  }
}
/*
 * Kernel: 2D matrix addition, MatC = MatA + MatB, one element per thread.
 *
 * MatA, MatB, MatC : device pointers to matrices stored as ny contiguous rows
 *                    of nx floats each.
 * nx, ny           : matrix extent in x (row length) and y (row count).
 *
 * Launch layout: 2D grid of 2D blocks covering at least nx by ny threads.
 * Threads outside the matrix do nothing, so the grid may overshoot.
 */
__global__ void sumMatrix2D(float * MatA,float * MatB,float * MatC,int nx,int ny)
{
  int ix=threadIdx.x+blockDim.x*blockIdx.x;
  int iy=threadIdx.y+blockDim.y*blockIdx.y;
  // Row stride is the row length nx (not ny); using ny only works by accident
  // for square matrices.
  int idx=ix+iy*nx;
  if (ix<nx && iy<ny)
  {
    MatC[idx]=MatA[idx]+MatB[idx];
  }
}
/*
 * Adds two 4096x4096 float matrices on the GPU and on the CPU, times both,
 * and compares the results.
 *
 * initDevice, initialData, cpuSecond, checkResult and CHECK are not defined
 * in this file; presumably they come from "freshman.h" (confirm there).
 *
 * Returns 0 on success, 1 if a host allocation fails.
 */
int main(int argc,char** argv)
{
  printf("strating...\n");
  initDevice(0);
  int nx=1<<12;
  int ny=1<<12;
  int nxy=nx*ny;
  // size_t keeps the byte count from overflowing int if the matrix is enlarged.
  size_t nBytes=(size_t)nxy*sizeof(float);

  //Malloc
  float* A_host=(float*)malloc(nBytes);
  float* B_host=(float*)malloc(nBytes);
  float* C_host=(float*)malloc(nBytes);
  float* C_from_gpu=(float*)malloc(nBytes);
  if(A_host==NULL||B_host==NULL||C_host==NULL||C_from_gpu==NULL)
  {
    fprintf(stderr,"host memory allocation failed\n");
    // free(NULL) is a no-op, so releasing all four is safe
    free(A_host);
    free(B_host);
    free(C_host);
    free(C_from_gpu);
    return 1;
  }
  initialData(A_host,nxy);
  initialData(B_host,nxy);

  //cudaMalloc
  float *A_dev=NULL;
  float *B_dev=NULL;
  float *C_dev=NULL;
  CHECK(cudaMalloc((void**)&A_dev,nBytes));
  CHECK(cudaMalloc((void**)&B_dev,nBytes));
  CHECK(cudaMalloc((void**)&C_dev,nBytes));

  CHECK(cudaMemcpy(A_dev,A_host,nBytes,cudaMemcpyHostToDevice));
  CHECK(cudaMemcpy(B_dev,B_host,nBytes,cudaMemcpyHostToDevice));

  // 32x32 threads per block; ceil-division so the grid covers the whole matrix.
  int dimx=32;
  int dimy=32;
  dim3 block(dimx,dimy);
  dim3 grid((nx-1)/block.x+1,(ny-1)/block.y+1);
  double iStart=cpuSecond();
  sumMatrix2D<<<grid,block>>>(A_dev,B_dev,C_dev,nx,ny);
  // Launch-configuration errors only surface through cudaGetLastError().
  CHECK(cudaGetLastError());

  // The launch is asynchronous; wait for completion before stopping the clock.
  CHECK(cudaDeviceSynchronize());
  double iElaps=cpuSecond()-iStart;
  printf("GPU Execution configuration<<<(%d,%d),(%d,%d)>>> Time elapsed %f sec\n",
        grid.x,grid.y,block.x,block.y,iElaps);
  // Checked: a failed copy would otherwise be reported as a result mismatch.
  CHECK(cudaMemcpy(C_from_gpu,C_dev,nBytes,cudaMemcpyDeviceToHost));
  iStart=cpuSecond();
  sumMatrix2D_CPU(A_host,B_host,C_host,nx,ny);
  iElaps=cpuSecond()-iStart;
  printf("CPU Execution Time elapsed %f sec\n",iElaps);
  checkResult(C_host,C_from_gpu,nxy);
  CHECK(cudaFree(A_dev));
  CHECK(cudaFree(B_dev));
  CHECK(cudaFree(C_dev));
  free(A_host);
  free(B_host);
  free(C_host);
  free(C_from_gpu);
  CHECK(cudaDeviceReset());
  return 0;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,10 @@ | ||
cmake_minimum_required(VERSION 3.10 FATAL_ERROR) | ||
cmake_minimum_required(VERSION 3.9 FATAL_ERROR) | ||
Project(CUDA_Freshman CXX C CUDA) | ||
include_directories(./include) | ||
add_subdirectory(0_hello_world) | ||
add_subdirectory(1_check_dimension) | ||
add_subdirectory(2_grid_block) | ||
add_subdirectory(3_sum_arrays) | ||
add_subdirectory(4_sum_arrays_timer) | ||
add_subdirectory(5_thread_index) | ||
add_subdirectory(6_sum_matrix) |
Oops, something went wrong.