-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path multi-gpu_stencil_uva.cu
95 lines (83 loc) · 2.89 KB
/
multi-gpu_stencil_uva.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
// Kernel: 4-point (von Neumann) stencil over an N x N grid of ints, updated
// IN PLACE. Each interior cell (0 < i < N-1, 0 < j < N-1) adds the sum of its
// four orthogonal neighbours plus 1 to itself; boundary cells are untouched.
// Expects a 2D launch whose thread index space covers at least N x N; excess
// threads are rejected by the interior guard below.
// NOTE(review): the in-place update races — a thread reads neighbour cells
// that other threads of the SAME launch may be rewriting concurrently, so the
// result is nondeterministic. A correct stencil needs separate input/output
// buffers (or a red/black scheme); confirm whether this is acceptable here.
__global__ void stencil(int* grid, int N)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
if (i > 0 && i < N - 1 && j > 0 && j < N - 1) {
// Sum of the four orthogonal neighbours (no averaging / division).
int new_value = (grid[(i - 1) * N + j] + grid[(i + 1) * N + j] + grid[i * N + j - 1] + grid[i * N + j + 1]);
grid[i * N + j] += new_value +1 ;
}
}
// Abort with a readable message when a CUDA runtime call fails, so errors are
// never silently ignored.
static void checkCuda(cudaError_t err, const char* what)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// Distribute an N x N int grid row-wise across the available GPUs, run one
// stencil pass on each chunk, gather the chunks back, and print the grid.
// NOTE(review): as in the original, no halo rows are exchanged between
// neighbouring chunks, so the stencil never crosses a chunk boundary even
// though peer access is enabled — presumably the intended next step.
int main()
{
    int num_gpus = 4;          // Upper bound on the number of GPUs to use
    int N = 8;                 // The grid is N x N
    // Bug fix: the grid holds ints, so size it with sizeof(int), not sizeof(float).
    int size = N * N * sizeof(int);
    int threads_per_block = 4; // Threads per block dimension (blocks are square)

    // Generalization: use however many devices actually exist (capped at
    // num_gpus) instead of unconditionally assuming 4 and failing on smaller
    // machines; then shrink num_gpus until it divides N so every GPU owns an
    // equal number of whole rows.
    int device_count = 0;
    checkCuda(cudaGetDeviceCount(&device_count), "cudaGetDeviceCount");
    if (device_count < 1) {
        fprintf(stderr, "No CUDA devices found\n");
        return EXIT_FAILURE;
    }
    if (device_count < num_gpus) num_gpus = device_count;
    while (N % num_gpus != 0) num_gpus--;

    int rows_per_gpu = N / num_gpus;     // Contiguous rows owned by each GPU
    int chunk_bytes = size / num_gpus;   // Bytes in each per-GPU chunk

    // Initialize the grid on the CPU: every cell starts at 1.
    int* grid = new int[N * N];
    for (int i = 0; i < N * N; i++) {
        grid[i] = 1;
    }

    // Per-GPU device pointers and device IDs.
    int** grid_dev = new int*[num_gpus];
    int* gpus = new int[num_gpus];

    // Allocate one row-chunk per GPU and upload its slice of the host grid.
    for (int i = 0; i < num_gpus; i++) {
        gpus[i] = i;
        checkCuda(cudaSetDevice(i), "cudaSetDevice");
        checkCuda(cudaMalloc(&grid_dev[i], chunk_bytes), "cudaMalloc");
        // Bug fix: the original copied to &grid_dev[i][rows_per_gpu * N] — one
        // full chunk PAST the start of a chunk-sized allocation, an
        // out-of-bounds write. Each device chunk begins at offset 0.
        checkCuda(cudaMemcpy(grid_dev[i], &grid[rows_per_gpu * N * i], chunk_bytes, cudaMemcpyHostToDevice), "cudaMemcpy H2D");
    }

    // Enable peer-to-peer access between all GPU pairs. This is best effort:
    // not every topology supports P2P, so a failure is reported (and the
    // sticky error cleared) rather than treated as fatal.
    for (int i = 0; i < num_gpus; i++) {
        checkCuda(cudaSetDevice(gpus[i]), "cudaSetDevice");
        for (int j = 0; j < num_gpus; j++) {
            if (i != j) {
                cudaError_t err = cudaDeviceEnablePeerAccess(gpus[j], 0);
                if (err != cudaSuccess && err != cudaErrorPeerAccessAlreadyEnabled) {
                    fprintf(stderr, "peer access %d -> %d unavailable: %s\n", gpus[i], gpus[j], cudaGetErrorString(err));
                    cudaGetLastError(); // clear the sticky error
                }
            }
        }
    }

    // Launch the stencil kernel on each GPU over its own chunk.
    dim3 threads(threads_per_block, threads_per_block);
    for (int iter = 0; iter < 1; iter++) {
        for (int i = 0; i < num_gpus; i++) {
            checkCuda(cudaSetDevice(gpus[i]), "cudaSetDevice");
            // Bug fix: the original used blocks_per_dim / num_gpus = 0 blocks
            // in x — an invalid launch configuration. Use a ceil-divide so the
            // grid covers the rows_per_gpu x rows_per_gpu index space the
            // kernel iterates (excess threads are rejected by its guard).
            int blocks_per_side = (rows_per_gpu + threads_per_block - 1) / threads_per_block;
            dim3 blocks(blocks_per_side, blocks_per_side);
            stencil<<<blocks, threads>>>(grid_dev[i], rows_per_gpu);
            checkCuda(cudaGetLastError(), "stencil launch");
        }
        // Bug fix: the original synchronized every device EXCEPT the one that
        // had just launched. Wait for each device's kernel once per iteration.
        for (int i = 0; i < num_gpus; i++) {
            checkCuda(cudaSetDevice(gpus[i]), "cudaSetDevice");
            checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
        }
    }

    // Copy the results back to the CPU (offset-0 read, matching the upload).
    for (int i = 0; i < num_gpus; i++) {
        checkCuda(cudaSetDevice(gpus[i]), "cudaSetDevice");
        checkCuda(cudaMemcpy(&grid[rows_per_gpu * N * i], grid_dev[i], chunk_bytes, cudaMemcpyDeviceToHost), "cudaMemcpy D2H");
    }

    // Print the results
    printf("Results:\n");
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            printf("%i ", grid[i * N + j]);
        }
        printf("\n");
    }

    // Free device memory and reset each device.
    for (int i = 0; i < num_gpus; i++) {
        cudaSetDevice(gpus[i]);
        cudaFree(grid_dev[i]);
        cudaDeviceReset();
    }
    delete[] grid;
    delete[] grid_dev;
    delete[] gpus;
    return 0;
}