-
Notifications
You must be signed in to change notification settings - Fork 0
/
5pt-2d-cuda-stencil.cu
120 lines (104 loc) · 3.32 KB
/
5pt-2d-cuda-stencil.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <time.h>
//#define M 8
/* Edge length of the square grid: the host arrays are M x M floats and each
 * row transferred to the device is M floats long. */
static const int M = 10;
/* Number of thread blocks (one thread each) used for every kernel launch. */
static const int nr_blocks = 1024;
/*
 * Return end - start as a timespec.
 *
 * The seconds and nanoseconds fields are subtracted independently; when the
 * nanosecond difference comes out negative, one second is borrowed so that
 * the returned tv_nsec is non-negative.
 */
static inline struct timespec mydifftime(struct timespec start, struct timespec end)
{
    struct timespec delta;

    delta.tv_sec = end.tv_sec - start.tv_sec;
    delta.tv_nsec = end.tv_nsec - start.tv_nsec;

    /* Borrow one second when the nanosecond part underflows. */
    if (delta.tv_nsec < 0) {
        delta.tv_sec -= 1;
        delta.tv_nsec += 1000000000;
    }

    return delta;
}
/*
 * compute - apply one 5-point stencil update to a single grid row.
 *
 * a: device buffer holding three consecutive rows of M floats each:
 *    a[0 .. M-1]     row above,
 *    a[M .. 2M-1]    row being updated,
 *    a[2M .. 3M-1]   row below.
 * b: device buffer of M floats that receives the updated row. Only the
 *    interior columns 1 .. M-2 are written; the two boundary columns are
 *    left exactly as the caller supplied them.
 *
 * Launch layout: 1-D grid of 1-D blocks of any size. Columns are distributed
 * with a stride of gridDim.x * blockDim.x, so the result does not depend on
 * the launch configuration. No shared memory is used.
 */
__global__ void compute(const float * a, float * b)
{
    const int first = blockIdx.x * blockDim.x + threadIdx.x;
    const int stride = gridDim.x * blockDim.x;
    int col;

    /* Stop at M - 1: column M-1 is a boundary and has no right neighbour
     * (a[M + col + 1] would be the first element of the row below). */
    for (col = first; col < M - 1; col += stride) {
        if (col == 0) {
            continue;   /* left boundary column has no left neighbour */
        }
        /* Float literal keeps the whole expression in single precision. */
        b[col] = 0.2f * (a[M + col - 1] + a[M + col] + a[M + col + 1]
                         + a[col] + a[2 * M + col]);
    }
}
/* Abort with a diagnostic when a CUDA runtime call did not succeed. */
static void check_cuda(cudaError_t status, const char * what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/* Allocate one zero-filled host row of M floats, aborting on failure. */
static float * alloc_row(void)
{
    float * row = (float *) calloc(M, sizeof(float));
    if (row == NULL) {
        perror("calloc");
        exit(EXIT_FAILURE);
    }
    return row;
}

/*
 * Run 10 iterations of a 5-point stencil on an M x M grid whose first and
 * last columns are fixed at 1, offloading each interior row update to the
 * compute kernel, then print the elapsed time.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if a host allocation or any
 * CUDA call fails. Command-line arguments are ignored.
 */
int main(int argc, char ** argv) {
    float ** a, ** b;
    float * c_a, * c_b;
    int i = 0;
    int j = 0, k = 0;
    struct timespec time1;
    struct timespec time2;
    struct timespec result;

    (void) argc;
    (void) argv;

    a = (float **) malloc(sizeof(float *) * M);
    b = (float **) malloc(sizeof(float *) * M);
    if (a == NULL || b == NULL) {
        perror("malloc");
        return EXIT_FAILURE;
    }

    check_cuda(cudaMalloc((void **)&c_a, sizeof(float) * M * M), "cudaMalloc(c_a)");
    check_cuda(cudaMalloc((void **)&c_b, sizeof(float) * M * M), "cudaMalloc(c_b)");

    /* Zero-filled grid with both boundary columns set to 1. */
    for (j = 0; j < M; j++) {
        a[j] = alloc_row();
        b[j] = alloc_row();
        a[j][0] = 1;
        a[j][M-1] = 1;
    }

    /* Note: this clock reports CPU time consumed by the process, not wall-clock time. */
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time1);

#ifdef DEBUG
    printf("[debug]\n");
    for (j = 0; j < M; j++) {
        for (k = 0; k < M; k++) {
            printf("%g ", a[j][k]);
        }
        printf("\n");
    }
    printf("\n\n");
#endif /* DEBUG */

    for (i = 0;i < 10;i++) {
#ifdef DEBUG
        printf("Iter: %d\n", i);
        fflush(stdout);
#endif
        for (j = 1; j < M - 1; j++) {
            /* Stage rows j-1, j, j+1 back to back in c_a, and the current
             * contents of output row j in c_b. */
            check_cuda(cudaMemcpy(c_a, a[j - 1], M * sizeof(float), cudaMemcpyHostToDevice),
                       "cudaMemcpy(row above)");
            check_cuda(cudaMemcpy(&c_a[M], a[j], M * sizeof(float), cudaMemcpyHostToDevice),
                       "cudaMemcpy(row)");
            check_cuda(cudaMemcpy(&c_a[2*M], a[j+1], M * sizeof(float), cudaMemcpyHostToDevice),
                       "cudaMemcpy(row below)");
            check_cuda(cudaMemcpy(c_b, b[j], M * sizeof(float), cudaMemcpyHostToDevice),
                       "cudaMemcpy(output row in)");

            compute<<<nr_blocks, 1>>>(c_a, c_b);
            /* A launch does not return a status; query it explicitly. */
            check_cuda(cudaGetLastError(), "compute launch");

            /* Blocking copy: also reports any fault raised while the kernel ran. */
            check_cuda(cudaMemcpy(b[j], c_b, M * sizeof(float), cudaMemcpyDeviceToHost),
                       "cudaMemcpy(output row out)");
        }

        //printf("[debug] updating a with b\n");
        /* Only the interior is copied back; boundary values of a stay fixed. */
        for (j = 1; j < M - 1; j++) {
            for (k = 1; k < M - 1; k++) {
                a[j][k] = b[j][k];
            }
        }

#ifdef DEBUG
        printf("[debug output of b]\n");
        for (j = 0; j < M; j++) {
            for (k = 0; k < M; k++) {
                printf("%5.5g ", a[j][k]);
            }
            printf("\n");
        }
        printf("\n\n");
#endif /* DEBUG */
    }

    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time2);
    result = mydifftime(time1, time2);
    /* Zero-pad the nanoseconds so the digits after the point form a true decimal fraction. */
    printf("timing: %ld.%09ld sec\n", (long) result.tv_sec, (long) result.tv_nsec);

    /* Release every row before the row-pointer arrays themselves. */
    for (j = 0; j < M; j++) {
        free(a[j]);
        free(b[j]);
    }
    free(a);
    free(b);
    check_cuda(cudaFree(c_a), "cudaFree(c_a)");
    check_cuda(cudaFree(c_b), "cudaFree(c_b)");
    return 0;
}