-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path01-heat-conduction-solution.cu
133 lines (106 loc) · 3.77 KB
/
01-heat-conduction-solution.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#include <stdio.h>
#include <math.h>
// Simple define to index into a 1D array from 2D space
#define I2D(num, c, r) ((r)*(num)+(c))
__global__
void step_kernel_mod(int ni, int nj, float fact, float* temp_in, float* temp_out)
{
int i00, im10, ip10, i0m1, i0p1;
float d2tdx2, d2tdy2;
int j = blockIdx.x * blockDim.x + threadIdx.x;
int i = blockIdx.y * blockDim.y + threadIdx.y;
// loop over all points in domain (except boundary)
if (j > 0 && i > 0 && j < nj-1 && i < ni-1) {
// find indices into linear memory
// for central point and neighbours
i00 = I2D(ni, i, j);
im10 = I2D(ni, i-1, j);
ip10 = I2D(ni, i+1, j);
i0m1 = I2D(ni, i, j-1);
i0p1 = I2D(ni, i, j+1);
// evaluate derivatives
d2tdx2 = temp_in[im10]-2*temp_in[i00]+temp_in[ip10];
d2tdy2 = temp_in[i0m1]-2*temp_in[i00]+temp_in[i0p1];
// update temperatures
temp_out[i00] = temp_in[i00]+fact*(d2tdx2 + d2tdy2);
}
}
void step_kernel_ref(int ni, int nj, float fact, float* temp_in, float* temp_out)
{
int i00, im10, ip10, i0m1, i0p1;
float d2tdx2, d2tdy2;
// loop over all points in domain (except boundary)
for ( int j=1; j < nj-1; j++ ) {
for ( int i=1; i < ni-1; i++ ) {
// find indices into linear memory
// for central point and neighbours
i00 = I2D(ni, i, j);
im10 = I2D(ni, i-1, j);
ip10 = I2D(ni, i+1, j);
i0m1 = I2D(ni, i, j-1);
i0p1 = I2D(ni, i, j+1);
// evaluate derivatives
d2tdx2 = temp_in[im10]-2*temp_in[i00]+temp_in[ip10];
d2tdy2 = temp_in[i0m1]-2*temp_in[i00]+temp_in[i0p1];
// update temperatures
temp_out[i00] = temp_in[i00]+fact*(d2tdx2 + d2tdy2);
}
}
}
int main()
{
int istep;
int nstep = 200; // number of time steps
// Specify our 2D dimensions
const int ni = 200;
const int nj = 100;
float tfac = 8.418e-5; // thermal diffusivity of silver
float *temp1_ref, *temp2_ref, *temp1, *temp2, *temp_tmp;
const int size = ni * nj * sizeof(float);
temp1_ref = (float*)malloc(size);
temp2_ref = (float*)malloc(size);
cudaMallocManaged(&temp1, size);
cudaMallocManaged(&temp2, size);
// Initialize with random data
for( int i = 0; i < ni*nj; ++i) {
temp1_ref[i] = temp2_ref[i] = temp1[i] = temp2[i] = (float)rand()/(float)(RAND_MAX/100.0f);
}
// Execute the CPU-only reference version
for (istep=0; istep < nstep; istep++) {
step_kernel_ref(ni, nj, tfac, temp1_ref, temp2_ref);
// swap the temperature pointers
temp_tmp = temp1_ref;
temp1_ref = temp2_ref;
temp2_ref= temp_tmp;
}
dim3 tblocks(32, 16, 1);
dim3 grid((nj/tblocks.x)+1, (ni/tblocks.y)+1, 1);
cudaError_t ierrSync, ierrAsync;
// Execute the modified version using same data
for (istep=0; istep < nstep; istep++) {
step_kernel_mod<<< grid, tblocks >>>(ni, nj, tfac, temp1, temp2);
ierrSync = cudaGetLastError();
ierrAsync = cudaDeviceSynchronize(); // Wait for the GPU to finish
if (ierrSync != cudaSuccess) { printf("Sync error: %s\n", cudaGetErrorString(ierrSync)); }
if (ierrAsync != cudaSuccess) { printf("Async error: %s\n", cudaGetErrorString(ierrAsync)); }
// swap the temperature pointers
temp_tmp = temp1;
temp1 = temp2;
temp2= temp_tmp;
}
float maxError = 0;
// Output should always be stored in the temp1 and temp1_ref at this point
for( int i = 0; i < ni*nj; ++i ) {
if (abs(temp1[i]-temp1_ref[i]) > maxError) { maxError = abs(temp1[i]-temp1_ref[i]); }
}
// Check and see if our maxError is greater than an error bound
if (maxError > 0.0005f)
printf("Problem! The Max Error of %.5f is NOT within acceptable bounds.\n", maxError);
else
printf("The Max Error of %.5f is within acceptable bounds.\n", maxError);
free( temp1_ref );
free( temp2_ref );
cudaFree( temp1 );
cudaFree( temp2 );
return 0;
}