-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbcCUDA3D.cu
128 lines (109 loc) · 4.83 KB
/
bcCUDA3D.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
//===============================================================================
// This file is part of the software Fast Box-Counting:
// https://www.ugr.es/~demiras/fbc
//
// Copyright(c)2022 University of Granada - SPAIN
//
// FOR RESEARCH PURPOSES ONLY.THE SOFTWARE IS PROVIDED "AS IS," AND THE
// UNIVERSITY OF GRANADA DO NOT MAKE ANY WARRANTY, EXPRESS OR IMPLIED, INCLUDING
// BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE, NOR DO THEY ASSUME ANY LIABILITY OR RESPONSIBILITY FOR THE USE
// OF THIS SOFTWARE.
//
// For more information see license.html
// ===============================================================================
//
// Authors: Juan Ruiz de Miras and Miguel Ángel Posadas, 2022
// Contact: [email protected]
#include <iostream>
#include <iomanip>
#include <stdio.h>
#include "bcCUDA3D.cuh"
__global__ void BCKernel3D(unsigned char* M, const long long int m, const unsigned long long int bits_m, const unsigned long long int sm,
const unsigned long long bits_s, const unsigned long long bits_TPB, unsigned int* n)
{
register unsigned long long int tid = threadIdx.x;
register unsigned long long int idx = (blockIdx.x << bits_TPB) + tid; // 2 ^ bits_TPB = TPB
// identifies grid index (i, j, k) from block and thread values
register unsigned int k = idx >> ((bits_m - bits_s) + (bits_m - bits_s)); // k index: idx / ((m/s)*(m/s))
register unsigned int offset = (idx & (((m >> bits_s) << (bits_m - bits_s)) - 1)); // (idx mod ((m/s)*(m/s))), offset inside k slice
register unsigned long long int i = offset >> (bits_m - bits_s); // i index: offset / (m/s)
register unsigned long long int j = offset & ((m >> bits_s) - 1); // j index: offset mod (m/s)
// global location of position (0, 0, 0) of the grid (i, j, k) is gk + gi + gj
const register unsigned long long int gi = (i << bits_s) << bits_m;
const register unsigned long long int gj = j << bits_s;
const register unsigned long long int gk = ((k << bits_s) << bits_m) << bits_m;
const register unsigned long long int gism = ((i << bits_s) + sm) << bits_m;
const register unsigned long long int gjsm = (j << bits_s) + sm;
const register unsigned long long int gksm = (((k << bits_s) + sm) << bits_m) << bits_m;
// compute and store the occupancy value of the grid (i, j, k)
M[gk + gi + gj] = M[gk + gi + gj] || M[gk + gi + gjsm] || M[gk + gism + gj] || M[gk + gism + gjsm] ||
M[gksm + gi + gj] || M[gksm + gi + gjsm] || M[gksm + gism + gj] || M[gksm + gism + gjsm];
atomicAdd(n, M[gk + gi + gj]);
}
cudaError_t CudaBC3D(const unsigned char* M, const long long int m, const unsigned char bits_m, const unsigned int TPB, const int nn, unsigned int* n)
{
cudaError_t cudaStatus;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
return cudaStatus;
}
// CPU-GPU data transfers
unsigned char* device_M = 0; // matrix M in GPU device
unsigned int* device_n = 0; // array n in GPU device
cudaStatus = cudaMalloc((void**)&device_M, m * m * m * sizeof(unsigned char));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
return cudaStatus;
}
cudaStatus = cudaMalloc((void**)&device_n, nn * sizeof(unsigned int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
cudaFree(device_M);
return cudaStatus;
}
cudaStatus = cudaMemcpy(device_M, M, m * m * m * sizeof(unsigned char), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
cudaFree(device_M);
cudaFree(device_n);
return cudaStatus;
}
cudaStatus = cudaMemset(device_n, 0, nn * sizeof(unsigned int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
cudaFree(device_M);
cudaFree(device_n);
return cudaStatus;
}
dim3 grid, block(TPB, 1, 1); // variables for kernel launching
unsigned int s = 2;
unsigned int size = m;
unsigned char ni = 0;
unsigned char bits_TPB = log(TPB) / log(2); // 2^bits_TPB = TPB
while (size > 2) {
unsigned int sm = s >> 1; // sm = s/2
unsigned long long int num_box = (m * m * m) / (s * s * s);
if (num_box >= TPB) {
grid.x = ceilf(num_box / (float)TPB); // m/s * m/s * m/s= (grid_size * TPB)
grid.y = 1; grid.z = 1;
}
else {
grid.x = 1; grid.y = 1; grid.z = 1;
block.x = num_box; block.y = 1; block.z = 1;
bits_TPB = log(num_box) / log(2);
}
// BCKernel call. Compute box-counting for grids of size s x s
BCKernel3D << <grid, block >> > (device_M, m, bits_m, sm, ni + 1, bits_TPB, &device_n[ni]);
ni++;
s <<= 1;
size >>= 1;
}
// GPU-CPU data transfer of the box-counting results
cudaStatus = cudaMemcpy(n, device_n, nn * sizeof(unsigned int), cudaMemcpyDeviceToHost);
cudaFree(device_M);
cudaFree(device_n);
return cudaStatus;
}