-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathRadixSortMsdParallel.h
319 lines (287 loc) · 14.6 KB
/
RadixSortMsdParallel.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
#pragma once
#ifndef _RadixSortMsdParallel_h
#define _RadixSortMsdParallel_h
// TBB-only implementation
#include "tbb/tbb.h"
#include <tbb/parallel_invoke.h>
#include "InsertionSort.h"
#include "BinarySearch.h"
#include <iostream>
#include <algorithm>
#include <chrono>
#include <iostream>
#include <random>
#include <ratio>
#include <vector>
#include <execution>
#include <thread>
using namespace tbb;
#include "RadixSortCommon.h"
#include "RadixSortMSD.h"
#include "InsertionSort.h"
#include "HistogramParallel.h"
namespace ParallelAlgorithms
{
// Simplified the implementation of the inner loop.
template< class _Type, unsigned long PowerOfTwoRadix, unsigned long Log2ofPowerOfTwoRadix, long Threshold >
inline void _RadixSort_Unsigned_PowerOf2Radix_Par_L1(_Type* a, size_t a_size, _Type bitMask, unsigned long shiftRightAmount)
{
size_t last = a_size - 1;
#if 0
size_t count[PowerOfTwoRadix];
for (unsigned long i = 0; i < PowerOfTwoRadix; i++) count[i] = 0;
for (size_t _current = 0; _current <= last; _current++) // Scan the array and count the number of times each value appears
count[(unsigned long)((a[_current] & bitMask) >> shiftRightAmount)]++;
#else
size_t* count = HistogramOneByteComponentParallel< PowerOfTwoRadix, Log2ofPowerOfTwoRadix >(a, 0, last, shiftRightAmount);
#endif
size_t startOfBin[PowerOfTwoRadix + 1], endOfBin[PowerOfTwoRadix], nextBin = 1;
startOfBin[0] = endOfBin[0] = 0; startOfBin[PowerOfTwoRadix] = 0; // sentinal
for (unsigned long i = 1; i < PowerOfTwoRadix; i++)
startOfBin[i] = endOfBin[i] = startOfBin[i - 1] + count[i - 1];
for (size_t _current = 0; _current <= last; )
{
unsigned digit;
_Type _current_element = a[_current]; // get the compiler to recognize that a register can be used for the loop instead of a[_current] memory location
while (endOfBin[digit = (unsigned)((_current_element & bitMask) >> shiftRightAmount)] != _current) _swap(_current_element, a[endOfBin[digit]++]);
a[_current] = _current_element;
endOfBin[digit]++;
while (endOfBin[nextBin - 1] == startOfBin[nextBin]) nextBin++; // skip over empty and full bins, when the end of the current bin reaches the start of the next bin
_current = endOfBin[nextBin - 1];
}
bitMask >>= Log2ofPowerOfTwoRadix;
if (bitMask != 0) // end recursion when all the bits have been processes
{
if (shiftRightAmount >= Log2ofPowerOfTwoRadix) shiftRightAmount -= Log2ofPowerOfTwoRadix;
else shiftRightAmount = 0;
#if 0
for (unsigned long i = 0; i < PowerOfTwoRadix; i++)
{
size_t numberOfElements = endOfBin[i] - startOfBin[i]; // endOfBin actually points to one beyond the bin
if (numberOfElements >= Threshold)
_RadixSort_Unsigned_PowerOf2Radix_Par_L1< _Type, PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold >(&a[startOfBin[i]], numberOfElements, bitMask, shiftRightAmount);
else if (numberOfElements >= 2)
insertionSortSimilarToSTLnoSelfAssignment(&a[startOfBin[i]], numberOfElements);
}
#else
// Multi-core version of the algorithm
#if defined(USE_PPL)
Concurrency::task_group g;
#else
tbb::task_group g;
#endif
for (unsigned long i = 0; i < PowerOfTwoRadix; i++)
{
size_t numberOfElements = endOfBin[i] - startOfBin[i];
if (numberOfElements >= Threshold) // endOfBin actually points to one beyond the bin
g.run([=] { // important to not pass by reference, as all tasks will then get the same/last value
_RadixSort_Unsigned_PowerOf2Radix_Par_L1< _Type, PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold >(&a[startOfBin[i]], numberOfElements, bitMask, shiftRightAmount);
});
else if (numberOfElements >= 2)
insertionSortSimilarToSTLnoSelfAssignment(&a[startOfBin[i]], numberOfElements);
}
g.wait();
#endif
}
}
// Permute phase of MSD Radix Sort with de-randomized write memory accesses
// Derandomizes system memory accesses by buffering all Radix bin accesses, turning 256-bin random memory writes into sequential writes
// Separates read pointers from write pointers of/to each bin
// Idea: It may be better to implement read/write buffering where all of the swaps happen within those buffers with writes dumping out to the bins and fetching the next buffer
// It's similar to caching, but doing it in a more cache-friendly way where all of the bin-buffers can fit into the cache and not map on top of each other. Otherwise, with
// bins we get data-dependent cache thrashing. Plus, all of the swapping will be within the buffers which are well organized for caching.
template< unsigned long PowerOfTwoRadix, unsigned long Log2ofPowerOfTwoRadix, long Threshold, unsigned long BufferDepth>
inline void _RadixSortMSD_StableUnsigned_PowerOf2Radix_PermuteDerandomized_1(unsigned long* inout_array, size_t startIndex, size_t endIndex, unsigned long bitMask, unsigned long shiftRightAmount,
size_t* startOfBin, size_t* endOfBin, unsigned long bufferIndex[], unsigned long bufferDerandomize[][BufferDepth])
{
// TODO: This version is broken and needs to be fixed!!
//printf("Permute Derandomized #1: startIndex = %zu endIndex = %zu bitMask = %lx shiftRight = %lu \n", startIndex, endIndex, bitMask, shiftRightAmount);
const unsigned long NumberOfBins = PowerOfTwoRadix;
size_t writeEndOfBin[NumberOfBins]; // write pointers to each bin
std::copy(endOfBin + 0, endOfBin + NumberOfBins, writeEndOfBin); // copy read pointers (endOfBin) to write pointers
size_t nextBin = 1;
for (size_t _current = startIndex; _current <= endIndex;)
{
unsigned long digit;
unsigned long _current_element = inout_array[_current]; // get the compiler to recognize that a register can be used for the loop instead of a[_current] memory location
while (endOfBin[digit = (unsigned long)((_current_element & bitMask) >> shiftRightAmount)] != _current)
{
unsigned long tmp = _current_element; // read of the current element to squirl it away
_current_element = inout_array[endOfBin[digit]]; // read from a bin and place in the current element
endOfBin[digit]++; // advance the read pointer of that bin
//a[endOfBin[digit]] = tmp; // write the current element to that bin. This is the one (the write) which needs to be buffered
if (bufferIndex[digit] < BufferDepth)
{
bufferDerandomize[digit][bufferIndex[digit]++] = tmp; // write the current element into the buffer for that bin
}
else
{
unsigned long outIndex = writeEndOfBin[digit];
unsigned long* buff = &(bufferDerandomize[digit][0]);
#if 1
memcpy(&(inout_array[outIndex]), buff, BufferDepth * sizeof(unsigned long)); // significantly faster than a for loop. Need to try std::copy - simpler interface
#else
unsigned long* outBuff = &(output_array[outIndex]);
for (unsigned long i = 0; i < BufferDepth; i++)
*outBuff++ = *buff++;
#endif
writeEndOfBin[digit] += BufferDepth;
bufferDerandomize[digit][0] = tmp; // write the current element into the buffer for that bin
bufferIndex[digit] = 1;
}
}
inout_array[_current] = _current_element; // write the current element into the current bin
endOfBin[digit]++; // advance the read pointer
writeEndOfBin[digit]++; // advance the write pointer
while (endOfBin[nextBin - 1] == startOfBin[nextBin]) nextBin++; // skip over empty and full bins, when the end of the current bin reaches the start of the next bin
_current = endOfBin[nextBin - 1];
}
// Flush all the derandomization buffers
for (unsigned long whichBuff = 0; whichBuff < NumberOfBins; whichBuff++)
{
unsigned long numOfElementsInBuff = bufferIndex[whichBuff];
for (size_t i = 0; i < numOfElementsInBuff; i++)
inout_array[writeEndOfBin[whichBuff]++] = bufferDerandomize[whichBuff][i];
bufferIndex[whichBuff] = 0;
}
}
// Permute phase of MSD Radix Sort with de-randomized write memory accesses
// Derandomizes system memory accesses by buffering all Radix bin accesses, turning 256-bin random memory writes into sequential writes
template< unsigned long PowerOfTwoRadix, unsigned long Log2ofPowerOfTwoRadix, long Threshold, unsigned long BufferDepth>
inline void _RadixSortMSD_StableUnsigned_PowerOf2Radix_PermuteDerandomized(unsigned long* inout_array, size_t startIndex, size_t endIndex, unsigned long bitMask, unsigned long shiftRightAmount,
size_t* endOfBin, unsigned long bufferIndex[], unsigned long bufferDerandomize[][BufferDepth])
{
printf("Permute Derandomized: startIndex = %zu endIndex = %zu bitMask = %lx shiftRight = %lu \n", startIndex, endIndex, bitMask, shiftRightAmount);
const unsigned long NumberOfBins = PowerOfTwoRadix;
for (size_t _current = startIndex; _current <= endIndex; _current++)
{
unsigned long digit = extractDigit(inout_array[_current], bitMask, shiftRightAmount);
if (bufferIndex[digit] < BufferDepth)
{
bufferDerandomize[digit][bufferIndex[digit]++] = inout_array[_current];
}
else
{
unsigned long outIndex = endOfBin[digit];
unsigned long* buff = &(bufferDerandomize[digit][0]);
#if 1
memcpy(&(inout_array[outIndex]), buff, BufferDepth * sizeof(unsigned long)); // significantly faster than a for loop
#else
unsigned long* outBuff = &(output_array[outIndex]);
for (unsigned long i = 0; i < BufferDepth; i++)
*outBuff++ = *buff++;
#endif
endOfBin[digit] += BufferDepth;
bufferDerandomize[digit][0] = inout_array[_current];
bufferIndex[digit] = 1;
}
}
// Flush all the derandomization buffers
for (unsigned long whichBuff = 0; whichBuff < NumberOfBins; whichBuff++)
{
unsigned long numOfElementsInBuff = bufferIndex[whichBuff];
for (size_t i = 0; i < numOfElementsInBuff; i++)
inout_array[endOfBin[whichBuff]++] = bufferDerandomize[whichBuff][i];
bufferIndex[whichBuff] = 0;
}
}
// Simplified the implementation of the inner loop.
template< unsigned long PowerOfTwoRadix, unsigned long Log2ofPowerOfTwoRadix, long Threshold >
inline void _RadixSort_Unsigned_PowerOf2Radix_Derandomized_Par_L1(unsigned long* a, size_t a_size, unsigned long bitMask, unsigned long shiftRightAmount)
{
size_t last = a_size - 1;
#if 0
size_t count[PowerOfTwoRadix];
for (unsigned long i = 0; i < PowerOfTwoRadix; i++) count[i] = 0;
for (size_t _current = 0; _current <= last; _current++) // Scan the array and count the number of times each value appears
count[(unsigned long)((a[_current] & bitMask) >> shiftRightAmount)]++;
#else
size_t* count = HistogramOneByteComponentParallel< PowerOfTwoRadix, Log2ofPowerOfTwoRadix >(a, 0, last, shiftRightAmount);
#endif
size_t startOfBin[PowerOfTwoRadix + 1], endOfBin[PowerOfTwoRadix];
startOfBin[0] = endOfBin[0] = 0; startOfBin[PowerOfTwoRadix] = 0; // sentinal
for (unsigned long i = 1; i < PowerOfTwoRadix; i++)
startOfBin[i] = endOfBin[i] = startOfBin[i - 1] + count[i - 1];
#if 1
size_t nextBin = 1;
for (size_t _current = 0; _current <= last; )
{
unsigned long digit;
unsigned long _current_element = a[_current]; // get the compiler to recognize that a register can be used for the loop instead of a[_current] memory location
while (endOfBin[digit = (unsigned long)((_current_element & bitMask) >> shiftRightAmount)] != _current) _swap(_current_element, a[endOfBin[digit]++]);
a[_current] = _current_element;
endOfBin[digit]++;
while (endOfBin[nextBin - 1] == startOfBin[nextBin]) nextBin++; // skip over empty and full bins, when the end of the current bin reaches the start of the next bin
_current = endOfBin[nextBin - 1];
}
#else
// TODO: This version is broken and needs to be fixed!!
const unsigned long NumberOfBins = PowerOfTwoRadix;
static const unsigned long bufferDepth = 128;
__declspec(align(64)) unsigned long bufferDerandomize[NumberOfBins][bufferDepth];
__declspec(align(64)) unsigned long bufferIndex[NumberOfBins] = { 0 };
//printf("Before Permute Derandomized \n");
_RadixSortMSD_StableUnsigned_PowerOf2Radix_PermuteDerandomized_1< PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold, bufferDepth>(
a, 0, last, bitMask, shiftRightAmount, startOfBin, endOfBin, bufferIndex, bufferDerandomize);
//printf("After Permute Derandomized \n");
#endif
bitMask >>= Log2ofPowerOfTwoRadix;
if (bitMask != 0) // end recursion when all the bits have been processes
{
if (shiftRightAmount >= Log2ofPowerOfTwoRadix) shiftRightAmount -= Log2ofPowerOfTwoRadix;
else shiftRightAmount = 0;
#if 0
for (unsigned long i = 0; i < PowerOfTwoRadix; i++)
{
size_t numberOfElements = endOfBin[i] - startOfBin[i]; // endOfBin actually points to one beyond the bin
if (numberOfElements >= Threshold)
_RadixSort_Unsigned_PowerOf2Radix_Derandomized_Par_L1< PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold >(&a[startOfBin[i]], numberOfElements, bitMask, shiftRightAmount);
else if (numberOfElements >= 2)
insertionSortSimilarToSTLnoSelfAssignment(&a[startOfBin[i]], numberOfElements);
}
#else
// Multi-core version of the algorithm
#if defined(USE_PPL)
Concurrency::task_group g;
#else
tbb::task_group g;
#endif
for (unsigned long i = 0; i < PowerOfTwoRadix; i++)
{
size_t numberOfElements = endOfBin[i] - startOfBin[i];
if (numberOfElements >= Threshold) // endOfBin actually points to one beyond the bin
g.run([=] { // important to not pass by reference, as all tasks will then get the same/last value
_RadixSort_Unsigned_PowerOf2Radix_Derandomized_Par_L1< PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold >(&a[startOfBin[i]], numberOfElements, bitMask, shiftRightAmount);
});
else if (numberOfElements >= 2)
insertionSortSimilarToSTLnoSelfAssignment(&a[startOfBin[i]], numberOfElements);
}
g.wait();
#endif
}
}
inline void parallel_hybrid_inplace_msd_radix_sort(unsigned* a, size_t a_size)
{
if (a_size < 2) return;
const long PowerOfTwoRadix = 256;
const long Log2ofPowerOfTwoRadix = 8;
const long Threshold = 100;
unsigned bitMask = 0x80000000; // bitMask controls how many bits we process at a time
unsigned shiftRightAmount = 31;
for (size_t i = 2; i < PowerOfTwoRadix; ) // if not power-of-two value then it will do up to the largest power-of-two value
{ // that's smaller than the value provided (e.g. radix-10 will do radix-8)
bitMask |= (bitMask >> 1);
shiftRightAmount -= 1;
i <<= 1;
}
if (a_size >= Threshold)
{
_RadixSort_Unsigned_PowerOf2Radix_Par_L1< unsigned, PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold >(a, a_size, bitMask, shiftRightAmount); // same speed as de-randomization on 6-core
//_RadixSort_Unsigned_PowerOf2Radix_Derandomized_Par_L1< PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold >(a, a_size, bitMask, shiftRightAmount);
}
else
insertionSortSimilarToSTLnoSelfAssignment(a, a_size);
//insertionSortHybrid(a, a_size);
}
}
#endif