Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

adopt @spullara's double parsing code #58

Merged
merged 3 commits into from
Jan 4, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,26 @@ static String bufferToString(ByteBuffer slice) {
return new String(bytes, StandardCharsets.UTF_8);
}

static double parseTemperature(ByteBuffer slice) {
// credit: adapted from spullara's submission
int value = 0;
int negative = 1;
int i = 0;
while (i != slice.limit()) {
byte b = slice.get(i++);
switch (b) {
case '-':
negative = -1;
case '.':
break;
default:
value = 10 * value + (b - '0');
}
}
value *= negative;
return value / 10.0;
}

@FunctionalInterface
interface IndexedStringConsumer {
void accept(String value, int index);
Expand All @@ -60,7 +80,7 @@ interface IndexedStringConsumer {
/** Maps text to an integer encoding. Adapted from async-profiler. */
public static class Dictionary {

private static final int ROW_BITS = 7;
private static final int ROW_BITS = 12;
private static final int ROWS = (1 << ROW_BITS);
private static final int CELLS = 3;
private static final int TABLE_CAPACITY = (ROWS * CELLS);
Expand Down Expand Up @@ -90,10 +110,10 @@ public void forEach(IndexedStringConsumer consumer) {
forEach(this.table, consumer);
}

public int encode(long hash, ByteBuffer slice) {
public int encode(int hash, ByteBuffer slice) {
Table table = this.table;
while (true) {
int rowIndex = (int)(Math.abs(hash) % ROWS);
int rowIndex = Math.abs(hash) % ROWS;
Row row = table.rows[rowIndex];
for (int c = 0; c < CELLS; c++) {
ByteBuffer storedKey = row.keys.get(c);
Expand All @@ -111,7 +131,7 @@ public int encode(long hash, ByteBuffer slice) {
}
}
table = row.getOrCreateNextTable(this::nextBaseIndex);
hash = Long.rotateRight(hash, ROW_BITS);
hash = Integer.rotateRight(hash, ROW_BITS);
}
}

Expand Down Expand Up @@ -207,43 +227,6 @@ private static int findIndexOf(ByteBuffer buffer, int offset, long pattern) {
return buffer.limit();
}

private static long hash(ByteBuffer slice) {
long hash = slice.limit() + PRIME_5 + 0x123456789abcdef1L;
int i = 0;
for (; i + Long.BYTES < slice.limit(); i += Long.BYTES) {
hash = hashLong(hash, slice.getLong(i));
}
long part = 0L;
for (; i < slice.limit(); i++) {
part = (part >>> 8) | ((slice.get(i) & 0xFFL) << 56);
}
hash = hashLong(hash, part);
return mix(hash);
}

static final long PRIME_1 = 0x9E3779B185EBCA87L;
static final long PRIME_2 = 0xC2B2AE3D27D4EB4FL;
static final long PRIME_3 = 0x165667B19E3779F9L;
static final long PRIME_4 = 0x85EBCA77C2B2AE63L;
static final long PRIME_5 = 0x27D4EB2F165667C5L;

private static long hashLong(long hash, long k) {
k *= PRIME_2;
k = Long.rotateLeft(k, 31);
k *= PRIME_1;
hash ^= k;
return Long.rotateLeft(hash, 27) * PRIME_1 + PRIME_4;
}

private static long mix(long hash) {
hash ^= hash >>> 33;
hash *= PRIME_2;
hash ^= hash >>> 29;
hash *= PRIME_3;
hash ^= hash >>> 32;
return hash;
}

static class Page {

static final int PAGE_SIZE = 1024;
Expand Down Expand Up @@ -311,16 +294,12 @@ private void computeSlice(ByteBuffer slice, double[][] pages) {
ByteBuffer key = slice.slice(offset, nextSeparator - offset).order(ByteOrder.LITTLE_ENDIAN);
// find the global dictionary code to aggregate,
// making this code global allows easy merging
int dictId = dictionary.encode(hash(key), key);
int dictId = dictionary.encode(key.hashCode(), key);

offset = nextSeparator + 1;
int newLine = findIndexOf(slice, offset, NEW_LINE);
// parse the double
// todo do this without allocating a string, could use a fast parsing falgorithm
var bytes = new byte[newLine - offset];
slice.get(offset, bytes);
var string = new String(bytes, StandardCharsets.US_ASCII);
double d = Double.parseDouble(string);
double d = parseTemperature(slice.slice(offset, newLine - offset));

Page.update(pages, dictId, d);

Expand Down Expand Up @@ -351,7 +330,7 @@ private static void merge(double[][] contribution, double[][] aggregate) {
protected double[][] compute() {
if (min == max) {
// fixme - hardcoded to problem size
var pages = new double[1024][];
var pages = new double[600][];
var slice = slices.get(min);
computeSlice(slice, pages);
return pages;
Expand All @@ -368,7 +347,7 @@ protected double[][] compute() {
}

public static void main(String[] args) throws IOException {
int maxChunkSize = 10 << 20; // 10MiB
int maxChunkSize = 250 << 20; // 250MiB
try (var raf = new RandomAccessFile(FILE, "r");
var channel = raf.getChannel()) {
long size = channel.size();
Expand Down
Loading