gunnarmorling · gunnarmorling · Jan 4, 2024 · Jan 3, 2024 · Jan 3, 2024 · Jan 3, 2024
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_richardstartin.java b/src/main/java/dev/morling/onebrc/CalculateAverage_richardstartin.java
@@ -52,6 +52,26 @@ static String bufferToString(ByteBuffer slice) {
         return new String(bytes, StandardCharsets.UTF_8);
     }
 
+    static double parseTemperature(ByteBuffer slice) {
+        // credit: adapted from spullara's submission
+        int value = 0;
+        int negative = 1;
+        int i = 0;
+        while (i != slice.limit()) {
+            byte b = slice.get(i++);
+            switch (b) {
+                case '-':
+                    negative = -1;
+                case '.':
+                    break;
+                default:
+                    value = 10 * value + (b - '0');
+            }
+        }
+        value *= negative;
+        return value / 10.0;
+    }
+
     @FunctionalInterface
     interface IndexedStringConsumer {
         void accept(String value, int index);
@@ -60,7 +80,7 @@ interface IndexedStringConsumer {
     /** Maps text to an integer encoding. Adapted from async-profiler. */
     public static class Dictionary {
 
-        private static final int ROW_BITS = 7;
+        private static final int ROW_BITS = 12;
         private static final int ROWS = (1 << ROW_BITS);
         private static final int CELLS = 3;
         private static final int TABLE_CAPACITY = (ROWS * CELLS);
@@ -90,10 +110,10 @@ public void forEach(IndexedStringConsumer consumer) {
             forEach(this.table, consumer);
         }
 
-        public int encode(long hash, ByteBuffer slice) {
+        public int encode(int hash, ByteBuffer slice) {
             Table table = this.table;
             while (true) {
-                int rowIndex = (int)(Math.abs(hash) % ROWS);
+                int rowIndex = Math.abs(hash) % ROWS;
                 Row row = table.rows[rowIndex];
                 for (int c = 0; c < CELLS; c++) {
                     ByteBuffer storedKey = row.keys.get(c);
@@ -111,7 +131,7 @@ public int encode(long hash, ByteBuffer slice) {
                     }
                 }
                 table = row.getOrCreateNextTable(this::nextBaseIndex);
-                hash = Long.rotateRight(hash, ROW_BITS);
+                hash = Integer.rotateRight(hash, ROW_BITS);
             }
         }
 
@@ -207,43 +227,6 @@ private static int findIndexOf(ByteBuffer buffer, int offset, long pattern) {
         return buffer.limit();
     }
 
-    private static long hash(ByteBuffer slice) {
-        long hash = slice.limit() + PRIME_5 + 0x123456789abcdef1L;
-        int i = 0;
-        for (; i + Long.BYTES < slice.limit(); i += Long.BYTES) {
-            hash = hashLong(hash, slice.getLong(i));
-        }
-        long part = 0L;
-        for (; i < slice.limit(); i++) {
-            part = (part >>> 8) | ((slice.get(i) & 0xFFL) << 56);
-        }
-        hash = hashLong(hash, part);
-        return mix(hash);
-    }
-
-    static final long PRIME_1 = 0x9E3779B185EBCA87L;
-    static final long PRIME_2 = 0xC2B2AE3D27D4EB4FL;
-    static final long PRIME_3 = 0x165667B19E3779F9L;
-    static final long PRIME_4 = 0x85EBCA77C2B2AE63L;
-    static final long PRIME_5 = 0x27D4EB2F165667C5L;
-
-    private static long hashLong(long hash, long k) {
-        k *= PRIME_2;
-        k = Long.rotateLeft(k, 31);
-        k *= PRIME_1;
-        hash ^= k;
-        return Long.rotateLeft(hash, 27) * PRIME_1 + PRIME_4;
-    }
-
-    private static long mix(long hash) {
-        hash ^= hash >>> 33;
-        hash *= PRIME_2;
-        hash ^= hash >>> 29;
-        hash *= PRIME_3;
-        hash ^= hash >>> 32;
-        return hash;
-    }
-
     static class Page {
 
         static final int PAGE_SIZE = 1024;
@@ -311,16 +294,12 @@ private void computeSlice(ByteBuffer slice, double[][] pages) {
                 ByteBuffer key = slice.slice(offset, nextSeparator - offset).order(ByteOrder.LITTLE_ENDIAN);
                 // find the global dictionary code to aggregate,
                 // making this code global allows easy merging
-                int dictId = dictionary.encode(hash(key), key);
+                int dictId = dictionary.encode(key.hashCode(), key);
 
                 offset = nextSeparator + 1;
                 int newLine = findIndexOf(slice, offset, NEW_LINE);
                 // parse the double
-                // todo do this without allocating a string, could use a fast parsing falgorithm
-                var bytes = new byte[newLine - offset];
-                slice.get(offset, bytes);
-                var string = new String(bytes, StandardCharsets.US_ASCII);
-                double d = Double.parseDouble(string);
+                double d = parseTemperature(slice.slice(offset, newLine - offset));
 
                 Page.update(pages, dictId, d);
 
@@ -351,7 +330,7 @@ private static void merge(double[][] contribution, double[][] aggregate) {
         protected double[][] compute() {
             if (min == max) {
                 // fixme - hardcoded to problem size
-                var pages = new double[1024][];
+                var pages = new double[600][];
                 var slice = slices.get(min);
                 computeSlice(slice, pages);
                 return pages;
@@ -368,7 +347,7 @@ protected double[][] compute() {
     }
 
     public static void main(String[] args) throws IOException {
-        int maxChunkSize = 10 << 20; // 10MiB
+        int maxChunkSize = 250 << 20; // 250MiB
         try (var raf = new RandomAccessFile(FILE, "r");
              var channel = raf.getChannel()) {
             long size = channel.size();