Elliot Barlas: Use proper hash key collision detection scheme
* Use open-addressing scheme to deal with hash table collisions. Reduce concurrency from 16 to 8. Use bit mask rather than mod operator to confine hash code to table range.
* Properly handle file partitions that reside entirely within a line.
* Reorder statements in doProcessBuffer.
This commit is contained in:
		| @@ -17,4 +17,4 @@ | |||||||
|  |  | ||||||
| sdk use java 21.0.1-graalce | sdk use java 21.0.1-graalce | ||||||
| JAVA_OPTS="" | JAVA_OPTS="" | ||||||
| time java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_ebarlas measurements.txt 16 | time java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_ebarlas measurements.txt 8 | ||||||
|   | |||||||
| @@ -21,13 +21,15 @@ import java.nio.channels.FileChannel; | |||||||
| import java.nio.charset.StandardCharsets; | import java.nio.charset.StandardCharsets; | ||||||
| import java.nio.file.Paths; | import java.nio.file.Paths; | ||||||
| import java.nio.file.StandardOpenOption; | import java.nio.file.StandardOpenOption; | ||||||
|  | import java.util.Arrays; | ||||||
| import java.util.List; | import java.util.List; | ||||||
| import java.util.TreeMap; | import java.util.TreeMap; | ||||||
|  |  | ||||||
| public class CalculateAverage_ebarlas { | public class CalculateAverage_ebarlas { | ||||||
|  |  | ||||||
|     private static final int HASH_FACTOR = 278; |     private static final int MAX_KEY_SIZE = 100; | ||||||
|     private static final int HASH_MOD = 3_487; |     private static final int HASH_FACTOR = 433; | ||||||
|  |     private static final int HASH_TBL_SIZE = 16_383; // range of allowed hash values, inclusive | ||||||
|  |  | ||||||
|     public static void main(String[] args) throws IOException, InterruptedException { |     public static void main(String[] args) throws IOException, InterruptedException { | ||||||
|         if (args.length != 2) { |         if (args.length != 2) { | ||||||
| @@ -92,11 +94,7 @@ public class CalculateAverage_ebarlas { | |||||||
|             var current = partitions.get(i).stats; |             var current = partitions.get(i).stats; | ||||||
|             for (int j = 0; j < current.length; j++) { |             for (int j = 0; j < current.length; j++) { | ||||||
|                 if (current[j] != null) { |                 if (current[j] != null) { | ||||||
|                     var t = target[j]; |                     var t = findInTable(target, current[j].hash, current[j].key, current[j].key.length); | ||||||
|                     if (t == null) { |  | ||||||
|                         target[j] = current[j]; // copy ref from current to target |  | ||||||
|                     } |  | ||||||
|                     else { |  | ||||||
|                     t.min = Math.min(t.min, current[j].min); |                     t.min = Math.min(t.min, current[j].min); | ||||||
|                     t.max = Math.max(t.max, current[j].max); |                     t.max = Math.max(t.max, current[j].max); | ||||||
|                     t.sum += current[j].sum; |                     t.sum += current[j].sum; | ||||||
| @@ -104,7 +102,6 @@ public class CalculateAverage_ebarlas { | |||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|         } |  | ||||||
|         return target; |         return target; | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -114,7 +111,12 @@ public class CalculateAverage_ebarlas { | |||||||
|             var pPrev = partitions.get(i - 1); |             var pPrev = partitions.get(i - 1); | ||||||
|             var merged = mergeFooterAndHeader(pPrev.footer, pNext.header); |             var merged = mergeFooterAndHeader(pPrev.footer, pNext.header); | ||||||
|             if (merged != null) { |             if (merged != null) { | ||||||
|                 doProcessBuffer(ByteBuffer.wrap(merged), true, pPrev.stats); // fold into prev partition |                 if (merged[merged.length - 1] == '\n') { // fold into prev partition | ||||||
|  |                     doProcessBuffer(ByteBuffer.wrap(merged), true, pPrev.stats); | ||||||
|  |                 } | ||||||
|  |                 else { // no newline appeared in partition, carry forward | ||||||
|  |                     pNext.footer = merged; | ||||||
|  |                 } | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| @@ -133,32 +135,36 @@ public class CalculateAverage_ebarlas { | |||||||
|     } |     } | ||||||
|  |  | ||||||
|     private static Partition processBuffer(ByteBuffer buffer, boolean first) { |     private static Partition processBuffer(ByteBuffer buffer, boolean first) { | ||||||
|         return doProcessBuffer(buffer, first, new Stats[HASH_MOD * 2]); |         return doProcessBuffer(buffer, first, new Stats[HASH_TBL_SIZE + 1]); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     private static Partition doProcessBuffer(ByteBuffer buffer, boolean first, Stats[] stats) { |     private static Partition doProcessBuffer(ByteBuffer buffer, boolean first, Stats[] stats) { | ||||||
|         var readingKey = true; |  | ||||||
|         var keyHash = 0; |  | ||||||
|         var keyStart = 0; |  | ||||||
|         var negative = false; |  | ||||||
|         var val = 0; |  | ||||||
|         var header = first ? null : readHeader(buffer); |         var header = first ? null : readHeader(buffer); | ||||||
|  |         var readingKey = true; // reading key or value? | ||||||
|  |         var keyBuf = new byte[MAX_KEY_SIZE]; // buffer for key | ||||||
|  |         var keyPos = 0; // current position in key buffer | ||||||
|  |         var keyHash = 0; // accumulating hash of key | ||||||
|  |         var keyStart = buffer.position(); // start of key in buffer used for footer calc | ||||||
|  |         var negative = false; // is value negative? | ||||||
|  |         var val = 0; // accumulating value | ||||||
|         Stats st = null; |         Stats st = null; | ||||||
|         while (buffer.hasRemaining()) { |         while (buffer.hasRemaining()) { | ||||||
|             var b = buffer.get(); |             var b = buffer.get(); | ||||||
|             if (readingKey) { |             if (readingKey) { | ||||||
|                 if (b == ';') { |                 if (b != ';') { | ||||||
|                     var idx = HASH_MOD + keyHash % HASH_MOD; |                     keyHash = HASH_FACTOR * keyHash + b; | ||||||
|                     st = stats[idx]; |                     keyBuf[keyPos++] = b; | ||||||
|                     if (st == null) { |  | ||||||
|                         var key = new byte[buffer.position() - keyStart - 1]; |  | ||||||
|                         buffer.get(keyStart, key, 0, key.length); |  | ||||||
|                         st = stats[idx] = new Stats(key); |  | ||||||
|                     } |  | ||||||
|                     readingKey = false; |  | ||||||
|                 } |                 } | ||||||
|                 else { |                 else { | ||||||
|                     keyHash = HASH_FACTOR * keyHash + b; |                     var idx = keyHash & HASH_TBL_SIZE; | ||||||
|  |                     st = stats[idx]; | ||||||
|  |                     if (st == null) { // nothing in table, eagerly claim spot | ||||||
|  |                         st = stats[idx] = newStats(keyBuf, keyPos, keyHash); | ||||||
|  |                     } | ||||||
|  |                     else if (!Arrays.equals(st.key, 0, st.key.length, keyBuf, 0, keyPos)) { | ||||||
|  |                         st = findInTable(stats, keyHash, keyBuf, keyPos); | ||||||
|  |                     } | ||||||
|  |                     readingKey = false; | ||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
|             else { |             else { | ||||||
| @@ -173,6 +179,7 @@ public class CalculateAverage_ebarlas { | |||||||
|                     val = 0; |                     val = 0; | ||||||
|                     negative = false; |                     negative = false; | ||||||
|                     keyStart = buffer.position(); |                     keyStart = buffer.position(); | ||||||
|  |                     keyPos = 0; | ||||||
|                 } |                 } | ||||||
|                 else if (b == '-') { |                 else if (b == '-') { | ||||||
|                     negative = true; |                     negative = true; | ||||||
| @@ -186,6 +193,25 @@ public class CalculateAverage_ebarlas { | |||||||
|         return new Partition(header, footer, stats); |         return new Partition(header, footer, stats); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     private static Stats findInTable(Stats[] stats, int hash, byte[] key, int len) { // open-addressing scan | ||||||
|  |         var idx = hash & HASH_TBL_SIZE; | ||||||
|  |         var st = stats[idx]; | ||||||
|  |         while (st != null && !Arrays.equals(st.key, 0, st.key.length, key, 0, len)) { | ||||||
|  |             idx = (idx + 1) % (HASH_TBL_SIZE + 1); | ||||||
|  |             st = stats[idx]; | ||||||
|  |         } | ||||||
|  |         if (st != null) { | ||||||
|  |             return st; | ||||||
|  |         } | ||||||
|  |         return stats[idx] = newStats(key, len, hash); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     private static Stats newStats(byte[] buffer, int len, int hash) { | ||||||
|  |         var k = new byte[len]; | ||||||
|  |         System.arraycopy(buffer, 0, k, 0, len); | ||||||
|  |         return new Stats(k, hash); | ||||||
|  |     } | ||||||
|  |  | ||||||
|     private static byte[] readFooter(ByteBuffer buffer, int lineStart) { // read from line start to current pos (end-of-input) |     private static byte[] readFooter(ByteBuffer buffer, int lineStart) { // read from line start to current pos (end-of-input) | ||||||
|         var footer = new byte[buffer.position() - lineStart]; |         var footer = new byte[buffer.position() - lineStart]; | ||||||
|         buffer.get(lineStart, footer, 0, footer.length); |         buffer.get(lineStart, footer, 0, footer.length); | ||||||
| @@ -200,18 +226,29 @@ public class CalculateAverage_ebarlas { | |||||||
|         return header; |         return header; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     record Partition(byte[] header, byte[] footer, Stats[] stats) { |     private static class Partition { | ||||||
|  |         byte[] header; | ||||||
|  |         byte[] footer; | ||||||
|  |         Stats[] stats; | ||||||
|  |  | ||||||
|  |         Partition(byte[] header, byte[] footer, Stats[] stats) { | ||||||
|  |             this.header = header; | ||||||
|  |             this.footer = footer; | ||||||
|  |             this.stats = stats; | ||||||
|  |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     private static class Stats { // min, max, and sum values are modeled with integral types that represent tenths of a unit |     private static class Stats { // min, max, and sum values are modeled with integral types that represent tenths of a unit | ||||||
|         final byte[] key; |         final byte[] key; | ||||||
|  |         final int hash; | ||||||
|         int min = Integer.MAX_VALUE; |         int min = Integer.MAX_VALUE; | ||||||
|         int max = Integer.MIN_VALUE; |         int max = Integer.MIN_VALUE; | ||||||
|         long sum; |         long sum; | ||||||
|         long count; |         long count; | ||||||
|  |  | ||||||
|         Stats(byte[] key) { |         Stats(byte[] key, int hash) { | ||||||
|             this.key = key; |             this.key = key; | ||||||
|  |             this.hash = hash; | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user