final version for abeobk (#654)

* final version * Correct stupid mistake * min/max trick does not help that much, setting initial value does. * cut the tail
2024-02-01 04:03:20 +09:00
parent f0f6570975
commit 3c454d0222
2 changed files with 393 additions and 355 deletions
--- a/prepare_abeobk.sh
+++ b/prepare_abeobk.sh
@@ -20,6 +20,6 @@ sdk use java 21.0.2-graal 1>&2

 # ./mvnw clean verify removes target/ and will re-trigger native image creation.
 if [ ! -f target/CalculateAverage_abeobk_image ]; then
-    NATIVE_IMAGE_OPTS="--gc=epsilon -O3 -dsa -march=native -H:InlineAllBonus=10 -H:-GenLoopSafepoints -H:-ParseRuntimeOptions --enable-preview --initialize-at-build-time=dev.morling.onebrc.CalculateAverage_abeobk"
+    NATIVE_IMAGE_OPTS="--gc=epsilon -O3 -march=native -H:InlineAllBonus=10 -H:-GenLoopSafepoints --enable-preview --initialize-at-build-time=dev.morling.onebrc.CalculateAverage_abeobk"
    native-image $NATIVE_IMAGE_OPTS -cp target/average-1.0.0-SNAPSHOT.jar -o target/CalculateAverage_abeobk_image dev.morling.onebrc.CalculateAverage_abeobk
 fi
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java
@@ -34,7 +34,6 @@ import java.util.stream.IntStream;
 import sun.misc.Unsafe;

 public class CalculateAverage_abeobk {
-    private static final boolean SHOW_ANALYSIS = false;
    private static final int CPU_CNT = Runtime.getRuntime().availableProcessors();

    private static final String FILE = "./measurements.txt";
@@ -42,7 +41,7 @@ public class CalculateAverage_abeobk {
    private static final long BUCKET_MASK = BUCKET_SIZE - 1;
    private static final int MAX_STR_LEN = 100;
    private static final int MAX_STATIONS = 10000;
-    private static final long CHUNK_SZ = 1 << 22; // 4MB chunk
+    private static final long CHUNK_SZ = 1 << 22;
    private static final Unsafe UNSAFE = initUnsafe();
    private static final long[] HASH_MASKS = new long[]{
            0x0L,
@@ -60,10 +59,6 @@ public class CalculateAverage_abeobk {
    private static int chunk_cnt;
    private static long start_addr, end_addr;

-    private static final void debug(String s, Object... args) {
-        System.out.println(String.format(s, args));
-    }
-
    private static Unsafe initUnsafe() {
        try {
            Field theUnsafe = Unsafe.class.getDeclaredField("theUnsafe");
@@ -75,354 +70,9 @@ public class CalculateAverage_abeobk {
        }
    }

-    // use native type, less conversion
-    static class Node {
-        long addr;
-        long hash;
-        long word0;
-        long tail;
-        long sum;
-        long min, max;
-        int keylen;
-        int count;
-
-        public final String toString() {
-            return (min / 10.0) + "/"
-                    + (Math.round(((double) sum / count)) / 10.0) + "/"
-                    + (max / 10.0);
-        }
-
-        final String key() {
-            byte[] sbuf = new byte[MAX_STR_LEN];
-            UNSAFE.copyMemory(null, addr, sbuf, Unsafe.ARRAY_BYTE_BASE_OFFSET, keylen);
-            return new String(sbuf, 0, (int) keylen, StandardCharsets.UTF_8);
-        }
-
-        Node(long a, long t, int kl, long h) {
-            addr = a;
-            tail = t;
-            min = 999;
-            max = -999;
-            keylen = kl;
-            hash = h;
-        }
-
-        Node(long a, long w0, long t, int kl, long h) {
-            addr = a;
-            word0 = w0;
-            min = 999;
-            max = -999;
-            tail = t;
-            keylen = kl;
-            hash = h;
-        }
-
-        final void add(long val) {
-            sum += val;
-            count++;
-            if (val > max) {
-                max = val;
-            }
-            if (val < min) {
-                min = val;
-            }
-        }
-
-        final void merge(Node other) {
-            sum += other.sum;
-            count += other.count;
-            if (other.max > max) {
-                max = other.max;
-            }
-            if (other.min < min) {
-                min = other.min;
-            }
-        }
-
-        final boolean contentEquals(long other_addr, long other_word0, long other_tail, long kl) {
-            if (word0 != other_word0 || tail != other_tail)
-                return false;
-            // this is faster than comparision if key is short
-            long xsum = 0;
-            long n = kl & 0xF8;
-            for (long i = 8; i < n; i += 8) {
-                xsum |= (UNSAFE.getLong(addr + i) ^ UNSAFE.getLong(other_addr + i));
-            }
-            return xsum == 0;
-        }
-
-        final boolean contentEquals(Node other) {
-            if (tail != other.tail)
-                return false;
-            long n = keylen & 0xF8;
-            for (long i = 0; i < n; i += 8) {
-                if (UNSAFE.getLong(addr + i) != UNSAFE.getLong(other.addr + i))
-                    return false;
-            }
-            return true;
-        }
-    }
-
-    // idea from royvanrijn
-    static final long getSemiPosCode(final long word) {
-        long xor_semi = word ^ 0x3b3b3b3b3b3b3b3bL; // xor with ;;;;;;;;
-        return (xor_semi - 0x0101010101010101L) & (~xor_semi & 0x8080808080808080L);
-    }
-
-    static final long getLFCode(final long word) {
-        long xor_semi = word ^ 0x0A0A0A0A0A0A0A0AL; // xor with \n\n\n\n\n\n\n\n
-        return (xor_semi - 0x0101010101010101L) & (~xor_semi & 0x8080808080808080L);
-    }
-
-    static final long nextLine(long addr) {
-        long word = UNSAFE.getLong(addr);
-        long lfpos_code = getLFCode(word);
-        while (lfpos_code == 0) {
-            addr += 8;
-            word = UNSAFE.getLong(addr);
-            lfpos_code = getLFCode(word);
-        }
-        return addr + (Long.numberOfTrailingZeros(lfpos_code) >>> 3) + 1;
-    }
-
-    // speed/collision balance
-    static final long xxh32(long hash) {
-        long h = hash * 37;
-        return (h ^ (h >>> 29));
-    }
-
-    static final class ChunkParser {
-        long addr;
-        long end;
-        Node[] map;
-
-        ChunkParser(Node[] m, long a, long e) {
-            map = m;
-            addr = a;
-            end = e;
-        }
-
-        final boolean ok() {
-            return addr < end;
-        }
-
-        final long word() {
-            return UNSAFE.getLong(addr);
-        }
-
-        final long val() {
-            long num_word = UNSAFE.getLong(addr);
-            int dot_pos = Long.numberOfTrailingZeros(~num_word & 0x10101000);
-            addr += (dot_pos >>> 3) + 3;
-            // great idea from merykitty (Quan Anh Mai)
-            int shift = 28 - dot_pos;
-            long signed = (~num_word << 59) >> 63;
-            long dsmask = ~(signed & 0xFF);
-            long digits = ((num_word & dsmask) << shift) & 0x0F000F0F00L;
-            long abs_val = ((digits * 0x640a0001) >>> 32) & 0x3FF;
-            return ((abs_val ^ signed) - signed);
-        }
-
-        // optimize for contest
-        // save as much slow memory access as possible
-        // about 50% key < 8chars, 25% key bettween 8-10 chars
-        // keylength histogram (%) = [0, 0, 0, 0, 4, 10, 21, 15, 13, 11, 6, 6, 4, 2...
-        final Node key(long word0, long semipos_code) {
-            long row_addr = addr;
-            // about 50% chance key < 8 chars
-            if (semipos_code != 0) {
-                int semi_pos = Long.numberOfTrailingZeros(semipos_code) >>> 3;
-                addr += semi_pos + 1;
-                long tail = word0 & HASH_MASKS[semi_pos];
-                long hash = xxh32(tail);
-                int bucket = (int) (hash & BUCKET_MASK);
-                while (true) {
-                    Node node = map[bucket];
-                    if (node == null) {
-                        return (map[bucket] = new Node(row_addr, tail, semi_pos, hash));
-                    }
-                    if (node.tail == tail) {
-                        return node;
-                    }
-                    bucket++;
-                }
-            }
-
-            addr += 8;
-            long word = UNSAFE.getLong(addr);
-            semipos_code = getSemiPosCode(word);
-            // 43% chance
-            if (semipos_code != 0) {
-                int semi_pos = Long.numberOfTrailingZeros(semipos_code) >>> 3;
-                addr += semi_pos + 1;
-                long tail = (word & HASH_MASKS[semi_pos]);
-                long hash = xxh32(word0 ^ tail);
-                int bucket = (int) (hash & BUCKET_MASK);
-                while (true) {
-                    Node node = map[bucket];
-                    if (node == null) {
-                        return (map[bucket] = new Node(row_addr, word0, tail, semi_pos + 8, hash));
-                    }
-                    if (node.word0 == word0 && node.tail == tail) {
-                        return node;
-                    }
-                    bucket++;
-                }
-            }
-
-            // why not going for more? tested, slower
-            long hash = word0;
-            while (semipos_code == 0) {
-                hash ^= word;
-                addr += 8;
-                word = UNSAFE.getLong(addr);
-                semipos_code = getSemiPosCode(word);
-            }
-
-            int semi_pos = Long.numberOfTrailingZeros(semipos_code) >>> 3;
-            addr += semi_pos;
-            long keylen = addr - row_addr;
-            addr++;
-            long tail = (word & HASH_MASKS[semi_pos]);
-            hash = xxh32(hash ^ tail);
-            int bucket = (int) (hash & BUCKET_MASK);
-
-            while (true) {
-                Node node = map[bucket];
-                if (node == null) {
-                    return (map[bucket] = new Node(row_addr, word0, tail, (int) keylen, hash));
-                }
-                if (node.contentEquals(row_addr, word0, tail, keylen)) {
-                    return node;
-                }
-                bucket++;
-            }
-        }
-    }
-
-    // Thread pool worker
-    static final class Worker extends Thread {
-        final int thread_id; // for debug use only
-        int cls = 0;
-
-        Worker(int i) {
-            thread_id = i;
-            this.start();
-        }
-
-        @Override
-        public void run() {
-            var map = new Node[BUCKET_SIZE + MAX_STATIONS]; // extra space for collisions
-
-            int id;
-            // process in small chunk to maintain disk locality (artsiomkorzun trick)
-            while ((id = chunk_id.getAndIncrement()) < chunk_cnt) {
-                long addr = start_addr + id * CHUNK_SZ;
-                long end = Math.min(addr + CHUNK_SZ, end_addr);
-
-                // find start of line
-                if (id > 0) {
-                    addr = nextLine(addr);
-                }
-
-                final int num_segs = 3;
-                long seglen = (end - addr) / num_segs;
-
-                long a0 = addr;
-                long a1 = nextLine(addr + 1 * seglen);
-                long a2 = nextLine(addr + 2 * seglen);
-                ChunkParser p0 = new ChunkParser(map, a0, a1);
-                ChunkParser p1 = new ChunkParser(map, a1, a2);
-                ChunkParser p2 = new ChunkParser(map, a2, end);
-
-                while (p0.ok() && p1.ok() && p2.ok()) {
-                    long w0 = p0.word();
-                    long w1 = p1.word();
-                    long w2 = p2.word();
-                    long sc0 = getSemiPosCode(w0);
-                    long sc1 = getSemiPosCode(w1);
-                    long sc2 = getSemiPosCode(w2);
-                    Node n0 = p0.key(w0, sc0);
-                    Node n1 = p1.key(w1, sc1);
-                    Node n2 = p2.key(w2, sc2);
-                    long v0 = p0.val();
-                    long v1 = p1.val();
-                    long v2 = p2.val();
-                    n0.add(v0);
-                    n1.add(v1);
-                    n2.add(v2);
-                }
-
-                while (p0.ok()) {
-                    long w = p0.word();
-                    long sc = getSemiPosCode(w);
-                    Node n = p0.key(w, sc);
-                    long v = p0.val();
-                    n.add(v);
-                }
-                while (p1.ok()) {
-                    long w = p1.word();
-                    long sc = getSemiPosCode(w);
-                    Node n = p1.key(w, sc);
-                    long v = p1.val();
-                    n.add(v);
-                }
-                while (p2.ok()) {
-                    long w = p2.word();
-                    long sc = getSemiPosCode(w);
-                    Node n = p2.key(w, sc);
-                    long v = p2.val();
-                    n.add(v);
-                }
-            }
-
-            // merge is cheaper than string casting (artsiomkorzun)
-            while (!mapref.compareAndSet(null, map)) {
-                var other_map = mapref.getAndSet(null);
-                if (other_map != null) {
-                    for (int i = 0; i < other_map.length; i++) {
-                        var other = other_map[i];
-                        if (other == null)
-                            continue;
-                        int bucket = (int) (other.hash & BUCKET_MASK);
-                        while (true) {
-                            var node = map[bucket];
-                            if (node == null) {
-                                map[bucket] = other;
-                                break;
-                            }
-                            if (node.contentEquals(other)) {
-                                node.merge(other);
-                                break;
-                            }
-                            bucket++;
-                            if (SHOW_ANALYSIS)
-                                cls++;
-                        }
-                    }
-                }
-            }
-
-            if (SHOW_ANALYSIS) {
-                debug("Thread %d collision = %d", thread_id, cls);
-            }
-        }
-    }
-
-    // thomaswue trick
-    private static void spawnWorker() throws IOException {
-        ProcessHandle.Info info = ProcessHandle.current().info();
-        ArrayList<String> workerCommand = new ArrayList<>();
-        info.command().ifPresent(workerCommand::add);
-        info.arguments().ifPresent(args -> workerCommand.addAll(Arrays.asList(args)));
-        workerCommand.add("--worker");
-        new ProcessBuilder()
-                .command(workerCommand)
-                .start()
-                .getInputStream()
-                .transferTo(System.out);
-    }
-
+    /*
+     * MAIN FUNCTION
+     */
    public static void main(String[] args) throws InterruptedException, IOException {
        // thomaswue trick
        if (args.length == 0 || !("--worker".equals(args[0]))) {
@@ -457,4 +107,392 @@ public class CalculateAverage_abeobk {
        System.out.println(ms);
        System.out.close();
    }
+
+    /*
+     * HELPER FUNCTIONS
+     */
+
+    // Get semicolon pos code
+    static final long getSemiCode(final long w) {
+        long x = w ^ 0x3b3b3b3b3b3b3b3bL; // xor with ;;;;;;;;
+        return (x - 0x0101010101010101L) & (~x & 0x8080808080808080L);
+    }
+
+    // Get new line pos code
+    static final long getLFCode(final long w) {
+        long x = w ^ 0x0A0A0A0A0A0A0A0AL; // xor with \n\n\n\n\n\n\n\n
+        return (x - 0x0101010101010101L) & (~x & 0x8080808080808080L);
+    }
+
+    // Get decimal point pos code
+    static final int getDotCode(final long w) {
+        return Long.numberOfTrailingZeros(~w & 0x10101000);
+    }
+
+    // Convert semicolon pos code to position
+    static final int getSemiPos(final long spc) {
+        return Long.numberOfTrailingZeros(spc) >>> 3;
+    }
+
+    // Find next line address
+    static final long nextLF(long addr) {
+        long word = UNSAFE.getLong(addr);
+        long lfpos_code = getLFCode(word);
+        while (lfpos_code == 0) {
+            addr += 8;
+            word = UNSAFE.getLong(addr);
+            lfpos_code = getLFCode(word);
+        }
+        return addr + (Long.numberOfTrailingZeros(lfpos_code) >>> 3) + 1;
+    }
+
+    // Parse number
+    // great idea from merykitty (Quan Anh Mai)
+    static final long num(long w, int d) {
+        int shift = 28 - d;
+        long signed = (~w << 59) >> 63;
+        long dsmask = ~(signed & 0xFF);
+        long digits = ((w & dsmask) << shift) & 0x0F000F0F00L;
+        long abs_val = ((digits * 0x640a0001) >>> 32) & 0x3FF;
+        return ((abs_val ^ signed) - signed);
+    }
+
+    // Hash mixer
+    static final long mix(long hash) {
+        long h = hash * 37;
+        return (h ^ (h >>> 29));
+    }
+
+    // Spawn worker (thomaswue trick)
+    private static void spawnWorker() throws IOException {
+        ProcessHandle.Info info = ProcessHandle.current().info();
+        ArrayList<String> workerCommand = new ArrayList<>();
+        info.command().ifPresent(workerCommand::add);
+        info.arguments().ifPresent(args -> workerCommand.addAll(Arrays.asList(args)));
+        workerCommand.add("--worker");
+        new ProcessBuilder()
+                .command(workerCommand)
+                .start()
+                .getInputStream()
+                .transferTo(System.out);
+    }
+
+    final static class Node {
+        long addr;
+        long hash;
+        long word0;
+        long sum;
+        long min, max;
+        int keylen;
+        int count;
+
+        public final String toString() {
+            return (min / 10.0) + "/"
+                    + (Math.round(((double) sum / count)) / 10.0) + "/"
+                    + (max / 10.0);
+        }
+
+        final String key() {
+            byte[] sbuf = new byte[MAX_STR_LEN];
+            UNSAFE.copyMemory(null, addr, sbuf, Unsafe.ARRAY_BYTE_BASE_OFFSET, keylen);
+            return new String(sbuf, 0, (int) keylen, StandardCharsets.UTF_8);
+        }
+
+        Node(long a, long h, int kl, long v) {
+            addr = a;
+            min = max = v;
+            keylen = kl;
+            hash = h;
+        }
+
+        Node(long a, long h, int kl) {
+            addr = a;
+            hash = h;
+            min = 999;
+            max = -999;
+            keylen = kl;
+        }
+
+        Node(long a, long w0, long h, int kl, long v) {
+            addr = a;
+            word0 = w0;
+            hash = h;
+            min = max = v;
+            keylen = kl;
+        }
+
+        Node(long a, long w0, long h, int kl) {
+            addr = a;
+            word0 = w0;
+            hash = h;
+            min = 999;
+            max = -999;
+            keylen = kl;
+        }
+
+        final void add(long val) {
+            sum += val;
+            count++;
+            if (val > max) {
+                max = val;
+            }
+            if (val < min) {
+                min = val;
+            }
+        }
+
+        final void merge(Node other) {
+            sum += other.sum;
+            count += other.count;
+            if (other.max > max) {
+                max = other.max;
+            }
+            if (other.min < min) {
+                min = other.min;
+            }
+        }
+
+        final boolean contentEquals(long other_addr, long other_word0, long other_hash, long kl) {
+            if (word0 != other_word0 || hash != other_hash)
+                return false;
+            // this is faster than comparison if key is short
+            long xsum = 0;
+            long n = kl & 0xF8;
+            for (long i = 8; i < n; i += 8) {
+                xsum |= (UNSAFE.getLong(addr + i) ^ UNSAFE.getLong(other_addr + i));
+            }
+            return xsum == 0;
+        }
+
+        final boolean contentEquals(Node other) {
+            if (hash != other.hash)
+                return false;
+            long n = keylen & 0xF8;
+            for (long i = 0; i < n; i += 8) {
+                if (UNSAFE.getLong(addr + i) != UNSAFE.getLong(other.addr + i))
+                    return false;
+            }
+            return true;
+        }
+    }
+
+    // Thread pool worker
+    static final class Worker extends Thread {
+        final int thread_id; // for debug use only
+
+        Worker(int i) {
+            thread_id = i;
+            this.setPriority(Thread.MAX_PRIORITY);
+            this.start();
+        }
+
+        @Override
+        public void run() {
+            var map = new Node[BUCKET_SIZE + MAX_STATIONS]; // extra space for collisions
+
+            int id;
+            // process in small chunk to maintain disk locality (artsiomkorzun trick)
+            while ((id = chunk_id.getAndIncrement()) < chunk_cnt) {
+                long addr = start_addr + id * CHUNK_SZ;
+                long end = Math.min(addr + CHUNK_SZ, end_addr);
+
+                // find start of line
+                if (id > 0) {
+                    addr = nextLF(addr);
+                }
+
+                final int num_segs = 3;
+                long seglen = (end - addr) / num_segs;
+
+                long a0 = addr;
+                long a1 = nextLF(addr + 1 * seglen);
+                long a2 = nextLF(addr + 2 * seglen);
+                ChunkParser p0 = new ChunkParser(map, a0, a1);
+                ChunkParser p1 = new ChunkParser(map, a1, a2);
+                ChunkParser p2 = new ChunkParser(map, a2, end);
+
+                while (p0.ok() && p1.ok() && p2.ok()) {
+                    long w0 = p0.word();
+                    long w1 = p1.word();
+                    long w2 = p2.word();
+                    long sc0 = getSemiCode(w0);
+                    long sc1 = getSemiCode(w1);
+                    long sc2 = getSemiCode(w2);
+                    Node n0 = p0.key(w0, sc0);
+                    Node n1 = p1.key(w1, sc1);
+                    Node n2 = p2.key(w2, sc2);
+                    long v0 = p0.val();
+                    long v1 = p1.val();
+                    long v2 = p2.val();
+                    n0.add(v0);
+                    n1.add(v1);
+                    n2.add(v2);
+                }
+
+                while (p0.ok()) {
+                    long w = p0.word();
+                    long sc = getSemiCode(w);
+                    Node n = p0.key(w, sc);
+                    long v = p0.val();
+                    n.add(v);
+                }
+                while (p1.ok()) {
+                    long w = p1.word();
+                    long sc = getSemiCode(w);
+                    Node n = p1.key(w, sc);
+                    long v = p1.val();
+                    n.add(v);
+                }
+                while (p2.ok()) {
+                    long w = p2.word();
+                    long sc = getSemiCode(w);
+                    Node n = p2.key(w, sc);
+                    long v = p2.val();
+                    n.add(v);
+                }
+            }
+
+            // merge is cheaper than string casting (artsiomkorzun)
+            while (!mapref.compareAndSet(null, map)) {
+                var other_map = mapref.getAndSet(null);
+                if (other_map != null) {
+                    for (int i = 0; i < other_map.length; i++) {
+                        var other = other_map[i];
+                        if (other == null)
+                            continue;
+                        int bucket = (int) (other.hash & BUCKET_MASK);
+                        while (true) {
+                            var node = map[bucket];
+                            if (node == null) {
+                                map[bucket] = other;
+                                break;
+                            }
+                            if (node.contentEquals(other)) {
+                                node.merge(other);
+                                break;
+                            }
+                            bucket++;
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    static final class ChunkParser {
+        long addr;
+        long end;
+        Node[] map;
+
+        ChunkParser(Node[] m, long a, long e) {
+            map = m;
+            addr = a;
+            end = e;
+        }
+
+        final boolean ok() {
+            return addr < end;
+        }
+
+        final long word() {
+            return UNSAFE.getLong(addr);
+        }
+
+        final void skip(int n) {
+            addr += n;
+        }
+
+        final void skip(long n) {
+            addr += n;
+        }
+
+        final long val0() {
+            long w = word();
+            int d = getDotCode(w);
+            return num(w, d);
+        }
+
+        final long val() {
+            long w = word();
+            int d = getDotCode(w);
+            skip((d >>> 3) + 3);
+            return num(w, d);
+        }
+
+        // optimize for contest
+        // save as much slow memory access as possible
+        // about 50% of keys < 8 chars, 25% of keys between 8-10 chars
+        // keylength histogram (%) = [0, 0, 0, 0, 4, 10, 21, 15, 13, 11, 6, 6, 4, 2...
+        final Node key(long word0, long semipos_code) {
+            long row_addr = addr;
+            // about 50% chance key < 8 chars
+            if (semipos_code != 0) {
+                int semi_pos = Long.numberOfTrailingZeros(semipos_code) >>> 3;
+                skip(semi_pos + 1);
+                long tail = word0 & HASH_MASKS[semi_pos];
+                long hash = mix(tail);
+                int bucket = (int) (hash & BUCKET_MASK);
+                while (true) {
+                    Node node = map[bucket];
+                    if (node == null) {
+                        return (map[bucket] = new Node(row_addr, hash, semi_pos));
+                    }
+                    if (node.hash == hash) {
+                        return node;
+                    }
+                    bucket++;
+                }
+            }
+
+            skip(8);
+            long word = UNSAFE.getLong(addr);
+            semipos_code = getSemiCode(word);
+            // 43% chance
+            if (semipos_code != 0) {
+                int semi_pos = Long.numberOfTrailingZeros(semipos_code) >>> 3;
+                skip(semi_pos + 1);
+                long tail = word0 ^ (word & HASH_MASKS[semi_pos]);
+                long hash = mix(tail);
+                int bucket = (int) (hash & BUCKET_MASK);
+                while (true) {
+                    Node node = map[bucket];
+                    if (node == null) {
+                        return (map[bucket] = new Node(row_addr, word0, hash, semi_pos + 8));
+                    }
+                    if (node.word0 == word0 && node.hash == hash) {
+                        return node;
+                    }
+                    bucket++;
+                }
+            }
+
+            // why not going for more? tested, slower
+            long hash = word0;
+            while (semipos_code == 0) {
+                hash ^= word;
+                skip(8);
+                word = UNSAFE.getLong(addr);
+                semipos_code = getSemiCode(word);
+            }
+
+            int semi_pos = Long.numberOfTrailingZeros(semipos_code) >>> 3;
+            skip(semi_pos);
+            long keylen = addr - row_addr;
+            skip(1);
+            long tail = hash ^ (word & HASH_MASKS[semi_pos]);
+            hash = mix(tail);
+            int bucket = (int) (hash & BUCKET_MASK);
+
+            while (true) {
+                Node node = map[bucket];
+                if (node == null) {
+                    return (map[bucket] = new Node(row_addr, word0, hash, (int) keylen));
+                }
+                if (node.contentEquals(row_addr, word0, hash, keylen)) {
+                    return node;
+                }
+                bucket++;
+            }
+        }
+    }
 }