CalculateAverage_gonix update (#706)

Backported some of the optimizations from unsafe solution.

Co-authored-by: Giedrius D <d.giedrius@gmail.com>
This commit is contained in:
gonix 2024-02-01 12:53:46 +02:00 committed by GitHub
parent fdd539e1f9
commit 1e7314d5fb
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 320 additions and 209 deletions

View File

@ -1,4 +1,4 @@
#!/bin/sh
#!/bin/bash
#
# Copyright 2023 The original authors
#
@ -17,4 +17,4 @@
JAVA_OPTS="--enable-preview"
java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_gonix
exec cat < <(exec java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_gonix)

View File

@ -46,6 +46,7 @@ public class CalculateAverage_gonix {
TreeMap::new));
System.out.println(res);
System.out.close();
}
private static List<MappedByteBuffer> buildChunks(RandomAccessFile file) throws IOException {
@ -75,248 +76,358 @@ public class CalculateAverage_gonix {
}
return chunks;
}
}
class Aggregator {
private static final int MAX_STATIONS = 10_000;
private static final int MAX_STATION_SIZE = Math.ceilDiv(100, 8) + 5;
private static final int INDEX_SIZE = 1024 * 1024;
private static final int INDEX_MASK = INDEX_SIZE - 1;
private static final int FLD_COUNT = 0;
private static final int FLD_SUM = 1;
private static final int FLD_MIN = 2;
private static final int FLD_MAX = 3;
private static class Aggregator {
private static final int MAX_STATIONS = 10_000;
private static final int MAX_STATION_SIZE = Math.ceilDiv(100, 8) + 5;
private static final int INDEX_SIZE = 1024 * 1024;
private static final int INDEX_MASK = INDEX_SIZE - 1;
private static final int FLD_COUNT = 0;
private static final int FLD_SUM = 1;
private static final int FLD_MIN = 2;
private static final int FLD_MAX = 3;
// Poor man's hash map: hash code to offset in `mem`.
private final int[] index;
// Poor man's hash map: hash code to offset in `mem`.
private final int[] index;
// Contiguous storage of key (station name) and stats fields of all
// unique stations.
// The idea here is to improve locality so that stats fields would
// possibly be already in the CPU cache after we are done comparing
// the key.
private final long[] mem;
private int memUsed;
// Contiguous storage of key (station name) and stats fields of all
// unique stations.
// The idea here is to improve locality so that stats fields would
// possibly be already in the CPU cache after we are done comparing
// the key.
private final long[] mem;
private int memUsed;
Aggregator() {
assert ((INDEX_SIZE & (INDEX_SIZE - 1)) == 0) : "INDEX_SIZE must be power of 2";
assert (INDEX_SIZE > MAX_STATIONS) : "INDEX_SIZE must be greater than MAX_STATIONS";
Aggregator() {
assert ((INDEX_SIZE & (INDEX_SIZE - 1)) == 0) : "INDEX_SIZE must be power of 2";
assert (INDEX_SIZE > MAX_STATIONS) : "INDEX_SIZE must be greater than MAX_STATIONS";
index = new int[INDEX_SIZE];
mem = new long[1 + (MAX_STATIONS * MAX_STATION_SIZE)];
memUsed = 1;
}
Aggregator processChunk(MappedByteBuffer buf) {
// To avoid checking if it is safe to read a whole long near the
// end of a chunk, we copy last couple of lines to a padded buffer
// and process that part separately.
int limit = buf.limit();
int pos = Math.max(limit - 16, -1);
while (pos >= 0 && buf.get(pos) != '\n') {
pos--;
index = new int[INDEX_SIZE];
mem = new long[1 + (MAX_STATIONS * MAX_STATION_SIZE)];
memUsed = 1;
}
pos++;
if (pos > 0) {
processChunkLongs(buf, pos);
// Aggregates every line of the given chunk into this aggregator and returns
// this instance.
//
// The bulk of the chunk is parsed directly from the mapped buffer. The last
// few lines are copied into a heap buffer that carries 8 spare bytes, so
// that the 8-byte reads done by processChunkLongs never run past the end
// of the data.
Aggregator processChunk(MappedByteBuffer buf) {
    final int chunkEnd = buf.limit();
    // Start 16 bytes before the end (or at -1 for short chunks) and walk
    // backwards until a newline is found or the chunk start is passed.
    int tailStart = Math.max(chunkEnd - 16, -1);
    for (; tailStart >= 0; tailStart--) {
        if (buf.get(tailStart) == '\n') {
            break;
        }
    }
    // Step past the newline (or from -1 to 0): first byte of the tail.
    tailStart += 1;
    if (tailStart > 0) {
        processChunkLongs(buf, tailStart);
    }
    final int tailSize = chunkEnd - tailStart;
    var padded = ByteBuffer.allocate(tailSize + 8).order(ByteOrder.nativeOrder());
    buf.get(tailStart, padded.array(), 0, tailSize);
    processChunkLongs(padded, tailSize);
    return this;
}
int tailLen = limit - pos;
var tailBuf = ByteBuffer.allocate(tailLen + 8).order(ByteOrder.nativeOrder());
buf.get(pos, tailBuf.array(), 0, tailLen);
processChunkLongs(tailBuf, tailLen);
return this;
}
Aggregator processChunkLongs(ByteBuffer buf, int limit) {
int pos = 0;
while (pos < limit) {
Aggregator processChunkLongs(ByteBuffer buf, int limit) {
int pos = 0;
while (pos < limit) {
int start = pos;
int hash = 0;
long tail = 0;
while (true) {
// Seen this trick used in multiple other solutions.
// Nice breakdown here: https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
long tmpLong = buf.getLong(pos);
long match = tmpLong ^ 0x3B3B3B3B_3B3B3B3BL; // 3B == ';'
match = ((match - 0x01010101_01010101L) & (~match & 0x80808080_80808080L));
if (match == 0) {
hash = ((33 * hash) ^ (int) (tmpLong & 0xFFFFFFFF)) + (int) ((tmpLong >>> 33) & 0xFFFFFFFF);
pos += 8;
int start = pos;
long keyLong = buf.getLong(pos);
long valueSepMark = valueSepMark(keyLong);
if (valueSepMark != 0) {
int tailBits = tailBits(valueSepMark);
pos += valueOffset(tailBits);
// assert (UNSAFE.getByte(pos - 1) == ';') : "Expected ';' (1), pos=" + (pos - startAddr);
long tailAndLen = tailAndLen(tailBits, keyLong, pos - start - 1);
long valueLong = buf.getLong(pos);
int decimalSepMark = decimalSepMark(valueLong);
pos += nextKeyOffset(decimalSepMark);
// assert (UNSAFE.getByte(pos - 1) == '\n') : "Expected '\\n' (1), pos=" + (pos - startAddr);
int measurement = decimalValue(decimalSepMark, valueLong);
add1(buf, start, tailAndLen, hash(hash1(tailAndLen)), measurement);
continue;
}
int tailBits = Long.numberOfTrailingZeros(match >>> 7);
long tailMask = ~(-1L << tailBits);
tail = tmpLong & tailMask;
hash = ((33 * hash) ^ (int) (tail & 0xFFFFFFFF)) + (int) ((tail >>> 33) & 0xFFFFFFFF);
pos += tailBits >> 3;
break;
pos += 8;
long keyLong1 = keyLong;
keyLong = buf.getLong(pos);
valueSepMark = valueSepMark(keyLong);
if (valueSepMark != 0) {
int tailBits = tailBits(valueSepMark);
pos += valueOffset(tailBits);
// assert (UNSAFE.getByte(pos - 1) == ';') : "Expected ';' (2), pos=" + (pos - startAddr);
long tailAndLen = tailAndLen(tailBits, keyLong, pos - start - 1);
long valueLong = buf.getLong(pos);
int decimalSepMark = decimalSepMark(valueLong);
pos += nextKeyOffset(decimalSepMark);
// assert (UNSAFE.getByte(pos - 1) == '\n') : "Expected '\\n' (2), pos=" + (pos - startAddr);
int measurement = decimalValue(decimalSepMark, valueLong);
add2(buf, start, keyLong1, tailAndLen, hash(hash(hash1(keyLong1), tailAndLen)), measurement);
continue;
}
long hash = hash1(keyLong1);
do {
pos += 8;
hash = hash(hash, keyLong);
keyLong = buf.getLong(pos);
valueSepMark = valueSepMark(keyLong);
} while (valueSepMark == 0);
int tailBits = tailBits(valueSepMark);
pos += valueOffset(tailBits);
// assert (UNSAFE.getByte(pos - 1) == ';') : "Expected ';' (N), pos=" + (pos - startAddr);
long tailAndLen = tailAndLen(tailBits, keyLong, pos - start - 1);
hash = hash(hash, tailAndLen);
long valueLong = buf.getLong(pos);
int decimalSepMark = decimalSepMark(valueLong);
pos += nextKeyOffset(decimalSepMark);
// assert (UNSAFE.getByte(pos - 1) == '\n') : "Expected '\\n' (N), pos=" + (pos - startAddr);
int measurement = decimalValue(decimalSepMark, valueLong);
addN(buf, start, tailAndLen, hash(hash), measurement);
}
hash = (33 * hash) ^ (hash >>> 15);
int lenInLongs = (pos - start) >> 3;
long tailAndLen = (tail << 8) | (lenInLongs & 0xFF);
// assert (buf.get(pos) == ';') : "Expected ';'";
pos++;
int measurement;
{
// Seen this trick used in multiple other solutions.
// Looks like the original author is @merykitty.
long tmpLong = buf.getLong(pos);
return this;
}
// The 4th binary digit of the ascii of a digit is 1 while
// that of the '.' is 0. This finds the decimal separator
// The value can be 12, 20, 28
int decimalSepPos = Long.numberOfTrailingZeros(~tmpLong & 0x10101000);
int shift = 28 - decimalSepPos;
// signed is -1 if negative, 0 otherwise
long signed = (~tmpLong << 59) >> 63;
long designMask = ~(signed & 0xFF);
// Align the number to a specific position and transform the ascii code
// to actual digit value in each byte
long digits = ((tmpLong & designMask) << shift) & 0x0F000F0F00L;
// Streams one Entry view per occupied index slot. A slot value of 0 means
// "empty"; any other value is the entry's offset into `mem`.
public Stream<Entry> stream() {
    var usedSlots = Arrays.stream(index).filter(slot -> slot != 0);
    return usedSlots.mapToObj(slot -> new Entry(mem, slot));
}
// Now digits is in the form 0xUU00TTHH00 (UU: units digit, TT: tens digit, HH: hundreds digit)
// 0xUU00TTHH00 * (100 * 0x1000000 + 10 * 0x10000 + 1) =
// 0x000000UU00TTHH00 +
// 0x00UU00TTHH000000 * 10 +
// 0xUU00TTHH00000000 * 100
// Now TT * 100 has 2 trailing zeroes and HH * 100 + TT * 10 + UU < 0x400
// This results in our value lies in the bit 32 to 41 of this product
// That was close :)
long absValue = ((digits * 0x640a0001) >>> 32) & 0x3FF;
measurement = (int) ((absValue ^ signed) - signed);
pos += (decimalSepPos >>> 3) + 3;
// Initial hash state for a key: the first key word is used unchanged.
// Further words are folded in with hash(long, long) and the result is
// finalized with hash(long).
private static long hash1(long value) {
return value;
}
// Folds one more key word into the running hash state using XOR.
private static long hash(long hash, long value) {
    return value ^ hash;
}
// Finalizes the accumulated 64-bit hash state into an int: multiply by the
// Fibonacci hashing constant and keep the top 25 bits of the product.
private static int hash(long hash) {
    final long mixed = hash * 0x9E3779B97F4A7C15L; // Fibonacci hashing multiplier
    return (int) (mixed >>> 39);
}
// Locates ';' bytes inside an 8-byte word without a per-byte loop.
// Returns 0 when the word contains no ';'. Otherwise the lowest set bit of
// the result is the high bit (0x80) of the first byte that equals ';'.
// This is the "zero byte in word" trick, see
// https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
private static long valueSepMark(long keyLong) {
    final long semicolons = 0x3B3B3B3B_3B3B3B3BL; // 3B == ';'
    final long lowBits = 0x01010101_01010101L;
    final long highBits = 0x80808080_80808080L;
    // Bytes equal to ';' become zero after the XOR.
    final long diff = keyLong ^ semicolons;
    return (diff - lowBits) & ~diff & highBits;
}
// Converts a separator mark (as produced by valueSepMark) into the number
// of bits that precede the marked byte within the word.
private static int tailBits(long valueSepMark) {
    // Move each 0x80 marker down to bit 0 of its byte before counting.
    final long byteStartMarks = valueSepMark >>> 7;
    return Long.numberOfTrailingZeros(byteStartMarks);
}
// Number of bytes to advance from the start of the current word to the
// first byte after the separator: the bytes before it plus the separator.
private static int valueOffset(int tailBits) {
    final int bytesBeforeSep = tailBits >>> 3;
    return bytesBeforeSep + 1;
}
// Packs the partial last word of a key and the key's length in whole longs
// into a single long: the low `tailBits` bits of `keyLong` go into bits
// 8 and up, and (keyLen / 8) goes into the low byte.
private static long tailAndLen(int tailBits, long keyLong, long keyLen) {
    final long lowBitsMask = (1L << tailBits) - 1;
    final long tail = keyLong & lowBitsMask;
    final long fullLongs = (keyLen >> 3) & 0xFF;
    return (tail << 8) | fullLongs;
}
// Finds the bit position that identifies the decimal separator in the
// value word. Bit 4 of an ASCII digit is 1 while bit 4 of '.' is 0, so the
// first cleared candidate bit among bytes 1..3 marks the '.'.
// The result can be 12, 20 or 28.
// Trick seen in several other solutions; apparently by @merykitty.
private static int decimalSepMark(long value) {
    final long clearedCandidates = ~value & 0x10101000;
    return Long.numberOfTrailingZeros(clearedCandidates);
}
// Parses a measurement with one fractional digit (e.g. "-12.3") from the
// value word and returns it in tenths (e.g. -123), without branching.
// `decimalSepMark` is the result of decimalSepMark(value).
// Trick seen in several other solutions; apparently by @merykitty.
private static int decimalValue(int decimalSepMark, long value) {
    final int alignShift = 28 - decimalSepMark;
    // -1 when the first byte is '-' (its bit 4 is 0), otherwise 0.
    final long negative = (~value << 59) >> 63;
    // Blanks out the first byte when it holds the minus sign.
    final long signByteMask = ~(negative & 0xFF);
    // Align the number to a fixed position and turn each ASCII digit into
    // its numeric value.
    final long digits = ((value & signByteMask) << alignShift) & 0x0F000F0F00L;
    // digits now has the form 0xUU00TTHH00 (UU: units, TT: tens, HH: hundreds)
    // 0xUU00TTHH00 * (100 * 0x1000000 + 10 * 0x10000 + 1) =
    // 0x000000UU00TTHH00 +
    // 0x00UU00TTHH000000 * 10 +
    // 0xUU00TTHH00000000 * 100
    // TT * 100 has 2 trailing zeroes and HH * 100 + TT * 10 + UU < 0x400,
    // so the wanted value sits in bits 32 to 41 of the product.
    final long magnitude = ((digits * 0x640a0001) >>> 32) & 0x3FF;
    // Conditional negate: a no-op when negative == 0.
    return (int) ((magnitude ^ negative) - negative);
}
// Number of bytes to advance from the start of the value to the start of
// the next line: the byte index derived from the mark plus 3.
private static int nextKeyOffset(int decimalSepMark) {
    final int markByteIndex = decimalSepMark >>> 3;
    return markByteIndex + 3;
}
private void add1(ByteBuffer buf, int start, long tailAndLen, int hash, int measurement) {
int idx = hash & INDEX_MASK;
for (; index[idx] != 0; idx = (idx + 1) & INDEX_MASK) {
if (update1(index[idx], tailAndLen, measurement)) {
return;
}
}
// assert (buf.get(pos - 1) == '\n') : "Expected '\\n'";
add(buf, start, tailAndLen, hash, measurement);
index[idx] = create(buf, start, tailAndLen, measurement);
}
return this;
}
public Stream<Entry> stream() {
return Arrays.stream(index)
.filter(offset -> offset != 0)
.mapToObj(offset -> new Entry(mem, offset));
}
private void add(ByteBuffer buf, int start, long tailAndLen, int hash, int measurement) {
int idx = hash & INDEX_MASK;
for (; index[idx] != 0; idx = (idx + 1) & INDEX_MASK) {
if (update(index[idx], buf, start, tailAndLen, measurement)) {
return;
private void add2(ByteBuffer buf, int start, long keyLong, long tailAndLen, int hash, int measurement) {
int idx = hash & INDEX_MASK;
for (; index[idx] != 0; idx = (idx + 1) & INDEX_MASK) {
if (update2(index[idx], keyLong, tailAndLen, measurement)) {
return;
}
}
}
index[idx] = create(buf, start, tailAndLen, measurement);
}
private int create(ByteBuffer buf, int start, long tailAndLen, int measurement) {
int offset = memUsed;
mem[offset] = tailAndLen;
int memPos = offset + 1;
int memEnd = memPos + (int) (tailAndLen & 0xFF);
int bufPos = start;
while (memPos < memEnd) {
mem[memPos] = buf.getLong(bufPos);
memPos += 1;
bufPos += 8;
index[idx] = create(buf, start, tailAndLen, measurement);
}
mem[memPos + FLD_MIN] = measurement;
mem[memPos + FLD_MAX] = measurement;
mem[memPos + FLD_SUM] = measurement;
mem[memPos + FLD_COUNT] = 1;
memUsed = memPos + 4;
return offset;
}
private boolean update(int offset, ByteBuffer buf, int start, long tailAndLen, int measurement) {
var mem = this.mem;
if (mem[offset] != tailAndLen) {
return false;
// Records a measurement for a key that is compared word by word against
// the buffer. Probes the index linearly from the hashed slot: the first
// slot whose entry matches gets updated; if an empty slot (0) is reached
// first, a new entry is created and registered there.
private void addN(ByteBuffer buf, int start, long tailAndLen, int hash, int measurement) {
    int slot = hash & INDEX_MASK;
    while (index[slot] != 0) {
        if (updateN(index[slot], buf, start, tailAndLen, measurement)) {
            return;
        }
        // Collision with a different key: try the next slot, wrapping around.
        slot = (slot + 1) & INDEX_MASK;
    }
    index[slot] = create(buf, start, tailAndLen, measurement);
}
int memPos = offset + 1;
int memEnd = memPos + (int) (tailAndLen & 0xFF);
int bufPos = start;
while (memPos < memEnd) {
if (mem[memPos] != buf.getLong(bufPos)) {
// Appends a new station record to `mem` and returns its offset.
// Record layout: [tailAndLen][full key longs ...][COUNT][SUM][MIN][MAX].
// The number of full key longs is the low byte of `tailAndLen`; they are
// read from `buf` starting at `start`.
private int create(ByteBuffer buf, int start, long tailAndLen, int measurement) {
    final int offset = memUsed;
    final int fullLongs = (int) (tailAndLen & 0xFF);
    mem[offset] = tailAndLen;
    final int keyStart = offset + 1;
    for (int i = 0; i < fullLongs; i++) {
        mem[keyStart + i] = buf.getLong(start + (i << 3));
    }
    // Stats fields follow the key; all are seeded from the first measurement.
    final int stats = keyStart + fullLongs;
    mem[stats + FLD_MIN] = measurement;
    mem[stats + FLD_MAX] = measurement;
    mem[stats + FLD_SUM] = measurement;
    mem[stats + FLD_COUNT] = 1;
    memUsed = stats + 4;
    return offset;
}
private boolean update1(int offset, long tailAndLen, int measurement) {
if (mem[offset] != tailAndLen) {
return false;
}
memPos += 1;
bufPos += 8;
updateStats(offset + 1, measurement);
return true;
}
mem[memPos + FLD_COUNT] += 1;
mem[memPos + FLD_SUM] += measurement;
if (measurement < mem[memPos + FLD_MIN]) {
mem[memPos + FLD_MIN] = measurement;
}
if (measurement > mem[memPos + FLD_MAX]) {
mem[memPos + FLD_MAX] = measurement;
}
return true;
}
public static class Entry {
private final long[] mem;
private final int offset;
private String key;
Entry(long[] mem, int offset) {
this.mem = mem;
this.offset = offset;
}
public String getKey() {
if (key == null) {
int pos = this.offset;
long tailAndLen = mem[pos++];
int keyLen = (int) (tailAndLen & 0xFF);
var tmpBuf = ByteBuffer.allocate((keyLen << 3) + 8).order(ByteOrder.nativeOrder());
for (int i = 0; i < keyLen; i++) {
tmpBuf.putLong(mem[pos++]);
}
long tail = tailAndLen >>> 8;
tmpBuf.putLong(tail);
int keyLenBytes = (keyLen << 3) + 8 - (Long.numberOfLeadingZeros(tail) >> 3);
key = new String(tmpBuf.array(), 0, keyLenBytes, StandardCharsets.UTF_8);
private boolean update2(int offset, long keyLong, long tailAndLen, int measurement) {
if (mem[offset] != tailAndLen || mem[offset + 1] != keyLong) {
return false;
}
return key;
updateStats(offset + 2, measurement);
return true;
}
public Entry add(Entry other) {
int fldOffset = (int) (mem[offset] & 0xFF) + 1;
int pos = offset + fldOffset;
int otherPos = other.offset + fldOffset;
long[] otherMem = other.mem;
mem[pos + FLD_MIN] = Math.min((int) mem[pos + FLD_MIN], (int) otherMem[otherPos + FLD_MIN]);
mem[pos + FLD_MAX] = Math.max((int) mem[pos + FLD_MAX], (int) otherMem[otherPos + FLD_MAX]);
mem[pos + FLD_SUM] += otherMem[otherPos + FLD_SUM];
mem[pos + FLD_COUNT] += otherMem[otherPos + FLD_COUNT];
return this;
// Updates the record at `offset` if it belongs to the key found in `buf`
// at `start`. Returns false, without touching the record, when either the
// packed tail/length word or any of the full key longs differs.
private boolean updateN(int offset, ByteBuffer buf, int start, long tailAndLen, int measurement) {
    final long[] store = this.mem;
    if (store[offset] != tailAndLen) {
        return false;
    }
    final int fullLongs = (int) (tailAndLen & 0xFF);
    final int keyStart = offset + 1;
    for (int i = 0; i < fullLongs; i++) {
        if (store[keyStart + i] != buf.getLong(start + (i << 3))) {
            return false;
        }
    }
    // Stats fields start right after the key longs.
    updateStats(keyStart + fullLongs, measurement);
    return true;
}
public Entry getValue() {
return this;
// Folds one measurement into the stats fields that start at `memPos`:
// bumps the count, adds to the sum and widens min/max when needed.
private void updateStats(int memPos, int measurement) {
    mem[memPos + FLD_COUNT]++;
    mem[memPos + FLD_SUM] += measurement;
    if (mem[memPos + FLD_MIN] > measurement) {
        mem[memPos + FLD_MIN] = measurement;
    }
    if (mem[memPos + FLD_MAX] < measurement) {
        mem[memPos + FLD_MAX] = measurement;
    }
}
@Override
public String toString() {
int pos = offset + (int) (mem[offset] & 0xFF) + 1;
return round(mem[pos + FLD_MIN])
+ "/" + round(((double) mem[pos + FLD_SUM]) / mem[pos + FLD_COUNT])
+ "/" + round(mem[pos + FLD_MAX]);
}
public static class Entry {
private final long[] mem;
private final int offset;
private String key;
private static double round(double value) {
return Math.round(value) / 10.0;
// Creates a view over one station record.
// `mem` is the backing storage and `offset` is the position of the
// record's first word (the packed tail/length word read by getKey).
// The array is referenced, not copied.
Entry(long[] mem, int offset) {
this.mem = mem;
this.offset = offset;
}
// Returns the station name, decoding it from `mem` on first use and
// caching it afterwards.
// The key is stored as full 8-byte words followed by a partial tail word;
// the tail's length is derived from its leading zero bytes.
// NOTE(review): the length computation looks like it assumes native order
// is little-endian - confirm.
public String getKey() {
    if (key != null) {
        return key;
    }
    final long tailAndLen = mem[offset];
    final int fullLongs = (int) (tailAndLen & 0xFF);
    final long tail = tailAndLen >>> 8;
    final int capacity = (fullLongs << 3) + 8;
    var bytes = ByteBuffer.allocate(capacity).order(ByteOrder.nativeOrder());
    for (int i = 1; i <= fullLongs; i++) {
        bytes.putLong(mem[offset + i]);
    }
    bytes.putLong(tail);
    // Whole zero bytes at the top of the tail word are not part of the name.
    final int unusedTailBytes = Long.numberOfLeadingZeros(tail) >> 3;
    key = new String(bytes.array(), 0, capacity - unusedTailBytes, StandardCharsets.UTF_8);
    return key;
}
// Merges the stats of `other` into this entry and returns this entry.
// The stats position of both entries is derived from this entry's key
// length, so both are expected to describe the same key.
public Entry add(Entry other) {
    final int statsDelta = (int) (mem[offset] & 0xFF) + 1;
    final int mine = offset + statsDelta;
    final int theirs = other.offset + statsDelta;
    final long[] theirMem = other.mem;
    final int lowest = Math.min((int) mem[mine + FLD_MIN], (int) theirMem[theirs + FLD_MIN]);
    mem[mine + FLD_MIN] = lowest;
    final int highest = Math.max((int) mem[mine + FLD_MAX], (int) theirMem[theirs + FLD_MAX]);
    mem[mine + FLD_MAX] = highest;
    mem[mine + FLD_SUM] += theirMem[theirs + FLD_SUM];
    mem[mine + FLD_COUNT] += theirMem[theirs + FLD_COUNT];
    return this;
}
// Returns this entry itself.
// NOTE(review): presumably lets an Entry be used as both key source and
// value in a collector - verify against the caller.
public Entry getValue() {
return this;
}
// Formats the stats as "min/mean/max". The stored values are passed
// through round(), which divides by 10.
@Override
public String toString() {
    final int stats = offset + (int) (mem[offset] & 0xFF) + 1;
    final double mean = ((double) mem[stats + FLD_SUM]) / mem[stats + FLD_COUNT];
    return round(mem[stats + FLD_MIN])
            + "/" + round(mean)
            + "/" + round(mem[stats + FLD_MAX]);
}
// Rounds `value` to the nearest whole number with Math.round and returns
// that number divided by 10.
private static double round(double value) {
    final long nearest = Math.round(value);
    return nearest / 10.0;
}
}
}
}