jerrinot - final(?) improvements (#690)
* decrease instruction-level parallelism: it turns out doing 2 things was too much. perf annotate showed spilling. * more trickery with latency hiding * work-stealing, lookup tables, credits * do not assume gender
This commit is contained in:
parent
b529ef2a59
commit
9b9bb8ed3f
@ -24,6 +24,7 @@ import java.lang.foreign.Arena;
|
||||
import java.lang.reflect.Field;
|
||||
import java.nio.channels.FileChannel.MapMode;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
|
||||
/**
|
||||
* I figured out it would be very hard to win the main competition of the One Billion Rows Challenge.
|
||||
@ -31,17 +32,59 @@ import java.util.*;
|
||||
*
|
||||
* Anyway, if you can make sense out of not exactly idiomatic Java code, and you enjoy pushing performance limits
|
||||
* then QuestDB - the fastest open-source time-series database - is hiring: https://questdb.io/careers/core-database-engineer/
|
||||
*
|
||||
* <p>
|
||||
* <b>Credit</b>
|
||||
* <p>
|
||||
* I stand on the shoulders of giants. I wouldn't be able to code this without analyzing and borrowing from solutions of others.
|
||||
* People who helped me the most:
|
||||
* <ul>
|
||||
* <li>Thomas Wuerthinger (thomaswue): The munmap() trick and work-stealing. In both cases, I shamelessly copy-pasted their code.
|
||||
* Including SWAR for detecting new lines. Thomas also gave me helpful hints on how to detect register spilling issues.</li>
|
||||
* <li>Quan Anh Mai (merykitty): I borrowed their phenomenal branch-free parser.</li>
|
||||
* <li>Marko Topolnik (mtopolnik): I use a hashing function I saw in his code. It seems to produce good quality hashes
|
||||
* and it's next-level in speed. Marko joined the challenge before me and our discussions made me join too!</li>
|
||||
* <li>Van Phu DO (abeobk): I saw the idea with simple lookup tables instead of complicated bit-twiddling in their code first.</li>
|
||||
* <li>Roy van Rijn (royvanrijn): I borrowed their SWAR code and initially their hash code impl</li>
|
||||
* <li>Francesco Nigro (franz1981): For our online discussions about performance. Both before and during this challenge.
|
||||
* Francesco gave me the idea to check register spilling.</li>
|
||||
* </ul>
|
||||
*/
|
||||
public class CalculateAverage_jerrinot {
|
||||
private static final Unsafe UNSAFE = unsafe();
|
||||
private static final String MEASUREMENTS_TXT = "measurements.txt";
|
||||
// todo: with hyper-threading enabled we would be better off with availableProcessors / 2;
|
||||
// todo: validate the testing env. params.
|
||||
private static final int THREAD_COUNT = Runtime.getRuntime().availableProcessors();
|
||||
// private static final int THREAD_COUNT = 4;
|
||||
private static final int EXTRA_THREAD_COUNT = Runtime.getRuntime().availableProcessors() - 1;
|
||||
// private static final int THREAD_COUNT = 1;
|
||||
|
||||
private static final long SEPARATOR_PATTERN = 0x3B3B3B3B3B3B3B3BL;
|
||||
private static final long NEW_LINE_PATTERN = 0x0A0A0A0A0A0A0A0AL;
|
||||
private static final int SEGMENT_SIZE = 4 * 1024 * 1024;
|
||||
|
||||
// credits for the idea with lookup tables instead of bit-shifting: abeobk
|
||||
// Byte-keep masks: entry i has its low i bytes set to 0xff, entry 8 keeps the whole word.
// mainLoop indexes this table with Long.numberOfTrailingZeros(delimiterMask) >> 3, which is 8
// when the delimiter mask is zero (numberOfTrailingZeros(0) == 64).
// NOTE(review): "low bytes" == "bytes before the semicolon" presumably relies on little-endian loads - confirm.
private static final long[] HASH_MASKS = new long[]{
|
||||
0x0000000000000000L, // semicolon is the first char
|
||||
0x00000000000000ffL,
|
||||
0x000000000000ffffL,
|
||||
0x0000000000ffffffL,
|
||||
0x00000000ffffffffL,
|
||||
0x000000ffffffffffL,
|
||||
0x0000ffffffffffffL,
|
||||
0x00ffffffffffffffL, // semicolon is the last char
|
||||
0xffffffffffffffffL // there is no semicolon at all
|
||||
};
|
||||
|
||||
// Branch-free "advance to the next word" selector, indexed the same way as HASH_MASKS
// (Long.numberOfTrailingZeros(delimiterMask) >> 3 in mainLoop).
// Entries 0..7 are all-zero; entry 8 (no delimiter bit set in the word) is all-ones.
// mainLoop uses it as `cursor += mask & 8` and as a bitwise select between the
// pre-loaded word at cursor + 8 and the current word.
private static final long[] ADVANCE_MASKS = new long[]{
|
||||
0x0000000000000000L,
|
||||
0x0000000000000000L,
|
||||
0x0000000000000000L,
|
||||
0x0000000000000000L,
|
||||
0x0000000000000000L,
|
||||
0x0000000000000000L,
|
||||
0x0000000000000000L,
|
||||
0x0000000000000000L,
|
||||
0xffffffffffffffffL,
|
||||
};
|
||||
|
||||
private static Unsafe unsafe() {
|
||||
try {
|
||||
@ -81,56 +124,29 @@ public class CalculateAverage_jerrinot {
|
||||
static void calculate() throws Exception {
|
||||
final File file = new File(MEASUREMENTS_TXT);
|
||||
final long length = file.length();
|
||||
// final int chunkCount = Runtime.getRuntime().availableProcessors();
|
||||
int chunkPerThread = 3;
|
||||
final int chunkCount = THREAD_COUNT * chunkPerThread;
|
||||
final var chunkStartOffsets = new long[chunkCount + 1];
|
||||
try (var raf = new RandomAccessFile(file, "r")) {
|
||||
// credit - chunking code: mtopolnik
|
||||
final var inputBase = raf.getChannel().map(MapMode.READ_ONLY, 0, length, Arena.global()).address();
|
||||
for (int i = 1; i < chunkStartOffsets.length - 1; i++) {
|
||||
var start = length * i / (chunkStartOffsets.length - 1);
|
||||
raf.seek(start);
|
||||
while (raf.read() != (byte) '\n') {
|
||||
}
|
||||
start = raf.getFilePointer();
|
||||
chunkStartOffsets[i] = start + inputBase;
|
||||
}
|
||||
chunkStartOffsets[0] = inputBase;
|
||||
chunkStartOffsets[chunkCount] = inputBase + length;
|
||||
long fileStart = raf.getChannel().map(MapMode.READ_ONLY, 0, length, Arena.global()).address();
|
||||
long fileEnd = fileStart + length;
|
||||
var globalCursor = new AtomicLong(fileStart);
|
||||
|
||||
Processor[] processors = new Processor[THREAD_COUNT];
|
||||
Thread[] threads = new Thread[THREAD_COUNT];
|
||||
Processor[] processors = new Processor[EXTRA_THREAD_COUNT];
|
||||
Thread[] threads = new Thread[EXTRA_THREAD_COUNT];
|
||||
|
||||
for (int i = 0; i < THREAD_COUNT - 1; i++) {
|
||||
long startA = chunkStartOffsets[i * chunkPerThread];
|
||||
long endA = chunkStartOffsets[i * chunkPerThread + 1];
|
||||
long startB = chunkStartOffsets[i * chunkPerThread + 1];
|
||||
long endB = chunkStartOffsets[i * chunkPerThread + 2];
|
||||
long startC = chunkStartOffsets[i * chunkPerThread + 2];
|
||||
long endC = chunkStartOffsets[i * chunkPerThread + 3];
|
||||
|
||||
Processor processor = new Processor(startA, endA, startB, endB, startC, endC);
|
||||
processors[i] = processor;
|
||||
for (int i = 0; i < EXTRA_THREAD_COUNT; i++) {
|
||||
Processor processor = new Processor(fileStart, fileEnd, globalCursor);
|
||||
Thread thread = new Thread(processor);
|
||||
processors[i] = processor;
|
||||
threads[i] = thread;
|
||||
thread.start();
|
||||
}
|
||||
|
||||
int ownIndex = THREAD_COUNT - 1;
|
||||
long startA = chunkStartOffsets[ownIndex * chunkPerThread];
|
||||
long endA = chunkStartOffsets[ownIndex * chunkPerThread + 1];
|
||||
long startB = chunkStartOffsets[ownIndex * chunkPerThread + 1];
|
||||
long endB = chunkStartOffsets[ownIndex * chunkPerThread + 2];
|
||||
long startC = chunkStartOffsets[ownIndex * chunkPerThread + 2];
|
||||
long endC = chunkStartOffsets[ownIndex * chunkPerThread + 3];
|
||||
Processor processor = new Processor(startA, endA, startB, endB, startC, endC);
|
||||
Processor processor = new Processor(fileStart, fileEnd, globalCursor);
|
||||
processor.run();
|
||||
|
||||
var accumulator = new TreeMap<String, Processor.StationStats>();
|
||||
var accumulator = new TreeMap<String, StationStats>();
|
||||
processor.accumulateStatus(accumulator);
|
||||
|
||||
for (int i = 0; i < THREAD_COUNT - 1; i++) {
|
||||
for (int i = 0; i < EXTRA_THREAD_COUNT; i++) {
|
||||
Thread t = threads[i];
|
||||
t.join();
|
||||
processors[i].accumulateStatus(accumulator);
|
||||
@ -140,10 +156,10 @@ public class CalculateAverage_jerrinot {
|
||||
}
|
||||
}
|
||||
|
||||
private static void printResults(TreeMap<String, Processor.StationStats> accumulator) {
|
||||
private static void printResults(TreeMap<String, StationStats> accumulator) {
|
||||
var sb = new StringBuilder(10000);
|
||||
boolean first = true;
|
||||
for (Map.Entry<String, Processor.StationStats> statsEntry : accumulator.entrySet()) {
|
||||
for (Map.Entry<String, StationStats> statsEntry : accumulator.entrySet()) {
|
||||
if (first) {
|
||||
sb.append("{");
|
||||
first = false;
|
||||
@ -210,20 +226,17 @@ public class CalculateAverage_jerrinot {
|
||||
private static final int FAST_MAP_SIZE_BYTES = MAPS_SLOT_COUNT * FAST_MAP_ENTRY_SIZE_BYTES;
|
||||
private static final int SLOW_MAP_MAP_NAMES_BYTES = MAX_UNIQUE_KEYS * STATION_MAX_NAME_BYTES;
|
||||
private static final int MAP_MASK = MAPS_SLOT_COUNT - 1;
|
||||
private final AtomicLong globalCursor;
|
||||
|
||||
private long slowMap;
|
||||
private long slowMapNamesPtr;
|
||||
private long slowMapNamesLo;
|
||||
// private long fastMap;
|
||||
private long cursorA;
|
||||
private long endA;
|
||||
private long cursorB;
|
||||
private long endB;
|
||||
private long cursorC;
|
||||
private long endC;
|
||||
private HashMap<String, StationStats> stats = new HashMap<>(1000);
|
||||
|
||||
// private long maxClusterLen;
|
||||
private HashMap<String, CalculateAverage_jerrinot.StationStats> stats = new HashMap<>(1000);
|
||||
private final long fileEnd;
|
||||
private final long fileStart;
|
||||
|
||||
// credit: merykitty
|
||||
private long parseAndStoreTemperature(long startCursor, long baseEntryPtr, long word) {
|
||||
@ -264,20 +277,12 @@ public class CalculateAverage_jerrinot {
|
||||
return (match - 0x0101010101010101L) & (~match & 0x8080808080808080L);
|
||||
}
|
||||
|
||||
// todo: immutability cost us in allocations, but that's probably peanuts in the grand scheme of things. still worth checking
|
||||
// maybe JVM trusting Final in Records offsets it ..a test is needed
|
||||
record StationStats(int min, int max, int count, long sum) {
|
||||
StationStats mergeWith(StationStats other) {
|
||||
return new StationStats(Math.min(min, other.min), Math.max(max, other.max), count + other.count, sum + other.sum);
|
||||
}
|
||||
}
|
||||
|
||||
void accumulateStatus(TreeMap<String, StationStats> accumulator) {
|
||||
for (Map.Entry<String, StationStats> entry : stats.entrySet()) {
|
||||
void accumulateStatus(TreeMap<String, CalculateAverage_jerrinot.StationStats> accumulator) {
|
||||
for (Map.Entry<String, CalculateAverage_jerrinot.StationStats> entry : stats.entrySet()) {
|
||||
String name = entry.getKey();
|
||||
StationStats localStats = entry.getValue();
|
||||
CalculateAverage_jerrinot.StationStats localStats = entry.getValue();
|
||||
|
||||
StationStats globalStats = accumulator.get(name);
|
||||
CalculateAverage_jerrinot.StationStats globalStats = accumulator.get(name);
|
||||
if (globalStats == null) {
|
||||
accumulator.put(name, localStats);
|
||||
}
|
||||
@ -287,24 +292,10 @@ public class CalculateAverage_jerrinot {
|
||||
}
|
||||
}
|
||||
|
||||
Processor(long startA, long endA, long startB, long endB, long startC, long endC) {
|
||||
this.cursorA = startA;
|
||||
this.cursorB = startB;
|
||||
this.cursorC = startC;
|
||||
this.endA = endA;
|
||||
this.endB = endB;
|
||||
this.endC = endC;
|
||||
}
|
||||
|
||||
private void doTail(long fastMAp) {
|
||||
doOne(cursorA, endA);
|
||||
doOne(cursorB, endB);
|
||||
doOne(cursorC, endC);
|
||||
|
||||
transferToHeap(fastMAp);
|
||||
// UNSAFE.freeMemory(fastMap);
|
||||
// UNSAFE.freeMemory(slowMap);
|
||||
// UNSAFE.freeMemory(slowMapNamesLo);
|
||||
Processor(long fileStart, long fileEnd, AtomicLong globalCursor) {
|
||||
this.globalCursor = globalCursor;
|
||||
this.fileEnd = fileEnd;
|
||||
this.fileStart = fileStart;
|
||||
}
|
||||
|
||||
private void transferToHeap(long fastMap) {
|
||||
@ -324,7 +315,7 @@ public class CalculateAverage_jerrinot {
|
||||
int count = UNSAFE.getInt(baseAddress + MAP_COUNT_OFFSET);
|
||||
long sum = UNSAFE.getLong(baseAddress + MAP_SUM_OFFSET);
|
||||
|
||||
stats.put(name, new StationStats(min, max, count, sum));
|
||||
stats.put(name, new CalculateAverage_jerrinot.StationStats(min, max, count, sum));
|
||||
}
|
||||
|
||||
for (long baseAddress = fastMap; baseAddress < fastMap + FAST_MAP_SIZE_BYTES; baseAddress += FAST_MAP_ENTRY_SIZE_BYTES) {
|
||||
@ -345,16 +336,21 @@ public class CalculateAverage_jerrinot {
|
||||
|
||||
var v = stats.get(name);
|
||||
if (v == null) {
|
||||
stats.put(name, new StationStats(min, max, count, sum));
|
||||
stats.put(name, new CalculateAverage_jerrinot.StationStats(min, max, count, sum));
|
||||
}
|
||||
else {
|
||||
stats.put(name, new StationStats(Math.min(v.min, min), Math.max(v.max, max), v.count + count, v.sum + sum));
|
||||
stats.put(name, new CalculateAverage_jerrinot.StationStats(Math.min(v.min, min), Math.max(v.max, max), v.count + count, v.sum + sum));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void doOne(long cursor, long endA) {
|
||||
while (cursor < endA) {
|
||||
private void doOne(long cursor, long end) {
|
||||
while (cursor < end) {
|
||||
// it seems that when pulling just from a single chunk
|
||||
// then bit-twiddling is faster than lookup tables
|
||||
// hypothesis: when processing multiple things at once then LOAD latency is partially hidden
|
||||
// but when processing just one thing then it's better to keep things local as much as possible? maybe:)
|
||||
|
||||
long start = cursor;
|
||||
long currentWord = UNSAFE.getLong(cursor);
|
||||
long mask = getDelimiterMask(currentWord);
|
||||
@ -392,135 +388,139 @@ public class CalculateAverage_jerrinot {
|
||||
return (int) hash;
|
||||
}
|
||||
|
||||
/**
 * Scans memory 8 bytes at a time, starting at {@code prev}, until a word containing
 * a byte equal to 0x0A is found.
 * <p>
 * There is no end-of-buffer check in this method: it keeps reading until a match is found.
 *
 * @param prev raw memory address to start scanning from
 * @return {@code prev} advanced by the byte index of the lowest matching byte in the word
 *         where the match was found; presumably the address of the first newline at or
 *         after {@code prev}, assuming little-endian loads - TODO confirm
 */
private static long nextNewLine(long prev) {
|
||||
// again: credits to @thomaswue for this code, literally copy'n'paste
|
||||
while (true) {
|
||||
long currentWord = UNSAFE.getLong(prev);
|
||||
// bytes equal to '\n' become 0x00 after the XOR
|
||||
long input = currentWord ^ NEW_LINE_PATTERN;
|
||||
// SWAR zero-byte test: sets the high bit of (at least) the lowest zero byte of input
|
||||
long pos = (input - 0x0101010101010101L) & ~input & 0x8080808080808080L;
|
||||
if (pos != 0) {
|
||||
// bit index -> byte index within the word
|
||||
prev += Long.numberOfTrailingZeros(pos) >>> 3;
|
||||
break;
|
||||
}
|
||||
else {
|
||||
// no newline in this word, move on to the next 8 bytes
|
||||
prev += 8;
|
||||
}
|
||||
}
|
||||
return prev;
|
||||
}
|
||||
|
||||
/**
 * Worker entry point: repeatedly claims a SEGMENT_SIZE-byte segment by advancing the
 * shared {@code globalCursor}, processes it, and finally copies the collected
 * statistics to the heap via {@code transferToHeap}.
 */
@Override
|
||||
public void run() {
|
||||
long fastMap = allocateMem();
|
||||
for (;;) {
|
||||
// addAndGet returns the value after the increment, so subtract to get the start of the claimed segment
|
||||
long startingPtr = globalCursor.addAndGet(SEGMENT_SIZE) - SEGMENT_SIZE;
|
||||
if (startingPtr >= fileEnd) {
|
||||
// nothing left to claim
|
||||
break;
|
||||
}
|
||||
// split the claimed segment into the two ranges [cursorA, endA) and [cursorB, endB)
|
||||
setCursors(startingPtr);
|
||||
mainLoop(fastMap);
|
||||
// drain whatever remains of each range after mainLoop returns
|
||||
doOne(cursorA, endA);
|
||||
doOne(cursorB, endB);
|
||||
}
|
||||
transferToHeap(fastMap);
|
||||
}
|
||||
|
||||
/**
 * Allocates and zero-fills the three off-heap regions used by this processor:
 * the slow map, the slow-map name storage, and the fast map.
 * The slow-map addresses are stored in fields; nothing is freed in this method.
 *
 * @return base address of the newly allocated fast map
 */
private long allocateMem() {
|
||||
this.slowMap = UNSAFE.allocateMemory(SLOW_MAP_SIZE_BYTES);
|
||||
this.slowMapNamesPtr = UNSAFE.allocateMemory(SLOW_MAP_MAP_NAMES_BYTES);
|
||||
// remember the base address of the name storage
|
||||
this.slowMapNamesLo = slowMapNamesPtr;
|
||||
long fastMap = UNSAFE.allocateMemory(FAST_MAP_SIZE_BYTES);
|
||||
// allocateMemory returns uninitialized memory, so clear every region explicitly
|
||||
UNSAFE.setMemory(slowMap, SLOW_MAP_SIZE_BYTES, (byte) 0);
|
||||
UNSAFE.setMemory(fastMap, FAST_MAP_SIZE_BYTES, (byte) 0);
|
||||
UNSAFE.setMemory(slowMapNamesPtr, SLOW_MAP_MAP_NAMES_BYTES, (byte) 0);
|
||||
return fastMap;
|
||||
}
|
||||
|
||||
while (cursorA < endA && cursorB < endB && cursorC < endC) {
|
||||
private void mainLoop(long fastMap) {
|
||||
while (cursorA < endA && cursorB < endB) {
|
||||
long currentWordA = UNSAFE.getLong(cursorA);
|
||||
long currentWordB = UNSAFE.getLong(cursorB);
|
||||
long currentWordC = UNSAFE.getLong(cursorC);
|
||||
|
||||
long delimiterMaskA = getDelimiterMask(currentWordA);
|
||||
long delimiterMaskB = getDelimiterMask(currentWordB);
|
||||
|
||||
long candidateWordA = UNSAFE.getLong(cursorA + 8);
|
||||
long candidateWordB = UNSAFE.getLong(cursorB + 8);
|
||||
|
||||
long startA = cursorA;
|
||||
long startB = cursorB;
|
||||
long startC = cursorC;
|
||||
|
||||
long maskA = getDelimiterMask(currentWordA);
|
||||
long maskB = getDelimiterMask(currentWordB);
|
||||
long maskC = getDelimiterMask(currentWordC);
|
||||
int trailingZerosA = Long.numberOfTrailingZeros(delimiterMaskA) >> 3;
|
||||
int trailingZerosB = Long.numberOfTrailingZeros(delimiterMaskB) >> 3;
|
||||
|
||||
long maskComplementA = -maskA;
|
||||
long maskComplementB = -maskB;
|
||||
long maskComplementC = -maskC;
|
||||
long advanceMaskA = ADVANCE_MASKS[trailingZerosA];
|
||||
long advanceMaskB = ADVANCE_MASKS[trailingZerosB];
|
||||
|
||||
long maskWithDelimiterA = (maskA ^ (maskA - 1));
|
||||
long maskWithDelimiterB = (maskB ^ (maskB - 1));
|
||||
long maskWithDelimiterC = (maskC ^ (maskC - 1));
|
||||
long wordMaskA = HASH_MASKS[trailingZerosA];
|
||||
long wordMaskB = HASH_MASKS[trailingZerosB];
|
||||
|
||||
long isMaskZeroA = (((maskA | maskComplementA) >>> 63) ^ 1);
|
||||
long isMaskZeroB = (((maskB | maskComplementB) >>> 63) ^ 1);
|
||||
long isMaskZeroC = (((maskC | maskComplementC) >>> 63) ^ 1);
|
||||
long negAdvanceMaskA = ~advanceMaskA;
|
||||
long negAdvanceMaskB = ~advanceMaskB;
|
||||
|
||||
cursorA += isMaskZeroA << 3;
|
||||
cursorB += isMaskZeroB << 3;
|
||||
cursorC += isMaskZeroC << 3;
|
||||
cursorA += advanceMaskA & 8;
|
||||
cursorB += advanceMaskB & 8;
|
||||
|
||||
long nextWordA = UNSAFE.getLong(cursorA);
|
||||
long nextWordB = UNSAFE.getLong(cursorB);
|
||||
long nextWordC = UNSAFE.getLong(cursorC);
|
||||
long nextWordA = (advanceMaskA & candidateWordA) | (negAdvanceMaskA & currentWordA);
|
||||
long nextWordB = (advanceMaskB & candidateWordB) | (negAdvanceMaskB & currentWordB);
|
||||
|
||||
long firstWordMaskA = maskWithDelimiterA >>> 8;
|
||||
long firstWordMaskB = maskWithDelimiterB >>> 8;
|
||||
long firstWordMaskC = maskWithDelimiterC >>> 8;
|
||||
long nextDelimiterMaskA = getDelimiterMask(nextWordA);
|
||||
long nextDelimiterMaskB = getDelimiterMask(nextWordB);
|
||||
|
||||
long nextMaskA = getDelimiterMask(nextWordA);
|
||||
long nextMaskB = getDelimiterMask(nextWordB);
|
||||
long nextMaskC = getDelimiterMask(nextWordC);
|
||||
boolean slowA = nextDelimiterMaskA == 0;
|
||||
boolean slowB = nextDelimiterMaskB == 0;
|
||||
boolean slowSome = (slowA || slowB);
|
||||
|
||||
boolean slowA = nextMaskA == 0;
|
||||
boolean slowB = nextMaskB == 0;
|
||||
boolean slowC = nextMaskC == 0;
|
||||
boolean slowSome = (slowA || slowB || slowC);
|
||||
|
||||
long extA = -isMaskZeroA;
|
||||
long extB = -isMaskZeroB;
|
||||
long extC = -isMaskZeroC;
|
||||
|
||||
long maskedFirstWordA = (extA | firstWordMaskA) & currentWordA;
|
||||
long maskedFirstWordB = (extB | firstWordMaskB) & currentWordB;
|
||||
long maskedFirstWordC = (extC | firstWordMaskC) & currentWordC;
|
||||
long maskedFirstWordA = wordMaskA & currentWordA;
|
||||
long maskedFirstWordB = wordMaskB & currentWordB;
|
||||
|
||||
int hashA = hash(maskedFirstWordA);
|
||||
int hashB = hash(maskedFirstWordB);
|
||||
int hashC = hash(maskedFirstWordC);
|
||||
|
||||
currentWordA = nextWordA;
|
||||
currentWordB = nextWordB;
|
||||
currentWordC = nextWordC;
|
||||
|
||||
maskA = nextMaskA;
|
||||
maskB = nextMaskB;
|
||||
maskC = nextMaskC;
|
||||
delimiterMaskA = nextDelimiterMaskA;
|
||||
delimiterMaskB = nextDelimiterMaskB;
|
||||
if (slowSome) {
|
||||
while (maskA == 0) {
|
||||
while (delimiterMaskA == 0) {
|
||||
cursorA += 8;
|
||||
currentWordA = UNSAFE.getLong(cursorA);
|
||||
maskA = getDelimiterMask(currentWordA);
|
||||
delimiterMaskA = getDelimiterMask(currentWordA);
|
||||
}
|
||||
|
||||
while (maskB == 0) {
|
||||
while (delimiterMaskB == 0) {
|
||||
cursorB += 8;
|
||||
currentWordB = UNSAFE.getLong(cursorB);
|
||||
maskB = getDelimiterMask(currentWordB);
|
||||
}
|
||||
while (maskC == 0) {
|
||||
cursorC += 8;
|
||||
currentWordC = UNSAFE.getLong(cursorC);
|
||||
maskC = getDelimiterMask(currentWordC);
|
||||
delimiterMaskB = getDelimiterMask(currentWordB);
|
||||
}
|
||||
}
|
||||
|
||||
final int delimiterByteA = Long.numberOfTrailingZeros(maskA);
|
||||
final int delimiterByteB = Long.numberOfTrailingZeros(maskB);
|
||||
final int delimiterByteC = Long.numberOfTrailingZeros(maskC);
|
||||
trailingZerosA = Long.numberOfTrailingZeros(delimiterMaskA) >> 3;
|
||||
trailingZerosB = Long.numberOfTrailingZeros(delimiterMaskB) >> 3;
|
||||
|
||||
final long semicolonA = cursorA + (delimiterByteA >> 3);
|
||||
final long semicolonB = cursorB + (delimiterByteB >> 3);
|
||||
final long semicolonC = cursorC + (delimiterByteC >> 3);
|
||||
final long semicolonA = cursorA + trailingZerosA;
|
||||
final long semicolonB = cursorB + trailingZerosB;
|
||||
|
||||
long digitStartA = semicolonA + 1;
|
||||
long digitStartB = semicolonB + 1;
|
||||
long digitStartC = semicolonC + 1;
|
||||
|
||||
long lastWordMaskA = HASH_MASKS[trailingZerosA];
|
||||
long lastWordMaskB = HASH_MASKS[trailingZerosB];
|
||||
|
||||
long temperatureWordA = UNSAFE.getLong(digitStartA);
|
||||
long temperatureWordB = UNSAFE.getLong(digitStartB);
|
||||
long temperatureWordC = UNSAFE.getLong(digitStartC);
|
||||
|
||||
long lastWordMaskA = ((maskA - 1) ^ maskA) >>> 8;
|
||||
long lastWordMaskB = ((maskB - 1) ^ maskB) >>> 8;
|
||||
long lastWordMaskC = ((maskC - 1) ^ maskC) >>> 8;
|
||||
|
||||
final long maskedLastWordA = currentWordA & lastWordMaskA;
|
||||
final long maskedLastWordB = currentWordB & lastWordMaskB;
|
||||
final long maskedLastWordC = currentWordC & lastWordMaskC;
|
||||
|
||||
int lenA = (int) (semicolonA - startA);
|
||||
int lenB = (int) (semicolonB - startB);
|
||||
int lenC = (int) (semicolonC - startC);
|
||||
|
||||
int mapIndexA = hashA & MAP_MASK;
|
||||
int mapIndexB = hashB & MAP_MASK;
|
||||
int mapIndexC = hashC & MAP_MASK;
|
||||
|
||||
long baseEntryPtrA;
|
||||
long baseEntryPtrB;
|
||||
long baseEntryPtrC;
|
||||
|
||||
if (slowSome) {
|
||||
if (slowA) {
|
||||
@ -537,25 +537,37 @@ public class CalculateAverage_jerrinot {
|
||||
baseEntryPtrB = getOrCreateEntryBaseOffsetFast(mapIndexB, lenB, maskedLastWordB, maskedFirstWordB, fastMap);
|
||||
}
|
||||
|
||||
if (slowC) {
|
||||
baseEntryPtrC = getOrCreateEntryBaseOffsetSlow(lenC, startC, hashC, maskedLastWordC);
|
||||
}
|
||||
else {
|
||||
baseEntryPtrC = getOrCreateEntryBaseOffsetFast(mapIndexC, lenC, maskedLastWordC, maskedFirstWordC, fastMap);
|
||||
}
|
||||
}
|
||||
else {
|
||||
baseEntryPtrA = getOrCreateEntryBaseOffsetFast(mapIndexA, lenA, maskedLastWordA, maskedFirstWordA, fastMap);
|
||||
baseEntryPtrB = getOrCreateEntryBaseOffsetFast(mapIndexB, lenB, maskedLastWordB, maskedFirstWordB, fastMap);
|
||||
baseEntryPtrC = getOrCreateEntryBaseOffsetFast(mapIndexC, lenC, maskedLastWordC, maskedFirstWordC, fastMap);
|
||||
}
|
||||
|
||||
cursorA = parseAndStoreTemperature(digitStartA, baseEntryPtrA, temperatureWordA);
|
||||
cursorB = parseAndStoreTemperature(digitStartB, baseEntryPtrB, temperatureWordB);
|
||||
cursorC = parseAndStoreTemperature(digitStartC, baseEntryPtrC, temperatureWordC);
|
||||
}
|
||||
doTail(fastMap);
|
||||
// System.out.println("Longest chain: " + longestChain);
|
||||
}
|
||||
|
||||
/**
 * Splits the segment starting at {@code current} into two line-aligned ranges and stores
 * them in {@code cursorA/endA} and {@code cursorB/endB}.
 *
 * @param current raw start address of the claimed segment (not necessarily at a line start)
 */
private void setCursors(long current) {
|
||||
// Credit for the whole work-stealing scheme: @thomaswue
|
||||
// I have totally stolen it from him. I changed the order a bit to suit my taste better,
|
||||
// but it's his code
|
||||
long segmentStart;
|
||||
if (current == fileStart) {
|
||||
// the very first segment starts at the beginning of the file
|
||||
segmentStart = current;
|
||||
}
|
||||
else {
|
||||
// start right after the first newline found at or after current
|
||||
segmentStart = nextNewLine(current) + 1;
|
||||
}
|
||||
// the segment ends at the first newline found at or after its nominal end, clamped to fileEnd - 1
|
||||
long segmentEnd = nextNewLine(Math.min(fileEnd - 1, current + SEGMENT_SIZE));
|
||||
|
||||
// split roughly in half, then move the split point forward to a newline
|
||||
long size = (segmentEnd - segmentStart) / 2;
|
||||
long mid = nextNewLine(segmentStart + size);
|
||||
|
||||
cursorA = segmentStart;
|
||||
endA = mid;
|
||||
// range B starts on the byte after the newline that terminates range A
|
||||
cursorB = mid + 1;
|
||||
endB = segmentEnd;
|
||||
}
|
||||
|
||||
private static long getOrCreateEntryBaseOffsetFast(int mapIndexA, int lenA, long maskedLastWord, long maskedFirstWord, long fastMap) {
|
||||
@ -625,4 +637,9 @@ public class CalculateAverage_jerrinot {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Immutable aggregate for one station: minimum, maximum, number of samples and their sum.
 */
record StationStats(int min, int max, int count, long sum) {
|
||||
/**
 * Combines this aggregate with {@code other} into a new instance; neither operand is modified.
 *
 * @param other aggregate to merge with
 * @return new aggregate holding the smaller min, larger max, and summed count and sum
 */
StationStats mergeWith(StationStats other) {
|
||||
return new StationStats(Math.min(min, other.min), Math.max(max, other.max), count + other.count, sum + other.sum);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user