From 457a36be639aae2186e641b6db1090965d9951f2 Mon Sep 17 00:00:00 2001 From: Jason Nochlin <91577+hundredwatt@users.noreply.github.com> Date: Fri, 26 Jan 2024 10:22:35 -0700 Subject: [PATCH] Fix hundredwatt's entry on 10k dataset (#558) * Improve hash function * remove limit on number of cores * fix calculation of boundaries between chunks * fix IOOBE --------- Co-authored-by: Jason Nochlin --- .../onebrc/CalculateAverage_hundredwatt.java | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_hundredwatt.java b/src/main/java/dev/morling/onebrc/CalculateAverage_hundredwatt.java index 9d935ff..24a173a 100644 --- a/src/main/java/dev/morling/onebrc/CalculateAverage_hundredwatt.java +++ b/src/main/java/dev/morling/onebrc/CalculateAverage_hundredwatt.java @@ -31,7 +31,7 @@ import java.util.stream.IntStream; public class CalculateAverage_hundredwatt { private static final String FILE = "./measurements.txt"; private static final int MAX_ROW_SIZE = 100 + 1 + 5 + 1; // 100 for city name, 1 for ;, 5 for temperature, 1 for \n - private static final int THREAD_COUNT = Math.min(8, Runtime.getRuntime().availableProcessors()); + private static final int THREAD_COUNT = Runtime.getRuntime().availableProcessors(); private static final long BUFFER_SIZE = 128 * 1024 * 1024; // 128MB private static final long CHUNK_SIZE = BUFFER_SIZE / THREAD_COUNT; private static final long FILE_CHUNK_SIZE = CHUNK_SIZE - MAX_ROW_SIZE; @@ -209,10 +209,10 @@ public class CalculateAverage_hundredwatt { short temperature_value; int hashInt; - int i = 0; + int rc = 0; int end = (int) (size - MAX_ROW_SIZE); - while (position < end) { - i++; + while (position <= end) { + // rc++; offset = -1; // Parse city name @@ -257,11 +257,11 @@ public class CalculateAverage_hundredwatt { position = position + newlinePos / 8 + 2; // +1 for \n - hashInt = (int) (hash ^ (hash >> 32)); + hashInt = (int) (hash ^ (hash >> 32) ^ (hash >> 17)); hashTable.putOrMerge(hashInt, offset + 1, key, temperature_value); } - return i; + return rc; } public static void main(String[] args) throws IOException { @@ -282,7 +282,7 @@ public class CalculateAverage_hundredwatt { byte[] trailing = new byte[MAX_ROW_SIZE * 2]; fileChannel.read(ByteBuffer.wrap(trailing), Math.max(0, fileSize - MAX_ROW_SIZE)); var rc = processChunk(ByteBuffer.wrap(trailing), hashTable, Math.max(0, fileSize - MAX_ROW_SIZE), - MAX_ROW_SIZE + Math.min(fileSize, MAX_ROW_SIZE)); + MAX_ROW_SIZE + Math.min(fileSize, MAX_ROW_SIZE) - 1); // rowCount.addAndGet(rc); return hashTable; @@ -292,11 +292,16 @@ public class CalculateAverage_hundredwatt { } } + // if file is smaller than max row size, we're done b/c the trailing bytes handler processed the whole file + if (fileSize <= MAX_ROW_SIZE) { + return hashTable; + } + while (start < fileSize) { long end = Math.min(start + CHUNK_SIZE, fileSize); MappedByteBuffer bb = null; try { - bb = fileChannel.map(FileChannel.MapMode.READ_ONLY, start, end - start); + bb = fileChannel.map(FileChannel.MapMode.READ_ONLY, start, Math.min(end - start + 8, fileSize - start)); } catch (IOException e) { throw new RuntimeException(e);