From 936fc1da5493849d2aaf7f71f00f7f81067b6129 Mon Sep 17 00:00:00 2001 From: Alberto Venturini Date: Sun, 28 Jan 2024 11:02:42 +0200 Subject: [PATCH] Second version by albertoventurini (#609) * Contribution by albertoventurini * Use byte arrays of size 2^20 --------- Co-authored-by: Alberto Venturini --- calculate_average_albertoventurini.sh | 2 +- .../CalculateAverage_albertoventurini.java | 91 ++++++++++++------- 2 files changed, 61 insertions(+), 32 deletions(-) diff --git a/calculate_average_albertoventurini.sh b/calculate_average_albertoventurini.sh index d997264..6263e14 100755 --- a/calculate_average_albertoventurini.sh +++ b/calculate_average_albertoventurini.sh @@ -15,5 +15,5 @@ # limitations under the License. # -JAVA_OPTS="-server -Xnoclassgc" +JAVA_OPTS="-Xnoclassgc" java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_albertoventurini diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_albertoventurini.java b/src/main/java/dev/morling/onebrc/CalculateAverage_albertoventurini.java index 406c759..91e00e3 100644 --- a/src/main/java/dev/morling/onebrc/CalculateAverage_albertoventurini.java +++ b/src/main/java/dev/morling/onebrc/CalculateAverage_albertoventurini.java @@ -58,31 +58,31 @@ public class CalculateAverage_albertoventurini { // Process a chunk and write results in a Trie rooted at 'root'. private static void processChunk(final TrieNode root, final ChunkReader cr) { - while (cr.hasNext()) { + while (cr.ensureHasMoreRows()) { TrieNode node = root; // Process the location name navigating through the trie - int b = cr.getNext() & 0xFF; - while (b != ';') { + int b = cr.getNext(); + do { + b &= 0xFF; if (node.children[b] == null) { node.children[b] = new TrieNode(); } node = node.children[b]; - b = cr.getNext() & 0xFF; - } + b = cr.getNext(); + } while (b != ';'); // Process the reading value (temperature) - int reading; + final int reading; - byte b1 = cr.getNext(); - byte b2 = cr.getNext(); - byte b3 = cr.getNext(); - byte b4 = cr.getNext(); + final byte b1 = cr.getNext(); + final byte b2 = cr.getNext(); if (b2 == '.') { // value is n.n - reading = (b1 * 10 + b3 - TWO_BYTE_TO_INT); - // b4 == \n + reading = (b1 * 10 + cr.getNext() - TWO_BYTE_TO_INT); } else { + final byte b3 = cr.getNext(); + final byte b4 = cr.getNext(); if (b4 == '.') { // value is -nn.n reading = -(b2 * 100 + b3 * 10 + cr.getNext() - THREE_BYTE_TO_INT); } @@ -92,11 +92,15 @@ public class CalculateAverage_albertoventurini { else { // value is nn.n reading = (b1 * 100 + b2 * 10 + b4 - THREE_BYTE_TO_INT); } - cr.getNext(); // new line } + cr.cursor++; // new line - node.min = Math.min(node.min, reading); - node.max = Math.max(node.max, reading); + if (reading < node.min) { + node.min = reading; + } + if (reading > node.max) { + node.max = reading; + } node.sum += reading; node.count++; } @@ -165,27 +169,41 @@ public class CalculateAverage_albertoventurini { bytes[index] = (byte) i; printResultsRec(childNodes, bytes, index + 1); } - } } } private static final String FILE = "./measurements.txt"; + /** + * Read a chunk of a {@link RandomAccessFile} file. + * Internally, the chunk is further subdivided into "sub-chunks" (byte arrays). + */ private static final class ChunkReader { - // Byte arrays of size 2^22 seem to have the best performance on my machine. - private static final int BYTE_ARRAY_SIZE = 1 << 22; + // Byte arrays of size 2^20 seem to have the best performance on my machine. + private static final int BYTE_ARRAY_SIZE = 1 << 20; private final byte[] bytes; private final RandomAccessFile file; + + // The initial position of this chunk. private final long chunkBegin; + + // The length of this chunk. private final long chunkLength; - private int readBytes = 0; - - private int cursor = 0; + // The beginning of the current "sub-chunk", relative to the initial position of the chunk. private long offset = 0; + // The size of the current "sub-chunk". + private int subChunkSize = 0; + + // The current position within the current "sub-chunk". + private int cursor = 0; + + // The maximum size of a row + private static final int MAX_ROW_SIZE_BYTES = 107; + ChunkReader( final RandomAccessFile file, final long chunkBegin, @@ -197,32 +215,43 @@ public class CalculateAverage_albertoventurini { int byteArraySize = chunkLength < BYTE_ARRAY_SIZE ? (int) chunkLength : BYTE_ARRAY_SIZE; this.bytes = new byte[byteArraySize]; - readNextBytes(); + readSubChunk(); } - boolean hasNext() { - return (offset + cursor) < chunkLength; + // Return true if this ChunkReader has more bytes available, false otherwise. + // If this ChunkReader needs to read a new "sub-chunk", it does so in this method. + boolean ensureHasMoreRows() { + if (cursor >= subChunkSize) { + offset += cursor; + if (offset >= chunkLength) { + return false; + } + readSubChunk(); + } + + return true; } byte getNext() { - if (cursor >= readBytes) { - readNextBytes(); - } return bytes[cursor++]; } - private void readNextBytes() { + private void readSubChunk() { try { - offset += readBytes; synchronized (file) { file.seek(chunkBegin + offset); - readBytes = file.read(bytes); + subChunkSize = file.read(bytes); } - cursor = 0; } catch (IOException e) { throw new RuntimeException(e); } + + // Always "pretend" that we've read a few bytes less, + // so that we don't stop in the middle of reading a row + subChunkSize -= MAX_ROW_SIZE_BYTES; + + cursor = 0; } }