Second version by albertoventurini (#609)

* Contribution by albertoventurini

* Use byte arrays of size 2^20

---------

Co-authored-by: Alberto Venturini <alberto.venturini@accso.de>
This commit is contained in:
Alberto Venturini 2024-01-28 11:02:42 +02:00 committed by GitHub
parent 3e208be741
commit 936fc1da54
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 61 additions and 32 deletions

View File

@@ -15,5 +15,5 @@
# limitations under the License. # limitations under the License.
# #
JAVA_OPTS="-server -Xnoclassgc" JAVA_OPTS="-Xnoclassgc"
java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_albertoventurini java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_albertoventurini

View File

@@ -58,31 +58,31 @@ public class CalculateAverage_albertoventurini {
// Process a chunk and write results in a Trie rooted at 'root'. // Process a chunk and write results in a Trie rooted at 'root'.
private static void processChunk(final TrieNode root, final ChunkReader cr) { private static void processChunk(final TrieNode root, final ChunkReader cr) {
while (cr.hasNext()) { while (cr.ensureHasMoreRows()) {
TrieNode node = root; TrieNode node = root;
// Process the location name navigating through the trie // Process the location name navigating through the trie
int b = cr.getNext() & 0xFF; int b = cr.getNext();
while (b != ';') { do {
b &= 0xFF;
if (node.children[b] == null) { if (node.children[b] == null) {
node.children[b] = new TrieNode(); node.children[b] = new TrieNode();
} }
node = node.children[b]; node = node.children[b];
b = cr.getNext() & 0xFF; b = cr.getNext();
} } while (b != ';');
// Process the reading value (temperature) // Process the reading value (temperature)
int reading; final int reading;
byte b1 = cr.getNext(); final byte b1 = cr.getNext();
byte b2 = cr.getNext(); final byte b2 = cr.getNext();
byte b3 = cr.getNext();
byte b4 = cr.getNext();
if (b2 == '.') { // value is n.n if (b2 == '.') { // value is n.n
reading = (b1 * 10 + b3 - TWO_BYTE_TO_INT); reading = (b1 * 10 + cr.getNext() - TWO_BYTE_TO_INT);
// b4 == \n
} }
else { else {
final byte b3 = cr.getNext();
final byte b4 = cr.getNext();
if (b4 == '.') { // value is -nn.n if (b4 == '.') { // value is -nn.n
reading = -(b2 * 100 + b3 * 10 + cr.getNext() - THREE_BYTE_TO_INT); reading = -(b2 * 100 + b3 * 10 + cr.getNext() - THREE_BYTE_TO_INT);
} }
@@ -92,11 +92,15 @@ public class CalculateAverage_albertoventurini {
else { // value is nn.n else { // value is nn.n
reading = (b1 * 100 + b2 * 10 + b4 - THREE_BYTE_TO_INT); reading = (b1 * 100 + b2 * 10 + b4 - THREE_BYTE_TO_INT);
} }
cr.getNext(); // new line
} }
cr.cursor++; // new line
node.min = Math.min(node.min, reading); if (reading < node.min) {
node.max = Math.max(node.max, reading); node.min = reading;
}
if (reading > node.max) {
node.max = reading;
}
node.sum += reading; node.sum += reading;
node.count++; node.count++;
} }
@@ -165,27 +169,41 @@ public class CalculateAverage_albertoventurini {
bytes[index] = (byte) i; bytes[index] = (byte) i;
printResultsRec(childNodes, bytes, index + 1); printResultsRec(childNodes, bytes, index + 1);
} }
} }
} }
} }
private static final String FILE = "./measurements.txt"; private static final String FILE = "./measurements.txt";
/**
* Read a chunk of a {@link RandomAccessFile} file.
* Internally, the chunk is further subdivided into "sub-chunks" (byte arrays).
*/
private static final class ChunkReader { private static final class ChunkReader {
// Byte arrays of size 2^22 seem to have the best performance on my machine. // Byte arrays of size 2^20 seem to have the best performance on my machine.
private static final int BYTE_ARRAY_SIZE = 1 << 22; private static final int BYTE_ARRAY_SIZE = 1 << 20;
private final byte[] bytes; private final byte[] bytes;
private final RandomAccessFile file; private final RandomAccessFile file;
// The initial position of this chunk.
private final long chunkBegin; private final long chunkBegin;
// The length of this chunk.
private final long chunkLength; private final long chunkLength;
private int readBytes = 0; // The beginning of the current "sub-chunk", relative to the initial position of the chunk.
private int cursor = 0;
private long offset = 0; private long offset = 0;
// The size of the current "sub-chunk".
private int subChunkSize = 0;
// The current position within the current "sub-chunk".
private int cursor = 0;
// The maximum size of a row
private static final int MAX_ROW_SIZE_BYTES = 107;
ChunkReader( ChunkReader(
final RandomAccessFile file, final RandomAccessFile file,
final long chunkBegin, final long chunkBegin,
@@ -197,32 +215,43 @@ public class CalculateAverage_albertoventurini {
int byteArraySize = chunkLength < BYTE_ARRAY_SIZE ? (int) chunkLength : BYTE_ARRAY_SIZE; int byteArraySize = chunkLength < BYTE_ARRAY_SIZE ? (int) chunkLength : BYTE_ARRAY_SIZE;
this.bytes = new byte[byteArraySize]; this.bytes = new byte[byteArraySize];
readNextBytes(); readSubChunk();
} }
boolean hasNext() { // Return true if this ChunkReader has more bytes available, false otherwise.
return (offset + cursor) < chunkLength; // If this ChunkReader needs to read a new "sub-chunk", it does so in this method.
boolean ensureHasMoreRows() {
if (cursor >= subChunkSize) {
offset += cursor;
if (offset >= chunkLength) {
return false;
}
readSubChunk();
}
return true;
} }
byte getNext() { byte getNext() {
if (cursor >= readBytes) {
readNextBytes();
}
return bytes[cursor++]; return bytes[cursor++];
} }
private void readNextBytes() { private void readSubChunk() {
try { try {
offset += readBytes;
synchronized (file) { synchronized (file) {
file.seek(chunkBegin + offset); file.seek(chunkBegin + offset);
readBytes = file.read(bytes); subChunkSize = file.read(bytes);
} }
cursor = 0;
} }
catch (IOException e) { catch (IOException e) {
throw new RuntimeException(e); throw new RuntimeException(e);
} }
// Always "pretend" that we've read a few bytes less,
// so that we don't stop in the middle of reading a row
subChunkSize -= MAX_ROW_SIZE_BYTES;
cursor = 0;
} }
} }