Second version by albertoventurini (#609)

* Contribution by albertoventurini

* Use byte arrays of size 2^20

---------

Co-authored-by: Alberto Venturini <alberto.venturini@accso.de>
Alberto Venturini authored on 2024-01-28 11:02:42 +02:00, committed by GitHub
parent 3e208be741
commit 936fc1da54
2 changed files with 61 additions and 32 deletions


@@ -15,5 +15,5 @@
 # limitations under the License.
 #
-JAVA_OPTS="-server -Xnoclassgc"
+JAVA_OPTS="-Xnoclassgc"
 java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_albertoventurini


@@ -58,31 +58,31 @@ public class CalculateAverage_albertoventurini {
     // Process a chunk and write results in a Trie rooted at 'root'.
     private static void processChunk(final TrieNode root, final ChunkReader cr) {
-        while (cr.hasNext()) {
+        while (cr.ensureHasMoreRows()) {
             TrieNode node = root;
             // Process the location name navigating through the trie
-            int b = cr.getNext() & 0xFF;
-            while (b != ';') {
+            int b = cr.getNext();
+            do {
+                b &= 0xFF;
                 if (node.children[b] == null) {
                     node.children[b] = new TrieNode();
                 }
                 node = node.children[b];
-                b = cr.getNext() & 0xFF;
-            }
+                b = cr.getNext();
+            } while (b != ';');
             // Process the reading value (temperature)
-            int reading;
+            final int reading;
-            byte b1 = cr.getNext();
-            byte b2 = cr.getNext();
-            byte b3 = cr.getNext();
-            byte b4 = cr.getNext();
+            final byte b1 = cr.getNext();
+            final byte b2 = cr.getNext();
             if (b2 == '.') { // value is n.n
-                reading = (b1 * 10 + b3 - TWO_BYTE_TO_INT);
-                // b4 == \n
+                reading = (b1 * 10 + cr.getNext() - TWO_BYTE_TO_INT);
             }
             else {
+                final byte b3 = cr.getNext();
+                final byte b4 = cr.getNext();
                 if (b4 == '.') { // value is -nn.n
                     reading = -(b2 * 100 + b3 * 10 + cr.getNext() - THREE_BYTE_TO_INT);
                 }
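
For reference, the parsing above leans on ASCII arithmetic: TWO_BYTE_TO_INT and THREE_BYTE_TO_INT are defined outside this hunk, and the standalone sketch below assumes they are the usual '0'-based offsets, so that the computed reading is the temperature in tenths of a degree.

// Minimal standalone sketch of the assumed ASCII-digit arithmetic (not part of this commit).
public class AsciiParseSketch {
    static final int TWO_BYTE_TO_INT = '0' * 10 + '0'; // 528 (assumed value)
    static final int THREE_BYTE_TO_INT = '0' * 100 + '0' * 10 + '0'; // 5328 (assumed value)

    public static void main(String[] args) {
        // "3.4": b1 = '3', then the digit after the '.' comes from getNext()
        byte b1 = '3', afterDot = '4';
        System.out.println(b1 * 10 + afterDot - TWO_BYTE_TO_INT); // 34, i.e. 3.4 in tenths

        // "34.5": b1 = '3', b2 = '4', b4 = '5'
        byte c1 = '3', c2 = '4', c4 = '5';
        System.out.println(c1 * 100 + c2 * 10 + c4 - THREE_BYTE_TO_INT); // 345
    }
}
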
@@ -92,11 +92,15 @@ public class CalculateAverage_albertoventurini {
                 else { // value is nn.n
                     reading = (b1 * 100 + b2 * 10 + b4 - THREE_BYTE_TO_INT);
                 }
-                cr.getNext(); // new line
             }
+            cr.cursor++; // new line
-            node.min = Math.min(node.min, reading);
-            node.max = Math.max(node.max, reading);
+            if (reading < node.min) {
+                node.min = reading;
+            }
+            if (reading > node.max) {
+                node.max = reading;
+            }
             node.sum += reading;
             node.count++;
         }
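
For reference, the TrieNode class is not touched by this commit and does not appear in the diff; the sketch below is only a guess at a compatible shape, inferred from the fields used above.

// Hypothetical sketch of a TrieNode compatible with the usage above; the real class may differ.
final class TrieNode {
    final TrieNode[] children = new TrieNode[256]; // one slot per possible name byte (b & 0xFF)
    int min = Integer.MAX_VALUE; // running minimum reading, in tenths of a degree
    int max = Integer.MIN_VALUE; // running maximum reading
    long sum; // running sum of readings
    long count; // number of readings aggregated at this node
}
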
@@ -165,27 +169,41 @@ public class CalculateAverage_albertoventurini {
                     bytes[index] = (byte) i;
                     printResultsRec(childNodes, bytes, index + 1);
                 }
             }
         }
     }
     private static final String FILE = "./measurements.txt";
+    /**
+     * Read a chunk of a {@link RandomAccessFile} file.
+     * Internally, the chunk is further subdivided into "sub-chunks" (byte arrays).
+     */
     private static final class ChunkReader {
-        // Byte arrays of size 2^22 seem to have the best performance on my machine.
-        private static final int BYTE_ARRAY_SIZE = 1 << 22;
+        // Byte arrays of size 2^20 seem to have the best performance on my machine.
+        private static final int BYTE_ARRAY_SIZE = 1 << 20;
         private final byte[] bytes;
         private final RandomAccessFile file;
+        // The initial position of this chunk.
         private final long chunkBegin;
+        // The length of this chunk.
         private final long chunkLength;
-        private int readBytes = 0;
-        private int cursor = 0;
+        // The beginning of the current "sub-chunk", relative to the initial position of the chunk.
         private long offset = 0;
+        // The size of the current "sub-chunk".
+        private int subChunkSize = 0;
+        // The current position within the current "sub-chunk".
+        private int cursor = 0;
+        // The maximum size of a row
+        private static final int MAX_ROW_SIZE_BYTES = 107;
         ChunkReader(
                 final RandomAccessFile file,
                 final long chunkBegin,
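
For reference, 107 looks like the worst-case row length under the challenge's input constraints (station names of at most 100 bytes, readings between -99.9 and 99.9); the sketch below only spells out that assumption.

// Hypothetical derivation of the 107-byte bound; the constraints are assumed from the challenge rules.
public class MaxRowSizeSketch {
    public static void main(String[] args) {
        int maxNameBytes = 100; // longest station name, in bytes
        int maxValueBytes = "-99.9".length(); // 5
        int maxRowBytes = maxNameBytes + 1 /* ';' */ + maxValueBytes + 1 /* '\n' */;
        System.out.println(maxRowBytes); // 107
    }
}
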
@@ -197,32 +215,43 @@ public class CalculateAverage_albertoventurini {
             int byteArraySize = chunkLength < BYTE_ARRAY_SIZE ? (int) chunkLength : BYTE_ARRAY_SIZE;
             this.bytes = new byte[byteArraySize];
-            readNextBytes();
+            readSubChunk();
         }
-        boolean hasNext() {
-            return (offset + cursor) < chunkLength;
+        // Return true if this ChunkReader has more bytes available, false otherwise.
+        // If this ChunkReader needs to read a new "sub-chunk", it does so in this method.
+        boolean ensureHasMoreRows() {
+            if (cursor >= subChunkSize) {
+                offset += cursor;
+                if (offset >= chunkLength) {
+                    return false;
+                }
+                readSubChunk();
+            }
+            return true;
         }
         byte getNext() {
-            if (cursor >= readBytes) {
-                readNextBytes();
-            }
             return bytes[cursor++];
         }
-        private void readNextBytes() {
+        private void readSubChunk() {
             try {
-                offset += readBytes;
                 synchronized (file) {
                     file.seek(chunkBegin + offset);
-                    readBytes = file.read(bytes);
+                    subChunkSize = file.read(bytes);
                 }
-                cursor = 0;
             }
             catch (IOException e) {
                 throw new RuntimeException(e);
             }
+            // Always "pretend" that we've read a few bytes less,
+            // so that we don't stop in the middle of reading a row
+            subChunkSize -= MAX_ROW_SIZE_BYTES;
+            cursor = 0;
         }
     }
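
For reference, the safety margin works because processChunk() calls ensureHasMoreRows() only at row boundaries and then uses getNext() without bounds checks for the rest of the row; since subChunkSize is advertised as MAX_ROW_SIZE_BYTES smaller than what was actually read, a row that starts inside the advertised range always ends inside the buffer. A minimal sketch of that invariant, with hypothetical sizes:

// Sketch of the sub-chunk safety margin; sizes here are hypothetical.
public class SubChunkInvariantSketch {
    public static void main(String[] args) {
        int bytesRead = 1 << 20; // bytes actually read into the array
        int maxRowSize = 107; // longest possible row
        int advertised = bytesRead - maxRowSize; // what ensureHasMoreRows() sees as subChunkSize
        int worstRowStart = advertised - 1; // a row may begin just before the advertised end
        // Even the longest row starting there still ends inside the bytes actually read:
        System.out.println(worstRowStart + maxRowSize <= bytesRead); // true
    }
}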