Second version by albertoventurini (#609)
* Contribution by albertoventurini * Use byte arrays of size 2^20 --------- Co-authored-by: Alberto Venturini <alberto.venturini@accso.de>
This commit is contained in:
parent
3e208be741
commit
936fc1da54
@ -15,5 +15,5 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
#
|
#
|
||||||
|
|
||||||
JAVA_OPTS="-server -Xnoclassgc"
|
JAVA_OPTS="-Xnoclassgc"
|
||||||
java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_albertoventurini
|
java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_albertoventurini
|
||||||
|
@ -58,31 +58,31 @@ public class CalculateAverage_albertoventurini {
|
|||||||
|
|
||||||
// Process a chunk and write results in a Trie rooted at 'root'.
|
// Process a chunk and write results in a Trie rooted at 'root'.
|
||||||
private static void processChunk(final TrieNode root, final ChunkReader cr) {
|
private static void processChunk(final TrieNode root, final ChunkReader cr) {
|
||||||
while (cr.hasNext()) {
|
while (cr.ensureHasMoreRows()) {
|
||||||
TrieNode node = root;
|
TrieNode node = root;
|
||||||
|
|
||||||
// Process the location name navigating through the trie
|
// Process the location name navigating through the trie
|
||||||
int b = cr.getNext() & 0xFF;
|
int b = cr.getNext();
|
||||||
while (b != ';') {
|
do {
|
||||||
|
b &= 0xFF;
|
||||||
if (node.children[b] == null) {
|
if (node.children[b] == null) {
|
||||||
node.children[b] = new TrieNode();
|
node.children[b] = new TrieNode();
|
||||||
}
|
}
|
||||||
node = node.children[b];
|
node = node.children[b];
|
||||||
b = cr.getNext() & 0xFF;
|
b = cr.getNext();
|
||||||
}
|
} while (b != ';');
|
||||||
|
|
||||||
// Process the reading value (temperature)
|
// Process the reading value (temperature)
|
||||||
int reading;
|
final int reading;
|
||||||
|
|
||||||
byte b1 = cr.getNext();
|
final byte b1 = cr.getNext();
|
||||||
byte b2 = cr.getNext();
|
final byte b2 = cr.getNext();
|
||||||
byte b3 = cr.getNext();
|
|
||||||
byte b4 = cr.getNext();
|
|
||||||
if (b2 == '.') { // value is n.n
|
if (b2 == '.') { // value is n.n
|
||||||
reading = (b1 * 10 + b3 - TWO_BYTE_TO_INT);
|
reading = (b1 * 10 + cr.getNext() - TWO_BYTE_TO_INT);
|
||||||
// b4 == \n
|
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
|
final byte b3 = cr.getNext();
|
||||||
|
final byte b4 = cr.getNext();
|
||||||
if (b4 == '.') { // value is -nn.n
|
if (b4 == '.') { // value is -nn.n
|
||||||
reading = -(b2 * 100 + b3 * 10 + cr.getNext() - THREE_BYTE_TO_INT);
|
reading = -(b2 * 100 + b3 * 10 + cr.getNext() - THREE_BYTE_TO_INT);
|
||||||
}
|
}
|
||||||
@ -92,11 +92,15 @@ public class CalculateAverage_albertoventurini {
|
|||||||
else { // value is nn.n
|
else { // value is nn.n
|
||||||
reading = (b1 * 100 + b2 * 10 + b4 - THREE_BYTE_TO_INT);
|
reading = (b1 * 100 + b2 * 10 + b4 - THREE_BYTE_TO_INT);
|
||||||
}
|
}
|
||||||
cr.getNext(); // new line
|
|
||||||
}
|
}
|
||||||
|
cr.cursor++; // new line
|
||||||
|
|
||||||
node.min = Math.min(node.min, reading);
|
if (reading < node.min) {
|
||||||
node.max = Math.max(node.max, reading);
|
node.min = reading;
|
||||||
|
}
|
||||||
|
if (reading > node.max) {
|
||||||
|
node.max = reading;
|
||||||
|
}
|
||||||
node.sum += reading;
|
node.sum += reading;
|
||||||
node.count++;
|
node.count++;
|
||||||
}
|
}
|
||||||
@ -165,27 +169,41 @@ public class CalculateAverage_albertoventurini {
|
|||||||
bytes[index] = (byte) i;
|
bytes[index] = (byte) i;
|
||||||
printResultsRec(childNodes, bytes, index + 1);
|
printResultsRec(childNodes, bytes, index + 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static final String FILE = "./measurements.txt";
|
private static final String FILE = "./measurements.txt";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Read a chunk of a {@link RandomAccessFile} file.
|
||||||
|
* Internally, the chunk is further subdivided into "sub-chunks" (byte arrays).
|
||||||
|
*/
|
||||||
private static final class ChunkReader {
|
private static final class ChunkReader {
|
||||||
// Byte arrays of size 2^22 seem to have the best performance on my machine.
|
// Byte arrays of size 2^20 seem to have the best performance on my machine.
|
||||||
private static final int BYTE_ARRAY_SIZE = 1 << 22;
|
private static final int BYTE_ARRAY_SIZE = 1 << 20;
|
||||||
private final byte[] bytes;
|
private final byte[] bytes;
|
||||||
|
|
||||||
private final RandomAccessFile file;
|
private final RandomAccessFile file;
|
||||||
|
|
||||||
|
// The initial position of this chunk.
|
||||||
private final long chunkBegin;
|
private final long chunkBegin;
|
||||||
|
|
||||||
|
// The length of this chunk.
|
||||||
private final long chunkLength;
|
private final long chunkLength;
|
||||||
|
|
||||||
private int readBytes = 0;
|
// The beginning of the current "sub-chunk", relative to the initial position of the chunk.
|
||||||
|
|
||||||
private int cursor = 0;
|
|
||||||
private long offset = 0;
|
private long offset = 0;
|
||||||
|
|
||||||
|
// The size of the current "sub-chunk".
|
||||||
|
private int subChunkSize = 0;
|
||||||
|
|
||||||
|
// The current position within the current "sub-chunk".
|
||||||
|
private int cursor = 0;
|
||||||
|
|
||||||
|
// The maximum size of a row
|
||||||
|
private static final int MAX_ROW_SIZE_BYTES = 107;
|
||||||
|
|
||||||
ChunkReader(
|
ChunkReader(
|
||||||
final RandomAccessFile file,
|
final RandomAccessFile file,
|
||||||
final long chunkBegin,
|
final long chunkBegin,
|
||||||
@ -197,32 +215,43 @@ public class CalculateAverage_albertoventurini {
|
|||||||
int byteArraySize = chunkLength < BYTE_ARRAY_SIZE ? (int) chunkLength : BYTE_ARRAY_SIZE;
|
int byteArraySize = chunkLength < BYTE_ARRAY_SIZE ? (int) chunkLength : BYTE_ARRAY_SIZE;
|
||||||
this.bytes = new byte[byteArraySize];
|
this.bytes = new byte[byteArraySize];
|
||||||
|
|
||||||
readNextBytes();
|
readSubChunk();
|
||||||
}
|
}
|
||||||
|
|
||||||
boolean hasNext() {
|
// Return true if this ChunkReader has more bytes available, false otherwise.
|
||||||
return (offset + cursor) < chunkLength;
|
// If this ChunkReader needs to read a new "sub-chunk", it does so in this method.
|
||||||
|
boolean ensureHasMoreRows() {
|
||||||
|
if (cursor >= subChunkSize) {
|
||||||
|
offset += cursor;
|
||||||
|
if (offset >= chunkLength) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
readSubChunk();
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
byte getNext() {
|
byte getNext() {
|
||||||
if (cursor >= readBytes) {
|
|
||||||
readNextBytes();
|
|
||||||
}
|
|
||||||
return bytes[cursor++];
|
return bytes[cursor++];
|
||||||
}
|
}
|
||||||
|
|
||||||
private void readNextBytes() {
|
private void readSubChunk() {
|
||||||
try {
|
try {
|
||||||
offset += readBytes;
|
|
||||||
synchronized (file) {
|
synchronized (file) {
|
||||||
file.seek(chunkBegin + offset);
|
file.seek(chunkBegin + offset);
|
||||||
readBytes = file.read(bytes);
|
subChunkSize = file.read(bytes);
|
||||||
}
|
}
|
||||||
cursor = 0;
|
|
||||||
}
|
}
|
||||||
catch (IOException e) {
|
catch (IOException e) {
|
||||||
throw new RuntimeException(e);
|
throw new RuntimeException(e);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Always "pretend" that we've read a few bytes less,
|
||||||
|
// so that we don't stop in the middle of reading a row
|
||||||
|
subChunkSize -= MAX_ROW_SIZE_BYTES;
|
||||||
|
|
||||||
|
cursor = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user