From e8b2d2d7b4caf114b12fa0386e35f741f31df905 Mon Sep 17 00:00:00 2001 From: Yavuz Tas <12643010+yavuztas@users.noreply.github.com> Date: Sat, 6 Jan 2024 19:09:16 +0100 Subject: [PATCH] Improvement in CalculateAverage_yavuztas (#162) * improve double reading by eleminating string parsing in between, make calculations over on integer instead of double, parse into double at the end only once * more improvements, sharing a single StringBuilder to build all toStrings, minor performance gain. * micro optimizations on reading temperature * a small skip for redundant traverses, micro optmization * micro optimization, eleminate some if cases, saves 0.5 seconds more * micro optimization, calculate key hash ahead eleminates more more loop, saves 0.5 seconds more :) * optimize key equals and handling the case when a region is larger than max integer size --------- Co-authored-by: Yavuz Tas --- .../onebrc/CalculateAverage_yavuztas.java | 137 +++++++++++------- 1 file changed, 82 insertions(+), 55 deletions(-) diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_yavuztas.java b/src/main/java/dev/morling/onebrc/CalculateAverage_yavuztas.java index bef902e..eb3d191 100644 --- a/src/main/java/dev/morling/onebrc/CalculateAverage_yavuztas.java +++ b/src/main/java/dev/morling/onebrc/CalculateAverage_yavuztas.java @@ -37,19 +37,29 @@ public class CalculateAverage_yavuztas { private static final Path FILE = Path.of("./measurements.txt"); static class Measurement { - private double min; - private double max; - private double sum; - private int count = 1; - public Measurement(double initial) { + // Only accessed by a single thread, so it is safe to share + private static final StringBuilder STRING_BUILDER = new StringBuilder(14); + + private int min; // calculations over int is faster than double, we convert to double in the end only once + private int max; + private long sum; + private long count = 1; + + public Measurement(int initial) { this.min = initial; this.max = initial; this.sum = initial; } public String toString() { - return round(this.min) + "/" + round(this.sum / this.count) + "/" + round(this.max); + STRING_BUILDER.setLength(0); // clear the builder to reuse + STRING_BUILDER.append(this.min / 10.0); // convert to double while generating the string output + STRING_BUILDER.append("/"); + STRING_BUILDER.append(round((this.sum / 10.0) / this.count)); + STRING_BUILDER.append("/"); + STRING_BUILDER.append(this.max / 10.0); + return STRING_BUILDER.toString(); } private double round(double value) { @@ -59,24 +69,23 @@ public class CalculateAverage_yavuztas { static class KeyBuffer { - ByteBuffer value; + ByteBuffer buffer; + int length; int hash; - public KeyBuffer(ByteBuffer buffer) { - this.value = buffer; - this.hash = buffer.hashCode(); + public KeyBuffer(ByteBuffer buffer, int length, int hash) { + this.buffer = buffer; + this.length = length; + this.hash = hash; } @Override public boolean equals(Object o) { - if (this == o) - return true; - final KeyBuffer keyBuffer = (KeyBuffer) o; - if (o == null || getClass() != o.getClass() || this.hash != keyBuffer.hash) + if (this.length != keyBuffer.length || this.hash != keyBuffer.hash) return false; - return this.value.equals(keyBuffer.value); + return this.buffer.equals(keyBuffer.buffer); } @Override @@ -86,20 +95,14 @@ public class CalculateAverage_yavuztas { @Override public String toString() { - final int limit = this.value.limit(); - final byte[] bytes = new byte[limit]; - this.value.get(bytes); - return new String(bytes, 0, limit, StandardCharsets.UTF_8); + final byte[] bytes = new byte[this.length]; + this.buffer.get(bytes); + return new String(bytes, 0, this.length, StandardCharsets.UTF_8); } } static class FixedRegionDataAccessor { - static final byte SEMI_COLON = 59; // ';' - static final byte LINE_BREAK = 10; // '\n' - - final byte[] workBuffer = new byte[256]; // assuming max 256 bytes for a row is enough - long startPos; long size; ByteBuffer buffer; @@ -111,30 +114,35 @@ public class CalculateAverage_yavuztas { this.buffer = buffer; } - void traverse(BiConsumer consumer) { - - int semiColonPos = 0; - int lineBreakPos = 0; + void traverse(BiConsumer consumer) { + int keyHash; + int length; while (this.buffer.hasRemaining()) { - while ((this.workBuffer[0] = this.buffer.get()) != LINE_BREAK) { - if (this.workBuffer[0] == SEMI_COLON) { // save semicolon pos - semiColonPos = this.buffer.position(); // semicolon exclusive - } + this.position = this.buffer.position(); // save line start pos + + byte b; + keyHash = 0; + length = 0; + while ((b = this.buffer.get()) != ';') { // read until semicolon + keyHash = 31 * keyHash + b; // calculate key hash ahead, eleminates one more loop later + length++; } - // found linebreak - lineBreakPos = this.buffer.position(); - this.buffer.position(this.position); // set back to line start - final int length1 = semiColonPos - this.position; // station length - final int length2 = lineBreakPos - semiColonPos; // temperature length + final ByteBuffer station = this.buffer.slice(this.position, length); + final KeyBuffer key = new KeyBuffer(station, length, keyHash); - final ByteBuffer station = getRef(length1); // read station - final String temperature = readString(length2); // read temperature + this.buffer.mark(); // semicolon pos + skip(3); // skip more since minimum temperature length is 3 + length = 4; // +1 for semicolon - this.position = lineBreakPos; // skip to line end + while (this.buffer.get() != '\n') { + length++; // read until linebreak + // TODO how to read temperature here + } - consumer.accept(new KeyBuffer(station), Double.parseDouble(temperature)); + this.buffer.reset(); // set to after semicolon + consumer.accept(key, readTemperature(length)); } } @@ -157,21 +165,40 @@ public class CalculateAverage_yavuztas { return initial; } - String readString(int length) { - this.buffer.get(this.workBuffer, 0, length); - return new String(this.workBuffer, 0, length - 1, // strip the last char - StandardCharsets.UTF_8); + // caching Math.pow calculation improves a lot! + // interestingly, instance field access is much faster than static field access + final int[] powerOfTenCache = new int[]{ 1, 10, 100 }; + + int readTemperature(int length) { + int temp = 0; + final byte b1 = this.buffer.get(); // get first byte + + int digits = length - 4; // digit position + final boolean negative = b1 == '-'; + if (!negative) { + temp += this.powerOfTenCache[digits + 1] * (b1 - 48); // add first digit ahead + } + + byte b; + while ((b = this.buffer.get()) != '.') { // read until dot + temp += this.powerOfTenCache[digits--] * (b - 48); + } + b = this.buffer.get(); // read after dot, only one digit no loop + temp += this.powerOfTenCache[digits] * (b - 48); + this.buffer.get(); // skip line break + + return (negative) ? -temp : temp; } - ByteBuffer getRef(int length) { + ByteBuffer getKeyRef(int length) { final ByteBuffer slice = this.buffer.slice().limit(length - 1); - skip(this.buffer, length); + skip(length); return slice; } - static void skip(ByteBuffer buffer, int length) { - final int pos = buffer.position(); - buffer.position(pos + length); + void skip(int length) { + final int pos = this.buffer.position(); + this.buffer.position(pos + length); } } @@ -187,11 +214,11 @@ public class CalculateAverage_yavuztas { final long fileSize = Files.size(path); long regionSize = fileSize / concurrency; - if (regionSize > Integer.MAX_VALUE) { - // TODO multiply concurrency and try again - throw new IllegalArgumentException("Bigger than integer!"); - } // handling extreme cases + while (regionSize > Integer.MAX_VALUE) { + concurrency *= 2; + regionSize = fileSize / concurrency; + } if (regionSize <= 256) { // small file, no need concurrency concurrency = 1; regionSize = fileSize; @@ -251,7 +278,7 @@ public class CalculateAverage_yavuztas { private static int findClosestLineEnd(int regionSize, ByteBuffer buffer) { int position = regionSize; int left = regionSize; - while (buffer.get(position) != FixedRegionDataAccessor.LINE_BREAK) { + while (buffer.get(position) != '\n') { position = --left; } return position;