Improving first iteration by avoiding string creation as much as possible (#516)

- Avoids creating unnecessary String objects by handling station names through their djb2 hashes instead
- Initializes hashmaps with capacity and load factor
- Adds -XX:+AlwaysPreTouch
This commit is contained in:
adri 2024-01-20 21:27:34 +01:00 committed by GitHub
parent 36ffed1315
commit 0a7726cc64
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 55 additions and 33 deletions

View File

@ -16,6 +16,6 @@
#
JAVA_OPTS="-XX:+UseStringDeduplication -XX:+UnlockExperimentalVMOptions -XX:+UseEpsilonGC"
JAVA_OPTS="-XX:+UseStringDeduplication -XX:+UnlockExperimentalVMOptions -XX:+UseEpsilonGC -XX:+AlwaysPreTouch"
java --enable-preview -classpath target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_adriacabeza

View File

@ -52,3 +52,4 @@ gnmathur;Gaurav Mathur
vemana;Subrahmanyam
jincongho;Jin Cong Ho
yonatang;Yonatan Graber
adriacabeza;Adrià Cabeza

View File

@ -23,9 +23,13 @@ import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.stream.Collectors;
/**
@ -35,11 +39,22 @@ public class CalculateAverage_adriacabeza {
private static final Path FILE_PATH = Paths.get("./measurements.txt");
public static final int CITY_NAME_MAX_CHARACTERS = 128;
private static final int N_PROCESSORS = Runtime.getRuntime().availableProcessors();
private static final int DJB2_INIT = 5381;
private static final Map<Integer, String> cityMap = new ConcurrentHashMap<>(10_000, 1, N_PROCESSORS);
/**
* Represents a result containing a HashMap with the station-name hash as key and StationData as value.
*/
private static class Result {
/**
 * Stores a freshly created {@code StationData}, built from {@code value},
 * under the given station hash. Any entry already stored for that hash is
 * replaced.
 *
 * @param hash  integer key identifying the station
 * @param value measurement passed to the {@code StationData} constructor
 */
public void addStation(int hash, int value) {
    final StationData initialData = new StationData(value);
    resultMap.put(hash, initialData);
}
/**
 * Looks up the data stored for a station hash.
 *
 * @param hash integer key identifying the station
 * @return the stored {@code StationData}, or {@code null} when the map
 *         holds no entry for {@code hash}
 */
public StationData getData(int hash) {
    final StationData stored = resultMap.get(hash);
    return stored;
}
private static class StationData {
private int min, sum, count, max;
@ -63,28 +78,16 @@ public class CalculateAverage_adriacabeza {
}
private final Map<String, StationData> resultMap;
private final Map<Integer, StationData> resultMap;
public Result() {
this.resultMap = new HashMap<>();
this.resultMap = new HashMap<>(10_000, 1);
}
public Map<String, StationData> getResultMap() {
public Map<Integer, StationData> getResultMap() {
return resultMap;
}
/**
 * Records one measurement for a city: creates a new {@code StationData}
 * from {@code value} when the city has no entry yet, otherwise updates the
 * existing entry in place.
 *
 * @param city  map key for the station
 * @param value measurement to record
 */
public void addMeasurement(String city, int value) {
    resultMap.compute(city, (_, current) -> {
        if (current != null) {
            // Known city: fold the value into the existing entry and keep it mapped.
            current.update(value);
            return current;
        }
        return new StationData(value);
    });
}
public void merge(Result other) {
other.getResultMap().forEach((city, resultRow) -> resultMap.merge(city, resultRow, (existing, incoming) -> {
existing.min = Math.min(existing.min, incoming.min);
@ -96,9 +99,9 @@ public class CalculateAverage_adriacabeza {
}
public String toString() {
return this.resultMap.entrySet().stream()
.sorted(Map.Entry.comparingByKey())
.map(entry -> "%s=%s".formatted(entry.getKey(), entry.getValue()))
return this.resultMap.entrySet().parallelStream()
.map(entry -> "%s=%s".formatted(cityMap.get(entry.getKey()), entry.getValue()))
.sorted(Comparator.comparing(s -> s.split("=")[0]))
.collect(Collectors.joining(", ", "{", "}"));
}
}
@ -155,6 +158,21 @@ public class CalculateAverage_adriacabeza {
}
}
/**
 * Reads a decimal number from the buffer's current position and returns it
 * as an integer with the decimal point dropped (e.g. the bytes {@code -4.5}
 * yield {@code -45}).
 *
 * <p>Bytes are consumed until a {@code '\n'} has been read (the newline
 * itself is consumed) or the buffer position reaches {@code limit}.
 * A {@code '-'} byte makes the result negative, a {@code '.'} byte is
 * skipped, and every other byte is treated as an ASCII digit without
 * validation.
 *
 * @param buffer buffer to read from; its position is advanced
 * @param limit  position at which reading stops if no newline is found
 * @return the signed number with its decimal point removed
 */
private static int readNumberFromBuffer(ByteBuffer buffer, int limit) {
    int magnitude = 0;
    boolean negative = false;
    while (buffer.position() < limit) {
        final byte current = buffer.get();
        if (current == '\n') {
            // End of record: the newline has already been consumed.
            break;
        }
        if (current == '-') {
            negative = true;
        }
        else if (current != '.') {
            magnitude = magnitude * 10 + (current - '0');
        }
    }
    return negative ? -magnitude : magnitude;
}
/**
* Calculates average measurements from the file.
*
@ -167,28 +185,31 @@ public class CalculateAverage_adriacabeza {
Result partialResult = new Result();
var limit = buffer.limit();
var field = new byte[CITY_NAME_MAX_CHARACTERS];
Set<Integer> seenHashes = new HashSet<>(10_000, 1);
while (buffer.position() < limit) {
var fieldCurrentIndex = 0;
field[fieldCurrentIndex++] = buffer.get();
var fieldByte = buffer.get();
field[fieldCurrentIndex++] = fieldByte;
// implement djb2 hash: https://theartincode.stanis.me/008-djb2/
int hash = DJB2_INIT;
while (buffer.position() < limit) {
var fieldByte = buffer.get();
// hash = hash * 33 + fieldByte
hash = (((hash << 5) + hash) + fieldByte);
fieldByte = buffer.get();
if (fieldByte == ';')
break;
field[fieldCurrentIndex++] = fieldByte;
}
var fieldStr = new String(field, 0, fieldCurrentIndex);
var number = 0;
var sign = 1;
while (buffer.position() < limit) {
var numberByte = buffer.get();
if (numberByte == '-')
sign = -1;
else if (numberByte == '\n')
break;
else if (numberByte != '.')
number = number * 10 + (numberByte - '0');
var number = readNumberFromBuffer(buffer, limit);
if (!seenHashes.contains(hash)) {
seenHashes.add(hash);
cityMap.put(hash, new String(field, 0, fieldCurrentIndex));
partialResult.addStation(hash, number);
}
else {
partialResult.getData(hash).update(number);
}
partialResult.addMeasurement(fieldStr, sign * number);
}
return partialResult;
}).reduce(new Result(), (partialResult1, partialResult2) -> {