From 0a7726cc643dc4068884af37a1280b5094454e25 Mon Sep 17 00:00:00 2001 From: adri Date: Sat, 20 Jan 2024 21:27:34 +0100 Subject: [PATCH] Improving first iteration by avoiding string creation as much as possible (#516) - It avoids creating unnecessary Strings objects and handles with the station names with its djb2 hashes instead - Initializes hashmaps with capacity and load factor - Adds -XX:+AlwaysPreTouch --- calculate_average_adriacabeza.sh | 2 +- github_users.txt | 1 + .../onebrc/CalculateAverage_adriacabeza.java | 85 ++++++++++++------- 3 files changed, 55 insertions(+), 33 deletions(-) diff --git a/calculate_average_adriacabeza.sh b/calculate_average_adriacabeza.sh index e2c6557..7d4be43 100755 --- a/calculate_average_adriacabeza.sh +++ b/calculate_average_adriacabeza.sh @@ -16,6 +16,6 @@ # -JAVA_OPTS="-XX:+UseStringDeduplication -XX:+UnlockExperimentalVMOptions -XX:+UseEpsilonGC" +JAVA_OPTS="-XX:+UseStringDeduplication -XX:+UnlockExperimentalVMOptions -XX:+UseEpsilonGC -XX:+AlwaysPreTouch" java --enable-preview -classpath target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_adriacabeza diff --git a/github_users.txt b/github_users.txt index bd47267..1e640d8 100644 --- a/github_users.txt +++ b/github_users.txt @@ -52,3 +52,4 @@ gnmathur;Gaurav Mathur vemana;Subrahmanyam jincongho;Jin Cong Ho yonatang;Yonatan Graber +adriacabeza;AdriĆ  Cabeza diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_adriacabeza.java b/src/main/java/dev/morling/onebrc/CalculateAverage_adriacabeza.java index a1a6953..99936b2 100644 --- a/src/main/java/dev/morling/onebrc/CalculateAverage_adriacabeza.java +++ b/src/main/java/dev/morling/onebrc/CalculateAverage_adriacabeza.java @@ -23,9 +23,13 @@ import java.nio.file.Path; import java.nio.file.Paths; import java.nio.file.StandardOpenOption; import java.util.ArrayList; +import java.util.Comparator; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; import java.util.stream.Collectors; /** @@ -35,11 +39,22 @@ public class CalculateAverage_adriacabeza { private static final Path FILE_PATH = Paths.get("./measurements.txt"); public static final int CITY_NAME_MAX_CHARACTERS = 128; + private static final int N_PROCESSORS = Runtime.getRuntime().availableProcessors(); + private static final int DJB2_INIT = 5381; + private static final Map cityMap = new ConcurrentHashMap<>(10_000, 1, N_PROCESSORS); /** * Represents result containing a HashMap with city as key and ResultRow as value. */ private static class Result { + public void addStation(int hash, int value) { + resultMap.put(hash, new StationData(value)); + } + + public StationData getData(int hash) { + return resultMap.get(hash); + } + private static class StationData { private int min, sum, count, max; @@ -63,28 +78,16 @@ public class CalculateAverage_adriacabeza { } - private final Map resultMap; + private final Map resultMap; public Result() { - this.resultMap = new HashMap<>(); + this.resultMap = new HashMap<>(10_000, 1); } - public Map getResultMap() { + public Map getResultMap() { return resultMap; } - public void addMeasurement(String city, int value) { - resultMap.compute(city, (_, resultRow) -> { - if (resultRow == null) { - return new StationData(value); - } - else { - resultRow.update(value); - return resultRow; - } - }); - } - public void merge(Result other) { other.getResultMap().forEach((city, resultRow) -> resultMap.merge(city, resultRow, (existing, incoming) -> { existing.min = Math.min(existing.min, incoming.min); @@ -96,9 +99,9 @@ public class CalculateAverage_adriacabeza { } public String toString() { - return this.resultMap.entrySet().stream() - .sorted(Map.Entry.comparingByKey()) - .map(entry -> "%s=%s".formatted(entry.getKey(), entry.getValue())) + return this.resultMap.entrySet().parallelStream() + .map(entry -> "%s=%s".formatted(cityMap.get(entry.getKey()), entry.getValue())) + .sorted(Comparator.comparing(s -> s.split("=")[0])) .collect(Collectors.joining(", ", "{", "}")); } } @@ -155,6 +158,21 @@ public class CalculateAverage_adriacabeza { } } + private static int readNumberFromBuffer(ByteBuffer buffer, int limit) { + var number = 0; + var sign = 1; + while (buffer.position() < limit) { + var numberByte = buffer.get(); + if (numberByte == '-') + sign = -1; + else if (numberByte == '\n') + break; + else if (numberByte != '.') + number = number * 10 + (numberByte - '0'); + } + return sign * number; + } + /** * Calculates average measurements from the file. * @@ -167,28 +185,31 @@ public class CalculateAverage_adriacabeza { Result partialResult = new Result(); var limit = buffer.limit(); var field = new byte[CITY_NAME_MAX_CHARACTERS]; + Set seenHashes = new HashSet<>(10_000, 1); while (buffer.position() < limit) { var fieldCurrentIndex = 0; - field[fieldCurrentIndex++] = buffer.get(); + var fieldByte = buffer.get(); + field[fieldCurrentIndex++] = fieldByte; + // implement djb2 hash: https://theartincode.stanis.me/008-djb2/ + int hash = DJB2_INIT; while (buffer.position() < limit) { - var fieldByte = buffer.get(); + // hash = hash * 33 + fieldByte + hash = (((hash << 5) + hash) + fieldByte); + fieldByte = buffer.get(); if (fieldByte == ';') break; field[fieldCurrentIndex++] = fieldByte; } - var fieldStr = new String(field, 0, fieldCurrentIndex); - var number = 0; - var sign = 1; - while (buffer.position() < limit) { - var numberByte = buffer.get(); - if (numberByte == '-') - sign = -1; - else if (numberByte == '\n') - break; - else if (numberByte != '.') - number = number * 10 + (numberByte - '0'); + + var number = readNumberFromBuffer(buffer, limit); + if (!seenHashes.contains(hash)) { + seenHashes.add(hash); + cityMap.put(hash, new String(field, 0, fieldCurrentIndex)); + partialResult.addStation(hash, number); + } + else { + partialResult.getData(hash).update(number); } - partialResult.addMeasurement(fieldStr, sign * number); } return partialResult; }).reduce(new Result(), (partialResult1, partialResult2) -> {