diff --git a/calculate_average_filiphr.sh b/calculate_average_filiphr.sh index 56d4e37..6f63449 100755 --- a/calculate_average_filiphr.sh +++ b/calculate_average_filiphr.sh @@ -16,7 +16,7 @@ # -sdk use java 21.0.1-graal -java -version +source "$HOME/.sdkman/bin/sdkman-init.sh" +sdk use java 21.0.1-graal 1>&2 JAVA_OPTS="" time java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_filiphr diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_filiphr.java b/src/main/java/dev/morling/onebrc/CalculateAverage_filiphr.java index 7eabba7..c55871c 100644 --- a/src/main/java/dev/morling/onebrc/CalculateAverage_filiphr.java +++ b/src/main/java/dev/morling/onebrc/CalculateAverage_filiphr.java @@ -18,11 +18,11 @@ package dev.morling.onebrc; import java.io.IOException; import java.io.UncheckedIOException; import java.nio.ByteBuffer; -import java.nio.CharBuffer; import java.nio.MappedByteBuffer; import java.nio.channels.FileChannel; import java.nio.file.Paths; import java.nio.file.StandardOpenOption; +import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; @@ -40,8 +40,10 @@ import java.util.stream.StreamSupport; * Adding memory mapped files: 0m 55s (based on bjhara's submission) * Using big decimal and iterating the buffer once: 0m 20s * Using long parse: 0m 11s - * Using array hash code for city key: 0m 7.1s + * Using array hash code for city key: 0m 7.1s (this is invalid since it can lead to hash collisions) * Manually compute the value: 0m 6.8s + * Revert array hash code for city key: 0m 10s + * Use array hash and Arrays#equals for city key: 0m 7.2s *

* Using 21.0.1 Temurin with ShenandoahGC on Macbook (Intel) Pro * `sdk use java 21.0.1-tem` @@ -61,16 +63,11 @@ public class CalculateAverage_filiphr { private static final class Measurement { - private final String city; private long min = Long.MAX_VALUE; private long max = Long.MIN_VALUE; private long sum = 0L; private long count = 0L; - private Measurement(String city) { - this.city = city; - } - private void add(long value) { this.min = Math.min(this.min, value); this.max = Math.max(this.max, value); @@ -79,7 +76,7 @@ public class CalculateAverage_filiphr { } public static Measurement combine(Measurement m1, Measurement m2) { - Measurement measurement = new Measurement(m1.city); + Measurement measurement = new Measurement(); measurement.min = Math.min(m1.min, m2.min); measurement.max = Math.max(m1.max, m2.max); measurement.sum = m1.sum + m2.sum; @@ -100,7 +97,7 @@ public class CalculateAverage_filiphr { public static void main(String[] args) throws IOException { // long start = System.nanoTime(); - Map measurements; + Map measurements; try (FileChannel fileChannel = FileChannel.open(Paths.get(FILE), StandardOpenOption.READ)) { measurements = fineChannelStream(fileChannel) .parallel() @@ -109,24 +106,26 @@ public class CalculateAverage_filiphr { } Map finalMeasurements = new TreeMap<>(); - for (Measurement measurement : measurements.values()) { - finalMeasurements.put(measurement.city, measurement); + for (Map.Entry entry : measurements.entrySet()) { + StoredKey key = (StoredKey) entry.getKey(); + Measurement measurement = entry.getValue(); + finalMeasurements.put(new String(key.keyBytes), measurement); } System.out.println(finalMeasurements); // System.out.println("Done in " + (System.nanoTime() - start) / 1000000 + " ms"); } - private static Map mergeMaps(Map map1, Map map2) { + private static Map mergeMaps(Map map1, Map map2) { if (map1.isEmpty()) { return map2; } else { - Set cities = new HashSet<>(map1.keySet()); + Set cities = new HashSet<>(map1.keySet()); 
cities.addAll(map2.keySet()); - Map result = HashMap.newHashMap(cities.size()); + Map result = HashMap.newHashMap(cities.size()); - for (Integer city : cities) { + for (Key city : cities) { Measurement m1 = map1.get(city); Measurement m2 = map2.get(city); if (m2 == null) { @@ -153,8 +152,8 @@ public class CalculateAverage_filiphr { * We are using {@code Map} because creating the string key on every single line is obsolete. * Instead, we create a hash key from the string, and we use that as a key in the map. */ - private static Map parseBuffer(ByteBuffer bb) { - Map measurements = HashMap.newHashMap(415); + private static Map parseBuffer(ByteBuffer bb) { + Map measurements = HashMap.newHashMap(415); int limit = bb.limit(); byte[] cityBuffer = new byte[128]; @@ -163,16 +162,18 @@ public class CalculateAverage_filiphr { // Iterate through the byte buffer and fill the buffer until we find the separator (;) // While iterating we are also going to compute the city hash key - int cityKey = 1; + int cityHash = 1; while (bb.position() < limit) { byte positionByte = bb.get(); if (positionByte == ';') { break; } cityBuffer[cityBufferIndex++] = positionByte; - cityKey = 31 * cityKey + positionByte; + cityHash = 31 * cityHash + positionByte; } + SearchKey searchKey = new SearchKey(cityBuffer, cityHash, cityBufferIndex); + byte lastPositionByte = '\n'; boolean negative = false; long value = 0; @@ -198,11 +199,13 @@ public class CalculateAverage_filiphr { value = -value; } - Measurement measurement = measurements.get(cityKey); + Measurement measurement = measurements.get(searchKey); if (measurement == null) { - String city = new String(cityBuffer, 0, cityBufferIndex); - measurement = new Measurement(city); - measurements.put(cityKey, measurement); + byte[] keyBytes = new byte[cityBufferIndex]; + System.arraycopy(cityBuffer, 0, keyBytes, 0, cityBufferIndex); + StoredKey storedKey = new StoredKey(keyBytes, cityHash); + measurement = new Measurement(); + measurements.put(storedKey, 
measurement); } measurement.add(value); @@ -258,4 +261,86 @@ public class CalculateAverage_filiphr { } }; } + + /** + * This is a class that is used to reference a city key using its bytes only. + * It has the hash precomputed, and it is equal to a {@link SearchKey} when the key bytes are equal to the {@link SearchKey#buffer} up to the {@link SearchKey#limit}. + */ + private static final class StoredKey implements Key { + + private final byte[] keyBytes; + private final int hash; + + private StoredKey(byte[] keyBytes, int hash) { + this.keyBytes = keyBytes; + this.hash = hash; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null) { + return false; + } + if (o instanceof SearchKey key) { + return Arrays.equals(keyBytes, 0, keyBytes.length, key.buffer, 0, key.limit); + } + else if (o instanceof StoredKey key) { + return Arrays.equals(keyBytes, key.keyBytes); + } + return false; + } + + @Override + public int hashCode() { + return hash; + } + } + + /** + * A class that is used to look up a value in a map. + * This key is equal to a {@link StoredKey} when the first {@link #limit} bytes of the buffer have the same contents as the whole of {@link StoredKey#keyBytes}. + */ + private static final class SearchKey implements Key { + + private final byte[] buffer; + private final int hash; + private final int limit; + + private SearchKey(byte[] buffer, int hash, int limit) { + this.buffer = buffer; + this.hash = hash; + this.limit = limit; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null) { + return false; + } + + if (o instanceof StoredKey key) { + return Arrays.equals(buffer, 0, limit, key.keyBytes, 0, key.keyBytes.length); + } + else if (o instanceof SearchKey key) { + return Arrays.equals(buffer, 0, limit, key.buffer, 0, key.limit); + } + return false; + } + + @Override + public int hashCode() { + return hash; + } + } + + private interface Key { + + } + }