From c01668403a13d86dcd2305884198387ee1f073f0 Mon Sep 17 00:00:00 2001 From: Kidlike Date: Sat, 13 Jan 2024 22:22:36 +0100 Subject: [PATCH] submission for kidlike (#294) * first version * second version (0m59s) * third version (0m46s) * fourth version (0m39s) * fifth version (0m18s) * follow naming conventions from project structure * fix rounding (see /issues/49) * formatting changes from build * name should case-match github username * sixth version (14s) * seventh version (11s) * potential fix for other systems? * no need for sdk install * binary should go to ./target * building native-image only when not existing yet --- calculate_average_Kidlike.sh | 24 ++ prepare_Kidlike.sh | 24 ++ .../onebrc/CalculateAverage_Kidlike.java | 230 ++++++++++++++++++ 3 files changed, 278 insertions(+) create mode 100755 calculate_average_Kidlike.sh create mode 100755 prepare_Kidlike.sh create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_Kidlike.java diff --git a/calculate_average_Kidlike.sh b/calculate_average_Kidlike.sh new file mode 100755 index 0000000..83c69c1 --- /dev/null +++ b/calculate_average_Kidlike.sh @@ -0,0 +1,24 @@ +#!/bin/sh +# +# Copyright 2023 The original authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +if [ -f ./target/image_calculateaverage_Kidlike ]; then + ./target/image_calculateaverage_Kidlike +else + # -XX:+UseEpsilonGC + JAVA_OPTS="--enable-preview -Xms18g -Xmx18g -XX:+UnlockExperimentalVMOptions" + java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_Kidlike +fi diff --git a/prepare_Kidlike.sh b/prepare_Kidlike.sh new file mode 100755 index 0000000..d2aded3 --- /dev/null +++ b/prepare_Kidlike.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# +# Copyright 2023 The original authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +if [ ! -e target/image_calculateaverage_Kidlike ]; then + source "$HOME/.sdkman/bin/sdkman-init.sh" + sdk use java 21.0.1-graal + #NATIVE_IMAGE_OPTS="--gc=epsilon -O3 -march=native --enable-preview" + NATIVE_IMAGE_OPTS="-O3 -march=native --enable-preview" + native-image $NATIVE_IMAGE_OPTS -cp target/average-1.0.0-SNAPSHOT.jar -o target/image_calculateaverage_Kidlike dev.morling.onebrc.CalculateAverage_Kidlike +fi \ No newline at end of file diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_Kidlike.java b/src/main/java/dev/morling/onebrc/CalculateAverage_Kidlike.java new file mode 100644 index 0000000..903da31 --- /dev/null +++ b/src/main/java/dev/morling/onebrc/CalculateAverage_Kidlike.java @@ -0,0 +1,230 @@ +/* + * Copyright 2023 The original authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package dev.morling.onebrc; + +import static java.nio.charset.StandardCharsets.UTF_8; + +import java.io.File; +import java.io.IOException; +import java.io.RandomAccessFile; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel.MapMode; +import java.text.DecimalFormat; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; +import java.util.TreeMap; +import java.util.concurrent.Executors; +import java.util.stream.Collectors; + +/** + * Versions (correlate to git commits): + *
    + *
  1. 2m34s: parallel file read -> load byte chunks in memory -> sequentially process bytes for result
  2. + *
  3. 0m59s: process byte chunks in parallel (had to introduce smarter byte chunking so it splits only on newlines)
  4. + *
  5. 0m46s: smaller numeric types for MeasurementAggregator
  6. + *
  7. 0m39s: implement custom byte[] to int parsing, instead of Double.parseDouble(new String(bytes))
  8. + *
  9. 0m18s: run with GraalVM native-image
  10. + *
  11. 0m14s: remove ConcurrentHashMap
  12. + *
  13. 0m11s: remove Map#compute
  14. + *
+ * + *

+ * Hardware: + *

+ *

+ */ +public class CalculateAverage_Kidlike { + + public static void main(String[] args) { + File file = new File("./measurements.txt"); + long fileSize = file.length(); + int processors = (fileSize < 1_000_000) ? 1 : Runtime.getRuntime().availableProcessors(); + long chunkSize = fileSize / processors; + + MappedByteBuffer[] byteBuffers = new MappedByteBuffer[processors]; + + try (var executor = Executors.newVirtualThreadPerTaskExecutor()) { + for (int i = 0; i < processors; i++) { + long start = i * chunkSize; + long length = (i == processors - 1) ? fileSize - chunkSize * (processors - 1) : chunkSize; + int processor = i; + executor.execute(() -> { + long realStart = start; + try (RandomAccessFile raf = new RandomAccessFile(file, "r")) { + if (start != 0) { + raf.seek(start); + while (raf.readByte() != '\n') { + // move pointer to newline, so that we can process the byte chunks in parallel later. + } + realStart = raf.getFilePointer(); + } + long realLength = fileSize - realStart; + if (realStart + length < fileSize) { + raf.seek(realStart + length); + while (raf.readByte() != '\n') { + // move pointer to newline, so that we can process the byte chunks in parallel later. + } + realLength = raf.getFilePointer() - realStart; + } + + MappedByteBuffer byteBuffer = raf.getChannel().map(MapMode.READ_ONLY, realStart, realLength); + byteBuffer.load(); + byteBuffers[processor] = byteBuffer; + } + catch (IOException e) { + throw new RuntimeException(e); + } + }); + } + } + + System.out.println(new TreeMap(calculateMeasurements(byteBuffers))); + } + + private static Map calculateMeasurements(MappedByteBuffer[] buffers) { + return Arrays.stream(buffers).parallel() + .map(buffer -> { + var results = new HashMap(); + + var state = State.NEXT_READ_CITY; + var citySink = new CheapByteBuffer(100); + var measurementSink = new CheapByteBuffer(10); + + while (buffer.hasRemaining()) { + byte b = buffer.get(); + char c = (char) b; + + if (c == ';') { + state = State.NEXT_READ_MEASUREMENT; + continue; + } + + if (c == '\n') { + String city = new String(citySink.getBytes(), UTF_8); + int measurement = bytesToInt(measurementSink.getBytes()); + var entry = results.get(city); + if (entry == null) { + entry = new MeasurementAggregator(); + results.put(city, entry); + } + + entry.count++; + entry.sum += measurement; + entry.min = (short) Math.min(entry.min, measurement); + entry.max = (short) Math.max(entry.max, measurement); + + citySink.clear(); + measurementSink.clear(); + state = State.NEXT_READ_CITY; + continue; + } + + switch (state) { + case NEXT_READ_CITY -> citySink.append(b); + case NEXT_READ_MEASUREMENT -> measurementSink.append(b); + } + } + + return results; + }) + .flatMap(m -> m.entrySet().stream()) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, MeasurementAggregator::merge)); + } + + /** + * Removes decimal points and returns an integer. For example -12.3 would return -123 + */ + static int bytesToInt(byte[] bytes) { + short index = (short) (bytes.length - 1); + + boolean isNegative = (bytes[0] == '-'); + int number = (bytes[index] - '0'); + index -= 2; + + number += (10 * (bytes[index--] - '0')); + + if (index == 1 || (!isNegative && index == 0)) { + number += (100 * (bytes[index] - '0')); + } + + if (isNegative) { + return -number; + } + else { + return number; + } + } + + private enum State { + NEXT_READ_CITY, + NEXT_READ_MEASUREMENT + } + + /** + * Numbers are stored as integers, because of {@link #bytesToInt}, and then divided by 10.0 to restore their decimal point. + */ + private static class MeasurementAggregator { + + private static final DecimalFormat rounder = new DecimalFormat("0.0"); + short min = Short.MAX_VALUE; + short max = Short.MIN_VALUE; + long sum; + int count; + + private MeasurementAggregator merge(MeasurementAggregator other) { + min = (short) Math.min(min, other.min); + max = (short) Math.max(max, other.max); + sum += other.sum; + count += other.count; + + return this; + } + + @Override + public String toString() { + return rounder.format(min / 10.0) + + "/" + rounder.format(Math.round((double) sum / count) / 10.0) + + "/" + rounder.format(max / 10.0); + } + } + + private static class CheapByteBuffer { + + private final byte[] data; + private int length; + + public CheapByteBuffer(final int startSize) { + this.data = new byte[startSize]; + this.length = 0; + } + + public void append(final byte b) { + data[length++] = b; + } + + public void clear() { + this.length = 0; + } + + public byte[] getBytes() { + return Arrays.copyOf(this.data, this.length); + } + } +}