Adding filiphr's submission;

* Initial implementation using Shenandoah GC and parallel iteration

* Use memory mapped files

* Iterate the buffer once and use BigDecimal parsing instead of Double.parseDouble (see the sketch after this list)

* Add information about Graal

* Add sdk use to calculate script
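
For context on the first bullet: Shenandoah is enabled with the -XX:+UseShenandoahGC JVM flag; the final script below leaves JAVA_OPTS empty, so that flag presumably only applied to the earlier runs. The third bullet is the main parsing change: the temperature characters are collected into a reused CharBuffer and handed to the BigDecimal(char[], int, int) constructor, which skips the intermediate String that Double.parseDouble would need. A minimal, standalone sketch of that idea (illustrative only, not part of the submission):

    import java.math.BigDecimal;

    class ParseSketch {
        public static void main(String[] args) {
            // Characters of one temperature reading, e.g. "12.3", already sitting in a reused buffer
            char[] digits = { '1', '2', '.', '3' };

            // Conventional approach: materialize a String first, then parse it
            double viaString = Double.parseDouble(new String(digits, 0, 4));

            // Approach used in this submission: parse straight from the char[] slice
            double viaBigDecimal = new BigDecimal(digits, 0, 4).doubleValue();

            System.out.println(viaString + " == " + viaBigDecimal); // prints 12.3 == 12.3
        }
    }
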
Authored by Filip Hrisafov on 2024-01-03 20:32:16 +01:00, committed by GitHub
parent eebc23bd89
commit d57cf78faa
2 changed files with 257 additions and 0 deletions

calculate_average_filiphr.sh Executable file

@@ -0,0 +1,22 @@
#!/bin/sh
#
# Copyright 2023 The original authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
sdk use java 21.0.1-graal
java -version
JAVA_OPTS=""
time java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_filiphr

src/main/java/dev/morling/onebrc/CalculateAverage_filiphr.java

@@ -0,0 +1,235 @@
/*
* Copyright 2023 The original authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package dev.morling.onebrc;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.math.BigDecimal;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.TreeMap;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

/**
 * Initial submission: 1m 35s
 * Adding memory mapped files: 0m 55s (based on bjhara's submission)
 * Using BigDecimal and iterating the buffer once: 0m 20s
 * <p>
 * The timings above were taken with 21.0.1 Temurin and Shenandoah GC on a MacBook Pro (Intel):
 * `sdk use java 21.0.1-tem`
 * <p>
 * With Oracle GraalVM 21.0.1+12.1 (`sdk use java 21.0.1-graal`) it takes 0m 15s on my machine;
 * with GraalVM CE (`sdk use java 21.0.1-graalce`) it takes 0m 20s.
 *
 * @author Filip Hrisafov
 */
public class CalculateAverage_filiphr {

    private static final String FILE = "./measurements.txt";
    private static final long CHUNK_SIZE = 1024 * 1024 * 10L; // 1 MB * 10 = ~10 MB

    private static final class Measurement {

        private double min = Long.MAX_VALUE;
        private double max = Long.MIN_VALUE;
        private double sum = 0L;
        private long count = 0L;

        private void add(double value) {
            this.min = Math.min(this.min, value);
            this.max = Math.max(this.max, value);
            this.sum += value;
            this.count++;
        }

        public static Measurement combine(Measurement m1, Measurement m2) {
            Measurement measurement = new Measurement();
            measurement.min = Math.min(m1.min, m2.min);
            measurement.max = Math.max(m1.max, m2.max);
            measurement.sum = m1.sum + m2.sum;
            measurement.count = m1.count + m2.count;
            return measurement;
        }

        @Override
        public String toString() {
            return round(min) + "/" + round((sum) / count) + "/" + round(max);
        }

        private double round(double value) {
            return Math.round(value * 10.0) / 10.0;
        }
    }

    public static void main(String[] args) throws IOException {
        // long start = System.nanoTime();
        Map<String, Measurement> measurements;
        try (FileChannel fileChannel = FileChannel.open(Paths.get(FILE), StandardOpenOption.READ)) {
            measurements = fineChannelStream(fileChannel)
                    .parallel()
                    .map(CalculateAverage_filiphr::parseBuffer)
                    .reduce(Collections.emptyMap(), CalculateAverage_filiphr::mergeMaps);
        }

        System.out.println(new TreeMap<>(measurements));
        // System.out.println("Done in " + (System.nanoTime() - start) / 1000000 + " ms");
    }

    private static Map<String, Measurement> mergeMaps(Map<String, Measurement> map1, Map<String, Measurement> map2) {
        if (map1.isEmpty()) {
            return map2;
        }
        else {
            Set<String> cities = new HashSet<>(map1.keySet());
            cities.addAll(map2.keySet());
            Map<String, Measurement> result = HashMap.newHashMap(cities.size());
            for (String city : cities) {
                Measurement m1 = map1.get(city);
                Measurement m2 = map2.get(city);
                if (m2 == null) {
                    // When m2 is null then it is not possible for m1 to be null as well,
                    // since cities is a union of the map key sets
                    result.put(city, m1);
                }
                else if (m1 == null) {
                    // When m1 is null then it is not possible for m2 to be null as well,
                    // since cities is a union of the map key sets
                    result.put(city, m2);
                }
                else {
                    result.put(city, Measurement.combine(m1, m2));
                }
            }

            return result;
        }
    }

    /**
     * This is an adapted implementation of bjhara's parseBuffer.
     */
    private static Map<String, Measurement> parseBuffer(ByteBuffer bb) {
        Map<String, Measurement> measurements = HashMap.newHashMap(415);
        int limit = bb.limit();
        byte[] buffer = new byte[128];
        CharBuffer charBuffer = CharBuffer.allocate(8);
        while (bb.position() < limit) {
            int bufferIndex = 0;

            // Iterate through the byte buffer and fill the buffer until we find the separator (;)
            while (bb.position() < limit) {
                byte positionByte = bb.get();
                if (positionByte == ';') {
                    break;
                }
                buffer[bufferIndex++] = positionByte;
            }

            // Create the city
            String city = new String(buffer, 0, bufferIndex);

            charBuffer.clear();
            byte lastPositionByte = '\n';
            while (bb.position() < limit) {
                byte positionByte = bb.get();
                if (positionByte == '\r' || positionByte == '\n') {
                    lastPositionByte = positionByte;
                    break;
                }
                charBuffer.append((char) positionByte);
            }

            int position = charBuffer.position();
            charBuffer.position(0);
            // Parse the temperature straight from the reused char buffer
            BigDecimal bigDecimal = new BigDecimal(charBuffer.array(), 0, position);
            double value = bigDecimal.doubleValue();

            measurements.computeIfAbsent(city, k -> new Measurement())
                    .add(value);

            // If the line ended with '\r', consume the following '\n' (handles both line-ending kinds)
            if (lastPositionByte == '\r') {
                bb.get();
            }
        }

        return measurements;
    }

    /**
     * Thanks to bjhara and royvanrijn for the idea of using (and learning about) memory mapped files.
     */
    private static Stream<ByteBuffer> fineChannelStream(FileChannel fileChannel) throws IOException {
        return StreamSupport.stream(Spliterators.spliteratorUnknownSize(fileChannelIterator(fileChannel), Spliterator.IMMUTABLE), false);
    }

    private static Iterator<ByteBuffer> fileChannelIterator(FileChannel fileChannel) throws IOException {
        return new Iterator<>() {

            private final long size = fileChannel.size();
            private long start = 0;

            @Override
            public boolean hasNext() {
                return start < size;
            }

            @Override
            public ByteBuffer next() {
                try {
                    MappedByteBuffer mappedByteBuffer = fileChannel.map(FileChannel.MapMode.READ_ONLY, start,
                            Math.min(CHUNK_SIZE, size - start));

                    // don't split the data in the middle of lines
                    // find the closest previous newline
                    int realEnd = mappedByteBuffer.limit() - 1;
                    while (mappedByteBuffer.get(realEnd) != '\n')
                        realEnd--;
                    realEnd++;

                    mappedByteBuffer.limit(realEnd);
                    start += realEnd;

                    return mappedByteBuffer;
                }
                catch (IOException ex) {
                    throw new UncheckedIOException(ex);
                }
            }
        };
    }
}