1brc/src/main/java/dev/morling/onebrc/CalculateAverage_filiphr.java

/*
 *  Copyright 2023 The original authors
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package dev.morling.onebrc;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.StandardCharsets;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.TreeMap;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

/**
 * Reads {@code ./measurements.txt}, aggregates the minimum, mean and maximum value per station and
 * prints the result, sorted by station name, to standard output.
 * <p>
 * The file is memory mapped in slices of at most {@code CHUNK_SIZE} bytes, each slice ending on a line
 * break. The slices are parsed in parallel and the per-slice results are merged afterwards.
 * Every line is expected to look like {@code <station>;<value>}; the decimal point of the value is
 * dropped while parsing, so the code assumes exactly one fractional digit (values are kept as tenths).
 * <p>
 * Initial submission:                                 1m 35s
 * Adding memory mapped files:                         0m 55s (based on bjhara's submission)
 * Using big decimal and iterating the buffer once:    0m 20s
 * Using long parse:                                   0m 11s
 * <p>
 * Using 21.0.1 Temurin with ShenandoahGC on Macbook (Intel) Pro
 * `sdk use java 21.0.1-tem`
 *
 * When using Oracle GraalVM 21.0.1+12.1
 * `sdk use java 21.0.1-graal`
 * It takes 0m 15s on my machine
 * `sdk use java 21.0.1-graalce`
 * It takes 0m 20s on my machine
 *
 * @author Filip Hrisafov
 */
public class CalculateAverage_filiphr {

    private static final String FILE = "./measurements.txt";

    /** Upper bound for the size of one memory mapped slice: 10 MiB. */
    private static final long CHUNK_SIZE = 10L * 1024 * 1024;

    /** Size of the scratch array that receives the raw bytes of one station name. */
    private static final int STATION_NAME_BUFFER_SIZE = 128;

    /** Size of the scratch array that receives the characters of one value (sign and digits). */
    private static final int VALUE_BUFFER_SIZE = 8;

    /** Initial capacity of the map created for every parsed slice. */
    private static final int INITIAL_STATION_CAPACITY = 415;

    /**
     * Running aggregate for a single station. All values are whole tenths.
     */
    private static final class Measurement {

        private long min = Long.MAX_VALUE;
        private long max = Long.MIN_VALUE;
        private long sum = 0L;
        private long count = 0L;

        /**
         * Folds one more value (in tenths) into this aggregate.
         */
        private void add(long value) {
            this.min = Math.min(this.min, value);
            this.max = Math.max(this.max, value);
            this.sum += value;
            this.count++;
        }

        /**
         * Returns a new aggregate covering the values of both arguments; neither argument is modified.
         */
        public static Measurement combine(Measurement m1, Measurement m2) {
            Measurement measurement = new Measurement();
            measurement.min = Math.min(m1.min, m2.min);
            measurement.max = Math.max(m1.max, m2.max);
            measurement.sum = m1.sum + m2.sum;
            measurement.count = m1.count + m2.count;
            return measurement;
        }

        /**
         * Formats the aggregate as {@code min/mean/max} with one fractional digit each.
         */
        @Override
        public String toString() {
            // min and max are already whole tenths, only the mean has to be rounded. It is rounded once,
            // directly on sum / count, so no error of an intermediate division by 10.0 can influence
            // which way a value close to a tie is rounded.
            double mean = Math.round((double) sum / count) / 10.0;
            return (min / 10.0) + "/" + mean + "/" + (max / 10.0);
        }
    }

    /**
     * Entry point: aggregates {@code ./measurements.txt} and prints the sorted result.
     *
     * @param args ignored
     * @throws IOException if the file cannot be opened or its size cannot be determined
     */
    public static void main(String[] args) throws IOException {
        Map<String, Measurement> measurements;
        try (FileChannel fileChannel = FileChannel.open(Paths.get(FILE), StandardOpenOption.READ)) {
            measurements = fileChannelStream(fileChannel)
                    .parallel()
                    .map(CalculateAverage_filiphr::parseBuffer)
                    .reduce(Collections.emptyMap(), CalculateAverage_filiphr::mergeMaps);
        }

        // TreeMap gives the alphabetical order of the stations in the printed result
        System.out.println(new TreeMap<>(measurements));
    }

    /**
     * Merges two per-slice results without modifying either of them.
     *
     * @param map1 first partial result, may be empty
     * @param map2 second partial result, may be empty
     * @return the other map if one of them is empty, otherwise a new map holding the combined aggregates
     */
    private static Map<String, Measurement> mergeMaps(Map<String, Measurement> map1, Map<String, Measurement> map2) {
        if (map1.isEmpty()) {
            return map2;
        }
        if (map2.isEmpty()) {
            return map1;
        }

        Map<String, Measurement> merged = new HashMap<>(map1);
        // combine creates a fresh Measurement, so the aggregates shared with map1 / map2 stay untouched
        map2.forEach((city, measurement) -> merged.merge(city, measurement, Measurement::combine));
        return merged;
    }

    /**
     * Parses all lines between the position and the limit of the buffer.
     * This is an adapted implementation of the bjhara parseBuffer.
     *
     * @param bb buffer positioned at the start of a line; its position is advanced to its limit
     * @return the aggregates of the stations found in the buffer
     * @throws NumberFormatException if a value is not a decimal number once the '.' is removed
     */
    private static Map<String, Measurement> parseBuffer(ByteBuffer bb) {
        Map<String, Measurement> measurements = HashMap.newHashMap(INITIAL_STATION_CAPACITY);
        int limit = bb.limit();
        byte[] nameBytes = new byte[STATION_NAME_BUFFER_SIZE];
        char[] valueChars = new char[VALUE_BUFFER_SIZE];
        // View on valueChars, Long.parseLong needs a CharSequence
        CharBuffer valueView = CharBuffer.wrap(valueChars);

        while (bb.position() < limit) {
            // Skip blank lines. This also consumes the '\n' of a "\r\n" line break,
            // because the value loop below stops right after the '\r'.
            byte first = bb.get(bb.position());
            if (first == '\n' || first == '\r') {
                bb.position(bb.position() + 1);
                continue;
            }

            // Everything up to the separator (;) is the station name
            int nameLength = 0;
            while (bb.position() < limit) {
                byte current = bb.get();
                if (current == ';') {
                    break;
                }
                nameBytes[nameLength++] = current;
            }
            String city = new String(nameBytes, 0, nameLength, StandardCharsets.UTF_8);

            // Everything up to the line break is the value; dropping the '.' turns it into tenths
            int valueLength = 0;
            while (bb.position() < limit) {
                byte current = bb.get();
                if (current == '\r' || current == '\n') {
                    break;
                }
                if (current != '.') {
                    valueChars[valueLength++] = (char) current;
                }
            }
            long value = Long.parseLong(valueView, 0, valueLength, 10);

            measurements.computeIfAbsent(city, k -> new Measurement())
                    .add(value);
        }

        return measurements;
    }

    /**
     * Wraps the slices of the file into a sequential stream (the caller decides about parallelism).
     * Thanks to bjhara and royvanrijn for the idea of using (and learning about) memory mapped files.
     */
    private static Stream<ByteBuffer> fileChannelStream(FileChannel fileChannel) throws IOException {
        Iterator<ByteBuffer> slices = fileChannelIterator(fileChannel);
        return StreamSupport.stream(Spliterators.spliteratorUnknownSize(slices, Spliterator.IMMUTABLE), false);
    }

    /**
     * Creates an iterator that maps the file slice by slice. Every slice except the last one ends right
     * after a '\n'; the last slice reaches to the end of the file, whether or not the file ends with a
     * line break.
     *
     * @param fileChannel channel of the file, opened for reading
     * @throws IOException if the size of the file cannot be determined
     */
    private static Iterator<ByteBuffer> fileChannelIterator(FileChannel fileChannel) throws IOException {
        return new Iterator<>() {

            private final long size = fileChannel.size();
            private long start = 0;

            @Override
            public boolean hasNext() {
                return start < size;
            }

            @Override
            public ByteBuffer next() {
                if (!hasNext()) {
                    throw new NoSuchElementException();
                }
                try {
                    long length = Math.min(CHUNK_SIZE, size - start);
                    MappedByteBuffer slice = fileChannel.map(FileChannel.MapMode.READ_ONLY, start, length);

                    int end = slice.limit();
                    boolean lastSlice = start + length == size;
                    if (!lastSlice) {
                        // don't split the data in the middle of a line:
                        // cut the slice right after the closest previous newline
                        while (end > 0 && slice.get(end - 1) != '\n') {
                            end--;
                        }
                        if (end == 0) {
                            throw new IllegalStateException(
                                    "No line break within the " + length + " bytes starting at offset " + start);
                        }
                    }

                    slice.limit(end);
                    start += end;

                    return slice;
                }
                catch (IOException ex) {
                    throw new UncheckedIOException(ex);
                }
            }
        };
    }
}
Adding filiphr's submission; * Initial implementation using Shenandoah GC and parallel iteration * Use memory mapped files * Iterate the buffer once and use BigDecimal parsing instead of Double.parseDouble * Add information about Graal * Add sdk use to calculate script 2024-01-03 20:32:16 +01:00			`/*`
			`* Copyright 2023 The original authors`
			`*`
			`* Licensed under the Apache License, Version 2.0 (the "License");`
			`* you may not use this file except in compliance with the License.`
			`* You may obtain a copy of the License at`
			`*`
			`* http://www.apache.org/licenses/LICENSE-2.0`
			`*`
			`* Unless required by applicable law or agreed to in writing, software`
			`* distributed under the License is distributed on an "AS IS" BASIS,`
			`* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`* See the License for the specific language governing permissions and`
			`* limitations under the License.`
			`*/`
			`package dev.morling.onebrc;`

			`import java.io.IOException;`
			`import java.io.UncheckedIOException;`
			`import java.nio.ByteBuffer;`
			`import java.nio.CharBuffer;`
			`import java.nio.MappedByteBuffer;`
			`import java.nio.channels.FileChannel;`
			`import java.nio.file.Paths;`
			`import java.nio.file.StandardOpenOption;`
			`import java.util.Collections;`
			`import java.util.HashMap;`
			`import java.util.HashSet;`
			`import java.util.Iterator;`
			`import java.util.Map;`
			`import java.util.Set;`
			`import java.util.Spliterator;`
			`import java.util.Spliterators;`
			`import java.util.TreeMap;`
			`import java.util.stream.Stream;`
			`import java.util.stream.StreamSupport;`

			`/**`
			`* Initial submission: 1m 35s`
			`* Adding memory mapped files: 0m 55s (based on bjhara's submission)`
			`* Using big decimal and iterating the buffer once: 0m 20s`
Use long parse and use char array instead of CharBuffer for adding to it 2024-01-03 22:06:33 +01:00			`* Using long parse: 0m 11s`
Adding filiphr's submission; * Initial implementation using Shenandoah GC and parallel iteration * Use memory mapped files * Iterate the buffer once and use BigDecimal parsing instead of Double.parseDouble * Add information about Graal * Add sdk use to calculate script 2024-01-03 20:32:16 +01:00			`* <p>`
			`* Using 21.0.1 Temurin with ShenandoahGC on Macbook (Intel) Pro`
			* `sdk use java 21.0.1-tem`
			`*`
			`* When using Oracle GraalVM 21.0.1+12.1`
			* `sdk use java 21.0.1-graal`
			`* It takes 0m 15s on my machine`
			* `sdk use java 21.0.1-graalce`
			`* It takes 0m 20s on my machine`
			`*`
			`* @author Filip Hrisafov`
			`*/`
			`public class CalculateAverage_filiphr {`

			`private static final String FILE = "./measurements.txt";`
			`private static final long CHUNK_SIZE = 1024 * 1024 * 10L; // 1KB * 10KB ~ 10MB`

			`private static final class Measurement {`

Use long parse and use char array instead of CharBuffer for adding to it 2024-01-03 22:06:33 +01:00			`private long min = Long.MAX_VALUE;`
			`private long max = Long.MIN_VALUE;`
			`private long sum = 0L;`
Adding filiphr's submission; * Initial implementation using Shenandoah GC and parallel iteration * Use memory mapped files * Iterate the buffer once and use BigDecimal parsing instead of Double.parseDouble * Add information about Graal * Add sdk use to calculate script 2024-01-03 20:32:16 +01:00			`private long count = 0L;`

Use long parse and use char array instead of CharBuffer for adding to it 2024-01-03 22:06:33 +01:00			`private void add(long value) {`
Adding filiphr's submission; * Initial implementation using Shenandoah GC and parallel iteration * Use memory mapped files * Iterate the buffer once and use BigDecimal parsing instead of Double.parseDouble * Add information about Graal * Add sdk use to calculate script 2024-01-03 20:32:16 +01:00			`this.min = Math.min(this.min, value);`
			`this.max = Math.max(this.max, value);`
			`this.sum += value;`
			`this.count++;`
			`}`

			`public static Measurement combine(Measurement m1, Measurement m2) {`
			`Measurement measurement = new Measurement();`
			`measurement.min = Math.min(m1.min, m2.min);`
			`measurement.max = Math.max(m1.max, m2.max);`
			`measurement.sum = m1.sum + m2.sum;`
			`measurement.count = m1.count + m2.count;`
			`return measurement;`
			`}`

			`@Override`
			`public String toString() {`
Use long parse and use char array instead of CharBuffer for adding to it 2024-01-03 22:06:33 +01:00			`return round(min / 10.0) + "/" + round((sum / 10.0) / count) + "/" + round(max / 10.0);`
Adding filiphr's submission; * Initial implementation using Shenandoah GC and parallel iteration * Use memory mapped files * Iterate the buffer once and use BigDecimal parsing instead of Double.parseDouble * Add information about Graal * Add sdk use to calculate script 2024-01-03 20:32:16 +01:00			`}`

			`private double round(double value) {`
			`return Math.round(value * 10.0) / 10.0;`
			`}`
			`}`

			`public static void main(String[] args) throws IOException {`
			`// long start = System.nanoTime();`

			`Map<String, Measurement> measurements;`
			`try (FileChannel fileChannel = FileChannel.open(Paths.get(FILE), StandardOpenOption.READ)) {`
			`measurements = fineChannelStream(fileChannel)`
			`.parallel()`
			`.map(CalculateAverage_filiphr::parseBuffer)`
			`.reduce(Collections.emptyMap(), CalculateAverage_filiphr::mergeMaps);`
			`}`

			`System.out.println(new TreeMap<>(measurements));`
			`// System.out.println("Done in " + (System.nanoTime() - start) / 1000000 + " ms");`
			`}`

			`private static Map<String, Measurement> mergeMaps(Map<String, Measurement> map1, Map<String, Measurement> map2) {`
			`if (map1.isEmpty()) {`
			`return map2;`
			`}`
			`else {`
			`Set<String> cities = new HashSet<>(map1.keySet());`
			`cities.addAll(map2.keySet());`
			`Map<String, Measurement> result = HashMap.newHashMap(cities.size());`

			`for (String city : cities) {`
			`Measurement m1 = map1.get(city);`
			`Measurement m2 = map2.get(city);`
			`if (m2 == null) {`
			`// When m2 is null then it is not possible for m1 to be null as well,`
			`// since cities is a union of the map key sets`
			`result.put(city, m1);`
			`}`
			`else if (m1 == null) {`
			`// When m1 is null then it is not possible for m2 to be null as well,`
			`// since cities is a union of the map key sets`
			`result.put(city, m2);`
			`}`
			`else {`
			`result.put(city, Measurement.combine(m1, m2));`
			`}`
			`}`

			`return result;`
			`}`
			`}`

			`/**`
			`* This is an adapted implementation of the bjhara parseBuffer`
			`*/`
			`private static Map<String, Measurement> parseBuffer(ByteBuffer bb) {`
			`Map<String, Measurement> measurements = HashMap.newHashMap(415);`
			`int limit = bb.limit();`
			`byte[] buffer = new byte[128];`
Use long parse and use char array instead of CharBuffer for adding to it 2024-01-03 22:06:33 +01:00			`char[] charArray = new char[8];`
			`CharBuffer charBuffer = CharBuffer.wrap(charArray);`
			`charBuffer.clear();`
			`charBuffer.position(0);`
Adding filiphr's submission; * Initial implementation using Shenandoah GC and parallel iteration * Use memory mapped files * Iterate the buffer once and use BigDecimal parsing instead of Double.parseDouble * Add information about Graal * Add sdk use to calculate script 2024-01-03 20:32:16 +01:00
			`while (bb.position() < limit) {`
			`int bufferIndex = 0;`

			`// Iterate through the byte buffer and fill the buffer until we find the separator (;)`
			`while (bb.position() < limit) {`
			`byte positionByte = bb.get();`
			`if (positionByte == ';') {`
			`break;`
			`}`
			`buffer[bufferIndex++] = positionByte;`
			`}`

			`// Create the city`
			`String city = new String(buffer, 0, bufferIndex);`

			`byte lastPositionByte = '\n';`
Use long parse and use char array instead of CharBuffer for adding to it 2024-01-03 22:06:33 +01:00			`bufferIndex = 0;`
Adding filiphr's submission; * Initial implementation using Shenandoah GC and parallel iteration * Use memory mapped files * Iterate the buffer once and use BigDecimal parsing instead of Double.parseDouble * Add information about Graal * Add sdk use to calculate script 2024-01-03 20:32:16 +01:00			`while (bb.position() < limit) {`
			`byte positionByte = bb.get();`
			`if (positionByte == '\r' || positionByte == '\n') {`
			`lastPositionByte = positionByte;`
			`break;`
			`}`
Use long parse and use char array instead of CharBuffer for adding to it 2024-01-03 22:06:33 +01:00			`else if (positionByte != '.') {`
			`charArray[bufferIndex++] = (char) positionByte;`
			`}`
Adding filiphr's submission; * Initial implementation using Shenandoah GC and parallel iteration * Use memory mapped files * Iterate the buffer once and use BigDecimal parsing instead of Double.parseDouble * Add information about Graal * Add sdk use to calculate script 2024-01-03 20:32:16 +01:00			`}`

			`// Create the temperature string`
Use long parse and use char array instead of CharBuffer for adding to it 2024-01-03 22:06:33 +01:00			`long value = Long.parseLong(charBuffer, 0, bufferIndex, 10);`
Adding filiphr's submission; * Initial implementation using Shenandoah GC and parallel iteration * Use memory mapped files * Iterate the buffer once and use BigDecimal parsing instead of Double.parseDouble * Add information about Graal * Add sdk use to calculate script 2024-01-03 20:32:16 +01:00
			`measurements.computeIfAbsent(city, k -> new Measurement())`
			`.add(value);`

			`// and get rid of the new line (handle both kinds)`
			`if (lastPositionByte == '\r') {`
			`bb.get();`
			`}`
			`}`

			`return measurements;`
			`}`

			`/**`
			`* Thanks to bjhara and royvanrijn for the idea of using (and learning about) memory mapped files.`
			`*/`
			`private static Stream<ByteBuffer> fineChannelStream(FileChannel fileChannel) throws IOException {`
			`return StreamSupport.stream(Spliterators.spliteratorUnknownSize(fileChannelIterator(fileChannel), Spliterator.IMMUTABLE), false);`
			`}`

			`private static Iterator<ByteBuffer> fileChannelIterator(FileChannel fileChannel) throws IOException {`
			`return new Iterator<>() {`

			`private final long size = fileChannel.size();`
			`private long start = 0;`

			`@Override`
			`public boolean hasNext() {`
			`return start < size;`
			`}`

			`@Override`
			`public ByteBuffer next() {`
			`try {`
			`MappedByteBuffer mappedByteBuffer = fileChannel.map(FileChannel.MapMode.READ_ONLY, start,`
			`Math.min(CHUNK_SIZE, size - start));`

			`// don't split the data in the middle of lines`
			`// find the closest previous newline`
			`int realEnd = mappedByteBuffer.limit() - 1;`
			`while (mappedByteBuffer.get(realEnd) != '\n')`
			`realEnd--;`

			`realEnd++;`

			`mappedByteBuffer.limit(realEnd);`
			`start += realEnd;`

			`return mappedByteBuffer;`
			`}`
			`catch (IOException ex) {`
			`throw new UncheckedIOException(ex);`
			`}`
			`}`
			`};`
			`}`
			`}`