diff --git a/calculate_average_vemana.sh b/calculate_average_vemana.sh
new file mode 100755
index 0000000..b3437f2
--- /dev/null
+++ b/calculate_average_vemana.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+#
+# Copyright 2023 The original authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Basics
+JAVA_OPTS=""
+JAVA_OPTS="$JAVA_OPTS --enable-preview"
+#JAVA_OPTS="$JAVA_OPTS --add-modules jdk.incubator.vector"
+#JAVA_OPTS="$JAVA_OPTS -XX:+UnlockDiagnosticVMOptions"
+
+# JIT parameters
+#JAVA_OPTS="$JAVA_OPTS -Xlog:class+load=info"
+#JAVA_OPTS="$JAVA_OPTS -XX:+LogCompilation"
+JAVA_OPTS="$JAVA_OPTS -XX:+AlwaysCompileLoopMethods"
+#JAVA_OPTS="$JAVA_OPTS -XX:TieredStopAtLevel=1"
+#JAVA_OPTS="$JAVA_OPTS -XX:TieredStopAtLevel=1"
+#JAVA_OPTS="$JAVA_OPTS -XX:CompileCommand=inline,*State.processLine()"
+#JAVA_OPTS="$JAVA_OPTS -XX:+PrintAssembly"
+#JAVA_OPTS="$JAVA_OPTS -XX:LogFile=../hotspot.log"
+#JAVA_OPTS="$JAVA_OPTS -XX:+DebugNonSafepoints"
+#JAVA_OPTS="$JAVA_OPTS -XX:C1MaxInlineSize=150"
+#JAVA_OPTS="$JAVA_OPTS -XX:C1InlineStackLimit=40"
+
+#JAVA_OPTS="$JAVA_OPTS -XX:FreqInlineSize=500"
+#JAVA_OPTS="$JAVA_OPTS -XX:+PrintCompilation"
+#JAVA_OPTS="$JAVA_OPTS -XX:+PrintInlining"
+#JAVA_OPTS="$JAVA_OPTS -XX:CompileThreshold=20 "
+#JAVA_OPTS="$JAVA_OPTS -Xlog:async"
+
+# GC parameters
+JAVA_OPTS="$JAVA_OPTS -XX:+UseParallelGC"
+
+#JAVA_OPTS="$JAVA_OPTS -Xlog:gc*=debug:file=/tmp/gc.log"
+#JAVA_OPTS="$JAVA_OPTS -XX:+UseEpsilonGC -Xlog:all=off"
+#JAVA_OPTS="$JAVA_OPTS -XX:+PrintGC -XX:+PrintGCDetails"
+
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_vemana "$@"
diff --git a/github_users.txt b/github_users.txt
index 5bfc5dc..ef5ef51 100644
--- a/github_users.txt
+++ b/github_users.txt
@@ -49,3 +49,4 @@ yavuztas;Yavuz Tas
 yehwankim23;김예환 Ye-Hwan Kim (Sam)
 hundredwatt;Jason Nochlin
 gnmathur;Gaurav Mathur
+vemana;Subrahmanyam
diff --git a/prepare_vemana.sh b/prepare_vemana.sh
new file mode 100755
index 0000000..0099505
--- /dev/null
+++ b/prepare_vemana.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+#
+# Copyright 2023 The original authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+source "$HOME/.sdkman/bin/sdkman-init.sh"
+#sdk1 use java 21.0.1-open 1>&2
+sdk use java 21.0.1-graal 1>&2
+#sdk1 use java 21.0.1-zulu 1>&2
+#sdk1 use java 21.0.1-graalce 1>&2
+
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java b/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java
new file mode 100644
index 0000000..7673fb5
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java
@@ -0,0 +1,692 @@
+/*
+ * Copyright 2023 The original authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.nio.ByteOrder;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel.MapMode;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.TreeMap;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.stream.Collectors;
+
+/**
+ * This submission focuses on exploiting the non-SIMD parallelism that is inherent in OOO
+ * super-scalar CPUs and avoids using Unsafe, SWAR and other such fine techniques. The hope is to
+ * remain readable for a majority of SWEs. At a high level, the approach relies on a few principles
+ * listed herein.
+ *
+ * <br>
+ * [Exploit Parallelism] Distribute the work into Shards. Separate threads (one per core) process
+ * Shards and then merge the results. parallelStream() is appealing, but based on informal testing
+ * it carries potential run-time variance (i.e., std. deviation) penalties. Variance is not ideal
+ * when trying to minimize the maximum worker latency.
+ * <br>
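+ * A minimal sketch of that shape (the worker and merge helpers are hypothetical; the real
+ * orchestration lives in Runner below):
+ * <pre>{@code
+ * // One worker per core; results are merged once all workers finish.
+ * // (Assumes an enclosing method that can throw Exception.)
+ * int nThreads = Runtime.getRuntime().availableProcessors();
+ * ExecutorService pool = Executors.newFixedThreadPool(nThreads);
+ * List<Future<Result>> futures = new ArrayList<>();
+ * for (int i = 0; i < nThreads; i++) {
+ *     int shardIdx = i;
+ *     futures.add(pool.submit(() -> processShard(shardIdx))); // hypothetical worker
+ * }
+ * Result merged = futures.get(0).get();
+ * for (int i = 1; i < nThreads; i++) {
+ *     merged = merged.merge(futures.get(i).get()); // hypothetical merge
+ * }
+ * pool.shutdown();
+ * }</pre>
+ * <br>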
+ * [Use ByteBuffers over MemorySegment] Each Shard is further divided into Chunks. This would have
+ * been unnecessary except that Shards are too big to be backed by ByteBuffers (which are limited
+ * to Integer.MAX_VALUE bytes). Besides, MemorySegment appears slower than ByteBuffers. So, to use
+ * ByteBuffers, we have to use smaller chunks.
+ * <br>
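+ * A minimal sketch of mapping one such window (the real logic is ByteRange.setByteBufferToRange
+ * below):
+ * <pre>{@code
+ * // Map a <= 1 GB window of the file; chunk offsets are then plain ints into the window.
+ * // (IOException handling elided.)
+ * try (RandomAccessFile raf = new RandomAccessFile("measurements.txt", "r")) {
+ *     long start = 0;
+ *     long size = Math.min(1L << 30, raf.length() - start);
+ *     MappedByteBuffer window = raf.getChannel().map(MapMode.READ_ONLY, start, size);
+ *     byte first = window.get(0); // int-indexed access within the window
+ * }
+ * }</pre>
+ * <br>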
+ * [Straggler freedom] The objective here is to minimize the maximal worker thread completion
+ * time. The law of large numbers means that all the threads will end up with similar amounts of
+ * work and similar completion times; but every so often there can be a bad sharding and, more
+ * importantly, cores are not created equal: some will be throttled more than others. So, we have a
+ * shared {@code LazyShardQueue} that aims to distribute work so as to minimize the latest
+ * completion time.
+ * <br>
+ * [Work Assignment with LazyShardQueue] The queue provides each thread with its next big chunk
+ * until only X% of the work remains. Big chunks belong to the thread and will not be provided to
+ * another thread. Then, the queue switches to handing out small chunks: these comprise the last X%
+ * of work, and every thread can participate in completing them. Even though the queue is shared
+ * across threads, there is no communication across threads during the big-chunk phase; the queue
+ * is effectively a per-thread queue while processing big chunks. The small-chunk phase uses an
+ * AtomicLong to coordinate chunk allocation across threads.
+ * <br>
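+ * A sketch of the take() logic (field and helper names hypothetical; see LazyShardQueue and its
+ * implementation below):
+ * <pre>{@code
+ * ByteRange take(int shardIdx) {
+ *     // Big-chunk phase: each thread consumes its own region, no cross-thread traffic.
+ *     long next = perThreadNext[shardIdx];
+ *     if (next < perThreadLimit[shardIdx]) {
+ *         perThreadNext[shardIdx] = next + bigChunkSize;
+ *         return rangeFor(next, next + bigChunkSize);
+ *     }
+ *     // Small-chunk phase: the last X% is claimed through a shared AtomicLong.
+ *     long claimed = commonPos.getAndAdd(smallChunkSize);
+ *     return claimed < fileSize ? rangeFor(claimed, claimed + smallChunkSize) : null;
+ * }
+ * }</pre>
+ * <br>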
+ * [Chunk processing] Chunk processing is typical. Process line by line: find a hash function
+ * (polynomial hash functions are slow, but work fine), hash the city name, resolve conflicts
+ * using linear probing, and then accumulate the temperature into the appropriate hash slot. The
+ * key question then is how fast you can identify the hash slot, read the temperature and update
+ * the slot's statistics (i.e. min, max, count).
+ * <br>
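+ * In sketch form (slot layout hypothetical), the per-line hot path is roughly:
+ * <pre>{@code
+ * int slot = hash & (tableSize - 1);               // tableSize is a power of two
+ * while (!slotMatchesName(slot, buf, nameStart, nameLen)) {
+ *     slot = (slot + 1) & (tableSize - 1);         // linear probing on collision
+ * }
+ * min[slot] = Math.min(min[slot], temperature);
+ * max[slot] = Math.max(max[slot], temperature);
+ * sum[slot] += temperature;
+ * count[slot]++;
+ * }</pre>
+ * <br>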
+ * [Cache friendliness] The 7502P and my machine (7950X) offer 4 MB of L3 cache per core. This
+ * means we can hope to fit all our data structures in L3 cache. Since SMT is turned on, the
+ * Runtime's available processors show twice the number of physical cores, so we get 2 MB of L3
+ * cache per thread. To be safe, we try to stay within 1.8 MB/thread and size our hashtable
+ * accordingly.
+ * <br>
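+ * As a back-of-envelope check (per-slot layout assumed, not measured): hashtableSizeBits = 14
+ * gives 2^14 = 16384 slots, and a 1.8 MB budget allows about 1.8 * 2^20 / 2^14, i.e. ~115 bytes
+ * per slot - enough for min/max/sum/count plus key metadata.
+ * <br>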
+ * [Allocation] Since MemorySegment seemed slower than ByteBuffers, backing Chunks by ByteBuffers
+ * was the logical option. Creating one ByteBuffer per chunk was no bueno because the system
+ * doesn't like it (the JVM runs out of its mapped-file handle quota). Other than that, allocation
+ * in the hot path was avoided.
+ * <br>
+ * [General approach to fast hashing and temperature reading] Here, it helps to understand the
+ * various bottlenecks in execution. One particular thing that I kept coming back to was the
+ * relative cost of instructions: see https://www.agner.org/optimize/instruction_tables.pdf. It is
+ * helpful to think of the hardware as a smart parallel execution machine that can do several
+ * operations in one cycle if only you can feed it. So, the first idea is to reduce
+ * data-dependency chains in the bottleneck path. The other major idea is to simply avoid
+ * unnecessary work. For example, copying the city name into a byte array just for the purpose of
+ * looking it up was costing a noticeable amount; encoding it as (bytebuffer, start, len) instead
+ * was helpful. Spotting unnecessary work is non-trivial. So, those pesky range checks? See if you
+ * can avoid them. For example, sometimes you can eliminate a "nextPos < endPos" check in a tight
+ * loop by breaking it into two pieces: one piece where the check is not needed and a tail piece
+ * where it is.
+ * <br>
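+ * A sketch of that loop-splitting idea (helper names hypothetical):
+ * <pre>{@code
+ * // Main loop: provably in range, so the per-byte "nextPos < endPos" check disappears.
+ * int safeEnd = endPos - maxLineLength;
+ * while (nextPos < safeEnd) {
+ *     nextPos = processLineUnchecked(buf, nextPos);
+ * }
+ * // Tail loop: only the few remaining lines pay for the range check.
+ * while (nextPos < endPos) {
+ *     nextPos = processLineChecked(buf, nextPos, endPos);
+ * }
+ * }</pre>
+ * <br>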
+ * [Understand What Cores like]. Cores like to go straight and loop back. Despite good branch
+ * prediction, performance sucks with mispredicted branches.
+ * <br>
+ * [JIT] Java performance requires understanding the JIT. It is helpful to understand what the JIT
+ * likes, though it is still somewhat of a mystery to me. In general, it inlines small methods very
+ * well, and after constant folding it can optimize quite well across a reasonably deep call chain.
+ * My experience was that almost everything I tried in tuning the JIT made things worse, except for
+ * one parameter. I have a new-found respect for the JIT - it likes and understands typical Java
+ * idioms.
+ * <br> [Tuning] Nothing was more insightful than actually playing with the various tuning
+ * parameters. I can have all the theories, but the hardware and JIT are giant blackboxes. I used a
+ * bunch of tools to optimize: (1) command line parameters to tune big and small chunk sizes etc.;
+ * this was also very helpful in forming a mental model of the JIT - sometimes it would compile
+ * some methods and sometimes it would just run them interpreted, since the compilation threshold
+ * wouldn't be reached for intermediate methods; (2) AsyncProfiler - the first-line tool to
+ * understand cache misses and CPU time, and to figure out where to aim the next optimization
+ * effort; (3) JitWatch - invaluable for forming a mental model of, and attempting to tune, the
+ * JIT.
+ * <br> [Things that didn't work]. This is a looong list, and the hit rate is quite low. In
+ * general, removing unnecessary work had a high hit rate, and my pet theories on how things work
+ * in hardware had a low hit rate. The Java Vector API lacked a good gather API to load from
+ * arbitrary memory addresses; this prevented performing real SIMD on the entire dataset - one
+ * where you load 64 bytes from 64 different line starting positions (just after a previous line
+ * end) and then step through one byte at a time. This, to me, is the most natural SIMD approach to
+ * the 1BRC problem, but I couldn't use it. I tried other, local uses of the Vector API and it was
+ * always slower, and mostly much slower. In other words, the Java Vector API needs a problem for
+ * which it is suited (duh) but, unless I am overlooking something, the API still lacks gather from
+ * arbitrary memory addresses.
+ * <br> [My general takeaways]. Write simple, idiomatic Java code and get 70-80% of the max
+ * performance of an optimally hand-tuned code. Focus any optimization efforts on being friendly to
+ * the JIT *before* thinking about tuning for the hardware. There's a real cost to EXTREME
+ * performance tuning: a loss of abstraction and maintainability, but being JIT friendly is probably
+ * much more achievable without sacrificing abstraction.
+ */
+public class CalculateAverage_vemana {
+
+ public static void checkArg(boolean condition) {
+ if (!condition) {
+ throw new IllegalArgumentException();
+ }
+ }
+
+ public static void main(String[] args) throws Exception {
+ // First process in large chunks without coordination among threads
+ // Use chunkSizeBits for the large-chunk size
+ int chunkSizeBits = 20;
+
+ // For the last commonChunkFraction fraction of total work, use smaller chunk sizes
+ double commonChunkFraction = 0;
+
+ // Use commonChunkSizeBits for the small-chunk size
+ int commonChunkSizeBits = 18;
+
+ // Size of the hashtable (attempt to fit in L3)
+ int hashtableSizeBits = 14;
+
+ if (args.length > 0) {
+ chunkSizeBits = Integer.parseInt(args[0]);
+ }
+
+ if (args.length > 1) {
+ commonChunkFraction = Double.parseDouble(args[1]);
+ }
+
+ if (args.length > 2) {
+ commonChunkSizeBits = Integer.parseInt(args[2]);
+ }
+
+ if (args.length > 3) {
+ hashtableSizeBits = Integer.parseInt(args[3]);
+ }
+
+ // System.err.println(STR."""
+ // Using the following parameters:
+ // - chunkSizeBits = \{chunkSizeBits}
+ // - commonChunkFraction = \{commonChunkFraction}
+ // - commonChunkSizeBits = \{commonChunkSizeBits}
+ // - hashtableSizeBits = \{hashtableSizeBits}
+ // """);
+
+ System.out.println(new Runner(
+ Path.of("measurements.txt"),
+ chunkSizeBits,
+ commonChunkFraction,
+ commonChunkSizeBits,
+ hashtableSizeBits).getSummaryStatistics());
+ }
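+
+ // Example invocation (hypothetical values), with the arguments passed through by
+ // calculate_average_vemana.sh: ./calculate_average_vemana.sh 20 0.05 18 14
+ // => 1 MB big chunks, small chunks for the last 5% of work, 256 KB small chunks, 2^14 hash slots.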
+
+ public interface LazyShardQueue {
+
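+ /**
+ * Returns the next chunk of work for worker {@code shardIdx}: a chunk from the worker's
+ * private big-chunk allotment while any remains, then small chunks from the shared tail of
+ * the work (see the class javadoc for the rationale).
+ */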
+ ByteRange take(int shardIdx);
+ }
+
+ // Mutable to avoid allocation
+ public static class ByteRange {
+
+ private static final int BUF_SIZE = 1 << 30;
+
+ private final long fileSize;
+ private final RandomAccessFile raf;
+
+ // ***************** What this is doing and why *****************
+ // Reading from a ByteBuffer appears faster than from a MemorySegment, but a ByteBuffer can
+ // only be Integer.MAX_VALUE long; creating one ByteBuffer per chunk exhausts the native
+ // memory-map quota and crashes the JVM without further parameter tuning.
+ //
+ // So, in this solution, create a sliding window of bytebuffers:
+ // - Create a large bytebuffer that spans the chunk
+ // - If the next chunk falls outside the byteBuffer, create another byteBuffer that spans the
+ // chunk. Because chunks are allocated serially, a single large (1<<30) byteBuffer spans
+ // many successive chunks.
+ // - In fact, for serial chunk allocation (which is friendly to page faulting anyway),
+ // the number of created ByteBuffers doesn't exceed [size of shard/(1<<30)] which is less than
+ // 100/thread and is comfortably below what the JVM can handle (65K) without further param
+ // tuning
+ // - This enables a (relatively) allocation-free chunking implementation. Our chunking impl uses
+ // fine-grained chunking for the last, say, X% of work to avoid being hostage to stragglers
+
+ // The PUBLIC API
+ public MappedByteBuffer byteBuffer;
+ public int endInBuf; // where the chunk ends inside the buffer
+ public int startInBuf; // where the chunk starts inside the buffer
+ // Private State
+ private long bufferEnd; // byteBuffer's ending coordinate
+ private long bufferStart; // byteBuffer's begin coordinate
+
+ // Uninitialized; for mutability
+ public ByteRange(RandomAccessFile raf) {
+ this.raf = raf;
+ try {
+ this.fileSize = raf.length();
+ }
+ catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ bufferEnd = bufferStart = -1;
+ }
+
+ public void setRange(long rangeStart, long rangeEnd) {
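+ // Remap the sliding-window buffer if the requested range, plus slack for scanning past
+ // rangeEnd to the next newline, is not covered by the current mapping.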
+ if (rangeEnd + 1024 > bufferEnd || rangeStart < bufferStart) {
+ bufferStart = rangeStart;
+ bufferEnd = Math.min(bufferStart + BUF_SIZE, fileSize);
+ setByteBufferToRange(bufferStart, bufferEnd);
+ }
+
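+ // Snap both boundaries forward to just past the next newline, so that consecutive chunks
+ // split cleanly at line boundaries and no line is processed twice.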
+ if (rangeStart > 0) {
+ rangeStart = 1 + nextNewLine(rangeStart);
+ }
+
+ if (rangeEnd < fileSize) {
+ rangeEnd = 1 + nextNewLine(rangeEnd);
+ }
+ else {
+ rangeEnd = fileSize;
+ }
+
+ startInBuf = (int) (rangeStart - bufferStart);
+ endInBuf = (int) (rangeEnd - bufferStart);
+ }
+
+ @Override
+ public String toString() {
+ return STR."""
+ ByteRange {
+ startInBuf = \{startInBuf}
+ endInBuf = \{endInBuf}
+ }
+ """;
+ }
+
+ private long nextNewLine(long pos) {
+ int nextPos = (int) (pos - bufferStart);
+ while (byteBuffer.get(nextPos) != '\n') {
+ nextPos++;
+ }
+ return nextPos + bufferStart;
+ }
+
+ private void setByteBufferToRange(long start, long end) {
+ try {
+ byteBuffer = raf.getChannel().map(MapMode.READ_ONLY, start, end - start);
+ }
+ catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ }
+
+ public record Result(Map