Initial submission for jonathan_aotearoa. (#586)
* Initial submission for jonathan_aotearoa * Fixing typos * Adding hyphens to prepare and calculate shell scripts so that they're aligned with my GitHub username. * Making chunk processing more robust in attempt to fix the cause of the build error. * Fixing typo. * Fixed the handling of files less than 8 bytes in length. * Additional assertion, comment improvements. * Refactoring to improve testability. Additional assertion and comments. * Updating collision checking to include checking if the station name is equal. * Minor refactoring to make param ordering consistent. * Adding a custom toString method for the results map. * Fixing collision checking bug * Fixing rounding bug. * Fixing collision bug. --------- Co-authored-by: jonathan <jonathan@example.com>
This commit is contained in:
		
							
								
								
									
										27
									
								
								calculate_average_jonathan-aotearoa.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										27
									
								
								calculate_average_jonathan-aotearoa.sh
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,27 @@ | |||||||
|  | #!/bin/sh | ||||||
|  | # | ||||||
|  | #  Copyright 2023 The original authors | ||||||
|  | # | ||||||
|  | #  Licensed under the Apache License, Version 2.0 (the "License"); | ||||||
|  | #  you may not use this file except in compliance with the License. | ||||||
|  | #  You may obtain a copy of the License at | ||||||
|  | # | ||||||
|  | #      http://www.apache.org/licenses/LICENSE-2.0 | ||||||
|  | # | ||||||
|  | #  Unless required by applicable law or agreed to in writing, software | ||||||
|  | #  distributed under the License is distributed on an "AS IS" BASIS, | ||||||
|  | #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||||
|  | #  See the License for the specific language governing permissions and | ||||||
|  | #  limitations under the License. | ||||||
|  | # | ||||||
|  |  | ||||||
|  | if [ -f target/CalculateAverage_jonathan-aotearoa_image ]; then | ||||||
|  |     echo "Using native image 'target/CalculateAverage_jonathan-aotearoa_image'. Delete this file to select JVM mode." 1>&2 | ||||||
|  |     target/CalculateAverage_jonathan-aotearoa_image | ||||||
|  | else | ||||||
|  |     JAVA_OPTS="--enable-preview -XX:+UnlockExperimentalVMOptions -XX:+TrustFinalNonStaticFields -dsa -XX:+UseNUMA" | ||||||
|  |     JAVA_OPTS="$JAVA_OPTS -XX:+UseTransparentHugePages" | ||||||
|  |     echo "Running in JVM mode as no native image was found. Run 'prepare_jonathan-aotearoa.sh' to generate a native image." 1>&2 | ||||||
|  |     java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_jonathanaotearoa | ||||||
|  | fi | ||||||
|  |  | ||||||
							
								
								
									
										28
									
								
								prepare_jonathan-aotearoa.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										28
									
								
								prepare_jonathan-aotearoa.sh
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,28 @@ | |||||||
|  | #!/bin/bash | ||||||
|  | # | ||||||
|  | #  Copyright 2023 The original authors | ||||||
|  | # | ||||||
|  | #  Licensed under the Apache License, Version 2.0 (the "License"); | ||||||
|  | #  you may not use this file except in compliance with the License. | ||||||
|  | #  You may obtain a copy of the License at | ||||||
|  | # | ||||||
|  | #      http://www.apache.org/licenses/LICENSE-2.0 | ||||||
|  | # | ||||||
|  | #  Unless required by applicable law or agreed to in writing, software | ||||||
|  | #  distributed under the License is distributed on an "AS IS" BASIS, | ||||||
|  | #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||||
|  | #  See the License for the specific language governing permissions and | ||||||
|  | #  limitations under the License. | ||||||
|  | # | ||||||
|  |  | ||||||
|  | source "$HOME/.sdkman/bin/sdkman-init.sh" | ||||||
|  | sdk use java 21.0.2-graal 1>&2 | ||||||
|  |  | ||||||
|  | if [ ! -f target/CalculateAverage_jonathan-aotearoa_image ]; then | ||||||
|  |     # Enable preview features and disable system assertions. | ||||||
|  |     JAVA_OPTS="--enable-preview -dsa" | ||||||
|  |     # Use the no-op GC. | ||||||
|  |     # Enable CPU features (-march=native) and level-3 optimisations (-O3) | ||||||
|  |     NATIVE_IMAGE_OPTS="--initialize-at-build-time=dev.morling.onebrc.CalculateAverage_jonathanaotearoa --gc=epsilon -O3 -march=native --strict-image-heap $JAVA_OPTS" | ||||||
|  |     native-image $NATIVE_IMAGE_OPTS -cp target/average-1.0.0-SNAPSHOT.jar -o target/CalculateAverage_jonathan-aotearoa_image dev.morling.onebrc.CalculateAverage_jonathanaotearoa | ||||||
|  | fi | ||||||
| @@ -0,0 +1,587 @@ | |||||||
|  | /* | ||||||
|  |  *  Copyright 2023 The original authors | ||||||
|  |  * | ||||||
|  |  *  Licensed under the Apache License, Version 2.0 (the "License"); | ||||||
|  |  *  you may not use this file except in compliance with the License. | ||||||
|  |  *  You may obtain a copy of the License at | ||||||
|  |  * | ||||||
|  |  *      http://www.apache.org/licenses/LICENSE-2.0 | ||||||
|  |  * | ||||||
|  |  *  Unless required by applicable law or agreed to in writing, software | ||||||
|  |  *  distributed under the License is distributed on an "AS IS" BASIS, | ||||||
|  |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||||
|  |  *  See the License for the specific language governing permissions and | ||||||
|  |  *  limitations under the License. | ||||||
|  |  */ | ||||||
|  | package dev.morling.onebrc; | ||||||
|  |  | ||||||
|  | import sun.misc.Unsafe; | ||||||
|  |  | ||||||
|  | import java.io.IOException; | ||||||
|  | import java.lang.foreign.Arena; | ||||||
|  | import java.lang.reflect.Field; | ||||||
|  | import java.nio.ByteBuffer; | ||||||
|  | import java.nio.ByteOrder; | ||||||
|  | import java.nio.channels.FileChannel; | ||||||
|  | import java.nio.charset.StandardCharsets; | ||||||
|  | import java.nio.file.DirectoryStream; | ||||||
|  | import java.nio.file.Files; | ||||||
|  | import java.nio.file.Path; | ||||||
|  | import java.nio.file.StandardOpenOption; | ||||||
|  | import java.util.*; | ||||||
|  | import java.util.concurrent.ForkJoinPool; | ||||||
|  | import java.util.stream.Collectors; | ||||||
|  | import java.util.stream.Stream; | ||||||
|  |  | ||||||
|  | public class CalculateAverage_jonathanaotearoa { | ||||||
|  |  | ||||||
|  |     public static final Unsafe UNSAFE; | ||||||
|  |  | ||||||
|  |     static { | ||||||
|  |         try { | ||||||
|  |             final Field theUnsafe = Unsafe.class.getDeclaredField("theUnsafe"); | ||||||
|  |             theUnsafe.setAccessible(true); | ||||||
|  |             UNSAFE = (Unsafe) theUnsafe.get(null); | ||||||
|  |         } catch (NoSuchFieldException | IllegalAccessException e) { | ||||||
|  |             throw new RuntimeException(STR."Error getting instance of \{Unsafe.class.getName()}"); | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     private static final int WORD_BYTES = Long.BYTES; | ||||||
|  |     private static final Path FILE_PATH = Path.of("./measurements.txt"); | ||||||
|  |     private static final Path SAMPLE_DIR_PATH = Path.of("./src/test/resources/samples"); | ||||||
|  |     private static final byte MAX_LINE_BYTES = 107; | ||||||
|  |     private static final byte NEW_LINE_BYTE = '\n'; | ||||||
|  |     private static final long SEPARATOR_XOR_MASK = 0x3b3b3b3b3b3b3b3bL; | ||||||
|  |  | ||||||
|  |     // A mask where the 4th bit of the 5th, 6th and 7th bytes is set to 1. | ||||||
|  |     // Leverages the fact that the 4th bit of a digit byte will 1. | ||||||
|  |     // Whereas the 4th bit of the decimal point byte will be 0. | ||||||
|  |     // Assumes little endianness. | ||||||
|  |     private static final long DECIMAL_POINT_MASK = 0x10101000L; | ||||||
|  |  | ||||||
|  |     // This mask performs two tasks: | ||||||
|  |     // Sets the right-most and 3 left-most bytes to zero. | ||||||
|  |     // Given a temp value be at most 5 bytes in length, .e.g -99.9, we can safely ignore the last 3 bytes. | ||||||
|  |     // Subtracts 48, i.e. the UFT-8 value offset, from the digits bytes. | ||||||
|  |     // As a result, '0' (48) becomes 0, '1' (49) becomes 1, and so on. | ||||||
|  |     private static final long TEMP_DIGITS_MASK = 0x0f000f0f00L; | ||||||
|  |  | ||||||
|  |     public static void main(final String[] args) throws IOException { | ||||||
|  |         assert ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN : "Big endian byte order is not supported"; | ||||||
|  |         System.out.println(resultsToString(processFile(FILE_PATH))); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /** | ||||||
|  |      * A custom version of AbstractMap's toString() method. | ||||||
|  |      * <p> | ||||||
|  |      * This should be more performant as we can: | ||||||
|  |      * <ul> | ||||||
|  |      *     <li>Set the initial capacity of the string builder</li> | ||||||
|  |      *     <li>Append double values directly, which avoids string creation</li> | ||||||
|  |      * </ul> | ||||||
|  |      * </p> | ||||||
|  |      * | ||||||
|  |      * @param results the results. | ||||||
|  |      * @return a string representation of the results. | ||||||
|  |      */ | ||||||
|  |     private static String resultsToString(final Map<String, TemperatureData> results) { | ||||||
|  |         final Iterator<Map.Entry<String, TemperatureData>> i = results.entrySet().iterator(); | ||||||
|  |         if (!i.hasNext()) { | ||||||
|  |             System.out.println("{}"); | ||||||
|  |         } | ||||||
|  |         // Capacity based the output for measurements.txt. | ||||||
|  |         final StringBuilder sb = new StringBuilder(1100).append('{'); | ||||||
|  |         while (i.hasNext()) { | ||||||
|  |             Map.Entry<String, TemperatureData> e = i.next(); | ||||||
|  |             sb.append(e.getKey()) | ||||||
|  |                     .append('=') | ||||||
|  |                     .append(e.getValue().getMin()) | ||||||
|  |                     .append('/') | ||||||
|  |                     .append(e.getValue().getMean()) | ||||||
|  |                     .append('/') | ||||||
|  |                     .append(e.getValue().getMax()); | ||||||
|  |             if (i.hasNext()) { | ||||||
|  |                 sb.append(',').append(' '); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |         sb.append('}'); | ||||||
|  |         return sb.toString(); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /** | ||||||
|  |      * Processes the specified file. | ||||||
|  |      * <p> | ||||||
|  |      * Extracted from the main method for testability. | ||||||
|  |      * </p> | ||||||
|  |      * | ||||||
|  |      * @param filePath the path of the file we want to process. | ||||||
|  |      * @return a sorted map of station data keyed by station name. | ||||||
|  |      * @throws IOException if an error occurs. | ||||||
|  |      */ | ||||||
|  |     private static SortedMap<String, TemperatureData> processFile(final Path filePath) throws IOException { | ||||||
|  |         assert filePath != null : "filePath cannot be null"; | ||||||
|  |         assert Files.isRegularFile(filePath) : STR."\{filePath.toAbsolutePath()} is not a valid file"; | ||||||
|  |  | ||||||
|  |         try (final FileChannel fc = FileChannel.open(filePath, StandardOpenOption.READ)) { | ||||||
|  |             final long fileSize = fc.size(); | ||||||
|  |             if (fileSize < WORD_BYTES) { | ||||||
|  |                 // The file size is less than our word size. | ||||||
|  |                 // Keep it simple and fall back to non-performant processing. | ||||||
|  |                 return processTinyFile(fc, fileSize); | ||||||
|  |             } | ||||||
|  |             return processFile(fc, fileSize); | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /** | ||||||
|  |      * An unoptimised method for processing a tiny file. | ||||||
|  |      * <p> | ||||||
|  |      * Handling tiny files in a separate method reduces the complexity of {@link #processFile(FileChannel, long)}. | ||||||
|  |      * </p> | ||||||
|  |      * | ||||||
|  |      * @param fc       the file channel to read from. | ||||||
|  |      * @param fileSize the file size in bytes. | ||||||
|  |      * @return a sorted map of station data keyed by station name. | ||||||
|  |      * @throws IOException if an error occurs reading from the file channel. | ||||||
|  |      */ | ||||||
|  |     private static SortedMap<String, TemperatureData> processTinyFile(final FileChannel fc, final long fileSize) throws IOException { | ||||||
|  |         final ByteBuffer byteBuffer = ByteBuffer.allocate((int) fileSize); | ||||||
|  |         fc.read(byteBuffer); | ||||||
|  |         return new String(byteBuffer.array(), StandardCharsets.UTF_8) | ||||||
|  |                 .lines() | ||||||
|  |                 .map(line -> line.trim().split(";")) | ||||||
|  |                 .map(tokens -> { | ||||||
|  |                     final String stationName = tokens[0]; | ||||||
|  |                     final short temp = Short.parseShort(tokens[1].replace(".", "")); | ||||||
|  |                     return new SimpleStationData(stationName, temp); | ||||||
|  |                 }) | ||||||
|  |                 .collect(Collectors.toMap( | ||||||
|  |                         sd -> sd.name, | ||||||
|  |                         sd -> sd, | ||||||
|  |                         TemperatureData::merge, | ||||||
|  |                         TreeMap::new)); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /** | ||||||
|  |      * An optimised method for processing files > {@link Long#BYTES} in size. | ||||||
|  |      * | ||||||
|  |      * @param fc       the file channel to map into memory. | ||||||
|  |      * @param fileSize the file size in bytes. | ||||||
|  |      * @return a sorted map of station data keyed by station name. | ||||||
|  |      * @throws IOException if an error occurs mapping the file channel into memory. | ||||||
|  |      */ | ||||||
|  |     private static SortedMap<String, TemperatureData> processFile(final FileChannel fc, final long fileSize) throws IOException { | ||||||
|  |         assert fileSize >= WORD_BYTES : STR."File size cannot be less than word size \{WORD_BYTES}, but was \{fileSize}"; | ||||||
|  |  | ||||||
|  |         try (final Arena arena = Arena.ofConfined()) { | ||||||
|  |             final long fileAddress = fc.map(FileChannel.MapMode.READ_ONLY, 0, fileSize, arena).address(); | ||||||
|  |             return createChunks(fileAddress, fileSize) | ||||||
|  |                     .parallel() | ||||||
|  |                     .map(CalculateAverage_jonathanaotearoa::processChunk) | ||||||
|  |                     .flatMap(Repository::entries) | ||||||
|  |                     .collect(Collectors.toMap( | ||||||
|  |                             StationData::getName, | ||||||
|  |                             sd -> sd, | ||||||
|  |                             TemperatureData::merge, | ||||||
|  |                             TreeMap::new)); | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /** | ||||||
|  |      * Divides the file into chunks that can be processed in parallel. | ||||||
|  |      * <p> | ||||||
|  |      * If dividing the file into {@link ForkJoinPool#getCommonPoolParallelism() parallelism} chunks would result in a | ||||||
|  |      * chunk size less than the maximum line size in bytes, then a single chunk is returned for the entire file. | ||||||
|  |      * </p> | ||||||
|  |      * | ||||||
|  |      * @param fileAddress the address of the file. | ||||||
|  |      * @param fileSize    the size of the file in bytes. | ||||||
|  |      * @return a stream of chunks. | ||||||
|  |      */ | ||||||
|  |     private static Stream<Chunk> createChunks(final long fileAddress, final long fileSize) { | ||||||
|  |         // The number of cores - 1. | ||||||
|  |         final int parallelism = ForkJoinPool.getCommonPoolParallelism(); | ||||||
|  |         final long chunkStep = fileSize / parallelism; | ||||||
|  |         final long lastFileByteAddress = fileAddress + fileSize - 1; | ||||||
|  |         if (chunkStep < MAX_LINE_BYTES) { | ||||||
|  |             // We're dealing with a small file, return a single chunk. | ||||||
|  |             return Stream.of(new Chunk(fileAddress, lastFileByteAddress, true)); | ||||||
|  |         } | ||||||
|  |         final Chunk[] chunks = new Chunk[parallelism]; | ||||||
|  |         long startAddress = fileAddress; | ||||||
|  |         for (int i = 0, n = parallelism - 1; i < n; i++) { | ||||||
|  |             // Find end of the *previous* line. | ||||||
|  |             // We know there's a previous line in this chunk because chunkStep >= MAX_LINE_BYTES. | ||||||
|  |             // The last chunk may be slightly bigger than the others. | ||||||
|  |             // For a 1 billion line file, this has zero impact. | ||||||
|  |             long lastByteAddress = startAddress + chunkStep; | ||||||
|  |             while (UNSAFE.getByte(lastByteAddress) != NEW_LINE_BYTE) { | ||||||
|  |                 lastByteAddress--; | ||||||
|  |             } | ||||||
|  |             // We've found the end of the previous line. | ||||||
|  |             chunks[i] = new Chunk(startAddress, lastByteAddress, false); | ||||||
|  |             startAddress = ++lastByteAddress; | ||||||
|  |         } | ||||||
|  |         // The remaining bytes are assigned to the last chunk. | ||||||
|  |         chunks[chunks.length - 1] = (new Chunk(startAddress, lastFileByteAddress, true)); | ||||||
|  |         return Stream.of(chunks); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /** | ||||||
|  |      * Does the work of processing a chunk. | ||||||
|  |      * | ||||||
|  |      * @param chunk the chunk to process. | ||||||
|  |      * @return a repository containing the chunk's station data. | ||||||
|  |      */ | ||||||
|  |     private static Repository processChunk(final Chunk chunk) { | ||||||
|  |         final Repository repo = new Repository(); | ||||||
|  |         long address = chunk.startAddress; | ||||||
|  |  | ||||||
|  |         while (address <= chunk.lastByteAddress) { | ||||||
|  |             // Read station name. | ||||||
|  |             long nameAddress = address; | ||||||
|  |             long nameWord; | ||||||
|  |             long separatorMask; | ||||||
|  |             int nameHash = 1; | ||||||
|  |  | ||||||
|  |             while (true) { | ||||||
|  |                 nameWord = chunk.getWord(address); | ||||||
|  |  | ||||||
|  |                 // Based on the Hacker's Delight "Find First 0-Byte" branch-free, 5-instruction, algorithm. | ||||||
|  |                 // See also https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord | ||||||
|  |                 final long separatorXorResult = nameWord ^ SEPARATOR_XOR_MASK; | ||||||
|  |                 // If the separator is not present, all bits in the mask will be zero. | ||||||
|  |                 // If the separator is present, the first bit of the corresponding byte in the mask will be 1. | ||||||
|  |                 separatorMask = (separatorXorResult - 0x0101010101010101L) & (~separatorXorResult & 0x8080808080808080L); | ||||||
|  |                 if (separatorMask == 0) { | ||||||
|  |                     address += Long.BYTES; | ||||||
|  |                     // Multiplicative hashing, as per Arrays.hashCode(). | ||||||
|  |                     // We could use XOR here, but it "might" produce more collisions. | ||||||
|  |                     nameHash = 31 * nameHash + Long.hashCode(nameWord); | ||||||
|  |                 } | ||||||
|  |                 else { | ||||||
|  |                     break; | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |  | ||||||
|  |             // We've found the separator. | ||||||
|  |             // We only support little endian, so we use the *trailing* number of zeros to get the number of name bits. | ||||||
|  |             final int numberOfNameBits = Long.numberOfTrailingZeros(separatorMask) & ~7; | ||||||
|  |             final int numberOfNameBytes = numberOfNameBits >> 3; | ||||||
|  |             final long separatorAddress = address + numberOfNameBytes; | ||||||
|  |  | ||||||
|  |             if (numberOfNameBytes > 0) { | ||||||
|  |                 // Truncate the word, so we only have the portion before the separator, i.e. the name bytes. | ||||||
|  |                 final int bitsToDiscard = Long.SIZE - numberOfNameBits; | ||||||
|  |                 // Little endian. | ||||||
|  |                 final long truncatedNameWord = (nameWord << bitsToDiscard) >>> bitsToDiscard; | ||||||
|  |                 nameHash = 31 * nameHash + Long.hashCode(truncatedNameWord); | ||||||
|  |             } | ||||||
|  |  | ||||||
|  |             final long tempAddress = separatorAddress + 1; | ||||||
|  |             final long tempWord = chunk.getWord(tempAddress); | ||||||
|  |  | ||||||
|  |             // "0" in UTF-8 is 48, which is 00110000 in binary. | ||||||
|  |             // The first 4 bits of any UTF-8 digit byte are therefore 0011. | ||||||
|  |  | ||||||
|  |             // Get the position of the decimal point... | ||||||
|  |             // "." in UTF-8 is 46, which is 00101110 in binary. | ||||||
|  |             // We can therefore use the 4th bit to check which byte is the decimal point. | ||||||
|  |             final int decimalPointIndex = Long.numberOfTrailingZeros(~tempWord & DECIMAL_POINT_MASK) >> 3; | ||||||
|  |  | ||||||
|  |             // Check if we've got a negative or positive number... | ||||||
|  |             // "-" in UTF-8 is 45, which is 00101101 in binary. | ||||||
|  |             // As per above, we use the 4th bit to check if the word contains a positive, or negative, temperature. | ||||||
|  |             // If the temperature is negative, the value of "sign" will be -1. If it's positive, it'll be 0. | ||||||
|  |             final long sign = (~tempWord << 59) >> 63; | ||||||
|  |  | ||||||
|  |             // Create a mask that zeros out the minus-sign byte, if present. | ||||||
|  |             // Little endian, i.e. the minus sign is the right-most byte. | ||||||
|  |             final long signMask = ~(sign & 0xFF); | ||||||
|  |  | ||||||
|  |             // To get the temperature value, we left-shift the digit bytes into the following, known, positions. | ||||||
|  |             // 0x00 0x00 0x00 <fractional-digit> 0x00 <integer-part-digit> <integer-part-digit> 0x00 | ||||||
|  |             // Because we're ANDing with the sign mask, if the value only has a single integer-part digit, the right-most one will be zero. | ||||||
|  |             final int leftShift = (3 - decimalPointIndex) * Byte.SIZE; | ||||||
|  |             final long digitsWord = ((tempWord & signMask) << leftShift) & TEMP_DIGITS_MASK; | ||||||
|  |  | ||||||
|  |             // Get the unsigned int value. | ||||||
|  |             final byte b100 = (byte) (digitsWord >> 8); | ||||||
|  |             final byte b10 = (byte) (digitsWord >> 16); | ||||||
|  |             final byte b1 = (byte) (digitsWord >> 32); | ||||||
|  |             final short unsignedTemp = (short) (b100 * 100 + b10 * 10 + b1); | ||||||
|  |             final short temp = (short) ((unsignedTemp + sign) ^ sign); | ||||||
|  |  | ||||||
|  |             final byte nameSize = (byte) (separatorAddress - nameAddress); | ||||||
|  |             repo.addTemp(nameHash, nameAddress, nameSize, temp); | ||||||
|  |  | ||||||
|  |             // Calculate the address of the next line. | ||||||
|  |             address = tempAddress + decimalPointIndex + 3; | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         return repo; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /** | ||||||
|  |      * Represents a portion of a file containing 1 or more whole lines. | ||||||
|  |      * | ||||||
|  |      * @param startAddress    the memory address of the first byte. | ||||||
|  |      * @param lastByteAddress the memory address of the last byte. | ||||||
|  |      * @param lastWordAddress the memory address of the last whole word. | ||||||
|  |      * @param isLast          whether this is the last chunk. | ||||||
|  |      */ | ||||||
|  |     private record Chunk(long startAddress, long lastByteAddress, long lastWordAddress, boolean isLast) { | ||||||
|  |  | ||||||
|  |         public Chunk(final long startAddress, final long lastByteAddress, final boolean isLast) { | ||||||
|  |             this(startAddress, lastByteAddress, lastByteAddress - (Long.BYTES - 1), isLast); | ||||||
|  |  | ||||||
|  |             assert lastByteAddress > startAddress : STR."lastByteAddress \{lastByteAddress} must be > startAddress \{startAddress}"; | ||||||
|  |             assert lastWordAddress >= startAddress : STR."lastWordAddress \{lastWordAddress} must be >= startAddress \{startAddress}"; | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         /** | ||||||
|  |          * Gets an 8 byte word from this chunk. | ||||||
|  |          * <p> | ||||||
|  |          * If the specified address is greater than {@link Chunk#lastWordAddress} and {@link Chunk#isLast}, the word | ||||||
|  |          * will be truncated. This ensures we never read beyond the end of the file. | ||||||
|  |          * </p> | ||||||
|  |          * | ||||||
|  |          * @param address the address of the word we want. | ||||||
|  |          * @return the word at the specified address. | ||||||
|  |          */ | ||||||
|  |         public long getWord(final long address) { | ||||||
|  |             assert address >= startAddress : STR."address must be >= startAddress \{startAddress}, but was \{address}"; | ||||||
|  |             assert address < lastByteAddress : STR."address must be < lastByteAddress \{lastByteAddress}, but was \{address}"; | ||||||
|  |  | ||||||
|  |             if (isLast && address > lastWordAddress) { | ||||||
|  |                 // Make sure we don't read beyond the end of the file and potentially crash the JVM. | ||||||
|  |                 final long word = UNSAFE.getLong(lastWordAddress); | ||||||
|  |                 final int bytesToDiscard = (int) (address - lastWordAddress); | ||||||
|  |                 // As with elsewhere, this assumes little endianness. | ||||||
|  |                 return word >>> (bytesToDiscard << 3); | ||||||
|  |             } | ||||||
|  |             return UNSAFE.getLong(address); | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /** | ||||||
|  |      * Abstract class encapsulating temperature data. | ||||||
|  |      */ | ||||||
|  |     private static abstract class TemperatureData { | ||||||
|  |  | ||||||
|  |         private short min; | ||||||
|  |         private short max; | ||||||
|  |         private long sum; | ||||||
|  |         private int count; | ||||||
|  |  | ||||||
|  |         protected TemperatureData(final short temp) { | ||||||
|  |             min = max = temp; | ||||||
|  |             sum = temp; | ||||||
|  |             count = 1; | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         void addTemp(final short temp) { | ||||||
|  |             if (temp < min) { | ||||||
|  |                 min = temp; | ||||||
|  |             } | ||||||
|  |             else if (temp > max) { | ||||||
|  |                 max = temp; | ||||||
|  |             } | ||||||
|  |             sum += temp; | ||||||
|  |             count++; | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         TemperatureData merge(final TemperatureData other) { | ||||||
|  |             if (other.min < min) { | ||||||
|  |                 min = other.min; | ||||||
|  |             } | ||||||
|  |             if (other.max > max) { | ||||||
|  |                 max = other.max; | ||||||
|  |             } | ||||||
|  |             sum += other.sum; | ||||||
|  |             count += other.count; | ||||||
|  |             return this; | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         double getMin() { | ||||||
|  |             return round(((double) min) / 10.0); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         double getMax() { | ||||||
|  |             return round(((double) max) / 10.0); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         double getMean() { | ||||||
|  |             return round((((double) sum) / 10.0) / count); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         private static double round(double value) { | ||||||
|  |             return Math.round(value * 10.0) / 10.0; | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /** | ||||||
|  |      * For use with tiny files. | ||||||
|  |      * | ||||||
|  |      * @see CalculateAverage_jonathanaotearoa#processTinyFile(FileChannel, long). | ||||||
|  |      */ | ||||||
|  |     private static final class SimpleStationData extends TemperatureData implements Comparable<SimpleStationData> { | ||||||
|  |  | ||||||
|  |         private final String name; | ||||||
|  |  | ||||||
|  |         SimpleStationData(final String name, final short temp) { | ||||||
|  |             super(temp); | ||||||
|  |             this.name = name; | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         @Override | ||||||
|  |         public int compareTo(final SimpleStationData other) { | ||||||
|  |             return name.compareTo(other.name); | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     private static final class StationData extends TemperatureData implements Comparable<StationData> { | ||||||
|  |  | ||||||
|  |         private final int nameHash; | ||||||
|  |         private final long nameAddress; | ||||||
|  |         private final byte nameSize; | ||||||
|  |         private String name; | ||||||
|  |  | ||||||
|  |         StationData(final int nameHash, final long nameAddress, final byte nameSize, final short temp) { | ||||||
|  |             super(temp); | ||||||
|  |             this.nameAddress = nameAddress; | ||||||
|  |             this.nameSize = nameSize; | ||||||
|  |             this.nameHash = nameHash; | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         @Override | ||||||
|  |         public int compareTo(final StationData other) { | ||||||
|  |             return getName().compareTo(other.getName()); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         String getName() { | ||||||
|  |             if (name == null) { | ||||||
|  |                 final byte[] nameBytes = new byte[nameSize]; | ||||||
|  |                 UNSAFE.copyMemory(null, nameAddress, nameBytes, UNSAFE.arrayBaseOffset(nameBytes.getClass()), nameSize); | ||||||
|  |                 name = new String(nameBytes, StandardCharsets.UTF_8); | ||||||
|  |             } | ||||||
|  |             return name; | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /** | ||||||
|  |      * Open addressing, linear probing, hash map repository. | ||||||
|  |      */ | ||||||
|  |     private static final class Repository { | ||||||
|  |  | ||||||
|  |         private static final int CAPACITY = 100_003; | ||||||
|  |         private static final int LAST_INDEX = CAPACITY - 1; | ||||||
|  |  | ||||||
|  |         private final StationData[] table; | ||||||
|  |  | ||||||
|  |         public Repository() { | ||||||
|  |             this.table = new StationData[CAPACITY]; | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         /** | ||||||
|  |          * Adds a station temperature value to this repository. | ||||||
|  |          * | ||||||
|  |          * @param nameHash    the station name hash. | ||||||
|  |          * @param nameAddress the station name address in memory. | ||||||
|  |          * @param nameSize    the station name size in bytes. | ||||||
|  |          * @param temp        the temperature value. | ||||||
|  |          */ | ||||||
|  |         public void addTemp(final int nameHash, final long nameAddress, final byte nameSize, short temp) { | ||||||
|  |             final int index = findIndex(nameHash, nameAddress, nameSize); | ||||||
|  |             if (table[index] == null) { | ||||||
|  |                 table[index] = new StationData(nameHash, nameAddress, nameSize, temp); | ||||||
|  |             } | ||||||
|  |             else { | ||||||
|  |                 table[index].addTemp(temp); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         public Stream<StationData> entries() { | ||||||
|  |             return Arrays.stream(table).filter(Objects::nonNull); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         private int findIndex(int nameHash, final long nameAddress, final byte nameSize) { | ||||||
|  |             // Think about replacing modulo. | ||||||
|  |             // https://lemire.me/blog/2018/08/20/performance-of-ranged-accesses-into-arrays-modulo-multiply-shift-and-masks/ | ||||||
|  |             int index = (nameHash & 0x7FFFFFFF) % CAPACITY; | ||||||
|  |             while (isCollision(index, nameHash, nameAddress, nameSize)) { | ||||||
|  |                 index = index == LAST_INDEX ? 0 : index + 1; | ||||||
|  |             } | ||||||
|  |             return index; | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         private boolean isCollision(final int index, final long nameHash, final long nameAddress, final byte nameSize) { | ||||||
|  |             final StationData existing = table[index]; | ||||||
|  |             if (existing == null) { | ||||||
|  |                 return false; | ||||||
|  |             } | ||||||
|  |             if (nameHash != existing.nameHash) { | ||||||
|  |                 return true; | ||||||
|  |             } | ||||||
|  |             if (nameSize != existing.nameSize) { | ||||||
|  |                 return true; | ||||||
|  |             } | ||||||
|  |             // Last resort; check if the names are the same. | ||||||
|  |             // This is real performance hit :( | ||||||
|  |             return !isMemoryEqual(nameAddress, existing.nameAddress, nameSize); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         /** | ||||||
|  |          * Checks if two locations in memory have the same value. | ||||||
|  |          * | ||||||
|  |          * @param address1 the address of the first location. | ||||||
|  |          * @param address2 the address of the second locations. | ||||||
|  |          * @param size     the number of bytes to check for equality. | ||||||
|  |          * @return true if both addresses contain the same bytes. | ||||||
|  |          */ | ||||||
|  |         private static boolean isMemoryEqual(final long address1, final long address2, final byte size) { | ||||||
|  |             // Checking 1 byte at a time, so we can bail as early as possible. | ||||||
|  |             for (int offset = 0; offset < size; offset++) { | ||||||
|  |                 final byte b1 = UNSAFE.getByte(address1 + offset); | ||||||
|  |                 final byte b2 = UNSAFE.getByte(address2 + offset); | ||||||
|  |                 if (b1 != b2) { | ||||||
|  |                     return false; | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |             return true; | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /** | ||||||
|  |      * Helper for running tests without blowing away the main measurements.txt file. | ||||||
|  |      * Saves regenerating the 1 billion line file after each test run. | ||||||
|  |      * Enable assertions in the IDE run config. | ||||||
|  |      */ | ||||||
|  |     public static final class TestRunner { | ||||||
|  |         public static void main(String[] args) throws IOException { | ||||||
|  |             final StringBuilder testResults = new StringBuilder(); | ||||||
|  |             try (DirectoryStream<Path> dirStream = Files.newDirectoryStream(SAMPLE_DIR_PATH, "*.txt")) { | ||||||
|  |                 dirStream.forEach(filePath -> { | ||||||
|  |                     testResults.append(STR."Testing '\{filePath.getFileName()}'... "); | ||||||
|  |                     final String expectedResultFileName = filePath.getFileName().toString().replace(".txt", ".out"); | ||||||
|  |                     try { | ||||||
|  |                         final String expected = Files.readString(SAMPLE_DIR_PATH.resolve(expectedResultFileName)); | ||||||
|  |                         final SortedMap<String, TemperatureData> results = processFile(filePath); | ||||||
|  |                         // Appending \n to the results string to mimic println(). | ||||||
|  |                         final String actual = STR."\{resultsToString(results)}\n"; | ||||||
|  |                         if (actual.equals(expected)) { | ||||||
|  |                             testResults.append("Passed\n"); | ||||||
|  |                         } else { | ||||||
|  |                             testResults.append("Failed. Actual output does not match expected\n"); | ||||||
|  |                         } | ||||||
|  |                     } catch (IOException e) { | ||||||
|  |                         throw new RuntimeException(STR."Error testing '\{filePath.getFileName()}"); | ||||||
|  |                     } | ||||||
|  |                 }); | ||||||
|  |             } finally { | ||||||
|  |                 System.out.println(testResults); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | } | ||||||
		Reference in New Issue
	
	Block a user