Primitive hash (#345)
* Calculate average by vaidhy * Calculate average by vaidhy * More changes * remove worker log * Pass -Dparellelism and switch back to open * Try out mmap * Improve mmap solution * no copy version * reduce threads * hash code computed on the fly * Reuse the char (Do not know if it helps) * primitive hash map * Primite HashMap * Micro optimizations to push for optimizations * Revert "Micro optimizations to push for optimizations" This reverts commit ea333e2821ebb5c1d6d71a4e87e569a8f2f8f7f0. * Micro optimizations to get the juice * floorMod fixes * findSemi and findNewLine as separate functions * Optimized parseDouble * More micro changes * Aligned equal check * more small changes * XOR instead of compare * Reduce loop length * Revert changes * Loop optimization and added native build * Hand unrolled findSemi loop. * Remove incorrect comments * Taking care fo PR comments * Add prepare script * Missing header error fix * remove wrong comment --------- Co-authored-by: Anita S V <anitasvasu@gmail.com> Co-authored-by: Anita SV <anitvasu@amazon.com>
This commit is contained in:
parent
a1adf191e1
commit
33c614a1e3
19
calculate_average_vaidhy.sh
Executable file
19
calculate_average_vaidhy.sh
Executable file
@ -0,0 +1,19 @@
|
||||
#!/bin/sh
|
||||
#
|
||||
# Copyright 2023 The original authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
JAVA_OPTS="--enable-preview"
|
||||
java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_vaidhy
|
19
prepare_vaidhy.sh
Executable file
19
prepare_vaidhy.sh
Executable file
@ -0,0 +1,19 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Copyright 2023 The original authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
source "$HOME/.sdkman/bin/sdkman-init.sh"
|
||||
sdk use java 21.0.1-graal 1>&2
|
427
src/main/java/dev/morling/onebrc/CalculateAverage_vaidhy.java
Normal file
427
src/main/java/dev/morling/onebrc/CalculateAverage_vaidhy.java
Normal file
@ -0,0 +1,427 @@
|
||||
/*
|
||||
* Copyright 2023 The original authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package dev.morling.onebrc;
|
||||
|
||||
import sun.misc.Unsafe;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.lang.foreign.Arena;
|
||||
import java.lang.reflect.Field;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.ExecutionException;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.Future;
|
||||
import java.util.function.Function;
|
||||
import java.util.function.Supplier;
|
||||
|
||||
public class CalculateAverage_vaidhy<I, T> {
|
||||
|
||||
private static final class HashEntry {
|
||||
private long startAddress;
|
||||
private long endAddress;
|
||||
private long suffix;
|
||||
private int hash;
|
||||
|
||||
IntSummaryStatistics value;
|
||||
}
|
||||
|
||||
private static class PrimitiveHashMap {
|
||||
private final HashEntry[] entries;
|
||||
private final int twoPow;
|
||||
|
||||
PrimitiveHashMap(int twoPow) {
|
||||
this.twoPow = twoPow;
|
||||
this.entries = new HashEntry[1 << twoPow];
|
||||
for (int i = 0; i < entries.length; i++) {
|
||||
this.entries[i] = new HashEntry();
|
||||
}
|
||||
}
|
||||
|
||||
public HashEntry find(long startAddress, long endAddress, long suffix, int hash) {
|
||||
int len = entries.length;
|
||||
int i = (hash ^ (hash >> twoPow)) & (len - 1);
|
||||
|
||||
do {
|
||||
HashEntry entry = entries[i];
|
||||
if (entry.value == null) {
|
||||
return entry;
|
||||
}
|
||||
if (entry.hash == hash) {
|
||||
long entryLength = entry.endAddress - entry.startAddress;
|
||||
long lookupLength = endAddress - startAddress;
|
||||
if ((entryLength == lookupLength) && (entry.suffix == suffix)) {
|
||||
boolean found = compareEntryKeys(startAddress, endAddress, entry);
|
||||
|
||||
if (found) {
|
||||
return entry;
|
||||
}
|
||||
}
|
||||
}
|
||||
i++;
|
||||
if (i == len) {
|
||||
i = 0;
|
||||
}
|
||||
} while (i != hash);
|
||||
return null;
|
||||
}
|
||||
|
||||
private static boolean compareEntryKeys(long startAddress, long endAddress, HashEntry entry) {
|
||||
long entryIndex = entry.startAddress;
|
||||
long lookupIndex = startAddress;
|
||||
|
||||
for (; (lookupIndex + 7) < endAddress; lookupIndex += 8) {
|
||||
if (UNSAFE.getLong(entryIndex) != UNSAFE.getLong(lookupIndex)) {
|
||||
return false;
|
||||
}
|
||||
entryIndex += 8;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
private static final String FILE = "./measurements.txt";
|
||||
|
||||
private static Unsafe initUnsafe() {
|
||||
try {
|
||||
Field theUnsafe = Unsafe.class.getDeclaredField("theUnsafe");
|
||||
theUnsafe.setAccessible(true);
|
||||
return (Unsafe) theUnsafe.get(Unsafe.class);
|
||||
}
|
||||
catch (NoSuchFieldException | IllegalAccessException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private static final Unsafe UNSAFE = initUnsafe();
|
||||
|
||||
private static int parseDouble(long startAddress, long endAddress) {
|
||||
int normalized;
|
||||
int length = (int) (endAddress - startAddress);
|
||||
if (length == 5) {
|
||||
normalized = (UNSAFE.getByte(startAddress + 1) ^ 0x30);
|
||||
normalized = (normalized << 3) + (normalized << 1) + (UNSAFE.getByte(startAddress + 2) ^ 0x30);
|
||||
normalized = (normalized << 3) + (normalized << 1) + (UNSAFE.getByte(startAddress + 4) ^ 0x30);
|
||||
normalized = -normalized;
|
||||
return normalized;
|
||||
}
|
||||
if (length == 3) {
|
||||
normalized = (UNSAFE.getByte(startAddress) ^ 0x30);
|
||||
normalized = (normalized << 3) + (normalized << 1) + (UNSAFE.getByte(startAddress + 2) ^ 0x30);
|
||||
return normalized;
|
||||
}
|
||||
|
||||
if (UNSAFE.getByte(startAddress) == '-') {
|
||||
normalized = (UNSAFE.getByte(startAddress + 1) ^ 0x30);
|
||||
normalized = (normalized << 3) + (normalized << 1) + (UNSAFE.getByte(startAddress + 3) ^ 0x30);
|
||||
normalized = -normalized;
|
||||
return normalized;
|
||||
}
|
||||
else {
|
||||
normalized = (UNSAFE.getByte(startAddress) ^ 0x30);
|
||||
normalized = (normalized << 3) + (normalized << 1) + (UNSAFE.getByte(startAddress + 1) ^ 0x30);
|
||||
normalized = (normalized << 3) + (normalized << 1) + (UNSAFE.getByte(startAddress + 3) ^ 0x30);
|
||||
return normalized;
|
||||
}
|
||||
}
|
||||
|
||||
interface MapReduce<I> {
|
||||
|
||||
void process(long keyStartAddress, long keyEndAddress, int hash, int temperature, long suffix);
|
||||
|
||||
I result();
|
||||
}
|
||||
|
||||
private final FileService fileService;
|
||||
private final Supplier<MapReduce<I>> chunkProcessCreator;
|
||||
private final Function<List<I>, T> reducer;
|
||||
|
||||
interface FileService {
|
||||
long length();
|
||||
|
||||
long address();
|
||||
}
|
||||
|
||||
CalculateAverage_vaidhy(FileService fileService,
|
||||
Supplier<MapReduce<I>> mapReduce,
|
||||
Function<List<I>, T> reducer) {
|
||||
this.fileService = fileService;
|
||||
this.chunkProcessCreator = mapReduce;
|
||||
this.reducer = reducer;
|
||||
}
|
||||
|
||||
static class LineStream {
|
||||
private final long fileEnd;
|
||||
private final long chunkEnd;
|
||||
|
||||
private long position;
|
||||
private int hash;
|
||||
private long suffix;
|
||||
byte[] b = new byte[4];
|
||||
|
||||
public LineStream(FileService fileService, long offset, long chunkSize) {
|
||||
long fileStart = fileService.address();
|
||||
this.fileEnd = fileStart + fileService.length();
|
||||
this.chunkEnd = fileStart + offset + chunkSize;
|
||||
this.position = fileStart + offset;
|
||||
this.hash = 0;
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return position <= chunkEnd && position < fileEnd;
|
||||
}
|
||||
|
||||
public long findSemi() {
|
||||
int h = 0;
|
||||
long s = 0;
|
||||
long i = position;
|
||||
while ((i + 3) < fileEnd) {
|
||||
// Adding 16 as it is the offset for primitive arrays
|
||||
ByteBuffer.wrap(b).putInt(UNSAFE.getInt(i));
|
||||
|
||||
if (b[3] == 0x3B) {
|
||||
break;
|
||||
}
|
||||
i++;
|
||||
h = ((h << 5) - h) ^ b[3];
|
||||
s = (s << 8) ^ b[3];
|
||||
|
||||
if (b[2] == 0x3B) {
|
||||
break;
|
||||
}
|
||||
i++;
|
||||
h = ((h << 5) - h) ^ b[2];
|
||||
s = (s << 8) ^ b[2];
|
||||
|
||||
if (b[1] == 0x3B) {
|
||||
break;
|
||||
}
|
||||
i++;
|
||||
h = ((h << 5) - h) ^ b[1];
|
||||
s = (s << 8) ^ b[1];
|
||||
|
||||
if (b[0] == 0x3B) {
|
||||
break;
|
||||
}
|
||||
i++;
|
||||
h = ((h << 5) - h) ^ b[0];
|
||||
s = (s << 8) ^ b[0];
|
||||
}
|
||||
|
||||
this.hash = h;
|
||||
this.suffix = s;
|
||||
position = i + 1;
|
||||
return i;
|
||||
}
|
||||
|
||||
public long skipLine() {
|
||||
for (long i = position; i < fileEnd; i++) {
|
||||
byte ch = UNSAFE.getByte(i);
|
||||
if (ch == 0x0a) {
|
||||
position = i + 1;
|
||||
return i;
|
||||
}
|
||||
}
|
||||
position = fileEnd;
|
||||
return fileEnd;
|
||||
}
|
||||
|
||||
public long findTemperature() {
|
||||
position += 3;
|
||||
for (long i = position; i < fileEnd; i++) {
|
||||
byte ch = UNSAFE.getByte(i);
|
||||
if (ch == 0x0a) {
|
||||
position = i + 1;
|
||||
return i;
|
||||
}
|
||||
}
|
||||
position = fileEnd;
|
||||
return fileEnd;
|
||||
}
|
||||
}
|
||||
|
||||
private void worker(long offset, long chunkSize, MapReduce<I> lineConsumer) {
|
||||
LineStream lineStream = new LineStream(fileService, offset, chunkSize);
|
||||
|
||||
if (offset != 0) {
|
||||
if (lineStream.hasNext()) {
|
||||
// Skip the first line.
|
||||
lineStream.skipLine();
|
||||
}
|
||||
else {
|
||||
// No lines then do nothing.
|
||||
return;
|
||||
}
|
||||
}
|
||||
while (lineStream.hasNext()) {
|
||||
long keyStartAddress = lineStream.position;
|
||||
long keyEndAddress = lineStream.findSemi();
|
||||
long keySuffix = lineStream.suffix;
|
||||
int keyHash = lineStream.hash;
|
||||
long valueStartAddress = lineStream.position;
|
||||
long valueEndAddress = lineStream.findTemperature();
|
||||
int temperature = parseDouble(valueStartAddress, valueEndAddress);
|
||||
lineConsumer.process(keyStartAddress, keyEndAddress, keyHash, temperature, keySuffix);
|
||||
}
|
||||
}
|
||||
|
||||
public T master(long chunkSize, ExecutorService executor) {
|
||||
long len = fileService.length();
|
||||
List<Future<I>> summaries = new ArrayList<>();
|
||||
|
||||
for (long offset = 0; offset < len; offset += chunkSize) {
|
||||
long workerLength = Math.min(len, offset + chunkSize) - offset;
|
||||
MapReduce<I> mr = chunkProcessCreator.get();
|
||||
final long transferOffset = offset;
|
||||
Future<I> task = executor.submit(() -> {
|
||||
worker(transferOffset, workerLength, mr);
|
||||
return mr.result();
|
||||
});
|
||||
summaries.add(task);
|
||||
}
|
||||
List<I> summariesDone = summaries.stream()
|
||||
.map(task -> {
|
||||
try {
|
||||
return task.get();
|
||||
}
|
||||
catch (InterruptedException | ExecutionException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
})
|
||||
.toList();
|
||||
return reducer.apply(summariesDone);
|
||||
}
|
||||
|
||||
static class DiskFileService implements FileService {
|
||||
private final long fileSize;
|
||||
private final long mappedAddress;
|
||||
|
||||
DiskFileService(String fileName) throws IOException {
|
||||
FileChannel fileChannel = FileChannel.open(Path.of(fileName),
|
||||
StandardOpenOption.READ);
|
||||
this.fileSize = fileChannel.size();
|
||||
this.mappedAddress = fileChannel.map(FileChannel.MapMode.READ_ONLY, 0,
|
||||
fileSize, Arena.global()).address();
|
||||
}
|
||||
|
||||
@Override
|
||||
public long length() {
|
||||
return fileSize;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long address() {
|
||||
return mappedAddress;
|
||||
}
|
||||
}
|
||||
|
||||
private static class ChunkProcessorImpl implements MapReduce<PrimitiveHashMap> {
|
||||
|
||||
// 1 << 14 > 10,000 so it works
|
||||
private final PrimitiveHashMap statistics = new PrimitiveHashMap(14);
|
||||
|
||||
@Override
|
||||
public void process(long keyStartAddress, long keyEndAddress, int hash, int temperature, long suffix) {
|
||||
HashEntry entry = statistics.find(keyStartAddress, keyEndAddress, suffix, hash);
|
||||
if (entry == null) {
|
||||
throw new IllegalStateException("Hash table too small :(");
|
||||
}
|
||||
if (entry.value == null) {
|
||||
entry.startAddress = keyStartAddress;
|
||||
entry.endAddress = keyEndAddress;
|
||||
entry.suffix = suffix;
|
||||
entry.hash = hash;
|
||||
entry.value = new IntSummaryStatistics();
|
||||
}
|
||||
entry.value.accept(temperature);
|
||||
}
|
||||
|
||||
@Override
|
||||
public PrimitiveHashMap result() {
|
||||
return statistics;
|
||||
}
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
DiskFileService diskFileService = new DiskFileService(FILE);
|
||||
|
||||
CalculateAverage_vaidhy<PrimitiveHashMap, Map<String, IntSummaryStatistics>> calculateAverageVaidhy = new CalculateAverage_vaidhy<>(
|
||||
diskFileService,
|
||||
ChunkProcessorImpl::new,
|
||||
CalculateAverage_vaidhy::combineOutputs);
|
||||
|
||||
int proc = 2 * Runtime.getRuntime().availableProcessors();
|
||||
|
||||
long fileSize = diskFileService.length();
|
||||
long chunkSize = Math.ceilDiv(fileSize, proc);
|
||||
|
||||
ExecutorService executor = Executors.newFixedThreadPool(proc);
|
||||
Map<String, IntSummaryStatistics> output = calculateAverageVaidhy.master(chunkSize, executor);
|
||||
executor.shutdown();
|
||||
|
||||
Map<String, String> outputStr = toPrintMap(output);
|
||||
System.out.println(outputStr);
|
||||
}
|
||||
|
||||
private static Map<String, String> toPrintMap(Map<String, IntSummaryStatistics> output) {
|
||||
|
||||
Map<String, String> outputStr = new TreeMap<>();
|
||||
for (Map.Entry<String, IntSummaryStatistics> entry : output.entrySet()) {
|
||||
IntSummaryStatistics stat = entry.getValue();
|
||||
outputStr.put(entry.getKey(),
|
||||
STR."\{stat.getMin() / 10.0}/\{Math.round(stat.getAverage()) / 10.0}/\{stat.getMax() / 10.0}");
|
||||
}
|
||||
return outputStr;
|
||||
}
|
||||
|
||||
private static Map<String, IntSummaryStatistics> combineOutputs(
|
||||
List<PrimitiveHashMap> list) {
|
||||
|
||||
Map<String, IntSummaryStatistics> output = new HashMap<>(10000);
|
||||
for (PrimitiveHashMap map : list) {
|
||||
for (HashEntry entry : map.entries) {
|
||||
if (entry.value != null) {
|
||||
String keyStr = unsafeToString(entry.startAddress, entry.endAddress);
|
||||
|
||||
output.compute(keyStr, (ignore, val) -> {
|
||||
if (val == null) {
|
||||
return entry.value;
|
||||
}
|
||||
else {
|
||||
val.combine(entry.value);
|
||||
return val;
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
private static String unsafeToString(long startAddress, long endAddress) {
|
||||
byte[] keyBytes = new byte[(int) (endAddress - startAddress)];
|
||||
for (int i = 0; i < keyBytes.length; i++) {
|
||||
keyBytes[i] = UNSAFE.getByte(startAddress + i);
|
||||
}
|
||||
return new String(keyBytes, StandardCharsets.UTF_8);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user