[Attempt #2] String overflow hash + data/mem optimization (#356)

* Use graal

* Use dynamic core count computation

* Use stream API to cleanup code

* Use max processors

* Use hash to avoid init string

* optimize concurrentmap init

* Smaller hash size

* Avoid checking concurrentmap

* Optimize data type

* string dedup

* Faster write

* Change base

* Remove time

* Use mul instead of div
This commit is contained in:
Bang NGUYEN 2024-01-13 12:32:17 +01:00 committed by GitHub
parent 45056e073b
commit dc49249d36
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 83 additions and 77 deletions

View File

@ -19,5 +19,5 @@
# source "$HOME/.sdkman/bin/sdkman-init.sh" # source "$HOME/.sdkman/bin/sdkman-init.sh"
# sdk use java 21.0.1-graal 1>&2 # sdk use java 21.0.1-graal 1>&2
JAVA_OPTS="" JAVA_OPTS="-XX:+UseStringDeduplication"
time java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_gnabyl java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_gnabyl

20
prepare_gnabyl.sh Executable file
View File

@ -0,0 +1,20 @@
#!/bin/bash
#
# Copyright 2023 The original authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Uncomment below to use sdk
source "$HOME/.sdkman/bin/sdkman-init.sh"
sdk use java 21.0.1-graal 1>&2

View File

@ -16,25 +16,30 @@
package dev.morling.onebrc; package dev.morling.onebrc;
import java.io.IOException; import java.io.IOException;
import java.io.PrintWriter;
import java.io.RandomAccessFile; import java.io.RandomAccessFile;
import java.nio.MappedByteBuffer; import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel; import java.nio.channels.FileChannel;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.concurrent.CompletableFuture; import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutionException;
import java.util.stream.Collectors;
public class CalculateAverage_gnabyl { public class CalculateAverage_gnabyl {
private static final String FILE = "./measurements.txt"; private static final String FILE = "./measurements.txt";
private static final int NB_CHUNKS = 8; private static final int NB_CHUNKS = Runtime.getRuntime().availableProcessors();
private static record Chunk(long start, int bytesCount, MappedByteBuffer mappedByteBuffer) { private static Map<Integer, String> stationNameMap = new ConcurrentHashMap<>(10000, 0.9f, NB_CHUNKS);
private static record Chunk(int bytesCount, MappedByteBuffer mappedByteBuffer) {
} }
private static int reduceSizeToFitLineBreak(FileChannel channel, long startPosition, int startSize) private static int reduceSizeToFitLineBreak(FileChannel channel, long startPosition, int startSize)
@ -61,9 +66,9 @@ public class CalculateAverage_gnabyl {
return realSize; return realSize;
} }
private static List<Chunk> readChunks(long nbChunks) throws IOException { private static List<Chunk> readChunks(int nbChunks) throws IOException {
RandomAccessFile file = new RandomAccessFile(FILE, "rw"); RandomAccessFile file = new RandomAccessFile(FILE, "rw");
List<Chunk> res = new ArrayList<>(); List<Chunk> res = new ArrayList<>(nbChunks);
FileChannel channel = file.getChannel(); FileChannel channel = file.getChannel();
long bytesCount = channel.size(); long bytesCount = channel.size();
long bytesPerChunk = bytesCount / nbChunks; long bytesPerChunk = bytesCount / nbChunks;
@ -71,16 +76,18 @@ public class CalculateAverage_gnabyl {
// Memory map the file in read-only mode // Memory map the file in read-only mode
// TODO: Optimize using threads // TODO: Optimize using threads
long currentPosition = 0; long currentPosition = 0;
int startSize;
int realSize;
for (int i = 0; i < nbChunks; i++) { for (int i = 0; i < nbChunks; i++) {
int startSize = (int) bytesPerChunk; startSize = (int) bytesPerChunk;
int realSize = startSize; realSize = startSize;
if (i == nbChunks - 1) { if (i == nbChunks - 1) {
realSize = (int) (bytesCount - currentPosition); realSize = (int) (bytesCount - currentPosition);
MappedByteBuffer mappedByteBuffer = channel.map(FileChannel.MapMode.READ_ONLY, currentPosition, MappedByteBuffer mappedByteBuffer = channel.map(FileChannel.MapMode.READ_ONLY, currentPosition,
realSize); realSize);
res.add(new Chunk(currentPosition, realSize, mappedByteBuffer)); res.add(new Chunk(realSize, mappedByteBuffer));
break; break;
} }
@ -90,7 +97,7 @@ public class CalculateAverage_gnabyl {
MappedByteBuffer mappedByteBuffer = channel.map(FileChannel.MapMode.READ_ONLY, currentPosition, MappedByteBuffer mappedByteBuffer = channel.map(FileChannel.MapMode.READ_ONLY, currentPosition,
realSize); realSize);
res.add(new Chunk(currentPosition, realSize, mappedByteBuffer)); res.add(new Chunk(realSize, mappedByteBuffer));
currentPosition += realSize; currentPosition += realSize;
} }
@ -101,32 +108,32 @@ public class CalculateAverage_gnabyl {
} }
private static class StationData { private static class StationData {
private double sum, min, max; private float sum, min, max;
private long count; private int count;
public StationData(double value) { public StationData(float value) {
this.count = 1; this.count = 1;
this.sum = value; this.sum = value;
this.min = value; this.min = value;
this.max = value; this.max = value;
} }
public void update(double value) { public void update(float value) {
this.count++; this.count++;
this.sum += value; this.sum += value;
this.min = Math.min(this.min, value); this.min = Math.min(this.min, value);
this.max = Math.max(this.max, value); this.max = Math.max(this.max, value);
} }
public double getMean() { public float getMean() {
return sum / count; return sum / count;
} }
public double getMin() { public float getMin() {
return min; return min;
} }
public double getMax() { public float getMax() {
return max; return max;
} }
@ -139,47 +146,44 @@ public class CalculateAverage_gnabyl {
} }
static double round(double value) { static float round(float value) {
return Math.round(value * 10.0) / 10.0; return Math.round(value * 10.0f) * 0.1f;
} }
private static class ChunkResult { private static class ChunkResult {
private Map<String, StationData> data; private Map<Integer, StationData> data;
public ChunkResult() { public ChunkResult() {
data = new HashMap<>(); data = new HashMap<>();
} }
public StationData getData(String name) { public StationData getData(int hash) {
return data.get(name); return data.get(hash);
} }
public void addStation(String name, double value) { public void addStation(int hash, float value) {
this.data.put(name, new StationData(value)); this.data.put(hash, new StationData(value));
} }
public void print() { public void print() {
var stationNames = new ArrayList<String>(this.data.keySet()); PrintWriter out = new PrintWriter(System.out);
Collections.sort(stationNames); out.println(
System.out.print("{"); this.data.keySet().parallelStream()
for (int i = 0; i < stationNames.size() - 1; i++) { .map(hash -> {
var name = stationNames.get(i); var stationData = data.get(hash);
var stationData = data.get(name); var name = stationNameMap.get(hash);
System.out.printf("%s=%.1f/%.1f/%.1f, ", name, round(stationData.getMin()), return String.format("%s=%.1f/%.1f/%.1f", name, round(stationData.getMin()),
round(stationData.getMean()), round(stationData.getMean()),
round(stationData.getMax())); round(stationData.getMax()));
} })
var name = stationNames.get(stationNames.size() - 1); .sorted((a, b) -> a.split("=")[0].compareTo(b.split("=")[0]))
var stationData = data.get(name); .collect(Collectors.joining(", ", "{", "}")));
System.out.printf("%s=%.1f/%.1f/%.1f", name, round(stationData.getMin()), out.flush();
round(stationData.getMean()),
round(stationData.getMax()));
System.out.println("}");
} }
public void mergeWith(ChunkResult other) { public void mergeWith(ChunkResult other) {
for (Map.Entry<String, StationData> entry : other.data.entrySet()) { for (Map.Entry<Integer, StationData> entry : other.data.entrySet()) {
String stationName = entry.getKey(); int stationName = entry.getKey();
StationData otherStationData = entry.getValue(); StationData otherStationData = entry.getValue();
StationData thisStationData = this.data.get(stationName); StationData thisStationData = this.data.get(stationName);
@ -201,16 +205,22 @@ public class CalculateAverage_gnabyl {
chunk.mappedByteBuffer().get(data); chunk.mappedByteBuffer().get(data);
// Process each line // Process each line
String stationName; float value;
double value;
int iSplit, iEol; int iSplit, iEol;
StationData stationData; StationData stationData;
long negative; int negative;
int hash, prime = 31;
Set<Integer> seenHashes = new HashSet<>(10000, 0.9f);
for (int offset = 0; offset < data.length; offset++) { for (int offset = 0; offset < data.length; offset++) {
// Find station name // Find station name
hash = 0;
for (iSplit = offset; data[iSplit] != ';'; iSplit++) { for (iSplit = offset; data[iSplit] != ';'; iSplit++) {
hash = (hash << 5) - hash + (data[iSplit] & 0xFF);
}
if (!seenHashes.contains(hash)) {
seenHashes.add(hash);
stationNameMap.put(hash, new String(data, offset, iSplit - offset, StandardCharsets.UTF_8));
} }
stationName = new String(data, offset, iSplit - offset, StandardCharsets.UTF_8);
// Find value // Find value
iSplit++; iSplit++;
@ -222,7 +232,7 @@ public class CalculateAverage_gnabyl {
continue; continue;
} }
if (data[iEol] == '.') { if (data[iEol] == '.') {
value = value + (data[iEol + 1] - 48) / 10.0; value = value + (data[iEol + 1] - 48) * 0.1f;
iEol += 2; iEol += 2;
break; break;
} }
@ -231,10 +241,10 @@ public class CalculateAverage_gnabyl {
value *= negative; value *= negative;
// Init & count // Init & count
stationData = result.getData(stationName); stationData = result.getData(hash);
if (stationData == null) { if (stationData == null) {
result.addStation(stationName, value); result.addStation(hash, value);
} }
else { else {
stationData.update(value); stationData.update(value);
@ -247,32 +257,8 @@ public class CalculateAverage_gnabyl {
} }
private static ChunkResult processAllChunks(List<Chunk> chunks) throws InterruptedException, ExecutionException { private static ChunkResult processAllChunks(List<Chunk> chunks) throws InterruptedException, ExecutionException {
// var globalRes = new ChunkResult(); return chunks.parallelStream().map(CalculateAverage_gnabyl::processChunk).collect(ChunkResult::new,
// for (var chunk : chunks) { ChunkResult::mergeWith, ChunkResult::mergeWith);
// var chunkRes = processChunk(chunk);
// globalRes.mergeWith(chunkRes);
// }
// return globalRes;
List<CompletableFuture<ChunkResult>> computeTasks = new ArrayList<>();
for (Chunk chunk : chunks) {
computeTasks.add(CompletableFuture.supplyAsync(() -> processChunk(chunk)));
}
ChunkResult globalRes = null;
for (CompletableFuture<ChunkResult> completedTask : computeTasks) {
ChunkResult chunkRes = completedTask.get();
if (globalRes == null) {
globalRes = completedTask.get();
}
else {
globalRes.mergeWith(chunkRes);
}
}
return globalRes;
} }
public static void main(String[] args) throws IOException, InterruptedException, ExecutionException { public static void main(String[] args) throws IOException, InterruptedException, ExecutionException {