First optimal solution attempt (#539)
* First optimal attempt * Removing debug lines * Using default string equals method --------- Co-authored-by: Gaurav Deshmukh <deshmgau@amazon.com>
This commit is contained in:
parent
c232346e87
commit
46d2058bd4
19
calculate_average_gauravdeshmukh.sh
Executable file
19
calculate_average_gauravdeshmukh.sh
Executable file
@ -0,0 +1,19 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
#
|
||||||
|
# Copyright 2023 The original authors
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#
|
||||||
|
|
||||||
|
JAVA_OPTS=""
|
||||||
|
java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_gauravdeshmukh
|
@ -0,0 +1,308 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2023 The original authors
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package dev.morling.onebrc;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.RandomAccessFile;
|
||||||
|
import java.nio.ByteBuffer;
|
||||||
|
import java.nio.MappedByteBuffer;
|
||||||
|
import java.nio.channels.FileChannel;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.TreeMap;
|
||||||
|
import java.util.concurrent.Callable;
|
||||||
|
import java.util.concurrent.ExecutorService;
|
||||||
|
import java.util.concurrent.Executors;
|
||||||
|
import java.util.concurrent.Future;
|
||||||
|
|
||||||
|
public class CalculateAverage_gauravdeshmukh {
|
||||||
|
|
||||||
|
private static final String FILE = "./measurements.txt";
|
||||||
|
private static final byte NEGATIVE_SIGN_BYTE = 0x2D;
|
||||||
|
private static final byte DOT_BYTE = 0x2E;
|
||||||
|
private static final int SEARCH_SPACE_BUFFER_SIZE = 140;
|
||||||
|
|
||||||
|
private static final long SEMI_COLON_MASK = 0x3B3B3B3B3B3B3B3BL;
|
||||||
|
private static final long EOL_MASK = 0x0A0A0A0A0A0A0A0AL;
|
||||||
|
|
||||||
|
private static class ByteString {
|
||||||
|
final private String string;
|
||||||
|
final private int staticHashCode;
|
||||||
|
|
||||||
|
public ByteString(byte[] bytes) {
|
||||||
|
this.string = new String(bytes, StandardCharsets.UTF_8);
|
||||||
|
this.staticHashCode = this.string.hashCode();
|
||||||
|
}
|
||||||
|
|
||||||
|
public byte[] getBytes() {
|
||||||
|
return string.getBytes(StandardCharsets.UTF_8);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean equals(Object bs) {
|
||||||
|
return this.string.equals(bs.toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int hashCode() {
|
||||||
|
return staticHashCode;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return this.string;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class Measurement {
|
||||||
|
public ByteString station;
|
||||||
|
public int value;
|
||||||
|
|
||||||
|
public Measurement(ByteString station, int value) {
|
||||||
|
this.station = station;
|
||||||
|
this.value = value;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
StringBuffer sb = new StringBuffer();
|
||||||
|
sb.append(station.toString());
|
||||||
|
sb.append(";");
|
||||||
|
sb.append(value);
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class MeasurementAggregator {
|
||||||
|
private double min = Double.POSITIVE_INFINITY;
|
||||||
|
private double max = Double.NEGATIVE_INFINITY;
|
||||||
|
private int sum;
|
||||||
|
private long count;
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
return round(min / 10.0) + "/" + round(sum * 1.0 / 10.0 / count) + "/" + round(max / 10.0);
|
||||||
|
}
|
||||||
|
|
||||||
|
private double round(double value) {
|
||||||
|
return Math.round(value * 10.0) / 10.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) throws Exception {
|
||||||
|
// long st = System.currentTimeMillis();
|
||||||
|
int cores = 1;
|
||||||
|
|
||||||
|
File file = new File(FILE);
|
||||||
|
long fileSize = file.length();
|
||||||
|
if (fileSize > 1048576) {
|
||||||
|
cores = Runtime.getRuntime().availableProcessors();
|
||||||
|
}
|
||||||
|
long chunkSize = fileSize / cores;
|
||||||
|
|
||||||
|
ExecutorService executorService = Executors.newFixedThreadPool(cores);
|
||||||
|
List<ParallelFileReaderTask> callableTasks = new ArrayList<>(cores);
|
||||||
|
RandomAccessFile raf = new RandomAccessFile(file, "r");
|
||||||
|
long end = chunkSize, start = 0;
|
||||||
|
for (int i = 0; i < cores; i++) {
|
||||||
|
if (i < cores - 1) {
|
||||||
|
MappedByteBuffer mbb = raf.getChannel().map(FileChannel.MapMode.READ_ONLY, end, Math.min(SEARCH_SPACE_BUFFER_SIZE, fileSize - end));
|
||||||
|
int eolIndex = -1;
|
||||||
|
int extraBytes = 0;
|
||||||
|
while (true) {
|
||||||
|
long word;
|
||||||
|
try {
|
||||||
|
word = mbb.getLong();
|
||||||
|
}
|
||||||
|
catch (java.nio.BufferUnderflowException ex) {
|
||||||
|
byte[] remainingBytes = ByteBuffer.allocate(8).putLong(0).array();
|
||||||
|
mbb.get(mbb.position(), remainingBytes, 0, mbb.remaining());
|
||||||
|
word = ByteBuffer.wrap(remainingBytes).getLong();
|
||||||
|
}
|
||||||
|
eolIndex = findEolInLong(word);
|
||||||
|
if (eolIndex > -1) {
|
||||||
|
extraBytes = extraBytes + eolIndex + 1;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
extraBytes += 8;
|
||||||
|
}
|
||||||
|
end = end + extraBytes;
|
||||||
|
}
|
||||||
|
|
||||||
|
callableTasks.add(new ParallelFileReaderTask(start, (end - start),
|
||||||
|
raf.getChannel().map(FileChannel.MapMode.READ_ONLY, start, (end - start))));
|
||||||
|
start = end;
|
||||||
|
end = Math.min(end + chunkSize, fileSize - 1);
|
||||||
|
}
|
||||||
|
List<Future<Map<ByteString, MeasurementAggregator>>> futures = executorService.invokeAll(callableTasks);
|
||||||
|
List<Map<ByteString, MeasurementAggregator>> resultList = new ArrayList<>(futures.size());
|
||||||
|
for (Future<Map<ByteString, MeasurementAggregator>> future : futures) {
|
||||||
|
resultList.add(future.get());
|
||||||
|
}
|
||||||
|
|
||||||
|
Map<String, MeasurementAggregator> resultMap = new TreeMap<>();
|
||||||
|
for (Map<ByteString, MeasurementAggregator> map : resultList) {
|
||||||
|
for (Map.Entry<ByteString, MeasurementAggregator> entry : map.entrySet()) {
|
||||||
|
MeasurementAggregator agg = resultMap.get(entry.getKey().toString());
|
||||||
|
if (agg == null) {
|
||||||
|
agg = new MeasurementAggregator();
|
||||||
|
resultMap.put(entry.getKey().toString(), agg);
|
||||||
|
}
|
||||||
|
agg.min = Math.min(agg.min, entry.getValue().min);
|
||||||
|
agg.max = Math.max(agg.max, entry.getValue().max);
|
||||||
|
agg.sum = agg.sum + entry.getValue().sum;
|
||||||
|
agg.count = agg.count + entry.getValue().count;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
System.out.println(resultMap);
|
||||||
|
executorService.shutdown();
|
||||||
|
// System.out.println("Time taken: " + (System.currentTimeMillis() - st));
|
||||||
|
}
|
||||||
|
|
||||||
|
private static int findEolInLong(long word) {
|
||||||
|
return findPositionInLong(word, EOL_MASK);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static int findSemiColonInLong(long word) {
|
||||||
|
return findPositionInLong(word, SEMI_COLON_MASK);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static int findPositionInLong(long word, long searchMask) {
|
||||||
|
long maskedWord = word ^ searchMask;
|
||||||
|
long tmp = (maskedWord - 0x0101010101010101L) & ~maskedWord & 0x8080808080808080L;
|
||||||
|
return tmp == 0 ? -1 : (Long.numberOfLeadingZeros(tmp) >>> 3);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class ParallelFileReaderTask implements Callable<Map<ByteString, MeasurementAggregator>> {
|
||||||
|
private long start;
|
||||||
|
private int size;
|
||||||
|
private MappedByteBuffer mbf;
|
||||||
|
byte[] bytes;
|
||||||
|
private static final int BATCH_READ_SIZE = 64;
|
||||||
|
Map<ByteString, MeasurementAggregator> map;
|
||||||
|
|
||||||
|
public ParallelFileReaderTask(long start, long size, MappedByteBuffer mbf) {
|
||||||
|
this.start = start;
|
||||||
|
this.size = (int) size;
|
||||||
|
this.mbf = mbf;
|
||||||
|
this.bytes = new byte[BATCH_READ_SIZE];
|
||||||
|
this.map = new HashMap<>(10000);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Map<ByteString, MeasurementAggregator> call() throws Exception {
|
||||||
|
int bytesReadTillNow = 0;
|
||||||
|
int startOfStation = 0, startOfNumber = -1, endOfStation = -1, endOfNumber = -1;
|
||||||
|
boolean isLastRead = false;
|
||||||
|
try {
|
||||||
|
while (bytesReadTillNow < this.size) {
|
||||||
|
int semiColonIndex = -1;
|
||||||
|
while (semiColonIndex == -1 && bytesReadTillNow < this.size) {
|
||||||
|
long currentWord;
|
||||||
|
try {
|
||||||
|
currentWord = mbf.getLong();
|
||||||
|
}
|
||||||
|
catch (java.nio.BufferUnderflowException ex) {
|
||||||
|
int remainingBytesCount = this.size - bytesReadTillNow;
|
||||||
|
byte[] remainingBytes = ByteBuffer.allocate(8).putLong(0).array();
|
||||||
|
mbf.get(bytesReadTillNow, remainingBytes, 0, remainingBytesCount);
|
||||||
|
currentWord = ByteBuffer.wrap(remainingBytes).getLong();
|
||||||
|
}
|
||||||
|
semiColonIndex = findSemiColonInLong(currentWord);
|
||||||
|
if (semiColonIndex > -1) {
|
||||||
|
endOfStation = bytesReadTillNow + semiColonIndex;
|
||||||
|
startOfNumber = bytesReadTillNow + semiColonIndex + 1;
|
||||||
|
mbf.position(startOfNumber);
|
||||||
|
bytesReadTillNow += semiColonIndex + 1;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
bytesReadTillNow += 8;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int stationLength = endOfStation - startOfStation;
|
||||||
|
byte[] stationBytes = new byte[stationLength];
|
||||||
|
mbf.get(startOfStation, stationBytes, 0, stationLength);
|
||||||
|
|
||||||
|
int eolIndex = -1;
|
||||||
|
while (eolIndex == -1 && bytesReadTillNow < this.size) {
|
||||||
|
long currentWord;
|
||||||
|
try {
|
||||||
|
currentWord = mbf.getLong();
|
||||||
|
}
|
||||||
|
catch (java.nio.BufferUnderflowException ex) {
|
||||||
|
int remainingBytesCount = this.size - bytesReadTillNow;
|
||||||
|
byte[] remainingBytes = ByteBuffer.allocate(8).putLong(0).array();
|
||||||
|
mbf.get(bytesReadTillNow, remainingBytes, 0, remainingBytesCount);
|
||||||
|
currentWord = ByteBuffer.wrap(remainingBytes).getLong();
|
||||||
|
isLastRead = true;
|
||||||
|
}
|
||||||
|
eolIndex = findEolInLong(currentWord);
|
||||||
|
if (eolIndex > -1) {
|
||||||
|
endOfNumber = bytesReadTillNow + eolIndex;
|
||||||
|
startOfStation = bytesReadTillNow + eolIndex + 1;
|
||||||
|
mbf.position(startOfStation);
|
||||||
|
bytesReadTillNow += eolIndex + 1;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
bytesReadTillNow += 8;
|
||||||
|
}
|
||||||
|
if (isLastRead) {
|
||||||
|
bytesReadTillNow = this.size;
|
||||||
|
if (eolIndex == -1) {
|
||||||
|
endOfNumber = this.size;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int numberLength = endOfNumber - startOfNumber;
|
||||||
|
byte[] numberBytes = new byte[numberLength];
|
||||||
|
mbf.get(startOfNumber, numberBytes, 0, numberLength);
|
||||||
|
|
||||||
|
Measurement measurement = new Measurement(new ByteString(stationBytes),
|
||||||
|
getIntegerFromTemperatureBytes(numberBytes));
|
||||||
|
MeasurementAggregator aggregator = this.map.get(measurement.station);
|
||||||
|
if (aggregator == null) {
|
||||||
|
aggregator = new MeasurementAggregator();
|
||||||
|
this.map.put(measurement.station, aggregator);
|
||||||
|
}
|
||||||
|
aggregator.min = Math.min(aggregator.min, measurement.value);
|
||||||
|
aggregator.max = Math.max(aggregator.max, measurement.value);
|
||||||
|
aggregator.sum += measurement.value;
|
||||||
|
aggregator.count++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (Exception ex) {
|
||||||
|
throw ex;
|
||||||
|
}
|
||||||
|
|
||||||
|
return this.map;
|
||||||
|
}
|
||||||
|
|
||||||
|
private int getIntegerFromTemperatureBytes(byte[] numberBytes) {
|
||||||
|
int firstDigitIndex = (numberBytes[0] ^ NEGATIVE_SIGN_BYTE) == 0 ? 1 : 0;
|
||||||
|
int ret = 0;
|
||||||
|
for (int i = firstDigitIndex; i < numberBytes.length; i++) {
|
||||||
|
if ((numberBytes[i] ^ DOT_BYTE) != 0) {
|
||||||
|
ret = (ret << 3) + (ret << 1) + ((int) numberBytes[i] - 48);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return (firstDigitIndex > 0) ? -ret : ret;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user