Fix hundredwatt's entry on 10k dataset (#558)

* Improve hash function

* Remove limit on number of cores

* Fix calculation of boundaries between chunks

* Fix IndexOutOfBoundsException (IOOBE)

---------

Co-authored-by: Jason Nochlin <hundredwatt@users.noreply.github.com>
Author: Jason Nochlin (committed by GitHub)
Date:   2024-01-26 10:22:35 -07:00
parent 09b0d75477
commit 457a36be63


@@ -31,7 +31,7 @@ import java.util.stream.IntStream;
 public class CalculateAverage_hundredwatt {
     private static final String FILE = "./measurements.txt";
     private static final int MAX_ROW_SIZE = 100 + 1 + 5 + 1; // 100 for city name, 1 for ;, 5 for temperature, 1 for \n
-    private static final int THREAD_COUNT = Math.min(8, Runtime.getRuntime().availableProcessors());
+    private static final int THREAD_COUNT = Runtime.getRuntime().availableProcessors();
     private static final long BUFFER_SIZE = 128 * 1024 * 1024; // 128MB
     private static final long CHUNK_SIZE = BUFFER_SIZE / THREAD_COUNT;
     private static final long FILE_CHUNK_SIZE = CHUNK_SIZE - MAX_ROW_SIZE;
@@ -209,10 +209,10 @@
         short temperature_value;
         int hashInt;
-        int i = 0;
+        int rc = 0;
         int end = (int) (size - MAX_ROW_SIZE);

-        while (position < end) {
-            i++;
+        while (position <= end) {
+            // rc++;
             offset = -1;

             // Parse city name
@@ -257,11 +257,11 @@
             position = position + newlinePos / 8 + 2; // +1 for \n

-            hashInt = (int) (hash ^ (hash >> 32));
+            hashInt = (int) (hash ^ (hash >> 32) ^ (hash >> 17));
             hashTable.putOrMerge(hashInt, offset + 1, key, temperature_value);
         }

-        return i;
+        return rc;
     }

     public static void main(String[] args) throws IOException {
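
Editor's note on the hash change in the hunk above: the entry folds a 64-bit rolling hash of the city name down to an int table index, and the extra ">> 17" term is the whole fix. A minimal standalone sketch (class and method names are ours, not the entry's):

    // Hypothetical standalone form of the fold. The added ">> 17" term mixes
    // middle bits of the 64-bit hash into the low bits, which matters because
    // the hash table indexes with the low bits only; on the 10k-station
    // dataset the old fold collided too often.
    public class HashFoldSketch {
        static int foldOld(long hash) {
            return (int) (hash ^ (hash >> 32));
        }

        static int foldNew(long hash) {
            return (int) (hash ^ (hash >> 32) ^ (hash >> 17));
        }

        public static void main(String[] args) {
            long h = 0x1234_5678_9ABC_DEF0L; // example 64-bit hash value
            // Print both folds of the example value for comparison.
            System.out.printf("old=%08x new=%08x%n", foldOld(h), foldNew(h));
        }
    }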
@@ -282,7 +282,7 @@
             byte[] trailing = new byte[MAX_ROW_SIZE * 2];
             fileChannel.read(ByteBuffer.wrap(trailing), Math.max(0, fileSize - MAX_ROW_SIZE));
             var rc = processChunk(ByteBuffer.wrap(trailing), hashTable, Math.max(0, fileSize - MAX_ROW_SIZE),
-                    MAX_ROW_SIZE + Math.min(fileSize, MAX_ROW_SIZE));
+                    MAX_ROW_SIZE + Math.min(fileSize, MAX_ROW_SIZE) - 1);
             // rowCount.addAndGet(rc);

             return hashTable;
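
Editor's note: one reading of this "- 1", taken together with the relaxed "<=" loop bound two hunks up, is a plain off-by-one repair at the scan boundary. A worked example under that assumption (the interpretation is ours; MAX_ROW_SIZE is the constant from the top of the file):

    // Worked example of the trailing-buffer bound for a file larger than
    // one row. processChunk computes end = size - MAX_ROW_SIZE and loops
    // while (position <= end) after this commit.
    public class BoundarySketch {
        public static void main(String[] args) {
            final int MAX_ROW_SIZE = 100 + 1 + 5 + 1; // 107
            long fileSize = 1_000_000;

            long size = MAX_ROW_SIZE + Math.min(fileSize, MAX_ROW_SIZE) - 1; // 213
            long end = size - MAX_ROW_SIZE;                                  // 106

            // Old: size 214, end 107, loop "position < 107".
            // New: size 213, end 106, loop "position <= 106".
            // The trailing scan covers the same bytes either way, while the
            // relaxed <= bound lets regular chunks accept a row starting
            // exactly at (size - MAX_ROW_SIZE), which was skipped before.
            System.out.println("end = " + end);
        }
    }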
@@ -292,11 +292,16 @@
                 }
             }

+            // if file is smaller than max row size, we're done b/c the trailing bytes handler processed the whole file
+            if (fileSize <= MAX_ROW_SIZE) {
+                return hashTable;
+            }
+
             while (start < fileSize) {
                 long end = Math.min(start + CHUNK_SIZE, fileSize);
                 MappedByteBuffer bb = null;
                 try {
-                    bb = fileChannel.map(FileChannel.MapMode.READ_ONLY, start, end - start);
+                    bb = fileChannel.map(FileChannel.MapMode.READ_ONLY, start, Math.min(end - start + 8, fileSize - start));
                 }
                 catch (IOException e) {
                     throw new RuntimeException(e);
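
Editor's note on the widened mapping above: the parser reads the file 8 bytes (one long) at a time, so a read issued near a chunk's end must stay inside the mapped region. A hedged sketch of the new length calculation as a standalone helper (the name mapChunk is ours; constants as in the entry):

    import java.io.IOException;
    import java.nio.MappedByteBuffer;
    import java.nio.channels.FileChannel;

    // Hypothetical helper mirroring the new mapping-length math: pad each
    // chunk's mapping by up to 8 bytes past its end, clamped to the file
    // size, so 8-byte word reads at the boundary cannot run off the mapping.
    class MapChunkSketch {
        static MappedByteBuffer mapChunk(FileChannel ch, long start, long chunkSize, long fileSize)
                throws IOException {
            long end = Math.min(start + chunkSize, fileSize);
            long mapLen = Math.min(end - start + 8, fileSize - start);
            return ch.map(FileChannel.MapMode.READ_ONLY, start, mapLen);
        }
    }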