From 13c54a28117317ee512722050658f9475c9bc327 Mon Sep 17 00:00:00 2001
From: Fabian Schmidt <fabschmidt96@gmail.com>
Date: Mon, 5 Aug 2024 10:53:17 +0200
Subject: [PATCH] FxHashMap made me faster, memmap makes me slower, guess I'm
 using it wrong

---
 src/main/rust/Cargo.lock                      | 23 +++++------
 src/main/rust/Cargo.toml                      |  5 ++-
 .../rust/src/implementations/libraries.rs     | 38 +++++++----------
 .../src/implementations/multi_threaded.rs     |  7 ++--
 .../src/implementations/reference_impl.rs     |  2 +-
 .../rust/src/implementations/single_thread.rs |  5 ++-
 src/main/rust/src/utils/parse.rs              | 41 ++++++++++++++++---
 7 files changed, 71 insertions(+), 50 deletions(-)

diff --git a/src/main/rust/Cargo.lock b/src/main/rust/Cargo.lock
index b607136..df1a805 100644
--- a/src/main/rust/Cargo.lock
+++ b/src/main/rust/Cargo.lock
@@ -615,16 +615,6 @@ version = "2.7.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
 
-[[package]]
-name = "memmap"
-version = "0.7.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6585fd95e7bb50d6cc31e20d4cf9afb4e2ba16c5846fc76793f11218da9c475b"
-dependencies = [
- "libc",
- "winapi",
-]
-
 [[package]]
 name = "memmap2"
 version = "0.7.1"
@@ -634,6 +624,15 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "memmap2"
+version = "0.9.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fe751422e4a8caa417e13c3ea66452215d7d63e19e604f4980461212f3ae1322"
+dependencies = [
+ "libc",
+]
+
 [[package]]
 name = "multiversion"
 version = "0.7.4"
@@ -699,7 +698,7 @@ dependencies = [
  "fast-float",
  "libc",
  "memchr",
- "memmap",
+ "memmap2 0.9.4",
  "polars",
  "rayon",
  "rustc-hash",
@@ -901,7 +900,7 @@ dependencies = [
  "home",
  "itoa",
  "memchr",
- "memmap2",
+ "memmap2 0.7.1",
  "num-traits",
  "once_cell",
  "percent-encoding",
diff --git a/src/main/rust/Cargo.toml b/src/main/rust/Cargo.toml
index b40b893..d853bf3 100644
--- a/src/main/rust/Cargo.toml
+++ b/src/main/rust/Cargo.toml
@@ -9,7 +9,7 @@ edition = "2021"
 bstr = "1.9.1"
 fast-float = "0.2.0"
 memchr = "2.7.4"
-memmap = "0.7.0"
+memmap2 = "0.9.4"
 polars = { version = "0.36.2", features = ["csv", "lazy", "nightly", "streaming"]}
 rayon = "1.10.0"
 rustc-hash = "2.0.0"
@@ -47,6 +47,7 @@ name = "phcs"
 harness = false
 
 [profile.release]
+debug = true
 lto = "fat"
-strip = "symbols"
+#strip = "symbols"
 panic = "abort"
diff --git a/src/main/rust/src/implementations/libraries.rs b/src/main/rust/src/implementations/libraries.rs
index dab33a3..dbd22bd 100644
--- a/src/main/rust/src/implementations/libraries.rs
+++ b/src/main/rust/src/implementations/libraries.rs
@@ -1,9 +1,11 @@
-use std::collections::HashMap;
+use std::{fs::File, io::BufReader, thread};
 use std::io::{BufRead, Seek, SeekFrom};
 use std::sync::mpsc;
 use std::time::Instant;
-use std::{fs::File, io::BufReader, thread};
-use memmap::MmapOptions;
+
+use memmap2::MmapOptions;
+use rustc_hash::{FxBuildHasher, FxHashMap as HashMap};
+
 use crate::models::station_measurements::StationMeasurements;
 use crate::utils::parse;
 use crate::utils::parse::hashstr;
@@ -14,8 +16,9 @@ pub fn run() {
     const FILE_PATH: &str = "../../../measurements.txt";
     let now = Instant::now();
     thread::scope(|s| {
+        let hasher = FxBuildHasher::default();
         let mut stations: HashMap<usize, (String, StationMeasurements)> =
-            HashMap::with_capacity(DEFAULT_HASHMAP_LENGTH);
+            HashMap::with_capacity_and_hasher(DEFAULT_HASHMAP_LENGTH, hasher);
         let (tx, rx) = mpsc::channel();
         let cores = thread::available_parallelism().unwrap().into();
         let file = File::open(FILE_PATH).expect("File measurements.txt not found");
@@ -40,26 +43,18 @@ pub fn run() {
         bounds.push(file_length);
         for i in 0..cores {
             let tx = tx.clone();
-            let mut currposition = *bounds.get(i).unwrap();
+            let currposition = *bounds.get(i).unwrap();
             let end = *bounds.get(i + 1).unwrap();
             s.spawn(move || {
-                let file = File::open(FILE_PATH).expect("File measurements.txt not found");
-                let mut reader = BufReader::new(&file);
-                reader.seek(SeekFrom::Start(currposition as u64)).unwrap();
+                let t_mmap = &mmap[currposition..end];
                 let mut t_stations: HashMap<usize, (String, StationMeasurements)> =
-                    HashMap::with_capacity(DEFAULT_HASHMAP_LENGTH);
-                let mut line = Vec::with_capacity(108);
-                loop {
-                    let line_len = reader
-                        .read_until(b'\n', &mut line)
-                        .expect("could not read bytes");
-                    if line_len == 0 {
-                        break;
-                    }
-                    let (station, temp) = line.rsplit_once(|&byte| byte == b';').unwrap();
+                    HashMap::with_capacity_and_hasher(DEFAULT_HASHMAP_LENGTH, hasher);
+               for line in t_mmap.lines() {
+                   let line = line.expect("Could not read line");
+                    let (station, temp) = line.rsplit_once(|char| char == ';').unwrap();
                     let hash = hashstr(station);
                     let station = unsafe { String::from_utf8_unchecked(Vec::from(station)) };
-                    let temp = parse::temp(temp.split_last().unwrap().1);
+                    let temp = parse::temp(temp.as_bytes());
                     let measurements_option = t_stations.get_mut(&hash);
                     if let Some((_, measurements)) = measurements_option {
                         measurements.update(temp);
@@ -72,11 +67,6 @@ pub fn run() {
                         };
                         t_stations.insert(hash, (station, measurements));
                     }
-                    currposition += line_len;
-                    if currposition >= end {
-                        break;
-                    }
-                    line.clear();
                 }
                 let _ = tx.send(t_stations);
             });
diff --git a/src/main/rust/src/implementations/multi_threaded.rs b/src/main/rust/src/implementations/multi_threaded.rs
index 109743f..1a5a7bf 100644
--- a/src/main/rust/src/implementations/multi_threaded.rs
+++ b/src/main/rust/src/implementations/multi_threaded.rs
@@ -1,11 +1,12 @@
+use std::{fs::File, io::BufReader, thread};
 use std::collections::HashMap;
 use std::io::{BufRead, Seek, SeekFrom};
 use std::sync::mpsc;
 use std::time::Instant;
-use std::{fs::File, io::BufReader, thread};
+
 use crate::models::station_measurements::StationMeasurements;
 use crate::utils::parse;
-use crate::utils::parse::hashstr;
+use crate::utils::parse::hashbytes;
 
 const DEFAULT_HASHMAP_LENGTH: usize = 10000;
 
@@ -56,7 +57,7 @@ pub fn run() {
                         break;
                     }
                     let (station, temp) = line.rsplit_once(|&byte| byte == b';').unwrap();
-                    let hash = hashstr(station);
+                    let hash = hashbytes(station);
                     let station = unsafe { String::from_utf8_unchecked(Vec::from(station)) };
                     let temp = parse::temp(temp.split_last().unwrap().1);
                     let measurements_option = t_stations.get_mut(&hash);
diff --git a/src/main/rust/src/implementations/reference_impl.rs b/src/main/rust/src/implementations/reference_impl.rs
index bdc2c6a..bd2b1e0 100644
--- a/src/main/rust/src/implementations/reference_impl.rs
+++ b/src/main/rust/src/implementations/reference_impl.rs
@@ -1,5 +1,5 @@
 use bstr::{BStr, ByteSlice};
-use memmap::MmapOptions;
+use memmap2::MmapOptions;
 use rayon::prelude::*;
 use rustc_hash::FxHashMap as HashMap;
 use std::time::Instant;
diff --git a/src/main/rust/src/implementations/single_thread.rs b/src/main/rust/src/implementations/single_thread.rs
index 2367974..6c9e1f2 100644
--- a/src/main/rust/src/implementations/single_thread.rs
+++ b/src/main/rust/src/implementations/single_thread.rs
@@ -2,9 +2,10 @@ use std::collections::HashMap;
 use std::fs::File;
 use std::io::{BufRead, BufReader};
 use std::time::Instant;
+
 use crate::models::station_measurements::StationMeasurements;
 use crate::utils::parse;
-use crate::utils::parse::hashstr;
+use crate::utils::parse::hashbytes;
 
 const DEFAULT_HASHMAP_LENGTH: usize = 10000;
 
@@ -24,7 +25,7 @@ pub fn run() {
             break;
         }
         let (station, temp) = line.rsplit_once(|&byte| byte == b';').unwrap();
-        let hash = hashstr(station);
+        let hash = hashbytes(station);
         let station = unsafe { String::from_utf8_unchecked(Vec::from(station)) };
         let temp = parse::temp(temp.split_last().unwrap().1);
         let measurements_option = stations.get_mut(&hash);
diff --git a/src/main/rust/src/utils/parse.rs b/src/main/rust/src/utils/parse.rs
index 6f53c5e..fe4c56f 100644
--- a/src/main/rust/src/utils/parse.rs
+++ b/src/main/rust/src/utils/parse.rs
@@ -67,7 +67,7 @@ pub fn temp_simd(bytes: &[u8]) -> isize {
 }
 
 #[inline]
-pub fn hashstr(bytes: &[u8]) -> usize {
+pub fn hashbytes(bytes: &[u8]) -> usize {
     let mut hash = 0;
     let (chunks, remainder) = bytes.as_chunks::<8>();
     for &chunk in chunks {
@@ -84,9 +84,27 @@ pub fn hashstr(bytes: &[u8]) -> usize {
     hash
 }
 
+/// Hashes `s` by summing its UTF-8 bytes as big-endian 8-byte words; the final
+/// partial word is length-prefixed (byte 0) and zero-padded before being added.
+///
+/// Addition wraps explicitly: a plain `+=` panics on overflow in debug/test
+/// builds and only wraps in release, so long names could abort a debug run.
+#[inline]
+pub fn hashstr(s: &str) -> usize {
+    let (chunks, remainder) = s.as_bytes().as_chunks::<8>();
+    let hash = chunks
+        .iter()
+        .fold(0_usize, |acc, &chunk| acc.wrapping_add(usize::from_be_bytes(chunk)));
+    // Tail word: [len, tail bytes.., 0..]; `remainder` is at most 7 bytes long.
+    let mut r = [0_u8; 8];
+    r[0] = remainder.len() as u8;
+    r[1..1 + remainder.len()].copy_from_slice(remainder);
+    hash.wrapping_add(usize::from_be_bytes(r))
+}
+
 #[cfg(test)]
 mod tests {
-    use crate::utils::parse::{hashstr, temp_new};
+    use crate::utils::parse::{hashbytes, hashstr, temp_new};
 
     #[test]
     fn test_temp_new_max() {
@@ -118,12 +136,23 @@ mod tests {
         assert_eq!(temp_neg_10, -99);
     }
 
+    /// `hashbytes` must give different hashes when the same bytes are reordered.
+    #[test]
+    fn test_hashbytes() {
+        // 11-byte inputs: reversed copies of each other.
+        let (forward, backward) = (hashbytes(b"abcdefghijk"), hashbytes(b"kjihgfedcba"));
+        assert_ne!(forward, backward);
+        // 4-byte inputs: same letters in a different order.
+        let (abba, baab) = (hashbytes(b"abba"), hashbytes(b"baab"));
+        assert_ne!(abba, baab);
+    }
+
     #[test]
     fn test_hashstr() {
-        let hash_1 = hashstr(b"abcdefghijk");
-        let hash_2 = hashstr(b"kjihgfedcba");
-        let hash_3 = hashstr(b"abba");
-        let hash_4 = hashstr(b"baab");
+        let hash_1 = hashstr("abcdefghijk");
+        let hash_2 = hashstr("kjihgfedcba");
+        let hash_3 = hashstr("abba");
+        let hash_4 = hashstr("baab");
 
         assert_ne!(hash_1, hash_2);
         assert_ne!(hash_3, hash_4);