diff --git a/src/main/rust/Cargo.lock b/src/main/rust/Cargo.lock index b607136..df1a805 100644 --- a/src/main/rust/Cargo.lock +++ b/src/main/rust/Cargo.lock @@ -615,16 +615,6 @@ version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" -[[package]] -name = "memmap" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6585fd95e7bb50d6cc31e20d4cf9afb4e2ba16c5846fc76793f11218da9c475b" -dependencies = [ - "libc", - "winapi", -] - [[package]] name = "memmap2" version = "0.7.1" @@ -634,6 +624,15 @@ dependencies = [ "libc", ] +[[package]] +name = "memmap2" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe751422e4a8caa417e13c3ea66452215d7d63e19e604f4980461212f3ae1322" +dependencies = [ + "libc", +] + [[package]] name = "multiversion" version = "0.7.4" @@ -699,7 +698,7 @@ dependencies = [ "fast-float", "libc", "memchr", - "memmap", + "memmap2 0.9.4", "polars", "rayon", "rustc-hash", @@ -901,7 +900,7 @@ dependencies = [ "home", "itoa", "memchr", - "memmap2", + "memmap2 0.7.1", "num-traits", "once_cell", "percent-encoding", diff --git a/src/main/rust/Cargo.toml b/src/main/rust/Cargo.toml index b40b893..d853bf3 100644 --- a/src/main/rust/Cargo.toml +++ b/src/main/rust/Cargo.toml @@ -9,7 +9,7 @@ edition = "2021" bstr = "1.9.1" fast-float = "0.2.0" memchr = "2.7.4" -memmap = "0.7.0" +memmap2 = "0.9.4" polars = { version = "0.36.2", features = ["csv", "lazy", "nightly", "streaming"]} rayon = "1.10.0" rustc-hash = "2.0.0" @@ -47,6 +47,7 @@ name = "phcs" harness = false [profile.release] +debug = true lto = "fat" -strip = "symbols" +#strip = "symbols" panic = "abort" diff --git a/src/main/rust/src/implementations/libraries.rs b/src/main/rust/src/implementations/libraries.rs index dab33a3..dbd22bd 100644 --- a/src/main/rust/src/implementations/libraries.rs +++ b/src/main/rust/src/implementations/libraries.rs @@ -1,9 +1,11 @@ -use std::collections::HashMap; +use std::{fs::File, io::BufReader, thread}; use std::io::{BufRead, Seek, SeekFrom}; use std::sync::mpsc; use std::time::Instant; -use std::{fs::File, io::BufReader, thread}; -use memmap::MmapOptions; + +use memmap2::MmapOptions; +use rustc_hash::{FxBuildHasher, FxHashMap as HashMap}; + use crate::models::station_measurements::StationMeasurements; use crate::utils::parse; use crate::utils::parse::hashstr; @@ -14,8 +16,9 @@ pub fn run() { const FILE_PATH: &str = "../../../measurements.txt"; let now = Instant::now(); thread::scope(|s| { + let hasher = FxBuildHasher::default(); let mut stations: HashMap = - HashMap::with_capacity(DEFAULT_HASHMAP_LENGTH); + HashMap::with_capacity_and_hasher(DEFAULT_HASHMAP_LENGTH, hasher); let (tx, rx) = mpsc::channel(); let cores = thread::available_parallelism().unwrap().into(); let file = File::open(FILE_PATH).expect("File measurements.txt not found"); @@ -40,26 +43,18 @@ pub fn run() { bounds.push(file_length); for i in 0..cores { let tx = tx.clone(); - let mut currposition = *bounds.get(i).unwrap(); + let currposition = *bounds.get(i).unwrap(); let end = *bounds.get(i + 1).unwrap(); s.spawn(move || { - let file = File::open(FILE_PATH).expect("File measurements.txt not found"); - let mut reader = BufReader::new(&file); - reader.seek(SeekFrom::Start(currposition as u64)).unwrap(); + let t_mmap = &mmap[currposition..end]; let mut t_stations: HashMap = - HashMap::with_capacity(DEFAULT_HASHMAP_LENGTH); - let mut line = Vec::with_capacity(108); - loop { - let line_len = reader - .read_until(b'\n', &mut line) - .expect("could not read bytes"); - if line_len == 0 { - break; - } - let (station, temp) = line.rsplit_once(|&byte| byte == b';').unwrap(); + HashMap::with_capacity_and_hasher(DEFAULT_HASHMAP_LENGTH, hasher); + for line in t_mmap.lines() { + let line = line.expect("Could not read line"); + let (station, temp) = line.rsplit_once(|char| char == ';').unwrap(); let hash = hashstr(station); let station = unsafe { String::from_utf8_unchecked(Vec::from(station)) }; - let temp = parse::temp(temp.split_last().unwrap().1); + let temp = parse::temp(temp.as_bytes()); let measurements_option = t_stations.get_mut(&hash); if let Some((_, measurements)) = measurements_option { measurements.update(temp); @@ -72,11 +67,6 @@ pub fn run() { }; t_stations.insert(hash, (station, measurements)); } - currposition += line_len; - if currposition >= end { - break; - } - line.clear(); } let _ = tx.send(t_stations); }); diff --git a/src/main/rust/src/implementations/multi_threaded.rs b/src/main/rust/src/implementations/multi_threaded.rs index 109743f..1a5a7bf 100644 --- a/src/main/rust/src/implementations/multi_threaded.rs +++ b/src/main/rust/src/implementations/multi_threaded.rs @@ -1,11 +1,12 @@ +use std::{fs::File, io::BufReader, thread}; use std::collections::HashMap; use std::io::{BufRead, Seek, SeekFrom}; use std::sync::mpsc; use std::time::Instant; -use std::{fs::File, io::BufReader, thread}; + use crate::models::station_measurements::StationMeasurements; use crate::utils::parse; -use crate::utils::parse::hashstr; +use crate::utils::parse::hashbytes; const DEFAULT_HASHMAP_LENGTH: usize = 10000; @@ -56,7 +57,7 @@ pub fn run() { break; } let (station, temp) = line.rsplit_once(|&byte| byte == b';').unwrap(); - let hash = hashstr(station); + let hash = hashbytes(station); let station = unsafe { String::from_utf8_unchecked(Vec::from(station)) }; let temp = parse::temp(temp.split_last().unwrap().1); let measurements_option = t_stations.get_mut(&hash); diff --git a/src/main/rust/src/implementations/reference_impl.rs b/src/main/rust/src/implementations/reference_impl.rs index bdc2c6a..bd2b1e0 100644 --- a/src/main/rust/src/implementations/reference_impl.rs +++ b/src/main/rust/src/implementations/reference_impl.rs @@ -1,5 +1,5 @@ use bstr::{BStr, ByteSlice}; -use memmap::MmapOptions; +use memmap2::MmapOptions; use rayon::prelude::*; use rustc_hash::FxHashMap as HashMap; use std::time::Instant; diff --git a/src/main/rust/src/implementations/single_thread.rs b/src/main/rust/src/implementations/single_thread.rs index 2367974..6c9e1f2 100644 --- a/src/main/rust/src/implementations/single_thread.rs +++ b/src/main/rust/src/implementations/single_thread.rs @@ -2,9 +2,10 @@ use std::collections::HashMap; use std::fs::File; use std::io::{BufRead, BufReader}; use std::time::Instant; + use crate::models::station_measurements::StationMeasurements; use crate::utils::parse; -use crate::utils::parse::hashstr; +use crate::utils::parse::hashbytes; const DEFAULT_HASHMAP_LENGTH: usize = 10000; @@ -24,7 +25,7 @@ pub fn run() { break; } let (station, temp) = line.rsplit_once(|&byte| byte == b';').unwrap(); - let hash = hashstr(station); + let hash = hashbytes(station); let station = unsafe { String::from_utf8_unchecked(Vec::from(station)) }; let temp = parse::temp(temp.split_last().unwrap().1); let measurements_option = stations.get_mut(&hash); diff --git a/src/main/rust/src/utils/parse.rs b/src/main/rust/src/utils/parse.rs index 6f53c5e..fe4c56f 100644 --- a/src/main/rust/src/utils/parse.rs +++ b/src/main/rust/src/utils/parse.rs @@ -67,7 +67,7 @@ pub fn temp_simd(bytes: &[u8]) -> isize { } #[inline] -pub fn hashstr(bytes: &[u8]) -> usize { +pub fn hashbytes(bytes: &[u8]) -> usize { let mut hash = 0; let (chunks, remainder) = bytes.as_chunks::<8>(); for &chunk in chunks { @@ -84,9 +84,27 @@ pub fn hashstr(bytes: &[u8]) -> usize { hash } +#[inline] +pub fn hashstr(s: &str) -> usize { + let mut hash = 0; + let (chunks, remainder) = s.as_bytes().as_chunks::<8>(); + for &chunk in chunks { + hash += usize::from_be_bytes(chunk); + } + let mut r = [0_u8; 8]; + r[0] = remainder.len() as u8; + let mut idx = 1; + for &byte in remainder { + r[idx] = byte; + idx += 1; + } + hash += usize::from_be_bytes(r); + hash +} + #[cfg(test)] mod tests { - use crate::utils::parse::{hashstr, temp_new}; + use crate::utils::parse::{hashbytes, hashstr, temp_new}; #[test] fn test_temp_new_max() { @@ -118,12 +136,23 @@ mod tests { assert_eq!(temp_neg_10, -99); } + #[test] + fn test_hashbytes() { + let hash_1 = hashbytes(b"abcdefghijk"); + let hash_2 = hashbytes(b"kjihgfedcba"); + let hash_3 = hashbytes(b"abba"); + let hash_4 = hashbytes(b"baab"); + + assert_ne!(hash_1, hash_2); + assert_ne!(hash_3, hash_4); + } + #[test] fn test_hashstr() { - let hash_1 = hashstr(b"abcdefghijk"); - let hash_2 = hashstr(b"kjihgfedcba"); - let hash_3 = hashstr(b"abba"); - let hash_4 = hashstr(b"baab"); + let hash_1 = hashstr("abcdefghijk"); + let hash_2 = hashstr("kjihgfedcba"); + let hash_3 = hashstr("abba"); + let hash_4 = hashstr("baab"); assert_ne!(hash_1, hash_2); assert_ne!(hash_3, hash_4);