diff --git a/src/main/rust/Cargo.lock b/src/main/rust/Cargo.lock index df1a805..b607136 100644 --- a/src/main/rust/Cargo.lock +++ b/src/main/rust/Cargo.lock @@ -616,19 +616,20 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" [[package]] -name = "memmap2" -version = "0.7.1" +name = "memmap" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f49388d20533534cd19360ad3d6a7dadc885944aa802ba3995040c5ec11288c6" +checksum = "6585fd95e7bb50d6cc31e20d4cf9afb4e2ba16c5846fc76793f11218da9c475b" dependencies = [ "libc", + "winapi", ] [[package]] name = "memmap2" -version = "0.9.4" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe751422e4a8caa417e13c3ea66452215d7d63e19e604f4980461212f3ae1322" +checksum = "f49388d20533534cd19360ad3d6a7dadc885944aa802ba3995040c5ec11288c6" dependencies = [ "libc", ] @@ -698,7 +699,7 @@ dependencies = [ "fast-float", "libc", "memchr", - "memmap2 0.9.4", + "memmap", "polars", "rayon", "rustc-hash", @@ -900,7 +901,7 @@ dependencies = [ "home", "itoa", "memchr", - "memmap2 0.7.1", + "memmap2", "num-traits", "once_cell", "percent-encoding", diff --git a/src/main/rust/Cargo.toml b/src/main/rust/Cargo.toml index d853bf3..b40b893 100644 --- a/src/main/rust/Cargo.toml +++ b/src/main/rust/Cargo.toml @@ -9,7 +9,7 @@ edition = "2021" bstr = "1.9.1" fast-float = "0.2.0" memchr = "2.7.4" -memmap2 = "0.9.4" +memmap = "0.7.0" polars = { version = "0.36.2", features = ["csv", "lazy", "nightly", "streaming"]} rayon = "1.10.0" rustc-hash = "2.0.0" @@ -47,7 +47,6 @@ name = "phcs" harness = false [profile.release] -debug = true lto = "fat" -#strip = "symbols" +strip = "symbols" panic = "abort" diff --git a/src/main/rust/src/implementations/libraries.rs b/src/main/rust/src/implementations/libraries.rs index 773bf85..dab33a3 100644 --- a/src/main/rust/src/implementations/libraries.rs +++ b/src/main/rust/src/implementations/libraries.rs @@ -1,11 +1,9 @@ -use std::{fs::File, io::BufReader, thread}; +use std::collections::HashMap; use std::io::{BufRead, Seek, SeekFrom}; use std::sync::mpsc; use std::time::Instant; - -use memmap2::MmapOptions; -use rustc_hash::{FxBuildHasher, FxHashMap as HashMap}; - +use std::{fs::File, io::BufReader, thread}; +use memmap::MmapOptions; use crate::models::station_measurements::StationMeasurements; use crate::utils::parse; use crate::utils::parse::hashstr; @@ -16,9 +14,8 @@ pub fn run() { const FILE_PATH: &str = "../../../measurements.txt"; let now = Instant::now(); thread::scope(|s| { - let hasher = FxBuildHasher::default(); let mut stations: HashMap = - HashMap::with_capacity_and_hasher(DEFAULT_HASHMAP_LENGTH, hasher); + HashMap::with_capacity(DEFAULT_HASHMAP_LENGTH); let (tx, rx) = mpsc::channel(); let cores = thread::available_parallelism().unwrap().into(); let file = File::open(FILE_PATH).expect("File measurements.txt not found"); @@ -43,19 +40,26 @@ pub fn run() { bounds.push(file_length); for i in 0..cores { let tx = tx.clone(); - let currposition = *bounds.get(i).unwrap(); + let mut currposition = *bounds.get(i).unwrap(); let end = *bounds.get(i + 1).unwrap(); s.spawn(move || { let file = File::open(FILE_PATH).expect("File measurements.txt not found"); - let t_mmap = &unsafe { MmapOptions::new().map(&file).unwrap() }[currposition..end]; + let mut reader = BufReader::new(&file); + reader.seek(SeekFrom::Start(currposition as u64)).unwrap(); let mut t_stations: HashMap = - HashMap::with_capacity_and_hasher(DEFAULT_HASHMAP_LENGTH, hasher); - for line in t_mmap.lines() { - let line = line.expect("Could not read line"); - let (station, temp) = line.rsplit_once(|char| char == ';').unwrap(); + HashMap::with_capacity(DEFAULT_HASHMAP_LENGTH); + let mut line = Vec::with_capacity(108); + loop { + let line_len = reader + .read_until(b'\n', &mut line) + .expect("could not read bytes"); + if line_len == 0 { + break; + } + let (station, temp) = line.rsplit_once(|&byte| byte == b';').unwrap(); let hash = hashstr(station); let station = unsafe { String::from_utf8_unchecked(Vec::from(station)) }; - let temp = parse::temp(temp.as_bytes()); + let temp = parse::temp(temp.split_last().unwrap().1); let measurements_option = t_stations.get_mut(&hash); if let Some((_, measurements)) = measurements_option { measurements.update(temp); @@ -68,6 +72,11 @@ pub fn run() { }; t_stations.insert(hash, (station, measurements)); } + currposition += line_len; + if currposition >= end { + break; + } + line.clear(); } let _ = tx.send(t_stations); }); diff --git a/src/main/rust/src/implementations/multi_threaded.rs b/src/main/rust/src/implementations/multi_threaded.rs index 1a5a7bf..109743f 100644 --- a/src/main/rust/src/implementations/multi_threaded.rs +++ b/src/main/rust/src/implementations/multi_threaded.rs @@ -1,12 +1,11 @@ -use std::{fs::File, io::BufReader, thread}; use std::collections::HashMap; use std::io::{BufRead, Seek, SeekFrom}; use std::sync::mpsc; use std::time::Instant; - +use std::{fs::File, io::BufReader, thread}; use crate::models::station_measurements::StationMeasurements; use crate::utils::parse; -use crate::utils::parse::hashbytes; +use crate::utils::parse::hashstr; const DEFAULT_HASHMAP_LENGTH: usize = 10000; @@ -57,7 +56,7 @@ pub fn run() { break; } let (station, temp) = line.rsplit_once(|&byte| byte == b';').unwrap(); - let hash = hashbytes(station); + let hash = hashstr(station); let station = unsafe { String::from_utf8_unchecked(Vec::from(station)) }; let temp = parse::temp(temp.split_last().unwrap().1); let measurements_option = t_stations.get_mut(&hash); diff --git a/src/main/rust/src/implementations/reference_impl.rs b/src/main/rust/src/implementations/reference_impl.rs index bd2b1e0..bdc2c6a 100644 --- a/src/main/rust/src/implementations/reference_impl.rs +++ b/src/main/rust/src/implementations/reference_impl.rs @@ -1,5 +1,5 @@ use bstr::{BStr, ByteSlice}; -use memmap2::MmapOptions; +use memmap::MmapOptions; use rayon::prelude::*; use rustc_hash::FxHashMap as HashMap; use std::time::Instant; diff --git a/src/main/rust/src/implementations/single_thread.rs b/src/main/rust/src/implementations/single_thread.rs index 6c9e1f2..2367974 100644 --- a/src/main/rust/src/implementations/single_thread.rs +++ b/src/main/rust/src/implementations/single_thread.rs @@ -2,10 +2,9 @@ use std::collections::HashMap; use std::fs::File; use std::io::{BufRead, BufReader}; use std::time::Instant; - use crate::models::station_measurements::StationMeasurements; use crate::utils::parse; -use crate::utils::parse::hashbytes; +use crate::utils::parse::hashstr; const DEFAULT_HASHMAP_LENGTH: usize = 10000; @@ -25,7 +24,7 @@ pub fn run() { break; } let (station, temp) = line.rsplit_once(|&byte| byte == b';').unwrap(); - let hash = hashbytes(station); + let hash = hashstr(station); let station = unsafe { String::from_utf8_unchecked(Vec::from(station)) }; let temp = parse::temp(temp.split_last().unwrap().1); let measurements_option = stations.get_mut(&hash); diff --git a/src/main/rust/src/utils/parse.rs b/src/main/rust/src/utils/parse.rs index fe4c56f..6f53c5e 100644 --- a/src/main/rust/src/utils/parse.rs +++ b/src/main/rust/src/utils/parse.rs @@ -67,7 +67,7 @@ pub fn temp_simd(bytes: &[u8]) -> isize { } #[inline] -pub fn hashbytes(bytes: &[u8]) -> usize { +pub fn hashstr(bytes: &[u8]) -> usize { let mut hash = 0; let (chunks, remainder) = bytes.as_chunks::<8>(); for &chunk in chunks { @@ -84,27 +84,9 @@ pub fn hashbytes(bytes: &[u8]) -> usize { hash } -#[inline] -pub fn hashstr(s: &str) -> usize { - let mut hash = 0; - let (chunks, remainder) = s.as_bytes().as_chunks::<8>(); - for &chunk in chunks { - hash += usize::from_be_bytes(chunk); - } - let mut r = [0_u8; 8]; - r[0] = remainder.len() as u8; - let mut idx = 1; - for &byte in remainder { - r[idx] = byte; - idx += 1; - } - hash += usize::from_be_bytes(r); - hash -} - #[cfg(test)] mod tests { - use crate::utils::parse::{hashbytes, hashstr, temp_new}; + use crate::utils::parse::{hashstr, temp_new}; #[test] fn test_temp_new_max() { @@ -136,23 +118,12 @@ mod tests { assert_eq!(temp_neg_10, -99); } - #[test] - fn test_hashbytes() { - let hash_1 = hashbytes(b"abcdefghijk"); - let hash_2 = hashbytes(b"kjihgfedcba"); - let hash_3 = hashbytes(b"abba"); - let hash_4 = hashbytes(b"baab"); - - assert_ne!(hash_1, hash_2); - assert_ne!(hash_3, hash_4); - } - #[test] fn test_hashstr() { - let hash_1 = hashstr("abcdefghijk"); - let hash_2 = hashstr("kjihgfedcba"); - let hash_3 = hashstr("abba"); - let hash_4 = hashstr("baab"); + let hash_1 = hashstr(b"abcdefghijk"); + let hash_2 = hashstr(b"kjihgfedcba"); + let hash_3 = hashstr(b"abba"); + let hash_4 = hashstr(b"baab"); assert_ne!(hash_1, hash_2); assert_ne!(hash_3, hash_4);