FxHashMap made me faster, memmap makes me slower, guess I'm using it wrong
This commit is contained in:
		
							
								
								
									
										23
									
								
								src/main/rust/Cargo.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										23
									
								
								src/main/rust/Cargo.lock
									
									
									
										generated
									
									
									
								
							@@ -615,16 +615,6 @@ version = "2.7.4"
 | 
			
		||||
source = "registry+https://github.com/rust-lang/crates.io-index"
 | 
			
		||||
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
 | 
			
		||||
 | 
			
		||||
[[package]]
 | 
			
		||||
name = "memmap"
 | 
			
		||||
version = "0.7.0"
 | 
			
		||||
source = "registry+https://github.com/rust-lang/crates.io-index"
 | 
			
		||||
checksum = "6585fd95e7bb50d6cc31e20d4cf9afb4e2ba16c5846fc76793f11218da9c475b"
 | 
			
		||||
dependencies = [
 | 
			
		||||
 "libc",
 | 
			
		||||
 "winapi",
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
[[package]]
 | 
			
		||||
name = "memmap2"
 | 
			
		||||
version = "0.7.1"
 | 
			
		||||
@@ -634,6 +624,15 @@ dependencies = [
 | 
			
		||||
 "libc",
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
[[package]]
 | 
			
		||||
name = "memmap2"
 | 
			
		||||
version = "0.9.4"
 | 
			
		||||
source = "registry+https://github.com/rust-lang/crates.io-index"
 | 
			
		||||
checksum = "fe751422e4a8caa417e13c3ea66452215d7d63e19e604f4980461212f3ae1322"
 | 
			
		||||
dependencies = [
 | 
			
		||||
 "libc",
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
[[package]]
 | 
			
		||||
name = "multiversion"
 | 
			
		||||
version = "0.7.4"
 | 
			
		||||
@@ -699,7 +698,7 @@ dependencies = [
 | 
			
		||||
 "fast-float",
 | 
			
		||||
 "libc",
 | 
			
		||||
 "memchr",
 | 
			
		||||
 "memmap",
 | 
			
		||||
 "memmap2 0.9.4",
 | 
			
		||||
 "polars",
 | 
			
		||||
 "rayon",
 | 
			
		||||
 "rustc-hash",
 | 
			
		||||
@@ -901,7 +900,7 @@ dependencies = [
 | 
			
		||||
 "home",
 | 
			
		||||
 "itoa",
 | 
			
		||||
 "memchr",
 | 
			
		||||
 "memmap2",
 | 
			
		||||
 "memmap2 0.7.1",
 | 
			
		||||
 "num-traits",
 | 
			
		||||
 "once_cell",
 | 
			
		||||
 "percent-encoding",
 | 
			
		||||
 
 | 
			
		||||
@@ -9,7 +9,7 @@ edition = "2021"
 | 
			
		||||
bstr = "1.9.1"
 | 
			
		||||
fast-float = "0.2.0"
 | 
			
		||||
memchr = "2.7.4"
 | 
			
		||||
memmap = "0.7.0"
 | 
			
		||||
memmap2 = "0.9.4"
 | 
			
		||||
polars = { version = "0.36.2", features = ["csv", "lazy", "nightly", "streaming"]}
 | 
			
		||||
rayon = "1.10.0"
 | 
			
		||||
rustc-hash = "2.0.0"
 | 
			
		||||
@@ -47,6 +47,7 @@ name = "phcs"
 | 
			
		||||
harness = false
 | 
			
		||||
 | 
			
		||||
[profile.release]
 | 
			
		||||
debug = true
 | 
			
		||||
lto = "fat"
 | 
			
		||||
strip = "symbols"
 | 
			
		||||
#strip = "symbols"
 | 
			
		||||
panic = "abort"
 | 
			
		||||
 
 | 
			
		||||
@@ -1,9 +1,11 @@
 | 
			
		||||
use std::collections::HashMap;
 | 
			
		||||
use std::{fs::File, io::BufReader, thread};
 | 
			
		||||
use std::io::{BufRead, Seek, SeekFrom};
 | 
			
		||||
use std::sync::mpsc;
 | 
			
		||||
use std::time::Instant;
 | 
			
		||||
use std::{fs::File, io::BufReader, thread};
 | 
			
		||||
use memmap::MmapOptions;
 | 
			
		||||
 | 
			
		||||
use memmap2::MmapOptions;
 | 
			
		||||
use rustc_hash::{FxBuildHasher, FxHashMap as HashMap};
 | 
			
		||||
 | 
			
		||||
use crate::models::station_measurements::StationMeasurements;
 | 
			
		||||
use crate::utils::parse;
 | 
			
		||||
use crate::utils::parse::hashstr;
 | 
			
		||||
@@ -14,8 +16,9 @@ pub fn run() {
 | 
			
		||||
    const FILE_PATH: &str = "../../../measurements.txt";
 | 
			
		||||
    let now = Instant::now();
 | 
			
		||||
    thread::scope(|s| {
 | 
			
		||||
        let hasher = FxBuildHasher::default();
 | 
			
		||||
        let mut stations: HashMap<usize, (String, StationMeasurements)> =
 | 
			
		||||
            HashMap::with_capacity(DEFAULT_HASHMAP_LENGTH);
 | 
			
		||||
            HashMap::with_capacity_and_hasher(DEFAULT_HASHMAP_LENGTH, hasher);
 | 
			
		||||
        let (tx, rx) = mpsc::channel();
 | 
			
		||||
        let cores = thread::available_parallelism().unwrap().into();
 | 
			
		||||
        let file = File::open(FILE_PATH).expect("File measurements.txt not found");
 | 
			
		||||
@@ -40,26 +43,18 @@ pub fn run() {
 | 
			
		||||
        bounds.push(file_length);
 | 
			
		||||
        for i in 0..cores {
 | 
			
		||||
            let tx = tx.clone();
 | 
			
		||||
            let mut currposition = *bounds.get(i).unwrap();
 | 
			
		||||
            let currposition = *bounds.get(i).unwrap();
 | 
			
		||||
            let end = *bounds.get(i + 1).unwrap();
 | 
			
		||||
            s.spawn(move || {
 | 
			
		||||
                let file = File::open(FILE_PATH).expect("File measurements.txt not found");
 | 
			
		||||
                let mut reader = BufReader::new(&file);
 | 
			
		||||
                reader.seek(SeekFrom::Start(currposition as u64)).unwrap();
 | 
			
		||||
                let t_mmap = &mmap[currposition..end];
 | 
			
		||||
                let mut t_stations: HashMap<usize, (String, StationMeasurements)> =
 | 
			
		||||
                    HashMap::with_capacity(DEFAULT_HASHMAP_LENGTH);
 | 
			
		||||
                let mut line = Vec::with_capacity(108);
 | 
			
		||||
                loop {
 | 
			
		||||
                    let line_len = reader
 | 
			
		||||
                        .read_until(b'\n', &mut line)
 | 
			
		||||
                        .expect("could not read bytes");
 | 
			
		||||
                    if line_len == 0 {
 | 
			
		||||
                        break;
 | 
			
		||||
                    }
 | 
			
		||||
                    let (station, temp) = line.rsplit_once(|&byte| byte == b';').unwrap();
 | 
			
		||||
                    HashMap::with_capacity_and_hasher(DEFAULT_HASHMAP_LENGTH, hasher);
 | 
			
		||||
               for line in t_mmap.lines() {
 | 
			
		||||
                   let line = line.expect("Could not read line");
 | 
			
		||||
                    let (station, temp) = line.rsplit_once(|char| char == ';').unwrap();
 | 
			
		||||
                    let hash = hashstr(station);
 | 
			
		||||
                    let station = unsafe { String::from_utf8_unchecked(Vec::from(station)) };
 | 
			
		||||
                    let temp = parse::temp(temp.split_last().unwrap().1);
 | 
			
		||||
                    let temp = parse::temp(temp.as_bytes());
 | 
			
		||||
                    let measurements_option = t_stations.get_mut(&hash);
 | 
			
		||||
                    if let Some((_, measurements)) = measurements_option {
 | 
			
		||||
                        measurements.update(temp);
 | 
			
		||||
@@ -72,11 +67,6 @@ pub fn run() {
 | 
			
		||||
                        };
 | 
			
		||||
                        t_stations.insert(hash, (station, measurements));
 | 
			
		||||
                    }
 | 
			
		||||
                    currposition += line_len;
 | 
			
		||||
                    if currposition >= end {
 | 
			
		||||
                        break;
 | 
			
		||||
                    }
 | 
			
		||||
                    line.clear();
 | 
			
		||||
                }
 | 
			
		||||
                let _ = tx.send(t_stations);
 | 
			
		||||
            });
 | 
			
		||||
 
 | 
			
		||||
@@ -1,11 +1,12 @@
 | 
			
		||||
use std::{fs::File, io::BufReader, thread};
 | 
			
		||||
use std::collections::HashMap;
 | 
			
		||||
use std::io::{BufRead, Seek, SeekFrom};
 | 
			
		||||
use std::sync::mpsc;
 | 
			
		||||
use std::time::Instant;
 | 
			
		||||
use std::{fs::File, io::BufReader, thread};
 | 
			
		||||
 | 
			
		||||
use crate::models::station_measurements::StationMeasurements;
 | 
			
		||||
use crate::utils::parse;
 | 
			
		||||
use crate::utils::parse::hashstr;
 | 
			
		||||
use crate::utils::parse::hashbytes;
 | 
			
		||||
 | 
			
		||||
const DEFAULT_HASHMAP_LENGTH: usize = 10000;
 | 
			
		||||
 | 
			
		||||
@@ -56,7 +57,7 @@ pub fn run() {
 | 
			
		||||
                        break;
 | 
			
		||||
                    }
 | 
			
		||||
                    let (station, temp) = line.rsplit_once(|&byte| byte == b';').unwrap();
 | 
			
		||||
                    let hash = hashstr(station);
 | 
			
		||||
                    let hash = hashbytes(station);
 | 
			
		||||
                    let station = unsafe { String::from_utf8_unchecked(Vec::from(station)) };
 | 
			
		||||
                    let temp = parse::temp(temp.split_last().unwrap().1);
 | 
			
		||||
                    let measurements_option = t_stations.get_mut(&hash);
 | 
			
		||||
 
 | 
			
		||||
@@ -1,5 +1,5 @@
 | 
			
		||||
use bstr::{BStr, ByteSlice};
 | 
			
		||||
use memmap::MmapOptions;
 | 
			
		||||
use memmap2::MmapOptions;
 | 
			
		||||
use rayon::prelude::*;
 | 
			
		||||
use rustc_hash::FxHashMap as HashMap;
 | 
			
		||||
use std::time::Instant;
 | 
			
		||||
 
 | 
			
		||||
@@ -2,9 +2,10 @@ use std::collections::HashMap;
 | 
			
		||||
use std::fs::File;
 | 
			
		||||
use std::io::{BufRead, BufReader};
 | 
			
		||||
use std::time::Instant;
 | 
			
		||||
 | 
			
		||||
use crate::models::station_measurements::StationMeasurements;
 | 
			
		||||
use crate::utils::parse;
 | 
			
		||||
use crate::utils::parse::hashstr;
 | 
			
		||||
use crate::utils::parse::hashbytes;
 | 
			
		||||
 | 
			
		||||
const DEFAULT_HASHMAP_LENGTH: usize = 10000;
 | 
			
		||||
 | 
			
		||||
@@ -24,7 +25,7 @@ pub fn run() {
 | 
			
		||||
            break;
 | 
			
		||||
        }
 | 
			
		||||
        let (station, temp) = line.rsplit_once(|&byte| byte == b';').unwrap();
 | 
			
		||||
        let hash = hashstr(station);
 | 
			
		||||
        let hash = hashbytes(station);
 | 
			
		||||
        let station = unsafe { String::from_utf8_unchecked(Vec::from(station)) };
 | 
			
		||||
        let temp = parse::temp(temp.split_last().unwrap().1);
 | 
			
		||||
        let measurements_option = stations.get_mut(&hash);
 | 
			
		||||
 
 | 
			
		||||
@@ -67,7 +67,7 @@ pub fn temp_simd(bytes: &[u8]) -> isize {
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#[inline]
 | 
			
		||||
pub fn hashstr(bytes: &[u8]) -> usize {
 | 
			
		||||
pub fn hashbytes(bytes: &[u8]) -> usize {
 | 
			
		||||
    let mut hash = 0;
 | 
			
		||||
    let (chunks, remainder) = bytes.as_chunks::<8>();
 | 
			
		||||
    for &chunk in chunks {
 | 
			
		||||
@@ -84,9 +84,27 @@ pub fn hashstr(bytes: &[u8]) -> usize {
 | 
			
		||||
    hash
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/// Hashes a string into a `usize` key by summing its bytes in 8-byte groups.
///
/// Each full 8-byte group is read as a big-endian integer and added to the
/// running hash. The trailing 0–7 bytes are packed into one final group whose
/// first byte holds the remainder length and whose following bytes hold the
/// remainder itself (zero-padded), and that group is added as well.
///
/// Additions wrap on overflow, so long or non-ASCII inputs never panic in
/// builds with overflow checks enabled. The sum is accumulated in `u64` and
/// cast to `usize` once at the end, which keeps the function compilable on
/// targets where `usize` is narrower than 8 bytes.
///
/// Because the groups are summed, inputs whose full 8-byte groups are merely
/// reordered produce the same hash; callers must tolerate such collisions.
#[inline]
pub fn hashstr(s: &str) -> usize {
    let mut groups = s.as_bytes().chunks_exact(8);
    let mut hash = 0_u64;
    for group in groups.by_ref() {
        let mut word = [0_u8; 8];
        word.copy_from_slice(group);
        hash = hash.wrapping_add(u64::from_be_bytes(word));
    }

    // The remainder is at most 7 bytes long, so `1..=len` always fits in `tail`.
    let remainder = groups.remainder();
    let mut tail = [0_u8; 8];
    tail[0] = remainder.len() as u8;
    tail[1..=remainder.len()].copy_from_slice(remainder);

    hash.wrapping_add(u64::from_be_bytes(tail)) as usize
}
 | 
			
		||||
 | 
			
		||||
#[cfg(test)]
 | 
			
		||||
mod tests {
 | 
			
		||||
    use crate::utils::parse::{hashstr, temp_new};
 | 
			
		||||
    use crate::utils::parse::{hashbytes, hashstr, temp_new};
 | 
			
		||||
 | 
			
		||||
    #[test]
 | 
			
		||||
    fn test_temp_new_max() {
 | 
			
		||||
@@ -118,12 +136,23 @@ mod tests {
 | 
			
		||||
        assert_eq!(temp_neg_10, -99);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    /// Byte strings that are permutations of one another must hash differently.
    #[test]
    fn test_hashbytes() {
        // Eleven bytes: one full 8-byte group plus a 3-byte remainder.
        let forward = hashbytes(b"abcdefghijk");
        let backward = hashbytes(b"kjihgfedcba");
        // Four bytes: remainder only, no full group.
        let abba = hashbytes(b"abba");
        let baab = hashbytes(b"baab");

        assert_ne!(forward, backward);
        assert_ne!(abba, baab);
    }
 | 
			
		||||
 | 
			
		||||
    #[test]
 | 
			
		||||
    fn test_hashstr() {
 | 
			
		||||
        let hash_1 = hashstr(b"abcdefghijk");
 | 
			
		||||
        let hash_2 = hashstr(b"kjihgfedcba");
 | 
			
		||||
        let hash_3 = hashstr(b"abba");
 | 
			
		||||
        let hash_4 = hashstr(b"baab");
 | 
			
		||||
        let hash_1 = hashstr("abcdefghijk");
 | 
			
		||||
        let hash_2 = hashstr("kjihgfedcba");
 | 
			
		||||
        let hash_3 = hashstr("abba");
 | 
			
		||||
        let hash_4 = hashstr("baab");
 | 
			
		||||
 | 
			
		||||
        assert_ne!(hash_1, hash_2);
 | 
			
		||||
        assert_ne!(hash_3, hash_4);
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user