Store the station name as a byte slice and convert it to a string only once at the end; keying the map by the u64 hash is still faster than keying by the byte slice itself

This commit is contained in:
Fabian Schmidt 2024-12-31 11:24:31 +01:00
parent 98cd6e930c
commit 45b3014cbb

View File

@ -17,7 +17,8 @@ pub fn run() {
let mmap_ptr = mmap.as_ptr(); let mmap_ptr = mmap.as_ptr();
let file_length = mmap.len(); let file_length = mmap.len();
let hasher = FxBuildHasher; let hasher = FxBuildHasher;
let mut stations: HashMap<u64, (String, StationMeasurements)> = // Even if I could now just use the byte slice as a key, doing the hash is still faster
let mut stations: HashMap<u64, (&[u8], StationMeasurements)> =
HashMap::with_capacity_and_hasher(DEFAULT_HASHMAP_LENGTH, hasher); HashMap::with_capacity_and_hasher(DEFAULT_HASHMAP_LENGTH, hasher);
let (tx, rx) = mpsc::channel(); let (tx, rx) = mpsc::channel();
let cores = thread::available_parallelism().unwrap().into(); let cores = thread::available_parallelism().unwrap().into();
@ -43,7 +44,7 @@ pub fn run() {
let (start, end) = *bounds.get(i).unwrap(); let (start, end) = *bounds.get(i).unwrap();
let mmap_slice = unsafe { from_raw_parts(mmap_ptr.add(start), end - start) }; let mmap_slice = unsafe { from_raw_parts(mmap_ptr.add(start), end - start) };
s.spawn(move || { s.spawn(move || {
let mut t_stations: HashMap<u64, (String, StationMeasurements)> = let mut t_stations: HashMap<u64, (&[u8], StationMeasurements)> =
HashMap::with_capacity_and_hasher(DEFAULT_HASHMAP_LENGTH, hasher); HashMap::with_capacity_and_hasher(DEFAULT_HASHMAP_LENGTH, hasher);
for line in mmap_slice.split(|&byte| byte == b'\n') { for line in mmap_slice.split(|&byte| byte == b'\n') {
if line.is_empty() { if line.is_empty() {
@ -51,7 +52,6 @@ pub fn run() {
} }
let (station, temp) = line.rsplit_once(|&byte| byte == b';').unwrap(); let (station, temp) = line.rsplit_once(|&byte| byte == b';').unwrap();
let hash = hash::bytes(station); let hash = hash::bytes(station);
let station = unsafe { std::str::from_utf8_unchecked(station) };
let temp = parse::temp(temp); let temp = parse::temp(temp);
let measurements_option = t_stations.get_mut(&hash); let measurements_option = t_stations.get_mut(&hash);
if let Some((_, measurements)) = measurements_option { if let Some((_, measurements)) = measurements_option {
@ -63,7 +63,7 @@ pub fn run() {
count: 1, count: 1,
sum: temp, sum: temp,
}; };
t_stations.insert(hash, (station.to_string(), measurements)); t_stations.insert(hash, (station, measurements));
} }
} }
let _ = tx.send(t_stations); let _ = tx.send(t_stations);
@ -76,13 +76,14 @@ pub fn run() {
if let Some((_, joined_measurements)) = joined_measurements_options { if let Some((_, joined_measurements)) = joined_measurements_options {
joined_measurements.merge(measurements); joined_measurements.merge(measurements);
} else { } else {
stations.insert(*hash, (station.to_owned(), *measurements)); stations.insert(*hash, (station, *measurements));
} }
} }
} }
let mut stations: Vec<String> = stations let mut stations: Vec<String> = stations
.iter() .iter()
.map(|(_, (station, measurements))| { .map(|(_, (station, measurements))| {
let station = unsafe { std::str::from_utf8_unchecked(station) };
let measurements = measurements.to_string(); let measurements = measurements.to_string();
#[cfg(feature = "json")] #[cfg(feature = "json")]
{ {