diff --git a/src/main/rust/src/bin/single_thread.rs b/src/main/rust/src/bin/single_thread.rs
index 2e6d088..8f6dda9 100644
--- a/src/main/rust/src/bin/single_thread.rs
+++ b/src/main/rust/src/bin/single_thread.rs
@@ -4,7 +4,8 @@ use std::collections::HashMap;
 use std::fs::File;
 use std::io::{BufRead, BufReader};
 use std::time::Instant;
-use onebrc::{hashstr, StationMeasurements};
+
+use onebrc::{hashstr, parse_temp, StationMeasurements};
 
 const DEFAULT_HASHMAP_LENGTH: usize = 10000;
 
@@ -24,7 +25,7 @@ fn main() {
         let (station, temp) = line.rsplit_once(|&byte| byte == b';').unwrap();
         let hash = hashstr(station);
         let station = unsafe { String::from_utf8_unchecked(Vec::from(station)) };
-        let temp = onebrc::parse_temp(temp.split_last().unwrap().1);
+        let temp = parse_temp(temp.split_last().unwrap().1);
         let measurements_option = stations.get_mut(&hash);
         if let Some((_, measurements)) = measurements_option {
             measurements.update(temp);
diff --git a/src/main/rust/src/lib.rs b/src/main/rust/src/lib.rs
index 6d1394b..588c78d 100644
--- a/src/main/rust/src/lib.rs
+++ b/src/main/rust/src/lib.rs
@@ -47,9 +47,9 @@ pub fn format_nums(num: usize) -> String {
 }
 
 #[inline]
-pub const fn get_digit(b: u8) -> u32 {
+pub const fn get_digit(b: u8) -> isize {
     // wrapping_sub('0' as u32) same as - 48 but less magical
-    (b as u32).wrapping_sub('0' as u32)
+    (b as isize).wrapping_sub('0' as isize)
 }
 
 #[inline]
@@ -63,9 +63,9 @@ pub fn parse_temp(bytes: &[u8]) -> isize {
         _x => panic!("could not parse temp: is_negative = {is_negative}, length = {}", bytes.len()),
     };
     if is_negative {
-        -(as_decimal as isize)
+        -as_decimal
     } else {
-        as_decimal as isize
+        as_decimal
     }
 }
 
@@ -123,32 +123,116 @@ pub fn new_parse_temp(bytes: &[u8]) -> isize {
 // }
 
 #[inline]
-pub fn parse_line(line: &[u8]) -> (&[u8], &[u8]) {
-    let mut idx = 0;
-    let line_len = line.len();
-    while idx < line_len && line[idx] != b';' {
-        idx += 1;
+/// Returns the index of the first byte in `bytes` equal to `find`, or `None`
+/// when `find` does not occur.
+///
+/// Full 4-byte chunks are tested a word at a time; the remaining 0-3 bytes
+/// are scanned one by one, so inputs shorter than four bytes are searched too.
+pub fn get_pos(bytes: &[u8], find: u8) -> Option<u32> {
+    let chunks = bytes.chunks_exact(4);
+    let tail = chunks.remainder();
+    let mut pos = 0_u32;
+    for chunk in chunks {
+        let inner_pos = get_pos_in_chunk(chunk, find);
+        if inner_pos < chunk.len() as u32 {
+            return Some(pos + inner_pos);
+        }
+        pos += 4;
     }
-    let station = &line[0..idx];
-    (station, &line[(idx+1)..(line_len-1)])
+    tail.iter().position(|&b| b == find).map(|i| pos + i as u32)
+}
+
+/// Returns the index (0-3) of the first byte of the 4-byte `byte_chunk` equal
+/// to `find`, or 4 when no byte matches.
+///
+/// Panics if `byte_chunk` is not exactly four bytes long.
+#[inline]
+fn get_pos_in_chunk(byte_chunk: &[u8], find: u8) -> u32 {
+    let find_hex = u32::from_be_bytes([find; 4]);
+    // XOR turns every byte equal to `find` into 0x00.
+    let x = u32::from_be_bytes(byte_chunk.try_into().unwrap()) ^ find_hex;
+    // Exact zero-byte test: a mask byte gets its high bit set only where the
+    // byte of `x` is zero. Unlike `(x - 0x01010101) & !x & 0x80808080` it
+    // cannot overflow and never flags a 0x01 byte sitting before a real match.
+    let mask = !(((x & 0x7F7F7F7F) + 0x7F7F7F7F) | x | 0x7F7F7F7F);
+    // Big-endian load: the first byte is the most significant one, so the
+    // number of leading zero bits divided by 8 is the index of the first match.
+    u32::leading_zeros(mask) >> 3
 }
 
 #[cfg(test)]
 mod tests {
-    use crate::new_parse_temp;
+    use crate::{get_pos, hashstr, new_parse_temp};
 
     #[test]
-    fn test_new_parse_temp() {
+    fn test_new_parse_temp_max() {
         let temp_max = new_parse_temp("99.9".as_bytes());
-        let temp_min = new_parse_temp("-99.9".as_bytes());
-        let temp_0 = new_parse_temp("0.0".as_bytes());
-        let temp_10 = new_parse_temp("10.0".as_bytes());
-        let temp_neg_10 = new_parse_temp("-10.0".as_bytes());
-
         assert_eq!(temp_max, 999);
+    }
+
+    #[test]
+    fn test_new_parse_temp_min() {
+        let temp_min = new_parse_temp("-99.9".as_bytes());
         assert_eq!(temp_min, -999);
+    }
+
+    #[test]
+    fn test_new_parse_temp_zero() {
+        let temp_0 = new_parse_temp("0.0".as_bytes());
         assert_eq!(temp_0, 0);
-        assert_eq!(temp_10, 100);
-        assert_eq!(temp_neg_10, -100);
+    }
+
+    #[test]
+    fn test_new_parse_temp_pos() {
+        let temp_10 = new_parse_temp("9.9".as_bytes());
+        assert_eq!(temp_10, 99);
+    }
+
+    #[test]
+    fn test_new_parse_temp_neg() {
+        let temp_neg_10 = new_parse_temp("-9.9".as_bytes());
+        assert_eq!(temp_neg_10, -99);
+    }
+
+    #[test]
+    fn test_hashstr() {
+        let hash_1 = hashstr(b"abcdefghijk");
+        let hash_2 = hashstr(b"kjihgfedcba");
+        let hash_3 = hashstr(b"abba");
+        let hash_4 = hashstr(b"baab");
+
+        assert_ne!(hash_1, hash_2);
+        assert_ne!(hash_3, hash_4);
+    }
+
+    #[test]
+    fn test_getpos() {
+        let semi_bytes = vec![0_u8, 0_u8, 0_u8, 0_u8, 0_u8, 0_u8, 0_u8, 0_u8, b';', 0_u8, 0_u8];
+        let semi_bytes = semi_bytes.as_slice();
+        let pos = get_pos(semi_bytes, b';').unwrap();
+        assert_eq!(pos, 8);
+    }
+
+    #[test]
+    fn test_getpos_short_input() {
+        assert_eq!(get_pos(b"a;", b';'), Some(1));
+    }
+
+    #[test]
+    fn test_getpos_first_byte() {
+        assert_eq!(get_pos(b";abc", b';'), Some(0));
+    }
+
+    #[test]
+    fn test_getpos_after_colon() {
+        assert_eq!(get_pos(b"ab:;", b';'), Some(3));
+    }
+
+    #[test]
+    fn test_getpos_empty() {
+        let semi_bytes = vec![0_u8, 0_u8, 0_u8, 0_u8, 0_u8, 0_u8, 0_u8, 0_u8, 0_u8, 0_u8];
+        let semi_bytes = semi_bytes.as_slice();
+        let pos = get_pos(semi_bytes, b';');
+        assert_eq!(pos, None);
     }
 }