Use the std `read_until` method instead of the custom `read_bytes_until` function, because it is faster than my own implementation built on the `Bytes` struct

This commit is contained in:
Fabian Schmidt 2024-07-25 15:35:10 +02:00
parent c6b8273d65
commit 5aa94e67d1
3 changed files with 42 additions and 41 deletions

View File

@ -4,10 +4,11 @@ use std::{
thread, thread,
}; };
use std::collections::HashMap; use std::collections::HashMap;
use std::io::{Read, Seek, SeekFrom}; use std::io::{BufRead, Seek, SeekFrom};
use std::sync::mpsc; use std::sync::mpsc;
use std::time::Instant; use std::time::Instant;
use onebrc::{parse_line, parse_temp, read_bytes_until};
use onebrc::{parse_line, parse_temp};
const DEFAULT_HASHMAP_LENGTH: usize = 10000; const DEFAULT_HASHMAP_LENGTH: usize = 10000;
@ -28,18 +29,9 @@ fn main() {
let mut reader = BufReader::new(&file); let mut reader = BufReader::new(&file);
let mut byte_start = chunk_length * i; let mut byte_start = chunk_length * i;
reader.seek(SeekFrom::Start(byte_start as u64)).expect("could not seek"); reader.seek(SeekFrom::Start(byte_start as u64)).expect("could not seek");
let bytes = reader.bytes(); let mut line = Vec::with_capacity(108);
for byte in bytes { let line_len = reader.read_until(b'\n', &mut line).expect("could not read bytes");
match byte { byte_start += line_len;
Ok(byte) => {
byte_start += 1;
if byte == b'\n' {
break;
}
}
Err(_) => { panic!("could not go to next") }
}
}
bounds.push(byte_start as u64); bounds.push(byte_start as u64);
} }
bounds.push(file_length); bounds.push(file_length);
@ -51,12 +43,14 @@ fn main() {
let file = File::open(FILE_PATH).expect("File measurements.txt not found"); let file = File::open(FILE_PATH).expect("File measurements.txt not found");
let mut reader = BufReader::new(&file); let mut reader = BufReader::new(&file);
reader.seek(SeekFrom::Start(currposition)).unwrap(); reader.seek(SeekFrom::Start(currposition)).unwrap();
let mut bytes = reader.bytes();
let mut t_stations: HashMap<String, onebrc::StationMeasurements> = let mut t_stations: HashMap<String, onebrc::StationMeasurements> =
HashMap::with_capacity(DEFAULT_HASHMAP_LENGTH); HashMap::with_capacity(DEFAULT_HASHMAP_LENGTH);
let mut line = Vec::with_capacity(108);
while let Some(line) = read_bytes_until(&mut bytes, b'\n') { loop {
let line_len = reader.read_until(b'\n', &mut line).expect("could not read bytes");
if line_len == 0 {
break;
}
let (station, temp) = parse_line(&line); let (station, temp) = parse_line(&line);
let station = unsafe { String::from_utf8_unchecked(Vec::from(station)) }; let station = unsafe { String::from_utf8_unchecked(Vec::from(station)) };
let temp = parse_temp(temp); let temp = parse_temp(temp);
@ -76,6 +70,7 @@ fn main() {
if currposition >= end { if currposition >= end {
break; break;
} }
line.clear();
} }
let _ = tx.send(t_stations); let _ = tx.send(t_stations);
}); });

View File

@ -1,9 +1,9 @@
use std::collections::HashMap; use std::collections::HashMap;
use std::fs::File; use std::fs::File;
use std::io::{BufReader, Read}; use std::io::{BufRead, BufReader};
use std::time::Instant; use std::time::Instant;
use onebrc::{parse_line, read_bytes_until}; use onebrc::parse_line;
const DEFAULT_HASHMAP_LENGTH: usize = 10000; const DEFAULT_HASHMAP_LENGTH: usize = 10000;
@ -13,8 +13,13 @@ fn main() {
HashMap::with_capacity(DEFAULT_HASHMAP_LENGTH); HashMap::with_capacity(DEFAULT_HASHMAP_LENGTH);
let file = File::open("../../../measurements.txt").expect("File measurements.txt not found"); let file = File::open("../../../measurements.txt").expect("File measurements.txt not found");
let mut bytes = BufReader::new(&file).bytes(); let mut reader = BufReader::new(&file);
while let Some(line) = read_bytes_until(&mut bytes, b'\n') { let mut line = Vec::with_capacity(108);
loop {
let line_len = reader.read_until(b'\n', &mut line).expect("could not read bytes");
if line_len == 0 {
break;
}
let (station, temp) = parse_line(&line); let (station, temp) = parse_line(&line);
let station = unsafe { String::from_utf8_unchecked(Vec::from(station)) }; let station = unsafe { String::from_utf8_unchecked(Vec::from(station)) };
let temp = onebrc::parse_temp(temp); let temp = onebrc::parse_temp(temp);
@ -30,6 +35,7 @@ fn main() {
}; };
stations.insert(station, measurements); stations.insert(station, measurements);
} }
line.clear();
} }
let mut stations: Vec<String> = stations.iter().map(|(station, measurements)| { let mut stations: Vec<String> = stations.iter().map(|(station, measurements)| {
let measurements = measurements.to_string(); let measurements = measurements.to_string();

View File

@ -1,6 +1,4 @@
use std::fmt::Display; use std::fmt::Display;
use std::fs::File;
use std::io::{BufReader, Bytes};
#[derive(Copy, Clone)] #[derive(Copy, Clone)]
pub struct StationMeasurements { pub struct StationMeasurements {
@ -68,28 +66,30 @@ pub fn parse_temp(bytes: &[u8]) -> isize {
} }
} }
#[inline] // Reading byte-by-byte through the Bytes struct is slower than the std read_until method, which appends each line into a caller-supplied Vec
pub fn read_bytes_until(bytes: &mut Bytes<BufReader<&File>>, delimiter: u8) -> Option<Vec<u8>> { // #[inline]
let mut buf: Vec<u8> = Vec::with_capacity(108); // pub fn read_bytes_until(bytes: &mut Bytes<BufReader<&File>>, delimiter: u8) -> Option<Vec<u8>> {
for byte in bytes { // let mut buf: Vec<u8> = Vec::with_capacity(108);
if byte.is_err() { // for byte in bytes {
panic!("Could not read byte"); // if byte.is_err() {
} // panic!("Could not read byte");
let byte = byte.unwrap(); // }
if delimiter == byte { // let byte = byte.unwrap();
return Some(buf); // if delimiter == byte {
} // return Some(buf);
buf.push(byte); // }
} // buf.push(byte);
None // }
} // None
// }
#[inline] #[inline]
pub fn parse_line(line: &[u8]) -> (&[u8], &[u8]) { pub fn parse_line(line: &[u8]) -> (&[u8], &[u8]) {
let mut idx = 0; let mut idx = 0;
while idx < line.len() && line[idx] != b';' { let line_len = line.len();
while idx < line_len && line[idx] != b';' {
idx += 1; idx += 1;
} }
let station = &line[0..idx]; let station = &line[0..idx];
(station, &line[(idx + 1)..]) (station, &line[(idx+1)..(line_len-1)])
} }