Use the std `read_until` method instead of the custom `read_bytes_until` function, because it is faster than my implementation based on the `Bytes` struct

This commit is contained in:
Fabian Schmidt 2024-07-25 15:35:10 +02:00
parent c6b8273d65
commit 5aa94e67d1
3 changed files with 42 additions and 41 deletions

View File

@ -4,10 +4,11 @@ use std::{
thread,
};
use std::collections::HashMap;
use std::io::{Read, Seek, SeekFrom};
use std::io::{BufRead, Seek, SeekFrom};
use std::sync::mpsc;
use std::time::Instant;
use onebrc::{parse_line, parse_temp, read_bytes_until};
use onebrc::{parse_line, parse_temp};
const DEFAULT_HASHMAP_LENGTH: usize = 10000;
@ -28,18 +29,9 @@ fn main() {
let mut reader = BufReader::new(&file);
let mut byte_start = chunk_length * i;
reader.seek(SeekFrom::Start(byte_start as u64)).expect("could not seek");
let bytes = reader.bytes();
for byte in bytes {
match byte {
Ok(byte) => {
byte_start += 1;
if byte == b'\n' {
break;
}
}
Err(_) => { panic!("could not go to next") }
}
}
let mut line = Vec::with_capacity(108);
let line_len = reader.read_until(b'\n', &mut line).expect("could not read bytes");
byte_start += line_len;
bounds.push(byte_start as u64);
}
bounds.push(file_length);
@ -51,12 +43,14 @@ fn main() {
let file = File::open(FILE_PATH).expect("File measurements.txt not found");
let mut reader = BufReader::new(&file);
reader.seek(SeekFrom::Start(currposition)).unwrap();
let mut bytes = reader.bytes();
let mut t_stations: HashMap<String, onebrc::StationMeasurements> =
HashMap::with_capacity(DEFAULT_HASHMAP_LENGTH);
while let Some(line) = read_bytes_until(&mut bytes, b'\n') {
let mut line = Vec::with_capacity(108);
loop {
let line_len = reader.read_until(b'\n', &mut line).expect("could not read bytes");
if line_len == 0 {
break;
}
let (station, temp) = parse_line(&line);
let station = unsafe { String::from_utf8_unchecked(Vec::from(station)) };
let temp = parse_temp(temp);
@ -76,6 +70,7 @@ fn main() {
if currposition >= end {
break;
}
line.clear();
}
let _ = tx.send(t_stations);
});

View File

@ -1,9 +1,9 @@
use std::collections::HashMap;
use std::fs::File;
use std::io::{BufReader, Read};
use std::io::{BufRead, BufReader};
use std::time::Instant;
use onebrc::{parse_line, read_bytes_until};
use onebrc::parse_line;
const DEFAULT_HASHMAP_LENGTH: usize = 10000;
@ -13,8 +13,13 @@ fn main() {
HashMap::with_capacity(DEFAULT_HASHMAP_LENGTH);
let file = File::open("../../../measurements.txt").expect("File measurements.txt not found");
let mut bytes = BufReader::new(&file).bytes();
while let Some(line) = read_bytes_until(&mut bytes, b'\n') {
let mut reader = BufReader::new(&file);
let mut line = Vec::with_capacity(108);
loop {
let line_len = reader.read_until(b'\n', &mut line).expect("could not read bytes");
if line_len == 0 {
break;
}
let (station, temp) = parse_line(&line);
let station = unsafe { String::from_utf8_unchecked(Vec::from(station)) };
let temp = onebrc::parse_temp(temp);
@ -30,6 +35,7 @@ fn main() {
};
stations.insert(station, measurements);
}
line.clear();
}
let mut stations: Vec<String> = stations.iter().map(|(station, measurements)| {
let measurements = measurements.to_string();

View File

@ -1,6 +1,4 @@
use std::fmt::Display;
use std::fs::File;
use std::io::{BufReader, Bytes};
#[derive(Copy, Clone)]
pub struct StationMeasurements {
@ -68,28 +66,30 @@ pub fn parse_temp(bytes: &[u8]) -> isize {
}
}
#[inline]
pub fn read_bytes_until(bytes: &mut Bytes<BufReader<&File>>, delimiter: u8) -> Option<Vec<u8>> {
let mut buf: Vec<u8> = Vec::with_capacity(108);
for byte in bytes {
if byte.is_err() {
panic!("Could not read byte");
}
let byte = byte.unwrap();
if delimiter == byte {
return Some(buf);
}
buf.push(byte);
}
None
}
// Reading through the `Bytes` struct is slower than the std `read_until` method, which uses a `Vec` instead of a slice.
// #[inline]
// pub fn read_bytes_until(bytes: &mut Bytes<BufReader<&File>>, delimiter: u8) -> Option<Vec<u8>> {
// let mut buf: Vec<u8> = Vec::with_capacity(108);
// for byte in bytes {
// if byte.is_err() {
// panic!("Could not read byte");
// }
// let byte = byte.unwrap();
// if delimiter == byte {
// return Some(buf);
// }
// buf.push(byte);
// }
// None
// }
#[inline]
pub fn parse_line(line: &[u8]) -> (&[u8], &[u8]) {
let mut idx = 0;
while idx < line.len() && line[idx] != b';' {
let line_len = line.len();
while idx < line_len && line[idx] != b';' {
idx += 1;
}
let station = &line[0..idx];
(station, &line[(idx + 1)..])
(station, &line[(idx+1)..(line_len-1)])
}