My multi-threaded version is now faster than Polars and takes less time to compile. It's a little more complex, though.
This commit is contained in:
parent dcaca0cc65
commit e230a5ce2c

src/main/rust/Cargo.lock (generated): 1264 lines changed
File diff suppressed because it is too large
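The gist of the new approach in this commit: split the input file into one byte range per core, nudge each boundary forward to the next newline so no line straddles two chunks, let every thread aggregate its chunk into a thread-local map, and merge the maps received over an mpsc channel at the end. Below is a minimal, self-contained sketch of that idea, not the code from the diff itself; it leaves out the onebrc helper parsers, the pre-sized HashMap, and the JSON output, and the measurements.txt path and the station;temp line format are assumptions carried over from the diff.

use std::collections::HashMap;
use std::fs::File;
use std::io::{BufRead, BufReader, Seek, SeekFrom};
use std::sync::mpsc;
use std::thread;

fn main() {
    // Hypothetical path; the commit itself reads ../../../measurements.txt.
    let path = "measurements.txt";
    let cores: usize = thread::available_parallelism().unwrap().into();
    let file_length = File::open(path).unwrap().metadata().unwrap().len();
    let chunk = file_length / cores as u64;

    // One byte range per core, each boundary pushed forward to the next '\n'
    // so that no line is split between two threads.
    let mut bounds = vec![0u64];
    for i in 1..cores {
        let mut reader = BufReader::new(File::open(path).unwrap());
        reader.seek(SeekFrom::Start(chunk * i as u64)).unwrap();
        let mut rest_of_line = Vec::new();
        reader.read_until(b'\n', &mut rest_of_line).unwrap();
        bounds.push(chunk * i as u64 + rest_of_line.len() as u64);
    }
    bounds.push(file_length);

    let (tx, rx) = mpsc::channel();
    thread::scope(|s| {
        for i in 0..cores {
            let tx = tx.clone();
            let (start, end) = (bounds[i], bounds[i + 1]);
            s.spawn(move || {
                let mut reader = BufReader::new(File::open(path).unwrap());
                reader.seek(SeekFrom::Start(start)).unwrap();
                // Thread-local (min, max, sum, count) per station.
                let mut local: HashMap<String, (f64, f64, f64, u64)> = HashMap::new();
                let mut pos = start;
                let mut line = String::new();
                while pos < end {
                    line.clear();
                    let n = reader.read_line(&mut line).unwrap();
                    if n == 0 {
                        break; // end of file
                    }
                    pos += n as u64;
                    if let Some((station, temp)) = line.trim_end().split_once(';') {
                        let t: f64 = temp.parse().unwrap();
                        let e = local.entry(station.to_owned()).or_insert((t, t, 0.0, 0));
                        e.0 = e.0.min(t);
                        e.1 = e.1.max(t);
                        e.2 += t;
                        e.3 += 1;
                    }
                }
                tx.send(local).unwrap();
            });
        }
        drop(tx); // so the receive loop below terminates once all threads finish
    });
    // Merge the per-thread maps into one result.
    let mut merged: HashMap<String, (f64, f64, f64, u64)> = HashMap::new();
    for local in rx {
        for (station, (min, max, sum, count)) in local {
            let e = merged.entry(station).or_insert((min, max, 0.0, 0));
            e.0 = e.0.min(min);
            e.1 = e.1.max(max);
            e.2 += sum;
            e.3 += count;
        }
    }
    for (station, (min, max, sum, count)) in merged {
        println!("{station}={min}/{mean:.1}/{max}", mean = sum / count as f64);
    }
}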
@@ -10,7 +10,7 @@ bstr = "1.9.1"
 fast-float = "0.2.0"
 memchr = "2.7.4"
 memmap = "0.7.0"
-#polars = { version = "0.36.2", features = ["csv", "lazy", "nightly", "streaming"]}
+polars = { version = "0.36.2", features = ["csv", "lazy", "nightly", "streaming"]}
 rayon = "1.10.0"
 rustc-hash = "2.0.0"
@@ -1,106 +0,0 @@
-use std::{
-    fs::File,
-    io::BufReader,
-    thread,
-};
-use std::collections::HashMap;
-use std::io::{Read, Seek, SeekFrom};
-use std::sync::mpsc;
-use onebrc::{parse_line, parse_temp, read_bytes_until};
-
-const DEFAULT_HASHMAP_LENGTH: usize = 10000;
-
-fn main() {
-    const FILE_PATH: &str = "../../../measurements.txt";
-    let file = File::open(FILE_PATH).expect("File measurements.txt not found");
-    thread::scope(|s| {
-        let mut stations: HashMap<String, onebrc::StationMeasurements> = HashMap::with_capacity(DEFAULT_HASHMAP_LENGTH);
-        let (tx, rx) = mpsc::channel();
-        let cores = thread::available_parallelism().unwrap().into();
-        let mut reader = BufReader::new(&file);
-        let file_length = reader.seek(SeekFrom::End(0)).unwrap();
-        let chunk_length = file_length as usize / cores;
-        reader.seek(SeekFrom::Start(0)).unwrap();
-        for i in 0..cores {
-            let tx = tx.clone();
-            s.spawn(move || {
-                let file = File::open(FILE_PATH).expect("File measurements.txt not found");
-                let mut reader = BufReader::new(&file);
-                let mut currposition = (chunk_length * i) as u64;
-                let end = currposition + chunk_length as u64;
-                reader.seek(SeekFrom::Start(currposition)).unwrap();
-                let mut bytes = reader.bytes();
-
-                while let Some(byte) = bytes.next() {
-                    match byte {
-                        Ok(byte) => {
-                            if byte == b'\n' {
-                                break;
-                            }
-                        }
-                        Err(_) => { panic!("could not go to next") }
-                    }
-                }
-
-                let mut t_stations: HashMap<String, onebrc::StationMeasurements> =
-                    HashMap::with_capacity(DEFAULT_HASHMAP_LENGTH);
-
-                while let Some(line) = read_bytes_until(&mut bytes, b'\n') {
-                    let (station, temp) = parse_line(&line);
-                    let station = unsafe { String::from_utf8_unchecked(Vec::from(station)) };
-                    let temp = parse_temp(temp);
-                    let measurements_option = t_stations.get_mut(&station);
-                    if let Some(measurements) = measurements_option {
-                        measurements.update(temp);
-                    } else {
-                        let measurements = onebrc::StationMeasurements {
-                            min: temp,
-                            max: temp,
-                            count: 1,
-                            sum: temp,
-                        };
-                        t_stations.insert(station, measurements);
-                    }
-                    currposition += line.len() as u64;
-                    if currposition > end {
-                        break;
-                    }
-                }
-                let _ = tx.send(t_stations);
-            });
-        }
-        drop(tx);
-        while let Ok(t_stations) = rx.recv() {
-            for (station, measurements) in t_stations.iter() {
-                let joined_measurements_options = stations.get_mut(station.as_str());
-                if let Some(joined_measurements) = joined_measurements_options {
-                    joined_measurements.merge(measurements);
-                } else {
-                    stations.insert(station.to_owned(), *measurements);
-                }
-            }
-        }
-        let mut stations: Vec<String> = stations.iter().map(|(station, measurements)| {
-            let measurements = measurements.to_string();
-            #[cfg(feature = "json")]
-            {
-                format!("{{\"{station}\":\"{measurements}\"}}")
-            }
-            #[cfg(not(feature = "json"))]
-            {
-                format!("{station}={measurements}")
-            }
-        }).collect();
-        stations.sort();
-        let stations = stations.join(",");
-        #[cfg(feature = "json")]
-        {
-            println!("\n\n[{stations}]");
-        }
-        #[cfg(not(feature = "json"))]
-        {
-            println!("\n\n{{{stations}}}");
-        }
-        // println!("\n\nTime={} ms", now.elapsed().as_millis());
-    });
-}
@@ -1,53 +1,67 @@
 use std::{
     fs::File,
-    io::{BufRead, BufReader},
+    io::BufReader,
     thread,
 };
 use std::collections::HashMap;
+use std::io::{Read, Seek, SeekFrom};
 use std::sync::mpsc;
 use std::time::Instant;
-use onebrc::parse_temp;
+use onebrc::{parse_line, parse_temp, read_bytes_until};
 
 const DEFAULT_HASHMAP_LENGTH: usize = 10000;
 
 fn main() {
-    //print!("\x1b[2J");
-    print!("\x1b[s");
+    const FILE_PATH: &str = "../../../measurements.txt";
+    let now = Instant::now();
     thread::scope(|s| {
         let mut stations: HashMap<String, onebrc::StationMeasurements> = HashMap::with_capacity(DEFAULT_HASHMAP_LENGTH);
         let (tx, rx) = mpsc::channel();
-        let now = Instant::now();
-        let cores: usize = thread::available_parallelism().unwrap().into();
-        let chunk_length = 1_000_000_000 / cores;
+        let cores = thread::available_parallelism().unwrap().into();
+        let file = File::open(FILE_PATH).expect("File measurements.txt not found");
+        let mut reader = BufReader::new(&file);
+        let file_length = reader.seek(SeekFrom::End(0)).unwrap();
+        println!("file_length = {file_length}");
+        let chunk_length = file_length as usize / cores;
+        let mut bounds = Vec::with_capacity(cores + 1);
+        bounds.push(0);
+        for i in 1..cores {
+            let mut reader = BufReader::new(&file);
+            let mut byte_start = chunk_length * i;
+            reader.seek(SeekFrom::Start(byte_start as u64)).expect("could not seek");
+            let mut bytes = reader.bytes();
+            while let Some(byte) = bytes.next() {
+                match byte {
+                    Ok(byte) => {
+                        byte_start += 1;
+                        if byte == b'\n' {
+                            break;
+                        }
+                    }
+                    Err(_) => { panic!("could not go to next") }
+                }
+            }
+            bounds.push(byte_start as u64);
+        }
+        bounds.push(file_length);
         for i in 0..cores {
-            let print_line = i + 1;
-            print!("\x1b[u\x1b[{print_line}B\x1b[0CThread #{i:0>2}: 0%");
-            let file = File::open("../../../measurements.txt").expect("File measurements.txt not found");
-            let reader = BufReader::new(file);
-            let line_chunk = reader.lines().skip(chunk_length * i).take(chunk_length);
             let tx = tx.clone();
+            let mut currposition = *bounds.get(i).unwrap();
+            let end = *bounds.get(i+1).unwrap();
             s.spawn(move || {
+                let file = File::open(FILE_PATH).expect("File measurements.txt not found");
+                let mut reader = BufReader::new(&file);
+                reader.seek(SeekFrom::Start(currposition)).unwrap();
+                let mut bytes = reader.bytes();
 
                 let mut t_stations: HashMap<String, onebrc::StationMeasurements> =
                     HashMap::with_capacity(DEFAULT_HASHMAP_LENGTH);
 
-                let now_read_line = Instant::now();
-                let mut line_num = 0;
-                line_chunk.for_each(|line| {
-                    if line_num == 0 {
-                        print!("\x1b[u\x1b[{print_line}B\x1b[30CStart read line {}ms", now_read_line.elapsed().as_millis());
-                    }
-                    if line_num % 10000 == 0 {
-                        //let formatted = format_nums(line_num);
-                        //print!("\x1b[u\x1b[{print_line}B\x1b[0CThread #{i:0>2}: {formatted}");
-                        let percent = (line_num as f64 / chunk_length as f64) * 100.0;
-                        print!("\x1b[u\x1b[{print_line}B\x1b[0CThread #{i:0>2}: {percent:.2}%");
-                    }
-                    line_num += 1;
-                    let line = line.expect("could not read line");
-                    let (station, temp) = line.split_once(';').expect("Error while splitting");
-                    let temp = parse_temp(temp.as_bytes());
-                    let measurements_option = t_stations.get_mut(station);
+                while let Some(line) = read_bytes_until(&mut bytes, b'\n') {
+                    let (station, temp) = parse_line(&line);
+                    let station = unsafe { String::from_utf8_unchecked(Vec::from(station)) };
+                    let temp = parse_temp(temp);
+                    let measurements_option = t_stations.get_mut(&station);
                     if let Some(measurements) = measurements_option {
                         measurements.update(temp);
                     } else {
@@ -57,14 +71,16 @@ fn main() {
                             count: 1,
                             sum: temp,
                         };
-                        t_stations.insert(station.to_owned(), measurements);
+                        t_stations.insert(station, measurements);
+                    }
+                    currposition += line.len() as u64;
+                    if currposition >= end {
+                        break;
+                    }
                 }
-                });
-                print!("\x1b[u\x1b[{print_line}B\x1b[60CTime reading lines in thread {i}={} ms", now_read_line.elapsed().as_millis());
                 let _ = tx.send(t_stations);
             });
         }
-        print!("\x1b[{cores}B");
         drop(tx);
         while let Ok(t_stations) = rx.recv() {
             for (station, measurements) in t_stations.iter() {
@@ -78,11 +94,25 @@ fn main() {
         }
         let mut stations: Vec<String> = stations.iter().map(|(station, measurements)| {
             let measurements = measurements.to_string();
+            #[cfg(feature = "json")]
+            {
+                format!("{{\"{station}\":\"{measurements}\"}}")
+            }
+            #[cfg(not(feature = "json"))]
+            {
             format!("{station}={measurements}")
+            }
         }).collect();
         stations.sort();
-        let _stations = stations.join(",");
-        // println!("{{{stations}}}");
+        let stations = stations.join(",");
+        #[cfg(feature = "json")]
+        {
+            println!("\n\n[{stations}]");
+        }
+        #[cfg(not(feature = "json"))]
+        {
+            println!("\n\n{{{stations}}}");
+        }
         println!("\n\nTime={} ms", now.elapsed().as_millis());
     });
 }
@@ -2,7 +2,7 @@ use bstr::{BStr, ByteSlice};
 use memmap::MmapOptions;
 use rustc_hash::FxHashMap as HashMap;
 use std::{fmt::Display, fs::File};
-
+use std::time::Instant;
 use rayon::prelude::*;
 
 #[derive(Debug)]
@@ -68,10 +68,11 @@ fn merge<'a>(a: &mut HashMap<&'a BStr, State>, b: &HashMap<&'a BStr, State>) {
 }
 
 fn main() {
+    let now = Instant::now();
     let cores: usize = std::thread::available_parallelism().unwrap().into();
     let path = match std::env::args().skip(1).next() {
         Some(path) => path,
-        None => "measurements.txt".to_owned(),
+        None => "../../../measurements.txt".to_owned(),
     };
     let file = File::open(path).unwrap();
     let mmap = unsafe { MmapOptions::new().map(&file).unwrap() };
@@ -128,4 +129,5 @@ fn main() {
         }
         println!("}}");
     }
+    println!("\n\nTime={} ms", now.elapsed().as_millis());
 }
@@ -69,10 +69,8 @@ pub fn parse_temp(bytes: &[u8]) -> isize {
 }
 
 #[inline]
-pub fn read_bytes_until(bytes: &mut Bytes<BufReader<&File>>, delimiter: u8) -> Option<[u8; 108]> {
-    // 108 max length of line in bytes
-    let mut buf: [u8; 108] = [b'#'; 108];
-    let mut idx = 0;
+pub fn read_bytes_until(bytes: &mut Bytes<BufReader<&File>>, delimiter: u8) -> Option<Vec<u8>> {
+    let mut buf: Vec<u8> = Vec::with_capacity(108);
     while let Some(byte) = bytes.next() {
         if byte.is_err() {
             panic!("Could not read byte");
@@ -81,8 +79,7 @@ pub fn read_bytes_until(bytes: &mut Bytes<BufReader<&File>>, delimiter: u8) -> O
         if delimiter == byte {
             return Some(buf);
         }
-        buf[idx] = byte;
-        idx += 1;
+        buf.push(byte);
     }
     None
 }
@@ -94,9 +91,5 @@ pub fn parse_line(line: &[u8]) -> (&[u8], &[u8]) {
         idx += 1;
     }
     let station = &line[0..idx];
-    let midpoint = idx + 1;
-    while idx < line.len() && line[idx] != b'#' {
-        idx += 1;
-    }
-    (station, &line[midpoint..idx])
+    (station, &line[(idx + 1)..])
 }