Compare commits

...

4 Commits

11 changed files with 130 additions and 1258 deletions

1099
src/main/rust/Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -6,18 +6,17 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies] [dependencies]
bstr = "1.9.1" bstr = "1.10.0"
fast-float = "0.2.0" fast-float = "0.2.0"
memchr = "2.7.4" memchr = "2.7.4"
memmap = "0.7.0" memmap2 = "0.9.4"
polars = { version = "0.36.2", features = ["csv", "lazy", "nightly", "streaming"]}
rayon = "1.10.0" rayon = "1.10.0"
rustc-hash = "2.0.0" rustc-hash = "2.0.0"
libc = "0.2.155" libc = "0.2.158"
smol = "2.0.0" smol = "2.0.1"
[dev-dependencies] [dev-dependencies]
criterion = { version = "0.5", features = ["html_reports"] } criterion = { version = "0.5.1", features = ["html_reports"] }
[features] [features]
json = [] json = []
@ -36,7 +35,7 @@ name = "multi_threaded"
harness = false harness = false
[[bench]] [[bench]]
name = "polars" name = "multi_threaded_smol"
harness = false harness = false
[[bench]] [[bench]]

View File

@ -1,8 +1,9 @@
use criterion::{Criterion, criterion_group, criterion_main}; use criterion::{Criterion, criterion_group, criterion_main};
use onebrc::implementations::libraries::run;
pub fn criterion_benchmark(c: &mut Criterion) { pub fn criterion_benchmark(c: &mut Criterion) {
c.bench_function("polars", |b| {b.iter(|| /*run_polars()*/ ())}); c.bench_function("libraries", |b| {b.iter(|| run())});
} }
criterion_group!(benches, criterion_benchmark); criterion_group!(benches, criterion_benchmark);
criterion_main!(benches); criterion_main!(benches);

View File

@ -0,0 +1,9 @@
use criterion::{Criterion, criterion_group, criterion_main};
use onebrc::implementations::multi_threaded_smol::run;
pub fn criterion_benchmark(c: &mut Criterion) {
c.bench_function("multithreadedsmol", |b| {b.iter(|| run())});
}
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);

View File

@ -1,4 +0,0 @@
// Placeholder entry point: its only statement, the call to `run_polars`, is
// commented out, so this `main` does nothing when invoked.
fn main() {
// let _ = run_polars();
}

View File

@ -4,7 +4,6 @@ pub mod multi_threaded;
pub mod multi_threaded_smol; pub mod multi_threaded_smol;
pub mod multi_threaded_structured; pub mod multi_threaded_structured;
pub mod phcs; pub mod phcs;
pub mod polars;
pub mod reference_impl; pub mod reference_impl;
pub mod single_thread; pub mod single_thread;
pub mod smol; pub mod smol;

View File

@ -1,67 +1,57 @@
use std::collections::HashMap;
use std::io::{BufRead, Seek, SeekFrom};
use std::sync::mpsc;
use std::time::Instant;
use std::{fs::File, io::BufReader, thread};
use memmap::MmapOptions;
use crate::models::station_measurements::StationMeasurements; use crate::models::station_measurements::StationMeasurements;
use crate::utils::parse; use crate::utils::parse;
use crate::utils::parse::hashstr; use memmap2::MmapOptions;
use rustc_hash::{FxBuildHasher, FxHashMap as HashMap};
use std::sync::mpsc;
use std::time::Instant;
use std::{fs::File, thread};
const DEFAULT_HASHMAP_LENGTH: usize = 10000; const DEFAULT_HASHMAP_LENGTH: usize = 10000;
pub fn run() { pub fn run() {
const FILE_PATH: &str = "../../../measurements.txt";
let now = Instant::now(); let now = Instant::now();
const FILE_PATH: &str = "../../../measurements.txt";
let file = File::open(FILE_PATH).expect("File measurements.txt not found");
let mmap = unsafe { MmapOptions::new().map(&file).unwrap() };
let file_length = mmap.len();
let hasher = FxBuildHasher::default();
let mut stations: HashMap<String, StationMeasurements> =
HashMap::with_capacity_and_hasher(DEFAULT_HASHMAP_LENGTH, hasher);
let (tx, rx) = mpsc::channel();
let cores = thread::available_parallelism().unwrap().into();
let chunk_length = file_length / cores;
let mut bounds = Vec::with_capacity(cores + 1);
let mut start = 0;
for _ in 0..cores {
let end = (start + chunk_length).min(mmap.len());
let next_new_line = match memchr::memchr(b'\n', &mmap[end..]) {
Some(v) => v,
None => {
assert_eq!(end, mmap.len());
0
}
};
let end = end + next_new_line;
bounds.push((start, end));
start = end + 1;
}
thread::scope(|s| { thread::scope(|s| {
let mut stations: HashMap<usize, (String, StationMeasurements)> =
HashMap::with_capacity(DEFAULT_HASHMAP_LENGTH);
let (tx, rx) = mpsc::channel();
let cores = thread::available_parallelism().unwrap().into();
let file = File::open(FILE_PATH).expect("File measurements.txt not found");
let mmap = unsafe { MmapOptions::new().map(&file).unwrap() };
let file_length = mmap.len();
let chunk_length = file_length / cores;
let mut bounds = Vec::with_capacity(cores + 1);
bounds.push(0);
for i in 1..cores {
let mut reader = BufReader::new(&file);
let mut byte_start = chunk_length * i;
reader
.seek(SeekFrom::Start(byte_start as u64))
.expect("could not seek");
let mut line = Vec::with_capacity(108);
let line_len = reader
.read_until(b'\n', &mut line)
.expect("could not read bytes");
byte_start += line_len;
bounds.push(byte_start);
}
bounds.push(file_length);
for i in 0..cores { for i in 0..cores {
let tx = tx.clone(); let tx = tx.clone();
let mut currposition = *bounds.get(i).unwrap(); let (start, end) = *bounds.get(i).unwrap();
let end = *bounds.get(i + 1).unwrap(); let mmap_slice = &mmap[start..end];
s.spawn(move || { s.spawn(move || {
let file = File::open(FILE_PATH).expect("File measurements.txt not found"); let mut t_stations: HashMap<String, StationMeasurements> =
let mut reader = BufReader::new(&file); HashMap::with_capacity_and_hasher(DEFAULT_HASHMAP_LENGTH, hasher);
reader.seek(SeekFrom::Start(currposition as u64)).unwrap(); for line in mmap_slice.split(|&byte| { byte == b'\n' }) {
let mut t_stations: HashMap<usize, (String, StationMeasurements)> = if line.len() == 0 {
HashMap::with_capacity(DEFAULT_HASHMAP_LENGTH);
let mut line = Vec::with_capacity(108);
loop {
let line_len = reader
.read_until(b'\n', &mut line)
.expect("could not read bytes");
if line_len == 0 {
break; break;
} }
let (station, temp) = line.rsplit_once(|&byte| byte == b';').unwrap(); let (station, temp) = line.rsplit_once(|&byte| byte == b';').unwrap();
let hash = hashstr(station);
let station = unsafe { String::from_utf8_unchecked(Vec::from(station)) }; let station = unsafe { String::from_utf8_unchecked(Vec::from(station)) };
let temp = parse::temp(temp.split_last().unwrap().1); let temp = parse::temp(temp);
let measurements_option = t_stations.get_mut(&hash); let measurements_option = t_stations.get_mut(&station);
if let Some((_, measurements)) = measurements_option { if let Some(measurements) = measurements_option {
measurements.update(temp); measurements.update(temp);
} else { } else {
let measurements = StationMeasurements { let measurements = StationMeasurements {
@ -70,31 +60,26 @@ pub fn run() {
count: 1, count: 1,
sum: temp, sum: temp,
}; };
t_stations.insert(hash, (station, measurements)); t_stations.insert(station, measurements);
} }
currposition += line_len;
if currposition >= end {
break;
}
line.clear();
} }
let _ = tx.send(t_stations); let _ = tx.send(t_stations);
}); });
} }
drop(tx); drop(tx);
while let Ok(t_stations) = rx.recv() { while let Ok(t_stations) = rx.recv() {
for (&hash, (station, measurements)) in t_stations.iter() { for (station, measurements) in t_stations.iter() {
let joined_measurements_options = stations.get_mut(&hash); let joined_measurements_options = stations.get_mut(station);
if let Some((_, joined_measurements)) = joined_measurements_options { if let Some(joined_measurements) = joined_measurements_options {
joined_measurements.merge(measurements); joined_measurements.merge(measurements);
} else { } else {
stations.insert(hash, (station.to_owned(), *measurements)); stations.insert(station.to_owned(), *measurements);
} }
} }
} }
let mut stations: Vec<String> = stations let mut stations: Vec<String> = stations
.iter() .iter()
.map(|(_, (station, measurements))| { .map(|(station, measurements)| {
let measurements = measurements.to_string(); let measurements = measurements.to_string();
#[cfg(feature = "json")] #[cfg(feature = "json")]
{ {

View File

@ -1,10 +1,9 @@
use smol::fs::File; use smol::fs::File;
use smol::io::{AsyncBufReadExt, AsyncSeekExt, BufReader, SeekFrom}; use smol::io::{AsyncBufReadExt, AsyncSeekExt, BufReader, SeekFrom};
use rustc_hash::{FxHashMap as HashMap, FxBuildHasher};
use crate::models::station_measurements::StationMeasurements; use crate::models::station_measurements::StationMeasurements;
use crate::utils::parse; use crate::utils::parse;
use crate::utils::parse::hashstr;
use std::collections::HashMap;
use std::sync::mpsc; use std::sync::mpsc;
use std::thread; use std::thread;
use std::time::Instant; use std::time::Instant;
@ -15,8 +14,9 @@ pub fn run() {
const FILE_PATH: &str = "../../../measurements.txt"; const FILE_PATH: &str = "../../../measurements.txt";
let now = Instant::now(); let now = Instant::now();
thread::scope(|s| { thread::scope(|s| {
let mut stations: HashMap<usize, (String, StationMeasurements)> = let hasher = FxBuildHasher::default();
HashMap::with_capacity(DEFAULT_HASHMAP_LENGTH); let mut stations: HashMap<String, StationMeasurements> =
HashMap::with_capacity_and_hasher(DEFAULT_HASHMAP_LENGTH, hasher);
let (tx, rx) = mpsc::channel(); let (tx, rx) = mpsc::channel();
let cores = thread::available_parallelism().unwrap().into(); let cores = thread::available_parallelism().unwrap().into();
let bounds = smol::block_on(async { let bounds = smol::block_on(async {
@ -57,8 +57,8 @@ pub fn run() {
.expect("File measurements.txt not found"); .expect("File measurements.txt not found");
let mut reader = BufReader::new(&mut file); let mut reader = BufReader::new(&mut file);
reader.seek(SeekFrom::Start(currposition)).await.unwrap(); reader.seek(SeekFrom::Start(currposition)).await.unwrap();
let mut t_stations: HashMap<usize, (String, StationMeasurements)> = let mut t_stations: HashMap<String, StationMeasurements> =
HashMap::with_capacity(DEFAULT_HASHMAP_LENGTH); HashMap::with_capacity_and_hasher(DEFAULT_HASHMAP_LENGTH, hasher);
let mut line = Vec::with_capacity(108); let mut line = Vec::with_capacity(108);
loop { loop {
let line_len = reader let line_len = reader
@ -69,11 +69,10 @@ pub fn run() {
break; break;
} }
let (station, temp) = line.rsplit_once(|&byte| byte == b';').unwrap(); let (station, temp) = line.rsplit_once(|&byte| byte == b';').unwrap();
let hash = hashstr(station);
let station = unsafe { String::from_utf8_unchecked(Vec::from(station)) }; let station = unsafe { String::from_utf8_unchecked(Vec::from(station)) };
let temp = parse::temp(temp.split_last().unwrap().1); let temp = parse::temp(temp.split_last().unwrap().1);
let measurements_option = t_stations.get_mut(&hash); let measurements_option = t_stations.get_mut(&station);
if let Some((_, measurements)) = measurements_option { if let Some(measurements) = measurements_option {
measurements.update(temp); measurements.update(temp);
} else { } else {
let measurements = StationMeasurements { let measurements = StationMeasurements {
@ -82,7 +81,7 @@ pub fn run() {
count: 1, count: 1,
sum: temp, sum: temp,
}; };
t_stations.insert(hash, (station, measurements)); t_stations.insert(station, measurements);
} }
currposition += line_len as u64; currposition += line_len as u64;
if currposition >= end { if currposition >= end {
@ -96,18 +95,18 @@ pub fn run() {
} }
drop(tx); drop(tx);
while let Ok(t_stations) = rx.recv() { while let Ok(t_stations) = rx.recv() {
for (&hash, (station, measurements)) in t_stations.iter() { for (station, measurements) in t_stations.iter() {
let joined_measurements_options = stations.get_mut(&hash); let joined_measurements_options = stations.get_mut(station);
if let Some((_, joined_measurements)) = joined_measurements_options { if let Some(joined_measurements) = joined_measurements_options {
joined_measurements.merge(measurements); joined_measurements.merge(measurements);
} else { } else {
stations.insert(hash, (station.to_owned(), *measurements)); stations.insert(station.to_owned(), *measurements);
} }
} }
} }
let mut stations: Vec<String> = stations let mut stations: Vec<String> = stations
.iter() .iter()
.map(|(_, (station, measurements))| { .map(|(station, measurements)| {
let measurements = measurements.to_string(); let measurements = measurements.to_string();
#[cfg(feature = "json")] #[cfg(feature = "json")]
{ {

View File

@ -1,12 +1,11 @@
use std::collections::HashMap;
use std::io::{Read, Seek, SeekFrom};
use std::sync::mpsc; use std::sync::mpsc;
use std::time::Instant; use std::time::Instant;
use std::{fs::File, io::BufReader, thread}; use std::{fs::File, thread};
use std::ffi::CStr; use std::ffi::CStr;
use memmap2::MmapOptions;
use rustc_hash::{FxBuildHasher, FxHashMap};
use crate::models::station_measurements::StationMeasurements; use crate::models::station_measurements::StationMeasurements;
use crate::utils::parse; use crate::utils::parse;
use crate::utils::parse::hashstr;
const DEFAULT_HASHMAP_LENGTH: usize = 10000; const DEFAULT_HASHMAP_LENGTH: usize = 10000;
@ -14,55 +13,54 @@ pub fn run() {
print!("\x1b[J"); print!("\x1b[J");
const FILE_PATH: &str = "structured_measurements.txt"; const FILE_PATH: &str = "structured_measurements.txt";
let now = Instant::now(); let now = Instant::now();
let file = File::open(FILE_PATH).expect("File structured_measurements.txt not found");
let mmap = unsafe { MmapOptions::new().map(&file).unwrap() };
let file_length = mmap.len();
let hasher = FxBuildHasher::default();
let mut stations: FxHashMap<String, StationMeasurements> =
FxHashMap::with_capacity_and_hasher(DEFAULT_HASHMAP_LENGTH, hasher);
let (tx, rx) = mpsc::channel();
let cores = thread::available_parallelism().unwrap().into();
let chunk_length = file_length / cores;
let mut bounds = Vec::with_capacity(cores + 1);
let mut start = 0;
for _ in 0..cores {
let end = (start + chunk_length).min(mmap.len());
let next_new_line = match memchr::memchr(b'\n', &mmap[end..]) {
Some(v) => v,
None => {
assert_eq!(end, mmap.len());
0
}
};
let end = end + next_new_line;
bounds.push((start, end));
start = end + 1;
}
thread::scope(|s| { thread::scope(|s| {
let mut stations: HashMap<usize, (String, StationMeasurements)> =
HashMap::with_capacity(DEFAULT_HASHMAP_LENGTH);
let (tx, rx) = mpsc::channel();
let cores = thread::available_parallelism().unwrap().into();
let file = File::open(FILE_PATH).expect("File structured_measurements.txt not found");
let mut reader = BufReader::new(&file);
let file_length = reader.seek(SeekFrom::End(0)).unwrap();
let chunk_length = file_length as usize / cores;
let mut bounds = Vec::with_capacity(cores + 1);
bounds.push(0);
for i in 0..cores { for i in 0..cores {
let tx = tx.clone(); let tx = tx.clone();
let mut currposition = (i * chunk_length) as u64; let (start, end) = *bounds.get(i).unwrap();
let end = ((i + 1) * chunk_length) as u64; let mmap_slice = &mmap[start..end];
s.spawn(move || { s.spawn(move || {
let file = File::open(FILE_PATH).expect("File measurements.txt not found"); let mut t_stations: FxHashMap<String, StationMeasurements> =
let mut reader = BufReader::new(&file); FxHashMap::with_capacity_and_hasher(DEFAULT_HASHMAP_LENGTH, hasher);
reader.seek(SeekFrom::Start(currposition)).unwrap(); let lines = mmap_slice.chunks_exact(107);
let mut t_stations: HashMap<usize, (String, StationMeasurements)> = for (line_num, line) in lines.enumerate() {
HashMap::with_capacity(DEFAULT_HASHMAP_LENGTH);
let mut line = [0u8; 107];
let mut line_num = 0;
loop {
if line_num % 100000 == 0 { if line_num % 100000 == 0 {
print!("\x1b[{i};0Hlines: {line_num}"); print!("\x1b[{i};0Hlines: {line_num}");
} }
line_num += 1;
let read_res = reader
.read_exact(&mut line);
match read_res {
Ok(_) => (),
Err(e) => match e.kind() {
std::io::ErrorKind::UnexpectedEof => break,
_ => panic!("Could not read")
},
};
let (station, temp) = unsafe { line.split_at_unchecked(100) }; let (station, temp) = unsafe { line.split_at_unchecked(100) };
let hash = hashstr(station);
let station = { let station = {
if station[station.len() - 1] == 0u8 { if station[station.len() - 1] == 0u8 {
unsafe { std::str::from_utf8_unchecked(CStr::from_bytes_until_nul(station).unwrap().to_bytes()) } unsafe { std::str::from_utf8_unchecked(CStr::from_bytes_until_nul(station).unwrap().to_bytes()) }
} else { } else {
unsafe { std::str::from_utf8_unchecked(station) } unsafe { std::str::from_utf8_unchecked(station) }
} }
}; }.to_owned();
let temp = parse::temp_new(&temp[1..6]); let temp = parse::temp_new(&temp[1..6]);
let measurements_option = t_stations.get_mut(&hash); let measurements_option = t_stations.get_mut(&station);
if let Some((_, measurements)) = measurements_option { if let Some(measurements) = measurements_option {
measurements.update(temp); measurements.update(temp);
} else { } else {
let measurements = StationMeasurements { let measurements = StationMeasurements {
@ -71,11 +69,7 @@ pub fn run() {
count: 1, count: 1,
sum: temp, sum: temp,
}; };
t_stations.insert(hash, (station.to_string(), measurements)); t_stations.insert(station, measurements);
}
currposition += 107;
if currposition >= end {
break;
} }
} }
let _ = tx.send(t_stations); let _ = tx.send(t_stations);
@ -83,18 +77,18 @@ pub fn run() {
} }
drop(tx); drop(tx);
while let Ok(t_stations) = rx.recv() { while let Ok(t_stations) = rx.recv() {
for (&hash, (station, measurements)) in t_stations.iter() { for (station, measurements) in t_stations.iter() {
let joined_measurements_options = stations.get_mut(&hash); let joined_measurements_options = stations.get_mut(station);
if let Some((_, joined_measurements)) = joined_measurements_options { if let Some(joined_measurements) = joined_measurements_options {
joined_measurements.merge(measurements); joined_measurements.merge(measurements);
} else { } else {
stations.insert(hash, (station.to_owned(), *measurements)); stations.insert(station.to_owned(), *measurements);
} }
} }
} }
let mut stations: Vec<String> = stations let mut stations: Vec<String> = stations
.iter() .iter()
.map(|(_, (station, measurements))| { .map(|(station, measurements)| {
let measurements = measurements.to_string(); let measurements = measurements.to_string();
#[cfg(feature = "json")] #[cfg(feature = "json")]
{ {

View File

@ -1,31 +0,0 @@
use polars::prelude::*;
use std::time::Instant;
use std::vec;
/// Aggregates `../../../measurements.txt` with a lazy, streaming polars query.
///
/// The file is scanned as a header-less, `;`-separated CSV with a string
/// `station` column and a float `measure` column. Rows are grouped by station,
/// the min, mean and max of `measure` are computed per group, and the result is
/// sorted by station. The elapsed time is printed in microseconds before the
/// collected frame is returned.
///
/// # Errors
///
/// Propagates any `PolarsError` raised while finishing the reader or while
/// collecting the query.
pub fn run_polars() -> Result<DataFrame, PolarsError> {
    let started = Instant::now();

    // Explicit schema: the input has no header row to infer names from.
    let schema: Schema = Schema::from_iter(vec![
        Field::new("station", DataType::String),
        Field::new("measure", DataType::Float64),
    ]);

    let reader = LazyCsvReader::new("../../../measurements.txt")
        .has_header(false)
        .with_schema(Some(Arc::new(schema)))
        .with_separator(b';');
    let scanned = reader.finish()?;

    let per_station = vec![
        col("measure").alias("min").min(),
        col("measure").alias("mean").mean(),
        col("measure").alias("max").max(),
    ];
    let query = scanned
        .group_by(vec![col("station")])
        .agg(per_station)
        .sort("station", Default::default())
        .with_streaming(true);

    let frame = query.collect()?;
    println!("Time={} μs", started.elapsed().as_micros());
    Ok(frame)
}

View File

@ -1,5 +1,5 @@
use bstr::{BStr, ByteSlice}; use bstr::{BStr, ByteSlice};
use memmap::MmapOptions; use memmap2::MmapOptions;
use rayon::prelude::*; use rayon::prelude::*;
use rustc_hash::FxHashMap as HashMap; use rustc_hash::FxHashMap as HashMap;
use std::time::Instant; use std::time::Instant;