managed to get library version on par with the reference implementation by using memmap2. I'm understanding scoped threads a little better now... I think
The fixed line length solution is still just as slow, even using memmap
This commit is contained in:
parent
b1d7ebaaea
commit
7add8793a5
23
src/main/rust/Cargo.lock
generated
23
src/main/rust/Cargo.lock
generated
@ -838,16 +838,6 @@ version = "2.7.4"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
|
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "memmap"
|
|
||||||
version = "0.7.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "6585fd95e7bb50d6cc31e20d4cf9afb4e2ba16c5846fc76793f11218da9c475b"
|
|
||||||
dependencies = [
|
|
||||||
"libc",
|
|
||||||
"winapi",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "memmap2"
|
name = "memmap2"
|
||||||
version = "0.7.1"
|
version = "0.7.1"
|
||||||
@ -857,6 +847,15 @@ dependencies = [
|
|||||||
"libc",
|
"libc",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "memmap2"
|
||||||
|
version = "0.9.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "fe751422e4a8caa417e13c3ea66452215d7d63e19e604f4980461212f3ae1322"
|
||||||
|
dependencies = [
|
||||||
|
"libc",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "multiversion"
|
name = "multiversion"
|
||||||
version = "0.7.4"
|
version = "0.7.4"
|
||||||
@ -922,7 +921,7 @@ dependencies = [
|
|||||||
"fast-float",
|
"fast-float",
|
||||||
"libc",
|
"libc",
|
||||||
"memchr",
|
"memchr",
|
||||||
"memmap",
|
"memmap2 0.9.4",
|
||||||
"polars",
|
"polars",
|
||||||
"rayon",
|
"rayon",
|
||||||
"rustc-hash",
|
"rustc-hash",
|
||||||
@ -1148,7 +1147,7 @@ dependencies = [
|
|||||||
"home",
|
"home",
|
||||||
"itoa",
|
"itoa",
|
||||||
"memchr",
|
"memchr",
|
||||||
"memmap2",
|
"memmap2 0.7.1",
|
||||||
"num-traits",
|
"num-traits",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
"percent-encoding",
|
"percent-encoding",
|
||||||
|
@ -9,7 +9,7 @@ edition = "2021"
|
|||||||
bstr = "1.9.1"
|
bstr = "1.9.1"
|
||||||
fast-float = "0.2.0"
|
fast-float = "0.2.0"
|
||||||
memchr = "2.7.4"
|
memchr = "2.7.4"
|
||||||
memmap = "0.7.0"
|
memmap2 = "0.9.4"
|
||||||
polars = { version = "0.36.2", features = ["csv", "lazy", "nightly", "streaming"]}
|
polars = { version = "0.36.2", features = ["csv", "lazy", "nightly", "streaming"]}
|
||||||
rayon = "1.10.0"
|
rayon = "1.10.0"
|
||||||
rustc-hash = "2.0.0"
|
rustc-hash = "2.0.0"
|
||||||
|
@ -1,67 +1,57 @@
|
|||||||
use std::collections::HashMap;
|
|
||||||
use std::io::{BufRead, Seek, SeekFrom};
|
|
||||||
use std::sync::mpsc;
|
|
||||||
use std::time::Instant;
|
|
||||||
use std::{fs::File, io::BufReader, thread};
|
|
||||||
use memmap::MmapOptions;
|
|
||||||
use crate::models::station_measurements::StationMeasurements;
|
use crate::models::station_measurements::StationMeasurements;
|
||||||
use crate::utils::parse;
|
use crate::utils::parse;
|
||||||
use crate::utils::parse::hashstr;
|
use memmap2::MmapOptions;
|
||||||
|
use rustc_hash::{FxBuildHasher, FxHashMap as HashMap};
|
||||||
|
use std::sync::mpsc;
|
||||||
|
use std::time::Instant;
|
||||||
|
use std::{fs::File, thread};
|
||||||
|
|
||||||
const DEFAULT_HASHMAP_LENGTH: usize = 10000;
|
const DEFAULT_HASHMAP_LENGTH: usize = 10000;
|
||||||
|
|
||||||
pub fn run() {
|
pub fn run() {
|
||||||
const FILE_PATH: &str = "../../../measurements.txt";
|
|
||||||
let now = Instant::now();
|
let now = Instant::now();
|
||||||
|
const FILE_PATH: &str = "../../../measurements.txt";
|
||||||
|
let file = File::open(FILE_PATH).expect("File measurements.txt not found");
|
||||||
|
let mmap = unsafe { MmapOptions::new().map(&file).unwrap() };
|
||||||
|
let file_length = mmap.len();
|
||||||
|
let hasher = FxBuildHasher::default();
|
||||||
|
let mut stations: HashMap<String, StationMeasurements> =
|
||||||
|
HashMap::with_capacity_and_hasher(DEFAULT_HASHMAP_LENGTH, hasher);
|
||||||
|
let (tx, rx) = mpsc::channel();
|
||||||
|
let cores = thread::available_parallelism().unwrap().into();
|
||||||
|
let chunk_length = file_length / cores;
|
||||||
|
let mut bounds = Vec::with_capacity(cores + 1);
|
||||||
|
let mut start = 0;
|
||||||
|
for _ in 0..cores {
|
||||||
|
let end = (start + chunk_length).min(mmap.len());
|
||||||
|
let next_new_line = match memchr::memchr(b'\n', &mmap[end..]) {
|
||||||
|
Some(v) => v,
|
||||||
|
None => {
|
||||||
|
assert_eq!(end, mmap.len());
|
||||||
|
0
|
||||||
|
}
|
||||||
|
};
|
||||||
|
let end = end + next_new_line;
|
||||||
|
bounds.push((start, end));
|
||||||
|
start = end + 1;
|
||||||
|
}
|
||||||
thread::scope(|s| {
|
thread::scope(|s| {
|
||||||
let mut stations: HashMap<usize, (String, StationMeasurements)> =
|
|
||||||
HashMap::with_capacity(DEFAULT_HASHMAP_LENGTH);
|
|
||||||
let (tx, rx) = mpsc::channel();
|
|
||||||
let cores = thread::available_parallelism().unwrap().into();
|
|
||||||
let file = File::open(FILE_PATH).expect("File measurements.txt not found");
|
|
||||||
let mmap = unsafe { MmapOptions::new().map(&file).unwrap() };
|
|
||||||
let file_length = mmap.len();
|
|
||||||
let chunk_length = file_length / cores;
|
|
||||||
let mut bounds = Vec::with_capacity(cores + 1);
|
|
||||||
bounds.push(0);
|
|
||||||
for i in 1..cores {
|
|
||||||
let mut reader = BufReader::new(&file);
|
|
||||||
let mut byte_start = chunk_length * i;
|
|
||||||
reader
|
|
||||||
.seek(SeekFrom::Start(byte_start as u64))
|
|
||||||
.expect("could not seek");
|
|
||||||
let mut line = Vec::with_capacity(108);
|
|
||||||
let line_len = reader
|
|
||||||
.read_until(b'\n', &mut line)
|
|
||||||
.expect("could not read bytes");
|
|
||||||
byte_start += line_len;
|
|
||||||
bounds.push(byte_start);
|
|
||||||
}
|
|
||||||
bounds.push(file_length);
|
|
||||||
for i in 0..cores {
|
for i in 0..cores {
|
||||||
let tx = tx.clone();
|
let tx = tx.clone();
|
||||||
let mut currposition = *bounds.get(i).unwrap();
|
let (start, end) = *bounds.get(i).unwrap();
|
||||||
let end = *bounds.get(i + 1).unwrap();
|
let mmap_slice = &mmap[start..end];
|
||||||
s.spawn(move || {
|
s.spawn(move || {
|
||||||
let file = File::open(FILE_PATH).expect("File measurements.txt not found");
|
let mut t_stations: HashMap<String, StationMeasurements> =
|
||||||
let mut reader = BufReader::new(&file);
|
HashMap::with_capacity_and_hasher(DEFAULT_HASHMAP_LENGTH, hasher);
|
||||||
reader.seek(SeekFrom::Start(currposition as u64)).unwrap();
|
for line in mmap_slice.split(|&byte| { byte == b'\n' }) {
|
||||||
let mut t_stations: HashMap<usize, (String, StationMeasurements)> =
|
if line.len() == 0 {
|
||||||
HashMap::with_capacity(DEFAULT_HASHMAP_LENGTH);
|
|
||||||
let mut line = Vec::with_capacity(108);
|
|
||||||
loop {
|
|
||||||
let line_len = reader
|
|
||||||
.read_until(b'\n', &mut line)
|
|
||||||
.expect("could not read bytes");
|
|
||||||
if line_len == 0 {
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
let (station, temp) = line.rsplit_once(|&byte| byte == b';').unwrap();
|
let (station, temp) = line.rsplit_once(|&byte| byte == b';').unwrap();
|
||||||
let hash = hashstr(station);
|
|
||||||
let station = unsafe { String::from_utf8_unchecked(Vec::from(station)) };
|
let station = unsafe { String::from_utf8_unchecked(Vec::from(station)) };
|
||||||
let temp = parse::temp(temp.split_last().unwrap().1);
|
let temp = parse::temp(temp);
|
||||||
let measurements_option = t_stations.get_mut(&hash);
|
let measurements_option = t_stations.get_mut(&station);
|
||||||
if let Some((_, measurements)) = measurements_option {
|
if let Some(measurements) = measurements_option {
|
||||||
measurements.update(temp);
|
measurements.update(temp);
|
||||||
} else {
|
} else {
|
||||||
let measurements = StationMeasurements {
|
let measurements = StationMeasurements {
|
||||||
@ -70,31 +60,26 @@ pub fn run() {
|
|||||||
count: 1,
|
count: 1,
|
||||||
sum: temp,
|
sum: temp,
|
||||||
};
|
};
|
||||||
t_stations.insert(hash, (station, measurements));
|
t_stations.insert(station, measurements);
|
||||||
}
|
}
|
||||||
currposition += line_len;
|
|
||||||
if currposition >= end {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
line.clear();
|
|
||||||
}
|
}
|
||||||
let _ = tx.send(t_stations);
|
let _ = tx.send(t_stations);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
drop(tx);
|
drop(tx);
|
||||||
while let Ok(t_stations) = rx.recv() {
|
while let Ok(t_stations) = rx.recv() {
|
||||||
for (&hash, (station, measurements)) in t_stations.iter() {
|
for (station, measurements) in t_stations.iter() {
|
||||||
let joined_measurements_options = stations.get_mut(&hash);
|
let joined_measurements_options = stations.get_mut(station);
|
||||||
if let Some((_, joined_measurements)) = joined_measurements_options {
|
if let Some(joined_measurements) = joined_measurements_options {
|
||||||
joined_measurements.merge(measurements);
|
joined_measurements.merge(measurements);
|
||||||
} else {
|
} else {
|
||||||
stations.insert(hash, (station.to_owned(), *measurements));
|
stations.insert(station.to_owned(), *measurements);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
let mut stations: Vec<String> = stations
|
let mut stations: Vec<String> = stations
|
||||||
.iter()
|
.iter()
|
||||||
.map(|(_, (station, measurements))| {
|
.map(|(station, measurements)| {
|
||||||
let measurements = measurements.to_string();
|
let measurements = measurements.to_string();
|
||||||
#[cfg(feature = "json")]
|
#[cfg(feature = "json")]
|
||||||
{
|
{
|
||||||
|
@ -1,12 +1,11 @@
|
|||||||
use std::collections::HashMap;
|
|
||||||
use std::io::{Read, Seek, SeekFrom};
|
|
||||||
use std::sync::mpsc;
|
use std::sync::mpsc;
|
||||||
use std::time::Instant;
|
use std::time::Instant;
|
||||||
use std::{fs::File, io::BufReader, thread};
|
use std::{fs::File, thread};
|
||||||
use std::ffi::CStr;
|
use std::ffi::CStr;
|
||||||
|
use memmap2::MmapOptions;
|
||||||
|
use rustc_hash::{FxBuildHasher, FxHashMap};
|
||||||
use crate::models::station_measurements::StationMeasurements;
|
use crate::models::station_measurements::StationMeasurements;
|
||||||
use crate::utils::parse;
|
use crate::utils::parse;
|
||||||
use crate::utils::parse::hashstr;
|
|
||||||
|
|
||||||
const DEFAULT_HASHMAP_LENGTH: usize = 10000;
|
const DEFAULT_HASHMAP_LENGTH: usize = 10000;
|
||||||
|
|
||||||
@ -14,55 +13,54 @@ pub fn run() {
|
|||||||
print!("\x1b[J");
|
print!("\x1b[J");
|
||||||
const FILE_PATH: &str = "structured_measurements.txt";
|
const FILE_PATH: &str = "structured_measurements.txt";
|
||||||
let now = Instant::now();
|
let now = Instant::now();
|
||||||
|
let file = File::open(FILE_PATH).expect("File structured_measurements.txt not found");
|
||||||
|
let mmap = unsafe { MmapOptions::new().map(&file).unwrap() };
|
||||||
|
let file_length = mmap.len();
|
||||||
|
let hasher = FxBuildHasher::default();
|
||||||
|
let mut stations: FxHashMap<String, StationMeasurements> =
|
||||||
|
FxHashMap::with_capacity_and_hasher(DEFAULT_HASHMAP_LENGTH, hasher);
|
||||||
|
let (tx, rx) = mpsc::channel();
|
||||||
|
let cores = thread::available_parallelism().unwrap().into();
|
||||||
|
let chunk_length = file_length / cores;
|
||||||
|
let mut bounds = Vec::with_capacity(cores + 1);
|
||||||
|
let mut start = 0;
|
||||||
|
for _ in 0..cores {
|
||||||
|
let end = (start + chunk_length).min(mmap.len());
|
||||||
|
let next_new_line = match memchr::memchr(b'\n', &mmap[end..]) {
|
||||||
|
Some(v) => v,
|
||||||
|
None => {
|
||||||
|
assert_eq!(end, mmap.len());
|
||||||
|
0
|
||||||
|
}
|
||||||
|
};
|
||||||
|
let end = end + next_new_line;
|
||||||
|
bounds.push((start, end));
|
||||||
|
start = end + 1;
|
||||||
|
}
|
||||||
thread::scope(|s| {
|
thread::scope(|s| {
|
||||||
let mut stations: HashMap<usize, (String, StationMeasurements)> =
|
|
||||||
HashMap::with_capacity(DEFAULT_HASHMAP_LENGTH);
|
|
||||||
let (tx, rx) = mpsc::channel();
|
|
||||||
let cores = thread::available_parallelism().unwrap().into();
|
|
||||||
let file = File::open(FILE_PATH).expect("File structured_measurements.txt not found");
|
|
||||||
let mut reader = BufReader::new(&file);
|
|
||||||
let file_length = reader.seek(SeekFrom::End(0)).unwrap();
|
|
||||||
let chunk_length = file_length as usize / cores;
|
|
||||||
let mut bounds = Vec::with_capacity(cores + 1);
|
|
||||||
bounds.push(0);
|
|
||||||
for i in 0..cores {
|
for i in 0..cores {
|
||||||
let tx = tx.clone();
|
let tx = tx.clone();
|
||||||
let mut currposition = (i * chunk_length) as u64;
|
let (start, end) = *bounds.get(i).unwrap();
|
||||||
let end = ((i + 1) * chunk_length) as u64;
|
let mmap_slice = &mmap[start..end];
|
||||||
s.spawn(move || {
|
s.spawn(move || {
|
||||||
let file = File::open(FILE_PATH).expect("File measurements.txt not found");
|
let mut t_stations: FxHashMap<String, StationMeasurements> =
|
||||||
let mut reader = BufReader::new(&file);
|
FxHashMap::with_capacity_and_hasher(DEFAULT_HASHMAP_LENGTH, hasher);
|
||||||
reader.seek(SeekFrom::Start(currposition)).unwrap();
|
let lines = mmap_slice.chunks_exact(107);
|
||||||
let mut t_stations: HashMap<usize, (String, StationMeasurements)> =
|
for (line_num, line) in lines.enumerate() {
|
||||||
HashMap::with_capacity(DEFAULT_HASHMAP_LENGTH);
|
|
||||||
let mut line = [0u8; 107];
|
|
||||||
let mut line_num = 0;
|
|
||||||
loop {
|
|
||||||
if line_num % 100000 == 0 {
|
if line_num % 100000 == 0 {
|
||||||
print!("\x1b[{i};0Hlines: {line_num}");
|
print!("\x1b[{i};0Hlines: {line_num}");
|
||||||
}
|
}
|
||||||
line_num += 1;
|
|
||||||
let read_res = reader
|
|
||||||
.read_exact(&mut line);
|
|
||||||
match read_res {
|
|
||||||
Ok(_) => (),
|
|
||||||
Err(e) => match e.kind() {
|
|
||||||
std::io::ErrorKind::UnexpectedEof => break,
|
|
||||||
_ => panic!("Could not read")
|
|
||||||
},
|
|
||||||
};
|
|
||||||
let (station, temp) = unsafe { line.split_at_unchecked(100) };
|
let (station, temp) = unsafe { line.split_at_unchecked(100) };
|
||||||
let hash = hashstr(station);
|
|
||||||
let station = {
|
let station = {
|
||||||
if station[station.len() - 1] == 0u8 {
|
if station[station.len() - 1] == 0u8 {
|
||||||
unsafe { std::str::from_utf8_unchecked(CStr::from_bytes_until_nul(station).unwrap().to_bytes()) }
|
unsafe { std::str::from_utf8_unchecked(CStr::from_bytes_until_nul(station).unwrap().to_bytes()) }
|
||||||
} else {
|
} else {
|
||||||
unsafe { std::str::from_utf8_unchecked(station) }
|
unsafe { std::str::from_utf8_unchecked(station) }
|
||||||
}
|
}
|
||||||
};
|
}.to_owned();
|
||||||
let temp = parse::temp_new(&temp[1..6]);
|
let temp = parse::temp_new(&temp[1..6]);
|
||||||
let measurements_option = t_stations.get_mut(&hash);
|
let measurements_option = t_stations.get_mut(&station);
|
||||||
if let Some((_, measurements)) = measurements_option {
|
if let Some(measurements) = measurements_option {
|
||||||
measurements.update(temp);
|
measurements.update(temp);
|
||||||
} else {
|
} else {
|
||||||
let measurements = StationMeasurements {
|
let measurements = StationMeasurements {
|
||||||
@ -71,11 +69,7 @@ pub fn run() {
|
|||||||
count: 1,
|
count: 1,
|
||||||
sum: temp,
|
sum: temp,
|
||||||
};
|
};
|
||||||
t_stations.insert(hash, (station.to_string(), measurements));
|
t_stations.insert(station, measurements);
|
||||||
}
|
|
||||||
currposition += 107;
|
|
||||||
if currposition >= end {
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
let _ = tx.send(t_stations);
|
let _ = tx.send(t_stations);
|
||||||
@ -83,18 +77,18 @@ pub fn run() {
|
|||||||
}
|
}
|
||||||
drop(tx);
|
drop(tx);
|
||||||
while let Ok(t_stations) = rx.recv() {
|
while let Ok(t_stations) = rx.recv() {
|
||||||
for (&hash, (station, measurements)) in t_stations.iter() {
|
for (station, measurements) in t_stations.iter() {
|
||||||
let joined_measurements_options = stations.get_mut(&hash);
|
let joined_measurements_options = stations.get_mut(station);
|
||||||
if let Some((_, joined_measurements)) = joined_measurements_options {
|
if let Some(joined_measurements) = joined_measurements_options {
|
||||||
joined_measurements.merge(measurements);
|
joined_measurements.merge(measurements);
|
||||||
} else {
|
} else {
|
||||||
stations.insert(hash, (station.to_owned(), *measurements));
|
stations.insert(station.to_owned(), *measurements);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
let mut stations: Vec<String> = stations
|
let mut stations: Vec<String> = stations
|
||||||
.iter()
|
.iter()
|
||||||
.map(|(_, (station, measurements))| {
|
.map(|(station, measurements)| {
|
||||||
let measurements = measurements.to_string();
|
let measurements = measurements.to_string();
|
||||||
#[cfg(feature = "json")]
|
#[cfg(feature = "json")]
|
||||||
{
|
{
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
use bstr::{BStr, ByteSlice};
|
use bstr::{BStr, ByteSlice};
|
||||||
use memmap::MmapOptions;
|
use memmap2::MmapOptions;
|
||||||
use rayon::prelude::*;
|
use rayon::prelude::*;
|
||||||
use rustc_hash::FxHashMap as HashMap;
|
use rustc_hash::FxHashMap as HashMap;
|
||||||
use std::time::Instant;
|
use std::time::Instant;
|
||||||
|
Loading…
Reference in New Issue
Block a user